summary refs log tree commit diff
path: root/src/linux.rs
diff options
context:
space:
mode:
Diffstat (limited to 'src/linux.rs')
-rw-r--r-- src/linux.rs 431
1 files changed, 299 insertions, 132 deletions
diff --git a/src/linux.rs b/src/linux.rs
index 84edf5c..4a87f7d 100644
--- a/src/linux.rs
+++ b/src/linux.rs
@@ -27,13 +27,13 @@ use std::time::{Duration, SystemTime, UNIX_EPOCH};
 
 use libc::{self, c_int, gid_t, uid_t};
 
-use audio_streams::DummyStreamSource;
+use audio_streams::shm_streams::NullShmStreamSource;
 #[cfg(feature = "gpu")]
 use devices::virtio::EventDevice;
 use devices::virtio::{self, VirtioDevice};
 use devices::{
-    self, HostBackendDeviceProvider, PciDevice, VfioDevice, VfioPciDevice, VirtioPciDevice,
-    XhciController,
+    self, HostBackendDeviceProvider, PciDevice, VfioContainer, VfioDevice, VfioPciDevice,
+    VirtioPciDevice, XhciController,
 };
 use io_jail::{self, Minijail};
 use kvm::*;
@@ -63,7 +63,6 @@ use vm_control::{
 };
 
 use crate::{Config, DiskOption, Executable, SharedDir, SharedDirKind, TouchDeviceOption};
-
 use arch::{self, LinuxArch, RunnableLinuxVm, VirtioDeviceStub, VmComponents, VmImage};
 
 #[cfg(any(target_arch = "arm", target_arch = "aarch64"))]
@@ -97,7 +96,7 @@ pub enum Error {
     CreateVfioDevice(devices::vfio::VfioError),
     DeviceJail(io_jail::Error),
     DevicePivotRoot(io_jail::Error),
-    Disk(io::Error),
+    Disk(PathBuf, io::Error),
     DiskImageLock(sys_util::Error),
     DropCapabilities(sys_util::Error),
     FsDeviceNew(virtio::fs::Error),
@@ -184,7 +183,7 @@ impl Display for Error {
             CreateVfioDevice(e) => write!(f, "Failed to create vfio device {}", e),
             DeviceJail(e) => write!(f, "failed to jail device: {}", e),
             DevicePivotRoot(e) => write!(f, "failed to pivot root device: {}", e),
-            Disk(e) => write!(f, "failed to load disk image: {}", e),
+            Disk(p, e) => write!(f, "failed to load disk image {}: {}", p.display(), e),
             DiskImageLock(e) => write!(f, "failed to lock disk image: {}", e),
             DropCapabilities(e) => write!(f, "failed to drop process capabilities: {}", e),
             FsDeviceNew(e) => write!(f, "failed to create fs device: {}", e),
@@ -301,55 +300,85 @@ fn get_max_open_files() -> Result<libc::rlim64_t> {
     }
 }
 
+struct SandboxConfig<'a> {
+    limit_caps: bool,
+    log_failures: bool,
+    seccomp_policy: &'a Path,
+    uid_map: Option<&'a str>,
+    gid_map: Option<&'a str>,
+}
+
 fn create_base_minijail(
     root: &Path,
-    log_failures: bool,
-    seccomp_policy: &Path,
+    r_limit: Option<u64>,
+    config: Option<&SandboxConfig>,
 ) -> Result<Minijail> {
     // All child jails run in a new user namespace without any users mapped,
     // they run as nobody unless otherwise configured.
     let mut j = Minijail::new().map_err(Error::DeviceJail)?;
-    j.namespace_pids();
-    j.namespace_user();
-    j.namespace_user_disable_setgroups();
-    // Don't need any capabilities.
-    j.use_caps(0);
-    // Create a new mount namespace with an empty root FS.
-    j.namespace_vfs();
-    j.enter_pivot_root(root).map_err(Error::DevicePivotRoot)?;
-    // Run in an empty network namespace.
-    j.namespace_net();
-    // Most devices don't need to open many fds.
-    j.set_rlimit(libc::RLIMIT_NOFILE as i32, 1024, 1024)
-        .map_err(Error::SettingMaxOpenFiles)?;
-    // Apply the block device seccomp policy.
-    j.no_new_privs();
-
-    // By default we'll prioritize using the pre-compiled .bpf over the .policy
-    // file (the .bpf is expected to be compiled using "trap" as the failure
-    // behavior instead of the default "kill" behavior).
-    // Refer to the code comment for the "seccomp-log-failures"
-    // command-line parameter for an explanation about why the |log_failures|
-    // flag forces the use of .policy files (and the build-time alternative to
-    // this run-time flag).
-    let bpf_policy_file = seccomp_policy.with_extension("bpf");
-    if bpf_policy_file.exists() && !log_failures {
-        j.parse_seccomp_program(&bpf_policy_file)
-            .map_err(Error::DeviceJail)?;
-    } else {
-        // Use TSYNC only for the side effect of it using SECCOMP_RET_TRAP,
-        // which will correctly kill the entire device process if a worker
-        // thread commits a seccomp violation.
-        j.set_seccomp_filter_tsync();
-        if log_failures {
-            j.log_seccomp_filter_failures();
+
+    if let Some(config) = config {
+        j.namespace_pids();
+        j.namespace_user();
+        j.namespace_user_disable_setgroups();
+        if config.limit_caps {
+            // Don't need any capabilities.
+            j.use_caps(0);
         }
-        j.parse_seccomp_filters(&seccomp_policy.with_extension("policy"))
-            .map_err(Error::DeviceJail)?;
+        if let Some(uid_map) = config.uid_map {
+            j.uidmap(uid_map).map_err(Error::SettingUidMap)?;
+        }
+        if let Some(gid_map) = config.gid_map {
+            j.gidmap(gid_map).map_err(Error::SettingGidMap)?;
+        }
+        // Run in a new mount namespace.
+        j.namespace_vfs();
+
+        // Run in an empty network namespace.
+        j.namespace_net();
+
+        // Don't allow the device to gain new privileges.
+        j.no_new_privs();
+
+        // By default we'll prioritize using the pre-compiled .bpf over the .policy
+        // file (the .bpf is expected to be compiled using "trap" as the failure
+        // behavior instead of the default "kill" behavior).
+        // Refer to the code comment for the "seccomp-log-failures"
+        // command-line parameter for an explanation about why the |log_failures|
+        // flag forces the use of .policy files (and the build-time alternative to
+        // this run-time flag).
+        let bpf_policy_file = config.seccomp_policy.with_extension("bpf");
+        if bpf_policy_file.exists() && !config.log_failures {
+            j.parse_seccomp_program(&bpf_policy_file)
+                .map_err(Error::DeviceJail)?;
+        } else {
+            // Use TSYNC only for the side effect of it using SECCOMP_RET_TRAP,
+            // which will correctly kill the entire device process if a worker
+            // thread commits a seccomp violation.
+            j.set_seccomp_filter_tsync();
+            if config.log_failures {
+                j.log_seccomp_filter_failures();
+            }
+            j.parse_seccomp_filters(&config.seccomp_policy.with_extension("policy"))
+                .map_err(Error::DeviceJail)?;
+        }
+        j.use_seccomp_filter();
+        // Don't do init setup.
+        j.run_as_init();
+    }
+
+    // Only pivot_root if we are not re-using the current root directory.
+    if root != Path::new("/") {
+        // It's safe to call `namespace_vfs` multiple times.
+        j.namespace_vfs();
+        j.enter_pivot_root(root).map_err(Error::DevicePivotRoot)?;
     }
-    j.use_seccomp_filter();
-    // Don't do init setup.
-    j.run_as_init();
+
+    // Most devices don't need to open many fds.
+    let limit = if let Some(r) = r_limit { r } else { 1024u64 };
+    j.set_rlimit(libc::RLIMIT_NOFILE as i32, limit, limit)
+        .map_err(Error::SettingMaxOpenFiles)?;
+
     Ok(j)
 }
 
@@ -362,11 +391,14 @@ fn simple_jail(cfg: &Config, policy: &str) -> Result<Option<Minijail>> {
             return Err(Error::PivotRootDoesntExist(pivot_root));
         }
         let policy_path: PathBuf = cfg.seccomp_policy_dir.join(policy);
-        Ok(Some(create_base_minijail(
-            root_path,
-            cfg.seccomp_log_failures,
-            &policy_path,
-        )?))
+        let config = SandboxConfig {
+            limit_caps: true,
+            log_failures: cfg.seccomp_log_failures,
+            seccomp_policy: &policy_path,
+            uid_map: None,
+            gid_map: None,
+        };
+        Ok(Some(create_base_minijail(root_path, None, Some(&config))?))
     } else {
         Ok(None)
     }
@@ -388,7 +420,7 @@ fn create_block_device(
             .read(true)
             .write(!disk.read_only)
             .open(&disk.path)
-            .map_err(Error::Disk)?
+            .map_err(|e| Error::Disk(disk.path.to_path_buf(), e))?
     };
     // Lock the disk image to prevent other crosvm instances from using it.
     let lock_op = if disk.read_only {
@@ -473,13 +505,16 @@ fn create_tpm_device(cfg: &Config) -> DeviceResult {
 }
 
 fn create_single_touch_device(cfg: &Config, single_touch_spec: &TouchDeviceOption) -> DeviceResult {
-    let socket = single_touch_spec.path.into_unix_stream().map_err(|e| {
-        error!("failed configuring virtio single touch: {:?}", e);
-        e
-    })?;
-
-    let dev = virtio::new_single_touch(socket, single_touch_spec.width, single_touch_spec.height)
-        .map_err(Error::InputDeviceNew)?;
+    let socket = single_touch_spec
+        .get_path()
+        .into_unix_stream()
+        .map_err(|e| {
+            error!("failed configuring virtio single touch: {:?}", e);
+            e
+        })?;
+
+    let (width, height) = single_touch_spec.get_size();
+    let dev = virtio::new_single_touch(socket, width, height).map_err(Error::InputDeviceNew)?;
     Ok(VirtioDeviceStub {
         dev: Box::new(dev),
         jail: simple_jail(&cfg, "input_device")?,
@@ -487,13 +522,13 @@ fn create_single_touch_device(cfg: &Config, single_touch_spec: &TouchDeviceOptio
 }
 
 fn create_trackpad_device(cfg: &Config, trackpad_spec: &TouchDeviceOption) -> DeviceResult {
-    let socket = trackpad_spec.path.into_unix_stream().map_err(|e| {
+    let socket = trackpad_spec.get_path().into_unix_stream().map_err(|e| {
         error!("failed configuring virtio trackpad: {}", e);
         e
     })?;
 
-    let dev = virtio::new_trackpad(socket, trackpad_spec.width, trackpad_spec.height)
-        .map_err(Error::InputDeviceNew)?;
+    let (width, height) = trackpad_spec.get_size();
+    let dev = virtio::new_trackpad(socket, width, height).map_err(Error::InputDeviceNew)?;
 
     Ok(VirtioDeviceStub {
         dev: Box::new(dev),
@@ -768,45 +803,20 @@ fn create_fs_device(
     tag: &str,
     fs_cfg: virtio::fs::passthrough::Config,
 ) -> DeviceResult {
-    let mut j = Minijail::new().map_err(Error::DeviceJail)?;
-
-    if cfg.sandbox {
-        j.namespace_pids();
-        j.namespace_user();
-        j.namespace_user_disable_setgroups();
-        j.uidmap(uid_map).map_err(Error::SettingUidMap)?;
-        j.gidmap(gid_map).map_err(Error::SettingGidMap)?;
-
-        // Run in an empty network namespace.
-        j.namespace_net();
-
-        j.no_new_privs();
-
-        // Use TSYNC only for the side effect of it using SECCOMP_RET_TRAP, which will correctly kill
-        // the entire device process if a worker thread commits a seccomp violation.
-        let seccomp_policy = cfg.seccomp_policy_dir.join("fs_device");
-        j.set_seccomp_filter_tsync();
-        if cfg.seccomp_log_failures {
-            j.log_seccomp_filter_failures();
-        }
-        j.parse_seccomp_filters(&seccomp_policy)
-            .map_err(Error::DeviceJail)?;
-        j.use_seccomp_filter();
-
-        // Don't do init setup.
-        j.run_as_init();
-    }
-
-    // Create a new mount namespace with the source directory as the root. We need this even when
-    // sandboxing is disabled as the server relies on the host kernel to prevent path traversals
-    // from leaking out of the shared directory.
-    j.namespace_vfs();
-    j.enter_pivot_root(src).map_err(Error::DevicePivotRoot)?;
-
-    // The file server opens a lot of fds and needs a really high open file limit.
     let max_open_files = get_max_open_files()?;
-    j.set_rlimit(libc::RLIMIT_NOFILE as i32, max_open_files, max_open_files)
-        .map_err(Error::SettingMaxOpenFiles)?;
+    let j = if cfg.sandbox {
+        let seccomp_policy = cfg.seccomp_policy_dir.join("fs_device");
+        let config = SandboxConfig {
+            limit_caps: false,
+            uid_map: Some(uid_map),
+            gid_map: Some(gid_map),
+            log_failures: cfg.seccomp_log_failures,
+            seccomp_policy: &seccomp_policy,
+        };
+        create_base_minijail(src, Some(max_open_files), Some(&config))?
+    } else {
+        create_base_minijail(src, Some(max_open_files), None)?
+    };
 
     // TODO(chirantan): Use more than one worker once the kernel driver has been fixed to not panic
     // when num_queues > 1.
@@ -818,25 +828,36 @@ fn create_fs_device(
     })
 }
 
-fn create_9p_device(cfg: &Config, src: &Path, tag: &str) -> DeviceResult {
-    let (jail, root) = match simple_jail(&cfg, "9p_device")? {
-        Some(mut jail) => {
-            //  The shared directory becomes the root of the device's file system.
-            let root = Path::new("/");
-            jail.mount_bind(src, root, true)?;
+fn create_9p_device(
+    cfg: &Config,
+    uid_map: &str,
+    gid_map: &str,
+    src: &Path,
+    tag: &str,
+) -> DeviceResult {
+    let max_open_files = get_max_open_files()?;
+    let (jail, root) = if cfg.sandbox {
+        let seccomp_policy = cfg.seccomp_policy_dir.join("9p_device");
+        let config = SandboxConfig {
+            limit_caps: false,
+            uid_map: Some(uid_map),
+            gid_map: Some(gid_map),
+            log_failures: cfg.seccomp_log_failures,
+            seccomp_policy: &seccomp_policy,
+        };
 
-            // We want bind mounts from the parent namespaces to propagate into the 9p server's
-            // namespace.
-            jail.set_remount_mode(libc::MS_SLAVE);
+        let mut jail = create_base_minijail(src, Some(max_open_files), Some(&config))?;
+        // We want bind mounts from the parent namespaces to propagate into the 9p server's
+        // namespace.
+        jail.set_remount_mode(libc::MS_SLAVE);
 
-            add_crosvm_user_to_jail(&mut jail, "p9")?;
-            (Some(jail), root)
-        }
-        None => {
-            // There's no bind mount so we tell the server to treat the source directory as the
-            // root.
-            (None, src)
-        }
+        // The shared directory becomes the root of the device's file system.
+        let root = Path::new("/");
+        (Some(jail), root)
+    } else {
+        // There's no mount namespace so we tell the server to treat the source directory as the
+        // root.
+        (None, src)
     };
 
     let dev = virtio::P9::new(root, tag).map_err(Error::P9DeviceNew)?;
@@ -858,10 +879,11 @@ fn create_pmem_device(
         .read(true)
         .write(!disk.read_only)
         .open(&disk.path)
-        .map_err(Error::Disk)?;
+        .map_err(|e| Error::Disk(disk.path.to_path_buf(), e))?;
 
     let (disk_size, arena_size) = {
-        let metadata = std::fs::metadata(&disk.path).map_err(Error::Disk)?;
+        let metadata =
+            std::fs::metadata(&disk.path).map_err(|e| Error::Disk(disk.path.to_path_buf(), e))?;
         let disk_len = metadata.len();
         // Linux requires pmem region sizes to be 2 MiB aligned. Linux will fill any partial page
         // at the end of an mmap'd file and won't write back beyond the actual file length, but if
@@ -1022,16 +1044,22 @@ fn create_virtio_devices(
 
     #[cfg(feature = "gpu")]
     {
-        if cfg.gpu_parameters.is_some() {
+        if let Some(gpu_parameters) = &cfg.gpu_parameters {
             let mut event_devices = Vec::new();
             if cfg.display_window_mouse {
                 let (event_device_socket, virtio_dev_socket) =
                     UnixStream::pair().map_err(Error::CreateSocket)?;
-                // TODO(nkgold): the width/height here should match the display's height/width. When
-                // those settings are available as CLI options, we should use the CLI options here
-                // as well.
-                let dev = virtio::new_single_touch(virtio_dev_socket, 1280, 1024)
-                    .map_err(Error::InputDeviceNew)?;
+                let (single_touch_width, single_touch_height) = cfg
+                    .virtio_single_touch
+                    .as_ref()
+                    .map(|single_touch_spec| single_touch_spec.get_size())
+                    .unwrap_or((gpu_parameters.display_width, gpu_parameters.display_height));
+                let dev = virtio::new_single_touch(
+                    virtio_dev_socket,
+                    single_touch_width,
+                    single_touch_height,
+                )
+                .map_err(Error::InputDeviceNew)?;
                 devs.push(VirtioDeviceStub {
                     dev: Box::new(dev),
                     jail: simple_jail(&cfg, "input_device")?,
@@ -1077,7 +1105,7 @@ fn create_virtio_devices(
 
         let dev = match kind {
             SharedDirKind::FS => create_fs_device(cfg, uid_map, gid_map, src, tag, fs_cfg.clone())?,
-            SharedDirKind::P9 => create_9p_device(cfg, src, tag)?,
+            SharedDirKind::P9 => create_9p_device(cfg, uid_map, gid_map, src, tag)?,
         };
         devs.push(dev);
     }
@@ -1136,7 +1164,7 @@ fn create_devices(
     }
 
     if cfg.null_audio {
-        let server = Box::new(DummyStreamSource::new());
+        let server = Box::new(NullShmStreamSource::new());
         let null_audio = devices::Ac97Dev::new(mem.clone(), server);
 
         pci_devices.push((
@@ -1148,7 +1176,11 @@ fn create_devices(
     let usb_controller = Box::new(XhciController::new(mem.clone(), usb_provider));
     pci_devices.push((usb_controller, simple_jail(&cfg, "xhci")?));
 
-    if cfg.vfio.is_some() {
+    if let Some(vfio_path) = &cfg.vfio {
+        let vfio_container = Arc::new(Mutex::new(
+            VfioContainer::new().map_err(Error::CreateVfioDevice)?,
+        ));
+
         let (vfio_host_socket_irq, vfio_device_socket_irq) =
             msg_socket::pair::<VmIrqResponse, VmIrqRequest>().map_err(Error::CreateSocket)?;
         control_sockets.push(TaggedControlSocket::VmIrq(vfio_host_socket_irq));
@@ -1157,9 +1189,9 @@ fn create_devices(
             msg_socket::pair::<VmMemoryResponse, VmMemoryRequest>().map_err(Error::CreateSocket)?;
         control_sockets.push(TaggedControlSocket::VmMemory(vfio_host_socket_mem));
 
-        let vfio_path = cfg.vfio.as_ref().unwrap().as_path();
-        let vfiodevice =
-            VfioDevice::new(vfio_path, vm, mem.clone()).map_err(Error::CreateVfioDevice)?;
+        let vfio_path = vfio_path.as_path();
+        let vfiodevice = VfioDevice::new(vfio_path, vm, mem, vfio_container.clone())
+            .map_err(Error::CreateVfioDevice)?;
         let vfiopcidevice = Box::new(VfioPciDevice::new(
             vfiodevice,
             vfio_device_socket_irq,
@@ -1314,6 +1346,26 @@ fn runnable_vcpu(vcpu: Vcpu, use_kvm_signals: bool, cpu_id: u32) -> Option<Runna
     }
 }
 
+#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
+fn inject_interrupt(pic: &Arc<Mutex<devices::Pic>>, vcpu: &RunnableVcpu) {
+    let mut pic = pic.lock();
+    if pic.interrupt_requested() && vcpu.ready_for_interrupt() {
+        if let Some(vector) = pic.get_external_interrupt() {
+            if let Err(e) = vcpu.interrupt(vector as u32) {
+                error!("PIC: failed to inject interrupt to vCPU0: {}", e);
+            }
+        }
+        // The second interrupt request should be handled immediately, so ask
+        // the vCPU to exit as soon as possible.
+        if pic.interrupt_requested() {
+            vcpu.request_interrupt_window();
+        }
+    }
+}
+
+#[cfg(any(target_arch = "arm", target_arch = "aarch64"))]
+fn inject_interrupt(pic: &Arc<Mutex<devices::Pic>>, vcpu: &RunnableVcpu) {}
+
 fn run_vcpu(
     vcpu: Vcpu,
     cpu_id: u32,
@@ -1321,6 +1373,7 @@ fn run_vcpu(
     start_barrier: Arc<Barrier>,
     io_bus: devices::Bus,
     mmio_bus: devices::Bus,
+    split_irqchip: Option<(Arc<Mutex<devices::Pic>>, Arc<Mutex<devices::Ioapic>>)>,
     exit_evt: EventFd,
     requires_kvmclock_ctrl: bool,
     run_mode_arc: Arc<VcpuRunMode>,
@@ -1382,6 +1435,13 @@ fn run_vcpu(
                         }) => {
                             mmio_bus.write(address, &data[..size]);
                         }
+                        Ok(VcpuExit::IoapicEoi{vector}) => {
+                            if let Some((_, ioapic)) = &split_irqchip {
+                                ioapic.lock().end_of_interrupt(vector);
+                            } else {
+                                panic!("userspace ioapic not found in split irqchip mode, should be impossible.");
+                            }
+                        },
                         Ok(VcpuExit::Hlt) => break,
                         Ok(VcpuExit::Shutdown) => break,
                         Ok(VcpuExit::FailEntry {
@@ -1437,6 +1497,11 @@ fn run_vcpu(
                             run_mode_lock = run_mode_arc.cvar.wait(run_mode_lock);
                         }
                     }
+
+                    if cpu_id != 0 { continue; }
+                    if let Some((pic, _)) = &split_irqchip {
+                        inject_interrupt(pic, &vcpu);
+                    }
                 }
             }
         })
@@ -1554,10 +1619,15 @@ pub fn run_config(cfg: Config) -> Result<()> {
         msg_socket::pair::<VmMemoryResponse, VmMemoryRequest>().map_err(Error::CreateSocket)?;
     control_sockets.push(TaggedControlSocket::VmMemory(gpu_host_socket));
 
+    let (ioapic_host_socket, ioapic_device_socket) =
+        msg_socket::pair::<VmIrqResponse, VmIrqRequest>().map_err(Error::CreateSocket)?;
+    control_sockets.push(TaggedControlSocket::VmIrq(ioapic_host_socket));
+
     let sandbox = cfg.sandbox;
     let linux = Arch::build_vm(
         components,
         cfg.split_irqchip,
+        ioapic_device_socket,
         &cfg.serial_parameters,
         simple_jail(&cfg, "serial")?,
         |mem, vm, sys_allocator, exit_evt| {
@@ -1623,8 +1693,10 @@ fn run_control(
     #[derive(PollToken)]
     enum Token {
         Exit,
+        Suspend,
         ChildSignal,
         CheckAvailableMemory,
+        IrqFd { gsi: usize },
         LowMemory,
         LowmemTimer,
         VmControlServer,
@@ -1637,6 +1709,7 @@ fn run_control(
 
     let poll_ctx = PollContext::build_with(&[
         (&linux.exit_evt, Token::Exit),
+        (&linux.suspend_evt, Token::Suspend),
         (&sigchld_fd, Token::ChildSignal),
     ])
     .map_err(Error::PollContextAdd)?;
@@ -1674,6 +1747,16 @@ fn run_control(
         .add(&freemem_timer, Token::CheckAvailableMemory)
         .map_err(Error::PollContextAdd)?;
 
+    if let Some(gsi_relay) = &linux.gsi_relay {
+        for (gsi, evt) in gsi_relay.irqfd.into_iter().enumerate() {
+            if let Some(evt) = evt {
+                poll_ctx
+                    .add(evt, Token::IrqFd { gsi })
+                    .map_err(Error::PollContextAdd)?;
+            }
+        }
+    }
+
     // Used to add jitter to timer values so that we don't have a thundering herd problem when
     // multiple VMs are running.
     let mut simple_rng = SimpleRng::new(
@@ -1702,6 +1785,7 @@ fn run_control(
             vcpu_thread_barrier.clone(),
             linux.io_bus.clone(),
             linux.mmio_bus.clone(),
+            linux.split_irqchip.clone(),
             linux.exit_evt.try_clone().map_err(Error::CloneEventFd)?,
             linux.vm.check_extension(Cap::KvmclockCtrl),
             run_mode_arc.clone(),
@@ -1711,6 +1795,7 @@ fn run_control(
     }
     vcpu_thread_barrier.wait();
 
+    let mut ioapic_delayed = Vec::<usize>::default();
     'poll: loop {
         let events = {
             match poll_ctx.wait() {
@@ -1722,6 +1807,26 @@ fn run_control(
             }
         };
 
+        ioapic_delayed.retain(|&gsi| {
+            if let Some((_, ioapic)) = &linux.split_irqchip {
+                if let Ok(mut ioapic) = ioapic.try_lock() {
+                    // The unwrap will never fail because gsi_relay is Some iff split_irqchip is
+                    // Some.
+                    if linux.gsi_relay.as_ref().unwrap().irqfd_resample[gsi].is_some() {
+                        ioapic.service_irq(gsi, true);
+                    } else {
+                        ioapic.service_irq(gsi, true);
+                        ioapic.service_irq(gsi, false);
+                    }
+                    false
+                } else {
+                    true
+                }
+            } else {
+                true
+            }
+        });
+
         let mut vm_control_indices_to_remove = Vec::new();
         for event in events.iter_readable() {
             match event.token() {
@@ -1729,6 +1834,14 @@ fn run_control(
                     info!("vcpu requested shutdown");
                     break 'poll;
                 }
+                Token::Suspend => {
+                    info!("VM requested suspend");
+                    linux.suspend_evt.read().unwrap();
+                    run_mode_arc.set_and_notify(VmRunMode::Suspending);
+                    for handle in &vcpu_handles {
+                        let _ = handle.kill(SIGRTMIN() + 0);
+                    }
+                }
                 Token::ChildSignal => {
                     // Print all available siginfo structs, then exit the loop.
                     while let Some(siginfo) = sigchld_fd.read().map_err(Error::SignalFd)? {
@@ -1777,6 +1890,47 @@ fn run_control(
                         }
                     }
                 }
+                Token::IrqFd { gsi } => {
+                    if let Some((pic, ioapic)) = &linux.split_irqchip {
+                        // The unwrap below will never fail because gsi_relay is Some iff
+                        // split_irqchip is Some.
+                        let gsi_relay = linux.gsi_relay.as_ref().unwrap();
+                        if let Some(eventfd) = &gsi_relay.irqfd[gsi] {
+                            eventfd.read().unwrap();
+                        } else {
+                            warn!(
+                                "irqfd {} not found in GSI relay, should be impossible.",
+                                gsi
+                            );
+                        }
+
+                        let mut pic = pic.lock();
+                        if gsi_relay.irqfd_resample[gsi].is_some() {
+                            pic.service_irq(gsi as u8, true);
+                        } else {
+                            pic.service_irq(gsi as u8, true);
+                            pic.service_irq(gsi as u8, false);
+                        }
+                        if let Err(e) = vcpu_handles[0].kill(SIGRTMIN() + 0) {
+                            warn!("PIC: failed to kick vCPU0: {}", e);
+                        }
+
+                        // When the IOAPIC is configuring its redirection table, we should first
+                        // process its AddMsiRoute request; otherwise we would deadlock.
+                        if let Ok(mut ioapic) = ioapic.try_lock() {
+                            if gsi_relay.irqfd_resample[gsi].is_some() {
+                                ioapic.service_irq(gsi, true);
+                            } else {
+                                ioapic.service_irq(gsi, true);
+                                ioapic.service_irq(gsi, false);
+                            }
+                        } else {
+                            ioapic_delayed.push(gsi);
+                        }
+                    } else {
+                        panic!("split irqchip not found, should be impossible.");
+                    }
+                }
                 Token::LowMemory => {
                     if let Some(low_mem) = &low_mem {
                         let old_balloon_memory = current_balloon_memory;
@@ -1864,6 +2018,17 @@ fn run_control(
                                             VmRunMode::Exiting => {
                                                 break 'poll;
                                             }
+                                            VmRunMode::Running => {
+                                                if let VmRunMode::Suspending =
+                                                    *run_mode_arc.mtx.lock()
+                                                {
+                                                    linux.io_bus.notify_resume();
+                                                }
+                                                run_mode_arc.set_and_notify(VmRunMode::Running);
+                                                for handle in &vcpu_handles {
+                                                    let _ = handle.kill(SIGRTMIN() + 0);
+                                                }
+                                            }
                                             other => {
                                                 run_mode_arc.set_and_notify(other);
                                                 for handle in &vcpu_handles {
@@ -1922,8 +2087,10 @@ fn run_control(
         for event in events.iter_hungup() {
             match event.token() {
                 Token::Exit => {}
+                Token::Suspend => {}
                 Token::ChildSignal => {}
                 Token::CheckAvailableMemory => {}
+                Token::IrqFd { gsi: _ } => {}
                 Token::LowMemory => {}
                 Token::LowmemTimer => {}
                 Token::VmControlServer => {}