diff options
Diffstat (limited to 'src/linux.rs')
-rw-r--r-- | src/linux.rs | 431 |
1 files changed, 299 insertions, 132 deletions
diff --git a/src/linux.rs b/src/linux.rs index 84edf5c..4a87f7d 100644 --- a/src/linux.rs +++ b/src/linux.rs @@ -27,13 +27,13 @@ use std::time::{Duration, SystemTime, UNIX_EPOCH}; use libc::{self, c_int, gid_t, uid_t}; -use audio_streams::DummyStreamSource; +use audio_streams::shm_streams::NullShmStreamSource; #[cfg(feature = "gpu")] use devices::virtio::EventDevice; use devices::virtio::{self, VirtioDevice}; use devices::{ - self, HostBackendDeviceProvider, PciDevice, VfioDevice, VfioPciDevice, VirtioPciDevice, - XhciController, + self, HostBackendDeviceProvider, PciDevice, VfioContainer, VfioDevice, VfioPciDevice, + VirtioPciDevice, XhciController, }; use io_jail::{self, Minijail}; use kvm::*; @@ -63,7 +63,6 @@ use vm_control::{ }; use crate::{Config, DiskOption, Executable, SharedDir, SharedDirKind, TouchDeviceOption}; - use arch::{self, LinuxArch, RunnableLinuxVm, VirtioDeviceStub, VmComponents, VmImage}; #[cfg(any(target_arch = "arm", target_arch = "aarch64"))] @@ -97,7 +96,7 @@ pub enum Error { CreateVfioDevice(devices::vfio::VfioError), DeviceJail(io_jail::Error), DevicePivotRoot(io_jail::Error), - Disk(io::Error), + Disk(PathBuf, io::Error), DiskImageLock(sys_util::Error), DropCapabilities(sys_util::Error), FsDeviceNew(virtio::fs::Error), @@ -184,7 +183,7 @@ impl Display for Error { CreateVfioDevice(e) => write!(f, "Failed to create vfio device {}", e), DeviceJail(e) => write!(f, "failed to jail device: {}", e), DevicePivotRoot(e) => write!(f, "failed to pivot root device: {}", e), - Disk(e) => write!(f, "failed to load disk image: {}", e), + Disk(p, e) => write!(f, "failed to load disk image {}: {}", p.display(), e), DiskImageLock(e) => write!(f, "failed to lock disk image: {}", e), DropCapabilities(e) => write!(f, "failed to drop process capabilities: {}", e), FsDeviceNew(e) => write!(f, "failed to create fs device: {}", e), @@ -301,55 +300,85 @@ fn get_max_open_files() -> Result<libc::rlim64_t> { } } +struct SandboxConfig<'a> { + limit_caps: bool, + log_failures: bool, + seccomp_policy: &'a Path, + uid_map: Option<&'a str>, + gid_map: Option<&'a str>, +} + fn create_base_minijail( root: &Path, - log_failures: bool, - seccomp_policy: &Path, + r_limit: Option<u64>, + config: Option<&SandboxConfig>, ) -> Result<Minijail> { // All child jails run in a new user namespace without any users mapped, // they run as nobody unless otherwise configured. let mut j = Minijail::new().map_err(Error::DeviceJail)?; - j.namespace_pids(); - j.namespace_user(); - j.namespace_user_disable_setgroups(); - // Don't need any capabilities. - j.use_caps(0); - // Create a new mount namespace with an empty root FS. - j.namespace_vfs(); - j.enter_pivot_root(root).map_err(Error::DevicePivotRoot)?; - // Run in an empty network namespace. - j.namespace_net(); - // Most devices don't need to open many fds. - j.set_rlimit(libc::RLIMIT_NOFILE as i32, 1024, 1024) - .map_err(Error::SettingMaxOpenFiles)?; - // Apply the block device seccomp policy. - j.no_new_privs(); - - // By default we'll prioritize using the pre-compiled .bpf over the .policy - // file (the .bpf is expected to be compiled using "trap" as the failure - // behavior instead of the default "kill" behavior). - // Refer to the code comment for the "seccomp-log-failures" - // command-line parameter for an explanation about why the |log_failures| - // flag forces the use of .policy files (and the build-time alternative to - // this run-time flag). - let bpf_policy_file = seccomp_policy.with_extension("bpf"); - if bpf_policy_file.exists() && !log_failures { - j.parse_seccomp_program(&bpf_policy_file) - .map_err(Error::DeviceJail)?; - } else { - // Use TSYNC only for the side effect of it using SECCOMP_RET_TRAP, - // which will correctly kill the entire device process if a worker - // thread commits a seccomp violation. - j.set_seccomp_filter_tsync(); - if log_failures { - j.log_seccomp_filter_failures(); + + if let Some(config) = config { + j.namespace_pids(); + j.namespace_user(); + j.namespace_user_disable_setgroups(); + if config.limit_caps { + // Don't need any capabilities. + j.use_caps(0); } - j.parse_seccomp_filters(&seccomp_policy.with_extension("policy")) - .map_err(Error::DeviceJail)?; + if let Some(uid_map) = config.uid_map { + j.uidmap(uid_map).map_err(Error::SettingUidMap)?; + } + if let Some(gid_map) = config.gid_map { + j.gidmap(gid_map).map_err(Error::SettingGidMap)?; + } + // Run in a new mount namespace. + j.namespace_vfs(); + + // Run in an empty network namespace. + j.namespace_net(); + + // Don't allow the device to gain new privileges. + j.no_new_privs(); + + // By default we'll prioritize using the pre-compiled .bpf over the .policy + // file (the .bpf is expected to be compiled using "trap" as the failure + // behavior instead of the default "kill" behavior). + // Refer to the code comment for the "seccomp-log-failures" + // command-line parameter for an explanation about why the |log_failures| + // flag forces the use of .policy files (and the build-time alternative to + // this run-time flag). + let bpf_policy_file = config.seccomp_policy.with_extension("bpf"); + if bpf_policy_file.exists() && !config.log_failures { + j.parse_seccomp_program(&bpf_policy_file) + .map_err(Error::DeviceJail)?; + } else { + // Use TSYNC only for the side effect of it using SECCOMP_RET_TRAP, + // which will correctly kill the entire device process if a worker + // thread commits a seccomp violation. + j.set_seccomp_filter_tsync(); + if config.log_failures { + j.log_seccomp_filter_failures(); + } + j.parse_seccomp_filters(&config.seccomp_policy.with_extension("policy")) + .map_err(Error::DeviceJail)?; + } + j.use_seccomp_filter(); + // Don't do init setup. + j.run_as_init(); + } + + // Only pivot_root if we are not re-using the current root directory. + if root != Path::new("/") { + // It's safe to call `namespace_vfs` multiple times. + j.namespace_vfs(); + j.enter_pivot_root(root).map_err(Error::DevicePivotRoot)?; } - j.use_seccomp_filter(); - // Don't do init setup. - j.run_as_init(); + + // Most devices don't need to open many fds. + let limit = if let Some(r) = r_limit { r } else { 1024u64 }; + j.set_rlimit(libc::RLIMIT_NOFILE as i32, limit, limit) + .map_err(Error::SettingMaxOpenFiles)?; + Ok(j) } @@ -362,11 +391,14 @@ fn simple_jail(cfg: &Config, policy: &str) -> Result<Option<Minijail>> { return Err(Error::PivotRootDoesntExist(pivot_root)); } let policy_path: PathBuf = cfg.seccomp_policy_dir.join(policy); - Ok(Some(create_base_minijail( - root_path, - cfg.seccomp_log_failures, - &policy_path, - )?)) + let config = SandboxConfig { + limit_caps: true, + log_failures: cfg.seccomp_log_failures, + seccomp_policy: &policy_path, + uid_map: None, + gid_map: None, + }; + Ok(Some(create_base_minijail(root_path, None, Some(&config))?)) } else { Ok(None) } @@ -388,7 +420,7 @@ fn create_block_device( .read(true) .write(!disk.read_only) .open(&disk.path) - .map_err(Error::Disk)? + .map_err(|e| Error::Disk(disk.path.to_path_buf(), e))? }; // Lock the disk image to prevent other crosvm instances from using it. let lock_op = if disk.read_only { @@ -473,13 +505,16 @@ fn create_tpm_device(cfg: &Config) -> DeviceResult { } fn create_single_touch_device(cfg: &Config, single_touch_spec: &TouchDeviceOption) -> DeviceResult { - let socket = single_touch_spec.path.into_unix_stream().map_err(|e| { - error!("failed configuring virtio single touch: {:?}", e); - e - })?; - - let dev = virtio::new_single_touch(socket, single_touch_spec.width, single_touch_spec.height) - .map_err(Error::InputDeviceNew)?; + let socket = single_touch_spec + .get_path() + .into_unix_stream() + .map_err(|e| { + error!("failed configuring virtio single touch: {:?}", e); + e + })?; + + let (width, height) = single_touch_spec.get_size(); + let dev = virtio::new_single_touch(socket, width, height).map_err(Error::InputDeviceNew)?; Ok(VirtioDeviceStub { dev: Box::new(dev), jail: simple_jail(&cfg, "input_device")?, @@ -487,13 +522,13 @@ fn create_single_touch_device(cfg: &Config, single_touch_spec: &TouchDeviceOptio } fn create_trackpad_device(cfg: &Config, trackpad_spec: &TouchDeviceOption) -> DeviceResult { - let socket = trackpad_spec.path.into_unix_stream().map_err(|e| { + let socket = trackpad_spec.get_path().into_unix_stream().map_err(|e| { error!("failed configuring virtio trackpad: {}", e); e })?; - let dev = virtio::new_trackpad(socket, trackpad_spec.width, trackpad_spec.height) - .map_err(Error::InputDeviceNew)?; + let (width, height) = trackpad_spec.get_size(); + let dev = virtio::new_trackpad(socket, width, height).map_err(Error::InputDeviceNew)?; Ok(VirtioDeviceStub { dev: Box::new(dev), @@ -768,45 +803,20 @@ fn create_fs_device( tag: &str, fs_cfg: virtio::fs::passthrough::Config, ) -> DeviceResult { - let mut j = Minijail::new().map_err(Error::DeviceJail)?; - - if cfg.sandbox { - j.namespace_pids(); - j.namespace_user(); - j.namespace_user_disable_setgroups(); - j.uidmap(uid_map).map_err(Error::SettingUidMap)?; - j.gidmap(gid_map).map_err(Error::SettingGidMap)?; - - // Run in an empty network namespace. - j.namespace_net(); - - j.no_new_privs(); - - // Use TSYNC only for the side effect of it using SECCOMP_RET_TRAP, which will correctly kill - // the entire device process if a worker thread commits a seccomp violation. - let seccomp_policy = cfg.seccomp_policy_dir.join("fs_device"); - j.set_seccomp_filter_tsync(); - if cfg.seccomp_log_failures { - j.log_seccomp_filter_failures(); - } - j.parse_seccomp_filters(&seccomp_policy) - .map_err(Error::DeviceJail)?; - j.use_seccomp_filter(); - - // Don't do init setup. - j.run_as_init(); - } - - // Create a new mount namespace with the source directory as the root. We need this even when - // sandboxing is disabled as the server relies on the host kernel to prevent path traversals - // from leaking out of the shared directory. - j.namespace_vfs(); - j.enter_pivot_root(src).map_err(Error::DevicePivotRoot)?; - - // The file server opens a lot of fds and needs a really high open file limit. let max_open_files = get_max_open_files()?; - j.set_rlimit(libc::RLIMIT_NOFILE as i32, max_open_files, max_open_files) - .map_err(Error::SettingMaxOpenFiles)?; + let j = if cfg.sandbox { + let seccomp_policy = cfg.seccomp_policy_dir.join("fs_device"); + let config = SandboxConfig { + limit_caps: false, + uid_map: Some(uid_map), + gid_map: Some(gid_map), + log_failures: cfg.seccomp_log_failures, + seccomp_policy: &seccomp_policy, + }; + create_base_minijail(src, Some(max_open_files), Some(&config))? + } else { + create_base_minijail(src, Some(max_open_files), None)? + }; // TODO(chirantan): Use more than one worker once the kernel driver has been fixed to not panic // when num_queues > 1. @@ -818,25 +828,36 @@ fn create_fs_device( }) } -fn create_9p_device(cfg: &Config, src: &Path, tag: &str) -> DeviceResult { - let (jail, root) = match simple_jail(&cfg, "9p_device")? { - Some(mut jail) => { - // The shared directory becomes the root of the device's file system. - let root = Path::new("/"); - jail.mount_bind(src, root, true)?; +fn create_9p_device( + cfg: &Config, + uid_map: &str, + gid_map: &str, + src: &Path, + tag: &str, +) -> DeviceResult { + let max_open_files = get_max_open_files()?; + let (jail, root) = if cfg.sandbox { + let seccomp_policy = cfg.seccomp_policy_dir.join("9p_device"); + let config = SandboxConfig { + limit_caps: false, + uid_map: Some(uid_map), + gid_map: Some(gid_map), + log_failures: cfg.seccomp_log_failures, + seccomp_policy: &seccomp_policy, + }; - // We want bind mounts from the parent namespaces to propagate into the 9p server's - // namespace. - jail.set_remount_mode(libc::MS_SLAVE); + let mut jail = create_base_minijail(src, Some(max_open_files), Some(&config))?; + // We want bind mounts from the parent namespaces to propagate into the 9p server's + // namespace. + jail.set_remount_mode(libc::MS_SLAVE); - add_crosvm_user_to_jail(&mut jail, "p9")?; - (Some(jail), root) - } - None => { - // There's no bind mount so we tell the server to treat the source directory as the - // root. - (None, src) - } + // The shared directory becomes the root of the device's file system. + let root = Path::new("/"); + (Some(jail), root) + } else { + // There's no mount namespace so we tell the server to treat the source directory as the + // root. + (None, src) }; let dev = virtio::P9::new(root, tag).map_err(Error::P9DeviceNew)?; @@ -858,10 +879,11 @@ fn create_pmem_device( .read(true) .write(!disk.read_only) .open(&disk.path) - .map_err(Error::Disk)?; + .map_err(|e| Error::Disk(disk.path.to_path_buf(), e))?; let (disk_size, arena_size) = { - let metadata = std::fs::metadata(&disk.path).map_err(Error::Disk)?; + let metadata = + std::fs::metadata(&disk.path).map_err(|e| Error::Disk(disk.path.to_path_buf(), e))?; let disk_len = metadata.len(); // Linux requires pmem region sizes to be 2 MiB aligned. Linux will fill any partial page // at the end of an mmap'd file and won't write back beyond the actual file length, but if @@ -1022,16 +1044,22 @@ fn create_virtio_devices( #[cfg(feature = "gpu")] { - if cfg.gpu_parameters.is_some() { + if let Some(gpu_parameters) = &cfg.gpu_parameters { let mut event_devices = Vec::new(); if cfg.display_window_mouse { let (event_device_socket, virtio_dev_socket) = UnixStream::pair().map_err(Error::CreateSocket)?; - // TODO(nkgold): the width/height here should match the display's height/width. When - // those settings are available as CLI options, we should use the CLI options here - // as well. - let dev = virtio::new_single_touch(virtio_dev_socket, 1280, 1024) - .map_err(Error::InputDeviceNew)?; + let (single_touch_width, single_touch_height) = cfg + .virtio_single_touch + .as_ref() + .map(|single_touch_spec| single_touch_spec.get_size()) + .unwrap_or((gpu_parameters.display_width, gpu_parameters.display_height)); + let dev = virtio::new_single_touch( + virtio_dev_socket, + single_touch_width, + single_touch_height, + ) + .map_err(Error::InputDeviceNew)?; devs.push(VirtioDeviceStub { dev: Box::new(dev), jail: simple_jail(&cfg, "input_device")?, @@ -1077,7 +1105,7 @@ fn create_virtio_devices( let dev = match kind { SharedDirKind::FS => create_fs_device(cfg, uid_map, gid_map, src, tag, fs_cfg.clone())?, - SharedDirKind::P9 => create_9p_device(cfg, src, tag)?, + SharedDirKind::P9 => create_9p_device(cfg, uid_map, gid_map, src, tag)?, }; devs.push(dev); } @@ -1136,7 +1164,7 @@ fn create_devices( } if cfg.null_audio { - let server = Box::new(DummyStreamSource::new()); + let server = Box::new(NullShmStreamSource::new()); let null_audio = devices::Ac97Dev::new(mem.clone(), server); pci_devices.push(( @@ -1148,7 +1176,11 @@ fn create_devices( let usb_controller = Box::new(XhciController::new(mem.clone(), usb_provider)); pci_devices.push((usb_controller, simple_jail(&cfg, "xhci")?)); - if cfg.vfio.is_some() { + if let Some(vfio_path) = &cfg.vfio { + let vfio_container = Arc::new(Mutex::new( + VfioContainer::new().map_err(Error::CreateVfioDevice)?, + )); + let (vfio_host_socket_irq, vfio_device_socket_irq) = msg_socket::pair::<VmIrqResponse, VmIrqRequest>().map_err(Error::CreateSocket)?; control_sockets.push(TaggedControlSocket::VmIrq(vfio_host_socket_irq)); @@ -1157,9 +1189,9 @@ fn create_devices( msg_socket::pair::<VmMemoryResponse, VmMemoryRequest>().map_err(Error::CreateSocket)?; control_sockets.push(TaggedControlSocket::VmMemory(vfio_host_socket_mem)); - let vfio_path = cfg.vfio.as_ref().unwrap().as_path(); - let vfiodevice = - VfioDevice::new(vfio_path, vm, mem.clone()).map_err(Error::CreateVfioDevice)?; + let vfio_path = vfio_path.as_path(); + let vfiodevice = VfioDevice::new(vfio_path, vm, mem, vfio_container.clone()) + .map_err(Error::CreateVfioDevice)?; let vfiopcidevice = Box::new(VfioPciDevice::new( vfiodevice, vfio_device_socket_irq, @@ -1314,6 +1346,26 @@ fn runnable_vcpu(vcpu: Vcpu, use_kvm_signals: bool, cpu_id: u32) -> Option<Runna } } +#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] +fn inject_interrupt(pic: &Arc<Mutex<devices::Pic>>, vcpu: &RunnableVcpu) { + let mut pic = pic.lock(); + if pic.interrupt_requested() && vcpu.ready_for_interrupt() { + if let Some(vector) = pic.get_external_interrupt() { + if let Err(e) = vcpu.interrupt(vector as u32) { + error!("PIC: failed to inject interrupt to vCPU0: {}", e); + } + } + // The second interrupt request should be handled immediately, so ask + // vCPU to exit as soon as possible. + if pic.interrupt_requested() { + vcpu.request_interrupt_window(); + } + } +} + +#[cfg(any(target_arch = "arm", target_arch = "aarch64"))] +fn inject_interrupt(pic: &Arc<Mutex<devices::Pic>>, vcpu: &RunnableVcpu) {} + fn run_vcpu( vcpu: Vcpu, cpu_id: u32, @@ -1321,6 +1373,7 @@ fn run_vcpu( start_barrier: Arc<Barrier>, io_bus: devices::Bus, mmio_bus: devices::Bus, + split_irqchip: Option<(Arc<Mutex<devices::Pic>>, Arc<Mutex<devices::Ioapic>>)>, exit_evt: EventFd, requires_kvmclock_ctrl: bool, run_mode_arc: Arc<VcpuRunMode>, @@ -1382,6 +1435,13 @@ fn run_vcpu( }) => { mmio_bus.write(address, &data[..size]); } + Ok(VcpuExit::IoapicEoi{vector}) => { + if let Some((_, ioapic)) = &split_irqchip { + ioapic.lock().end_of_interrupt(vector); + } else { + panic!("userspace ioapic not found in split irqchip mode, should be impossible."); + } + }, Ok(VcpuExit::Hlt) => break, Ok(VcpuExit::Shutdown) => break, Ok(VcpuExit::FailEntry { @@ -1437,6 +1497,11 @@ fn run_vcpu( run_mode_lock = run_mode_arc.cvar.wait(run_mode_lock); } } + + if cpu_id != 0 { continue; } + if let Some((pic, _)) = &split_irqchip { + inject_interrupt(pic, &vcpu); + } } } }) @@ -1554,10 +1619,15 @@ pub fn run_config(cfg: Config) -> Result<()> { msg_socket::pair::<VmMemoryResponse, VmMemoryRequest>().map_err(Error::CreateSocket)?; control_sockets.push(TaggedControlSocket::VmMemory(gpu_host_socket)); + let (ioapic_host_socket, ioapic_device_socket) = + msg_socket::pair::<VmIrqResponse, VmIrqRequest>().map_err(Error::CreateSocket)?; + control_sockets.push(TaggedControlSocket::VmIrq(ioapic_host_socket)); + let sandbox = cfg.sandbox; let linux = Arch::build_vm( components, cfg.split_irqchip, + ioapic_device_socket, &cfg.serial_parameters, simple_jail(&cfg, "serial")?, |mem, vm, sys_allocator, exit_evt| { @@ -1623,8 +1693,10 @@ fn run_control( #[derive(PollToken)] enum Token { Exit, + Suspend, ChildSignal, CheckAvailableMemory, + IrqFd { gsi: usize }, LowMemory, LowmemTimer, VmControlServer, @@ -1637,6 +1709,7 @@ fn run_control( let poll_ctx = PollContext::build_with(&[ (&linux.exit_evt, Token::Exit), + (&linux.suspend_evt, Token::Suspend), (&sigchld_fd, Token::ChildSignal), ]) .map_err(Error::PollContextAdd)?; @@ -1674,6 +1747,16 @@ fn run_control( .add(&freemem_timer, Token::CheckAvailableMemory) .map_err(Error::PollContextAdd)?; + if let Some(gsi_relay) = &linux.gsi_relay { + for (gsi, evt) in gsi_relay.irqfd.into_iter().enumerate() { + if let Some(evt) = evt { + poll_ctx + .add(evt, Token::IrqFd { gsi }) + .map_err(Error::PollContextAdd)?; + } + } + } + // Used to add jitter to timer values so that we don't have a thundering herd problem when // multiple VMs are running. let mut simple_rng = SimpleRng::new( @@ -1702,6 +1785,7 @@ fn run_control( vcpu_thread_barrier.clone(), linux.io_bus.clone(), linux.mmio_bus.clone(), + linux.split_irqchip.clone(), linux.exit_evt.try_clone().map_err(Error::CloneEventFd)?, linux.vm.check_extension(Cap::KvmclockCtrl), run_mode_arc.clone(), @@ -1711,6 +1795,7 @@ fn run_control( } vcpu_thread_barrier.wait(); + let mut ioapic_delayed = Vec::<usize>::default(); 'poll: loop { let events = { match poll_ctx.wait() { @@ -1722,6 +1807,26 @@ fn run_control( } }; + ioapic_delayed.retain(|&gsi| { + if let Some((_, ioapic)) = &linux.split_irqchip { + if let Ok(mut ioapic) = ioapic.try_lock() { + // The unwrap will never fail because gsi_relay is Some iff split_irqchip is + // Some. + if linux.gsi_relay.as_ref().unwrap().irqfd_resample[gsi].is_some() { + ioapic.service_irq(gsi, true); + } else { + ioapic.service_irq(gsi, true); + ioapic.service_irq(gsi, false); + } + false + } else { + true + } + } else { + true + } + }); + let mut vm_control_indices_to_remove = Vec::new(); for event in events.iter_readable() { match event.token() { @@ -1729,6 +1834,14 @@ fn run_control( info!("vcpu requested shutdown"); break 'poll; } + Token::Suspend => { + info!("VM requested suspend"); + linux.suspend_evt.read().unwrap(); + run_mode_arc.set_and_notify(VmRunMode::Suspending); + for handle in &vcpu_handles { + let _ = handle.kill(SIGRTMIN() + 0); + } + } Token::ChildSignal => { // Print all available siginfo structs, then exit the loop. while let Some(siginfo) = sigchld_fd.read().map_err(Error::SignalFd)? { @@ -1777,6 +1890,47 @@ fn run_control( } } } + Token::IrqFd { gsi } => { + if let Some((pic, ioapic)) = &linux.split_irqchip { + // This will never fail because gsi_relay is Some iff split_irqchip is + // Some. + let gsi_relay = linux.gsi_relay.as_ref().unwrap(); + if let Some(eventfd) = &gsi_relay.irqfd[gsi] { + eventfd.read().unwrap(); + } else { + warn!( + "irqfd {} not found in GSI relay, should be impossible.", + gsi + ); + } + + let mut pic = pic.lock(); + if gsi_relay.irqfd_resample[gsi].is_some() { + pic.service_irq(gsi as u8, true); + } else { + pic.service_irq(gsi as u8, true); + pic.service_irq(gsi as u8, false); + } + if let Err(e) = vcpu_handles[0].kill(SIGRTMIN() + 0) { + warn!("PIC: failed to kick vCPU0: {}", e); + } + + // When IOAPIC is configuring its redirection table, we should first + // process its AddMsiRoute request, otherwise we would deadlock. + if let Ok(mut ioapic) = ioapic.try_lock() { + if gsi_relay.irqfd_resample[gsi].is_some() { + ioapic.service_irq(gsi, true); + } else { + ioapic.service_irq(gsi, true); + ioapic.service_irq(gsi, false); + } + } else { + ioapic_delayed.push(gsi); + } + } else { + panic!("split irqchip not found, should be impossible."); + } + } Token::LowMemory => { if let Some(low_mem) = &low_mem { let old_balloon_memory = current_balloon_memory; @@ -1864,6 +2018,17 @@ fn run_control( VmRunMode::Exiting => { break 'poll; } + VmRunMode::Running => { + if let VmRunMode::Suspending = + *run_mode_arc.mtx.lock() + { + linux.io_bus.notify_resume(); + } + run_mode_arc.set_and_notify(VmRunMode::Running); + for handle in &vcpu_handles { + let _ = handle.kill(SIGRTMIN() + 0); + } + } other => { run_mode_arc.set_and_notify(other); for handle in &vcpu_handles { @@ -1922,8 +2087,10 @@ fn run_control( for event in events.iter_hungup() { match event.token() { Token::Exit => {} + Token::Suspend => {} Token::ChildSignal => {} Token::CheckAvailableMemory => {} + Token::IrqFd { gsi: _ } => {} Token::LowMemory => {} Token::LowmemTimer => {} Token::VmControlServer => {} |