summary refs log tree commit diff
diff options
context:
space:
mode:
-rw-r--r--devices/src/pci/vfio_pci.rs106
-rw-r--r--devices/src/vfio.rs131
-rw-r--r--src/linux.rs10
-rw-r--r--vfio_sys/src/vfio.rs6
-rw-r--r--vm_control/src/lib.rs22
5 files changed, 270 insertions, 5 deletions
diff --git a/devices/src/pci/vfio_pci.rs b/devices/src/pci/vfio_pci.rs
index fd1b3ca..07c7b60 100644
--- a/devices/src/pci/vfio_pci.rs
+++ b/devices/src/pci/vfio_pci.rs
@@ -9,10 +9,13 @@ use std::u32;
 use kvm::Datamatch;
 use msg_socket::{MsgReceiver, MsgSender};
 use resources::{Alloc, MmioType, SystemAllocator};
-use sys_util::{error, EventFd};
+use sys_util::{error, EventFd, MemoryMapping};
 
 use vfio_sys::*;
-use vm_control::{MaybeOwnedFd, VmIrqRequest, VmIrqRequestSocket, VmIrqResponse};
+use vm_control::{
+    MaybeOwnedFd, VmIrqRequest, VmIrqRequestSocket, VmIrqResponse, VmMemoryControlRequestSocket,
+    VmMemoryRequest, VmMemoryResponse,
+};
 
 use crate::pci::pci_device::{Error as PciDeviceError, PciDevice};
 use crate::pci::PciInterruptPin;
@@ -306,11 +309,19 @@ pub struct VfioPciDevice {
     io_regions: Vec<IoInfo>,
     msi_cap: Option<VfioMsiCap>,
     irq_type: Option<VfioIrqType>,
+    vm_socket_mem: VmMemoryControlRequestSocket,
+
+    // scratch MemoryMapping to avoid unmap before vm exit
+    mem: Vec<MemoryMapping>,
 }
 
 impl VfioPciDevice {
     /// Constructs a new Vfio Pci device for the give Vfio device
-    pub fn new(device: VfioDevice, vfio_device_socket_irq: VmIrqRequestSocket) -> Self {
+    pub fn new(
+        device: VfioDevice,
+        vfio_device_socket_irq: VmIrqRequestSocket,
+        vfio_device_socket_mem: VmMemoryControlRequestSocket,
+    ) -> Self {
         let dev = Arc::new(device);
         let config = VfioPciConfig::new(Arc::clone(&dev));
         let msi_cap = VfioMsiCap::new(&config, vfio_device_socket_irq);
@@ -325,6 +336,8 @@ impl VfioPciDevice {
             io_regions: Vec::new(),
             msi_cap,
             irq_type: None,
+            vm_socket_mem: vfio_device_socket_mem,
+            mem: Vec::new(),
         }
     }
 
@@ -421,8 +434,85 @@ impl VfioPciDevice {
 
         self.enable_intx();
     }
+
+    fn add_bar_mmap(&self, index: u32, bar_addr: u64) -> Vec<MemoryMapping> {
+        let mut mem_map: Vec<MemoryMapping> = Vec::new();
+        if self.device.get_region_flags(index) & VFIO_REGION_INFO_FLAG_MMAP != 0 {
+            let mmaps = self.device.get_region_mmap(index);
+            if mmaps.is_empty() {
+                return mem_map;
+            }
+
+            for mmap in mmaps.iter() {
+                let mmap_offset = mmap.offset;
+                let mmap_size = mmap.size;
+                let guest_map_start = bar_addr + mmap_offset;
+                let region_offset = self.device.get_region_offset(index);
+                let offset: usize = (region_offset + mmap_offset) as usize;
+                if self
+                    .vm_socket_mem
+                    .send(&VmMemoryRequest::RegisterMmapMemory {
+                        fd: MaybeOwnedFd::Borrowed(self.device.as_raw_fd()),
+                        size: mmap_size as usize,
+                        offset,
+                        gpa: guest_map_start,
+                    })
+                    .is_err()
+                {
+                    break;
+                }
+
+                let response = match self.vm_socket_mem.recv() {
+                    Ok(res) => res,
+                    Err(_) => break,
+                };
+                match response {
+                    VmMemoryResponse::Ok => {
+                        // Even though the vm has mapped this region, that mapping lives in the vm
+                        // main process; the device process doesn't have it, but vfio_dma_map()
+                        // needs it in the device process, so map it again here.
+                        let mmap = match MemoryMapping::from_fd_offset(
+                            self.device.as_ref(),
+                            mmap_size as usize,
+                            offset,
+                        ) {
+                            Ok(v) => v,
+                            Err(_e) => break,
+                        };
+                        let host = (&mmap).as_ptr() as u64;
+                        // Safe because the given guest_map_start is a valid guest bar address, and
+                        // the host pointer is correct and valid, as guaranteed by MemoryMapping.
+                        match unsafe { self.device.vfio_dma_map(guest_map_start, mmap_size, host) }
+                        {
+                            Ok(_) => mem_map.push(mmap),
+                            Err(e) => {
+                                error!(
+                                    "{}, index: {}, bar_addr:0x{:x}, host:0x{:x}",
+                                    e, index, bar_addr, host
+                                );
+                                break;
+                            }
+                        }
+                    }
+                    _ => break,
+                }
+            }
+        }
+
+        mem_map
+    }
+
+    fn enable_bars_mmap(&mut self) {
+        for mmio_info in self.mmio_regions.iter() {
+            let mut mem_map = self.add_bar_mmap(mmio_info.bar_index, mmio_info.start);
+            self.mem.append(&mut mem_map);
+        }
+    }
 }
 
+const PCI_COMMAND: u8 = 0x4;
+const PCI_COMMAND_MEMORY: u8 = 0x2;
+
 impl PciDevice for VfioPciDevice {
     fn debug_label(&self) -> String {
         "vfio pci device".to_string()
@@ -443,6 +533,7 @@ impl PciDevice for VfioPciDevice {
         if let Some(msi_cap) = &self.msi_cap {
             fds.push(msi_cap.get_vm_socket());
         }
+        fds.push(self.vm_socket_mem.as_raw_fd());
         fds
     }
 
@@ -601,6 +692,15 @@ impl PciDevice for VfioPciDevice {
             None => (),
         }
 
+        // If the guest enables memory access, enable bar mmap once.
+        if start == PCI_COMMAND as u64
+            && data.len() == 2
+            && data[0] & PCI_COMMAND_MEMORY == PCI_COMMAND_MEMORY
+            && self.mem.is_empty()
+        {
+            self.enable_bars_mmap();
+        }
+
         self.device
             .region_write(VFIO_PCI_CONFIG_REGION_INDEX, data, start);
     }
diff --git a/devices/src/vfio.rs b/devices/src/vfio.rs
index d376cf9..c42c622 100644
--- a/devices/src/vfio.rs
+++ b/devices/src/vfio.rs
@@ -281,15 +281,20 @@ pub enum VfioIrqType {
 }
 
 struct VfioRegion {
+    // flags for this region: read/write/mmap
     flags: u32,
     size: u64,
+    // region offset used to read/write with vfio device fd
     offset: u64,
+    // vectors for mmap offset and size
+    mmaps: Vec<vfio_region_sparse_mmap_area>,
 }
 
 /// Vfio device for exposing regions which could be read/write to kernel vfio device.
 pub struct VfioDevice {
     dev: File,
     group: VfioGroup,
+    // vec for vfio device's regions
     regions: Vec<VfioRegion>,
     guest_mem: GuestMemory,
 }
@@ -453,6 +458,7 @@ impl VfioDevice {
         }
     }
 
+    #[allow(clippy::cast_ptr_alignment)]
     fn get_regions(dev: &File) -> Result<Vec<VfioRegion>, VfioError> {
         let mut regions: Vec<VfioRegion> = Vec::new();
         let mut dev_info = vfio_device_info {
@@ -473,8 +479,9 @@ impl VfioDevice {
         }
 
         for i in VFIO_PCI_BAR0_REGION_INDEX..dev_info.num_regions {
+            let argsz = mem::size_of::<vfio_region_info>() as u32;
             let mut reg_info = vfio_region_info {
-                argsz: mem::size_of::<vfio_region_info>() as u32,
+                argsz,
                 flags: 0,
                 index: i,
                 cap_offset: 0,
@@ -488,10 +495,93 @@ impl VfioDevice {
                 continue;
             }
 
+            let mut mmaps: Vec<vfio_region_sparse_mmap_area> = Vec::new();
+            if reg_info.argsz > argsz {
+                let cap_len: usize = (reg_info.argsz - argsz) as usize;
+                let mut region_with_cap =
+                    vec_with_array_field::<vfio_region_info_with_cap, u8>(cap_len);
+                region_with_cap[0].region_info.argsz = reg_info.argsz;
+                region_with_cap[0].region_info.flags = 0;
+                region_with_cap[0].region_info.index = i;
+                region_with_cap[0].region_info.cap_offset = 0;
+                region_with_cap[0].region_info.size = 0;
+                region_with_cap[0].region_info.offset = 0;
+                // Safe as we are the owner of dev and region_info which are valid value,
+                // and we verify the return value.
+                ret = unsafe {
+                    ioctl_with_mut_ref(
+                        dev,
+                        VFIO_DEVICE_GET_REGION_INFO(),
+                        &mut (region_with_cap[0].region_info),
+                    )
+                };
+                if ret < 0 {
+                    return Err(VfioError::VfioDeviceGetRegionInfo(get_error()));
+                }
+
+                if region_with_cap[0].region_info.flags & VFIO_REGION_INFO_FLAG_CAPS == 0 {
+                    continue;
+                }
+
+                let cap_header_sz = mem::size_of::<vfio_info_cap_header>() as u32;
+                let mmap_cap_sz = mem::size_of::<vfio_region_info_cap_sparse_mmap>() as u32;
+                let mmap_area_sz = mem::size_of::<vfio_region_sparse_mmap_area>() as u32;
+                let region_info_sz = reg_info.argsz;
+
+                // region_with_cap[0].cap_info may contain many structures, like
+                // vfio_region_info_cap_sparse_mmap struct or vfio_region_info_cap_type struct.
+                // Both of them begin with vfio_info_cap_header, so we will get individual cap from
+                // vfio_info_cap_header.
+                // Go through all the cap structs.
+                let info_ptr = region_with_cap.as_ptr() as *mut u8;
+                let mut offset = region_with_cap[0].region_info.cap_offset;
+                while offset != 0 {
+                    if offset + cap_header_sz >= region_info_sz {
+                        break;
+                    }
+                    // Safe, as the cap_header struct is inside the region_with_cap vec allocated
+                    // by this function.
+                    let cap_ptr = unsafe { info_ptr.offset(offset as isize) };
+                    let cap_header =
+                        unsafe { &*(cap_ptr as *mut u8 as *const vfio_info_cap_header) };
+                    if cap_header.id as u32 == VFIO_REGION_INFO_CAP_SPARSE_MMAP {
+                        if offset + mmap_cap_sz >= region_info_sz {
+                            break;
+                        }
+                        // cap_ptr is vfio_region_info_cap_sparse_mmap here
+                        // Safe, as this vfio_region_info_cap_sparse_mmap is inside the
+                        // region_with_cap vec allocated by this function.
+                        let sparse_mmap = unsafe {
+                            &*(cap_ptr as *mut u8 as *const vfio_region_info_cap_sparse_mmap)
+                        };
+
+                        let area_num = sparse_mmap.nr_areas;
+                        if offset + mmap_cap_sz + area_num * mmap_area_sz > region_info_sz {
+                            break;
+                        }
+                        // Safe, as these vfio_region_sparse_mmap_area entries are inside the
+                        // region_with_cap vec allocated by this function.
+                        let areas =
+                            unsafe { sparse_mmap.areas.as_slice(sparse_mmap.nr_areas as usize) };
+                        for area in areas.iter() {
+                            mmaps.push(area.clone());
+                        }
+                    }
+
+                    offset = cap_header.next;
+                }
+            } else if reg_info.flags & VFIO_REGION_INFO_FLAG_MMAP != 0 {
+                mmaps.push(vfio_region_sparse_mmap_area {
+                    offset: 0,
+                    size: reg_info.size,
+                });
+            }
+
             let region = VfioRegion {
                 flags: reg_info.flags,
                 size: reg_info.size,
                 offset: reg_info.offset,
+                mmaps,
             };
             regions.push(region);
         }
@@ -499,6 +589,45 @@ impl VfioDevice {
         Ok(regions)
     }
 
+    /// get a region's flags
+    /// the returned value may contain:
+    ///     VFIO_REGION_INFO_FLAG_READ:  region supports read
+    ///     VFIO_REGION_INFO_FLAG_WRITE: region supports write
+    ///     VFIO_REGION_INFO_FLAG_MMAP:  region supports mmap
+    ///     VFIO_REGION_INFO_FLAG_CAPS:  region's info supports caps
+    pub fn get_region_flags(&self, index: u32) -> u32 {
+        match self.regions.get(index as usize) {
+            Some(v) => v.flags,
+            None => {
+                warn!("get_region_flags() with invalid index: {}", index);
+                0
+            }
+        }
+    }
+
+    /// get a region's offset
+    /// return: Region offset from the start of vfio device fd
+    pub fn get_region_offset(&self, index: u32) -> u64 {
+        match self.regions.get(index as usize) {
+            Some(v) => v.offset,
+            None => {
+                warn!("get_region_offset with invalid index: {}", index);
+                0
+            }
+        }
+    }
+
+    /// get a region's mmap info vector
+    pub fn get_region_mmap(&self, index: u32) -> Vec<vfio_region_sparse_mmap_area> {
+        match self.regions.get(index as usize) {
+            Some(v) => v.mmaps.clone(),
+            None => {
+                warn!("get_region_mmap with invalid index: {}", index);
+                Vec::new()
+            }
+        }
+    }
+
     /// Read region's data from VFIO device into buf
     /// index: region num
     /// buf: data destination and buf length is read size
diff --git a/src/linux.rs b/src/linux.rs
index 9b984d1..7257520 100644
--- a/src/linux.rs
+++ b/src/linux.rs
@@ -1084,10 +1084,18 @@ fn create_devices(
             msg_socket::pair::<VmIrqResponse, VmIrqRequest>().map_err(Error::CreateSocket)?;
         control_sockets.push(TaggedControlSocket::VmIrq(vfio_host_socket_irq));
 
+        let (vfio_host_socket_mem, vfio_device_socket_mem) =
+            msg_socket::pair::<VmMemoryResponse, VmMemoryRequest>().map_err(Error::CreateSocket)?;
+        control_sockets.push(TaggedControlSocket::VmMemory(vfio_host_socket_mem));
+
         let vfio_path = cfg.vfio.as_ref().unwrap().as_path();
         let vfiodevice =
             VfioDevice::new(vfio_path, vm, mem.clone()).map_err(Error::CreateVfioDevice)?;
-        let vfiopcidevice = Box::new(VfioPciDevice::new(vfiodevice, vfio_device_socket_irq));
+        let vfiopcidevice = Box::new(VfioPciDevice::new(
+            vfiodevice,
+            vfio_device_socket_irq,
+            vfio_device_socket_mem,
+        ));
         pci_devices.push((vfiopcidevice, simple_jail(&cfg, "vfio_device.policy")?));
     }
 
diff --git a/vfio_sys/src/vfio.rs b/vfio_sys/src/vfio.rs
index 3b88848..622b5db 100644
--- a/vfio_sys/src/vfio.rs
+++ b/vfio_sys/src/vfio.rs
@@ -225,6 +225,12 @@ pub struct vfio_region_info {
     pub offset: __u64,
 }
 #[repr(C)]
+#[derive(Debug, Default)]
+pub struct vfio_region_info_with_cap {
+    pub region_info: vfio_region_info,
+    pub cap_info: __IncompleteArrayField<u8>,
+}
+#[repr(C)]
 #[derive(Debug, Default, Copy, Clone)]
 pub struct vfio_region_sparse_mmap_area {
     pub offset: __u64,
diff --git a/vm_control/src/lib.rs b/vm_control/src/lib.rs
index eccee10..7e5faf5 100644
--- a/vm_control/src/lib.rs
+++ b/vm_control/src/lib.rs
@@ -201,6 +201,13 @@ pub enum VmMemoryRequest {
         height: u32,
         format: u32,
     },
+    /// Register mmaped memory into kvm's EPT.
+    RegisterMmapMemory {
+        fd: MaybeOwnedFd,
+        size: usize,
+        offset: usize,
+        gpa: u64,
+    },
 }
 
 impl VmMemoryRequest {
@@ -260,6 +267,21 @@ impl VmMemoryRequest {
                     Err(e) => VmMemoryResponse::Err(e),
                 }
             }
+            RegisterMmapMemory {
+                ref fd,
+                size,
+                offset,
+                gpa,
+            } => {
+                let mmap = match MemoryMapping::from_fd_offset(fd, size, offset) {
+                    Ok(v) => v,
+                    Err(_e) => return VmMemoryResponse::Err(SysError::new(EINVAL)),
+                };
+                match vm.add_mmio_memory(GuestAddress(gpa), mmap, false, false) {
+                    Ok(_) => VmMemoryResponse::Ok,
+                    Err(e) => VmMemoryResponse::Err(e),
+                }
+            }
         }
     }
 }