// Copyright 2017 The Chromium OS Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

use std::alloc::Layout;
use std::fmt::{self, Display};
use std::{mem, result};

use assertions::const_assert;
use kvm;
use kvm_sys::kvm_fpu;
use kvm_sys::kvm_msr_entry;
use kvm_sys::kvm_msrs;
use kvm_sys::kvm_regs;
use kvm_sys::kvm_sregs;
use sys_util::{self, warn, GuestAddress, GuestMemory, LayoutAllocation};

use crate::gdt;

#[derive(Debug)]
pub enum Error {
    /// Setting up MSRs failed.
    MsrIoctlFailed(sys_util::Error),
    /// Failed to configure the FPU.
    FpuIoctlFailed(sys_util::Error),
    /// Failed to get sregs for this CPU.
    GetSRegsIoctlFailed(sys_util::Error),
    /// Failed to set base registers for this CPU.
    SettingRegistersIoctl(sys_util::Error),
    /// Failed to set sregs for this CPU.
    SetSRegsIoctlFailed(sys_util::Error),
    /// Writing the GDT to RAM failed.
    WriteGDTFailure,
    /// Writing the IDT to RAM failed.
    WriteIDTFailure,
    /// Writing PML4 to RAM failed.
    WritePML4Address,
    /// Writing PDPTE to RAM failed.
    WritePDPTEAddress,
    /// Writing PDE to RAM failed.
    WritePDEAddress,
}

pub type Result<T> = result::Result<T, Error>;

impl std::error::Error for Error {}

impl Display for Error {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        use self::Error::*;

        match self {
            MsrIoctlFailed(e) => write!(f, "setting up MSRs failed: {}", e),
            FpuIoctlFailed(e) => write!(f, "failed to configure the FPU: {}", e),
            GetSRegsIoctlFailed(e) => write!(f, "failed to get sregs for this CPU: {}", e),
            SettingRegistersIoctl(e) => {
                write!(f, "failed to set base registers for this CPU: {}", e)
            }
            SetSRegsIoctlFailed(e) => write!(f, "failed to set sregs for this CPU: {}", e),
            WriteGDTFailure => write!(f, "writing the GDT to RAM failed"),
            WriteIDTFailure => write!(f, "writing the IDT to RAM failed"),
            WritePML4Address => write!(f, "writing PML4 to RAM failed"),
            WritePDPTEAddress => write!(f, "writing PDPTE to RAM failed"),
            WritePDEAddress => write!(f, "writing PDE to RAM failed"),
        }
    }
}

const MTRR_MEMTYPE_UC: u8 = 0x0;
const MTRR_MEMTYPE_WB: u8 = 0x6;
const MTRR_VAR_VALID: u64 = 0x800;
const MTRR_ENABLE: u64 = 0x800;
const MTRR_PHYS_BASE_MSR: u32 = 0x200;
const MTRR_PHYS_MASK_MSR: u32 = 0x201;
const VAR_MTRR_NUM_MASK: u64 = 0xFF;

// Returns the value of the highest set bit in a 64-bit value. Equivalent to
// 1 << HighBitSet(x).
fn get_power_of_two(data: u64) -> u64 {
    1 << (64 - data.leading_zeros() - 1)
}

// Returns the maximum length suitable for an MTRR entry, based on the
// specified (base, len).
fn get_max_len(base: u64, len: u64) -> u64 {
    let mut ret = get_power_of_two(len);

    while base % ret != 0 {
        ret >>= 1;
    }

    ret
}

// For the specified (base, len), returns the list of (base, len) pairs that
// can be programmed into the variable MTRR registers. MTRRs require that a
// range's base address be aligned to at least its length.
fn get_mtrr_pairs(base: u64, len: u64) -> Vec<(u64, u64)> {
    let mut vecs = Vec::new();

    let mut remains = len;
    let mut new = base;
    while remains != 0 {
        let max = get_max_len(new, remains);
        vecs.push((new, max));
        remains -= max;
        new += max;
    }

    vecs
}
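// Worked example (illustrative only; the addresses below are hypothetical,
// not values used by this crate): for pci_start = 0xd000_0000 the PCI hole is
// [0xd000_0000, 4GB), i.e. len = 0x3000_0000. The largest power-of-two chunk
// of that length is 0x2000_0000, but 0xd000_0000 is only 0x1000_0000-aligned,
// so the decomposition is:
//
//   get_mtrr_pairs(0xd000_0000, 0x3000_0000)
//       == vec![(0xd000_0000, 0x1000_0000), (0xe000_0000, 0x2000_0000)]
//
// Each resulting pair satisfies the MTRR constraint that base is aligned to len.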
fn create_mtrr_entries(vcpu: &kvm::Vcpu, pci_start: u64) -> Vec<kvm_msr_entry> {
    let mut entries = Vec::<kvm_msr_entry>::new();

    // Get the number of variable MTRRs from MSR_MTRRcap.
    let mut msrs = vec![kvm_msr_entry {
        index: crate::msr_index::MSR_MTRRcap,
        ..Default::default()
    }];
    if vcpu.get_msrs(&mut msrs).is_err() {
        warn!("failed to get MSRs; a guest with a pass-through device may be very slow");
        return entries;
    }
    let var_num = msrs[0].data & VAR_MTRR_NUM_MASK;

    // Set pci_start..4G as UC; everything else keeps the default type, WB.
    let pci_len = (1 << 32) - pci_start;
    let vecs = get_mtrr_pairs(pci_start, pci_len);
    if vecs.len() as u64 > var_num {
        warn!(
            "not enough variable MTRRs to cover the PCI MMIO range; check the pci_start \
             address. A guest with a pass-through device may be very slow"
        );
        return entries;
    }

    let phys_mask: u64 = (1 << crate::cpuid::phy_max_address_bits()) - 1;
    for (idx, (base, len)) in vecs.iter().enumerate() {
        let reg_idx = idx as u32 * 2;
        entries.push(kvm_msr_entry {
            index: MTRR_PHYS_BASE_MSR + reg_idx,
            data: base | MTRR_MEMTYPE_UC as u64,
            ..Default::default()
        });
        let mask: u64 = len.wrapping_neg() & phys_mask | MTRR_VAR_VALID;
        entries.push(kvm_msr_entry {
            index: MTRR_PHYS_MASK_MSR + reg_idx,
            data: mask,
            ..Default::default()
        });
    }
    // Disable fixed MTRRs and enable variable MTRRs, with WB as the default type.
    entries.push(kvm_msr_entry {
        index: crate::msr_index::MSR_MTRRdefType,
        data: MTRR_ENABLE | MTRR_MEMTYPE_WB as u64,
        ..Default::default()
    });

    entries
}

fn create_msr_entries(vcpu: &kvm::Vcpu, pci_start: u64) -> Vec<kvm_msr_entry> {
    let mut entries = Vec::<kvm_msr_entry>::new();

    entries.push(kvm_msr_entry {
        index: crate::msr_index::MSR_IA32_SYSENTER_CS,
        data: 0x0,
        ..Default::default()
    });
    entries.push(kvm_msr_entry {
        index: crate::msr_index::MSR_IA32_SYSENTER_ESP,
        data: 0x0,
        ..Default::default()
    });
    entries.push(kvm_msr_entry {
        index: crate::msr_index::MSR_IA32_SYSENTER_EIP,
        data: 0x0,
        ..Default::default()
    });
    // x86_64-specific MSRs; we only run on x86_64, not x86.
    entries.push(kvm_msr_entry {
        index: crate::msr_index::MSR_STAR,
        data: 0x0,
        ..Default::default()
    });
    entries.push(kvm_msr_entry {
        index: crate::msr_index::MSR_CSTAR,
        data: 0x0,
        ..Default::default()
    });
    entries.push(kvm_msr_entry {
        index: crate::msr_index::MSR_KERNEL_GS_BASE,
        data: 0x0,
        ..Default::default()
    });
    entries.push(kvm_msr_entry {
        index: crate::msr_index::MSR_SYSCALL_MASK,
        data: 0x0,
        ..Default::default()
    });
    entries.push(kvm_msr_entry {
        index: crate::msr_index::MSR_LSTAR,
        data: 0x0,
        ..Default::default()
    });
    // end of x86_64-specific code
    entries.push(kvm_msr_entry {
        index: crate::msr_index::MSR_IA32_TSC,
        data: 0x0,
        ..Default::default()
    });
    entries.push(kvm_msr_entry {
        index: crate::msr_index::MSR_IA32_MISC_ENABLE,
        data: crate::msr_index::MSR_IA32_MISC_ENABLE_FAST_STRING as u64,
        ..Default::default()
    });

    let mut mtrr_entries = create_mtrr_entries(vcpu, pci_start);
    entries.append(&mut mtrr_entries);
    entries
}
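// Note on the allocation strategy below: `kvm_msrs` ends in a flexible array
// member, so it cannot be built as an ordinary Rust value. This sketch shows
// the single buffer that setup_msrs() assembles for the KVM_SET_MSRS ioctl
// (header fields per the kvm_sys bindings):
//
//   [ nmsrs: u32 | pad: u32 | entry 0 | entry 1 | ... | entry n-1 ]
//
// LayoutAllocation::zeroed() hands back this buffer zero-initialized, which
// is a valid bit pattern for both kvm_msrs and kvm_msr_entry.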
/// Configure model-specific registers (MSRs) for x86.
///
/// # Arguments
///
/// * `vcpu` - Structure for the vcpu that holds the vcpu fd.
pub fn setup_msrs(vcpu: &kvm::Vcpu, pci_start: u64) -> Result<()> {
    const SIZE_OF_MSRS: usize = mem::size_of::<kvm_msrs>();
    const SIZE_OF_ENTRY: usize = mem::size_of::<kvm_msr_entry>();
    const ALIGN_OF_MSRS: usize = mem::align_of::<kvm_msrs>();
    const ALIGN_OF_ENTRY: usize = mem::align_of::<kvm_msr_entry>();
    const_assert!(ALIGN_OF_MSRS >= ALIGN_OF_ENTRY);

    let entry_vec = create_msr_entries(vcpu, pci_start);
    let size = SIZE_OF_MSRS + entry_vec.len() * SIZE_OF_ENTRY;
    let layout = Layout::from_size_align(size, ALIGN_OF_MSRS).expect("impossible layout");
    let mut allocation = LayoutAllocation::zeroed(layout);

    // Safe to obtain an exclusive reference because there are no other
    // references to the allocation yet and all-zero is a valid bit pattern.
    let msrs = unsafe { allocation.as_mut::<kvm_msrs>() };

    unsafe {
        // Mapping the unsized array to a slice is unsafe because the length isn't known.
        // Providing the length used to create the struct guarantees the entire slice is valid.
        let entries: &mut [kvm_msr_entry] = msrs.entries.as_mut_slice(entry_vec.len());
        entries.copy_from_slice(&entry_vec);
    }
    msrs.nmsrs = entry_vec.len() as u32;

    vcpu.set_msrs(msrs).map_err(Error::MsrIoctlFailed)?;

    Ok(())
} // msrs allocation is deallocated.

/// Configure FPU registers for x86.
///
/// # Arguments
///
/// * `vcpu` - Structure for the vcpu that holds the vcpu fd.
pub fn setup_fpu(vcpu: &kvm::Vcpu) -> Result<()> {
    let fpu: kvm_fpu = kvm_fpu {
        fcw: 0x37f,
        mxcsr: 0x1f80,
        ..Default::default()
    };

    vcpu.set_fpu(&fpu).map_err(Error::FpuIoctlFailed)?;

    Ok(())
}

/// Configure base registers for x86.
///
/// # Arguments
///
/// * `vcpu` - Structure for the vcpu that holds the vcpu fd.
/// * `boot_ip` - Starting instruction pointer.
/// * `boot_sp` - Starting stack pointer.
/// * `boot_si` - Must point to the zero page address per the Linux ABI.
pub fn setup_regs(vcpu: &kvm::Vcpu, boot_ip: u64, boot_sp: u64, boot_si: u64) -> Result<()> {
    let regs: kvm_regs = kvm_regs {
        rflags: 0x0000000000000002u64,
        rip: boot_ip,
        rsp: boot_sp,
        rbp: boot_sp,
        rsi: boot_si,
        ..Default::default()
    };

    vcpu.set_regs(&regs).map_err(Error::SettingRegistersIoctl)?;

    Ok(())
}

const X86_CR0_PE: u64 = 0x1;
const X86_CR0_PG: u64 = 0x80000000;
const X86_CR4_PAE: u64 = 0x20;

const EFER_LME: u64 = 0x100;
const EFER_LMA: u64 = 0x400;

const BOOT_GDT_OFFSET: u64 = 0x500;
const BOOT_IDT_OFFSET: u64 = 0x520;

const BOOT_GDT_MAX: usize = 4;

fn write_gdt_table(table: &[u64], guest_mem: &GuestMemory) -> Result<()> {
    let boot_gdt_addr = GuestAddress(BOOT_GDT_OFFSET);
    for (index, entry) in table.iter().enumerate() {
        let addr = guest_mem
            .checked_offset(boot_gdt_addr, (index * mem::size_of::<u64>()) as u64)
            .ok_or(Error::WriteGDTFailure)?;
        guest_mem
            .write_obj_at_addr(*entry, addr)
            .map_err(|_| Error::WriteGDTFailure)?;
    }
    Ok(())
}

fn write_idt_value(val: u64, guest_mem: &GuestMemory) -> Result<()> {
    let boot_idt_addr = GuestAddress(BOOT_IDT_OFFSET);
    guest_mem
        .write_obj_at_addr(val, boot_idt_addr)
        .map_err(|_| Error::WriteIDTFailure)
}

fn configure_segments_and_sregs(mem: &GuestMemory, sregs: &mut kvm_sregs) -> Result<()> {
    let gdt_table: [u64; BOOT_GDT_MAX as usize] = [
        gdt::gdt_entry(0, 0, 0),            // NULL
        gdt::gdt_entry(0xa09b, 0, 0xfffff), // CODE
        gdt::gdt_entry(0xc093, 0, 0xfffff), // DATA
        gdt::gdt_entry(0x808b, 0, 0xfffff), // TSS
    ];

    let code_seg = gdt::kvm_segment_from_gdt(gdt_table[1], 1);
    let data_seg = gdt::kvm_segment_from_gdt(gdt_table[2], 2);
    let tss_seg = gdt::kvm_segment_from_gdt(gdt_table[3], 3);

    // Write segments
    write_gdt_table(&gdt_table[..], mem)?;
    sregs.gdt.base = BOOT_GDT_OFFSET as u64;
    sregs.gdt.limit = mem::size_of_val(&gdt_table) as u16 - 1;

    write_idt_value(0, mem)?;
    sregs.idt.base = BOOT_IDT_OFFSET as u64;
    sregs.idt.limit = mem::size_of::<u64>() as u16 - 1;

    sregs.cs = code_seg;
    sregs.ds = data_seg;
    sregs.es = data_seg;
    sregs.fs = data_seg;
    sregs.gs = data_seg;
    sregs.ss = data_seg;
    sregs.tr = tss_seg;

    /* 64-bit protected mode */
    sregs.cr0 |= X86_CR0_PE;
    sregs.efer |= EFER_LME;

    Ok(())
}
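// Summary of the boot page tables built below (addresses are the fixed guest
// physical addresses chosen by this file, not anything imposed by KVM):
//
//   CR3 -> PML4 @ 0x9000 -> PDPTE @ 0xa000 -> PD @ 0xb000
//                                             (512 entries x 2MB pages,
//                                              identity-mapping VA [0..1GB))
//
// Entry flags: 0x03 = present | writable; 0x83 additionally sets the
// page-size bit so each PD entry maps a 2MB page directly.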
fn setup_page_tables(mem: &GuestMemory, sregs: &mut kvm_sregs) -> Result<()> {
    // Puts PML4 right after the zero page but aligned to 4k.
    let boot_pml4_addr = GuestAddress(0x9000);
    let boot_pdpte_addr = GuestAddress(0xa000);
    let boot_pde_addr = GuestAddress(0xb000);

    // Entry covering VA [0..512GB)
    mem.write_obj_at_addr(boot_pdpte_addr.offset() as u64 | 0x03, boot_pml4_addr)
        .map_err(|_| Error::WritePML4Address)?;

    // Entry covering VA [0..1GB)
    mem.write_obj_at_addr(boot_pde_addr.offset() as u64 | 0x03, boot_pdpte_addr)
        .map_err(|_| Error::WritePDPTEAddress)?;

    // 512 2MB entries together covering VA [0..1GB). Note we are assuming the
    // CPU supports 2MB pages (/proc/cpuinfo has 'pse'). All modern CPUs do.
    for i in 0..512 {
        mem.write_obj_at_addr((i << 21) + 0x83u64, boot_pde_addr.unchecked_add(i * 8))
            .map_err(|_| Error::WritePDEAddress)?;
    }

    sregs.cr3 = boot_pml4_addr.offset() as u64;
    sregs.cr4 |= X86_CR4_PAE;
    sregs.cr0 |= X86_CR0_PG;
    sregs.efer |= EFER_LMA; // Long mode is active. Must be auto-enabled with CR0_PG.

    Ok(())
}

/// Configures the segment registers and system page tables for a given CPU.
///
/// # Arguments
///
/// * `mem` - The memory that will be passed to the guest.
/// * `vcpu` - The vcpu handle wrapping the FD returned from the KVM_CREATE_VCPU ioctl.
pub fn setup_sregs(mem: &GuestMemory, vcpu: &kvm::Vcpu) -> Result<()> {
    let mut sregs: kvm_sregs = vcpu.get_sregs().map_err(Error::GetSRegsIoctlFailed)?;

    configure_segments_and_sregs(mem, &mut sregs)?;
    setup_page_tables(mem, &mut sregs)?; // TODO(dgreid) - Can this be done once per system instead?

    vcpu.set_sregs(&sregs).map_err(Error::SetSRegsIoctlFailed)?;

    Ok(())
}

#[cfg(test)]
mod tests {
    use super::*;
    use sys_util::{GuestAddress, GuestMemory};

    fn create_guest_mem() -> GuestMemory {
        GuestMemory::new(&vec![(GuestAddress(0), 0x10000)]).unwrap()
    }

    fn read_u64(gm: &GuestMemory, offset: u64) -> u64 {
        let read_addr = GuestAddress(offset);
        gm.read_obj_from_addr(read_addr).unwrap()
    }

    #[test]
    fn segments_and_sregs() {
        let mut sregs: kvm_sregs = Default::default();
        let gm = create_guest_mem();
        configure_segments_and_sregs(&gm, &mut sregs).unwrap();

        assert_eq!(0x0, read_u64(&gm, BOOT_GDT_OFFSET));
        assert_eq!(0xaf9b000000ffff, read_u64(&gm, BOOT_GDT_OFFSET + 8));
        assert_eq!(0xcf93000000ffff, read_u64(&gm, BOOT_GDT_OFFSET + 16));
        assert_eq!(0x8f8b000000ffff, read_u64(&gm, BOOT_GDT_OFFSET + 24));
        assert_eq!(0x0, read_u64(&gm, BOOT_IDT_OFFSET));

        assert_eq!(0, sregs.cs.base);
        assert_eq!(0xfffff, sregs.ds.limit);
        assert_eq!(0x10, sregs.es.selector);
        assert_eq!(1, sregs.fs.present);
        assert_eq!(1, sregs.gs.g);
        assert_eq!(0, sregs.ss.avl);
        assert_eq!(0, sregs.tr.base);
        assert_eq!(0xfffff, sregs.tr.limit);
        assert_eq!(0, sregs.tr.avl);
        assert_eq!(X86_CR0_PE, sregs.cr0);
        assert_eq!(EFER_LME, sregs.efer);
    }

    #[test]
    fn page_tables() {
        let mut sregs: kvm_sregs = Default::default();
        let gm = create_guest_mem();
        setup_page_tables(&gm, &mut sregs).unwrap();

        assert_eq!(0xa003, read_u64(&gm, 0x9000));
        assert_eq!(0xb003, read_u64(&gm, 0xa000));
        for i in 0..512 {
            assert_eq!((i << 21) + 0x83u64, read_u64(&gm, 0xb000 + i * 8));
        }

        assert_eq!(0x9000, sregs.cr3);
        assert_eq!(X86_CR4_PAE, sregs.cr4);
        assert_eq!(X86_CR0_PG, sregs.cr0);
    }
}
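// An extra sanity check, not part of the original test suite: a minimal
// sketch verifying the MTRR range decomposition, assuming get_mtrr_pairs()
// keeps its current signature. The input values are hypothetical.
#[cfg(test)]
mod mtrr_tests {
    use super::*;

    #[test]
    fn mtrr_pairs_cover_range_and_stay_aligned() {
        // A PCI hole at [0xd000_0000, 4GB) cannot be covered by a single
        // power-of-two range, so it must be split.
        let pairs = get_mtrr_pairs(0xd000_0000, 0x3000_0000);
        assert_eq!(
            pairs,
            vec![(0xd000_0000, 0x1000_0000), (0xe000_0000, 0x2000_0000)]
        );
        // Every pair must satisfy the MTRR rule that base is aligned to len.
        for (base, len) in pairs {
            assert_eq!(base % len, 0);
        }
    }
}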