// Copyright 2017 The Chromium OS Authors. All rights reserved. // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. use std::cmp; use std::fmt::{self, Display}; use std::io::{self, Seek, SeekFrom, Write}; use std::mem::{size_of, size_of_val}; use std::os::unix::io::{AsRawFd, RawFd}; use std::result; use std::sync::atomic::{AtomicUsize, Ordering}; use std::sync::Arc; use std::thread; use std::time::Duration; use std::u32; use sync::Mutex; use sys_util::Error as SysError; use sys_util::Result as SysResult; use sys_util::{ error, info, warn, EventFd, FileReadWriteVolatile, FileSetLen, FileSync, GuestAddress, GuestMemory, GuestMemoryError, PollContext, PollToken, PunchHole, TimerFd, WriteZeroes, }; use data_model::{DataInit, Le16, Le32, Le64, VolatileMemory, VolatileMemoryError}; use msg_socket::{MsgReceiver, MsgSender}; use vm_control::{DiskControlCommand, DiskControlResponseSocket, DiskControlResult}; use super::{ DescriptorChain, Queue, VirtioDevice, INTERRUPT_STATUS_CONFIG_CHANGED, INTERRUPT_STATUS_USED_RING, TYPE_BLOCK, VIRTIO_F_VERSION_1, }; const QUEUE_SIZE: u16 = 256; const QUEUE_SIZES: &[u16] = &[QUEUE_SIZE]; const SECTOR_SHIFT: u8 = 9; const SECTOR_SIZE: u64 = 0x01 << SECTOR_SHIFT; const MAX_DISCARD_SECTORS: u32 = u32::MAX; const MAX_WRITE_ZEROES_SECTORS: u32 = u32::MAX; // Hard-coded to 64 KiB (in 512-byte sectors) for now, // but this should probably be based on cluster size for qcow. 
const DISCARD_SECTOR_ALIGNMENT: u32 = 128;

// Request type codes read from the first 32-bit field of the request header.
const VIRTIO_BLK_T_IN: u32 = 0;
const VIRTIO_BLK_T_OUT: u32 = 1;
const VIRTIO_BLK_T_FLUSH: u32 = 4;
const VIRTIO_BLK_T_DISCARD: u32 = 11;
const VIRTIO_BLK_T_WRITE_ZEROES: u32 = 13;

// Values written to the one-byte status field at the end of each request.
const VIRTIO_BLK_S_OK: u8 = 0;
const VIRTIO_BLK_S_IOERR: u8 = 1;
const VIRTIO_BLK_S_UNSUPP: u8 = 2;

// Feature bit positions (used as shift amounts when building the feature mask).
const VIRTIO_BLK_F_RO: u32 = 5;
const VIRTIO_BLK_F_BLK_SIZE: u32 = 6;
const VIRTIO_BLK_F_FLUSH: u32 = 9;
const VIRTIO_BLK_F_DISCARD: u32 = 13;
const VIRTIO_BLK_F_WRITE_ZEROES: u32 = 14;

// Geometry sub-structure of the device config space; left at its default (zero) by this device.
#[derive(Copy, Clone, Debug, Default)]
#[repr(C)]
struct virtio_blk_geometry {
    cylinders: Le16,
    heads: u8,
    sectors: u8,
}

// Safe because it only has data and has no implicit padding.
unsafe impl DataInit for virtio_blk_geometry {}

// Topology sub-structure of the device config space; left at its default (zero) by this device.
#[derive(Copy, Clone, Debug, Default)]
#[repr(C)]
struct virtio_blk_topology {
    physical_block_exp: u8,
    alignment_offset: u8,
    min_io_size: Le16,
    opt_io_size: Le32,
}

// Safe because it only has data and has no implicit padding.
unsafe impl DataInit for virtio_blk_topology {}

// Device config space as exposed to the guest through `read_config()`.
// Field order and the explicit `unused*` padding define the byte layout; do not reorder.
#[derive(Copy, Clone, Debug, Default)]
#[repr(C)]
struct virtio_blk_config {
    // Disk capacity in 512-byte sectors.
    capacity: Le64,
    size_max: Le32,
    seg_max: Le32,
    geometry: virtio_blk_geometry,
    blk_size: Le32,
    topology: virtio_blk_topology,
    writeback: u8,
    unused0: [u8; 3],
    max_discard_sectors: Le32,
    max_discard_seg: Le32,
    discard_sector_alignment: Le32,
    max_write_zeroes_sectors: Le32,
    max_write_zeroes_seg: Le32,
    write_zeroes_may_unmap: u8,
    unused1: [u8; 3],
}

// Safe because it only has data and has no implicit padding.
unsafe impl DataInit for virtio_blk_config {}

// Single segment of a discard or write zeroes request, read from the guest's data descriptor.
#[derive(Copy, Clone, Debug, Default)]
#[repr(C)]
struct virtio_blk_discard_write_zeroes {
    sector: Le64,
    num_sectors: Le32,
    flags: Le32,
}

// The only flag accepted in `flags`, and only for write zeroes requests.
const VIRTIO_BLK_DISCARD_WRITE_ZEROES_FLAG_UNMAP: u32 = 1 << 0;

// Safe because it only has data and has no implicit padding.
unsafe impl DataInit for virtio_blk_discard_write_zeroes {} pub trait DiskFile: FileSetLen + FileSync + FileReadWriteVolatile + PunchHole + Seek + WriteZeroes { } impl DiskFile for D { } #[derive(Copy, Clone, Debug, PartialEq)] enum RequestType { In, Out, Flush, Discard, WriteZeroes, Unsupported(u32), } impl Display for RequestType { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { use self::RequestType::*; match self { In => write!(f, "in"), Out => write!(f, "out"), Flush => write!(f, "flush"), Discard => write!(f, "discard"), WriteZeroes => write!(f, "write zeroes"), Unsupported(n) => write!(f, "unsupported({})", n), } } } #[derive(Debug)] enum ParseError { /// Guest gave us bad memory addresses GuestMemory(GuestMemoryError), /// Guest gave us offsets that would have overflowed a usize. CheckedOffset(GuestAddress, u64), /// Guest gave us a write only descriptor that protocol says to read from. UnexpectedWriteOnlyDescriptor, /// Guest gave us a read only descriptor that protocol says to write to. UnexpectedReadOnlyDescriptor, /// Guest gave us too few descriptors in a descriptor chain. DescriptorChainTooShort, /// Guest gave us a descriptor that was too short to use. 
DescriptorLengthTooSmall, } impl Display for ParseError { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { use self::ParseError::*; match self { GuestMemory(e) => write!(f, "bad guest memory address: {}", e), CheckedOffset(addr, offset) => write!(f, "{}+{} would overflow a usize", addr, offset), UnexpectedWriteOnlyDescriptor => write!(f, "unexpected write-only descriptor"), UnexpectedReadOnlyDescriptor => write!(f, "unexpected read-only descriptor"), DescriptorChainTooShort => write!(f, "descriptor chain too short"), DescriptorLengthTooSmall => write!(f, "descriptor length too small"), } } } fn request_type( mem: &GuestMemory, desc_addr: GuestAddress, ) -> result::Result { let type_ = mem .read_obj_from_addr(desc_addr) .map_err(ParseError::GuestMemory)?; match type_ { VIRTIO_BLK_T_IN => Ok(RequestType::In), VIRTIO_BLK_T_OUT => Ok(RequestType::Out), VIRTIO_BLK_T_FLUSH => Ok(RequestType::Flush), VIRTIO_BLK_T_DISCARD => Ok(RequestType::Discard), VIRTIO_BLK_T_WRITE_ZEROES => Ok(RequestType::WriteZeroes), t => Ok(RequestType::Unsupported(t)), } } fn sector(mem: &GuestMemory, desc_addr: GuestAddress) -> result::Result { const SECTOR_OFFSET: u64 = 8; let addr = match mem.checked_offset(desc_addr, SECTOR_OFFSET) { Some(v) => v, None => return Err(ParseError::CheckedOffset(desc_addr, SECTOR_OFFSET)), }; mem.read_obj_from_addr(addr) .map_err(ParseError::GuestMemory) } fn discard_write_zeroes_segment( mem: &GuestMemory, seg_addr: GuestAddress, ) -> result::Result { mem.read_obj_from_addr(seg_addr) .map_err(ParseError::GuestMemory) } #[derive(Debug)] enum ExecuteError { /// Error arming the flush timer. 
Flush(io::Error), ReadVolatile { addr: GuestAddress, length: u32, sector: u64, volatile_memory_error: VolatileMemoryError, }, ReadIo { addr: GuestAddress, length: u32, sector: u64, io_error: io::Error, }, Seek { ioerr: io::Error, sector: u64, }, TimerFd(SysError), WriteVolatile { addr: GuestAddress, length: u32, sector: u64, volatile_memory_error: VolatileMemoryError, }, WriteIo { addr: GuestAddress, length: u32, sector: u64, io_error: io::Error, }, DiscardWriteZeroes { ioerr: Option, sector: u64, num_sectors: u32, flags: u32, }, ReadOnly { request_type: RequestType, }, OutOfRange, Unsupported(u32), } impl Display for ExecuteError { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { use self::ExecuteError::*; match self { Flush(e) => write!(f, "failed to flush: {}", e), ReadVolatile { addr, length, sector, volatile_memory_error, } => write!( f, "memory error reading {} bytes from sector {} to address {}: {}", length, sector, addr, volatile_memory_error, ), ReadIo { addr, length, sector, io_error, } => write!( f, "io error reading {} bytes from sector {} to address {}: {}", length, sector, addr, io_error, ), Seek { ioerr, sector } => write!(f, "failed to seek to sector {}: {}", sector, ioerr), TimerFd(e) => write!(f, "{}", e), WriteVolatile { addr, length, sector, volatile_memory_error, } => write!( f, "memory error writing {} bytes from address {} to sector {}: {}", length, addr, sector, volatile_memory_error, ), WriteIo { addr, length, sector, io_error, } => write!( f, "io error writing {} bytes from address {} to sector {}: {}", length, addr, sector, io_error, ), DiscardWriteZeroes { ioerr: Some(ioerr), sector, num_sectors, flags, } => write!( f, "failed to perform discard or write zeroes; sector={} num_sectors={} flags={}; {}", sector, num_sectors, flags, ioerr, ), DiscardWriteZeroes { ioerr: None, sector, num_sectors, flags, } => write!( f, "failed to perform discard or write zeroes; sector={} num_sectors={} flags={}", sector, num_sectors, flags, ), 
ReadOnly { request_type } => write!(f, "read only; request_type={}", request_type), OutOfRange => write!(f, "out of range"), Unsupported(n) => write!(f, "unsupported ({})", n), } } } impl ExecuteError { fn status(&self) -> u8 { match self { ExecuteError::Flush(_) => VIRTIO_BLK_S_IOERR, ExecuteError::ReadIo { .. } => VIRTIO_BLK_S_IOERR, ExecuteError::ReadVolatile { .. } => VIRTIO_BLK_S_IOERR, ExecuteError::Seek { .. } => VIRTIO_BLK_S_IOERR, ExecuteError::TimerFd(_) => VIRTIO_BLK_S_IOERR, ExecuteError::WriteIo { .. } => VIRTIO_BLK_S_IOERR, ExecuteError::WriteVolatile { .. } => VIRTIO_BLK_S_IOERR, ExecuteError::DiscardWriteZeroes { .. } => VIRTIO_BLK_S_IOERR, ExecuteError::ReadOnly { .. } => VIRTIO_BLK_S_IOERR, ExecuteError::OutOfRange { .. } => VIRTIO_BLK_S_IOERR, ExecuteError::Unsupported(_) => VIRTIO_BLK_S_UNSUPP, } } } struct Request { request_type: RequestType, sector: u64, data_addr: GuestAddress, data_len: u32, status_addr: GuestAddress, discard_write_zeroes_seg: Option, } impl Request { fn parse( avail_desc: &DescriptorChain, mem: &GuestMemory, ) -> result::Result { // The head contains the request type which MUST be readable. 
if avail_desc.is_write_only() { return Err(ParseError::UnexpectedWriteOnlyDescriptor); } let req_type = request_type(&mem, avail_desc.addr)?; if req_type == RequestType::Flush { Request::parse_flush(avail_desc, mem) } else if req_type == RequestType::Discard || req_type == RequestType::WriteZeroes { Request::parse_discard_write_zeroes(avail_desc, mem, req_type) } else { Request::parse_read_write(avail_desc, mem, req_type) } } fn parse_flush( avail_desc: &DescriptorChain, mem: &GuestMemory, ) -> result::Result { let sector = sector(&mem, avail_desc.addr)?; let status_desc = avail_desc .next_descriptor() .ok_or(ParseError::DescriptorChainTooShort)?; // The status MUST always be writable if !status_desc.is_write_only() { return Err(ParseError::UnexpectedReadOnlyDescriptor); } if status_desc.len < 1 { return Err(ParseError::DescriptorLengthTooSmall); } Ok(Request { request_type: RequestType::Flush, sector, data_addr: GuestAddress(0), data_len: 0, status_addr: status_desc.addr, discard_write_zeroes_seg: None, }) } fn parse_discard_write_zeroes( avail_desc: &DescriptorChain, mem: &GuestMemory, req_type: RequestType, ) -> result::Result { let seg_desc = avail_desc .next_descriptor() .ok_or(ParseError::DescriptorChainTooShort)?; let status_desc = seg_desc .next_descriptor() .ok_or(ParseError::DescriptorChainTooShort)?; if seg_desc.is_write_only() { return Err(ParseError::UnexpectedWriteOnlyDescriptor); } // For simplicity, we currently only support a single segment // for discard and write zeroes commands. This allows the // request to be represented as a single Request object. 
if seg_desc.len < size_of::() as u32 { return Err(ParseError::DescriptorLengthTooSmall); } let seg = discard_write_zeroes_segment(&mem, seg_desc.addr)?; // The status MUST always be writable if !status_desc.is_write_only() { return Err(ParseError::UnexpectedReadOnlyDescriptor); } if status_desc.len < 1 { return Err(ParseError::DescriptorLengthTooSmall); } Ok(Request { request_type: req_type, sector: 0, data_addr: GuestAddress(0), data_len: 0, status_addr: status_desc.addr, discard_write_zeroes_seg: Some(seg), }) } fn parse_read_write( avail_desc: &DescriptorChain, mem: &GuestMemory, req_type: RequestType, ) -> result::Result { let sector = sector(&mem, avail_desc.addr)?; let data_desc = avail_desc .next_descriptor() .ok_or(ParseError::DescriptorChainTooShort)?; let status_desc = data_desc .next_descriptor() .ok_or(ParseError::DescriptorChainTooShort)?; if data_desc.is_write_only() && req_type == RequestType::Out { return Err(ParseError::UnexpectedWriteOnlyDescriptor); } if !data_desc.is_write_only() && req_type == RequestType::In { return Err(ParseError::UnexpectedReadOnlyDescriptor); } // The status MUST always be writable if !status_desc.is_write_only() { return Err(ParseError::UnexpectedReadOnlyDescriptor); } if status_desc.len < 1 { return Err(ParseError::DescriptorLengthTooSmall); } Ok(Request { request_type: req_type, sector, data_addr: data_desc.addr, data_len: data_desc.len, status_addr: status_desc.addr, discard_write_zeroes_seg: None, }) } fn execute( &self, read_only: bool, disk: &mut T, disk_size: u64, flush_timer: &mut TimerFd, flush_timer_armed: &mut bool, mem: &GuestMemory, ) -> result::Result { // Delay after a write when the file is auto-flushed. let flush_delay = Duration::from_secs(60); if read_only && self.request_type != RequestType::In { return Err(ExecuteError::ReadOnly { request_type: self.request_type, }); } /// Check that a request accesses only data within the disk's current size. /// All parameters are in units of bytes. 
fn check_range( io_start: u64, io_length: u64, disk_size: u64, ) -> result::Result<(), ExecuteError> { let io_end = io_start .checked_add(io_length) .ok_or(ExecuteError::OutOfRange)?; if io_end > disk_size { Err(ExecuteError::OutOfRange) } else { Ok(()) } } match self.request_type { RequestType::In => { let offset = self .sector .checked_shl(u32::from(SECTOR_SHIFT)) .ok_or(ExecuteError::OutOfRange)?; check_range(offset, u64::from(self.data_len), disk_size)?; disk.seek(SeekFrom::Start(offset)) .map_err(|e| ExecuteError::Seek { ioerr: e, sector: self.sector, })?; let mem_slice = mem .get_slice(self.data_addr.0, self.data_len as u64) .map_err(|volatile_memory_error| ExecuteError::ReadVolatile { addr: self.data_addr, length: self.data_len, sector: self.sector, volatile_memory_error, })?; disk.read_exact_volatile(mem_slice) .map_err(|io_error| ExecuteError::ReadIo { addr: self.data_addr, length: self.data_len, sector: self.sector, io_error, })?; return Ok(self.data_len); } RequestType::Out => { let offset = self .sector .checked_shl(u32::from(SECTOR_SHIFT)) .ok_or(ExecuteError::OutOfRange)?; check_range(offset, u64::from(self.data_len), disk_size)?; disk.seek(SeekFrom::Start(offset)) .map_err(|e| ExecuteError::Seek { ioerr: e, sector: self.sector, })?; let mem_slice = mem .get_slice(self.data_addr.0, self.data_len as u64) .map_err(|volatile_memory_error| ExecuteError::WriteVolatile { addr: self.data_addr, length: self.data_len, sector: self.sector, volatile_memory_error, })?; disk.write_all_volatile(mem_slice) .map_err(|io_error| ExecuteError::WriteIo { addr: self.data_addr, length: self.data_len, sector: self.sector, io_error, })?; if !*flush_timer_armed { flush_timer .reset(flush_delay, None) .map_err(ExecuteError::TimerFd)?; *flush_timer_armed = true; } } RequestType::Discard | RequestType::WriteZeroes => { if let Some(seg) = self.discard_write_zeroes_seg { let sector = seg.sector.to_native(); let num_sectors = seg.num_sectors.to_native(); let flags = 
seg.flags.to_native(); let valid_flags = if self.request_type == RequestType::WriteZeroes { VIRTIO_BLK_DISCARD_WRITE_ZEROES_FLAG_UNMAP } else { 0 }; if (flags & !valid_flags) != 0 { return Err(ExecuteError::DiscardWriteZeroes { ioerr: None, sector, num_sectors, flags, }); } let offset = sector .checked_shl(u32::from(SECTOR_SHIFT)) .ok_or(ExecuteError::OutOfRange)?; let length = u64::from(num_sectors) .checked_shl(u32::from(SECTOR_SHIFT)) .ok_or(ExecuteError::OutOfRange)?; check_range(offset, length, disk_size)?; if self.request_type == RequestType::Discard { // Since Discard is just a hint and some filesystems may not implement // FALLOC_FL_PUNCH_HOLE, ignore punch_hole errors. let _ = disk.punch_hole(offset, length); } else { disk.seek(SeekFrom::Start(offset)) .map_err(|e| ExecuteError::Seek { ioerr: e, sector })?; disk.write_zeroes(length as usize).map_err(|e| { ExecuteError::DiscardWriteZeroes { ioerr: Some(e), sector, num_sectors, flags, } })?; } } } RequestType::Flush => { disk.fsync().map_err(ExecuteError::Flush)?; flush_timer.clear().map_err(ExecuteError::TimerFd)?; *flush_timer_armed = false; } RequestType::Unsupported(t) => return Err(ExecuteError::Unsupported(t)), }; Ok(0) } } struct Worker { queues: Vec, mem: GuestMemory, disk_image: T, disk_size: Arc>, read_only: bool, interrupt_status: Arc, interrupt_evt: EventFd, interrupt_resample_evt: EventFd, } impl Worker { fn process_queue( &mut self, queue_index: usize, flush_timer: &mut TimerFd, flush_timer_armed: &mut bool, ) -> bool { let queue = &mut self.queues[queue_index]; let disk_size = self.disk_size.lock(); let mut needs_interrupt = false; while let Some(avail_desc) = queue.pop(&self.mem) { let len; match Request::parse(&avail_desc, &self.mem) { Ok(request) => { let status = match request.execute( self.read_only, &mut self.disk_image, *disk_size, flush_timer, flush_timer_armed, &self.mem, ) { Ok(l) => { len = l; VIRTIO_BLK_S_OK } Err(e) => { error!("failed executing disk request: {}", e); len = 1; // 
1 byte for the status e.status() } }; // We use unwrap because the request parsing process already checked that the // status_addr was valid. self.mem .write_obj_at_addr(status, request.status_addr) .unwrap(); } Err(e) => { error!("failed processing available descriptor chain: {}", e); len = 0; } } queue.add_used(&self.mem, avail_desc.index, len); needs_interrupt = true; } needs_interrupt } fn resize(&mut self, new_size: u64) -> DiskControlResult { if self.read_only { error!("Attempted to resize read-only block device"); return DiskControlResult::Err(SysError::new(libc::EROFS)); } info!("Resizing block device to {} bytes", new_size); if let Err(e) = self.disk_image.set_len(new_size) { error!("Resizing disk failed! {}", e); return DiskControlResult::Err(SysError::new(libc::EIO)); } if let Ok(new_disk_size) = self.disk_image.seek(SeekFrom::End(0)) { let mut disk_size = self.disk_size.lock(); *disk_size = new_disk_size; } DiskControlResult::Ok } fn signal_used_queue(&self) { self.interrupt_status .fetch_or(INTERRUPT_STATUS_USED_RING as usize, Ordering::SeqCst); self.interrupt_evt.write(1).unwrap(); } fn signal_config_changed(&self) { self.interrupt_status .fetch_or(INTERRUPT_STATUS_CONFIG_CHANGED as usize, Ordering::SeqCst); self.interrupt_evt.write(1).unwrap(); } fn run( &mut self, queue_evt: EventFd, kill_evt: EventFd, control_socket: DiskControlResponseSocket, ) { #[derive(PollToken)] enum Token { FlushTimer, QueueAvailable, ControlRequest, InterruptResample, Kill, } let mut flush_timer = match TimerFd::new() { Ok(t) => t, Err(e) => { error!("Failed to create the flush timer: {}", e); return; } }; let mut flush_timer_armed = false; let poll_ctx: PollContext = match PollContext::new() .and_then(|pc| pc.add(&flush_timer, Token::FlushTimer).and(Ok(pc))) .and_then(|pc| pc.add(&queue_evt, Token::QueueAvailable).and(Ok(pc))) .and_then(|pc| pc.add(&control_socket, Token::ControlRequest).and(Ok(pc))) .and_then(|pc| { pc.add(&self.interrupt_resample_evt, 
Token::InterruptResample) .and(Ok(pc)) }) .and_then(|pc| pc.add(&kill_evt, Token::Kill).and(Ok(pc))) { Ok(pc) => pc, Err(e) => { error!("failed creating PollContext: {}", e); return; } }; 'poll: loop { let events = match poll_ctx.wait() { Ok(v) => v, Err(e) => { error!("failed polling for events: {}", e); break; } }; let mut needs_interrupt = false; let mut needs_config_interrupt = false; for event in events.iter_readable() { match event.token() { Token::FlushTimer => { if let Err(e) = self.disk_image.fsync() { error!("Failed to flush the disk: {}", e); break 'poll; } if let Err(e) = flush_timer.wait() { error!("Failed to clear flush timer: {}", e); break 'poll; } } Token::QueueAvailable => { if let Err(e) = queue_evt.read() { error!("failed reading queue EventFd: {}", e); break 'poll; } needs_interrupt |= self.process_queue(0, &mut flush_timer, &mut flush_timer_armed); } Token::ControlRequest => { let req = match control_socket.recv() { Ok(req) => req, Err(e) => { error!("control socket failed recv: {}", e); break 'poll; } }; let resp = match req { DiskControlCommand::Resize { new_size } => { needs_config_interrupt = true; self.resize(new_size) } }; if let Err(e) = control_socket.send(&resp) { error!("control socket failed send: {}", e); break 'poll; } } Token::InterruptResample => { let _ = self.interrupt_resample_evt.read(); if self.interrupt_status.load(Ordering::SeqCst) != 0 { self.interrupt_evt.write(1).unwrap(); } } Token::Kill => break 'poll, } } if needs_interrupt { self.signal_used_queue(); } if needs_config_interrupt { self.signal_config_changed(); } } } } /// Virtio device for exposing block level read/write operations on a host file. pub struct Block { kill_evt: Option, disk_image: Option, disk_size: Arc>, avail_features: u64, read_only: bool, control_socket: Option, } fn build_config_space(disk_size: u64) -> virtio_blk_config { virtio_blk_config { // If the image is not a multiple of the sector size, the tail bits are not exposed. 
capacity: Le64::from(disk_size >> SECTOR_SHIFT), blk_size: Le32::from(SECTOR_SIZE as u32), max_discard_sectors: Le32::from(MAX_DISCARD_SECTORS), discard_sector_alignment: Le32::from(DISCARD_SECTOR_ALIGNMENT), max_write_zeroes_sectors: Le32::from(MAX_WRITE_ZEROES_SECTORS), write_zeroes_may_unmap: 1, // Limit number of segments to 1 - see parse_discard_write_zeroes() max_discard_seg: Le32::from(1), max_write_zeroes_seg: Le32::from(1), ..Default::default() } } impl Block { /// Create a new virtio block device that operates on the given file. /// /// The given file must be seekable and sizable. pub fn new( mut disk_image: T, read_only: bool, control_socket: Option, ) -> SysResult> { let disk_size = disk_image.seek(SeekFrom::End(0))? as u64; if disk_size % SECTOR_SIZE != 0 { warn!( "Disk size {} is not a multiple of sector size {}; \ the remainder will not be visible to the guest.", disk_size, SECTOR_SIZE ); } let mut avail_features: u64 = 1 << VIRTIO_BLK_F_FLUSH; if read_only { avail_features |= 1 << VIRTIO_BLK_F_RO; } else { avail_features |= 1 << VIRTIO_BLK_F_DISCARD; avail_features |= 1 << VIRTIO_BLK_F_WRITE_ZEROES; } avail_features |= 1 << VIRTIO_F_VERSION_1; avail_features |= 1 << VIRTIO_BLK_F_BLK_SIZE; Ok(Block { kill_evt: None, disk_image: Some(disk_image), disk_size: Arc::new(Mutex::new(disk_size)), avail_features, read_only, control_socket, }) } } impl Drop for Block { fn drop(&mut self) { if let Some(kill_evt) = self.kill_evt.take() { // Ignore the result because there is nothing we can do about it. 
let _ = kill_evt.write(1); } } } impl VirtioDevice for Block { fn keep_fds(&self) -> Vec { let mut keep_fds = Vec::new(); if let Some(disk_image) = &self.disk_image { keep_fds.push(disk_image.as_raw_fd()); } if let Some(control_socket) = &self.control_socket { keep_fds.push(control_socket.as_raw_fd()); } keep_fds } fn features(&self) -> u64 { self.avail_features } fn device_type(&self) -> u32 { TYPE_BLOCK } fn queue_max_sizes(&self) -> &[u16] { QUEUE_SIZES } fn read_config(&self, offset: u64, mut data: &mut [u8]) { let config_space = { let disk_size = self.disk_size.lock(); build_config_space(*disk_size) }; let config_len = size_of_val(&config_space) as u64; if offset >= config_len { return; } if let Some(end) = offset.checked_add(data.len() as u64) { let offset = offset as usize; let end = cmp::min(end, config_len) as usize; // This write can't fail, offset and end are checked against config_len. data.write_all(&config_space.as_slice()[offset..end]) .unwrap(); } } fn activate( &mut self, mem: GuestMemory, interrupt_evt: EventFd, interrupt_resample_evt: EventFd, status: Arc, queues: Vec, mut queue_evts: Vec, ) { if queues.len() != 1 || queue_evts.len() != 1 { return; } let (self_kill_evt, kill_evt) = match EventFd::new().and_then(|e| Ok((e.try_clone()?, e))) { Ok(v) => v, Err(e) => { error!("failed creating kill EventFd pair: {}", e); return; } }; self.kill_evt = Some(self_kill_evt); let read_only = self.read_only; let disk_size = self.disk_size.clone(); if let Some(disk_image) = self.disk_image.take() { if let Some(control_socket) = self.control_socket.take() { let worker_result = thread::Builder::new() .name("virtio_blk".to_string()) .spawn(move || { let mut worker = Worker { queues, mem, disk_image, disk_size, read_only, interrupt_status: status, interrupt_evt, interrupt_resample_evt, }; worker.run(queue_evts.remove(0), kill_evt, control_socket); }); if let Err(e) = worker_result { error!("failed to spawn virtio_blk worker: {}", e); return; } } } } } 
#[cfg(test)]
mod tests {
    use std::fs::{File, OpenOptions};

    use tempfile::TempDir;

    use super::*;

    /// Size in bytes of the image used by the request tests: 8 sectors of 512 bytes.
    const TEST_DISK_SIZE: u64 = 0x1000;

    /// Executes a read request of `data_len` bytes starting at sector 7 (the last valid
    /// sector) of a freshly created `TEST_DISK_SIZE` image and returns the raw result.
    fn read_from_last_sector(data_len: u32) -> Result<u32, ExecuteError> {
        let tempdir = TempDir::new().unwrap();
        let image_path = tempdir.path().join("disk_image");
        let mut image = OpenOptions::new()
            .read(true)
            .write(true)
            .create(true)
            .open(&image_path)
            .unwrap();
        image.set_len(TEST_DISK_SIZE).unwrap();

        let mem = GuestMemory::new(&[(GuestAddress(0u64), 4 * 1024 * 1024)])
            .expect("Creating guest memory failed.");
        let mut flush_timer = TimerFd::new().expect("failed to create flush_timer");
        let mut flush_timer_armed = false;

        let request = Request {
            request_type: RequestType::In,
            sector: 7,
            data_addr: GuestAddress(0x1000),
            data_len,
            status_addr: GuestAddress(0),
            discard_write_zeroes_seg: None,
        };

        request.execute(
            false,
            &mut image,
            TEST_DISK_SIZE,
            &mut flush_timer,
            &mut flush_timer_armed,
            &mem,
        )
    }

    /// The capacity field at the start of the config space reports the size in sectors.
    #[test]
    fn read_size() {
        let tempdir = TempDir::new().unwrap();
        let image = File::create(tempdir.path().join("disk_image")).unwrap();
        image.set_len(0x1000).unwrap();

        let block = Block::new(image, true, None).unwrap();

        // Size is 0x1000, so the low word of the sector count is 8 (4096/512)
        // and the most significant word is 0.
        let expected_words: [(u64, [u8; 4]); 2] = [
            (0, [0x08, 0x00, 0x00, 0x00]),
            (4, [0x00, 0x00, 0x00, 0x00]),
        ];
        for &(offset, expected) in expected_words.iter() {
            let mut word = [0u8; 4];
            block.read_config(offset, &mut word);
            assert_eq!(expected, word);
        }
    }

    /// The advertised feature bits depend on whether the device is read-only.
    #[test]
    fn read_features() {
        let tempdir = TempDir::new().unwrap();
        let image_path = tempdir.path().join("disk_image");

        let features_for = |read_only: bool| -> u64 {
            let image = File::create(&image_path).unwrap();
            Block::new(image, read_only, None).unwrap().features()
        };

        // Writable: VIRTIO_BLK_F_FLUSH + VIRTIO_BLK_F_DISCARD + VIRTIO_BLK_F_WRITE_ZEROES
        // + VIRTIO_F_VERSION_1 + VIRTIO_BLK_F_BLK_SIZE.
        assert_eq!(0x100006240, features_for(false));

        // Read-only: VIRTIO_BLK_F_FLUSH + VIRTIO_BLK_F_RO + VIRTIO_F_VERSION_1
        // + VIRTIO_BLK_F_BLK_SIZE.
        assert_eq!(0x100000260, features_for(true));
    }

    /// Reading exactly one sector at the last valid sector succeeds.
    #[test]
    fn read_last_sector() {
        assert_eq!(512, read_from_last_sector(512).expect("execute failed"));
    }

    /// Reading two sectors from the last valid sector overlaps the end of the disk and fails.
    #[test]
    fn read_beyond_last_sector() {
        read_from_last_sector(512 * 2).expect_err("execute was supposed to fail");
    }
}