diff options
author | Chirantan Ekbote <chirantan@chromium.org> | 2019-08-16 16:18:25 +0900 |
---|---|---|
committer | Commit Bot <commit-bot@chromium.org> | 2019-11-01 06:20:27 +0000 |
commit | 2569b20f0fbfb976a7196ae9466c2c9d8ef506af (patch) | |
tree | 9c207f11be641e689393e81592d31213bfdb1eec /devices/src | |
parent | 85858f580eada8dd85c8c798ef3e98f18d92dc1e (diff) | |
download | crosvm-2569b20f0fbfb976a7196ae9466c2c9d8ef506af.tar crosvm-2569b20f0fbfb976a7196ae9466c2c9d8ef506af.tar.gz crosvm-2569b20f0fbfb976a7196ae9466c2c9d8ef506af.tar.bz2 crosvm-2569b20f0fbfb976a7196ae9466c2c9d8ef506af.tar.lz crosvm-2569b20f0fbfb976a7196ae9466c2c9d8ef506af.tar.xz crosvm-2569b20f0fbfb976a7196ae9466c2c9d8ef506af.tar.zst crosvm-2569b20f0fbfb976a7196ae9466c2c9d8ef506af.zip |
devices: fs: Add a passthrough file system
Add a "passthrough" file system implementation that just forwards its requests to the appropriate system call. BUG=b:136128319 TEST=`tast run vm.VirtioFs` Change-Id: I802c91dd0af8cdd8b9e761d9f04f874ae41ec033 Reviewed-on: https://chromium-review.googlesource.com/c/chromiumos/platform/crosvm/+/1758103 Tested-by: Chirantan Ekbote <chirantan@chromium.org> Tested-by: kokoro <noreply+kokoro@google.com> Commit-Queue: Chirantan Ekbote <chirantan@chromium.org> Reviewed-by: Stephen Barber <smbarber@chromium.org>
Diffstat (limited to 'devices/src')
-rw-r--r-- | devices/src/virtio/fs/passthrough.rs | 1504 |
1 files changed, 1504 insertions, 0 deletions
diff --git a/devices/src/virtio/fs/passthrough.rs b/devices/src/virtio/fs/passthrough.rs new file mode 100644 index 0000000..c42b212 --- /dev/null +++ b/devices/src/virtio/fs/passthrough.rs @@ -0,0 +1,1504 @@ +// Copyright 2019 The Chromium OS Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +use std::collections::btree_map; +use std::collections::BTreeMap; +use std::ffi::{CStr, CString}; +use std::fs::File; +use std::io; +use std::mem::{self, size_of, MaybeUninit}; +use std::os::unix::io::{AsRawFd, FromRawFd, RawFd}; +use std::str::FromStr; +use std::sync::atomic::{AtomicBool, AtomicU64, Ordering}; +use std::sync::{Arc, RwLock}; +use std::time::Duration; + +use data_model::DataInit; +use libc; +use sync::Mutex; +use sys_util::error; + +use crate::virtio::fs::filesystem::{ + Context, DirEntry, Entry, FileSystem, FsOptions, GetxattrReply, ListxattrReply, OpenOptions, + SetattrValid, ZeroCopyReader, ZeroCopyWriter, +}; +use crate::virtio::fs::fuse; +use crate::virtio::fs::multikey::MultikeyBTreeMap; + +const CURRENT_DIR_CSTR: &[u8] = b".\0"; +const PARENT_DIR_CSTR: &[u8] = b"..\0"; +const EMPTY_CSTR: &[u8] = b"\0"; +const ROOT_CSTR: &[u8] = b"/\0"; +const PROC_CSTR: &[u8] = b"/proc\0"; + +type Inode = u64; +type Handle = u64; + +#[derive(Clone, Copy, PartialOrd, Ord, PartialEq, Eq)] +struct InodeAltKey { + ino: libc::ino64_t, + dev: libc::dev_t, +} + +struct InodeData { + inode: Inode, + // Most of these aren't actually files but ¯\_(ツ)_/¯. + file: File, + refcount: AtomicU64, +} + +struct HandleData { + inode: Inode, + file: Mutex<File>, +} + +#[repr(C, packed)] +#[derive(Clone, Copy, Debug)] +struct LinuxDirent64 { + d_ino: libc::ino64_t, + d_off: libc::off64_t, + d_reclen: libc::c_ushort, + d_ty: libc::c_uchar, +} +unsafe impl DataInit for LinuxDirent64 {} + +/// The caching policy that the server should report to the client. 
+pub enum CachePolicy { + /// The client should never cache file data and all I/O should be directly forwarded + /// to the server. + Never, + + /// The client is free to choose when and how to cache file data. + Auto, + + /// The client should always cache file data. + Always, +} + +impl FromStr for CachePolicy { + type Err = &'static str; + + fn from_str(s: &str) -> Result<Self, Self::Err> { + match s { + "never" | "Never" | "NEVER" => Ok(CachePolicy::Never), + "auto" | "Auto" | "AUTO" => Ok(CachePolicy::Auto), + "always" | "Always" | "ALWAYS" => Ok(CachePolicy::Always), + _ => Err("invalid cache policy"), + } + } +} + +impl Default for CachePolicy { + fn default() -> Self { + CachePolicy::Auto + } +} + +macro_rules! scoped_cred { + ($name:ident, $ty:ty, $syscall_nr:expr) => { + #[derive(Debug)] + struct $name; + + impl $name { + // Changes the effective uid/gid of the current thread to `val`. Changes + // the thread's credentials back to root when the returned struct is dropped. + fn new(val: $ty) -> io::Result<Option<$name>> { + if val == 0 { + // Nothing to do since we are already uid 0. + return Ok(None); + } + + // We want credential changes to be per-thread because otherwise + // we might interfere with operations being carried out on other + // threads with different uids/gids. However, posix requires that + // all threads in a process share the same credentials. To do this + // libc uses signals to ensure that when one thread changes its + // credentials the other threads do the same thing. + // + // So instead we invoke the syscall directly in order to get around + // this limitation. Another option is to use the setfsuid and + // setfsgid systems calls. However since those calls have no way to + // return an error, it's preferable to do this instead. + + // This call is safe because it doesn't modify any memory and we + // check the return value. 
+ let res = unsafe { libc::syscall($syscall_nr, -1, val, -1) }; + if res == 0 { + Ok(Some($name)) + } else { + Err(io::Error::last_os_error()) + } + } + } + + impl Drop for $name { + fn drop(&mut self) { + let res = unsafe { libc::syscall($syscall_nr, -1, 0, -1) }; + if res < 0 { + error!( + "failed to change credentials back to root: {}", + io::Error::last_os_error(), + ); + } + } + } + }; +} +scoped_cred!(ScopedUid, libc::uid_t, libc::SYS_setresuid); +scoped_cred!(ScopedGid, libc::gid_t, libc::SYS_setresgid); + +fn set_creds( + uid: libc::uid_t, + gid: libc::gid_t, +) -> io::Result<(Option<ScopedUid>, Option<ScopedGid>)> { + // We have to change the gid before we change the uid because if we change the uid first then we + // lose the capability to change the gid. However changing back can happen in any order. + ScopedGid::new(gid).and_then(|gid| Ok((ScopedUid::new(uid)?, gid))) +} + +fn ebadf() -> io::Error { + io::Error::from_raw_os_error(libc::EBADF) +} + +fn stat(f: &File) -> io::Result<libc::stat64> { + let mut st = MaybeUninit::<libc::stat64>::zeroed(); + + // Safe because this is a constant value and a valid C string. + let pathname = unsafe { CStr::from_bytes_with_nul_unchecked(EMPTY_CSTR) }; + + // Safe because the kernel will only write data in `st` and we check the return + // value. + let res = unsafe { + libc::fstatat64( + f.as_raw_fd(), + pathname.as_ptr(), + st.as_mut_ptr(), + libc::AT_EMPTY_PATH | libc::AT_SYMLINK_NOFOLLOW, + ) + }; + if res >= 0 { + // Safe because the kernel guarantees that the struct is now fully initialized. + Ok(unsafe { st.assume_init() }) + } else { + Err(io::Error::last_os_error()) + } +} + +pub struct PassthroughFs { + // File descriptors for various points in the file system tree. These fds are always opened with + // the `O_PATH` option so they cannot be used for reading or writing any data. 
See the + // documentation of the `O_PATH` flag in `open(2)` for more details on what one can and cannot + // do with an fd opened with this flag. + inodes: RwLock<MultikeyBTreeMap<Inode, InodeAltKey, Arc<InodeData>>>, + next_inode: AtomicU64, + + // File descriptors for open files and directories. Unlike the fds in `inodes`, these _can_ be + // used for reading and writing data. + handles: RwLock<BTreeMap<Handle, Arc<HandleData>>>, + next_handle: AtomicU64, + + // File descriptor pointing to the `/proc` directory. This is used to convert an fd from + // `inodes` into one that can go into `handles`. This is accomplished by reading the + // `self/fd/{}` symlink. We keep an open fd here in case the file system tree that we are meant + // to be serving doesn't have access to `/proc`. + proc: File, + + // Whether writeback caching is enabled for this directory. This can improve write performance + // as it allows the guest to complete write requests before the data has been flushed to this + // server. However, this also has the possibility of causing data corruption as the contents of + // a file may change on disk while they are still buffered in the guest. So this should only be + // enabled when the guest has exclusive access to the directory being shared. + writeback: AtomicBool, + + timeout: Duration, + cache_policy: CachePolicy, +} + +impl PassthroughFs { + pub fn new(timeout: Duration, cache_policy: CachePolicy) -> io::Result<PassthroughFs> { + // Safe because this is a constant value and a valid C string. + let proc_cstr = unsafe { CStr::from_bytes_with_nul_unchecked(PROC_CSTR) }; + + // Safe because this doesn't modify any memory and we check the return value. + let fd = unsafe { + libc::openat( + libc::AT_FDCWD, + proc_cstr.as_ptr(), + libc::O_PATH | libc::O_NOFOLLOW | libc::O_CLOEXEC, + ) + }; + if fd < 0 { + return Err(io::Error::last_os_error()); + } + + // Safe because we just opened this fd. 
+ let proc = unsafe { File::from_raw_fd(fd) }; + + Ok(PassthroughFs { + inodes: RwLock::new(MultikeyBTreeMap::new()), + next_inode: AtomicU64::new(fuse::ROOT_ID + 1), + + handles: RwLock::new(BTreeMap::new()), + next_handle: AtomicU64::new(0), + + proc, + + writeback: AtomicBool::new(false), + timeout, + cache_policy, + }) + } + + pub fn keep_fds(&self) -> Vec<RawFd> { + vec![self.proc.as_raw_fd()] + } + + fn open_inode(&self, inode: Inode, mut flags: i32) -> io::Result<File> { + let data = self + .inodes + .read() + .unwrap() + .get(&inode) + .map(Arc::clone) + .ok_or_else(ebadf)?; + + let pathname = CString::new(format!("self/fd/{}", data.file.as_raw_fd())) + .map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e))?; + + // When writeback caching is enabled, the kernel may send read requests even if the + // userspace program opened the file write-only. So we need to ensure that we have opened + // the file for reading as well as writing. + let writeback = self.writeback.load(Ordering::Relaxed); + if writeback && flags & libc::O_ACCMODE == libc::O_WRONLY { + flags &= !libc::O_ACCMODE; + flags |= libc::O_RDWR; + } + + // When writeback caching is enabled the kernel is responsible for handling `O_APPEND`. + // However, this breaks atomicity as the file may have changed on disk, invalidating the + // cached copy of the data in the kernel and the offset that the kernel thinks is the end of + // the file. Just allow this for now as it is the user's responsibility to enable writeback + // caching only for directories that are not shared. It also means that we need to clear the + // `O_APPEND` flag. + if writeback && flags & libc::O_APPEND != 0 { + flags &= !libc::O_APPEND; + } + + // Safe because this doesn't modify any memory and we check the return value. We don't + // really check `flags` because if the kernel can't handle poorly specified flags then we + // have much bigger problems. 
Also, clear the `O_NOFOLLOW` flag if it is set since we need + // to follow the `/proc/self/fd` symlink to get the file. + let fd = unsafe { + libc::openat( + self.proc.as_raw_fd(), + pathname.as_ptr(), + (flags | libc::O_CLOEXEC) & (!libc::O_NOFOLLOW), + ) + }; + if fd < 0 { + return Err(io::Error::last_os_error()); + } + + // Safe because we just opened this fd. + Ok(unsafe { File::from_raw_fd(fd) }) + } + + fn do_lookup(&self, parent: Inode, name: &CStr) -> io::Result<Entry> { + let p = self + .inodes + .read() + .unwrap() + .get(&parent) + .map(Arc::clone) + .ok_or_else(ebadf)?; + + // Safe because this doesn't modify any memory and we check the return value. + let fd = unsafe { + libc::openat( + p.file.as_raw_fd(), + name.as_ptr(), + libc::O_PATH | libc::O_NOFOLLOW | libc::O_CLOEXEC, + ) + }; + if fd < 0 { + return Err(io::Error::last_os_error()); + } + + // Safe because we just opened this fd. + let f = unsafe { File::from_raw_fd(fd) }; + + let st = stat(&f)?; + + let altkey = InodeAltKey { + ino: st.st_ino, + dev: st.st_dev, + }; + let data = self.inodes.read().unwrap().get_alt(&altkey).map(Arc::clone); + + let inode = if let Some(data) = data { + // Matches with the release store in `forget`. + data.refcount.fetch_add(1, Ordering::Acquire); + data.inode + } else { + // There is a possible race here where 2 threads end up adding the same file + // into the inode list. However, since each of those will get a unique Inode + // value and unique file descriptors this shouldn't be that much of a problem. 
+ let inode = self.next_inode.fetch_add(1, Ordering::Relaxed); + self.inodes.write().unwrap().insert( + inode, + InodeAltKey { + ino: st.st_ino, + dev: st.st_dev, + }, + Arc::new(InodeData { + inode, + file: f, + refcount: AtomicU64::new(1), + }), + ); + + inode + }; + + Ok(Entry { + inode, + generation: 0, + attr: st, + attr_timeout: self.timeout.clone(), + entry_timeout: self.timeout.clone(), + }) + } + + fn do_readdir<F>( + &self, + inode: Inode, + handle: Handle, + size: u32, + offset: u64, + mut add_entry: F, + ) -> io::Result<()> + where + F: FnMut(DirEntry) -> io::Result<usize>, + { + if size == 0 { + return Ok(()); + } + + let data = self + .handles + .read() + .unwrap() + .get(&handle) + .filter(|hd| hd.inode == inode) + .map(Arc::clone) + .ok_or_else(ebadf)?; + + let mut buf = Vec::with_capacity(size as usize); + buf.resize(size as usize, 0); + + { + // Since we are going to work with the kernel offset, we have to acquire the file lock + // for both the `lseek64` and `getdents64` syscalls to ensure that no other thread + // changes the kernel offset while we are using it. + let dir = data.file.lock(); + + // Safe because this doesn't modify any memory and we check the return value. + let res = + unsafe { libc::lseek64(dir.as_raw_fd(), offset as libc::off64_t, libc::SEEK_SET) }; + if res < 0 { + return Err(io::Error::last_os_error()); + } + + // Safe because the kernel guarantees that it will only write to `buf` and we check the + // return value. + let res = unsafe { + libc::syscall( + libc::SYS_getdents64, + dir.as_raw_fd(), + buf.as_mut_ptr() as *mut LinuxDirent64, + size as libc::c_int, + ) + }; + if res < 0 { + return Err(io::Error::last_os_error()); + } + buf.resize(res as usize, 0); + + // Explicitly drop the lock so that it's not held while we fill in the fuse buffer. 
+ mem::drop(dir); + } + + let mut rem = &buf[..]; + while rem.len() > 0 { + // We only use debug asserts here because these values are coming from the kernel and we + // trust them implicitly. + debug_assert!( + rem.len() >= size_of::<LinuxDirent64>(), + "not enough space left in `rem`" + ); + + let (front, back) = rem.split_at(size_of::<LinuxDirent64>()); + + let dirent64 = + LinuxDirent64::from_slice(front).expect("unable to get LinuxDirent64 from slice"); + + let namelen = dirent64.d_reclen as usize - size_of::<LinuxDirent64>(); + debug_assert!(namelen <= back.len(), "back is smaller than `namelen`"); + + let name = &back[..namelen]; + let res = if name.starts_with(CURRENT_DIR_CSTR) || name.starts_with(PARENT_DIR_CSTR) { + // We don't want to report the "." and ".." entries. However, returning `Ok(0)` will + // break the loop so return `Ok` with a non-zero value instead. + Ok(1) + } else { + add_entry(DirEntry { + ino: dirent64.d_ino, + offset: dirent64.d_off as u64, + type_: dirent64.d_ty as u32, + name, + }) + }; + + debug_assert!( + rem.len() >= dirent64.d_reclen as usize, + "rem is smaller than `d_reclen`" + ); + + match res { + Ok(0) => break, + Ok(_) => rem = &rem[dirent64.d_reclen as usize..], + Err(e) => return Err(e), + } + } + + Ok(()) + } + + fn do_open(&self, inode: Inode, flags: u32) -> io::Result<(Option<Handle>, OpenOptions)> { + let file = Mutex::new(self.open_inode(inode, flags as i32)?); + + let handle = self.next_handle.fetch_add(1, Ordering::Relaxed); + let data = HandleData { inode, file }; + + self.handles.write().unwrap().insert(handle, Arc::new(data)); + + let mut opts = OpenOptions::empty(); + match self.cache_policy { + // We only set the direct I/O option on files. 
+ CachePolicy::Never => opts.set( + OpenOptions::DIRECT_IO, + flags & (libc::O_DIRECTORY as u32) == 0, + ), + CachePolicy::Always => opts |= OpenOptions::KEEP_CACHE, + _ => {} + }; + + Ok((Some(handle), opts)) + } + + fn do_release(&self, inode: Inode, handle: Handle) -> io::Result<()> { + let mut handles = self.handles.write().unwrap(); + + if let btree_map::Entry::Occupied(e) = handles.entry(handle) { + if e.get().inode == inode { + // We don't need to close the file here because that will happen automatically when + // the last `Arc` is dropped. + e.remove(); + return Ok(()); + } + } + + Err(ebadf()) + } + + fn do_getattr(&self, inode: Inode) -> io::Result<(libc::stat64, Duration)> { + let data = self + .inodes + .read() + .unwrap() + .get(&inode) + .map(Arc::clone) + .ok_or_else(ebadf)?; + + let st = stat(&data.file)?; + + Ok((st, self.timeout.clone())) + } + + fn do_unlink(&self, parent: Inode, name: &CStr, flags: libc::c_int) -> io::Result<()> { + let data = self + .inodes + .read() + .unwrap() + .get(&parent) + .map(Arc::clone) + .ok_or_else(ebadf)?; + + // Safe because this doesn't modify any memory and we check the return value. + let res = unsafe { libc::unlinkat(data.file.as_raw_fd(), name.as_ptr(), flags) }; + if res == 0 { + Ok(()) + } else { + Err(io::Error::last_os_error()) + } + } +} + +fn forget_one( + inodes: &mut MultikeyBTreeMap<Inode, InodeAltKey, Arc<InodeData>>, + inode: Inode, + count: u64, +) { + if let Some(data) = inodes.get(&inode) { + // Acquiring the write lock on the inode map prevents new lookups from incrementing the + // refcount but there is the possibility that a previous lookup already acquired a + // reference to the inode data and is in the process of updating the refcount so we need + // to loop here until we can decrement successfully. 
+ loop { + let refcount = data.refcount.load(Ordering::Relaxed); + + // Saturating sub because it doesn't make sense for a refcount to go below zero and + // we don't want misbehaving clients to cause integer overflow. + let new_count = refcount.saturating_sub(count); + + // Synchronizes with the acquire load in `do_lookup`. + if data + .refcount + .compare_and_swap(refcount, new_count, Ordering::Release) + == refcount + { + if new_count == 0 { + // We just removed the last refcount for this inode. There's no need for an + // acquire fence here because we hold a write lock on the inode map and any + // thread that is waiting to do a forget on the same inode will have to wait + // until we release the lock. So there's is no other release store for us to + // synchronize with before deleting the entry. + inodes.remove(&inode); + } + break; + } + } + } +} + +impl FileSystem for PassthroughFs { + type Inode = Inode; + type Handle = Handle; + + fn init(&self, capable: FsOptions) -> io::Result<FsOptions> { + // Safe because this is a constant value and a valid C string. + let root = unsafe { CStr::from_bytes_with_nul_unchecked(ROOT_CSTR) }; + + // Safe because this doesn't modify any memory and we check the return value. + // We use `O_PATH` because we just want this for traversing the directory tree + // and not for actually reading the contents. + let fd = unsafe { + libc::openat( + libc::AT_FDCWD, + root.as_ptr(), + libc::O_PATH | libc::O_NOFOLLOW | libc::O_CLOEXEC, + ) + }; + if fd < 0 { + return Err(io::Error::last_os_error()); + } + + // Safe because we just opened this fd above. + let f = unsafe { File::from_raw_fd(fd) }; + + let st = stat(&f)?; + + // Safe because this doesn't modify any memory and there is no need to check the return + // value because this system call always succeeds. We need to clear the umask here because + // we want the client to be able to set all the bits in the mode. 
+ unsafe { libc::umask(0o000) }; + + let mut inodes = self.inodes.write().unwrap(); + + // Not sure why the root inode gets a refcount of 2 but that's what libfuse does. + inodes.insert( + fuse::ROOT_ID, + InodeAltKey { + ino: st.st_ino, + dev: st.st_dev, + }, + Arc::new(InodeData { + inode: fuse::ROOT_ID, + file: f, + refcount: AtomicU64::new(2), + }), + ); + + let mut opts = FsOptions::DO_READDIRPLUS | FsOptions::READDIRPLUS_AUTO; + if capable.contains(FsOptions::WRITEBACK_CACHE) { + opts |= FsOptions::WRITEBACK_CACHE; + self.writeback.store(true, Ordering::Relaxed); + } + Ok(opts) + } + + fn destroy(&self) { + self.handles.write().unwrap().clear(); + self.inodes.write().unwrap().clear(); + } + + fn statfs(&self, _ctx: Context, inode: Inode) -> io::Result<libc::statvfs64> { + let data = self + .inodes + .read() + .unwrap() + .get(&inode) + .map(Arc::clone) + .ok_or_else(ebadf)?; + + let mut out = MaybeUninit::<libc::statvfs64>::zeroed(); + + // Safe because this will only modify `out` and we check the return value. + let res = unsafe { libc::fstatvfs64(data.file.as_raw_fd(), out.as_mut_ptr()) }; + if res == 0 { + // Safe because the kernel guarantees that `out` has been initialized. 
+ Ok(unsafe { out.assume_init() }) + } else { + Err(io::Error::last_os_error()) + } + } + + fn lookup(&self, _ctx: Context, parent: Inode, name: &CStr) -> io::Result<Entry> { + self.do_lookup(parent, name) + } + + fn forget(&self, _ctx: Context, inode: Inode, count: u64) { + let mut inodes = self.inodes.write().unwrap(); + + forget_one(&mut inodes, inode, count) + } + + fn batch_forget(&self, _ctx: Context, requests: Vec<(Inode, u64)>) { + let mut inodes = self.inodes.write().unwrap(); + + for (inode, count) in requests { + forget_one(&mut inodes, inode, count) + } + } + + fn opendir( + &self, + _ctx: Context, + inode: Inode, + flags: u32, + ) -> io::Result<(Option<Handle>, OpenOptions)> { + self.do_open(inode, flags | (libc::O_DIRECTORY as u32)) + } + + fn releasedir( + &self, + _ctx: Context, + inode: Inode, + _flags: u32, + handle: Handle, + ) -> io::Result<()> { + self.do_release(inode, handle) + } + + fn mkdir( + &self, + ctx: Context, + parent: Inode, + name: &CStr, + mode: u32, + umask: u32, + ) -> io::Result<Entry> { + let (_uid, _gid) = set_creds(ctx.uid, ctx.gid)?; + let data = self + .inodes + .read() + .unwrap() + .get(&parent) + .map(Arc::clone) + .ok_or_else(ebadf)?; + + // Safe because this doesn't modify any memory and we check the return value. 
+ let res = unsafe { libc::mkdirat(data.file.as_raw_fd(), name.as_ptr(), mode & !umask) }; + if res == 0 { + self.do_lookup(parent, name) + } else { + Err(io::Error::last_os_error()) + } + } + + fn rmdir(&self, _ctx: Context, parent: Inode, name: &CStr) -> io::Result<()> { + self.do_unlink(parent, name, libc::AT_REMOVEDIR) + } + + fn readdir<F>( + &self, + _ctx: Context, + inode: Inode, + handle: Handle, + size: u32, + offset: u64, + add_entry: F, + ) -> io::Result<()> + where + F: FnMut(DirEntry) -> io::Result<usize>, + { + self.do_readdir(inode, handle, size, offset, add_entry) + } + + fn readdirplus<F>( + &self, + _ctx: Context, + inode: Inode, + handle: Handle, + size: u32, + offset: u64, + mut add_entry: F, + ) -> io::Result<()> + where + F: FnMut(DirEntry, Entry) -> io::Result<usize>, + { + self.do_readdir(inode, handle, size, offset, |dir_entry| { + // Safe because the kernel guarantees that the buffer is nul-terminated. Additionally, + // the kernel will pad the name with '\0' bytes up to 8-byte alignment and there's no + // way for us to know exactly how many padding bytes there are. This would cause + // `CStr::from_bytes_with_nul` to return an error because it would think there are + // interior '\0' bytes. We trust the kernel to provide us with properly formatted data + // so we'll just skip the checks here. 
+ let name = unsafe { CStr::from_bytes_with_nul_unchecked(dir_entry.name) }; + let entry = self.do_lookup(inode, name)?; + + add_entry(dir_entry, entry) + }) + } + + fn open( + &self, + _ctx: Context, + inode: Inode, + flags: u32, + ) -> io::Result<(Option<Handle>, OpenOptions)> { + self.do_open(inode, flags) + } + + fn release( + &self, + _ctx: Context, + inode: Inode, + _flags: u32, + handle: Handle, + _flush: bool, + _flock_release: bool, + _lock_owner: Option<u64>, + ) -> io::Result<()> { + self.do_release(inode, handle) + } + + fn create( + &self, + ctx: Context, + parent: Inode, + name: &CStr, + mode: u32, + flags: u32, + umask: u32, + ) -> io::Result<(Entry, Option<Handle>, OpenOptions)> { + let (_uid, _gid) = set_creds(ctx.uid, ctx.gid)?; + let data = self + .inodes + .read() + .unwrap() + .get(&parent) + .map(Arc::clone) + .ok_or_else(ebadf)?; + + // Safe because this doesn't modify any memory and we check the return value. We don't + // really check `flags` because if the kernel can't handle poorly specified flags then we + // have much bigger problems. + let fd = unsafe { + libc::openat( + data.file.as_raw_fd(), + name.as_ptr(), + flags as i32 | libc::O_CREAT | libc::O_CLOEXEC | libc::O_NOFOLLOW, + mode & !(umask & 0o777), + ) + }; + if fd < 0 { + return Err(io::Error::last_os_error()); + } + + // Safe because we just opened this fd. 
+ let file = Mutex::new(unsafe { File::from_raw_fd(fd) }); + + let entry = self.do_lookup(parent, name)?; + + let handle = self.next_handle.fetch_add(1, Ordering::Relaxed); + let data = HandleData { + inode: entry.inode, + file, + }; + + self.handles.write().unwrap().insert(handle, Arc::new(data)); + + let mut opts = OpenOptions::empty(); + match self.cache_policy { + CachePolicy::Never => opts |= OpenOptions::DIRECT_IO, + CachePolicy::Always => opts |= OpenOptions::KEEP_CACHE, + _ => {} + }; + + Ok((entry, Some(handle), opts)) + } + + fn unlink(&self, _ctx: Context, parent: Inode, name: &CStr) -> io::Result<()> { + self.do_unlink(parent, name, 0) + } + + fn read<W: io::Write + ZeroCopyWriter>( + &self, + _ctx: Context, + inode: Inode, + handle: Handle, + mut w: W, + size: u32, + offset: u64, + _lock_owner: Option<u64>, + _flags: u32, + ) -> io::Result<usize> { + let data = self + .handles + .read() + .unwrap() + .get(&handle) + .filter(|hd| hd.inode == inode) + .map(Arc::clone) + .ok_or_else(ebadf)?; + + let mut f = data.file.lock(); + w.write_from(&mut f, size as usize, offset) + } + + fn write<R: io::Read + ZeroCopyReader>( + &self, + ctx: Context, + inode: Inode, + handle: Handle, + mut r: R, + size: u32, + offset: u64, + _lock_owner: Option<u64>, + _delayed_write: bool, + _flags: u32, + ) -> io::Result<usize> { + // We need to change credentials during a write so that the kernel will remove setuid or + // setgid bits from the file if it was written to by someone other than the owner. 
+ let (_uid, _gid) = set_creds(ctx.uid, ctx.gid)?; + let data = self + .handles + .read() + .unwrap() + .get(&handle) + .filter(|hd| hd.inode == inode) + .map(Arc::clone) + .ok_or_else(ebadf)?; + + let mut f = data.file.lock(); + r.read_to(&mut f, size as usize, offset) + } + + fn getattr( + &self, + _ctx: Context, + inode: Inode, + _handle: Option<Handle>, + ) -> io::Result<(libc::stat64, Duration)> { + self.do_getattr(inode) + } + + fn setattr( + &self, + _ctx: Context, + inode: Inode, + attr: libc::stat64, + handle: Option<Handle>, + valid: SetattrValid, + ) -> io::Result<(libc::stat64, Duration)> { + let inode_data = self + .inodes + .read() + .unwrap() + .get(&inode) + .map(Arc::clone) + .ok_or_else(ebadf)?; + + enum Data { + Handle(Arc<HandleData>, RawFd), + ProcPath(CString), + } + + // If we have a handle then use it otherwise get a new fd from the inode. + let data = if let Some(handle) = handle { + let hd = self + .handles + .read() + .unwrap() + .get(&handle) + .filter(|hd| hd.inode == inode) + .map(Arc::clone) + .ok_or_else(ebadf)?; + + let fd = hd.file.lock().as_raw_fd(); + Data::Handle(hd, fd) + } else { + let pathname = CString::new(format!("self/fd/{}", inode_data.file.as_raw_fd())) + .map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e))?; + Data::ProcPath(pathname) + }; + + if valid.contains(SetattrValid::MODE) { + // Safe because this doesn't modify any memory and we check the return value. + let res = unsafe { + match data { + Data::Handle(_, fd) => libc::fchmod(fd, attr.st_mode), + Data::ProcPath(ref p) => { + libc::fchmodat(self.proc.as_raw_fd(), p.as_ptr(), attr.st_mode, 0) + } + } + }; + if res < 0 { + return Err(io::Error::last_os_error()); + } + } + + if valid.intersects(SetattrValid::UID | SetattrValid::GID) { + let uid = if valid.contains(SetattrValid::UID) { + attr.st_uid + } else { + // Cannot use -1 here because these are unsigned values. 
+ ::std::u32::MAX + }; + let gid = if valid.contains(SetattrValid::GID) { + attr.st_gid + } else { + // Cannot use -1 here because these are unsigned values. + ::std::u32::MAX + }; + + // Safe because this is a constant value and a valid C string. + let empty = unsafe { CStr::from_bytes_with_nul_unchecked(EMPTY_CSTR) }; + + // Safe because this doesn't modify any memory and we check the return value. + let res = unsafe { + libc::fchownat( + inode_data.file.as_raw_fd(), + empty.as_ptr(), + uid, + gid, + libc::AT_EMPTY_PATH | libc::AT_SYMLINK_NOFOLLOW, + ) + }; + if res < 0 { + return Err(io::Error::last_os_error()); + } + } + + if valid.contains(SetattrValid::SIZE) { + // Safe because this doesn't modify any memory and we check the return value. + let res = match data { + Data::Handle(_, fd) => unsafe { libc::ftruncate(fd, attr.st_size) }, + _ => { + // There is no `ftruncateat` so we need to get a new fd and truncate it. + let f = self.open_inode(inode, libc::O_NONBLOCK | libc::O_RDWR)?; + unsafe { libc::ftruncate(f.as_raw_fd(), attr.st_size) } + } + }; + if res < 0 { + return Err(io::Error::last_os_error()); + } + } + + if valid.intersects(SetattrValid::ATIME | SetattrValid::MTIME) { + let mut tvs = [ + libc::timespec { + tv_sec: 0, + tv_nsec: libc::UTIME_OMIT, + }, + libc::timespec { + tv_sec: 0, + tv_nsec: libc::UTIME_OMIT, + }, + ]; + + if valid.contains(SetattrValid::ATIME_NOW) { + tvs[0].tv_nsec = libc::UTIME_NOW; + } else if valid.contains(SetattrValid::ATIME) { + tvs[0].tv_sec = attr.st_atime; + tvs[0].tv_nsec = attr.st_atime_nsec; + } + + if valid.contains(SetattrValid::MTIME_NOW) { + tvs[1].tv_nsec = libc::UTIME_NOW; + } else if valid.contains(SetattrValid::MTIME) { + tvs[1].tv_sec = attr.st_mtime; + tvs[1].tv_nsec = attr.st_mtime_nsec; + } + + // Safe because this doesn't modify any memory and we check the return value. 
+ let res = match data { + Data::Handle(_, fd) => unsafe { libc::futimens(fd, tvs.as_ptr()) }, + Data::ProcPath(ref p) => unsafe { + libc::utimensat(self.proc.as_raw_fd(), p.as_ptr(), tvs.as_ptr(), 0) + }, + }; + if res < 0 { + return Err(io::Error::last_os_error()); + } + } + + self.do_getattr(inode) + } + + fn rename( + &self, + _ctx: Context, + olddir: Inode, + oldname: &CStr, + newdir: Inode, + newname: &CStr, + flags: u32, + ) -> io::Result<()> { + let old_inode = self + .inodes + .read() + .unwrap() + .get(&olddir) + .map(Arc::clone) + .ok_or_else(ebadf)?; + let new_inode = self + .inodes + .read() + .unwrap() + .get(&newdir) + .map(Arc::clone) + .ok_or_else(ebadf)?; + + // Safe because this doesn't modify any memory and we check the return value. + // TODO: Switch to libc::renameat2 once https://github.com/rust-lang/libc/pull/1508 lands + // and we have glibc 2.28. + let res = unsafe { + libc::syscall( + libc::SYS_renameat2, + old_inode.file.as_raw_fd(), + oldname.as_ptr(), + new_inode.file.as_raw_fd(), + newname.as_ptr(), + flags, + ) + }; + if res == 0 { + Ok(()) + } else { + Err(io::Error::last_os_error()) + } + } + + fn mknod( + &self, + ctx: Context, + parent: Inode, + name: &CStr, + mode: u32, + rdev: u32, + umask: u32, + ) -> io::Result<Entry> { + let (_uid, _gid) = set_creds(ctx.uid, ctx.gid)?; + let data = self + .inodes + .read() + .unwrap() + .get(&parent) + .map(Arc::clone) + .ok_or_else(ebadf)?; + + // Safe because this doesn't modify any memory and we check the return value. 
+ let res = unsafe { + libc::mknodat( + data.file.as_raw_fd(), + name.as_ptr(), + (mode & !umask) as libc::mode_t, + rdev as libc::dev_t, + ) + }; + + if res < 0 { + Err(io::Error::last_os_error()) + } else { + self.do_lookup(parent, name) + } + } + + fn link( + &self, + _ctx: Context, + inode: Inode, + newparent: Inode, + newname: &CStr, + ) -> io::Result<Entry> { + let data = self + .inodes + .read() + .unwrap() + .get(&inode) + .map(Arc::clone) + .ok_or_else(ebadf)?; + let new_inode = self + .inodes + .read() + .unwrap() + .get(&newparent) + .map(Arc::clone) + .ok_or_else(ebadf)?; + + // Safe because this is a constant value and a valid C string. + let empty = unsafe { CStr::from_bytes_with_nul_unchecked(EMPTY_CSTR) }; + + // Safe because this doesn't modify any memory and we check the return value. + let res = unsafe { + libc::linkat( + data.file.as_raw_fd(), + empty.as_ptr(), + new_inode.file.as_raw_fd(), + newname.as_ptr(), + libc::AT_EMPTY_PATH, + ) + }; + if res == 0 { + self.do_lookup(newparent, newname) + } else { + Err(io::Error::last_os_error()) + } + } + + fn symlink( + &self, + ctx: Context, + linkname: &CStr, + parent: Inode, + name: &CStr, + ) -> io::Result<Entry> { + let (_uid, _gid) = set_creds(ctx.uid, ctx.gid)?; + let data = self + .inodes + .read() + .unwrap() + .get(&parent) + .map(Arc::clone) + .ok_or_else(ebadf)?; + + // Safe because this doesn't modify any memory and we check the return value. + let res = + unsafe { libc::symlinkat(linkname.as_ptr(), data.file.as_raw_fd(), name.as_ptr()) }; + if res == 0 { + self.do_lookup(parent, name) + } else { + Err(io::Error::last_os_error()) + } + } + + fn readlink(&self, _ctx: Context, inode: Inode) -> io::Result<Vec<u8>> { + let data = self + .inodes + .read() + .unwrap() + .get(&inode) + .map(Arc::clone) + .ok_or_else(ebadf)?; + + let mut buf = Vec::with_capacity(libc::PATH_MAX as usize); + buf.resize(libc::PATH_MAX as usize, 0); + + // Safe because this is a constant value and a valid C string. 
+ let empty = unsafe { CStr::from_bytes_with_nul_unchecked(EMPTY_CSTR) }; + + // Safe because this will only modify the contents of `buf` and we check the return value. + let res = unsafe { + libc::readlinkat( + data.file.as_raw_fd(), + empty.as_ptr(), + buf.as_mut_ptr() as *mut libc::c_char, + buf.len(), + ) + }; + if res < 0 { + return Err(io::Error::last_os_error()); + } + + buf.resize(res as usize, 0); + Ok(buf) + } + + fn flush( + &self, + _ctx: Context, + inode: Inode, + handle: Handle, + _lock_owner: u64, + ) -> io::Result<()> { + let data = self + .handles + .read() + .unwrap() + .get(&handle) + .filter(|hd| hd.inode == inode) + .map(Arc::clone) + .ok_or_else(ebadf)?; + + // Since this method is called whenever an fd is closed in the client, we can emulate that + // behavior by doing the same thing (dup-ing the fd and then immediately closing it). Safe + // because this doesn't modify any memory and we check the return values. + unsafe { + let newfd = libc::dup(data.file.lock().as_raw_fd()); + if newfd < 0 { + return Err(io::Error::last_os_error()); + } + + if libc::close(newfd) < 0 { + Err(io::Error::last_os_error()) + } else { + Ok(()) + } + } + } + + fn fsync(&self, _ctx: Context, inode: Inode, datasync: bool, handle: Handle) -> io::Result<()> { + let data = self + .handles + .read() + .unwrap() + .get(&handle) + .filter(|hd| hd.inode == inode) + .map(Arc::clone) + .ok_or_else(ebadf)?; + + let fd = data.file.lock().as_raw_fd(); + + // Safe because this doesn't modify any memory and we check the return value. 
+ let res = unsafe { + if datasync { + libc::fdatasync(fd) + } else { + libc::fsync(fd) + } + }; + + if res == 0 { + Ok(()) + } else { + Err(io::Error::last_os_error()) + } + } + + fn fsyncdir( + &self, + ctx: Context, + inode: Inode, + datasync: bool, + handle: Handle, + ) -> io::Result<()> { + self.fsync(ctx, inode, datasync, handle) + } + + fn access(&self, ctx: Context, inode: Inode, mask: u32) -> io::Result<()> { + let data = self + .inodes + .read() + .unwrap() + .get(&inode) + .map(Arc::clone) + .ok_or_else(ebadf)?; + + let st = stat(&data.file)?; + let mode = mask as i32 & (libc::R_OK | libc::W_OK | libc::X_OK); + + if mode == libc::F_OK { + // The file exists since we were able to call `stat(2)` on it. + return Ok(()); + } + + if (mode & libc::R_OK) != 0 { + if ctx.uid != 0 + && (st.st_uid != ctx.uid || st.st_mode & 0o400 == 0) + && (st.st_gid != ctx.gid || st.st_mode & 0o040 == 0) + && st.st_mode & 0o004 == 0 + { + return Err(io::Error::from_raw_os_error(libc::EACCES)); + } + } + + if (mode & libc::W_OK) != 0 { + if ctx.uid != 0 + && (st.st_uid != ctx.uid || st.st_mode & 0o200 == 0) + && (st.st_gid != ctx.gid || st.st_mode & 0o020 == 0) + && st.st_mode & 0o002 == 0 + { + return Err(io::Error::from_raw_os_error(libc::EACCES)); + } + } + + // root can only execute something if it is executable by one of the owner, the group, or + // everyone. + if (mode & libc::X_OK) != 0 { + if (ctx.uid != 0 || st.st_mode & 0o111 == 0) + && (st.st_uid != ctx.uid || st.st_mode & 0o100 == 0) + && (st.st_gid != ctx.gid || st.st_mode & 0o010 == 0) + && st.st_mode & 0o001 == 0 + { + return Err(io::Error::from_raw_os_error(libc::EACCES)); + } + } + + Ok(()) + } + + fn setxattr( + &self, + _ctx: Context, + inode: Inode, + name: &CStr, + value: &[u8], + flags: u32, + ) -> io::Result<()> { + // The f{set,get,remove,list}xattr functions don't work on an fd opened with `O_PATH` so we + // need to get a new fd. 
+ let file = self.open_inode(inode, libc::O_RDONLY | libc::O_NONBLOCK)?; + + // Safe because this doesn't modify any memory and we check the return value. + let res = unsafe { + libc::fsetxattr( + file.as_raw_fd(), + name.as_ptr(), + value.as_ptr() as *const libc::c_void, + value.len(), + flags as libc::c_int, + ) + }; + if res == 0 { + Ok(()) + } else { + Err(io::Error::last_os_error()) + } + } + + fn getxattr( + &self, + _ctx: Context, + inode: Inode, + name: &CStr, + size: u32, + ) -> io::Result<GetxattrReply> { + // The f{set,get,remove,list}xattr functions don't work on an fd opened with `O_PATH` so we + // need to get a new fd. + let file = self.open_inode(inode, libc::O_RDONLY | libc::O_NONBLOCK)?; + + let mut buf = Vec::with_capacity(size as usize); + buf.resize(size as usize, 0); + + // Safe because this will only modify the contents of `buf`. + let res = unsafe { + libc::fgetxattr( + file.as_raw_fd(), + name.as_ptr(), + buf.as_mut_ptr() as *mut libc::c_void, + size as libc::size_t, + ) + }; + if res < 0 { + return Err(io::Error::last_os_error()); + } + + if size == 0 { + Ok(GetxattrReply::Count(res as u32)) + } else { + buf.resize(res as usize, 0); + Ok(GetxattrReply::Value(buf)) + } + } + + fn listxattr(&self, _ctx: Context, inode: Inode, size: u32) -> io::Result<ListxattrReply> { + // The f{set,get,remove,list}xattr functions don't work on an fd opened with `O_PATH` so we + // need to get a new fd. + let file = self.open_inode(inode, libc::O_RDONLY | libc::O_NONBLOCK)?; + + let mut buf = Vec::with_capacity(size as usize); + buf.resize(size as usize, 0); + + // Safe because this will only modify the contents of `buf`. 
+ let res = unsafe { + libc::flistxattr( + file.as_raw_fd(), + buf.as_mut_ptr() as *mut libc::c_char, + size as libc::size_t, + ) + }; + if res < 0 { + return Err(io::Error::last_os_error()); + } + + if size == 0 { + Ok(ListxattrReply::Count(res as u32)) + } else { + buf.resize(res as usize, 0); + Ok(ListxattrReply::Names(buf)) + } + } + + fn removexattr(&self, _ctx: Context, inode: Inode, name: &CStr) -> io::Result<()> { + // The f{set,get,remove,list}xattr functions don't work on an fd opened with `O_PATH` so we + // need to get a new fd. + let file = self.open_inode(inode, libc::O_RDONLY | libc::O_NONBLOCK)?; + + // Safe because this doesn't modify any memory and we check the return value. + let res = unsafe { libc::fremovexattr(file.as_raw_fd(), name.as_ptr()) }; + + if res == 0 { + Ok(()) + } else { + Err(io::Error::last_os_error()) + } + } + + fn fallocate( + &self, + _ctx: Context, + inode: Inode, + handle: Handle, + mode: u32, + offset: u64, + length: u64, + ) -> io::Result<()> { + let data = self + .handles + .read() + .unwrap() + .get(&handle) + .filter(|hd| hd.inode == inode) + .map(Arc::clone) + .ok_or_else(ebadf)?; + + let fd = data.file.lock().as_raw_fd(); + // Safe because this doesn't modify any memory and we check the return value. + let res = unsafe { + libc::fallocate64( + fd, + mode as libc::c_int, + offset as libc::off64_t, + length as libc::off64_t, + ) + }; + if res == 0 { + Ok(()) + } else { + Err(io::Error::last_os_error()) + } + } +} |