diff options
author | Dylan Reid <dgreid@chromium.org> | 2020-03-13 13:40:40 -0700 |
---|---|---|
committer | Commit Bot <commit-bot@chromium.org> | 2020-04-05 21:32:20 +0000 |
commit | a9a1d0227713d22df6262ca99dd622281e8dc9bf (patch) | |
tree | 3f99d11aa72dc30d1b05e3e1b3d96d3391c5090b | |
parent | 252d5b3cf3fd7a48fe9d610b59e3d6da9f2c6fe9 (diff) | |
download | crosvm-a9a1d0227713d22df6262ca99dd622281e8dc9bf.tar crosvm-a9a1d0227713d22df6262ca99dd622281e8dc9bf.tar.gz crosvm-a9a1d0227713d22df6262ca99dd622281e8dc9bf.tar.bz2 crosvm-a9a1d0227713d22df6262ca99dd622281e8dc9bf.tar.lz crosvm-a9a1d0227713d22df6262ca99dd622281e8dc9bf.tar.xz crosvm-a9a1d0227713d22df6262ca99dd622281e8dc9bf.tar.zst crosvm-a9a1d0227713d22df6262ca99dd622281e8dc9bf.zip |
Add io_uring interfaces
Provide a low-level interface to operating the new io_uring interface. This is an unsafe interface with the basic operations of setting up the ring, adding to it, removing from it, and polling it. This will be followed by a safe interface layer on top of this code, then by additions to the asynchronous executor that allow for asynchronously doing multiple operations to files from one context.

Change-Id: I71f7ffb04ce8cb4da470deda9aee768ab95d3d98
Reviewed-on: https://chromium-review.googlesource.com/c/chromiumos/platform/crosvm/+/2124009
Reviewed-by: Dylan Reid <dgreid@chromium.org>
Reviewed-by: Zach Reizner <zachr@chromium.org>
Tested-by: Dylan Reid <dgreid@chromium.org>
Tested-by: kokoro <noreply+kokoro@google.com>
Commit-Queue: Dylan Reid <dgreid@chromium.org>
-rw-r--r-- | io_uring/Cargo.toml | 15 | ||||
-rw-r--r-- | io_uring/src/bindings.rs | 516 | ||||
-rw-r--r-- | io_uring/src/lib.rs | 9 | ||||
-rw-r--r-- | io_uring/src/syscalls.rs | 41 | ||||
-rw-r--r-- | io_uring/src/uring.rs | 772 |
5 files changed, 1353 insertions(+), 0 deletions(-)
diff --git a/io_uring/Cargo.toml b/io_uring/Cargo.toml new file mode 100644 index 0000000..1b633d3 --- /dev/null +++ b/io_uring/Cargo.toml @@ -0,0 +1,15 @@ +[package] +name = "io_uring" +version = "0.1.0" +authors = ["The Chromium OS Authors"] +edition = "2018" + +[dependencies] +libc = "*" +syscall_defines = { path = "../syscall_defines" } +sys_util = { path = "../sys_util" } + +[dev-dependencies] +tempfile = { path = "../tempfile" } + +[workspace] diff --git a/io_uring/src/bindings.rs b/io_uring/src/bindings.rs new file mode 100644 index 0000000..382a738 --- /dev/null +++ b/io_uring/src/bindings.rs @@ -0,0 +1,516 @@ +/* automatically generated by rust-bindgen + * + * bindgen --with-derive-default include/uapi/linux/io_uring.h + */ + +#![allow(non_upper_case_globals)] +#![allow(non_camel_case_types)] +#![allow(non_snake_case)] +#![allow(dead_code)] + +#[repr(C)] +#[derive(Default)] +pub struct __IncompleteArrayField<T>(::std::marker::PhantomData<T>); +impl<T> __IncompleteArrayField<T> { + #[inline] + pub fn new() -> Self { + __IncompleteArrayField(::std::marker::PhantomData) + } + #[inline] + pub unsafe fn as_ptr(&self) -> *const T { + ::std::mem::transmute(self) + } + #[inline] + pub unsafe fn as_mut_ptr(&mut self) -> *mut T { + ::std::mem::transmute(self) + } + #[inline] + pub unsafe fn as_slice(&self, len: usize) -> &[T] { + ::std::slice::from_raw_parts(self.as_ptr(), len) + } + #[inline] + pub unsafe fn as_mut_slice(&mut self, len: usize) -> &mut [T] { + ::std::slice::from_raw_parts_mut(self.as_mut_ptr(), len) + } +} +impl<T> ::std::fmt::Debug for __IncompleteArrayField<T> { + fn fmt(&self, fmt: &mut ::std::fmt::Formatter) -> ::std::fmt::Result { + fmt.write_str("__IncompleteArrayField") + } +} +impl<T> ::std::clone::Clone for __IncompleteArrayField<T> { + #[inline] + fn clone(&self) -> Self { + Self::new() + } +} +impl<T> ::std::marker::Copy for __IncompleteArrayField<T> {} +pub const NR_OPEN: ::std::os::raw::c_uint = 1024; +pub const NGROUPS_MAX: 
::std::os::raw::c_uint = 65536; +pub const ARG_MAX: ::std::os::raw::c_uint = 131072; +pub const LINK_MAX: ::std::os::raw::c_uint = 127; +pub const MAX_CANON: ::std::os::raw::c_uint = 255; +pub const MAX_INPUT: ::std::os::raw::c_uint = 255; +pub const NAME_MAX: ::std::os::raw::c_uint = 255; +pub const PATH_MAX: ::std::os::raw::c_uint = 4096; +pub const PIPE_BUF: ::std::os::raw::c_uint = 4096; +pub const XATTR_NAME_MAX: ::std::os::raw::c_uint = 255; +pub const XATTR_SIZE_MAX: ::std::os::raw::c_uint = 65536; +pub const XATTR_LIST_MAX: ::std::os::raw::c_uint = 65536; +pub const RTSIG_MAX: ::std::os::raw::c_uint = 32; +pub const _IOC_NRBITS: ::std::os::raw::c_uint = 8; +pub const _IOC_TYPEBITS: ::std::os::raw::c_uint = 8; +pub const _IOC_SIZEBITS: ::std::os::raw::c_uint = 14; +pub const _IOC_DIRBITS: ::std::os::raw::c_uint = 2; +pub const _IOC_NRMASK: ::std::os::raw::c_uint = 255; +pub const _IOC_TYPEMASK: ::std::os::raw::c_uint = 255; +pub const _IOC_SIZEMASK: ::std::os::raw::c_uint = 16383; +pub const _IOC_DIRMASK: ::std::os::raw::c_uint = 3; +pub const _IOC_NRSHIFT: ::std::os::raw::c_uint = 0; +pub const _IOC_TYPESHIFT: ::std::os::raw::c_uint = 8; +pub const _IOC_SIZESHIFT: ::std::os::raw::c_uint = 16; +pub const _IOC_DIRSHIFT: ::std::os::raw::c_uint = 30; +pub const _IOC_NONE: ::std::os::raw::c_uint = 0; +pub const _IOC_WRITE: ::std::os::raw::c_uint = 1; +pub const _IOC_READ: ::std::os::raw::c_uint = 2; +pub const IOC_IN: ::std::os::raw::c_uint = 1073741824; +pub const IOC_OUT: ::std::os::raw::c_uint = 2147483648; +pub const IOC_INOUT: ::std::os::raw::c_uint = 3221225472; +pub const IOCSIZE_MASK: ::std::os::raw::c_uint = 1073676288; +pub const IOCSIZE_SHIFT: ::std::os::raw::c_uint = 16; +pub const __BITS_PER_LONG: ::std::os::raw::c_uint = 64; +pub const __FD_SETSIZE: ::std::os::raw::c_uint = 1024; +pub const MS_RDONLY: ::std::os::raw::c_uint = 1; +pub const MS_NOSUID: ::std::os::raw::c_uint = 2; +pub const MS_NODEV: ::std::os::raw::c_uint = 4; +pub const MS_NOEXEC: 
::std::os::raw::c_uint = 8; +pub const MS_SYNCHRONOUS: ::std::os::raw::c_uint = 16; +pub const MS_REMOUNT: ::std::os::raw::c_uint = 32; +pub const MS_MANDLOCK: ::std::os::raw::c_uint = 64; +pub const MS_DIRSYNC: ::std::os::raw::c_uint = 128; +pub const MS_NOATIME: ::std::os::raw::c_uint = 1024; +pub const MS_NODIRATIME: ::std::os::raw::c_uint = 2048; +pub const MS_BIND: ::std::os::raw::c_uint = 4096; +pub const MS_MOVE: ::std::os::raw::c_uint = 8192; +pub const MS_REC: ::std::os::raw::c_uint = 16384; +pub const MS_VERBOSE: ::std::os::raw::c_uint = 32768; +pub const MS_SILENT: ::std::os::raw::c_uint = 32768; +pub const MS_POSIXACL: ::std::os::raw::c_uint = 65536; +pub const MS_UNBINDABLE: ::std::os::raw::c_uint = 131072; +pub const MS_PRIVATE: ::std::os::raw::c_uint = 262144; +pub const MS_SLAVE: ::std::os::raw::c_uint = 524288; +pub const MS_SHARED: ::std::os::raw::c_uint = 1048576; +pub const MS_RELATIME: ::std::os::raw::c_uint = 2097152; +pub const MS_KERNMOUNT: ::std::os::raw::c_uint = 4194304; +pub const MS_I_VERSION: ::std::os::raw::c_uint = 8388608; +pub const MS_STRICTATIME: ::std::os::raw::c_uint = 16777216; +pub const MS_LAZYTIME: ::std::os::raw::c_uint = 33554432; +pub const MS_SUBMOUNT: ::std::os::raw::c_uint = 67108864; +pub const MS_NOREMOTELOCK: ::std::os::raw::c_uint = 134217728; +pub const MS_NOSEC: ::std::os::raw::c_uint = 268435456; +pub const MS_BORN: ::std::os::raw::c_uint = 536870912; +pub const MS_ACTIVE: ::std::os::raw::c_uint = 1073741824; +pub const MS_NOUSER: ::std::os::raw::c_uint = 2147483648; +pub const MS_RMT_MASK: ::std::os::raw::c_uint = 41943121; +pub const MS_MGC_VAL: ::std::os::raw::c_uint = 3236757504; +pub const MS_MGC_MSK: ::std::os::raw::c_uint = 4294901760; +pub const OPEN_TREE_CLONE: ::std::os::raw::c_uint = 1; +pub const MOVE_MOUNT_F_SYMLINKS: ::std::os::raw::c_uint = 1; +pub const MOVE_MOUNT_F_AUTOMOUNTS: ::std::os::raw::c_uint = 2; +pub const MOVE_MOUNT_F_EMPTY_PATH: ::std::os::raw::c_uint = 4; +pub const 
MOVE_MOUNT_T_SYMLINKS: ::std::os::raw::c_uint = 16; +pub const MOVE_MOUNT_T_AUTOMOUNTS: ::std::os::raw::c_uint = 32; +pub const MOVE_MOUNT_T_EMPTY_PATH: ::std::os::raw::c_uint = 64; +pub const MOVE_MOUNT__MASK: ::std::os::raw::c_uint = 119; +pub const FSOPEN_CLOEXEC: ::std::os::raw::c_uint = 1; +pub const FSPICK_CLOEXEC: ::std::os::raw::c_uint = 1; +pub const FSPICK_SYMLINK_NOFOLLOW: ::std::os::raw::c_uint = 2; +pub const FSPICK_NO_AUTOMOUNT: ::std::os::raw::c_uint = 4; +pub const FSPICK_EMPTY_PATH: ::std::os::raw::c_uint = 8; +pub const FSMOUNT_CLOEXEC: ::std::os::raw::c_uint = 1; +pub const MOUNT_ATTR_RDONLY: ::std::os::raw::c_uint = 1; +pub const MOUNT_ATTR_NOSUID: ::std::os::raw::c_uint = 2; +pub const MOUNT_ATTR_NODEV: ::std::os::raw::c_uint = 4; +pub const MOUNT_ATTR_NOEXEC: ::std::os::raw::c_uint = 8; +pub const MOUNT_ATTR__ATIME: ::std::os::raw::c_uint = 112; +pub const MOUNT_ATTR_RELATIME: ::std::os::raw::c_uint = 0; +pub const MOUNT_ATTR_NOATIME: ::std::os::raw::c_uint = 16; +pub const MOUNT_ATTR_STRICTATIME: ::std::os::raw::c_uint = 32; +pub const MOUNT_ATTR_NODIRATIME: ::std::os::raw::c_uint = 128; +pub const INR_OPEN_CUR: ::std::os::raw::c_uint = 1024; +pub const INR_OPEN_MAX: ::std::os::raw::c_uint = 4096; +pub const BLOCK_SIZE_BITS: ::std::os::raw::c_uint = 10; +pub const BLOCK_SIZE: ::std::os::raw::c_uint = 1024; +pub const SEEK_SET: ::std::os::raw::c_uint = 0; +pub const SEEK_CUR: ::std::os::raw::c_uint = 1; +pub const SEEK_END: ::std::os::raw::c_uint = 2; +pub const SEEK_DATA: ::std::os::raw::c_uint = 3; +pub const SEEK_HOLE: ::std::os::raw::c_uint = 4; +pub const SEEK_MAX: ::std::os::raw::c_uint = 4; +pub const RENAME_NOREPLACE: ::std::os::raw::c_uint = 1; +pub const RENAME_EXCHANGE: ::std::os::raw::c_uint = 2; +pub const RENAME_WHITEOUT: ::std::os::raw::c_uint = 4; +pub const FILE_DEDUPE_RANGE_SAME: ::std::os::raw::c_uint = 0; +pub const FILE_DEDUPE_RANGE_DIFFERS: ::std::os::raw::c_uint = 1; +pub const NR_FILE: ::std::os::raw::c_uint = 8192; 
+pub const FS_XFLAG_REALTIME: ::std::os::raw::c_uint = 1; +pub const FS_XFLAG_PREALLOC: ::std::os::raw::c_uint = 2; +pub const FS_XFLAG_IMMUTABLE: ::std::os::raw::c_uint = 8; +pub const FS_XFLAG_APPEND: ::std::os::raw::c_uint = 16; +pub const FS_XFLAG_SYNC: ::std::os::raw::c_uint = 32; +pub const FS_XFLAG_NOATIME: ::std::os::raw::c_uint = 64; +pub const FS_XFLAG_NODUMP: ::std::os::raw::c_uint = 128; +pub const FS_XFLAG_RTINHERIT: ::std::os::raw::c_uint = 256; +pub const FS_XFLAG_PROJINHERIT: ::std::os::raw::c_uint = 512; +pub const FS_XFLAG_NOSYMLINKS: ::std::os::raw::c_uint = 1024; +pub const FS_XFLAG_EXTSIZE: ::std::os::raw::c_uint = 2048; +pub const FS_XFLAG_EXTSZINHERIT: ::std::os::raw::c_uint = 4096; +pub const FS_XFLAG_NODEFRAG: ::std::os::raw::c_uint = 8192; +pub const FS_XFLAG_FILESTREAM: ::std::os::raw::c_uint = 16384; +pub const FS_XFLAG_DAX: ::std::os::raw::c_uint = 32768; +pub const FS_XFLAG_COWEXTSIZE: ::std::os::raw::c_uint = 65536; +pub const FS_XFLAG_HASATTR: ::std::os::raw::c_uint = 2147483648; +pub const BMAP_IOCTL: ::std::os::raw::c_uint = 1; +pub const FSLABEL_MAX: ::std::os::raw::c_uint = 256; +pub const FS_KEY_DESCRIPTOR_SIZE: ::std::os::raw::c_uint = 8; +pub const FS_POLICY_FLAGS_PAD_4: ::std::os::raw::c_uint = 0; +pub const FS_POLICY_FLAGS_PAD_8: ::std::os::raw::c_uint = 1; +pub const FS_POLICY_FLAGS_PAD_16: ::std::os::raw::c_uint = 2; +pub const FS_POLICY_FLAGS_PAD_32: ::std::os::raw::c_uint = 3; +pub const FS_POLICY_FLAGS_PAD_MASK: ::std::os::raw::c_uint = 3; +pub const FS_POLICY_FLAG_DIRECT_KEY: ::std::os::raw::c_uint = 4; +pub const FS_POLICY_FLAGS_VALID: ::std::os::raw::c_uint = 7; +pub const FS_ENCRYPTION_MODE_INVALID: ::std::os::raw::c_uint = 0; +pub const FS_ENCRYPTION_MODE_AES_256_XTS: ::std::os::raw::c_uint = 1; +pub const FS_ENCRYPTION_MODE_AES_256_GCM: ::std::os::raw::c_uint = 2; +pub const FS_ENCRYPTION_MODE_AES_256_CBC: ::std::os::raw::c_uint = 3; +pub const FS_ENCRYPTION_MODE_AES_256_CTS: ::std::os::raw::c_uint = 4; +pub const 
FS_ENCRYPTION_MODE_AES_128_CBC: ::std::os::raw::c_uint = 5; +pub const FS_ENCRYPTION_MODE_AES_128_CTS: ::std::os::raw::c_uint = 6; +pub const FS_ENCRYPTION_MODE_SPECK128_256_XTS: ::std::os::raw::c_uint = 7; +pub const FS_ENCRYPTION_MODE_SPECK128_256_CTS: ::std::os::raw::c_uint = 8; +pub const FS_ENCRYPTION_MODE_ADIANTUM: ::std::os::raw::c_uint = 9; +pub const FS_KEY_DESC_PREFIX: &'static [u8; 9usize] = b"fscrypt:\0"; +pub const FS_KEY_DESC_PREFIX_SIZE: ::std::os::raw::c_uint = 8; +pub const FS_MAX_KEY_SIZE: ::std::os::raw::c_uint = 64; +pub const FS_SECRM_FL: ::std::os::raw::c_uint = 1; +pub const FS_UNRM_FL: ::std::os::raw::c_uint = 2; +pub const FS_COMPR_FL: ::std::os::raw::c_uint = 4; +pub const FS_SYNC_FL: ::std::os::raw::c_uint = 8; +pub const FS_IMMUTABLE_FL: ::std::os::raw::c_uint = 16; +pub const FS_APPEND_FL: ::std::os::raw::c_uint = 32; +pub const FS_NODUMP_FL: ::std::os::raw::c_uint = 64; +pub const FS_NOATIME_FL: ::std::os::raw::c_uint = 128; +pub const FS_DIRTY_FL: ::std::os::raw::c_uint = 256; +pub const FS_COMPRBLK_FL: ::std::os::raw::c_uint = 512; +pub const FS_NOCOMP_FL: ::std::os::raw::c_uint = 1024; +pub const FS_ENCRYPT_FL: ::std::os::raw::c_uint = 2048; +pub const FS_BTREE_FL: ::std::os::raw::c_uint = 4096; +pub const FS_INDEX_FL: ::std::os::raw::c_uint = 4096; +pub const FS_IMAGIC_FL: ::std::os::raw::c_uint = 8192; +pub const FS_JOURNAL_DATA_FL: ::std::os::raw::c_uint = 16384; +pub const FS_NOTAIL_FL: ::std::os::raw::c_uint = 32768; +pub const FS_DIRSYNC_FL: ::std::os::raw::c_uint = 65536; +pub const FS_TOPDIR_FL: ::std::os::raw::c_uint = 131072; +pub const FS_HUGE_FILE_FL: ::std::os::raw::c_uint = 262144; +pub const FS_EXTENT_FL: ::std::os::raw::c_uint = 524288; +pub const FS_EA_INODE_FL: ::std::os::raw::c_uint = 2097152; +pub const FS_EOFBLOCKS_FL: ::std::os::raw::c_uint = 4194304; +pub const FS_NOCOW_FL: ::std::os::raw::c_uint = 8388608; +pub const FS_INLINE_DATA_FL: ::std::os::raw::c_uint = 268435456; +pub const FS_PROJINHERIT_FL: 
::std::os::raw::c_uint = 536870912; +pub const FS_RESERVED_FL: ::std::os::raw::c_uint = 2147483648; +pub const FS_FL_USER_VISIBLE: ::std::os::raw::c_uint = 253951; +pub const FS_FL_USER_MODIFIABLE: ::std::os::raw::c_uint = 229631; +pub const SYNC_FILE_RANGE_WAIT_BEFORE: ::std::os::raw::c_uint = 1; +pub const SYNC_FILE_RANGE_WRITE: ::std::os::raw::c_uint = 2; +pub const SYNC_FILE_RANGE_WAIT_AFTER: ::std::os::raw::c_uint = 4; +pub const SYNC_FILE_RANGE_WRITE_AND_WAIT: ::std::os::raw::c_uint = 7; +pub const IORING_SETUP_IOPOLL: ::std::os::raw::c_uint = 1; +pub const IORING_SETUP_SQPOLL: ::std::os::raw::c_uint = 2; +pub const IORING_SETUP_SQ_AFF: ::std::os::raw::c_uint = 4; +pub const IORING_SETUP_CQSIZE: ::std::os::raw::c_uint = 8; +pub const IORING_SETUP_CLAMP: ::std::os::raw::c_uint = 16; +pub const IORING_SETUP_ATTACH_WQ: ::std::os::raw::c_uint = 32; +pub const IORING_FSYNC_DATASYNC: ::std::os::raw::c_uint = 1; +pub const IORING_TIMEOUT_ABS: ::std::os::raw::c_uint = 1; +pub const IORING_OFF_SQ_RING: ::std::os::raw::c_uint = 0; +pub const IORING_OFF_CQ_RING: ::std::os::raw::c_uint = 134217728; +pub const IORING_OFF_SQES: ::std::os::raw::c_uint = 268435456; +pub const IORING_SQ_NEED_WAKEUP: ::std::os::raw::c_uint = 1; +pub const IORING_ENTER_GETEVENTS: ::std::os::raw::c_uint = 1; +pub const IORING_ENTER_SQ_WAKEUP: ::std::os::raw::c_uint = 2; +pub const IORING_FEAT_SINGLE_MMAP: ::std::os::raw::c_uint = 1; +pub const IORING_FEAT_NODROP: ::std::os::raw::c_uint = 2; +pub const IORING_FEAT_SUBMIT_STABLE: ::std::os::raw::c_uint = 4; +pub const IORING_FEAT_RW_CUR_POS: ::std::os::raw::c_uint = 8; +pub const IORING_FEAT_CUR_PERSONALITY: ::std::os::raw::c_uint = 16; +pub const IORING_REGISTER_BUFFERS: ::std::os::raw::c_uint = 0; +pub const IORING_UNREGISTER_BUFFERS: ::std::os::raw::c_uint = 1; +pub const IORING_REGISTER_FILES: ::std::os::raw::c_uint = 2; +pub const IORING_UNREGISTER_FILES: ::std::os::raw::c_uint = 3; +pub const IORING_REGISTER_EVENTFD: ::std::os::raw::c_uint = 
4; +pub const IORING_UNREGISTER_EVENTFD: ::std::os::raw::c_uint = 5; +pub const IORING_REGISTER_FILES_UPDATE: ::std::os::raw::c_uint = 6; +pub const IORING_REGISTER_EVENTFD_ASYNC: ::std::os::raw::c_uint = 7; +pub const IORING_REGISTER_PROBE: ::std::os::raw::c_uint = 8; +pub const IORING_REGISTER_PERSONALITY: ::std::os::raw::c_uint = 9; +pub const IORING_UNREGISTER_PERSONALITY: ::std::os::raw::c_uint = 10; +pub const IO_URING_OP_SUPPORTED: ::std::os::raw::c_uint = 1; +#[repr(C)] +#[derive(Debug, Default, Copy, Clone)] +pub struct file_clone_range { + pub src_fd: i64, + pub src_offset: u64, + pub src_length: u64, + pub dest_offset: u64, +} +#[repr(C)] +#[derive(Debug, Default, Copy, Clone)] +pub struct fstrim_range { + pub start: u64, + pub len: u64, + pub minlen: u64, +} +#[repr(C)] +#[derive(Debug, Default, Copy, Clone)] +pub struct file_dedupe_range_info { + pub dest_fd: i64, + pub dest_offset: u64, + pub bytes_deduped: u64, + pub status: i32, + pub reserved: u32, +} +#[repr(C)] +#[derive(Debug, Default)] +pub struct file_dedupe_range { + pub src_offset: u64, + pub src_length: u64, + pub dest_count: u16, + pub reserved1: u16, + pub reserved2: u32, + pub info: __IncompleteArrayField<file_dedupe_range_info>, +} +#[repr(C)] +#[derive(Debug, Default, Copy, Clone)] +pub struct files_stat_struct { + pub nr_files: ::std::os::raw::c_ulong, + pub nr_free_files: ::std::os::raw::c_ulong, + pub max_files: ::std::os::raw::c_ulong, +} +#[repr(C)] +#[derive(Debug, Default, Copy, Clone)] +pub struct inodes_stat_t { + pub nr_inodes: ::std::os::raw::c_long, + pub nr_unused: ::std::os::raw::c_long, + pub dummy: [::std::os::raw::c_long; 5usize], +} +#[repr(C)] +#[derive(Debug, Default, Copy, Clone)] +pub struct fsxattr { + pub fsx_xflags: u32, + pub fsx_extsize: u32, + pub fsx_nextents: u32, + pub fsx_projid: u32, + pub fsx_cowextsize: u32, + pub fsx_pad: [::std::os::raw::c_uchar; 8usize], +} +#[repr(C)] +#[derive(Debug, Default, Copy, Clone)] +pub struct fscrypt_policy { + pub 
version: u8, + pub contents_encryption_mode: u8, + pub filenames_encryption_mode: u8, + pub flags: u8, + pub master_key_descriptor: [u8; 8usize], +} +#[repr(C)] +#[derive(Copy, Clone)] +pub struct fscrypt_key { + pub mode: u32, + pub raw: [u8; 64usize], + pub size: u32, +} +impl Default for fscrypt_key { + fn default() -> Self { + unsafe { ::std::mem::zeroed() } + } +} +pub type __kernel_rwf_t = ::std::os::raw::c_int; +#[repr(C)] +#[derive(Copy, Clone)] +pub struct io_uring_sqe { + pub opcode: u8, + pub flags: u8, + pub ioprio: u16, + pub fd: i32, + pub __bindgen_anon_1: io_uring_sqe__bindgen_ty_1, + pub addr: u64, + pub len: u32, + pub __bindgen_anon_2: io_uring_sqe__bindgen_ty_2, + pub user_data: u64, + pub __bindgen_anon_3: io_uring_sqe__bindgen_ty_3, +} +#[repr(C)] +#[derive(Copy, Clone)] +pub union io_uring_sqe__bindgen_ty_1 { + pub off: u64, + pub addr2: u64, + _bindgen_union_align: u64, +} +impl Default for io_uring_sqe__bindgen_ty_1 { + fn default() -> Self { + unsafe { ::std::mem::zeroed() } + } +} +#[repr(C)] +#[derive(Copy, Clone)] +pub union io_uring_sqe__bindgen_ty_2 { + pub rw_flags: __kernel_rwf_t, + pub fsync_flags: u32, + pub poll_events: u16, + pub sync_range_flags: u32, + pub msg_flags: u32, + pub timeout_flags: u32, + pub accept_flags: u32, + pub cancel_flags: u32, + pub open_flags: u32, + pub statx_flags: u32, + pub fadvise_advice: u32, + _bindgen_union_align: u32, +} +impl Default for io_uring_sqe__bindgen_ty_2 { + fn default() -> Self { + unsafe { ::std::mem::zeroed() } + } +} +#[repr(C)] +#[derive(Copy, Clone)] +pub union io_uring_sqe__bindgen_ty_3 { + pub __bindgen_anon_1: io_uring_sqe__bindgen_ty_3__bindgen_ty_1, + pub __pad2: [u64; 3usize], + _bindgen_union_align: [u64; 3usize], +} +#[repr(C)] +#[derive(Debug, Default, Copy, Clone)] +pub struct io_uring_sqe__bindgen_ty_3__bindgen_ty_1 { + pub buf_index: u16, + pub personality: u16, +} +impl Default for io_uring_sqe__bindgen_ty_3 { + fn default() -> Self { + unsafe { ::std::mem::zeroed() } 
+ } +} +impl Default for io_uring_sqe { + fn default() -> Self { + unsafe { ::std::mem::zeroed() } + } +} +pub const IOSQE_FIXED_FILE_BIT: _bindgen_ty_1 = 0; +pub const IOSQE_IO_DRAIN_BIT: _bindgen_ty_1 = 1; +pub const IOSQE_IO_LINK_BIT: _bindgen_ty_1 = 2; +pub const IOSQE_IO_HARDLINK_BIT: _bindgen_ty_1 = 3; +pub const IOSQE_ASYNC_BIT: _bindgen_ty_1 = 4; +pub type _bindgen_ty_1 = ::std::os::raw::c_uint; +pub const IORING_OP_NOP: _bindgen_ty_2 = 0; +pub const IORING_OP_READV: _bindgen_ty_2 = 1; +pub const IORING_OP_WRITEV: _bindgen_ty_2 = 2; +pub const IORING_OP_FSYNC: _bindgen_ty_2 = 3; +pub const IORING_OP_READ_FIXED: _bindgen_ty_2 = 4; +pub const IORING_OP_WRITE_FIXED: _bindgen_ty_2 = 5; +pub const IORING_OP_POLL_ADD: _bindgen_ty_2 = 6; +pub const IORING_OP_POLL_REMOVE: _bindgen_ty_2 = 7; +pub const IORING_OP_SYNC_FILE_RANGE: _bindgen_ty_2 = 8; +pub const IORING_OP_SENDMSG: _bindgen_ty_2 = 9; +pub const IORING_OP_RECVMSG: _bindgen_ty_2 = 10; +pub const IORING_OP_TIMEOUT: _bindgen_ty_2 = 11; +pub const IORING_OP_TIMEOUT_REMOVE: _bindgen_ty_2 = 12; +pub const IORING_OP_ACCEPT: _bindgen_ty_2 = 13; +pub const IORING_OP_ASYNC_CANCEL: _bindgen_ty_2 = 14; +pub const IORING_OP_LINK_TIMEOUT: _bindgen_ty_2 = 15; +pub const IORING_OP_CONNECT: _bindgen_ty_2 = 16; +pub const IORING_OP_FALLOCATE: _bindgen_ty_2 = 17; +pub const IORING_OP_OPENAT: _bindgen_ty_2 = 18; +pub const IORING_OP_CLOSE: _bindgen_ty_2 = 19; +pub const IORING_OP_FILES_UPDATE: _bindgen_ty_2 = 20; +pub const IORING_OP_STATX: _bindgen_ty_2 = 21; +pub const IORING_OP_READ: _bindgen_ty_2 = 22; +pub const IORING_OP_WRITE: _bindgen_ty_2 = 23; +pub const IORING_OP_FADVISE: _bindgen_ty_2 = 24; +pub const IORING_OP_MADVISE: _bindgen_ty_2 = 25; +pub const IORING_OP_SEND: _bindgen_ty_2 = 26; +pub const IORING_OP_RECV: _bindgen_ty_2 = 27; +pub const IORING_OP_OPENAT2: _bindgen_ty_2 = 28; +pub const IORING_OP_EPOLL_CTL: _bindgen_ty_2 = 29; +pub const IORING_OP_LAST: _bindgen_ty_2 = 30; +pub type _bindgen_ty_2 = 
::std::os::raw::c_uint; +#[repr(C)] +#[derive(Debug, Default, Copy, Clone)] +pub struct io_uring_cqe { + pub user_data: u64, + pub res: i32, + pub flags: u32, +} +#[repr(C)] +#[derive(Debug, Default, Copy, Clone)] +pub struct io_sqring_offsets { + pub head: u32, + pub tail: u32, + pub ring_mask: u32, + pub ring_entries: u32, + pub flags: u32, + pub dropped: u32, + pub array: u32, + pub resv1: u32, + pub resv2: u64, +} +#[repr(C)] +#[derive(Debug, Default, Copy, Clone)] +pub struct io_cqring_offsets { + pub head: u32, + pub tail: u32, + pub ring_mask: u32, + pub ring_entries: u32, + pub overflow: u32, + pub cqes: u32, + pub resv: [u64; 2usize], +} +#[repr(C)] +#[derive(Debug, Default, Copy, Clone)] +pub struct io_uring_params { + pub sq_entries: u32, + pub cq_entries: u32, + pub flags: u32, + pub sq_thread_cpu: u32, + pub sq_thread_idle: u32, + pub features: u32, + pub wq_fd: u32, + pub resv: [u32; 3usize], + pub sq_off: io_sqring_offsets, + pub cq_off: io_cqring_offsets, +} +#[repr(C)] +#[derive(Debug, Default, Copy, Clone)] +pub struct io_uring_files_update { + pub offset: u32, + pub resv: u32, + pub fds: u64, +} +#[repr(C)] +#[derive(Debug, Default, Copy, Clone)] +pub struct io_uring_probe_op { + pub op: u8, + pub resv: u8, + pub flags: u16, + pub resv2: u32, +} +#[repr(C)] +#[derive(Debug, Default)] +pub struct io_uring_probe { + pub last_op: u8, + pub ops_len: u8, + pub resv: u16, + pub resv2: [u32; 3usize], + pub ops: __IncompleteArrayField<io_uring_probe_op>, +} diff --git a/io_uring/src/lib.rs b/io_uring/src/lib.rs new file mode 100644 index 0000000..071c503 --- /dev/null +++ b/io_uring/src/lib.rs @@ -0,0 +1,9 @@ +// Copyright 2020 The Chromium OS Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. 
+ +mod bindings; +mod syscalls; +mod uring; + +pub use uring::*; diff --git a/io_uring/src/syscalls.rs b/io_uring/src/syscalls.rs new file mode 100644 index 0000000..e52892b --- /dev/null +++ b/io_uring/src/syscalls.rs @@ -0,0 +1,41 @@ +// Copyright 2020 The Chromium OS Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +use std::os::unix::io::RawFd; +use std::ptr::null_mut; + +use libc::{c_int, c_long, c_void}; +use syscall_defines::linux::LinuxSyscall::*; + +use crate::bindings::*; + +/// Returns the system error as the result; +pub type Result<T> = std::result::Result<T, c_int>; + +pub unsafe fn io_uring_setup(num_entries: usize, params: &io_uring_params) -> Result<RawFd> { + let ret = libc::syscall( + SYS_io_uring_setup as c_long, + num_entries as c_int, + params as *const _, + ); + if ret < 0 { + return Err(*libc::__errno_location()); + } + Ok(ret as RawFd) +} + +pub unsafe fn io_uring_enter(fd: RawFd, to_submit: u64, to_wait: u64, flags: u32) -> Result<()> { + let ret = libc::syscall( + SYS_io_uring_enter as c_long, + fd, + to_submit as c_int, + to_wait as c_int, + flags as c_int, + null_mut::<*mut c_void>(), + ); + if ret < 0 { + return Err(*libc::__errno_location()); + } + Ok(()) +} diff --git a/io_uring/src/uring.rs b/io_uring/src/uring.rs new file mode 100644 index 0000000..0a51df1 --- /dev/null +++ b/io_uring/src/uring.rs @@ -0,0 +1,772 @@ +// Copyright 2020 The Chromium OS Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +use std::fmt; +use std::fs::File; +use std::os::unix::io::{AsRawFd, FromRawFd, RawFd}; +use std::ptr::null_mut; +use std::sync::atomic::{AtomicU32, Ordering}; + +use sys_util::{MemoryMapping, WatchingEvents}; + +use crate::bindings::*; +use crate::syscalls::*; + +/// Holds per-operation, user specified data. The usage is up to the caller. 
The most common use is +/// for callers to identify each request. +pub type UserData = u64; + +#[derive(Debug)] +pub enum Error { + /// The call to `io_uring_enter` failed with the given errno. + RingEnter(libc::c_int), + /// The call to `io_uring_setup` failed with the given errno. + Setup(libc::c_int), + /// Failed to map the completion ring. + MappingCompleteRing(sys_util::MmapError), + /// Failed to map the submit ring. + MappingSubmitRing(sys_util::MmapError), + /// Failed to map submit entries. + MappingSubmitEntries(sys_util::MmapError), + /// Too many ops are already queued. + NoSpace, +} +pub type Result<T> = std::result::Result<T, Error>; + +impl fmt::Display for Error { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + use self::Error::*; + + match self { + RingEnter(e) => write!(f, "Failed to enter io uring {}", e), + Setup(e) => write!(f, "Failed to setup io uring {}", e), + MappingCompleteRing(e) => write!(f, "Failed to mmap completion ring {}", e), + MappingSubmitRing(e) => write!(f, "Failed to mmap submit ring {}", e), + MappingSubmitEntries(e) => write!(f, "Failed to mmap submit entries {}", e), + NoSpace => write!( + f, + "No space for more ring entries, try increasing the size passed to `new`", + ), + } + } +} + +/// Basic statistics about the operations that have been submitted to the uring. +#[derive(Default)] +pub struct URingStats { + total_enter_calls: u64, // Number of times the uring has been entered. + total_ops: u64, // Total ops submitted to io_uring. + total_complete: u64, // Total ops completed by io_uring. +} + +/// Unsafe wrapper for the kernel's io_uring interface. Allows for queueing multiple I/O operations +/// to the kernel and asynchronously handling the completion of these operations. +/// Use the various `add_*` functions to configure operations, then call `wait` to start +/// the operations and get any completed results. 
Each op is given a u64 user_data argument that is +/// used to identify the result when returned in the iterator provided by `wait`. +/// +/// # Example polling an FD for readable status. +/// +/// ``` +/// # use std::fs::File; +/// # use std::os::unix::io::AsRawFd; +/// # use std::path::Path; +/// # use sys_util::WatchingEvents; +/// # use io_uring::URingContext; +/// let f = File::open(Path::new("/dev/zero")).unwrap(); +/// let mut uring = URingContext::new(16).unwrap(); +/// uring +/// .add_poll_fd(f.as_raw_fd(), WatchingEvents::empty().set_read(), 454) +/// .unwrap(); +/// let (user_data, res) = uring.wait().unwrap().next().unwrap(); +/// assert_eq!(user_data, 454 as UserData); +/// assert_eq!(res.unwrap(), 1 as i32); +/// +/// ``` +pub struct URingContext { + ring_file: File, // Holds the io_uring context FD returned from io_uring_setup. + submit_ring: SubmitQueueState, + submit_queue_entries: SubmitQueueEntries, + complete_ring: CompleteQueueState, + io_vecs: Vec<libc::iovec>, + in_flight: usize, // The number of pending operations. + added: usize, // The number of ops added since the last call to `io_uring_enter`. + stats: URingStats, +} + +impl URingContext { + /// Creates a `URingContext` where the underlying uring has a space for `num_entries` + /// simultaneous operations. + pub fn new(num_entries: usize) -> Result<URingContext> { + let ring_params = io_uring_params::default(); + // The below unsafe block isolates the creation of the URingContext. Each step on it's own + // is unsafe. Using the uring FD for the mapping and the offsets returned by the kernel for + // base addresses maintains safety guarantees assuming the kernel API guarantees are + // trusted. + unsafe { + // Safe because the kernel is trusted to only modify params and `File` is created with + // an FD that it takes complete ownership of. 
+ let fd = io_uring_setup(num_entries, &ring_params).map_err(Error::Setup)?; + let ring_file = File::from_raw_fd(fd); + + // Mmap the submit and completion queues. + // Safe because we trust the kernel to set valid sizes in `io_uring_setup` and any error + // is checked. + let submit_ring = SubmitQueueState::new( + MemoryMapping::from_fd_offset_populate( + &ring_file, + ring_params.sq_off.array as usize + + ring_params.sq_entries as usize * std::mem::size_of::<u32>(), + u64::from(IORING_OFF_SQ_RING), + ) + .map_err(Error::MappingSubmitRing)?, + &ring_params, + ); + + let submit_queue_entries = SubmitQueueEntries { + mmap: MemoryMapping::from_fd_offset_populate( + &ring_file, + ring_params.sq_entries as usize * std::mem::size_of::<io_uring_sqe>(), + u64::from(IORING_OFF_SQES), + ) + .map_err(Error::MappingSubmitEntries)?, + len: ring_params.sq_entries as usize, + }; + + let complete_ring = CompleteQueueState::new( + MemoryMapping::from_fd_offset_populate( + &ring_file, + ring_params.cq_off.cqes as usize + + ring_params.cq_entries as usize * std::mem::size_of::<io_uring_cqe>(), + u64::from(IORING_OFF_CQ_RING), + ) + .map_err(Error::MappingCompleteRing)?, + &ring_params, + ); + + Ok(URingContext { + ring_file, + submit_ring, + submit_queue_entries, + complete_ring, + io_vecs: vec![ + libc::iovec { + iov_base: null_mut(), + iov_len: 0 + }; + num_entries + ], + added: 0, + in_flight: 0, + stats: Default::default(), + }) + } + } + + // Call `f` with the next available sqe or return an error if none are available. + // After `f` returns, the sqe is appended to the kernel's queue. + fn prep_next_sqe<F>(&mut self, mut f: F) -> Result<()> + where + F: FnMut(&mut io_uring_sqe, &mut libc::iovec), + { + // Find the next free submission entry in the submit ring and fill it with an iovec. + // The below raw pointer derefs are safe because the memory the pointers use lives as long + // as the mmap in self. 
+ let tail = self.submit_ring.pointers.tail(Ordering::Relaxed); + let next_tail = tail.wrapping_add(1); + if next_tail == self.submit_ring.pointers.head(Ordering::Acquire) { + return Err(Error::NoSpace); + } + // `tail` is the next sqe to use. + let index = (tail & self.submit_ring.ring_mask) as usize; + let sqe = self.submit_queue_entries.get_mut(index).unwrap(); + + f(sqe, &mut self.io_vecs[index]); + + // Tells the kernel to use the new index when processing the entry at that index. + self.submit_ring.set_array_entry(index, index as u32); + // Ensure the above writes to sqe are seen before the tail is updated. + // set_tail uses Release ordering when storing to the ring. + self.submit_ring.pointers.set_tail(next_tail); + + self.added += 1; + + Ok(()) + } + + unsafe fn add_rw_op( + &mut self, + ptr: *const u8, + len: usize, + fd: RawFd, + offset: u64, + user_data: UserData, + op: u8, + ) -> Result<()> { + self.prep_next_sqe(|sqe, iovec| { + iovec.iov_base = ptr as *const libc::c_void as *mut _; + iovec.iov_len = len; + sqe.opcode = op; + sqe.addr = iovec as *const _ as *const libc::c_void as u64; + sqe.len = 1; + sqe.__bindgen_anon_1.off = offset; + sqe.__bindgen_anon_3.__bindgen_anon_1.buf_index = 0; + sqe.ioprio = 0; + sqe.user_data = user_data; + sqe.flags = 0; + sqe.fd = fd; + })?; + + Ok(()) + } + + /// Asynchronously writes to `fd` from the address given in `ptr`. + /// # Safety + /// `add_write` will write up to `len` bytes of data from the address given by `ptr`. This is + /// only safe if the caller guarantees that the memory lives until the transaction is complete + /// and that completion has been returned from the `wait` function. In addition there must not + /// be other references to the data pointed to by `ptr` until the operation completes. Ensure + /// that the fd remains open until the op completes as well. 
    pub unsafe fn add_write(
        &mut self,
        ptr: *const u8,
        len: usize,
        fd: RawFd,
        offset: u64,
        user_data: UserData,
    ) -> Result<()> {
        self.add_rw_op(ptr, len, fd, offset, user_data, IORING_OP_WRITEV as u8)
    }

    /// Asynchronously reads from `fd` to the address given in `ptr`.
    /// # Safety
    /// `add_read` will write up to `len` bytes of data to the address given by `ptr`. This is only
    /// safe if the caller guarantees there are no other references to that memory and that the
    /// memory lives until the transaction is complete and that completion has been returned from
    /// the `wait` function. In addition there must not be any mutable references to the data
    /// pointed to by `ptr` until the operation completes. Ensure that the fd remains open until
    /// the op completes as well.
    pub unsafe fn add_read(
        &mut self,
        ptr: *mut u8,
        len: usize,
        fd: RawFd,
        offset: u64,
        user_data: UserData,
    ) -> Result<()> {
        self.add_rw_op(ptr, len, fd, offset, user_data, IORING_OP_READV as u8)
    }

    /// Syncs all completed operations, the ordering with in-flight async ops is not
    /// defined.
    pub fn add_fsync(&mut self, fd: RawFd, user_data: UserData) -> Result<()> {
        self.prep_next_sqe(|sqe, _iovec| {
            sqe.opcode = IORING_OP_FSYNC as u8;
            sqe.fd = fd;
            sqe.user_data = user_data;

            // Zero the fields fsync does not use so stale data from a reused sqe slot is not
            // interpreted by the kernel.
            sqe.addr = 0;
            sqe.len = 0;
            sqe.__bindgen_anon_1.off = 0;
            sqe.__bindgen_anon_3.__bindgen_anon_1.buf_index = 0;
            sqe.__bindgen_anon_2.rw_flags = 0;
            sqe.ioprio = 0;
            sqe.flags = 0;
        })
    }

    /// See the usage of `fallocate`, this asynchronously performs the same operations.
    pub fn add_fallocate(
        &mut self,
        fd: RawFd,
        offset: u64,
        len: usize,
        mode: u64,
        user_data: UserData,
    ) -> Result<()> {
        // Note that len for fallocate is passed in the addr field of the sqe and the mode uses
        // the len field.
        self.prep_next_sqe(|sqe, _iovec| {
            sqe.opcode = IORING_OP_FALLOCATE as u8;

            sqe.fd = fd;
            sqe.addr = len as u64;
            sqe.len = mode as u32;
            sqe.__bindgen_anon_1.off = offset;
            sqe.user_data = user_data;

            // Zero the unused fields (see `add_fsync`).
            sqe.__bindgen_anon_3.__bindgen_anon_1.buf_index = 0;
            sqe.__bindgen_anon_2.rw_flags = 0;
            sqe.ioprio = 0;
            sqe.flags = 0;
        })
    }

    /// Adds an FD to be polled based on the given flags.
    /// The user must keep the FD open until the operation completion is returned from
    /// `wait`.
    /// Note that io_uring is always a one shot poll. After the fd is returned, it must be re-added
    /// to get future events.
    pub fn add_poll_fd(
        &mut self,
        fd: RawFd,
        events: WatchingEvents,
        user_data: UserData,
    ) -> Result<()> {
        self.prep_next_sqe(|sqe, _iovec| {
            sqe.opcode = IORING_OP_POLL_ADD as u8;
            sqe.fd = fd;
            sqe.user_data = user_data;
            sqe.__bindgen_anon_2.poll_events = events.get_raw() as u16;

            // Zero the unused fields (see `add_fsync`).
            sqe.addr = 0;
            sqe.len = 0;
            sqe.__bindgen_anon_1.off = 0;
            sqe.__bindgen_anon_3.__bindgen_anon_1.buf_index = 0;
            sqe.ioprio = 0;
            sqe.flags = 0;
        })
    }

    /// Removes an FD that was previously added with `add_poll_fd`.
    // NOTE(review): per the io_uring ABI (see liburing's io_uring_prep_poll_remove), the kernel
    // matches the poll to cancel by the *original* submission's user_data passed in `sqe.addr`;
    // `fd` and `poll_events` are not consulted for POLL_REMOVE. As written this leaves
    // `sqe.addr = 0` and fills `fd`/`poll_events` instead — verify this actually cancels the
    // intended poll on the target kernels before relying on it.
    pub fn remove_poll_fd(
        &mut self,
        fd: RawFd,
        events: WatchingEvents,
        user_data: UserData,
    ) -> Result<()> {
        self.prep_next_sqe(|sqe, _iovec| {
            sqe.opcode = IORING_OP_POLL_REMOVE as u8;
            sqe.fd = fd;
            sqe.user_data = user_data;
            sqe.__bindgen_anon_2.poll_events = events.get_raw() as u16;

            // Zero the unused fields (see `add_fsync`).
            sqe.addr = 0;
            sqe.len = 0;
            sqe.__bindgen_anon_1.off = 0;
            sqe.__bindgen_anon_3.__bindgen_anon_1.buf_index = 0;
            sqe.ioprio = 0;
            sqe.flags = 0;
        })
    }

    /// Sends operations added with the `add_*` functions to the kernel.
    pub fn submit(&mut self) -> Result<()> {
        // Account for the newly added ops before entering the kernel; `in_flight` and the stats
        // are only bookkeeping and tolerate wrap via `wrapping_add`.
        self.in_flight += self.added;
        self.stats.total_ops = self.stats.total_ops.wrapping_add(self.added as u64);
        if self.added > 0 {
            self.stats.total_enter_calls = self.stats.total_enter_calls.wrapping_add(1);
            unsafe {
                // Safe because the only memory modified is in the completion queue.
                // min_complete is 1, but without IORING_ENTER_GETEVENTS this call does not
                // block for completions — it only submits.
                io_uring_enter(self.ring_file.as_raw_fd(), self.added as u64, 1, 0)
                    .map_err(Error::RingEnter)?;
            }
        }
        self.added = 0;

        Ok(())
    }

    /// Sends operations added with the `add_*` functions to the kernel and return an iterator to any
    /// completed operations. `wait` blocks until at least one completion is ready. If called
    /// without any new events added, this simply waits for any existing events to complete and
    /// returns as soon as one or more is ready.
    pub fn wait<'a>(
        &'a mut self,
    ) -> Result<impl Iterator<Item = (UserData, std::io::Result<i32>)> + 'a> {
        // Drain the count of completions consumed since the last call so `in_flight` stays
        // accurate.
        // NOTE(review): assumes completions never exceed the tracked in-flight count; a mismatch
        // would underflow this usize subtraction — confirm against callers.
        let completed = self.complete_ring.num_completed();
        self.stats.total_complete = self.stats.total_complete.wrapping_add(completed as u64);
        self.in_flight -= completed;
        self.in_flight += self.added;
        self.stats.total_ops = self.stats.total_ops.wrapping_add(self.added as u64);
        if self.in_flight > 0 {
            unsafe {
                self.stats.total_enter_calls = self.stats.total_enter_calls.wrapping_add(1);
                // Safe because the only memory modified is in the completion queue.
                // Submits any newly added sqes and, because of IORING_ENTER_GETEVENTS, blocks
                // until at least one completion is available.
                io_uring_enter(
                    self.ring_file.as_raw_fd(),
                    self.added as u64,
                    1,
                    IORING_ENTER_GETEVENTS,
                )
                .map_err(Error::RingEnter)?;
            }
        }
        self.added = 0;

        // The CompletionQueue will iterate all completed ops.
        Ok(&mut self.complete_ring)
    }
}

impl AsRawFd for URingContext {
    fn as_raw_fd(&self) -> RawFd {
        self.ring_file.as_raw_fd()
    }
}

// The mmap of the kernel's sqe array; `len` is the number of entries the kernel allocated.
struct SubmitQueueEntries {
    mmap: MemoryMapping,
    len: usize,
}

impl SubmitQueueEntries {
    // Returns a mutable reference to the sqe at `index`, or None if out of range.
    fn get_mut(&mut self, index: usize) -> Option<&mut io_uring_sqe> {
        if index >= self.len {
            return None;
        }
        unsafe {
            // Safe because the mut borrow of self restricts to one mutable reference at a time and
            // we trust that the kernel has returned enough memory in io_uring_setup and mmap.
            Some(&mut *(self.mmap.as_ptr() as *mut io_uring_sqe).add(index))
        }
    }
}

// State of the submit ring's bookkeeping region: head/tail pointers and the index array that
// tells the kernel which sqe slots to process.
struct SubmitQueueState {
    _mmap: MemoryMapping,
    pointers: QueuePointers,
    ring_mask: u32,
    array: *mut u32,
}

impl SubmitQueueState {
    // # Safety
    // Safe iff `mmap` is created by mapping from a uring FD at the SQ_RING offset and params is
    // the params struct passed to io_uring_setup.
    unsafe fn new(mmap: MemoryMapping, params: &io_uring_params) -> SubmitQueueState {
        let ptr = mmap.as_ptr();
        // Transmutes are safe because a u32 is atomic on all supported architectures and the
        // pointer will live until after self is dropped because the mmap is owned.
        let head = ptr.add(params.sq_off.head as usize) as *const AtomicU32;
        let tail = ptr.add(params.sq_off.tail as usize) as *const AtomicU32;
        // This offset is guaranteed to be within the mmap so unwrap the result.
        let ring_mask = mmap.read_obj(params.sq_off.ring_mask as usize).unwrap();
        let array = ptr.add(params.sq_off.array as usize) as *mut u32;
        SubmitQueueState {
            _mmap: mmap,
            pointers: QueuePointers { head, tail },
            ring_mask,
            array,
        }
    }

    // Sets the kernel's array entry at the given `index` to `value`.
    fn set_array_entry(&self, index: usize, value: u32) {
        // Safe because self being constructed from the correct mmap guarantees that the memory is
        // valid to be written.
+ unsafe { + std::ptr::write_volatile(self.array.add(index), value as u32); + } + } +} + +struct CompleteQueueState { + mmap: MemoryMapping, + pointers: QueuePointers, + ring_mask: u32, + cqes_offset: u32, + completed: usize, +} + +impl CompleteQueueState { + /// # Safety + /// Safe iff `mmap` is created by mapping from a uring FD at the CQ_RING offset and params is + /// the params struct passed to io_uring_setup. + unsafe fn new(mmap: MemoryMapping, params: &io_uring_params) -> CompleteQueueState { + let ptr = mmap.as_ptr(); + let head = ptr.add(params.cq_off.head as usize) as *const AtomicU32; + let tail = ptr.add(params.cq_off.tail as usize) as *const AtomicU32; + let ring_mask = mmap.read_obj(params.cq_off.ring_mask as usize).unwrap(); + CompleteQueueState { + mmap, + pointers: QueuePointers { head, tail }, + ring_mask, + cqes_offset: params.cq_off.cqes, + completed: 0, + } + } + + fn get_cqe(&self, head: u32) -> &io_uring_cqe { + unsafe { + // Safe because we trust that the kernel has returned enough memory in io_uring_setup + // and mmap and index is checked within range by the ring_mask. + let cqes = (self.mmap.as_ptr() as *const u8).add(self.cqes_offset as usize) + as *const io_uring_cqe; + + let index = head & self.ring_mask; + + &*cqes.add(index as usize) + } + } + + fn num_completed(&mut self) -> usize { + std::mem::replace(&mut self.completed, 0) + } +} + +// Return the completed ops with their result. +impl Iterator for CompleteQueueState { + type Item = (UserData, std::io::Result<i32>); + + fn next(&mut self) -> Option<Self::Item> { + // Safe because the pointers to the atomics are valid and the cqe must be in range + // because the kernel provided mask is applied to the index. + let head = self.pointers.head(Ordering::Relaxed); + + // Synchronize the read of tail after the read of head. 
+ if head == self.pointers.tail(Ordering::Acquire) { + return None; + } + + self.completed += 1; + + let cqe = self.get_cqe(head); + let user_data = cqe.user_data; + let res = cqe.res; + + // Store the new head and ensure the reads above complete before the kernel sees the + // update to head, `set_head` uses `Release` ordering + let new_head = head.wrapping_add(1); + self.pointers.set_head(new_head); + + let io_res = match res { + r if r < 0 => Err(std::io::Error::from_raw_os_error(r)), + r => Ok(r), + }; + Some((user_data, io_res)) + } +} + +struct QueuePointers { + head: *const AtomicU32, + tail: *const AtomicU32, +} + +impl QueuePointers { + // Loads the tail pointer atomically with the given ordering. + fn tail(&self, ordering: Ordering) -> u32 { + // Safe because self being constructed from the correct mmap guaratees that the memory is + // valid to read. + unsafe { (*self.tail).load(ordering) } + } + + // Stores the new value of the tail in the submit queue. This allows the kernel to start + // processing entries that have been added up until the given tail pointer. + // Always stores with release ordering as that is the only valid way to use the pointer. + fn set_tail(&self, next_tail: u32) { + // Safe because self being constructed from the correct mmap guaratees that the memory is + // valid to read and it's used as an atomic to cover mutability concerns. + unsafe { (*self.tail).store(next_tail, Ordering::Release) } + } + + // Loads the head pointer atomically with the given ordering. + fn head(&self, ordering: Ordering) -> u32 { + // Safe because self being constructed from the correct mmap guaratees that the memory is + // valid to read. + unsafe { (*self.head).load(ordering) } + } + + // Stores the new value of the head in the submit queue. This allows the kernel to start + // processing entries that have been added up until the given head pointer. + // Always stores with release ordering as that is the only valid way to use the pointer. 
    fn set_head(&self, next_head: u32) {
        // Safe because self being constructed from the correct mmap guarantees that the memory is
        // valid to read and it's used as an atomic to cover mutability concerns.
        unsafe { (*self.head).store(next_head, Ordering::Release) }
    }
}

#[cfg(test)]
mod tests {
    use std::fs::OpenOptions;
    use std::io::Write;
    use std::path::{Path, PathBuf};
    use std::time::Duration;

    use sys_util::PollContext;
    use tempfile::TempDir;

    use super::*;

    // Joins `name` onto `path`, returning the combined path.
    fn append_file_name(path: &Path, name: &str) -> PathBuf {
        let mut joined = path.to_path_buf();
        joined.push(name);
        joined
    }

    #[test]
    fn read_one_block_at_a_time() {
        let tempdir = TempDir::new().unwrap();
        let file_path = append_file_name(tempdir.path(), "test");
        let queue_size = 128;

        let mut uring = URingContext::new(queue_size).unwrap();
        let mut buf = [0u8; 0x1000];
        let f = OpenOptions::new()
            .read(true)
            .write(true)
            .create(true)
            .truncate(true)
            .open(&file_path)
            .unwrap();
        f.set_len(0x1000 * 2).unwrap();

        // Submit and complete one read at a time, alternating between the two pages of the
        // file. More iterations than the queue size exercises index reuse after wrap.
        for i in 0..queue_size * 2 {
            unsafe {
                // Safe because the `wait` call waits until the kernel is done with `buf`.
                let index = i as u64;
                uring
                    .add_read(
                        buf.as_mut_ptr(),
                        buf.len(),
                        f.as_raw_fd(),
                        (index % 2) * 0x1000,
                        index,
                    )
                    .unwrap();
                // `wait` also submits the op; no explicit `submit` call is needed.
                let (user_data, res) = uring.wait().unwrap().next().unwrap();
                assert_eq!(user_data, i as UserData);
                assert_eq!(res.unwrap(), buf.len() as i32);
            }
        }
    }

    #[test]
    fn write_one_block() {
        let tempdir = TempDir::new().unwrap();
        let file_path = append_file_name(tempdir.path(), "test");

        let mut uring = URingContext::new(16).unwrap();
        let mut buf = [0u8; 4096];
        let mut f = OpenOptions::new()
            .read(true)
            .write(true)
            .create(true)
            .truncate(true)
            .open(&file_path)
            .unwrap();
        // NOTE(review): `write` may perform a short write; `write_all` would be more robust.
        f.write(&buf).unwrap();
        f.write(&buf).unwrap();

        unsafe {
            // Safe because the `wait` call waits until the kernel is done with `buf` (the kernel
            // reads from it for a write op).
            uring
                .add_write(buf.as_mut_ptr(), buf.len(), f.as_raw_fd(), 0, 55)
                .unwrap();
            let (user_data, res) = uring.wait().unwrap().next().unwrap();
            assert_eq!(user_data, 55 as UserData);
            assert_eq!(res.unwrap(), buf.len() as i32);
        }
    }

    #[test]
    fn write_one_submit_poll() {
        let tempdir = TempDir::new().unwrap();
        let file_path = append_file_name(tempdir.path(), "test");

        let mut uring = URingContext::new(16).unwrap();
        let mut buf = [0u8; 4096];
        let mut f = OpenOptions::new()
            .read(true)
            .write(true)
            .create(true)
            .truncate(true)
            .open(&file_path)
            .unwrap();
        // NOTE(review): `write` may perform a short write; `write_all` would be more robust.
        f.write(&buf).unwrap();
        f.write(&buf).unwrap();

        let ctx: PollContext<u64> = PollContext::build_with(&[(&uring, 1)]).unwrap();
        {
            // Test that the uring context isn't readable before any events are complete.
            let events = ctx.wait_timeout(Duration::from_millis(1)).unwrap();
            assert!(events.iter_readable().next().is_none());
        }

        unsafe {
            // Safe because the `wait` call waits until the kernel is done with `buf`.
            uring
                .add_write(buf.as_mut_ptr(), buf.len(), f.as_raw_fd(), 0, 55)
                .unwrap();
            uring.submit().unwrap();
            // Poll for completion with epoll.
            let events = ctx.wait().unwrap();
            let event = events.iter_readable().next().unwrap();
            assert_eq!(event.token(), 1);
            let (user_data, res) = uring.wait().unwrap().next().unwrap();
            assert_eq!(user_data, 55 as UserData);
            assert_eq!(res.unwrap(), buf.len() as i32);
        }
    }
    #[test]
    fn fallocate_fsync() {
        let tempdir = TempDir::new().unwrap();
        let file_path = append_file_name(tempdir.path(), "test");

        // Create the file with one page of data, then drop the handle.
        {
            let buf = [0u8; 4096];
            let mut f = OpenOptions::new()
                .read(true)
                .write(true)
                .create(true)
                .truncate(true)
                .open(&file_path)
                .unwrap();
            f.write(&buf).unwrap();
        }

        let init_size = std::fs::metadata(&file_path).unwrap().len() as usize;
        let set_size = init_size + 1024 * 1024 * 50;
        let f = OpenOptions::new()
            .read(true)
            .write(true)
            .create(true)
            .open(&file_path)
            .unwrap();

        let mut uring = URingContext::new(16).unwrap();
        // mode 0 extends the file allocation (and size) to `set_size`.
        uring
            .add_fallocate(f.as_raw_fd(), 0, set_size, 0, 66)
            .unwrap();
        let (user_data, res) = uring.wait().unwrap().next().unwrap();
        assert_eq!(user_data, 66 as UserData);
        assert_eq!(res.unwrap(), 0 as i32);

        uring.add_fsync(f.as_raw_fd(), 67).unwrap();
        let (user_data, res) = uring.wait().unwrap().next().unwrap();
        assert_eq!(user_data, 67 as UserData);
        assert_eq!(res.unwrap(), 0 as i32);

        // Punch a hole over the newly allocated range; KEEP_SIZE leaves the file length at
        // `set_size`.
        uring
            .add_fallocate(
                f.as_raw_fd(),
                init_size as u64,
                set_size - init_size,
                (libc::FALLOC_FL_PUNCH_HOLE | libc::FALLOC_FL_KEEP_SIZE) as u64,
                68,
            )
            .unwrap();
        let (user_data, res) = uring.wait().unwrap().next().unwrap();
        assert_eq!(user_data, 68 as UserData);
        assert_eq!(res.unwrap(), 0 as i32);

        drop(f); // Close to ensure directory entries for metadata are updated.

        let new_size = std::fs::metadata(&file_path).unwrap().len() as usize;
        assert_eq!(new_size, set_size);
    }

    #[test]
    fn dev_zero_readable() {
        let f = File::open(Path::new("/dev/zero")).unwrap();
        let mut uring = URingContext::new(16).unwrap();
        uring
            .add_poll_fd(f.as_raw_fd(), WatchingEvents::empty().set_read(), 454)
            .unwrap();
        let (user_data, res) = uring.wait().unwrap().next().unwrap();
        assert_eq!(user_data, 454 as UserData);
        // /dev/zero is always readable; 1 is presumably the returned POLLIN mask — confirm
        // against the kernel's poll completion semantics.
        assert_eq!(res.unwrap(), 1 as i32);
    }
}