author    Dylan Reid <dgreid@chromium.org>       2020-03-13 13:40:40 -0700
committer Commit Bot <commit-bot@chromium.org>   2020-04-05 21:32:20 +0000
commit    a9a1d0227713d22df6262ca99dd622281e8dc9bf
tree      3f99d11aa72dc30d1b05e3e1b3d96d3391c5090b
parent    252d5b3cf3fd7a48fe9d610b59e3d6da9f2c6fe9
Add io_uring interfaces
Provide a low-level interface for operating the kernel's new io_uring
mechanism.

This is an unsafe interface with the basic operations of setting up the
ring, adding to it, removing from it, and polling it.

This will be followed by a safe interface layer on top of this code,
then by additions to the asynchronous executor that allow multiple
operations on files to be performed asynchronously from one context.
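
A minimal sketch of the intended usage (illustrative only; it assumes an
open file `f` and a buffer `buf` that outlives the operation, using the
URingContext API added below):

    let mut uring = URingContext::new(16).unwrap();
    // Unsafe: the kernel writes into `buf`, so it must stay alive and
    // otherwise unreferenced until `wait` returns this op's completion.
    unsafe {
        uring
            .add_read(buf.as_mut_ptr(), buf.len(), f.as_raw_fd(), 0, 99)
            .unwrap();
    }
    // `wait` submits anything queued and blocks for at least one completion.
    let (user_data, res) = uring.wait().unwrap().next().unwrap();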

Change-Id: I71f7ffb04ce8cb4da470deda9aee768ab95d3d98
Reviewed-on: https://chromium-review.googlesource.com/c/chromiumos/platform/crosvm/+/2124009
Reviewed-by: Dylan Reid <dgreid@chromium.org>
Reviewed-by: Zach Reizner <zachr@chromium.org>
Tested-by: Dylan Reid <dgreid@chromium.org>
Tested-by: kokoro <noreply+kokoro@google.com>
Commit-Queue: Dylan Reid <dgreid@chromium.org>
-rw-r--r--  io_uring/Cargo.toml        15
-rw-r--r--  io_uring/src/bindings.rs  516
-rw-r--r--  io_uring/src/lib.rs         9
-rw-r--r--  io_uring/src/syscalls.rs   41
-rw-r--r--  io_uring/src/uring.rs     772
5 files changed, 1353 insertions, 0 deletions
diff --git a/io_uring/Cargo.toml b/io_uring/Cargo.toml
new file mode 100644
index 0000000..1b633d3
--- /dev/null
+++ b/io_uring/Cargo.toml
@@ -0,0 +1,15 @@
+[package]
+name = "io_uring"
+version = "0.1.0"
+authors = ["The Chromium OS Authors"]
+edition = "2018"
+
+[dependencies]
+libc = "*"
+syscall_defines = { path = "../syscall_defines" }
+sys_util = { path = "../sys_util" }
+
+[dev-dependencies]
+tempfile = { path = "../tempfile" }
+
+[workspace]
diff --git a/io_uring/src/bindings.rs b/io_uring/src/bindings.rs
new file mode 100644
index 0000000..382a738
--- /dev/null
+++ b/io_uring/src/bindings.rs
@@ -0,0 +1,516 @@
+/* automatically generated by rust-bindgen
+ *
+ * bindgen --with-derive-default include/uapi/linux/io_uring.h
+ */
+
+#![allow(non_upper_case_globals)]
+#![allow(non_camel_case_types)]
+#![allow(non_snake_case)]
+#![allow(dead_code)]
+
+#[repr(C)]
+#[derive(Default)]
+pub struct __IncompleteArrayField<T>(::std::marker::PhantomData<T>);
+impl<T> __IncompleteArrayField<T> {
+    #[inline]
+    pub fn new() -> Self {
+        __IncompleteArrayField(::std::marker::PhantomData)
+    }
+    #[inline]
+    pub unsafe fn as_ptr(&self) -> *const T {
+        ::std::mem::transmute(self)
+    }
+    #[inline]
+    pub unsafe fn as_mut_ptr(&mut self) -> *mut T {
+        ::std::mem::transmute(self)
+    }
+    #[inline]
+    pub unsafe fn as_slice(&self, len: usize) -> &[T] {
+        ::std::slice::from_raw_parts(self.as_ptr(), len)
+    }
+    #[inline]
+    pub unsafe fn as_mut_slice(&mut self, len: usize) -> &mut [T] {
+        ::std::slice::from_raw_parts_mut(self.as_mut_ptr(), len)
+    }
+}
+impl<T> ::std::fmt::Debug for __IncompleteArrayField<T> {
+    fn fmt(&self, fmt: &mut ::std::fmt::Formatter) -> ::std::fmt::Result {
+        fmt.write_str("__IncompleteArrayField")
+    }
+}
+impl<T> ::std::clone::Clone for __IncompleteArrayField<T> {
+    #[inline]
+    fn clone(&self) -> Self {
+        Self::new()
+    }
+}
+impl<T> ::std::marker::Copy for __IncompleteArrayField<T> {}
+pub const NR_OPEN: ::std::os::raw::c_uint = 1024;
+pub const NGROUPS_MAX: ::std::os::raw::c_uint = 65536;
+pub const ARG_MAX: ::std::os::raw::c_uint = 131072;
+pub const LINK_MAX: ::std::os::raw::c_uint = 127;
+pub const MAX_CANON: ::std::os::raw::c_uint = 255;
+pub const MAX_INPUT: ::std::os::raw::c_uint = 255;
+pub const NAME_MAX: ::std::os::raw::c_uint = 255;
+pub const PATH_MAX: ::std::os::raw::c_uint = 4096;
+pub const PIPE_BUF: ::std::os::raw::c_uint = 4096;
+pub const XATTR_NAME_MAX: ::std::os::raw::c_uint = 255;
+pub const XATTR_SIZE_MAX: ::std::os::raw::c_uint = 65536;
+pub const XATTR_LIST_MAX: ::std::os::raw::c_uint = 65536;
+pub const RTSIG_MAX: ::std::os::raw::c_uint = 32;
+pub const _IOC_NRBITS: ::std::os::raw::c_uint = 8;
+pub const _IOC_TYPEBITS: ::std::os::raw::c_uint = 8;
+pub const _IOC_SIZEBITS: ::std::os::raw::c_uint = 14;
+pub const _IOC_DIRBITS: ::std::os::raw::c_uint = 2;
+pub const _IOC_NRMASK: ::std::os::raw::c_uint = 255;
+pub const _IOC_TYPEMASK: ::std::os::raw::c_uint = 255;
+pub const _IOC_SIZEMASK: ::std::os::raw::c_uint = 16383;
+pub const _IOC_DIRMASK: ::std::os::raw::c_uint = 3;
+pub const _IOC_NRSHIFT: ::std::os::raw::c_uint = 0;
+pub const _IOC_TYPESHIFT: ::std::os::raw::c_uint = 8;
+pub const _IOC_SIZESHIFT: ::std::os::raw::c_uint = 16;
+pub const _IOC_DIRSHIFT: ::std::os::raw::c_uint = 30;
+pub const _IOC_NONE: ::std::os::raw::c_uint = 0;
+pub const _IOC_WRITE: ::std::os::raw::c_uint = 1;
+pub const _IOC_READ: ::std::os::raw::c_uint = 2;
+pub const IOC_IN: ::std::os::raw::c_uint = 1073741824;
+pub const IOC_OUT: ::std::os::raw::c_uint = 2147483648;
+pub const IOC_INOUT: ::std::os::raw::c_uint = 3221225472;
+pub const IOCSIZE_MASK: ::std::os::raw::c_uint = 1073676288;
+pub const IOCSIZE_SHIFT: ::std::os::raw::c_uint = 16;
+pub const __BITS_PER_LONG: ::std::os::raw::c_uint = 64;
+pub const __FD_SETSIZE: ::std::os::raw::c_uint = 1024;
+pub const MS_RDONLY: ::std::os::raw::c_uint = 1;
+pub const MS_NOSUID: ::std::os::raw::c_uint = 2;
+pub const MS_NODEV: ::std::os::raw::c_uint = 4;
+pub const MS_NOEXEC: ::std::os::raw::c_uint = 8;
+pub const MS_SYNCHRONOUS: ::std::os::raw::c_uint = 16;
+pub const MS_REMOUNT: ::std::os::raw::c_uint = 32;
+pub const MS_MANDLOCK: ::std::os::raw::c_uint = 64;
+pub const MS_DIRSYNC: ::std::os::raw::c_uint = 128;
+pub const MS_NOATIME: ::std::os::raw::c_uint = 1024;
+pub const MS_NODIRATIME: ::std::os::raw::c_uint = 2048;
+pub const MS_BIND: ::std::os::raw::c_uint = 4096;
+pub const MS_MOVE: ::std::os::raw::c_uint = 8192;
+pub const MS_REC: ::std::os::raw::c_uint = 16384;
+pub const MS_VERBOSE: ::std::os::raw::c_uint = 32768;
+pub const MS_SILENT: ::std::os::raw::c_uint = 32768;
+pub const MS_POSIXACL: ::std::os::raw::c_uint = 65536;
+pub const MS_UNBINDABLE: ::std::os::raw::c_uint = 131072;
+pub const MS_PRIVATE: ::std::os::raw::c_uint = 262144;
+pub const MS_SLAVE: ::std::os::raw::c_uint = 524288;
+pub const MS_SHARED: ::std::os::raw::c_uint = 1048576;
+pub const MS_RELATIME: ::std::os::raw::c_uint = 2097152;
+pub const MS_KERNMOUNT: ::std::os::raw::c_uint = 4194304;
+pub const MS_I_VERSION: ::std::os::raw::c_uint = 8388608;
+pub const MS_STRICTATIME: ::std::os::raw::c_uint = 16777216;
+pub const MS_LAZYTIME: ::std::os::raw::c_uint = 33554432;
+pub const MS_SUBMOUNT: ::std::os::raw::c_uint = 67108864;
+pub const MS_NOREMOTELOCK: ::std::os::raw::c_uint = 134217728;
+pub const MS_NOSEC: ::std::os::raw::c_uint = 268435456;
+pub const MS_BORN: ::std::os::raw::c_uint = 536870912;
+pub const MS_ACTIVE: ::std::os::raw::c_uint = 1073741824;
+pub const MS_NOUSER: ::std::os::raw::c_uint = 2147483648;
+pub const MS_RMT_MASK: ::std::os::raw::c_uint = 41943121;
+pub const MS_MGC_VAL: ::std::os::raw::c_uint = 3236757504;
+pub const MS_MGC_MSK: ::std::os::raw::c_uint = 4294901760;
+pub const OPEN_TREE_CLONE: ::std::os::raw::c_uint = 1;
+pub const MOVE_MOUNT_F_SYMLINKS: ::std::os::raw::c_uint = 1;
+pub const MOVE_MOUNT_F_AUTOMOUNTS: ::std::os::raw::c_uint = 2;
+pub const MOVE_MOUNT_F_EMPTY_PATH: ::std::os::raw::c_uint = 4;
+pub const MOVE_MOUNT_T_SYMLINKS: ::std::os::raw::c_uint = 16;
+pub const MOVE_MOUNT_T_AUTOMOUNTS: ::std::os::raw::c_uint = 32;
+pub const MOVE_MOUNT_T_EMPTY_PATH: ::std::os::raw::c_uint = 64;
+pub const MOVE_MOUNT__MASK: ::std::os::raw::c_uint = 119;
+pub const FSOPEN_CLOEXEC: ::std::os::raw::c_uint = 1;
+pub const FSPICK_CLOEXEC: ::std::os::raw::c_uint = 1;
+pub const FSPICK_SYMLINK_NOFOLLOW: ::std::os::raw::c_uint = 2;
+pub const FSPICK_NO_AUTOMOUNT: ::std::os::raw::c_uint = 4;
+pub const FSPICK_EMPTY_PATH: ::std::os::raw::c_uint = 8;
+pub const FSMOUNT_CLOEXEC: ::std::os::raw::c_uint = 1;
+pub const MOUNT_ATTR_RDONLY: ::std::os::raw::c_uint = 1;
+pub const MOUNT_ATTR_NOSUID: ::std::os::raw::c_uint = 2;
+pub const MOUNT_ATTR_NODEV: ::std::os::raw::c_uint = 4;
+pub const MOUNT_ATTR_NOEXEC: ::std::os::raw::c_uint = 8;
+pub const MOUNT_ATTR__ATIME: ::std::os::raw::c_uint = 112;
+pub const MOUNT_ATTR_RELATIME: ::std::os::raw::c_uint = 0;
+pub const MOUNT_ATTR_NOATIME: ::std::os::raw::c_uint = 16;
+pub const MOUNT_ATTR_STRICTATIME: ::std::os::raw::c_uint = 32;
+pub const MOUNT_ATTR_NODIRATIME: ::std::os::raw::c_uint = 128;
+pub const INR_OPEN_CUR: ::std::os::raw::c_uint = 1024;
+pub const INR_OPEN_MAX: ::std::os::raw::c_uint = 4096;
+pub const BLOCK_SIZE_BITS: ::std::os::raw::c_uint = 10;
+pub const BLOCK_SIZE: ::std::os::raw::c_uint = 1024;
+pub const SEEK_SET: ::std::os::raw::c_uint = 0;
+pub const SEEK_CUR: ::std::os::raw::c_uint = 1;
+pub const SEEK_END: ::std::os::raw::c_uint = 2;
+pub const SEEK_DATA: ::std::os::raw::c_uint = 3;
+pub const SEEK_HOLE: ::std::os::raw::c_uint = 4;
+pub const SEEK_MAX: ::std::os::raw::c_uint = 4;
+pub const RENAME_NOREPLACE: ::std::os::raw::c_uint = 1;
+pub const RENAME_EXCHANGE: ::std::os::raw::c_uint = 2;
+pub const RENAME_WHITEOUT: ::std::os::raw::c_uint = 4;
+pub const FILE_DEDUPE_RANGE_SAME: ::std::os::raw::c_uint = 0;
+pub const FILE_DEDUPE_RANGE_DIFFERS: ::std::os::raw::c_uint = 1;
+pub const NR_FILE: ::std::os::raw::c_uint = 8192;
+pub const FS_XFLAG_REALTIME: ::std::os::raw::c_uint = 1;
+pub const FS_XFLAG_PREALLOC: ::std::os::raw::c_uint = 2;
+pub const FS_XFLAG_IMMUTABLE: ::std::os::raw::c_uint = 8;
+pub const FS_XFLAG_APPEND: ::std::os::raw::c_uint = 16;
+pub const FS_XFLAG_SYNC: ::std::os::raw::c_uint = 32;
+pub const FS_XFLAG_NOATIME: ::std::os::raw::c_uint = 64;
+pub const FS_XFLAG_NODUMP: ::std::os::raw::c_uint = 128;
+pub const FS_XFLAG_RTINHERIT: ::std::os::raw::c_uint = 256;
+pub const FS_XFLAG_PROJINHERIT: ::std::os::raw::c_uint = 512;
+pub const FS_XFLAG_NOSYMLINKS: ::std::os::raw::c_uint = 1024;
+pub const FS_XFLAG_EXTSIZE: ::std::os::raw::c_uint = 2048;
+pub const FS_XFLAG_EXTSZINHERIT: ::std::os::raw::c_uint = 4096;
+pub const FS_XFLAG_NODEFRAG: ::std::os::raw::c_uint = 8192;
+pub const FS_XFLAG_FILESTREAM: ::std::os::raw::c_uint = 16384;
+pub const FS_XFLAG_DAX: ::std::os::raw::c_uint = 32768;
+pub const FS_XFLAG_COWEXTSIZE: ::std::os::raw::c_uint = 65536;
+pub const FS_XFLAG_HASATTR: ::std::os::raw::c_uint = 2147483648;
+pub const BMAP_IOCTL: ::std::os::raw::c_uint = 1;
+pub const FSLABEL_MAX: ::std::os::raw::c_uint = 256;
+pub const FS_KEY_DESCRIPTOR_SIZE: ::std::os::raw::c_uint = 8;
+pub const FS_POLICY_FLAGS_PAD_4: ::std::os::raw::c_uint = 0;
+pub const FS_POLICY_FLAGS_PAD_8: ::std::os::raw::c_uint = 1;
+pub const FS_POLICY_FLAGS_PAD_16: ::std::os::raw::c_uint = 2;
+pub const FS_POLICY_FLAGS_PAD_32: ::std::os::raw::c_uint = 3;
+pub const FS_POLICY_FLAGS_PAD_MASK: ::std::os::raw::c_uint = 3;
+pub const FS_POLICY_FLAG_DIRECT_KEY: ::std::os::raw::c_uint = 4;
+pub const FS_POLICY_FLAGS_VALID: ::std::os::raw::c_uint = 7;
+pub const FS_ENCRYPTION_MODE_INVALID: ::std::os::raw::c_uint = 0;
+pub const FS_ENCRYPTION_MODE_AES_256_XTS: ::std::os::raw::c_uint = 1;
+pub const FS_ENCRYPTION_MODE_AES_256_GCM: ::std::os::raw::c_uint = 2;
+pub const FS_ENCRYPTION_MODE_AES_256_CBC: ::std::os::raw::c_uint = 3;
+pub const FS_ENCRYPTION_MODE_AES_256_CTS: ::std::os::raw::c_uint = 4;
+pub const FS_ENCRYPTION_MODE_AES_128_CBC: ::std::os::raw::c_uint = 5;
+pub const FS_ENCRYPTION_MODE_AES_128_CTS: ::std::os::raw::c_uint = 6;
+pub const FS_ENCRYPTION_MODE_SPECK128_256_XTS: ::std::os::raw::c_uint = 7;
+pub const FS_ENCRYPTION_MODE_SPECK128_256_CTS: ::std::os::raw::c_uint = 8;
+pub const FS_ENCRYPTION_MODE_ADIANTUM: ::std::os::raw::c_uint = 9;
+pub const FS_KEY_DESC_PREFIX: &'static [u8; 9usize] = b"fscrypt:\0";
+pub const FS_KEY_DESC_PREFIX_SIZE: ::std::os::raw::c_uint = 8;
+pub const FS_MAX_KEY_SIZE: ::std::os::raw::c_uint = 64;
+pub const FS_SECRM_FL: ::std::os::raw::c_uint = 1;
+pub const FS_UNRM_FL: ::std::os::raw::c_uint = 2;
+pub const FS_COMPR_FL: ::std::os::raw::c_uint = 4;
+pub const FS_SYNC_FL: ::std::os::raw::c_uint = 8;
+pub const FS_IMMUTABLE_FL: ::std::os::raw::c_uint = 16;
+pub const FS_APPEND_FL: ::std::os::raw::c_uint = 32;
+pub const FS_NODUMP_FL: ::std::os::raw::c_uint = 64;
+pub const FS_NOATIME_FL: ::std::os::raw::c_uint = 128;
+pub const FS_DIRTY_FL: ::std::os::raw::c_uint = 256;
+pub const FS_COMPRBLK_FL: ::std::os::raw::c_uint = 512;
+pub const FS_NOCOMP_FL: ::std::os::raw::c_uint = 1024;
+pub const FS_ENCRYPT_FL: ::std::os::raw::c_uint = 2048;
+pub const FS_BTREE_FL: ::std::os::raw::c_uint = 4096;
+pub const FS_INDEX_FL: ::std::os::raw::c_uint = 4096;
+pub const FS_IMAGIC_FL: ::std::os::raw::c_uint = 8192;
+pub const FS_JOURNAL_DATA_FL: ::std::os::raw::c_uint = 16384;
+pub const FS_NOTAIL_FL: ::std::os::raw::c_uint = 32768;
+pub const FS_DIRSYNC_FL: ::std::os::raw::c_uint = 65536;
+pub const FS_TOPDIR_FL: ::std::os::raw::c_uint = 131072;
+pub const FS_HUGE_FILE_FL: ::std::os::raw::c_uint = 262144;
+pub const FS_EXTENT_FL: ::std::os::raw::c_uint = 524288;
+pub const FS_EA_INODE_FL: ::std::os::raw::c_uint = 2097152;
+pub const FS_EOFBLOCKS_FL: ::std::os::raw::c_uint = 4194304;
+pub const FS_NOCOW_FL: ::std::os::raw::c_uint = 8388608;
+pub const FS_INLINE_DATA_FL: ::std::os::raw::c_uint = 268435456;
+pub const FS_PROJINHERIT_FL: ::std::os::raw::c_uint = 536870912;
+pub const FS_RESERVED_FL: ::std::os::raw::c_uint = 2147483648;
+pub const FS_FL_USER_VISIBLE: ::std::os::raw::c_uint = 253951;
+pub const FS_FL_USER_MODIFIABLE: ::std::os::raw::c_uint = 229631;
+pub const SYNC_FILE_RANGE_WAIT_BEFORE: ::std::os::raw::c_uint = 1;
+pub const SYNC_FILE_RANGE_WRITE: ::std::os::raw::c_uint = 2;
+pub const SYNC_FILE_RANGE_WAIT_AFTER: ::std::os::raw::c_uint = 4;
+pub const SYNC_FILE_RANGE_WRITE_AND_WAIT: ::std::os::raw::c_uint = 7;
+pub const IORING_SETUP_IOPOLL: ::std::os::raw::c_uint = 1;
+pub const IORING_SETUP_SQPOLL: ::std::os::raw::c_uint = 2;
+pub const IORING_SETUP_SQ_AFF: ::std::os::raw::c_uint = 4;
+pub const IORING_SETUP_CQSIZE: ::std::os::raw::c_uint = 8;
+pub const IORING_SETUP_CLAMP: ::std::os::raw::c_uint = 16;
+pub const IORING_SETUP_ATTACH_WQ: ::std::os::raw::c_uint = 32;
+pub const IORING_FSYNC_DATASYNC: ::std::os::raw::c_uint = 1;
+pub const IORING_TIMEOUT_ABS: ::std::os::raw::c_uint = 1;
+pub const IORING_OFF_SQ_RING: ::std::os::raw::c_uint = 0;
+pub const IORING_OFF_CQ_RING: ::std::os::raw::c_uint = 134217728;
+pub const IORING_OFF_SQES: ::std::os::raw::c_uint = 268435456;
+pub const IORING_SQ_NEED_WAKEUP: ::std::os::raw::c_uint = 1;
+pub const IORING_ENTER_GETEVENTS: ::std::os::raw::c_uint = 1;
+pub const IORING_ENTER_SQ_WAKEUP: ::std::os::raw::c_uint = 2;
+pub const IORING_FEAT_SINGLE_MMAP: ::std::os::raw::c_uint = 1;
+pub const IORING_FEAT_NODROP: ::std::os::raw::c_uint = 2;
+pub const IORING_FEAT_SUBMIT_STABLE: ::std::os::raw::c_uint = 4;
+pub const IORING_FEAT_RW_CUR_POS: ::std::os::raw::c_uint = 8;
+pub const IORING_FEAT_CUR_PERSONALITY: ::std::os::raw::c_uint = 16;
+pub const IORING_REGISTER_BUFFERS: ::std::os::raw::c_uint = 0;
+pub const IORING_UNREGISTER_BUFFERS: ::std::os::raw::c_uint = 1;
+pub const IORING_REGISTER_FILES: ::std::os::raw::c_uint = 2;
+pub const IORING_UNREGISTER_FILES: ::std::os::raw::c_uint = 3;
+pub const IORING_REGISTER_EVENTFD: ::std::os::raw::c_uint = 4;
+pub const IORING_UNREGISTER_EVENTFD: ::std::os::raw::c_uint = 5;
+pub const IORING_REGISTER_FILES_UPDATE: ::std::os::raw::c_uint = 6;
+pub const IORING_REGISTER_EVENTFD_ASYNC: ::std::os::raw::c_uint = 7;
+pub const IORING_REGISTER_PROBE: ::std::os::raw::c_uint = 8;
+pub const IORING_REGISTER_PERSONALITY: ::std::os::raw::c_uint = 9;
+pub const IORING_UNREGISTER_PERSONALITY: ::std::os::raw::c_uint = 10;
+pub const IO_URING_OP_SUPPORTED: ::std::os::raw::c_uint = 1;
+#[repr(C)]
+#[derive(Debug, Default, Copy, Clone)]
+pub struct file_clone_range {
+    pub src_fd: i64,
+    pub src_offset: u64,
+    pub src_length: u64,
+    pub dest_offset: u64,
+}
+#[repr(C)]
+#[derive(Debug, Default, Copy, Clone)]
+pub struct fstrim_range {
+    pub start: u64,
+    pub len: u64,
+    pub minlen: u64,
+}
+#[repr(C)]
+#[derive(Debug, Default, Copy, Clone)]
+pub struct file_dedupe_range_info {
+    pub dest_fd: i64,
+    pub dest_offset: u64,
+    pub bytes_deduped: u64,
+    pub status: i32,
+    pub reserved: u32,
+}
+#[repr(C)]
+#[derive(Debug, Default)]
+pub struct file_dedupe_range {
+    pub src_offset: u64,
+    pub src_length: u64,
+    pub dest_count: u16,
+    pub reserved1: u16,
+    pub reserved2: u32,
+    pub info: __IncompleteArrayField<file_dedupe_range_info>,
+}
+#[repr(C)]
+#[derive(Debug, Default, Copy, Clone)]
+pub struct files_stat_struct {
+    pub nr_files: ::std::os::raw::c_ulong,
+    pub nr_free_files: ::std::os::raw::c_ulong,
+    pub max_files: ::std::os::raw::c_ulong,
+}
+#[repr(C)]
+#[derive(Debug, Default, Copy, Clone)]
+pub struct inodes_stat_t {
+    pub nr_inodes: ::std::os::raw::c_long,
+    pub nr_unused: ::std::os::raw::c_long,
+    pub dummy: [::std::os::raw::c_long; 5usize],
+}
+#[repr(C)]
+#[derive(Debug, Default, Copy, Clone)]
+pub struct fsxattr {
+    pub fsx_xflags: u32,
+    pub fsx_extsize: u32,
+    pub fsx_nextents: u32,
+    pub fsx_projid: u32,
+    pub fsx_cowextsize: u32,
+    pub fsx_pad: [::std::os::raw::c_uchar; 8usize],
+}
+#[repr(C)]
+#[derive(Debug, Default, Copy, Clone)]
+pub struct fscrypt_policy {
+    pub version: u8,
+    pub contents_encryption_mode: u8,
+    pub filenames_encryption_mode: u8,
+    pub flags: u8,
+    pub master_key_descriptor: [u8; 8usize],
+}
+#[repr(C)]
+#[derive(Copy, Clone)]
+pub struct fscrypt_key {
+    pub mode: u32,
+    pub raw: [u8; 64usize],
+    pub size: u32,
+}
+impl Default for fscrypt_key {
+    fn default() -> Self {
+        unsafe { ::std::mem::zeroed() }
+    }
+}
+pub type __kernel_rwf_t = ::std::os::raw::c_int;
+#[repr(C)]
+#[derive(Copy, Clone)]
+pub struct io_uring_sqe {
+    pub opcode: u8,
+    pub flags: u8,
+    pub ioprio: u16,
+    pub fd: i32,
+    pub __bindgen_anon_1: io_uring_sqe__bindgen_ty_1,
+    pub addr: u64,
+    pub len: u32,
+    pub __bindgen_anon_2: io_uring_sqe__bindgen_ty_2,
+    pub user_data: u64,
+    pub __bindgen_anon_3: io_uring_sqe__bindgen_ty_3,
+}
+#[repr(C)]
+#[derive(Copy, Clone)]
+pub union io_uring_sqe__bindgen_ty_1 {
+    pub off: u64,
+    pub addr2: u64,
+    _bindgen_union_align: u64,
+}
+impl Default for io_uring_sqe__bindgen_ty_1 {
+    fn default() -> Self {
+        unsafe { ::std::mem::zeroed() }
+    }
+}
+#[repr(C)]
+#[derive(Copy, Clone)]
+pub union io_uring_sqe__bindgen_ty_2 {
+    pub rw_flags: __kernel_rwf_t,
+    pub fsync_flags: u32,
+    pub poll_events: u16,
+    pub sync_range_flags: u32,
+    pub msg_flags: u32,
+    pub timeout_flags: u32,
+    pub accept_flags: u32,
+    pub cancel_flags: u32,
+    pub open_flags: u32,
+    pub statx_flags: u32,
+    pub fadvise_advice: u32,
+    _bindgen_union_align: u32,
+}
+impl Default for io_uring_sqe__bindgen_ty_2 {
+    fn default() -> Self {
+        unsafe { ::std::mem::zeroed() }
+    }
+}
+#[repr(C)]
+#[derive(Copy, Clone)]
+pub union io_uring_sqe__bindgen_ty_3 {
+    pub __bindgen_anon_1: io_uring_sqe__bindgen_ty_3__bindgen_ty_1,
+    pub __pad2: [u64; 3usize],
+    _bindgen_union_align: [u64; 3usize],
+}
+#[repr(C)]
+#[derive(Debug, Default, Copy, Clone)]
+pub struct io_uring_sqe__bindgen_ty_3__bindgen_ty_1 {
+    pub buf_index: u16,
+    pub personality: u16,
+}
+impl Default for io_uring_sqe__bindgen_ty_3 {
+    fn default() -> Self {
+        unsafe { ::std::mem::zeroed() }
+    }
+}
+impl Default for io_uring_sqe {
+    fn default() -> Self {
+        unsafe { ::std::mem::zeroed() }
+    }
+}
+pub const IOSQE_FIXED_FILE_BIT: _bindgen_ty_1 = 0;
+pub const IOSQE_IO_DRAIN_BIT: _bindgen_ty_1 = 1;
+pub const IOSQE_IO_LINK_BIT: _bindgen_ty_1 = 2;
+pub const IOSQE_IO_HARDLINK_BIT: _bindgen_ty_1 = 3;
+pub const IOSQE_ASYNC_BIT: _bindgen_ty_1 = 4;
+pub type _bindgen_ty_1 = ::std::os::raw::c_uint;
+pub const IORING_OP_NOP: _bindgen_ty_2 = 0;
+pub const IORING_OP_READV: _bindgen_ty_2 = 1;
+pub const IORING_OP_WRITEV: _bindgen_ty_2 = 2;
+pub const IORING_OP_FSYNC: _bindgen_ty_2 = 3;
+pub const IORING_OP_READ_FIXED: _bindgen_ty_2 = 4;
+pub const IORING_OP_WRITE_FIXED: _bindgen_ty_2 = 5;
+pub const IORING_OP_POLL_ADD: _bindgen_ty_2 = 6;
+pub const IORING_OP_POLL_REMOVE: _bindgen_ty_2 = 7;
+pub const IORING_OP_SYNC_FILE_RANGE: _bindgen_ty_2 = 8;
+pub const IORING_OP_SENDMSG: _bindgen_ty_2 = 9;
+pub const IORING_OP_RECVMSG: _bindgen_ty_2 = 10;
+pub const IORING_OP_TIMEOUT: _bindgen_ty_2 = 11;
+pub const IORING_OP_TIMEOUT_REMOVE: _bindgen_ty_2 = 12;
+pub const IORING_OP_ACCEPT: _bindgen_ty_2 = 13;
+pub const IORING_OP_ASYNC_CANCEL: _bindgen_ty_2 = 14;
+pub const IORING_OP_LINK_TIMEOUT: _bindgen_ty_2 = 15;
+pub const IORING_OP_CONNECT: _bindgen_ty_2 = 16;
+pub const IORING_OP_FALLOCATE: _bindgen_ty_2 = 17;
+pub const IORING_OP_OPENAT: _bindgen_ty_2 = 18;
+pub const IORING_OP_CLOSE: _bindgen_ty_2 = 19;
+pub const IORING_OP_FILES_UPDATE: _bindgen_ty_2 = 20;
+pub const IORING_OP_STATX: _bindgen_ty_2 = 21;
+pub const IORING_OP_READ: _bindgen_ty_2 = 22;
+pub const IORING_OP_WRITE: _bindgen_ty_2 = 23;
+pub const IORING_OP_FADVISE: _bindgen_ty_2 = 24;
+pub const IORING_OP_MADVISE: _bindgen_ty_2 = 25;
+pub const IORING_OP_SEND: _bindgen_ty_2 = 26;
+pub const IORING_OP_RECV: _bindgen_ty_2 = 27;
+pub const IORING_OP_OPENAT2: _bindgen_ty_2 = 28;
+pub const IORING_OP_EPOLL_CTL: _bindgen_ty_2 = 29;
+pub const IORING_OP_LAST: _bindgen_ty_2 = 30;
+pub type _bindgen_ty_2 = ::std::os::raw::c_uint;
+#[repr(C)]
+#[derive(Debug, Default, Copy, Clone)]
+pub struct io_uring_cqe {
+    pub user_data: u64,
+    pub res: i32,
+    pub flags: u32,
+}
+#[repr(C)]
+#[derive(Debug, Default, Copy, Clone)]
+pub struct io_sqring_offsets {
+    pub head: u32,
+    pub tail: u32,
+    pub ring_mask: u32,
+    pub ring_entries: u32,
+    pub flags: u32,
+    pub dropped: u32,
+    pub array: u32,
+    pub resv1: u32,
+    pub resv2: u64,
+}
+#[repr(C)]
+#[derive(Debug, Default, Copy, Clone)]
+pub struct io_cqring_offsets {
+    pub head: u32,
+    pub tail: u32,
+    pub ring_mask: u32,
+    pub ring_entries: u32,
+    pub overflow: u32,
+    pub cqes: u32,
+    pub resv: [u64; 2usize],
+}
+#[repr(C)]
+#[derive(Debug, Default, Copy, Clone)]
+pub struct io_uring_params {
+    pub sq_entries: u32,
+    pub cq_entries: u32,
+    pub flags: u32,
+    pub sq_thread_cpu: u32,
+    pub sq_thread_idle: u32,
+    pub features: u32,
+    pub wq_fd: u32,
+    pub resv: [u32; 3usize],
+    pub sq_off: io_sqring_offsets,
+    pub cq_off: io_cqring_offsets,
+}
+#[repr(C)]
+#[derive(Debug, Default, Copy, Clone)]
+pub struct io_uring_files_update {
+    pub offset: u32,
+    pub resv: u32,
+    pub fds: u64,
+}
+#[repr(C)]
+#[derive(Debug, Default, Copy, Clone)]
+pub struct io_uring_probe_op {
+    pub op: u8,
+    pub resv: u8,
+    pub flags: u16,
+    pub resv2: u32,
+}
+#[repr(C)]
+#[derive(Debug, Default)]
+pub struct io_uring_probe {
+    pub last_op: u8,
+    pub ops_len: u8,
+    pub resv: u16,
+    pub resv2: [u32; 3usize],
+    pub ops: __IncompleteArrayField<io_uring_probe_op>,
+}
diff --git a/io_uring/src/lib.rs b/io_uring/src/lib.rs
new file mode 100644
index 0000000..071c503
--- /dev/null
+++ b/io_uring/src/lib.rs
@@ -0,0 +1,9 @@
+// Copyright 2020 The Chromium OS Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
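+//! Low-level, unsafe support for the kernel's io_uring interface. The types in `uring`,
+//! re-exported here, are the intended entry point.
+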
+mod bindings;
+mod syscalls;
+mod uring;
+
+pub use uring::*;
diff --git a/io_uring/src/syscalls.rs b/io_uring/src/syscalls.rs
new file mode 100644
index 0000000..e52892b
--- /dev/null
+++ b/io_uring/src/syscalls.rs
@@ -0,0 +1,41 @@
+// Copyright 2020 The Chromium OS Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+use std::os::unix::io::RawFd;
+use std::ptr::null_mut;
+
+use libc::{c_int, c_long, c_void};
+use syscall_defines::linux::LinuxSyscall::*;
+
+use crate::bindings::*;
+
+/// Result type whose error holds the raw system errno.
+pub type Result<T> = std::result::Result<T, c_int>;
+
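+/// Creates an io_uring context via the `io_uring_setup` syscall, returning the new ring FD.
+/// On success the kernel has filled in the queue offsets in `params`; the FD is then used to
+/// mmap the submit and completion rings.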
+pub unsafe fn io_uring_setup(num_entries: usize, params: &mut io_uring_params) -> Result<RawFd> {
+    let ret = libc::syscall(
+        SYS_io_uring_setup as c_long,
+        num_entries as c_int,
+        params as *mut io_uring_params,
+    );
+    if ret < 0 {
+        return Err(*libc::__errno_location());
+    }
+    Ok(ret as RawFd)
+}
+
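+/// Enters the ring via the `io_uring_enter` syscall, submitting `to_submit` queued sqes and,
+/// if `IORING_ENTER_GETEVENTS` is set in `flags`, waiting for at least `to_wait` completions.
+/// # Safety
+/// The submitted sqes may point to arbitrary memory; the caller must keep those buffers valid
+/// until the corresponding completions are reaped.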
+pub unsafe fn io_uring_enter(fd: RawFd, to_submit: u64, to_wait: u64, flags: u32) -> Result<()> {
+    let ret = libc::syscall(
+        SYS_io_uring_enter as c_long,
+        fd,
+        to_submit as c_int,
+        to_wait as c_int,
+        flags as c_int,
+        null_mut::<*mut c_void>(),
+    );
+    if ret < 0 {
+        return Err(*libc::__errno_location());
+    }
+    Ok(())
+}
diff --git a/io_uring/src/uring.rs b/io_uring/src/uring.rs
new file mode 100644
index 0000000..0a51df1
--- /dev/null
+++ b/io_uring/src/uring.rs
@@ -0,0 +1,772 @@
+// Copyright 2020 The Chromium OS Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+use std::fmt;
+use std::fs::File;
+use std::os::unix::io::{AsRawFd, FromRawFd, RawFd};
+use std::ptr::null_mut;
+use std::sync::atomic::{AtomicU32, Ordering};
+
+use sys_util::{MemoryMapping, WatchingEvents};
+
+use crate::bindings::*;
+use crate::syscalls::*;
+
+/// Holds per-operation, user-specified data. The usage is up to the caller. The most common use is
+/// for callers to identify each request.
+pub type UserData = u64;
+
+#[derive(Debug)]
+pub enum Error {
+    /// The call to `io_uring_enter` failed with the given errno.
+    RingEnter(libc::c_int),
+    /// The call to `io_uring_setup` failed with the given errno.
+    Setup(libc::c_int),
+    /// Failed to map the completion ring.
+    MappingCompleteRing(sys_util::MmapError),
+    /// Failed to map the submit ring.
+    MappingSubmitRing(sys_util::MmapError),
+    /// Failed to map submit entries.
+    MappingSubmitEntries(sys_util::MmapError),
+    /// Too many ops are already queued.
+    NoSpace,
+}
+pub type Result<T> = std::result::Result<T, Error>;
+
+impl fmt::Display for Error {
+    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+        use self::Error::*;
+
+        match self {
+            RingEnter(e) => write!(f, "Failed to enter io uring {}", e),
+            Setup(e) => write!(f, "Failed to setup io uring {}", e),
+            MappingCompleteRing(e) => write!(f, "Failed to mmap completion ring {}", e),
+            MappingSubmitRing(e) => write!(f, "Failed to mmap submit ring {}", e),
+            MappingSubmitEntries(e) => write!(f, "Failed to mmap submit entries {}", e),
+            NoSpace => write!(
+                f,
+                "No space for more ring entries, try increasing the size passed to `new`",
+            ),
+        }
+    }
+}
+
+/// Basic statistics about the operations that have been submitted to the uring.
+#[derive(Default)]
+pub struct URingStats {
+    total_enter_calls: u64, // Number of times the uring has been entered.
+    total_ops: u64,         // Total ops submitted to io_uring.
+    total_complete: u64,    // Total ops completed by io_uring.
+}
+
+/// Unsafe wrapper for the kernel's io_uring interface. Allows for queueing multiple I/O operations
+/// to the kernel and asynchronously handling the completion of these operations.
+/// Use the various `add_*` functions to configure operations, then call `wait` to start
+/// the operations and get any completed results. Each op is given a u64 user_data argument that is
+/// used to identify the result when returned in the iterator provided by `wait`.
+///
+/// # Example polling an FD for readable status.
+///
+/// ```
+/// # use std::fs::File;
+/// # use std::os::unix::io::AsRawFd;
+/// # use std::path::Path;
+/// # use sys_util::WatchingEvents;
+/// # use io_uring::{URingContext, UserData};
+/// let f = File::open(Path::new("/dev/zero")).unwrap();
+/// let mut uring = URingContext::new(16).unwrap();
+/// uring
+///     .add_poll_fd(f.as_raw_fd(), WatchingEvents::empty().set_read(), 454)
+///     .unwrap();
+/// let (user_data, res) = uring.wait().unwrap().next().unwrap();
+/// assert_eq!(user_data, 454 as UserData);
+/// assert_eq!(res.unwrap(), 1 as i32);
+///
+/// ```
+pub struct URingContext {
+    ring_file: File, // Holds the io_uring context FD returned from io_uring_setup.
+    submit_ring: SubmitQueueState,
+    submit_queue_entries: SubmitQueueEntries,
+    complete_ring: CompleteQueueState,
+    io_vecs: Vec<libc::iovec>,
+    in_flight: usize, // The number of pending operations.
+    added: usize,     // The number of ops added since the last call to `io_uring_enter`.
+    stats: URingStats,
+}
+
+impl URingContext {
+    /// Creates a `URingContext` where the underlying uring has space for `num_entries`
+    /// simultaneous operations.
+    pub fn new(num_entries: usize) -> Result<URingContext> {
+        let mut ring_params = io_uring_params::default();
+        // The below unsafe block isolates the creation of the URingContext. Each step on its own
+        // is unsafe. Using the uring FD for the mapping and the offsets returned by the kernel for
+        // base addresses maintains safety guarantees assuming the kernel API guarantees are
+        // trusted.
+        unsafe {
+            // Safe because the kernel is trusted to only modify params and `File` is created with
+            // an FD that it takes complete ownership of.
+            let fd = io_uring_setup(num_entries, &mut ring_params).map_err(Error::Setup)?;
+            let ring_file = File::from_raw_fd(fd);
+
+            // Mmap the submit and completion queues.
+            // Safe because we trust the kernel to set valid sizes in `io_uring_setup` and any error
+            // is checked.
+            let submit_ring = SubmitQueueState::new(
+                MemoryMapping::from_fd_offset_populate(
+                    &ring_file,
+                    ring_params.sq_off.array as usize
+                        + ring_params.sq_entries as usize * std::mem::size_of::<u32>(),
+                    u64::from(IORING_OFF_SQ_RING),
+                )
+                .map_err(Error::MappingSubmitRing)?,
+                &ring_params,
+            );
+
+            let submit_queue_entries = SubmitQueueEntries {
+                mmap: MemoryMapping::from_fd_offset_populate(
+                    &ring_file,
+                    ring_params.sq_entries as usize * std::mem::size_of::<io_uring_sqe>(),
+                    u64::from(IORING_OFF_SQES),
+                )
+                .map_err(Error::MappingSubmitEntries)?,
+                len: ring_params.sq_entries as usize,
+            };
+
+            let complete_ring = CompleteQueueState::new(
+                MemoryMapping::from_fd_offset_populate(
+                    &ring_file,
+                    ring_params.cq_off.cqes as usize
+                        + ring_params.cq_entries as usize * std::mem::size_of::<io_uring_cqe>(),
+                    u64::from(IORING_OFF_CQ_RING),
+                )
+                .map_err(Error::MappingCompleteRing)?,
+                &ring_params,
+            );
+
+            Ok(URingContext {
+                ring_file,
+                submit_ring,
+                submit_queue_entries,
+                complete_ring,
+                io_vecs: vec![
+                    libc::iovec {
+                        iov_base: null_mut(),
+                        iov_len: 0
+                    };
+                    num_entries
+                ],
+                added: 0,
+                in_flight: 0,
+                stats: Default::default(),
+            })
+        }
+    }
+
+    // Call `f` with the next available sqe or return an error if none are available.
+    // After `f` returns, the sqe is appended to the kernel's queue.
+    fn prep_next_sqe<F>(&mut self, mut f: F) -> Result<()>
+    where
+        F: FnMut(&mut io_uring_sqe, &mut libc::iovec),
+    {
+        // Find the next free submission entry in the submit ring and fill it with an iovec.
+        // The below raw pointer derefs are safe because the memory the pointers use lives as long
+        // as the mmap in self.
+        let tail = self.submit_ring.pointers.tail(Ordering::Relaxed);
+        let next_tail = tail.wrapping_add(1);
+        if next_tail == self.submit_ring.pointers.head(Ordering::Acquire) {
+            return Err(Error::NoSpace);
+        }
+        // `tail` is the next sqe to use.
+        let index = (tail & self.submit_ring.ring_mask) as usize;
+        let sqe = self.submit_queue_entries.get_mut(index).unwrap();
+
+        f(sqe, &mut self.io_vecs[index]);
+
+        // Tells the kernel to use the new index when processing the entry at that index.
+        self.submit_ring.set_array_entry(index, index as u32);
+        // Ensure the above writes to sqe are seen before the tail is updated.
+        // set_tail uses Release ordering when storing to the ring.
+        self.submit_ring.pointers.set_tail(next_tail);
+
+        self.added += 1;
+
+        Ok(())
+    }
+
+    unsafe fn add_rw_op(
+        &mut self,
+        ptr: *const u8,
+        len: usize,
+        fd: RawFd,
+        offset: u64,
+        user_data: UserData,
+        op: u8,
+    ) -> Result<()> {
+        self.prep_next_sqe(|sqe, iovec| {
+            iovec.iov_base = ptr as *const libc::c_void as *mut _;
+            iovec.iov_len = len;
+            sqe.opcode = op;
+            sqe.addr = iovec as *const _ as *const libc::c_void as u64;
+            sqe.len = 1;
+            sqe.__bindgen_anon_1.off = offset;
+            sqe.__bindgen_anon_3.__bindgen_anon_1.buf_index = 0;
+            sqe.ioprio = 0;
+            sqe.user_data = user_data;
+            sqe.flags = 0;
+            sqe.fd = fd;
+        })?;
+
+        Ok(())
+    }
+
+    /// Asynchronously writes to `fd` from the address given in `ptr`.
+    /// # Safety
+    /// `add_write` will write up to `len` bytes of data from the address given by `ptr`. This is
+    /// only safe if the caller guarantees that the memory lives until the transaction is complete
+    /// and that completion has been returned from the `wait` function. In addition there must not
+    /// be other references to the data pointed to by `ptr` until the operation completes.  Ensure
+    /// that the fd remains open until the op completes as well.
+    pub unsafe fn add_write(
+        &mut self,
+        ptr: *const u8,
+        len: usize,
+        fd: RawFd,
+        offset: u64,
+        user_data: UserData,
+    ) -> Result<()> {
+        self.add_rw_op(ptr, len, fd, offset, user_data, IORING_OP_WRITEV as u8)
+    }
+
+    /// Asynchronously reads from `fd` to the address given in `ptr`.
+    /// # Safety
+    /// `add_read` will write up to `len` bytes of data to the address given by `ptr`. This is only
+    /// safe if the caller guarantees there are no other references to that memory and that the
+    /// memory lives until the transaction is complete and that completion has been returned from
+    /// the `wait` function.  In addition there must not be any mutable references to the data
+    /// pointed to by `ptr` until the operation completes.  Ensure that the fd remains open until
+    /// the op completes as well.
+    pub unsafe fn add_read(
+        &mut self,
+        ptr: *mut u8,
+        len: usize,
+        fd: RawFd,
+        offset: u64,
+        user_data: UserData,
+    ) -> Result<()> {
+        self.add_rw_op(ptr, len, fd, offset, user_data, IORING_OP_READV as u8)
+    }
+
+    /// Syncs all completed operations; the ordering with respect to in-flight async ops is not
+    /// defined.
+    pub fn add_fsync(&mut self, fd: RawFd, user_data: UserData) -> Result<()> {
+        self.prep_next_sqe(|sqe, _iovec| {
+            sqe.opcode = IORING_OP_FSYNC as u8;
+            sqe.fd = fd;
+            sqe.user_data = user_data;
+
+            sqe.addr = 0;
+            sqe.len = 0;
+            sqe.__bindgen_anon_1.off = 0;
+            sqe.__bindgen_anon_3.__bindgen_anon_1.buf_index = 0;
+            sqe.__bindgen_anon_2.rw_flags = 0;
+            sqe.ioprio = 0;
+            sqe.flags = 0;
+        })
+    }
+
+    /// See the usage of `fallocate`; this asynchronously performs the same operation.
+    pub fn add_fallocate(
+        &mut self,
+        fd: RawFd,
+        offset: u64,
+        len: usize,
+        mode: u64,
+        user_data: UserData,
+    ) -> Result<()> {
+        // Note that len for fallocate is passed in the addr field of the sqe and the mode uses the
+        // len field.
+        self.prep_next_sqe(|sqe, _iovec| {
+            sqe.opcode = IORING_OP_FALLOCATE as u8;
+
+            sqe.fd = fd;
+            sqe.addr = len as u64;
+            sqe.len = mode as u32;
+            sqe.__bindgen_anon_1.off = offset;
+            sqe.user_data = user_data;
+
+            sqe.__bindgen_anon_3.__bindgen_anon_1.buf_index = 0;
+            sqe.__bindgen_anon_2.rw_flags = 0;
+            sqe.ioprio = 0;
+            sqe.flags = 0;
+        })
+    }
+
+    /// Adds an FD to be polled based on the given flags.
+    /// The user must keep the FD open until the operation completion is returned from
+    /// `wait`.
+    /// Note that io_uring polls are always one-shot: after a completion is returned, the FD must
+    /// be re-added to get future events.
+    pub fn add_poll_fd(
+        &mut self,
+        fd: RawFd,
+        events: WatchingEvents,
+        user_data: UserData,
+    ) -> Result<()> {
+        self.prep_next_sqe(|sqe, _iovec| {
+            sqe.opcode = IORING_OP_POLL_ADD as u8;
+            sqe.fd = fd;
+            sqe.user_data = user_data;
+            sqe.__bindgen_anon_2.poll_events = events.get_raw() as u16;
+
+            sqe.addr = 0;
+            sqe.len = 0;
+            sqe.__bindgen_anon_1.off = 0;
+            sqe.__bindgen_anon_3.__bindgen_anon_1.buf_index = 0;
+            sqe.ioprio = 0;
+            sqe.flags = 0;
+        })
+    }
+
+    /// Removes an FD that was previously added with `add_poll_fd`.
+    pub fn remove_poll_fd(
+        &mut self,
+        fd: RawFd,
+        events: WatchingEvents,
+        user_data: UserData,
+    ) -> Result<()> {
+        self.prep_next_sqe(|sqe, _iovec| {
+            sqe.opcode = IORING_OP_POLL_REMOVE as u8;
+            sqe.fd = fd;
+            sqe.user_data = user_data;
+            sqe.__bindgen_anon_2.poll_events = events.get_raw() as u16;
+
+            sqe.addr = 0;
+            sqe.len = 0;
+            sqe.__bindgen_anon_1.off = 0;
+            sqe.__bindgen_anon_3.__bindgen_anon_1.buf_index = 0;
+            sqe.ioprio = 0;
+            sqe.flags = 0;
+        })
+    }
+
+    /// Sends operations added with the `add_*` functions to the kernel.
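+    /// Completions for submitted ops are reaped by a later call to `wait`; see the
+    /// `write_one_submit_poll` test for an example that polls the ring FD with epoll first.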
+    pub fn submit(&mut self) -> Result<()> {
+        self.in_flight += self.added;
+        self.stats.total_ops = self.stats.total_ops.wrapping_add(self.added as u64);
+        if self.added > 0 {
+            self.stats.total_enter_calls = self.stats.total_enter_calls.wrapping_add(1);
+            unsafe {
+                // Safe because the only memory modified is in the completion queue.
+                io_uring_enter(self.ring_file.as_raw_fd(), self.added as u64, 1, 0)
+                    .map_err(Error::RingEnter)?;
+            }
+        }
+        self.added = 0;
+
+        Ok(())
+    }
+
+    /// Sends operations added with the `add_*` functions to the kernel and returns an iterator
+    /// over any completed operations. `wait` blocks until at least one completion is ready. If
+    /// called without any new events added, this simply waits for any existing events to complete
+    /// and returns as soon as one or more are ready.
+    pub fn wait<'a>(
+        &'a mut self,
+    ) -> Result<impl Iterator<Item = (UserData, std::io::Result<i32>)> + 'a> {
+        let completed = self.complete_ring.num_completed();
+        self.stats.total_complete = self.stats.total_complete.wrapping_add(completed as u64);
+        self.in_flight -= completed;
+        self.in_flight += self.added;
+        self.stats.total_ops = self.stats.total_ops.wrapping_add(self.added as u64);
+        if self.in_flight > 0 {
+            unsafe {
+                self.stats.total_enter_calls = self.stats.total_enter_calls.wrapping_add(1);
+                // Safe because the only memory modified is in the completion queue.
+                io_uring_enter(
+                    self.ring_file.as_raw_fd(),
+                    self.added as u64,
+                    1,
+                    IORING_ENTER_GETEVENTS,
+                )
+                .map_err(Error::RingEnter)?;
+            }
+        }
+        self.added = 0;
+
+        // The CompletionQueue will iterate all completed ops.
+        Ok(&mut self.complete_ring)
+    }
+}
+
+impl AsRawFd for URingContext {
+    fn as_raw_fd(&self) -> RawFd {
+        self.ring_file.as_raw_fd()
+    }
+}
+
+struct SubmitQueueEntries {
+    mmap: MemoryMapping,
+    len: usize,
+}
+
+impl SubmitQueueEntries {
+    fn get_mut(&mut self, index: usize) -> Option<&mut io_uring_sqe> {
+        if index >= self.len {
+            return None;
+        }
+        unsafe {
+            // Safe because the mut borrow of self restricts us to one mutable reference at a time
+            // and we trust that the kernel has returned enough memory in io_uring_setup and mmap.
+            Some(&mut *(self.mmap.as_ptr() as *mut io_uring_sqe).add(index))
+        }
+    }
+}
+
+struct SubmitQueueState {
+    _mmap: MemoryMapping,
+    pointers: QueuePointers,
+    ring_mask: u32,
+    array: *mut u32,
+}
+
+impl SubmitQueueState {
+    // # Safety
+    // Safe iff `mmap` is created by mapping from a uring FD at the SQ_RING offset and params is
+    // the params struct passed to io_uring_setup.
+    unsafe fn new(mmap: MemoryMapping, params: &io_uring_params) -> SubmitQueueState {
+        let ptr = mmap.as_ptr();
+        // These casts are safe because a u32 is atomic on all supported architectures and the
+        // pointer will live until after self is dropped because the mmap is owned.
+        let head = ptr.add(params.sq_off.head as usize) as *const AtomicU32;
+        let tail = ptr.add(params.sq_off.tail as usize) as *const AtomicU32;
+        // This offset is guaranteed to be within the mmap so unwrap the result.
+        let ring_mask = mmap.read_obj(params.sq_off.ring_mask as usize).unwrap();
+        let array = ptr.add(params.sq_off.array as usize) as *mut u32;
+        SubmitQueueState {
+            _mmap: mmap,
+            pointers: QueuePointers { head, tail },
+            ring_mask,
+            array,
+        }
+    }
+
+    // Sets the kernel's array entry at the given `index` to `value`.
+    fn set_array_entry(&self, index: usize, value: u32) {
+        // Safe because self being constructed from the correct mmap guarantees that the memory is
+        // valid to write to.
+        unsafe {
+            std::ptr::write_volatile(self.array.add(index), value as u32);
+        }
+    }
+}
+
+struct CompleteQueueState {
+    mmap: MemoryMapping,
+    pointers: QueuePointers,
+    ring_mask: u32,
+    cqes_offset: u32,
+    completed: usize,
+}
+
+impl CompleteQueueState {
+    /// # Safety
+    /// Safe iff `mmap` is created by mapping from a uring FD at the CQ_RING offset and params is
+    /// the params struct passed to io_uring_setup.
+    unsafe fn new(mmap: MemoryMapping, params: &io_uring_params) -> CompleteQueueState {
+        let ptr = mmap.as_ptr();
+        let head = ptr.add(params.cq_off.head as usize) as *const AtomicU32;
+        let tail = ptr.add(params.cq_off.tail as usize) as *const AtomicU32;
+        let ring_mask = mmap.read_obj(params.cq_off.ring_mask as usize).unwrap();
+        CompleteQueueState {
+            mmap,
+            pointers: QueuePointers { head, tail },
+            ring_mask,
+            cqes_offset: params.cq_off.cqes,
+            completed: 0,
+        }
+    }
+
+    fn get_cqe(&self, head: u32) -> &io_uring_cqe {
+        unsafe {
+            // Safe because we trust that the kernel has returned enough memory in io_uring_setup
+            // and mmap and index is checked within range by the ring_mask.
+            let cqes = (self.mmap.as_ptr() as *const u8).add(self.cqes_offset as usize)
+                as *const io_uring_cqe;
+
+            let index = head & self.ring_mask;
+
+            &*cqes.add(index as usize)
+        }
+    }
+
+    fn num_completed(&mut self) -> usize {
+        std::mem::replace(&mut self.completed, 0)
+    }
+}
+
+// Return the completed ops with their result.
+impl Iterator for CompleteQueueState {
+    type Item = (UserData, std::io::Result<i32>);
+
+    fn next(&mut self) -> Option<Self::Item> {
+        // Safe because the pointers to the atomics are valid and the cqe must be in range
+        // because the kernel provided mask is applied to the index.
+        let head = self.pointers.head(Ordering::Relaxed);
+
+        // Synchronize the read of tail after the read of head.
+        if head == self.pointers.tail(Ordering::Acquire) {
+            return None;
+        }
+
+        self.completed += 1;
+
+        let cqe = self.get_cqe(head);
+        let user_data = cqe.user_data;
+        let res = cqe.res;
+
+        // Store the new head and ensure the reads above complete before the kernel sees the
+        // update to head; `set_head` uses `Release` ordering.
+        let new_head = head.wrapping_add(1);
+        self.pointers.set_head(new_head);
+
+        let io_res = match res {
+            r if r < 0 => Err(std::io::Error::from_raw_os_error(-r)),
+            r => Ok(r),
+        };
+        Some((user_data, io_res))
+    }
+}
+
+struct QueuePointers {
+    head: *const AtomicU32,
+    tail: *const AtomicU32,
+}
+
+impl QueuePointers {
+    // Loads the tail pointer atomically with the given ordering.
+    fn tail(&self, ordering: Ordering) -> u32 {
+        // Safe because self being constructed from the correct mmap guarantees that the memory is
+        // valid to read.
+        unsafe { (*self.tail).load(ordering) }
+    }
+
+    // Stores the new value of the tail in the submit queue. This allows the kernel to start
+    // processing entries that have been added up until the given tail pointer.
+    // Always stores with release ordering as that is the only valid way to use the pointer.
+    fn set_tail(&self, next_tail: u32) {
+        // Safe because self being constructed from the correct mmap guarantees that the memory is
+        // valid to read and it's used as an atomic to cover mutability concerns.
+        unsafe { (*self.tail).store(next_tail, Ordering::Release) }
+    }
+
+    // Loads the head pointer atomically with the given ordering.
+    fn head(&self, ordering: Ordering) -> u32 {
+        // Safe because self being constructed from the correct mmap guarantees that the memory is
+        // valid to read.
+        unsafe { (*self.head).load(ordering) }
+    }
+
+    // Stores the new value of the head in the completion queue. This tells the kernel that the
+    // entries up to the given head pointer have been consumed and their slots can be reused.
+    // Always stores with release ordering as that is the only valid way to use the pointer.
+    fn set_head(&self, next_head: u32) {
+        // Safe because self being constructed from the correct mmap guarantees that the memory is
+        // valid to read and it's used as an atomic to cover mutability concerns.
+        unsafe { (*self.head).store(next_head, Ordering::Release) }
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use std::fs::OpenOptions;
+    use std::io::Write;
+    use std::path::{Path, PathBuf};
+    use std::time::Duration;
+
+    use sys_util::PollContext;
+    use tempfile::TempDir;
+
+    use super::*;
+
+    fn append_file_name(path: &Path, name: &str) -> PathBuf {
+        let mut joined = path.to_path_buf();
+        joined.push(name);
+        joined
+    }
+
+    #[test]
+    fn read_one_block_at_a_time() {
+        let tempdir = TempDir::new().unwrap();
+        let file_path = append_file_name(tempdir.path(), "test");
+        let queue_size = 128;
+
+        let mut uring = URingContext::new(queue_size).unwrap();
+        let mut buf = [0u8; 0x1000];
+        let f = OpenOptions::new()
+            .read(true)
+            .write(true)
+            .create(true)
+            .truncate(true)
+            .open(&file_path)
+            .unwrap();
+        f.set_len(0x1000 * 2).unwrap();
+
+        for i in 0..queue_size * 2 {
+            unsafe {
+                // Safe because the `wait` call waits until the kernel is done with `buf`.
+                let index = i as u64;
+                uring
+                    .add_read(
+                        buf.as_mut_ptr(),
+                        buf.len(),
+                        f.as_raw_fd(),
+                        (index % 2) * 0x1000,
+                        index,
+                    )
+                    .unwrap();
+                let (user_data, res) = uring.wait().unwrap().next().unwrap();
+                assert_eq!(user_data, i as UserData);
+                assert_eq!(res.unwrap(), buf.len() as i32);
+            }
+        }
+    }
+
+    #[test]
+    fn write_one_block() {
+        let tempdir = TempDir::new().unwrap();
+        let file_path = append_file_name(tempdir.path(), "test");
+
+        let mut uring = URingContext::new(16).unwrap();
+        let mut buf = [0u8; 4096];
+        let mut f = OpenOptions::new()
+            .read(true)
+            .write(true)
+            .create(true)
+            .truncate(true)
+            .open(&file_path)
+            .unwrap();
+        f.write(&buf).unwrap();
+        f.write(&buf).unwrap();
+
+        unsafe {
+            // Safe because the `wait` call waits until the kernel is done reading `buf`.
+            uring
+                .add_write(buf.as_mut_ptr(), buf.len(), f.as_raw_fd(), 0, 55)
+                .unwrap();
+            let (user_data, res) = uring.wait().unwrap().next().unwrap();
+            assert_eq!(user_data, 55 as UserData);
+            assert_eq!(res.unwrap(), buf.len() as i32);
+        }
+    }
+
+    #[test]
+    fn write_one_submit_poll() {
+        let tempdir = TempDir::new().unwrap();
+        let file_path = append_file_name(tempdir.path(), "test");
+
+        let mut uring = URingContext::new(16).unwrap();
+        let mut buf = [0u8; 4096];
+        let mut f = OpenOptions::new()
+            .read(true)
+            .write(true)
+            .create(true)
+            .truncate(true)
+            .open(&file_path)
+            .unwrap();
+        f.write(&buf).unwrap();
+        f.write(&buf).unwrap();
+
+        let ctx: PollContext<u64> = PollContext::build_with(&[(&uring, 1)]).unwrap();
+        {
+            // Test that the uring context isn't readable before any events are complete.
+            let events = ctx.wait_timeout(Duration::from_millis(1)).unwrap();
+            assert!(events.iter_readable().next().is_none());
+        }
+
+        unsafe {
+            // Safe because the `wait` call waits until the kernel is done reading `buf`.
+            uring
+                .add_write(buf.as_mut_ptr(), buf.len(), f.as_raw_fd(), 0, 55)
+                .unwrap();
+            uring.submit().unwrap();
+            // Poll for completion with epoll.
+            let events = ctx.wait().unwrap();
+            let event = events.iter_readable().next().unwrap();
+            assert_eq!(event.token(), 1);
+            let (user_data, res) = uring.wait().unwrap().next().unwrap();
+            assert_eq!(user_data, 55 as UserData);
+            assert_eq!(res.unwrap(), buf.len() as i32);
+        }
+    }
+
+    #[test]
+    fn fallocate_fsync() {
+        let tempdir = TempDir::new().unwrap();
+        let file_path = append_file_name(tempdir.path(), "test");
+
+        {
+            let buf = [0u8; 4096];
+            let mut f = OpenOptions::new()
+                .read(true)
+                .write(true)
+                .create(true)
+                .truncate(true)
+                .open(&file_path)
+                .unwrap();
+            f.write(&buf).unwrap();
+        }
+
+        let init_size = std::fs::metadata(&file_path).unwrap().len() as usize;
+        let set_size = init_size + 1024 * 1024 * 50;
+        let f = OpenOptions::new()
+            .read(true)
+            .write(true)
+            .create(true)
+            .open(&file_path)
+            .unwrap();
+
+        let mut uring = URingContext::new(16).unwrap();
+        uring
+            .add_fallocate(f.as_raw_fd(), 0, set_size, 0, 66)
+            .unwrap();
+        let (user_data, res) = uring.wait().unwrap().next().unwrap();
+        assert_eq!(user_data, 66 as UserData);
+        assert_eq!(res.unwrap(), 0 as i32);
+
+        uring.add_fsync(f.as_raw_fd(), 67).unwrap();
+        let (user_data, res) = uring.wait().unwrap().next().unwrap();
+        assert_eq!(user_data, 67 as UserData);
+        assert_eq!(res.unwrap(), 0 as i32);
+
+        uring
+            .add_fallocate(
+                f.as_raw_fd(),
+                init_size as u64,
+                set_size - init_size,
+                (libc::FALLOC_FL_PUNCH_HOLE | libc::FALLOC_FL_KEEP_SIZE) as u64,
+                68,
+            )
+            .unwrap();
+        let (user_data, res) = uring.wait().unwrap().next().unwrap();
+        assert_eq!(user_data, 68 as UserData);
+        assert_eq!(res.unwrap(), 0 as i32);
+
+        drop(f); // Close the file to ensure its metadata is updated.
+
+        let new_size = std::fs::metadata(&file_path).unwrap().len() as usize;
+        assert_eq!(new_size, set_size);
+    }
+
+    #[test]
+    fn dev_zero_readable() {
+        let f = File::open(Path::new("/dev/zero")).unwrap();
+        let mut uring = URingContext::new(16).unwrap();
+        uring
+            .add_poll_fd(f.as_raw_fd(), WatchingEvents::empty().set_read(), 454)
+            .unwrap();
+        let (user_data, res) = uring.wait().unwrap().next().unwrap();
+        assert_eq!(user_data, 454 as UserData);
+        assert_eq!(res.unwrap(), 1 as i32);
+    }
+}