From c24ad78624c7f15abeabf05f621c6acd050efc4b Mon Sep 17 00:00:00 2001 From: Matt Delco Date: Fri, 14 Feb 2020 13:24:36 -0800 Subject: linux.rs: unify jail creation This change unifies two substantially similar segments of code used to create a jail. BUG=none TEST=Ran 'build_test'. Local build, deployed to DUT, and verified that termina VM could still be used. Change-Id: Ib1f2f9bc5cfe1e6c9f3633af7e23f52e5eafe3c7 Reviewed-on: https://chromium-review.googlesource.com/c/chromiumos/platform/crosvm/+/2057744 Tested-by: Matt Delco Tested-by: kokoro Reviewed-by: Dylan Reid Commit-Queue: Matt Delco --- src/linux.rs | 163 ++++++++++++++++++++++++++++++----------------------------- 1 file changed, 82 insertions(+), 81 deletions(-) (limited to 'src/linux.rs') diff --git a/src/linux.rs b/src/linux.rs index 0f8a848..bf2c014 100644 --- a/src/linux.rs +++ b/src/linux.rs @@ -304,55 +304,78 @@ fn get_max_open_files() -> Result<u64> { } } +struct SandboxConfig<'a> { + limit_caps: bool, + log_failures: bool, + seccomp_policy: &'a Path, + uid_map: Option<&'a str>, + gid_map: Option<&'a str>, +} + fn create_base_minijail( root: &Path, - log_failures: bool, - seccomp_policy: &Path, + r_limit: Option<u64>, + config: Option<&SandboxConfig>, ) -> Result<Minijail> { // All child jails run in a new user namespace without any users mapped, // they run as nobody unless otherwise configured. let mut j = Minijail::new().map_err(Error::DeviceJail)?; - j.namespace_pids(); - j.namespace_user(); - j.namespace_user_disable_setgroups(); - // Don't need any capabilities. - j.use_caps(0); + + if let Some(config) = config { + j.namespace_pids(); + j.namespace_user(); + j.namespace_user_disable_setgroups(); + if config.limit_caps { + // Don't need any capabilities. + j.use_caps(0); + } + if let Some(uid_map) = config.uid_map { + j.uidmap(uid_map).map_err(Error::SettingUidMap)?; + } + if let Some(gid_map) = config.gid_map { + j.gidmap(gid_map).map_err(Error::SettingGidMap)?; + } + // Run in an empty network namespace. 
+ j.namespace_net(); + // Apply the block device seccomp policy. + j.no_new_privs(); + + // By default we'll prioritize using the pre-compiled .bpf over the .policy + // file (the .bpf is expected to be compiled using "trap" as the failure + // behavior instead of the default "kill" behavior). + // Refer to the code comment for the "seccomp-log-failures" + // command-line parameter for an explanation about why the |log_failures| + // flag forces the use of .policy files (and the build-time alternative to + // this run-time flag). + let bpf_policy_file = config.seccomp_policy.with_extension("bpf"); + if bpf_policy_file.exists() && !config.log_failures { + j.parse_seccomp_program(&bpf_policy_file) + .map_err(Error::DeviceJail)?; + } else { + // Use TSYNC only for the side effect of it using SECCOMP_RET_TRAP, + // which will correctly kill the entire device process if a worker + // thread commits a seccomp violation. + j.set_seccomp_filter_tsync(); + if config.log_failures { + j.log_seccomp_filter_failures(); + } + j.parse_seccomp_filters(&config.seccomp_policy.with_extension("policy")) + .map_err(Error::DeviceJail)?; + } + j.use_seccomp_filter(); + // Don't do init setup. + j.run_as_init(); + } + // Create a new mount namespace with an empty root FS. j.namespace_vfs(); j.enter_pivot_root(root).map_err(Error::DevicePivotRoot)?; - // Run in an empty network namespace. - j.namespace_net(); + // Most devices don't need to open many fds. - j.set_rlimit(libc::RLIMIT_NOFILE as i32, 1024, 1024) + let limit = if let Some(r) = r_limit { r } else { 1024u64 }; + j.set_rlimit(libc::RLIMIT_NOFILE as i32, limit, limit) .map_err(Error::SettingMaxOpenFiles)?; - // Apply the block device seccomp policy. - j.no_new_privs(); - - // By default we'll prioritize using the pre-compiled .bpf over the .policy - // file (the .bpf is expected to be compiled using "trap" as the failure - // behavior instead of the default "kill" behavior). 
- // Refer to the code comment for the "seccomp-log-failures" - // command-line parameter for an explanation about why the |log_failures| - // flag forces the use of .policy files (and the build-time alternative to - // this run-time flag). - let bpf_policy_file = seccomp_policy.with_extension("bpf"); - if bpf_policy_file.exists() && !log_failures { - j.parse_seccomp_program(&bpf_policy_file) - .map_err(Error::DeviceJail)?; - } else { - // Use TSYNC only for the side effect of it using SECCOMP_RET_TRAP, - // which will correctly kill the entire device process if a worker - // thread commits a seccomp violation. - j.set_seccomp_filter_tsync(); - if log_failures { - j.log_seccomp_filter_failures(); - } - j.parse_seccomp_filters(&seccomp_policy.with_extension("policy")) - .map_err(Error::DeviceJail)?; - } - j.use_seccomp_filter(); - // Don't do init setup. - j.run_as_init(); + Ok(j) } @@ -365,11 +388,14 @@ fn simple_jail(cfg: &Config, policy: &str) -> Result<Option<Minijail>> { return Err(Error::PivotRootDoesntExist(pivot_root)); } let policy_path: PathBuf = cfg.seccomp_policy_dir.join(policy); - Ok(Some(create_base_minijail( - root_path, - cfg.seccomp_log_failures, - &policy_path, - )?)) + let config = SandboxConfig { + limit_caps: true, + log_failures: cfg.seccomp_log_failures, + seccomp_policy: &policy_path, + uid_map: None, + gid_map: None, + }; + Ok(Some(create_base_minijail(root_path, None, Some(&config))?)) } else { Ok(None) } @@ -774,45 +800,20 @@ fn create_fs_device( tag: &str, fs_cfg: virtio::fs::passthrough::Config, ) -> DeviceResult { - let mut j = Minijail::new().map_err(Error::DeviceJail)?; - - if cfg.sandbox { - j.namespace_pids(); - j.namespace_user(); - j.namespace_user_disable_setgroups(); - j.uidmap(uid_map).map_err(Error::SettingUidMap)?; - j.gidmap(gid_map).map_err(Error::SettingGidMap)?; - - // Run in an empty network namespace. 
- j.namespace_net(); - - j.no_new_privs(); - - // Use TSYNC only for the side effect of it using SECCOMP_RET_TRAP, which will correctly kill - // the entire device process if a worker thread commits a seccomp violation. - let seccomp_policy = cfg.seccomp_policy_dir.join("fs_device"); - j.set_seccomp_filter_tsync(); - if cfg.seccomp_log_failures { - j.log_seccomp_filter_failures(); - } - j.parse_seccomp_filters(&seccomp_policy) - .map_err(Error::DeviceJail)?; - j.use_seccomp_filter(); - - // Don't do init setup. - j.run_as_init(); - } - - // Create a new mount namespace with the source directory as the root. We need this even when - // sandboxing is disabled as the server relies on the host kernel to prevent path traversals - // from leaking out of the shared directory. - j.namespace_vfs(); - j.enter_pivot_root(src).map_err(Error::DevicePivotRoot)?; - - // The file server opens a lot of fds and needs a really high open file limit. let max_open_files = get_max_open_files()?; - j.set_rlimit(libc::RLIMIT_NOFILE as i32, max_open_files, max_open_files) - .map_err(Error::SettingMaxOpenFiles)?; + let j = if cfg.sandbox { + let seccomp_policy = cfg.seccomp_policy_dir.join("fs_device"); + let config = SandboxConfig { + limit_caps: false, + uid_map: Some(uid_map), + gid_map: Some(gid_map), + log_failures: cfg.seccomp_log_failures, + seccomp_policy: &seccomp_policy, + }; + create_base_minijail(src, Some(max_open_files), Some(&config))? + } else { + create_base_minijail(src, Some(max_open_files), None)? + }; // TODO(chirantan): Use more than one worker once the kernel driver has been fixed to not panic // when num_queues > 1. -- cgit 1.4.1