summary refs log tree commit diff
path: root/sys_util
diff options
context:
space:
mode:
author Chirantan Ekbote <chirantan@chromium.org> 2018-07-24 16:07:42 -0700
committer chrome-bot <chrome-bot@chromium.org> 2018-07-27 15:29:07 -0700
commit 448516e3f985dd13fb5cd16f2c9efbcf097f9fa5 (patch)
tree b0bb403fd73e311488f83c990df8978ae58b5f10 /sys_util
parent 1187595da3acd15802ad011da7e3d6590e5d82e9 (diff)
download crosvm-448516e3f985dd13fb5cd16f2c9efbcf097f9fa5.tar
crosvm-448516e3f985dd13fb5cd16f2c9efbcf097f9fa5.tar.gz
crosvm-448516e3f985dd13fb5cd16f2c9efbcf097f9fa5.tar.bz2
crosvm-448516e3f985dd13fb5cd16f2c9efbcf097f9fa5.tar.lz
crosvm-448516e3f985dd13fb5cd16f2c9efbcf097f9fa5.tar.xz
crosvm-448516e3f985dd13fb5cd16f2c9efbcf097f9fa5.tar.zst
crosvm-448516e3f985dd13fb5cd16f2c9efbcf097f9fa5.zip
balloon: Implement device policy
Implement a policy for the balloon device so that it starts taking
memory away from the VM when the system is under low memory conditions.
There are a few pieces here:

* Change the madvise call in MemoryMapping::dont_need_range to use
  MADV_REMOVE instead of MADV_DONTNEED.  The latter does nothing when
  the memory mapping is shared across multiple processes while the
  former immediately gives the pages in the specified range back to the
  kernel.  Subsequent accesses to memory in that range return zero
  pages.
* Change the protocol between the balloon device process and the main
  crosvm process.  Previously, the device process expected the main
  process to send it increments in the amount of memory consumed by the
  balloon device.  Now, it instead just expects the absolute value of
  the memory that should be consumed.  To properly implement the policy
  the main process needs to keep track of the total memory consumed by
  the balloon device so this makes it easier to handle all the policy in
  one place.
* Add a policy for dealing with low memory situations.  When the VM
  starts up, we determine the maximum amount of memory that the balloon
  device should consume:

    * If the VM has more than 1.5GB of memory, the balloon device max is
      the size of the VM memory minus 1GB.
    * Otherwise, if the VM has at least 500MB, the balloon device max is
      50% of the size of the VM memory.
    * Otherwise, the max is 0.

  The increment used to change the size of the balloon is defined as
  1/16 of the max memory that the balloon device will consume.  When the
  crosvm main process detects that the system is low on memory, it
  immediately increases the balloon size by the increment (unless it has
  already reached the max).  It then starts 2 timers: one to check for
  low memory conditions again in 1 second (+ jitter) and another to
  check if the system is no longer low on memory in 1 minute (+ jitter)
  with a subsequent interval of 30 seconds (+ jitter).

  Under persistent low memory conditions the balloon device will consume
  the maximum memory after 16 seconds.  Once there is enough available
  memory the balloon size will shrink back down to 0 after at most 9
  minutes.

BUG=chromium:866193
TEST=manual
Start 2 VMs and write out a large file (size > system RAM) in each.
Observe /sys/kernel/mm/chromeos-low_mem/available and see that the
available memory steadily decreases until it goes under the low memory
margin at which point the available memory bounces back up as crosvm
frees up pages.
CQ-DEPEND=CL:1152214

Change-Id: I2046729683aa081c9d7ed039d902ad11737c1d52
Signed-off-by: Chirantan Ekbote <chirantan@chromium.org>
Reviewed-on: https://chromium-review.googlesource.com/1149155
Reviewed-by: Sonny Rao <sonnyrao@chromium.org>
Diffstat (limited to 'sys_util')
-rw-r--r-- sys_util/src/guest_memory.rs 4
-rw-r--r-- sys_util/src/lib.rs 2
-rw-r--r-- sys_util/src/mmap.rs 7
-rw-r--r-- sys_util/src/timerfd.rs 142
4 files changed, 150 insertions, 5 deletions
diff --git a/sys_util/src/guest_memory.rs b/sys_util/src/guest_memory.rs
index 31500ca..04ef124 100644
--- a/sys_util/src/guest_memory.rs
+++ b/sys_util/src/guest_memory.rs
@@ -132,10 +132,10 @@ impl GuestMemory {
     }
 
     /// Madvise away the address range in the host that is associated with the given guest range.
-    pub fn dont_need_range(&self, addr: GuestAddress, count: u64) -> Result<()> {
+    pub fn remove_range(&self, addr: GuestAddress, count: u64) -> Result<()> {
         self.do_in_region(addr, move |mapping, offset| {
             mapping
-                .dont_need_range(offset, count as usize)
+                .remove_range(offset, count as usize)
                 .map_err(|e| Error::MemoryAccess(addr, e))
         })
     }
diff --git a/sys_util/src/lib.rs b/sys_util/src/lib.rs
index caef4c2..30b293d 100644
--- a/sys_util/src/lib.rs
+++ b/sys_util/src/lib.rs
@@ -33,6 +33,7 @@ mod signalfd;
 mod sock_ctrl_msg;
 mod passwd;
 mod file_flags;
+mod timerfd;
 
 pub use mmap::*;
 pub use shm::*;
@@ -53,6 +54,7 @@ pub use sock_ctrl_msg::*;
 pub use passwd::*;
 pub use poll_token_derive::*;
 pub use file_flags::*;
+pub use timerfd::*;
 
 pub use mmap::Error as MmapError;
 pub use guest_memory::Error as GuestMemoryError;
diff --git a/sys_util/src/mmap.rs b/sys_util/src/mmap.rs
index 4ce72b6..ced48b1 100644
--- a/sys_util/src/mmap.rs
+++ b/sys_util/src/mmap.rs
@@ -316,8 +316,9 @@ impl MemoryMapping {
         Ok(())
     }
 
-    /// Uses madvise to tell the kernel the specified range won't be needed soon.
-    pub fn dont_need_range(&self, mem_offset: usize, count: usize) -> Result<()> {
+    /// Uses madvise to tell the kernel to remove the specified range.  Subsequent reads
+    /// to the pages in the range will return zero bytes.
+    pub fn remove_range(&self, mem_offset: usize, count: usize) -> Result<()> {
         self.range_end(mem_offset, count)
             .map_err(|_| Error::InvalidRange(mem_offset, count))?;
         let ret = unsafe {
@@ -325,7 +326,7 @@ impl MemoryMapping {
             // Next time it is read, it may return zero pages.
             libc::madvise((self.addr as usize + mem_offset) as *mut _,
                           count,
-                          libc::MADV_DONTNEED)
+                          libc::MADV_REMOVE)
         };
         if ret < 0 {
             Err(Error::InvalidRange(mem_offset, count))
diff --git a/sys_util/src/timerfd.rs b/sys_util/src/timerfd.rs
new file mode 100644
index 0000000..7d2c76a
--- /dev/null
+++ b/sys_util/src/timerfd.rs
@@ -0,0 +1,142 @@
+// Copyright 2018 The Chromium OS Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+use std::fs::File;
+use std::mem;
+use std::os::unix::io::{AsRawFd, FromRawFd, IntoRawFd, RawFd};
+use std::ptr;
+use std::time::Duration;
+
+use libc::{self, CLOCK_MONOTONIC, TFD_CLOEXEC, timerfd_create, timerfd_settime};
+
+use {Result, errno_result};
+
+/// A safe wrapper around a Linux timerfd (man 2 timerfd_create).
+pub struct TimerFd(File);
+
+impl TimerFd {
+    /// Creates a new timerfd.  The timer is initially disarmed and must be armed by calling
+    /// `reset`.
+    pub fn new() -> Result<TimerFd> {
+        // Safe because this doesn't modify any memory and we check the return value.
+        let ret = unsafe { timerfd_create(CLOCK_MONOTONIC, TFD_CLOEXEC) };
+        if ret < 0 {
+            return errno_result();
+        }
+
+        // Safe because we uniquely own the file descriptor.
+        Ok(TimerFd(unsafe { File::from_raw_fd(ret) } ))
+    }
+
+    /// Sets the timer to expire after `dur`.  If `interval` is not `None` it represents
+    /// the period for repeated expirations after the initial expiration.  Otherwise
+    /// the timer will expire just once.  Cancels any existing duration and repeating interval.
+    pub fn reset(&mut self, dur: Duration, interval: Option<Duration>) -> Result<()> {
+        // Safe because we are zero-initializing a struct with only primitive member fields.
+        let mut spec: libc::itimerspec = unsafe { mem::zeroed() };
+        spec.it_value.tv_sec = dur.as_secs() as libc::time_t;
+        spec.it_value.tv_nsec = dur.subsec_nanos() as libc::c_long;
+
+        if let Some(int) = interval {
+            spec.it_interval.tv_sec = int.as_secs() as libc::time_t;
+            spec.it_interval.tv_nsec = int.subsec_nanos() as libc::c_long;
+        }
+
+        // Safe because this doesn't modify any memory and we check the return value.
+        let ret = unsafe { timerfd_settime(self.as_raw_fd(), 0, &spec, ptr::null_mut()) };
+        if ret < 0 {
+            return errno_result();
+        }
+
+        Ok(())
+    }
+
+    /// Waits until the timer expires.  The return value represents the number of times the timer
+    /// has expired since the last time `wait` was called.  If the timer has not yet expired once
+    /// this call will block until it does.
+    pub fn wait(&mut self) -> Result<u64> {
+        let mut count = 0u64;
+
+        // Safe because this will only modify |count| and we check the return value.
+        let ret = unsafe {
+            libc::read(self.as_raw_fd(),
+                       &mut count as *mut _ as *mut libc::c_void,
+                       mem::size_of_val(&count))
+        };
+        if ret < 0 {
+            return errno_result();
+        }
+
+        // The bytes in the buffer are guaranteed to be in native byte-order so we don't need to
+        // use from_le or from_be.
+        Ok(count)
+    }
+
+    /// Disarms the timer.
+    pub fn clear(&mut self) -> Result<()> {
+        // Safe because we are zero-initializing a struct with only primitive member fields.
+        let spec: libc::itimerspec = unsafe { mem::zeroed() };
+
+        // Safe because this doesn't modify any memory and we check the return value.
+        let ret = unsafe { timerfd_settime(self.as_raw_fd(),  0, &spec, ptr::null_mut()) };
+        if ret < 0 {
+            return errno_result();
+        }
+
+        Ok(())
+    }
+}
+
+impl AsRawFd for TimerFd {
+    fn as_raw_fd(&self) -> RawFd {
+        self.0.as_raw_fd()
+    }
+}
+
+impl FromRawFd for TimerFd {
+    unsafe fn from_raw_fd(fd: RawFd) -> Self {
+        TimerFd(File::from_raw_fd(fd))
+    }
+}
+
+impl IntoRawFd for TimerFd {
+    fn into_raw_fd(self) -> RawFd {
+        self.0.into_raw_fd()
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use std::thread::sleep;
+    use std::time::{Duration, Instant};
+
+    #[test]
+    fn one_shot() {
+        let mut tfd = TimerFd::new().expect("failed to create timerfd");
+
+        let dur = Duration::from_millis(200);
+        let now = Instant::now();
+        tfd.reset(dur.clone(), None).expect("failed to arm timer");
+
+        let count = tfd.wait().expect("unable to wait for timer");
+
+        assert_eq!(count, 1);
+        assert!(now.elapsed() >= dur);
+    }
+
+    #[test]
+    fn repeating() {
+        let mut tfd = TimerFd::new().expect("failed to create timerfd");
+
+        let dur = Duration::from_millis(200);
+        let interval = Duration::from_millis(100);
+        tfd.reset(dur.clone(), Some(interval)).expect("failed to arm timer");
+
+        sleep(dur * 3);
+
+        let count = tfd.wait().expect("unable to wait for timer");
+        assert!(count >= 5, "count = {}", count);
+    }
+}