summary refs log tree commit diff
path: root/src/plugin/mod.rs
blob: 470d5f09dd57adaae6f9f8a05c259f924e15d3bf (plain) (blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
// Copyright 2018 The Chromium OS Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

mod process;
mod vcpu;

use std::fmt::{self, Display};
use std::fs::File;
use std::io;
use std::os::unix::io::{AsRawFd, FromRawFd};
use std::os::unix::net::UnixDatagram;
use std::path::Path;
use std::result;
use std::sync::atomic::{AtomicBool, Ordering};
use std::sync::{Arc, Barrier};
use std::thread;
use std::time::{Duration, Instant};

use libc::{
    c_int, c_ulong, fcntl, ioctl, socketpair, AF_UNIX, EAGAIN, EBADF, EDEADLK, EEXIST, EINTR,
    EINVAL, ENOENT, EOVERFLOW, EPERM, FIOCLEX, F_SETPIPE_SZ, MS_NODEV, MS_NOEXEC, MS_NOSUID,
    MS_RDONLY, SIGCHLD, SOCK_SEQPACKET,
};

use protobuf::ProtobufError;
use remain::sorted;

use io_jail::{self, Minijail};
use kvm::{Cap, Datamatch, IoeventAddress, Kvm, Vcpu, VcpuExit, Vm};
use net_util::{Error as TapError, Tap, TapT};
use sys_util::{
    block_signal, clear_signal, drop_capabilities, error, getegid, geteuid, info, pipe,
    register_rt_signal_handler, validate_raw_fd, warn, Error as SysError, EventFd, GuestMemory,
    Killable, MmapError, PollContext, PollToken, Result as SysResult, SignalFd, SignalFdError,
    SIGRTMIN,
};

use self::process::*;
use self::vcpu::*;
use crate::{Config, Executable};

/// Size limit, in bytes, for a single datagram.
// NOTE(review): not referenced in the part of the file reviewed here; presumably it bounds
// messages on the main plugin request socket -- confirm against its users.
const MAX_DATAGRAM_SIZE: usize = 4096;
/// Capacity, in bytes (0x40000 = 256 KiB), requested for each vcpu pipe in `new_pipe_pair`.
const MAX_VCPU_DATAGRAM_SIZE: usize = 0x40000;

/// An error that occurs during the lifetime of a plugin process.
///
/// Variants must stay in sorted order (enforced by `#[sorted]`). The user-facing text for each
/// variant is produced by the `Display` implementation.
#[sorted]
pub enum Error {
    /// Failed to clone an eventfd.
    CloneEventFd(SysError),
    /// Failed to clone a vcpu pipe.
    CloneVcpuPipe(io::Error),
    /// Failed to create an eventfd.
    CreateEventFd(SysError),
    /// Failed to create the KVM irqchip.
    CreateIrqChip(SysError),
    /// Failed to create the jail.
    CreateJail(io_jail::Error),
    /// Failed to create the `Kvm` object.
    CreateKvm(SysError),
    /// Failed to create the main request socket.
    CreateMainSocket(SysError),
    /// Failed to create the KVM PIT.
    CreatePIT(SysError),
    /// Failed to create a poll context.
    CreatePollContext(SysError),
    /// Failed to create a signalfd.
    CreateSignalFd(SignalFdError),
    /// Failed to create a socket pair.
    CreateSocketPair(io::Error),
    /// Failed to create a tap device from a raw fd.
    CreateTapFd(TapError),
    /// Failed to create a vcpu.
    CreateVcpu(SysError),
    /// Failed to create a vcpu request socket.
    CreateVcpuSocket(SysError),
    /// Failed to create the VM.
    CreateVm(SysError),
    /// Failed to decode a request from the plugin.
    DecodeRequest(ProtobufError),
    /// Failed to drop process capabilities.
    DropCapabilities(SysError),
    /// Failed to encode a response to the plugin.
    EncodeResponse(ProtobufError),
    /// A jail mount failed. All `Mount*` variants are displayed with the same generic
    /// "failed to mount" message.
    Mount(io_jail::Error),
    /// A jail mount failed (see `Mount`).
    MountDev(io_jail::Error),
    /// A jail mount failed (see `Mount`).
    MountLib(io_jail::Error),
    /// A jail mount failed (see `Mount`).
    MountLib64(io_jail::Error),
    /// A jail mount failed (see `Mount`).
    MountPlugin(io_jail::Error),
    /// A jail mount failed (see `Mount`).
    MountPluginLib(io_jail::Error),
    /// Mounting `/proc` inside the jail failed (see `Mount`).
    MountProc(io_jail::Error),
    /// Mounting the tmpfs at the jail's `/` failed (see `Mount`).
    MountRoot(io_jail::Error),
    /// There is no root directory for the jailed process to pivot root into.
    NoRootDir,
    /// Failed to set the jail's pivot root.
    ParsePivotRoot(io_jail::Error),
    /// Failed to parse the jail's seccomp filter.
    ParseSeccomp(io_jail::Error),
    /// The plugin exited with the contained error value.
    PluginFailed(i32),
    /// Failed to send a kill signal to the plugin.
    PluginKill(SysError),
    /// The plugin exited because of the contained signal.
    PluginKilled(i32),
    /// Failed to run the jail.
    PluginRunJail(io_jail::Error),
    /// The plugin request socket was hung up.
    PluginSocketHup,
    /// Failed to poll the plugin request sockets.
    PluginSocketPoll(SysError),
    /// Failed to receive from a plugin request socket.
    PluginSocketRecv(SysError),
    /// Failed to send to a plugin request socket.
    PluginSocketSend(SysError),
    /// Failed to spawn the plugin.
    PluginSpawn(io::Error),
    /// The plugin did not exit within the timeout.
    PluginTimeout,
    /// Failed while waiting for the plugin to exit.
    PluginWait(SysError),
    /// Failed to poll all FDs.
    Poll(SysError),
    /// Failed to add an fd to a poll context.
    PollContextAdd(SysError),
    /// The path to the root directory is not absolute.
    RootNotAbsolute,
    /// The specified root directory is not a directory.
    RootNotDir,
    /// Failed to set the gidmap for the jail.
    SetGidMap(io_jail::Error),
    /// Failed to set the uidmap for the jail.
    SetUidMap(io_jail::Error),
    /// A process died; carries the pid, signal number, status and code that were reported.
    SigChild {
        pid: u32,
        signo: u32,
        status: i32,
        code: i32,
    },
    /// Failed to read the signal fd.
    SignalFd(SignalFdError),
    /// Failed to spawn a vcpu thread.
    SpawnVcpu(io::Error),
    /// Failed to enable the tap device.
    TapEnable(TapError),
    /// Failed to open the tap device.
    TapOpen(TapError),
    /// Failed to set the tap device's IP address.
    TapSetIp(TapError),
    /// Failed to set the tap device's MAC address.
    TapSetMacAddress(TapError),
    /// Failed to set the tap device's netmask.
    TapSetNetmask(TapError),
    /// Failed to validate a raw tap fd.
    ValidateTapFd(SysError),
}

/// Renders each `Error` variant as a single-line, human-readable message.
///
/// The match arms are kept in sorted order; `#[remain::check]` together with `#[sorted]` turns
/// an out-of-order arm into a compile error.
impl Display for Error {
    #[remain::check]
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        use self::Error::*;

        #[sorted]
        match self {
            CloneEventFd(e) => write!(f, "failed to clone eventfd: {}", e),
            CloneVcpuPipe(e) => write!(f, "failed to clone vcpu pipe: {}", e),
            CreateEventFd(e) => write!(f, "failed to create eventfd: {}", e),
            CreateIrqChip(e) => write!(f, "failed to create kvm irqchip: {}", e),
            CreateJail(e) => write!(f, "failed to create jail: {}", e),
            CreateKvm(e) => write!(f, "error creating Kvm: {}", e),
            CreateMainSocket(e) => write!(f, "error creating main request socket: {}", e),
            CreatePIT(e) => write!(f, "failed to create kvm PIT: {}", e),
            CreatePollContext(e) => write!(f, "failed to create poll context: {}", e),
            CreateSignalFd(e) => write!(f, "failed to create signalfd: {}", e),
            CreateSocketPair(e) => write!(f, "failed to create socket pair: {}", e),
            CreateTapFd(e) => write!(f, "failed to create tap device from raw fd: {}", e),
            CreateVcpu(e) => write!(f, "error creating vcpu: {}", e),
            CreateVcpuSocket(e) => write!(f, "error creating vcpu request socket: {}", e),
            CreateVm(e) => write!(f, "error creating vm: {}", e),
            DecodeRequest(e) => write!(f, "failed to decode plugin request: {}", e),
            DropCapabilities(e) => write!(f, "failed to drop process capabilities: {}", e),
            EncodeResponse(e) => write!(f, "failed to encode plugin response: {}", e),
            // Every mount failure shares one message; the variant itself is not shown to the user.
            Mount(e) | MountDev(e) | MountLib(e) | MountLib64(e) | MountPlugin(e)
            | MountPluginLib(e) | MountProc(e) | MountRoot(e) => {
                write!(f, "failed to mount: {}", e)
            }
            NoRootDir => write!(f, "no root directory for jailed process to pivot root into"),
            ParsePivotRoot(e) => write!(f, "failed to set jail pivot root: {}", e),
            ParseSeccomp(e) => write!(f, "failed to parse jail seccomp filter: {}", e),
            PluginFailed(e) => write!(f, "plugin exited with error: {}", e),
            PluginKill(e) => write!(f, "error sending kill signal to plugin: {}", e),
            PluginKilled(e) => write!(f, "plugin exited with signal {}", e),
            PluginRunJail(e) => write!(f, "failed to run jail: {}", e),
            PluginSocketHup => write!(f, "plugin request socket has been hung up"),
            PluginSocketPoll(e) => write!(f, "failed to poll plugin request sockets: {}", e),
            PluginSocketRecv(e) => write!(f, "failed to recv from plugin request socket: {}", e),
            PluginSocketSend(e) => write!(f, "failed to send to plugin request socket: {}", e),
            PluginSpawn(e) => write!(f, "failed to spawn plugin: {}", e),
            PluginTimeout => write!(f, "plugin did not exit within timeout"),
            PluginWait(e) => write!(f, "error waiting for plugin to exit: {}", e),
            Poll(e) => write!(f, "failed to poll all FDs: {}", e),
            PollContextAdd(e) => write!(f, "failed to add fd to poll context: {}", e),
            RootNotAbsolute => write!(f, "path to the root directory must be absolute"),
            RootNotDir => write!(f, "specified root directory is not a directory"),
            SetGidMap(e) => write!(f, "failed to set gidmap for jail: {}", e),
            SetUidMap(e) => write!(f, "failed to set uidmap for jail: {}", e),
            SigChild {
                pid,
                signo,
                status,
                code,
            } => write!(
                f,
                "process {} died with signal {}, status {}, and code {}",
                pid, signo, status, code
            ),
            SignalFd(e) => write!(f, "failed to read signal fd: {}", e),
            SpawnVcpu(e) => write!(f, "error spawning vcpu thread: {}", e),
            TapEnable(e) => write!(f, "error enabling tap device: {}", e),
            TapOpen(e) => write!(f, "error opening tap device: {}", e),
            TapSetIp(e) => write!(f, "error setting tap ip: {}", e),
            TapSetMacAddress(e) => write!(f, "error setting tap mac address: {}", e),
            TapSetNetmask(e) => write!(f, "error setting tap netmask: {}", e),
            ValidateTapFd(e) => write!(f, "failed to validate raw tap fd: {}", e),
        }
    }
}

/// Result type used throughout this module, with the module's `Error` as the error type.
type Result<T> = result::Result<T, Error>;

fn new_seqpacket_pair() -> SysResult<(UnixDatagram, UnixDatagram)> {
    let mut fds = [0, 0];
    unsafe {
        let ret = socketpair(AF_UNIX, SOCK_SEQPACKET, 0, fds.as_mut_ptr());
        if ret == 0 {
            ioctl(fds[0], FIOCLEX);
            Ok((
                UnixDatagram::from_raw_fd(fds[0]),
                UnixDatagram::from_raw_fd(fds[1]),
            ))
        } else {
            Err(SysError::last())
        }
    }
}

/// The four ends of the two pipes created by `new_pipe_pair`.
///
/// One pipe carries data towards crosvm (`plugin_write` -> `crosvm_read`), the other towards the
/// plugin (`crosvm_write` -> `plugin_read`).
struct VcpuPipe {
    // Read end of the plugin-to-crosvm pipe.
    crosvm_read: File,
    // Write end of the plugin-to-crosvm pipe.
    plugin_write: File,
    // Read end of the crosvm-to-plugin pipe.
    plugin_read: File,
    // Write end of the crosvm-to-plugin pipe.
    crosvm_write: File,
}

/// Creates the pair of pipes used to talk to a plugin vcpu, one per direction.
///
/// Each pipe is asked to grow to `MAX_VCPU_DATAGRAM_SIZE` bytes. Failing to resize is only
/// logged as a warning, never returned as an error.
///
/// # Errors
///
/// Returns the error from `pipe` if either pipe cannot be created.
fn new_pipe_pair() -> SysResult<VcpuPipe> {
    let (crosvm_read, plugin_write) = pipe(true)?;
    let (plugin_read, crosvm_write) = pipe(true)?;

    // A larger pipe is a nice-to-have: it helps messages get across atomically and keeps writes
    // from blocking. It is not a hard requirement for things to work, hence the warnings below
    // instead of errors.
    let wanted = MAX_VCPU_DATAGRAM_SIZE as c_int;

    let crosvm_size = unsafe { fcntl(crosvm_read.as_raw_fd(), F_SETPIPE_SZ, wanted) };
    // A negative result can never equal the (positive) requested size, so one comparison covers
    // both the failure case and the "resized to something else" case.
    if crosvm_size != wanted {
        warn!(
            "Failed to adjust size of crosvm pipe (result {}): {}",
            crosvm_size,
            SysError::last()
        );
    }

    let plugin_size = unsafe { fcntl(plugin_read.as_raw_fd(), F_SETPIPE_SZ, wanted) };
    if plugin_size != wanted {
        warn!(
            "Failed to adjust size of plugin pipe (result {}): {}",
            plugin_size,
            SysError::last()
        );
    }

    Ok(VcpuPipe {
        crosvm_read,
        plugin_write,
        plugin_read,
        crosvm_write,
    })
}

/// Converts a protobuf error into a `SysError`.
///
/// An I/O error keeps its raw OS error number when it has one; every other case maps to
/// `EINVAL`.
fn proto_to_sys_err(e: ProtobufError) -> SysError {
    let errno = match e {
        ProtobufError::IoError(io_err) => io_err.raw_os_error().unwrap_or(EINVAL),
        _ => EINVAL,
    };
    SysError::new(errno)
}

/// Converts an `io::Error` into a `SysError`, falling back to `EINVAL` when the I/O error
/// carries no raw OS error number.
fn io_to_sys_err(e: io::Error) -> SysError {
    let errno = match e.raw_os_error() {
        Some(code) => code,
        None => EINVAL,
    };
    SysError::new(errno)
}

/// Converts an `MmapError` into a `SysError`.
///
/// A failed system call yields the error it carries; any other mmap error becomes `EINVAL`.
fn mmap_to_sys_err(e: MmapError) -> SysError {
    if let MmapError::SystemCallFailed(sys_err) = e {
        sys_err
    } else {
        SysError::new(EINVAL)
    }
}

/// Builds the `Minijail` used to sandbox the plugin process.
///
/// The jail gets new PID, user, mount and network namespaces, maps the current effective uid and
/// gid to 0 inside the jail, drops all capabilities, pivots its root into `root`, installs a
/// seccomp filter, and mounts a tmpfs at `/` and a read-only procfs at `/proc`.
///
/// * `root` - Directory the jail pivots its root into.
/// * `log_failures` - When true, seccomp failures are logged and the `.policy` file is used even
///   if a pre-compiled `.bpf` file exists.
/// * `seccomp_policy` - Policy path; its extension is replaced with `bpf` or `policy` as needed.
///
/// # Errors
///
/// Returns `CreateJail`, `SetUidMap`, `SetGidMap`, `ParsePivotRoot`, `ParseSeccomp`, `MountRoot`
/// or `MountProc`, depending on which configuration step failed.
fn create_plugin_jail(root: &Path, log_failures: bool, seccomp_policy: &Path) -> Result<Minijail> {
    // All child jails run in a new user namespace without any users mapped,
    // they run as nobody unless otherwise configured.
    let mut j = Minijail::new().map_err(Error::CreateJail)?;
    j.namespace_pids();
    j.namespace_user();
    j.uidmap(&format!("0 {0} 1", geteuid()))
        .map_err(Error::SetUidMap)?;
    j.gidmap(&format!("0 {0} 1", getegid()))
        .map_err(Error::SetGidMap)?;
    j.namespace_user_disable_setgroups();
    // Don't need any capabilities.
    j.use_caps(0);
    // Create a new mount namespace with an empty root FS.
    j.namespace_vfs();
    j.enter_pivot_root(root).map_err(Error::ParsePivotRoot)?;
    // Run in an empty network namespace.
    j.namespace_net();
    j.no_new_privs();
    // By default we'll prioritize using the pre-compiled .bpf over the .policy
    // file (the .bpf is expected to be compiled using "trap" as the failure
    // behavior instead of the default "kill" behavior).
    // Refer to the code comment for the "seccomp-log-failures"
    // command-line parameter for an explanation about why the |log_failures|
    // flag forces the use of .policy files (and the build-time alternative to
    // this run-time flag).
    let bpf_policy_file = seccomp_policy.with_extension("bpf");
    if bpf_policy_file.exists() && !log_failures {
        j.parse_seccomp_program(&bpf_policy_file)
            .map_err(Error::ParseSeccomp)?;
    } else {
        // Use TSYNC only for the side effect of it using SECCOMP_RET_TRAP,
        // which will correctly kill the entire device process if a worker
        // thread commits a seccomp violation.
        j.set_seccomp_filter_tsync();
        if log_failures {
            j.log_seccomp_filter_failures();
        }
        j.parse_seccomp_filters(&seccomp_policy.with_extension("policy"))
            .map_err(Error::ParseSeccomp)?;
    }
    j.use_seccomp_filter();
    // Don't do init setup.
    j.run_as_init();

    // Create a tmpfs in the plugin's root directory so that we can bind mount its executable
    // file into it.  The size=67108864 is size=64*1024*1024 or size=64MB.
    j.mount_with_data(
        Path::new("none"),
        Path::new("/"),
        "tmpfs",
        (MS_NOSUID | MS_NODEV | MS_NOEXEC) as usize,
        "size=67108864",
    )
    .map_err(Error::MountRoot)?;

    // Because we requested to "run as init", minijail will not mount /proc for us even though
    // plugin will be running in its own PID namespace, so we have to mount it ourselves.
    j.mount(
        Path::new("proc"),
        Path::new("/proc"),
        "proc",
        (MS_NOSUID | MS_NODEV | MS_NOEXEC | MS_RDONLY) as usize,
    )
    .map_err(Error::MountProc)?;

    Ok(j)
}

/// Each `PluginObject` represents one object that was instantiated by the guest using the `Create`
/// request.
///
/// Each such object has an ID associated with it that exists in an ID space shared by every variant
/// of `PluginObject`. This allows all the objects to be indexed in a single map, and allows for a
/// common destroy method.
///
/// In addition to the destroy method, each object may have methods specific to its variant type.
/// These variant methods must be done by matching the variant to the expected type for that method.
/// For example, getting the dirty log from a `Memory` object starting with an ID:
///
/// ```ignore
/// match objects.get(&request_id) {
///    Some(&PluginObject::Memory { slot, length }) => vm.get_dirty_log(slot, &mut dirty_log[..]),
///    _ => return Err(SysError::new(ENOENT)),
/// }
/// ```
enum PluginObject {
    /// An ioevent registration. `length` is the datamatch width in bytes (1, 2, 4 or 8), or 0 to
    /// match any length; `datamatch` is the value to match, truncated to that width.
    IoEvent {
        evt: EventFd,
        addr: IoeventAddress,
        length: u32,
        datamatch: u64,
    },
    /// A memory region occupying the given VM memory `slot`.
    Memory {
        slot: u32,
        length: usize,
    },
    /// An irqfd registration of `evt` for interrupt `irq_id`.
    IrqEvent {
        irq_id: u32,
        evt: EventFd,
    },
}

impl PluginObject {
    /// Consumes the object and undoes its registration on `vm`.
    ///
    /// * `IoEvent` - unregisters the ioevent, using a datamatch of the width given by `length`.
    /// * `Memory` - removes the MMIO memory in `slot`, discarding the value returned on success.
    /// * `IrqEvent` - unregisters the irqfd.
    ///
    /// # Errors
    ///
    /// Returns `EINVAL` for an `IoEvent` whose `length` is not 0, 1, 2, 4 or 8; otherwise returns
    /// whatever error the underlying `Vm` call reports.
    fn destroy(self, vm: &mut Vm) -> SysResult<()> {
        match self {
            PluginObject::IoEvent {
                evt,
                addr,
                length,
                datamatch,
            } => {
                // Work out the datamatch first so the unregister call is written only once.
                let matcher = match length {
                    0 => Datamatch::AnyLength,
                    1 => Datamatch::U8(Some(datamatch as u8)),
                    2 => Datamatch::U16(Some(datamatch as u16)),
                    4 => Datamatch::U32(Some(datamatch as u32)),
                    8 => Datamatch::U64(Some(datamatch as u64)),
                    _ => return Err(SysError::new(EINVAL)),
                };
                vm.unregister_ioevent(&evt, addr, matcher)
            }
            PluginObject::Memory { slot, .. } => vm.remove_mmio_memory(slot).map(|_| ()),
            PluginObject::IrqEvent { irq_id, evt } => vm.unregister_irqfd(&evt, irq_id),
        }
    }
}

/// Spawns one thread per vcpu that runs the vcpu and forwards its exits to the plugin.
///
/// Each thread is named `crosvm_vcpu{N}`. It initializes its plugin vcpu, waits on a barrier
/// shared by all vcpu threads, and then loops on `vcpu.run()`, dispatching IO, MMIO and Hyper-V
/// exits to the plugin. The loop ends on `Hlt`, `Shutdown`, `InternalError`, an unrecognized run
/// error, a failed pause request, or once `kill_signaled` is set. On its way out every thread
/// writes 1 to its clone of `exit_evt`.
///
/// The join handle of every spawned thread is appended to `vcpu_handles`.
///
/// # Errors
///
/// Returns `CloneEventFd`, `CreateVcpu` or `SpawnVcpu`, or the error from `plugin.create_vcpu`,
/// if setting up any vcpu fails.
///
/// # Panics
///
/// Panics if the `SIGRTMIN` handler cannot be registered or the signal cannot be blocked.
pub fn run_vcpus(
    kvm: &Kvm,
    vm: &Vm,
    plugin: &Process,
    vcpu_count: u32,
    kill_signaled: &Arc<AtomicBool>,
    exit_evt: &EventFd,
    vcpu_handles: &mut Vec<thread::JoinHandle<()>>,
) -> Result<()> {
    // NOTE(review): the barrier is sized for `vcpu_count` threads. If the loop below returns
    // early with an error, the threads already spawned would wait on it indefinitely -- confirm
    // that callers treat such an error as fatal.
    let vcpu_thread_barrier = Arc::new(Barrier::new((vcpu_count) as usize));
    let use_kvm_signals = !kvm.check_extension(Cap::ImmediateExit);

    // If we need to force a vcpu to exit from a VM then a SIGRTMIN signal is sent
    // to that vcpu's thread.  If KVM is running the VM then it'll return -EINTR.
    // An issue is what to do when KVM isn't running the VM (where we could be
    // in the kernel or in the app).
    //
    // If KVM supports "immediate exit" then we set a signal handler that will
    // set the |immediate_exit| flag that tells KVM to return -EINTR before running
    // the VM.
    //
    // If KVM doesn't support immediate exit then we'll block SIGRTMIN in the app
    // and tell KVM to unblock SIGRTMIN before running the VM (at which point a blocked
    // signal might get asserted).  There's overhead to have KVM unblock and re-block
    // SIGRTMIN each time it runs the VM, so this mode should be avoided.

    if use_kvm_signals {
        unsafe {
            extern "C" fn handle_signal() {}
            // Our signal handler does nothing and is trivially async signal safe.
            // We need to install this signal handler even though we do block
            // the signal below, to ensure that this signal will interrupt
            // execution of KVM_RUN (this is an implementation issue).
            register_rt_signal_handler(SIGRTMIN() + 0, handle_signal)
                .expect("failed to register vcpu signal handler");
        }
        // We do not really want the signal handler to run...
        block_signal(SIGRTMIN() + 0).expect("failed to block signal");
    } else {
        unsafe {
            extern "C" fn handle_signal() {
                Vcpu::set_local_immediate_exit(true);
            }
            register_rt_signal_handler(SIGRTMIN() + 0, handle_signal)
                .expect("failed to register vcpu signal handler");
        }
    }

    for cpu_id in 0..vcpu_count {
        let kill_signaled = kill_signaled.clone();
        let vcpu_thread_barrier = vcpu_thread_barrier.clone();
        let vcpu_exit_evt = exit_evt.try_clone().map_err(Error::CloneEventFd)?;
        let vcpu_plugin = plugin.create_vcpu(cpu_id)?;
        let vcpu = Vcpu::new(cpu_id as c_ulong, kvm, vm).map_err(Error::CreateVcpu)?;

        vcpu_handles.push(
            thread::Builder::new()
                .name(format!("crosvm_vcpu{}", cpu_id))
                .spawn(move || {
                    if use_kvm_signals {
                        // Tell KVM to not block anything when entering kvm run
                        // because we will be using first RT signal to kick the VCPU.
                        vcpu.set_signal_mask(&[])
                            .expect("failed to set up KVM VCPU signal mask");
                    }

                    let vcpu = vcpu
                        .to_runnable(Some(SIGRTMIN() + 0))
                        .expect("Failed to set thread id");

                    // Wait on the barrier even if init failed, so the other vcpu threads are
                    // not left blocked on it.
                    let res = vcpu_plugin.init(&vcpu);
                    vcpu_thread_barrier.wait();
                    if let Err(e) = res {
                        error!("failed to initialize vcpu {}: {}", cpu_id, e);
                    } else {
                        loop {
                            let mut interrupted_by_signal = false;
                            let run_res = vcpu.run();
                            match run_res {
                                Ok(run) => match run {
                                    VcpuExit::IoIn { port, mut size } => {
                                        let mut data = [0; 256];
                                        // Clamp oversized requests to the buffer rather than
                                        // panicking on the slice below.
                                        if size > data.len() {
                                            error!("unsupported IoIn size of {} bytes", size);
                                            size = data.len();
                                        }
                                        vcpu_plugin.io_read(port as u64, &mut data[..size], &vcpu);
                                        if let Err(e) = vcpu.set_data(&data[..size]) {
                                            error!("failed to set return data for IoIn: {}", e);
                                        }
                                    }
                                    VcpuExit::IoOut {
                                        port,
                                        mut size,
                                        data,
                                    } => {
                                        if size > data.len() {
                                            error!("unsupported IoOut size of {} bytes", size);
                                            size = data.len();
                                        }
                                        vcpu_plugin.io_write(port as u64, &data[..size], &vcpu);
                                    }
                                    VcpuExit::MmioRead { address, size } => {
                                        let mut data = [0; 8];
                                        vcpu_plugin.mmio_read(
                                            address as u64,
                                            &mut data[..size],
                                            &vcpu,
                                        );
                                        // Setting data for mmio can not fail.
                                        let _ = vcpu.set_data(&data[..size]);
                                    }
                                    VcpuExit::MmioWrite {
                                        address,
                                        size,
                                        data,
                                    } => {
                                        vcpu_plugin.mmio_write(
                                            address as u64,
                                            &data[..size],
                                            &vcpu,
                                        );
                                    }
                                    VcpuExit::HypervHcall { input, params } => {
                                        let mut data = [0; 8];
                                        vcpu_plugin.hyperv_call(input, params, &mut data, &vcpu);
                                        // Setting data for hyperv call can not fail.
                                        let _ = vcpu.set_data(&data);
                                    }
                                    VcpuExit::HypervSynic {
                                        msr,
                                        control,
                                        evt_page,
                                        msg_page,
                                    } => {
                                        vcpu_plugin
                                            .hyperv_synic(msr, control, evt_page, msg_page, &vcpu);
                                    }
                                    VcpuExit::Hlt => break,
                                    VcpuExit::Shutdown => break,
                                    VcpuExit::InternalError => {
                                        error!("vcpu {} has internal error", cpu_id);
                                        break;
                                    }
                                    r => warn!("unexpected vcpu exit: {:?}", r),
                                },
                                Err(e) => match e.errno() {
                                    EINTR => interrupted_by_signal = true,
                                    EAGAIN => {}
                                    _ => {
                                        error!("vcpu hit unknown error: {}", e);
                                        break;
                                    }
                                },
                            }
                            if kill_signaled.load(Ordering::SeqCst) {
                                break;
                            }

                            // Only handle the pause request if kvm reported that it was
                            // interrupted by a signal.  This helps to ensure that KVM has had a
                            // chance to finish emulating any IO that may have immediately happened.
                            // If we eagerly check pre_run() then any IO that we
                            // just reported to the plugin won't have been processed yet by KVM.
                            // Not eagerly calling pre_run() also helps to reduce
                            // any overhead from checking if a pause request is pending.
                            // The assumption is that pause requests aren't common
                            // or frequent so it's better to optimize for the non-pause execution paths.
                            if interrupted_by_signal {
                                if use_kvm_signals {
                                    clear_signal(SIGRTMIN() + 0)
                                        .expect("failed to clear pending signal");
                                } else {
                                    vcpu.set_immediate_exit(false);
                                }

                                if let Err(e) = vcpu_plugin.pre_run(&vcpu) {
                                    error!("failed to process pause on vcpu {}: {}", cpu_id, e);
                                    break;
                                }
                            }
                        }
                    }
                    // Reached on every exit path of the thread, including a failed init.
                    vcpu_exit_evt
                        .write(1)
                        .expect("failed to signal vcpu exit eventfd");
                })
                .map_err(Error::SpawnVcpu)?,
        );
    }
    Ok(())
}

/// Poll tokens naming the event sources this module waits on.
// NOTE(review): the poll loop that consumes these tokens is outside the part of the file
// reviewed here; the per-variant meanings below are inferred from the names -- confirm.
#[derive(PollToken)]
enum Token {
    // Presumably the exit eventfd (the one vcpu threads write to on exit).
    Exit,
    // Presumably the SIGCHLD signalfd.
    ChildSignal,
    // Presumably a plugin request socket, identified by its index.
    Plugin { index: usize },
}

/// Run a VM with a plugin process specified by `cfg`.
///
/// Not every field of `cfg` will be used. In particular, most field that pertain to a specific
/// device are ignored because the plugin is responsible for emulating hardware.
pub fn run_config(cfg: Config) -> Result<()> {
    info!("crosvm starting plugin process");

    // Masking signals is inherently dangerous, since this can persist across clones/execs. Do this
    // before any jailed devices have been spawned, so that we can catch any of them that fail very
    // quickly.
    let sigchld_fd = SignalFd::new(SIGCHLD).map_err(Error::CreateSignalFd)?;

    let jail = if cfg.sandbox {
        // An empty directory for jailed plugin pivot root.
        let root_path = match &cfg.plugin_root {
            Some(dir) => dir,
            None => Path::new(option_env!("DEFAULT_PIVOT_ROOT").unwrap_or("/var/empty")),
        };

        if root_path.is_relative() {
            return Err(Error::RootNotAbsolute);
        }

        if !root_path.exists() {
            return Err(Error::NoRootDir);
        }

        if !root_path.is_dir() {
            return Err(Error::RootNotDir);
        }

        let policy_path = cfg.seccomp_policy_dir.join("plugin");
        let mut jail = create_plugin_jail(root_path, cfg.seccomp_log_failures, &policy_path)?;

        // Update gid map of the jail if caller provided supplemental groups.
        if !cfg.plugin_gid_maps.is_empty() {
            let map = format!("0 {} 1", getegid())
                + &cfg
                    .plugin_gid_maps
                    .into_iter()
                    .map(|m| format!(",{} {} {}", m.inner, m.outer, m.count))
                    .collect::<String>();
            jail.gidmap(&map).map_err(Error::SetGidMap)?;
        }

        // Mount minimal set of devices (full, zero, urandom, etc). We can not use
        // jail.mount_dev() here because crosvm may not be running with CAP_SYS_ADMIN.
        let device_names = ["full", "null", "urandom", "zero"];
        for name in &device_names {
            let device = Path::new("/dev").join(&name);
            jail.mount_bind(&device, &device, true)
                .map_err(Error::MountDev)?;
        }

        for bind_mount in &cfg.plugin_mounts {
            jail.mount_bind(&bind_mount.src, &bind_mount.dst, bind_mount.writable)
                .map_err(Error::Mount)?;
        }

        Some(jail)
    } else {
        None
    };

    let mut tap_interfaces: Vec<Tap> = Vec::new();
    if let Some(host_ip) = cfg.host_ip {
        if let Some(netmask) = cfg.netmask {
            if let Some(mac_address) = cfg.mac_address {
                let tap = Tap::new(false, false).map_err(Error::TapOpen)?;
                tap.set_ip_addr(host_ip).map_err(Error::TapSetIp)?;
                tap.set_netmask(netmask).map_err(Error::TapSetNetmask)?;
                tap.set_mac_address(mac_address)
                    .map_err(Error::TapSetMacAddress)?;

                tap.enable().map_err(Error::TapEnable)?;
                tap_interfaces.push(tap);
            }
        }
    }
    for tap_fd in cfg.tap_fd {
        // Safe because we ensure that we get a unique handle to the fd.
        let tap = unsafe {
            Tap::from_raw_fd(validate_raw_fd(tap_fd).map_err(Error::ValidateTapFd)?)
                .map_err(Error::CreateTapFd)?
        };
        tap_interfaces.push(tap);
    }

    let plugin_args: Vec<&str> = cfg.params.iter().map(|s| &s[..]).collect();

    let plugin_path = match cfg.executable_path {
        Some(Executable::Plugin(ref plugin_path)) => plugin_path.as_path(),
        _ => panic!("Executable was not a plugin"),
    };
    let vcpu_count = cfg.vcpu_count.unwrap_or(1);
    let mem = GuestMemory::new(&[]).unwrap();
    let kvm = Kvm::new().map_err(Error::CreateKvm)?;
    let mut vm = Vm::new(&kvm, mem).map_err(Error::CreateVm)?;
    vm.create_irq_chip().map_err(Error::CreateIrqChip)?;
    vm.create_pit().map_err(Error::CreatePIT)?;

    let mut plugin = Process::new(vcpu_count, plugin_path, &plugin_args, jail)?;
    // Now that the jail for the plugin has been created and we had a chance to adjust gids there,
    // we can drop all our capabilities in case we had any.
    drop_capabilities().map_err(Error::DropCapabilities)?;

    let mut res = Ok(());
    // If Some, we will exit after enough time is passed to shutdown cleanly.
    let mut dying_instant: Option<Instant> = None;
    let duration_to_die = Duration::from_millis(1000);

    let exit_evt = EventFd::new().map_err(Error::CreateEventFd)?;
    let kill_signaled = Arc::new(AtomicBool::new(false));
    let mut vcpu_handles = Vec::with_capacity(vcpu_count as usize);

    let poll_ctx =
        PollContext::build_with(&[(&exit_evt, Token::Exit), (&sigchld_fd, Token::ChildSignal)])
            .map_err(Error::PollContextAdd)?;

    let mut sockets_to_drop = Vec::new();
    let mut redo_poll_ctx_sockets = true;
    // In this loop, make every attempt to not return early. If an error is encountered, set `res`
    // to the error, set `dying_instant` to now, and signal the plugin that it will be killed soon.
    // If the plugin cannot be signaled because it is dead of `signal_kill` failed, simply break
    // from the poll loop so that the VCPU threads can be cleaned up.
    'poll: loop {
        // After we have waited long enough, it's time to give up and exit.
        if dying_instant
            .map(|i| i.elapsed() >= duration_to_die)
            .unwrap_or(false)
        {
            break;
        }

        if redo_poll_ctx_sockets {
            for (index, socket) in plugin.sockets().iter().enumerate() {
                poll_ctx
                    .add(socket, Token::Plugin { index })
                    .map_err(Error::PollContextAdd)?;
            }
        }

        let plugin_socket_count = plugin.sockets().len();
        let events = {
            let poll_res = match dying_instant {
                Some(inst) => poll_ctx.wait_timeout(duration_to_die - inst.elapsed()),
                None => poll_ctx.wait(),
            };
            match poll_res {
                Ok(v) => v,
                Err(e) => {
                    // Polling no longer works, time to break and cleanup,
                    if res.is_ok() {
                        res = Err(Error::Poll(e));
                    }
                    break;
                }
            }
        };
        for event in events.iter_readable() {
            match event.token() {
                Token::Exit => {
                    // No need to check the exit event if we are already doing cleanup.
                    let _ = poll_ctx.delete(&exit_evt);
                    dying_instant.get_or_insert(Instant::now());
                    let sig_res = plugin.signal_kill();
                    if res.is_ok() && sig_res.is_err() {
                        res = sig_res.map_err(Error::PluginKill);
                    }
                }
                Token::ChildSignal => {
                    // Print all available siginfo structs, then exit the loop.
                    loop {
                        match sigchld_fd.read() {
                            Ok(Some(siginfo)) => {
                                // If the plugin process has ended, there is no need to continue
                                // processing plugin connections, so we break early.
                                if siginfo.ssi_pid == plugin.pid() as u32 {
                                    break 'poll;
                                }
                                // Because SIGCHLD is not expected from anything other than the
                                // plugin process, report it as an error.
                                if res.is_ok() {
                                    res = Err(Error::SigChild {
                                        pid: siginfo.ssi_pid,
                                        signo: siginfo.ssi_signo,
                                        status: siginfo.ssi_status,
                                        code: siginfo.ssi_code,
                                    })
                                }
                            }
                            Ok(None) => break, // No more signals to read.
                            Err(e) => {
                                // Something really must be messed up for this to happen, continue
                                // processing connections for a limited time.
                                if res.is_ok() {
                                    res = Err(Error::SignalFd(e));
                                }
                                break;
                            }
                        }
                    }
                    // As we only spawn the plugin process, getting a SIGCHLD can only mean
                    // something went wrong.
                    dying_instant.get_or_insert(Instant::now());
                    let sig_res = plugin.signal_kill();
                    if res.is_ok() && sig_res.is_err() {
                        res = sig_res.map_err(Error::PluginKill);
                    }
                }
                Token::Plugin { index } => {
                    match plugin.handle_socket(index, &kvm, &mut vm, &vcpu_handles, &tap_interfaces)
                    {
                        Ok(_) => {}
                        // A HUP is an expected event for a socket, so don't bother warning about
                        // it.
                        Err(Error::PluginSocketHup) => sockets_to_drop.push(index),
                        // Only one connection out of potentially many is broken. Drop it, but don't
                        // start cleaning up. Because the error isn't returned, we will warn about
                        // it here.
                        Err(e) => {
                            warn!("error handling plugin socket: {}", e);
                            sockets_to_drop.push(index);
                        }
                    }
                }
            }
        }

        if vcpu_handles.is_empty() && dying_instant.is_none() && plugin.is_started() {
            let res = run_vcpus(
                &kvm,
                &vm,
                &plugin,
                vcpu_count,
                &kill_signaled,
                &exit_evt,
                &mut vcpu_handles,
            );
            if let Err(e) = res {
                dying_instant.get_or_insert(Instant::now());
                error!("failed to start vcpus: {}", e);
            }
        }

        redo_poll_ctx_sockets =
            !sockets_to_drop.is_empty() || plugin.sockets().len() != plugin_socket_count;

        // Cleanup all of the sockets that we have determined were disconnected or suffered some
        // other error.
        plugin.drop_sockets(&mut sockets_to_drop);
        sockets_to_drop.clear();

        if redo_poll_ctx_sockets {
            for socket in plugin.sockets() {
                let _ = poll_ctx.delete(socket);
            }
        }
    }

    // vcpu threads MUST see the kill signaled flag, otherwise they may re-enter the VM.
    kill_signaled.store(true, Ordering::SeqCst);
    // Depending on how we ended up here, the plugin process, or a VCPU thread waiting for requests
    // might be stuck. The `signal_kill` call will unstick all the VCPU threads by closing their
    // blocked connections.
    plugin.signal_kill().map_err(Error::PluginKill)?;
    for handle in vcpu_handles {
        match handle.kill(SIGRTMIN() + 0) {
            Ok(_) => {
                if let Err(e) = handle.join() {
                    error!("failed to join vcpu thread: {:?}", e);
                }
            }
            Err(e) => error!("failed to kill vcpu thread: {}", e),
        }
    }

    match plugin.try_wait() {
        // The plugin has run out of time by now
        Ok(ProcessStatus::Running) => Err(Error::PluginTimeout),
        // Return an error discovered earlier in this function.
        Ok(ProcessStatus::Success) => res,
        Ok(ProcessStatus::Fail(code)) => Err(Error::PluginFailed(code)),
        Ok(ProcessStatus::Signal(code)) => Err(Error::PluginKilled(code)),
        Err(e) => Err(Error::PluginWait(e)),
    }
}