1 // Copyright 2017 The Chromium OS Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4 
5 use std::cmp::{max, min, Reverse};
6 use std::convert::TryFrom;
7 #[cfg(feature = "gpu")]
8 use std::env;
9 use std::error::Error as StdError;
10 use std::ffi::CStr;
11 use std::fmt::{self, Display};
12 use std::fs::{File, OpenOptions};
13 use std::io::{self, stdin, Read};
14 use std::iter;
15 use std::mem;
16 use std::net::Ipv4Addr;
17 #[cfg(feature = "gpu")]
18 use std::num::NonZeroU8;
19 use std::num::ParseIntError;
20 use std::os::unix::io::FromRawFd;
21 use std::os::unix::net::UnixStream;
22 use std::path::{Path, PathBuf};
23 use std::ptr;
24 use std::str;
25 use std::sync::{mpsc, Arc, Barrier};
26 
27 use std::thread;
28 use std::thread::JoinHandle;
29 use std::time::Duration;
30 
31 use libc::{self, c_int, gid_t, uid_t};
32 
33 use acpi_tables::sdt::SDT;
34 
35 use base::net::{UnixSeqpacketListener, UnlinkUnixSeqpacketListener};
36 use base::*;
37 use devices::virtio::vhost::user::{
38     Block as VhostUserBlock, Error as VhostUserError, Fs as VhostUserFs, Net as VhostUserNet,
39 };
40 #[cfg(feature = "gpu")]
41 use devices::virtio::EventDevice;
42 use devices::virtio::{self, Console, VirtioDevice};
43 #[cfg(feature = "audio")]
44 use devices::Ac97Dev;
45 use devices::{
46     self, HostBackendDeviceProvider, IrqChip, IrqEventIndex, KvmKernelIrqChip, PciDevice,
47     VcpuRunState, VfioContainer, VfioDevice, VfioPciDevice, VirtioPciDevice, XhciController,
48 };
49 use hypervisor::kvm::{Kvm, KvmVcpu, KvmVm};
50 use hypervisor::{HypervisorCap, Vcpu, VcpuExit, VcpuRunHandle, Vm, VmCap};
51 use minijail::{self, Minijail};
52 use net_util::{Error as NetError, MacAddress, Tap};
53 use remain::sorted;
54 use resources::{Alloc, MmioType, SystemAllocator};
55 use rutabaga_gfx::RutabagaGralloc;
56 use sync::Mutex;
57 use vm_control::*;
58 use vm_memory::{GuestAddress, GuestMemory, MemoryPolicy};
59 
60 #[cfg(all(target_arch = "x86_64", feature = "gdb"))]
61 use crate::gdb::{gdb_thread, GdbStub};
62 use crate::{
63     Config, DiskOption, Executable, SharedDir, SharedDirKind, TouchDeviceOption, VhostUserFsOption,
64     VhostUserOption,
65 };
66 use arch::{
67     self, LinuxArch, RunnableLinuxVm, SerialHardware, SerialParameters, VcpuAffinity,
68     VirtioDeviceStub, VmComponents, VmImage,
69 };
70 
71 #[cfg(any(target_arch = "arm", target_arch = "aarch64"))]
72 use {
73     aarch64::AArch64 as Arch,
74     devices::IrqChipAArch64 as IrqChipArch,
75     hypervisor::{VcpuAArch64 as VcpuArch, VmAArch64 as VmArch},
76 };
77 #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
78 use {
79     devices::{IrqChipX86_64 as IrqChipArch, KvmSplitIrqChip},
80     hypervisor::{VcpuX86_64 as VcpuArch, VmX86_64 as VmArch},
81     x86_64::X8664arch as Arch,
82 };
83 
/// Every failure this module reports to its callers.
///
/// Most variants wrap the lower-level error that caused the failure; a few
/// carry a path alongside it, and a few carry no payload at all. The
/// human-readable text for each variant lives in the `Display` impl.
///
/// Variants must stay in alphabetical order: `#[sorted]` checks this at
/// compile time. Feature- and architecture-specific variants are gated with
/// `#[cfg(...)]`.
#[sorted]
#[derive(Debug)]
pub enum Error {
    AddGpuDeviceMemory(base::Error),
    AddIrqChipVcpu(base::Error),
    AddPmemDeviceMemory(base::Error),
    AllocateGpuDeviceAddress,
    AllocatePmemDeviceAddress(resources::Error),
    BalloonActualTooLarge,
    BalloonDeviceNew(virtio::BalloonError),
    BlockDeviceNew(base::Error),
    BlockSignal(base::signal::Error),
    BuildVm(<Arch as LinuxArch>::Error),
    ChownTpmStorage(base::Error),
    CloneEvent(base::Error),
    CloneVcpu(base::Error),
    ConfigureVcpu(<Arch as LinuxArch>::Error),
    #[cfg(feature = "audio")]
    CreateAc97(devices::PciDeviceError),
    CreateConsole(arch::serial::Error),
    CreateControlServer(io::Error),
    CreateDiskError(disk::Error),
    CreateEvent(base::Error),
    CreateGrallocError(rutabaga_gfx::RutabagaError),
    CreateKvm(base::Error),
    CreateSignalFd(base::SignalFdError),
    CreateSocket(io::Error),
    CreateTapDevice(NetError),
    CreateTimer(base::Error),
    CreateTpmStorage(PathBuf, io::Error),
    CreateTube(TubeError),
    CreateUsbProvider(devices::usb::host_backend::error::Error),
    CreateVcpu(base::Error),
    CreateVfioDevice(devices::vfio::VfioError),
    CreateVm(base::Error),
    CreateWaitContext(base::Error),
    DeviceJail(minijail::Error),
    DevicePivotRoot(minijail::Error),
    #[cfg(feature = "direct")]
    DirectIo(io::Error),
    #[cfg(feature = "direct")]
    DirectIrq(devices::DirectIrqError),
    Disk(PathBuf, io::Error),
    DiskImageLock(base::Error),
    DropCapabilities(base::Error),
    FsDeviceNew(virtio::fs::Error),
    GetMaxOpenFiles(io::Error),
    GetSignalMask(signal::Error),
    GuestCachedMissing(),
    GuestCachedTooLarge(std::num::TryFromIntError),
    GuestFreeMissing(),
    GuestFreeTooLarge(std::num::TryFromIntError),
    GuestMemoryLayout(<Arch as LinuxArch>::Error),
    #[cfg(all(target_arch = "x86_64", feature = "gdb"))]
    HandleDebugCommand(<Arch as LinuxArch>::Error),
    InputDeviceNew(virtio::InputError),
    InputEventsOpen(std::io::Error),
    InvalidFdPath,
    InvalidWaylandPath,
    IoJail(minijail::Error),
    LoadKernel(Box<dyn StdError>),
    MemoryTooLarge,
    NetDeviceNew(virtio::NetError),
    OpenAcpiTable(PathBuf, io::Error),
    OpenAndroidFstab(PathBuf, io::Error),
    OpenBios(PathBuf, io::Error),
    OpenInitrd(PathBuf, io::Error),
    OpenKernel(PathBuf, io::Error),
    OpenVinput(PathBuf, io::Error),
    P9DeviceNew(virtio::P9Error),
    ParseMaxOpenFiles(ParseIntError),
    PivotRootDoesntExist(&'static str),
    PmemDeviceImageTooBig,
    PmemDeviceNew(base::Error),
    ReadMemAvailable(io::Error),
    ReadStatm(io::Error),
    RegisterBalloon(arch::DeviceRegistrationError),
    RegisterBlock(arch::DeviceRegistrationError),
    RegisterGpu(arch::DeviceRegistrationError),
    RegisterNet(arch::DeviceRegistrationError),
    RegisterP9(arch::DeviceRegistrationError),
    RegisterRng(arch::DeviceRegistrationError),
    RegisterSignalHandler(base::Error),
    RegisterWayland(arch::DeviceRegistrationError),
    ReserveGpuMemory(base::MmapError),
    ReserveMemory(base::Error),
    ReservePmemMemory(base::MmapError),
    ResetTimer(base::Error),
    RngDeviceNew(virtio::RngError),
    RunnableVcpu(base::Error),
    #[cfg(all(target_arch = "x86_64", feature = "gdb"))]
    SendDebugStatus(Box<mpsc::SendError<VcpuDebugStatusMessage>>),
    SettingGidMap(minijail::Error),
    SettingMaxOpenFiles(minijail::Error),
    SettingSignalMask(base::Error),
    SettingUidMap(minijail::Error),
    SignalFd(base::SignalFdError),
    #[cfg(all(target_arch = "x86_64", feature = "gdb"))]
    SpawnGdbServer(io::Error),
    SpawnVcpu(io::Error),
    Timer(base::Error),
    ValidateRawDescriptor(base::Error),
    VhostNetDeviceNew(virtio::vhost::Error),
    VhostUserBlockDeviceNew(VhostUserError),
    VhostUserFsDeviceNew(VhostUserError),
    VhostUserNetDeviceNew(VhostUserError),
    VhostUserNetWithNetArgs,
    VhostVsockDeviceNew(virtio::vhost::Error),
    VirtioPciDev(base::Error),
    WaitContextAdd(base::Error),
    WaitContextDelete(base::Error),
    WaylandDeviceNew(base::Error),
}
197 
198 impl Display for Error {
199     #[remain::check]
fmt(&self, f: &mut fmt::Formatter) -> fmt::Result200     fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
201         use self::Error::*;
202 
203         #[sorted]
204         match self {
205             AddGpuDeviceMemory(e) => write!(f, "failed to add gpu device memory: {}", e),
206             AddIrqChipVcpu(e) => write!(f, "failed to add vcpu to irq chip: {}", e),
207             AddPmemDeviceMemory(e) => write!(f, "failed to add pmem device memory: {}", e),
208             AllocateGpuDeviceAddress => write!(f, "failed to allocate gpu device guest address"),
209             AllocatePmemDeviceAddress(e) => {
210                 write!(f, "failed to allocate memory for pmem device: {}", e)
211             }
212             BalloonActualTooLarge => write!(f, "balloon actual size is too large"),
213             BalloonDeviceNew(e) => write!(f, "failed to create balloon: {}", e),
214             BlockDeviceNew(e) => write!(f, "failed to create block device: {}", e),
215             BlockSignal(e) => write!(f, "failed to block signal: {}", e),
216             BuildVm(e) => write!(f, "The architecture failed to build the vm: {}", e),
217             ChownTpmStorage(e) => write!(f, "failed to chown tpm storage: {}", e),
218             CloneEvent(e) => write!(f, "failed to clone event: {}", e),
219             CloneVcpu(e) => write!(f, "failed to clone vcpu: {}", e),
220             ConfigureVcpu(e) => write!(f, "failed to configure vcpu: {}", e),
221             #[cfg(feature = "audio")]
222             CreateAc97(e) => write!(f, "failed to create ac97 device: {}", e),
223             CreateConsole(e) => write!(f, "failed to create console device: {}", e),
224             CreateControlServer(e) => write!(f, "failed to create control server: {}", e),
225             CreateDiskError(e) => write!(f, "failed to create virtual disk: {}", e),
226             CreateEvent(e) => write!(f, "failed to create event: {}", e),
227             CreateGrallocError(e) => write!(f, "failed to create gralloc: {}", e),
228             CreateKvm(e) => write!(f, "failed to create kvm: {}", e),
229             CreateSignalFd(e) => write!(f, "failed to create signalfd: {}", e),
230             CreateSocket(e) => write!(f, "failed to create socket: {}", e),
231             CreateTapDevice(e) => write!(f, "failed to create tap device: {}", e),
232             CreateTimer(e) => write!(f, "failed to create Timer: {}", e),
233             CreateTpmStorage(p, e) => {
234                 write!(f, "failed to create tpm storage dir {}: {}", p.display(), e)
235             }
236             CreateTube(e) => write!(f, "failed to create tube: {}", e),
237             CreateUsbProvider(e) => write!(f, "failed to create usb provider: {}", e),
238             CreateVcpu(e) => write!(f, "failed to create vcpu: {}", e),
239             CreateVfioDevice(e) => write!(f, "Failed to create vfio device {}", e),
240             CreateVm(e) => write!(f, "failed to create vm: {}", e),
241             CreateWaitContext(e) => write!(f, "failed to create wait context: {}", e),
242             DeviceJail(e) => write!(f, "failed to jail device: {}", e),
243             DevicePivotRoot(e) => write!(f, "failed to pivot root device: {}", e),
244             #[cfg(feature = "direct")]
245             DirectIo(e) => write!(f, "failed to open direct io device: {}", e),
246             #[cfg(feature = "direct")]
247             DirectIrq(e) => write!(f, "failed to enable interrupt forwarding: {}", e),
248             Disk(p, e) => write!(f, "failed to load disk image {}: {}", p.display(), e),
249             DiskImageLock(e) => write!(f, "failed to lock disk image: {}", e),
250             DropCapabilities(e) => write!(f, "failed to drop process capabilities: {}", e),
251             FsDeviceNew(e) => write!(f, "failed to create fs device: {}", e),
252             GetMaxOpenFiles(e) => write!(f, "failed to get max number of open files: {}", e),
253             GetSignalMask(e) => write!(f, "failed to retrieve signal mask for vcpu: {}", e),
254             GuestCachedMissing() => write!(f, "guest cached is missing from balloon stats"),
255             GuestCachedTooLarge(e) => write!(f, "guest cached is too large: {}", e),
256             GuestFreeMissing() => write!(f, "guest free is missing from balloon stats"),
257             GuestFreeTooLarge(e) => write!(f, "guest free is too large: {}", e),
258             GuestMemoryLayout(e) => write!(f, "failed to create guest memory layout: {}", e),
259             #[cfg(all(target_arch = "x86_64", feature = "gdb"))]
260             HandleDebugCommand(e) => write!(f, "failed to handle a gdb command: {}", e),
261             InputDeviceNew(e) => write!(f, "failed to set up input device: {}", e),
262             InputEventsOpen(e) => write!(f, "failed to open event device: {}", e),
263             InvalidFdPath => write!(f, "failed parsing a /proc/self/fd/*"),
264             InvalidWaylandPath => write!(f, "wayland socket path has no parent or file name"),
265             IoJail(e) => write!(f, "{}", e),
266             LoadKernel(e) => write!(f, "failed to load kernel: {}", e),
267             MemoryTooLarge => write!(f, "requested memory size too large"),
268             NetDeviceNew(e) => write!(f, "failed to set up virtio networking: {}", e),
269             OpenAcpiTable(p, e) => write!(f, "failed to open ACPI file {}: {}", p.display(), e),
270             OpenAndroidFstab(p, e) => write!(
271                 f,
272                 "failed to open android fstab file {}: {}",
273                 p.display(),
274                 e
275             ),
276             OpenBios(p, e) => write!(f, "failed to open bios {}: {}", p.display(), e),
277             OpenInitrd(p, e) => write!(f, "failed to open initrd {}: {}", p.display(), e),
278             OpenKernel(p, e) => write!(f, "failed to open kernel image {}: {}", p.display(), e),
279             OpenVinput(p, e) => write!(f, "failed to open vinput device {}: {}", p.display(), e),
280             P9DeviceNew(e) => write!(f, "failed to create 9p device: {}", e),
281             ParseMaxOpenFiles(e) => write!(f, "failed to parse max number of open files: {}", e),
282             PivotRootDoesntExist(p) => write!(f, "{} doesn't exist, can't jail devices.", p),
283             PmemDeviceImageTooBig => {
284                 write!(f, "failed to create pmem device: pmem device image too big")
285             }
286             PmemDeviceNew(e) => write!(f, "failed to create pmem device: {}", e),
287             ReadMemAvailable(e) => write!(
288                 f,
289                 "failed to read /sys/kernel/mm/chromeos-low_mem/available: {}",
290                 e
291             ),
292             ReadStatm(e) => write!(f, "failed to read /proc/self/statm: {}", e),
293             RegisterBalloon(e) => write!(f, "error registering balloon device: {}", e),
294             RegisterBlock(e) => write!(f, "error registering block device: {}", e),
295             RegisterGpu(e) => write!(f, "error registering gpu device: {}", e),
296             RegisterNet(e) => write!(f, "error registering net device: {}", e),
297             RegisterP9(e) => write!(f, "error registering 9p device: {}", e),
298             RegisterRng(e) => write!(f, "error registering rng device: {}", e),
299             RegisterSignalHandler(e) => write!(f, "error registering signal handler: {}", e),
300             RegisterWayland(e) => write!(f, "error registering wayland device: {}", e),
301             ReserveGpuMemory(e) => write!(f, "failed to reserve gpu memory: {}", e),
302             ReserveMemory(e) => write!(f, "failed to reserve memory: {}", e),
303             ReservePmemMemory(e) => write!(f, "failed to reserve pmem memory: {}", e),
304             ResetTimer(e) => write!(f, "failed to reset Timer: {}", e),
305             RngDeviceNew(e) => write!(f, "failed to set up rng: {}", e),
306             RunnableVcpu(e) => write!(f, "failed to set thread id for vcpu: {}", e),
307             #[cfg(all(target_arch = "x86_64", feature = "gdb"))]
308             SendDebugStatus(e) => write!(f, "failed to send a debug status to GDB thread: {}", e),
309             SettingGidMap(e) => write!(f, "error setting GID map: {}", e),
310             SettingMaxOpenFiles(e) => write!(f, "error setting max open files: {}", e),
311             SettingSignalMask(e) => write!(f, "failed to set the signal mask for vcpu: {}", e),
312             SettingUidMap(e) => write!(f, "error setting UID map: {}", e),
313             SignalFd(e) => write!(f, "failed to read signal fd: {}", e),
314             #[cfg(all(target_arch = "x86_64", feature = "gdb"))]
315             SpawnGdbServer(e) => write!(f, "failed to spawn GDB thread: {}", e),
316             SpawnVcpu(e) => write!(f, "failed to spawn VCPU thread: {}", e),
317             Timer(e) => write!(f, "failed to read timer fd: {}", e),
318             ValidateRawDescriptor(e) => write!(f, "failed to validate raw descriptor: {}", e),
319             VhostNetDeviceNew(e) => write!(f, "failed to set up vhost networking: {}", e),
320             VhostUserBlockDeviceNew(e) => {
321                 write!(f, "failed to set up vhost-user block device: {}", e)
322             }
323             VhostUserFsDeviceNew(e) => write!(f, "failed to set up vhost-user fs device: {}", e),
324             VhostUserNetDeviceNew(e) => write!(f, "failed to set up vhost-user net device: {}", e),
325             VhostUserNetWithNetArgs => write!(
326                 f,
327                 "vhost-user-net cannot be used with any of --host_ip, --netmask or --mac"
328             ),
329             VhostVsockDeviceNew(e) => write!(f, "failed to set up virtual socket device: {}", e),
330             VirtioPciDev(e) => write!(f, "failed to create virtio pci dev: {}", e),
331             WaitContextAdd(e) => write!(f, "failed to add descriptor to wait context: {}", e),
332             WaitContextDelete(e) => {
333                 write!(f, "failed to remove descriptor from wait context: {}", e)
334             }
335             WaylandDeviceNew(e) => write!(f, "failed to create wayland device: {}", e),
336         }
337     }
338 }
339 
340 impl From<minijail::Error> for Error {
from(err: minijail::Error) -> Self341     fn from(err: minijail::Error) -> Self {
342         Error::IoJail(err)
343     }
344 }
345 
// No methods are overridden; the trait's default implementations are used.
impl std::error::Error for Error {}

/// Shorthand for a `std::result::Result` whose error type is this module's [`Error`].
type Result<T> = std::result::Result<T, Error>;
349 
/// A `Tube` labelled with one of five tags.
///
/// Every variant wraps exactly one `Tube`; the `AsRef<Tube>` impl hands it
/// back regardless of the tag.
// NOTE(review): presumably the tag selects which kind of request is expected
// on the tube — confirm against the code that consumes this enum.
enum TaggedControlTube {
    Fs(Tube),
    Vm(Tube),
    VmMemory(Tube),
    VmIrq(Tube),
    VmMsync(Tube),
}
357 
358 impl AsRef<Tube> for TaggedControlTube {
as_ref(&self) -> &Tube359     fn as_ref(&self) -> &Tube {
360         use self::TaggedControlTube::*;
361         match &self {
362             Fs(tube) | Vm(tube) | VmMemory(tube) | VmIrq(tube) | VmMsync(tube) => tube,
363         }
364     }
365 }
366 
367 impl AsRawDescriptor for TaggedControlTube {
as_raw_descriptor(&self) -> RawDescriptor368     fn as_raw_descriptor(&self) -> RawDescriptor {
369         self.as_ref().as_raw_descriptor()
370     }
371 }
372 
get_max_open_files() -> Result<u64>373 fn get_max_open_files() -> Result<u64> {
374     let mut buf = mem::MaybeUninit::<libc::rlimit64>::zeroed();
375 
376     // Safe because this will only modify `buf` and we check the return value.
377     let res = unsafe { libc::prlimit64(0, libc::RLIMIT_NOFILE, ptr::null(), buf.as_mut_ptr()) };
378     if res == 0 {
379         // Safe because the kernel guarantees that the struct is fully initialized.
380         let limit = unsafe { buf.assume_init() };
381         Ok(limit.rlim_max)
382     } else {
383         Err(Error::GetMaxOpenFiles(io::Error::last_os_error()))
384     }
385 }
386 
/// Sandbox settings consumed by `create_base_minijail`.
struct SandboxConfig<'a> {
    /// When true, the jail is configured with an empty capability set (`use_caps(0)`).
    limit_caps: bool,
    /// When true, seccomp filter failures are logged and the `.policy` file is
    /// used even if a pre-compiled `.bpf` file exists.
    log_failures: bool,
    /// Path of the seccomp policy; its extension is replaced with `bpf` or
    /// `policy` when the file is loaded.
    seccomp_policy: &'a Path,
    /// Passed to `Minijail::uidmap` when present.
    uid_map: Option<&'a str>,
    /// Passed to `Minijail::gidmap` when present.
    gid_map: Option<&'a str>,
}
394 
create_base_minijail( root: &Path, r_limit: Option<u64>, config: Option<&SandboxConfig>, ) -> Result<Minijail>395 fn create_base_minijail(
396     root: &Path,
397     r_limit: Option<u64>,
398     config: Option<&SandboxConfig>,
399 ) -> Result<Minijail> {
400     // All child jails run in a new user namespace without any users mapped,
401     // they run as nobody unless otherwise configured.
402     let mut j = Minijail::new().map_err(Error::DeviceJail)?;
403 
404     if let Some(config) = config {
405         j.namespace_pids();
406         j.namespace_user();
407         j.namespace_user_disable_setgroups();
408         if config.limit_caps {
409             // Don't need any capabilities.
410             j.use_caps(0);
411         }
412         if let Some(uid_map) = config.uid_map {
413             j.uidmap(uid_map).map_err(Error::SettingUidMap)?;
414         }
415         if let Some(gid_map) = config.gid_map {
416             j.gidmap(gid_map).map_err(Error::SettingGidMap)?;
417         }
418         // Run in a new mount namespace.
419         j.namespace_vfs();
420 
421         // Run in an empty network namespace.
422         j.namespace_net();
423 
424         // Don't allow the device to gain new privileges.
425         j.no_new_privs();
426 
427         // By default we'll prioritize using the pre-compiled .bpf over the .policy
428         // file (the .bpf is expected to be compiled using "trap" as the failure
429         // behavior instead of the default "kill" behavior).
430         // Refer to the code comment for the "seccomp-log-failures"
431         // command-line parameter for an explanation about why the |log_failures|
432         // flag forces the use of .policy files (and the build-time alternative to
433         // this run-time flag).
434         let bpf_policy_file = config.seccomp_policy.with_extension("bpf");
435         if bpf_policy_file.exists() && !config.log_failures {
436             j.parse_seccomp_program(&bpf_policy_file)
437                 .map_err(Error::DeviceJail)?;
438         } else {
439             // Use TSYNC only for the side effect of it using SECCOMP_RET_TRAP,
440             // which will correctly kill the entire device process if a worker
441             // thread commits a seccomp violation.
442             j.set_seccomp_filter_tsync();
443             if config.log_failures {
444                 j.log_seccomp_filter_failures();
445             }
446             j.parse_seccomp_filters(&config.seccomp_policy.with_extension("policy"))
447                 .map_err(Error::DeviceJail)?;
448         }
449         j.use_seccomp_filter();
450         // Don't do init setup.
451         j.run_as_init();
452     }
453 
454     // Only pivot_root if we are not re-using the current root directory.
455     if root != Path::new("/") {
456         // It's safe to call `namespace_vfs` multiple times.
457         j.namespace_vfs();
458         j.enter_pivot_root(root).map_err(Error::DevicePivotRoot)?;
459     }
460 
461     // Most devices don't need to open many fds.
462     let limit = if let Some(r) = r_limit { r } else { 1024u64 };
463     j.set_rlimit(libc::RLIMIT_NOFILE as i32, limit, limit)
464         .map_err(Error::SettingMaxOpenFiles)?;
465 
466     Ok(j)
467 }
468 
simple_jail(cfg: &Config, policy: &str) -> Result<Option<Minijail>>469 fn simple_jail(cfg: &Config, policy: &str) -> Result<Option<Minijail>> {
470     if cfg.sandbox {
471         let pivot_root: &str = option_env!("DEFAULT_PIVOT_ROOT").unwrap_or("/var/empty");
472         // A directory for a jailed device's pivot root.
473         let root_path = Path::new(pivot_root);
474         if !root_path.exists() {
475             return Err(Error::PivotRootDoesntExist(pivot_root));
476         }
477         let policy_path: PathBuf = cfg.seccomp_policy_dir.join(policy);
478         let config = SandboxConfig {
479             limit_caps: true,
480             log_failures: cfg.seccomp_log_failures,
481             seccomp_policy: &policy_path,
482             uid_map: None,
483             gid_map: None,
484         };
485         Ok(Some(create_base_minijail(root_path, None, Some(&config))?))
486     } else {
487         Ok(None)
488     }
489 }
490 
/// Result type of the device constructors below; the success type defaults to `VirtioDeviceStub`.
type DeviceResult<T = VirtioDeviceStub> = std::result::Result<T, Error>;
492 
create_block_device(cfg: &Config, disk: &DiskOption, disk_device_tube: Tube) -> DeviceResult493 fn create_block_device(cfg: &Config, disk: &DiskOption, disk_device_tube: Tube) -> DeviceResult {
494     // Special case '/proc/self/fd/*' paths. The FD is already open, just use it.
495     let raw_image: File = if disk.path.parent() == Some(Path::new("/proc/self/fd")) {
496         // Safe because we will validate |raw_fd|.
497         unsafe { File::from_raw_descriptor(raw_descriptor_from_path(&disk.path)?) }
498     } else {
499         OpenOptions::new()
500             .read(true)
501             .write(!disk.read_only)
502             .open(&disk.path)
503             .map_err(|e| Error::Disk(disk.path.to_path_buf(), e))?
504     };
505     // Lock the disk image to prevent other crosvm instances from using it.
506     let lock_op = if disk.read_only {
507         FlockOperation::LockShared
508     } else {
509         FlockOperation::LockExclusive
510     };
511     flock(&raw_image, lock_op, true).map_err(Error::DiskImageLock)?;
512 
513     let dev = if disk::async_ok(&raw_image).map_err(Error::CreateDiskError)? {
514         let async_file = disk::create_async_disk_file(raw_image).map_err(Error::CreateDiskError)?;
515         Box::new(
516             virtio::BlockAsync::new(
517                 virtio::base_features(cfg.protected_vm),
518                 async_file,
519                 disk.read_only,
520                 disk.sparse,
521                 disk.block_size,
522                 disk.id,
523                 Some(disk_device_tube),
524             )
525             .map_err(Error::BlockDeviceNew)?,
526         ) as Box<dyn VirtioDevice>
527     } else {
528         let disk_file = disk::create_disk_file(raw_image).map_err(Error::CreateDiskError)?;
529         Box::new(
530             virtio::Block::new(
531                 virtio::base_features(cfg.protected_vm),
532                 disk_file,
533                 disk.read_only,
534                 disk.sparse,
535                 disk.block_size,
536                 disk.id,
537                 Some(disk_device_tube),
538             )
539             .map_err(Error::BlockDeviceNew)?,
540         ) as Box<dyn VirtioDevice>
541     };
542 
543     Ok(VirtioDeviceStub {
544         dev,
545         jail: simple_jail(&cfg, "block_device")?,
546     })
547 }
548 
create_vhost_user_block_device(cfg: &Config, opt: &VhostUserOption) -> DeviceResult549 fn create_vhost_user_block_device(cfg: &Config, opt: &VhostUserOption) -> DeviceResult {
550     let dev = VhostUserBlock::new(virtio::base_features(cfg.protected_vm), &opt.socket)
551         .map_err(Error::VhostUserBlockDeviceNew)?;
552 
553     Ok(VirtioDeviceStub {
554         dev: Box::new(dev),
555         // no sandbox here because virtqueue handling is exported to a different process.
556         jail: None,
557     })
558 }
559 
create_vhost_user_fs_device(cfg: &Config, option: &VhostUserFsOption) -> DeviceResult560 fn create_vhost_user_fs_device(cfg: &Config, option: &VhostUserFsOption) -> DeviceResult {
561     let dev = VhostUserFs::new(
562         virtio::base_features(cfg.protected_vm),
563         &option.socket,
564         &option.tag,
565     )
566     .map_err(Error::VhostUserFsDeviceNew)?;
567 
568     Ok(VirtioDeviceStub {
569         dev: Box::new(dev),
570         // no sandbox here because virtqueue handling is exported to a different process.
571         jail: None,
572     })
573 }
574 
create_rng_device(cfg: &Config) -> DeviceResult575 fn create_rng_device(cfg: &Config) -> DeviceResult {
576     let dev =
577         virtio::Rng::new(virtio::base_features(cfg.protected_vm)).map_err(Error::RngDeviceNew)?;
578 
579     Ok(VirtioDeviceStub {
580         dev: Box::new(dev),
581         jail: simple_jail(&cfg, "rng_device")?,
582     })
583 }
584 
/// Creates a virtio TPM device together with its storage directory.
///
/// When sandboxing is enabled, storage lives in `/run/vm/tpm.<pid>`, which is
/// created, chowned to the ids returned by `add_crosvm_user_to_jail`, and
/// bind-mounted into the jail. Without a sandbox, `/tmp/tpm-simulator` is used.
#[cfg(feature = "tpm")]
fn create_tpm_device(cfg: &Config) -> DeviceResult {
    use std::ffi::CString;
    use std::fs;
    use std::process;

    let mut tpm_jail = simple_jail(cfg, "tpm_device")?;

    let tpm_storage: PathBuf = if let Some(jail) = &mut tpm_jail {
        // Create a tmpfs in the device's root directory for tpm
        // simulator storage. The size is 20*1024, or 20 KB.
        let mount_flags = (libc::MS_NOSUID | libc::MS_NODEV | libc::MS_NOEXEC) as usize;
        jail.mount_with_data(
            Path::new("none"),
            Path::new("/"),
            "tmpfs",
            mount_flags,
            "size=20480",
        )?;

        let crosvm_ids = add_crosvm_user_to_jail(jail, "tpm")?;

        let storage_dir = format!("/run/vm/tpm.{}", process::id());
        let storage = PathBuf::from(&storage_dir);
        fs::create_dir_all(&storage)
            .map_err(|e| Error::CreateTpmStorage(storage.clone(), e))?;
        let storage_dir_c = CString::new(storage_dir).expect("no nul bytes");
        chown(&storage_dir_c, crosvm_ids.uid, crosvm_ids.gid).map_err(Error::ChownTpmStorage)?;

        jail.mount_bind(&storage, &storage, true)?;
        storage
    } else {
        // Path used inside cros_sdk which does not have /run/vm.
        PathBuf::from("/tmp/tpm-simulator")
    };

    let tpm = virtio::Tpm::new(tpm_storage);

    Ok(VirtioDeviceStub {
        dev: Box::new(tpm),
        jail: tpm_jail,
    })
}
632 
create_single_touch_device(cfg: &Config, single_touch_spec: &TouchDeviceOption) -> DeviceResult633 fn create_single_touch_device(cfg: &Config, single_touch_spec: &TouchDeviceOption) -> DeviceResult {
634     let socket = single_touch_spec
635         .get_path()
636         .into_unix_stream()
637         .map_err(|e| {
638             error!("failed configuring virtio single touch: {:?}", e);
639             e
640         })?;
641 
642     let (width, height) = single_touch_spec.get_size();
643     let dev = virtio::new_single_touch(
644         socket,
645         width,
646         height,
647         virtio::base_features(cfg.protected_vm),
648     )
649     .map_err(Error::InputDeviceNew)?;
650     Ok(VirtioDeviceStub {
651         dev: Box::new(dev),
652         jail: simple_jail(&cfg, "input_device")?,
653     })
654 }
655 
create_multi_touch_device(cfg: &Config, multi_touch_spec: &TouchDeviceOption) -> DeviceResult656 fn create_multi_touch_device(cfg: &Config, multi_touch_spec: &TouchDeviceOption) -> DeviceResult {
657     let socket = multi_touch_spec
658         .get_path()
659         .into_unix_stream()
660         .map_err(|e| {
661             error!("failed configuring virtio multi touch: {:?}", e);
662             e
663         })?;
664 
665     let (width, height) = multi_touch_spec.get_size();
666     let dev = virtio::new_multi_touch(
667         socket,
668         width,
669         height,
670         virtio::base_features(cfg.protected_vm),
671     )
672     .map_err(Error::InputDeviceNew)?;
673 
674     Ok(VirtioDeviceStub {
675         dev: Box::new(dev),
676         jail: simple_jail(&cfg, "input_device")?,
677     })
678 }
679 
create_trackpad_device(cfg: &Config, trackpad_spec: &TouchDeviceOption) -> DeviceResult680 fn create_trackpad_device(cfg: &Config, trackpad_spec: &TouchDeviceOption) -> DeviceResult {
681     let socket = trackpad_spec.get_path().into_unix_stream().map_err(|e| {
682         error!("failed configuring virtio trackpad: {}", e);
683         e
684     })?;
685 
686     let (width, height) = trackpad_spec.get_size();
687     let dev = virtio::new_trackpad(
688         socket,
689         width,
690         height,
691         virtio::base_features(cfg.protected_vm),
692     )
693     .map_err(Error::InputDeviceNew)?;
694 
695     Ok(VirtioDeviceStub {
696         dev: Box::new(dev),
697         jail: simple_jail(&cfg, "input_device")?,
698     })
699 }
700 
create_mouse_device<T: IntoUnixStream>(cfg: &Config, mouse_socket: T) -> DeviceResult701 fn create_mouse_device<T: IntoUnixStream>(cfg: &Config, mouse_socket: T) -> DeviceResult {
702     let socket = mouse_socket.into_unix_stream().map_err(|e| {
703         error!("failed configuring virtio mouse: {}", e);
704         e
705     })?;
706 
707     let dev = virtio::new_mouse(socket, virtio::base_features(cfg.protected_vm))
708         .map_err(Error::InputDeviceNew)?;
709 
710     Ok(VirtioDeviceStub {
711         dev: Box::new(dev),
712         jail: simple_jail(&cfg, "input_device")?,
713     })
714 }
715 
create_keyboard_device<T: IntoUnixStream>(cfg: &Config, keyboard_socket: T) -> DeviceResult716 fn create_keyboard_device<T: IntoUnixStream>(cfg: &Config, keyboard_socket: T) -> DeviceResult {
717     let socket = keyboard_socket.into_unix_stream().map_err(|e| {
718         error!("failed configuring virtio keyboard: {}", e);
719         e
720     })?;
721 
722     let dev = virtio::new_keyboard(socket, virtio::base_features(cfg.protected_vm))
723         .map_err(Error::InputDeviceNew)?;
724 
725     Ok(VirtioDeviceStub {
726         dev: Box::new(dev),
727         jail: simple_jail(&cfg, "input_device")?,
728     })
729 }
730 
create_switches_device<T: IntoUnixStream>(cfg: &Config, switches_socket: T) -> DeviceResult731 fn create_switches_device<T: IntoUnixStream>(cfg: &Config, switches_socket: T) -> DeviceResult {
732     let socket = switches_socket.into_unix_stream().map_err(|e| {
733         error!("failed configuring virtio switches: {}", e);
734         e
735     })?;
736 
737     let dev = virtio::new_switches(socket, virtio::base_features(cfg.protected_vm))
738         .map_err(Error::InputDeviceNew)?;
739 
740     Ok(VirtioDeviceStub {
741         dev: Box::new(dev),
742         jail: simple_jail(&cfg, "input_device")?,
743     })
744 }
745 
create_vinput_device(cfg: &Config, dev_path: &Path) -> DeviceResult746 fn create_vinput_device(cfg: &Config, dev_path: &Path) -> DeviceResult {
747     let dev_file = OpenOptions::new()
748         .read(true)
749         .write(true)
750         .open(dev_path)
751         .map_err(|e| Error::OpenVinput(dev_path.to_owned(), e))?;
752 
753     let dev = virtio::new_evdev(dev_file, virtio::base_features(cfg.protected_vm))
754         .map_err(Error::InputDeviceNew)?;
755 
756     Ok(VirtioDeviceStub {
757         dev: Box::new(dev),
758         jail: simple_jail(&cfg, "input_device")?,
759     })
760 }
761 
create_balloon_device(cfg: &Config, tube: Tube) -> DeviceResult762 fn create_balloon_device(cfg: &Config, tube: Tube) -> DeviceResult {
763     let dev = virtio::Balloon::new(virtio::base_features(cfg.protected_vm), tube)
764         .map_err(Error::BalloonDeviceNew)?;
765 
766     Ok(VirtioDeviceStub {
767         dev: Box::new(dev),
768         jail: simple_jail(&cfg, "balloon_device")?,
769     })
770 }
771 
create_tap_net_device(cfg: &Config, tap_fd: RawDescriptor) -> DeviceResult772 fn create_tap_net_device(cfg: &Config, tap_fd: RawDescriptor) -> DeviceResult {
773     // Safe because we ensure that we get a unique handle to the fd.
774     let tap = unsafe {
775         Tap::from_raw_descriptor(
776             validate_raw_descriptor(tap_fd).map_err(Error::ValidateRawDescriptor)?,
777         )
778         .map_err(Error::CreateTapDevice)?
779     };
780 
781     let mut vq_pairs = cfg.net_vq_pairs.unwrap_or(1);
782     let vcpu_count = cfg.vcpu_count.unwrap_or(1);
783     if vcpu_count < vq_pairs as usize {
784         error!("net vq pairs must be smaller than vcpu count, fall back to single queue mode");
785         vq_pairs = 1;
786     }
787     let features = virtio::base_features(cfg.protected_vm);
788     let dev = virtio::Net::from(features, tap, vq_pairs).map_err(Error::NetDeviceNew)?;
789 
790     Ok(VirtioDeviceStub {
791         dev: Box::new(dev),
792         jail: simple_jail(&cfg, "net_device")?,
793     })
794 }
795 
create_net_device( cfg: &Config, host_ip: Ipv4Addr, netmask: Ipv4Addr, mac_address: MacAddress, mem: &GuestMemory, ) -> DeviceResult796 fn create_net_device(
797     cfg: &Config,
798     host_ip: Ipv4Addr,
799     netmask: Ipv4Addr,
800     mac_address: MacAddress,
801     mem: &GuestMemory,
802 ) -> DeviceResult {
803     let mut vq_pairs = cfg.net_vq_pairs.unwrap_or(1);
804     let vcpu_count = cfg.vcpu_count.unwrap_or(1);
805     if vcpu_count < vq_pairs as usize {
806         error!("net vq pairs must be smaller than vcpu count, fall back to single queue mode");
807         vq_pairs = 1;
808     }
809 
810     let features = virtio::base_features(cfg.protected_vm);
811     let dev = if cfg.vhost_net {
812         let dev = virtio::vhost::Net::<Tap, vhost::Net<Tap>>::new(
813             &cfg.vhost_net_device_path,
814             features,
815             host_ip,
816             netmask,
817             mac_address,
818             mem,
819         )
820         .map_err(Error::VhostNetDeviceNew)?;
821         Box::new(dev) as Box<dyn VirtioDevice>
822     } else {
823         let dev = virtio::Net::<Tap>::new(features, host_ip, netmask, mac_address, vq_pairs)
824             .map_err(Error::NetDeviceNew)?;
825         Box::new(dev) as Box<dyn VirtioDevice>
826     };
827 
828     let policy = if cfg.vhost_net {
829         "vhost_net_device"
830     } else {
831         "net_device"
832     };
833 
834     Ok(VirtioDeviceStub {
835         dev,
836         jail: simple_jail(&cfg, policy)?,
837     })
838 }
839 
create_vhost_user_net_device(cfg: &Config, opt: &VhostUserOption) -> DeviceResult840 fn create_vhost_user_net_device(cfg: &Config, opt: &VhostUserOption) -> DeviceResult {
841     let dev = VhostUserNet::new(virtio::base_features(cfg.protected_vm), &opt.socket)
842         .map_err(Error::VhostUserNetDeviceNew)?;
843 
844     Ok(VirtioDeviceStub {
845         dev: Box::new(dev),
846         // no sandbox here because virtqueue handling is exported to a different process.
847         jail: None,
848     })
849 }
850 
#[cfg(feature = "gpu")]
/// Creates the virtio GPU device and, when `simple_jail` yields a jail for "gpu_device",
/// populates that jail with the mounts set up below.
///
/// The display backend list is `X(x_display)` followed by `Stub`; when `wayland_socket_path`
/// is given, a Wayland backend is inserted at the front. With `cfg.sandbox` set that backend
/// refers to the in-jail path `/wayland-0`, otherwise to the host socket path.
///
/// Side effects: when a jail is being set up and `cfg.gpu_parameters` has a `cache_path`, the
/// `MESA_GLSL_CACHE_*` environment variables of the current process are set.
///
/// Panics if `cfg.gpu_parameters` is `None` (it is unwrapped when constructing the device).
///
/// Errors: `Error::CloneEvent` if `exit_evt` cannot be cloned, plus any error from
/// `simple_jail`, the jail mount calls, or `add_crosvm_user_to_jail`.
fn create_gpu_device(
    cfg: &Config,
    exit_evt: &Event,
    gpu_device_tube: Tube,
    resource_bridges: Vec<Tube>,
    wayland_socket_path: Option<&PathBuf>,
    x_display: Option<String>,
    event_devices: Vec<EventDevice>,
    map_request: Arc<Mutex<Option<ExternalMapping>>>,
    mem: &GuestMemory,
) -> DeviceResult {
    // Location of the wayland socket as seen from inside the jail; it is both advertised to
    // the display backend (when sandboxed) and used as the bind mount target further down.
    let jailed_wayland_path = Path::new("/wayland-0");

    let mut display_backends = vec![
        virtio::DisplayBackend::X(x_display),
        virtio::DisplayBackend::Stub,
    ];

    if let Some(socket_path) = wayland_socket_path {
        display_backends.insert(
            0,
            virtio::DisplayBackend::Wayland(if cfg.sandbox {
                Some(jailed_wayland_path.to_owned())
            } else {
                Some(socket_path.to_owned())
            }),
        );
    }

    let dev = virtio::Gpu::new(
        exit_evt.try_clone().map_err(Error::CloneEvent)?,
        Some(gpu_device_tube),
        NonZeroU8::new(1).unwrap(), // number of scanouts
        resource_bridges,
        display_backends,
        cfg.gpu_parameters.as_ref().unwrap(),
        event_devices,
        map_request,
        cfg.sandbox,
        virtio::base_features(cfg.protected_vm),
        cfg.wayland_socket_paths.clone(),
        mem.clone(),
    );

    let jail = match simple_jail(&cfg, "gpu_device")? {
        Some(mut jail) => {
            // Create a tmpfs in the device's root directory so that we can bind mount the
            // dri directory into it.  The size=67108864 is size=64*1024*1024 or size=64MB.
            jail.mount_with_data(
                Path::new("none"),
                Path::new("/"),
                "tmpfs",
                (libc::MS_NOSUID | libc::MS_NODEV | libc::MS_NOEXEC) as usize,
                "size=67108864",
            )?;

            // Device nodes required for DRM.
            let sys_dev_char_path = Path::new("/sys/dev/char");
            jail.mount_bind(sys_dev_char_path, sys_dev_char_path, false)?;
            let sys_devices_path = Path::new("/sys/devices");
            jail.mount_bind(sys_devices_path, sys_devices_path, false)?;

            let drm_dri_path = Path::new("/dev/dri");
            if drm_dri_path.exists() {
                jail.mount_bind(drm_dri_path, drm_dri_path, false)?;
            }

            // Prepare GPU shader disk cache directory.
            if let Some(cache_dir) = cfg
                .gpu_parameters
                .as_ref()
                .and_then(|params| params.cache_path.as_ref())
            {
                if cfg!(any(target_arch = "arm", target_arch = "aarch64")) && cfg.sandbox {
                    warn!("shader caching not yet supported on ARM with sandbox enabled");
                    env::set_var("MESA_GLSL_CACHE_DISABLE", "true");
                } else {
                    env::set_var("MESA_GLSL_CACHE_DISABLE", "false");
                    env::set_var("MESA_GLSL_CACHE_DIR", cache_dir);
                    if let Some(cache_size) = cfg
                        .gpu_parameters
                        .as_ref()
                        .and_then(|params| params.cache_size.as_ref())
                    {
                        env::set_var("MESA_GLSL_CACHE_MAX_SIZE", cache_size);
                    }
                    let shadercache_path = Path::new(cache_dir);
                    jail.mount_bind(shadercache_path, shadercache_path, true)?;
                }
            }

            // If the ARM specific devices exist on the host, bind mount them in.
            let mali0_path = Path::new("/dev/mali0");
            if mali0_path.exists() {
                jail.mount_bind(mali0_path, mali0_path, true)?;
            }

            let pvr_sync_path = Path::new("/dev/pvr_sync");
            if pvr_sync_path.exists() {
                jail.mount_bind(pvr_sync_path, pvr_sync_path, true)?;
            }

            // If the udmabuf driver exists on the host, bind mount it in.
            let udmabuf_path = Path::new("/dev/udmabuf");
            if udmabuf_path.exists() {
                jail.mount_bind(udmabuf_path, udmabuf_path, true)?;
            }

            // Libraries that are required when mesa drivers are dynamically loaded.
            let lib_dirs = &[
                "/usr/lib",
                "/usr/lib64",
                "/lib",
                "/lib64",
                "/usr/share/vulkan",
            ];
            for dir in lib_dirs {
                let dir_path = Path::new(dir);
                if dir_path.exists() {
                    jail.mount_bind(dir_path, dir_path, false)?;
                }
            }

            // Bind mount the wayland socket into jail's root. This is necessary since each
            // new wayland context must open() the socket.  Don't bind mount the camera socket
            // since it seems to cause problems on ARCVM (b/180126126) + Mali.  It's unclear if
            // camera team will opt for virtio-camera or continue using virtio-wl, so this should
            // be fine for now.
            if let Some(path) = wayland_socket_path {
                jail.mount_bind(path, jailed_wayland_path, true)?;
            }

            add_crosvm_user_to_jail(&mut jail, "gpu")?;

            // pvr driver requires read access to /proc/self/task/*/comm.
            let proc_path = Path::new("/proc");
            jail.mount(
                proc_path,
                proc_path,
                "proc",
                (libc::MS_NOSUID | libc::MS_NODEV | libc::MS_NOEXEC | libc::MS_RDONLY) as usize,
            )?;

            // To enable perfetto tracing, we need to give access to the perfetto service IPC
            // endpoints.
            let perfetto_path = Path::new("/run/perfetto");
            if perfetto_path.exists() {
                jail.mount_bind(perfetto_path, perfetto_path, true)?;
            }

            Some(jail)
        }
        None => None,
    };

    Ok(VirtioDeviceStub {
        dev: Box::new(dev),
        jail,
    })
}
1012 
create_wayland_device( cfg: &Config, control_tube: Tube, resource_bridge: Option<Tube>, ) -> DeviceResult1013 fn create_wayland_device(
1014     cfg: &Config,
1015     control_tube: Tube,
1016     resource_bridge: Option<Tube>,
1017 ) -> DeviceResult {
1018     let wayland_socket_dirs = cfg
1019         .wayland_socket_paths
1020         .iter()
1021         .map(|(_name, path)| path.parent())
1022         .collect::<Option<Vec<_>>>()
1023         .ok_or(Error::InvalidWaylandPath)?;
1024 
1025     let features = virtio::base_features(cfg.protected_vm);
1026     let dev = virtio::Wl::new(
1027         features,
1028         cfg.wayland_socket_paths.clone(),
1029         control_tube,
1030         resource_bridge,
1031     )
1032     .map_err(Error::WaylandDeviceNew)?;
1033 
1034     let jail = match simple_jail(&cfg, "wl_device")? {
1035         Some(mut jail) => {
1036             // Create a tmpfs in the device's root directory so that we can bind mount the wayland
1037             // socket directory into it. The size=67108864 is size=64*1024*1024 or size=64MB.
1038             jail.mount_with_data(
1039                 Path::new("none"),
1040                 Path::new("/"),
1041                 "tmpfs",
1042                 (libc::MS_NOSUID | libc::MS_NODEV | libc::MS_NOEXEC) as usize,
1043                 "size=67108864",
1044             )?;
1045 
1046             // Bind mount the wayland socket's directory into jail's root. This is necessary since
1047             // each new wayland context must open() the socket. If the wayland socket is ever
1048             // destroyed and remade in the same host directory, new connections will be possible
1049             // without restarting the wayland device.
1050             for dir in &wayland_socket_dirs {
1051                 jail.mount_bind(dir, dir, true)?;
1052             }
1053             add_crosvm_user_to_jail(&mut jail, "Wayland")?;
1054 
1055             Some(jail)
1056         }
1057         None => None,
1058     };
1059 
1060     Ok(VirtioDeviceStub {
1061         dev: Box::new(dev),
1062         jail,
1063     })
1064 }
1065 
#[cfg(any(feature = "video-decoder", feature = "video-encoder"))]
/// Builds a virtio video device of kind `typ`, wired to `resource_bridge`.
///
/// When `simple_jail` yields a jail for "video_device", the crosvm user is added under a name
/// matching `typ`, a tmpfs root is mounted, and the device nodes, library directory and D-Bus
/// socket listed below are bind mounted into it.
fn create_video_device(
    cfg: &Config,
    typ: devices::virtio::VideoDeviceType,
    resource_bridge: Tube,
) -> DeviceResult {
    let jail = if let Some(mut jail) = simple_jail(cfg, "video_device")? {
        let jail_user = match typ {
            devices::virtio::VideoDeviceType::Decoder => "video-decoder",
            devices::virtio::VideoDeviceType::Encoder => "video-encoder",
        };
        add_crosvm_user_to_jail(&mut jail, jail_user)?;

        // Create a tmpfs in the device's root directory so that we can bind mount files.
        let tmpfs_flags = (libc::MS_NOSUID | libc::MS_NODEV | libc::MS_NOEXEC) as usize;
        jail.mount_with_data(
            Path::new("none"),
            Path::new("/"),
            "tmpfs",
            tmpfs_flags,
            "size=67108864",
        )?;

        // Render node for libvda.
        let render_node = Path::new("/dev/dri/renderD128");
        jail.mount_bind(render_node, render_node, false)?;

        #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
        {
            // The first two entries are device nodes used by libdrm through minigbm in libvda
            // on AMD devices; the last is required for loading dri libraries loaded by minigbm
            // on AMD devices.
            for dir in &["/sys/dev/char", "/sys/devices", "/usr/lib64"] {
                let dir = Path::new(dir);
                jail.mount_bind(dir, dir, false)?;
            }
        }

        // Device nodes required by libchrome which establishes Mojo connection in libvda.
        let urandom = Path::new("/dev/urandom");
        jail.mount_bind(urandom, urandom, false)?;
        let dbus_socket = Path::new("/run/dbus/system_bus_socket");
        jail.mount_bind(dbus_socket, dbus_socket, true)?;

        Some(jail)
    } else {
        None
    };

    let video = devices::virtio::VideoDevice::new(
        virtio::base_features(cfg.protected_vm),
        typ,
        Some(resource_bridge),
    );
    Ok(VirtioDeviceStub {
        dev: Box::new(video),
        jail,
    })
}
1129 
#[cfg(any(feature = "video-decoder", feature = "video-encoder"))]
/// Creates a video device of kind `typ` using `video_tube` and appends it to `devs`.
///
/// Nothing is appended if device creation fails; the error is returned instead.
fn register_video_device(
    devs: &mut Vec<VirtioDeviceStub>,
    video_tube: Tube,
    cfg: &Config,
    typ: devices::virtio::VideoDeviceType,
) -> std::result::Result<(), Error> {
    let stub = create_video_device(cfg, typ, video_tube)?;
    devs.push(stub);
    Ok(())
}
1140 
create_vhost_vsock_device(cfg: &Config, cid: u64, mem: &GuestMemory) -> DeviceResult1141 fn create_vhost_vsock_device(cfg: &Config, cid: u64, mem: &GuestMemory) -> DeviceResult {
1142     let features = virtio::base_features(cfg.protected_vm);
1143     let dev = virtio::vhost::Vsock::new(&cfg.vhost_vsock_device_path, features, cid, mem)
1144         .map_err(Error::VhostVsockDeviceNew)?;
1145 
1146     Ok(VirtioDeviceStub {
1147         dev: Box::new(dev),
1148         jail: simple_jail(&cfg, "vhost_vsock_device")?,
1149     })
1150 }
1151 
create_fs_device( cfg: &Config, uid_map: &str, gid_map: &str, src: &Path, tag: &str, fs_cfg: virtio::fs::passthrough::Config, device_tube: Tube, ) -> DeviceResult1152 fn create_fs_device(
1153     cfg: &Config,
1154     uid_map: &str,
1155     gid_map: &str,
1156     src: &Path,
1157     tag: &str,
1158     fs_cfg: virtio::fs::passthrough::Config,
1159     device_tube: Tube,
1160 ) -> DeviceResult {
1161     let max_open_files = get_max_open_files()?;
1162     let j = if cfg.sandbox {
1163         let seccomp_policy = cfg.seccomp_policy_dir.join("fs_device");
1164         let config = SandboxConfig {
1165             limit_caps: false,
1166             uid_map: Some(uid_map),
1167             gid_map: Some(gid_map),
1168             log_failures: cfg.seccomp_log_failures,
1169             seccomp_policy: &seccomp_policy,
1170         };
1171         let mut jail = create_base_minijail(src, Some(max_open_files), Some(&config))?;
1172         // We want bind mounts from the parent namespaces to propagate into the fs device's
1173         // namespace.
1174         jail.set_remount_mode(libc::MS_SLAVE);
1175 
1176         jail
1177     } else {
1178         create_base_minijail(src, Some(max_open_files), None)?
1179     };
1180 
1181     let features = virtio::base_features(cfg.protected_vm);
1182     // TODO(chirantan): Use more than one worker once the kernel driver has been fixed to not panic
1183     // when num_queues > 1.
1184     let dev =
1185         virtio::fs::Fs::new(features, tag, 1, fs_cfg, device_tube).map_err(Error::FsDeviceNew)?;
1186 
1187     Ok(VirtioDeviceStub {
1188         dev: Box::new(dev),
1189         jail: Some(j),
1190     })
1191 }
1192 
create_9p_device( cfg: &Config, uid_map: &str, gid_map: &str, src: &Path, tag: &str, mut p9_cfg: p9::Config, ) -> DeviceResult1193 fn create_9p_device(
1194     cfg: &Config,
1195     uid_map: &str,
1196     gid_map: &str,
1197     src: &Path,
1198     tag: &str,
1199     mut p9_cfg: p9::Config,
1200 ) -> DeviceResult {
1201     let max_open_files = get_max_open_files()?;
1202     let (jail, root) = if cfg.sandbox {
1203         let seccomp_policy = cfg.seccomp_policy_dir.join("9p_device");
1204         let config = SandboxConfig {
1205             limit_caps: false,
1206             uid_map: Some(uid_map),
1207             gid_map: Some(gid_map),
1208             log_failures: cfg.seccomp_log_failures,
1209             seccomp_policy: &seccomp_policy,
1210         };
1211 
1212         let mut jail = create_base_minijail(src, Some(max_open_files), Some(&config))?;
1213         // We want bind mounts from the parent namespaces to propagate into the 9p server's
1214         // namespace.
1215         jail.set_remount_mode(libc::MS_SLAVE);
1216 
1217         //  The shared directory becomes the root of the device's file system.
1218         let root = Path::new("/");
1219         (Some(jail), root)
1220     } else {
1221         // There's no mount namespace so we tell the server to treat the source directory as the
1222         // root.
1223         (None, src)
1224     };
1225 
1226     let features = virtio::base_features(cfg.protected_vm);
1227     p9_cfg.root = root.into();
1228     let dev = virtio::P9::new(features, tag, p9_cfg).map_err(Error::P9DeviceNew)?;
1229 
1230     Ok(VirtioDeviceStub {
1231         dev: Box::new(dev),
1232         jail,
1233     })
1234 }
1235 
create_pmem_device( cfg: &Config, vm: &mut impl Vm, resources: &mut SystemAllocator, disk: &DiskOption, index: usize, pmem_device_tube: Tube, ) -> DeviceResult1236 fn create_pmem_device(
1237     cfg: &Config,
1238     vm: &mut impl Vm,
1239     resources: &mut SystemAllocator,
1240     disk: &DiskOption,
1241     index: usize,
1242     pmem_device_tube: Tube,
1243 ) -> DeviceResult {
1244     // Special case '/proc/self/fd/*' paths. The FD is already open, just use it.
1245     let fd: File = if disk.path.parent() == Some(Path::new("/proc/self/fd")) {
1246         // Safe because we will validate |raw_fd|.
1247         unsafe { File::from_raw_descriptor(raw_descriptor_from_path(&disk.path)?) }
1248     } else {
1249         OpenOptions::new()
1250             .read(true)
1251             .write(!disk.read_only)
1252             .open(&disk.path)
1253             .map_err(|e| Error::Disk(disk.path.to_path_buf(), e))?
1254     };
1255 
1256     let arena_size = {
1257         let metadata =
1258             std::fs::metadata(&disk.path).map_err(|e| Error::Disk(disk.path.to_path_buf(), e))?;
1259         let disk_len = metadata.len();
1260         // Linux requires pmem region sizes to be 2 MiB aligned. Linux will fill any partial page
1261         // at the end of an mmap'd file and won't write back beyond the actual file length, but if
1262         // we just align the size of the file to 2 MiB then access beyond the last page of the
1263         // mapped file will generate SIGBUS. So use a memory mapping arena that will provide
1264         // padding up to 2 MiB.
1265         let alignment = 2 * 1024 * 1024;
1266         let align_adjust = if disk_len % alignment != 0 {
1267             alignment - (disk_len % alignment)
1268         } else {
1269             0
1270         };
1271         disk_len
1272             .checked_add(align_adjust)
1273             .ok_or(Error::PmemDeviceImageTooBig)?
1274     };
1275 
1276     let protection = {
1277         if disk.read_only {
1278             Protection::read()
1279         } else {
1280             Protection::read_write()
1281         }
1282     };
1283 
1284     let arena = {
1285         // Conversion from u64 to usize may fail on 32bit system.
1286         let arena_size = usize::try_from(arena_size).map_err(|_| Error::PmemDeviceImageTooBig)?;
1287 
1288         let mut arena = MemoryMappingArena::new(arena_size).map_err(Error::ReservePmemMemory)?;
1289         arena
1290             .add_fd_offset_protection(0, arena_size, &fd, 0, protection)
1291             .map_err(Error::ReservePmemMemory)?;
1292         arena
1293     };
1294 
1295     let mapping_address = resources
1296         .mmio_allocator(MmioType::High)
1297         .allocate_with_align(
1298             arena_size,
1299             Alloc::PmemDevice(index),
1300             format!("pmem_disk_image_{}", index),
1301             // Linux kernel requires pmem namespaces to be 128 MiB aligned.
1302             128 * 1024 * 1024, /* 128 MiB */
1303         )
1304         .map_err(Error::AllocatePmemDeviceAddress)?;
1305 
1306     let slot = vm
1307         .add_memory_region(
1308             GuestAddress(mapping_address),
1309             Box::new(arena),
1310             /* read_only = */ disk.read_only,
1311             /* log_dirty_pages = */ false,
1312         )
1313         .map_err(Error::AddPmemDeviceMemory)?;
1314 
1315     let dev = virtio::Pmem::new(
1316         virtio::base_features(cfg.protected_vm),
1317         fd,
1318         GuestAddress(mapping_address),
1319         slot,
1320         arena_size,
1321         Some(pmem_device_tube),
1322     )
1323     .map_err(Error::PmemDeviceNew)?;
1324 
1325     Ok(VirtioDeviceStub {
1326         dev: Box::new(dev) as Box<dyn VirtioDevice>,
1327         jail: simple_jail(&cfg, "pmem_device")?,
1328     })
1329 }
1330 
create_console_device(cfg: &Config, param: &SerialParameters) -> DeviceResult1331 fn create_console_device(cfg: &Config, param: &SerialParameters) -> DeviceResult {
1332     let mut keep_rds = Vec::new();
1333     let evt = Event::new().map_err(Error::CreateEvent)?;
1334     let dev = param
1335         .create_serial_device::<Console>(cfg.protected_vm, &evt, &mut keep_rds)
1336         .map_err(Error::CreateConsole)?;
1337 
1338     let jail = match simple_jail(&cfg, "serial")? {
1339         Some(mut jail) => {
1340             // Create a tmpfs in the device's root directory so that we can bind mount the
1341             // log socket directory into it.
1342             // The size=67108864 is size=64*1024*1024 or size=64MB.
1343             jail.mount_with_data(
1344                 Path::new("none"),
1345                 Path::new("/"),
1346                 "tmpfs",
1347                 (libc::MS_NODEV | libc::MS_NOEXEC | libc::MS_NOSUID) as usize,
1348                 "size=67108864",
1349             )?;
1350             add_crosvm_user_to_jail(&mut jail, "serial")?;
1351             let res = param.add_bind_mounts(&mut jail);
1352             if res.is_err() {
1353                 error!("failed to add bind mounts for console device");
1354             }
1355             Some(jail)
1356         }
1357         None => None,
1358     };
1359 
1360     Ok(VirtioDeviceStub {
1361         dev: Box::new(dev),
1362         jail, // TODO(dverkamp): use a separate policy for console?
1363     })
1364 }
1365 
1366 // gpu_device_tube is not used when GPU support is disabled.
1367 #[cfg_attr(not(feature = "gpu"), allow(unused_variables))]
create_virtio_devices( cfg: &Config, mem: &GuestMemory, vm: &mut impl Vm, resources: &mut SystemAllocator, _exit_evt: &Event, wayland_device_tube: Tube, gpu_device_tube: Tube, balloon_device_tube: Tube, disk_device_tubes: &mut Vec<Tube>, pmem_device_tubes: &mut Vec<Tube>, map_request: Arc<Mutex<Option<ExternalMapping>>>, fs_device_tubes: &mut Vec<Tube>, ) -> DeviceResult<Vec<VirtioDeviceStub>>1368 fn create_virtio_devices(
1369     cfg: &Config,
1370     mem: &GuestMemory,
1371     vm: &mut impl Vm,
1372     resources: &mut SystemAllocator,
1373     _exit_evt: &Event,
1374     wayland_device_tube: Tube,
1375     gpu_device_tube: Tube,
1376     balloon_device_tube: Tube,
1377     disk_device_tubes: &mut Vec<Tube>,
1378     pmem_device_tubes: &mut Vec<Tube>,
1379     map_request: Arc<Mutex<Option<ExternalMapping>>>,
1380     fs_device_tubes: &mut Vec<Tube>,
1381 ) -> DeviceResult<Vec<VirtioDeviceStub>> {
1382     let mut devs = Vec::new();
1383 
1384     for (_, param) in cfg
1385         .serial_parameters
1386         .iter()
1387         .filter(|(_k, v)| v.hardware == SerialHardware::VirtioConsole)
1388     {
1389         let dev = create_console_device(cfg, param)?;
1390         devs.push(dev);
1391     }
1392 
1393     for disk in &cfg.disks {
1394         let disk_device_tube = disk_device_tubes.remove(0);
1395         devs.push(create_block_device(cfg, disk, disk_device_tube)?);
1396     }
1397 
1398     for blk in &cfg.vhost_user_blk {
1399         devs.push(create_vhost_user_block_device(cfg, blk)?);
1400     }
1401 
1402     for (index, pmem_disk) in cfg.pmem_devices.iter().enumerate() {
1403         let pmem_device_tube = pmem_device_tubes.remove(0);
1404         devs.push(create_pmem_device(
1405             cfg,
1406             vm,
1407             resources,
1408             pmem_disk,
1409             index,
1410             pmem_device_tube,
1411         )?);
1412     }
1413 
1414     devs.push(create_rng_device(cfg)?);
1415 
1416     #[cfg(feature = "tpm")]
1417     {
1418         if cfg.software_tpm {
1419             devs.push(create_tpm_device(cfg)?);
1420         }
1421     }
1422 
1423     if let Some(single_touch_spec) = &cfg.virtio_single_touch {
1424         devs.push(create_single_touch_device(cfg, single_touch_spec)?);
1425     }
1426 
1427     if let Some(multi_touch_spec) = &cfg.virtio_multi_touch {
1428         devs.push(create_multi_touch_device(cfg, multi_touch_spec)?);
1429     }
1430 
1431     if let Some(trackpad_spec) = &cfg.virtio_trackpad {
1432         devs.push(create_trackpad_device(cfg, trackpad_spec)?);
1433     }
1434 
1435     if let Some(mouse_socket) = &cfg.virtio_mouse {
1436         devs.push(create_mouse_device(cfg, mouse_socket)?);
1437     }
1438 
1439     if let Some(keyboard_socket) = &cfg.virtio_keyboard {
1440         devs.push(create_keyboard_device(cfg, keyboard_socket)?);
1441     }
1442 
1443     if let Some(switches_socket) = &cfg.virtio_switches {
1444         devs.push(create_switches_device(cfg, switches_socket)?);
1445     }
1446 
1447     for dev_path in &cfg.virtio_input_evdevs {
1448         devs.push(create_vinput_device(cfg, dev_path)?);
1449     }
1450 
1451     devs.push(create_balloon_device(cfg, balloon_device_tube)?);
1452 
1453     // We checked above that if the IP is defined, then the netmask is, too.
1454     for tap_fd in &cfg.tap_fd {
1455         devs.push(create_tap_net_device(cfg, *tap_fd)?);
1456     }
1457 
1458     if let (Some(host_ip), Some(netmask), Some(mac_address)) =
1459         (cfg.host_ip, cfg.netmask, cfg.mac_address)
1460     {
1461         if !cfg.vhost_user_net.is_empty() {
1462             return Err(Error::VhostUserNetWithNetArgs);
1463         }
1464         devs.push(create_net_device(cfg, host_ip, netmask, mac_address, mem)?);
1465     }
1466 
1467     for net in &cfg.vhost_user_net {
1468         devs.push(create_vhost_user_net_device(cfg, net)?);
1469     }
1470 
1471     #[cfg_attr(not(feature = "gpu"), allow(unused_mut))]
1472     let mut resource_bridges = Vec::<Tube>::new();
1473 
1474     if !cfg.wayland_socket_paths.is_empty() {
1475         #[cfg_attr(not(feature = "gpu"), allow(unused_mut))]
1476         let mut wl_resource_bridge = None::<Tube>;
1477 
1478         #[cfg(feature = "gpu")]
1479         {
1480             if cfg.gpu_parameters.is_some() {
1481                 let (wl_socket, gpu_socket) = Tube::pair().map_err(Error::CreateTube)?;
1482                 resource_bridges.push(gpu_socket);
1483                 wl_resource_bridge = Some(wl_socket);
1484             }
1485         }
1486 
1487         devs.push(create_wayland_device(
1488             cfg,
1489             wayland_device_tube,
1490             wl_resource_bridge,
1491         )?);
1492     }
1493 
1494     #[cfg(feature = "video-decoder")]
1495     let video_dec_tube = if cfg.video_dec {
1496         let (video_tube, gpu_tube) = Tube::pair().map_err(Error::CreateTube)?;
1497         resource_bridges.push(gpu_tube);
1498         Some(video_tube)
1499     } else {
1500         None
1501     };
1502 
1503     #[cfg(feature = "video-encoder")]
1504     let video_enc_tube = if cfg.video_enc {
1505         let (video_tube, gpu_tube) = Tube::pair().map_err(Error::CreateTube)?;
1506         resource_bridges.push(gpu_tube);
1507         Some(video_tube)
1508     } else {
1509         None
1510     };
1511 
1512     #[cfg(feature = "gpu")]
1513     {
1514         if let Some(gpu_parameters) = &cfg.gpu_parameters {
1515             let mut event_devices = Vec::new();
1516             if cfg.display_window_mouse {
1517                 let (event_device_socket, virtio_dev_socket) =
1518                     UnixStream::pair().map_err(Error::CreateSocket)?;
1519                 let (multi_touch_width, multi_touch_height) = cfg
1520                     .virtio_multi_touch
1521                     .as_ref()
1522                     .map(|multi_touch_spec| multi_touch_spec.get_size())
1523                     .unwrap_or((gpu_parameters.display_width, gpu_parameters.display_height));
1524                 let dev = virtio::new_multi_touch(
1525                     virtio_dev_socket,
1526                     multi_touch_width,
1527                     multi_touch_height,
1528                     virtio::base_features(cfg.protected_vm),
1529                 )
1530                 .map_err(Error::InputDeviceNew)?;
1531                 devs.push(VirtioDeviceStub {
1532                     dev: Box::new(dev),
1533                     jail: simple_jail(&cfg, "input_device")?,
1534                 });
1535                 event_devices.push(EventDevice::touchscreen(event_device_socket));
1536             }
1537             if cfg.display_window_keyboard {
1538                 let (event_device_socket, virtio_dev_socket) =
1539                     UnixStream::pair().map_err(Error::CreateSocket)?;
1540                 let dev = virtio::new_keyboard(
1541                     virtio_dev_socket,
1542                     virtio::base_features(cfg.protected_vm),
1543                 )
1544                 .map_err(Error::InputDeviceNew)?;
1545                 devs.push(VirtioDeviceStub {
1546                     dev: Box::new(dev),
1547                     jail: simple_jail(&cfg, "input_device")?,
1548                 });
1549                 event_devices.push(EventDevice::keyboard(event_device_socket));
1550             }
1551             devs.push(create_gpu_device(
1552                 cfg,
1553                 _exit_evt,
1554                 gpu_device_tube,
1555                 resource_bridges,
1556                 // Use the unnamed socket for GPU display screens.
1557                 cfg.wayland_socket_paths.get(""),
1558                 cfg.x_display.clone(),
1559                 event_devices,
1560                 map_request,
1561                 mem,
1562             )?);
1563         }
1564     }
1565 
1566     #[cfg(feature = "video-decoder")]
1567     {
1568         if let Some(video_dec_tube) = video_dec_tube {
1569             register_video_device(
1570                 &mut devs,
1571                 video_dec_tube,
1572                 cfg,
1573                 devices::virtio::VideoDeviceType::Decoder,
1574             )?;
1575         }
1576     }
1577 
1578     #[cfg(feature = "video-encoder")]
1579     {
1580         if let Some(video_enc_tube) = video_enc_tube {
1581             register_video_device(
1582                 &mut devs,
1583                 video_enc_tube,
1584                 cfg,
1585                 devices::virtio::VideoDeviceType::Encoder,
1586             )?;
1587         }
1588     }
1589 
1590     if let Some(cid) = cfg.cid {
1591         devs.push(create_vhost_vsock_device(cfg, cid, mem)?);
1592     }
1593 
1594     for vhost_user_fs in &cfg.vhost_user_fs {
1595         devs.push(create_vhost_user_fs_device(cfg, &vhost_user_fs)?);
1596     }
1597 
1598     for shared_dir in &cfg.shared_dirs {
1599         let SharedDir {
1600             src,
1601             tag,
1602             kind,
1603             uid_map,
1604             gid_map,
1605             fs_cfg,
1606             p9_cfg,
1607         } = shared_dir;
1608 
1609         let dev = match kind {
1610             SharedDirKind::FS => {
1611                 let device_tube = fs_device_tubes.remove(0);
1612                 create_fs_device(cfg, uid_map, gid_map, src, tag, fs_cfg.clone(), device_tube)?
1613             }
1614             SharedDirKind::P9 => create_9p_device(cfg, uid_map, gid_map, src, tag, p9_cfg.clone())?,
1615         };
1616         devs.push(dev);
1617     }
1618 
1619     Ok(devs)
1620 }
1621 
create_devices( cfg: &Config, mem: &GuestMemory, vm: &mut impl Vm, resources: &mut SystemAllocator, exit_evt: &Event, control_tubes: &mut Vec<TaggedControlTube>, wayland_device_tube: Tube, gpu_device_tube: Tube, balloon_device_tube: Tube, disk_device_tubes: &mut Vec<Tube>, pmem_device_tubes: &mut Vec<Tube>, fs_device_tubes: &mut Vec<Tube>, usb_provider: HostBackendDeviceProvider, map_request: Arc<Mutex<Option<ExternalMapping>>>, ) -> DeviceResult<Vec<(Box<dyn PciDevice>, Option<Minijail>)>>1622 fn create_devices(
1623     cfg: &Config,
1624     mem: &GuestMemory,
1625     vm: &mut impl Vm,
1626     resources: &mut SystemAllocator,
1627     exit_evt: &Event,
1628     control_tubes: &mut Vec<TaggedControlTube>,
1629     wayland_device_tube: Tube,
1630     gpu_device_tube: Tube,
1631     balloon_device_tube: Tube,
1632     disk_device_tubes: &mut Vec<Tube>,
1633     pmem_device_tubes: &mut Vec<Tube>,
1634     fs_device_tubes: &mut Vec<Tube>,
1635     usb_provider: HostBackendDeviceProvider,
1636     map_request: Arc<Mutex<Option<ExternalMapping>>>,
1637 ) -> DeviceResult<Vec<(Box<dyn PciDevice>, Option<Minijail>)>> {
1638     let stubs = create_virtio_devices(
1639         &cfg,
1640         mem,
1641         vm,
1642         resources,
1643         exit_evt,
1644         wayland_device_tube,
1645         gpu_device_tube,
1646         balloon_device_tube,
1647         disk_device_tubes,
1648         pmem_device_tubes,
1649         map_request,
1650         fs_device_tubes,
1651     )?;
1652 
1653     let mut pci_devices = Vec::new();
1654 
1655     for stub in stubs {
1656         let (msi_host_tube, msi_device_tube) = Tube::pair().map_err(Error::CreateTube)?;
1657         control_tubes.push(TaggedControlTube::VmIrq(msi_host_tube));
1658         let dev = VirtioPciDevice::new(mem.clone(), stub.dev, msi_device_tube)
1659             .map_err(Error::VirtioPciDev)?;
1660         let dev = Box::new(dev) as Box<dyn PciDevice>;
1661         pci_devices.push((dev, stub.jail));
1662     }
1663 
1664     #[cfg(feature = "audio")]
1665     for ac97_param in &cfg.ac97_parameters {
1666         let dev = Ac97Dev::try_new(mem.clone(), ac97_param.clone()).map_err(Error::CreateAc97)?;
1667         let jail = simple_jail(&cfg, dev.minijail_policy())?;
1668         pci_devices.push((Box::new(dev), jail));
1669     }
1670 
1671     // Create xhci controller.
1672     let usb_controller = Box::new(XhciController::new(mem.clone(), usb_provider));
1673     pci_devices.push((usb_controller, simple_jail(&cfg, "xhci")?));
1674 
1675     if !cfg.vfio.is_empty() {
1676         let vfio_container = Arc::new(Mutex::new(
1677             VfioContainer::new().map_err(Error::CreateVfioDevice)?,
1678         ));
1679 
1680         for vfio_path in &cfg.vfio {
1681             // create MSI, MSI-X, and Mem request sockets for each vfio device
1682             let (vfio_host_tube_msi, vfio_device_tube_msi) =
1683                 Tube::pair().map_err(Error::CreateTube)?;
1684             control_tubes.push(TaggedControlTube::VmIrq(vfio_host_tube_msi));
1685 
1686             let (vfio_host_tube_msix, vfio_device_tube_msix) =
1687                 Tube::pair().map_err(Error::CreateTube)?;
1688             control_tubes.push(TaggedControlTube::VmIrq(vfio_host_tube_msix));
1689 
1690             let (vfio_host_tube_mem, vfio_device_tube_mem) =
1691                 Tube::pair().map_err(Error::CreateTube)?;
1692             control_tubes.push(TaggedControlTube::VmMemory(vfio_host_tube_mem));
1693 
1694             let vfiodevice = VfioDevice::new(vfio_path.as_path(), vm, mem, vfio_container.clone())
1695                 .map_err(Error::CreateVfioDevice)?;
1696             let mut vfiopcidevice = Box::new(VfioPciDevice::new(
1697                 vfiodevice,
1698                 vfio_device_tube_msi,
1699                 vfio_device_tube_msix,
1700                 vfio_device_tube_mem,
1701             ));
1702             // early reservation for pass-through PCI devices.
1703             if vfiopcidevice.allocate_address(resources).is_err() {
1704                 warn!(
1705                     "address reservation failed for vfio {}",
1706                     vfiopcidevice.debug_label()
1707                 );
1708             }
1709             pci_devices.push((vfiopcidevice, simple_jail(&cfg, "vfio_device")?));
1710         }
1711     }
1712 
1713     Ok(pci_devices)
1714 }
1715 
// Pair of user and group ids, as returned by `add_crosvm_user_to_jail`.
#[derive(Copy, Clone)]
// Without the "tpm" feature the dead_code lint is silenced for this struct.
#[cfg_attr(not(feature = "tpm"), allow(dead_code))]
struct Ids {
    // User id.
    uid: uid_t,
    // Group id.
    gid: gid_t,
}
1722 
1723 // Set the uid/gid for the jailed process and give a basic id map. This is
1724 // required for bind mounts to work.
add_crosvm_user_to_jail(jail: &mut Minijail, feature: &str) -> Result<Ids>1725 fn add_crosvm_user_to_jail(jail: &mut Minijail, feature: &str) -> Result<Ids> {
1726     let crosvm_user_group = CStr::from_bytes_with_nul(b"crosvm\0").unwrap();
1727 
1728     let crosvm_uid = match get_user_id(&crosvm_user_group) {
1729         Ok(u) => u,
1730         Err(e) => {
1731             warn!("falling back to current user id for {}: {}", feature, e);
1732             geteuid()
1733         }
1734     };
1735 
1736     let crosvm_gid = match get_group_id(&crosvm_user_group) {
1737         Ok(u) => u,
1738         Err(e) => {
1739             warn!("falling back to current group id for {}: {}", feature, e);
1740             getegid()
1741         }
1742     };
1743 
1744     jail.change_uid(crosvm_uid);
1745     jail.change_gid(crosvm_gid);
1746     jail.uidmap(&format!("{0} {0} 1", crosvm_uid))
1747         .map_err(Error::SettingUidMap)?;
1748     jail.gidmap(&format!("{0} {0} 1", crosvm_gid))
1749         .map_err(Error::SettingGidMap)?;
1750 
1751     Ok(Ids {
1752         uid: crosvm_uid,
1753         gid: crosvm_gid,
1754     })
1755 }
1756 
raw_descriptor_from_path(path: &Path) -> Result<RawDescriptor>1757 fn raw_descriptor_from_path(path: &Path) -> Result<RawDescriptor> {
1758     if !path.is_file() {
1759         return Err(Error::InvalidFdPath);
1760     }
1761     let raw_descriptor = path
1762         .file_name()
1763         .and_then(|fd_osstr| fd_osstr.to_str())
1764         .and_then(|fd_str| fd_str.parse::<c_int>().ok())
1765         .ok_or(Error::InvalidFdPath)?;
1766     validate_raw_descriptor(raw_descriptor).map_err(Error::ValidateRawDescriptor)
1767 }
1768 
// Conversion of a value into a `UnixStream`, consuming the value.
trait IntoUnixStream {
    // Performs the conversion, or returns an error if no stream could be obtained.
    fn into_unix_stream(self) -> Result<UnixStream>;
}
1772 
1773 impl<'a> IntoUnixStream for &'a Path {
into_unix_stream(self) -> Result<UnixStream>1774     fn into_unix_stream(self) -> Result<UnixStream> {
1775         if self.parent() == Some(Path::new("/proc/self/fd")) {
1776             // Safe because we will validate |raw_fd|.
1777             unsafe { Ok(UnixStream::from_raw_fd(raw_descriptor_from_path(self)?)) }
1778         } else {
1779             UnixStream::connect(self).map_err(Error::InputEventsOpen)
1780         }
1781     }
1782 }
1783 impl<'a> IntoUnixStream for &'a PathBuf {
into_unix_stream(self) -> Result<UnixStream>1784     fn into_unix_stream(self) -> Result<UnixStream> {
1785         self.as_path().into_unix_stream()
1786     }
1787 }
1788 
// A `UnixStream` is already the target type, so it is returned unchanged.
impl IntoUnixStream for UnixStream {
    fn into_unix_stream(self) -> Result<UnixStream> {
        Ok(self)
    }
}
1794 
setup_vcpu_signal_handler<T: Vcpu>(use_hypervisor_signals: bool) -> Result<()>1795 fn setup_vcpu_signal_handler<T: Vcpu>(use_hypervisor_signals: bool) -> Result<()> {
1796     if use_hypervisor_signals {
1797         unsafe {
1798             extern "C" fn handle_signal(_: c_int) {}
1799             // Our signal handler does nothing and is trivially async signal safe.
1800             register_rt_signal_handler(SIGRTMIN() + 0, handle_signal)
1801                 .map_err(Error::RegisterSignalHandler)?;
1802         }
1803         block_signal(SIGRTMIN() + 0).map_err(Error::BlockSignal)?;
1804     } else {
1805         unsafe {
1806             extern "C" fn handle_signal<T: Vcpu>(_: c_int) {
1807                 T::set_local_immediate_exit(true);
1808             }
1809             register_rt_signal_handler(SIGRTMIN() + 0, handle_signal::<T>)
1810                 .map_err(Error::RegisterSignalHandler)?;
1811         }
1812     }
1813     Ok(())
1814 }
1815 
1816 // Sets up a vcpu and converts it into a runnable vcpu.
runnable_vcpu<V>( cpu_id: usize, vcpu: Option<V>, vm: impl VmArch, irq_chip: &mut impl IrqChipArch, vcpu_count: usize, run_rt: bool, vcpu_affinity: Vec<usize>, no_smt: bool, has_bios: bool, use_hypervisor_signals: bool, ) -> Result<(V, VcpuRunHandle)> where V: VcpuArch,1817 fn runnable_vcpu<V>(
1818     cpu_id: usize,
1819     vcpu: Option<V>,
1820     vm: impl VmArch,
1821     irq_chip: &mut impl IrqChipArch,
1822     vcpu_count: usize,
1823     run_rt: bool,
1824     vcpu_affinity: Vec<usize>,
1825     no_smt: bool,
1826     has_bios: bool,
1827     use_hypervisor_signals: bool,
1828 ) -> Result<(V, VcpuRunHandle)>
1829 where
1830     V: VcpuArch,
1831 {
1832     let mut vcpu = match vcpu {
1833         Some(v) => v,
1834         None => {
1835             // If vcpu is None, it means this arch/hypervisor requires create_vcpu to be called from
1836             // the vcpu thread.
1837             match vm
1838                 .create_vcpu(cpu_id)
1839                 .map_err(Error::CreateVcpu)?
1840                 .downcast::<V>()
1841             {
1842                 Ok(v) => *v,
1843                 Err(_) => panic!("VM created wrong type of VCPU"),
1844             }
1845         }
1846     };
1847 
1848     irq_chip
1849         .add_vcpu(cpu_id, &vcpu)
1850         .map_err(Error::AddIrqChipVcpu)?;
1851 
1852     if !vcpu_affinity.is_empty() {
1853         if let Err(e) = set_cpu_affinity(vcpu_affinity) {
1854             error!("Failed to set CPU affinity: {}", e);
1855         }
1856     }
1857 
1858     Arch::configure_vcpu(
1859         vm.get_memory(),
1860         vm.get_hypervisor(),
1861         irq_chip,
1862         &mut vcpu,
1863         cpu_id,
1864         vcpu_count,
1865         has_bios,
1866         no_smt,
1867     )
1868     .map_err(Error::ConfigureVcpu)?;
1869 
1870     #[cfg(feature = "chromeos")]
1871     if let Err(e) = base::sched::enable_core_scheduling() {
1872         error!("Failed to enable core scheduling: {}", e);
1873     }
1874 
1875     if run_rt {
1876         const DEFAULT_VCPU_RT_LEVEL: u16 = 6;
1877         if let Err(e) = set_rt_prio_limit(u64::from(DEFAULT_VCPU_RT_LEVEL))
1878             .and_then(|_| set_rt_round_robin(i32::from(DEFAULT_VCPU_RT_LEVEL)))
1879         {
1880             warn!("Failed to set vcpu to real time: {}", e);
1881         }
1882     }
1883 
1884     if use_hypervisor_signals {
1885         let mut v = get_blocked_signals().map_err(Error::GetSignalMask)?;
1886         v.retain(|&x| x != SIGRTMIN() + 0);
1887         vcpu.set_signal_mask(&v).map_err(Error::SettingSignalMask)?;
1888     }
1889 
1890     let vcpu_run_handle = vcpu
1891         .take_run_handle(Some(SIGRTMIN() + 0))
1892         .map_err(Error::RunnableVcpu)?;
1893 
1894     Ok((vcpu, vcpu_run_handle))
1895 }
1896 
// Executes the debug request `d` on `vcpu` and sends the resulting status, tagged with `cpu_id`,
// on `reply_tube`.
//
// A failing register read/write, memory write, single-step or breakpoint request is returned as
// `Error::HandleDebugCommand` and nothing is sent. A failing memory read is answered with an
// empty memory region instead. A failed send is returned as `Error::SendDebugStatus`.
#[cfg(all(target_arch = "x86_64", feature = "gdb"))]
fn handle_debug_msg<V>(
    cpu_id: usize,
    vcpu: &V,
    guest_mem: &GuestMemory,
    d: VcpuDebug,
    reply_tube: &mpsc::Sender<VcpuDebugStatusMessage>,
) -> Result<()>
where
    V: VcpuArch + 'static,
{
    // Run the request first; the reply is only built once the request succeeded.
    let status = match d {
        VcpuDebug::ReadRegs => VcpuDebugStatus::RegValues(
            Arch::debug_read_registers(vcpu).map_err(Error::HandleDebugCommand)?,
        ),
        VcpuDebug::WriteRegs(regs) => {
            Arch::debug_write_registers(vcpu, &regs).map_err(Error::HandleDebugCommand)?;
            VcpuDebugStatus::CommandComplete
        }
        VcpuDebug::ReadMem(vaddr, len) => VcpuDebugStatus::MemoryRegion(
            Arch::debug_read_memory(vcpu, guest_mem, vaddr, len).unwrap_or_default(),
        ),
        VcpuDebug::WriteMem(vaddr, buf) => {
            Arch::debug_write_memory(vcpu, guest_mem, vaddr, &buf)
                .map_err(Error::HandleDebugCommand)?;
            VcpuDebugStatus::CommandComplete
        }
        VcpuDebug::EnableSinglestep => {
            Arch::debug_enable_singlestep(vcpu).map_err(Error::HandleDebugCommand)?;
            VcpuDebugStatus::CommandComplete
        }
        VcpuDebug::SetHwBreakPoint(addrs) => {
            Arch::debug_set_hw_breakpoints(vcpu, &addrs).map_err(Error::HandleDebugCommand)?;
            VcpuDebugStatus::CommandComplete
        }
    };

    let reply = VcpuDebugStatusMessage {
        cpu: cpu_id,
        msg: status,
    };
    reply_tube
        .send(reply)
        .map_err(|e| Error::SendDebugStatus(Box::new(e)))
}
1972 
run_vcpu<V>( cpu_id: usize, vcpu: Option<V>, vm: impl VmArch + 'static, mut irq_chip: impl IrqChipArch + 'static, vcpu_count: usize, run_rt: bool, vcpu_affinity: Vec<usize>, no_smt: bool, start_barrier: Arc<Barrier>, has_bios: bool, io_bus: devices::Bus, mmio_bus: devices::Bus, exit_evt: Event, requires_pvclock_ctrl: bool, from_main_tube: mpsc::Receiver<VcpuControl>, use_hypervisor_signals: bool, #[cfg(all(target_arch = "x86_64", feature = "gdb"))] to_gdb_tube: Option< mpsc::Sender<VcpuDebugStatusMessage>, >, ) -> Result<JoinHandle<()>> where V: VcpuArch + 'static,1973 fn run_vcpu<V>(
1974     cpu_id: usize,
1975     vcpu: Option<V>,
1976     vm: impl VmArch + 'static,
1977     mut irq_chip: impl IrqChipArch + 'static,
1978     vcpu_count: usize,
1979     run_rt: bool,
1980     vcpu_affinity: Vec<usize>,
1981     no_smt: bool,
1982     start_barrier: Arc<Barrier>,
1983     has_bios: bool,
1984     io_bus: devices::Bus,
1985     mmio_bus: devices::Bus,
1986     exit_evt: Event,
1987     requires_pvclock_ctrl: bool,
1988     from_main_tube: mpsc::Receiver<VcpuControl>,
1989     use_hypervisor_signals: bool,
1990     #[cfg(all(target_arch = "x86_64", feature = "gdb"))] to_gdb_tube: Option<
1991         mpsc::Sender<VcpuDebugStatusMessage>,
1992     >,
1993 ) -> Result<JoinHandle<()>>
1994 where
1995     V: VcpuArch + 'static,
1996 {
1997     thread::Builder::new()
1998         .name(format!("crosvm_vcpu{}", cpu_id))
1999         .spawn(move || {
2000             // The VCPU thread must trigger the `exit_evt` in all paths, and a `ScopedEvent`'s Drop
2001             // implementation accomplishes that.
2002             let _scoped_exit_evt = ScopedEvent::from(exit_evt);
2003 
2004             #[cfg(all(target_arch = "x86_64", feature = "gdb"))]
2005             let guest_mem = vm.get_memory().clone();
2006             let runnable_vcpu = runnable_vcpu(
2007                 cpu_id,
2008                 vcpu,
2009                 vm,
2010                 &mut irq_chip,
2011                 vcpu_count,
2012                 run_rt,
2013                 vcpu_affinity,
2014                 no_smt,
2015                 has_bios,
2016                 use_hypervisor_signals,
2017             );
2018 
2019             start_barrier.wait();
2020 
2021             let (vcpu, vcpu_run_handle) = match runnable_vcpu {
2022                 Ok(v) => v,
2023                 Err(e) => {
2024                     error!("failed to start vcpu {}: {}", cpu_id, e);
2025                     return;
2026                 }
2027             };
2028 
2029             let mut run_mode = VmRunMode::Running;
2030             #[cfg(all(target_arch = "x86_64", feature = "gdb"))]
2031             if to_gdb_tube.is_some() {
2032                 // Wait until a GDB client attaches
2033                 run_mode = VmRunMode::Breakpoint;
2034             }
2035 
2036             let mut interrupted_by_signal = false;
2037 
2038             'vcpu_loop: loop {
2039                 // Start by checking for messages to process and the run state of the CPU.
2040                 // An extra check here for Running so there isn't a need to call recv unless a
2041                 // message is likely to be ready because a signal was sent.
2042                 if interrupted_by_signal || run_mode != VmRunMode::Running {
2043                     'state_loop: loop {
2044                         // Tries to get a pending message without blocking first.
2045                         let msg = match from_main_tube.try_recv() {
2046                             Ok(m) => m,
2047                             Err(mpsc::TryRecvError::Empty) if run_mode == VmRunMode::Running => {
2048                                 // If the VM is running and no message is pending, the state won't
2049                                 // change.
2050                                 break 'state_loop;
2051                             }
2052                             Err(mpsc::TryRecvError::Empty) => {
2053                                 // If the VM is not running, wait until a message is ready.
2054                                 match from_main_tube.recv() {
2055                                     Ok(m) => m,
2056                                     Err(mpsc::RecvError) => {
2057                                         error!("Failed to read from main tube in vcpu");
2058                                         break 'vcpu_loop;
2059                                     }
2060                                 }
2061                             }
2062                             Err(mpsc::TryRecvError::Disconnected) => {
2063                                 error!("Failed to read from main tube in vcpu");
2064                                 break 'vcpu_loop;
2065                             }
2066                         };
2067 
2068                         // Collect all pending messages.
2069                         let mut messages = vec![msg];
2070                         messages.append(&mut from_main_tube.try_iter().collect());
2071 
2072                         for msg in messages {
2073                             match msg {
2074                                 VcpuControl::RunState(new_mode) => {
2075                                     run_mode = new_mode;
2076                                     match run_mode {
2077                                         VmRunMode::Running => break 'state_loop,
2078                                         VmRunMode::Suspending => {
2079                                             // On KVM implementations that use a paravirtualized
2080                                             // clock (e.g. x86), a flag must be set to indicate to
2081                                             // the guest kernel that a vCPU was suspended. The guest
2082                                             // kernel will use this flag to prevent the soft lockup
2083                                             // detection from triggering when this vCPU resumes,
2084                                             // which could happen days later in realtime.
2085                                             if requires_pvclock_ctrl {
2086                                                 if let Err(e) = vcpu.pvclock_ctrl() {
2087                                                     error!(
2088                                                         "failed to tell hypervisor vcpu {} is suspending: {}",
2089                                                         cpu_id, e
2090                                                     );
2091                                                 }
2092                                             }
2093                                         }
2094                                         VmRunMode::Breakpoint => {}
2095                                         VmRunMode::Exiting => break 'vcpu_loop,
2096                                     }
2097                                 }
2098                                 #[cfg(all(target_arch = "x86_64", feature = "gdb"))]
2099                                 VcpuControl::Debug(d) => {
2100                                     match &to_gdb_tube {
2101                                         Some(ref ch) => {
2102                                             if let Err(e) = handle_debug_msg(
2103                                                 cpu_id, &vcpu, &guest_mem, d, &ch,
2104                                             ) {
2105                                                 error!("Failed to handle gdb message: {}", e);
2106                                             }
2107                                         },
2108                                         None => {
2109                                             error!("VcpuControl::Debug received while GDB feature is disabled: {:?}", d);
2110                                         }
2111                                     }
2112                                 }
2113                             }
2114                         }
2115                     }
2116                 }
2117 
2118                 interrupted_by_signal = false;
2119 
2120                 // Vcpus may have run a HLT instruction, which puts them into a state other than
2121                 // VcpuRunState::Runnable. In that case, this call to wait_until_runnable blocks
2122                 // until either the irqchip receives an interrupt for this vcpu, or until the main
2123                 // thread kicks this vcpu as a result of some VmControl operation. In most IrqChip
2124                 // implementations HLT instructions do not make it to crosvm, and thus this is a
2125                 // no-op that always returns VcpuRunState::Runnable.
2126                 match irq_chip.wait_until_runnable(&vcpu) {
2127                     Ok(VcpuRunState::Runnable) => {}
2128                     Ok(VcpuRunState::Interrupted) => interrupted_by_signal = true,
2129                     Err(e) => error!(
2130                         "error waiting for vcpu {} to become runnable: {}",
2131                         cpu_id, e
2132                     ),
2133                 }
2134 
2135                 if !interrupted_by_signal {
2136                     match vcpu.run(&vcpu_run_handle) {
2137                         Ok(VcpuExit::IoIn { port, mut size }) => {
2138                             let mut data = [0; 8];
2139                             if size > data.len() {
2140                                 error!("unsupported IoIn size of {} bytes", size);
2141                                 size = data.len();
2142                             }
2143                             io_bus.read(port as u64, &mut data[..size]);
2144                             if let Err(e) = vcpu.set_data(&data[..size]) {
2145                                 error!("failed to set return data for IoIn: {}", e);
2146                             }
2147                         }
2148                         Ok(VcpuExit::IoOut {
2149                             port,
2150                             mut size,
2151                             data,
2152                         }) => {
2153                             if size > data.len() {
2154                                 error!("unsupported IoOut size of {} bytes", size);
2155                                 size = data.len();
2156                             }
2157                             io_bus.write(port as u64, &data[..size]);
2158                         }
2159                         Ok(VcpuExit::MmioRead { address, size }) => {
2160                             let mut data = [0; 8];
2161                             mmio_bus.read(address, &mut data[..size]);
2162                             // Setting data for mmio can not fail.
2163                             let _ = vcpu.set_data(&data[..size]);
2164                         }
2165                         Ok(VcpuExit::MmioWrite {
2166                             address,
2167                             size,
2168                             data,
2169                         }) => {
2170                             mmio_bus.write(address, &data[..size]);
2171                         }
2172                         Ok(VcpuExit::IoapicEoi { vector }) => {
2173                             if let Err(e) = irq_chip.broadcast_eoi(vector) {
2174                                 error!(
2175                                     "failed to broadcast eoi {} on vcpu {}: {}",
2176                                     vector, cpu_id, e
2177                                 );
2178                             }
2179                         }
2180                         Ok(VcpuExit::IrqWindowOpen) => {}
2181                         Ok(VcpuExit::Hlt) => irq_chip.halted(cpu_id),
2182                         Ok(VcpuExit::Shutdown) => break,
2183                         Ok(VcpuExit::FailEntry {
2184                             hardware_entry_failure_reason,
2185                         }) => {
2186                             error!("vcpu hw run failure: {:#x}", hardware_entry_failure_reason);
2187                             break;
2188                         }
2189                         Ok(VcpuExit::SystemEvent(_, _)) => break,
2190                         Ok(VcpuExit::Debug { .. }) => {
2191                             #[cfg(all(target_arch = "x86_64", feature = "gdb"))]
2192                             {
2193                                 let msg = VcpuDebugStatusMessage {
2194                                     cpu: cpu_id as usize,
2195                                     msg: VcpuDebugStatus::HitBreakPoint,
2196                                 };
2197                                 if let Some(ref ch) = to_gdb_tube {
2198                                     if let Err(e) = ch.send(msg) {
2199                                         error!("failed to notify breakpoint to GDB thread: {}", e);
2200                                         break;
2201                                     }
2202                                 }
2203                                 run_mode = VmRunMode::Breakpoint;
2204                             }
2205                         }
2206                         Ok(r) => warn!("unexpected vcpu exit: {:?}", r),
2207                         Err(e) => match e.errno() {
2208                             libc::EINTR => interrupted_by_signal = true,
2209                             libc::EAGAIN => {}
2210                             _ => {
2211                                 error!("vcpu hit unknown error: {}", e);
2212                                 break;
2213                             }
2214                         },
2215                     }
2216                 }
2217 
2218                 if interrupted_by_signal {
2219                     if use_hypervisor_signals {
2220                         // Try to clear the signal that we use to kick VCPU if it is pending before
2221                         // attempting to handle pause requests.
2222                         if let Err(e) = clear_signal(SIGRTMIN() + 0) {
2223                             error!("failed to clear pending signal: {}", e);
2224                             break;
2225                         }
2226                     } else {
2227                         vcpu.set_immediate_exit(false);
2228                     }
2229                 }
2230 
2231                 if let Err(e) = irq_chip.inject_interrupts(&vcpu) {
2232                     error!("failed to inject interrupts for vcpu {}: {}", cpu_id, e);
2233                 }
2234             }
2235         })
2236         .map_err(Error::SpawnVcpu)
2237 }
2238 
// Reads the contents of a file and converts the whitespace-separated fields into a Vec of i64s.
//
// The whole file is read, so files of any length are handled. An empty (or all-whitespace)
// file yields an empty Vec.
//
// Returns an error if the file cannot be opened or read, if its contents are not valid UTF-8
// (`InvalidData`), or if any of the fields fail to parse as an i64 (`InvalidData`).
fn file_fields_to_i64<P: AsRef<Path>>(path: P) -> io::Result<Vec<i64>> {
    let mut file = File::open(path)?;

    // A single fixed-size `read` would silently truncate longer files and could cut a number
    // in half, so read until EOF instead. `read_to_string` reports invalid UTF-8 as
    // `InvalidData`.
    let mut content = String::new();
    file.read_to_string(&mut content)?;

    // `split_whitespace` already skips leading and trailing whitespace.
    content
        .split_whitespace()
        .map(|field| {
            field
                .parse::<i64>()
                .map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e))
        })
        .collect()
}
2258 
2259 // Reads the contents of a file and converts them into a u64, and if there
2260 // are multiple fields it only returns the first one.
file_to_i64<P: AsRef<Path>>(path: P, nth: usize) -> io::Result<i64>2261 fn file_to_i64<P: AsRef<Path>>(path: P, nth: usize) -> io::Result<i64> {
2262     file_fields_to_i64(path)?
2263         .into_iter()
2264         .nth(nth)
2265         .ok_or_else(|| io::Error::new(io::ErrorKind::InvalidData, "empty file"))
2266 }
2267 
create_kvm_kernel_irq_chip( vm: &KvmVm, vcpu_count: usize, _ioapic_device_tube: Tube, ) -> base::Result<impl IrqChipArch>2268 fn create_kvm_kernel_irq_chip(
2269     vm: &KvmVm,
2270     vcpu_count: usize,
2271     _ioapic_device_tube: Tube,
2272 ) -> base::Result<impl IrqChipArch> {
2273     let irq_chip = KvmKernelIrqChip::new(vm.try_clone()?, vcpu_count)?;
2274     Ok(irq_chip)
2275 }
2276 
2277 #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
create_kvm_split_irq_chip( vm: &KvmVm, vcpu_count: usize, ioapic_device_tube: Tube, ) -> base::Result<impl IrqChipArch>2278 fn create_kvm_split_irq_chip(
2279     vm: &KvmVm,
2280     vcpu_count: usize,
2281     ioapic_device_tube: Tube,
2282 ) -> base::Result<impl IrqChipArch> {
2283     let irq_chip =
2284         KvmSplitIrqChip::new(vm.try_clone()?, vcpu_count, ioapic_device_tube, Some(120))?;
2285     Ok(irq_chip)
2286 }
2287 
run_config(cfg: Config) -> Result<()>2288 pub fn run_config(cfg: Config) -> Result<()> {
2289     let components = setup_vm_components(&cfg)?;
2290 
2291     let guest_mem_layout =
2292         Arch::guest_memory_layout(&components).map_err(Error::GuestMemoryLayout)?;
2293     let guest_mem = GuestMemory::new(&guest_mem_layout).unwrap();
2294     let mut mem_policy = MemoryPolicy::empty();
2295     if components.hugepages {
2296         mem_policy |= MemoryPolicy::USE_HUGEPAGES;
2297     }
2298     guest_mem.set_memory_policy(mem_policy);
2299     let kvm = Kvm::new_with_path(&cfg.kvm_device_path).map_err(Error::CreateKvm)?;
2300     let vm = KvmVm::new(&kvm, guest_mem).map_err(Error::CreateVm)?;
2301 
2302     if cfg.split_irqchip {
2303         #[cfg(any(target_arch = "arm", target_arch = "aarch64"))]
2304         {
2305             unimplemented!("KVM split irqchip mode only supported on x86 processors")
2306         }
2307 
2308         #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
2309         {
2310             run_vm::<KvmVcpu, _, _, _>(cfg, components, vm, create_kvm_split_irq_chip)
2311         }
2312     } else {
2313         run_vm::<KvmVcpu, _, _, _>(cfg, components, vm, create_kvm_kernel_irq_chip)
2314     }
2315 }
2316 
setup_vm_components(cfg: &Config) -> Result<VmComponents>2317 fn setup_vm_components(cfg: &Config) -> Result<VmComponents> {
2318     let initrd_image = if let Some(initrd_path) = &cfg.initrd_path {
2319         Some(File::open(initrd_path).map_err(|e| Error::OpenInitrd(initrd_path.clone(), e))?)
2320     } else {
2321         None
2322     };
2323 
2324     let vm_image = match cfg.executable_path {
2325         Some(Executable::Kernel(ref kernel_path)) => VmImage::Kernel(
2326             File::open(kernel_path).map_err(|e| Error::OpenKernel(kernel_path.to_path_buf(), e))?,
2327         ),
2328         Some(Executable::Bios(ref bios_path)) => VmImage::Bios(
2329             File::open(bios_path).map_err(|e| Error::OpenBios(bios_path.to_path_buf(), e))?,
2330         ),
2331         _ => panic!("Did not receive a bios or kernel, should be impossible."),
2332     };
2333 
2334     Ok(VmComponents {
2335         memory_size: cfg
2336             .memory
2337             .unwrap_or(256)
2338             .checked_mul(1024 * 1024)
2339             .ok_or(Error::MemoryTooLarge)?,
2340         vcpu_count: cfg.vcpu_count.unwrap_or(1),
2341         vcpu_affinity: cfg.vcpu_affinity.clone(),
2342         no_smt: cfg.no_smt,
2343         hugepages: cfg.hugepages,
2344         vm_image,
2345         android_fstab: cfg
2346             .android_fstab
2347             .as_ref()
2348             .map(|x| File::open(x).map_err(|e| Error::OpenAndroidFstab(x.to_path_buf(), e)))
2349             .map_or(Ok(None), |v| v.map(Some))?,
2350         pstore: cfg.pstore.clone(),
2351         initrd_image,
2352         extra_kernel_params: cfg.params.clone(),
2353         wayland_dmabuf: cfg.wayland_dmabuf,
2354         acpi_sdts: cfg
2355             .acpi_tables
2356             .iter()
2357             .map(|path| SDT::from_file(path).map_err(|e| Error::OpenAcpiTable(path.clone(), e)))
2358             .collect::<Result<Vec<SDT>>>()?,
2359         rt_cpus: cfg.rt_cpus.clone(),
2360         protected_vm: cfg.protected_vm,
2361         #[cfg(all(target_arch = "x86_64", feature = "gdb"))]
2362         gdb: None,
2363         dmi_path: cfg.dmi_path.clone(),
2364     })
2365 }
2366 
run_vm<Vcpu, V, I, FI>( cfg: Config, #[allow(unused_mut)] mut components: VmComponents, vm: V, create_irq_chip: FI, ) -> Result<()> where Vcpu: VcpuArch + 'static, V: VmArch + 'static, I: IrqChipArch + 'static, FI: FnOnce( &V, usize, Tube, ) -> base::Result<I>,2367 fn run_vm<Vcpu, V, I, FI>(
2368     cfg: Config,
2369     #[allow(unused_mut)] mut components: VmComponents,
2370     vm: V,
2371     create_irq_chip: FI,
2372 ) -> Result<()>
2373 where
2374     Vcpu: VcpuArch + 'static,
2375     V: VmArch + 'static,
2376     I: IrqChipArch + 'static,
2377     FI: FnOnce(
2378         &V,
2379         usize, // vcpu_count
2380         Tube,  // ioapic_device_tube
2381     ) -> base::Result<I>,
2382 {
2383     if cfg.sandbox {
2384         // Printing something to the syslog before entering minijail so that libc's syslogger has a
2385         // chance to open files necessary for its operation, like `/etc/localtime`. After jailing,
2386         // access to those files will not be possible.
2387         info!("crosvm entering multiprocess mode");
2388     }
2389 
2390     let (usb_control_tube, usb_provider) =
2391         HostBackendDeviceProvider::new().map_err(Error::CreateUsbProvider)?;
2392     // Masking signals is inherently dangerous, since this can persist across clones/execs. Do this
2393     // before any jailed devices have been spawned, so that we can catch any of them that fail very
2394     // quickly.
2395     let sigchld_fd = SignalFd::new(libc::SIGCHLD).map_err(Error::CreateSignalFd)?;
2396 
2397     let control_server_socket = match &cfg.socket_path {
2398         Some(path) => Some(UnlinkUnixSeqpacketListener(
2399             UnixSeqpacketListener::bind(path).map_err(Error::CreateControlServer)?,
2400         )),
2401         None => None,
2402     };
2403 
2404     let mut control_tubes = Vec::new();
2405 
2406     #[cfg(all(target_arch = "x86_64", feature = "gdb"))]
2407     if let Some(port) = cfg.gdb {
2408         // GDB needs a control socket to interrupt vcpus.
2409         let (gdb_host_tube, gdb_control_tube) = Tube::pair().map_err(Error::CreateTube)?;
2410         control_tubes.push(TaggedControlTube::Vm(gdb_host_tube));
2411         components.gdb = Some((port, gdb_control_tube));
2412     }
2413 
2414     let (wayland_host_tube, wayland_device_tube) = Tube::pair().map_err(Error::CreateTube)?;
2415     control_tubes.push(TaggedControlTube::VmMemory(wayland_host_tube));
2416     // Balloon gets a special socket so balloon requests can be forwarded from the main process.
2417     let (balloon_host_tube, balloon_device_tube) = Tube::pair().map_err(Error::CreateTube)?;
2418 
2419     // Create one control socket per disk.
2420     let mut disk_device_tubes = Vec::new();
2421     let mut disk_host_tubes = Vec::new();
2422     let disk_count = cfg.disks.len();
2423     for _ in 0..disk_count {
2424         let (disk_host_tub, disk_device_tube) = Tube::pair().map_err(Error::CreateTube)?;
2425         disk_host_tubes.push(disk_host_tub);
2426         disk_device_tubes.push(disk_device_tube);
2427     }
2428 
2429     let mut pmem_device_tubes = Vec::new();
2430     let pmem_count = cfg.pmem_devices.len();
2431     for _ in 0..pmem_count {
2432         let (pmem_host_tube, pmem_device_tube) = Tube::pair().map_err(Error::CreateTube)?;
2433         pmem_device_tubes.push(pmem_device_tube);
2434         control_tubes.push(TaggedControlTube::VmMsync(pmem_host_tube));
2435     }
2436 
2437     let (gpu_host_tube, gpu_device_tube) = Tube::pair().map_err(Error::CreateTube)?;
2438     control_tubes.push(TaggedControlTube::VmMemory(gpu_host_tube));
2439 
2440     let (ioapic_host_tube, ioapic_device_tube) = Tube::pair().map_err(Error::CreateTube)?;
2441     control_tubes.push(TaggedControlTube::VmIrq(ioapic_host_tube));
2442 
2443     let battery = if cfg.battery_type.is_some() {
2444         let jail = match simple_jail(&cfg, "battery")? {
2445             #[cfg_attr(not(feature = "powerd-monitor-powerd"), allow(unused_mut))]
2446             Some(mut jail) => {
2447                 // Setup a bind mount to the system D-Bus socket if the powerd monitor is used.
2448                 #[cfg(feature = "power-monitor-powerd")]
2449                 {
2450                     add_crosvm_user_to_jail(&mut jail, "battery")?;
2451 
2452                     // Create a tmpfs in the device's root directory so that we can bind mount files.
2453                     jail.mount_with_data(
2454                         Path::new("none"),
2455                         Path::new("/"),
2456                         "tmpfs",
2457                         (libc::MS_NOSUID | libc::MS_NODEV | libc::MS_NOEXEC) as usize,
2458                         "size=67108864",
2459                     )?;
2460 
2461                     let system_bus_socket_path = Path::new("/run/dbus/system_bus_socket");
2462                     jail.mount_bind(system_bus_socket_path, system_bus_socket_path, true)?;
2463                 }
2464                 Some(jail)
2465             }
2466             None => None,
2467         };
2468         (&cfg.battery_type, jail)
2469     } else {
2470         (&cfg.battery_type, None)
2471     };
2472 
2473     let gralloc = RutabagaGralloc::new().map_err(Error::CreateGrallocError)?;
2474     let map_request: Arc<Mutex<Option<ExternalMapping>>> = Arc::new(Mutex::new(None));
2475 
2476     let fs_count = cfg
2477         .shared_dirs
2478         .iter()
2479         .filter(|sd| sd.kind == SharedDirKind::FS)
2480         .count();
2481     let mut fs_device_tubes = Vec::with_capacity(fs_count);
2482     for _ in 0..fs_count {
2483         let (fs_host_tube, fs_device_tube) = Tube::pair().map_err(Error::CreateTube)?;
2484         control_tubes.push(TaggedControlTube::Fs(fs_host_tube));
2485         fs_device_tubes.push(fs_device_tube);
2486     }
2487 
2488     #[cfg_attr(not(feature = "direct"), allow(unused_mut))]
2489     let mut linux: RunnableLinuxVm<_, Vcpu, _> = Arch::build_vm(
2490         components,
2491         &cfg.serial_parameters,
2492         simple_jail(&cfg, "serial")?,
2493         battery,
2494         vm,
2495         |mem, vm, sys_allocator, exit_evt| {
2496             create_devices(
2497                 &cfg,
2498                 mem,
2499                 vm,
2500                 sys_allocator,
2501                 exit_evt,
2502                 &mut control_tubes,
2503                 wayland_device_tube,
2504                 gpu_device_tube,
2505                 balloon_device_tube,
2506                 &mut disk_device_tubes,
2507                 &mut pmem_device_tubes,
2508                 &mut fs_device_tubes,
2509                 usb_provider,
2510                 Arc::clone(&map_request),
2511             )
2512         },
2513         |vm, vcpu_count| create_irq_chip(vm, vcpu_count, ioapic_device_tube),
2514     )
2515     .map_err(Error::BuildVm)?;
2516 
2517     #[cfg(feature = "direct")]
2518     if let Some(pmio) = &cfg.direct_pmio {
2519         let direct_io =
2520             Arc::new(devices::DirectIo::new(&pmio.path, false).map_err(Error::DirectIo)?);
2521         for range in pmio.ranges.iter() {
2522             linux
2523                 .io_bus
2524                 .insert_sync(direct_io.clone(), range.0, range.1)
2525                 .unwrap();
2526         }
2527     };
2528 
2529     #[cfg(feature = "direct")]
2530     let mut irqs = Vec::new();
2531 
2532     #[cfg(feature = "direct")]
2533     for irq in &cfg.direct_level_irq {
2534         if !linux.resources.reserve_irq(*irq) {
2535             warn!("irq {} already reserved.", irq);
2536         }
2537         let trigger = Event::new().map_err(Error::CreateEvent)?;
2538         let resample = Event::new().map_err(Error::CreateEvent)?;
2539         linux
2540             .irq_chip
2541             .register_irq_event(*irq, &trigger, Some(&resample))
2542             .unwrap();
2543         let direct_irq =
2544             devices::DirectIrq::new(trigger, Some(resample)).map_err(Error::DirectIrq)?;
2545         direct_irq.irq_enable(*irq).map_err(Error::DirectIrq)?;
2546         irqs.push(direct_irq);
2547     }
2548 
2549     #[cfg(feature = "direct")]
2550     for irq in &cfg.direct_edge_irq {
2551         if !linux.resources.reserve_irq(*irq) {
2552             warn!("irq {} already reserved.", irq);
2553         }
2554         let trigger = Event::new().map_err(Error::CreateEvent)?;
2555         linux
2556             .irq_chip
2557             .register_irq_event(*irq, &trigger, None)
2558             .unwrap();
2559         let direct_irq = devices::DirectIrq::new(trigger, None).map_err(Error::DirectIrq)?;
2560         direct_irq.irq_enable(*irq).map_err(Error::DirectIrq)?;
2561         irqs.push(direct_irq);
2562     }
2563 
2564     run_control(
2565         linux,
2566         control_server_socket,
2567         control_tubes,
2568         balloon_host_tube,
2569         &disk_host_tubes,
2570         usb_control_tube,
2571         sigchld_fd,
2572         cfg.sandbox,
2573         Arc::clone(&map_request),
2574         cfg.balloon_bias,
2575         gralloc,
2576     )
2577 }
2578 
2579 /// Signals all running VCPUs to vmexit, sends VmRunMode message to each VCPU tube, and tells
2580 /// `irq_chip` to stop blocking halted VCPUs. The tube message is set first because both the
2581 /// signal and the irq_chip kick could cause the VCPU thread to continue through the VCPU run
2582 /// loop.
kick_all_vcpus( vcpu_handles: &[(JoinHandle<()>, mpsc::Sender<vm_control::VcpuControl>)], irq_chip: &impl IrqChip, run_mode: &VmRunMode, )2583 fn kick_all_vcpus(
2584     vcpu_handles: &[(JoinHandle<()>, mpsc::Sender<vm_control::VcpuControl>)],
2585     irq_chip: &impl IrqChip,
2586     run_mode: &VmRunMode,
2587 ) {
2588     for (handle, tube) in vcpu_handles {
2589         if let Err(e) = tube.send(VcpuControl::RunState(run_mode.clone())) {
2590             error!("failed to send VmRunMode: {}", e);
2591         }
2592         let _ = handle.kill(SIGRTMIN() + 0);
2593     }
2594     irq_chip.kick_halted_vcpus();
2595 }
2596 
// BalloonPolicy holds the state used to determine the size to set the balloon.
// The sizing heuristics are described on the `impl` block.
struct BalloonPolicy {
    // Estimate, in bytes, for when the guest starts aggressively freeing memory.
    critical_guest_available: i64,
    critical_host_available: i64, // ChromeOS critical margin, in bytes.
    // Scaled by how full the balloon is and added to the guest critical margin.
    // NOTE(review): presumably in bytes, like the other margins -- confirm against the caller.
    guest_available_bias: i64,
    max_balloon_actual: i64, // The largest the balloon has ever been observed, in bytes.
    prev_balloon_full_percent: i64, // How full was the balloon at the previous timestep.
    prev_guest_available: i64, // Available memory in the guest at the previous timestep.
}
2607 
// Byte-count multipliers used by the balloon sizing arithmetic and its log messages.
const ONE_KB: i64 = 1024;
const ONE_MB: i64 = 1024 * ONE_KB;

// Host file read on every balloon update; its first field is treated as the host's available
// memory in MB.
const LOWMEM_AVAILABLE: &str = "/sys/kernel/mm/chromeos-low_mem/available";
// Host file read once at startup; its first field is treated as the host critical margin in
// MB. If it cannot be read, balloon balancing is disabled.
const LOWMEM_MARGIN: &str = "/sys/kernel/mm/chromeos-low_mem/margin";
2613 
2614 // BalloonPolicy implements the virtio balloon sizing logic.
2615 // The balloon is sized with the following heuristics:
2616 //   Balance Available
2617 //     The balloon is sized to balance the amount of available memory above a
2618 //     critical margin. The critical margin is the level at which memory is
2619 //     freed. In the host, this is the ChromeOS available critical margin, which
2620 //     is the trigger to kill tabs. In the guest, we estimate this level by
2621 //     tracking the minimum amount of available memory, discounting sharp
2622 //     'valleys'. If the guest manages to keep available memory above a given
2623 //     level even with some pressure, then we determine that this is the
2624 //     'critical' level for the guest. We don't update this critical value if
2625 //     the balloon is fully inflated because in that case, the guest may be out
2626 //     of memory to free.
2627 //   guest_available_bias
2628 //     Even if available memory is perfectly balanced between host and guest,
2629 //     The size of the balloon will still drift randomly depending on whether
2630 //     those host or guest reclaims memory first/faster every time memory is
2631 //     low. To encourage large balloons to shrink and small balloons to grow,
2632 //     the following bias is added to the guest critical margin:
2633 //         (guest_available_bias * balloon_full_percent) / 100
2634 //     This give the guest more memory when the balloon is full.
2635 impl BalloonPolicy {
new( memory_size: i64, critical_host_available: i64, guest_available_bias: i64, ) -> BalloonPolicy2636     fn new(
2637         memory_size: i64,
2638         critical_host_available: i64,
2639         guest_available_bias: i64,
2640     ) -> BalloonPolicy {
2641         // Estimate some reasonable initial maximum for balloon size.
2642         let max_balloon_actual = (memory_size * 3) / 4;
2643         // 400MB is above the zone min margin even for Crostini VMs on 16GB
2644         // devices (~85MB), and is above when Android Low Memory Killer kills
2645         // apps (~250MB).
2646         let critical_guest_available = 400 * ONE_MB;
2647 
2648         BalloonPolicy {
2649             critical_guest_available,
2650             critical_host_available,
2651             guest_available_bias,
2652             max_balloon_actual,
2653             prev_balloon_full_percent: 0,
2654             prev_guest_available: 0,
2655         }
2656     }
delta(&mut self, stats: BalloonStats, balloon_actual_u: u64) -> Result<i64>2657     fn delta(&mut self, stats: BalloonStats, balloon_actual_u: u64) -> Result<i64> {
2658         let guest_free = stats
2659             .free_memory
2660             .map(i64::try_from)
2661             .ok_or(Error::GuestFreeMissing())?
2662             .map_err(Error::GuestFreeTooLarge)?;
2663         let guest_cached = stats
2664             .disk_caches
2665             .map(i64::try_from)
2666             .ok_or(Error::GuestFreeMissing())?
2667             .map_err(Error::GuestFreeTooLarge)?;
2668         let balloon_actual = match balloon_actual_u {
2669             size if size < i64::max_value() as u64 => size as i64,
2670             _ => return Err(Error::BalloonActualTooLarge),
2671         };
2672         let guest_available = guest_free + guest_cached;
2673         // Available memory is reported in MB, and we need bytes.
2674         let host_available =
2675             file_to_i64(LOWMEM_AVAILABLE, 0).map_err(Error::ReadMemAvailable)? * ONE_MB;
2676         if self.max_balloon_actual < balloon_actual {
2677             self.max_balloon_actual = balloon_actual;
2678             info!(
2679                 "balloon updated max_balloon_actual to {} MiB",
2680                 self.max_balloon_actual / ONE_MB,
2681             );
2682         }
2683         let balloon_full_percent = balloon_actual * 100 / self.max_balloon_actual;
2684         // Update critical_guest_available if we see a lower available with the
2685         // balloon not fully inflated. If the balloon is completely inflated
2686         // there is a risk that the low available level we see comes at the cost
2687         // of stability. The Linux OOM Killer might have been forced to kill
2688         // something important, or page reclaim was so aggressive that there are
2689         // long UI hangs.
2690         if guest_available < self.critical_guest_available && balloon_full_percent < 95 {
2691             // To ignore temporary low memory states, we require that two guest
2692             // available measurements in a row are low.
2693             if self.prev_guest_available < self.critical_guest_available
2694                 && self.prev_balloon_full_percent < 95
2695             {
2696                 self.critical_guest_available = self.prev_guest_available;
2697                 info!(
2698                     "balloon updated critical_guest_available to {} MiB",
2699                     self.critical_guest_available / ONE_MB,
2700                 );
2701             }
2702         }
2703 
2704         // Compute the difference in available memory above the host and guest
2705         // critical thresholds.
2706         let bias = (self.guest_available_bias * balloon_full_percent) / 100;
2707         let guest_above_critical = guest_available - self.critical_guest_available - bias;
2708         let host_above_critical = host_available - self.critical_host_available;
2709         let balloon_delta = guest_above_critical - host_above_critical;
2710         // Only let the balloon take up MAX_CRITICAL_DELTA of available memory
2711         // below the critical level in host or guest.
2712         const MAX_CRITICAL_DELTA: i64 = 10 * ONE_MB;
2713         let balloon_delta_capped = if balloon_delta < 0 {
2714             // The balloon is deflating, taking memory from the host. Don't let
2715             // it take more than the amount of available memory above the
2716             // critical margin, plus MAX_CRITICAL_DELTA.
2717             max(
2718                 balloon_delta,
2719                 -(host_available - self.critical_host_available + MAX_CRITICAL_DELTA),
2720             )
2721         } else {
2722             // The balloon is inflating, taking memory from the guest. Don't let
2723             // it take more than the amount of available memory above the
2724             // critical margin, plus MAX_CRITICAL_DELTA.
2725             min(
2726                 balloon_delta,
2727                 guest_available - self.critical_guest_available + MAX_CRITICAL_DELTA,
2728             )
2729         };
2730 
2731         self.prev_balloon_full_percent = balloon_full_percent;
2732         self.prev_guest_available = guest_available;
2733 
2734         // Only return a value if target would change available above critical
2735         // by more than 1%, or we are within 1 MB of critical in host or guest.
2736         if guest_above_critical < ONE_MB
2737             || host_above_critical < ONE_MB
2738             || (balloon_delta.abs() * 100) / guest_above_critical > 1
2739             || (balloon_delta.abs() * 100) / host_above_critical > 1
2740         {
2741             // Finally, make sure the balloon delta won't cause a negative size.
2742             let result = max(balloon_delta_capped, -balloon_actual);
2743             if result != 0 {
2744                 info!(
2745                     "balloon delta={:<6} ha={:<6} hc={:<6} ga={:<6} gc={:<6} bias={:<6} full={:>3}%",
2746                     result / ONE_MB,
2747                     host_available / ONE_MB,
2748                     self.critical_host_available / ONE_MB,
2749                     guest_available / ONE_MB,
2750                     self.critical_guest_available / ONE_MB,
2751                     bias / ONE_MB,
2752                     balloon_full_percent,
2753                 );
2754             }
2755             return Ok(result);
2756         }
2757         Ok(0)
2758     }
2759 }
2760 
run_control<V: VmArch + 'static, Vcpu: VcpuArch + 'static, I: IrqChipArch + 'static>( mut linux: RunnableLinuxVm<V, Vcpu, I>, control_server_socket: Option<UnlinkUnixSeqpacketListener>, mut control_tubes: Vec<TaggedControlTube>, balloon_host_tube: Tube, disk_host_tubes: &[Tube], usb_control_tube: Tube, sigchld_fd: SignalFd, sandbox: bool, map_request: Arc<Mutex<Option<ExternalMapping>>>, balloon_bias: i64, mut gralloc: RutabagaGralloc, ) -> Result<()>2761 fn run_control<V: VmArch + 'static, Vcpu: VcpuArch + 'static, I: IrqChipArch + 'static>(
2762     mut linux: RunnableLinuxVm<V, Vcpu, I>,
2763     control_server_socket: Option<UnlinkUnixSeqpacketListener>,
2764     mut control_tubes: Vec<TaggedControlTube>,
2765     balloon_host_tube: Tube,
2766     disk_host_tubes: &[Tube],
2767     usb_control_tube: Tube,
2768     sigchld_fd: SignalFd,
2769     sandbox: bool,
2770     map_request: Arc<Mutex<Option<ExternalMapping>>>,
2771     balloon_bias: i64,
2772     mut gralloc: RutabagaGralloc,
2773 ) -> Result<()> {
2774     #[derive(PollToken)]
2775     enum Token {
2776         Exit,
2777         Suspend,
2778         ChildSignal,
2779         IrqFd { index: IrqEventIndex },
2780         BalanceMemory,
2781         BalloonResult,
2782         VmControlServer,
2783         VmControl { index: usize },
2784     }
2785 
2786     stdin()
2787         .set_raw_mode()
2788         .expect("failed to set terminal raw mode");
2789 
2790     let wait_ctx = WaitContext::build_with(&[
2791         (&linux.exit_evt, Token::Exit),
2792         (&linux.suspend_evt, Token::Suspend),
2793         (&sigchld_fd, Token::ChildSignal),
2794     ])
2795     .map_err(Error::WaitContextAdd)?;
2796 
2797     if let Some(socket_server) = &control_server_socket {
2798         wait_ctx
2799             .add(socket_server, Token::VmControlServer)
2800             .map_err(Error::WaitContextAdd)?;
2801     }
2802     for (index, socket) in control_tubes.iter().enumerate() {
2803         wait_ctx
2804             .add(socket.as_ref(), Token::VmControl { index })
2805             .map_err(Error::WaitContextAdd)?;
2806     }
2807 
2808     let events = linux
2809         .irq_chip
2810         .irq_event_tokens()
2811         .map_err(Error::WaitContextAdd)?;
2812 
2813     for (index, _gsi, evt) in events {
2814         wait_ctx
2815             .add(&evt, Token::IrqFd { index })
2816             .map_err(Error::WaitContextAdd)?;
2817     }
2818 
2819     // Balance available memory between guest and host every second.
2820     let mut balancemem_timer = Timer::new().map_err(Error::CreateTimer)?;
2821     let mut balloon_policy = if let Ok(critical_margin) = file_to_i64(LOWMEM_MARGIN, 0) {
2822         // Create timer request balloon stats every 1s.
2823         wait_ctx
2824             .add(&balancemem_timer, Token::BalanceMemory)
2825             .map_err(Error::WaitContextAdd)?;
2826         let balancemem_dur = Duration::from_secs(1);
2827         let balancemem_int = Duration::from_secs(1);
2828         balancemem_timer
2829             .reset(balancemem_dur, Some(balancemem_int))
2830             .map_err(Error::ResetTimer)?;
2831 
2832         // Listen for balloon statistics from the guest so we can balance.
2833         wait_ctx
2834             .add(&balloon_host_tube, Token::BalloonResult)
2835             .map_err(Error::WaitContextAdd)?;
2836         Some(BalloonPolicy::new(
2837             linux.vm.get_memory().memory_size() as i64,
2838             critical_margin * ONE_MB,
2839             balloon_bias,
2840         ))
2841     } else {
2842         warn!("Unable to open low mem margin, maybe not a chrome os kernel");
2843         None
2844     };
2845 
2846     if sandbox {
2847         // Before starting VCPUs, in case we started with some capabilities, drop them all.
2848         drop_capabilities().map_err(Error::DropCapabilities)?;
2849     }
2850 
2851     #[cfg(all(target_arch = "x86_64", feature = "gdb"))]
2852     // Create a channel for GDB thread.
2853     let (to_gdb_channel, from_vcpu_channel) = if linux.gdb.is_some() {
2854         let (s, r) = mpsc::channel();
2855         (Some(s), Some(r))
2856     } else {
2857         (None, None)
2858     };
2859 
2860     let mut vcpu_handles = Vec::with_capacity(linux.vcpu_count);
2861     let vcpu_thread_barrier = Arc::new(Barrier::new(linux.vcpu_count + 1));
2862     let use_hypervisor_signals = !linux
2863         .vm
2864         .get_hypervisor()
2865         .check_capability(&HypervisorCap::ImmediateExit);
2866     setup_vcpu_signal_handler::<Vcpu>(use_hypervisor_signals)?;
2867 
2868     let vcpus: Vec<Option<_>> = match linux.vcpus.take() {
2869         Some(vec) => vec.into_iter().map(Some).collect(),
2870         None => iter::repeat_with(|| None).take(linux.vcpu_count).collect(),
2871     };
2872     for (cpu_id, vcpu) in vcpus.into_iter().enumerate() {
2873         let (to_vcpu_channel, from_main_channel) = mpsc::channel();
2874         let vcpu_affinity = match linux.vcpu_affinity.clone() {
2875             Some(VcpuAffinity::Global(v)) => v,
2876             Some(VcpuAffinity::PerVcpu(mut m)) => m.remove(&cpu_id).unwrap_or_default(),
2877             None => Default::default(),
2878         };
2879         let handle = run_vcpu(
2880             cpu_id,
2881             vcpu,
2882             linux.vm.try_clone().map_err(Error::CloneEvent)?,
2883             linux.irq_chip.try_clone().map_err(Error::CloneEvent)?,
2884             linux.vcpu_count,
2885             linux.rt_cpus.contains(&cpu_id),
2886             vcpu_affinity,
2887             linux.no_smt,
2888             vcpu_thread_barrier.clone(),
2889             linux.has_bios,
2890             linux.io_bus.clone(),
2891             linux.mmio_bus.clone(),
2892             linux.exit_evt.try_clone().map_err(Error::CloneEvent)?,
2893             linux.vm.check_capability(VmCap::PvClockSuspend),
2894             from_main_channel,
2895             use_hypervisor_signals,
2896             #[cfg(all(target_arch = "x86_64", feature = "gdb"))]
2897             to_gdb_channel.clone(),
2898         )?;
2899         vcpu_handles.push((handle, to_vcpu_channel));
2900     }
2901 
2902     #[cfg(all(target_arch = "x86_64", feature = "gdb"))]
2903     // Spawn GDB thread.
2904     if let Some((gdb_port_num, gdb_control_tube)) = linux.gdb.take() {
2905         let to_vcpu_channels = vcpu_handles
2906             .iter()
2907             .map(|(_handle, channel)| channel.clone())
2908             .collect();
2909         let target = GdbStub::new(
2910             gdb_control_tube,
2911             to_vcpu_channels,
2912             from_vcpu_channel.unwrap(), // Must succeed to unwrap()
2913         );
2914         thread::Builder::new()
2915             .name("gdb".to_owned())
2916             .spawn(move || gdb_thread(target, gdb_port_num))
2917             .map_err(Error::SpawnGdbServer)?;
2918     };
2919 
2920     vcpu_thread_barrier.wait();
2921 
2922     'wait: loop {
2923         let events = {
2924             match wait_ctx.wait() {
2925                 Ok(v) => v,
2926                 Err(e) => {
2927                     error!("failed to poll: {}", e);
2928                     break;
2929                 }
2930             }
2931         };
2932 
2933         if let Err(e) = linux.irq_chip.process_delayed_irq_events() {
2934             warn!("can't deliver delayed irqs: {}", e);
2935         }
2936 
2937         let mut vm_control_indices_to_remove = Vec::new();
2938         for event in events.iter().filter(|e| e.is_readable) {
2939             match event.token {
2940                 Token::Exit => {
2941                     info!("vcpu requested shutdown");
2942                     break 'wait;
2943                 }
2944                 Token::Suspend => {
2945                     info!("VM requested suspend");
2946                     linux.suspend_evt.read().unwrap();
2947                     kick_all_vcpus(&vcpu_handles, &linux.irq_chip, &VmRunMode::Suspending);
2948                 }
2949                 Token::ChildSignal => {
2950                     // Print all available siginfo structs, then exit the loop.
2951                     while let Some(siginfo) = sigchld_fd.read().map_err(Error::SignalFd)? {
2952                         let pid = siginfo.ssi_pid;
2953                         let pid_label = match linux.pid_debug_label_map.get(&pid) {
2954                             Some(label) => format!("{} (pid {})", label, pid),
2955                             None => format!("pid {}", pid),
2956                         };
2957                         error!(
2958                             "child {} died: signo {}, status {}, code {}",
2959                             pid_label, siginfo.ssi_signo, siginfo.ssi_status, siginfo.ssi_code
2960                         );
2961                     }
2962                     break 'wait;
2963                 }
2964                 Token::IrqFd { index } => {
2965                     if let Err(e) = linux.irq_chip.service_irq_event(index) {
2966                         error!("failed to signal irq {}: {}", index, e);
2967                     }
2968                 }
2969                 Token::BalanceMemory => {
2970                     balancemem_timer.wait().map_err(Error::Timer)?;
2971                     let command = BalloonControlCommand::Stats {};
2972                     if let Err(e) = balloon_host_tube.send(&command) {
2973                         warn!("failed to send stats request to balloon device: {}", e);
2974                     }
2975                 }
2976                 Token::BalloonResult => {
2977                     match balloon_host_tube.recv() {
2978                         Ok(BalloonControlResult::Stats {
2979                             stats,
2980                             balloon_actual: balloon_actual_u,
2981                         }) => {
2982                             match balloon_policy
2983                                 .as_mut()
2984                                 .map(|p| p.delta(stats, balloon_actual_u))
2985                             {
2986                                 None => {
2987                                     error!(
2988                                         "got result from balloon stats, but no policy is running"
2989                                     );
2990                                 }
2991                                 Some(Err(e)) => {
2992                                     warn!("failed to run balloon policy {}", e);
2993                                 }
2994                                 Some(Ok(delta)) if delta != 0 => {
2995                                     let target = max((balloon_actual_u as i64) + delta, 0) as u64;
2996                                     let command =
2997                                         BalloonControlCommand::Adjust { num_bytes: target };
2998                                     if let Err(e) = balloon_host_tube.send(&command) {
2999                                         warn!(
3000                                             "failed to send memory value to balloon device: {}",
3001                                             e
3002                                         );
3003                                     }
3004                                 }
3005                                 Some(Ok(_)) => {}
3006                             }
3007                         }
3008                         Err(e) => {
3009                             error!("failed to recv BalloonControlResult: {}", e);
3010                         }
3011                     };
3012                 }
3013                 Token::VmControlServer => {
3014                     if let Some(socket_server) = &control_server_socket {
3015                         match socket_server.accept() {
3016                             Ok(socket) => {
3017                                 wait_ctx
3018                                     .add(
3019                                         &socket,
3020                                         Token::VmControl {
3021                                             index: control_tubes.len(),
3022                                         },
3023                                     )
3024                                     .map_err(Error::WaitContextAdd)?;
3025                                 control_tubes.push(TaggedControlTube::Vm(Tube::new(socket)));
3026                             }
3027                             Err(e) => error!("failed to accept socket: {}", e),
3028                         }
3029                     }
3030                 }
3031                 Token::VmControl { index } => {
3032                     if let Some(socket) = control_tubes.get(index) {
3033                         match socket {
3034                             TaggedControlTube::Vm(tube) => match tube.recv::<VmRequest>() {
3035                                 Ok(request) => {
3036                                     let mut run_mode_opt = None;
3037                                     let response = request.execute(
3038                                         &mut run_mode_opt,
3039                                         &balloon_host_tube,
3040                                         disk_host_tubes,
3041                                         &usb_control_tube,
3042                                         &mut linux.bat_control,
3043                                     );
3044                                     if let Err(e) = tube.send(&response) {
3045                                         error!("failed to send VmResponse: {}", e);
3046                                     }
3047                                     if let Some(run_mode) = run_mode_opt {
3048                                         info!("control socket changed run mode to {}", run_mode);
3049                                         match run_mode {
3050                                             VmRunMode::Exiting => {
3051                                                 break 'wait;
3052                                             }
3053                                             other => {
3054                                                 if other == VmRunMode::Running {
3055                                                     linux.io_bus.notify_resume();
3056                                                 }
3057                                                 kick_all_vcpus(
3058                                                     &vcpu_handles,
3059                                                     &linux.irq_chip,
3060                                                     &other,
3061                                                 );
3062                                             }
3063                                         }
3064                                     }
3065                                 }
3066                                 Err(e) => {
3067                                     if let TubeError::Disconnected = e {
3068                                         vm_control_indices_to_remove.push(index);
3069                                     } else {
3070                                         error!("failed to recv VmRequest: {}", e);
3071                                     }
3072                                 }
3073                             },
3074                             TaggedControlTube::VmMemory(tube) => {
3075                                 match tube.recv::<VmMemoryRequest>() {
3076                                     Ok(request) => {
3077                                         let response = request.execute(
3078                                             &mut linux.vm,
3079                                             &mut linux.resources,
3080                                             Arc::clone(&map_request),
3081                                             &mut gralloc,
3082                                         );
3083                                         if let Err(e) = tube.send(&response) {
3084                                             error!("failed to send VmMemoryControlResponse: {}", e);
3085                                         }
3086                                     }
3087                                     Err(e) => {
3088                                         if let TubeError::Disconnected = e {
3089                                             vm_control_indices_to_remove.push(index);
3090                                         } else {
3091                                             error!("failed to recv VmMemoryControlRequest: {}", e);
3092                                         }
3093                                     }
3094                                 }
3095                             }
3096                             TaggedControlTube::VmIrq(tube) => match tube.recv::<VmIrqRequest>() {
3097                                 Ok(request) => {
3098                                     let response = {
3099                                         let irq_chip = &mut linux.irq_chip;
3100                                         request.execute(
3101                                             |setup| match setup {
3102                                                 IrqSetup::Event(irq, ev) => {
3103                                                     if let Some(event_index) = irq_chip
3104                                                         .register_irq_event(irq, ev, None)?
3105                                                     {
3106                                                         match wait_ctx.add(
3107                                                             ev,
3108                                                             Token::IrqFd {
3109                                                                 index: event_index
3110                                                             },
3111                                                         ) {
3112                                                             Err(e) => {
3113                                                                 warn!("failed to add IrqFd to poll context: {}", e);
3114                                                                 Err(e)
3115                                                             },
3116                                                             Ok(_) => {
3117                                                                 Ok(())
3118                                                             }
3119                                                         }
3120                                                     } else {
3121                                                         Ok(())
3122                                                     }
3123                                                 }
3124                                                 IrqSetup::Route(route) => irq_chip.route_irq(route),
3125                                             },
3126                                             &mut linux.resources,
3127                                         )
3128                                     };
3129                                     if let Err(e) = tube.send(&response) {
3130                                         error!("failed to send VmIrqResponse: {}", e);
3131                                     }
3132                                 }
3133                                 Err(e) => {
3134                                     if let TubeError::Disconnected = e {
3135                                         vm_control_indices_to_remove.push(index);
3136                                     } else {
3137                                         error!("failed to recv VmIrqRequest: {}", e);
3138                                     }
3139                                 }
3140                             },
3141                             TaggedControlTube::VmMsync(tube) => {
3142                                 match tube.recv::<VmMsyncRequest>() {
3143                                     Ok(request) => {
3144                                         let response = request.execute(&mut linux.vm);
3145                                         if let Err(e) = tube.send(&response) {
3146                                             error!("failed to send VmMsyncResponse: {}", e);
3147                                         }
3148                                     }
3149                                     Err(e) => {
3150                                         if let TubeError::Disconnected = e {
3151                                             vm_control_indices_to_remove.push(index);
3152                                         } else {
3153                                             error!("failed to recv VmMsyncRequest: {}", e);
3154                                         }
3155                                     }
3156                                 }
3157                             }
3158                             TaggedControlTube::Fs(tube) => match tube.recv::<FsMappingRequest>() {
3159                                 Ok(request) => {
3160                                     let response =
3161                                         request.execute(&mut linux.vm, &mut linux.resources);
3162                                     if let Err(e) = tube.send(&response) {
3163                                         error!("failed to send VmResponse: {}", e);
3164                                     }
3165                                 }
3166                                 Err(e) => {
3167                                     if let TubeError::Disconnected = e {
3168                                         vm_control_indices_to_remove.push(index);
3169                                     } else {
3170                                         error!("failed to recv VmResponse: {}", e);
3171                                     }
3172                                 }
3173                             },
3174                         }
3175                     }
3176                 }
3177             }
3178         }
3179 
3180         for event in events.iter().filter(|e| e.is_hungup) {
3181             match event.token {
3182                 Token::Exit => {}
3183                 Token::Suspend => {}
3184                 Token::ChildSignal => {}
3185                 Token::IrqFd { index: _ } => {}
3186                 Token::BalanceMemory => {}
3187                 Token::BalloonResult => {}
3188                 Token::VmControlServer => {}
3189                 Token::VmControl { index } => {
3190                     // It's possible more data is readable and buffered while the socket is hungup,
3191                     // so don't delete the tube from the poll context until we're sure all the
3192                     // data is read.
3193                     if control_tubes
3194                         .get(index)
3195                         .map(|s| !s.as_ref().is_packet_ready())
3196                         .unwrap_or(false)
3197                     {
3198                         vm_control_indices_to_remove.push(index);
3199                     }
3200                 }
3201             }
3202         }
3203 
3204         // Sort in reverse so the highest indexes are removed first. This removal algorithm
3205         // preserves correct indexes as each element is removed.
3206         vm_control_indices_to_remove.sort_unstable_by_key(|&k| Reverse(k));
3207         vm_control_indices_to_remove.dedup();
3208         for index in vm_control_indices_to_remove {
3209             // Delete the socket from the `wait_ctx` synchronously. Otherwise, the kernel will do
3210             // this automatically when the FD inserted into the `wait_ctx` is closed after this
3211             // if-block, but this removal can be deferred unpredictably. In some instances where the
3212             // system is under heavy load, we can even get events returned by `wait_ctx` for an FD
3213             // that has already been closed. Because the token associated with that spurious event
3214             // now belongs to a different socket, the control loop will start to interact with
3215             // sockets that might not be ready to use. This can cause incorrect hangup detection or
3216             // blocking on a socket that will never be ready. See also: crbug.com/1019986
3217             if let Some(socket) = control_tubes.get(index) {
3218                 wait_ctx.delete(socket).map_err(Error::WaitContextDelete)?;
3219             }
3220 
3221             // This line implicitly drops the socket at `index` when it gets returned by
3222             // `swap_remove`. After this line, the socket at `index` is not the one from
3223             // `vm_control_indices_to_remove`. Because of this socket's change in index, we need to
3224             // use `wait_ctx.modify` to change the associated index in its `Token::VmControl`.
3225             control_tubes.swap_remove(index);
3226             if let Some(tube) = control_tubes.get(index) {
3227                 wait_ctx
3228                     .modify(tube, EventType::Read, Token::VmControl { index })
3229                     .map_err(Error::WaitContextAdd)?;
3230             }
3231         }
3232     }
3233 
3234     kick_all_vcpus(&vcpu_handles, &linux.irq_chip, &VmRunMode::Exiting);
3235     for (handle, _) in vcpu_handles {
3236         if let Err(e) = handle.join() {
3237             error!("failed to join vcpu thread: {:?}", e);
3238         }
3239     }
3240 
3241     // Explicitly drop the VM structure here to allow the devices to clean up before the
3242     // control sockets are closed when this function exits.
3243     mem::drop(linux);
3244 
3245     stdin()
3246         .set_canon_mode()
3247         .expect("failed to restore canonical mode for terminal");
3248 
3249     Ok(())
3250 }
3251