1 // Copyright 2018 The Chromium OS Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4
5 mod process;
6 mod vcpu;
7
8 use std::fmt::{self, Display};
9 use std::fs::File;
10 use std::io;
11 use std::os::unix::net::UnixDatagram;
12 use std::path::Path;
13 use std::result;
14 use std::sync::atomic::{AtomicBool, Ordering};
15 use std::sync::{Arc, Barrier};
16 use std::thread;
17 use std::time::{Duration, Instant};
18
19 use libc::{
20 c_int, c_ulong, fcntl, ioctl, socketpair, AF_UNIX, EAGAIN, EBADF, EDEADLK, EEXIST, EINTR,
21 EINVAL, ENOENT, EOVERFLOW, EPERM, FIOCLEX, F_SETPIPE_SZ, MS_NODEV, MS_NOEXEC, MS_NOSUID,
22 MS_RDONLY, SIGCHLD, SOCK_SEQPACKET,
23 };
24
25 use protobuf::ProtobufError;
26 use remain::sorted;
27
28 use base::{
29 block_signal, clear_signal, drop_capabilities, error, getegid, geteuid, info, pipe,
30 register_rt_signal_handler, validate_raw_descriptor, warn, AsRawDescriptor, Error as SysError,
31 Event, FromRawDescriptor, Killable, MmapError, PollToken, Result as SysResult, SignalFd,
32 SignalFdError, WaitContext, SIGRTMIN,
33 };
34 use kvm::{Cap, Datamatch, IoeventAddress, Kvm, Vcpu, VcpuExit, Vm};
35 use minijail::{self, Minijail};
36 use net_util::{Error as TapError, Tap, TapT};
37 use vm_memory::{GuestMemory, MemoryPolicy};
38
39 use self::process::*;
40 use self::vcpu::*;
41 use crate::{Config, Executable};
42
// NOTE(review): not referenced in the visible part of this file; presumably the maximum size, in
// bytes, of a single plugin socket datagram and used by the submodules -- confirm.
const MAX_DATAGRAM_SIZE: usize = 4096;
// Capacity (0x40000 = 256 KiB) that `new_pipe_pair` requests for each vcpu pipe via
// `F_SETPIPE_SZ`.
const MAX_VCPU_DATAGRAM_SIZE: usize = 0x40000;
45
/// An error that occurs during the lifetime of a plugin process.
///
/// Each variant wraps the underlying error (if any) that is appended to the message produced by
/// the `Display` implementation. The variants are annotated with `#[sorted]`, so new variants
/// must be inserted in sorted order.
#[sorted]
pub enum Error {
    /// Failed to clone an event.
    CloneEvent(SysError),
    /// Failed to clone a vcpu pipe.
    CloneVcpuPipe(io::Error),
    /// Failed to create an event.
    CreateEvent(SysError),
    /// Failed to create the KVM irqchip.
    CreateIrqChip(SysError),
    /// Failed to create the minijail instance.
    CreateJail(minijail::Error),
    /// Failed to create the `Kvm` object.
    CreateKvm(SysError),
    /// Failed to create the main request socket.
    CreateMainSocket(SysError),
    /// Failed to create the KVM PIT.
    CreatePIT(SysError),
    /// Failed to create the signalfd.
    CreateSignalFd(SignalFdError),
    /// Failed to create a socket pair.
    CreateSocketPair(io::Error),
    /// Failed to create a tap device from a raw fd.
    CreateTapFd(TapError),
    /// Failed to create a vcpu.
    CreateVcpu(SysError),
    /// Failed to create a vcpu request socket.
    CreateVcpuSocket(SysError),
    /// Failed to create the VM.
    CreateVm(SysError),
    /// Failed to create the wait context.
    CreateWaitContext(SysError),
    /// Failed to decode a request from the plugin.
    DecodeRequest(ProtobufError),
    /// Failed to drop this process's capabilities.
    DropCapabilities(SysError),
    /// Failed to encode a response for the plugin.
    EncodeResponse(ProtobufError),
    /// Failed to bind mount one of the configured plugin mounts into the jail.
    Mount(minijail::Error),
    /// Failed to bind mount a `/dev` device node into the jail.
    MountDev(minijail::Error),
    /// A mount failed (not produced in the visible part of this file).
    MountLib(minijail::Error),
    /// A mount failed (not produced in the visible part of this file).
    MountLib64(minijail::Error),
    /// A mount failed (not produced in the visible part of this file).
    MountPlugin(minijail::Error),
    /// A mount failed (not produced in the visible part of this file).
    MountPluginLib(minijail::Error),
    /// Failed to mount `/proc` inside the jail.
    MountProc(minijail::Error),
    /// Failed to mount the tmpfs used as the jail's root.
    MountRoot(minijail::Error),
    /// The root directory for the jailed process to pivot root into does not exist.
    NoRootDir,
    /// Failed to set the jail's pivot root.
    ParsePivotRoot(minijail::Error),
    /// Failed to parse the jail's seccomp filter.
    ParseSeccomp(minijail::Error),
    /// The plugin exited with the contained error code.
    PluginFailed(i32),
    /// Failed to send the kill signal to the plugin.
    PluginKill(SysError),
    /// The plugin exited due to the contained signal.
    PluginKilled(i32),
    /// Failed to run the jail.
    PluginRunJail(minijail::Error),
    /// A plugin request socket has been hung up.
    PluginSocketHup,
    /// Failed to poll the plugin request sockets.
    PluginSocketPoll(SysError),
    /// Failed to receive from a plugin request socket.
    PluginSocketRecv(SysError),
    /// Failed to send to a plugin request socket.
    PluginSocketSend(SysError),
    /// Failed to spawn the plugin.
    PluginSpawn(io::Error),
    /// The plugin did not exit within the timeout.
    PluginTimeout,
    /// Failed while waiting for the plugin to exit.
    PluginWait(SysError),
    /// Failed to poll all the descriptors in the main wait loop.
    Poll(SysError),
    /// The path to the root directory is not absolute.
    RootNotAbsolute,
    /// The specified root directory is not a directory.
    RootNotDir,
    /// Failed to set the gid map for the jail.
    SetGidMap(minijail::Error),
    /// Failed to set the uid map for the jail.
    SetUidMap(minijail::Error),
    /// A `SIGCHLD` was received for a process other than the plugin; the fields are copied from
    /// the corresponding signalfd siginfo.
    SigChild {
        pid: u32,
        signo: u32,
        status: i32,
        code: i32,
    },
    /// Failed to read from the signalfd.
    SignalFd(SignalFdError),
    /// Failed to spawn a vcpu thread.
    SpawnVcpu(io::Error),
    /// Failed to enable the tap device.
    TapEnable(TapError),
    /// Failed to open the tap device.
    TapOpen(TapError),
    /// Failed to set the tap device's IP address.
    TapSetIp(TapError),
    /// Failed to set the tap device's MAC address.
    TapSetMacAddress(TapError),
    /// Failed to set the tap device's netmask.
    TapSetNetmask(TapError),
    /// Failed to validate a raw tap fd.
    ValidateTapFd(SysError),
    /// Failed to add a descriptor to the wait context.
    WaitContextAdd(SysError),
}
110
111 impl Display for Error {
112 #[remain::check]
fmt(&self, f: &mut fmt::Formatter) -> fmt::Result113 fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
114 use self::Error::*;
115
116 #[sorted]
117 match self {
118 CloneEvent(e) => write!(f, "failed to clone event: {}", e),
119 CloneVcpuPipe(e) => write!(f, "failed to clone vcpu pipe: {}", e),
120 CreateEvent(e) => write!(f, "failed to create event: {}", e),
121 CreateIrqChip(e) => write!(f, "failed to create kvm irqchip: {}", e),
122 CreateJail(e) => write!(f, "failed to create jail: {}", e),
123 CreateKvm(e) => write!(f, "error creating Kvm: {}", e),
124 CreateMainSocket(e) => write!(f, "error creating main request socket: {}", e),
125 CreatePIT(e) => write!(f, "failed to create kvm PIT: {}", e),
126 CreateSignalFd(e) => write!(f, "failed to create signalfd: {}", e),
127 CreateSocketPair(e) => write!(f, "failed to create socket pair: {}", e),
128 CreateTapFd(e) => write!(f, "failed to create tap device from raw fd: {}", e),
129 CreateVcpu(e) => write!(f, "error creating vcpu: {}", e),
130 CreateVcpuSocket(e) => write!(f, "error creating vcpu request socket: {}", e),
131 CreateVm(e) => write!(f, "error creating vm: {}", e),
132 CreateWaitContext(e) => write!(f, "failed to create wait context: {}", e),
133 DecodeRequest(e) => write!(f, "failed to decode plugin request: {}", e),
134 DropCapabilities(e) => write!(f, "failed to drop process capabilities: {}", e),
135 EncodeResponse(e) => write!(f, "failed to encode plugin response: {}", e),
136 Mount(e) | MountDev(e) | MountLib(e) | MountLib64(e) | MountPlugin(e)
137 | MountPluginLib(e) | MountProc(e) | MountRoot(e) => {
138 write!(f, "failed to mount: {}", e)
139 }
140 NoRootDir => write!(f, "no root directory for jailed process to pivot root into"),
141 ParsePivotRoot(e) => write!(f, "failed to set jail pivot root: {}", e),
142 ParseSeccomp(e) => write!(f, "failed to parse jail seccomp filter: {}", e),
143 PluginFailed(e) => write!(f, "plugin exited with error: {}", e),
144 PluginKill(e) => write!(f, "error sending kill signal to plugin: {}", e),
145 PluginKilled(e) => write!(f, "plugin exited with signal {}", e),
146 PluginRunJail(e) => write!(f, "failed to run jail: {}", e),
147 PluginSocketHup => write!(f, "plugin request socket has been hung up"),
148 PluginSocketPoll(e) => write!(f, "failed to poll plugin request sockets: {}", e),
149 PluginSocketRecv(e) => write!(f, "failed to recv from plugin request socket: {}", e),
150 PluginSocketSend(e) => write!(f, "failed to send to plugin request socket: {}", e),
151 PluginSpawn(e) => write!(f, "failed to spawn plugin: {}", e),
152 PluginTimeout => write!(f, "plugin did not exit within timeout"),
153 PluginWait(e) => write!(f, "error waiting for plugin to exit: {}", e),
154 Poll(e) => write!(f, "failed to poll all FDs: {}", e),
155 RootNotAbsolute => write!(f, "path to the root directory must be absolute"),
156 RootNotDir => write!(f, "specified root directory is not a directory"),
157 SetGidMap(e) => write!(f, "failed to set gidmap for jail: {}", e),
158 SetUidMap(e) => write!(f, "failed to set uidmap for jail: {}", e),
159 SigChild {
160 pid,
161 signo,
162 status,
163 code,
164 } => write!(
165 f,
166 "process {} died with signal {}, status {}, and code {}",
167 pid, signo, status, code
168 ),
169 SignalFd(e) => write!(f, "failed to read signal fd: {}", e),
170 SpawnVcpu(e) => write!(f, "error spawning vcpu thread: {}", e),
171 TapEnable(e) => write!(f, "error enabling tap device: {}", e),
172 TapOpen(e) => write!(f, "error opening tap device: {}", e),
173 TapSetIp(e) => write!(f, "error setting tap ip: {}", e),
174 TapSetMacAddress(e) => write!(f, "error setting tap mac address: {}", e),
175 TapSetNetmask(e) => write!(f, "error setting tap netmask: {}", e),
176 ValidateTapFd(e) => write!(f, "failed to validate raw tap fd: {}", e),
177 WaitContextAdd(e) => write!(f, "failed to add descriptor to wait context: {}", e),
178 }
179 }
180 }
181
/// Result type used throughout this module, with the plugin `Error` as the error type.
type Result<T> = result::Result<T, Error>;
183
new_seqpacket_pair() -> SysResult<(UnixDatagram, UnixDatagram)>184 fn new_seqpacket_pair() -> SysResult<(UnixDatagram, UnixDatagram)> {
185 let mut fds = [0, 0];
186 unsafe {
187 let ret = socketpair(AF_UNIX, SOCK_SEQPACKET, 0, fds.as_mut_ptr());
188 if ret == 0 {
189 ioctl(fds[0], FIOCLEX);
190 Ok((
191 UnixDatagram::from_raw_descriptor(fds[0]),
192 UnixDatagram::from_raw_descriptor(fds[1]),
193 ))
194 } else {
195 Err(SysError::last())
196 }
197 }
198 }
199
/// The four ends of the two pipes created by `new_pipe_pair`: one pipe carries data towards
/// crosvm and the other towards the plugin.
struct VcpuPipe {
    // First end returned by `pipe()` for the pipe towards crosvm.
    crosvm_read: File,
    // Second end returned by `pipe()` for the pipe towards crosvm.
    plugin_write: File,
    // First end returned by `pipe()` for the pipe towards the plugin.
    plugin_read: File,
    // Second end returned by `pipe()` for the pipe towards the plugin.
    crosvm_write: File,
}
206
new_pipe_pair() -> SysResult<VcpuPipe>207 fn new_pipe_pair() -> SysResult<VcpuPipe> {
208 let to_crosvm = pipe(true)?;
209 let to_plugin = pipe(true)?;
210 // Increasing the pipe size can be a nice-to-have to make sure that
211 // messages get across atomically (and made sure that writes don't block),
212 // though it's not necessary a hard requirement for things to work.
213 let flags = unsafe {
214 fcntl(
215 to_crosvm.0.as_raw_descriptor(),
216 F_SETPIPE_SZ,
217 MAX_VCPU_DATAGRAM_SIZE as c_int,
218 )
219 };
220 if flags < 0 || flags != MAX_VCPU_DATAGRAM_SIZE as i32 {
221 warn!(
222 "Failed to adjust size of crosvm pipe (result {}): {}",
223 flags,
224 SysError::last()
225 );
226 }
227 let flags = unsafe {
228 fcntl(
229 to_plugin.0.as_raw_descriptor(),
230 F_SETPIPE_SZ,
231 MAX_VCPU_DATAGRAM_SIZE as c_int,
232 )
233 };
234 if flags < 0 || flags != MAX_VCPU_DATAGRAM_SIZE as i32 {
235 warn!(
236 "Failed to adjust size of plugin pipe (result {}): {}",
237 flags,
238 SysError::last()
239 );
240 }
241 Ok(VcpuPipe {
242 crosvm_read: to_crosvm.0,
243 plugin_write: to_crosvm.1,
244 plugin_read: to_plugin.0,
245 crosvm_write: to_plugin.1,
246 })
247 }
248
proto_to_sys_err(e: ProtobufError) -> SysError249 fn proto_to_sys_err(e: ProtobufError) -> SysError {
250 match e {
251 ProtobufError::IoError(e) => SysError::new(e.raw_os_error().unwrap_or(EINVAL)),
252 _ => SysError::new(EINVAL),
253 }
254 }
255
io_to_sys_err(e: io::Error) -> SysError256 fn io_to_sys_err(e: io::Error) -> SysError {
257 SysError::new(e.raw_os_error().unwrap_or(EINVAL))
258 }
259
mmap_to_sys_err(e: MmapError) -> SysError260 fn mmap_to_sys_err(e: MmapError) -> SysError {
261 match e {
262 MmapError::SystemCallFailed(e) => e,
263 _ => SysError::new(EINVAL),
264 }
265 }
266
create_plugin_jail(root: &Path, log_failures: bool, seccomp_policy: &Path) -> Result<Minijail>267 fn create_plugin_jail(root: &Path, log_failures: bool, seccomp_policy: &Path) -> Result<Minijail> {
268 // All child jails run in a new user namespace without any users mapped,
269 // they run as nobody unless otherwise configured.
270 let mut j = Minijail::new().map_err(Error::CreateJail)?;
271 j.namespace_pids();
272 j.namespace_user();
273 j.uidmap(&format!("0 {0} 1", geteuid()))
274 .map_err(Error::SetUidMap)?;
275 j.gidmap(&format!("0 {0} 1", getegid()))
276 .map_err(Error::SetGidMap)?;
277 j.namespace_user_disable_setgroups();
278 // Don't need any capabilities.
279 j.use_caps(0);
280 // Create a new mount namespace with an empty root FS.
281 j.namespace_vfs();
282 j.enter_pivot_root(root).map_err(Error::ParsePivotRoot)?;
283 // Run in an empty network namespace.
284 j.namespace_net();
285 j.no_new_privs();
286 // By default we'll prioritize using the pre-compiled .bpf over the .policy
287 // file (the .bpf is expected to be compiled using "trap" as the failure
288 // behavior instead of the default "kill" behavior).
289 // Refer to the code comment for the "seccomp-log-failures"
290 // command-line parameter for an explanation about why the |log_failures|
291 // flag forces the use of .policy files (and the build-time alternative to
292 // this run-time flag).
293 let bpf_policy_file = seccomp_policy.with_extension("bpf");
294 if bpf_policy_file.exists() && !log_failures {
295 j.parse_seccomp_program(&bpf_policy_file)
296 .map_err(Error::ParseSeccomp)?;
297 } else {
298 // Use TSYNC only for the side effect of it using SECCOMP_RET_TRAP,
299 // which will correctly kill the entire device process if a worker
300 // thread commits a seccomp violation.
301 j.set_seccomp_filter_tsync();
302 if log_failures {
303 j.log_seccomp_filter_failures();
304 }
305 j.parse_seccomp_filters(&seccomp_policy.with_extension("policy"))
306 .map_err(Error::ParseSeccomp)?;
307 }
308 j.use_seccomp_filter();
309 // Don't do init setup.
310 j.run_as_init();
311
312 // Create a tmpfs in the plugin's root directory so that we can bind mount it's executable
313 // file into it. The size=67108864 is size=64*1024*1024 or size=64MB.
314 j.mount_with_data(
315 Path::new("none"),
316 Path::new("/"),
317 "tmpfs",
318 (MS_NOSUID | MS_NODEV | MS_NOEXEC) as usize,
319 "size=67108864",
320 )
321 .map_err(Error::MountRoot)?;
322
323 // Because we requested to "run as init", minijail will not mount /proc for us even though
324 // plugin will be running in its own PID namespace, so we have to mount it ourselves.
325 j.mount(
326 Path::new("proc"),
327 Path::new("/proc"),
328 "proc",
329 (MS_NOSUID | MS_NODEV | MS_NOEXEC | MS_RDONLY) as usize,
330 )
331 .map_err(Error::MountProc)?;
332
333 Ok(j)
334 }
335
/// Each `PluginObject` represents one object that was instantiated by the guest using the `Create`
/// request.
///
/// Each such object has an ID associated with it that exists in an ID space shared by every variant
/// of `PluginObject`. This allows all the objects to be indexed in a single map, and allows for a
/// common destroy method.
///
/// In addition to the destroy method, each object may have methods specific to its variant type.
/// These variant methods must be done by matching the variant to the expected type for that method.
/// For example, getting the dirty log from a `Memory` object starting with an ID:
///
/// ```ignore
/// match objects.get(&request_id) {
///    Some(&PluginObject::Memory { slot, length }) => vm.get_dirty_log(slot, &mut dirty_log[..]),
///    _ => return Err(SysError::new(ENOENT)),
/// }
/// ```
enum PluginObject {
    // An ioevent registration; `destroy` unregisters `evt` at `addr`. `length` selects the width
    // of the datamatch (0 means any length, otherwise 1, 2, 4 or 8) and `datamatch` is the value
    // to match, truncated to that width.
    IoEvent {
        evt: Event,
        addr: IoeventAddress,
        length: u32,
        datamatch: u64,
    },
    // A memory region identified by `slot`; `destroy` removes the region from the VM.
    Memory {
        slot: u32,
        length: usize,
    },
    // An irqfd registration of `evt` for `irq_id`; `destroy` unregisters it.
    IrqEvent {
        irq_id: u32,
        evt: Event,
    },
}
370
371 impl PluginObject {
destroy(self, vm: &mut Vm) -> SysResult<()>372 fn destroy(self, vm: &mut Vm) -> SysResult<()> {
373 match self {
374 PluginObject::IoEvent {
375 evt,
376 addr,
377 length,
378 datamatch,
379 } => match length {
380 0 => vm.unregister_ioevent(&evt, addr, Datamatch::AnyLength),
381 1 => vm.unregister_ioevent(&evt, addr, Datamatch::U8(Some(datamatch as u8))),
382 2 => vm.unregister_ioevent(&evt, addr, Datamatch::U16(Some(datamatch as u16))),
383 4 => vm.unregister_ioevent(&evt, addr, Datamatch::U32(Some(datamatch as u32))),
384 8 => vm.unregister_ioevent(&evt, addr, Datamatch::U64(Some(datamatch as u64))),
385 _ => Err(SysError::new(EINVAL)),
386 },
387 PluginObject::Memory { slot, .. } => vm.remove_memory_region(slot).and(Ok(())),
388 PluginObject::IrqEvent { irq_id, evt } => vm.unregister_irqfd(&evt, irq_id),
389 }
390 }
391 }
392
/// Creates `vcpu_count` vcpus and spawns one thread per vcpu to run it, pushing each thread's
/// `JoinHandle` onto `vcpu_handles`.
///
/// Before any thread is spawned a handler for `SIGRTMIN() + 0` is registered; how that signal is
/// used to kick a vcpu depends on whether KVM reports `Cap::ImmediateExit` (see the comment in the
/// body).
///
/// Each thread initializes its plugin-side vcpu state, waits on a barrier shared by all vcpu
/// threads, and, if initialization succeeded, loops on `vcpu.run()`, forwarding IO, MMIO and
/// Hyper-V exits to the plugin. The loop ends on `Hlt`, `Shutdown`, `InternalError`, an errno from
/// `run()` other than `EINTR`/`EAGAIN`, a failed `pre_run`, or once `kill_signaled` is observed to
/// be set. In every case the thread finally writes to its clone of `exit_evt`.
///
/// # Errors
///
/// Returns `Error::CloneEvent`, `Error::CreateVcpu`, `Error::SpawnVcpu`, or the error from
/// `plugin.create_vcpu` if setting up any vcpu fails.
///
/// # Panics
///
/// Panics if the signal handler cannot be registered or the signal cannot be blocked. The spawned
/// threads panic if they fail to set the vcpu signal mask, make the vcpu runnable, clear a pending
/// signal, or write the exit event.
///
/// NOTE(review): the barrier is sized for `vcpu_count` threads, but the loop below returns early
/// via `?` if a later vcpu fails to set up; threads spawned before that point would then appear
/// to wait on the barrier indefinitely -- confirm whether this can happen in practice.
pub fn run_vcpus(
    kvm: &Kvm,
    vm: &Vm,
    plugin: &Process,
    vcpu_count: u32,
    kill_signaled: &Arc<AtomicBool>,
    exit_evt: &Event,
    vcpu_handles: &mut Vec<thread::JoinHandle<()>>,
) -> Result<()> {
    let vcpu_thread_barrier = Arc::new(Barrier::new((vcpu_count) as usize));
    // Fall back to signal masks only when KVM lacks the immediate exit capability.
    let use_kvm_signals = !kvm.check_extension(Cap::ImmediateExit);

    // If we need to force a vcpu to exit from a VM then a SIGRTMIN signal is sent
    // to that vcpu's thread.  If KVM is running the VM then it'll return -EINTR.
    // An issue is what to do when KVM isn't running the VM (where we could be
    // in the kernel or in the app).
    //
    // If KVM supports "immediate exit" then we set a signal handler that will
    // set the |immediate_exit| flag that tells KVM to return -EINTR before running
    // the VM.
    //
    // If KVM doesn't support immediate exit then we'll block SIGRTMIN in the app
    // and tell KVM to unblock SIGRTMIN before running the VM (at which point a blocked
    // signal might get asserted).  There's overhead to have KVM unblock and re-block
    // SIGRTMIN each time it runs the VM, so this mode should be avoided.

    if use_kvm_signals {
        unsafe {
            extern "C" fn handle_signal(_: c_int) {}
            // Our signal handler does nothing and is trivially async signal safe.
            // We need to install this signal handler even though we do block
            // the signal below, to ensure that this signal will interrupt
            // execution of KVM_RUN (this is implementation issue).
            register_rt_signal_handler(SIGRTMIN() + 0, handle_signal)
                .expect("failed to register vcpu signal handler");
        }
        // We do not really want the signal handler to run...
        block_signal(SIGRTMIN() + 0).expect("failed to block signal");
    } else {
        unsafe {
            extern "C" fn handle_signal(_: c_int) {
                Vcpu::set_local_immediate_exit(true);
            }
            register_rt_signal_handler(SIGRTMIN() + 0, handle_signal)
                .expect("failed to register vcpu signal handler");
        }
    }

    for cpu_id in 0..vcpu_count {
        // Per-thread clones/handles that are moved into the vcpu thread's closure.
        let kill_signaled = kill_signaled.clone();
        let vcpu_thread_barrier = vcpu_thread_barrier.clone();
        let vcpu_exit_evt = exit_evt.try_clone().map_err(Error::CloneEvent)?;
        let vcpu_plugin = plugin.create_vcpu(cpu_id)?;
        let vcpu = Vcpu::new(cpu_id as c_ulong, kvm, vm).map_err(Error::CreateVcpu)?;

        vcpu_handles.push(
            thread::Builder::new()
                .name(format!("crosvm_vcpu{}", cpu_id))
                .spawn(move || {
                    if use_kvm_signals {
                        // Tell KVM to not block anything when entering kvm run
                        // because we will be using first RT signal to kick the VCPU.
                        vcpu.set_signal_mask(&[])
                            .expect("failed to set up KVM VCPU signal mask");
                    }

                    #[cfg(feature = "chromeos")]
                    if let Err(e) = base::sched::enable_core_scheduling() {
                        error!("Failed to enable core scheduling: {}", e);
                    }

                    let vcpu = vcpu
                        .to_runnable(Some(SIGRTMIN() + 0))
                        .expect("Failed to set thread id");

                    // Initialize first, then wait on the barrier regardless of the outcome so
                    // that the other vcpu threads are not left waiting for this one.
                    let res = vcpu_plugin.init(&vcpu);
                    vcpu_thread_barrier.wait();
                    if let Err(e) = res {
                        error!("failed to initialize vcpu {}: {}", cpu_id, e);
                    } else {
                        loop {
                            let mut interrupted_by_signal = false;
                            let run_res = vcpu.run();
                            match run_res {
                                Ok(run) => match run {
                                    VcpuExit::IoIn { port, mut size } => {
                                        let mut data = [0; 256];
                                        // Clamp oversized reads to the local buffer.
                                        if size > data.len() {
                                            error!("unsupported IoIn size of {} bytes", size);
                                            size = data.len();
                                        }
                                        vcpu_plugin.io_read(port as u64, &mut data[..size], &vcpu);
                                        if let Err(e) = vcpu.set_data(&data[..size]) {
                                            error!("failed to set return data for IoIn: {}", e);
                                        }
                                    }
                                    VcpuExit::IoOut {
                                        port,
                                        mut size,
                                        data,
                                    } => {
                                        // Clamp oversized writes to the data that was provided.
                                        if size > data.len() {
                                            error!("unsupported IoOut size of {} bytes", size);
                                            size = data.len();
                                        }
                                        vcpu_plugin.io_write(port as u64, &data[..size], &vcpu);
                                    }
                                    VcpuExit::MmioRead { address, size } => {
                                        let mut data = [0; 8];
                                        vcpu_plugin.mmio_read(
                                            address as u64,
                                            &mut data[..size],
                                            &vcpu,
                                        );
                                        // Setting data for mmio can not fail.
                                        let _ = vcpu.set_data(&data[..size]);
                                    }
                                    VcpuExit::MmioWrite {
                                        address,
                                        size,
                                        data,
                                    } => {
                                        vcpu_plugin.mmio_write(
                                            address as u64,
                                            &data[..size],
                                            &vcpu,
                                        );
                                    }
                                    VcpuExit::HypervHcall { input, params } => {
                                        let mut data = [0; 8];
                                        vcpu_plugin.hyperv_call(input, params, &mut data, &vcpu);
                                        // Setting data for hyperv call can not fail.
                                        let _ = vcpu.set_data(&data);
                                    }
                                    VcpuExit::HypervSynic {
                                        msr,
                                        control,
                                        evt_page,
                                        msg_page,
                                    } => {
                                        vcpu_plugin
                                            .hyperv_synic(msr, control, evt_page, msg_page, &vcpu);
                                    }
                                    VcpuExit::Hlt => break,
                                    VcpuExit::Shutdown => break,
                                    VcpuExit::InternalError => {
                                        error!("vcpu {} has internal error", cpu_id);
                                        break;
                                    }
                                    r => warn!("unexpected vcpu exit: {:?}", r),
                                },
                                Err(e) => match e.errno() {
                                    // EINTR means the run was interrupted by the kick signal.
                                    EINTR => interrupted_by_signal = true,
                                    // EAGAIN is ignored and the vcpu is simply run again.
                                    EAGAIN => {}
                                    _ => {
                                        error!("vcpu hit unknown error: {}", e);
                                        break;
                                    }
                                },
                            }
                            if kill_signaled.load(Ordering::SeqCst) {
                                break;
                            }

                            // Only handle the pause request if kvm reported that it was
                            // interrupted by a signal.  This helps to ensure that KVM has had a chance
                            // to finish emulating any IO that may have immediately happened.
                            // If we eagerly check pre_run() then any IO that we
                            // just reported to the plugin won't have been processed yet by KVM.
                            // Not eagerly calling pre_run() also helps to reduce
                            // any overhead from checking if a pause request is pending.
                            // The assumption is that pause requests aren't common
                            // or frequent so it's better to optimize for the non-pause execution paths.
                            if interrupted_by_signal {
                                if use_kvm_signals {
                                    clear_signal(SIGRTMIN() + 0)
                                        .expect("failed to clear pending signal");
                                } else {
                                    vcpu.set_immediate_exit(false);
                                }

                                if let Err(e) = vcpu_plugin.pre_run(&vcpu) {
                                    error!("failed to process pause on vcpu {}: {}", cpu_id, e);
                                    break;
                                }
                            }
                        }
                    }
                    // Signal the exit event whether the loop ended or initialization failed.
                    vcpu_exit_evt
                        .write(1)
                        .expect("failed to signal vcpu exit event");
                })
                .map_err(Error::SpawnVcpu)?,
        );
    }
    Ok(())
}
590
/// Identifies which descriptor became readable in the `run_config` wait loop.
#[derive(PollToken)]
enum Token {
    // The exit event that every vcpu thread writes to when it finishes.
    Exit,
    // The signalfd created for `SIGCHLD`.
    ChildSignal,
    // The plugin socket at position `index` in `plugin.sockets()`.
    Plugin { index: usize },
}
597
598 /// Run a VM with a plugin process specified by `cfg`.
599 ///
600 /// Not every field of `cfg` will be used. In particular, most field that pertain to a specific
601 /// device are ignored because the plugin is responsible for emulating hardware.
run_config(cfg: Config) -> Result<()>602 pub fn run_config(cfg: Config) -> Result<()> {
603 info!("crosvm starting plugin process");
604
605 // Masking signals is inherently dangerous, since this can persist across clones/execs. Do this
606 // before any jailed devices have been spawned, so that we can catch any of them that fail very
607 // quickly.
608 let sigchld_fd = SignalFd::new(SIGCHLD).map_err(Error::CreateSignalFd)?;
609
610 let jail = if cfg.sandbox {
611 // An empty directory for jailed plugin pivot root.
612 let root_path = match &cfg.plugin_root {
613 Some(dir) => dir,
614 None => Path::new(option_env!("DEFAULT_PIVOT_ROOT").unwrap_or("/var/empty")),
615 };
616
617 if root_path.is_relative() {
618 return Err(Error::RootNotAbsolute);
619 }
620
621 if !root_path.exists() {
622 return Err(Error::NoRootDir);
623 }
624
625 if !root_path.is_dir() {
626 return Err(Error::RootNotDir);
627 }
628
629 let policy_path = cfg.seccomp_policy_dir.join("plugin");
630 let mut jail = create_plugin_jail(root_path, cfg.seccomp_log_failures, &policy_path)?;
631
632 // Update gid map of the jail if caller provided supplemental groups.
633 if !cfg.plugin_gid_maps.is_empty() {
634 let map = format!("0 {} 1", getegid())
635 + &cfg
636 .plugin_gid_maps
637 .into_iter()
638 .map(|m| format!(",{} {} {}", m.inner, m.outer, m.count))
639 .collect::<String>();
640 jail.gidmap(&map).map_err(Error::SetGidMap)?;
641 }
642
643 // Mount minimal set of devices (full, zero, urandom, etc). We can not use
644 // jail.mount_dev() here because crosvm may not be running with CAP_SYS_ADMIN.
645 let device_names = ["full", "null", "urandom", "zero"];
646 for name in &device_names {
647 let device = Path::new("/dev").join(&name);
648 jail.mount_bind(&device, &device, true)
649 .map_err(Error::MountDev)?;
650 }
651
652 for bind_mount in &cfg.plugin_mounts {
653 jail.mount_bind(&bind_mount.src, &bind_mount.dst, bind_mount.writable)
654 .map_err(Error::Mount)?;
655 }
656
657 Some(jail)
658 } else {
659 None
660 };
661
662 let mut tap_interfaces: Vec<Tap> = Vec::new();
663 if let Some(host_ip) = cfg.host_ip {
664 if let Some(netmask) = cfg.netmask {
665 if let Some(mac_address) = cfg.mac_address {
666 let tap = Tap::new(false, false).map_err(Error::TapOpen)?;
667 tap.set_ip_addr(host_ip).map_err(Error::TapSetIp)?;
668 tap.set_netmask(netmask).map_err(Error::TapSetNetmask)?;
669 tap.set_mac_address(mac_address)
670 .map_err(Error::TapSetMacAddress)?;
671
672 tap.enable().map_err(Error::TapEnable)?;
673 tap_interfaces.push(tap);
674 }
675 }
676 }
677 for tap_fd in cfg.tap_fd {
678 // Safe because we ensure that we get a unique handle to the fd.
679 let tap = unsafe {
680 Tap::from_raw_descriptor(validate_raw_descriptor(tap_fd).map_err(Error::ValidateTapFd)?)
681 .map_err(Error::CreateTapFd)?
682 };
683 tap_interfaces.push(tap);
684 }
685
686 let plugin_args: Vec<&str> = cfg.params.iter().map(|s| &s[..]).collect();
687
688 let plugin_path = match cfg.executable_path {
689 Some(Executable::Plugin(ref plugin_path)) => plugin_path.as_path(),
690 _ => panic!("Executable was not a plugin"),
691 };
692 let vcpu_count = cfg.vcpu_count.unwrap_or(1) as u32;
693 let mem = GuestMemory::new(&[]).unwrap();
694 let mut mem_policy = MemoryPolicy::empty();
695 if cfg.hugepages {
696 mem_policy |= MemoryPolicy::USE_HUGEPAGES;
697 }
698 mem.set_memory_policy(mem_policy);
699 let kvm = Kvm::new_with_path(&cfg.kvm_device_path).map_err(Error::CreateKvm)?;
700 let mut vm = Vm::new(&kvm, mem).map_err(Error::CreateVm)?;
701 vm.create_irq_chip().map_err(Error::CreateIrqChip)?;
702 vm.create_pit().map_err(Error::CreatePIT)?;
703
704 let mut plugin = Process::new(vcpu_count, plugin_path, &plugin_args, jail)?;
705 // Now that the jail for the plugin has been created and we had a chance to adjust gids there,
706 // we can drop all our capabilities in case we had any.
707 drop_capabilities().map_err(Error::DropCapabilities)?;
708
709 let mut res = Ok(());
710 // If Some, we will exit after enough time is passed to shutdown cleanly.
711 let mut dying_instant: Option<Instant> = None;
712 let duration_to_die = Duration::from_millis(1000);
713
714 let exit_evt = Event::new().map_err(Error::CreateEvent)?;
715 let kill_signaled = Arc::new(AtomicBool::new(false));
716 let mut vcpu_handles = Vec::with_capacity(vcpu_count as usize);
717
718 let wait_ctx =
719 WaitContext::build_with(&[(&exit_evt, Token::Exit), (&sigchld_fd, Token::ChildSignal)])
720 .map_err(Error::WaitContextAdd)?;
721
722 let mut sockets_to_drop = Vec::new();
723 let mut redo_wait_ctx_sockets = true;
724 // In this loop, make every attempt to not return early. If an error is encountered, set `res`
725 // to the error, set `dying_instant` to now, and signal the plugin that it will be killed soon.
726 // If the plugin cannot be signaled because it is dead of `signal_kill` failed, simply break
727 // from the poll loop so that the VCPU threads can be cleaned up.
728 'wait: loop {
729 // After we have waited long enough, it's time to give up and exit.
730 if dying_instant
731 .map(|i| i.elapsed() >= duration_to_die)
732 .unwrap_or(false)
733 {
734 break;
735 }
736
737 if redo_wait_ctx_sockets {
738 for (index, socket) in plugin.sockets().iter().enumerate() {
739 wait_ctx
740 .add(socket, Token::Plugin { index })
741 .map_err(Error::WaitContextAdd)?;
742 }
743 }
744
745 let plugin_socket_count = plugin.sockets().len();
746 let events = {
747 let poll_res = match dying_instant {
748 Some(inst) => wait_ctx.wait_timeout(duration_to_die - inst.elapsed()),
749 None => wait_ctx.wait(),
750 };
751 match poll_res {
752 Ok(v) => v,
753 Err(e) => {
754 // Polling no longer works, time to break and cleanup,
755 if res.is_ok() {
756 res = Err(Error::Poll(e));
757 }
758 break;
759 }
760 }
761 };
762 for event in events.iter().filter(|e| e.is_readable) {
763 match event.token {
764 Token::Exit => {
765 // No need to check the exit event if we are already doing cleanup.
766 let _ = wait_ctx.delete(&exit_evt);
767 dying_instant.get_or_insert(Instant::now());
768 let sig_res = plugin.signal_kill();
769 if res.is_ok() && sig_res.is_err() {
770 res = sig_res.map_err(Error::PluginKill);
771 }
772 }
773 Token::ChildSignal => {
774 // Print all available siginfo structs, then exit the loop.
775 loop {
776 match sigchld_fd.read() {
777 Ok(Some(siginfo)) => {
778 // If the plugin process has ended, there is no need to continue
779 // processing plugin connections, so we break early.
780 if siginfo.ssi_pid == plugin.pid() as u32 {
781 break 'wait;
782 }
783 // Because SIGCHLD is not expected from anything other than the
784 // plugin process, report it as an error.
785 if res.is_ok() {
786 res = Err(Error::SigChild {
787 pid: siginfo.ssi_pid,
788 signo: siginfo.ssi_signo,
789 status: siginfo.ssi_status,
790 code: siginfo.ssi_code,
791 })
792 }
793 }
794 Ok(None) => break, // No more signals to read.
795 Err(e) => {
796 // Something really must be messed up for this to happen, continue
797 // processing connections for a limited time.
798 if res.is_ok() {
799 res = Err(Error::SignalFd(e));
800 }
801 break;
802 }
803 }
804 }
805 // As we only spawn the plugin process, getting a SIGCHLD can only mean
806 // something went wrong.
807 dying_instant.get_or_insert(Instant::now());
808 let sig_res = plugin.signal_kill();
809 if res.is_ok() && sig_res.is_err() {
810 res = sig_res.map_err(Error::PluginKill);
811 }
812 }
813 Token::Plugin { index } => {
814 match plugin.handle_socket(index, &kvm, &mut vm, &vcpu_handles, &tap_interfaces)
815 {
816 Ok(_) => {}
817 // A HUP is an expected event for a socket, so don't bother warning about
818 // it.
819 Err(Error::PluginSocketHup) => sockets_to_drop.push(index),
820 // Only one connection out of potentially many is broken. Drop it, but don't
821 // start cleaning up. Because the error isn't returned, we will warn about
822 // it here.
823 Err(e) => {
824 warn!("error handling plugin socket: {}", e);
825 sockets_to_drop.push(index);
826 }
827 }
828 }
829 }
830 }
831
832 if vcpu_handles.is_empty() && dying_instant.is_none() && plugin.is_started() {
833 let res = run_vcpus(
834 &kvm,
835 &vm,
836 &plugin,
837 vcpu_count,
838 &kill_signaled,
839 &exit_evt,
840 &mut vcpu_handles,
841 );
842 if let Err(e) = res {
843 dying_instant.get_or_insert(Instant::now());
844 error!("failed to start vcpus: {}", e);
845 }
846 }
847
848 redo_wait_ctx_sockets =
849 !sockets_to_drop.is_empty() || plugin.sockets().len() != plugin_socket_count;
850
851 // Cleanup all of the sockets that we have determined were disconnected or suffered some
852 // other error.
853 plugin.drop_sockets(&mut sockets_to_drop);
854 sockets_to_drop.clear();
855
856 if redo_wait_ctx_sockets {
857 for socket in plugin.sockets() {
858 let _ = wait_ctx.delete(socket);
859 }
860 }
861 }
862
863 // vcpu threads MUST see the kill signaled flag, otherwise they may re-enter the VM.
864 kill_signaled.store(true, Ordering::SeqCst);
865 // Depending on how we ended up here, the plugin process, or a VCPU thread waiting for requests
866 // might be stuck. The `signal_kill` call will unstick all the VCPU threads by closing their
867 // blocked connections.
868 plugin.signal_kill().map_err(Error::PluginKill)?;
869 for handle in vcpu_handles {
870 match handle.kill(SIGRTMIN() + 0) {
871 Ok(_) => {
872 if let Err(e) = handle.join() {
873 error!("failed to join vcpu thread: {:?}", e);
874 }
875 }
876 Err(e) => error!("failed to kill vcpu thread: {}", e),
877 }
878 }
879
880 match plugin.try_wait() {
881 // The plugin has run out of time by now
882 Ok(ProcessStatus::Running) => Err(Error::PluginTimeout),
883 // Return an error discovered earlier in this function.
884 Ok(ProcessStatus::Success) => res,
885 Ok(ProcessStatus::Fail(code)) => Err(Error::PluginFailed(code)),
886 Ok(ProcessStatus::Signal(code)) => Err(Error::PluginKilled(code)),
887 Err(e) => Err(Error::PluginWait(e)),
888 }
889 }
890