1 // Copyright 2020 The Chromium OS Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4 
5 //! A crate for abstracting the underlying kernel hypervisor used in crosvm.
6 #[cfg(any(target_arch = "arm", target_arch = "aarch64"))]
7 pub mod aarch64;
8 pub mod caps;
9 pub mod kvm;
10 #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
11 pub mod x86_64;
12 
13 use std::os::raw::c_int;
14 use std::os::unix::io::AsRawFd;
15 
16 use serde::{Deserialize, Serialize};
17 
18 use base::{Event, MappedRegion, Protection, Result, SafeDescriptor};
19 use vm_memory::{GuestAddress, GuestMemory};
20 
21 #[cfg(any(target_arch = "arm", target_arch = "aarch64"))]
22 pub use crate::aarch64::*;
23 pub use crate::caps::*;
24 #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
25 pub use crate::x86_64::*;
26 
27 /// An index in the list of guest-mapped memory regions.
28 pub type MemSlot = u32;
29 
/// A trait for checking hypervisor capabilities.
///
/// Implementors must be `Send`, as required by the supertrait bound.
pub trait Hypervisor: Send {
    /// Makes a shallow clone of this `Hypervisor`.
    ///
    /// The `Self: Sized` bound means this method cannot be called through a `dyn Hypervisor`
    /// trait object; it is only available on concrete implementors.
    fn try_clone(&self) -> Result<Self>
    where
        Self: Sized;

    /// Checks if a particular `HypervisorCap` is available.
    ///
    /// Returns `true` when `cap` is available and `false` otherwise.
    fn check_capability(&self, cap: &HypervisorCap) -> bool;
}
40 
41 /// A wrapper for using a VM and getting/setting its state.
42 pub trait Vm: Send {
43     /// Makes a shallow clone of this `Vm`.
try_clone(&self) -> Result<Self> where Self: Sized44     fn try_clone(&self) -> Result<Self>
45     where
46         Self: Sized;
47 
48     /// Checks if a particular `VmCap` is available.
49     ///
50     /// This is distinct from the `Hypervisor` version of this method because some extensions depend
51     /// on the particular `Vm` instance. This method is encouraged because it more accurately
52     /// reflects the usable capabilities.
check_capability(&self, c: VmCap) -> bool53     fn check_capability(&self, c: VmCap) -> bool;
54 
55     /// Gets the guest-mapped memory for the Vm.
get_memory(&self) -> &GuestMemory56     fn get_memory(&self) -> &GuestMemory;
57 
58     /// Inserts the given `MappedRegion` into the VM's address space at `guest_addr`.
59     ///
60     /// The slot that was assigned the memory mapping is returned on success.  The slot can be given
61     /// to `Vm::remove_memory_region` to remove the memory from the VM's address space and take back
62     /// ownership of `mem_region`.
63     ///
64     /// Note that memory inserted into the VM's address space must not overlap with any other memory
65     /// slot's region.
66     ///
67     /// If `read_only` is true, the guest will be able to read the memory as normal, but attempts to
68     /// write will trigger a mmio VM exit, leaving the memory untouched.
69     ///
70     /// If `log_dirty_pages` is true, the slot number can be used to retrieve the pages written to
71     /// by the guest with `get_dirty_log`.
add_memory_region( &mut self, guest_addr: GuestAddress, mem_region: Box<dyn MappedRegion>, read_only: bool, log_dirty_pages: bool, ) -> Result<MemSlot>72     fn add_memory_region(
73         &mut self,
74         guest_addr: GuestAddress,
75         mem_region: Box<dyn MappedRegion>,
76         read_only: bool,
77         log_dirty_pages: bool,
78     ) -> Result<MemSlot>;
79 
80     /// Does a synchronous msync of the memory mapped at `slot`, syncing `size` bytes starting at
81     /// `offset` from the start of the region.  `offset` must be page aligned.
msync_memory_region(&mut self, slot: MemSlot, offset: usize, size: usize) -> Result<()>82     fn msync_memory_region(&mut self, slot: MemSlot, offset: usize, size: usize) -> Result<()>;
83 
84     /// Removes and drops the `UserMemoryRegion` that was previously added at the given slot.
remove_memory_region(&mut self, slot: MemSlot) -> Result<Box<dyn MappedRegion>>85     fn remove_memory_region(&mut self, slot: MemSlot) -> Result<Box<dyn MappedRegion>>;
86 
87     /// Creates an emulated device.
create_device(&self, kind: DeviceKind) -> Result<SafeDescriptor>88     fn create_device(&self, kind: DeviceKind) -> Result<SafeDescriptor>;
89 
90     /// Gets the bitmap of dirty pages since the last call to `get_dirty_log` for the memory at
91     /// `slot`.  Only works on VMs that support `VmCap::DirtyLog`.
92     ///
93     /// The size of `dirty_log` must be at least as many bits as there are pages in the memory
94     /// region `slot` represents. For example, if the size of `slot` is 16 pages, `dirty_log` must
95     /// be 2 bytes or greater.
get_dirty_log(&self, slot: MemSlot, dirty_log: &mut [u8]) -> Result<()>96     fn get_dirty_log(&self, slot: MemSlot, dirty_log: &mut [u8]) -> Result<()>;
97 
98     /// Registers an event to be signaled whenever a certain address is written to.
99     ///
100     /// The `datamatch` parameter can be used to limit signaling `evt` to only the cases where the
101     /// value being written is equal to `datamatch`. Note that the size of `datamatch` is important
102     /// and must match the expected size of the guest's write.
103     ///
104     /// In all cases where `evt` is signaled, the ordinary vmexit to userspace that would be
105     /// triggered is prevented.
register_ioevent( &mut self, evt: &Event, addr: IoEventAddress, datamatch: Datamatch, ) -> Result<()>106     fn register_ioevent(
107         &mut self,
108         evt: &Event,
109         addr: IoEventAddress,
110         datamatch: Datamatch,
111     ) -> Result<()>;
112 
113     /// Unregisters an event previously registered with `register_ioevent`.
114     ///
115     /// The `evt`, `addr`, and `datamatch` set must be the same as the ones passed into
116     /// `register_ioevent`.
unregister_ioevent( &mut self, evt: &Event, addr: IoEventAddress, datamatch: Datamatch, ) -> Result<()>117     fn unregister_ioevent(
118         &mut self,
119         evt: &Event,
120         addr: IoEventAddress,
121         datamatch: Datamatch,
122     ) -> Result<()>;
123 
124     /// Trigger any matching registered io events based on an MMIO or PIO write at `addr`. The
125     /// `data` slice represents the contents and length of the write, which is used to compare with
126     /// the registered io events' Datamatch values. If the hypervisor does in-kernel IO event
127     /// delivery, this is a no-op.
handle_io_events(&self, addr: IoEventAddress, data: &[u8]) -> Result<()>128     fn handle_io_events(&self, addr: IoEventAddress, data: &[u8]) -> Result<()>;
129 
130     /// Retrieves the current timestamp of the paravirtual clock as seen by the current guest.
131     /// Only works on VMs that support `VmCap::PvClock`.
get_pvclock(&self) -> Result<ClockState>132     fn get_pvclock(&self) -> Result<ClockState>;
133 
134     /// Sets the current timestamp of the paravirtual clock as seen by the current guest.
135     /// Only works on VMs that support `VmCap::PvClock`.
set_pvclock(&self, state: &ClockState) -> Result<()>136     fn set_pvclock(&self, state: &ClockState) -> Result<()>;
137 
138     /// Maps `size` bytes starting at `fs_offset` bytes from within the given `fd`
139     /// at `offset` bytes from the start of the arena with `prot` protections.
140     /// `offset` must be page aligned.
141     ///
142     /// # Arguments
143     /// * `offset` - Page aligned offset into the arena in bytes.
144     /// * `size` - Size of memory region in bytes.
145     /// * `fd` - File descriptor to mmap from.
146     /// * `fd_offset` - Offset in bytes from the beginning of `fd` to start the mmap.
147     /// * `prot` - Protection (e.g. readable/writable) of the memory region.
add_fd_mapping( &mut self, slot: u32, offset: usize, size: usize, fd: &dyn AsRawFd, fd_offset: u64, prot: Protection, ) -> Result<()>148     fn add_fd_mapping(
149         &mut self,
150         slot: u32,
151         offset: usize,
152         size: usize,
153         fd: &dyn AsRawFd,
154         fd_offset: u64,
155         prot: Protection,
156     ) -> Result<()>;
157 
158     /// Remove `size`-byte mapping starting at `offset`.
remove_mapping(&mut self, slot: u32, offset: usize, size: usize) -> Result<()>159     fn remove_mapping(&mut self, slot: u32, offset: usize, size: usize) -> Result<()>;
160 }
161 
/// A unique fingerprint for a particular `VcpuRunHandle`, used in `Vcpu` impls to ensure the
/// `VcpuRunHandle` they receive is the same one that was returned from `take_run_handle`.
///
/// The value is opaque: it can be cloned, compared, hashed and printed, but carries no meaning
/// beyond identity.
#[derive(Clone, Debug, PartialEq, Eq, Hash)]
pub struct VcpuRunHandleFingerprint(u64);

impl VcpuRunHandleFingerprint {
    /// Returns the raw 64-bit value of this fingerprint.
    pub fn as_u64(&self) -> u64 {
        self.0
    }
}

/// A handle returned by a `Vcpu` to be used with `Vcpu::run` to execute a virtual machine's VCPU.
///
/// This is used to ensure that the caller has bound the `Vcpu` to a thread with
/// `Vcpu::take_run_handle` and to execute hypervisor specific cleanup routines when dropped.
#[derive(Debug)]
pub struct VcpuRunHandle {
    // Cleanup routine invoked exactly once, from `Drop`.
    drop_fn: fn(),
    // Identity of this handle; see `VcpuRunHandleFingerprint`.
    fingerprint: VcpuRunHandleFingerprint,
    // Prevents Send+Sync for this type.
    phantom: std::marker::PhantomData<*mut ()>,
}

impl VcpuRunHandle {
    /// Used by `Vcpu` impls to create a unique run handle, that when dropped, will call the given
    /// `drop_fn`.
    pub fn new(drop_fn: fn()) -> Self {
        use std::hash::{Hash, Hasher};
        use std::sync::atomic::{AtomicUsize, Ordering};

        // Process-wide serial number. Mixing it into the hash means two handles created on the
        // same thread within one clock tick still hash different inputs, so the fingerprint does
        // not depend on the resolution of the monotonic clock.
        static NEXT_SERIAL: AtomicUsize = AtomicUsize::new(0);

        // Creates a probably unique number with a hash of the current monotonic instant, the
        // current thread id and the serial number above.
        let mut hasher = std::collections::hash_map::DefaultHasher::new();
        std::time::Instant::now().hash(&mut hasher);
        std::thread::current().id().hash(&mut hasher);
        // `Relaxed` suffices: only the distinctness of the fetched value matters, no other memory
        // is synchronized through this counter.
        NEXT_SERIAL.fetch_add(1, Ordering::Relaxed).hash(&mut hasher);

        Self {
            drop_fn,
            fingerprint: VcpuRunHandleFingerprint(hasher.finish()),
            phantom: std::marker::PhantomData,
        }
    }

    /// Gets the unique fingerprint which may be copied and compared freely.
    pub fn fingerprint(&self) -> &VcpuRunHandleFingerprint {
        &self.fingerprint
    }
}

impl Drop for VcpuRunHandle {
    /// Runs the cleanup routine supplied to `VcpuRunHandle::new`.
    fn drop(&mut self) {
        (self.drop_fn)();
    }
}
211 
/// A virtual CPU holding a virtualized hardware thread's state, such as registers and interrupt
/// state, which may be used to execute virtual machines.
///
/// To run, `take_run_handle` must be called to lock the vcpu to a thread. Then the returned
/// `VcpuRunHandle` can be used for running.
///
/// The `downcast_rs::DowncastSync` supertrait, together with the `impl_downcast!` invocation
/// following this trait, enables downcasting of `Vcpu` trait objects.
pub trait Vcpu: downcast_rs::DowncastSync {
    /// Makes a shallow clone of this `Vcpu`.
    ///
    /// Only available on concrete implementors (`Self: Sized`), not through `dyn Vcpu`.
    fn try_clone(&self) -> Result<Self>
    where
        Self: Sized;

    /// Casts this architecture specific trait object to the base trait object `Vcpu`.
    fn as_vcpu(&self) -> &dyn Vcpu;

    /// Returns a unique `VcpuRunHandle`. A `VcpuRunHandle` is required to run the guest.
    ///
    /// Assigns a vcpu to the current thread so that signal handlers can call
    /// set_local_immediate_exit().  An optional signal number will be temporarily blocked while
    /// assigning the vcpu to the thread and later blocked when `VcpuRunHandle` is destroyed.
    ///
    /// Returns an error, `EBUSY`, if the current thread already contains a Vcpu.
    fn take_run_handle(&self, signal_num: Option<c_int>) -> Result<VcpuRunHandle>;

    /// Runs the VCPU until it exits, returning the reason for the exit.
    ///
    /// Note that the state of the VCPU and associated VM must be setup first for this to do
    /// anything useful. The given `run_handle` must be the same as the one returned by
    /// `take_run_handle` for this `Vcpu`.
    fn run(&self, run_handle: &VcpuRunHandle) -> Result<VcpuExit>;

    /// Returns the vcpu id.
    fn id(&self) -> usize;

    /// Sets the bit that requests an immediate exit.
    fn set_immediate_exit(&self, exit: bool);

    /// Sets/clears the bit for immediate exit for the vcpu on the current thread.
    ///
    /// This is an associated function without a receiver; the `Self: Sized` bound keeps the
    /// trait usable as a trait object.
    fn set_local_immediate_exit(exit: bool)
    where
        Self: Sized;

    /// Returns a function pointer that invokes `set_local_immediate_exit` in a
    /// signal-safe way when called.
    fn set_local_immediate_exit_fn(&self) -> extern "C" fn();

    /// Sets the data received by a mmio read, ioport in, or hypercall instruction.
    ///
    /// This function should be called after `Vcpu::run` returns a `VcpuExit::IoIn`,
    /// `VcpuExit::MmioRead`, or `VcpuExit::HypervHcall`.
    fn set_data(&self, data: &[u8]) -> Result<()>;

    /// Signals to the hypervisor that this guest is being paused by userspace.  Only works on Vms
    /// that support `VmCap::PvClockSuspend`.
    fn pvclock_ctrl(&self) -> Result<()>;

    /// Specifies set of signals that are blocked during execution of `Vcpu::run`.  Signals
    /// that are not blocked will cause run to return with `VcpuExit::Intr`.  Only works on Vms that
    /// support `VmCap::SignalMask`.
    fn set_signal_mask(&self, signals: &[c_int]) -> Result<()>;

    /// Enables a hypervisor-specific extension on this Vcpu.  `cap` is a constant defined by the
    /// hypervisor API (e.g., kvm.h).  `args` are the arguments for enabling the feature, if any.
    ///
    /// # Safety
    /// This function is marked as unsafe because `args` may be interpreted as pointers for some
    /// capabilities. The caller must ensure that any pointers passed in the `args` array are
    /// allocated as the kernel expects, and that mutable pointers are owned.
    unsafe fn enable_raw_capability(&self, cap: u32, args: &[u64; 4]) -> Result<()>;
}

// Generates the downcasting helpers for `dyn Vcpu` (thread-safe `sync` flavor).
downcast_rs::impl_downcast!(sync Vcpu);
283 
284 /// An address either in programmable I/O space or in memory mapped I/O space.
285 #[derive(Copy, Clone, Debug, Serialize, Deserialize, PartialEq, Eq, std::hash::Hash)]
286 pub enum IoEventAddress {
287     Pio(u64),
288     Mmio(u64),
289 }
290 
/// Used in `Vm::register_ioevent` to indicate a size and optionally value to match.
///
/// For the sized variants, `Some(value)` restricts the match to writes of exactly that value,
/// while `None` places no restriction on the value written.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum Datamatch {
    /// No particular write size is required.
    AnyLength,
    /// A `u8`-sized write, optionally with a specific value.
    U8(Option<u8>),
    /// A `u16`-sized write, optionally with a specific value.
    U16(Option<u16>),
    /// A `u32`-sized write, optionally with a specific value.
    U32(Option<u32>),
    /// A `u64`-sized write, optionally with a specific value.
    U64(Option<u64>),
}
300 
/// A reason why a VCPU exited. One of these returns every time `Vcpu::run` is called.
///
/// Every variant carries only plain integer data, so the type is `Copy` and comparable.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum VcpuExit {
    /// An out port instruction was run on the given port with the given data.
    IoOut {
        port: u16,
        size: usize,
        data: [u8; 8],
    },
    /// An in port instruction was run on the given port.
    ///
    /// The data that the instruction receives should be set with `set_data` before `Vcpu::run` is
    /// called again.
    IoIn {
        port: u16,
        size: usize,
    },
    /// A read instruction was run against the given MMIO address.
    ///
    /// The data that the instruction receives should be set with `set_data` before `Vcpu::run` is
    /// called again.
    MmioRead {
        address: u64,
        size: usize,
    },
    /// A write instruction was run against the given MMIO address with the given data.
    MmioWrite {
        address: u64,
        size: usize,
        data: [u8; 8],
    },
    /// NOTE(review): presumably an end-of-interrupt for `vector` that userspace must forward to
    /// the IOAPIC -- confirm against the hypervisor backends.
    IoapicEoi {
        vector: u8,
    },
    /// NOTE(review): presumably a Hyper-V synthetic interrupt controller exit -- confirm against
    /// the hypervisor backends.
    HypervSynic {
        msr: u32,
        control: u64,
        evt_page: u64,
        msg_page: u64,
    },
    /// A Hyper-V hypercall exit carrying the hypercall `input` and `params`.
    ///
    /// The data that the instruction receives should be set with `set_data` before `Vcpu::run` is
    /// called again.
    HypervHcall {
        input: u64,
        params: [u64; 2],
    },
    // NOTE(review): the payload-free variants below are not documented in this file; presumably
    // they mirror exit reasons reported by the underlying hypervisor -- confirm against the
    // hypervisor backends before relying on their exact meaning.
    Unknown,
    Exception,
    Hypercall,
    Debug,
    Hlt,
    IrqWindowOpen,
    Shutdown,
    /// Entry into the guest failed; the payload is the reported failure reason.
    FailEntry {
        hardware_entry_failure_reason: u64,
    },
    Intr,
    SetTpr,
    TprAccess,
    S390Sieic,
    S390Reset,
    Dcr,
    Nmi,
    InternalError,
    Osi,
    PaprHcall,
    S390Ucontrol,
    Watchdog,
    S390Tsch,
    Epr,
    /// The cpu triggered a system level event which is specified by the type field.
    /// The first field is the event type and the second field is flags.
    /// The possible event types are shutdown, reset, or crash.  So far there
    /// are not any flags defined.
    SystemEvent(u32 /* event_type */, u64 /* flags */),
}
375 
/// A device type to create with `Vm::create_device`.
///
/// The ARM interrupt controller variants only exist when compiling for `arm`/`aarch64`.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum DeviceKind {
    /// VFIO device for direct access to devices from userspace
    Vfio,
    /// ARM virtual general interrupt controller v2
    #[cfg(any(target_arch = "arm", target_arch = "aarch64"))]
    ArmVgicV2,
    /// ARM virtual general interrupt controller v3
    #[cfg(any(target_arch = "arm", target_arch = "aarch64"))]
    ArmVgicV3,
}
388 
/// Identifies which chip an `IrqSource` originates from.
///
/// The layout is `repr(C)`, so the declaration order of the variants fixes their discriminants.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
#[repr(C)]
pub enum IrqSourceChip {
    /// NOTE(review): presumably the primary PIC -- confirm.
    PicPrimary,
    /// NOTE(review): presumably the secondary PIC -- confirm.
    PicSecondary,
    /// NOTE(review): presumably the IOAPIC -- confirm.
    Ioapic,
    /// NOTE(review): presumably the ARM GIC -- confirm.
    Gic,
}
398 
399 /// A source of IRQs in an `IrqRoute`.
400 #[repr(C)]
401 #[derive(Clone, Copy, Debug, PartialEq, Eq)]
402 pub enum IrqSource {
403     Irqchip { chip: IrqSourceChip, pin: u32 },
404     Msi { address: u64, data: u32 },
405 }
406 
407 /// A single route for an IRQ.
408 #[repr(C)]
409 #[derive(Clone, Copy, Debug, PartialEq, Eq)]
410 pub struct IrqRoute {
411     pub gsi: u32,
412     pub source: IrqSource,
413 }
414 
/// The state of the paravirtual clock.
///
/// This is the value read by `Vm::get_pvclock` and written by `Vm::set_pvclock`. The default
/// value has both fields set to zero.
#[derive(Debug, Default, Copy, Clone, PartialEq, Eq)]
pub struct ClockState {
    /// Current pv clock timestamp, as seen by the guest
    pub clock: u64,
    /// Hypervisor-specific feature flags for the pv clock
    pub flags: u32,
}
423 
/// The multiprocessing state of a processor (vcpu).
///
/// Each variant lists, in parentheses, the architectures it applies to. The layout is `repr(C)`,
/// so the declaration order of the variants fixes their discriminants.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
#[repr(C)]
pub enum MPState {
    /// The vcpu is currently running (x86/x86_64, arm/arm64).
    Runnable,
    /// The vcpu is an application processor (AP) which has not yet received an INIT signal
    /// (x86/x86_64).
    Uninitialized,
    /// The vcpu has received an INIT signal, and is now ready for a SIPI (x86/x86_64).
    InitReceived,
    /// The vcpu has executed a HLT instruction and is waiting for an interrupt (x86/x86_64).
    Halted,
    /// The vcpu has just received a SIPI; the vector is accessible via KVM_GET_VCPU_EVENTS
    /// (x86/x86_64).
    SipiReceived,
    /// The vcpu is stopped (arm/arm64).
    Stopped,
}
442