// Copyright 2017 The Chromium OS Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

//! A safe wrapper around the kernel's KVM interface.

mod cap;

use std::cell::RefCell;
use std::cmp::{min, Ordering};
use std::collections::{BTreeMap, BinaryHeap};
use std::ffi::CString;
use std::fs::File;
use std::mem::size_of;
use std::ops::{Deref, DerefMut};
use std::os::raw::*;
use std::os::unix::prelude::OsStrExt;
use std::path::{Path, PathBuf};
use std::ptr::copy_nonoverlapping;
use std::sync::Arc;
use sync::Mutex;

use base::{AsRawDescriptor, FromRawDescriptor, RawDescriptor};
use data_model::vec_with_array_field;

#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
use data_model::FlexibleArrayWrapper;

use libc::sigset_t;
use libc::{open, EBUSY, EINVAL, ENOENT, ENOSPC, EOVERFLOW, O_CLOEXEC, O_RDWR};

use kvm_sys::*;

#[allow(unused_imports)]
use base::{
    block_signal, ioctl, ioctl_with_mut_ptr, ioctl_with_mut_ref, ioctl_with_ptr, ioctl_with_ref,
    ioctl_with_val, pagesize, signal, unblock_signal, warn, Error, Event, IoctlNr, MappedRegion,
    MemoryMapping, MemoryMappingBuilder, MmapError, Result, SIGRTMIN,
};
use vm_memory::{GuestAddress, GuestMemory};

pub use crate::cap::*;

fn errno_result<T>() -> Result<T> {
    Err(Error::last())
}

unsafe fn set_user_memory_region<F: AsRawDescriptor>(
    fd: &F,
    slot: u32,
    read_only: bool,
    log_dirty_pages: bool,
    guest_addr: u64,
    memory_size: u64,
    userspace_addr: *mut u8,
) -> Result<()> {
    let mut flags = if read_only { KVM_MEM_READONLY } else { 0 };
    if log_dirty_pages {
        flags |= KVM_MEM_LOG_DIRTY_PAGES;
    }
    let region = kvm_userspace_memory_region {
        slot,
        flags,
        guest_phys_addr: guest_addr,
        memory_size,
        userspace_addr: userspace_addr as u64,
    };

    let ret = ioctl_with_ref(fd, KVM_SET_USER_MEMORY_REGION(), &region);
    if ret == 0 {
        Ok(())
    } else {
        errno_result()
    }
}

/// Helper function to determine the size in bytes of a dirty log bitmap for the given memory region
/// size.
///
/// # Arguments
///
/// * `size` - Number of bytes in the memory region being queried.
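///
/// # Examples
///
/// A quick sketch of the arithmetic, assuming `pagesize()` returns 4096 and that this
/// crate is importable as `kvm` (both assumptions, hence `no_run`): a 16-page region
/// needs 16 bits of dirty state, which rounds up to 2 bytes.
///
/// ```no_run
/// use kvm::dirty_log_bitmap_size;
///
/// assert_eq!(dirty_log_bitmap_size(16 * 4096), 2);
/// ```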
pub fn dirty_log_bitmap_size(size: usize) -> usize {
    let page_size = pagesize();
    (((size + page_size - 1) / page_size) + 7) / 8
}

/// A wrapper around opening and using `/dev/kvm`.
///
/// Useful for querying extensions and basic values from the KVM backend. A `Kvm` is required to
/// create a `Vm` object.
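///
/// # Examples
///
/// A minimal sketch, assuming `/dev/kvm` is accessible and that this crate is importable
/// as `kvm` (hence `no_run`):
///
/// ```no_run
/// use kvm::Kvm;
///
/// let kvm = Kvm::new().expect("failed to open /dev/kvm");
/// let mmap_size = kvm.get_vcpu_mmap_size().expect("failed to query vcpu mmap size");
/// ```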
pub struct Kvm {
    kvm: File,
}

impl Kvm {
    /// Opens `/dev/kvm` and returns a Kvm object on success.
    pub fn new() -> Result<Kvm> {
        Kvm::new_with_path(&PathBuf::from("/dev/kvm"))
    }

    /// Opens a KVM device at `device_path` and returns a Kvm object on success.
    pub fn new_with_path(device_path: &Path) -> Result<Kvm> {
        // Open calls are safe because we give a nul-terminated string and verify the result.
        let c_path = CString::new(device_path.as_os_str().as_bytes()).unwrap();
        let ret = unsafe { open(c_path.as_ptr(), O_RDWR | O_CLOEXEC) };
        if ret < 0 {
            return errno_result();
        }
        // Safe because we verify that ret is valid and we own the fd.
        Ok(Kvm {
            kvm: unsafe { File::from_raw_descriptor(ret) },
        })
    }

    fn check_extension_int(&self, c: Cap) -> i32 {
        // Safe because we know that our file is a KVM fd and that the extension is one of the ones
        // defined by the kernel.
        unsafe { ioctl_with_val(self, KVM_CHECK_EXTENSION(), c as c_ulong) }
    }

    /// Checks if a particular `Cap` is available.
    pub fn check_extension(&self, c: Cap) -> bool {
        self.check_extension_int(c) == 1
    }

    /// Gets the size of the mmap required to use a vcpu's `kvm_run` structure.
    pub fn get_vcpu_mmap_size(&self) -> Result<usize> {
        // Safe because we know that our file is a KVM fd and we verify the return result.
        let res = unsafe { ioctl(self, KVM_GET_VCPU_MMAP_SIZE()) };
        if res > 0 {
            Ok(res as usize)
        } else {
            errno_result()
        }
    }

    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
    fn get_cpuid(&self, kind: IoctlNr) -> Result<CpuId> {
        const MAX_KVM_CPUID_ENTRIES: usize = 256;
        let mut cpuid = CpuId::new(MAX_KVM_CPUID_ENTRIES);

        let ret = unsafe {
            // ioctl is unsafe. The kernel is trusted not to write beyond the bounds of the memory
            // allocated for the struct. The limit is read from nent, which is set to the allocated
            // size (MAX_KVM_CPUID_ENTRIES) above.
            ioctl_with_mut_ptr(self, kind, cpuid.as_mut_ptr())
        };
        if ret < 0 {
            return errno_result();
        }

        Ok(cpuid)
    }

    /// X86 specific call to get the system supported CPUID values.
    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
    pub fn get_supported_cpuid(&self) -> Result<CpuId> {
        self.get_cpuid(KVM_GET_SUPPORTED_CPUID())
    }

    /// X86 specific call to get the system emulated CPUID values.
    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
    pub fn get_emulated_cpuid(&self) -> Result<CpuId> {
        self.get_cpuid(KVM_GET_EMULATED_CPUID())
    }

    /// X86 specific call to get the list of supported MSRs.
    ///
    /// See the documentation for KVM_GET_MSR_INDEX_LIST.
    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
    pub fn get_msr_index_list(&self) -> Result<Vec<u32>> {
        const MAX_KVM_MSR_ENTRIES: usize = 256;

        let mut msr_list = vec_with_array_field::<kvm_msr_list, u32>(MAX_KVM_MSR_ENTRIES);
        msr_list[0].nmsrs = MAX_KVM_MSR_ENTRIES as u32;

        let ret = unsafe {
            // ioctl is unsafe. The kernel is trusted not to write beyond the bounds of the memory
            // allocated for the struct. The limit is read from nmsrs, which is set to the allocated
            // size (MAX_KVM_MSR_ENTRIES) above.
            ioctl_with_mut_ref(self, KVM_GET_MSR_INDEX_LIST(), &mut msr_list[0])
        };
        if ret < 0 {
            return errno_result();
        }

        let mut nmsrs = msr_list[0].nmsrs;

        // Mapping the unsized array to a slice is unsafe because the length isn't known.  Using
        // the length we originally allocated with eliminates the possibility of overflow.
        let indices: &[u32] = unsafe {
            if nmsrs > MAX_KVM_MSR_ENTRIES as u32 {
                nmsrs = MAX_KVM_MSR_ENTRIES as u32;
            }
            msr_list[0].indices.as_slice(nmsrs as usize)
        };

        Ok(indices.to_vec())
    }
}

impl AsRawDescriptor for Kvm {
    fn as_raw_descriptor(&self) -> RawDescriptor {
        self.kvm.as_raw_descriptor()
    }
}

/// An address either in programmable I/O space or in memory mapped I/O space.
#[derive(Copy, Clone, Debug)]
pub enum IoeventAddress {
    Pio(u64),
    Mmio(u64),
}

/// Used in `Vm::register_ioevent` to indicate a size and, optionally, a value to match.
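///
/// For example, `Datamatch::U32(Some(0x1234))` requests signaling only for 4-byte writes
/// of that exact value, while `Datamatch::U32(None)` matches any 4-byte write.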
pub enum Datamatch {
    AnyLength,
    U8(Option<u8>),
    U16(Option<u16>),
    U32(Option<u32>),
    U64(Option<u64>),
}

/// A source of IRQs in an `IrqRoute`.
pub enum IrqSource {
    Irqchip { chip: u32, pin: u32 },
    Msi { address: u64, data: u32 },
}

/// A single route for an IRQ.
pub struct IrqRoute {
    pub gsi: u32,
    pub source: IrqSource,
}

/// Interrupt controller IDs
pub enum PicId {
    Primary = 0,
    Secondary = 1,
}

/// Number of pins on the IOAPIC.
pub const NUM_IOAPIC_PINS: usize = 24;

// Used to invert the order when stored in a max-heap.
#[derive(Copy, Clone, Eq, PartialEq)]
struct MemSlot(u32);

impl Ord for MemSlot {
    fn cmp(&self, other: &MemSlot) -> Ordering {
        // Notice the order is inverted so the lowest magnitude slot has the highest priority in a
        // max-heap.
        other.0.cmp(&self.0)
    }
}

impl PartialOrd for MemSlot {
    fn partial_cmp(&self, other: &MemSlot) -> Option<Ordering> {
        Some(self.cmp(other))
    }
}

/// A wrapper around creating and using a VM.
pub struct Vm {
    vm: File,
    guest_mem: GuestMemory,
    mem_regions: Arc<Mutex<BTreeMap<u32, Box<dyn MappedRegion>>>>,
    mem_slot_gaps: Arc<Mutex<BinaryHeap<MemSlot>>>,
}

impl Vm {
    /// Constructs a new `Vm` using the given `Kvm` instance.
    pub fn new(kvm: &Kvm, guest_mem: GuestMemory) -> Result<Vm> {
        // Safe because we know kvm is a real kvm fd as this module is the only one that can make
        // Kvm objects.
        let ret = unsafe { ioctl(kvm, KVM_CREATE_VM()) };
        if ret >= 0 {
            // Safe because we verify the value of ret and we are the owners of the fd.
            let vm_file = unsafe { File::from_raw_descriptor(ret) };
            guest_mem.with_regions(|index, guest_addr, size, host_addr, _, _| {
                unsafe {
                    // Safe because the guest regions are guaranteed not to overlap.
                    set_user_memory_region(
                        &vm_file,
                        index as u32,
                        false,
                        false,
                        guest_addr.offset() as u64,
                        size as u64,
                        host_addr as *mut u8,
                    )
                }
            })?;

            Ok(Vm {
                vm: vm_file,
                guest_mem,
                mem_regions: Arc::new(Mutex::new(BTreeMap::new())),
                mem_slot_gaps: Arc::new(Mutex::new(BinaryHeap::new())),
            })
        } else {
            errno_result()
        }
    }

    /// Checks if a particular `Cap` is available.
    ///
    /// This is distinct from the `Kvm` version of this method because some extensions depend on
    /// the particular `Vm`'s existence. This method is encouraged by the kernel because it more
    /// accurately reflects the usable capabilities.
    pub fn check_extension(&self, c: Cap) -> bool {
        // Safe because we know that our file is a KVM fd and that the extension is one of the ones
        // defined by the kernel.
        unsafe { ioctl_with_val(self, KVM_CHECK_EXTENSION(), c as c_ulong) == 1 }
    }

    /// Inserts the given `mem` into the VM's address space at `guest_addr`.
    ///
    /// The slot that was assigned the kvm memory mapping is returned on success. The slot can be
    /// given to `Vm::remove_memory_region` to remove the memory from the VM's address space and
    /// take back ownership of `mem`.
    ///
    /// Note that memory inserted into the VM's address space must not overlap with any other memory
    /// slot's region.
    ///
    /// If `read_only` is true, the guest will be able to read the memory as normal, but attempts to
    /// write will trigger a mmio VM exit, leaving the memory untouched.
    ///
    /// If `log_dirty_pages` is true, the slot number can be used to retrieve the pages written to
    /// by the guest with `get_dirty_log`.
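    ///
    /// # Examples
    ///
    /// A sketch of mapping an extra anonymous page into the guest (the crate paths and
    /// addresses here are assumptions for illustration):
    ///
    /// ```no_run
    /// use base::MemoryMappingBuilder;
    /// use kvm::{Kvm, Vm};
    /// use vm_memory::{GuestAddress, GuestMemory};
    ///
    /// let kvm = Kvm::new().unwrap();
    /// let gm = GuestMemory::new(&[(GuestAddress(0), 0x1000)]).unwrap();
    /// let mut vm = Vm::new(&kvm, gm).unwrap();
    /// let mmap = MemoryMappingBuilder::new(0x1000).build().unwrap();
    /// let slot = vm
    ///     .add_memory_region(GuestAddress(0x10000), Box::new(mmap), false, false)
    ///     .unwrap();
    /// ```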
    pub fn add_memory_region(
        &mut self,
        guest_addr: GuestAddress,
        mem: Box<dyn MappedRegion>,
        read_only: bool,
        log_dirty_pages: bool,
    ) -> Result<u32> {
        let size = mem.size() as u64;
        let end_addr = guest_addr
            .checked_add(size)
            .ok_or_else(|| Error::new(EOVERFLOW))?;
        if self.guest_mem.range_overlap(guest_addr, end_addr) {
            return Err(Error::new(ENOSPC));
        }
        let mut regions = self.mem_regions.lock();
        let mut gaps = self.mem_slot_gaps.lock();
        let slot = match gaps.pop() {
            Some(gap) => gap.0,
            None => (regions.len() + self.guest_mem.num_regions() as usize) as u32,
        };

        // Safe because we check that the given guest address is valid and has no overlaps. We also
        // know that the pointer and size are correct because the MemoryMapping interface ensures
        // this. We take ownership of the memory mapping so that it won't be unmapped until the slot
        // is removed.
        let res = unsafe {
            set_user_memory_region(
                &self.vm,
                slot,
                read_only,
                log_dirty_pages,
                guest_addr.offset() as u64,
                size,
                mem.as_ptr(),
            )
        };

        if let Err(e) = res {
            gaps.push(MemSlot(slot));
            return Err(e);
        }
        regions.insert(slot, mem);
        Ok(slot)
    }

    /// Removes memory that was previously added at the given slot.
    ///
    /// Ownership of the host memory mapping associated with the given slot is returned on success.
    pub fn remove_memory_region(&mut self, slot: u32) -> Result<Box<dyn MappedRegion>> {
        let mut regions = self.mem_regions.lock();
        if !regions.contains_key(&slot) {
            return Err(Error::new(ENOENT));
        }
        // Safe because the slot is checked against the list of memory slots.
        unsafe {
            set_user_memory_region(&self.vm, slot, false, false, 0, 0, std::ptr::null_mut())?;
        }
        self.mem_slot_gaps.lock().push(MemSlot(slot));
        // This remove will always succeed because of the contains_key check above.
        Ok(regions.remove(&slot).unwrap())
    }

    /// Gets the bitmap of dirty pages since the last call to `get_dirty_log` for the memory at
    /// `slot`.
    ///
    /// The size of `dirty_log` must be at least as many bits as there are pages in the memory
    /// region `slot` represents. For example, if the size of `slot` is 16 pages, `dirty_log` must
    /// be 2 bytes or greater.
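    ///
    /// A sketch of sizing the buffer with `dirty_log_bitmap_size` (assumes `slot` came
    /// from `add_memory_region` with `log_dirty_pages` enabled and that the region is
    /// `region_size` bytes; both names are placeholders):
    ///
    /// ```no_run
    /// use kvm::{dirty_log_bitmap_size, Vm};
    ///
    /// fn fetch_dirty_log(vm: &Vm, slot: u32, region_size: usize) {
    ///     let mut dirty_log = vec![0u8; dirty_log_bitmap_size(region_size)];
    ///     vm.get_dirty_log(slot, &mut dirty_log).unwrap();
    /// }
    /// ```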
    pub fn get_dirty_log(&self, slot: u32, dirty_log: &mut [u8]) -> Result<()> {
        match self.mem_regions.lock().get(&slot) {
            Some(mem) => {
                // Ensures that there are as many bytes in dirty_log as there are pages in the mmap.
                if dirty_log_bitmap_size(mem.size()) > dirty_log.len() {
                    return Err(Error::new(EINVAL));
                }
                let mut dirty_log_kvm = kvm_dirty_log {
                    slot,
                    ..Default::default()
                };
                dirty_log_kvm.__bindgen_anon_1.dirty_bitmap = dirty_log.as_ptr() as *mut c_void;
                // Safe because the `dirty_bitmap` pointer assigned above is guaranteed to be valid
                // (because it's from a slice) and we checked that it will be large enough to hold
                // the entire log.
                let ret = unsafe { ioctl_with_ref(self, KVM_GET_DIRTY_LOG(), &dirty_log_kvm) };
                if ret == 0 {
                    Ok(())
                } else {
                    errno_result()
                }
            }
            _ => Err(Error::new(ENOENT)),
        }
    }

    /// Gets a reference to the guest memory owned by this VM.
    ///
    /// Note that `GuestMemory` does not include any mmio memory that may have been added after
    /// this VM was constructed.
    pub fn get_memory(&self) -> &GuestMemory {
        &self.guest_mem
    }

    /// Sets the address of a one-page region in the VM's address space.
    ///
    /// See the documentation on the KVM_SET_IDENTITY_MAP_ADDR ioctl.
    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
    pub fn set_identity_map_addr(&self, addr: GuestAddress) -> Result<()> {
        // Safe because we know that our file is a VM fd and we verify the return result.
        let ret =
            unsafe { ioctl_with_ref(self, KVM_SET_IDENTITY_MAP_ADDR(), &(addr.offset() as u64)) };
        if ret == 0 {
            Ok(())
        } else {
            errno_result()
        }
    }

    /// Retrieves the current timestamp of kvmclock as seen by the current guest.
    ///
    /// See the documentation on the KVM_GET_CLOCK ioctl.
    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
    pub fn get_clock(&self) -> Result<kvm_clock_data> {
        // Safe because we know that our file is a VM fd, we know the kernel will only write the
        // correct amount of memory to our pointer, and we verify the return result.
        let mut clock_data = unsafe { std::mem::zeroed() };
        let ret = unsafe { ioctl_with_mut_ref(self, KVM_GET_CLOCK(), &mut clock_data) };
        if ret == 0 {
            Ok(clock_data)
        } else {
            errno_result()
        }
    }

    /// Sets the current timestamp of kvmclock to the specified value.
    ///
    /// See the documentation on the KVM_SET_CLOCK ioctl.
    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
    pub fn set_clock(&self, clock_data: &kvm_clock_data) -> Result<()> {
        // Safe because we know that our file is a VM fd, we know the kernel will only read the
        // correct amount of memory from our pointer, and we verify the return result.
        let ret = unsafe { ioctl_with_ref(self, KVM_SET_CLOCK(), clock_data) };
        if ret == 0 {
            Ok(())
        } else {
            errno_result()
        }
    }

    /// Creates an in-kernel interrupt controller.
    ///
    /// See the documentation on the KVM_CREATE_IRQCHIP ioctl.
    #[cfg(any(
        target_arch = "x86",
        target_arch = "x86_64",
        target_arch = "arm",
        target_arch = "aarch64"
    ))]
    pub fn create_irq_chip(&self) -> Result<()> {
        // Safe because we know that our file is a VM fd and we verify the return result.
        let ret = unsafe { ioctl(self, KVM_CREATE_IRQCHIP()) };
        if ret == 0 {
            Ok(())
        } else {
            errno_result()
        }
    }

    /// Retrieves the state of the given interrupt controller by issuing the KVM_GET_IRQCHIP ioctl.
    ///
    /// Note that this call can only succeed after a call to `Vm::create_irq_chip`.
    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
    pub fn get_pic_state(&self, id: PicId) -> Result<kvm_pic_state> {
        let mut irqchip_state = kvm_irqchip {
            chip_id: id as u32,
            ..Default::default()
        };
        let ret = unsafe {
            // Safe because we know our file is a VM fd, we know the kernel will only write the
            // correct amount of memory to our pointer, and we verify the return result.
            ioctl_with_mut_ref(self, KVM_GET_IRQCHIP(), &mut irqchip_state)
        };
        if ret == 0 {
            Ok(unsafe {
                // Safe as we know that we are retrieving data related to the
                // PIC (primary or secondary) and not the IOAPIC.
                irqchip_state.chip.pic
            })
        } else {
            errno_result()
        }
    }

    /// Sets the state of the given interrupt controller by issuing the KVM_SET_IRQCHIP ioctl.
    ///
    /// Note that this call can only succeed after a call to `Vm::create_irq_chip`.
    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
    pub fn set_pic_state(&self, id: PicId, state: &kvm_pic_state) -> Result<()> {
        let mut irqchip_state = kvm_irqchip {
            chip_id: id as u32,
            ..Default::default()
        };
        irqchip_state.chip.pic = *state;
        // Safe because we know that our file is a VM fd, we know the kernel will only read the
        // correct amount of memory from our pointer, and we verify the return result.
        let ret = unsafe { ioctl_with_ref(self, KVM_SET_IRQCHIP(), &irqchip_state) };
        if ret == 0 {
            Ok(())
        } else {
            errno_result()
        }
    }

    /// Retrieves the state of the IOAPIC by issuing the KVM_GET_IRQCHIP ioctl.
    ///
    /// Note that this call can only succeed after a call to `Vm::create_irq_chip`.
    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
    pub fn get_ioapic_state(&self) -> Result<kvm_ioapic_state> {
        let mut irqchip_state = kvm_irqchip {
            chip_id: 2,
            ..Default::default()
        };
        let ret = unsafe {
            // Safe because we know our file is a VM fd, we know the kernel will only write the
            // correct amount of memory to our pointer, and we verify the return result.
            ioctl_with_mut_ref(self, KVM_GET_IRQCHIP(), &mut irqchip_state)
        };
        if ret == 0 {
            Ok(unsafe {
                // Safe as we know that we are retrieving data related to the
                // IOAPIC and not the PIC.
                irqchip_state.chip.ioapic
            })
        } else {
            errno_result()
        }
    }

    /// Sets the state of the IOAPIC by issuing the KVM_SET_IRQCHIP ioctl.
    ///
    /// Note that this call can only succeed after a call to `Vm::create_irq_chip`.
    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
    pub fn set_ioapic_state(&self, state: &kvm_ioapic_state) -> Result<()> {
        let mut irqchip_state = kvm_irqchip {
            chip_id: 2,
            ..Default::default()
        };
        irqchip_state.chip.ioapic = *state;
        // Safe because we know that our file is a VM fd, we know the kernel will only read the
        // correct amount of memory from our pointer, and we verify the return result.
        let ret = unsafe { ioctl_with_ref(self, KVM_SET_IRQCHIP(), &irqchip_state) };
        if ret == 0 {
            Ok(())
        } else {
            errno_result()
        }
    }

    /// Sets the level on the given irq to 1 if `active` is true, and 0 otherwise.
    #[cfg(any(
        target_arch = "x86",
        target_arch = "x86_64",
        target_arch = "arm",
        target_arch = "aarch64"
    ))]
    pub fn set_irq_line(&self, irq: u32, active: bool) -> Result<()> {
        let mut irq_level = kvm_irq_level::default();
        irq_level.__bindgen_anon_1.irq = irq;
        irq_level.level = if active { 1 } else { 0 };

        // Safe because we know that our file is a VM fd, we know the kernel will only read the
        // correct amount of memory from our pointer, and we verify the return result.
        let ret = unsafe { ioctl_with_ref(self, KVM_IRQ_LINE(), &irq_level) };
        if ret == 0 {
            Ok(())
        } else {
            errno_result()
        }
    }

    /// Creates a PIT as per the KVM_CREATE_PIT2 ioctl.
    ///
    /// Note that this call can only succeed after a call to `Vm::create_irq_chip`.
    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
    pub fn create_pit(&self) -> Result<()> {
        let pit_config = kvm_pit_config::default();
        // Safe because we know that our file is a VM fd, we know the kernel will only read the
        // correct amount of memory from our pointer, and we verify the return result.
        let ret = unsafe { ioctl_with_ref(self, KVM_CREATE_PIT2(), &pit_config) };
        if ret == 0 {
            Ok(())
        } else {
            errno_result()
        }
    }

    /// Retrieves the state of the PIT by issuing the KVM_GET_PIT2 ioctl.
    ///
    /// Note that this call can only succeed after a call to `Vm::create_pit`.
    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
    pub fn get_pit_state(&self) -> Result<kvm_pit_state2> {
        // Safe because we know that our file is a VM fd, we know the kernel will only write the
        // correct amount of memory to our pointer, and we verify the return result.
        let mut pit_state = unsafe { std::mem::zeroed() };
        let ret = unsafe { ioctl_with_mut_ref(self, KVM_GET_PIT2(), &mut pit_state) };
        if ret == 0 {
            Ok(pit_state)
        } else {
            errno_result()
        }
    }

    /// Sets the state of the PIT by issuing the KVM_SET_PIT2 ioctl.
    ///
    /// Note that this call can only succeed after a call to `Vm::create_pit`.
    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
    pub fn set_pit_state(&self, pit_state: &kvm_pit_state2) -> Result<()> {
        // Safe because we know that our file is a VM fd, we know the kernel will only read the
        // correct amount of memory from our pointer, and we verify the return result.
        let ret = unsafe { ioctl_with_ref(self, KVM_SET_PIT2(), pit_state) };
        if ret == 0 {
            Ok(())
        } else {
            errno_result()
        }
    }

    /// Registers an event to be signaled whenever a certain address is written to.
    ///
    /// The `datamatch` parameter can be used to limit signaling `evt` to only the cases where the
    /// value being written is equal to `datamatch`. Note that the size of `datamatch` is important
    /// and must match the expected size of the guest's write.
    ///
    /// In all cases where `evt` is signaled, the ordinary vmexit to userspace that would be
    /// triggered is prevented.
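    ///
    /// A sketch of wiring an `Event` to 4-byte writes of a specific value at an arbitrary
    /// MMIO address (the address and value here are made up for illustration):
    ///
    /// ```no_run
    /// use base::Event;
    /// use kvm::{Datamatch, IoeventAddress, Vm};
    ///
    /// fn wire(vm: &Vm) {
    ///     let evt = Event::new().unwrap();
    ///     vm.register_ioevent(&evt, IoeventAddress::Mmio(0xd000_0000), Datamatch::U32(Some(1)))
    ///         .unwrap();
    /// }
    /// ```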
    pub fn register_ioevent(
        &self,
        evt: &Event,
        addr: IoeventAddress,
        datamatch: Datamatch,
    ) -> Result<()> {
        self.ioeventfd(evt, addr, datamatch, false)
    }

    /// Unregisters an event previously registered with `register_ioevent`.
    ///
    /// The `evt`, `addr`, and `datamatch` set must be the same as the ones passed into
    /// `register_ioevent`.
    pub fn unregister_ioevent(
        &self,
        evt: &Event,
        addr: IoeventAddress,
        datamatch: Datamatch,
    ) -> Result<()> {
        self.ioeventfd(evt, addr, datamatch, true)
    }

    fn ioeventfd(
        &self,
        evt: &Event,
        addr: IoeventAddress,
        datamatch: Datamatch,
        deassign: bool,
    ) -> Result<()> {
        let (do_datamatch, datamatch_value, datamatch_len) = match datamatch {
            Datamatch::AnyLength => (false, 0, 0),
            Datamatch::U8(v) => match v {
                Some(u) => (true, u as u64, 1),
                None => (false, 0, 1),
            },
            Datamatch::U16(v) => match v {
                Some(u) => (true, u as u64, 2),
                None => (false, 0, 2),
            },
            Datamatch::U32(v) => match v {
                Some(u) => (true, u as u64, 4),
                None => (false, 0, 4),
            },
            Datamatch::U64(v) => match v {
                Some(u) => (true, u as u64, 8),
                None => (false, 0, 8),
            },
        };
        let mut flags = 0;
        if deassign {
            flags |= 1 << kvm_ioeventfd_flag_nr_deassign;
        }
        if do_datamatch {
            flags |= 1 << kvm_ioeventfd_flag_nr_datamatch
        }
        if let IoeventAddress::Pio(_) = addr {
            flags |= 1 << kvm_ioeventfd_flag_nr_pio;
        }
        let ioeventfd = kvm_ioeventfd {
            datamatch: datamatch_value,
            len: datamatch_len,
            addr: match addr {
                IoeventAddress::Pio(p) => p as u64,
                IoeventAddress::Mmio(m) => m,
            },
            fd: evt.as_raw_descriptor(),
            flags,
            ..Default::default()
        };
        // Safe because we know that our file is a VM fd, we know the kernel will only read the
        // correct amount of memory from our pointer, and we verify the return result.
        let ret = unsafe { ioctl_with_ref(self, KVM_IOEVENTFD(), &ioeventfd) };
        if ret == 0 {
            Ok(())
        } else {
            errno_result()
        }
    }

    /// Registers an event that will, when signalled, trigger the `gsi` irq, and `resample_evt`
    /// will get triggered when the irqchip is resampled.
    #[cfg(any(
        target_arch = "x86",
        target_arch = "x86_64",
        target_arch = "arm",
        target_arch = "aarch64"
    ))]
    pub fn register_irqfd_resample(
        &self,
        evt: &Event,
        resample_evt: &Event,
        gsi: u32,
    ) -> Result<()> {
        let irqfd = kvm_irqfd {
            flags: KVM_IRQFD_FLAG_RESAMPLE,
            fd: evt.as_raw_descriptor() as u32,
            resamplefd: resample_evt.as_raw_descriptor() as u32,
            gsi,
            ..Default::default()
        };
        // Safe because we know that our file is a VM fd, we know the kernel will only read the
        // correct amount of memory from our pointer, and we verify the return result.
        let ret = unsafe { ioctl_with_ref(self, KVM_IRQFD(), &irqfd) };
        if ret == 0 {
            Ok(())
        } else {
            errno_result()
        }
    }

    /// Unregisters an event that was previously registered with
    /// `register_irqfd`/`register_irqfd_resample`.
    ///
    /// The `evt` and `gsi` pair must be the same as the ones passed into
    /// `register_irqfd`/`register_irqfd_resample`.
    #[cfg(any(
        target_arch = "x86",
        target_arch = "x86_64",
        target_arch = "arm",
        target_arch = "aarch64"
    ))]
    pub fn unregister_irqfd(&self, evt: &Event, gsi: u32) -> Result<()> {
        let irqfd = kvm_irqfd {
            fd: evt.as_raw_descriptor() as u32,
            gsi,
            flags: KVM_IRQFD_FLAG_DEASSIGN,
            ..Default::default()
        };
        // Safe because we know that our file is a VM fd, we know the kernel will only read the
        // correct amount of memory from our pointer, and we verify the return result.
        let ret = unsafe { ioctl_with_ref(self, KVM_IRQFD(), &irqfd) };
        if ret == 0 {
            Ok(())
        } else {
            errno_result()
        }
    }

    /// Sets the GSI routing table, replacing any table set with previous calls to
    /// `set_gsi_routing`.
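    ///
    /// A sketch of routing GSI 5 to IOAPIC pin 5, using chip id 2 for the IOAPIC (an
    /// assumption mirroring `get_ioapic_state` above):
    ///
    /// ```no_run
    /// use kvm::{IrqRoute, IrqSource, Vm};
    ///
    /// fn route(vm: &Vm) {
    ///     let routes = [IrqRoute {
    ///         gsi: 5,
    ///         source: IrqSource::Irqchip { chip: 2, pin: 5 },
    ///     }];
    ///     vm.set_gsi_routing(&routes).unwrap();
    /// }
    /// ```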
    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
    pub fn set_gsi_routing(&self, routes: &[IrqRoute]) -> Result<()> {
        let mut irq_routing =
            vec_with_array_field::<kvm_irq_routing, kvm_irq_routing_entry>(routes.len());
        irq_routing[0].nr = routes.len() as u32;

        // Safe because we ensured there is enough space in irq_routing to hold the number of
        // route entries.
        let irq_routes = unsafe { irq_routing[0].entries.as_mut_slice(routes.len()) };
        for (route, irq_route) in routes.iter().zip(irq_routes.iter_mut()) {
            irq_route.gsi = route.gsi;
            match route.source {
                IrqSource::Irqchip { chip, pin } => {
                    irq_route.type_ = KVM_IRQ_ROUTING_IRQCHIP;
                    irq_route.u.irqchip = kvm_irq_routing_irqchip { irqchip: chip, pin }
                }
                IrqSource::Msi { address, data } => {
                    irq_route.type_ = KVM_IRQ_ROUTING_MSI;
                    irq_route.u.msi = kvm_irq_routing_msi {
                        address_lo: address as u32,
                        address_hi: (address >> 32) as u32,
                        data,
                        ..Default::default()
                    }
                }
            }
        }

        let ret = unsafe { ioctl_with_ref(self, KVM_SET_GSI_ROUTING(), &irq_routing[0]) };
        if ret == 0 {
            Ok(())
        } else {
            errno_result()
        }
    }

    /// Enable the specified capability.
    /// See documentation for KVM_ENABLE_CAP.
    /// This function is marked as unsafe because `cap` may contain values which are interpreted as
    /// pointers by the kernel.
    pub unsafe fn kvm_enable_cap(&self, cap: &kvm_enable_cap) -> Result<()> {
        // Safe because we allocated the struct and we know the kernel will read exactly the size of
        // the struct.
        let ret = ioctl_with_ref(self, KVM_ENABLE_CAP(), cap);
        if ret < 0 {
            errno_result()
        } else {
            Ok(())
        }
    }
}

impl AsRawDescriptor for Vm {
    fn as_raw_descriptor(&self) -> RawDescriptor {
        self.vm.as_raw_descriptor()
    }
}

/// A reason why a VCPU exited. One of these is returned every time `Vcpu::run` is called.
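///
/// A sketch of dispatching on a few common exits (the exit value itself would come from a
/// `RunnableVcpu`'s `run` method, defined later in this file):
///
/// ```no_run
/// use kvm::VcpuExit;
///
/// fn handle(exit: VcpuExit) {
///     match exit {
///         VcpuExit::IoOut { port, size, data } => {
///             println!("out port 0x{:x}: {:?}", port, &data[..size])
///         }
///         VcpuExit::MmioRead { address, size } => {
///             println!("mmio read at 0x{:x} ({} bytes)", address, size)
///         }
///         VcpuExit::Hlt => println!("vcpu halted"),
///         _ => (),
///     }
/// }
/// ```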
#[derive(Debug)]
pub enum VcpuExit {
    /// An out port instruction was run on the given port with the given data.
    IoOut {
        port: u16,
        size: usize,
        data: [u8; 8],
    },
    /// An in port instruction was run on the given port.
    ///
    /// The data that the instruction receives should be set with `set_data` before `Vcpu::run` is
    /// called again.
    IoIn {
        port: u16,
        size: usize,
    },
    /// A read instruction was run against the given MMIO address.
    ///
    /// The data that the instruction receives should be set with `set_data` before `Vcpu::run` is
    /// called again.
    MmioRead {
        address: u64,
        size: usize,
    },
    /// A write instruction was run against the given MMIO address with the given data.
    MmioWrite {
        address: u64,
        size: usize,
        data: [u8; 8],
    },
    IoapicEoi {
        vector: u8,
    },
    HypervSynic {
        msr: u32,
        control: u64,
        evt_page: u64,
        msg_page: u64,
    },
    HypervHcall {
        input: u64,
        params: [u64; 2],
    },
    Unknown,
    Exception,
    Hypercall,
    Debug,
    Hlt,
    IrqWindowOpen,
    Shutdown,
    FailEntry {
        hardware_entry_failure_reason: u64,
    },
    Intr,
    SetTpr,
    TprAccess,
    S390Sieic,
    S390Reset,
    Dcr,
    Nmi,
    InternalError,
    Osi,
    PaprHcall,
    S390Ucontrol,
    Watchdog,
    S390Tsch,
    Epr,
    /// The cpu triggered a system level event which is specified by the type field.
    /// The first field is the event type and the second field is flags.
    /// The possible event types are shutdown, reset, or crash.  So far there
    /// are not any flags defined.
    SystemEvent(u32 /* event_type */, u64 /* flags */),
}

/// A wrapper around creating and using a VCPU.
/// `Vcpu` provides all functionality except for running. To run, `to_runnable` must be called to
/// lock the vcpu to a thread. Then the returned `RunnableVcpu` can be used for running.
pub struct Vcpu {
    vcpu: File,
    run_mmap: MemoryMapping,
}

pub struct VcpuThread {
    run: *mut kvm_run,
    signal_num: Option<c_int>,
}

thread_local!(static VCPU_THREAD: RefCell<Option<VcpuThread>> = RefCell::new(None));

impl Vcpu {
    /// Constructs a new VCPU for `vm`.
    ///
    /// The `id` argument is the CPU number between [0, max vcpus).
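    ///
    /// A minimal sketch creating vcpu 0 (the guest memory layout is a placeholder):
    ///
    /// ```no_run
    /// use kvm::{Kvm, Vcpu, Vm};
    /// use vm_memory::{GuestAddress, GuestMemory};
    ///
    /// let kvm = Kvm::new().unwrap();
    /// let gm = GuestMemory::new(&[(GuestAddress(0), 0x1000)]).unwrap();
    /// let vm = Vm::new(&kvm, gm).unwrap();
    /// let vcpu = Vcpu::new(0, &kvm, &vm).unwrap();
    /// ```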
    pub fn new(id: c_ulong, kvm: &Kvm, vm: &Vm) -> Result<Vcpu> {
        let run_mmap_size = kvm.get_vcpu_mmap_size()?;

        // Safe because we know that vm is a VM fd and we verify the return result.
        let vcpu_fd = unsafe { ioctl_with_val(vm, KVM_CREATE_VCPU(), id) };
        if vcpu_fd < 0 {
            return errno_result();
        }

        // Wrap the vcpu now in case the following ? returns early. This is safe because we verified
        // the value of the fd and we own the fd.
        let vcpu = unsafe { File::from_raw_descriptor(vcpu_fd) };

        let run_mmap = MemoryMappingBuilder::new(run_mmap_size)
            .from_file(&vcpu)
            .build()
            .map_err(|_| Error::new(ENOSPC))?;

        Ok(Vcpu { vcpu, run_mmap })
    }

    /// Consumes `self` and returns a `RunnableVcpu`. A `RunnableVcpu` is required to run the
    /// guest.
    ///
    /// Assigns the vcpu to the current thread and stores it in a thread-local that can be used by
    /// signal handlers to call `set_local_immediate_exit()`. An optional signal number will be
    /// temporarily blocked while assigning the vcpu to the thread, and again while the
    /// `RunnableVcpu` is destroyed.
    ///
    /// Returns an error, `EBUSY`, if the current thread already contains a Vcpu.
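    ///
    /// A sketch of pinning a vcpu to the current thread; passing `None` skips the signal
    /// blocking described above:
    ///
    /// ```no_run
    /// use kvm::Vcpu;
    ///
    /// fn pin(vcpu: Vcpu) {
    ///     let _runnable = vcpu.to_runnable(None).expect("thread already has a vcpu");
    ///     // The returned `RunnableVcpu`'s `run` method may now be called on this thread.
    /// }
    /// ```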
    #[allow(clippy::cast_ptr_alignment)]
    pub fn to_runnable(self, signal_num: Option<c_int>) -> Result<RunnableVcpu> {
        // Block the signal while we add -- if a signal fires (very unlikely,
        // as this means something is trying to pause the vcpu before it has
        // even started) it'll try to grab the read lock while this write
        // lock is grabbed and cause a deadlock.
        // Assuming that a failure to block means it's already blocked.
        let _blocked_signal = signal_num.map(BlockedSignal::new);

        VCPU_THREAD.with(|v| {
            if v.borrow().is_none() {
                *v.borrow_mut() = Some(VcpuThread {
                    run: self.run_mmap.as_ptr() as *mut kvm_run,
                    signal_num,
                });
                Ok(())
            } else {
                Err(Error::new(EBUSY))
            }
        })?;

        Ok(RunnableVcpu {
            vcpu: self,
            phantom: Default::default(),
        })
    }

    /// Sets the data received by a mmio read, ioport in, or hypercall instruction.
    ///
    /// This function should be called after `Vcpu::run` returns a `VcpuExit::IoIn`,
    /// `VcpuExit::MmioRead`, or `VcpuExit::HypervHcall`.
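    ///
    /// A sketch of completing a 4-byte MMIO read with zeros (the corresponding
    /// `VcpuExit::MmioRead` would have been returned by a prior `run` call):
    ///
    /// ```no_run
    /// use kvm::Vcpu;
    ///
    /// fn complete_mmio_read(vcpu: &Vcpu) {
    ///     vcpu.set_data(&[0u8; 4]).unwrap();
    /// }
    /// ```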
    #[allow(clippy::cast_ptr_alignment)]
    pub fn set_data(&self, data: &[u8]) -> Result<()> {
        // Safe because we know we mapped enough memory to hold the kvm_run struct because the
        // kernel told us how large it was. The pointer is page aligned so casting to a different
        // type is well defined, hence the clippy allow attribute.
        let run = unsafe { &mut *(self.run_mmap.as_ptr() as *mut kvm_run) };
        match run.exit_reason {
            KVM_EXIT_IO => {
                let run_start = run as *mut kvm_run as *mut u8;
                // Safe because the exit_reason (which comes from the kernel) told us which
                // union field to use.
                let io = unsafe { run.__bindgen_anon_1.io };
                if io.direction as u32 != KVM_EXIT_IO_IN {
                    return Err(Error::new(EINVAL));
                }
                let data_size = (io.count as usize) * (io.size as usize);
                if data_size != data.len() {
                    return Err(Error::new(EINVAL));
                }
                // The data_offset is defined by the kernel to be some number of bytes into the
                // kvm_run structure, which we have fully mmap'd.
                unsafe {
                    let data_ptr = run_start.offset(io.data_offset as isize);
                    copy_nonoverlapping(data.as_ptr(), data_ptr, data_size);
                }
                Ok(())
            }
            KVM_EXIT_MMIO => {
                // Safe because the exit_reason (which comes from the kernel) told us which
                // union field to use.
                let mmio = unsafe { &mut run.__bindgen_anon_1.mmio };
                if mmio.is_write != 0 {
                    return Err(Error::new(EINVAL));
                }
                let len = mmio.len as usize;
                if len != data.len() {
                    return Err(Error::new(EINVAL));
                }
                mmio.data[..len].copy_from_slice(data);
                Ok(())
            }
            KVM_EXIT_HYPERV => {
                // Safe because the exit_reason (which comes from the kernel) told us which
                // union field to use.
                let hyperv = unsafe { &mut run.__bindgen_anon_1.hyperv };
                if hyperv.type_ != KVM_EXIT_HYPERV_HCALL {
                    return Err(Error::new(EINVAL));
                }
                let hcall = unsafe { &mut hyperv.u.hcall };
                if data.len() != std::mem::size_of::<u64>() {
                    return Err(Error::new(EINVAL));
                }
                // Copy into a fixed-size array first; copying into the temporary returned by
                // `to_ne_bytes()` would leave `hcall.result` unchanged.
                let mut result = [0u8; 8];
                result.copy_from_slice(data);
                hcall.result = u64::from_ne_bytes(result);
                Ok(())
            }
            _ => Err(Error::new(EINVAL)),
        }
    }

    /// Sets the bit that requests an immediate exit.
    #[allow(clippy::cast_ptr_alignment)]
    pub fn set_immediate_exit(&self, exit: bool) {
        // Safe because we know we mapped enough memory to hold the kvm_run struct because the
        // kernel told us how large it was. The pointer is page aligned so casting to a different
        // type is well defined, hence the clippy allow attribute.
        let run = unsafe { &mut *(self.run_mmap.as_ptr() as *mut kvm_run) };
        run.immediate_exit = if exit { 1 } else { 0 };
    }

    /// Sets/clears the bit for immediate exit for the vcpu on the current thread.
    pub fn set_local_immediate_exit(exit: bool) {
        VCPU_THREAD.with(|v| {
            if let Some(state) = &(*v.borrow()) {
                unsafe {
                    (*state.run).immediate_exit = if exit { 1 } else { 0 };
                };
            }
        });
    }

    /// Gets the VCPU registers.
    #[cfg(not(any(target_arch = "arm", target_arch = "aarch64")))]
    pub fn get_regs(&self) -> Result<kvm_regs> {
        // Safe because we know that our file is a VCPU fd, we know the kernel will only write the
        // correct amount of memory to our pointer, and we verify the return result.
        let mut regs = unsafe { std::mem::zeroed() };
        let ret = unsafe { ioctl_with_mut_ref(self, KVM_GET_REGS(), &mut regs) };
        if ret != 0 {
            return errno_result();
        }
        Ok(regs)
    }

    /// Sets the VCPU registers.
    #[cfg(not(any(target_arch = "arm", target_arch = "aarch64")))]
    pub fn set_regs(&self, regs: &kvm_regs) -> Result<()> {
        // Safe because we know that our file is a VCPU fd, we know the kernel will only read the
        // correct amount of memory from our pointer, and we verify the return result.
        let ret = unsafe { ioctl_with_ref(self, KVM_SET_REGS(), regs) };
        if ret != 0 {
            return errno_result();
        }
        Ok(())
    }

    /// Gets the VCPU special registers.
    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
    pub fn get_sregs(&self) -> Result<kvm_sregs> {
        // Safe because we know that our file is a VCPU fd, we know the kernel will only write the
        // correct amount of memory to our pointer, and we verify the return result.
        let mut regs = unsafe { std::mem::zeroed() };
        let ret = unsafe { ioctl_with_mut_ref(self, KVM_GET_SREGS(), &mut regs) };
        if ret != 0 {
            return errno_result();
        }
        Ok(regs)
    }

    /// Sets the VCPU special registers.
    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
    pub fn set_sregs(&self, sregs: &kvm_sregs) -> Result<()> {
        // Safe because we know that our file is a VCPU fd, we know the kernel will only read the
        // correct amount of memory from our pointer, and we verify the return result.
        let ret = unsafe { ioctl_with_ref(self, KVM_SET_SREGS(), sregs) };
        if ret != 0 {
            return errno_result();
        }
        Ok(())
    }

    /// Gets the VCPU FPU registers.
    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
    pub fn get_fpu(&self) -> Result<kvm_fpu> {
        // Safe because we know that our file is a VCPU fd, we know the kernel will only write the
        // correct amount of memory to our pointer, and we verify the return result.
        let mut regs = unsafe { std::mem::zeroed() };
        let ret = unsafe { ioctl_with_mut_ref(self, KVM_GET_FPU(), &mut regs) };
        if ret != 0 {
            return errno_result();
        }
        Ok(regs)
    }

    /// X86 specific call to setup the FPU
    ///
    /// See the documentation for KVM_SET_FPU.
    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
    pub fn set_fpu(&self, fpu: &kvm_fpu) -> Result<()> {
        let ret = unsafe {
            // Here we trust the kernel not to read past the end of the kvm_fpu struct.
            ioctl_with_ref(self, KVM_SET_FPU(), fpu)
        };
        if ret < 0 {
            return errno_result();
        }
        Ok(())
    }

    /// Gets the VCPU debug registers.
    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
    pub fn get_debugregs(&self) -> Result<kvm_debugregs> {
        // Safe because we know that our file is a VCPU fd, we know the kernel will only write the
        // correct amount of memory to our pointer, and we verify the return result.
        let mut regs = unsafe { std::mem::zeroed() };
        let ret = unsafe { ioctl_with_mut_ref(self, KVM_GET_DEBUGREGS(), &mut regs) };
        if ret != 0 {
            return errno_result();
        }
        Ok(regs)
    }

    /// Sets the VCPU debug registers
    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
    pub fn set_debugregs(&self, dregs: &kvm_debugregs) -> Result<()> {
        let ret = unsafe {
            // Here we trust the kernel not to read past the end of the kvm_debugregs struct.
            ioctl_with_ref(self, KVM_SET_DEBUGREGS(), dregs)
        };
        if ret < 0 {
            return errno_result();
        }
        Ok(())
    }

    /// Gets the VCPU extended control registers
    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
    pub fn get_xcrs(&self) -> Result<kvm_xcrs> {
        // Safe because we know that our file is a VCPU fd, we know the kernel will only write the
        // correct amount of memory to our pointer, and we verify the return result.
        let mut regs = unsafe { std::mem::zeroed() };
        let ret = unsafe { ioctl_with_mut_ref(self, KVM_GET_XCRS(), &mut regs) };
        if ret != 0 {
            return errno_result();
        }
        Ok(regs)
    }

    /// Sets the VCPU extended control registers
    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
    pub fn set_xcrs(&self, xcrs: &kvm_xcrs) -> Result<()> {
        let ret = unsafe {
            // Here we trust the kernel not to read past the end of the kvm_xcrs struct.
            ioctl_with_ref(self, KVM_SET_XCRS(), xcrs)
        };
        if ret < 0 {
            return errno_result();
        }
        Ok(())
    }

    /// X86 specific call to get the MSRS
    ///
    /// See the documentation for KVM_GET_MSRS.
    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
    pub fn get_msrs(&self, msr_entries: &mut Vec<kvm_msr_entry>) -> Result<()> {
        let mut msrs = vec_with_array_field::<kvm_msrs, kvm_msr_entry>(msr_entries.len());
        unsafe {
            // Mapping the unsized array to a slice is unsafe because the length isn't known.
            // Providing the length used to create the struct guarantees the entire slice is valid.
            let entries: &mut [kvm_msr_entry] = msrs[0].entries.as_mut_slice(msr_entries.len());
            entries.copy_from_slice(msr_entries);
        }
        msrs[0].nmsrs = msr_entries.len() as u32;
        let ret = unsafe {
            // Here we trust the kernel not to read or write past the end of the kvm_msrs struct.
            ioctl_with_ref(self, KVM_GET_MSRS(), &msrs[0])
        };
        if ret < 0 {
            // KVM_GET_MSRS actually returns the number of msr entries read.
            return errno_result();
        }
        unsafe {
            let count = ret as usize;
            assert!(count <= msr_entries.len());
            let entries: &mut [kvm_msr_entry] = msrs[0].entries.as_mut_slice(count);
            msr_entries.truncate(count);
            msr_entries.copy_from_slice(entries);
        }
        Ok(())
    }

    /// X86 specific call to setup the MSRS
    ///
    /// See the documentation for KVM_SET_MSRS.
    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
    pub fn set_msrs(&self, msrs: &kvm_msrs) -> Result<()> {
        let ret = unsafe {
            // Here we trust the kernel not to read past the end of the kvm_msrs struct.
            ioctl_with_ref(self, KVM_SET_MSRS(), msrs)
        };
        if ret < 0 {
            // KVM_SET_MSRS actually returns the number of msr entries written.
            return errno_result();
        }
        Ok(())
    }

    /// X86 specific call to setup the CPUID registers
    ///
    /// See the documentation for KVM_SET_CPUID2.
    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
    pub fn set_cpuid2(&self, cpuid: &CpuId) -> Result<()> {
        let ret = unsafe {
            // Here we trust the kernel not to read past the end of the kvm_cpuid2 struct.
            ioctl_with_ptr(self, KVM_SET_CPUID2(), cpuid.as_ptr())
        };
        if ret < 0 {
            return errno_result();
        }
        Ok(())
    }
1289 
1290     /// X86 specific call to get the system emulated hyper-v CPUID values
    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
    pub fn get_hyperv_cpuid(&self) -> Result<CpuId> {
        const MAX_KVM_CPUID_ENTRIES: usize = 256;
        let mut cpuid = CpuId::new(MAX_KVM_CPUID_ENTRIES);

        let ret = unsafe {
            // ioctl is unsafe. The kernel is trusted not to write beyond the bounds of the memory
            // allocated for the struct. The limit is read from nent, which is set to the allocated
            // size (MAX_KVM_CPUID_ENTRIES) above.
            ioctl_with_mut_ptr(self, KVM_GET_SUPPORTED_HV_CPUID(), cpuid.as_mut_ptr())
        };
        if ret < 0 {
            return errno_result();
        }
        Ok(cpuid)
    }

    /// X86 specific call to get the state of the "Local Advanced Programmable Interrupt Controller".
    ///
    /// See the documentation for KVM_GET_LAPIC.
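    ///
    /// # Example
    ///
    /// A minimal save/restore sketch (marked `ignore` since it needs
    /// `/dev/kvm`); assumes a `Vcpu` named `vcpu` on a VM with an in-kernel
    /// irqchip:
    ///
    /// ```ignore
    /// let klapic = vcpu.get_lapic()?;
    /// vcpu.set_lapic(&klapic)?;
    /// ```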
    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
    pub fn get_lapic(&self) -> Result<kvm_lapic_state> {
        let mut klapic: kvm_lapic_state = Default::default();

        let ret = unsafe {
            // The ioctl is unsafe unless you trust the kernel not to write past the end of the
            // local_apic struct.
            ioctl_with_mut_ref(self, KVM_GET_LAPIC(), &mut klapic)
        };
        if ret < 0 {
            return errno_result();
        }
        Ok(klapic)
    }

    /// X86 specific call to set the state of the "Local Advanced Programmable Interrupt Controller".
    ///
    /// See the documentation for KVM_SET_LAPIC.
    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
    pub fn set_lapic(&self, klapic: &kvm_lapic_state) -> Result<()> {
        let ret = unsafe {
            // The ioctl is safe because the kernel will only read from the klapic struct.
            ioctl_with_ref(self, KVM_SET_LAPIC(), klapic)
        };
        if ret < 0 {
            return errno_result();
        }
        Ok(())
    }

    /// Gets the vcpu's current "multiprocessing state".
    ///
    /// See the documentation for KVM_GET_MP_STATE. This call can only succeed after
    /// a call to `Vm::create_irq_chip`.
    ///
    /// Note that KVM defines the call for both x86 and s390 but we do not expect anyone
    /// to run crosvm on s390.
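    ///
    /// # Example
    ///
    /// A minimal get/set round trip (marked `ignore` since it needs
    /// `/dev/kvm`); assumes `vm.create_irq_chip()` has already succeeded:
    ///
    /// ```ignore
    /// let state = vcpu.get_mp_state()?;
    /// vcpu.set_mp_state(&state)?;
    /// ```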
    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
    pub fn get_mp_state(&self) -> Result<kvm_mp_state> {
        // Safe because we know that our file is a VCPU fd, we know the kernel will only
        // write the correct amount of memory to our pointer, and we verify the return result.
        let mut state: kvm_mp_state = unsafe { std::mem::zeroed() };
        let ret = unsafe { ioctl_with_mut_ref(self, KVM_GET_MP_STATE(), &mut state) };
        if ret < 0 {
            return errno_result();
        }
        Ok(state)
    }

    /// Sets the vcpu's current "multiprocessing state".
    ///
    /// See the documentation for KVM_SET_MP_STATE. This call can only succeed after
    /// a call to `Vm::create_irq_chip`.
    ///
    /// Note that KVM defines the call for both x86 and s390 but we do not expect anyone
    /// to run crosvm on s390.
    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
    pub fn set_mp_state(&self, state: &kvm_mp_state) -> Result<()> {
        let ret = unsafe {
            // The ioctl is safe because the kernel will only read from the kvm_mp_state struct.
            ioctl_with_ref(self, KVM_SET_MP_STATE(), state)
        };
        if ret < 0 {
            return errno_result();
        }
        Ok(())
    }

    /// Gets the vcpu's currently pending exceptions, interrupts, NMIs, etc.
    ///
    /// See the documentation for KVM_GET_VCPU_EVENTS.
    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
    pub fn get_vcpu_events(&self) -> Result<kvm_vcpu_events> {
        // Safe because we know that our file is a VCPU fd, we know the kernel
        // will only write the correct amount of memory to our pointer, and we
        // verify the return result.
        let mut events: kvm_vcpu_events = unsafe { std::mem::zeroed() };
        let ret = unsafe { ioctl_with_mut_ref(self, KVM_GET_VCPU_EVENTS(), &mut events) };
        if ret < 0 {
            return errno_result();
        }
        Ok(events)
    }

    /// Sets the vcpu's currently pending exceptions, interrupts, NMIs, etc.
    ///
    /// See the documentation for KVM_SET_VCPU_EVENTS.
    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
    pub fn set_vcpu_events(&self, events: &kvm_vcpu_events) -> Result<()> {
        let ret = unsafe {
            // The ioctl is safe because the kernel will only read from the
            // kvm_vcpu_events struct.
            ioctl_with_ref(self, KVM_SET_VCPU_EVENTS(), events)
        };
        if ret < 0 {
            return errno_result();
        }
        Ok(())
    }

    /// Enables the specified capability.
    ///
    /// See the documentation for KVM_ENABLE_CAP.
    ///
    /// # Safety
    ///
    /// This function is marked as unsafe because `cap` may contain values which are interpreted as
    /// pointers by the kernel.
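    ///
    /// # Example
    ///
    /// A minimal sketch (marked `ignore` since it needs `/dev/kvm`), mirroring
    /// the `enable_feature` test below:
    ///
    /// ```ignore
    /// let mut cap: kvm_enable_cap = Default::default();
    /// cap.cap = kvm_sys::KVM_CAP_HYPERV_SYNIC;
    /// // Safe because KVM_CAP_HYPERV_SYNIC takes no pointer arguments.
    /// unsafe { vcpu.kvm_enable_cap(&cap) }?;
    /// ```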
    pub unsafe fn kvm_enable_cap(&self, cap: &kvm_enable_cap) -> Result<()> {
        // Safe because we allocated the struct and we know the kernel will read exactly the size of
        // the struct.
        let ret = ioctl_with_ref(self, KVM_ENABLE_CAP(), cap);
        if ret < 0 {
            return errno_result();
        }
        Ok(())
    }

    /// Specifies a set of signals that are blocked during execution of KVM_RUN.
    /// Signals that are not blocked will cause KVM_RUN to return with -EINTR.
    ///
    /// See the documentation for KVM_SET_SIGNAL_MASK.
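    ///
    /// # Example
    ///
    /// A minimal sketch (marked `ignore` since it needs `/dev/kvm`), mirroring
    /// the `set_signal_mask` test below:
    ///
    /// ```ignore
    /// vcpu.set_signal_mask(&[base::SIGRTMIN() + 0])?;
    /// ```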
    pub fn set_signal_mask(&self, signals: &[c_int]) -> Result<()> {
        let sigset = signal::create_sigset(signals)?;

        let mut kvm_sigmask = vec_with_array_field::<kvm_signal_mask, sigset_t>(1);
        // The Rust definition of sigset_t takes 128 bytes, but the kernel only
        // expects an 8-byte structure, so we can't write
        // kvm_sigmask.len = size_of::<sigset_t>() as u32;
        kvm_sigmask[0].len = 8;
        // Ensure the length is not too big: this underflows at compile time if
        // sigset_t is smaller than the 8 bytes we copy below.
        const _ASSERT: usize = size_of::<sigset_t>() - 8usize;

        // Safe as we allocated exactly the needed space.
        unsafe {
            copy_nonoverlapping(
                &sigset as *const sigset_t as *const u8,
                kvm_sigmask[0].sigset.as_mut_ptr(),
                8,
            );
        }

        let ret = unsafe {
            // The ioctl is safe because the kernel will only read from the
            // kvm_signal_mask structure.
            ioctl_with_ref(self, KVM_SET_SIGNAL_MASK(), &kvm_sigmask[0])
        };
        if ret < 0 {
            return errno_result();
        }
        Ok(())
    }

    /// Sets the value of one register on this VCPU. The id of the register is
    /// encoded as specified in the kernel documentation for KVM_SET_ONE_REG.
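    ///
    /// # Example
    ///
    /// A minimal sketch (marked `ignore` since it needs `/dev/kvm`); `REG_ID`
    /// is a hypothetical stand-in for a real register id from the kernel's
    /// KVM_REG_ARM64 encoding:
    ///
    /// ```ignore
    /// const REG_ID: u64 = 0x6030_0000_0010_0000; // hypothetical id
    /// vcpu.set_one_reg(REG_ID, 0)?;
    /// ```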
    #[cfg(any(target_arch = "arm", target_arch = "aarch64"))]
    pub fn set_one_reg(&self, reg_id: u64, data: u64) -> Result<()> {
        let data_ref = &data as *const u64;
        let onereg = kvm_one_reg {
            id: reg_id,
            addr: data_ref as u64,
        };
        // Safe because we allocated the struct and we know the kernel will read
        // exactly the size of the struct.
        let ret = unsafe { ioctl_with_ref(self, KVM_SET_ONE_REG(), &onereg) };
        if ret < 0 {
            return errno_result();
        }
        Ok(())
    }
}

impl AsRawDescriptor for Vcpu {
    fn as_raw_descriptor(&self) -> RawDescriptor {
        self.vcpu.as_raw_descriptor()
    }
}

/// A Vcpu that has a thread and can be run. Created by calling `to_runnable` on a `Vcpu`.
/// Implements `Deref` to a `Vcpu` so all `Vcpu` methods are usable, with the addition of the `run`
/// function to execute the guest.
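///
/// # Example
///
/// A minimal sketch (marked `ignore` since it needs `/dev/kvm`); the
/// `Option<c_int>` signal argument to `to_runnable` is an assumption based on
/// its use elsewhere in crosvm:
///
/// ```ignore
/// let vcpu = Vcpu::new(0, &kvm, &vm)?;
/// let runnable = vcpu.to_runnable(Some(base::SIGRTMIN() + 0))?;
/// let exit = runnable.run()?;
/// ```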
pub struct RunnableVcpu {
    vcpu: Vcpu,
    // vcpus must stay on the same thread once they start.
    // Add the PhantomData pointer to ensure RunnableVcpu is not `Send`.
    phantom: std::marker::PhantomData<*mut u8>,
}

impl RunnableVcpu {
    /// Runs the VCPU until it exits, returning the reason for the exit.
    ///
    /// Note that the state of the VCPU and associated VM must be set up first for this to do
    /// anything useful.
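    ///
    /// # Example
    ///
    /// A minimal dispatch-loop sketch (marked `ignore` since it needs
    /// `/dev/kvm`); assumes a fully configured `RunnableVcpu` named `vcpu`:
    ///
    /// ```ignore
    /// loop {
    ///     match vcpu.run()? {
    ///         VcpuExit::Hlt => break,
    ///         VcpuExit::IoOut { port, size, data } => {
    ///             // Forward the first `size` bytes of `data` to the device at `port`.
    ///         }
    ///         VcpuExit::MmioRead { address, size } => {
    ///             // Service the read, then resume the guest.
    ///         }
    ///         _ => {}
    ///     }
    /// }
    /// ```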
    #[allow(clippy::cast_ptr_alignment)]
    // The pointer is page aligned so casting to a different type is well defined, hence the clippy
    // allow attribute.
    pub fn run(&self) -> Result<VcpuExit> {
        // Safe because we know that our file is a VCPU fd and we verify the return result.
        let ret = unsafe { ioctl(self, KVM_RUN()) };
        if ret == 0 {
            // Safe because we know we mapped enough memory to hold the kvm_run struct because the
            // kernel told us how large it was.
            let run = unsafe { &*(self.run_mmap.as_ptr() as *const kvm_run) };
            match run.exit_reason {
                KVM_EXIT_IO => {
                    // Safe because the exit_reason (which comes from the kernel) told us which
                    // union field to use.
                    let io = unsafe { run.__bindgen_anon_1.io };
                    let port = io.port;
                    let size = (io.count as usize) * (io.size as usize);
                    match io.direction as u32 {
                        KVM_EXIT_IO_IN => Ok(VcpuExit::IoIn { port, size }),
                        KVM_EXIT_IO_OUT => {
                            let mut data = [0; 8];
                            let run_start = run as *const kvm_run as *const u8;
                            // The data_offset is defined by the kernel to be some number of bytes
                            // into the kvm_run structure, which we have fully mmap'd.
                            unsafe {
                                let data_ptr = run_start.offset(io.data_offset as isize);
                                copy_nonoverlapping(
                                    data_ptr,
                                    data.as_mut_ptr(),
                                    min(size, data.len()),
                                );
                            }
                            Ok(VcpuExit::IoOut { port, size, data })
                        }
                        _ => Err(Error::new(EINVAL)),
                    }
                }
                KVM_EXIT_MMIO => {
                    // Safe because the exit_reason (which comes from the kernel) told us which
                    // union field to use.
                    let mmio = unsafe { &run.__bindgen_anon_1.mmio };
                    let address = mmio.phys_addr;
                    let size = min(mmio.len as usize, mmio.data.len());
                    if mmio.is_write != 0 {
                        Ok(VcpuExit::MmioWrite {
                            address,
                            size,
                            data: mmio.data,
                        })
                    } else {
                        Ok(VcpuExit::MmioRead { address, size })
                    }
                }
                KVM_EXIT_IOAPIC_EOI => {
                    // Safe because the exit_reason (which comes from the kernel) told us which
                    // union field to use.
                    let vector = unsafe { run.__bindgen_anon_1.eoi.vector };
                    Ok(VcpuExit::IoapicEoi { vector })
                }
                KVM_EXIT_HYPERV => {
                    // Safe because the exit_reason (which comes from the kernel) told us which
                    // union field to use.
                    let hyperv = unsafe { &run.__bindgen_anon_1.hyperv };
                    match hyperv.type_ as u32 {
                        KVM_EXIT_HYPERV_SYNIC => {
                            let synic = unsafe { &hyperv.u.synic };
                            Ok(VcpuExit::HypervSynic {
                                msr: synic.msr,
                                control: synic.control,
                                evt_page: synic.evt_page,
                                msg_page: synic.msg_page,
                            })
                        }
                        KVM_EXIT_HYPERV_HCALL => {
                            let hcall = unsafe { &hyperv.u.hcall };
                            Ok(VcpuExit::HypervHcall {
                                input: hcall.input,
                                params: hcall.params,
                            })
                        }
                        _ => Err(Error::new(EINVAL)),
                    }
                }
                KVM_EXIT_UNKNOWN => Ok(VcpuExit::Unknown),
                KVM_EXIT_EXCEPTION => Ok(VcpuExit::Exception),
                KVM_EXIT_HYPERCALL => Ok(VcpuExit::Hypercall),
                KVM_EXIT_DEBUG => Ok(VcpuExit::Debug),
                KVM_EXIT_HLT => Ok(VcpuExit::Hlt),
                KVM_EXIT_IRQ_WINDOW_OPEN => Ok(VcpuExit::IrqWindowOpen),
                KVM_EXIT_SHUTDOWN => Ok(VcpuExit::Shutdown),
                KVM_EXIT_FAIL_ENTRY => {
                    // Safe because the exit_reason (which comes from the kernel) told us which
                    // union field to use.
                    let hardware_entry_failure_reason = unsafe {
                        run.__bindgen_anon_1
                            .fail_entry
                            .hardware_entry_failure_reason
                    };
                    Ok(VcpuExit::FailEntry {
                        hardware_entry_failure_reason,
                    })
                }
                KVM_EXIT_INTR => Ok(VcpuExit::Intr),
                KVM_EXIT_SET_TPR => Ok(VcpuExit::SetTpr),
                KVM_EXIT_TPR_ACCESS => Ok(VcpuExit::TprAccess),
                KVM_EXIT_S390_SIEIC => Ok(VcpuExit::S390Sieic),
                KVM_EXIT_S390_RESET => Ok(VcpuExit::S390Reset),
                KVM_EXIT_DCR => Ok(VcpuExit::Dcr),
                KVM_EXIT_NMI => Ok(VcpuExit::Nmi),
                KVM_EXIT_INTERNAL_ERROR => Ok(VcpuExit::InternalError),
                KVM_EXIT_OSI => Ok(VcpuExit::Osi),
                KVM_EXIT_PAPR_HCALL => Ok(VcpuExit::PaprHcall),
                KVM_EXIT_S390_UCONTROL => Ok(VcpuExit::S390Ucontrol),
                KVM_EXIT_WATCHDOG => Ok(VcpuExit::Watchdog),
                KVM_EXIT_S390_TSCH => Ok(VcpuExit::S390Tsch),
                KVM_EXIT_EPR => Ok(VcpuExit::Epr),
                KVM_EXIT_SYSTEM_EVENT => {
                    // Safe because we know the exit reason told us this union
                    // field is valid.
                    let event_type = unsafe { run.__bindgen_anon_1.system_event.type_ };
                    let event_flags = unsafe { run.__bindgen_anon_1.system_event.flags };
                    Ok(VcpuExit::SystemEvent(event_type, event_flags))
                }
                r => panic!("unknown kvm exit reason: {}", r),
            }
        } else {
            errno_result()
        }
    }
}

impl Deref for RunnableVcpu {
    type Target = Vcpu;
    fn deref(&self) -> &Self::Target {
        &self.vcpu
    }
}

impl DerefMut for RunnableVcpu {
    fn deref_mut(&mut self) -> &mut Self::Target {
        &mut self.vcpu
    }
}

impl AsRawDescriptor for RunnableVcpu {
    fn as_raw_descriptor(&self) -> RawDescriptor {
        self.vcpu.as_raw_descriptor()
    }
}

impl Drop for RunnableVcpu {
    fn drop(&mut self) {
        VCPU_THREAD.with(|v| {
            // This assumes that a failure in `BlockedSignal::new` means the signal is already
            // blocked and therefore it should not be unblocked on exit.
            let _blocked_signal = &(*v.borrow())
                .as_ref()
                .and_then(|state| state.signal_num)
                .map(BlockedSignal::new);

            *v.borrow_mut() = None;
        });
    }
}

/// Wrapper for kvm_cpuid2 which has a zero-length array at the end.
/// Hides the zero-length array behind a bounds check.
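///
/// # Example
///
/// A minimal sketch of the bounds-checked access this wrapper provides
/// (marked `ignore`; assumes `FlexibleArrayWrapper::new` sizes the trailing
/// array to the requested entry count):
///
/// ```ignore
/// let mut cpuid = CpuId::new(2);
/// let entries = cpuid.mut_entries_slice();
/// entries[0].function = 0;
/// assert_eq!(entries.len(), 2);
/// ```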
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
pub type CpuId = FlexibleArrayWrapper<kvm_cpuid2, kvm_cpuid_entry2>;

// Represents a temporarily blocked signal. It will unblock the signal when dropped.
struct BlockedSignal {
    signal_num: c_int,
}

impl BlockedSignal {
    // Returns a `BlockedSignal` if the specified signal can be blocked, otherwise None.
    fn new(signal_num: c_int) -> Option<BlockedSignal> {
        if block_signal(signal_num).is_ok() {
            Some(BlockedSignal { signal_num })
        } else {
            None
        }
    }
}

impl Drop for BlockedSignal {
    fn drop(&mut self) {
        unblock_signal(self.signal_num).expect("failed to restore signal mask");
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn dirty_log_size() {
        let page_size = pagesize();
        assert_eq!(dirty_log_bitmap_size(0), 0);
        assert_eq!(dirty_log_bitmap_size(page_size), 1);
        assert_eq!(dirty_log_bitmap_size(page_size * 8), 1);
        assert_eq!(dirty_log_bitmap_size(page_size * 8 + 1), 2);
        assert_eq!(dirty_log_bitmap_size(page_size * 100), 13);
    }

    #[test]
    fn new() {
        Kvm::new().unwrap();
    }

    #[test]
    fn create_vm() {
        let kvm = Kvm::new().unwrap();
        let gm = GuestMemory::new(&vec![(GuestAddress(0), 0x1000)]).unwrap();
        Vm::new(&kvm, gm).unwrap();
    }

    #[test]
    fn check_extension() {
        let kvm = Kvm::new().unwrap();
        assert!(kvm.check_extension(Cap::UserMemory));
        // I assume nobody is testing this on s390.
        assert!(!kvm.check_extension(Cap::S390UserSigp));
    }

    #[test]
    fn check_vm_extension() {
        let kvm = Kvm::new().unwrap();
        let gm = GuestMemory::new(&vec![(GuestAddress(0), 0x1000)]).unwrap();
        let vm = Vm::new(&kvm, gm).unwrap();
        assert!(vm.check_extension(Cap::UserMemory));
        // I assume nobody is testing this on s390.
        assert!(!vm.check_extension(Cap::S390UserSigp));
    }

    #[test]
    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
    fn get_supported_cpuid() {
        let kvm = Kvm::new().unwrap();
        let mut cpuid = kvm.get_supported_cpuid().unwrap();
        let cpuid_entries = cpuid.mut_entries_slice();
        assert!(!cpuid_entries.is_empty());
    }

    #[test]
    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
    fn get_emulated_cpuid() {
        let kvm = Kvm::new().unwrap();
        kvm.get_emulated_cpuid().unwrap();
    }

    #[test]
    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
    fn get_msr_index_list() {
        let kvm = Kvm::new().unwrap();
        let msr_list = kvm.get_msr_index_list().unwrap();
        assert!(msr_list.len() >= 2);
    }

    #[test]
    fn add_memory() {
        let kvm = Kvm::new().unwrap();
        let gm = GuestMemory::new(&vec![
            (GuestAddress(0), 0x1000),
            (GuestAddress(0x5000), 0x5000),
        ])
        .unwrap();
        let mut vm = Vm::new(&kvm, gm).unwrap();
        let mem_size = 0x1000;
        let mem = MemoryMappingBuilder::new(mem_size).build().unwrap();
        vm.add_memory_region(GuestAddress(0x1000), Box::new(mem), false, false)
            .unwrap();
        let mem = MemoryMappingBuilder::new(mem_size).build().unwrap();
        vm.add_memory_region(GuestAddress(0x10000), Box::new(mem), false, false)
            .unwrap();
    }

    #[test]
    fn add_memory_ro() {
        let kvm = Kvm::new().unwrap();
        let gm = GuestMemory::new(&vec![(GuestAddress(0), 0x1000)]).unwrap();
        let mut vm = Vm::new(&kvm, gm).unwrap();
        let mem_size = 0x1000;
        let mem = MemoryMappingBuilder::new(mem_size).build().unwrap();
        vm.add_memory_region(GuestAddress(0x1000), Box::new(mem), true, false)
            .unwrap();
    }

    #[test]
    fn remove_memory_region() {
        let kvm = Kvm::new().unwrap();
        let gm = GuestMemory::new(&vec![(GuestAddress(0), 0x1000)]).unwrap();
        let mut vm = Vm::new(&kvm, gm).unwrap();
        let mem_size = 0x1000;
        let mem = MemoryMappingBuilder::new(mem_size).build().unwrap();
        let mem_ptr = mem.as_ptr();
        let slot = vm
            .add_memory_region(GuestAddress(0x1000), Box::new(mem), false, false)
            .unwrap();
        let removed_mem = vm.remove_memory_region(slot).unwrap();
        assert_eq!(removed_mem.size(), mem_size);
        assert_eq!(removed_mem.as_ptr(), mem_ptr);
    }

    #[test]
    fn remove_invalid_memory() {
        let kvm = Kvm::new().unwrap();
        let gm = GuestMemory::new(&vec![(GuestAddress(0), 0x1000)]).unwrap();
        let mut vm = Vm::new(&kvm, gm).unwrap();
        assert!(vm.remove_memory_region(0).is_err());
    }

    #[test]
    fn overlap_memory() {
        let kvm = Kvm::new().unwrap();
        let gm = GuestMemory::new(&vec![(GuestAddress(0), 0x10000)]).unwrap();
        let mut vm = Vm::new(&kvm, gm).unwrap();
        let mem_size = 0x2000;
        let mem = MemoryMappingBuilder::new(mem_size).build().unwrap();
        assert!(vm
            .add_memory_region(GuestAddress(0x2000), Box::new(mem), false, false)
            .is_err());
    }

    #[test]
    fn get_memory() {
        let kvm = Kvm::new().unwrap();
        let gm = GuestMemory::new(&vec![(GuestAddress(0), 0x1000)]).unwrap();
        let vm = Vm::new(&kvm, gm).unwrap();
        let obj_addr = GuestAddress(0xf0);
        vm.get_memory().write_obj_at_addr(67u8, obj_addr).unwrap();
        let read_val: u8 = vm.get_memory().read_obj_from_addr(obj_addr).unwrap();
        assert_eq!(read_val, 67u8);
    }

    #[test]
    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
    fn clock_handling() {
        let kvm = Kvm::new().unwrap();
        let gm = GuestMemory::new(&vec![(GuestAddress(0), 0x10000)]).unwrap();
        let vm = Vm::new(&kvm, gm).unwrap();
        let mut clock_data = vm.get_clock().unwrap();
        clock_data.clock += 1000;
        vm.set_clock(&clock_data).unwrap();
    }

    #[test]
    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
    fn pic_handling() {
        let kvm = Kvm::new().unwrap();
        let gm = GuestMemory::new(&vec![(GuestAddress(0), 0x10000)]).unwrap();
        let vm = Vm::new(&kvm, gm).unwrap();
        vm.create_irq_chip().unwrap();
        let pic_state = vm.get_pic_state(PicId::Secondary).unwrap();
        vm.set_pic_state(PicId::Secondary, &pic_state).unwrap();
    }

    #[test]
    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
    fn ioapic_handling() {
        let kvm = Kvm::new().unwrap();
        let gm = GuestMemory::new(&vec![(GuestAddress(0), 0x10000)]).unwrap();
        let vm = Vm::new(&kvm, gm).unwrap();
        vm.create_irq_chip().unwrap();
        let ioapic_state = vm.get_ioapic_state().unwrap();
        vm.set_ioapic_state(&ioapic_state).unwrap();
    }

    #[test]
    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
    fn pit_handling() {
        let kvm = Kvm::new().unwrap();
        let gm = GuestMemory::new(&vec![(GuestAddress(0), 0x10000)]).unwrap();
        let vm = Vm::new(&kvm, gm).unwrap();
        vm.create_irq_chip().unwrap();
        vm.create_pit().unwrap();
        let pit_state = vm.get_pit_state().unwrap();
        vm.set_pit_state(&pit_state).unwrap();
    }

    #[test]
    fn register_ioevent() {
        let kvm = Kvm::new().unwrap();
        let gm = GuestMemory::new(&vec![(GuestAddress(0), 0x10000)]).unwrap();
        let vm = Vm::new(&kvm, gm).unwrap();
        let evtfd = Event::new().unwrap();
        vm.register_ioevent(&evtfd, IoeventAddress::Pio(0xf4), Datamatch::AnyLength)
            .unwrap();
        vm.register_ioevent(&evtfd, IoeventAddress::Mmio(0x1000), Datamatch::AnyLength)
            .unwrap();
        vm.register_ioevent(
            &evtfd,
            IoeventAddress::Pio(0xc1),
            Datamatch::U8(Some(0x7fu8)),
        )
        .unwrap();
        vm.register_ioevent(
            &evtfd,
            IoeventAddress::Pio(0xc2),
            Datamatch::U16(Some(0x1337u16)),
        )
        .unwrap();
        vm.register_ioevent(
            &evtfd,
            IoeventAddress::Pio(0xc4),
            Datamatch::U32(Some(0xdeadbeefu32)),
        )
        .unwrap();
        vm.register_ioevent(
            &evtfd,
            IoeventAddress::Pio(0xc8),
            Datamatch::U64(Some(0xdeadbeefdeadbeefu64)),
        )
        .unwrap();
    }

    #[test]
    fn unregister_ioevent() {
        let kvm = Kvm::new().unwrap();
        let gm = GuestMemory::new(&vec![(GuestAddress(0), 0x10000)]).unwrap();
        let vm = Vm::new(&kvm, gm).unwrap();
        let evtfd = Event::new().unwrap();
        vm.register_ioevent(&evtfd, IoeventAddress::Pio(0xf4), Datamatch::AnyLength)
            .unwrap();
        vm.register_ioevent(&evtfd, IoeventAddress::Mmio(0x1000), Datamatch::AnyLength)
            .unwrap();
        vm.register_ioevent(
            &evtfd,
            IoeventAddress::Mmio(0x1004),
            Datamatch::U8(Some(0x7fu8)),
        )
        .unwrap();
        vm.unregister_ioevent(&evtfd, IoeventAddress::Pio(0xf4), Datamatch::AnyLength)
            .unwrap();
        vm.unregister_ioevent(&evtfd, IoeventAddress::Mmio(0x1000), Datamatch::AnyLength)
            .unwrap();
        vm.unregister_ioevent(
            &evtfd,
            IoeventAddress::Mmio(0x1004),
            Datamatch::U8(Some(0x7fu8)),
        )
        .unwrap();
    }

    #[test]
    fn irqfd_resample() {
        let kvm = Kvm::new().unwrap();
        let gm = GuestMemory::new(&vec![(GuestAddress(0), 0x10000)]).unwrap();
        let vm = Vm::new(&kvm, gm).unwrap();
        let evtfd1 = Event::new().unwrap();
        let evtfd2 = Event::new().unwrap();
        vm.create_irq_chip().unwrap();
        vm.register_irqfd_resample(&evtfd1, &evtfd2, 4).unwrap();
        vm.unregister_irqfd(&evtfd1, 4).unwrap();
        // Ensures the ioctl is actually reading the resamplefd.
        vm.register_irqfd_resample(&evtfd1, unsafe { &Event::from_raw_descriptor(-1) }, 4)
            .unwrap_err();
    }

    #[test]
    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
    fn set_gsi_routing() {
        let kvm = Kvm::new().unwrap();
        let gm = GuestMemory::new(&vec![(GuestAddress(0), 0x10000)]).unwrap();
        let vm = Vm::new(&kvm, gm).unwrap();
        vm.create_irq_chip().unwrap();
        vm.set_gsi_routing(&[]).unwrap();
        vm.set_gsi_routing(&[IrqRoute {
            gsi: 1,
            source: IrqSource::Irqchip {
                chip: KVM_IRQCHIP_IOAPIC,
                pin: 3,
            },
        }])
        .unwrap();
        vm.set_gsi_routing(&[IrqRoute {
            gsi: 1,
            source: IrqSource::Msi {
                address: 0xf000000,
                data: 0xa0,
            },
        }])
        .unwrap();
        vm.set_gsi_routing(&[
            IrqRoute {
                gsi: 1,
                source: IrqSource::Irqchip {
                    chip: KVM_IRQCHIP_IOAPIC,
                    pin: 3,
                },
            },
            IrqRoute {
                gsi: 2,
                source: IrqSource::Msi {
                    address: 0xf000000,
                    data: 0xa0,
                },
            },
        ])
        .unwrap();
    }

    #[test]
    fn create_vcpu() {
        let kvm = Kvm::new().unwrap();
        let gm = GuestMemory::new(&vec![(GuestAddress(0), 0x10000)]).unwrap();
        let vm = Vm::new(&kvm, gm).unwrap();
        Vcpu::new(0, &kvm, &vm).unwrap();
    }

    #[test]
    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
    fn debugregs() {
        let kvm = Kvm::new().unwrap();
        let gm = GuestMemory::new(&vec![(GuestAddress(0), 0x10000)]).unwrap();
        let vm = Vm::new(&kvm, gm).unwrap();
        let vcpu = Vcpu::new(0, &kvm, &vm).unwrap();
        let mut dregs = vcpu.get_debugregs().unwrap();
        dregs.dr7 = 13;
        vcpu.set_debugregs(&dregs).unwrap();
        let dregs2 = vcpu.get_debugregs().unwrap();
        assert_eq!(dregs.dr7, dregs2.dr7);
    }

    #[test]
    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
    fn xcrs() {
        let kvm = Kvm::new().unwrap();
        if !kvm.check_extension(Cap::Xcrs) {
            return;
        }

        let gm = GuestMemory::new(&vec![(GuestAddress(0), 0x10000)]).unwrap();
        let vm = Vm::new(&kvm, gm).unwrap();
        let vcpu = Vcpu::new(0, &kvm, &vm).unwrap();
        let mut xcrs = vcpu.get_xcrs().unwrap();
        xcrs.xcrs[0].value = 1;
        vcpu.set_xcrs(&xcrs).unwrap();
        let xcrs2 = vcpu.get_xcrs().unwrap();
        assert_eq!(xcrs.xcrs[0].value, xcrs2.xcrs[0].value);
    }

    #[test]
    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
    fn get_msrs() {
        let kvm = Kvm::new().unwrap();
        let gm = GuestMemory::new(&vec![(GuestAddress(0), 0x10000)]).unwrap();
        let vm = Vm::new(&kvm, gm).unwrap();
        let vcpu = Vcpu::new(0, &kvm, &vm).unwrap();
        let mut msrs = vec![
            // This one should succeed.
            kvm_msr_entry {
                index: 0x0000011e,
                ..Default::default()
            },
            // This one will fail to fetch.
            kvm_msr_entry {
                index: 0x000003f1,
                ..Default::default()
            },
        ];
        vcpu.get_msrs(&mut msrs).unwrap();
        assert_eq!(msrs.len(), 1);
    }

    #[test]
    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
    fn get_hyperv_cpuid() {
        let kvm = Kvm::new().unwrap();
        let gm = GuestMemory::new(&vec![(GuestAddress(0), 0x10000)]).unwrap();
        let vm = Vm::new(&kvm, gm).unwrap();
        let vcpu = Vcpu::new(0, &kvm, &vm).unwrap();
        let cpuid = vcpu.get_hyperv_cpuid();
        // Older kernels don't support this ioctl, so tolerate that kind of failure.
        match cpuid {
            Ok(_) => {}
            Err(e) => {
                assert_eq!(e.errno(), EINVAL);
            }
        }
    }

    #[test]
    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
    fn enable_feature() {
        let kvm = Kvm::new().unwrap();
        let gm = GuestMemory::new(&vec![(GuestAddress(0), 0x10000)]).unwrap();
        let vm = Vm::new(&kvm, gm).unwrap();
        vm.create_irq_chip().unwrap();
        let vcpu = Vcpu::new(0, &kvm, &vm).unwrap();
        let mut cap: kvm_enable_cap = Default::default();
        cap.cap = kvm_sys::KVM_CAP_HYPERV_SYNIC;
        unsafe { vcpu.kvm_enable_cap(&cap) }.unwrap();
    }

    #[test]
    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
    fn mp_state() {
        let kvm = Kvm::new().unwrap();
        let gm = GuestMemory::new(&vec![(GuestAddress(0), 0x10000)]).unwrap();
        let vm = Vm::new(&kvm, gm).unwrap();
        vm.create_irq_chip().unwrap();
        let vcpu = Vcpu::new(0, &kvm, &vm).unwrap();
        let state = vcpu.get_mp_state().unwrap();
        vcpu.set_mp_state(&state).unwrap();
    }

    #[test]
    fn set_signal_mask() {
        let kvm = Kvm::new().unwrap();
        let gm = GuestMemory::new(&vec![(GuestAddress(0), 0x10000)]).unwrap();
        let vm = Vm::new(&kvm, gm).unwrap();
        let vcpu = Vcpu::new(0, &kvm, &vm).unwrap();
        vcpu.set_signal_mask(&[base::SIGRTMIN() + 0]).unwrap();
    }

    #[test]
    fn vcpu_mmap_size() {
        let kvm = Kvm::new().unwrap();
        let mmap_size = kvm.get_vcpu_mmap_size().unwrap();
        let page_size = pagesize();
        assert!(mmap_size >= page_size);
        assert!(mmap_size % page_size == 0);
    }

    #[test]
    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
    fn set_identity_map_addr() {
        let kvm = Kvm::new().unwrap();
        let gm = GuestMemory::new(&vec![(GuestAddress(0), 0x10000)]).unwrap();
        let vm = Vm::new(&kvm, gm).unwrap();
        vm.set_identity_map_addr(GuestAddress(0x20000)).unwrap();
    }
}