1 // Copyright 2017 The Chromium OS Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4
5 use std::fmt::{self, Display};
6 use std::{mem, result};
7
8 use base::{self, warn};
9 use hypervisor::{Fpu, Register, Regs, Sregs, VcpuX86_64};
10 use vm_memory::{GuestAddress, GuestMemory};
11
12 use crate::gdt;
13
/// Errors produced while configuring x86_64 vCPU state (MSRs, FPU, general
/// and special registers) or while writing boot structures (GDT, IDT, page
/// tables) into guest memory.
#[derive(Debug)]
pub enum Error {
    /// Setting up msrs failed.
    MsrIoctlFailed(base::Error),
    /// Failed to configure the FPU.
    FpuIoctlFailed(base::Error),
    /// Failed to get sregs for this cpu.
    GetSRegsIoctlFailed(base::Error),
    /// Failed to set base registers for this cpu.
    SettingRegistersIoctl(base::Error),
    /// Failed to set sregs for this cpu.
    SetSRegsIoctlFailed(base::Error),
    /// Writing the GDT to RAM failed.
    WriteGDTFailure,
    /// Writing the IDT to RAM failed.
    WriteIDTFailure,
    /// Writing PML4 to RAM failed.
    WritePML4Address,
    /// Writing PDPTE to RAM failed.
    WritePDPTEAddress,
    /// Writing PDE to RAM failed.
    WritePDEAddress,
}
/// Shorthand result type using this module's [`Error`].
pub type Result<T> = result::Result<T, Error>;
38
39 impl std::error::Error for Error {}
40
41 impl Display for Error {
fmt(&self, f: &mut fmt::Formatter) -> fmt::Result42 fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
43 use self::Error::*;
44
45 match self {
46 MsrIoctlFailed(e) => write!(f, "setting up msrs failed: {}", e),
47 FpuIoctlFailed(e) => write!(f, "failed to configure the FPU: {}", e),
48 GetSRegsIoctlFailed(e) => write!(f, "failed to get sregs for this cpu: {}", e),
49 SettingRegistersIoctl(e) => {
50 write!(f, "failed to set base registers for this cpu: {}", e)
51 }
52 SetSRegsIoctlFailed(e) => write!(f, "failed to set sregs for this cpu: {}", e),
53 WriteGDTFailure => write!(f, "writing the GDT to RAM failed"),
54 WriteIDTFailure => write!(f, "writing the IDT to RAM failed"),
55 WritePML4Address => write!(f, "writing PML4 to RAM failed"),
56 WritePDPTEAddress => write!(f, "writing PDPTE to RAM failed"),
57 WritePDEAddress => write!(f, "writing PDE to RAM failed"),
58 }
59 }
60 }
61
// MTRR memory-type encodings used below: uncacheable and write-back.
const MTRR_MEMTYPE_UC: u8 = 0x0;
const MTRR_MEMTYPE_WB: u8 = 0x6;
// Bit 11 of a variable-range PHYSMASK MSR: marks the entry as valid.
const MTRR_VAR_VALID: u64 = 0x800;
// Bit 11 of MSR_MTRRdefType: globally enables MTRRs. Same bit position as
// MTRR_VAR_VALID, but it lives in a different MSR.
const MTRR_ENABLE: u64 = 0x800;
// MSR numbers of the first variable-range base/mask pair; each following
// pair is offset by +2 (see append_mtrr_entries).
const MTRR_PHYS_BASE_MSR: u32 = 0x200;
const MTRR_PHYS_MASK_MSR: u32 = 0x201;
// Low byte of MSR_MTRRcap: count of supported variable-range MTRRs.
const VAR_MTRR_NUM_MASK: u64 = 0xFF;
69
// Returns the value of the highest bit in a 64-bit value. Equivalent to
// 1 << HighBitSet(x)
//
// `data` must be non-zero: a zero input has no highest set bit and the
// expression below would underflow. Callers only pass a non-zero remaining
// length, so this is enforced with a debug assertion rather than a Result.
fn get_power_of_two(data: u64) -> u64 {
    debug_assert!(data != 0, "get_power_of_two requires a non-zero input");
    1 << (63 - data.leading_zeros())
}
75
// Returns the max length which suitable for mtrr setting based on the
// specified (base, len): the largest power of two that does not exceed `len`
// and that also evenly divides `base`.
fn get_max_len(base: u64, len: u64) -> u64 {
    // Highest power of two <= len (same computation as get_power_of_two).
    let mut candidate = 1u64 << (63 - len.leading_zeros());

    // Halve until the base address is aligned to the candidate length.
    while base % candidate != 0 {
        candidate >>= 1;
    }

    candidate
}
87
88 // For the specified (Base, Len), returns (base, len) pair which could be
89 // set into mtrr register. mtrr requires: the base-address alignment value can't be
90 // less than its length
get_mtrr_pairs(base: u64, len: u64) -> Vec<(u64, u64)>91 fn get_mtrr_pairs(base: u64, len: u64) -> Vec<(u64, u64)> {
92 let mut vecs = Vec::new();
93
94 let mut remains = len;
95 let mut new = base;
96 while remains != 0 {
97 let max = get_max_len(new, remains);
98 vecs.push((new, max));
99 remains -= max;
100 new += max;
101 }
102
103 vecs
104 }
105
append_mtrr_entries(vpu: &dyn VcpuX86_64, pci_start: u64, entries: &mut Vec<Register>)106 fn append_mtrr_entries(vpu: &dyn VcpuX86_64, pci_start: u64, entries: &mut Vec<Register>) {
107 // Get VAR MTRR num from MSR_MTRRcap
108 let mut msrs = vec![Register {
109 id: crate::msr_index::MSR_MTRRcap,
110 ..Default::default()
111 }];
112 if vpu.get_msrs(&mut msrs).is_err() {
113 warn!("get msrs fail, guest with pass through device may be very slow");
114 return;
115 }
116 let var_num = msrs[0].value & VAR_MTRR_NUM_MASK;
117
118 // Set pci_start .. 4G as UC
119 // all others are set to default WB
120 let pci_len = (1 << 32) - pci_start;
121 let vecs = get_mtrr_pairs(pci_start, pci_len);
122 if vecs.len() as u64 > var_num {
123 warn!(
124 "mtrr fail for pci mmio, please check pci_start addr,
125 guest with pass through device may be very slow"
126 );
127 return;
128 }
129
130 let phys_mask: u64 = (1 << crate::cpuid::phy_max_address_bits()) - 1;
131 for (idx, (base, len)) in vecs.iter().enumerate() {
132 let reg_idx = idx as u32 * 2;
133 entries.push(Register {
134 id: MTRR_PHYS_BASE_MSR + reg_idx,
135 value: base | MTRR_MEMTYPE_UC as u64,
136 });
137 let mask: u64 = len.wrapping_neg() & phys_mask | MTRR_VAR_VALID;
138 entries.push(Register {
139 id: MTRR_PHYS_MASK_MSR + reg_idx,
140 value: mask,
141 });
142 }
143 // Disable fixed MTRRs and enable variable MTRRs, set default type as WB
144 entries.push(Register {
145 id: crate::msr_index::MSR_MTRRdefType,
146 value: MTRR_ENABLE | MTRR_MEMTYPE_WB as u64,
147 });
148 }
149
create_msr_entries(vcpu: &dyn VcpuX86_64, pci_start: u64) -> Vec<Register>150 fn create_msr_entries(vcpu: &dyn VcpuX86_64, pci_start: u64) -> Vec<Register> {
151 let mut entries = vec![
152 Register {
153 id: crate::msr_index::MSR_IA32_SYSENTER_CS,
154 value: 0x0,
155 },
156 Register {
157 id: crate::msr_index::MSR_IA32_SYSENTER_ESP,
158 value: 0x0,
159 },
160 Register {
161 id: crate::msr_index::MSR_IA32_SYSENTER_EIP,
162 value: 0x0,
163 },
164 // x86_64 specific msrs, we only run on x86_64 not x86
165 Register {
166 id: crate::msr_index::MSR_STAR,
167 value: 0x0,
168 },
169 Register {
170 id: crate::msr_index::MSR_CSTAR,
171 value: 0x0,
172 },
173 Register {
174 id: crate::msr_index::MSR_KERNEL_GS_BASE,
175 value: 0x0,
176 },
177 Register {
178 id: crate::msr_index::MSR_SYSCALL_MASK,
179 value: 0x0,
180 },
181 Register {
182 id: crate::msr_index::MSR_LSTAR,
183 value: 0x0,
184 },
185 // end of x86_64 specific code
186 Register {
187 id: crate::msr_index::MSR_IA32_TSC,
188 value: 0x0,
189 },
190 Register {
191 id: crate::msr_index::MSR_IA32_MISC_ENABLE,
192 value: crate::msr_index::MSR_IA32_MISC_ENABLE_FAST_STRING as u64,
193 },
194 ];
195 append_mtrr_entries(vcpu, pci_start, &mut entries);
196 entries
197 }
198
199 /// Configure Model specific registers for x86
200 ///
201 /// # Arguments
202 ///
203 /// * `vcpu` - Structure for the vcpu that holds the vcpu fd.
setup_msrs(vcpu: &dyn VcpuX86_64, pci_start: u64) -> Result<()>204 pub fn setup_msrs(vcpu: &dyn VcpuX86_64, pci_start: u64) -> Result<()> {
205 let msrs = create_msr_entries(vcpu, pci_start);
206 vcpu.set_msrs(&msrs).map_err(Error::MsrIoctlFailed)
207 }
208
209 /// Configure FPU registers for x86
210 ///
211 /// # Arguments
212 ///
213 /// * `vcpu` - Structure for the vcpu that holds the vcpu fd.
setup_fpu(vcpu: &dyn VcpuX86_64) -> Result<()>214 pub fn setup_fpu(vcpu: &dyn VcpuX86_64) -> Result<()> {
215 let fpu = Fpu {
216 fcw: 0x37f,
217 mxcsr: 0x1f80,
218 ..Default::default()
219 };
220
221 vcpu.set_fpu(&fpu).map_err(Error::FpuIoctlFailed)
222 }
223
224 /// Configure base registers for x86
225 ///
226 /// # Arguments
227 ///
228 /// * `vcpu` - Structure for the vcpu that holds the vcpu fd.
229 /// * `boot_ip` - Starting instruction pointer.
230 /// * `boot_sp` - Starting stack pointer.
231 /// * `boot_si` - Must point to zero page address per Linux ABI.
setup_regs(vcpu: &dyn VcpuX86_64, boot_ip: u64, boot_sp: u64, boot_si: u64) -> Result<()>232 pub fn setup_regs(vcpu: &dyn VcpuX86_64, boot_ip: u64, boot_sp: u64, boot_si: u64) -> Result<()> {
233 let regs = Regs {
234 rflags: 0x0000000000000002u64,
235 rip: boot_ip,
236 rsp: boot_sp,
237 rbp: boot_sp,
238 rsi: boot_si,
239 ..Default::default()
240 };
241
242 vcpu.set_regs(®s).map_err(Error::SettingRegistersIoctl)
243 }
244
// Control-register and EFER bits used to enter 64-bit paged protected mode.
const X86_CR0_PE: u64 = 0x1; // CR0: protection enable.
const X86_CR0_PG: u64 = 0x80000000; // CR0: paging enable.
const X86_CR4_PAE: u64 = 0x20; // CR4: physical address extension.

const EFER_LME: u64 = 0x100; // EFER: long mode enable.
const EFER_LMA: u64 = 0x400; // EFER: long mode active.

// Guest-physical addresses where the boot GDT and IDT are written.
const BOOT_GDT_OFFSET: u64 = 0x500;
const BOOT_IDT_OFFSET: u64 = 0x520;

// Number of entries in the boot GDT (NULL, CODE, DATA, TSS — see
// configure_segments_and_sregs).
const BOOT_GDT_MAX: usize = 4;
256
write_gdt_table(table: &[u64], guest_mem: &GuestMemory) -> Result<()>257 fn write_gdt_table(table: &[u64], guest_mem: &GuestMemory) -> Result<()> {
258 let boot_gdt_addr = GuestAddress(BOOT_GDT_OFFSET);
259 for (index, entry) in table.iter().enumerate() {
260 let addr = guest_mem
261 .checked_offset(boot_gdt_addr, (index * mem::size_of::<u64>()) as u64)
262 .ok_or(Error::WriteGDTFailure)?;
263 guest_mem
264 .write_obj_at_addr(*entry, addr)
265 .map_err(|_| Error::WriteGDTFailure)?;
266 }
267 Ok(())
268 }
269
write_idt_value(val: u64, guest_mem: &GuestMemory) -> Result<()>270 fn write_idt_value(val: u64, guest_mem: &GuestMemory) -> Result<()> {
271 let boot_idt_addr = GuestAddress(BOOT_IDT_OFFSET);
272 guest_mem
273 .write_obj_at_addr(val, boot_idt_addr)
274 .map_err(|_| Error::WriteIDTFailure)
275 }
276
configure_segments_and_sregs(mem: &GuestMemory, sregs: &mut Sregs) -> Result<()>277 fn configure_segments_and_sregs(mem: &GuestMemory, sregs: &mut Sregs) -> Result<()> {
278 let gdt_table: [u64; BOOT_GDT_MAX as usize] = [
279 gdt::gdt_entry(0, 0, 0), // NULL
280 gdt::gdt_entry(0xa09b, 0, 0xfffff), // CODE
281 gdt::gdt_entry(0xc093, 0, 0xfffff), // DATA
282 gdt::gdt_entry(0x808b, 0, 0xfffff), // TSS
283 ];
284
285 let code_seg = gdt::segment_from_gdt(gdt_table[1], 1);
286 let data_seg = gdt::segment_from_gdt(gdt_table[2], 2);
287 let tss_seg = gdt::segment_from_gdt(gdt_table[3], 3);
288
289 // Write segments
290 write_gdt_table(&gdt_table[..], mem)?;
291 sregs.gdt.base = BOOT_GDT_OFFSET as u64;
292 sregs.gdt.limit = mem::size_of_val(&gdt_table) as u16 - 1;
293
294 write_idt_value(0, mem)?;
295 sregs.idt.base = BOOT_IDT_OFFSET as u64;
296 sregs.idt.limit = mem::size_of::<u64>() as u16 - 1;
297
298 sregs.cs = code_seg;
299 sregs.ds = data_seg;
300 sregs.es = data_seg;
301 sregs.fs = data_seg;
302 sregs.gs = data_seg;
303 sregs.ss = data_seg;
304 sregs.tr = tss_seg;
305
306 /* 64-bit protected mode */
307 sregs.cr0 |= X86_CR0_PE;
308 sregs.efer |= EFER_LME;
309
310 Ok(())
311 }
312
setup_page_tables(mem: &GuestMemory, sregs: &mut Sregs) -> Result<()>313 fn setup_page_tables(mem: &GuestMemory, sregs: &mut Sregs) -> Result<()> {
314 // Puts PML4 right after zero page but aligned to 4k.
315 let boot_pml4_addr = GuestAddress(0x9000);
316 let boot_pdpte_addr = GuestAddress(0xa000);
317 let boot_pde_addr = GuestAddress(0xb000);
318
319 // Entry covering VA [0..512GB)
320 mem.write_obj_at_addr(boot_pdpte_addr.offset() as u64 | 0x03, boot_pml4_addr)
321 .map_err(|_| Error::WritePML4Address)?;
322
323 // Entry covering VA [0..1GB)
324 mem.write_obj_at_addr(boot_pde_addr.offset() as u64 | 0x03, boot_pdpte_addr)
325 .map_err(|_| Error::WritePDPTEAddress)?;
326
327 // 512 2MB entries together covering VA [0..1GB). Note we are assuming
328 // CPU supports 2MB pages (/proc/cpuinfo has 'pse'). All modern CPUs do.
329 for i in 0..512 {
330 mem.write_obj_at_addr((i << 21) + 0x83u64, boot_pde_addr.unchecked_add(i * 8))
331 .map_err(|_| Error::WritePDEAddress)?;
332 }
333 sregs.cr3 = boot_pml4_addr.offset() as u64;
334 sregs.cr4 |= X86_CR4_PAE;
335 sregs.cr0 |= X86_CR0_PG;
336 sregs.efer |= EFER_LMA; // Long mode is active. Must be auto-enabled with CR0_PG.
337 Ok(())
338 }
339
340 /// Configures the segment registers and system page tables for a given CPU.
341 ///
342 /// # Arguments
343 ///
344 /// * `mem` - The memory that will be passed to the guest.
345 /// * `vcpu` - The VCPU to configure registers on.
setup_sregs(mem: &GuestMemory, vcpu: &dyn VcpuX86_64) -> Result<()>346 pub fn setup_sregs(mem: &GuestMemory, vcpu: &dyn VcpuX86_64) -> Result<()> {
347 let mut sregs = vcpu.get_sregs().map_err(Error::GetSRegsIoctlFailed)?;
348
349 configure_segments_and_sregs(mem, &mut sregs)?;
350 setup_page_tables(mem, &mut sregs)?; // TODO(dgreid) - Can this be done once per system instead?
351
352 vcpu.set_sregs(&sregs).map_err(Error::SetSRegsIoctlFailed)?;
353
354 Ok(())
355 }
356
#[cfg(test)]
mod tests {
    use super::*;
    use vm_memory::{GuestAddress, GuestMemory};

    // Builds a single 64KiB guest memory region at physical 0 — large enough
    // for the boot GDT/IDT (0x500/0x520) and the page tables at 0x9000+.
    fn create_guest_mem() -> GuestMemory {
        GuestMemory::new(&[(GuestAddress(0), 0x10000)]).unwrap()
    }

    // Reads back one u64 from guest memory at `offset`.
    fn read_u64(gm: &GuestMemory, offset: u64) -> u64 {
        let read_addr = GuestAddress(offset);
        gm.read_obj_from_addr(read_addr).unwrap()
    }

    #[test]
    fn segments_and_sregs() {
        let mut sregs = Default::default();
        let gm = create_guest_mem();
        configure_segments_and_sregs(&gm, &mut sregs).unwrap();

        // The four raw descriptors written at BOOT_GDT_OFFSET. The non-null
        // values are presumably the packed forms of the gdt_entry(0xa09b /
        // 0xc093 / 0x808b, 0, 0xfffff) calls — confirm against gdt::gdt_entry.
        assert_eq!(0x0, read_u64(&gm, BOOT_GDT_OFFSET));
        assert_eq!(0xaf9b000000ffff, read_u64(&gm, BOOT_GDT_OFFSET + 8));
        assert_eq!(0xcf93000000ffff, read_u64(&gm, BOOT_GDT_OFFSET + 16));
        assert_eq!(0x8f8b000000ffff, read_u64(&gm, BOOT_GDT_OFFSET + 24));
        // The IDT is a single zeroed entry.
        assert_eq!(0x0, read_u64(&gm, BOOT_IDT_OFFSET));

        // Spot-check fields unpacked into the segment registers.
        assert_eq!(0, sregs.cs.base);
        assert_eq!(0xfffff, sregs.ds.limit);
        assert_eq!(0x10, sregs.es.selector);
        assert_eq!(1, sregs.fs.present);
        assert_eq!(1, sregs.gs.g);
        assert_eq!(0, sregs.ss.avl);
        assert_eq!(0, sregs.tr.base);
        assert_eq!(0xfffff, sregs.tr.limit);
        assert_eq!(0, sregs.tr.avl);
        // Only the bits configure_segments_and_sregs ORs in should be set
        // (sregs started from Default).
        assert_eq!(X86_CR0_PE, sregs.cr0);
        assert_eq!(EFER_LME, sregs.efer);
    }

    #[test]
    fn page_tables() {
        let mut sregs = Default::default();
        let gm = create_guest_mem();
        setup_page_tables(&gm, &mut sregs).unwrap();

        // PML4[0] points at the PDPTE page (0xa000), PDPTE[0] at the PDE
        // table (0xb000), each tagged with flags 0x03.
        assert_eq!(0xa003, read_u64(&gm, 0x9000));
        assert_eq!(0xb003, read_u64(&gm, 0xa000));
        // 512 2MB mappings, each tagged with flags 0x83.
        for i in 0..512 {
            assert_eq!((i << 21) + 0x83u64, read_u64(&gm, 0xb000 + i * 8));
        }

        // CR3 points at the PML4; PAE and paging bits are set.
        assert_eq!(0x9000, sregs.cr3);
        assert_eq!(X86_CR4_PAE, sregs.cr4);
        assert_eq!(X86_CR0_PG, sregs.cr0);
    }
}
413