/*
 * Kernel-based Virtual Machine driver for Linux
 *
 * This module enables machines with Intel VT-x extensions to run virtual
 * machines without emulation or binary translation.
 *
 * Copyright (C) 2006 Qumranet, Inc.
 * Copyright 2010 Red Hat, Inc. and/or its affiliates.
 *
 * Authors:
 *   Avi Kivity   <avi@qumranet.com>
 *   Yaniv Kamay  <yaniv@qumranet.com>
 *
 * This work is licensed under the terms of the GNU GPL, version 2.  See
 * the COPYING file in the top-level directory.
 *
 */

#include "irq.h"
#include "mmu.h"
#include "cpuid.h"

#include <linux/kvm_host.h>
#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/highmem.h>
#include <linux/sched.h>
#include <linux/moduleparam.h>
#include <linux/mod_devicetable.h>
#include <linux/ftrace_event.h>
#include <linux/slab.h>
#include <linux/tboot.h>
#include <linux/hrtimer.h>
#include "kvm_cache_regs.h"
#include "x86.h"

#include <asm/io.h>
#include <asm/desc.h>
#include <asm/vmx.h>
#include <asm/virtext.h>
#include <asm/mce.h>
#include <asm/i387.h>
#include <asm/xcr.h>
#include <asm/perf_event.h>
#include <asm/debugreg.h>
#include <asm/kexec.h>

#include "trace.h"

#define __ex(x) __kvm_handle_fault_on_reboot(x)
#define __ex_clear(x, reg) \
	____kvm_handle_fault_on_reboot(x, "xor " reg " , " reg)

MODULE_AUTHOR("Qumranet");
MODULE_LICENSE("GPL");

static const struct x86_cpu_id vmx_cpu_id[] = {
	X86_FEATURE_MATCH(X86_FEATURE_VMX),
	{}
};
MODULE_DEVICE_TABLE(x86cpu, vmx_cpu_id);

static bool __read_mostly enable_vpid = 1;
module_param_named(vpid, enable_vpid, bool, 0444);

static bool __read_mostly flexpriority_enabled = 1;
module_param_named(flexpriority, flexpriority_enabled, bool, S_IRUGO);

static bool __read_mostly enable_ept = 1;
module_param_named(ept, enable_ept, bool, S_IRUGO);

static bool __read_mostly enable_unrestricted_guest = 1;
module_param_named(unrestricted_guest,
			enable_unrestricted_guest, bool, S_IRUGO);

static bool __read_mostly enable_ept_ad_bits = 1;
module_param_named(eptad, enable_ept_ad_bits, bool, S_IRUGO);

static bool __read_mostly emulate_invalid_guest_state = true;
module_param(emulate_invalid_guest_state, bool, S_IRUGO);

static bool __read_mostly vmm_exclusive = 1;
module_param(vmm_exclusive, bool, S_IRUGO);

static bool __read_mostly fasteoi = 1;
module_param(fasteoi, bool, S_IRUGO);

static bool __read_mostly enable_apicv = 1;
module_param(enable_apicv, bool, S_IRUGO);

static bool __read_mostly enable_shadow_vmcs = 1;
module_param_named(enable_shadow_vmcs, enable_shadow_vmcs, bool, S_IRUGO);

/*
 * If nested=1, nested virtualization is supported, i.e., guests may use
 * VMX and act as hypervisors for their own guests. If nested=0, guests may
 * not use VMX instructions.
 */
static bool __read_mostly nested = 0;
module_param(nested, bool, S_IRUGO);

#define KVM_GUEST_CR0_MASK (X86_CR0_NW | X86_CR0_CD)
#define KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST (X86_CR0_WP | X86_CR0_NE)
#define KVM_VM_CR0_ALWAYS_ON						\
	(KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST | X86_CR0_PG | X86_CR0_PE)
#define KVM_CR4_GUEST_OWNED_BITS				      \
	(X86_CR4_PVI | X86_CR4_DE | X86_CR4_PCE | X86_CR4_OSFXSR      \
	 | X86_CR4_OSXMMEXCPT)

#define KVM_PMODE_VM_CR4_ALWAYS_ON (X86_CR4_PAE | X86_CR4_VMXE)
#define KVM_RMODE_VM_CR4_ALWAYS_ON (X86_CR4_VME | X86_CR4_PAE | X86_CR4_VMXE)

#define RMODE_GUEST_OWNED_EFLAGS_BITS (~(X86_EFLAGS_IOPL | X86_EFLAGS_VM))

#define VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE 5

/*
 * These 2 parameters are used to config the controls for Pause-Loop Exiting:
 * ple_gap:    upper bound on the amount of time between two successive
 *             executions of PAUSE in a loop. Also indicates whether PLE is
 *             enabled.
 *             According to testing, this time is usually smaller than 128
 *             cycles.
 * ple_window: upper bound on the amount of time a guest is allowed to execute
 *             in a PAUSE loop. Tests indicate that most spinlocks are held for
 *             less than 2^12 cycles.
 * Time is measured based on a counter that runs at the same rate as the TSC,
 * refer to SDM volume 3b section 21.6.13 & 22.1.3.
 */
#define KVM_VMX_DEFAULT_PLE_GAP           128
#define KVM_VMX_DEFAULT_PLE_WINDOW        4096
#define KVM_VMX_DEFAULT_PLE_WINDOW_GROW   2
#define KVM_VMX_DEFAULT_PLE_WINDOW_SHRINK 0
#define KVM_VMX_DEFAULT_PLE_WINDOW_MAX    \
		INT_MAX / KVM_VMX_DEFAULT_PLE_WINDOW_GROW

static int ple_gap = KVM_VMX_DEFAULT_PLE_GAP;
module_param(ple_gap, int, S_IRUGO);

static int ple_window = KVM_VMX_DEFAULT_PLE_WINDOW;
module_param(ple_window, int, S_IRUGO);

/* Default doubles per-vcpu window every exit. */
static int ple_window_grow = KVM_VMX_DEFAULT_PLE_WINDOW_GROW;
module_param(ple_window_grow, int, S_IRUGO);

/* Default resets per-vcpu window every exit to ple_window. */
static int ple_window_shrink = KVM_VMX_DEFAULT_PLE_WINDOW_SHRINK;
module_param(ple_window_shrink, int, S_IRUGO);

/* Default is to compute the maximum so we can never overflow. */
static int ple_window_actual_max = KVM_VMX_DEFAULT_PLE_WINDOW_MAX;
static int ple_window_max        = KVM_VMX_DEFAULT_PLE_WINDOW_MAX;
module_param(ple_window_max, int, S_IRUGO);
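
/*
 * Illustrative example, not code used by the driver: with the defaults
 * above, a vcpu's dynamic PLE window starts at ple_window = 4096.  Per
 * the comments above, each PAUSE-loop exit grows it by a factor of
 * ple_window_grow (2), i.e. 4096 -> 8192 -> 16384 -> ..., clamped at
 * ple_window_actual_max (INT_MAX / 2 by default) so the multiplication
 * can never overflow an int.  With ple_window_shrink = 0 the window is
 * simply reset to ple_window instead of shrinking gradually.
 */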

extern const ulong vmx_return;

#define NR_AUTOLOAD_MSRS 8
#define VMCS02_POOL_SIZE 1

struct vmcs {
	u32 revision_id;
	u32 abort;
	char data[0];
};

/*
 * Track a VMCS that may be loaded on a certain CPU. If it is (cpu!=-1), also
 * remember whether it was VMLAUNCHed, and maintain a linked list of all VMCSs
 * loaded on this CPU (so we can clear them if the CPU goes down).
 */
struct loaded_vmcs {
	struct vmcs *vmcs;
	int cpu;
	int launched;
	struct list_head loaded_vmcss_on_cpu_link;
};

struct shared_msr_entry {
	unsigned index;
	u64 data;
	u64 mask;
};

/*
 * struct vmcs12 describes the state that our guest hypervisor (L1) keeps for a
 * single nested guest (L2), hence the name vmcs12. Any VMX implementation has
 * a VMCS structure, and vmcs12 is our emulated VMX's VMCS. This structure is
 * stored in guest memory specified by VMPTRLD, but is opaque to the guest,
 * which must access it using VMREAD/VMWRITE/VMCLEAR instructions.
 * More than one of these structures may exist, if L1 runs multiple L2 guests.
 * nested_vmx_run() will use the data here to build a vmcs02: a VMCS for the
 * underlying hardware which will be used to run L2.
 * This structure is packed to ensure that its layout is identical across
 * machines (necessary for live migration).
 * If there are changes in this struct, VMCS12_REVISION must be changed.
 */
typedef u64 natural_width;
struct __packed vmcs12 {
	/* According to the Intel spec, a VMCS region must start with the
	 * following two fields. Then follow implementation-specific data.
	 */
	u32 revision_id;
	u32 abort;

	u32 launch_state; /* set to 0 by VMCLEAR, to 1 by VMLAUNCH */
	u32 padding[7]; /* room for future expansion */

	u64 io_bitmap_a;
	u64 io_bitmap_b;
	u64 msr_bitmap;
	u64 vm_exit_msr_store_addr;
	u64 vm_exit_msr_load_addr;
	u64 vm_entry_msr_load_addr;
	u64 tsc_offset;
	u64 virtual_apic_page_addr;
	u64 apic_access_addr;
	u64 ept_pointer;
	u64 guest_physical_address;
	u64 vmcs_link_pointer;
	u64 guest_ia32_debugctl;
	u64 guest_ia32_pat;
	u64 guest_ia32_efer;
	u64 guest_ia32_perf_global_ctrl;
	u64 guest_pdptr0;
	u64 guest_pdptr1;
	u64 guest_pdptr2;
	u64 guest_pdptr3;
	u64 guest_bndcfgs;
	u64 host_ia32_pat;
	u64 host_ia32_efer;
	u64 host_ia32_perf_global_ctrl;
	u64 padding64[8]; /* room for future expansion */
	/*
	 * To allow migration of L1 (complete with its L2 guests) between
	 * machines of different natural widths (32 or 64 bit), we cannot have
	 * unsigned long fields with no explicit size. We use u64 (aliased
	 * natural_width) instead. Luckily, x86 is little-endian.
	 */
	natural_width cr0_guest_host_mask;
	natural_width cr4_guest_host_mask;
	natural_width cr0_read_shadow;
	natural_width cr4_read_shadow;
	natural_width cr3_target_value0;
	natural_width cr3_target_value1;
	natural_width cr3_target_value2;
	natural_width cr3_target_value3;
	natural_width exit_qualification;
	natural_width guest_linear_address;
	natural_width guest_cr0;
	natural_width guest_cr3;
	natural_width guest_cr4;
	natural_width guest_es_base;
	natural_width guest_cs_base;
	natural_width guest_ss_base;
	natural_width guest_ds_base;
	natural_width guest_fs_base;
	natural_width guest_gs_base;
	natural_width guest_ldtr_base;
	natural_width guest_tr_base;
	natural_width guest_gdtr_base;
	natural_width guest_idtr_base;
	natural_width guest_dr7;
	natural_width guest_rsp;
	natural_width guest_rip;
	natural_width guest_rflags;
	natural_width guest_pending_dbg_exceptions;
	natural_width guest_sysenter_esp;
	natural_width guest_sysenter_eip;
	natural_width host_cr0;
	natural_width host_cr3;
	natural_width host_cr4;
	natural_width host_fs_base;
	natural_width host_gs_base;
	natural_width host_tr_base;
	natural_width host_gdtr_base;
	natural_width host_idtr_base;
	natural_width host_ia32_sysenter_esp;
	natural_width host_ia32_sysenter_eip;
	natural_width host_rsp;
	natural_width host_rip;
	natural_width paddingl[8]; /* room for future expansion */
	u32 pin_based_vm_exec_control;
	u32 cpu_based_vm_exec_control;
	u32 exception_bitmap;
	u32 page_fault_error_code_mask;
	u32 page_fault_error_code_match;
	u32 cr3_target_count;
	u32 vm_exit_controls;
	u32 vm_exit_msr_store_count;
	u32 vm_exit_msr_load_count;
	u32 vm_entry_controls;
	u32 vm_entry_msr_load_count;
	u32 vm_entry_intr_info_field;
	u32 vm_entry_exception_error_code;
	u32 vm_entry_instruction_len;
	u32 tpr_threshold;
	u32 secondary_vm_exec_control;
	u32 vm_instruction_error;
	u32 vm_exit_reason;
	u32 vm_exit_intr_info;
	u32 vm_exit_intr_error_code;
	u32 idt_vectoring_info_field;
	u32 idt_vectoring_error_code;
	u32 vm_exit_instruction_len;
	u32 vmx_instruction_info;
	u32 guest_es_limit;
	u32 guest_cs_limit;
	u32 guest_ss_limit;
	u32 guest_ds_limit;
	u32 guest_fs_limit;
	u32 guest_gs_limit;
	u32 guest_ldtr_limit;
	u32 guest_tr_limit;
	u32 guest_gdtr_limit;
	u32 guest_idtr_limit;
	u32 guest_es_ar_bytes;
	u32 guest_cs_ar_bytes;
	u32 guest_ss_ar_bytes;
	u32 guest_ds_ar_bytes;
	u32 guest_fs_ar_bytes;
	u32 guest_gs_ar_bytes;
	u32 guest_ldtr_ar_bytes;
	u32 guest_tr_ar_bytes;
	u32 guest_interruptibility_info;
	u32 guest_activity_state;
	u32 guest_sysenter_cs;
	u32 host_ia32_sysenter_cs;
	u32 vmx_preemption_timer_value;
	u32 padding32[7]; /* room for future expansion */
	u16 virtual_processor_id;
	u16 guest_es_selector;
	u16 guest_cs_selector;
	u16 guest_ss_selector;
	u16 guest_ds_selector;
	u16 guest_fs_selector;
	u16 guest_gs_selector;
	u16 guest_ldtr_selector;
	u16 guest_tr_selector;
	u16 host_es_selector;
	u16 host_cs_selector;
	u16 host_ss_selector;
	u16 host_ds_selector;
	u16 host_fs_selector;
	u16 host_gs_selector;
	u16 host_tr_selector;
};

/*
 * VMCS12_REVISION is an arbitrary id that should be changed if the content or
 * layout of struct vmcs12 is changed. MSR_IA32_VMX_BASIC returns this id, and
 * VMPTRLD verifies that the VMCS region that L1 is loading contains this id.
 */
#define VMCS12_REVISION 0x11e57ed0

/*
 * VMCS12_SIZE is the number of bytes L1 should allocate for the VMXON region
 * and any VMCS region. Although only sizeof(struct vmcs12) are used by the
 * current implementation, 4K are reserved to avoid future complications.
 */
#define VMCS12_SIZE 0x1000
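
/*
 * Illustrative sketch of what L1 is expected to do (not code used here):
 * L1 allocates a VMCS12_SIZE (4K) region, writes VMCS12_REVISION -- which
 * it learned from our emulated MSR_IA32_VMX_BASIC -- into its first 32
 * bits, and hands the region to VMPTRLD.  Our VMPTRLD emulation rejects
 * the region if the revision id does not match, which keeps a changed
 * struct vmcs12 layout from being silently misinterpreted, e.g. after a
 * live migration.
 */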

/* Used to remember the last vmcs02 used for some recently used vmcs12s */
struct vmcs02_list {
	struct list_head list;
	gpa_t vmptr;
	struct loaded_vmcs vmcs02;
};

/*
 * The nested_vmx structure is part of vcpu_vmx, and holds information we need
 * for correct emulation of VMX (i.e., nested VMX) on this vcpu.
 */
struct nested_vmx {
	/* Has the level-1 guest done vmxon? */
	bool vmxon;
	gpa_t vmxon_ptr;

	/* The guest-physical address of the current VMCS L1 keeps for L2 */
	gpa_t current_vmptr;
	/* The host-usable pointer to the above */
	struct page *current_vmcs12_page;
	struct vmcs12 *current_vmcs12;
	struct vmcs *current_shadow_vmcs;
	/*
	 * Indicates if the shadow vmcs must be updated with the
	 * data held by vmcs12
	 */
	bool sync_shadow_vmcs;

	/* vmcs02_list cache of VMCSs recently used to run L2 guests */
	struct list_head vmcs02_pool;
	int vmcs02_num;
	u64 vmcs01_tsc_offset;
	/* L2 must run next, and mustn't decide to exit to L1. */
	bool nested_run_pending;
	/*
	 * Guest pages referred to in vmcs02 with host-physical pointers, so
	 * we must keep them pinned while L2 runs.
	 */
	struct page *apic_access_page;
	struct page *virtual_apic_page;
	u64 msr_ia32_feature_control;

	struct hrtimer preemption_timer;
	bool preemption_timer_expired;

	/* to migrate it to L2 if VM_ENTRY_LOAD_DEBUG_CONTROLS is off */
	u64 vmcs01_debugctl;
};

#define POSTED_INTR_ON  0
/* Posted-Interrupt Descriptor */
struct pi_desc {
	u32 pir[8];     /* Posted interrupt requested */
	u32 control;	/* bit 0 of control is outstanding notification bit */
	u32 rsvd[7];
} __aligned(64);

static bool pi_test_and_set_on(struct pi_desc *pi_desc)
{
	return test_and_set_bit(POSTED_INTR_ON,
			(unsigned long *)&pi_desc->control);
}

static bool pi_test_and_clear_on(struct pi_desc *pi_desc)
{
	return test_and_clear_bit(POSTED_INTR_ON,
			(unsigned long *)&pi_desc->control);
}

static int pi_test_and_set_pir(int vector, struct pi_desc *pi_desc)
{
	return test_and_set_bit(vector, (unsigned long *)pi_desc->pir);
}
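
/*
 * Assumed usage sketch for the helpers above (the real delivery path lives
 * further down, in the APICv/posted-interrupt code): to post vector v to a
 * running vcpu, the sender marks it pending with pi_test_and_set_pir(v, d),
 * then raises the outstanding-notification bit with pi_test_and_set_on(d);
 * only if ON was previously clear does a posted-interrupt notification IPI
 * need to be sent.  On the receiving side, pi_test_and_clear_on() is used
 * before moving the pir bits into the virtual APIC's IRR.
 */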

struct vcpu_vmx {
	struct kvm_vcpu       vcpu;
	unsigned long         host_rsp;
	u8                    fail;
	bool                  nmi_known_unmasked;
	u32                   exit_intr_info;
	u32                   idt_vectoring_info;
	ulong                 rflags;
	struct shared_msr_entry *guest_msrs;
	int                   nmsrs;
	int                   save_nmsrs;
	unsigned long	      host_idt_base;
#ifdef CONFIG_X86_64
	u64 		      msr_host_kernel_gs_base;
	u64 		      msr_guest_kernel_gs_base;
#endif
	u32 vm_entry_controls_shadow;
	u32 vm_exit_controls_shadow;
	/*
	 * loaded_vmcs points to the VMCS currently used in this vcpu. For a
	 * non-nested (L1) guest, it always points to vmcs01. For a nested
	 * guest (L2), it points to a different VMCS.
	 */
	struct loaded_vmcs    vmcs01;
	struct loaded_vmcs   *loaded_vmcs;
	bool                  __launched; /* temporary, used in vmx_vcpu_run */
	struct msr_autoload {
		unsigned nr;
		struct vmx_msr_entry guest[NR_AUTOLOAD_MSRS];
		struct vmx_msr_entry host[NR_AUTOLOAD_MSRS];
	} msr_autoload;
	struct {
		int           loaded;
		u16           fs_sel, gs_sel, ldt_sel;
#ifdef CONFIG_X86_64
		u16           ds_sel, es_sel;
#endif
		int           gs_ldt_reload_needed;
		int           fs_reload_needed;
		u64           msr_host_bndcfgs;
		unsigned long vmcs_host_cr4;    /* May not match real cr4 */
	} host_state;
	struct {
		int vm86_active;
		ulong save_rflags;
		struct kvm_segment segs[8];
	} rmode;
	struct {
		u32 bitmask; /* 4 bits per segment (1 bit per field) */
		struct kvm_save_segment {
			u16 selector;
			unsigned long base;
			u32 limit;
			u32 ar;
		} seg[8];
	} segment_cache;
	int vpid;
	bool emulation_required;

	/* Support for vnmi-less CPUs */
	int soft_vnmi_blocked;
	ktime_t entry_time;
	s64 vnmi_blocked_time;
	u32 exit_reason;

	bool rdtscp_enabled;

	/* Posted interrupt descriptor */
	struct pi_desc pi_desc;

	/* Support for a guest hypervisor (nested VMX) */
	struct nested_vmx nested;

	/* Dynamic PLE window. */
	int ple_window;
	bool ple_window_dirty;
};

enum segment_cache_field {
	SEG_FIELD_SEL = 0,
	SEG_FIELD_BASE = 1,
	SEG_FIELD_LIMIT = 2,
	SEG_FIELD_AR = 3,

	SEG_FIELD_NR = 4
};

static inline struct vcpu_vmx *to_vmx(struct kvm_vcpu *vcpu)
{
	return container_of(vcpu, struct vcpu_vmx, vcpu);
}

#define VMCS12_OFFSET(x) offsetof(struct vmcs12, x)
#define FIELD(number, name)	[number] = VMCS12_OFFSET(name)
#define FIELD64(number, name)	[number] = VMCS12_OFFSET(name), \
				[number##_HIGH] = VMCS12_OFFSET(name)+4


static unsigned long shadow_read_only_fields[] = {
	/*
	 * We do NOT shadow fields that are modified when L0
	 * traps and emulates any vmx instruction (e.g. VMPTRLD,
	 * VMXON...) executed by L1.
	 * For example, VM_INSTRUCTION_ERROR is read
	 * by L1 if a vmx instruction fails (part of the error path).
	 * Note the code assumes this logic. If for some reason
	 * we start shadowing these fields then we need to
	 * force a shadow sync when L0 emulates vmx instructions
	 * (e.g. force a sync if VM_INSTRUCTION_ERROR is modified
	 * by nested_vmx_failValid)
	 */
	VM_EXIT_REASON,
	VM_EXIT_INTR_INFO,
	VM_EXIT_INSTRUCTION_LEN,
	IDT_VECTORING_INFO_FIELD,
	IDT_VECTORING_ERROR_CODE,
	VM_EXIT_INTR_ERROR_CODE,
	EXIT_QUALIFICATION,
	GUEST_LINEAR_ADDRESS,
	GUEST_PHYSICAL_ADDRESS
};
static int max_shadow_read_only_fields =
	ARRAY_SIZE(shadow_read_only_fields);

static unsigned long shadow_read_write_fields[] = {
	TPR_THRESHOLD,
	GUEST_RIP,
	GUEST_RSP,
	GUEST_CR0,
	GUEST_CR3,
	GUEST_CR4,
	GUEST_INTERRUPTIBILITY_INFO,
	GUEST_RFLAGS,
	GUEST_CS_SELECTOR,
	GUEST_CS_AR_BYTES,
	GUEST_CS_LIMIT,
	GUEST_CS_BASE,
	GUEST_ES_BASE,
	GUEST_BNDCFGS,
	CR0_GUEST_HOST_MASK,
	CR0_READ_SHADOW,
	CR4_READ_SHADOW,
	TSC_OFFSET,
	EXCEPTION_BITMAP,
	CPU_BASED_VM_EXEC_CONTROL,
	VM_ENTRY_EXCEPTION_ERROR_CODE,
	VM_ENTRY_INTR_INFO_FIELD,
	VM_ENTRY_INSTRUCTION_LEN,
	VM_ENTRY_EXCEPTION_ERROR_CODE,
	HOST_FS_BASE,
	HOST_GS_BASE,
	HOST_FS_SELECTOR,
	HOST_GS_SELECTOR
};
static int max_shadow_read_write_fields =
	ARRAY_SIZE(shadow_read_write_fields);

static const unsigned short vmcs_field_to_offset_table[] = {
	FIELD(VIRTUAL_PROCESSOR_ID, virtual_processor_id),
	FIELD(GUEST_ES_SELECTOR, guest_es_selector),
	FIELD(GUEST_CS_SELECTOR, guest_cs_selector),
	FIELD(GUEST_SS_SELECTOR, guest_ss_selector),
	FIELD(GUEST_DS_SELECTOR, guest_ds_selector),
	FIELD(GUEST_FS_SELECTOR, guest_fs_selector),
	FIELD(GUEST_GS_SELECTOR, guest_gs_selector),
	FIELD(GUEST_LDTR_SELECTOR, guest_ldtr_selector),
	FIELD(GUEST_TR_SELECTOR, guest_tr_selector),
	FIELD(HOST_ES_SELECTOR, host_es_selector),
	FIELD(HOST_CS_SELECTOR, host_cs_selector),
	FIELD(HOST_SS_SELECTOR, host_ss_selector),
	FIELD(HOST_DS_SELECTOR, host_ds_selector),
	FIELD(HOST_FS_SELECTOR, host_fs_selector),
	FIELD(HOST_GS_SELECTOR, host_gs_selector),
	FIELD(HOST_TR_SELECTOR, host_tr_selector),
	FIELD64(IO_BITMAP_A, io_bitmap_a),
	FIELD64(IO_BITMAP_B, io_bitmap_b),
	FIELD64(MSR_BITMAP, msr_bitmap),
	FIELD64(VM_EXIT_MSR_STORE_ADDR, vm_exit_msr_store_addr),
	FIELD64(VM_EXIT_MSR_LOAD_ADDR, vm_exit_msr_load_addr),
	FIELD64(VM_ENTRY_MSR_LOAD_ADDR, vm_entry_msr_load_addr),
	FIELD64(TSC_OFFSET, tsc_offset),
	FIELD64(VIRTUAL_APIC_PAGE_ADDR, virtual_apic_page_addr),
	FIELD64(APIC_ACCESS_ADDR, apic_access_addr),
	FIELD64(EPT_POINTER, ept_pointer),
	FIELD64(GUEST_PHYSICAL_ADDRESS, guest_physical_address),
	FIELD64(VMCS_LINK_POINTER, vmcs_link_pointer),
	FIELD64(GUEST_IA32_DEBUGCTL, guest_ia32_debugctl),
	FIELD64(GUEST_IA32_PAT, guest_ia32_pat),
	FIELD64(GUEST_IA32_EFER, guest_ia32_efer),
	FIELD64(GUEST_IA32_PERF_GLOBAL_CTRL, guest_ia32_perf_global_ctrl),
	FIELD64(GUEST_PDPTR0, guest_pdptr0),
	FIELD64(GUEST_PDPTR1, guest_pdptr1),
	FIELD64(GUEST_PDPTR2, guest_pdptr2),
	FIELD64(GUEST_PDPTR3, guest_pdptr3),
	FIELD64(GUEST_BNDCFGS, guest_bndcfgs),
	FIELD64(HOST_IA32_PAT, host_ia32_pat),
	FIELD64(HOST_IA32_EFER, host_ia32_efer),
	FIELD64(HOST_IA32_PERF_GLOBAL_CTRL, host_ia32_perf_global_ctrl),
	FIELD(PIN_BASED_VM_EXEC_CONTROL, pin_based_vm_exec_control),
	FIELD(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control),
	FIELD(EXCEPTION_BITMAP, exception_bitmap),
	FIELD(PAGE_FAULT_ERROR_CODE_MASK, page_fault_error_code_mask),
	FIELD(PAGE_FAULT_ERROR_CODE_MATCH, page_fault_error_code_match),
	FIELD(CR3_TARGET_COUNT, cr3_target_count),
	FIELD(VM_EXIT_CONTROLS, vm_exit_controls),
	FIELD(VM_EXIT_MSR_STORE_COUNT, vm_exit_msr_store_count),
	FIELD(VM_EXIT_MSR_LOAD_COUNT, vm_exit_msr_load_count),
	FIELD(VM_ENTRY_CONTROLS, vm_entry_controls),
	FIELD(VM_ENTRY_MSR_LOAD_COUNT, vm_entry_msr_load_count),
	FIELD(VM_ENTRY_INTR_INFO_FIELD, vm_entry_intr_info_field),
	FIELD(VM_ENTRY_EXCEPTION_ERROR_CODE, vm_entry_exception_error_code),
	FIELD(VM_ENTRY_INSTRUCTION_LEN, vm_entry_instruction_len),
	FIELD(TPR_THRESHOLD, tpr_threshold),
	FIELD(SECONDARY_VM_EXEC_CONTROL, secondary_vm_exec_control),
	FIELD(VM_INSTRUCTION_ERROR, vm_instruction_error),
	FIELD(VM_EXIT_REASON, vm_exit_reason),
	FIELD(VM_EXIT_INTR_INFO, vm_exit_intr_info),
	FIELD(VM_EXIT_INTR_ERROR_CODE, vm_exit_intr_error_code),
	FIELD(IDT_VECTORING_INFO_FIELD, idt_vectoring_info_field),
	FIELD(IDT_VECTORING_ERROR_CODE, idt_vectoring_error_code),
	FIELD(VM_EXIT_INSTRUCTION_LEN, vm_exit_instruction_len),
	FIELD(VMX_INSTRUCTION_INFO, vmx_instruction_info),
	FIELD(GUEST_ES_LIMIT, guest_es_limit),
	FIELD(GUEST_CS_LIMIT, guest_cs_limit),
	FIELD(GUEST_SS_LIMIT, guest_ss_limit),
	FIELD(GUEST_DS_LIMIT, guest_ds_limit),
	FIELD(GUEST_FS_LIMIT, guest_fs_limit),
	FIELD(GUEST_GS_LIMIT, guest_gs_limit),
	FIELD(GUEST_LDTR_LIMIT, guest_ldtr_limit),
	FIELD(GUEST_TR_LIMIT, guest_tr_limit),
	FIELD(GUEST_GDTR_LIMIT, guest_gdtr_limit),
	FIELD(GUEST_IDTR_LIMIT, guest_idtr_limit),
	FIELD(GUEST_ES_AR_BYTES, guest_es_ar_bytes),
	FIELD(GUEST_CS_AR_BYTES, guest_cs_ar_bytes),
	FIELD(GUEST_SS_AR_BYTES, guest_ss_ar_bytes),
	FIELD(GUEST_DS_AR_BYTES, guest_ds_ar_bytes),
	FIELD(GUEST_FS_AR_BYTES, guest_fs_ar_bytes),
	FIELD(GUEST_GS_AR_BYTES, guest_gs_ar_bytes),
	FIELD(GUEST_LDTR_AR_BYTES, guest_ldtr_ar_bytes),
	FIELD(GUEST_TR_AR_BYTES, guest_tr_ar_bytes),
	FIELD(GUEST_INTERRUPTIBILITY_INFO, guest_interruptibility_info),
	FIELD(GUEST_ACTIVITY_STATE, guest_activity_state),
	FIELD(GUEST_SYSENTER_CS, guest_sysenter_cs),
	FIELD(HOST_IA32_SYSENTER_CS, host_ia32_sysenter_cs),
	FIELD(VMX_PREEMPTION_TIMER_VALUE, vmx_preemption_timer_value),
	FIELD(CR0_GUEST_HOST_MASK, cr0_guest_host_mask),
	FIELD(CR4_GUEST_HOST_MASK, cr4_guest_host_mask),
	FIELD(CR0_READ_SHADOW, cr0_read_shadow),
	FIELD(CR4_READ_SHADOW, cr4_read_shadow),
	FIELD(CR3_TARGET_VALUE0, cr3_target_value0),
	FIELD(CR3_TARGET_VALUE1, cr3_target_value1),
	FIELD(CR3_TARGET_VALUE2, cr3_target_value2),
	FIELD(CR3_TARGET_VALUE3, cr3_target_value3),
	FIELD(EXIT_QUALIFICATION, exit_qualification),
	FIELD(GUEST_LINEAR_ADDRESS, guest_linear_address),
	FIELD(GUEST_CR0, guest_cr0),
	FIELD(GUEST_CR3, guest_cr3),
	FIELD(GUEST_CR4, guest_cr4),
	FIELD(GUEST_ES_BASE, guest_es_base),
	FIELD(GUEST_CS_BASE, guest_cs_base),
	FIELD(GUEST_SS_BASE, guest_ss_base),
	FIELD(GUEST_DS_BASE, guest_ds_base),
	FIELD(GUEST_FS_BASE, guest_fs_base),
	FIELD(GUEST_GS_BASE, guest_gs_base),
	FIELD(GUEST_LDTR_BASE, guest_ldtr_base),
	FIELD(GUEST_TR_BASE, guest_tr_base),
	FIELD(GUEST_GDTR_BASE, guest_gdtr_base),
	FIELD(GUEST_IDTR_BASE, guest_idtr_base),
	FIELD(GUEST_DR7, guest_dr7),
	FIELD(GUEST_RSP, guest_rsp),
	FIELD(GUEST_RIP, guest_rip),
	FIELD(GUEST_RFLAGS, guest_rflags),
	FIELD(GUEST_PENDING_DBG_EXCEPTIONS, guest_pending_dbg_exceptions),
	FIELD(GUEST_SYSENTER_ESP, guest_sysenter_esp),
	FIELD(GUEST_SYSENTER_EIP, guest_sysenter_eip),
	FIELD(HOST_CR0, host_cr0),
	FIELD(HOST_CR3, host_cr3),
	FIELD(HOST_CR4, host_cr4),
	FIELD(HOST_FS_BASE, host_fs_base),
	FIELD(HOST_GS_BASE, host_gs_base),
	FIELD(HOST_TR_BASE, host_tr_base),
	FIELD(HOST_GDTR_BASE, host_gdtr_base),
	FIELD(HOST_IDTR_BASE, host_idtr_base),
	FIELD(HOST_IA32_SYSENTER_ESP, host_ia32_sysenter_esp),
	FIELD(HOST_IA32_SYSENTER_EIP, host_ia32_sysenter_eip),
	FIELD(HOST_RSP, host_rsp),
	FIELD(HOST_RIP, host_rip),
};
static const int max_vmcs_field = ARRAY_SIZE(vmcs_field_to_offset_table);

static inline short vmcs_field_to_offset(unsigned long field)
{
	if (field >= max_vmcs_field || vmcs_field_to_offset_table[field] == 0)
		return -1;
	return vmcs_field_to_offset_table[field];
}
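
/*
 * Illustrative note on the table above (not code): a 64-bit field such as
 * TSC_OFFSET gets two entries via FIELD64 -- one for the full field and
 * one for the corresponding "_HIGH" encoding at offset +4 -- so accesses
 * to the high half still land on the right bytes of struct vmcs12.  An
 * encoding that is not in the table maps to offset 0, which makes
 * vmcs_field_to_offset() return -1 so the caller can fail the emulated
 * VMREAD/VMWRITE.
 */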

static inline struct vmcs12 *get_vmcs12(struct kvm_vcpu *vcpu)
{
	return to_vmx(vcpu)->nested.current_vmcs12;
}

static struct page *nested_get_page(struct kvm_vcpu *vcpu, gpa_t addr)
{
	struct page *page = gfn_to_page(vcpu->kvm, addr >> PAGE_SHIFT);
	if (is_error_page(page))
		return NULL;

	return page;
}

static void nested_release_page(struct page *page)
{
	kvm_release_page_dirty(page);
}

static void nested_release_page_clean(struct page *page)
{
	kvm_release_page_clean(page);
}

static unsigned long nested_ept_get_cr3(struct kvm_vcpu *vcpu);
static u64 construct_eptp(unsigned long root_hpa);
static void kvm_cpu_vmxon(u64 addr);
static void kvm_cpu_vmxoff(void);
static bool vmx_mpx_supported(void);
static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr);
static void vmx_set_segment(struct kvm_vcpu *vcpu,
			    struct kvm_segment *var, int seg);
static void vmx_get_segment(struct kvm_vcpu *vcpu,
			    struct kvm_segment *var, int seg);
static bool guest_state_valid(struct kvm_vcpu *vcpu);
static u32 vmx_segment_access_rights(struct kvm_segment *var);
static void vmx_sync_pir_to_irr_dummy(struct kvm_vcpu *vcpu);
static void copy_vmcs12_to_shadow(struct vcpu_vmx *vmx);
static void copy_shadow_to_vmcs12(struct vcpu_vmx *vmx);
static int alloc_identity_pagetable(struct kvm *kvm);

static DEFINE_PER_CPU(struct vmcs *, vmxarea);
static DEFINE_PER_CPU(struct vmcs *, current_vmcs);
/*
 * We maintain a per-CPU linked list of VMCSs loaded on that CPU. This is
 * needed when a CPU is brought down, and we need to VMCLEAR all VMCSs
 * loaded on it.
 */
static DEFINE_PER_CPU(struct list_head, loaded_vmcss_on_cpu);
static DEFINE_PER_CPU(struct desc_ptr, host_gdt);

static unsigned long *vmx_io_bitmap_a;
static unsigned long *vmx_io_bitmap_b;
static unsigned long *vmx_msr_bitmap_legacy;
static unsigned long *vmx_msr_bitmap_longmode;
static unsigned long *vmx_msr_bitmap_legacy_x2apic;
static unsigned long *vmx_msr_bitmap_longmode_x2apic;
static unsigned long *vmx_vmread_bitmap;
static unsigned long *vmx_vmwrite_bitmap;

static bool cpu_has_load_ia32_efer;
static bool cpu_has_load_perf_global_ctrl;

static DECLARE_BITMAP(vmx_vpid_bitmap, VMX_NR_VPIDS);
static DEFINE_SPINLOCK(vmx_vpid_lock);

static struct vmcs_config {
	int size;
	int order;
	u32 revision_id;
	u32 pin_based_exec_ctrl;
	u32 cpu_based_exec_ctrl;
	u32 cpu_based_2nd_exec_ctrl;
	u32 vmexit_ctrl;
	u32 vmentry_ctrl;
} vmcs_config;

static struct vmx_capability {
	u32 ept;
	u32 vpid;
} vmx_capability;

#define VMX_SEGMENT_FIELD(seg)					\
	[VCPU_SREG_##seg] = {                                   \
		.selector = GUEST_##seg##_SELECTOR,		\
		.base = GUEST_##seg##_BASE,		   	\
		.limit = GUEST_##seg##_LIMIT,		   	\
		.ar_bytes = GUEST_##seg##_AR_BYTES,	   	\
	}

static const struct kvm_vmx_segment_field {
	unsigned selector;
	unsigned base;
	unsigned limit;
	unsigned ar_bytes;
} kvm_vmx_segment_fields[] = {
	VMX_SEGMENT_FIELD(CS),
	VMX_SEGMENT_FIELD(DS),
	VMX_SEGMENT_FIELD(ES),
	VMX_SEGMENT_FIELD(FS),
	VMX_SEGMENT_FIELD(GS),
	VMX_SEGMENT_FIELD(SS),
	VMX_SEGMENT_FIELD(TR),
	VMX_SEGMENT_FIELD(LDTR),
};

static u64 host_efer;

static void ept_save_pdptrs(struct kvm_vcpu *vcpu);

/*
 * Keep MSR_STAR at the end, as setup_msrs() will try to optimize it
 * away by decrementing the array size.
 */
static const u32 vmx_msr_index[] = {
#ifdef CONFIG_X86_64
	MSR_SYSCALL_MASK, MSR_LSTAR, MSR_CSTAR,
#endif
	MSR_EFER, MSR_TSC_AUX, MSR_STAR,
};

static inline bool is_page_fault(u32 intr_info)
{
	return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK |
			     INTR_INFO_VALID_MASK)) ==
		(INTR_TYPE_HARD_EXCEPTION | PF_VECTOR | INTR_INFO_VALID_MASK);
}

static inline bool is_no_device(u32 intr_info)
{
	return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK |
			     INTR_INFO_VALID_MASK)) ==
		(INTR_TYPE_HARD_EXCEPTION | NM_VECTOR | INTR_INFO_VALID_MASK);
}

static inline bool is_invalid_opcode(u32 intr_info)
{
	return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK |
			     INTR_INFO_VALID_MASK)) ==
		(INTR_TYPE_HARD_EXCEPTION | UD_VECTOR | INTR_INFO_VALID_MASK);
}

static inline bool is_external_interrupt(u32 intr_info)
{
	return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VALID_MASK))
		== (INTR_TYPE_EXT_INTR | INTR_INFO_VALID_MASK);
}

static inline bool is_machine_check(u32 intr_info)
{
	return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK |
			     INTR_INFO_VALID_MASK)) ==
		(INTR_TYPE_HARD_EXCEPTION | MC_VECTOR | INTR_INFO_VALID_MASK);
}

static inline bool cpu_has_vmx_msr_bitmap(void)
{
	return vmcs_config.cpu_based_exec_ctrl & CPU_BASED_USE_MSR_BITMAPS;
}

static inline bool cpu_has_vmx_tpr_shadow(void)
{
	return vmcs_config.cpu_based_exec_ctrl & CPU_BASED_TPR_SHADOW;
}

static inline bool vm_need_tpr_shadow(struct kvm *kvm)
{
	return (cpu_has_vmx_tpr_shadow()) && (irqchip_in_kernel(kvm));
}

static inline bool cpu_has_secondary_exec_ctrls(void)
{
	return vmcs_config.cpu_based_exec_ctrl &
		CPU_BASED_ACTIVATE_SECONDARY_CONTROLS;
}

static inline bool cpu_has_vmx_virtualize_apic_accesses(void)
{
	return vmcs_config.cpu_based_2nd_exec_ctrl &
		SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
}

static inline bool cpu_has_vmx_virtualize_x2apic_mode(void)
{
	return vmcs_config.cpu_based_2nd_exec_ctrl &
		SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE;
}

static inline bool cpu_has_vmx_apic_register_virt(void)
{
	return vmcs_config.cpu_based_2nd_exec_ctrl &
		SECONDARY_EXEC_APIC_REGISTER_VIRT;
}

static inline bool cpu_has_vmx_virtual_intr_delivery(void)
{
	return vmcs_config.cpu_based_2nd_exec_ctrl &
		SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY;
}

static inline bool cpu_has_vmx_posted_intr(void)
{
	return vmcs_config.pin_based_exec_ctrl & PIN_BASED_POSTED_INTR;
}

static inline bool cpu_has_vmx_apicv(void)
{
	return cpu_has_vmx_apic_register_virt() &&
		cpu_has_vmx_virtual_intr_delivery() &&
		cpu_has_vmx_posted_intr();
}

static inline bool cpu_has_vmx_flexpriority(void)
{
	return cpu_has_vmx_tpr_shadow() &&
		cpu_has_vmx_virtualize_apic_accesses();
}

static inline bool cpu_has_vmx_ept_execute_only(void)
{
	return vmx_capability.ept & VMX_EPT_EXECUTE_ONLY_BIT;
}

static inline bool cpu_has_vmx_eptp_uncacheable(void)
{
	return vmx_capability.ept & VMX_EPTP_UC_BIT;
}

static inline bool cpu_has_vmx_eptp_writeback(void)
{
	return vmx_capability.ept & VMX_EPTP_WB_BIT;
}

static inline bool cpu_has_vmx_ept_2m_page(void)
{
	return vmx_capability.ept & VMX_EPT_2MB_PAGE_BIT;
}

static inline bool cpu_has_vmx_ept_1g_page(void)
{
	return vmx_capability.ept & VMX_EPT_1GB_PAGE_BIT;
}

static inline bool cpu_has_vmx_ept_4levels(void)
{
	return vmx_capability.ept & VMX_EPT_PAGE_WALK_4_BIT;
}

static inline bool cpu_has_vmx_ept_ad_bits(void)
{
	return vmx_capability.ept & VMX_EPT_AD_BIT;
}

static inline bool cpu_has_vmx_invept_context(void)
{
	return vmx_capability.ept & VMX_EPT_EXTENT_CONTEXT_BIT;
}

static inline bool cpu_has_vmx_invept_global(void)
{
	return vmx_capability.ept & VMX_EPT_EXTENT_GLOBAL_BIT;
}

static inline bool cpu_has_vmx_invvpid_single(void)
{
	return vmx_capability.vpid & VMX_VPID_EXTENT_SINGLE_CONTEXT_BIT;
}

static inline bool cpu_has_vmx_invvpid_global(void)
{
	return vmx_capability.vpid & VMX_VPID_EXTENT_GLOBAL_CONTEXT_BIT;
}

static inline bool cpu_has_vmx_ept(void)
{
	return vmcs_config.cpu_based_2nd_exec_ctrl &
		SECONDARY_EXEC_ENABLE_EPT;
}

static inline bool cpu_has_vmx_unrestricted_guest(void)
{
	return vmcs_config.cpu_based_2nd_exec_ctrl &
		SECONDARY_EXEC_UNRESTRICTED_GUEST;
}

static inline bool cpu_has_vmx_ple(void)
{
	return vmcs_config.cpu_based_2nd_exec_ctrl &
		SECONDARY_EXEC_PAUSE_LOOP_EXITING;
}

static inline bool vm_need_virtualize_apic_accesses(struct kvm *kvm)
{
	return flexpriority_enabled && irqchip_in_kernel(kvm);
}

static inline bool cpu_has_vmx_vpid(void)
{
	return vmcs_config.cpu_based_2nd_exec_ctrl &
		SECONDARY_EXEC_ENABLE_VPID;
}

static inline bool cpu_has_vmx_rdtscp(void)
{
	return vmcs_config.cpu_based_2nd_exec_ctrl &
		SECONDARY_EXEC_RDTSCP;
}

static inline bool cpu_has_vmx_invpcid(void)
{
	return vmcs_config.cpu_based_2nd_exec_ctrl &
		SECONDARY_EXEC_ENABLE_INVPCID;
}

static inline bool cpu_has_virtual_nmis(void)
{
	return vmcs_config.pin_based_exec_ctrl & PIN_BASED_VIRTUAL_NMIS;
}

static inline bool cpu_has_vmx_wbinvd_exit(void)
{
	return vmcs_config.cpu_based_2nd_exec_ctrl &
		SECONDARY_EXEC_WBINVD_EXITING;
}

static inline bool cpu_has_vmx_shadow_vmcs(void)
{
	u64 vmx_msr;
	rdmsrl(MSR_IA32_VMX_MISC, vmx_msr);
	/* check if the cpu supports writing r/o exit information fields */
	if (!(vmx_msr & MSR_IA32_VMX_MISC_VMWRITE_SHADOW_RO_FIELDS))
		return false;

	return vmcs_config.cpu_based_2nd_exec_ctrl &
		SECONDARY_EXEC_SHADOW_VMCS;
}

static inline bool report_flexpriority(void)
{
	return flexpriority_enabled;
}

static inline bool nested_cpu_has(struct vmcs12 *vmcs12, u32 bit)
{
	return vmcs12->cpu_based_vm_exec_control & bit;
}

static inline bool nested_cpu_has2(struct vmcs12 *vmcs12, u32 bit)
{
	return (vmcs12->cpu_based_vm_exec_control &
			CPU_BASED_ACTIVATE_SECONDARY_CONTROLS) &&
		(vmcs12->secondary_vm_exec_control & bit);
}

static inline bool nested_cpu_has_virtual_nmis(struct vmcs12 *vmcs12)
{
	return vmcs12->pin_based_vm_exec_control & PIN_BASED_VIRTUAL_NMIS;
}

static inline bool nested_cpu_has_preemption_timer(struct vmcs12 *vmcs12)
{
	return vmcs12->pin_based_vm_exec_control &
		PIN_BASED_VMX_PREEMPTION_TIMER;
}

static inline int nested_cpu_has_ept(struct vmcs12 *vmcs12)
{
	return nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_EPT);
}

static inline bool is_exception(u32 intr_info)
{
	return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VALID_MASK))
		== (INTR_TYPE_HARD_EXCEPTION | INTR_INFO_VALID_MASK);
}

static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason,
			      u32 exit_intr_info,
			      unsigned long exit_qualification);
static void nested_vmx_entry_failure(struct kvm_vcpu *vcpu,
			struct vmcs12 *vmcs12,
			u32 reason, unsigned long qualification);

static int __find_msr_index(struct vcpu_vmx *vmx, u32 msr)
{
	int i;

	for (i = 0; i < vmx->nmsrs; ++i)
		if (vmx_msr_index[vmx->guest_msrs[i].index] == msr)
			return i;
	return -1;
}

static inline void __invvpid(int ext, u16 vpid, gva_t gva)
{
    struct {
	u64 vpid : 16;
	u64 rsvd : 48;
	u64 gva;
    } operand = { vpid, 0, gva };

    asm volatile (__ex(ASM_VMX_INVVPID)
		  /* CF==1 or ZF==1 --> rc = -1 */
		  "; ja 1f ; ud2 ; 1:"
		  : : "a"(&operand), "c"(ext) : "cc", "memory");
}

static inline void __invept(int ext, u64 eptp, gpa_t gpa)
{
	struct {
		u64 eptp, gpa;
	} operand = {eptp, gpa};

	asm volatile (__ex(ASM_VMX_INVEPT)
			/* CF==1 or ZF==1 --> rc = -1 */
			"; ja 1f ; ud2 ; 1:\n"
			: : "a" (&operand), "c" (ext) : "cc", "memory");
}

static struct shared_msr_entry *find_msr_entry(struct vcpu_vmx *vmx, u32 msr)
{
	int i;

	i = __find_msr_index(vmx, msr);
	if (i >= 0)
		return &vmx->guest_msrs[i];
	return NULL;
}

static void vmcs_clear(struct vmcs *vmcs)
{
	u64 phys_addr = __pa(vmcs);
	u8 error;

	asm volatile (__ex(ASM_VMX_VMCLEAR_RAX) "; setna %0"
		      : "=qm"(error) : "a"(&phys_addr), "m"(phys_addr)
		      : "cc", "memory");
	if (error)
		printk(KERN_ERR "kvm: vmclear fail: %p/%llx\n",
		       vmcs, phys_addr);
}

static inline void loaded_vmcs_init(struct loaded_vmcs *loaded_vmcs)
{
	vmcs_clear(loaded_vmcs->vmcs);
	loaded_vmcs->cpu = -1;
	loaded_vmcs->launched = 0;
}

static void vmcs_load(struct vmcs *vmcs)
{
	u64 phys_addr = __pa(vmcs);
	u8 error;

	asm volatile (__ex(ASM_VMX_VMPTRLD_RAX) "; setna %0"
			: "=qm"(error) : "a"(&phys_addr), "m"(phys_addr)
			: "cc", "memory");
	if (error)
		printk(KERN_ERR "kvm: vmptrld %p/%llx failed\n",
		       vmcs, phys_addr);
}
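
/*
 * Illustrative lifecycle, simplified: vmcs_load() (VMPTRLD) makes a VMCS
 * active and current on the executing CPU, which is what vmx_vcpu_load()
 * below does whenever per_cpu(current_vmcs) changes; vmcs_clear() (VMCLEAR)
 * flushes it back to memory and deactivates it, and must be run on the CPU
 * the VMCS was last loaded on -- hence the loaded_vmcss_on_cpu list and the
 * cross-CPU call in loaded_vmcs_clear() below.
 */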

#ifdef CONFIG_KEXEC
/*
 * This bitmap is used to indicate whether the vmclear
 * operation is enabled on each cpu. All are disabled by
 * default.
 */
static cpumask_t crash_vmclear_enabled_bitmap = CPU_MASK_NONE;

static inline void crash_enable_local_vmclear(int cpu)
{
	cpumask_set_cpu(cpu, &crash_vmclear_enabled_bitmap);
}

static inline void crash_disable_local_vmclear(int cpu)
{
	cpumask_clear_cpu(cpu, &crash_vmclear_enabled_bitmap);
}

static inline int crash_local_vmclear_enabled(int cpu)
{
	return cpumask_test_cpu(cpu, &crash_vmclear_enabled_bitmap);
}

static void crash_vmclear_local_loaded_vmcss(void)
{
	int cpu = raw_smp_processor_id();
	struct loaded_vmcs *v;

	if (!crash_local_vmclear_enabled(cpu))
		return;

	list_for_each_entry(v, &per_cpu(loaded_vmcss_on_cpu, cpu),
			    loaded_vmcss_on_cpu_link)
		vmcs_clear(v->vmcs);
}
#else
static inline void crash_enable_local_vmclear(int cpu) { }
static inline void crash_disable_local_vmclear(int cpu) { }
#endif /* CONFIG_KEXEC */

static void __loaded_vmcs_clear(void *arg)
{
	struct loaded_vmcs *loaded_vmcs = arg;
	int cpu = raw_smp_processor_id();

	if (loaded_vmcs->cpu != cpu)
		return; /* vcpu migration can race with cpu offline */
	if (per_cpu(current_vmcs, cpu) == loaded_vmcs->vmcs)
		per_cpu(current_vmcs, cpu) = NULL;
	crash_disable_local_vmclear(cpu);
	list_del(&loaded_vmcs->loaded_vmcss_on_cpu_link);

	/*
	 * We should ensure updating loaded_vmcs->loaded_vmcss_on_cpu_link
	 * happens before setting loaded_vmcs->cpu to -1, which is done in
	 * loaded_vmcs_init. Otherwise, another cpu can see cpu = -1 first
	 * and then add the vmcs into the percpu list before it is deleted.
	 */
	smp_wmb();

	loaded_vmcs_init(loaded_vmcs);
	crash_enable_local_vmclear(cpu);
}

static void loaded_vmcs_clear(struct loaded_vmcs *loaded_vmcs)
{
	int cpu = loaded_vmcs->cpu;

	if (cpu != -1)
		smp_call_function_single(cpu,
			 __loaded_vmcs_clear, loaded_vmcs, 1);
}

static inline void vpid_sync_vcpu_single(struct vcpu_vmx *vmx)
{
	if (vmx->vpid == 0)
		return;

	if (cpu_has_vmx_invvpid_single())
		__invvpid(VMX_VPID_EXTENT_SINGLE_CONTEXT, vmx->vpid, 0);
}

static inline void vpid_sync_vcpu_global(void)
{
	if (cpu_has_vmx_invvpid_global())
		__invvpid(VMX_VPID_EXTENT_ALL_CONTEXT, 0, 0);
}

static inline void vpid_sync_context(struct vcpu_vmx *vmx)
{
	if (cpu_has_vmx_invvpid_single())
		vpid_sync_vcpu_single(vmx);
	else
		vpid_sync_vcpu_global();
}

static inline void ept_sync_global(void)
{
	if (cpu_has_vmx_invept_global())
		__invept(VMX_EPT_EXTENT_GLOBAL, 0, 0);
}

static inline void ept_sync_context(u64 eptp)
{
	if (enable_ept) {
		if (cpu_has_vmx_invept_context())
			__invept(VMX_EPT_EXTENT_CONTEXT, eptp, 0);
		else
			ept_sync_global();
	}
}

static __always_inline unsigned long vmcs_readl(unsigned long field)
{
	unsigned long value;

	asm volatile (__ex_clear(ASM_VMX_VMREAD_RDX_RAX, "%0")
		      : "=a"(value) : "d"(field) : "cc");
	return value;
}

static __always_inline u16 vmcs_read16(unsigned long field)
{
	return vmcs_readl(field);
}

static __always_inline u32 vmcs_read32(unsigned long field)
{
	return vmcs_readl(field);
}

static __always_inline u64 vmcs_read64(unsigned long field)
{
#ifdef CONFIG_X86_64
	return vmcs_readl(field);
#else
	return vmcs_readl(field) | ((u64)vmcs_readl(field+1) << 32);
#endif
}

static noinline void vmwrite_error(unsigned long field, unsigned long value)
{
	printk(KERN_ERR "vmwrite error: reg %lx value %lx (err %d)\n",
	       field, value, vmcs_read32(VM_INSTRUCTION_ERROR));
	dump_stack();
}

static void vmcs_writel(unsigned long field, unsigned long value)
{
	u8 error;

	asm volatile (__ex(ASM_VMX_VMWRITE_RAX_RDX) "; setna %0"
		       : "=q"(error) : "a"(value), "d"(field) : "cc");
	if (unlikely(error))
		vmwrite_error(field, value);
}

static void vmcs_write16(unsigned long field, u16 value)
{
	vmcs_writel(field, value);
}

static void vmcs_write32(unsigned long field, u32 value)
{
	vmcs_writel(field, value);
}

static void vmcs_write64(unsigned long field, u64 value)
{
	vmcs_writel(field, value);
#ifndef CONFIG_X86_64
	asm volatile ("");
	vmcs_writel(field+1, value >> 32);
#endif
}

static void vmcs_clear_bits(unsigned long field, u32 mask)
{
	vmcs_writel(field, vmcs_readl(field) & ~mask);
}

static void vmcs_set_bits(unsigned long field, u32 mask)
{
	vmcs_writel(field, vmcs_readl(field) | mask);
}

static inline void vm_entry_controls_init(struct vcpu_vmx *vmx, u32 val)
{
	vmcs_write32(VM_ENTRY_CONTROLS, val);
	vmx->vm_entry_controls_shadow = val;
}

static inline void vm_entry_controls_set(struct vcpu_vmx *vmx, u32 val)
{
	if (vmx->vm_entry_controls_shadow != val)
		vm_entry_controls_init(vmx, val);
}

static inline u32 vm_entry_controls_get(struct vcpu_vmx *vmx)
{
	return vmx->vm_entry_controls_shadow;
}


static inline void vm_entry_controls_setbit(struct vcpu_vmx *vmx, u32 val)
{
	vm_entry_controls_set(vmx, vm_entry_controls_get(vmx) | val);
}

static inline void vm_entry_controls_clearbit(struct vcpu_vmx *vmx, u32 val)
{
	vm_entry_controls_set(vmx, vm_entry_controls_get(vmx) & ~val);
}

static inline void vm_exit_controls_init(struct vcpu_vmx *vmx, u32 val)
{
	vmcs_write32(VM_EXIT_CONTROLS, val);
	vmx->vm_exit_controls_shadow = val;
}

static inline void vm_exit_controls_set(struct vcpu_vmx *vmx, u32 val)
{
	if (vmx->vm_exit_controls_shadow != val)
		vm_exit_controls_init(vmx, val);
}

static inline u32 vm_exit_controls_get(struct vcpu_vmx *vmx)
{
	return vmx->vm_exit_controls_shadow;
}


static inline void vm_exit_controls_setbit(struct vcpu_vmx *vmx, u32 val)
{
	vm_exit_controls_set(vmx, vm_exit_controls_get(vmx) | val);
}

static inline void vm_exit_controls_clearbit(struct vcpu_vmx *vmx, u32 val)
{
	vm_exit_controls_set(vmx, vm_exit_controls_get(vmx) & ~val);
}

static void vmx_segment_cache_clear(struct vcpu_vmx *vmx)
{
	vmx->segment_cache.bitmask = 0;
}

static bool vmx_segment_cache_test_set(struct vcpu_vmx *vmx, unsigned seg,
				       unsigned field)
{
	bool ret;
	u32 mask = 1 << (seg * SEG_FIELD_NR + field);

	if (!(vmx->vcpu.arch.regs_avail & (1 << VCPU_EXREG_SEGMENTS))) {
		vmx->vcpu.arch.regs_avail |= (1 << VCPU_EXREG_SEGMENTS);
		vmx->segment_cache.bitmask = 0;
	}
	ret = vmx->segment_cache.bitmask & mask;
	vmx->segment_cache.bitmask |= mask;
	return ret;
}
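
/*
 * Example of the caching contract above (illustrative): the first
 * vmx_read_guest_seg_base(vmx, VCPU_SREG_CS) after an exit finds the
 * VCPU_EXREG_SEGMENTS bit clear, resets the bitmask, performs the VMREAD
 * and records SEG_FIELD_BASE for CS; subsequent reads of that same field
 * are served from segment_cache.seg[] without touching the VMCS, until
 * vmx_segment_cache_clear() (on a segment write) or the next exit
 * invalidates the cache.
 */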

static u16 vmx_read_guest_seg_selector(struct vcpu_vmx *vmx, unsigned seg)
{
	u16 *p = &vmx->segment_cache.seg[seg].selector;

	if (!vmx_segment_cache_test_set(vmx, seg, SEG_FIELD_SEL))
		*p = vmcs_read16(kvm_vmx_segment_fields[seg].selector);
	return *p;
}

static ulong vmx_read_guest_seg_base(struct vcpu_vmx *vmx, unsigned seg)
{
	ulong *p = &vmx->segment_cache.seg[seg].base;

	if (!vmx_segment_cache_test_set(vmx, seg, SEG_FIELD_BASE))
		*p = vmcs_readl(kvm_vmx_segment_fields[seg].base);
	return *p;
}

static u32 vmx_read_guest_seg_limit(struct vcpu_vmx *vmx, unsigned seg)
{
	u32 *p = &vmx->segment_cache.seg[seg].limit;

	if (!vmx_segment_cache_test_set(vmx, seg, SEG_FIELD_LIMIT))
		*p = vmcs_read32(kvm_vmx_segment_fields[seg].limit);
	return *p;
}

static u32 vmx_read_guest_seg_ar(struct vcpu_vmx *vmx, unsigned seg)
{
	u32 *p = &vmx->segment_cache.seg[seg].ar;

	if (!vmx_segment_cache_test_set(vmx, seg, SEG_FIELD_AR))
		*p = vmcs_read32(kvm_vmx_segment_fields[seg].ar_bytes);
	return *p;
}

static void update_exception_bitmap(struct kvm_vcpu *vcpu)
{
	u32 eb;

	eb = (1u << PF_VECTOR) | (1u << UD_VECTOR) | (1u << MC_VECTOR) |
	     (1u << NM_VECTOR) | (1u << DB_VECTOR);
	if ((vcpu->guest_debug &
	     (KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_SW_BP)) ==
	    (KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_SW_BP))
		eb |= 1u << BP_VECTOR;
	if (to_vmx(vcpu)->rmode.vm86_active)
		eb = ~0;
	if (enable_ept)
		eb &= ~(1u << PF_VECTOR); /* bypass_guest_pf = 0 */
	if (vcpu->fpu_active)
		eb &= ~(1u << NM_VECTOR);

	/* When we are running a nested L2 guest and L1 specified for it a
	 * certain exception bitmap, we must trap the same exceptions and pass
	 * them to L1. When running L2, we will only handle the exceptions
	 * specified above if L1 did not want them.
	 */
	if (is_guest_mode(vcpu))
		eb |= get_vmcs12(vcpu)->exception_bitmap;

	vmcs_write32(EXCEPTION_BITMAP, eb);
}

static void clear_atomic_switch_msr_special(struct vcpu_vmx *vmx,
		unsigned long entry, unsigned long exit)
{
	vm_entry_controls_clearbit(vmx, entry);
	vm_exit_controls_clearbit(vmx, exit);
}

static void clear_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr)
{
	unsigned i;
	struct msr_autoload *m = &vmx->msr_autoload;

	switch (msr) {
	case MSR_EFER:
		if (cpu_has_load_ia32_efer) {
			clear_atomic_switch_msr_special(vmx,
					VM_ENTRY_LOAD_IA32_EFER,
					VM_EXIT_LOAD_IA32_EFER);
			return;
		}
		break;
	case MSR_CORE_PERF_GLOBAL_CTRL:
		if (cpu_has_load_perf_global_ctrl) {
			clear_atomic_switch_msr_special(vmx,
					VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL,
					VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL);
			return;
		}
		break;
	}

	for (i = 0; i < m->nr; ++i)
		if (m->guest[i].index == msr)
			break;

	if (i == m->nr)
		return;
	--m->nr;
	m->guest[i] = m->guest[m->nr];
	m->host[i] = m->host[m->nr];
	vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, m->nr);
	vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, m->nr);
}

static void add_atomic_switch_msr_special(struct vcpu_vmx *vmx,
		unsigned long entry, unsigned long exit,
		unsigned long guest_val_vmcs, unsigned long host_val_vmcs,
		u64 guest_val, u64 host_val)
{
	vmcs_write64(guest_val_vmcs, guest_val);
	vmcs_write64(host_val_vmcs, host_val);
	vm_entry_controls_setbit(vmx, entry);
	vm_exit_controls_setbit(vmx, exit);
}

static void add_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr,
				  u64 guest_val, u64 host_val)
{
	unsigned i;
	struct msr_autoload *m = &vmx->msr_autoload;

	switch (msr) {
	case MSR_EFER:
		if (cpu_has_load_ia32_efer) {
			add_atomic_switch_msr_special(vmx,
					VM_ENTRY_LOAD_IA32_EFER,
					VM_EXIT_LOAD_IA32_EFER,
					GUEST_IA32_EFER,
					HOST_IA32_EFER,
					guest_val, host_val);
			return;
		}
		break;
	case MSR_CORE_PERF_GLOBAL_CTRL:
		if (cpu_has_load_perf_global_ctrl) {
			add_atomic_switch_msr_special(vmx,
					VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL,
					VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL,
					GUEST_IA32_PERF_GLOBAL_CTRL,
					HOST_IA32_PERF_GLOBAL_CTRL,
					guest_val, host_val);
			return;
		}
		break;
	}

	for (i = 0; i < m->nr; ++i)
		if (m->guest[i].index == msr)
			break;

	if (i == NR_AUTOLOAD_MSRS) {
		printk_once(KERN_WARNING "Not enough msr switch entries. "
				"Can't add msr %x\n", msr);
		return;
	} else if (i == m->nr) {
		++m->nr;
		vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, m->nr);
		vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, m->nr);
	}

	m->guest[i].index = msr;
	m->guest[i].value = guest_val;
	m->host[i].index = msr;
	m->host[i].value = host_val;
}
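
/*
 * Illustrative example: add_atomic_switch_msr(vmx, MSR_EFER, guest, host)
 * prefers the dedicated VM_ENTRY/VM_EXIT_LOAD_IA32_EFER controls when the
 * CPU supports them (cpu_has_load_ia32_efer) and only falls back to an
 * entry in the msr_autoload arrays otherwise; those arrays are capped at
 * NR_AUTOLOAD_MSRS (8), and on overflow the request is reported once and
 * the MSR is simply not switched.
 */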

static void reload_tss(void)
{
	/*
	 * VT restores TR but not its size.  Useless.
	 */
	struct desc_ptr *gdt = this_cpu_ptr(&host_gdt);
	struct desc_struct *descs;

	descs = (void *)gdt->address;
	descs[GDT_ENTRY_TSS].type = 9; /* available TSS */
	load_TR_desc();
}

static bool update_transition_efer(struct vcpu_vmx *vmx, int efer_offset)
{
	u64 guest_efer;
	u64 ignore_bits;

	guest_efer = vmx->vcpu.arch.efer;

	/*
	 * NX is emulated; LMA and LME handled by hardware; SCE meaningless
	 * outside long mode
	 */
	ignore_bits = EFER_NX | EFER_SCE;
#ifdef CONFIG_X86_64
	ignore_bits |= EFER_LMA | EFER_LME;
	/* SCE is meaningful only in long mode on Intel */
	if (guest_efer & EFER_LMA)
		ignore_bits &= ~(u64)EFER_SCE;
#endif
	guest_efer &= ~ignore_bits;
	guest_efer |= host_efer & ignore_bits;
	vmx->guest_msrs[efer_offset].data = guest_efer;
	vmx->guest_msrs[efer_offset].mask = ~ignore_bits;

	clear_atomic_switch_msr(vmx, MSR_EFER);
	/* On ept, can't emulate nx, and must switch nx atomically */
	if (enable_ept && ((vmx->vcpu.arch.efer ^ host_efer) & EFER_NX)) {
		guest_efer = vmx->vcpu.arch.efer;
		if (!(guest_efer & EFER_LMA))
			guest_efer &= ~EFER_LME;
		add_atomic_switch_msr(vmx, MSR_EFER, guest_efer, host_efer);
		return false;
	}

	return true;
}

static unsigned long segment_base(u16 selector)
{
	struct desc_ptr *gdt = this_cpu_ptr(&host_gdt);
	struct desc_struct *d;
	unsigned long table_base;
	unsigned long v;

	if (!(selector & ~3))
		return 0;

	table_base = gdt->address;

	if (selector & 4) {           /* from ldt */
		u16 ldt_selector = kvm_read_ldt();

		if (!(ldt_selector & ~3))
			return 0;

		table_base = segment_base(ldt_selector);
	}
	d = (struct desc_struct *)(table_base + (selector & ~7));
	v = get_desc_base(d);
#ifdef CONFIG_X86_64
       if (d->s == 0 && (d->type == 2 || d->type == 9 || d->type == 11))
               v |= ((unsigned long)((struct ldttss_desc64 *)d)->base3) << 32;
#endif
	return v;
}

static inline unsigned long kvm_read_tr_base(void)
{
	u16 tr;
	asm("str %0" : "=g"(tr));
	return segment_base(tr);
}

static void vmx_save_host_state(struct kvm_vcpu *vcpu)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	int i;

	if (vmx->host_state.loaded)
		return;

	vmx->host_state.loaded = 1;
	/*
	 * Set host fs and gs selectors.  Unfortunately, 22.2.3 does not
	 * allow segment selectors with cpl > 0 or ti == 1.
	 */
	vmx->host_state.ldt_sel = kvm_read_ldt();
	vmx->host_state.gs_ldt_reload_needed = vmx->host_state.ldt_sel;
	savesegment(fs, vmx->host_state.fs_sel);
	if (!(vmx->host_state.fs_sel & 7)) {
		vmcs_write16(HOST_FS_SELECTOR, vmx->host_state.fs_sel);
		vmx->host_state.fs_reload_needed = 0;
	} else {
		vmcs_write16(HOST_FS_SELECTOR, 0);
		vmx->host_state.fs_reload_needed = 1;
	}
	savesegment(gs, vmx->host_state.gs_sel);
	if (!(vmx->host_state.gs_sel & 7))
		vmcs_write16(HOST_GS_SELECTOR, vmx->host_state.gs_sel);
	else {
		vmcs_write16(HOST_GS_SELECTOR, 0);
		vmx->host_state.gs_ldt_reload_needed = 1;
	}

#ifdef CONFIG_X86_64
	savesegment(ds, vmx->host_state.ds_sel);
	savesegment(es, vmx->host_state.es_sel);
#endif

#ifdef CONFIG_X86_64
	vmcs_writel(HOST_FS_BASE, read_msr(MSR_FS_BASE));
	vmcs_writel(HOST_GS_BASE, read_msr(MSR_GS_BASE));
#else
	vmcs_writel(HOST_FS_BASE, segment_base(vmx->host_state.fs_sel));
	vmcs_writel(HOST_GS_BASE, segment_base(vmx->host_state.gs_sel));
#endif

#ifdef CONFIG_X86_64
	rdmsrl(MSR_KERNEL_GS_BASE, vmx->msr_host_kernel_gs_base);
	if (is_long_mode(&vmx->vcpu))
		wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base);
#endif
	if (boot_cpu_has(X86_FEATURE_MPX))
		rdmsrl(MSR_IA32_BNDCFGS, vmx->host_state.msr_host_bndcfgs);
	for (i = 0; i < vmx->save_nmsrs; ++i)
		kvm_set_shared_msr(vmx->guest_msrs[i].index,
				   vmx->guest_msrs[i].data,
				   vmx->guest_msrs[i].mask);
}

static void __vmx_load_host_state(struct vcpu_vmx *vmx)
{
	if (!vmx->host_state.loaded)
		return;

	++vmx->vcpu.stat.host_state_reload;
	vmx->host_state.loaded = 0;
#ifdef CONFIG_X86_64
	if (is_long_mode(&vmx->vcpu))
		rdmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base);
#endif
	if (vmx->host_state.gs_ldt_reload_needed) {
		kvm_load_ldt(vmx->host_state.ldt_sel);
#ifdef CONFIG_X86_64
		load_gs_index(vmx->host_state.gs_sel);
#else
		loadsegment(gs, vmx->host_state.gs_sel);
#endif
	}
	if (vmx->host_state.fs_reload_needed)
		loadsegment(fs, vmx->host_state.fs_sel);
#ifdef CONFIG_X86_64
	if (unlikely(vmx->host_state.ds_sel | vmx->host_state.es_sel)) {
		loadsegment(ds, vmx->host_state.ds_sel);
		loadsegment(es, vmx->host_state.es_sel);
	}
#endif
	reload_tss();
#ifdef CONFIG_X86_64
	wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_host_kernel_gs_base);
#endif
	if (vmx->host_state.msr_host_bndcfgs)
		wrmsrl(MSR_IA32_BNDCFGS, vmx->host_state.msr_host_bndcfgs);
	/*
	 * If the FPU is not active (through the host task or
	 * the guest vcpu), then restore the cr0.TS bit.
	 */
	if (!user_has_fpu() && !vmx->vcpu.guest_fpu_loaded)
		stts();
	load_gdt(this_cpu_ptr(&host_gdt));
}

static void vmx_load_host_state(struct vcpu_vmx *vmx)
{
	preempt_disable();
	__vmx_load_host_state(vmx);
	preempt_enable();
}

/*
 * Switches to specified vcpu, until a matching vcpu_put(), but assumes
 * vcpu mutex is already taken.
 */
static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	u64 phys_addr = __pa(per_cpu(vmxarea, cpu));

	if (!vmm_exclusive)
		kvm_cpu_vmxon(phys_addr);
	else if (vmx->loaded_vmcs->cpu != cpu)
		loaded_vmcs_clear(vmx->loaded_vmcs);

	if (per_cpu(current_vmcs, cpu) != vmx->loaded_vmcs->vmcs) {
		per_cpu(current_vmcs, cpu) = vmx->loaded_vmcs->vmcs;
		vmcs_load(vmx->loaded_vmcs->vmcs);
	}

	if (vmx->loaded_vmcs->cpu != cpu) {
		struct desc_ptr *gdt = this_cpu_ptr(&host_gdt);
		unsigned long sysenter_esp;

		kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
		local_irq_disable();
		crash_disable_local_vmclear(cpu);

		/*
		 * Read loaded_vmcs->cpu should be before fetching
		 * loaded_vmcs->loaded_vmcss_on_cpu_link.
		 * See the comments in __loaded_vmcs_clear().
		 */
		smp_rmb();

		list_add(&vmx->loaded_vmcs->loaded_vmcss_on_cpu_link,
			 &per_cpu(loaded_vmcss_on_cpu, cpu));
		crash_enable_local_vmclear(cpu);
		local_irq_enable();

		/*
		 * Linux uses per-cpu TSS and GDT, so set these when switching
		 * processors.
		 */
		vmcs_writel(HOST_TR_BASE, kvm_read_tr_base()); /* 22.2.4 */
		vmcs_writel(HOST_GDTR_BASE, gdt->address);   /* 22.2.4 */

		rdmsrl(MSR_IA32_SYSENTER_ESP, sysenter_esp);
		vmcs_writel(HOST_IA32_SYSENTER_ESP, sysenter_esp); /* 22.2.3 */
		vmx->loaded_vmcs->cpu = cpu;
	}
}

static void vmx_vcpu_put(struct kvm_vcpu *vcpu)
{
	__vmx_load_host_state(to_vmx(vcpu));
	if (!vmm_exclusive) {
		__loaded_vmcs_clear(to_vmx(vcpu)->loaded_vmcs);
		vcpu->cpu = -1;
		kvm_cpu_vmxoff();
	}
}

static void vmx_fpu_activate(struct kvm_vcpu *vcpu)
{
	ulong cr0;

	if (vcpu->fpu_active)
		return;
	vcpu->fpu_active = 1;
	cr0 = vmcs_readl(GUEST_CR0);
	cr0 &= ~(X86_CR0_TS | X86_CR0_MP);
	cr0 |= kvm_read_cr0_bits(vcpu, X86_CR0_TS | X86_CR0_MP);
	vmcs_writel(GUEST_CR0, cr0);
	update_exception_bitmap(vcpu);
	vcpu->arch.cr0_guest_owned_bits = X86_CR0_TS;
	if (is_guest_mode(vcpu))
		vcpu->arch.cr0_guest_owned_bits &=
			~get_vmcs12(vcpu)->cr0_guest_host_mask;
	vmcs_writel(CR0_GUEST_HOST_MASK, ~vcpu->arch.cr0_guest_owned_bits);
}

static void vmx_decache_cr0_guest_bits(struct kvm_vcpu *vcpu);

/*
 * Return the cr0 value that a nested guest would read. This is a combination
 * of the real cr0 used to run the guest (guest_cr0), and the bits shadowed by
 * its hypervisor (cr0_read_shadow).
 */
static inline unsigned long nested_read_cr0(struct vmcs12 *fields)
{
	return (fields->guest_cr0 & ~fields->cr0_guest_host_mask) |
		(fields->cr0_read_shadow & fields->cr0_guest_host_mask);
}
static inline unsigned long nested_read_cr4(struct vmcs12 *fields)
{
	return (fields->guest_cr4 & ~fields->cr4_guest_host_mask) |
		(fields->cr4_read_shadow & fields->cr4_guest_host_mask);
}
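
/*
 * Worked example (illustrative): if L1 set cr0_guest_host_mask to
 * X86_CR0_TS, then TS is owned by L1 and L2 reads it from cr0_read_shadow,
 * while every other cr0 bit comes straight from guest_cr0.  That is what
 * nested_read_cr0() = (guest_cr0 & ~mask) | (read_shadow & mask) computes,
 * mirroring what the hardware does for CR0 reads under a guest/host mask.
 */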
1919 */ 1920 vmx_decache_cr0_guest_bits(vcpu); 1921 vmcs_set_bits(GUEST_CR0, X86_CR0_TS | X86_CR0_MP); 1922 update_exception_bitmap(vcpu); 1923 vcpu->arch.cr0_guest_owned_bits = 0; 1924 vmcs_writel(CR0_GUEST_HOST_MASK, ~vcpu->arch.cr0_guest_owned_bits); 1925 if (is_guest_mode(vcpu)) { 1926 /* 1927 * L1's specified read shadow might not contain the TS bit, 1928 * so now that we turned on shadowing of this bit, we need to 1929 * set this bit of the shadow. Like in nested_vmx_run we need 1930 * nested_read_cr0(vmcs12), but vmcs12->guest_cr0 is not yet 1931 * up-to-date here because we just decached cr0.TS (and we'll 1932 * only update vmcs12->guest_cr0 on nested exit). 1933 */ 1934 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 1935 vmcs12->guest_cr0 = (vmcs12->guest_cr0 & ~X86_CR0_TS) | 1936 (vcpu->arch.cr0 & X86_CR0_TS); 1937 vmcs_writel(CR0_READ_SHADOW, nested_read_cr0(vmcs12)); 1938 } else 1939 vmcs_writel(CR0_READ_SHADOW, vcpu->arch.cr0); 1940} 1941 1942static unsigned long vmx_get_rflags(struct kvm_vcpu *vcpu) 1943{ 1944 unsigned long rflags, save_rflags; 1945 1946 if (!test_bit(VCPU_EXREG_RFLAGS, (ulong *)&vcpu->arch.regs_avail)) { 1947 __set_bit(VCPU_EXREG_RFLAGS, (ulong *)&vcpu->arch.regs_avail); 1948 rflags = vmcs_readl(GUEST_RFLAGS); 1949 if (to_vmx(vcpu)->rmode.vm86_active) { 1950 rflags &= RMODE_GUEST_OWNED_EFLAGS_BITS; 1951 save_rflags = to_vmx(vcpu)->rmode.save_rflags; 1952 rflags |= save_rflags & ~RMODE_GUEST_OWNED_EFLAGS_BITS; 1953 } 1954 to_vmx(vcpu)->rflags = rflags; 1955 } 1956 return to_vmx(vcpu)->rflags; 1957} 1958 1959static void vmx_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags) 1960{ 1961 __set_bit(VCPU_EXREG_RFLAGS, (ulong *)&vcpu->arch.regs_avail); 1962 to_vmx(vcpu)->rflags = rflags; 1963 if (to_vmx(vcpu)->rmode.vm86_active) { 1964 to_vmx(vcpu)->rmode.save_rflags = rflags; 1965 rflags |= X86_EFLAGS_IOPL | X86_EFLAGS_VM; 1966 } 1967 vmcs_writel(GUEST_RFLAGS, rflags); 1968} 1969 1970static u32 vmx_get_interrupt_shadow(struct kvm_vcpu *vcpu) 1971{ 1972 u32 interruptibility = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO); 1973 int ret = 0; 1974 1975 if (interruptibility & GUEST_INTR_STATE_STI) 1976 ret |= KVM_X86_SHADOW_INT_STI; 1977 if (interruptibility & GUEST_INTR_STATE_MOV_SS) 1978 ret |= KVM_X86_SHADOW_INT_MOV_SS; 1979 1980 return ret; 1981} 1982 1983static void vmx_set_interrupt_shadow(struct kvm_vcpu *vcpu, int mask) 1984{ 1985 u32 interruptibility_old = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO); 1986 u32 interruptibility = interruptibility_old; 1987 1988 interruptibility &= ~(GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS); 1989 1990 if (mask & KVM_X86_SHADOW_INT_MOV_SS) 1991 interruptibility |= GUEST_INTR_STATE_MOV_SS; 1992 else if (mask & KVM_X86_SHADOW_INT_STI) 1993 interruptibility |= GUEST_INTR_STATE_STI; 1994 1995 if ((interruptibility != interruptibility_old)) 1996 vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, interruptibility); 1997} 1998 1999static void skip_emulated_instruction(struct kvm_vcpu *vcpu) 2000{ 2001 unsigned long rip; 2002 2003 rip = kvm_rip_read(vcpu); 2004 rip += vmcs_read32(VM_EXIT_INSTRUCTION_LEN); 2005 kvm_rip_write(vcpu, rip); 2006 2007 /* skipping an emulated instruction also counts */ 2008 vmx_set_interrupt_shadow(vcpu, 0); 2009} 2010 2011/* 2012 * KVM wants to inject page-faults which it got to the guest. This function 2013 * checks whether in a nested guest, we need to inject them to L1 or L2. 
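 * For example (illustrative): a #PF (vector 14) raised while L2 runs is
 * reflected to L1 as a nested VM exit only if bit 14 is set in
 * vmcs12->exception_bitmap; otherwise the caller injects it straight
 * into L2.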
2014 */ 2015static int nested_vmx_check_exception(struct kvm_vcpu *vcpu, unsigned nr) 2016{ 2017 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 2018 2019 if (!(vmcs12->exception_bitmap & (1u << nr))) 2020 return 0; 2021 2022 nested_vmx_vmexit(vcpu, to_vmx(vcpu)->exit_reason, 2023 vmcs_read32(VM_EXIT_INTR_INFO), 2024 vmcs_readl(EXIT_QUALIFICATION)); 2025 return 1; 2026} 2027 2028static void vmx_queue_exception(struct kvm_vcpu *vcpu, unsigned nr, 2029 bool has_error_code, u32 error_code, 2030 bool reinject) 2031{ 2032 struct vcpu_vmx *vmx = to_vmx(vcpu); 2033 u32 intr_info = nr | INTR_INFO_VALID_MASK; 2034 2035 if (!reinject && is_guest_mode(vcpu) && 2036 nested_vmx_check_exception(vcpu, nr)) 2037 return; 2038 2039 if (has_error_code) { 2040 vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, error_code); 2041 intr_info |= INTR_INFO_DELIVER_CODE_MASK; 2042 } 2043 2044 if (vmx->rmode.vm86_active) { 2045 int inc_eip = 0; 2046 if (kvm_exception_is_soft(nr)) 2047 inc_eip = vcpu->arch.event_exit_inst_len; 2048 if (kvm_inject_realmode_interrupt(vcpu, nr, inc_eip) != EMULATE_DONE) 2049 kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu); 2050 return; 2051 } 2052 2053 if (kvm_exception_is_soft(nr)) { 2054 vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, 2055 vmx->vcpu.arch.event_exit_inst_len); 2056 intr_info |= INTR_TYPE_SOFT_EXCEPTION; 2057 } else 2058 intr_info |= INTR_TYPE_HARD_EXCEPTION; 2059 2060 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr_info); 2061} 2062 2063static bool vmx_rdtscp_supported(void) 2064{ 2065 return cpu_has_vmx_rdtscp(); 2066} 2067 2068static bool vmx_invpcid_supported(void) 2069{ 2070 return cpu_has_vmx_invpcid() && enable_ept; 2071} 2072 2073/* 2074 * Swap MSR entry in host/guest MSR entry array. 2075 */ 2076static void move_msr_up(struct vcpu_vmx *vmx, int from, int to) 2077{ 2078 struct shared_msr_entry tmp; 2079 2080 tmp = vmx->guest_msrs[to]; 2081 vmx->guest_msrs[to] = vmx->guest_msrs[from]; 2082 vmx->guest_msrs[from] = tmp; 2083} 2084 2085static void vmx_set_msr_bitmap(struct kvm_vcpu *vcpu) 2086{ 2087 unsigned long *msr_bitmap; 2088 2089 if (irqchip_in_kernel(vcpu->kvm) && apic_x2apic_mode(vcpu->arch.apic)) { 2090 if (is_long_mode(vcpu)) 2091 msr_bitmap = vmx_msr_bitmap_longmode_x2apic; 2092 else 2093 msr_bitmap = vmx_msr_bitmap_legacy_x2apic; 2094 } else { 2095 if (is_long_mode(vcpu)) 2096 msr_bitmap = vmx_msr_bitmap_longmode; 2097 else 2098 msr_bitmap = vmx_msr_bitmap_legacy; 2099 } 2100 2101 vmcs_write64(MSR_BITMAP, __pa(msr_bitmap)); 2102} 2103 2104/* 2105 * Set up the vmcs to automatically save and restore system 2106 * msrs. Don't touch the 64-bit msrs if the guest is in legacy 2107 * mode, as fiddling with msrs is very expensive. 2108 */ 2109static void setup_msrs(struct vcpu_vmx *vmx) 2110{ 2111 int save_nmsrs, index; 2112 2113 save_nmsrs = 0; 2114#ifdef CONFIG_X86_64 2115 if (is_long_mode(&vmx->vcpu)) { 2116 index = __find_msr_index(vmx, MSR_SYSCALL_MASK); 2117 if (index >= 0) 2118 move_msr_up(vmx, index, save_nmsrs++); 2119 index = __find_msr_index(vmx, MSR_LSTAR); 2120 if (index >= 0) 2121 move_msr_up(vmx, index, save_nmsrs++); 2122 index = __find_msr_index(vmx, MSR_CSTAR); 2123 if (index >= 0) 2124 move_msr_up(vmx, index, save_nmsrs++); 2125 index = __find_msr_index(vmx, MSR_TSC_AUX); 2126 if (index >= 0 && vmx->rdtscp_enabled) 2127 move_msr_up(vmx, index, save_nmsrs++); 2128 /* 2129 * MSR_STAR is only needed on long mode guests, and only 2130 * if efer.sce is enabled. 
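 * (MSR_STAR holds the CS/SS selector bases used by SYSCALL/SYSRET, so a
 * guest that never sets EFER.SCE has no use for it.)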
2131 */ 2132 index = __find_msr_index(vmx, MSR_STAR); 2133 if ((index >= 0) && (vmx->vcpu.arch.efer & EFER_SCE)) 2134 move_msr_up(vmx, index, save_nmsrs++); 2135 } 2136#endif 2137 index = __find_msr_index(vmx, MSR_EFER); 2138 if (index >= 0 && update_transition_efer(vmx, index)) 2139 move_msr_up(vmx, index, save_nmsrs++); 2140 2141 vmx->save_nmsrs = save_nmsrs; 2142 2143 if (cpu_has_vmx_msr_bitmap()) 2144 vmx_set_msr_bitmap(&vmx->vcpu); 2145} 2146 2147/* 2148 * reads and returns guest's timestamp counter "register" 2149 * guest_tsc = host_tsc + tsc_offset -- 21.3 2150 */ 2151static u64 guest_read_tsc(void) 2152{ 2153 u64 host_tsc, tsc_offset; 2154 2155 rdtscll(host_tsc); 2156 tsc_offset = vmcs_read64(TSC_OFFSET); 2157 return host_tsc + tsc_offset; 2158} 2159 2160/* 2161 * Like guest_read_tsc, but always returns L1's notion of the timestamp 2162 * counter, even if a nested guest (L2) is currently running. 2163 */ 2164static u64 vmx_read_l1_tsc(struct kvm_vcpu *vcpu, u64 host_tsc) 2165{ 2166 u64 tsc_offset; 2167 2168 tsc_offset = is_guest_mode(vcpu) ? 2169 to_vmx(vcpu)->nested.vmcs01_tsc_offset : 2170 vmcs_read64(TSC_OFFSET); 2171 return host_tsc + tsc_offset; 2172} 2173 2174/* 2175 * Engage any workarounds for mis-matched TSC rates. Currently limited to 2176 * software catchup for faster rates on slower CPUs. 2177 */ 2178static void vmx_set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz, bool scale) 2179{ 2180 if (!scale) 2181 return; 2182 2183 if (user_tsc_khz > tsc_khz) { 2184 vcpu->arch.tsc_catchup = 1; 2185 vcpu->arch.tsc_always_catchup = 1; 2186 } else 2187 WARN(1, "user requested TSC rate below hardware speed\n"); 2188} 2189 2190static u64 vmx_read_tsc_offset(struct kvm_vcpu *vcpu) 2191{ 2192 return vmcs_read64(TSC_OFFSET); 2193} 2194 2195/* 2196 * writes 'offset' into guest's timestamp counter offset register 2197 */ 2198static void vmx_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset) 2199{ 2200 if (is_guest_mode(vcpu)) { 2201 /* 2202 * We're here if L1 chose not to trap WRMSR to TSC. According 2203 * to the spec, this should set L1's TSC; The offset that L1 2204 * set for L2 remains unchanged, and still needs to be added 2205 * to the newly set TSC to get L2's TSC. 2206 */ 2207 struct vmcs12 *vmcs12; 2208 to_vmx(vcpu)->nested.vmcs01_tsc_offset = offset; 2209 /* recalculate vmcs02.TSC_OFFSET: */ 2210 vmcs12 = get_vmcs12(vcpu); 2211 vmcs_write64(TSC_OFFSET, offset + 2212 (nested_cpu_has(vmcs12, CPU_BASED_USE_TSC_OFFSETING) ? 
2213 vmcs12->tsc_offset : 0)); 2214 } else { 2215 trace_kvm_write_tsc_offset(vcpu->vcpu_id, 2216 vmcs_read64(TSC_OFFSET), offset); 2217 vmcs_write64(TSC_OFFSET, offset); 2218 } 2219} 2220 2221static void vmx_adjust_tsc_offset(struct kvm_vcpu *vcpu, s64 adjustment, bool host) 2222{ 2223 u64 offset = vmcs_read64(TSC_OFFSET); 2224 2225 vmcs_write64(TSC_OFFSET, offset + adjustment); 2226 if (is_guest_mode(vcpu)) { 2227 /* Even when running L2, the adjustment needs to apply to L1 */ 2228 to_vmx(vcpu)->nested.vmcs01_tsc_offset += adjustment; 2229 } else 2230 trace_kvm_write_tsc_offset(vcpu->vcpu_id, offset, 2231 offset + adjustment); 2232} 2233 2234static u64 vmx_compute_tsc_offset(struct kvm_vcpu *vcpu, u64 target_tsc) 2235{ 2236 return target_tsc - native_read_tsc(); 2237} 2238 2239static bool guest_cpuid_has_vmx(struct kvm_vcpu *vcpu) 2240{ 2241 struct kvm_cpuid_entry2 *best = kvm_find_cpuid_entry(vcpu, 1, 0); 2242 return best && (best->ecx & (1 << (X86_FEATURE_VMX & 31))); 2243} 2244 2245/* 2246 * nested_vmx_allowed() checks whether a guest should be allowed to use VMX 2247 * instructions and MSRs (i.e., nested VMX). Nested VMX is disabled for 2248 * all guests if the "nested" module option is off, and can also be disabled 2249 * for a single guest by disabling its VMX cpuid bit. 2250 */ 2251static inline bool nested_vmx_allowed(struct kvm_vcpu *vcpu) 2252{ 2253 return nested && guest_cpuid_has_vmx(vcpu); 2254} 2255 2256/* 2257 * nested_vmx_setup_ctls_msrs() sets up variables containing the values to be 2258 * returned for the various VMX controls MSRs when nested VMX is enabled. 2259 * The same values should also be used to verify that vmcs12 control fields are 2260 * valid during nested entry from L1 to L2. 2261 * Each of these control msrs has a low and high 32-bit half: A low bit is on 2262 * if the corresponding bit in the (32-bit) control field *must* be on, and a 2263 * bit in the high half is on if the corresponding bit in the control field 2264 * may be on. See also vmx_control_verify(). 2265 * TODO: allow these variables to be modified (downgraded) by module options 2266 * or other means. 2267 */ 2268static u32 nested_vmx_procbased_ctls_low, nested_vmx_procbased_ctls_high; 2269static u32 nested_vmx_true_procbased_ctls_low; 2270static u32 nested_vmx_secondary_ctls_low, nested_vmx_secondary_ctls_high; 2271static u32 nested_vmx_pinbased_ctls_low, nested_vmx_pinbased_ctls_high; 2272static u32 nested_vmx_exit_ctls_low, nested_vmx_exit_ctls_high; 2273static u32 nested_vmx_true_exit_ctls_low; 2274static u32 nested_vmx_entry_ctls_low, nested_vmx_entry_ctls_high; 2275static u32 nested_vmx_true_entry_ctls_low; 2276static u32 nested_vmx_misc_low, nested_vmx_misc_high; 2277static u32 nested_vmx_ept_caps; 2278static __init void nested_vmx_setup_ctls_msrs(void) 2279{ 2280 /* 2281 * Note that as a general rule, the high half of the MSRs (bits in 2282 * the control fields which may be 1) should be initialized by the 2283 * intersection of the underlying hardware's MSR (i.e., features which 2284 * can be supported) and the list of features we want to expose - 2285 * because they are known to be properly supported in our code. 2286 * Also, usually, the low half of the MSRs (bits which must be 1) can 2287 * be set to 0, meaning that L1 may turn off any of these bits. 
The 2288 * reason is that if one of these bits is necessary, it will appear 2289 * in vmcs01 and prepare_vmcs02, when it bitwise-or's the control 2290 * fields of vmcs01 and vmcs02, will turn these bits off - and 2291 * nested_vmx_exit_handled() will not pass related exits to L1. 2292 * These rules have exceptions below. 2293 */ 2294 2295 /* pin-based controls */ 2296 rdmsr(MSR_IA32_VMX_PINBASED_CTLS, 2297 nested_vmx_pinbased_ctls_low, nested_vmx_pinbased_ctls_high); 2298 nested_vmx_pinbased_ctls_low |= PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR; 2299 nested_vmx_pinbased_ctls_high &= PIN_BASED_EXT_INTR_MASK | 2300 PIN_BASED_NMI_EXITING | PIN_BASED_VIRTUAL_NMIS; 2301 nested_vmx_pinbased_ctls_high |= PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR | 2302 PIN_BASED_VMX_PREEMPTION_TIMER; 2303 2304 /* exit controls */ 2305 rdmsr(MSR_IA32_VMX_EXIT_CTLS, 2306 nested_vmx_exit_ctls_low, nested_vmx_exit_ctls_high); 2307 nested_vmx_exit_ctls_low = VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR; 2308 2309 nested_vmx_exit_ctls_high &= 2310#ifdef CONFIG_X86_64 2311 VM_EXIT_HOST_ADDR_SPACE_SIZE | 2312#endif 2313 VM_EXIT_LOAD_IA32_PAT | VM_EXIT_SAVE_IA32_PAT; 2314 nested_vmx_exit_ctls_high |= VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR | 2315 VM_EXIT_LOAD_IA32_EFER | VM_EXIT_SAVE_IA32_EFER | 2316 VM_EXIT_SAVE_VMX_PREEMPTION_TIMER | VM_EXIT_ACK_INTR_ON_EXIT; 2317 2318 if (vmx_mpx_supported()) 2319 nested_vmx_exit_ctls_high |= VM_EXIT_CLEAR_BNDCFGS; 2320 2321 /* We support free control of debug control saving. */ 2322 nested_vmx_true_exit_ctls_low = nested_vmx_exit_ctls_low & 2323 ~VM_EXIT_SAVE_DEBUG_CONTROLS; 2324 2325 /* entry controls */ 2326 rdmsr(MSR_IA32_VMX_ENTRY_CTLS, 2327 nested_vmx_entry_ctls_low, nested_vmx_entry_ctls_high); 2328 nested_vmx_entry_ctls_low = VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR; 2329 nested_vmx_entry_ctls_high &= 2330#ifdef CONFIG_X86_64 2331 VM_ENTRY_IA32E_MODE | 2332#endif 2333 VM_ENTRY_LOAD_IA32_PAT; 2334 nested_vmx_entry_ctls_high |= (VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR | 2335 VM_ENTRY_LOAD_IA32_EFER); 2336 if (vmx_mpx_supported()) 2337 nested_vmx_entry_ctls_high |= VM_ENTRY_LOAD_BNDCFGS; 2338 2339 /* We support free control of debug control loading. */ 2340 nested_vmx_true_entry_ctls_low = nested_vmx_entry_ctls_low & 2341 ~VM_ENTRY_LOAD_DEBUG_CONTROLS; 2342 2343 /* cpu-based controls */ 2344 rdmsr(MSR_IA32_VMX_PROCBASED_CTLS, 2345 nested_vmx_procbased_ctls_low, nested_vmx_procbased_ctls_high); 2346 nested_vmx_procbased_ctls_low = CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR; 2347 nested_vmx_procbased_ctls_high &= 2348 CPU_BASED_VIRTUAL_INTR_PENDING | 2349 CPU_BASED_VIRTUAL_NMI_PENDING | CPU_BASED_USE_TSC_OFFSETING | 2350 CPU_BASED_HLT_EXITING | CPU_BASED_INVLPG_EXITING | 2351 CPU_BASED_MWAIT_EXITING | CPU_BASED_CR3_LOAD_EXITING | 2352 CPU_BASED_CR3_STORE_EXITING | 2353#ifdef CONFIG_X86_64 2354 CPU_BASED_CR8_LOAD_EXITING | CPU_BASED_CR8_STORE_EXITING | 2355#endif 2356 CPU_BASED_MOV_DR_EXITING | CPU_BASED_UNCOND_IO_EXITING | 2357 CPU_BASED_USE_IO_BITMAPS | CPU_BASED_MONITOR_EXITING | 2358 CPU_BASED_RDPMC_EXITING | CPU_BASED_RDTSC_EXITING | 2359 CPU_BASED_PAUSE_EXITING | CPU_BASED_TPR_SHADOW | 2360 CPU_BASED_ACTIVATE_SECONDARY_CONTROLS; 2361 /* 2362 * We can allow some features even when not supported by the 2363 * hardware. For example, L1 can specify an MSR bitmap - and we 2364 * can use it to avoid exits to L1 - even when L0 runs L2 2365 * without MSR bitmaps. 
2366 */ 2367 nested_vmx_procbased_ctls_high |= CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR | 2368 CPU_BASED_USE_MSR_BITMAPS; 2369 2370 /* We support free control of CR3 access interception. */ 2371 nested_vmx_true_procbased_ctls_low = nested_vmx_procbased_ctls_low & 2372 ~(CPU_BASED_CR3_LOAD_EXITING | CPU_BASED_CR3_STORE_EXITING); 2373 2374 /* secondary cpu-based controls */ 2375 rdmsr(MSR_IA32_VMX_PROCBASED_CTLS2, 2376 nested_vmx_secondary_ctls_low, nested_vmx_secondary_ctls_high); 2377 nested_vmx_secondary_ctls_low = 0; 2378 nested_vmx_secondary_ctls_high &= 2379 SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES | 2380 SECONDARY_EXEC_UNRESTRICTED_GUEST | 2381 SECONDARY_EXEC_WBINVD_EXITING; 2382 2383 if (enable_ept) { 2384 /* nested EPT: emulate EPT also to L1 */ 2385 nested_vmx_secondary_ctls_high |= SECONDARY_EXEC_ENABLE_EPT; 2386 nested_vmx_ept_caps = VMX_EPT_PAGE_WALK_4_BIT | 2387 VMX_EPTP_WB_BIT | VMX_EPT_2MB_PAGE_BIT | 2388 VMX_EPT_INVEPT_BIT; 2389 nested_vmx_ept_caps &= vmx_capability.ept; 2390 /* 2391 * For nested guests, we don't do anything specific 2392 * for single context invalidation. Hence, only advertise 2393 * support for global context invalidation. 2394 */ 2395 nested_vmx_ept_caps |= VMX_EPT_EXTENT_GLOBAL_BIT; 2396 } else 2397 nested_vmx_ept_caps = 0; 2398 2399 /* miscellaneous data */ 2400 rdmsr(MSR_IA32_VMX_MISC, nested_vmx_misc_low, nested_vmx_misc_high); 2401 nested_vmx_misc_low &= VMX_MISC_SAVE_EFER_LMA; 2402 nested_vmx_misc_low |= VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE | 2403 VMX_MISC_ACTIVITY_HLT; 2404 nested_vmx_misc_high = 0; 2405} 2406 2407static inline bool vmx_control_verify(u32 control, u32 low, u32 high) 2408{ 2409 /* 2410 * Bits 0 in high must be 0, and bits 1 in low must be 1. 2411 */ 2412 return ((control & high) | low) == control; 2413} 2414 2415static inline u64 vmx_control_msr(u32 low, u32 high) 2416{ 2417 return low | ((u64)high << 32); 2418} 2419 2420/* Returns 0 on success, non-0 otherwise. */ 2421static int vmx_get_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata) 2422{ 2423 switch (msr_index) { 2424 case MSR_IA32_VMX_BASIC: 2425 /* 2426 * This MSR reports some information about VMX support. We 2427 * should return information about the VMX we emulate for the 2428 * guest, and the VMCS structure we give it - not about the 2429 * VMX support of the underlying hardware. 
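 * For instance, the revision id and VMCS size reported below come from
 * our emulated vmcs12 (VMCS12_REVISION, VMCS12_SIZE), and the memory
 * type is always advertised as write-back regardless of the hardware
 * MSR.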
2430 */ 2431 *pdata = VMCS12_REVISION | VMX_BASIC_TRUE_CTLS | 2432 ((u64)VMCS12_SIZE << VMX_BASIC_VMCS_SIZE_SHIFT) | 2433 (VMX_BASIC_MEM_TYPE_WB << VMX_BASIC_MEM_TYPE_SHIFT); 2434 break; 2435 case MSR_IA32_VMX_TRUE_PINBASED_CTLS: 2436 case MSR_IA32_VMX_PINBASED_CTLS: 2437 *pdata = vmx_control_msr(nested_vmx_pinbased_ctls_low, 2438 nested_vmx_pinbased_ctls_high); 2439 break; 2440 case MSR_IA32_VMX_TRUE_PROCBASED_CTLS: 2441 *pdata = vmx_control_msr(nested_vmx_true_procbased_ctls_low, 2442 nested_vmx_procbased_ctls_high); 2443 break; 2444 case MSR_IA32_VMX_PROCBASED_CTLS: 2445 *pdata = vmx_control_msr(nested_vmx_procbased_ctls_low, 2446 nested_vmx_procbased_ctls_high); 2447 break; 2448 case MSR_IA32_VMX_TRUE_EXIT_CTLS: 2449 *pdata = vmx_control_msr(nested_vmx_true_exit_ctls_low, 2450 nested_vmx_exit_ctls_high); 2451 break; 2452 case MSR_IA32_VMX_EXIT_CTLS: 2453 *pdata = vmx_control_msr(nested_vmx_exit_ctls_low, 2454 nested_vmx_exit_ctls_high); 2455 break; 2456 case MSR_IA32_VMX_TRUE_ENTRY_CTLS: 2457 *pdata = vmx_control_msr(nested_vmx_true_entry_ctls_low, 2458 nested_vmx_entry_ctls_high); 2459 break; 2460 case MSR_IA32_VMX_ENTRY_CTLS: 2461 *pdata = vmx_control_msr(nested_vmx_entry_ctls_low, 2462 nested_vmx_entry_ctls_high); 2463 break; 2464 case MSR_IA32_VMX_MISC: 2465 *pdata = vmx_control_msr(nested_vmx_misc_low, 2466 nested_vmx_misc_high); 2467 break; 2468 /* 2469 * These MSRs specify bits which the guest must keep fixed (on or off) 2470 * while L1 is in VMXON mode (in L1's root mode, or running an L2). 2471 * We picked the standard core2 setting. 2472 */ 2473#define VMXON_CR0_ALWAYSON (X86_CR0_PE | X86_CR0_PG | X86_CR0_NE) 2474#define VMXON_CR4_ALWAYSON X86_CR4_VMXE 2475 case MSR_IA32_VMX_CR0_FIXED0: 2476 *pdata = VMXON_CR0_ALWAYSON; 2477 break; 2478 case MSR_IA32_VMX_CR0_FIXED1: 2479 *pdata = -1ULL; 2480 break; 2481 case MSR_IA32_VMX_CR4_FIXED0: 2482 *pdata = VMXON_CR4_ALWAYSON; 2483 break; 2484 case MSR_IA32_VMX_CR4_FIXED1: 2485 *pdata = -1ULL; 2486 break; 2487 case MSR_IA32_VMX_VMCS_ENUM: 2488 *pdata = 0x2e; /* highest index: VMX_PREEMPTION_TIMER_VALUE */ 2489 break; 2490 case MSR_IA32_VMX_PROCBASED_CTLS2: 2491 *pdata = vmx_control_msr(nested_vmx_secondary_ctls_low, 2492 nested_vmx_secondary_ctls_high); 2493 break; 2494 case MSR_IA32_VMX_EPT_VPID_CAP: 2495 /* Currently, no nested vpid support */ 2496 *pdata = nested_vmx_ept_caps; 2497 break; 2498 default: 2499 return 1; 2500 } 2501 2502 return 0; 2503} 2504 2505/* 2506 * Reads an msr value (of 'msr_index') into 'pdata'. 2507 * Returns 0 on success, non-0 otherwise. 2508 * Assumes vcpu_load() was already called. 
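 * (vcpu_load() matters because several cases below read guest state
 * directly from the current VMCS via vmcs_readl()/vmcs_read32(), which
 * requires this vcpu's VMCS to be loaded on this cpu.)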
2509 */ 2510static int vmx_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata) 2511{ 2512 u64 data; 2513 struct shared_msr_entry *msr; 2514 2515 if (!pdata) { 2516 printk(KERN_ERR "BUG: get_msr called with NULL pdata\n"); 2517 return -EINVAL; 2518 } 2519 2520 switch (msr_index) { 2521#ifdef CONFIG_X86_64 2522 case MSR_FS_BASE: 2523 data = vmcs_readl(GUEST_FS_BASE); 2524 break; 2525 case MSR_GS_BASE: 2526 data = vmcs_readl(GUEST_GS_BASE); 2527 break; 2528 case MSR_KERNEL_GS_BASE: 2529 vmx_load_host_state(to_vmx(vcpu)); 2530 data = to_vmx(vcpu)->msr_guest_kernel_gs_base; 2531 break; 2532#endif 2533 case MSR_EFER: 2534 return kvm_get_msr_common(vcpu, msr_index, pdata); 2535 case MSR_IA32_TSC: 2536 data = guest_read_tsc(); 2537 break; 2538 case MSR_IA32_SYSENTER_CS: 2539 data = vmcs_read32(GUEST_SYSENTER_CS); 2540 break; 2541 case MSR_IA32_SYSENTER_EIP: 2542 data = vmcs_readl(GUEST_SYSENTER_EIP); 2543 break; 2544 case MSR_IA32_SYSENTER_ESP: 2545 data = vmcs_readl(GUEST_SYSENTER_ESP); 2546 break; 2547 case MSR_IA32_BNDCFGS: 2548 if (!vmx_mpx_supported()) 2549 return 1; 2550 data = vmcs_read64(GUEST_BNDCFGS); 2551 break; 2552 case MSR_IA32_FEATURE_CONTROL: 2553 if (!nested_vmx_allowed(vcpu)) 2554 return 1; 2555 data = to_vmx(vcpu)->nested.msr_ia32_feature_control; 2556 break; 2557 case MSR_IA32_VMX_BASIC ... MSR_IA32_VMX_VMFUNC: 2558 if (!nested_vmx_allowed(vcpu)) 2559 return 1; 2560 return vmx_get_vmx_msr(vcpu, msr_index, pdata); 2561 case MSR_TSC_AUX: 2562 if (!to_vmx(vcpu)->rdtscp_enabled) 2563 return 1; 2564 /* Otherwise falls through */ 2565 default: 2566 msr = find_msr_entry(to_vmx(vcpu), msr_index); 2567 if (msr) { 2568 data = msr->data; 2569 break; 2570 } 2571 return kvm_get_msr_common(vcpu, msr_index, pdata); 2572 } 2573 2574 *pdata = data; 2575 return 0; 2576} 2577 2578static void vmx_leave_nested(struct kvm_vcpu *vcpu); 2579 2580/* 2581 * Writes the msr value into the appropriate "register". 2582 * Returns 0 on success, non-0 otherwise. 2583 * Assumes vcpu_load() was already called.
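 * (As with vmx_get_msr(), most cases below write the current VMCS
 * directly via vmcs_write*(), so the vcpu's VMCS must already be loaded
 * on this cpu.)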
2584 */ 2585static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) 2586{ 2587 struct vcpu_vmx *vmx = to_vmx(vcpu); 2588 struct shared_msr_entry *msr; 2589 int ret = 0; 2590 u32 msr_index = msr_info->index; 2591 u64 data = msr_info->data; 2592 2593 switch (msr_index) { 2594 case MSR_EFER: 2595 ret = kvm_set_msr_common(vcpu, msr_info); 2596 break; 2597#ifdef CONFIG_X86_64 2598 case MSR_FS_BASE: 2599 vmx_segment_cache_clear(vmx); 2600 vmcs_writel(GUEST_FS_BASE, data); 2601 break; 2602 case MSR_GS_BASE: 2603 vmx_segment_cache_clear(vmx); 2604 vmcs_writel(GUEST_GS_BASE, data); 2605 break; 2606 case MSR_KERNEL_GS_BASE: 2607 vmx_load_host_state(vmx); 2608 vmx->msr_guest_kernel_gs_base = data; 2609 break; 2610#endif 2611 case MSR_IA32_SYSENTER_CS: 2612 vmcs_write32(GUEST_SYSENTER_CS, data); 2613 break; 2614 case MSR_IA32_SYSENTER_EIP: 2615 vmcs_writel(GUEST_SYSENTER_EIP, data); 2616 break; 2617 case MSR_IA32_SYSENTER_ESP: 2618 vmcs_writel(GUEST_SYSENTER_ESP, data); 2619 break; 2620 case MSR_IA32_BNDCFGS: 2621 if (!vmx_mpx_supported()) 2622 return 1; 2623 vmcs_write64(GUEST_BNDCFGS, data); 2624 break; 2625 case MSR_IA32_TSC: 2626 kvm_write_tsc(vcpu, msr_info); 2627 break; 2628 case MSR_IA32_CR_PAT: 2629 if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) { 2630 if (!kvm_mtrr_valid(vcpu, MSR_IA32_CR_PAT, data)) 2631 return 1; 2632 vmcs_write64(GUEST_IA32_PAT, data); 2633 vcpu->arch.pat = data; 2634 break; 2635 } 2636 ret = kvm_set_msr_common(vcpu, msr_info); 2637 break; 2638 case MSR_IA32_TSC_ADJUST: 2639 ret = kvm_set_msr_common(vcpu, msr_info); 2640 break; 2641 case MSR_IA32_FEATURE_CONTROL: 2642 if (!nested_vmx_allowed(vcpu) || 2643 (to_vmx(vcpu)->nested.msr_ia32_feature_control & 2644 FEATURE_CONTROL_LOCKED && !msr_info->host_initiated)) 2645 return 1; 2646 vmx->nested.msr_ia32_feature_control = data; 2647 if (msr_info->host_initiated && data == 0) 2648 vmx_leave_nested(vcpu); 2649 break; 2650 case MSR_IA32_VMX_BASIC ... 
MSR_IA32_VMX_VMFUNC: 2651 return 1; /* they are read-only */ 2652 case MSR_TSC_AUX: 2653 if (!vmx->rdtscp_enabled) 2654 return 1; 2655 /* Check reserved bit, higher 32 bits should be zero */ 2656 if ((data >> 32) != 0) 2657 return 1; 2658 /* Otherwise falls through */ 2659 default: 2660 msr = find_msr_entry(vmx, msr_index); 2661 if (msr) { 2662 u64 old_msr_data = msr->data; 2663 msr->data = data; 2664 if (msr - vmx->guest_msrs < vmx->save_nmsrs) { 2665 preempt_disable(); 2666 ret = kvm_set_shared_msr(msr->index, msr->data, 2667 msr->mask); 2668 preempt_enable(); 2669 if (ret) 2670 msr->data = old_msr_data; 2671 } 2672 break; 2673 } 2674 ret = kvm_set_msr_common(vcpu, msr_info); 2675 } 2676 2677 return ret; 2678} 2679 2680static void vmx_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg) 2681{ 2682 __set_bit(reg, (unsigned long *)&vcpu->arch.regs_avail); 2683 switch (reg) { 2684 case VCPU_REGS_RSP: 2685 vcpu->arch.regs[VCPU_REGS_RSP] = vmcs_readl(GUEST_RSP); 2686 break; 2687 case VCPU_REGS_RIP: 2688 vcpu->arch.regs[VCPU_REGS_RIP] = vmcs_readl(GUEST_RIP); 2689 break; 2690 case VCPU_EXREG_PDPTR: 2691 if (enable_ept) 2692 ept_save_pdptrs(vcpu); 2693 break; 2694 default: 2695 break; 2696 } 2697} 2698 2699static __init int cpu_has_kvm_support(void) 2700{ 2701 return cpu_has_vmx(); 2702} 2703 2704static __init int vmx_disabled_by_bios(void) 2705{ 2706 u64 msr; 2707 2708 rdmsrl(MSR_IA32_FEATURE_CONTROL, msr); 2709 if (msr & FEATURE_CONTROL_LOCKED) { 2710 /* launched w/ TXT and VMX disabled */ 2711 if (!(msr & FEATURE_CONTROL_VMXON_ENABLED_INSIDE_SMX) 2712 && tboot_enabled()) 2713 return 1; 2714 /* launched w/o TXT and VMX only enabled w/ TXT */ 2715 if (!(msr & FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX) 2716 && (msr & FEATURE_CONTROL_VMXON_ENABLED_INSIDE_SMX) 2717 && !tboot_enabled()) { 2718 printk(KERN_WARNING "kvm: disable TXT in the BIOS or " 2719 "activate TXT before enabling KVM\n"); 2720 return 1; 2721 } 2722 /* launched w/o TXT and VMX disabled */ 2723 if (!(msr & FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX) 2724 && !tboot_enabled()) 2725 return 1; 2726 } 2727 2728 return 0; 2729} 2730 2731static void kvm_cpu_vmxon(u64 addr) 2732{ 2733 asm volatile (ASM_VMX_VMXON_RAX 2734 : : "a"(&addr), "m"(addr) 2735 : "memory", "cc"); 2736} 2737 2738static int hardware_enable(void) 2739{ 2740 int cpu = raw_smp_processor_id(); 2741 u64 phys_addr = __pa(per_cpu(vmxarea, cpu)); 2742 u64 old, test_bits; 2743 2744 if (read_cr4() & X86_CR4_VMXE) 2745 return -EBUSY; 2746 2747 INIT_LIST_HEAD(&per_cpu(loaded_vmcss_on_cpu, cpu)); 2748 2749 /* 2750 * Now we can enable the vmclear operation in kdump 2751 * since the loaded_vmcss_on_cpu list on this cpu 2752 * has been initialized. 2753 * 2754 * Though the cpu is not in VMX operation now, there 2755 * is no problem to enable the vmclear operation 2756 * for the loaded_vmcss_on_cpu list is empty! 
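 * (The crash/kdump callback only walks loaded_vmcss_on_cpu and VMCLEARs
 * each entry - compare vmclear_local_loaded_vmcss() below - so with an
 * empty list it is a harmless no-op.)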
2757 */ 2758 crash_enable_local_vmclear(cpu); 2759 2760 rdmsrl(MSR_IA32_FEATURE_CONTROL, old); 2761 2762 test_bits = FEATURE_CONTROL_LOCKED; 2763 test_bits |= FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX; 2764 if (tboot_enabled()) 2765 test_bits |= FEATURE_CONTROL_VMXON_ENABLED_INSIDE_SMX; 2766 2767 if ((old & test_bits) != test_bits) { 2768 /* enable and lock */ 2769 wrmsrl(MSR_IA32_FEATURE_CONTROL, old | test_bits); 2770 } 2771 write_cr4(read_cr4() | X86_CR4_VMXE); /* FIXME: not cpu hotplug safe */ 2772 2773 if (vmm_exclusive) { 2774 kvm_cpu_vmxon(phys_addr); 2775 ept_sync_global(); 2776 } 2777 2778 native_store_gdt(this_cpu_ptr(&host_gdt)); 2779 2780 return 0; 2781} 2782 2783static void vmclear_local_loaded_vmcss(void) 2784{ 2785 int cpu = raw_smp_processor_id(); 2786 struct loaded_vmcs *v, *n; 2787 2788 list_for_each_entry_safe(v, n, &per_cpu(loaded_vmcss_on_cpu, cpu), 2789 loaded_vmcss_on_cpu_link) 2790 __loaded_vmcs_clear(v); 2791} 2792 2793 2794/* Just like cpu_vmxoff(), but with the __kvm_handle_fault_on_reboot() 2795 * tricks. 2796 */ 2797static void kvm_cpu_vmxoff(void) 2798{ 2799 asm volatile (__ex(ASM_VMX_VMXOFF) : : : "cc"); 2800} 2801 2802static void hardware_disable(void) 2803{ 2804 if (vmm_exclusive) { 2805 vmclear_local_loaded_vmcss(); 2806 kvm_cpu_vmxoff(); 2807 } 2808 write_cr4(read_cr4() & ~X86_CR4_VMXE); 2809} 2810 2811static __init int adjust_vmx_controls(u32 ctl_min, u32 ctl_opt, 2812 u32 msr, u32 *result) 2813{ 2814 u32 vmx_msr_low, vmx_msr_high; 2815 u32 ctl = ctl_min | ctl_opt; 2816 2817 rdmsr(msr, vmx_msr_low, vmx_msr_high); 2818 2819 ctl &= vmx_msr_high; /* bit == 0 in high word ==> must be zero */ 2820 ctl |= vmx_msr_low; /* bit == 1 in low word ==> must be one */ 2821 2822 /* Ensure minimum (required) set of control bits are supported. 
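 * A small, purely illustrative example: with ctl_min=0x02, ctl_opt=0x10,
 * vmx_msr_low=0x01 and vmx_msr_high=0x13, ctl becomes
 * ((0x12 & 0x13) | 0x01) == 0x13 and the check below passes, since every
 * bit of ctl_min is present in the result.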
*/ 2823 if (ctl_min & ~ctl) 2824 return -EIO; 2825 2826 *result = ctl; 2827 return 0; 2828} 2829 2830static __init bool allow_1_setting(u32 msr, u32 ctl) 2831{ 2832 u32 vmx_msr_low, vmx_msr_high; 2833 2834 rdmsr(msr, vmx_msr_low, vmx_msr_high); 2835 return vmx_msr_high & ctl; 2836} 2837 2838static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf) 2839{ 2840 u32 vmx_msr_low, vmx_msr_high; 2841 u32 min, opt, min2, opt2; 2842 u32 _pin_based_exec_control = 0; 2843 u32 _cpu_based_exec_control = 0; 2844 u32 _cpu_based_2nd_exec_control = 0; 2845 u32 _vmexit_control = 0; 2846 u32 _vmentry_control = 0; 2847 2848 min = CPU_BASED_HLT_EXITING | 2849#ifdef CONFIG_X86_64 2850 CPU_BASED_CR8_LOAD_EXITING | 2851 CPU_BASED_CR8_STORE_EXITING | 2852#endif 2853 CPU_BASED_CR3_LOAD_EXITING | 2854 CPU_BASED_CR3_STORE_EXITING | 2855 CPU_BASED_USE_IO_BITMAPS | 2856 CPU_BASED_MOV_DR_EXITING | 2857 CPU_BASED_USE_TSC_OFFSETING | 2858 CPU_BASED_MWAIT_EXITING | 2859 CPU_BASED_MONITOR_EXITING | 2860 CPU_BASED_INVLPG_EXITING | 2861 CPU_BASED_RDPMC_EXITING; 2862 2863 opt = CPU_BASED_TPR_SHADOW | 2864 CPU_BASED_USE_MSR_BITMAPS | 2865 CPU_BASED_ACTIVATE_SECONDARY_CONTROLS; 2866 if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PROCBASED_CTLS, 2867 &_cpu_based_exec_control) < 0) 2868 return -EIO; 2869#ifdef CONFIG_X86_64 2870 if ((_cpu_based_exec_control & CPU_BASED_TPR_SHADOW)) 2871 _cpu_based_exec_control &= ~CPU_BASED_CR8_LOAD_EXITING & 2872 ~CPU_BASED_CR8_STORE_EXITING; 2873#endif 2874 if (_cpu_based_exec_control & CPU_BASED_ACTIVATE_SECONDARY_CONTROLS) { 2875 min2 = 0; 2876 opt2 = SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES | 2877 SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE | 2878 SECONDARY_EXEC_WBINVD_EXITING | 2879 SECONDARY_EXEC_ENABLE_VPID | 2880 SECONDARY_EXEC_ENABLE_EPT | 2881 SECONDARY_EXEC_UNRESTRICTED_GUEST | 2882 SECONDARY_EXEC_PAUSE_LOOP_EXITING | 2883 SECONDARY_EXEC_RDTSCP | 2884 SECONDARY_EXEC_ENABLE_INVPCID | 2885 SECONDARY_EXEC_APIC_REGISTER_VIRT | 2886 SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY | 2887 SECONDARY_EXEC_SHADOW_VMCS; 2888 if (adjust_vmx_controls(min2, opt2, 2889 MSR_IA32_VMX_PROCBASED_CTLS2, 2890 &_cpu_based_2nd_exec_control) < 0) 2891 return -EIO; 2892 } 2893#ifndef CONFIG_X86_64 2894 if (!(_cpu_based_2nd_exec_control & 2895 SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) 2896 _cpu_based_exec_control &= ~CPU_BASED_TPR_SHADOW; 2897#endif 2898 2899 if (!(_cpu_based_exec_control & CPU_BASED_TPR_SHADOW)) 2900 _cpu_based_2nd_exec_control &= ~( 2901 SECONDARY_EXEC_APIC_REGISTER_VIRT | 2902 SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE | 2903 SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY); 2904 2905 if (_cpu_based_2nd_exec_control & SECONDARY_EXEC_ENABLE_EPT) { 2906 /* CR3 accesses and invlpg don't need to cause VM Exits when EPT 2907 enabled */ 2908 _cpu_based_exec_control &= ~(CPU_BASED_CR3_LOAD_EXITING | 2909 CPU_BASED_CR3_STORE_EXITING | 2910 CPU_BASED_INVLPG_EXITING); 2911 rdmsr(MSR_IA32_VMX_EPT_VPID_CAP, 2912 vmx_capability.ept, vmx_capability.vpid); 2913 } 2914 2915 min = VM_EXIT_SAVE_DEBUG_CONTROLS; 2916#ifdef CONFIG_X86_64 2917 min |= VM_EXIT_HOST_ADDR_SPACE_SIZE; 2918#endif 2919 opt = VM_EXIT_SAVE_IA32_PAT | VM_EXIT_LOAD_IA32_PAT | 2920 VM_EXIT_ACK_INTR_ON_EXIT | VM_EXIT_CLEAR_BNDCFGS; 2921 if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_EXIT_CTLS, 2922 &_vmexit_control) < 0) 2923 return -EIO; 2924 2925 min = PIN_BASED_EXT_INTR_MASK | PIN_BASED_NMI_EXITING; 2926 opt = PIN_BASED_VIRTUAL_NMIS | PIN_BASED_POSTED_INTR; 2927 if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PINBASED_CTLS, 2928 &_pin_based_exec_control) < 0) 2929 
return -EIO; 2930 2931 if (!(_cpu_based_2nd_exec_control & 2932 SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY) || 2933 !(_vmexit_control & VM_EXIT_ACK_INTR_ON_EXIT)) 2934 _pin_based_exec_control &= ~PIN_BASED_POSTED_INTR; 2935 2936 min = VM_ENTRY_LOAD_DEBUG_CONTROLS; 2937 opt = VM_ENTRY_LOAD_IA32_PAT | VM_ENTRY_LOAD_BNDCFGS; 2938 if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_ENTRY_CTLS, 2939 &_vmentry_control) < 0) 2940 return -EIO; 2941 2942 rdmsr(MSR_IA32_VMX_BASIC, vmx_msr_low, vmx_msr_high); 2943 2944 /* IA-32 SDM Vol 3B: VMCS size is never greater than 4kB. */ 2945 if ((vmx_msr_high & 0x1fff) > PAGE_SIZE) 2946 return -EIO; 2947 2948#ifdef CONFIG_X86_64 2949 /* IA-32 SDM Vol 3B: 64-bit CPUs always have VMX_BASIC_MSR[48]==0. */ 2950 if (vmx_msr_high & (1u<<16)) 2951 return -EIO; 2952#endif 2953 2954 /* Require Write-Back (WB) memory type for VMCS accesses. */ 2955 if (((vmx_msr_high >> 18) & 15) != 6) 2956 return -EIO; 2957 2958 vmcs_conf->size = vmx_msr_high & 0x1fff; 2959 vmcs_conf->order = get_order(vmcs_config.size); 2960 vmcs_conf->revision_id = vmx_msr_low; 2961 2962 vmcs_conf->pin_based_exec_ctrl = _pin_based_exec_control; 2963 vmcs_conf->cpu_based_exec_ctrl = _cpu_based_exec_control; 2964 vmcs_conf->cpu_based_2nd_exec_ctrl = _cpu_based_2nd_exec_control; 2965 vmcs_conf->vmexit_ctrl = _vmexit_control; 2966 vmcs_conf->vmentry_ctrl = _vmentry_control; 2967 2968 cpu_has_load_ia32_efer = 2969 allow_1_setting(MSR_IA32_VMX_ENTRY_CTLS, 2970 VM_ENTRY_LOAD_IA32_EFER) 2971 && allow_1_setting(MSR_IA32_VMX_EXIT_CTLS, 2972 VM_EXIT_LOAD_IA32_EFER); 2973 2974 cpu_has_load_perf_global_ctrl = 2975 allow_1_setting(MSR_IA32_VMX_ENTRY_CTLS, 2976 VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL) 2977 && allow_1_setting(MSR_IA32_VMX_EXIT_CTLS, 2978 VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL); 2979 2980 /* 2981 * Some cpus support VM_ENTRY_(LOAD|SAVE)_IA32_PERF_GLOBAL_CTRL 2982 * but due to the errata below it can't be used. The workaround is to 2983 * use the MSR load mechanism to switch IA32_PERF_GLOBAL_CTRL. 2984 * 2985 * VM Exit May Incorrectly Clear IA32_PERF_GLOBAL_CTRL [34:32] 2986 * 2987 * AAK155 (model 26) 2988 * AAP115 (model 30) 2989 * AAT100 (model 37) 2990 * BC86,AAY89,BD102 (model 44) 2991 * BA97 (model 46) 2992 * 2993 */ 2994 if (cpu_has_load_perf_global_ctrl && boot_cpu_data.x86 == 0x6) { 2995 switch (boot_cpu_data.x86_model) { 2996 case 26: 2997 case 30: 2998 case 37: 2999 case 44: 3000 case 46: 3001 cpu_has_load_perf_global_ctrl = false; 3002 printk_once(KERN_WARNING"kvm: VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL " 3003 "does not work properly.
Using workaround\n"); 3004 break; 3005 default: 3006 break; 3007 } 3008 } 3009 3010 return 0; 3011} 3012 3013static struct vmcs *alloc_vmcs_cpu(int cpu) 3014{ 3015 int node = cpu_to_node(cpu); 3016 struct page *pages; 3017 struct vmcs *vmcs; 3018 3019 pages = alloc_pages_exact_node(node, GFP_KERNEL, vmcs_config.order); 3020 if (!pages) 3021 return NULL; 3022 vmcs = page_address(pages); 3023 memset(vmcs, 0, vmcs_config.size); 3024 vmcs->revision_id = vmcs_config.revision_id; /* vmcs revision id */ 3025 return vmcs; 3026} 3027 3028static struct vmcs *alloc_vmcs(void) 3029{ 3030 return alloc_vmcs_cpu(raw_smp_processor_id()); 3031} 3032 3033static void free_vmcs(struct vmcs *vmcs) 3034{ 3035 free_pages((unsigned long)vmcs, vmcs_config.order); 3036} 3037 3038/* 3039 * Free a VMCS, but before that VMCLEAR it on the CPU where it was last loaded 3040 */ 3041static void free_loaded_vmcs(struct loaded_vmcs *loaded_vmcs) 3042{ 3043 if (!loaded_vmcs->vmcs) 3044 return; 3045 loaded_vmcs_clear(loaded_vmcs); 3046 free_vmcs(loaded_vmcs->vmcs); 3047 loaded_vmcs->vmcs = NULL; 3048} 3049 3050static void free_kvm_area(void) 3051{ 3052 int cpu; 3053 3054 for_each_possible_cpu(cpu) { 3055 free_vmcs(per_cpu(vmxarea, cpu)); 3056 per_cpu(vmxarea, cpu) = NULL; 3057 } 3058} 3059 3060static void init_vmcs_shadow_fields(void) 3061{ 3062 int i, j; 3063 3064 /* No checks for read only fields yet */ 3065 3066 for (i = j = 0; i < max_shadow_read_write_fields; i++) { 3067 switch (shadow_read_write_fields[i]) { 3068 case GUEST_BNDCFGS: 3069 if (!vmx_mpx_supported()) 3070 continue; 3071 break; 3072 default: 3073 break; 3074 } 3075 3076 if (j < i) 3077 shadow_read_write_fields[j] = 3078 shadow_read_write_fields[i]; 3079 j++; 3080 } 3081 max_shadow_read_write_fields = j; 3082 3083 /* shadowed fields guest access without vmexit */ 3084 for (i = 0; i < max_shadow_read_write_fields; i++) { 3085 clear_bit(shadow_read_write_fields[i], 3086 vmx_vmwrite_bitmap); 3087 clear_bit(shadow_read_write_fields[i], 3088 vmx_vmread_bitmap); 3089 } 3090 for (i = 0; i < max_shadow_read_only_fields; i++) 3091 clear_bit(shadow_read_only_fields[i], 3092 vmx_vmread_bitmap); 3093} 3094 3095static __init int alloc_kvm_area(void) 3096{ 3097 int cpu; 3098 3099 for_each_possible_cpu(cpu) { 3100 struct vmcs *vmcs; 3101 3102 vmcs = alloc_vmcs_cpu(cpu); 3103 if (!vmcs) { 3104 free_kvm_area(); 3105 return -ENOMEM; 3106 } 3107 3108 per_cpu(vmxarea, cpu) = vmcs; 3109 } 3110 return 0; 3111} 3112 3113static __init int hardware_setup(void) 3114{ 3115 if (setup_vmcs_config(&vmcs_config) < 0) 3116 return -EIO; 3117 3118 if (boot_cpu_has(X86_FEATURE_NX)) 3119 kvm_enable_efer_bits(EFER_NX); 3120 3121 if (!cpu_has_vmx_vpid()) 3122 enable_vpid = 0; 3123 if (!cpu_has_vmx_shadow_vmcs()) 3124 enable_shadow_vmcs = 0; 3125 if (enable_shadow_vmcs) 3126 init_vmcs_shadow_fields(); 3127 3128 if (!cpu_has_vmx_ept() || 3129 !cpu_has_vmx_ept_4levels()) { 3130 enable_ept = 0; 3131 enable_unrestricted_guest = 0; 3132 enable_ept_ad_bits = 0; 3133 } 3134 3135 if (!cpu_has_vmx_ept_ad_bits()) 3136 enable_ept_ad_bits = 0; 3137 3138 if (!cpu_has_vmx_unrestricted_guest()) 3139 enable_unrestricted_guest = 0; 3140 3141 if (!cpu_has_vmx_flexpriority()) { 3142 flexpriority_enabled = 0; 3143 3144 /* 3145 * set_apic_access_page_addr() is used to reload apic access 3146 * page upon invalidation. No need to do anything if the 3147 * processor does not have the APIC_ACCESS_ADDR VMCS field. 
3148 */ 3149 kvm_x86_ops->set_apic_access_page_addr = NULL; 3150 } 3151 3152 if (!cpu_has_vmx_tpr_shadow()) 3153 kvm_x86_ops->update_cr8_intercept = NULL; 3154 3155 if (enable_ept && !cpu_has_vmx_ept_2m_page()) 3156 kvm_disable_largepages(); 3157 3158 if (!cpu_has_vmx_ple()) 3159 ple_gap = 0; 3160 3161 if (!cpu_has_vmx_apicv()) 3162 enable_apicv = 0; 3163 3164 if (enable_apicv) 3165 kvm_x86_ops->update_cr8_intercept = NULL; 3166 else { 3167 kvm_x86_ops->hwapic_irr_update = NULL; 3168 kvm_x86_ops->deliver_posted_interrupt = NULL; 3169 kvm_x86_ops->sync_pir_to_irr = vmx_sync_pir_to_irr_dummy; 3170 } 3171 3172 if (nested) 3173 nested_vmx_setup_ctls_msrs(); 3174 3175 return alloc_kvm_area(); 3176} 3177 3178static __exit void hardware_unsetup(void) 3179{ 3180 free_kvm_area(); 3181} 3182 3183static bool emulation_required(struct kvm_vcpu *vcpu) 3184{ 3185 return emulate_invalid_guest_state && !guest_state_valid(vcpu); 3186} 3187 3188static void fix_pmode_seg(struct kvm_vcpu *vcpu, int seg, 3189 struct kvm_segment *save) 3190{ 3191 if (!emulate_invalid_guest_state) { 3192 /* 3193 * CS and SS RPL should be equal during guest entry according 3194 * to VMX spec, but in reality it is not always so. Since the vcpu 3195 * is in the middle of the transition from real mode to 3196 * protected mode it is safe to assume that RPL 0 is a good 3197 * default value. 3198 */ 3199 if (seg == VCPU_SREG_CS || seg == VCPU_SREG_SS) 3200 save->selector &= ~SELECTOR_RPL_MASK; 3201 save->dpl = save->selector & SELECTOR_RPL_MASK; 3202 save->s = 1; 3203 } 3204 vmx_set_segment(vcpu, save, seg); 3205} 3206 3207static void enter_pmode(struct kvm_vcpu *vcpu) 3208{ 3209 unsigned long flags; 3210 struct vcpu_vmx *vmx = to_vmx(vcpu); 3211 3212 /* 3213 * Update the real mode segment cache. It may not be up-to-date if a segment 3214 * register was written while the vcpu was in guest mode.
3215 */ 3216 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_ES], VCPU_SREG_ES); 3217 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_DS], VCPU_SREG_DS); 3218 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_FS], VCPU_SREG_FS); 3219 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_GS], VCPU_SREG_GS); 3220 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_SS], VCPU_SREG_SS); 3221 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_CS], VCPU_SREG_CS); 3222 3223 vmx->rmode.vm86_active = 0; 3224 3225 vmx_segment_cache_clear(vmx); 3226 3227 vmx_set_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_TR], VCPU_SREG_TR); 3228 3229 flags = vmcs_readl(GUEST_RFLAGS); 3230 flags &= RMODE_GUEST_OWNED_EFLAGS_BITS; 3231 flags |= vmx->rmode.save_rflags & ~RMODE_GUEST_OWNED_EFLAGS_BITS; 3232 vmcs_writel(GUEST_RFLAGS, flags); 3233 3234 vmcs_writel(GUEST_CR4, (vmcs_readl(GUEST_CR4) & ~X86_CR4_VME) | 3235 (vmcs_readl(CR4_READ_SHADOW) & X86_CR4_VME)); 3236 3237 update_exception_bitmap(vcpu); 3238 3239 fix_pmode_seg(vcpu, VCPU_SREG_CS, &vmx->rmode.segs[VCPU_SREG_CS]); 3240 fix_pmode_seg(vcpu, VCPU_SREG_SS, &vmx->rmode.segs[VCPU_SREG_SS]); 3241 fix_pmode_seg(vcpu, VCPU_SREG_ES, &vmx->rmode.segs[VCPU_SREG_ES]); 3242 fix_pmode_seg(vcpu, VCPU_SREG_DS, &vmx->rmode.segs[VCPU_SREG_DS]); 3243 fix_pmode_seg(vcpu, VCPU_SREG_FS, &vmx->rmode.segs[VCPU_SREG_FS]); 3244 fix_pmode_seg(vcpu, VCPU_SREG_GS, &vmx->rmode.segs[VCPU_SREG_GS]); 3245} 3246 3247static void fix_rmode_seg(int seg, struct kvm_segment *save) 3248{ 3249 const struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg]; 3250 struct kvm_segment var = *save; 3251 3252 var.dpl = 0x3; 3253 if (seg == VCPU_SREG_CS) 3254 var.type = 0x3; 3255 3256 if (!emulate_invalid_guest_state) { 3257 var.selector = var.base >> 4; 3258 var.base = var.base & 0xffff0; 3259 var.limit = 0xffff; 3260 var.g = 0; 3261 var.db = 0; 3262 var.present = 1; 3263 var.s = 1; 3264 var.l = 0; 3265 var.unusable = 0; 3266 var.type = 0x3; 3267 var.avl = 0; 3268 if (save->base & 0xf) 3269 printk_once(KERN_WARNING "kvm: segment base is not " 3270 "paragraph aligned when entering " 3271 "protected mode (seg=%d)", seg); 3272 } 3273 3274 vmcs_write16(sf->selector, var.selector); 3275 vmcs_write32(sf->base, var.base); 3276 vmcs_write32(sf->limit, var.limit); 3277 vmcs_write32(sf->ar_bytes, vmx_segment_access_rights(&var)); 3278} 3279 3280static void enter_rmode(struct kvm_vcpu *vcpu) 3281{ 3282 unsigned long flags; 3283 struct vcpu_vmx *vmx = to_vmx(vcpu); 3284 3285 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_TR], VCPU_SREG_TR); 3286 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_ES], VCPU_SREG_ES); 3287 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_DS], VCPU_SREG_DS); 3288 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_FS], VCPU_SREG_FS); 3289 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_GS], VCPU_SREG_GS); 3290 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_SS], VCPU_SREG_SS); 3291 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_CS], VCPU_SREG_CS); 3292 3293 vmx->rmode.vm86_active = 1; 3294 3295 /* 3296 * Very old userspace does not call KVM_SET_TSS_ADDR before entering 3297 * vcpu. Warn the user that an update is overdue. 
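 * (Current userspace is expected to issue the KVM_SET_TSS_ADDR ioctl on
 * the VM once, before the first KVM_RUN, so that the three pages used by
 * init_rmode_tss() below are reserved for the real mode TSS.)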
3298 */ 3299 if (!vcpu->kvm->arch.tss_addr) 3300 printk_once(KERN_WARNING "kvm: KVM_SET_TSS_ADDR need to be " 3301 "called before entering vcpu\n"); 3302 3303 vmx_segment_cache_clear(vmx); 3304 3305 vmcs_writel(GUEST_TR_BASE, vcpu->kvm->arch.tss_addr); 3306 vmcs_write32(GUEST_TR_LIMIT, RMODE_TSS_SIZE - 1); 3307 vmcs_write32(GUEST_TR_AR_BYTES, 0x008b); 3308 3309 flags = vmcs_readl(GUEST_RFLAGS); 3310 vmx->rmode.save_rflags = flags; 3311 3312 flags |= X86_EFLAGS_IOPL | X86_EFLAGS_VM; 3313 3314 vmcs_writel(GUEST_RFLAGS, flags); 3315 vmcs_writel(GUEST_CR4, vmcs_readl(GUEST_CR4) | X86_CR4_VME); 3316 update_exception_bitmap(vcpu); 3317 3318 fix_rmode_seg(VCPU_SREG_SS, &vmx->rmode.segs[VCPU_SREG_SS]); 3319 fix_rmode_seg(VCPU_SREG_CS, &vmx->rmode.segs[VCPU_SREG_CS]); 3320 fix_rmode_seg(VCPU_SREG_ES, &vmx->rmode.segs[VCPU_SREG_ES]); 3321 fix_rmode_seg(VCPU_SREG_DS, &vmx->rmode.segs[VCPU_SREG_DS]); 3322 fix_rmode_seg(VCPU_SREG_GS, &vmx->rmode.segs[VCPU_SREG_GS]); 3323 fix_rmode_seg(VCPU_SREG_FS, &vmx->rmode.segs[VCPU_SREG_FS]); 3324 3325 kvm_mmu_reset_context(vcpu); 3326} 3327 3328static void vmx_set_efer(struct kvm_vcpu *vcpu, u64 efer) 3329{ 3330 struct vcpu_vmx *vmx = to_vmx(vcpu); 3331 struct shared_msr_entry *msr = find_msr_entry(vmx, MSR_EFER); 3332 3333 if (!msr) 3334 return; 3335 3336 /* 3337 * Force kernel_gs_base reloading before EFER changes, as control 3338 * of this msr depends on is_long_mode(). 3339 */ 3340 vmx_load_host_state(to_vmx(vcpu)); 3341 vcpu->arch.efer = efer; 3342 if (efer & EFER_LMA) { 3343 vm_entry_controls_setbit(to_vmx(vcpu), VM_ENTRY_IA32E_MODE); 3344 msr->data = efer; 3345 } else { 3346 vm_entry_controls_clearbit(to_vmx(vcpu), VM_ENTRY_IA32E_MODE); 3347 3348 msr->data = efer & ~EFER_LME; 3349 } 3350 setup_msrs(vmx); 3351} 3352 3353#ifdef CONFIG_X86_64 3354 3355static void enter_lmode(struct kvm_vcpu *vcpu) 3356{ 3357 u32 guest_tr_ar; 3358 3359 vmx_segment_cache_clear(to_vmx(vcpu)); 3360 3361 guest_tr_ar = vmcs_read32(GUEST_TR_AR_BYTES); 3362 if ((guest_tr_ar & AR_TYPE_MASK) != AR_TYPE_BUSY_64_TSS) { 3363 pr_debug_ratelimited("%s: tss fixup for long mode. 
\n", 3364 __func__); 3365 vmcs_write32(GUEST_TR_AR_BYTES, 3366 (guest_tr_ar & ~AR_TYPE_MASK) 3367 | AR_TYPE_BUSY_64_TSS); 3368 } 3369 vmx_set_efer(vcpu, vcpu->arch.efer | EFER_LMA); 3370} 3371 3372static void exit_lmode(struct kvm_vcpu *vcpu) 3373{ 3374 vm_entry_controls_clearbit(to_vmx(vcpu), VM_ENTRY_IA32E_MODE); 3375 vmx_set_efer(vcpu, vcpu->arch.efer & ~EFER_LMA); 3376} 3377 3378#endif 3379 3380static void vmx_flush_tlb(struct kvm_vcpu *vcpu) 3381{ 3382 vpid_sync_context(to_vmx(vcpu)); 3383 if (enable_ept) { 3384 if (!VALID_PAGE(vcpu->arch.mmu.root_hpa)) 3385 return; 3386 ept_sync_context(construct_eptp(vcpu->arch.mmu.root_hpa)); 3387 } 3388} 3389 3390static void vmx_decache_cr0_guest_bits(struct kvm_vcpu *vcpu) 3391{ 3392 ulong cr0_guest_owned_bits = vcpu->arch.cr0_guest_owned_bits; 3393 3394 vcpu->arch.cr0 &= ~cr0_guest_owned_bits; 3395 vcpu->arch.cr0 |= vmcs_readl(GUEST_CR0) & cr0_guest_owned_bits; 3396} 3397 3398static void vmx_decache_cr3(struct kvm_vcpu *vcpu) 3399{ 3400 if (enable_ept && is_paging(vcpu)) 3401 vcpu->arch.cr3 = vmcs_readl(GUEST_CR3); 3402 __set_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail); 3403} 3404 3405static void vmx_decache_cr4_guest_bits(struct kvm_vcpu *vcpu) 3406{ 3407 ulong cr4_guest_owned_bits = vcpu->arch.cr4_guest_owned_bits; 3408 3409 vcpu->arch.cr4 &= ~cr4_guest_owned_bits; 3410 vcpu->arch.cr4 |= vmcs_readl(GUEST_CR4) & cr4_guest_owned_bits; 3411} 3412 3413static void ept_load_pdptrs(struct kvm_vcpu *vcpu) 3414{ 3415 struct kvm_mmu *mmu = vcpu->arch.walk_mmu; 3416 3417 if (!test_bit(VCPU_EXREG_PDPTR, 3418 (unsigned long *)&vcpu->arch.regs_dirty)) 3419 return; 3420 3421 if (is_paging(vcpu) && is_pae(vcpu) && !is_long_mode(vcpu)) { 3422 vmcs_write64(GUEST_PDPTR0, mmu->pdptrs[0]); 3423 vmcs_write64(GUEST_PDPTR1, mmu->pdptrs[1]); 3424 vmcs_write64(GUEST_PDPTR2, mmu->pdptrs[2]); 3425 vmcs_write64(GUEST_PDPTR3, mmu->pdptrs[3]); 3426 } 3427} 3428 3429static void ept_save_pdptrs(struct kvm_vcpu *vcpu) 3430{ 3431 struct kvm_mmu *mmu = vcpu->arch.walk_mmu; 3432 3433 if (is_paging(vcpu) && is_pae(vcpu) && !is_long_mode(vcpu)) { 3434 mmu->pdptrs[0] = vmcs_read64(GUEST_PDPTR0); 3435 mmu->pdptrs[1] = vmcs_read64(GUEST_PDPTR1); 3436 mmu->pdptrs[2] = vmcs_read64(GUEST_PDPTR2); 3437 mmu->pdptrs[3] = vmcs_read64(GUEST_PDPTR3); 3438 } 3439 3440 __set_bit(VCPU_EXREG_PDPTR, 3441 (unsigned long *)&vcpu->arch.regs_avail); 3442 __set_bit(VCPU_EXREG_PDPTR, 3443 (unsigned long *)&vcpu->arch.regs_dirty); 3444} 3445 3446static int vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4); 3447 3448static void ept_update_paging_mode_cr0(unsigned long *hw_cr0, 3449 unsigned long cr0, 3450 struct kvm_vcpu *vcpu) 3451{ 3452 if (!test_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail)) 3453 vmx_decache_cr3(vcpu); 3454 if (!(cr0 & X86_CR0_PG)) { 3455 /* From paging/starting to nonpaging */ 3456 vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, 3457 vmcs_read32(CPU_BASED_VM_EXEC_CONTROL) | 3458 (CPU_BASED_CR3_LOAD_EXITING | 3459 CPU_BASED_CR3_STORE_EXITING)); 3460 vcpu->arch.cr0 = cr0; 3461 vmx_set_cr4(vcpu, kvm_read_cr4(vcpu)); 3462 } else if (!is_paging(vcpu)) { 3463 /* From nonpaging to paging */ 3464 vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, 3465 vmcs_read32(CPU_BASED_VM_EXEC_CONTROL) & 3466 ~(CPU_BASED_CR3_LOAD_EXITING | 3467 CPU_BASED_CR3_STORE_EXITING)); 3468 vcpu->arch.cr0 = cr0; 3469 vmx_set_cr4(vcpu, kvm_read_cr4(vcpu)); 3470 } 3471 3472 if (!(cr0 & X86_CR0_WP)) 3473 *hw_cr0 &= ~X86_CR0_WP; 3474} 3475 3476static void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) 3477{ 3478 
struct vcpu_vmx *vmx = to_vmx(vcpu); 3479 unsigned long hw_cr0; 3480 3481 hw_cr0 = (cr0 & ~KVM_GUEST_CR0_MASK); 3482 if (enable_unrestricted_guest) 3483 hw_cr0 |= KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST; 3484 else { 3485 hw_cr0 |= KVM_VM_CR0_ALWAYS_ON; 3486 3487 if (vmx->rmode.vm86_active && (cr0 & X86_CR0_PE)) 3488 enter_pmode(vcpu); 3489 3490 if (!vmx->rmode.vm86_active && !(cr0 & X86_CR0_PE)) 3491 enter_rmode(vcpu); 3492 } 3493 3494#ifdef CONFIG_X86_64 3495 if (vcpu->arch.efer & EFER_LME) { 3496 if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) 3497 enter_lmode(vcpu); 3498 if (is_paging(vcpu) && !(cr0 & X86_CR0_PG)) 3499 exit_lmode(vcpu); 3500 } 3501#endif 3502 3503 if (enable_ept) 3504 ept_update_paging_mode_cr0(&hw_cr0, cr0, vcpu); 3505 3506 if (!vcpu->fpu_active) 3507 hw_cr0 |= X86_CR0_TS | X86_CR0_MP; 3508 3509 vmcs_writel(CR0_READ_SHADOW, cr0); 3510 vmcs_writel(GUEST_CR0, hw_cr0); 3511 vcpu->arch.cr0 = cr0; 3512 3513 /* depends on vcpu->arch.cr0 to be set to a new value */ 3514 vmx->emulation_required = emulation_required(vcpu); 3515} 3516 3517static u64 construct_eptp(unsigned long root_hpa) 3518{ 3519 u64 eptp; 3520 3521 /* TODO write the value reading from MSR */ 3522 eptp = VMX_EPT_DEFAULT_MT | 3523 VMX_EPT_DEFAULT_GAW << VMX_EPT_GAW_EPTP_SHIFT; 3524 if (enable_ept_ad_bits) 3525 eptp |= VMX_EPT_AD_ENABLE_BIT; 3526 eptp |= (root_hpa & PAGE_MASK); 3527 3528 return eptp; 3529} 3530 3531static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) 3532{ 3533 unsigned long guest_cr3; 3534 u64 eptp; 3535 3536 guest_cr3 = cr3; 3537 if (enable_ept) { 3538 eptp = construct_eptp(cr3); 3539 vmcs_write64(EPT_POINTER, eptp); 3540 if (is_paging(vcpu) || is_guest_mode(vcpu)) 3541 guest_cr3 = kvm_read_cr3(vcpu); 3542 else 3543 guest_cr3 = vcpu->kvm->arch.ept_identity_map_addr; 3544 ept_load_pdptrs(vcpu); 3545 } 3546 3547 vmx_flush_tlb(vcpu); 3548 vmcs_writel(GUEST_CR3, guest_cr3); 3549} 3550 3551static int vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) 3552{ 3553 unsigned long hw_cr4 = cr4 | (to_vmx(vcpu)->rmode.vm86_active ? 3554 KVM_RMODE_VM_CR4_ALWAYS_ON : KVM_PMODE_VM_CR4_ALWAYS_ON); 3555 3556 if (cr4 & X86_CR4_VMXE) { 3557 /* 3558 * To use VMXON (and later other VMX instructions), a guest 3559 * must first be able to turn on cr4.VMXE (see handle_vmon()). 3560 * So basically the check on whether to allow nested VMX 3561 * is here. 3562 */ 3563 if (!nested_vmx_allowed(vcpu)) 3564 return 1; 3565 } 3566 if (to_vmx(vcpu)->nested.vmxon && 3567 ((cr4 & VMXON_CR4_ALWAYSON) != VMXON_CR4_ALWAYSON)) 3568 return 1; 3569 3570 vcpu->arch.cr4 = cr4; 3571 if (enable_ept) { 3572 if (!is_paging(vcpu)) { 3573 hw_cr4 &= ~X86_CR4_PAE; 3574 hw_cr4 |= X86_CR4_PSE; 3575 /* 3576 * SMEP/SMAP is disabled if CPU is in non-paging mode 3577 * in hardware. However KVM always uses paging mode to 3578 * emulate guest non-paging mode with TDP. 3579 * To emulate this behavior, SMEP/SMAP needs to be 3580 * manually disabled when guest switches to non-paging 3581 * mode. 
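 * (Concretely: if the guest clears CR0.PG while CR4.SMEP is set, we keep
 * running it under EPT paging, so SMEP/SMAP are masked out of the
 * hardware CR4 just below until the guest turns paging back on.)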
3582 */ 3583 hw_cr4 &= ~(X86_CR4_SMEP | X86_CR4_SMAP); 3584 } else if (!(cr4 & X86_CR4_PAE)) { 3585 hw_cr4 &= ~X86_CR4_PAE; 3586 } 3587 } 3588 3589 vmcs_writel(CR4_READ_SHADOW, cr4); 3590 vmcs_writel(GUEST_CR4, hw_cr4); 3591 return 0; 3592} 3593 3594static void vmx_get_segment(struct kvm_vcpu *vcpu, 3595 struct kvm_segment *var, int seg) 3596{ 3597 struct vcpu_vmx *vmx = to_vmx(vcpu); 3598 u32 ar; 3599 3600 if (vmx->rmode.vm86_active && seg != VCPU_SREG_LDTR) { 3601 *var = vmx->rmode.segs[seg]; 3602 if (seg == VCPU_SREG_TR 3603 || var->selector == vmx_read_guest_seg_selector(vmx, seg)) 3604 return; 3605 var->base = vmx_read_guest_seg_base(vmx, seg); 3606 var->selector = vmx_read_guest_seg_selector(vmx, seg); 3607 return; 3608 } 3609 var->base = vmx_read_guest_seg_base(vmx, seg); 3610 var->limit = vmx_read_guest_seg_limit(vmx, seg); 3611 var->selector = vmx_read_guest_seg_selector(vmx, seg); 3612 ar = vmx_read_guest_seg_ar(vmx, seg); 3613 var->unusable = (ar >> 16) & 1; 3614 var->type = ar & 15; 3615 var->s = (ar >> 4) & 1; 3616 var->dpl = (ar >> 5) & 3; 3617 /* 3618 * Some userspaces do not preserve unusable property. Since usable 3619 * segment has to be present according to VMX spec we can use present 3620 * property to amend userspace bug by making unusable segment always 3621 * nonpresent. vmx_segment_access_rights() already marks nonpresent 3622 * segment as unusable. 3623 */ 3624 var->present = !var->unusable; 3625 var->avl = (ar >> 12) & 1; 3626 var->l = (ar >> 13) & 1; 3627 var->db = (ar >> 14) & 1; 3628 var->g = (ar >> 15) & 1; 3629} 3630 3631static u64 vmx_get_segment_base(struct kvm_vcpu *vcpu, int seg) 3632{ 3633 struct kvm_segment s; 3634 3635 if (to_vmx(vcpu)->rmode.vm86_active) { 3636 vmx_get_segment(vcpu, &s, seg); 3637 return s.base; 3638 } 3639 return vmx_read_guest_seg_base(to_vmx(vcpu), seg); 3640} 3641 3642static int vmx_get_cpl(struct kvm_vcpu *vcpu) 3643{ 3644 struct vcpu_vmx *vmx = to_vmx(vcpu); 3645 3646 if (unlikely(vmx->rmode.vm86_active)) 3647 return 0; 3648 else { 3649 int ar = vmx_read_guest_seg_ar(vmx, VCPU_SREG_SS); 3650 return AR_DPL(ar); 3651 } 3652} 3653 3654static u32 vmx_segment_access_rights(struct kvm_segment *var) 3655{ 3656 u32 ar; 3657 3658 if (var->unusable || !var->present) 3659 ar = 1 << 16; 3660 else { 3661 ar = var->type & 15; 3662 ar |= (var->s & 1) << 4; 3663 ar |= (var->dpl & 3) << 5; 3664 ar |= (var->present & 1) << 7; 3665 ar |= (var->avl & 1) << 12; 3666 ar |= (var->l & 1) << 13; 3667 ar |= (var->db & 1) << 14; 3668 ar |= (var->g & 1) << 15; 3669 } 3670 3671 return ar; 3672} 3673 3674static void vmx_set_segment(struct kvm_vcpu *vcpu, 3675 struct kvm_segment *var, int seg) 3676{ 3677 struct vcpu_vmx *vmx = to_vmx(vcpu); 3678 const struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg]; 3679 3680 vmx_segment_cache_clear(vmx); 3681 3682 if (vmx->rmode.vm86_active && seg != VCPU_SREG_LDTR) { 3683 vmx->rmode.segs[seg] = *var; 3684 if (seg == VCPU_SREG_TR) 3685 vmcs_write16(sf->selector, var->selector); 3686 else if (var->s) 3687 fix_rmode_seg(seg, &vmx->rmode.segs[seg]); 3688 goto out; 3689 } 3690 3691 vmcs_writel(sf->base, var->base); 3692 vmcs_write32(sf->limit, var->limit); 3693 vmcs_write16(sf->selector, var->selector); 3694 3695 /* 3696 * Fix the "Accessed" bit in AR field of segment registers for older 3697 * qemu binaries. 3698 * IA32 arch specifies that at the time of processor reset the 3699 * "Accessed" bit in the AR field of segment registers is 1. And qemu 3700 * is setting it to 0 in the userland code. 
This causes invalid guest 3701 * state vmexit when "unrestricted guest" mode is turned on. 3702 * Fix for this setup issue in cpu_reset is being pushed in the qemu 3703 * tree. Newer qemu binaries with that qemu fix would not need this 3704 * kvm hack. 3705 */ 3706 if (enable_unrestricted_guest && (seg != VCPU_SREG_LDTR)) 3707 var->type |= 0x1; /* Accessed */ 3708 3709 vmcs_write32(sf->ar_bytes, vmx_segment_access_rights(var)); 3710 3711out: 3712 vmx->emulation_required = emulation_required(vcpu); 3713} 3714 3715static void vmx_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l) 3716{ 3717 u32 ar = vmx_read_guest_seg_ar(to_vmx(vcpu), VCPU_SREG_CS); 3718 3719 *db = (ar >> 14) & 1; 3720 *l = (ar >> 13) & 1; 3721} 3722 3723static void vmx_get_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt) 3724{ 3725 dt->size = vmcs_read32(GUEST_IDTR_LIMIT); 3726 dt->address = vmcs_readl(GUEST_IDTR_BASE); 3727} 3728 3729static void vmx_set_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt) 3730{ 3731 vmcs_write32(GUEST_IDTR_LIMIT, dt->size); 3732 vmcs_writel(GUEST_IDTR_BASE, dt->address); 3733} 3734 3735static void vmx_get_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt) 3736{ 3737 dt->size = vmcs_read32(GUEST_GDTR_LIMIT); 3738 dt->address = vmcs_readl(GUEST_GDTR_BASE); 3739} 3740 3741static void vmx_set_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt) 3742{ 3743 vmcs_write32(GUEST_GDTR_LIMIT, dt->size); 3744 vmcs_writel(GUEST_GDTR_BASE, dt->address); 3745} 3746 3747static bool rmode_segment_valid(struct kvm_vcpu *vcpu, int seg) 3748{ 3749 struct kvm_segment var; 3750 u32 ar; 3751 3752 vmx_get_segment(vcpu, &var, seg); 3753 var.dpl = 0x3; 3754 if (seg == VCPU_SREG_CS) 3755 var.type = 0x3; 3756 ar = vmx_segment_access_rights(&var); 3757 3758 if (var.base != (var.selector << 4)) 3759 return false; 3760 if (var.limit != 0xffff) 3761 return false; 3762 if (ar != 0xf3) 3763 return false; 3764 3765 return true; 3766} 3767 3768static bool code_segment_valid(struct kvm_vcpu *vcpu) 3769{ 3770 struct kvm_segment cs; 3771 unsigned int cs_rpl; 3772 3773 vmx_get_segment(vcpu, &cs, VCPU_SREG_CS); 3774 cs_rpl = cs.selector & SELECTOR_RPL_MASK; 3775 3776 if (cs.unusable) 3777 return false; 3778 if (~cs.type & (AR_TYPE_CODE_MASK|AR_TYPE_ACCESSES_MASK)) 3779 return false; 3780 if (!cs.s) 3781 return false; 3782 if (cs.type & AR_TYPE_WRITEABLE_MASK) { 3783 if (cs.dpl > cs_rpl) 3784 return false; 3785 } else { 3786 if (cs.dpl != cs_rpl) 3787 return false; 3788 } 3789 if (!cs.present) 3790 return false; 3791 3792 /* TODO: Add Reserved field check, this'll require a new member in the kvm_segment_field structure */ 3793 return true; 3794} 3795 3796static bool stack_segment_valid(struct kvm_vcpu *vcpu) 3797{ 3798 struct kvm_segment ss; 3799 unsigned int ss_rpl; 3800 3801 vmx_get_segment(vcpu, &ss, VCPU_SREG_SS); 3802 ss_rpl = ss.selector & SELECTOR_RPL_MASK; 3803 3804 if (ss.unusable) 3805 return true; 3806 if (ss.type != 3 && ss.type != 7) 3807 return false; 3808 if (!ss.s) 3809 return false; 3810 if (ss.dpl != ss_rpl) /* DPL != RPL */ 3811 return false; 3812 if (!ss.present) 3813 return false; 3814 3815 return true; 3816} 3817 3818static bool data_segment_valid(struct kvm_vcpu *vcpu, int seg) 3819{ 3820 struct kvm_segment var; 3821 unsigned int rpl; 3822 3823 vmx_get_segment(vcpu, &var, seg); 3824 rpl = var.selector & SELECTOR_RPL_MASK; 3825 3826 if (var.unusable) 3827 return true; 3828 if (!var.s) 3829 return false; 3830 if (!var.present) 3831 return false; 3832 if (~var.type & (AR_TYPE_CODE_MASK|AR_TYPE_WRITEABLE_MASK)) { 3833 
if (var.dpl < rpl) /* DPL < RPL */ 3834 return false; 3835 } 3836 3837 /* TODO: Add other members to kvm_segment_field to allow checking for other access 3838 * rights flags 3839 */ 3840 return true; 3841} 3842 3843static bool tr_valid(struct kvm_vcpu *vcpu) 3844{ 3845 struct kvm_segment tr; 3846 3847 vmx_get_segment(vcpu, &tr, VCPU_SREG_TR); 3848 3849 if (tr.unusable) 3850 return false; 3851 if (tr.selector & SELECTOR_TI_MASK) /* TI = 1 */ 3852 return false; 3853 if (tr.type != 3 && tr.type != 11) /* TODO: Check if guest is in IA32e mode */ 3854 return false; 3855 if (!tr.present) 3856 return false; 3857 3858 return true; 3859} 3860 3861static bool ldtr_valid(struct kvm_vcpu *vcpu) 3862{ 3863 struct kvm_segment ldtr; 3864 3865 vmx_get_segment(vcpu, &ldtr, VCPU_SREG_LDTR); 3866 3867 if (ldtr.unusable) 3868 return true; 3869 if (ldtr.selector & SELECTOR_TI_MASK) /* TI = 1 */ 3870 return false; 3871 if (ldtr.type != 2) 3872 return false; 3873 if (!ldtr.present) 3874 return false; 3875 3876 return true; 3877} 3878 3879static bool cs_ss_rpl_check(struct kvm_vcpu *vcpu) 3880{ 3881 struct kvm_segment cs, ss; 3882 3883 vmx_get_segment(vcpu, &cs, VCPU_SREG_CS); 3884 vmx_get_segment(vcpu, &ss, VCPU_SREG_SS); 3885 3886 return ((cs.selector & SELECTOR_RPL_MASK) == 3887 (ss.selector & SELECTOR_RPL_MASK)); 3888} 3889 3890/* 3891 * Check if guest state is valid. Returns true if valid, false if 3892 * not. 3893 * We assume that registers are always usable 3894 */ 3895static bool guest_state_valid(struct kvm_vcpu *vcpu) 3896{ 3897 if (enable_unrestricted_guest) 3898 return true; 3899 3900 /* real mode guest state checks */ 3901 if (!is_protmode(vcpu) || (vmx_get_rflags(vcpu) & X86_EFLAGS_VM)) { 3902 if (!rmode_segment_valid(vcpu, VCPU_SREG_CS)) 3903 return false; 3904 if (!rmode_segment_valid(vcpu, VCPU_SREG_SS)) 3905 return false; 3906 if (!rmode_segment_valid(vcpu, VCPU_SREG_DS)) 3907 return false; 3908 if (!rmode_segment_valid(vcpu, VCPU_SREG_ES)) 3909 return false; 3910 if (!rmode_segment_valid(vcpu, VCPU_SREG_FS)) 3911 return false; 3912 if (!rmode_segment_valid(vcpu, VCPU_SREG_GS)) 3913 return false; 3914 } else { 3915 /* protected mode guest state checks */ 3916 if (!cs_ss_rpl_check(vcpu)) 3917 return false; 3918 if (!code_segment_valid(vcpu)) 3919 return false; 3920 if (!stack_segment_valid(vcpu)) 3921 return false; 3922 if (!data_segment_valid(vcpu, VCPU_SREG_DS)) 3923 return false; 3924 if (!data_segment_valid(vcpu, VCPU_SREG_ES)) 3925 return false; 3926 if (!data_segment_valid(vcpu, VCPU_SREG_FS)) 3927 return false; 3928 if (!data_segment_valid(vcpu, VCPU_SREG_GS)) 3929 return false; 3930 if (!tr_valid(vcpu)) 3931 return false; 3932 if (!ldtr_valid(vcpu)) 3933 return false; 3934 } 3935 /* TODO: 3936 * - Add checks on RIP 3937 * - Add checks on RFLAGS 3938 */ 3939 3940 return true; 3941} 3942 3943static int init_rmode_tss(struct kvm *kvm) 3944{ 3945 gfn_t fn; 3946 u16 data = 0; 3947 int idx, r; 3948 3949 idx = srcu_read_lock(&kvm->srcu); 3950 fn = kvm->arch.tss_addr >> PAGE_SHIFT; 3951 r = kvm_clear_guest_page(kvm, fn, 0, PAGE_SIZE); 3952 if (r < 0) 3953 goto out; 3954 data = TSS_BASE_SIZE + TSS_REDIRECTION_SIZE; 3955 r = kvm_write_guest_page(kvm, fn++, &data, 3956 TSS_IOPB_BASE_OFFSET, sizeof(u16)); 3957 if (r < 0) 3958 goto out; 3959 r = kvm_clear_guest_page(kvm, fn++, 0, PAGE_SIZE); 3960 if (r < 0) 3961 goto out; 3962 r = kvm_clear_guest_page(kvm, fn, 0, PAGE_SIZE); 3963 if (r < 0) 3964 goto out; 3965 data = ~0; 3966 r = kvm_write_guest_page(kvm, fn, &data, 3967 RMODE_TSS_SIZE - 2 * PAGE_SIZE 
- 1, 3968 sizeof(u8)); 3969out: 3970 srcu_read_unlock(&kvm->srcu, idx); 3971 return r; 3972} 3973 3974static int init_rmode_identity_map(struct kvm *kvm) 3975{ 3976 int i, idx, r = 0; 3977 pfn_t identity_map_pfn; 3978 u32 tmp; 3979 3980 if (!enable_ept) 3981 return 0; 3982 3983 /* Protect kvm->arch.ept_identity_pagetable_done. */ 3984 mutex_lock(&kvm->slots_lock); 3985 3986 if (likely(kvm->arch.ept_identity_pagetable_done)) 3987 goto out2; 3988 3989 identity_map_pfn = kvm->arch.ept_identity_map_addr >> PAGE_SHIFT; 3990 3991 r = alloc_identity_pagetable(kvm); 3992 if (r < 0) 3993 goto out2; 3994 3995 idx = srcu_read_lock(&kvm->srcu); 3996 r = kvm_clear_guest_page(kvm, identity_map_pfn, 0, PAGE_SIZE); 3997 if (r < 0) 3998 goto out; 3999 /* Set up identity-mapping pagetable for EPT in real mode */ 4000 for (i = 0; i < PT32_ENT_PER_PAGE; i++) { 4001 tmp = (i << 22) + (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | 4002 _PAGE_ACCESSED | _PAGE_DIRTY | _PAGE_PSE); 4003 r = kvm_write_guest_page(kvm, identity_map_pfn, 4004 &tmp, i * sizeof(tmp), sizeof(tmp)); 4005 if (r < 0) 4006 goto out; 4007 } 4008 kvm->arch.ept_identity_pagetable_done = true; 4009 4010out: 4011 srcu_read_unlock(&kvm->srcu, idx); 4012 4013out2: 4014 mutex_unlock(&kvm->slots_lock); 4015 return r; 4016} 4017 4018static void seg_setup(int seg) 4019{ 4020 const struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg]; 4021 unsigned int ar; 4022 4023 vmcs_write16(sf->selector, 0); 4024 vmcs_writel(sf->base, 0); 4025 vmcs_write32(sf->limit, 0xffff); 4026 ar = 0x93; 4027 if (seg == VCPU_SREG_CS) 4028 ar |= 0x08; /* code segment */ 4029 4030 vmcs_write32(sf->ar_bytes, ar); 4031} 4032 4033static int alloc_apic_access_page(struct kvm *kvm) 4034{ 4035 struct page *page; 4036 struct kvm_userspace_memory_region kvm_userspace_mem; 4037 int r = 0; 4038 4039 mutex_lock(&kvm->slots_lock); 4040 if (kvm->arch.apic_access_page_done) 4041 goto out; 4042 kvm_userspace_mem.slot = APIC_ACCESS_PAGE_PRIVATE_MEMSLOT; 4043 kvm_userspace_mem.flags = 0; 4044 kvm_userspace_mem.guest_phys_addr = APIC_DEFAULT_PHYS_BASE; 4045 kvm_userspace_mem.memory_size = PAGE_SIZE; 4046 r = __kvm_set_memory_region(kvm, &kvm_userspace_mem); 4047 if (r) 4048 goto out; 4049 4050 page = gfn_to_page(kvm, APIC_DEFAULT_PHYS_BASE >> PAGE_SHIFT); 4051 if (is_error_page(page)) { 4052 r = -EFAULT; 4053 goto out; 4054 } 4055 4056 /* 4057 * Do not pin the page in memory, so that memory hot-unplug 4058 * is able to migrate it. 4059 */ 4060 put_page(page); 4061 kvm->arch.apic_access_page_done = true; 4062out: 4063 mutex_unlock(&kvm->slots_lock); 4064 return r; 4065} 4066 4067static int alloc_identity_pagetable(struct kvm *kvm) 4068{ 4069 /* Called with kvm->slots_lock held. 
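 * This helper only registers the private memslot at
 * kvm->arch.ept_identity_map_addr; the identity mapping itself is the
 * PT32_ENT_PER_PAGE (1024) PSE entries written by
 * init_rmode_identity_map() above, each of the form (i << 22) | 0xe7,
 * i.e. a present, writable, user, accessed, dirty 4MB page mapping
 * GPA i << 22 onto itself.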
*/ 4070 4071 struct kvm_userspace_memory_region kvm_userspace_mem; 4072 int r = 0; 4073 4074 BUG_ON(kvm->arch.ept_identity_pagetable_done); 4075 4076 kvm_userspace_mem.slot = IDENTITY_PAGETABLE_PRIVATE_MEMSLOT; 4077 kvm_userspace_mem.flags = 0; 4078 kvm_userspace_mem.guest_phys_addr = 4079 kvm->arch.ept_identity_map_addr; 4080 kvm_userspace_mem.memory_size = PAGE_SIZE; 4081 r = __kvm_set_memory_region(kvm, &kvm_userspace_mem); 4082 4083 return r; 4084} 4085 4086static void allocate_vpid(struct vcpu_vmx *vmx) 4087{ 4088 int vpid; 4089 4090 vmx->vpid = 0; 4091 if (!enable_vpid) 4092 return; 4093 spin_lock(&vmx_vpid_lock); 4094 vpid = find_first_zero_bit(vmx_vpid_bitmap, VMX_NR_VPIDS); 4095 if (vpid < VMX_NR_VPIDS) { 4096 vmx->vpid = vpid; 4097 __set_bit(vpid, vmx_vpid_bitmap); 4098 } 4099 spin_unlock(&vmx_vpid_lock); 4100} 4101 4102static void free_vpid(struct vcpu_vmx *vmx) 4103{ 4104 if (!enable_vpid) 4105 return; 4106 spin_lock(&vmx_vpid_lock); 4107 if (vmx->vpid != 0) 4108 __clear_bit(vmx->vpid, vmx_vpid_bitmap); 4109 spin_unlock(&vmx_vpid_lock); 4110} 4111 4112#define MSR_TYPE_R 1 4113#define MSR_TYPE_W 2 4114static void __vmx_disable_intercept_for_msr(unsigned long *msr_bitmap, 4115 u32 msr, int type) 4116{ 4117 int f = sizeof(unsigned long); 4118 4119 if (!cpu_has_vmx_msr_bitmap()) 4120 return; 4121 4122 /* 4123 * See Intel PRM Vol. 3, 20.6.9 (MSR-Bitmap Address). Early manuals 4124 * have the write-low and read-high bitmap offsets the wrong way round. 4125 * We can control MSRs 0x00000000-0x00001fff and 0xc0000000-0xc0001fff. 4126 */ 4127 if (msr <= 0x1fff) { 4128 if (type & MSR_TYPE_R) 4129 /* read-low */ 4130 __clear_bit(msr, msr_bitmap + 0x000 / f); 4131 4132 if (type & MSR_TYPE_W) 4133 /* write-low */ 4134 __clear_bit(msr, msr_bitmap + 0x800 / f); 4135 4136 } else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) { 4137 msr &= 0x1fff; 4138 if (type & MSR_TYPE_R) 4139 /* read-high */ 4140 __clear_bit(msr, msr_bitmap + 0x400 / f); 4141 4142 if (type & MSR_TYPE_W) 4143 /* write-high */ 4144 __clear_bit(msr, msr_bitmap + 0xc00 / f); 4145 4146 } 4147} 4148 4149static void __vmx_enable_intercept_for_msr(unsigned long *msr_bitmap, 4150 u32 msr, int type) 4151{ 4152 int f = sizeof(unsigned long); 4153 4154 if (!cpu_has_vmx_msr_bitmap()) 4155 return; 4156 4157 /* 4158 * See Intel PRM Vol. 3, 20.6.9 (MSR-Bitmap Address). Early manuals 4159 * have the write-low and read-high bitmap offsets the wrong way round. 4160 * We can control MSRs 0x00000000-0x00001fff and 0xc0000000-0xc0001fff. 
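 * For example, MSR_EFER (0xc0000080) falls in the high range: after the
 * "msr &= 0x1fff" below its bit index is 0x80, so its read intercept is
 * bit 0 of byte 0x400 + 0x80/8 = 0x410 and its write intercept is bit 0
 * of byte 0xc00 + 0x80/8 = 0xc10 of the 4K bitmap page.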
4161 */ 4162 if (msr <= 0x1fff) { 4163 if (type & MSR_TYPE_R) 4164 /* read-low */ 4165 __set_bit(msr, msr_bitmap + 0x000 / f); 4166 4167 if (type & MSR_TYPE_W) 4168 /* write-low */ 4169 __set_bit(msr, msr_bitmap + 0x800 / f); 4170 4171 } else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) { 4172 msr &= 0x1fff; 4173 if (type & MSR_TYPE_R) 4174 /* read-high */ 4175 __set_bit(msr, msr_bitmap + 0x400 / f); 4176 4177 if (type & MSR_TYPE_W) 4178 /* write-high */ 4179 __set_bit(msr, msr_bitmap + 0xc00 / f); 4180 4181 } 4182} 4183 4184static void vmx_disable_intercept_for_msr(u32 msr, bool longmode_only) 4185{ 4186 if (!longmode_only) 4187 __vmx_disable_intercept_for_msr(vmx_msr_bitmap_legacy, 4188 msr, MSR_TYPE_R | MSR_TYPE_W); 4189 __vmx_disable_intercept_for_msr(vmx_msr_bitmap_longmode, 4190 msr, MSR_TYPE_R | MSR_TYPE_W); 4191} 4192 4193static void vmx_enable_intercept_msr_read_x2apic(u32 msr) 4194{ 4195 __vmx_enable_intercept_for_msr(vmx_msr_bitmap_legacy_x2apic, 4196 msr, MSR_TYPE_R); 4197 __vmx_enable_intercept_for_msr(vmx_msr_bitmap_longmode_x2apic, 4198 msr, MSR_TYPE_R); 4199} 4200 4201static void vmx_disable_intercept_msr_read_x2apic(u32 msr) 4202{ 4203 __vmx_disable_intercept_for_msr(vmx_msr_bitmap_legacy_x2apic, 4204 msr, MSR_TYPE_R); 4205 __vmx_disable_intercept_for_msr(vmx_msr_bitmap_longmode_x2apic, 4206 msr, MSR_TYPE_R); 4207} 4208 4209static void vmx_disable_intercept_msr_write_x2apic(u32 msr) 4210{ 4211 __vmx_disable_intercept_for_msr(vmx_msr_bitmap_legacy_x2apic, 4212 msr, MSR_TYPE_W); 4213 __vmx_disable_intercept_for_msr(vmx_msr_bitmap_longmode_x2apic, 4214 msr, MSR_TYPE_W); 4215} 4216 4217static int vmx_vm_has_apicv(struct kvm *kvm) 4218{ 4219 return enable_apicv && irqchip_in_kernel(kvm); 4220} 4221 4222/* 4223 * Send interrupt to vcpu via posted interrupt way. 4224 * 1. If target vcpu is running(non-root mode), send posted interrupt 4225 * notification to vcpu and hardware will sync PIR to vIRR atomically. 4226 * 2. If target vcpu isn't running(root mode), kick it to pick up the 4227 * interrupt from PIR in next vmentry. 4228 */ 4229static void vmx_deliver_posted_interrupt(struct kvm_vcpu *vcpu, int vector) 4230{ 4231 struct vcpu_vmx *vmx = to_vmx(vcpu); 4232 int r; 4233 4234 if (pi_test_and_set_pir(vector, &vmx->pi_desc)) 4235 return; 4236 4237 r = pi_test_and_set_on(&vmx->pi_desc); 4238 kvm_make_request(KVM_REQ_EVENT, vcpu); 4239#ifdef CONFIG_SMP 4240 if (!r && (vcpu->mode == IN_GUEST_MODE)) 4241 apic->send_IPI_mask(get_cpu_mask(vcpu->cpu), 4242 POSTED_INTR_VECTOR); 4243 else 4244#endif 4245 kvm_vcpu_kick(vcpu); 4246} 4247 4248static void vmx_sync_pir_to_irr(struct kvm_vcpu *vcpu) 4249{ 4250 struct vcpu_vmx *vmx = to_vmx(vcpu); 4251 4252 if (!pi_test_and_clear_on(&vmx->pi_desc)) 4253 return; 4254 4255 kvm_apic_update_irr(vcpu, vmx->pi_desc.pir); 4256} 4257 4258static void vmx_sync_pir_to_irr_dummy(struct kvm_vcpu *vcpu) 4259{ 4260 return; 4261} 4262 4263/* 4264 * Set up the vmcs's constant host-state fields, i.e., host-state fields that 4265 * will not change in the lifetime of the guest. 4266 * Note that host-state that does change is set elsewhere. E.g., host-state 4267 * that is set differently for each CPU is set in vmx_vcpu_load(), not here. 
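 * HOST_CR4 is a borderline case: it is seeded below with the current,
 * most likely value and cached in vmx->host_state.vmcs_host_cr4, so that
 * a later change in the host's CR4 can be detected and the field
 * rewritten before the next entry.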
4268 */ 4269static void vmx_set_constant_host_state(struct vcpu_vmx *vmx) 4270{ 4271 u32 low32, high32; 4272 unsigned long tmpl; 4273 struct desc_ptr dt; 4274 unsigned long cr4; 4275 4276 vmcs_writel(HOST_CR0, read_cr0() & ~X86_CR0_TS); /* 22.2.3 */ 4277 vmcs_writel(HOST_CR3, read_cr3()); /* 22.2.3 FIXME: shadow tables */ 4278 4279 /* Save the most likely value for this task's CR4 in the VMCS. */ 4280 cr4 = read_cr4(); 4281 vmcs_writel(HOST_CR4, cr4); /* 22.2.3, 22.2.5 */ 4282 vmx->host_state.vmcs_host_cr4 = cr4; 4283 4284 vmcs_write16(HOST_CS_SELECTOR, __KERNEL_CS); /* 22.2.4 */ 4285#ifdef CONFIG_X86_64 4286 /* 4287 * Load null selectors, so we can avoid reloading them in 4288 * __vmx_load_host_state(), in case userspace uses the null selectors 4289 * too (the expected case). 4290 */ 4291 vmcs_write16(HOST_DS_SELECTOR, 0); 4292 vmcs_write16(HOST_ES_SELECTOR, 0); 4293#else 4294 vmcs_write16(HOST_DS_SELECTOR, __KERNEL_DS); /* 22.2.4 */ 4295 vmcs_write16(HOST_ES_SELECTOR, __KERNEL_DS); /* 22.2.4 */ 4296#endif 4297 vmcs_write16(HOST_SS_SELECTOR, __KERNEL_DS); /* 22.2.4 */ 4298 vmcs_write16(HOST_TR_SELECTOR, GDT_ENTRY_TSS*8); /* 22.2.4 */ 4299 4300 native_store_idt(&dt); 4301 vmcs_writel(HOST_IDTR_BASE, dt.address); /* 22.2.4 */ 4302 vmx->host_idt_base = dt.address; 4303 4304 vmcs_writel(HOST_RIP, vmx_return); /* 22.2.5 */ 4305 4306 rdmsr(MSR_IA32_SYSENTER_CS, low32, high32); 4307 vmcs_write32(HOST_IA32_SYSENTER_CS, low32); 4308 rdmsrl(MSR_IA32_SYSENTER_EIP, tmpl); 4309 vmcs_writel(HOST_IA32_SYSENTER_EIP, tmpl); /* 22.2.3 */ 4310 4311 if (vmcs_config.vmexit_ctrl & VM_EXIT_LOAD_IA32_PAT) { 4312 rdmsr(MSR_IA32_CR_PAT, low32, high32); 4313 vmcs_write64(HOST_IA32_PAT, low32 | ((u64) high32 << 32)); 4314 } 4315} 4316 4317static void set_cr4_guest_host_mask(struct vcpu_vmx *vmx) 4318{ 4319 vmx->vcpu.arch.cr4_guest_owned_bits = KVM_CR4_GUEST_OWNED_BITS; 4320 if (enable_ept) 4321 vmx->vcpu.arch.cr4_guest_owned_bits |= X86_CR4_PGE; 4322 if (is_guest_mode(&vmx->vcpu)) 4323 vmx->vcpu.arch.cr4_guest_owned_bits &= 4324 ~get_vmcs12(&vmx->vcpu)->cr4_guest_host_mask; 4325 vmcs_writel(CR4_GUEST_HOST_MASK, ~vmx->vcpu.arch.cr4_guest_owned_bits); 4326} 4327 4328static u32 vmx_pin_based_exec_ctrl(struct vcpu_vmx *vmx) 4329{ 4330 u32 pin_based_exec_ctrl = vmcs_config.pin_based_exec_ctrl; 4331 4332 if (!vmx_vm_has_apicv(vmx->vcpu.kvm)) 4333 pin_based_exec_ctrl &= ~PIN_BASED_POSTED_INTR; 4334 return pin_based_exec_ctrl; 4335} 4336 4337static u32 vmx_exec_control(struct vcpu_vmx *vmx) 4338{ 4339 u32 exec_control = vmcs_config.cpu_based_exec_ctrl; 4340 4341 if (vmx->vcpu.arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT) 4342 exec_control &= ~CPU_BASED_MOV_DR_EXITING; 4343 4344 if (!vm_need_tpr_shadow(vmx->vcpu.kvm)) { 4345 exec_control &= ~CPU_BASED_TPR_SHADOW; 4346#ifdef CONFIG_X86_64 4347 exec_control |= CPU_BASED_CR8_STORE_EXITING | 4348 CPU_BASED_CR8_LOAD_EXITING; 4349#endif 4350 } 4351 if (!enable_ept) 4352 exec_control |= CPU_BASED_CR3_STORE_EXITING | 4353 CPU_BASED_CR3_LOAD_EXITING | 4354 CPU_BASED_INVLPG_EXITING; 4355 return exec_control; 4356} 4357 4358static u32 vmx_secondary_exec_control(struct vcpu_vmx *vmx) 4359{ 4360 u32 exec_control = vmcs_config.cpu_based_2nd_exec_ctrl; 4361 if (!vm_need_virtualize_apic_accesses(vmx->vcpu.kvm)) 4362 exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES; 4363 if (vmx->vpid == 0) 4364 exec_control &= ~SECONDARY_EXEC_ENABLE_VPID; 4365 if (!enable_ept) { 4366 exec_control &= ~SECONDARY_EXEC_ENABLE_EPT; 4367 enable_unrestricted_guest = 0; 4368 /* Enable INVPCID for non-ept 
guests may cause performance regression. */ 4369 exec_control &= ~SECONDARY_EXEC_ENABLE_INVPCID; 4370 } 4371 if (!enable_unrestricted_guest) 4372 exec_control &= ~SECONDARY_EXEC_UNRESTRICTED_GUEST; 4373 if (!ple_gap) 4374 exec_control &= ~SECONDARY_EXEC_PAUSE_LOOP_EXITING; 4375 if (!vmx_vm_has_apicv(vmx->vcpu.kvm)) 4376 exec_control &= ~(SECONDARY_EXEC_APIC_REGISTER_VIRT | 4377 SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY); 4378 exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE; 4379 /* SECONDARY_EXEC_SHADOW_VMCS is enabled when L1 executes VMPTRLD 4380 (handle_vmptrld). 4381 We can NOT enable shadow_vmcs here because we don't have yet 4382 a current VMCS12 4383 */ 4384 exec_control &= ~SECONDARY_EXEC_SHADOW_VMCS; 4385 return exec_control; 4386} 4387 4388static void ept_set_mmio_spte_mask(void) 4389{ 4390 /* 4391 * EPT Misconfigurations can be generated if the value of bits 2:0 4392 * of an EPT paging-structure entry is 110b (write/execute). 4393 * Also, magic bits (0x3ull << 62) is set to quickly identify mmio 4394 * spte. 4395 */ 4396 kvm_mmu_set_mmio_spte_mask((0x3ull << 62) | 0x6ull); 4397} 4398 4399/* 4400 * Sets up the vmcs for emulated real mode. 4401 */ 4402static int vmx_vcpu_setup(struct vcpu_vmx *vmx) 4403{ 4404#ifdef CONFIG_X86_64 4405 unsigned long a; 4406#endif 4407 int i; 4408 4409 /* I/O */ 4410 vmcs_write64(IO_BITMAP_A, __pa(vmx_io_bitmap_a)); 4411 vmcs_write64(IO_BITMAP_B, __pa(vmx_io_bitmap_b)); 4412 4413 if (enable_shadow_vmcs) { 4414 vmcs_write64(VMREAD_BITMAP, __pa(vmx_vmread_bitmap)); 4415 vmcs_write64(VMWRITE_BITMAP, __pa(vmx_vmwrite_bitmap)); 4416 } 4417 if (cpu_has_vmx_msr_bitmap()) 4418 vmcs_write64(MSR_BITMAP, __pa(vmx_msr_bitmap_legacy)); 4419 4420 vmcs_write64(VMCS_LINK_POINTER, -1ull); /* 22.3.1.5 */ 4421 4422 /* Control */ 4423 vmcs_write32(PIN_BASED_VM_EXEC_CONTROL, vmx_pin_based_exec_ctrl(vmx)); 4424 4425 vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, vmx_exec_control(vmx)); 4426 4427 if (cpu_has_secondary_exec_ctrls()) { 4428 vmcs_write32(SECONDARY_VM_EXEC_CONTROL, 4429 vmx_secondary_exec_control(vmx)); 4430 } 4431 4432 if (vmx_vm_has_apicv(vmx->vcpu.kvm)) { 4433 vmcs_write64(EOI_EXIT_BITMAP0, 0); 4434 vmcs_write64(EOI_EXIT_BITMAP1, 0); 4435 vmcs_write64(EOI_EXIT_BITMAP2, 0); 4436 vmcs_write64(EOI_EXIT_BITMAP3, 0); 4437 4438 vmcs_write16(GUEST_INTR_STATUS, 0); 4439 4440 vmcs_write64(POSTED_INTR_NV, POSTED_INTR_VECTOR); 4441 vmcs_write64(POSTED_INTR_DESC_ADDR, __pa((&vmx->pi_desc))); 4442 } 4443 4444 if (ple_gap) { 4445 vmcs_write32(PLE_GAP, ple_gap); 4446 vmx->ple_window = ple_window; 4447 vmx->ple_window_dirty = true; 4448 } 4449 4450 vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, 0); 4451 vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, 0); 4452 vmcs_write32(CR3_TARGET_COUNT, 0); /* 22.2.1 */ 4453 4454 vmcs_write16(HOST_FS_SELECTOR, 0); /* 22.2.4 */ 4455 vmcs_write16(HOST_GS_SELECTOR, 0); /* 22.2.4 */ 4456 vmx_set_constant_host_state(vmx); 4457#ifdef CONFIG_X86_64 4458 rdmsrl(MSR_FS_BASE, a); 4459 vmcs_writel(HOST_FS_BASE, a); /* 22.2.4 */ 4460 rdmsrl(MSR_GS_BASE, a); 4461 vmcs_writel(HOST_GS_BASE, a); /* 22.2.4 */ 4462#else 4463 vmcs_writel(HOST_FS_BASE, 0); /* 22.2.4 */ 4464 vmcs_writel(HOST_GS_BASE, 0); /* 22.2.4 */ 4465#endif 4466 4467 vmcs_write32(VM_EXIT_MSR_STORE_COUNT, 0); 4468 vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, 0); 4469 vmcs_write64(VM_EXIT_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.host)); 4470 vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, 0); 4471 vmcs_write64(VM_ENTRY_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.guest)); 4472 4473 if (vmcs_config.vmentry_ctrl & 
VM_ENTRY_LOAD_IA32_PAT) { 4474 u32 msr_low, msr_high; 4475 u64 host_pat; 4476 rdmsr(MSR_IA32_CR_PAT, msr_low, msr_high); 4477 host_pat = msr_low | ((u64) msr_high << 32); 4478 /* Write the default value follow host pat */ 4479 vmcs_write64(GUEST_IA32_PAT, host_pat); 4480 /* Keep arch.pat sync with GUEST_IA32_PAT */ 4481 vmx->vcpu.arch.pat = host_pat; 4482 } 4483 4484 for (i = 0; i < ARRAY_SIZE(vmx_msr_index); ++i) { 4485 u32 index = vmx_msr_index[i]; 4486 u32 data_low, data_high; 4487 int j = vmx->nmsrs; 4488 4489 if (rdmsr_safe(index, &data_low, &data_high) < 0) 4490 continue; 4491 if (wrmsr_safe(index, data_low, data_high) < 0) 4492 continue; 4493 vmx->guest_msrs[j].index = i; 4494 vmx->guest_msrs[j].data = 0; 4495 vmx->guest_msrs[j].mask = -1ull; 4496 ++vmx->nmsrs; 4497 } 4498 4499 4500 vm_exit_controls_init(vmx, vmcs_config.vmexit_ctrl); 4501 4502 /* 22.2.1, 20.8.1 */ 4503 vm_entry_controls_init(vmx, vmcs_config.vmentry_ctrl); 4504 4505 vmcs_writel(CR0_GUEST_HOST_MASK, ~0UL); 4506 set_cr4_guest_host_mask(vmx); 4507 4508 return 0; 4509} 4510 4511static void vmx_vcpu_reset(struct kvm_vcpu *vcpu) 4512{ 4513 struct vcpu_vmx *vmx = to_vmx(vcpu); 4514 struct msr_data apic_base_msr; 4515 4516 vmx->rmode.vm86_active = 0; 4517 4518 vmx->soft_vnmi_blocked = 0; 4519 4520 vmx->vcpu.arch.regs[VCPU_REGS_RDX] = get_rdx_init_val(); 4521 kvm_set_cr8(&vmx->vcpu, 0); 4522 apic_base_msr.data = APIC_DEFAULT_PHYS_BASE | MSR_IA32_APICBASE_ENABLE; 4523 if (kvm_vcpu_is_bsp(&vmx->vcpu)) 4524 apic_base_msr.data |= MSR_IA32_APICBASE_BSP; 4525 apic_base_msr.host_initiated = true; 4526 kvm_set_apic_base(&vmx->vcpu, &apic_base_msr); 4527 4528 vmx_segment_cache_clear(vmx); 4529 4530 seg_setup(VCPU_SREG_CS); 4531 vmcs_write16(GUEST_CS_SELECTOR, 0xf000); 4532 vmcs_write32(GUEST_CS_BASE, 0xffff0000); 4533 4534 seg_setup(VCPU_SREG_DS); 4535 seg_setup(VCPU_SREG_ES); 4536 seg_setup(VCPU_SREG_FS); 4537 seg_setup(VCPU_SREG_GS); 4538 seg_setup(VCPU_SREG_SS); 4539 4540 vmcs_write16(GUEST_TR_SELECTOR, 0); 4541 vmcs_writel(GUEST_TR_BASE, 0); 4542 vmcs_write32(GUEST_TR_LIMIT, 0xffff); 4543 vmcs_write32(GUEST_TR_AR_BYTES, 0x008b); 4544 4545 vmcs_write16(GUEST_LDTR_SELECTOR, 0); 4546 vmcs_writel(GUEST_LDTR_BASE, 0); 4547 vmcs_write32(GUEST_LDTR_LIMIT, 0xffff); 4548 vmcs_write32(GUEST_LDTR_AR_BYTES, 0x00082); 4549 4550 vmcs_write32(GUEST_SYSENTER_CS, 0); 4551 vmcs_writel(GUEST_SYSENTER_ESP, 0); 4552 vmcs_writel(GUEST_SYSENTER_EIP, 0); 4553 4554 vmcs_writel(GUEST_RFLAGS, 0x02); 4555 kvm_rip_write(vcpu, 0xfff0); 4556 4557 vmcs_writel(GUEST_GDTR_BASE, 0); 4558 vmcs_write32(GUEST_GDTR_LIMIT, 0xffff); 4559 4560 vmcs_writel(GUEST_IDTR_BASE, 0); 4561 vmcs_write32(GUEST_IDTR_LIMIT, 0xffff); 4562 4563 vmcs_write32(GUEST_ACTIVITY_STATE, GUEST_ACTIVITY_ACTIVE); 4564 vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, 0); 4565 vmcs_write32(GUEST_PENDING_DBG_EXCEPTIONS, 0); 4566 4567 /* Special registers */ 4568 vmcs_write64(GUEST_IA32_DEBUGCTL, 0); 4569 4570 setup_msrs(vmx); 4571 4572 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0); /* 22.2.1 */ 4573 4574 if (cpu_has_vmx_tpr_shadow()) { 4575 vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, 0); 4576 if (vm_need_tpr_shadow(vmx->vcpu.kvm)) 4577 vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, 4578 __pa(vmx->vcpu.arch.apic->regs)); 4579 vmcs_write32(TPR_THRESHOLD, 0); 4580 } 4581 4582 kvm_make_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu); 4583 4584 if (vmx_vm_has_apicv(vcpu->kvm)) 4585 memset(&vmx->pi_desc, 0, sizeof(struct pi_desc)); 4586 4587 if (vmx->vpid != 0) 4588 vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->vpid); 4589 4590 
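	/*
	 * Architectural reset value of CR0: CD | NW | ET == 0x60000010.
	 * The vmx_set_cr0() call below applies it and switches the vcpu
	 * into real-mode handling.
	 */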
vmx->vcpu.arch.cr0 = X86_CR0_NW | X86_CR0_CD | X86_CR0_ET; 4591 vmx_set_cr0(&vmx->vcpu, kvm_read_cr0(vcpu)); /* enter rmode */ 4592 vmx_set_cr4(&vmx->vcpu, 0); 4593 vmx_set_efer(&vmx->vcpu, 0); 4594 vmx_fpu_activate(&vmx->vcpu); 4595 update_exception_bitmap(&vmx->vcpu); 4596 4597 vpid_sync_context(vmx); 4598} 4599 4600/* 4601 * In nested virtualization, check if L1 asked to exit on external interrupts. 4602 * For most existing hypervisors, this will always return true. 4603 */ 4604static bool nested_exit_on_intr(struct kvm_vcpu *vcpu) 4605{ 4606 return get_vmcs12(vcpu)->pin_based_vm_exec_control & 4607 PIN_BASED_EXT_INTR_MASK; 4608} 4609 4610/* 4611 * In nested virtualization, check if L1 has set 4612 * VM_EXIT_ACK_INTR_ON_EXIT 4613 */ 4614static bool nested_exit_intr_ack_set(struct kvm_vcpu *vcpu) 4615{ 4616 return get_vmcs12(vcpu)->vm_exit_controls & 4617 VM_EXIT_ACK_INTR_ON_EXIT; 4618} 4619 4620static bool nested_exit_on_nmi(struct kvm_vcpu *vcpu) 4621{ 4622 return get_vmcs12(vcpu)->pin_based_vm_exec_control & 4623 PIN_BASED_NMI_EXITING; 4624} 4625 4626static void enable_irq_window(struct kvm_vcpu *vcpu) 4627{ 4628 u32 cpu_based_vm_exec_control; 4629 4630 cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL); 4631 cpu_based_vm_exec_control |= CPU_BASED_VIRTUAL_INTR_PENDING; 4632 vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control); 4633} 4634 4635static void enable_nmi_window(struct kvm_vcpu *vcpu) 4636{ 4637 u32 cpu_based_vm_exec_control; 4638 4639 if (!cpu_has_virtual_nmis() || 4640 vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & GUEST_INTR_STATE_STI) { 4641 enable_irq_window(vcpu); 4642 return; 4643 } 4644 4645 cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL); 4646 cpu_based_vm_exec_control |= CPU_BASED_VIRTUAL_NMI_PENDING; 4647 vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control); 4648} 4649 4650static void vmx_inject_irq(struct kvm_vcpu *vcpu) 4651{ 4652 struct vcpu_vmx *vmx = to_vmx(vcpu); 4653 uint32_t intr; 4654 int irq = vcpu->arch.interrupt.nr; 4655 4656 trace_kvm_inj_virq(irq); 4657 4658 ++vcpu->stat.irq_injections; 4659 if (vmx->rmode.vm86_active) { 4660 int inc_eip = 0; 4661 if (vcpu->arch.interrupt.soft) 4662 inc_eip = vcpu->arch.event_exit_inst_len; 4663 if (kvm_inject_realmode_interrupt(vcpu, irq, inc_eip) != EMULATE_DONE) 4664 kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu); 4665 return; 4666 } 4667 intr = irq | INTR_INFO_VALID_MASK; 4668 if (vcpu->arch.interrupt.soft) { 4669 intr |= INTR_TYPE_SOFT_INTR; 4670 vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, 4671 vmx->vcpu.arch.event_exit_inst_len); 4672 } else 4673 intr |= INTR_TYPE_EXT_INTR; 4674 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr); 4675} 4676 4677static void vmx_inject_nmi(struct kvm_vcpu *vcpu) 4678{ 4679 struct vcpu_vmx *vmx = to_vmx(vcpu); 4680 4681 if (is_guest_mode(vcpu)) 4682 return; 4683 4684 if (!cpu_has_virtual_nmis()) { 4685 /* 4686 * Tracking the NMI-blocked state in software is built upon 4687 * finding the next open IRQ window. This, in turn, depends on 4688 * well-behaving guests: They have to keep IRQs disabled at 4689 * least as long as the NMI handler runs. Otherwise we may 4690 * cause NMI nesting, maybe breaking the guest. But as this is 4691 * highly unlikely, we can live with the residual risk. 
4692 */ 4693 vmx->soft_vnmi_blocked = 1; 4694 vmx->vnmi_blocked_time = 0; 4695 } 4696 4697 ++vcpu->stat.nmi_injections; 4698 vmx->nmi_known_unmasked = false; 4699 if (vmx->rmode.vm86_active) { 4700 if (kvm_inject_realmode_interrupt(vcpu, NMI_VECTOR, 0) != EMULATE_DONE) 4701 kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu); 4702 return; 4703 } 4704 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 4705 INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK | NMI_VECTOR); 4706} 4707 4708static bool vmx_get_nmi_mask(struct kvm_vcpu *vcpu) 4709{ 4710 if (!cpu_has_virtual_nmis()) 4711 return to_vmx(vcpu)->soft_vnmi_blocked; 4712 if (to_vmx(vcpu)->nmi_known_unmasked) 4713 return false; 4714 return vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & GUEST_INTR_STATE_NMI; 4715} 4716 4717static void vmx_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked) 4718{ 4719 struct vcpu_vmx *vmx = to_vmx(vcpu); 4720 4721 if (!cpu_has_virtual_nmis()) { 4722 if (vmx->soft_vnmi_blocked != masked) { 4723 vmx->soft_vnmi_blocked = masked; 4724 vmx->vnmi_blocked_time = 0; 4725 } 4726 } else { 4727 vmx->nmi_known_unmasked = !masked; 4728 if (masked) 4729 vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO, 4730 GUEST_INTR_STATE_NMI); 4731 else 4732 vmcs_clear_bits(GUEST_INTERRUPTIBILITY_INFO, 4733 GUEST_INTR_STATE_NMI); 4734 } 4735} 4736 4737static int vmx_nmi_allowed(struct kvm_vcpu *vcpu) 4738{ 4739 if (to_vmx(vcpu)->nested.nested_run_pending) 4740 return 0; 4741 4742 if (!cpu_has_virtual_nmis() && to_vmx(vcpu)->soft_vnmi_blocked) 4743 return 0; 4744 4745 return !(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & 4746 (GUEST_INTR_STATE_MOV_SS | GUEST_INTR_STATE_STI 4747 | GUEST_INTR_STATE_NMI)); 4748} 4749 4750static int vmx_interrupt_allowed(struct kvm_vcpu *vcpu) 4751{ 4752 return (!to_vmx(vcpu)->nested.nested_run_pending && 4753 vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF) && 4754 !(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & 4755 (GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS)); 4756} 4757 4758static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr) 4759{ 4760 int ret; 4761 struct kvm_userspace_memory_region tss_mem = { 4762 .slot = TSS_PRIVATE_MEMSLOT, 4763 .guest_phys_addr = addr, 4764 .memory_size = PAGE_SIZE * 3, 4765 .flags = 0, 4766 }; 4767 4768 ret = kvm_set_memory_region(kvm, &tss_mem); 4769 if (ret) 4770 return ret; 4771 kvm->arch.tss_addr = addr; 4772 return init_rmode_tss(kvm); 4773} 4774 4775static bool rmode_exception(struct kvm_vcpu *vcpu, int vec) 4776{ 4777 switch (vec) { 4778 case BP_VECTOR: 4779 /* 4780 * Update instruction length as we may reinject the exception 4781 * from user space while in guest debugging mode. 4782 */ 4783 to_vmx(vcpu)->vcpu.arch.event_exit_inst_len = 4784 vmcs_read32(VM_EXIT_INSTRUCTION_LEN); 4785 if (vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP) 4786 return false; 4787 /* fall through */ 4788 case DB_VECTOR: 4789 if (vcpu->guest_debug & 4790 (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP)) 4791 return false; 4792 /* fall through */ 4793 case DE_VECTOR: 4794 case OF_VECTOR: 4795 case BR_VECTOR: 4796 case UD_VECTOR: 4797 case DF_VECTOR: 4798 case SS_VECTOR: 4799 case GP_VECTOR: 4800 case MF_VECTOR: 4801 return true; 4802 break; 4803 } 4804 return false; 4805} 4806 4807static int handle_rmode_exception(struct kvm_vcpu *vcpu, 4808 int vec, u32 err_code) 4809{ 4810 /* 4811 * Instruction with address size override prefix opcode 0x67 4812 * Cause the #SS fault with 0 error code in VM86 mode. 
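 * Hence only #GP/#SS with a zero error code are handed to the emulator
 * below; every other exception that is valid in real mode is simply
 * reflected back into the guest.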
4813 */ 4814 if (((vec == GP_VECTOR) || (vec == SS_VECTOR)) && err_code == 0) { 4815 if (emulate_instruction(vcpu, 0) == EMULATE_DONE) { 4816 if (vcpu->arch.halt_request) { 4817 vcpu->arch.halt_request = 0; 4818 return kvm_emulate_halt(vcpu); 4819 } 4820 return 1; 4821 } 4822 return 0; 4823 } 4824 4825 /* 4826 * Forward all other exceptions that are valid in real mode. 4827 * FIXME: Breaks guest debugging in real mode, needs to be fixed with 4828 * the required debugging infrastructure rework. 4829 */ 4830 kvm_queue_exception(vcpu, vec); 4831 return 1; 4832} 4833 4834/* 4835 * Trigger machine check on the host. We assume all the MSRs are already set up 4836 * by the CPU and that we still run on the same CPU as the MCE occurred on. 4837 * We pass a fake environment to the machine check handler because we want 4838 * the guest to be always treated like user space, no matter what context 4839 * it used internally. 4840 */ 4841static void kvm_machine_check(void) 4842{ 4843#if defined(CONFIG_X86_MCE) && defined(CONFIG_X86_64) 4844 struct pt_regs regs = { 4845 .cs = 3, /* Fake ring 3 no matter what the guest ran on */ 4846 .flags = X86_EFLAGS_IF, 4847 }; 4848 4849 do_machine_check(®s, 0); 4850#endif 4851} 4852 4853static int handle_machine_check(struct kvm_vcpu *vcpu) 4854{ 4855 /* already handled by vcpu_run */ 4856 return 1; 4857} 4858 4859static int handle_exception(struct kvm_vcpu *vcpu) 4860{ 4861 struct vcpu_vmx *vmx = to_vmx(vcpu); 4862 struct kvm_run *kvm_run = vcpu->run; 4863 u32 intr_info, ex_no, error_code; 4864 unsigned long cr2, rip, dr6; 4865 u32 vect_info; 4866 enum emulation_result er; 4867 4868 vect_info = vmx->idt_vectoring_info; 4869 intr_info = vmx->exit_intr_info; 4870 4871 if (is_machine_check(intr_info)) 4872 return handle_machine_check(vcpu); 4873 4874 if ((intr_info & INTR_INFO_INTR_TYPE_MASK) == INTR_TYPE_NMI_INTR) 4875 return 1; /* already handled by vmx_vcpu_run() */ 4876 4877 if (is_no_device(intr_info)) { 4878 vmx_fpu_activate(vcpu); 4879 return 1; 4880 } 4881 4882 if (is_invalid_opcode(intr_info)) { 4883 er = emulate_instruction(vcpu, EMULTYPE_TRAP_UD); 4884 if (er != EMULATE_DONE) 4885 kvm_queue_exception(vcpu, UD_VECTOR); 4886 return 1; 4887 } 4888 4889 error_code = 0; 4890 if (intr_info & INTR_INFO_DELIVER_CODE_MASK) 4891 error_code = vmcs_read32(VM_EXIT_INTR_ERROR_CODE); 4892 4893 /* 4894 * The #PF with PFEC.RSVD = 1 indicates the guest is accessing 4895 * MMIO, it is better to report an internal error. 4896 * See the comments in vmx_handle_exit. 
4897 */ 4898 if ((vect_info & VECTORING_INFO_VALID_MASK) && 4899 !(is_page_fault(intr_info) && !(error_code & PFERR_RSVD_MASK))) { 4900 vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; 4901 vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_SIMUL_EX; 4902 vcpu->run->internal.ndata = 2; 4903 vcpu->run->internal.data[0] = vect_info; 4904 vcpu->run->internal.data[1] = intr_info; 4905 return 0; 4906 } 4907 4908 if (is_page_fault(intr_info)) { 4909 /* EPT won't cause page fault directly */ 4910 BUG_ON(enable_ept); 4911 cr2 = vmcs_readl(EXIT_QUALIFICATION); 4912 trace_kvm_page_fault(cr2, error_code); 4913 4914 if (kvm_event_needs_reinjection(vcpu)) 4915 kvm_mmu_unprotect_page_virt(vcpu, cr2); 4916 return kvm_mmu_page_fault(vcpu, cr2, error_code, NULL, 0); 4917 } 4918 4919 ex_no = intr_info & INTR_INFO_VECTOR_MASK; 4920 4921 if (vmx->rmode.vm86_active && rmode_exception(vcpu, ex_no)) 4922 return handle_rmode_exception(vcpu, ex_no, error_code); 4923 4924 switch (ex_no) { 4925 case DB_VECTOR: 4926 dr6 = vmcs_readl(EXIT_QUALIFICATION); 4927 if (!(vcpu->guest_debug & 4928 (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP))) { 4929 vcpu->arch.dr6 &= ~15; 4930 vcpu->arch.dr6 |= dr6 | DR6_RTM; 4931 if (!(dr6 & ~DR6_RESERVED)) /* icebp */ 4932 skip_emulated_instruction(vcpu); 4933 4934 kvm_queue_exception(vcpu, DB_VECTOR); 4935 return 1; 4936 } 4937 kvm_run->debug.arch.dr6 = dr6 | DR6_FIXED_1; 4938 kvm_run->debug.arch.dr7 = vmcs_readl(GUEST_DR7); 4939 /* fall through */ 4940 case BP_VECTOR: 4941 /* 4942 * Update instruction length as we may reinject #BP from 4943 * user space while in guest debugging mode. Reading it for 4944 * #DB as well causes no harm, it is not used in that case. 4945 */ 4946 vmx->vcpu.arch.event_exit_inst_len = 4947 vmcs_read32(VM_EXIT_INSTRUCTION_LEN); 4948 kvm_run->exit_reason = KVM_EXIT_DEBUG; 4949 rip = kvm_rip_read(vcpu); 4950 kvm_run->debug.arch.pc = vmcs_readl(GUEST_CS_BASE) + rip; 4951 kvm_run->debug.arch.exception = ex_no; 4952 break; 4953 default: 4954 kvm_run->exit_reason = KVM_EXIT_EXCEPTION; 4955 kvm_run->ex.exception = ex_no; 4956 kvm_run->ex.error_code = error_code; 4957 break; 4958 } 4959 return 0; 4960} 4961 4962static int handle_external_interrupt(struct kvm_vcpu *vcpu) 4963{ 4964 ++vcpu->stat.irq_exits; 4965 return 1; 4966} 4967 4968static int handle_triple_fault(struct kvm_vcpu *vcpu) 4969{ 4970 vcpu->run->exit_reason = KVM_EXIT_SHUTDOWN; 4971 return 0; 4972} 4973 4974static int handle_io(struct kvm_vcpu *vcpu) 4975{ 4976 unsigned long exit_qualification; 4977 int size, in, string; 4978 unsigned port; 4979 4980 exit_qualification = vmcs_readl(EXIT_QUALIFICATION); 4981 string = (exit_qualification & 16) != 0; 4982 in = (exit_qualification & 8) != 0; 4983 4984 ++vcpu->stat.io_exits; 4985 4986 if (string || in) 4987 return emulate_instruction(vcpu, 0) == EMULATE_DONE; 4988 4989 port = exit_qualification >> 16; 4990 size = (exit_qualification & 7) + 1; 4991 skip_emulated_instruction(vcpu); 4992 4993 return kvm_fast_pio_out(vcpu, size, port); 4994} 4995 4996static void 4997vmx_patch_hypercall(struct kvm_vcpu *vcpu, unsigned char *hypercall) 4998{ 4999 /* 5000 * Patch in the VMCALL instruction: 5001 */ 5002 hypercall[0] = 0x0f; 5003 hypercall[1] = 0x01; 5004 hypercall[2] = 0xc1; 5005} 5006 5007static bool nested_cr0_valid(struct vmcs12 *vmcs12, unsigned long val) 5008{ 5009 unsigned long always_on = VMXON_CR0_ALWAYSON; 5010 5011 if (nested_vmx_secondary_ctls_high & 5012 SECONDARY_EXEC_UNRESTRICTED_GUEST && 5013 nested_cpu_has2(vmcs12, 
SECONDARY_EXEC_UNRESTRICTED_GUEST)) 5014 always_on &= ~(X86_CR0_PE | X86_CR0_PG); 5015 return (val & always_on) == always_on; 5016} 5017 5018/* called to set cr0 as appropriate for a mov-to-cr0 exit. */ 5019static int handle_set_cr0(struct kvm_vcpu *vcpu, unsigned long val) 5020{ 5021 if (is_guest_mode(vcpu)) { 5022 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 5023 unsigned long orig_val = val; 5024 5025 /* 5026 * We get here when L2 changed cr0 in a way that did not change 5027 * any of L1's shadowed bits (see nested_vmx_exit_handled_cr), 5028 * but did change L0 shadowed bits. So we first calculate the 5029 * effective cr0 value that L1 would like to write into the 5030 * hardware. It consists of the L2-owned bits from the new 5031 * value combined with the L1-owned bits from L1's guest_cr0. 5032 */ 5033 val = (val & ~vmcs12->cr0_guest_host_mask) | 5034 (vmcs12->guest_cr0 & vmcs12->cr0_guest_host_mask); 5035 5036 if (!nested_cr0_valid(vmcs12, val)) 5037 return 1; 5038 5039 if (kvm_set_cr0(vcpu, val)) 5040 return 1; 5041 vmcs_writel(CR0_READ_SHADOW, orig_val); 5042 return 0; 5043 } else { 5044 if (to_vmx(vcpu)->nested.vmxon && 5045 ((val & VMXON_CR0_ALWAYSON) != VMXON_CR0_ALWAYSON)) 5046 return 1; 5047 return kvm_set_cr0(vcpu, val); 5048 } 5049} 5050 5051static int handle_set_cr4(struct kvm_vcpu *vcpu, unsigned long val) 5052{ 5053 if (is_guest_mode(vcpu)) { 5054 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 5055 unsigned long orig_val = val; 5056 5057 /* analogously to handle_set_cr0 */ 5058 val = (val & ~vmcs12->cr4_guest_host_mask) | 5059 (vmcs12->guest_cr4 & vmcs12->cr4_guest_host_mask); 5060 if (kvm_set_cr4(vcpu, val)) 5061 return 1; 5062 vmcs_writel(CR4_READ_SHADOW, orig_val); 5063 return 0; 5064 } else 5065 return kvm_set_cr4(vcpu, val); 5066} 5067 5068/* called to set cr0 as approriate for clts instruction exit. */ 5069static void handle_clts(struct kvm_vcpu *vcpu) 5070{ 5071 if (is_guest_mode(vcpu)) { 5072 /* 5073 * We get here when L2 did CLTS, and L1 didn't shadow CR0.TS 5074 * but we did (!fpu_active). We need to keep GUEST_CR0.TS on, 5075 * just pretend it's off (also in arch.cr0 for fpu_activate). 
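 * Clearing TS in CR0_READ_SHADOW is enough for that: L2 reads of the
 * host-owned TS bit come from the read shadow, so CLTS appears to have
 * taken effect while GUEST_CR0.TS stays set.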
5076 */ 5077 vmcs_writel(CR0_READ_SHADOW, 5078 vmcs_readl(CR0_READ_SHADOW) & ~X86_CR0_TS); 5079 vcpu->arch.cr0 &= ~X86_CR0_TS; 5080 } else 5081 vmx_set_cr0(vcpu, kvm_read_cr0_bits(vcpu, ~X86_CR0_TS)); 5082} 5083 5084static int handle_cr(struct kvm_vcpu *vcpu) 5085{ 5086 unsigned long exit_qualification, val; 5087 int cr; 5088 int reg; 5089 int err; 5090 5091 exit_qualification = vmcs_readl(EXIT_QUALIFICATION); 5092 cr = exit_qualification & 15; 5093 reg = (exit_qualification >> 8) & 15; 5094 switch ((exit_qualification >> 4) & 3) { 5095 case 0: /* mov to cr */ 5096 val = kvm_register_readl(vcpu, reg); 5097 trace_kvm_cr_write(cr, val); 5098 switch (cr) { 5099 case 0: 5100 err = handle_set_cr0(vcpu, val); 5101 kvm_complete_insn_gp(vcpu, err); 5102 return 1; 5103 case 3: 5104 err = kvm_set_cr3(vcpu, val); 5105 kvm_complete_insn_gp(vcpu, err); 5106 return 1; 5107 case 4: 5108 err = handle_set_cr4(vcpu, val); 5109 kvm_complete_insn_gp(vcpu, err); 5110 return 1; 5111 case 8: { 5112 u8 cr8_prev = kvm_get_cr8(vcpu); 5113 u8 cr8 = (u8)val; 5114 err = kvm_set_cr8(vcpu, cr8); 5115 kvm_complete_insn_gp(vcpu, err); 5116 if (irqchip_in_kernel(vcpu->kvm)) 5117 return 1; 5118 if (cr8_prev <= cr8) 5119 return 1; 5120 vcpu->run->exit_reason = KVM_EXIT_SET_TPR; 5121 return 0; 5122 } 5123 } 5124 break; 5125 case 2: /* clts */ 5126 handle_clts(vcpu); 5127 trace_kvm_cr_write(0, kvm_read_cr0(vcpu)); 5128 skip_emulated_instruction(vcpu); 5129 vmx_fpu_activate(vcpu); 5130 return 1; 5131 case 1: /*mov from cr*/ 5132 switch (cr) { 5133 case 3: 5134 val = kvm_read_cr3(vcpu); 5135 kvm_register_write(vcpu, reg, val); 5136 trace_kvm_cr_read(cr, val); 5137 skip_emulated_instruction(vcpu); 5138 return 1; 5139 case 8: 5140 val = kvm_get_cr8(vcpu); 5141 kvm_register_write(vcpu, reg, val); 5142 trace_kvm_cr_read(cr, val); 5143 skip_emulated_instruction(vcpu); 5144 return 1; 5145 } 5146 break; 5147 case 3: /* lmsw */ 5148 val = (exit_qualification >> LMSW_SOURCE_DATA_SHIFT) & 0x0f; 5149 trace_kvm_cr_write(0, (kvm_read_cr0(vcpu) & ~0xful) | val); 5150 kvm_lmsw(vcpu, val); 5151 5152 skip_emulated_instruction(vcpu); 5153 return 1; 5154 default: 5155 break; 5156 } 5157 vcpu->run->exit_reason = 0; 5158 vcpu_unimpl(vcpu, "unhandled control register: op %d cr %d\n", 5159 (int)(exit_qualification >> 4) & 3, cr); 5160 return 0; 5161} 5162 5163static int handle_dr(struct kvm_vcpu *vcpu) 5164{ 5165 unsigned long exit_qualification; 5166 int dr, reg; 5167 5168 /* Do not handle if the CPL > 0, will trigger GP on re-entry */ 5169 if (!kvm_require_cpl(vcpu, 0)) 5170 return 1; 5171 dr = vmcs_readl(GUEST_DR7); 5172 if (dr & DR7_GD) { 5173 /* 5174 * As the vm-exit takes precedence over the debug trap, we 5175 * need to emulate the latter, either for the host or the 5176 * guest debugging itself. 
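 * DR7.GD arms a debug exception on any debug-register access, so
 * depending on who owns the hardware breakpoints the access is either
 * reported to userspace or reflected into the guest as #DB with DR6.BD
 * set.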
5177 */ 5178 if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) { 5179 vcpu->run->debug.arch.dr6 = vcpu->arch.dr6; 5180 vcpu->run->debug.arch.dr7 = dr; 5181 vcpu->run->debug.arch.pc = 5182 vmcs_readl(GUEST_CS_BASE) + 5183 vmcs_readl(GUEST_RIP); 5184 vcpu->run->debug.arch.exception = DB_VECTOR; 5185 vcpu->run->exit_reason = KVM_EXIT_DEBUG; 5186 return 0; 5187 } else { 5188 vcpu->arch.dr7 &= ~DR7_GD; 5189 vcpu->arch.dr6 |= DR6_BD | DR6_RTM; 5190 vmcs_writel(GUEST_DR7, vcpu->arch.dr7); 5191 kvm_queue_exception(vcpu, DB_VECTOR); 5192 return 1; 5193 } 5194 } 5195 5196 if (vcpu->guest_debug == 0) { 5197 u32 cpu_based_vm_exec_control; 5198 5199 cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL); 5200 cpu_based_vm_exec_control &= ~CPU_BASED_MOV_DR_EXITING; 5201 vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control); 5202 5203 /* 5204 * No more DR vmexits; force a reload of the debug registers 5205 * and reenter on this instruction. The next vmexit will 5206 * retrieve the full state of the debug registers. 5207 */ 5208 vcpu->arch.switch_db_regs |= KVM_DEBUGREG_WONT_EXIT; 5209 return 1; 5210 } 5211 5212 exit_qualification = vmcs_readl(EXIT_QUALIFICATION); 5213 dr = exit_qualification & DEBUG_REG_ACCESS_NUM; 5214 reg = DEBUG_REG_ACCESS_REG(exit_qualification); 5215 if (exit_qualification & TYPE_MOV_FROM_DR) { 5216 unsigned long val; 5217 5218 if (kvm_get_dr(vcpu, dr, &val)) 5219 return 1; 5220 kvm_register_write(vcpu, reg, val); 5221 } else 5222 if (kvm_set_dr(vcpu, dr, kvm_register_readl(vcpu, reg))) 5223 return 1; 5224 5225 skip_emulated_instruction(vcpu); 5226 return 1; 5227} 5228 5229static u64 vmx_get_dr6(struct kvm_vcpu *vcpu) 5230{ 5231 return vcpu->arch.dr6; 5232} 5233 5234static void vmx_set_dr6(struct kvm_vcpu *vcpu, unsigned long val) 5235{ 5236} 5237 5238static void vmx_sync_dirty_debug_regs(struct kvm_vcpu *vcpu) 5239{ 5240 u32 cpu_based_vm_exec_control; 5241 5242 get_debugreg(vcpu->arch.db[0], 0); 5243 get_debugreg(vcpu->arch.db[1], 1); 5244 get_debugreg(vcpu->arch.db[2], 2); 5245 get_debugreg(vcpu->arch.db[3], 3); 5246 get_debugreg(vcpu->arch.dr6, 6); 5247 vcpu->arch.dr7 = vmcs_readl(GUEST_DR7); 5248 5249 vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_WONT_EXIT; 5250 5251 cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL); 5252 cpu_based_vm_exec_control |= CPU_BASED_MOV_DR_EXITING; 5253 vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control); 5254} 5255 5256static void vmx_set_dr7(struct kvm_vcpu *vcpu, unsigned long val) 5257{ 5258 vmcs_writel(GUEST_DR7, val); 5259} 5260 5261static int handle_cpuid(struct kvm_vcpu *vcpu) 5262{ 5263 kvm_emulate_cpuid(vcpu); 5264 return 1; 5265} 5266 5267static int handle_rdmsr(struct kvm_vcpu *vcpu) 5268{ 5269 u32 ecx = vcpu->arch.regs[VCPU_REGS_RCX]; 5270 u64 data; 5271 5272 if (vmx_get_msr(vcpu, ecx, &data)) { 5273 trace_kvm_msr_read_ex(ecx); 5274 kvm_inject_gp(vcpu, 0); 5275 return 1; 5276 } 5277 5278 trace_kvm_msr_read(ecx, data); 5279 5280 /* FIXME: handling of bits 32:63 of rax, rdx */ 5281 vcpu->arch.regs[VCPU_REGS_RAX] = data & -1u; 5282 vcpu->arch.regs[VCPU_REGS_RDX] = (data >> 32) & -1u; 5283 skip_emulated_instruction(vcpu); 5284 return 1; 5285} 5286 5287static int handle_wrmsr(struct kvm_vcpu *vcpu) 5288{ 5289 struct msr_data msr; 5290 u32 ecx = vcpu->arch.regs[VCPU_REGS_RCX]; 5291 u64 data = (vcpu->arch.regs[VCPU_REGS_RAX] & -1u) 5292 | ((u64)(vcpu->arch.regs[VCPU_REGS_RDX] & -1u) << 32); 5293 5294 msr.data = data; 5295 msr.index = ecx; 5296 msr.host_initiated = false; 5297 if 
(kvm_set_msr(vcpu, &msr) != 0) { 5298 trace_kvm_msr_write_ex(ecx, data); 5299 kvm_inject_gp(vcpu, 0); 5300 return 1; 5301 } 5302 5303 trace_kvm_msr_write(ecx, data); 5304 skip_emulated_instruction(vcpu); 5305 return 1; 5306} 5307 5308static int handle_tpr_below_threshold(struct kvm_vcpu *vcpu) 5309{ 5310 kvm_make_request(KVM_REQ_EVENT, vcpu); 5311 return 1; 5312} 5313 5314static int handle_interrupt_window(struct kvm_vcpu *vcpu) 5315{ 5316 u32 cpu_based_vm_exec_control; 5317 5318 /* clear pending irq */ 5319 cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL); 5320 cpu_based_vm_exec_control &= ~CPU_BASED_VIRTUAL_INTR_PENDING; 5321 vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control); 5322 5323 kvm_make_request(KVM_REQ_EVENT, vcpu); 5324 5325 ++vcpu->stat.irq_window_exits; 5326 5327 /* 5328 * If the user space waits to inject interrupts, exit as soon as 5329 * possible 5330 */ 5331 if (!irqchip_in_kernel(vcpu->kvm) && 5332 vcpu->run->request_interrupt_window && 5333 !kvm_cpu_has_interrupt(vcpu)) { 5334 vcpu->run->exit_reason = KVM_EXIT_IRQ_WINDOW_OPEN; 5335 return 0; 5336 } 5337 return 1; 5338} 5339 5340static int handle_halt(struct kvm_vcpu *vcpu) 5341{ 5342 skip_emulated_instruction(vcpu); 5343 return kvm_emulate_halt(vcpu); 5344} 5345 5346static int handle_vmcall(struct kvm_vcpu *vcpu) 5347{ 5348 skip_emulated_instruction(vcpu); 5349 kvm_emulate_hypercall(vcpu); 5350 return 1; 5351} 5352 5353static int handle_invd(struct kvm_vcpu *vcpu) 5354{ 5355 return emulate_instruction(vcpu, 0) == EMULATE_DONE; 5356} 5357 5358static int handle_invlpg(struct kvm_vcpu *vcpu) 5359{ 5360 unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION); 5361 5362 kvm_mmu_invlpg(vcpu, exit_qualification); 5363 skip_emulated_instruction(vcpu); 5364 return 1; 5365} 5366 5367static int handle_rdpmc(struct kvm_vcpu *vcpu) 5368{ 5369 int err; 5370 5371 err = kvm_rdpmc(vcpu); 5372 kvm_complete_insn_gp(vcpu, err); 5373 5374 return 1; 5375} 5376 5377static int handle_wbinvd(struct kvm_vcpu *vcpu) 5378{ 5379 skip_emulated_instruction(vcpu); 5380 kvm_emulate_wbinvd(vcpu); 5381 return 1; 5382} 5383 5384static int handle_xsetbv(struct kvm_vcpu *vcpu) 5385{ 5386 u64 new_bv = kvm_read_edx_eax(vcpu); 5387 u32 index = kvm_register_read(vcpu, VCPU_REGS_RCX); 5388 5389 if (kvm_set_xcr(vcpu, index, new_bv) == 0) 5390 skip_emulated_instruction(vcpu); 5391 return 1; 5392} 5393 5394static int handle_apic_access(struct kvm_vcpu *vcpu) 5395{ 5396 if (likely(fasteoi)) { 5397 unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION); 5398 int access_type, offset; 5399 5400 access_type = exit_qualification & APIC_ACCESS_TYPE; 5401 offset = exit_qualification & APIC_ACCESS_OFFSET; 5402 /* 5403 * Sane guest uses MOV to write EOI, with written value 5404 * not cared. So make a short-circuit here by avoiding 5405 * heavy instruction emulation. 
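 * Only a linear write to the APIC_EOI register (offset 0xb0) is
 * short-circuited this way; any other APIC access still falls through
 * to full instruction emulation below.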
5406 */ 5407 if ((access_type == TYPE_LINEAR_APIC_INST_WRITE) && 5408 (offset == APIC_EOI)) { 5409 kvm_lapic_set_eoi(vcpu); 5410 skip_emulated_instruction(vcpu); 5411 return 1; 5412 } 5413 } 5414 return emulate_instruction(vcpu, 0) == EMULATE_DONE; 5415} 5416 5417static int handle_apic_eoi_induced(struct kvm_vcpu *vcpu) 5418{ 5419 unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION); 5420 int vector = exit_qualification & 0xff; 5421 5422 /* EOI-induced VM exit is trap-like and thus no need to adjust IP */ 5423 kvm_apic_set_eoi_accelerated(vcpu, vector); 5424 return 1; 5425} 5426 5427static int handle_apic_write(struct kvm_vcpu *vcpu) 5428{ 5429 unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION); 5430 u32 offset = exit_qualification & 0xfff; 5431 5432 /* APIC-write VM exit is trap-like and thus no need to adjust IP */ 5433 kvm_apic_write_nodecode(vcpu, offset); 5434 return 1; 5435} 5436 5437static int handle_task_switch(struct kvm_vcpu *vcpu) 5438{ 5439 struct vcpu_vmx *vmx = to_vmx(vcpu); 5440 unsigned long exit_qualification; 5441 bool has_error_code = false; 5442 u32 error_code = 0; 5443 u16 tss_selector; 5444 int reason, type, idt_v, idt_index; 5445 5446 idt_v = (vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK); 5447 idt_index = (vmx->idt_vectoring_info & VECTORING_INFO_VECTOR_MASK); 5448 type = (vmx->idt_vectoring_info & VECTORING_INFO_TYPE_MASK); 5449 5450 exit_qualification = vmcs_readl(EXIT_QUALIFICATION); 5451 5452 reason = (u32)exit_qualification >> 30; 5453 if (reason == TASK_SWITCH_GATE && idt_v) { 5454 switch (type) { 5455 case INTR_TYPE_NMI_INTR: 5456 vcpu->arch.nmi_injected = false; 5457 vmx_set_nmi_mask(vcpu, true); 5458 break; 5459 case INTR_TYPE_EXT_INTR: 5460 case INTR_TYPE_SOFT_INTR: 5461 kvm_clear_interrupt_queue(vcpu); 5462 break; 5463 case INTR_TYPE_HARD_EXCEPTION: 5464 if (vmx->idt_vectoring_info & 5465 VECTORING_INFO_DELIVER_CODE_MASK) { 5466 has_error_code = true; 5467 error_code = 5468 vmcs_read32(IDT_VECTORING_ERROR_CODE); 5469 } 5470 /* fall through */ 5471 case INTR_TYPE_SOFT_EXCEPTION: 5472 kvm_clear_exception_queue(vcpu); 5473 break; 5474 default: 5475 break; 5476 } 5477 } 5478 tss_selector = exit_qualification; 5479 5480 if (!idt_v || (type != INTR_TYPE_HARD_EXCEPTION && 5481 type != INTR_TYPE_EXT_INTR && 5482 type != INTR_TYPE_NMI_INTR)) 5483 skip_emulated_instruction(vcpu); 5484 5485 if (kvm_task_switch(vcpu, tss_selector, 5486 type == INTR_TYPE_SOFT_INTR ? idt_index : -1, reason, 5487 has_error_code, error_code) == EMULATE_FAIL) { 5488 vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; 5489 vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION; 5490 vcpu->run->internal.ndata = 0; 5491 return 0; 5492 } 5493 5494 /* clear all local breakpoint enable flags */ 5495 vmcs_writel(GUEST_DR7, vmcs_readl(GUEST_DR7) & ~0x55); 5496 5497 /* 5498 * TODO: What about debug traps on tss switch? 5499 * Are we supposed to inject them and update dr6? 
5500 */ 5501 5502 return 1; 5503} 5504 5505static int handle_ept_violation(struct kvm_vcpu *vcpu) 5506{ 5507 unsigned long exit_qualification; 5508 gpa_t gpa; 5509 u32 error_code; 5510 int gla_validity; 5511 5512 exit_qualification = vmcs_readl(EXIT_QUALIFICATION); 5513 5514 gla_validity = (exit_qualification >> 7) & 0x3; 5515 if (gla_validity != 0x3 && gla_validity != 0x1 && gla_validity != 0) { 5516 printk(KERN_ERR "EPT: Handling EPT violation failed!\n"); 5517 printk(KERN_ERR "EPT: GPA: 0x%lx, GVA: 0x%lx\n", 5518 (long unsigned int)vmcs_read64(GUEST_PHYSICAL_ADDRESS), 5519 vmcs_readl(GUEST_LINEAR_ADDRESS)); 5520 printk(KERN_ERR "EPT: Exit qualification is 0x%lx\n", 5521 (long unsigned int)exit_qualification); 5522 vcpu->run->exit_reason = KVM_EXIT_UNKNOWN; 5523 vcpu->run->hw.hardware_exit_reason = EXIT_REASON_EPT_VIOLATION; 5524 return 0; 5525 } 5526 5527 /* 5528 * EPT violation happened while executing iret from NMI, 5529 * "blocked by NMI" bit has to be set before next VM entry. 5530 * There are errata that may cause this bit to not be set: 5531 * AAK134, BY25. 5532 */ 5533 if (!(to_vmx(vcpu)->idt_vectoring_info & VECTORING_INFO_VALID_MASK) && 5534 cpu_has_virtual_nmis() && 5535 (exit_qualification & INTR_INFO_UNBLOCK_NMI)) 5536 vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO, GUEST_INTR_STATE_NMI); 5537 5538 gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS); 5539 trace_kvm_page_fault(gpa, exit_qualification); 5540 5541 /* It is a write fault? */ 5542 error_code = exit_qualification & (1U << 1); 5543 /* It is a fetch fault? */ 5544 error_code |= (exit_qualification & (1U << 2)) << 2; 5545 /* ept page table is present? */ 5546 error_code |= (exit_qualification >> 3) & 0x1; 5547 5548 vcpu->arch.exit_qualification = exit_qualification; 5549 5550 return kvm_mmu_page_fault(vcpu, gpa, error_code, NULL, 0); 5551} 5552 5553static u64 ept_rsvd_mask(u64 spte, int level) 5554{ 5555 int i; 5556 u64 mask = 0; 5557 5558 for (i = 51; i > boot_cpu_data.x86_phys_bits; i--) 5559 mask |= (1ULL << i); 5560 5561 if (level == 4) 5562 /* bits 7:3 reserved */ 5563 mask |= 0xf8; 5564 else if (spte & (1ULL << 7)) 5565 /* 5566 * 1GB/2MB page, bits 29:12 or 20:12 reserved respectively, 5567 * level == 1 if the hypervisor is using the ignored bit 7. 
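 * e.g. a 2MB mapping (level 2) yields (PAGE_SIZE << 9) - PAGE_SIZE =
 * 0x1ff000, i.e. bits 20:12, and a 1GB mapping (level 3) yields
 * 0x3ffff000, i.e. bits 29:12.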
5568 */ 5569 mask |= (PAGE_SIZE << ((level - 1) * 9)) - PAGE_SIZE; 5570 else if (level > 1) 5571 /* bits 6:3 reserved */ 5572 mask |= 0x78; 5573 5574 return mask; 5575} 5576 5577static void ept_misconfig_inspect_spte(struct kvm_vcpu *vcpu, u64 spte, 5578 int level) 5579{ 5580 printk(KERN_ERR "%s: spte 0x%llx level %d\n", __func__, spte, level); 5581 5582 /* 010b (write-only) */ 5583 WARN_ON((spte & 0x7) == 0x2); 5584 5585 /* 110b (write/execute) */ 5586 WARN_ON((spte & 0x7) == 0x6); 5587 5588 /* 100b (execute-only) and value not supported by logical processor */ 5589 if (!cpu_has_vmx_ept_execute_only()) 5590 WARN_ON((spte & 0x7) == 0x4); 5591 5592 /* not 000b */ 5593 if ((spte & 0x7)) { 5594 u64 rsvd_bits = spte & ept_rsvd_mask(spte, level); 5595 5596 if (rsvd_bits != 0) { 5597 printk(KERN_ERR "%s: rsvd_bits = 0x%llx\n", 5598 __func__, rsvd_bits); 5599 WARN_ON(1); 5600 } 5601 5602 /* bits 5:3 are _not_ reserved for large page or leaf page */ 5603 if ((rsvd_bits & 0x38) == 0) { 5604 u64 ept_mem_type = (spte & 0x38) >> 3; 5605 5606 if (ept_mem_type == 2 || ept_mem_type == 3 || 5607 ept_mem_type == 7) { 5608 printk(KERN_ERR "%s: ept_mem_type=0x%llx\n", 5609 __func__, ept_mem_type); 5610 WARN_ON(1); 5611 } 5612 } 5613 } 5614} 5615 5616static int handle_ept_misconfig(struct kvm_vcpu *vcpu) 5617{ 5618 u64 sptes[4]; 5619 int nr_sptes, i, ret; 5620 gpa_t gpa; 5621 5622 gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS); 5623 if (!kvm_io_bus_write(vcpu->kvm, KVM_FAST_MMIO_BUS, gpa, 0, NULL)) { 5624 skip_emulated_instruction(vcpu); 5625 return 1; 5626 } 5627 5628 ret = handle_mmio_page_fault_common(vcpu, gpa, true); 5629 if (likely(ret == RET_MMIO_PF_EMULATE)) 5630 return x86_emulate_instruction(vcpu, gpa, 0, NULL, 0) == 5631 EMULATE_DONE; 5632 5633 if (unlikely(ret == RET_MMIO_PF_INVALID)) 5634 return kvm_mmu_page_fault(vcpu, gpa, 0, NULL, 0); 5635 5636 if (unlikely(ret == RET_MMIO_PF_RETRY)) 5637 return 1; 5638 5639 /* It is the real ept misconfig */ 5640 printk(KERN_ERR "EPT: Misconfiguration.\n"); 5641 printk(KERN_ERR "EPT: GPA: 0x%llx\n", gpa); 5642 5643 nr_sptes = kvm_mmu_get_spte_hierarchy(vcpu, gpa, sptes); 5644 5645 for (i = PT64_ROOT_LEVEL; i > PT64_ROOT_LEVEL - nr_sptes; --i) 5646 ept_misconfig_inspect_spte(vcpu, sptes[i-1], i); 5647 5648 vcpu->run->exit_reason = KVM_EXIT_UNKNOWN; 5649 vcpu->run->hw.hardware_exit_reason = EXIT_REASON_EPT_MISCONFIG; 5650 5651 return 0; 5652} 5653 5654static int handle_nmi_window(struct kvm_vcpu *vcpu) 5655{ 5656 u32 cpu_based_vm_exec_control; 5657 5658 /* clear pending NMI */ 5659 cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL); 5660 cpu_based_vm_exec_control &= ~CPU_BASED_VIRTUAL_NMI_PENDING; 5661 vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control); 5662 ++vcpu->stat.nmi_window_exits; 5663 kvm_make_request(KVM_REQ_EVENT, vcpu); 5664 5665 return 1; 5666} 5667 5668static int handle_invalid_guest_state(struct kvm_vcpu *vcpu) 5669{ 5670 struct vcpu_vmx *vmx = to_vmx(vcpu); 5671 enum emulation_result err = EMULATE_DONE; 5672 int ret = 1; 5673 u32 cpu_exec_ctrl; 5674 bool intr_window_requested; 5675 unsigned count = 130; 5676 5677 cpu_exec_ctrl = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL); 5678 intr_window_requested = cpu_exec_ctrl & CPU_BASED_VIRTUAL_INTR_PENDING; 5679 5680 while (vmx->emulation_required && count-- != 0) { 5681 if (intr_window_requested && vmx_interrupt_allowed(vcpu)) 5682 return handle_interrupt_window(&vmx->vcpu); 5683 5684 if (test_bit(KVM_REQ_EVENT, &vcpu->requests)) 5685 return 1; 5686 5687 err = 
emulate_instruction(vcpu, EMULTYPE_NO_REEXECUTE); 5688 5689 if (err == EMULATE_USER_EXIT) { 5690 ++vcpu->stat.mmio_exits; 5691 ret = 0; 5692 goto out; 5693 } 5694 5695 if (err != EMULATE_DONE) { 5696 vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; 5697 vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION; 5698 vcpu->run->internal.ndata = 0; 5699 return 0; 5700 } 5701 5702 if (vcpu->arch.halt_request) { 5703 vcpu->arch.halt_request = 0; 5704 ret = kvm_emulate_halt(vcpu); 5705 goto out; 5706 } 5707 5708 if (signal_pending(current)) 5709 goto out; 5710 if (need_resched()) 5711 schedule(); 5712 } 5713 5714out: 5715 return ret; 5716} 5717 5718static int __grow_ple_window(int val) 5719{ 5720 if (ple_window_grow < 1) 5721 return ple_window; 5722 5723 val = min(val, ple_window_actual_max); 5724 5725 if (ple_window_grow < ple_window) 5726 val *= ple_window_grow; 5727 else 5728 val += ple_window_grow; 5729 5730 return val; 5731} 5732 5733static int __shrink_ple_window(int val, int modifier, int minimum) 5734{ 5735 if (modifier < 1) 5736 return ple_window; 5737 5738 if (modifier < ple_window) 5739 val /= modifier; 5740 else 5741 val -= modifier; 5742 5743 return max(val, minimum); 5744} 5745 5746static void grow_ple_window(struct kvm_vcpu *vcpu) 5747{ 5748 struct vcpu_vmx *vmx = to_vmx(vcpu); 5749 int old = vmx->ple_window; 5750 5751 vmx->ple_window = __grow_ple_window(old); 5752 5753 if (vmx->ple_window != old) 5754 vmx->ple_window_dirty = true; 5755 5756 trace_kvm_ple_window_grow(vcpu->vcpu_id, vmx->ple_window, old); 5757} 5758 5759static void shrink_ple_window(struct kvm_vcpu *vcpu) 5760{ 5761 struct vcpu_vmx *vmx = to_vmx(vcpu); 5762 int old = vmx->ple_window; 5763 5764 vmx->ple_window = __shrink_ple_window(old, 5765 ple_window_shrink, ple_window); 5766 5767 if (vmx->ple_window != old) 5768 vmx->ple_window_dirty = true; 5769 5770 trace_kvm_ple_window_shrink(vcpu->vcpu_id, vmx->ple_window, old); 5771} 5772 5773/* 5774 * ple_window_actual_max is computed to be one grow_ple_window() below 5775 * ple_window_max. (See __grow_ple_window for the reason.) 5776 * This prevents overflows, because ple_window_max is int. 5777 * ple_window_max effectively rounded down to a multiple of ple_window_grow in 5778 * this process. 5779 * ple_window_max is also prevented from setting vmx->ple_window < ple_window. 5780 */ 5781static void update_ple_window_actual_max(void) 5782{ 5783 ple_window_actual_max = 5784 __shrink_ple_window(max(ple_window_max, ple_window), 5785 ple_window_grow, INT_MIN); 5786} 5787 5788/* 5789 * Indicate a busy-waiting vcpu in spinlock. We do not enable the PAUSE 5790 * exiting, so only get here on cpu with PAUSE-Loop-Exiting. 5791 */ 5792static int handle_pause(struct kvm_vcpu *vcpu) 5793{ 5794 if (ple_gap) 5795 grow_ple_window(vcpu); 5796 5797 skip_emulated_instruction(vcpu); 5798 kvm_vcpu_on_spin(vcpu); 5799 5800 return 1; 5801} 5802 5803static int handle_nop(struct kvm_vcpu *vcpu) 5804{ 5805 skip_emulated_instruction(vcpu); 5806 return 1; 5807} 5808 5809static int handle_mwait(struct kvm_vcpu *vcpu) 5810{ 5811 printk_once(KERN_WARNING "kvm: MWAIT instruction emulated as NOP!\n"); 5812 return handle_nop(vcpu); 5813} 5814 5815static int handle_monitor(struct kvm_vcpu *vcpu) 5816{ 5817 printk_once(KERN_WARNING "kvm: MONITOR instruction emulated as NOP!\n"); 5818 return handle_nop(vcpu); 5819} 5820 5821/* 5822 * To run an L2 guest, we need a vmcs02 based on the L1-specified vmcs12. 
5823 * We could reuse a single VMCS for all the L2 guests, but we also want the 5824 * option to allocate a separate vmcs02 for each separate loaded vmcs12 - this 5825 * allows keeping them loaded on the processor, and in the future will allow 5826 * optimizations where prepare_vmcs02 doesn't need to set all the fields on 5827 * every entry if they never change. 5828 * So we keep, in vmx->nested.vmcs02_pool, a cache of size VMCS02_POOL_SIZE 5829 * (>=0) with a vmcs02 for each recently loaded vmcs12s, most recent first. 5830 * 5831 * The following functions allocate and free a vmcs02 in this pool. 5832 */ 5833 5834/* Get a VMCS from the pool to use as vmcs02 for the current vmcs12. */ 5835static struct loaded_vmcs *nested_get_current_vmcs02(struct vcpu_vmx *vmx) 5836{ 5837 struct vmcs02_list *item; 5838 list_for_each_entry(item, &vmx->nested.vmcs02_pool, list) 5839 if (item->vmptr == vmx->nested.current_vmptr) { 5840 list_move(&item->list, &vmx->nested.vmcs02_pool); 5841 return &item->vmcs02; 5842 } 5843 5844 if (vmx->nested.vmcs02_num >= max(VMCS02_POOL_SIZE, 1)) { 5845 /* Recycle the least recently used VMCS. */ 5846 item = list_entry(vmx->nested.vmcs02_pool.prev, 5847 struct vmcs02_list, list); 5848 item->vmptr = vmx->nested.current_vmptr; 5849 list_move(&item->list, &vmx->nested.vmcs02_pool); 5850 return &item->vmcs02; 5851 } 5852 5853 /* Create a new VMCS */ 5854 item = kmalloc(sizeof(struct vmcs02_list), GFP_KERNEL); 5855 if (!item) 5856 return NULL; 5857 item->vmcs02.vmcs = alloc_vmcs(); 5858 if (!item->vmcs02.vmcs) { 5859 kfree(item); 5860 return NULL; 5861 } 5862 loaded_vmcs_init(&item->vmcs02); 5863 item->vmptr = vmx->nested.current_vmptr; 5864 list_add(&(item->list), &(vmx->nested.vmcs02_pool)); 5865 vmx->nested.vmcs02_num++; 5866 return &item->vmcs02; 5867} 5868 5869/* Free and remove from pool a vmcs02 saved for a vmcs12 (if there is one) */ 5870static void nested_free_vmcs02(struct vcpu_vmx *vmx, gpa_t vmptr) 5871{ 5872 struct vmcs02_list *item; 5873 list_for_each_entry(item, &vmx->nested.vmcs02_pool, list) 5874 if (item->vmptr == vmptr) { 5875 free_loaded_vmcs(&item->vmcs02); 5876 list_del(&item->list); 5877 kfree(item); 5878 vmx->nested.vmcs02_num--; 5879 return; 5880 } 5881} 5882 5883/* 5884 * Free all VMCSs saved for this vcpu, except the one pointed by 5885 * vmx->loaded_vmcs. We must be running L1, so vmx->loaded_vmcs 5886 * must be &vmx->vmcs01. 5887 */ 5888static void nested_free_all_saved_vmcss(struct vcpu_vmx *vmx) 5889{ 5890 struct vmcs02_list *item, *n; 5891 5892 WARN_ON(vmx->loaded_vmcs != &vmx->vmcs01); 5893 list_for_each_entry_safe(item, n, &vmx->nested.vmcs02_pool, list) { 5894 /* 5895 * Something will leak if the above WARN triggers. Better than 5896 * a use-after-free. 5897 */ 5898 if (vmx->loaded_vmcs == &item->vmcs02) 5899 continue; 5900 5901 free_loaded_vmcs(&item->vmcs02); 5902 list_del(&item->list); 5903 kfree(item); 5904 vmx->nested.vmcs02_num--; 5905 } 5906} 5907 5908/* 5909 * The following 3 functions, nested_vmx_succeed()/failValid()/failInvalid(), 5910 * set the success or error code of an emulated VMX instruction, as specified 5911 * by Vol 2B, VMX Instruction Reference, "Conventions". 
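 *
 * In short, mirroring the helpers below (a summary, not additional logic):
 *   VMsucceed:     CF, PF, AF, ZF, SF and OF are all cleared
 *   VMfailInvalid: CF is set and the other five flags are cleared
 *                  (used when there is no current VMCS)
 *   VMfailValid:   ZF is set, the other five flags are cleared, and the
 *                  error code is stored in the vmcs12's VM_INSTRUCTION_ERROR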
5912 */ 5913 static void nested_vmx_succeed(struct kvm_vcpu *vcpu) 5914 { 5915 vmx_set_rflags(vcpu, vmx_get_rflags(vcpu) 5916 & ~(X86_EFLAGS_CF | X86_EFLAGS_PF | X86_EFLAGS_AF | 5917 X86_EFLAGS_ZF | X86_EFLAGS_SF | X86_EFLAGS_OF)); 5918} 5919 5920static void nested_vmx_failInvalid(struct kvm_vcpu *vcpu) 5921{ 5922 vmx_set_rflags(vcpu, (vmx_get_rflags(vcpu) 5923 & ~(X86_EFLAGS_PF | X86_EFLAGS_AF | X86_EFLAGS_ZF | 5924 X86_EFLAGS_SF | X86_EFLAGS_OF)) 5925 | X86_EFLAGS_CF); 5926} 5927 5928static void nested_vmx_failValid(struct kvm_vcpu *vcpu, 5929 u32 vm_instruction_error) 5930{ 5931 if (to_vmx(vcpu)->nested.current_vmptr == -1ull) { 5932 /* 5933 * failValid writes the error number to the current VMCS, which 5934 * can't be done when there isn't a current VMCS. 5935 */ 5936 nested_vmx_failInvalid(vcpu); 5937 return; 5938 } 5939 vmx_set_rflags(vcpu, (vmx_get_rflags(vcpu) 5940 & ~(X86_EFLAGS_CF | X86_EFLAGS_PF | X86_EFLAGS_AF | 5941 X86_EFLAGS_SF | X86_EFLAGS_OF)) 5942 | X86_EFLAGS_ZF); 5943 get_vmcs12(vcpu)->vm_instruction_error = vm_instruction_error; 5944 /* 5945 * We don't need to force a shadow sync because 5946 * VM_INSTRUCTION_ERROR is not shadowed 5947 */ 5948} 5949 5950static enum hrtimer_restart vmx_preemption_timer_fn(struct hrtimer *timer) 5951{ 5952 struct vcpu_vmx *vmx = 5953 container_of(timer, struct vcpu_vmx, nested.preemption_timer); 5954 5955 vmx->nested.preemption_timer_expired = true; 5956 kvm_make_request(KVM_REQ_EVENT, &vmx->vcpu); 5957 kvm_vcpu_kick(&vmx->vcpu); 5958 5959 return HRTIMER_NORESTART; 5960} 5961 5962/* 5963 * Decode the memory-address operand of a vmx instruction, as recorded on an 5964 * exit caused by such an instruction (run by a guest hypervisor). 5965 * On success, returns 0. When the operand is invalid, returns 1 and throws 5966 * #UD or #GP. 5967 */ 5968static int get_vmx_mem_address(struct kvm_vcpu *vcpu, 5969 unsigned long exit_qualification, 5970 u32 vmx_instruction_info, gva_t *ret) 5971{ 5972 /* 5973 * According to Vol. 3B, "Information for VM Exits Due to Instruction 5974 * Execution", on an exit, vmx_instruction_info holds most of the 5975 * addressing components of the operand. Only the displacement part 5976 * is put in exit_qualification (see 3B, "Basic VM-Exit Information"). 5977 * For how an actual address is calculated from all these components, 5978 * refer to Vol. 1, "Operand Addressing".
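 *
 * As an illustrative sketch, using the bit fields decoded by the code below:
 *
 *   addr  = segment_base(seg_reg)
 *         + (base_is_valid  ? GPR[base_reg] : 0)
 *         + (index_is_valid ? GPR[index_reg] << scaling : 0)
 *         + exit_qualification;          (the displacement)
 *   if (addr_size == 1)                  (32-bit address size)
 *           addr &= 0xffffffff;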
5979 */ 5980 int scaling = vmx_instruction_info & 3; 5981 int addr_size = (vmx_instruction_info >> 7) & 7; 5982 bool is_reg = vmx_instruction_info & (1u << 10); 5983 int seg_reg = (vmx_instruction_info >> 15) & 7; 5984 int index_reg = (vmx_instruction_info >> 18) & 0xf; 5985 bool index_is_valid = !(vmx_instruction_info & (1u << 22)); 5986 int base_reg = (vmx_instruction_info >> 23) & 0xf; 5987 bool base_is_valid = !(vmx_instruction_info & (1u << 27)); 5988 5989 if (is_reg) { 5990 kvm_queue_exception(vcpu, UD_VECTOR); 5991 return 1; 5992 } 5993 5994 /* Addr = segment_base + offset */ 5995 /* offset = base + [index * scale] + displacement */ 5996 *ret = vmx_get_segment_base(vcpu, seg_reg); 5997 if (base_is_valid) 5998 *ret += kvm_register_read(vcpu, base_reg); 5999 if (index_is_valid) 6000 *ret += kvm_register_read(vcpu, index_reg)<<scaling; 6001 *ret += exit_qualification; /* holds the displacement */ 6002 6003 if (addr_size == 1) /* 32 bit */ 6004 *ret &= 0xffffffff; 6005 6006 /* 6007 * TODO: throw #GP (and return 1) in various cases that the VM* 6008 * instructions require it - e.g., offset beyond segment limit, 6009 * unusable or unreadable/unwritable segment, non-canonical 64-bit 6010 * address, and so on. Currently these are not checked. 6011 */ 6012 return 0; 6013} 6014 6015/* 6016 * This function performs the various checks including 6017 * - if it's 4KB aligned 6018 * - No bits beyond the physical address width are set 6019 * - Returns 0 on success or else 1 6020 * (Intel SDM Section 30.3) 6021 */ 6022static int nested_vmx_check_vmptr(struct kvm_vcpu *vcpu, int exit_reason, 6023 gpa_t *vmpointer) 6024{ 6025 gva_t gva; 6026 gpa_t vmptr; 6027 struct x86_exception e; 6028 struct page *page; 6029 struct vcpu_vmx *vmx = to_vmx(vcpu); 6030 int maxphyaddr = cpuid_maxphyaddr(vcpu); 6031 6032 if (get_vmx_mem_address(vcpu, vmcs_readl(EXIT_QUALIFICATION), 6033 vmcs_read32(VMX_INSTRUCTION_INFO), &gva)) 6034 return 1; 6035 6036 if (kvm_read_guest_virt(&vcpu->arch.emulate_ctxt, gva, &vmptr, 6037 sizeof(vmptr), &e)) { 6038 kvm_inject_page_fault(vcpu, &e); 6039 return 1; 6040 } 6041 6042 switch (exit_reason) { 6043 case EXIT_REASON_VMON: 6044 /* 6045 * SDM 3: 24.11.5 6046 * The first 4 bytes of VMXON region contain the supported 6047 * VMCS revision identifier 6048 * 6049 * Note - IA32_VMX_BASIC[48] will never be 1 6050 * for the nested case; 6051 * which replaces physical address width with 32 6052 * 6053 */ 6054 if (!PAGE_ALIGNED(vmptr) || (vmptr >> maxphyaddr)) { 6055 nested_vmx_failInvalid(vcpu); 6056 skip_emulated_instruction(vcpu); 6057 return 1; 6058 } 6059 6060 page = nested_get_page(vcpu, vmptr); 6061 if (page == NULL || 6062 *(u32 *)kmap(page) != VMCS12_REVISION) { 6063 nested_vmx_failInvalid(vcpu); 6064 kunmap(page); 6065 skip_emulated_instruction(vcpu); 6066 return 1; 6067 } 6068 kunmap(page); 6069 vmx->nested.vmxon_ptr = vmptr; 6070 break; 6071 case EXIT_REASON_VMCLEAR: 6072 if (!PAGE_ALIGNED(vmptr) || (vmptr >> maxphyaddr)) { 6073 nested_vmx_failValid(vcpu, 6074 VMXERR_VMCLEAR_INVALID_ADDRESS); 6075 skip_emulated_instruction(vcpu); 6076 return 1; 6077 } 6078 6079 if (vmptr == vmx->nested.vmxon_ptr) { 6080 nested_vmx_failValid(vcpu, 6081 VMXERR_VMCLEAR_VMXON_POINTER); 6082 skip_emulated_instruction(vcpu); 6083 return 1; 6084 } 6085 break; 6086 case EXIT_REASON_VMPTRLD: 6087 if (!PAGE_ALIGNED(vmptr) || (vmptr >> maxphyaddr)) { 6088 nested_vmx_failValid(vcpu, 6089 VMXERR_VMPTRLD_INVALID_ADDRESS); 6090 skip_emulated_instruction(vcpu); 6091 return 1; 6092 } 6093 6094 if (vmptr == 
vmx->nested.vmxon_ptr) { 6095 nested_vmx_failValid(vcpu, 6096 VMXERR_VMCLEAR_VMXON_POINTER); 6097 skip_emulated_instruction(vcpu); 6098 return 1; 6099 } 6100 break; 6101 default: 6102 return 1; /* shouldn't happen */ 6103 } 6104 6105 if (vmpointer) 6106 *vmpointer = vmptr; 6107 return 0; 6108} 6109 6110/* 6111 * Emulate the VMXON instruction. 6112 * Currently, we just remember that VMX is active, and do not save or even 6113 * inspect the argument to VMXON (the so-called "VMXON pointer") because we 6114 * do not currently need to store anything in that guest-allocated memory 6115 * region. Consequently, VMCLEAR and VMPTRLD also do not verify that the their 6116 * argument is different from the VMXON pointer (which the spec says they do). 6117 */ 6118static int handle_vmon(struct kvm_vcpu *vcpu) 6119{ 6120 struct kvm_segment cs; 6121 struct vcpu_vmx *vmx = to_vmx(vcpu); 6122 struct vmcs *shadow_vmcs; 6123 const u64 VMXON_NEEDED_FEATURES = FEATURE_CONTROL_LOCKED 6124 | FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX; 6125 6126 /* The Intel VMX Instruction Reference lists a bunch of bits that 6127 * are prerequisite to running VMXON, most notably cr4.VMXE must be 6128 * set to 1 (see vmx_set_cr4() for when we allow the guest to set this). 6129 * Otherwise, we should fail with #UD. We test these now: 6130 */ 6131 if (!kvm_read_cr4_bits(vcpu, X86_CR4_VMXE) || 6132 !kvm_read_cr0_bits(vcpu, X86_CR0_PE) || 6133 (vmx_get_rflags(vcpu) & X86_EFLAGS_VM)) { 6134 kvm_queue_exception(vcpu, UD_VECTOR); 6135 return 1; 6136 } 6137 6138 vmx_get_segment(vcpu, &cs, VCPU_SREG_CS); 6139 if (is_long_mode(vcpu) && !cs.l) { 6140 kvm_queue_exception(vcpu, UD_VECTOR); 6141 return 1; 6142 } 6143 6144 if (vmx_get_cpl(vcpu)) { 6145 kvm_inject_gp(vcpu, 0); 6146 return 1; 6147 } 6148 6149 if (nested_vmx_check_vmptr(vcpu, EXIT_REASON_VMON, NULL)) 6150 return 1; 6151 6152 if (vmx->nested.vmxon) { 6153 nested_vmx_failValid(vcpu, VMXERR_VMXON_IN_VMX_ROOT_OPERATION); 6154 skip_emulated_instruction(vcpu); 6155 return 1; 6156 } 6157 6158 if ((vmx->nested.msr_ia32_feature_control & VMXON_NEEDED_FEATURES) 6159 != VMXON_NEEDED_FEATURES) { 6160 kvm_inject_gp(vcpu, 0); 6161 return 1; 6162 } 6163 6164 if (enable_shadow_vmcs) { 6165 shadow_vmcs = alloc_vmcs(); 6166 if (!shadow_vmcs) 6167 return -ENOMEM; 6168 /* mark vmcs as shadow */ 6169 shadow_vmcs->revision_id |= (1u << 31); 6170 /* init shadow vmcs */ 6171 vmcs_clear(shadow_vmcs); 6172 vmx->nested.current_shadow_vmcs = shadow_vmcs; 6173 } 6174 6175 INIT_LIST_HEAD(&(vmx->nested.vmcs02_pool)); 6176 vmx->nested.vmcs02_num = 0; 6177 6178 hrtimer_init(&vmx->nested.preemption_timer, CLOCK_MONOTONIC, 6179 HRTIMER_MODE_REL); 6180 vmx->nested.preemption_timer.function = vmx_preemption_timer_fn; 6181 6182 vmx->nested.vmxon = true; 6183 6184 skip_emulated_instruction(vcpu); 6185 nested_vmx_succeed(vcpu); 6186 return 1; 6187} 6188 6189/* 6190 * Intel's VMX Instruction Reference specifies a common set of prerequisites 6191 * for running VMX instructions (except VMXON, whose prerequisites are 6192 * slightly different). It also specifies what exception to inject otherwise. 
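 *
 * Concretely, the helper below injects #UD when VMXON has not been executed
 * or the vCPU is in virtual-8086 mode (or in compatibility mode while the
 * guest is in long mode), injects #GP(0) when CPL > 0, and returns 1 only
 * when the instruction may be emulated.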
6193 */ 6194static int nested_vmx_check_permission(struct kvm_vcpu *vcpu) 6195{ 6196 struct kvm_segment cs; 6197 struct vcpu_vmx *vmx = to_vmx(vcpu); 6198 6199 if (!vmx->nested.vmxon) { 6200 kvm_queue_exception(vcpu, UD_VECTOR); 6201 return 0; 6202 } 6203 6204 vmx_get_segment(vcpu, &cs, VCPU_SREG_CS); 6205 if ((vmx_get_rflags(vcpu) & X86_EFLAGS_VM) || 6206 (is_long_mode(vcpu) && !cs.l)) { 6207 kvm_queue_exception(vcpu, UD_VECTOR); 6208 return 0; 6209 } 6210 6211 if (vmx_get_cpl(vcpu)) { 6212 kvm_inject_gp(vcpu, 0); 6213 return 0; 6214 } 6215 6216 return 1; 6217} 6218 6219static inline void nested_release_vmcs12(struct vcpu_vmx *vmx) 6220{ 6221 u32 exec_control; 6222 if (vmx->nested.current_vmptr == -1ull) 6223 return; 6224 6225 /* current_vmptr and current_vmcs12 are always set/reset together */ 6226 if (WARN_ON(vmx->nested.current_vmcs12 == NULL)) 6227 return; 6228 6229 if (enable_shadow_vmcs) { 6230 /* copy to memory all shadowed fields in case 6231 they were modified */ 6232 copy_shadow_to_vmcs12(vmx); 6233 vmx->nested.sync_shadow_vmcs = false; 6234 exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL); 6235 exec_control &= ~SECONDARY_EXEC_SHADOW_VMCS; 6236 vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control); 6237 vmcs_write64(VMCS_LINK_POINTER, -1ull); 6238 } 6239 kunmap(vmx->nested.current_vmcs12_page); 6240 nested_release_page(vmx->nested.current_vmcs12_page); 6241 vmx->nested.current_vmptr = -1ull; 6242 vmx->nested.current_vmcs12 = NULL; 6243} 6244 6245/* 6246 * Free whatever needs to be freed from vmx->nested when L1 goes down, or 6247 * just stops using VMX. 6248 */ 6249static void free_nested(struct vcpu_vmx *vmx) 6250{ 6251 if (!vmx->nested.vmxon) 6252 return; 6253 6254 vmx->nested.vmxon = false; 6255 nested_release_vmcs12(vmx); 6256 if (enable_shadow_vmcs) 6257 free_vmcs(vmx->nested.current_shadow_vmcs); 6258 /* Unpin physical memory we referred to in current vmcs02 */ 6259 if (vmx->nested.apic_access_page) { 6260 nested_release_page(vmx->nested.apic_access_page); 6261 vmx->nested.apic_access_page = NULL; 6262 } 6263 if (vmx->nested.virtual_apic_page) { 6264 nested_release_page(vmx->nested.virtual_apic_page); 6265 vmx->nested.virtual_apic_page = NULL; 6266 } 6267 6268 nested_free_all_saved_vmcss(vmx); 6269} 6270 6271/* Emulate the VMXOFF instruction */ 6272static int handle_vmoff(struct kvm_vcpu *vcpu) 6273{ 6274 if (!nested_vmx_check_permission(vcpu)) 6275 return 1; 6276 free_nested(to_vmx(vcpu)); 6277 skip_emulated_instruction(vcpu); 6278 nested_vmx_succeed(vcpu); 6279 return 1; 6280} 6281 6282/* Emulate the VMCLEAR instruction */ 6283static int handle_vmclear(struct kvm_vcpu *vcpu) 6284{ 6285 struct vcpu_vmx *vmx = to_vmx(vcpu); 6286 gpa_t vmptr; 6287 struct vmcs12 *vmcs12; 6288 struct page *page; 6289 6290 if (!nested_vmx_check_permission(vcpu)) 6291 return 1; 6292 6293 if (nested_vmx_check_vmptr(vcpu, EXIT_REASON_VMCLEAR, &vmptr)) 6294 return 1; 6295 6296 if (vmptr == vmx->nested.current_vmptr) 6297 nested_release_vmcs12(vmx); 6298 6299 page = nested_get_page(vcpu, vmptr); 6300 if (page == NULL) { 6301 /* 6302 * For accurate processor emulation, VMCLEAR beyond available 6303 * physical memory should do nothing at all. 
However, it is 6304 * possible that a nested vmx bug, not a guest hypervisor bug, 6305 * resulted in this case, so let's shut down before doing any 6306 * more damage: 6307 */ 6308 kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu); 6309 return 1; 6310 } 6311 vmcs12 = kmap(page); 6312 vmcs12->launch_state = 0; 6313 kunmap(page); 6314 nested_release_page(page); 6315 6316 nested_free_vmcs02(vmx, vmptr); 6317 6318 skip_emulated_instruction(vcpu); 6319 nested_vmx_succeed(vcpu); 6320 return 1; 6321} 6322 6323static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch); 6324 6325/* Emulate the VMLAUNCH instruction */ 6326static int handle_vmlaunch(struct kvm_vcpu *vcpu) 6327{ 6328 return nested_vmx_run(vcpu, true); 6329} 6330 6331/* Emulate the VMRESUME instruction */ 6332static int handle_vmresume(struct kvm_vcpu *vcpu) 6333{ 6334 6335 return nested_vmx_run(vcpu, false); 6336} 6337 6338enum vmcs_field_type { 6339 VMCS_FIELD_TYPE_U16 = 0, 6340 VMCS_FIELD_TYPE_U64 = 1, 6341 VMCS_FIELD_TYPE_U32 = 2, 6342 VMCS_FIELD_TYPE_NATURAL_WIDTH = 3 6343}; 6344 6345static inline int vmcs_field_type(unsigned long field) 6346{ 6347 if (0x1 & field) /* the *_HIGH fields are all 32 bit */ 6348 return VMCS_FIELD_TYPE_U32; 6349 return (field >> 13) & 0x3 ; 6350} 6351 6352static inline int vmcs_field_readonly(unsigned long field) 6353{ 6354 return (((field >> 10) & 0x3) == 1); 6355} 6356 6357/* 6358 * Read a vmcs12 field. Since these can have varying lengths and we return 6359 * one type, we chose the biggest type (u64) and zero-extend the return value 6360 * to that size. Note that the caller, handle_vmread, might need to use only 6361 * some of the bits we return here (e.g., on 32-bit guests, only 32 bits of 6362 * 64-bit fields are to be returned). 6363 */ 6364static inline bool vmcs12_read_any(struct kvm_vcpu *vcpu, 6365 unsigned long field, u64 *ret) 6366{ 6367 short offset = vmcs_field_to_offset(field); 6368 char *p; 6369 6370 if (offset < 0) 6371 return 0; 6372 6373 p = ((char *)(get_vmcs12(vcpu))) + offset; 6374 6375 switch (vmcs_field_type(field)) { 6376 case VMCS_FIELD_TYPE_NATURAL_WIDTH: 6377 *ret = *((natural_width *)p); 6378 return 1; 6379 case VMCS_FIELD_TYPE_U16: 6380 *ret = *((u16 *)p); 6381 return 1; 6382 case VMCS_FIELD_TYPE_U32: 6383 *ret = *((u32 *)p); 6384 return 1; 6385 case VMCS_FIELD_TYPE_U64: 6386 *ret = *((u64 *)p); 6387 return 1; 6388 default: 6389 return 0; /* can never happen. */ 6390 } 6391} 6392 6393 6394static inline bool vmcs12_write_any(struct kvm_vcpu *vcpu, 6395 unsigned long field, u64 field_value){ 6396 short offset = vmcs_field_to_offset(field); 6397 char *p = ((char *) get_vmcs12(vcpu)) + offset; 6398 if (offset < 0) 6399 return false; 6400 6401 switch (vmcs_field_type(field)) { 6402 case VMCS_FIELD_TYPE_U16: 6403 *(u16 *)p = field_value; 6404 return true; 6405 case VMCS_FIELD_TYPE_U32: 6406 *(u32 *)p = field_value; 6407 return true; 6408 case VMCS_FIELD_TYPE_U64: 6409 *(u64 *)p = field_value; 6410 return true; 6411 case VMCS_FIELD_TYPE_NATURAL_WIDTH: 6412 *(natural_width *)p = field_value; 6413 return true; 6414 default: 6415 return false; /* can never happen. 
*/ 6416 } 6417 6418} 6419 6420static void copy_shadow_to_vmcs12(struct vcpu_vmx *vmx) 6421{ 6422 int i; 6423 unsigned long field; 6424 u64 field_value; 6425 struct vmcs *shadow_vmcs = vmx->nested.current_shadow_vmcs; 6426 const unsigned long *fields = shadow_read_write_fields; 6427 const int num_fields = max_shadow_read_write_fields; 6428 6429 preempt_disable(); 6430 6431 vmcs_load(shadow_vmcs); 6432 6433 for (i = 0; i < num_fields; i++) { 6434 field = fields[i]; 6435 switch (vmcs_field_type(field)) { 6436 case VMCS_FIELD_TYPE_U16: 6437 field_value = vmcs_read16(field); 6438 break; 6439 case VMCS_FIELD_TYPE_U32: 6440 field_value = vmcs_read32(field); 6441 break; 6442 case VMCS_FIELD_TYPE_U64: 6443 field_value = vmcs_read64(field); 6444 break; 6445 case VMCS_FIELD_TYPE_NATURAL_WIDTH: 6446 field_value = vmcs_readl(field); 6447 break; 6448 } 6449 vmcs12_write_any(&vmx->vcpu, field, field_value); 6450 } 6451 6452 vmcs_clear(shadow_vmcs); 6453 vmcs_load(vmx->loaded_vmcs->vmcs); 6454 6455 preempt_enable(); 6456} 6457 6458static void copy_vmcs12_to_shadow(struct vcpu_vmx *vmx) 6459{ 6460 const unsigned long *fields[] = { 6461 shadow_read_write_fields, 6462 shadow_read_only_fields 6463 }; 6464 const int max_fields[] = { 6465 max_shadow_read_write_fields, 6466 max_shadow_read_only_fields 6467 }; 6468 int i, q; 6469 unsigned long field; 6470 u64 field_value = 0; 6471 struct vmcs *shadow_vmcs = vmx->nested.current_shadow_vmcs; 6472 6473 vmcs_load(shadow_vmcs); 6474 6475 for (q = 0; q < ARRAY_SIZE(fields); q++) { 6476 for (i = 0; i < max_fields[q]; i++) { 6477 field = fields[q][i]; 6478 vmcs12_read_any(&vmx->vcpu, field, &field_value); 6479 6480 switch (vmcs_field_type(field)) { 6481 case VMCS_FIELD_TYPE_U16: 6482 vmcs_write16(field, (u16)field_value); 6483 break; 6484 case VMCS_FIELD_TYPE_U32: 6485 vmcs_write32(field, (u32)field_value); 6486 break; 6487 case VMCS_FIELD_TYPE_U64: 6488 vmcs_write64(field, (u64)field_value); 6489 break; 6490 case VMCS_FIELD_TYPE_NATURAL_WIDTH: 6491 vmcs_writel(field, (long)field_value); 6492 break; 6493 } 6494 } 6495 } 6496 6497 vmcs_clear(shadow_vmcs); 6498 vmcs_load(vmx->loaded_vmcs->vmcs); 6499} 6500 6501/* 6502 * VMX instructions which assume a current vmcs12 (i.e., that VMPTRLD was 6503 * used before) all generate the same failure when it is missing. 6504 */ 6505static int nested_vmx_check_vmcs12(struct kvm_vcpu *vcpu) 6506{ 6507 struct vcpu_vmx *vmx = to_vmx(vcpu); 6508 if (vmx->nested.current_vmptr == -1ull) { 6509 nested_vmx_failInvalid(vcpu); 6510 skip_emulated_instruction(vcpu); 6511 return 0; 6512 } 6513 return 1; 6514} 6515 6516static int handle_vmread(struct kvm_vcpu *vcpu) 6517{ 6518 unsigned long field; 6519 u64 field_value; 6520 unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION); 6521 u32 vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); 6522 gva_t gva = 0; 6523 6524 if (!nested_vmx_check_permission(vcpu) || 6525 !nested_vmx_check_vmcs12(vcpu)) 6526 return 1; 6527 6528 /* Decode instruction info and find the field to read */ 6529 field = kvm_register_readl(vcpu, (((vmx_instruction_info) >> 28) & 0xf)); 6530 /* Read the field, zero-extended to a u64 field_value */ 6531 if (!vmcs12_read_any(vcpu, field, &field_value)) { 6532 nested_vmx_failValid(vcpu, VMXERR_UNSUPPORTED_VMCS_COMPONENT); 6533 skip_emulated_instruction(vcpu); 6534 return 1; 6535 } 6536 /* 6537 * Now copy part of this value to register or memory, as requested. 
6538 * Note that the number of bits actually copied is 32 or 64 depending 6539 * on the guest's mode (32 or 64 bit), not on the given field's length. 6540 */ 6541 if (vmx_instruction_info & (1u << 10)) { 6542 kvm_register_writel(vcpu, (((vmx_instruction_info) >> 3) & 0xf), 6543 field_value); 6544 } else { 6545 if (get_vmx_mem_address(vcpu, exit_qualification, 6546 vmx_instruction_info, &gva)) 6547 return 1; 6548 /* _system ok, as nested_vmx_check_permission verified cpl=0 */ 6549 kvm_write_guest_virt_system(&vcpu->arch.emulate_ctxt, gva, 6550 &field_value, (is_long_mode(vcpu) ? 8 : 4), NULL); 6551 } 6552 6553 nested_vmx_succeed(vcpu); 6554 skip_emulated_instruction(vcpu); 6555 return 1; 6556} 6557 6558 6559static int handle_vmwrite(struct kvm_vcpu *vcpu) 6560{ 6561 unsigned long field; 6562 gva_t gva; 6563 unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION); 6564 u32 vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); 6565 /* The value to write might be 32 or 64 bits, depending on L1's long 6566 * mode, and eventually we need to write that into a field of several 6567 * possible lengths. The code below first zero-extends the value to 64 6568 * bits (field_value), and then copies only the appropriate number of 6569 * bits into the vmcs12 field. 6570 */ 6571 u64 field_value = 0; 6572 struct x86_exception e; 6573 6574 if (!nested_vmx_check_permission(vcpu) || 6575 !nested_vmx_check_vmcs12(vcpu)) 6576 return 1; 6577 6578 if (vmx_instruction_info & (1u << 10)) 6579 field_value = kvm_register_readl(vcpu, 6580 (((vmx_instruction_info) >> 3) & 0xf)); 6581 else { 6582 if (get_vmx_mem_address(vcpu, exit_qualification, 6583 vmx_instruction_info, &gva)) 6584 return 1; 6585 if (kvm_read_guest_virt(&vcpu->arch.emulate_ctxt, gva, 6586 &field_value, (is_64_bit_mode(vcpu) ? 
8 : 4), &e)) { 6587 kvm_inject_page_fault(vcpu, &e); 6588 return 1; 6589 } 6590 } 6591 6592 6593 field = kvm_register_readl(vcpu, (((vmx_instruction_info) >> 28) & 0xf)); 6594 if (vmcs_field_readonly(field)) { 6595 nested_vmx_failValid(vcpu, 6596 VMXERR_VMWRITE_READ_ONLY_VMCS_COMPONENT); 6597 skip_emulated_instruction(vcpu); 6598 return 1; 6599 } 6600 6601 if (!vmcs12_write_any(vcpu, field, field_value)) { 6602 nested_vmx_failValid(vcpu, VMXERR_UNSUPPORTED_VMCS_COMPONENT); 6603 skip_emulated_instruction(vcpu); 6604 return 1; 6605 } 6606 6607 nested_vmx_succeed(vcpu); 6608 skip_emulated_instruction(vcpu); 6609 return 1; 6610} 6611 6612/* Emulate the VMPTRLD instruction */ 6613static int handle_vmptrld(struct kvm_vcpu *vcpu) 6614{ 6615 struct vcpu_vmx *vmx = to_vmx(vcpu); 6616 gpa_t vmptr; 6617 u32 exec_control; 6618 6619 if (!nested_vmx_check_permission(vcpu)) 6620 return 1; 6621 6622 if (nested_vmx_check_vmptr(vcpu, EXIT_REASON_VMPTRLD, &vmptr)) 6623 return 1; 6624 6625 if (vmx->nested.current_vmptr != vmptr) { 6626 struct vmcs12 *new_vmcs12; 6627 struct page *page; 6628 page = nested_get_page(vcpu, vmptr); 6629 if (page == NULL) { 6630 nested_vmx_failInvalid(vcpu); 6631 skip_emulated_instruction(vcpu); 6632 return 1; 6633 } 6634 new_vmcs12 = kmap(page); 6635 if (new_vmcs12->revision_id != VMCS12_REVISION) { 6636 kunmap(page); 6637 nested_release_page_clean(page); 6638 nested_vmx_failValid(vcpu, 6639 VMXERR_VMPTRLD_INCORRECT_VMCS_REVISION_ID); 6640 skip_emulated_instruction(vcpu); 6641 return 1; 6642 } 6643 6644 nested_release_vmcs12(vmx); 6645 vmx->nested.current_vmptr = vmptr; 6646 vmx->nested.current_vmcs12 = new_vmcs12; 6647 vmx->nested.current_vmcs12_page = page; 6648 if (enable_shadow_vmcs) { 6649 exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL); 6650 exec_control |= SECONDARY_EXEC_SHADOW_VMCS; 6651 vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control); 6652 vmcs_write64(VMCS_LINK_POINTER, 6653 __pa(vmx->nested.current_shadow_vmcs)); 6654 vmx->nested.sync_shadow_vmcs = true; 6655 } 6656 } 6657 6658 nested_vmx_succeed(vcpu); 6659 skip_emulated_instruction(vcpu); 6660 return 1; 6661} 6662 6663/* Emulate the VMPTRST instruction */ 6664static int handle_vmptrst(struct kvm_vcpu *vcpu) 6665{ 6666 unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION); 6667 u32 vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); 6668 gva_t vmcs_gva; 6669 struct x86_exception e; 6670 6671 if (!nested_vmx_check_permission(vcpu)) 6672 return 1; 6673 6674 if (get_vmx_mem_address(vcpu, exit_qualification, 6675 vmx_instruction_info, &vmcs_gva)) 6676 return 1; 6677 /* ok to use *_system, as nested_vmx_check_permission verified cpl=0 */ 6678 if (kvm_write_guest_virt_system(&vcpu->arch.emulate_ctxt, vmcs_gva, 6679 (void *)&to_vmx(vcpu)->nested.current_vmptr, 6680 sizeof(u64), &e)) { 6681 kvm_inject_page_fault(vcpu, &e); 6682 return 1; 6683 } 6684 nested_vmx_succeed(vcpu); 6685 skip_emulated_instruction(vcpu); 6686 return 1; 6687} 6688 6689/* Emulate the INVEPT instruction */ 6690static int handle_invept(struct kvm_vcpu *vcpu) 6691{ 6692 u32 vmx_instruction_info, types; 6693 unsigned long type; 6694 gva_t gva; 6695 struct x86_exception e; 6696 struct { 6697 u64 eptp, gpa; 6698 } operand; 6699 6700 if (!(nested_vmx_secondary_ctls_high & SECONDARY_EXEC_ENABLE_EPT) || 6701 !(nested_vmx_ept_caps & VMX_EPT_INVEPT_BIT)) { 6702 kvm_queue_exception(vcpu, UD_VECTOR); 6703 return 1; 6704 } 6705 6706 if (!nested_vmx_check_permission(vcpu)) 6707 return 1; 6708 6709 if (!kvm_read_cr0_bits(vcpu, X86_CR0_PE)) 
{ 6710 kvm_queue_exception(vcpu, UD_VECTOR); 6711 return 1; 6712 } 6713 6714 vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); 6715 type = kvm_register_readl(vcpu, (vmx_instruction_info >> 28) & 0xf); 6716 6717 types = (nested_vmx_ept_caps >> VMX_EPT_EXTENT_SHIFT) & 6; 6718 6719 if (!(types & (1UL << type))) { 6720 nested_vmx_failValid(vcpu, 6721 VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID); 6722 return 1; 6723 } 6724 6725 /* According to the Intel VMX instruction reference, the memory 6726 * operand is read even if it isn't needed (e.g., for type==global) 6727 */ 6728 if (get_vmx_mem_address(vcpu, vmcs_readl(EXIT_QUALIFICATION), 6729 vmx_instruction_info, &gva)) 6730 return 1; 6731 if (kvm_read_guest_virt(&vcpu->arch.emulate_ctxt, gva, &operand, 6732 sizeof(operand), &e)) { 6733 kvm_inject_page_fault(vcpu, &e); 6734 return 1; 6735 } 6736 6737 switch (type) { 6738 case VMX_EPT_EXTENT_GLOBAL: 6739 kvm_mmu_sync_roots(vcpu); 6740 kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); 6741 nested_vmx_succeed(vcpu); 6742 break; 6743 default: 6744 /* Trap single context invalidation invept calls */ 6745 BUG_ON(1); 6746 break; 6747 } 6748 6749 skip_emulated_instruction(vcpu); 6750 return 1; 6751} 6752 6753static int handle_invvpid(struct kvm_vcpu *vcpu) 6754{ 6755 kvm_queue_exception(vcpu, UD_VECTOR); 6756 return 1; 6757} 6758 6759/* 6760 * The exit handlers return 1 if the exit was handled fully and guest execution 6761 * may resume. Otherwise they set the kvm_run parameter to indicate what needs 6762 * to be done to userspace and return 0. 6763 */ 6764static int (*const kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = { 6765 [EXIT_REASON_EXCEPTION_NMI] = handle_exception, 6766 [EXIT_REASON_EXTERNAL_INTERRUPT] = handle_external_interrupt, 6767 [EXIT_REASON_TRIPLE_FAULT] = handle_triple_fault, 6768 [EXIT_REASON_NMI_WINDOW] = handle_nmi_window, 6769 [EXIT_REASON_IO_INSTRUCTION] = handle_io, 6770 [EXIT_REASON_CR_ACCESS] = handle_cr, 6771 [EXIT_REASON_DR_ACCESS] = handle_dr, 6772 [EXIT_REASON_CPUID] = handle_cpuid, 6773 [EXIT_REASON_MSR_READ] = handle_rdmsr, 6774 [EXIT_REASON_MSR_WRITE] = handle_wrmsr, 6775 [EXIT_REASON_PENDING_INTERRUPT] = handle_interrupt_window, 6776 [EXIT_REASON_HLT] = handle_halt, 6777 [EXIT_REASON_INVD] = handle_invd, 6778 [EXIT_REASON_INVLPG] = handle_invlpg, 6779 [EXIT_REASON_RDPMC] = handle_rdpmc, 6780 [EXIT_REASON_VMCALL] = handle_vmcall, 6781 [EXIT_REASON_VMCLEAR] = handle_vmclear, 6782 [EXIT_REASON_VMLAUNCH] = handle_vmlaunch, 6783 [EXIT_REASON_VMPTRLD] = handle_vmptrld, 6784 [EXIT_REASON_VMPTRST] = handle_vmptrst, 6785 [EXIT_REASON_VMREAD] = handle_vmread, 6786 [EXIT_REASON_VMRESUME] = handle_vmresume, 6787 [EXIT_REASON_VMWRITE] = handle_vmwrite, 6788 [EXIT_REASON_VMOFF] = handle_vmoff, 6789 [EXIT_REASON_VMON] = handle_vmon, 6790 [EXIT_REASON_TPR_BELOW_THRESHOLD] = handle_tpr_below_threshold, 6791 [EXIT_REASON_APIC_ACCESS] = handle_apic_access, 6792 [EXIT_REASON_APIC_WRITE] = handle_apic_write, 6793 [EXIT_REASON_EOI_INDUCED] = handle_apic_eoi_induced, 6794 [EXIT_REASON_WBINVD] = handle_wbinvd, 6795 [EXIT_REASON_XSETBV] = handle_xsetbv, 6796 [EXIT_REASON_TASK_SWITCH] = handle_task_switch, 6797 [EXIT_REASON_MCE_DURING_VMENTRY] = handle_machine_check, 6798 [EXIT_REASON_EPT_VIOLATION] = handle_ept_violation, 6799 [EXIT_REASON_EPT_MISCONFIG] = handle_ept_misconfig, 6800 [EXIT_REASON_PAUSE_INSTRUCTION] = handle_pause, 6801 [EXIT_REASON_MWAIT_INSTRUCTION] = handle_mwait, 6802 [EXIT_REASON_MONITOR_INSTRUCTION] = handle_monitor, 6803 [EXIT_REASON_INVEPT] = handle_invept, 6804 
[EXIT_REASON_INVVPID] = handle_invvpid, 6805}; 6806 6807static const int kvm_vmx_max_exit_handlers = 6808 ARRAY_SIZE(kvm_vmx_exit_handlers); 6809 6810static bool nested_vmx_exit_handled_io(struct kvm_vcpu *vcpu, 6811 struct vmcs12 *vmcs12) 6812{ 6813 unsigned long exit_qualification; 6814 gpa_t bitmap, last_bitmap; 6815 unsigned int port; 6816 int size; 6817 u8 b; 6818 6819 if (!nested_cpu_has(vmcs12, CPU_BASED_USE_IO_BITMAPS)) 6820 return nested_cpu_has(vmcs12, CPU_BASED_UNCOND_IO_EXITING); 6821 6822 exit_qualification = vmcs_readl(EXIT_QUALIFICATION); 6823 6824 port = exit_qualification >> 16; 6825 size = (exit_qualification & 7) + 1; 6826 6827 last_bitmap = (gpa_t)-1; 6828 b = -1; 6829 6830 while (size > 0) { 6831 if (port < 0x8000) 6832 bitmap = vmcs12->io_bitmap_a; 6833 else if (port < 0x10000) 6834 bitmap = vmcs12->io_bitmap_b; 6835 else 6836 return 1; 6837 bitmap += (port & 0x7fff) / 8; 6838 6839 if (last_bitmap != bitmap) 6840 if (kvm_read_guest(vcpu->kvm, bitmap, &b, 1)) 6841 return 1; 6842 if (b & (1 << (port & 7))) 6843 return 1; 6844 6845 port++; 6846 size--; 6847 last_bitmap = bitmap; 6848 } 6849 6850 return 0; 6851} 6852 6853/* 6854 * Return 1 if we should exit from L2 to L1 to handle an MSR access, 6855 * rather than handle it ourselves in L0. I.e., check whether L1 expressed 6856 * disinterest in the current event (read or write a specific MSR) by using an 6857 * MSR bitmap. This may be the case even when L0 doesn't use MSR bitmaps. 6858 */ 6859static bool nested_vmx_exit_handled_msr(struct kvm_vcpu *vcpu, 6860 struct vmcs12 *vmcs12, u32 exit_reason) 6861{ 6862 u32 msr_index = vcpu->arch.regs[VCPU_REGS_RCX]; 6863 gpa_t bitmap; 6864 6865 if (!nested_cpu_has(vmcs12, CPU_BASED_USE_MSR_BITMAPS)) 6866 return 1; 6867 6868 /* 6869 * The MSR_BITMAP page is divided into four 1024-byte bitmaps, 6870 * for the four combinations of read/write and low/high MSR numbers. 6871 * First we need to figure out which of the four to use: 6872 */ 6873 bitmap = vmcs12->msr_bitmap; 6874 if (exit_reason == EXIT_REASON_MSR_WRITE) 6875 bitmap += 2048; 6876 if (msr_index >= 0xc0000000) { 6877 msr_index -= 0xc0000000; 6878 bitmap += 1024; 6879 } 6880 6881 /* Then read the msr_index'th bit from this bitmap: */ 6882 if (msr_index < 1024*8) { 6883 unsigned char b; 6884 if (kvm_read_guest(vcpu->kvm, bitmap + msr_index/8, &b, 1)) 6885 return 1; 6886 return 1 & (b >> (msr_index & 7)); 6887 } else 6888 return 1; /* let L1 handle the wrong parameter */ 6889} 6890 6891/* 6892 * Return 1 if we should exit from L2 to L1 to handle a CR access exit, 6893 * rather than handle it ourselves in L0. I.e., check if L1 wanted to 6894 * intercept (via guest_host_mask etc.) the current event.
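 *
 * As with bare-metal VMX, a bit set in cr0/cr4_guest_host_mask means L1 owns
 * that bit: a MOV-to-CR that would change it relative to the corresponding
 * read shadow must therefore be reflected to L1.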
6895 */ 6896static bool nested_vmx_exit_handled_cr(struct kvm_vcpu *vcpu, 6897 struct vmcs12 *vmcs12) 6898{ 6899 unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION); 6900 int cr = exit_qualification & 15; 6901 int reg = (exit_qualification >> 8) & 15; 6902 unsigned long val = kvm_register_readl(vcpu, reg); 6903 6904 switch ((exit_qualification >> 4) & 3) { 6905 case 0: /* mov to cr */ 6906 switch (cr) { 6907 case 0: 6908 if (vmcs12->cr0_guest_host_mask & 6909 (val ^ vmcs12->cr0_read_shadow)) 6910 return 1; 6911 break; 6912 case 3: 6913 if ((vmcs12->cr3_target_count >= 1 && 6914 vmcs12->cr3_target_value0 == val) || 6915 (vmcs12->cr3_target_count >= 2 && 6916 vmcs12->cr3_target_value1 == val) || 6917 (vmcs12->cr3_target_count >= 3 && 6918 vmcs12->cr3_target_value2 == val) || 6919 (vmcs12->cr3_target_count >= 4 && 6920 vmcs12->cr3_target_value3 == val)) 6921 return 0; 6922 if (nested_cpu_has(vmcs12, CPU_BASED_CR3_LOAD_EXITING)) 6923 return 1; 6924 break; 6925 case 4: 6926 if (vmcs12->cr4_guest_host_mask & 6927 (vmcs12->cr4_read_shadow ^ val)) 6928 return 1; 6929 break; 6930 case 8: 6931 if (nested_cpu_has(vmcs12, CPU_BASED_CR8_LOAD_EXITING)) 6932 return 1; 6933 break; 6934 } 6935 break; 6936 case 2: /* clts */ 6937 if ((vmcs12->cr0_guest_host_mask & X86_CR0_TS) && 6938 (vmcs12->cr0_read_shadow & X86_CR0_TS)) 6939 return 1; 6940 break; 6941 case 1: /* mov from cr */ 6942 switch (cr) { 6943 case 3: 6944 if (vmcs12->cpu_based_vm_exec_control & 6945 CPU_BASED_CR3_STORE_EXITING) 6946 return 1; 6947 break; 6948 case 8: 6949 if (vmcs12->cpu_based_vm_exec_control & 6950 CPU_BASED_CR8_STORE_EXITING) 6951 return 1; 6952 break; 6953 } 6954 break; 6955 case 3: /* lmsw */ 6956 /* 6957 * lmsw can change bits 1..3 of cr0, and only set bit 0 of 6958 * cr0. Other attempted changes are ignored, with no exit. 6959 */ 6960 if (vmcs12->cr0_guest_host_mask & 0xe & 6961 (val ^ vmcs12->cr0_read_shadow)) 6962 return 1; 6963 if ((vmcs12->cr0_guest_host_mask & 0x1) && 6964 !(vmcs12->cr0_read_shadow & 0x1) && 6965 (val & 0x1)) 6966 return 1; 6967 break; 6968 } 6969 return 0; 6970} 6971 6972/* 6973 * Return 1 if we should exit from L2 to L1 to handle an exit, or 0 if we 6974 * should handle it ourselves in L0 (and then continue L2). Only call this 6975 * when in is_guest_mode (L2). 
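 *
 * A return value of 1 makes vmx_handle_exit() reflect the exit to L1 via
 * nested_vmx_vmexit(); a return value of 0 means L0 handles the exit itself
 * and then resumes L2.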
6976 */ 6977static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu) 6978{ 6979 u32 intr_info = vmcs_read32(VM_EXIT_INTR_INFO); 6980 struct vcpu_vmx *vmx = to_vmx(vcpu); 6981 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 6982 u32 exit_reason = vmx->exit_reason; 6983 6984 trace_kvm_nested_vmexit(kvm_rip_read(vcpu), exit_reason, 6985 vmcs_readl(EXIT_QUALIFICATION), 6986 vmx->idt_vectoring_info, 6987 intr_info, 6988 vmcs_read32(VM_EXIT_INTR_ERROR_CODE), 6989 KVM_ISA_VMX); 6990 6991 if (vmx->nested.nested_run_pending) 6992 return 0; 6993 6994 if (unlikely(vmx->fail)) { 6995 pr_info_ratelimited("%s failed vm entry %x\n", __func__, 6996 vmcs_read32(VM_INSTRUCTION_ERROR)); 6997 return 1; 6998 } 6999 7000 switch (exit_reason) { 7001 case EXIT_REASON_EXCEPTION_NMI: 7002 if (!is_exception(intr_info)) 7003 return 0; 7004 else if (is_page_fault(intr_info)) 7005 return enable_ept; 7006 else if (is_no_device(intr_info) && 7007 !(vmcs12->guest_cr0 & X86_CR0_TS)) 7008 return 0; 7009 return vmcs12->exception_bitmap & 7010 (1u << (intr_info & INTR_INFO_VECTOR_MASK)); 7011 case EXIT_REASON_EXTERNAL_INTERRUPT: 7012 return 0; 7013 case EXIT_REASON_TRIPLE_FAULT: 7014 return 1; 7015 case EXIT_REASON_PENDING_INTERRUPT: 7016 return nested_cpu_has(vmcs12, CPU_BASED_VIRTUAL_INTR_PENDING); 7017 case EXIT_REASON_NMI_WINDOW: 7018 return nested_cpu_has(vmcs12, CPU_BASED_VIRTUAL_NMI_PENDING); 7019 case EXIT_REASON_TASK_SWITCH: 7020 return 1; 7021 case EXIT_REASON_CPUID: 7022 if (kvm_register_read(vcpu, VCPU_REGS_RAX) == 0xa) 7023 return 0; 7024 return 1; 7025 case EXIT_REASON_HLT: 7026 return nested_cpu_has(vmcs12, CPU_BASED_HLT_EXITING); 7027 case EXIT_REASON_INVD: 7028 return 1; 7029 case EXIT_REASON_INVLPG: 7030 return nested_cpu_has(vmcs12, CPU_BASED_INVLPG_EXITING); 7031 case EXIT_REASON_RDPMC: 7032 return nested_cpu_has(vmcs12, CPU_BASED_RDPMC_EXITING); 7033 case EXIT_REASON_RDTSC: 7034 return nested_cpu_has(vmcs12, CPU_BASED_RDTSC_EXITING); 7035 case EXIT_REASON_VMCALL: case EXIT_REASON_VMCLEAR: 7036 case EXIT_REASON_VMLAUNCH: case EXIT_REASON_VMPTRLD: 7037 case EXIT_REASON_VMPTRST: case EXIT_REASON_VMREAD: 7038 case EXIT_REASON_VMRESUME: case EXIT_REASON_VMWRITE: 7039 case EXIT_REASON_VMOFF: case EXIT_REASON_VMON: 7040 case EXIT_REASON_INVEPT: case EXIT_REASON_INVVPID: 7041 /* 7042 * VMX instructions trap unconditionally. This allows L1 to 7043 * emulate them for its L2 guest, i.e., allows 3-level nesting! 
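 * (There is no execution control for these: any VMX instruction executed in
 * VMX non-root mode causes a VM exit, so L0 always observes L2's VMX
 * instructions and can reflect them to L1 for emulation.)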
7044 */ 7045 return 1; 7046 case EXIT_REASON_CR_ACCESS: 7047 return nested_vmx_exit_handled_cr(vcpu, vmcs12); 7048 case EXIT_REASON_DR_ACCESS: 7049 return nested_cpu_has(vmcs12, CPU_BASED_MOV_DR_EXITING); 7050 case EXIT_REASON_IO_INSTRUCTION: 7051 return nested_vmx_exit_handled_io(vcpu, vmcs12); 7052 case EXIT_REASON_MSR_READ: 7053 case EXIT_REASON_MSR_WRITE: 7054 return nested_vmx_exit_handled_msr(vcpu, vmcs12, exit_reason); 7055 case EXIT_REASON_INVALID_STATE: 7056 return 1; 7057 case EXIT_REASON_MWAIT_INSTRUCTION: 7058 return nested_cpu_has(vmcs12, CPU_BASED_MWAIT_EXITING); 7059 case EXIT_REASON_MONITOR_INSTRUCTION: 7060 return nested_cpu_has(vmcs12, CPU_BASED_MONITOR_EXITING); 7061 case EXIT_REASON_PAUSE_INSTRUCTION: 7062 return nested_cpu_has(vmcs12, CPU_BASED_PAUSE_EXITING) || 7063 nested_cpu_has2(vmcs12, 7064 SECONDARY_EXEC_PAUSE_LOOP_EXITING); 7065 case EXIT_REASON_MCE_DURING_VMENTRY: 7066 return 0; 7067 case EXIT_REASON_TPR_BELOW_THRESHOLD: 7068 return nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW); 7069 case EXIT_REASON_APIC_ACCESS: 7070 return nested_cpu_has2(vmcs12, 7071 SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES); 7072 case EXIT_REASON_EPT_VIOLATION: 7073 /* 7074 * L0 always deals with the EPT violation. If nested EPT is 7075 * used, and the nested mmu code discovers that the address is 7076 * missing in the guest EPT table (EPT12), the EPT violation 7077 * will be injected with nested_ept_inject_page_fault(). 7078 */ 7079 return 0; 7080 case EXIT_REASON_EPT_MISCONFIG: 7081 /* 7082 * L2 never directly uses L1's EPT, but rather L0's own EPT 7083 * table (shadow on EPT) or a merged EPT table that L0 built 7084 * (EPT on EPT). So any problems with the structure of the 7085 * table are L0's fault. 7086 */ 7087 return 0; 7088 case EXIT_REASON_WBINVD: 7089 return nested_cpu_has2(vmcs12, SECONDARY_EXEC_WBINVD_EXITING); 7090 case EXIT_REASON_XSETBV: 7091 return 1; 7092 default: 7093 return 1; 7094 } 7095} 7096 7097static void vmx_get_exit_info(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2) 7098{ 7099 *info1 = vmcs_readl(EXIT_QUALIFICATION); 7100 *info2 = vmcs_read32(VM_EXIT_INTR_INFO); 7101} 7102 7103/* 7104 * The guest has exited. See if we can fix it or if we need userspace 7105 * assistance. 7106 */ 7107static int vmx_handle_exit(struct kvm_vcpu *vcpu) 7108{ 7109 struct vcpu_vmx *vmx = to_vmx(vcpu); 7110 u32 exit_reason = vmx->exit_reason; 7111 u32 vectoring_info = vmx->idt_vectoring_info; 7112 7113 /* If guest state is invalid, start emulating */ 7114 if (vmx->emulation_required) 7115 return handle_invalid_guest_state(vcpu); 7116 7117 if (is_guest_mode(vcpu) && nested_vmx_exit_handled(vcpu)) { 7118 nested_vmx_vmexit(vcpu, exit_reason, 7119 vmcs_read32(VM_EXIT_INTR_INFO), 7120 vmcs_readl(EXIT_QUALIFICATION)); 7121 return 1; 7122 } 7123 7124 if (exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY) { 7125 vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY; 7126 vcpu->run->fail_entry.hardware_entry_failure_reason 7127 = exit_reason; 7128 return 0; 7129 } 7130 7131 if (unlikely(vmx->fail)) { 7132 vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY; 7133 vcpu->run->fail_entry.hardware_entry_failure_reason 7134 = vmcs_read32(VM_INSTRUCTION_ERROR); 7135 return 0; 7136 } 7137 7138 /* 7139 * Note: 7140 * Do not try to fix EXIT_REASON_EPT_MISCONFIG if it is caused by 7141 * event delivery, since it indicates the guest is accessing MMIO. 7142 * The VM exit can be triggered again after returning to the guest, 7143 * which would cause an infinite loop.
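 * The check below therefore reports the situation to userspace as
 * KVM_EXIT_INTERNAL_ERROR (suberror KVM_INTERNAL_ERROR_DELIVERY_EV) instead
 * of re-entering the guest.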
7144 */ 7145 if ((vectoring_info & VECTORING_INFO_VALID_MASK) && 7146 (exit_reason != EXIT_REASON_EXCEPTION_NMI && 7147 exit_reason != EXIT_REASON_EPT_VIOLATION && 7148 exit_reason != EXIT_REASON_TASK_SWITCH)) { 7149 vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; 7150 vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_DELIVERY_EV; 7151 vcpu->run->internal.ndata = 2; 7152 vcpu->run->internal.data[0] = vectoring_info; 7153 vcpu->run->internal.data[1] = exit_reason; 7154 return 0; 7155 } 7156 7157 if (unlikely(!cpu_has_virtual_nmis() && vmx->soft_vnmi_blocked && 7158 !(is_guest_mode(vcpu) && nested_cpu_has_virtual_nmis( 7159 get_vmcs12(vcpu))))) { 7160 if (vmx_interrupt_allowed(vcpu)) { 7161 vmx->soft_vnmi_blocked = 0; 7162 } else if (vmx->vnmi_blocked_time > 1000000000LL && 7163 vcpu->arch.nmi_pending) { 7164 /* 7165 * This CPU doesn't help us find the end of an 7166 * NMI-blocked window if the guest runs with IRQs 7167 * disabled. So we pull the trigger after 1 s of 7168 * futile waiting, but inform the user about this. 7169 */ 7170 printk(KERN_WARNING "%s: Breaking out of NMI-blocked " 7171 "state on VCPU %d after 1 s timeout\n", 7172 __func__, vcpu->vcpu_id); 7173 vmx->soft_vnmi_blocked = 0; 7174 } 7175 } 7176 7177 if (exit_reason < kvm_vmx_max_exit_handlers 7178 && kvm_vmx_exit_handlers[exit_reason]) 7179 return kvm_vmx_exit_handlers[exit_reason](vcpu); 7180 else { 7181 WARN_ONCE(1, "vmx: unexpected exit reason 0x%x\n", exit_reason); 7182 kvm_queue_exception(vcpu, UD_VECTOR); 7183 return 1; 7184 } 7185} 7186 7187static void update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr) 7188{ 7189 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 7190 7191 if (is_guest_mode(vcpu) && 7192 nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW)) 7193 return; 7194 7195 if (irr == -1 || tpr < irr) { 7196 vmcs_write32(TPR_THRESHOLD, 0); 7197 return; 7198 } 7199 7200 vmcs_write32(TPR_THRESHOLD, irr); 7201} 7202 7203static void vmx_set_virtual_x2apic_mode(struct kvm_vcpu *vcpu, bool set) 7204{ 7205 u32 sec_exec_control; 7206 7207 /* 7208 * There is no point in enabling virtualized x2APIC mode without 7209 * enabling APICv. 7210 */ 7211 if (!cpu_has_vmx_virtualize_x2apic_mode() || 7212 !vmx_vm_has_apicv(vcpu->kvm)) 7213 return; 7214 7215 if (!vm_need_tpr_shadow(vcpu->kvm)) 7216 return; 7217 7218 sec_exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL); 7219 7220 if (set) { 7221 sec_exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES; 7222 sec_exec_control |= SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE; 7223 } else { 7224 sec_exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE; 7225 sec_exec_control |= SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES; 7226 } 7227 vmcs_write32(SECONDARY_VM_EXEC_CONTROL, sec_exec_control); 7228 7229 vmx_set_msr_bitmap(vcpu); 7230} 7231 7232static void vmx_set_apic_access_page_addr(struct kvm_vcpu *vcpu, hpa_t hpa) 7233{ 7234 struct vcpu_vmx *vmx = to_vmx(vcpu); 7235 7236 /* 7237 * Currently we do not handle the nested case where L2 has an 7238 * APIC access page of its own; that page is still pinned. 7239 * Hence, we skip the case where the VCPU is in guest mode _and_ 7240 * L1 prepared an APIC access page for L2. 7241 * 7242 * For the case where L1 and L2 share the same APIC access page 7243 * (flexpriority=Y but SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES clear 7244 * in the vmcs12), this function will only update either the vmcs01 7245 * or the vmcs02. If the former, the vmcs02 will be updated by 7246 * prepare_vmcs02. If the latter, the vmcs01 will be updated in 7247 * the next L2->L1 exit.
7248 */ 7249 if (!is_guest_mode(vcpu) || 7250 !nested_cpu_has2(vmx->nested.current_vmcs12, 7251 SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) 7252 vmcs_write64(APIC_ACCESS_ADDR, hpa); 7253} 7254 7255static void vmx_hwapic_isr_update(struct kvm *kvm, int isr) 7256{ 7257 u16 status; 7258 u8 old; 7259 7260 if (!vmx_vm_has_apicv(kvm)) 7261 return; 7262 7263 if (isr == -1) 7264 isr = 0; 7265 7266 status = vmcs_read16(GUEST_INTR_STATUS); 7267 old = status >> 8; 7268 if (isr != old) { 7269 status &= 0xff; 7270 status |= isr << 8; 7271 vmcs_write16(GUEST_INTR_STATUS, status); 7272 } 7273} 7274 7275static void vmx_set_rvi(int vector) 7276{ 7277 u16 status; 7278 u8 old; 7279 7280 status = vmcs_read16(GUEST_INTR_STATUS); 7281 old = (u8)status & 0xff; 7282 if ((u8)vector != old) { 7283 status &= ~0xff; 7284 status |= (u8)vector; 7285 vmcs_write16(GUEST_INTR_STATUS, status); 7286 } 7287} 7288 7289static void vmx_hwapic_irr_update(struct kvm_vcpu *vcpu, int max_irr) 7290{ 7291 if (max_irr == -1) 7292 return; 7293 7294 /* 7295 * If a vmexit is needed, vmx_check_nested_events handles it. 7296 */ 7297 if (is_guest_mode(vcpu) && nested_exit_on_intr(vcpu)) 7298 return; 7299 7300 if (!is_guest_mode(vcpu)) { 7301 vmx_set_rvi(max_irr); 7302 return; 7303 } 7304 7305 /* 7306 * Fall back to pre-APICv interrupt injection since L2 7307 * is run without virtual interrupt delivery. 7308 */ 7309 if (!kvm_event_needs_reinjection(vcpu) && 7310 vmx_interrupt_allowed(vcpu)) { 7311 kvm_queue_interrupt(vcpu, max_irr, false); 7312 vmx_inject_irq(vcpu); 7313 } 7314} 7315 7316static void vmx_load_eoi_exitmap(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap) 7317{ 7318 if (!vmx_vm_has_apicv(vcpu->kvm)) 7319 return; 7320 7321 vmcs_write64(EOI_EXIT_BITMAP0, eoi_exit_bitmap[0]); 7322 vmcs_write64(EOI_EXIT_BITMAP1, eoi_exit_bitmap[1]); 7323 vmcs_write64(EOI_EXIT_BITMAP2, eoi_exit_bitmap[2]); 7324 vmcs_write64(EOI_EXIT_BITMAP3, eoi_exit_bitmap[3]); 7325} 7326 7327static void vmx_complete_atomic_exit(struct vcpu_vmx *vmx) 7328{ 7329 u32 exit_intr_info; 7330 7331 if (!(vmx->exit_reason == EXIT_REASON_MCE_DURING_VMENTRY 7332 || vmx->exit_reason == EXIT_REASON_EXCEPTION_NMI)) 7333 return; 7334 7335 vmx->exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO); 7336 exit_intr_info = vmx->exit_intr_info; 7337 7338 /* Handle machine checks before interrupts are enabled */ 7339 if (is_machine_check(exit_intr_info)) 7340 kvm_machine_check(); 7341 7342 /* We need to handle NMIs before interrupts are enabled */ 7343 if ((exit_intr_info & INTR_INFO_INTR_TYPE_MASK) == INTR_TYPE_NMI_INTR && 7344 (exit_intr_info & INTR_INFO_VALID_MASK)) { 7345 kvm_before_handle_nmi(&vmx->vcpu); 7346 asm("int $2"); 7347 kvm_after_handle_nmi(&vmx->vcpu); 7348 } 7349} 7350 7351static void vmx_handle_external_intr(struct kvm_vcpu *vcpu) 7352{ 7353 u32 exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO); 7354 7355 /* 7356 * If external interrupt exists, IF bit is set in rflags/eflags on the 7357 * interrupt stack frame, and interrupt will be enabled on a return 7358 * from interrupt handler. 
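 *
 * To that end, the asm below builds an interrupt-style stack frame by hand
 * (SS:RSP on 64-bit, EFLAGS with IF set, CS, and the return address pushed
 * by the call) and then invokes the host IDT entry for the vector directly.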
7359 */ 7360 if ((exit_intr_info & (INTR_INFO_VALID_MASK | INTR_INFO_INTR_TYPE_MASK)) 7361 == (INTR_INFO_VALID_MASK | INTR_TYPE_EXT_INTR)) { 7362 unsigned int vector; 7363 unsigned long entry; 7364 gate_desc *desc; 7365 struct vcpu_vmx *vmx = to_vmx(vcpu); 7366#ifdef CONFIG_X86_64 7367 unsigned long tmp; 7368#endif 7369 7370 vector = exit_intr_info & INTR_INFO_VECTOR_MASK; 7371 desc = (gate_desc *)vmx->host_idt_base + vector; 7372 entry = gate_offset(*desc); 7373 asm volatile( 7374#ifdef CONFIG_X86_64 7375 "mov %%" _ASM_SP ", %[sp]\n\t" 7376 "and $0xfffffffffffffff0, %%" _ASM_SP "\n\t" 7377 "push $%c[ss]\n\t" 7378 "push %[sp]\n\t" 7379#endif 7380 "pushf\n\t" 7381 "orl $0x200, (%%" _ASM_SP ")\n\t" 7382 __ASM_SIZE(push) " $%c[cs]\n\t" 7383 "call *%[entry]\n\t" 7384 : 7385#ifdef CONFIG_X86_64 7386 [sp]"=&r"(tmp) 7387#endif 7388 : 7389 [entry]"r"(entry), 7390 [ss]"i"(__KERNEL_DS), 7391 [cs]"i"(__KERNEL_CS) 7392 ); 7393 } else 7394 local_irq_enable(); 7395} 7396 7397static bool vmx_mpx_supported(void) 7398{ 7399 return (vmcs_config.vmexit_ctrl & VM_EXIT_CLEAR_BNDCFGS) && 7400 (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_BNDCFGS); 7401} 7402 7403static void vmx_recover_nmi_blocking(struct vcpu_vmx *vmx) 7404{ 7405 u32 exit_intr_info; 7406 bool unblock_nmi; 7407 u8 vector; 7408 bool idtv_info_valid; 7409 7410 idtv_info_valid = vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK; 7411 7412 if (cpu_has_virtual_nmis()) { 7413 if (vmx->nmi_known_unmasked) 7414 return; 7415 /* 7416 * Can't use vmx->exit_intr_info since we're not sure what 7417 * the exit reason is. 7418 */ 7419 exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO); 7420 unblock_nmi = (exit_intr_info & INTR_INFO_UNBLOCK_NMI) != 0; 7421 vector = exit_intr_info & INTR_INFO_VECTOR_MASK; 7422 /* 7423 * SDM 3: 27.7.1.2 (September 2008) 7424 * Re-set bit "block by NMI" before VM entry if vmexit caused by 7425 * a guest IRET fault. 7426 * SDM 3: 23.2.2 (September 2008) 7427 * Bit 12 is undefined in any of the following cases: 7428 * If the VM exit sets the valid bit in the IDT-vectoring 7429 * information field. 7430 * If the VM exit is due to a double fault. 7431 */ 7432 if ((exit_intr_info & INTR_INFO_VALID_MASK) && unblock_nmi && 7433 vector != DF_VECTOR && !idtv_info_valid) 7434 vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO, 7435 GUEST_INTR_STATE_NMI); 7436 else 7437 vmx->nmi_known_unmasked = 7438 !(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) 7439 & GUEST_INTR_STATE_NMI); 7440 } else if (unlikely(vmx->soft_vnmi_blocked)) 7441 vmx->vnmi_blocked_time += 7442 ktime_to_ns(ktime_sub(ktime_get(), vmx->entry_time)); 7443} 7444 7445static void __vmx_complete_interrupts(struct kvm_vcpu *vcpu, 7446 u32 idt_vectoring_info, 7447 int instr_len_field, 7448 int error_code_field) 7449{ 7450 u8 vector; 7451 int type; 7452 bool idtv_info_valid; 7453 7454 idtv_info_valid = idt_vectoring_info & VECTORING_INFO_VALID_MASK; 7455 7456 vcpu->arch.nmi_injected = false; 7457 kvm_clear_exception_queue(vcpu); 7458 kvm_clear_interrupt_queue(vcpu); 7459 7460 if (!idtv_info_valid) 7461 return; 7462 7463 kvm_make_request(KVM_REQ_EVENT, vcpu); 7464 7465 vector = idt_vectoring_info & VECTORING_INFO_VECTOR_MASK; 7466 type = idt_vectoring_info & VECTORING_INFO_TYPE_MASK; 7467 7468 switch (type) { 7469 case INTR_TYPE_NMI_INTR: 7470 vcpu->arch.nmi_injected = true; 7471 /* 7472 * SDM 3: 27.7.1.2 (September 2008) 7473 * Clear bit "block by NMI" before VM entry if a NMI 7474 * delivery faulted. 
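 * (That is what the vmx_set_nmi_mask(vcpu, false) call just below does.)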
7475 */ 7476 vmx_set_nmi_mask(vcpu, false); 7477 break; 7478 case INTR_TYPE_SOFT_EXCEPTION: 7479 vcpu->arch.event_exit_inst_len = vmcs_read32(instr_len_field); 7480 /* fall through */ 7481 case INTR_TYPE_HARD_EXCEPTION: 7482 if (idt_vectoring_info & VECTORING_INFO_DELIVER_CODE_MASK) { 7483 u32 err = vmcs_read32(error_code_field); 7484 kvm_requeue_exception_e(vcpu, vector, err); 7485 } else 7486 kvm_requeue_exception(vcpu, vector); 7487 break; 7488 case INTR_TYPE_SOFT_INTR: 7489 vcpu->arch.event_exit_inst_len = vmcs_read32(instr_len_field); 7490 /* fall through */ 7491 case INTR_TYPE_EXT_INTR: 7492 kvm_queue_interrupt(vcpu, vector, type == INTR_TYPE_SOFT_INTR); 7493 break; 7494 default: 7495 break; 7496 } 7497} 7498 7499static void vmx_complete_interrupts(struct vcpu_vmx *vmx) 7500{ 7501 __vmx_complete_interrupts(&vmx->vcpu, vmx->idt_vectoring_info, 7502 VM_EXIT_INSTRUCTION_LEN, 7503 IDT_VECTORING_ERROR_CODE); 7504} 7505 7506static void vmx_cancel_injection(struct kvm_vcpu *vcpu) 7507{ 7508 __vmx_complete_interrupts(vcpu, 7509 vmcs_read32(VM_ENTRY_INTR_INFO_FIELD), 7510 VM_ENTRY_INSTRUCTION_LEN, 7511 VM_ENTRY_EXCEPTION_ERROR_CODE); 7512 7513 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0); 7514} 7515 7516static void atomic_switch_perf_msrs(struct vcpu_vmx *vmx) 7517{ 7518 int i, nr_msrs; 7519 struct perf_guest_switch_msr *msrs; 7520 7521 msrs = perf_guest_get_msrs(&nr_msrs); 7522 7523 if (!msrs) 7524 return; 7525 7526 for (i = 0; i < nr_msrs; i++) 7527 if (msrs[i].host == msrs[i].guest) 7528 clear_atomic_switch_msr(vmx, msrs[i].msr); 7529 else 7530 add_atomic_switch_msr(vmx, msrs[i].msr, msrs[i].guest, 7531 msrs[i].host); 7532} 7533 7534static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu) 7535{ 7536 struct vcpu_vmx *vmx = to_vmx(vcpu); 7537 unsigned long debugctlmsr, cr4; 7538 7539 /* Record the guest's net vcpu time for enforced NMI injections. */ 7540 if (unlikely(!cpu_has_virtual_nmis() && vmx->soft_vnmi_blocked)) 7541 vmx->entry_time = ktime_get(); 7542 7543 /* Don't enter VMX if guest state is invalid, let the exit handler 7544 start emulation until we arrive back to a valid state */ 7545 if (vmx->emulation_required) 7546 return; 7547 7548 if (vmx->ple_window_dirty) { 7549 vmx->ple_window_dirty = false; 7550 vmcs_write32(PLE_WINDOW, vmx->ple_window); 7551 } 7552 7553 if (vmx->nested.sync_shadow_vmcs) { 7554 copy_vmcs12_to_shadow(vmx); 7555 vmx->nested.sync_shadow_vmcs = false; 7556 } 7557 7558 if (test_bit(VCPU_REGS_RSP, (unsigned long *)&vcpu->arch.regs_dirty)) 7559 vmcs_writel(GUEST_RSP, vcpu->arch.regs[VCPU_REGS_RSP]); 7560 if (test_bit(VCPU_REGS_RIP, (unsigned long *)&vcpu->arch.regs_dirty)) 7561 vmcs_writel(GUEST_RIP, vcpu->arch.regs[VCPU_REGS_RIP]); 7562 7563 cr4 = read_cr4(); 7564 if (unlikely(cr4 != vmx->host_state.vmcs_host_cr4)) { 7565 vmcs_writel(HOST_CR4, cr4); 7566 vmx->host_state.vmcs_host_cr4 = cr4; 7567 } 7568 7569 /* When single-stepping over STI and MOV SS, we must clear the 7570 * corresponding interruptibility bits in the guest state. Otherwise 7571 * vmentry fails as it then expects bit 14 (BS) in pending debug 7572 * exceptions being set, but that's not correct for the guest debugging 7573 * case. 
*/ 7574 if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) 7575 vmx_set_interrupt_shadow(vcpu, 0); 7576 7577 atomic_switch_perf_msrs(vmx); 7578 debugctlmsr = get_debugctlmsr(); 7579 7580 vmx->__launched = vmx->loaded_vmcs->launched; 7581 asm( 7582 /* Store host registers */ 7583 "push %%" _ASM_DX "; push %%" _ASM_BP ";" 7584 "push %%" _ASM_CX " \n\t" /* placeholder for guest rcx */ 7585 "push %%" _ASM_CX " \n\t" 7586 "cmp %%" _ASM_SP ", %c[host_rsp](%0) \n\t" 7587 "je 1f \n\t" 7588 "mov %%" _ASM_SP ", %c[host_rsp](%0) \n\t" 7589 __ex(ASM_VMX_VMWRITE_RSP_RDX) "\n\t" 7590 "1: \n\t" 7591 /* Reload cr2 if changed */ 7592 "mov %c[cr2](%0), %%" _ASM_AX " \n\t" 7593 "mov %%cr2, %%" _ASM_DX " \n\t" 7594 "cmp %%" _ASM_AX ", %%" _ASM_DX " \n\t" 7595 "je 2f \n\t" 7596 "mov %%" _ASM_AX", %%cr2 \n\t" 7597 "2: \n\t" 7598 /* Check if vmlaunch of vmresume is needed */ 7599 "cmpl $0, %c[launched](%0) \n\t" 7600 /* Load guest registers. Don't clobber flags. */ 7601 "mov %c[rax](%0), %%" _ASM_AX " \n\t" 7602 "mov %c[rbx](%0), %%" _ASM_BX " \n\t" 7603 "mov %c[rdx](%0), %%" _ASM_DX " \n\t" 7604 "mov %c[rsi](%0), %%" _ASM_SI " \n\t" 7605 "mov %c[rdi](%0), %%" _ASM_DI " \n\t" 7606 "mov %c[rbp](%0), %%" _ASM_BP " \n\t" 7607#ifdef CONFIG_X86_64 7608 "mov %c[r8](%0), %%r8 \n\t" 7609 "mov %c[r9](%0), %%r9 \n\t" 7610 "mov %c[r10](%0), %%r10 \n\t" 7611 "mov %c[r11](%0), %%r11 \n\t" 7612 "mov %c[r12](%0), %%r12 \n\t" 7613 "mov %c[r13](%0), %%r13 \n\t" 7614 "mov %c[r14](%0), %%r14 \n\t" 7615 "mov %c[r15](%0), %%r15 \n\t" 7616#endif 7617 "mov %c[rcx](%0), %%" _ASM_CX " \n\t" /* kills %0 (ecx) */ 7618 7619 /* Enter guest mode */ 7620 "jne 1f \n\t" 7621 __ex(ASM_VMX_VMLAUNCH) "\n\t" 7622 "jmp 2f \n\t" 7623 "1: " __ex(ASM_VMX_VMRESUME) "\n\t" 7624 "2: " 7625 /* Save guest registers, load host registers, keep flags */ 7626 "mov %0, %c[wordsize](%%" _ASM_SP ") \n\t" 7627 "pop %0 \n\t" 7628 "mov %%" _ASM_AX ", %c[rax](%0) \n\t" 7629 "mov %%" _ASM_BX ", %c[rbx](%0) \n\t" 7630 __ASM_SIZE(pop) " %c[rcx](%0) \n\t" 7631 "mov %%" _ASM_DX ", %c[rdx](%0) \n\t" 7632 "mov %%" _ASM_SI ", %c[rsi](%0) \n\t" 7633 "mov %%" _ASM_DI ", %c[rdi](%0) \n\t" 7634 "mov %%" _ASM_BP ", %c[rbp](%0) \n\t" 7635#ifdef CONFIG_X86_64 7636 "mov %%r8, %c[r8](%0) \n\t" 7637 "mov %%r9, %c[r9](%0) \n\t" 7638 "mov %%r10, %c[r10](%0) \n\t" 7639 "mov %%r11, %c[r11](%0) \n\t" 7640 "mov %%r12, %c[r12](%0) \n\t" 7641 "mov %%r13, %c[r13](%0) \n\t" 7642 "mov %%r14, %c[r14](%0) \n\t" 7643 "mov %%r15, %c[r15](%0) \n\t" 7644#endif 7645 "mov %%cr2, %%" _ASM_AX " \n\t" 7646 "mov %%" _ASM_AX ", %c[cr2](%0) \n\t" 7647 7648 "pop %%" _ASM_BP "; pop %%" _ASM_DX " \n\t" 7649 "setbe %c[fail](%0) \n\t" 7650 ".pushsection .rodata \n\t" 7651 ".global vmx_return \n\t" 7652 "vmx_return: " _ASM_PTR " 2b \n\t" 7653 ".popsection" 7654 : : "c"(vmx), "d"((unsigned long)HOST_RSP), 7655 [launched]"i"(offsetof(struct vcpu_vmx, __launched)), 7656 [fail]"i"(offsetof(struct vcpu_vmx, fail)), 7657 [host_rsp]"i"(offsetof(struct vcpu_vmx, host_rsp)), 7658 [rax]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RAX])), 7659 [rbx]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RBX])), 7660 [rcx]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RCX])), 7661 [rdx]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RDX])), 7662 [rsi]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RSI])), 7663 [rdi]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RDI])), 7664 [rbp]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RBP])), 7665#ifdef CONFIG_X86_64 7666 
[r8]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R8])), 7667 [r9]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R9])), 7668 [r10]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R10])), 7669 [r11]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R11])), 7670 [r12]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R12])), 7671 [r13]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R13])), 7672 [r14]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R14])), 7673 [r15]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R15])), 7674#endif 7675 [cr2]"i"(offsetof(struct vcpu_vmx, vcpu.arch.cr2)), 7676 [wordsize]"i"(sizeof(ulong)) 7677 : "cc", "memory" 7678#ifdef CONFIG_X86_64 7679 , "rax", "rbx", "rdi", "rsi" 7680 , "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15" 7681#else 7682 , "eax", "ebx", "edi", "esi" 7683#endif 7684 ); 7685 7686 /* MSR_IA32_DEBUGCTLMSR is zeroed on vmexit. Restore it if needed */ 7687 if (debugctlmsr) 7688 update_debugctlmsr(debugctlmsr); 7689 7690#ifndef CONFIG_X86_64 7691 /* 7692 * The sysexit path does not restore ds/es, so we must set them to 7693 * a reasonable value ourselves. 7694 * 7695 * We can't defer this to vmx_load_host_state() since that function 7696 * may be executed in interrupt context, which saves and restore segments 7697 * around it, nullifying its effect. 7698 */ 7699 loadsegment(ds, __USER_DS); 7700 loadsegment(es, __USER_DS); 7701#endif 7702 7703 vcpu->arch.regs_avail = ~((1 << VCPU_REGS_RIP) | (1 << VCPU_REGS_RSP) 7704 | (1 << VCPU_EXREG_RFLAGS) 7705 | (1 << VCPU_EXREG_PDPTR) 7706 | (1 << VCPU_EXREG_SEGMENTS) 7707 | (1 << VCPU_EXREG_CR3)); 7708 vcpu->arch.regs_dirty = 0; 7709 7710 vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD); 7711 7712 vmx->loaded_vmcs->launched = 1; 7713 7714 vmx->exit_reason = vmcs_read32(VM_EXIT_REASON); 7715 trace_kvm_exit(vmx->exit_reason, vcpu, KVM_ISA_VMX); 7716 7717 /* 7718 * the KVM_REQ_EVENT optimization bit is only on for one entry, and if 7719 * we did not inject a still-pending event to L1 now because of 7720 * nested_run_pending, we need to re-enable this bit. 
7721 */ 7722 if (vmx->nested.nested_run_pending) 7723 kvm_make_request(KVM_REQ_EVENT, vcpu); 7724 7725 vmx->nested.nested_run_pending = 0; 7726 7727 vmx_complete_atomic_exit(vmx); 7728 vmx_recover_nmi_blocking(vmx); 7729 vmx_complete_interrupts(vmx); 7730} 7731 7732static void vmx_load_vmcs01(struct kvm_vcpu *vcpu) 7733{ 7734 struct vcpu_vmx *vmx = to_vmx(vcpu); 7735 int cpu; 7736 7737 if (vmx->loaded_vmcs == &vmx->vmcs01) 7738 return; 7739 7740 cpu = get_cpu(); 7741 vmx->loaded_vmcs = &vmx->vmcs01; 7742 vmx_vcpu_put(vcpu); 7743 vmx_vcpu_load(vcpu, cpu); 7744 vcpu->cpu = cpu; 7745 put_cpu(); 7746} 7747 7748static void vmx_free_vcpu(struct kvm_vcpu *vcpu) 7749{ 7750 struct vcpu_vmx *vmx = to_vmx(vcpu); 7751 7752 free_vpid(vmx); 7753 leave_guest_mode(vcpu); 7754 vmx_load_vmcs01(vcpu); 7755 free_nested(vmx); 7756 free_loaded_vmcs(vmx->loaded_vmcs); 7757 kfree(vmx->guest_msrs); 7758 kvm_vcpu_uninit(vcpu); 7759 kmem_cache_free(kvm_vcpu_cache, vmx); 7760} 7761 7762static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id) 7763{ 7764 int err; 7765 struct vcpu_vmx *vmx = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL); 7766 int cpu; 7767 7768 if (!vmx) 7769 return ERR_PTR(-ENOMEM); 7770 7771 allocate_vpid(vmx); 7772 7773 err = kvm_vcpu_init(&vmx->vcpu, kvm, id); 7774 if (err) 7775 goto free_vcpu; 7776 7777 vmx->guest_msrs = kmalloc(PAGE_SIZE, GFP_KERNEL); 7778 BUILD_BUG_ON(ARRAY_SIZE(vmx_msr_index) * sizeof(vmx->guest_msrs[0]) 7779 > PAGE_SIZE); 7780 7781 err = -ENOMEM; 7782 if (!vmx->guest_msrs) { 7783 goto uninit_vcpu; 7784 } 7785 7786 vmx->loaded_vmcs = &vmx->vmcs01; 7787 vmx->loaded_vmcs->vmcs = alloc_vmcs(); 7788 if (!vmx->loaded_vmcs->vmcs) 7789 goto free_msrs; 7790 if (!vmm_exclusive) 7791 kvm_cpu_vmxon(__pa(per_cpu(vmxarea, raw_smp_processor_id()))); 7792 loaded_vmcs_init(vmx->loaded_vmcs); 7793 if (!vmm_exclusive) 7794 kvm_cpu_vmxoff(); 7795 7796 cpu = get_cpu(); 7797 vmx_vcpu_load(&vmx->vcpu, cpu); 7798 vmx->vcpu.cpu = cpu; 7799 err = vmx_vcpu_setup(vmx); 7800 vmx_vcpu_put(&vmx->vcpu); 7801 put_cpu(); 7802 if (err) 7803 goto free_vmcs; 7804 if (vm_need_virtualize_apic_accesses(kvm)) { 7805 err = alloc_apic_access_page(kvm); 7806 if (err) 7807 goto free_vmcs; 7808 } 7809 7810 if (enable_ept) { 7811 if (!kvm->arch.ept_identity_map_addr) 7812 kvm->arch.ept_identity_map_addr = 7813 VMX_EPT_IDENTITY_PAGETABLE_ADDR; 7814 err = init_rmode_identity_map(kvm); 7815 if (err) 7816 goto free_vmcs; 7817 } 7818 7819 vmx->nested.current_vmptr = -1ull; 7820 vmx->nested.current_vmcs12 = NULL; 7821 7822 return &vmx->vcpu; 7823 7824free_vmcs: 7825 free_loaded_vmcs(vmx->loaded_vmcs); 7826free_msrs: 7827 kfree(vmx->guest_msrs); 7828uninit_vcpu: 7829 kvm_vcpu_uninit(&vmx->vcpu); 7830free_vcpu: 7831 free_vpid(vmx); 7832 kmem_cache_free(kvm_vcpu_cache, vmx); 7833 return ERR_PTR(err); 7834} 7835 7836static void __init vmx_check_processor_compat(void *rtn) 7837{ 7838 struct vmcs_config vmcs_conf; 7839 7840 *(int *)rtn = 0; 7841 if (setup_vmcs_config(&vmcs_conf) < 0) 7842 *(int *)rtn = -EIO; 7843 if (memcmp(&vmcs_config, &vmcs_conf, sizeof(struct vmcs_config)) != 0) { 7844 printk(KERN_ERR "kvm: CPU %d feature inconsistency!\n", 7845 smp_processor_id()); 7846 *(int *)rtn = -EIO; 7847 } 7848} 7849 7850static int get_ept_level(void) 7851{ 7852 return VMX_EPT_DEFAULT_GAW + 1; 7853} 7854 7855static u64 vmx_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio) 7856{ 7857 u64 ret; 7858 7859 /* For VT-d and EPT combination 7860 * 1. MMIO: always map as UC 7861 * 2. EPT with VT-d: 7862 * a. 
VT-d without snooping control feature: can't guarantee the 7863 * result, try to trust guest. 7864 * b. VT-d with snooping control feature: snooping control feature of 7865 * VT-d engine can guarantee the cache correctness. Just set it 7866 * to WB to keep consistent with host. So the same as item 3. 7867 * 3. EPT without VT-d: always map as WB and set IPAT=1 to keep 7868 * consistent with host MTRR 7869 */ 7870 if (is_mmio) 7871 ret = MTRR_TYPE_UNCACHABLE << VMX_EPT_MT_EPTE_SHIFT; 7872 else if (kvm_arch_has_noncoherent_dma(vcpu->kvm)) 7873 ret = kvm_get_guest_memory_type(vcpu, gfn) << 7874 VMX_EPT_MT_EPTE_SHIFT; 7875 else 7876 ret = (MTRR_TYPE_WRBACK << VMX_EPT_MT_EPTE_SHIFT) 7877 | VMX_EPT_IPAT_BIT; 7878 7879 return ret; 7880} 7881 7882static int vmx_get_lpage_level(void) 7883{ 7884 if (enable_ept && !cpu_has_vmx_ept_1g_page()) 7885 return PT_DIRECTORY_LEVEL; 7886 else 7887 /* For shadow and EPT supported 1GB page */ 7888 return PT_PDPE_LEVEL; 7889} 7890 7891static void vmx_cpuid_update(struct kvm_vcpu *vcpu) 7892{ 7893 struct kvm_cpuid_entry2 *best; 7894 struct vcpu_vmx *vmx = to_vmx(vcpu); 7895 u32 exec_control; 7896 7897 vmx->rdtscp_enabled = false; 7898 if (vmx_rdtscp_supported()) { 7899 exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL); 7900 if (exec_control & SECONDARY_EXEC_RDTSCP) { 7901 best = kvm_find_cpuid_entry(vcpu, 0x80000001, 0); 7902 if (best && (best->edx & bit(X86_FEATURE_RDTSCP))) 7903 vmx->rdtscp_enabled = true; 7904 else { 7905 exec_control &= ~SECONDARY_EXEC_RDTSCP; 7906 vmcs_write32(SECONDARY_VM_EXEC_CONTROL, 7907 exec_control); 7908 } 7909 } 7910 } 7911 7912 /* Exposing INVPCID only when PCID is exposed */ 7913 best = kvm_find_cpuid_entry(vcpu, 0x7, 0); 7914 if (vmx_invpcid_supported() && 7915 best && (best->ebx & bit(X86_FEATURE_INVPCID)) && 7916 guest_cpuid_has_pcid(vcpu)) { 7917 exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL); 7918 exec_control |= SECONDARY_EXEC_ENABLE_INVPCID; 7919 vmcs_write32(SECONDARY_VM_EXEC_CONTROL, 7920 exec_control); 7921 } else { 7922 if (cpu_has_secondary_exec_ctrls()) { 7923 exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL); 7924 exec_control &= ~SECONDARY_EXEC_ENABLE_INVPCID; 7925 vmcs_write32(SECONDARY_VM_EXEC_CONTROL, 7926 exec_control); 7927 } 7928 if (best) 7929 best->ebx &= ~bit(X86_FEATURE_INVPCID); 7930 } 7931} 7932 7933static void vmx_set_supported_cpuid(u32 func, struct kvm_cpuid_entry2 *entry) 7934{ 7935 if (func == 1 && nested) 7936 entry->ecx |= bit(X86_FEATURE_VMX); 7937} 7938 7939static void nested_ept_inject_page_fault(struct kvm_vcpu *vcpu, 7940 struct x86_exception *fault) 7941{ 7942 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 7943 u32 exit_reason; 7944 7945 if (fault->error_code & PFERR_RSVD_MASK) 7946 exit_reason = EXIT_REASON_EPT_MISCONFIG; 7947 else 7948 exit_reason = EXIT_REASON_EPT_VIOLATION; 7949 nested_vmx_vmexit(vcpu, exit_reason, 0, vcpu->arch.exit_qualification); 7950 vmcs12->guest_physical_address = fault->address; 7951} 7952 7953/* Callbacks for nested_ept_init_mmu_context: */ 7954 7955static unsigned long nested_ept_get_cr3(struct kvm_vcpu *vcpu) 7956{ 7957 /* return the page table to be shadowed - in our case, EPT12 */ 7958 return get_vmcs12(vcpu)->ept_pointer; 7959} 7960 7961static void nested_ept_init_mmu_context(struct kvm_vcpu *vcpu) 7962{ 7963 kvm_init_shadow_ept_mmu(vcpu, &vcpu->arch.mmu, 7964 nested_vmx_ept_caps & VMX_EPT_EXECUTE_ONLY_BIT); 7965 7966 vcpu->arch.mmu.set_cr3 = vmx_set_cr3; 7967 vcpu->arch.mmu.get_cr3 = nested_ept_get_cr3; 7968 vcpu->arch.mmu.inject_page_fault = 
nested_ept_inject_page_fault; 7969 7970 vcpu->arch.walk_mmu = &vcpu->arch.nested_mmu; 7971} 7972 7973static void nested_ept_uninit_mmu_context(struct kvm_vcpu *vcpu) 7974{ 7975 vcpu->arch.walk_mmu = &vcpu->arch.mmu; 7976} 7977 7978static void vmx_inject_page_fault_nested(struct kvm_vcpu *vcpu, 7979 struct x86_exception *fault) 7980{ 7981 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 7982 7983 WARN_ON(!is_guest_mode(vcpu)); 7984 7985 /* TODO: also check PFEC_MATCH/MASK, not just EB.PF. */ 7986 if (vmcs12->exception_bitmap & (1u << PF_VECTOR)) 7987 nested_vmx_vmexit(vcpu, to_vmx(vcpu)->exit_reason, 7988 vmcs_read32(VM_EXIT_INTR_INFO), 7989 vmcs_readl(EXIT_QUALIFICATION)); 7990 else 7991 kvm_inject_page_fault(vcpu, fault); 7992} 7993 7994static bool nested_get_vmcs12_pages(struct kvm_vcpu *vcpu, 7995 struct vmcs12 *vmcs12) 7996{ 7997 struct vcpu_vmx *vmx = to_vmx(vcpu); 7998 7999 if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) { 8000 /* TODO: Also verify bits beyond physical address width are 0 */ 8001 if (!PAGE_ALIGNED(vmcs12->apic_access_addr)) 8002 return false; 8003 8004 /* 8005 * Translate L1 physical address to host physical 8006 * address for vmcs02. Keep the page pinned, so this 8007 * physical address remains valid. We keep a reference 8008 * to it so we can release it later. 8009 */ 8010 if (vmx->nested.apic_access_page) /* shouldn't happen */ 8011 nested_release_page(vmx->nested.apic_access_page); 8012 vmx->nested.apic_access_page = 8013 nested_get_page(vcpu, vmcs12->apic_access_addr); 8014 } 8015 8016 if (nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW)) { 8017 /* TODO: Also verify bits beyond physical address width are 0 */ 8018 if (!PAGE_ALIGNED(vmcs12->virtual_apic_page_addr)) 8019 return false; 8020 8021 if (vmx->nested.virtual_apic_page) /* shouldn't happen */ 8022 nested_release_page(vmx->nested.virtual_apic_page); 8023 vmx->nested.virtual_apic_page = 8024 nested_get_page(vcpu, vmcs12->virtual_apic_page_addr); 8025 8026 /* 8027 * Failing the vm entry is _not_ what the processor does 8028 * but it's basically the only possibility we have. 8029 * We could still enter the guest if CR8 load exits are 8030 * enabled, CR8 store exits are enabled, and virtualize APIC 8031 * access is disabled; in this case the processor would never 8032 * use the TPR shadow and we could simply clear the bit from 8033 * the execution control. But such a configuration is useless, 8034 * so let's keep the code simple. 8035 */ 8036 if (!vmx->nested.virtual_apic_page) 8037 return false; 8038 } 8039 8040 return true; 8041} 8042 8043static void vmx_start_preemption_timer(struct kvm_vcpu *vcpu) 8044{ 8045 u64 preemption_timeout = get_vmcs12(vcpu)->vmx_preemption_timer_value; 8046 struct vcpu_vmx *vmx = to_vmx(vcpu); 8047 8048 if (vcpu->arch.virtual_tsc_khz == 0) 8049 return; 8050 8051 /* Make sure short timeouts reliably trigger an immediate vmexit. 8052 * hrtimer_start does not guarantee this. */ 8053 if (preemption_timeout <= 1) { 8054 vmx_preemption_timer_fn(&vmx->nested.preemption_timer); 8055 return; 8056 } 8057 8058 preemption_timeout <<= VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE; 8059 preemption_timeout *= 1000000; 8060 do_div(preemption_timeout, vcpu->arch.virtual_tsc_khz); 8061 hrtimer_start(&vmx->nested.preemption_timer, 8062 ns_to_ktime(preemption_timeout), HRTIMER_MODE_REL); 8063} 8064 8065/* 8066 * prepare_vmcs02 is called when the L1 guest hypervisor runs its nested 8067 * L2 guest. 
L1 has a vmcs for L2 (vmcs12), and this function "merges" it 8068 * with L0's requirements for its guest (a.k.a. vmcs01), so we can run the L2 8069 * guest in a way that will both be appropriate to L1's requests, and our 8070 * needs. In addition to modifying the active vmcs (which is vmcs02), this 8071 * function also has additional necessary side-effects, like setting various 8072 * vcpu->arch fields. 8073 */ 8074static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) 8075{ 8076 struct vcpu_vmx *vmx = to_vmx(vcpu); 8077 u32 exec_control; 8078 8079 vmcs_write16(GUEST_ES_SELECTOR, vmcs12->guest_es_selector); 8080 vmcs_write16(GUEST_CS_SELECTOR, vmcs12->guest_cs_selector); 8081 vmcs_write16(GUEST_SS_SELECTOR, vmcs12->guest_ss_selector); 8082 vmcs_write16(GUEST_DS_SELECTOR, vmcs12->guest_ds_selector); 8083 vmcs_write16(GUEST_FS_SELECTOR, vmcs12->guest_fs_selector); 8084 vmcs_write16(GUEST_GS_SELECTOR, vmcs12->guest_gs_selector); 8085 vmcs_write16(GUEST_LDTR_SELECTOR, vmcs12->guest_ldtr_selector); 8086 vmcs_write16(GUEST_TR_SELECTOR, vmcs12->guest_tr_selector); 8087 vmcs_write32(GUEST_ES_LIMIT, vmcs12->guest_es_limit); 8088 vmcs_write32(GUEST_CS_LIMIT, vmcs12->guest_cs_limit); 8089 vmcs_write32(GUEST_SS_LIMIT, vmcs12->guest_ss_limit); 8090 vmcs_write32(GUEST_DS_LIMIT, vmcs12->guest_ds_limit); 8091 vmcs_write32(GUEST_FS_LIMIT, vmcs12->guest_fs_limit); 8092 vmcs_write32(GUEST_GS_LIMIT, vmcs12->guest_gs_limit); 8093 vmcs_write32(GUEST_LDTR_LIMIT, vmcs12->guest_ldtr_limit); 8094 vmcs_write32(GUEST_TR_LIMIT, vmcs12->guest_tr_limit); 8095 vmcs_write32(GUEST_GDTR_LIMIT, vmcs12->guest_gdtr_limit); 8096 vmcs_write32(GUEST_IDTR_LIMIT, vmcs12->guest_idtr_limit); 8097 vmcs_write32(GUEST_ES_AR_BYTES, vmcs12->guest_es_ar_bytes); 8098 vmcs_write32(GUEST_CS_AR_BYTES, vmcs12->guest_cs_ar_bytes); 8099 vmcs_write32(GUEST_SS_AR_BYTES, vmcs12->guest_ss_ar_bytes); 8100 vmcs_write32(GUEST_DS_AR_BYTES, vmcs12->guest_ds_ar_bytes); 8101 vmcs_write32(GUEST_FS_AR_BYTES, vmcs12->guest_fs_ar_bytes); 8102 vmcs_write32(GUEST_GS_AR_BYTES, vmcs12->guest_gs_ar_bytes); 8103 vmcs_write32(GUEST_LDTR_AR_BYTES, vmcs12->guest_ldtr_ar_bytes); 8104 vmcs_write32(GUEST_TR_AR_BYTES, vmcs12->guest_tr_ar_bytes); 8105 vmcs_writel(GUEST_ES_BASE, vmcs12->guest_es_base); 8106 vmcs_writel(GUEST_CS_BASE, vmcs12->guest_cs_base); 8107 vmcs_writel(GUEST_SS_BASE, vmcs12->guest_ss_base); 8108 vmcs_writel(GUEST_DS_BASE, vmcs12->guest_ds_base); 8109 vmcs_writel(GUEST_FS_BASE, vmcs12->guest_fs_base); 8110 vmcs_writel(GUEST_GS_BASE, vmcs12->guest_gs_base); 8111 vmcs_writel(GUEST_LDTR_BASE, vmcs12->guest_ldtr_base); 8112 vmcs_writel(GUEST_TR_BASE, vmcs12->guest_tr_base); 8113 vmcs_writel(GUEST_GDTR_BASE, vmcs12->guest_gdtr_base); 8114 vmcs_writel(GUEST_IDTR_BASE, vmcs12->guest_idtr_base); 8115 8116 if (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS) { 8117 kvm_set_dr(vcpu, 7, vmcs12->guest_dr7); 8118 vmcs_write64(GUEST_IA32_DEBUGCTL, vmcs12->guest_ia32_debugctl); 8119 } else { 8120 kvm_set_dr(vcpu, 7, vcpu->arch.dr7); 8121 vmcs_write64(GUEST_IA32_DEBUGCTL, vmx->nested.vmcs01_debugctl); 8122 } 8123 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 8124 vmcs12->vm_entry_intr_info_field); 8125 vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, 8126 vmcs12->vm_entry_exception_error_code); 8127 vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, 8128 vmcs12->vm_entry_instruction_len); 8129 vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, 8130 vmcs12->guest_interruptibility_info); 8131 vmcs_write32(GUEST_SYSENTER_CS, vmcs12->guest_sysenter_cs); 8132 vmx_set_rflags(vcpu, 
vmcs12->guest_rflags); 8133 vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS, 8134 vmcs12->guest_pending_dbg_exceptions); 8135 vmcs_writel(GUEST_SYSENTER_ESP, vmcs12->guest_sysenter_esp); 8136 vmcs_writel(GUEST_SYSENTER_EIP, vmcs12->guest_sysenter_eip); 8137 8138 vmcs_write64(VMCS_LINK_POINTER, -1ull); 8139 8140 exec_control = vmcs12->pin_based_vm_exec_control; 8141 exec_control |= vmcs_config.pin_based_exec_ctrl; 8142 exec_control &= ~(PIN_BASED_VMX_PREEMPTION_TIMER | 8143 PIN_BASED_POSTED_INTR); 8144 vmcs_write32(PIN_BASED_VM_EXEC_CONTROL, exec_control); 8145 8146 vmx->nested.preemption_timer_expired = false; 8147 if (nested_cpu_has_preemption_timer(vmcs12)) 8148 vmx_start_preemption_timer(vcpu); 8149 8150 /* 8151 * Whether page-faults are trapped is determined by a combination of 8152 * 3 settings: PFEC_MASK, PFEC_MATCH and EXCEPTION_BITMAP.PF. 8153 * If enable_ept, L0 doesn't care about page faults and we should 8154 * set all of these to L1's desires. However, if !enable_ept, L0 does 8155 * care about (at least some) page faults, and because it is not easy 8156 * (if at all possible?) to merge L0 and L1's desires, we simply ask 8157 * to exit on each and every L2 page fault. This is done by setting 8158 * MASK=MATCH=0 and (see below) EB.PF=1. 8159 * Note that below we don't need special code to set EB.PF beyond the 8160 * "or"ing of the EB of vmcs01 and vmcs12, because when enable_ept, 8161 * vmcs01's EB.PF is 0 so the "or" will take vmcs12's value, and when 8162 * !enable_ept, EB.PF is 1, so the "or" will always be 1. 8163 * 8164 * A problem with this approach (when !enable_ept) is that L1 may be 8165 * injected with more page faults than it asked for. This could have 8166 * caused problems, but in practice existing hypervisors don't care. 8167 * To fix this, we will need to emulate the PFEC checking (on the L1 8168 * page tables), using walk_addr(), when injecting PFs to L1. 8169 */ 8170 vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, 8171 enable_ept ? vmcs12->page_fault_error_code_mask : 0); 8172 vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, 8173 enable_ept ? vmcs12->page_fault_error_code_match : 0); 8174 8175 if (cpu_has_secondary_exec_ctrls()) { 8176 exec_control = vmx_secondary_exec_control(vmx); 8177 if (!vmx->rdtscp_enabled) 8178 exec_control &= ~SECONDARY_EXEC_RDTSCP; 8179 /* Take the following fields only from vmcs12 */ 8180 exec_control &= ~(SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES | 8181 SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY | 8182 SECONDARY_EXEC_APIC_REGISTER_VIRT); 8183 if (nested_cpu_has(vmcs12, 8184 CPU_BASED_ACTIVATE_SECONDARY_CONTROLS)) 8185 exec_control |= vmcs12->secondary_vm_exec_control; 8186 8187 if (exec_control & SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES) { 8188 /* 8189 * If translation failed, no matter: This feature asks 8190 * to exit when accessing the given address, and if it 8191 * can never be accessed, this feature won't do 8192 * anything anyway. 8193 */ 8194 if (!vmx->nested.apic_access_page) 8195 exec_control &= 8196 ~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES; 8197 else 8198 vmcs_write64(APIC_ACCESS_ADDR, 8199 page_to_phys(vmx->nested.apic_access_page)); 8200 } else if (vm_need_virtualize_apic_accesses(vmx->vcpu.kvm)) { 8201 exec_control |= 8202 SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES; 8203 kvm_vcpu_reload_apic_access_page(vcpu); 8204 } 8205 8206 vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control); 8207 } 8208 8209 8210 /* 8211 * Set host-state according to L0's settings (vmcs12 is irrelevant here) 8212 * Some constant fields are set here by vmx_set_constant_host_state(). 
8213 * Other fields are different per CPU, and will be set later when 8214 * vmx_vcpu_load() is called, and when vmx_save_host_state() is called. 8215 */ 8216 vmx_set_constant_host_state(vmx); 8217 8218 /* 8219 * HOST_RSP is normally set correctly in vmx_vcpu_run() just before 8220 * entry, but only if the current (host) sp changed from the value 8221 * we wrote last (vmx->host_rsp). This cache is no longer relevant 8222 * if we switch vmcs, and rather than hold a separate cache per vmcs, 8223 * here we just force the write to happen on entry. 8224 */ 8225 vmx->host_rsp = 0; 8226 8227 exec_control = vmx_exec_control(vmx); /* L0's desires */ 8228 exec_control &= ~CPU_BASED_VIRTUAL_INTR_PENDING; 8229 exec_control &= ~CPU_BASED_VIRTUAL_NMI_PENDING; 8230 exec_control &= ~CPU_BASED_TPR_SHADOW; 8231 exec_control |= vmcs12->cpu_based_vm_exec_control; 8232 8233 if (exec_control & CPU_BASED_TPR_SHADOW) { 8234 vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, 8235 page_to_phys(vmx->nested.virtual_apic_page)); 8236 vmcs_write32(TPR_THRESHOLD, vmcs12->tpr_threshold); 8237 } 8238 8239 /* 8240 * Merging of IO and MSR bitmaps not currently supported. 8241 * Rather, exit every time. 8242 */ 8243 exec_control &= ~CPU_BASED_USE_MSR_BITMAPS; 8244 exec_control &= ~CPU_BASED_USE_IO_BITMAPS; 8245 exec_control |= CPU_BASED_UNCOND_IO_EXITING; 8246 8247 vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, exec_control); 8248 8249 /* EXCEPTION_BITMAP and CR0_GUEST_HOST_MASK should basically be the 8250 * bitwise-or of what L1 wants to trap for L2, and what we want to 8251 * trap. Note that CR0.TS also needs updating - we do this later. 8252 */ 8253 update_exception_bitmap(vcpu); 8254 vcpu->arch.cr0_guest_owned_bits &= ~vmcs12->cr0_guest_host_mask; 8255 vmcs_writel(CR0_GUEST_HOST_MASK, ~vcpu->arch.cr0_guest_owned_bits); 8256 8257 /* L2->L1 exit controls are emulated - the hardware exit is to L0 so 8258 * we should use its exit controls. Note that VM_EXIT_LOAD_IA32_EFER 8259 * bits are further modified by vmx_set_efer() below. 8260 */ 8261 vmcs_write32(VM_EXIT_CONTROLS, vmcs_config.vmexit_ctrl); 8262 8263 /* vmcs12's VM_ENTRY_LOAD_IA32_EFER and VM_ENTRY_IA32E_MODE are 8264 * emulated by vmx_set_efer(), below. 8265 */ 8266 vm_entry_controls_init(vmx, 8267 (vmcs12->vm_entry_controls & ~VM_ENTRY_LOAD_IA32_EFER & 8268 ~VM_ENTRY_IA32E_MODE) | 8269 (vmcs_config.vmentry_ctrl & ~VM_ENTRY_IA32E_MODE)); 8270 8271 if (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PAT) { 8272 vmcs_write64(GUEST_IA32_PAT, vmcs12->guest_ia32_pat); 8273 vcpu->arch.pat = vmcs12->guest_ia32_pat; 8274 } else if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) 8275 vmcs_write64(GUEST_IA32_PAT, vmx->vcpu.arch.pat); 8276 8277 8278 set_cr4_guest_host_mask(vmx); 8279 8280 if (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS) 8281 vmcs_write64(GUEST_BNDCFGS, vmcs12->guest_bndcfgs); 8282 8283 if (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETING) 8284 vmcs_write64(TSC_OFFSET, 8285 vmx->nested.vmcs01_tsc_offset + vmcs12->tsc_offset); 8286 else 8287 vmcs_write64(TSC_OFFSET, vmx->nested.vmcs01_tsc_offset); 8288 8289 if (enable_vpid) { 8290 /* 8291 * Trivially support vpid by letting L2s share their parent 8292 * L1's vpid. TODO: move to a more elaborate solution, giving 8293 * each L2 its own vpid and exposing the vpid feature to L1. 
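 * Because L1 and L2 share a single vpid here, the TLB is flushed below
 * on the switch so that stale L1 translations are never reused while
 * running L2.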
8294 */ 8295 vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->vpid); 8296 vmx_flush_tlb(vcpu); 8297 } 8298 8299 if (nested_cpu_has_ept(vmcs12)) { 8300 kvm_mmu_unload(vcpu); 8301 nested_ept_init_mmu_context(vcpu); 8302 } 8303 8304 if (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_EFER) 8305 vcpu->arch.efer = vmcs12->guest_ia32_efer; 8306 else if (vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE) 8307 vcpu->arch.efer |= (EFER_LMA | EFER_LME); 8308 else 8309 vcpu->arch.efer &= ~(EFER_LMA | EFER_LME); 8310 /* Note: modifies VM_ENTRY/EXIT_CONTROLS and GUEST/HOST_IA32_EFER */ 8311 vmx_set_efer(vcpu, vcpu->arch.efer); 8312 8313 /* 8314 * This sets GUEST_CR0 to vmcs12->guest_cr0, with possibly a modified 8315 * TS bit (for lazy fpu) and bits which we consider mandatory enabled. 8316 * The CR0_READ_SHADOW is what L2 should have expected to read given 8317 * the specifications by L1; It's not enough to take 8318 * vmcs12->cr0_read_shadow because on our cr0_guest_host_mask we we 8319 * have more bits than L1 expected. 8320 */ 8321 vmx_set_cr0(vcpu, vmcs12->guest_cr0); 8322 vmcs_writel(CR0_READ_SHADOW, nested_read_cr0(vmcs12)); 8323 8324 vmx_set_cr4(vcpu, vmcs12->guest_cr4); 8325 vmcs_writel(CR4_READ_SHADOW, nested_read_cr4(vmcs12)); 8326 8327 /* shadow page tables on either EPT or shadow page tables */ 8328 kvm_set_cr3(vcpu, vmcs12->guest_cr3); 8329 kvm_mmu_reset_context(vcpu); 8330 8331 if (!enable_ept) 8332 vcpu->arch.walk_mmu->inject_page_fault = vmx_inject_page_fault_nested; 8333 8334 /* 8335 * L1 may access the L2's PDPTR, so save them to construct vmcs12 8336 */ 8337 if (enable_ept) { 8338 vmcs_write64(GUEST_PDPTR0, vmcs12->guest_pdptr0); 8339 vmcs_write64(GUEST_PDPTR1, vmcs12->guest_pdptr1); 8340 vmcs_write64(GUEST_PDPTR2, vmcs12->guest_pdptr2); 8341 vmcs_write64(GUEST_PDPTR3, vmcs12->guest_pdptr3); 8342 } 8343 8344 kvm_register_write(vcpu, VCPU_REGS_RSP, vmcs12->guest_rsp); 8345 kvm_register_write(vcpu, VCPU_REGS_RIP, vmcs12->guest_rip); 8346} 8347 8348/* 8349 * nested_vmx_run() handles a nested entry, i.e., a VMLAUNCH or VMRESUME on L1 8350 * for running an L2 nested guest. 8351 */ 8352static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch) 8353{ 8354 struct vmcs12 *vmcs12; 8355 struct vcpu_vmx *vmx = to_vmx(vcpu); 8356 int cpu; 8357 struct loaded_vmcs *vmcs02; 8358 bool ia32e; 8359 8360 if (!nested_vmx_check_permission(vcpu) || 8361 !nested_vmx_check_vmcs12(vcpu)) 8362 return 1; 8363 8364 skip_emulated_instruction(vcpu); 8365 vmcs12 = get_vmcs12(vcpu); 8366 8367 if (enable_shadow_vmcs) 8368 copy_shadow_to_vmcs12(vmx); 8369 8370 /* 8371 * The nested entry process starts with enforcing various prerequisites 8372 * on vmcs12 as required by the Intel SDM, and act appropriately when 8373 * they fail: As the SDM explains, some conditions should cause the 8374 * instruction to fail, while others will cause the instruction to seem 8375 * to succeed, but return an EXIT_REASON_INVALID_STATE. 8376 * To speed up the normal (success) code path, we should avoid checking 8377 * for misconfigurations which will anyway be caught by the processor 8378 * when using the merged vmcs02. 8379 */ 8380 if (vmcs12->launch_state == launch) { 8381 nested_vmx_failValid(vcpu, 8382 launch ? 
VMXERR_VMLAUNCH_NONCLEAR_VMCS 8383 : VMXERR_VMRESUME_NONLAUNCHED_VMCS); 8384 return 1; 8385 } 8386 8387 if (vmcs12->guest_activity_state != GUEST_ACTIVITY_ACTIVE && 8388 vmcs12->guest_activity_state != GUEST_ACTIVITY_HLT) { 8389 nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD); 8390 return 1; 8391 } 8392 8393 if ((vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_MSR_BITMAPS) && 8394 !PAGE_ALIGNED(vmcs12->msr_bitmap)) { 8395 /*TODO: Also verify bits beyond physical address width are 0*/ 8396 nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD); 8397 return 1; 8398 } 8399 8400 if (!nested_get_vmcs12_pages(vcpu, vmcs12)) { 8401 /*TODO: Also verify bits beyond physical address width are 0*/ 8402 nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD); 8403 return 1; 8404 } 8405 8406 if (vmcs12->vm_entry_msr_load_count > 0 || 8407 vmcs12->vm_exit_msr_load_count > 0 || 8408 vmcs12->vm_exit_msr_store_count > 0) { 8409 pr_warn_ratelimited("%s: VMCS MSR_{LOAD,STORE} unsupported\n", 8410 __func__); 8411 nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD); 8412 return 1; 8413 } 8414 8415 if (!vmx_control_verify(vmcs12->cpu_based_vm_exec_control, 8416 nested_vmx_true_procbased_ctls_low, 8417 nested_vmx_procbased_ctls_high) || 8418 !vmx_control_verify(vmcs12->secondary_vm_exec_control, 8419 nested_vmx_secondary_ctls_low, nested_vmx_secondary_ctls_high) || 8420 !vmx_control_verify(vmcs12->pin_based_vm_exec_control, 8421 nested_vmx_pinbased_ctls_low, nested_vmx_pinbased_ctls_high) || 8422 !vmx_control_verify(vmcs12->vm_exit_controls, 8423 nested_vmx_true_exit_ctls_low, 8424 nested_vmx_exit_ctls_high) || 8425 !vmx_control_verify(vmcs12->vm_entry_controls, 8426 nested_vmx_true_entry_ctls_low, 8427 nested_vmx_entry_ctls_high)) 8428 { 8429 nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD); 8430 return 1; 8431 } 8432 8433 if (((vmcs12->host_cr0 & VMXON_CR0_ALWAYSON) != VMXON_CR0_ALWAYSON) || 8434 ((vmcs12->host_cr4 & VMXON_CR4_ALWAYSON) != VMXON_CR4_ALWAYSON)) { 8435 nested_vmx_failValid(vcpu, 8436 VMXERR_ENTRY_INVALID_HOST_STATE_FIELD); 8437 return 1; 8438 } 8439 8440 if (!nested_cr0_valid(vmcs12, vmcs12->guest_cr0) || 8441 ((vmcs12->guest_cr4 & VMXON_CR4_ALWAYSON) != VMXON_CR4_ALWAYSON)) { 8442 nested_vmx_entry_failure(vcpu, vmcs12, 8443 EXIT_REASON_INVALID_STATE, ENTRY_FAIL_DEFAULT); 8444 return 1; 8445 } 8446 if (vmcs12->vmcs_link_pointer != -1ull) { 8447 nested_vmx_entry_failure(vcpu, vmcs12, 8448 EXIT_REASON_INVALID_STATE, ENTRY_FAIL_VMCS_LINK_PTR); 8449 return 1; 8450 } 8451 8452 /* 8453 * If the load IA32_EFER VM-entry control is 1, the following checks 8454 * are performed on the field for the IA32_EFER MSR: 8455 * - Bits reserved in the IA32_EFER MSR must be 0. 8456 * - Bit 10 (corresponding to IA32_EFER.LMA) must equal the value of 8457 * the IA-32e mode guest VM-exit control. It must also be identical 8458 * to bit 8 (LME) if bit 31 in the CR0 field (corresponding to 8459 * CR0.PG) is 1. 
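 * For example, an entry with the IA-32e mode guest control set but
 * IA32_EFER.LMA clear in the guest EFER field fails these checks and is
 * rejected below with EXIT_REASON_INVALID_STATE.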
8460 */ 8461 if (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_EFER) { 8462 ia32e = (vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE) != 0; 8463 if (!kvm_valid_efer(vcpu, vmcs12->guest_ia32_efer) || 8464 ia32e != !!(vmcs12->guest_ia32_efer & EFER_LMA) || 8465 ((vmcs12->guest_cr0 & X86_CR0_PG) && 8466 ia32e != !!(vmcs12->guest_ia32_efer & EFER_LME))) { 8467 nested_vmx_entry_failure(vcpu, vmcs12, 8468 EXIT_REASON_INVALID_STATE, ENTRY_FAIL_DEFAULT); 8469 return 1; 8470 } 8471 } 8472 8473 /* 8474 * If the load IA32_EFER VM-exit control is 1, bits reserved in the 8475 * IA32_EFER MSR must be 0 in the field for that register. In addition, 8476 * the values of the LMA and LME bits in the field must each be that of 8477 * the host address-space size VM-exit control. 8478 */ 8479 if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_EFER) { 8480 ia32e = (vmcs12->vm_exit_controls & 8481 VM_EXIT_HOST_ADDR_SPACE_SIZE) != 0; 8482 if (!kvm_valid_efer(vcpu, vmcs12->host_ia32_efer) || 8483 ia32e != !!(vmcs12->host_ia32_efer & EFER_LMA) || 8484 ia32e != !!(vmcs12->host_ia32_efer & EFER_LME)) { 8485 nested_vmx_entry_failure(vcpu, vmcs12, 8486 EXIT_REASON_INVALID_STATE, ENTRY_FAIL_DEFAULT); 8487 return 1; 8488 } 8489 } 8490 8491 /* 8492 * We're finally done with prerequisite checking, and can start with 8493 * the nested entry. 8494 */ 8495 8496 vmcs02 = nested_get_current_vmcs02(vmx); 8497 if (!vmcs02) 8498 return -ENOMEM; 8499 8500 enter_guest_mode(vcpu); 8501 8502 vmx->nested.vmcs01_tsc_offset = vmcs_read64(TSC_OFFSET); 8503 8504 if (!(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS)) 8505 vmx->nested.vmcs01_debugctl = vmcs_read64(GUEST_IA32_DEBUGCTL); 8506 8507 cpu = get_cpu(); 8508 vmx->loaded_vmcs = vmcs02; 8509 vmx_vcpu_put(vcpu); 8510 vmx_vcpu_load(vcpu, cpu); 8511 vcpu->cpu = cpu; 8512 put_cpu(); 8513 8514 vmx_segment_cache_clear(vmx); 8515 8516 vmcs12->launch_state = 1; 8517 8518 prepare_vmcs02(vcpu, vmcs12); 8519 8520 if (vmcs12->guest_activity_state == GUEST_ACTIVITY_HLT) 8521 return kvm_emulate_halt(vcpu); 8522 8523 vmx->nested.nested_run_pending = 1; 8524 8525 /* 8526 * Note no nested_vmx_succeed or nested_vmx_fail here. At this point 8527 * we are no longer running L1, and VMLAUNCH/VMRESUME has not yet 8528 * returned as far as L1 is concerned. It will only return (and set 8529 * the success flag) when L2 exits (see nested_vmx_vmexit()). 8530 */ 8531 return 1; 8532} 8533 8534/* 8535 * On a nested exit from L2 to L1, vmcs12.guest_cr0 might not be up-to-date 8536 * because L2 may have changed some cr0 bits directly (CRO_GUEST_HOST_MASK). 8537 * This function returns the new value we should put in vmcs12.guest_cr0. 8538 * It's not enough to just return the vmcs02 GUEST_CR0. Rather, 8539 * 1. Bits that neither L0 nor L1 trapped, were set directly by L2 and are now 8540 * available in vmcs02 GUEST_CR0. (Note: It's enough to check that L0 8541 * didn't trap the bit, because if L1 did, so would L0). 8542 * 2. Bits that L1 asked to trap (and therefore L0 also did) could not have 8543 * been modified by L2, and L1 knows it. So just leave the old value of 8544 * the bit from vmcs12.guest_cr0. Note that the bit from vmcs02 GUEST_CR0 8545 * isn't relevant, because if L0 traps this bit it can set it to anything. 8546 * 3. Bits that L1 didn't trap, but L0 did. L1 believes the guest could have 8547 * changed these bits, and therefore they need to be updated, but L0 8548 * didn't necessarily allow them to be changed in GUEST_CR0 - and rather 8549 * put them in vmcs02 CR0_READ_SHADOW. 
So take these bits from there. 8550 */ 8551static inline unsigned long 8552vmcs12_guest_cr0(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) 8553{ 8554 return 8555 /*1*/ (vmcs_readl(GUEST_CR0) & vcpu->arch.cr0_guest_owned_bits) | 8556 /*2*/ (vmcs12->guest_cr0 & vmcs12->cr0_guest_host_mask) | 8557 /*3*/ (vmcs_readl(CR0_READ_SHADOW) & ~(vmcs12->cr0_guest_host_mask | 8558 vcpu->arch.cr0_guest_owned_bits)); 8559} 8560 8561static inline unsigned long 8562vmcs12_guest_cr4(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) 8563{ 8564 return 8565 /*1*/ (vmcs_readl(GUEST_CR4) & vcpu->arch.cr4_guest_owned_bits) | 8566 /*2*/ (vmcs12->guest_cr4 & vmcs12->cr4_guest_host_mask) | 8567 /*3*/ (vmcs_readl(CR4_READ_SHADOW) & ~(vmcs12->cr4_guest_host_mask | 8568 vcpu->arch.cr4_guest_owned_bits)); 8569} 8570 8571static void vmcs12_save_pending_event(struct kvm_vcpu *vcpu, 8572 struct vmcs12 *vmcs12) 8573{ 8574 u32 idt_vectoring; 8575 unsigned int nr; 8576 8577 if (vcpu->arch.exception.pending && vcpu->arch.exception.reinject) { 8578 nr = vcpu->arch.exception.nr; 8579 idt_vectoring = nr | VECTORING_INFO_VALID_MASK; 8580 8581 if (kvm_exception_is_soft(nr)) { 8582 vmcs12->vm_exit_instruction_len = 8583 vcpu->arch.event_exit_inst_len; 8584 idt_vectoring |= INTR_TYPE_SOFT_EXCEPTION; 8585 } else 8586 idt_vectoring |= INTR_TYPE_HARD_EXCEPTION; 8587 8588 if (vcpu->arch.exception.has_error_code) { 8589 idt_vectoring |= VECTORING_INFO_DELIVER_CODE_MASK; 8590 vmcs12->idt_vectoring_error_code = 8591 vcpu->arch.exception.error_code; 8592 } 8593 8594 vmcs12->idt_vectoring_info_field = idt_vectoring; 8595 } else if (vcpu->arch.nmi_injected) { 8596 vmcs12->idt_vectoring_info_field = 8597 INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK | NMI_VECTOR; 8598 } else if (vcpu->arch.interrupt.pending) { 8599 nr = vcpu->arch.interrupt.nr; 8600 idt_vectoring = nr | VECTORING_INFO_VALID_MASK; 8601 8602 if (vcpu->arch.interrupt.soft) { 8603 idt_vectoring |= INTR_TYPE_SOFT_INTR; 8604 vmcs12->vm_entry_instruction_len = 8605 vcpu->arch.event_exit_inst_len; 8606 } else 8607 idt_vectoring |= INTR_TYPE_EXT_INTR; 8608 8609 vmcs12->idt_vectoring_info_field = idt_vectoring; 8610 } 8611} 8612 8613static int vmx_check_nested_events(struct kvm_vcpu *vcpu, bool external_intr) 8614{ 8615 struct vcpu_vmx *vmx = to_vmx(vcpu); 8616 8617 if (nested_cpu_has_preemption_timer(get_vmcs12(vcpu)) && 8618 vmx->nested.preemption_timer_expired) { 8619 if (vmx->nested.nested_run_pending) 8620 return -EBUSY; 8621 nested_vmx_vmexit(vcpu, EXIT_REASON_PREEMPTION_TIMER, 0, 0); 8622 return 0; 8623 } 8624 8625 if (vcpu->arch.nmi_pending && nested_exit_on_nmi(vcpu)) { 8626 if (vmx->nested.nested_run_pending || 8627 vcpu->arch.interrupt.pending) 8628 return -EBUSY; 8629 nested_vmx_vmexit(vcpu, EXIT_REASON_EXCEPTION_NMI, 8630 NMI_VECTOR | INTR_TYPE_NMI_INTR | 8631 INTR_INFO_VALID_MASK, 0); 8632 /* 8633 * The NMI-triggered VM exit counts as injection: 8634 * clear this one and block further NMIs. 
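 * Masking NMIs here mirrors real hardware, where delivering an NMI
 * blocks further NMIs until the handler's IRET; L1 is expected to
 * unmask them the same way.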
8635 */ 8636 vcpu->arch.nmi_pending = 0; 8637 vmx_set_nmi_mask(vcpu, true); 8638 return 0; 8639 } 8640 8641 if ((kvm_cpu_has_interrupt(vcpu) || external_intr) && 8642 nested_exit_on_intr(vcpu)) { 8643 if (vmx->nested.nested_run_pending) 8644 return -EBUSY; 8645 nested_vmx_vmexit(vcpu, EXIT_REASON_EXTERNAL_INTERRUPT, 0, 0); 8646 } 8647 8648 return 0; 8649} 8650 8651static u32 vmx_get_preemption_timer_value(struct kvm_vcpu *vcpu) 8652{ 8653 ktime_t remaining = 8654 hrtimer_get_remaining(&to_vmx(vcpu)->nested.preemption_timer); 8655 u64 value; 8656 8657 if (ktime_to_ns(remaining) <= 0) 8658 return 0; 8659 8660 value = ktime_to_ns(remaining) * vcpu->arch.virtual_tsc_khz; 8661 do_div(value, 1000000); 8662 return value >> VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE; 8663} 8664 8665/* 8666 * prepare_vmcs12 is part of what we need to do when the nested L2 guest exits 8667 * and we want to prepare to run its L1 parent. L1 keeps a vmcs for L2 (vmcs12), 8668 * and this function updates it to reflect the changes to the guest state while 8669 * L2 was running (and perhaps made some exits which were handled directly by L0 8670 * without going back to L1), and to reflect the exit reason. 8671 * Note that we do not have to copy here all VMCS fields, just those that 8672 * could have changed by the L2 guest or the exit - i.e., the guest-state and 8673 * exit-information fields only. Other fields are modified by L1 with VMWRITE, 8674 * which already writes to vmcs12 directly. 8675 */ 8676static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, 8677 u32 exit_reason, u32 exit_intr_info, 8678 unsigned long exit_qualification) 8679{ 8680 /* update guest state fields: */ 8681 vmcs12->guest_cr0 = vmcs12_guest_cr0(vcpu, vmcs12); 8682 vmcs12->guest_cr4 = vmcs12_guest_cr4(vcpu, vmcs12); 8683 8684 vmcs12->guest_rsp = kvm_register_read(vcpu, VCPU_REGS_RSP); 8685 vmcs12->guest_rip = kvm_register_read(vcpu, VCPU_REGS_RIP); 8686 vmcs12->guest_rflags = vmcs_readl(GUEST_RFLAGS); 8687 8688 vmcs12->guest_es_selector = vmcs_read16(GUEST_ES_SELECTOR); 8689 vmcs12->guest_cs_selector = vmcs_read16(GUEST_CS_SELECTOR); 8690 vmcs12->guest_ss_selector = vmcs_read16(GUEST_SS_SELECTOR); 8691 vmcs12->guest_ds_selector = vmcs_read16(GUEST_DS_SELECTOR); 8692 vmcs12->guest_fs_selector = vmcs_read16(GUEST_FS_SELECTOR); 8693 vmcs12->guest_gs_selector = vmcs_read16(GUEST_GS_SELECTOR); 8694 vmcs12->guest_ldtr_selector = vmcs_read16(GUEST_LDTR_SELECTOR); 8695 vmcs12->guest_tr_selector = vmcs_read16(GUEST_TR_SELECTOR); 8696 vmcs12->guest_es_limit = vmcs_read32(GUEST_ES_LIMIT); 8697 vmcs12->guest_cs_limit = vmcs_read32(GUEST_CS_LIMIT); 8698 vmcs12->guest_ss_limit = vmcs_read32(GUEST_SS_LIMIT); 8699 vmcs12->guest_ds_limit = vmcs_read32(GUEST_DS_LIMIT); 8700 vmcs12->guest_fs_limit = vmcs_read32(GUEST_FS_LIMIT); 8701 vmcs12->guest_gs_limit = vmcs_read32(GUEST_GS_LIMIT); 8702 vmcs12->guest_ldtr_limit = vmcs_read32(GUEST_LDTR_LIMIT); 8703 vmcs12->guest_tr_limit = vmcs_read32(GUEST_TR_LIMIT); 8704 vmcs12->guest_gdtr_limit = vmcs_read32(GUEST_GDTR_LIMIT); 8705 vmcs12->guest_idtr_limit = vmcs_read32(GUEST_IDTR_LIMIT); 8706 vmcs12->guest_es_ar_bytes = vmcs_read32(GUEST_ES_AR_BYTES); 8707 vmcs12->guest_cs_ar_bytes = vmcs_read32(GUEST_CS_AR_BYTES); 8708 vmcs12->guest_ss_ar_bytes = vmcs_read32(GUEST_SS_AR_BYTES); 8709 vmcs12->guest_ds_ar_bytes = vmcs_read32(GUEST_DS_AR_BYTES); 8710 vmcs12->guest_fs_ar_bytes = vmcs_read32(GUEST_FS_AR_BYTES); 8711 vmcs12->guest_gs_ar_bytes = vmcs_read32(GUEST_GS_AR_BYTES); 8712 vmcs12->guest_ldtr_ar_bytes = 
vmcs_read32(GUEST_LDTR_AR_BYTES); 8713 vmcs12->guest_tr_ar_bytes = vmcs_read32(GUEST_TR_AR_BYTES); 8714 vmcs12->guest_es_base = vmcs_readl(GUEST_ES_BASE); 8715 vmcs12->guest_cs_base = vmcs_readl(GUEST_CS_BASE); 8716 vmcs12->guest_ss_base = vmcs_readl(GUEST_SS_BASE); 8717 vmcs12->guest_ds_base = vmcs_readl(GUEST_DS_BASE); 8718 vmcs12->guest_fs_base = vmcs_readl(GUEST_FS_BASE); 8719 vmcs12->guest_gs_base = vmcs_readl(GUEST_GS_BASE); 8720 vmcs12->guest_ldtr_base = vmcs_readl(GUEST_LDTR_BASE); 8721 vmcs12->guest_tr_base = vmcs_readl(GUEST_TR_BASE); 8722 vmcs12->guest_gdtr_base = vmcs_readl(GUEST_GDTR_BASE); 8723 vmcs12->guest_idtr_base = vmcs_readl(GUEST_IDTR_BASE); 8724 8725 vmcs12->guest_interruptibility_info = 8726 vmcs_read32(GUEST_INTERRUPTIBILITY_INFO); 8727 vmcs12->guest_pending_dbg_exceptions = 8728 vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS); 8729 if (vcpu->arch.mp_state == KVM_MP_STATE_HALTED) 8730 vmcs12->guest_activity_state = GUEST_ACTIVITY_HLT; 8731 else 8732 vmcs12->guest_activity_state = GUEST_ACTIVITY_ACTIVE; 8733 8734 if (nested_cpu_has_preemption_timer(vmcs12)) { 8735 if (vmcs12->vm_exit_controls & 8736 VM_EXIT_SAVE_VMX_PREEMPTION_TIMER) 8737 vmcs12->vmx_preemption_timer_value = 8738 vmx_get_preemption_timer_value(vcpu); 8739 hrtimer_cancel(&to_vmx(vcpu)->nested.preemption_timer); 8740 } 8741 8742 /* 8743 * In some cases (usually, nested EPT), L2 is allowed to change its 8744 * own CR3 without exiting. If it has changed it, we must keep it. 8745 * Of course, if L0 is using shadow page tables, GUEST_CR3 was defined 8746 * by L0, not L1 or L2, so we mustn't unconditionally copy it to vmcs12. 8747 * 8748 * Additionally, restore L2's PDPTR to vmcs12. 8749 */ 8750 if (enable_ept) { 8751 vmcs12->guest_cr3 = vmcs_read64(GUEST_CR3); 8752 vmcs12->guest_pdptr0 = vmcs_read64(GUEST_PDPTR0); 8753 vmcs12->guest_pdptr1 = vmcs_read64(GUEST_PDPTR1); 8754 vmcs12->guest_pdptr2 = vmcs_read64(GUEST_PDPTR2); 8755 vmcs12->guest_pdptr3 = vmcs_read64(GUEST_PDPTR3); 8756 } 8757 8758 vmcs12->vm_entry_controls = 8759 (vmcs12->vm_entry_controls & ~VM_ENTRY_IA32E_MODE) | 8760 (vm_entry_controls_get(to_vmx(vcpu)) & VM_ENTRY_IA32E_MODE); 8761 8762 if (vmcs12->vm_exit_controls & VM_EXIT_SAVE_DEBUG_CONTROLS) { 8763 kvm_get_dr(vcpu, 7, (unsigned long *)&vmcs12->guest_dr7); 8764 vmcs12->guest_ia32_debugctl = vmcs_read64(GUEST_IA32_DEBUGCTL); 8765 } 8766 8767 /* TODO: These cannot have changed unless we have MSR bitmaps and 8768 * the relevant bit asks not to trap the change */ 8769 if (vmcs12->vm_exit_controls & VM_EXIT_SAVE_IA32_PAT) 8770 vmcs12->guest_ia32_pat = vmcs_read64(GUEST_IA32_PAT); 8771 if (vmcs12->vm_exit_controls & VM_EXIT_SAVE_IA32_EFER) 8772 vmcs12->guest_ia32_efer = vcpu->arch.efer; 8773 vmcs12->guest_sysenter_cs = vmcs_read32(GUEST_SYSENTER_CS); 8774 vmcs12->guest_sysenter_esp = vmcs_readl(GUEST_SYSENTER_ESP); 8775 vmcs12->guest_sysenter_eip = vmcs_readl(GUEST_SYSENTER_EIP); 8776 if (vmx_mpx_supported()) 8777 vmcs12->guest_bndcfgs = vmcs_read64(GUEST_BNDCFGS); 8778 8779 /* update exit information fields: */ 8780 8781 vmcs12->vm_exit_reason = exit_reason; 8782 vmcs12->exit_qualification = exit_qualification; 8783 8784 vmcs12->vm_exit_intr_info = exit_intr_info; 8785 if ((vmcs12->vm_exit_intr_info & 8786 (INTR_INFO_VALID_MASK | INTR_INFO_DELIVER_CODE_MASK)) == 8787 (INTR_INFO_VALID_MASK | INTR_INFO_DELIVER_CODE_MASK)) 8788 vmcs12->vm_exit_intr_error_code = 8789 vmcs_read32(VM_EXIT_INTR_ERROR_CODE); 8790 vmcs12->idt_vectoring_info_field = 0; 8791 vmcs12->vm_exit_instruction_len = 
vmcs_read32(VM_EXIT_INSTRUCTION_LEN); 8792 vmcs12->vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); 8793 8794 if (!(vmcs12->vm_exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY)) { 8795 /* vm_entry_intr_info_field is cleared on exit. Emulate this 8796 * instead of reading the real value. */ 8797 vmcs12->vm_entry_intr_info_field &= ~INTR_INFO_VALID_MASK; 8798 8799 /* 8800 * Transfer the event that L0 or L1 may wanted to inject into 8801 * L2 to IDT_VECTORING_INFO_FIELD. 8802 */ 8803 vmcs12_save_pending_event(vcpu, vmcs12); 8804 } 8805 8806 /* 8807 * Drop what we picked up for L2 via vmx_complete_interrupts. It is 8808 * preserved above and would only end up incorrectly in L1. 8809 */ 8810 vcpu->arch.nmi_injected = false; 8811 kvm_clear_exception_queue(vcpu); 8812 kvm_clear_interrupt_queue(vcpu); 8813} 8814 8815/* 8816 * A part of what we need to when the nested L2 guest exits and we want to 8817 * run its L1 parent, is to reset L1's guest state to the host state specified 8818 * in vmcs12. 8819 * This function is to be called not only on normal nested exit, but also on 8820 * a nested entry failure, as explained in Intel's spec, 3B.23.7 ("VM-Entry 8821 * Failures During or After Loading Guest State"). 8822 * This function should be called when the active VMCS is L1's (vmcs01). 8823 */ 8824static void load_vmcs12_host_state(struct kvm_vcpu *vcpu, 8825 struct vmcs12 *vmcs12) 8826{ 8827 struct kvm_segment seg; 8828 8829 if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_EFER) 8830 vcpu->arch.efer = vmcs12->host_ia32_efer; 8831 else if (vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE) 8832 vcpu->arch.efer |= (EFER_LMA | EFER_LME); 8833 else 8834 vcpu->arch.efer &= ~(EFER_LMA | EFER_LME); 8835 vmx_set_efer(vcpu, vcpu->arch.efer); 8836 8837 kvm_register_write(vcpu, VCPU_REGS_RSP, vmcs12->host_rsp); 8838 kvm_register_write(vcpu, VCPU_REGS_RIP, vmcs12->host_rip); 8839 vmx_set_rflags(vcpu, X86_EFLAGS_FIXED); 8840 /* 8841 * Note that calling vmx_set_cr0 is important, even if cr0 hasn't 8842 * actually changed, because it depends on the current state of 8843 * fpu_active (which may have changed). 8844 * Note that vmx_set_cr0 refers to efer set above. 8845 */ 8846 vmx_set_cr0(vcpu, vmcs12->host_cr0); 8847 /* 8848 * If we did fpu_activate()/fpu_deactivate() during L2's run, we need 8849 * to apply the same changes to L1's vmcs. We just set cr0 correctly, 8850 * but we also need to update cr0_guest_host_mask and exception_bitmap. 8851 */ 8852 update_exception_bitmap(vcpu); 8853 vcpu->arch.cr0_guest_owned_bits = (vcpu->fpu_active ? X86_CR0_TS : 0); 8854 vmcs_writel(CR0_GUEST_HOST_MASK, ~vcpu->arch.cr0_guest_owned_bits); 8855 8856 /* 8857 * Note that CR4_GUEST_HOST_MASK is already set in the original vmcs01 8858 * (KVM doesn't change it)- no reason to call set_cr4_guest_host_mask(); 8859 */ 8860 vcpu->arch.cr4_guest_owned_bits = ~vmcs_readl(CR4_GUEST_HOST_MASK); 8861 kvm_set_cr4(vcpu, vmcs12->host_cr4); 8862 8863 nested_ept_uninit_mmu_context(vcpu); 8864 8865 kvm_set_cr3(vcpu, vmcs12->host_cr3); 8866 kvm_mmu_reset_context(vcpu); 8867 8868 if (!enable_ept) 8869 vcpu->arch.walk_mmu->inject_page_fault = kvm_inject_page_fault; 8870 8871 if (enable_vpid) { 8872 /* 8873 * Trivially support vpid by letting L2s share their parent 8874 * L1's vpid. TODO: move to a more elaborate solution, giving 8875 * each L2 its own vpid and exposing the vpid feature to L1. 
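 * As in prepare_vmcs02(), the shared vpid means the TLB must be
 * flushed below when switching back from L2 to L1.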
8876 */ 8877 vmx_flush_tlb(vcpu); 8878 } 8879 8880 8881 vmcs_write32(GUEST_SYSENTER_CS, vmcs12->host_ia32_sysenter_cs); 8882 vmcs_writel(GUEST_SYSENTER_ESP, vmcs12->host_ia32_sysenter_esp); 8883 vmcs_writel(GUEST_SYSENTER_EIP, vmcs12->host_ia32_sysenter_eip); 8884 vmcs_writel(GUEST_IDTR_BASE, vmcs12->host_idtr_base); 8885 vmcs_writel(GUEST_GDTR_BASE, vmcs12->host_gdtr_base); 8886 8887 /* If not VM_EXIT_CLEAR_BNDCFGS, the L2 value propagates to L1. */ 8888 if (vmcs12->vm_exit_controls & VM_EXIT_CLEAR_BNDCFGS) 8889 vmcs_write64(GUEST_BNDCFGS, 0); 8890 8891 if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PAT) { 8892 vmcs_write64(GUEST_IA32_PAT, vmcs12->host_ia32_pat); 8893 vcpu->arch.pat = vmcs12->host_ia32_pat; 8894 } 8895 if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL) 8896 vmcs_write64(GUEST_IA32_PERF_GLOBAL_CTRL, 8897 vmcs12->host_ia32_perf_global_ctrl); 8898 8899 /* Set L1 segment info according to Intel SDM 8900 27.5.2 Loading Host Segment and Descriptor-Table Registers */ 8901 seg = (struct kvm_segment) { 8902 .base = 0, 8903 .limit = 0xFFFFFFFF, 8904 .selector = vmcs12->host_cs_selector, 8905 .type = 11, 8906 .present = 1, 8907 .s = 1, 8908 .g = 1 8909 }; 8910 if (vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE) 8911 seg.l = 1; 8912 else 8913 seg.db = 1; 8914 vmx_set_segment(vcpu, &seg, VCPU_SREG_CS); 8915 seg = (struct kvm_segment) { 8916 .base = 0, 8917 .limit = 0xFFFFFFFF, 8918 .type = 3, 8919 .present = 1, 8920 .s = 1, 8921 .db = 1, 8922 .g = 1 8923 }; 8924 seg.selector = vmcs12->host_ds_selector; 8925 vmx_set_segment(vcpu, &seg, VCPU_SREG_DS); 8926 seg.selector = vmcs12->host_es_selector; 8927 vmx_set_segment(vcpu, &seg, VCPU_SREG_ES); 8928 seg.selector = vmcs12->host_ss_selector; 8929 vmx_set_segment(vcpu, &seg, VCPU_SREG_SS); 8930 seg.selector = vmcs12->host_fs_selector; 8931 seg.base = vmcs12->host_fs_base; 8932 vmx_set_segment(vcpu, &seg, VCPU_SREG_FS); 8933 seg.selector = vmcs12->host_gs_selector; 8934 seg.base = vmcs12->host_gs_base; 8935 vmx_set_segment(vcpu, &seg, VCPU_SREG_GS); 8936 seg = (struct kvm_segment) { 8937 .base = vmcs12->host_tr_base, 8938 .limit = 0x67, 8939 .selector = vmcs12->host_tr_selector, 8940 .type = 11, 8941 .present = 1 8942 }; 8943 vmx_set_segment(vcpu, &seg, VCPU_SREG_TR); 8944 8945 kvm_set_dr(vcpu, 7, 0x400); 8946 vmcs_write64(GUEST_IA32_DEBUGCTL, 0); 8947} 8948 8949/* 8950 * Emulate an exit from nested guest (L2) to L1, i.e., prepare to run L1 8951 * and modify vmcs12 to make it see what it would expect to see there if 8952 * L2 was its real guest. 
Must only be called when in L2 (is_guest_mode()) 8953 */ 8954static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason, 8955 u32 exit_intr_info, 8956 unsigned long exit_qualification) 8957{ 8958 struct vcpu_vmx *vmx = to_vmx(vcpu); 8959 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 8960 8961 /* trying to cancel vmlaunch/vmresume is a bug */ 8962 WARN_ON_ONCE(vmx->nested.nested_run_pending); 8963 8964 leave_guest_mode(vcpu); 8965 prepare_vmcs12(vcpu, vmcs12, exit_reason, exit_intr_info, 8966 exit_qualification); 8967 8968 vmx_load_vmcs01(vcpu); 8969 8970 if ((exit_reason == EXIT_REASON_EXTERNAL_INTERRUPT) 8971 && nested_exit_intr_ack_set(vcpu)) { 8972 int irq = kvm_cpu_get_interrupt(vcpu); 8973 WARN_ON(irq < 0); 8974 vmcs12->vm_exit_intr_info = irq | 8975 INTR_INFO_VALID_MASK | INTR_TYPE_EXT_INTR; 8976 } 8977 8978 trace_kvm_nested_vmexit_inject(vmcs12->vm_exit_reason, 8979 vmcs12->exit_qualification, 8980 vmcs12->idt_vectoring_info_field, 8981 vmcs12->vm_exit_intr_info, 8982 vmcs12->vm_exit_intr_error_code, 8983 KVM_ISA_VMX); 8984 8985 vm_entry_controls_init(vmx, vmcs_read32(VM_ENTRY_CONTROLS)); 8986 vm_exit_controls_init(vmx, vmcs_read32(VM_EXIT_CONTROLS)); 8987 vmx_segment_cache_clear(vmx); 8988 8989 /* if no vmcs02 cache requested, remove the one we used */ 8990 if (VMCS02_POOL_SIZE == 0) 8991 nested_free_vmcs02(vmx, vmx->nested.current_vmptr); 8992 8993 load_vmcs12_host_state(vcpu, vmcs12); 8994 8995 /* Update TSC_OFFSET if TSC was changed while L2 ran */ 8996 vmcs_write64(TSC_OFFSET, vmx->nested.vmcs01_tsc_offset); 8997 8998 /* This is needed for same reason as it was needed in prepare_vmcs02 */ 8999 vmx->host_rsp = 0; 9000 9001 /* Unpin physical memory we referred to in vmcs02 */ 9002 if (vmx->nested.apic_access_page) { 9003 nested_release_page(vmx->nested.apic_access_page); 9004 vmx->nested.apic_access_page = NULL; 9005 } 9006 if (vmx->nested.virtual_apic_page) { 9007 nested_release_page(vmx->nested.virtual_apic_page); 9008 vmx->nested.virtual_apic_page = NULL; 9009 } 9010 9011 /* 9012 * We are now running in L2, mmu_notifier will force to reload the 9013 * page's hpa for L2 vmcs. Need to reload it for L1 before entering L1. 9014 */ 9015 kvm_vcpu_reload_apic_access_page(vcpu); 9016 9017 /* 9018 * Exiting from L2 to L1, we're now back to L1 which thinks it just 9019 * finished a VMLAUNCH or VMRESUME instruction, so we need to set the 9020 * success or failure flag accordingly. 9021 */ 9022 if (unlikely(vmx->fail)) { 9023 vmx->fail = 0; 9024 nested_vmx_failValid(vcpu, vmcs_read32(VM_INSTRUCTION_ERROR)); 9025 } else 9026 nested_vmx_succeed(vcpu); 9027 if (enable_shadow_vmcs) 9028 vmx->nested.sync_shadow_vmcs = true; 9029 9030 /* in case we halted in L2 */ 9031 vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; 9032} 9033 9034/* 9035 * Forcibly leave nested mode in order to be able to reset the VCPU later on. 9036 */ 9037static void vmx_leave_nested(struct kvm_vcpu *vcpu) 9038{ 9039 if (is_guest_mode(vcpu)) 9040 nested_vmx_vmexit(vcpu, -1, 0, 0); 9041 free_nested(to_vmx(vcpu)); 9042} 9043 9044/* 9045 * L1's failure to enter L2 is a subset of a normal exit, as explained in 9046 * 23.7 "VM-entry failures during or after loading guest state" (this also 9047 * lists the acceptable exit-reason and exit-qualification parameters). 9048 * It should only be called before L2 actually succeeded to run, and when 9049 * vmcs01 is current (it doesn't leave_guest_mode() or switch vmcss). 
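 * Per the Intel SDM, this kind of failure loads the host state from
 * vmcs12 but does not save any guest state, which is why only
 * load_vmcs12_host_state() is called below.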
9050 */ 9051static void nested_vmx_entry_failure(struct kvm_vcpu *vcpu, 9052 struct vmcs12 *vmcs12, 9053 u32 reason, unsigned long qualification) 9054{ 9055 load_vmcs12_host_state(vcpu, vmcs12); 9056 vmcs12->vm_exit_reason = reason | VMX_EXIT_REASONS_FAILED_VMENTRY; 9057 vmcs12->exit_qualification = qualification; 9058 nested_vmx_succeed(vcpu); 9059 if (enable_shadow_vmcs) 9060 to_vmx(vcpu)->nested.sync_shadow_vmcs = true; 9061} 9062 9063static int vmx_check_intercept(struct kvm_vcpu *vcpu, 9064 struct x86_instruction_info *info, 9065 enum x86_intercept_stage stage) 9066{ 9067 return X86EMUL_CONTINUE; 9068} 9069 9070static void vmx_sched_in(struct kvm_vcpu *vcpu, int cpu) 9071{ 9072 if (ple_gap) 9073 shrink_ple_window(vcpu); 9074} 9075 9076static struct kvm_x86_ops vmx_x86_ops = { 9077 .cpu_has_kvm_support = cpu_has_kvm_support, 9078 .disabled_by_bios = vmx_disabled_by_bios, 9079 .hardware_setup = hardware_setup, 9080 .hardware_unsetup = hardware_unsetup, 9081 .check_processor_compatibility = vmx_check_processor_compat, 9082 .hardware_enable = hardware_enable, 9083 .hardware_disable = hardware_disable, 9084 .cpu_has_accelerated_tpr = report_flexpriority, 9085 9086 .vcpu_create = vmx_create_vcpu, 9087 .vcpu_free = vmx_free_vcpu, 9088 .vcpu_reset = vmx_vcpu_reset, 9089 9090 .prepare_guest_switch = vmx_save_host_state, 9091 .vcpu_load = vmx_vcpu_load, 9092 .vcpu_put = vmx_vcpu_put, 9093 9094 .update_db_bp_intercept = update_exception_bitmap, 9095 .get_msr = vmx_get_msr, 9096 .set_msr = vmx_set_msr, 9097 .get_segment_base = vmx_get_segment_base, 9098 .get_segment = vmx_get_segment, 9099 .set_segment = vmx_set_segment, 9100 .get_cpl = vmx_get_cpl, 9101 .get_cs_db_l_bits = vmx_get_cs_db_l_bits, 9102 .decache_cr0_guest_bits = vmx_decache_cr0_guest_bits, 9103 .decache_cr3 = vmx_decache_cr3, 9104 .decache_cr4_guest_bits = vmx_decache_cr4_guest_bits, 9105 .set_cr0 = vmx_set_cr0, 9106 .set_cr3 = vmx_set_cr3, 9107 .set_cr4 = vmx_set_cr4, 9108 .set_efer = vmx_set_efer, 9109 .get_idt = vmx_get_idt, 9110 .set_idt = vmx_set_idt, 9111 .get_gdt = vmx_get_gdt, 9112 .set_gdt = vmx_set_gdt, 9113 .get_dr6 = vmx_get_dr6, 9114 .set_dr6 = vmx_set_dr6, 9115 .set_dr7 = vmx_set_dr7, 9116 .sync_dirty_debug_regs = vmx_sync_dirty_debug_regs, 9117 .cache_reg = vmx_cache_reg, 9118 .get_rflags = vmx_get_rflags, 9119 .set_rflags = vmx_set_rflags, 9120 .fpu_deactivate = vmx_fpu_deactivate, 9121 9122 .tlb_flush = vmx_flush_tlb, 9123 9124 .run = vmx_vcpu_run, 9125 .handle_exit = vmx_handle_exit, 9126 .skip_emulated_instruction = skip_emulated_instruction, 9127 .set_interrupt_shadow = vmx_set_interrupt_shadow, 9128 .get_interrupt_shadow = vmx_get_interrupt_shadow, 9129 .patch_hypercall = vmx_patch_hypercall, 9130 .set_irq = vmx_inject_irq, 9131 .set_nmi = vmx_inject_nmi, 9132 .queue_exception = vmx_queue_exception, 9133 .cancel_injection = vmx_cancel_injection, 9134 .interrupt_allowed = vmx_interrupt_allowed, 9135 .nmi_allowed = vmx_nmi_allowed, 9136 .get_nmi_mask = vmx_get_nmi_mask, 9137 .set_nmi_mask = vmx_set_nmi_mask, 9138 .enable_nmi_window = enable_nmi_window, 9139 .enable_irq_window = enable_irq_window, 9140 .update_cr8_intercept = update_cr8_intercept, 9141 .set_virtual_x2apic_mode = vmx_set_virtual_x2apic_mode, 9142 .set_apic_access_page_addr = vmx_set_apic_access_page_addr, 9143 .vm_has_apicv = vmx_vm_has_apicv, 9144 .load_eoi_exitmap = vmx_load_eoi_exitmap, 9145 .hwapic_irr_update = vmx_hwapic_irr_update, 9146 .hwapic_isr_update = vmx_hwapic_isr_update, 9147 .sync_pir_to_irr = vmx_sync_pir_to_irr, 9148 
.deliver_posted_interrupt = vmx_deliver_posted_interrupt, 9149 9150 .set_tss_addr = vmx_set_tss_addr, 9151 .get_tdp_level = get_ept_level, 9152 .get_mt_mask = vmx_get_mt_mask, 9153 9154 .get_exit_info = vmx_get_exit_info, 9155 9156 .get_lpage_level = vmx_get_lpage_level, 9157 9158 .cpuid_update = vmx_cpuid_update, 9159 9160 .rdtscp_supported = vmx_rdtscp_supported, 9161 .invpcid_supported = vmx_invpcid_supported, 9162 9163 .set_supported_cpuid = vmx_set_supported_cpuid, 9164 9165 .has_wbinvd_exit = cpu_has_vmx_wbinvd_exit, 9166 9167 .set_tsc_khz = vmx_set_tsc_khz, 9168 .read_tsc_offset = vmx_read_tsc_offset, 9169 .write_tsc_offset = vmx_write_tsc_offset, 9170 .adjust_tsc_offset = vmx_adjust_tsc_offset, 9171 .compute_tsc_offset = vmx_compute_tsc_offset, 9172 .read_l1_tsc = vmx_read_l1_tsc, 9173 9174 .set_tdp_cr3 = vmx_set_cr3, 9175 9176 .check_intercept = vmx_check_intercept, 9177 .handle_external_intr = vmx_handle_external_intr, 9178 .mpx_supported = vmx_mpx_supported, 9179 9180 .check_nested_events = vmx_check_nested_events, 9181 9182 .sched_in = vmx_sched_in, 9183}; 9184 9185static int __init vmx_init(void) 9186{ 9187 int r, i, msr; 9188 9189 rdmsrl_safe(MSR_EFER, &host_efer); 9190 9191 for (i = 0; i < ARRAY_SIZE(vmx_msr_index); ++i) 9192 kvm_define_shared_msr(i, vmx_msr_index[i]); 9193 9194 vmx_io_bitmap_a = (unsigned long *)__get_free_page(GFP_KERNEL); 9195 if (!vmx_io_bitmap_a) 9196 return -ENOMEM; 9197 9198 r = -ENOMEM; 9199 9200 vmx_io_bitmap_b = (unsigned long *)__get_free_page(GFP_KERNEL); 9201 if (!vmx_io_bitmap_b) 9202 goto out; 9203 9204 vmx_msr_bitmap_legacy = (unsigned long *)__get_free_page(GFP_KERNEL); 9205 if (!vmx_msr_bitmap_legacy) 9206 goto out1; 9207 9208 vmx_msr_bitmap_legacy_x2apic = 9209 (unsigned long *)__get_free_page(GFP_KERNEL); 9210 if (!vmx_msr_bitmap_legacy_x2apic) 9211 goto out2; 9212 9213 vmx_msr_bitmap_longmode = (unsigned long *)__get_free_page(GFP_KERNEL); 9214 if (!vmx_msr_bitmap_longmode) 9215 goto out3; 9216 9217 vmx_msr_bitmap_longmode_x2apic = 9218 (unsigned long *)__get_free_page(GFP_KERNEL); 9219 if (!vmx_msr_bitmap_longmode_x2apic) 9220 goto out4; 9221 vmx_vmread_bitmap = (unsigned long *)__get_free_page(GFP_KERNEL); 9222 if (!vmx_vmread_bitmap) 9223 goto out5; 9224 9225 vmx_vmwrite_bitmap = (unsigned long *)__get_free_page(GFP_KERNEL); 9226 if (!vmx_vmwrite_bitmap) 9227 goto out6; 9228 9229 memset(vmx_vmread_bitmap, 0xff, PAGE_SIZE); 9230 memset(vmx_vmwrite_bitmap, 0xff, PAGE_SIZE); 9231 9232 /* 9233 * Allow direct access to the PC debug port (it is often used for I/O 9234 * delays, but the vmexits simply slow things down). 
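 * The memset below first marks every I/O port as intercepted;
 * clear_bit(0x80, ...) then exempts port 0x80 only, so all other port
 * accesses still cause a vmexit.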
9235 */ 9236 memset(vmx_io_bitmap_a, 0xff, PAGE_SIZE); 9237 clear_bit(0x80, vmx_io_bitmap_a); 9238 9239 memset(vmx_io_bitmap_b, 0xff, PAGE_SIZE); 9240 9241 memset(vmx_msr_bitmap_legacy, 0xff, PAGE_SIZE); 9242 memset(vmx_msr_bitmap_longmode, 0xff, PAGE_SIZE); 9243 9244 set_bit(0, vmx_vpid_bitmap); /* 0 is reserved for host */ 9245 9246 r = kvm_init(&vmx_x86_ops, sizeof(struct vcpu_vmx), 9247 __alignof__(struct vcpu_vmx), THIS_MODULE); 9248 if (r) 9249 goto out7; 9250 9251#ifdef CONFIG_KEXEC 9252 rcu_assign_pointer(crash_vmclear_loaded_vmcss, 9253 crash_vmclear_local_loaded_vmcss); 9254#endif 9255 9256 vmx_disable_intercept_for_msr(MSR_FS_BASE, false); 9257 vmx_disable_intercept_for_msr(MSR_GS_BASE, false); 9258 vmx_disable_intercept_for_msr(MSR_KERNEL_GS_BASE, true); 9259 vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_CS, false); 9260 vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_ESP, false); 9261 vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_EIP, false); 9262 vmx_disable_intercept_for_msr(MSR_IA32_BNDCFGS, true); 9263 9264 memcpy(vmx_msr_bitmap_legacy_x2apic, 9265 vmx_msr_bitmap_legacy, PAGE_SIZE); 9266 memcpy(vmx_msr_bitmap_longmode_x2apic, 9267 vmx_msr_bitmap_longmode, PAGE_SIZE); 9268 9269 if (enable_apicv) { 9270 for (msr = 0x800; msr <= 0x8ff; msr++) 9271 vmx_disable_intercept_msr_read_x2apic(msr); 9272 9273 /* According SDM, in x2apic mode, the whole id reg is used. 9274 * But in KVM, it only use the highest eight bits. Need to 9275 * intercept it */ 9276 vmx_enable_intercept_msr_read_x2apic(0x802); 9277 /* TMCCT */ 9278 vmx_enable_intercept_msr_read_x2apic(0x839); 9279 /* TPR */ 9280 vmx_disable_intercept_msr_write_x2apic(0x808); 9281 /* EOI */ 9282 vmx_disable_intercept_msr_write_x2apic(0x80b); 9283 /* SELF-IPI */ 9284 vmx_disable_intercept_msr_write_x2apic(0x83f); 9285 } 9286 9287 if (enable_ept) { 9288 kvm_mmu_set_mask_ptes(0ull, 9289 (enable_ept_ad_bits) ? VMX_EPT_ACCESS_BIT : 0ull, 9290 (enable_ept_ad_bits) ? VMX_EPT_DIRTY_BIT : 0ull, 9291 0ull, VMX_EPT_EXECUTABLE_MASK); 9292 ept_set_mmio_spte_mask(); 9293 kvm_enable_tdp(); 9294 } else 9295 kvm_disable_tdp(); 9296 9297 update_ple_window_actual_max(); 9298 9299 return 0; 9300 9301out7: 9302 free_page((unsigned long)vmx_vmwrite_bitmap); 9303out6: 9304 free_page((unsigned long)vmx_vmread_bitmap); 9305out5: 9306 free_page((unsigned long)vmx_msr_bitmap_longmode_x2apic); 9307out4: 9308 free_page((unsigned long)vmx_msr_bitmap_longmode); 9309out3: 9310 free_page((unsigned long)vmx_msr_bitmap_legacy_x2apic); 9311out2: 9312 free_page((unsigned long)vmx_msr_bitmap_legacy); 9313out1: 9314 free_page((unsigned long)vmx_io_bitmap_b); 9315out: 9316 free_page((unsigned long)vmx_io_bitmap_a); 9317 return r; 9318} 9319 9320static void __exit vmx_exit(void) 9321{ 9322 free_page((unsigned long)vmx_msr_bitmap_legacy_x2apic); 9323 free_page((unsigned long)vmx_msr_bitmap_longmode_x2apic); 9324 free_page((unsigned long)vmx_msr_bitmap_legacy); 9325 free_page((unsigned long)vmx_msr_bitmap_longmode); 9326 free_page((unsigned long)vmx_io_bitmap_b); 9327 free_page((unsigned long)vmx_io_bitmap_a); 9328 free_page((unsigned long)vmx_vmwrite_bitmap); 9329 free_page((unsigned long)vmx_vmread_bitmap); 9330 9331#ifdef CONFIG_KEXEC 9332 RCU_INIT_POINTER(crash_vmclear_loaded_vmcss, NULL); 9333 synchronize_rcu(); 9334#endif 9335 9336 kvm_exit(); 9337} 9338 9339module_init(vmx_init) 9340module_exit(vmx_exit) 9341