Commit e6d9bfde authored by Linus Torvalds's avatar Linus Torvalds

Merge tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm

Pull kvm fixes from Paolo Bonzini:
 "Bug fixes, plus a new test case and the associated infrastructure for
  writing nested virtualization tests"

* tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm:
  kvm: selftests: add vmx_tsc_adjust_test
  kvm: x86: move MSR_IA32_TSC handling to x86.c
  X86/KVM: Properly update 'tsc_offset' to represent the running guest
  kvm: selftests: add -std=gnu99 cflags
  x86: Add check for APIC access address for vmentry of L2 guests
  KVM: X86: fix incorrect reference of trace_kvm_pi_irte_update
  X86/KVM: Do not allow DISABLE_EXITS_MWAIT when LAPIC ARAT is not available
  kvm: selftests: fix spelling mistake: "divisable" and "divisible"
  X86/VMX: Disable VMX preemption timer if MWAIT is not intercepted
parents e6f39e87 d5edb7f8
......@@ -1013,6 +1013,7 @@ struct kvm_x86_ops {
bool (*has_wbinvd_exit)(void);
u64 (*read_l1_tsc_offset)(struct kvm_vcpu *vcpu);
void (*write_tsc_offset)(struct kvm_vcpu *vcpu, u64 offset);
void (*get_exit_info)(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2);
......
......@@ -1423,12 +1423,23 @@ static void init_sys_seg(struct vmcb_seg *seg, uint32_t type)
seg->base = 0;
}
static u64 svm_read_l1_tsc_offset(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm = to_svm(vcpu);
if (is_guest_mode(vcpu))
return svm->nested.hsave->control.tsc_offset;
return vcpu->arch.tsc_offset;
}
static void svm_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
{
struct vcpu_svm *svm = to_svm(vcpu);
u64 g_tsc_offset = 0;
if (is_guest_mode(vcpu)) {
/* Write L1's TSC offset. */
g_tsc_offset = svm->vmcb->control.tsc_offset -
svm->nested.hsave->control.tsc_offset;
svm->nested.hsave->control.tsc_offset = offset;
......@@ -3322,6 +3333,7 @@ static int nested_svm_vmexit(struct vcpu_svm *svm)
/* Restore the original control entries */
copy_vmcb_control_area(vmcb, hsave);
svm->vcpu.arch.tsc_offset = svm->vmcb->control.tsc_offset;
kvm_clear_exception_queue(&svm->vcpu);
kvm_clear_interrupt_queue(&svm->vcpu);
......@@ -3482,10 +3494,12 @@ static void enter_svm_guest_mode(struct vcpu_svm *svm, u64 vmcb_gpa,
/* We don't want to see VMMCALLs from a nested guest */
clr_intercept(svm, INTERCEPT_VMMCALL);
svm->vcpu.arch.tsc_offset += nested_vmcb->control.tsc_offset;
svm->vmcb->control.tsc_offset = svm->vcpu.arch.tsc_offset;
svm->vmcb->control.virt_ext = nested_vmcb->control.virt_ext;
svm->vmcb->control.int_vector = nested_vmcb->control.int_vector;
svm->vmcb->control.int_state = nested_vmcb->control.int_state;
svm->vmcb->control.tsc_offset += nested_vmcb->control.tsc_offset;
svm->vmcb->control.event_inj = nested_vmcb->control.event_inj;
svm->vmcb->control.event_inj_err = nested_vmcb->control.event_inj_err;
......@@ -4035,12 +4049,6 @@ static int svm_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
struct vcpu_svm *svm = to_svm(vcpu);
switch (msr_info->index) {
case MSR_IA32_TSC: {
msr_info->data = svm->vmcb->control.tsc_offset +
kvm_scale_tsc(vcpu, rdtsc());
break;
}
case MSR_STAR:
msr_info->data = svm->vmcb->save.star;
break;
......@@ -4193,9 +4201,6 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr)
svm->vmcb->save.g_pat = data;
mark_dirty(svm->vmcb, VMCB_NPT);
break;
case MSR_IA32_TSC:
kvm_write_tsc(vcpu, msr);
break;
case MSR_IA32_SPEC_CTRL:
if (!msr->host_initiated &&
!guest_cpuid_has(vcpu, X86_FEATURE_IBRS))
......@@ -5265,9 +5270,8 @@ static int svm_update_pi_irte(struct kvm *kvm, unsigned int host_irq,
}
if (!ret && svm) {
trace_kvm_pi_irte_update(svm->vcpu.vcpu_id,
host_irq, e->gsi,
vcpu_info.vector,
trace_kvm_pi_irte_update(host_irq, svm->vcpu.vcpu_id,
e->gsi, vcpu_info.vector,
vcpu_info.pi_desc_addr, set);
}
......@@ -7102,6 +7106,7 @@ static struct kvm_x86_ops svm_x86_ops __ro_after_init = {
.has_wbinvd_exit = svm_has_wbinvd_exit,
.read_l1_tsc_offset = svm_read_l1_tsc_offset,
.write_tsc_offset = svm_write_tsc_offset,
.set_tdp_cr3 = set_tdp_cr3,
......
......@@ -2880,18 +2880,15 @@ static void setup_msrs(struct vcpu_vmx *vmx)
vmx_update_msr_bitmap(&vmx->vcpu);
}
/*
* reads and returns guest's timestamp counter "register"
* guest_tsc = (host_tsc * tsc multiplier) >> 48 + tsc_offset
* -- Intel TSC Scaling for Virtualization White Paper, sec 1.3
*/
static u64 guest_read_tsc(struct kvm_vcpu *vcpu)
static u64 vmx_read_l1_tsc_offset(struct kvm_vcpu *vcpu)
{
u64 host_tsc, tsc_offset;
struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
host_tsc = rdtsc();
tsc_offset = vmcs_read64(TSC_OFFSET);
return kvm_scale_tsc(vcpu, host_tsc) + tsc_offset;
if (is_guest_mode(vcpu) &&
(vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETING))
return vcpu->arch.tsc_offset - vmcs12->tsc_offset;
return vcpu->arch.tsc_offset;
}
/*
......@@ -3524,9 +3521,6 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
#endif
case MSR_EFER:
return kvm_get_msr_common(vcpu, msr_info);
case MSR_IA32_TSC:
msr_info->data = guest_read_tsc(vcpu);
break;
case MSR_IA32_SPEC_CTRL:
if (!msr_info->host_initiated &&
!guest_cpuid_has(vcpu, X86_FEATURE_IBRS) &&
......@@ -3646,9 +3640,6 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
return 1;
vmcs_write64(GUEST_BNDCFGS, data);
break;
case MSR_IA32_TSC:
kvm_write_tsc(vcpu, msr_info);
break;
case MSR_IA32_SPEC_CTRL:
if (!msr_info->host_initiated &&
!guest_cpuid_has(vcpu, X86_FEATURE_IBRS) &&
......@@ -10608,6 +10599,16 @@ static inline bool nested_vmx_prepare_msr_bitmap(struct kvm_vcpu *vcpu,
return true;
}
static int nested_vmx_check_apic_access_controls(struct kvm_vcpu *vcpu,
struct vmcs12 *vmcs12)
{
if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES) &&
!page_address_valid(vcpu, vmcs12->apic_access_addr))
return -EINVAL;
else
return 0;
}
static int nested_vmx_check_apicv_controls(struct kvm_vcpu *vcpu,
struct vmcs12 *vmcs12)
{
......@@ -11176,11 +11177,8 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
vmcs_write64(GUEST_IA32_PAT, vmx->vcpu.arch.pat);
}
if (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETING)
vmcs_write64(TSC_OFFSET,
vcpu->arch.tsc_offset + vmcs12->tsc_offset);
else
vmcs_write64(TSC_OFFSET, vcpu->arch.tsc_offset);
vmcs_write64(TSC_OFFSET, vcpu->arch.tsc_offset);
if (kvm_has_tsc_control)
decache_tsc_multiplier(vmx);
......@@ -11299,6 +11297,9 @@ static int check_vmentry_prereqs(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
if (nested_vmx_check_msr_bitmap_controls(vcpu, vmcs12))
return VMXERR_ENTRY_INVALID_CONTROL_FIELD;
if (nested_vmx_check_apic_access_controls(vcpu, vmcs12))
return VMXERR_ENTRY_INVALID_CONTROL_FIELD;
if (nested_vmx_check_tpr_shadow_controls(vcpu, vmcs12))
return VMXERR_ENTRY_INVALID_CONTROL_FIELD;
......@@ -11420,6 +11421,7 @@ static int enter_vmx_non_root_mode(struct kvm_vcpu *vcpu, bool from_vmentry)
struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
u32 msr_entry_idx;
u32 exit_qual;
int r;
enter_guest_mode(vcpu);
......@@ -11429,26 +11431,21 @@ static int enter_vmx_non_root_mode(struct kvm_vcpu *vcpu, bool from_vmentry)
vmx_switch_vmcs(vcpu, &vmx->nested.vmcs02);
vmx_segment_cache_clear(vmx);
if (prepare_vmcs02(vcpu, vmcs12, from_vmentry, &exit_qual)) {
leave_guest_mode(vcpu);
vmx_switch_vmcs(vcpu, &vmx->vmcs01);
nested_vmx_entry_failure(vcpu, vmcs12,
EXIT_REASON_INVALID_STATE, exit_qual);
return 1;
}
if (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETING)
vcpu->arch.tsc_offset += vmcs12->tsc_offset;
r = EXIT_REASON_INVALID_STATE;
if (prepare_vmcs02(vcpu, vmcs12, from_vmentry, &exit_qual))
goto fail;
nested_get_vmcs12_pages(vcpu, vmcs12);
r = EXIT_REASON_MSR_LOAD_FAIL;
msr_entry_idx = nested_vmx_load_msr(vcpu,
vmcs12->vm_entry_msr_load_addr,
vmcs12->vm_entry_msr_load_count);
if (msr_entry_idx) {
leave_guest_mode(vcpu);
vmx_switch_vmcs(vcpu, &vmx->vmcs01);
nested_vmx_entry_failure(vcpu, vmcs12,
EXIT_REASON_MSR_LOAD_FAIL, msr_entry_idx);
return 1;
}
if (msr_entry_idx)
goto fail;
/*
* Note no nested_vmx_succeed or nested_vmx_fail here. At this point
......@@ -11457,6 +11454,14 @@ static int enter_vmx_non_root_mode(struct kvm_vcpu *vcpu, bool from_vmentry)
* the success flag) when L2 exits (see nested_vmx_vmexit()).
*/
return 0;
fail:
if (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETING)
vcpu->arch.tsc_offset -= vmcs12->tsc_offset;
leave_guest_mode(vcpu);
vmx_switch_vmcs(vcpu, &vmx->vmcs01);
nested_vmx_entry_failure(vcpu, vmcs12, r, exit_qual);
return 1;
}
/*
......@@ -12028,6 +12033,9 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason,
leave_guest_mode(vcpu);
if (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETING)
vcpu->arch.tsc_offset -= vmcs12->tsc_offset;
if (likely(!vmx->fail)) {
if (exit_reason == -1)
sync_vmcs12(vcpu, vmcs12);
......@@ -12224,10 +12232,16 @@ static inline int u64_shl_div_u64(u64 a, unsigned int shift,
static int vmx_set_hv_timer(struct kvm_vcpu *vcpu, u64 guest_deadline_tsc)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
u64 tscl = rdtsc();
u64 guest_tscl = kvm_read_l1_tsc(vcpu, tscl);
u64 delta_tsc = max(guest_deadline_tsc, guest_tscl) - guest_tscl;
struct vcpu_vmx *vmx;
u64 tscl, guest_tscl, delta_tsc;
if (kvm_mwait_in_guest(vcpu->kvm))
return -EOPNOTSUPP;
vmx = to_vmx(vcpu);
tscl = rdtsc();
guest_tscl = kvm_read_l1_tsc(vcpu, tscl);
delta_tsc = max(guest_deadline_tsc, guest_tscl) - guest_tscl;
/* Convert to host delta tsc if tsc scaling is enabled */
if (vcpu->arch.tsc_scaling_ratio != kvm_default_tsc_scaling_ratio &&
......@@ -12533,7 +12547,7 @@ static int vmx_update_pi_irte(struct kvm *kvm, unsigned int host_irq,
vcpu_info.pi_desc_addr = __pa(vcpu_to_pi_desc(vcpu));
vcpu_info.vector = irq.vector;
trace_kvm_pi_irte_update(vcpu->vcpu_id, host_irq, e->gsi,
trace_kvm_pi_irte_update(host_irq, vcpu->vcpu_id, e->gsi,
vcpu_info.vector, vcpu_info.pi_desc_addr, set);
if (set)
......@@ -12712,6 +12726,7 @@ static struct kvm_x86_ops vmx_x86_ops __ro_after_init = {
.has_wbinvd_exit = cpu_has_vmx_wbinvd_exit,
.read_l1_tsc_offset = vmx_read_l1_tsc_offset,
.write_tsc_offset = vmx_write_tsc_offset,
.set_tdp_cr3 = vmx_set_cr3,
......
......@@ -1490,7 +1490,7 @@ static void kvm_track_tsc_matching(struct kvm_vcpu *vcpu)
static void update_ia32_tsc_adjust_msr(struct kvm_vcpu *vcpu, s64 offset)
{
u64 curr_offset = vcpu->arch.tsc_offset;
u64 curr_offset = kvm_x86_ops->read_l1_tsc_offset(vcpu);
vcpu->arch.ia32_tsc_adjust_msr += offset - curr_offset;
}
......@@ -1532,7 +1532,9 @@ static u64 kvm_compute_tsc_offset(struct kvm_vcpu *vcpu, u64 target_tsc)
u64 kvm_read_l1_tsc(struct kvm_vcpu *vcpu, u64 host_tsc)
{
return vcpu->arch.tsc_offset + kvm_scale_tsc(vcpu, host_tsc);
u64 tsc_offset = kvm_x86_ops->read_l1_tsc_offset(vcpu);
return tsc_offset + kvm_scale_tsc(vcpu, host_tsc);
}
EXPORT_SYMBOL_GPL(kvm_read_l1_tsc);
......@@ -2362,6 +2364,9 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
return 1;
vcpu->arch.smbase = data;
break;
case MSR_IA32_TSC:
kvm_write_tsc(vcpu, msr_info);
break;
case MSR_SMI_COUNT:
if (!msr_info->host_initiated)
return 1;
......@@ -2605,6 +2610,9 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
case MSR_IA32_UCODE_REV:
msr_info->data = vcpu->arch.microcode_version;
break;
case MSR_IA32_TSC:
msr_info->data = kvm_scale_tsc(vcpu, rdtsc()) + vcpu->arch.tsc_offset;
break;
case MSR_MTRRcap:
case 0x200 ... 0x2ff:
return kvm_mtrr_get_msr(vcpu, msr_info->index, &msr_info->data);
......@@ -2819,7 +2827,8 @@ static int msr_io(struct kvm_vcpu *vcpu, struct kvm_msrs __user *user_msrs,
static inline bool kvm_can_mwait_in_guest(void)
{
return boot_cpu_has(X86_FEATURE_MWAIT) &&
!boot_cpu_has_bug(X86_BUG_MONITOR);
!boot_cpu_has_bug(X86_BUG_MONITOR) &&
boot_cpu_has(X86_FEATURE_ARAT);
}
int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
......
......@@ -4,17 +4,18 @@ top_srcdir = ../../../../
UNAME_M := $(shell uname -m)
LIBKVM = lib/assert.c lib/elf.c lib/io.c lib/kvm_util.c lib/sparsebit.c
LIBKVM_x86_64 = lib/x86.c
LIBKVM_x86_64 = lib/x86.c lib/vmx.c
TEST_GEN_PROGS_x86_64 = set_sregs_test
TEST_GEN_PROGS_x86_64 += sync_regs_test
TEST_GEN_PROGS_x86_64 += vmx_tsc_adjust_test
TEST_GEN_PROGS += $(TEST_GEN_PROGS_$(UNAME_M))
LIBKVM += $(LIBKVM_$(UNAME_M))
INSTALL_HDR_PATH = $(top_srcdir)/usr
LINUX_HDR_PATH = $(INSTALL_HDR_PATH)/include/
CFLAGS += -O2 -g -I$(LINUX_HDR_PATH) -Iinclude -I$(<D)
CFLAGS += -O2 -g -std=gnu99 -I$(LINUX_HDR_PATH) -Iinclude -I$(<D)
# After inclusion, $(OUTPUT) is defined and
# $(TEST_GEN_PROGS) starts with $(OUTPUT)/
......
......@@ -112,24 +112,27 @@ void virt_pg_map(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr,
vm_paddr_t vm_phy_page_alloc(struct kvm_vm *vm,
vm_paddr_t paddr_min, uint32_t memslot);
void kvm_get_supported_cpuid(struct kvm_cpuid2 *cpuid);
struct kvm_cpuid2 *kvm_get_supported_cpuid(void);
void vcpu_set_cpuid(
struct kvm_vm *vm, uint32_t vcpuid, struct kvm_cpuid2 *cpuid);
struct kvm_cpuid2 *allocate_kvm_cpuid2(void);
struct kvm_cpuid_entry2 *
find_cpuid_index_entry(struct kvm_cpuid2 *cpuid, uint32_t function,
uint32_t index);
kvm_get_supported_cpuid_index(uint32_t function, uint32_t index);
static inline struct kvm_cpuid_entry2 *
find_cpuid_entry(struct kvm_cpuid2 *cpuid, uint32_t function)
kvm_get_supported_cpuid_entry(uint32_t function)
{
return find_cpuid_index_entry(cpuid, function, 0);
return kvm_get_supported_cpuid_index(function, 0);
}
struct kvm_vm *vm_create_default(uint32_t vcpuid, void *guest_code);
void vm_vcpu_add_default(struct kvm_vm *vm, uint32_t vcpuid, void *guest_code);
typedef void (*vmx_guest_code_t)(vm_vaddr_t vmxon_vaddr,
vm_paddr_t vmxon_paddr,
vm_vaddr_t vmcs_vaddr,
vm_paddr_t vmcs_paddr);
struct kvm_userspace_memory_region *
kvm_userspace_memory_region_find(struct kvm_vm *vm, uint64_t start,
uint64_t end);
......
This diff is collapsed.
......@@ -378,7 +378,7 @@ int kvm_memcmp_hva_gva(void *hva,
* complicated. This function uses a reasonable default length for
* the array and performs the appropriate allocation.
*/
struct kvm_cpuid2 *allocate_kvm_cpuid2(void)
static struct kvm_cpuid2 *allocate_kvm_cpuid2(void)
{
struct kvm_cpuid2 *cpuid;
int nent = 100;
......@@ -402,17 +402,21 @@ struct kvm_cpuid2 *allocate_kvm_cpuid2(void)
* Input Args: None
*
* Output Args:
* cpuid - The supported KVM CPUID
*
* Return: void
* Return: The supported KVM CPUID
*
* Get the guest CPUID supported by KVM.
*/
void kvm_get_supported_cpuid(struct kvm_cpuid2 *cpuid)
struct kvm_cpuid2 *kvm_get_supported_cpuid(void)
{
static struct kvm_cpuid2 *cpuid;
int ret;
int kvm_fd;
if (cpuid)
return cpuid;
cpuid = allocate_kvm_cpuid2();
kvm_fd = open(KVM_DEV_PATH, O_RDONLY);
TEST_ASSERT(kvm_fd >= 0, "open %s failed, rc: %i errno: %i",
KVM_DEV_PATH, kvm_fd, errno);
......@@ -422,6 +426,7 @@ void kvm_get_supported_cpuid(struct kvm_cpuid2 *cpuid)
ret, errno);
close(kvm_fd);
return cpuid;
}
/* Locate a cpuid entry.
......@@ -435,12 +440,13 @@ void kvm_get_supported_cpuid(struct kvm_cpuid2 *cpuid)
* Return: A pointer to the cpuid entry. Never returns NULL.
*/
struct kvm_cpuid_entry2 *
find_cpuid_index_entry(struct kvm_cpuid2 *cpuid, uint32_t function,
uint32_t index)
kvm_get_supported_cpuid_index(uint32_t function, uint32_t index)
{
struct kvm_cpuid2 *cpuid;
struct kvm_cpuid_entry2 *entry = NULL;
int i;
cpuid = kvm_get_supported_cpuid();
for (i = 0; i < cpuid->nent; i++) {
if (cpuid->entries[i].function == function &&
cpuid->entries[i].index == index) {
......@@ -1435,7 +1441,7 @@ vm_paddr_t vm_phy_page_alloc(struct kvm_vm *vm,
sparsebit_idx_t pg;
TEST_ASSERT((paddr_min % vm->page_size) == 0, "Min physical address "
"not divisable by page size.\n"
"not divisible by page size.\n"
" paddr_min: 0x%lx page_size: 0x%x",
paddr_min, vm->page_size);
......
......@@ -121,7 +121,7 @@
* avoided by moving the setting of the nodes mask bits into
* the previous nodes num_after setting.
*
* + Node starting index is evenly divisable by the number of bits
* + Node starting index is evenly divisible by the number of bits
* within a nodes mask member.
*
* + Nodes never represent a range of bits that wrap around the
......@@ -1741,7 +1741,7 @@ void sparsebit_validate_internal(struct sparsebit *s)
/* Validate node index is divisible by the mask size */
if (nodep->idx % MASK_BITS) {
fprintf(stderr, "Node index not divisable by "
fprintf(stderr, "Node index not divisible by "
"mask size,\n"
" nodep: %p nodep->idx: 0x%lx "
"MASK_BITS: %lu\n",
......
/*
* tools/testing/selftests/kvm/lib/x86.c
*
* Copyright (C) 2018, Google LLC.
*
* This work is licensed under the terms of the GNU GPL, version 2.
*/
#define _GNU_SOURCE /* for program_invocation_name */
#include "test_util.h"
#include "kvm_util.h"
#include "x86.h"
#include "vmx.h"
/* Create a default VM for VMX tests.
*
* Input Args:
* vcpuid - The id of the single VCPU to add to the VM.
* guest_code - The vCPU's entry point
*
* Output Args: None
*
* Return:
* Pointer to opaque structure that describes the created VM.
*/
struct kvm_vm *
vm_create_default_vmx(uint32_t vcpuid, vmx_guest_code_t guest_code)
{
struct kvm_cpuid2 *cpuid;
struct kvm_vm *vm;
vm_vaddr_t vmxon_vaddr;
vm_paddr_t vmxon_paddr;
vm_vaddr_t vmcs_vaddr;
vm_paddr_t vmcs_paddr;
vm = vm_create_default(vcpuid, (void *) guest_code);
/* Enable nesting in CPUID */
vcpu_set_cpuid(vm, vcpuid, kvm_get_supported_cpuid());
/* Setup of a region of guest memory for the vmxon region. */
vmxon_vaddr = vm_vaddr_alloc(vm, getpagesize(), 0, 0, 0);
vmxon_paddr = addr_gva2gpa(vm, vmxon_vaddr);
/* Setup of a region of guest memory for a vmcs. */
vmcs_vaddr = vm_vaddr_alloc(vm, getpagesize(), 0, 0, 0);
vmcs_paddr = addr_gva2gpa(vm, vmcs_vaddr);
vcpu_args_set(vm, vcpuid, 4, vmxon_vaddr, vmxon_paddr, vmcs_vaddr,
vmcs_paddr);
return vm;
}
void prepare_for_vmx_operation(void)
{
uint64_t feature_control;
uint64_t required;
unsigned long cr0;
unsigned long cr4;
/*
* Ensure bits in CR0 and CR4 are valid in VMX operation:
* - Bit X is 1 in _FIXED0: bit X is fixed to 1 in CRx.
* - Bit X is 0 in _FIXED1: bit X is fixed to 0 in CRx.
*/
__asm__ __volatile__("mov %%cr0, %0" : "=r"(cr0) : : "memory");
cr0 &= rdmsr(MSR_IA32_VMX_CR0_FIXED1);
cr0 |= rdmsr(MSR_IA32_VMX_CR0_FIXED0);
__asm__ __volatile__("mov %0, %%cr0" : : "r"(cr0) : "memory");
__asm__ __volatile__("mov %%cr4, %0" : "=r"(cr4) : : "memory");
cr4 &= rdmsr(MSR_IA32_VMX_CR4_FIXED1);
cr4 |= rdmsr(MSR_IA32_VMX_CR4_FIXED0);
/* Enable VMX operation */
cr4 |= X86_CR4_VMXE;
__asm__ __volatile__("mov %0, %%cr4" : : "r"(cr4) : "memory");
/*
* Configure IA32_FEATURE_CONTROL MSR to allow VMXON:
* Bit 0: Lock bit. If clear, VMXON causes a #GP.
* Bit 2: Enables VMXON outside of SMX operation. If clear, VMXON
* outside of SMX causes a #GP.
*/
required = FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX;
required |= FEATURE_CONTROL_LOCKED;
feature_control = rdmsr(MSR_IA32_FEATURE_CONTROL);
if ((feature_control & required) != required)
wrmsr(MSR_IA32_FEATURE_CONTROL, feature_control | required);
}
/*
* Initialize the control fields to the most basic settings possible.
*/
static inline void init_vmcs_control_fields(void)
{
vmwrite(VIRTUAL_PROCESSOR_ID, 0);
vmwrite(POSTED_INTR_NV, 0);
vmwrite(PIN_BASED_VM_EXEC_CONTROL, rdmsr(MSR_IA32_VMX_PINBASED_CTLS));
vmwrite(CPU_BASED_VM_EXEC_CONTROL, rdmsr(MSR_IA32_VMX_PROCBASED_CTLS));
vmwrite(EXCEPTION_BITMAP, 0);
vmwrite(PAGE_FAULT_ERROR_CODE_MASK, 0);
vmwrite(PAGE_FAULT_ERROR_CODE_MATCH, -1); /* Never match */
vmwrite(CR3_TARGET_COUNT, 0);
vmwrite(VM_EXIT_CONTROLS, rdmsr(MSR_IA32_VMX_EXIT_CTLS) |
VM_EXIT_HOST_ADDR_SPACE_SIZE); /* 64-bit host */
vmwrite(VM_EXIT_MSR_STORE_COUNT, 0);
vmwrite(VM_EXIT_MSR_LOAD_COUNT, 0);
vmwrite(VM_ENTRY_CONTROLS, rdmsr(MSR_IA32_VMX_ENTRY_CTLS) |
VM_ENTRY_IA32E_MODE); /* 64-bit guest */
vmwrite(VM_ENTRY_MSR_LOAD_COUNT, 0);
vmwrite(VM_ENTRY_INTR_INFO_FIELD, 0);
vmwrite(TPR_THRESHOLD, 0);
vmwrite(SECONDARY_VM_EXEC_CONTROL, 0);
vmwrite(CR0_GUEST_HOST_MASK, 0);
vmwrite(CR4_GUEST_HOST_MASK, 0);
vmwrite(CR0_READ_SHADOW, get_cr0());
vmwrite(CR4_READ_SHADOW, get_cr4());
}
/*
* Initialize the host state fields based on the current host state, with
* the exception of HOST_RSP and HOST_RIP, which should be set by vmlaunch
* or vmresume.
*/
static inline void init_vmcs_host_state(void)
{
uint32_t exit_controls = vmreadz(VM_EXIT_CONTROLS);
vmwrite(HOST_ES_SELECTOR, get_es());
vmwrite(HOST_CS_SELECTOR, get_cs());
vmwrite(HOST_SS_SELECTOR, get_ss());
vmwrite(HOST_DS_SELECTOR, get_ds());
vmwrite(HOST_FS_SELECTOR, get_fs());
vmwrite(HOST_GS_SELECTOR, get_gs());
vmwrite(HOST_TR_SELECTOR, get_tr());
if (exit_controls & VM_EXIT_LOAD_IA32_PAT)
vmwrite(HOST_IA32_PAT, rdmsr(MSR_IA32_CR_PAT));
if (exit_controls & VM_EXIT_LOAD_IA32_EFER)
vmwrite(HOST_IA32_EFER, rdmsr(MSR_EFER));
if (exit_controls & VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL)
vmwrite(HOST_IA32_PERF_GLOBAL_CTRL,
rdmsr(MSR_CORE_PERF_GLOBAL_CTRL));
vmwrite(HOST_IA32_SYSENTER_CS, rdmsr(MSR_IA32_SYSENTER_CS));
vmwrite(HOST_CR0, get_cr0());
vmwrite(HOST_CR3, get_cr3());
vmwrite(HOST_CR4, get_cr4());
vmwrite(HOST_FS_BASE, rdmsr(MSR_FS_BASE));
vmwrite(HOST_GS_BASE, rdmsr(MSR_GS_BASE));
vmwrite(HOST_TR_BASE,
get_desc64_base((struct desc64 *)(get_gdt_base() + get_tr())));
vmwrite(HOST_GDTR_BASE, get_gdt_base());
vmwrite(HOST_IDTR_BASE, get_idt_base());
vmwrite(HOST_IA32_SYSENTER_ESP, rdmsr(MSR_IA32_SYSENTER_ESP));
vmwrite(HOST_IA32_SYSENTER_EIP, rdmsr(MSR_IA32_SYSENTER_EIP));
}
/*
* Initialize the guest state fields essentially as a clone of
* the host state fields. Some host state fields have fixed
* values, and we set the corresponding guest state fields accordingly.
*/
static inline void init_vmcs_guest_state(void *rip, void *rsp)
{
vmwrite(GUEST_ES_SELECTOR, vmreadz(HOST_ES_SELECTOR));
vmwrite(GUEST_CS_SELECTOR, vmreadz(HOST_CS_SELECTOR));
vmwrite(GUEST_SS_SELECTOR, vmreadz(HOST_SS_SELECTOR));
vmwrite(GUEST_DS_SELECTOR, vmreadz(HOST_DS_SELECTOR));
vmwrite(GUEST_FS_SELECTOR, vmreadz(HOST_FS_SELECTOR));
vmwrite(GUEST_GS_SELECTOR, vmreadz(HOST_GS_SELECTOR));
vmwrite(GUEST_LDTR_SELECTOR, 0);
vmwrite(GUEST_TR_SELECTOR, vmreadz(HOST_TR_SELECTOR));
vmwrite(GUEST_INTR_STATUS, 0);
vmwrite(GUEST_PML_INDEX, 0);
vmwrite(VMCS_LINK_POINTER, -1ll);
vmwrite(GUEST_IA32_DEBUGCTL, 0);
vmwrite(GUEST_IA32_PAT, vmreadz(HOST_IA32_PAT));
vmwrite(GUEST_IA32_EFER, vmreadz(HOST_IA32_EFER));
vmwrite(GUEST_IA32_PERF_GLOBAL_CTRL,
vmreadz(HOST_IA32_PERF_GLOBAL_CTRL));
vmwrite(GUEST_ES_LIMIT, -1);
vmwrite(GUEST_CS_LIMIT, -1);
vmwrite(GUEST_SS_LIMIT, -1);
vmwrite(GUEST_DS_LIMIT, -1);
vmwrite(GUEST_FS_LIMIT, -1);
vmwrite(GUEST_GS_LIMIT, -1);
vmwrite(GUEST_LDTR_LIMIT, -1);
vmwrite(GUEST_TR_LIMIT, 0x67);
vmwrite(GUEST_GDTR_LIMIT, 0xffff);
vmwrite(GUEST_IDTR_LIMIT, 0xffff);
vmwrite(GUEST_ES_AR_BYTES,
vmreadz(GUEST_ES_SELECTOR) == 0 ? 0x10000 : 0xc093);
vmwrite(GUEST_CS_AR_BYTES, 0xa09b);
vmwrite(GUEST_SS_AR_BYTES, 0xc093);
vmwrite(GUEST_DS_AR_BYTES,
vmreadz(GUEST_DS_SELECTOR) == 0 ? 0x10000 : 0xc093);
vmwrite(GUEST_FS_AR_BYTES,
vmreadz(GUEST_FS_SELECTOR) == 0 ? 0x10000 : 0xc093);
vmwrite(GUEST_GS_AR_BYTES,
vmreadz(GUEST_GS_SELECTOR) == 0 ? 0x10000 : 0xc093);
vmwrite(GUEST_LDTR_AR_BYTES, 0x10000);
vmwrite(GUEST_TR_AR_BYTES, 0x8b);
vmwrite(GUEST_INTERRUPTIBILITY_INFO, 0);
vmwrite(GUEST_ACTIVITY_STATE, 0);
vmwrite(GUEST_SYSENTER_CS, vmreadz(HOST_IA32_SYSENTER_CS));
vmwrite(VMX_PREEMPTION_TIMER_VALUE, 0);
vmwrite(GUEST_CR0, vmreadz(HOST_CR0));
vmwrite(GUEST_CR3, vmreadz(HOST_CR3));
vmwrite(GUEST_CR4, vmreadz(HOST_CR4));
vmwrite(GUEST_ES_BASE, 0);
vmwrite(GUEST_CS_BASE, 0);
vmwrite(GUEST_SS_BASE, 0);
vmwrite(GUEST_DS_BASE, 0);
vmwrite(GUEST_FS_BASE, vmreadz(HOST_FS_BASE));
vmwrite(GUEST_GS_BASE, vmreadz(HOST_GS_BASE));
vmwrite(GUEST_LDTR_BASE, 0);
vmwrite(GUEST_TR_BASE, vmreadz(HOST_TR_BASE));
vmwrite(GUEST_GDTR_BASE, vmreadz(HOST_GDTR_BASE));
vmwrite(GUEST_IDTR_BASE, vmreadz(HOST_IDTR_BASE));
vmwrite(GUEST_DR7, 0x400);
vmwrite(GUEST_RSP, (uint64_t)rsp);
vmwrite(GUEST_RIP, (uint64_t)rip);
vmwrite(GUEST_RFLAGS, 2);
vmwrite(GUEST_PENDING_DBG_EXCEPTIONS, 0);
vmwrite(GUEST_SYSENTER_ESP, vmreadz(HOST_IA32_SYSENTER_ESP));
vmwrite(GUEST_SYSENTER_EIP, vmreadz(HOST_IA32_SYSENTER_EIP));
}
void prepare_vmcs(void *guest_rip, void *guest_rsp)
{
init_vmcs_control_fields();
init_vmcs_host_state();
init_vmcs_guest_state(guest_rip, guest_rsp);
}