Commit 9db59599 authored by Linus Torvalds's avatar Linus Torvalds

Merge tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm

Pull more KVM updates from Paolo Bonzini:
 - PPC bugfixes
 - RCU splat fix
 - swait races fix
 - pointless userspace-triggerable BUG() fix
 - misc fixes for KVM_RUN corner cases
 - nested virt correctness fixes + one host DoS
 - some cleanups
 - clang build fix
 - fix AMD AVIC with default QEMU command line options
 - x86 bugfixes

* tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm: (28 commits)
  kvm: nVMX: Handle deferred early VMLAUNCH/VMRESUME failure properly
  kvm: vmx: Handle VMLAUNCH/VMRESUME failure properly
  kvm: nVMX: Remove nested_vmx_succeed after successful VM-entry
  kvm,mips: Fix potential swait_active() races
  kvm,powerpc: Serialize wq active checks in ops->vcpu_kick
  kvm: Serialize wq active checks in kvm_vcpu_wake_up()
  kvm,x86: Fix apf_task_wake_one() wq serialization
  kvm,lapic: Justify use of swait_active()
  kvm,async_pf: Use swq_has_sleeper()
  sched/wait: Add swq_has_sleeper()
  KVM: VMX: Do not BUG() on out-of-bounds guest IRQ
  KVM: Don't accept obviously wrong gsi values via KVM_IRQFD
  kvm: nVMX: Don't allow L2 to access the hardware CR8
  KVM: trace events: update list of exit reasons
  KVM: async_pf: Fix #DF due to inject "Page not Present" and "Page Ready" exceptions simultaneously
  KVM: X86: Don't block vCPU if there is pending exception
  KVM: SVM: Add irqchip_split() checks before enabling AVIC
  KVM: Add struct kvm_vcpu pointer parameter to get_enable_apicv()
  KVM: SVM: Refactor AVIC vcpu initialization into avic_init_vcpu()
  KVM: x86: fix clang build
  ...
parents b38923a0 4f350c6d
......@@ -514,7 +514,7 @@ int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu,
dvcpu->arch.wait = 0;
if (swait_active(&dvcpu->wq))
if (swq_has_sleeper(&dvcpu->wq))
swake_up(&dvcpu->wq);
return 0;
......@@ -1179,7 +1179,7 @@ static void kvm_mips_comparecount_func(unsigned long data)
kvm_mips_callbacks->queue_timer_int(vcpu);
vcpu->arch.wait = 0;
if (swait_active(&vcpu->wq))
if (swq_has_sleeper(&vcpu->wq))
swake_up(&vcpu->wq);
}
......
......@@ -181,7 +181,7 @@ static void kvmppc_fast_vcpu_kick_hv(struct kvm_vcpu *vcpu)
struct swait_queue_head *wqp;
wqp = kvm_arch_vcpu_wq(vcpu);
if (swait_active(wqp)) {
if (swq_has_sleeper(wqp)) {
swake_up(wqp);
++vcpu->stat.halt_wakeup;
}
......@@ -4212,11 +4212,13 @@ static int kvmhv_configure_mmu(struct kvm *kvm, struct kvm_ppc_mmuv3_cfg *cfg)
if ((cfg->process_table & PRTS_MASK) > 24)
return -EINVAL;
mutex_lock(&kvm->lock);
kvm->arch.process_table = cfg->process_table;
kvmppc_setup_partition_table(kvm);
lpcr = (cfg->flags & KVM_PPC_MMUV3_GTSE) ? LPCR_GTSE : 0;
kvmppc_update_lpcr(kvm, lpcr, LPCR_GTSE);
mutex_unlock(&kvm->lock);
return 0;
}
......
......@@ -38,7 +38,6 @@ static inline void __iomem *get_tima_phys(void)
#define __x_tima get_tima_phys()
#define __x_eoi_page(xd) ((void __iomem *)((xd)->eoi_page))
#define __x_trig_page(xd) ((void __iomem *)((xd)->trig_page))
#define __x_readb __raw_rm_readb
#define __x_writeb __raw_rm_writeb
#define __x_readw __raw_rm_readw
#define __x_readq __raw_rm_readq
......
......@@ -771,6 +771,9 @@ END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_207S)
#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
BEGIN_FTR_SECTION
/*
* NOTE THAT THIS TRASHES ALL NON-VOLATILE REGISTERS INCLUDING CR
*/
bl kvmppc_restore_tm
END_FTR_SECTION_IFSET(CPU_FTR_TM)
#endif
......@@ -1630,6 +1633,9 @@ END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_300)
#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
BEGIN_FTR_SECTION
/*
* NOTE THAT THIS TRASHES ALL NON-VOLATILE REGISTERS INCLUDING CR
*/
bl kvmppc_save_tm
END_FTR_SECTION_IFSET(CPU_FTR_TM)
#endif
......@@ -1749,7 +1755,10 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300)
/*
* Are we running hash or radix ?
*/
beq cr2,3f
ld r5, VCPU_KVM(r9)
lbz r0, KVM_RADIX(r5)
cmpwi cr2, r0, 0
beq cr2, 3f
/* Radix: Handle the case where the guest used an illegal PID */
LOAD_REG_ADDR(r4, mmu_base_pid)
......@@ -2466,6 +2475,9 @@ _GLOBAL(kvmppc_h_cede) /* r3 = vcpu pointer, r11 = msr, r13 = paca */
#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
BEGIN_FTR_SECTION
/*
* NOTE THAT THIS TRASHES ALL NON-VOLATILE REGISTERS INCLUDING CR
*/
ld r9, HSTATE_KVM_VCPU(r13)
bl kvmppc_save_tm
END_FTR_SECTION_IFSET(CPU_FTR_TM)
......@@ -2578,6 +2590,9 @@ kvm_end_cede:
#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
BEGIN_FTR_SECTION
/*
* NOTE THAT THIS TRASHES ALL NON-VOLATILE REGISTERS INCLUDING CR
*/
bl kvmppc_restore_tm
END_FTR_SECTION_IFSET(CPU_FTR_TM)
#endif
......
......@@ -48,7 +48,6 @@
#define __x_tima xive_tima
#define __x_eoi_page(xd) ((void __iomem *)((xd)->eoi_mmio))
#define __x_trig_page(xd) ((void __iomem *)((xd)->trig_mmio))
#define __x_readb __raw_readb
#define __x_writeb __raw_writeb
#define __x_readw __raw_readw
#define __x_readq __raw_readq
......
......@@ -28,7 +28,8 @@ static void GLUE(X_PFX,ack_pending)(struct kvmppc_xive_vcpu *xc)
* bit.
*/
if (cpu_has_feature(CPU_FTR_POWER9_DD1)) {
u8 pipr = __x_readb(__x_tima + TM_QW1_OS + TM_PIPR);
__be64 qw1 = __x_readq(__x_tima + TM_QW1_OS);
u8 pipr = be64_to_cpu(qw1) & 0xff;
if (pipr >= xc->hw_cppr)
return;
}
......@@ -336,7 +337,6 @@ X_STATIC unsigned long GLUE(X_PFX,h_ipoll)(struct kvm_vcpu *vcpu, unsigned long
struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu;
u8 pending = xc->pending;
u32 hirq;
u8 pipr;
pr_devel("H_IPOLL(server=%ld)\n", server);
......@@ -353,7 +353,8 @@ X_STATIC unsigned long GLUE(X_PFX,h_ipoll)(struct kvm_vcpu *vcpu, unsigned long
pending = 0xff;
} else {
/* Grab pending interrupt if any */
pipr = __x_readb(__x_tima + TM_QW1_OS + TM_PIPR);
__be64 qw1 = __x_readq(__x_tima + TM_QW1_OS);
u8 pipr = be64_to_cpu(qw1) & 0xff;
if (pipr < 8)
pending |= 1 << pipr;
}
......
......@@ -951,7 +951,6 @@ struct kvm_x86_ops {
void (*cache_reg)(struct kvm_vcpu *vcpu, enum kvm_reg reg);
unsigned long (*get_rflags)(struct kvm_vcpu *vcpu);
void (*set_rflags)(struct kvm_vcpu *vcpu, unsigned long rflags);
u32 (*get_pkru)(struct kvm_vcpu *vcpu);
void (*tlb_flush)(struct kvm_vcpu *vcpu);
......@@ -973,7 +972,7 @@ struct kvm_x86_ops {
void (*enable_nmi_window)(struct kvm_vcpu *vcpu);
void (*enable_irq_window)(struct kvm_vcpu *vcpu);
void (*update_cr8_intercept)(struct kvm_vcpu *vcpu, int tpr, int irr);
bool (*get_enable_apicv)(void);
bool (*get_enable_apicv)(struct kvm_vcpu *vcpu);
void (*refresh_apicv_exec_ctrl)(struct kvm_vcpu *vcpu);
void (*hwapic_irr_update)(struct kvm_vcpu *vcpu, int max_irr);
void (*hwapic_isr_update)(struct kvm_vcpu *vcpu, int isr);
......
......@@ -180,7 +180,7 @@ static void apf_task_wake_one(struct kvm_task_sleep_node *n)
hlist_del_init(&n->link);
if (n->halted)
smp_send_reschedule(n->cpu);
else if (swait_active(&n->wq))
else if (swq_has_sleeper(&n->wq))
swake_up(&n->wq);
}
......
......@@ -59,7 +59,6 @@ static __always_inline struct cpuid_reg x86_feature_cpuid(unsigned x86_feature)
{
unsigned x86_leaf = x86_feature / 32;
BUILD_BUG_ON(!__builtin_constant_p(x86_leaf));
BUILD_BUG_ON(x86_leaf >= ARRAY_SIZE(reverse_cpuid));
BUILD_BUG_ON(reverse_cpuid[x86_leaf].function == 0);
......
......@@ -1324,6 +1324,10 @@ static void apic_timer_expired(struct kvm_lapic *apic)
atomic_inc(&apic->lapic_timer.pending);
kvm_set_pending_timer(vcpu);
/*
* For x86, the atomic_inc() is serialized, thus
* using swait_active() is safe.
*/
if (swait_active(q))
swake_up(q);
......
......@@ -1200,7 +1200,6 @@ static void avic_init_vmcb(struct vcpu_svm *svm)
vmcb->control.avic_physical_id = ppa & AVIC_HPA_MASK;
vmcb->control.avic_physical_id |= AVIC_MAX_PHYSICAL_ID_COUNT;
vmcb->control.int_ctl |= AVIC_ENABLE_MASK;
svm->vcpu.arch.apicv_active = true;
}
static void init_vmcb(struct vcpu_svm *svm)
......@@ -1316,7 +1315,7 @@ static void init_vmcb(struct vcpu_svm *svm)
set_intercept(svm, INTERCEPT_PAUSE);
}
if (avic)
if (kvm_vcpu_apicv_active(&svm->vcpu))
avic_init_vmcb(svm);
/*
......@@ -1600,6 +1599,23 @@ static void svm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
avic_update_vapic_bar(svm, APIC_DEFAULT_PHYS_BASE);
}
static int avic_init_vcpu(struct vcpu_svm *svm)
{
int ret;
if (!kvm_vcpu_apicv_active(&svm->vcpu))
return 0;
ret = avic_init_backing_page(&svm->vcpu);
if (ret)
return ret;
INIT_LIST_HEAD(&svm->ir_list);
spin_lock_init(&svm->ir_list_lock);
return ret;
}
static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id)
{
struct vcpu_svm *svm;
......@@ -1636,14 +1652,9 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id)
if (!hsave_page)
goto free_page3;
if (avic) {
err = avic_init_backing_page(&svm->vcpu);
if (err)
goto free_page4;
INIT_LIST_HEAD(&svm->ir_list);
spin_lock_init(&svm->ir_list_lock);
}
err = avic_init_vcpu(svm);
if (err)
goto free_page4;
/* We initialize this flag to true to make sure that the is_running
* bit would be set the first time the vcpu is loaded.
......@@ -4395,9 +4406,9 @@ static void svm_set_virtual_x2apic_mode(struct kvm_vcpu *vcpu, bool set)
return;
}
static bool svm_get_enable_apicv(void)
static bool svm_get_enable_apicv(struct kvm_vcpu *vcpu)
{
return avic;
return avic && irqchip_split(vcpu->kvm);
}
static void svm_hwapic_irr_update(struct kvm_vcpu *vcpu, int max_irr)
......@@ -4414,7 +4425,7 @@ static void svm_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu)
struct vcpu_svm *svm = to_svm(vcpu);
struct vmcb *vmcb = svm->vmcb;
if (!avic)
if (!kvm_vcpu_apicv_active(&svm->vcpu))
return;
vmcb->control.int_ctl &= ~AVIC_ENABLE_MASK;
......@@ -5302,6 +5313,7 @@ static int svm_check_intercept(struct kvm_vcpu *vcpu,
*/
if (info->rep_prefix != REPE_PREFIX)
goto out;
break;
case SVM_EXIT_IOIO: {
u64 exit_info;
u32 bytes;
......
......@@ -5012,7 +5012,7 @@ static void vmx_disable_intercept_msr_x2apic(u32 msr, int type, bool apicv_activ
}
}
static bool vmx_get_enable_apicv(void)
static bool vmx_get_enable_apicv(struct kvm_vcpu *vcpu)
{
return enable_apicv;
}
......@@ -8344,12 +8344,14 @@ static bool nested_vmx_exit_reflected(struct kvm_vcpu *vcpu, u32 exit_reason)
struct vcpu_vmx *vmx = to_vmx(vcpu);
struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
trace_kvm_nested_vmexit(kvm_rip_read(vcpu), exit_reason,
vmcs_readl(EXIT_QUALIFICATION),
vmx->idt_vectoring_info,
intr_info,
vmcs_read32(VM_EXIT_INTR_ERROR_CODE),
KVM_ISA_VMX);
if (vmx->nested.nested_run_pending)
return false;
if (unlikely(vmx->fail)) {
pr_info_ratelimited("%s failed vm entry %x\n", __func__,
vmcs_read32(VM_INSTRUCTION_ERROR));
return true;
}
/*
* The host physical addresses of some pages of guest memory
......@@ -8363,14 +8365,12 @@ static bool nested_vmx_exit_reflected(struct kvm_vcpu *vcpu, u32 exit_reason)
*/
nested_mark_vmcs12_pages_dirty(vcpu);
if (vmx->nested.nested_run_pending)
return false;
if (unlikely(vmx->fail)) {
pr_info_ratelimited("%s failed vm entry %x\n", __func__,
vmcs_read32(VM_INSTRUCTION_ERROR));
return true;
}
trace_kvm_nested_vmexit(kvm_rip_read(vcpu), exit_reason,
vmcs_readl(EXIT_QUALIFICATION),
vmx->idt_vectoring_info,
intr_info,
vmcs_read32(VM_EXIT_INTR_ERROR_CODE),
KVM_ISA_VMX);
switch (exit_reason) {
case EXIT_REASON_EXCEPTION_NMI:
......@@ -9424,12 +9424,6 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
| (1 << VCPU_EXREG_CR3));
vcpu->arch.regs_dirty = 0;
vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD);
vmx->loaded_vmcs->launched = 1;
vmx->exit_reason = vmcs_read32(VM_EXIT_REASON);
/*
* eager fpu is enabled if PKEY is supported and CR4 is switched
* back on host, so it is safe to read guest PKRU from current
......@@ -9451,6 +9445,14 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
kvm_make_request(KVM_REQ_EVENT, vcpu);
vmx->nested.nested_run_pending = 0;
vmx->idt_vectoring_info = 0;
vmx->exit_reason = vmx->fail ? 0xdead : vmcs_read32(VM_EXIT_REASON);
if (vmx->fail || (vmx->exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY))
return;
vmx->loaded_vmcs->launched = 1;
vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD);
vmx_complete_atomic_exit(vmx);
vmx_recover_nmi_blocking(vmx);
......@@ -10525,6 +10527,11 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
if (exec_control & CPU_BASED_TPR_SHADOW) {
vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, -1ull);
vmcs_write32(TPR_THRESHOLD, vmcs12->tpr_threshold);
} else {
#ifdef CONFIG_X86_64
exec_control |= CPU_BASED_CR8_LOAD_EXITING |
CPU_BASED_CR8_STORE_EXITING;
#endif
}
/*
......@@ -11388,46 +11395,30 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason,
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
u32 vm_inst_error = 0;
/* trying to cancel vmlaunch/vmresume is a bug */
WARN_ON_ONCE(vmx->nested.nested_run_pending);
/*
* The only expected VM-instruction error is "VM entry with
* invalid control field(s)." Anything else indicates a
* problem with L0.
*/
WARN_ON_ONCE(vmx->fail && (vmcs_read32(VM_INSTRUCTION_ERROR) !=
VMXERR_ENTRY_INVALID_CONTROL_FIELD));
leave_guest_mode(vcpu);
prepare_vmcs12(vcpu, vmcs12, exit_reason, exit_intr_info,
exit_qualification);
if (nested_vmx_store_msr(vcpu, vmcs12->vm_exit_msr_store_addr,
vmcs12->vm_exit_msr_store_count))
nested_vmx_abort(vcpu, VMX_ABORT_SAVE_GUEST_MSR_FAIL);
if (likely(!vmx->fail)) {
prepare_vmcs12(vcpu, vmcs12, exit_reason, exit_intr_info,
exit_qualification);
if (unlikely(vmx->fail))
vm_inst_error = vmcs_read32(VM_INSTRUCTION_ERROR);
if (nested_vmx_store_msr(vcpu, vmcs12->vm_exit_msr_store_addr,
vmcs12->vm_exit_msr_store_count))
nested_vmx_abort(vcpu, VMX_ABORT_SAVE_GUEST_MSR_FAIL);
}
vmx_switch_vmcs(vcpu, &vmx->vmcs01);
/*
* TODO: SDM says that with acknowledge interrupt on exit, bit 31 of
* the VM-exit interrupt information (valid interrupt) is always set to
* 1 on EXIT_REASON_EXTERNAL_INTERRUPT, so we shouldn't need
* kvm_cpu_has_interrupt(). See the commit message for details.
*/
if (nested_exit_intr_ack_set(vcpu) &&
exit_reason == EXIT_REASON_EXTERNAL_INTERRUPT &&
kvm_cpu_has_interrupt(vcpu)) {
int irq = kvm_cpu_get_interrupt(vcpu);
WARN_ON(irq < 0);
vmcs12->vm_exit_intr_info = irq |
INTR_INFO_VALID_MASK | INTR_TYPE_EXT_INTR;
}
trace_kvm_nested_vmexit_inject(vmcs12->vm_exit_reason,
vmcs12->exit_qualification,
vmcs12->idt_vectoring_info_field,
vmcs12->vm_exit_intr_info,
vmcs12->vm_exit_intr_error_code,
KVM_ISA_VMX);
vm_entry_controls_reset_shadow(vmx);
vm_exit_controls_reset_shadow(vmx);
vmx_segment_cache_clear(vmx);
......@@ -11436,8 +11427,6 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason,
if (VMCS02_POOL_SIZE == 0)
nested_free_vmcs02(vmx, vmx->nested.current_vmptr);
load_vmcs12_host_state(vcpu, vmcs12);
/* Update any VMCS fields that might have changed while L2 ran */
vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.nr);
vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.nr);
......@@ -11486,21 +11475,57 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason,
*/
kvm_make_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu);
/*
* Exiting from L2 to L1, we're now back to L1 which thinks it just
* finished a VMLAUNCH or VMRESUME instruction, so we need to set the
* success or failure flag accordingly.
*/
if (unlikely(vmx->fail)) {
vmx->fail = 0;
nested_vmx_failValid(vcpu, vm_inst_error);
} else
nested_vmx_succeed(vcpu);
if (enable_shadow_vmcs)
vmx->nested.sync_shadow_vmcs = true;
/* in case we halted in L2 */
vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
if (likely(!vmx->fail)) {
/*
* TODO: SDM says that with acknowledge interrupt on
* exit, bit 31 of the VM-exit interrupt information
* (valid interrupt) is always set to 1 on
* EXIT_REASON_EXTERNAL_INTERRUPT, so we shouldn't
* need kvm_cpu_has_interrupt(). See the commit
* message for details.
*/
if (nested_exit_intr_ack_set(vcpu) &&
exit_reason == EXIT_REASON_EXTERNAL_INTERRUPT &&
kvm_cpu_has_interrupt(vcpu)) {
int irq = kvm_cpu_get_interrupt(vcpu);
WARN_ON(irq < 0);
vmcs12->vm_exit_intr_info = irq |
INTR_INFO_VALID_MASK | INTR_TYPE_EXT_INTR;
}
trace_kvm_nested_vmexit_inject(vmcs12->vm_exit_reason,
vmcs12->exit_qualification,
vmcs12->idt_vectoring_info_field,
vmcs12->vm_exit_intr_info,
vmcs12->vm_exit_intr_error_code,
KVM_ISA_VMX);
load_vmcs12_host_state(vcpu, vmcs12);
return;
}
/*
* After an early L2 VM-entry failure, we're now back
* in L1 which thinks it just finished a VMLAUNCH or
* VMRESUME instruction, so we need to set the failure
* flag and the VM-instruction error field of the VMCS
* accordingly.
*/
nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD);
/*
* The emulated instruction was already skipped in
* nested_vmx_run, but the updated RIP was never
* written back to the vmcs01.
*/
skip_emulated_instruction(vcpu);
vmx->fail = 0;
}
/*
......@@ -11829,7 +11854,7 @@ static int vmx_update_pi_irte(struct kvm *kvm, unsigned int host_irq,
struct kvm_lapic_irq irq;
struct kvm_vcpu *vcpu;
struct vcpu_data vcpu_info;
int idx, ret = -EINVAL;
int idx, ret = 0;
if (!kvm_arch_has_assigned_device(kvm) ||
!irq_remapping_cap(IRQ_POSTING_CAP) ||
......@@ -11838,7 +11863,12 @@ static int vmx_update_pi_irte(struct kvm *kvm, unsigned int host_irq,
idx = srcu_read_lock(&kvm->irq_srcu);
irq_rt = srcu_dereference(kvm->irq_routing, &kvm->irq_srcu);
BUG_ON(guest_irq >= irq_rt->nr_rt_entries);
if (guest_irq >= irq_rt->nr_rt_entries ||
hlist_empty(&irq_rt->map[guest_irq])) {
pr_warn_once("no route for guest_irq %u/%u (broken user space?)\n",
guest_irq, irq_rt->nr_rt_entries);
goto out;
}
hlist_for_each_entry(e, &irq_rt->map[guest_irq], link) {
if (e->type != KVM_IRQ_ROUTING_MSI)
......
......@@ -7231,10 +7231,19 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
sigprocmask(SIG_SETMASK, &vcpu->sigset, &sigsaved);
if (unlikely(vcpu->arch.mp_state == KVM_MP_STATE_UNINITIALIZED)) {
if (kvm_run->immediate_exit) {
r = -EINTR;
goto out;
}
kvm_vcpu_block(vcpu);
kvm_apic_accept_events(vcpu);
kvm_clear_request(KVM_REQ_UNHALT, vcpu);
r = -EAGAIN;
if (signal_pending(current)) {
r = -EINTR;
vcpu->run->exit_reason = KVM_EXIT_INTR;
++vcpu->stat.signal_exits;
}
goto out;
}
......@@ -7971,7 +7980,7 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
BUG_ON(vcpu->kvm == NULL);
kvm = vcpu->kvm;
vcpu->arch.apicv_active = kvm_x86_ops->get_enable_apicv();
vcpu->arch.apicv_active = kvm_x86_ops->get_enable_apicv(vcpu);
vcpu->arch.pv.pv_unhalted = false;
vcpu->arch.emulate_ctxt.ops = &emulate_ops;
if (!irqchip_in_kernel(kvm) || kvm_vcpu_is_reset_bsp(vcpu))
......@@ -8452,6 +8461,9 @@ static inline bool kvm_vcpu_has_events(struct kvm_vcpu *vcpu)
if (vcpu->arch.pv.pv_unhalted)
return true;
if (vcpu->arch.exception.pending)
return true;
if (kvm_test_request(KVM_REQ_NMI, vcpu) ||
(vcpu->arch.nmi_pending &&
kvm_x86_ops->nmi_allowed(vcpu)))
......@@ -8619,6 +8631,13 @@ static int apf_put_user(struct kvm_vcpu *vcpu, u32 val)
sizeof(val));
}
static int apf_get_user(struct kvm_vcpu *vcpu, u32 *val)
{
return kvm_read_guest_cached(vcpu->kvm, &vcpu->arch.apf.data, val,
sizeof(u32));
}
void kvm_arch_async_page_not_present(struct kvm_vcpu *vcpu,
struct kvm_async_pf *work)
{
......@@ -8646,6 +8665,7 @@ void kvm_arch_async_page_present(struct kvm_vcpu *vcpu,
struct kvm_async_pf *work)
{
struct x86_exception fault;
u32 val;
if (work->wakeup_all)
work->arch.token = ~0; /* broadcast wakeup */
......@@ -8653,15 +8673,26 @@ void kvm_arch_async_page_present(struct kvm_vcpu *vcpu,
kvm_del_async_pf_gfn(vcpu, work->arch.gfn);
trace_kvm_async_pf_ready(work->arch.token, work->gva);
if ((vcpu->arch.apf.msr_val & KVM_ASYNC_PF_ENABLED) &&
!apf_put_user(vcpu, KVM_PV_REASON_PAGE_READY)) {
fault.vector = PF_VECTOR;
fault.error_code_valid = true;
fault.error_code = 0;
fault.nested_page_fault = false;
fault.address = work->arch.token;
fault.async_page_fault = true;
kvm_inject_page_fault(vcpu, &fault);
if (vcpu->arch.apf.msr_val & KVM_ASYNC_PF_ENABLED &&
!apf_get_user(vcpu, &val)) {
if (val == KVM_PV_REASON_PAGE_NOT_PRESENT &&
vcpu->arch.exception.pending &&
vcpu->arch.exception.nr == PF_VECTOR &&
!apf_put_user(vcpu, 0)) {
vcpu->arch.exception.injected = false;
vcpu->arch.exception.pending = false;
vcpu->arch.exception.nr = 0;
vcpu->arch.exception.has_error_code = false;
vcpu->arch.exception.error_code = 0;
} else if (!apf_put_user(vcpu, KVM_PV_REASON_PAGE_READY)) {
fault.vector = PF_VECTOR;
fault.error_code_valid = true;
fault.error_code = 0;
fault.nested_page_fault = false;
fault.address = work->arch.token;
fault.async_page_fault = true;
kvm_inject_page_fault(vcpu, &fault);
}
}
vcpu->arch.apf.halted = false;
vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
......
......@@ -79,9 +79,63 @@ extern void __init_swait_queue_head(struct swait_queue_head *q, const char *name
DECLARE_SWAIT_QUEUE_HEAD(name)
#endif
static inline int swait_active(struct swait_queue_head *q)
/**
* swait_active -- locklessly test for waiters on the queue
* @wq: the waitqueue to test for waiters
*
* returns true if the wait list is not empty
*
* NOTE: this function is lockless and requires care, incorrect usage _will_
* lead to sporadic and non-obvious failure.
*
* NOTE2: this function has the same above implications as regular waitqueues.
*
* Use either while holding swait_queue_head::lock or when used for wakeups
* with an extra smp_mb() like:
*
* CPU0 - waker CPU1 - waiter
*
* for (;;) {
* @cond = true; prepare_to_swait(&wq_head, &wait, state);
* smp_mb(); // smp_mb() from set_current_state()
* if (swait_active(wq_head)) if (@cond)
* wake_up(wq_head); break;
* schedule();
* }
* finish_swait(&wq_head, &wait);
*
* Because without the explicit smp_mb() it's possible for the
* swait_active() load to get hoisted over the @cond store such that we'll
* observe an empty wait list while the waiter might not observe @cond.
* This, in turn, can trigger missing wakeups.
*
* Also note that this 'optimization' trades a spin_lock() for an smp_mb(),
* which (when the lock is uncontended) are of roughly equal cost.
*/
static inline int swait_active(struct swait_queue_head *wq)
{
return !list_empty(&wq->task_list);
}
/**
* swq_has_sleeper - check if there are any waiting processes
* @wq: the waitqueue to test for waiters
*
* Returns true if @wq has waiting processes
*
* Please refer to the comment for swait_active.
*/
static inline bool swq_has_sleeper(struct swait_queue_head *wq)
{
return !list_empty(&q->task_list);
/*
* We need to be sure we are in sync with the list_add()
* modifications to the wait queue (task_list).
*
* This memory barrier should be paired with one on the
* waiting side.
*/
smp_mb();
return swait_active(wq);
}
extern void swake_up(struct swait_queue_head *q);
......
......@@ -14,7 +14,9 @@
ERSN(SHUTDOWN), ERSN(FAIL_ENTRY), ERSN(INTR), ERSN(SET_TPR), \
ERSN(TPR_ACCESS), ERSN(S390_SIEIC), ERSN(S390_RESET), ERSN(DCR),\
ERSN(NMI), ERSN(INTERNAL_ERROR), ERSN(OSI), ERSN(PAPR_HCALL), \
ERSN(S390_UCONTROL), ERSN(WATCHDOG), ERSN(S390_TSCH)
ERSN(S390_UCONTROL), ERSN(WATCHDOG), ERSN(S390_TSCH), ERSN(EPR),\
ERSN(SYSTEM_EVENT), ERSN(S390_STSI), ERSN(IOAPIC_EOI), \
ERSN(HYPERV)
TRACE_EVENT(kvm_userspace_exit,
TP_PROTO(__u32 reason, int errno),
......
......@@ -106,11 +106,7 @@ static void async_pf_execute(struct work_struct *work)
trace_kvm_async_pf_completed(addr, gva);
/*
* This memory barrier pairs with prepare_to_wait's set_current_state()
*/
smp_mb();
if (swait_active(&vcpu->wq))
if (swq_has_sleeper(&vcpu->wq))
swake_up(&vcpu->wq);
mmput(mm);
......
......@@ -565,6 +565,8 @@ kvm_irqfd(struct kvm *kvm, struct kvm_irqfd *args)
{
if (args->flags & ~(KVM_IRQFD_FLAG_DEASSIGN | KVM_IRQFD_FLAG_RESAMPLE))
return -EINVAL;
if (args->gsi >= KVM_MAX_IRQ_ROUTES)
return -EINVAL;
if (args->flags & KVM_IRQFD_FLAG_DEASSIGN)
return kvm_irqfd_deassign(kvm, args);
......
<
......@@ -674,6 +674,7 @@ static struct kvm *kvm_create_vm(unsigned long type)
out_err_no_srcu:
hardware_disable_all();
out_err_no_disable:
refcount_set(&kvm->users_count, 0);
for (i = 0; i < KVM_NR_BUSES; i++)
kfree(kvm_get_bus(kvm, i));
for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++)
......@@ -2186,7 +2187,7 @@ bool kvm_vcpu_wake_up(struct kvm_vcpu *vcpu)
struct swait_queue_head *wqp;
wqp = kvm_arch_vcpu_wq(vcpu);
if (swait_active(wqp)) {
if (swq_has_sleeper(wqp)) {
swake_up(wqp);
++vcpu->stat.halt_wakeup;
return true;
......