KVM x86 and selftests fixes for 6.12:

x86:
 
 - When emulating a guest TLB flush for a nested guest, flush vpid01, not
   vpid02, if L2 is active but VPID is disabled in vmcs12, i.e. if L2 and
   L1 are sharing VPID '0' (from L1's perspective).
 
 - Fix a bug in the SNP initialization flow where KVM would return '0' to
   userspace instead of -errno on failure.
 
 - Move the Intel PT virtualization (i.e. outputting host trace to host
   buffer and guest trace to guest buffer) behind CONFIG_BROKEN.
 
 - Fix memory leak on failure of KVM_SEV_SNP_LAUNCH_START
 
 - Fix a bug where KVM fails to inject an interrupt from the IRR after
   KVM_SET_LAPIC.
 
 Selftests:
 
 - Increase the timeout for the memslot performance selftest to avoid false
   failures on arm64 and nested x86 platforms.
 
 - Fix a goof in the guest_memfd selftest where a for-loop initialized a
   bit mask to zero instead of BIT(0).
 
 - Disable strict aliasing when building KVM selftests to prevent the
   compiler from treating things like "u64 *" to "uint64_t *" cases as
   undefined behavior, which can lead to nasty, hard to debug failures.
 
 - Force -march=x86-64-v2 for KVM x86 selftests if and only if the uarch
   is supported by the compiler.
 
 - Fix broken compilation of kvm selftests after a header sync in tools/
 -----BEGIN PGP SIGNATURE-----
 
 iQFIBAABCAAyFiEE8TM4V0tmI4mGbHaCv/vSX3jHroMFAmczm1QUHHBib256aW5p
 QHJlZGhhdC5jb20ACgkQv/vSX3jHroOLKwf+IjkJHZ/LS95HuP/0QLM17Sc4MmiZ
 Pk5gLd5un7BBSLA98RvALR/YPnsA7emEJ34bE/8lQ6R5VSZ5PrIzF+29f60HzRFe
 EDi1/24dqnzdWn50na5nk7A2QhFpfnLQQTl7vMqPFsrU7gfLuHQI6ABp9kloEwP/
 xnjAT683IWNX9v0N2A8kNemy9NNMGssJk1ssDTGzNflSyRNL8cLPGlPkZqAIMsM6
 fHjkDRg0UxasUDkL5CjwnTSdBGoz+/Myyz4unFlYGJB9D3+ev2qDlMqATO4Jfik/
 peJMZ65i8/8/7MgKCTn8qQuT0FLLEvxTuzDHUSGzjMZl0DGaZi2BPETNqg==
 =nW8/
 -----END PGP SIGNATURE-----

Merge tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm

Pull kvm fixes from Paolo Bonzini:
 "x86 and selftests fixes.

  x86:

   - When emulating a guest TLB flush for a nested guest, flush vpid01,
     not vpid02, if L2 is active but VPID is disabled in vmcs12, i.e. if
     L2 and L1 are sharing VPID '0' (from L1's perspective).

   - Fix a bug in the SNP initialization flow where KVM would return '0'
     to userspace instead of -errno on failure.

   - Move the Intel PT virtualization (i.e. outputting host trace to
     host buffer and guest trace to guest buffer) behind CONFIG_BROKEN.

   - Fix memory leak on failure of KVM_SEV_SNP_LAUNCH_START

   - Fix a bug where KVM fails to inject an interrupt from the IRR after
     KVM_SET_LAPIC.

  Selftests:

   - Increase the timeout for the memslot performance selftest to avoid
     false failures on arm64 and nested x86 platforms.

   - Fix a goof in the guest_memfd selftest where a for-loop initialized
     a bit mask to zero instead of BIT(0).

   - Disable strict aliasing when building KVM selftests to prevent the
     compiler from treating things like "u64 *" to "uint64_t *" cases as
     undefined behavior, which can lead to nasty, hard to debug
     failures.

   - Force -march=x86-64-v2 for KVM x86 selftests if and only if the
     uarch is supported by the compiler.

   - Fix broken compilation of kvm selftests after a header sync in
     tools/"

* tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm:
  KVM: VMX: Bury Intel PT virtualization (guest/host mode) behind CONFIG_BROKEN
  KVM: x86: Unconditionally set irr_pending when updating APICv state
  kvm: svm: Fix gctx page leak on invalid inputs
  KVM: selftests: use X86_MEMTYPE_WB instead of VMX_BASIC_MEM_TYPE_WB
  KVM: SVM: Propagate error from snp_guest_req_init() to userspace
  KVM: nVMX: Treat vpid01 as current if L2 is active, but with VPID disabled
  KVM: selftests: Don't force -march=x86-64-v2 if it's unsupported
  KVM: selftests: Disable strict aliasing
  KVM: selftests: fix unintentional noop test in guest_memfd_test.c
  KVM: selftests: memslot_perf_test: increase guest sync timeout
This commit is contained in:
Linus Torvalds 2024-11-12 13:35:13 -08:00
commit 14b6320953
8 changed files with 65 additions and 31 deletions

View File

@ -2629,19 +2629,26 @@ void kvm_apic_update_apicv(struct kvm_vcpu *vcpu)
{
struct kvm_lapic *apic = vcpu->arch.apic;
if (apic->apicv_active) {
/* irr_pending is always true when apicv is activated. */
apic->irr_pending = true;
/*
* When APICv is enabled, KVM must always search the IRR for a pending
* IRQ, as other vCPUs and devices can set IRR bits even if the vCPU
* isn't running. If APICv is disabled, KVM _should_ search the IRR
* for a pending IRQ. But KVM currently doesn't ensure *all* hardware,
* e.g. CPUs and IOMMUs, has seen the change in state, i.e. searching
* the IRR at this time could race with IRQ delivery from hardware that
* still sees APICv as being enabled.
*
* FIXME: Ensure other vCPUs and devices observe the change in APICv
* state prior to updating KVM's metadata caches, so that KVM
* can safely search the IRR and set irr_pending accordingly.
*/
apic->irr_pending = true;
if (apic->apicv_active)
apic->isr_count = 1;
} else {
/*
* Don't clear irr_pending, searching the IRR can race with
* updates from the CPU as APICv is still active from hardware's
* perspective. The flag will be cleared as appropriate when
* KVM injects the interrupt.
*/
else
apic->isr_count = count_vectors(apic->regs + APIC_ISR);
}
apic->highest_isr_cache = -1;
}

View File

@ -450,8 +450,11 @@ static int __sev_guest_init(struct kvm *kvm, struct kvm_sev_cmd *argp,
goto e_free;
/* This needs to happen after SEV/SNP firmware initialization. */
if (vm_type == KVM_X86_SNP_VM && snp_guest_req_init(kvm))
goto e_free;
if (vm_type == KVM_X86_SNP_VM) {
ret = snp_guest_req_init(kvm);
if (ret)
goto e_free;
}
INIT_LIST_HEAD(&sev->regions_list);
INIT_LIST_HEAD(&sev->mirror_vms);
@ -2212,10 +2215,6 @@ static int snp_launch_start(struct kvm *kvm, struct kvm_sev_cmd *argp)
if (sev->snp_context)
return -EINVAL;
sev->snp_context = snp_context_create(kvm, argp);
if (!sev->snp_context)
return -ENOTTY;
if (params.flags)
return -EINVAL;
@ -2230,6 +2229,10 @@ static int snp_launch_start(struct kvm *kvm, struct kvm_sev_cmd *argp)
if (params.policy & SNP_POLICY_MASK_SINGLE_SOCKET)
return -EINVAL;
sev->snp_context = snp_context_create(kvm, argp);
if (!sev->snp_context)
return -ENOTTY;
start.gctx_paddr = __psp_pa(sev->snp_context);
start.policy = params.policy;
memcpy(start.gosvw, params.gosvw, sizeof(params.gosvw));

View File

@ -1197,11 +1197,14 @@ static void nested_vmx_transition_tlb_flush(struct kvm_vcpu *vcpu,
kvm_hv_nested_transtion_tlb_flush(vcpu, enable_ept);
/*
* If vmcs12 doesn't use VPID, L1 expects linear and combined mappings
* for *all* contexts to be flushed on VM-Enter/VM-Exit, i.e. it's a
* full TLB flush from the guest's perspective. This is required even
* if VPID is disabled in the host as KVM may need to synchronize the
* MMU in response to the guest TLB flush.
* If VPID is disabled, then guest TLB accesses use VPID=0, i.e. the
* same VPID as the host, and so architecturally, linear and combined
* mappings for VPID=0 must be flushed at VM-Enter and VM-Exit. KVM
* emulates L2 sharing L1's VPID=0 by using vpid01 while running L2,
* and so KVM must also emulate TLB flush of VPID=0, i.e. vpid01. This
* is required if VPID is disabled in KVM, as a TLB flush (there are no
* VPIDs) still occurs from L1's perspective, and KVM may need to
* synchronize the MMU in response to the guest TLB flush.
*
* Note, using TLB_FLUSH_GUEST is correct even if nested EPT is in use.
* EPT is a special snowflake, as guest-physical mappings aren't
@ -2315,6 +2318,17 @@ static void prepare_vmcs02_early_rare(struct vcpu_vmx *vmx,
vmcs_write64(VMCS_LINK_POINTER, INVALID_GPA);
/*
* If VPID is disabled, then guest TLB accesses use VPID=0, i.e. the
* same VPID as the host. Emulate this behavior by using vpid01 for L2
* if VPID is disabled in vmcs12. Note, if VPID is disabled, VM-Enter
* and VM-Exit are architecturally required to flush VPID=0, but *only*
* VPID=0. I.e. using vpid02 would be ok (so long as KVM emulates the
* required flushes), but doing so would cause KVM to over-flush. E.g.
* if L1 runs L2 X with VPID12=1, then runs L2 Y with VPID12 disabled,
* and then runs L2 X again, then KVM can and should retain TLB entries
* for VPID12=1.
*/
if (enable_vpid) {
if (nested_cpu_has_vpid(vmcs12) && vmx->nested.vpid02)
vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->nested.vpid02);
@ -5950,6 +5964,12 @@ static int handle_invvpid(struct kvm_vcpu *vcpu)
return nested_vmx_fail(vcpu,
VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);
/*
* Always flush the effective vpid02, i.e. never flush the current VPID
* and never explicitly flush vpid01. INVVPID targets a VPID, not a
* VMCS, and so whether or not the current vmcs12 has VPID enabled is
* irrelevant (and there may not be a loaded vmcs12).
*/
vpid02 = nested_get_vpid02(vcpu);
switch (type) {
case VMX_VPID_EXTENT_INDIVIDUAL_ADDR:

View File

@ -217,9 +217,11 @@ module_param(ple_window_shrink, uint, 0444);
static unsigned int ple_window_max = KVM_VMX_DEFAULT_PLE_WINDOW_MAX;
module_param(ple_window_max, uint, 0444);
/* Default is SYSTEM mode, 1 for host-guest mode */
/* Default is SYSTEM mode, 1 for host-guest mode (which is BROKEN) */
int __read_mostly pt_mode = PT_MODE_SYSTEM;
#ifdef CONFIG_BROKEN
module_param(pt_mode, int, S_IRUGO);
#endif
struct x86_pmu_lbr __ro_after_init vmx_lbr_caps;
@ -3216,7 +3218,7 @@ void vmx_flush_tlb_all(struct kvm_vcpu *vcpu)
static inline int vmx_get_current_vpid(struct kvm_vcpu *vcpu)
{
if (is_guest_mode(vcpu))
if (is_guest_mode(vcpu) && nested_cpu_has_vpid(get_vmcs12(vcpu)))
return nested_get_vpid02(vcpu);
return to_vmx(vcpu)->vpid;
}

View File

@ -241,16 +241,18 @@ CFLAGS += -Wall -Wstrict-prototypes -Wuninitialized -O2 -g -std=gnu99 \
-Wno-gnu-variable-sized-type-not-at-end -MD -MP -DCONFIG_64BIT \
-fno-builtin-memcmp -fno-builtin-memcpy \
-fno-builtin-memset -fno-builtin-strnlen \
-fno-stack-protector -fno-PIE -I$(LINUX_TOOL_INCLUDE) \
-I$(LINUX_TOOL_ARCH_INCLUDE) -I$(LINUX_HDR_PATH) -Iinclude \
-I$(<D) -Iinclude/$(ARCH_DIR) -I ../rseq -I.. $(EXTRA_CFLAGS) \
$(KHDR_INCLUDES)
-fno-stack-protector -fno-PIE -fno-strict-aliasing \
-I$(LINUX_TOOL_INCLUDE) -I$(LINUX_TOOL_ARCH_INCLUDE) \
-I$(LINUX_HDR_PATH) -Iinclude -I$(<D) -Iinclude/$(ARCH_DIR) \
-I ../rseq -I.. $(EXTRA_CFLAGS) $(KHDR_INCLUDES)
ifeq ($(ARCH),s390)
CFLAGS += -march=z10
endif
ifeq ($(ARCH),x86)
ifeq ($(shell echo "void foo(void) { }" | $(CC) -march=x86-64-v2 -x c - -c -o /dev/null 2>/dev/null; echo "$$?"),0)
CFLAGS += -march=x86-64-v2
endif
endif
ifeq ($(ARCH),arm64)
tools_dir := $(top_srcdir)/tools
arm64_tools_dir := $(tools_dir)/arch/arm64/tools/

View File

@ -134,7 +134,7 @@ static void test_create_guest_memfd_invalid(struct kvm_vm *vm)
size);
}
for (flag = 0; flag; flag <<= 1) {
for (flag = BIT(0); flag; flag <<= 1) {
fd = __vm_create_guest_memfd(vm, page_size, flag);
TEST_ASSERT(fd == -1 && errno == EINVAL,
"guest_memfd() with flag '0x%lx' should fail with EINVAL",

View File

@ -200,7 +200,7 @@ static inline void init_vmcs_control_fields(struct vmx_pages *vmx)
if (vmx->eptp_gpa) {
uint64_t ept_paddr;
struct eptPageTablePointer eptp = {
.memory_type = VMX_BASIC_MEM_TYPE_WB,
.memory_type = X86_MEMTYPE_WB,
.page_walk_length = 3, /* + 1 */
.ad_enabled = ept_vpid_cap_supported(VMX_EPT_VPID_CAP_AD_BITS),
.address = vmx->eptp_gpa >> PAGE_SHIFT_4K,

View File

@ -417,7 +417,7 @@ static bool _guest_should_exit(void)
*/
static noinline void host_perform_sync(struct sync_area *sync)
{
alarm(2);
alarm(10);
atomic_store_explicit(&sync->sync_flag, true, memory_order_release);
while (atomic_load_explicit(&sync->sync_flag, memory_order_acquire))