From 9fc3402a20771ebd3abedec9d354b89b10ccfd98 Mon Sep 17 00:00:00 2001 From: Isaku Yamahata Date: Thu, 27 Feb 2025 09:20:11 +0800 Subject: [PATCH 01/16] KVM: TDX: Enable guest access to LMCE related MSRs Allow TDX guest to configure LMCE (Local Machine Check Event) by handling MSR IA32_FEAT_CTL and IA32_MCG_EXT_CTL. MCE and MCA are advertised via cpuid based on the TDX module spec. Guest kernel can access IA32_FEAT_CTL to check whether LMCE is opted-in by the platform or not. If LMCE is opted-in by the platform, guest kernel can access IA32_MCG_EXT_CTL to enable/disable LMCE. Handle MSR IA32_FEAT_CTL and IA32_MCG_EXT_CTL for TDX guests to avoid failure when a guest accesses them with TDG.VP.VMCALL on #VE. E.g., Linux guest will treat the failure as a #GP(0). Userspace VMM may not opt-in LMCE by default, e.g., QEMU disables it by default, "-cpu lmce=on" is needed in QEMU command line to opt-in it. Signed-off-by: Isaku Yamahata [binbin: rework changelog] Signed-off-by: Binbin Wu Message-ID: <20250227012021.1778144-11-binbin.wu@linux.intel.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/tdx.c | 46 +++++++++++++++++++++++++++++++++--------- 1 file changed, 37 insertions(+), 9 deletions(-) diff --git a/arch/x86/kvm/vmx/tdx.c b/arch/x86/kvm/vmx/tdx.c index 6ce20082a519..1aecb4050082 100644 --- a/arch/x86/kvm/vmx/tdx.c +++ b/arch/x86/kvm/vmx/tdx.c @@ -2065,6 +2065,7 @@ bool tdx_has_emulated_msr(u32 index) case MSR_MISC_FEATURES_ENABLES: case MSR_IA32_APICBASE: case MSR_EFER: + case MSR_IA32_FEAT_CTL: case MSR_IA32_MCG_CAP: case MSR_IA32_MCG_STATUS: case MSR_IA32_MCG_CTL: @@ -2097,26 +2098,53 @@ bool tdx_has_emulated_msr(u32 index) static bool tdx_is_read_only_msr(u32 index) { - return index == MSR_IA32_APICBASE || index == MSR_EFER; + return index == MSR_IA32_APICBASE || index == MSR_EFER || + index == MSR_IA32_FEAT_CTL; } int tdx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr) { - if (!tdx_has_emulated_msr(msr->index)) - return 1; + switch (msr->index) { + case MSR_IA32_FEAT_CTL: + /* + * MCE and MCA are advertised via cpuid. Guest kernel could + * check if LMCE is enabled or not. + */ + msr->data = FEAT_CTL_LOCKED; + if (vcpu->arch.mcg_cap & MCG_LMCE_P) + msr->data |= FEAT_CTL_LMCE_ENABLED; + return 0; + case MSR_IA32_MCG_EXT_CTL: + if (!msr->host_initiated && !(vcpu->arch.mcg_cap & MCG_LMCE_P)) + return 1; + msr->data = vcpu->arch.mcg_ext_ctl; + return 0; + default: + if (!tdx_has_emulated_msr(msr->index)) + return 1; - return kvm_get_msr_common(vcpu, msr); + return kvm_get_msr_common(vcpu, msr); + } } int tdx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr) { - if (tdx_is_read_only_msr(msr->index)) - return 1; + switch (msr->index) { + case MSR_IA32_MCG_EXT_CTL: + if ((!msr->host_initiated && !(vcpu->arch.mcg_cap & MCG_LMCE_P)) || + (msr->data & ~MCG_EXT_CTL_LMCE_EN)) + return 1; + vcpu->arch.mcg_ext_ctl = msr->data; + return 0; + default: + if (tdx_is_read_only_msr(msr->index)) + return 1; - if (!tdx_has_emulated_msr(msr->index)) - return 1; + if (!tdx_has_emulated_msr(msr->index)) + return 1; - return kvm_set_msr_common(vcpu, msr); + return kvm_set_msr_common(vcpu, msr); + } } static int tdx_get_capabilities(struct kvm_tdx_cmd *cmd) -- 2.51.0 From 04733836fe7d55b23c69234e2c90ab6b6a2794e0 Mon Sep 17 00:00:00 2001 From: Isaku Yamahata Date: Thu, 27 Feb 2025 09:20:12 +0800 Subject: [PATCH 02/16] KVM: TDX: Handle TDG.VP.VMCALL hypercall Implement TDG.VP.VMCALL hypercall. If the input value is zero, return success code and zero in output registers. 
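For illustration, a Linux guest could issue this hypercall leaf roughly as
follows (a sketch based on the generic guest TDVMCALL helper declared in
asm/shared/tdx.h; it is not part of this patch, and the exact invocation is
an assumption):

  struct tdx_module_args args = {
          .r10 = TDX_HYPERCALL_STANDARD,
          .r11 = TDVMCALL_GET_TD_VM_CALL_INFO,
          .r12 = 0,
  };

  /* R10 comes back as 0 on success; R11-R14 report the enumeration. */
  if (!__tdx_hypercall(&args))
          pr_info("GetTdVmCallInfo: r11-r14 = %llx %llx %llx %llx\n",
                  args.r11, args.r12, args.r13, args.r14);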
GetTdVmCallInfo (leaf TDVMCALL_GET_TD_VM_CALL_INFO) is a subleaf of
TDG.VP.VMCALL used to enumerate which TDG.VP.VMCALL sub leaves are
supported. This hypercall exists for future enhancement of the
Guest-Host-Communication Interface (GHCI) specification. GHCI
specification version 344426-001US defines it to require input R12 to be
zero and to return zero in the output registers R11, R12, R13 and R14,
so that the guest TD enumerates no enhancements.

Signed-off-by: Isaku Yamahata
Signed-off-by: Binbin Wu
Message-ID: <20250227012021.1778144-12-binbin.wu@linux.intel.com>
Signed-off-by: Paolo Bonzini
---
 arch/x86/include/asm/shared/tdx.h |  1 +
 arch/x86/kvm/vmx/tdx.c            | 16 ++++++++++++++++
 2 files changed, 17 insertions(+)

diff --git a/arch/x86/include/asm/shared/tdx.h b/arch/x86/include/asm/shared/tdx.h
index f23657350d28..606d93a1cbac 100644
--- a/arch/x86/include/asm/shared/tdx.h
+++ b/arch/x86/include/asm/shared/tdx.h
@@ -67,6 +67,7 @@
 #define TD_CTLS_LOCK		BIT_ULL(TD_CTLS_LOCK_BIT)
 
 /* TDX hypercall Leaf IDs */
+#define TDVMCALL_GET_TD_VM_CALL_INFO	0x10000
 #define TDVMCALL_MAP_GPA		0x10001
 #define TDVMCALL_GET_QUOTE		0x10002
 #define TDVMCALL_REPORT_FATAL_ERROR	0x10003
diff --git a/arch/x86/kvm/vmx/tdx.c b/arch/x86/kvm/vmx/tdx.c
index 1aecb4050082..24881e2fafc2 100644
--- a/arch/x86/kvm/vmx/tdx.c
+++ b/arch/x86/kvm/vmx/tdx.c
@@ -1448,6 +1448,20 @@ error:
 	return 1;
 }
 
+static int tdx_get_td_vm_call_info(struct kvm_vcpu *vcpu)
+{
+	struct vcpu_tdx *tdx = to_tdx(vcpu);
+
+	if (tdx->vp_enter_args.r12)
+		tdvmcall_set_return_code(vcpu, TDVMCALL_STATUS_INVALID_OPERAND);
+	else {
+		tdx->vp_enter_args.r11 = 0;
+		tdx->vp_enter_args.r13 = 0;
+		tdx->vp_enter_args.r14 = 0;
+	}
+	return 1;
+}
+
 static int handle_tdvmcall(struct kvm_vcpu *vcpu)
 {
 	switch (tdvmcall_leaf(vcpu)) {
@@ -1455,6 +1469,8 @@ static int handle_tdvmcall(struct kvm_vcpu *vcpu)
 		return tdx_map_gpa(vcpu);
 	case TDVMCALL_REPORT_FATAL_ERROR:
 		return tdx_report_fatal_error(vcpu);
+	case TDVMCALL_GET_TD_VM_CALL_INFO:
+		return tdx_get_td_vm_call_info(vcpu);
 	default:
 		break;
 	}
-- 
2.51.0

From a141f28d6b02af2e9c3aa6b7a3138b15ea7cef98 Mon Sep 17 00:00:00 2001
From: Isaku Yamahata
Date: Thu, 27 Feb 2025 09:20:13 +0800
Subject: [PATCH 03/16] KVM: TDX: Add methods to ignore accesses to CPU state

TDX protects TDX guest state from the VMM. Implement access methods for
TDX guest state that ignore writes and return zero on reads. Because
those methods can be called by KVM ioctls to set/get CPU registers, they
don't have KVM_BUG_ON.
Signed-off-by: Isaku Yamahata Co-developed-by: Adrian Hunter Signed-off-by: Adrian Hunter Co-developed-by: Binbin Wu Signed-off-by: Binbin Wu Message-ID: <20250227012021.1778144-13-binbin.wu@linux.intel.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/main.c | 307 ++++++++++++++++++++++++++++++++++++---- 1 file changed, 278 insertions(+), 29 deletions(-) diff --git a/arch/x86/kvm/vmx/main.c b/arch/x86/kvm/vmx/main.c index 713426198930..39638afaa918 100644 --- a/arch/x86/kvm/vmx/main.c +++ b/arch/x86/kvm/vmx/main.c @@ -335,6 +335,214 @@ static void vt_deliver_interrupt(struct kvm_lapic *apic, int delivery_mode, vmx_deliver_interrupt(apic, delivery_mode, trig_mode, vector); } +static void vt_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu) +{ + if (is_td_vcpu(vcpu)) + return; + + vmx_vcpu_after_set_cpuid(vcpu); +} + +static void vt_update_exception_bitmap(struct kvm_vcpu *vcpu) +{ + if (is_td_vcpu(vcpu)) + return; + + vmx_update_exception_bitmap(vcpu); +} + +static u64 vt_get_segment_base(struct kvm_vcpu *vcpu, int seg) +{ + if (is_td_vcpu(vcpu)) + return 0; + + return vmx_get_segment_base(vcpu, seg); +} + +static void vt_get_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, + int seg) +{ + if (is_td_vcpu(vcpu)) { + memset(var, 0, sizeof(*var)); + return; + } + + vmx_get_segment(vcpu, var, seg); +} + +static void vt_set_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, + int seg) +{ + if (is_td_vcpu(vcpu)) + return; + + vmx_set_segment(vcpu, var, seg); +} + +static int vt_get_cpl(struct kvm_vcpu *vcpu) +{ + if (is_td_vcpu(vcpu)) + return 0; + + return vmx_get_cpl(vcpu); +} + +static int vt_get_cpl_no_cache(struct kvm_vcpu *vcpu) +{ + if (is_td_vcpu(vcpu)) + return 0; + + return vmx_get_cpl_no_cache(vcpu); +} + +static void vt_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l) +{ + if (is_td_vcpu(vcpu)) { + *db = 0; + *l = 0; + return; + } + + vmx_get_cs_db_l_bits(vcpu, db, l); +} + +static bool vt_is_valid_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) +{ + if (is_td_vcpu(vcpu)) + return true; + + return vmx_is_valid_cr0(vcpu, cr0); +} + +static void vt_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) +{ + if (is_td_vcpu(vcpu)) + return; + + vmx_set_cr0(vcpu, cr0); +} + +static bool vt_is_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) +{ + if (is_td_vcpu(vcpu)) + return true; + + return vmx_is_valid_cr4(vcpu, cr4); +} + +static void vt_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) +{ + if (is_td_vcpu(vcpu)) + return; + + vmx_set_cr4(vcpu, cr4); +} + +static int vt_set_efer(struct kvm_vcpu *vcpu, u64 efer) +{ + if (is_td_vcpu(vcpu)) + return 0; + + return vmx_set_efer(vcpu, efer); +} + +static void vt_get_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt) +{ + if (is_td_vcpu(vcpu)) { + memset(dt, 0, sizeof(*dt)); + return; + } + + vmx_get_idt(vcpu, dt); +} + +static void vt_set_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt) +{ + if (is_td_vcpu(vcpu)) + return; + + vmx_set_idt(vcpu, dt); +} + +static void vt_get_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt) +{ + if (is_td_vcpu(vcpu)) { + memset(dt, 0, sizeof(*dt)); + return; + } + + vmx_get_gdt(vcpu, dt); +} + +static void vt_set_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt) +{ + if (is_td_vcpu(vcpu)) + return; + + vmx_set_gdt(vcpu, dt); +} + +static void vt_set_dr6(struct kvm_vcpu *vcpu, unsigned long val) +{ + if (is_td_vcpu(vcpu)) + return; + + vmx_set_dr6(vcpu, val); +} + +static void vt_set_dr7(struct kvm_vcpu *vcpu, unsigned long val) +{ + if (is_td_vcpu(vcpu)) + return; + + vmx_set_dr7(vcpu, val); +} + 
+static void vt_sync_dirty_debug_regs(struct kvm_vcpu *vcpu) +{ + /* + * MOV-DR exiting is always cleared for TD guest, even in debug mode. + * Thus KVM_DEBUGREG_WONT_EXIT can never be set and it should never + * reach here for TD vcpu. + */ + if (is_td_vcpu(vcpu)) + return; + + vmx_sync_dirty_debug_regs(vcpu); +} + +static void vt_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg) +{ + if (WARN_ON_ONCE(is_td_vcpu(vcpu))) + return; + + vmx_cache_reg(vcpu, reg); +} + +static unsigned long vt_get_rflags(struct kvm_vcpu *vcpu) +{ + if (is_td_vcpu(vcpu)) + return 0; + + return vmx_get_rflags(vcpu); +} + +static void vt_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags) +{ + if (is_td_vcpu(vcpu)) + return; + + vmx_set_rflags(vcpu, rflags); +} + +static bool vt_get_if_flag(struct kvm_vcpu *vcpu) +{ + if (is_td_vcpu(vcpu)) + return false; + + return vmx_get_if_flag(vcpu); +} + static void vt_flush_tlb_all(struct kvm_vcpu *vcpu) { if (is_td_vcpu(vcpu)) { @@ -457,6 +665,14 @@ static void vt_inject_irq(struct kvm_vcpu *vcpu, bool reinjected) vmx_inject_irq(vcpu, reinjected); } +static void vt_inject_exception(struct kvm_vcpu *vcpu) +{ + if (is_td_vcpu(vcpu)) + return; + + vmx_inject_exception(vcpu); +} + static void vt_cancel_injection(struct kvm_vcpu *vcpu) { if (is_td_vcpu(vcpu)) @@ -504,6 +720,14 @@ static void vt_get_exit_info(struct kvm_vcpu *vcpu, u32 *reason, vmx_get_exit_info(vcpu, reason, info1, info2, intr_info, error_code); } +static void vt_update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr) +{ + if (is_td_vcpu(vcpu)) + return; + + vmx_update_cr8_intercept(vcpu, tpr, irr); +} + static void vt_set_apic_access_page_addr(struct kvm_vcpu *vcpu) { if (is_td_vcpu(vcpu)) @@ -522,6 +746,30 @@ static void vt_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu) vmx_refresh_apicv_exec_ctrl(vcpu); } +static void vt_load_eoi_exitmap(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap) +{ + if (is_td_vcpu(vcpu)) + return; + + vmx_load_eoi_exitmap(vcpu, eoi_exit_bitmap); +} + +static int vt_set_tss_addr(struct kvm *kvm, unsigned int addr) +{ + if (is_td(kvm)) + return 0; + + return vmx_set_tss_addr(kvm, addr); +} + +static int vt_set_identity_map_addr(struct kvm *kvm, u64 ident_addr) +{ + if (is_td(kvm)) + return 0; + + return vmx_set_identity_map_addr(kvm, ident_addr); +} + static int vt_mem_enc_ioctl(struct kvm *kvm, void __user *argp) { if (!is_td(kvm)) @@ -583,32 +831,33 @@ struct kvm_x86_ops vt_x86_ops __initdata = { .vcpu_load = vt_vcpu_load, .vcpu_put = vt_vcpu_put, - .update_exception_bitmap = vmx_update_exception_bitmap, + .update_exception_bitmap = vt_update_exception_bitmap, .get_feature_msr = vmx_get_feature_msr, .get_msr = vt_get_msr, .set_msr = vt_set_msr, - .get_segment_base = vmx_get_segment_base, - .get_segment = vmx_get_segment, - .set_segment = vmx_set_segment, - .get_cpl = vmx_get_cpl, - .get_cpl_no_cache = vmx_get_cpl_no_cache, - .get_cs_db_l_bits = vmx_get_cs_db_l_bits, - .is_valid_cr0 = vmx_is_valid_cr0, - .set_cr0 = vmx_set_cr0, - .is_valid_cr4 = vmx_is_valid_cr4, - .set_cr4 = vmx_set_cr4, - .set_efer = vmx_set_efer, - .get_idt = vmx_get_idt, - .set_idt = vmx_set_idt, - .get_gdt = vmx_get_gdt, - .set_gdt = vmx_set_gdt, - .set_dr6 = vmx_set_dr6, - .set_dr7 = vmx_set_dr7, - .sync_dirty_debug_regs = vmx_sync_dirty_debug_regs, - .cache_reg = vmx_cache_reg, - .get_rflags = vmx_get_rflags, - .set_rflags = vmx_set_rflags, - .get_if_flag = vmx_get_if_flag, + + .get_segment_base = vt_get_segment_base, + .get_segment = vt_get_segment, + .set_segment = vt_set_segment, + .get_cpl = 
vt_get_cpl, + .get_cpl_no_cache = vt_get_cpl_no_cache, + .get_cs_db_l_bits = vt_get_cs_db_l_bits, + .is_valid_cr0 = vt_is_valid_cr0, + .set_cr0 = vt_set_cr0, + .is_valid_cr4 = vt_is_valid_cr4, + .set_cr4 = vt_set_cr4, + .set_efer = vt_set_efer, + .get_idt = vt_get_idt, + .set_idt = vt_set_idt, + .get_gdt = vt_get_gdt, + .set_gdt = vt_set_gdt, + .set_dr6 = vt_set_dr6, + .set_dr7 = vt_set_dr7, + .sync_dirty_debug_regs = vt_sync_dirty_debug_regs, + .cache_reg = vt_cache_reg, + .get_rflags = vt_get_rflags, + .set_rflags = vt_set_rflags, + .get_if_flag = vt_get_if_flag, .flush_tlb_all = vt_flush_tlb_all, .flush_tlb_current = vt_flush_tlb_current, @@ -625,7 +874,7 @@ struct kvm_x86_ops vt_x86_ops __initdata = { .patch_hypercall = vmx_patch_hypercall, .inject_irq = vt_inject_irq, .inject_nmi = vt_inject_nmi, - .inject_exception = vmx_inject_exception, + .inject_exception = vt_inject_exception, .cancel_injection = vt_cancel_injection, .interrupt_allowed = vt_interrupt_allowed, .nmi_allowed = vt_nmi_allowed, @@ -633,13 +882,13 @@ struct kvm_x86_ops vt_x86_ops __initdata = { .set_nmi_mask = vt_set_nmi_mask, .enable_nmi_window = vt_enable_nmi_window, .enable_irq_window = vt_enable_irq_window, - .update_cr8_intercept = vmx_update_cr8_intercept, + .update_cr8_intercept = vt_update_cr8_intercept, .x2apic_icr_is_split = false, .set_virtual_apic_mode = vt_set_virtual_apic_mode, .set_apic_access_page_addr = vt_set_apic_access_page_addr, .refresh_apicv_exec_ctrl = vt_refresh_apicv_exec_ctrl, - .load_eoi_exitmap = vmx_load_eoi_exitmap, + .load_eoi_exitmap = vt_load_eoi_exitmap, .apicv_pre_state_restore = vt_apicv_pre_state_restore, .required_apicv_inhibits = VMX_REQUIRED_APICV_INHIBITS, .hwapic_isr_update = vt_hwapic_isr_update, @@ -647,14 +896,14 @@ struct kvm_x86_ops vt_x86_ops __initdata = { .deliver_interrupt = vt_deliver_interrupt, .dy_apicv_has_pending_interrupt = pi_has_pending_interrupt, - .set_tss_addr = vmx_set_tss_addr, - .set_identity_map_addr = vmx_set_identity_map_addr, + .set_tss_addr = vt_set_tss_addr, + .set_identity_map_addr = vt_set_identity_map_addr, .get_mt_mask = vmx_get_mt_mask, .get_exit_info = vt_get_exit_info, .get_entry_info = vt_get_entry_info, - .vcpu_after_set_cpuid = vmx_vcpu_after_set_cpuid, + .vcpu_after_set_cpuid = vt_vcpu_after_set_cpuid, .has_wbinvd_exit = cpu_has_vmx_wbinvd_exit, -- 2.51.0 From e6bb3978848098e23e5485905870ce585f85a1df Mon Sep 17 00:00:00 2001 From: Isaku Yamahata Date: Thu, 27 Feb 2025 09:20:14 +0800 Subject: [PATCH 04/16] KVM: TDX: Add method to ignore guest instruction emulation Skip instruction emulation and let the TDX guest retry for MMIO emulation after installing the MMIO SPTE with suppress #VE bit cleared. TDX protects TDX guest state from VMM, instructions in guest memory cannot be emulated. MMIO emulation is the only case that triggers the instruction emulation code path for TDX guest. The MMIO emulation handling flow as following: - The TDX guest issues a vMMIO instruction. (The GPA must be shared and is not covered by KVM memory slot.) - The default SPTE entry for shared-EPT by KVM has suppress #VE bit set. So EPT violation causes TD exit to KVM. - Trigger KVM page fault handler and install a new SPTE with suppress #VE bit cleared. - Skip instruction emulation and return X86EMU_RETRY_INSTR to let the vCPU retry. - TDX guest re-executes the vMMIO instruction. - TDX guest gets #VE because KVM has cleared #VE suppress bit. 
- TDX guest #VE handler converts MMIO into TDG.VP.VMCALL Return X86EMU_RETRY_INSTR in the callback check_emulate_instruction() for TDX guests to retry the MMIO instruction. Also, the instruction emulation handling will be skipped, so that the callback check_intercept() will never be called for TDX guest. Signed-off-by: Isaku Yamahata Co-developed-by: Binbin Wu Signed-off-by: Binbin Wu Message-ID: <20250227012021.1778144-14-binbin.wu@linux.intel.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/main.c | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/arch/x86/kvm/vmx/main.c b/arch/x86/kvm/vmx/main.c index 39638afaa918..0b9e4ede6143 100644 --- a/arch/x86/kvm/vmx/main.c +++ b/arch/x86/kvm/vmx/main.c @@ -278,6 +278,22 @@ static void vt_enable_smi_window(struct kvm_vcpu *vcpu) } #endif +static int vt_check_emulate_instruction(struct kvm_vcpu *vcpu, int emul_type, + void *insn, int insn_len) +{ + /* + * For TDX, this can only be triggered for MMIO emulation. Let the + * guest retry after installing the SPTE with suppress #VE bit cleared, + * so that the guest will receive #VE when retry. The guest is expected + * to call TDG.VP.VMCALL to request VMM to do MMIO emulation on + * #VE. + */ + if (is_td_vcpu(vcpu)) + return X86EMUL_RETRY_INSTR; + + return vmx_check_emulate_instruction(vcpu, emul_type, insn, insn_len); +} + static bool vt_apic_init_signal_blocked(struct kvm_vcpu *vcpu) { /* @@ -938,7 +954,7 @@ struct kvm_x86_ops vt_x86_ops __initdata = { .enable_smi_window = vt_enable_smi_window, #endif - .check_emulate_instruction = vmx_check_emulate_instruction, + .check_emulate_instruction = vt_check_emulate_instruction, .apic_init_signal_blocked = vt_apic_init_signal_blocked, .migrate_timers = vmx_migrate_timers, -- 2.51.0 From a946c71cf86bd5eb16e57bb03997b54ef8dd15d5 Mon Sep 17 00:00:00 2001 From: Isaku Yamahata Date: Thu, 27 Feb 2025 09:20:15 +0800 Subject: [PATCH 05/16] KVM: TDX: Add methods to ignore VMX preemption timer TDX doesn't support VMX preemption timer. Implement access methods for VMM to ignore VMX preemption timer. Signed-off-by: Isaku Yamahata Signed-off-by: Binbin Wu Message-ID: <20250227012021.1778144-15-binbin.wu@linux.intel.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/main.c | 25 +++++++++++++++++++++++-- 1 file changed, 23 insertions(+), 2 deletions(-) diff --git a/arch/x86/kvm/vmx/main.c b/arch/x86/kvm/vmx/main.c index 0b9e4ede6143..2e5798d03423 100644 --- a/arch/x86/kvm/vmx/main.c +++ b/arch/x86/kvm/vmx/main.c @@ -786,6 +786,27 @@ static int vt_set_identity_map_addr(struct kvm *kvm, u64 ident_addr) return vmx_set_identity_map_addr(kvm, ident_addr); } +#ifdef CONFIG_X86_64 +static int vt_set_hv_timer(struct kvm_vcpu *vcpu, u64 guest_deadline_tsc, + bool *expired) +{ + /* VMX-preemption timer isn't available for TDX. */ + if (is_td_vcpu(vcpu)) + return -EINVAL; + + return vmx_set_hv_timer(vcpu, guest_deadline_tsc, expired); +} + +static void vt_cancel_hv_timer(struct kvm_vcpu *vcpu) +{ + /* VMX-preemption timer can't be set. See vt_set_hv_timer(). 
*/
+	if (is_td_vcpu(vcpu))
+		return;
+
+	vmx_cancel_hv_timer(vcpu);
+}
+#endif
+
 static int vt_mem_enc_ioctl(struct kvm *kvm, void __user *argp)
 {
 	if (!is_td(kvm))
@@ -941,8 +962,8 @@ struct kvm_x86_ops vt_x86_ops __initdata = {
 	.pi_start_assignment = vmx_pi_start_assignment,
 
 #ifdef CONFIG_X86_64
-	.set_hv_timer = vmx_set_hv_timer,
-	.cancel_hv_timer = vmx_cancel_hv_timer,
+	.set_hv_timer = vt_set_hv_timer,
+	.cancel_hv_timer = vt_cancel_hv_timer,
 #endif
 
 	.setup_mce = vmx_setup_mce,
-- 
2.51.0

From cf5f3668c58c34030eadc9f3c8c228a302cd5f67 Mon Sep 17 00:00:00 2001
From: Isaku Yamahata
Date: Thu, 27 Feb 2025 09:20:16 +0800
Subject: [PATCH 06/16] KVM: TDX: Add methods to ignore accesses to TSC

TDX protects TDX guest TSC state from the VMM. Implement access methods
that ignore accesses to the guest TSC.

Signed-off-by: Isaku Yamahata
Signed-off-by: Binbin Wu
Message-ID: <20250227012021.1778144-16-binbin.wu@linux.intel.com>
Signed-off-by: Paolo Bonzini
---
 arch/x86/kvm/vmx/main.c | 44 +++++++++++++++++++++++++++++++++++++----
 1 file changed, 40 insertions(+), 4 deletions(-)

diff --git a/arch/x86/kvm/vmx/main.c b/arch/x86/kvm/vmx/main.c
index 2e5798d03423..98e59048f11f 100644
--- a/arch/x86/kvm/vmx/main.c
+++ b/arch/x86/kvm/vmx/main.c
@@ -786,6 +786,42 @@ static int vt_set_identity_map_addr(struct kvm *kvm, u64 ident_addr)
 	return vmx_set_identity_map_addr(kvm, ident_addr);
 }
 
+static u64 vt_get_l2_tsc_offset(struct kvm_vcpu *vcpu)
+{
+	/* TDX doesn't support L2 guest at the moment. */
+	if (is_td_vcpu(vcpu))
+		return 0;
+
+	return vmx_get_l2_tsc_offset(vcpu);
+}
+
+static u64 vt_get_l2_tsc_multiplier(struct kvm_vcpu *vcpu)
+{
+	/* TDX doesn't support L2 guest at the moment. */
+	if (is_td_vcpu(vcpu))
+		return 0;
+
+	return vmx_get_l2_tsc_multiplier(vcpu);
+}
+
+static void vt_write_tsc_offset(struct kvm_vcpu *vcpu)
+{
+	/* In TDX, tsc offset can't be changed. */
+	if (is_td_vcpu(vcpu))
+		return;
+
+	vmx_write_tsc_offset(vcpu);
+}
+
+static void vt_write_tsc_multiplier(struct kvm_vcpu *vcpu)
+{
+	/* In TDX, tsc multiplier can't be changed. */
+	if (is_td_vcpu(vcpu))
+		return;
+
+	vmx_write_tsc_multiplier(vcpu);
+}
+
 #ifdef CONFIG_X86_64
 static int vt_set_hv_timer(struct kvm_vcpu *vcpu, u64 guest_deadline_tsc,
 			   bool *expired)
@@ -944,10 +980,10 @@ struct kvm_x86_ops vt_x86_ops __initdata = {
 
 	.has_wbinvd_exit = cpu_has_vmx_wbinvd_exit,
 
-	.get_l2_tsc_offset = vmx_get_l2_tsc_offset,
-	.get_l2_tsc_multiplier = vmx_get_l2_tsc_multiplier,
-	.write_tsc_offset = vmx_write_tsc_offset,
-	.write_tsc_multiplier = vmx_write_tsc_multiplier,
+	.get_l2_tsc_offset = vt_get_l2_tsc_offset,
+	.get_l2_tsc_multiplier = vt_get_l2_tsc_multiplier,
+	.write_tsc_offset = vt_write_tsc_offset,
+	.write_tsc_multiplier = vt_write_tsc_multiplier,
 
 	.load_mmu_pgd = vt_load_mmu_pgd,
 
-- 
2.51.0

From 79264ff080c7902c6f23e0245641bc06b4f9e4d1 Mon Sep 17 00:00:00 2001
From: Isaku Yamahata
Date: Thu, 27 Feb 2025 09:20:17 +0800
Subject: [PATCH 07/16] KVM: TDX: Ignore setting up mce

The vmx_setup_mce() function is VMX specific and cannot be used for TDX.
Add a vt stub that ignores setting up MCE for TDX.
Signed-off-by: Isaku Yamahata Signed-off-by: Binbin Wu Message-ID: <20250227012021.1778144-17-binbin.wu@linux.intel.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/main.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/arch/x86/kvm/vmx/main.c b/arch/x86/kvm/vmx/main.c index 98e59048f11f..00217397666f 100644 --- a/arch/x86/kvm/vmx/main.c +++ b/arch/x86/kvm/vmx/main.c @@ -843,6 +843,14 @@ static void vt_cancel_hv_timer(struct kvm_vcpu *vcpu) } #endif +static void vt_setup_mce(struct kvm_vcpu *vcpu) +{ + if (is_td_vcpu(vcpu)) + return; + + vmx_setup_mce(vcpu); +} + static int vt_mem_enc_ioctl(struct kvm *kvm, void __user *argp) { if (!is_td(kvm)) @@ -1002,7 +1010,7 @@ struct kvm_x86_ops vt_x86_ops __initdata = { .cancel_hv_timer = vt_cancel_hv_timer, #endif - .setup_mce = vmx_setup_mce, + .setup_mce = vt_setup_mce, #ifdef CONFIG_KVM_SMM .smi_allowed = vt_smi_allowed, -- 2.51.0 From 0b75889b0cca92a5692d4266921f9604fcf9ae28 Mon Sep 17 00:00:00 2001 From: Isaku Yamahata Date: Thu, 27 Feb 2025 09:20:18 +0800 Subject: [PATCH 08/16] KVM: TDX: Add a method to ignore hypercall patching Because guest TD memory is protected, VMM patching guest binary for hypercall instruction isn't possible. Add a method to ignore hypercall patching. Note: guest TD kernel needs to be modified to use TDG.VP.VMCALL for hypercall. Signed-off-by: Isaku Yamahata Signed-off-by: Binbin Wu Message-ID: <20250227012021.1778144-18-binbin.wu@linux.intel.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/main.c | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/arch/x86/kvm/vmx/main.c b/arch/x86/kvm/vmx/main.c index 00217397666f..3d8a13e66d36 100644 --- a/arch/x86/kvm/vmx/main.c +++ b/arch/x86/kvm/vmx/main.c @@ -673,6 +673,19 @@ static u32 vt_get_interrupt_shadow(struct kvm_vcpu *vcpu) return vmx_get_interrupt_shadow(vcpu); } +static void vt_patch_hypercall(struct kvm_vcpu *vcpu, + unsigned char *hypercall) +{ + /* + * Because guest memory is protected, guest can't be patched. TD kernel + * is modified to use TDG.VP.VMCALL for hypercall. + */ + if (is_td_vcpu(vcpu)) + return; + + vmx_patch_hypercall(vcpu, hypercall); +} + static void vt_inject_irq(struct kvm_vcpu *vcpu, bool reinjected) { if (is_td_vcpu(vcpu)) @@ -952,7 +965,7 @@ struct kvm_x86_ops vt_x86_ops __initdata = { .update_emulated_instruction = vmx_update_emulated_instruction, .set_interrupt_shadow = vt_set_interrupt_shadow, .get_interrupt_shadow = vt_get_interrupt_shadow, - .patch_hypercall = vmx_patch_hypercall, + .patch_hypercall = vt_patch_hypercall, .inject_irq = vt_inject_irq, .inject_nmi = vt_inject_nmi, .inject_exception = vt_inject_exception, -- 2.51.0 From 26eab9ae4be0255ecb603684b46841857f71df90 Mon Sep 17 00:00:00 2001 From: Binbin Wu Date: Thu, 27 Feb 2025 09:20:19 +0800 Subject: [PATCH 09/16] KVM: TDX: Enable guest access to MTRR MSRs Allow TDX guests to access MTRR MSRs as what KVM does for normal VMs, i.e., KVM emulates accesses to MTRR MSRs, but doesn't virtualize guest MTRR memory types. TDX module exposes MTRR feature to TDX guests unconditionally. KVM needs to support MTRR MSRs accesses for TDX guests to match the architectural behavior. 
Signed-off-by: Binbin Wu Message-ID: <20250227012021.1778144-19-binbin.wu@linux.intel.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/tdx.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/arch/x86/kvm/vmx/tdx.c b/arch/x86/kvm/vmx/tdx.c index 24881e2fafc2..db1ca8b22340 100644 --- a/arch/x86/kvm/vmx/tdx.c +++ b/arch/x86/kvm/vmx/tdx.c @@ -2075,6 +2075,9 @@ bool tdx_has_emulated_msr(u32 index) case MSR_IA32_ARCH_CAPABILITIES: case MSR_IA32_POWER_CTL: case MSR_IA32_CR_PAT: + case MSR_MTRRcap: + case MTRRphysBase_MSR(0) ... MSR_MTRRfix4K_F8000: + case MSR_MTRRdefType: case MSR_IA32_TSC_DEADLINE: case MSR_IA32_MISC_ENABLE: case MSR_PLATFORM_INFO: -- 2.51.0 From 9966b7822b3f49b3aea5d926ece4bc92f1f0a700 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Mon, 3 Mar 2025 09:09:37 -0500 Subject: [PATCH 10/16] KVM: x86: do not allow re-enabling quirks Allowing arbitrary re-enabling of quirks puts a limit on what the quirks themselves can do, since you cannot assume that the quirk prevents a particular state. More important, it also prevents KVM from disabling a quirk at VM creation time, because userspace can always go back and re-enable that. Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index df2da8b511c7..cdc9c7b96511 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -6533,7 +6533,7 @@ int kvm_vm_ioctl_enable_cap(struct kvm *kvm, break; fallthrough; case KVM_CAP_DISABLE_QUIRKS: - kvm->arch.disabled_quirks = cap->args[0]; + kvm->arch.disabled_quirks |= cap->args[0]; r = 0; break; case KVM_CAP_SPLIT_IRQCHIP: { -- 2.51.0 From a4dae7c7a41d803a05192015b2d47aca8aca4abf Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Mon, 3 Mar 2025 11:18:38 -0500 Subject: [PATCH 11/16] KVM: x86: Allow vendor code to disable quirks In some cases, the handling of quirks is split between platform-specific code and generic code, or it is done entirely in generic code, but the relevant bug does not trigger on some platforms; for example, this will be the case for "ignore guest PAT". Allow unaffected vendor modules to disable handling of a quirk for all VMs via a new entry in kvm_caps. Such quirks remain available in KVM_CAP_DISABLE_QUIRKS2, because that API tells userspace that KVM *knows* that some of its past behavior was bogus or just undesirable. In other words, it's plausible for userspace to refuse to run if a quirk is not listed by KVM_CAP_DISABLE_QUIRKS2, so preserve that and make it part of the API. As an example, mark KVM_X86_QUIRK_CD_NW_CLEARED as auto-disabled on Intel systems. Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 3 +++ arch/x86/kvm/svm/svm.c | 1 + arch/x86/kvm/x86.c | 2 ++ arch/x86/kvm/x86.h | 1 + 4 files changed, 7 insertions(+) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 7fdbe84374f4..bf1dc79fb0c6 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -2422,6 +2422,9 @@ int memslot_rmap_alloc(struct kvm_memory_slot *slot, unsigned long npages); KVM_X86_QUIRK_SLOT_ZAP_ALL | \ KVM_X86_QUIRK_STUFF_FEATURE_MSRS) +#define KVM_X86_CONDITIONAL_QUIRKS \ + KVM_X86_QUIRK_CD_NW_CLEARED + /* * KVM previously used a u32 field in kvm_run to indicate the hypercall was * initiated from long mode. 
KVM now sets bit 0 to indicate long mode, but the diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index e67de787fc71..92c1b0fffe6b 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -5472,6 +5472,7 @@ static __init int svm_hardware_setup(void) */ allow_smaller_maxphyaddr = !npt_enabled; + kvm_caps.inapplicable_quirks &= ~KVM_X86_QUIRK_CD_NW_CLEARED; return 0; err: diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index cdc9c7b96511..93168a46adcf 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -9783,6 +9783,7 @@ int kvm_x86_vendor_init(struct kvm_x86_init_ops *ops) kvm_host.xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK); kvm_caps.supported_xcr0 = kvm_host.xcr0 & KVM_SUPPORTED_XCR0; } + kvm_caps.inapplicable_quirks = KVM_X86_CONDITIONAL_QUIRKS; rdmsrl_safe(MSR_EFER, &kvm_host.efer); @@ -12733,6 +12734,7 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) /* Decided by the vendor code for other VM types. */ kvm->arch.pre_fault_allowed = type == KVM_X86_DEFAULT_VM || type == KVM_X86_SW_PROTECTED_VM; + kvm->arch.disabled_quirks = kvm_caps.inapplicable_quirks; ret = kvm_page_track_init(kvm); if (ret) diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h index 8ce6da98b5a2..221778792c3c 100644 --- a/arch/x86/kvm/x86.h +++ b/arch/x86/kvm/x86.h @@ -34,6 +34,7 @@ struct kvm_caps { u64 supported_xcr0; u64 supported_xss; u64 supported_perf_cap; + u64 inapplicable_quirks; }; struct kvm_host_values { -- 2.51.0 From bd7d5362b4c4ac8b951385867a0fadfae0ba3c07 Mon Sep 17 00:00:00 2001 From: Yan Zhao Date: Mon, 24 Feb 2025 15:08:32 +0800 Subject: [PATCH 12/16] KVM: x86: Introduce supported_quirks to block disabling quirks Introduce supported_quirks in kvm_caps to store platform-specific force-enabled quirks. No functional changes intended. Signed-off-by: Yan Zhao Message-ID: <20250224070832.31394-1-yan.y.zhao@intel.com> [Remove unsupported quirks at KVM_ENABLE_CAP time. - Paolo] Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 9 +++++---- arch/x86/kvm/x86.h | 2 ++ 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 93168a46adcf..920b36c31869 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -4790,7 +4790,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) r = enable_pmu ? KVM_CAP_PMU_VALID_MASK : 0; break; case KVM_CAP_DISABLE_QUIRKS2: - r = KVM_X86_VALID_QUIRKS; + r = kvm_caps.supported_quirks; break; case KVM_CAP_X86_NOTIFY_VMEXIT: r = kvm_caps.has_notify_vmexit; @@ -6529,11 +6529,11 @@ int kvm_vm_ioctl_enable_cap(struct kvm *kvm, switch (cap->cap) { case KVM_CAP_DISABLE_QUIRKS2: r = -EINVAL; - if (cap->args[0] & ~KVM_X86_VALID_QUIRKS) + if (cap->args[0] & ~kvm_caps.supported_quirks) break; fallthrough; case KVM_CAP_DISABLE_QUIRKS: - kvm->arch.disabled_quirks |= cap->args[0]; + kvm->arch.disabled_quirks |= cap->args[0] & kvm_caps.supported_quirks; r = 0; break; case KVM_CAP_SPLIT_IRQCHIP: { @@ -9783,6 +9783,7 @@ int kvm_x86_vendor_init(struct kvm_x86_init_ops *ops) kvm_host.xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK); kvm_caps.supported_xcr0 = kvm_host.xcr0 & KVM_SUPPORTED_XCR0; } + kvm_caps.supported_quirks = KVM_X86_VALID_QUIRKS; kvm_caps.inapplicable_quirks = KVM_X86_CONDITIONAL_QUIRKS; rdmsrl_safe(MSR_EFER, &kvm_host.efer); @@ -12734,7 +12735,7 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) /* Decided by the vendor code for other VM types. 
*/ kvm->arch.pre_fault_allowed = type == KVM_X86_DEFAULT_VM || type == KVM_X86_SW_PROTECTED_VM; - kvm->arch.disabled_quirks = kvm_caps.inapplicable_quirks; + kvm->arch.disabled_quirks = kvm_caps.inapplicable_quirks & kvm_caps.supported_quirks; ret = kvm_page_track_init(kvm); if (ret) diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h index 221778792c3c..287dac35ed5e 100644 --- a/arch/x86/kvm/x86.h +++ b/arch/x86/kvm/x86.h @@ -34,6 +34,8 @@ struct kvm_caps { u64 supported_xcr0; u64 supported_xss; u64 supported_perf_cap; + + u64 supported_quirks; u64 inapplicable_quirks; }; -- 2.51.0 From c9c1e20b4c7d60fa084b3257525d21a49fe651a1 Mon Sep 17 00:00:00 2001 From: Yan Zhao Date: Mon, 24 Feb 2025 15:09:45 +0800 Subject: [PATCH 13/16] KVM: x86: Introduce Intel specific quirk KVM_X86_QUIRK_IGNORE_GUEST_PAT Introduce an Intel specific quirk KVM_X86_QUIRK_IGNORE_GUEST_PAT to have KVM ignore guest PAT when this quirk is enabled. On AMD platforms, KVM always honors guest PAT. On Intel however there are two issues. First, KVM *cannot* honor guest PAT if CPU feature self-snoop is not supported. Second, UC access on certain Intel platforms can be very slow[1] and honoring guest PAT on those platforms may break some old guests that accidentally specify video RAM as UC. Those old guests may never expect the slowness since KVM always forces WB previously. See [2]. So, introduce a quirk that KVM can enable by default on all Intel platforms to avoid breaking old unmodifiable guests. Newer userspace can disable this quirk if it wishes KVM to honor guest PAT; disabling the quirk will fail if self-snoop is not supported, i.e. if KVM cannot obey the wish. The quirk is a no-op on AMD and also if any assigned devices have non-coherent DMA. This is not an issue, as KVM_X86_QUIRK_CD_NW_CLEARED is another example of a quirk that is sometimes automatically disabled. Suggested-by: Paolo Bonzini Suggested-by: Sean Christopherson Cc: Kevin Tian Signed-off-by: Yan Zhao Link: https://lore.kernel.org/all/Ztl9NWCOupNfVaCA@yzhao56-desk.sh.intel.com # [1] Link: https://lore.kernel.org/all/87jzfutmfc.fsf@redhat.com # [2] Message-ID: <20250224070946.31482-1-yan.y.zhao@intel.com> [Use supported_quirks/inapplicable_quirks to support both AMD and no-self-snoop cases, as well as to remove the shadow_memtype_mask check from kvm_mmu_may_ignore_guest_pat(). - Paolo] Signed-off-by: Paolo Bonzini --- Documentation/virt/kvm/api.rst | 22 ++++++++++++++++++ arch/x86/include/asm/kvm_host.h | 6 +++-- arch/x86/include/uapi/asm/kvm.h | 1 + arch/x86/kvm/mmu.h | 2 +- arch/x86/kvm/mmu/mmu.c | 10 ++++---- arch/x86/kvm/vmx/vmx.c | 41 +++++++++++++++++++++++++++------ arch/x86/kvm/x86.c | 6 ++++- 7 files changed, 73 insertions(+), 15 deletions(-) diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst index 8dd3151fb9e4..0a6cef2ea5d8 100644 --- a/Documentation/virt/kvm/api.rst +++ b/Documentation/virt/kvm/api.rst @@ -8160,6 +8160,28 @@ KVM_X86_QUIRK_STUFF_FEATURE_MSRS By default, at vCPU creation, KVM sets the and 0x489), as KVM does now allow them to be set by userspace (KVM sets them based on guest CPUID, for safety purposes). + +KVM_X86_QUIRK_IGNORE_GUEST_PAT By default, on Intel platforms, KVM ignores + guest PAT and forces the effective memory + type to WB in EPT. The quirk is not available + on Intel platforms which are incapable of + safely honoring guest PAT (i.e., without CPU + self-snoop, KVM always ignores guest PAT and + forces effective memory type to WB). 
It is + also ignored on AMD platforms or, on Intel, + when a VM has non-coherent DMA devices + assigned; KVM always honors guest PAT in + such case. The quirk is needed to avoid + slowdowns on certain Intel Xeon platforms + (e.g. ICX, SPR) where self-snoop feature is + supported but UC is slow enough to cause + issues with some older guests that use + UC instead of WC to map the video RAM. + Userspace can disable the quirk to honor + guest PAT if it knows that there is no such + guest software, for example if it does not + expose a bochs graphics device (which is + known to have had a buggy driver). =================================== ============================================ 7.32 KVM_CAP_MAX_VCPU_ID diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index bf1dc79fb0c6..7b9ab0883b37 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -2420,10 +2420,12 @@ int memslot_rmap_alloc(struct kvm_memory_slot *slot, unsigned long npages); KVM_X86_QUIRK_FIX_HYPERCALL_INSN | \ KVM_X86_QUIRK_MWAIT_NEVER_UD_FAULTS | \ KVM_X86_QUIRK_SLOT_ZAP_ALL | \ - KVM_X86_QUIRK_STUFF_FEATURE_MSRS) + KVM_X86_QUIRK_STUFF_FEATURE_MSRS | \ + KVM_X86_QUIRK_IGNORE_GUEST_PAT) #define KVM_X86_CONDITIONAL_QUIRKS \ - KVM_X86_QUIRK_CD_NW_CLEARED + (KVM_X86_QUIRK_CD_NW_CLEARED | \ + KVM_X86_QUIRK_IGNORE_GUEST_PAT) /* * KVM previously used a u32 field in kvm_run to indicate the hypercall was diff --git a/arch/x86/include/uapi/asm/kvm.h b/arch/x86/include/uapi/asm/kvm.h index 89cc7a18ef45..dc4d6428dd02 100644 --- a/arch/x86/include/uapi/asm/kvm.h +++ b/arch/x86/include/uapi/asm/kvm.h @@ -441,6 +441,7 @@ struct kvm_sync_regs { #define KVM_X86_QUIRK_MWAIT_NEVER_UD_FAULTS (1 << 6) #define KVM_X86_QUIRK_SLOT_ZAP_ALL (1 << 7) #define KVM_X86_QUIRK_STUFF_FEATURE_MSRS (1 << 8) +#define KVM_X86_QUIRK_IGNORE_GUEST_PAT (1 << 9) #define KVM_STATE_NESTED_FORMAT_VMX 0 #define KVM_STATE_NESTED_FORMAT_SVM 1 diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h index 47e64a3c4ce3..f999c15d8d3e 100644 --- a/arch/x86/kvm/mmu.h +++ b/arch/x86/kvm/mmu.h @@ -232,7 +232,7 @@ static inline u8 permission_fault(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, return -(u32)fault & errcode; } -bool kvm_mmu_may_ignore_guest_pat(void); +bool kvm_mmu_may_ignore_guest_pat(struct kvm *kvm); int kvm_mmu_post_init_vm(struct kvm *kvm); void kvm_mmu_pre_destroy_vm(struct kvm *kvm); diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 0c6a4568af5b..d31ba356e453 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -4663,17 +4663,19 @@ out_unlock: } #endif -bool kvm_mmu_may_ignore_guest_pat(void) +bool kvm_mmu_may_ignore_guest_pat(struct kvm *kvm) { /* * When EPT is enabled (shadow_memtype_mask is non-zero), and the VM * has non-coherent DMA (DMA doesn't snoop CPU caches), KVM's ABI is to * honor the memtype from the guest's PAT so that guest accesses to * memory that is DMA'd aren't cached against the guest's wishes. As a - * result, KVM _may_ ignore guest PAT, whereas without non-coherent DMA, - * KVM _always_ ignores guest PAT (when EPT is enabled). + * result, KVM _may_ ignore guest PAT, whereas without non-coherent DMA. + * KVM _always_ ignores guest PAT, when EPT is enabled and when quirk + * KVM_X86_QUIRK_IGNORE_GUEST_PAT is enabled or the CPU lacks the + * ability to safely honor guest PAT. 
*/ - return shadow_memtype_mask; + return kvm_check_has_quirk(kvm, KVM_X86_QUIRK_IGNORE_GUEST_PAT); } int kvm_tdp_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 71476a33f4f2..871c5d5f38bd 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -7595,6 +7595,17 @@ int vmx_vm_init(struct kvm *kvm) return 0; } +static inline bool vmx_ignore_guest_pat(struct kvm *kvm) +{ + /* + * Non-coherent DMA devices need the guest to flush CPU properly. + * In that case it is not possible to map all guest RAM as WB, so + * always trust guest PAT. + */ + return !kvm_arch_has_noncoherent_dma(kvm) && + kvm_check_has_quirk(kvm, KVM_X86_QUIRK_IGNORE_GUEST_PAT); +} + u8 vmx_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio) { /* @@ -7604,13 +7615,8 @@ u8 vmx_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio) if (is_mmio) return MTRR_TYPE_UNCACHABLE << VMX_EPT_MT_EPTE_SHIFT; - /* - * Force WB and ignore guest PAT if the VM does NOT have a non-coherent - * device attached. Letting the guest control memory types on Intel - * CPUs may result in unexpected behavior, and so KVM's ABI is to trust - * the guest to behave only as a last resort. - */ - if (!kvm_arch_has_noncoherent_dma(vcpu->kvm)) + /* Force WB if ignoring guest PAT */ + if (vmx_ignore_guest_pat(vcpu->kvm)) return (MTRR_TYPE_WRBACK << VMX_EPT_MT_EPTE_SHIFT) | VMX_EPT_IPAT_BIT; return (MTRR_TYPE_WRBACK << VMX_EPT_MT_EPTE_SHIFT); @@ -8502,6 +8508,27 @@ __init int vmx_hardware_setup(void) kvm_set_posted_intr_wakeup_handler(pi_wakeup_handler); + /* + * On Intel CPUs that lack self-snoop feature, letting the guest control + * memory types may result in unexpected behavior. So always ignore guest + * PAT on those CPUs and map VM as writeback, not allowing userspace to + * disable the quirk. + * + * On certain Intel CPUs (e.g. SPR, ICX), though self-snoop feature is + * supported, UC is slow enough to cause issues with some older guests (e.g. + * an old version of bochs driver uses ioremap() instead of ioremap_wc() to + * map the video RAM, causing wayland desktop to fail to get started + * correctly). To avoid breaking those older guests that rely on KVM to force + * memory type to WB, provide KVM_X86_QUIRK_IGNORE_GUEST_PAT to preserve the + * safer (for performance) default behavior. + * + * On top of this, non-coherent DMA devices need the guest to flush CPU + * caches properly. This also requires honoring guest PAT, and is forced + * independent of the quirk in vmx_ignore_guest_pat(). + */ + if (!static_cpu_has(X86_FEATURE_SELFSNOOP)) + kvm_caps.supported_quirks &= ~KVM_X86_QUIRK_IGNORE_GUEST_PAT; + kvm_caps.inapplicable_quirks &= ~KVM_X86_QUIRK_IGNORE_GUEST_PAT; return r; } diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 920b36c31869..fa658bc85b8e 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -9829,6 +9829,10 @@ int kvm_x86_vendor_init(struct kvm_x86_init_ops *ops) if (IS_ENABLED(CONFIG_KVM_SW_PROTECTED_VM) && tdp_mmu_enabled) kvm_caps.supported_vm_types |= BIT(KVM_X86_SW_PROTECTED_VM); + /* KVM always ignores guest PAT for shadow paging. */ + if (!tdp_enabled) + kvm_caps.supported_quirks &= ~KVM_X86_QUIRK_IGNORE_GUEST_PAT; + if (!kvm_cpu_cap_has(X86_FEATURE_XSAVES)) kvm_caps.supported_xss = 0; @@ -13564,7 +13568,7 @@ static void kvm_noncoherent_dma_assignment_start_or_stop(struct kvm *kvm) * (or last) non-coherent device is (un)registered to so that new SPTEs * with the correct "ignore guest PAT" setting are created. 
*/ - if (kvm_mmu_may_ignore_guest_pat()) + if (kvm_mmu_may_ignore_guest_pat(kvm)) kvm_zap_gfn_range(kvm, gpa_to_gfn(0), gpa_to_gfn(~0ULL)); } -- 2.51.0 From 3fee4837ef405597971e9486e46a92270eadae5e Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Mon, 3 Mar 2025 11:31:50 -0500 Subject: [PATCH 14/16] KVM: x86: remove shadow_memtype_mask The IGNORE_GUEST_PAT quirk is inapplicable, and thus always-disabled, if shadow_memtype_mask is zero. As long as vmx_get_mt_mask is not called for the shadow paging case, there is no need to consult shadow_memtype_mask and it can be removed altogether. Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/mmu.c | 15 --------------- arch/x86/kvm/mmu/spte.c | 19 ++----------------- arch/x86/kvm/mmu/spte.h | 1 - arch/x86/kvm/vmx/vmx.c | 2 ++ arch/x86/kvm/x86.c | 4 +++- 5 files changed, 7 insertions(+), 34 deletions(-) diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index d31ba356e453..066f27b56063 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -4663,21 +4663,6 @@ out_unlock: } #endif -bool kvm_mmu_may_ignore_guest_pat(struct kvm *kvm) -{ - /* - * When EPT is enabled (shadow_memtype_mask is non-zero), and the VM - * has non-coherent DMA (DMA doesn't snoop CPU caches), KVM's ABI is to - * honor the memtype from the guest's PAT so that guest accesses to - * memory that is DMA'd aren't cached against the guest's wishes. As a - * result, KVM _may_ ignore guest PAT, whereas without non-coherent DMA. - * KVM _always_ ignores guest PAT, when EPT is enabled and when quirk - * KVM_X86_QUIRK_IGNORE_GUEST_PAT is enabled or the CPU lacks the - * ability to safely honor guest PAT. - */ - return kvm_check_has_quirk(kvm, KVM_X86_QUIRK_IGNORE_GUEST_PAT); -} - int kvm_tdp_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) { #ifdef CONFIG_X86_64 diff --git a/arch/x86/kvm/mmu/spte.c b/arch/x86/kvm/mmu/spte.c index a609d5b58b69..f279153a1588 100644 --- a/arch/x86/kvm/mmu/spte.c +++ b/arch/x86/kvm/mmu/spte.c @@ -37,7 +37,6 @@ u64 __read_mostly shadow_mmio_value; u64 __read_mostly shadow_mmio_mask; u64 __read_mostly shadow_mmio_access_mask; u64 __read_mostly shadow_present_mask; -u64 __read_mostly shadow_memtype_mask; u64 __read_mostly shadow_me_value; u64 __read_mostly shadow_me_mask; u64 __read_mostly shadow_acc_track_mask; @@ -203,9 +202,7 @@ bool make_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, if (level > PG_LEVEL_4K) spte |= PT_PAGE_SIZE_MASK; - if (shadow_memtype_mask) - spte |= kvm_x86_call(get_mt_mask)(vcpu, gfn, - kvm_is_mmio_pfn(pfn)); + spte |= kvm_x86_call(get_mt_mask)(vcpu, gfn, kvm_is_mmio_pfn(pfn)); if (host_writable) spte |= shadow_host_writable_mask; else @@ -460,13 +457,7 @@ void kvm_mmu_set_ept_masks(bool has_ad_bits, bool has_exec_only) /* VMX_EPT_SUPPRESS_VE_BIT is needed for W or X violation. */ shadow_present_mask = (has_exec_only ? 0ull : VMX_EPT_READABLE_MASK) | VMX_EPT_SUPPRESS_VE_BIT; - /* - * EPT overrides the host MTRRs, and so KVM must program the desired - * memtype directly into the SPTEs. Note, this mask is just the mask - * of all bits that factor into the memtype, the actual memtype must be - * dynamically calculated, e.g. to ensure host MMIO is mapped UC. 
- */ - shadow_memtype_mask = VMX_EPT_MT_MASK | VMX_EPT_IPAT_BIT; + shadow_acc_track_mask = VMX_EPT_RWX_MASK; shadow_host_writable_mask = EPT_SPTE_HOST_WRITABLE; shadow_mmu_writable_mask = EPT_SPTE_MMU_WRITABLE; @@ -518,12 +509,6 @@ void kvm_mmu_reset_all_pte_masks(void) shadow_x_mask = 0; shadow_present_mask = PT_PRESENT_MASK; - /* - * For shadow paging and NPT, KVM uses PAT entry '0' to encode WB - * memtype in the SPTEs, i.e. relies on host MTRRs to provide the - * correct memtype (WB is the "weakest" memtype). - */ - shadow_memtype_mask = 0; shadow_acc_track_mask = 0; shadow_me_mask = 0; shadow_me_value = 0; diff --git a/arch/x86/kvm/mmu/spte.h b/arch/x86/kvm/mmu/spte.h index 59746854c0af..249027efff0c 100644 --- a/arch/x86/kvm/mmu/spte.h +++ b/arch/x86/kvm/mmu/spte.h @@ -187,7 +187,6 @@ extern u64 __read_mostly shadow_mmio_value; extern u64 __read_mostly shadow_mmio_mask; extern u64 __read_mostly shadow_mmio_access_mask; extern u64 __read_mostly shadow_present_mask; -extern u64 __read_mostly shadow_memtype_mask; extern u64 __read_mostly shadow_me_value; extern u64 __read_mostly shadow_me_mask; diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 871c5d5f38bd..459e6d58a327 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -8434,6 +8434,8 @@ __init int vmx_hardware_setup(void) if (enable_ept) kvm_mmu_set_ept_masks(enable_ept_ad_bits, cpu_has_vmx_ept_execute_only()); + else + vt_x86_ops.get_mt_mask = NULL; /* * Setup shadow_me_value/shadow_me_mask to include MKTME KeyID diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index fa658bc85b8e..d4da2d936745 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -13567,8 +13567,10 @@ static void kvm_noncoherent_dma_assignment_start_or_stop(struct kvm *kvm) * due to toggling the "ignore PAT" bit. Zap all SPTEs when the first * (or last) non-coherent device is (un)registered to so that new SPTEs * with the correct "ignore guest PAT" setting are created. + * + * If KVM always honors guest PAT, however, there is nothing to do. */ - if (kvm_mmu_may_ignore_guest_pat(kvm)) + if (kvm_check_has_quirk(kvm, KVM_X86_QUIRK_IGNORE_GUEST_PAT)) kvm_zap_gfn_range(kvm, gpa_to_gfn(0), gpa_to_gfn(~0ULL)); } -- 2.51.0 From 90fe64a94d5475738ca5db781cdf93ab39bf844f Mon Sep 17 00:00:00 2001 From: Yan Zhao Date: Mon, 24 Feb 2025 15:10:39 +0800 Subject: [PATCH 15/16] KVM: TDX: KVM: TDX: Always honor guest PAT on TDX enabled guests Always honor guest PAT in KVM-managed EPTs on TDX enabled guests by making self-snoop feature a hard dependency for TDX and making quirk KVM_X86_QUIRK_IGNORE_GUEST_PAT not a valid quirk once TDX is enabled. The quirk KVM_X86_QUIRK_IGNORE_GUEST_PAT only affects memory type of KVM-managed EPTs. For the TDX-module-managed private EPT, memory type is always forced to WB now. Honoring guest PAT in KVM-managed EPTs ensures KVM does not invoke kvm_zap_gfn_range() when attaching/detaching non-coherent DMA devices, which would cause mirrored EPTs for TDs to be zapped, leading to the TDX-module-managed private EPT being incorrectly zapped. As a new feature, TDX always comes with support for self-snoop, and does not have to worry about unmodifiable but buggy guests. So, simply ignore KVM_X86_QUIRK_IGNORE_GUEST_PAT on TDX guests just like kvm-amd.ko already does. Suggested-by: Sean Christopherson Signed-off-by: Yan Zhao Message-ID: <20250224071039.31511-1-yan.y.zhao@intel.com> [Only apply to TDX guests. 
- Paolo]
Signed-off-by: Paolo Bonzini
---
 arch/x86/kvm/vmx/tdx.c | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/arch/x86/kvm/vmx/tdx.c b/arch/x86/kvm/vmx/tdx.c
index db1ca8b22340..fb4f9aa84b95 100644
--- a/arch/x86/kvm/vmx/tdx.c
+++ b/arch/x86/kvm/vmx/tdx.c
@@ -625,6 +625,7 @@ int tdx_vm_init(struct kvm *kvm)
 
 	kvm->arch.has_protected_state = true;
 	kvm->arch.has_private_mem = true;
+	kvm->arch.disabled_quirks |= KVM_X86_QUIRK_IGNORE_GUEST_PAT;
 
 	/*
 	 * Because guest TD is protected, VMM can't parse the instruction in TD.
@@ -3475,6 +3476,11 @@ int __init tdx_bringup(void)
 		goto success_disable_tdx;
 	}
 
+	if (!cpu_feature_enabled(X86_FEATURE_SELFSNOOP)) {
+		pr_err("Self-snoop is required for TDX\n");
+		goto success_disable_tdx;
+	}
+
 	if (!cpu_feature_enabled(X86_FEATURE_TDX_HOST_PLATFORM)) {
 		pr_err("tdx: no TDX private KeyIDs available\n");
 		goto success_disable_tdx;
-- 
2.51.0

From 161d34609f9b319d0706eaf7d2cb7a2731251b25 Mon Sep 17 00:00:00 2001
From: Isaku Yamahata
Date: Tue, 10 Dec 2024 08:49:43 +0800
Subject: [PATCH 16/16] KVM: TDX: Make TDX VM type supported

Now that all the necessary code for TDX is in place, it is ready to run
TDX guests. Advertise the KVM_X86_TDX_VM VM type so that a userspace VMM
like QEMU can start to use it.

Signed-off-by: Isaku Yamahata
Signed-off-by: Binbin Wu
---
TDX "the rest" v2:
- No change.
TDX "the rest" v1:
- Move down to the end of patch series.

Signed-off-by: Paolo Bonzini
---
 arch/x86/kvm/vmx/main.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/arch/x86/kvm/vmx/main.c b/arch/x86/kvm/vmx/main.c
index 3d8a13e66d36..94d5d907d37b 100644
--- a/arch/x86/kvm/vmx/main.c
+++ b/arch/x86/kvm/vmx/main.c
@@ -1091,6 +1091,7 @@ static int __init vt_init(void)
 		       sizeof(struct vcpu_tdx));
 		vcpu_align = max_t(unsigned, vcpu_align,
 				   __alignof__(struct vcpu_tdx));
+		kvm_caps.supported_vm_types |= BIT(KVM_X86_TDX_VM);
 	}
 
 	/*
-- 
2.51.0
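For illustration, a minimal userspace flow that probes for and creates a TD
using the newly advertised VM type could look as follows. This is only a
sketch: error handling is trimmed, and the full TDX-specific bring-up
(KVM_TDX_* commands issued through KVM_MEMORY_ENCRYPT_OP, vCPU creation,
memory setup) is omitted; the exact flow is an assumption rather than part
of this series.

  #include <fcntl.h>
  #include <stdio.h>
  #include <sys/ioctl.h>
  #include <linux/kvm.h>

  int main(void)
  {
          int kvm = open("/dev/kvm", O_RDWR | O_CLOEXEC);
          if (kvm < 0)
                  return 1;

          /* KVM_CAP_VM_TYPES reports a bitmask of supported KVM_X86_*_VM types. */
          int types = ioctl(kvm, KVM_CHECK_EXTENSION, KVM_CAP_VM_TYPES);
          if (types < 0 || !(types & (1 << KVM_X86_TDX_VM))) {
                  fprintf(stderr, "KVM_X86_TDX_VM not supported\n");
                  return 1;
          }

          /* The VM type is passed as the argument to KVM_CREATE_VM. */
          int vm = ioctl(kvm, KVM_CREATE_VM, KVM_X86_TDX_VM);

          return vm < 0;
  }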