From fd3252571b13639e21c24a740b6cc23083ed1c9c Mon Sep 17 00:00:00 2001
From: Yan Zhao
Date: Mon, 13 Jan 2025 11:08:59 +0800
Subject: [PATCH 01/16] KVM: x86/mmu: Add parameter "kvm" to
 kvm_mmu_page_ad_need_write_protect()

Add a parameter "kvm" to kvm_mmu_page_ad_need_write_protect() and its
caller tdp_mmu_need_write_protect().

This is a preparation to make cpu_dirty_log_size a per-VM value rather
than a system-wide value.

No functional change expected.

Signed-off-by: Yan Zhao
Signed-off-by: Paolo Bonzini
---
 arch/x86/kvm/mmu/mmu_internal.h |  3 ++-
 arch/x86/kvm/mmu/spte.c         |  2 +-
 arch/x86/kvm/mmu/tdp_mmu.c      | 12 ++++++------
 3 files changed, 9 insertions(+), 8 deletions(-)

diff --git a/arch/x86/kvm/mmu/mmu_internal.h b/arch/x86/kvm/mmu/mmu_internal.h
index 75f00598289d..86d6d4f82cf4 100644
--- a/arch/x86/kvm/mmu/mmu_internal.h
+++ b/arch/x86/kvm/mmu/mmu_internal.h
@@ -187,7 +187,8 @@ static inline gfn_t kvm_gfn_root_bits(const struct kvm *kvm, const struct kvm_mm
 	return kvm_gfn_direct_bits(kvm);
 }
 
-static inline bool kvm_mmu_page_ad_need_write_protect(struct kvm_mmu_page *sp)
+static inline bool kvm_mmu_page_ad_need_write_protect(struct kvm *kvm,
+						      struct kvm_mmu_page *sp)
 {
 	/*
 	 * When using the EPT page-modification log, the GPAs in the CPU dirty
diff --git a/arch/x86/kvm/mmu/spte.c b/arch/x86/kvm/mmu/spte.c
index e819d16655b6..a609d5b58b69 100644
--- a/arch/x86/kvm/mmu/spte.c
+++ b/arch/x86/kvm/mmu/spte.c
@@ -168,7 +168,7 @@ bool make_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
 
 	if (sp->role.ad_disabled)
 		spte |= SPTE_TDP_AD_DISABLED;
-	else if (kvm_mmu_page_ad_need_write_protect(sp))
+	else if (kvm_mmu_page_ad_need_write_protect(vcpu->kvm, sp))
 		spte |= SPTE_TDP_AD_WRPROT_ONLY;
 
 	spte |= shadow_present_mask;
diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
index 22675a5746d0..fd0a7792386b 100644
--- a/arch/x86/kvm/mmu/tdp_mmu.c
+++ b/arch/x86/kvm/mmu/tdp_mmu.c
@@ -1613,21 +1613,21 @@ void kvm_tdp_mmu_try_split_huge_pages(struct kvm *kvm,
 	}
 }
 
-static bool tdp_mmu_need_write_protect(struct kvm_mmu_page *sp)
+static bool tdp_mmu_need_write_protect(struct kvm *kvm, struct kvm_mmu_page *sp)
 {
 	/*
 	 * All TDP MMU shadow pages share the same role as their root, aside
 	 * from level, so it is valid to key off any shadow page to determine if
 	 * write protection is needed for an entire tree.
	 */
-	return kvm_mmu_page_ad_need_write_protect(sp) || !kvm_ad_enabled;
+	return kvm_mmu_page_ad_need_write_protect(kvm, sp) || !kvm_ad_enabled;
 }
 
 static void clear_dirty_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
 				  gfn_t start, gfn_t end)
 {
-	const u64 dbit = tdp_mmu_need_write_protect(root) ? PT_WRITABLE_MASK :
-							    shadow_dirty_mask;
+	const u64 dbit = tdp_mmu_need_write_protect(kvm, root) ?
+			 PT_WRITABLE_MASK : shadow_dirty_mask;
 	struct tdp_iter iter;
 
 	rcu_read_lock();
@@ -1672,8 +1672,8 @@ void kvm_tdp_mmu_clear_dirty_slot(struct kvm *kvm,
 static void clear_dirty_pt_masked(struct kvm *kvm, struct kvm_mmu_page *root,
 				  gfn_t gfn, unsigned long mask, bool wrprot)
 {
-	const u64 dbit = (wrprot || tdp_mmu_need_write_protect(root)) ? PT_WRITABLE_MASK :
-								        shadow_dirty_mask;
+	const u64 dbit = (wrprot || tdp_mmu_need_write_protect(kvm, root)) ?
+			 PT_WRITABLE_MASK : shadow_dirty_mask;
 	struct tdp_iter iter;
 
 	lockdep_assert_held_write(&kvm->mmu_lock);
-- 
2.51.0

From fbb4adadea55e68f0f9ebde83b069552b7a67fe1 Mon Sep 17 00:00:00 2001
From: Yan Zhao
Date: Thu, 23 Jan 2025 17:58:26 -0800
Subject: [PATCH 02/16] KVM: x86: Make cpu_dirty_log_size a per-VM value

Make cpu_dirty_log_size (CPU's dirty log buffer size) a per-VM value and
set the per-VM cpu_dirty_log_size only for normal VMs when PML is
enabled. Do not set it for TDs.

Until now, cpu_dirty_log_size was a system-wide value that was used for
all VMs and was set to the PML buffer size when PML was enabled in VMX.
However, PML is not currently supported for TDs, though PML remains
available for normal VMs as long as the feature is supported by hardware
and enabled in VMX.

Making cpu_dirty_log_size a per-VM value allows it to be the PML buffer
size for normal VMs and 0 for TDs. This allows functions like
kvm_arch_sync_dirty_log() and kvm_mmu_update_cpu_dirty_logging() to
determine if PML is supported, in order to kick off vCPUs or request them
to update CPU dirty logging status (turn on/off PML in VMCS).

This fixes an issue first reported in [1], where QEMU attaches an
emulated VGA device to a TD; note that KVM_MEM_LOG_DIRTY_PAGES still
works if the corresponding memslot has no KVM_MEM_GUEST_MEMFD flag. KVM
then invokes kvm_mmu_update_cpu_dirty_logging() and from there
vmx_update_cpu_dirty_logging(), which incorrectly accesses a kvm_vmx
struct for a TDX VM.

Reported-by: ANAND NARSHINHA PATIL
Reported-by: Pedro Principeza
Reported-by: Farrah Chen
Closes: https://github.com/canonical/tdx/issues/202
Link: https://github.com/canonical/tdx/issues/202 [1]
Suggested-by: Kai Huang
Signed-off-by: Yan Zhao
Signed-off-by: Paolo Bonzini
---
 arch/x86/include/asm/kvm_host.h | 12 +++++++-----
 arch/x86/kvm/mmu/mmu.c          |  4 ++--
 arch/x86/kvm/mmu/mmu_internal.h |  2 +-
 arch/x86/kvm/vmx/main.c         |  1 -
 arch/x86/kvm/vmx/vmx.c          |  6 +++---
 arch/x86/kvm/x86.c              |  6 +++---
 6 files changed, 16 insertions(+), 15 deletions(-)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 6800d3956ab1..ddb08581c64d 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -1562,6 +1562,13 @@ struct kvm_arch {
 	struct kvm_mmu_memory_cache split_desc_cache;
 
 	gfn_t gfn_direct_bits;
+
+	/*
+	 * Size of the CPU's dirty log buffer, i.e. VMX's PML buffer. A zero
+	 * value indicates CPU dirty logging is unsupported or disabled in
+	 * the current VM.
+	 */
+	int cpu_dirty_log_size;
 };
 
 struct kvm_vm_stat {
@@ -1815,11 +1822,6 @@ struct kvm_x86_ops {
 			       struct x86_exception *exception);
 	void (*handle_exit_irqoff)(struct kvm_vcpu *vcpu);
 
-	/*
-	 * Size of the CPU's dirty log buffer, i.e. VMX's PML buffer. A zero
-	 * value indicates CPU dirty logging is unsupported or disabled.
-	 */
-	int cpu_dirty_log_size;
 	void (*update_cpu_dirty_logging)(struct kvm_vcpu *vcpu);
 
 	const struct kvm_x86_nested_ops *nested_ops;
diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index a0c736c11091..0c6a4568af5b 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -1305,7 +1305,7 @@ void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm,
 	 * enabled but it chooses between clearing the Dirty bit and Writeable
 	 * bit based on the context.
*/ - if (kvm_x86_ops.cpu_dirty_log_size) + if (kvm->arch.cpu_dirty_log_size) kvm_mmu_clear_dirty_pt_masked(kvm, slot, gfn_offset, mask); else kvm_mmu_write_protect_pt_masked(kvm, slot, gfn_offset, mask); @@ -1313,7 +1313,7 @@ void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm, int kvm_cpu_dirty_log_size(struct kvm *kvm) { - return kvm_x86_ops.cpu_dirty_log_size; + return kvm->arch.cpu_dirty_log_size; } bool kvm_mmu_slot_gfn_write_protect(struct kvm *kvm, diff --git a/arch/x86/kvm/mmu/mmu_internal.h b/arch/x86/kvm/mmu/mmu_internal.h index 86d6d4f82cf4..db8f33e4de62 100644 --- a/arch/x86/kvm/mmu/mmu_internal.h +++ b/arch/x86/kvm/mmu/mmu_internal.h @@ -198,7 +198,7 @@ static inline bool kvm_mmu_page_ad_need_write_protect(struct kvm *kvm, * being enabled is mandatory as the bits used to denote WP-only SPTEs * are reserved for PAE paging (32-bit KVM). */ - return kvm_x86_ops.cpu_dirty_log_size && sp->role.guest_mode; + return kvm->arch.cpu_dirty_log_size && sp->role.guest_mode; } static inline gfn_t gfn_round_for_level(gfn_t gfn, int level) diff --git a/arch/x86/kvm/vmx/main.c b/arch/x86/kvm/vmx/main.c index abb0fc723a0b..ba3a23747bb1 100644 --- a/arch/x86/kvm/vmx/main.c +++ b/arch/x86/kvm/vmx/main.c @@ -322,7 +322,6 @@ struct kvm_x86_ops vt_x86_ops __initdata = { .check_intercept = vmx_check_intercept, .handle_exit_irqoff = vmx_handle_exit_irqoff, - .cpu_dirty_log_size = PML_LOG_NR_ENTRIES, .update_cpu_dirty_logging = vmx_update_cpu_dirty_logging, .nested_ops = &vmx_nested_ops, diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index cf0a8a040f7b..39d55d638899 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -7649,6 +7649,9 @@ int vmx_vm_init(struct kvm *kvm) break; } } + + if (enable_pml) + kvm->arch.cpu_dirty_log_size = PML_LOG_NR_ENTRIES; return 0; } @@ -8502,9 +8505,6 @@ __init int vmx_hardware_setup(void) if (!enable_ept || !enable_ept_ad_bits || !cpu_has_vmx_pml()) enable_pml = 0; - if (!enable_pml) - vt_x86_ops.cpu_dirty_log_size = 0; - if (!cpu_has_vmx_preemption_timer()) enable_preemption_timer = false; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 9f92170226e5..8f3292ccdca4 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -6480,7 +6480,7 @@ void kvm_arch_sync_dirty_log(struct kvm *kvm, struct kvm_memory_slot *memslot) struct kvm_vcpu *vcpu; unsigned long i; - if (!kvm_x86_ops.cpu_dirty_log_size) + if (!kvm->arch.cpu_dirty_log_size) return; kvm_for_each_vcpu(i, vcpu, kvm) @@ -13078,7 +13078,7 @@ static void kvm_mmu_update_cpu_dirty_logging(struct kvm *kvm, bool enable) { int nr_slots; - if (!kvm_x86_ops.cpu_dirty_log_size) + if (!kvm->arch.cpu_dirty_log_size) return; nr_slots = atomic_read(&kvm->nr_memslots_dirty_logging); @@ -13150,7 +13150,7 @@ static void kvm_mmu_slot_apply_flags(struct kvm *kvm, if (READ_ONCE(eager_page_split)) kvm_mmu_slot_try_split_huge_pages(kvm, new, PG_LEVEL_4K); - if (kvm_x86_ops.cpu_dirty_log_size) { + if (kvm->arch.cpu_dirty_log_size) { kvm_mmu_slot_leaf_clear_dirty(kvm, new); kvm_mmu_slot_remove_write_access(kvm, new, PG_LEVEL_2M); } else { -- 2.51.0 From 1f62531bc9fa33f2b5169f02871ac14b9e20f575 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Wed, 19 Feb 2025 07:43:51 -0500 Subject: [PATCH 03/16] KVM: TDX: Skip updating CPU dirty logging request for TDs Wrap vmx_update_cpu_dirty_logging so as to ignore requests to update CPU dirty logging for TDs, as basic TDX does not support the PML feature. 
Invoking vmx_update_cpu_dirty_logging() for TDs would cause an incorrect access to a kvm_vmx struct for a TDX VM, so block that before it happens. Signed-off-by: Yan Zhao Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/main.c | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/arch/x86/kvm/vmx/main.c b/arch/x86/kvm/vmx/main.c index ba3a23747bb1..ec8223ee9d28 100644 --- a/arch/x86/kvm/vmx/main.c +++ b/arch/x86/kvm/vmx/main.c @@ -129,6 +129,18 @@ static void vt_vcpu_load(struct kvm_vcpu *vcpu, int cpu) vmx_vcpu_load(vcpu, cpu); } +static void vt_update_cpu_dirty_logging(struct kvm_vcpu *vcpu) +{ + /* + * Basic TDX does not support feature PML. KVM does not enable PML in + * TD's VMCS, nor does it allocate or flush PML buffer for TDX. + */ + if (WARN_ON_ONCE(is_td_vcpu(vcpu))) + return; + + vmx_update_cpu_dirty_logging(vcpu); +} + static void vt_flush_tlb_all(struct kvm_vcpu *vcpu) { if (is_td_vcpu(vcpu)) { @@ -322,7 +334,7 @@ struct kvm_x86_ops vt_x86_ops __initdata = { .check_intercept = vmx_check_intercept, .handle_exit_irqoff = vmx_handle_exit_irqoff, - .update_cpu_dirty_logging = vmx_update_cpu_dirty_logging, + .update_cpu_dirty_logging = vt_update_cpu_dirty_logging, .nested_ops = &vmx_nested_ops, -- 2.51.0 From eac0b72fae3936028feb0f5f44af9dc850ca0133 Mon Sep 17 00:00:00 2001 From: Yan Zhao Date: Mon, 17 Feb 2025 16:56:42 +0800 Subject: [PATCH 04/16] KVM: TDX: Handle SEPT zap error due to page add error in premap Move the handling of SEPT zap errors caused by unsuccessful execution of tdh_mem_page_add() in KVM_TDX_INIT_MEM_REGION from tdx_sept_drop_private_spte() to tdx_sept_zap_private_spte(). Introduce a new helper function tdx_is_sept_zap_err_due_to_premap() to detect this specific error. During the IOCTL KVM_TDX_INIT_MEM_REGION, KVM premaps leaf SPTEs in the mirror page table before the corresponding entry in the private page table is successfully installed by tdh_mem_page_add(). If an error occurs during the invocation of tdh_mem_page_add(), a mismatch between the mirror and private page tables results in SEAMCALLs for SEPT zap returning the error code TDX_EPT_ENTRY_STATE_INCORRECT. The error TDX_EPT_WALK_FAILED is not possible because, during KVM_TDX_INIT_MEM_REGION, KVM only premaps leaf SPTEs after successfully mapping non-leaf SPTEs. Unlike leaf SPTEs, there is no mismatch in non-leaf PTEs between the mirror and private page tables. Therefore, during zap, SEAMCALLs should find an empty leaf entry in the private EPT, leading to the error TDX_EPT_ENTRY_STATE_INCORRECT instead of TDX_EPT_WALK_FAILED. Since tdh_mem_range_block() is always invoked before tdh_mem_page_remove(), move the handling of SEPT zap errors from tdx_sept_drop_private_spte() to tdx_sept_zap_private_spte(). In tdx_sept_zap_private_spte(), return 0 for errors due to premap to skip executing other SEAMCALLs for zap, which are unnecessary. Return 1 to indicate no other errors, allowing the execution of other zap SEAMCALLs to continue. The failure of tdh_mem_page_add() is uncommon and has not been observed in real workloads. Currently, this failure is only hypothetically triggered by skipping the real SEAMCALL and faking the add error in the SEAMCALL wrapper. Additionally, without this fix, there will be no host crashes or other severe issues. 
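As an illustrative sketch of the resulting contract (this mirrors the
hunks below rather than adding new code), the caller treats the return
value of tdx_sept_zap_private_spte() as tri-state:

	ret = tdx_sept_zap_private_spte(kvm, gfn, level, page);
	if (ret <= 0)
		return ret;	/* 0: premap error handled, <0: failure */

	/* ret == 1: range blocked, continue with tracking and removal. */
	tdx_track(kvm);
	return tdx_sept_drop_private_spte(kvm, gfn, level, page);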
Signed-off-by: Yan Zhao
Message-ID: <20250217085642.19696-1-yan.y.zhao@intel.com>
Signed-off-by: Paolo Bonzini
---
 arch/x86/kvm/vmx/tdx.c | 66 ++++++++++++++++++++++++++++++------------
 1 file changed, 47 insertions(+), 19 deletions(-)

diff --git a/arch/x86/kvm/vmx/tdx.c b/arch/x86/kvm/vmx/tdx.c
index 314387728e82..277f5decde9c 100644
--- a/arch/x86/kvm/vmx/tdx.c
+++ b/arch/x86/kvm/vmx/tdx.c
@@ -780,20 +780,6 @@ static int tdx_sept_drop_private_spte(struct kvm *kvm, gfn_t gfn,
 				      &level_state);
 	} while (unlikely(tdx_operand_busy(err)));
 
-	if (unlikely(kvm_tdx->state != TD_STATE_RUNNABLE &&
-		     err == (TDX_EPT_WALK_FAILED | TDX_OPERAND_ID_RCX))) {
-		/*
-		 * Page is mapped by KVM_TDX_INIT_MEM_REGION, but hasn't called
-		 * tdh_mem_page_add().
-		 */
-		if ((!is_last_spte(entry, level) || !(entry & VMX_EPT_RWX_MASK)) &&
-		    !KVM_BUG_ON(!atomic64_read(&kvm_tdx->nr_premapped), kvm)) {
-			atomic64_dec(&kvm_tdx->nr_premapped);
-			tdx_unpin(kvm, page);
-			return 0;
-		}
-	}
-
 	if (KVM_BUG_ON(err, kvm)) {
 		pr_tdx_error_2(TDH_MEM_PAGE_REMOVE, err, entry, level_state);
 		return -EIO;
@@ -831,8 +817,41 @@ int tdx_sept_link_private_spt(struct kvm *kvm, gfn_t gfn,
 	return 0;
 }
 
+/*
+ * Check if the error returned from a SEPT zap SEAMCALL is due to a page being
+ * mapped by KVM_TDX_INIT_MEM_REGION without tdh_mem_page_add() being called
+ * successfully.
+ *
+ * Since tdh_mem_sept_add() must have been invoked successfully before a
+ * non-leaf entry is present in the mirrored page table, the SEPT ZAP related
+ * SEAMCALLs should not encounter err TDX_EPT_WALK_FAILED. They should instead
+ * find TDX_EPT_ENTRY_STATE_INCORRECT due to an empty leaf entry found in the
+ * SEPT.
+ *
+ * Further check if the returned entry from SEPT walking has RWX permissions
+ * to filter out anything unexpected.
+ *
+ * Note: @level is pg_level, not the tdx_level. The tdx_level extracted from
+ * level_state returned from a SEAMCALL error is the same as that passed into
+ * the SEAMCALL.
+ */
+static bool tdx_is_sept_zap_err_due_to_premap(struct kvm_tdx *kvm_tdx, u64 err,
+					      u64 entry, int level)
+{
+	if (!err || kvm_tdx->state == TD_STATE_RUNNABLE)
+		return false;
+
+	if (err != (TDX_EPT_ENTRY_STATE_INCORRECT | TDX_OPERAND_ID_RCX))
+		return false;
+
+	if ((is_last_spte(entry, level) && (entry & VMX_EPT_RWX_MASK)))
+		return false;
+
+	return true;
+}
+
 static int tdx_sept_zap_private_spte(struct kvm *kvm, gfn_t gfn,
-				     enum pg_level level)
+				     enum pg_level level, struct page *page)
 {
 	int tdx_level = pg_level_to_tdx_sept_level(level);
 	struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
@@ -845,11 +864,19 @@ static int tdx_sept_zap_private_spte(struct kvm *kvm, gfn_t gfn,
 	err = tdh_mem_range_block(&kvm_tdx->td, gpa, tdx_level, &entry, &level_state);
 	if (unlikely(tdx_operand_busy(err)))
 		return -EBUSY;
+
+	if (tdx_is_sept_zap_err_due_to_premap(kvm_tdx, err, entry, level) &&
+	    !KVM_BUG_ON(!atomic64_read(&kvm_tdx->nr_premapped), kvm)) {
+		atomic64_dec(&kvm_tdx->nr_premapped);
+		tdx_unpin(kvm, page);
+		return 0;
+	}
+
 	if (KVM_BUG_ON(err, kvm)) {
 		pr_tdx_error_2(TDH_MEM_RANGE_BLOCK, err, entry, level_state);
 		return -EIO;
 	}
-	return 0;
+	return 1;
 }
 
 /*
@@ -923,6 +950,7 @@ int tdx_sept_free_private_spt(struct kvm *kvm, gfn_t gfn,
 int tdx_sept_remove_private_spte(struct kvm *kvm, gfn_t gfn,
 				 enum pg_level level, kvm_pfn_t pfn)
 {
+	struct page *page = pfn_to_page(pfn);
 	int ret;
 
 	/*
@@ -933,8 +961,8 @@ int tdx_sept_remove_private_spte(struct kvm *kvm, gfn_t gfn,
 	if (KVM_BUG_ON(!is_hkid_assigned(to_kvm_tdx(kvm)), kvm))
 		return -EINVAL;
 
-	ret = tdx_sept_zap_private_spte(kvm, gfn, level);
-	if (ret)
+	ret = tdx_sept_zap_private_spte(kvm, gfn, level, page);
+	if (ret <= 0)
 		return ret;
 
 	/*
@@ -943,7 +971,7 @@ int tdx_sept_remove_private_spte(struct kvm *kvm, gfn_t gfn,
 	 */
 	tdx_track(kvm);
 
-	return tdx_sept_drop_private_spte(kvm, gfn, level, pfn_to_page(pfn));
+	return tdx_sept_drop_private_spte(kvm, gfn, level, page);
 }
 
 static int tdx_get_capabilities(struct kvm_tdx_cmd *cmd)
-- 
2.51.0

From 69e23faf82b4e7efe3581f9055a9c29d8d6ff9af Mon Sep 17 00:00:00 2001
From: Kai Huang
Date: Thu, 21 Nov 2024 22:14:40 +0200
Subject: [PATCH 05/16] x86/virt/tdx: Add SEAMCALL wrapper to enter/exit TDX
 guest

Intel TDX protects guest VMs from malicious host and certain physical
attacks. TDX introduces a new operation mode, Secure Arbitration Mode
(SEAM) to isolate and protect guest VMs. A TDX guest VM runs in SEAM and,
unlike VMX, direct control and interaction with the guest by the host
VMM is not possible. Instead, Intel TDX Module, which also runs in SEAM,
provides a SEAMCALL API.

The SEAMCALL that provides the ability to enter a guest is TDH.VP.ENTER.
The TDX Module processes TDH.VP.ENTER, and enters the guest via VMX
VMLAUNCH/VMRESUME instructions. When a guest VM-exit requires host VMM
interaction, the TDH.VP.ENTER SEAMCALL returns to the host VMM (KVM).

Add tdh_vp_enter() to wrap the SEAMCALL invocation of TDH.VP.ENTER;
tdh_vp_enter() needs to be noinstr because VM entry in KVM is noinstr
as well, which is for two reasons:
* marking the area as CT_STATE_GUEST via guest_state_enter_irqoff()
  and guest_state_exit_irqoff()
* IRET must be avoided between VM-exit and NMI handling, in order to
  avoid prematurely releasing the NMI inhibit.

TDH.VP.ENTER is different from other SEAMCALLs in several ways: it
uses more arguments, and after it returns some host state may need to
be restored.
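For illustration only, a minimal (hypothetical) call site would look
like the sketch below; the actual KVM caller, and the vcpu_tdx-style
container providing the "vp" field, arrive in a later patch:

	struct tdx_module_args args = {};
	u64 err;

	/* Guest GPR state is marshalled into @args by the caller. */
	err = tdh_vp_enter(&tdx->vp, &args);
	/* On return, @args holds the output GPRs; some host state may
	 * still need to be restored by the caller. */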
Therefore tdh_vp_enter() uses __seamcall_saved_ret() instead of __seamcall_ret(); since it is the only caller of __seamcall_saved_ret(), it can be made noinstr also. TDH.VP.ENTER arguments are passed through General Purpose Registers (GPRs). For the special case of the TD guest invoking TDG.VP.VMCALL, nearly any GPR can be used, as well as XMM0 to XMM15. Notably, RBP is not used, and Linux mandates the TDX Module feature NO_RBP_MOD, which is enforced elsewhere. Additionally, XMM registers are not required for the existing Guest Hypervisor Communication Interface and are handled by existing KVM code should they be modified by the guest. There are 2 input formats and 5 output formats for TDH.VP.ENTER arguments. Input #1 : Initial entry or following a previous async. TD Exit Input #2 : Following a previous TDCALL(TDG.VP.VMCALL) Output #1 : On Error (No TD Entry) Output #2 : Async. Exits with a VMX Architectural Exit Reason Output #3 : Async. Exits with a non-VMX TD Exit Status Output #4 : Async. Exits with Cross-TD Exit Details Output #5 : On TDCALL(TDG.VP.VMCALL) Currently, to keep things simple, the wrapper function does not attempt to support different formats, and just passes all the GPRs that could be used. The GPR values are held by KVM in the area set aside for guest GPRs. KVM code uses the guest GPR area (vcpu->arch.regs[]) to set up for or process results of tdh_vp_enter(). Therefore changing tdh_vp_enter() to use more complex argument formats would also alter the way KVM code interacts with tdh_vp_enter(). Signed-off-by: Kai Huang Signed-off-by: Adrian Hunter Message-ID: <20241121201448.36170-2-adrian.hunter@intel.com> Acked-by: Dave Hansen Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/tdx.h | 1 + arch/x86/virt/vmx/tdx/seamcall.S | 3 +++ arch/x86/virt/vmx/tdx/tdx.c | 8 ++++++++ arch/x86/virt/vmx/tdx/tdx.h | 1 + 4 files changed, 13 insertions(+) diff --git a/arch/x86/include/asm/tdx.h b/arch/x86/include/asm/tdx.h index e38e7558c02f..bb4a3bc26219 100644 --- a/arch/x86/include/asm/tdx.h +++ b/arch/x86/include/asm/tdx.h @@ -165,6 +165,7 @@ static inline int pg_level_to_tdx_sept_level(enum pg_level level) return level - 1; } +u64 tdh_vp_enter(struct tdx_vp *vp, struct tdx_module_args *args); u64 tdh_mng_addcx(struct tdx_td *td, struct page *tdcs_page); u64 tdh_mem_page_add(struct tdx_td *td, u64 gpa, struct page *page, struct page *source, u64 *ext_err1, u64 *ext_err2); u64 tdh_mem_sept_add(struct tdx_td *td, u64 gpa, int level, struct page *page, u64 *ext_err1, u64 *ext_err2); diff --git a/arch/x86/virt/vmx/tdx/seamcall.S b/arch/x86/virt/vmx/tdx/seamcall.S index 5b1f2286aea9..6854c52c374b 100644 --- a/arch/x86/virt/vmx/tdx/seamcall.S +++ b/arch/x86/virt/vmx/tdx/seamcall.S @@ -41,6 +41,9 @@ SYM_FUNC_START(__seamcall_ret) TDX_MODULE_CALL host=1 ret=1 SYM_FUNC_END(__seamcall_ret) +/* KVM requires non-instrumentable __seamcall_saved_ret() for TDH.VP.ENTER */ +.section .noinstr.text, "ax" + /* * __seamcall_saved_ret() - Host-side interface functions to SEAM software * (the P-SEAMLDR or the TDX module), with saving output registers to the diff --git a/arch/x86/virt/vmx/tdx/tdx.c b/arch/x86/virt/vmx/tdx/tdx.c index 8eaaf31a4187..f5e2a937c1e7 100644 --- a/arch/x86/virt/vmx/tdx/tdx.c +++ b/arch/x86/virt/vmx/tdx/tdx.c @@ -1517,6 +1517,14 @@ static void tdx_clflush_page(struct page *page) clflush_cache_range(page_to_virt(page), PAGE_SIZE); } +noinstr u64 tdh_vp_enter(struct tdx_vp *td, struct tdx_module_args *args) +{ + args->rcx = tdx_tdvpr_pa(td); + + return 
__seamcall_saved_ret(TDH_VP_ENTER, args); +} +EXPORT_SYMBOL_GPL(tdh_vp_enter); + u64 tdh_mng_addcx(struct tdx_td *td, struct page *tdcs_page) { struct tdx_module_args args = { diff --git a/arch/x86/virt/vmx/tdx/tdx.h b/arch/x86/virt/vmx/tdx/tdx.h index ed7152f81a6d..82bb82be8567 100644 --- a/arch/x86/virt/vmx/tdx/tdx.h +++ b/arch/x86/virt/vmx/tdx/tdx.h @@ -14,6 +14,7 @@ /* * TDX module SEAMCALL leaf functions */ +#define TDH_VP_ENTER 0 #define TDH_MNG_ADDCX 1 #define TDH_MEM_PAGE_ADD 2 #define TDH_MEM_SEPT_ADD 3 -- 2.51.0 From 7172c753c26a51c2e2bce922f0c94b62cb2ade46 Mon Sep 17 00:00:00 2001 From: Binbin Wu Date: Fri, 14 Mar 2025 14:06:48 -0400 Subject: [PATCH 06/16] KVM: VMX: Move common fields of struct vcpu_{vmx,tdx} to a struct Move common fields of struct vcpu_vmx and struct vcpu_tdx to struct vcpu_vt, to share the code between VMX/TDX as much as possible and to make TDX exit handling more VMX like. No functional change intended. [Adrian: move code that depends on struct vcpu_vmx back to vmx.h] Suggested-by: Sean Christopherson Link: https://lore.kernel.org/r/Z1suNzg2Or743a7e@google.com Signed-off-by: Binbin Wu Signed-off-by: Adrian Hunter Message-ID: <20250129095902.16391-5-adrian.hunter@intel.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/common.h | 69 ++++++++++++++++++++++ arch/x86/kvm/vmx/main.c | 4 ++ arch/x86/kvm/vmx/nested.c | 10 ++-- arch/x86/kvm/vmx/posted_intr.c | 18 +++--- arch/x86/kvm/vmx/tdx.h | 16 +----- arch/x86/kvm/vmx/vmx.c | 93 +++++++++++++++--------------- arch/x86/kvm/vmx/vmx.h | 102 +++++++++++++-------------------- 7 files changed, 176 insertions(+), 136 deletions(-) diff --git a/arch/x86/kvm/vmx/common.h b/arch/x86/kvm/vmx/common.h index 7a592467a044..a298ac795c09 100644 --- a/arch/x86/kvm/vmx/common.h +++ b/arch/x86/kvm/vmx/common.h @@ -3,9 +3,78 @@ #define __KVM_X86_VMX_COMMON_H #include +#include #include "mmu.h" +union vmx_exit_reason { + struct { + u32 basic : 16; + u32 reserved16 : 1; + u32 reserved17 : 1; + u32 reserved18 : 1; + u32 reserved19 : 1; + u32 reserved20 : 1; + u32 reserved21 : 1; + u32 reserved22 : 1; + u32 reserved23 : 1; + u32 reserved24 : 1; + u32 reserved25 : 1; + u32 bus_lock_detected : 1; + u32 enclave_mode : 1; + u32 smi_pending_mtf : 1; + u32 smi_from_vmx_root : 1; + u32 reserved30 : 1; + u32 failed_vmentry : 1; + }; + u32 full; +}; + +struct vcpu_vt { + /* Posted interrupt descriptor */ + struct pi_desc pi_desc; + + /* Used if this vCPU is waiting for PI notification wakeup. */ + struct list_head pi_wakeup_list; + + union vmx_exit_reason exit_reason; + + unsigned long exit_qualification; + u32 exit_intr_info; + + /* + * If true, guest state has been loaded into hardware, and host state + * saved into vcpu_{vt,vmx,tdx}. If false, host state is loaded into + * hardware. + */ + bool guest_state_loaded; + +#ifdef CONFIG_X86_64 + u64 msr_host_kernel_gs_base; +#endif + + unsigned long host_debugctlmsr; +}; + +#ifdef CONFIG_KVM_INTEL_TDX + +static __always_inline bool is_td(struct kvm *kvm) +{ + return kvm->arch.vm_type == KVM_X86_TDX_VM; +} + +static __always_inline bool is_td_vcpu(struct kvm_vcpu *vcpu) +{ + return is_td(vcpu->kvm); +} + +#else + +static inline bool is_td(struct kvm *kvm) { return false; } +static inline bool is_td_vcpu(struct kvm_vcpu *vcpu) { return false; } + +#endif + static inline bool vt_is_tdx_private_gpa(struct kvm *kvm, gpa_t gpa) { /* For TDX the direct mask is the shared mask. 
*/ diff --git a/arch/x86/kvm/vmx/main.c b/arch/x86/kvm/vmx/main.c index ec8223ee9d28..36b6343a011e 100644 --- a/arch/x86/kvm/vmx/main.c +++ b/arch/x86/kvm/vmx/main.c @@ -10,6 +10,10 @@ #include "tdx.h" #include "tdx_arch.h" +#ifdef CONFIG_KVM_INTEL_TDX +static_assert(offsetof(struct vcpu_vmx, vt) == offsetof(struct vcpu_tdx, vt)); +#endif + static void vt_disable_virtualization_cpu(void) { /* Note, TDX *and* VMX need to be disabled if TDX is enabled. */ diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index ed8a3cb53961..99f02972cd74 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -275,7 +275,7 @@ static void vmx_sync_vmcs_host_state(struct vcpu_vmx *vmx, { struct vmcs_host_state *dest, *src; - if (unlikely(!vmx->guest_state_loaded)) + if (unlikely(!vmx->vt.guest_state_loaded)) return; src = &prev->host_state; @@ -425,7 +425,7 @@ static void nested_ept_inject_page_fault(struct kvm_vcpu *vcpu, * tables also changed, but KVM should not treat EPT Misconfig * VM-Exits as writes. */ - WARN_ON_ONCE(vmx->exit_reason.basic != EXIT_REASON_EPT_VIOLATION); + WARN_ON_ONCE(vmx->vt.exit_reason.basic != EXIT_REASON_EPT_VIOLATION); /* * PML Full and EPT Violation VM-Exits both use bit 12 to report @@ -4622,7 +4622,7 @@ static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, { /* update exit information fields: */ vmcs12->vm_exit_reason = vm_exit_reason; - if (to_vmx(vcpu)->exit_reason.enclave_mode) + if (vmx_get_exit_reason(vcpu).enclave_mode) vmcs12->vm_exit_reason |= VMX_EXIT_REASONS_SGX_ENCLAVE_MODE; vmcs12->exit_qualification = exit_qualification; @@ -6126,7 +6126,7 @@ fail: * nested VM-Exit. Pass the original exit reason, i.e. don't hardcode * EXIT_REASON_VMFUNC as the exit reason. */ - nested_vmx_vmexit(vcpu, vmx->exit_reason.full, + nested_vmx_vmexit(vcpu, vmx->vt.exit_reason.full, vmx_get_intr_info(vcpu), vmx_get_exit_qual(vcpu)); return 1; @@ -6571,7 +6571,7 @@ static bool nested_vmx_l1_wants_exit(struct kvm_vcpu *vcpu, bool nested_vmx_reflect_vmexit(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); - union vmx_exit_reason exit_reason = vmx->exit_reason; + union vmx_exit_reason exit_reason = vmx->vt.exit_reason; unsigned long exit_qual; u32 exit_intr_info; diff --git a/arch/x86/kvm/vmx/posted_intr.c b/arch/x86/kvm/vmx/posted_intr.c index ec08fa3caf43..5696e0f9f924 100644 --- a/arch/x86/kvm/vmx/posted_intr.c +++ b/arch/x86/kvm/vmx/posted_intr.c @@ -33,7 +33,7 @@ static DEFINE_PER_CPU(raw_spinlock_t, wakeup_vcpus_on_cpu_lock); static inline struct pi_desc *vcpu_to_pi_desc(struct kvm_vcpu *vcpu) { - return &(to_vmx(vcpu)->pi_desc); + return &(to_vt(vcpu)->pi_desc); } static int pi_try_set_control(struct pi_desc *pi_desc, u64 *pold, u64 new) @@ -53,7 +53,7 @@ static int pi_try_set_control(struct pi_desc *pi_desc, u64 *pold, u64 new) void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu) { struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu); - struct vcpu_vmx *vmx = to_vmx(vcpu); + struct vcpu_vt *vt = to_vt(vcpu); struct pi_desc old, new; unsigned long flags; unsigned int dest; @@ -90,7 +90,7 @@ void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu) */ if (pi_desc->nv == POSTED_INTR_WAKEUP_VECTOR) { raw_spin_lock(&per_cpu(wakeup_vcpus_on_cpu_lock, vcpu->cpu)); - list_del(&vmx->pi_wakeup_list); + list_del(&vt->pi_wakeup_list); raw_spin_unlock(&per_cpu(wakeup_vcpus_on_cpu_lock, vcpu->cpu)); } @@ -146,14 +146,14 @@ static bool vmx_can_use_vtd_pi(struct kvm *kvm) static void pi_enable_wakeup_handler(struct kvm_vcpu *vcpu) { struct pi_desc 
*pi_desc = vcpu_to_pi_desc(vcpu); - struct vcpu_vmx *vmx = to_vmx(vcpu); + struct vcpu_vt *vt = to_vt(vcpu); struct pi_desc old, new; unsigned long flags; local_irq_save(flags); raw_spin_lock(&per_cpu(wakeup_vcpus_on_cpu_lock, vcpu->cpu)); - list_add_tail(&vmx->pi_wakeup_list, + list_add_tail(&vt->pi_wakeup_list, &per_cpu(wakeup_vcpus_on_cpu, vcpu->cpu)); raw_spin_unlock(&per_cpu(wakeup_vcpus_on_cpu_lock, vcpu->cpu)); @@ -220,13 +220,13 @@ void pi_wakeup_handler(void) int cpu = smp_processor_id(); struct list_head *wakeup_list = &per_cpu(wakeup_vcpus_on_cpu, cpu); raw_spinlock_t *spinlock = &per_cpu(wakeup_vcpus_on_cpu_lock, cpu); - struct vcpu_vmx *vmx; + struct vcpu_vt *vt; raw_spin_lock(spinlock); - list_for_each_entry(vmx, wakeup_list, pi_wakeup_list) { + list_for_each_entry(vt, wakeup_list, pi_wakeup_list) { - if (pi_test_on(&vmx->pi_desc)) - kvm_vcpu_wake_up(&vmx->vcpu); + if (pi_test_on(&vt->pi_desc)) + kvm_vcpu_wake_up(vt_to_vcpu(vt)); } raw_spin_unlock(spinlock); } diff --git a/arch/x86/kvm/vmx/tdx.h b/arch/x86/kvm/vmx/tdx.h index 5f34b79d16dd..db6aa02f9c80 100644 --- a/arch/x86/kvm/vmx/tdx.h +++ b/arch/x86/kvm/vmx/tdx.h @@ -6,6 +6,8 @@ #include "tdx_errno.h" #ifdef CONFIG_KVM_INTEL_TDX +#include "common.h" + int tdx_bringup(void); void tdx_cleanup(void); @@ -45,6 +47,7 @@ enum vcpu_tdx_state { struct vcpu_tdx { struct kvm_vcpu vcpu; + struct vcpu_vt vt; struct tdx_vp vp; @@ -57,16 +60,6 @@ void tdh_vp_rd_failed(struct vcpu_tdx *tdx, char *uclass, u32 field, u64 err); void tdh_vp_wr_failed(struct vcpu_tdx *tdx, char *uclass, char *op, u32 field, u64 val, u64 err); -static inline bool is_td(struct kvm *kvm) -{ - return kvm->arch.vm_type == KVM_X86_TDX_VM; -} - -static inline bool is_td_vcpu(struct kvm_vcpu *vcpu) -{ - return is_td(vcpu->kvm); -} - static __always_inline u64 td_tdcs_exec_read64(struct kvm_tdx *kvm_tdx, u32 field) { u64 err, data; @@ -176,9 +169,6 @@ struct vcpu_tdx { struct kvm_vcpu vcpu; }; -static inline bool is_td(struct kvm *kvm) { return false; } -static inline bool is_td_vcpu(struct kvm_vcpu *vcpu) { return false; } - #endif #endif diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 39d55d638899..e7e451209306 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -1282,6 +1282,7 @@ void vmx_set_host_fs_gs(struct vmcs_host_state *host, u16 fs_sel, u16 gs_sel, void vmx_prepare_switch_to_guest(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); + struct vcpu_vt *vt = to_vt(vcpu); struct vmcs_host_state *host_state; #ifdef CONFIG_X86_64 int cpu = raw_smp_processor_id(); @@ -1310,7 +1311,7 @@ void vmx_prepare_switch_to_guest(struct kvm_vcpu *vcpu) if (vmx->nested.need_vmcs12_to_shadow_sync) nested_sync_vmcs12_to_shadow(vcpu); - if (vmx->guest_state_loaded) + if (vt->guest_state_loaded) return; host_state = &vmx->loaded_vmcs->host_state; @@ -1331,12 +1332,12 @@ void vmx_prepare_switch_to_guest(struct kvm_vcpu *vcpu) fs_sel = current->thread.fsindex; gs_sel = current->thread.gsindex; fs_base = current->thread.fsbase; - vmx->msr_host_kernel_gs_base = current->thread.gsbase; + vt->msr_host_kernel_gs_base = current->thread.gsbase; } else { savesegment(fs, fs_sel); savesegment(gs, gs_sel); fs_base = read_msr(MSR_FS_BASE); - vmx->msr_host_kernel_gs_base = read_msr(MSR_KERNEL_GS_BASE); + vt->msr_host_kernel_gs_base = read_msr(MSR_KERNEL_GS_BASE); } wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base); @@ -1348,14 +1349,14 @@ void vmx_prepare_switch_to_guest(struct kvm_vcpu *vcpu) #endif vmx_set_host_fs_gs(host_state, fs_sel, 
gs_sel, fs_base, gs_base); - vmx->guest_state_loaded = true; + vt->guest_state_loaded = true; } static void vmx_prepare_switch_to_host(struct vcpu_vmx *vmx) { struct vmcs_host_state *host_state; - if (!vmx->guest_state_loaded) + if (!vmx->vt.guest_state_loaded) return; host_state = &vmx->loaded_vmcs->host_state; @@ -1383,10 +1384,10 @@ static void vmx_prepare_switch_to_host(struct vcpu_vmx *vmx) #endif invalidate_tss_limit(); #ifdef CONFIG_X86_64 - wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_host_kernel_gs_base); + wrmsrl(MSR_KERNEL_GS_BASE, vmx->vt.msr_host_kernel_gs_base); #endif load_fixmap_gdt(raw_smp_processor_id()); - vmx->guest_state_loaded = false; + vmx->vt.guest_state_loaded = false; vmx->guest_uret_msrs_loaded = false; } @@ -1394,7 +1395,7 @@ static void vmx_prepare_switch_to_host(struct vcpu_vmx *vmx) static u64 vmx_read_guest_kernel_gs_base(struct vcpu_vmx *vmx) { preempt_disable(); - if (vmx->guest_state_loaded) + if (vmx->vt.guest_state_loaded) rdmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base); preempt_enable(); return vmx->msr_guest_kernel_gs_base; @@ -1403,7 +1404,7 @@ static u64 vmx_read_guest_kernel_gs_base(struct vcpu_vmx *vmx) static void vmx_write_guest_kernel_gs_base(struct vcpu_vmx *vmx, u64 data) { preempt_disable(); - if (vmx->guest_state_loaded) + if (vmx->vt.guest_state_loaded) wrmsrl(MSR_KERNEL_GS_BASE, data); preempt_enable(); vmx->msr_guest_kernel_gs_base = data; @@ -1699,7 +1700,7 @@ int vmx_check_emulate_instruction(struct kvm_vcpu *vcpu, int emul_type, * so that guest userspace can't DoS the guest simply by triggering * emulation (enclaves are CPL3 only). */ - if (to_vmx(vcpu)->exit_reason.enclave_mode) { + if (vmx_get_exit_reason(vcpu).enclave_mode) { kvm_queue_exception(vcpu, UD_VECTOR); return X86EMUL_PROPAGATE_FAULT; } @@ -1714,7 +1715,7 @@ int vmx_check_emulate_instruction(struct kvm_vcpu *vcpu, int emul_type, static int skip_emulated_instruction(struct kvm_vcpu *vcpu) { - union vmx_exit_reason exit_reason = to_vmx(vcpu)->exit_reason; + union vmx_exit_reason exit_reason = vmx_get_exit_reason(vcpu); unsigned long rip, orig_rip; u32 instr_len; @@ -4273,7 +4274,7 @@ static int vmx_deliver_nested_posted_interrupt(struct kvm_vcpu *vcpu, */ static int vmx_deliver_posted_interrupt(struct kvm_vcpu *vcpu, int vector) { - struct vcpu_vmx *vmx = to_vmx(vcpu); + struct vcpu_vt *vt = to_vt(vcpu); int r; r = vmx_deliver_nested_posted_interrupt(vcpu, vector); @@ -4284,11 +4285,11 @@ static int vmx_deliver_posted_interrupt(struct kvm_vcpu *vcpu, int vector) if (!vcpu->arch.apic->apicv_active) return -1; - if (pi_test_and_set_pir(vector, &vmx->pi_desc)) + if (pi_test_and_set_pir(vector, &vt->pi_desc)) return 0; /* If a previous notification has sent the IPI, nothing to do. */ - if (pi_test_and_set_on(&vmx->pi_desc)) + if (pi_test_and_set_on(&vt->pi_desc)) return 0; /* @@ -4764,7 +4765,7 @@ static void init_vmcs(struct vcpu_vmx *vmx) vmcs_write16(GUEST_INTR_STATUS, 0); vmcs_write16(POSTED_INTR_NV, POSTED_INTR_VECTOR); - vmcs_write64(POSTED_INTR_DESC_ADDR, __pa((&vmx->pi_desc))); + vmcs_write64(POSTED_INTR_DESC_ADDR, __pa((&vmx->vt.pi_desc))); } if (vmx_can_use_ipiv(&vmx->vcpu)) { @@ -4877,8 +4878,8 @@ static void __vmx_vcpu_reset(struct kvm_vcpu *vcpu) * Enforce invariant: pi_desc.nv is always either POSTED_INTR_VECTOR * or POSTED_INTR_WAKEUP_VECTOR. 
*/ - vmx->pi_desc.nv = POSTED_INTR_VECTOR; - __pi_set_sn(&vmx->pi_desc); + vmx->vt.pi_desc.nv = POSTED_INTR_VECTOR; + __pi_set_sn(&vmx->vt.pi_desc); } void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) @@ -6064,7 +6065,7 @@ static int handle_bus_lock_vmexit(struct kvm_vcpu *vcpu) * VM-Exits. Unconditionally set the flag here and leave the handling to * vmx_handle_exit(). */ - to_vmx(vcpu)->exit_reason.bus_lock_detected = true; + to_vt(vcpu)->exit_reason.bus_lock_detected = true; return 1; } @@ -6162,9 +6163,9 @@ void vmx_get_exit_info(struct kvm_vcpu *vcpu, u32 *reason, { struct vcpu_vmx *vmx = to_vmx(vcpu); - *reason = vmx->exit_reason.full; + *reason = vmx->vt.exit_reason.full; *info1 = vmx_get_exit_qual(vcpu); - if (!(vmx->exit_reason.failed_vmentry)) { + if (!(vmx->vt.exit_reason.failed_vmentry)) { *info2 = vmx->idt_vectoring_info; *intr_info = vmx_get_intr_info(vcpu); if (is_exception_with_error_code(*intr_info)) @@ -6460,7 +6461,7 @@ void dump_vmcs(struct kvm_vcpu *vcpu) static int __vmx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath) { struct vcpu_vmx *vmx = to_vmx(vcpu); - union vmx_exit_reason exit_reason = vmx->exit_reason; + union vmx_exit_reason exit_reason = vmx_get_exit_reason(vcpu); u32 vectoring_info = vmx->idt_vectoring_info; u16 exit_handler_index; @@ -6626,7 +6627,7 @@ int vmx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath) * Exit to user space when bus lock detected to inform that there is * a bus lock in guest. */ - if (to_vmx(vcpu)->exit_reason.bus_lock_detected) { + if (vmx_get_exit_reason(vcpu).bus_lock_detected) { if (ret > 0) vcpu->run->exit_reason = KVM_EXIT_X86_BUS_LOCK; @@ -6905,22 +6906,22 @@ static void vmx_set_rvi(int vector) int vmx_sync_pir_to_irr(struct kvm_vcpu *vcpu) { - struct vcpu_vmx *vmx = to_vmx(vcpu); + struct vcpu_vt *vt = to_vt(vcpu); int max_irr; bool got_posted_interrupt; if (KVM_BUG_ON(!enable_apicv, vcpu->kvm)) return -EIO; - if (pi_test_on(&vmx->pi_desc)) { - pi_clear_on(&vmx->pi_desc); + if (pi_test_on(&vt->pi_desc)) { + pi_clear_on(&vt->pi_desc); /* * IOMMU can write to PID.ON, so the barrier matters even on UP. * But on x86 this is just a compiler barrier anyway. */ smp_mb__after_atomic(); got_posted_interrupt = - kvm_apic_update_irr(vcpu, vmx->pi_desc.pir, &max_irr); + kvm_apic_update_irr(vcpu, vt->pi_desc.pir, &max_irr); } else { max_irr = kvm_lapic_find_highest_irr(vcpu); got_posted_interrupt = false; @@ -6962,10 +6963,10 @@ void vmx_load_eoi_exitmap(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap) void vmx_apicv_pre_state_restore(struct kvm_vcpu *vcpu) { - struct vcpu_vmx *vmx = to_vmx(vcpu); + struct vcpu_vt *vt = to_vt(vcpu); - pi_clear_on(&vmx->pi_desc); - memset(vmx->pi_desc.pir, 0, sizeof(vmx->pi_desc.pir)); + pi_clear_on(&vt->pi_desc); + memset(vt->pi_desc.pir, 0, sizeof(vt->pi_desc.pir)); } void vmx_do_interrupt_irqoff(unsigned long entry); @@ -7030,9 +7031,9 @@ void vmx_handle_exit_irqoff(struct kvm_vcpu *vcpu) if (vmx->emulation_required) return; - if (vmx->exit_reason.basic == EXIT_REASON_EXTERNAL_INTERRUPT) + if (vmx_get_exit_reason(vcpu).basic == EXIT_REASON_EXTERNAL_INTERRUPT) handle_external_interrupt_irqoff(vcpu, vmx_get_intr_info(vcpu)); - else if (vmx->exit_reason.basic == EXIT_REASON_EXCEPTION_NMI) + else if (vmx_get_exit_reason(vcpu).basic == EXIT_REASON_EXCEPTION_NMI) handle_exception_irqoff(vcpu, vmx_get_intr_info(vcpu)); } @@ -7263,10 +7264,10 @@ static fastpath_t vmx_exit_handlers_fastpath(struct kvm_vcpu *vcpu, * the fastpath even, all other exits must use the slow path. 
*/ if (is_guest_mode(vcpu) && - to_vmx(vcpu)->exit_reason.basic != EXIT_REASON_PREEMPTION_TIMER) + vmx_get_exit_reason(vcpu).basic != EXIT_REASON_PREEMPTION_TIMER) return EXIT_FASTPATH_NONE; - switch (to_vmx(vcpu)->exit_reason.basic) { + switch (vmx_get_exit_reason(vcpu).basic) { case EXIT_REASON_MSR_WRITE: return handle_fastpath_set_msr_irqoff(vcpu); case EXIT_REASON_PREEMPTION_TIMER: @@ -7313,15 +7314,15 @@ static noinstr void vmx_vcpu_enter_exit(struct kvm_vcpu *vcpu, vmx_enable_fb_clear(vmx); if (unlikely(vmx->fail)) { - vmx->exit_reason.full = 0xdead; + vmx->vt.exit_reason.full = 0xdead; goto out; } - vmx->exit_reason.full = vmcs_read32(VM_EXIT_REASON); - if (likely(!vmx->exit_reason.failed_vmentry)) + vmx->vt.exit_reason.full = vmcs_read32(VM_EXIT_REASON); + if (likely(!vmx_get_exit_reason(vcpu).failed_vmentry)) vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD); - if ((u16)vmx->exit_reason.basic == EXIT_REASON_EXCEPTION_NMI && + if ((u16)vmx_get_exit_reason(vcpu).basic == EXIT_REASON_EXCEPTION_NMI && is_nmi(vmx_get_intr_info(vcpu))) { kvm_before_interrupt(vcpu, KVM_HANDLING_NMI); if (cpu_feature_enabled(X86_FEATURE_FRED)) @@ -7353,12 +7354,12 @@ fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu, bool force_immediate_exit) if (unlikely(vmx->emulation_required)) { vmx->fail = 0; - vmx->exit_reason.full = EXIT_REASON_INVALID_STATE; - vmx->exit_reason.failed_vmentry = 1; + vmx->vt.exit_reason.full = EXIT_REASON_INVALID_STATE; + vmx->vt.exit_reason.failed_vmentry = 1; kvm_register_mark_available(vcpu, VCPU_EXREG_EXIT_INFO_1); - vmx->exit_qualification = ENTRY_FAIL_DEFAULT; + vmx->vt.exit_qualification = ENTRY_FAIL_DEFAULT; kvm_register_mark_available(vcpu, VCPU_EXREG_EXIT_INFO_2); - vmx->exit_intr_info = 0; + vmx->vt.exit_intr_info = 0; return EXIT_FASTPATH_NONE; } @@ -7461,7 +7462,7 @@ fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu, bool force_immediate_exit) * checking. 
*/ if (vmx->nested.nested_run_pending && - !vmx->exit_reason.failed_vmentry) + !vmx_get_exit_reason(vcpu).failed_vmentry) ++vcpu->stat.nested_run; vmx->nested.nested_run_pending = 0; @@ -7470,12 +7471,12 @@ fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu, bool force_immediate_exit) if (unlikely(vmx->fail)) return EXIT_FASTPATH_NONE; - if (unlikely((u16)vmx->exit_reason.basic == EXIT_REASON_MCE_DURING_VMENTRY)) + if (unlikely((u16)vmx_get_exit_reason(vcpu).basic == EXIT_REASON_MCE_DURING_VMENTRY)) kvm_machine_check(); trace_kvm_exit(vcpu, KVM_ISA_VMX); - if (unlikely(vmx->exit_reason.failed_vmentry)) + if (unlikely(vmx_get_exit_reason(vcpu).failed_vmentry)) return EXIT_FASTPATH_NONE; vmx->loaded_vmcs->launched = 1; @@ -7507,7 +7508,7 @@ int vmx_vcpu_create(struct kvm_vcpu *vcpu) BUILD_BUG_ON(offsetof(struct vcpu_vmx, vcpu) != 0); vmx = to_vmx(vcpu); - INIT_LIST_HEAD(&vmx->pi_wakeup_list); + INIT_LIST_HEAD(&vmx->vt.pi_wakeup_list); err = -ENOMEM; @@ -7605,7 +7606,7 @@ int vmx_vcpu_create(struct kvm_vcpu *vcpu) if (vmx_can_use_ipiv(vcpu)) WRITE_ONCE(to_kvm_vmx(vcpu->kvm)->pid_table[vcpu->vcpu_id], - __pa(&vmx->pi_desc) | PID_TABLE_ENTRY_VALID); + __pa(&vmx->vt.pi_desc) | PID_TABLE_ENTRY_VALID); return 0; diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h index a4ee44ffaede..e635199901e2 100644 --- a/arch/x86/kvm/vmx/vmx.h +++ b/arch/x86/kvm/vmx/vmx.h @@ -17,6 +17,7 @@ #include "../cpuid.h" #include "run_flags.h" #include "../mmu.h" +#include "common.h" #define X2APIC_MSR(r) (APIC_BASE_MSR + ((r) >> 4)) @@ -68,29 +69,6 @@ struct pt_desc { struct pt_ctx guest; }; -union vmx_exit_reason { - struct { - u32 basic : 16; - u32 reserved16 : 1; - u32 reserved17 : 1; - u32 reserved18 : 1; - u32 reserved19 : 1; - u32 reserved20 : 1; - u32 reserved21 : 1; - u32 reserved22 : 1; - u32 reserved23 : 1; - u32 reserved24 : 1; - u32 reserved25 : 1; - u32 bus_lock_detected : 1; - u32 enclave_mode : 1; - u32 smi_pending_mtf : 1; - u32 smi_from_vmx_root : 1; - u32 reserved30 : 1; - u32 failed_vmentry : 1; - }; - u32 full; -}; - /* * The nested_vmx structure is part of vcpu_vmx, and holds information we need * for correct emulation of VMX (i.e., nested VMX) on this vcpu. @@ -231,20 +209,10 @@ struct nested_vmx { struct vcpu_vmx { struct kvm_vcpu vcpu; + struct vcpu_vt vt; u8 fail; u8 x2apic_msr_bitmap_mode; - /* - * If true, host state has been stored in vmx->loaded_vmcs for - * the CPU registers that only need to be switched when transitioning - * to/from the kernel, and the registers have been loaded with guest - * values. If false, host state is loaded in the CPU registers - * and vmx->loaded_vmcs->host_state is invalid. - */ - bool guest_state_loaded; - - unsigned long exit_qualification; - u32 exit_intr_info; u32 idt_vectoring_info; ulong rflags; @@ -257,7 +225,6 @@ struct vcpu_vmx { struct vmx_uret_msr guest_uret_msrs[MAX_NR_USER_RETURN_MSRS]; bool guest_uret_msrs_loaded; #ifdef CONFIG_X86_64 - u64 msr_host_kernel_gs_base; u64 msr_guest_kernel_gs_base; #endif @@ -298,14 +265,6 @@ struct vcpu_vmx { int vpid; bool emulation_required; - union vmx_exit_reason exit_reason; - - /* Posted interrupt descriptor */ - struct pi_desc pi_desc; - - /* Used if this vCPU is waiting for PI notification wakeup. 
*/ - struct list_head pi_wakeup_list; - /* Support for a guest hypervisor (nested VMX) */ struct nested_vmx nested; @@ -359,6 +318,43 @@ struct kvm_vmx { u64 *pid_table; }; +static __always_inline struct vcpu_vt *to_vt(struct kvm_vcpu *vcpu) +{ + return &(container_of(vcpu, struct vcpu_vmx, vcpu)->vt); +} + +static __always_inline struct kvm_vcpu *vt_to_vcpu(struct vcpu_vt *vt) +{ + return &(container_of(vt, struct vcpu_vmx, vt)->vcpu); +} + +static __always_inline union vmx_exit_reason vmx_get_exit_reason(struct kvm_vcpu *vcpu) +{ + return to_vt(vcpu)->exit_reason; +} + +static __always_inline unsigned long vmx_get_exit_qual(struct kvm_vcpu *vcpu) +{ + struct vcpu_vt *vt = to_vt(vcpu); + + if (!kvm_register_test_and_mark_available(vcpu, VCPU_EXREG_EXIT_INFO_1) && + !WARN_ON_ONCE(is_td_vcpu(vcpu))) + vt->exit_qualification = vmcs_readl(EXIT_QUALIFICATION); + + return vt->exit_qualification; +} + +static __always_inline u32 vmx_get_intr_info(struct kvm_vcpu *vcpu) +{ + struct vcpu_vt *vt = to_vt(vcpu); + + if (!kvm_register_test_and_mark_available(vcpu, VCPU_EXREG_EXIT_INFO_2) && + !WARN_ON_ONCE(is_td_vcpu(vcpu))) + vt->exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO); + + return vt->exit_intr_info; +} + void vmx_vcpu_load_vmcs(struct kvm_vcpu *vcpu, int cpu, struct loaded_vmcs *buddy); int allocate_vpid(void); @@ -649,26 +645,6 @@ void intel_pmu_cross_mapped_check(struct kvm_pmu *pmu); int intel_pmu_create_guest_lbr_event(struct kvm_vcpu *vcpu); void vmx_passthrough_lbr_msrs(struct kvm_vcpu *vcpu); -static __always_inline unsigned long vmx_get_exit_qual(struct kvm_vcpu *vcpu) -{ - struct vcpu_vmx *vmx = to_vmx(vcpu); - - if (!kvm_register_test_and_mark_available(vcpu, VCPU_EXREG_EXIT_INFO_1)) - vmx->exit_qualification = vmcs_readl(EXIT_QUALIFICATION); - - return vmx->exit_qualification; -} - -static __always_inline u32 vmx_get_intr_info(struct kvm_vcpu *vcpu) -{ - struct vcpu_vmx *vmx = to_vmx(vcpu); - - if (!kvm_register_test_and_mark_available(vcpu, VCPU_EXREG_EXIT_INFO_2)) - vmx->exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO); - - return vmx->exit_intr_info; -} - struct vmcs *alloc_vmcs_cpu(bool shadow, int cpu, gfp_t flags); void free_vmcs(struct vmcs *vmcs); int alloc_loaded_vmcs(struct loaded_vmcs *loaded_vmcs); -- 2.51.0 From 81bf912b2c15201ac96ee07042f804d02c0f69ee Mon Sep 17 00:00:00 2001 From: Isaku Yamahata Date: Wed, 29 Jan 2025 11:58:54 +0200 Subject: [PATCH 07/16] KVM: TDX: Implement TDX vcpu enter/exit path Implement callbacks to enter/exit a TDX VCPU by calling tdh_vp_enter(). Ensure the TDX VCPU is in a correct state to run. Do not pass arguments from/to vcpu->arch.regs[] unconditionally. Instead, marshall state to/from the appropriate x86 registers only when needed, i.e., to handle some TDVMCALL sub-leaves following KVM's ABI to leverage the existing code. 
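As a hypothetical sketch of that marshalling (the concrete TDVMCALL
handlers come in later patches), a sub-leaf handler following KVM's ABI
would copy only the registers it needs from the TDH.VP.ENTER output held
in the vp_enter_args field added below, e.g.:

	/* Propagate the sub-leaf's output GPRs to the guest register cache. */
	kvm_r10_write(vcpu, tdx->vp_enter_args.r10);
	kvm_r11_write(vcpu, tdx->vp_enter_args.r11);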
Signed-off-by: Isaku Yamahata
Signed-off-by: Adrian Hunter
Message-ID: <20250129095902.16391-6-adrian.hunter@intel.com>
Signed-off-by: Paolo Bonzini
---
 arch/x86/kvm/vmx/main.c    | 20 ++++++++++--
 arch/x86/kvm/vmx/tdx.c     | 62 ++++++++++++++++++++++++++++++++++++++
 arch/x86/kvm/vmx/tdx.h     |  3 ++
 arch/x86/kvm/vmx/x86_ops.h |  7 +++++
 4 files changed, 90 insertions(+), 2 deletions(-)

diff --git a/arch/x86/kvm/vmx/main.c b/arch/x86/kvm/vmx/main.c
index 36b6343a011e..037590fc05e9 100644
--- a/arch/x86/kvm/vmx/main.c
+++ b/arch/x86/kvm/vmx/main.c
@@ -145,6 +145,22 @@ static void vt_update_cpu_dirty_logging(struct kvm_vcpu *vcpu)
 	vmx_update_cpu_dirty_logging(vcpu);
 }
 
+static int vt_vcpu_pre_run(struct kvm_vcpu *vcpu)
+{
+	if (is_td_vcpu(vcpu))
+		return tdx_vcpu_pre_run(vcpu);
+
+	return vmx_vcpu_pre_run(vcpu);
+}
+
+static fastpath_t vt_vcpu_run(struct kvm_vcpu *vcpu, bool force_immediate_exit)
+{
+	if (is_td_vcpu(vcpu))
+		return tdx_vcpu_run(vcpu, force_immediate_exit);
+
+	return vmx_vcpu_run(vcpu, force_immediate_exit);
+}
+
 static void vt_flush_tlb_all(struct kvm_vcpu *vcpu)
 {
 	if (is_td_vcpu(vcpu)) {
@@ -285,8 +301,8 @@ struct kvm_x86_ops vt_x86_ops __initdata = {
 	.flush_tlb_gva = vt_flush_tlb_gva,
 	.flush_tlb_guest = vt_flush_tlb_guest,
 
-	.vcpu_pre_run = vmx_vcpu_pre_run,
-	.vcpu_run = vmx_vcpu_run,
+	.vcpu_pre_run = vt_vcpu_pre_run,
+	.vcpu_run = vt_vcpu_run,
 	.handle_exit = vmx_handle_exit,
 	.skip_emulated_instruction = vmx_skip_emulated_instruction,
 	.update_emulated_instruction = vmx_update_emulated_instruction,
diff --git a/arch/x86/kvm/vmx/tdx.c b/arch/x86/kvm/vmx/tdx.c
index 277f5decde9c..bd299606b2ba 100644
--- a/arch/x86/kvm/vmx/tdx.c
+++ b/arch/x86/kvm/vmx/tdx.c
@@ -12,6 +12,8 @@
 #include "vmx.h"
 #include "mmu/spte.h"
 #include "common.h"
+#include
+#include "trace.h"
 
 #pragma GCC poison to_vmx
 
@@ -660,6 +662,66 @@ void tdx_vcpu_free(struct kvm_vcpu *vcpu)
 	tdx->state = VCPU_TD_STATE_UNINITIALIZED;
 }
 
+int tdx_vcpu_pre_run(struct kvm_vcpu *vcpu)
+{
+	if (unlikely(to_tdx(vcpu)->state != VCPU_TD_STATE_INITIALIZED ||
+		     to_kvm_tdx(vcpu->kvm)->state != TD_STATE_RUNNABLE))
+		return -EINVAL;
+
+	return 1;
+}
+
+static noinstr void tdx_vcpu_enter_exit(struct kvm_vcpu *vcpu)
+{
+	struct vcpu_tdx *tdx = to_tdx(vcpu);
+
+	guest_state_enter_irqoff();
+
+	tdx->vp_enter_ret = tdh_vp_enter(&tdx->vp, &tdx->vp_enter_args);
+
+	guest_state_exit_irqoff();
+}
+
+#define TDX_REGS_AVAIL_SET	(BIT_ULL(VCPU_EXREG_EXIT_INFO_1) | \
+				 BIT_ULL(VCPU_EXREG_EXIT_INFO_2) | \
+				 BIT_ULL(VCPU_REGS_RAX) | \
+				 BIT_ULL(VCPU_REGS_RBX) | \
+				 BIT_ULL(VCPU_REGS_RCX) | \
+				 BIT_ULL(VCPU_REGS_RDX) | \
+				 BIT_ULL(VCPU_REGS_RBP) | \
+				 BIT_ULL(VCPU_REGS_RSI) | \
+				 BIT_ULL(VCPU_REGS_RDI) | \
+				 BIT_ULL(VCPU_REGS_R8) | \
+				 BIT_ULL(VCPU_REGS_R9) | \
+				 BIT_ULL(VCPU_REGS_R10) | \
+				 BIT_ULL(VCPU_REGS_R11) | \
+				 BIT_ULL(VCPU_REGS_R12) | \
+				 BIT_ULL(VCPU_REGS_R13) | \
+				 BIT_ULL(VCPU_REGS_R14) | \
+				 BIT_ULL(VCPU_REGS_R15))
+
+fastpath_t tdx_vcpu_run(struct kvm_vcpu *vcpu, bool force_immediate_exit)
+{
+	/*
+	 * force_immediate_exit requires entering the vCPU for event injection,
+	 * with an immediate exit to follow. But the TDX module doesn't
+	 * guarantee entry; it's already possible for KVM to _think_ it
+	 * completely entered the guest without actually having done so.
+	 * Since KVM never needs to force an immediate exit for TDX, and can't
+	 * do direct injection, just warn on force_immediate_exit.
+ */ + WARN_ON_ONCE(force_immediate_exit); + + trace_kvm_entry(vcpu, force_immediate_exit); + + tdx_vcpu_enter_exit(vcpu); + + vcpu->arch.regs_avail &= TDX_REGS_AVAIL_SET; + + trace_kvm_exit(vcpu, KVM_ISA_VMX); + + return EXIT_FASTPATH_NONE; +} void tdx_load_mmu_pgd(struct kvm_vcpu *vcpu, hpa_t root_hpa, int pgd_level) { diff --git a/arch/x86/kvm/vmx/tdx.h b/arch/x86/kvm/vmx/tdx.h index db6aa02f9c80..59a35e43b5bf 100644 --- a/arch/x86/kvm/vmx/tdx.h +++ b/arch/x86/kvm/vmx/tdx.h @@ -48,11 +48,14 @@ enum vcpu_tdx_state { struct vcpu_tdx { struct kvm_vcpu vcpu; struct vcpu_vt vt; + struct tdx_module_args vp_enter_args; struct tdx_vp vp; struct list_head cpu_list; + u64 vp_enter_ret; + enum vcpu_tdx_state state; }; diff --git a/arch/x86/kvm/vmx/x86_ops.h b/arch/x86/kvm/vmx/x86_ops.h index f47d739051cf..578c26d3aec4 100644 --- a/arch/x86/kvm/vmx/x86_ops.h +++ b/arch/x86/kvm/vmx/x86_ops.h @@ -131,6 +131,8 @@ int tdx_vm_ioctl(struct kvm *kvm, void __user *argp); int tdx_vcpu_create(struct kvm_vcpu *vcpu); void tdx_vcpu_free(struct kvm_vcpu *vcpu); void tdx_vcpu_load(struct kvm_vcpu *vcpu, int cpu); +int tdx_vcpu_pre_run(struct kvm_vcpu *vcpu); +fastpath_t tdx_vcpu_run(struct kvm_vcpu *vcpu, bool force_immediate_exit); int tdx_vcpu_ioctl(struct kvm_vcpu *vcpu, void __user *argp); @@ -157,6 +159,11 @@ static inline int tdx_vm_ioctl(struct kvm *kvm, void __user *argp) { return -EOP static inline int tdx_vcpu_create(struct kvm_vcpu *vcpu) { return -EOPNOTSUPP; } static inline void tdx_vcpu_free(struct kvm_vcpu *vcpu) {} static inline void tdx_vcpu_load(struct kvm_vcpu *vcpu, int cpu) {} +static inline int tdx_vcpu_pre_run(struct kvm_vcpu *vcpu) { return -EOPNOTSUPP; } +static inline fastpath_t tdx_vcpu_run(struct kvm_vcpu *vcpu, bool force_immediate_exit) +{ + return EXIT_FASTPATH_NONE; +} static inline int tdx_vcpu_ioctl(struct kvm_vcpu *vcpu, void __user *argp) { return -EOPNOTSUPP; } -- 2.51.0 From 81bf40d54cd56e33b4cd381cc719efaa53832468 Mon Sep 17 00:00:00 2001 From: Isaku Yamahata Date: Wed, 29 Jan 2025 11:58:55 +0200 Subject: [PATCH 08/16] KVM: TDX: vcpu_run: save/restore host state(host kernel gs) On entering/exiting TDX vcpu, preserved or clobbered CPU state is different from the VMX case. Add TDX hooks to save/restore host/guest CPU state. Save/restore kernel GS base MSR. 
Signed-off-by: Isaku Yamahata
Signed-off-by: Adrian Hunter
Reviewed-by: Paolo Bonzini
Message-ID: <20250129095902.16391-7-adrian.hunter@intel.com>
Reviewed-by: Xiaoyao Li
Signed-off-by: Paolo Bonzini
---
 arch/x86/kvm/vmx/main.c    | 24 +++++++++++++++++++++--
 arch/x86/kvm/vmx/tdx.c     | 40 ++++++++++++++++++++++++++++++++++++++
 arch/x86/kvm/vmx/x86_ops.h |  4 ++++
 3 files changed, 66 insertions(+), 2 deletions(-)

diff --git a/arch/x86/kvm/vmx/main.c b/arch/x86/kvm/vmx/main.c
index 037590fc05e9..c0497ed0c9be 100644
--- a/arch/x86/kvm/vmx/main.c
+++ b/arch/x86/kvm/vmx/main.c
@@ -145,6 +145,26 @@ static void vt_update_cpu_dirty_logging(struct kvm_vcpu *vcpu)
 	vmx_update_cpu_dirty_logging(vcpu);
 }
 
+static void vt_prepare_switch_to_guest(struct kvm_vcpu *vcpu)
+{
+	if (is_td_vcpu(vcpu)) {
+		tdx_prepare_switch_to_guest(vcpu);
+		return;
+	}
+
+	vmx_prepare_switch_to_guest(vcpu);
+}
+
+static void vt_vcpu_put(struct kvm_vcpu *vcpu)
+{
+	if (is_td_vcpu(vcpu)) {
+		tdx_vcpu_put(vcpu);
+		return;
+	}
+
+	vmx_vcpu_put(vcpu);
+}
+
 static int vt_vcpu_pre_run(struct kvm_vcpu *vcpu)
 {
 	if (is_td_vcpu(vcpu))
@@ -265,9 +285,9 @@ struct kvm_x86_ops vt_x86_ops __initdata = {
 	.vcpu_free = vt_vcpu_free,
 	.vcpu_reset = vt_vcpu_reset,
 
-	.prepare_switch_to_guest = vmx_prepare_switch_to_guest,
+	.prepare_switch_to_guest = vt_prepare_switch_to_guest,
 	.vcpu_load = vt_vcpu_load,
-	.vcpu_put = vmx_vcpu_put,
+	.vcpu_put = vt_vcpu_put,
 
 	.update_exception_bitmap = vmx_update_exception_bitmap,
 	.get_feature_msr = vmx_get_feature_msr,
diff --git a/arch/x86/kvm/vmx/tdx.c b/arch/x86/kvm/vmx/tdx.c
index bd299606b2ba..eac5fa794b56 100644
--- a/arch/x86/kvm/vmx/tdx.c
+++ b/arch/x86/kvm/vmx/tdx.c
@@ -3,6 +3,7 @@
 #include
 #include
 #include
+#include
 #include
 #include "capabilities.h"
 #include "mmu.h"
@@ -12,6 +13,7 @@
 #include "vmx.h"
 #include "mmu/spte.h"
 #include "common.h"
+#include "posted_intr.h"
 #include
 #include "trace.h"
 
@@ -629,6 +631,44 @@ void tdx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
 	local_irq_enable();
 }
 
+/*
+ * Compared to vmx_prepare_switch_to_guest(), there is not much to do
+ * as SEAMCALL/SEAMRET calls take care of most of save and restore.
+ */ +void tdx_prepare_switch_to_guest(struct kvm_vcpu *vcpu) +{ + struct vcpu_vt *vt = to_vt(vcpu); + + if (vt->guest_state_loaded) + return; + + if (likely(is_64bit_mm(current->mm))) + vt->msr_host_kernel_gs_base = current->thread.gsbase; + else + vt->msr_host_kernel_gs_base = read_msr(MSR_KERNEL_GS_BASE); + + vt->guest_state_loaded = true; +} + +static void tdx_prepare_switch_to_host(struct kvm_vcpu *vcpu) +{ + struct vcpu_vt *vt = to_vt(vcpu); + + if (!vt->guest_state_loaded) + return; + + ++vcpu->stat.host_state_reload; + wrmsrl(MSR_KERNEL_GS_BASE, vt->msr_host_kernel_gs_base); + + vt->guest_state_loaded = false; +} + +void tdx_vcpu_put(struct kvm_vcpu *vcpu) +{ + vmx_vcpu_pi_put(vcpu); + tdx_prepare_switch_to_host(vcpu); +} + void tdx_vcpu_free(struct kvm_vcpu *vcpu) { struct kvm_tdx *kvm_tdx = to_kvm_tdx(vcpu->kvm); diff --git a/arch/x86/kvm/vmx/x86_ops.h b/arch/x86/kvm/vmx/x86_ops.h index 578c26d3aec4..cd18e9b1e124 100644 --- a/arch/x86/kvm/vmx/x86_ops.h +++ b/arch/x86/kvm/vmx/x86_ops.h @@ -133,6 +133,8 @@ void tdx_vcpu_free(struct kvm_vcpu *vcpu); void tdx_vcpu_load(struct kvm_vcpu *vcpu, int cpu); int tdx_vcpu_pre_run(struct kvm_vcpu *vcpu); fastpath_t tdx_vcpu_run(struct kvm_vcpu *vcpu, bool force_immediate_exit); +void tdx_prepare_switch_to_guest(struct kvm_vcpu *vcpu); +void tdx_vcpu_put(struct kvm_vcpu *vcpu); int tdx_vcpu_ioctl(struct kvm_vcpu *vcpu, void __user *argp); @@ -164,6 +166,8 @@ static inline fastpath_t tdx_vcpu_run(struct kvm_vcpu *vcpu, bool force_immediat { return EXIT_FASTPATH_NONE; } +static inline void tdx_prepare_switch_to_guest(struct kvm_vcpu *vcpu) {} +static inline void tdx_vcpu_put(struct kvm_vcpu *vcpu) {} static inline int tdx_vcpu_ioctl(struct kvm_vcpu *vcpu, void __user *argp) { return -EOPNOTSUPP; } -- 2.51.0 From 6bfa6d8509ad71f26ceec1e2ab64efbe39c58c8d Mon Sep 17 00:00:00 2001 From: Isaku Yamahata Date: Wed, 29 Jan 2025 11:58:56 +0200 Subject: [PATCH 09/16] KVM: TDX: restore host xsave state when exit from the guest TD On exiting from the guest TD, xsave state is clobbered; restore it. Do not use kvm_load_host_xsave_state(), as it relies on vcpu->arch to find out whether other KVM_RUN code has loaded guest state into XCR0/PKRU/XSS or not. In the case of TDX, the exit values are known independent of the guest CR0 and CR4, and in fact the latter are not available. Signed-off-by: Isaku Yamahata Signed-off-by: Adrian Hunter Message-ID: <20250129095902.16391-8-adrian.hunter@intel.com> [Rewrite to not use kvm_load_host_xsave_state. - Paolo] Reviewed-by: Xiayao Li Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/tdx.c | 32 ++++++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) diff --git a/arch/x86/kvm/vmx/tdx.c b/arch/x86/kvm/vmx/tdx.c index eac5fa794b56..9a08a82da804 100644 --- a/arch/x86/kvm/vmx/tdx.c +++ b/arch/x86/kvm/vmx/tdx.c @@ -2,6 +2,7 @@ #include #include #include +#include #include #include #include @@ -740,6 +741,30 @@ static noinstr void tdx_vcpu_enter_exit(struct kvm_vcpu *vcpu) BIT_ULL(VCPU_REGS_R14) | \ BIT_ULL(VCPU_REGS_R15)) +static void tdx_load_host_xsave_state(struct kvm_vcpu *vcpu) +{ + struct kvm_tdx *kvm_tdx = to_kvm_tdx(vcpu->kvm); + + /* + * All TDX hosts support PKRU; but even if they didn't, + * vcpu->arch.host_pkru would be 0 and the wrpkru would be + * skipped. 
+ */ + if (vcpu->arch.host_pkru != 0) + wrpkru(vcpu->arch.host_pkru); + + if (kvm_host.xcr0 != (kvm_tdx->xfam & kvm_caps.supported_xcr0)) + xsetbv(XCR_XFEATURE_ENABLED_MASK, kvm_host.xcr0); + + /* + * Likewise, even if a TDX host didn't support XSS, both arms of + * the comparison would be 0 and the wrmsrl would be skipped. + */ + if (kvm_host.xss != (kvm_tdx->xfam & kvm_caps.supported_xss)) + wrmsrl(MSR_IA32_XSS, kvm_host.xss); +} +EXPORT_SYMBOL_GPL(kvm_load_host_xsave_state); + fastpath_t tdx_vcpu_run(struct kvm_vcpu *vcpu, bool force_immediate_exit) { /* @@ -756,6 +781,8 @@ fastpath_t tdx_vcpu_run(struct kvm_vcpu *vcpu, bool force_immediate_exit) tdx_vcpu_enter_exit(vcpu); + tdx_load_host_xsave_state(vcpu); + vcpu->arch.regs_avail &= TDX_REGS_AVAIL_SET; trace_kvm_exit(vcpu, KVM_ISA_VMX); @@ -2332,6 +2359,11 @@ int __init tdx_bringup(void) goto success_disable_tdx; } + if (!cpu_feature_enabled(X86_FEATURE_OSXSAVE)) { + pr_err("tdx: OSXSAVE is required for TDX\n"); + goto success_disable_tdx; + } + if (!cpu_feature_enabled(X86_FEATURE_MOVDIR64B)) { pr_err("tdx: MOVDIR64B is required for TDX\n"); goto success_disable_tdx; -- 2.51.0 From d3a6b6cfb82a9158ee624bf3d43e6ebd8fe05fc8 Mon Sep 17 00:00:00 2001 From: Chao Gao Date: Wed, 29 Jan 2025 11:58:57 +0200 Subject: [PATCH 10/16] KVM: x86: Allow to update cached values in kvm_user_return_msrs w/o wrmsr Several MSRs are constant and only used in userspace (ring 3), but VMs may have different values. KVM uses kvm_set_user_return_msr() to switch to the guest's values and leverages the user return notifier to restore them when the kernel is to return to userspace. To eliminate unnecessary wrmsr, KVM also caches the value it wrote to an MSR last time. The TDX module unconditionally resets some of these MSRs to architectural INIT state on TD exit. This makes the cached values in kvm_user_return_msrs inconsistent with the values in hardware. This inconsistency needs to be fixed. Otherwise, it may mislead kvm_on_user_return() into skipping the restoration of some MSRs to the host's values. kvm_set_user_return_msr() can help correct this case, but it is not optimal as it always does a wrmsr. So, introduce a variation of kvm_set_user_return_msr() to update cached values and skip that wrmsr.
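To illustrate the intended use of the new helper (a hypothetical caller for illustration only; the actual TDX user is added later in this series), the cache-only variant fits the case where hardware has already changed an MSR behind KVM's back:

/*
 * Illustrative sketch, not part of this patch: hardware (e.g. the TDX
 * module) has already reset the MSR to 'hw_value', so only KVM's cached
 * value needs correcting and the wrmsr can be skipped.
 */
static void fixup_clobbered_user_return_msr(u32 msr, u64 hw_value)
{
	int slot = kvm_find_user_return_msr(msr);

	if (WARN_ON_ONCE(slot < 0))
		return;

	/* Update the cached value and register the user return notifier. */
	kvm_user_return_msr_update_cache(slot, hw_value);
}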
Signed-off-by: Chao Gao Signed-off-by: Isaku Yamahata Signed-off-by: Adrian Hunter Reviewed-by: Paolo Bonzini Message-ID: <20250129095902.16391-9-adrian.hunter@intel.com> Reviewed-by: Xiaoyao Li Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 1 + arch/x86/kvm/x86.c | 24 +++++++++++++++++++----- 2 files changed, 20 insertions(+), 5 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index ddb08581c64d..0c8453fa9f9e 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -2328,6 +2328,7 @@ int kvm_pv_send_ipi(struct kvm *kvm, unsigned long ipi_bitmap_low, int kvm_add_user_return_msr(u32 msr); int kvm_find_user_return_msr(u32 msr); int kvm_set_user_return_msr(unsigned index, u64 val, u64 mask); +void kvm_user_return_msr_update_cache(unsigned int index, u64 val); static inline bool kvm_is_supported_user_return_msr(u32 msr) { diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 8f3292ccdca4..1133bec79a82 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -636,6 +636,15 @@ static void kvm_user_return_msr_cpu_online(void) } } +static void kvm_user_return_register_notifier(struct kvm_user_return_msrs *msrs) +{ + if (!msrs->registered) { + msrs->urn.on_user_return = kvm_on_user_return; + user_return_notifier_register(&msrs->urn); + msrs->registered = true; + } +} + int kvm_set_user_return_msr(unsigned slot, u64 value, u64 mask) { struct kvm_user_return_msrs *msrs = this_cpu_ptr(user_return_msrs); @@ -649,15 +658,20 @@ int kvm_set_user_return_msr(unsigned slot, u64 value, u64 mask) return 1; msrs->values[slot].curr = value; - if (!msrs->registered) { - msrs->urn.on_user_return = kvm_on_user_return; - user_return_notifier_register(&msrs->urn); - msrs->registered = true; - } + kvm_user_return_register_notifier(msrs); return 0; } EXPORT_SYMBOL_GPL(kvm_set_user_return_msr); +void kvm_user_return_msr_update_cache(unsigned int slot, u64 value) +{ + struct kvm_user_return_msrs *msrs = this_cpu_ptr(user_return_msrs); + + msrs->values[slot].curr = value; + kvm_user_return_register_notifier(msrs); +} +EXPORT_SYMBOL_GPL(kvm_user_return_msr_update_cache); + static void drop_user_return_notifiers(void) { struct kvm_user_return_msrs *msrs = this_cpu_ptr(user_return_msrs); -- 2.51.0 From e0b4f31a3c65af66ad39bf686ce5602e49c905ae Mon Sep 17 00:00:00 2001 From: Isaku Yamahata Date: Wed, 29 Jan 2025 11:58:58 +0200 Subject: [PATCH 11/16] KVM: TDX: restore user ret MSRs Several MSRs that are not used by Linux while in ring 0 are clobbered on TD exit. Ensure the cached values of these MSRs are updated on vcpu_put, and that the MSRs themselves are restored before returning to ring 3.
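For context, the user return notifier that this depends on only rewrites MSRs whose cached value differs from the host value, roughly as follows (a simplified sketch of the existing kvm_on_user_return() logic, with notifier unregistration and IRQ handling omitted):

/* Simplified sketch of existing behavior, not code added by this patch. */
static void on_user_return_sketch(struct kvm_user_return_msrs *msrs)
{
	unsigned int slot;

	for (slot = 0; slot < kvm_nr_uret_msrs; slot++) {
		struct kvm_user_return_msr_values *values = &msrs->values[slot];

		/*
		 * A stale ->curr that still matches ->host would wrongly
		 * skip this wrmsrl, which is why the cache must be
		 * refreshed after TD exit.
		 */
		if (values->curr != values->host)
			wrmsrl(kvm_uret_msrs_list[slot], values->host);
	}
}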
Co-developed-by: Tony Lindgren Signed-off-by: Tony Lindgren Signed-off-by: Isaku Yamahata Signed-off-by: Adrian Hunter Reviewed-by: Paolo Bonzini Message-ID: <20250129095902.16391-10-adrian.hunter@intel.com> Reviewed-by: Xiaoyao Li Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/tdx.c | 51 +++++++++++++++++++++++++++++++++++++++++- arch/x86/kvm/vmx/tdx.h | 1 + 2 files changed, 51 insertions(+), 1 deletion(-) diff --git a/arch/x86/kvm/vmx/tdx.c b/arch/x86/kvm/vmx/tdx.c index 9a08a82da804..b942678677cc 100644 --- a/arch/x86/kvm/vmx/tdx.c +++ b/arch/x86/kvm/vmx/tdx.c @@ -651,9 +651,32 @@ void tdx_prepare_switch_to_guest(struct kvm_vcpu *vcpu) vt->guest_state_loaded = true; } +struct tdx_uret_msr { + u32 msr; + unsigned int slot; + u64 defval; +}; + +static struct tdx_uret_msr tdx_uret_msrs[] = { + {.msr = MSR_SYSCALL_MASK, .defval = 0x20200 }, + {.msr = MSR_STAR,}, + {.msr = MSR_LSTAR,}, + {.msr = MSR_TSC_AUX,}, +}; + +static void tdx_user_return_msr_update_cache(void) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(tdx_uret_msrs); i++) + kvm_user_return_msr_update_cache(tdx_uret_msrs[i].slot, + tdx_uret_msrs[i].defval); +} + static void tdx_prepare_switch_to_host(struct kvm_vcpu *vcpu) { struct vcpu_vt *vt = to_vt(vcpu); + struct vcpu_tdx *tdx = to_tdx(vcpu); if (!vt->guest_state_loaded) return; @@ -661,6 +684,11 @@ static void tdx_prepare_switch_to_host(struct kvm_vcpu *vcpu) ++vcpu->stat.host_state_reload; wrmsrl(MSR_KERNEL_GS_BASE, vt->msr_host_kernel_gs_base); + if (tdx->guest_entered) { + tdx_user_return_msr_update_cache(); + tdx->guest_entered = false; + } + vt->guest_state_loaded = false; } @@ -767,6 +795,8 @@ EXPORT_SYMBOL_GPL(kvm_load_host_xsave_state); fastpath_t tdx_vcpu_run(struct kvm_vcpu *vcpu, bool force_immediate_exit) { + struct vcpu_tdx *tdx = to_tdx(vcpu); + /* * force_immediate_exit requires vCPU entering for events injection with * an immediately exit followed. But The TDX module doesn't guarantee @@ -782,6 +812,7 @@ fastpath_t tdx_vcpu_run(struct kvm_vcpu *vcpu, bool force_immediate_exit) tdx_vcpu_enter_exit(vcpu); tdx_load_host_xsave_state(vcpu); + tdx->guest_entered = true; vcpu->arch.regs_avail &= TDX_REGS_AVAIL_SET; @@ -2242,7 +2273,25 @@ static int __init __do_tdx_bringup(void) static int __init __tdx_bringup(void) { const struct tdx_sys_info_td_conf *td_conf; - int r; + int r, i; + + for (i = 0; i < ARRAY_SIZE(tdx_uret_msrs); i++) { + /* + * Check if MSRs (tdx_uret_msrs) can be saved/restored + * before returning to user space. + * + * this_cpu_ptr(user_return_msrs)->registered isn't checked + * because the registration is done at vcpu runtime by + * tdx_user_return_msr_update_cache().
+ */ + tdx_uret_msrs[i].slot = kvm_find_user_return_msr(tdx_uret_msrs[i].msr); + if (tdx_uret_msrs[i].slot == -1) { + /* If any MSR isn't supported, it is a KVM bug */ + pr_err("MSR %x isn't included by kvm_find_user_return_msr\n", + tdx_uret_msrs[i].msr); + return -EIO; + } + } /* * Enabling TDX requires enabling hardware virtualization first, diff --git a/arch/x86/kvm/vmx/tdx.h b/arch/x86/kvm/vmx/tdx.h index 59a35e43b5bf..0a54d286e380 100644 --- a/arch/x86/kvm/vmx/tdx.h +++ b/arch/x86/kvm/vmx/tdx.h @@ -57,6 +57,7 @@ struct vcpu_tdx { u64 vp_enter_ret; enum vcpu_tdx_state state; + bool guest_entered; }; void tdh_vp_rd_failed(struct vcpu_tdx *tdx, char *uclass, u32 field, u64 err); -- 2.51.0 From 6d415778f1069b52344f7cb09ae7b623579486c9 Mon Sep 17 00:00:00 2001 From: Adrian Hunter Date: Wed, 29 Jan 2025 11:58:59 +0200 Subject: [PATCH 12/16] KVM: TDX: Disable support for TSX and WAITPKG Support for restoring IA32_TSX_CTRL MSR and IA32_UMWAIT_CONTROL MSR is not yet implemented, so disable support for TSX and WAITPKG for now. Clear the associated CPUID bits returned by KVM_TDX_CAPABILITIES, and return an error if those bits are set in KVM_TDX_INIT_VM. Signed-off-by: Adrian Hunter Message-ID: <20250129095902.16391-11-adrian.hunter@intel.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/tdx.c | 43 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 43 insertions(+) diff --git a/arch/x86/kvm/vmx/tdx.c b/arch/x86/kvm/vmx/tdx.c index b942678677cc..06decfe9eb5e 100644 --- a/arch/x86/kvm/vmx/tdx.c +++ b/arch/x86/kvm/vmx/tdx.c @@ -107,6 +107,44 @@ static u32 tdx_set_guest_phys_addr_bits(const u32 eax, int addr_bits) return (eax & ~GENMASK(23, 16)) | (addr_bits & 0xff) << 16; } +#define TDX_FEATURE_TSX (__feature_bit(X86_FEATURE_HLE) | __feature_bit(X86_FEATURE_RTM)) + +static bool has_tsx(const struct kvm_cpuid_entry2 *entry) +{ + return entry->function == 7 && entry->index == 0 && + (entry->ebx & TDX_FEATURE_TSX); +} + +static void clear_tsx(struct kvm_cpuid_entry2 *entry) +{ + entry->ebx &= ~TDX_FEATURE_TSX; +} + +static bool has_waitpkg(const struct kvm_cpuid_entry2 *entry) +{ + return entry->function == 7 && entry->index == 0 && + (entry->ecx & __feature_bit(X86_FEATURE_WAITPKG)); +} + +static void clear_waitpkg(struct kvm_cpuid_entry2 *entry) +{ + entry->ecx &= ~__feature_bit(X86_FEATURE_WAITPKG); +} + +static void tdx_clear_unsupported_cpuid(struct kvm_cpuid_entry2 *entry) +{ + if (has_tsx(entry)) + clear_tsx(entry); + + if (has_waitpkg(entry)) + clear_waitpkg(entry); +} + +static bool tdx_unsupported_cpuid(const struct kvm_cpuid_entry2 *entry) +{ + return has_tsx(entry) || has_waitpkg(entry); +} + #define KVM_TDX_CPUID_NO_SUBLEAF ((__u32)-1) static void td_init_cpuid_entry2(struct kvm_cpuid_entry2 *entry, unsigned char idx) @@ -130,6 +168,8 @@ static void td_init_cpuid_entry2(struct kvm_cpuid_entry2 *entry, unsigned char i */ if (entry->function == 0x80000008) entry->eax = tdx_set_guest_phys_addr_bits(entry->eax, 0xff); + + tdx_clear_unsupported_cpuid(entry); } static int init_kvm_tdx_caps(const struct tdx_sys_info_td_conf *td_conf, @@ -1244,6 +1284,9 @@ static int setup_tdparams_cpuids(struct kvm_cpuid2 *cpuid, if (!entry) continue; + if (tdx_unsupported_cpuid(entry)) + return -EINVAL; + copy_cnt++; value = &td_params->cpuid_values[i]; -- 2.51.0 From 8af099037527c54399d44c4fd30eab15931700ce Mon Sep 17 00:00:00 2001 From: Adrian Hunter Date: Wed, 29 Jan 2025 11:59:00 +0200 Subject: [PATCH 13/16] KVM: TDX: Save and restore IA32_DEBUGCTL Save the IA32_DEBUGCTL MSR before entering a 
TDX VCPU and restore it afterwards. The TDX Module preserves bits 1, 12, and 14, so if no other bits are set, no restore is done. Signed-off-by: Adrian Hunter Message-ID: <20250129095902.16391-12-adrian.hunter@intel.com> Reviewed-by: Xiaoyao Li Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/tdx.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/arch/x86/kvm/vmx/tdx.c b/arch/x86/kvm/vmx/tdx.c index 06decfe9eb5e..7ad680b5f771 100644 --- a/arch/x86/kvm/vmx/tdx.c +++ b/arch/x86/kvm/vmx/tdx.c @@ -688,6 +688,8 @@ void tdx_prepare_switch_to_guest(struct kvm_vcpu *vcpu) else vt->msr_host_kernel_gs_base = read_msr(MSR_KERNEL_GS_BASE); + vt->host_debugctlmsr = get_debugctlmsr(); + vt->guest_state_loaded = true; } @@ -831,11 +833,15 @@ static void tdx_load_host_xsave_state(struct kvm_vcpu *vcpu) if (kvm_host.xss != (kvm_tdx->xfam & kvm_caps.supported_xss)) wrmsrl(MSR_IA32_XSS, kvm_host.xss); } -EXPORT_SYMBOL_GPL(kvm_load_host_xsave_state); + +#define TDX_DEBUGCTL_PRESERVED (DEBUGCTLMSR_BTF | \ + DEBUGCTLMSR_FREEZE_PERFMON_ON_PMI | \ + DEBUGCTLMSR_FREEZE_IN_SMM) fastpath_t tdx_vcpu_run(struct kvm_vcpu *vcpu, bool force_immediate_exit) { struct vcpu_tdx *tdx = to_tdx(vcpu); + struct vcpu_vt *vt = to_vt(vcpu); /* * force_immediate_exit requires vCPU entering for events injection with @@ -851,6 +857,9 @@ fastpath_t tdx_vcpu_run(struct kvm_vcpu *vcpu, bool force_immediate_exit) tdx_vcpu_enter_exit(vcpu); + if (vt->host_debugctlmsr & ~TDX_DEBUGCTL_PRESERVED) + update_debugctlmsr(vt->host_debugctlmsr); + tdx_load_host_xsave_state(vcpu); tdx->guest_entered = true; -- 2.51.0 From 484612f1a7d7386cec8fc9a3fa9dd45dd39e8b6d Mon Sep 17 00:00:00 2001 From: Isaku Yamahata Date: Wed, 29 Jan 2025 11:59:01 +0200 Subject: [PATCH 14/16] KVM: x86: Add a switch_db_regs flag to handle TDX's auto-switched behavior Add a flag KVM_DEBUGREG_AUTO_SWITCH to skip saving/restoring guest DRs. TDX-SEAM unconditionally saves/restores guest DRs on TD exit/enter, and resets DRs to architectural INIT state on TD exit. Use the new flag KVM_DEBUGREG_AUTO_SWITCH to indicate that KVM doesn't need to save/restore guest DRs. KVM still needs to restore host DRs after TD exit if there are active breakpoints in the host, which is covered by the existing code. MOV-DR exiting is always cleared for TDX guests, so the handler for DR access is never called, and KVM_DEBUGREG_WONT_EXIT is never set. Add a warning if both KVM_DEBUGREG_WONT_EXIT and KVM_DEBUGREG_AUTO_SWITCH are set. Opportunistically convert the KVM_DEBUGREG_* definitions to use BIT().
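The net effect on the entry path can be summarized by the following predicate (an illustrative restatement of the condition this patch adds to vcpu_enter_guest(), not additional code):

/* Illustrative restatement of the vcpu_enter_guest() check. */
static bool need_manual_debugreg_switch(struct kvm_vcpu *vcpu)
{
	/*
	 * Write guest DR0-3 and DR6 only if some switch flag is set and
	 * hardware does not already switch them automatically.
	 */
	return vcpu->arch.switch_db_regs &&
	       !(vcpu->arch.switch_db_regs & KVM_DEBUGREG_AUTO_SWITCH);
}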
Reported-by: Xiaoyao Li Signed-off-by: Sean Christopherson Co-developed-by: Chao Gao Signed-off-by: Chao Gao Signed-off-by: Isaku Yamahata [binbin: rework changelog] Signed-off-by: Binbin Wu Message-ID: <20241210004946.3718496-2-binbin.wu@linux.intel.com> Signed-off-by: Paolo Bonzini Message-ID: <20250129095902.16391-13-adrian.hunter@intel.com> Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 11 +++++++++-- arch/x86/kvm/vmx/tdx.c | 1 + arch/x86/kvm/x86.c | 4 +++- 3 files changed, 13 insertions(+), 3 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 0c8453fa9f9e..3a6373fc58a1 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -606,8 +606,15 @@ struct kvm_pmu { struct kvm_pmu_ops; enum { - KVM_DEBUGREG_BP_ENABLED = 1, - KVM_DEBUGREG_WONT_EXIT = 2, + KVM_DEBUGREG_BP_ENABLED = BIT(0), + KVM_DEBUGREG_WONT_EXIT = BIT(1), + /* + * Guest debug registers (DR0-3, DR6 and DR7) are saved/restored by + * hardware on exit from or enter to guest. KVM needn't switch them. + * DR0-3, DR6 and DR7 are set to their architectural INIT value on VM + * exit, host values need to be restored. + */ + KVM_DEBUGREG_AUTO_SWITCH = BIT(2), }; struct kvm_mtrr { diff --git a/arch/x86/kvm/vmx/tdx.c b/arch/x86/kvm/vmx/tdx.c index 7ad680b5f771..a6388eb95988 100644 --- a/arch/x86/kvm/vmx/tdx.c +++ b/arch/x86/kvm/vmx/tdx.c @@ -630,6 +630,7 @@ int tdx_vcpu_create(struct kvm_vcpu *vcpu) vcpu->arch.efer = EFER_SCE | EFER_LME | EFER_LMA | EFER_NX; + vcpu->arch.switch_db_regs = KVM_DEBUGREG_AUTO_SWITCH; vcpu->arch.cr0_guest_owned_bits = -1ul; vcpu->arch.cr4_guest_owned_bits = -1ul; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 1133bec79a82..1ef83f24449d 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -10985,7 +10985,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) if (vcpu->arch.guest_fpu.xfd_err) wrmsrl(MSR_IA32_XFD_ERR, vcpu->arch.guest_fpu.xfd_err); - if (unlikely(vcpu->arch.switch_db_regs)) { + if (unlikely(vcpu->arch.switch_db_regs && + !(vcpu->arch.switch_db_regs & KVM_DEBUGREG_AUTO_SWITCH))) { set_debugreg(0, 7); set_debugreg(vcpu->arch.eff_db[0], 0); set_debugreg(vcpu->arch.eff_db[1], 1); @@ -11037,6 +11038,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) */ if (unlikely(vcpu->arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT)) { WARN_ON(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP); + WARN_ON(vcpu->arch.switch_db_regs & KVM_DEBUGREG_AUTO_SWITCH); kvm_x86_call(sync_dirty_debug_regs)(vcpu); kvm_update_dr0123(vcpu); kvm_update_dr7(vcpu); -- 2.51.0 From 6162b37357d039614ee0e9ff53c000e4de081536 Mon Sep 17 00:00:00 2001 From: Binbin Wu Date: Sat, 22 Feb 2025 09:42:17 +0800 Subject: [PATCH 15/16] KVM: x86: Have ____kvm_emulate_hypercall() read the GPRs Have ____kvm_emulate_hypercall() read the GPRs instead of passing them in via the macro. When emulating KVM hypercalls via TDVMCALL, TDX will marshall registers of TDVMCALL ABI into KVM's x86 registers to match the definition of KVM hypercall ABI _before_ ____kvm_emulate_hypercall() gets called. Therefore, ____kvm_emulate_hypercall() can just read registers internally based on KVM hypercall ABI, and those registers can be removed from the __kvm_emulate_hypercall() macro. Also, op_64_bit can be determined inside ____kvm_emulate_hypercall(), remove it from the __kvm_emulate_hypercall() macro as well. No functional change intended. 
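For illustration, a TDVMCALL-style caller is expected to write the arguments into KVM's GPR cache before invoking the helper; the tdvmcall_*() accessors below are hypothetical placeholders, not an existing API:

/*
 * Hypothetical sketch, not part of this patch: marshal TDVMCALL arguments
 * into the KVM hypercall ABI registers so that ____kvm_emulate_hypercall()
 * can read them via kvm_*_read(). TDX guests issue hypercalls at CPL 0.
 */
static int sketch_emulate_tdvmcall_hypercall(struct kvm_vcpu *vcpu)
{
	kvm_rax_write(vcpu, tdvmcall_nr(vcpu));		/* hypercall number */
	kvm_rbx_write(vcpu, tdvmcall_a0(vcpu));
	kvm_rcx_write(vcpu, tdvmcall_a1(vcpu));
	kvm_rdx_write(vcpu, tdvmcall_a2(vcpu));
	kvm_rsi_write(vcpu, tdvmcall_a3(vcpu));

	return __kvm_emulate_hypercall(vcpu, 0, complete_hypercall_exit);
}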
Suggested-by: Sean Christopherson Signed-off-by: Binbin Wu Reviewed-by: Kai Huang Message-ID: <20250222014225.897298-2-binbin.wu@linux.intel.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 15 ++++++++------- arch/x86/kvm/x86.h | 26 +++++++++----------------- 2 files changed, 17 insertions(+), 24 deletions(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 1ef83f24449d..a792207a0dd1 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -10030,13 +10030,16 @@ static int complete_hypercall_exit(struct kvm_vcpu *vcpu) return kvm_skip_emulated_instruction(vcpu); } -int ____kvm_emulate_hypercall(struct kvm_vcpu *vcpu, unsigned long nr, - unsigned long a0, unsigned long a1, - unsigned long a2, unsigned long a3, - int op_64_bit, int cpl, +int ____kvm_emulate_hypercall(struct kvm_vcpu *vcpu, int cpl, int (*complete_hypercall)(struct kvm_vcpu *)) { unsigned long ret; + unsigned long nr = kvm_rax_read(vcpu); + unsigned long a0 = kvm_rbx_read(vcpu); + unsigned long a1 = kvm_rcx_read(vcpu); + unsigned long a2 = kvm_rdx_read(vcpu); + unsigned long a3 = kvm_rsi_read(vcpu); + int op_64_bit = is_64_bit_hypercall(vcpu); ++vcpu->stat.hypercalls; @@ -10139,9 +10142,7 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu) if (kvm_hv_hypercall_enabled(vcpu)) return kvm_hv_hypercall(vcpu); - return __kvm_emulate_hypercall(vcpu, rax, rbx, rcx, rdx, rsi, - is_64_bit_hypercall(vcpu), - kvm_x86_call(get_cpl)(vcpu), + return __kvm_emulate_hypercall(vcpu, kvm_x86_call(get_cpl)(vcpu), complete_hypercall_exit); } EXPORT_SYMBOL_GPL(kvm_emulate_hypercall); diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h index 91e50a513100..8b27f70c6321 100644 --- a/arch/x86/kvm/x86.h +++ b/arch/x86/kvm/x86.h @@ -621,25 +621,17 @@ static inline bool user_exit_on_hypercall(struct kvm *kvm, unsigned long hc_nr) return kvm->arch.hypercall_exit_enabled & BIT(hc_nr); } -int ____kvm_emulate_hypercall(struct kvm_vcpu *vcpu, unsigned long nr, - unsigned long a0, unsigned long a1, - unsigned long a2, unsigned long a3, - int op_64_bit, int cpl, +int ____kvm_emulate_hypercall(struct kvm_vcpu *vcpu, int cpl, int (*complete_hypercall)(struct kvm_vcpu *)); -#define __kvm_emulate_hypercall(_vcpu, nr, a0, a1, a2, a3, op_64_bit, cpl, complete_hypercall) \ -({ \ - int __ret; \ - \ - __ret = ____kvm_emulate_hypercall(_vcpu, \ - kvm_##nr##_read(_vcpu), kvm_##a0##_read(_vcpu), \ - kvm_##a1##_read(_vcpu), kvm_##a2##_read(_vcpu), \ - kvm_##a3##_read(_vcpu), op_64_bit, cpl, \ - complete_hypercall); \ - \ - if (__ret > 0) \ - __ret = complete_hypercall(_vcpu); \ - __ret; \ +#define __kvm_emulate_hypercall(_vcpu, cpl, complete_hypercall) \ +({ \ + int __ret; \ + __ret = ____kvm_emulate_hypercall(_vcpu, cpl, complete_hypercall); \ + \ + if (__ret > 0) \ + __ret = complete_hypercall(_vcpu); \ + __ret; \ }) int kvm_emulate_hypercall(struct kvm_vcpu *vcpu); -- 2.51.0 From 44428e4936022a7a31743017849b167e64f33a32 Mon Sep 17 00:00:00 2001 From: Binbin Wu Date: Sat, 22 Feb 2025 09:42:18 +0800 Subject: [PATCH 16/16] KVM: x86: Move pv_unhalted check out of kvm_vcpu_has_events() Move pv_unhalted check out of kvm_vcpu_has_events(), check pv_unhalted explicitly when handling PV unhalt and expose kvm_vcpu_has_events(). kvm_vcpu_has_events() returns true if pv_unhalted is set, and pv_unhalted is only cleared on transitions to KVM_MP_STATE_RUNNABLE. If the guest initiates a spurious wakeup, pv_unhalted could be left set in perpetuity. Currently, this is not problematic because kvm_vcpu_has_events() is only called when handling PV unhalt. 
However, if kvm_vcpu_has_events() is used for other purposes in the future, it could return unexpected results. Export kvm_vcpu_has_events() so that it can be used in broader contexts. Suggested-by: Sean Christopherson Signed-off-by: Binbin Wu Message-ID: <20250222014225.897298-3-binbin.wu@linux.intel.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 11 +++++------ include/linux/kvm_host.h | 1 + 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index a792207a0dd1..3cae210ffaa4 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -11143,7 +11143,7 @@ static bool kvm_vcpu_running(struct kvm_vcpu *vcpu) !vcpu->arch.apf.halted); } -static bool kvm_vcpu_has_events(struct kvm_vcpu *vcpu) +bool kvm_vcpu_has_events(struct kvm_vcpu *vcpu) { if (!list_empty_careful(&vcpu->async_pf.done)) return true; @@ -11152,9 +11152,6 @@ static bool kvm_vcpu_has_events(struct kvm_vcpu *vcpu) kvm_apic_init_sipi_allowed(vcpu)) return true; - if (vcpu->arch.pv.pv_unhalted) - return true; - if (kvm_is_exception_pending(vcpu)) return true; @@ -11192,10 +11189,12 @@ static bool kvm_vcpu_has_events(struct kvm_vcpu *vcpu) return false; } +EXPORT_SYMBOL_GPL(kvm_vcpu_has_events); int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu) { - return kvm_vcpu_running(vcpu) || kvm_vcpu_has_events(vcpu); + return kvm_vcpu_running(vcpu) || vcpu->arch.pv.pv_unhalted || + kvm_vcpu_has_events(vcpu); } /* Called within kvm->srcu read side. */ @@ -11331,7 +11330,7 @@ static int __kvm_emulate_halt(struct kvm_vcpu *vcpu, int state, int reason) */ ++vcpu->stat.halt_exits; if (lapic_in_kernel(vcpu)) { - if (kvm_vcpu_has_events(vcpu)) + if (kvm_vcpu_has_events(vcpu) || vcpu->arch.pv.pv_unhalted) vcpu->arch.pv.pv_unhalted = false; else vcpu->arch.mp_state = state; diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 3bfe3140f444..ed1968f6f841 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -1609,6 +1609,7 @@ void kvm_arch_disable_virtualization(void); int kvm_arch_enable_virtualization_cpu(void); void kvm_arch_disable_virtualization_cpu(void); #endif +bool kvm_vcpu_has_events(struct kvm_vcpu *vcpu); int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu); bool kvm_arch_vcpu_in_kernel(struct kvm_vcpu *vcpu); int kvm_arch_vcpu_should_kick(struct kvm_vcpu *vcpu); -- 2.51.0