From 5f3b30b2b0d92e5a5050959a0994b2f3c4e1e6d0 Mon Sep 17 00:00:00 2001 From: Isaku Yamahata Date: Sat, 12 Oct 2024 00:55:55 -0700 Subject: [PATCH 01/16] KVM: x86: Push down setting vcpu.arch.user_set_tsc Push down setting vcpu.arch.user_set_tsc to true from kvm_synchronize_tsc() to __kvm_synchronize_tsc() so that the two callers don't have to modify user_set_tsc directly as preparation. Later, prohibit changing TSC synchronization for TDX guests to modify __kvm_synchornize_tsc() change. We don't want to touch caller sites not to change user_set_tsc. Signed-off-by: Isaku Yamahata Message-ID: <62b1a7a35d6961844786b6e47e8ecb774af7a228.1728719037.git.isaku.yamahata@intel.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index bd2b71d4e64f..2da75bbf7f94 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -2626,12 +2626,15 @@ static inline bool kvm_check_tsc_unstable(void) * participates in. */ static void __kvm_synchronize_tsc(struct kvm_vcpu *vcpu, u64 offset, u64 tsc, - u64 ns, bool matched) + u64 ns, bool matched, bool user_set_tsc) { struct kvm *kvm = vcpu->kvm; lockdep_assert_held(&kvm->arch.tsc_write_lock); + if (user_set_tsc) + vcpu->kvm->arch.user_set_tsc = true; + /* * We also track th most recent recorded KHZ, write and time to * allow the matching interval to be extended at each write. @@ -2717,8 +2720,6 @@ static void kvm_synchronize_tsc(struct kvm_vcpu *vcpu, u64 *user_value) } } - if (user_value) - kvm->arch.user_set_tsc = true; /* * For a reliable TSC, we can match TSC offsets, and for an unstable @@ -2738,7 +2739,7 @@ static void kvm_synchronize_tsc(struct kvm_vcpu *vcpu, u64 *user_value) matched = true; } - __kvm_synchronize_tsc(vcpu, offset, data, ns, matched); + __kvm_synchronize_tsc(vcpu, offset, data, ns, matched, !!user_value); raw_spin_unlock_irqrestore(&kvm->arch.tsc_write_lock, flags); } @@ -5725,8 +5726,7 @@ static int kvm_arch_tsc_set_attr(struct kvm_vcpu *vcpu, tsc = kvm_scale_tsc(rdtsc(), vcpu->arch.l1_tsc_scaling_ratio) + offset; ns = get_kvmclock_base_ns(); - kvm->arch.user_set_tsc = true; - __kvm_synchronize_tsc(vcpu, offset, tsc, ns, matched); + __kvm_synchronize_tsc(vcpu, offset, tsc, ns, matched, true); raw_spin_unlock_irqrestore(&kvm->arch.tsc_write_lock, flags); r = 0; -- 2.51.0 From adafea1106004dba26ea12d3193f4f132b1f0065 Mon Sep 17 00:00:00 2001 From: Isaku Yamahata Date: Sat, 12 Oct 2024 00:55:56 -0700 Subject: [PATCH 02/16] KVM: x86: Add infrastructure for secure TSC Add guest_tsc_protected member to struct kvm_arch_vcpu and prohibit changing TSC offset/multiplier when guest_tsc_protected is true. X86 confidential computing technology defines protected guest TSC so that the VMM can't change the TSC offset/multiplier once vCPU is initialized. SEV-SNP defines Secure TSC as optional, whereas TDX mandates it. KVM has common logic on x86 that tries to guess or adjust TSC offset/multiplier for better guest TSC and TSC interrupt latency at KVM vCPU creation (kvm_arch_vcpu_postcreate()), vCPU migration over pCPU (kvm_arch_vcpu_load()), vCPU TSC device attributes (kvm_arch_tsc_set_attr()) and guest/host writing to TSC or TSC adjust MSR (kvm_set_msr_common()). The current x86 KVM implementation conflicts with protected TSC because the VMM can't change the TSC offset/multiplier. Because KVM emulates the TSC timer or the TSC deadline timer with the TSC offset/multiplier, the TSC timer interrupts is injected to the guest at the wrong time if the KVM TSC offset is different from what the TDX module determined. Originally this issue was found by cyclic test of rt-test [1] as the latency in TDX case is worse than VMX value + TDX SEAMCALL overhead. It turned out that the KVM TSC offset is different from what the TDX module determines. Disable or ignore the KVM logic to change/adjust the TSC offset/multiplier somehow, thus keeping the KVM TSC offset/multiplier the same as the value of the TDX module. Writes to MSR_IA32_TSC are also blocked as they amount to a change in the TSC offset. [1] https://git.kernel.org/pub/scm/utils/rt-tests/rt-tests.git Reported-by: Marcelo Tosatti Signed-off-by: Isaku Yamahata Message-ID: <3a7444aec08042fe205666864b6858910e86aa98.1728719037.git.isaku.yamahata@intel.com> Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 1 + arch/x86/kvm/x86.c | 11 +++++++++-- 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 32ae3aa50c7e..ee55d1f753e8 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1053,6 +1053,7 @@ struct kvm_vcpu_arch { /* Protected Guests */ bool guest_state_protected; + bool guest_tsc_protected; /* * Set when PDPTS were loaded directly by the userspace without diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 2da75bbf7f94..053547fc6f89 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -2569,6 +2569,9 @@ EXPORT_SYMBOL_GPL(kvm_calc_nested_tsc_multiplier); static void kvm_vcpu_write_tsc_offset(struct kvm_vcpu *vcpu, u64 l1_offset) { + if (vcpu->arch.guest_tsc_protected) + return; + trace_kvm_write_tsc_offset(vcpu->vcpu_id, vcpu->arch.l1_tsc_offset, l1_offset); @@ -2632,6 +2635,9 @@ static void __kvm_synchronize_tsc(struct kvm_vcpu *vcpu, u64 offset, u64 tsc, lockdep_assert_held(&kvm->arch.tsc_write_lock); + if (vcpu->arch.guest_tsc_protected) + return; + if (user_set_tsc) vcpu->kvm->arch.user_set_tsc = true; @@ -3907,7 +3913,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) case MSR_IA32_TSC: if (msr_info->host_initiated) { kvm_synchronize_tsc(vcpu, &data); - } else { + } else if (!vcpu->arch.guest_tsc_protected) { u64 adj = kvm_compute_l1_tsc_offset(vcpu, data) - vcpu->arch.l1_tsc_offset; adjust_tsc_offset_guest(vcpu, adj); vcpu->arch.ia32_tsc_adjust_msr += adj; @@ -4987,7 +4993,8 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) u64 offset = kvm_compute_l1_tsc_offset(vcpu, vcpu->arch.last_guest_tsc); kvm_vcpu_write_tsc_offset(vcpu, offset); - vcpu->arch.tsc_catchup = 1; + if (!vcpu->arch.guest_tsc_protected) + vcpu->arch.tsc_catchup = 1; } if (kvm_lapic_hv_timer_in_use(vcpu)) -- 2.51.0 From 74c1807f6c4feddb3c3cb1056c54531d4adbaea6 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Thu, 6 Mar 2025 21:29:22 +0100 Subject: [PATCH 03/16] KVM: x86: block KVM_CAP_SYNC_REGS if guest state is protected KVM_CAP_SYNC_REGS does not make sense for VMs with protected guest state, since the register values cannot actually be written. Return 0 when using the VM-level KVM_CHECK_EXTENSION ioctl, and accordingly return -EINVAL from KVM_RUN if the valid/dirty fields are nonzero. However, on exit from KVM_RUN userspace could have placed a nonzero value into kvm_run->kvm_valid_regs, so check guest_state_protected again and skip store_regs() in that case. Cc: stable@vger.kernel.org Fixes: 517987e3fb19 ("KVM: x86: add fields to struct kvm_arch for CoCo features") Signed-off-by: Paolo Bonzini Message-ID: <20250306202923.646075-1-pbonzini@redhat.com> Reviewed-by: Pankaj Gupta Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 053547fc6f89..eceb97734646 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -4580,6 +4580,11 @@ static bool kvm_is_vm_type_supported(unsigned long type) return type < 32 && (kvm_caps.supported_vm_types & BIT(type)); } +static inline u32 kvm_sync_valid_fields(struct kvm *kvm) +{ + return kvm && kvm->arch.has_protected_state ? 0 : KVM_SYNC_X86_VALID_FIELDS; +} + int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) { int r = 0; @@ -4688,7 +4693,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) break; #endif case KVM_CAP_SYNC_REGS: - r = KVM_SYNC_X86_VALID_FIELDS; + r = kvm_sync_valid_fields(kvm); break; case KVM_CAP_ADJUST_CLOCK: r = KVM_CLOCK_VALID_FLAGS; @@ -11481,6 +11486,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu) { struct kvm_queued_exception *ex = &vcpu->arch.exception; struct kvm_run *kvm_run = vcpu->run; + u32 sync_valid_fields; int r; r = kvm_mmu_post_init_vm(vcpu->kvm); @@ -11526,8 +11532,9 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu) goto out; } - if ((kvm_run->kvm_valid_regs & ~KVM_SYNC_X86_VALID_FIELDS) || - (kvm_run->kvm_dirty_regs & ~KVM_SYNC_X86_VALID_FIELDS)) { + sync_valid_fields = kvm_sync_valid_fields(vcpu->kvm); + if ((kvm_run->kvm_valid_regs & ~sync_valid_fields) || + (kvm_run->kvm_dirty_regs & ~sync_valid_fields)) { r = -EINVAL; goto out; } @@ -11585,7 +11592,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu) out: kvm_put_guest_fpu(vcpu); - if (kvm_run->kvm_valid_regs) + if (kvm_run->kvm_valid_regs && likely(!vcpu->arch.guest_state_protected)) store_regs(vcpu); post_kvm_run_save(vcpu); kvm_vcpu_srcu_read_unlock(vcpu); -- 2.51.0 From d19a42d69692f4f1ef6dd0984caf280edd4a403a Mon Sep 17 00:00:00 2001 From: Rick Edgecombe Date: Mon, 2 Dec 2024 17:03:11 -0800 Subject: [PATCH 04/16] x86/virt/tdx: Add SEAMCALL wrappers for TDX KeyID management Intel TDX protects guest VMs from malicious host and certain physical attacks. Pre-TDX Intel hardware has support for a memory encryption architecture called MK-TME, which repurposes several high bits of physical address as "KeyID". TDX ends up with reserving a sub-range of MK-TME KeyIDs as "TDX private KeyIDs". Like MK-TME, these KeyIDs can be associated with an ephemeral key. For TDX this association is done by the TDX module. It also has its own tracking for which KeyIDs are in use. To do this ephemeral key setup and manipulate the TDX module's internal tracking, KVM will use the following SEAMCALLs: TDH.MNG.KEY.CONFIG: Mark the KeyID as in use, and initialize its ephemeral key. TDH.MNG.KEY.FREEID: Mark the KeyID as not in use. These SEAMCALLs both operate on TDR structures, which are setup using the previously added TDH.MNG.CREATE SEAMCALL. KVM's use of these operations will go like: - tdx_guest_keyid_alloc() - Initialize TD and TDR page with TDH.MNG.CREATE (not yet-added), passing KeyID - TDH.MNG.KEY.CONFIG to initialize the key - TD runs, teardown is started - TDH.MNG.KEY.FREEID - tdx_guest_keyid_free() Don't try to combine the tdx_guest_keyid_alloc() and TDH.MNG.KEY.CONFIG operations because TDH.MNG.CREATE and some locking need to be done in the middle. Don't combine TDH.MNG.KEY.FREEID and tdx_guest_keyid_free() so they are symmetrical with the creation path. So implement tdh_mng_key_config() and tdh_mng_key_freeid() as separate functions than tdx_guest_keyid_alloc() and tdx_guest_keyid_free(). The TDX module provides SEAMCALLs to hand pages to the TDX module for storing TDX controlled state. SEAMCALLs that operate on this state are directed to the appropriate TD VM using references to the pages originally provided for managing the TD's state. So the host kernel needs to track these pages, both as an ID for specifying which TD to operate on, and to allow them to be eventually reclaimed. The TD VM associated pages are called TDR (Trust Domain Root) and TDCS (Trust Domain Control Structure). Introduce "struct tdx_td" for holding references to pages provided to the TDX module for this TD VM associated state. Don't plan for any TD associated state that is controlled by KVM to live in this struct. Only expect it to hold data for concepts specific to the TDX architecture, for which there can't already be preexisting storage for in KVM. Add both the TDR page and an array of TDCS pages, even though the SEAMCALL wrappers will only need to know about the TDR pages for directing the SEAMCALLs to the right TD. Adding the TDCS pages to this struct will let all of the TD VM associated pages handed to the TDX module be tracked in one location. For a type to specify physical pages, use KVM's hpa_t type. Do this for KVM's benefit This is the common type used to hold physical addresses in KVM, so will make interoperability easier. Co-developed-by: Sean Christopherson Signed-off-by: Sean Christopherson Signed-off-by: Isaku Yamahata Signed-off-by: Kai Huang Signed-off-by: Rick Edgecombe Reviewed-by: Binbin Wu Reviewed-by: Yuan Yao Message-ID: <20241203010317.827803-2-rick.p.edgecombe@intel.com> Acked-by: Dave Hansen Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/tdx.h | 12 ++++++++++++ arch/x86/virt/vmx/tdx/tdx.c | 25 +++++++++++++++++++++++++ arch/x86/virt/vmx/tdx/tdx.h | 16 +++++++++------- 3 files changed, 46 insertions(+), 7 deletions(-) diff --git a/arch/x86/include/asm/tdx.h b/arch/x86/include/asm/tdx.h index b4b16dafd55e..5950e0a092ca 100644 --- a/arch/x86/include/asm/tdx.h +++ b/arch/x86/include/asm/tdx.h @@ -119,6 +119,18 @@ static inline u64 sc_retry(sc_func_t func, u64 fn, int tdx_cpu_enable(void); int tdx_enable(void); const char *tdx_dump_mce_info(struct mce *m); + +struct tdx_td { + /* TD root structure: */ + struct page *tdr_page; + + int tdcs_nr_pages; + /* TD control structure: */ + struct page **tdcs_pages; +}; + +u64 tdh_mng_key_config(struct tdx_td *td); +u64 tdh_mng_key_freeid(struct tdx_td *td); #else static inline void tdx_init(void) { } static inline int tdx_cpu_enable(void) { return -ENODEV; } diff --git a/arch/x86/virt/vmx/tdx/tdx.c b/arch/x86/virt/vmx/tdx/tdx.c index 7fdb37387886..1ffbdb840004 100644 --- a/arch/x86/virt/vmx/tdx/tdx.c +++ b/arch/x86/virt/vmx/tdx/tdx.c @@ -1456,3 +1456,28 @@ void __init tdx_init(void) check_tdx_erratum(); } + +static inline u64 tdx_tdr_pa(struct tdx_td *td) +{ + return page_to_phys(td->tdr_page); +} + +u64 tdh_mng_key_config(struct tdx_td *td) +{ + struct tdx_module_args args = { + .rcx = tdx_tdr_pa(td), + }; + + return seamcall(TDH_MNG_KEY_CONFIG, &args); +} +EXPORT_SYMBOL_GPL(tdh_mng_key_config); + +u64 tdh_mng_key_freeid(struct tdx_td *td) +{ + struct tdx_module_args args = { + .rcx = tdx_tdr_pa(td), + }; + + return seamcall(TDH_MNG_KEY_FREEID, &args); +} +EXPORT_SYMBOL_GPL(tdh_mng_key_freeid); diff --git a/arch/x86/virt/vmx/tdx/tdx.h b/arch/x86/virt/vmx/tdx/tdx.h index 4e3d533cdd61..5579317f67ab 100644 --- a/arch/x86/virt/vmx/tdx/tdx.h +++ b/arch/x86/virt/vmx/tdx/tdx.h @@ -15,13 +15,15 @@ /* * TDX module SEAMCALL leaf functions */ -#define TDH_PHYMEM_PAGE_RDMD 24 -#define TDH_SYS_KEY_CONFIG 31 -#define TDH_SYS_INIT 33 -#define TDH_SYS_RD 34 -#define TDH_SYS_LP_INIT 35 -#define TDH_SYS_TDMR_INIT 36 -#define TDH_SYS_CONFIG 45 +#define TDH_MNG_KEY_CONFIG 8 +#define TDH_MNG_KEY_FREEID 20 +#define TDH_PHYMEM_PAGE_RDMD 24 +#define TDH_SYS_KEY_CONFIG 31 +#define TDH_SYS_INIT 33 +#define TDH_SYS_RD 34 +#define TDH_SYS_LP_INIT 35 +#define TDH_SYS_TDMR_INIT 36 +#define TDH_SYS_CONFIG 45 /* TDX page types */ #define PT_NDA 0x0 -- 2.51.0 From b8a4e7de84a2587a250e130a85e4b049dff06448 Mon Sep 17 00:00:00 2001 From: Rick Edgecombe Date: Mon, 2 Dec 2024 17:03:12 -0800 Subject: [PATCH 05/16] x86/virt/tdx: Add SEAMCALL wrappers for TDX TD creation Intel TDX protects guest VMs from malicious hosts and certain physical attacks. It defines various control structures that hold state for things like TDs or vCPUs. These control structures are stored in pages given to the TDX module and encrypted with either the global KeyID or the guest KeyIDs. To manipulate these control structures the TDX module defines a few SEAMCALLs. KVM will use these during the process of creating a TD as follows: 1) Allocate a unique TDX KeyID for a new guest. 1) Call TDH.MNG.CREATE to create a "TD Root" (TDR) page, together with the new allocated KeyID. Unlike the rest of the TDX guest, the TDR page is crypto-protected by the 'global KeyID'. 2) Call the previously added TDH.MNG.KEY.CONFIG on each package to configure the KeyID for the guest. After this step, the KeyID to protect the guest is ready and the rest of the guest will be protected by this KeyID. 3) Call TDH.MNG.ADDCX to add TD Control Structure (TDCS) pages. 4) Call TDH.MNG.INIT to initialize the TDCS. To reclaim these pages for use by the kernel other SEAMCALLs are needed, which will be added in future patches. Add tdh_mng_addcx(), tdh_mng_create() and tdh_mng_init() to export these SEAMCALLs so that KVM can use them to create TDs. For SEAMCALLs that give a page to the TDX module to be encrypted, CLFLUSH the page mapped with KeyID 0, such that any dirty cache lines don't write back later and clobber TD memory or control structures. Don't worry about the other MK-TME KeyIDs because the kernel doesn't use them. The TDX docs specify that this flush is not needed unless the TDX module exposes the CLFLUSH_BEFORE_ALLOC feature bit. Be conservative and always flush. Add a helper function to facilitate this. Co-developed-by: Sean Christopherson Signed-off-by: Sean Christopherson Signed-off-by: Isaku Yamahata Signed-off-by: Kai Huang Signed-off-by: Rick Edgecombe Reviewed-by: Binbin Wu Reviewed-by: Yuan Yao Message-ID: <20241203010317.827803-3-rick.p.edgecombe@intel.com> Acked-by: Dave Hansen Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/tdx.h | 3 +++ arch/x86/virt/vmx/tdx/tdx.c | 51 +++++++++++++++++++++++++++++++++++++ arch/x86/virt/vmx/tdx/tdx.h | 3 +++ 3 files changed, 57 insertions(+) diff --git a/arch/x86/include/asm/tdx.h b/arch/x86/include/asm/tdx.h index 5950e0a092ca..6ba3b806e880 100644 --- a/arch/x86/include/asm/tdx.h +++ b/arch/x86/include/asm/tdx.h @@ -129,8 +129,11 @@ struct tdx_td { struct page **tdcs_pages; }; +u64 tdh_mng_addcx(struct tdx_td *td, struct page *tdcs_page); u64 tdh_mng_key_config(struct tdx_td *td); +u64 tdh_mng_create(struct tdx_td *td, u16 hkid); u64 tdh_mng_key_freeid(struct tdx_td *td); +u64 tdh_mng_init(struct tdx_td *td, u64 td_params, u64 *extended_err); #else static inline void tdx_init(void) { } static inline int tdx_cpu_enable(void) { return -ENODEV; } diff --git a/arch/x86/virt/vmx/tdx/tdx.c b/arch/x86/virt/vmx/tdx/tdx.c index 1ffbdb840004..ce4b1e96c5b0 100644 --- a/arch/x86/virt/vmx/tdx/tdx.c +++ b/arch/x86/virt/vmx/tdx/tdx.c @@ -1462,6 +1462,29 @@ static inline u64 tdx_tdr_pa(struct tdx_td *td) return page_to_phys(td->tdr_page); } +/* + * The TDX module exposes a CLFLUSH_BEFORE_ALLOC bit to specify whether + * a CLFLUSH of pages is required before handing them to the TDX module. + * Be conservative and make the code simpler by doing the CLFLUSH + * unconditionally. + */ +static void tdx_clflush_page(struct page *page) +{ + clflush_cache_range(page_to_virt(page), PAGE_SIZE); +} + +u64 tdh_mng_addcx(struct tdx_td *td, struct page *tdcs_page) +{ + struct tdx_module_args args = { + .rcx = page_to_phys(tdcs_page), + .rdx = tdx_tdr_pa(td), + }; + + tdx_clflush_page(tdcs_page); + return seamcall(TDH_MNG_ADDCX, &args); +} +EXPORT_SYMBOL_GPL(tdh_mng_addcx); + u64 tdh_mng_key_config(struct tdx_td *td) { struct tdx_module_args args = { @@ -1472,6 +1495,18 @@ u64 tdh_mng_key_config(struct tdx_td *td) } EXPORT_SYMBOL_GPL(tdh_mng_key_config); +u64 tdh_mng_create(struct tdx_td *td, u16 hkid) +{ + struct tdx_module_args args = { + .rcx = tdx_tdr_pa(td), + .rdx = hkid, + }; + + tdx_clflush_page(td->tdr_page); + return seamcall(TDH_MNG_CREATE, &args); +} +EXPORT_SYMBOL_GPL(tdh_mng_create); + u64 tdh_mng_key_freeid(struct tdx_td *td) { struct tdx_module_args args = { @@ -1481,3 +1516,19 @@ u64 tdh_mng_key_freeid(struct tdx_td *td) return seamcall(TDH_MNG_KEY_FREEID, &args); } EXPORT_SYMBOL_GPL(tdh_mng_key_freeid); + +u64 tdh_mng_init(struct tdx_td *td, u64 td_params, u64 *extended_err) +{ + struct tdx_module_args args = { + .rcx = tdx_tdr_pa(td), + .rdx = td_params, + }; + u64 ret; + + ret = seamcall_ret(TDH_MNG_INIT, &args); + + *extended_err = args.rcx; + + return ret; +} +EXPORT_SYMBOL_GPL(tdh_mng_init); diff --git a/arch/x86/virt/vmx/tdx/tdx.h b/arch/x86/virt/vmx/tdx/tdx.h index 5579317f67ab..0861c3f09576 100644 --- a/arch/x86/virt/vmx/tdx/tdx.h +++ b/arch/x86/virt/vmx/tdx/tdx.h @@ -15,8 +15,11 @@ /* * TDX module SEAMCALL leaf functions */ +#define TDH_MNG_ADDCX 1 #define TDH_MNG_KEY_CONFIG 8 +#define TDH_MNG_CREATE 9 #define TDH_MNG_KEY_FREEID 20 +#define TDH_MNG_INIT 21 #define TDH_PHYMEM_PAGE_RDMD 24 #define TDH_SYS_KEY_CONFIG 31 #define TDH_SYS_INIT 33 -- 2.51.0 From 0d65dff2b9057107cd002864a27de6c2b49a5755 Mon Sep 17 00:00:00 2001 From: Rick Edgecombe Date: Mon, 2 Dec 2024 17:03:13 -0800 Subject: [PATCH 06/16] x86/virt/tdx: Add SEAMCALL wrappers for TDX vCPU creation Intel TDX protects guest VMs from malicious host and certain physical attacks. It defines various control structures that hold state for virtualized components of the TD (i.e. VMs or vCPUs) These control structures are stored in pages given to the TDX module and encrypted with either the global KeyID or the guest KeyIDs. To manipulate these control structures the TDX module defines a few SEAMCALLs. KVM will use these during the process of creating a vCPU as follows: 1) Call TDH.VP.CREATE to create a TD vCPU Root (TDVPR) page for each vCPU. 2) Call TDH.VP.ADDCX to add per-vCPU control pages (TDCX) for each vCPU. 3) Call TDH.VP.INIT to initialize the TDCX for each vCPU. To reclaim these pages for use by the kernel other SEAMCALLs are needed, which will be added in future patches. Export functions to allow KVM to make these SEAMCALLs. Export two variants for TDH.VP.CREATE, in order to support the planned logic of KVM to support TDX modules with and without the ENUM_TOPOLOGY feature. If KVM can drop support for the !ENUM_TOPOLOGY case, this could go down a single version. Leave that for later discussion. The TDX module provides SEAMCALLs to hand pages to the TDX module for storing TDX controlled state. SEAMCALLs that operate on this state are directed to the appropriate TD vCPU using references to the pages originally provided for managing the vCPU's state. So the host kernel needs to track these pages, both as an ID for specifying which vCPU to operate on, and to allow them to be eventually reclaimed. The vCPU associated pages are called TDVPR (Trust Domain Virtual Processor Root) and TDCX (Trust Domain Control Extension). Introduce "struct tdx_vp" for holding references to pages provided to the TDX module for the TD vCPU associated state. Don't plan for any vCPU associated state that is controlled by KVM to live in this struct. Only expect it to hold data for concepts specific to the TDX architecture, for which there can't already be preexisting storage for in KVM. Add both the TDVPR page and an array of TDCX pages, even though the SEAMCALL wrappers will only need to know about the TDVPR pages for directing the SEAMCALLs to the right vCPU. Adding the TDCX pages to this struct will let all of the vCPU associated pages handed to the TDX module be tracked in one location. For a type to specify physical pages, use KVM's hpa_t type. Do this for KVM's benefit This is the common type used to hold physical addresses in KVM, so will make interoperability easier. Co-developed-by: Sean Christopherson Signed-off-by: Sean Christopherson Signed-off-by: Isaku Yamahata Signed-off-by: Kai Huang Signed-off-by: Rick Edgecombe Reviewed-by: Binbin Wu Reviewed-by: Yuan Yao Message-ID: <20241203010317.827803-4-rick.p.edgecombe@intel.com> Acked-by: Dave Hansen Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/tdx.h | 14 ++++++++++++ arch/x86/virt/vmx/tdx/tdx.c | 43 +++++++++++++++++++++++++++++++++++++ arch/x86/virt/vmx/tdx/tdx.h | 11 ++++++++++ 3 files changed, 68 insertions(+) diff --git a/arch/x86/include/asm/tdx.h b/arch/x86/include/asm/tdx.h index 6ba3b806e880..f783d5f1a0e1 100644 --- a/arch/x86/include/asm/tdx.h +++ b/arch/x86/include/asm/tdx.h @@ -127,13 +127,27 @@ struct tdx_td { int tdcs_nr_pages; /* TD control structure: */ struct page **tdcs_pages; + + /* Size of `tdcx_pages` in struct tdx_vp */ + int tdcx_nr_pages; +}; + +struct tdx_vp { + /* TDVP root page */ + struct page *tdvpr_page; + + /* TD vCPU control structure: */ + struct page **tdcx_pages; }; u64 tdh_mng_addcx(struct tdx_td *td, struct page *tdcs_page); +u64 tdh_vp_addcx(struct tdx_vp *vp, struct page *tdcx_page); u64 tdh_mng_key_config(struct tdx_td *td); u64 tdh_mng_create(struct tdx_td *td, u16 hkid); +u64 tdh_vp_create(struct tdx_td *td, struct tdx_vp *vp); u64 tdh_mng_key_freeid(struct tdx_td *td); u64 tdh_mng_init(struct tdx_td *td, u64 td_params, u64 *extended_err); +u64 tdh_vp_init(struct tdx_vp *vp, u64 initial_rcx, u32 x2apicid); #else static inline void tdx_init(void) { } static inline int tdx_cpu_enable(void) { return -ENODEV; } diff --git a/arch/x86/virt/vmx/tdx/tdx.c b/arch/x86/virt/vmx/tdx/tdx.c index ce4b1e96c5b0..e55570575f22 100644 --- a/arch/x86/virt/vmx/tdx/tdx.c +++ b/arch/x86/virt/vmx/tdx/tdx.c @@ -5,6 +5,7 @@ * Intel Trusted Domain Extensions (TDX) support */ +#include "asm/page_types.h" #define pr_fmt(fmt) "virt/tdx: " fmt #include @@ -1462,6 +1463,11 @@ static inline u64 tdx_tdr_pa(struct tdx_td *td) return page_to_phys(td->tdr_page); } +static inline u64 tdx_tdvpr_pa(struct tdx_vp *td) +{ + return page_to_phys(td->tdvpr_page); +} + /* * The TDX module exposes a CLFLUSH_BEFORE_ALLOC bit to specify whether * a CLFLUSH of pages is required before handing them to the TDX module. @@ -1485,6 +1491,18 @@ u64 tdh_mng_addcx(struct tdx_td *td, struct page *tdcs_page) } EXPORT_SYMBOL_GPL(tdh_mng_addcx); +u64 tdh_vp_addcx(struct tdx_vp *vp, struct page *tdcx_page) +{ + struct tdx_module_args args = { + .rcx = page_to_phys(tdcx_page), + .rdx = tdx_tdvpr_pa(vp), + }; + + tdx_clflush_page(tdcx_page); + return seamcall(TDH_VP_ADDCX, &args); +} +EXPORT_SYMBOL_GPL(tdh_vp_addcx); + u64 tdh_mng_key_config(struct tdx_td *td) { struct tdx_module_args args = { @@ -1507,6 +1525,18 @@ u64 tdh_mng_create(struct tdx_td *td, u16 hkid) } EXPORT_SYMBOL_GPL(tdh_mng_create); +u64 tdh_vp_create(struct tdx_td *td, struct tdx_vp *vp) +{ + struct tdx_module_args args = { + .rcx = tdx_tdvpr_pa(vp), + .rdx = tdx_tdr_pa(td), + }; + + tdx_clflush_page(vp->tdvpr_page); + return seamcall(TDH_VP_CREATE, &args); +} +EXPORT_SYMBOL_GPL(tdh_vp_create); + u64 tdh_mng_key_freeid(struct tdx_td *td) { struct tdx_module_args args = { @@ -1532,3 +1562,16 @@ u64 tdh_mng_init(struct tdx_td *td, u64 td_params, u64 *extended_err) return ret; } EXPORT_SYMBOL_GPL(tdh_mng_init); + +u64 tdh_vp_init(struct tdx_vp *vp, u64 initial_rcx, u32 x2apicid) +{ + struct tdx_module_args args = { + .rcx = tdx_tdvpr_pa(vp), + .rdx = initial_rcx, + .r8 = x2apicid, + }; + + /* apicid requires version == 1. */ + return seamcall(TDH_VP_INIT | (1ULL << TDX_VERSION_SHIFT), &args); +} +EXPORT_SYMBOL_GPL(tdh_vp_init); diff --git a/arch/x86/virt/vmx/tdx/tdx.h b/arch/x86/virt/vmx/tdx/tdx.h index 0861c3f09576..f0464f7d9780 100644 --- a/arch/x86/virt/vmx/tdx/tdx.h +++ b/arch/x86/virt/vmx/tdx/tdx.h @@ -16,10 +16,13 @@ * TDX module SEAMCALL leaf functions */ #define TDH_MNG_ADDCX 1 +#define TDH_VP_ADDCX 4 #define TDH_MNG_KEY_CONFIG 8 #define TDH_MNG_CREATE 9 +#define TDH_VP_CREATE 10 #define TDH_MNG_KEY_FREEID 20 #define TDH_MNG_INIT 21 +#define TDH_VP_INIT 22 #define TDH_PHYMEM_PAGE_RDMD 24 #define TDH_SYS_KEY_CONFIG 31 #define TDH_SYS_INIT 33 @@ -28,6 +31,14 @@ #define TDH_SYS_TDMR_INIT 36 #define TDH_SYS_CONFIG 45 +/* + * SEAMCALL leaf: + * + * Bit 15:0 Leaf number + * Bit 23:16 Version number + */ +#define TDX_VERSION_SHIFT 16 + /* TDX page types */ #define PT_NDA 0x0 #define PT_RSVD 0x1 -- 2.51.0 From 541b3e9e0d907b5d94e5d6e4a2e57962a7d1d134 Mon Sep 17 00:00:00 2001 From: Rick Edgecombe Date: Mon, 2 Dec 2024 17:03:14 -0800 Subject: [PATCH 07/16] x86/virt/tdx: Add SEAMCALL wrappers for TDX page cache management Intel TDX protects guest VMs from malicious host and certain physical attacks. The TDX module uses pages provided by the host for both control structures and for TD guest pages. These pages are encrypted using the MK-TME encryption engine, with its special requirements around cache invalidation. For its own security, the TDX module ensures pages are flushed properly and track which usage they are currently assigned. For creating and tearing down TD VMs and vCPUs KVM will need to use the TDH.PHYMEM.PAGE.RECLAIM, TDH.PHYMEM.CACHE.WB, and TDH.PHYMEM.PAGE.WBINVD SEAMCALLs. Add tdh_phymem_page_reclaim() to enable KVM to call TDH.PHYMEM.PAGE.RECLAIM to reclaim the page for use by the host kernel. This effectively resets its state in the TDX module's page tracking (PAMT), if the page is available to be reclaimed. This will be used by KVM to reclaim the various types of pages owned by the TDX module. It will have a small wrapper in KVM that retries in the case of a relevant error code. Don't implement this wrapper in arch/x86 because KVM's solution around retrying SEAMCALLs will be better located in a single place. Add tdh_phymem_cache_wb() to enable KVM to call TDH.PHYMEM.CACHE.WB to do a cache write back in a way that the TDX module can verify, before it allows a KeyID to be freed. The KVM code will use this to have a small wrapper that handles retries. Since the TDH.PHYMEM.CACHE.WB operation is interruptible, have tdh_phymem_cache_wb() take a resume argument to pass this info to the TDX module for restarts. It is worth noting that this SEAMCALL uses a SEAM specific MSR to do the write back in sections. In this way it does export some new functionality that affects CPU state. Add tdh_phymem_page_wbinvd_tdr() to enable KVM to call TDH.PHYMEM.PAGE.WBINVD to do a cache write back and invalidate of a TDR, using the global KeyID. The underlying TDH.PHYMEM.PAGE.WBINVD SEAMCALL requires the related KeyID to be encoded into the SEAMCALL args. Since the global KeyID is not exposed to KVM, a dedicated wrapper is needed for TDR focused TDH.PHYMEM.PAGE.WBINVD operations. Co-developed-by: Sean Christopherson Signed-off-by: Sean Christopherson Signed-off-by: Isaku Yamahata Signed-off-by: Kai Huang Signed-off-by: Rick Edgecombe Reviewed-by: Binbin Wu Reviewed-by: Yuan Yao Message-ID: <20241203010317.827803-5-rick.p.edgecombe@intel.com> Acked-by: Dave Hansen Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/tdx.h | 18 ++++++++++++++++ arch/x86/virt/vmx/tdx/tdx.c | 42 +++++++++++++++++++++++++++++++++++++ arch/x86/virt/vmx/tdx/tdx.h | 3 +++ 3 files changed, 63 insertions(+) diff --git a/arch/x86/include/asm/tdx.h b/arch/x86/include/asm/tdx.h index f783d5f1a0e1..41a72ce961f3 100644 --- a/arch/x86/include/asm/tdx.h +++ b/arch/x86/include/asm/tdx.h @@ -5,6 +5,7 @@ #include #include +#include #include #include @@ -33,6 +34,7 @@ #ifndef __ASSEMBLY__ #include +#include /* * Used by the #VE exception handler to gather the #VE exception @@ -140,6 +142,19 @@ struct tdx_vp { struct page **tdcx_pages; }; + +static inline u64 mk_keyed_paddr(u16 hkid, struct page *page) +{ + u64 ret; + + ret = page_to_phys(page); + /* KeyID bits are just above the physical address bits: */ + ret |= (u64)hkid << boot_cpu_data.x86_phys_bits; + + return ret; + +} + u64 tdh_mng_addcx(struct tdx_td *td, struct page *tdcs_page); u64 tdh_vp_addcx(struct tdx_vp *vp, struct page *tdcx_page); u64 tdh_mng_key_config(struct tdx_td *td); @@ -148,6 +163,9 @@ u64 tdh_vp_create(struct tdx_td *td, struct tdx_vp *vp); u64 tdh_mng_key_freeid(struct tdx_td *td); u64 tdh_mng_init(struct tdx_td *td, u64 td_params, u64 *extended_err); u64 tdh_vp_init(struct tdx_vp *vp, u64 initial_rcx, u32 x2apicid); +u64 tdh_phymem_page_reclaim(struct page *page, u64 *tdx_pt, u64 *tdx_owner, u64 *tdx_size); +u64 tdh_phymem_cache_wb(bool resume); +u64 tdh_phymem_page_wbinvd_tdr(struct tdx_td *td); #else static inline void tdx_init(void) { } static inline int tdx_cpu_enable(void) { return -ENODEV; } diff --git a/arch/x86/virt/vmx/tdx/tdx.c b/arch/x86/virt/vmx/tdx/tdx.c index e55570575f22..2350d25b4ca3 100644 --- a/arch/x86/virt/vmx/tdx/tdx.c +++ b/arch/x86/virt/vmx/tdx/tdx.c @@ -1575,3 +1575,45 @@ u64 tdh_vp_init(struct tdx_vp *vp, u64 initial_rcx, u32 x2apicid) return seamcall(TDH_VP_INIT | (1ULL << TDX_VERSION_SHIFT), &args); } EXPORT_SYMBOL_GPL(tdh_vp_init); + +/* + * TDX ABI defines output operands as PT, OWNER and SIZE. These are TDX defined fomats. + * So despite the names, they must be interpted specially as described by the spec. Return + * them only for error reporting purposes. + */ +u64 tdh_phymem_page_reclaim(struct page *page, u64 *tdx_pt, u64 *tdx_owner, u64 *tdx_size) +{ + struct tdx_module_args args = { + .rcx = page_to_phys(page), + }; + u64 ret; + + ret = seamcall_ret(TDH_PHYMEM_PAGE_RECLAIM, &args); + + *tdx_pt = args.rcx; + *tdx_owner = args.rdx; + *tdx_size = args.r8; + + return ret; +} +EXPORT_SYMBOL_GPL(tdh_phymem_page_reclaim); + +u64 tdh_phymem_cache_wb(bool resume) +{ + struct tdx_module_args args = { + .rcx = resume ? 1 : 0, + }; + + return seamcall(TDH_PHYMEM_CACHE_WB, &args); +} +EXPORT_SYMBOL_GPL(tdh_phymem_cache_wb); + +u64 tdh_phymem_page_wbinvd_tdr(struct tdx_td *td) +{ + struct tdx_module_args args = {}; + + args.rcx = mk_keyed_paddr(tdx_global_keyid, td->tdr_page); + + return seamcall(TDH_PHYMEM_PAGE_WBINVD, &args); +} +EXPORT_SYMBOL_GPL(tdh_phymem_page_wbinvd_tdr); diff --git a/arch/x86/virt/vmx/tdx/tdx.h b/arch/x86/virt/vmx/tdx/tdx.h index f0464f7d9780..7a15c9afcdfa 100644 --- a/arch/x86/virt/vmx/tdx/tdx.h +++ b/arch/x86/virt/vmx/tdx/tdx.h @@ -24,11 +24,14 @@ #define TDH_MNG_INIT 21 #define TDH_VP_INIT 22 #define TDH_PHYMEM_PAGE_RDMD 24 +#define TDH_PHYMEM_PAGE_RECLAIM 28 #define TDH_SYS_KEY_CONFIG 31 #define TDH_SYS_INIT 33 #define TDH_SYS_RD 34 #define TDH_SYS_LP_INIT 35 #define TDH_SYS_TDMR_INIT 36 +#define TDH_PHYMEM_CACHE_WB 40 +#define TDH_PHYMEM_PAGE_WBINVD 41 #define TDH_SYS_CONFIG 45 /* -- 2.51.0 From 5e5151c5562afa28555b45e70f4386d62f62e640 Mon Sep 17 00:00:00 2001 From: Rick Edgecombe Date: Mon, 2 Dec 2024 17:03:15 -0800 Subject: [PATCH 08/16] x86/virt/tdx: Add SEAMCALL wrappers for TDX VM/vCPU field access Intel TDX protects guest VMs from malicious host and certain physical attacks. The TDX module has TD scoped and vCPU scoped "metadata fields". These fields are a bit like VMCS fields, and stored in data structures maintained by the TDX module. Export 3 SEAMCALLs for use in reading and writing these fields: Make tdh_mng_rd() use MNG.VP.RD to read the TD scoped metadata. Make tdh_vp_rd()/tdh_vp_wr() use TDH.VP.RD/WR to read/write the vCPU scoped metadata. KVM will use these by creating inline helpers that target various metadata sizes. Export the raw SEAMCALL leaf, to avoid exporting the large number of various sized helpers. Co-developed-by: Sean Christopherson Signed-off-by: Sean Christopherson Signed-off-by: Isaku Yamahata Signed-off-by: Kai Huang Signed-off-by: Rick Edgecombe Reviewed-by: Binbin Wu Reviewed-by: Yuan Yao Acked-by: Dave Hansen Message-ID: <20241203010317.827803-6-rick.p.edgecombe@intel.com> Acked-by: Dave Hansen Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/tdx.h | 3 +++ arch/x86/virt/vmx/tdx/tdx.c | 47 +++++++++++++++++++++++++++++++++++++ arch/x86/virt/vmx/tdx/tdx.h | 3 +++ 3 files changed, 53 insertions(+) diff --git a/arch/x86/include/asm/tdx.h b/arch/x86/include/asm/tdx.h index 41a72ce961f3..22ffb8dfe994 100644 --- a/arch/x86/include/asm/tdx.h +++ b/arch/x86/include/asm/tdx.h @@ -160,9 +160,12 @@ u64 tdh_vp_addcx(struct tdx_vp *vp, struct page *tdcx_page); u64 tdh_mng_key_config(struct tdx_td *td); u64 tdh_mng_create(struct tdx_td *td, u16 hkid); u64 tdh_vp_create(struct tdx_td *td, struct tdx_vp *vp); +u64 tdh_mng_rd(struct tdx_td *td, u64 field, u64 *data); u64 tdh_mng_key_freeid(struct tdx_td *td); u64 tdh_mng_init(struct tdx_td *td, u64 td_params, u64 *extended_err); u64 tdh_vp_init(struct tdx_vp *vp, u64 initial_rcx, u32 x2apicid); +u64 tdh_vp_rd(struct tdx_vp *vp, u64 field, u64 *data); +u64 tdh_vp_wr(struct tdx_vp *vp, u64 field, u64 data, u64 mask); u64 tdh_phymem_page_reclaim(struct page *page, u64 *tdx_pt, u64 *tdx_owner, u64 *tdx_size); u64 tdh_phymem_cache_wb(bool resume); u64 tdh_phymem_page_wbinvd_tdr(struct tdx_td *td); diff --git a/arch/x86/virt/vmx/tdx/tdx.c b/arch/x86/virt/vmx/tdx/tdx.c index 2350d25b4ca3..93150f3c1112 100644 --- a/arch/x86/virt/vmx/tdx/tdx.c +++ b/arch/x86/virt/vmx/tdx/tdx.c @@ -1537,6 +1537,23 @@ u64 tdh_vp_create(struct tdx_td *td, struct tdx_vp *vp) } EXPORT_SYMBOL_GPL(tdh_vp_create); +u64 tdh_mng_rd(struct tdx_td *td, u64 field, u64 *data) +{ + struct tdx_module_args args = { + .rcx = tdx_tdr_pa(td), + .rdx = field, + }; + u64 ret; + + ret = seamcall_ret(TDH_MNG_RD, &args); + + /* R8: Content of the field, or 0 in case of error. */ + *data = args.r8; + + return ret; +} +EXPORT_SYMBOL_GPL(tdh_mng_rd); + u64 tdh_mng_key_freeid(struct tdx_td *td) { struct tdx_module_args args = { @@ -1563,6 +1580,36 @@ u64 tdh_mng_init(struct tdx_td *td, u64 td_params, u64 *extended_err) } EXPORT_SYMBOL_GPL(tdh_mng_init); +u64 tdh_vp_rd(struct tdx_vp *vp, u64 field, u64 *data) +{ + struct tdx_module_args args = { + .rcx = tdx_tdvpr_pa(vp), + .rdx = field, + }; + u64 ret; + + ret = seamcall_ret(TDH_VP_RD, &args); + + /* R8: Content of the field, or 0 in case of error. */ + *data = args.r8; + + return ret; +} +EXPORT_SYMBOL_GPL(tdh_vp_rd); + +u64 tdh_vp_wr(struct tdx_vp *vp, u64 field, u64 data, u64 mask) +{ + struct tdx_module_args args = { + .rcx = tdx_tdvpr_pa(vp), + .rdx = field, + .r8 = data, + .r9 = mask, + }; + + return seamcall(TDH_VP_WR, &args); +} +EXPORT_SYMBOL_GPL(tdh_vp_wr); + u64 tdh_vp_init(struct tdx_vp *vp, u64 initial_rcx, u32 x2apicid) { struct tdx_module_args args = { diff --git a/arch/x86/virt/vmx/tdx/tdx.h b/arch/x86/virt/vmx/tdx/tdx.h index 7a15c9afcdfa..aacd38b12989 100644 --- a/arch/x86/virt/vmx/tdx/tdx.h +++ b/arch/x86/virt/vmx/tdx/tdx.h @@ -19,11 +19,13 @@ #define TDH_VP_ADDCX 4 #define TDH_MNG_KEY_CONFIG 8 #define TDH_MNG_CREATE 9 +#define TDH_MNG_RD 11 #define TDH_VP_CREATE 10 #define TDH_MNG_KEY_FREEID 20 #define TDH_MNG_INIT 21 #define TDH_VP_INIT 22 #define TDH_PHYMEM_PAGE_RDMD 24 +#define TDH_VP_RD 26 #define TDH_PHYMEM_PAGE_RECLAIM 28 #define TDH_SYS_KEY_CONFIG 31 #define TDH_SYS_INIT 33 @@ -32,6 +34,7 @@ #define TDH_SYS_TDMR_INIT 36 #define TDH_PHYMEM_CACHE_WB 40 #define TDH_PHYMEM_PAGE_WBINVD 41 +#define TDH_VP_WR 43 #define TDH_SYS_CONFIG 45 /* -- 2.51.0 From e465cc63db1956f41370099c8ba0a62cf4f65990 Mon Sep 17 00:00:00 2001 From: Rick Edgecombe Date: Mon, 2 Dec 2024 17:03:16 -0800 Subject: [PATCH 09/16] x86/virt/tdx: Add SEAMCALL wrappers for TDX flush operations Intel TDX protects guest VMs from malicious host and certain physical attacks. The TDX module has the concept of flushing vCPUs. These flushes include both a flush of the translation caches and also any other state internal to the TDX module. Before freeing a KeyID, this flush operation needs to be done. KVM will need to perform the flush on each pCPU associated with the TD, and also perform a TD scoped operation that checks if the flush has been done on all vCPU's associated with the TD. Add a tdh_vp_flush() function to be used to call TDH.VP.FLUSH on each pCPU associated with the TD during TD teardown. It will also be called when disabling TDX and during vCPU migration between pCPUs. Add tdh_mng_vpflushdone() to be used by KVM to call TDH.MNG.VPFLUSHDONE. KVM will use this during TD teardown to verify that TDH.VP.FLUSH has been called sufficiently, and advance the state machine that will allow for reclaiming the TD's KeyID. Co-developed-by: Sean Christopherson Signed-off-by: Sean Christopherson Signed-off-by: Isaku Yamahata Signed-off-by: Kai Huang Signed-off-by: Rick Edgecombe Reviewed-by: Binbin Wu Reviewed-by: Yuan Yao Message-ID: <20241203010317.827803-7-rick.p.edgecombe@intel.com> Acked-by: Dave Hansen Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/tdx.h | 2 ++ arch/x86/virt/vmx/tdx/tdx.c | 20 ++++++++++++++++++++ arch/x86/virt/vmx/tdx/tdx.h | 2 ++ 3 files changed, 24 insertions(+) diff --git a/arch/x86/include/asm/tdx.h b/arch/x86/include/asm/tdx.h index 22ffb8dfe994..1392f20c3f1d 100644 --- a/arch/x86/include/asm/tdx.h +++ b/arch/x86/include/asm/tdx.h @@ -161,6 +161,8 @@ u64 tdh_mng_key_config(struct tdx_td *td); u64 tdh_mng_create(struct tdx_td *td, u16 hkid); u64 tdh_vp_create(struct tdx_td *td, struct tdx_vp *vp); u64 tdh_mng_rd(struct tdx_td *td, u64 field, u64 *data); +u64 tdh_vp_flush(struct tdx_vp *vp); +u64 tdh_mng_vpflushdone(struct tdx_td *td); u64 tdh_mng_key_freeid(struct tdx_td *td); u64 tdh_mng_init(struct tdx_td *td, u64 td_params, u64 *extended_err); u64 tdh_vp_init(struct tdx_vp *vp, u64 initial_rcx, u32 x2apicid); diff --git a/arch/x86/virt/vmx/tdx/tdx.c b/arch/x86/virt/vmx/tdx/tdx.c index 93150f3c1112..369b264f4d9b 100644 --- a/arch/x86/virt/vmx/tdx/tdx.c +++ b/arch/x86/virt/vmx/tdx/tdx.c @@ -1554,6 +1554,26 @@ u64 tdh_mng_rd(struct tdx_td *td, u64 field, u64 *data) } EXPORT_SYMBOL_GPL(tdh_mng_rd); +u64 tdh_vp_flush(struct tdx_vp *vp) +{ + struct tdx_module_args args = { + .rcx = tdx_tdvpr_pa(vp), + }; + + return seamcall(TDH_VP_FLUSH, &args); +} +EXPORT_SYMBOL_GPL(tdh_vp_flush); + +u64 tdh_mng_vpflushdone(struct tdx_td *td) +{ + struct tdx_module_args args = { + .rcx = tdx_tdr_pa(td), + }; + + return seamcall(TDH_MNG_VPFLUSHDONE, &args); +} +EXPORT_SYMBOL_GPL(tdh_mng_vpflushdone); + u64 tdh_mng_key_freeid(struct tdx_td *td) { struct tdx_module_args args = { diff --git a/arch/x86/virt/vmx/tdx/tdx.h b/arch/x86/virt/vmx/tdx/tdx.h index aacd38b12989..62cb7832c42d 100644 --- a/arch/x86/virt/vmx/tdx/tdx.h +++ b/arch/x86/virt/vmx/tdx/tdx.h @@ -20,6 +20,8 @@ #define TDH_MNG_KEY_CONFIG 8 #define TDH_MNG_CREATE 9 #define TDH_MNG_RD 11 +#define TDH_VP_FLUSH 18 +#define TDH_MNG_VPFLUSHDONE 19 #define TDH_VP_CREATE 10 #define TDH_MNG_KEY_FREEID 20 #define TDH_MNG_INIT 21 -- 2.51.0 From 7ba2fd80ee43e34f5e5c449e5e26538c44bbf52e Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Wed, 22 Jan 2025 06:00:28 -0500 Subject: [PATCH 10/16] x86/virt/tdx: allocate tdx_sys_info in static memory Adding all the information that KVM needs increases the size of struct tdx_sys_info, to the point that you can get warnings about the stack size of init_tdx_module(). Since KVM also needs to read the TDX metadata after init_tdx_module() returns, make the variable a global. Reviewed-by: Rick Edgecombe Reviewed-by: Kai Huang Signed-off-by: Paolo Bonzini --- arch/x86/virt/vmx/tdx/tdx.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/arch/x86/virt/vmx/tdx/tdx.c b/arch/x86/virt/vmx/tdx/tdx.c index 369b264f4d9b..578ad468634b 100644 --- a/arch/x86/virt/vmx/tdx/tdx.c +++ b/arch/x86/virt/vmx/tdx/tdx.c @@ -53,6 +53,8 @@ static DEFINE_MUTEX(tdx_module_lock); /* All TDX-usable memory regions. Protected by mem_hotplug_lock. */ static LIST_HEAD(tdx_memlist); +static struct tdx_sys_info tdx_sysinfo; + typedef void (*sc_err_func_t)(u64 fn, u64 err, struct tdx_module_args *args); static inline void seamcall_err(u64 fn, u64 err, struct tdx_module_args *args) @@ -1061,15 +1063,14 @@ static int init_tdmrs(struct tdmr_info_list *tdmr_list) static int init_tdx_module(void) { - struct tdx_sys_info sysinfo; int ret; - ret = get_tdx_sys_info(&sysinfo); + ret = get_tdx_sys_info(&tdx_sysinfo); if (ret) return ret; /* Check whether the kernel can support this module */ - ret = check_features(&sysinfo); + ret = check_features(&tdx_sysinfo); if (ret) return ret; @@ -1090,12 +1091,12 @@ static int init_tdx_module(void) goto out_put_tdxmem; /* Allocate enough space for constructing TDMRs */ - ret = alloc_tdmr_list(&tdx_tdmr_list, &sysinfo.tdmr); + ret = alloc_tdmr_list(&tdx_tdmr_list, &tdx_sysinfo.tdmr); if (ret) goto err_free_tdxmem; /* Cover all TDX-usable memory regions in TDMRs */ - ret = construct_tdmrs(&tdx_memlist, &tdx_tdmr_list, &sysinfo.tdmr); + ret = construct_tdmrs(&tdx_memlist, &tdx_tdmr_list, &tdx_sysinfo.tdmr); if (ret) goto err_free_tdmrs; -- 2.51.0 From 4caf32daf0b4f83681d63acc559635a2a8ddf71c Mon Sep 17 00:00:00 2001 From: Kai Huang Date: Tue, 24 Dec 2024 09:07:40 -0500 Subject: [PATCH 11/16] x86/virt/tdx: Read essential global metadata for KVM KVM needs two classes of global metadata to create and run TDX guests: - "TD Control Structures" - "TD Configurability" The first class contains the sizes of TDX guest per-VM and per-vCPU control structures. KVM will need to use them to allocate enough space for those control structures. The second class contains info which reports things like which features are configurable to TDX guest etc. KVM will need to use them to properly configure TDX guests. Read them for KVM TDX to use. The code change is auto-generated by re-running the script in [1] after uncommenting the "td_conf" and "td_ctrl" part to regenerate the tdx_global_metadata.{hc} and update them to the existing ones in the kernel. #python tdx.py global_metadata.json tdx_global_metadata.h \ tdx_global_metadata.c The 'global_metadata.json' can be fetched from [2]. Note that as of this writing, the JSON file only allows a maximum of 32 CPUID entries. While this is enough for current contents of the CPUID leaves, there were plans to change the JSON per TDX module release which would change the ABI and potentially prevent future versions of the TDX module from working with older kernels. While discussions are ongoing with the TDX module team on what exactly constitutes an ABI breakage, in the meantime the TDX module team has agreed to not increase the number of CPUID entries beyond 128 without an opt in. Therefore the file was tweaked by hand to change the maximum number of CPUID_CONFIGs. Link: https://lore.kernel.org/kvm/0853b155ec9aac09c594caa60914ed6ea4dc0a71.camel@intel.com/ [1] Link: https://cdrdv2.intel.com/v1/dl/getContent/795381 [2] Signed-off-by: Kai Huang Signed-off-by: Rick Edgecombe Message-ID: <20241030190039.77971-4-rick.p.edgecombe@intel.com> Acked-by: Dave Hansen Signed-off-by: Paolo Bonzini --- arch/x86/virt/vmx/tdx/tdx_global_metadata.c | 50 +++++++++++++++++++++ arch/x86/virt/vmx/tdx/tdx_global_metadata.h | 19 ++++++++ 2 files changed, 69 insertions(+) diff --git a/arch/x86/virt/vmx/tdx/tdx_global_metadata.c b/arch/x86/virt/vmx/tdx/tdx_global_metadata.c index 8027a24d1c6e..13ad2663488b 100644 --- a/arch/x86/virt/vmx/tdx/tdx_global_metadata.c +++ b/arch/x86/virt/vmx/tdx/tdx_global_metadata.c @@ -37,12 +37,62 @@ static int get_tdx_sys_info_tdmr(struct tdx_sys_info_tdmr *sysinfo_tdmr) return ret; } +static int get_tdx_sys_info_td_ctrl(struct tdx_sys_info_td_ctrl *sysinfo_td_ctrl) +{ + int ret = 0; + u64 val; + + if (!ret && !(ret = read_sys_metadata_field(0x9800000100000000, &val))) + sysinfo_td_ctrl->tdr_base_size = val; + if (!ret && !(ret = read_sys_metadata_field(0x9800000100000100, &val))) + sysinfo_td_ctrl->tdcs_base_size = val; + if (!ret && !(ret = read_sys_metadata_field(0x9800000100000200, &val))) + sysinfo_td_ctrl->tdvps_base_size = val; + + return ret; +} + +static int get_tdx_sys_info_td_conf(struct tdx_sys_info_td_conf *sysinfo_td_conf) +{ + int ret = 0; + u64 val; + int i, j; + + if (!ret && !(ret = read_sys_metadata_field(0x1900000300000000, &val))) + sysinfo_td_conf->attributes_fixed0 = val; + if (!ret && !(ret = read_sys_metadata_field(0x1900000300000001, &val))) + sysinfo_td_conf->attributes_fixed1 = val; + if (!ret && !(ret = read_sys_metadata_field(0x1900000300000002, &val))) + sysinfo_td_conf->xfam_fixed0 = val; + if (!ret && !(ret = read_sys_metadata_field(0x1900000300000003, &val))) + sysinfo_td_conf->xfam_fixed1 = val; + if (!ret && !(ret = read_sys_metadata_field(0x9900000100000004, &val))) + sysinfo_td_conf->num_cpuid_config = val; + if (!ret && !(ret = read_sys_metadata_field(0x9900000100000008, &val))) + sysinfo_td_conf->max_vcpus_per_td = val; + if (sysinfo_td_conf->num_cpuid_config > ARRAY_SIZE(sysinfo_td_conf->cpuid_config_leaves)) + return -EINVAL; + for (i = 0; i < sysinfo_td_conf->num_cpuid_config; i++) + if (!ret && !(ret = read_sys_metadata_field(0x9900000300000400 + i, &val))) + sysinfo_td_conf->cpuid_config_leaves[i] = val; + if (sysinfo_td_conf->num_cpuid_config > ARRAY_SIZE(sysinfo_td_conf->cpuid_config_values)) + return -EINVAL; + for (i = 0; i < sysinfo_td_conf->num_cpuid_config; i++) + for (j = 0; j < 2; j++) + if (!ret && !(ret = read_sys_metadata_field(0x9900000300000500 + i * 2 + j, &val))) + sysinfo_td_conf->cpuid_config_values[i][j] = val; + + return ret; +} + static int get_tdx_sys_info(struct tdx_sys_info *sysinfo) { int ret = 0; ret = ret ?: get_tdx_sys_info_features(&sysinfo->features); ret = ret ?: get_tdx_sys_info_tdmr(&sysinfo->tdmr); + ret = ret ?: get_tdx_sys_info_td_ctrl(&sysinfo->td_ctrl); + ret = ret ?: get_tdx_sys_info_td_conf(&sysinfo->td_conf); return ret; } diff --git a/arch/x86/virt/vmx/tdx/tdx_global_metadata.h b/arch/x86/virt/vmx/tdx/tdx_global_metadata.h index 6dd3c9695f59..060a2ad744bf 100644 --- a/arch/x86/virt/vmx/tdx/tdx_global_metadata.h +++ b/arch/x86/virt/vmx/tdx/tdx_global_metadata.h @@ -17,9 +17,28 @@ struct tdx_sys_info_tdmr { u16 pamt_1g_entry_size; }; +struct tdx_sys_info_td_ctrl { + u16 tdr_base_size; + u16 tdcs_base_size; + u16 tdvps_base_size; +}; + +struct tdx_sys_info_td_conf { + u64 attributes_fixed0; + u64 attributes_fixed1; + u64 xfam_fixed0; + u64 xfam_fixed1; + u16 num_cpuid_config; + u16 max_vcpus_per_td; + u64 cpuid_config_leaves[128]; + u64 cpuid_config_values[128][2]; +}; + struct tdx_sys_info { struct tdx_sys_info_features features; struct tdx_sys_info_tdmr tdmr; + struct tdx_sys_info_td_ctrl td_ctrl; + struct tdx_sys_info_td_conf td_conf; }; #endif -- 2.51.0 From aed4dde24c8e10e579ad40d04333a13140d011fc Mon Sep 17 00:00:00 2001 From: Isaku Yamahata Date: Wed, 30 Oct 2024 12:00:17 -0700 Subject: [PATCH 12/16] x86/virt/tdx: Add tdx_guest_keyid_alloc/free() to alloc and free TDX guest KeyID Intel TDX protects guest VMs from malicious host and certain physical attacks. Pre-TDX Intel hardware has support for a memory encryption architecture called MK-TME, which repurposes several high bits of physical address as "KeyID". The BIOS reserves a sub-range of MK-TME KeyIDs as "TDX private KeyIDs". Each TDX guest must be assigned with a unique TDX KeyID when it is created. The kernel reserves the first TDX private KeyID for crypto-protection of specific TDX module data which has a lifecycle that exceeds the KeyID reserved for the TD's use. The rest of the KeyIDs are left for TDX guests to use. Create a small KeyID allocator. Export tdx_guest_keyid_alloc()/tdx_guest_keyid_free() to allocate and free TDX guest KeyID for KVM to use. Don't provide the stub functions when CONFIG_INTEL_TDX_HOST=n since they are not supposed to be called in this case. Signed-off-by: Isaku Yamahata Signed-off-by: Rick Edgecombe Message-ID: <20241030190039.77971-5-rick.p.edgecombe@intel.com> Acked-by: Dave Hansen Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/tdx.h | 3 +++ arch/x86/virt/vmx/tdx/tdx.c | 17 +++++++++++++++++ 2 files changed, 20 insertions(+) diff --git a/arch/x86/include/asm/tdx.h b/arch/x86/include/asm/tdx.h index 1392f20c3f1d..75a91869453d 100644 --- a/arch/x86/include/asm/tdx.h +++ b/arch/x86/include/asm/tdx.h @@ -122,6 +122,9 @@ int tdx_cpu_enable(void); int tdx_enable(void); const char *tdx_dump_mce_info(struct mce *m); +int tdx_guest_keyid_alloc(void); +void tdx_guest_keyid_free(unsigned int keyid); + struct tdx_td { /* TD root structure: */ struct page *tdr_page; diff --git a/arch/x86/virt/vmx/tdx/tdx.c b/arch/x86/virt/vmx/tdx/tdx.c index 578ad468634b..0122051af6b3 100644 --- a/arch/x86/virt/vmx/tdx/tdx.c +++ b/arch/x86/virt/vmx/tdx/tdx.c @@ -28,6 +28,7 @@ #include #include #include +#include #include #include #include @@ -43,6 +44,8 @@ static u32 tdx_global_keyid __ro_after_init; static u32 tdx_guest_keyid_start __ro_after_init; static u32 tdx_nr_guest_keyids __ro_after_init; +static DEFINE_IDA(tdx_guest_keyid_pool); + static DEFINE_PER_CPU(bool, tdx_lp_initialized); static struct tdmr_info_list tdx_tdmr_list; @@ -1459,6 +1462,20 @@ void __init tdx_init(void) check_tdx_erratum(); } +int tdx_guest_keyid_alloc(void) +{ + return ida_alloc_range(&tdx_guest_keyid_pool, tdx_guest_keyid_start, + tdx_guest_keyid_start + tdx_nr_guest_keyids - 1, + GFP_KERNEL); +} +EXPORT_SYMBOL_GPL(tdx_guest_keyid_alloc); + +void tdx_guest_keyid_free(unsigned int keyid) +{ + ida_free(&tdx_guest_keyid_pool, keyid); +} +EXPORT_SYMBOL_GPL(tdx_guest_keyid_free); + static inline u64 tdx_tdr_pa(struct tdx_td *td) { return page_to_phys(td->tdr_page); -- 2.51.0 From 62b1fa69f31969e11d03529d3a80599ddda2d043 Mon Sep 17 00:00:00 2001 From: Kai Huang Date: Fri, 15 Nov 2024 22:52:40 +1300 Subject: [PATCH 13/16] KVM: Export hardware virtualization enabling/disabling functions To support TDX, KVM will need to enable TDX during KVM module loading time. Enabling TDX requires enabling hardware virtualization first so that all online CPUs (and the new CPU going online) are in post-VMXON state. KVM by default enables hardware virtualization but that is done in kvm_init(), which must be the last step after all initialization is done thus is too late for enabling TDX. Export functions to enable/disable hardware virtualization so that TDX code can use them to handle hardware virtualization enabling before kvm_init(). Signed-off-by: Kai Huang Message-ID: Signed-off-by: Paolo Bonzini --- include/linux/kvm_host.h | 8 ++++++++ virt/kvm/kvm_main.c | 18 ++++-------------- 2 files changed, 12 insertions(+), 14 deletions(-) diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index f34f4cfaa513..1e75fa114f34 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -2571,4 +2571,12 @@ long kvm_arch_vcpu_pre_fault_memory(struct kvm_vcpu *vcpu, struct kvm_pre_fault_memory *range); #endif +#ifdef CONFIG_KVM_GENERIC_HARDWARE_ENABLING +int kvm_enable_virtualization(void); +void kvm_disable_virtualization(void); +#else +static inline int kvm_enable_virtualization(void) { return 0; } +static inline void kvm_disable_virtualization(void) { } +#endif + #endif diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index ba0327e2d0d3..6e40383fbe47 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -143,8 +143,6 @@ static int kvm_no_compat_open(struct inode *inode, struct file *file) #define KVM_COMPAT(c) .compat_ioctl = kvm_no_compat_ioctl, \ .open = kvm_no_compat_open #endif -static int kvm_enable_virtualization(void); -static void kvm_disable_virtualization(void); static void kvm_io_bus_destroy(struct kvm_io_bus *bus); @@ -5576,7 +5574,7 @@ static struct syscore_ops kvm_syscore_ops = { .shutdown = kvm_shutdown, }; -static int kvm_enable_virtualization(void) +int kvm_enable_virtualization(void) { int r; @@ -5621,8 +5619,9 @@ err_cpuhp: --kvm_usage_count; return r; } +EXPORT_SYMBOL_GPL(kvm_enable_virtualization); -static void kvm_disable_virtualization(void) +void kvm_disable_virtualization(void) { guard(mutex)(&kvm_usage_lock); @@ -5633,6 +5632,7 @@ static void kvm_disable_virtualization(void) cpuhp_remove_state(CPUHP_AP_KVM_ONLINE); kvm_arch_disable_virtualization(); } +EXPORT_SYMBOL_GPL(kvm_disable_virtualization); static int kvm_init_virtualization(void) { @@ -5648,21 +5648,11 @@ static void kvm_uninit_virtualization(void) kvm_disable_virtualization(); } #else /* CONFIG_KVM_GENERIC_HARDWARE_ENABLING */ -static int kvm_enable_virtualization(void) -{ - return 0; -} - static int kvm_init_virtualization(void) { return 0; } -static void kvm_disable_virtualization(void) -{ - -} - static void kvm_uninit_virtualization(void) { -- 2.51.0 From d6bee7813752b3dcf0a057315ab5cca0805bf905 Mon Sep 17 00:00:00 2001 From: Kai Huang Date: Wed, 22 Jan 2025 05:40:25 -0500 Subject: [PATCH 14/16] KVM: VMX: Refactor VMX module init/exit functions Add vt_init() and vt_exit() as the new module init/exit functions and refactor existing vmx_init()/vmx_exit() as helper to make room for TDX specific initialization and teardown. To support TDX, KVM will need to enable TDX during KVM module loading time. Enabling TDX requires enabling hardware virtualization first so that all online CPUs (and the new CPU going online) are in post-VMXON state. Currently, the vmx_init() flow is: 1) hv_init_evmcs(), 2) kvm_x86_vendor_init(), 3) Other VMX specific initialization, 4) kvm_init() The kvm_x86_vendor_init() invokes kvm_x86_init_ops::hardware_setup() to do VMX specific hardware setup and calls kvm_update_ops() to initialize kvm_x86_ops to VMX's version. TDX will have its own version for most of kvm_x86_ops callbacks. It would be nice if kvm_x86_init_ops::hardware_setup() could also be used for TDX, but in practice it cannot. The reason is, as mentioned above, TDX initialization requires hardware virtualization having been enabled, which must happen after kvm_update_ops(), but hardware_setup() is done before that. Also, TDX is based on VMX, and it makes sense to only initialize TDX after VMX has been initialized. If VMX fails to initialize, TDX is likely broken anyway. So the new flow of KVM module init function will be: 1) Current VMX initialization code in vmx_init() before kvm_init(), 2) TDX initialization, 3) kvm_init() Split vmx_init() into two parts based on above 1) and 3) so that TDX initialization can fit in between. Make part 1) as the new helper vmx_init(). Introduce vt_init() as the new module init function which calls vmx_init() and kvm_init(). TDX initialization will be added later. Do the same thing for vmx_exit()/vt_exit(). Signed-off-by: Kai Huang Message-ID: <3f23f24098bdcf42e213798893ffff7cdc7103be.1731664295.git.kai.huang@intel.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/main.c | 32 ++++++++++++++++++++++++++++++++ arch/x86/kvm/vmx/vmx.c | 23 ++--------------------- arch/x86/kvm/vmx/vmx.h | 3 +++ 3 files changed, 37 insertions(+), 21 deletions(-) diff --git a/arch/x86/kvm/vmx/main.c b/arch/x86/kvm/vmx/main.c index 43ee9ed11291..54cf95cb8d42 100644 --- a/arch/x86/kvm/vmx/main.c +++ b/arch/x86/kvm/vmx/main.c @@ -168,3 +168,35 @@ struct kvm_x86_init_ops vt_init_ops __initdata = { .runtime_ops = &vt_x86_ops, .pmu_ops = &intel_pmu_ops, }; + +static void __exit vt_exit(void) +{ + kvm_exit(); + vmx_exit(); +} +module_exit(vt_exit); + +static int __init vt_init(void) +{ + int r; + + r = vmx_init(); + if (r) + return r; + + /* + * Common KVM initialization _must_ come last, after this, /dev/kvm is + * exposed to userspace! + */ + r = kvm_init(sizeof(struct vcpu_vmx), __alignof__(struct vcpu_vmx), + THIS_MODULE); + if (r) + goto err_kvm_init; + + return 0; + +err_kvm_init: + vmx_exit(); + return r; +} +module_init(vt_init); diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 3b92f893b239..03d9e5069791 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -8594,23 +8594,16 @@ static void vmx_cleanup_l1d_flush(void) l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_AUTO; } -static void __vmx_exit(void) +void vmx_exit(void) { allow_smaller_maxphyaddr = false; vmx_cleanup_l1d_flush(); -} -static void __exit vmx_exit(void) -{ - kvm_exit(); - __vmx_exit(); kvm_x86_vendor_exit(); - } -module_exit(vmx_exit); -static int __init vmx_init(void) +int __init vmx_init(void) { int r, cpu; @@ -8654,21 +8647,9 @@ static int __init vmx_init(void) if (!enable_ept) allow_smaller_maxphyaddr = true; - /* - * Common KVM initialization _must_ come last, after this, /dev/kvm is - * exposed to userspace! - */ - r = kvm_init(sizeof(struct vcpu_vmx), __alignof__(struct vcpu_vmx), - THIS_MODULE); - if (r) - goto err_kvm_init; - return 0; -err_kvm_init: - __vmx_exit(); err_l1d_flush: kvm_x86_vendor_exit(); return r; } -module_init(vmx_init); diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h index 951e44dc9d0e..ec8483311d00 100644 --- a/arch/x86/kvm/vmx/vmx.h +++ b/arch/x86/kvm/vmx/vmx.h @@ -758,4 +758,7 @@ static inline void vmx_segment_cache_clear(struct vcpu_vmx *vmx) vmx->segment_cache.bitmask = 0; } +int vmx_init(void); +void vmx_exit(void); + #endif /* __KVM_X86_VMX_H */ -- 2.51.0 From fcdbdf63431c9faf639bffc957ea2ce9b545432e Mon Sep 17 00:00:00 2001 From: Kai Huang Date: Fri, 15 Nov 2024 22:52:41 +1300 Subject: [PATCH 15/16] KVM: VMX: Initialize TDX during KVM module load Before KVM can use TDX to create and run TDX guests, TDX needs to be initialized from two perspectives: 1) TDX module must be initialized properly to a working state; 2) A per-cpu TDX initialization, a.k.a the TDH.SYS.LP.INIT SEAMCALL must be done on any logical cpu before it can run any other TDX SEAMCALLs. The TDX host core-kernel provides two functions to do the above two respectively: tdx_enable() and tdx_cpu_enable(). There are two options in terms of when to initialize TDX: initialize TDX at KVM module loading time, or when creating the first TDX guest. Choose to initialize TDX during KVM module loading time: Initializing TDX module is both memory and CPU time consuming: 1) the kernel needs to allocate a non-trivial size(~1/256) of system memory as metadata used by TDX module to track each TDX-usable memory page's status; 2) the TDX module needs to initialize this metadata, one entry for each TDX-usable memory page. Also, the kernel uses alloc_contig_pages() to allocate those metadata chunks, because they are large and need to be physically contiguous. alloc_contig_pages() can fail. If initializing TDX when creating the first TDX guest, then there's chance that KVM won't be able to run any TDX guests albeit KVM _declares_ to be able to support TDX. This isn't good for the user. On the other hand, initializing TDX at KVM module loading time can make sure KVM is providing a consistent view of whether KVM can support TDX to the user. Always only try to initialize TDX after VMX has been initialized. TDX is based on VMX, and if VMX fails to initialize then TDX is likely to be broken anyway. Also, in practice, supporting TDX will require part of VMX and common x86 infrastructure in working order, so TDX cannot be enabled alone w/o VMX support. There are two cases that can result in failure to initialize TDX: 1) TDX cannot be supported (e.g., because of TDX is not supported or enabled by hardware, or module is not loaded, or missing some dependency in KVM's configuration); 2) Any unexpected error during TDX bring-up. For the first case only mark TDX is disabled but still allow KVM module to be loaded. For the second case just fail to load the KVM module so that the user can be aware. Because TDX costs additional memory, don't enable TDX by default. Add a new module parameter 'enable_tdx' to allow the user to opt-in. Note, the name tdx_init() has already been taken by the early boot code. Use tdx_bringup() for initializing TDX (and tdx_cleanup() since KVM doesn't actually teardown TDX). They don't match vt_init()/vt_exit(), vmx_init()/vmx_exit() etc but it's not end of the world. Also, once initialized, the TDX module cannot be disabled and enabled again w/o the TDX module runtime update, which isn't supported by the kernel. After TDX is enabled, nothing needs to be done when KVM disables hardware virtualization, e.g., when offlining CPU, or during suspend/resume. TDX host core-kernel code internally tracks TDX status and can handle "multiple enabling" scenario. Similar to KVM_AMD_SEV, add a new KVM_INTEL_TDX Kconfig to guide KVM TDX code. Make it depend on INTEL_TDX_HOST but not replace INTEL_TDX_HOST because in the longer term there's a use case that requires making SEAMCALLs w/o KVM as mentioned by Dan [1]. Link: https://lore.kernel.org/6723fc2070a96_60c3294dc@dwillia2-mobl3.amr.corp.intel.com.notmuch/ [1] Signed-off-by: Kai Huang Message-ID: <162f9dee05c729203b9ad6688db1ca2960b4b502.1731664295.git.kai.huang@intel.com> Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/tdx.h | 3 + .../tdx => include/asm}/tdx_global_metadata.h | 0 arch/x86/kvm/Kconfig | 10 ++ arch/x86/kvm/Makefile | 1 + arch/x86/kvm/vmx/main.c | 9 + arch/x86/kvm/vmx/tdx.c | 162 ++++++++++++++++++ arch/x86/kvm/vmx/tdx.h | 13 ++ arch/x86/virt/vmx/tdx/tdx.c | 14 ++ arch/x86/virt/vmx/tdx/tdx.h | 1 - include/linux/kvm_host.h | 1 + virt/kvm/kvm_main.c | 3 +- 11 files changed, 215 insertions(+), 2 deletions(-) rename arch/x86/{virt/vmx/tdx => include/asm}/tdx_global_metadata.h (100%) create mode 100644 arch/x86/kvm/vmx/tdx.c create mode 100644 arch/x86/kvm/vmx/tdx.h diff --git a/arch/x86/include/asm/tdx.h b/arch/x86/include/asm/tdx.h index 75a91869453d..1a8c687603aa 100644 --- a/arch/x86/include/asm/tdx.h +++ b/arch/x86/include/asm/tdx.h @@ -34,6 +34,7 @@ #ifndef __ASSEMBLY__ #include +#include #include /* @@ -121,6 +122,7 @@ static inline u64 sc_retry(sc_func_t func, u64 fn, int tdx_cpu_enable(void); int tdx_enable(void); const char *tdx_dump_mce_info(struct mce *m); +const struct tdx_sys_info *tdx_get_sysinfo(void); int tdx_guest_keyid_alloc(void); void tdx_guest_keyid_free(unsigned int keyid); @@ -179,6 +181,7 @@ static inline void tdx_init(void) { } static inline int tdx_cpu_enable(void) { return -ENODEV; } static inline int tdx_enable(void) { return -ENODEV; } static inline const char *tdx_dump_mce_info(struct mce *m) { return NULL; } +static inline const struct tdx_sys_info *tdx_get_sysinfo(void) { return NULL; } #endif /* CONFIG_INTEL_TDX_HOST */ #endif /* !__ASSEMBLY__ */ diff --git a/arch/x86/virt/vmx/tdx/tdx_global_metadata.h b/arch/x86/include/asm/tdx_global_metadata.h similarity index 100% rename from arch/x86/virt/vmx/tdx/tdx_global_metadata.h rename to arch/x86/include/asm/tdx_global_metadata.h diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig index ea2c4f21c1ca..fe8cbee6f614 100644 --- a/arch/x86/kvm/Kconfig +++ b/arch/x86/kvm/Kconfig @@ -128,6 +128,16 @@ config X86_SGX_KVM If unsure, say N. +config KVM_INTEL_TDX + bool "Intel Trust Domain Extensions (TDX) support" + default y + depends on INTEL_TDX_HOST + help + Provides support for launching Intel Trust Domain Extensions (TDX) + confidential VMs on Intel processors. + + If unsure, say N. + config KVM_AMD tristate "KVM for AMD processors support" depends on KVM && (CPU_SUP_AMD || CPU_SUP_HYGON) diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile index f9dddb8cb466..a5d362c7b504 100644 --- a/arch/x86/kvm/Makefile +++ b/arch/x86/kvm/Makefile @@ -20,6 +20,7 @@ kvm-intel-y += vmx/vmx.o vmx/vmenter.o vmx/pmu_intel.o vmx/vmcs12.o \ kvm-intel-$(CONFIG_X86_SGX_KVM) += vmx/sgx.o kvm-intel-$(CONFIG_KVM_HYPERV) += vmx/hyperv.o vmx/hyperv_evmcs.o +kvm-intel-$(CONFIG_KVM_INTEL_TDX) += vmx/tdx.o kvm-amd-y += svm/svm.o svm/vmenter.o svm/pmu.o svm/nested.o svm/avic.o diff --git a/arch/x86/kvm/vmx/main.c b/arch/x86/kvm/vmx/main.c index 54cf95cb8d42..97c453187cc1 100644 --- a/arch/x86/kvm/vmx/main.c +++ b/arch/x86/kvm/vmx/main.c @@ -6,6 +6,7 @@ #include "nested.h" #include "pmu.h" #include "posted_intr.h" +#include "tdx.h" #define VMX_REQUIRED_APICV_INHIBITS \ (BIT(APICV_INHIBIT_REASON_DISABLED) | \ @@ -172,6 +173,7 @@ struct kvm_x86_init_ops vt_init_ops __initdata = { static void __exit vt_exit(void) { kvm_exit(); + tdx_cleanup(); vmx_exit(); } module_exit(vt_exit); @@ -184,6 +186,11 @@ static int __init vt_init(void) if (r) return r; + /* tdx_init() has been taken */ + r = tdx_bringup(); + if (r) + goto err_tdx_bringup; + /* * Common KVM initialization _must_ come last, after this, /dev/kvm is * exposed to userspace! @@ -196,6 +203,8 @@ static int __init vt_init(void) return 0; err_kvm_init: + tdx_cleanup(); +err_tdx_bringup: vmx_exit(); return r; } diff --git a/arch/x86/kvm/vmx/tdx.c b/arch/x86/kvm/vmx/tdx.c new file mode 100644 index 000000000000..3c089ed3b843 --- /dev/null +++ b/arch/x86/kvm/vmx/tdx.c @@ -0,0 +1,162 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include +#include +#include "capabilities.h" +#include "tdx.h" + +#undef pr_fmt +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +static bool enable_tdx __ro_after_init; +module_param_named(tdx, enable_tdx, bool, 0444); + +static enum cpuhp_state tdx_cpuhp_state; + +static int tdx_online_cpu(unsigned int cpu) +{ + unsigned long flags; + int r; + + /* Sanity check CPU is already in post-VMXON */ + WARN_ON_ONCE(!(cr4_read_shadow() & X86_CR4_VMXE)); + + local_irq_save(flags); + r = tdx_cpu_enable(); + local_irq_restore(flags); + + return r; +} + +static void __do_tdx_cleanup(void) +{ + /* + * Once TDX module is initialized, it cannot be disabled and + * re-initialized again w/o runtime update (which isn't + * supported by kernel). Only need to remove the cpuhp here. + * The TDX host core code tracks TDX status and can handle + * 'multiple enabling' scenario. + */ + WARN_ON_ONCE(!tdx_cpuhp_state); + cpuhp_remove_state_nocalls_cpuslocked(tdx_cpuhp_state); + tdx_cpuhp_state = 0; +} + +static void __tdx_cleanup(void) +{ + cpus_read_lock(); + __do_tdx_cleanup(); + cpus_read_unlock(); +} + +static int __init __do_tdx_bringup(void) +{ + int r; + + /* + * TDX-specific cpuhp callback to call tdx_cpu_enable() on all + * online CPUs before calling tdx_enable(), and on any new + * going-online CPU to make sure it is ready for TDX guest. + */ + r = cpuhp_setup_state_cpuslocked(CPUHP_AP_ONLINE_DYN, + "kvm/cpu/tdx:online", + tdx_online_cpu, NULL); + if (r < 0) + return r; + + tdx_cpuhp_state = r; + + r = tdx_enable(); + if (r) + __do_tdx_cleanup(); + + return r; +} + +static int __init __tdx_bringup(void) +{ + int r; + + /* + * Enabling TDX requires enabling hardware virtualization first, + * as making SEAMCALLs requires CPU being in post-VMXON state. + */ + r = kvm_enable_virtualization(); + if (r) + return r; + + cpus_read_lock(); + r = __do_tdx_bringup(); + cpus_read_unlock(); + + if (r) + goto tdx_bringup_err; + + /* + * Leave hardware virtualization enabled after TDX is enabled + * successfully. TDX CPU hotplug depends on this. + */ + return 0; +tdx_bringup_err: + kvm_disable_virtualization(); + return r; +} + +void tdx_cleanup(void) +{ + if (enable_tdx) { + __tdx_cleanup(); + kvm_disable_virtualization(); + } +} + +int __init tdx_bringup(void) +{ + int r; + + if (!enable_tdx) + return 0; + + if (!cpu_feature_enabled(X86_FEATURE_TDX_HOST_PLATFORM)) { + pr_err("tdx: no TDX private KeyIDs available\n"); + goto success_disable_tdx; + } + + if (!enable_virt_at_load) { + pr_err("tdx: tdx requires kvm.enable_virt_at_load=1\n"); + goto success_disable_tdx; + } + + /* + * Ideally KVM should probe whether TDX module has been loaded + * first and then try to bring it up. But TDX needs to use SEAMCALL + * to probe whether the module is loaded (there is no CPUID or MSR + * for that), and making SEAMCALL requires enabling virtualization + * first, just like the rest steps of bringing up TDX module. + * + * So, for simplicity do everything in __tdx_bringup(); the first + * SEAMCALL will return -ENODEV when the module is not loaded. The + * only complication is having to make sure that initialization + * SEAMCALLs don't return TDX_SEAMCALL_VMFAILINVALID in other + * cases. + */ + r = __tdx_bringup(); + if (r) { + /* + * Disable TDX only but don't fail to load module if + * the TDX module could not be loaded. No need to print + * message saying "module is not loaded" because it was + * printed when the first SEAMCALL failed. + */ + if (r == -ENODEV) + goto success_disable_tdx; + + enable_tdx = 0; + } + + return r; + +success_disable_tdx: + enable_tdx = 0; + return 0; +} diff --git a/arch/x86/kvm/vmx/tdx.h b/arch/x86/kvm/vmx/tdx.h new file mode 100644 index 000000000000..9d4a0e8265bf --- /dev/null +++ b/arch/x86/kvm/vmx/tdx.h @@ -0,0 +1,13 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __KVM_X86_VMX_TDX_H +#define __KVM_X86_VMX_TDX_H + +#ifdef CONFIG_KVM_INTEL_TDX +int tdx_bringup(void); +void tdx_cleanup(void); +#else +static inline int tdx_bringup(void) { return 0; } +static inline void tdx_cleanup(void) {} +#endif + +#endif diff --git a/arch/x86/virt/vmx/tdx/tdx.c b/arch/x86/virt/vmx/tdx/tdx.c index 0122051af6b3..9f0c482c1a03 100644 --- a/arch/x86/virt/vmx/tdx/tdx.c +++ b/arch/x86/virt/vmx/tdx/tdx.c @@ -1462,6 +1462,20 @@ void __init tdx_init(void) check_tdx_erratum(); } +const struct tdx_sys_info *tdx_get_sysinfo(void) +{ + const struct tdx_sys_info *p = NULL; + + /* Make sure all fields in @tdx_sysinfo have been populated */ + mutex_lock(&tdx_module_lock); + if (tdx_module_status == TDX_MODULE_INITIALIZED) + p = (const struct tdx_sys_info *)&tdx_sysinfo; + mutex_unlock(&tdx_module_lock); + + return p; +} +EXPORT_SYMBOL_GPL(tdx_get_sysinfo); + int tdx_guest_keyid_alloc(void) { return ida_alloc_range(&tdx_guest_keyid_pool, tdx_guest_keyid_start, diff --git a/arch/x86/virt/vmx/tdx/tdx.h b/arch/x86/virt/vmx/tdx/tdx.h index 62cb7832c42d..da384387d4eb 100644 --- a/arch/x86/virt/vmx/tdx/tdx.h +++ b/arch/x86/virt/vmx/tdx/tdx.h @@ -3,7 +3,6 @@ #define _X86_VIRT_TDX_H #include -#include "tdx_global_metadata.h" /* * This file contains both macros and data structures defined by the TDX diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 1e75fa114f34..3bfe3140f444 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -2284,6 +2284,7 @@ static inline bool kvm_check_request(int req, struct kvm_vcpu *vcpu) } #ifdef CONFIG_KVM_GENERIC_HARDWARE_ENABLING +extern bool enable_virt_at_load; extern bool kvm_rebooting; #endif diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 6e40383fbe47..622b5a99078a 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -5464,8 +5464,9 @@ static struct miscdevice kvm_dev = { }; #ifdef CONFIG_KVM_GENERIC_HARDWARE_ENABLING -static bool enable_virt_at_load = true; +bool enable_virt_at_load = true; module_param(enable_virt_at_load, bool, 0444); +EXPORT_SYMBOL_GPL(enable_virt_at_load); __visible bool kvm_rebooting; EXPORT_SYMBOL_GPL(kvm_rebooting); -- 2.51.0 From 45154fb010f7fdd99a67ce110f2abd646eaa209b Mon Sep 17 00:00:00 2001 From: Kai Huang Date: Wed, 30 Oct 2024 12:00:15 -0700 Subject: [PATCH 16/16] KVM: TDX: Get TDX global information KVM will need to consult some essential TDX global information to create and run TDX guests. Get the global information after initializing TDX. Signed-off-by: Kai Huang Signed-off-by: Rick Edgecombe Message-ID: <20241030190039.77971-3-rick.p.edgecombe@intel.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/tdx.c | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/arch/x86/kvm/vmx/tdx.c b/arch/x86/kvm/vmx/tdx.c index 3c089ed3b843..b74a50e8d086 100644 --- a/arch/x86/kvm/vmx/tdx.c +++ b/arch/x86/kvm/vmx/tdx.c @@ -13,6 +13,8 @@ module_param_named(tdx, enable_tdx, bool, 0444); static enum cpuhp_state tdx_cpuhp_state; +static const struct tdx_sys_info *tdx_sysinfo; + static int tdx_online_cpu(unsigned int cpu) { unsigned long flags; @@ -92,11 +94,20 @@ static int __init __tdx_bringup(void) if (r) goto tdx_bringup_err; + /* Get TDX global information for later use */ + tdx_sysinfo = tdx_get_sysinfo(); + if (WARN_ON_ONCE(!tdx_sysinfo)) { + r = -EINVAL; + goto get_sysinfo_err; + } + /* * Leave hardware virtualization enabled after TDX is enabled * successfully. TDX CPU hotplug depends on this. */ return 0; +get_sysinfo_err: + __tdx_cleanup(); tdx_bringup_err: kvm_disable_virtualization(); return r; -- 2.51.0