From 385ba3fd8d3d22d907484f9eec27d7906d48a446 Mon Sep 17 00:00:00 2001 From: Isaku Yamahata Date: Tue, 12 Nov 2024 15:36:24 +0800 Subject: [PATCH 01/16] x86/virt/tdx: Add SEAMCALL wrapper tdh_mem_sept_add() to add SEPT pages TDX architecture introduces the concept of private GPA vs shared GPA, depending on the GPA.SHARED bit. The TDX module maintains a Secure EPT (S-EPT or SEPT) tree per TD for private GPA to HPA translation. Wrap the TDH.MEM.SEPT.ADD SEAMCALL with tdh_mem_sept_add() to provide pages to the TDX module for building a TD's SEPT tree. (Refer to these pages as SEPT pages). Callers need to allocate and provide a normal page to tdh_mem_sept_add(), which then passes the page to the TDX module via the SEAMCALL TDH.MEM.SEPT.ADD. The TDX module then installs the page into SEPT tree and encrypts this SEPT page with the TD's guest keyID. The kernel cannot use the SEPT page until after reclaiming it via TDH.MEM.SEPT.REMOVE or TDH.PHYMEM.PAGE.RECLAIM. Before passing the page to the TDX module, tdh_mem_sept_add() performs a CLFLUSH on the page mapped with keyID 0 to ensure that any dirty cache lines don't write back later and clobber TD memory or control structures. Don't worry about the other MK-TME keyIDs because the kernel doesn't use them. The TDX docs specify that this flush is not needed unless the TDX module exposes the CLFLUSH_BEFORE_ALLOC feature bit. Do the CLFLUSH unconditionally for two reasons: make the solution simpler by having a single path that can handle both !CLFLUSH_BEFORE_ALLOC and CLFLUSH_BEFORE_ALLOC cases. Avoid wading into any correctness uncertainty by going with a conservative solution to start. Callers should specify "GPA" and "level" for the TDX module to install the SEPT page at the specified position in the SEPT. Do not include the root page level in "level" since TDH.MEM.SEPT.ADD can only add non-root pages to the SEPT. Ensure "level" is between 1 and 3 for a 4-level SEPT or between 1 and 4 for a 5-level SEPT. Call tdh_mem_sept_add() during the TD's build time or during the TD's runtime. Check for errors from the function return value and retrieve extended error info from the function output parameters. The TDX module has many internal locks. To avoid staying in SEAM mode for too long, SEAMCALLs returns a BUSY error code to the kernel instead of spinning on the locks. Depending on the specific SEAMCALL, the caller may need to handle this error in specific ways (e.g., retry). Therefore, return the SEAMCALL error code directly to the caller. Don't attempt to handle it in the core kernel. TDH.MEM.SEPT.ADD effectively manages two internal resources of the TDX module: it installs page table pages in the SEPT tree and also updates the TDX module's page metadata (PAMT). Don't add a wrapper for the matching SEAMCALL for removing a SEPT page (TDH.MEM.SEPT.REMOVE) because KVM, as the only in-kernel user, will only tear down the SEPT tree when the TD is being torn down. When this happens it can just do other operations that reclaim the SEPT pages for the host kernels to use, update the PAMT and let the SEPT get trashed. 
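For illustration, a minimal usage sketch of the new wrapper (hypothetical caller; the helper name, allocation flags and error policy below are illustrative and not part of this patch):

  static int example_add_sept_page(struct tdx_td *td, u64 gpa, int tdx_level)
  {
          struct page *page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO);
          u64 err, entry, level_state;

          if (!page)
                  return -ENOMEM;

          err = tdh_mem_sept_add(td, gpa, tdx_level, page, &entry, &level_state);
          if (err) {
                  /* e.g. retry on a BUSY error or propagate; policy is the caller's */
                  __free_page(page);
                  return -EIO;
          }

          /* The page now belongs to the TDX module until it is reclaimed. */
          return 0;
  }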
[Kai: Switched from generic seamcall export] [Yan: Re-wrote the changelog] Co-developed-by: Sean Christopherson Signed-off-by: Sean Christopherson Signed-off-by: Isaku Yamahata Signed-off-by: Kai Huang Signed-off-by: Rick Edgecombe Signed-off-by: Yan Zhao Message-ID: <20241112073624.22114-1-yan.y.zhao@intel.com> Acked-by: Dave Hansen Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/tdx.h | 7 ++++++- arch/x86/virt/vmx/tdx/tdx.c | 19 +++++++++++++++++++ arch/x86/virt/vmx/tdx/tdx.h | 1 + 3 files changed, 26 insertions(+), 1 deletion(-) diff --git a/arch/x86/include/asm/tdx.h b/arch/x86/include/asm/tdx.h index 2879fc518a32..c3be3742f347 100644 --- a/arch/x86/include/asm/tdx.h +++ b/arch/x86/include/asm/tdx.h @@ -148,7 +148,6 @@ struct tdx_vp { struct page **tdcx_pages; }; - static inline u64 mk_keyed_paddr(u16 hkid, struct page *page) { u64 ret; @@ -158,10 +157,16 @@ static inline u64 mk_keyed_paddr(u16 hkid, struct page *page) ret |= (u64)hkid << boot_cpu_data.x86_phys_bits; return ret; +} +static inline int pg_level_to_tdx_sept_level(enum pg_level level) +{ + WARN_ON_ONCE(level == PG_LEVEL_NONE); + return level - 1; } u64 tdh_mng_addcx(struct tdx_td *td, struct page *tdcs_page); +u64 tdh_mem_sept_add(struct tdx_td *td, u64 gpa, int level, struct page *page, u64 *ext_err1, u64 *ext_err2); u64 tdh_vp_addcx(struct tdx_vp *vp, struct page *tdcx_page); u64 tdh_mng_key_config(struct tdx_td *td); u64 tdh_mng_create(struct tdx_td *td, u16 hkid); diff --git a/arch/x86/virt/vmx/tdx/tdx.c b/arch/x86/virt/vmx/tdx/tdx.c index 3a272e9ff2ca..506a75fbac0b 100644 --- a/arch/x86/virt/vmx/tdx/tdx.c +++ b/arch/x86/virt/vmx/tdx/tdx.c @@ -1529,6 +1529,25 @@ u64 tdh_mng_addcx(struct tdx_td *td, struct page *tdcs_page) } EXPORT_SYMBOL_GPL(tdh_mng_addcx); +u64 tdh_mem_sept_add(struct tdx_td *td, u64 gpa, int level, struct page *page, u64 *ext_err1, u64 *ext_err2) +{ + struct tdx_module_args args = { + .rcx = gpa | level, + .rdx = tdx_tdr_pa(td), + .r8 = page_to_phys(page), + }; + u64 ret; + + tdx_clflush_page(page); + ret = seamcall_ret(TDH_MEM_SEPT_ADD, &args); + + *ext_err1 = args.rcx; + *ext_err2 = args.rdx; + + return ret; +} +EXPORT_SYMBOL_GPL(tdh_mem_sept_add); + u64 tdh_vp_addcx(struct tdx_vp *vp, struct page *tdcx_page) { struct tdx_module_args args = { diff --git a/arch/x86/virt/vmx/tdx/tdx.h b/arch/x86/virt/vmx/tdx/tdx.h index da384387d4eb..f3e37df4c63a 100644 --- a/arch/x86/virt/vmx/tdx/tdx.h +++ b/arch/x86/virt/vmx/tdx/tdx.h @@ -15,6 +15,7 @@ * TDX module SEAMCALL leaf functions */ #define TDH_MNG_ADDCX 1 +#define TDH_MEM_SEPT_ADD 3 #define TDH_VP_ADDCX 4 #define TDH_MNG_KEY_CONFIG 8 #define TDH_MNG_CREATE 9 -- 2.51.0 From 94c477a751c7b835f52e9171f1266027dd06aa1d Mon Sep 17 00:00:00 2001 From: Isaku Yamahata Date: Tue, 12 Nov 2024 15:36:36 +0800 Subject: [PATCH 02/16] x86/virt/tdx: Add SEAMCALL wrappers to add TD private pages TDX architecture introduces the concept of private GPA vs shared GPA, depending on the GPA.SHARED bit. The TDX module maintains a Secure EPT (S-EPT or SEPT) tree per TD to translate TD's private memory accessed using a private GPA. Wrap the SEAMCALL TDH.MEM.PAGE.ADD with tdh_mem_page_add() and TDH.MEM.PAGE.AUG with tdh_mem_page_aug() to add TD private pages and map them to the TD's private GPAs in the SEPT. Callers of tdh_mem_page_add() and tdh_mem_page_aug() allocate and provide normal pages to the wrappers, who further pass those pages to the TDX module. 
Before passing the pages to the TDX module, tdh_mem_page_add() and tdh_mem_page_aug() perform a CLFLUSH on the page mapped with keyID 0 to ensure that any dirty cache lines don't write back later and clobber TD memory or control structures. Don't worry about the other MK-TME keyIDs because the kernel doesn't use them. The TDX docs specify that this flush is not needed unless the TDX module exposes the CLFLUSH_BEFORE_ALLOC feature bit. Do the CLFLUSH unconditionally for two reasons: make the solution simpler by having a single path that can handle both !CLFLUSH_BEFORE_ALLOC and CLFLUSH_BEFORE_ALLOC cases. Avoid wading into any correctness uncertainty by going with a conservative solution to start. Call tdh_mem_page_add() to add a private page to a TD during the TD's build time (i.e., before TDH.MR.FINALIZE). Specify which GPA the 4K private page will map to. No need to specify level info since TDH.MEM.PAGE.ADD only adds pages at 4K level. To provide initial contents to TD, provide an additional source page residing in memory managed by the host kernel itself (encrypted with a shared keyID). The TDX module will copy the initial contents from the source page in shared memory into the private page after mapping the page in the SEPT to the specified private GPA. The TDX module allows the source page to be the same page as the private page to be added. In that case, the TDX module converts and encrypts the source page as a TD private page. Call tdh_mem_page_aug() to add a private page to a TD during the TD's runtime (i.e., after TDH.MR.FINALIZE). TDH.MEM.PAGE.AUG supports adding huge pages. Specify which GPA the private page will map to, along with level info embedded in the lower bits of the GPA. The TDX module will recognize the added page as the TD's private page after the TD's acceptance with TDCALL TDG.MEM.PAGE.ACCEPT. tdh_mem_page_add() and tdh_mem_page_aug() may fail. Callers can check function return value and retrieve extended error info from the function output parameters. The TDX module has many internal locks. To avoid staying in SEAM mode for too long, SEAMCALLs returns a BUSY error code to the kernel instead of spinning on the locks. Depending on the specific SEAMCALL, the caller may need to handle this error in specific ways (e.g., retry). Therefore, return the SEAMCALL error code directly to the caller. Don't attempt to handle it in the core kernel. 
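As an illustration of the two entry points, a hypothetical caller might look like the sketch below (the helper name and the "td_finalized" flag are made up for the sketch; only tdh_mem_page_add() and tdh_mem_page_aug() are added by this patch):

  static u64 example_map_private_page(struct tdx_td *td, u64 gpa, int tdx_level,
                                      struct page *page, struct page *src,
                                      bool td_finalized)
  {
          u64 entry, level_state;

          if (!td_finalized)
                  /* Build time: 4K pages only, initial contents copied from @src. */
                  return tdh_mem_page_add(td, gpa, page, src, &entry, &level_state);

          /* Runtime: the wrapper folds @tdx_level into the GPA operand. */
          return tdh_mem_page_aug(td, gpa, tdx_level, page, &entry, &level_state);
  }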
[Kai: Switched from generic seamcall export] [Yan: Re-wrote the changelog] Co-developed-by: Sean Christopherson Signed-off-by: Sean Christopherson Signed-off-by: Isaku Yamahata Signed-off-by: Kai Huang Signed-off-by: Rick Edgecombe Signed-off-by: Yan Zhao Message-ID: <20241112073636.22129-1-yan.y.zhao@intel.com> Acked-by: Dave Hansen Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/tdx.h | 2 ++ arch/x86/virt/vmx/tdx/tdx.c | 39 +++++++++++++++++++++++++++++++++++++ arch/x86/virt/vmx/tdx/tdx.h | 2 ++ 3 files changed, 43 insertions(+) diff --git a/arch/x86/include/asm/tdx.h b/arch/x86/include/asm/tdx.h index c3be3742f347..af1c6036b214 100644 --- a/arch/x86/include/asm/tdx.h +++ b/arch/x86/include/asm/tdx.h @@ -166,8 +166,10 @@ static inline int pg_level_to_tdx_sept_level(enum pg_level level) } u64 tdh_mng_addcx(struct tdx_td *td, struct page *tdcs_page); +u64 tdh_mem_page_add(struct tdx_td *td, u64 gpa, struct page *page, struct page *source, u64 *ext_err1, u64 *ext_err2); u64 tdh_mem_sept_add(struct tdx_td *td, u64 gpa, int level, struct page *page, u64 *ext_err1, u64 *ext_err2); u64 tdh_vp_addcx(struct tdx_vp *vp, struct page *tdcx_page); +u64 tdh_mem_page_aug(struct tdx_td *td, u64 gpa, int level, struct page *page, u64 *ext_err1, u64 *ext_err2); u64 tdh_mng_key_config(struct tdx_td *td); u64 tdh_mng_create(struct tdx_td *td, u16 hkid); u64 tdh_vp_create(struct tdx_td *td, struct tdx_vp *vp); diff --git a/arch/x86/virt/vmx/tdx/tdx.c b/arch/x86/virt/vmx/tdx/tdx.c index 506a75fbac0b..4ae10246260e 100644 --- a/arch/x86/virt/vmx/tdx/tdx.c +++ b/arch/x86/virt/vmx/tdx/tdx.c @@ -1529,6 +1529,26 @@ u64 tdh_mng_addcx(struct tdx_td *td, struct page *tdcs_page) } EXPORT_SYMBOL_GPL(tdh_mng_addcx); +u64 tdh_mem_page_add(struct tdx_td *td, u64 gpa, struct page *page, struct page *source, u64 *ext_err1, u64 *ext_err2) +{ + struct tdx_module_args args = { + .rcx = gpa, + .rdx = tdx_tdr_pa(td), + .r8 = page_to_phys(page), + .r9 = page_to_phys(source), + }; + u64 ret; + + tdx_clflush_page(page); + ret = seamcall_ret(TDH_MEM_PAGE_ADD, &args); + + *ext_err1 = args.rcx; + *ext_err2 = args.rdx; + + return ret; +} +EXPORT_SYMBOL_GPL(tdh_mem_page_add); + u64 tdh_mem_sept_add(struct tdx_td *td, u64 gpa, int level, struct page *page, u64 *ext_err1, u64 *ext_err2) { struct tdx_module_args args = { @@ -1560,6 +1580,25 @@ u64 tdh_vp_addcx(struct tdx_vp *vp, struct page *tdcx_page) } EXPORT_SYMBOL_GPL(tdh_vp_addcx); +u64 tdh_mem_page_aug(struct tdx_td *td, u64 gpa, int level, struct page *page, u64 *ext_err1, u64 *ext_err2) +{ + struct tdx_module_args args = { + .rcx = gpa | level, + .rdx = tdx_tdr_pa(td), + .r8 = page_to_phys(page), + }; + u64 ret; + + tdx_clflush_page(page); + ret = seamcall_ret(TDH_MEM_PAGE_AUG, &args); + + *ext_err1 = args.rcx; + *ext_err2 = args.rdx; + + return ret; +} +EXPORT_SYMBOL_GPL(tdh_mem_page_aug); + u64 tdh_mng_key_config(struct tdx_td *td) { struct tdx_module_args args = { diff --git a/arch/x86/virt/vmx/tdx/tdx.h b/arch/x86/virt/vmx/tdx/tdx.h index f3e37df4c63a..5879bdb0045f 100644 --- a/arch/x86/virt/vmx/tdx/tdx.h +++ b/arch/x86/virt/vmx/tdx/tdx.h @@ -15,8 +15,10 @@ * TDX module SEAMCALL leaf functions */ #define TDH_MNG_ADDCX 1 +#define TDH_MEM_PAGE_ADD 2 #define TDH_MEM_SEPT_ADD 3 #define TDH_VP_ADDCX 4 +#define TDH_MEM_PAGE_AUG 6 #define TDH_MNG_KEY_CONFIG 8 #define TDH_MNG_CREATE 9 #define TDH_MNG_RD 11 -- 2.51.0 From ee4884eb84dc876d1474652acc1351495921b45c Mon Sep 17 00:00:00 2001 From: Isaku Yamahata Date: Tue, 14 Jan 2025 16:44:26 -0500 Subject: [PATCH 03/16] x86/virt/tdx: 
Add SEAMCALL wrappers to manage TDX TLB tracking TDX module defines a TLB tracking protocol to make sure that no logical processor holds any stale Secure EPT (S-EPT or SEPT) TLB translations for a given TD private GPA range. After a successful TDH.MEM.RANGE.BLOCK, TDH.MEM.TRACK, and kicking off all vCPUs, TDX module ensures that the subsequent TDH.VP.ENTER on each vCPU will flush all stale TLB entries for the specified GPA ranges in TDH.MEM.RANGE.BLOCK. Wrap the TDH.MEM.RANGE.BLOCK with tdh_mem_range_block() and TDH.MEM.TRACK with tdh_mem_track() to enable the kernel to assist the TDX module in TLB tracking management. The caller of tdh_mem_range_block() needs to specify "GPA" and "level" to request the TDX module to block the subsequent creation of TLB translation for a GPA range. This GPA range can correspond to a SEPT page or a TD private page at any level. Contentions and errors are possible with the SEAMCALL TDH.MEM.RANGE.BLOCK. Therefore, the caller of tdh_mem_range_block() needs to check the function return value and retrieve extended error info from the function output params. Upon TDH.MEM.RANGE.BLOCK success, no new TLB entries will be created for the specified private GPA range, though the existing TLB translations may still persist. TDH.MEM.TRACK will then advance the TD's epoch counter to ensure TDX module will flush TLBs in all vCPUs once the vCPUs re-enter the TD. TDH.MEM.TRACK will fail to advance TD's epoch counter if there are vCPUs still running in non-root mode at the previous TD epoch counter. So to ensure private GPA translations are flushed, callers must first call tdh_mem_range_block(), then tdh_mem_track(), and lastly send IPIs to kick all the vCPUs and force them to re-enter, thus triggering the TLB flush. Don't export a single operation and instead export functions that just expose the block and track operations; this is for a couple reasons: 1. The vCPU kick should use KVM's functionality for doing this, which can better target sending IPIs to only the minimum required pCPUs. 2. tdh_mem_track() doesn't need to be executed if a vCPU has not entered a TD, which is information only KVM knows. 3. Leaving the operations separate will allow for batching many tdh_mem_range_block() calls before a tdh_mem_track(). While this batching will not be done initially by KVM, it demonstrates that keeping mem block and track as separate operations is a generally good design. Contentions are also possible in TDH.MEM.TRACK. For example, TDH.MEM.TRACK may contend with TDH.VP.ENTER when advancing the TD epoch counter. tdh_mem_track() does not provide the retries for the caller. Callers can choose to avoid contentions or retry on their own. 
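A sketch of the resulting block/track/kick ordering, assuming KVM's kvm_make_all_cpus_request() with KVM_REQ_OUTSIDE_GUEST_MODE as the vCPU kick mechanism (the helper name and error policy are illustrative only):

  static u64 example_zap_private_range(struct kvm *kvm, struct tdx_td *td,
                                       u64 gpa, int tdx_level)
  {
          u64 entry, level_state, err;

          /* 1) No new S-EPT translations may be created for the range. */
          err = tdh_mem_range_block(td, gpa, tdx_level, &entry, &level_state);
          if (err)
                  return err;

          /* 2) Advance the TD epoch; may be batched over many blocks. */
          err = tdh_mem_track(td);
          if (err)
                  return err;

          /* 3) Kick vCPUs out of the TD; stale TLB entries are flushed on re-entry. */
          kvm_make_all_cpus_request(kvm, KVM_REQ_OUTSIDE_GUEST_MODE);
          return 0;
  }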
[Kai: Switched from generic seamcall export] [Yan: Re-wrote the changelog] Co-developed-by: Sean Christopherson Signed-off-by: Sean Christopherson Signed-off-by: Isaku Yamahata Signed-off-by: Kai Huang Signed-off-by: Rick Edgecombe Signed-off-by: Yan Zhao Message-ID: <20241112073648.22143-1-yan.y.zhao@intel.com> Acked-by: Dave Hansen Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/tdx.h | 2 ++ arch/x86/virt/vmx/tdx/tdx.c | 27 +++++++++++++++++++++++++++ arch/x86/virt/vmx/tdx/tdx.h | 2 ++ 3 files changed, 31 insertions(+) diff --git a/arch/x86/include/asm/tdx.h b/arch/x86/include/asm/tdx.h index af1c6036b214..b79860b5e50f 100644 --- a/arch/x86/include/asm/tdx.h +++ b/arch/x86/include/asm/tdx.h @@ -170,6 +170,7 @@ u64 tdh_mem_page_add(struct tdx_td *td, u64 gpa, struct page *page, struct page u64 tdh_mem_sept_add(struct tdx_td *td, u64 gpa, int level, struct page *page, u64 *ext_err1, u64 *ext_err2); u64 tdh_vp_addcx(struct tdx_vp *vp, struct page *tdcx_page); u64 tdh_mem_page_aug(struct tdx_td *td, u64 gpa, int level, struct page *page, u64 *ext_err1, u64 *ext_err2); +u64 tdh_mem_range_block(struct tdx_td *td, u64 gpa, int level, u64 *ext_err1, u64 *ext_err2); u64 tdh_mng_key_config(struct tdx_td *td); u64 tdh_mng_create(struct tdx_td *td, u16 hkid); u64 tdh_vp_create(struct tdx_td *td, struct tdx_vp *vp); @@ -182,6 +183,7 @@ u64 tdh_vp_init(struct tdx_vp *vp, u64 initial_rcx, u32 x2apicid); u64 tdh_vp_rd(struct tdx_vp *vp, u64 field, u64 *data); u64 tdh_vp_wr(struct tdx_vp *vp, u64 field, u64 data, u64 mask); u64 tdh_phymem_page_reclaim(struct page *page, u64 *tdx_pt, u64 *tdx_owner, u64 *tdx_size); +u64 tdh_mem_track(struct tdx_td *tdr); u64 tdh_phymem_cache_wb(bool resume); u64 tdh_phymem_page_wbinvd_tdr(struct tdx_td *td); #else diff --git a/arch/x86/virt/vmx/tdx/tdx.c b/arch/x86/virt/vmx/tdx/tdx.c index 4ae10246260e..f920754bb35e 100644 --- a/arch/x86/virt/vmx/tdx/tdx.c +++ b/arch/x86/virt/vmx/tdx/tdx.c @@ -1599,6 +1599,23 @@ u64 tdh_mem_page_aug(struct tdx_td *td, u64 gpa, int level, struct page *page, u } EXPORT_SYMBOL_GPL(tdh_mem_page_aug); +u64 tdh_mem_range_block(struct tdx_td *td, u64 gpa, int level, u64 *ext_err1, u64 *ext_err2) +{ + struct tdx_module_args args = { + .rcx = gpa | level, + .rdx = tdx_tdr_pa(td), + }; + u64 ret; + + ret = seamcall_ret(TDH_MEM_RANGE_BLOCK, &args); + + *ext_err1 = args.rcx; + *ext_err2 = args.rdx; + + return ret; +} +EXPORT_SYMBOL_GPL(tdh_mem_range_block); + u64 tdh_mng_key_config(struct tdx_td *td) { struct tdx_module_args args = { @@ -1761,6 +1778,16 @@ u64 tdh_phymem_page_reclaim(struct page *page, u64 *tdx_pt, u64 *tdx_owner, u64 } EXPORT_SYMBOL_GPL(tdh_phymem_page_reclaim); +u64 tdh_mem_track(struct tdx_td *td) +{ + struct tdx_module_args args = { + .rcx = tdx_tdr_pa(td), + }; + + return seamcall(TDH_MEM_TRACK, &args); +} +EXPORT_SYMBOL_GPL(tdh_mem_track); + u64 tdh_phymem_cache_wb(bool resume) { struct tdx_module_args args = { diff --git a/arch/x86/virt/vmx/tdx/tdx.h b/arch/x86/virt/vmx/tdx/tdx.h index 5879bdb0045f..104c97abf264 100644 --- a/arch/x86/virt/vmx/tdx/tdx.h +++ b/arch/x86/virt/vmx/tdx/tdx.h @@ -19,6 +19,7 @@ #define TDH_MEM_SEPT_ADD 3 #define TDH_VP_ADDCX 4 #define TDH_MEM_PAGE_AUG 6 +#define TDH_MEM_RANGE_BLOCK 7 #define TDH_MNG_KEY_CONFIG 8 #define TDH_MNG_CREATE 9 #define TDH_MNG_RD 11 @@ -36,6 +37,7 @@ #define TDH_SYS_RD 34 #define TDH_SYS_LP_INIT 35 #define TDH_SYS_TDMR_INIT 36 +#define TDH_MEM_TRACK 38 #define TDH_PHYMEM_CACHE_WB 40 #define TDH_PHYMEM_PAGE_WBINVD 41 #define TDH_VP_WR 43 -- 2.51.0 From 
206e7860e754e184cc35a535bb0f2136cf57f376 Mon Sep 17 00:00:00 2001 From: Isaku Yamahata Date: Tue, 12 Nov 2024 15:36:58 +0800 Subject: [PATCH 04/16] x86/virt/tdx: Add SEAMCALL wrappers to remove a TD private page TDX architecture introduces the concept of private GPA vs shared GPA, depending on the GPA.SHARED bit. The TDX module maintains a single Secure EPT (S-EPT or SEPT) tree per TD to translate TD's private memory accessed using a private GPA. Wrap the SEAMCALL TDH.MEM.PAGE.REMOVE with tdh_mem_page_remove() and TDH_PHYMEM_PAGE_WBINVD with tdh_phymem_page_wbinvd_hkid() to unmap a TD private page from the SEPT, remove the TD private page from the TDX module and flush cache lines to memory after removal of the private page. Callers should specify "GPA" and "level" when calling tdh_mem_page_remove() to indicate to the TDX module which TD private page to unmap and remove. TDH.MEM.PAGE.REMOVE may fail, and the caller of tdh_mem_page_remove() can check the function return value and retrieve extended error information from the function output parameters. Follow the TLB tracking protocol before calling tdh_mem_page_remove() to remove a TD private page to avoid SEAMCALL failure. After removing a TD's private page, the TDX module does not write back and invalidate cache lines associated with the page and the page's keyID (i.e., the TD's guest keyID). Therefore, provide tdh_phymem_page_wbinvd_hkid() to allow the caller to pass in the TD's guest keyID and invoke TDH_PHYMEM_PAGE_WBINVD to perform this action. Before reusing the page, the host kernel needs to map the page with keyID 0 and invoke movdir64b() to convert the TD private page to a normal shared page. TDH.MEM.PAGE.REMOVE and TDH_PHYMEM_PAGE_WBINVD may meet contentions inside the TDX module for TDX's internal resources. To avoid staying in SEAM mode for too long, TDX module will return a BUSY error code to the kernel instead of spinning on the locks. The caller may need to handle this error in specific ways (e.g., retry). The wrappers return the SEAMCALL error code directly to the caller. Don't attempt to handle it in the core kernel. 
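For illustration, a hypothetical removal sequence (the helper name is made up; the final movdir64b() conversion step is only indicated in a comment):

  static u64 example_remove_private_page(struct tdx_td *td, u16 guest_hkid,
                                         u64 gpa, u64 tdx_level, struct page *page)
  {
          u64 entry, level_state, err;

          /* TLB tracking (BLOCK + TRACK + vCPU kick) must already be done. */
          err = tdh_mem_page_remove(td, gpa, tdx_level, &entry, &level_state);
          if (err)
                  return err;

          /* Flush cache lines written with the TD's guest keyID. */
          err = tdh_phymem_page_wbinvd_hkid(guest_hkid, page);
          if (err)
                  return err;

          /*
           * Before reuse, the host maps the page with keyID 0 and clears it
           * with movdir64b() to convert it back to a normal page (not shown).
           */
          return 0;
  }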
[Kai: Switched from generic seamcall export] [Yan: Re-wrote the changelog] Co-developed-by: Sean Christopherson Signed-off-by: Sean Christopherson Signed-off-by: Isaku Yamahata Signed-off-by: Kai Huang Signed-off-by: Rick Edgecombe Signed-off-by: Yan Zhao Message-ID: <20241112073658.22157-1-yan.y.zhao@intel.com> Acked-by: Dave Hansen Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/tdx.h | 2 ++ arch/x86/virt/vmx/tdx/tdx.c | 27 +++++++++++++++++++++++++++ arch/x86/virt/vmx/tdx/tdx.h | 1 + 3 files changed, 30 insertions(+) diff --git a/arch/x86/include/asm/tdx.h b/arch/x86/include/asm/tdx.h index b79860b5e50f..9d98c949db0a 100644 --- a/arch/x86/include/asm/tdx.h +++ b/arch/x86/include/asm/tdx.h @@ -184,8 +184,10 @@ u64 tdh_vp_rd(struct tdx_vp *vp, u64 field, u64 *data); u64 tdh_vp_wr(struct tdx_vp *vp, u64 field, u64 data, u64 mask); u64 tdh_phymem_page_reclaim(struct page *page, u64 *tdx_pt, u64 *tdx_owner, u64 *tdx_size); u64 tdh_mem_track(struct tdx_td *tdr); +u64 tdh_mem_page_remove(struct tdx_td *td, u64 gpa, u64 level, u64 *ext_err1, u64 *ext_err2); u64 tdh_phymem_cache_wb(bool resume); u64 tdh_phymem_page_wbinvd_tdr(struct tdx_td *td); +u64 tdh_phymem_page_wbinvd_hkid(u64 hkid, struct page *page); #else static inline void tdx_init(void) { } static inline int tdx_cpu_enable(void) { return -ENODEV; } diff --git a/arch/x86/virt/vmx/tdx/tdx.c b/arch/x86/virt/vmx/tdx/tdx.c index f920754bb35e..ffa0f6b8254d 100644 --- a/arch/x86/virt/vmx/tdx/tdx.c +++ b/arch/x86/virt/vmx/tdx/tdx.c @@ -1788,6 +1788,23 @@ u64 tdh_mem_track(struct tdx_td *td) } EXPORT_SYMBOL_GPL(tdh_mem_track); +u64 tdh_mem_page_remove(struct tdx_td *td, u64 gpa, u64 level, u64 *ext_err1, u64 *ext_err2) +{ + struct tdx_module_args args = { + .rcx = gpa | level, + .rdx = tdx_tdr_pa(td), + }; + u64 ret; + + ret = seamcall_ret(TDH_MEM_PAGE_REMOVE, &args); + + *ext_err1 = args.rcx; + *ext_err2 = args.rdx; + + return ret; +} +EXPORT_SYMBOL_GPL(tdh_mem_page_remove); + u64 tdh_phymem_cache_wb(bool resume) { struct tdx_module_args args = { @@ -1807,3 +1824,13 @@ u64 tdh_phymem_page_wbinvd_tdr(struct tdx_td *td) return seamcall(TDH_PHYMEM_PAGE_WBINVD, &args); } EXPORT_SYMBOL_GPL(tdh_phymem_page_wbinvd_tdr); + +u64 tdh_phymem_page_wbinvd_hkid(u64 hkid, struct page *page) +{ + struct tdx_module_args args = {}; + + args.rcx = mk_keyed_paddr(hkid, page); + + return seamcall(TDH_PHYMEM_PAGE_WBINVD, &args); +} +EXPORT_SYMBOL_GPL(tdh_phymem_page_wbinvd_hkid); diff --git a/arch/x86/virt/vmx/tdx/tdx.h b/arch/x86/virt/vmx/tdx/tdx.h index 104c97abf264..c9bea023f2b4 100644 --- a/arch/x86/virt/vmx/tdx/tdx.h +++ b/arch/x86/virt/vmx/tdx/tdx.h @@ -32,6 +32,7 @@ #define TDH_PHYMEM_PAGE_RDMD 24 #define TDH_VP_RD 26 #define TDH_PHYMEM_PAGE_RECLAIM 28 +#define TDH_MEM_PAGE_REMOVE 29 #define TDH_SYS_KEY_CONFIG 31 #define TDH_SYS_INIT 33 #define TDH_SYS_RD 34 -- 2.51.0 From 099d7e9bea82f442950dff57ee3aa5a5599a361b Mon Sep 17 00:00:00 2001 From: Isaku Yamahata Date: Tue, 12 Nov 2024 15:37:08 +0800 Subject: [PATCH 05/16] x86/virt/tdx: Add SEAMCALL wrappers for TD measurement of initial contents The TDX module measures the TD during the build process and saves the measurement in TDCS.MRTD to facilitate TD attestation of the initial contents of the TD. Wrap the SEAMCALL TDH.MR.EXTEND with tdh_mr_extend() and TDH.MR.FINALIZE with tdh_mr_finalize() to enable the host kernel to assist the TDX module in performing the measurement. The measurement in TDCS.MRTD is a SHA-384 digest of the build process. 
SEAMCALLs TDH.MNG.INIT and TDH.MEM.PAGE.ADD initialize and contribute to the MRTD digest calculation. The caller of tdh_mr_extend() should break the TD private page into chunks of size TDX_EXTENDMR_CHUNKSIZE and invoke tdh_mr_extend() to add the page content into the digest calculation. Failures are possible with TDH.MR.EXTEND (e.g., due to SEPT walking). The caller of tdh_mr_extend() can check the function return value and retrieve extended error information from the function output parameters. Calling tdh_mr_finalize() completes the measurement. The TDX module then turns the TD into the runnable state. Further TDH.MEM.PAGE.ADD and TDH.MR.EXTEND calls will fail. TDH.MR.FINALIZE may fail due to errors such as the TD having no vCPUs or contentions. Check function return value when calling tdh_mr_finalize() to determine the exact reason for failure. Take proper locks on the caller's side to avoid contention failures, or handle the BUSY error in specific ways (e.g., retry). Return the SEAMCALL error code directly to the caller. Do not attempt to handle it in the core kernel. [Kai: Switched from generic seamcall export] [Yan: Re-wrote the changelog] Co-developed-by: Sean Christopherson Signed-off-by: Sean Christopherson Signed-off-by: Isaku Yamahata Signed-off-by: Kai Huang Signed-off-by: Rick Edgecombe Signed-off-by: Yan Zhao Message-ID: <20241112073709.22171-1-yan.y.zhao@intel.com> Acked-by: Dave Hansen Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/tdx.h | 2 ++ arch/x86/virt/vmx/tdx/tdx.c | 27 +++++++++++++++++++++++++++ arch/x86/virt/vmx/tdx/tdx.h | 2 ++ 3 files changed, 31 insertions(+) diff --git a/arch/x86/include/asm/tdx.h b/arch/x86/include/asm/tdx.h index 9d98c949db0a..e38e7558c02f 100644 --- a/arch/x86/include/asm/tdx.h +++ b/arch/x86/include/asm/tdx.h @@ -175,6 +175,8 @@ u64 tdh_mng_key_config(struct tdx_td *td); u64 tdh_mng_create(struct tdx_td *td, u16 hkid); u64 tdh_vp_create(struct tdx_td *td, struct tdx_vp *vp); u64 tdh_mng_rd(struct tdx_td *td, u64 field, u64 *data); +u64 tdh_mr_extend(struct tdx_td *td, u64 gpa, u64 *ext_err1, u64 *ext_err2); +u64 tdh_mr_finalize(struct tdx_td *td); u64 tdh_vp_flush(struct tdx_vp *vp); u64 tdh_mng_vpflushdone(struct tdx_td *td); u64 tdh_mng_key_freeid(struct tdx_td *td); diff --git a/arch/x86/virt/vmx/tdx/tdx.c b/arch/x86/virt/vmx/tdx/tdx.c index ffa0f6b8254d..8eaaf31a4187 100644 --- a/arch/x86/virt/vmx/tdx/tdx.c +++ b/arch/x86/virt/vmx/tdx/tdx.c @@ -1667,6 +1667,33 @@ u64 tdh_mng_rd(struct tdx_td *td, u64 field, u64 *data) } EXPORT_SYMBOL_GPL(tdh_mng_rd); +u64 tdh_mr_extend(struct tdx_td *td, u64 gpa, u64 *ext_err1, u64 *ext_err2) +{ + struct tdx_module_args args = { + .rcx = gpa, + .rdx = tdx_tdr_pa(td), + }; + u64 ret; + + ret = seamcall_ret(TDH_MR_EXTEND, &args); + + *ext_err1 = args.rcx; + *ext_err2 = args.rdx; + + return ret; +} +EXPORT_SYMBOL_GPL(tdh_mr_extend); + +u64 tdh_mr_finalize(struct tdx_td *td) +{ + struct tdx_module_args args = { + .rcx = tdx_tdr_pa(td), + }; + + return seamcall(TDH_MR_FINALIZE, &args); +} +EXPORT_SYMBOL_GPL(tdh_mr_finalize); + u64 tdh_vp_flush(struct tdx_vp *vp) { struct tdx_module_args args = { diff --git a/arch/x86/virt/vmx/tdx/tdx.h b/arch/x86/virt/vmx/tdx/tdx.h index c9bea023f2b4..ed7152f81a6d 100644 --- a/arch/x86/virt/vmx/tdx/tdx.h +++ b/arch/x86/virt/vmx/tdx/tdx.h @@ -23,6 +23,8 @@ #define TDH_MNG_KEY_CONFIG 8 #define TDH_MNG_CREATE 9 #define TDH_MNG_RD 11 +#define TDH_MR_EXTEND 16 +#define TDH_MR_FINALIZE 17 #define TDH_VP_FLUSH 18 #define TDH_MNG_VPFLUSHDONE 19 #define TDH_VP_CREATE 10 -- 
2.51.0 From ae80c7d66c5f18c7d86629a2fb47765c6917f3e5 Mon Sep 17 00:00:00 2001 From: Rick Edgecombe Date: Tue, 12 Nov 2024 15:34:26 +0800 Subject: [PATCH 06/16] KVM: x86/mmu: Implement memslot deletion for TDX Update attr_filter field to zap both private and shared mappings for TDX when memslot is deleted. Signed-off-by: Rick Edgecombe Co-developed-by: Yan Zhao Signed-off-by: Yan Zhao Message-ID: <20241112073426.21997-1-yan.y.zhao@intel.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/mmu.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index a79bc15f31c6..42712cc961bf 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -7072,6 +7072,7 @@ static void kvm_mmu_zap_memslot(struct kvm *kvm, .start = slot->base_gfn, .end = slot->base_gfn + slot->npages, .may_block = true, + .attr_filter = KVM_FILTER_PRIVATE | KVM_FILTER_SHARED, }; bool flush; -- 2.51.0 From 2608f105760115e94a03efd9f12f8fbfd1f9af4b Mon Sep 17 00:00:00 2001 From: Isaku Yamahata Date: Tue, 12 Nov 2024 15:34:57 +0800 Subject: [PATCH 07/16] KVM: x86/tdp_mmu: Add a helper function to walk down the TDP MMU Export a function to walk down the TDP without modifying it and simply check if a GPA is mapped. Future changes will support pre-populating TDX private memory. In order to implement this KVM will need to check if a given GFN is already pre-populated in the mirrored EPT. [1] There is already a TDP MMU walker, kvm_tdp_mmu_get_walk() for use within the KVM MMU that almost does what is required. However, to make sense of the results, MMU internal PTE helpers are needed. Refactor the code to provide a helper that can be used outside of the KVM MMU code. Refactoring the KVM page fault handler to support this lookup usage was also considered, but it was an awkward fit. kvm_tdp_mmu_gpa_is_mapped() is based on a diff by Paolo Bonzini. 
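For illustration, a minimal lookup under mmu_lock might look like the sketch below (the helper name is made up; only kvm_tdp_mmu_gpa_is_mapped() is added here, and it picks the direct or mirror root based on the GPA):

  static bool example_gpa_already_mapped(struct kvm_vcpu *vcpu, gpa_t gpa)
  {
          bool mapped;

          read_lock(&vcpu->kvm->mmu_lock);
          mapped = kvm_tdp_mmu_gpa_is_mapped(vcpu, gpa);
          read_unlock(&vcpu->kvm->mmu_lock);

          return mapped;
  }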
Link: https://lore.kernel.org/kvm/ZfBkle1eZFfjPI8l@google.com/ [1] Signed-off-by: Isaku Yamahata Co-developed-by: Rick Edgecombe Signed-off-by: Rick Edgecombe Signed-off-by: Yan Zhao Reviewed-by: Paolo Bonzini Message-ID: <20241112073457.22011-1-yan.y.zhao@intel.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu.h | 3 +++ arch/x86/kvm/mmu/mmu.c | 3 +-- arch/x86/kvm/mmu/tdp_mmu.c | 37 ++++++++++++++++++++++++++++++++----- 3 files changed, 36 insertions(+), 7 deletions(-) diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h index 050a0e229a4d..8d0fca4b4a50 100644 --- a/arch/x86/kvm/mmu.h +++ b/arch/x86/kvm/mmu.h @@ -253,6 +253,9 @@ extern bool tdp_mmu_enabled; #define tdp_mmu_enabled false #endif +bool kvm_tdp_mmu_gpa_is_mapped(struct kvm_vcpu *vcpu, u64 gpa); +int kvm_tdp_map_page(struct kvm_vcpu *vcpu, gpa_t gpa, u64 error_code, u8 *level); + static inline bool kvm_memslots_have_rmaps(struct kvm *kvm) { return !tdp_mmu_enabled || kvm_shadow_root_allocated(kvm); diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 42712cc961bf..ec086f831e69 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -4685,8 +4685,7 @@ int kvm_tdp_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) return direct_page_fault(vcpu, fault); } -static int kvm_tdp_map_page(struct kvm_vcpu *vcpu, gpa_t gpa, u64 error_code, - u8 *level) +int kvm_tdp_map_page(struct kvm_vcpu *vcpu, gpa_t gpa, u64 error_code, u8 *level) { int r; diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c index 046b6ba31197..22675a5746d0 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.c +++ b/arch/x86/kvm/mmu/tdp_mmu.c @@ -1894,16 +1894,13 @@ bool kvm_tdp_mmu_write_protect_gfn(struct kvm *kvm, * * Must be called between kvm_tdp_mmu_walk_lockless_{begin,end}. */ -int kvm_tdp_mmu_get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes, - int *root_level) +static int __kvm_tdp_mmu_get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes, + struct kvm_mmu_page *root) { - struct kvm_mmu_page *root = root_to_sp(vcpu->arch.mmu->root.hpa); struct tdp_iter iter; gfn_t gfn = addr >> PAGE_SHIFT; int leaf = -1; - *root_level = vcpu->arch.mmu->root_role.level; - tdp_mmu_for_each_pte(iter, vcpu->kvm, root, gfn, gfn + 1) { leaf = iter.level; sptes[leaf] = iter.old_spte; @@ -1912,6 +1909,36 @@ int kvm_tdp_mmu_get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes, return leaf; } +int kvm_tdp_mmu_get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes, + int *root_level) +{ + struct kvm_mmu_page *root = root_to_sp(vcpu->arch.mmu->root.hpa); + *root_level = vcpu->arch.mmu->root_role.level; + + return __kvm_tdp_mmu_get_walk(vcpu, addr, sptes, root); +} + +bool kvm_tdp_mmu_gpa_is_mapped(struct kvm_vcpu *vcpu, u64 gpa) +{ + struct kvm *kvm = vcpu->kvm; + bool is_direct = kvm_is_addr_direct(kvm, gpa); + hpa_t root = is_direct ? vcpu->arch.mmu->root.hpa : + vcpu->arch.mmu->mirror_root_hpa; + u64 sptes[PT64_ROOT_MAX_LEVEL + 1], spte; + int leaf; + + lockdep_assert_held(&kvm->mmu_lock); + rcu_read_lock(); + leaf = __kvm_tdp_mmu_get_walk(vcpu, gpa, sptes, root_to_sp(root)); + rcu_read_unlock(); + if (leaf < 0) + return false; + + spte = sptes[leaf]; + return is_shadow_present_pte(spte) && is_last_spte(spte, leaf); +} +EXPORT_SYMBOL_GPL(kvm_tdp_mmu_gpa_is_mapped); + /* * Returns the last level spte pointer of the shadow page walk for the given * gpa, and sets *spte to the spte value. This spte may be non-preset. 
If no -- 2.51.0 From 6d15a641fddb0455600f3399238aee5c6fccdef3 Mon Sep 17 00:00:00 2001 From: Yan Zhao Date: Tue, 12 Nov 2024 15:35:15 +0800 Subject: [PATCH 08/16] KVM: x86/mmu: Do not enable page track for TD guest Fail kvm_page_track_write_tracking_enabled() if VM type is TDX to make the external page track user fail in kvm_page_track_register_notifier() since TDX does not support write protection and hence page track. No need to fail KVM internal users of page track (i.e. for shadow page), because TDX is always with EPT enabled and currently TDX module does not emulate and send VMLAUNCH/VMRESUME VMExits to VMM. Suggested-by: Paolo Bonzini Signed-off-by: Yan Zhao Reviewed-by: Binbin Wu Cc: Yuan Yao Message-ID: <20241112073515.22028-1-yan.y.zhao@intel.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/page_track.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/arch/x86/kvm/mmu/page_track.c b/arch/x86/kvm/mmu/page_track.c index 561c331fd6ec..1b17b12393a8 100644 --- a/arch/x86/kvm/mmu/page_track.c +++ b/arch/x86/kvm/mmu/page_track.c @@ -172,6 +172,9 @@ static int kvm_enable_external_write_tracking(struct kvm *kvm) struct kvm_memory_slot *slot; int r = 0, i, bkt; + if (kvm->arch.vm_type == KVM_X86_TDX_VM) + return -EOPNOTSUPP; + mutex_lock(&kvm->slots_arch_lock); /* -- 2.51.0 From c8563d1b69988ef9b6803508e1c95f2aea0a171d Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 12 Nov 2024 15:35:28 +0800 Subject: [PATCH 09/16] KVM: VMX: Split out guts of EPT violation to common/exposed function The difference of TDX EPT violation is how to retrieve information, GPA, and exit qualification. To share the code to handle EPT violation, split out the guts of EPT violation handler so that VMX/TDX exit handler can call it after retrieving GPA and exit qualification. Signed-off-by: Sean Christopherson Co-developed-by: Isaku Yamahata Signed-off-by: Isaku Yamahata Co-developed-by: Rick Edgecombe Signed-off-by: Rick Edgecombe Signed-off-by: Yan Zhao Reviewed-by: Paolo Bonzini Reviewed-by: Kai Huang Reviewed-by: Binbin Wu Message-ID: <20241112073528.22042-1-yan.y.zhao@intel.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/common.h | 34 ++++++++++++++++++++++++++++++++++ arch/x86/kvm/vmx/vmx.c | 25 +++---------------------- 2 files changed, 37 insertions(+), 22 deletions(-) create mode 100644 arch/x86/kvm/vmx/common.h diff --git a/arch/x86/kvm/vmx/common.h b/arch/x86/kvm/vmx/common.h new file mode 100644 index 000000000000..78ae39b6cdcd --- /dev/null +++ b/arch/x86/kvm/vmx/common.h @@ -0,0 +1,34 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +#ifndef __KVM_X86_VMX_COMMON_H +#define __KVM_X86_VMX_COMMON_H + +#include + +#include "mmu.h" + +static inline int __vmx_handle_ept_violation(struct kvm_vcpu *vcpu, gpa_t gpa, + unsigned long exit_qualification) +{ + u64 error_code; + + /* Is it a read fault? */ + error_code = (exit_qualification & EPT_VIOLATION_ACC_READ) + ? PFERR_USER_MASK : 0; + /* Is it a write fault? */ + error_code |= (exit_qualification & EPT_VIOLATION_ACC_WRITE) + ? PFERR_WRITE_MASK : 0; + /* Is it a fetch fault? */ + error_code |= (exit_qualification & EPT_VIOLATION_ACC_INSTR) + ? PFERR_FETCH_MASK : 0; + /* ept page table entry is present? */ + error_code |= (exit_qualification & EPT_VIOLATION_RWX_MASK) + ? PFERR_PRESENT_MASK : 0; + + if (error_code & EPT_VIOLATION_GVA_IS_VALID) + error_code |= (exit_qualification & EPT_VIOLATION_GVA_TRANSLATED) ? 
+ PFERR_GUEST_FINAL_MASK : PFERR_GUEST_PAGE_MASK; + + return kvm_mmu_page_fault(vcpu, gpa, error_code, NULL, 0); +} + +#endif /* __KVM_X86_VMX_COMMON_H */ diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 03d9e5069791..cf0a8a040f7b 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -53,6 +53,7 @@ #include #include "capabilities.h" +#include "common.h" #include "cpuid.h" #include "hyperv.h" #include "kvm_onhyperv.h" @@ -5787,11 +5788,8 @@ static int handle_task_switch(struct kvm_vcpu *vcpu) static int handle_ept_violation(struct kvm_vcpu *vcpu) { - unsigned long exit_qualification; + unsigned long exit_qualification = vmx_get_exit_qual(vcpu); gpa_t gpa; - u64 error_code; - - exit_qualification = vmx_get_exit_qual(vcpu); /* * EPT violation happened while executing iret from NMI, @@ -5807,23 +5805,6 @@ static int handle_ept_violation(struct kvm_vcpu *vcpu) gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS); trace_kvm_page_fault(vcpu, gpa, exit_qualification); - /* Is it a read fault? */ - error_code = (exit_qualification & EPT_VIOLATION_ACC_READ) - ? PFERR_USER_MASK : 0; - /* Is it a write fault? */ - error_code |= (exit_qualification & EPT_VIOLATION_ACC_WRITE) - ? PFERR_WRITE_MASK : 0; - /* Is it a fetch fault? */ - error_code |= (exit_qualification & EPT_VIOLATION_ACC_INSTR) - ? PFERR_FETCH_MASK : 0; - /* ept page table entry is present? */ - error_code |= (exit_qualification & EPT_VIOLATION_RWX_MASK) - ? PFERR_PRESENT_MASK : 0; - - if (error_code & EPT_VIOLATION_GVA_IS_VALID) - error_code |= (exit_qualification & EPT_VIOLATION_GVA_TRANSLATED) ? - PFERR_GUEST_FINAL_MASK : PFERR_GUEST_PAGE_MASK; - /* * Check that the GPA doesn't exceed physical memory limits, as that is * a guest page fault. We have to emulate the instruction here, because @@ -5835,7 +5816,7 @@ static int handle_ept_violation(struct kvm_vcpu *vcpu) if (unlikely(allow_smaller_maxphyaddr && !kvm_vcpu_is_legal_gpa(vcpu, gpa))) return kvm_emulate_instruction(vcpu, 0); - return kvm_mmu_page_fault(vcpu, gpa, error_code, NULL, 0); + return __vmx_handle_ept_violation(vcpu, gpa, exit_qualification); } static int handle_ept_misconfig(struct kvm_vcpu *vcpu) -- 2.51.0 From 3b725e972fd003218397437bae81ee309ed26322 Mon Sep 17 00:00:00 2001 From: Rick Edgecombe Date: Tue, 12 Nov 2024 15:35:39 +0800 Subject: [PATCH 10/16] KVM: VMX: Teach EPT violation helper about private mem Teach EPT violation helper to check shared mask of a GPA to find out whether the GPA is for private memory. When EPT violation is triggered after TD accessing a private GPA, KVM will exit to user space if the corresponding GFN's attribute is not private. User space will then update GFN's attribute during its memory conversion process. After that, TD will re-access the private GPA and trigger EPT violation again. Only with GFN's attribute matches to private, KVM will fault in private page, map it in mirrored TDP root, and propagate changes to private EPT to resolve the EPT violation. Relying on GFN's attribute tracking xarray to determine if a GFN is private, as for KVM_X86_SW_PROTECTED_VM, may lead to endless EPT violations. 
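For illustration only, the attribute-match idea described above can be expressed as the simplified sketch below (the helper name is made up; the real check lives in KVM's common fault path, not in this patch):

  static bool example_fault_matches_attribute(struct kvm *kvm, gpa_t gpa,
                                              u64 error_code)
  {
          bool fault_is_private = error_code & PFERR_PRIVATE_ACCESS;

          /* A mismatch makes KVM exit to userspace for memory conversion. */
          return fault_is_private == kvm_mem_is_private(kvm, gpa_to_gfn(gpa));
  }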
Signed-off-by: Rick Edgecombe Co-developed-by: Yan Zhao Signed-off-by: Yan Zhao Message-ID: <20241112073539.22056-1-yan.y.zhao@intel.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/common.h | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/arch/x86/kvm/vmx/common.h b/arch/x86/kvm/vmx/common.h index 78ae39b6cdcd..7a592467a044 100644 --- a/arch/x86/kvm/vmx/common.h +++ b/arch/x86/kvm/vmx/common.h @@ -6,6 +6,12 @@ #include "mmu.h" +static inline bool vt_is_tdx_private_gpa(struct kvm *kvm, gpa_t gpa) +{ + /* For TDX the direct mask is the shared mask. */ + return !kvm_is_addr_direct(kvm, gpa); +} + static inline int __vmx_handle_ept_violation(struct kvm_vcpu *vcpu, gpa_t gpa, unsigned long exit_qualification) { @@ -28,6 +34,9 @@ static inline int __vmx_handle_ept_violation(struct kvm_vcpu *vcpu, gpa_t gpa, error_code |= (exit_qualification & EPT_VIOLATION_GVA_TRANSLATED) ? PFERR_GUEST_FINAL_MASK : PFERR_GUEST_PAGE_MASK; + if (vt_is_tdx_private_gpa(vcpu->kvm, gpa)) + error_code |= PFERR_PRIVATE_ACCESS; + return kvm_mmu_page_fault(vcpu, gpa, error_code, NULL, 0); } -- 2.51.0 From fe1e6d483fcfd46ee16436749bc73a5b1fc89324 Mon Sep 17 00:00:00 2001 From: Isaku Yamahata Date: Tue, 12 Nov 2024 15:35:50 +0800 Subject: [PATCH 11/16] KVM: TDX: Add accessors VMX VMCS helpers TDX defines SEAMCALL APIs to access TDX control structures corresponding to the VMX VMCS. Introduce helper accessors to hide its SEAMCALL ABI details. Signed-off-by: Isaku Yamahata Co-developed-by: Rick Edgecombe Signed-off-by: Rick Edgecombe Co-developed-by: Yan Zhao Signed-off-by: Yan Zhao Message-ID: <20241112073551.22070-1-yan.y.zhao@intel.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/tdx.c | 13 +++++++ arch/x86/kvm/vmx/tdx.h | 88 ++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 101 insertions(+) diff --git a/arch/x86/kvm/vmx/tdx.c b/arch/x86/kvm/vmx/tdx.c index 01166cb8f2e6..6932b38bdfe0 100644 --- a/arch/x86/kvm/vmx/tdx.c +++ b/arch/x86/kvm/vmx/tdx.c @@ -36,6 +36,19 @@ static enum cpuhp_state tdx_cpuhp_state; static const struct tdx_sys_info *tdx_sysinfo; +void tdh_vp_rd_failed(struct vcpu_tdx *tdx, char *uclass, u32 field, u64 err) +{ + KVM_BUG_ON(1, tdx->vcpu.kvm); + pr_err("TDH_VP_RD[%s.0x%x] failed 0x%llx\n", uclass, field, err); +} + +void tdh_vp_wr_failed(struct vcpu_tdx *tdx, char *uclass, char *op, u32 field, + u64 val, u64 err) +{ + KVM_BUG_ON(1, tdx->vcpu.kvm); + pr_err("TDH_VP_WR[%s.0x%x]%s0x%llx failed: 0x%llx\n", uclass, field, op, val, err); +} + #define KVM_SUPPORTED_TD_ATTRS (TDX_TD_ATTR_SEPT_VE_DISABLE) static __always_inline struct kvm_tdx *to_kvm_tdx(struct kvm *kvm) diff --git a/arch/x86/kvm/vmx/tdx.h b/arch/x86/kvm/vmx/tdx.h index 0559126c8f9d..b3029e6e5caf 100644 --- a/arch/x86/kvm/vmx/tdx.h +++ b/arch/x86/kvm/vmx/tdx.h @@ -48,6 +48,10 @@ struct vcpu_tdx { enum vcpu_tdx_state state; }; +void tdh_vp_rd_failed(struct vcpu_tdx *tdx, char *uclass, u32 field, u64 err); +void tdh_vp_wr_failed(struct vcpu_tdx *tdx, char *uclass, char *op, u32 field, + u64 val, u64 err); + static inline bool is_td(struct kvm *kvm) { return kvm->arch.vm_type == KVM_X86_TDX_VM; @@ -69,6 +73,90 @@ static __always_inline u64 td_tdcs_exec_read64(struct kvm_tdx *kvm_tdx, u32 fiel } return data; } + +static __always_inline void tdvps_vmcs_check(u32 field, u8 bits) +{ +#define VMCS_ENC_ACCESS_TYPE_MASK 0x1UL +#define VMCS_ENC_ACCESS_TYPE_FULL 0x0UL +#define VMCS_ENC_ACCESS_TYPE_HIGH 0x1UL +#define VMCS_ENC_ACCESS_TYPE(field) ((field) & VMCS_ENC_ACCESS_TYPE_MASK) + + /* TDX is 64bit only. 
HIGH field isn't supported. */ + BUILD_BUG_ON_MSG(__builtin_constant_p(field) && + VMCS_ENC_ACCESS_TYPE(field) == VMCS_ENC_ACCESS_TYPE_HIGH, + "Read/Write to TD VMCS *_HIGH fields not supported"); + + BUILD_BUG_ON(bits != 16 && bits != 32 && bits != 64); + +#define VMCS_ENC_WIDTH_MASK GENMASK(14, 13) +#define VMCS_ENC_WIDTH_16BIT (0UL << 13) +#define VMCS_ENC_WIDTH_64BIT (1UL << 13) +#define VMCS_ENC_WIDTH_32BIT (2UL << 13) +#define VMCS_ENC_WIDTH_NATURAL (3UL << 13) +#define VMCS_ENC_WIDTH(field) ((field) & VMCS_ENC_WIDTH_MASK) + + /* TDX is 64bit only. i.e. natural width = 64bit. */ + BUILD_BUG_ON_MSG(bits != 64 && __builtin_constant_p(field) && + (VMCS_ENC_WIDTH(field) == VMCS_ENC_WIDTH_64BIT || + VMCS_ENC_WIDTH(field) == VMCS_ENC_WIDTH_NATURAL), + "Invalid TD VMCS access for 64-bit field"); + BUILD_BUG_ON_MSG(bits != 32 && __builtin_constant_p(field) && + VMCS_ENC_WIDTH(field) == VMCS_ENC_WIDTH_32BIT, + "Invalid TD VMCS access for 32-bit field"); + BUILD_BUG_ON_MSG(bits != 16 && __builtin_constant_p(field) && + VMCS_ENC_WIDTH(field) == VMCS_ENC_WIDTH_16BIT, + "Invalid TD VMCS access for 16-bit field"); +} + +#define TDX_BUILD_TDVPS_ACCESSORS(bits, uclass, lclass) \ +static __always_inline u##bits td_##lclass##_read##bits(struct vcpu_tdx *tdx, \ + u32 field) \ +{ \ + u64 err, data; \ + \ + tdvps_##lclass##_check(field, bits); \ + err = tdh_vp_rd(&tdx->vp, TDVPS_##uclass(field), &data); \ + if (unlikely(err)) { \ + tdh_vp_rd_failed(tdx, #uclass, field, err); \ + return 0; \ + } \ + return (u##bits)data; \ +} \ +static __always_inline void td_##lclass##_write##bits(struct vcpu_tdx *tdx, \ + u32 field, u##bits val) \ +{ \ + u64 err; \ + \ + tdvps_##lclass##_check(field, bits); \ + err = tdh_vp_wr(&tdx->vp, TDVPS_##uclass(field), val, \ + GENMASK_ULL(bits - 1, 0)); \ + if (unlikely(err)) \ + tdh_vp_wr_failed(tdx, #uclass, " = ", field, (u64)val, err); \ +} \ +static __always_inline void td_##lclass##_setbit##bits(struct vcpu_tdx *tdx, \ + u32 field, u64 bit) \ +{ \ + u64 err; \ + \ + tdvps_##lclass##_check(field, bits); \ + err = tdh_vp_wr(&tdx->vp, TDVPS_##uclass(field), bit, bit); \ + if (unlikely(err)) \ + tdh_vp_wr_failed(tdx, #uclass, " |= ", field, bit, err); \ +} \ +static __always_inline void td_##lclass##_clearbit##bits(struct vcpu_tdx *tdx, \ + u32 field, u64 bit) \ +{ \ + u64 err; \ + \ + tdvps_##lclass##_check(field, bits); \ + err = tdh_vp_wr(&tdx->vp, TDVPS_##uclass(field), 0, bit); \ + if (unlikely(err)) \ + tdh_vp_wr_failed(tdx, #uclass, " &= ~", field, bit, err);\ +} + +TDX_BUILD_TDVPS_ACCESSORS(16, VMCS, vmcs); +TDX_BUILD_TDVPS_ACCESSORS(32, VMCS, vmcs); +TDX_BUILD_TDVPS_ACCESSORS(64, VMCS, vmcs); #else static inline int tdx_bringup(void) { return 0; } static inline void tdx_cleanup(void) {} -- 2.51.0 From 87e3f45e80474a652ad6e75cc33a6b68602de463 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 12 Nov 2024 15:36:01 +0800 Subject: [PATCH 12/16] KVM: TDX: Add load_mmu_pgd method for TDX TDX uses two EPT pointers, one for the private half of the GPA space and one for the shared half. The private half uses the normal EPT_POINTER vmcs field, which is managed in a special way by the TDX module. For TDX, KVM is not allowed to operate on it directly. The shared half uses a new SHARED_EPT_POINTER field and will be managed by the conventional MMU management operations that operate directly on the EPT root. This means for TDX the .load_mmu_pgd() operation will need to know to use the SHARED_EPT_POINTER field instead of the normal one. 
Add a new wrapper in x86 ops for load_mmu_pgd() that either directs the write to the existing vmx implementation or a TDX one. tdx_load_mmu_pgd() is so much simpler than vmx_load_mmu_pgd() since for the TDX mode of operation, EPT will always be used and KVM does not need to be involved in virtualization of CR3 behavior. So tdx_load_mmu_pgd() can simply write to SHARED_EPT_POINTER. Signed-off-by: Sean Christopherson Co-developed-by: Isaku Yamahata Signed-off-by: Isaku Yamahata Co-developed-by: Rick Edgecombe Signed-off-by: Rick Edgecombe Co-developed-by: Yan Zhao Signed-off-by: Yan Zhao Reviewed-by: Paolo Bonzini Message-ID: <20241112073601.22084-1-yan.y.zhao@intel.com> Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/vmx.h | 1 + arch/x86/kvm/vmx/main.c | 13 ++++++++++++- arch/x86/kvm/vmx/tdx.c | 15 +++++++++++++++ arch/x86/kvm/vmx/x86_ops.h | 4 ++++ 4 files changed, 32 insertions(+), 1 deletion(-) diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h index f7fd4369b821..9298fb9d4bb3 100644 --- a/arch/x86/include/asm/vmx.h +++ b/arch/x86/include/asm/vmx.h @@ -256,6 +256,7 @@ enum vmcs_field { TSC_MULTIPLIER_HIGH = 0x00002033, TERTIARY_VM_EXEC_CONTROL = 0x00002034, TERTIARY_VM_EXEC_CONTROL_HIGH = 0x00002035, + SHARED_EPT_POINTER = 0x0000203C, PID_POINTER_TABLE = 0x00002042, PID_POINTER_TABLE_HIGH = 0x00002043, GUEST_PHYSICAL_ADDRESS = 0x00002400, diff --git a/arch/x86/kvm/vmx/main.c b/arch/x86/kvm/vmx/main.c index e7d402b3a90d..8ed08c53c02f 100644 --- a/arch/x86/kvm/vmx/main.c +++ b/arch/x86/kvm/vmx/main.c @@ -98,6 +98,17 @@ static void vt_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) vmx_vcpu_reset(vcpu, init_event); } +static void vt_load_mmu_pgd(struct kvm_vcpu *vcpu, hpa_t root_hpa, + int pgd_level) +{ + if (is_td_vcpu(vcpu)) { + tdx_load_mmu_pgd(vcpu, root_hpa, pgd_level); + return; + } + + vmx_load_mmu_pgd(vcpu, root_hpa, pgd_level); +} + static int vt_mem_enc_ioctl(struct kvm *kvm, void __user *argp) { if (!is_td(kvm)) @@ -231,7 +242,7 @@ struct kvm_x86_ops vt_x86_ops __initdata = { .write_tsc_offset = vmx_write_tsc_offset, .write_tsc_multiplier = vmx_write_tsc_multiplier, - .load_mmu_pgd = vmx_load_mmu_pgd, + .load_mmu_pgd = vt_load_mmu_pgd, .check_intercept = vmx_check_intercept, .handle_exit_irqoff = vmx_handle_exit_irqoff, diff --git a/arch/x86/kvm/vmx/tdx.c b/arch/x86/kvm/vmx/tdx.c index 6932b38bdfe0..c3e2526c861a 100644 --- a/arch/x86/kvm/vmx/tdx.c +++ b/arch/x86/kvm/vmx/tdx.c @@ -32,6 +32,9 @@ bool enable_tdx __ro_after_init; module_param_named(tdx, enable_tdx, bool, 0444); +#define TDX_SHARED_BIT_PWL_5 gpa_to_gfn(BIT_ULL(51)) +#define TDX_SHARED_BIT_PWL_4 gpa_to_gfn(BIT_ULL(47)) + static enum cpuhp_state tdx_cpuhp_state; static const struct tdx_sys_info *tdx_sysinfo; @@ -495,6 +498,18 @@ void tdx_vcpu_free(struct kvm_vcpu *vcpu) tdx->state = VCPU_TD_STATE_UNINITIALIZED; } + +void tdx_load_mmu_pgd(struct kvm_vcpu *vcpu, hpa_t root_hpa, int pgd_level) +{ + u64 shared_bit = (pgd_level == 5) ? 
TDX_SHARED_BIT_PWL_5 : + TDX_SHARED_BIT_PWL_4; + + if (KVM_BUG_ON(shared_bit != kvm_gfn_direct_bits(vcpu->kvm), vcpu->kvm)) + return; + + td_vmcs_write64(to_tdx(vcpu), SHARED_EPT_POINTER, root_hpa); +} + static int tdx_get_capabilities(struct kvm_tdx_cmd *cmd) { const struct tdx_sys_info_td_conf *td_conf = &tdx_sysinfo->td_conf; diff --git a/arch/x86/kvm/vmx/x86_ops.h b/arch/x86/kvm/vmx/x86_ops.h index 89bb7785bd09..f9d0e62221ca 100644 --- a/arch/x86/kvm/vmx/x86_ops.h +++ b/arch/x86/kvm/vmx/x86_ops.h @@ -131,6 +131,8 @@ int tdx_vcpu_create(struct kvm_vcpu *vcpu); void tdx_vcpu_free(struct kvm_vcpu *vcpu); int tdx_vcpu_ioctl(struct kvm_vcpu *vcpu, void __user *argp); + +void tdx_load_mmu_pgd(struct kvm_vcpu *vcpu, hpa_t root_hpa, int root_level); #else static inline int tdx_vm_init(struct kvm *kvm) { return -EOPNOTSUPP; } static inline void tdx_mmu_release_hkid(struct kvm *kvm) {} @@ -141,6 +143,8 @@ static inline int tdx_vcpu_create(struct kvm_vcpu *vcpu) { return -EOPNOTSUPP; } static inline void tdx_vcpu_free(struct kvm_vcpu *vcpu) {} static inline int tdx_vcpu_ioctl(struct kvm_vcpu *vcpu, void __user *argp) { return -EOPNOTSUPP; } + +static inline void tdx_load_mmu_pgd(struct kvm_vcpu *vcpu, hpa_t root_hpa, int root_level) {} #endif #endif /* __KVM_X86_VMX_X86_OPS_H */ -- 2.51.0 From e0fbb3bbb5d076769621d7bac06046d4f3c75875 Mon Sep 17 00:00:00 2001 From: Isaku Yamahata Date: Tue, 12 Nov 2024 15:36:13 +0800 Subject: [PATCH 13/16] KVM: TDX: Set gfn_direct_bits to shared bit Make the direct root handle memslot GFNs at an alias with the TDX shared bit set. For TDX shared memory, the memslot GFNs need to be mapped at an alias with the shared bit set. These shared mappings will be mapped on the KVM MMU's "direct" root. The direct root has it's mappings shifted by applying "gfn_direct_bits" as a mask. The concept of "GPAW" (guest physical address width) determines the location of the shared bit. So set gfn_direct_bits based on this, to map shared memory at the proper GPA. Signed-off-by: Isaku Yamahata Co-developed-by: Rick Edgecombe Signed-off-by: Rick Edgecombe Co-developed-by: Yan Zhao Signed-off-by: Yan Zhao Reviewed-by: Paolo Bonzini Message-ID: <20241112073613.22100-1-yan.y.zhao@intel.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/tdx.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/arch/x86/kvm/vmx/tdx.c b/arch/x86/kvm/vmx/tdx.c index c3e2526c861a..5a641daf13d4 100644 --- a/arch/x86/kvm/vmx/tdx.c +++ b/arch/x86/kvm/vmx/tdx.c @@ -1051,6 +1051,11 @@ static int tdx_td_init(struct kvm *kvm, struct kvm_tdx_cmd *cmd) kvm_tdx->attributes = td_params->attributes; kvm_tdx->xfam = td_params->xfam; + if (td_params->config_flags & TDX_CONFIG_FLAGS_MAX_GPAW) + kvm->arch.gfn_direct_bits = TDX_SHARED_BIT_PWL_5; + else + kvm->arch.gfn_direct_bits = TDX_SHARED_BIT_PWL_4; + kvm_tdx->state = TD_STATE_INITIALIZED; out: /* kfree() accepts NULL. */ -- 2.51.0 From 427a6486c51b285ac471434168ce401659b78766 Mon Sep 17 00:00:00 2001 From: Isaku Yamahata Date: Tue, 12 Nov 2024 15:37:20 +0800 Subject: [PATCH 14/16] KVM: TDX: Require TDP MMU, mmio caching and EPT A/D bits for TDX Disable TDX support when TDP MMU or mmio caching or EPT A/D bits aren't supported. As TDP MMU is becoming main stream than the legacy MMU, the legacy MMU support for TDX isn't implemented. TDX requires KVM mmio caching. Without mmio caching, KVM will go to MMIO emulation without installing SPTEs for MMIOs. 
However, TDX guest is protected and KVM would meet errors when trying to emulate MMIOs for TDX guest during instruction decoding. So, TDX guest relies on SPTEs being installed for MMIOs, which are with no RWX bits and with VE suppress bit unset, to inject VE to TDX guest. The TDX guest would then issue TDVMCALL in the VE handler to perform instruction decoding and have host do MMIO emulation. TDX also relies on EPT A/D bits as EPT A/D bits have been supported in all CPUs since Haswell. Relying on it can avoid RWX bits being masked out in the mirror page table for prefaulted entries. Signed-off-by: Isaku Yamahata Co-developed-by: Rick Edgecombe Signed-off-by: Rick Edgecombe Co-developed-by: Yan Zhao Signed-off-by: Yan Zhao Reviewed-by: Paolo Bonzini Signed-off-by: Yan Zhao --- Requested by Sean at [1]. [1] https://lore.kernel.org/kvm/Zva4aORxE9ljlMNe@google.com/ Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/mmu.c | 1 + arch/x86/kvm/vmx/main.c | 1 + arch/x86/kvm/vmx/tdx.c | 10 ++++++++++ 3 files changed, 12 insertions(+) diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index ec086f831e69..af6c11995cec 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -110,6 +110,7 @@ static bool __ro_after_init tdp_mmu_allowed; #ifdef CONFIG_X86_64 bool __read_mostly tdp_mmu_enabled = true; module_param_named(tdp_mmu, tdp_mmu_enabled, bool, 0444); +EXPORT_SYMBOL_GPL(tdp_mmu_enabled); #endif static int max_huge_page_level __read_mostly; diff --git a/arch/x86/kvm/vmx/main.c b/arch/x86/kvm/vmx/main.c index 8ed08c53c02f..a4cb3d6b2986 100644 --- a/arch/x86/kvm/vmx/main.c +++ b/arch/x86/kvm/vmx/main.c @@ -3,6 +3,7 @@ #include "x86_ops.h" #include "vmx.h" +#include "mmu.h" #include "nested.h" #include "pmu.h" #include "posted_intr.h" diff --git a/arch/x86/kvm/vmx/tdx.c b/arch/x86/kvm/vmx/tdx.c index 5a641daf13d4..78ed554203d7 100644 --- a/arch/x86/kvm/vmx/tdx.c +++ b/arch/x86/kvm/vmx/tdx.c @@ -1534,6 +1534,16 @@ int __init tdx_bringup(void) if (!enable_tdx) return 0; + if (!enable_ept) { + pr_err("EPT is required for TDX\n"); + goto success_disable_tdx; + } + + if (!tdp_mmu_enabled || !enable_mmio_caching || !enable_ept_ad_bits) { + pr_err("TDP MMU and MMIO caching and EPT A/D bit is required for TDX\n"); + goto success_disable_tdx; + } + if (!cpu_feature_enabled(X86_FEATURE_MOVDIR64B)) { pr_err("tdx: MOVDIR64B is required for TDX\n"); goto success_disable_tdx; -- 2.51.0 From 5a46fd48d884f3ed1bac8adf64dafd7e60d0d6e3 Mon Sep 17 00:00:00 2001 From: Isaku Yamahata Date: Tue, 12 Nov 2024 15:37:30 +0800 Subject: [PATCH 15/16] KVM: x86/mmu: Add setter for shadow_mmio_value Future changes will want to set shadow_mmio_value from TDX code. Add a helper to setter with a name that makes more sense from that context. 
Signed-off-by: Isaku Yamahata [split into new patch] Co-developed-by: Rick Edgecombe Signed-off-by: Rick Edgecombe Signed-off-by: Yan Zhao Reviewed-by: Paolo Bonzini Message-ID: <20241112073730.22200-1-yan.y.zhao@intel.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu.h | 1 + arch/x86/kvm/mmu/spte.c | 6 ++++++ 2 files changed, 7 insertions(+) diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h index 8d0fca4b4a50..47e64a3c4ce3 100644 --- a/arch/x86/kvm/mmu.h +++ b/arch/x86/kvm/mmu.h @@ -79,6 +79,7 @@ static inline gfn_t kvm_mmu_max_gfn(void) u8 kvm_mmu_get_max_tdp_level(void); void kvm_mmu_set_mmio_spte_mask(u64 mmio_value, u64 mmio_mask, u64 access_mask); +void kvm_mmu_set_mmio_spte_value(struct kvm *kvm, u64 mmio_value); void kvm_mmu_set_me_spte_mask(u64 me_value, u64 me_mask); void kvm_mmu_set_ept_masks(bool has_ad_bits, bool has_exec_only); diff --git a/arch/x86/kvm/mmu/spte.c b/arch/x86/kvm/mmu/spte.c index 22551e2f1d00..c42ac5d1f027 100644 --- a/arch/x86/kvm/mmu/spte.c +++ b/arch/x86/kvm/mmu/spte.c @@ -433,6 +433,12 @@ void kvm_mmu_set_mmio_spte_mask(u64 mmio_value, u64 mmio_mask, u64 access_mask) } EXPORT_SYMBOL_GPL(kvm_mmu_set_mmio_spte_mask); +void kvm_mmu_set_mmio_spte_value(struct kvm *kvm, u64 mmio_value) +{ + kvm->arch.shadow_mmio_value = mmio_value; +} +EXPORT_SYMBOL_GPL(kvm_mmu_set_mmio_spte_value); + void kvm_mmu_set_me_spte_mask(u64 me_value, u64 me_mask) { /* shadow_me_value must be a subset of shadow_me_mask */ -- 2.51.0 From 7d10ffb1acac2c17c2c839e2db88565529e8c8ef Mon Sep 17 00:00:00 2001 From: Isaku Yamahata Date: Tue, 12 Nov 2024 15:37:43 +0800 Subject: [PATCH 16/16] KVM: TDX: Set per-VM shadow_mmio_value to 0 Set per-VM shadow_mmio_value to 0 for TDX. With enable_mmio_caching on, KVM installs MMIO SPTEs for TDs. To correctly configure MMIO SPTEs, TDX requires the per-VM shadow_mmio_value to be set to 0. This is necessary to override the default value of the suppress VE bit in the SPTE, which is 1, and to ensure value 0 in RWX bits. For MMIO SPTE, the spte value changes as follows: 1. initial value (suppress VE bit is set) 2. Guest issues MMIO and triggers EPT violation 3. KVM updates SPTE value to MMIO value (suppress VE bit is cleared) 4. Guest MMIO resumes. It triggers VE exception in guest TD 5. Guest VE handler issues TDG.VP.VMCALL 6. KVM handles MMIO 7. 
Guest VE handler resumes its execution after MMIO instruction Signed-off-by: Isaku Yamahata Co-developed-by: Rick Edgecombe Signed-off-by: Rick Edgecombe Co-developed-by: Yan Zhao Signed-off-by: Yan Zhao Reviewed-by: Paolo Bonzini Message-ID: <20241112073743.22214-1-yan.y.zhao@intel.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/spte.c | 2 -- arch/x86/kvm/vmx/tdx.c | 14 ++++++++++++++ 2 files changed, 14 insertions(+), 2 deletions(-) diff --git a/arch/x86/kvm/mmu/spte.c b/arch/x86/kvm/mmu/spte.c index c42ac5d1f027..e819d16655b6 100644 --- a/arch/x86/kvm/mmu/spte.c +++ b/arch/x86/kvm/mmu/spte.c @@ -96,8 +96,6 @@ u64 make_mmio_spte(struct kvm_vcpu *vcpu, u64 gfn, unsigned int access) u64 spte = generation_mmio_spte_mask(gen); u64 gpa = gfn << PAGE_SHIFT; - WARN_ON_ONCE(!vcpu->kvm->arch.shadow_mmio_value); - access &= shadow_mmio_access_mask; spte |= vcpu->kvm->arch.shadow_mmio_value | access; spte |= gpa | shadow_nonpresent_or_rsvd_mask; diff --git a/arch/x86/kvm/vmx/tdx.c b/arch/x86/kvm/vmx/tdx.c index 78ed554203d7..f73b03f6ef59 100644 --- a/arch/x86/kvm/vmx/tdx.c +++ b/arch/x86/kvm/vmx/tdx.c @@ -8,6 +8,7 @@ #include "x86_ops.h" #include "lapic.h" #include "tdx.h" +#include "mmu/spte.h" #pragma GCC poison to_vmx @@ -410,6 +411,19 @@ int tdx_vm_init(struct kvm *kvm) kvm->arch.has_protected_state = true; kvm->arch.has_private_mem = true; + /* + * Because guest TD is protected, VMM can't parse the instruction in TD. + * Instead, guest uses MMIO hypercall. For unmodified device driver, + * #VE needs to be injected for MMIO and #VE handler in TD converts MMIO + * instruction into MMIO hypercall. + * + * SPTE value for MMIO needs to be setup so that #VE is injected into + * TD instead of triggering EPT MISCONFIG. + * - RWX=0 so that EPT violation is triggered. + * - suppress #VE bit is cleared to inject #VE. + */ + kvm_mmu_set_mmio_spte_value(kvm, 0); + /* * TDX has its own limit of maximum vCPUs it can support for all * TDX guests in addition to KVM_MAX_VCPUS. TDX module reports -- 2.51.0