From a0cc649353bb726d4aa0db60dce467432197b746 Mon Sep 17 00:00:00 2001 From: Mathieu Desnoyers Date: Tue, 8 Oct 2024 21:28:01 -0400 Subject: [PATCH 01/16] selftests/rseq: Fix mm_cid test failure Adapt the rseq.c/rseq.h code to follow GNU C library changes introduced by: glibc commit 2e456ccf0c34 ("Linux: Make __rseq_size useful for feature detection (bug 31965)") Without this fix, rseq selftests for mm_cid fail: ./run_param_test.sh Default parameters Running test spinlock Running compare-twice test spinlock Running mm_cid test spinlock Error: cpu id getter unavailable Fixes: 18c2355838e7 ("selftests/rseq: Implement rseq mm_cid field support") Signed-off-by: Mathieu Desnoyers Cc: Peter Zijlstra CC: Boqun Feng CC: "Paul E. McKenney" Cc: Shuah Khan CC: Carlos O'Donell CC: Florian Weimer CC: linux-kselftest@vger.kernel.org CC: stable@vger.kernel.org Signed-off-by: Shuah Khan --- tools/testing/selftests/rseq/rseq.c | 110 +++++++++++++++++++--------- tools/testing/selftests/rseq/rseq.h | 10 +-- 2 files changed, 77 insertions(+), 43 deletions(-) diff --git a/tools/testing/selftests/rseq/rseq.c b/tools/testing/selftests/rseq/rseq.c index 96e812bdf8a4..5b9772cdf265 100644 --- a/tools/testing/selftests/rseq/rseq.c +++ b/tools/testing/selftests/rseq/rseq.c @@ -60,12 +60,6 @@ unsigned int rseq_size = -1U; /* Flags used during rseq registration. */ unsigned int rseq_flags; -/* - * rseq feature size supported by the kernel. 0 if the registration was - * unsuccessful. - */ -unsigned int rseq_feature_size = -1U; - static int rseq_ownership; static int rseq_reg_success; /* At least one rseq registration has succeded. */ @@ -111,6 +105,43 @@ int rseq_available(void) } } +/* The rseq areas need to be at least 32 bytes. */ +static +unsigned int get_rseq_min_alloc_size(void) +{ + unsigned int alloc_size = rseq_size; + + if (alloc_size < ORIG_RSEQ_ALLOC_SIZE) + alloc_size = ORIG_RSEQ_ALLOC_SIZE; + return alloc_size; +} + +/* + * Return the feature size supported by the kernel. + * + * Depending on the value returned by getauxval(AT_RSEQ_FEATURE_SIZE): + * + * 0: Return ORIG_RSEQ_FEATURE_SIZE (20) + * > 0: Return the value from getauxval(AT_RSEQ_FEATURE_SIZE). + * + * It should never return a value below ORIG_RSEQ_FEATURE_SIZE. + */ +static +unsigned int get_rseq_kernel_feature_size(void) +{ + unsigned long auxv_rseq_feature_size, auxv_rseq_align; + + auxv_rseq_align = getauxval(AT_RSEQ_ALIGN); + assert(!auxv_rseq_align || auxv_rseq_align <= RSEQ_THREAD_AREA_ALLOC_SIZE); + + auxv_rseq_feature_size = getauxval(AT_RSEQ_FEATURE_SIZE); + assert(!auxv_rseq_feature_size || auxv_rseq_feature_size <= RSEQ_THREAD_AREA_ALLOC_SIZE); + if (auxv_rseq_feature_size) + return auxv_rseq_feature_size; + else + return ORIG_RSEQ_FEATURE_SIZE; +} + int rseq_register_current_thread(void) { int rc; @@ -119,7 +150,7 @@ int rseq_register_current_thread(void) /* Treat libc's ownership as a successful registration. */ return 0; } - rc = sys_rseq(&__rseq_abi, rseq_size, 0, RSEQ_SIG); + rc = sys_rseq(&__rseq_abi, get_rseq_min_alloc_size(), 0, RSEQ_SIG); if (rc) { if (RSEQ_READ_ONCE(rseq_reg_success)) { /* Incoherent success/failure within process. */ @@ -140,28 +171,12 @@ int rseq_unregister_current_thread(void) /* Treat libc's ownership as a successful unregistration. 
*/ return 0; } - rc = sys_rseq(&__rseq_abi, rseq_size, RSEQ_ABI_FLAG_UNREGISTER, RSEQ_SIG); + rc = sys_rseq(&__rseq_abi, get_rseq_min_alloc_size(), RSEQ_ABI_FLAG_UNREGISTER, RSEQ_SIG); if (rc) return -1; return 0; } -static -unsigned int get_rseq_feature_size(void) -{ - unsigned long auxv_rseq_feature_size, auxv_rseq_align; - - auxv_rseq_align = getauxval(AT_RSEQ_ALIGN); - assert(!auxv_rseq_align || auxv_rseq_align <= RSEQ_THREAD_AREA_ALLOC_SIZE); - - auxv_rseq_feature_size = getauxval(AT_RSEQ_FEATURE_SIZE); - assert(!auxv_rseq_feature_size || auxv_rseq_feature_size <= RSEQ_THREAD_AREA_ALLOC_SIZE); - if (auxv_rseq_feature_size) - return auxv_rseq_feature_size; - else - return ORIG_RSEQ_FEATURE_SIZE; -} - static __attribute__((constructor)) void rseq_init(void) { @@ -178,28 +193,54 @@ void rseq_init(void) } if (libc_rseq_size_p && libc_rseq_offset_p && libc_rseq_flags_p && *libc_rseq_size_p != 0) { + unsigned int libc_rseq_size; + /* rseq registration owned by glibc */ rseq_offset = *libc_rseq_offset_p; - rseq_size = *libc_rseq_size_p; + libc_rseq_size = *libc_rseq_size_p; rseq_flags = *libc_rseq_flags_p; - rseq_feature_size = get_rseq_feature_size(); - if (rseq_feature_size > rseq_size) - rseq_feature_size = rseq_size; + + /* + * Previous versions of glibc expose the value + * 32 even though the kernel only supported 20 + * bytes initially. Therefore treat 32 as a + * special-case. glibc 2.40 exposes a 20 bytes + * __rseq_size without using getauxval(3) to + * query the supported size, while still allocating a 32 + * bytes area. Also treat 20 as a special-case. + * + * Special-cases are handled by using the following + * value as active feature set size: + * + * rseq_size = min(32, get_rseq_kernel_feature_size()) + */ + switch (libc_rseq_size) { + case ORIG_RSEQ_FEATURE_SIZE: + fallthrough; + case ORIG_RSEQ_ALLOC_SIZE: + { + unsigned int rseq_kernel_feature_size = get_rseq_kernel_feature_size(); + + if (rseq_kernel_feature_size < ORIG_RSEQ_ALLOC_SIZE) + rseq_size = rseq_kernel_feature_size; + else + rseq_size = ORIG_RSEQ_ALLOC_SIZE; + break; + } + default: + /* Otherwise just use the __rseq_size from libc as rseq_size. */ + rseq_size = libc_rseq_size; + break; + } return; } rseq_ownership = 1; if (!rseq_available()) { rseq_size = 0; - rseq_feature_size = 0; return; } rseq_offset = (void *)&__rseq_abi - rseq_thread_pointer(); rseq_flags = 0; - rseq_feature_size = get_rseq_feature_size(); - if (rseq_feature_size == ORIG_RSEQ_FEATURE_SIZE) - rseq_size = ORIG_RSEQ_ALLOC_SIZE; - else - rseq_size = RSEQ_THREAD_AREA_ALLOC_SIZE; } static __attribute__((destructor)) @@ -209,7 +250,6 @@ void rseq_exit(void) return; rseq_offset = 0; rseq_size = -1U; - rseq_feature_size = -1U; rseq_ownership = 0; } diff --git a/tools/testing/selftests/rseq/rseq.h b/tools/testing/selftests/rseq/rseq.h index d7364ea4d201..4e217b620e0c 100644 --- a/tools/testing/selftests/rseq/rseq.h +++ b/tools/testing/selftests/rseq/rseq.h @@ -68,12 +68,6 @@ extern unsigned int rseq_size; /* Flags used during rseq registration. */ extern unsigned int rseq_flags; -/* - * rseq feature size supported by the kernel. 0 if the registration was - * unsuccessful. 
- */
-extern unsigned int rseq_feature_size;
-
 enum rseq_mo {
 	RSEQ_MO_RELAXED = 0,
 	RSEQ_MO_CONSUME = 1,	/* Unused */
@@ -193,7 +187,7 @@ static inline uint32_t rseq_current_cpu(void)
 
 static inline bool rseq_node_id_available(void)
 {
-	return (int) rseq_feature_size >= rseq_offsetofend(struct rseq_abi, node_id);
+	return (int) rseq_size >= rseq_offsetofend(struct rseq_abi, node_id);
 }
 
 /*
@@ -207,7 +201,7 @@ static inline uint32_t rseq_current_node_id(void)
 
 static inline bool rseq_mm_cid_available(void)
 {
-	return (int) rseq_feature_size >= rseq_offsetofend(struct rseq_abi, mm_cid);
+	return (int) rseq_size >= rseq_offsetofend(struct rseq_abi, mm_cid);
 }
 
 static inline uint32_t rseq_current_mm_cid(void)
-- 
2.51.0

From 4ee5ca9a29384fcf3f18232fdf8474166dea8dca Mon Sep 17 00:00:00 2001
From: Steven Rostedt
Date: Thu, 10 Oct 2024 16:52:35 -0400
Subject: [PATCH 02/16] ftrace/selftest: Test combination of function_graph
 tracer and function profiler

Masami reported a bug when running function graph tracing and then the
function profiler. The following commands would cause a kernel crash:

 # cd /sys/kernel/tracing/
 # echo function_graph > current_tracer
 # echo 1 > function_profile_enabled

In that order. Create a test to exercise these two together to make sure
this does not come back as a regression.

Link: https://lore.kernel.org/172398528350.293426.8347220120333730248.stgit@devnote2
Link: https://lore.kernel.org/all/20241010165235.35122877@gandalf.local.home/

Acked-by: Masami Hiramatsu (Google)
Signed-off-by: Steven Rostedt (Google)
Signed-off-by: Shuah Khan
---
 .../ftrace/test.d/ftrace/fgraph-profiler.tc   | 31 +++++++++++++++++++
 1 file changed, 31 insertions(+)
 create mode 100644 tools/testing/selftests/ftrace/test.d/ftrace/fgraph-profiler.tc

diff --git a/tools/testing/selftests/ftrace/test.d/ftrace/fgraph-profiler.tc b/tools/testing/selftests/ftrace/test.d/ftrace/fgraph-profiler.tc
new file mode 100644
index 000000000000..ffff8646733c
--- /dev/null
+++ b/tools/testing/selftests/ftrace/test.d/ftrace/fgraph-profiler.tc
@@ -0,0 +1,31 @@
+#!/bin/sh
+# SPDX-License-Identifier: GPL-2.0
+# description: ftrace - function profiler with function graph tracing
+# requires: function_profile_enabled set_ftrace_filter function_graph:tracer
+
+# The function graph tracer can now be run along side of the function
+# profiler. But there was a bug that caused the combination of the two
+# to crash. It also required the function graph tracer to be started
+# first.
+#
+# This test triggers that bug
+#
+# We need both function_graph and profiling to run this test
+
+fail() { # mesg
+	echo $1
+	exit_fail
+}
+
+echo "Enabling function graph tracer:"
+echo function_graph > current_tracer
+echo "enable profiler"
+
+# Older kernels do not allow function_profile to be enabled with
+# function graph tracer. 
If the below fails, mark it as unsupported +echo 1 > function_profile_enabled || exit_unsupported + +# Let it run for a bit to make sure nothing explodes +sleep 1 + +exit 0 -- 2.51.0 From 8e929cb546ee42c9a61d24fae60605e9e3192354 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sun, 13 Oct 2024 14:33:32 -0700 Subject: [PATCH 03/16] Linux 6.12-rc3 --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index c5493c0c0ca1..8cf3cf528892 100644 --- a/Makefile +++ b/Makefile @@ -2,7 +2,7 @@ VERSION = 6 PATCHLEVEL = 12 SUBLEVEL = 0 -EXTRAVERSION = -rc2 +EXTRAVERSION = -rc3 NAME = Baby Opossum Posse # *DOCUMENTATION* -- 2.51.0 From 7c204426b81822ba768b3a5e5393b1489917fb84 Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Mon, 4 Nov 2024 09:40:21 +0800 Subject: [PATCH 04/16] iommu/vt-d: Add domain_alloc_paging support Add the domain_alloc_paging callback for domain allocation using the iommu_paging_domain_alloc() interface. Signed-off-by: Lu Baolu Reviewed-by: Jason Gunthorpe Link: https://lore.kernel.org/r/20241021085125.192333-2-baolu.lu@linux.intel.com Signed-off-by: Joerg Roedel --- drivers/iommu/intel/iommu.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c index 9f6b0780f2ef..4803e0cb8279 100644 --- a/drivers/iommu/intel/iommu.c +++ b/drivers/iommu/intel/iommu.c @@ -4590,6 +4590,19 @@ static struct iommu_domain identity_domain = { }, }; +static struct iommu_domain *intel_iommu_domain_alloc_paging(struct device *dev) +{ + struct dmar_domain *dmar_domain; + bool first_stage; + + first_stage = first_level_by_default(0); + dmar_domain = paging_domain_alloc(dev, first_stage); + if (IS_ERR(dmar_domain)) + return ERR_CAST(dmar_domain); + + return &dmar_domain->domain; +} + const struct iommu_ops intel_iommu_ops = { .blocked_domain = &blocking_domain, .release_domain = &blocking_domain, @@ -4599,6 +4612,7 @@ const struct iommu_ops intel_iommu_ops = { .domain_alloc = intel_iommu_domain_alloc, .domain_alloc_user = intel_iommu_domain_alloc_user, .domain_alloc_sva = intel_svm_domain_alloc, + .domain_alloc_paging = intel_iommu_domain_alloc_paging, .probe_device = intel_iommu_probe_device, .release_device = intel_iommu_release_device, .get_resv_regions = intel_iommu_get_resv_regions, -- 2.51.0 From 9ecfcac1fe15e097cfd74663bcb8fbeaf3cc2910 Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Mon, 4 Nov 2024 09:40:22 +0800 Subject: [PATCH 05/16] iommu/vt-d: Remove unused domain_alloc callback With domain_alloc_paging callback supported, the legacy domain_alloc callback will never be used anymore. Remove it to avoid dead code. 
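For reference, a minimal, hypothetical caller-side sketch of the path that
replaces the legacy callback (the entry point follows the core API named in
the previous patch; the error handling is illustrative, not taken from the
driver):

	struct iommu_domain *domain;
	int ret;

	/*
	 * iommu_paging_domain_alloc() is the core entry point that is
	 * expected to reach ops->domain_alloc_paging for this driver,
	 * returning a paging domain for @dev or an ERR_PTR() on failure.
	 */
	domain = iommu_paging_domain_alloc(dev);
	if (IS_ERR(domain))
		return PTR_ERR(domain);

	ret = iommu_attach_device(domain, dev);
	if (ret)
		iommu_domain_free(domain);
	return ret;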
Signed-off-by: Lu Baolu Reviewed-by: Jason Gunthorpe Link: https://lore.kernel.org/r/20241021085125.192333-3-baolu.lu@linux.intel.com Signed-off-by: Joerg Roedel --- drivers/iommu/intel/iommu.c | 90 ------------------------------------- 1 file changed, 90 deletions(-) diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c index 4803e0cb8279..dd158ff5fd45 100644 --- a/drivers/iommu/intel/iommu.c +++ b/drivers/iommu/intel/iommu.c @@ -1454,27 +1454,6 @@ static bool first_level_by_default(unsigned int type) return type != IOMMU_DOMAIN_UNMANAGED; } -static struct dmar_domain *alloc_domain(unsigned int type) -{ - struct dmar_domain *domain; - - domain = kzalloc(sizeof(*domain), GFP_KERNEL); - if (!domain) - return NULL; - - domain->nid = NUMA_NO_NODE; - if (first_level_by_default(type)) - domain->use_first_level = true; - INIT_LIST_HEAD(&domain->devices); - INIT_LIST_HEAD(&domain->dev_pasids); - INIT_LIST_HEAD(&domain->cache_tags); - spin_lock_init(&domain->lock); - spin_lock_init(&domain->cache_lock); - xa_init(&domain->iommu_array); - - return domain; -} - int domain_attach_iommu(struct dmar_domain *domain, struct intel_iommu *iommu) { struct iommu_domain_info *info, *curr; @@ -1546,20 +1525,6 @@ void domain_detach_iommu(struct dmar_domain *domain, struct intel_iommu *iommu) spin_unlock(&iommu->lock); } -static int guestwidth_to_adjustwidth(int gaw) -{ - int agaw; - int r = (gaw - 12) % 9; - - if (r == 0) - agaw = gaw; - else - agaw = gaw + 9 - r; - if (agaw > 64) - agaw = 64; - return agaw; -} - static void domain_exit(struct dmar_domain *domain) { if (domain->pgd) { @@ -3379,27 +3344,6 @@ void device_block_translation(struct device *dev) info->domain = NULL; } -static int md_domain_init(struct dmar_domain *domain, int guest_width) -{ - int adjust_width; - - /* calculate AGAW */ - domain->gaw = guest_width; - adjust_width = guestwidth_to_adjustwidth(guest_width); - domain->agaw = width_to_agaw(adjust_width); - - domain->iommu_coherency = false; - domain->iommu_superpage = 0; - domain->max_addr = 0; - - /* always allocate the top pgd */ - domain->pgd = iommu_alloc_page_node(domain->nid, GFP_ATOMIC); - if (!domain->pgd) - return -ENOMEM; - domain_flush_cache(domain, domain->pgd, PAGE_SIZE); - return 0; -} - static int blocking_domain_attach_dev(struct iommu_domain *domain, struct device *dev) { @@ -3486,39 +3430,6 @@ static struct dmar_domain *paging_domain_alloc(struct device *dev, bool first_st return domain; } -static struct iommu_domain *intel_iommu_domain_alloc(unsigned type) -{ - struct dmar_domain *dmar_domain; - struct iommu_domain *domain; - - switch (type) { - case IOMMU_DOMAIN_DMA: - case IOMMU_DOMAIN_UNMANAGED: - dmar_domain = alloc_domain(type); - if (!dmar_domain) { - pr_err("Can't allocate dmar_domain\n"); - return NULL; - } - if (md_domain_init(dmar_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) { - pr_err("Domain initialization failed\n"); - domain_exit(dmar_domain); - return NULL; - } - - domain = &dmar_domain->domain; - domain->geometry.aperture_start = 0; - domain->geometry.aperture_end = - __DOMAIN_MAX_ADDR(dmar_domain->gaw); - domain->geometry.force_aperture = true; - - return domain; - default: - return NULL; - } - - return NULL; -} - static struct iommu_domain * intel_iommu_domain_alloc_user(struct device *dev, u32 flags, struct iommu_domain *parent, @@ -4609,7 +4520,6 @@ const struct iommu_ops intel_iommu_ops = { .identity_domain = &identity_domain, .capable = intel_iommu_capable, .hw_info = intel_iommu_hw_info, - .domain_alloc = intel_iommu_domain_alloc, 
.domain_alloc_user = intel_iommu_domain_alloc_user, .domain_alloc_sva = intel_svm_domain_alloc, .domain_alloc_paging = intel_iommu_domain_alloc_paging, -- 2.51.0 From a98db518dde246e01ead53617dc0a30d6aaa3752 Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Mon, 4 Nov 2024 09:40:23 +0800 Subject: [PATCH 06/16] iommu/vt-d: Enhance compatibility check for paging domain attach The driver now supports domain_alloc_paging, ensuring that a valid device pointer is provided whenever a paging domain is allocated. Additionally, the dmar_domain attributes are set up at the time of allocation. Consistent with the established semantics in the IOMMU core, if a domain is attached to a device and found to be incompatible with the IOMMU hardware capabilities, the operation will return an -EINVAL error. This implicitly advises the caller to allocate a new domain for the device and attempt the domain attachment again. Rename prepare_domain_attach_device() to a more meaningful name. Signed-off-by: Lu Baolu Reviewed-by: Jason Gunthorpe Link: https://lore.kernel.org/r/20241021085125.192333-4-baolu.lu@linux.intel.com Signed-off-by: Joerg Roedel --- drivers/iommu/intel/iommu.c | 70 ++++++++++++------------------------ drivers/iommu/intel/iommu.h | 3 +- drivers/iommu/intel/nested.c | 2 +- drivers/iommu/intel/pasid.c | 28 +-------------- 4 files changed, 26 insertions(+), 77 deletions(-) diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c index dd158ff5fd45..eeb341aafe3e 100644 --- a/drivers/iommu/intel/iommu.c +++ b/drivers/iommu/intel/iommu.c @@ -1606,7 +1606,7 @@ static int domain_context_mapping_one(struct dmar_domain *domain, int translation = CONTEXT_TT_MULTI_LEVEL; struct dma_pte *pgd = domain->pgd; struct context_entry *context; - int agaw, ret; + int ret; pr_debug("Set context mapping for %02x:%02x.%d\n", bus, PCI_SLOT(devfn), PCI_FUNC(devfn)); @@ -1623,27 +1623,15 @@ static int domain_context_mapping_one(struct dmar_domain *domain, copied_context_tear_down(iommu, context, bus, devfn); context_clear_entry(context); - context_set_domain_id(context, did); - /* - * Skip top levels of page tables for iommu which has - * less agaw than default. Unnecessary for PT mode. - */ - for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) { - ret = -ENOMEM; - pgd = phys_to_virt(dma_pte_addr(pgd)); - if (!dma_pte_present(pgd)) - goto out_unlock; - } - if (info && info->ats_supported) translation = CONTEXT_TT_DEV_IOTLB; else translation = CONTEXT_TT_MULTI_LEVEL; context_set_address_root(context, virt_to_phys(pgd)); - context_set_address_width(context, agaw); + context_set_address_width(context, domain->agaw); context_set_translation_type(context, translation); context_set_fault_enable(context); context_set_present(context); @@ -1876,20 +1864,9 @@ static int domain_setup_first_level(struct intel_iommu *iommu, u32 pasid) { struct dma_pte *pgd = domain->pgd; - int agaw, level; - int flags = 0; + int level, flags = 0; - /* - * Skip top levels of page tables for iommu which has - * less agaw than default. Unnecessary for PT mode. 
- */ - for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) { - pgd = phys_to_virt(dma_pte_addr(pgd)); - if (!dma_pte_present(pgd)) - return -ENOMEM; - } - - level = agaw_to_level(agaw); + level = agaw_to_level(domain->agaw); if (level != 4 && level != 5) return -EINVAL; @@ -3492,42 +3469,41 @@ static void intel_iommu_domain_free(struct iommu_domain *domain) domain_exit(dmar_domain); } -int prepare_domain_attach_device(struct iommu_domain *domain, - struct device *dev) +int paging_domain_compatible(struct iommu_domain *domain, struct device *dev) { struct device_domain_info *info = dev_iommu_priv_get(dev); struct dmar_domain *dmar_domain = to_dmar_domain(domain); struct intel_iommu *iommu = info->iommu; int addr_width; + if (WARN_ON_ONCE(!(domain->type & __IOMMU_DOMAIN_PAGING))) + return -EPERM; + if (dmar_domain->force_snooping && !ecap_sc_support(iommu->ecap)) return -EINVAL; if (domain->dirty_ops && !ssads_supported(iommu)) return -EINVAL; + if (dmar_domain->iommu_coherency != + iommu_paging_structure_coherency(iommu)) + return -EINVAL; + + if (dmar_domain->iommu_superpage != + iommu_superpage_capability(iommu, dmar_domain->use_first_level)) + return -EINVAL; + + if (dmar_domain->use_first_level && + (!sm_supported(iommu) || !ecap_flts(iommu->ecap))) + return -EINVAL; + /* check if this iommu agaw is sufficient for max mapped address */ addr_width = agaw_to_width(iommu->agaw); if (addr_width > cap_mgaw(iommu->cap)) addr_width = cap_mgaw(iommu->cap); - if (dmar_domain->max_addr > (1LL << addr_width)) + if (dmar_domain->gaw > addr_width || dmar_domain->agaw > iommu->agaw) return -EINVAL; - dmar_domain->gaw = addr_width; - - /* - * Knock out extra levels of page tables if necessary - */ - while (iommu->agaw < dmar_domain->agaw) { - struct dma_pte *pte; - - pte = dmar_domain->pgd; - if (dma_pte_present(pte)) { - dmar_domain->pgd = phys_to_virt(dma_pte_addr(pte)); - iommu_free_page(pte); - } - dmar_domain->agaw--; - } if (sm_supported(iommu) && !dev_is_real_dma_subdevice(dev) && context_copied(iommu, info->bus, info->devfn)) @@ -3543,7 +3519,7 @@ static int intel_iommu_attach_device(struct iommu_domain *domain, device_block_translation(dev); - ret = prepare_domain_attach_device(domain, dev); + ret = paging_domain_compatible(domain, dev); if (ret) return ret; @@ -4214,7 +4190,7 @@ static int intel_iommu_set_dev_pasid(struct iommu_domain *domain, if (context_copied(iommu, info->bus, info->devfn)) return -EBUSY; - ret = prepare_domain_attach_device(domain, dev); + ret = paging_domain_compatible(domain, dev); if (ret) return ret; diff --git a/drivers/iommu/intel/iommu.h b/drivers/iommu/intel/iommu.h index 1497f3112b12..b1928ca3aaa8 100644 --- a/drivers/iommu/intel/iommu.h +++ b/drivers/iommu/intel/iommu.h @@ -1230,8 +1230,7 @@ void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did, u64 addr, int domain_attach_iommu(struct dmar_domain *domain, struct intel_iommu *iommu); void domain_detach_iommu(struct dmar_domain *domain, struct intel_iommu *iommu); void device_block_translation(struct device *dev); -int prepare_domain_attach_device(struct iommu_domain *domain, - struct device *dev); +int paging_domain_compatible(struct iommu_domain *domain, struct device *dev); void domain_update_iommu_cap(struct dmar_domain *domain); int dmar_ir_support(void); diff --git a/drivers/iommu/intel/nested.c b/drivers/iommu/intel/nested.c index 433c58944401..96016bc40f94 100644 --- a/drivers/iommu/intel/nested.c +++ b/drivers/iommu/intel/nested.c @@ -40,7 +40,7 @@ static int intel_nested_attach_dev(struct 
iommu_domain *domain, * The s2_domain will be used in nested translation, hence needs * to ensure the s2_domain is compatible with this IOMMU. */ - ret = prepare_domain_attach_device(&dmar_domain->s2_domain->domain, dev); + ret = paging_domain_compatible(&dmar_domain->s2_domain->domain, dev); if (ret) { dev_err_ratelimited(dev, "s2 domain is not compatible\n"); return ret; diff --git a/drivers/iommu/intel/pasid.c b/drivers/iommu/intel/pasid.c index 2e5fa0a23299..53157e1194f4 100644 --- a/drivers/iommu/intel/pasid.c +++ b/drivers/iommu/intel/pasid.c @@ -345,25 +345,6 @@ int intel_pasid_setup_first_level(struct intel_iommu *iommu, return 0; } -/* - * Skip top levels of page tables for iommu which has less agaw - * than default. Unnecessary for PT mode. - */ -static int iommu_skip_agaw(struct dmar_domain *domain, - struct intel_iommu *iommu, - struct dma_pte **pgd) -{ - int agaw; - - for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) { - *pgd = phys_to_virt(dma_pte_addr(*pgd)); - if (!dma_pte_present(*pgd)) - return -EINVAL; - } - - return agaw; -} - /* * Set up the scalable mode pasid entry for second only translation type. */ @@ -374,7 +355,6 @@ int intel_pasid_setup_second_level(struct intel_iommu *iommu, struct pasid_entry *pte; struct dma_pte *pgd; u64 pgd_val; - int agaw; u16 did; /* @@ -388,12 +368,6 @@ int intel_pasid_setup_second_level(struct intel_iommu *iommu, } pgd = domain->pgd; - agaw = iommu_skip_agaw(domain, iommu, &pgd); - if (agaw < 0) { - dev_err(dev, "Invalid domain page table\n"); - return -EINVAL; - } - pgd_val = virt_to_phys(pgd); did = domain_id_iommu(domain, iommu); @@ -412,7 +386,7 @@ int intel_pasid_setup_second_level(struct intel_iommu *iommu, pasid_clear_entry(pte); pasid_set_domain_id(pte, did); pasid_set_slptr(pte, pgd_val); - pasid_set_address_width(pte, agaw); + pasid_set_address_width(pte, domain->agaw); pasid_set_translation_type(pte, PASID_ENTRY_PGTT_SL_ONLY); pasid_set_fault_enable(pte); pasid_set_page_snoop(pte, !!ecap_smpwc(iommu->ecap)); -- 2.51.0 From c376a3456d8bef43ec556a98c0a04c35086c2737 Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Mon, 4 Nov 2024 09:40:24 +0800 Subject: [PATCH 07/16] iommu/vt-d: Remove domain_update_iommu_cap() The attributes of a paging domain are initialized during the allocation process, and any attempt to attach a domain that is not compatible will result in a failure. Therefore, there is no need to update the domain attributes at the time of domain attachment. 
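To illustrate the resulting contract (a hypothetical caller sketch, not code
from this series): attach either succeeds against the attributes fixed at
allocation time, or fails with -EINVAL, in which case the caller allocates a
fresh domain for the device and retries:

	ret = iommu_attach_device(domain, dev);
	if (ret == -EINVAL) {
		/* Incompatible device: get a domain sized for it instead. */
		new_domain = iommu_paging_domain_alloc(dev);
		if (IS_ERR(new_domain))
			return PTR_ERR(new_domain);
		ret = iommu_attach_device(new_domain, dev);
	}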
Signed-off-by: Lu Baolu Reviewed-by: Jason Gunthorpe Link: https://lore.kernel.org/r/20241021085125.192333-5-baolu.lu@linux.intel.com Signed-off-by: Joerg Roedel --- drivers/iommu/intel/iommu.c | 83 ------------------------------------- drivers/iommu/intel/iommu.h | 1 - 2 files changed, 84 deletions(-) diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c index eeb341aafe3e..756caa24008f 100644 --- a/drivers/iommu/intel/iommu.c +++ b/drivers/iommu/intel/iommu.c @@ -352,36 +352,6 @@ static bool iommu_paging_structure_coherency(struct intel_iommu *iommu) ecap_smpwc(iommu->ecap) : ecap_coherent(iommu->ecap); } -static void domain_update_iommu_coherency(struct dmar_domain *domain) -{ - struct iommu_domain_info *info; - struct dmar_drhd_unit *drhd; - struct intel_iommu *iommu; - bool found = false; - unsigned long i; - - domain->iommu_coherency = true; - xa_for_each(&domain->iommu_array, i, info) { - found = true; - if (!iommu_paging_structure_coherency(info->iommu)) { - domain->iommu_coherency = false; - break; - } - } - if (found) - return; - - /* No hardware attached; use lowest common denominator */ - rcu_read_lock(); - for_each_active_iommu(iommu, drhd) { - if (!iommu_paging_structure_coherency(iommu)) { - domain->iommu_coherency = false; - break; - } - } - rcu_read_unlock(); -} - static int domain_update_iommu_superpage(struct dmar_domain *domain, struct intel_iommu *skip) { @@ -412,29 +382,6 @@ static int domain_update_iommu_superpage(struct dmar_domain *domain, return fls(mask); } -static int domain_update_device_node(struct dmar_domain *domain) -{ - struct device_domain_info *info; - int nid = NUMA_NO_NODE; - unsigned long flags; - - spin_lock_irqsave(&domain->lock, flags); - list_for_each_entry(info, &domain->devices, link) { - /* - * There could possibly be multiple device numa nodes as devices - * within the same domain may sit behind different IOMMUs. There - * isn't perfect answer in such situation, so we select first - * come first served policy. - */ - nid = dev_to_node(info->dev); - if (nid != NUMA_NO_NODE) - break; - } - spin_unlock_irqrestore(&domain->lock, flags); - - return nid; -} - /* Return the super pagesize bitmap if supported. */ static unsigned long domain_super_pgsize_bitmap(struct dmar_domain *domain) { @@ -452,34 +399,6 @@ static unsigned long domain_super_pgsize_bitmap(struct dmar_domain *domain) return bitmap; } -/* Some capabilities may be different across iommus */ -void domain_update_iommu_cap(struct dmar_domain *domain) -{ - domain_update_iommu_coherency(domain); - domain->iommu_superpage = domain_update_iommu_superpage(domain, NULL); - - /* - * If RHSA is missing, we should default to the device numa domain - * as fall back. - */ - if (domain->nid == NUMA_NO_NODE) - domain->nid = domain_update_device_node(domain); - - /* - * First-level translation restricts the input-address to a - * canonical address (i.e., address bits 63:N have the same - * value as address bit [N-1], where N is 48-bits with 4-level - * paging and 57-bits with 5-level paging). Hence, skip bit - * [N-1]. 
- */ - if (domain->use_first_level) - domain->domain.geometry.aperture_end = __DOMAIN_MAX_ADDR(domain->gaw - 1); - else - domain->domain.geometry.aperture_end = __DOMAIN_MAX_ADDR(domain->gaw); - - domain->domain.pgsize_bitmap |= domain_super_pgsize_bitmap(domain); -} - struct context_entry *iommu_context_addr(struct intel_iommu *iommu, u8 bus, u8 devfn, int alloc) { @@ -1493,7 +1412,6 @@ int domain_attach_iommu(struct dmar_domain *domain, struct intel_iommu *iommu) ret = xa_err(curr) ? : -EBUSY; goto err_clear; } - domain_update_iommu_cap(domain); spin_unlock(&iommu->lock); return 0; @@ -1519,7 +1437,6 @@ void domain_detach_iommu(struct dmar_domain *domain, struct intel_iommu *iommu) clear_bit(info->did, iommu->domain_ids); xa_erase(&domain->iommu_array, iommu->seq_id); domain->nid = NUMA_NO_NODE; - domain_update_iommu_cap(domain); kfree(info); } spin_unlock(&iommu->lock); diff --git a/drivers/iommu/intel/iommu.h b/drivers/iommu/intel/iommu.h index b1928ca3aaa8..f9fba9a26dac 100644 --- a/drivers/iommu/intel/iommu.h +++ b/drivers/iommu/intel/iommu.h @@ -1231,7 +1231,6 @@ int domain_attach_iommu(struct dmar_domain *domain, struct intel_iommu *iommu); void domain_detach_iommu(struct dmar_domain *domain, struct intel_iommu *iommu); void device_block_translation(struct device *dev); int paging_domain_compatible(struct iommu_domain *domain, struct device *dev); -void domain_update_iommu_cap(struct dmar_domain *domain); int dmar_ir_support(void); -- 2.51.0 From 5bdd86ec5d19060f63c00fff3b081c887242a37a Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Mon, 4 Nov 2024 09:40:25 +0800 Subject: [PATCH 08/16] iommu/vt-d: Remove domain_update_iommu_superpage() The requirement for consistent super page support across all the IOMMU hardware in the system has been removed. In the past, if a new IOMMU was hot-added and lacked consistent super page capability, the hot-add process would be aborted. However, with the updated attachment semantics, it is now permissible for the super page capability to vary among different IOMMU hardware units. Signed-off-by: Lu Baolu Reviewed-by: Jason Gunthorpe Link: https://lore.kernel.org/r/20241021085125.192333-6-baolu.lu@linux.intel.com Signed-off-by: Joerg Roedel --- drivers/iommu/intel/iommu.c | 39 +------------------------------------ 1 file changed, 1 insertion(+), 38 deletions(-) diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c index 756caa24008f..36854b683b11 100644 --- a/drivers/iommu/intel/iommu.c +++ b/drivers/iommu/intel/iommu.c @@ -352,36 +352,6 @@ static bool iommu_paging_structure_coherency(struct intel_iommu *iommu) ecap_smpwc(iommu->ecap) : ecap_coherent(iommu->ecap); } -static int domain_update_iommu_superpage(struct dmar_domain *domain, - struct intel_iommu *skip) -{ - struct dmar_drhd_unit *drhd; - struct intel_iommu *iommu; - int mask = 0x3; - - if (!intel_iommu_superpage) - return 0; - - /* set iommu_superpage to the smallest common denominator */ - rcu_read_lock(); - for_each_active_iommu(iommu, drhd) { - if (iommu != skip) { - if (domain && domain->use_first_level) { - if (!cap_fl1gp_support(iommu->cap)) - mask = 0x1; - } else { - mask &= cap_super_page_val(iommu->cap); - } - - if (!mask) - break; - } - } - rcu_read_unlock(); - - return fls(mask); -} - /* Return the super pagesize bitmap if supported. 
*/ static unsigned long domain_super_pgsize_bitmap(struct dmar_domain *domain) { @@ -2605,20 +2575,13 @@ int dmar_parse_one_satc(struct acpi_dmar_header *hdr, void *arg) static int intel_iommu_add(struct dmar_drhd_unit *dmaru) { - int sp, ret; struct intel_iommu *iommu = dmaru->iommu; + int ret; ret = intel_cap_audit(CAP_AUDIT_HOTPLUG_DMAR, iommu); if (ret) goto out; - sp = domain_update_iommu_superpage(NULL, iommu) - 1; - if (sp >= 0 && !(cap_super_page_val(iommu->cap) & (1 << sp))) { - pr_warn("%s: Doesn't support large page.\n", - iommu->name); - return -ENXIO; - } - /* * Disable translation if already enabled prior to OS handover. */ -- 2.51.0 From ed56de8a9e90d9771c4517fb9f2daac8282269ba Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Mon, 4 Nov 2024 09:40:26 +0800 Subject: [PATCH 09/16] iommu/vt-d: Refactor first_level_by_default() The first stage page table is compatible across host and guest kernels. Therefore, this driver uses the first stage page table as the default for paging domains. The helper first_level_by_default() determines the feasibility of using the first stage page table based on a global policy. This policy requires consistency in scalable mode and first stage translation capability among all iommu units. However, this is unnecessary as domain allocation, attachment, and removal operations are performed on a per-device basis. The domain type (IOMMU_DOMAIN_DMA vs. IOMMU_DOMAIN_UNMANAGED) should not be a factor in determining the first stage page table usage. Both types are for paging domains, and there's no fundamental difference between them. The driver should not be aware of this distinction unless the core specifies allocation flags that require special handling. Convert first_level_by_default() from global to per-iommu and remove the 'type' input. Signed-off-by: Lu Baolu Reviewed-by: Jason Gunthorpe Link: https://lore.kernel.org/r/20241021085125.192333-7-baolu.lu@linux.intel.com Signed-off-by: Joerg Roedel --- drivers/iommu/intel/iommu.c | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c index 36854b683b11..bad9593f2464 100644 --- a/drivers/iommu/intel/iommu.c +++ b/drivers/iommu/intel/iommu.c @@ -1329,18 +1329,17 @@ static void free_dmar_iommu(struct intel_iommu *iommu) * Check and return whether first level is used by default for * DMA translation. */ -static bool first_level_by_default(unsigned int type) +static bool first_level_by_default(struct intel_iommu *iommu) { /* Only SL is available in legacy mode */ - if (!scalable_mode_support()) + if (!sm_supported(iommu)) return false; /* Only level (either FL or SL) is available, just use it */ - if (intel_cap_flts_sanity() ^ intel_cap_slts_sanity()) - return intel_cap_flts_sanity(); + if (ecap_flts(iommu->ecap) ^ ecap_slts(iommu->ecap)) + return ecap_flts(iommu->ecap); - /* Both levels are available, decide it based on domain type */ - return type != IOMMU_DOMAIN_UNMANAGED; + return true; } int domain_attach_iommu(struct dmar_domain *domain, struct intel_iommu *iommu) @@ -3110,7 +3109,7 @@ int __init intel_iommu_init(void) * the virtual and physical IOMMU page-tables. 
 */
 		if (cap_caching_mode(iommu->cap) &&
-		    !first_level_by_default(IOMMU_DOMAIN_DMA)) {
+		    !first_level_by_default(iommu)) {
 			pr_info_once("IOMMU batching disallowed due to virtualization\n");
 			iommu_set_dma_strict();
 		}
@@ -4359,10 +4358,12 @@ static struct iommu_domain identity_domain = {
 
 static struct iommu_domain *intel_iommu_domain_alloc_paging(struct device *dev)
 {
+	struct device_domain_info *info = dev_iommu_priv_get(dev);
+	struct intel_iommu *iommu = info->iommu;
 	struct dmar_domain *dmar_domain;
 	bool first_stage;
 
-	first_stage = first_level_by_default(0);
+	first_stage = first_level_by_default(iommu);
 	dmar_domain = paging_domain_alloc(dev, first_stage);
 	if (IS_ERR(dmar_domain))
 		return ERR_CAST(dmar_domain);
-- 
2.51.0

From 621838c718a81ba3bfb8e0f941bc0133166bc534 Mon Sep 17 00:00:00 2001
From: Lu Baolu
Date: Mon, 4 Nov 2024 09:40:27 +0800
Subject: [PATCH 10/16] iommu/vt-d: Refine intel_iommu_domain_alloc_user()

The domain_alloc_user ops should always allocate a guest-compatible page
table unless specific allocation flags are specified.

Currently, IOMMU_HWPT_ALLOC_NEST_PARENT and IOMMU_HWPT_ALLOC_DIRTY_TRACKING
require special handling, as both require hardware support for scalable
mode and second-stage translation. In such cases, the driver should select
a second-stage page table for the paging domain.

Suggested-by: Jason Gunthorpe
Signed-off-by: Lu Baolu
Reviewed-by: Jason Gunthorpe
Link: https://lore.kernel.org/r/20241021085125.192333-8-baolu.lu@linux.intel.com
Signed-off-by: Joerg Roedel
---
 drivers/iommu/intel/iommu.c | 17 +++++++++++++++--
 1 file changed, 15 insertions(+), 2 deletions(-)

diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c
index bad9593f2464..2b5027dd0c96 100644
--- a/drivers/iommu/intel/iommu.c
+++ b/drivers/iommu/intel/iommu.c
@@ -3297,6 +3297,7 @@ intel_iommu_domain_alloc_user(struct device *dev, u32 flags,
 	struct intel_iommu *iommu = info->iommu;
 	struct dmar_domain *dmar_domain;
 	struct iommu_domain *domain;
+	bool first_stage;
 
 	/* Must be NESTING domain */
 	if (parent) {
@@ -3313,8 +3314,20 @@ intel_iommu_domain_alloc_user(struct device *dev, u32 flags,
 	if (user_data || (dirty_tracking && !ssads_supported(iommu)))
 		return ERR_PTR(-EOPNOTSUPP);
 
-	/* Do not use first stage for user domain translation. */
-	dmar_domain = paging_domain_alloc(dev, false);
+	/*
+	 * Always allocate the guest compatible page table unless
+	 * IOMMU_HWPT_ALLOC_NEST_PARENT or IOMMU_HWPT_ALLOC_DIRTY_TRACKING
+	 * is specified.
+	 */
+	if (nested_parent || dirty_tracking) {
+		if (!sm_supported(iommu) || !ecap_slts(iommu->ecap))
+			return ERR_PTR(-EOPNOTSUPP);
+		first_stage = false;
+	} else {
+		first_stage = first_level_by_default(iommu);
+	}
+
+	dmar_domain = paging_domain_alloc(dev, first_stage);
 	if (IS_ERR(dmar_domain))
 		return ERR_CAST(dmar_domain);
 	domain = &dmar_domain->domain;
-- 
2.51.0

From 2a32309345ef2977ceb4fba81600066474ac8581 Mon Sep 17 00:00:00 2001
From: Jinjie Ruan
Date: Mon, 4 Nov 2024 09:40:28 +0800
Subject: [PATCH 11/16] iommu/vt-d: Use PCI_DEVID() macro

The macro PCI_DEVID() can be used instead of composing it manually.
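For reference, PCI_DEVID() (as defined in include/linux/pci.h at the time of
writing) builds the same 16-bit source ID, so the conversions are mechanical;
e.g. in set_ioapic_sid():

	#define PCI_DEVID(bus, devfn)	((((u16)(bus)) << 8) | (devfn))

	sid = (ir_ioapic[i].bus << 8) | ir_ioapic[i].devfn;	/* before */
	sid = PCI_DEVID(ir_ioapic[i].bus, ir_ioapic[i].devfn);	/* after  */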
Signed-off-by: Jinjie Ruan
Reviewed-by: Jason Gunthorpe
Link: https://lore.kernel.org/r/20240829021011.4135618-1-ruanjinjie@huawei.com
Signed-off-by: Lu Baolu
Signed-off-by: Joerg Roedel
---
 drivers/iommu/intel/iommu.c         | 4 ++--
 drivers/iommu/intel/irq_remapping.c | 4 ++--
 drivers/iommu/intel/pasid.c         | 2 +-
 3 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c
index 2b5027dd0c96..2d67db67f5e3 100644
--- a/drivers/iommu/intel/iommu.c
+++ b/drivers/iommu/intel/iommu.c
@@ -1452,7 +1452,7 @@ static void copied_context_tear_down(struct intel_iommu *iommu,
 
 	if (did_old < cap_ndoms(iommu->cap)) {
 		iommu->flush.flush_context(iommu, did_old,
-					   (((u16)bus) << 8) | devfn,
+					   PCI_DEVID(bus, devfn),
 					   DMA_CCMD_MASK_NOBIT,
 					   DMA_CCMD_DEVICE_INVL);
 		iommu->flush.flush_iotlb(iommu, did_old, 0, 0,
@@ -1473,7 +1473,7 @@ static void context_present_cache_flush(struct intel_iommu *iommu, u16 did,
 {
 	if (cap_caching_mode(iommu->cap)) {
 		iommu->flush.flush_context(iommu, 0,
-					   (((u16)bus) << 8) | devfn,
+					   PCI_DEVID(bus, devfn),
 					   DMA_CCMD_MASK_NOBIT,
 					   DMA_CCMD_DEVICE_INVL);
 		iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH);
diff --git a/drivers/iommu/intel/irq_remapping.c b/drivers/iommu/intel/irq_remapping.c
index 7a6d188e3bea..466c1412dd45 100644
--- a/drivers/iommu/intel/irq_remapping.c
+++ b/drivers/iommu/intel/irq_remapping.c
@@ -312,7 +312,7 @@ static int set_ioapic_sid(struct irte *irte, int apic)
 
 	for (i = 0; i < MAX_IO_APICS; i++) {
 		if (ir_ioapic[i].iommu && ir_ioapic[i].id == apic) {
-			sid = (ir_ioapic[i].bus << 8) | ir_ioapic[i].devfn;
+			sid = PCI_DEVID(ir_ioapic[i].bus, ir_ioapic[i].devfn);
 			break;
 		}
 	}
@@ -337,7 +337,7 @@ static int set_hpet_sid(struct irte *irte, u8 id)
 
 	for (i = 0; i < MAX_HPET_TBS; i++) {
 		if (ir_hpet[i].iommu && ir_hpet[i].id == id) {
-			sid = (ir_hpet[i].bus << 8) | ir_hpet[i].devfn;
+			sid = PCI_DEVID(ir_hpet[i].bus, ir_hpet[i].devfn);
 			break;
 		}
 	}
diff --git a/drivers/iommu/intel/pasid.c b/drivers/iommu/intel/pasid.c
index 53157e1194f4..7ef157615e0f 100644
--- a/drivers/iommu/intel/pasid.c
+++ b/drivers/iommu/intel/pasid.c
@@ -220,7 +220,7 @@ devtlb_invalidation_with_pasid(struct intel_iommu *iommu,
 	if (pci_dev_is_disconnected(to_pci_dev(dev)))
 		return;
 
-	sid = info->bus << 8 | info->devfn;
+	sid = PCI_DEVID(info->bus, info->devfn);
 	qdep = info->ats_qdep;
 	pfsid = info->pfsid;
 
-- 
2.51.0

From 6d8bac098e6e44b9a2768e38e9bf77626dc591b7 Mon Sep 17 00:00:00 2001
From: Andy Shevchenko
Date: Mon, 4 Nov 2024 09:40:29 +0800
Subject: [PATCH 12/16] iommu/vt-d: Increase buffer size for device name
MIME-Version: 1.0
Content-Type: text/plain; charset=utf8
Content-Transfer-Encoding: 8bit

GCC is not happy with the current code, e.g.:

.../iommu/intel/dmar.c:1063:9: note: ‘sprintf’ output between 6 and 15 bytes into a destination of size 13
 1063 |         sprintf(iommu->name, "dmar%d", iommu->seq_id);

When `make W=1` is supplied, this prevents kernel building. Fix it by
increasing the buffer size for the device name and using sizeof() instead
of hard-coded constants.
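A standalone sketch of the pattern being applied (names are illustrative):

	char name[16];	/* raised from 13 so the longest "dmar%d" output fits */

	/*
	 * Deriving the bound with sizeof() keeps the limit in sync with
	 * the declaration and lets the compiler verify the destination.
	 */
	snprintf(name, sizeof(name), "dmar%d", seq_id);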
Signed-off-by: Andy Shevchenko Link: https://lore.kernel.org/r/20241014104529.4025937-1-andriy.shevchenko@linux.intel.com Signed-off-by: Lu Baolu Signed-off-by: Joerg Roedel --- drivers/iommu/intel/dmar.c | 2 +- drivers/iommu/intel/iommu.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/iommu/intel/dmar.c b/drivers/iommu/intel/dmar.c index eaf862e8dea1..e16c2b1d7633 100644 --- a/drivers/iommu/intel/dmar.c +++ b/drivers/iommu/intel/dmar.c @@ -1060,7 +1060,7 @@ static int alloc_iommu(struct dmar_drhd_unit *drhd) err = iommu->seq_id; goto error; } - sprintf(iommu->name, "dmar%d", iommu->seq_id); + snprintf(iommu->name, sizeof(iommu->name), "dmar%d", iommu->seq_id); err = map_iommu(iommu, drhd); if (err) { diff --git a/drivers/iommu/intel/iommu.h b/drivers/iommu/intel/iommu.h index f9fba9a26dac..79692d7a26d1 100644 --- a/drivers/iommu/intel/iommu.h +++ b/drivers/iommu/intel/iommu.h @@ -720,7 +720,7 @@ struct intel_iommu { int msagaw; /* max sagaw of this iommu */ unsigned int irq, pr_irq, perf_irq; u16 segment; /* PCI segment# */ - unsigned char name[13]; /* Device Name */ + unsigned char name[16]; /* Device Name */ #ifdef CONFIG_INTEL_IOMMU unsigned long *domain_ids; /* bitmap of domains */ -- 2.51.0 From 95e2eaf5b91aae6c3a433cd7882733bd806fa3c8 Mon Sep 17 00:00:00 2001 From: "Dr. David Alan Gilbert" Date: Mon, 4 Nov 2024 09:40:30 +0800 Subject: [PATCH 13/16] iommu/vt-d: Remove unused dmar_msi_read dmar_msi_read() has been unused since 2022 in commit cf8e8658100d ("arch: Remove Itanium (IA-64) architecture") Remove it. (dmar_msi_write still exists and is used once). Signed-off-by: Dr. David Alan Gilbert Link: https://lore.kernel.org/r/20241022002702.302728-1-linux@treblig.org Signed-off-by: Lu Baolu Signed-off-by: Joerg Roedel --- drivers/iommu/intel/dmar.c | 13 ------------- include/linux/dmar.h | 1 - 2 files changed, 14 deletions(-) diff --git a/drivers/iommu/intel/dmar.c b/drivers/iommu/intel/dmar.c index e16c2b1d7633..9f424acf474e 100644 --- a/drivers/iommu/intel/dmar.c +++ b/drivers/iommu/intel/dmar.c @@ -1895,19 +1895,6 @@ void dmar_msi_write(int irq, struct msi_msg *msg) raw_spin_unlock_irqrestore(&iommu->register_lock, flag); } -void dmar_msi_read(int irq, struct msi_msg *msg) -{ - struct intel_iommu *iommu = irq_get_handler_data(irq); - int reg = dmar_msi_reg(iommu, irq); - unsigned long flag; - - raw_spin_lock_irqsave(&iommu->register_lock, flag); - msg->data = readl(iommu->reg + reg + 4); - msg->address_lo = readl(iommu->reg + reg + 8); - msg->address_hi = readl(iommu->reg + reg + 12); - raw_spin_unlock_irqrestore(&iommu->register_lock, flag); -} - static int dmar_fault_do_one(struct intel_iommu *iommu, int type, u8 fault_reason, u32 pasid, u16 source_id, unsigned long long addr) diff --git a/include/linux/dmar.h b/include/linux/dmar.h index 499bb2c63483..692b2b445761 100644 --- a/include/linux/dmar.h +++ b/include/linux/dmar.h @@ -292,7 +292,6 @@ static inline void dmar_copy_shared_irte(struct irte *dst, struct irte *src) struct irq_data; extern void dmar_msi_unmask(struct irq_data *data); extern void dmar_msi_mask(struct irq_data *data); -extern void dmar_msi_read(int irq, struct msi_msg *msg); extern void dmar_msi_write(int irq, struct msi_msg *msg); extern int dmar_set_interrupt(struct intel_iommu *iommu); extern irqreturn_t dmar_fault(int irq, void *dev_id); -- 2.51.0 From 4f178e07a2e62293311e2107786a843d6290d77a Mon Sep 17 00:00:00 2001 From: Yi Liu Date: Mon, 4 Nov 2024 09:40:31 +0800 Subject: [PATCH 14/16] iommu/vt-d: Drop s1_pgtbl from 
 dmar_domain

dmar_domain has stored the s1_cfg which includes the s1_pgtbl info, so
no need to store s1_pgtbl, hence drop it.

Signed-off-by: Yi Liu
Reviewed-by: Kevin Tian
Link: https://lore.kernel.org/r/20241025143339.2328991-1-yi.l.liu@intel.com
Signed-off-by: Lu Baolu
Signed-off-by: Joerg Roedel
---
 drivers/iommu/intel/iommu.h  | 2 --
 drivers/iommu/intel/nested.c | 1 -
 drivers/iommu/intel/pasid.c  | 3 +--
 3 files changed, 1 insertion(+), 5 deletions(-)

diff --git a/drivers/iommu/intel/iommu.h b/drivers/iommu/intel/iommu.h
index 79692d7a26d1..4c6135a2e2f8 100644
--- a/drivers/iommu/intel/iommu.h
+++ b/drivers/iommu/intel/iommu.h
@@ -653,8 +653,6 @@ struct dmar_domain {
 		struct {
 			/* parent page table which the user domain is nested on */
 			struct dmar_domain *s2_domain;
-			/* user page table pointer (in GPA) */
-			unsigned long s1_pgtbl;
 			/* page table attributes */
 			struct iommu_hwpt_vtd_s1 s1_cfg;
 			/* link to parent domain siblings */
diff --git a/drivers/iommu/intel/nested.c b/drivers/iommu/intel/nested.c
index 96016bc40f94..989ca5cc04eb 100644
--- a/drivers/iommu/intel/nested.c
+++ b/drivers/iommu/intel/nested.c
@@ -162,7 +162,6 @@ struct iommu_domain *intel_nested_domain_alloc(struct iommu_domain *parent,
 	domain->use_first_level = true;
 	domain->s2_domain = s2_domain;
-	domain->s1_pgtbl = vtd.pgtbl_addr;
 	domain->s1_cfg = vtd;
 	domain->domain.ops = &intel_nested_domain_ops;
 	domain->domain.type = IOMMU_DOMAIN_NESTED;
diff --git a/drivers/iommu/intel/pasid.c b/drivers/iommu/intel/pasid.c
index 7ef157615e0f..7e76062a7ad2 100644
--- a/drivers/iommu/intel/pasid.c
+++ b/drivers/iommu/intel/pasid.c
@@ -560,7 +560,6 @@ int intel_pasid_setup_nested(struct intel_iommu *iommu, struct device *dev,
 			     u32 pasid, struct dmar_domain *domain)
 {
 	struct iommu_hwpt_vtd_s1 *s1_cfg = &domain->s1_cfg;
-	pgd_t *s1_gpgd = (pgd_t *)(uintptr_t)domain->s1_pgtbl;
 	struct dmar_domain *s2_domain = domain->s2_domain;
 	u16 did = domain_id_iommu(domain, iommu);
 	struct dma_pte *pgd = s2_domain->pgd;
@@ -611,7 +610,7 @@ int intel_pasid_setup_nested(struct intel_iommu *iommu, struct device *dev,
 	if (s1_cfg->addr_width == ADDR_WIDTH_5LEVEL)
 		pasid_set_flpm(pte, 1);
 
-	pasid_set_flptr(pte, (uintptr_t)s1_gpgd);
+	pasid_set_flptr(pte, s1_cfg->pgtbl_addr);
 
 	if (s1_cfg->flags & IOMMU_VTD_S1_SRE) {
 		pasid_set_sre(pte);
-- 
2.51.0

From 6ceb93f952f6ca34823ce3650c902c31b8385b40 Mon Sep 17 00:00:00 2001
From: Zhenzhong Duan
Date: Mon, 4 Nov 2024 09:40:32 +0800
Subject: [PATCH 15/16] iommu/vt-d: Fix checks and print in dmar_fault_dump_ptes()

There are some issues in dmar_fault_dump_ptes():

1. The return value of phys_to_virt() is used for checking if an entry
   is present.
2. The dump is confusing, e.g., "pasid table entry is not present" does
   not distinguish between a missing pasid table and a missing pasid
   table entry. The current code means the former.
3. pgtable_walk() is called without checking if the page table is
   present.

Fix 1 by checking the present bit of an entry before dumping a
lower-level entry.
Fix 2 by removing the "entry" string, e.g., "pasid table is not present".
Fix 3 by checking that the page table is present before the walk.
Take issue 3 for example, before fix: [ 442.240357] DMAR: pasid dir entry: 0x000000012c83e001 [ 442.246661] DMAR: pasid table entry[0]: 0x0000000000000000 [ 442.253429] DMAR: pasid table entry[1]: 0x0000000000000000 [ 442.260203] DMAR: pasid table entry[2]: 0x0000000000000000 [ 442.266969] DMAR: pasid table entry[3]: 0x0000000000000000 [ 442.273733] DMAR: pasid table entry[4]: 0x0000000000000000 [ 442.280479] DMAR: pasid table entry[5]: 0x0000000000000000 [ 442.287234] DMAR: pasid table entry[6]: 0x0000000000000000 [ 442.293989] DMAR: pasid table entry[7]: 0x0000000000000000 [ 442.300742] DMAR: PTE not present at level 2 After fix: ... [ 357.241214] DMAR: pasid table entry[6]: 0x0000000000000000 [ 357.248022] DMAR: pasid table entry[7]: 0x0000000000000000 [ 357.254824] DMAR: scalable mode page table is not present Fixes: 914ff7719e8a ("iommu/vt-d: Dump DMAR translation structure when DMA fault occurs") Signed-off-by: Zhenzhong Duan Link: https://lore.kernel.org/r/20241024092146.715063-2-zhenzhong.duan@intel.com Signed-off-by: Lu Baolu Signed-off-by: Joerg Roedel --- drivers/iommu/intel/iommu.c | 29 ++++++++++++++++++++--------- 1 file changed, 20 insertions(+), 9 deletions(-) diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c index 2d67db67f5e3..2337baa3cb80 100644 --- a/drivers/iommu/intel/iommu.c +++ b/drivers/iommu/intel/iommu.c @@ -626,11 +626,11 @@ void dmar_fault_dump_ptes(struct intel_iommu *iommu, u16 source_id, pr_info("Dump %s table entries for IOVA 0x%llx\n", iommu->name, addr); /* root entry dump */ - rt_entry = &iommu->root_entry[bus]; - if (!rt_entry) { - pr_info("root table entry is not present\n"); + if (!iommu->root_entry) { + pr_info("root table is not present\n"); return; } + rt_entry = &iommu->root_entry[bus]; if (sm_supported(iommu)) pr_info("scalable mode root entry: hi 0x%016llx, low 0x%016llx\n", @@ -641,7 +641,7 @@ void dmar_fault_dump_ptes(struct intel_iommu *iommu, u16 source_id, /* context entry dump */ ctx_entry = iommu_context_addr(iommu, bus, devfn, 0); if (!ctx_entry) { - pr_info("context table entry is not present\n"); + pr_info("context table is not present\n"); return; } @@ -650,17 +650,23 @@ void dmar_fault_dump_ptes(struct intel_iommu *iommu, u16 source_id, /* legacy mode does not require PASID entries */ if (!sm_supported(iommu)) { + if (!context_present(ctx_entry)) { + pr_info("legacy mode page table is not present\n"); + return; + } level = agaw_to_level(ctx_entry->hi & 7); pgtable = phys_to_virt(ctx_entry->lo & VTD_PAGE_MASK); goto pgtable_walk; } - /* get the pointer to pasid directory entry */ - dir = phys_to_virt(ctx_entry->lo & VTD_PAGE_MASK); - if (!dir) { - pr_info("pasid directory entry is not present\n"); + if (!context_present(ctx_entry)) { + pr_info("pasid directory table is not present\n"); return; } + + /* get the pointer to pasid directory entry */ + dir = phys_to_virt(ctx_entry->lo & VTD_PAGE_MASK); + /* For request-without-pasid, get the pasid from context entry */ if (intel_iommu_sm && pasid == IOMMU_PASID_INVALID) pasid = IOMMU_NO_PASID; @@ -672,7 +678,7 @@ void dmar_fault_dump_ptes(struct intel_iommu *iommu, u16 source_id, /* get the pointer to the pasid table entry */ entries = get_pasid_table_from_pde(pde); if (!entries) { - pr_info("pasid table entry is not present\n"); + pr_info("pasid table is not present\n"); return; } index = pasid & PASID_PTE_MASK; @@ -680,6 +686,11 @@ void dmar_fault_dump_ptes(struct intel_iommu *iommu, u16 source_id, for (i = 0; i < ARRAY_SIZE(pte->val); i++) pr_info("pasid table 
entry[%d]: 0x%016llx\n", i, pte->val[i]);
 
+	if (!pasid_pte_is_present(pte)) {
+		pr_info("scalable mode page table is not present\n");
+		return;
+	}
+
 	if (pasid_pte_get_pgtt(pte) == PASID_ENTRY_PGTT_FL_ONLY) {
 		level = pte->val[2] & BIT_ULL(2) ? 5 : 4;
 		pgtable = phys_to_virt(pte->val[2] & VTD_PAGE_MASK);
-- 
2.51.0

From f1645676f25d2c846798f0233c3a953efd62aafb Mon Sep 17 00:00:00 2001
From: Zhenzhong Duan
Date: Mon, 4 Nov 2024 09:40:33 +0800
Subject: [PATCH 16/16] iommu/vt-d: Fix checks and print in pgtable_walk()

There are some issues in pgtable_walk():

1. A super page is dumped as a non-present page.
2. dma_pte_superpage() should not be checked against leaf page table
   entries.
3. Pointer pte is never NULL, so checking it is meaningless.
4. When an entry is not present, it still makes sense to dump the entry
   content.

Fix 1 and 2 by checking dma_pte_superpage()'s returned value after the
level check.
Fix 3 by removing the pte check.
Fix 4 by checking the present bit after printing.

While at it, change to print "page table not present" instead of "PTE
not present" to be clearer.

Fixes: 914ff7719e8a ("iommu/vt-d: Dump DMAR translation structure when DMA fault occurs")
Signed-off-by: Zhenzhong Duan
Link: https://lore.kernel.org/r/20241024092146.715063-3-zhenzhong.duan@intel.com
Signed-off-by: Lu Baolu
Signed-off-by: Joerg Roedel
---
 drivers/iommu/intel/iommu.c | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c
index 2337baa3cb80..5095147f6ba2 100644
--- a/drivers/iommu/intel/iommu.c
+++ b/drivers/iommu/intel/iommu.c
@@ -596,14 +596,15 @@ static void pgtable_walk(struct intel_iommu *iommu, unsigned long pfn,
 	while (1) {
 		offset = pfn_level_offset(pfn, level);
 		pte = &parent[offset];
-		if (!pte || (dma_pte_superpage(pte) || !dma_pte_present(pte))) {
-			pr_info("PTE not present at level %d\n", level);
-			break;
-		}
 
 		pr_info("pte level: %d, pte value: 0x%016llx\n", level, pte->val);
 
-		if (level == 1)
+		if (!dma_pte_present(pte)) {
+			pr_info("page table not present at level %d\n", level - 1);
+			break;
+		}
+
+		if (level == 1 || dma_pte_superpage(pte))
 			break;
 
 		parent = phys_to_virt(dma_pte_addr(pte));
-- 
2.51.0