From 1f7df3a691740a7736bbc99dc4ed536120eb4746 Mon Sep 17 00:00:00 2001
From: Jason Gunthorpe
Date: Wed, 19 Feb 2025 17:31:36 -0800
Subject: [PATCH 01/16] genirq/msi: Store the IOMMU IOVA directly in msi_desc instead of iommu_cookie

The IOMMU translation for MSI message addresses has been a two-step process, separated in time:

1) iommu_dma_prepare_msi(): A cookie pointer containing the IOVA address is stored in the MSI descriptor when an MSI interrupt is allocated.

2) iommu_dma_compose_msi_msg(): This cookie pointer is used to compute a translated message address.

This has an inherent lifetime problem for the pointer stored in the cookie, which must remain valid between the two steps. However, there is no locking at the irq layer that helps protect the lifetime. Today, this works under the assumption that the iommu domain is not changed while MSI interrupts are being programmed. This is true for normal DMA API users within the kernel, as the iommu domain is attached before the driver is probed and cannot be changed while a driver is attached.

Classic VFIO type1 also prevented changing the iommu domain while VFIO was running, as it does not support changing the "container" after starting up.

However, iommufd has improved this so that the iommu domain can be changed during VFIO operation. This potentially allows userspace to directly race VFIO_DEVICE_ATTACH_IOMMUFD_PT (which calls iommu_attach_group()) and VFIO_DEVICE_SET_IRQS (which calls into iommu_dma_compose_msi_msg()). That race can turn both the cookie pointer and the unlocked call to iommu_get_domain_for_dev() on the MSI translation path into use-after-frees.

Fix the MSI cookie UAF by removing the cookie pointer. The translated IOVA address is already known during iommu_dma_prepare_msi() and cannot change. Thus, it can simply be stored as an integer in the MSI descriptor.

The other UAF related to iommu_get_domain_for_dev() will be addressed in the patch "iommu: Make iommu_dma_prepare_msi() into a generic operation" by using the IOMMU group mutex.
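As an editorial aside (not part of the patch), the new encoding can be sanity-checked with a small standalone sketch of how a 64-bit IOVA round-trips through the 58/6-bit fields, assuming a 4KiB MSI granule; the struct below merely mimics the msi_desc bitfields:

	/* Illustrative sketch only: mirrors the msi_desc bitfield encoding. */
	#include <assert.h>
	#include <stdint.h>
	#include <stdio.h>

	struct fake_desc {
		uint64_t iommu_msi_iova : 58;	/* IOVA >> iommu_msi_shift */
		uint64_t iommu_msi_shift : 6;	/* log2 of the MSI granule */
	};

	int main(void)
	{
		uint64_t iova = 0x8765432000ULL;	/* granule-aligned IOVA */
		unsigned int shift = 12;		/* ilog2(4KiB granule) */
		uint64_t doorbell_offset = 0x40;	/* low bits of the real MSI address */
		struct fake_desc desc = {
			.iommu_msi_iova = iova >> shift,
			.iommu_msi_shift = shift,
		};

		/* Compose: high bits come from the IOVA, low bits from the doorbell. */
		uint64_t msi_iova = (uint64_t)desc.iommu_msi_iova << desc.iommu_msi_shift;
		uint64_t addr = msi_iova |
				(doorbell_offset & ((1ULL << desc.iommu_msi_shift) - 1));

		assert(addr == (iova | doorbell_offset));
		printf("composed MSI address: %#llx\n", (unsigned long long)addr);
		return 0;
	}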
Link: https://patch.msgid.link/r/a4f2cd76b9dc1833ee6c1cf325cba57def22231c.1740014950.git.nicolinc@nvidia.com Signed-off-by: Nicolin Chen Reviewed-by: Thomas Gleixner Signed-off-by: Jason Gunthorpe --- drivers/iommu/dma-iommu.c | 28 +++++++++++++--------------- include/linux/msi.h | 33 ++++++++++++--------------------- 2 files changed, 25 insertions(+), 36 deletions(-) diff --git a/drivers/iommu/dma-iommu.c b/drivers/iommu/dma-iommu.c index 2a9fa0c8cc00..0f0caf59023c 100644 --- a/drivers/iommu/dma-iommu.c +++ b/drivers/iommu/dma-iommu.c @@ -1815,7 +1815,7 @@ int iommu_dma_prepare_msi(struct msi_desc *desc, phys_addr_t msi_addr) static DEFINE_MUTEX(msi_prepare_lock); /* see below */ if (!domain || !domain->iova_cookie) { - desc->iommu_cookie = NULL; + msi_desc_set_iommu_msi_iova(desc, 0, 0); return 0; } @@ -1827,11 +1827,12 @@ int iommu_dma_prepare_msi(struct msi_desc *desc, phys_addr_t msi_addr) mutex_lock(&msi_prepare_lock); msi_page = iommu_dma_get_msi_page(dev, msi_addr, domain); mutex_unlock(&msi_prepare_lock); - - msi_desc_set_iommu_cookie(desc, msi_page); - if (!msi_page) return -ENOMEM; + + msi_desc_set_iommu_msi_iova( + desc, msi_page->iova, + ilog2(cookie_msi_granule(domain->iova_cookie))); return 0; } @@ -1842,18 +1843,15 @@ int iommu_dma_prepare_msi(struct msi_desc *desc, phys_addr_t msi_addr) */ void iommu_dma_compose_msi_msg(struct msi_desc *desc, struct msi_msg *msg) { - struct device *dev = msi_desc_to_dev(desc); - const struct iommu_domain *domain = iommu_get_domain_for_dev(dev); - const struct iommu_dma_msi_page *msi_page; +#ifdef CONFIG_IRQ_MSI_IOMMU + if (desc->iommu_msi_shift) { + u64 msi_iova = desc->iommu_msi_iova << desc->iommu_msi_shift; - msi_page = msi_desc_get_iommu_cookie(desc); - - if (!domain || !domain->iova_cookie || WARN_ON(!msi_page)) - return; - - msg->address_hi = upper_32_bits(msi_page->iova); - msg->address_lo &= cookie_msi_granule(domain->iova_cookie) - 1; - msg->address_lo += lower_32_bits(msi_page->iova); + msg->address_hi = upper_32_bits(msi_iova); + msg->address_lo = lower_32_bits(msi_iova) | + (msg->address_lo & ((1 << desc->iommu_msi_shift) - 1)); + } +#endif } static int iommu_dma_init(void) diff --git a/include/linux/msi.h b/include/linux/msi.h index b10093c4d00e..fc4f3774c3af 100644 --- a/include/linux/msi.h +++ b/include/linux/msi.h @@ -166,6 +166,10 @@ struct msi_desc_data { * @dev: Pointer to the device which uses this descriptor * @msg: The last set MSI message cached for reuse * @affinity: Optional pointer to a cpu affinity mask for this descriptor + * @iommu_msi_iova: Optional shifted IOVA from the IOMMU to override the msi_addr. + * Only used if iommu_msi_shift != 0 + * @iommu_msi_shift: Indicates how many bits of the original address should be + * preserved when using iommu_msi_iova. 
* @sysfs_attr: Pointer to sysfs device attribute * * @write_msi_msg: Callback that may be called when the MSI message * @@ -184,7 +188,8 @@ struct msi_desc { struct msi_msg msg; struct irq_affinity_desc *affinity; #ifdef CONFIG_IRQ_MSI_IOMMU - const void *iommu_cookie; + u64 iommu_msi_iova : 58; + u64 iommu_msi_shift : 6; #endif #ifdef CONFIG_SYSFS struct device_attribute *sysfs_attrs; @@ -285,28 +290,14 @@ struct msi_desc *msi_next_desc(struct device *dev, unsigned int domid, #define msi_desc_to_dev(desc) ((desc)->dev) -#ifdef CONFIG_IRQ_MSI_IOMMU -static inline const void *msi_desc_get_iommu_cookie(struct msi_desc *desc) -{ - return desc->iommu_cookie; -} - -static inline void msi_desc_set_iommu_cookie(struct msi_desc *desc, - const void *iommu_cookie) -{ - desc->iommu_cookie = iommu_cookie; -} -#else -static inline const void *msi_desc_get_iommu_cookie(struct msi_desc *desc) +static inline void msi_desc_set_iommu_msi_iova(struct msi_desc *desc, u64 msi_iova, + unsigned int msi_shift) { - return NULL; -} - -static inline void msi_desc_set_iommu_cookie(struct msi_desc *desc, - const void *iommu_cookie) -{ -} +#ifdef CONFIG_IRQ_MSI_IOMMU + desc->iommu_msi_iova = msi_iova >> msi_shift; + desc->iommu_msi_shift = msi_shift; #endif +} int msi_domain_insert_msi_desc(struct device *dev, unsigned int domid, struct msi_desc *init_desc); -- 2.51.0

From 9349887e93009331e751854843f73a086bef4018 Mon Sep 17 00:00:00 2001
From: Jason Gunthorpe
Date: Wed, 19 Feb 2025 17:31:37 -0800
Subject: [PATCH 02/16] genirq/msi: Refactor iommu_dma_compose_msi_msg()

The two-step process to translate the MSI address involves two functions, iommu_dma_prepare_msi() and iommu_dma_compose_msi_msg(). Previously, iommu_dma_compose_msi_msg() needed to be in the iommu layer as it had to dereference the opaque cookie pointer. Now that the previous patch has changed the cookie pointer into an integer, there is no longer any need for the iommu layer to be involved.

Further, the call sites of iommu_dma_compose_msi_msg() all follow the same pattern of setting the MSI message address_hi/lo to the non-translated address and then immediately calling iommu_dma_compose_msi_msg().

Refactor iommu_dma_compose_msi_msg() into msi_msg_set_addr(), which directly accepts the u64 version of the address and simplifies all the callers. Move the new helper to linux/msi.h since it has nothing to do with iommu.

Aside from refactoring, this logically prepares for the next patch, which allows multiple implementation options for iommu_dma_prepare_msi(). It would then no longer make sense to keep iommu_dma_compose_msi_msg() in dma-iommu.c, as dma-iommu.c will no longer provide the only iommu_dma_prepare_msi() implementation.
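As the diffs below show, every caller collapses to a single call. For illustration only, a hypothetical irqchip compose callback after this patch would look like this (my_doorbell_phys() is a made-up address source, not a real kernel helper):

	/* Hypothetical irqchip: the whole compose step after this patch. */
	static void my_compose_msi_msg(struct irq_data *data, struct msi_msg *msg)
	{
		msg->data = data->hwirq;
		/* Writes address_hi/lo, applying the IOMMU IOVA when one was prepared. */
		msi_msg_set_addr(irq_data_get_msi_desc(data), msg,
				 my_doorbell_phys(data)); /* hypothetical helper */
	}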
Link: https://patch.msgid.link/r/eda62a9bafa825e9cdabd7ddc61ad5a21c32af24.1740014950.git.nicolinc@nvidia.com Signed-off-by: Nicolin Chen Reviewed-by: Thomas Gleixner Signed-off-by: Jason Gunthorpe --- drivers/iommu/dma-iommu.c | 18 ------------------ drivers/irqchip/irq-gic-v2m.c | 5 +---- drivers/irqchip/irq-gic-v3-its.c | 13 +++---------- drivers/irqchip/irq-gic-v3-mbi.c | 12 ++++-------- drivers/irqchip/irq-ls-scfg-msi.c | 5 ++--- include/linux/iommu.h | 6 ------ include/linux/msi.h | 28 ++++++++++++++++++++++++++++ 7 files changed, 38 insertions(+), 49 deletions(-) diff --git a/drivers/iommu/dma-iommu.c b/drivers/iommu/dma-iommu.c index 0f0caf59023c..bf91e014d179 100644 --- a/drivers/iommu/dma-iommu.c +++ b/drivers/iommu/dma-iommu.c @@ -1836,24 +1836,6 @@ int iommu_dma_prepare_msi(struct msi_desc *desc, phys_addr_t msi_addr) return 0; } -/** - * iommu_dma_compose_msi_msg() - Apply translation to an MSI message - * @desc: MSI descriptor prepared by iommu_dma_prepare_msi() - * @msg: MSI message containing target physical address - */ -void iommu_dma_compose_msi_msg(struct msi_desc *desc, struct msi_msg *msg) -{ -#ifdef CONFIG_IRQ_MSI_IOMMU - if (desc->iommu_msi_shift) { - u64 msi_iova = desc->iommu_msi_iova << desc->iommu_msi_shift; - - msg->address_hi = upper_32_bits(msi_iova); - msg->address_lo = lower_32_bits(msi_iova) | - (msg->address_lo & ((1 << desc->iommu_msi_shift) - 1)); - } -#endif -} - static int iommu_dma_init(void) { if (is_kdump_kernel()) diff --git a/drivers/irqchip/irq-gic-v2m.c b/drivers/irqchip/irq-gic-v2m.c index be35c5349986..57e0470e0d13 100644 --- a/drivers/irqchip/irq-gic-v2m.c +++ b/drivers/irqchip/irq-gic-v2m.c @@ -87,9 +87,6 @@ static void gicv2m_compose_msi_msg(struct irq_data *data, struct msi_msg *msg) struct v2m_data *v2m = irq_data_get_irq_chip_data(data); phys_addr_t addr = gicv2m_get_msi_addr(v2m, data->hwirq); - msg->address_hi = upper_32_bits(addr); - msg->address_lo = lower_32_bits(addr); - if (v2m->flags & GICV2M_GRAVITON_ADDRESS_ONLY) msg->data = 0; else @@ -97,7 +94,7 @@ static void gicv2m_compose_msi_msg(struct irq_data *data, struct msi_msg *msg) if (v2m->flags & GICV2M_NEEDS_SPI_OFFSET) msg->data -= v2m->spi_offset; - iommu_dma_compose_msi_msg(irq_data_get_msi_desc(data), msg); + msi_msg_set_addr(irq_data_get_msi_desc(data), msg, addr); } static struct irq_chip gicv2m_irq_chip = { diff --git a/drivers/irqchip/irq-gic-v3-its.c b/drivers/irqchip/irq-gic-v3-its.c index 8c3ec5734f1e..ce0bf70b9eaf 100644 --- a/drivers/irqchip/irq-gic-v3-its.c +++ b/drivers/irqchip/irq-gic-v3-its.c @@ -1809,17 +1809,10 @@ static u64 its_irq_get_msi_base(struct its_device *its_dev) static void its_irq_compose_msi_msg(struct irq_data *d, struct msi_msg *msg) { struct its_device *its_dev = irq_data_get_irq_chip_data(d); - struct its_node *its; - u64 addr; - - its = its_dev->its; - addr = its->get_msi_base(its_dev); - - msg->address_lo = lower_32_bits(addr); - msg->address_hi = upper_32_bits(addr); - msg->data = its_get_event_id(d); - iommu_dma_compose_msi_msg(irq_data_get_msi_desc(d), msg); + msg->data = its_get_event_id(d); + msi_msg_set_addr(irq_data_get_msi_desc(d), msg, + its_dev->its->get_msi_base(its_dev)); } static int its_irq_set_irqchip_state(struct irq_data *d, diff --git a/drivers/irqchip/irq-gic-v3-mbi.c b/drivers/irqchip/irq-gic-v3-mbi.c index 3fe870f8ee17..a6510128611e 100644 --- a/drivers/irqchip/irq-gic-v3-mbi.c +++ b/drivers/irqchip/irq-gic-v3-mbi.c @@ -147,22 +147,18 @@ static const struct irq_domain_ops mbi_domain_ops = { static void 
mbi_compose_msi_msg(struct irq_data *data, struct msi_msg *msg) { - msg[0].address_hi = upper_32_bits(mbi_phys_base + GICD_SETSPI_NSR); - msg[0].address_lo = lower_32_bits(mbi_phys_base + GICD_SETSPI_NSR); msg[0].data = data->parent_data->hwirq; - - iommu_dma_compose_msi_msg(irq_data_get_msi_desc(data), msg); + msi_msg_set_addr(irq_data_get_msi_desc(data), &msg[0], + mbi_phys_base + GICD_SETSPI_NSR); } static void mbi_compose_mbi_msg(struct irq_data *data, struct msi_msg *msg) { mbi_compose_msi_msg(data, msg); - msg[1].address_hi = upper_32_bits(mbi_phys_base + GICD_CLRSPI_NSR); - msg[1].address_lo = lower_32_bits(mbi_phys_base + GICD_CLRSPI_NSR); msg[1].data = data->parent_data->hwirq; - - iommu_dma_compose_msi_msg(irq_data_get_msi_desc(data), &msg[1]); + msi_msg_set_addr(irq_data_get_msi_desc(data), &msg[1], + mbi_phys_base + GICD_CLRSPI_NSR); } static bool mbi_init_dev_msi_info(struct device *dev, struct irq_domain *domain, diff --git a/drivers/irqchip/irq-ls-scfg-msi.c b/drivers/irqchip/irq-ls-scfg-msi.c index c0e1aafe468c..3cb80796cc7c 100644 --- a/drivers/irqchip/irq-ls-scfg-msi.c +++ b/drivers/irqchip/irq-ls-scfg-msi.c @@ -87,8 +87,6 @@ static void ls_scfg_msi_compose_msg(struct irq_data *data, struct msi_msg *msg) { struct ls_scfg_msi *msi_data = irq_data_get_irq_chip_data(data); - msg->address_hi = upper_32_bits(msi_data->msiir_addr); - msg->address_lo = lower_32_bits(msi_data->msiir_addr); msg->data = data->hwirq; if (msi_affinity_flag) { @@ -98,7 +96,8 @@ static void ls_scfg_msi_compose_msg(struct irq_data *data, struct msi_msg *msg) msg->data |= cpumask_first(mask); } - iommu_dma_compose_msi_msg(irq_data_get_msi_desc(data), msg); + msi_msg_set_addr(irq_data_get_msi_desc(data), msg, + msi_data->msiir_addr); } static int ls_scfg_msi_set_affinity(struct irq_data *irq_data, diff --git a/include/linux/iommu.h b/include/linux/iommu.h index 38c65e92ecd0..caee952febd4 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -1508,7 +1508,6 @@ static inline void iommu_debugfs_setup(void) {} int iommu_get_msi_cookie(struct iommu_domain *domain, dma_addr_t base); int iommu_dma_prepare_msi(struct msi_desc *desc, phys_addr_t msi_addr); -void iommu_dma_compose_msi_msg(struct msi_desc *desc, struct msi_msg *msg); #else /* CONFIG_IOMMU_DMA */ @@ -1524,11 +1523,6 @@ static inline int iommu_dma_prepare_msi(struct msi_desc *desc, phys_addr_t msi_a { return 0; } - -static inline void iommu_dma_compose_msi_msg(struct msi_desc *desc, struct msi_msg *msg) -{ -} - #endif /* CONFIG_IOMMU_DMA */ /* diff --git a/include/linux/msi.h b/include/linux/msi.h index fc4f3774c3af..8d97b890faec 100644 --- a/include/linux/msi.h +++ b/include/linux/msi.h @@ -299,6 +299,34 @@ static inline void msi_desc_set_iommu_msi_iova(struct msi_desc *desc, u64 msi_io #endif } +/** + * msi_msg_set_addr() - Set MSI address in an MSI message + * + * @desc: MSI descriptor that may carry an IOVA base address for MSI via @iommu_msi_iova/shift + * @msg: Target MSI message to set its address_hi and address_lo + * @msi_addr: Physical address to set the MSI message + * + * Notes: + * - Override @msi_addr using the IOVA base address in the @desc if @iommu_msi_shift is set + * - Otherwise, simply set @msi_addr to @msg + */ +static inline void msi_msg_set_addr(struct msi_desc *desc, struct msi_msg *msg, + phys_addr_t msi_addr) +{ +#ifdef CONFIG_IRQ_MSI_IOMMU + if (desc->iommu_msi_shift) { + u64 msi_iova = desc->iommu_msi_iova << desc->iommu_msi_shift; + + msg->address_hi = upper_32_bits(msi_iova); + msg->address_lo = 
lower_32_bits(msi_iova) | + (msi_addr & ((1 << desc->iommu_msi_shift) - 1)); + return; + } +#endif + msg->address_hi = upper_32_bits(msi_addr); + msg->address_lo = lower_32_bits(msi_addr); +} + int msi_domain_insert_msi_desc(struct device *dev, unsigned int domid, struct msi_desc *init_desc); /** -- 2.51.0

From 288683c92b1abc32277c83819bea287af614d239 Mon Sep 17 00:00:00 2001
From: Jason Gunthorpe
Date: Wed, 19 Feb 2025 17:31:38 -0800
Subject: [PATCH 03/16] iommu: Make iommu_dma_prepare_msi() into a generic operation

SW_MSI lets the IOMMU translate an MSI message before the message is delivered to the interrupt controller. On such systems, an iommu_domain must have a translation for the MSI message for interrupts to work.

The IRQ subsystem will call into the IOMMU to request that a physical page be set up to receive MSI messages, and the IOMMU then sets an IOVA that maps to that physical page. Ultimately the IOVA is programmed into the device via the msi_msg.

Generalize this by allowing iommu_domain owners to provide implementations of this mapping. Add a function pointer in struct iommu_domain to allow a domain owner to provide its own implementation.

Have dma-iommu supply its implementation for IOMMU_DOMAIN_DMA types during the iommu_get_dma_cookie() path. For IOMMU_DOMAIN_UNMANAGED types used by VFIO (and iommufd for now), have the same iommu_dma_sw_msi set as well in the iommu_get_msi_cookie() path.

Hold the group mutex while in iommu_dma_prepare_msi() to ensure the domain doesn't change or become freed while running. Races with IRQ operations from VFIO and domain changes from iommufd are possible here. Replace the msi_prepare_lock with a lockdep assertion for the group mutex as documentation. For dma-iommu.c, each iommu_domain is unique to a group.
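For illustration only, a third-party domain owner would plug into the new hook roughly as below. This is a sketch, not kernel code from the series: my_sw_msi(), my_domain_alloc() and my_real_alloc() are hypothetical, and the IOVA choice is assumed to be owner policy; only msi_desc_set_iommu_msi_iova() and iommu_domain_set_sw_msi() come from these patches:

	/* Sketch of a hypothetical domain owner wiring up the new sw_msi hook. */
	static int my_sw_msi(struct iommu_domain *domain, struct msi_desc *desc,
			     phys_addr_t msi_addr)
	{
		/* Assumed: the owner maps msi_addr at an IOVA of its choosing. */
		dma_addr_t iova = 0x8000000;

		/* Record the IOVA so msi_msg_set_addr() can later compose it. */
		msi_desc_set_iommu_msi_iova(desc, iova, PAGE_SHIFT);
		return 0;
	}

	static struct iommu_domain *my_domain_alloc(struct device *dev)
	{
		struct iommu_domain *domain = my_real_alloc(dev); /* hypothetical */

		if (domain)
			iommu_domain_set_sw_msi(domain, my_sw_msi);
		return domain;
	}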
Link: https://patch.msgid.link/r/4ca696150d2baee03af27c4ddefdb7b0b0280e7b.1740014950.git.nicolinc@nvidia.com Signed-off-by: Nicolin Chen Signed-off-by: Jason Gunthorpe --- drivers/iommu/dma-iommu.c | 33 +++++++++++++---------------- drivers/iommu/iommu.c | 29 ++++++++++++++++++++++++++ include/linux/iommu.h | 44 ++++++++++++++++++++++++++------------- 3 files changed, 73 insertions(+), 33 deletions(-) diff --git a/drivers/iommu/dma-iommu.c b/drivers/iommu/dma-iommu.c index bf91e014d179..3b58244e6344 100644 --- a/drivers/iommu/dma-iommu.c +++ b/drivers/iommu/dma-iommu.c @@ -24,6 +24,7 @@ #include #include #include +#include #include #include #include @@ -102,6 +103,9 @@ static int __init iommu_dma_forcedac_setup(char *str) } early_param("iommu.forcedac", iommu_dma_forcedac_setup); +static int iommu_dma_sw_msi(struct iommu_domain *domain, struct msi_desc *desc, + phys_addr_t msi_addr); + /* Number of entries per flush queue */ #define IOVA_DEFAULT_FQ_SIZE 256 #define IOVA_SINGLE_FQ_SIZE 32768 @@ -398,6 +402,7 @@ int iommu_get_dma_cookie(struct iommu_domain *domain) return -ENOMEM; mutex_init(&domain->iova_cookie->mutex); + iommu_domain_set_sw_msi(domain, iommu_dma_sw_msi); return 0; } @@ -429,6 +434,7 @@ int iommu_get_msi_cookie(struct iommu_domain *domain, dma_addr_t base) cookie->msi_iova = base; domain->iova_cookie = cookie; + iommu_domain_set_sw_msi(domain, iommu_dma_sw_msi); return 0; } EXPORT_SYMBOL(iommu_get_msi_cookie); @@ -443,6 +449,9 @@ void iommu_put_dma_cookie(struct iommu_domain *domain) struct iommu_dma_cookie *cookie = domain->iova_cookie; struct iommu_dma_msi_page *msi, *tmp; + if (domain->sw_msi != iommu_dma_sw_msi) + return; + if (!cookie) return; @@ -1800,33 +1809,19 @@ out_free_page: return NULL; } -/** - * iommu_dma_prepare_msi() - Map the MSI page in the IOMMU domain - * @desc: MSI descriptor, will store the MSI page - * @msi_addr: MSI target address to be mapped - * - * Return: 0 on success or negative error code if the mapping failed. - */ -int iommu_dma_prepare_msi(struct msi_desc *desc, phys_addr_t msi_addr) +static int iommu_dma_sw_msi(struct iommu_domain *domain, struct msi_desc *desc, + phys_addr_t msi_addr) { struct device *dev = msi_desc_to_dev(desc); - struct iommu_domain *domain = iommu_get_domain_for_dev(dev); - struct iommu_dma_msi_page *msi_page; - static DEFINE_MUTEX(msi_prepare_lock); /* see below */ + const struct iommu_dma_msi_page *msi_page; - if (!domain || !domain->iova_cookie) { + if (!domain->iova_cookie) { msi_desc_set_iommu_msi_iova(desc, 0, 0); return 0; } - /* - * In fact the whole prepare operation should already be serialised by - * irq_domain_mutex further up the callchain, but that's pretty subtle - * on its own, so consider this locking as failsafe documentation... 
- */ - mutex_lock(&msi_prepare_lock); + iommu_group_mutex_assert(dev); msi_page = iommu_dma_get_msi_page(dev, msi_addr, domain); - mutex_unlock(&msi_prepare_lock); if (!msi_page) return -ENOMEM; diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index 870c3cdbd0f6..022bf96a18c5 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -3596,3 +3596,32 @@ err_unlock: return ret; } EXPORT_SYMBOL_NS_GPL(iommu_replace_group_handle, "IOMMUFD_INTERNAL"); + +#if IS_ENABLED(CONFIG_IRQ_MSI_IOMMU) +/** + * iommu_dma_prepare_msi() - Map the MSI page in the IOMMU domain + * @desc: MSI descriptor, will store the MSI page + * @msi_addr: MSI target address to be mapped + * + * The implementation of sw_msi() should take msi_addr and map it to + * an IOVA in the domain and call msi_desc_set_iommu_msi_iova() with the + * mapping information. + * + * Return: 0 on success or negative error code if the mapping failed. + */ +int iommu_dma_prepare_msi(struct msi_desc *desc, phys_addr_t msi_addr) +{ + struct device *dev = msi_desc_to_dev(desc); + struct iommu_group *group = dev->iommu_group; + int ret = 0; + + if (!group) + return 0; + + mutex_lock(&group->mutex); + if (group->domain && group->domain->sw_msi) + ret = group->domain->sw_msi(group->domain, desc, msi_addr); + mutex_unlock(&group->mutex); + return ret; +} +#endif /* CONFIG_IRQ_MSI_IOMMU */ diff --git a/include/linux/iommu.h b/include/linux/iommu.h index caee952febd4..761c5e186de9 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -44,6 +44,8 @@ struct iommu_dma_cookie; struct iommu_fault_param; struct iommufd_ctx; struct iommufd_viommu; +struct msi_desc; +struct msi_msg; #define IOMMU_FAULT_PERM_READ (1 << 0) /* read */ #define IOMMU_FAULT_PERM_WRITE (1 << 1) /* write */ @@ -216,6 +218,12 @@ struct iommu_domain { struct iommu_domain_geometry geometry; struct iommu_dma_cookie *iova_cookie; int (*iopf_handler)(struct iopf_group *group); + +#if IS_ENABLED(CONFIG_IRQ_MSI_IOMMU) + int (*sw_msi)(struct iommu_domain *domain, struct msi_desc *desc, + phys_addr_t msi_addr); +#endif + void *fault_data; union { struct { @@ -234,6 +242,16 @@ struct iommu_domain { }; }; +static inline void iommu_domain_set_sw_msi( + struct iommu_domain *domain, + int (*sw_msi)(struct iommu_domain *domain, struct msi_desc *desc, + phys_addr_t msi_addr)) +{ +#if IS_ENABLED(CONFIG_IRQ_MSI_IOMMU) + domain->sw_msi = sw_msi; +#endif +} + static inline bool iommu_is_dma_domain(struct iommu_domain *domain) { return domain->type & __IOMMU_DOMAIN_DMA_API; @@ -1470,6 +1488,18 @@ static inline ioasid_t iommu_alloc_global_pasid(struct device *dev) static inline void iommu_free_global_pasid(ioasid_t pasid) {} #endif /* CONFIG_IOMMU_API */ +#ifdef CONFIG_IRQ_MSI_IOMMU +#ifdef CONFIG_IOMMU_API +int iommu_dma_prepare_msi(struct msi_desc *desc, phys_addr_t msi_addr); +#else +static inline int iommu_dma_prepare_msi(struct msi_desc *desc, + phys_addr_t msi_addr) +{ + return 0; +} +#endif /* CONFIG_IOMMU_API */ +#endif /* CONFIG_IRQ_MSI_IOMMU */ + #if IS_ENABLED(CONFIG_LOCKDEP) && IS_ENABLED(CONFIG_IOMMU_API) void iommu_group_mutex_assert(struct device *dev); #else @@ -1503,26 +1533,12 @@ static inline void iommu_debugfs_setup(void) {} #endif #ifdef CONFIG_IOMMU_DMA -#include - int iommu_get_msi_cookie(struct iommu_domain *domain, dma_addr_t base); - -int iommu_dma_prepare_msi(struct msi_desc *desc, phys_addr_t msi_addr); - #else /* CONFIG_IOMMU_DMA */ - -struct msi_desc; -struct msi_msg; - static inline int iommu_get_msi_cookie(struct iommu_domain *domain, dma_addr_t 
base) { return -ENODEV; } - -static inline int iommu_dma_prepare_msi(struct msi_desc *desc, phys_addr_t msi_addr) -{ - return 0; -} #endif /* CONFIG_IOMMU_DMA */ /* -- 2.51.0 From 96093fe54f4864b07013cf61b847708233832841 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Wed, 19 Feb 2025 17:31:39 -0800 Subject: [PATCH 04/16] irqchip: Have CONFIG_IRQ_MSI_IOMMU be selected by irqchips that need it Currently, IRQ_MSI_IOMMU is selected if DMA_IOMMU is available to provide an implementation for iommu_dma_prepare/compose_msi_msg(). However, it'll make more sense for irqchips that call prepare/compose to select it, and that will trigger all the additional code and data to be compiled into the kernel. If IRQ_MSI_IOMMU is selected with no IOMMU side implementation, then the prepare/compose() will be NOP stubs. If IRQ_MSI_IOMMU is not selected by an irqchip, then the related code on the iommu side is compiled out. Link: https://patch.msgid.link/r/a2620f67002c5cdf974e89ca3bf905f5c0817be6.1740014950.git.nicolinc@nvidia.com Signed-off-by: Nicolin Chen Reviewed-by: Thomas Gleixner Signed-off-by: Jason Gunthorpe --- drivers/iommu/Kconfig | 1 - drivers/iommu/dma-iommu.c | 2 ++ drivers/irqchip/Kconfig | 4 ++++ kernel/irq/Kconfig | 1 + 4 files changed, 7 insertions(+), 1 deletion(-) diff --git a/drivers/iommu/Kconfig b/drivers/iommu/Kconfig index ec1b5e32b972..5124e7431fe3 100644 --- a/drivers/iommu/Kconfig +++ b/drivers/iommu/Kconfig @@ -154,7 +154,6 @@ config IOMMU_DMA select DMA_OPS_HELPERS select IOMMU_API select IOMMU_IOVA - select IRQ_MSI_IOMMU select NEED_SG_DMA_LENGTH select NEED_SG_DMA_FLAGS if SWIOTLB diff --git a/drivers/iommu/dma-iommu.c b/drivers/iommu/dma-iommu.c index 3b58244e6344..94263ed2c564 100644 --- a/drivers/iommu/dma-iommu.c +++ b/drivers/iommu/dma-iommu.c @@ -449,8 +449,10 @@ void iommu_put_dma_cookie(struct iommu_domain *domain) struct iommu_dma_cookie *cookie = domain->iova_cookie; struct iommu_dma_msi_page *msi, *tmp; +#if IS_ENABLED(CONFIG_IRQ_MSI_IOMMU) if (domain->sw_msi != iommu_dma_sw_msi) return; +#endif if (!cookie) return; diff --git a/drivers/irqchip/Kconfig b/drivers/irqchip/Kconfig index c11b9965c4ad..64658a1c3aa1 100644 --- a/drivers/irqchip/Kconfig +++ b/drivers/irqchip/Kconfig @@ -28,6 +28,7 @@ config ARM_GIC_V2M select ARM_GIC select IRQ_MSI_LIB select PCI_MSI + select IRQ_MSI_IOMMU config GIC_NON_BANKED bool @@ -38,12 +39,14 @@ config ARM_GIC_V3 select PARTITION_PERCPU select GENERIC_IRQ_EFFECTIVE_AFF_MASK if SMP select HAVE_ARM_SMCCC_DISCOVERY + select IRQ_MSI_IOMMU config ARM_GIC_V3_ITS bool select GENERIC_MSI_IRQ select IRQ_MSI_LIB default ARM_GIC_V3 + select IRQ_MSI_IOMMU config ARM_GIC_V3_ITS_FSL_MC bool @@ -408,6 +411,7 @@ config LS_EXTIRQ config LS_SCFG_MSI def_bool y if SOC_LS1021A || ARCH_LAYERSCAPE + select IRQ_MSI_IOMMU depends on PCI_MSI config PARTITION_PERCPU diff --git a/kernel/irq/Kconfig b/kernel/irq/Kconfig index 5432418c0fea..9636aed20401 100644 --- a/kernel/irq/Kconfig +++ b/kernel/irq/Kconfig @@ -100,6 +100,7 @@ config GENERIC_MSI_IRQ bool select IRQ_DOMAIN_HIERARCHY +# irqchip drivers should select this if they call iommu_dma_prepare_msi() config IRQ_MSI_IOMMU bool -- 2.51.0 From 748706d7ca06012621b32851b68136bf33613b9e Mon Sep 17 00:00:00 2001 From: Nicolin Chen Date: Wed, 19 Feb 2025 17:31:40 -0800 Subject: [PATCH 05/16] iommu: Turn fault_data to iommufd private pointer A "fault_data" was added exclusively for the iommufd_fault_iopf_handler() used by IOPF/PRI use cases, along with the attach_handle. 
Now, the iommufd version of the sw_msi function will reuse the attach_handle and fault_data for a non-fault case.

Rename "fault_data" to "iommufd_hwpt" so as not to confine it to a "fault" case. Move it into a union to be the iommufd private pointer. A following patch will move the iova_cookie to the union for dma-iommu too, after the iommufd_sw_msi implementation is added.

Since we have two unions now, add some simple comments for readability.

Link: https://patch.msgid.link/r/ee5039503f28a16590916e9eef28b917e2d1607a.1740014950.git.nicolinc@nvidia.com
Signed-off-by: Nicolin Chen
Reviewed-by: Jason Gunthorpe
Signed-off-by: Jason Gunthorpe
--- drivers/iommu/iommufd/fault.c | 2 +- drivers/iommu/iommufd/hw_pagetable.c | 2 +- include/linux/iommu.h | 6 ++++-- 3 files changed, 6 insertions(+), 4 deletions(-) diff --git a/drivers/iommu/iommufd/fault.c b/drivers/iommu/iommufd/fault.c index 931a3fbe6e32..c48d72c9668c 100644 --- a/drivers/iommu/iommufd/fault.c +++ b/drivers/iommu/iommufd/fault.c @@ -329,7 +329,7 @@ int iommufd_fault_iopf_handler(struct iopf_group *group) struct iommufd_hw_pagetable *hwpt; struct iommufd_fault *fault; - hwpt = group->attach_handle->domain->fault_data; + hwpt = group->attach_handle->domain->iommufd_hwpt; fault = hwpt->fault; spin_lock(&fault->lock); diff --git a/drivers/iommu/iommufd/hw_pagetable.c b/drivers/iommu/iommufd/hw_pagetable.c index 598be26a14e2..2641d50f46cf 100644 --- a/drivers/iommu/iommufd/hw_pagetable.c +++ b/drivers/iommu/iommufd/hw_pagetable.c @@ -406,10 +406,10 @@ int iommufd_hwpt_alloc(struct iommufd_ucmd *ucmd) } hwpt->fault = fault; hwpt->domain->iopf_handler = iommufd_fault_iopf_handler; - hwpt->domain->fault_data = hwpt; refcount_inc(&fault->obj.users); iommufd_put_object(ucmd->ictx, &fault->obj); } + hwpt->domain->iommufd_hwpt = hwpt; cmd->out_hwpt_id = hwpt->obj.id; rc = iommufd_ucmd_respond(ucmd, sizeof(*cmd)); diff --git a/include/linux/iommu.h b/include/linux/iommu.h index 761c5e186de9..e93d2e918599 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -224,8 +224,10 @@ struct iommu_domain { phys_addr_t msi_addr); #endif - void *fault_data; - union { + union { /* Pointer usable by owner of the domain */ + struct iommufd_hw_pagetable *iommufd_hwpt; /* iommufd */ + }; + union { /* Fault handler */ struct { iommu_fault_handler_t handler; void *handler_token; -- 2.51.0

From 40f5175d0eb77f902ba8e2a5df2a8f3a218c8843 Mon Sep 17 00:00:00 2001
From: Jason Gunthorpe
Date: Wed, 19 Feb 2025 17:31:41 -0800
Subject: [PATCH 06/16] iommufd: Implement sw_msi support natively

iommufd has a model where the iommu_domain can be changed while the VFIO device is attached. In this case, the MSI should continue to work. This corner case has not worked because the dma-iommu implementation of sw_msi is tied to a single domain.

Implement the sw_msi mapping directly and use a global per-fd table to associate the assigned IOVA with the MSI pages. This allows the MSI pages to be loaded into a domain before it is attached, ensuring that MSI is not disrupted.
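To make the allocation rule concrete before the code below: every distinct doorbell physical address gets a stable page slot inside the reserved SW_MSI window, so all domains on the same iommufd file descriptor see a given doorbell at the same IOVA. A simplified standalone model, for illustration only (the real table also keys on sw_msi_start and lives under sw_msi_lock):

	/* Simplified model of the per-fd MSI doorbell table (assumes 4KiB
	 * pages; no locking in this sketch). */
	#include <assert.h>
	#include <stdint.h>
	#include <stdio.h>

	#define PAGE_SIZE 4096ULL
	#define MAX_SLOTS 64	/* mirrors the 64-entry id bitmap in the patch */

	struct msi_slot {
		uint64_t msi_addr;	/* physical doorbell page */
		unsigned int pgoff;	/* page slot inside the SW_MSI window */
	};

	static struct msi_slot slots[MAX_SLOTS];
	static unsigned int nr_slots;

	/* The same doorbell always lands at the same IOVA for a given window. */
	static uint64_t msi_addr_to_iova(uint64_t sw_msi_start, uint64_t msi_addr)
	{
		for (unsigned int i = 0; i < nr_slots; i++)
			if (slots[i].msi_addr == msi_addr)
				return sw_msi_start + slots[i].pgoff * PAGE_SIZE;

		assert(nr_slots < MAX_SLOTS);
		slots[nr_slots] = (struct msi_slot){ .msi_addr = msi_addr,
						     .pgoff = nr_slots };
		return sw_msi_start + slots[nr_slots++].pgoff * PAGE_SIZE;
	}

	int main(void)
	{
		uint64_t base = 0x8000000;	/* a typical IOMMU_RESV_SW_MSI base */

		printf("%#llx\n", (unsigned long long)msi_addr_to_iova(base, 0xfee00000));
		printf("%#llx\n", (unsigned long long)msi_addr_to_iova(base, 0xfee01000));
		printf("%#llx\n", (unsigned long long)msi_addr_to_iova(base, 0xfee00000));
		return 0;
	}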
Link: https://patch.msgid.link/r/e13d23eeacd67c0a692fc468c85b483f4dd51c57.1740014950.git.nicolinc@nvidia.com Signed-off-by: Nicolin Chen Signed-off-by: Jason Gunthorpe --- drivers/iommu/iommufd/device.c | 161 ++++++++++++++++++++---- drivers/iommu/iommufd/hw_pagetable.c | 3 + drivers/iommu/iommufd/iommufd_private.h | 23 +++- drivers/iommu/iommufd/main.c | 9 ++ 4 files changed, 173 insertions(+), 23 deletions(-) diff --git a/drivers/iommu/iommufd/device.c b/drivers/iommu/iommufd/device.c index 0786290b4056..4e107f69f951 100644 --- a/drivers/iommu/iommufd/device.c +++ b/drivers/iommu/iommufd/device.c @@ -5,6 +5,7 @@ #include #include #include +#include #include "../iommu-priv.h" #include "io_pagetable.h" @@ -293,36 +294,152 @@ u32 iommufd_device_to_id(struct iommufd_device *idev) } EXPORT_SYMBOL_NS_GPL(iommufd_device_to_id, "IOMMUFD"); +/* + * Get a iommufd_sw_msi_map for the msi physical address requested by the irq + * layer. The mapping to IOVA is global to the iommufd file descriptor, every + * domain that is attached to a device using the same MSI parameters will use + * the same IOVA. + */ +static __maybe_unused struct iommufd_sw_msi_map * +iommufd_sw_msi_get_map(struct iommufd_ctx *ictx, phys_addr_t msi_addr, + phys_addr_t sw_msi_start) +{ + struct iommufd_sw_msi_map *cur; + unsigned int max_pgoff = 0; + + lockdep_assert_held(&ictx->sw_msi_lock); + + list_for_each_entry(cur, &ictx->sw_msi_list, sw_msi_item) { + if (cur->sw_msi_start != sw_msi_start) + continue; + max_pgoff = max(max_pgoff, cur->pgoff + 1); + if (cur->msi_addr == msi_addr) + return cur; + } + + if (ictx->sw_msi_id >= + BITS_PER_BYTE * sizeof_field(struct iommufd_sw_msi_maps, bitmap)) + return ERR_PTR(-EOVERFLOW); + + cur = kzalloc(sizeof(*cur), GFP_KERNEL); + if (!cur) + return ERR_PTR(-ENOMEM); + + cur->sw_msi_start = sw_msi_start; + cur->msi_addr = msi_addr; + cur->pgoff = max_pgoff; + cur->id = ictx->sw_msi_id++; + list_add_tail(&cur->sw_msi_item, &ictx->sw_msi_list); + return cur; +} + +static int iommufd_sw_msi_install(struct iommufd_ctx *ictx, + struct iommufd_hwpt_paging *hwpt_paging, + struct iommufd_sw_msi_map *msi_map) +{ + unsigned long iova; + + lockdep_assert_held(&ictx->sw_msi_lock); + + iova = msi_map->sw_msi_start + msi_map->pgoff * PAGE_SIZE; + if (!test_bit(msi_map->id, hwpt_paging->present_sw_msi.bitmap)) { + int rc; + + rc = iommu_map(hwpt_paging->common.domain, iova, + msi_map->msi_addr, PAGE_SIZE, + IOMMU_WRITE | IOMMU_READ | IOMMU_MMIO, + GFP_KERNEL_ACCOUNT); + if (rc) + return rc; + __set_bit(msi_map->id, hwpt_paging->present_sw_msi.bitmap); + } + return 0; +} + +/* + * Called by the irq code if the platform translates the MSI address through the + * IOMMU. msi_addr is the physical address of the MSI page. iommufd will + * allocate a fd global iova for the physical page that is the same on all + * domains and devices. + */ +#ifdef CONFIG_IRQ_MSI_IOMMU +int iommufd_sw_msi(struct iommu_domain *domain, struct msi_desc *desc, + phys_addr_t msi_addr) +{ + struct device *dev = msi_desc_to_dev(desc); + struct iommufd_hwpt_paging *hwpt_paging; + struct iommu_attach_handle *raw_handle; + struct iommufd_attach_handle *handle; + struct iommufd_sw_msi_map *msi_map; + struct iommufd_ctx *ictx; + unsigned long iova; + int rc; + + /* + * It is safe to call iommu_attach_handle_get() here because the iommu + * core code invokes this under the group mutex which also prevents any + * change of the attach handle for the duration of this function. 
+ */ + iommu_group_mutex_assert(dev); + + raw_handle = + iommu_attach_handle_get(dev->iommu_group, IOMMU_NO_PASID, 0); + if (IS_ERR(raw_handle)) + return 0; + hwpt_paging = find_hwpt_paging(domain->iommufd_hwpt); + + handle = to_iommufd_handle(raw_handle); + /* No IOMMU_RESV_SW_MSI means no change to the msi_msg */ + if (handle->idev->igroup->sw_msi_start == PHYS_ADDR_MAX) + return 0; + + ictx = handle->idev->ictx; + guard(mutex)(&ictx->sw_msi_lock); + /* + * The input msi_addr is the exact byte offset of the MSI doorbell, we + * assume the caller has checked that it is contained with a MMIO region + * that is secure to map at PAGE_SIZE. + */ + msi_map = iommufd_sw_msi_get_map(handle->idev->ictx, + msi_addr & PAGE_MASK, + handle->idev->igroup->sw_msi_start); + if (IS_ERR(msi_map)) + return PTR_ERR(msi_map); + + rc = iommufd_sw_msi_install(ictx, hwpt_paging, msi_map); + if (rc) + return rc; + __set_bit(msi_map->id, handle->idev->igroup->required_sw_msi.bitmap); + + iova = msi_map->sw_msi_start + msi_map->pgoff * PAGE_SIZE; + msi_desc_set_iommu_msi_iova(desc, iova, PAGE_SHIFT); + return 0; +} +#endif + static int iommufd_group_setup_msi(struct iommufd_group *igroup, struct iommufd_hwpt_paging *hwpt_paging) { - phys_addr_t sw_msi_start = igroup->sw_msi_start; - int rc; + struct iommufd_ctx *ictx = igroup->ictx; + struct iommufd_sw_msi_map *cur; + + if (igroup->sw_msi_start == PHYS_ADDR_MAX) + return 0; /* - * If the IOMMU driver gives a IOMMU_RESV_SW_MSI then it is asking us to - * call iommu_get_msi_cookie() on its behalf. This is necessary to setup - * the MSI window so iommu_dma_prepare_msi() can install pages into our - * domain after request_irq(). If it is not done interrupts will not - * work on this domain. - * - * FIXME: This is conceptually broken for iommufd since we want to allow - * userspace to change the domains, eg switch from an identity IOAS to a - * DMA IOAS. There is currently no way to create a MSI window that - * matches what the IRQ layer actually expects in a newly created - * domain. + * Install all the MSI pages the device has been using into the domain */ - if (sw_msi_start != PHYS_ADDR_MAX && !hwpt_paging->msi_cookie) { - rc = iommu_get_msi_cookie(hwpt_paging->common.domain, - sw_msi_start); + guard(mutex)(&ictx->sw_msi_lock); + list_for_each_entry(cur, &ictx->sw_msi_list, sw_msi_item) { + int rc; + + if (cur->sw_msi_start != igroup->sw_msi_start || + !test_bit(cur->id, igroup->required_sw_msi.bitmap)) + continue; + + rc = iommufd_sw_msi_install(ictx, hwpt_paging, cur); if (rc) return rc; - - /* - * iommu_get_msi_cookie() can only be called once per domain, - * it returns -EBUSY on later calls. 
- */ - hwpt_paging->msi_cookie = true; } return 0; } diff --git a/drivers/iommu/iommufd/hw_pagetable.c b/drivers/iommu/iommufd/hw_pagetable.c index 2641d50f46cf..7de6e914232e 100644 --- a/drivers/iommu/iommufd/hw_pagetable.c +++ b/drivers/iommu/iommufd/hw_pagetable.c @@ -156,6 +156,7 @@ iommufd_hwpt_paging_alloc(struct iommufd_ctx *ictx, struct iommufd_ioas *ioas, goto out_abort; } } + iommu_domain_set_sw_msi(hwpt->domain, iommufd_sw_msi); /* * Set the coherency mode before we do iopt_table_add_domain() as some @@ -251,6 +252,7 @@ iommufd_hwpt_nested_alloc(struct iommufd_ctx *ictx, goto out_abort; } hwpt->domain->owner = ops; + iommu_domain_set_sw_msi(hwpt->domain, iommufd_sw_msi); if (WARN_ON_ONCE(hwpt->domain->type != IOMMU_DOMAIN_NESTED)) { rc = -EINVAL; @@ -307,6 +309,7 @@ iommufd_viommu_alloc_hwpt_nested(struct iommufd_viommu *viommu, u32 flags, goto out_abort; } hwpt->domain->owner = viommu->iommu_dev->ops; + iommu_domain_set_sw_msi(hwpt->domain, iommufd_sw_msi); if (WARN_ON_ONCE(hwpt->domain->type != IOMMU_DOMAIN_NESTED)) { rc = -EINVAL; diff --git a/drivers/iommu/iommufd/iommufd_private.h b/drivers/iommu/iommufd/iommufd_private.h index 8e0e3ab64747..246297452a44 100644 --- a/drivers/iommu/iommufd/iommufd_private.h +++ b/drivers/iommu/iommufd/iommufd_private.h @@ -19,6 +19,22 @@ struct iommu_group; struct iommu_option; struct iommufd_device; +struct iommufd_sw_msi_map { + struct list_head sw_msi_item; + phys_addr_t sw_msi_start; + phys_addr_t msi_addr; + unsigned int pgoff; + unsigned int id; +}; + +/* Bitmap of struct iommufd_sw_msi_map::id */ +struct iommufd_sw_msi_maps { + DECLARE_BITMAP(bitmap, 64); +}; + +int iommufd_sw_msi(struct iommu_domain *domain, struct msi_desc *desc, + phys_addr_t msi_addr); + struct iommufd_ctx { struct file *file; struct xarray objects; @@ -26,6 +42,10 @@ struct iommufd_ctx { wait_queue_head_t destroy_wait; struct rw_semaphore ioas_creation_lock; + struct mutex sw_msi_lock; + struct list_head sw_msi_list; + unsigned int sw_msi_id; + u8 account_mode; /* Compatibility with VFIO no iommu */ u8 no_iommu_mode; @@ -283,10 +303,10 @@ struct iommufd_hwpt_paging { struct iommufd_ioas *ioas; bool auto_domain : 1; bool enforce_cache_coherency : 1; - bool msi_cookie : 1; bool nest_parent : 1; /* Head at iommufd_ioas::hwpt_list */ struct list_head hwpt_item; + struct iommufd_sw_msi_maps present_sw_msi; }; struct iommufd_hwpt_nested { @@ -383,6 +403,7 @@ struct iommufd_group { struct iommu_group *group; struct iommufd_hw_pagetable *hwpt; struct list_head device_list; + struct iommufd_sw_msi_maps required_sw_msi; phys_addr_t sw_msi_start; }; diff --git a/drivers/iommu/iommufd/main.c b/drivers/iommu/iommufd/main.c index ccf616462a1c..b6fa9fd11bc1 100644 --- a/drivers/iommu/iommufd/main.c +++ b/drivers/iommu/iommufd/main.c @@ -227,6 +227,8 @@ static int iommufd_fops_open(struct inode *inode, struct file *filp) xa_init(&ictx->groups); ictx->file = filp; init_waitqueue_head(&ictx->destroy_wait); + mutex_init(&ictx->sw_msi_lock); + INIT_LIST_HEAD(&ictx->sw_msi_list); filp->private_data = ictx; return 0; } @@ -234,6 +236,8 @@ static int iommufd_fops_open(struct inode *inode, struct file *filp) static int iommufd_fops_release(struct inode *inode, struct file *filp) { struct iommufd_ctx *ictx = filp->private_data; + struct iommufd_sw_msi_map *next; + struct iommufd_sw_msi_map *cur; struct iommufd_object *obj; /* @@ -262,6 +266,11 @@ static int iommufd_fops_release(struct inode *inode, struct file *filp) break; } WARN_ON(!xa_empty(&ictx->groups)); + + 
mutex_destroy(&ictx->sw_msi_lock); + list_for_each_entry_safe(cur, next, &ictx->sw_msi_list, sw_msi_item) + kfree(cur); + kfree(ictx); return 0; } -- 2.51.0

From 237603a46abf9637f9b71c6225293fac2b4d6ef7 Mon Sep 17 00:00:00 2001
From: Yi Liu
Date: Tue, 25 Feb 2025 17:18:46 -0800
Subject: [PATCH 07/16] iommu: Make @handle mandatory in iommu_{attach|replace}_group_handle()

Callers of the two APIs always provide a valid handle, so make @handle a mandatory parameter. Take this chance to move the handle->domain assignment under the protection of group->mutex in iommu_attach_group_handle().

Link: https://patch.msgid.link/r/20250226011849.5102-2-yi.l.liu@intel.com
Reviewed-by: Jason Gunthorpe
Reviewed-by: Nicolin Chen
Reviewed-by: Kevin Tian
Reviewed-by: Lu Baolu
Signed-off-by: Yi Liu
Signed-off-by: Jason Gunthorpe
--- drivers/iommu/iommu.c | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index 022bf96a18c5..81189895df37 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -3511,10 +3511,11 @@ int iommu_attach_group_handle(struct iommu_domain *domain, { int ret; - if (handle) - handle->domain = domain; + if (!handle) + return -EINVAL; mutex_lock(&group->mutex); + handle->domain = domain; ret = xa_insert(&group->pasid_array, IOMMU_NO_PASID, handle, GFP_KERNEL); if (ret) goto err_unlock; @@ -3568,16 +3569,14 @@ int iommu_replace_group_handle(struct iommu_group *group, void *curr; int ret; - if (!new_domain) + if (!new_domain || !handle) return -EINVAL; mutex_lock(&group->mutex); - if (handle) { - ret = xa_reserve(&group->pasid_array, IOMMU_NO_PASID, GFP_KERNEL); - if (ret) - goto err_unlock; - handle->domain = new_domain; - } + handle->domain = new_domain; + ret = xa_reserve(&group->pasid_array, IOMMU_NO_PASID, GFP_KERNEL); + if (ret) + goto err_unlock; ret = __iommu_group_set_domain(group, new_domain); if (ret) -- 2.51.0

From 473ec072a63370e37dddbadb2a7cc2419a0fdb28 Mon Sep 17 00:00:00 2001
From: Yi Liu
Date: Tue, 25 Feb 2025 17:18:47 -0800
Subject: [PATCH 08/16] iommu: Drop iommu_group_replace_domain()

iommufd does not use it now, so drop it.

Link: https://patch.msgid.link/r/20250226011849.5102-3-yi.l.liu@intel.com
Reviewed-by: Jason Gunthorpe
Reviewed-by: Nicolin Chen
Reviewed-by: Kevin Tian
Signed-off-by: Yi Liu
Reviewed-by: Lu Baolu
Signed-off-by: Jason Gunthorpe
--- drivers/iommu/iommu-priv.h | 3 --- drivers/iommu/iommu.c | 35 ++++++----------------------------- 2 files changed, 6 insertions(+), 32 deletions(-) diff --git a/drivers/iommu/iommu-priv.h b/drivers/iommu/iommu-priv.h index de5b54eaa8bf..b4508423e13b 100644 --- a/drivers/iommu/iommu-priv.h +++ b/drivers/iommu/iommu-priv.h @@ -24,9 +24,6 @@ static inline const struct iommu_ops *iommu_fwspec_ops(struct iommu_fwspec *fwsp return iommu_ops_from_fwnode(fwspec ?
fwspec->iommu_fwnode : NULL); } -int iommu_group_replace_domain(struct iommu_group *group, - struct iommu_domain *new_domain); - int iommu_device_register_bus(struct iommu_device *iommu, const struct iommu_ops *ops, const struct bus_type *bus, diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index 81189895df37..37c8cc6a9f79 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -2187,32 +2187,6 @@ int iommu_attach_group(struct iommu_domain *domain, struct iommu_group *group) } EXPORT_SYMBOL_GPL(iommu_attach_group); -/** - * iommu_group_replace_domain - replace the domain that a group is attached to - * @group: IOMMU group that will be attached to the new domain - * @new_domain: new IOMMU domain to replace with - * - * This API allows the group to switch domains without being forced to go to - * the blocking domain in-between. - * - * If the currently attached domain is a core domain (e.g. a default_domain), - * it will act just like the iommu_attach_group(). - */ -int iommu_group_replace_domain(struct iommu_group *group, - struct iommu_domain *new_domain) -{ - int ret; - - if (!new_domain) - return -EINVAL; - - mutex_lock(&group->mutex); - ret = __iommu_group_set_domain(group, new_domain); - mutex_unlock(&group->mutex); - return ret; -} -EXPORT_SYMBOL_NS_GPL(iommu_group_replace_domain, "IOMMUFD_INTERNAL"); - static int __iommu_device_set_domain(struct iommu_group *group, struct device *dev, struct iommu_domain *new_domain, @@ -3558,9 +3532,12 @@ EXPORT_SYMBOL_NS_GPL(iommu_detach_group_handle, "IOMMUFD_INTERNAL"); * @new_domain: new IOMMU domain to replace with * @handle: attach handle * - * This is a variant of iommu_group_replace_domain(). It allows the caller to - * provide an attach handle for the new domain and use it when the domain is - * attached. + * This API allows the group to switch domains without being forced to go to + * the blocking domain in-between. It allows the caller to provide an attach + * handle for the new domain and use it when the domain is attached. + * + * If the currently attached domain is a core domain (e.g. a default_domain), + * it will act just like the iommu_attach_group_handle(). */ int iommu_replace_group_handle(struct iommu_group *group, struct iommu_domain *new_domain, -- 2.51.0

From e1ea9d30d84c65e96eba27b240be5b6798350490 Mon Sep 17 00:00:00 2001
From: Yi Liu
Date: Tue, 25 Feb 2025 17:18:48 -0800
Subject: [PATCH 09/16] iommu: Store either domain or handle in group->pasid_array

iommu_attach_device_pasid() only stores the handle in group->pasid_array when a valid handle is supplied. However, this makes iommu_attach_device_pasid() unable to detect whether the pasid has previously been attached. To be complete, let iommu_attach_device_pasid() store the domain in group->pasid_array when no valid handle is provided. The other users of group->pasid_array are updated to be consistent, e.g. iommu_attach_group_handle() and iommu_replace_group_handle().
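The mechanism the hunks below rely on is xarray pointer tagging, which stows a small tag in the low bits of an aligned pointer. A standalone sketch of the idea, for illustration only (this is not the kernel's xa_tag_pointer() implementation):

	/* Standalone model of xa_tag_pointer()/xa_untag_pointer(): the low
	 * two bits of a 4-byte-aligned pointer are free to carry a tag. */
	#include <assert.h>
	#include <stdint.h>

	enum { ENTRY_DOMAIN = 0, ENTRY_HANDLE = 1 };

	static void *tag_pointer(void *p, unsigned int tag)
	{
		return (void *)((uintptr_t)p | tag);
	}

	static unsigned int pointer_tag(void *entry)
	{
		return (uintptr_t)entry & 3;
	}

	static void *untag_pointer(void *entry)
	{
		return (void *)((uintptr_t)entry & ~(uintptr_t)3);
	}

	int main(void)
	{
		int handle_obj;	/* stands in for a struct iommu_attach_handle */
		void *entry = tag_pointer(&handle_obj, ENTRY_HANDLE);

		assert(pointer_tag(entry) == ENTRY_HANDLE);
		assert(untag_pointer(entry) == &handle_obj);
		return 0;
	}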
Link: https://patch.msgid.link/r/20250226011849.5102-4-yi.l.liu@intel.com Suggested-by: Jason Gunthorpe Reviewed-by: Jason Gunthorpe Reviewed-by: Nicolin Chen Reviewed-by: Kevin Tian Signed-off-by: Yi Liu Reviewed-by: Lu Baolu Signed-off-by: Jason Gunthorpe --- drivers/iommu/iommu.c | 43 +++++++++++++++++++++++++++++++------------ 1 file changed, 31 insertions(+), 12 deletions(-) diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index 37c8cc6a9f79..d3652518c734 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -45,6 +45,9 @@ static unsigned int iommu_def_domain_type __read_mostly; static bool iommu_dma_strict __read_mostly = IS_ENABLED(CONFIG_IOMMU_DEFAULT_DMA_STRICT); static u32 iommu_cmd_line __read_mostly; +/* Tags used with xa_tag_pointer() in group->pasid_array */ +enum { IOMMU_PASID_ARRAY_DOMAIN = 0, IOMMU_PASID_ARRAY_HANDLE = 1 }; + struct iommu_group { struct kobject kobj; struct kobject *devices_kobj; @@ -2147,6 +2150,17 @@ struct iommu_domain *iommu_get_dma_domain(struct device *dev) return dev->iommu_group->default_domain; } +static void *iommu_make_pasid_array_entry(struct iommu_domain *domain, + struct iommu_attach_handle *handle) +{ + if (handle) { + handle->domain = domain; + return xa_tag_pointer(handle, IOMMU_PASID_ARRAY_HANDLE); + } + + return xa_tag_pointer(domain, IOMMU_PASID_ARRAY_DOMAIN); +} + static int __iommu_attach_group(struct iommu_domain *domain, struct iommu_group *group) { @@ -3348,6 +3362,7 @@ int iommu_attach_device_pasid(struct iommu_domain *domain, struct iommu_group *group = dev->iommu_group; struct group_device *device; const struct iommu_ops *ops; + void *entry; int ret; if (!group) @@ -3371,10 +3386,9 @@ int iommu_attach_device_pasid(struct iommu_domain *domain, } } - if (handle) - handle->domain = domain; + entry = iommu_make_pasid_array_entry(domain, handle); - ret = xa_insert(&group->pasid_array, pasid, handle, GFP_KERNEL); + ret = xa_insert(&group->pasid_array, pasid, entry, GFP_KERNEL); if (ret) goto out_unlock; @@ -3454,13 +3468,17 @@ struct iommu_attach_handle * iommu_attach_handle_get(struct iommu_group *group, ioasid_t pasid, unsigned int type) { struct iommu_attach_handle *handle; + void *entry; xa_lock(&group->pasid_array); - handle = xa_load(&group->pasid_array, pasid); - if (!handle) + entry = xa_load(&group->pasid_array, pasid); + if (!entry || xa_pointer_tag(entry) != IOMMU_PASID_ARRAY_HANDLE) { handle = ERR_PTR(-ENOENT); - else if (type && handle->domain->type != type) - handle = ERR_PTR(-EBUSY); + } else { + handle = xa_untag_pointer(entry); + if (type && handle->domain->type != type) + handle = ERR_PTR(-EBUSY); + } xa_unlock(&group->pasid_array); return handle; @@ -3483,14 +3501,15 @@ int iommu_attach_group_handle(struct iommu_domain *domain, struct iommu_group *group, struct iommu_attach_handle *handle) { + void *entry; int ret; if (!handle) return -EINVAL; mutex_lock(&group->mutex); - handle->domain = domain; - ret = xa_insert(&group->pasid_array, IOMMU_NO_PASID, handle, GFP_KERNEL); + entry = iommu_make_pasid_array_entry(domain, handle); + ret = xa_insert(&group->pasid_array, IOMMU_NO_PASID, entry, GFP_KERNEL); if (ret) goto err_unlock; @@ -3543,14 +3562,14 @@ int iommu_replace_group_handle(struct iommu_group *group, struct iommu_domain *new_domain, struct iommu_attach_handle *handle) { - void *curr; + void *curr, *entry; int ret; if (!new_domain || !handle) return -EINVAL; mutex_lock(&group->mutex); - handle->domain = new_domain; + entry = iommu_make_pasid_array_entry(new_domain, handle); ret = 
xa_reserve(&group->pasid_array, IOMMU_NO_PASID, GFP_KERNEL); if (ret) goto err_unlock; @@ -3559,7 +3578,7 @@ int iommu_replace_group_handle(struct iommu_group *group, if (ret) goto err_release; - curr = xa_store(&group->pasid_array, IOMMU_NO_PASID, handle, GFP_KERNEL); + curr = xa_store(&group->pasid_array, IOMMU_NO_PASID, entry, GFP_KERNEL); WARN_ON(xa_is_err(curr)); mutex_unlock(&group->mutex); -- 2.51.0

From 5e9f822c9c683ae884fa5e71df41d1647b2876c6 Mon Sep 17 00:00:00 2001
From: Yi Liu
Date: Tue, 25 Feb 2025 17:18:49 -0800
Subject: [PATCH 10/16] iommu: Swap the order of setting group->pasid_array and calling attach op of iommu drivers

The current implementation stores the entry in group->pasid_array before the underlying iommu driver has successfully set the new domain. This can lead to issues where PRIs are received on the new domain before the attach operation is completed. Swap the order of operations to ensure that the domain is set in the underlying iommu driver before updating group->pasid_array. (A standalone sketch of this reserve-then-publish pattern follows patch 12 below.)

Link: https://patch.msgid.link/r/20250226011849.5102-5-yi.l.liu@intel.com
Suggested-by: Jason Gunthorpe
Reviewed-by: Jason Gunthorpe
Reviewed-by: Kevin Tian
Reviewed-by: Nicolin Chen
Reviewed-by: Lu Baolu
Signed-off-by: Yi Liu
Signed-off-by: Jason Gunthorpe
--- drivers/iommu/iommu.c | 48 ++++++++++++++++++++++++++++++++----------- 1 file changed, 36 insertions(+), 12 deletions(-) diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index d3652518c734..0ee17893810f 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -3388,13 +3388,29 @@ int iommu_attach_device_pasid(struct iommu_domain *domain, entry = iommu_make_pasid_array_entry(domain, handle); - ret = xa_insert(&group->pasid_array, pasid, entry, GFP_KERNEL); + /* + * Entry present is a failure case. Use xa_insert() instead of + * xa_reserve(). + */ + ret = xa_insert(&group->pasid_array, pasid, XA_ZERO_ENTRY, GFP_KERNEL); if (ret) goto out_unlock; ret = __iommu_set_group_pasid(domain, group, pasid); - if (ret) - xa_erase(&group->pasid_array, pasid); + if (ret) { + xa_release(&group->pasid_array, pasid); + goto out_unlock; + } + + /* + * The xa_insert() above reserved the memory, and the group->mutex is + * held, this cannot fail. The new domain cannot be visible until the + * operation succeeds as we cannot tolerate PRIs becoming concurrently + * queued and then failing attach. + */ + WARN_ON(xa_is_err(xa_store(&group->pasid_array, + pasid, entry, GFP_KERNEL))); + out_unlock: mutex_unlock(&group->mutex); return ret; @@ -3509,19 +3525,27 @@ int iommu_attach_group_handle(struct iommu_domain *domain, mutex_lock(&group->mutex); entry = iommu_make_pasid_array_entry(domain, handle); - ret = xa_insert(&group->pasid_array, IOMMU_NO_PASID, entry, GFP_KERNEL); + ret = xa_insert(&group->pasid_array, + IOMMU_NO_PASID, XA_ZERO_ENTRY, GFP_KERNEL); if (ret) - goto err_unlock; + goto out_unlock; ret = __iommu_attach_group(domain, group); - if (ret) - goto err_erase; - mutex_unlock(&group->mutex); + if (ret) { + xa_release(&group->pasid_array, IOMMU_NO_PASID); + goto out_unlock; + } - return 0; -err_erase: - xa_erase(&group->pasid_array, IOMMU_NO_PASID); -err_unlock: + /* + * The xa_insert() above reserved the memory, and the group->mutex is + * held, this cannot fail. The new domain cannot be visible until the + * operation succeeds as we cannot tolerate PRIs becoming concurrently + * queued and then failing attach.
+ */ + WARN_ON(xa_is_err(xa_store(&group->pasid_array, + IOMMU_NO_PASID, entry, GFP_KERNEL))); + +out_unlock: mutex_unlock(&group->mutex); return ret; } -- 2.51.0

From 032d7e435cbd81ffa37b846aea5a236f3f5912a3 Mon Sep 17 00:00:00 2001
From: Robin Murphy
Date: Wed, 26 Feb 2025 15:57:49 +0000
Subject: [PATCH 11/16] iommu/dma: Remove redundant locking

This reverts commit ac9a5d522bb80be50ea84965699e1c8257d745ce.

iommu_dma_init_domain() is now only called under the group mutex, as it should be, given that the default domain belongs to the group, so the additional internal locking is no longer needed.

Signed-off-by: Robin Murphy
Reviewed-by: Jason Gunthorpe
Link: https://lore.kernel.org/r/a943d4c198e6a1fffe998337d577dc3aa7f660a9.1740585469.git.robin.murphy@arm.com
Signed-off-by: Joerg Roedel
--- drivers/iommu/dma-iommu.c | 17 ++++------------- 1 file changed, 4 insertions(+), 13 deletions(-) diff --git a/drivers/iommu/dma-iommu.c b/drivers/iommu/dma-iommu.c index 94263ed2c564..0832998eca38 100644 --- a/drivers/iommu/dma-iommu.c +++ b/drivers/iommu/dma-iommu.c @@ -87,7 +87,6 @@ struct iommu_dma_cookie { struct iommu_domain *fq_domain; /* Options for dma-iommu use */ struct iommu_dma_options options; - struct mutex mutex; }; static DEFINE_STATIC_KEY_FALSE(iommu_deferred_attach_enabled); @@ -401,7 +400,6 @@ int iommu_get_dma_cookie(struct iommu_domain *domain) if (!domain->iova_cookie) return -ENOMEM; - mutex_init(&domain->iova_cookie->mutex); iommu_domain_set_sw_msi(domain, iommu_dma_sw_msi); return 0; } @@ -709,23 +707,20 @@ static int iommu_dma_init_domain(struct iommu_domain *domain, struct device *dev domain->geometry.aperture_start >> order); /* start_pfn is always nonzero for an already-initialised domain */ - mutex_lock(&cookie->mutex); if (iovad->start_pfn) { if (1UL << order != iovad->granule || base_pfn != iovad->start_pfn) { pr_warn("Incompatible range for DMA domain\n"); - ret = -EFAULT; - goto done_unlock; + return -EFAULT; } - ret = 0; - goto done_unlock; + return 0; } init_iova_domain(iovad, 1UL << order, base_pfn); ret = iova_domain_init_rcaches(iovad); if (ret) - goto done_unlock; + return ret; iommu_dma_init_options(&cookie->options, dev); @@ -734,11 +729,7 @@ static int iommu_dma_init_domain(struct iommu_domain *domain, struct device *dev (!device_iommu_capable(dev, IOMMU_CAP_DEFERRED_FLUSH) || iommu_dma_init_fq(domain))) domain->type = IOMMU_DOMAIN_DMA; - ret = iova_reserve_iommu_regions(dev, domain); - -done_unlock: - mutex_unlock(&cookie->mutex); - return ret; + return iova_reserve_iommu_regions(dev, domain); } /** -- 2.51.0

From 29c6e1c2b923b43e8082bba5c6675185a8fe305a Mon Sep 17 00:00:00 2001
From: Robin Murphy
Date: Thu, 27 Feb 2025 14:47:47 +0000
Subject: [PATCH 12/16] iommu: Unexport iommu_fwspec_free()

The drivers doing their own fwspec parsing have no need to call iommu_fwspec_free() since fwspecs were moved into dev_iommu, as returning an error from .probe_device will tear down the whole lot anyway. Move it into the private interface now that it only serves for of_iommu to clean up in an error case.

I have no idea what mtk_v1 was doing in effectively guaranteeing a NULL fwspec would be dereferenced if no "iommus" DT property was found, so add a check for that to at least make the code look sane.
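Returning to patch 10's reserve-then-publish ordering, as promised above: a standalone model of the idea, for illustration only (plain C and a bool stand in for xa_insert()/xa_release()/xa_store() and XA_ZERO_ENTRY):

	/* Standalone model: the slot is reserved (readers see nothing) until
	 * the attach succeeds, then the real entry is published; on failure
	 * the reservation is released. */
	#include <stdbool.h>
	#include <stdio.h>

	static void *slot;	/* models group->pasid_array[pasid] */
	static bool reserved;	/* models a XA_ZERO_ENTRY reservation */

	static void *reader_load(void)
	{
		return reserved ? NULL : slot;	/* reserved slot reads as empty */
	}

	static int attach(void *entry, bool driver_ok)
	{
		if (slot || reserved)
			return -1;		/* xa_insert() fails on an occupied slot */
		reserved = true;		/* reserve: publishing later cannot fail */

		if (!driver_ok) {		/* the driver attach op failed */
			reserved = false;	/* xa_release() */
			return -1;
		}
		slot = entry;			/* publish only after success */
		reserved = false;
		return 0;
	}

	int main(void)
	{
		int domain;

		attach(&domain, true);
		printf("visible: %p\n", reader_load());
		return 0;
	}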
Signed-off-by: Robin Murphy Reviewed-by: Jason Gunthorpe Link: https://lore.kernel.org/r/36e245489361de2d13db22a510fa5c79e7126278.1740667667.git.robin.murphy@arm.com Signed-off-by: Joerg Roedel --- drivers/iommu/arm/arm-smmu/arm-smmu.c | 1 - drivers/iommu/iommu-priv.h | 2 ++ drivers/iommu/iommu.c | 1 - drivers/iommu/mtk_iommu_v1.c | 14 ++++---------- drivers/iommu/tegra-smmu.c | 1 - include/linux/iommu.h | 5 ----- 6 files changed, 6 insertions(+), 18 deletions(-) diff --git a/drivers/iommu/arm/arm-smmu/arm-smmu.c b/drivers/iommu/arm/arm-smmu/arm-smmu.c index de205a34ffc6..ea9f8e484e35 100644 --- a/drivers/iommu/arm/arm-smmu/arm-smmu.c +++ b/drivers/iommu/arm/arm-smmu/arm-smmu.c @@ -1486,7 +1486,6 @@ static struct iommu_device *arm_smmu_probe_device(struct device *dev) out_cfg_free: kfree(cfg); out_free: - iommu_fwspec_free(dev); return ERR_PTR(ret); } diff --git a/drivers/iommu/iommu-priv.h b/drivers/iommu/iommu-priv.h index b4508423e13b..fed55fbfe99c 100644 --- a/drivers/iommu/iommu-priv.h +++ b/drivers/iommu/iommu-priv.h @@ -24,6 +24,8 @@ static inline const struct iommu_ops *iommu_fwspec_ops(struct iommu_fwspec *fwsp return iommu_ops_from_fwnode(fwspec ? fwspec->iommu_fwnode : NULL); } +void iommu_fwspec_free(struct device *dev); + int iommu_device_register_bus(struct iommu_device *iommu, const struct iommu_ops *ops, const struct bus_type *bus, diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index d96cfb3af8a3..ac4731d13a83 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -2837,7 +2837,6 @@ void iommu_fwspec_free(struct device *dev) dev_iommu_fwspec_set(dev, NULL); } } -EXPORT_SYMBOL_GPL(iommu_fwspec_free); int iommu_fwspec_add_ids(struct device *dev, const u32 *ids, int num_ids) { diff --git a/drivers/iommu/mtk_iommu_v1.c b/drivers/iommu/mtk_iommu_v1.c index a565b9e40f4a..3e724e7f10f0 100644 --- a/drivers/iommu/mtk_iommu_v1.c +++ b/drivers/iommu/mtk_iommu_v1.c @@ -446,22 +446,13 @@ static int mtk_iommu_v1_create_mapping(struct device *dev, static struct iommu_device *mtk_iommu_v1_probe_device(struct device *dev) { - struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev); + struct iommu_fwspec *fwspec = NULL; struct of_phandle_args iommu_spec; struct mtk_iommu_v1_data *data; int err, idx = 0, larbid, larbidx; struct device_link *link; struct device *larbdev; - /* - * In the deferred case, free the existed fwspec. - * Always initialize the fwspec internally. 
- */ - if (fwspec) { - iommu_fwspec_free(dev); - fwspec = dev_iommu_fwspec_get(dev); - } - while (!of_parse_phandle_with_args(dev->of_node, "iommus", "#iommu-cells", idx, &iommu_spec)) { @@ -476,6 +467,9 @@ static struct iommu_device *mtk_iommu_v1_probe_device(struct device *dev) idx++; } + if (!fwspec) + return ERR_PTR(-ENODEV); + data = dev_iommu_priv_get(dev); /* Link the consumer device with the smi-larb device(supplier) */ diff --git a/drivers/iommu/tegra-smmu.c b/drivers/iommu/tegra-smmu.c index 7f633bb5efef..69d353e1df84 100644 --- a/drivers/iommu/tegra-smmu.c +++ b/drivers/iommu/tegra-smmu.c @@ -846,7 +846,6 @@ static int tegra_smmu_configure(struct tegra_smmu *smmu, struct device *dev, err = ops->of_xlate(dev, args); if (err < 0) { dev_err(dev, "failed to parse SW group ID: %d\n", err); - iommu_fwspec_free(dev); return err; } diff --git a/include/linux/iommu.h b/include/linux/iommu.h index e93d2e918599..cf8c16ba04a0 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -1099,7 +1099,6 @@ struct iommu_mm_data { }; int iommu_fwspec_init(struct device *dev, struct fwnode_handle *iommu_fwnode); -void iommu_fwspec_free(struct device *dev); int iommu_fwspec_add_ids(struct device *dev, const u32 *ids, int num_ids); static inline struct iommu_fwspec *dev_iommu_fwspec_get(struct device *dev) @@ -1410,10 +1409,6 @@ static inline int iommu_fwspec_init(struct device *dev, return -ENODEV; } -static inline void iommu_fwspec_free(struct device *dev) -{ -} - static inline int iommu_fwspec_add_ids(struct device *dev, u32 *ids, int num_ids) { -- 2.51.0 From b46064a18810bad3aea089a79993ca5ea7a3d2b2 Mon Sep 17 00:00:00 2001 From: Robin Murphy Date: Fri, 28 Feb 2025 15:46:30 +0000 Subject: [PATCH 13/16] iommu: Handle race with default domain setup It turns out that deferred default domain creation leaves a subtle race window during iommu_device_register() wherein a client driver may asynchronously probe in parallel and get as far as performing DMA API operations with dma-direct, only to be switched to iommu-dma underfoot once the default domain attachment finally happens, with obviously disastrous consequences. Even the wonky of_iommu_configure() path is at risk, since iommu_fwspec_init() will no longer defer client probe as the instance ops are (necessarily) already registered, and the "replay" iommu_probe_device() call can see dev->iommu_group already set and so think there's nothing to do either. Fortunately we already have the right tool in the right place in the form of iommu_device_use_default_domain(), which just needs to ensure that said default domain is actually ready to *be* used. Deferring the client probe shouldn't have too much impact, given that this only happens while the IOMMU driver is probing, and is thus due to kick the deferred probe list again once it finishes.
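As a concrete picture of the window being closed, a simplified sketch follows (the function name is invented and struct iommu_group members are shown as if directly accessible; the real check sits inside iommu_device_use_default_domain(), as the hunk below shows):

#include <linux/errno.h>
#include <linux/mutex.h>

/*
 * bus_iommu_probe() finalises group->default_domain under group->mutex,
 * so a client device probing in parallel can take the mutex, observe
 * the domain still missing, and bounce out with -EPROBE_DEFER instead
 * of carrying on with dma-direct and being switched to iommu-dma
 * underfoot later.
 */
static int use_default_domain_sketch(struct iommu_group *group)
{
	int ret = 0;

	mutex_lock(&group->mutex);
	if (!group->default_domain)
		ret = -EPROBE_DEFER;	/* retried once the IOMMU driver finishes probing */
	mutex_unlock(&group->mutex);
	return ret;
}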
Reported-by: Charan Teja Kalla Fixes: 98ac73f99bc4 ("iommu: Require a default_domain for all iommu drivers") Reviewed-by: Jason Gunthorpe Signed-off-by: Robin Murphy Link: https://lore.kernel.org/r/e88b94c9b575034a2c98a48b3d383654cbda7902.1740753261.git.robin.murphy@arm.com Signed-off-by: Joerg Roedel --- drivers/iommu/iommu.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index ac4731d13a83..5b73d36e4359 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -3084,6 +3084,11 @@ int iommu_device_use_default_domain(struct device *dev) return 0; mutex_lock(&group->mutex); + /* We may race against bus_iommu_probe() finalising groups here */ + if (!group->default_domain) { + ret = -EPROBE_DEFER; + goto unlock_out; + } if (group->owner_cnt) { if (group->domain != group->default_domain || group->owner || !xa_empty(&group->pasid_array)) { -- 2.51.0 From fd598f71b66975c042f40ae7cd4310d6e699e149 Mon Sep 17 00:00:00 2001 From: Robin Murphy Date: Fri, 28 Feb 2025 15:46:31 +0000 Subject: [PATCH 14/16] iommu: Resolve ops in iommu_init_device() Since iommu_init_device() was factored out, it is in fact the only consumer of the ops which __iommu_probe_device() is resolving, so let it do that itself rather than passing them in. This also puts the ops lookup at a more logical point relative to the rest of the flow through __iommu_probe_device(). Signed-off-by: Robin Murphy Reviewed-by: Jason Gunthorpe Link: https://lore.kernel.org/r/fa4b6cfc67a352488b7f4e0b736008307ce9ac2e.1740753261.git.robin.murphy@arm.com Signed-off-by: Joerg Roedel --- drivers/iommu/iommu.c | 30 ++++++++++++++++-------------- 1 file changed, 16 insertions(+), 14 deletions(-) diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index 5b73d36e4359..d26a459f29c3 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -407,14 +407,28 @@ EXPORT_SYMBOL_GPL(dev_iommu_priv_set); * Init the dev->iommu and dev->iommu_group in the struct device and get the * driver probed */ -static int iommu_init_device(struct device *dev, const struct iommu_ops *ops) +static int iommu_init_device(struct device *dev) { + const struct iommu_ops *ops; struct iommu_device *iommu_dev; struct iommu_group *group; int ret; if (!dev_iommu_get(dev)) return -ENOMEM; + /* + * For FDT-based systems and ACPI IORT/VIOT, drivers register IOMMU + * instances with non-NULL fwnodes, and client devices should have been + * identified with a fwspec by this point. Otherwise, we can currently + * assume that only one of Intel, AMD, s390, PAMU or legacy SMMUv2 can + * be present, and that any of their registered instances has suitable + * ops for probing, and thus cheekily co-opt the same mechanism. + */ + ops = iommu_fwspec_ops(dev->iommu->fwspec); + if (!ops) { + ret = -ENODEV; + goto err_free; + } if (!try_module_get(ops->owner)) { ret = -EINVAL; @@ -517,22 +531,10 @@ DEFINE_MUTEX(iommu_probe_device_lock); static int __iommu_probe_device(struct device *dev, struct list_head *group_list) { - const struct iommu_ops *ops; struct iommu_group *group; struct group_device *gdev; int ret; - /* - * For FDT-based systems and ACPI IORT/VIOT, drivers register IOMMU - * instances with non-NULL fwnodes, and client devices should have been - * identified with a fwspec by this point. Otherwise, we can currently - * assume that only one of Intel, AMD, s390, PAMU or legacy SMMUv2 can - * be present, and that any of their registered instances has suitable - * ops for probing, and thus cheekily co-opt the same mechanism. 
- */ - ops = iommu_fwspec_ops(dev_iommu_fwspec_get(dev)); - if (!ops) - return -ENODEV; /* * Serialise to avoid races between IOMMU drivers registering in * parallel and/or the "replay" calls from ACPI/OF code via client @@ -546,7 +548,7 @@ static int __iommu_probe_device(struct device *dev, struct list_head *group_list if (dev->iommu_group) return 0; - ret = iommu_init_device(dev, ops); + ret = iommu_init_device(dev); if (ret) return ret; -- 2.51.0 From 3832862eb9c4dfa0e80b2522bfaedbc8a43de97d Mon Sep 17 00:00:00 2001 From: Robin Murphy Date: Fri, 28 Feb 2025 15:46:32 +0000 Subject: [PATCH 15/16] iommu: Keep dev->iommu state consistent At the moment, if of_iommu_configure() allocates dev->iommu itself via iommu_fwspec_init(), then suffers a DT parsing failure, it cleans up the fwspec but leaves the empty dev_iommu hanging around. So far this is benign (if a tiny bit wasteful), but we'd like to be able to reason about dev->iommu having a consistent and unambiguous lifecycle. Thus make sure that the of_iommu cleanup undoes precisely whatever it did. Signed-off-by: Robin Murphy Reviewed-by: Jason Gunthorpe Link: https://lore.kernel.org/r/d219663a3f23001f23d520a883ac622d70b4e642.1740753261.git.robin.murphy@arm.com Signed-off-by: Joerg Roedel --- drivers/iommu/iommu-priv.h | 2 ++ drivers/iommu/iommu.c | 2 +- drivers/iommu/of_iommu.c | 6 +++++- 3 files changed, 8 insertions(+), 2 deletions(-) diff --git a/drivers/iommu/iommu-priv.h b/drivers/iommu/iommu-priv.h index fed55fbfe99c..05fa6e682e88 100644 --- a/drivers/iommu/iommu-priv.h +++ b/drivers/iommu/iommu-priv.h @@ -17,6 +17,8 @@ static inline const struct iommu_ops *dev_iommu_ops(struct device *dev) return dev->iommu->iommu_dev->ops; } +void dev_iommu_free(struct device *dev); + const struct iommu_ops *iommu_ops_from_fwnode(const struct fwnode_handle *fwnode); static inline const struct iommu_ops *iommu_fwspec_ops(struct iommu_fwspec *fwspec) diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index d26a459f29c3..b0be5c168a43 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -355,7 +355,7 @@ static struct dev_iommu *dev_iommu_get(struct device *dev) return param; } -static void dev_iommu_free(struct device *dev) +void dev_iommu_free(struct device *dev) { struct dev_iommu *param = dev->iommu; diff --git a/drivers/iommu/of_iommu.c b/drivers/iommu/of_iommu.c index 97987cd78da9..e10a68b5ffde 100644 --- a/drivers/iommu/of_iommu.c +++ b/drivers/iommu/of_iommu.c @@ -116,6 +116,7 @@ static void of_pci_check_device_ats(struct device *dev, struct device_node *np) int of_iommu_configure(struct device *dev, struct device_node *master_np, const u32 *id) { + bool dev_iommu_present; int err; if (!master_np) @@ -127,6 +128,7 @@ int of_iommu_configure(struct device *dev, struct device_node *master_np, mutex_unlock(&iommu_probe_device_lock); return 0; } + dev_iommu_present = dev->iommu; /* * We don't currently walk up the tree looking for a parent IOMMU. 
@@ -147,8 +149,10 @@ int of_iommu_configure(struct device *dev, struct device_node *master_np, err = of_iommu_configure_device(master_np, dev, id); } - if (err) + if (err && dev_iommu_present) iommu_fwspec_free(dev); + else if (err && dev->iommu) + dev_iommu_free(dev); mutex_unlock(&iommu_probe_device_lock); if (!err && dev->bus) -- 2.51.0 From bcb81ac6ae3c2ef95b44e7b54c3c9522364a245c Mon Sep 17 00:00:00 2001 From: Robin Murphy Date: Fri, 28 Feb 2025 15:46:33 +0000 Subject: [PATCH 16/16] iommu: Get DT/ACPI parsing into the proper probe path In hindsight, there were some crucial subtleties overlooked when moving {of,acpi}_dma_configure() to driver probe time to allow waiting for IOMMU drivers with -EPROBE_DEFER, and these have become an ever-increasing source of problems. The IOMMU API has some fundamental assumptions that iommu_probe_device() is called for every device added to the system, in the order in which they are added. Calling it in a random order or not at all dependent on driver binding leads to malformed groups, a potential lack of isolation for devices with no driver, and all manner of unexpected concurrency and race conditions. We've attempted to mitigate the latter with point-fix bodges like iommu_probe_device_lock, but it's a losing battle and the time has come to bite the bullet and address the true source of the problem instead. The crux of the matter is that the firmware parsing actually serves two distinct purposes; one is identifying the IOMMU instance associated with a device so we can check its availability, the second is actually telling that instance about the relevant firmware-provided data for the device. However the latter also depends on the former, and at the time there was no good place to defer and retry that separately from the availability check we also wanted for client driver probe. Nowadays, though, we have a proper notion of multiple IOMMU instances in the core API itself, and each one gets a chance to probe its own devices upon registration, so we can finally make that work as intended for DT/IORT/VIOT platforms too. All we need is for iommu_probe_device() to be able to run the iommu_fwspec machinery currently buried deep in the wrong end of {of,acpi}_dma_configure(). Luckily it turns out to be surprisingly straightforward to bootstrap this transformation by pretty much just calling the same path twice. At client driver probe time, dev->driver is obviously set; conversely at device_add(), or a subsequent bus_iommu_probe(), any device waiting for an IOMMU really should *not* have a driver already, so we can use that as a condition to disambiguate the two cases, and avoid recursing back into the IOMMU core at the wrong times. Obviously this isn't the nicest thing, but for now it gives us a functional baseline to then unpick the layers in between without many more awkward cross-subsystem patches. There are some minor side-effects like dma_range_map potentially being created earlier, and some debug prints being repeated, but these aren't significantly detrimental. Let's make things work first, then deal with making them nice. With the basic flow finally in the right order again, the next step is probably turning the bus->dma_configure paths inside-out, since all we really need from bus code is its notion of which device and input ID(s) to parse the common firmware properties with... 
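Condensed for illustration, the resulting flow reads roughly as below (a sketch with locking and error handling omitted; the function name is invented, and the real change is spread across the hunks that follow):

#include <linux/device.h>

/*
 * At device_add()/bus_iommu_probe() time no driver is bound, so the
 * IOMMU core can invoke the bus dma_configure path itself to run the
 * DT/ACPI fwspec parsing; at driver probe time dev->driver is set,
 * dma_configure has already run, and the per-bus wrappers must avoid
 * dereferencing a driver pointer that is not there.
 */
static int init_device_sketch(struct device *dev)
{
	if (!dev->driver && dev->bus->dma_configure)
		dev->bus->dma_configure(dev);	/* parse firmware properties now */

	/* ...then resolve ops from the now-populated fwspec as before... */
	return 0;
}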
Acked-by: Bjorn Helgaas # pci-driver.c Acked-by: Rob Herring (Arm) # of/device.c Signed-off-by: Robin Murphy Reviewed-by: Lorenzo Pieralisi Reviewed-by: Jason Gunthorpe Link: https://lore.kernel.org/r/e3b191e6fd6ca9a1e84c5e5e40044faf97abb874.1740753261.git.robin.murphy@arm.com Signed-off-by: Joerg Roedel --- drivers/acpi/arm64/dma.c | 5 +++++ drivers/acpi/scan.c | 7 ------- drivers/amba/bus.c | 3 ++- drivers/base/platform.c | 3 ++- drivers/bus/fsl-mc/fsl-mc-bus.c | 3 ++- drivers/cdx/cdx.c | 3 ++- drivers/iommu/iommu.c | 24 +++++++++++++++++++++--- drivers/iommu/of_iommu.c | 7 ++++++- drivers/of/device.c | 7 ++++++- drivers/pci/pci-driver.c | 3 ++- 10 files changed, 48 insertions(+), 17 deletions(-) diff --git a/drivers/acpi/arm64/dma.c b/drivers/acpi/arm64/dma.c index 52b2abf88689..f30f138352b7 100644 --- a/drivers/acpi/arm64/dma.c +++ b/drivers/acpi/arm64/dma.c @@ -26,6 +26,11 @@ void acpi_arch_dma_setup(struct device *dev) else end = (1ULL << 32) - 1; + if (dev->dma_range_map) { + dev_dbg(dev, "dma_range_map already set\n"); + return; + } + ret = acpi_dma_get_range(dev, &map); if (!ret && map) { end = dma_range_map_max(map); diff --git a/drivers/acpi/scan.c b/drivers/acpi/scan.c index 9f4efa8f75a6..fb1fe9f3b1a3 100644 --- a/drivers/acpi/scan.c +++ b/drivers/acpi/scan.c @@ -1632,13 +1632,6 @@ static int acpi_iommu_configure_id(struct device *dev, const u32 *id_in) err = viot_iommu_configure(dev); mutex_unlock(&iommu_probe_device_lock); - /* - * If we have reason to believe the IOMMU driver missed the initial - * iommu_probe_device() call for dev, replay it to get things in order. - */ - if (!err && dev->bus) - err = iommu_probe_device(dev); - return err; } diff --git a/drivers/amba/bus.c b/drivers/amba/bus.c index 8ef259b4d037..71482d639a6d 100644 --- a/drivers/amba/bus.c +++ b/drivers/amba/bus.c @@ -364,7 +364,8 @@ static int amba_dma_configure(struct device *dev) ret = acpi_dma_configure(dev, attr); } - if (!ret && !drv->driver_managed_dma) { + /* @drv may not be valid when we're called from the IOMMU layer */ + if (!ret && dev->driver && !drv->driver_managed_dma) { ret = iommu_device_use_default_domain(dev); if (ret) arch_teardown_dma_ops(dev); diff --git a/drivers/base/platform.c b/drivers/base/platform.c index 6f2a33722c52..1813cfd0c4bd 100644 --- a/drivers/base/platform.c +++ b/drivers/base/platform.c @@ -1451,7 +1451,8 @@ static int platform_dma_configure(struct device *dev) attr = acpi_get_dma_attr(to_acpi_device_node(fwnode)); ret = acpi_dma_configure(dev, attr); } - if (ret || drv->driver_managed_dma) + /* @drv may not be valid when we're called from the IOMMU layer */ + if (ret || !dev->driver || drv->driver_managed_dma) return ret; ret = iommu_device_use_default_domain(dev); diff --git a/drivers/bus/fsl-mc/fsl-mc-bus.c b/drivers/bus/fsl-mc/fsl-mc-bus.c index d1f3d327ddd1..a8be8cf246fb 100644 --- a/drivers/bus/fsl-mc/fsl-mc-bus.c +++ b/drivers/bus/fsl-mc/fsl-mc-bus.c @@ -153,7 +153,8 @@ static int fsl_mc_dma_configure(struct device *dev) else ret = acpi_dma_configure_id(dev, DEV_DMA_COHERENT, &input_id); - if (!ret && !mc_drv->driver_managed_dma) { + /* @mc_drv may not be valid when we're called from the IOMMU layer */ + if (!ret && dev->driver && !mc_drv->driver_managed_dma) { ret = iommu_device_use_default_domain(dev); if (ret) arch_teardown_dma_ops(dev); diff --git a/drivers/cdx/cdx.c b/drivers/cdx/cdx.c index c573ed2ee71a..780fb0c4adba 100644 --- a/drivers/cdx/cdx.c +++ b/drivers/cdx/cdx.c @@ -360,7 +360,8 @@ static int cdx_dma_configure(struct device *dev) return ret; } - 
if (!ret && !cdx_drv->driver_managed_dma) { + /* @cdx_drv may not be valid when we're called from the IOMMU layer */ + if (!ret && dev->driver && !cdx_drv->driver_managed_dma) { ret = iommu_device_use_default_domain(dev); if (ret) arch_teardown_dma_ops(dev); diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index b0be5c168a43..c283721579b3 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -417,9 +417,21 @@ static int iommu_init_device(struct device *dev) if (!dev_iommu_get(dev)) return -ENOMEM; /* - * For FDT-based systems and ACPI IORT/VIOT, drivers register IOMMU - * instances with non-NULL fwnodes, and client devices should have been - * identified with a fwspec by this point. Otherwise, we can currently + * For FDT-based systems and ACPI IORT/VIOT, the common firmware parsing + * is buried in the bus dma_configure path. Properly unpicking that is + * still a big job, so for now just invoke the whole thing. The device + * already having a driver bound means dma_configure has already run and + * either found no IOMMU to wait for, or we're in its replay call right + * now, so either way there's no point calling it again. + */ + if (!dev->driver && dev->bus->dma_configure) { + mutex_unlock(&iommu_probe_device_lock); + dev->bus->dma_configure(dev); + mutex_lock(&iommu_probe_device_lock); + } + /* + * At this point, relevant devices either now have a fwspec which will + * match ops registered with a non-NULL fwnode, or we can reasonably * assume that only one of Intel, AMD, s390, PAMU or legacy SMMUv2 can * be present, and that any of their registered instances has suitable * ops for probing, and thus cheekily co-opt the same mechanism. @@ -429,6 +441,12 @@ static int iommu_init_device(struct device *dev) ret = -ENODEV; goto err_free; } + /* + * And if we do now see any replay calls, they would indicate someone + * misusing the dma_configure path outside bus code. + */ + if (dev->driver) + dev_WARN(dev, "late IOMMU probe at driver bind, something fishy here!\n"); if (!try_module_get(ops->owner)) { ret = -EINVAL; diff --git a/drivers/iommu/of_iommu.c b/drivers/iommu/of_iommu.c index e10a68b5ffde..6b989a62def2 100644 --- a/drivers/iommu/of_iommu.c +++ b/drivers/iommu/of_iommu.c @@ -155,7 +155,12 @@ int of_iommu_configure(struct device *dev, struct device_node *master_np, dev_iommu_free(dev); mutex_unlock(&iommu_probe_device_lock); - if (!err && dev->bus) + /* + * If we're not on the iommu_probe_device() path (as indicated by the + * initial dev->iommu) then try to simulate it. This should no longer + * happen unless of_dma_configure() is being misused outside bus code. + */ + if (!err && dev->bus && !dev_iommu_present) err = iommu_probe_device(dev); if (err && err != -EPROBE_DEFER) diff --git a/drivers/of/device.c b/drivers/of/device.c index edf3be197265..5053e5d532cc 100644 --- a/drivers/of/device.c +++ b/drivers/of/device.c @@ -99,6 +99,11 @@ int of_dma_configure_id(struct device *dev, struct device_node *np, bool coherent, set_map = false; int ret; + if (dev->dma_range_map) { + dev_dbg(dev, "dma_range_map already set\n"); + goto skip_map; + } + if (np == dev->of_node) bus_np = __of_get_dma_parent(np); else @@ -119,7 +124,7 @@ int of_dma_configure_id(struct device *dev, struct device_node *np, end = dma_range_map_max(map); set_map = true; } - +skip_map: /* * If @dev is expected to be DMA-capable then the bus code that created * it should have initialised its dma_mask pointer by this point. 
For diff --git a/drivers/pci/pci-driver.c b/drivers/pci/pci-driver.c index f57ea36d125d..4468b7327cab 100644 --- a/drivers/pci/pci-driver.c +++ b/drivers/pci/pci-driver.c @@ -1653,7 +1653,8 @@ static int pci_dma_configure(struct device *dev) pci_put_host_bridge_device(bridge); - if (!ret && !driver->driver_managed_dma) { + /* @driver may not be valid when we're called from the IOMMU layer */ + if (!ret && dev->driver && !driver->driver_managed_dma) { ret = iommu_device_use_default_domain(dev); if (ret) arch_teardown_dma_ops(dev); -- 2.51.0
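Taken together, the bus-level hunks above all apply the same hedge; as a composite sketch (illustrative only: the function name is invented, and the bool parameter stands in for each bus driver's driver_managed_dma flag):

#include <linux/device.h>
#include <linux/dma-map-ops.h>	/* arch_teardown_dma_ops() */
#include <linux/iommu.h>

/*
 * When the IOMMU core replays dma_configure during device_add(), no
 * driver is bound, so the to_xxx_driver(dev->driver) containers the
 * wrappers normally consult would be garbage; only a genuine
 * driver-probe call may look at driver_managed_dma and claim the
 * default domain.
 */
static int dma_configure_guard_sketch(struct device *dev, bool driver_managed_dma)
{
	int ret;

	/* ...DT/ACPI parsing happens first, assumed successful here... */

	if (!dev->driver || driver_managed_dma)
		return 0;

	ret = iommu_device_use_default_domain(dev);
	if (ret)
		arch_teardown_dma_ops(dev);
	return ret;
}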