From 7948c915f4a57d8d13551b0e8759c179e6151a3b Mon Sep 17 00:00:00 2001
From: Konrad Rzeszutek Wilk
Date: Thu, 30 Nov 2017 14:46:12 -0500
Subject: [PATCH] xen/pci: Add PXM node notifier for PXM (NUMA) changes.
MIME-Version: 1.0
Content-Type: text/plain; charset=utf8
Content-Transfer-Encoding: 8bit

commit c011624b48ca8713135fee9af20f65cc9610be7b
"xen-pcifront/hvm: Slurp up "pxm" entry and set NUMA node on PCIe device. (V5)"
added the functionality to update the PXM (ProXiMity) information on
PCI devices.

In today's pre-Q35 type guests the PCI bus topology is pretty flat -
there is only one bridge - and that does not square well with the way
one is supposed to expose PXM information, namely to have a PCI bridge
and plug devices in under that bridge. Since we cannot do that, we
provide the information via XenBus instead, and Xen pcifront listens
to it and updates the 'struct device' numa_node.

While that works, there is a race. Thanks to the separation of pieces,
Xend has no clue what the PCI topology in QEMU looks like. That means
it has no idea which bus:slot:function will be present until QEMU is
told to plug the device (ACPI hotplug) and then presents the correct
"vdevfn" to Xend - at which point Xend creates the XenBus entries that
Xen pcifront inside the guest enumerates.

This is what happens on the guest side:

 pci 0000:00:04.0: reg 0x18: [mem 0x00000000-0x07ffffff 64bit pref]
 pci 0000:00:02.0: BAR 6: [??? 0x00000000 flags 0x2] has bogus ali..
 pci 0000:00:04.0: BAR 2: assigned [mem 0x3c10000000-0x3c17ffffff ..
 mlx4_core: Mellanox ConnectX core driver v2.2-1 (Nov 30 2017)
 mlx4_core: Initializing 0000:00:04.0
 mlx4_core 0000:00:04.0: enabling device (0000 -> 0002)

(mlx4 is loaded and sets its numa_node based on pdev->dev, which is
still -1. Keep in mind that this numa_node lives on 'struct mlx_dev',
not on 'struct device'.)

 pcifront_hvm pci-0: PCI device 0000:00:04.0 (PXM=0)

(The XenBus entries are created and the numa_node of pdev->dev is
changed from -1 to 0. The notifications are sent out once the driver
has finished loading - BUS_NOTIFY_BOUND_DRIVER.)

But pdev->dev is NOT the same as 'struct mlx_dev', which has its own
numa_node attribute (exposed via SysFS)!

To make this work we need a notifier so that the mlx4 driver can be
told when the PXM is updated. With this patch (and another follow-up):

 mlx4_core 0000:00:04.0: Updating PXM-1 to 0

(mlx4 updates its PXM data.)

The sensible way would be for the Xen hypervisor to keep track of the
guest PCI topology instead of delegating that to QEMU, and in fact
that work is being done upstream. With that one could query the
hypervisor for the next available slot in which to hotplug the device
instead of asking QEMU. But that is a ways off, and we do not have
that much time.

Instead we add a per-'struct pci_dev' notifier that drivers can
subscribe to in order to update their 'numa_node' when it changes. It
is not perfect, but it does the job. It is also a NOP when running on
anything other than a Xen HVM domain.
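
For illustration, here is a minimal sketch of how a driver could
consume this notifier. The my_drv_* names are hypothetical and not
part of this patch; only register_pci_pxm_handler() and
unregister_pci_pxm_handler() come from it:

#include <linux/notifier.h>
#include <linux/pci.h>
#include <linux/slab.h>
#include <asm/xen/pci.h>

struct my_drv_priv {
        struct pci_dev *pdev;
        int numa_node;                  /* driver's cached NUMA node */
        struct notifier_block pxm_nb;
};

/* Called via the notifier chain when pcifront updates the PXM. */
static int my_drv_pxm_cb(struct notifier_block *nb, unsigned long action,
                         void *data)
{
        struct my_drv_priv *priv = container_of(nb, struct my_drv_priv,
                                                pxm_nb);
        struct pci_dev *pdev = data;

        /* pcifront has already called set_dev_node(); refresh our cache. */
        priv->numa_node = dev_to_node(&pdev->dev);
        return NOTIFY_OK;
}

static int my_drv_probe(struct pci_dev *pdev)
{
        struct my_drv_priv *priv = kzalloc(sizeof(*priv), GFP_KERNEL);
        int rc;

        if (!priv)
                return -ENOMEM;

        priv->pdev = pdev;
        priv->numa_node = dev_to_node(&pdev->dev);      /* may still be -1 */
        priv->pxm_nb.notifier_call = my_drv_pxm_cb;
        pci_set_drvdata(pdev, priv);

        /* A NOP (returns 0) on anything that is not a Xen HVM domain. */
        rc = register_pci_pxm_handler(&priv->pxm_nb, pdev);
        if (rc)
                kfree(priv);
        return rc;
}

static void my_drv_remove(struct pci_dev *pdev)
{
        struct my_drv_priv *priv = pci_get_drvdata(pdev);

        unregister_pci_pxm_handler(&priv->pxm_nb, pdev);
        kfree(priv);
}
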
OraBug: 27200813
Acked-by: Håkon Bugge
Reviewed-by: Boris Ostrovsky
Signed-off-by: Konrad Rzeszutek Wilk
---
 arch/x86/include/asm/xen/pci.h |   4 ++
 arch/x86/pci/xen.c             | 122 ++++++++++++++++++++++++++++++++++
 drivers/pci/xen-pcifront.c     |  11 +--
 3 files changed, 132 insertions(+), 5 deletions(-)

diff --git a/arch/x86/include/asm/xen/pci.h b/arch/x86/include/asm/xen/pci.h
index f320ee32d5a1..211d4e2ab975 100644
--- a/arch/x86/include/asm/xen/pci.h
+++ b/arch/x86/include/asm/xen/pci.h
@@ -79,4 +79,8 @@ static inline void xen_pci_frontend_disable_msix(struct pci_dev *dev)
 #endif /* CONFIG_PCI_XEN */
 #endif /* CONFIG_PCI_MSI */
 
+int register_pci_pxm_handler(struct notifier_block *nb, struct pci_dev *pdev);
+int unregister_pci_pxm_handler(struct notifier_block *nb, struct pci_dev *pdev);
+void do_kernel_pci_update_pxm(struct pci_dev *dev);
+
 #endif /* _ASM_X86_XEN_PCI_H */
diff --git a/arch/x86/pci/xen.c b/arch/x86/pci/xen.c
index 7c07a0027bc6..851324230919 100644
--- a/arch/x86/pci/xen.c
+++ b/arch/x86/pci/xen.c
@@ -576,3 +576,125 @@ int xen_unregister_device_domain_owner(struct pci_dev *dev)
 }
 EXPORT_SYMBOL_GPL(xen_unregister_device_domain_owner);
 #endif
+
+/*
+ * Notifier list for kernel code which wants to be called
+ * to get PXM updates.
+ */
+struct pci_pxm_nb_device {
+        struct atomic_notifier_head n_head;
+        struct pci_dev *pdev;
+        struct list_head list;
+};
+
+static DEFINE_SPINLOCK(pci_pxm_list_spinlock);
+static struct list_head pci_pxm_list = LIST_HEAD_INIT(pci_pxm_list);
+
+/**
+ * register_pci_pxm_handler - Register function to be called on
+ * PXM (NUMA locality) updates for a PCI device
+ * @nb: Info about handler function to be called
+ * @pdev: PCI device whose PXM updates @nb is interested in
+ *
+ * Registers a function to be called whenever the PXM of @pdev changes.
+ *
+ * Returns zero on success, -EEXIST if a handler is already registered
+ * for @pdev, or -ENOMEM if memory allocation fails.
+ */
+int register_pci_pxm_handler(struct notifier_block *nb, struct pci_dev *pdev)
+{
+        struct pci_pxm_nb_device *owner, *entry;
+        int rc;
+
+        if (!xen_hvm_domain())
+                return 0;
+
+        spin_lock(&pci_pxm_list_spinlock);
+        list_for_each_entry(entry, &pci_pxm_list, list) {
+                if (entry->pdev == pdev) {
+                        spin_unlock(&pci_pxm_list_spinlock);
+                        return -EEXIST;
+                }
+        }
+        spin_unlock(&pci_pxm_list_spinlock);
+
+        owner = kzalloc(sizeof(struct pci_pxm_nb_device), GFP_KERNEL);
+        if (!owner)
+                return -ENOMEM;
+
+        owner->pdev = pdev;
+        ATOMIC_INIT_NOTIFIER_HEAD(&owner->n_head);
+        rc = atomic_notifier_chain_register(&owner->n_head, nb);
+        if (rc) {
+                kfree(owner);
+                return rc;
+        }
+
+        spin_lock(&pci_pxm_list_spinlock);
+        list_add_tail(&owner->list, &pci_pxm_list);
+        spin_unlock(&pci_pxm_list_spinlock);
+
+        return 0;
+}
+EXPORT_SYMBOL(register_pci_pxm_handler);
+
+/**
+ * unregister_pci_pxm_handler - Unregister a previously registered
+ * PXM handler for a PCI device
+ * @nb: Hook to be unregistered
+ * @pdev: PCI device @nb was registered for
+ *
+ * Unregisters a previously registered PXM handler function.
+ * Returns zero on success, or a negative errno (such as -ENODEV) on failure.
+ */
+int unregister_pci_pxm_handler(struct notifier_block *nb, struct pci_dev *pdev)
+{
+        struct pci_pxm_nb_device *owner = NULL, *entry;
+        int rc = -ENODEV;
+
+        if (!xen_hvm_domain())
+                return 0;
+
+        spin_lock(&pci_pxm_list_spinlock);
+        list_for_each_entry(entry, &pci_pxm_list, list) {
+                if (entry->pdev == pdev) {
+                        owner = entry;
+                        break;
+                }
+        }
+        if (owner)
+                list_del(&owner->list);
+
+        spin_unlock(&pci_pxm_list_spinlock);
+
+        if (!owner)
+                return -ENODEV;
+
+        rc = atomic_notifier_chain_unregister(&owner->n_head, nb);
+        kfree(owner);
+
+        return rc;
+}
+EXPORT_SYMBOL(unregister_pci_pxm_handler);
+
+/**
+ * do_kernel_pci_update_pxm - Execute kernel PXM handler call chain
+ * @pdev: PCI device whose PXM information was updated
+ *
+ * Calls functions registered with register_pci_pxm_handler() for @pdev.
+ */
+void do_kernel_pci_update_pxm(struct pci_dev *pdev)
+{
+        struct pci_pxm_nb_device *entry;
+
+        if (!xen_hvm_domain())
+                return;
+
+        spin_lock(&pci_pxm_list_spinlock);
+        list_for_each_entry(entry, &pci_pxm_list, list) {
+                if (entry->pdev == pdev)
+                        atomic_notifier_call_chain(&entry->n_head, 0, pdev);
+        }
+        spin_unlock(&pci_pxm_list_spinlock);
+}
+EXPORT_SYMBOL(do_kernel_pci_update_pxm);
diff --git a/drivers/pci/xen-pcifront.c b/drivers/pci/xen-pcifront.c
index 10bc3e26eb02..f78b51d2eadb 100644
--- a/drivers/pci/xen-pcifront.c
+++ b/drivers/pci/xen-pcifront.c
@@ -1186,11 +1186,12 @@ static int pcifront_hvm_set_node(struct pci_dev *pci_dev, void *data)
                 int pxm = dev_to_node(&pci_dev->dev);
 
-                if (pxm == dev->pxm)
-                        break;
-
-                dev_info(&pci_dev->dev, "PXM = %d\n", dev->pxm);
-                set_dev_node(&pci_dev->dev, dev->pxm);
+                if (pxm != dev->pxm) {
+                        dev_info(&pci_dev->dev, "PXM = %d\n", dev->pxm);
+                        set_dev_node(&pci_dev->dev, dev->pxm);
+                }
+
+                do_kernel_pci_update_pxm(pci_dev);
                 break;
         }
 }
-- 
2.50.1