From c011624b48ca8713135fee9af20f65cc9610be7b Mon Sep 17 00:00:00 2001
From: Konrad Rzeszutek Wilk
Date: Fri, 10 Mar 2017 20:24:34 -0500
Subject: [PATCH] xen-pcifront/hvm: Slurp up "pxm" entry and set NUMA node on
 PCIe device. (V5)
MIME-Version: 1.0
Content-Type: text/plain; charset=utf8
Content-Transfer-Encoding: 8bit

If XenBus contains the "pci" directory (which by default it does for
both PV and HVM guests), iterate over all the entries there and check
whether any of them have a "pxm-X" key. If so, those values are used to
set the NUMA locality information on the matching PCIe devices.

Also support PCIe hotplug, in case this is done at runtime.

This patch depends on Xen exposing the "pxm-%d" entries via XenBus.

A bit of background: _PXM in ACPI is used to tie all kinds of ACPI
devices to the SRAT table. The SRAT table is a simple per-CPU array
listing APIC IDs and their NUMA nodes (the distances between nodes live
in the companion SLIT table). There are two entry types - processor
affinity and memory affinity. For example, a 4-CPU machine can have
this processor affinity:

 APIC_ID | NUMA id (nid)
 --------+--------------
    0    |      0
    2    |      0
    4    |      1
    6    |      1

The _PXM value ties in the NUMA node (nid), so for this guest there can
only be two - 0 or 1. A _PXM can be slapped on almost anything in the
DSDT: the Processors (kind of redundant, as that is already in SRAT),
but most importantly for us the PCIe devices. Except that ACPI does not
enumerate every kind of PCIe device:

 Device (PCI0)
 {
     Name (_HID, EisaId ("PNP0A08") /* PCI Express Bus */)  // _HID: Hardware ID
     ..
     Name (_PXM, Zero)  // _PXM: Device Proximity
 }
 Device (PCI1)
 {
     Name (_HID, EisaId ("PNP0A08") /* PCI Express Bus */)  // _HID: Hardware ID
     Name (_CID, EisaId ("PNP0A03") /* PCI Bus */)  // _CID: Compatible ID
     Name (_PXM, 0x01)  // _PXM: Device Proximity
 }

This nicely lets the Linux OS (and Windows) enumerate the PCIe bridges
(the two above) _ONCE_ during bootup. Then, when a device is hotplugged
under one of the bridges, it is immediately clear which NUMA domain it
belongs to.

To recap: on normal hardware Linux scans the DSDT _ONCE_ during bootup
and _only_ evaluates the _PXM on "PNP0A03" bridges. ONCE. (A minimal
sketch of that native flow follows the shortcut list below.)

On the QEMU guests that Xen provides we have exactly _one_ bridge, and
the PCIe devices are hotplugged as 'slots' under it. The SR-IOV VFs we
hot-plug in the guest are added at runtime (not during bootup - that
would be too easy). This means that to make this work we would need to
implement in QEMU:

1) Expand the piix4 emulation to have bridges - PCIe bridges - at
   bootup. Bridges also expose a "window" describing the size of the
   MMIO region behind them (into which the PCIe devices have to fit).
2) Create one of these PCI bridges per NUMA node, carrying the _PXM
   information.
3) During PCI hotplug, decide which bridge to use based on NUMA
   locality.

That is hard. Item 1) is especially difficult, as we have no idea how
big the MMIO BAR of the plugged-in device will be! Fortunately Intel
resolved this with VT-d: it has a hotplug capability, so you can insert
a brand new PCIe bridge at any point. This is, in essence, how
Thunderbolt works. It would mean that in QEMU we would also need to:

4) Emulate a VT-d IOMMU with PCI hotplug support.

Recognizing that 1-4 may take some time, and would need to be done
upstream first, I decided to take a bit of a shortcut. Mainly:

1) This only needs to work for Exadata, which uses our kernel (UEK).
2) We already expose some of this information on XenBus.
3) Once the upstream work is done, this can easily be dropped.
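For reference, the native flow recapped above amounts to roughly the
following. This is a hedged sketch, not part of the patch below:
pxm_to_node(), node_online() and set_dev_node() are existing kernel
helpers (header locations can vary by kernel version), while
example_apply_pxm() itself is purely illustrative.

#include <linux/device.h>
#include <linux/nodemask.h>
#include <linux/numa.h>
#include <acpi/acpi_numa.h>	/* pxm_to_node(); assumes CONFIG_ACPI_NUMA */

/* Illustrative only: translate a firmware _PXM value to a NUMA node
 * and record it on the device, as ACPI does for PNP0A03 bridges at
 * boot. */
static void example_apply_pxm(struct device *dev, int pxm)
{
	int node = pxm_to_node(pxm);	/* SRAT proximity domain -> nid */

	if (node != NUMA_NO_NODE && node_online(node))
		set_dev_node(dev, node);
}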
In fact, point 2) above exposes a lot of information:

/libxl/1/device/pci/0 = ""
/libxl/1/device/pci/0/frontend = "/local/domain/1/device/pci/0"
/libxl/1/device/pci/0/backend = "/local/domain/0/backend/pci/1/0"
.. snip ..
/libxl/1/device/pci/0/frontend-id = "1"
/libxl/1/device/pci/0/online = "1"
/libxl/1/device/pci/0/state = "1"
/libxl/1/device/pci/0/key-0 = "0000:04:00.0"
/libxl/1/device/pci/0/dev-0 = "0000:04:00.0"
/libxl/1/device/pci/0/vdevfn-0 = "28"
/libxl/1/device/pci/0/opts-0 = "msitranslate=0,power_mgmt=0,permissive=0"
/libxl/1/device/pci/0/state-0 = "1"
/libxl/1/device/pci/0/key-1 = "0000:07:00.0"
/libxl/1/device/pci/0/dev-1 = "0000:07:00.0"
/libxl/1/device/pci/0/vdevfn-1 = "30"
/libxl/1/device/pci/0/opts-1 = "msitranslate=0,power_mgmt=0,permissive=0"
/libxl/1/device/pci/0/state-1 = "1"
/libxl/1/device/pci/0/num_devs = "2"

The 'vdevfn' is the slot:function value in hex - 28 is 00:05.0 and 30
is 00:06.0 (see the decoding sketch below) - and that corresponds to
this inside the guest:

-bash-4.1# lspci
00:00.0 Host bridge: Intel Corporation 440FX - 82441FX PMC [Natoma] (rev 02)
00:01.0 ISA bridge: Intel Corporation 82371SB PIIX3 ISA [Natoma/Triton II]
00:01.1 IDE interface: Intel Corporation 82371SB PIIX3 IDE [Natoma/Triton II]
00:01.2 USB Controller: Intel Corporation 82371SB PIIX3 USB [Natoma/Triton II] (rev 01)
00:01.3 Bridge: Intel Corporation 82371AB/EB/MB PIIX4 ACPI (rev 01)
00:02.0 VGA compatible controller: Cirrus Logic GD 5446
00:03.0 Class ff80: XenSource, Inc. Xen Platform Device (rev 01)
00:05.0 USB Controller: NEC Corporation Device 0194 (rev 03)
00:06.0 USB Controller: NEC Corporation Device 0194 (rev 04)

The 'vdevfn' is created by QEMU when the device is hotplugged (or at
bootup time). So I figured that if we had an extra key:

/libxl/1/device/pci/0/pxm-0 = "1"
/libxl/1/device/pci/0/pxm-1 = "1"

we could use it inside the guest to call 'set_dev_node' on the PCIe
devices.
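To make the vdevfn arithmetic concrete, here is a hedged, stand-alone
sketch (compilable as plain userspace C) that re-implements the same
shift/mask as the kernel's PCI_SLOT()/PCI_FUNC() macros; the values are
taken from the XenStore dump above, everything else is illustrative.

#include <stdio.h>

/* Same shift/mask as the kernel's PCI_SLOT()/PCI_FUNC() macros. */
#define PCI_SLOT(devfn)	(((devfn) >> 3) & 0x1f)
#define PCI_FUNC(devfn)	((devfn) & 0x07)

int main(void)
{
	/* The hex 'vdevfn' values from the XenStore dump above. */
	unsigned int vdevfn[] = { 0x28, 0x30 };
	unsigned int i;

	for (i = 0; i < sizeof(vdevfn) / sizeof(vdevfn[0]); i++)
		printf("vdevfn %#x -> 00:%02x.%x\n", vdevfn[i],
		       PCI_SLOT(vdevfn[i]), PCI_FUNC(vdevfn[i]));

	return 0;	/* prints 00:05.0 and 00:06.0 */
}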
And in fact, for the lspci output above we can make this work:

-bash-4.1# find /sys -name local_cpulist
/sys/devices/pci0000:00/0000:00:00.0/local_cpulist
/sys/devices/pci0000:00/0000:00:01.3/local_cpulist
/sys/devices/pci0000:00/0000:00:03.0/local_cpulist
/sys/devices/pci0000:00/0000:00:01.1/local_cpulist
/sys/devices/pci0000:00/0000:00:06.0/local_cpulist  <===
/sys/devices/pci0000:00/0000:00:02.0/local_cpulist
/sys/devices/pci0000:00/0000:00:05.0/local_cpulist  <===
/sys/devices/pci0000:00/0000:00:01.2/local_cpulist
/sys/devices/pci0000:00/0000:00:01.0/local_cpulist
-bash-4.1# find /sys -name local_cpulist | xargs cat
0-3
0-3
0-3
0-3
2-3  <===
0-3
2-3  <===
0-3
0-3
-bash-4.1# find /sys -name cpulist
/sys/devices/system/node/node0/cpulist
/sys/devices/system/node/node1/cpulist
-bash-4.1# find /sys -name cpulist | xargs cat
0-1
2-3

With this guest config:

kernel = "hvmloader"
device_model_version = 'qemu-xen-traditional'
builder='hvm'
memory=1024
serial='pty'
smt=1
vcpus = 4
cpus=['0-3']
name="bootstrap-x86_64-pvhvm"
disk = [ 'file:/mnt/lab/bootstrap-x86_64/root_image.iso,hdc:cdrom,r','phy:/dev/guests/bootstrap-x86_64-pvhvm,xvda,w']
boot="dn"
vif = [ 'mac=00:0F:4B:00:00:68, bridge=switch' ]
vnc=1
vnclisten="0.0.0.0"
usb=1
usbdevice="tablet"
xen_platform_pci=1
vnuma=[ ["pnode=0", "size=512", "vcpus=0,1", "vdistances=10,20"],
        ["pnode=0", "size=512", "vcpus=2,3", "vdistances=20,10"] ]
pci=['04:00.0','07:00.0']

And during bootup you will see:

[   11.227821] calling  pcifront_init+0x0/0x118 @ 1
[   11.241826] pcifront_hvm pci-0: PCI device 0000:00:05.0 (PXM=1)
[   11.261298] pci 0000:00:05.0: Updating PXM to 1
[   11.273989] initcall pcifront_init+0x0/0x118 returned 0 after 32791 usecs
[   11.274620] pcifront_hvm pci-0: PCI device 0000:00:05.0 (PXM=1)
[   11.276977] pcifront_hvm pci-0: PCI device 0000:00:05.0 (PXM=1)

OraBug: 25788744
Signed-off-by: Konrad Rzeszutek Wilk
Reviewed-by: Håkon Bugge
Reviewed-by: Boris Ostrovsky
---
v1: Fixed all the checkpatch.pl issues.
    Made it respect node_online() in case the backend provided a larger
    value than there are NUMA nodes.
v2: Fixed per Boris's reviews.
v3: Added a mechanism to prune the list when devices are removed.
v4: s/l/len
    Added space after 'len' in declaration.
    Fixed comments.
    Added Reviewed-by.
v5: Added Boris's Reviewed-by.
---
 drivers/pci/xen-pcifront.c | 302 ++++++++++++++++++++++++++++++++++++-
 1 file changed, 297 insertions(+), 5 deletions(-)

diff --git a/drivers/pci/xen-pcifront.c b/drivers/pci/xen-pcifront.c
index 4f5bdcada351..4b3c733c5c34 100644
--- a/drivers/pci/xen-pcifront.c
+++ b/drivers/pci/xen-pcifront.c
@@ -380,6 +380,10 @@ static struct xen_pci_frontend_ops pci_frontend_ops = {
 
 static void pci_frontend_registrar(int enable)
 {
+
+	if (xen_hvm_domain())
+		return;
+
 	if (enable)
 		xen_pci_frontend = &pci_frontend_ops;
 	else
@@ -1154,23 +1158,311 @@ static struct xenbus_driver xenpci_driver = {
 	.otherend_changed = pcifront_backend_changed,
 };
 
+/*
+ * Device on the list of PCI passthrough devices that has been hotplugged
+ * or initialized at guest startup.
+ */
+struct pcifront_hvm_dev {
+	unsigned int domain;
+	unsigned int bus;
+	unsigned int devfn;
+	unsigned int pxm;
+	struct list_head list;
+};
+
+/*
+ * Our global list of devices on XenBus that are PCIe (not emulated).
+ */
+static LIST_HEAD(pcifront_devs);
+static DEFINE_MUTEX(pcifront_devs_mutex);
+
+static int pcifront_hvm_set_node(struct pci_dev *pci_dev, void *data)
+{
+	struct pcifront_hvm_dev *dev;
+
+	WARN_ON(!mutex_is_locked(&pcifront_devs_mutex));
+	list_for_each_entry(dev, &pcifront_devs, list) {
+		if ((dev->domain == pci_domain_nr(pci_dev->bus)) &&
+		    (dev->bus == pci_dev->bus->number) &&
+		    (dev->devfn == pci_dev->devfn)) {
+
+			int pxm = dev_to_node(&pci_dev->dev);
+
+			if (pxm == dev->pxm)
+				break;
+
+			dev_info(&pci_dev->dev, "PXM = %d\n", dev->pxm);
+			set_dev_node(&pci_dev->dev, dev->pxm);
+			break;
+		}
+	}
+
+	return 0;
+}
+
+/*
+ * Only called after bootup, and only if a PCIe device is hotplugged.
+ */
+static int pcifront_hvm_notifier(struct notifier_block *nb,
+				 unsigned long action, void *data)
+{
+	struct device *dev = data;
+	struct pci_dev *pci_dev = to_pci_dev(dev);
+
+	if (action == BUS_NOTIFY_ADD_DEVICE) {
+		mutex_lock(&pcifront_devs_mutex);
+		pcifront_hvm_set_node(pci_dev, NULL);
+		mutex_unlock(&pcifront_devs_mutex);
+	}
+	return NOTIFY_OK;
+}
+
+static struct notifier_block xenpci_hvm_nb = {
+	.notifier_call = pcifront_hvm_notifier,
+};
+
+static int pcifront_hvm_add_dev(unsigned int domain, unsigned int bus,
+				unsigned int devfn, unsigned int pxm)
+{
+	struct pcifront_hvm_dev *dev;
+
+	WARN_ON(!mutex_is_locked(&pcifront_devs_mutex));
+	/* Device may have been unplugged and replugged. */
+	list_for_each_entry(dev, &pcifront_devs, list) {
+		if ((dev->domain == domain) &&
+		    (dev->bus == bus) &&
+		    (dev->devfn == devfn)) {
+			/* Update PXM if needed. */
+			dev->pxm = pxm;
+
+			return 0;
+		}
+	}
+
+	dev = kzalloc(sizeof(struct pcifront_hvm_dev), GFP_KERNEL);
+	if (!dev)
+		return -ENOMEM;
+
+	dev->domain = domain;
+	dev->bus = bus;
+	dev->devfn = devfn;
+	dev->pxm = pxm;
+
+	INIT_LIST_HEAD(&dev->list);
+
+	list_add_tail(&dev->list, &pcifront_devs);
+
+	return 0;
+}
+
+static int pcifront_hvm_add(struct xenbus_device *xdev)
+{
+	unsigned int num_roots, i, bus = 0, domain = 0;
+	unsigned int devfn;
+	char str[64];
+	int len, err;
+
+	WARN_ON(!mutex_is_locked(&pcifront_devs_mutex));
+	err = xenbus_scanf(XBT_NIL, xdev->otherend,
+			   "num_devs", "%d", &num_roots);
+
+	/* This allows us to skip the addition, but we can still prune. */
+	if (err == -ENOENT)
+		num_roots = 0;
+
+	for (i = 0; i < num_roots; i++) {
+		int pxm;
+
+		len = snprintf(str, sizeof(str), "vdevfn-%d", i);
+		if (len < 0)
+			return -EINVAL;
+
+		err = xenbus_scanf(XBT_NIL, xdev->otherend, str, "%x", &devfn);
+		if (err != 1) {
+			if (err >= 0)
+				return -EINVAL;
+			break;
+		}
+		len = snprintf(str, sizeof(str), "pxm-%d", i);
+		if (len < 0)
+			return -EINVAL;
+
+		err = xenbus_scanf(XBT_NIL, xdev->otherend, str, "%x", &pxm);
+		/*
+		 * No point going on if we don't have the _PXM data, or if
+		 * the backend has no clue either.
+		 */
+		if (err != 1 || pxm < 0) {
+			err = 0;
+			continue;
+		}
+
+		dev_info(&xdev->dev, "PCI device %04x:%02x:%02x.%d (PXM=%d%s)\n",
+			 domain, bus, PCI_SLOT(devfn), PCI_FUNC(devfn), pxm,
+			 node_online(pxm) ? "" : " but node offline!");
+
+		if (!node_online(pxm)) {
+			err = 0;
+			continue;
+		}
+		err = pcifront_hvm_add_dev(domain, bus, devfn, pxm);
+		if (err)
+			break;
+	}
+
+	/* Prune the list if needed. */
+	if (!list_empty(&pcifront_devs)) {
+		struct pcifront_hvm_dev *dev, *tmp;
+
+		list_for_each_entry_safe(dev, tmp, &pcifront_devs, list) {
+			bool found = false;
+
+			for (i = 0; i < num_roots; i++) {
+				len = snprintf(str, sizeof(str), "vdevfn-%d", i);
+				if (len < 0)
+					return -EINVAL;
+
+				err = xenbus_scanf(XBT_NIL, xdev->otherend,
+						   str, "%x", &devfn);
+				if (err != 1) {
+					if (err >= 0)
+						return -EINVAL;
+				}
+				err = 0; /* Don't want the last error to be 1. */
+				if (devfn == dev->devfn) {
+					found = true;
+					break;
+				}
+			}
+
+			if (found)
+				continue;
+
+			dev_info(&xdev->dev, "PCI device %04x:%02x:%02x.%d unplugged\n",
+				 dev->domain, dev->bus, PCI_SLOT(dev->devfn),
+				 PCI_FUNC(dev->devfn));
+
+			list_del(&dev->list);
+			kfree(dev);
+		}
+	}
+	return err;
+}
+
+static int pcifront_hvm_xenbus_probe(struct xenbus_device *xdev,
+				     const struct xenbus_device_id *id)
+{
+	int err;
+
+	mutex_lock(&pcifront_devs_mutex);
+	err = pcifront_hvm_add(xdev);
+	mutex_unlock(&pcifront_devs_mutex);
+
+	return err;
+}
+
+static void __ref pcifront_hvm_backend_changed(struct xenbus_device *xdev,
+					       enum xenbus_state be_state)
+{
+	int err;
+
+	dev_dbg(&xdev->dev, "%d backend=%s\n", be_state, xdev->otherend);
+
+	err = pcifront_hvm_xenbus_probe(xdev, NULL);
+
+	/*
+	 * A lie, as we don't really connect. But we need this state,
+	 * otherwise wait_loop & is_device_connecting will loop forever
+	 * and we will never boot!
+	 */
+	xenbus_switch_state(xdev, XenbusStateConnected);
+}
+
+static int pcifront_hvm_xenbus_remove(struct xenbus_device *xdev)
+{
+	struct pcifront_hvm_dev *dev, *tmp;
+
+	mutex_lock(&pcifront_devs_mutex);
+
+	list_for_each_entry_safe(dev, tmp, &pcifront_devs, list) {
+		list_del(&dev->list);
+		kfree(dev);
+	}
+
+	INIT_LIST_HEAD(&pcifront_devs);
+
+	mutex_unlock(&pcifront_devs_mutex);
+
+	return 0;
+}
+
+static struct xenbus_driver xenpci_driver_hvm = {
+	.name = "pcifront_hvm",
+	.ids = xenpci_ids,
+	.probe = pcifront_hvm_xenbus_probe,
+	.remove = pcifront_hvm_xenbus_remove,
+	.otherend_changed = pcifront_hvm_backend_changed,
+};
+
+static void __init walk_pcifront_hvm(void)
+{
+	if (!xen_hvm_domain())
+		return;
+
+	mutex_lock(&pcifront_devs_mutex);
+
+	/*
+	 * PCI devices may have been added during guest init, which means
+	 * the pcifront_hvm_notifier would have been called - but the
+	 * 'devices' list had not been populated yet (that only happens
+	 * once we walk through XenBus).
+	 *
+	 * Therefore walk the PCI bus and check it against our XenBus list.
+	 */
+	if (!list_empty(&pcifront_devs)) {
+		struct pci_bus *root_bus;
+
+		list_for_each_entry(root_bus, &pci_root_buses, node)
+			pci_walk_bus(root_bus, pcifront_hvm_set_node, NULL);
+	}
+	mutex_unlock(&pcifront_devs_mutex);
+}
+
+static struct xenbus_driver *pcifront_driver = &xenpci_driver;
+static struct notifier_block *notifier;
+
 static int __init pcifront_init(void)
 {
-	if (!xen_pv_domain() || xen_initial_domain())
-		return -ENODEV;
+	int rc;
+
+	if (xen_hvm_domain()) {
+		pcifront_driver = &xenpci_driver_hvm;
+		notifier = &xenpci_hvm_nb;
 
-	if (!xen_has_pv_devices())
+		rc = bus_register_notifier(&pci_bus_type, notifier);
+		if (rc)
+			return rc;
+	} else if (xen_initial_domain())
 		return -ENODEV;
 
 	pci_frontend_registrar(1 /* enable */);
 
-	return xenbus_register_frontend(&xenpci_driver);
+	rc = xenbus_register_frontend(pcifront_driver);
+	if (rc) {
+		if (xen_hvm_domain())
+			bus_unregister_notifier(&pci_bus_type, notifier);
+		return rc;
+	}
+	walk_pcifront_hvm();
+
+	return 0;
 }
 
 static void __exit pcifront_cleanup(void)
 {
-	xenbus_unregister_driver(&xenpci_driver);
+	xenbus_unregister_driver(pcifront_driver);
 	pci_frontend_registrar(0 /* disable */);
+	if (xen_hvm_domain())
+		bus_unregister_notifier(&pci_bus_type, notifier);
 }
 module_init(pcifront_init);
 module_exit(pcifront_cleanup);
-- 
2.50.1