From 4ac4df18fc8938a9bbbc542ef4aa1c483ce7aff0 Mon Sep 17 00:00:00 2001
From: Elena Ufimtseva
Date: Wed, 13 Sep 2017 20:40:25 -0400
Subject: [PATCH] xen: Make PV Dom0 Linux kernel NUMA aware

Issue the Xen hypercall subop XENMEM_get_vnumainfo and set up the NUMA
topology from the returned data; otherwise set up a dummy NUMA node.
For the initial domain, prevent numa_init() from calling the other NUMA
initializers.

Enable vNUMA for dom0 unless the "numa" kernel boot option disables it.
This also requires Xen to carry the Dom0 NUMA support patches and to be
booted with the dom0_vcpus_pin=numa option.

Dom0 NUMA topology with this patch applied and Xen booted with
"dom0_mem=max:6144M dom0_vcpus_pin=numa dom0_max_vcpus=20":

[root@localhost ~]# numactl --ha
available: 2 nodes (0-1)
node 0 cpus: 0 1 2 3 4 5 6 7 8 9
node 0 size: 2966 MB
node 0 free: 2175 MB
node 1 cpus: 10 11 12 13 14 15 16 17 18 19
node 1 size: 2880 MB
node 1 free: 2143 MB
node distances:
node   0   1
  0:  10  21
  1:  21  10

And lstopo output:

Machine (5847MB)
  NUMANode L#0 (P#0 2967MB)
    Socket L#0
      L3 L#0 (25MB) + L2 L#0 (256KB) + L1d L#0 (32KB) + L1i L#0 (32KB) + Core L#0 + PU L#0 (P#0)
      L3 L#1 (25MB) + L2 L#1 (256KB) + L1d L#1 (32KB) + L1i L#1 (32KB) + Core L#1 + PU L#1 (P#1)
      L3 L#2 (25MB) + L2 L#2 (256KB) + L1d L#2 (32KB) + L1i L#2 (32KB) + Core L#2 + PU L#2 (P#2)
      L3 L#3 (25MB) + L2 L#3 (256KB) + L1d L#3 (32KB) + L1i L#3 (32KB) + Core L#3 + PU L#3 (P#3)
      L3 L#4 (25MB) + L2 L#4 (256KB) + L1d L#4 (32KB) + L1i L#4 (32KB) + Core L#4 + PU L#4 (P#4)
      L3 L#5 (25MB) + L2 L#5 (256KB) + L1d L#5 (32KB) + L1i L#5 (32KB) + Core L#5 + PU L#5 (P#5)
      L3 L#6 (25MB) + L2 L#6 (256KB) + L1d L#6 (32KB) + L1i L#6 (32KB) + Core L#6 + PU L#6 (P#6)
      L3 L#7 (25MB) + L2 L#7 (256KB) + L1d L#7 (32KB) + L1i L#7 (32KB) + Core L#7 + PU L#7 (P#7)
      L3 L#8 (25MB) + L2 L#8 (256KB) + L1d L#8 (32KB) + L1i L#8 (32KB) + Core L#8 + PU L#8 (P#8)
      L3 L#9 (25MB) + L2 L#9 (256KB) + L1d L#9 (32KB) + L1i L#9 (32KB) + Core L#9 + PU L#9 (P#9)
    HostBridge L#0
      PCIBridge
        PCI 1000:005d
          Block L#0 "sda"
          Block L#1 "sdb"
      PCIBridge
        PCI 8086:1528
          Net L#2 "eth0"
        PCI 8086:1528
          Net L#3 "eth1"
      PCIBridge
        PCI 102b:0522
      PCI 8086:8d02
  NUMANode L#1 (P#1 2880MB)
    Socket L#1
      L3 L#10 (25MB) + L2 L#10 (256KB) + L1d L#10 (32KB) + L1i L#10 (32KB) + Core L#10 + PU L#10 (P#10)
      L3 L#11 (25MB) + L2 L#11 (256KB) + L1d L#11 (32KB) + L1i L#11 (32KB) + Core L#11 + PU L#11 (P#11)
      L3 L#12 (25MB) + L2 L#12 (256KB) + L1d L#12 (32KB) + L1i L#12 (32KB) + Core L#12 + PU L#12 (P#12)
      L3 L#13 (25MB) + L2 L#13 (256KB) + L1d L#13 (32KB) + L1i L#13 (32KB) + Core L#13 + PU L#13 (P#13)
      L3 L#14 (25MB) + L2 L#14 (256KB) + L1d L#14 (32KB) + L1i L#14 (32KB) + Core L#14 + PU L#14 (P#14)
      L3 L#15 (25MB) + L2 L#15 (256KB) + L1d L#15 (32KB) + L1i L#15 (32KB) + Core L#15 + PU L#15 (P#15)
      L3 L#16 (25MB) + L2 L#16 (256KB) + L1d L#16 (32KB) + L1i L#16 (32KB) + Core L#16 + PU L#16 (P#16)
      L3 L#17 (25MB) + L2 L#17 (256KB) + L1d L#17 (32KB) + L1i L#17 (32KB) + Core L#17 + PU L#17 (P#17)
      L3 L#18 (25MB) + L2 L#18 (256KB) + L1d L#18 (32KB) + L1i L#18 (32KB) + Core L#18 + PU L#18 (P#18)
      L3 L#19 (25MB) + L2 L#19 (256KB) + L1d L#19 (32KB) + L1i L#19 (32KB) + Core L#19 + PU L#19 (P#19)
    HostBridge L#4
      PCIBridge
        PCI 8086:1528
          Net L#4 "eth2"
        PCI 8086:1528
          Net L#5 "eth3"
      PCIBridge
        PCI 8086:2701

OraBug: 26189217

Signed-off-by: Elena Ufimtseva
Reviewed-by: Boris Ostrovsky
Reviewed-by: Konrad Rzeszutek Wilk
---
 arch/x86/include/asm/xen/vnuma.h |  10 +++
 arch/x86/mm/numa.c               |  14 ++--
 arch/x86/xen/Makefile            |   1 +
 arch/x86/xen/setup.c             |   4 +-
 arch/x86/xen/vnuma.c             | 116 +++++++++++++++++++++++++++++++
 include/xen/interface/memory.h   |  50 +++++++++++++
 6 files changed, 188 insertions(+), 7 deletions(-)
 create mode 100644 arch/x86/include/asm/xen/vnuma.h
 create mode 100644 arch/x86/xen/vnuma.c

diff --git a/arch/x86/include/asm/xen/vnuma.h b/arch/x86/include/asm/xen/vnuma.h
new file mode 100644
index 000000000000..8c8b098f66dd
--- /dev/null
+++ b/arch/x86/include/asm/xen/vnuma.h
@@ -0,0 +1,10 @@
+#ifndef _ASM_X86_VNUMA_H
+#define _ASM_X86_VNUMA_H
+
+#ifdef CONFIG_XEN
+int xen_numa_init(void);
+#else
+static inline int xen_numa_init(void) { return -1; };
+#endif
+
+#endif /* _ASM_X86_VNUMA_H */
diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c
index 4053bb58bf92..3c1c011c17ec 100644
--- a/arch/x86/mm/numa.c
+++ b/arch/x86/mm/numa.c
@@ -18,6 +18,7 @@
 #include
 #include
 
+#include "asm/xen/vnuma.h"
 #include "numa_internal.h"
 
 int __initdata numa_off;
@@ -689,14 +690,19 @@ static int __init dummy_numa_init(void)
 void __init x86_numa_init(void)
 {
 	if (!numa_off) {
+		if (xen_initial_domain()) {
+			if (!numa_init(xen_numa_init))
+				return;
+		} else {
 #ifdef CONFIG_ACPI_NUMA
-		if (!numa_init(x86_acpi_numa_init))
-			return;
+			if (!numa_init(x86_acpi_numa_init))
+				return;
 #endif
 #ifdef CONFIG_AMD_NUMA
-		if (!numa_init(amd_numa_init))
-			return;
+			if (!numa_init(amd_numa_init))
+				return;
 #endif
+		}
 	}
 
 	numa_init(dummy_numa_init);
diff --git a/arch/x86/xen/Makefile b/arch/x86/xen/Makefile
index cb0164aee156..d3929fbe0595 100644
--- a/arch/x86/xen/Makefile
+++ b/arch/x86/xen/Makefile
@@ -24,3 +24,4 @@ obj-$(CONFIG_XEN_DOM0) += vga.o
 obj-$(CONFIG_SWIOTLB_XEN) += pci-swiotlb-xen.o
 obj-$(CONFIG_XEN_EFI) += efi.o
 obj-$(CONFIG_XEN_PVH) += xen-pvh.o
+obj-$(CONFIG_NUMA) += vnuma.o
diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c
index eaf72b735b50..e52b4b74ae30 100644
--- a/arch/x86/xen/setup.c
+++ b/arch/x86/xen/setup.c
@@ -20,6 +20,7 @@
 #include
 #include
 #include
+#include
 
 #include
 #include
@@ -1030,9 +1031,6 @@ void __init xen_arch_setup(void)
 	disable_cpufreq();
 	WARN_ON(xen_set_default_idle());
 	fiddle_vdso();
-#ifdef CONFIG_NUMA
-	numa_off = 1;
-#endif
 #ifdef CONFIG_ACPI_HOTPLUG_MEMORY
 	acpi_no_memhotplug = true;
 #endif
diff --git a/arch/x86/xen/vnuma.c b/arch/x86/xen/vnuma.c
new file mode 100644
index 000000000000..ba8b2c162f0d
--- /dev/null
+++ b/arch/x86/xen/vnuma.c
@@ -0,0 +1,116 @@
+/*
+ * vNUMA support for Dom0 Linux
+ *
+ * Copyright (c) 2017, Oracle and/or its affiliates. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ */
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+/*
+ * Called from numa_init() if numa_off == 0.
+ */
+int __init xen_numa_init(void)
+{
+	unsigned int i, j;
+	unsigned int nr_nodes, nr_cpus, nr_ranges;
+	unsigned int *vdistance, *cpu_to_node;
+	unsigned long mem_size, dist_size, cpu_to_node_size;
+	struct xen_vmemrange *vmem;
+	u64 physm, physd, physc;
+	int rc;
+	struct xen_vnuma_topology_info numa_topo = {
+		.domid = DOMID_SELF,
+		.pad = 0
+	};
+
+	physm = physd = physc = 0;
+
+	/* For now only Dom0 is supported through this mechanism.
+	 */
+	if (!xen_initial_domain())
+		return -EINVAL;
+
+	/*
+	 * Set the NUMA parameters to zero; the hypercall should then return
+	 * -ENOBUFS, and the hypervisor will copy back the number of cpus,
+	 * nodes and memory ranges.
+	 */
+	numa_topo.nr_vnodes = numa_topo.nr_vcpus = numa_topo.nr_vmemranges = 0;
+	rc = HYPERVISOR_memory_op(XENMEM_get_vnumainfo, &numa_topo);
+	if (rc != -ENOBUFS)
+		return rc ? rc : -EINVAL;
+
+	/* support for nodes with at least one cpu */
+	nr_nodes = numa_topo.nr_vnodes;
+	nr_ranges = numa_topo.nr_vmemranges;
+	nr_cpus = numa_topo.nr_vcpus;
+
+	/*
+	 * Allocate arrays sized for nr_cpus/nr_nodes/nr_ranges and do the
+	 * second hypercall. If it fails this time, don't retry; just fail.
+	 */
+	mem_size = nr_ranges * sizeof(struct xen_vmemrange);
+	dist_size = nr_nodes * nr_nodes * sizeof(*numa_topo.vdistance.h);
+	cpu_to_node_size = nr_cpus * sizeof(*numa_topo.vcpu_to_vnode.h);
+
+	physm = memblock_alloc(mem_size, PAGE_SIZE);
+	physd = memblock_alloc(dist_size, PAGE_SIZE);
+	physc = memblock_alloc(cpu_to_node_size, PAGE_SIZE);
+
+	if (!physm || !physd || !physc)
+		goto out;
+
+	vmem = __va(physm);
+	vdistance = __va(physd);
+	cpu_to_node = __va(physc);
+
+	set_xen_guest_handle(numa_topo.vmemrange.h, vmem);
+	set_xen_guest_handle(numa_topo.vdistance.h, vdistance);
+	set_xen_guest_handle(numa_topo.vcpu_to_vnode.h, cpu_to_node);
+
+	rc = HYPERVISOR_memory_op(XENMEM_get_vnumainfo, &numa_topo);
+	if (rc < 0)
+		goto out;
+
+	/*
+	 * NUMA node memory ranges are in pfns, constructed and
+	 * aligned based on the domain's e820 RAM map.
+	 */
+	for (i = 0; i < nr_ranges; i++) {
+		rc = numa_add_memblk(vmem[i].nid, vmem[i].start, vmem[i].end);
+		if (rc < 0)
+			goto out;
+	}
+
+	for (i = 0; i < nr_cpus; i++)
+		numa_set_node(i, cpu_to_node[i]);
+
+	for (i = 0; i < nr_nodes; i++)
+		for (j = 0; j < nr_nodes; j++)
+			numa_set_distance(i, j, *(vdistance + ((i * nr_nodes) + j)));
+
+	rc = 0;
+out:
+	if (physm)
+		memblock_free(physm, mem_size);
+	if (physd)
+		memblock_free(physd, dist_size);
+	if (physc)
+		memblock_free(physc, cpu_to_node_size);
+
+	return rc;
+}
diff --git a/include/xen/interface/memory.h b/include/xen/interface/memory.h
index 9aa8988cb340..d716a7e7b5af 100644
--- a/include/xen/interface/memory.h
+++ b/include/xen/interface/memory.h
@@ -264,4 +264,54 @@ struct xen_remove_from_physmap {
 };
 DEFINE_GUEST_HANDLE_STRUCT(xen_remove_from_physmap);
 
+/*
+ * Used to retrieve vNUMA topology info.
+ * Issue XENMEM_get_vnumainfo with zeroed counts first to obtain the
+ * number of nodes, vcpus and memory ranges before allocating buffers.
+ */
+#define XENMEM_get_vnumainfo 26
+
+/* vNUMA node memory ranges */
+struct xen_vmemrange {
+	uint64_t start, end;
+	uint32_t flags;
+	uint32_t nid;
+};
+DEFINE_GUEST_HANDLE_STRUCT(xen_vmemrange);
+
+/*
+ * vNUMA topology specifies the vNUMA node number, distance table,
+ * memory ranges and vcpu mapping provided for guests.
+ * The XENMEM_get_vnumainfo hypercall expects the guest to set nr_vnodes,
+ * nr_vmemranges and nr_vcpus to indicate the available buffer space.
+ * After the guest structures are filled, these counts are copied back to
+ * the guest; if the supplied values were too small, the expected values
+ * of nr_vnodes, nr_vmemranges and nr_vcpus are returned instead.
+ */
+struct xen_vnuma_topology_info {
+	/* IN */
+	domid_t domid;
+	uint16_t pad;
+
+	/* IN/OUT */
+	uint32_t nr_vnodes;
+	uint32_t nr_vcpus;
+	uint32_t nr_vmemranges;
+
+	/* OUT */
+	union {
+		GUEST_HANDLE(uint) h;
+		uint64_t _pad;
+	} vdistance;
+	union {
+		GUEST_HANDLE(uint) h;
+		uint64_t _pad;
+	} vcpu_to_vnode;
+	union {
+		GUEST_HANDLE(xen_vmemrange) h;
+		uint64_t _pad;
+	} vmemrange;
+};
+DEFINE_GUEST_HANDLE_STRUCT(xen_vnuma_topology_info);
+
 #endif /* __XEN_PUBLIC_MEMORY_H__ */
-- 
2.50.1
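
Note on the distance data in the hunks above: the table retrieved through the
vdistance handle is a flattened, row-major nr_vnodes x nr_vnodes matrix, which
xen_numa_init() indexes as vdistance[i * nr_nodes + j] and hands to
numa_set_distance(); numactl later reports it as the 10/21 "node distances"
table shown in the commit message. A small standalone sketch of that layout,
in plain userspace C with a hypothetical print_distances() helper
(illustration only, not part of the patch):

#include <stdio.h>

/*
 * Print a flattened row-major NUMA distance matrix, as retrieved through
 * the XENMEM_get_vnumainfo vdistance handle. Hypothetical helper for
 * illustration; the values below mirror the numactl output above.
 */
static void print_distances(const unsigned int *vdistance, unsigned int nr_nodes)
{
	unsigned int i, j;

	for (i = 0; i < nr_nodes; i++) {
		for (j = 0; j < nr_nodes; j++)
			printf(" %3u", vdistance[i * nr_nodes + j]);
		printf("\n");
	}
}

int main(void)
{
	/* Two nodes: local distance 10, remote distance 21. */
	unsigned int dist[] = { 10, 21, 21, 10 };

	print_distances(dist, 2);
	return 0;
}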