break;
                }
 
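+               /*
+                * Overlapping domains legitimately share CPUs between
+                * groups, so only report repeats for !SD_OVERLAP domains.
+                */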
-               if (cpumask_intersects(groupmask, sched_group_cpus(group))) {
+               if (!(sd->flags & SD_OVERLAP) &&
+                   cpumask_intersects(groupmask, sched_group_cpus(group))) {
                        printk(KERN_CONT "\n");
                        printk(KERN_ERR "ERROR: repeated CPUs\n");
                        break;
 
 __setup("isolcpus=", isolated_cpu_setup);
 
-#ifdef CONFIG_NUMA
-
-/**
- * find_next_best_node - find the next node to include in a sched_domain
- * @node: node whose sched_domain we're building
- * @used_nodes: nodes already in the sched_domain
- *
- * Find the next node to include in a given scheduling domain. Simply
- * finds the closest node not already in the @used_nodes map.
- *
- * Should use nodemask_t.
- */
-static int find_next_best_node(int node, nodemask_t *used_nodes)
-{
-       int i, n, val, min_val, best_node = -1;
-
-       min_val = INT_MAX;
-
-       for (i = 0; i < nr_node_ids; i++) {
-               /* Start at @node */
-               n = (node + i) % nr_node_ids;
-
-               if (!nr_cpus_node(n))
-                       continue;
-
-               /* Skip already used nodes */
-               if (node_isset(n, *used_nodes))
-                       continue;
-
-               /* Simple min distance search */
-               val = node_distance(node, n);
-
-               if (val < min_val) {
-                       min_val = val;
-                       best_node = n;
-               }
-       }
-
-       if (best_node != -1)
-               node_set(best_node, *used_nodes);
-       return best_node;
-}
-
-/**
- * sched_domain_node_span - get a cpumask for a node's sched_domain
- * @node: node whose cpumask we're constructing
- * @span: resulting cpumask
- *
- * Given a node, construct a good cpumask for its sched_domain to span. It
- * should be one that prevents unnecessary balancing, but also spreads tasks
- * out optimally.
- */
-static void sched_domain_node_span(int node, struct cpumask *span)
-{
-       nodemask_t used_nodes;
-       int i;
-
-       cpumask_clear(span);
-       nodes_clear(used_nodes);
-
-       cpumask_or(span, span, cpumask_of_node(node));
-       node_set(node, used_nodes);
-
-       for (i = 1; i < SD_NODES_PER_DOMAIN; i++) {
-               int next_node = find_next_best_node(node, &used_nodes);
-               if (next_node < 0)
-                       break;
-               cpumask_or(span, span, cpumask_of_node(next_node));
-       }
-}
-
-static const struct cpumask *cpu_node_mask(int cpu)
-{
-       lockdep_assert_held(&sched_domains_mutex);
-
-       sched_domain_node_span(cpu_to_node(cpu), sched_domains_tmpmask);
-
-       return sched_domains_tmpmask;
-}
-
-static const struct cpumask *cpu_allnodes_mask(int cpu)
-{
-       return cpu_possible_mask;
-}
-#endif /* CONFIG_NUMA */
-
 static const struct cpumask *cpu_cpu_mask(int cpu)
 {
        return cpumask_of_node(cpu_to_node(cpu));
        sched_domain_init_f init;
        sched_domain_mask_f mask;
        int                 flags;
+       int                 numa_level;
        struct sd_data      data;
 };
 
 }
 
 SD_INIT_FUNC(CPU)
-#ifdef CONFIG_NUMA
- SD_INIT_FUNC(ALLNODES)
- SD_INIT_FUNC(NODE)
-#endif
 #ifdef CONFIG_SCHED_SMT
  SD_INIT_FUNC(SIBLING)
 #endif
        { sd_init_BOOK, cpu_book_mask, },
 #endif
        { sd_init_CPU, cpu_cpu_mask, },
-#ifdef CONFIG_NUMA
-       { sd_init_NODE, cpu_node_mask, SDTL_OVERLAP, },
-       { sd_init_ALLNODES, cpu_allnodes_mask, },
-#endif
        { NULL, },
 };
 
 static struct sched_domain_topology_level *sched_domain_topology = default_topology;
 
+#ifdef CONFIG_NUMA
+
+static int sched_domains_numa_levels;
+static int sched_domains_numa_scale;
+static int *sched_domains_numa_distance;
+static struct cpumask ***sched_domains_numa_masks;
+static int sched_domains_curr_level;
+
+static inline unsigned long numa_scale(unsigned long x, int level)
+{
+       return x * sched_domains_numa_distance[level] / sched_domains_numa_scale;
+}
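+
+/*
+ * Example (hypothetical SLIT values): with the conventional LOCAL_DISTANCE
+ * of 10 the scale is 10, so numa_scale(25, level) yields 50 for a level at
+ * distance 20 and 75 for one at distance 30.
+ */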
+
+static inline int sd_local_flags(int level)
+{
+       if (sched_domains_numa_distance[level] > REMOTE_DISTANCE)
+               return 0;
+
+       return SD_BALANCE_EXEC | SD_BALANCE_FORK | SD_WAKE_AFFINE;
+}
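+
+/*
+ * E.g. with the conventional REMOTE_DISTANCE of 20, sd_local_flags() keeps
+ * the affine fork/exec/wakeup balancing for levels at distance <= 20 and
+ * drops it for more remote levels, leaving only periodic balancing.
+ */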
+
+static struct sched_domain *
+sd_numa_init(struct sched_domain_topology_level *tl, int cpu)
+{
+       struct sched_domain *sd = *per_cpu_ptr(tl->data.sd, cpu);
+       int level = tl->numa_level;
+       int sd_weight = cpumask_weight(
+                       sched_domains_numa_masks[level][cpu_to_node(cpu)]);
+
+       *sd = (struct sched_domain){
+               .min_interval           = sd_weight,
+               .max_interval           = 2*sd_weight,
+               .busy_factor            = 32,
+               .imbalance_pct          = 100 + numa_scale(25, level),
+               .cache_nice_tries       = 2,
+               .busy_idx               = 3,
+               .idle_idx               = 2,
+               .newidle_idx            = 0,
+               .wake_idx               = 0,
+               .forkexec_idx           = 0,
+
+               .flags                  = 1*SD_LOAD_BALANCE
+                                       | 1*SD_BALANCE_NEWIDLE
+                                       | 0*SD_BALANCE_EXEC
+                                       | 0*SD_BALANCE_FORK
+                                       | 0*SD_BALANCE_WAKE
+                                       | 0*SD_WAKE_AFFINE
+                                       | 0*SD_PREFER_LOCAL
+                                       | 0*SD_SHARE_CPUPOWER
+                                       | 0*SD_POWERSAVINGS_BALANCE
+                                       | 0*SD_SHARE_PKG_RESOURCES
+                                       | 1*SD_SERIALIZE
+                                       | 0*SD_PREFER_SIBLING
+                                       | sd_local_flags(level)
+                                       ,
+               .last_balance           = jiffies,
+               .balance_interval       = sd_weight,
+       };
+       SD_INIT_NAME(sd, NUMA);
+       sd->private = &tl->data;
+
+       /*
+        * Ugly hack to pass state to sd_numa_mask()...
+        */
+       sched_domains_curr_level = tl->numa_level;
+
+       return sd;
+}
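+
+/*
+ * Continuing the distance-20 example, sd_numa_init() above would produce
+ * .imbalance_pct = 150 (100 + numa_scale(25, level)) and balance intervals
+ * proportional to the number of CPUs the level spans (sd_weight).
+ */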
+
+static const struct cpumask *sd_numa_mask(int cpu)
+{
+       return sched_domains_numa_masks[sched_domains_curr_level][cpu_to_node(cpu)];
+}
+
+static void sched_init_numa(void)
+{
+       int next_distance, curr_distance = node_distance(0, 0);
+       struct sched_domain_topology_level *tl;
+       int level = 0;
+       int i, j, k;
+
+       sched_domains_numa_scale = curr_distance;
+       sched_domains_numa_distance = kzalloc(sizeof(int) * nr_node_ids, GFP_KERNEL);
+       if (!sched_domains_numa_distance)
+               return;
+
+       /*
+        * O(nr_nodes^2) deduplicating selection sort -- in order to find the
+        * unique distances in the node_distance() table.
+        *
+        * Assumes node_distance(0,j) includes all distances in
+        * node_distance(i,j) in order to avoid cubic time.
+        *
+        * XXX: could be optimized to O(n log n) by using sort()
+        */
+       next_distance = curr_distance;
+       for (i = 0; i < nr_node_ids; i++) {
+               for (j = 0; j < nr_node_ids; j++) {
+                       int distance = node_distance(0, j);
+                       if (distance > curr_distance &&
+                                       (distance < next_distance ||
+                                        next_distance == curr_distance))
+                               next_distance = distance;
+               }
+               if (next_distance != curr_distance) {
+                       sched_domains_numa_distance[level++] = next_distance;
+                       sched_domains_numa_levels = level;
+                       curr_distance = next_distance;
+               } else {
+                       break;
+               }
+       }
+       /*
+        * 'level' contains the number of unique distances, excluding the
+        * identity distance node_distance(i,i).
+        *
+        * The sched_domains_numa_distance[] array includes the actual distance
+        * numbers.
+        */
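+
+       /*
+        * Worked example (hypothetical 4-node SLIT):
+        *
+        *      node   0   1   2   3
+        *        0:  10  20  20  30
+        *        1:  20  10  20  20
+        *        2:  20  20  10  20
+        *        3:  30  20  20  10
+        *
+        * Row 0 holds the off-node distances 20 and 30, so the scan above
+        * ends with level == 2 and sched_domains_numa_distance[] = { 20, 30 }.
+        */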
+
+       sched_domains_numa_masks = kzalloc(sizeof(void *) * level, GFP_KERNEL);
+       if (!sched_domains_numa_masks)
+               return;
+
+       /*
+        * Now for each level, construct a mask per node which contains all
+        * cpus of nodes that are that many hops away from us.
+        */
+       for (i = 0; i < level; i++) {
+               sched_domains_numa_masks[i] =
+                       kzalloc(nr_node_ids * sizeof(void *), GFP_KERNEL);
+               if (!sched_domains_numa_masks[i])
+                       return;
+
+               for (j = 0; j < nr_node_ids; j++) {
+                       struct cpumask *mask = kzalloc_node(cpumask_size(), GFP_KERNEL, j);
+                       if (!mask)
+                               return;
+
+                       sched_domains_numa_masks[i][j] = mask;
+
+                       for (k = 0; k < nr_node_ids; k++) {
+                               if (node_distance(j, k) >
+                                               sched_domains_numa_distance[i])
+                                       continue;
+
+                               cpumask_or(mask, mask, cpumask_of_node(k));
+                       }
+               }
+       }
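+
+       /*
+        * In the hypothetical table above, node 0's level 0 mask (distance
+        * <= 20) spans the CPUs of nodes {0,1,2}, while its level 1 mask
+        * (distance <= 30) spans all four nodes.
+        */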
+
+       tl = kzalloc((ARRAY_SIZE(default_topology) + level) *
+                       sizeof(struct sched_domain_topology_level), GFP_KERNEL);
+       if (!tl)
+               return;
+
+       /*
+        * Copy the default topology bits..
+        */
+       for (i = 0; default_topology[i].init; i++)
+               tl[i] = default_topology[i];
+
+       /*
+        * .. and append 'level' levels of NUMA goodness.
+        */
+       for (j = 0; j < level; i++, j++) {
+               tl[i] = (struct sched_domain_topology_level){
+                       .init = sd_numa_init,
+                       .mask = sd_numa_mask,
+                       .flags = SDTL_OVERLAP,
+                       .numa_level = j,
+               };
+       }
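+
+       /*
+        * For the hypothetical table above, the resulting array would be the
+        * default levels followed by NUMA level 0 (distance 20), NUMA level 1
+        * (distance 30), and the NULL terminator.
+        */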
+
+       sched_domain_topology = tl;
+}
+#else
+static inline void sched_init_numa(void)
+{
+}
+#endif /* CONFIG_NUMA */
+
 static int __sdt_alloc(const struct cpumask *cpu_map)
 {
        struct sched_domain_topology_level *tl;
        alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL);
        alloc_cpumask_var(&fallback_doms, GFP_KERNEL);
 
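+       /*
+        * Detect NUMA levels first: sched_init_numa() may replace
+        * sched_domain_topology, which init_sched_domains() consumes below.
+        */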
+       sched_init_numa();
+
        get_online_cpus();
        mutex_lock(&sched_domains_mutex);
        init_sched_domains(cpu_active_mask);