break;
                }
 
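+               /*
+                * Overlapping domains legitimately share CPUs between
+                * groups, so only report repeats for !SD_OVERLAP domains.
+                */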
-               if (cpumask_intersects(groupmask, sched_group_cpus(group))) {
+               if (!(sd->flags & SD_OVERLAP) &&
+                   cpumask_intersects(groupmask, sched_group_cpus(group))) {
                        printk(KERN_CONT "\n");
                        printk(KERN_ERR "ERROR: repeated CPUs\n");
                        break;
 
 __setup("isolcpus=", isolated_cpu_setup);
 
-#ifdef CONFIG_NUMA
-
-/**
- * find_next_best_node - find the next node to include in a sched_domain
- * @node: node whose sched_domain we're building
- * @used_nodes: nodes already in the sched_domain
- *
- * Find the next node to include in a given scheduling domain. Simply
- * finds the closest node not already in the @used_nodes map.
- *
- * Should use nodemask_t.
- */
-static int find_next_best_node(int node, nodemask_t *used_nodes)
-{
-       int i, n, val, min_val, best_node = -1;
-
-       min_val = INT_MAX;
-
-       for (i = 0; i < nr_node_ids; i++) {
-               /* Start at @node */
-               n = (node + i) % nr_node_ids;
-
-               if (!nr_cpus_node(n))
-                       continue;
-
-               /* Skip already used nodes */
-               if (node_isset(n, *used_nodes))
-                       continue;
-
-               /* Simple min distance search */
-               val = node_distance(node, n);
-
-               if (val < min_val) {
-                       min_val = val;
-                       best_node = n;
-               }
-       }
-
-       if (best_node != -1)
-               node_set(best_node, *used_nodes);
-       return best_node;
-}
-
-/**
- * sched_domain_node_span - get a cpumask for a node's sched_domain
- * @node: node whose cpumask we're constructing
- * @span: resulting cpumask
- *
- * Given a node, construct a good cpumask for its sched_domain to span. It
- * should be one that prevents unnecessary balancing, but also spreads tasks
- * out optimally.
- */
-static void sched_domain_node_span(int node, struct cpumask *span)
-{
-       nodemask_t used_nodes;
-       int i;
-
-       cpumask_clear(span);
-       nodes_clear(used_nodes);
-
-       cpumask_or(span, span, cpumask_of_node(node));
-       node_set(node, used_nodes);
-
-       for (i = 1; i < SD_NODES_PER_DOMAIN; i++) {
-               int next_node = find_next_best_node(node, &used_nodes);
-               if (next_node < 0)
-                       break;
-               cpumask_or(span, span, cpumask_of_node(next_node));
-       }
-}
-
-static const struct cpumask *cpu_node_mask(int cpu)
-{
-       lockdep_assert_held(&sched_domains_mutex);
-
-       sched_domain_node_span(cpu_to_node(cpu), sched_domains_tmpmask);
-
-       return sched_domains_tmpmask;
-}
-
-static const struct cpumask *cpu_allnodes_mask(int cpu)
-{
-       return cpu_possible_mask;
-}
-#endif /* CONFIG_NUMA */
-
 static const struct cpumask *cpu_cpu_mask(int cpu)
 {
        return cpumask_of_node(cpu_to_node(cpu));
        sched_domain_init_f init;
        sched_domain_mask_f mask;
        int                 flags;
+       int                 numa_level;
        struct sd_data      data;
 };
 
 }
 
 SD_INIT_FUNC(CPU)
-#ifdef CONFIG_NUMA
- SD_INIT_FUNC(ALLNODES)
- SD_INIT_FUNC(NODE)
-#endif
 #ifdef CONFIG_SCHED_SMT
  SD_INIT_FUNC(SIBLING)
 #endif
        { sd_init_BOOK, cpu_book_mask, },
 #endif
        { sd_init_CPU, cpu_cpu_mask, },
-#ifdef CONFIG_NUMA
-       { sd_init_NODE, cpu_node_mask, SDTL_OVERLAP, },
-       { sd_init_ALLNODES, cpu_allnodes_mask, },
-#endif
        { NULL, },
 };
 
 static struct sched_domain_topology_level *sched_domain_topology = default_topology;
 
+#ifdef CONFIG_NUMA
+
+static int sched_domains_numa_levels;
+static int sched_domains_numa_scale;
+static int *sched_domains_numa_distance;
+static struct cpumask ***sched_domains_numa_masks;
+static int sched_domains_curr_level;
+
+static inline unsigned long numa_scale(unsigned long x, int level)
+{
+       return x * sched_domains_numa_distance[level] / sched_domains_numa_scale;
+}
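+
+/*
+ * Example (hypothetical SLIT values): with the conventional LOCAL_DISTANCE
+ * of 10 the scale is 10, so numa_scale(25, level) yields 50 for a level at
+ * distance 20 and 75 for one at distance 30.
+ */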
+
+static inline int sd_local_flags(int level)
+{
+       if (sched_domains_numa_distance[level] > REMOTE_DISTANCE)
+               return 0;
+
+       return SD_BALANCE_EXEC | SD_BALANCE_FORK | SD_WAKE_AFFINE;
+}
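+
+/*
+ * E.g. with the conventional REMOTE_DISTANCE of 20, sd_local_flags() keeps
+ * the affine fork/exec/wakeup balancing for levels at distance <= 20 and
+ * drops it for more remote levels, leaving only periodic balancing.
+ */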
+
+static struct sched_domain *
+sd_numa_init(struct sched_domain_topology_level *tl, int cpu)
+{
+       struct sched_domain *sd = *per_cpu_ptr(tl->data.sd, cpu);
+       int level = tl->numa_level;
+       int sd_weight = cpumask_weight(
+                       sched_domains_numa_masks[level][cpu_to_node(cpu)]);
+
+       *sd = (struct sched_domain){
+               .min_interval           = sd_weight,
+               .max_interval           = 2*sd_weight,
+               .busy_factor            = 32,
+               .imbalance_pct          = 100 + numa_scale(25, level),
+               .cache_nice_tries       = 2,
+               .busy_idx               = 3,
+               .idle_idx               = 2,
+               .newidle_idx            = 0,
+               .wake_idx               = 0,
+               .forkexec_idx           = 0,
+
+               .flags                  = 1*SD_LOAD_BALANCE
+                                       | 1*SD_BALANCE_NEWIDLE
+                                       | 0*SD_BALANCE_EXEC
+                                       | 0*SD_BALANCE_FORK
+                                       | 0*SD_BALANCE_WAKE
+                                       | 0*SD_WAKE_AFFINE
+                                       | 0*SD_PREFER_LOCAL
+                                       | 0*SD_SHARE_CPUPOWER
+                                       | 0*SD_POWERSAVINGS_BALANCE
+                                       | 0*SD_SHARE_PKG_RESOURCES
+                                       | 1*SD_SERIALIZE
+                                       | 0*SD_PREFER_SIBLING
+                                       | sd_local_flags(level)
+                                       ,
+               .last_balance           = jiffies,
+               .balance_interval       = sd_weight,
+       };
+       SD_INIT_NAME(sd, NUMA);
+       sd->private = &tl->data;
+
+       /*
+        * Ugly hack to pass state to sd_numa_mask()...
+        */
+       sched_domains_curr_level = tl->numa_level;
+
+       return sd;
+}
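+
+/*
+ * Continuing the distance-20 example, sd_numa_init() above would produce
+ * .imbalance_pct = 150 (100 + numa_scale(25, level)) and balance intervals
+ * proportional to the number of CPUs the level spans (sd_weight).
+ */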
+
+static const struct cpumask *sd_numa_mask(int cpu)
+{
+       return sched_domains_numa_masks[sched_domains_curr_level][cpu_to_node(cpu)];
+}
+
+static void sched_init_numa(void)
+{
+       int next_distance, curr_distance = node_distance(0, 0);
+       struct sched_domain_topology_level *tl;
+       int level = 0;
+       int i, j, k;
+
+       sched_domains_numa_scale = curr_distance;
+       sched_domains_numa_distance = kzalloc(sizeof(int) * nr_node_ids, GFP_KERNEL);
+       if (!sched_domains_numa_distance)
+               return;
+
+       /*
+        * O(nr_nodes^2) deduplicating selection sort -- in order to find the
+        * unique distances in the node_distance() table.
+        *
+        * Assumes node_distance(0,j) includes all distances in
+        * node_distance(i,j) in order to avoid cubic time.
+        *
+        * XXX: could be optimized to O(n log n) by using sort()
+        */
+       next_distance = curr_distance;
+       for (i = 0; i < nr_node_ids; i++) {
+               for (j = 0; j < nr_node_ids; j++) {
+                       int distance = node_distance(0, j);
+                       if (distance > curr_distance &&
+                                       (distance < next_distance ||
+                                        next_distance == curr_distance))
+                               next_distance = distance;
+               }
+               if (next_distance != curr_distance) {
+                       sched_domains_numa_distance[level++] = next_distance;
+                       sched_domains_numa_levels = level;
+                       curr_distance = next_distance;
+               } else {
+                       break;
+               }
+       }
+       /*
+        * 'level' contains the number of unique distances, excluding the
+        * identity distance node_distance(i,i).
+        *
+        * The sched_domains_numa_distance[] array includes the actual distance
+        * numbers.
+        */
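+
+       /*
+        * Worked example (hypothetical 4-node SLIT):
+        *
+        *      node   0   1   2   3
+        *        0:  10  20  20  30
+        *        1:  20  10  20  20
+        *        2:  20  20  10  20
+        *        3:  30  20  20  10
+        *
+        * Row 0 holds the off-node distances 20 and 30, so the scan above
+        * ends with level == 2 and sched_domains_numa_distance[] = { 20, 30 }.
+        */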
+
+       sched_domains_numa_masks = kzalloc(sizeof(void *) * level, GFP_KERNEL);
+       if (!sched_domains_numa_masks)
+               return;
+
+       /*
+        * Now for each level, construct a mask per node which contains all
+        * cpus of nodes that are that many hops away from us.
+        */
+       for (i = 0; i < level; i++) {
+               sched_domains_numa_masks[i] =
+                       kzalloc(nr_node_ids * sizeof(void *), GFP_KERNEL);
+               if (!sched_domains_numa_masks[i])
+                       return;
+
+               for (j = 0; j < nr_node_ids; j++) {
+                       struct cpumask *mask = kzalloc_node(cpumask_size(), GFP_KERNEL, j);
+                       if (!mask)
+                               return;
+
+                       sched_domains_numa_masks[i][j] = mask;
+
+                       for (k = 0; k < nr_node_ids; k++) {
+                               if (node_distance(j, k) >
+                                               sched_domains_numa_distance[i])
+                                       continue;
+
+                               cpumask_or(mask, mask, cpumask_of_node(k));
+                       }
+               }
+       }
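+
+       /*
+        * In the hypothetical table above, node 0's level 0 mask (distance
+        * <= 20) spans the CPUs of nodes {0,1,2}, while its level 1 mask
+        * (distance <= 30) spans all four nodes.
+        */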
+
+       tl = kzalloc((ARRAY_SIZE(default_topology) + level) *
+                       sizeof(struct sched_domain_topology_level), GFP_KERNEL);
+       if (!tl)
+               return;
+
+       /*
+        * Copy the default topology bits..
+        */
+       for (i = 0; default_topology[i].init; i++)
+               tl[i] = default_topology[i];
+
+       /*
+        * .. and append 'level' levels of NUMA goodness.
+        */
+       for (j = 0; j < level; i++, j++) {
+               tl[i] = (struct sched_domain_topology_level){
+                       .init = sd_numa_init,
+                       .mask = sd_numa_mask,
+                       .flags = SDTL_OVERLAP,
+                       .numa_level = j,
+               };
+       }
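+
+       /*
+        * For the hypothetical table above, the resulting array would be the
+        * default levels followed by NUMA level 0 (distance 20), NUMA level 1
+        * (distance 30), and the NULL terminator.
+        */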
+
+       sched_domain_topology = tl;
+}
+#else
+static inline void sched_init_numa(void)
+{
+}
+#endif /* CONFIG_NUMA */
+
 static int __sdt_alloc(const struct cpumask *cpu_map)
 {
        struct sched_domain_topology_level *tl;
        alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL);
        alloc_cpumask_var(&fallback_doms, GFP_KERNEL);
 
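+       /*
+        * Detect NUMA levels first: sched_init_numa() may replace
+        * sched_domain_topology, which init_sched_domains() consumes below.
+        */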
+       sched_init_numa();
+
        get_online_cpus();
        mutex_lock(&sched_domains_mutex);
        init_sched_domains(cpu_active_mask);