}
 
 static unsigned long weighted_cpuload(const int cpu);
+static unsigned long source_load(int cpu, int type);
+static unsigned long target_load(int cpu, int type);
+static unsigned long power_of(int cpu);
+static long effective_load(struct task_group *tg, int cpu, long wl, long wg);
 
+struct numa_stats {
+       unsigned long load;
+       s64 eff_load;
+       unsigned long faults;
+};
 
-static int
-find_idlest_cpu_node(int this_cpu, int nid)
-{
-       unsigned long load, min_load = ULONG_MAX;
-       int i, idlest_cpu = this_cpu;
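+/*
+ * Working state for one task_numa_migrate() attempt: the task, the source
+ * and destination CPUs/nodes being considered, and the best candidate CPU
+ * found so far on the destination node.
+ */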
+struct task_numa_env {
+       struct task_struct *p;
 
-       BUG_ON(cpu_to_node(this_cpu) == nid);
+       int src_cpu, src_nid;
+       int dst_cpu, dst_nid;
 
-       rcu_read_lock();
-       for_each_cpu(i, cpumask_of_node(nid)) {
-               load = weighted_cpuload(i);
+       struct numa_stats src_stats, dst_stats;
 
-               if (load < min_load) {
-                       min_load = load;
-                       idlest_cpu = i;
+       unsigned long best_load;
+       int best_cpu;
+};
+
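+/*
+ * Try to move @p to a CPU on its preferred NUMA node: an idle CPU if one is
+ * found, otherwise the least loaded CPU whose effective load stays at or
+ * below that of the source CPU. Falls back to the task's current CPU when
+ * no suitable candidate exists.
+ */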
+static int task_numa_migrate(struct task_struct *p)
+{
+       int node_cpu = cpumask_first(cpumask_of_node(p->numa_preferred_nid));
+       struct task_numa_env env = {
+               .p = p,
+               .src_cpu = task_cpu(p),
+               .src_nid = cpu_to_node(task_cpu(p)),
+               .dst_cpu = node_cpu,
+               .dst_nid = p->numa_preferred_nid,
+               .best_load = ULONG_MAX,
+               .best_cpu = task_cpu(p),
+       };
+       struct sched_domain *sd;
+       int cpu;
+       struct task_group *tg = task_group(p);
+       unsigned long weight;
+       bool balanced;
+       int imbalance_pct, idx = -1;
+
+       /*
+        * Find the lowest scheduling domain that covers both the CPU the
+        * task is currently running on and the target NUMA node.
+        */
+       rcu_read_lock();
+       for_each_domain(env.src_cpu, sd) {
+               if (cpumask_test_cpu(node_cpu, sched_domain_span(sd))) {
+                       /*
+                        * Use busy_idx for the load decision, as it is
+                        * the same load index the regular load balancer
+                        * uses for an active CPU.
+                        */
+                       idx = sd->busy_idx;
+                       imbalance_pct = sd->imbalance_pct;
+                       break;
                }
        }
        rcu_read_unlock();
 
-       return idlest_cpu;
+       if (WARN_ON_ONCE(idx == -1))
+               return 0;
+
+       /*
+        * XXX: the code below is mostly lifted from wake_affine(); we should
+        * look into sharing it where possible. It may also want better
+        * per-entity weight handling.
+        */
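+       /*
+        * The source side is biased upwards by (100 + (imbalance_pct - 100) / 2)
+        * and has the task's weight removed via effective_load(), as if the
+        * task had already left this CPU. The bias lets a destination CPU be
+        * somewhat busier than the source and still count as balanced, which
+        * favours moving the task towards its preferred node.
+        */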
+       weight = p->se.load.weight;
+       env.src_stats.load = source_load(env.src_cpu, idx);
+       env.src_stats.eff_load = 100 + (imbalance_pct - 100) / 2;
+       env.src_stats.eff_load *= power_of(env.src_cpu);
+       env.src_stats.eff_load *= env.src_stats.load + effective_load(tg, env.src_cpu, -weight, -weight);
+
+       for_each_cpu(cpu, cpumask_of_node(env.dst_nid)) {
+               env.dst_cpu = cpu;
+               env.dst_stats.load = target_load(cpu, idx);
+
+               /* If the CPU is idle, use it */
+               if (!env.dst_stats.load) {
+                       env.best_cpu = cpu;
+                       goto migrate;
+               }
+
+               /* Otherwise check the target CPU load */
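+               /*
+                * The destination side carries no imbalance bias and includes
+                * the task's weight via effective_load(), as if the task had
+                * already moved to this CPU.
+                */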
+               env.dst_stats.eff_load = 100;
+               env.dst_stats.eff_load *= power_of(cpu);
+               env.dst_stats.eff_load *= env.dst_stats.load + effective_load(tg, cpu, weight, weight);
+
+               /*
+                * The destination is considered balanced if the destination
+                * CPU is not more loaded than the source CPU. Unfortunately
+                * this means a task running on a lightly loaded CPU may fail
+                * to migrate to its preferred node because every CPU there
+                * looks too busy by comparison.
+                */
+               balanced = (env.dst_stats.eff_load <= env.src_stats.eff_load);
+               if (!balanced)
+                       continue;
+
+               if (env.dst_stats.eff_load < env.best_load) {
+                       env.best_load = env.dst_stats.eff_load;
+                       env.best_cpu = cpu;
+               }
+       }
+
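+       /*
+        * If no suitable CPU was found, best_cpu still points at the task's
+        * current CPU and the task stays where it is.
+        */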
+migrate:
+       return migrate_task_to(p, env.best_cpu);
 }
 
 static void task_numa_placement(struct task_struct *p)
         * the working set placement.
         */
        if (max_faults && max_nid != p->numa_preferred_nid) {
-               int preferred_cpu;
-
-               /*
-                * If the task is not on the preferred node then find the most
-                * idle CPU to migrate to.
-                */
-               preferred_cpu = task_cpu(p);
-               if (cpu_to_node(preferred_cpu) != max_nid) {
-                       preferred_cpu = find_idlest_cpu_node(preferred_cpu,
-                                                            max_nid);
-               }
-
                /* Update the preferred nid and migrate task if possible */
                p->numa_preferred_nid = max_nid;
                p->numa_migrate_seq = 1;
-               migrate_task_to(p, preferred_cpu);
+               task_numa_migrate(p);
        }
 }
 
 {
        struct sched_entity *se = tg->se[cpu];
 
-       if (!tg->parent)        /* the trivial, non-cgroup case */
+       if (!tg->parent || !wl) /* the trivial, non-cgroup case */
                return wl;
 
        for_each_sched_entity(se) {
 }
 #else
 
-static inline unsigned long effective_load(struct task_group *tg, int cpu,
-               unsigned long wl, unsigned long wg)
+static long effective_load(struct task_group *tg, int cpu, long wl, long wg)
 {
        return wl;
 }