}
 
 /*
- * compute_energy(): Estimates the energy that @pd would consume if @p was
- * migrated to @dst_cpu. compute_energy() predicts what will be the utilization
- * landscape of @pd's CPUs after the task migration, and uses the Energy Model
- * to compute what would be the energy if we decided to actually migrate that
- * task.
+ * energy_env - Utilization landscape for energy estimation.
+ * @task_busy_time: Utilization contribution of the task whose placement is
+ *                  being evaluated. Given by eenv_task_busy_time().
+ * @pd_busy_time:   Utilization of the whole perf domain without the task
+ *                  contribution. Given by eenv_pd_busy_time().
+ * @cpu_cap:        Maximum CPU capacity for the perf domain.
+ * @pd_cap:         Entire perf domain capacity (@cpu_cap * number of CPUs).
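+ *
+ * task_busy_time is computed once per wakeup, since it doesn't depend on
+ * the candidate CPU; pd_busy_time, cpu_cap and pd_cap are refreshed for
+ * each perf domain under evaluation.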
+ */
+struct energy_env {
+       unsigned long task_busy_time;
+       unsigned long pd_busy_time;
+       unsigned long cpu_cap;
+       unsigned long pd_cap;
+};
+
+/*
+ * Compute the task busy time for compute_energy(). This time cannot be
+ * injected directly into effective_cpu_util() because of IRQ scaling:
+ * scaling by the IRQ time is only meaningful on the CPU where the task has
+ * most recently run, since IRQ pressure is accounted per CPU.
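+ *
+ * E.g. (hypothetical numbers): with max_cap = 1024, irq = 256 and
+ * task_util_est(p) = 200, scale_irq_capacity() yields
+ * 200 * (1024 - 256) / 1024 = 150.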
  */
-static long
-compute_energy(struct task_struct *p, int dst_cpu, struct cpumask *cpus,
-              struct perf_domain *pd)
+static inline void eenv_task_busy_time(struct energy_env *eenv,
+                                      struct task_struct *p, int prev_cpu)
 {
-       unsigned long max_util = 0, sum_util = 0, cpu_cap;
+       unsigned long busy_time, max_cap = arch_scale_cpu_capacity(prev_cpu);
+       unsigned long irq = cpu_util_irq(cpu_rq(prev_cpu));
+
+       if (unlikely(irq >= max_cap))
+               busy_time = max_cap;
+       else
+               busy_time = scale_irq_capacity(task_util_est(p), irq, max_cap);
+
+       eenv->task_busy_time = busy_time;
+}
+
+/*
+ * Compute the perf_domain (PD) busy time for compute_energy(), based on the
+ * utilization of each CPU in @pd_cpus. Utilization clamping is not applied
+ * here: the ratio (utilization / cpu_capacity) is already enough to scale
+ * the EM reported power consumption at the (possibly clamped) cpu_capacity.
+ *
+ * The contribution of the task @p for which we want to estimate the
+ * energy cost is removed (by cpu_util_next()) and must be calculated
+ * separately (see eenv_task_busy_time()). This ensures:
+ *
+ *   - A stable PD utilization, no matter which CPU of that PD we want to place
+ *     the task on.
+ *
+ *   - A fair comparison between CPUs as the task contribution (task_util())
+ *     will always be the same no matter which CPU utilization we rely on
+ *     (util_avg or util_est).
+ *
+ * Sets @eenv->pd_busy_time for the PD that spans @pd_cpus. This busy time
+ * can't exceed @eenv->pd_cap.
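+ *
+ * E.g. (hypothetical numbers): for a PD of two CPUs whose effective
+ * utilizations, without @p's contribution, are 300 and 200, pd_busy_time
+ * is 500 (clamped to pd_cap).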
+ */
+static inline void eenv_pd_busy_time(struct energy_env *eenv,
+                                    struct cpumask *pd_cpus,
+                                    struct task_struct *p)
+{
+       unsigned long busy_time = 0;
        int cpu;
 
-       cpu_cap = arch_scale_cpu_capacity(cpumask_first(cpus));
-       cpu_cap -= arch_scale_thermal_pressure(cpumask_first(cpus));
+       for_each_cpu(cpu, pd_cpus) {
+               unsigned long util = cpu_util_next(cpu, p, -1);
 
-       /*
-        * The capacity state of CPUs of the current rd can be driven by CPUs
-        * of another rd if they belong to the same pd. So, account for the
-        * utilization of these CPUs too by masking pd with cpu_online_mask
-        * instead of the rd span.
-        *
-        * If an entire pd is outside of the current rd, it will not appear in
-        * its pd list and will not be accounted by compute_energy().
-        */
-       for_each_cpu(cpu, cpus) {
-               unsigned long util_freq = cpu_util_next(cpu, p, dst_cpu);
-               unsigned long cpu_util, util_running = util_freq;
-               struct task_struct *tsk = NULL;
+               busy_time += effective_cpu_util(cpu, util, ENERGY_UTIL, NULL);
+       }
 
-               /*
-                * When @p is placed on @cpu:
-                *
-                * util_running = max(cpu_util, cpu_util_est) +
-                *                max(task_util, _task_util_est)
-                *
-                * while cpu_util_next is: max(cpu_util + task_util,
-                *                             cpu_util_est + _task_util_est)
-                */
-               if (cpu == dst_cpu) {
-                       tsk = p;
-                       util_running =
-                               cpu_util_next(cpu, p, -1) + task_util_est(p);
-               }
+       eenv->pd_busy_time = min(eenv->pd_cap, busy_time);
+}
 
-               /*
-                * Busy time computation: utilization clamping is not
-                * required since the ratio (sum_util / cpu_capacity)
-                * is already enough to scale the EM reported power
-                * consumption at the (eventually clamped) cpu_capacity.
-                */
-               cpu_util = effective_cpu_util(cpu, util_running, ENERGY_UTIL,
-                                             NULL);
+/*
+ * Compute the maximum utilization for compute_energy() when the task @p
+ * is placed on CPU @dst_cpu.
+ *
+ * Returns the maximum utilization among @pd_cpus. This utilization can't
+ * exceed @eenv->cpu_cap.
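+ *
+ * E.g. (hypothetical numbers): with two CPUs at frequency-relevant
+ * utilizations 300 and 700, max_util is 700; the busiest CPU drives the
+ * frequency, and hence the perf state cost, for the whole PD.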
+ */
+static inline unsigned long
+eenv_pd_max_util(struct energy_env *eenv, struct cpumask *pd_cpus,
+                struct task_struct *p, int dst_cpu)
+{
+       unsigned long max_util = 0;
+       int cpu;
 
-               sum_util += min(cpu_util, cpu_cap);
+       for_each_cpu(cpu, pd_cpus) {
+               struct task_struct *tsk = (cpu == dst_cpu) ? p : NULL;
+               unsigned long util = cpu_util_next(cpu, p, dst_cpu);
+               unsigned long cpu_util;
 
                /*
                 * Performance domain frequency: utilization clamping
                 * NOTE: in case RT tasks are running, by default the
                 * FREQUENCY_UTIL's utilization can be max OPP.
                 */
-               cpu_util = effective_cpu_util(cpu, util_freq, FREQUENCY_UTIL,
-                                             tsk);
-               max_util = max(max_util, min(cpu_util, cpu_cap));
+               cpu_util = effective_cpu_util(cpu, util, FREQUENCY_UTIL, tsk);
+               max_util = max(max_util, cpu_util);
        }
 
-       return em_cpu_energy(pd->em_pd, max_util, sum_util, cpu_cap);
+       return min(max_util, eenv->cpu_cap);
+}
+
+/*
+ * compute_energy(): Use the Energy Model to estimate the energy that @pd would
+ * consume for a given utilization landscape @eenv. When @dst_cpu < 0, the task
+ * contribution is ignored.
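+ *
+ * em_cpu_energy() then returns, roughly, cost(max_util) * busy_time /
+ * scale_cpu, where cost(max_util) is the cost of the cheapest perf state
+ * able to serve @max_util (see include/linux/energy_model.h for the exact
+ * expression).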
+ */
+static inline unsigned long
+compute_energy(struct energy_env *eenv, struct perf_domain *pd,
+              struct cpumask *pd_cpus, struct task_struct *p, int dst_cpu)
+{
+       unsigned long max_util = eenv_pd_max_util(eenv, pd_cpus, p, dst_cpu);
+       unsigned long busy_time = eenv->pd_busy_time;
+
+       if (dst_cpu >= 0)
+               busy_time = min(eenv->pd_cap, busy_time + eenv->task_busy_time);
+
+       return em_cpu_energy(pd->em_pd, max_util, busy_time, eenv->cpu_cap);
 }
 
 /*
 {
        struct cpumask *cpus = this_cpu_cpumask_var_ptr(select_rq_mask);
        unsigned long prev_delta = ULONG_MAX, best_delta = ULONG_MAX;
-       struct root_domain *rd = cpu_rq(smp_processor_id())->rd;
        int cpu, best_energy_cpu = prev_cpu, target = -1;
-       unsigned long cpu_cap, util, base_energy = 0;
+       struct root_domain *rd = this_rq()->rd;
+       unsigned long base_energy = 0;
        struct sched_domain *sd;
        struct perf_domain *pd;
+       struct energy_env eenv;
 
        rcu_read_lock();
        pd = rcu_dereference(rd->pd);
        if (!task_util_est(p))
                goto unlock;
 
+       eenv_task_busy_time(&eenv, p, prev_cpu);
+
        for (; pd; pd = pd->next) {
-               unsigned long cur_delta, spare_cap, max_spare_cap = 0;
+               unsigned long cpu_cap, cpu_thermal_cap, util;
+               unsigned long cur_delta, max_spare_cap = 0;
                bool compute_prev_delta = false;
                unsigned long base_energy_pd;
                int max_spare_cap_cpu = -1;
 
                cpumask_and(cpus, perf_domain_span(pd), cpu_online_mask);
 
-               for_each_cpu_and(cpu, cpus, sched_domain_span(sd)) {
+               if (cpumask_empty(cpus))
+                       continue;
+
+               /* Account thermal pressure for the energy estimation */
+               cpu = cpumask_first(cpus);
+               cpu_thermal_cap = arch_scale_cpu_capacity(cpu);
+               cpu_thermal_cap -= arch_scale_thermal_pressure(cpu);
+
+               eenv.cpu_cap = cpu_thermal_cap;
+               eenv.pd_cap = 0;
+
+               for_each_cpu(cpu, cpus) {
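+                       /*
+                        * pd_cap accounts every online CPU of the PD, even
+                        * those filtered out below as placement candidates.
+                        */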
+                       eenv.pd_cap += cpu_thermal_cap;
+
+                       if (!cpumask_test_cpu(cpu, sched_domain_span(sd)))
+                               continue;
+
                        if (!cpumask_test_cpu(cpu, p->cpus_ptr))
                                continue;
 
                        util = cpu_util_next(cpu, p, cpu);
                        cpu_cap = capacity_of(cpu);
-                       spare_cap = cpu_cap;
-                       lsub_positive(&spare_cap, util);
 
                        /*
                         * Skip CPUs that cannot satisfy the capacity request.
                         */
                        if (!fits_capacity(util, cpu_cap))
                                continue;
 
+                       lsub_positive(&cpu_cap, util);
+
                        if (cpu == prev_cpu) {
                                /* Always use prev_cpu as a candidate. */
                                compute_prev_delta = true;
-                       } else if (spare_cap > max_spare_cap) {
+                       } else if (cpu_cap > max_spare_cap) {
                                /*
                                 * Find the CPU with the maximum spare capacity
                                 * in the performance domain.
                                 */
-                               max_spare_cap = spare_cap;
+                               max_spare_cap = cpu_cap;
                                max_spare_cap_cpu = cpu;
                        }
                }
                if (max_spare_cap_cpu < 0 && !compute_prev_delta)
                        continue;
 
+               eenv_pd_busy_time(&eenv, cpus, p);
                /* Compute the 'base' energy of the pd, without @p */
-               base_energy_pd = compute_energy(p, -1, cpus, pd);
+               base_energy_pd = compute_energy(&eenv, pd, cpus, p, -1);
                base_energy += base_energy_pd;
 
                /* Evaluate the energy impact of using prev_cpu. */
                if (compute_prev_delta) {
-                       prev_delta = compute_energy(p, prev_cpu, cpus, pd);
+                       prev_delta = compute_energy(&eenv, pd, cpus, p,
+                                                   prev_cpu);
+                       /* CPU utilization has changed concurrently: the estimate is stale */
                        if (prev_delta < base_energy_pd)
                                goto unlock;
                        prev_delta -= base_energy_pd;
                        best_delta = min(best_delta, prev_delta);
                }
 
                /* Evaluate the energy impact of using max_spare_cap_cpu. */
                if (max_spare_cap_cpu >= 0) {
-                       cur_delta = compute_energy(p, max_spare_cap_cpu, cpus,
-                                                  pd);
+                       cur_delta = compute_energy(&eenv, pd, cpus, p,
+                                                  max_spare_cap_cpu);
+                       /* CPU utilization has changed concurrently: the estimate is stale */
                        if (cur_delta < base_energy_pd)
                                goto unlock;
                        cur_delta -= base_energy_pd;