}
 
 /*
- * compute_energy(): Estimates the energy that would be consumed if @p was
+ * compute_energy(): Estimates the energy that @pd would consume if @p was
  * migrated to @dst_cpu. compute_energy() predicts what will be the utilization
- * landscape of the * CPUs after the task migration, and uses the Energy Model
+ * landscape of @pd's CPUs after the task migration, and uses the Energy Model
  * to compute what would be the energy if we decided to actually migrate that
  * task.
  */
 static long
 compute_energy(struct task_struct *p, int dst_cpu, struct perf_domain *pd)
 {
-       unsigned int max_util, util_cfs, cpu_util, cpu_cap;
-       unsigned long sum_util, energy = 0;
-       struct task_struct *tsk;
+       struct cpumask *pd_mask = perf_domain_span(pd);
+       unsigned long cpu_cap = arch_scale_cpu_capacity(cpumask_first(pd_mask));
+       unsigned long max_util = 0, sum_util = 0;
        int cpu;
 
-       for (; pd; pd = pd->next) {
-               struct cpumask *pd_mask = perf_domain_span(pd);
+       /*
+        * The capacity state of CPUs of the current rd can be driven by CPUs
+        * of another rd if they belong to the same pd. So, account for the
+        * utilization of these CPUs too by masking pd with cpu_online_mask
+        * instead of the rd span.
+        *
+        * If an entire pd is outside of the current rd, it will not appear in
+        * its pd list and will not be accounted by compute_energy().
+        */
+       for_each_cpu_and(cpu, pd_mask, cpu_online_mask) {
+               unsigned long cpu_util, util_cfs = cpu_util_next(cpu, p, dst_cpu);
+               struct task_struct *tsk = cpu == dst_cpu ? p : NULL;
 
                /*
-                * The energy model mandates all the CPUs of a performance
-                * domain have the same capacity.
+                * Busy time computation: utilization clamping is not
+                * required since the ratio (sum_util / cpu_capacity)
+                * is already enough to scale the EM reported power
+                * consumption at the (possibly clamped) cpu_capacity.
                 */
-               cpu_cap = arch_scale_cpu_capacity(cpumask_first(pd_mask));
-               max_util = sum_util = 0;
+               sum_util += schedutil_cpu_util(cpu, util_cfs, cpu_cap,
+                                              ENERGY_UTIL, NULL);
 
                /*
-                * The capacity state of CPUs of the current rd can be driven by
-                * CPUs of another rd if they belong to the same performance
-                * domain. So, account for the utilization of these CPUs too
-                * by masking pd with cpu_online_mask instead of the rd span.
-                *
-                * If an entire performance domain is outside of the current rd,
-                * it will not appear in its pd list and will not be accounted
-                * by compute_energy().
+                * Performance domain frequency: utilization clamping
+                * must be considered since it affects the selection
+                * of the performance domain frequency.
+                * NOTE: when RT tasks are running, by default
+                * FREQUENCY_UTIL's utilization can be the maximum OPP.
                 */
-               for_each_cpu_and(cpu, pd_mask, cpu_online_mask) {
-                       util_cfs = cpu_util_next(cpu, p, dst_cpu);
-
-                       /*
-                        * Busy time computation: utilization clamping is not
-                        * required since the ratio (sum_util / cpu_capacity)
-                        * is already enough to scale the EM reported power
-                        * consumption at the (eventually clamped) cpu_capacity.
-                        */
-                       sum_util += schedutil_cpu_util(cpu, util_cfs, cpu_cap,
-                                                      ENERGY_UTIL, NULL);
-
-                       /*
-                        * Performance domain frequency: utilization clamping
-                        * must be considered since it affects the selection
-                        * of the performance domain frequency.
-                        * NOTE: in case RT tasks are running, by default the
-                        * FREQUENCY_UTIL's utilization can be max OPP.
-                        */
-                       tsk = cpu == dst_cpu ? p : NULL;
-                       cpu_util = schedutil_cpu_util(cpu, util_cfs, cpu_cap,
-                                                     FREQUENCY_UTIL, tsk);
-                       max_util = max(max_util, cpu_util);
-               }
-
-               energy += em_pd_energy(pd->em_pd, max_util, sum_util);
+               cpu_util = schedutil_cpu_util(cpu, util_cfs, cpu_cap,
+                                             FREQUENCY_UTIL, tsk);
+               max_util = max(max_util, cpu_util);
        }
 
-       return energy;
+       return em_pd_energy(pd->em_pd, max_util, sum_util);
 }
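
The split between max_util and sum_util above is the heart of the estimate:
the highest individual CPU utilization in @pd selects the OPP, while the
summed utilization scales that OPP's cost into an energy figure. The toy
model below replays that arithmetic outside the kernel; the table values
and helper names are invented for illustration (this is not the kernel's
em_perf_domain API), but the cost scaling follows the shape of
em_pd_energy() in <linux/energy_model.h>.

#include <stdio.h>

/* One OPP of a hypothetical performance domain; 'cost' is precomputed
 * as power * max_freq / freq, as in the kernel's EM tables. */
struct toy_opp {
        unsigned long freq;     /* kHz */
        unsigned long cost;     /* abstract cost units */
};

static const struct toy_opp table[] = {
        {  500000, 120 },
        { 1000000, 180 },
        { 1500000, 300 },
};
#define NR_OPPS         (sizeof(table) / sizeof(table[0]))
#define SCALE_CPU       1024UL  /* capacity of the biggest CPU */

/* Pick the lowest OPP able to serve max_util (with the usual 25%
 * frequency headroom), then scale its cost by the summed utilization. */
static unsigned long toy_pd_energy(unsigned long max_util,
                                   unsigned long sum_util)
{
        unsigned long max_freq = table[NR_OPPS - 1].freq;
        unsigned long freq = (max_freq + (max_freq >> 2)) * max_util / SCALE_CPU;
        unsigned int i;

        for (i = 0; i < NR_OPPS - 1; i++) {
                if (table[i].freq >= freq)
                        break;
        }

        return table[i].cost * sum_util / SCALE_CPU;
}

int main(void)
{
        /* Two CPUs at util 500 and 300: the OPP is picked by the 500
         * alone (max_util), the cost is scaled by the 800 (sum_util). */
        printf("estimated energy: %lu\n", toy_pd_energy(500, 800));
        return 0;
}
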
 
 /*
  * other use-cases too. So, until someone finds a better way to solve this,
  * let's keep things simple by re-using the existing slow path.
  */
-
 static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
 {
-       unsigned long prev_energy = ULONG_MAX, best_energy = ULONG_MAX;
+       unsigned long prev_delta = ULONG_MAX, best_delta = ULONG_MAX;
        struct root_domain *rd = cpu_rq(smp_processor_id())->rd;
+       unsigned long cpu_cap, util, base_energy = 0;
        int cpu, best_energy_cpu = prev_cpu;
-       struct perf_domain *head, *pd;
-       unsigned long cpu_cap, util;
        struct sched_domain *sd;
+       struct perf_domain *pd;
 
        rcu_read_lock();
        pd = rcu_dereference(rd->pd);
        if (!pd || READ_ONCE(rd->overutilized))
                goto fail;
-       head = pd;
 
        /*
         * Energy-aware wake-up happens on the lowest sched_domain starting
                goto unlock;
 
        for (; pd; pd = pd->next) {
-               unsigned long cur_energy, spare_cap, max_spare_cap = 0;
+               unsigned long cur_delta, spare_cap, max_spare_cap = 0;
+               unsigned long base_energy_pd;
                int max_spare_cap_cpu = -1;
 
+               /* Compute the 'base' energy of the pd, without @p */
+               base_energy_pd = compute_energy(p, -1, pd);
+               base_energy += base_energy_pd;
+
                for_each_cpu_and(cpu, perf_domain_span(pd), sched_domain_span(sd)) {
                        if (!cpumask_test_cpu(cpu, p->cpus_ptr))
                                continue;
 
                        /* Always use prev_cpu as a candidate. */
                        if (cpu == prev_cpu) {
-                               prev_energy = compute_energy(p, prev_cpu, head);
-                               best_energy = min(best_energy, prev_energy);
+                               prev_delta = compute_energy(p, prev_cpu, pd);
+                               prev_delta -= base_energy_pd;
+                               best_delta = min(best_delta, prev_delta);
                                continue;
                        }
 
                        /*
 
                /* Evaluate the energy impact of using this CPU. */
                if (max_spare_cap_cpu >= 0) {
-                       cur_energy = compute_energy(p, max_spare_cap_cpu, head);
-                       if (cur_energy < best_energy) {
-                               best_energy = cur_energy;
+                       cur_delta = compute_energy(p, max_spare_cap_cpu, pd);
+                       cur_delta -= base_energy_pd;
+                       if (cur_delta < best_delta) {
+                               best_delta = cur_delta;
                                best_energy_cpu = max_spare_cap_cpu;
                        }
                }
         * Pick the best CPU if prev_cpu cannot be used, or if it saves at
         * least 6% of the energy used by prev_cpu.
         */
-       if (prev_energy == ULONG_MAX)
+       if (prev_delta == ULONG_MAX)
                return best_energy_cpu;
 
-       if ((prev_energy - best_energy) > (prev_energy >> 4))
+       if ((prev_delta - best_delta) > ((prev_delta + base_energy) >> 4))
                return best_energy_cpu;
 
        return prev_cpu;
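
A quick numeric check (figures invented for illustration) shows what the
rewritten condition preserves: prev_delta + base_energy reconstructs the
total estimated consumption with the task left on prev_cpu, so the saving
is still weighed against roughly 6% (1/16) of the total, as the comment
above says, even though compute_energy() now reports per-pd energies that
are immediately turned into deltas.

#include <stdio.h>

int main(void)
{
        /* Invented figures, in abstract EM cost units. */
        unsigned long base_energy = 1000;       /* all pds, without the task */
        unsigned long prev_delta = 100;         /* extra energy on prev_cpu  */
        unsigned long best_delta = 80;          /* extra energy on best CPU  */

        /* Saving of 20 vs. a threshold of (100 + 1000) >> 4 = 68:
         * not enough to be worth a migration, keep prev_cpu. */
        if ((prev_delta - best_delta) > ((prev_delta + base_energy) >> 4))
                printf("pick best_energy_cpu\n");
        else
                printf("stay on prev_cpu\n");
        return 0;
}
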