When waking affine, check for an idle shared cache, and if
found, wake to that CPU/sibling instead of the waker's CPU.
This improves pgsql+oltp ramp up by roughly 8%. Possibly more
for other loads, depending on overlap. The trade-off is a
roughly 1% peak downturn if tasks are truly synchronous.
Signed-off-by: Mike Galbraith <efault@gmx.de>
Cc: Arjan van de Ven <arjan@infradead.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: <stable@kernel.org>
LKML-Reference: <
1256654138.17752.7.camel@marge.simson.net>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
                                want_sd = 0;
                }
 
-               if (want_affine && (tmp->flags & SD_WAKE_AFFINE) &&
-                   cpumask_test_cpu(prev_cpu, sched_domain_span(tmp))) {
+               if (want_affine && (tmp->flags & SD_WAKE_AFFINE)) {
+                       int candidate = -1, i;
 
-                       affine_sd = tmp;
-                       want_affine = 0;
+                       if (cpumask_test_cpu(prev_cpu, sched_domain_span(tmp)))
+                               candidate = cpu;
+
+                       /*
+                        * Check for an idle shared cache.
+                        */
+                       if (tmp->flags & SD_PREFER_SIBLING) {
+                               if (candidate == cpu) {
+                                       if (!cpu_rq(prev_cpu)->cfs.nr_running)
+                                               candidate = prev_cpu;
+                               }
+
+                               if (candidate == -1 || candidate == cpu) {
+                                       for_each_cpu(i, sched_domain_span(tmp)) {
+                                               if (!cpu_rq(i)->cfs.nr_running) {
+                                                       candidate = i;
+                                                       break;
+                                               }
+                                       }
+                               }
+                       }
+
+                       if (candidate >= 0) {
+                               affine_sd = tmp;
+                               want_affine = 0;
+                               cpu = candidate;
+                       }
                }
 
                if (!want_sd && !want_affine)