                sg->sgp = *per_cpu_ptr(sdd->sgp, cpumask_first(sg_span));
                atomic_inc(&sg->sgp->ref);
+               sg->balance_cpu = -1;
 
                if (cpumask_test_cpu(cpu, sg_span))
                        groups = sg;
 
                cpumask_clear(sched_group_cpus(sg));
                sg->sgp->power = 0;
+               sg->balance_cpu = -1;
 
                for_each_cpu(j, span) {
                        if (get_group(j, sdd, NULL) != group)
 
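(Editor's sketch, not part of the patch: a minimal userspace C11 model of the balance_cpu slot the two hunks above add, once in build_overlap_sched_groups() and once in build_sched_groups(). Here atomic_int and atomic_init() stand in for the kernel's plain int plus cmpxchg(), and struct group / group_init() are hypothetical names; -1 marks the group as having no balancing owner, exactly as the sg->balance_cpu = -1 initializers do.)

#include <stdatomic.h>
#include <stdio.h>

struct group {
        atomic_int balance_cpu;         /* -1: unclaimed, else owning CPU id */
};

static void group_init(struct group *g)
{
        /* mirrors the sg->balance_cpu = -1 lines in both build paths */
        atomic_init(&g->balance_cpu, -1);
}

int main(void)
{
        struct group g;

        group_init(&g);
        printf("balance_cpu after init: %d\n", atomic_load(&g.balance_cpu));
        return 0;
}
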
         */
        if (local_group) {
                if (idle != CPU_NEWLY_IDLE) {
-                       if (balance_cpu != this_cpu) {
+                       if (balance_cpu != this_cpu ||
+                           cmpxchg(&group->balance_cpu, -1, balance_cpu) != -1) {
                                *balance = 0;
                                return;
                        }
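(Editor's sketch: the claim side in isolation. The hunk above lets only the first CPU that moves the slot from -1 to its own id act as the group's balancer; any later CPU fails the cmpxchg and backs off via *balance = 0. Below, atomic_compare_exchange_strong() stands in for the kernel's cmpxchg() -- note it returns a bool rather than the old value -- and try_claim_balance() is a hypothetical name.)

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

/* True only for the single caller that swaps -1 -> this_cpu, matching
 * cmpxchg(&group->balance_cpu, -1, balance_cpu) returning -1 above;
 * every other caller sees the slot already taken and must back off. */
static bool try_claim_balance(atomic_int *balance_cpu, int this_cpu)
{
        int unclaimed = -1;

        return atomic_compare_exchange_strong(balance_cpu, &unclaimed,
                                              this_cpu);
}

int main(void)
{
        atomic_int slot = -1;

        printf("cpu 2 claims: %d\n", try_claim_balance(&slot, 2)); /* 1: won */
        printf("cpu 5 claims: %d\n", try_claim_balance(&slot, 5)); /* 0: lost */
        return 0;
}
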
        int balance = 1;
        struct rq *rq = cpu_rq(cpu);
        unsigned long interval;
-       struct sched_domain *sd;
+       struct sched_domain *sd, *last = NULL;
        /* Earliest time when we have to do rebalance again */
        unsigned long next_balance = jiffies + 60*HZ;
        int update_next_balance = 0;
 
        rcu_read_lock();
        for_each_domain(cpu, sd) {
+               last = sd;
                if (!(sd->flags & SD_LOAD_BALANCE))
                        continue;
 
                if (!balance)
                        break;
        }
+       for (sd = last; sd; sd = sd->child)
+               (void)cmpxchg(&sd->groups->balance_cpu, cpu, -1);
+
        rcu_read_unlock();
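
(Editor's sketch: the release side. The new loop walks from the last-visited, outermost domain back down through ->child, and the cmpxchg(cpu -> -1) releases only a claim this CPU actually holds, so a slot owned by another CPU is left untouched. release_balance() below is a hypothetical userspace stand-in using C11 atomics.)

#include <stdatomic.h>
#include <stdio.h>

static void release_balance(atomic_int *balance_cpu, int this_cpu)
{
        int owner = this_cpu;

        /* mirrors (void)cmpxchg(&sd->groups->balance_cpu, cpu, -1):
         * the exchange succeeds only while we are the recorded owner */
        (void)atomic_compare_exchange_strong(balance_cpu, &owner, -1);
}

int main(void)
{
        atomic_int slot = 2;            /* claimed by cpu 2 */

        release_balance(&slot, 5);      /* not the owner: slot stays 2 */
        release_balance(&slot, 2);      /* owner: slot returns to -1 */
        printf("slot: %d\n", atomic_load(&slot));
        return 0;
}
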
 
        /*