struct callback_head *head = rq->balance_callback;
 
        lockdep_assert_held(&rq->lock);
-       if (head)
+       if (head) {
                rq->balance_callback = NULL;
+               rq->balance_flags &= ~BALANCE_WORK;
+       }
 
        return head;
 }
        }
 }
 
+/* Forward declaration: balance_switch() calls this before it is defined. */
+static void balance_push(struct rq *rq);
+
+/*
+ * Dispatch pending balance work for @rq according to rq->balance_flags.
+ *
+ * With no flag set this is a no-op (annotated as the likely case). If
+ * BALANCE_PUSH is set, balance_push() is invoked and the queued callbacks
+ * are NOT run. Otherwise (some other flag is set) __balance_callbacks()
+ * is run. balance_push() asserts that rq->lock is held, so callers must
+ * hold it.
+ */
+static inline void balance_switch(struct rq *rq)
+{
+       /* Fast path: neither BALANCE_WORK nor BALANCE_PUSH is pending. */
+       if (likely(!rq->balance_flags))
+               return;
+
+       /* BALANCE_PUSH takes precedence over running queued callbacks. */
+       if (rq->balance_flags & BALANCE_PUSH) {
+               balance_push(rq);
+               return;
+       }
+
+       __balance_callbacks(rq);
+}
+
 #else
 
 static inline void __balance_callbacks(struct rq *rq)
 {
+       /*
+        * Intentionally empty: stub for the #else configuration (the
+        * controlling #if condition is not visible in this hunk).
+        */
 }
 
+/*
+ * Stub for the #else configuration (the controlling #if condition is not
+ * visible in this hunk): there is no balance work to dispatch, so this is
+ * a no-op that keeps the call site free of #ifdefs.
+ */
+static inline void balance_switch(struct rq *rq)
+{
+}
+
 #endif
 
 static inline void
         * prev into current:
         */
        spin_acquire(&rq->lock.dep_map, 0, 0, _THIS_IP_);
-       __balance_callbacks(rq);
+       balance_switch(rq);
        raw_spin_unlock_irq(&rq->lock);
 }
 
 
        rq->stop = stop;
 }
+
+/*
+ * Stopper callback queued by balance_push() via stop_one_cpu_nowait().
+ *
+ * @arg: the task_struct to push away. balance_push() took a reference on
+ *       it with get_task_struct(); that reference is dropped here.
+ *
+ * If the task is still queued on this CPU's runqueue, pick a fallback CPU
+ * for it and migrate it there. Always returns 0.
+ */
+static int __balance_push_cpu_stop(void *arg)
+{
+       struct task_struct *p = arg;
+       struct rq *rq = this_rq();
+       struct rq_flags rf;
+       int cpu;
+
+       /* Lock order: p->pi_lock (disabling IRQs) first, then rq->lock. */
+       raw_spin_lock_irq(&p->pi_lock);
+       rq_lock(rq, &rf);
+
+       update_rq_clock(rq);
+
+       /*
+        * Re-check under both locks that @p is still queued on this
+        * runqueue before moving it; otherwise there is nothing to do.
+        */
+       if (task_rq(p) == rq && task_on_rq_queued(p)) {
+               cpu = select_fallback_rq(rq->cpu, p);
+               /*
+                * Continue with whatever rq __migrate_task() hands back;
+                * that is the one unlocked below.
+                * NOTE(review): presumably it is returned locked - confirm
+                * against __migrate_task().
+                */
+               rq = __migrate_task(rq, &rf, p, cpu);
+       }
+
+       rq_unlock(rq, &rf);
+       raw_spin_unlock_irq(&p->pi_lock);
+
+       /* Pairs with get_task_struct() in balance_push(). */
+       put_task_struct(p);
+
+       return 0;
+}
+
+/* Per-CPU cpu_stop_work that balance_push() hands to stop_one_cpu_nowait(). */
+static DEFINE_PER_CPU(struct cpu_stop_work, push_work);
+
+/*
+ * Ensure we only run per-cpu kthreads once the CPU goes !active.
+ *
+ * Called from balance_switch() when BALANCE_PUSH is set, with rq->lock
+ * held and on the CPU that owns @rq. If rq->curr is not a per-cpu
+ * kthread, take a reference on it and queue __balance_push_cpu_stop() on
+ * this CPU to migrate it away. rq->lock is dropped and re-taken around
+ * queueing the stop work, and is held again on return.
+ */
+static void balance_push(struct rq *rq)
+{
+       struct task_struct *push_task = rq->curr;
+
+       lockdep_assert_held(&rq->lock);
+       SCHED_WARN_ON(rq->cpu != smp_processor_id());
+
+       /*
+        * Both the cpu-hotplug task and the stop task fall into this case;
+        * they are required to complete the hotplug process, so they must
+        * be left alone.
+        */
+       if (is_per_cpu_kthread(push_task))
+               return;
+
+       /* Pin the task; __balance_push_cpu_stop() does the put_task_struct(). */
+       get_task_struct(push_task);
+       /*
+        * Temporarily drop rq->lock such that we can wake-up the stop task.
+        * Both preemption and IRQs are still disabled.
+        */
+       raw_spin_unlock(&rq->lock);
+       stop_one_cpu_nowait(rq->cpu, __balance_push_cpu_stop, push_task,
+                           this_cpu_ptr(&push_work));
+       /*
+        * At this point need_resched() is true and we'll take the loop in
+        * schedule(). The next pick is obviously going to be the stop task
+        * which is_per_cpu_kthread() and will push this task away.
+        */
+       raw_spin_lock(&rq->lock);
+}
+
+/*
+ * Set (@on == true) or clear (@on == false) BALANCE_PUSH in the
+ * balance_flags of @cpu's runqueue. The update is done under that rq's
+ * lock with IRQ state saved and restored.
+ */
+static void balance_push_set(int cpu, bool on)
+{
+       struct rq *rq = cpu_rq(cpu);
+       struct rq_flags rf;
+
+       rq_lock_irqsave(rq, &rf);
+       if (on)
+               rq->balance_flags |= BALANCE_PUSH;
+       else
+               rq->balance_flags &= ~BALANCE_PUSH;
+       rq_unlock_irqrestore(rq, &rf);
+}
+
+#else
+
+/* !CONFIG_HOTPLUG_CPU: there is nothing to push away, so this is a no-op. */
+static inline void balance_push(struct rq *rq)
+{
+}
+
+/* !CONFIG_HOTPLUG_CPU: BALANCE_PUSH is never set, so this is a no-op. */
+static inline void balance_push_set(int cpu, bool on)
+{
+}
+
 #endif /* CONFIG_HOTPLUG_CPU */
 
 void set_rq_online(struct rq *rq)
        struct rq *rq = cpu_rq(cpu);
        struct rq_flags rf;
 
+       balance_push_set(cpu, false);
+
 #ifdef CONFIG_SCHED_SMT
        /*
         * When going up, increment the number of cores with SMT present.
         */
        synchronize_rcu();
 
+       balance_push_set(cpu, true);
+
 #ifdef CONFIG_SCHED_SMT
        /*
         * When going down, decrement the number of cores with SMT present.
 
        ret = cpuset_cpu_inactive(cpu);
        if (ret) {
+               balance_push_set(cpu, false);
                set_cpu_active(cpu, true);
                return ret;
        }
 
        unsigned long           cpu_capacity_orig;
 
        struct callback_head    *balance_callback;
+       unsigned char           balance_flags;
 
        unsigned char           nohz_idle_balance;
        unsigned char           idle_balance;
 
 #ifdef CONFIG_SMP
 
+/*
+ * Bits in rq->balance_flags:
+ *   BALANCE_WORK - set by queue_balance_callback() when a callback is
+ *                  queued on rq->balance_callback.
+ *   BALANCE_PUSH - set/cleared by balance_push_set(); while it is set,
+ *                  queue_balance_callback() refuses to queue new
+ *                  callbacks and balance_switch() calls balance_push().
+ */
+#define BALANCE_WORK   0x01
+#define BALANCE_PUSH   0x02
+
 static inline void
 queue_balance_callback(struct rq *rq,
                       struct callback_head *head,
 {
        lockdep_assert_held(&rq->lock);
 
-       if (unlikely(head->next))
+       if (unlikely(head->next || (rq->balance_flags & BALANCE_PUSH)))
                return;
 
        head->func = (void (*)(struct callback_head *))func;
        head->next = rq->balance_callback;
        rq->balance_callback = head;
+       rq->balance_flags |= BALANCE_WORK;
 }
 
 #define rcu_dereference_check_sched_domain(p) \