resched_curr(rq);
 }
 
-static void distribute_cfs_runtime(struct cfs_bandwidth *cfs_b)
+#ifdef CONFIG_SMP
+static void __cfsb_csd_unthrottle(void *arg)
 {
-       struct cfs_rq *cfs_rq;
+       struct cfs_rq *cursor, *tmp;
+       struct rq *rq = arg;
+       struct rq_flags rf;
+
+       rq_lock(rq, &rf);
+
+       /*
+        * Since we hold rq lock we're safe from concurrent manipulation of
+        * the CSD list. However, this RCU critical section annotates the
+        * fact that we pair with sched_free_group_rcu(), so that we cannot
+        * race with a group being freed in the window between removing it
+        * from the list and advancing to the next entry in the list.
+        */
+       rcu_read_lock();
+
+       list_for_each_entry_safe(cursor, tmp, &rq->cfsb_csd_list,
+                                throttled_csd_list) {
+               list_del_init(&cursor->throttled_csd_list);
+
+               if (cfs_rq_throttled(cursor))
+                       unthrottle_cfs_rq(cursor);
+       }
+
+       rcu_read_unlock();
+
+       rq_unlock(rq, &rf);
+}
+
+static inline void __unthrottle_cfs_rq_async(struct cfs_rq *cfs_rq)
+{
+       struct rq *rq = rq_of(cfs_rq);
+       bool first;
+
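+       /* The caller already holds the local rq's lock; unthrottle inline. */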
+       if (rq == this_rq()) {
+               unthrottle_cfs_rq(cfs_rq);
+               return;
+       }
+
+       /* Already enqueued */
+       if (SCHED_WARN_ON(!list_empty(&cfs_rq->throttled_csd_list)))
+               return;
+
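+       /*
+        * Kick the remote CPU only when the list goes from empty to
+        * non-empty; the CSD handler drains the whole list in one go.
+        */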
+       first = list_empty(&rq->cfsb_csd_list);
+       list_add_tail(&cfs_rq->throttled_csd_list, &rq->cfsb_csd_list);
+       if (first)
+               smp_call_function_single_async(cpu_of(rq), &rq->cfsb_csd);
+}
+#else
+static inline void __unthrottle_cfs_rq_async(struct cfs_rq *cfs_rq)
+{
+       unthrottle_cfs_rq(cfs_rq);
+}
+#endif
+
+static void unthrottle_cfs_rq_async(struct cfs_rq *cfs_rq)
+{
+       lockdep_assert_rq_held(rq_of(cfs_rq));
+
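+       /* Callers must pass a throttled cfs_rq whose runtime was replenished. */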
+       if (SCHED_WARN_ON(!cfs_rq_throttled(cfs_rq) ||
+           cfs_rq->runtime_remaining <= 0))
+               return;
+
+       __unthrottle_cfs_rq_async(cfs_rq);
+}
+
+static bool distribute_cfs_runtime(struct cfs_bandwidth *cfs_b)
+{
+       struct cfs_rq *local_unthrottle = NULL;
+       int this_cpu = smp_processor_id();
        u64 runtime, remaining = 1;
+       bool throttled = false;
+       struct cfs_rq *cfs_rq;
+       struct rq_flags rf;
+       struct rq *rq;
 
        rcu_read_lock();
        list_for_each_entry_rcu(cfs_rq, &cfs_b->throttled_cfs_rq,
                                throttled_list) {
-               struct rq *rq = rq_of(cfs_rq);
-               struct rq_flags rf;
+               rq = rq_of(cfs_rq);
+
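+               /* Out of runtime; leave the remaining cfs_rqs throttled. */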
+               if (!remaining) {
+                       throttled = true;
+                       break;
+               }
 
                rq_lock_irqsave(rq, &rf);
                if (!cfs_rq_throttled(cfs_rq))
                        goto next;
 
-               /* By the above check, this should never be true */
+#ifdef CONFIG_SMP
+               /* Already queued for async unthrottle */
+               if (!list_empty(&cfs_rq->throttled_csd_list))
+                       goto next;
+#endif
+
+               /* By the above checks, this should never be true */
                SCHED_WARN_ON(cfs_rq->runtime_remaining > 0);
 
                raw_spin_lock(&cfs_b->lock);
                runtime = -cfs_rq->runtime_remaining + 1;
                if (runtime > cfs_b->runtime)
                        runtime = cfs_b->runtime;
                cfs_b->runtime -= runtime;
                remaining = cfs_b->runtime;
                raw_spin_unlock(&cfs_b->lock);

                cfs_rq->runtime_remaining += runtime;
 
                /* we check whether we're throttled above */
-               if (cfs_rq->runtime_remaining > 0)
-                       unthrottle_cfs_rq(cfs_rq);
+               if (cfs_rq->runtime_remaining > 0) {
+                       if (cpu_of(rq) != this_cpu ||
+                           SCHED_WARN_ON(local_unthrottle))
+                               unthrottle_cfs_rq_async(cfs_rq);
+                       else
+                               local_unthrottle = cfs_rq;
+               } else {
+                       throttled = true;
+               }
 
 next:
                rq_unlock_irqrestore(rq, &rf);
-
-               if (!remaining)
-                       break;
        }
        rcu_read_unlock();
+
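+       /* Directly unthrottle the cfs_rq deferred above for the local CPU. */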
+       if (local_unthrottle) {
+               rq = cpu_rq(this_cpu);
+               rq_lock_irqsave(rq, &rf);
+               if (cfs_rq_throttled(local_unthrottle))
+                       unthrottle_cfs_rq(local_unthrottle);
+               rq_unlock_irqrestore(rq, &rf);
+       }
+
+       return throttled;
 }
 
 /*
        while (throttled && cfs_b->runtime > 0) {
                raw_spin_unlock_irqrestore(&cfs_b->lock, flags);
                /* we can't nest cfs_b->lock while distributing bandwidth */
-               distribute_cfs_runtime(cfs_b);
+               throttled = distribute_cfs_runtime(cfs_b);
                raw_spin_lock_irqsave(&cfs_b->lock, flags);
-
-               throttled = !list_empty(&cfs_b->throttled_cfs_rq);
        }
 
        /*
 {
        cfs_rq->runtime_enabled = 0;
        INIT_LIST_HEAD(&cfs_rq->throttled_list);
+#ifdef CONFIG_SMP
+       INIT_LIST_HEAD(&cfs_rq->throttled_csd_list);
+#endif
 }
 
 void start_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
 
 static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
 {
+       int __maybe_unused i;
+
        /* init_cfs_bandwidth() was not called */
        if (!cfs_b->throttled_cfs_rq.next)
                return;
 
        hrtimer_cancel(&cfs_b->period_timer);
        hrtimer_cancel(&cfs_b->slack_timer);
+
+       /*
+        * It is possible that we still have some cfs_rq's pending on a CSD
+        * list, though this race is very rare. In order for this to occur, we
+        * must have raced with the last task leaving the group while there
+        * exist throttled cfs_rq(s), and the period_timer must have queued the
+        * CSD item but the remote cpu has not yet processed it. To handle this,
+        * we can simply flush all pending CSD work inline here. We're
+        * guaranteed at this point that no additional cfs_rq of this group can
+        * join a CSD list.
+        */
+#ifdef CONFIG_SMP
+       for_each_possible_cpu(i) {
+               struct rq *rq = cpu_rq(i);
+               unsigned long flags;
+
+               if (list_empty(&rq->cfsb_csd_list))
+                       continue;
+
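+               /*
+                * __cfsb_csd_unthrottle() normally runs from the CSD/IPI
+                * path with IRQs off, so disable them for the inline call.
+                */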
+               local_irq_save(flags);
+               __cfsb_csd_unthrottle(rq);
+               local_irq_restore(flags);
+       }
+#endif
 }
 
 /*
        for_each_possible_cpu(i) {
                zalloc_cpumask_var_node(&per_cpu(load_balance_mask, i), GFP_KERNEL, cpu_to_node(i));
                zalloc_cpumask_var_node(&per_cpu(select_rq_mask,    i), GFP_KERNEL, cpu_to_node(i));
+
+#ifdef CONFIG_CFS_BANDWIDTH
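+               /* Per-rq CSD and list used to unthrottle cfs_rqs on remote CPUs. */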
+               INIT_CSD(&cpu_rq(i)->cfsb_csd, __cfsb_csd_unthrottle, cpu_rq(i));
+               INIT_LIST_HEAD(&cpu_rq(i)->cfsb_csd_list);
+#endif
        }
 
        open_softirq(SCHED_SOFTIRQ, run_rebalance_domains);