resched_curr(rq);
 }
 
-static void distribute_cfs_runtime(struct cfs_bandwidth *cfs_b)
+#ifdef CONFIG_SMP
+static void __cfsb_csd_unthrottle(void *arg)
 {
-       struct cfs_rq *cfs_rq;
+       struct cfs_rq *cursor, *tmp;
+       struct rq *rq = arg;
+       struct rq_flags rf;
+
+       rq_lock(rq, &rf);
+
+       /*
+        * Since we hold rq lock we're safe from concurrent manipulation of
+        * the CSD list. However, this RCU critical section annotates the
+        * fact that we pair with sched_free_group_rcu(), so that we cannot
+        * race with a group being freed in the window between removing it
+        * from the list and advancing to the next entry in the list.
+        */
+       rcu_read_lock();
+
+       list_for_each_entry_safe(cursor, tmp, &rq->cfsb_csd_list,
+                                throttled_csd_list) {
+               list_del_init(&cursor->throttled_csd_list);
+
+               if (cfs_rq_throttled(cursor))
+                       unthrottle_cfs_rq(cursor);
+       }
+
+       rcu_read_unlock();
+
+       rq_unlock(rq, &rf);
+}
+
+static inline void __unthrottle_cfs_rq_async(struct cfs_rq *cfs_rq)
+{
+       struct rq *rq = rq_of(cfs_rq);
+       bool first;
+
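+       /* The caller already holds the local rq's lock; unthrottle inline. */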
+       if (rq == this_rq()) {
+               unthrottle_cfs_rq(cfs_rq);
+               return;
+       }
+
+       /* Already enqueued */
+       if (SCHED_WARN_ON(!list_empty(&cfs_rq->throttled_csd_list)))
+               return;
+
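+       /*
+        * Kick the remote CPU only when the list goes from empty to
+        * non-empty; the CSD handler drains the whole list in one go.
+        */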
+       first = list_empty(&rq->cfsb_csd_list);
+       list_add_tail(&cfs_rq->throttled_csd_list, &rq->cfsb_csd_list);
+       if (first)
+               smp_call_function_single_async(cpu_of(rq), &rq->cfsb_csd);
+}
+#else
+static inline void __unthrottle_cfs_rq_async(struct cfs_rq *cfs_rq)
+{
+       unthrottle_cfs_rq(cfs_rq);
+}
+#endif
+
+static void unthrottle_cfs_rq_async(struct cfs_rq *cfs_rq)
+{
+       lockdep_assert_rq_held(rq_of(cfs_rq));
+
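+       /* Callers must pass a throttled cfs_rq whose runtime was replenished. */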
+       if (SCHED_WARN_ON(!cfs_rq_throttled(cfs_rq) ||
+           cfs_rq->runtime_remaining <= 0))
+               return;
+
+       __unthrottle_cfs_rq_async(cfs_rq);
+}
+
+static bool distribute_cfs_runtime(struct cfs_bandwidth *cfs_b)
+{
+       struct cfs_rq *local_unthrottle = NULL;
+       int this_cpu = smp_processor_id();
        u64 runtime, remaining = 1;
+       bool throttled = false;
+       struct cfs_rq *cfs_rq;
+       struct rq_flags rf;
+       struct rq *rq;
 
        rcu_read_lock();
        list_for_each_entry_rcu(cfs_rq, &cfs_b->throttled_cfs_rq,
                                throttled_list) {
-               struct rq *rq = rq_of(cfs_rq);
-               struct rq_flags rf;
+               rq = rq_of(cfs_rq);
+
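+               /* Out of runtime; leave the remaining cfs_rqs throttled. */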
+               if (!remaining) {
+                       throttled = true;
+                       break;
+               }
 
                rq_lock_irqsave(rq, &rf);
                if (!cfs_rq_throttled(cfs_rq))
                        goto next;
 
-               /* By the above check, this should never be true */
+#ifdef CONFIG_SMP
+               /* Already queued for async unthrottle */
+               if (!list_empty(&cfs_rq->throttled_csd_list))
+                       goto next;
+#endif
+
+               /* By the above checks, this should never be true */
                SCHED_WARN_ON(cfs_rq->runtime_remaining > 0);
 
                raw_spin_lock(&cfs_b->lock);
                runtime = -cfs_rq->runtime_remaining + 1;
                if (runtime > cfs_b->runtime)
                        runtime = cfs_b->runtime;
                cfs_b->runtime -= runtime;
                remaining = cfs_b->runtime;
                raw_spin_unlock(&cfs_b->lock);

                cfs_rq->runtime_remaining += runtime;
 
                /* we check whether we're throttled above */
-               if (cfs_rq->runtime_remaining > 0)
-                       unthrottle_cfs_rq(cfs_rq);
+               if (cfs_rq->runtime_remaining > 0) {
+                       if (cpu_of(rq) != this_cpu ||
+                           SCHED_WARN_ON(local_unthrottle))
+                               unthrottle_cfs_rq_async(cfs_rq);
+                       else
+                               local_unthrottle = cfs_rq;
+               } else {
+                       throttled = true;
+               }
 
 next:
                rq_unlock_irqrestore(rq, &rf);
-
-               if (!remaining)
-                       break;
        }
        rcu_read_unlock();
+
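+       /* Directly unthrottle the cfs_rq deferred above for the local CPU. */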
+       if (local_unthrottle) {
+               rq = cpu_rq(this_cpu);
+               rq_lock_irqsave(rq, &rf);
+               if (cfs_rq_throttled(local_unthrottle))
+                       unthrottle_cfs_rq(local_unthrottle);
+               rq_unlock_irqrestore(rq, &rf);
+       }
+
+       return throttled;
 }
 
 /*
        while (throttled && cfs_b->runtime > 0) {
                raw_spin_unlock_irqrestore(&cfs_b->lock, flags);
                /* we can't nest cfs_b->lock while distributing bandwidth */
-               distribute_cfs_runtime(cfs_b);
+               throttled = distribute_cfs_runtime(cfs_b);
                raw_spin_lock_irqsave(&cfs_b->lock, flags);
-
-               throttled = !list_empty(&cfs_b->throttled_cfs_rq);
        }
 
        /*
 {
        cfs_rq->runtime_enabled = 0;
        INIT_LIST_HEAD(&cfs_rq->throttled_list);
+#ifdef CONFIG_SMP
+       INIT_LIST_HEAD(&cfs_rq->throttled_csd_list);
+#endif
 }
 
 void start_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
 
 static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
 {
+       int __maybe_unused i;
+
        /* init_cfs_bandwidth() was not called */
        if (!cfs_b->throttled_cfs_rq.next)
                return;
 
        hrtimer_cancel(&cfs_b->period_timer);
        hrtimer_cancel(&cfs_b->slack_timer);
+
+       /*
+        * It is possible that we still have some cfs_rq's pending on a CSD
+        * list, though this race is very rare. In order for this to occur, we
+        * must have raced with the last task leaving the group while there
+        * exist throttled cfs_rq(s), and the period_timer must have queued the
+        * CSD item but the remote cpu has not yet processed it. To handle this,
+        * we can simply flush all pending CSD work inline here. We're
+        * guaranteed at this point that no additional cfs_rq of this group can
+        * join a CSD list.
+        */
+#ifdef CONFIG_SMP
+       for_each_possible_cpu(i) {
+               struct rq *rq = cpu_rq(i);
+               unsigned long flags;
+
+               if (list_empty(&rq->cfsb_csd_list))
+                       continue;
+
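+               /*
+                * __cfsb_csd_unthrottle() normally runs from the CSD/IPI
+                * path with IRQs off, so disable them for the inline call.
+                */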
+               local_irq_save(flags);
+               __cfsb_csd_unthrottle(rq);
+               local_irq_restore(flags);
+       }
+#endif
 }
 
 /*
        for_each_possible_cpu(i) {
                zalloc_cpumask_var_node(&per_cpu(load_balance_mask, i), GFP_KERNEL, cpu_to_node(i));
                zalloc_cpumask_var_node(&per_cpu(select_rq_mask,    i), GFP_KERNEL, cpu_to_node(i));
+
+#ifdef CONFIG_CFS_BANDWIDTH
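+               /* Per-rq CSD and list used to unthrottle cfs_rqs on remote CPUs. */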
+               INIT_CSD(&cpu_rq(i)->cfsb_csd, __cfsb_csd_unthrottle, cpu_rq(i));
+               INIT_LIST_HEAD(&cpu_rq(i)->cfsb_csd_list);
+#endif
        }
 
        open_softirq(SCHED_SOFTIRQ, run_rebalance_domains);