sched_ext: Move event_stats_cpu into scx_sched
author     Tejun Heo <tj@kernel.org>
           Tue, 29 Apr 2025 18:40:10 +0000 (08:40 -1000)
committer  Tejun Heo <tj@kernel.org>
           Tue, 29 Apr 2025 18:40:10 +0000 (08:40 -1000)
The event counters are going to become per-scheduler-instance. Move
event_stats_cpu into scx_sched.

- [__]scx_add_event() are updated to take @sch. While at it, add missing
  parentheses around @cnt expansions (see the precedence sketch below the
  trailers).

- scx_read_events() is updated to take @sch.

- scx_bpf_events() accesses scx_root under RCU read lock.

v2: - Replace stale scx_bpf_get_event_stat() reference in a comment with
      scx_bpf_events().

    - Trivial goto label rename.

Signed-off-by: Tejun Heo <tj@kernel.org>
Reviewed-by: Andrea Righi <arighi@nvidia.com>
Acked-by: Changwoo Min <changwoo@igalia.com>
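
For context on the parenthesization fix above: when a macro splices @cnt
into a larger expression, an unparenthesized compound argument can bind
against neighboring operators and silently change the arithmetic. A
minimal, standalone userspace sketch of the hazard (illustrative only;
the flat counter and the "* 2" scaling are stand-ins, not the kernel's
per-CPU machinery):

#include <stdio.h>

/* Stand-in counter; the kernel side is a per-CPU struct scx_event_stats. */
static unsigned long long ev_cnt;

/* Unparenthesized expansion: @cnt is spliced in verbatim. */
#define ADD_EVENT_BAD(cnt)	do { ev_cnt += cnt * 2; } while (0)

/* Parenthesized expansion, in the style of the updated [__]scx_add_event(). */
#define ADD_EVENT_GOOD(cnt)	do { ev_cnt += (cnt) * 2; } while (0)

int main(void)
{
	ev_cnt = 0;
	ADD_EVENT_BAD(1 + 1);		/* expands to 1 + 1 * 2 == 3 */
	printf("unparenthesized: %llu\n", ev_cnt);

	ev_cnt = 0;
	ADD_EVENT_GOOD(1 + 1);		/* expands to (1 + 1) * 2 == 4 */
	printf("parenthesized:   %llu\n", ev_cnt);
	return 0;
}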
kernel/sched/ext.c
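
The hunks below spread the counter lifecycle across several functions:
the per-CPU storage is created with alloc_percpu() in
scx_alloc_and_add_sched(), aggregated with for_each_possible_cpu() in
scx_read_events(), and released with free_percpu() in
scx_sched_free_rcu_work(). Because alloc_percpu() hands back zeroed
memory, a fresh scheduler instance starts with clean counters, which is
why the explicit clearing loop in scx_enable() can simply be deleted. A
condensed userspace analogue of that lifecycle (a flat calloc'd array
stands in for the percpu allocation; NR_CPUS and the single-field stats
struct are simplifications, not kernel definitions):

#include <stdio.h>
#include <stdlib.h>

#define NR_CPUS 4	/* stand-in for the possible-CPU count */

/* Simplified stand-in for struct scx_event_stats. */
struct event_stats {
	unsigned long long refill_slice_dfl;
};

/* Stand-in for struct scx_sched and its event_stats_cpu member. */
struct sched_inst {
	struct event_stats *stats_cpu;	/* one slot per CPU */
};

static struct sched_inst *inst_alloc(void)
{
	struct sched_inst *si = malloc(sizeof(*si));

	if (!si)
		return NULL;
	/* calloc() zeroes, like alloc_percpu(): fresh counters for free. */
	si->stats_cpu = calloc(NR_CPUS, sizeof(*si->stats_cpu));
	if (!si->stats_cpu) {
		free(si);
		return NULL;
	}
	return si;
}

/* Bump one CPU's slot; the kernel uses this_cpu_add() on the live CPU. */
static void inst_add_event(struct sched_inst *si, int cpu,
			   unsigned long long cnt)
{
	si->stats_cpu[cpu].refill_slice_dfl += cnt;
}

/* Fold the per-CPU slots into one view, as scx_read_events() does. */
static unsigned long long inst_read_events(struct sched_inst *si)
{
	unsigned long long sum = 0;

	for (int cpu = 0; cpu < NR_CPUS; cpu++)
		sum += si->stats_cpu[cpu].refill_slice_dfl;
	return sum;
}

static void inst_free(struct sched_inst *si)
{
	free(si->stats_cpu);
	free(si);
}

int main(void)
{
	struct sched_inst *si = inst_alloc();

	if (!si)
		return 1;
	inst_add_event(si, 0, 1);
	inst_add_event(si, 2, 3);
	printf("aggregate: %llu\n", inst_read_events(si));	/* prints 4 */
	inst_free(si);
	return 0;
}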

index 747071b6fc3861f19811b7d4ad778417b744e2cd..86cad4722b60c46dc634b142e9f69ceba29723de 100644
@@ -838,6 +838,13 @@ struct scx_sched {
        struct rhashtable       dsq_hash;
        struct scx_dispatch_q   **global_dsqs;
 
+       /*
+        * The event counters are in a per-CPU variable to minimize the
+        * accounting overhead. A system-wide view on the event counter is
+        * constructed when requested by scx_bpf_events().
+        */
+       struct scx_event_stats __percpu *event_stats_cpu;
+
        bool                    warned_zero_slice;
 
        atomic_t                exit_kind;
@@ -1599,34 +1606,29 @@ static struct task_struct *scx_task_iter_next_locked(struct scx_task_iter *iter)
        return p;
 }
 
-/*
- * The event counter is organized by a per-CPU variable to minimize the
- * accounting overhead without synchronization. A system-wide view on the
- * event counter is constructed when requested by scx_bpf_get_event_stat().
- */
-static DEFINE_PER_CPU(struct scx_event_stats, event_stats_cpu);
-
 /**
  * scx_add_event - Increase an event counter for 'name' by 'cnt'
+ * @sch: scx_sched to account events for
  * @name: an event name defined in struct scx_event_stats
  * @cnt: the number of times the event occurred
  *
  * This can be used when preemption is not disabled.
  */
-#define scx_add_event(name, cnt) do {                                          \
-       this_cpu_add(event_stats_cpu.name, cnt);                                \
-       trace_sched_ext_event(#name, cnt);                                      \
+#define scx_add_event(sch, name, cnt) do {                                     \
+       this_cpu_add((sch)->event_stats_cpu->name, (cnt));                      \
+       trace_sched_ext_event(#name, (cnt));                                    \
 } while(0)
 
 /**
  * __scx_add_event - Increase an event counter for 'name' by 'cnt'
+ * @sch: scx_sched to account events for
  * @name: an event name defined in struct scx_event_stats
  * @cnt: the number of times the event occurred
  *
  * This should be used only when preemption is disabled.
  */
-#define __scx_add_event(name, cnt) do {                                                \
-       __this_cpu_add(event_stats_cpu.name, cnt);                              \
+#define __scx_add_event(sch, name, cnt) do {                                   \
+       __this_cpu_add((sch)->event_stats_cpu->name, (cnt));                    \
        trace_sched_ext_event(#name, cnt);                                      \
 } while(0)
 
@@ -1651,7 +1653,8 @@ static DEFINE_PER_CPU(struct scx_event_stats, event_stats_cpu);
 } while (0)
 
 
-static void scx_read_events(struct scx_event_stats *events);
+static void scx_read_events(struct scx_sched *sch,
+                           struct scx_event_stats *events);
 
 static enum scx_enable_state scx_enable_state(void)
 {
@@ -1877,7 +1880,7 @@ static void dsq_mod_nr(struct scx_dispatch_q *dsq, s32 delta)
 static void refill_task_slice_dfl(struct task_struct *p)
 {
        p->scx.slice = SCX_SLICE_DFL;
-       __scx_add_event(SCX_EV_REFILL_SLICE_DFL, 1);
+       __scx_add_event(scx_root, SCX_EV_REFILL_SLICE_DFL, 1);
 }
 
 static void dispatch_enqueue(struct scx_dispatch_q *dsq, struct task_struct *p,
@@ -2206,7 +2209,7 @@ static void do_enqueue_task(struct rq *rq, struct task_struct *p, u64 enq_flags,
                goto local;
 
        if (scx_rq_bypassing(rq)) {
-               __scx_add_event(SCX_EV_BYPASS_DISPATCH, 1);
+               __scx_add_event(sch, SCX_EV_BYPASS_DISPATCH, 1);
                goto global;
        }
 
@@ -2216,14 +2219,14 @@ static void do_enqueue_task(struct rq *rq, struct task_struct *p, u64 enq_flags,
        /* see %SCX_OPS_ENQ_EXITING */
        if (!(scx_root->ops.flags & SCX_OPS_ENQ_EXITING) &&
            unlikely(p->flags & PF_EXITING)) {
-               __scx_add_event(SCX_EV_ENQ_SKIP_EXITING, 1);
+               __scx_add_event(sch, SCX_EV_ENQ_SKIP_EXITING, 1);
                goto local;
        }
 
        /* see %SCX_OPS_ENQ_MIGRATION_DISABLED */
        if (!(scx_root->ops.flags & SCX_OPS_ENQ_MIGRATION_DISABLED) &&
            is_migration_disabled(p)) {
-               __scx_add_event(SCX_EV_ENQ_SKIP_MIGRATION_DISABLED, 1);
+               __scx_add_event(sch, SCX_EV_ENQ_SKIP_MIGRATION_DISABLED, 1);
                goto local;
        }
 
@@ -2346,7 +2349,7 @@ out:
 
        if ((enq_flags & SCX_ENQ_CPU_SELECTED) &&
            unlikely(cpu_of(rq) != p->scx.selected_cpu))
-               __scx_add_event(SCX_EV_SELECT_CPU_FALLBACK, 1);
+               __scx_add_event(scx_root, SCX_EV_SELECT_CPU_FALLBACK, 1);
 }
 
 static void ops_dequeue(struct rq *rq, struct task_struct *p, u64 deq_flags)
@@ -2574,7 +2577,8 @@ static bool task_can_run_on_remote_rq(struct task_struct *p, struct rq *rq,
 
        if (!scx_rq_online(rq)) {
                if (enforce)
-                       __scx_add_event(SCX_EV_DISPATCH_LOCAL_DSQ_OFFLINE, 1);
+                       __scx_add_event(scx_root,
+                                       SCX_EV_DISPATCH_LOCAL_DSQ_OFFLINE, 1);
                return false;
        }
 
@@ -3096,7 +3100,7 @@ no_tasks:
        if (prev_on_rq &&
            (!(scx_root->ops.flags & SCX_OPS_ENQ_LAST) || scx_rq_bypassing(rq))) {
                rq->scx.flags |= SCX_RQ_BAL_KEEP;
-               __scx_add_event(SCX_EV_DISPATCH_KEEP_LAST, 1);
+               __scx_add_event(sch, SCX_EV_DISPATCH_KEEP_LAST, 1);
                goto has_tasks;
        }
        rq->scx.flags &= ~SCX_RQ_IN_BALANCE;
@@ -3427,6 +3431,7 @@ bool scx_prio_less(const struct task_struct *a, const struct task_struct *b,
 
 static int select_task_rq_scx(struct task_struct *p, int prev_cpu, int wake_flags)
 {
+       struct scx_sched *sch = scx_root;
        bool rq_bypass;
 
        /*
@@ -3443,7 +3448,7 @@ static int select_task_rq_scx(struct task_struct *p, int prev_cpu, int wake_flag
                return prev_cpu;
 
        rq_bypass = scx_rq_bypassing(task_rq(p));
-       if (likely(SCX_HAS_OP(scx_root, select_cpu)) && !rq_bypass) {
+       if (likely(SCX_HAS_OP(sch, select_cpu)) && !rq_bypass) {
                s32 cpu;
                struct task_struct **ddsp_taskp;
 
@@ -3472,7 +3477,7 @@ static int select_task_rq_scx(struct task_struct *p, int prev_cpu, int wake_flag
                p->scx.selected_cpu = cpu;
 
                if (rq_bypass)
-                       __scx_add_event(SCX_EV_BYPASS_DISPATCH, 1);
+                       __scx_add_event(sch, SCX_EV_BYPASS_DISPATCH, 1);
                return cpu;
        }
 }
@@ -4413,6 +4418,8 @@ static void scx_sched_free_rcu_work(struct work_struct *work)
        struct scx_dispatch_q *dsq;
        int node;
 
+       free_percpu(sch->event_stats_cpu);
+
        for_each_node_state(node, N_POSSIBLE)
                kfree(sch->global_dsqs[node]);
        kfree(sch->global_dsqs);
@@ -4455,10 +4462,11 @@ SCX_ATTR(ops);
 static ssize_t scx_attr_events_show(struct kobject *kobj,
                                    struct kobj_attribute *ka, char *buf)
 {
+       struct scx_sched *sch = container_of(kobj, struct scx_sched, kobj);
        struct scx_event_stats events;
        int at = 0;
 
-       scx_read_events(&events);
+       scx_read_events(sch, &events);
        at += scx_attr_event_show(buf, at, &events, SCX_EV_SELECT_CPU_FALLBACK);
        at += scx_attr_event_show(buf, at, &events, SCX_EV_DISPATCH_LOCAL_DSQ_OFFLINE);
        at += scx_attr_event_show(buf, at, &events, SCX_EV_DISPATCH_KEEP_LAST);
@@ -4591,25 +4599,29 @@ static void scx_bypass(bool bypass)
 {
        static DEFINE_RAW_SPINLOCK(bypass_lock);
        static unsigned long bypass_timestamp;
-
-       int cpu;
+       struct scx_sched *sch;
        unsigned long flags;
+       int cpu;
 
        raw_spin_lock_irqsave(&bypass_lock, flags);
+       sch = rcu_dereference_bh(scx_root);
+
        if (bypass) {
                scx_bypass_depth++;
                WARN_ON_ONCE(scx_bypass_depth <= 0);
                if (scx_bypass_depth != 1)
                        goto unlock;
                bypass_timestamp = ktime_get_ns();
-               scx_add_event(SCX_EV_BYPASS_ACTIVATE, 1);
+               if (sch)
+                       scx_add_event(sch, SCX_EV_BYPASS_ACTIVATE, 1);
        } else {
                scx_bypass_depth--;
                WARN_ON_ONCE(scx_bypass_depth < 0);
                if (scx_bypass_depth != 0)
                        goto unlock;
-               scx_add_event(SCX_EV_BYPASS_DURATION,
-                             ktime_get_ns() - bypass_timestamp);
+               if (sch)
+                       scx_add_event(sch, SCX_EV_BYPASS_DURATION,
+                                     ktime_get_ns() - bypass_timestamp);
        }
 
        atomic_inc(&scx_breather_depth);
@@ -5182,7 +5194,7 @@ static void scx_dump_state(struct scx_exit_info *ei, size_t dump_len)
        dump_line(&s, "Event counters");
        dump_line(&s, "--------------");
 
-       scx_read_events(&events);
+       scx_read_events(scx_root, &events);
        scx_dump_event(s, &events, SCX_EV_SELECT_CPU_FALLBACK);
        scx_dump_event(s, &events, SCX_EV_DISPATCH_LOCAL_DSQ_OFFLINE);
        scx_dump_event(s, &events, SCX_EV_DISPATCH_KEEP_LAST);
@@ -5290,6 +5302,10 @@ static struct scx_sched *scx_alloc_and_add_sched(struct sched_ext_ops *ops)
                sch->global_dsqs[node] = dsq;
        }
 
+       sch->event_stats_cpu = alloc_percpu(struct scx_event_stats);
+       if (!sch->event_stats_cpu)
+               goto err_free_gdsqs;
+
        atomic_set(&sch->exit_kind, SCX_EXIT_NONE);
        sch->ops = *ops;
        ops->priv = sch;
@@ -5297,10 +5313,12 @@ static struct scx_sched *scx_alloc_and_add_sched(struct sched_ext_ops *ops)
        sch->kobj.kset = scx_kset;
        ret = kobject_init_and_add(&sch->kobj, &scx_ktype, NULL, "root");
        if (ret < 0)
-               goto err_free_gdsqs;
+               goto err_free_event_stats;
 
        return sch;
 
+err_free_event_stats:
+       free_percpu(sch->event_stats_cpu);
 err_free_gdsqs:
        for_each_node_state(node, N_POSSIBLE)
                kfree(sch->global_dsqs[node]);
@@ -5376,15 +5394,6 @@ static int scx_enable(struct sched_ext_ops *ops, struct bpf_link *link)
 
        mutex_lock(&scx_enable_mutex);
 
-       /*
-        * Clear event counters so a new scx scheduler gets
-        * fresh event counter values.
-        */
-       for_each_possible_cpu(cpu) {
-               struct scx_event_stats *e = per_cpu_ptr(&event_stats_cpu, cpu);
-               memset(e, 0, sizeof(*e));
-       }
-
        if (!scx_helper) {
                WRITE_ONCE(scx_helper, scx_create_rt_helper("sched_ext_helper"));
                if (!scx_helper) {
@@ -7407,7 +7416,7 @@ __bpf_kfunc u64 scx_bpf_now(void)
        return clock;
 }
 
-static void scx_read_events(struct scx_event_stats *events)
+static void scx_read_events(struct scx_sched *sch, struct scx_event_stats *events)
 {
        struct scx_event_stats *e_cpu;
        int cpu;
@@ -7415,7 +7424,7 @@ static void scx_read_events(struct scx_event_stats *events)
        /* Aggregate per-CPU event counters into @events. */
        memset(events, 0, sizeof(*events));
        for_each_possible_cpu(cpu) {
-               e_cpu = per_cpu_ptr(&event_stats_cpu, cpu);
+               e_cpu = per_cpu_ptr(sch->event_stats_cpu, cpu);
                scx_agg_event(events, e_cpu, SCX_EV_SELECT_CPU_FALLBACK);
                scx_agg_event(events, e_cpu, SCX_EV_DISPATCH_LOCAL_DSQ_OFFLINE);
                scx_agg_event(events, e_cpu, SCX_EV_DISPATCH_KEEP_LAST);
@@ -7436,9 +7445,16 @@ static void scx_read_events(struct scx_event_stats *events)
 __bpf_kfunc void scx_bpf_events(struct scx_event_stats *events,
                                size_t events__sz)
 {
+       struct scx_sched *sch;
        struct scx_event_stats e_sys;
 
-       scx_read_events(&e_sys);
+       rcu_read_lock();
+       sch = rcu_dereference(scx_root);
+       if (sch)
+               scx_read_events(sch, &e_sys);
+       else
+               memset(&e_sys, 0, sizeof(e_sys));
+       rcu_read_unlock();
 
        /*
         * We cannot entirely trust a BPF-provided size since a BPF program