sched_ext: Use dynamic allocation for scx_sched
author     Tejun Heo <tj@kernel.org>
           Tue, 29 Apr 2025 18:40:10 +0000 (08:40 -1000)
committer  Tejun Heo <tj@kernel.org>
           Tue, 29 Apr 2025 18:40:10 +0000 (08:40 -1000)
To prepare for supporting multiple schedulers, allocate scx_sched
dynamically. scx_sched->kobj is now an embedded field and the kobj's
lifetime determines the lifetime of the containing scx_sched.
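
The resulting lifetime, condensed from the hunks below (an illustrative
annotation, not part of the patch):

  /* scx_enable() */
  sch = kzalloc(sizeof(*sch), GFP_KERNEL);
  sch->kobj.kset = scx_kset;
  ret = kobject_init_and_add(&sch->kobj, &scx_ktype, NULL, "root");

  /* scx_disable_workfn(): unlink from sysfs so the name can be reused */
  kobject_del(&sch->kobj);

  /* bpf_scx_unreg(): drop the base reference */
  kobject_put(&sch->kobj);
  /* -> scx_kobj_release() -> queue_rcu_work() -> kfree(sch) */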

- Enable path is updated so that kobj init and addition are performed later.

- scx_sched freeing is initiated in scx_kobj_release() and deferred through
  an rcu_work so that scx_root can still be dereferenced from an
  unsynchronized path - scx_disable(). See the sketch after this list.

- sched_ext_ops->priv is added and used to point to the scx_sched instance
  created for the ops instance. This is used by bpf_scx_unreg() to determine
  which scx_sched instance to disable and put.
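
Why the free is deferred through an rcu_work: an unsynchronized reader such
as scx_disable() may still be dereferencing the scx_sched when the last
reference is dropped. A condensed view of the two sides (illustrative, not
part of the patch):

  /* reader: scx_disable(), no locks held */
  rcu_read_lock();
  sch = rcu_dereference(scx_root);
  if (sch)
          atomic_try_cmpxchg(&sch->exit_kind, &none, kind);
  rcu_read_unlock();      /* sch must stay valid until here */

  /* freeing side: scx_kobj_release() */
  INIT_RCU_WORK(&sch->rcu_work, scx_sched_free_rcu_work);
  queue_rcu_work(system_unbound_wq, &sch->rcu_work);
  /* -> kfree(sch) only after a full RCU grace period */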

No behavior changes intended.

v2: Andrea reported a kernel oops caused by bpf_scx_unreg() trying to deref
    a NULL scx_root after scheduler init failure. sched_ext_ops->priv is
    added so that bpf_scx_unreg() can always find the scx_sched instance to
    unregister even if it failed early during init.
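
    The resulting unreg path, annotated (taken from the bpf_scx_unreg() hunk
    below):

      static void bpf_scx_unreg(void *kdata, struct bpf_link *link)
      {
              struct sched_ext_ops *ops = kdata;
              struct scx_sched *sch = ops->priv;  /* set by scx_enable() */

              scx_disable(SCX_EXIT_UNREG);  /* no-op when scx_root is NULL */
              kthread_flush_work(&scx_disable_work);
              kobject_put(&sch->kobj);      /* drop the base reference */
      }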

Signed-off-by: Tejun Heo <tj@kernel.org>
Reviewed-by: Andrea Righi <arighi@nvidia.com>
Acked-by: Changwoo Min <changwoo@igalia.com>
kernel/sched/ext.c

diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index 784bdf12db44eb1dbefc46888bdc80509abddf1f..42e28e5421ecee2a379148ed099c773d4543461f 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -752,6 +752,9 @@ struct sched_ext_ops {
         * BPF scheduler is enabled.
         */
        char name[SCX_OPS_NAME_LEN];
+
+       /* internal use only, must be NULL */
+       void *priv;
 };
 
 enum scx_opi {
@@ -772,7 +775,8 @@ struct scx_sched {
        atomic_t                exit_kind;
        struct scx_exit_info    *exit_info;
 
-       struct kobject          *kobj;
+       struct kobject          kobj;
+       struct rcu_work         rcu_work;
 };
 
 enum scx_wake_flags {
@@ -933,11 +937,7 @@ enum scx_ops_state {
 #define SCX_OPSS_STATE_MASK    ((1LU << SCX_OPSS_QSEQ_SHIFT) - 1)
 #define SCX_OPSS_QSEQ_MASK     (~SCX_OPSS_STATE_MASK)
 
-static struct scx_sched __scx_root = {
-       .exit_kind              = ATOMIC_INIT(SCX_EXIT_DONE),
-};
-
-static struct scx_sched *scx_root = &__scx_root;
+static struct scx_sched __rcu *scx_root;
 
 /*
  * During exit, a task may schedule after losing its PIDs. When disabling the
@@ -4417,9 +4417,23 @@ static const struct attribute_group scx_global_attr_group = {
        .attrs = scx_global_attrs,
 };
 
+static void free_exit_info(struct scx_exit_info *ei);
+
+static void scx_sched_free_rcu_work(struct work_struct *work)
+{
+       struct rcu_work *rcu_work = to_rcu_work(work);
+       struct scx_sched *sch = container_of(rcu_work, struct scx_sched, rcu_work);
+
+       free_exit_info(sch->exit_info);
+       kfree(sch);
+}
+
 static void scx_kobj_release(struct kobject *kobj)
 {
-       kfree(kobj);
+       struct scx_sched *sch = container_of(kobj, struct scx_sched, kobj);
+
+       INIT_RCU_WORK(&sch->rcu_work, scx_sched_free_rcu_work);
+       queue_rcu_work(system_unbound_wq, &sch->rcu_work);
 }
 
 static ssize_t scx_attr_ops_show(struct kobject *kobj,
@@ -4709,14 +4723,15 @@ static const char *scx_exit_reason(enum scx_exit_kind kind)
 
 static void scx_disable_workfn(struct kthread_work *work)
 {
-       struct scx_exit_info *ei = scx_root->exit_info;
+       struct scx_sched *sch = scx_root;
+       struct scx_exit_info *ei = sch->exit_info;
        struct scx_task_iter sti;
        struct task_struct *p;
        struct rhashtable_iter rht_iter;
        struct scx_dispatch_q *dsq;
        int kind, cpu;
 
-       kind = atomic_read(&scx_root->exit_kind);
+       kind = atomic_read(&sch->exit_kind);
        while (true) {
                /*
                 * NONE indicates that a new scx_ops has been registered since
@@ -4725,7 +4740,7 @@ static void scx_disable_workfn(struct kthread_work *work)
                 */
                if (kind == SCX_EXIT_NONE || kind == SCX_EXIT_DONE)
                        return;
-               if (atomic_try_cmpxchg(&scx_root->exit_kind, &kind, SCX_EXIT_DONE))
+               if (atomic_try_cmpxchg(&sch->exit_kind, &kind, SCX_EXIT_DONE))
                        break;
        }
        ei->kind = kind;
@@ -4740,7 +4755,7 @@ static void scx_disable_workfn(struct kthread_work *work)
                break;
        case SCX_DISABLED:
                pr_warn("sched_ext: ops error detected without ops (%s)\n",
-                       scx_root->exit_info->msg);
+                       sch->exit_info->msg);
                WARN_ON_ONCE(scx_set_enable_state(SCX_DISABLED) != SCX_DISABLING);
                goto done;
        default:
@@ -4807,41 +4822,43 @@ static void scx_disable_workfn(struct kthread_work *work)
 
        /* no task is on scx, turn off all the switches and flush in-progress calls */
        static_branch_disable(&__scx_enabled);
-       bitmap_zero(scx_root->has_op, SCX_OPI_END);
+       bitmap_zero(sch->has_op, SCX_OPI_END);
        scx_idle_disable();
        synchronize_rcu();
 
        if (ei->kind >= SCX_EXIT_ERROR) {
                pr_err("sched_ext: BPF scheduler \"%s\" disabled (%s)\n",
-                      scx_root->ops.name, ei->reason);
+                      sch->ops.name, ei->reason);
 
                if (ei->msg[0] != '\0')
-                       pr_err("sched_ext: %s: %s\n",
-                              scx_root->ops.name, ei->msg);
+                       pr_err("sched_ext: %s: %s\n", sch->ops.name, ei->msg);
 #ifdef CONFIG_STACKTRACE
                stack_trace_print(ei->bt, ei->bt_len, 2);
 #endif
        } else {
                pr_info("sched_ext: BPF scheduler \"%s\" disabled (%s)\n",
-                       scx_root->ops.name, ei->reason);
+                       sch->ops.name, ei->reason);
        }
 
-       if (scx_root->ops.exit)
+       if (sch->ops.exit)
                SCX_CALL_OP(SCX_KF_UNLOCKED, exit, NULL, ei);
 
        cancel_delayed_work_sync(&scx_watchdog_work);
 
        /*
-        * Delete the kobject from the hierarchy eagerly in addition to just
-        * dropping a reference. Otherwise, if the object is deleted
-        * asynchronously, sysfs could observe an object of the same name still
-        * in the hierarchy when another scheduler is loaded.
+        * scx_root clearing must be inside cpus_read_lock(). See
+        * handle_hotplug().
         */
-       kobject_del(scx_root->kobj);
-       kobject_put(scx_root->kobj);
-       scx_root->kobj = NULL;
+       cpus_read_lock();
+       RCU_INIT_POINTER(scx_root, NULL);
+       cpus_read_unlock();
 
-       memset(&scx_root->ops, 0, sizeof(scx_root->ops));
+       /*
+        * Delete the kobject from the hierarchy synchronously. Otherwise, sysfs
+        * could observe an object of the same name still in the hierarchy when
+        * the next scheduler is loaded.
+        */
+       kobject_del(&sch->kobj);
 
        rhashtable_walk_enter(&dsq_hash, &rht_iter);
        do {
@@ -4858,9 +4875,6 @@ static void scx_disable_workfn(struct kthread_work *work)
        scx_dsp_ctx = NULL;
        scx_dsp_max_batch = 0;
 
-       free_exit_info(scx_root->exit_info);
-       scx_root->exit_info = NULL;
-
        mutex_unlock(&scx_enable_mutex);
 
        WARN_ON_ONCE(scx_set_enable_state(SCX_DISABLED) != SCX_DISABLING);
@@ -4885,13 +4899,18 @@ static void schedule_scx_disable_work(void)
 static void scx_disable(enum scx_exit_kind kind)
 {
        int none = SCX_EXIT_NONE;
+       struct scx_sched *sch;
 
        if (WARN_ON_ONCE(kind == SCX_EXIT_NONE || kind == SCX_EXIT_DONE))
                kind = SCX_EXIT_ERROR;
 
-       atomic_try_cmpxchg(&scx_root->exit_kind, &none, kind);
-
-       schedule_scx_disable_work();
+       rcu_read_lock();
+       sch = rcu_dereference(scx_root);
+       if (sch) {
+               atomic_try_cmpxchg(&sch->exit_kind, &none, kind);
+               schedule_scx_disable_work();
+       }
+       rcu_read_unlock();
 }
 
 static void dump_newline(struct seq_buf *s)
@@ -5288,6 +5307,7 @@ static int validate_ops(const struct sched_ext_ops *ops)
 
 static int scx_enable(struct sched_ext_ops *ops, struct bpf_link *link)
 {
+       struct scx_sched *sch;
        struct scx_task_iter sti;
        struct task_struct *p;
        unsigned long timeout;
@@ -5351,33 +5371,33 @@ static int scx_enable(struct sched_ext_ops *ops, struct bpf_link *link)
                goto err_unlock;
        }
 
-       scx_root->kobj = kzalloc(sizeof(*scx_root->kobj), GFP_KERNEL);
-       if (!scx_root->kobj) {
+       sch = kzalloc(sizeof(*sch), GFP_KERNEL);
+       if (!sch) {
                ret = -ENOMEM;
                goto err_unlock;
        }
 
-       scx_root->kobj->kset = scx_kset;
-       ret = kobject_init_and_add(scx_root->kobj, &scx_ktype, NULL, "root");
-       if (ret < 0)
-               goto err;
-
-       scx_root->exit_info = alloc_exit_info(ops->exit_dump_len);
-       if (!scx_root->exit_info) {
+       sch->exit_info = alloc_exit_info(ops->exit_dump_len);
+       if (!sch->exit_info) {
                ret = -ENOMEM;
-               goto err_del;
+               goto err_free;
        }
 
+       sch->kobj.kset = scx_kset;
+       ret = kobject_init_and_add(&sch->kobj, &scx_ktype, NULL, "root");
+       if (ret < 0)
+               goto err_free;
+
+       atomic_set(&sch->exit_kind, SCX_EXIT_NONE);
+       sch->ops = *ops;
+       ops->priv = sch;
+
        /*
-        * Set scx_ops, transition to ENABLING and clear exit info to arm the
-        * disable path. Failure triggers full disabling from here on.
+        * Transition to ENABLING and clear exit info to arm the disable path.
+        * Failure triggers full disabling from here on.
         */
-       scx_root->ops = *ops;
-
        WARN_ON_ONCE(scx_set_enable_state(SCX_ENABLING) != SCX_DISABLED);
-
-       atomic_set(&scx_root->exit_kind, SCX_EXIT_NONE);
-       scx_root->warned_zero_slice = false;
+       WARN_ON_ONCE(scx_root);
 
        atomic_long_set(&scx_nr_rejected, 0);
 
@@ -5390,9 +5410,15 @@ static int scx_enable(struct sched_ext_ops *ops, struct bpf_link *link)
         */
        cpus_read_lock();
 
+       /*
+        * Make the scheduler instance visible. Must be inside cpus_read_lock().
+        * See handle_hotplug().
+        */
+       rcu_assign_pointer(scx_root, sch);
+
        scx_idle_enable(ops);
 
-       if (scx_root->ops.init) {
+       if (sch->ops.init) {
                ret = SCX_CALL_OP_RET(SCX_KF_UNLOCKED, init, NULL);
                if (ret) {
                        ret = ops_sanitize_err("init", ret);
@@ -5404,7 +5430,7 @@ static int scx_enable(struct sched_ext_ops *ops, struct bpf_link *link)
 
        for (i = SCX_OPI_CPU_HOTPLUG_BEGIN; i < SCX_OPI_CPU_HOTPLUG_END; i++)
                if (((void (**)(void))ops)[i])
-                       set_bit(i, scx_root->has_op);
+                       set_bit(i, sch->has_op);
 
        check_hotplug_seq(ops);
        scx_idle_update_selcpu_topology(ops);
@@ -5445,10 +5471,10 @@ static int scx_enable(struct sched_ext_ops *ops, struct bpf_link *link)
 
        for (i = SCX_OPI_NORMAL_BEGIN; i < SCX_OPI_NORMAL_END; i++)
                if (((void (**)(void))ops)[i])
-                       set_bit(i, scx_root->has_op);
+                       set_bit(i, sch->has_op);
 
-       if (scx_root->ops.cpu_acquire || scx_root->ops.cpu_release)
-               scx_root->ops.flags |= SCX_OPS_HAS_CPU_PREEMPT;
+       if (sch->ops.cpu_acquire || sch->ops.cpu_release)
+               sch->ops.flags |= SCX_OPS_HAS_CPU_PREEMPT;
 
        /*
         * Lock out forks, cgroup on/offlining and moves before opening the
@@ -5547,7 +5573,7 @@ static int scx_enable(struct sched_ext_ops *ops, struct bpf_link *link)
        scx_bypass(false);
 
        if (!scx_tryset_enable_state(SCX_ENABLED, SCX_ENABLING)) {
-               WARN_ON_ONCE(atomic_read(&scx_root->exit_kind) == SCX_EXIT_NONE);
+               WARN_ON_ONCE(atomic_read(&sch->exit_kind) == SCX_EXIT_NONE);
                goto err_disable;
        }
 
@@ -5555,23 +5581,18 @@ static int scx_enable(struct sched_ext_ops *ops, struct bpf_link *link)
                static_branch_enable(&__scx_switched_all);
 
        pr_info("sched_ext: BPF scheduler \"%s\" enabled%s\n",
-               scx_root->ops.name, scx_switched_all() ? "" : " (partial)");
-       kobject_uevent(scx_root->kobj, KOBJ_ADD);
+               sch->ops.name, scx_switched_all() ? "" : " (partial)");
+       kobject_uevent(&sch->kobj, KOBJ_ADD);
        mutex_unlock(&scx_enable_mutex);
 
        atomic_long_inc(&scx_enable_seq);
 
        return 0;
 
-err_del:
-       kobject_del(scx_root->kobj);
-err:
-       kobject_put(scx_root->kobj);
-       scx_root->kobj = NULL;
-       if (scx_root->exit_info) {
-               free_exit_info(scx_root->exit_info);
-               scx_root->exit_info = NULL;
-       }
+err_free:
+       if (sch->exit_info)
+               free_exit_info(sch->exit_info);
+       kfree(sch);
 err_unlock:
        mutex_unlock(&scx_enable_mutex);
        return ret;
@@ -5589,7 +5610,7 @@ err_disable:
         * is notified through ops.exit() with all the details.
         *
         * Flush scx_disable_work to ensure that error is reported before init
-        * completion.
+        * completion. sch's base reference will be put by bpf_scx_unreg().
         */
        scx_error("scx_enable() failed (%d)", ret);
        kthread_flush_work(&scx_disable_work);
@@ -5741,8 +5762,12 @@ static int bpf_scx_reg(void *kdata, struct bpf_link *link)
 
 static void bpf_scx_unreg(void *kdata, struct bpf_link *link)
 {
+       struct sched_ext_ops *ops = kdata;
+       struct scx_sched *sch = ops->priv;
+
        scx_disable(SCX_EXIT_UNREG);
        kthread_flush_work(&scx_disable_work);
+       kobject_put(&sch->kobj);
 }
 
 static int bpf_scx_init(struct btf *btf)