sched_ext: idle: Extend topology optimizations to all tasks

author Andrea Righi <arighi@nvidia.com>

Sat, 5 Apr 2025 13:39:21 +0000 (15:39 +0200)

committer Tejun Heo <tj@kernel.org>

Mon, 7 Apr 2025 17:13:52 +0000 (07:13 -1000)
author Andrea Righi <arighi@nvidia.com>
Sat, 5 Apr 2025 13:39:21 +0000 (15:39 +0200)
committer Tejun Heo <tj@kernel.org>
Mon, 7 Apr 2025 17:13:52 +0000 (07:13 -1000)
diff --git a/kernel/sched/ext_idle.c b/kernel/sched/ext_idle.c

index cce2746f9f34a022fd243e52a8d9f7682b1428dc..ed37fb8e45180fd938ee0e97ccadd008753fc2c9 100644 (file)
--- a/kernel/sched/ext_idle.c
+++ b/kernel/sched/ext_idle.c
@@ -46,6 +46,12 @@ static struct scx_idle_cpus scx_idle_global_masks;
   */
  static struct scx_idle_cpus **scx_idle_node_masks;
  
+/*
+ * Local per-CPU cpumasks (used to generate temporary idle cpumasks).
+ */
+static DEFINE_PER_CPU(cpumask_var_t, local_llc_idle_cpumask);
+static DEFINE_PER_CPU(cpumask_var_t, local_numa_idle_cpumask);
+
  /*
   * Return the idle masks associated to a target @node.
   *
@@ -391,6 +397,14 @@ void scx_idle_update_selcpu_topology(struct sched_ext_ops *ops)
                 static_branch_disable_cpuslocked(&scx_selcpu_topo_numa);
  }
  
+/*
+ * Return true if @p can run on all possible CPUs, false otherwise.
+ */
+static inline bool task_affinity_all(const struct task_struct *p)
+{
+       return p->nr_cpus_allowed >= num_possible_cpus();
+}
+
  /*
   * Built-in CPU idle selection policy:
   *
@@ -426,8 +440,7 @@ void scx_idle_update_selcpu_topology(struct sched_ext_ops *ops)
   */
  s32 scx_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, u64 wake_flags, u64 flags)
  {
-       const struct cpumask *llc_cpus = NULL;
-       const struct cpumask *numa_cpus = NULL;
+       const struct cpumask *llc_cpus = NULL, *numa_cpus = NULL;
         int node = scx_cpu_node_if_enabled(prev_cpu);
         s32 cpu;
  
@@ -437,22 +450,30 @@ s32 scx_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, u64 wake_flags, u64
         rcu_read_lock();
  
         /*
-        * Determine the scheduling domain only if the task is allowed to run
-        * on all CPUs.
+        * Determine the subset of CPUs that the task can use in its
+        * current LLC and node.
          *
-        * This is done primarily for efficiency, as it avoids the overhead of
-        * updating a cpumask every time we need to select an idle CPU (which
-        * can be costly in large SMP systems), but it also aligns logically:
-        * if a task's scheduling domain is restricted by user-space (through
-        * CPU affinity), the task will simply use the flat scheduling domain
-        * defined by user-space.
+        * If the task can run on all CPUs, use the node and LLC cpumasks
+        * directly.
          */
-       if (p->nr_cpus_allowed >= num_possible_cpus()) {
-               if (static_branch_maybe(CONFIG_NUMA, &scx_selcpu_topo_numa))
-                       numa_cpus = numa_span(prev_cpu);
+       if (static_branch_maybe(CONFIG_NUMA, &scx_selcpu_topo_numa)) {
+               struct cpumask *local_cpus = this_cpu_cpumask_var_ptr(local_numa_idle_cpumask);
+               const struct cpumask *cpus = numa_span(prev_cpu);
+
+               if (task_affinity_all(p))
+                       numa_cpus = cpus;
+               else if (cpus && cpumask_and(local_cpus, p->cpus_ptr, cpus))
+                       numa_cpus = local_cpus;
+       }
  
-               if (static_branch_maybe(CONFIG_SCHED_MC, &scx_selcpu_topo_llc))
-                       llc_cpus = llc_span(prev_cpu);
+       if (static_branch_maybe(CONFIG_SCHED_MC, &scx_selcpu_topo_llc)) {
+               struct cpumask *local_cpus = this_cpu_cpumask_var_ptr(local_llc_idle_cpumask);
+               const struct cpumask *cpus = llc_span(prev_cpu);
+
+               if (task_affinity_all(p))
+                       llc_cpus = cpus;
+               else if (cpus && cpumask_and(local_cpus, p->cpus_ptr, cpus))
+                       llc_cpus = local_cpus;
         }
  
         /*
@@ -596,7 +617,7 @@ out_unlock:
   */
  void scx_idle_init_masks(void)
  {
-       int node;
+       int i;
  
         /* Allocate global idle cpumasks */
         BUG_ON(!alloc_cpumask_var(&scx_idle_global_masks.cpu, GFP_KERNEL));
@@ -607,13 +628,21 @@ void scx_idle_init_masks(void)
                                       sizeof(*scx_idle_node_masks), GFP_KERNEL);
         BUG_ON(!scx_idle_node_masks);
  
-       for_each_node(node) {
-               scx_idle_node_masks[node] = kzalloc_node(sizeof(**scx_idle_node_masks),
-                                                        GFP_KERNEL, node);
-               BUG_ON(!scx_idle_node_masks[node]);
+       for_each_node(i) {
+               scx_idle_node_masks[i] = kzalloc_node(sizeof(**scx_idle_node_masks),
+                                                        GFP_KERNEL, i);
+               BUG_ON(!scx_idle_node_masks[i]);
+
+               BUG_ON(!alloc_cpumask_var_node(&scx_idle_node_masks[i]->cpu, GFP_KERNEL, i));
+               BUG_ON(!alloc_cpumask_var_node(&scx_idle_node_masks[i]->smt, GFP_KERNEL, i));
+       }
  
-               BUG_ON(!alloc_cpumask_var_node(&scx_idle_node_masks[node]->cpu, GFP_KERNEL, node));
-               BUG_ON(!alloc_cpumask_var_node(&scx_idle_node_masks[node]->smt, GFP_KERNEL, node));
+       /* Allocate local per-cpu idle cpumasks */
+       for_each_possible_cpu(i) {
+               BUG_ON(!alloc_cpumask_var_node(&per_cpu(local_llc_idle_cpumask, i),
+                                              GFP_KERNEL, cpu_to_node(i)));
+               BUG_ON(!alloc_cpumask_var_node(&per_cpu(local_numa_idle_cpumask, i),
+                                              GFP_KERNEL, cpu_to_node(i)));
         }
  }
author	Andrea Righi <arighi@nvidia.com>
	Sat, 5 Apr 2025 13:39:21 +0000 (15:39 +0200)
committer	Tejun Heo <tj@kernel.org>
	Mon, 7 Apr 2025 17:13:52 +0000 (07:13 -1000)