sched_ext: idle: Accept an arbitrary cpumask in scx_select_cpu_dfl()

author Andrea Righi <arighi@nvidia.com>

Sat, 5 Apr 2025 13:39:23 +0000 (15:39 +0200)

committer Tejun Heo <tj@kernel.org>

Mon, 7 Apr 2025 17:13:52 +0000 (07:13 -1000)
author Andrea Righi <arighi@nvidia.com>
Sat, 5 Apr 2025 13:39:23 +0000 (15:39 +0200)
committer Tejun Heo <tj@kernel.org>
Mon, 7 Apr 2025 17:13:52 +0000 (07:13 -1000)
diff --git a/kernel/sched/ext_idle.c b/kernel/sched/ext_idle.c

index 5d6253c6ed9084ae0638105ff1108ccc20b1ea19..f39b34fbb8a61d943f9faf907f0046dc8267dd5c 100644 (file)
--- a/kernel/sched/ext_idle.c
+++ b/kernel/sched/ext_idle.c
@@ -49,6 +49,7 @@ static struct scx_idle_cpus **scx_idle_node_masks;
  /*
   * Local per-CPU cpumasks (used to generate temporary idle cpumasks).
   */
+static DEFINE_PER_CPU(cpumask_var_t, local_idle_cpumask);
  static DEFINE_PER_CPU(cpumask_var_t, local_llc_idle_cpumask);
  static DEFINE_PER_CPU(cpumask_var_t, local_numa_idle_cpumask);
  
@@ -417,13 +418,15 @@ static inline bool task_affinity_all(const struct task_struct *p)
   *     branch prediction optimizations.
   *
   * 3. Pick a CPU within the same LLC (Last-Level Cache):
- *   - if the above conditions aren't met, pick a CPU that shares the same LLC
- *     to maintain cache locality.
+ *   - if the above conditions aren't met, pick a CPU that shares the same
+ *     LLC, if the LLC domain is a subset of @cpus_allowed, to maintain
+ *     cache locality.
   *
   * 4. Pick a CPU within the same NUMA node, if enabled:
- *   - choose a CPU from the same NUMA node to reduce memory access latency.
+ *   - choose a CPU from the same NUMA node, if the node cpumask is a
+ *     subset of @cpus_allowed, to reduce memory access latency.
   *
- * 5. Pick any idle CPU usable by the task.
+ * 5. Pick any idle CPU within the @cpus_allowed domain.
   *
   * Step 3 and 4 are performed only if the system has, respectively,
   * multiple LLCs / multiple NUMA nodes (see scx_selcpu_topo_llc and
@@ -446,6 +449,39 @@ s32 scx_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, u64 wake_flags,
         int node = scx_cpu_node_if_enabled(prev_cpu);
         s32 cpu;
  
+       preempt_disable();
+
+       /*
+        * Determine the subset of CPUs usable by @p within @cpus_allowed.
+        */
+       if (allowed != p->cpus_ptr) {
+               struct cpumask *local_cpus = this_cpu_cpumask_var_ptr(local_idle_cpumask);
+
+               if (task_affinity_all(p)) {
+                       allowed = cpus_allowed;
+               } else if (cpumask_and(local_cpus, cpus_allowed, p->cpus_ptr)) {
+                       allowed = local_cpus;
+               } else {
+                       cpu = -EBUSY;
+                       goto out_enable;
+               }
+
+               /*
+                * If @prev_cpu is not in the allowed CPUs, skip topology
+                * optimizations and try to pick any idle CPU usable by the
+                * task.
+                *
+                * If %SCX_OPS_BUILTIN_IDLE_PER_NODE is enabled, prioritize
+                * the node of the current CPU, as it may optimize some
+                * waker->wakee workloads.
+                */
+               if (!cpumask_test_cpu(prev_cpu, allowed)) {
+                       node = scx_cpu_node_if_enabled(smp_processor_id());
+                       cpu = scx_pick_idle_cpu(allowed, node, flags);
+                       goto out_enable;
+               }
+       }
+
         /*
          * This is necessary to protect llc_cpus.
          */
@@ -610,6 +646,8 @@ s32 scx_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, u64 wake_flags,
  
  out_unlock:
         rcu_read_unlock();
+out_enable:
+       preempt_enable();
  
         return cpu;
  }
@@ -641,6 +679,8 @@ void scx_idle_init_masks(void)
  
         /* Allocate local per-cpu idle cpumasks */
         for_each_possible_cpu(i) {
+               BUG_ON(!alloc_cpumask_var_node(&per_cpu(local_idle_cpumask, i),
+                                              GFP_KERNEL, cpu_to_node(i)));
                 BUG_ON(!alloc_cpumask_var_node(&per_cpu(local_llc_idle_cpumask, i),
                                                GFP_KERNEL, cpu_to_node(i)));
                 BUG_ON(!alloc_cpumask_var_node(&per_cpu(local_numa_idle_cpumask, i),
author	Andrea Righi <arighi@nvidia.com>
	Sat, 5 Apr 2025 13:39:23 +0000 (15:39 +0200)
committer	Tejun Heo <tj@kernel.org>
	Mon, 7 Apr 2025 17:13:52 +0000 (07:13 -1000)