#if defined(CONFIG_SMP) && defined(CONFIG_PREEMPT_RT)
 
 /*
- * Migrate-Disable and why it is (strongly) undesired.
- *
- * The premise of the Real-Time schedulers we have on Linux
- * (SCHED_FIFO/SCHED_DEADLINE) is that M CPUs can/will run M tasks
- * concurrently, provided there are sufficient runnable tasks, also known as
- * work-conserving. For instance SCHED_DEADLINE tries to schedule the M
- * earliest deadline threads, and SCHED_FIFO the M highest priority threads.
- *
- * The correctness of various scheduling models depends on this, but is it
- * broken by migrate_disable() that doesn't imply preempt_disable(). Where
- * preempt_disable() implies an immediate priority ceiling, preemptible
- * migrate_disable() allows nesting.
- *
- * The worst case is that all tasks preempt one another in a migrate_disable()
- * region and stack on a single CPU. This then reduces the available bandwidth
- * to a single CPU. And since Real-Time schedulability theory considers the
- * Worst-Case only, all Real-Time analysis shall revert to single-CPU
- * (instantly solving the SMP analysis problem).
+ * Migrate-Disable and why it is undesired.
+ *
+ * When a preempted task becomes eligible to run under the ideal model (IOW it
+ * becomes one of the M highest priority tasks), it might still have to wait
+ * for the preemptee's migrate_disable() section to complete, thereby suffering
+ * a reduction in bandwidth for the exact duration of the migrate_disable()
+ * section.
+ *
+ * Per this argument, the change from preempt_disable() to migrate_disable()
+ * gets us:
+ *
+ * - a higher priority task gains reduced wake-up latency; with preempt_disable()
+ *   it would have had to wait for the lower priority task to leave its
+ *   non-preemptible section.
+ *
+ * - a lower priority task, which under preempt_disable() could have instantly
+ *   migrated away once another CPU became available, is now constrained by the
+ *   ability to push the higher priority task away, which might itself be
+ *   in a migrate_disable() section, reducing the lower priority task's
+ *   available bandwidth.
+ *
+ * IOW it trades latency / moves the interference term around, but the
+ * interference stays in the system, and as long as it remains unbounded, the
+ * system is not fully deterministic.
  *
  *
  * The reason we have it anyway.
 
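To make the above concrete, here is a minimal sketch of such a section (illustration only, not part of the patch; the function name is made up). Under migrate_disable() the task stays preemptible, but it cannot leave its CPU until migrate_enable(), which is exactly the window in which the interference described above accrues.

#include <linux/preempt.h>	/* migrate_disable() / migrate_enable() */
#include <linux/smp.h>		/* smp_processor_id() */
#include <linux/printk.h>

static void pinned_but_preemptible(void)
{
	migrate_disable();	/* stay on this CPU, remain preemptible */

	/*
	 * A higher priority task can still preempt us here (and on
	 * PREEMPT_RT a spinlock_t may even sleep), but we will not be
	 * moved to another CPU, so smp_processor_id() stays stable for
	 * the whole section.
	 */
	pr_info("still on CPU%d\n", smp_processor_id());

	migrate_enable();
}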
        cpumask_t                       cpus_mask;
        void                            *migration_pending;
 #if defined(CONFIG_SMP) && defined(CONFIG_PREEMPT_RT)
-       int                             migration_disabled;
+       unsigned short                  migration_disabled;
 #endif
+       unsigned short                  migration_flags;
 
 #ifdef CONFIG_PREEMPT_RCU
        int                             rcu_read_lock_nesting;
 
 }
 EXPORT_SYMBOL_GPL(migrate_enable);
 
-static inline bool is_migration_disabled(struct task_struct *p)
-{
-       return p->migration_disabled;
-}
-
 static inline bool rq_has_pinned_tasks(struct rq *rq)
 {
        return rq->nr_pinned;
        return 0;
 }
 
+int push_cpu_stop(void *arg)
+{
+       struct rq *lowest_rq = NULL, *rq = this_rq();
+       struct task_struct *p = arg;
+
+       raw_spin_lock_irq(&p->pi_lock);
+       raw_spin_lock(&rq->lock);
+
+       if (task_rq(p) != rq)
+               goto out_unlock;
+
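+       /*
+        * If the task (re)entered a migrate_disable() section since this push
+        * was queued, leave a note and defer: migrate_enable() re-enters the
+        * set_cpus_allowed path with SCA_MIGRATE_ENABLE, which sees MDF_PUSH
+        * and queues push_cpu_stop() again (see the hunk further down).
+        */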
+       if (is_migration_disabled(p)) {
+               p->migration_flags |= MDF_PUSH;
+               goto out_unlock;
+       }
+
+       p->migration_flags &= ~MDF_PUSH;
+
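+       /*
+        * Let the task's scheduling class pick and lock a target runqueue:
+        * find_lock_later_rq() for deadline, find_lock_lowest_rq() for RT,
+        * as wired up via .find_lock_rq below.
+        */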
+       if (p->sched_class->find_lock_rq)
+               lowest_rq = p->sched_class->find_lock_rq(p, rq);
+
+       if (!lowest_rq)
+               goto out_unlock;
+
+       // XXX validate p is still the highest prio task
+       if (task_rq(p) == rq) {
+               deactivate_task(rq, p, 0);
+               set_task_cpu(p, lowest_rq->cpu);
+               activate_task(lowest_rq, p, 0);
+               resched_curr(lowest_rq);
+       }
+
+       double_unlock_balance(rq, lowest_rq);
+
+out_unlock:
+       rq->push_busy = false;
+       raw_spin_unlock(&rq->lock);
+       raw_spin_unlock_irq(&p->pi_lock);
+
+       put_task_struct(p);
+       return 0;
+}
+
 /*
  * sched_class::set_cpus_allowed must do the below, but is not required to
  * actually call this function.
 
        /* Can the task run on the task's current CPU? If so, we're done */
        if (cpumask_test_cpu(task_cpu(p), &p->cpus_mask)) {
+               struct task_struct *push_task = NULL;
+
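+               /*
+                * A previous push_cpu_stop() found the task inside a
+                * migrate_disable() section and set MDF_PUSH; now that the
+                * section is ending, queue the push again.
+                */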
+               if ((flags & SCA_MIGRATE_ENABLE) &&
+                   (p->migration_flags & MDF_PUSH) && !rq->push_busy) {
+                       rq->push_busy = true;
+                       push_task = get_task_struct(p);
+               }
+
                pending = p->migration_pending;
                if (pending) {
                        refcount_inc(&pending->refs);
                }
                task_rq_unlock(rq, p, rf);
 
+               if (push_task) {
+                       stop_one_cpu_nowait(rq->cpu, push_cpu_stop,
+                                           p, &rq->push_work);
+               }
+
                if (complete)
                        goto do_complete;
 
        if (flags & SCA_MIGRATE_ENABLE) {
 
                refcount_inc(&pending->refs); /* pending->{arg,stop_work} */
+               p->migration_flags &= ~MDF_PUSH;
                task_rq_unlock(rq, p, rf);
 
                pending->arg = (struct migration_arg) {
 
 static inline void migrate_disable_switch(struct rq *rq, struct task_struct *p) { }
 
-static inline bool is_migration_disabled(struct task_struct *p)
-{
-       return false;
-}
-
 static inline bool rq_has_pinned_tasks(struct rq *rq)
 {
        return false;
 
                return 0;
 
 retry:
+       if (is_migration_disabled(next_task))
+               return 0;
+
        if (WARN_ON(next_task == rq->curr))
                return 0;
 
 static void pull_dl_task(struct rq *this_rq)
 {
        int this_cpu = this_rq->cpu, cpu;
-       struct task_struct *p;
+       struct task_struct *p, *push_task;
        bool resched = false;
        struct rq *src_rq;
        u64 dmin = LONG_MAX;
                        continue;
 
                /* Might drop this_rq->lock */
+               push_task = NULL;
                double_lock_balance(this_rq, src_rq);
 
                /*
                                           src_rq->curr->dl.deadline))
                                goto skip;
 
-                       resched = true;
-
-                       deactivate_task(src_rq, p, 0);
-                       set_task_cpu(p, this_cpu);
-                       activate_task(this_rq, p, 0);
-                       dmin = p->dl.deadline;
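+                       /*
+                        * A migration-disabled task cannot be pulled; instead
+                        * push away whatever is currently running on the
+                        * source CPU so that @p can run there.
+                        */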
+                       if (is_migration_disabled(p)) {
+                               push_task = get_push_task(src_rq);
+                       } else {
+                               deactivate_task(src_rq, p, 0);
+                               set_task_cpu(p, this_cpu);
+                               activate_task(this_rq, p, 0);
+                               dmin = p->dl.deadline;
+                               resched = true;
+                       }
 
                        /* Is there any other task even earlier? */
                }
 skip:
                double_unlock_balance(this_rq, src_rq);
+
+               if (push_task) {
+                       raw_spin_unlock(&this_rq->lock);
+                       stop_one_cpu_nowait(src_rq->cpu, push_cpu_stop,
+                                           push_task, &src_rq->push_work);
+                       raw_spin_lock(&this_rq->lock);
+               }
        }
 
        if (resched)
        .rq_online              = rq_online_dl,
        .rq_offline             = rq_offline_dl,
        .task_woken             = task_woken_dl,
+       .find_lock_rq           = find_lock_later_rq,
 #endif
 
        .task_tick              = task_tick_dl,
 
  * running task can migrate over to a CPU that is running a task
  * of lesser priority.
  */
-static int push_rt_task(struct rq *rq)
+static int push_rt_task(struct rq *rq, bool pull)
 {
        struct task_struct *next_task;
        struct rq *lowest_rq;
                return 0;
 
 retry:
+       if (is_migration_disabled(next_task)) {
+               struct task_struct *push_task = NULL;
+               int cpu;
+
+               if (!pull || rq->push_busy)
+                       return 0;
+
+               cpu = find_lowest_rq(rq->curr);
+               if (cpu == -1 || cpu == rq->cpu)
+                       return 0;
+
+               /*
+                * We found a CPU running at a lower priority than @next_task,
+                * so @next_task should be running. However @next_task is
+                * migration disabled and cannot move to that CPU; instead
+                * attempt to push away the task currently running on this
+                * CPU, so that @next_task can run here.
+                */
+               push_task = get_push_task(rq);
+               if (push_task) {
+                       raw_spin_unlock(&rq->lock);
+                       stop_one_cpu_nowait(rq->cpu, push_cpu_stop,
+                                           push_task, &rq->push_work);
+                       raw_spin_lock(&rq->lock);
+               }
+
+               return 0;
+       }
+
        if (WARN_ON(next_task == rq->curr))
                return 0;
 
        deactivate_task(rq, next_task, 0);
        set_task_cpu(next_task, lowest_rq->cpu);
        activate_task(lowest_rq, next_task, 0);
-       ret = 1;
-
        resched_curr(lowest_rq);
+       ret = 1;
 
        double_unlock_balance(rq, lowest_rq);
-
 out:
        put_task_struct(next_task);
 
 static void push_rt_tasks(struct rq *rq)
 {
        /* push_rt_task will return true if it moved an RT */
-       while (push_rt_task(rq))
+       while (push_rt_task(rq, false))
                ;
 }
 
         */
        if (has_pushable_tasks(rq)) {
                raw_spin_lock(&rq->lock);
-               push_rt_tasks(rq);
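+               /*
+                * This runs on behalf of a remote CPU that wants to pull;
+                * pass pull=true so that, if the next pushable task is
+                * migration disabled, the currently running task is pushed
+                * out of its way instead.
+                */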
+               while (push_rt_task(rq, true))
+                       ;
                raw_spin_unlock(&rq->lock);
        }
 
 {
        int this_cpu = this_rq->cpu, cpu;
        bool resched = false;
-       struct task_struct *p;
+       struct task_struct *p, *push_task;
        struct rq *src_rq;
        int rt_overload_count = rt_overloaded(this_rq);
 
                 * double_lock_balance, and another CPU could
                 * alter this_rq
                 */
+               push_task = NULL;
                double_lock_balance(this_rq, src_rq);
 
                /*
                        if (p->prio < src_rq->curr->prio)
                                goto skip;
 
-                       resched = true;
-
-                       deactivate_task(src_rq, p, 0);
-                       set_task_cpu(p, this_cpu);
-                       activate_task(this_rq, p, 0);
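+                       /*
+                        * As in pull_dl_task(): a migration-disabled task
+                        * cannot be pulled, so push the source CPU's current
+                        * task away instead.
+                        */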
+                       if (is_migration_disabled(p)) {
+                               push_task = get_push_task(src_rq);
+                       } else {
+                               deactivate_task(src_rq, p, 0);
+                               set_task_cpu(p, this_cpu);
+                               activate_task(this_rq, p, 0);
+                               resched = true;
+                       }
                        /*
                         * We continue with the search, just in
                         * case there's an even higher prio task
                }
 skip:
                double_unlock_balance(this_rq, src_rq);
+
+               if (push_task) {
+                       raw_spin_unlock(&this_rq->lock);
+                       stop_one_cpu_nowait(src_rq->cpu, push_cpu_stop,
+                                           push_task, &src_rq->push_work);
+                       raw_spin_lock(&this_rq->lock);
+               }
        }
 
        if (resched)
        .rq_offline             = rq_offline_rt,
        .task_woken             = task_woken_rt,
        .switched_from          = switched_from_rt,
+       .find_lock_rq           = find_lock_lowest_rq,
 #endif
 
        .task_tick              = task_tick_rt,
 
 #if defined(CONFIG_PREEMPT_RT) && defined(CONFIG_SMP)
        unsigned int            nr_pinned;
 #endif
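+       /* Serialize push_cpu_stop() work on this rq; see get_push_task(). */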
+       unsigned int            push_busy;
+       struct cpu_stop_work    push_work;
 };
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
 #endif
 }
 
+#define MDF_PUSH       0x01
+
+static inline bool is_migration_disabled(struct task_struct *p)
+{
+#if defined(CONFIG_SMP) && defined(CONFIG_PREEMPT_RT)
+       return p->migration_disabled;
+#else
+       return false;
+#endif
+}
 
 #ifdef CONFIG_SCHED_SMT
 extern void __update_idle_core(struct rq *rq);
 
        void (*rq_online)(struct rq *rq);
        void (*rq_offline)(struct rq *rq);
+
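+       /*
+        * Find and lock a runqueue @p can be pushed to; may drop and retake
+        * @rq->lock along the way (double_lock_balance()).
+        */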
+       struct rq *(*find_lock_rq)(struct task_struct *p, struct rq *rq);
 #endif
 
        void (*task_tick)(struct rq *rq, struct task_struct *p, int queued);
 
 extern void set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_mask, u32 flags);
 
+static inline struct task_struct *get_push_task(struct rq *rq)
+{
+       struct task_struct *p = rq->curr;
+
+       lockdep_assert_held(&rq->lock);
+
+       if (rq->push_busy)
+               return NULL;
+
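+       /* A task affine to a single CPU has nowhere else to go. */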
+       if (p->nr_cpus_allowed == 1)
+               return NULL;
+
+       rq->push_busy = true;
+       return get_task_struct(p);
+}
+
+extern int push_cpu_stop(void *arg);
+
 #endif
 
 #ifdef CONFIG_CPU_IDLE