* don't have to special case any state tracking for it.
         */
        NR_ONCPU,
-       NR_PSI_TASK_COUNTS = 4,
+       /*
+        * For IO and CPU stalls the presence of running/oncpu tasks
+        * in the domain means a partial rather than a full stall.
+        * For memory it's not so simple because of page reclaimers:
+        * they are running/oncpu while representing a stall. To tell
+        * whether a domain has productivity left or not, we need to
+        * distinguish between regular running (i.e. productive)
+        * threads and memstall ones.
+        */
+       NR_MEMSTALL_RUNNING,
+       NR_PSI_TASK_COUNTS = 5,
 };
 
 /* Task state bitmasks */
 #define TSK_MEMSTALL   (1 << NR_MEMSTALL)
 #define TSK_RUNNING    (1 << NR_RUNNING)
 #define TSK_ONCPU      (1 << NR_ONCPU)
+#define TSK_MEMSTALL_RUNNING   (1 << NR_MEMSTALL_RUNNING)
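
To preview how the new counter is used (this is the PSI_MEM_FULL condition that appears further down in this patch), here is a toy standalone version of the test; mem_full() is a hypothetical name used for illustration only, not kernel code:

    #include <stdbool.h>
    #include <stdio.h>

    /* Toy version of the new PSI_MEM_FULL test: memory is fully stalled
     * when someone is in memstall and every runnable task is a reclaimer. */
    static bool mem_full(unsigned int memstall, unsigned int running,
                         unsigned int memstall_running)
    {
            return memstall && running == memstall_running;
    }

    int main(void)
    {
            /* A task waits on memory while a regular thread runs: partial stall */
            printf("%d\n", mem_full(1, 1, 0));      /* prints 0 */
            /* The only runnable task is itself a reclaimer: full stall */
            printf("%d\n", mem_full(1, 1, 1));      /* prints 1 */
            return 0;
    }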
 
 /* Resources that workloads could be stalled on */
 enum psi_res {
 
  * delayed on that resource such that nobody is advancing and the CPU
  * goes idle. This leaves both workload and CPU unproductive.
  *
- * Naturally, the FULL state doesn't exist for the CPU resource at the
- * system level, but exist at the cgroup level, means all non-idle tasks
- * in a cgroup are delayed on the CPU resource which used by others outside
- * of the cgroup or throttled by the cgroup cpu.max configuration.
- *
  *     SOME = nr_delayed_tasks != 0
- *     FULL = nr_delayed_tasks != 0 && nr_running_tasks == 0
+ *     FULL = nr_delayed_tasks != 0 && nr_productive_tasks == 0
+ *
+ * What it means for a task to be productive is defined differently
+ * for each resource. For IO, productive means a running task. For
+ * memory, productive means a running task that isn't a reclaimer. For
+ * CPU, productive means an oncpu task.
+ *
+ * Naturally, the FULL state doesn't exist for the CPU resource at the
+ * system level, but exists at the cgroup level. At the cgroup level,
+ * FULL means all non-idle tasks in the cgroup are delayed on the CPU
+ * resource which is being used by others outside of the cgroup or
+ * throttled by the cgroup cpu.max configuration.
  *
  * The percentage of wallclock time spent in those compound stall
  * states gives pressure numbers between 0 and 100 for each resource,
  *
  *     threads = min(nr_nonidle_tasks, nr_cpus)
  *        SOME = min(nr_delayed_tasks / threads, 1)
- *        FULL = (threads - min(nr_running_tasks, threads)) / threads
+ *        FULL = (threads - min(nr_productive_tasks, threads)) / threads
  *
  * For the 257 number crunchers on 256 CPUs, this yields:
  *
  *     threads = min(257, 256)
  *        SOME = min(1 / 256, 1)             = 0.4%
- *        FULL = (256 - min(257, 256)) / 256 = 0%
+ *        FULL = (256 - min(256, 256)) / 256 = 0%
  *
  * For the 1 out of 4 memory-delayed tasks, this yields:
  *
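
To make the threads/SOME/FULL arithmetic above easy to check, the following small standalone helper evaluates the same formulas for one snapshot of counts. It is illustrative userspace C, not part of the patch; psi_snapshot() and its parameter names are made up:

    #include <stdio.h>

    /* Evaluate SOME and FULL for one snapshot, per the formulas above. */
    static void psi_snapshot(unsigned int nr_cpus, unsigned int nr_nonidle,
                             unsigned int nr_delayed, unsigned int nr_productive)
    {
            unsigned int threads = nr_nonidle < nr_cpus ? nr_nonidle : nr_cpus;
            unsigned int delayed, productive;

            if (!threads)
                    return;
            delayed = nr_delayed < threads ? nr_delayed : threads;
            productive = nr_productive < threads ? nr_productive : threads;
            /* SOME = min(nr_delayed / threads, 1)
             * FULL = (threads - min(nr_productive, threads)) / threads */
            printf("SOME=%.1f%% FULL=%.1f%%\n",
                   100.0 * delayed / threads,
                   100.0 * (threads - productive) / threads);
    }

    int main(void)
    {
            /* 257 number crunchers on 256 CPUs: one is always waiting for a CPU */
            psi_snapshot(256, 257, 1, 256);         /* SOME=0.4% FULL=0.0% */
            return 0;
    }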
  * For each runqueue, we track:
  *
  *        tSOME[cpu] = time(nr_delayed_tasks[cpu] != 0)
- *        tFULL[cpu] = time(nr_delayed_tasks[cpu] && !nr_running_tasks[cpu])
+ *        tFULL[cpu] = time(nr_delayed_tasks[cpu] && !nr_productive_tasks[cpu])
  *     tNONIDLE[cpu] = time(nr_nonidle_tasks[cpu] != 0)
  *
  * and then periodically aggregate:
        case PSI_MEM_SOME:
                return unlikely(tasks[NR_MEMSTALL]);
        case PSI_MEM_FULL:
-               return unlikely(tasks[NR_MEMSTALL] && !tasks[NR_RUNNING]);
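+               /*
+                * NR_MEMSTALL_RUNNING is never larger than NR_RUNNING, so
+                * equality means every runnable task in the group is a
+                * reclaimer: no productive threads are left, and memory is
+                * fully stalled even though the CPU may be busy.
+                */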
+               return unlikely(tasks[NR_MEMSTALL] &&
+                       tasks[NR_RUNNING] == tasks[NR_MEMSTALL_RUNNING]);
        case PSI_CPU_SOME:
                return unlikely(tasks[NR_RUNNING] > tasks[NR_ONCPU]);
        case PSI_CPU_FULL:
                if (groupc->tasks[t]) {
                        groupc->tasks[t]--;
                } else if (!psi_bug) {
-                       printk_deferred(KERN_ERR "psi: task underflow! cpu=%d t=%d tasks=[%u %u %u %u] clear=%x set=%x\n",
+                       printk_deferred(KERN_ERR "psi: task underflow! cpu=%d t=%d tasks=[%u %u %u %u %u] clear=%x set=%x\n",
                                        cpu, t, groupc->tasks[0],
                                        groupc->tasks[1], groupc->tasks[2],
-                                       groupc->tasks[3], clear, set);
+                                       groupc->tasks[3], groupc->tasks[4],
+                                       clear, set);
                        psi_bug = 1;
                }
        }
                int clear = TSK_ONCPU, set = 0;
 
                /*
-                * When we're going to sleep, psi_dequeue() lets us handle
-                * TSK_RUNNING and TSK_IOWAIT here, where we can combine it
-                * with TSK_ONCPU and save walking common ancestors twice.
+                * When we're going to sleep, psi_dequeue() lets us
+                * handle TSK_RUNNING, TSK_MEMSTALL_RUNNING and
+                * TSK_IOWAIT here, where we can combine them with
+                * TSK_ONCPU and save walking common ancestors twice.
                 */
                if (sleep) {
                        clear |= TSK_RUNNING;
+                       if (prev->in_memstall)
+                               clear |= TSK_MEMSTALL_RUNNING;
                        if (prev->in_iowait)
                                set |= TSK_IOWAIT;
                }
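
The wakeup path is not visible in this hunk, but the enqueue side needs the mirror-image change so the counter stays balanced. A minimal sketch, assuming the psi_enqueue() helper in kernel/sched/stats.h keeps its existing shape; only the branch that ORs in TSK_MEMSTALL_RUNNING is the new part:

    static inline void psi_enqueue(struct task_struct *p, bool wakeup)
    {
            int clear = 0, set = TSK_RUNNING;

            if (static_branch_likely(&psi_disabled))
                    return;

            /* A runnable task in memstall is a reclaimer: count it separately */
            if (p->in_memstall)
                    set |= TSK_MEMSTALL_RUNNING;

            if (!wakeup || p->sched_psi_wake_requeue) {
                    if (p->in_memstall)
                            set |= TSK_MEMSTALL;
                    if (p->sched_psi_wake_requeue)
                            p->sched_psi_wake_requeue = 0;
            } else {
                    if (p->in_iowait)
                            clear |= TSK_IOWAIT;
            }

            psi_task_change(p, clear, set);
    }

Symmetrically, the non-sleep dequeue path would clear TSK_MEMSTALL_RUNNING together with TSK_MEMSTALL for a task in memstall.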
        rq = this_rq_lock_irq(&rf);
 
        current->in_memstall = 1;
-       psi_task_change(current, 0, TSK_MEMSTALL);
+       psi_task_change(current, 0, TSK_MEMSTALL | TSK_MEMSTALL_RUNNING);
 
        rq_unlock_irq(rq, &rf);
 }
        rq = this_rq_lock_irq(&rf);
 
        current->in_memstall = 0;
-       psi_task_change(current, TSK_MEMSTALL, 0);
+       psi_task_change(current, TSK_MEMSTALL | TSK_MEMSTALL_RUNNING, 0);
 
        rq_unlock_irq(rq, &rf);
 }
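
For context, psi_memstall_enter()/psi_memstall_leave() are the public bracket around memory-stall sections. A generic usage sketch, not code from the patch (do_reclaim_work() is a hypothetical stand-in for the stalled section):

    unsigned long pflags;

    psi_memstall_enter(&pflags);
    do_reclaim_work();      /* e.g. direct reclaim, compaction, a thrashing wait */
    psi_memstall_leave(&pflags);

With this patch, the task is accounted under NR_MEMSTALL_RUNNING for as long as it stays runnable inside that bracket, so a CPU that is busy doing nothing but reclaim is reported as a full memory stall rather than a partial one.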