* don't have to special case any state tracking for it.
         */
        NR_ONCPU,
-       NR_PSI_TASK_COUNTS = 4,
+       /*
+        * For IO and CPU stalls the presence of running/oncpu tasks
+        * in the domain means a partial rather than a full stall.
+        * For memory it's not so simple because of page reclaimers:
+        * they are running/oncpu while representing a stall. To tell
+        * whether a domain has productivity left or not, we need to
+        * distinguish between regular running (i.e. productive)
+        * threads and memstall ones.
+        */
+       NR_MEMSTALL_RUNNING,
+       NR_PSI_TASK_COUNTS = 5,
 };
 
 /* Task state bitmasks */
 #define TSK_MEMSTALL   (1 << NR_MEMSTALL)
 #define TSK_RUNNING    (1 << NR_RUNNING)
 #define TSK_ONCPU      (1 << NR_ONCPU)
+#define TSK_MEMSTALL_RUNNING   (1 << NR_MEMSTALL_RUNNING)
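
To preview how the new counter is used (this is the PSI_MEM_FULL condition that appears further down in this patch), here is a toy standalone version of the test; mem_full() is a hypothetical name used for illustration only, not kernel code:

    #include <stdbool.h>
    #include <stdio.h>

    /* Toy version of the new PSI_MEM_FULL test: memory is fully stalled
     * when someone is in memstall and every runnable task is a reclaimer. */
    static bool mem_full(unsigned int memstall, unsigned int running,
                         unsigned int memstall_running)
    {
            return memstall && running == memstall_running;
    }

    int main(void)
    {
            /* A task waits on memory while a regular thread runs: partial stall */
            printf("%d\n", mem_full(1, 1, 0));      /* prints 0 */
            /* The only runnable task is itself a reclaimer: full stall */
            printf("%d\n", mem_full(1, 1, 1));      /* prints 1 */
            return 0;
    }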
 
 /* Resources that workloads could be stalled on */
 enum psi_res {
 
  * delayed on that resource such that nobody is advancing and the CPU
  * goes idle. This leaves both workload and CPU unproductive.
  *
- * Naturally, the FULL state doesn't exist for the CPU resource at the
- * system level, but exist at the cgroup level, means all non-idle tasks
- * in a cgroup are delayed on the CPU resource which used by others outside
- * of the cgroup or throttled by the cgroup cpu.max configuration.
- *
  *     SOME = nr_delayed_tasks != 0
- *     FULL = nr_delayed_tasks != 0 && nr_running_tasks == 0
+ *     FULL = nr_delayed_tasks != 0 && nr_productive_tasks == 0
+ *
+ * What it means for a task to be productive is defined differently
+ * for each resource. For IO, productive means a running task. For
+ * memory, productive means a running task that isn't a reclaimer. For
+ * CPU, productive means an oncpu task.
+ *
+ * Naturally, the FULL state doesn't exist for the CPU resource at the
+ * system level, but exists at the cgroup level. At the cgroup level,
+ * FULL means all non-idle tasks in the cgroup are delayed on the CPU
+ * resource which is being used by others outside of the cgroup or
+ * throttled by the cgroup cpu.max configuration.
  *
  * The percentage of wallclock time spent in those compound stall
  * states gives pressure numbers between 0 and 100 for each resource,
  *
  *     threads = min(nr_nonidle_tasks, nr_cpus)
  *        SOME = min(nr_delayed_tasks / threads, 1)
- *        FULL = (threads - min(nr_running_tasks, threads)) / threads
+ *        FULL = (threads - min(nr_productive_tasks, threads)) / threads
  *
  * For the 257 number crunchers on 256 CPUs, this yields:
  *
  *     threads = min(257, 256)
  *        SOME = min(1 / 256, 1)             = 0.4%
- *        FULL = (256 - min(257, 256)) / 256 = 0%
+ *        FULL = (256 - min(256, 256)) / 256 = 0%
  *
  * For the 1 out of 4 memory-delayed tasks, this yields:
  *
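
To make the threads/SOME/FULL arithmetic above easy to check, the following small standalone helper evaluates the same formulas for one snapshot of counts. It is illustrative userspace C, not part of the patch; psi_snapshot() and its parameter names are made up:

    #include <stdio.h>

    /* Evaluate SOME and FULL for one snapshot, per the formulas above. */
    static void psi_snapshot(unsigned int nr_cpus, unsigned int nr_nonidle,
                             unsigned int nr_delayed, unsigned int nr_productive)
    {
            unsigned int threads = nr_nonidle < nr_cpus ? nr_nonidle : nr_cpus;
            unsigned int delayed, productive;

            if (!threads)
                    return;
            delayed = nr_delayed < threads ? nr_delayed : threads;
            productive = nr_productive < threads ? nr_productive : threads;
            /* SOME = min(nr_delayed / threads, 1)
             * FULL = (threads - min(nr_productive, threads)) / threads */
            printf("SOME=%.1f%% FULL=%.1f%%\n",
                   100.0 * delayed / threads,
                   100.0 * (threads - productive) / threads);
    }

    int main(void)
    {
            /* 257 number crunchers on 256 CPUs: one is always waiting for a CPU */
            psi_snapshot(256, 257, 1, 256);         /* SOME=0.4% FULL=0.0% */
            return 0;
    }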
  * For each runqueue, we track:
  *
  *        tSOME[cpu] = time(nr_delayed_tasks[cpu] != 0)
- *        tFULL[cpu] = time(nr_delayed_tasks[cpu] && !nr_running_tasks[cpu])
+ *        tFULL[cpu] = time(nr_delayed_tasks[cpu] && !nr_productive_tasks[cpu])
  *     tNONIDLE[cpu] = time(nr_nonidle_tasks[cpu] != 0)
  *
  * and then periodically aggregate:
        case PSI_MEM_SOME:
                return unlikely(tasks[NR_MEMSTALL]);
        case PSI_MEM_FULL:
-               return unlikely(tasks[NR_MEMSTALL] && !tasks[NR_RUNNING]);
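+               /*
+                * NR_MEMSTALL_RUNNING is never larger than NR_RUNNING, so
+                * equality means every runnable task in the group is a
+                * reclaimer: no productive threads are left, and memory is
+                * fully stalled even though the CPU may be busy.
+                */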
+               return unlikely(tasks[NR_MEMSTALL] &&
+                       tasks[NR_RUNNING] == tasks[NR_MEMSTALL_RUNNING]);
        case PSI_CPU_SOME:
                return unlikely(tasks[NR_RUNNING] > tasks[NR_ONCPU]);
        case PSI_CPU_FULL:
                if (groupc->tasks[t]) {
                        groupc->tasks[t]--;
                } else if (!psi_bug) {
-                       printk_deferred(KERN_ERR "psi: task underflow! cpu=%d t=%d tasks=[%u %u %u %u] clear=%x set=%x\n",
+                       printk_deferred(KERN_ERR "psi: task underflow! cpu=%d t=%d tasks=[%u %u %u %u %u] clear=%x set=%x\n",
                                        cpu, t, groupc->tasks[0],
                                        groupc->tasks[1], groupc->tasks[2],
-                                       groupc->tasks[3], clear, set);
+                                       groupc->tasks[3], groupc->tasks[4],
+                                       clear, set);
                        psi_bug = 1;
                }
        }
                int clear = TSK_ONCPU, set = 0;
 
                /*
-                * When we're going to sleep, psi_dequeue() lets us handle
-                * TSK_RUNNING and TSK_IOWAIT here, where we can combine it
-                * with TSK_ONCPU and save walking common ancestors twice.
+                * When we're going to sleep, psi_dequeue() lets us
+                * handle TSK_RUNNING, TSK_MEMSTALL_RUNNING and
+                * TSK_IOWAIT here, where we can combine them with
+                * TSK_ONCPU and save walking common ancestors twice.
                 */
                if (sleep) {
                        clear |= TSK_RUNNING;
+                       if (prev->in_memstall)
+                               clear |= TSK_MEMSTALL_RUNNING;
                        if (prev->in_iowait)
                                set |= TSK_IOWAIT;
                }
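
The wakeup path is not visible in this hunk, but the enqueue side needs the mirror-image change so the counter stays balanced. A minimal sketch, assuming the psi_enqueue() helper in kernel/sched/stats.h keeps its existing shape; only the branch that ORs in TSK_MEMSTALL_RUNNING is the new part:

    static inline void psi_enqueue(struct task_struct *p, bool wakeup)
    {
            int clear = 0, set = TSK_RUNNING;

            if (static_branch_likely(&psi_disabled))
                    return;

            /* A runnable task in memstall is a reclaimer: count it separately */
            if (p->in_memstall)
                    set |= TSK_MEMSTALL_RUNNING;

            if (!wakeup || p->sched_psi_wake_requeue) {
                    if (p->in_memstall)
                            set |= TSK_MEMSTALL;
                    if (p->sched_psi_wake_requeue)
                            p->sched_psi_wake_requeue = 0;
            } else {
                    if (p->in_iowait)
                            clear |= TSK_IOWAIT;
            }

            psi_task_change(p, clear, set);
    }

Symmetrically, the non-sleep dequeue path would clear TSK_MEMSTALL_RUNNING together with TSK_MEMSTALL for a task in memstall.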
        rq = this_rq_lock_irq(&rf);
 
        current->in_memstall = 1;
-       psi_task_change(current, 0, TSK_MEMSTALL);
+       psi_task_change(current, 0, TSK_MEMSTALL | TSK_MEMSTALL_RUNNING);
 
        rq_unlock_irq(rq, &rf);
 }
        rq = this_rq_lock_irq(&rf);
 
        current->in_memstall = 0;
-       psi_task_change(current, TSK_MEMSTALL, 0);
+       psi_task_change(current, TSK_MEMSTALL | TSK_MEMSTALL_RUNNING, 0);
 
        rq_unlock_irq(rq, &rf);
 }
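
For context, psi_memstall_enter()/psi_memstall_leave() are the public bracket around memory-stall sections. A generic usage sketch, not code from the patch (do_reclaim_work() is a hypothetical stand-in for the stalled section):

    unsigned long pflags;

    psi_memstall_enter(&pflags);
    do_reclaim_work();      /* e.g. direct reclaim, compaction, a thrashing wait */
    psi_memstall_leave(&pflags);

With this patch, the task is accounted under NR_MEMSTALL_RUNNING for as long as it stays runnable inside that bracket, so a CPU that is busy doing nothing but reclaim is reported as a full memory stall rather than a partial one.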