www.infradead.org Git - nvme.git/commitdiff
psi: Fix race when task wakes up before psi_sched_switch() adjusts flags
author Chengming Zhou <chengming.zhou@linux.dev>
Fri, 27 Dec 2024 06:19:41 +0000 (06:19 +0000)
committer Peter Zijlstra <peterz@infradead.org>
Mon, 13 Jan 2025 13:10:26 +0000 (14:10 +0100)
When running hackbench in a cgroup with bandwidth throttling enabled,
the following PSI splat was observed:

    psi: inconsistent task state! task=1831:hackbench cpu=8 psi_flags=14 clear=0 set=4

When investigating the series of events leading up to the splat,
the following sequence was observed:

    [008] d..2.: sched_switch: ... ==> next_comm=hackbench next_pid=1831 next_prio=120
        ...
    [008] dN.2.: dequeue_entity(task delayed): task=hackbench pid=1831 cfs_rq->throttled=0
    [008] dN.2.: pick_task_fair: check_cfs_rq_runtime() throttled cfs_rq on CPU8
    # CPU8 goes into newidle balance and releases the rq lock
        ...
    # CPU15 on same LLC Domain is trying to wakeup hackbench(pid=1831)
    [015] d..4.: psi_flags_change: psi: task state: task=1831:hackbench cpu=8 psi_flags=14 clear=0 set=4 final=14 # Splat (cfs_rq->throttled=1)
    [015] d..4.: sched_wakeup: comm=hackbench pid=1831 prio=120 target_cpu=008 # Task has woken on a throttled hierarchy
    [008] d..2.: sched_switch: prev_comm=hackbench prev_pid=1831 prev_prio=120 prev_state=S ==> ...

psi_dequeue() relies on psi_sched_switch() to set the correct PSI flags
for the blocked entity; however, with the introduction of DELAY_DEQUEUE,
the blocked task can wake up when newidle balance drops the runqueue lock
during __schedule().

If a task wakes before psi_sched_switch() adjusts the PSI flags, skip
any modifications in psi_enqueue() which would still see the flags of a
running task and not a blocked one. Instead, rely on psi_sched_switch()
to do the right thing.

Since the status returned by try_to_block_task() may no longer be true
by the time __schedule() reaches psi_sched_switch(), check whether the
task is blocked using a combination of task_on_rq_queued() and
p->se.sched_delayed checks.

[ prateek: Commit message, testing, early bailout in psi_enqueue() ]

Fixes: 152e11f6df29 ("sched/fair: Implement delayed dequeue") # 1a6151017ee5
Signed-off-by: Chengming Zhou <chengming.zhou@linux.dev>
Signed-off-by: K Prateek Nayak <kprateek.nayak@amd.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Chengming Zhou <chengming.zhou@linux.dev>
Link: https://lore.kernel.org/r/20241227061941.2315-1-kprateek.nayak@amd.com
kernel/sched/core.c
kernel/sched/stats.h

index 22dfcd3e92ed2fd30b7992cf35219580971ef9ea..4365b479e3458f23965ed092985df1e78f58d35c 100644 (file)
@@ -6645,7 +6645,6 @@ static void __sched notrace __schedule(int sched_mode)
         * as a preemption by schedule_debug() and RCU.
         */
        bool preempt = sched_mode > SM_NONE;
-       bool block = false;
        unsigned long *switch_count;
        unsigned long prev_state;
        struct rq_flags rf;
@@ -6706,7 +6705,7 @@ static void __sched notrace __schedule(int sched_mode)
                        goto picked;
                }
        } else if (!preempt && prev_state) {
-               block = try_to_block_task(rq, prev, prev_state);
+               try_to_block_task(rq, prev, prev_state);
                switch_count = &prev->nvcsw;
        }
 
@@ -6752,7 +6751,8 @@ picked:
 
                migrate_disable_switch(rq, prev);
                psi_account_irqtime(rq, prev, next);
-               psi_sched_switch(prev, next, block);
+               psi_sched_switch(prev, next, !task_on_rq_queued(prev) ||
+                                            prev->se.sched_delayed);
 
                trace_sched_switch(preempt, prev, next, prev_state);
 
index 8ee0add5a48a80f77d938f33ad4d8dfb9255429b..6ade91bce63ee31afaaad0176072eb7ca3931462 100644 (file)
@@ -138,6 +138,10 @@ static inline void psi_enqueue(struct task_struct *p, int flags)
        if (flags & ENQUEUE_RESTORE)
                return;
 
+       /* psi_sched_switch() will handle the flags */
+       if (task_on_cpu(task_rq(p), p))
+               return;
+
        if (p->se.sched_delayed) {
                /* CPU migration of "sleeping" task */
                SCHED_WARN_ON(!(flags & ENQUEUE_MIGRATED));