www.infradead.org Git - users/hch/misc.git/commitdiff
sched/core: Disable page allocation in task_tick_mm_cid()
author Waiman Long <longman@redhat.com>
Thu, 10 Oct 2024 01:44:32 +0000 (21:44 -0400)
committer Peter Zijlstra <peterz@infradead.org>
Fri, 11 Oct 2024 08:49:32 +0000 (10:49 +0200)
With KASAN and PREEMPT_RT enabled, calling task_work_add() in
task_tick_mm_cid() may cause the following splat.

[   63.696416] BUG: sleeping function called from invalid context at kernel/locking/spinlock_rt.c:48
[   63.696416] in_atomic(): 1, irqs_disabled(): 1, non_block: 0, pid: 610, name: modprobe
[   63.696416] preempt_count: 10001, expected: 0
[   63.696416] RCU nest depth: 1, expected: 1

This problem is caused by the following call trace.

  sched_tick() [ acquire rq->__lock ]
   -> task_tick_mm_cid()
    -> task_work_add()
     -> __kasan_record_aux_stack()
      -> kasan_save_stack()
       -> stack_depot_save_flags()
        -> alloc_pages_mpol_noprof()
         -> __alloc_pages_noprof()
          -> get_page_from_freelist()
           -> rmqueue()
            -> rmqueue_pcplist()
             -> __rmqueue_pcplist()
              -> rmqueue_bulk()
               -> rt_spin_lock()

The rq lock is a raw_spinlock_t. We can't sleep while holding
it. IOW, we can't call alloc_pages() in stack_depot_save_flags().

The task_tick_mm_cid() function with its task_work_add() call was
introduced by commit 223baf9d17f2 ("sched: Fix performance regression
introduced by mm_cid") in the v6.4 kernel.

Fortunately, there is a kasan_record_aux_stack_noalloc() variant that
calls stack_depot_save_flags() while not allowing it to allocate
new pages.  To allow task_tick_mm_cid() to use task_work without
page allocation, a new TWAF_NO_ALLOC flag is added to enable calling
kasan_record_aux_stack_noalloc() instead of kasan_record_aux_stack()
if set. The task_tick_mm_cid() function is modified to add this new flag.

The possible downside is a missing stack trace in a KASAN report when
recording it would have required a new page allocation and task_work_add()
was called with TWAF_NO_ALLOC, which should be rare.

Fixes: 223baf9d17f2 ("sched: Fix performance regression introduced by mm_cid")
Signed-off-by: Waiman Long <longman@redhat.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lkml.kernel.org/r/20241010014432.194742-1-longman@redhat.com
include/linux/task_work.h
kernel/sched/core.c
kernel/task_work.c

index cf5e7e891a776289105ad72237408a7613b13240..2964171856e00d2c97a2af04c44e0f3aa16c2ce6 100644 (file)
@@ -14,11 +14,14 @@ init_task_work(struct callback_head *twork, task_work_func_t func)
 }
 
 enum task_work_notify_mode {
-       TWA_NONE,
+       TWA_NONE = 0,
        TWA_RESUME,
        TWA_SIGNAL,
        TWA_SIGNAL_NO_IPI,
        TWA_NMI_CURRENT,
+
+       TWA_FLAGS = 0xff00,
+       TWAF_NO_ALLOC = 0x0100,
 };
 
 static inline bool task_work_pending(struct task_struct *task)
index 43e453ab7e20f81ea239919af19ef20c5358c93d..0259301e572e827b16dbcd57d0fe5790abca0360 100644 (file)
@@ -10458,7 +10458,9 @@ void task_tick_mm_cid(struct rq *rq, struct task_struct *curr)
                return;
        if (time_before(now, READ_ONCE(curr->mm->mm_cid_next_scan)))
                return;
-       task_work_add(curr, work, TWA_RESUME);
+
+       /* No page allocation under rq lock */
+       task_work_add(curr, work, TWA_RESUME | TWAF_NO_ALLOC);
 }
 
 void sched_mm_cid_exit_signals(struct task_struct *t)
index 5d14d639ac71b54a24dac49acd02f6c2c7e58e04..c969f1f26be58a87babfb7f05464e612bec1bc98 100644 (file)
@@ -55,15 +55,26 @@ int task_work_add(struct task_struct *task, struct callback_head *work,
                  enum task_work_notify_mode notify)
 {
        struct callback_head *head;
+       int flags = notify & TWA_FLAGS;
 
+       notify &= ~TWA_FLAGS;
        if (notify == TWA_NMI_CURRENT) {
                if (WARN_ON_ONCE(task != current))
                        return -EINVAL;
                if (!IS_ENABLED(CONFIG_IRQ_WORK))
                        return -EINVAL;
        } else {
-               /* record the work call stack in order to print it in KASAN reports */
-               kasan_record_aux_stack(work);
+               /*
+                * Record the work call stack in order to print it in KASAN
+                * reports.
+                *
+                * Note that stack allocation can fail if TWAF_NO_ALLOC flag
+                * is set and new page is needed to expand the stack buffer.
+                */
+               if (flags & TWAF_NO_ALLOC)
+                       kasan_record_aux_stack_noalloc(work);
+               else
+                       kasan_record_aux_stack(work);
        }
 
        head = READ_ONCE(task->task_works);