From 029c896c4105b7d74fcd88d99c5fbab2558fee89 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Mon, 24 Mar 2025 12:50:25 +0200 Subject: [PATCH 01/16] kernel.h: move PTR_IF() and u64_to_user_ptr() to util_macros.h While the natural choice of PTR_IF() is kconfig.h, the latter is too broad to include C code and actually the macro was moved out from there in the past. But kernel.h is neither a good choice for that. Move it to util_macros.h. Do the same for u64_to_user_ptr(). While moving, add necessary documentation. Link: https://lkml.kernel.org/r/20250324105228.775784-3-andriy.shevchenko@linux.intel.com Signed-off-by: Andy Shevchenko Cc: Alexandru Ardelean Cc: Ingo Molnar Signed-off-by: Andrew Morton --- include/linux/kernel.h | 10 +----- include/linux/util_macros.h | 66 +++++++++++++++++++++++++++++++++++++ 2 files changed, 67 insertions(+), 9 deletions(-) diff --git a/include/linux/kernel.h b/include/linux/kernel.h index 01bb0fac3667..1cce1f6410a9 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -33,6 +33,7 @@ #include #include #include +#include #include #include @@ -41,15 +42,6 @@ #define STACK_MAGIC 0xdeadbeef -#define PTR_IF(cond, ptr) ((cond) ? (ptr) : NULL) - -#define u64_to_user_ptr(x) ( \ -{ \ - typecheck(u64, (x)); \ - (void __user *)(uintptr_t)(x); \ -} \ -) - struct completion; struct user; diff --git a/include/linux/util_macros.h b/include/linux/util_macros.h index 3b570b765b75..7b64fb597f85 100644 --- a/include/linux/util_macros.h +++ b/include/linux/util_macros.h @@ -79,6 +79,72 @@ (__fc_i); \ }) +/** + * PTR_IF - evaluate to @ptr if @cond is true, or to NULL otherwise. + * @cond: A conditional, usually in a form of IS_ENABLED(CONFIG_FOO) + * @ptr: A pointer to assign if @cond is true. + * + * PTR_IF(IS_ENABLED(CONFIG_FOO), ptr) evaluates to @ptr if CONFIG_FOO is set + * to 'y' or 'm', or to NULL otherwise. The (ptr) argument must be a pointer. + * + * The macro can be very useful to help compiler dropping dead code. + * + * For instance, consider the following:: + * + * #ifdef CONFIG_FOO_SUSPEND + * static int foo_suspend(struct device *dev) + * { + * ... + * } + * #endif + * + * static struct pm_ops foo_ops = { + * #ifdef CONFIG_FOO_SUSPEND + * .suspend = foo_suspend, + * #endif + * }; + * + * While this works, the foo_suspend() macro is compiled conditionally, + * only when CONFIG_FOO_SUSPEND is set. This is problematic, as there could + * be a build bug in this function, we wouldn't have a way to know unless + * the configuration option is set. + * + * An alternative is to declare foo_suspend() always, but mark it + * as __maybe_unused. This works, but the __maybe_unused attribute + * is required to instruct the compiler that the function may not + * be referenced anywhere, and is safe to remove without making + * a fuss about it. This makes the programmer responsible for tagging + * the functions that can be garbage-collected. + * + * With the macro it is possible to write the following: + * + * static int foo_suspend(struct device *dev) + * { + * ... + * } + * + * static struct pm_ops foo_ops = { + * .suspend = PTR_IF(IS_ENABLED(CONFIG_FOO_SUSPEND), foo_suspend), + * }; + * + * The foo_suspend() function will now be automatically dropped by the + * compiler, and it does not require any specific attribute. + */ +#define PTR_IF(cond, ptr) ((cond) ? (ptr) : NULL) + +/** + * to_user_ptr - cast a pointer passed as u64 from user space to void __user * + * @x: The u64 value from user space, usually via IOCTL + * + * to_user_ptr() simply casts a pointer passed as u64 from user space to void + * __user * correctly. Using this lets us get rid of all the tiresome casts. + */ +#define u64_to_user_ptr(x) \ +({ \ + typecheck(u64, (x)); \ + (void __user *)(uintptr_t)(x); \ +}) + /** * is_insidevar - check if the @ptr points inside the @var memory range. * @ptr: the pointer to a memory address. -- 2.51.0 From ae5b3500856fb9a565470d83033bb91786ec16f1 Mon Sep 17 00:00:00 2001 From: Mario Limonciello Date: Thu, 20 Mar 2025 21:25:01 -0500 Subject: [PATCH 02/16] kstrtox: add support for enabled and disabled in kstrtobool() In some places in the kernel there is a design pattern for sysfs attributes to use kstrtobool() in store() and str_enabled_disabled() in show(). This is counterintuitive to interact with because kstrtobool() takes on/off but str_enabled_disabled() shows enabled/disabled. Some of those sysfs uses could switch to str_on_off() but for some attributes enabled/disabled really makes more sense. Add support for kstrtobool() to accept enabled/disabled. Link: https://lkml.kernel.org/r/20250321022538.1532445-1-superm1@kernel.org Signed-off-by: Mario Limonciello Signed-off-by: Andrew Morton --- lib/kstrtox.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/lib/kstrtox.c b/lib/kstrtox.c index d586e6af5e5a..bdde40cd69d7 100644 --- a/lib/kstrtox.c +++ b/lib/kstrtox.c @@ -351,6 +351,8 @@ int kstrtobool(const char *s, bool *res) return -EINVAL; switch (s[0]) { + case 'e': + case 'E': case 'y': case 'Y': case 't': @@ -358,6 +360,8 @@ int kstrtobool(const char *s, bool *res) case '1': *res = true; return 0; + case 'd': + case 'D': case 'n': case 'N': case 'f': -- 2.51.0 From 3eff6a3e574c2f704f9be1f4867c3d0b3d804435 Mon Sep 17 00:00:00 2001 From: Zijun Hu Date: Mon, 7 Apr 2025 19:44:16 +0800 Subject: [PATCH 03/16] errseq: eliminate special limitation for macro MAX_ERRNO Current errseq implementation depends on a very special precondition that macro MAX_ERRNO must be (2^n - 1). Eliminate the limitation by - redefining macro ERRSEQ_SHIFT - defining a new macro ERRNO_MASK instead of MAX_ERRNO for errno mask. There is no plan to change the value of MAX_ERRNO, but this makes the implementation more generic and eliminates the BUILD_BUG_ON(). Link: https://lkml.kernel.org/r/20250407-improve_errseq-v1-1-7b27cbeb8298@quicinc.com Signed-off-by: Zijun Hu Reviewed-by: Jeff Layton Signed-off-by: Andrew Morton --- lib/errseq.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/lib/errseq.c b/lib/errseq.c index 93e9b94358dc..13a2581c5a87 100644 --- a/lib/errseq.c +++ b/lib/errseq.c @@ -34,11 +34,14 @@ */ /* The low bits are designated for error code (max of MAX_ERRNO) */ -#define ERRSEQ_SHIFT ilog2(MAX_ERRNO + 1) +#define ERRSEQ_SHIFT (ilog2(MAX_ERRNO) + 1) /* This bit is used as a flag to indicate whether the value has been seen */ #define ERRSEQ_SEEN (1 << ERRSEQ_SHIFT) +/* Leverage macro ERRSEQ_SEEN to define errno mask macro here */ +#define ERRNO_MASK (ERRSEQ_SEEN - 1) + /* The lowest bit of the counter */ #define ERRSEQ_CTR_INC (1 << (ERRSEQ_SHIFT + 1)) @@ -60,8 +63,6 @@ errseq_t errseq_set(errseq_t *eseq, int err) { errseq_t cur, old; - /* MAX_ERRNO must be able to serve as a mask */ - BUILD_BUG_ON_NOT_POWER_OF_2(MAX_ERRNO + 1); /* * Ensure the error code actually fits where we want it to go. If it @@ -79,7 +80,7 @@ errseq_t errseq_set(errseq_t *eseq, int err) errseq_t new; /* Clear out error bits and set new error */ - new = (old & ~(MAX_ERRNO|ERRSEQ_SEEN)) | -err; + new = (old & ~(ERRNO_MASK | ERRSEQ_SEEN)) | -err; /* Only increment if someone has looked at it */ if (old & ERRSEQ_SEEN) @@ -148,7 +149,7 @@ int errseq_check(errseq_t *eseq, errseq_t since) if (likely(cur == since)) return 0; - return -(cur & MAX_ERRNO); + return -(cur & ERRNO_MASK); } EXPORT_SYMBOL(errseq_check); @@ -200,7 +201,7 @@ int errseq_check_and_advance(errseq_t *eseq, errseq_t *since) if (new != old) cmpxchg(eseq, old, new); *since = new; - err = -(new & MAX_ERRNO); + err = -(new & ERRNO_MASK); } return err; } -- 2.51.0 From fe6f600c43e0931f4fb5e1debb2311b7a6ebabd8 Mon Sep 17 00:00:00 2001 From: Mateusz Guzik Date: Wed, 19 Mar 2025 20:54:36 +0100 Subject: [PATCH 04/16] exit: combine work under lock in synchronize_group_exit() and coredump_task_exit() This reduces single-threaded overhead as it avoids one lock+irq trip on exit. It also improves scalability of spawning and killing threads within one process (just shy of 5% when doing it on 24 cores on my test jig). Both routines are moved below kcov and kmsan exit, which should be harmless. Link: https://lkml.kernel.org/r/20250319195436.1864415-1-mjguzik@gmail.com Signed-off-by: Mateusz Guzik Reviewed-by: Oleg Nesterov Signed-off-by: Andrew Morton --- kernel/exit.c | 68 ++++++++++++++++++++++++--------------------------- 1 file changed, 32 insertions(+), 36 deletions(-) diff --git a/kernel/exit.c b/kernel/exit.c index f1db86dcbeb1..921f28249b9b 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -415,44 +415,30 @@ kill_orphaned_pgrp(struct task_struct *tsk, struct task_struct *parent) } } -static void coredump_task_exit(struct task_struct *tsk) +static void coredump_task_exit(struct task_struct *tsk, + struct core_state *core_state) { - struct core_state *core_state; + struct core_thread self; + self.task = tsk; + if (self.task->flags & PF_SIGNALED) + self.next = xchg(&core_state->dumper.next, &self); + else + self.task = NULL; /* - * Serialize with any possible pending coredump. - * We must hold siglock around checking core_state - * and setting PF_POSTCOREDUMP. The core-inducing thread - * will increment ->nr_threads for each thread in the - * group without PF_POSTCOREDUMP set. + * Implies mb(), the result of xchg() must be visible + * to core_state->dumper. */ - spin_lock_irq(&tsk->sighand->siglock); - tsk->flags |= PF_POSTCOREDUMP; - core_state = tsk->signal->core_state; - spin_unlock_irq(&tsk->sighand->siglock); - if (core_state) { - struct core_thread self; - - self.task = current; - if (self.task->flags & PF_SIGNALED) - self.next = xchg(&core_state->dumper.next, &self); - else - self.task = NULL; - /* - * Implies mb(), the result of xchg() must be visible - * to core_state->dumper. - */ - if (atomic_dec_and_test(&core_state->nr_threads)) - complete(&core_state->startup); + if (atomic_dec_and_test(&core_state->nr_threads)) + complete(&core_state->startup); - for (;;) { - set_current_state(TASK_IDLE|TASK_FREEZABLE); - if (!self.task) /* see coredump_finish() */ - break; - schedule(); - } - __set_current_state(TASK_RUNNING); + for (;;) { + set_current_state(TASK_IDLE|TASK_FREEZABLE); + if (!self.task) /* see coredump_finish() */ + break; + schedule(); } + __set_current_state(TASK_RUNNING); } #ifdef CONFIG_MEMCG @@ -876,6 +862,7 @@ static void synchronize_group_exit(struct task_struct *tsk, long code) { struct sighand_struct *sighand = tsk->sighand; struct signal_struct *signal = tsk->signal; + struct core_state *core_state; spin_lock_irq(&sighand->siglock); signal->quick_threads--; @@ -885,7 +872,19 @@ static void synchronize_group_exit(struct task_struct *tsk, long code) signal->group_exit_code = code; signal->group_stop_count = 0; } + /* + * Serialize with any possible pending coredump. + * We must hold siglock around checking core_state + * and setting PF_POSTCOREDUMP. The core-inducing thread + * will increment ->nr_threads for each thread in the + * group without PF_POSTCOREDUMP set. + */ + tsk->flags |= PF_POSTCOREDUMP; + core_state = signal->core_state; spin_unlock_irq(&sighand->siglock); + + if (unlikely(core_state)) + coredump_task_exit(tsk, core_state); } void __noreturn do_exit(long code) @@ -894,15 +893,12 @@ void __noreturn do_exit(long code) int group_dead; WARN_ON(irqs_disabled()); - - synchronize_group_exit(tsk, code); - WARN_ON(tsk->plug); kcov_task_exit(tsk); kmsan_task_exit(tsk); - coredump_task_exit(tsk); + synchronize_group_exit(tsk, code); ptrace_event(PTRACE_EVENT_EXIT, code); user_events_exit(tsk); -- 2.51.0 From 734aa85390ea693bb7eaf2240623d41b03705c84 Mon Sep 17 00:00:00 2001 From: Phillip Lougher Date: Wed, 9 Apr 2025 03:47:47 +0100 Subject: [PATCH 05/16] Squashfs: check return result of sb_min_blocksize Syzkaller reports an "UBSAN: shift-out-of-bounds in squashfs_bio_read" bug. Syzkaller forks multiple processes which after mounting the Squashfs filesystem, issues an ioctl("/dev/loop0", LOOP_SET_BLOCK_SIZE, 0x8000). Now if this ioctl occurs at the same time another process is in the process of mounting a Squashfs filesystem on /dev/loop0, the failure occurs. When this happens the following code in squashfs_fill_super() fails. ---- msblk->devblksize = sb_min_blocksize(sb, SQUASHFS_DEVBLK_SIZE); msblk->devblksize_log2 = ffz(~msblk->devblksize); ---- sb_min_blocksize() returns 0, which means msblk->devblksize is set to 0. As a result, ffz(~msblk->devblksize) returns 64, and msblk->devblksize_log2 is set to 64. This subsequently causes the UBSAN: shift-out-of-bounds in fs/squashfs/block.c:195:36 shift exponent 64 is too large for 64-bit type 'u64' (aka 'unsigned long long') This commit adds a check for a 0 return by sb_min_blocksize(). Link: https://lkml.kernel.org/r/20250409024747.876480-1-phillip@squashfs.org.uk Fixes: 0aa666190509 ("Squashfs: super block operations") Reported-by: syzbot+65761fc25a137b9c8c6e@syzkaller.appspotmail.com Closes: https://lore.kernel.org/all/67f0dd7a.050a0220.0a13.0230.GAE@google.com/ Signed-off-by: Phillip Lougher Signed-off-by: Andrew Morton --- fs/squashfs/super.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/fs/squashfs/super.c b/fs/squashfs/super.c index 67c55fe32ce8..992ea0e37257 100644 --- a/fs/squashfs/super.c +++ b/fs/squashfs/super.c @@ -202,6 +202,11 @@ static int squashfs_fill_super(struct super_block *sb, struct fs_context *fc) msblk->panic_on_errors = (opts->errors == Opt_errors_panic); msblk->devblksize = sb_min_blocksize(sb, SQUASHFS_DEVBLK_SIZE); + if (!msblk->devblksize) { + errorf(fc, "squashfs: unable to set blocksize\n"); + return -EINVAL; + } + msblk->devblksize_log2 = ffz(~msblk->devblksize); mutex_init(&msblk->meta_index_mutex); -- 2.51.0 From 50af973cd71ab9eea3b18429343659f4a6ebd825 Mon Sep 17 00:00:00 2001 From: WangYuli Date: Fri, 11 Apr 2025 18:26:10 +0800 Subject: [PATCH 06/16] ocfs2: o2net_idle_timer: Rename del_timer_sync in comment Commit 8fa7292fee5c ("treewide: Switch/rename to timer_delete[_sync]()") switched del_timer_sync to timer_delete_sync, but did not modify the comment for o2net_idle_timer(). Now fix it. Link: https://lkml.kernel.org/r/BDDB1E4E2876C36C+20250411102610.165946-1-wangyuli@uniontech.com Signed-off-by: WangYuli Acked-by: Joseph Qi Cc: Mark Fasheh Cc: Joel Becker Cc: Junxiao Bi Cc: Changwei Ge Cc: Jun Piao Signed-off-by: Andrew Morton --- fs/ocfs2/cluster/tcp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c index fce9beb214f0..43e652a2adaf 100644 --- a/fs/ocfs2/cluster/tcp.c +++ b/fs/ocfs2/cluster/tcp.c @@ -1483,7 +1483,7 @@ static void o2net_sc_send_keep_req(struct work_struct *work) sc_put(sc); } -/* socket shutdown does a del_timer_sync against this as it tears down. +/* socket shutdown does a timer_delete_sync against this as it tears down. * we can't start this timer until we've got to the point in sc buildup * where shutdown is going to be involved */ static void o2net_idle_timer(struct timer_list *t) -- 2.51.0 From e711faaafbe54a884f33b53472434063d342f6d4 Mon Sep 17 00:00:00 2001 From: Lance Yang Date: Mon, 14 Apr 2025 22:59:43 +0800 Subject: [PATCH 07/16] hung_task: replace blocker_mutex with encoded blocker Patch series "hung_task: extend blocking task stacktrace dump to semaphore", v5. Inspired by mutex blocker tracking[1], this patch series extend the feature to not only dump the blocker task holding a mutex but also to support semaphores. Unlike mutexes, semaphores lack explicit ownership tracking, making it challenging to identify the root cause of hangs. To address this, we introduce a last_holder field to the semaphore structure, which is updated when a task successfully calls down() and cleared during up(). The assumption is that if a task is blocked on a semaphore, the holders must not have released it. While this does not guarantee that the last holder is one of the current blockers, it likely provides a practical hint for diagnosing semaphore-related stalls. With this change, the hung task detector can now show blocker task's info like below: [Tue Apr 8 12:19:07 2025] INFO: task cat:945 blocked for more than 120 seconds. [Tue Apr 8 12:19:07 2025] Tainted: G E 6.14.0-rc6+ #1 [Tue Apr 8 12:19:07 2025] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. [Tue Apr 8 12:19:07 2025] task:cat state:D stack:0 pid:945 tgid:945 ppid:828 task_flags:0x400000 flags:0x00000000 [Tue Apr 8 12:19:07 2025] Call Trace: [Tue Apr 8 12:19:07 2025] [Tue Apr 8 12:19:07 2025] __schedule+0x491/0xbd0 [Tue Apr 8 12:19:07 2025] schedule+0x27/0xf0 [Tue Apr 8 12:19:07 2025] schedule_timeout+0xe3/0xf0 [Tue Apr 8 12:19:07 2025] ? __folio_mod_stat+0x2a/0x80 [Tue Apr 8 12:19:07 2025] ? set_ptes.constprop.0+0x27/0x90 [Tue Apr 8 12:19:07 2025] __down_common+0x155/0x280 [Tue Apr 8 12:19:07 2025] down+0x53/0x70 [Tue Apr 8 12:19:07 2025] read_dummy_semaphore+0x23/0x60 [Tue Apr 8 12:19:07 2025] full_proxy_read+0x5f/0xa0 [Tue Apr 8 12:19:07 2025] vfs_read+0xbc/0x350 [Tue Apr 8 12:19:07 2025] ? __count_memcg_events+0xa5/0x140 [Tue Apr 8 12:19:07 2025] ? count_memcg_events.constprop.0+0x1a/0x30 [Tue Apr 8 12:19:07 2025] ? handle_mm_fault+0x180/0x260 [Tue Apr 8 12:19:07 2025] ksys_read+0x66/0xe0 [Tue Apr 8 12:19:07 2025] do_syscall_64+0x51/0x120 [Tue Apr 8 12:19:07 2025] entry_SYSCALL_64_after_hwframe+0x76/0x7e [Tue Apr 8 12:19:07 2025] RIP: 0033:0x7f419478f46e [Tue Apr 8 12:19:07 2025] RSP: 002b:00007fff1c4d2668 EFLAGS: 00000246 ORIG_RAX: 0000000000000000 [Tue Apr 8 12:19:07 2025] RAX: ffffffffffffffda RBX: 0000000000020000 RCX: 00007f419478f46e [Tue Apr 8 12:19:07 2025] RDX: 0000000000020000 RSI: 00007f4194683000 RDI: 0000000000000003 [Tue Apr 8 12:19:07 2025] RBP: 00007f4194683000 R08: 00007f4194682010 R09: 0000000000000000 [Tue Apr 8 12:19:07 2025] R10: fffffffffffffbc5 R11: 0000000000000246 R12: 0000000000000000 [Tue Apr 8 12:19:07 2025] R13: 0000000000000003 R14: 0000000000020000 R15: 0000000000020000 [Tue Apr 8 12:19:07 2025] [Tue Apr 8 12:19:07 2025] INFO: task cat:945 blocked on a semaphore likely last held by task cat:938 [Tue Apr 8 12:19:07 2025] task:cat state:S stack:0 pid:938 tgid:938 ppid:584 task_flags:0x400000 flags:0x00000000 [Tue Apr 8 12:19:07 2025] Call Trace: [Tue Apr 8 12:19:07 2025] [Tue Apr 8 12:19:07 2025] __schedule+0x491/0xbd0 [Tue Apr 8 12:19:07 2025] ? _raw_spin_unlock_irqrestore+0xe/0x40 [Tue Apr 8 12:19:07 2025] schedule+0x27/0xf0 [Tue Apr 8 12:19:07 2025] schedule_timeout+0x77/0xf0 [Tue Apr 8 12:19:07 2025] ? __pfx_process_timeout+0x10/0x10 [Tue Apr 8 12:19:07 2025] msleep_interruptible+0x49/0x60 [Tue Apr 8 12:19:07 2025] read_dummy_semaphore+0x2d/0x60 [Tue Apr 8 12:19:07 2025] full_proxy_read+0x5f/0xa0 [Tue Apr 8 12:19:07 2025] vfs_read+0xbc/0x350 [Tue Apr 8 12:19:07 2025] ? __count_memcg_events+0xa5/0x140 [Tue Apr 8 12:19:07 2025] ? count_memcg_events.constprop.0+0x1a/0x30 [Tue Apr 8 12:19:07 2025] ? handle_mm_fault+0x180/0x260 [Tue Apr 8 12:19:07 2025] ksys_read+0x66/0xe0 [Tue Apr 8 12:19:07 2025] do_syscall_64+0x51/0x120 [Tue Apr 8 12:19:07 2025] entry_SYSCALL_64_after_hwframe+0x76/0x7e [Tue Apr 8 12:19:07 2025] RIP: 0033:0x7f7c584a646e [Tue Apr 8 12:19:07 2025] RSP: 002b:00007ffdba8ce158 EFLAGS: 00000246 ORIG_RAX: 0000000000000000 [Tue Apr 8 12:19:07 2025] RAX: ffffffffffffffda RBX: 0000000000020000 RCX: 00007f7c584a646e [Tue Apr 8 12:19:07 2025] RDX: 0000000000020000 RSI: 00007f7c5839a000 RDI: 0000000000000003 [Tue Apr 8 12:19:07 2025] RBP: 00007f7c5839a000 R08: 00007f7c58399010 R09: 0000000000000000 [Tue Apr 8 12:19:07 2025] R10: fffffffffffffbc5 R11: 0000000000000246 R12: 0000000000000000 [Tue Apr 8 12:19:07 2025] R13: 0000000000000003 R14: 0000000000020000 R15: 0000000000020000 [Tue Apr 8 12:19:07 2025] This patch (of 3): This patch replaces 'struct mutex *blocker_mutex' with 'unsigned long blocker', as only one blocker is active at a time. The blocker filed can store both the lock addrees and the lock type, with LSB used to encode the type as Masami suggested, making it easier to extend the feature to cover other types of locks. Also, once the lock type is determined, we can directly extract the address and cast it to a lock pointer ;) Link: https://lkml.kernel.org/r/20250414145945.84916-1-ioworker0@gmail.com Link: https://lore.kernel.org/all/174046694331.2194069.15472952050240807469.stgit@mhiramat.tok.corp.google.com [1] Link: https://lkml.kernel.org/r/20250414145945.84916-2-ioworker0@gmail.com Signed-off-by: Mingzhe Yang Signed-off-by: Lance Yang Reviewed-by: Masami Hiramatsu (Google) Suggested-by: Andrew Morton Suggested-by: Masami Hiramatsu (Google) Cc: Anna Schumaker Cc: Boqun Feng Cc: Ingo Molnar Cc: Joel Granados Cc: John Stultz Cc: Kent Overstreet Cc: Peter Zijlstra Cc: Sergey Senozhatsky Cc: Steven Rostedt Cc: Tomasz Figa Cc: Waiman Long Cc: Will Deacon Cc: Yongliang Gao Cc: Zi Li Signed-off-by: Andrew Morton --- include/linux/hung_task.h | 99 +++++++++++++++++++++++++++++++++++++++ include/linux/sched.h | 6 ++- kernel/hung_task.c | 13 +++-- kernel/locking/mutex.c | 5 +- 4 files changed, 115 insertions(+), 8 deletions(-) create mode 100644 include/linux/hung_task.h diff --git a/include/linux/hung_task.h b/include/linux/hung_task.h new file mode 100644 index 000000000000..1bc2b3244613 --- /dev/null +++ b/include/linux/hung_task.h @@ -0,0 +1,99 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Detect Hung Task: detecting tasks stuck in D state + * + * Copyright (C) 2025 Tongcheng Travel (www.ly.com) + * Author: Lance Yang + */ +#ifndef __LINUX_HUNG_TASK_H +#define __LINUX_HUNG_TASK_H + +#include +#include +#include + +/* + * @blocker: Combines lock address and blocking type. + * + * Since lock pointers are at least 4-byte aligned(32-bit) or 8-byte + * aligned(64-bit). This leaves the 2 least bits (LSBs) of the pointer + * always zero. So we can use these bits to encode the specific blocking + * type. + * + * Type encoding: + * 00 - Blocked on mutex (BLOCKER_TYPE_MUTEX) + * 01 - Blocked on semaphore (BLOCKER_TYPE_SEM) + * 10 - Blocked on rt-mutex (BLOCKER_TYPE_RTMUTEX) + * 11 - Blocked on rw-semaphore (BLOCKER_TYPE_RWSEM) + */ +#define BLOCKER_TYPE_MUTEX 0x00UL +#define BLOCKER_TYPE_SEM 0x01UL +#define BLOCKER_TYPE_RTMUTEX 0x02UL +#define BLOCKER_TYPE_RWSEM 0x03UL + +#define BLOCKER_TYPE_MASK 0x03UL + +#ifdef CONFIG_DETECT_HUNG_TASK_BLOCKER +static inline void hung_task_set_blocker(void *lock, unsigned long type) +{ + unsigned long lock_ptr = (unsigned long)lock; + + WARN_ON_ONCE(!lock_ptr); + WARN_ON_ONCE(READ_ONCE(current->blocker)); + + /* + * If the lock pointer matches the BLOCKER_TYPE_MASK, return + * without writing anything. + */ + if (WARN_ON_ONCE(lock_ptr & BLOCKER_TYPE_MASK)) + return; + + WRITE_ONCE(current->blocker, lock_ptr | type); +} + +static inline void hung_task_clear_blocker(void) +{ + WARN_ON_ONCE(!READ_ONCE(current->blocker)); + + WRITE_ONCE(current->blocker, 0UL); +} + +/* + * hung_task_get_blocker_type - Extracts blocker type from encoded blocker + * address. + * + * @blocker: Blocker pointer with encoded type (via LSB bits) + * + * Returns: BLOCKER_TYPE_MUTEX, BLOCKER_TYPE_SEM, etc. + */ +static inline unsigned long hung_task_get_blocker_type(unsigned long blocker) +{ + WARN_ON_ONCE(!blocker); + + return blocker & BLOCKER_TYPE_MASK; +} + +static inline void *hung_task_blocker_to_lock(unsigned long blocker) +{ + WARN_ON_ONCE(!blocker); + + return (void *)(blocker & ~BLOCKER_TYPE_MASK); +} +#else +static inline void hung_task_set_blocker(void *lock, unsigned long type) +{ +} +static inline void hung_task_clear_blocker(void) +{ +} +static inline unsigned long hung_task_get_blocker_type(unsigned long blocker) +{ + return 0UL; +} +static inline void *hung_task_blocker_to_lock(unsigned long blocker) +{ + return NULL; +} +#endif + +#endif /* __LINUX_HUNG_TASK_H */ diff --git a/include/linux/sched.h b/include/linux/sched.h index f96ac1982893..393d81978f40 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1240,7 +1240,11 @@ struct task_struct { #endif #ifdef CONFIG_DETECT_HUNG_TASK_BLOCKER - struct mutex *blocker_mutex; + /* + * Encoded lock address causing task block (lower 2 bits = type from + * ). Accessed via hung_task_*() helpers. + */ + unsigned long blocker; #endif #ifdef CONFIG_DEBUG_ATOMIC_SLEEP diff --git a/kernel/hung_task.c b/kernel/hung_task.c index dc898ec93463..79558d76ef06 100644 --- a/kernel/hung_task.c +++ b/kernel/hung_task.c @@ -22,6 +22,7 @@ #include #include #include +#include #include @@ -98,16 +99,18 @@ static struct notifier_block panic_block = { static void debug_show_blocker(struct task_struct *task) { struct task_struct *g, *t; - unsigned long owner; - struct mutex *lock; + unsigned long owner, blocker; RCU_LOCKDEP_WARN(!rcu_read_lock_held(), "No rcu lock held"); - lock = READ_ONCE(task->blocker_mutex); - if (!lock) + blocker = READ_ONCE(task->blocker); + if (!blocker || + hung_task_get_blocker_type(blocker) != BLOCKER_TYPE_MUTEX) return; - owner = mutex_get_owner(lock); + owner = mutex_get_owner( + (struct mutex *)hung_task_blocker_to_lock(blocker)); + if (unlikely(!owner)) { pr_err("INFO: task %s:%d is blocked on a mutex, but the owner is not found.\n", task->comm, task->pid); diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c index 555e2b3a665a..61fa97da7989 100644 --- a/kernel/locking/mutex.c +++ b/kernel/locking/mutex.c @@ -29,6 +29,7 @@ #include #include #include +#include #define CREATE_TRACE_POINTS #include @@ -191,7 +192,7 @@ __mutex_add_waiter(struct mutex *lock, struct mutex_waiter *waiter, struct list_head *list) { #ifdef CONFIG_DETECT_HUNG_TASK_BLOCKER - WRITE_ONCE(current->blocker_mutex, lock); + hung_task_set_blocker(lock, BLOCKER_TYPE_MUTEX); #endif debug_mutex_add_waiter(lock, waiter, current); @@ -209,7 +210,7 @@ __mutex_remove_waiter(struct mutex *lock, struct mutex_waiter *waiter) debug_mutex_remove_waiter(lock, waiter, current); #ifdef CONFIG_DETECT_HUNG_TASK_BLOCKER - WRITE_ONCE(current->blocker_mutex, NULL); + hung_task_clear_blocker(); #endif } -- 2.51.0 From 194a9b9e843b4077033be70a07d191107972aa53 Mon Sep 17 00:00:00 2001 From: Lance Yang Date: Mon, 14 Apr 2025 22:59:44 +0800 Subject: [PATCH 08/16] hung_task: show the blocker task if the task is hung on semaphore Inspired by mutex blocker tracking[1], this patch makes a trade-off to balance the overhead and utility of the hung task detector. Unlike mutexes, semaphores lack explicit ownership tracking, making it challenging to identify the root cause of hangs. To address this, we introduce a last_holder field to the semaphore structure, which is updated when a task successfully calls down() and cleared during up(). The assumption is that if a task is blocked on a semaphore, the holders must not have released it. While this does not guarantee that the last holder is one of the current blockers, it likely provides a practical hint for diagnosing semaphore-related stalls. With this change, the hung task detector can now show blocker task's info like below: [Tue Apr 8 12:19:07 2025] INFO: task cat:945 blocked for more than 120 seconds. [Tue Apr 8 12:19:07 2025] Tainted: G E 6.14.0-rc6+ #1 [Tue Apr 8 12:19:07 2025] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. [Tue Apr 8 12:19:07 2025] task:cat state:D stack:0 pid:945 tgid:945 ppid:828 task_flags:0x400000 flags:0x00000000 [Tue Apr 8 12:19:07 2025] Call Trace: [Tue Apr 8 12:19:07 2025] [Tue Apr 8 12:19:07 2025] __schedule+0x491/0xbd0 [Tue Apr 8 12:19:07 2025] schedule+0x27/0xf0 [Tue Apr 8 12:19:07 2025] schedule_timeout+0xe3/0xf0 [Tue Apr 8 12:19:07 2025] ? __folio_mod_stat+0x2a/0x80 [Tue Apr 8 12:19:07 2025] ? set_ptes.constprop.0+0x27/0x90 [Tue Apr 8 12:19:07 2025] __down_common+0x155/0x280 [Tue Apr 8 12:19:07 2025] down+0x53/0x70 [Tue Apr 8 12:19:07 2025] read_dummy_semaphore+0x23/0x60 [Tue Apr 8 12:19:07 2025] full_proxy_read+0x5f/0xa0 [Tue Apr 8 12:19:07 2025] vfs_read+0xbc/0x350 [Tue Apr 8 12:19:07 2025] ? __count_memcg_events+0xa5/0x140 [Tue Apr 8 12:19:07 2025] ? count_memcg_events.constprop.0+0x1a/0x30 [Tue Apr 8 12:19:07 2025] ? handle_mm_fault+0x180/0x260 [Tue Apr 8 12:19:07 2025] ksys_read+0x66/0xe0 [Tue Apr 8 12:19:07 2025] do_syscall_64+0x51/0x120 [Tue Apr 8 12:19:07 2025] entry_SYSCALL_64_after_hwframe+0x76/0x7e [Tue Apr 8 12:19:07 2025] RIP: 0033:0x7f419478f46e [Tue Apr 8 12:19:07 2025] RSP: 002b:00007fff1c4d2668 EFLAGS: 00000246 ORIG_RAX: 0000000000000000 [Tue Apr 8 12:19:07 2025] RAX: ffffffffffffffda RBX: 0000000000020000 RCX: 00007f419478f46e [Tue Apr 8 12:19:07 2025] RDX: 0000000000020000 RSI: 00007f4194683000 RDI: 0000000000000003 [Tue Apr 8 12:19:07 2025] RBP: 00007f4194683000 R08: 00007f4194682010 R09: 0000000000000000 [Tue Apr 8 12:19:07 2025] R10: fffffffffffffbc5 R11: 0000000000000246 R12: 0000000000000000 [Tue Apr 8 12:19:07 2025] R13: 0000000000000003 R14: 0000000000020000 R15: 0000000000020000 [Tue Apr 8 12:19:07 2025] [Tue Apr 8 12:19:07 2025] INFO: task cat:945 blocked on a semaphore likely last held by task cat:938 [Tue Apr 8 12:19:07 2025] task:cat state:S stack:0 pid:938 tgid:938 ppid:584 task_flags:0x400000 flags:0x00000000 [Tue Apr 8 12:19:07 2025] Call Trace: [Tue Apr 8 12:19:07 2025] [Tue Apr 8 12:19:07 2025] __schedule+0x491/0xbd0 [Tue Apr 8 12:19:07 2025] ? _raw_spin_unlock_irqrestore+0xe/0x40 [Tue Apr 8 12:19:07 2025] schedule+0x27/0xf0 [Tue Apr 8 12:19:07 2025] schedule_timeout+0x77/0xf0 [Tue Apr 8 12:19:07 2025] ? __pfx_process_timeout+0x10/0x10 [Tue Apr 8 12:19:07 2025] msleep_interruptible+0x49/0x60 [Tue Apr 8 12:19:07 2025] read_dummy_semaphore+0x2d/0x60 [Tue Apr 8 12:19:07 2025] full_proxy_read+0x5f/0xa0 [Tue Apr 8 12:19:07 2025] vfs_read+0xbc/0x350 [Tue Apr 8 12:19:07 2025] ? __count_memcg_events+0xa5/0x140 [Tue Apr 8 12:19:07 2025] ? count_memcg_events.constprop.0+0x1a/0x30 [Tue Apr 8 12:19:07 2025] ? handle_mm_fault+0x180/0x260 [Tue Apr 8 12:19:07 2025] ksys_read+0x66/0xe0 [Tue Apr 8 12:19:07 2025] do_syscall_64+0x51/0x120 [Tue Apr 8 12:19:07 2025] entry_SYSCALL_64_after_hwframe+0x76/0x7e [Tue Apr 8 12:19:07 2025] RIP: 0033:0x7f7c584a646e [Tue Apr 8 12:19:07 2025] RSP: 002b:00007ffdba8ce158 EFLAGS: 00000246 ORIG_RAX: 0000000000000000 [Tue Apr 8 12:19:07 2025] RAX: ffffffffffffffda RBX: 0000000000020000 RCX: 00007f7c584a646e [Tue Apr 8 12:19:07 2025] RDX: 0000000000020000 RSI: 00007f7c5839a000 RDI: 0000000000000003 [Tue Apr 8 12:19:07 2025] RBP: 00007f7c5839a000 R08: 00007f7c58399010 R09: 0000000000000000 [Tue Apr 8 12:19:07 2025] R10: fffffffffffffbc5 R11: 0000000000000246 R12: 0000000000000000 [Tue Apr 8 12:19:07 2025] R13: 0000000000000003 R14: 0000000000020000 R15: 0000000000020000 [Tue Apr 8 12:19:07 2025] [1] https://lore.kernel.org/all/174046694331.2194069.15472952050240807469.stgit@mhiramat.tok.corp.google.com Link: https://lkml.kernel.org/r/20250414145945.84916-3-ioworker0@gmail.com Signed-off-by: Mingzhe Yang Signed-off-by: Lance Yang Suggested-by: Andrew Morton Suggested-by: Masami Hiramatsu (Google) Reviewed-by: Masami Hiramatsu (Google) Cc: Anna Schumaker Cc: Boqun Feng Cc: Ingo Molnar Cc: Joel Granados Cc: John Stultz Cc: Kent Overstreet Cc: Peter Zijlstra Cc: Sergey Senozhatsky Cc: Steven Rostedt Cc: Tomasz Figa Cc: Waiman Long Cc: Will Deacon Cc: Yongliang Gao Cc: Zi Li Signed-off-by: Andrew Morton --- include/linux/semaphore.h | 15 +++++++++- kernel/hung_task.c | 52 ++++++++++++++++++++++++++-------- kernel/locking/semaphore.c | 57 ++++++++++++++++++++++++++++++++++---- 3 files changed, 106 insertions(+), 18 deletions(-) diff --git a/include/linux/semaphore.h b/include/linux/semaphore.h index 04655faadc2d..89706157e622 100644 --- a/include/linux/semaphore.h +++ b/include/linux/semaphore.h @@ -16,13 +16,25 @@ struct semaphore { raw_spinlock_t lock; unsigned int count; struct list_head wait_list; + +#ifdef CONFIG_DETECT_HUNG_TASK_BLOCKER + unsigned long last_holder; +#endif }; +#ifdef CONFIG_DETECT_HUNG_TASK_BLOCKER +#define __LAST_HOLDER_SEMAPHORE_INITIALIZER \ + , .last_holder = 0UL +#else +#define __LAST_HOLDER_SEMAPHORE_INITIALIZER +#endif + #define __SEMAPHORE_INITIALIZER(name, n) \ { \ .lock = __RAW_SPIN_LOCK_UNLOCKED((name).lock), \ .count = n, \ - .wait_list = LIST_HEAD_INIT((name).wait_list), \ + .wait_list = LIST_HEAD_INIT((name).wait_list) \ + __LAST_HOLDER_SEMAPHORE_INITIALIZER \ } /* @@ -47,5 +59,6 @@ extern int __must_check down_killable(struct semaphore *sem); extern int __must_check down_trylock(struct semaphore *sem); extern int __must_check down_timeout(struct semaphore *sem, long jiffies); extern void up(struct semaphore *sem); +extern unsigned long sem_last_holder(struct semaphore *sem); #endif /* __LINUX_SEMAPHORE_H */ diff --git a/kernel/hung_task.c b/kernel/hung_task.c index 79558d76ef06..d2432df2b905 100644 --- a/kernel/hung_task.c +++ b/kernel/hung_task.c @@ -99,32 +99,62 @@ static struct notifier_block panic_block = { static void debug_show_blocker(struct task_struct *task) { struct task_struct *g, *t; - unsigned long owner, blocker; + unsigned long owner, blocker, blocker_type; RCU_LOCKDEP_WARN(!rcu_read_lock_held(), "No rcu lock held"); blocker = READ_ONCE(task->blocker); - if (!blocker || - hung_task_get_blocker_type(blocker) != BLOCKER_TYPE_MUTEX) + if (!blocker) return; - owner = mutex_get_owner( - (struct mutex *)hung_task_blocker_to_lock(blocker)); + blocker_type = hung_task_get_blocker_type(blocker); + + switch (blocker_type) { + case BLOCKER_TYPE_MUTEX: + owner = mutex_get_owner( + (struct mutex *)hung_task_blocker_to_lock(blocker)); + break; + case BLOCKER_TYPE_SEM: + owner = sem_last_holder( + (struct semaphore *)hung_task_blocker_to_lock(blocker)); + break; + default: + WARN_ON_ONCE(1); + return; + } + if (unlikely(!owner)) { - pr_err("INFO: task %s:%d is blocked on a mutex, but the owner is not found.\n", - task->comm, task->pid); + switch (blocker_type) { + case BLOCKER_TYPE_MUTEX: + pr_err("INFO: task %s:%d is blocked on a mutex, but the owner is not found.\n", + task->comm, task->pid); + break; + case BLOCKER_TYPE_SEM: + pr_err("INFO: task %s:%d is blocked on a semaphore, but the last holder is not found.\n", + task->comm, task->pid); + break; + } return; } /* Ensure the owner information is correct. */ for_each_process_thread(g, t) { - if ((unsigned long)t == owner) { + if ((unsigned long)t != owner) + continue; + + switch (blocker_type) { + case BLOCKER_TYPE_MUTEX: pr_err("INFO: task %s:%d is blocked on a mutex likely owned by task %s:%d.\n", - task->comm, task->pid, t->comm, t->pid); - sched_show_task(t); - return; + task->comm, task->pid, t->comm, t->pid); + break; + case BLOCKER_TYPE_SEM: + pr_err("INFO: task %s:%d blocked on a semaphore likely last held by task %s:%d\n", + task->comm, task->pid, t->comm, t->pid); + break; } + sched_show_task(t); + return; } } #else diff --git a/kernel/locking/semaphore.c b/kernel/locking/semaphore.c index de9117c0e671..3ef032e22f7e 100644 --- a/kernel/locking/semaphore.c +++ b/kernel/locking/semaphore.c @@ -34,6 +34,7 @@ #include #include #include +#include static noinline void __down(struct semaphore *sem); static noinline int __down_interruptible(struct semaphore *sem); @@ -41,6 +42,41 @@ static noinline int __down_killable(struct semaphore *sem); static noinline int __down_timeout(struct semaphore *sem, long timeout); static noinline void __up(struct semaphore *sem, struct wake_q_head *wake_q); +#ifdef CONFIG_DETECT_HUNG_TASK_BLOCKER +static inline void hung_task_sem_set_holder(struct semaphore *sem) +{ + WRITE_ONCE((sem)->last_holder, (unsigned long)current); +} + +static inline void hung_task_sem_clear_if_holder(struct semaphore *sem) +{ + if (READ_ONCE((sem)->last_holder) == (unsigned long)current) + WRITE_ONCE((sem)->last_holder, 0UL); +} + +unsigned long sem_last_holder(struct semaphore *sem) +{ + return READ_ONCE(sem->last_holder); +} +#else +static inline void hung_task_sem_set_holder(struct semaphore *sem) +{ +} +static inline void hung_task_sem_clear_if_holder(struct semaphore *sem) +{ +} +unsigned long sem_last_holder(struct semaphore *sem) +{ + return 0UL; +} +#endif + +static inline void __sem_acquire(struct semaphore *sem) +{ + sem->count--; + hung_task_sem_set_holder(sem); +} + /** * down - acquire the semaphore * @sem: the semaphore to be acquired @@ -59,7 +95,7 @@ void __sched down(struct semaphore *sem) might_sleep(); raw_spin_lock_irqsave(&sem->lock, flags); if (likely(sem->count > 0)) - sem->count--; + __sem_acquire(sem); else __down(sem); raw_spin_unlock_irqrestore(&sem->lock, flags); @@ -83,7 +119,7 @@ int __sched down_interruptible(struct semaphore *sem) might_sleep(); raw_spin_lock_irqsave(&sem->lock, flags); if (likely(sem->count > 0)) - sem->count--; + __sem_acquire(sem); else result = __down_interruptible(sem); raw_spin_unlock_irqrestore(&sem->lock, flags); @@ -110,7 +146,7 @@ int __sched down_killable(struct semaphore *sem) might_sleep(); raw_spin_lock_irqsave(&sem->lock, flags); if (likely(sem->count > 0)) - sem->count--; + __sem_acquire(sem); else result = __down_killable(sem); raw_spin_unlock_irqrestore(&sem->lock, flags); @@ -140,7 +176,7 @@ int __sched down_trylock(struct semaphore *sem) raw_spin_lock_irqsave(&sem->lock, flags); count = sem->count - 1; if (likely(count >= 0)) - sem->count = count; + __sem_acquire(sem); raw_spin_unlock_irqrestore(&sem->lock, flags); return (count < 0); @@ -165,7 +201,7 @@ int __sched down_timeout(struct semaphore *sem, long timeout) might_sleep(); raw_spin_lock_irqsave(&sem->lock, flags); if (likely(sem->count > 0)) - sem->count--; + __sem_acquire(sem); else result = __down_timeout(sem, timeout); raw_spin_unlock_irqrestore(&sem->lock, flags); @@ -187,6 +223,9 @@ void __sched up(struct semaphore *sem) DEFINE_WAKE_Q(wake_q); raw_spin_lock_irqsave(&sem->lock, flags); + + hung_task_sem_clear_if_holder(sem); + if (likely(list_empty(&sem->wait_list))) sem->count++; else @@ -228,8 +267,10 @@ static inline int __sched ___down_common(struct semaphore *sem, long state, raw_spin_unlock_irq(&sem->lock); timeout = schedule_timeout(timeout); raw_spin_lock_irq(&sem->lock); - if (waiter.up) + if (waiter.up) { + hung_task_sem_set_holder(sem); return 0; + } } timed_out: @@ -246,10 +287,14 @@ static inline int __sched __down_common(struct semaphore *sem, long state, { int ret; + hung_task_set_blocker(sem, BLOCKER_TYPE_SEM); + trace_contention_begin(sem, 0); ret = ___down_common(sem, state, timeout); trace_contention_end(sem, ret); + hung_task_clear_blocker(); + return ret; } -- 2.51.0 From 1abf729e9d9f4eb42c01fa9cb86eeb1766ecd49f Mon Sep 17 00:00:00 2001 From: Zi Li Date: Mon, 14 Apr 2025 22:59:45 +0800 Subject: [PATCH 09/16] samples: extend hung_task detector test with semaphore support Extend the existing hung_task detector test module to support multiple lock types, including mutex and semaphore, with room for future additions (e.g., spinlock, etc.). This module creates dummy files under /hung_task, such as 'mutex' and 'semaphore'. The read process on any of these files will sleep for enough long time (256 seconds) while holding the respective lock. As a result, the second process will wait on the lock for a prolonged duration and be detected by the hung_task detector. This change unifies the previous mutex-only sample into a single, extensible hung_task_tests module, reducing code duplication and improving maintainability. Usage is: > cd /sys/kernel/debug/hung_task > cat mutex & cat mutex # Test mutex blocking > cat semaphore & cat semaphore # Test semaphore blocking Update the Kconfig description to reflect multiple debugfs files support. Link: https://lkml.kernel.org/r/20250414145945.84916-4-ioworker0@gmail.com Signed-off-by: Lance Yang Signed-off-by: Zi Li Suggested-by: Masami Hiramatsu (Google) Acked-by: Masami Hiramatsu (Google) Cc: Anna Schumaker Cc: Boqun Feng Cc: Ingo Molnar Cc: Joel Granados Cc: John Stultz Cc: Kent Overstreet Cc: Mingzhe Yang Cc: Peter Zijlstra Cc: Sergey Senozhatsky Cc: Steven Rostedt Cc: Tomasz Figa Cc: Waiman Long Cc: Will Deacon Cc: Yongliang Gao Signed-off-by: Andrew Morton --- samples/Kconfig | 9 +-- samples/hung_task/Makefile | 2 +- samples/hung_task/hung_task_mutex.c | 66 -------------------- samples/hung_task/hung_task_tests.c | 97 +++++++++++++++++++++++++++++ 4 files changed, 103 insertions(+), 71 deletions(-) delete mode 100644 samples/hung_task/hung_task_mutex.c create mode 100644 samples/hung_task/hung_task_tests.c diff --git a/samples/Kconfig b/samples/Kconfig index 09011be2391a..753ed1f170b5 100644 --- a/samples/Kconfig +++ b/samples/Kconfig @@ -304,10 +304,11 @@ config SAMPLE_HUNG_TASK tristate "Hung task detector test code" depends on DETECT_HUNG_TASK && DEBUG_FS help - Build a module which provide a simple debugfs file. If user reads - the file, it will sleep long time (256 seconds) with holding a - mutex. Thus if there are 2 or more processes read this file, it - will be detected by the hung_task watchdog. + Build a module that provides debugfs files (e.g., mutex, semaphore, + etc.) under /hung_task. If user reads one of these files, + it will sleep long time (256 seconds) with holding a lock. Thus, + if 2 or more processes read the same file concurrently, it will + be detected by the hung_task watchdog. source "samples/rust/Kconfig" diff --git a/samples/hung_task/Makefile b/samples/hung_task/Makefile index f4d6ab563488..86036f1a204d 100644 --- a/samples/hung_task/Makefile +++ b/samples/hung_task/Makefile @@ -1,2 +1,2 @@ # SPDX-License-Identifier: GPL-2.0-only -obj-$(CONFIG_SAMPLE_HUNG_TASK) += hung_task_mutex.o +obj-$(CONFIG_SAMPLE_HUNG_TASK) += hung_task_tests.o diff --git a/samples/hung_task/hung_task_mutex.c b/samples/hung_task/hung_task_mutex.c deleted file mode 100644 index 47ed38239ea3..000000000000 --- a/samples/hung_task/hung_task_mutex.c +++ /dev/null @@ -1,66 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* - * hung_task_mutex.c - Sample code which causes hung task by mutex - * - * Usage: load this module and read `/hung_task/mutex` - * by 2 or more processes. - * - * This is for testing kernel hung_task error message. - * Note that this will make your system freeze and maybe - * cause panic. So do not use this except for the test. - */ - -#include -#include -#include -#include -#include - -#define HUNG_TASK_DIR "hung_task" -#define HUNG_TASK_FILE "mutex" -#define SLEEP_SECOND 256 - -static const char dummy_string[] = "This is a dummy string."; -static DEFINE_MUTEX(dummy_mutex); -static struct dentry *hung_task_dir; - -static ssize_t read_dummy(struct file *file, char __user *user_buf, - size_t count, loff_t *ppos) -{ - /* If the second task waits on the lock, it is uninterruptible sleep. */ - guard(mutex)(&dummy_mutex); - - /* When the first task sleep here, it is interruptible. */ - msleep_interruptible(SLEEP_SECOND * 1000); - - return simple_read_from_buffer(user_buf, count, ppos, - dummy_string, sizeof(dummy_string)); -} - -static const struct file_operations hung_task_fops = { - .read = read_dummy, -}; - -static int __init hung_task_sample_init(void) -{ - hung_task_dir = debugfs_create_dir(HUNG_TASK_DIR, NULL); - if (IS_ERR(hung_task_dir)) - return PTR_ERR(hung_task_dir); - - debugfs_create_file(HUNG_TASK_FILE, 0400, hung_task_dir, - NULL, &hung_task_fops); - - return 0; -} - -static void __exit hung_task_sample_exit(void) -{ - debugfs_remove_recursive(hung_task_dir); -} - -module_init(hung_task_sample_init); -module_exit(hung_task_sample_exit); - -MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Masami Hiramatsu"); -MODULE_DESCRIPTION("Simple sleep under mutex file for testing hung task"); diff --git a/samples/hung_task/hung_task_tests.c b/samples/hung_task/hung_task_tests.c new file mode 100644 index 000000000000..a5c09bd3a47d --- /dev/null +++ b/samples/hung_task/hung_task_tests.c @@ -0,0 +1,97 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * hung_task_tests.c - Sample code for testing hung tasks with mutex, + * semaphore, etc. + * + * Usage: Load this module and read `/hung_task/mutex`, + * `/hung_task/semaphore`, etc., with 2 or more processes. + * + * This is for testing kernel hung_task error messages with various locking + * mechanisms (e.g., mutex, semaphore, etc.). Note that this may freeze + * your system or cause a panic. Use only for testing purposes. + */ + +#include +#include +#include +#include +#include +#include + +#define HUNG_TASK_DIR "hung_task" +#define HUNG_TASK_MUTEX_FILE "mutex" +#define HUNG_TASK_SEM_FILE "semaphore" +#define SLEEP_SECOND 256 + +static const char dummy_string[] = "This is a dummy string."; +static DEFINE_MUTEX(dummy_mutex); +static DEFINE_SEMAPHORE(dummy_sem, 1); +static struct dentry *hung_task_dir; + +/* Mutex-based read function */ +static ssize_t read_dummy_mutex(struct file *file, char __user *user_buf, + size_t count, loff_t *ppos) +{ + /* Second task waits on mutex, entering uninterruptible sleep */ + guard(mutex)(&dummy_mutex); + + /* First task sleeps here, interruptible */ + msleep_interruptible(SLEEP_SECOND * 1000); + + return simple_read_from_buffer(user_buf, count, ppos, dummy_string, + sizeof(dummy_string)); +} + +/* Semaphore-based read function */ +static ssize_t read_dummy_semaphore(struct file *file, char __user *user_buf, + size_t count, loff_t *ppos) +{ + /* Second task waits on semaphore, entering uninterruptible sleep */ + down(&dummy_sem); + + /* First task sleeps here, interruptible */ + msleep_interruptible(SLEEP_SECOND * 1000); + + up(&dummy_sem); + + return simple_read_from_buffer(user_buf, count, ppos, dummy_string, + sizeof(dummy_string)); +} + +/* File operations for mutex */ +static const struct file_operations hung_task_mutex_fops = { + .read = read_dummy_mutex, +}; + +/* File operations for semaphore */ +static const struct file_operations hung_task_sem_fops = { + .read = read_dummy_semaphore, +}; + +static int __init hung_task_tests_init(void) +{ + hung_task_dir = debugfs_create_dir(HUNG_TASK_DIR, NULL); + if (IS_ERR(hung_task_dir)) + return PTR_ERR(hung_task_dir); + + /* Create debugfs files for mutex and semaphore tests */ + debugfs_create_file(HUNG_TASK_MUTEX_FILE, 0400, hung_task_dir, NULL, + &hung_task_mutex_fops); + debugfs_create_file(HUNG_TASK_SEM_FILE, 0400, hung_task_dir, NULL, + &hung_task_sem_fops); + + return 0; +} + +static void __exit hung_task_tests_exit(void) +{ + debugfs_remove_recursive(hung_task_dir); +} + +module_init(hung_task_tests_init); +module_exit(hung_task_tests_exit); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Masami Hiramatsu "); +MODULE_AUTHOR("Zi Li "); +MODULE_DESCRIPTION("Simple sleep under lock files for testing hung task"); -- 2.51.0 From 7d9b05277ae89b9306f9d71ec35a03ee59508334 Mon Sep 17 00:00:00 2001 From: Thorsten Blum Date: Tue, 8 Apr 2025 12:34:07 +0200 Subject: [PATCH 10/16] ocfs2: simplify return statement in ocfs2_filecheck_attr_store() Don't negate 'ret' and simplify the return statement. No functional changes intended. Signed-off-by: Thorsten Blum Reviewed-by: Joseph Qi Cc: Mark Fasheh Cc: Joel Becker Cc: Junxiao Bi Cc: Changwei Ge Cc: Jun Piao Signed-off-by: Andrew Morton --- fs/ocfs2/filecheck.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/ocfs2/filecheck.c b/fs/ocfs2/filecheck.c index 1ad7106741f8..3ad7baf67658 100644 --- a/fs/ocfs2/filecheck.c +++ b/fs/ocfs2/filecheck.c @@ -505,5 +505,5 @@ static ssize_t ocfs2_filecheck_attr_store(struct kobject *kobj, ocfs2_filecheck_handle_entry(ent, entry); exit: - return (!ret ? count : ret); + return ret ?: count; } -- 2.51.0 From 8d1d4b538bb12a858fa95a073c4aff31e79088ee Mon Sep 17 00:00:00 2001 From: Caleb Sander Mateos Date: Wed, 16 Apr 2025 10:06:13 -0600 Subject: [PATCH 11/16] scatterlist: inline sg_next() sg_next() is a short function called frequently in I/O paths. Define it in the header file so it can be inlined into its callers. Link: https://lkml.kernel.org/r/20250416160615.3571958-1-csander@purestorage.com Signed-off-by: Caleb Sander Mateos Cc: Eric Biggers Signed-off-by: Andrew Morton --- include/linux/scatterlist.h | 23 ++++++++++++++++++++++- lib/scatterlist.c | 23 ----------------------- 2 files changed, 22 insertions(+), 24 deletions(-) diff --git a/include/linux/scatterlist.h b/include/linux/scatterlist.h index 138e2f1bd08f..0cdbfc42f153 100644 --- a/include/linux/scatterlist.h +++ b/include/linux/scatterlist.h @@ -94,6 +94,28 @@ static inline bool sg_is_last(struct scatterlist *sg) return __sg_flags(sg) & SG_END; } +/** + * sg_next - return the next scatterlist entry in a list + * @sg: The current sg entry + * + * Description: + * Usually the next entry will be @sg@ + 1, but if this sg element is part + * of a chained scatterlist, it could jump to the start of a new + * scatterlist array. + * + **/ +static inline struct scatterlist *sg_next(struct scatterlist *sg) +{ + if (sg_is_last(sg)) + return NULL; + + sg++; + if (unlikely(sg_is_chain(sg))) + sg = sg_chain_ptr(sg); + + return sg; +} + /** * sg_assign_page - Assign a given page to an SG entry * @sg: SG entry @@ -418,7 +440,6 @@ static inline void sg_init_marker(struct scatterlist *sgl, int sg_nents(struct scatterlist *sg); int sg_nents_for_len(struct scatterlist *sg, u64 len); -struct scatterlist *sg_next(struct scatterlist *); struct scatterlist *sg_last(struct scatterlist *s, unsigned int); void sg_init_table(struct scatterlist *, unsigned int); void sg_init_one(struct scatterlist *, const void *, unsigned int); diff --git a/lib/scatterlist.c b/lib/scatterlist.c index b58d5ef1a34b..7582dfab7fe3 100644 --- a/lib/scatterlist.c +++ b/lib/scatterlist.c @@ -13,29 +13,6 @@ #include #include -/** - * sg_next - return the next scatterlist entry in a list - * @sg: The current sg entry - * - * Description: - * Usually the next entry will be @sg@ + 1, but if this sg element is part - * of a chained scatterlist, it could jump to the start of a new - * scatterlist array. - * - **/ -struct scatterlist *sg_next(struct scatterlist *sg) -{ - if (sg_is_last(sg)) - return NULL; - - sg++; - if (unlikely(sg_is_chain(sg))) - sg = sg_chain_ptr(sg); - - return sg; -} -EXPORT_SYMBOL(sg_next); - /** * sg_nents - return total count of entries in scatterlist * @sg: The scatterlist -- 2.51.0 From b7df1f254e1a326e6c55698b76bf407df4ec6b6c Mon Sep 17 00:00:00 2001 From: "Dr. David Alan Gilbert" Date: Sat, 19 Apr 2025 21:30:11 +0100 Subject: [PATCH 12/16] rapidio: remove some dead defines Patch series "rapidio deadcoding". A couple of rapidio deadcoding patches. The first of these is a repost and was originally posted almost a year ago https://lore.kernel.org/all/20240528002515.211366-1-linux@treblig.org/ but got no answer. Other than being rebased and a typo fixed, it's not changed. This patch (of 2): 'mport_dma_buf', 'rio_mport_dma_map' and 'MPORT_MAX_DMA_BUFS' were added in the original commit e8de370188d0 ("rapidio: add mport char device driver") but never used. 'rio_cm_work' was unused since the original commit b6e8d4aa1110 ("rapidio: add RapidIO channelized messaging driver") but never used. Remove them. Link: https://lkml.kernel.org/r/20250419203012.429787-1-linux@treblig.org Link: https://lkml.kernel.org/r/20250419203012.429787-2-linux@treblig.org Signed-off-by: Dr. David Alan Gilbert Cc: Alexandre Bounine Cc: Matt Porter Signed-off-by: Andrew Morton --- drivers/rapidio/devices/rio_mport_cdev.c | 20 -------------------- drivers/rapidio/rio_cm.c | 6 ------ 2 files changed, 26 deletions(-) diff --git a/drivers/rapidio/devices/rio_mport_cdev.c b/drivers/rapidio/devices/rio_mport_cdev.c index cbf531d0ba68..995cfeca972b 100644 --- a/drivers/rapidio/devices/rio_mport_cdev.c +++ b/drivers/rapidio/devices/rio_mport_cdev.c @@ -97,18 +97,6 @@ module_param(dbg_level, uint, S_IWUSR | S_IWGRP | S_IRUGO); MODULE_PARM_DESC(dbg_level, "Debugging output level (default 0 = none)"); #endif -/* - * An internal DMA coherent buffer - */ -struct mport_dma_buf { - void *ib_base; - dma_addr_t ib_phys; - u32 ib_size; - u64 ib_rio_base; - bool ib_map; - struct file *filp; -}; - /* * Internal memory mapping structure */ @@ -131,14 +119,6 @@ struct rio_mport_mapping { struct file *filp; }; -struct rio_mport_dma_map { - int valid; - u64 length; - void *vaddr; - dma_addr_t paddr; -}; - -#define MPORT_MAX_DMA_BUFS 16 #define MPORT_EVENT_DEPTH 10 /* diff --git a/drivers/rapidio/rio_cm.c b/drivers/rapidio/rio_cm.c index 9135227301c8..97287e838ce1 100644 --- a/drivers/rapidio/rio_cm.c +++ b/drivers/rapidio/rio_cm.c @@ -198,12 +198,6 @@ struct cm_peer { struct rio_dev *rdev; }; -struct rio_cm_work { - struct work_struct work; - struct cm_dev *cm; - void *data; -}; - struct conn_req { struct list_head node; u32 destid; /* requester destID */ -- 2.51.0 From ba8182d44b4e557e2dc34e51a88105ce5b083372 Mon Sep 17 00:00:00 2001 From: "Dr. David Alan Gilbert" Date: Sat, 19 Apr 2025 21:30:12 +0100 Subject: [PATCH 13/16] rapidio: remove unused functions rio_request_dma() and rio_dma_prep_slave_sg() were added in 2012 by commit e42d98ebe7d7 ("rapidio: add DMA engine support for RIO data transfers") but never used. rio_find_mport() last use was removed in 2013 by commit 9edbc30b434f ("rapidio: update enumerator registration mechanism") rio_unregister_scan() was added in 2013 by commit a11650e11093 ("rapidio: make enumeration/discovery configurable") but never used. Remove them. Link: https://lkml.kernel.org/r/20250419203012.429787-3-linux@treblig.org Signed-off-by: Dr. David Alan Gilbert Cc: Alexandre Bounine Cc: Matt Porter Signed-off-by: Andrew Morton --- drivers/rapidio/rio.c | 103 ---------------------------------------- drivers/rapidio/rio.h | 2 - include/linux/rio_drv.h | 5 -- 3 files changed, 110 deletions(-) diff --git a/drivers/rapidio/rio.c b/drivers/rapidio/rio.c index 9544b8ee0c96..46daf32ea13b 100644 --- a/drivers/rapidio/rio.c +++ b/drivers/rapidio/rio.c @@ -1774,19 +1774,6 @@ struct dma_chan *rio_request_mport_dma(struct rio_mport *mport) } EXPORT_SYMBOL_GPL(rio_request_mport_dma); -/** - * rio_request_dma - request RapidIO capable DMA channel that supports - * specified target RapidIO device. - * @rdev: RIO device associated with DMA transfer - * - * Returns pointer to allocated DMA channel or NULL if failed. - */ -struct dma_chan *rio_request_dma(struct rio_dev *rdev) -{ - return rio_request_mport_dma(rdev->net->hport); -} -EXPORT_SYMBOL_GPL(rio_request_dma); - /** * rio_release_dma - release specified DMA channel * @dchan: DMA channel to release @@ -1834,56 +1821,8 @@ struct dma_async_tx_descriptor *rio_dma_prep_xfer(struct dma_chan *dchan, } EXPORT_SYMBOL_GPL(rio_dma_prep_xfer); -/** - * rio_dma_prep_slave_sg - RapidIO specific wrapper - * for device_prep_slave_sg callback defined by DMAENGINE. - * @rdev: RIO device control structure - * @dchan: DMA channel to configure - * @data: RIO specific data descriptor - * @direction: DMA data transfer direction (TO or FROM the device) - * @flags: dmaengine defined flags - * - * Initializes RapidIO capable DMA channel for the specified data transfer. - * Uses DMA channel private extension to pass information related to remote - * target RIO device. - * - * Returns: pointer to DMA transaction descriptor if successful, - * error-valued pointer or NULL if failed. - */ -struct dma_async_tx_descriptor *rio_dma_prep_slave_sg(struct rio_dev *rdev, - struct dma_chan *dchan, struct rio_dma_data *data, - enum dma_transfer_direction direction, unsigned long flags) -{ - return rio_dma_prep_xfer(dchan, rdev->destid, data, direction, flags); -} -EXPORT_SYMBOL_GPL(rio_dma_prep_slave_sg); - #endif /* CONFIG_RAPIDIO_DMA_ENGINE */ -/** - * rio_find_mport - find RIO mport by its ID - * @mport_id: number (ID) of mport device - * - * Given a RIO mport number, the desired mport is located - * in the global list of mports. If the mport is found, a pointer to its - * data structure is returned. If no mport is found, %NULL is returned. - */ -struct rio_mport *rio_find_mport(int mport_id) -{ - struct rio_mport *port; - - mutex_lock(&rio_mport_list_lock); - list_for_each_entry(port, &rio_mports, node) { - if (port->id == mport_id) - goto found; - } - port = NULL; -found: - mutex_unlock(&rio_mport_list_lock); - - return port; -} - /** * rio_register_scan - enumeration/discovery method registration interface * @mport_id: mport device ID for which fabric scan routine has to be set @@ -1961,48 +1900,6 @@ err_out: } EXPORT_SYMBOL_GPL(rio_register_scan); -/** - * rio_unregister_scan - removes enumeration/discovery method from mport - * @mport_id: mport device ID for which fabric scan routine has to be - * unregistered (RIO_MPORT_ANY = apply to all mports that use - * the specified scan_ops) - * @scan_ops: enumeration/discovery operations structure - * - * Removes enumeration or discovery method assigned to the specified mport - * device. If RIO_MPORT_ANY is specified, removes the specified operations from - * all mports that have them attached. - */ -int rio_unregister_scan(int mport_id, struct rio_scan *scan_ops) -{ - struct rio_mport *port; - struct rio_scan_node *scan; - - pr_debug("RIO: %s for mport_id=%d\n", __func__, mport_id); - - if (mport_id != RIO_MPORT_ANY && mport_id >= RIO_MAX_MPORTS) - return -EINVAL; - - mutex_lock(&rio_mport_list_lock); - - list_for_each_entry(port, &rio_mports, node) - if (port->id == mport_id || - (mport_id == RIO_MPORT_ANY && port->nscan == scan_ops)) - port->nscan = NULL; - - list_for_each_entry(scan, &rio_scans, node) { - if (scan->mport_id == mport_id) { - list_del(&scan->node); - kfree(scan); - break; - } - } - - mutex_unlock(&rio_mport_list_lock); - - return 0; -} -EXPORT_SYMBOL_GPL(rio_unregister_scan); - /** * rio_mport_scan - execute enumeration/discovery on the specified mport * @mport_id: number (ID) of mport device diff --git a/drivers/rapidio/rio.h b/drivers/rapidio/rio.h index f482de0d0370..a0e2a09ddb8e 100644 --- a/drivers/rapidio/rio.h +++ b/drivers/rapidio/rio.h @@ -41,9 +41,7 @@ extern void rio_del_device(struct rio_dev *rdev, enum rio_device_state state); extern int rio_enable_rx_tx_port(struct rio_mport *port, int local, u16 destid, u8 hopcount, u8 port_num); extern int rio_register_scan(int mport_id, struct rio_scan *scan_ops); -extern int rio_unregister_scan(int mport_id, struct rio_scan *scan_ops); extern void rio_attach_device(struct rio_dev *rdev); -extern struct rio_mport *rio_find_mport(int mport_id); extern int rio_mport_scan(int mport_id); /* Structures internal to the RIO core code */ diff --git a/include/linux/rio_drv.h b/include/linux/rio_drv.h index e49c32b0f394..dd8afe511242 100644 --- a/include/linux/rio_drv.h +++ b/include/linux/rio_drv.h @@ -391,13 +391,8 @@ struct rio_dev *rio_dev_get(struct rio_dev *); void rio_dev_put(struct rio_dev *); #ifdef CONFIG_RAPIDIO_DMA_ENGINE -extern struct dma_chan *rio_request_dma(struct rio_dev *rdev); extern struct dma_chan *rio_request_mport_dma(struct rio_mport *mport); extern void rio_release_dma(struct dma_chan *dchan); -extern struct dma_async_tx_descriptor *rio_dma_prep_slave_sg( - struct rio_dev *rdev, struct dma_chan *dchan, - struct rio_dma_data *data, - enum dma_transfer_direction direction, unsigned long flags); extern struct dma_async_tx_descriptor *rio_dma_prep_xfer( struct dma_chan *dchan, u16 destid, struct rio_dma_data *data, -- 2.51.0 From 2a1c6158131f05c228001e1f73304535bc205444 Mon Sep 17 00:00:00 2001 From: "Dr. David Alan Gilbert" Date: Sat, 19 Apr 2025 00:49:32 +0100 Subject: [PATCH 14/16] relay: remove unused relay_late_setup_files The last use of relay_late_setup_files() was removed in 2018 by commit 2b47733045aa ("drm/i915/guc: Merge log relay file and channel creation") Remove it and the helper it used. relay_late_setup_files() was used for eventually registering 'buffer only' channels. With it gone, delete the docs that explain how to do that. Which suggests it should be possible to lose the 'has_base_filename' flags. (Are there any other uses??) Link: https://lkml.kernel.org/r/20250418234932.490863-1-linux@treblig.org Signed-off-by: Dr. David Alan Gilbert Reviewed-by: Jens Axboe Cc: Al Viro Cc: Andriy Shevchenko Cc: Jonathan Corbet Signed-off-by: Andrew Morton --- Documentation/filesystems/relay.rst | 10 --- include/linux/relay.h | 3 - kernel/relay.c | 111 +--------------------------- 3 files changed, 1 insertion(+), 123 deletions(-) diff --git a/Documentation/filesystems/relay.rst b/Documentation/filesystems/relay.rst index 04ad083cfe62..292ba8492aeb 100644 --- a/Documentation/filesystems/relay.rst +++ b/Documentation/filesystems/relay.rst @@ -301,16 +301,6 @@ user-defined data with a channel, and is immediately available (including in create_buf_file()) via chan->private_data or buf->chan->private_data. -Buffer-only channels --------------------- - -These channels have no files associated and can be created with -relay_open(NULL, NULL, ...). Such channels are useful in scenarios such -as when doing early tracing in the kernel, before the VFS is up. In these -cases, one may open a buffer-only channel and then call -relay_late_setup_files() when the kernel is ready to handle files, -to expose the buffered data to the userspace. - Channel 'modes' --------------- diff --git a/include/linux/relay.h b/include/linux/relay.h index 72b876dd5cb8..b3224111d074 100644 --- a/include/linux/relay.h +++ b/include/linux/relay.h @@ -159,9 +159,6 @@ struct rchan *relay_open(const char *base_filename, size_t n_subbufs, const struct rchan_callbacks *cb, void *private_data); -extern int relay_late_setup_files(struct rchan *chan, - const char *base_filename, - struct dentry *parent); extern void relay_close(struct rchan *chan); extern void relay_flush(struct rchan *chan); extern void relay_subbufs_consumed(struct rchan *chan, diff --git a/kernel/relay.c b/kernel/relay.c index 5ac7e711e4b6..c0c93a04d4ce 100644 --- a/kernel/relay.c +++ b/kernel/relay.c @@ -452,7 +452,7 @@ int relay_prepare_cpu(unsigned int cpu) /** * relay_open - create a new relay channel - * @base_filename: base name of files to create, %NULL for buffering only + * @base_filename: base name of files to create * @parent: dentry of parent directory, %NULL for root directory or buffer * @subbuf_size: size of sub-buffers * @n_subbufs: number of sub-buffers @@ -465,10 +465,6 @@ int relay_prepare_cpu(unsigned int cpu) * attributes specified. The created channel buffer files * will be named base_filename0...base_filenameN-1. File * permissions will be %S_IRUSR. - * - * If opening a buffer (@parent = NULL) that you later wish to register - * in a filesystem, call relay_late_setup_files() once the @parent dentry - * is available. */ struct rchan *relay_open(const char *base_filename, struct dentry *parent, @@ -540,111 +536,6 @@ struct rchan_percpu_buf_dispatcher { struct dentry *dentry; }; -/* Called in atomic context. */ -static void __relay_set_buf_dentry(void *info) -{ - struct rchan_percpu_buf_dispatcher *p = info; - - relay_set_buf_dentry(p->buf, p->dentry); -} - -/** - * relay_late_setup_files - triggers file creation - * @chan: channel to operate on - * @base_filename: base name of files to create - * @parent: dentry of parent directory, %NULL for root directory - * - * Returns 0 if successful, non-zero otherwise. - * - * Use to setup files for a previously buffer-only channel created - * by relay_open() with a NULL parent dentry. - * - * For example, this is useful for perfomring early tracing in kernel, - * before VFS is up and then exposing the early results once the dentry - * is available. - */ -int relay_late_setup_files(struct rchan *chan, - const char *base_filename, - struct dentry *parent) -{ - int err = 0; - unsigned int i, curr_cpu; - unsigned long flags; - struct dentry *dentry; - struct rchan_buf *buf; - struct rchan_percpu_buf_dispatcher disp; - - if (!chan || !base_filename) - return -EINVAL; - - strscpy(chan->base_filename, base_filename, NAME_MAX); - - mutex_lock(&relay_channels_mutex); - /* Is chan already set up? */ - if (unlikely(chan->has_base_filename)) { - mutex_unlock(&relay_channels_mutex); - return -EEXIST; - } - chan->has_base_filename = 1; - chan->parent = parent; - - if (chan->is_global) { - err = -EINVAL; - buf = *per_cpu_ptr(chan->buf, 0); - if (!WARN_ON_ONCE(!buf)) { - dentry = relay_create_buf_file(chan, buf, 0); - if (dentry && !WARN_ON_ONCE(!chan->is_global)) { - relay_set_buf_dentry(buf, dentry); - err = 0; - } - } - mutex_unlock(&relay_channels_mutex); - return err; - } - - curr_cpu = get_cpu(); - /* - * The CPU hotplug notifier ran before us and created buffers with - * no files associated. So it's safe to call relay_setup_buf_file() - * on all currently online CPUs. - */ - for_each_online_cpu(i) { - buf = *per_cpu_ptr(chan->buf, i); - if (unlikely(!buf)) { - WARN_ONCE(1, KERN_ERR "CPU has no buffer!\n"); - err = -EINVAL; - break; - } - - dentry = relay_create_buf_file(chan, buf, i); - if (unlikely(!dentry)) { - err = -EINVAL; - break; - } - - if (curr_cpu == i) { - local_irq_save(flags); - relay_set_buf_dentry(buf, dentry); - local_irq_restore(flags); - } else { - disp.buf = buf; - disp.dentry = dentry; - smp_mb(); - /* relay_channels_mutex must be held, so wait. */ - err = smp_call_function_single(i, - __relay_set_buf_dentry, - &disp, 1); - } - if (unlikely(err)) - break; - } - put_cpu(); - mutex_unlock(&relay_channels_mutex); - - return err; -} -EXPORT_SYMBOL_GPL(relay_late_setup_files); - /** * relay_switch_subbuf - switch to a new sub-buffer * @buf: channel buffer -- 2.51.0 From 92f3c5a0051d2b56379651522b587ff7309b2606 Mon Sep 17 00:00:00 2001 From: "Herton R. Krzesinski" Date: Fri, 18 Apr 2025 13:50:47 -0300 Subject: [PATCH 15/16] lib/test_kmod: do not hardcode/depend on any filesystem Right now test_kmod has hardcoded dependencies on btrfs/xfs. That is not optimal since you end up needing to select/build them, but it is not really required since other fs could be selected for the testing. Also, we can't change the default/driver module used for testing on initialization. Thus make it more generic: introduce two module parameters (start_driver and start_test_fs), which allow to select which modules/fs to use for the testing on test_kmod initialization. Then it's up to the user to select which modules/fs to use for testing based on his config. However, keep test_module as required default. This way, config/modules becomes selectable as when the testing is done from selftests (userspace). While at it, also change trigger_config_run_type, since at module initialization we already set the defaults at __kmod_config_init and should not need to do it again in test_kmod_init(), thus we can avoid to again set test_driver/test_fs. Link: https://lkml.kernel.org/r/20250418165047.702487-1-herton@redhat.com Signed-off-by: Herton R. Krzesinski Reviewed-by: Luis Chambelrain Cc: Daniel Gomez Cc: Nathan Chancellor Cc: Petr Pavlu Cc: Sami Tolvanen Signed-off-by: Andrew Morton --- lib/Kconfig.debug | 6 --- lib/test_kmod.c | 64 +++++++++++++++-------------- tools/testing/selftests/kmod/config | 5 --- 3 files changed, 34 insertions(+), 41 deletions(-) diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index f9051ab610d5..f999d7e86023 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -2983,13 +2983,7 @@ config TEST_DYNAMIC_DEBUG config TEST_KMOD tristate "kmod stress tester" depends on m - depends on NETDEVICES && NET_CORE && INET # for TUN - depends on BLOCK - depends on PAGE_SIZE_LESS_THAN_256KB # for BTRFS select TEST_LKM - select XFS_FS - select TUN - select BTRFS_FS help Test the kernel's module loading mechanism: kmod. kmod implements support to load modules using the Linux kernel's usermode helper. diff --git a/lib/test_kmod.c b/lib/test_kmod.c index 064ed0fce75a..f0dd092860ea 100644 --- a/lib/test_kmod.c +++ b/lib/test_kmod.c @@ -28,14 +28,20 @@ #define TEST_START_NUM_THREADS 50 #define TEST_START_DRIVER "test_module" -#define TEST_START_TEST_FS "xfs" #define TEST_START_TEST_CASE TEST_KMOD_DRIVER - static bool force_init_test = false; -module_param(force_init_test, bool_enable_only, 0644); +module_param(force_init_test, bool_enable_only, 0444); MODULE_PARM_DESC(force_init_test, "Force kicking a test immediately after driver loads"); +static char *start_driver; +module_param(start_driver, charp, 0444); +MODULE_PARM_DESC(start_driver, + "Module/driver to use for the testing after driver loads"); +static char *start_test_fs; +module_param(start_test_fs, charp, 0444); +MODULE_PARM_DESC(start_test_fs, + "File system to use for the testing after driver loads"); /* * For device allocation / registration @@ -508,6 +514,11 @@ static int __trigger_config_run(struct kmod_test_device *test_dev) case TEST_KMOD_DRIVER: return run_test_driver(test_dev); case TEST_KMOD_FS_TYPE: + if (!config->test_fs) { + dev_warn(test_dev->dev, + "No fs type specified, can't run the test\n"); + return -EINVAL; + } return run_test_fs_type(test_dev); default: dev_warn(test_dev->dev, @@ -721,26 +732,20 @@ static ssize_t config_test_fs_show(struct device *dev, static DEVICE_ATTR_RW(config_test_fs); static int trigger_config_run_type(struct kmod_test_device *test_dev, - enum kmod_test_case test_case, - const char *test_str) + enum kmod_test_case test_case) { - int copied = 0; struct test_config *config = &test_dev->config; mutex_lock(&test_dev->config_mutex); switch (test_case) { case TEST_KMOD_DRIVER: - kfree_const(config->test_driver); - config->test_driver = NULL; - copied = config_copy_test_driver_name(config, test_str, - strlen(test_str)); break; case TEST_KMOD_FS_TYPE: - kfree_const(config->test_fs); - config->test_fs = NULL; - copied = config_copy_test_fs(config, test_str, - strlen(test_str)); + if (!config->test_fs) { + mutex_unlock(&test_dev->config_mutex); + return 0; + } break; default: mutex_unlock(&test_dev->config_mutex); @@ -751,11 +756,6 @@ static int trigger_config_run_type(struct kmod_test_device *test_dev, mutex_unlock(&test_dev->config_mutex); - if (copied <= 0 || copied != strlen(test_str)) { - test_dev->test_is_oom = true; - return -ENOMEM; - } - test_dev->test_is_oom = false; return trigger_config_run(test_dev); @@ -800,19 +800,24 @@ static unsigned int kmod_init_test_thread_limit(void) static int __kmod_config_init(struct kmod_test_device *test_dev) { struct test_config *config = &test_dev->config; + const char *test_start_driver = start_driver ? start_driver : + TEST_START_DRIVER; int ret = -ENOMEM, copied; __kmod_config_free(config); - copied = config_copy_test_driver_name(config, TEST_START_DRIVER, - strlen(TEST_START_DRIVER)); - if (copied != strlen(TEST_START_DRIVER)) + copied = config_copy_test_driver_name(config, test_start_driver, + strlen(test_start_driver)); + if (copied != strlen(test_start_driver)) goto err_out; - copied = config_copy_test_fs(config, TEST_START_TEST_FS, - strlen(TEST_START_TEST_FS)); - if (copied != strlen(TEST_START_TEST_FS)) - goto err_out; + + if (start_test_fs) { + copied = config_copy_test_fs(config, start_test_fs, + strlen(start_test_fs)); + if (copied != strlen(start_test_fs)) + goto err_out; + } config->num_threads = kmod_init_test_thread_limit(); config->test_result = 0; @@ -1178,12 +1183,11 @@ static int __init test_kmod_init(void) * lowering the init level for more fun. */ if (force_init_test) { - ret = trigger_config_run_type(test_dev, - TEST_KMOD_DRIVER, "tun"); + ret = trigger_config_run_type(test_dev, TEST_KMOD_DRIVER); if (WARN_ON(ret)) return ret; - ret = trigger_config_run_type(test_dev, - TEST_KMOD_FS_TYPE, "btrfs"); + + ret = trigger_config_run_type(test_dev, TEST_KMOD_FS_TYPE); if (WARN_ON(ret)) return ret; } diff --git a/tools/testing/selftests/kmod/config b/tools/testing/selftests/kmod/config index 259f4fd6b5e2..1f1e63494af9 100644 --- a/tools/testing/selftests/kmod/config +++ b/tools/testing/selftests/kmod/config @@ -1,7 +1,2 @@ CONFIG_TEST_KMOD=m CONFIG_TEST_LKM=m -CONFIG_XFS_FS=m - -# For the module parameter force_init_test is used -CONFIG_TUN=m -CONFIG_BTRFS_FS=m -- 2.51.0 From f0eba23cb70a945ea0947bdbf0706ca00da50d26 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Fri, 18 Apr 2025 13:03:31 +0100 Subject: [PATCH 16/16] crash: fix spelling mistake "crahskernel" -> "crashkernel" There is a spelling mistake in a pr_warn message. Fix it. Link: https://lkml.kernel.org/r/20250418120331.535086-1-colin.i.king@gmail.com Signed-off-by: Colin Ian King Acked-by: Baoquan He Cc: Dave Young Cc: Vivek Goyal Signed-off-by: Andrew Morton --- kernel/crash_reserve.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/crash_reserve.c b/kernel/crash_reserve.c index aff7c0fdbefa..acb6bf42e30d 100644 --- a/kernel/crash_reserve.c +++ b/kernel/crash_reserve.c @@ -131,7 +131,7 @@ static int __init parse_crashkernel_mem(char *cmdline, cur++; *crash_base = memparse(cur, &tmp); if (cur == tmp) { - pr_warn("crahskernel: Memory value expected after '@'\n"); + pr_warn("crashkernel: Memory value expected after '@'\n"); return -EINVAL; } } -- 2.51.0