From 87d82cff3829733fa6838492a9215303ad98a61c Mon Sep 17 00:00:00 2001 From: Nam Cao Date: Wed, 5 Feb 2025 11:55:12 +0100 Subject: [PATCH 01/16] hrtimers: Merge __hrtimer_init() into __hrtimer_setup() __hrtimer_init() is only called by __hrtimer_setup(). Simplify by merging __hrtimer_init() into __hrtimer_setup(). Signed-off-by: Nam Cao Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/all/8a0a847a35f711f66b2d05b57255aa44e7e61279.1738746927.git.namcao@linutronix.de --- kernel/time/hrtimer.c | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index 2d2835cf2659..163cde35f0c3 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -1592,8 +1592,9 @@ static inline int hrtimer_clockid_to_base(clockid_t clock_id) } } -static void __hrtimer_init(struct hrtimer *timer, clockid_t clock_id, - enum hrtimer_mode mode) +static void __hrtimer_setup(struct hrtimer *timer, + enum hrtimer_restart (*function)(struct hrtimer *), + clockid_t clock_id, enum hrtimer_mode mode) { bool softtimer = !!(mode & HRTIMER_MODE_SOFT); struct hrtimer_cpu_base *cpu_base; @@ -1626,13 +1627,6 @@ static void __hrtimer_init(struct hrtimer *timer, clockid_t clock_id, timer->is_hard = !!(mode & HRTIMER_MODE_HARD); timer->base = &cpu_base->clock_base[base]; timerqueue_init(&timer->node); -} - -static void __hrtimer_setup(struct hrtimer *timer, - enum hrtimer_restart (*function)(struct hrtimer *), - clockid_t clock_id, enum hrtimer_mode mode) -{ - __hrtimer_init(timer, clock_id, mode); if (WARN_ON_ONCE(!function)) timer->function = hrtimer_dummy_timeout; -- 2.51.0 From 04257da0c99c9d4ff7c5bb93046482e1f7d34938 Mon Sep 17 00:00:00 2001 From: Nam Cao Date: Wed, 5 Feb 2025 11:55:16 +0100 Subject: [PATCH 02/16] hrtimers: Make callback function pointer private Make the struct hrtimer::function field private, to prevent users from changing this field in an unsafe way. hrtimer_update_function() should be used if the callback function needs to be changed. Signed-off-by: Nam Cao Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/all/7d0e6e0c5c59a64a9bea940051aac05d750bc0c2.1738746927.git.namcao@linutronix.de --- include/linux/hrtimer_types.h | 2 +- include/trace/events/timer.h | 4 ++-- kernel/time/hrtimer.c | 8 ++++---- kernel/time/timer_list.c | 2 +- 4 files changed, 8 insertions(+), 8 deletions(-) diff --git a/include/linux/hrtimer_types.h b/include/linux/hrtimer_types.h index 7c5b27daa89d..8fbbb6bdf7a1 100644 --- a/include/linux/hrtimer_types.h +++ b/include/linux/hrtimer_types.h @@ -39,7 +39,7 @@ enum hrtimer_restart { struct hrtimer { struct timerqueue_node node; ktime_t _softexpires; - enum hrtimer_restart (*function)(struct hrtimer *); + enum hrtimer_restart (*__private function)(struct hrtimer *); struct hrtimer_clock_base *base; u8 state; u8 is_rel; diff --git a/include/trace/events/timer.h b/include/trace/events/timer.h index 1ef58a04fc57..f8c906be4cd0 100644 --- a/include/trace/events/timer.h +++ b/include/trace/events/timer.h @@ -235,7 +235,7 @@ TRACE_EVENT(hrtimer_start, TP_fast_assign( __entry->hrtimer = hrtimer; - __entry->function = hrtimer->function; + __entry->function = ACCESS_PRIVATE(hrtimer, function); __entry->expires = hrtimer_get_expires(hrtimer); __entry->softexpires = hrtimer_get_softexpires(hrtimer); __entry->mode = mode; @@ -271,7 +271,7 @@ TRACE_EVENT(hrtimer_expire_entry, TP_fast_assign( __entry->hrtimer = hrtimer; __entry->now = *now; - __entry->function = hrtimer->function; + __entry->function = ACCESS_PRIVATE(hrtimer, function); ), TP_printk("hrtimer=%p function=%ps now=%llu", diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index 163cde35f0c3..88ea4bbea9c1 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -1316,7 +1316,7 @@ void hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, struct hrtimer_clock_base *base; unsigned long flags; - if (WARN_ON_ONCE(!timer->function)) + if (WARN_ON_ONCE(!ACCESS_PRIVATE(timer, function))) return; /* * Check whether the HRTIMER_MODE_SOFT bit and hrtimer.is_soft @@ -1629,9 +1629,9 @@ static void __hrtimer_setup(struct hrtimer *timer, timerqueue_init(&timer->node); if (WARN_ON_ONCE(!function)) - timer->function = hrtimer_dummy_timeout; + ACCESS_PRIVATE(timer, function) = hrtimer_dummy_timeout; else - timer->function = function; + ACCESS_PRIVATE(timer, function) = function; } /** @@ -1743,7 +1743,7 @@ static void __run_hrtimer(struct hrtimer_cpu_base *cpu_base, raw_write_seqcount_barrier(&base->seq); __remove_hrtimer(timer, base, HRTIMER_STATE_INACTIVE, 0); - fn = timer->function; + fn = ACCESS_PRIVATE(timer, function); /* * Clear the 'is relative' flag for the TIME_LOW_RES case. If the diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c index cfbb46cc4e76..b03d0ada6469 100644 --- a/kernel/time/timer_list.c +++ b/kernel/time/timer_list.c @@ -46,7 +46,7 @@ static void print_timer(struct seq_file *m, struct hrtimer *taddr, struct hrtimer *timer, int idx, u64 now) { - SEQ_printf(m, " #%d: <%p>, %ps", idx, taddr, timer->function); + SEQ_printf(m, " #%d: <%p>, %ps", idx, taddr, ACCESS_PRIVATE(timer, function)); SEQ_printf(m, ", S:%02x", timer->state); SEQ_printf(m, "\n"); SEQ_printf(m, " # expires at %Lu-%Lu nsecs [in %Ld to %Ld nsecs]\n", -- 2.51.0 From 1cc24f2e766c5a6606b834a677bd58991c1b9781 Mon Sep 17 00:00:00 2001 From: Nam Cao Date: Wed, 5 Feb 2025 11:55:17 +0100 Subject: [PATCH 03/16] hrtimers: Remove unnecessary NULL check in hrtimer_start_range_ns() The struct hrtimer::function field can only be changed using hrtimer_setup*() or hrtimer_update_function(), and both already null-check 'function'. Therefore, null-checking 'function' in hrtimer_start_range_ns() is not necessary. Signed-off-by: Nam Cao Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/all/4661c571ee87980c340ccc318fc1a473c0c8f6bc.1738746927.git.namcao@linutronix.de --- kernel/time/hrtimer.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index 88ea4bbea9c1..e883f65cd175 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -1316,8 +1316,6 @@ void hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, struct hrtimer_clock_base *base; unsigned long flags; - if (WARN_ON_ONCE(!ACCESS_PRIVATE(timer, function))) - return; /* * Check whether the HRTIMER_MODE_SOFT bit and hrtimer.is_soft * match on CONFIG_PREEMPT_RT = n. With PREEMPT_RT check the hard -- 2.51.0 From fcea1ccf2476ca793b0ca3f80ca23f5a28cbb0b3 Mon Sep 17 00:00:00 2001 From: Nam Cao Date: Wed, 5 Feb 2025 11:55:18 +0100 Subject: [PATCH 04/16] hrtimers: Rename __hrtimer_init_sleeper() to __hrtimer_setup_sleeper() All the hrtimer_init*() functions have been renamed to hrtimer_setup*(). Rename __hrtimer_init_sleeper() to __hrtimer_setup_sleeper() as well, to keep the names consistent. Signed-off-by: Nam Cao Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/all/807694aedad9353421c4a7347629a30c5c31026f.1738746927.git.namcao@linutronix.de --- kernel/time/hrtimer.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index e883f65cd175..8cb2c85cebe0 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -2016,7 +2016,7 @@ void hrtimer_sleeper_start_expires(struct hrtimer_sleeper *sl, * Make the enqueue delivery mode check work on RT. If the sleeper * was initialized for hard interrupt delivery, force the mode bit. * This is a special case for hrtimer_sleepers because - * __hrtimer_init_sleeper() determines the delivery mode on RT so the + * __hrtimer_setup_sleeper() determines the delivery mode on RT so the * fiddling with this decision is avoided at the call sites. */ if (IS_ENABLED(CONFIG_PREEMPT_RT) && sl->timer.is_hard) @@ -2026,8 +2026,8 @@ void hrtimer_sleeper_start_expires(struct hrtimer_sleeper *sl, } EXPORT_SYMBOL_GPL(hrtimer_sleeper_start_expires); -static void __hrtimer_init_sleeper(struct hrtimer_sleeper *sl, - clockid_t clock_id, enum hrtimer_mode mode) +static void __hrtimer_setup_sleeper(struct hrtimer_sleeper *sl, + clockid_t clock_id, enum hrtimer_mode mode) { /* * On PREEMPT_RT enabled kernels hrtimers which are not explicitly @@ -2067,7 +2067,7 @@ void hrtimer_setup_sleeper_on_stack(struct hrtimer_sleeper *sl, clockid_t clock_id, enum hrtimer_mode mode) { debug_init_on_stack(&sl->timer, clock_id, mode); - __hrtimer_init_sleeper(sl, clock_id, mode); + __hrtimer_setup_sleeper(sl, clock_id, mode); } EXPORT_SYMBOL_GPL(hrtimer_setup_sleeper_on_stack); -- 2.51.0 From e9ef2093ad9edec8d8a060e14891952570c82b8b Mon Sep 17 00:00:00 2001 From: Nam Cao Date: Wed, 5 Feb 2025 11:55:19 +0100 Subject: [PATCH 05/16] hrtimers: Rename debug_init() to debug_setup() All the hrtimer_init*() functions have been renamed to hrtimer_setup*(). Rename debug_init() to debug_setup() as well, to keep the names consistent. Signed-off-by: Nam Cao Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/all/4b730c1f79648b16a1c5413f928fdc2e138dfc43.1738746927.git.namcao@linutronix.de --- kernel/time/hrtimer.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index 8cb2c85cebe0..472c29816d79 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -465,9 +465,7 @@ static inline void debug_hrtimer_activate(struct hrtimer *timer, static inline void debug_hrtimer_deactivate(struct hrtimer *timer) { } #endif -static inline void -debug_init(struct hrtimer *timer, clockid_t clockid, - enum hrtimer_mode mode) +static inline void debug_setup(struct hrtimer *timer, clockid_t clockid, enum hrtimer_mode mode) { debug_hrtimer_init(timer); trace_hrtimer_init(timer, clockid, mode); @@ -1648,7 +1646,7 @@ static void __hrtimer_setup(struct hrtimer *timer, void hrtimer_setup(struct hrtimer *timer, enum hrtimer_restart (*function)(struct hrtimer *), clockid_t clock_id, enum hrtimer_mode mode) { - debug_init(timer, clock_id, mode); + debug_setup(timer, clock_id, mode); __hrtimer_setup(timer, function, clock_id, mode); } EXPORT_SYMBOL_GPL(hrtimer_setup); -- 2.51.0 From 59c9edafc0f3843c3e616eb8136a310c7c552595 Mon Sep 17 00:00:00 2001 From: Nam Cao Date: Wed, 5 Feb 2025 11:55:20 +0100 Subject: [PATCH 06/16] hrtimers: Rename debug_init_on_stack() to debug_setup_on_stack() All the hrtimer_init*() functions have been renamed to hrtimer_setup*(). Rename debug_init_on_stack() to debug_setup_on_stack() as well, to keep the names consistent. Signed-off-by: Nam Cao Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/all/073cf6162779a2f5b12624677d4c49ee7eccc1ed.1738746927.git.namcao@linutronix.de --- kernel/time/hrtimer.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index 472c29816d79..4bf91fa2d9bd 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -471,8 +471,8 @@ static inline void debug_setup(struct hrtimer *timer, clockid_t clockid, enum hr trace_hrtimer_init(timer, clockid, mode); } -static inline void debug_init_on_stack(struct hrtimer *timer, clockid_t clockid, - enum hrtimer_mode mode) +static inline void debug_setup_on_stack(struct hrtimer *timer, clockid_t clockid, + enum hrtimer_mode mode) { debug_hrtimer_init_on_stack(timer); trace_hrtimer_init(timer, clockid, mode); @@ -1665,7 +1665,7 @@ void hrtimer_setup_on_stack(struct hrtimer *timer, enum hrtimer_restart (*function)(struct hrtimer *), clockid_t clock_id, enum hrtimer_mode mode) { - debug_init_on_stack(timer, clock_id, mode); + debug_setup_on_stack(timer, clock_id, mode); __hrtimer_setup(timer, function, clock_id, mode); } EXPORT_SYMBOL_GPL(hrtimer_setup_on_stack); @@ -2064,7 +2064,7 @@ static void __hrtimer_setup_sleeper(struct hrtimer_sleeper *sl, void hrtimer_setup_sleeper_on_stack(struct hrtimer_sleeper *sl, clockid_t clock_id, enum hrtimer_mode mode) { - debug_init_on_stack(&sl->timer, clock_id, mode); + debug_setup_on_stack(&sl->timer, clock_id, mode); __hrtimer_setup_sleeper(sl, clock_id, mode); } EXPORT_SYMBOL_GPL(hrtimer_setup_sleeper_on_stack); -- 2.51.0 From 244132c4e5777fe0a4544ef23afba0d9a50e5ec5 Mon Sep 17 00:00:00 2001 From: Nam Cao Date: Wed, 5 Feb 2025 11:55:21 +0100 Subject: [PATCH 07/16] tracing/timers: Rename the hrtimer_init event to hrtimer_setup The function hrtimer_init() doesn't exist anymore. It was replaced by hrtimer_setup(). Thus, rename the hrtimer_init trace event to hrtimer_setup to keep it consistent. Signed-off-by: Nam Cao Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/all/cba84c3d853c5258aa3a262363a6eac08e2c7afc.1738746927.git.namcao@linutronix.de --- Documentation/trace/ftrace.rst | 4 ++-- include/trace/events/timer.h | 4 ++-- kernel/time/hrtimer.c | 4 ++-- tools/perf/tests/shell/trace_btf_enum.sh | 2 +- 4 files changed, 7 insertions(+), 7 deletions(-) diff --git a/Documentation/trace/ftrace.rst b/Documentation/trace/ftrace.rst index 2b74f96d09d5..c9e88bf65709 100644 --- a/Documentation/trace/ftrace.rst +++ b/Documentation/trace/ftrace.rst @@ -3077,7 +3077,7 @@ Notice that we lost the sys_nanosleep. # cat set_ftrace_filter hrtimer_run_queues hrtimer_run_pending - hrtimer_init + hrtimer_setup hrtimer_cancel hrtimer_try_to_cancel hrtimer_forward @@ -3115,7 +3115,7 @@ Again, now we want to append. # cat set_ftrace_filter hrtimer_run_queues hrtimer_run_pending - hrtimer_init + hrtimer_setup hrtimer_cancel hrtimer_try_to_cancel hrtimer_forward diff --git a/include/trace/events/timer.h b/include/trace/events/timer.h index f8c906be4cd0..1641ae3e6ca0 100644 --- a/include/trace/events/timer.h +++ b/include/trace/events/timer.h @@ -185,12 +185,12 @@ TRACE_EVENT(timer_base_idle, { HRTIMER_MODE_REL_PINNED_HARD, "REL|PINNED|HARD" }) /** - * hrtimer_init - called when the hrtimer is initialized + * hrtimer_setup - called when the hrtimer is initialized * @hrtimer: pointer to struct hrtimer * @clockid: the hrtimers clock * @mode: the hrtimers mode */ -TRACE_EVENT(hrtimer_init, +TRACE_EVENT(hrtimer_setup, TP_PROTO(struct hrtimer *hrtimer, clockid_t clockid, enum hrtimer_mode mode), diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index 4bf91fa2d9bd..517ee2590a29 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -468,14 +468,14 @@ static inline void debug_hrtimer_deactivate(struct hrtimer *timer) { } static inline void debug_setup(struct hrtimer *timer, clockid_t clockid, enum hrtimer_mode mode) { debug_hrtimer_init(timer); - trace_hrtimer_init(timer, clockid, mode); + trace_hrtimer_setup(timer, clockid, mode); } static inline void debug_setup_on_stack(struct hrtimer *timer, clockid_t clockid, enum hrtimer_mode mode) { debug_hrtimer_init_on_stack(timer); - trace_hrtimer_init(timer, clockid, mode); + trace_hrtimer_setup(timer, clockid, mode); } static inline void debug_activate(struct hrtimer *timer, diff --git a/tools/perf/tests/shell/trace_btf_enum.sh b/tools/perf/tests/shell/trace_btf_enum.sh index 60b3fa254cf6..f0b49f7fb57d 100755 --- a/tools/perf/tests/shell/trace_btf_enum.sh +++ b/tools/perf/tests/shell/trace_btf_enum.sh @@ -6,7 +6,7 @@ err=0 set -e syscall="landlock_add_rule" -non_syscall="timer:hrtimer_init,timer:hrtimer_start" +non_syscall="timer:hrtimer_setup,timer:hrtimer_start" TESTPROG="perf test -w landlock" -- 2.51.0 From 427011db477dfb8cca001e492c2b312fdf7c7173 Mon Sep 17 00:00:00 2001 From: Artur Rojek Date: Sun, 16 Feb 2025 18:55:44 +0100 Subject: [PATCH 08/16] sh: Align .bss section padding to 8-byte boundary J2-based devices expect to find a device tree blob at the end of the .bss section. As of a77725a9a3c5 ("scripts/dtc: Update to upstream version v1.6.1-19-g0a3a9d3449c8"), libfdt enforces 8-byte alignment for the DTB, causing J2 devices to fail early in sh_fdt_init(). As the J2 loader firmware calculates the DTB location based on the kernel image .bss section size rather than the __bss_stop symbol offset, the required alignment can't be enforced with BSS_SECTION(0, PAGE_SIZE, 8). To fix this, inline a modified version of the above macro which grows .bss by the required size. While this change affects all existing SH boards, it should be benign on platforms which don't need this alignment. Signed-off-by: Artur Rojek Reviewed-by: John Paul Adrian Glaubitz Tested-by: Rob Landley Signed-off-by: John Paul Adrian Glaubitz --- arch/sh/kernel/vmlinux.lds.S | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/arch/sh/kernel/vmlinux.lds.S b/arch/sh/kernel/vmlinux.lds.S index 9644fe187a3f..008c30289eaa 100644 --- a/arch/sh/kernel/vmlinux.lds.S +++ b/arch/sh/kernel/vmlinux.lds.S @@ -71,7 +71,20 @@ SECTIONS . = ALIGN(PAGE_SIZE); __init_end = .; - BSS_SECTION(0, PAGE_SIZE, 4) + __bss_start = .; + SBSS(0) + . = ALIGN(PAGE_SIZE); + .bss : AT(ADDR(.bss) - LOAD_OFFSET) { + BSS_FIRST_SECTIONS + . = ALIGN(PAGE_SIZE); + *(.bss..page_aligned) + . = ALIGN(PAGE_SIZE); + *(.dynbss) + *(BSS_MAIN) + *(COMMON) + . = ALIGN(8); + } + __bss_stop = .; _end = . ; STABS_DEBUG -- 2.51.0 From 5f2efd67a17e5f4e2fccdb86014efaf8725f57a7 Mon Sep 17 00:00:00 2001 From: Johan Korsnes Date: Sun, 23 Mar 2025 20:13:30 +0100 Subject: [PATCH 09/16] sh: defconfig: Drop obsolete CONFIG_NET_CLS_TCINDEX This option was removed from Kconfig in 8c710f75256b ("net/sched: Retire tcindex classifier") but from the defconfigs. Fixes: 8c710f75256b ("net/sched: Retire tcindex classifier") Signed-off-by: Johan Korsnes Cc: Yoshinori Sato Cc: Rich Felker Cc: John Paul Adrian Glaubitz Reviewed-by: Geert Uytterhoeven Reviewed-by: John Paul Adrian Glaubitz Signed-off-by: John Paul Adrian Glaubitz --- arch/sh/configs/se7712_defconfig | 1 - arch/sh/configs/se7721_defconfig | 1 - arch/sh/configs/sh7710voipgw_defconfig | 1 - arch/sh/configs/titan_defconfig | 1 - 4 files changed, 4 deletions(-) diff --git a/arch/sh/configs/se7712_defconfig b/arch/sh/configs/se7712_defconfig index 20f07aee5bde..49a4961889de 100644 --- a/arch/sh/configs/se7712_defconfig +++ b/arch/sh/configs/se7712_defconfig @@ -57,7 +57,6 @@ CONFIG_NET_SCH_TBF=y CONFIG_NET_SCH_GRED=y CONFIG_NET_SCH_DSMARK=y CONFIG_NET_SCH_NETEM=y -CONFIG_NET_CLS_TCINDEX=y CONFIG_NET_CLS_ROUTE4=y CONFIG_NET_CLS_FW=y CONFIG_MTD=y diff --git a/arch/sh/configs/se7721_defconfig b/arch/sh/configs/se7721_defconfig index 00862d3c030d..de293792db84 100644 --- a/arch/sh/configs/se7721_defconfig +++ b/arch/sh/configs/se7721_defconfig @@ -56,7 +56,6 @@ CONFIG_NET_SCH_TBF=y CONFIG_NET_SCH_GRED=y CONFIG_NET_SCH_DSMARK=y CONFIG_NET_SCH_NETEM=y -CONFIG_NET_CLS_TCINDEX=y CONFIG_NET_CLS_ROUTE4=y CONFIG_NET_CLS_FW=y CONFIG_MTD=y diff --git a/arch/sh/configs/sh7710voipgw_defconfig b/arch/sh/configs/sh7710voipgw_defconfig index 99a5d0760532..5b151bb2bc43 100644 --- a/arch/sh/configs/sh7710voipgw_defconfig +++ b/arch/sh/configs/sh7710voipgw_defconfig @@ -27,7 +27,6 @@ CONFIG_NETFILTER=y CONFIG_NET_SCHED=y CONFIG_NET_SCH_CBQ=y CONFIG_NET_CLS_BASIC=y -CONFIG_NET_CLS_TCINDEX=y CONFIG_NET_CLS_ROUTE4=y CONFIG_NET_CLS_U32=y CONFIG_MTD=y diff --git a/arch/sh/configs/titan_defconfig b/arch/sh/configs/titan_defconfig index 99bc0e889287..e2928311a126 100644 --- a/arch/sh/configs/titan_defconfig +++ b/arch/sh/configs/titan_defconfig @@ -119,7 +119,6 @@ CONFIG_NET_SCH_DSMARK=m CONFIG_NET_SCH_NETEM=m CONFIG_NET_SCH_INGRESS=m CONFIG_NET_CLS_BASIC=m -CONFIG_NET_CLS_TCINDEX=m CONFIG_NET_CLS_ROUTE4=m CONFIG_NET_CLS_FW=m CONFIG_NET_CLS_U32=m -- 2.51.0 From 3b8241f64c469e449e862cf2bdc50b879fa18f93 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Sun, 22 Dec 2024 09:30:53 +0900 Subject: [PATCH 10/16] nios2: migrate to the generic rule for built-in DTB Commit 654102df2ac2 ("kbuild: add generic support for built-in boot DTBs") introduced generic support for built-in DTBs. Select GENERIC_BUILTIN_DTB when built-in DTB support is enabled. To keep consistency across architectures, this commit also renames CONFIG_NIOS2_DTB_SOURCE_BOOL to CONFIG_BUILTIN_DTB, and CONFIG_NIOS2_DTB_SOURCE to CONFIG_BUILTIN_DTB_NAME. Signed-off-by: Masahiro Yamada --- arch/nios2/Kbuild | 2 +- arch/nios2/boot/dts/Makefile | 4 ++-- arch/nios2/kernel/prom.c | 2 +- arch/nios2/platform/Kconfig.platform | 11 ++++++----- 4 files changed, 10 insertions(+), 9 deletions(-) diff --git a/arch/nios2/Kbuild b/arch/nios2/Kbuild index fc2952edd2de..fa64c5954b20 100644 --- a/arch/nios2/Kbuild +++ b/arch/nios2/Kbuild @@ -1,6 +1,6 @@ # SPDX-License-Identifier: GPL-2.0-only -obj-y += kernel/ mm/ platform/ boot/dts/ +obj-y += kernel/ mm/ platform/ # for cleaning subdir- += boot diff --git a/arch/nios2/boot/dts/Makefile b/arch/nios2/boot/dts/Makefile index 1a2e8996bec7..1b8f41c4154f 100644 --- a/arch/nios2/boot/dts/Makefile +++ b/arch/nios2/boot/dts/Makefile @@ -1,5 +1,5 @@ # SPDX-License-Identifier: GPL-2.0 -obj-y := $(patsubst %.dts,%.dtb.o,$(CONFIG_NIOS2_DTB_SOURCE)) +dtb-y := $(addsuffix .dtb, $(CONFIG_BUILTIN_DTB_NAME)) -dtb-$(CONFIG_OF_ALL_DTBS) := $(patsubst $(src)/%.dts,%.dtb, $(wildcard $(src)/*.dts)) +dtb-$(CONFIG_OF_ALL_DTBS) += $(patsubst $(src)/%.dts,%.dtb, $(wildcard $(src)/*.dts)) diff --git a/arch/nios2/kernel/prom.c b/arch/nios2/kernel/prom.c index db049249766f..4f8c14da6490 100644 --- a/arch/nios2/kernel/prom.c +++ b/arch/nios2/kernel/prom.c @@ -32,7 +32,7 @@ void __init early_init_devtree(void *params) } #endif -#ifdef CONFIG_NIOS2_DTB_SOURCE_BOOL +#ifdef CONFIG_BUILTIN_DTB if (be32_to_cpu((__be32) *dtb) == OF_DT_HEADER) params = (void *)__dtb_start; #endif diff --git a/arch/nios2/platform/Kconfig.platform b/arch/nios2/platform/Kconfig.platform index e849daff6fd1..c75cadd92388 100644 --- a/arch/nios2/platform/Kconfig.platform +++ b/arch/nios2/platform/Kconfig.platform @@ -35,19 +35,20 @@ config NIOS2_DTB_PHYS_ADDR help Physical address of a dtb blob. -config NIOS2_DTB_SOURCE_BOOL +config BUILTIN_DTB bool "Compile and link device tree into kernel image" depends on !COMPILE_TEST + select GENERIC_BUILTIN_DTB help This allows you to specify a dts (device tree source) file which will be compiled and linked into the kernel image. -config NIOS2_DTB_SOURCE - string "Device tree source file" - depends on NIOS2_DTB_SOURCE_BOOL +config BUILTIN_DTB_NAME + string "Built-in device tree name" + depends on BUILTIN_DTB default "" help - Absolute path to the device tree source (dts) file describing your + Relative path to the device tree without suffix describing your system. comment "Nios II instructions" -- 2.51.0 From a26fe287eed112b4e21e854f173c8918a6a8596d Mon Sep 17 00:00:00 2001 From: Daniel Gomez Date: Fri, 28 Mar 2025 14:28:37 +0000 Subject: [PATCH 11/16] kconfig: merge_config: use an empty file as initfile The scripts/kconfig/merge_config.sh script requires an existing $INITFILE (or the $1 argument) as a base file for merging Kconfig fragments. However, an empty $INITFILE can serve as an initial starting point, later referenced by the KCONFIG_ALLCONFIG Makefile variable if -m is not used. This variable can point to any configuration file containing preset config symbols (the merged output) as stated in Documentation/kbuild/kconfig.rst. When -m is used $INITFILE will contain just the merge output requiring the user to run make (i.e. KCONFIG_ALLCONFIG=<$INITFILE> make or make olddefconfig). Instead of failing when `$INITFILE` is missing, create an empty file and use it as the starting point for merges. Signed-off-by: Daniel Gomez Signed-off-by: Masahiro Yamada --- scripts/kconfig/merge_config.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/kconfig/merge_config.sh b/scripts/kconfig/merge_config.sh index 0b7952471c18..79c09b378be8 100755 --- a/scripts/kconfig/merge_config.sh +++ b/scripts/kconfig/merge_config.sh @@ -112,8 +112,8 @@ INITFILE=$1 shift; if [ ! -r "$INITFILE" ]; then - echo "The base file '$INITFILE' does not exist. Exit." >&2 - exit 1 + echo "The base file '$INITFILE' does not exist. Creating one..." >&2 + touch "$INITFILE" fi MERGE_LIST=$* -- 2.51.0 From a7c699d090a1f3795c3271c2b399230e182db06e Mon Sep 17 00:00:00 2001 From: Uday Shankar Date: Mon, 31 Mar 2025 16:46:32 -0600 Subject: [PATCH 12/16] kbuild: rpm-pkg: build a debuginfo RPM The rpm-pkg make target currently suffers from a few issues related to debuginfo: 1. debuginfo for things built into the kernel (vmlinux) is not available in any RPM produced by make rpm-pkg. This makes using tools like systemtap against a make rpm-pkg kernel impossible. 2. debug source for the kernel is not available. This means that commands like 'disas /s' in gdb, which display source intermixed with assembly, can only print file names/line numbers which then must be painstakingly resolved to actual source in a separate editor. 3. debuginfo for modules is available, but it remains bundled with the .ko files that contain module code, in the main kernel RPM. This is a waste of space for users who do not need to debug the kernel (i.e. most users). Address all of these issues by additionally building a debuginfo RPM when the kernel configuration allows for it, in line with standard patterns followed by RPM distributors. With these changes: 1. systemtap now works (when these changes are backported to 6.11, since systemtap lags a bit behind in compatibility), as verified by the following simple test script: # stap -e 'probe kernel.function("do_sys_open").call { printf("%s\n", $$parms); }' dfd=0xffffffffffffff9c filename=0x7fe18800b160 flags=0x88800 mode=0x0 ... 2. disas /s works correctly in gdb, with source and disassembly interspersed: # gdb vmlinux --batch -ex 'disas /s blk_op_str' Dump of assembler code for function blk_op_str: block/blk-core.c: 125 { 0xffffffff814c8740 <+0>: endbr64 127 128 if (op < ARRAY_SIZE(blk_op_name) && blk_op_name[op]) 0xffffffff814c8744 <+4>: mov $0xffffffff824a7378,%rax 0xffffffff814c874b <+11>: cmp $0x23,%edi 0xffffffff814c874e <+14>: ja 0xffffffff814c8768 0xffffffff814c8750 <+16>: mov %edi,%edi 126 const char *op_str = "UNKNOWN"; 0xffffffff814c8752 <+18>: mov $0xffffffff824a7378,%rdx 127 128 if (op < ARRAY_SIZE(blk_op_name) && blk_op_name[op]) 0xffffffff814c8759 <+25>: mov -0x7dfa0160(,%rdi,8),%rax 126 const char *op_str = "UNKNOWN"; 0xffffffff814c8761 <+33>: test %rax,%rax 0xffffffff814c8764 <+36>: cmove %rdx,%rax 129 op_str = blk_op_name[op]; 130 131 return op_str; 132 } 0xffffffff814c8768 <+40>: jmp 0xffffffff81d01360 <__x86_return_thunk> End of assembler dump. 3. The size of the main kernel package goes down substantially, especially if many modules are built (quite typical). Here is a comparison of installed size of the kernel package (configured with allmodconfig, dwarf4 debuginfo, and module compression turned off) before and after this patch: # rpm -qi kernel-6.13* | grep -E '^(Version|Size)' Version : 6.13.0postpatch+ Size : 1382874089 Version : 6.13.0prepatch+ Size : 17870795887 This is a ~92% size reduction. Note that a debuginfo package can only be produced if the following configs are set: - CONFIG_DEBUG_INFO=y - CONFIG_MODULE_COMPRESS=n - CONFIG_DEBUG_INFO_SPLIT=n The first of these is obvious - we can't produce debuginfo if the build does not generate it. The second two requirements can in principle be removed, but doing so is difficult with the current approach, which uses a generic rpmbuild script find-debuginfo.sh that processes all packaged executables. If we want to remove those requirements the best path forward is likely to add some debuginfo extraction/installation logic to the modules_install target (controllable by flags). That way, it's easier to operate on modules before they're compressed, and the logic can be reused by all packaging targets. Signed-off-by: Uday Shankar Signed-off-by: Masahiro Yamada --- scripts/package/kernel.spec | 46 +++++++++++++++++++++++++++++++++++-- scripts/package/mkspec | 10 ++++++++ 2 files changed, 54 insertions(+), 2 deletions(-) diff --git a/scripts/package/kernel.spec b/scripts/package/kernel.spec index ac3e5ac01d8a..726f34e11960 100644 --- a/scripts/package/kernel.spec +++ b/scripts/package/kernel.spec @@ -2,8 +2,6 @@ %{!?_arch: %define _arch dummy} %{!?make: %define make make} %define makeflags %{?_smp_mflags} ARCH=%{ARCH} -%define __spec_install_post /usr/lib/rpm/brp-compress || : -%define debug_package %{nil} Name: kernel Summary: The Linux Kernel @@ -46,6 +44,36 @@ This package provides kernel headers and makefiles sufficient to build modules against the %{version} kernel package. %endif +%if %{with_debuginfo} +# list of debuginfo-related options taken from distribution kernel.spec +# files +%undefine _include_minidebuginfo +%undefine _find_debuginfo_dwz_opts +%undefine _unique_build_ids +%undefine _unique_debug_names +%undefine _unique_debug_srcs +%undefine _debugsource_packages +%undefine _debuginfo_subpackages +%global _find_debuginfo_opts -r +%global _missing_build_ids_terminate_build 1 +%global _no_recompute_build_ids 1 +%{debug_package} +%endif +# some (but not all) versions of rpmbuild emit %%debug_package with +# %%install. since we've already emitted it manually, that would cause +# a package redefinition error. ensure that doesn't happen +%define debug_package %{nil} + +# later, we make all modules executable so that find-debuginfo.sh strips +# them up. but they don't actually need to be executable, so remove the +# executable bit, taking care to do it _after_ find-debuginfo.sh has run +%define __spec_install_post \ + %{?__debug_package:%{__debug_install_post}} \ + %{__arch_install_post} \ + %{__os_install_post} \ + find %{buildroot}/lib/modules/%{KERNELRELEASE} -name "*.ko" -type f \\\ + | xargs --no-run-if-empty chmod u-x + %prep %setup -q -n linux cp %{SOURCE1} .config @@ -89,8 +117,22 @@ ln -fns /usr/src/kernels/%{KERNELRELEASE} %{buildroot}/lib/modules/%{KERNELRELEA echo "%exclude /lib/modules/%{KERNELRELEASE}/build" } > %{buildroot}/kernel.list +# make modules executable so that find-debuginfo.sh strips them. this +# will be undone later in %%__spec_install_post +find %{buildroot}/lib/modules/%{KERNELRELEASE} -name "*.ko" -type f \ + | xargs --no-run-if-empty chmod u+x + +%if %{with_debuginfo} +# copying vmlinux directly to the debug directory means it will not get +# stripped (but its source paths will still be collected + fixed up) +mkdir -p %{buildroot}/usr/lib/debug/lib/modules/%{KERNELRELEASE} +cp vmlinux %{buildroot}/usr/lib/debug/lib/modules/%{KERNELRELEASE} +%endif + %clean rm -rf %{buildroot} +rm -f debugfiles.list debuglinks.list debugsourcefiles.list debugsources.list \ + elfbins.list %post if [ -x /usr/bin/kernel-install ]; then diff --git a/scripts/package/mkspec b/scripts/package/mkspec index 4dc1466dfc81..c7375bfc25a9 100755 --- a/scripts/package/mkspec +++ b/scripts/package/mkspec @@ -23,6 +23,16 @@ else echo '%define with_devel 0' fi +# debuginfo package generation uses find-debuginfo.sh under the hood, +# which only works on uncompressed modules that contain debuginfo +if grep -q CONFIG_DEBUG_INFO=y include/config/auto.conf && + (! grep -q CONFIG_MODULE_COMPRESS=y include/config/auto.conf) && + (! grep -q CONFIG_DEBUG_INFO_SPLIT=y include/config/auto.conf); then +echo '%define with_debuginfo %{?_without_debuginfo: 0} %{?!_without_debuginfo: 1}' +else +echo '%define with_debuginfo 0' +fi + cat< Date: Sun, 6 Apr 2025 10:00:04 -0700 Subject: [PATCH 13/16] Disable SLUB_TINY for build testing ... and don't error out so hard on missing module descriptions. Before commit 6c6c1fc09de3 ("modpost: require a MODULE_DESCRIPTION()") we used to warn about missing module descriptions, but only when building with extra warnigns (ie 'W=1'). After that commit the warning became an unconditional hard error. And it turns out not all modules have been converted despite the claims to the contrary. As reported by Damian Tometzki, the slub KUnit test didn't have a module description, and apparently nobody ever really noticed. The reason nobody noticed seems to be that the slub KUnit tests get disabled by SLUB_TINY, which also ends up disabling a lot of other code, both in tests and in slub itself. And so anybody doing full build tests didn't actually see this failre. So let's disable SLUB_TINY for build-only tests, since it clearly ends up limiting build coverage. Also turn the missing module descriptions error back into a warning, but let's keep it around for non-'W=1' builds. Reported-by: Damian Tometzki Link: https://lore.kernel.org/all/01070196099fd059-e8463438-7b1b-4ec8-816d-173874be9966-000000@eu-central-1.amazonses.com/ Cc: Masahiro Yamada Cc: Jeff Johnson Fixes: 6c6c1fc09de3 ("modpost: require a MODULE_DESCRIPTION()") Signed-off-by: Linus Torvalds --- mm/Kconfig | 2 +- scripts/mod/modpost.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/mm/Kconfig b/mm/Kconfig index d3fb3762887b..e113f713b493 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -201,7 +201,7 @@ config KVFREE_RCU_BATCHED config SLUB_TINY bool "Configure for minimal memory footprint" - depends on EXPERT + depends on EXPERT && !COMPILE_TEST select SLAB_MERGE_DEFAULT help Configures the slab allocator in a way to achieve minimal memory diff --git a/scripts/mod/modpost.c b/scripts/mod/modpost.c index 92627e8d0e16..be89921d60b6 100644 --- a/scripts/mod/modpost.c +++ b/scripts/mod/modpost.c @@ -1603,7 +1603,7 @@ static void read_symbols(const char *modname) } if (!get_modinfo(&info, "description")) - error("missing MODULE_DESCRIPTION() in %s\n", modname); + warn("missing MODULE_DESCRIPTION() in %s\n", modname); } for (sym = info.symtab_start; sym < info.symtab_stop; sym++) { -- 2.51.0 From 0ba3a4ab76fd3367b9cb680cad70182c896c795c Mon Sep 17 00:00:00 2001 From: Gabriel Shahrouzi Date: Sat, 5 Apr 2025 16:30:36 -0400 Subject: [PATCH 14/16] perf/core: Fix WARN_ON(!ctx) in __free_event() for partial init Move the get_ctx(child_ctx) call and the child_event->ctx assignment to occur immediately after the child event is allocated. Ensure that child_event->ctx is non-NULL before any subsequent error path within inherit_event calls free_event(), satisfying the assumptions of the cleanup code. Details: There's no clear Fixes tag, because this bug is a side-effect of multiple interacting commits over time (up to 15 years old), not a single regression. The code initially incremented refcount then assigned context immediately after the child_event was created. Later, an early validity check for child_event was added before the refcount/assignment. Even later, a WARN_ON_ONCE() cleanup check was added, assuming event->ctx is valid if the pmu_ctx is valid. The problem is that the WARN_ON_ONCE() could trigger after the initial check passed but before child_event->ctx was assigned, violating its precondition. The solution is to assign child_event->ctx right after its initial validation. This ensures the context exists for any subsequent checks or cleanup routines, resolving the WARN_ON_ONCE(). To resolve it, defer the refcount update and child_event->ctx assignment directly after child_event->pmu_ctx is set but before checking if the parent event is orphaned. The cleanup routine depends on event->pmu_ctx being non-NULL before it verifies event->ctx is non-NULL. This also maintains the author's original intent of passing in child_ctx to find_get_pmu_context before its refcount/assignment. [ mingo: Expanded the changelog from another email by Gabriel Shahrouzi. ] Reported-by: syzbot+ff3aa851d46ab82953a3@syzkaller.appspotmail.com Signed-off-by: Gabriel Shahrouzi Signed-off-by: Ingo Molnar Cc: Peter Zijlstra Cc: Ravi Bangoria Cc: Kan Liang Cc: Oleg Nesterov Cc: Alexander Shishkin Link: https://lore.kernel.org/r/20250405203036.582721-1-gshahrouzi@gmail.com Closes: https://syzkaller.appspot.com/bug?extid=ff3aa851d46ab82953a3 --- kernel/events/core.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/kernel/events/core.c b/kernel/events/core.c index 128db74e9eab..9af9726ef5f5 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -14016,6 +14016,9 @@ inherit_event(struct perf_event *parent_event, if (IS_ERR(child_event)) return child_event; + get_ctx(child_ctx); + child_event->ctx = child_ctx; + pmu_ctx = find_get_pmu_context(child_event->pmu, child_ctx, child_event); if (IS_ERR(pmu_ctx)) { free_event(child_event); @@ -14037,8 +14040,6 @@ inherit_event(struct perf_event *parent_event, return NULL; } - get_ctx(child_ctx); - /* * Make the child state follow the state of the parent event, * not its attr.disabled bit. We hold the parent's mutex, @@ -14059,7 +14060,6 @@ inherit_event(struct perf_event *parent_event, local64_set(&hwc->period_left, sample_period); } - child_event->ctx = child_ctx; child_event->overflow_handler = parent_event->overflow_handler; child_event->overflow_handler_context = parent_event->overflow_handler_context; -- 2.51.0 From 0cd575cab10e114e95921321f069a08d45bc412e Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Fri, 4 Apr 2025 12:48:48 -0700 Subject: [PATCH 15/16] uprobes: Avoid false-positive lockdep splat on CONFIG_PREEMPT_RT=y in the ri_timer() uprobe timer callback, use raw_write_seqcount_*() Avoid a false-positive lockdep warning in the CONFIG_PREEMPT_RT=y configuration when using write_seqcount_begin() in the uprobe timer callback by using raw_write_* APIs. Uprobe's use of timer callback is guaranteed to not race with itself for a given uprobe_task, and as such seqcount's insistence on having preemption disabled on the writer side is irrelevant. So switch to raw_ variants of seqcount API instead of disabling preemption unnecessarily. Also, point out in the comments more explicitly why we use seqcount despite our reader side being rather simple and never retrying. We favor well-maintained kernel primitive in favor of open-coding our own memory barriers. Fixes: 8622e45b5da1 ("uprobes: Reuse return_instances between multiple uretprobes within task") Reported-by: Alexei Starovoitov Suggested-by: Sebastian Siewior Signed-off-by: Andrii Nakryiko Signed-off-by: Ingo Molnar Acked-by: Oleg Nesterov Cc: Thomas Gleixner Cc: Peter Zijlstra Cc: stable@kernel.org Link: https://lore.kernel.org/r/20250404194848.2109539-1-andrii@kernel.org --- kernel/events/uprobes.c | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 615b4e6d22c7..8d783b5882b6 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -1956,6 +1956,9 @@ static void free_ret_instance(struct uprobe_task *utask, * to-be-reused return instances for future uretprobes. If ri_timer() * happens to be running right now, though, we fallback to safety and * just perform RCU-delated freeing of ri. + * Admittedly, this is a rather simple use of seqcount, but it nicely + * abstracts away all the necessary memory barriers, so we use + * a well-supported kernel primitive here. */ if (raw_seqcount_try_begin(&utask->ri_seqcount, seq)) { /* immediate reuse of ri without RCU GP is OK */ @@ -2016,12 +2019,20 @@ static void ri_timer(struct timer_list *timer) /* RCU protects return_instance from freeing. */ guard(rcu)(); - write_seqcount_begin(&utask->ri_seqcount); + /* + * See free_ret_instance() for notes on seqcount use. + * We also employ raw API variants to avoid lockdep false-positive + * warning complaining about enabled preemption. The timer can only be + * invoked once for a uprobe_task. Therefore there can only be one + * writer. The reader does not require an even sequence count to make + * progress, so it is OK to remain preemptible on PREEMPT_RT. + */ + raw_write_seqcount_begin(&utask->ri_seqcount); for_each_ret_instance_rcu(ri, utask->return_instances) hprobe_expire(&ri->hprobe, false); - write_seqcount_end(&utask->ri_seqcount); + raw_write_seqcount_end(&utask->ri_seqcount); } static struct uprobe_task *alloc_utask(void) -- 2.51.0 From 56799bc035658738f362acec3e7647bb84e68933 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Tue, 4 Mar 2025 14:54:46 +0100 Subject: [PATCH 16/16] perf: Fix hang while freeing sigtrap event Perf can hang while freeing a sigtrap event if a related deferred signal hadn't managed to be sent before the file got closed: perf_event_overflow() task_work_add(perf_pending_task) fput() task_work_add(____fput()) task_work_run() ____fput() perf_release() perf_event_release_kernel() _free_event() perf_pending_task_sync() task_work_cancel() -> FAILED rcuwait_wait_event() Once task_work_run() is running, the list of pending callbacks is removed from the task_struct and from this point on task_work_cancel() can't remove any pending and not yet started work items, hence the task_work_cancel() failure and the hang on rcuwait_wait_event(). Task work could be changed to remove one work at a time, so a work running on the current task can always cancel a pending one, however the wait / wake design is still subject to inverted dependencies when remote targets are involved, as pictured by Oleg: T1 T2 fd = perf_event_open(pid => T2->pid); fd = perf_event_open(pid => T1->pid); close(fd) close(fd) perf_event_overflow() perf_event_overflow() task_work_add(perf_pending_task) task_work_add(perf_pending_task) fput() fput() task_work_add(____fput()) task_work_add(____fput()) task_work_run() task_work_run() ____fput() ____fput() perf_release() perf_release() perf_event_release_kernel() perf_event_release_kernel() _free_event() _free_event() perf_pending_task_sync() perf_pending_task_sync() rcuwait_wait_event() rcuwait_wait_event() Therefore the only option left is to acquire the event reference count upon queueing the perf task work and release it from the task work, just like it was done before 3a5465418f5f ("perf: Fix event leak upon exec and file release") but without the leaks it fixed. Some adjustments are necessary to make it work: * A child event might dereference its parent upon freeing. Care must be taken to release the parent last. * Some places assuming the event doesn't have any reference held and therefore can be freed right away must instead put the reference and let the reference counting to its job. Reported-by: "Yi Lai" Closes: https://lore.kernel.org/all/Zx9Losv4YcJowaP%2F@ly-workstation/ Reported-by: syzbot+3c4321e10eea460eb606@syzkaller.appspotmail.com Closes: https://lore.kernel.org/all/673adf75.050a0220.87769.0024.GAE@google.com/ Fixes: 3a5465418f5f ("perf: Fix event leak upon exec and file release") Signed-off-by: Frederic Weisbecker Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20250304135446.18905-1-frederic@kernel.org --- include/linux/perf_event.h | 1 - kernel/events/core.c | 64 +++++++++++--------------------------- 2 files changed, 18 insertions(+), 47 deletions(-) diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 5a9bf15d4461..0069ba6866a4 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -823,7 +823,6 @@ struct perf_event { struct irq_work pending_disable_irq; struct callback_head pending_task; unsigned int pending_work; - struct rcuwait pending_work_wait; atomic_t event_limit; diff --git a/kernel/events/core.c b/kernel/events/core.c index 9af9726ef5f5..e93c19565914 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -5518,30 +5518,6 @@ static bool exclusive_event_installable(struct perf_event *event, static void perf_free_addr_filters(struct perf_event *event); -static void perf_pending_task_sync(struct perf_event *event) -{ - struct callback_head *head = &event->pending_task; - - if (!event->pending_work) - return; - /* - * If the task is queued to the current task's queue, we - * obviously can't wait for it to complete. Simply cancel it. - */ - if (task_work_cancel(current, head)) { - event->pending_work = 0; - local_dec(&event->ctx->nr_no_switch_fast); - return; - } - - /* - * All accesses related to the event are within the same RCU section in - * perf_pending_task(). The RCU grace period before the event is freed - * will make sure all those accesses are complete by then. - */ - rcuwait_wait_event(&event->pending_work_wait, !event->pending_work, TASK_UNINTERRUPTIBLE); -} - /* vs perf_event_alloc() error */ static void __free_event(struct perf_event *event) { @@ -5599,7 +5575,6 @@ static void _free_event(struct perf_event *event) { irq_work_sync(&event->pending_irq); irq_work_sync(&event->pending_disable_irq); - perf_pending_task_sync(event); unaccount_event(event); @@ -5692,10 +5667,17 @@ static void perf_remove_from_owner(struct perf_event *event) static void put_event(struct perf_event *event) { + struct perf_event *parent; + if (!atomic_long_dec_and_test(&event->refcount)) return; + parent = event->parent; _free_event(event); + + /* Matches the refcount bump in inherit_event() */ + if (parent) + put_event(parent); } /* @@ -5779,11 +5761,6 @@ again: if (tmp == child) { perf_remove_from_context(child, DETACH_GROUP); list_move(&child->child_list, &free_list); - /* - * This matches the refcount bump in inherit_event(); - * this can't be the last reference. - */ - put_event(event); } else { var = &ctx->refcount; } @@ -5809,7 +5786,8 @@ again: void *var = &child->ctx->refcount; list_del(&child->child_list); - free_event(child); + /* Last reference unless ->pending_task work is pending */ + put_event(child); /* * Wake any perf_event_free_task() waiting for this event to be @@ -5820,7 +5798,11 @@ again: } no_ctx: - put_event(event); /* Must be the 'last' reference */ + /* + * Last reference unless ->pending_task work is pending on this event + * or any of its children. + */ + put_event(event); return 0; } EXPORT_SYMBOL_GPL(perf_event_release_kernel); @@ -7235,12 +7217,6 @@ static void perf_pending_task(struct callback_head *head) struct perf_event *event = container_of(head, struct perf_event, pending_task); int rctx; - /* - * All accesses to the event must belong to the same implicit RCU read-side - * critical section as the ->pending_work reset. See comment in - * perf_pending_task_sync(). - */ - rcu_read_lock(); /* * If we 'fail' here, that's OK, it means recursion is already disabled * and we won't recurse 'further'. @@ -7251,9 +7227,8 @@ static void perf_pending_task(struct callback_head *head) event->pending_work = 0; perf_sigtrap(event); local_dec(&event->ctx->nr_no_switch_fast); - rcuwait_wake_up(&event->pending_work_wait); } - rcu_read_unlock(); + put_event(event); if (rctx >= 0) perf_swevent_put_recursion_context(rctx); @@ -10248,6 +10223,7 @@ static int __perf_event_overflow(struct perf_event *event, !task_work_add(current, &event->pending_task, notify_mode)) { event->pending_work = pending_id; local_inc(&event->ctx->nr_no_switch_fast); + WARN_ON_ONCE(!atomic_long_inc_not_zero(&event->refcount)); event->pending_addr = 0; if (valid_sample && (data->sample_flags & PERF_SAMPLE_ADDR)) @@ -12610,7 +12586,6 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu, init_irq_work(&event->pending_irq, perf_pending_irq); event->pending_disable_irq = IRQ_WORK_INIT_HARD(perf_pending_disable); init_task_work(&event->pending_task, perf_pending_task); - rcuwait_init(&event->pending_work_wait); mutex_init(&event->mmap_mutex); raw_spin_lock_init(&event->addr_filters.lock); @@ -13747,8 +13722,7 @@ perf_event_exit_event(struct perf_event *event, struct perf_event_context *ctx) * Kick perf_poll() for is_event_hup(); */ perf_event_wakeup(parent_event); - free_event(event); - put_event(parent_event); + put_event(event); return; } @@ -13872,13 +13846,11 @@ static void perf_free_event(struct perf_event *event, list_del_init(&event->child_list); mutex_unlock(&parent->child_mutex); - put_event(parent); - raw_spin_lock_irq(&ctx->lock); perf_group_detach(event); list_del_event(event, ctx); raw_spin_unlock_irq(&ctx->lock); - free_event(event); + put_event(event); } /* -- 2.51.0