From 535d1784d8a96dce79023ac5081de1aa8669388a Mon Sep 17 00:00:00 2001 From: Diogo Ivo Date: Mon, 17 Mar 2025 10:55:06 +0000 Subject: [PATCH 01/16] watchdog: Add driver for Intel OC WDT Add a driver for the Intel Over-Clocking Watchdog found in Intel Platform Controller (PCH) chipsets. This watchdog is controlled via a simple single-register interface and would otherwise be standard except for the presence of a LOCK bit that can only be set once per power cycle, needing extra handling around it. Signed-off-by: Diogo Ivo Reviewed-by: Guenter Roeck Link: https://lore.kernel.org/r/20250317-ivo-intel_oc_wdt-v3-1-32c396f4eefd@siemens.com Signed-off-by: Guenter Roeck Signed-off-by: Wim Van Sebroeck --- drivers/watchdog/Kconfig | 11 ++ drivers/watchdog/Makefile | 1 + drivers/watchdog/intel_oc_wdt.c | 233 ++++++++++++++++++++++++++++++++ 3 files changed, 245 insertions(+) create mode 100644 drivers/watchdog/intel_oc_wdt.c diff --git a/drivers/watchdog/Kconfig b/drivers/watchdog/Kconfig index 16c49ca63fba..0c25b2ed44eb 100644 --- a/drivers/watchdog/Kconfig +++ b/drivers/watchdog/Kconfig @@ -1372,6 +1372,17 @@ config INTEL_MID_WATCHDOG To compile this driver as a module, choose M here. +config INTEL_OC_WATCHDOG + tristate "Intel OC Watchdog" + depends on (X86 || COMPILE_TEST) && ACPI && HAS_IOPORT + select WATCHDOG_CORE + help + Hardware driver for Intel Over-Clocking watchdog present in + Platform Controller Hub (PCH) chipsets. + + To compile this driver as a module, choose M here: the + module will be called intel_oc_wdt. + config ITCO_WDT tristate "Intel TCO Timer/Watchdog" depends on X86 && PCI diff --git a/drivers/watchdog/Makefile b/drivers/watchdog/Makefile index 7b64ce9680a8..bbd4d62d2cc3 100644 --- a/drivers/watchdog/Makefile +++ b/drivers/watchdog/Makefile @@ -151,6 +151,7 @@ obj-$(CONFIG_W83977F_WDT) += w83977f_wdt.o obj-$(CONFIG_MACHZ_WDT) += machzwd.o obj-$(CONFIG_SBC_EPX_C3_WATCHDOG) += sbc_epx_c3.o obj-$(CONFIG_INTEL_MID_WATCHDOG) += intel-mid_wdt.o +obj-$(CONFIG_INTEL_OC_WATCHDOG) += intel_oc_wdt.o obj-$(CONFIG_INTEL_MEI_WDT) += mei_wdt.o obj-$(CONFIG_NI903X_WDT) += ni903x_wdt.o obj-$(CONFIG_NIC7018_WDT) += nic7018_wdt.o diff --git a/drivers/watchdog/intel_oc_wdt.c b/drivers/watchdog/intel_oc_wdt.c new file mode 100644 index 000000000000..7c0551106981 --- /dev/null +++ b/drivers/watchdog/intel_oc_wdt.c @@ -0,0 +1,233 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Intel OC Watchdog driver + * + * Copyright (C) 2025, Siemens + * Author: Diogo Ivo + */ + +#define DRV_NAME "intel_oc_wdt" + +#include +#include +#include +#include +#include +#include +#include + +#define INTEL_OC_WDT_TOV GENMASK(9, 0) +#define INTEL_OC_WDT_MIN_TOV 1 +#define INTEL_OC_WDT_MAX_TOV 1024 +#define INTEL_OC_WDT_DEF_TOV 60 + +/* + * One-time writable lock bit. If set forbids + * modification of itself, _TOV and _EN until + * next reboot. + */ +#define INTEL_OC_WDT_CTL_LCK BIT(12) + +#define INTEL_OC_WDT_EN BIT(14) +#define INTEL_OC_WDT_NO_ICCSURV_STS BIT(24) +#define INTEL_OC_WDT_ICCSURV_STS BIT(25) +#define INTEL_OC_WDT_RLD BIT(31) + +#define INTEL_OC_WDT_STS_BITS (INTEL_OC_WDT_NO_ICCSURV_STS | \ + INTEL_OC_WDT_ICCSURV_STS) + +#define INTEL_OC_WDT_CTRL_REG(wdt) ((wdt)->ctrl_res->start) + +struct intel_oc_wdt { + struct watchdog_device wdd; + struct resource *ctrl_res; + bool locked; +}; + +static int heartbeat; +module_param(heartbeat, uint, 0); +MODULE_PARM_DESC(heartbeat, "Watchdog heartbeats in seconds. (default=" + __MODULE_STRING(WDT_HEARTBEAT) ")"); + +static bool nowayout = WATCHDOG_NOWAYOUT; +module_param(nowayout, bool, 0); +MODULE_PARM_DESC(nowayout, "Watchdog cannot be stopped once started (default=" + __MODULE_STRING(WATCHDOG_NOWAYOUT) ")"); + +static int intel_oc_wdt_start(struct watchdog_device *wdd) +{ + struct intel_oc_wdt *oc_wdt = watchdog_get_drvdata(wdd); + + if (oc_wdt->locked) + return 0; + + outl(inl(INTEL_OC_WDT_CTRL_REG(oc_wdt)) | INTEL_OC_WDT_EN, + INTEL_OC_WDT_CTRL_REG(oc_wdt)); + + return 0; +} + +static int intel_oc_wdt_stop(struct watchdog_device *wdd) +{ + struct intel_oc_wdt *oc_wdt = watchdog_get_drvdata(wdd); + + outl(inl(INTEL_OC_WDT_CTRL_REG(oc_wdt)) & ~INTEL_OC_WDT_EN, + INTEL_OC_WDT_CTRL_REG(oc_wdt)); + + return 0; +} + +static int intel_oc_wdt_ping(struct watchdog_device *wdd) +{ + struct intel_oc_wdt *oc_wdt = watchdog_get_drvdata(wdd); + + outl(inl(INTEL_OC_WDT_CTRL_REG(oc_wdt)) | INTEL_OC_WDT_RLD, + INTEL_OC_WDT_CTRL_REG(oc_wdt)); + + return 0; +} + +static int intel_oc_wdt_set_timeout(struct watchdog_device *wdd, + unsigned int t) +{ + struct intel_oc_wdt *oc_wdt = watchdog_get_drvdata(wdd); + + outl((inl(INTEL_OC_WDT_CTRL_REG(oc_wdt)) & ~INTEL_OC_WDT_TOV) | (t - 1), + INTEL_OC_WDT_CTRL_REG(oc_wdt)); + + wdd->timeout = t; + + return 0; +} + +static const struct watchdog_info intel_oc_wdt_info = { + .options = WDIOF_SETTIMEOUT | WDIOF_MAGICCLOSE | WDIOF_KEEPALIVEPING, + .identity = DRV_NAME, +}; + +static const struct watchdog_ops intel_oc_wdt_ops = { + .owner = THIS_MODULE, + .start = intel_oc_wdt_start, + .stop = intel_oc_wdt_stop, + .ping = intel_oc_wdt_ping, + .set_timeout = intel_oc_wdt_set_timeout, +}; + +static int intel_oc_wdt_setup(struct intel_oc_wdt *oc_wdt) +{ + struct watchdog_info *info; + unsigned long val; + + val = inl(INTEL_OC_WDT_CTRL_REG(oc_wdt)); + + if (val & INTEL_OC_WDT_STS_BITS) + oc_wdt->wdd.bootstatus |= WDIOF_CARDRESET; + + oc_wdt->locked = !!(val & INTEL_OC_WDT_CTL_LCK); + + if (val & INTEL_OC_WDT_EN) { + /* + * No need to issue a ping here to "commit" the new timeout + * value to hardware as the watchdog core schedules one + * immediately when registering the watchdog. + */ + set_bit(WDOG_HW_RUNNING, &oc_wdt->wdd.status); + + if (oc_wdt->locked) { + info = (struct watchdog_info *)&intel_oc_wdt_info; + /* + * Set nowayout unconditionally as we cannot stop + * the watchdog. + */ + nowayout = true; + /* + * If we are locked read the current timeout value + * and inform the core we can't change it. + */ + oc_wdt->wdd.timeout = (val & INTEL_OC_WDT_TOV) + 1; + info->options &= ~WDIOF_SETTIMEOUT; + + dev_info(oc_wdt->wdd.parent, + "Register access locked, heartbeat fixed at: %u s\n", + oc_wdt->wdd.timeout); + } + } else if (oc_wdt->locked) { + /* + * In case the watchdog is disabled and locked there + * is nothing we can do with it so just fail probing. + */ + return -EACCES; + } + + val &= ~INTEL_OC_WDT_TOV; + outl(val | (oc_wdt->wdd.timeout - 1), INTEL_OC_WDT_CTRL_REG(oc_wdt)); + + return 0; +} + +static int intel_oc_wdt_probe(struct platform_device *pdev) +{ + struct device *dev = &pdev->dev; + struct intel_oc_wdt *oc_wdt; + struct watchdog_device *wdd; + int ret; + + oc_wdt = devm_kzalloc(&pdev->dev, sizeof(*oc_wdt), GFP_KERNEL); + if (!oc_wdt) + return -ENOMEM; + + oc_wdt->ctrl_res = platform_get_resource(pdev, IORESOURCE_IO, 0); + if (!oc_wdt->ctrl_res) { + dev_err(&pdev->dev, "missing I/O resource\n"); + return -ENODEV; + } + + if (!devm_request_region(&pdev->dev, oc_wdt->ctrl_res->start, + resource_size(oc_wdt->ctrl_res), pdev->name)) { + dev_err(dev, "resource %pR already in use, device disabled\n", + oc_wdt->ctrl_res); + return -EBUSY; + } + + wdd = &oc_wdt->wdd; + wdd->min_timeout = INTEL_OC_WDT_MIN_TOV; + wdd->max_timeout = INTEL_OC_WDT_MAX_TOV; + wdd->timeout = INTEL_OC_WDT_DEF_TOV; + wdd->info = &intel_oc_wdt_info; + wdd->ops = &intel_oc_wdt_ops; + wdd->parent = dev; + + watchdog_init_timeout(wdd, heartbeat, dev); + + ret = intel_oc_wdt_setup(oc_wdt); + if (ret) + return ret; + + watchdog_set_drvdata(wdd, oc_wdt); + watchdog_set_nowayout(wdd, nowayout); + watchdog_stop_on_reboot(wdd); + watchdog_stop_on_unregister(wdd); + + return devm_watchdog_register_device(dev, wdd); +} + +static const struct acpi_device_id intel_oc_wdt_match[] = { + { "INT3F0D" }, + { "INTC1099" }, + { }, +}; +MODULE_DEVICE_TABLE(acpi, intel_oc_wdt_match); + +static struct platform_driver intel_oc_wdt_platform_driver = { + .driver = { + .name = DRV_NAME, + .acpi_match_table = intel_oc_wdt_match, + }, + .probe = intel_oc_wdt_probe, +}; + +module_platform_driver(intel_oc_wdt_platform_driver); + +MODULE_AUTHOR("Diogo Ivo "); +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("Intel OC Watchdog driver"); -- 2.51.0 From 158f9f2f71523bab787f4fa7a7a1f390524350ca Mon Sep 17 00:00:00 2001 From: Ziyan Fu Date: Tue, 29 Apr 2025 18:25:33 +0800 Subject: [PATCH 02/16] watchdog: iTCO_wdt: Update the heartbeat value after clamping timeout When executing "modprobe iTCO_wdt heartbeat=700", the user-specified 'heartbeat' parameter exceeds the valid range, the driver clamps the timeout to default 30s but fails to update the logged 'heartbeat' value, resulting in misleading log output: iTCO_wdt iTCO_wdt: timeout value out of range, using 30 iTCO_wdt iTCO_wdt: initialized. heartbeat=700 sec (nowayout=0) After validating the range, update the 'heartbeat' value with the clamped timeout value to ensure that log messages accurately reflect the actual runtime parameters. Signed-off-by: Ziyan Fu Reviewed-by: Wim Van Sebroeck Link: https://lore.kernel.org/r/20250429102533.11886-1-13281011316@163.com Signed-off-by: Guenter Roeck Signed-off-by: Wim Van Sebroeck --- drivers/watchdog/iTCO_wdt.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/watchdog/iTCO_wdt.c b/drivers/watchdog/iTCO_wdt.c index 30a8d72d84fa..9ab769aa0244 100644 --- a/drivers/watchdog/iTCO_wdt.c +++ b/drivers/watchdog/iTCO_wdt.c @@ -580,6 +580,7 @@ static int iTCO_wdt_probe(struct platform_device *pdev) iTCO_wdt_set_timeout(&p->wddev, WATCHDOG_TIMEOUT); dev_info(dev, "timeout value out of range, using %d\n", WATCHDOG_TIMEOUT); + heartbeat = WATCHDOG_TIMEOUT; } watchdog_stop_on_reboot(&p->wddev); -- 2.51.0 From 5c78e793f78732b60276401f75cc1a101f9ad121 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Fri, 30 May 2025 12:06:47 -0700 Subject: [PATCH 03/16] overflow: Introduce __DEFINE_FLEX for having no initializer While not yet in the tree, there is a proposed patch[1] that was depending on the prior behavior of _DEFINE_FLEX, which did not have an explicit initializer. Provide this via __DEFINE_FLEX now, which can also have attributes applied (e.g. __uninitialized). Examples of the resulting initializer behaviors can be seen here: https://godbolt.org/z/P7Go8Tr33 Link: https://lore.kernel.org/netdev/20250520205920.2134829-9-anthony.l.nguyen@intel.com [1] Fixes: 47e36ed78406 ("overflow: Fix direct struct member initialization in _DEFINE_FLEX()") Signed-off-by: Kees Cook --- include/linux/overflow.h | 25 +++++++++++++++++++------ 1 file changed, 19 insertions(+), 6 deletions(-) diff --git a/include/linux/overflow.h b/include/linux/overflow.h index 7b7be27ca113..154ed0dbb43f 100644 --- a/include/linux/overflow.h +++ b/include/linux/overflow.h @@ -389,24 +389,37 @@ static inline size_t __must_check size_sub(size_t minuend, size_t subtrahend) struct_size((type *)NULL, member, count) /** - * _DEFINE_FLEX() - helper macro for DEFINE_FLEX() family. - * Enables caller macro to pass (different) initializer. + * __DEFINE_FLEX() - helper macro for DEFINE_FLEX() family. + * Enables caller macro to pass arbitrary trailing expressions * * @type: structure type name, including "struct" keyword. * @name: Name for a variable to define. * @member: Name of the array member. * @count: Number of elements in the array; must be compile-time const. - * @initializer: Initializer expression (e.g., pass `= { }` at minimum). + * @trailer: Trailing expressions for attributes and/or initializers. */ -#define _DEFINE_FLEX(type, name, member, count, initializer...) \ +#define __DEFINE_FLEX(type, name, member, count, trailer...) \ _Static_assert(__builtin_constant_p(count), \ "onstack flex array members require compile-time const count"); \ union { \ u8 bytes[struct_size_t(type, member, count)]; \ type obj; \ - } name##_u = { .obj initializer }; \ + } name##_u trailer; \ type *name = (type *)&name##_u +/** + * _DEFINE_FLEX() - helper macro for DEFINE_FLEX() family. + * Enables caller macro to pass (different) initializer. + * + * @type: structure type name, including "struct" keyword. + * @name: Name for a variable to define. + * @member: Name of the array member. + * @count: Number of elements in the array; must be compile-time const. + * @initializer: Initializer expression (e.g., pass `= { }` at minimum). + */ +#define _DEFINE_FLEX(type, name, member, count, initializer...) \ + __DEFINE_FLEX(type, name, member, count, = { .obj initializer }) + /** * DEFINE_RAW_FLEX() - Define an on-stack instance of structure with a trailing * flexible array member, when it does not have a __counted_by annotation. @@ -424,7 +437,7 @@ static inline size_t __must_check size_sub(size_t minuend, size_t subtrahend) * elements in array @member. */ #define DEFINE_RAW_FLEX(type, name, member, count) \ - _DEFINE_FLEX(type, name, member, count, = {}) + __DEFINE_FLEX(type, name, member, count, = { }) /** * DEFINE_FLEX() - Define an on-stack instance of structure with a trailing -- 2.51.0 From f39f18f3c3531aa802b58a20d39d96e82eb96c14 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Fri, 30 May 2025 15:18:28 -0700 Subject: [PATCH 04/16] randstruct: gcc-plugin: Fix attribute addition Based on changes in the 2021 public version of the randstruct out-of-tree GCC plugin[1], more carefully update the attributes on resulting decls, to avoid tripping checks in GCC 15's comptypes_check_enum_int() when it has been configured with "--enable-checking=misc": arch/arm64/kernel/kexec_image.c:132:14: internal compiler error: in comptypes_check_enum_int, at c/c-typeck.cc:1519 132 | const struct kexec_file_ops kexec_image_ops = { | ^~~~~~~~~~~~~~ internal_error(char const*, ...), at gcc/gcc/diagnostic-global-context.cc:517 fancy_abort(char const*, int, char const*), at gcc/gcc/diagnostic.cc:1803 comptypes_check_enum_int(tree_node*, tree_node*, bool*), at gcc/gcc/c/c-typeck.cc:1519 ... Link: https://archive.org/download/grsecurity/grsecurity-3.1-5.10.41-202105280954.patch.gz [1] Reported-by: Thiago Jung Bauermann Closes: https://github.com/KSPP/linux/issues/367 Closes: https://lore.kernel.org/lkml/20250530000646.104457-1-thiago.bauermann@linaro.org/ Reported-by: Ingo Saitz Closes: https://bugs.debian.org/cgi-bin/bugreport.cgi?bug=1104745 Fixes: 313dd1b62921 ("gcc-plugins: Add the randstruct plugin") Tested-by: Thiago Jung Bauermann Link: https://lore.kernel.org/r/20250530221824.work.623-kees@kernel.org Signed-off-by: Kees Cook --- scripts/gcc-plugins/gcc-common.h | 32 +++++++++++++++++++ scripts/gcc-plugins/randomize_layout_plugin.c | 22 ++++++------- 2 files changed, 43 insertions(+), 11 deletions(-) diff --git a/scripts/gcc-plugins/gcc-common.h b/scripts/gcc-plugins/gcc-common.h index 3222c1070444..ef12c8f929ed 100644 --- a/scripts/gcc-plugins/gcc-common.h +++ b/scripts/gcc-plugins/gcc-common.h @@ -123,6 +123,38 @@ static inline tree build_const_char_string(int len, const char *str) return cstr; } +static inline void __add_type_attr(tree type, const char *attr, tree args) +{ + tree oldattr; + + if (type == NULL_TREE) + return; + oldattr = lookup_attribute(attr, TYPE_ATTRIBUTES(type)); + if (oldattr != NULL_TREE) { + gcc_assert(TREE_VALUE(oldattr) == args || TREE_VALUE(TREE_VALUE(oldattr)) == TREE_VALUE(args)); + return; + } + + TYPE_ATTRIBUTES(type) = copy_list(TYPE_ATTRIBUTES(type)); + TYPE_ATTRIBUTES(type) = tree_cons(get_identifier(attr), args, TYPE_ATTRIBUTES(type)); +} + +static inline void add_type_attr(tree type, const char *attr, tree args) +{ + tree main_variant = TYPE_MAIN_VARIANT(type); + + __add_type_attr(TYPE_CANONICAL(type), attr, args); + __add_type_attr(TYPE_CANONICAL(main_variant), attr, args); + __add_type_attr(main_variant, attr, args); + + for (type = TYPE_NEXT_VARIANT(main_variant); type; type = TYPE_NEXT_VARIANT(type)) { + if (!lookup_attribute(attr, TYPE_ATTRIBUTES(type))) + TYPE_ATTRIBUTES(type) = TYPE_ATTRIBUTES(main_variant); + + __add_type_attr(TYPE_CANONICAL(type), attr, args); + } +} + #define PASS_INFO(NAME, REF, ID, POS) \ struct register_pass_info NAME##_pass_info = { \ .pass = make_##NAME##_pass(), \ diff --git a/scripts/gcc-plugins/randomize_layout_plugin.c b/scripts/gcc-plugins/randomize_layout_plugin.c index 971a1908a8cc..ff65a4f87f24 100644 --- a/scripts/gcc-plugins/randomize_layout_plugin.c +++ b/scripts/gcc-plugins/randomize_layout_plugin.c @@ -73,6 +73,9 @@ static tree handle_randomize_layout_attr(tree *node, tree name, tree args, int f if (TYPE_P(*node)) { type = *node; + } else if (TREE_CODE(*node) == FIELD_DECL) { + *no_add_attrs = false; + return NULL_TREE; } else { gcc_assert(TREE_CODE(*node) == TYPE_DECL); type = TREE_TYPE(*node); @@ -348,15 +351,14 @@ static int relayout_struct(tree type) TREE_CHAIN(newtree[i]) = newtree[i+1]; TREE_CHAIN(newtree[num_fields - 1]) = NULL_TREE; + add_type_attr(type, "randomize_performed", NULL_TREE); + add_type_attr(type, "designated_init", NULL_TREE); + if (has_flexarray) + add_type_attr(type, "has_flexarray", NULL_TREE); + main_variant = TYPE_MAIN_VARIANT(type); - for (variant = main_variant; variant; variant = TYPE_NEXT_VARIANT(variant)) { + for (variant = main_variant; variant; variant = TYPE_NEXT_VARIANT(variant)) TYPE_FIELDS(variant) = newtree[0]; - TYPE_ATTRIBUTES(variant) = copy_list(TYPE_ATTRIBUTES(variant)); - TYPE_ATTRIBUTES(variant) = tree_cons(get_identifier("randomize_performed"), NULL_TREE, TYPE_ATTRIBUTES(variant)); - TYPE_ATTRIBUTES(variant) = tree_cons(get_identifier("designated_init"), NULL_TREE, TYPE_ATTRIBUTES(variant)); - if (has_flexarray) - TYPE_ATTRIBUTES(type) = tree_cons(get_identifier("has_flexarray"), NULL_TREE, TYPE_ATTRIBUTES(type)); - } /* * force a re-layout of the main variant @@ -424,10 +426,8 @@ static void randomize_type(tree type) if (lookup_attribute("randomize_layout", TYPE_ATTRIBUTES(TYPE_MAIN_VARIANT(type))) || is_pure_ops_struct(type)) relayout_struct(type); - for (variant = TYPE_MAIN_VARIANT(type); variant; variant = TYPE_NEXT_VARIANT(variant)) { - TYPE_ATTRIBUTES(type) = copy_list(TYPE_ATTRIBUTES(type)); - TYPE_ATTRIBUTES(type) = tree_cons(get_identifier("randomize_considered"), NULL_TREE, TYPE_ATTRIBUTES(type)); - } + add_type_attr(type, "randomize_considered", NULL_TREE); + #ifdef __DEBUG_PLUGIN fprintf(stderr, "Marking randomize_considered on struct %s\n", ORIG_TYPE_NAME(type)); #ifdef __DEBUG_VERBOSE -- 2.51.0 From 438e22801b1958f86883812af70d402eda29c4f5 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Fri, 30 May 2025 03:45:13 -0400 Subject: [PATCH 05/16] rtmutex_api: provide correct extern functions Commit fb49f07ba1d9 ("locking/mutex: implement mutex_lock_killable_nest_lock") changed the set of functions that mutex.c defines when CONFIG_DEBUG_LOCK_ALLOC is set. - it removed the "extern" declaration of mutex_lock_killable_nested from include/linux/mutex.h, and replaced it with a macro since it could be treated as a special case of _mutex_lock_killable. It also removed a definition of the function in kernel/locking/mutex.c. - likewise, it replaced mutex_trylock() with the more generic mutex_trylock_nest_lock() and replaced mutex_trylock() with a macro. However, it left the old definitions in place in kernel/locking/rtmutex_api.c, which causes failures when building with CONFIG_RT_MUTEXES=y. Bring over the changes. Fixes: fb49f07ba1d9 ("locking/mutex: implement mutex_lock_killable_nest_lock") Reported-by: Randy Dunlap Tested-by: Randy Dunlap Cc: Peter Zijlstra Signed-off-by: Paolo Bonzini --- kernel/locking/rtmutex_api.c | 33 +++++++++++++++++++++------------ 1 file changed, 21 insertions(+), 12 deletions(-) diff --git a/kernel/locking/rtmutex_api.c b/kernel/locking/rtmutex_api.c index 191e4720e546..2d933528a0fa 100644 --- a/kernel/locking/rtmutex_api.c +++ b/kernel/locking/rtmutex_api.c @@ -544,12 +544,12 @@ int __sched mutex_lock_interruptible_nested(struct mutex *lock, } EXPORT_SYMBOL_GPL(mutex_lock_interruptible_nested); -int __sched mutex_lock_killable_nested(struct mutex *lock, - unsigned int subclass) +int __sched _mutex_lock_killable(struct mutex *lock, unsigned int subclass, + struct lockdep_map *nest_lock) { - return __mutex_lock_common(lock, TASK_KILLABLE, subclass, NULL, _RET_IP_); + return __mutex_lock_common(lock, TASK_KILLABLE, subclass, nest_lock, _RET_IP_); } -EXPORT_SYMBOL_GPL(mutex_lock_killable_nested); +EXPORT_SYMBOL_GPL(_mutex_lock_killable); void __sched mutex_lock_io_nested(struct mutex *lock, unsigned int subclass) { @@ -563,6 +563,21 @@ void __sched mutex_lock_io_nested(struct mutex *lock, unsigned int subclass) } EXPORT_SYMBOL_GPL(mutex_lock_io_nested); +int __sched _mutex_trylock_nest_lock(struct mutex *lock, + struct lockdep_map *nest_lock) +{ + int ret; + + if (IS_ENABLED(CONFIG_DEBUG_RT_MUTEXES) && WARN_ON_ONCE(!in_task())) + return 0; + + ret = __rt_mutex_trylock(&lock->rtmutex); + if (ret) + mutex_acquire_nest(&lock->dep_map, 0, 1, nest_lock, _RET_IP_); + + return ret; +} +EXPORT_SYMBOL_GPL(_mutex_trylock_nest_lock); #else /* CONFIG_DEBUG_LOCK_ALLOC */ void __sched mutex_lock(struct mutex *lock) @@ -591,22 +606,16 @@ void __sched mutex_lock_io(struct mutex *lock) io_schedule_finish(token); } EXPORT_SYMBOL(mutex_lock_io); -#endif /* !CONFIG_DEBUG_LOCK_ALLOC */ int __sched mutex_trylock(struct mutex *lock) { - int ret; - if (IS_ENABLED(CONFIG_DEBUG_RT_MUTEXES) && WARN_ON_ONCE(!in_task())) return 0; - ret = __rt_mutex_trylock(&lock->rtmutex); - if (ret) - mutex_acquire(&lock->dep_map, 0, 1, _RET_IP_); - - return ret; + return __rt_mutex_trylock(&lock->rtmutex); } EXPORT_SYMBOL(mutex_trylock); +#endif /* !CONFIG_DEBUG_LOCK_ALLOC */ void __sched mutex_unlock(struct mutex *lock) { -- 2.51.0 From f914b52c379c12288b7623bb814d0508dbe7481d Mon Sep 17 00:00:00 2001 From: Ye Bin Date: Thu, 29 May 2025 19:19:54 +0800 Subject: [PATCH 06/16] ftrace: Fix UAF when lookup kallsym after ftrace disabled The following issue happens with a buggy module: BUG: unable to handle page fault for address: ffffffffc05d0218 PGD 1bd66f067 P4D 1bd66f067 PUD 1bd671067 PMD 101808067 PTE 0 Oops: Oops: 0000 [#1] SMP KASAN PTI Tainted: [O]=OOT_MODULE, [E]=UNSIGNED_MODULE Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS RIP: 0010:sized_strscpy+0x81/0x2f0 RSP: 0018:ffff88812d76fa08 EFLAGS: 00010246 RAX: 0000000000000000 RBX: ffffffffc0601010 RCX: dffffc0000000000 RDX: 0000000000000038 RSI: dffffc0000000000 RDI: ffff88812608da2d RBP: 8080808080808080 R08: ffff88812608da2d R09: ffff88812608da68 R10: ffff88812608d82d R11: ffff88812608d810 R12: 0000000000000038 R13: ffff88812608da2d R14: ffffffffc05d0218 R15: fefefefefefefeff FS: 00007fef552de740(0000) GS:ffff8884251c7000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: ffffffffc05d0218 CR3: 00000001146f0000 CR4: 00000000000006f0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: ftrace_mod_get_kallsym+0x1ac/0x590 update_iter_mod+0x239/0x5b0 s_next+0x5b/0xa0 seq_read_iter+0x8c9/0x1070 seq_read+0x249/0x3b0 proc_reg_read+0x1b0/0x280 vfs_read+0x17f/0x920 ksys_read+0xf3/0x1c0 do_syscall_64+0x5f/0x2e0 entry_SYSCALL_64_after_hwframe+0x76/0x7e The above issue may happen as follows: (1) Add kprobe tracepoint; (2) insmod test.ko; (3) Module triggers ftrace disabled; (4) rmmod test.ko; (5) cat /proc/kallsyms; --> Will trigger UAF as test.ko already removed; ftrace_mod_get_kallsym() ... strscpy(module_name, mod_map->mod->name, MODULE_NAME_LEN); ... The problem is when a module triggers an issue with ftrace and sets ftrace_disable. The ftrace_disable is set when an anomaly is discovered and to prevent any more damage, ftrace stops all text modification. The issue that happened was that the ftrace_disable stops more than just the text modification. When a module is loaded, its init functions can also be traced. Because kallsyms deletes the init functions after a module has loaded, ftrace saves them when the module is loaded and function tracing is enabled. This allows the output of the function trace to show the init function names instead of just their raw memory addresses. When a module is removed, ftrace_release_mod() is called, and if ftrace_disable is set, it just returns without doing anything more. The problem here is that it leaves the mod_list still around and if kallsyms is called, it will call into this code and access the module memory that has already been freed as it will return: strscpy(module_name, mod_map->mod->name, MODULE_NAME_LEN); Where the "mod" no longer exists and triggers a UAF bug. Link: https://lore.kernel.org/all/20250523135452.626d8dcd@gandalf.local.home/ Cc: stable@vger.kernel.org Fixes: aba4b5c22cba ("ftrace: Save module init functions kallsyms symbols for tracing") Link: https://lore.kernel.org/20250529111955.2349189-2-yebin@huaweicloud.com Signed-off-by: Ye Bin Signed-off-by: Steven Rostedt (Google) --- kernel/trace/ftrace.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 1af952cba48d..84fd2f8263fa 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -7438,9 +7438,10 @@ void ftrace_release_mod(struct module *mod) mutex_lock(&ftrace_lock); - if (ftrace_disabled) - goto out_unlock; - + /* + * To avoid the UAF problem after the module is unloaded, the + * 'mod_map' resource needs to be released unconditionally. + */ list_for_each_entry_safe(mod_map, n, &ftrace_mod_maps, list) { if (mod_map->mod == mod) { list_del_rcu(&mod_map->list); @@ -7449,6 +7450,9 @@ void ftrace_release_mod(struct module *mod) } } + if (ftrace_disabled) + goto out_unlock; + /* * Each module has its own ftrace_pages, remove * them from the list. -- 2.51.0 From 5834a597386c59fedc601a81091028074d6ae9a2 Mon Sep 17 00:00:00 2001 From: Ye Bin Date: Thu, 29 May 2025 19:19:55 +0800 Subject: [PATCH 07/16] ftrace: Don't allocate ftrace module map if ftrace is disabled If ftrace is disabled, it is meaningless to allocate a module map. Add a check in allocate_ftrace_mod_map() to not allocate if ftrace is disabled. Link: https://lore.kernel.org/20250529111955.2349189-3-yebin@huaweicloud.com Signed-off-by: Ye Bin Signed-off-by: Steven Rostedt (Google) --- kernel/trace/ftrace.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 84fd2f8263fa..a7291685902e 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -7631,6 +7631,9 @@ allocate_ftrace_mod_map(struct module *mod, { struct ftrace_mod_map *mod_map; + if (ftrace_disabled) + return NULL; + mod_map = kmalloc(sizeof(*mod_map), GFP_KERNEL); if (!mod_map) return NULL; -- 2.51.0 From e27e43a5cbda77d211757eb83101f11f67788204 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 29 May 2025 16:05:50 -0400 Subject: [PATCH 08/16] xdp: Remove unused mem_return_failed event The change to allow page_pool to handle its own page destruction instead of relying on XDP removed the trace_mem_return_failed() tracepoint caller, but did not remove the mem_return_failed trace event. As trace events take up memory when they are created regardless of if they are used or not, having this unused event around wastes around 5K of memory. Remove the unused event. Link: https://lore.kernel.org/all/20250529130138.544ffec4@gandalf.local.home/ Cc: netdev Cc: Jonathan Lemon Cc: Mathieu Desnoyers Cc: Masami Hiramatsu Link: https://lore.kernel.org/20250529160550.1f888b15@gandalf.local.home Fixes: c3f812cea0d7 ("page_pool: do not release pool until inflight == 0.") Acked-by: Jesper Dangaard Brouer Acked-by: Jakub Kicinski Signed-off-by: Steven Rostedt (Google) --- include/trace/events/xdp.h | 26 -------------------------- 1 file changed, 26 deletions(-) diff --git a/include/trace/events/xdp.h b/include/trace/events/xdp.h index a7e5452b5d21..d3ef86c97ae3 100644 --- a/include/trace/events/xdp.h +++ b/include/trace/events/xdp.h @@ -379,32 +379,6 @@ TRACE_EVENT(mem_connect, ) ); -TRACE_EVENT(mem_return_failed, - - TP_PROTO(const struct xdp_mem_info *mem, - const struct page *page), - - TP_ARGS(mem, page), - - TP_STRUCT__entry( - __field(const struct page *, page) - __field(u32, mem_id) - __field(u32, mem_type) - ), - - TP_fast_assign( - __entry->page = page; - __entry->mem_id = mem->id; - __entry->mem_type = mem->type; - ), - - TP_printk("mem_id=%d mem_type=%s page=%p", - __entry->mem_id, - __print_symbolic(__entry->mem_type, __MEM_TYPE_SYM_TAB), - __entry->page - ) -); - TRACE_EVENT(bpf_xdp_link_attach_failed, TP_PROTO(const char *msg), -- 2.51.0 From 167d7ede0007d320a891494be57b635b4c069995 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 29 May 2025 13:57:39 -0400 Subject: [PATCH 09/16] genirq/matrix: Remove unused irq_matrix_alloc_reserved tracepoint The tracepoint irq_matrix_alloc_reserved was added but never used. Remove it. Link: https://lore.kernel.org/all/20250529130138.544ffec4@gandalf.local.home/ Cc: Juergen Gross Cc: Mathieu Desnoyers Cc: Masami Hiramatsu Link: https://lore.kernel.org/20250529135739.26e5c075@gandalf.local.home Fixes: ec0f7cd273dc4 ("genirq/matrix: Add tracepoints") Reviewed-by: Thomas Gleixner Signed-off-by: Steven Rostedt (Google) --- include/trace/events/irq_matrix.h | 8 -------- 1 file changed, 8 deletions(-) diff --git a/include/trace/events/irq_matrix.h b/include/trace/events/irq_matrix.h index 267d4cbbf360..93244078b4e6 100644 --- a/include/trace/events/irq_matrix.h +++ b/include/trace/events/irq_matrix.h @@ -138,14 +138,6 @@ DEFINE_EVENT(irq_matrix_global_update, irq_matrix_assign_system, TP_ARGS(bit, matrix) ); -DEFINE_EVENT(irq_matrix_cpu, irq_matrix_alloc_reserved, - - TP_PROTO(int bit, unsigned int cpu, - struct irq_matrix *matrix, struct cpumap *cmap), - - TP_ARGS(bit, cpu, matrix, cmap) -); - DEFINE_EVENT(irq_matrix_cpu, irq_matrix_reserve_managed, TP_PROTO(int bit, unsigned int cpu, -- 2.51.0 From b8628379957d0c8c057782ec1a8512de6d4ba29c Mon Sep 17 00:00:00 2001 From: Sumanth Gavini Date: Mon, 19 May 2025 19:08:05 -0700 Subject: [PATCH 10/16] mfd: maxim: Correct Samsung "Electronics" spelling in headers Fix the misspelling of 'Electronics' in MFD driver headers. Link: https://lore.kernel.org/lkml/3aa30119-60e5-4dcb-b13a-1753966ca775@sirena.org.uk/#t Link: https://lore.kernel.org/r/20250520020808.159586-1-sumanth.gavini@yahoo.com Signed-off-by: Sumanth Gavini Signed-off-by: Lee Jones --- include/linux/mfd/max8997-private.h | 2 +- include/linux/mfd/max8998-private.h | 2 +- include/linux/mfd/max8998.h | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/include/linux/mfd/max8997-private.h b/include/linux/mfd/max8997-private.h index f70eea0f2264..261c0aae7d00 100644 --- a/include/linux/mfd/max8997-private.h +++ b/include/linux/mfd/max8997-private.h @@ -2,7 +2,7 @@ /* * max8997-private.h - Voltage regulator driver for the Maxim 8997 * - * Copyright (C) 2010 Samsung Electrnoics + * Copyright (C) 2010 Samsung Electronics * MyungJoo Ham */ diff --git a/include/linux/mfd/max8998-private.h b/include/linux/mfd/max8998-private.h index 6deb5f577602..d77dc18db6eb 100644 --- a/include/linux/mfd/max8998-private.h +++ b/include/linux/mfd/max8998-private.h @@ -2,7 +2,7 @@ /* * max8998-private.h - Voltage regulator driver for the Maxim 8998 * - * Copyright (C) 2009-2010 Samsung Electrnoics + * Copyright (C) 2009-2010 Samsung Electronics * Kyungmin Park * Marek Szyprowski */ diff --git a/include/linux/mfd/max8998.h b/include/linux/mfd/max8998.h index a054e55c8646..5473f1983e31 100644 --- a/include/linux/mfd/max8998.h +++ b/include/linux/mfd/max8998.h @@ -2,7 +2,7 @@ /* * max8998.h - Voltage regulator driver for the Maxim 8998 * - * Copyright (C) 2009-2010 Samsung Electrnoics + * Copyright (C) 2009-2010 Samsung Electronics * Kyungmin Park * Marek Szyprowski */ -- 2.51.0 From ffb006aa433e8109ec79320c344fb69947997ba1 Mon Sep 17 00:00:00 2001 From: Sumanth Gavini Date: Mon, 19 May 2025 16:20:23 -0700 Subject: [PATCH 11/16] mfd: maxim: Correct Samsung "Electronics" spelling in copyright headers Fix the misspelling of 'Electronics' in MFD driver copyright headers. Link: https://lore.kernel.org/lkml/3aa30119-60e5-4dcb-b13a-1753966ca775@sirena.org.uk/#t Link: https://lore.kernel.org/r/20250519232025.152769-1-sumanth.gavini@yahoo.com Signed-off-by: Sumanth Gavini Signed-off-by: Lee Jones --- include/linux/mfd/max14577-private.h | 2 +- include/linux/mfd/max14577.h | 2 +- include/linux/mfd/max77686-private.h | 2 +- include/linux/mfd/max77686.h | 2 +- include/linux/mfd/max77693-private.h | 2 +- include/linux/mfd/max77693.h | 2 +- include/linux/mfd/max8997.h | 2 +- 7 files changed, 7 insertions(+), 7 deletions(-) diff --git a/include/linux/mfd/max14577-private.h b/include/linux/mfd/max14577-private.h index a21374f8ad26..dd51a37fa37f 100644 --- a/include/linux/mfd/max14577-private.h +++ b/include/linux/mfd/max14577-private.h @@ -2,7 +2,7 @@ /* * max14577-private.h - Common API for the Maxim 14577/77836 internal sub chip * - * Copyright (C) 2014 Samsung Electrnoics + * Copyright (C) 2014 Samsung Electronics * Chanwoo Choi * Krzysztof Kozlowski */ diff --git a/include/linux/mfd/max14577.h b/include/linux/mfd/max14577.h index 8b3ef891ba42..0fda5c2e745a 100644 --- a/include/linux/mfd/max14577.h +++ b/include/linux/mfd/max14577.h @@ -2,7 +2,7 @@ /* * max14577.h - Driver for the Maxim 14577/77836 * - * Copyright (C) 2014 Samsung Electrnoics + * Copyright (C) 2014 Samsung Electronics * Chanwoo Choi * Krzysztof Kozlowski * diff --git a/include/linux/mfd/max77686-private.h b/include/linux/mfd/max77686-private.h index ea635d12a741..e6b8b4014dc0 100644 --- a/include/linux/mfd/max77686-private.h +++ b/include/linux/mfd/max77686-private.h @@ -2,7 +2,7 @@ /* * max77686-private.h - Voltage regulator driver for the Maxim 77686/802 * - * Copyright (C) 2012 Samsung Electrnoics + * Copyright (C) 2012 Samsung Electronics * Chiwoong Byun */ diff --git a/include/linux/mfd/max77686.h b/include/linux/mfd/max77686.h index d0fb510875e6..7c4624acd1db 100644 --- a/include/linux/mfd/max77686.h +++ b/include/linux/mfd/max77686.h @@ -2,7 +2,7 @@ /* * max77686.h - Driver for the Maxim 77686/802 * - * Copyright (C) 2012 Samsung Electrnoics + * Copyright (C) 2012 Samsung Electronics * Chiwoong Byun * * This driver is based on max8997.h diff --git a/include/linux/mfd/max77693-private.h b/include/linux/mfd/max77693-private.h index c324d548619e..8e7c35b5ea1c 100644 --- a/include/linux/mfd/max77693-private.h +++ b/include/linux/mfd/max77693-private.h @@ -2,7 +2,7 @@ /* * max77693-private.h - Voltage regulator driver for the Maxim 77693 * - * Copyright (C) 2012 Samsung Electrnoics + * Copyright (C) 2012 Samsung Electronics * SangYoung Son * * This program is not provided / owned by Maxim Integrated Products. diff --git a/include/linux/mfd/max77693.h b/include/linux/mfd/max77693.h index c67c16ba8649..8e77ebeb7cf1 100644 --- a/include/linux/mfd/max77693.h +++ b/include/linux/mfd/max77693.h @@ -2,7 +2,7 @@ /* * max77693.h - Driver for the Maxim 77693 * - * Copyright (C) 2012 Samsung Electrnoics + * Copyright (C) 2012 Samsung Electronics * SangYoung Son * * This program is not provided / owned by Maxim Integrated Products. diff --git a/include/linux/mfd/max8997.h b/include/linux/mfd/max8997.h index 5c2cc1103437..fb36e1386069 100644 --- a/include/linux/mfd/max8997.h +++ b/include/linux/mfd/max8997.h @@ -2,7 +2,7 @@ /* * max8997.h - Driver for the Maxim 8997/8966 * - * Copyright (C) 2009-2010 Samsung Electrnoics + * Copyright (C) 2009-2010 Samsung Electronics * MyungJoo Ham * * This driver is based on max8998.h -- 2.51.0 From a4a45a9a72f3a9eaa17ec502d6e97c8eaa901825 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 29 May 2025 15:22:11 -0400 Subject: [PATCH 12/16] fsdax: Remove unused trace events for dax insert mapping When the dax_fault_actor() helper was factored out, it removed the calls to the dax_pmd_insert_mapping and dax_insert_mapping events but never removed the events themselves. As each event created takes up memory (roughly 5K each), this is a waste as it is never used. Remove the unused dax_pmd_insert_mapping and dax_insert_mapping trace events. Link: https://lore.kernel.org/all/20250529130138.544ffec4@gandalf.local.home/ Cc: Shiyang Ruan Cc: "Darrick J. Wong" Cc: Ross Zwisler Cc: Andrew Morton Link: https://lore.kernel.org/20250529152211.688800c9@gandalf.local.home Fixes: c2436190e492 ("fsdax: factor out a dax_fault_actor() helper") Acked-by: Dan Williams Reviewed-by: Alison Schofield Signed-off-by: Steven Rostedt (Google) --- include/trace/events/fs_dax.h | 78 ----------------------------------- 1 file changed, 78 deletions(-) diff --git a/include/trace/events/fs_dax.h b/include/trace/events/fs_dax.h index 86fe6aecff1e..76b56f78abb0 100644 --- a/include/trace/events/fs_dax.h +++ b/include/trace/events/fs_dax.h @@ -102,54 +102,6 @@ DEFINE_EVENT(dax_pmd_load_hole_class, name, \ DEFINE_PMD_LOAD_HOLE_EVENT(dax_pmd_load_hole); DEFINE_PMD_LOAD_HOLE_EVENT(dax_pmd_load_hole_fallback); -DECLARE_EVENT_CLASS(dax_pmd_insert_mapping_class, - TP_PROTO(struct inode *inode, struct vm_fault *vmf, - long length, pfn_t pfn, void *radix_entry), - TP_ARGS(inode, vmf, length, pfn, radix_entry), - TP_STRUCT__entry( - __field(unsigned long, ino) - __field(unsigned long, vm_flags) - __field(unsigned long, address) - __field(long, length) - __field(u64, pfn_val) - __field(void *, radix_entry) - __field(dev_t, dev) - __field(int, write) - ), - TP_fast_assign( - __entry->dev = inode->i_sb->s_dev; - __entry->ino = inode->i_ino; - __entry->vm_flags = vmf->vma->vm_flags; - __entry->address = vmf->address; - __entry->write = vmf->flags & FAULT_FLAG_WRITE; - __entry->length = length; - __entry->pfn_val = pfn.val; - __entry->radix_entry = radix_entry; - ), - TP_printk("dev %d:%d ino %#lx %s %s address %#lx length %#lx " - "pfn %#llx %s radix_entry %#lx", - MAJOR(__entry->dev), - MINOR(__entry->dev), - __entry->ino, - __entry->vm_flags & VM_SHARED ? "shared" : "private", - __entry->write ? "write" : "read", - __entry->address, - __entry->length, - __entry->pfn_val & ~PFN_FLAGS_MASK, - __print_flags_u64(__entry->pfn_val & PFN_FLAGS_MASK, "|", - PFN_FLAGS_TRACE), - (unsigned long)__entry->radix_entry - ) -) - -#define DEFINE_PMD_INSERT_MAPPING_EVENT(name) \ -DEFINE_EVENT(dax_pmd_insert_mapping_class, name, \ - TP_PROTO(struct inode *inode, struct vm_fault *vmf, \ - long length, pfn_t pfn, void *radix_entry), \ - TP_ARGS(inode, vmf, length, pfn, radix_entry)) - -DEFINE_PMD_INSERT_MAPPING_EVENT(dax_pmd_insert_mapping); - DECLARE_EVENT_CLASS(dax_pte_fault_class, TP_PROTO(struct inode *inode, struct vm_fault *vmf, int result), TP_ARGS(inode, vmf, result), @@ -194,36 +146,6 @@ DEFINE_PTE_FAULT_EVENT(dax_load_hole); DEFINE_PTE_FAULT_EVENT(dax_insert_pfn_mkwrite_no_entry); DEFINE_PTE_FAULT_EVENT(dax_insert_pfn_mkwrite); -TRACE_EVENT(dax_insert_mapping, - TP_PROTO(struct inode *inode, struct vm_fault *vmf, void *radix_entry), - TP_ARGS(inode, vmf, radix_entry), - TP_STRUCT__entry( - __field(unsigned long, ino) - __field(unsigned long, vm_flags) - __field(unsigned long, address) - __field(void *, radix_entry) - __field(dev_t, dev) - __field(int, write) - ), - TP_fast_assign( - __entry->dev = inode->i_sb->s_dev; - __entry->ino = inode->i_ino; - __entry->vm_flags = vmf->vma->vm_flags; - __entry->address = vmf->address; - __entry->write = vmf->flags & FAULT_FLAG_WRITE; - __entry->radix_entry = radix_entry; - ), - TP_printk("dev %d:%d ino %#lx %s %s address %#lx radix_entry %#lx", - MAJOR(__entry->dev), - MINOR(__entry->dev), - __entry->ino, - __entry->vm_flags & VM_SHARED ? "shared" : "private", - __entry->write ? "write" : "read", - __entry->address, - (unsigned long)__entry->radix_entry - ) -) - DECLARE_EVENT_CLASS(dax_writeback_range_class, TP_PROTO(struct inode *inode, pgoff_t start_index, pgoff_t end_index), TP_ARGS(inode, start_index, end_index), -- 2.51.0 From a9d0aab5eb33a44792a66b7af13ff50d7b3e7022 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 6 Jun 2025 20:20:20 -0400 Subject: [PATCH 13/16] tracing: Fix regression of filter waiting a long time on RCU synchronization When faultable trace events were added, a trace event may no longer use normal RCU to synchronize but instead used synchronize_rcu_tasks_trace(). This synchronization takes a much longer time to synchronize. The filter logic would free the filters by calling tracepoint_synchronize_unregister() after it unhooked the filter strings and before freeing them. With this function now calling synchronize_rcu_tasks_trace() this increased the time to free a filter tremendously. On a PREEMPT_RT system, it was even more noticeable. # time trace-cmd record -p function sleep 1 [..] real 2m29.052s user 0m0.244s sys 0m20.136s As trace-cmd would clear out all the filters before recording, it could take up to 2 minutes to do a recording of "sleep 1". To find out where the issues was: ~# trace-cmd sqlhist -e -n sched_stack select start.prev_state as state, end.next_comm as comm, TIMESTAMP_DELTA_USECS as delta, start.STACKTRACE as stack from sched_switch as start join sched_switch as end on start.prev_pid = end.next_pid Which will produce the following commands (and -e will also execute them): echo 's:sched_stack s64 state; char comm[16]; u64 delta; unsigned long stack[];' >> /sys/kernel/tracing/dynamic_events echo 'hist:keys=prev_pid:__arg_18057_2=prev_state,__arg_18057_4=common_timestamp.usecs,__arg_18057_7=common_stacktrace' >> /sys/kernel/tracing/events/sched/sched_switch/trigger echo 'hist:keys=next_pid:__state_18057_1=$__arg_18057_2,__comm_18057_3=next_comm,__delta_18057_5=common_timestamp.usecs-$__arg_18057_4,__stack_18057_6=$__arg_18057_7:onmatch(sched.sched_switch).trace(sched_stack,$__state_18057_1,$__comm_18057_3,$__delta_18057_5,$__stack_18057_6)' >> /sys/kernel/tracing/events/sched/sched_switch/trigger The above creates a synthetic event that creates a stack trace when a task schedules out and records it with the time it scheduled back in. Basically the time a task is off the CPU. It also records the state of the task when it left the CPU (running, blocked, sleeping, etc). It also saves the comm of the task as "comm" (needed for the next command). ~# echo 'hist:keys=state,stack.stacktrace:vals=delta:sort=state,delta if comm == "trace-cmd" && state & 3' > /sys/kernel/tracing/events/synthetic/sched_stack/trigger The above creates a histogram with buckets per state, per stack, and the value of the total time it was off the CPU for that stack trace. It filters on tasks with "comm == trace-cmd" and only the sleeping and blocked states (1 - sleeping, 2 - blocked). ~# trace-cmd record -p function sleep 1 ~# cat /sys/kernel/tracing/events/synthetic/sched_stack/hist | tail -18 { state: 2, stack.stacktrace __schedule+0x1545/0x3700 schedule+0xe2/0x390 schedule_timeout+0x175/0x200 wait_for_completion_state+0x294/0x440 __wait_rcu_gp+0x247/0x4f0 synchronize_rcu_tasks_generic+0x151/0x230 apply_subsystem_event_filter+0xa2b/0x1300 subsystem_filter_write+0x67/0xc0 vfs_write+0x1e2/0xeb0 ksys_write+0xff/0x1d0 do_syscall_64+0x7b/0x420 entry_SYSCALL_64_after_hwframe+0x76/0x7e } hitcount: 237 delta: 99756288 <<--------------- Delta is 99 seconds! Totals: Hits: 525 Entries: 21 Dropped: 0 This shows that this particular trace waited for 99 seconds on synchronize_rcu_tasks() in apply_subsystem_event_filter(). In fact, there's a lot of places in the filter code that spends a lot of time waiting for synchronize_rcu_tasks_trace() in order to free the filters. Add helper functions that will use call_rcu*() variants to asynchronously free the filters. This brings the timings back to normal: # time trace-cmd record -p function sleep 1 [..] real 0m14.681s user 0m0.335s sys 0m28.616s And the histogram also shows this: ~# cat /sys/kernel/tracing/events/synthetic/sched_stack/hist | tail -21 { state: 2, stack.stacktrace __schedule+0x1545/0x3700 schedule+0xe2/0x390 schedule_timeout+0x175/0x200 wait_for_completion_state+0x294/0x440 __wait_rcu_gp+0x247/0x4f0 synchronize_rcu_normal+0x3db/0x5c0 tracing_reset_online_cpus+0x8f/0x1e0 tracing_open+0x335/0x440 do_dentry_open+0x4c6/0x17a0 vfs_open+0x82/0x360 path_openat+0x1a36/0x2990 do_filp_open+0x1c5/0x420 do_sys_openat2+0xed/0x180 __x64_sys_openat+0x108/0x1d0 do_syscall_64+0x7b/0x420 } hitcount: 2 delta: 77044 Totals: Hits: 55 Entries: 28 Dropped: 0 Where the total waiting time of synchronize_rcu_tasks_trace() is 77 milliseconds. Cc: stable@vger.kernel.org Cc: Masami Hiramatsu Cc: Mathieu Desnoyers Cc: "Paul E. McKenney" Cc: Jan Kiszka Cc: Andreas Ziegler Cc: Felix MOESSBAUER Link: https://lore.kernel.org/20250606201936.1e3d09a9@batman.local.home Reported-by: "Flot, Julien" Tested-by: Julien Flot Fixes: a363d27cdbc2 ("tracing: Allow system call tracepoints to handle page faults") Closes: https://lore.kernel.org/all/240017f656631c7dd4017aa93d91f41f653788ea.camel@siemens.com/ Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace_events_filter.c | 186 +++++++++++++++++++++-------- 1 file changed, 138 insertions(+), 48 deletions(-) diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 2048560264bb..711520081741 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -1335,22 +1335,139 @@ static void filter_free_subsystem_preds(struct trace_subsystem_dir *dir, } } +struct filter_list { + struct list_head list; + struct event_filter *filter; +}; + +struct filter_head { + struct list_head list; + struct rcu_head rcu; +}; + + +static void free_filter_list(struct rcu_head *rhp) +{ + struct filter_head *filter_list = container_of(rhp, struct filter_head, rcu); + struct filter_list *filter_item, *tmp; + + list_for_each_entry_safe(filter_item, tmp, &filter_list->list, list) { + __free_filter(filter_item->filter); + list_del(&filter_item->list); + kfree(filter_item); + } + kfree(filter_list); +} + +static void free_filter_list_tasks(struct rcu_head *rhp) +{ + call_rcu(rhp, free_filter_list); +} + +/* + * The tracepoint_synchronize_unregister() is a double rcu call. + * It calls synchronize_rcu_tasks_trace() followed by synchronize_rcu(). + * Instead of waiting for it, simply call these via the call_rcu*() + * variants. + */ +static void delay_free_filter(struct filter_head *head) +{ + call_rcu_tasks_trace(&head->rcu, free_filter_list_tasks); +} + +static void try_delay_free_filter(struct event_filter *filter) +{ + struct filter_head *head; + struct filter_list *item; + + head = kmalloc(sizeof(*head), GFP_KERNEL); + if (!head) + goto free_now; + + INIT_LIST_HEAD(&head->list); + + item = kmalloc(sizeof(*item), GFP_KERNEL); + if (!item) { + kfree(head); + goto free_now; + } + + item->filter = filter; + list_add_tail(&item->list, &head->list); + delay_free_filter(head); + return; + + free_now: + /* Make sure the filter is not being used */ + tracepoint_synchronize_unregister(); + __free_filter(filter); +} + static inline void __free_subsystem_filter(struct trace_event_file *file) { __free_filter(file->filter); file->filter = NULL; } +static inline void event_set_filter(struct trace_event_file *file, + struct event_filter *filter) +{ + rcu_assign_pointer(file->filter, filter); +} + +static inline void event_clear_filter(struct trace_event_file *file) +{ + RCU_INIT_POINTER(file->filter, NULL); +} + static void filter_free_subsystem_filters(struct trace_subsystem_dir *dir, - struct trace_array *tr) + struct trace_array *tr, + struct event_filter *filter) { struct trace_event_file *file; + struct filter_head *head; + struct filter_list *item; + + head = kmalloc(sizeof(*head), GFP_KERNEL); + if (!head) + goto free_now; + + INIT_LIST_HEAD(&head->list); + + item = kmalloc(sizeof(*item), GFP_KERNEL); + if (!item) { + kfree(head); + goto free_now; + } + + item->filter = filter; + list_add_tail(&item->list, &head->list); list_for_each_entry(file, &tr->events, list) { if (file->system != dir) continue; - __free_subsystem_filter(file); + item = kmalloc(sizeof(*item), GFP_KERNEL); + if (!item) + goto free_now; + item->filter = event_filter(file); + list_add_tail(&item->list, &head->list); + event_clear_filter(file); } + + delay_free_filter(head); + return; + free_now: + tracepoint_synchronize_unregister(); + + if (head) + free_filter_list(&head->rcu); + + list_for_each_entry(file, &tr->events, list) { + if (file->system != dir || !file->filter) + continue; + __free_filter(file->filter); + } + __free_filter(filter); } int filter_assign_type(const char *type) @@ -2120,22 +2237,6 @@ static inline void event_set_filtered_flag(struct trace_event_file *file) trace_buffered_event_enable(); } -static inline void event_set_filter(struct trace_event_file *file, - struct event_filter *filter) -{ - rcu_assign_pointer(file->filter, filter); -} - -static inline void event_clear_filter(struct trace_event_file *file) -{ - RCU_INIT_POINTER(file->filter, NULL); -} - -struct filter_list { - struct list_head list; - struct event_filter *filter; -}; - static int process_system_preds(struct trace_subsystem_dir *dir, struct trace_array *tr, struct filter_parse_error *pe, @@ -2144,11 +2245,16 @@ static int process_system_preds(struct trace_subsystem_dir *dir, struct trace_event_file *file; struct filter_list *filter_item; struct event_filter *filter = NULL; - struct filter_list *tmp; - LIST_HEAD(filter_list); + struct filter_head *filter_list; bool fail = true; int err; + filter_list = kmalloc(sizeof(*filter_list), GFP_KERNEL); + if (!filter_list) + return -ENOMEM; + + INIT_LIST_HEAD(&filter_list->list); + list_for_each_entry(file, &tr->events, list) { if (file->system != dir) @@ -2175,7 +2281,7 @@ static int process_system_preds(struct trace_subsystem_dir *dir, if (!filter_item) goto fail_mem; - list_add_tail(&filter_item->list, &filter_list); + list_add_tail(&filter_item->list, &filter_list->list); /* * Regardless of if this returned an error, we still * replace the filter for the call. @@ -2195,31 +2301,22 @@ static int process_system_preds(struct trace_subsystem_dir *dir, * Do a synchronize_rcu() and to ensure all calls are * done with them before we free them. */ - tracepoint_synchronize_unregister(); - list_for_each_entry_safe(filter_item, tmp, &filter_list, list) { - __free_filter(filter_item->filter); - list_del(&filter_item->list); - kfree(filter_item); - } + delay_free_filter(filter_list); return 0; fail: /* No call succeeded */ - list_for_each_entry_safe(filter_item, tmp, &filter_list, list) { - list_del(&filter_item->list); - kfree(filter_item); - } + free_filter_list(&filter_list->rcu); parse_error(pe, FILT_ERR_BAD_SUBSYS_FILTER, 0); return -EINVAL; fail_mem: __free_filter(filter); + /* If any call succeeded, we still need to sync */ if (!fail) - tracepoint_synchronize_unregister(); - list_for_each_entry_safe(filter_item, tmp, &filter_list, list) { - __free_filter(filter_item->filter); - list_del(&filter_item->list); - kfree(filter_item); - } + delay_free_filter(filter_list); + else + free_filter_list(&filter_list->rcu); + return -ENOMEM; } @@ -2361,9 +2458,7 @@ int apply_event_filter(struct trace_event_file *file, char *filter_string) event_clear_filter(file); - /* Make sure the filter is not being used */ - tracepoint_synchronize_unregister(); - __free_filter(filter); + try_delay_free_filter(filter); return 0; } @@ -2387,11 +2482,8 @@ int apply_event_filter(struct trace_event_file *file, char *filter_string) event_set_filter(file, filter); - if (tmp) { - /* Make sure the call is done with the filter */ - tracepoint_synchronize_unregister(); - __free_filter(tmp); - } + if (tmp) + try_delay_free_filter(tmp); } return err; @@ -2417,9 +2509,7 @@ int apply_subsystem_event_filter(struct trace_subsystem_dir *dir, filter = system->filter; system->filter = NULL; /* Ensure all filters are no longer used */ - tracepoint_synchronize_unregister(); - filter_free_subsystem_filters(dir, tr); - __free_filter(filter); + filter_free_subsystem_filters(dir, tr, filter); return 0; } -- 2.51.0 From 40ee2afafc1d9fe3aa44a6fbe440d78a5c96a72e Mon Sep 17 00:00:00 2001 From: Dmitry Antipov Date: Fri, 6 Jun 2025 14:22:42 +0300 Subject: [PATCH 14/16] ring-buffer: Fix buffer locking in ring_buffer_subbuf_order_set() Enlarge the critical section in ring_buffer_subbuf_order_set() to ensure that error handling takes place with per-buffer mutex held, thus preventing list corruption and other concurrency-related issues. Cc: stable@vger.kernel.org Cc: Masami Hiramatsu Cc: Mathieu Desnoyers Cc: Tzvetomir Stoyanov Link: https://lore.kernel.org/20250606112242.1510605-1-dmantipov@yandex.ru Reported-by: syzbot+05d673e83ec640f0ced9@syzkaller.appspotmail.com Closes: https://syzkaller.appspot.com/bug?extid=05d673e83ec640f0ced9 Fixes: f9b94daa542a8 ("ring-buffer: Set new size of the ring buffer sub page") Signed-off-by: Dmitry Antipov Signed-off-by: Steven Rostedt (Google) --- kernel/trace/ring_buffer.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index e24509bd0af5..00fc38d70e86 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -6795,7 +6795,7 @@ int ring_buffer_subbuf_order_set(struct trace_buffer *buffer, int order) old_size = buffer->subbuf_size; /* prevent another thread from changing buffer sizes */ - mutex_lock(&buffer->mutex); + guard(mutex)(&buffer->mutex); atomic_inc(&buffer->record_disabled); /* Make sure all commits have finished */ @@ -6900,7 +6900,6 @@ int ring_buffer_subbuf_order_set(struct trace_buffer *buffer, int order) } atomic_dec(&buffer->record_disabled); - mutex_unlock(&buffer->mutex); return 0; @@ -6909,7 +6908,6 @@ error: buffer->subbuf_size = old_size; atomic_dec(&buffer->record_disabled); - mutex_unlock(&buffer->mutex); for_each_buffer_cpu(buffer, cpu) { cpu_buffer = buffer->buffers[cpu]; -- 2.51.0 From de6fdc076d39c97c0f7ed4873bfc84f58909f479 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 5 Jun 2025 16:21:06 -0400 Subject: [PATCH 15/16] tracing: PM: Remove unused clock events The events clock_enable, clock_disable, and clock_set_rate were added back in 2010. In 2011 they were used by the arm architecture but removed in 2013. These events add around 7K of memory which was wasted for the last 12 years. Remove them. Link: https://lore.kernel.org/all/20250529130138.544ffec4@gandalf.local.home/ Cc: Masami Hiramatsu Cc: Mathieu Desnoyers Cc: Peter Zijlstra Cc: Ingo Molnar Cc: Kajetan Puchalski Acked-by: Rafael J. Wysocki Link: https://lore.kernel.org/20250605162106.1a459dad@gandalf.local.home Fixes: 74704ac6ea402 ("tracing, perf: Add more power related events") Signed-off-by: Steven Rostedt (Google) --- include/trace/events/power.h | 47 ------------------------------------ 1 file changed, 47 deletions(-) diff --git a/include/trace/events/power.h b/include/trace/events/power.h index 9253e83b9bb4..6c631eec23e3 100644 --- a/include/trace/events/power.h +++ b/include/trace/events/power.h @@ -337,53 +337,6 @@ DEFINE_EVENT(wakeup_source, wakeup_source_deactivate, TP_ARGS(name, state) ); -/* - * The clock events are used for clock enable/disable and for - * clock rate change - */ -DECLARE_EVENT_CLASS(clock, - - TP_PROTO(const char *name, unsigned int state, unsigned int cpu_id), - - TP_ARGS(name, state, cpu_id), - - TP_STRUCT__entry( - __string( name, name ) - __field( u64, state ) - __field( u64, cpu_id ) - ), - - TP_fast_assign( - __assign_str(name); - __entry->state = state; - __entry->cpu_id = cpu_id; - ), - - TP_printk("%s state=%lu cpu_id=%lu", __get_str(name), - (unsigned long)__entry->state, (unsigned long)__entry->cpu_id) -); - -DEFINE_EVENT(clock, clock_enable, - - TP_PROTO(const char *name, unsigned int state, unsigned int cpu_id), - - TP_ARGS(name, state, cpu_id) -); - -DEFINE_EVENT(clock, clock_disable, - - TP_PROTO(const char *name, unsigned int state, unsigned int cpu_id), - - TP_ARGS(name, state, cpu_id) -); - -DEFINE_EVENT(clock, clock_set_rate, - - TP_PROTO(const char *name, unsigned int state, unsigned int cpu_id), - - TP_ARGS(name, state, cpu_id) -); - /* * The power domain events are used for power domains transitions */ -- 2.51.0 From 549e914c96ae67760f36b9714b424dc992a0a69b Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Sat, 7 Jun 2025 10:28:21 -0400 Subject: [PATCH 16/16] tracing: Add rcu annotation around file->filter accesses Running sparse on trace_events_filter.c triggered several warnings about file->filter being accessed directly even though it's annotated with __rcu. Add rcu_dereference() around it and shuffle the logic slightly so that it's always referenced via accessor functions. Cc: Masami Hiramatsu Cc: Mathieu Desnoyers Link: https://lore.kernel.org/20250607102821.6c7effbf@gandalf.local.home Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace_events_filter.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 711520081741..ea8b364b6818 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -1250,7 +1250,9 @@ static void append_filter_err(struct trace_array *tr, static inline struct event_filter *event_filter(struct trace_event_file *file) { - return file->filter; + return rcu_dereference_protected(file->filter, + lockdep_is_held(&event_mutex)); + } /* caller must hold event_mutex */ @@ -1320,7 +1322,7 @@ void free_event_filter(struct event_filter *filter) static inline void __remove_filter(struct trace_event_file *file) { filter_disable(file); - remove_filter_string(file->filter); + remove_filter_string(event_filter(file)); } static void filter_free_subsystem_preds(struct trace_subsystem_dir *dir, @@ -1405,7 +1407,7 @@ static void try_delay_free_filter(struct event_filter *filter) static inline void __free_subsystem_filter(struct trace_event_file *file) { - __free_filter(file->filter); + __free_filter(event_filter(file)); file->filter = NULL; } @@ -1465,7 +1467,7 @@ static void filter_free_subsystem_filters(struct trace_subsystem_dir *dir, list_for_each_entry(file, &tr->events, list) { if (file->system != dir || !file->filter) continue; - __free_filter(file->filter); + __free_subsystem_filter(file); } __free_filter(filter); } -- 2.51.0