From 535d1784d8a96dce79023ac5081de1aa8669388a Mon Sep 17 00:00:00 2001
From: Diogo Ivo <diogo.ivo@siemens.com>
Date: Mon, 17 Mar 2025 10:55:06 +0000
Subject: [PATCH 01/16] watchdog: Add driver for Intel OC WDT

Add a driver for the Intel Over-Clocking Watchdog found in Intel
Platform Controller (PCH) chipsets. This watchdog is controlled
via a simple single-register interface and would otherwise be
standard except for the presence of a LOCK bit that can only be
set once per power cycle, needing extra handling around it.

Signed-off-by: Diogo Ivo <diogo.ivo@siemens.com>
Reviewed-by: Guenter Roeck <linux@roeck-us.net>
Link: https://lore.kernel.org/r/20250317-ivo-intel_oc_wdt-v3-1-32c396f4eefd@siemens.com
Signed-off-by: Guenter Roeck <linux@roeck-us.net>
Signed-off-by: Wim Van Sebroeck <wim@linux-watchdog.org>
---
 drivers/watchdog/Kconfig        |  11 ++
 drivers/watchdog/Makefile       |   1 +
 drivers/watchdog/intel_oc_wdt.c | 233 ++++++++++++++++++++++++++++++++
 3 files changed, 245 insertions(+)
 create mode 100644 drivers/watchdog/intel_oc_wdt.c

diff --git a/drivers/watchdog/Kconfig b/drivers/watchdog/Kconfig
index 16c49ca63fba..0c25b2ed44eb 100644
--- a/drivers/watchdog/Kconfig
+++ b/drivers/watchdog/Kconfig
@@ -1372,6 +1372,17 @@ config INTEL_MID_WATCHDOG
 
 	  To compile this driver as a module, choose M here.
 
+config INTEL_OC_WATCHDOG
+	tristate "Intel OC Watchdog"
+	depends on (X86 || COMPILE_TEST) && ACPI && HAS_IOPORT
+	select WATCHDOG_CORE
+	help
+	  Hardware driver for Intel Over-Clocking watchdog present in
+	  Platform Controller Hub (PCH) chipsets.
+
+	  To compile this driver as a module, choose M here: the
+	  module will be called intel_oc_wdt.
+
 config ITCO_WDT
 	tristate "Intel TCO Timer/Watchdog"
 	depends on X86 && PCI
diff --git a/drivers/watchdog/Makefile b/drivers/watchdog/Makefile
index 7b64ce9680a8..bbd4d62d2cc3 100644
--- a/drivers/watchdog/Makefile
+++ b/drivers/watchdog/Makefile
@@ -151,6 +151,7 @@ obj-$(CONFIG_W83977F_WDT) += w83977f_wdt.o
 obj-$(CONFIG_MACHZ_WDT) += machzwd.o
 obj-$(CONFIG_SBC_EPX_C3_WATCHDOG) += sbc_epx_c3.o
 obj-$(CONFIG_INTEL_MID_WATCHDOG) += intel-mid_wdt.o
+obj-$(CONFIG_INTEL_OC_WATCHDOG) += intel_oc_wdt.o
 obj-$(CONFIG_INTEL_MEI_WDT) += mei_wdt.o
 obj-$(CONFIG_NI903X_WDT) += ni903x_wdt.o
 obj-$(CONFIG_NIC7018_WDT) += nic7018_wdt.o
diff --git a/drivers/watchdog/intel_oc_wdt.c b/drivers/watchdog/intel_oc_wdt.c
new file mode 100644
index 000000000000..7c0551106981
--- /dev/null
+++ b/drivers/watchdog/intel_oc_wdt.c
@@ -0,0 +1,233 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Intel OC Watchdog driver
+ *
+ * Copyright (C) 2025, Siemens
+ * Author: Diogo Ivo <diogo.ivo@siemens.com>
+ */
+
+#define DRV_NAME	"intel_oc_wdt"
+
+#include <linux/acpi.h>
+#include <linux/bits.h>
+#include <linux/io.h>
+#include <linux/module.h>
+#include <linux/moduleparam.h>
+#include <linux/platform_device.h>
+#include <linux/watchdog.h>
+
+#define INTEL_OC_WDT_TOV		GENMASK(9, 0)
+#define INTEL_OC_WDT_MIN_TOV		1
+#define INTEL_OC_WDT_MAX_TOV		1024
+#define INTEL_OC_WDT_DEF_TOV		60
+
+/*
+ * One-time writable lock bit. If set forbids
+ * modification of itself, _TOV and _EN until
+ * next reboot.
+ */
+#define INTEL_OC_WDT_CTL_LCK		BIT(12)
+
+#define INTEL_OC_WDT_EN			BIT(14)
+#define INTEL_OC_WDT_NO_ICCSURV_STS	BIT(24)
+#define INTEL_OC_WDT_ICCSURV_STS	BIT(25)
+#define INTEL_OC_WDT_RLD		BIT(31)
+
+#define INTEL_OC_WDT_STS_BITS (INTEL_OC_WDT_NO_ICCSURV_STS | \
+			       INTEL_OC_WDT_ICCSURV_STS)
+
+#define INTEL_OC_WDT_CTRL_REG(wdt)	((wdt)->ctrl_res->start)
+
+struct intel_oc_wdt {
+	struct watchdog_device wdd;
+	struct resource *ctrl_res;
+	bool locked;
+};
+
+static int heartbeat;
+module_param(heartbeat, uint, 0);
+MODULE_PARM_DESC(heartbeat, "Watchdog heartbeats in seconds. (default="
+		 __MODULE_STRING(WDT_HEARTBEAT) ")");
+
+static bool nowayout = WATCHDOG_NOWAYOUT;
+module_param(nowayout, bool, 0);
+MODULE_PARM_DESC(nowayout, "Watchdog cannot be stopped once started (default="
+		 __MODULE_STRING(WATCHDOG_NOWAYOUT) ")");
+
+static int intel_oc_wdt_start(struct watchdog_device *wdd)
+{
+	struct intel_oc_wdt *oc_wdt = watchdog_get_drvdata(wdd);
+
+	if (oc_wdt->locked)
+		return 0;
+
+	outl(inl(INTEL_OC_WDT_CTRL_REG(oc_wdt)) | INTEL_OC_WDT_EN,
+	     INTEL_OC_WDT_CTRL_REG(oc_wdt));
+
+	return 0;
+}
+
+static int intel_oc_wdt_stop(struct watchdog_device *wdd)
+{
+	struct intel_oc_wdt *oc_wdt = watchdog_get_drvdata(wdd);
+
+	outl(inl(INTEL_OC_WDT_CTRL_REG(oc_wdt)) & ~INTEL_OC_WDT_EN,
+	     INTEL_OC_WDT_CTRL_REG(oc_wdt));
+
+	return 0;
+}
+
+static int intel_oc_wdt_ping(struct watchdog_device *wdd)
+{
+	struct intel_oc_wdt *oc_wdt = watchdog_get_drvdata(wdd);
+
+	outl(inl(INTEL_OC_WDT_CTRL_REG(oc_wdt)) | INTEL_OC_WDT_RLD,
+	     INTEL_OC_WDT_CTRL_REG(oc_wdt));
+
+	return 0;
+}
+
+static int intel_oc_wdt_set_timeout(struct watchdog_device *wdd,
+				    unsigned int t)
+{
+	struct intel_oc_wdt *oc_wdt = watchdog_get_drvdata(wdd);
+
+	outl((inl(INTEL_OC_WDT_CTRL_REG(oc_wdt)) & ~INTEL_OC_WDT_TOV) | (t - 1),
+	     INTEL_OC_WDT_CTRL_REG(oc_wdt));
+
+	wdd->timeout = t;
+
+	return 0;
+}
+
+static const struct watchdog_info intel_oc_wdt_info = {
+	.options = WDIOF_SETTIMEOUT | WDIOF_MAGICCLOSE | WDIOF_KEEPALIVEPING,
+	.identity = DRV_NAME,
+};
+
+static const struct watchdog_ops intel_oc_wdt_ops = {
+	.owner = THIS_MODULE,
+	.start = intel_oc_wdt_start,
+	.stop = intel_oc_wdt_stop,
+	.ping = intel_oc_wdt_ping,
+	.set_timeout = intel_oc_wdt_set_timeout,
+};
+
+static int intel_oc_wdt_setup(struct intel_oc_wdt *oc_wdt)
+{
+	struct watchdog_info *info;
+	unsigned long val;
+
+	val = inl(INTEL_OC_WDT_CTRL_REG(oc_wdt));
+
+	if (val & INTEL_OC_WDT_STS_BITS)
+		oc_wdt->wdd.bootstatus |= WDIOF_CARDRESET;
+
+	oc_wdt->locked = !!(val & INTEL_OC_WDT_CTL_LCK);
+
+	if (val & INTEL_OC_WDT_EN) {
+		/*
+		 * No need to issue a ping here to "commit" the new timeout
+		 * value to hardware as the watchdog core schedules one
+		 * immediately when registering the watchdog.
+		 */
+		set_bit(WDOG_HW_RUNNING, &oc_wdt->wdd.status);
+
+		if (oc_wdt->locked) {
+			info = (struct watchdog_info *)&intel_oc_wdt_info;
+			/*
+			 * Set nowayout unconditionally as we cannot stop
+			 * the watchdog.
+			 */
+			nowayout = true;
+			/*
+			 * If we are locked read the current timeout value
+			 * and inform the core we can't change it.
+			 */
+			oc_wdt->wdd.timeout = (val & INTEL_OC_WDT_TOV) + 1;
+			info->options &= ~WDIOF_SETTIMEOUT;
+
+			dev_info(oc_wdt->wdd.parent,
+				 "Register access locked, heartbeat fixed at: %u s\n",
+				 oc_wdt->wdd.timeout);
+		}
+	} else if (oc_wdt->locked) {
+		/*
+		 * In case the watchdog is disabled and locked there
+		 * is nothing we can do with it so just fail probing.
+		 */
+		return -EACCES;
+	}
+
+	val &= ~INTEL_OC_WDT_TOV;
+	outl(val | (oc_wdt->wdd.timeout - 1), INTEL_OC_WDT_CTRL_REG(oc_wdt));
+
+	return 0;
+}
+
+static int intel_oc_wdt_probe(struct platform_device *pdev)
+{
+	struct device *dev = &pdev->dev;
+	struct intel_oc_wdt *oc_wdt;
+	struct watchdog_device *wdd;
+	int ret;
+
+	oc_wdt = devm_kzalloc(&pdev->dev, sizeof(*oc_wdt), GFP_KERNEL);
+	if (!oc_wdt)
+		return -ENOMEM;
+
+	oc_wdt->ctrl_res = platform_get_resource(pdev, IORESOURCE_IO, 0);
+	if (!oc_wdt->ctrl_res) {
+		dev_err(&pdev->dev, "missing I/O resource\n");
+		return -ENODEV;
+	}
+
+	if (!devm_request_region(&pdev->dev, oc_wdt->ctrl_res->start,
+				 resource_size(oc_wdt->ctrl_res), pdev->name)) {
+		dev_err(dev, "resource %pR already in use, device disabled\n",
+			oc_wdt->ctrl_res);
+		return -EBUSY;
+	}
+
+	wdd = &oc_wdt->wdd;
+	wdd->min_timeout = INTEL_OC_WDT_MIN_TOV;
+	wdd->max_timeout = INTEL_OC_WDT_MAX_TOV;
+	wdd->timeout = INTEL_OC_WDT_DEF_TOV;
+	wdd->info = &intel_oc_wdt_info;
+	wdd->ops = &intel_oc_wdt_ops;
+	wdd->parent = dev;
+
+	watchdog_init_timeout(wdd, heartbeat, dev);
+
+	ret = intel_oc_wdt_setup(oc_wdt);
+	if (ret)
+		return ret;
+
+	watchdog_set_drvdata(wdd, oc_wdt);
+	watchdog_set_nowayout(wdd, nowayout);
+	watchdog_stop_on_reboot(wdd);
+	watchdog_stop_on_unregister(wdd);
+
+	return devm_watchdog_register_device(dev, wdd);
+}
+
+static const struct acpi_device_id intel_oc_wdt_match[] = {
+	{ "INT3F0D" },
+	{ "INTC1099" },
+	{ },
+};
+MODULE_DEVICE_TABLE(acpi, intel_oc_wdt_match);
+
+static struct platform_driver intel_oc_wdt_platform_driver = {
+	.driver = {
+		.name = DRV_NAME,
+		.acpi_match_table = intel_oc_wdt_match,
+	},
+	.probe = intel_oc_wdt_probe,
+};
+
+module_platform_driver(intel_oc_wdt_platform_driver);
+
+MODULE_AUTHOR("Diogo Ivo <diogo.ivo@siemens.com>");
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("Intel OC Watchdog driver");
-- 
2.51.0


From 158f9f2f71523bab787f4fa7a7a1f390524350ca Mon Sep 17 00:00:00 2001
From: Ziyan Fu <fuzy5@lenovo.com>
Date: Tue, 29 Apr 2025 18:25:33 +0800
Subject: [PATCH 02/16] watchdog: iTCO_wdt: Update the heartbeat value after
 clamping timeout

When executing "modprobe iTCO_wdt heartbeat=700", the user-specified
'heartbeat' parameter exceeds the valid range, the driver clamps the
timeout to default 30s but fails to update the logged 'heartbeat' value,
resulting in misleading log output:

iTCO_wdt iTCO_wdt: timeout value out of range, using 30
iTCO_wdt iTCO_wdt: initialized. heartbeat=700 sec (nowayout=0)

After validating the range, update the 'heartbeat' value with the clamped
timeout value to ensure that log messages accurately reflect the actual
runtime parameters.

Signed-off-by: Ziyan Fu <fuzy5@lenovo.com>
Reviewed-by: Wim Van Sebroeck <wim@linux-watchdog.org>
Link: https://lore.kernel.org/r/20250429102533.11886-1-13281011316@163.com
Signed-off-by: Guenter Roeck <linux@roeck-us.net>
Signed-off-by: Wim Van Sebroeck <wim@linux-watchdog.org>
---
 drivers/watchdog/iTCO_wdt.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/watchdog/iTCO_wdt.c b/drivers/watchdog/iTCO_wdt.c
index 30a8d72d84fa..9ab769aa0244 100644
--- a/drivers/watchdog/iTCO_wdt.c
+++ b/drivers/watchdog/iTCO_wdt.c
@@ -580,6 +580,7 @@ static int iTCO_wdt_probe(struct platform_device *pdev)
 		iTCO_wdt_set_timeout(&p->wddev, WATCHDOG_TIMEOUT);
 		dev_info(dev, "timeout value out of range, using %d\n",
 			WATCHDOG_TIMEOUT);
+		heartbeat = WATCHDOG_TIMEOUT;
 	}
 
 	watchdog_stop_on_reboot(&p->wddev);
-- 
2.51.0


From 5c78e793f78732b60276401f75cc1a101f9ad121 Mon Sep 17 00:00:00 2001
From: Kees Cook <kees@kernel.org>
Date: Fri, 30 May 2025 12:06:47 -0700
Subject: [PATCH 03/16] overflow: Introduce __DEFINE_FLEX for having no
 initializer

While not yet in the tree, there is a proposed patch[1] that was
depending on the prior behavior of _DEFINE_FLEX, which did not have an
explicit initializer. Provide this via __DEFINE_FLEX now, which can also
have attributes applied (e.g. __uninitialized).

Examples of the resulting initializer behaviors can be seen here:
https://godbolt.org/z/P7Go8Tr33

Link: https://lore.kernel.org/netdev/20250520205920.2134829-9-anthony.l.nguyen@intel.com [1]
Fixes: 47e36ed78406 ("overflow: Fix direct struct member initialization in _DEFINE_FLEX()")
Signed-off-by: Kees Cook <kees@kernel.org>
---
 include/linux/overflow.h | 25 +++++++++++++++++++------
 1 file changed, 19 insertions(+), 6 deletions(-)

diff --git a/include/linux/overflow.h b/include/linux/overflow.h
index 7b7be27ca113..154ed0dbb43f 100644
--- a/include/linux/overflow.h
+++ b/include/linux/overflow.h
@@ -389,24 +389,37 @@ static inline size_t __must_check size_sub(size_t minuend, size_t subtrahend)
 	struct_size((type *)NULL, member, count)
 
 /**
- * _DEFINE_FLEX() - helper macro for DEFINE_FLEX() family.
- * Enables caller macro to pass (different) initializer.
+ * __DEFINE_FLEX() - helper macro for DEFINE_FLEX() family.
+ * Enables caller macro to pass arbitrary trailing expressions
  *
  * @type: structure type name, including "struct" keyword.
  * @name: Name for a variable to define.
  * @member: Name of the array member.
  * @count: Number of elements in the array; must be compile-time const.
- * @initializer: Initializer expression (e.g., pass `= { }` at minimum).
+ * @trailer: Trailing expressions for attributes and/or initializers.
  */
-#define _DEFINE_FLEX(type, name, member, count, initializer...)			\
+#define __DEFINE_FLEX(type, name, member, count, trailer...)			\
 	_Static_assert(__builtin_constant_p(count),				\
 		       "onstack flex array members require compile-time const count"); \
 	union {									\
 		u8 bytes[struct_size_t(type, member, count)];			\
 		type obj;							\
-	} name##_u = { .obj initializer };					\
+	} name##_u trailer;							\
 	type *name = (type *)&name##_u
 
+/**
+ * _DEFINE_FLEX() - helper macro for DEFINE_FLEX() family.
+ * Enables caller macro to pass (different) initializer.
+ *
+ * @type: structure type name, including "struct" keyword.
+ * @name: Name for a variable to define.
+ * @member: Name of the array member.
+ * @count: Number of elements in the array; must be compile-time const.
+ * @initializer: Initializer expression (e.g., pass `= { }` at minimum).
+ */
+#define _DEFINE_FLEX(type, name, member, count, initializer...)			\
+	__DEFINE_FLEX(type, name, member, count, = { .obj initializer })
+
 /**
  * DEFINE_RAW_FLEX() - Define an on-stack instance of structure with a trailing
  * flexible array member, when it does not have a __counted_by annotation.
@@ -424,7 +437,7 @@ static inline size_t __must_check size_sub(size_t minuend, size_t subtrahend)
  * elements in array @member.
  */
 #define DEFINE_RAW_FLEX(type, name, member, count)	\
-	_DEFINE_FLEX(type, name, member, count, = {})
+	__DEFINE_FLEX(type, name, member, count, = { })
 
 /**
  * DEFINE_FLEX() - Define an on-stack instance of structure with a trailing
-- 
2.51.0


From f39f18f3c3531aa802b58a20d39d96e82eb96c14 Mon Sep 17 00:00:00 2001
From: Kees Cook <kees@kernel.org>
Date: Fri, 30 May 2025 15:18:28 -0700
Subject: [PATCH 04/16] randstruct: gcc-plugin: Fix attribute addition

Based on changes in the 2021 public version of the randstruct
out-of-tree GCC plugin[1], more carefully update the attributes on
resulting decls, to avoid tripping checks in GCC 15's
comptypes_check_enum_int() when it has been configured with
"--enable-checking=misc":

arch/arm64/kernel/kexec_image.c:132:14: internal compiler error: in comptypes_check_enum_int, at c/c-typeck.cc:1519
  132 | const struct kexec_file_ops kexec_image_ops = {
      |              ^~~~~~~~~~~~~~
 internal_error(char const*, ...), at gcc/gcc/diagnostic-global-context.cc:517
 fancy_abort(char const*, int, char const*), at gcc/gcc/diagnostic.cc:1803
 comptypes_check_enum_int(tree_node*, tree_node*, bool*), at gcc/gcc/c/c-typeck.cc:1519
 ...

Link: https://archive.org/download/grsecurity/grsecurity-3.1-5.10.41-202105280954.patch.gz [1]
Reported-by: Thiago Jung Bauermann <thiago.bauermann@linaro.org>
Closes: https://github.com/KSPP/linux/issues/367
Closes: https://lore.kernel.org/lkml/20250530000646.104457-1-thiago.bauermann@linaro.org/
Reported-by: Ingo Saitz <ingo@hannover.ccc.de>
Closes: https://bugs.debian.org/cgi-bin/bugreport.cgi?bug=1104745
Fixes: 313dd1b62921 ("gcc-plugins: Add the randstruct plugin")
Tested-by: Thiago Jung Bauermann <thiago.bauermann@linaro.org>
Link: https://lore.kernel.org/r/20250530221824.work.623-kees@kernel.org
Signed-off-by: Kees Cook <kees@kernel.org>
---
 scripts/gcc-plugins/gcc-common.h              | 32 +++++++++++++++++++
 scripts/gcc-plugins/randomize_layout_plugin.c | 22 ++++++-------
 2 files changed, 43 insertions(+), 11 deletions(-)

diff --git a/scripts/gcc-plugins/gcc-common.h b/scripts/gcc-plugins/gcc-common.h
index 3222c1070444..ef12c8f929ed 100644
--- a/scripts/gcc-plugins/gcc-common.h
+++ b/scripts/gcc-plugins/gcc-common.h
@@ -123,6 +123,38 @@ static inline tree build_const_char_string(int len, const char *str)
 	return cstr;
 }
 
+static inline void __add_type_attr(tree type, const char *attr, tree args)
+{
+	tree oldattr;
+
+	if (type == NULL_TREE)
+		return;
+	oldattr = lookup_attribute(attr, TYPE_ATTRIBUTES(type));
+	if (oldattr != NULL_TREE) {
+		gcc_assert(TREE_VALUE(oldattr) == args || TREE_VALUE(TREE_VALUE(oldattr)) == TREE_VALUE(args));
+		return;
+	}
+
+	TYPE_ATTRIBUTES(type) = copy_list(TYPE_ATTRIBUTES(type));
+	TYPE_ATTRIBUTES(type) = tree_cons(get_identifier(attr), args, TYPE_ATTRIBUTES(type));
+}
+
+static inline void add_type_attr(tree type, const char *attr, tree args)
+{
+	tree main_variant = TYPE_MAIN_VARIANT(type);
+
+	__add_type_attr(TYPE_CANONICAL(type), attr, args);
+	__add_type_attr(TYPE_CANONICAL(main_variant), attr, args);
+	__add_type_attr(main_variant, attr, args);
+
+	for (type = TYPE_NEXT_VARIANT(main_variant); type; type = TYPE_NEXT_VARIANT(type)) {
+		if (!lookup_attribute(attr, TYPE_ATTRIBUTES(type)))
+			TYPE_ATTRIBUTES(type) = TYPE_ATTRIBUTES(main_variant);
+
+		__add_type_attr(TYPE_CANONICAL(type), attr, args);
+	}
+}
+
 #define PASS_INFO(NAME, REF, ID, POS)		\
 struct register_pass_info NAME##_pass_info = {	\
 	.pass = make_##NAME##_pass(),		\
diff --git a/scripts/gcc-plugins/randomize_layout_plugin.c b/scripts/gcc-plugins/randomize_layout_plugin.c
index 971a1908a8cc..ff65a4f87f24 100644
--- a/scripts/gcc-plugins/randomize_layout_plugin.c
+++ b/scripts/gcc-plugins/randomize_layout_plugin.c
@@ -73,6 +73,9 @@ static tree handle_randomize_layout_attr(tree *node, tree name, tree args, int f
 
 	if (TYPE_P(*node)) {
 		type = *node;
+	} else if (TREE_CODE(*node) == FIELD_DECL) {
+		*no_add_attrs = false;
+		return NULL_TREE;
 	} else {
 		gcc_assert(TREE_CODE(*node) == TYPE_DECL);
 		type = TREE_TYPE(*node);
@@ -348,15 +351,14 @@ static int relayout_struct(tree type)
 		TREE_CHAIN(newtree[i]) = newtree[i+1];
 	TREE_CHAIN(newtree[num_fields - 1]) = NULL_TREE;
 
+	add_type_attr(type, "randomize_performed", NULL_TREE);
+	add_type_attr(type, "designated_init", NULL_TREE);
+	if (has_flexarray)
+		add_type_attr(type, "has_flexarray", NULL_TREE);
+
 	main_variant = TYPE_MAIN_VARIANT(type);
-	for (variant = main_variant; variant; variant = TYPE_NEXT_VARIANT(variant)) {
+	for (variant = main_variant; variant; variant = TYPE_NEXT_VARIANT(variant))
 		TYPE_FIELDS(variant) = newtree[0];
-		TYPE_ATTRIBUTES(variant) = copy_list(TYPE_ATTRIBUTES(variant));
-		TYPE_ATTRIBUTES(variant) = tree_cons(get_identifier("randomize_performed"), NULL_TREE, TYPE_ATTRIBUTES(variant));
-		TYPE_ATTRIBUTES(variant) = tree_cons(get_identifier("designated_init"), NULL_TREE, TYPE_ATTRIBUTES(variant));
-		if (has_flexarray)
-			TYPE_ATTRIBUTES(type) = tree_cons(get_identifier("has_flexarray"), NULL_TREE, TYPE_ATTRIBUTES(type));
-	}
 
 	/*
 	 * force a re-layout of the main variant
@@ -424,10 +426,8 @@ static void randomize_type(tree type)
 	if (lookup_attribute("randomize_layout", TYPE_ATTRIBUTES(TYPE_MAIN_VARIANT(type))) || is_pure_ops_struct(type))
 		relayout_struct(type);
 
-	for (variant = TYPE_MAIN_VARIANT(type); variant; variant = TYPE_NEXT_VARIANT(variant)) {
-		TYPE_ATTRIBUTES(type) = copy_list(TYPE_ATTRIBUTES(type));
-		TYPE_ATTRIBUTES(type) = tree_cons(get_identifier("randomize_considered"), NULL_TREE, TYPE_ATTRIBUTES(type));
-	}
+	add_type_attr(type, "randomize_considered", NULL_TREE);
+
 #ifdef __DEBUG_PLUGIN
 	fprintf(stderr, "Marking randomize_considered on struct %s\n", ORIG_TYPE_NAME(type));
 #ifdef __DEBUG_VERBOSE
-- 
2.51.0


From 438e22801b1958f86883812af70d402eda29c4f5 Mon Sep 17 00:00:00 2001
From: Paolo Bonzini <pbonzini@redhat.com>
Date: Fri, 30 May 2025 03:45:13 -0400
Subject: [PATCH 05/16] rtmutex_api: provide correct extern functions

Commit fb49f07ba1d9 ("locking/mutex: implement mutex_lock_killable_nest_lock")
changed the set of functions that mutex.c defines when CONFIG_DEBUG_LOCK_ALLOC
is set.

- it removed the "extern" declaration of mutex_lock_killable_nested from
  include/linux/mutex.h, and replaced it with a macro since it could be
  treated as a special case of _mutex_lock_killable.  It also removed a
  definition of the function in kernel/locking/mutex.c.

- likewise, it replaced mutex_trylock() with the more generic
  mutex_trylock_nest_lock() and replaced mutex_trylock() with a macro.

However, it left the old definitions in place in kernel/locking/rtmutex_api.c,
which causes failures when building with CONFIG_RT_MUTEXES=y.  Bring over
the changes.

Fixes: fb49f07ba1d9 ("locking/mutex: implement mutex_lock_killable_nest_lock")
Reported-by: Randy Dunlap <rdunlap@infradead.org>
Tested-by: Randy Dunlap <rdunlap@infradead.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 kernel/locking/rtmutex_api.c | 33 +++++++++++++++++++++------------
 1 file changed, 21 insertions(+), 12 deletions(-)

diff --git a/kernel/locking/rtmutex_api.c b/kernel/locking/rtmutex_api.c
index 191e4720e546..2d933528a0fa 100644
--- a/kernel/locking/rtmutex_api.c
+++ b/kernel/locking/rtmutex_api.c
@@ -544,12 +544,12 @@ int __sched mutex_lock_interruptible_nested(struct mutex *lock,
 }
 EXPORT_SYMBOL_GPL(mutex_lock_interruptible_nested);
 
-int __sched mutex_lock_killable_nested(struct mutex *lock,
-					    unsigned int subclass)
+int __sched _mutex_lock_killable(struct mutex *lock, unsigned int subclass,
+				 struct lockdep_map *nest_lock)
 {
-	return __mutex_lock_common(lock, TASK_KILLABLE, subclass, NULL, _RET_IP_);
+	return __mutex_lock_common(lock, TASK_KILLABLE, subclass, nest_lock, _RET_IP_);
 }
-EXPORT_SYMBOL_GPL(mutex_lock_killable_nested);
+EXPORT_SYMBOL_GPL(_mutex_lock_killable);
 
 void __sched mutex_lock_io_nested(struct mutex *lock, unsigned int subclass)
 {
@@ -563,6 +563,21 @@ void __sched mutex_lock_io_nested(struct mutex *lock, unsigned int subclass)
 }
 EXPORT_SYMBOL_GPL(mutex_lock_io_nested);
 
+int __sched _mutex_trylock_nest_lock(struct mutex *lock,
+				     struct lockdep_map *nest_lock)
+{
+	int ret;
+
+	if (IS_ENABLED(CONFIG_DEBUG_RT_MUTEXES) && WARN_ON_ONCE(!in_task()))
+		return 0;
+
+	ret = __rt_mutex_trylock(&lock->rtmutex);
+	if (ret)
+		mutex_acquire_nest(&lock->dep_map, 0, 1, nest_lock, _RET_IP_);
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(_mutex_trylock_nest_lock);
 #else /* CONFIG_DEBUG_LOCK_ALLOC */
 
 void __sched mutex_lock(struct mutex *lock)
@@ -591,22 +606,16 @@ void __sched mutex_lock_io(struct mutex *lock)
 	io_schedule_finish(token);
 }
 EXPORT_SYMBOL(mutex_lock_io);
-#endif /* !CONFIG_DEBUG_LOCK_ALLOC */
 
 int __sched mutex_trylock(struct mutex *lock)
 {
-	int ret;
-
 	if (IS_ENABLED(CONFIG_DEBUG_RT_MUTEXES) && WARN_ON_ONCE(!in_task()))
 		return 0;
 
-	ret = __rt_mutex_trylock(&lock->rtmutex);
-	if (ret)
-		mutex_acquire(&lock->dep_map, 0, 1, _RET_IP_);
-
-	return ret;
+	return __rt_mutex_trylock(&lock->rtmutex);
 }
 EXPORT_SYMBOL(mutex_trylock);
+#endif /* !CONFIG_DEBUG_LOCK_ALLOC */
 
 void __sched mutex_unlock(struct mutex *lock)
 {
-- 
2.51.0


From 15ac613f124e51a6623975efad9657b1f3ee47e7 Mon Sep 17 00:00:00 2001
From: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Date: Mon, 19 May 2025 15:56:57 +0100
Subject: [PATCH 06/16] KVM: s390: rename PROT_NONE to PROT_TYPE_DUMMY

The enum type prot_type declared in arch/s390/kvm/gaccess.c declares an
unfortunate identifier within it - PROT_NONE.

This clashes with the protection bit define from the uapi for mmap()
declared in include/uapi/asm-generic/mman-common.h, which is indeed what
those casually reading this code would assume this to refer to.

This means that any changes which subsequently alter headers in any way
which results in the uapi header being imported here will cause build
errors.

Resolve the issue by renaming PROT_NONE to PROT_TYPE_DUMMY.

Link: https://lkml.kernel.org/r/20250519145657.178365-1-lorenzo.stoakes@oracle.com
Fixes: b3cefd6bf16e ("KVM: s390: Pass initialized arg even if unused")
Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Suggested-by: Ignacio Moreno Gonzalez <Ignacio.MorenoGonzalez@kuka.com>
Reported-by: kernel test robot <lkp@intel.com>
Closes: https://lore.kernel.org/oe-kbuild-all/202505140943.IgHDa9s7-lkp@intel.com/
Acked-by: Christian Borntraeger <borntraeger@linux.ibm.com>
Acked-by: Ignacio Moreno Gonzalez <Ignacio.MorenoGonzalez@kuka.com>
Acked-by: Yang Shi <yang@os.amperecomputing.com>
Reviewed-by: David Hildenbrand <david@redhat.com>
Acked-by: Liam R. Howlett <Liam.Howlett@oracle.com>
Reviewed-by: Oscar Salvador <osalvador@suse.de>
Reviewed-by: Claudio Imbrenda <imbrenda@linux.ibm.com>
Cc: <stable@vger.kernel.org>
Cc: Alexander Gordeev <agordeev@linux.ibm.com>
Cc: Heiko Carstens <hca@linux.ibm.com>
Cc: James Houghton <jthoughton@google.com>
Cc: Janosch Frank <frankja@linux.ibm.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: Sven Schnelle <svens@linux.ibm.com>
Cc: Vasily Gorbik <gor@linux.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 arch/s390/kvm/gaccess.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/arch/s390/kvm/gaccess.c b/arch/s390/kvm/gaccess.c
index e23670e1949c..21c2e61fece4 100644
--- a/arch/s390/kvm/gaccess.c
+++ b/arch/s390/kvm/gaccess.c
@@ -319,7 +319,7 @@ enum prot_type {
 	PROT_TYPE_DAT  = 3,
 	PROT_TYPE_IEP  = 4,
 	/* Dummy value for passing an initialized value when code != PGM_PROTECTION */
-	PROT_NONE,
+	PROT_TYPE_DUMMY,
 };
 
 static int trans_exc_ending(struct kvm_vcpu *vcpu, int code, unsigned long gva, u8 ar,
@@ -335,7 +335,7 @@ static int trans_exc_ending(struct kvm_vcpu *vcpu, int code, unsigned long gva,
 	switch (code) {
 	case PGM_PROTECTION:
 		switch (prot) {
-		case PROT_NONE:
+		case PROT_TYPE_DUMMY:
 			/* We should never get here, acts like termination */
 			WARN_ON_ONCE(1);
 			break;
@@ -805,7 +805,7 @@ static int guest_range_to_gpas(struct kvm_vcpu *vcpu, unsigned long ga, u8 ar,
 			gpa = kvm_s390_real_to_abs(vcpu, ga);
 			if (!kvm_is_gpa_in_memslot(vcpu->kvm, gpa)) {
 				rc = PGM_ADDRESSING;
-				prot = PROT_NONE;
+				prot = PROT_TYPE_DUMMY;
 			}
 		}
 		if (rc)
@@ -963,7 +963,7 @@ int access_guest_with_key(struct kvm_vcpu *vcpu, unsigned long ga, u8 ar,
 		if (rc == PGM_PROTECTION)
 			prot = PROT_TYPE_KEYC;
 		else
-			prot = PROT_NONE;
+			prot = PROT_TYPE_DUMMY;
 		rc = trans_exc_ending(vcpu, rc, ga, ar, mode, prot, terminate);
 	}
 out_unlock:
-- 
2.51.0


From 684088173b1852e5f5891a7b2a124b7974cb827d Mon Sep 17 00:00:00 2001
From: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>
Date: Thu, 29 May 2025 13:38:32 +0300
Subject: [PATCH 07/16] mm: fix vmstat after removing NR_BOUNCE

Hongyu noticed that the nr_unaccepted counter kept growing even in the
absence of unaccepted memory on the machine.

This happens due to a commit that removed NR_BOUNCE: it removed the
counter from the enum zone_stat_item, but left it in the vmstat_text
array.

As a result, all counters below nr_bounce in /proc/vmstat are shifted by
one line, causing the numa_hit counter to be labeled as nr_unaccepted.

To fix this issue, remove nr_bounce from the vmstat_text array.

Link: https://lkml.kernel.org/r/20250529103832.2937460-1-kirill.shutemov@linux.intel.com
Fixes: 194df9f66db8 ("mm: remove NR_BOUNCE zone stat")
Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Reported-by: Hongyu Ning <hongyu.ning@linux.intel.com>
Reviewed-by: Jens Axboe <axboe@kernel.dk>
Reviewed-by: Vlastimil Babka <vbabka@suse.cz>
Acked-by: Michal Hocko <mhocko@suse.com>
Reviewed-by: Shakeel Butt <shakeel.butt@linux.dev>
Cc: Christoph Hellwig <hch@lst.de>
Cc: Hannes Reinecke <hare@suse.de>
Cc: Johannes Thumshirn <johannes.thumshirn@wdc.com>
Cc: Jens Axboe <axboe@kernel.dk>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 mm/vmstat.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/mm/vmstat.c b/mm/vmstat.c
index 6f740f070b3d..429ae5339bfe 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -1201,7 +1201,6 @@ const char * const vmstat_text[] = {
 	"nr_zone_unevictable",
 	"nr_zone_write_pending",
 	"nr_mlock",
-	"nr_bounce",
 #if IS_ENABLED(CONFIG_ZSMALLOC)
 	"nr_zspages",
 #endif
-- 
2.51.0


From 9c49e5d09f076001e05537734d7df002162eb2b5 Mon Sep 17 00:00:00 2001
From: SeongJae Park <sj@kernel.org>
Date: Mon, 2 Jun 2025 10:49:26 -0700
Subject: [PATCH 08/16] mm/madvise: handle madvise_lock() failure during race
 unwinding

When unwinding race on -ERESTARTNOINTR handling of process_madvise(),
madvise_lock() failure is ignored.  Check the failure and abort remaining
works in the case.

Link: https://lkml.kernel.org/r/20250602174926.1074-1-sj@kernel.org
Fixes: 4000e3d0a367 ("mm/madvise: remove redundant mmap_lock operations from process_madvise()")
Signed-off-by: SeongJae Park <sj@kernel.org>
Reported-by: Barry Song <21cnbao@gmail.com>
Closes: https://lore.kernel.org/CAGsJ_4xJXXO0G+4BizhohSZ4yDteziPw43_uF8nPXPWxUVChzw@mail.gmail.com
Reviewed-by: Jann Horn <jannh@google.com>
Reviewed-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Acked-by: David Hildenbrand <david@redhat.com>
Reviewed-by: Shakeel Butt <shakeel.butt@linux.dev>
Reviewed-by: Barry Song <baohua@kernel.org>
Cc: Liam Howlett <liam.howlett@oracle.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 mm/madvise.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/mm/madvise.c b/mm/madvise.c
index 8433ac9b27e0..5f7a66a1617e 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -1881,7 +1881,9 @@ static ssize_t vector_madvise(struct mm_struct *mm, struct iov_iter *iter,
 			/* Drop and reacquire lock to unwind race. */
 			madvise_finish_tlb(&madv_behavior);
 			madvise_unlock(mm, behavior);
-			madvise_lock(mm, behavior);
+			ret = madvise_lock(mm, behavior);
+			if (ret)
+				goto out;
 			madvise_init_tlb(&madv_behavior, mm);
 			continue;
 		}
@@ -1892,6 +1894,7 @@ static ssize_t vector_madvise(struct mm_struct *mm, struct iov_iter *iter,
 	madvise_finish_tlb(&madv_behavior);
 	madvise_unlock(mm, behavior);
 
+out:
 	ret = (total_len - iov_iter_count(iter)) ? : ret;
 
 	return ret;
-- 
2.51.0


From 044d2aee6c575231ed4a24fb3d119bad0937488b Mon Sep 17 00:00:00 2001
From: Suren Baghdasaryan <surenb@google.com>
Date: Wed, 21 May 2025 09:06:02 -0700
Subject: [PATCH 09/16] alloc_tag: handle module codetag load errors as module
 load failures

Failures inside codetag_load_module() are currently ignored.  As a result
an error there would not cause a module load failure and freeing of the
associated resources.  Correct this behavior by propagating the error code
to the caller and handling possible errors.  With this change, error to
allocate percpu counters, which happens at this stage, will not be ignored
and will cause a module load failure and freeing of resources.  With this
change we also do not need to disable memory allocation profiling when
this error happens, instead we fail to load the module.

Link: https://lkml.kernel.org/r/20250521160602.1940771-1-surenb@google.com
Fixes: 10075262888b ("alloc_tag: allocate percpu counters for module tags dynamically")
Signed-off-by: Suren Baghdasaryan <surenb@google.com>
Reported-by: Casey Chen <cachen@purestorage.com>
Closes: https://lore.kernel.org/all/20250520231620.15259-1-cachen@purestorage.com/
Cc: Daniel Gomez <da.gomez@samsung.com>
Cc: David Wang <00107082@163.com>
Cc: Kent Overstreet <kent.overstreet@linux.dev>
Cc: Luis Chamberalin <mcgrof@kernel.org>
Cc: Petr Pavlu <petr.pavlu@suse.com>
Cc: Sami Tolvanen <samitolvanen@google.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/codetag.h |  8 ++++----
 kernel/module/main.c    |  5 +++--
 lib/alloc_tag.c         | 12 +++++++-----
 lib/codetag.c           | 34 +++++++++++++++++++++++++---------
 4 files changed, 39 insertions(+), 20 deletions(-)

diff --git a/include/linux/codetag.h b/include/linux/codetag.h
index 0ee4c21c6dbc..5f2b9a1f722c 100644
--- a/include/linux/codetag.h
+++ b/include/linux/codetag.h
@@ -36,8 +36,8 @@ union codetag_ref {
 struct codetag_type_desc {
 	const char *section;
 	size_t tag_size;
-	void (*module_load)(struct module *mod,
-			    struct codetag *start, struct codetag *end);
+	int (*module_load)(struct module *mod,
+			   struct codetag *start, struct codetag *end);
 	void (*module_unload)(struct module *mod,
 			      struct codetag *start, struct codetag *end);
 #ifdef CONFIG_MODULES
@@ -89,7 +89,7 @@ void *codetag_alloc_module_section(struct module *mod, const char *name,
 				   unsigned long align);
 void codetag_free_module_sections(struct module *mod);
 void codetag_module_replaced(struct module *mod, struct module *new_mod);
-void codetag_load_module(struct module *mod);
+int codetag_load_module(struct module *mod);
 void codetag_unload_module(struct module *mod);
 
 #else /* defined(CONFIG_CODE_TAGGING) && defined(CONFIG_MODULES) */
@@ -103,7 +103,7 @@ codetag_alloc_module_section(struct module *mod, const char *name,
 			     unsigned long align) { return NULL; }
 static inline void codetag_free_module_sections(struct module *mod) {}
 static inline void codetag_module_replaced(struct module *mod, struct module *new_mod) {}
-static inline void codetag_load_module(struct module *mod) {}
+static inline int codetag_load_module(struct module *mod) { return 0; }
 static inline void codetag_unload_module(struct module *mod) {}
 
 #endif /* defined(CONFIG_CODE_TAGGING) && defined(CONFIG_MODULES) */
diff --git a/kernel/module/main.c b/kernel/module/main.c
index 3d64e69cc03e..08b59c37735e 100644
--- a/kernel/module/main.c
+++ b/kernel/module/main.c
@@ -3386,11 +3386,12 @@ static int load_module(struct load_info *info, const char __user *uargs,
 			goto sysfs_cleanup;
 	}
 
+	if (codetag_load_module(mod))
+		goto sysfs_cleanup;
+
 	/* Get rid of temporary copy. */
 	free_copy(info, flags);
 
-	codetag_load_module(mod);
-
 	/* Done! */
 	trace_module_load(mod);
 
diff --git a/lib/alloc_tag.c b/lib/alloc_tag.c
index 45dae7da70e1..d48b80f3f007 100644
--- a/lib/alloc_tag.c
+++ b/lib/alloc_tag.c
@@ -607,15 +607,16 @@ out:
 	mas_unlock(&mas);
 }
 
-static void load_module(struct module *mod, struct codetag *start, struct codetag *stop)
+static int load_module(struct module *mod, struct codetag *start, struct codetag *stop)
 {
 	/* Allocate module alloc_tag percpu counters */
 	struct alloc_tag *start_tag;
 	struct alloc_tag *stop_tag;
 	struct alloc_tag *tag;
 
+	/* percpu counters for core allocations are already statically allocated */
 	if (!mod)
-		return;
+		return 0;
 
 	start_tag = ct_to_alloc_tag(start);
 	stop_tag = ct_to_alloc_tag(stop);
@@ -627,12 +628,13 @@ static void load_module(struct module *mod, struct codetag *start, struct codeta
 				free_percpu(tag->counters);
 				tag->counters = NULL;
 			}
-			shutdown_mem_profiling(true);
-			pr_err("Failed to allocate memory for allocation tag percpu counters in the module %s. Memory allocation profiling is disabled!\n",
+			pr_err("Failed to allocate memory for allocation tag percpu counters in the module %s\n",
 			       mod->name);
-			break;
+			return -ENOMEM;
 		}
 	}
+
+	return 0;
 }
 
 static void replace_module(struct module *mod, struct module *new_mod)
diff --git a/lib/codetag.c b/lib/codetag.c
index de332e98d6f5..650d54d7e14d 100644
--- a/lib/codetag.c
+++ b/lib/codetag.c
@@ -167,6 +167,7 @@ static int codetag_module_init(struct codetag_type *cttype, struct module *mod)
 {
 	struct codetag_range range;
 	struct codetag_module *cmod;
+	int mod_id;
 	int err;
 
 	range = get_section_range(mod, cttype->desc.section);
@@ -190,11 +191,20 @@ static int codetag_module_init(struct codetag_type *cttype, struct module *mod)
 	cmod->range = range;
 
 	down_write(&cttype->mod_lock);
-	err = idr_alloc(&cttype->mod_idr, cmod, 0, 0, GFP_KERNEL);
-	if (err >= 0) {
-		cttype->count += range_size(cttype, &range);
-		if (cttype->desc.module_load)
-			cttype->desc.module_load(mod, range.start, range.stop);
+	mod_id = idr_alloc(&cttype->mod_idr, cmod, 0, 0, GFP_KERNEL);
+	if (mod_id >= 0) {
+		if (cttype->desc.module_load) {
+			err = cttype->desc.module_load(mod, range.start, range.stop);
+			if (!err)
+				cttype->count += range_size(cttype, &range);
+			else
+				idr_remove(&cttype->mod_idr, mod_id);
+		} else {
+			cttype->count += range_size(cttype, &range);
+			err = 0;
+		}
+	} else {
+		err = mod_id;
 	}
 	up_write(&cttype->mod_lock);
 
@@ -295,17 +305,23 @@ void codetag_module_replaced(struct module *mod, struct module *new_mod)
 	mutex_unlock(&codetag_lock);
 }
 
-void codetag_load_module(struct module *mod)
+int codetag_load_module(struct module *mod)
 {
 	struct codetag_type *cttype;
+	int ret = 0;
 
 	if (!mod)
-		return;
+		return 0;
 
 	mutex_lock(&codetag_lock);
-	list_for_each_entry(cttype, &codetag_types, link)
-		codetag_module_init(cttype, mod);
+	list_for_each_entry(cttype, &codetag_types, link) {
+		ret = codetag_module_init(cttype, mod);
+		if (ret)
+			break;
+	}
 	mutex_unlock(&codetag_lock);
+
+	return ret;
 }
 
 void codetag_unload_module(struct module *mod)
-- 
2.51.0


From 41ffaa0ea76206e4fb7589a5a499e6752c33188a Mon Sep 17 00:00:00 2001
From: Joshua Hahn <joshua.hahnjy@gmail.com>
Date: Mon, 2 Jun 2025 09:23:39 -0700
Subject: [PATCH 10/16] mm/mempolicy: fix incorrect freeing of wi_kobj

We should not free wi_group->wi_kobj here.  In the error path of
add_weighted_interleave_group() where this snippet is called from,
kobj_{del, put} is immediately called right after this section.  Thus, it
is not only unnecessary but also incorrect to free it here.

Link: https://lkml.kernel.org/r/20250602162345.2595696-1-joshua.hahnjy@gmail.com
Fixes: e341f9c3c841 ("mm/mempolicy: Weighted Interleave Auto-tuning")
Signed-off-by: Joshua Hahn <joshua.hahnjy@gmail.com>
Reported-by: kernel test robot <lkp@intel.com>
Closes: https://lore.kernel.org/oe-kbuild-all/202506011545.Fduxqxqj-lkp@intel.com/
Cc: Alistair Popple <apopple@nvidia.com>
Cc: Byungchul Park <byungchul@sk.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Gregory Price <gourry@gourry.net>
Cc: "Huang, Ying" <ying.huang@linux.alibaba.com>
Cc: Mathew Brost <matthew.brost@intel.com>
Cc: Rakie Kim <rakie.kim@sk.com>
Cc: Zi Yan <ziy@nvidia.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 mm/mempolicy.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 72fd72e156b1..3b1dfd08338b 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -3708,15 +3708,13 @@ static void wi_state_free(void)
 			lockdep_is_held(&wi_state_lock));
 	if (!old_wi_state) {
 		mutex_unlock(&wi_state_lock);
-		goto out;
+		return;
 	}
 
 	rcu_assign_pointer(wi_state, NULL);
 	mutex_unlock(&wi_state_lock);
 	synchronize_rcu();
 	kfree(old_wi_state);
-out:
-	kfree(&wi_group->wi_kobj);
 }
 
 static struct kobj_attribute wi_auto_attr =
-- 
2.51.0


From 334d7c4fb60cf21e0abac134d92fe49e9b04377e Mon Sep 17 00:00:00 2001
From: Nitesh Shetty <nj.shetty@samsung.com>
Date: Mon, 28 Apr 2025 15:28:48 +0530
Subject: [PATCH 11/16] iov_iter: use iov_offset for length calculation in
 iov_iter_aligned_bvec

If iov_offset is non-zero, then we need to consider iov_offset in length
calculation, otherwise we might pass smaller IOs such as 512 bytes, in
below scenario [1].

This issue is reproducible using lib-uring test/fixed-seg.c application
with fixed buffer on a 512 LBA formatted device.

[1]

At present we pass the alignment check, for 512 LBA formatted devices,
len_mask = 511 when IO is smaller, i->count = 512 has an offset,
i->io_offset = 3584 with bvec values, bvec->bv_offset = 256,
bvec->bv_len = 3840.  In short, the first 256 bytes are in the current
page, next 256 bytes are in the another page.  Ideally we expect to
fail the IO.


I can think of 2 userspace scenarios where we experience this.

a: From userspace, we observe a different behaviour when device LBA
   size is 512 vs 4096 bytes.  For 4096 LBA formatted device, I see the
   same liburing test [2] failing, whereas 512 the test passes without
   this.  This is reproducible everytime.

   [2] https://github.com/axboe/liburing/

b: Although I was not able to reproduce the below condition, but I
   suspect below case should be possible from user space for devices
   with 512 LBA formatted device.  Lets say from userspace while
   allocating a virtually single chunk of memory, if we get 2 physical
   chunk of memory, and IO happens to be at the boundary of first
   physical chunk with length crossing first chunk, then we allow IOs
   to proceed and hence we might map wrong physical address length and
   proceed with IO rather than failing.

: --- a/test/fixed-seg.c
: +++ b/test/fixed-seg.c
: @@ -64,7 +64,7 @@ static int test(struct io_uring *ring, int fd, int
: vec_off)
: 		return T_EXIT_FAIL;
: 	}
:
: -       ret = read_it(ring, fd, 4096, vec_off);
: +       ret = read_it(ring, fd, 4096, 7*512 + 256);
: 	if (ret) {
: 		fprintf(stderr, "4096 0 failed\n");
: 		return T_EXIT_FAIL;

Effectively this is a write crossing the page boundary.

Link: https://lkml.kernel.org/r/20250428095849.11709-1-nj.shetty@samsung.com
Fixes: 2263639f96f2 ("iov_iter: streamline iovec/bvec alignment iteration")
Reviewed-by: Jens Axboe <axboe@kernel.dk>
Reviewed-by: Anuj Gupta <anuj20.g@samsung.com>
Signed-off-by: Nitesh Shetty <nj.shetty@samsung.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Christian Brauner <brauner@kernel.org>
Cc: Keith Busch <kbusch@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 lib/iov_iter.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lib/iov_iter.c b/lib/iov_iter.c
index 969d4ad510df..f9193f952f49 100644
--- a/lib/iov_iter.c
+++ b/lib/iov_iter.c
@@ -817,7 +817,7 @@ static bool iov_iter_aligned_bvec(const struct iov_iter *i, unsigned addr_mask,
 	size_t size = i->count;
 
 	do {
-		size_t len = bvec->bv_len;
+		size_t len = bvec->bv_len - skip;
 
 		if (len > size)
 			len = size;
-- 
2.51.0


From a2946fb271b1e01f49df35042e338cca1bd28095 Mon Sep 17 00:00:00 2001
From: Alistair Popple <apopple@nvidia.com>
Date: Fri, 30 May 2025 11:49:17 +1000
Subject: [PATCH 12/16] MAINTAINERS: add Alistair as reviewer of mm memory
 policy

I'm particularly familiar with mm/migrate.c and especially
mm/migrate_device.c so add myself to MAINTAINERS.

Link: https://lkml.kernel.org/r/20250530014917.2946940-1-apopple@nvidia.com
Signed-off-by: Alistair Popple <apopple@nvidia.com>
Acked-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Acked-by: Matthew Brost <matthew.brost@intel.com>
Acked-by: David Hildenbrand <david@redhat.com>
Cc: Joshua Hahn <joshua.hahnjy@gmail.com>
Cc: Rakie Kim <rakie.kim@sk.com>
Cc: Zi Yan <ziy@nvidia.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 MAINTAINERS | 1 +
 1 file changed, 1 insertion(+)

diff --git a/MAINTAINERS b/MAINTAINERS
index 98201e1f4ab5..4b4317dd0788 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -15744,6 +15744,7 @@ R:	Rakie Kim <rakie.kim@sk.com>
 R:	Byungchul Park <byungchul@sk.com>
 R:	Gregory Price <gourry@gourry.net>
 R:	Ying Huang <ying.huang@linux.alibaba.com>
+R:	Alistair Popple <apopple@nvidia.com>
 L:	linux-mm@kvack.org
 S:	Maintained
 W:	http://www.linux-mm.org
-- 
2.51.0


From 081056dc00a27bccb55ccc3c6f230a3d5fd3f7e0 Mon Sep 17 00:00:00 2001
From: Jann Horn <jannh@google.com>
Date: Tue, 27 May 2025 23:23:53 +0200
Subject: [PATCH 13/16] mm/hugetlb: unshare page tables during VMA split, not
 before

Currently, __split_vma() triggers hugetlb page table unsharing through
vm_ops->may_split().  This happens before the VMA lock and rmap locks are
taken - which is too early, it allows racing VMA-locked page faults in our
process and racing rmap walks from other processes to cause page tables to
be shared again before we actually perform the split.

Fix it by explicitly calling into the hugetlb unshare logic from
__split_vma() in the same place where THP splitting also happens.  At that
point, both the VMA and the rmap(s) are write-locked.

An annoying detail is that we can now call into the helper
hugetlb_unshare_pmds() from two different locking contexts:

1. from hugetlb_split(), holding:
    - mmap lock (exclusively)
    - VMA lock
    - file rmap lock (exclusively)
2. hugetlb_unshare_all_pmds(), which I think is designed to be able to
   call us with only the mmap lock held (in shared mode), but currently
   only runs while holding mmap lock (exclusively) and VMA lock

Backporting note:
This commit fixes a racy protection that was introduced in commit
b30c14cd6102 ("hugetlb: unshare some PMDs when splitting VMAs"); that
commit claimed to fix an issue introduced in 5.13, but it should actually
also go all the way back.

[jannh@google.com: v2]
  Link: https://lkml.kernel.org/r/20250528-hugetlb-fixes-splitrace-v2-1-1329349bad1a@google.com
Link: https://lkml.kernel.org/r/20250528-hugetlb-fixes-splitrace-v2-0-1329349bad1a@google.com
Link: https://lkml.kernel.org/r/20250527-hugetlb-fixes-splitrace-v1-1-f4136f5ec58a@google.com
Fixes: 39dde65c9940 ("[PATCH] shared page table for hugetlb page")
Signed-off-by: Jann Horn <jannh@google.com>
Cc: Liam Howlett <liam.howlett@oracle.com>
Reviewed-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Reviewed-by: Oscar Salvador <osalvador@suse.de>
Cc: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: <stable@vger.kernel.org>	[b30c14cd6102: hugetlb: unshare some PMDs when splitting VMAs]
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/hugetlb.h          |  3 ++
 mm/hugetlb.c                     | 60 +++++++++++++++++++++++---------
 mm/vma.c                         |  7 ++++
 tools/testing/vma/vma_internal.h |  2 ++
 4 files changed, 56 insertions(+), 16 deletions(-)

diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index 0598f36931de..42f374e828a2 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -279,6 +279,7 @@ bool is_hugetlb_entry_migration(pte_t pte);
 bool is_hugetlb_entry_hwpoisoned(pte_t pte);
 void hugetlb_unshare_all_pmds(struct vm_area_struct *vma);
 void fixup_hugetlb_reservations(struct vm_area_struct *vma);
+void hugetlb_split(struct vm_area_struct *vma, unsigned long addr);
 
 #else /* !CONFIG_HUGETLB_PAGE */
 
@@ -476,6 +477,8 @@ static inline void fixup_hugetlb_reservations(struct vm_area_struct *vma)
 {
 }
 
+static inline void hugetlb_split(struct vm_area_struct *vma, unsigned long addr) {}
+
 #endif /* !CONFIG_HUGETLB_PAGE */
 
 #ifndef pgd_write
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index f0b1d53079f9..7ba020d489d4 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -121,7 +121,7 @@ static void hugetlb_vma_lock_free(struct vm_area_struct *vma);
 static void hugetlb_vma_lock_alloc(struct vm_area_struct *vma);
 static void __hugetlb_vma_unlock_write_free(struct vm_area_struct *vma);
 static void hugetlb_unshare_pmds(struct vm_area_struct *vma,
-		unsigned long start, unsigned long end);
+		unsigned long start, unsigned long end, bool take_locks);
 static struct resv_map *vma_resv_map(struct vm_area_struct *vma);
 
 static void hugetlb_free_folio(struct folio *folio)
@@ -5426,26 +5426,40 @@ static int hugetlb_vm_op_split(struct vm_area_struct *vma, unsigned long addr)
 {
 	if (addr & ~(huge_page_mask(hstate_vma(vma))))
 		return -EINVAL;
+	return 0;
+}
 
+void hugetlb_split(struct vm_area_struct *vma, unsigned long addr)
+{
 	/*
 	 * PMD sharing is only possible for PUD_SIZE-aligned address ranges
 	 * in HugeTLB VMAs. If we will lose PUD_SIZE alignment due to this
 	 * split, unshare PMDs in the PUD_SIZE interval surrounding addr now.
+	 * This function is called in the middle of a VMA split operation, with
+	 * MM, VMA and rmap all write-locked to prevent concurrent page table
+	 * walks (except hardware and gup_fast()).
 	 */
+	vma_assert_write_locked(vma);
+	i_mmap_assert_write_locked(vma->vm_file->f_mapping);
+
 	if (addr & ~PUD_MASK) {
-		/*
-		 * hugetlb_vm_op_split is called right before we attempt to
-		 * split the VMA. We will need to unshare PMDs in the old and
-		 * new VMAs, so let's unshare before we split.
-		 */
 		unsigned long floor = addr & PUD_MASK;
 		unsigned long ceil = floor + PUD_SIZE;
 
-		if (floor >= vma->vm_start && ceil <= vma->vm_end)
-			hugetlb_unshare_pmds(vma, floor, ceil);
+		if (floor >= vma->vm_start && ceil <= vma->vm_end) {
+			/*
+			 * Locking:
+			 * Use take_locks=false here.
+			 * The file rmap lock is already held.
+			 * The hugetlb VMA lock can't be taken when we already
+			 * hold the file rmap lock, and we don't need it because
+			 * its purpose is to synchronize against concurrent page
+			 * table walks, which are not possible thanks to the
+			 * locks held by our caller.
+			 */
+			hugetlb_unshare_pmds(vma, floor, ceil, /* take_locks = */ false);
+		}
 	}
-
-	return 0;
 }
 
 static unsigned long hugetlb_vm_op_pagesize(struct vm_area_struct *vma)
@@ -7885,9 +7899,16 @@ void move_hugetlb_state(struct folio *old_folio, struct folio *new_folio, int re
 	spin_unlock_irq(&hugetlb_lock);
 }
 
+/*
+ * If @take_locks is false, the caller must ensure that no concurrent page table
+ * access can happen (except for gup_fast() and hardware page walks).
+ * If @take_locks is true, we take the hugetlb VMA lock (to lock out things like
+ * concurrent page fault handling) and the file rmap lock.
+ */
 static void hugetlb_unshare_pmds(struct vm_area_struct *vma,
 				   unsigned long start,
-				   unsigned long end)
+				   unsigned long end,
+				   bool take_locks)
 {
 	struct hstate *h = hstate_vma(vma);
 	unsigned long sz = huge_page_size(h);
@@ -7911,8 +7932,12 @@ static void hugetlb_unshare_pmds(struct vm_area_struct *vma,
 	mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm,
 				start, end);
 	mmu_notifier_invalidate_range_start(&range);
-	hugetlb_vma_lock_write(vma);
-	i_mmap_lock_write(vma->vm_file->f_mapping);
+	if (take_locks) {
+		hugetlb_vma_lock_write(vma);
+		i_mmap_lock_write(vma->vm_file->f_mapping);
+	} else {
+		i_mmap_assert_write_locked(vma->vm_file->f_mapping);
+	}
 	for (address = start; address < end; address += PUD_SIZE) {
 		ptep = hugetlb_walk(vma, address, sz);
 		if (!ptep)
@@ -7922,8 +7947,10 @@ static void hugetlb_unshare_pmds(struct vm_area_struct *vma,
 		spin_unlock(ptl);
 	}
 	flush_hugetlb_tlb_range(vma, start, end);
-	i_mmap_unlock_write(vma->vm_file->f_mapping);
-	hugetlb_vma_unlock_write(vma);
+	if (take_locks) {
+		i_mmap_unlock_write(vma->vm_file->f_mapping);
+		hugetlb_vma_unlock_write(vma);
+	}
 	/*
 	 * No need to call mmu_notifier_arch_invalidate_secondary_tlbs(), see
 	 * Documentation/mm/mmu_notifier.rst.
@@ -7938,7 +7965,8 @@ static void hugetlb_unshare_pmds(struct vm_area_struct *vma,
 void hugetlb_unshare_all_pmds(struct vm_area_struct *vma)
 {
 	hugetlb_unshare_pmds(vma, ALIGN(vma->vm_start, PUD_SIZE),
-			ALIGN_DOWN(vma->vm_end, PUD_SIZE));
+			ALIGN_DOWN(vma->vm_end, PUD_SIZE),
+			/* take_locks = */ true);
 }
 
 /*
diff --git a/mm/vma.c b/mm/vma.c
index 1c6595f282e5..7ebc9eb608f4 100644
--- a/mm/vma.c
+++ b/mm/vma.c
@@ -539,7 +539,14 @@ __split_vma(struct vma_iterator *vmi, struct vm_area_struct *vma,
 	init_vma_prep(&vp, vma);
 	vp.insert = new;
 	vma_prepare(&vp);
+
+	/*
+	 * Get rid of huge pages and shared page tables straddling the split
+	 * boundary.
+	 */
 	vma_adjust_trans_huge(vma, vma->vm_start, addr, NULL);
+	if (is_vm_hugetlb_page(vma))
+		hugetlb_split(vma, addr);
 
 	if (new_below) {
 		vma->vm_start = addr;
diff --git a/tools/testing/vma/vma_internal.h b/tools/testing/vma/vma_internal.h
index 441feb21aa5a..4505b1c31be1 100644
--- a/tools/testing/vma/vma_internal.h
+++ b/tools/testing/vma/vma_internal.h
@@ -932,6 +932,8 @@ static inline void vma_adjust_trans_huge(struct vm_area_struct *vma,
 	(void)next;
 }
 
+static inline void hugetlb_split(struct vm_area_struct *, unsigned long) {}
+
 static inline void vma_iter_free(struct vma_iterator *vmi)
 {
 	mas_destroy(&vmi->mas);
-- 
2.51.0


From 1013af4f585fccc4d3e5c5824d174de2257f7d6d Mon Sep 17 00:00:00 2001
From: Jann Horn <jannh@google.com>
Date: Tue, 27 May 2025 23:23:54 +0200
Subject: [PATCH 14/16] mm/hugetlb: fix huge_pmd_unshare() vs GUP-fast race

huge_pmd_unshare() drops a reference on a page table that may have
previously been shared across processes, potentially turning it into a
normal page table used in another process in which unrelated VMAs can
afterwards be installed.

If this happens in the middle of a concurrent gup_fast(), gup_fast() could
end up walking the page tables of another process.  While I don't see any
way in which that immediately leads to kernel memory corruption, it is
really weird and unexpected.

Fix it with an explicit broadcast IPI through tlb_remove_table_sync_one(),
just like we do in khugepaged when removing page tables for a THP
collapse.

Link: https://lkml.kernel.org/r/20250528-hugetlb-fixes-splitrace-v2-2-1329349bad1a@google.com
Link: https://lkml.kernel.org/r/20250527-hugetlb-fixes-splitrace-v1-2-f4136f5ec58a@google.com
Fixes: 39dde65c9940 ("[PATCH] shared page table for hugetlb page")
Signed-off-by: Jann Horn <jannh@google.com>
Reviewed-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Cc: Liam Howlett <liam.howlett@oracle.com>
Cc: Muchun Song <muchun.song@linux.dev>
Cc: Oscar Salvador <osalvador@suse.de>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 mm/hugetlb.c | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 7ba020d489d4..8746ed2fec13 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -7629,6 +7629,13 @@ int huge_pmd_unshare(struct mm_struct *mm, struct vm_area_struct *vma,
 		return 0;
 
 	pud_clear(pud);
+	/*
+	 * Once our caller drops the rmap lock, some other process might be
+	 * using this page table as a normal, non-hugetlb page table.
+	 * Wait for pending gup_fast() in other threads to finish before letting
+	 * that happen.
+	 */
+	tlb_remove_table_sync_one();
 	ptdesc_pmd_pts_dec(virt_to_ptdesc(ptep));
 	mm_dec_nr_pmds(mm);
 	return 1;
-- 
2.51.0


From 1b11a28308923a67fc76b8f7be06e1b2d80d07ca Mon Sep 17 00:00:00 2001
From: Tal Zussman <tz2294@columbia.edu>
Date: Tue, 3 Jun 2025 18:50:17 -0400
Subject: [PATCH 15/16] MAINTAINERS: add tlb trace events to MMU GATHER AND TLB
 INVALIDATION

The MMU GATHER AND TLB INVALIDATION entry lists other TLB-related files.
Add the tlb.h tracepoint file there as well.

Link: https://lore.kernel.org/linux-mm/ce048e11-f79d-44a6-bacc-46e1ebc34b24@redhat.com/
Link: https://lkml.kernel.org/r/20250603-tlb-maintainers-v1-1-726d193c6693@columbia.edu
Signed-off-by: Tal Zussman <tz2294@columbia.edu>
Suggested-by: David Hildenbrand <david@redhat.com>
Acked-by: David Hildenbrand <david@redhat.com>
Cc: "Aneesh Kumar K.V" <aneesh.kumar@kernel.org>
Cc: Nicholas Piggin <npiggin@gmail.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Will Deacon <will@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 MAINTAINERS | 1 +
 1 file changed, 1 insertion(+)

diff --git a/MAINTAINERS b/MAINTAINERS
index 4b4317dd0788..6b0b5f55ab72 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -16654,6 +16654,7 @@ L:	linux-mm@kvack.org
 S:	Maintained
 F:	arch/*/include/asm/tlb.h
 F:	include/asm-generic/tlb.h
+F:	include/trace/events/tlb.h
 F:	mm/mmu_gather.c
 
 MN88472 MEDIA DRIVER
-- 
2.51.0


From ffa74d44f4b95a97b36a038c66f53e4edeaffb99 Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Tue, 3 Jun 2025 09:53:07 +0200
Subject: [PATCH 16/16] kmsan: test: add module description

Every module should have a description, and kbuild now warns for those
that don't.

WARNING: modpost: missing MODULE_DESCRIPTION() in mm/kmsan/kmsan_test.o

Link: https://lkml.kernel.org/r/20250603075323.1839608-1-arnd@kernel.org
Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Reviewed-by: Alexander Potapenko <glider@google.com>
Cc: Andrey Konovalov <andreyknvl@gmail.com>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Dmitriy Vyukov <dvyukov@google.com>
Cc: Macro Elver <elver@google.com>
Cc: Sabyrzhan Tasbolatov <snovitoll@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 mm/kmsan/kmsan_test.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/mm/kmsan/kmsan_test.c b/mm/kmsan/kmsan_test.c
index 9733a22c46c1..c6c5b2bbede0 100644
--- a/mm/kmsan/kmsan_test.c
+++ b/mm/kmsan/kmsan_test.c
@@ -732,3 +732,4 @@ kunit_test_suites(&kmsan_test_suite);
 
 MODULE_LICENSE("GPL");
 MODULE_AUTHOR("Alexander Potapenko <glider@google.com>");
+MODULE_DESCRIPTION("Test cases for KMSAN");
-- 
2.51.0