From f8b136ef2605c1bf62020462d10e35228760aa19 Mon Sep 17 00:00:00 2001 From: Zhang Rui Date: Wed, 19 Mar 2025 08:53:07 +0800 Subject: [PATCH 01/16] tools/power turbostat: Restore GFX sysfs fflush() call Do fflush() to discard the buffered data, before each read of the graphics sysfs knobs. Fixes: ba99a4fc8c24 ("tools/power turbostat: Remove unnecessary fflush() call") Signed-off-by: Zhang Rui Signed-off-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index 70e17d4ad9b6..c9a34c16c7a8 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -6039,6 +6039,7 @@ int snapshot_graphics(int idx) int retval; rewind(gfx_info[idx].fp); + fflush(gfx_info[idx].fp); switch (idx) { case GFX_rc6: -- 2.51.0 From 994633894f208a0151baaee1688ab3c431912553 Mon Sep 17 00:00:00 2001 From: Len Brown Date: Sun, 6 Apr 2025 12:53:18 -0400 Subject: [PATCH 02/16] tools/power turbostat: re-factor sysfs code Probe cpuidle "sysfs" residency and counts separately, since soon we will make one disabled on, and the other disabled off. Clarify that some BIC (build-in-counters) are actually "groups". since we're about to re-name some of those groups. no functional change. Signed-off-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 31 ++++++++++++++++++--------- 1 file changed, 21 insertions(+), 10 deletions(-) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index c9a34c16c7a8..df0391bedcde 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -273,10 +273,10 @@ struct msr_counter bic[] = { #define BIC_NMI (1ULL << 61) #define BIC_CPU_c1e (1ULL << 62) -#define BIC_TOPOLOGY (BIC_Package | BIC_Node | BIC_CoreCnt | BIC_PkgCnt | BIC_Core | BIC_CPU | BIC_Die) -#define BIC_THERMAL_PWR (BIC_CoreTmp | BIC_PkgTmp | BIC_PkgWatt | BIC_CorWatt | BIC_GFXWatt | BIC_RAMWatt | BIC_PKG__ | BIC_RAM__ | BIC_SysWatt) -#define BIC_FREQUENCY (BIC_Avg_MHz | BIC_Busy | BIC_Bzy_MHz | BIC_TSC_MHz | BIC_GFXMHz | BIC_GFXACTMHz | BIC_SAMMHz | BIC_SAMACTMHz | BIC_UNCORE_MHZ) -#define BIC_IDLE (BIC_Busy | BIC_sysfs | BIC_CPU_c1 | BIC_CPU_c3 | BIC_CPU_c6 | BIC_CPU_c7 | BIC_GFX_rc6 | BIC_Pkgpc2 | BIC_Pkgpc3 | BIC_Pkgpc6 | BIC_Pkgpc7 | BIC_Pkgpc8 | BIC_Pkgpc9 | BIC_Pkgpc10 | BIC_CPU_LPI | BIC_SYS_LPI | BIC_Mod_c6 | BIC_Totl_c0 | BIC_Any_c0 | BIC_GFX_c0 | BIC_CPUGFX | BIC_SAM_mc6 | BIC_Diec6) +#define BIC_GROUP_TOPOLOGY (BIC_Package | BIC_Node | BIC_CoreCnt | BIC_PkgCnt | BIC_Core | BIC_CPU | BIC_Die) +#define BIC_GROUP_THERMAL_PWR (BIC_CoreTmp | BIC_PkgTmp | BIC_PkgWatt | BIC_CorWatt | BIC_GFXWatt | BIC_RAMWatt | BIC_PKG__ | BIC_RAM__ | BIC_SysWatt) +#define BIC_GROUP_FREQUENCY (BIC_Avg_MHz | BIC_Busy | BIC_Bzy_MHz | BIC_TSC_MHz | BIC_GFXMHz | BIC_GFXACTMHz | BIC_SAMMHz | BIC_SAMACTMHz | BIC_UNCORE_MHZ) +#define BIC_GROUP_IDLE (BIC_Busy | BIC_sysfs | BIC_CPU_c1 | BIC_CPU_c3 | BIC_CPU_c6 | BIC_CPU_c7 | BIC_GFX_rc6 | BIC_Pkgpc2 | BIC_Pkgpc3 | BIC_Pkgpc6 | BIC_Pkgpc7 | BIC_Pkgpc8 | BIC_Pkgpc9 | BIC_Pkgpc10 | BIC_CPU_LPI | BIC_SYS_LPI | BIC_Mod_c6 | BIC_Totl_c0 | BIC_Any_c0 | BIC_GFX_c0 | BIC_CPUGFX | BIC_SAM_mc6 | BIC_Diec6) #define BIC_OTHER (BIC_IRQ | BIC_NMI | BIC_SMI | BIC_ThreadC | BIC_CoreTmp | BIC_IPC) #define BIC_DISABLED_BY_DEFAULT (BIC_USEC | BIC_TOD | BIC_APIC | BIC_X2APIC) @@ -2354,16 +2354,16 @@ unsigned long long bic_lookup(char *name_list, enum show_hide_mode mode) retval |= ~0; break; } else if (!strcmp(name_list, "topology")) { - retval |= BIC_TOPOLOGY; + retval |= BIC_GROUP_TOPOLOGY; break; } else if (!strcmp(name_list, "power")) { - retval |= BIC_THERMAL_PWR; + retval |= BIC_GROUP_THERMAL_PWR; break; } else if (!strcmp(name_list, "idle")) { - retval |= BIC_IDLE; + retval |= BIC_GROUP_IDLE; break; } else if (!strcmp(name_list, "frequency")) { - retval |= BIC_FREQUENCY; + retval |= BIC_GROUP_FREQUENCY; break; } else if (!strcmp(name_list, "other")) { retval |= BIC_OTHER; @@ -10260,7 +10260,7 @@ int is_deferred_skip(char *name) return 0; } -void probe_sysfs(void) +void probe_cpuidle_residency(void) { char path[64]; char name_buf[16]; @@ -10304,6 +10304,16 @@ void probe_sysfs(void) if (state < min_state) min_state = state; } +} + +void probe_cpuidle_counts(void) +{ + char path[64]; + char name_buf[16]; + FILE *input; + int state; + int min_state = 1024, max_state = 0; + char *sp; for (state = 10; state >= 0; --state) { @@ -10602,7 +10612,8 @@ skip_cgroup_setting: print_bootcmd(); } - probe_sysfs(); + probe_cpuidle_residency(); + probe_cpuidle_counts(); if (!getuid()) set_rlimit(); -- 2.51.0 From 6f110a5e4f9977c31ce76fefbfef6fd4eab6bfb7 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sun, 6 Apr 2025 10:00:04 -0700 Subject: [PATCH 03/16] Disable SLUB_TINY for build testing ... and don't error out so hard on missing module descriptions. Before commit 6c6c1fc09de3 ("modpost: require a MODULE_DESCRIPTION()") we used to warn about missing module descriptions, but only when building with extra warnigns (ie 'W=1'). After that commit the warning became an unconditional hard error. And it turns out not all modules have been converted despite the claims to the contrary. As reported by Damian Tometzki, the slub KUnit test didn't have a module description, and apparently nobody ever really noticed. The reason nobody noticed seems to be that the slub KUnit tests get disabled by SLUB_TINY, which also ends up disabling a lot of other code, both in tests and in slub itself. And so anybody doing full build tests didn't actually see this failre. So let's disable SLUB_TINY for build-only tests, since it clearly ends up limiting build coverage. Also turn the missing module descriptions error back into a warning, but let's keep it around for non-'W=1' builds. Reported-by: Damian Tometzki Link: https://lore.kernel.org/all/01070196099fd059-e8463438-7b1b-4ec8-816d-173874be9966-000000@eu-central-1.amazonses.com/ Cc: Masahiro Yamada Cc: Jeff Johnson Fixes: 6c6c1fc09de3 ("modpost: require a MODULE_DESCRIPTION()") Signed-off-by: Linus Torvalds --- mm/Kconfig | 2 +- scripts/mod/modpost.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/mm/Kconfig b/mm/Kconfig index d3fb3762887b..e113f713b493 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -201,7 +201,7 @@ config KVFREE_RCU_BATCHED config SLUB_TINY bool "Configure for minimal memory footprint" - depends on EXPERT + depends on EXPERT && !COMPILE_TEST select SLAB_MERGE_DEFAULT help Configures the slab allocator in a way to achieve minimal memory diff --git a/scripts/mod/modpost.c b/scripts/mod/modpost.c index 92627e8d0e16..be89921d60b6 100644 --- a/scripts/mod/modpost.c +++ b/scripts/mod/modpost.c @@ -1603,7 +1603,7 @@ static void read_symbols(const char *modname) } if (!get_modinfo(&info, "description")) - error("missing MODULE_DESCRIPTION() in %s\n", modname); + warn("missing MODULE_DESCRIPTION() in %s\n", modname); } for (sym = info.symtab_start; sym < info.symtab_stop; sym++) { -- 2.51.0 From ec4acd3166d8a7a03b059d01b9c6f11a658e833f Mon Sep 17 00:00:00 2001 From: Len Brown Date: Sun, 6 Apr 2025 14:29:57 -0400 Subject: [PATCH 04/16] tools/power turbostat: disable "cpuidle" invocation counters, by default Create "pct_idle" counter group, the sofware notion of residency so it can now be singled out, independent of other counter groups. Create "cpuidle" group, the cpuidle invocation counts. Disable "cpuidle", by default. Create "swidle" = "cpuidle" + "pct_idle". Undocument "sysfs", the old name for "swidle", but keep it working for backwards compatibilty. Create "hwidle", all the HW idle counters Modify "idle", enabled by default "idle" = "hwidle" + "pct_idle" (and now excludes "cpuidle") Signed-off-by: Len Brown --- tools/power/x86/turbostat/turbostat.8 | 12 +++++----- tools/power/x86/turbostat/turbostat.c | 34 +++++++++++++++++++++------ 2 files changed, 33 insertions(+), 13 deletions(-) diff --git a/tools/power/x86/turbostat/turbostat.8 b/tools/power/x86/turbostat/turbostat.8 index e86493880c16..b74ed916057e 100644 --- a/tools/power/x86/turbostat/turbostat.8 +++ b/tools/power/x86/turbostat/turbostat.8 @@ -100,7 +100,7 @@ The column name "all" can be used to enable all disabled-by-default built-in cou .PP \fB--show column\fP show only the specified built-in columns. May be invoked multiple times, or with a comma-separated list of column names. .PP -\fB--show CATEGORY --hide CATEGORY\fP Show and hide also accept a single CATEGORY of columns: "all", "topology", "idle", "frequency", "power", "sysfs", "other". +\fB--show CATEGORY --hide CATEGORY\fP Show and hide also accept a single CATEGORY of columns: "all", "topology", "idle", "frequency", "power", "cpuidle", "hwidle", "swidle", "other". "idle" (enabled by default), includes "hwidle" and "idle_pct". "cpuidle" (default disabled) includes cpuidle software invocation counters. "swidle" includes "cpuidle" plus "idle_pct". "hwidle" includes only hardware based idle residency counters. Older versions of turbostat used the term "sysfs" for what is now "swidle". .PP \fB--Dump\fP displays the raw counter values. .PP @@ -158,15 +158,15 @@ The system configuration dump (if --quiet is not used) is followed by statistics .PP \fBSMI\fP The number of System Management Interrupts serviced CPU during the measurement interval. While this counter is actually per-CPU, SMI are triggered on all processors, so the number should be the same for all CPUs. .PP -\fBC1, C2, C3...\fP The number times Linux requested the C1, C2, C3 idle state during the measurement interval. The system summary line shows the sum for all CPUs. These are C-state names as exported in /sys/devices/system/cpu/cpu*/cpuidle/state*/name. While their names are generic, their attributes are processor specific. They the system description section of output shows what MWAIT sub-states they are mapped to on each system. +\fBC1, C2, C3...\fP The number times Linux requested the C1, C2, C3 idle state during the measurement interval. The system summary line shows the sum for all CPUs. These are C-state names as exported in /sys/devices/system/cpu/cpu*/cpuidle/state*/name. While their names are generic, their attributes are processor specific. They the system description section of output shows what MWAIT sub-states they are mapped to on each system. These counters are in the "cpuidle" group, which is disabled, by default. .PP -\fBC1+, C2+, C3+...\fP The idle governor idle state misprediction statistics. Inidcates the number times Linux requested the C1, C2, C3 idle state during the measurement interval, but should have requested a deeper idle state (if it exists and enabled). These statistics come from the /sys/devices/system/cpu/cpu*/cpuidle/state*/below file. +\fBC1+, C2+, C3+...\fP The idle governor idle state misprediction statistics. Inidcates the number times Linux requested the C1, C2, C3 idle state during the measurement interval, but should have requested a deeper idle state (if it exists and enabled). These statistics come from the /sys/devices/system/cpu/cpu*/cpuidle/state*/below file. These counters are in the "cpuidle" group, which is disabled, by default. .PP -\fBC1-, C2-, C3-...\fP The idle governor idle state misprediction statistics. Inidcates the number times Linux requested the C1, C2, C3 idle state during the measurement interval, but should have requested a shallower idle state (if it exists and enabled). These statistics come from the /sys/devices/system/cpu/cpu*/cpuidle/state*/above file. +\fBC1-, C2-, C3-...\fP The idle governor idle state misprediction statistics. Inidcates the number times Linux requested the C1, C2, C3 idle state during the measurement interval, but should have requested a shallower idle state (if it exists and enabled). These statistics come from the /sys/devices/system/cpu/cpu*/cpuidle/state*/above file. These counters are in the "cpuidle" group, which is disabled, by default. .PP -\fBC1%, C2%, C3%\fP The residency percentage that Linux requested C1, C2, C3.... The system summary is the average of all CPUs in the system. Note that these are software, reflecting what was requested. The hardware counters reflect what was actually achieved. +\fBC1%, C2%, C3%\fP The residency percentage that Linux requested C1, C2, C3.... The system summary is the average of all CPUs in the system. Note that these are software, reflecting what was requested. The hardware counters reflect what was actually achieved. These counters are in the "pct_idle" group, which is enabled by default. .PP -\fBCPU%c1, CPU%c3, CPU%c6, CPU%c7\fP show the percentage residency in hardware core idle states. These numbers are from hardware residency counters. +\fBCPU%c1, CPU%c3, CPU%c6, CPU%c7\fP show the percentage residency in hardware core idle states. These numbers are from hardware residency counters and are in the "hwidle" group, which is enabled, by default. .PP \fBCoreTmp\fP Degrees Celsius reported by the per-core Digital Thermal Sensor. .PP diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index df0391bedcde..ab184f95cdaf 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -153,7 +153,7 @@ struct msr_counter bic[] = { { 0x0, "TSC_MHz", NULL, 0, 0, 0, NULL, 0 }, { 0x0, "IRQ", NULL, 0, 0, 0, NULL, 0 }, { 0x0, "SMI", NULL, 32, 0, FORMAT_DELTA, NULL, 0 }, - { 0x0, "sysfs", NULL, 0, 0, 0, NULL, 0 }, + { 0x0, "cpuidle", NULL, 0, 0, 0, NULL, 0 }, { 0x0, "CPU%c1", NULL, 0, 0, 0, NULL, 0 }, { 0x0, "CPU%c3", NULL, 0, 0, 0, NULL, 0 }, { 0x0, "CPU%c6", NULL, 0, 0, 0, NULL, 0 }, @@ -206,6 +206,7 @@ struct msr_counter bic[] = { { 0x0, "Sys_J", NULL, 0, 0, 0, NULL, 0 }, { 0x0, "NMI", NULL, 0, 0, 0, NULL, 0 }, { 0x0, "CPU%c1e", NULL, 0, 0, 0, NULL, 0 }, + { 0x0, "pct_idle", NULL, 0, 0, 0, NULL, 0 }, }; #define MAX_BIC (sizeof(bic) / sizeof(struct msr_counter)) @@ -219,7 +220,7 @@ struct msr_counter bic[] = { #define BIC_TSC_MHz (1ULL << 7) #define BIC_IRQ (1ULL << 8) #define BIC_SMI (1ULL << 9) -#define BIC_sysfs (1ULL << 10) +#define BIC_cpuidle (1ULL << 10) #define BIC_CPU_c1 (1ULL << 11) #define BIC_CPU_c3 (1ULL << 12) #define BIC_CPU_c6 (1ULL << 13) @@ -272,17 +273,20 @@ struct msr_counter bic[] = { #define BIC_Sys_J (1ULL << 60) #define BIC_NMI (1ULL << 61) #define BIC_CPU_c1e (1ULL << 62) +#define BIC_pct_idle (1ULL << 63) #define BIC_GROUP_TOPOLOGY (BIC_Package | BIC_Node | BIC_CoreCnt | BIC_PkgCnt | BIC_Core | BIC_CPU | BIC_Die) #define BIC_GROUP_THERMAL_PWR (BIC_CoreTmp | BIC_PkgTmp | BIC_PkgWatt | BIC_CorWatt | BIC_GFXWatt | BIC_RAMWatt | BIC_PKG__ | BIC_RAM__ | BIC_SysWatt) #define BIC_GROUP_FREQUENCY (BIC_Avg_MHz | BIC_Busy | BIC_Bzy_MHz | BIC_TSC_MHz | BIC_GFXMHz | BIC_GFXACTMHz | BIC_SAMMHz | BIC_SAMACTMHz | BIC_UNCORE_MHZ) -#define BIC_GROUP_IDLE (BIC_Busy | BIC_sysfs | BIC_CPU_c1 | BIC_CPU_c3 | BIC_CPU_c6 | BIC_CPU_c7 | BIC_GFX_rc6 | BIC_Pkgpc2 | BIC_Pkgpc3 | BIC_Pkgpc6 | BIC_Pkgpc7 | BIC_Pkgpc8 | BIC_Pkgpc9 | BIC_Pkgpc10 | BIC_CPU_LPI | BIC_SYS_LPI | BIC_Mod_c6 | BIC_Totl_c0 | BIC_Any_c0 | BIC_GFX_c0 | BIC_CPUGFX | BIC_SAM_mc6 | BIC_Diec6) +#define BIC_GROUP_HW_IDLE (BIC_Busy | BIC_CPU_c1 | BIC_CPU_c3 | BIC_CPU_c6 | BIC_CPU_c7 | BIC_GFX_rc6 | BIC_Pkgpc2 | BIC_Pkgpc3 | BIC_Pkgpc6 | BIC_Pkgpc7 | BIC_Pkgpc8 | BIC_Pkgpc9 | BIC_Pkgpc10 | BIC_CPU_LPI | BIC_SYS_LPI | BIC_Mod_c6 | BIC_Totl_c0 | BIC_Any_c0 | BIC_GFX_c0 | BIC_CPUGFX | BIC_SAM_mc6 | BIC_Diec6) +#define BIC_GROUP_SW_IDLE (BIC_Busy | BIC_cpuidle | BIC_pct_idle ) +#define BIC_GROUP_IDLE (BIC_GROUP_HW_IDLE | BIC_pct_idle) #define BIC_OTHER (BIC_IRQ | BIC_NMI | BIC_SMI | BIC_ThreadC | BIC_CoreTmp | BIC_IPC) -#define BIC_DISABLED_BY_DEFAULT (BIC_USEC | BIC_TOD | BIC_APIC | BIC_X2APIC) +#define BIC_DISABLED_BY_DEFAULT (BIC_USEC | BIC_TOD | BIC_APIC | BIC_X2APIC | BIC_cpuidle) unsigned long long bic_enabled = (0xFFFFFFFFFFFFFFFFULL & ~BIC_DISABLED_BY_DEFAULT); -unsigned long long bic_present = BIC_USEC | BIC_TOD | BIC_sysfs | BIC_APIC | BIC_X2APIC; +unsigned long long bic_present = BIC_USEC | BIC_TOD | BIC_cpuidle | BIC_pct_idle | BIC_APIC | BIC_X2APIC; #define DO_BIC(COUNTER_NAME) (bic_enabled & bic_present & COUNTER_NAME) #define DO_BIC_READ(COUNTER_NAME) (bic_present & COUNTER_NAME) @@ -2362,6 +2366,15 @@ unsigned long long bic_lookup(char *name_list, enum show_hide_mode mode) } else if (!strcmp(name_list, "idle")) { retval |= BIC_GROUP_IDLE; break; + } else if (!strcmp(name_list, "swidle")) { + retval |= BIC_GROUP_SW_IDLE; + break; + } else if (!strcmp(name_list, "sysfs")) { /* legacy compatibility */ + retval |= BIC_GROUP_SW_IDLE; + break; + } else if (!strcmp(name_list, "hwidle")) { + retval |= BIC_GROUP_HW_IDLE; + break; } else if (!strcmp(name_list, "frequency")) { retval |= BIC_GROUP_FREQUENCY; break; @@ -2372,6 +2385,7 @@ unsigned long long bic_lookup(char *name_list, enum show_hide_mode mode) } if (i == MAX_BIC) { + fprintf(stderr, "deferred %s\n", name_list); if (mode == SHOW_LIST) { deferred_add_names[deferred_add_index++] = name_list; if (deferred_add_index >= MAX_DEFERRED) { @@ -10269,6 +10283,9 @@ void probe_cpuidle_residency(void) int min_state = 1024, max_state = 0; char *sp; + if (!DO_BIC(BIC_pct_idle)) + return; + for (state = 10; state >= 0; --state) { sprintf(path, "/sys/devices/system/cpu/cpu%d/cpuidle/state%d/name", base_cpu, state); @@ -10291,7 +10308,7 @@ void probe_cpuidle_residency(void) sprintf(path, "cpuidle/state%d/time", state); - if (!DO_BIC(BIC_sysfs) && !is_deferred_add(name_buf)) + if (!DO_BIC(BIC_pct_idle) && !is_deferred_add(name_buf)) continue; if (is_deferred_skip(name_buf)) @@ -10315,6 +10332,9 @@ void probe_cpuidle_counts(void) int min_state = 1024, max_state = 0; char *sp; + if (!DO_BIC(BIC_cpuidle)) + return; + for (state = 10; state >= 0; --state) { sprintf(path, "/sys/devices/system/cpu/cpu%d/cpuidle/state%d/name", base_cpu, state); @@ -10327,7 +10347,7 @@ void probe_cpuidle_counts(void) remove_underbar(name_buf); - if (!DO_BIC(BIC_sysfs) && !is_deferred_add(name_buf)) + if (!DO_BIC(BIC_cpuidle) && !is_deferred_add(name_buf)) continue; if (is_deferred_skip(name_buf)) -- 2.51.0 From 03e00e373cab981ad808271b2650700cfa0fbda6 Mon Sep 17 00:00:00 2001 From: Len Brown Date: Sun, 6 Apr 2025 14:49:20 -0400 Subject: [PATCH 05/16] tools/power turbostat: v2025.05.06 Support up to 8192 processors Add cpuidle governor debug telemetry, disabled by default Update default output to exclude cpuidle invocation counts Bug fixes Signed-off-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index ab184f95cdaf..1b9fdc1a7ee8 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -9594,7 +9594,7 @@ int get_and_dump_counters(void) void print_version() { - fprintf(outf, "turbostat version 2025.02.02 - Len Brown \n"); + fprintf(outf, "turbostat version 2025.04.06 - Len Brown \n"); } #define COMMAND_LINE_SIZE 2048 -- 2.51.0 From 0efdedb3358aa78102967f242379686f94315830 Mon Sep 17 00:00:00 2001 From: =?utf8?q?Thomas=20Wei=C3=9Fschuh?= Date: Wed, 2 Apr 2025 21:21:57 +0100 Subject: [PATCH 06/16] tools/include: make uapi/linux/types.h usable from assembly MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit The "real" linux/types.h UAPI header gracefully degrades to a NOOP when included from assembly code. Mirror this behaviour in the tools/ variant. Test for __ASSEMBLER__ over __ASSEMBLY__ as the former is provided by the toolchain automatically. Reported-by: Mark Brown Closes: https://lore.kernel.org/lkml/af553c62-ca2f-4956-932c-dd6e3a126f58@sirena.org.uk/ Fixes: c9fbaa879508 ("selftests: vDSO: parse_vdso: Use UAPI headers instead of libc headers") Signed-off-by: Thomas Weißschuh Link: https://patch.msgid.link/20250321-uapi-consistency-v1-1-439070118dc0@linutronix.de Signed-off-by: Mark Brown Reviewed-by: Mark Brown Signed-off-by: Linus Torvalds --- tools/include/uapi/linux/types.h | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tools/include/uapi/linux/types.h b/tools/include/uapi/linux/types.h index 91fa51a9c31d..85aa327245c6 100644 --- a/tools/include/uapi/linux/types.h +++ b/tools/include/uapi/linux/types.h @@ -4,6 +4,8 @@ #include +#ifndef __ASSEMBLER__ + /* copied from linux:include/uapi/linux/types.h */ #define __bitwise typedef __u16 __bitwise __le16; @@ -20,4 +22,5 @@ typedef __u32 __bitwise __wsum; #define __aligned_be64 __be64 __attribute__((aligned(8))) #define __aligned_le64 __le64 __attribute__((aligned(8))) +#endif /* __ASSEMBLER__ */ #endif /* _UAPI_LINUX_TYPES_H */ -- 2.51.0 From 0af2f6be1b4281385b618cb86ad946eded089ac8 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sun, 6 Apr 2025 13:11:33 -0700 Subject: [PATCH 07/16] Linux 6.15-rc1 --- Makefile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Makefile b/Makefile index e55726a71d95..38689a0c3605 100644 --- a/Makefile +++ b/Makefile @@ -1,8 +1,8 @@ # SPDX-License-Identifier: GPL-2.0 VERSION = 6 -PATCHLEVEL = 14 +PATCHLEVEL = 15 SUBLEVEL = 0 -EXTRAVERSION = +EXTRAVERSION = -rc1 NAME = Baby Opossum Posse # *DOCUMENTATION* -- 2.51.0 From eb3a04a8516ee9b5174379306f94279fc90424c4 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Mon, 10 Feb 2025 15:11:22 +0100 Subject: [PATCH 08/16] ovl: don't allow datadir only In theory overlayfs could support upper layer directly referring to a data layer, but there's no current use case for this. Originally, when data-only layers were introduced, this wasn't allowed, only introduced by the "datadir+" feature, but without actually handling this case, resulting in an Oops. Fix by disallowing datadir without lowerdir. Reported-by: Giuseppe Scrivano Fixes: 24e16e385f22 ("ovl: add support for appending lowerdirs one by one") Cc: # v6.7 Reviewed-by: Amir Goldstein Reviewed-by: Alexander Larsson Reviewed-by: Christian Brauner Signed-off-by: Miklos Szeredi --- fs/overlayfs/super.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c index b63474d1b064..e19940d649ca 100644 --- a/fs/overlayfs/super.c +++ b/fs/overlayfs/super.c @@ -1138,6 +1138,11 @@ static struct ovl_entry *ovl_get_lowerstack(struct super_block *sb, return ERR_PTR(-EINVAL); } + if (ctx->nr == ctx->nr_data) { + pr_err("at least one non-data lowerdir is required\n"); + return ERR_PTR(-EINVAL); + } + err = -EINVAL; for (i = 0; i < ctx->nr; i++) { l = &ctx->lower[i]; -- 2.51.0 From a6eb9a4a69cc360b930dad9dc8513f8fd9b3577f Mon Sep 17 00:00:00 2001 From: Giuseppe Scrivano Date: Mon, 10 Feb 2025 13:07:55 +0100 Subject: [PATCH 09/16] ovl: remove unused forward declaration The ovl_get_verity_xattr() function was never added, only its declaration. Signed-off-by: Giuseppe Scrivano Fixes: 184996e92e86 ("ovl: Validate verity xattr when resolving lowerdata") Reviewed-by: Amir Goldstein Reviewed-by: Alexander Larsson Reviewed-by: Christian Brauner Signed-off-by: Miklos Szeredi --- fs/overlayfs/overlayfs.h | 2 -- 1 file changed, 2 deletions(-) diff --git a/fs/overlayfs/overlayfs.h b/fs/overlayfs/overlayfs.h index 6f2f8f4cfbbc..aef942a758ce 100644 --- a/fs/overlayfs/overlayfs.h +++ b/fs/overlayfs/overlayfs.h @@ -541,8 +541,6 @@ int ovl_set_metacopy_xattr(struct ovl_fs *ofs, struct dentry *d, bool ovl_is_metacopy_dentry(struct dentry *dentry); char *ovl_get_redirect_xattr(struct ovl_fs *ofs, const struct path *path, int padding); int ovl_ensure_verity_loaded(struct path *path); -int ovl_get_verity_xattr(struct ovl_fs *ofs, const struct path *path, - u8 *digest_buf, int *buf_length); int ovl_validate_verity(struct ovl_fs *ofs, struct path *metapath, struct path *datapath); -- 2.51.0 From 924577e4f6ca473de1528953a0e13505fae61d7b Mon Sep 17 00:00:00 2001 From: =?utf8?q?Andr=C3=A9=20Almeida?= Date: Tue, 29 Apr 2025 15:38:50 -0300 Subject: [PATCH 10/16] ovl: Fix nested backing file paths MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit When the lowerdir of an overlayfs is a merged directory of another overlayfs, ovl_open_realfile() will fail to open the real file and point to a lower dentry copy, without the proper parent path. After this, d_path() will then display the path incorrectly as if the file is placed in the root directory. This bug can be triggered with the following setup: mkdir -p ovl-A/lower ovl-A/upper ovl-A/merge ovl-A/work mkdir -p ovl-B/upper ovl-B/merge ovl-B/work cp /bin/cat ovl-A/lower/ mount -t overlay overlay -o \ lowerdir=ovl-A/lower,upperdir=ovl-A/upper,workdir=ovl-A/work \ ovl-A/merge mount -t overlay overlay -o \ lowerdir=ovl-A/merge,upperdir=ovl-B/upper,workdir=ovl-B/work \ ovl-B/merge ovl-A/merge/cat /proc/self/maps | grep --color cat ovl-B/merge/cat /proc/self/maps | grep --color cat The first cat will correctly show `/ovl-A/merge/cat`, while the second one shows just `/cat`. To fix that, uses file_user_path() inside of backing_file_open() to get the correct file path for the dentry. Co-developed-by: John Schoenick Signed-off-by: John Schoenick Signed-off-by: André Almeida Fixes: def3ae83da02 ("fs: store real path instead of fake path in backing file f_path") Cc: # v6.7 Signed-off-by: Miklos Szeredi --- fs/overlayfs/file.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/overlayfs/file.c b/fs/overlayfs/file.c index 969b458100fe..dfea7bd800cb 100644 --- a/fs/overlayfs/file.c +++ b/fs/overlayfs/file.c @@ -48,8 +48,8 @@ static struct file *ovl_open_realfile(const struct file *file, if (!inode_owner_or_capable(real_idmap, realinode)) flags &= ~O_NOATIME; - realfile = backing_file_open(&file->f_path, flags, realpath, - current_cred()); + realfile = backing_file_open(file_user_path((struct file *) file), + flags, realpath, current_cred()); } ovl_revert_creds(old_cred); -- 2.51.0 From a6fcfe9bb26df18ba7b5c4d064edd13c80ea2466 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Fri, 7 Feb 2025 16:39:55 +0100 Subject: [PATCH 11/16] ovl: make redirect/metacopy rejection consistent When overlayfs finds a file with metacopy and/or redirect attributes and the metacopy and/or redirect features are not enabled, then it refuses to act on those attributes while also issuing a warning. There was an inconsistency in not checking metacopy found from the index. And also only warning on an upper metacopy if it found the next file on the lower layer, while always warning for metacopy found on a lower layer. Fix these inconsistencies and make the logic more straightforward, paving the way for following patches to change when data redirects are allowed. Signed-off-by: Miklos Szeredi --- fs/overlayfs/namei.c | 90 +++++++++++++++++++++++++++----------------- 1 file changed, 55 insertions(+), 35 deletions(-) diff --git a/fs/overlayfs/namei.c b/fs/overlayfs/namei.c index be5c65d6f848..f010e7456668 100644 --- a/fs/overlayfs/namei.c +++ b/fs/overlayfs/namei.c @@ -16,6 +16,7 @@ struct ovl_lookup_data { struct super_block *sb; + struct dentry *dentry; const struct ovl_layer *layer; struct qstr name; bool is_dir; @@ -24,6 +25,7 @@ struct ovl_lookup_data { bool stop; bool last; char *redirect; + char *upperredirect; int metacopy; /* Referring to last redirect xattr */ bool absolute_redirect; @@ -1024,6 +1026,31 @@ int ovl_verify_lowerdata(struct dentry *dentry) return ovl_maybe_validate_verity(dentry); } +/* + * Following redirects/metacopy can have security consequences: it's like a + * symlink into the lower layer without the permission checks. + * + * This is only a problem if the upper layer is untrusted (e.g comes from an USB + * drive). This can allow a non-readable file or directory to become readable. + * + * Only following redirects when redirects are enabled disables this attack + * vector when not necessary. + */ +static bool ovl_check_follow_redirect(struct ovl_lookup_data *d) +{ + struct ovl_fs *ofs = OVL_FS(d->sb); + + if (d->metacopy && !ofs->config.metacopy) { + pr_warn_ratelimited("refusing to follow metacopy origin for (%pd2)\n", d->dentry); + return false; + } + if ((d->redirect || d->upperredirect) && !ovl_redirect_follow(ofs)) { + pr_warn_ratelimited("refusing to follow redirect for (%pd2)\n", d->dentry); + return false; + } + return true; +} + struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) { @@ -1039,7 +1066,6 @@ struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry, unsigned int ctr = 0; struct inode *inode = NULL; bool upperopaque = false; - char *upperredirect = NULL; struct dentry *this; unsigned int i; int err; @@ -1047,12 +1073,14 @@ struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry, int metacopy_size = 0; struct ovl_lookup_data d = { .sb = dentry->d_sb, + .dentry = dentry, .name = dentry->d_name, .is_dir = false, .opaque = false, .stop = false, .last = ovl_redirect_follow(ofs) ? false : !ovl_numlower(poe), .redirect = NULL, + .upperredirect = NULL, .metacopy = 0, }; @@ -1094,8 +1122,8 @@ struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry, if (d.redirect) { err = -ENOMEM; - upperredirect = kstrdup(d.redirect, GFP_KERNEL); - if (!upperredirect) + d.upperredirect = kstrdup(d.redirect, GFP_KERNEL); + if (!d.upperredirect) goto out_put_upper; if (d.redirect[0] == '/') poe = roe; @@ -1113,6 +1141,11 @@ struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry, for (i = 0; !d.stop && i < ovl_numlower(poe); i++) { struct ovl_path lower = ovl_lowerstack(poe)[i]; + if (!ovl_check_follow_redirect(&d)) { + err = -EPERM; + goto out_put; + } + if (!ovl_redirect_follow(ofs)) d.last = i == ovl_numlower(poe) - 1; else if (d.is_dir || !ofs->numdatalayer) @@ -1126,13 +1159,6 @@ struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry, if (!this) continue; - if ((uppermetacopy || d.metacopy) && !ofs->config.metacopy) { - dput(this); - err = -EPERM; - pr_warn_ratelimited("refusing to follow metacopy origin for (%pd2)\n", dentry); - goto out_put; - } - /* * If no origin fh is stored in upper of a merge dir, store fh * of lower dir and set upper parent "impure". @@ -1185,23 +1211,6 @@ struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry, ctr++; } - /* - * Following redirects can have security consequences: it's like - * a symlink into the lower layer without the permission checks. - * This is only a problem if the upper layer is untrusted (e.g - * comes from an USB drive). This can allow a non-readable file - * or directory to become readable. - * - * Only following redirects when redirects are enabled disables - * this attack vector when not necessary. - */ - err = -EPERM; - if (d.redirect && !ovl_redirect_follow(ofs)) { - pr_warn_ratelimited("refusing to follow redirect for (%pd2)\n", - dentry); - goto out_put; - } - if (d.stop) break; @@ -1212,6 +1221,11 @@ struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry, } } + if (!ovl_check_follow_redirect(&d)) { + err = -EPERM; + goto out_put; + } + /* Defer lookup of lowerdata in data-only layers to first access */ if (d.metacopy && ctr && ofs->numdatalayer && d.absolute_redirect) { d.metacopy = 0; @@ -1298,20 +1312,26 @@ struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry, /* * It's safe to assign upperredirect here: the previous - * assignment of happens only if upperdentry is non-NULL, and + * assignment happens only if upperdentry is non-NULL, and * this one only if upperdentry is NULL. */ - upperredirect = ovl_get_redirect_xattr(ofs, &upperpath, 0); - if (IS_ERR(upperredirect)) { - err = PTR_ERR(upperredirect); - upperredirect = NULL; + d.upperredirect = ovl_get_redirect_xattr(ofs, &upperpath, 0); + if (IS_ERR(d.upperredirect)) { + err = PTR_ERR(d.upperredirect); + d.upperredirect = NULL; goto out_free_oe; } + err = ovl_check_metacopy_xattr(ofs, &upperpath, NULL); if (err < 0) goto out_free_oe; - uppermetacopy = err; + d.metacopy = uppermetacopy = err; metacopy_size = err; + + if (!ovl_check_follow_redirect(&d)) { + err = -EPERM; + goto out_free_oe; + } } if (upperdentry || ctr) { @@ -1319,7 +1339,7 @@ struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry, .upperdentry = upperdentry, .oe = oe, .index = index, - .redirect = upperredirect, + .redirect = d.upperredirect, }; /* Store lowerdata redirect for lazy lookup */ @@ -1361,7 +1381,7 @@ out_put_upper: kfree(origin_path); } dput(upperdentry); - kfree(upperredirect); + kfree(d.upperredirect); out: kfree(d.redirect); ovl_revert_creds(old_cred); -- 2.51.0 From 5ef7bcdeecc982ae17d13b682a85123c7d74b200 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Fri, 7 Feb 2025 17:12:06 +0100 Subject: [PATCH 12/16] ovl: relax redirect/metacopy requirements for lower -> data redirect Allow the special case of a redirect from a lower layer to a data layer without having to turn on metacopy. This makes the feature work with userxattr, which in turn allows data layers to be usable in user namespaces. Minimize the risk by only enabling redirect from a single lower layer to a data layer iff a data layer is specified. The only way to access a data layer is to enable this, so there's really no reason not to enable this. This can be used safely if the lower layer is read-only and the user.overlay.redirect xattr cannot be modified. Reviewed-by: Amir Goldstein Signed-off-by: Miklos Szeredi --- Documentation/filesystems/overlayfs.rst | 7 +++++++ fs/overlayfs/namei.c | 18 ++++++++++-------- fs/overlayfs/params.c | 5 ----- 3 files changed, 17 insertions(+), 13 deletions(-) diff --git a/Documentation/filesystems/overlayfs.rst b/Documentation/filesystems/overlayfs.rst index 2db379b4b31e..4133a336486d 100644 --- a/Documentation/filesystems/overlayfs.rst +++ b/Documentation/filesystems/overlayfs.rst @@ -443,6 +443,13 @@ Only the data of the files in the "data-only" lower layers may be visible when a "metacopy" file in one of the lower layers above it, has a "redirect" to the absolute path of the "lower data" file in the "data-only" lower layer. +Instead of explicitly enabling "metacopy=on" it is sufficient to specify at +least one data-only layer to enable redirection of data to a data-only layer. +In this case other forms of metacopy are rejected. Note: this way data-only +layers may be used toghether with "userxattr", in which case careful attention +must be given to privileges needed to change the "user.overlay.redirect" xattr +to prevent misuse. + Since kernel version v6.8, "data-only" lower layers can also be added using the "datadir+" mount options and the fsconfig syscall from new mount api. For example:: diff --git a/fs/overlayfs/namei.c b/fs/overlayfs/namei.c index f010e7456668..d489e80feb6f 100644 --- a/fs/overlayfs/namei.c +++ b/fs/overlayfs/namei.c @@ -1066,6 +1066,7 @@ struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry, unsigned int ctr = 0; struct inode *inode = NULL; bool upperopaque = false; + bool check_redirect = (ovl_redirect_follow(ofs) || ofs->numdatalayer); struct dentry *this; unsigned int i; int err; @@ -1078,7 +1079,7 @@ struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry, .is_dir = false, .opaque = false, .stop = false, - .last = ovl_redirect_follow(ofs) ? false : !ovl_numlower(poe), + .last = check_redirect ? false : !ovl_numlower(poe), .redirect = NULL, .upperredirect = NULL, .metacopy = 0, @@ -1146,7 +1147,7 @@ struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry, goto out_put; } - if (!ovl_redirect_follow(ofs)) + if (!check_redirect) d.last = i == ovl_numlower(poe) - 1; else if (d.is_dir || !ofs->numdatalayer) d.last = lower.layer->idx == ovl_numlower(roe); @@ -1221,15 +1222,16 @@ struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry, } } - if (!ovl_check_follow_redirect(&d)) { - err = -EPERM; - goto out_put; - } - - /* Defer lookup of lowerdata in data-only layers to first access */ + /* + * Defer lookup of lowerdata in data-only layers to first access. + * Don't require redirect=follow and metacopy=on in this case. + */ if (d.metacopy && ctr && ofs->numdatalayer && d.absolute_redirect) { d.metacopy = 0; ctr++; + } else if (!ovl_check_follow_redirect(&d)) { + err = -EPERM; + goto out_put; } /* diff --git a/fs/overlayfs/params.c b/fs/overlayfs/params.c index 6759f7d040c8..2468b436bb13 100644 --- a/fs/overlayfs/params.c +++ b/fs/overlayfs/params.c @@ -1025,11 +1025,6 @@ int ovl_fs_params_verify(const struct ovl_fs_context *ctx, */ } - if (ctx->nr_data > 0 && !config->metacopy) { - pr_err("lower data-only dirs require metacopy support.\n"); - return -EINVAL; - } - return 0; } -- 2.51.0 From b71db54ef3b86c94eb87f68a6d4d3d866e704a4a Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Mon, 10 Feb 2025 20:14:37 +0100 Subject: [PATCH 13/16] ovl: don't require "metacopy=on" for "verity" This allows the "verity" mount option to be used with "userxattr" data-only layer(s). Also it allows dropping the "metacopy=on" option when the "datadir+" option is to be used. This cleanly separates the two features that have been lumped together under "metacopy=on": - data-redirect: data access is redirected to the data-only layer - meta-copy: copy up metadata only if possible Previous patches made sure that with "userxattr" metacopy only works in the lower -> data scenario. In this scenario the lower (metadata) layer must be secured against tampering, in which case the verity checksums contained in this layer can ensure integrity of data even in the case of an untrusted data layer. Reviewed-by: Amir Goldstein Signed-off-by: Miklos Szeredi --- fs/overlayfs/params.c | 26 ++------------------------ 1 file changed, 2 insertions(+), 24 deletions(-) diff --git a/fs/overlayfs/params.c b/fs/overlayfs/params.c index 2468b436bb13..e297681ecac7 100644 --- a/fs/overlayfs/params.c +++ b/fs/overlayfs/params.c @@ -871,18 +871,6 @@ int ovl_fs_params_verify(const struct ovl_fs_context *ctx, config->uuid = OVL_UUID_NULL; } - /* Resolve verity -> metacopy dependency */ - if (config->verity_mode && !config->metacopy) { - /* Don't allow explicit specified conflicting combinations */ - if (set.metacopy) { - pr_err("conflicting options: metacopy=off,verity=%s\n", - ovl_verity_mode(config)); - return -EINVAL; - } - /* Otherwise automatically enable metacopy. */ - config->metacopy = true; - } - /* * This is to make the logic below simpler. It doesn't make any other * difference, since redirect_dir=on is only used for upper. @@ -890,18 +878,13 @@ int ovl_fs_params_verify(const struct ovl_fs_context *ctx, if (!config->upperdir && config->redirect_mode == OVL_REDIRECT_FOLLOW) config->redirect_mode = OVL_REDIRECT_ON; - /* Resolve verity -> metacopy -> redirect_dir dependency */ + /* metacopy -> redirect_dir dependency */ if (config->metacopy && config->redirect_mode != OVL_REDIRECT_ON) { if (set.metacopy && set.redirect) { pr_err("conflicting options: metacopy=on,redirect_dir=%s\n", ovl_redirect_mode(config)); return -EINVAL; } - if (config->verity_mode && set.redirect) { - pr_err("conflicting options: verity=%s,redirect_dir=%s\n", - ovl_verity_mode(config), ovl_redirect_mode(config)); - return -EINVAL; - } if (set.redirect) { /* * There was an explicit redirect_dir=... that resulted @@ -970,7 +953,7 @@ int ovl_fs_params_verify(const struct ovl_fs_context *ctx, } - /* Resolve userxattr -> !redirect && !metacopy && !verity dependency */ + /* Resolve userxattr -> !redirect && !metacopy dependency */ if (config->userxattr) { if (set.redirect && config->redirect_mode != OVL_REDIRECT_NOFOLLOW) { @@ -982,11 +965,6 @@ int ovl_fs_params_verify(const struct ovl_fs_context *ctx, pr_err("conflicting options: userxattr,metacopy=on\n"); return -EINVAL; } - if (config->verity_mode) { - pr_err("conflicting options: userxattr,verity=%s\n", - ovl_verity_mode(config)); - return -EINVAL; - } /* * Silently disable default setting of redirect and metacopy. * This shall be the default in the future as well: these -- 2.51.0 From 50e638beb67e020a9124d77bd8a88bde3cd380e3 Mon Sep 17 00:00:00 2001 From: Thorsten Blum Date: Mon, 14 Apr 2025 22:54:08 +0200 Subject: [PATCH 14/16] ovl: Use str_on_off() helper in ovl_show_options() Remove hard-coded strings by using the str_on_off() helper function. Acked-by: Amir Goldstein Signed-off-by: Thorsten Blum Signed-off-by: Miklos Szeredi --- fs/overlayfs/params.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/fs/overlayfs/params.c b/fs/overlayfs/params.c index e297681ecac7..f42488c01957 100644 --- a/fs/overlayfs/params.c +++ b/fs/overlayfs/params.c @@ -1051,17 +1051,16 @@ int ovl_show_options(struct seq_file *m, struct dentry *dentry) seq_printf(m, ",redirect_dir=%s", ovl_redirect_mode(&ofs->config)); if (ofs->config.index != ovl_index_def) - seq_printf(m, ",index=%s", ofs->config.index ? "on" : "off"); + seq_printf(m, ",index=%s", str_on_off(ofs->config.index)); if (ofs->config.uuid != ovl_uuid_def()) seq_printf(m, ",uuid=%s", ovl_uuid_mode(&ofs->config)); if (ofs->config.nfs_export != ovl_nfs_export_def) - seq_printf(m, ",nfs_export=%s", ofs->config.nfs_export ? - "on" : "off"); + seq_printf(m, ",nfs_export=%s", + str_on_off(ofs->config.nfs_export)); if (ofs->config.xino != ovl_xino_def() && !ovl_same_fs(ofs)) seq_printf(m, ",xino=%s", ovl_xino_mode(&ofs->config)); if (ofs->config.metacopy != ovl_metacopy_def) - seq_printf(m, ",metacopy=%s", - ofs->config.metacopy ? "on" : "off"); + seq_printf(m, ",metacopy=%s", str_on_off(ofs->config.metacopy)); if (ofs->config.ovl_volatile) seq_puts(m, ",volatile"); if (ofs->config.userxattr) -- 2.51.0 From 8a39f1c870e9d6fbac5638f3a42a6a6363829c49 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Mon, 21 Apr 2025 16:15:19 -0700 Subject: [PATCH 15/16] ovl: Check for NULL d_inode() in ovl_dentry_upper() In ovl_path_type() and ovl_is_metacopy_dentry() GCC notices that it is possible for OVL_E() to return NULL (which implies that d_inode(dentry) may be NULL). This would result in out of bounds reads via container_of(), seen with GCC 15's -Warray-bounds -fdiagnostics-details. For example: In file included from arch/x86/include/generated/asm/rwonce.h:1, from include/linux/compiler.h:339, from include/linux/export.h:5, from include/linux/linkage.h:7, from include/linux/fs.h:5, from fs/overlayfs/util.c:7: In function 'ovl_upperdentry_dereference', inlined from 'ovl_dentry_upper' at ../fs/overlayfs/util.c:305:9, inlined from 'ovl_path_type' at ../fs/overlayfs/util.c:216:6: include/asm-generic/rwonce.h:44:26: error: array subscript 0 is outside array bounds of 'struct inode[7486503276667837]' [-Werror=array-bounds=] 44 | #define __READ_ONCE(x) (*(const volatile __unqual_scalar_typeof(x) *)&(x)) | ~^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ include/asm-generic/rwonce.h:50:9: note: in expansion of macro '__READ_ONCE' 50 | __READ_ONCE(x); \ | ^~~~~~~~~~~ fs/overlayfs/ovl_entry.h:195:16: note: in expansion of macro 'READ_ONCE' 195 | return READ_ONCE(oi->__upperdentry); | ^~~~~~~~~ 'ovl_path_type': event 1 185 | return inode ? OVL_I(inode)->oe : NULL; 'ovl_path_type': event 2 Avoid this by allowing ovl_dentry_upper() to return NULL if d_inode() is NULL, as that means the problematic dereferencing can never be reached. Note that this fixes the over-eager compiler warning in an effort to being able to enable -Warray-bounds globally. There is no known behavioral bug here. Suggested-by: Amir Goldstein Signed-off-by: Kees Cook Signed-off-by: Miklos Szeredi --- fs/overlayfs/util.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/fs/overlayfs/util.c b/fs/overlayfs/util.c index 0819c739cc2f..5d6b60d56c27 100644 --- a/fs/overlayfs/util.c +++ b/fs/overlayfs/util.c @@ -305,7 +305,9 @@ enum ovl_path_type ovl_path_realdata(struct dentry *dentry, struct path *path) struct dentry *ovl_dentry_upper(struct dentry *dentry) { - return ovl_upperdentry_dereference(OVL_I(d_inode(dentry))); + struct inode *inode = d_inode(dentry); + + return inode ? ovl_upperdentry_dereference(OVL_I(inode)) : NULL; } struct dentry *ovl_dentry_lower(struct dentry *dentry) -- 2.51.0 From 5aaf6a8cc3305e1000417f6fac9d131c47db9117 Mon Sep 17 00:00:00 2001 From: Thorsten Blum Date: Sat, 3 May 2025 13:52:03 +0200 Subject: [PATCH 16/16] ovl: Replace offsetof() with struct_size() in ovl_cache_entry_new() Compared to offsetof(), struct_size() provides additional compile-time checks for structs with flexible arrays (e.g., __must_be_array()). No functional changes intended. Signed-off-by: Thorsten Blum Signed-off-by: Miklos Szeredi --- fs/overlayfs/readdir.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/overlayfs/readdir.c b/fs/overlayfs/readdir.c index 881ec5592da5..efe4700c797e 100644 --- a/fs/overlayfs/readdir.c +++ b/fs/overlayfs/readdir.c @@ -13,6 +13,7 @@ #include #include #include +#include #include "overlayfs.h" struct ovl_cache_entry { @@ -147,9 +148,8 @@ static struct ovl_cache_entry *ovl_cache_entry_new(struct ovl_readdir_data *rdd, u64 ino, unsigned int d_type) { struct ovl_cache_entry *p; - size_t size = offsetof(struct ovl_cache_entry, name[len + 1]); - p = kmalloc(size, GFP_KERNEL); + p = kmalloc(struct_size(p, name, len + 1), GFP_KERNEL); if (!p) return NULL; -- 2.51.0