From 3ae8508663372b93c5556a887e96ed0ca5df0711 Mon Sep 17 00:00:00 2001 From: Len Brown Date: Sun, 6 Apr 2025 12:23:22 -0400 Subject: [PATCH 01/16] tools/power turbostat: Document GNR UncMHz domain convention Document that on Intel Granite Rapids Systems, Uncore domains 0-2 are CPU domains, and uncore domains 3-4 are IO domains. Signed-off-by: Len Brown --- tools/power/x86/turbostat/turbostat.8 | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/power/x86/turbostat/turbostat.8 b/tools/power/x86/turbostat/turbostat.8 index 144565151e1e..e86493880c16 100644 --- a/tools/power/x86/turbostat/turbostat.8 +++ b/tools/power/x86/turbostat/turbostat.8 @@ -205,6 +205,7 @@ The system configuration dump (if --quiet is not used) is followed by statistics \fBUncMHz\fP per-package uncore MHz, instantaneous sample. .PP \fBUMHz1.0\fP per-package uncore MHz for domain=1 and fabric_cluster=0, instantaneous sample. System summary is the average of all packages. +Intel Granite Rapids systems use domains 0-2 for CPUs, and 3-4 for IO, with cluster always 0. For the "--show" and "--hide" options, use "UncMHz" to operate on all UMHz*.* as a group. .SH TOO MUCH INFORMATION EXAMPLE By default, turbostat dumps all possible information -- a system configuration header, followed by columns for all counters. -- 2.51.0 From f8b136ef2605c1bf62020462d10e35228760aa19 Mon Sep 17 00:00:00 2001 From: Zhang Rui Date: Wed, 19 Mar 2025 08:53:07 +0800 Subject: [PATCH 02/16] tools/power turbostat: Restore GFX sysfs fflush() call Do fflush() to discard the buffered data, before each read of the graphics sysfs knobs. Fixes: ba99a4fc8c24 ("tools/power turbostat: Remove unnecessary fflush() call") Signed-off-by: Zhang Rui Signed-off-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index 70e17d4ad9b6..c9a34c16c7a8 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -6039,6 +6039,7 @@ int snapshot_graphics(int idx) int retval; rewind(gfx_info[idx].fp); + fflush(gfx_info[idx].fp); switch (idx) { case GFX_rc6: -- 2.51.0 From 994633894f208a0151baaee1688ab3c431912553 Mon Sep 17 00:00:00 2001 From: Len Brown Date: Sun, 6 Apr 2025 12:53:18 -0400 Subject: [PATCH 03/16] tools/power turbostat: re-factor sysfs code Probe cpuidle "sysfs" residency and counts separately, since soon we will make one disabled on, and the other disabled off. Clarify that some BIC (build-in-counters) are actually "groups". since we're about to re-name some of those groups. no functional change. Signed-off-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 31 ++++++++++++++++++--------- 1 file changed, 21 insertions(+), 10 deletions(-) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index c9a34c16c7a8..df0391bedcde 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -273,10 +273,10 @@ struct msr_counter bic[] = { #define BIC_NMI (1ULL << 61) #define BIC_CPU_c1e (1ULL << 62) -#define BIC_TOPOLOGY (BIC_Package | BIC_Node | BIC_CoreCnt | BIC_PkgCnt | BIC_Core | BIC_CPU | BIC_Die) -#define BIC_THERMAL_PWR (BIC_CoreTmp | BIC_PkgTmp | BIC_PkgWatt | BIC_CorWatt | BIC_GFXWatt | BIC_RAMWatt | BIC_PKG__ | BIC_RAM__ | BIC_SysWatt) -#define BIC_FREQUENCY (BIC_Avg_MHz | BIC_Busy | BIC_Bzy_MHz | BIC_TSC_MHz | BIC_GFXMHz | BIC_GFXACTMHz | BIC_SAMMHz | BIC_SAMACTMHz | BIC_UNCORE_MHZ) -#define BIC_IDLE (BIC_Busy | BIC_sysfs | BIC_CPU_c1 | BIC_CPU_c3 | BIC_CPU_c6 | BIC_CPU_c7 | BIC_GFX_rc6 | BIC_Pkgpc2 | BIC_Pkgpc3 | BIC_Pkgpc6 | BIC_Pkgpc7 | BIC_Pkgpc8 | BIC_Pkgpc9 | BIC_Pkgpc10 | BIC_CPU_LPI | BIC_SYS_LPI | BIC_Mod_c6 | BIC_Totl_c0 | BIC_Any_c0 | BIC_GFX_c0 | BIC_CPUGFX | BIC_SAM_mc6 | BIC_Diec6) +#define BIC_GROUP_TOPOLOGY (BIC_Package | BIC_Node | BIC_CoreCnt | BIC_PkgCnt | BIC_Core | BIC_CPU | BIC_Die) +#define BIC_GROUP_THERMAL_PWR (BIC_CoreTmp | BIC_PkgTmp | BIC_PkgWatt | BIC_CorWatt | BIC_GFXWatt | BIC_RAMWatt | BIC_PKG__ | BIC_RAM__ | BIC_SysWatt) +#define BIC_GROUP_FREQUENCY (BIC_Avg_MHz | BIC_Busy | BIC_Bzy_MHz | BIC_TSC_MHz | BIC_GFXMHz | BIC_GFXACTMHz | BIC_SAMMHz | BIC_SAMACTMHz | BIC_UNCORE_MHZ) +#define BIC_GROUP_IDLE (BIC_Busy | BIC_sysfs | BIC_CPU_c1 | BIC_CPU_c3 | BIC_CPU_c6 | BIC_CPU_c7 | BIC_GFX_rc6 | BIC_Pkgpc2 | BIC_Pkgpc3 | BIC_Pkgpc6 | BIC_Pkgpc7 | BIC_Pkgpc8 | BIC_Pkgpc9 | BIC_Pkgpc10 | BIC_CPU_LPI | BIC_SYS_LPI | BIC_Mod_c6 | BIC_Totl_c0 | BIC_Any_c0 | BIC_GFX_c0 | BIC_CPUGFX | BIC_SAM_mc6 | BIC_Diec6) #define BIC_OTHER (BIC_IRQ | BIC_NMI | BIC_SMI | BIC_ThreadC | BIC_CoreTmp | BIC_IPC) #define BIC_DISABLED_BY_DEFAULT (BIC_USEC | BIC_TOD | BIC_APIC | BIC_X2APIC) @@ -2354,16 +2354,16 @@ unsigned long long bic_lookup(char *name_list, enum show_hide_mode mode) retval |= ~0; break; } else if (!strcmp(name_list, "topology")) { - retval |= BIC_TOPOLOGY; + retval |= BIC_GROUP_TOPOLOGY; break; } else if (!strcmp(name_list, "power")) { - retval |= BIC_THERMAL_PWR; + retval |= BIC_GROUP_THERMAL_PWR; break; } else if (!strcmp(name_list, "idle")) { - retval |= BIC_IDLE; + retval |= BIC_GROUP_IDLE; break; } else if (!strcmp(name_list, "frequency")) { - retval |= BIC_FREQUENCY; + retval |= BIC_GROUP_FREQUENCY; break; } else if (!strcmp(name_list, "other")) { retval |= BIC_OTHER; @@ -10260,7 +10260,7 @@ int is_deferred_skip(char *name) return 0; } -void probe_sysfs(void) +void probe_cpuidle_residency(void) { char path[64]; char name_buf[16]; @@ -10304,6 +10304,16 @@ void probe_sysfs(void) if (state < min_state) min_state = state; } +} + +void probe_cpuidle_counts(void) +{ + char path[64]; + char name_buf[16]; + FILE *input; + int state; + int min_state = 1024, max_state = 0; + char *sp; for (state = 10; state >= 0; --state) { @@ -10602,7 +10612,8 @@ skip_cgroup_setting: print_bootcmd(); } - probe_sysfs(); + probe_cpuidle_residency(); + probe_cpuidle_counts(); if (!getuid()) set_rlimit(); -- 2.51.0 From 6f110a5e4f9977c31ce76fefbfef6fd4eab6bfb7 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sun, 6 Apr 2025 10:00:04 -0700 Subject: [PATCH 04/16] Disable SLUB_TINY for build testing ... and don't error out so hard on missing module descriptions. Before commit 6c6c1fc09de3 ("modpost: require a MODULE_DESCRIPTION()") we used to warn about missing module descriptions, but only when building with extra warnigns (ie 'W=1'). After that commit the warning became an unconditional hard error. And it turns out not all modules have been converted despite the claims to the contrary. As reported by Damian Tometzki, the slub KUnit test didn't have a module description, and apparently nobody ever really noticed. The reason nobody noticed seems to be that the slub KUnit tests get disabled by SLUB_TINY, which also ends up disabling a lot of other code, both in tests and in slub itself. And so anybody doing full build tests didn't actually see this failre. So let's disable SLUB_TINY for build-only tests, since it clearly ends up limiting build coverage. Also turn the missing module descriptions error back into a warning, but let's keep it around for non-'W=1' builds. Reported-by: Damian Tometzki Link: https://lore.kernel.org/all/01070196099fd059-e8463438-7b1b-4ec8-816d-173874be9966-000000@eu-central-1.amazonses.com/ Cc: Masahiro Yamada Cc: Jeff Johnson Fixes: 6c6c1fc09de3 ("modpost: require a MODULE_DESCRIPTION()") Signed-off-by: Linus Torvalds --- mm/Kconfig | 2 +- scripts/mod/modpost.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/mm/Kconfig b/mm/Kconfig index d3fb3762887b..e113f713b493 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -201,7 +201,7 @@ config KVFREE_RCU_BATCHED config SLUB_TINY bool "Configure for minimal memory footprint" - depends on EXPERT + depends on EXPERT && !COMPILE_TEST select SLAB_MERGE_DEFAULT help Configures the slab allocator in a way to achieve minimal memory diff --git a/scripts/mod/modpost.c b/scripts/mod/modpost.c index 92627e8d0e16..be89921d60b6 100644 --- a/scripts/mod/modpost.c +++ b/scripts/mod/modpost.c @@ -1603,7 +1603,7 @@ static void read_symbols(const char *modname) } if (!get_modinfo(&info, "description")) - error("missing MODULE_DESCRIPTION() in %s\n", modname); + warn("missing MODULE_DESCRIPTION() in %s\n", modname); } for (sym = info.symtab_start; sym < info.symtab_stop; sym++) { -- 2.51.0 From ec4acd3166d8a7a03b059d01b9c6f11a658e833f Mon Sep 17 00:00:00 2001 From: Len Brown Date: Sun, 6 Apr 2025 14:29:57 -0400 Subject: [PATCH 05/16] tools/power turbostat: disable "cpuidle" invocation counters, by default Create "pct_idle" counter group, the sofware notion of residency so it can now be singled out, independent of other counter groups. Create "cpuidle" group, the cpuidle invocation counts. Disable "cpuidle", by default. Create "swidle" = "cpuidle" + "pct_idle". Undocument "sysfs", the old name for "swidle", but keep it working for backwards compatibilty. Create "hwidle", all the HW idle counters Modify "idle", enabled by default "idle" = "hwidle" + "pct_idle" (and now excludes "cpuidle") Signed-off-by: Len Brown --- tools/power/x86/turbostat/turbostat.8 | 12 +++++----- tools/power/x86/turbostat/turbostat.c | 34 +++++++++++++++++++++------ 2 files changed, 33 insertions(+), 13 deletions(-) diff --git a/tools/power/x86/turbostat/turbostat.8 b/tools/power/x86/turbostat/turbostat.8 index e86493880c16..b74ed916057e 100644 --- a/tools/power/x86/turbostat/turbostat.8 +++ b/tools/power/x86/turbostat/turbostat.8 @@ -100,7 +100,7 @@ The column name "all" can be used to enable all disabled-by-default built-in cou .PP \fB--show column\fP show only the specified built-in columns. May be invoked multiple times, or with a comma-separated list of column names. .PP -\fB--show CATEGORY --hide CATEGORY\fP Show and hide also accept a single CATEGORY of columns: "all", "topology", "idle", "frequency", "power", "sysfs", "other". +\fB--show CATEGORY --hide CATEGORY\fP Show and hide also accept a single CATEGORY of columns: "all", "topology", "idle", "frequency", "power", "cpuidle", "hwidle", "swidle", "other". "idle" (enabled by default), includes "hwidle" and "idle_pct". "cpuidle" (default disabled) includes cpuidle software invocation counters. "swidle" includes "cpuidle" plus "idle_pct". "hwidle" includes only hardware based idle residency counters. Older versions of turbostat used the term "sysfs" for what is now "swidle". .PP \fB--Dump\fP displays the raw counter values. .PP @@ -158,15 +158,15 @@ The system configuration dump (if --quiet is not used) is followed by statistics .PP \fBSMI\fP The number of System Management Interrupts serviced CPU during the measurement interval. While this counter is actually per-CPU, SMI are triggered on all processors, so the number should be the same for all CPUs. .PP -\fBC1, C2, C3...\fP The number times Linux requested the C1, C2, C3 idle state during the measurement interval. The system summary line shows the sum for all CPUs. These are C-state names as exported in /sys/devices/system/cpu/cpu*/cpuidle/state*/name. While their names are generic, their attributes are processor specific. They the system description section of output shows what MWAIT sub-states they are mapped to on each system. +\fBC1, C2, C3...\fP The number times Linux requested the C1, C2, C3 idle state during the measurement interval. The system summary line shows the sum for all CPUs. These are C-state names as exported in /sys/devices/system/cpu/cpu*/cpuidle/state*/name. While their names are generic, their attributes are processor specific. They the system description section of output shows what MWAIT sub-states they are mapped to on each system. These counters are in the "cpuidle" group, which is disabled, by default. .PP -\fBC1+, C2+, C3+...\fP The idle governor idle state misprediction statistics. Inidcates the number times Linux requested the C1, C2, C3 idle state during the measurement interval, but should have requested a deeper idle state (if it exists and enabled). These statistics come from the /sys/devices/system/cpu/cpu*/cpuidle/state*/below file. +\fBC1+, C2+, C3+...\fP The idle governor idle state misprediction statistics. Inidcates the number times Linux requested the C1, C2, C3 idle state during the measurement interval, but should have requested a deeper idle state (if it exists and enabled). These statistics come from the /sys/devices/system/cpu/cpu*/cpuidle/state*/below file. These counters are in the "cpuidle" group, which is disabled, by default. .PP -\fBC1-, C2-, C3-...\fP The idle governor idle state misprediction statistics. Inidcates the number times Linux requested the C1, C2, C3 idle state during the measurement interval, but should have requested a shallower idle state (if it exists and enabled). These statistics come from the /sys/devices/system/cpu/cpu*/cpuidle/state*/above file. +\fBC1-, C2-, C3-...\fP The idle governor idle state misprediction statistics. Inidcates the number times Linux requested the C1, C2, C3 idle state during the measurement interval, but should have requested a shallower idle state (if it exists and enabled). These statistics come from the /sys/devices/system/cpu/cpu*/cpuidle/state*/above file. These counters are in the "cpuidle" group, which is disabled, by default. .PP -\fBC1%, C2%, C3%\fP The residency percentage that Linux requested C1, C2, C3.... The system summary is the average of all CPUs in the system. Note that these are software, reflecting what was requested. The hardware counters reflect what was actually achieved. +\fBC1%, C2%, C3%\fP The residency percentage that Linux requested C1, C2, C3.... The system summary is the average of all CPUs in the system. Note that these are software, reflecting what was requested. The hardware counters reflect what was actually achieved. These counters are in the "pct_idle" group, which is enabled by default. .PP -\fBCPU%c1, CPU%c3, CPU%c6, CPU%c7\fP show the percentage residency in hardware core idle states. These numbers are from hardware residency counters. +\fBCPU%c1, CPU%c3, CPU%c6, CPU%c7\fP show the percentage residency in hardware core idle states. These numbers are from hardware residency counters and are in the "hwidle" group, which is enabled, by default. .PP \fBCoreTmp\fP Degrees Celsius reported by the per-core Digital Thermal Sensor. .PP diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index df0391bedcde..ab184f95cdaf 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -153,7 +153,7 @@ struct msr_counter bic[] = { { 0x0, "TSC_MHz", NULL, 0, 0, 0, NULL, 0 }, { 0x0, "IRQ", NULL, 0, 0, 0, NULL, 0 }, { 0x0, "SMI", NULL, 32, 0, FORMAT_DELTA, NULL, 0 }, - { 0x0, "sysfs", NULL, 0, 0, 0, NULL, 0 }, + { 0x0, "cpuidle", NULL, 0, 0, 0, NULL, 0 }, { 0x0, "CPU%c1", NULL, 0, 0, 0, NULL, 0 }, { 0x0, "CPU%c3", NULL, 0, 0, 0, NULL, 0 }, { 0x0, "CPU%c6", NULL, 0, 0, 0, NULL, 0 }, @@ -206,6 +206,7 @@ struct msr_counter bic[] = { { 0x0, "Sys_J", NULL, 0, 0, 0, NULL, 0 }, { 0x0, "NMI", NULL, 0, 0, 0, NULL, 0 }, { 0x0, "CPU%c1e", NULL, 0, 0, 0, NULL, 0 }, + { 0x0, "pct_idle", NULL, 0, 0, 0, NULL, 0 }, }; #define MAX_BIC (sizeof(bic) / sizeof(struct msr_counter)) @@ -219,7 +220,7 @@ struct msr_counter bic[] = { #define BIC_TSC_MHz (1ULL << 7) #define BIC_IRQ (1ULL << 8) #define BIC_SMI (1ULL << 9) -#define BIC_sysfs (1ULL << 10) +#define BIC_cpuidle (1ULL << 10) #define BIC_CPU_c1 (1ULL << 11) #define BIC_CPU_c3 (1ULL << 12) #define BIC_CPU_c6 (1ULL << 13) @@ -272,17 +273,20 @@ struct msr_counter bic[] = { #define BIC_Sys_J (1ULL << 60) #define BIC_NMI (1ULL << 61) #define BIC_CPU_c1e (1ULL << 62) +#define BIC_pct_idle (1ULL << 63) #define BIC_GROUP_TOPOLOGY (BIC_Package | BIC_Node | BIC_CoreCnt | BIC_PkgCnt | BIC_Core | BIC_CPU | BIC_Die) #define BIC_GROUP_THERMAL_PWR (BIC_CoreTmp | BIC_PkgTmp | BIC_PkgWatt | BIC_CorWatt | BIC_GFXWatt | BIC_RAMWatt | BIC_PKG__ | BIC_RAM__ | BIC_SysWatt) #define BIC_GROUP_FREQUENCY (BIC_Avg_MHz | BIC_Busy | BIC_Bzy_MHz | BIC_TSC_MHz | BIC_GFXMHz | BIC_GFXACTMHz | BIC_SAMMHz | BIC_SAMACTMHz | BIC_UNCORE_MHZ) -#define BIC_GROUP_IDLE (BIC_Busy | BIC_sysfs | BIC_CPU_c1 | BIC_CPU_c3 | BIC_CPU_c6 | BIC_CPU_c7 | BIC_GFX_rc6 | BIC_Pkgpc2 | BIC_Pkgpc3 | BIC_Pkgpc6 | BIC_Pkgpc7 | BIC_Pkgpc8 | BIC_Pkgpc9 | BIC_Pkgpc10 | BIC_CPU_LPI | BIC_SYS_LPI | BIC_Mod_c6 | BIC_Totl_c0 | BIC_Any_c0 | BIC_GFX_c0 | BIC_CPUGFX | BIC_SAM_mc6 | BIC_Diec6) +#define BIC_GROUP_HW_IDLE (BIC_Busy | BIC_CPU_c1 | BIC_CPU_c3 | BIC_CPU_c6 | BIC_CPU_c7 | BIC_GFX_rc6 | BIC_Pkgpc2 | BIC_Pkgpc3 | BIC_Pkgpc6 | BIC_Pkgpc7 | BIC_Pkgpc8 | BIC_Pkgpc9 | BIC_Pkgpc10 | BIC_CPU_LPI | BIC_SYS_LPI | BIC_Mod_c6 | BIC_Totl_c0 | BIC_Any_c0 | BIC_GFX_c0 | BIC_CPUGFX | BIC_SAM_mc6 | BIC_Diec6) +#define BIC_GROUP_SW_IDLE (BIC_Busy | BIC_cpuidle | BIC_pct_idle ) +#define BIC_GROUP_IDLE (BIC_GROUP_HW_IDLE | BIC_pct_idle) #define BIC_OTHER (BIC_IRQ | BIC_NMI | BIC_SMI | BIC_ThreadC | BIC_CoreTmp | BIC_IPC) -#define BIC_DISABLED_BY_DEFAULT (BIC_USEC | BIC_TOD | BIC_APIC | BIC_X2APIC) +#define BIC_DISABLED_BY_DEFAULT (BIC_USEC | BIC_TOD | BIC_APIC | BIC_X2APIC | BIC_cpuidle) unsigned long long bic_enabled = (0xFFFFFFFFFFFFFFFFULL & ~BIC_DISABLED_BY_DEFAULT); -unsigned long long bic_present = BIC_USEC | BIC_TOD | BIC_sysfs | BIC_APIC | BIC_X2APIC; +unsigned long long bic_present = BIC_USEC | BIC_TOD | BIC_cpuidle | BIC_pct_idle | BIC_APIC | BIC_X2APIC; #define DO_BIC(COUNTER_NAME) (bic_enabled & bic_present & COUNTER_NAME) #define DO_BIC_READ(COUNTER_NAME) (bic_present & COUNTER_NAME) @@ -2362,6 +2366,15 @@ unsigned long long bic_lookup(char *name_list, enum show_hide_mode mode) } else if (!strcmp(name_list, "idle")) { retval |= BIC_GROUP_IDLE; break; + } else if (!strcmp(name_list, "swidle")) { + retval |= BIC_GROUP_SW_IDLE; + break; + } else if (!strcmp(name_list, "sysfs")) { /* legacy compatibility */ + retval |= BIC_GROUP_SW_IDLE; + break; + } else if (!strcmp(name_list, "hwidle")) { + retval |= BIC_GROUP_HW_IDLE; + break; } else if (!strcmp(name_list, "frequency")) { retval |= BIC_GROUP_FREQUENCY; break; @@ -2372,6 +2385,7 @@ unsigned long long bic_lookup(char *name_list, enum show_hide_mode mode) } if (i == MAX_BIC) { + fprintf(stderr, "deferred %s\n", name_list); if (mode == SHOW_LIST) { deferred_add_names[deferred_add_index++] = name_list; if (deferred_add_index >= MAX_DEFERRED) { @@ -10269,6 +10283,9 @@ void probe_cpuidle_residency(void) int min_state = 1024, max_state = 0; char *sp; + if (!DO_BIC(BIC_pct_idle)) + return; + for (state = 10; state >= 0; --state) { sprintf(path, "/sys/devices/system/cpu/cpu%d/cpuidle/state%d/name", base_cpu, state); @@ -10291,7 +10308,7 @@ void probe_cpuidle_residency(void) sprintf(path, "cpuidle/state%d/time", state); - if (!DO_BIC(BIC_sysfs) && !is_deferred_add(name_buf)) + if (!DO_BIC(BIC_pct_idle) && !is_deferred_add(name_buf)) continue; if (is_deferred_skip(name_buf)) @@ -10315,6 +10332,9 @@ void probe_cpuidle_counts(void) int min_state = 1024, max_state = 0; char *sp; + if (!DO_BIC(BIC_cpuidle)) + return; + for (state = 10; state >= 0; --state) { sprintf(path, "/sys/devices/system/cpu/cpu%d/cpuidle/state%d/name", base_cpu, state); @@ -10327,7 +10347,7 @@ void probe_cpuidle_counts(void) remove_underbar(name_buf); - if (!DO_BIC(BIC_sysfs) && !is_deferred_add(name_buf)) + if (!DO_BIC(BIC_cpuidle) && !is_deferred_add(name_buf)) continue; if (is_deferred_skip(name_buf)) -- 2.51.0 From 03e00e373cab981ad808271b2650700cfa0fbda6 Mon Sep 17 00:00:00 2001 From: Len Brown Date: Sun, 6 Apr 2025 14:49:20 -0400 Subject: [PATCH 06/16] tools/power turbostat: v2025.05.06 Support up to 8192 processors Add cpuidle governor debug telemetry, disabled by default Update default output to exclude cpuidle invocation counts Bug fixes Signed-off-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index ab184f95cdaf..1b9fdc1a7ee8 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -9594,7 +9594,7 @@ int get_and_dump_counters(void) void print_version() { - fprintf(outf, "turbostat version 2025.02.02 - Len Brown \n"); + fprintf(outf, "turbostat version 2025.04.06 - Len Brown \n"); } #define COMMAND_LINE_SIZE 2048 -- 2.51.0 From 0efdedb3358aa78102967f242379686f94315830 Mon Sep 17 00:00:00 2001 From: =?utf8?q?Thomas=20Wei=C3=9Fschuh?= Date: Wed, 2 Apr 2025 21:21:57 +0100 Subject: [PATCH 07/16] tools/include: make uapi/linux/types.h usable from assembly MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit The "real" linux/types.h UAPI header gracefully degrades to a NOOP when included from assembly code. Mirror this behaviour in the tools/ variant. Test for __ASSEMBLER__ over __ASSEMBLY__ as the former is provided by the toolchain automatically. Reported-by: Mark Brown Closes: https://lore.kernel.org/lkml/af553c62-ca2f-4956-932c-dd6e3a126f58@sirena.org.uk/ Fixes: c9fbaa879508 ("selftests: vDSO: parse_vdso: Use UAPI headers instead of libc headers") Signed-off-by: Thomas Weißschuh Link: https://patch.msgid.link/20250321-uapi-consistency-v1-1-439070118dc0@linutronix.de Signed-off-by: Mark Brown Reviewed-by: Mark Brown Signed-off-by: Linus Torvalds --- tools/include/uapi/linux/types.h | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tools/include/uapi/linux/types.h b/tools/include/uapi/linux/types.h index 91fa51a9c31d..85aa327245c6 100644 --- a/tools/include/uapi/linux/types.h +++ b/tools/include/uapi/linux/types.h @@ -4,6 +4,8 @@ #include +#ifndef __ASSEMBLER__ + /* copied from linux:include/uapi/linux/types.h */ #define __bitwise typedef __u16 __bitwise __le16; @@ -20,4 +22,5 @@ typedef __u32 __bitwise __wsum; #define __aligned_be64 __be64 __attribute__((aligned(8))) #define __aligned_le64 __le64 __attribute__((aligned(8))) +#endif /* __ASSEMBLER__ */ #endif /* _UAPI_LINUX_TYPES_H */ -- 2.51.0 From 0af2f6be1b4281385b618cb86ad946eded089ac8 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sun, 6 Apr 2025 13:11:33 -0700 Subject: [PATCH 08/16] Linux 6.15-rc1 --- Makefile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Makefile b/Makefile index e55726a71d95..38689a0c3605 100644 --- a/Makefile +++ b/Makefile @@ -1,8 +1,8 @@ # SPDX-License-Identifier: GPL-2.0 VERSION = 6 -PATCHLEVEL = 14 +PATCHLEVEL = 15 SUBLEVEL = 0 -EXTRAVERSION = +EXTRAVERSION = -rc1 NAME = Baby Opossum Posse # *DOCUMENTATION* -- 2.51.0 From 56a7b9f8b05981e929becb45a826558779bca6f6 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 13 Mar 2025 15:31:50 -0700 Subject: [PATCH 09/16] ratelimit: Create functions to handle ratelimit_state internals A number of ratelimit use cases do open-coded access to the ratelimit_state structure's ->missed field. This works, but is a bit messy and makes it more annoying to make changes to this field. Therefore, provide a ratelimit_state_inc_miss() function that increments the ->missed field, a ratelimit_state_get_miss() function that reads out the ->missed field, and a ratelimit_state_reset_miss() function that reads out that field, but that also resets its value to zero. These functions will replace client-code open-coded uses of ->missed. In addition, a new ratelimit_state_reset_interval() function encapsulates what was previously open-coded lock acquisition and direct field updates. [ paulmck: Apply kernel test robot feedback. ] Link: https://lore.kernel.org/all/fbe93a52-365e-47fe-93a4-44a44547d601@paulmck-laptop/ Link: https://lore.kernel.org/all/20250423115409.3425-1-spasswolf@web.de/ Signed-off-by: Paul E. McKenney Reviewed-by: Petr Mladek Cc: Andrew Morton Cc: Kuniyuki Iwashima Cc: Mateusz Guzik Cc: Steven Rostedt Cc: John Ogness Cc: Sergey Senozhatsky --- include/linux/ratelimit.h | 40 ++++++++++++++++++++++++++++++++++----- lib/ratelimit.c | 8 ++++---- 2 files changed, 39 insertions(+), 9 deletions(-) diff --git a/include/linux/ratelimit.h b/include/linux/ratelimit.h index b17e0cd0a30c..8400c5356c18 100644 --- a/include/linux/ratelimit.h +++ b/include/linux/ratelimit.h @@ -22,16 +22,46 @@ static inline void ratelimit_default_init(struct ratelimit_state *rs) DEFAULT_RATELIMIT_BURST); } +static inline void ratelimit_state_inc_miss(struct ratelimit_state *rs) +{ + rs->missed++; +} + +static inline int ratelimit_state_get_miss(struct ratelimit_state *rs) +{ + return rs->missed; +} + +static inline int ratelimit_state_reset_miss(struct ratelimit_state *rs) +{ + int ret = rs->missed; + + rs->missed = 0; + return ret; +} + +static inline void ratelimit_state_reset_interval(struct ratelimit_state *rs, int interval_init) +{ + unsigned long flags; + + raw_spin_lock_irqsave(&rs->lock, flags); + rs->interval = interval_init; + rs->begin = 0; + rs->printed = 0; + ratelimit_state_reset_miss(rs); + raw_spin_unlock_irqrestore(&rs->lock, flags); +} + static inline void ratelimit_state_exit(struct ratelimit_state *rs) { + int m; + if (!(rs->flags & RATELIMIT_MSG_ON_RELEASE)) return; - if (rs->missed) { - pr_warn("%s: %d output lines suppressed due to ratelimiting\n", - current->comm, rs->missed); - rs->missed = 0; - } + m = ratelimit_state_reset_miss(rs); + if (m) + pr_warn("%s: %d output lines suppressed due to ratelimiting\n", current->comm, m); } static inline void diff --git a/lib/ratelimit.c b/lib/ratelimit.c index ce945c17980b..85e22f00180c 100644 --- a/lib/ratelimit.c +++ b/lib/ratelimit.c @@ -51,12 +51,12 @@ int ___ratelimit(struct ratelimit_state *rs, const char *func) rs->begin = jiffies; if (time_is_before_jiffies(rs->begin + interval)) { - if (rs->missed) { + int m = ratelimit_state_reset_miss(rs); + + if (m) { if (!(rs->flags & RATELIMIT_MSG_ON_RELEASE)) { printk_deferred(KERN_WARNING - "%s: %d callbacks suppressed\n", - func, rs->missed); - rs->missed = 0; + "%s: %d callbacks suppressed\n", func, m); } } rs->begin = jiffies; -- 2.51.0 From 48e864ae865744d1451b1776a14acfa422e8a115 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 13 Mar 2025 16:11:38 -0700 Subject: [PATCH 10/16] random: Avoid open-coded use of ratelimit_state structure's ->missed field The _credit_init_bits() function directly accesses the ratelimit_state structure's ->missed field, which works, but which also makes it more difficult to change this field. Therefore, make use of the ratelimit_state_get_miss() and ratelimit_state_inc_miss() functions instead of directly accessing the ->missed field. Link: https://lore.kernel.org/all/fbe93a52-365e-47fe-93a4-44a44547d601@paulmck-laptop/ Link: https://lore.kernel.org/all/20250423115409.3425-1-spasswolf@web.de/ Signed-off-by: Paul E. McKenney Reviewed-by: Petr Mladek Cc: "Theodore Ts'o" "Jason A. Donenfeld" --- drivers/char/random.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/drivers/char/random.c b/drivers/char/random.c index 38f2fab29c56..416dac0ab565 100644 --- a/drivers/char/random.c +++ b/drivers/char/random.c @@ -726,6 +726,7 @@ static void __cold _credit_init_bits(size_t bits) static DECLARE_WORK(set_ready, crng_set_ready); unsigned int new, orig, add; unsigned long flags; + int m; if (!bits) return; @@ -748,9 +749,9 @@ static void __cold _credit_init_bits(size_t bits) wake_up_interruptible(&crng_init_wait); kill_fasync(&fasync, SIGIO, POLL_IN); pr_notice("crng init done\n"); - if (urandom_warning.missed) - pr_notice("%d urandom warning(s) missed due to ratelimiting\n", - urandom_warning.missed); + m = ratelimit_state_get_miss(&urandom_warning); + if (m) + pr_notice("%d urandom warning(s) missed due to ratelimiting\n", m); } else if (orig < POOL_EARLY_BITS && new >= POOL_EARLY_BITS) { spin_lock_irqsave(&base_crng.lock, flags); /* Check if crng_init is CRNG_EMPTY, to avoid race with crng_reseed(). */ @@ -1466,7 +1467,7 @@ static ssize_t urandom_read_iter(struct kiocb *kiocb, struct iov_iter *iter) if (!crng_ready()) { if (!ratelimit_disable && maxwarn <= 0) - ++urandom_warning.missed; + ratelimit_state_inc_miss(&urandom_warning); else if (ratelimit_disable || __ratelimit(&urandom_warning)) { --maxwarn; pr_notice("%s: uninitialized urandom read (%zu bytes read)\n", -- 2.51.0 From 25228c60999f848fdeb291031858de49c115b64e Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 13 Mar 2025 16:24:36 -0700 Subject: [PATCH 11/16] drm/i915: Avoid open-coded use of ratelimit_state structure's ->missed field The i915_oa_stream_destroy() function directly accesses the ratelimit_state structure's ->missed field, which works, but which also makes it more difficult to change this field. Therefore, make use of the ratelimit_state_get_miss() function instead of directly accessing the ->missed field. Link: https://lore.kernel.org/all/fbe93a52-365e-47fe-93a4-44a44547d601@paulmck-laptop/ Link: https://lore.kernel.org/all/20250423115409.3425-1-spasswolf@web.de/ Signed-off-by: Paul E. McKenney Reviewed-by: Petr Mladek Acked-by: Jani Nikula Cc: Joonas Lahtinen Cc: Rodrigo Vivi Cc: Tvrtko Ursulin Cc: David Airlie Cc: Simona Vetter Cc: Cc: --- drivers/gpu/drm/i915/i915_perf.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/gpu/drm/i915/i915_perf.c b/drivers/gpu/drm/i915/i915_perf.c index de0b413600a1..1658f1246c6f 100644 --- a/drivers/gpu/drm/i915/i915_perf.c +++ b/drivers/gpu/drm/i915/i915_perf.c @@ -1666,6 +1666,7 @@ static void i915_oa_stream_destroy(struct i915_perf_stream *stream) struct i915_perf *perf = stream->perf; struct intel_gt *gt = stream->engine->gt; struct i915_perf_group *g = stream->engine->oa_group; + int m; if (WARN_ON(stream != g->exclusive_stream)) return; @@ -1690,10 +1691,9 @@ static void i915_oa_stream_destroy(struct i915_perf_stream *stream) free_oa_configs(stream); free_noa_wait(stream); - if (perf->spurious_report_rs.missed) { - gt_notice(gt, "%d spurious OA report notices suppressed due to ratelimiting\n", - perf->spurious_report_rs.missed); - } + m = ratelimit_state_get_miss(&perf->spurious_report_rs); + if (m) + gt_notice(gt, "%d spurious OA report notices suppressed due to ratelimiting\n", m); } static void gen7_init_oa_buffer(struct i915_perf_stream *stream) -- 2.51.0 From c6f7f1b2c0ff46b9069a8fbc7d167c9ead66dfce Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 17 Mar 2025 20:28:40 -0700 Subject: [PATCH 12/16] drm/amd/pm: Avoid open-coded use of ratelimit_state structure's internals MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit The amdgpu_set_thermal_throttling_logging() function directly accesses the ratelimit_state structure's ->missed field, which works, but which also makes it more difficult to change this field. Therefore, make use of the ratelimit_state_reset_interval() function instead of directly accessing the ->missed field. Nevertheless, open-coded use of ->burst and ->interval is still permitted, for example, for runtime sysfs adjustment of these fields. Link: https://lore.kernel.org/all/fbe93a52-365e-47fe-93a4-44a44547d601@paulmck-laptop/ Link: https://lore.kernel.org/all/20250423115409.3425-1-spasswolf@web.de/ Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202503180826.EiekA1MB-lkp@intel.com/ Signed-off-by: Paul E. McKenney Acked-by: Alex Deucher Reviewed-by: Petr Mladek Cc: Kenneth Feng Cc: "Christian König" Cc: Xinhui Pan Cc: David Airlie Cc: Simona Vetter Cc: Cc: --- drivers/gpu/drm/amd/pm/amdgpu_pm.c | 11 ++--------- 1 file changed, 2 insertions(+), 9 deletions(-) diff --git a/drivers/gpu/drm/amd/pm/amdgpu_pm.c b/drivers/gpu/drm/amd/pm/amdgpu_pm.c index 922def51685b..d533c79f7e21 100644 --- a/drivers/gpu/drm/amd/pm/amdgpu_pm.c +++ b/drivers/gpu/drm/amd/pm/amdgpu_pm.c @@ -1606,7 +1606,6 @@ static ssize_t amdgpu_set_thermal_throttling_logging(struct device *dev, struct drm_device *ddev = dev_get_drvdata(dev); struct amdgpu_device *adev = drm_to_adev(ddev); long throttling_logging_interval; - unsigned long flags; int ret = 0; ret = kstrtol(buf, 0, &throttling_logging_interval); @@ -1617,18 +1616,12 @@ static ssize_t amdgpu_set_thermal_throttling_logging(struct device *dev, return -EINVAL; if (throttling_logging_interval > 0) { - raw_spin_lock_irqsave(&adev->throttling_logging_rs.lock, flags); /* * Reset the ratelimit timer internals. * This can effectively restart the timer. */ - adev->throttling_logging_rs.interval = - (throttling_logging_interval - 1) * HZ; - adev->throttling_logging_rs.begin = 0; - adev->throttling_logging_rs.printed = 0; - adev->throttling_logging_rs.missed = 0; - raw_spin_unlock_irqrestore(&adev->throttling_logging_rs.lock, flags); - + ratelimit_state_reset_interval(&adev->throttling_logging_rs, + (throttling_logging_interval - 1) * HZ); atomic_set(&adev->throttling_logging_enabled, 1); } else { atomic_set(&adev->throttling_logging_enabled, 0); -- 2.51.0 From 78bf44de47b3cec72ee8ddc14161d5b3212f25e0 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 14 Mar 2025 09:07:13 -0700 Subject: [PATCH 13/16] ratelimit: Convert the ->missed field to atomic_t The ratelimit_state structure's ->missed field is sometimes incremented locklessly, and it would be good to avoid lost counts. This is also needed to count the number of misses due to trylock failure. Therefore, convert the ratelimit_state structure's ->missed field to atomic_t. Link: https://lore.kernel.org/all/fbe93a52-365e-47fe-93a4-44a44547d601@paulmck-laptop/ Link: https://lore.kernel.org/all/20250423115409.3425-1-spasswolf@web.de/ Signed-off-by: Paul E. McKenney Reviewed-by: Petr Mladek Cc: Andrew Morton Cc: Kuniyuki Iwashima Cc: Mateusz Guzik Cc: Steven Rostedt Cc: John Ogness Cc: Sergey Senozhatsky --- include/linux/ratelimit.h | 9 +++------ include/linux/ratelimit_types.h | 2 +- lib/ratelimit.c | 2 +- 3 files changed, 5 insertions(+), 8 deletions(-) diff --git a/include/linux/ratelimit.h b/include/linux/ratelimit.h index 8400c5356c18..c78b92b3e5cd 100644 --- a/include/linux/ratelimit.h +++ b/include/linux/ratelimit.h @@ -24,20 +24,17 @@ static inline void ratelimit_default_init(struct ratelimit_state *rs) static inline void ratelimit_state_inc_miss(struct ratelimit_state *rs) { - rs->missed++; + atomic_inc(&rs->missed); } static inline int ratelimit_state_get_miss(struct ratelimit_state *rs) { - return rs->missed; + return atomic_read(&rs->missed); } static inline int ratelimit_state_reset_miss(struct ratelimit_state *rs) { - int ret = rs->missed; - - rs->missed = 0; - return ret; + return atomic_xchg_relaxed(&rs->missed, 0); } static inline void ratelimit_state_reset_interval(struct ratelimit_state *rs, int interval_init) diff --git a/include/linux/ratelimit_types.h b/include/linux/ratelimit_types.h index 765232ce0b5e..d21fe82b67f6 100644 --- a/include/linux/ratelimit_types.h +++ b/include/linux/ratelimit_types.h @@ -18,7 +18,7 @@ struct ratelimit_state { int interval; int burst; int printed; - int missed; + atomic_t missed; unsigned int flags; unsigned long begin; }; diff --git a/lib/ratelimit.c b/lib/ratelimit.c index 85e22f00180c..18703f92d73e 100644 --- a/lib/ratelimit.c +++ b/lib/ratelimit.c @@ -66,7 +66,7 @@ int ___ratelimit(struct ratelimit_state *rs, const char *func) rs->printed++; ret = 1; } else { - rs->missed++; + ratelimit_state_inc_miss(rs); ret = 0; } raw_spin_unlock_irqrestore(&rs->lock, flags); -- 2.51.0 From d343732ddbfa15db88a628bf3284eb088efbaaf7 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 14 Mar 2025 11:26:59 -0700 Subject: [PATCH 14/16] ratelimit: Count misses due to lock contention The ___ratelimit() function simply returns zero ("do ratelimiting") if the trylock fails, but does not adjust the ->missed field. This means that the resulting dropped printk()s are dropped silently, which could seriously confuse people trying to do console-log-based debugging. Therefore, increment the ->missed field upon trylock failure. Link: https://lore.kernel.org/all/fbe93a52-365e-47fe-93a4-44a44547d601@paulmck-laptop/ Link: https://lore.kernel.org/all/20250423115409.3425-1-spasswolf@web.de/ Signed-off-by: Paul E. McKenney Reviewed-by: Petr Mladek Cc: Andrew Morton Cc: Kuniyuki Iwashima Cc: Mateusz Guzik Cc: Steven Rostedt Cc: John Ogness Cc: Sergey Senozhatsky --- lib/ratelimit.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/lib/ratelimit.c b/lib/ratelimit.c index 18703f92d73e..19ad3cdbd171 100644 --- a/lib/ratelimit.c +++ b/lib/ratelimit.c @@ -44,8 +44,10 @@ int ___ratelimit(struct ratelimit_state *rs, const char *func) * in addition to the one that will be printed by * the entity that is holding the lock already: */ - if (!raw_spin_trylock_irqsave(&rs->lock, flags)) + if (!raw_spin_trylock_irqsave(&rs->lock, flags)) { + ratelimit_state_inc_miss(rs); return 0; + } if (!rs->begin) rs->begin = jiffies; -- 2.51.0 From e64a348dc148af41cec266b2143305a2d0228ee7 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 25 Mar 2025 15:32:54 -0700 Subject: [PATCH 15/16] ratelimit: Avoid jiffies=0 special case The ___ratelimit() function special-cases the jiffies-counter value of zero as "uninitialized". This works well on 64-bit systems, where the jiffies counter is not going to return to zero for more than half a billion years on systems with HZ=1000, but similar 32-bit systems take less than 50 days to wrap the jiffies counter. And although the consequences of wrapping the jiffies counter seem to be limited to minor confusion on the duration of the rate-limiting interval that happens to end at time zero, it is almost no work to avoid this confusion. Therefore, introduce a RATELIMIT_INITIALIZED bit to the ratelimit_state structure's ->flags field so that a ->begin value of zero is no longer special. Link: https://lore.kernel.org/all/fbe93a52-365e-47fe-93a4-44a44547d601@paulmck-laptop/ Link: https://lore.kernel.org/all/20250423115409.3425-1-spasswolf@web.de/ Signed-off-by: Paul E. McKenney Reviewed-by: Petr Mladek Cc: Andrew Morton Cc: Kuniyuki Iwashima Cc: Mateusz Guzik Cc: Steven Rostedt Cc: John Ogness Cc: Sergey Senozhatsky --- include/linux/ratelimit.h | 2 +- include/linux/ratelimit_types.h | 1 + lib/ratelimit.c | 4 +++- 3 files changed, 5 insertions(+), 2 deletions(-) diff --git a/include/linux/ratelimit.h b/include/linux/ratelimit.h index c78b92b3e5cd..adfec24061d1 100644 --- a/include/linux/ratelimit.h +++ b/include/linux/ratelimit.h @@ -43,7 +43,7 @@ static inline void ratelimit_state_reset_interval(struct ratelimit_state *rs, in raw_spin_lock_irqsave(&rs->lock, flags); rs->interval = interval_init; - rs->begin = 0; + rs->flags &= ~RATELIMIT_INITIALIZED; rs->printed = 0; ratelimit_state_reset_miss(rs); raw_spin_unlock_irqrestore(&rs->lock, flags); diff --git a/include/linux/ratelimit_types.h b/include/linux/ratelimit_types.h index d21fe82b67f6..ef6711b6b229 100644 --- a/include/linux/ratelimit_types.h +++ b/include/linux/ratelimit_types.h @@ -11,6 +11,7 @@ /* issue num suppressed message on exit */ #define RATELIMIT_MSG_ON_RELEASE BIT(0) +#define RATELIMIT_INITIALIZED BIT(1) struct ratelimit_state { raw_spinlock_t lock; /* protect the state */ diff --git a/lib/ratelimit.c b/lib/ratelimit.c index 19ad3cdbd171..bd6e3b429e33 100644 --- a/lib/ratelimit.c +++ b/lib/ratelimit.c @@ -49,8 +49,10 @@ int ___ratelimit(struct ratelimit_state *rs, const char *func) return 0; } - if (!rs->begin) + if (!(rs->flags & RATELIMIT_INITIALIZED)) { rs->begin = jiffies; + rs->flags |= RATELIMIT_INITIALIZED; + } if (time_is_before_jiffies(rs->begin + interval)) { int m = ratelimit_state_reset_miss(rs); -- 2.51.0 From cf8cfa8a9978bfe77f2648a04824a46d58258e68 Mon Sep 17 00:00:00 2001 From: Petr Mladek Date: Wed, 9 Apr 2025 16:54:33 -0700 Subject: [PATCH 16/16] ratelimit: Reduce ___ratelimit() false-positive rate limiting Retain the locked design, but check rate-limiting even when the lock could not be acquired. Link: https://lore.kernel.org/all/Z_VRo63o2UsVoxLG@pathway.suse.cz/ Link: https://lore.kernel.org/all/fbe93a52-365e-47fe-93a4-44a44547d601@paulmck-laptop/ Link: https://lore.kernel.org/all/20250423115409.3425-1-spasswolf@web.de/ Signed-off-by: Petr Mladek Signed-off-by: Paul E. McKenney Cc: Petr Mladek Cc: Andrew Morton Cc: Kuniyuki Iwashima Cc: Mateusz Guzik Cc: Steven Rostedt Cc: John Ogness Cc: Sergey Senozhatsky --- include/linux/ratelimit.h | 2 +- include/linux/ratelimit_types.h | 2 +- lib/ratelimit.c | 51 ++++++++++++++++++++++++--------- 3 files changed, 40 insertions(+), 15 deletions(-) diff --git a/include/linux/ratelimit.h b/include/linux/ratelimit.h index adfec24061d1..7aaad158ee37 100644 --- a/include/linux/ratelimit.h +++ b/include/linux/ratelimit.h @@ -44,7 +44,7 @@ static inline void ratelimit_state_reset_interval(struct ratelimit_state *rs, in raw_spin_lock_irqsave(&rs->lock, flags); rs->interval = interval_init; rs->flags &= ~RATELIMIT_INITIALIZED; - rs->printed = 0; + atomic_set(&rs->rs_n_left, rs->burst); ratelimit_state_reset_miss(rs); raw_spin_unlock_irqrestore(&rs->lock, flags); } diff --git a/include/linux/ratelimit_types.h b/include/linux/ratelimit_types.h index ef6711b6b229..b19c4354540a 100644 --- a/include/linux/ratelimit_types.h +++ b/include/linux/ratelimit_types.h @@ -18,7 +18,7 @@ struct ratelimit_state { int interval; int burst; - int printed; + atomic_t rs_n_left; atomic_t missed; unsigned int flags; unsigned long begin; diff --git a/lib/ratelimit.c b/lib/ratelimit.c index bd6e3b429e33..90c9fe57eb42 100644 --- a/lib/ratelimit.c +++ b/lib/ratelimit.c @@ -39,12 +39,22 @@ int ___ratelimit(struct ratelimit_state *rs, const char *func) return 1; /* - * If we contend on this state's lock then almost - * by definition we are too busy to print a message, - * in addition to the one that will be printed by - * the entity that is holding the lock already: + * If we contend on this state's lock then just check if + * the current burst is used or not. It might cause + * false positive when we are past the interval and + * the current lock owner is just about to reset it. */ if (!raw_spin_trylock_irqsave(&rs->lock, flags)) { + unsigned int rs_flags = READ_ONCE(rs->flags); + + if (rs_flags & RATELIMIT_INITIALIZED && burst) { + int n_left; + + n_left = atomic_dec_return(&rs->rs_n_left); + if (n_left >= 0) + return 1; + } + ratelimit_state_inc_miss(rs); return 0; } @@ -52,27 +62,42 @@ int ___ratelimit(struct ratelimit_state *rs, const char *func) if (!(rs->flags & RATELIMIT_INITIALIZED)) { rs->begin = jiffies; rs->flags |= RATELIMIT_INITIALIZED; + atomic_set(&rs->rs_n_left, rs->burst); } if (time_is_before_jiffies(rs->begin + interval)) { - int m = ratelimit_state_reset_miss(rs); + int m; + + /* + * Reset rs_n_left ASAP to reduce false positives + * in parallel calls, see above. + */ + atomic_set(&rs->rs_n_left, rs->burst); + rs->begin = jiffies; + m = ratelimit_state_reset_miss(rs); if (m) { if (!(rs->flags & RATELIMIT_MSG_ON_RELEASE)) { printk_deferred(KERN_WARNING "%s: %d callbacks suppressed\n", func, m); } } - rs->begin = jiffies; - rs->printed = 0; } - if (burst && burst > rs->printed) { - rs->printed++; - ret = 1; - } else { - ratelimit_state_inc_miss(rs); - ret = 0; + if (burst) { + int n_left; + + /* The burst might have been taken by a parallel call. */ + n_left = atomic_dec_return(&rs->rs_n_left); + if (n_left >= 0) { + ret = 1; + goto unlock_ret; + } } + + ratelimit_state_inc_miss(rs); + ret = 0; + +unlock_ret: raw_spin_unlock_irqrestore(&rs->lock, flags); return ret; -- 2.51.0