From f729775f79a9c942c6c82ed6b44bd030afe10423 Mon Sep 17 00:00:00 2001 From: Len Brown Date: Sun, 6 Apr 2025 11:18:39 -0400 Subject: [PATCH 01/16] tools/power turbostat: report CoreThr per measurement interval The CoreThr column displays total thermal throttling events since boot time. Change it to report events during the measurement interval. This is more useful for showing a user the current conditions. Total events since boot time are still available to the user via /sys/devices/system/cpu/cpu*/thermal_throttle/* Document CoreThr on turbostat.8 Fixes: eae97e053fe30 ("turbostat: Support thermal throttle count print") Reported-by: Arjan van de Ven Signed-off-by: Len Brown Cc: Chen Yu --- tools/power/x86/turbostat/turbostat.8 | 2 ++ tools/power/x86/turbostat/turbostat.c | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/tools/power/x86/turbostat/turbostat.8 b/tools/power/x86/turbostat/turbostat.8 index 52d727e29ea7..144565151e1e 100644 --- a/tools/power/x86/turbostat/turbostat.8 +++ b/tools/power/x86/turbostat/turbostat.8 @@ -172,6 +172,8 @@ The system configuration dump (if --quiet is not used) is followed by statistics .PP \fBPkgTmp\fP Degrees Celsius reported by the per-package Package Thermal Monitor. .PP +\fBCoreThr\fP Core Thermal Throttling events during the measurement interval. Note that events since boot can be find in /sys/devices/system/cpu/cpu*/thermal_throttle/* +.PP \fBGFX%rc6\fP The percentage of time the GPU is in the "render C6" state, rc6, during the measurement interval. From /sys/class/drm/card0/power/rc6_residency_ms or /sys/class/drm/card0/gt/gt0/rc6_residency_ms or /sys/class/drm/card0/device/tile0/gtN/gtidle/idle_residency_ms depending on the graphics driver being used. .PP \fBGFXMHz\fP Instantaneous snapshot of what sysfs presents at the end of the measurement interval. From /sys/class/graphics/fb0/device/drm/card0/gt_cur_freq_mhz or /sys/class/drm/card0/gt_cur_freq_mhz or /sys/class/drm/card0/gt/gt0/rps_cur_freq_mhz or /sys/class/drm/card0/device/tile0/gtN/freq0/cur_freq depending on the graphics driver being used. diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index 218aca958923..70e17d4ad9b6 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -3485,7 +3485,7 @@ void delta_core(struct core_data *new, struct core_data *old) old->c6 = new->c6 - old->c6; old->c7 = new->c7 - old->c7; old->core_temp_c = new->core_temp_c; - old->core_throt_cnt = new->core_throt_cnt; + old->core_throt_cnt = new->core_throt_cnt - old->core_throt_cnt; old->mc6_us = new->mc6_us - old->mc6_us; DELTA_WRAP32(new->core_energy.raw_value, old->core_energy.raw_value); -- 2.51.0 From 3ae8508663372b93c5556a887e96ed0ca5df0711 Mon Sep 17 00:00:00 2001 From: Len Brown Date: Sun, 6 Apr 2025 12:23:22 -0400 Subject: [PATCH 02/16] tools/power turbostat: Document GNR UncMHz domain convention Document that on Intel Granite Rapids Systems, Uncore domains 0-2 are CPU domains, and uncore domains 3-4 are IO domains. Signed-off-by: Len Brown --- tools/power/x86/turbostat/turbostat.8 | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/power/x86/turbostat/turbostat.8 b/tools/power/x86/turbostat/turbostat.8 index 144565151e1e..e86493880c16 100644 --- a/tools/power/x86/turbostat/turbostat.8 +++ b/tools/power/x86/turbostat/turbostat.8 @@ -205,6 +205,7 @@ The system configuration dump (if --quiet is not used) is followed by statistics \fBUncMHz\fP per-package uncore MHz, instantaneous sample. .PP \fBUMHz1.0\fP per-package uncore MHz for domain=1 and fabric_cluster=0, instantaneous sample. System summary is the average of all packages. +Intel Granite Rapids systems use domains 0-2 for CPUs, and 3-4 for IO, with cluster always 0. For the "--show" and "--hide" options, use "UncMHz" to operate on all UMHz*.* as a group. .SH TOO MUCH INFORMATION EXAMPLE By default, turbostat dumps all possible information -- a system configuration header, followed by columns for all counters. -- 2.51.0 From f8b136ef2605c1bf62020462d10e35228760aa19 Mon Sep 17 00:00:00 2001 From: Zhang Rui Date: Wed, 19 Mar 2025 08:53:07 +0800 Subject: [PATCH 03/16] tools/power turbostat: Restore GFX sysfs fflush() call Do fflush() to discard the buffered data, before each read of the graphics sysfs knobs. Fixes: ba99a4fc8c24 ("tools/power turbostat: Remove unnecessary fflush() call") Signed-off-by: Zhang Rui Signed-off-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index 70e17d4ad9b6..c9a34c16c7a8 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -6039,6 +6039,7 @@ int snapshot_graphics(int idx) int retval; rewind(gfx_info[idx].fp); + fflush(gfx_info[idx].fp); switch (idx) { case GFX_rc6: -- 2.51.0 From 994633894f208a0151baaee1688ab3c431912553 Mon Sep 17 00:00:00 2001 From: Len Brown Date: Sun, 6 Apr 2025 12:53:18 -0400 Subject: [PATCH 04/16] tools/power turbostat: re-factor sysfs code Probe cpuidle "sysfs" residency and counts separately, since soon we will make one disabled on, and the other disabled off. Clarify that some BIC (build-in-counters) are actually "groups". since we're about to re-name some of those groups. no functional change. Signed-off-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 31 ++++++++++++++++++--------- 1 file changed, 21 insertions(+), 10 deletions(-) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index c9a34c16c7a8..df0391bedcde 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -273,10 +273,10 @@ struct msr_counter bic[] = { #define BIC_NMI (1ULL << 61) #define BIC_CPU_c1e (1ULL << 62) -#define BIC_TOPOLOGY (BIC_Package | BIC_Node | BIC_CoreCnt | BIC_PkgCnt | BIC_Core | BIC_CPU | BIC_Die) -#define BIC_THERMAL_PWR (BIC_CoreTmp | BIC_PkgTmp | BIC_PkgWatt | BIC_CorWatt | BIC_GFXWatt | BIC_RAMWatt | BIC_PKG__ | BIC_RAM__ | BIC_SysWatt) -#define BIC_FREQUENCY (BIC_Avg_MHz | BIC_Busy | BIC_Bzy_MHz | BIC_TSC_MHz | BIC_GFXMHz | BIC_GFXACTMHz | BIC_SAMMHz | BIC_SAMACTMHz | BIC_UNCORE_MHZ) -#define BIC_IDLE (BIC_Busy | BIC_sysfs | BIC_CPU_c1 | BIC_CPU_c3 | BIC_CPU_c6 | BIC_CPU_c7 | BIC_GFX_rc6 | BIC_Pkgpc2 | BIC_Pkgpc3 | BIC_Pkgpc6 | BIC_Pkgpc7 | BIC_Pkgpc8 | BIC_Pkgpc9 | BIC_Pkgpc10 | BIC_CPU_LPI | BIC_SYS_LPI | BIC_Mod_c6 | BIC_Totl_c0 | BIC_Any_c0 | BIC_GFX_c0 | BIC_CPUGFX | BIC_SAM_mc6 | BIC_Diec6) +#define BIC_GROUP_TOPOLOGY (BIC_Package | BIC_Node | BIC_CoreCnt | BIC_PkgCnt | BIC_Core | BIC_CPU | BIC_Die) +#define BIC_GROUP_THERMAL_PWR (BIC_CoreTmp | BIC_PkgTmp | BIC_PkgWatt | BIC_CorWatt | BIC_GFXWatt | BIC_RAMWatt | BIC_PKG__ | BIC_RAM__ | BIC_SysWatt) +#define BIC_GROUP_FREQUENCY (BIC_Avg_MHz | BIC_Busy | BIC_Bzy_MHz | BIC_TSC_MHz | BIC_GFXMHz | BIC_GFXACTMHz | BIC_SAMMHz | BIC_SAMACTMHz | BIC_UNCORE_MHZ) +#define BIC_GROUP_IDLE (BIC_Busy | BIC_sysfs | BIC_CPU_c1 | BIC_CPU_c3 | BIC_CPU_c6 | BIC_CPU_c7 | BIC_GFX_rc6 | BIC_Pkgpc2 | BIC_Pkgpc3 | BIC_Pkgpc6 | BIC_Pkgpc7 | BIC_Pkgpc8 | BIC_Pkgpc9 | BIC_Pkgpc10 | BIC_CPU_LPI | BIC_SYS_LPI | BIC_Mod_c6 | BIC_Totl_c0 | BIC_Any_c0 | BIC_GFX_c0 | BIC_CPUGFX | BIC_SAM_mc6 | BIC_Diec6) #define BIC_OTHER (BIC_IRQ | BIC_NMI | BIC_SMI | BIC_ThreadC | BIC_CoreTmp | BIC_IPC) #define BIC_DISABLED_BY_DEFAULT (BIC_USEC | BIC_TOD | BIC_APIC | BIC_X2APIC) @@ -2354,16 +2354,16 @@ unsigned long long bic_lookup(char *name_list, enum show_hide_mode mode) retval |= ~0; break; } else if (!strcmp(name_list, "topology")) { - retval |= BIC_TOPOLOGY; + retval |= BIC_GROUP_TOPOLOGY; break; } else if (!strcmp(name_list, "power")) { - retval |= BIC_THERMAL_PWR; + retval |= BIC_GROUP_THERMAL_PWR; break; } else if (!strcmp(name_list, "idle")) { - retval |= BIC_IDLE; + retval |= BIC_GROUP_IDLE; break; } else if (!strcmp(name_list, "frequency")) { - retval |= BIC_FREQUENCY; + retval |= BIC_GROUP_FREQUENCY; break; } else if (!strcmp(name_list, "other")) { retval |= BIC_OTHER; @@ -10260,7 +10260,7 @@ int is_deferred_skip(char *name) return 0; } -void probe_sysfs(void) +void probe_cpuidle_residency(void) { char path[64]; char name_buf[16]; @@ -10304,6 +10304,16 @@ void probe_sysfs(void) if (state < min_state) min_state = state; } +} + +void probe_cpuidle_counts(void) +{ + char path[64]; + char name_buf[16]; + FILE *input; + int state; + int min_state = 1024, max_state = 0; + char *sp; for (state = 10; state >= 0; --state) { @@ -10602,7 +10612,8 @@ skip_cgroup_setting: print_bootcmd(); } - probe_sysfs(); + probe_cpuidle_residency(); + probe_cpuidle_counts(); if (!getuid()) set_rlimit(); -- 2.51.0 From 6f110a5e4f9977c31ce76fefbfef6fd4eab6bfb7 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sun, 6 Apr 2025 10:00:04 -0700 Subject: [PATCH 05/16] Disable SLUB_TINY for build testing ... and don't error out so hard on missing module descriptions. Before commit 6c6c1fc09de3 ("modpost: require a MODULE_DESCRIPTION()") we used to warn about missing module descriptions, but only when building with extra warnigns (ie 'W=1'). After that commit the warning became an unconditional hard error. And it turns out not all modules have been converted despite the claims to the contrary. As reported by Damian Tometzki, the slub KUnit test didn't have a module description, and apparently nobody ever really noticed. The reason nobody noticed seems to be that the slub KUnit tests get disabled by SLUB_TINY, which also ends up disabling a lot of other code, both in tests and in slub itself. And so anybody doing full build tests didn't actually see this failre. So let's disable SLUB_TINY for build-only tests, since it clearly ends up limiting build coverage. Also turn the missing module descriptions error back into a warning, but let's keep it around for non-'W=1' builds. Reported-by: Damian Tometzki Link: https://lore.kernel.org/all/01070196099fd059-e8463438-7b1b-4ec8-816d-173874be9966-000000@eu-central-1.amazonses.com/ Cc: Masahiro Yamada Cc: Jeff Johnson Fixes: 6c6c1fc09de3 ("modpost: require a MODULE_DESCRIPTION()") Signed-off-by: Linus Torvalds --- mm/Kconfig | 2 +- scripts/mod/modpost.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/mm/Kconfig b/mm/Kconfig index d3fb3762887b..e113f713b493 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -201,7 +201,7 @@ config KVFREE_RCU_BATCHED config SLUB_TINY bool "Configure for minimal memory footprint" - depends on EXPERT + depends on EXPERT && !COMPILE_TEST select SLAB_MERGE_DEFAULT help Configures the slab allocator in a way to achieve minimal memory diff --git a/scripts/mod/modpost.c b/scripts/mod/modpost.c index 92627e8d0e16..be89921d60b6 100644 --- a/scripts/mod/modpost.c +++ b/scripts/mod/modpost.c @@ -1603,7 +1603,7 @@ static void read_symbols(const char *modname) } if (!get_modinfo(&info, "description")) - error("missing MODULE_DESCRIPTION() in %s\n", modname); + warn("missing MODULE_DESCRIPTION() in %s\n", modname); } for (sym = info.symtab_start; sym < info.symtab_stop; sym++) { -- 2.51.0 From ec4acd3166d8a7a03b059d01b9c6f11a658e833f Mon Sep 17 00:00:00 2001 From: Len Brown Date: Sun, 6 Apr 2025 14:29:57 -0400 Subject: [PATCH 06/16] tools/power turbostat: disable "cpuidle" invocation counters, by default Create "pct_idle" counter group, the sofware notion of residency so it can now be singled out, independent of other counter groups. Create "cpuidle" group, the cpuidle invocation counts. Disable "cpuidle", by default. Create "swidle" = "cpuidle" + "pct_idle". Undocument "sysfs", the old name for "swidle", but keep it working for backwards compatibilty. Create "hwidle", all the HW idle counters Modify "idle", enabled by default "idle" = "hwidle" + "pct_idle" (and now excludes "cpuidle") Signed-off-by: Len Brown --- tools/power/x86/turbostat/turbostat.8 | 12 +++++----- tools/power/x86/turbostat/turbostat.c | 34 +++++++++++++++++++++------ 2 files changed, 33 insertions(+), 13 deletions(-) diff --git a/tools/power/x86/turbostat/turbostat.8 b/tools/power/x86/turbostat/turbostat.8 index e86493880c16..b74ed916057e 100644 --- a/tools/power/x86/turbostat/turbostat.8 +++ b/tools/power/x86/turbostat/turbostat.8 @@ -100,7 +100,7 @@ The column name "all" can be used to enable all disabled-by-default built-in cou .PP \fB--show column\fP show only the specified built-in columns. May be invoked multiple times, or with a comma-separated list of column names. .PP -\fB--show CATEGORY --hide CATEGORY\fP Show and hide also accept a single CATEGORY of columns: "all", "topology", "idle", "frequency", "power", "sysfs", "other". +\fB--show CATEGORY --hide CATEGORY\fP Show and hide also accept a single CATEGORY of columns: "all", "topology", "idle", "frequency", "power", "cpuidle", "hwidle", "swidle", "other". "idle" (enabled by default), includes "hwidle" and "idle_pct". "cpuidle" (default disabled) includes cpuidle software invocation counters. "swidle" includes "cpuidle" plus "idle_pct". "hwidle" includes only hardware based idle residency counters. Older versions of turbostat used the term "sysfs" for what is now "swidle". .PP \fB--Dump\fP displays the raw counter values. .PP @@ -158,15 +158,15 @@ The system configuration dump (if --quiet is not used) is followed by statistics .PP \fBSMI\fP The number of System Management Interrupts serviced CPU during the measurement interval. While this counter is actually per-CPU, SMI are triggered on all processors, so the number should be the same for all CPUs. .PP -\fBC1, C2, C3...\fP The number times Linux requested the C1, C2, C3 idle state during the measurement interval. The system summary line shows the sum for all CPUs. These are C-state names as exported in /sys/devices/system/cpu/cpu*/cpuidle/state*/name. While their names are generic, their attributes are processor specific. They the system description section of output shows what MWAIT sub-states they are mapped to on each system. +\fBC1, C2, C3...\fP The number times Linux requested the C1, C2, C3 idle state during the measurement interval. The system summary line shows the sum for all CPUs. These are C-state names as exported in /sys/devices/system/cpu/cpu*/cpuidle/state*/name. While their names are generic, their attributes are processor specific. They the system description section of output shows what MWAIT sub-states they are mapped to on each system. These counters are in the "cpuidle" group, which is disabled, by default. .PP -\fBC1+, C2+, C3+...\fP The idle governor idle state misprediction statistics. Inidcates the number times Linux requested the C1, C2, C3 idle state during the measurement interval, but should have requested a deeper idle state (if it exists and enabled). These statistics come from the /sys/devices/system/cpu/cpu*/cpuidle/state*/below file. +\fBC1+, C2+, C3+...\fP The idle governor idle state misprediction statistics. Inidcates the number times Linux requested the C1, C2, C3 idle state during the measurement interval, but should have requested a deeper idle state (if it exists and enabled). These statistics come from the /sys/devices/system/cpu/cpu*/cpuidle/state*/below file. These counters are in the "cpuidle" group, which is disabled, by default. .PP -\fBC1-, C2-, C3-...\fP The idle governor idle state misprediction statistics. Inidcates the number times Linux requested the C1, C2, C3 idle state during the measurement interval, but should have requested a shallower idle state (if it exists and enabled). These statistics come from the /sys/devices/system/cpu/cpu*/cpuidle/state*/above file. +\fBC1-, C2-, C3-...\fP The idle governor idle state misprediction statistics. Inidcates the number times Linux requested the C1, C2, C3 idle state during the measurement interval, but should have requested a shallower idle state (if it exists and enabled). These statistics come from the /sys/devices/system/cpu/cpu*/cpuidle/state*/above file. These counters are in the "cpuidle" group, which is disabled, by default. .PP -\fBC1%, C2%, C3%\fP The residency percentage that Linux requested C1, C2, C3.... The system summary is the average of all CPUs in the system. Note that these are software, reflecting what was requested. The hardware counters reflect what was actually achieved. +\fBC1%, C2%, C3%\fP The residency percentage that Linux requested C1, C2, C3.... The system summary is the average of all CPUs in the system. Note that these are software, reflecting what was requested. The hardware counters reflect what was actually achieved. These counters are in the "pct_idle" group, which is enabled by default. .PP -\fBCPU%c1, CPU%c3, CPU%c6, CPU%c7\fP show the percentage residency in hardware core idle states. These numbers are from hardware residency counters. +\fBCPU%c1, CPU%c3, CPU%c6, CPU%c7\fP show the percentage residency in hardware core idle states. These numbers are from hardware residency counters and are in the "hwidle" group, which is enabled, by default. .PP \fBCoreTmp\fP Degrees Celsius reported by the per-core Digital Thermal Sensor. .PP diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index df0391bedcde..ab184f95cdaf 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -153,7 +153,7 @@ struct msr_counter bic[] = { { 0x0, "TSC_MHz", NULL, 0, 0, 0, NULL, 0 }, { 0x0, "IRQ", NULL, 0, 0, 0, NULL, 0 }, { 0x0, "SMI", NULL, 32, 0, FORMAT_DELTA, NULL, 0 }, - { 0x0, "sysfs", NULL, 0, 0, 0, NULL, 0 }, + { 0x0, "cpuidle", NULL, 0, 0, 0, NULL, 0 }, { 0x0, "CPU%c1", NULL, 0, 0, 0, NULL, 0 }, { 0x0, "CPU%c3", NULL, 0, 0, 0, NULL, 0 }, { 0x0, "CPU%c6", NULL, 0, 0, 0, NULL, 0 }, @@ -206,6 +206,7 @@ struct msr_counter bic[] = { { 0x0, "Sys_J", NULL, 0, 0, 0, NULL, 0 }, { 0x0, "NMI", NULL, 0, 0, 0, NULL, 0 }, { 0x0, "CPU%c1e", NULL, 0, 0, 0, NULL, 0 }, + { 0x0, "pct_idle", NULL, 0, 0, 0, NULL, 0 }, }; #define MAX_BIC (sizeof(bic) / sizeof(struct msr_counter)) @@ -219,7 +220,7 @@ struct msr_counter bic[] = { #define BIC_TSC_MHz (1ULL << 7) #define BIC_IRQ (1ULL << 8) #define BIC_SMI (1ULL << 9) -#define BIC_sysfs (1ULL << 10) +#define BIC_cpuidle (1ULL << 10) #define BIC_CPU_c1 (1ULL << 11) #define BIC_CPU_c3 (1ULL << 12) #define BIC_CPU_c6 (1ULL << 13) @@ -272,17 +273,20 @@ struct msr_counter bic[] = { #define BIC_Sys_J (1ULL << 60) #define BIC_NMI (1ULL << 61) #define BIC_CPU_c1e (1ULL << 62) +#define BIC_pct_idle (1ULL << 63) #define BIC_GROUP_TOPOLOGY (BIC_Package | BIC_Node | BIC_CoreCnt | BIC_PkgCnt | BIC_Core | BIC_CPU | BIC_Die) #define BIC_GROUP_THERMAL_PWR (BIC_CoreTmp | BIC_PkgTmp | BIC_PkgWatt | BIC_CorWatt | BIC_GFXWatt | BIC_RAMWatt | BIC_PKG__ | BIC_RAM__ | BIC_SysWatt) #define BIC_GROUP_FREQUENCY (BIC_Avg_MHz | BIC_Busy | BIC_Bzy_MHz | BIC_TSC_MHz | BIC_GFXMHz | BIC_GFXACTMHz | BIC_SAMMHz | BIC_SAMACTMHz | BIC_UNCORE_MHZ) -#define BIC_GROUP_IDLE (BIC_Busy | BIC_sysfs | BIC_CPU_c1 | BIC_CPU_c3 | BIC_CPU_c6 | BIC_CPU_c7 | BIC_GFX_rc6 | BIC_Pkgpc2 | BIC_Pkgpc3 | BIC_Pkgpc6 | BIC_Pkgpc7 | BIC_Pkgpc8 | BIC_Pkgpc9 | BIC_Pkgpc10 | BIC_CPU_LPI | BIC_SYS_LPI | BIC_Mod_c6 | BIC_Totl_c0 | BIC_Any_c0 | BIC_GFX_c0 | BIC_CPUGFX | BIC_SAM_mc6 | BIC_Diec6) +#define BIC_GROUP_HW_IDLE (BIC_Busy | BIC_CPU_c1 | BIC_CPU_c3 | BIC_CPU_c6 | BIC_CPU_c7 | BIC_GFX_rc6 | BIC_Pkgpc2 | BIC_Pkgpc3 | BIC_Pkgpc6 | BIC_Pkgpc7 | BIC_Pkgpc8 | BIC_Pkgpc9 | BIC_Pkgpc10 | BIC_CPU_LPI | BIC_SYS_LPI | BIC_Mod_c6 | BIC_Totl_c0 | BIC_Any_c0 | BIC_GFX_c0 | BIC_CPUGFX | BIC_SAM_mc6 | BIC_Diec6) +#define BIC_GROUP_SW_IDLE (BIC_Busy | BIC_cpuidle | BIC_pct_idle ) +#define BIC_GROUP_IDLE (BIC_GROUP_HW_IDLE | BIC_pct_idle) #define BIC_OTHER (BIC_IRQ | BIC_NMI | BIC_SMI | BIC_ThreadC | BIC_CoreTmp | BIC_IPC) -#define BIC_DISABLED_BY_DEFAULT (BIC_USEC | BIC_TOD | BIC_APIC | BIC_X2APIC) +#define BIC_DISABLED_BY_DEFAULT (BIC_USEC | BIC_TOD | BIC_APIC | BIC_X2APIC | BIC_cpuidle) unsigned long long bic_enabled = (0xFFFFFFFFFFFFFFFFULL & ~BIC_DISABLED_BY_DEFAULT); -unsigned long long bic_present = BIC_USEC | BIC_TOD | BIC_sysfs | BIC_APIC | BIC_X2APIC; +unsigned long long bic_present = BIC_USEC | BIC_TOD | BIC_cpuidle | BIC_pct_idle | BIC_APIC | BIC_X2APIC; #define DO_BIC(COUNTER_NAME) (bic_enabled & bic_present & COUNTER_NAME) #define DO_BIC_READ(COUNTER_NAME) (bic_present & COUNTER_NAME) @@ -2362,6 +2366,15 @@ unsigned long long bic_lookup(char *name_list, enum show_hide_mode mode) } else if (!strcmp(name_list, "idle")) { retval |= BIC_GROUP_IDLE; break; + } else if (!strcmp(name_list, "swidle")) { + retval |= BIC_GROUP_SW_IDLE; + break; + } else if (!strcmp(name_list, "sysfs")) { /* legacy compatibility */ + retval |= BIC_GROUP_SW_IDLE; + break; + } else if (!strcmp(name_list, "hwidle")) { + retval |= BIC_GROUP_HW_IDLE; + break; } else if (!strcmp(name_list, "frequency")) { retval |= BIC_GROUP_FREQUENCY; break; @@ -2372,6 +2385,7 @@ unsigned long long bic_lookup(char *name_list, enum show_hide_mode mode) } if (i == MAX_BIC) { + fprintf(stderr, "deferred %s\n", name_list); if (mode == SHOW_LIST) { deferred_add_names[deferred_add_index++] = name_list; if (deferred_add_index >= MAX_DEFERRED) { @@ -10269,6 +10283,9 @@ void probe_cpuidle_residency(void) int min_state = 1024, max_state = 0; char *sp; + if (!DO_BIC(BIC_pct_idle)) + return; + for (state = 10; state >= 0; --state) { sprintf(path, "/sys/devices/system/cpu/cpu%d/cpuidle/state%d/name", base_cpu, state); @@ -10291,7 +10308,7 @@ void probe_cpuidle_residency(void) sprintf(path, "cpuidle/state%d/time", state); - if (!DO_BIC(BIC_sysfs) && !is_deferred_add(name_buf)) + if (!DO_BIC(BIC_pct_idle) && !is_deferred_add(name_buf)) continue; if (is_deferred_skip(name_buf)) @@ -10315,6 +10332,9 @@ void probe_cpuidle_counts(void) int min_state = 1024, max_state = 0; char *sp; + if (!DO_BIC(BIC_cpuidle)) + return; + for (state = 10; state >= 0; --state) { sprintf(path, "/sys/devices/system/cpu/cpu%d/cpuidle/state%d/name", base_cpu, state); @@ -10327,7 +10347,7 @@ void probe_cpuidle_counts(void) remove_underbar(name_buf); - if (!DO_BIC(BIC_sysfs) && !is_deferred_add(name_buf)) + if (!DO_BIC(BIC_cpuidle) && !is_deferred_add(name_buf)) continue; if (is_deferred_skip(name_buf)) -- 2.51.0 From 03e00e373cab981ad808271b2650700cfa0fbda6 Mon Sep 17 00:00:00 2001 From: Len Brown Date: Sun, 6 Apr 2025 14:49:20 -0400 Subject: [PATCH 07/16] tools/power turbostat: v2025.05.06 Support up to 8192 processors Add cpuidle governor debug telemetry, disabled by default Update default output to exclude cpuidle invocation counts Bug fixes Signed-off-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index ab184f95cdaf..1b9fdc1a7ee8 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -9594,7 +9594,7 @@ int get_and_dump_counters(void) void print_version() { - fprintf(outf, "turbostat version 2025.02.02 - Len Brown \n"); + fprintf(outf, "turbostat version 2025.04.06 - Len Brown \n"); } #define COMMAND_LINE_SIZE 2048 -- 2.51.0 From 0efdedb3358aa78102967f242379686f94315830 Mon Sep 17 00:00:00 2001 From: =?utf8?q?Thomas=20Wei=C3=9Fschuh?= Date: Wed, 2 Apr 2025 21:21:57 +0100 Subject: [PATCH 08/16] tools/include: make uapi/linux/types.h usable from assembly MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit The "real" linux/types.h UAPI header gracefully degrades to a NOOP when included from assembly code. Mirror this behaviour in the tools/ variant. Test for __ASSEMBLER__ over __ASSEMBLY__ as the former is provided by the toolchain automatically. Reported-by: Mark Brown Closes: https://lore.kernel.org/lkml/af553c62-ca2f-4956-932c-dd6e3a126f58@sirena.org.uk/ Fixes: c9fbaa879508 ("selftests: vDSO: parse_vdso: Use UAPI headers instead of libc headers") Signed-off-by: Thomas Weißschuh Link: https://patch.msgid.link/20250321-uapi-consistency-v1-1-439070118dc0@linutronix.de Signed-off-by: Mark Brown Reviewed-by: Mark Brown Signed-off-by: Linus Torvalds --- tools/include/uapi/linux/types.h | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tools/include/uapi/linux/types.h b/tools/include/uapi/linux/types.h index 91fa51a9c31d..85aa327245c6 100644 --- a/tools/include/uapi/linux/types.h +++ b/tools/include/uapi/linux/types.h @@ -4,6 +4,8 @@ #include +#ifndef __ASSEMBLER__ + /* copied from linux:include/uapi/linux/types.h */ #define __bitwise typedef __u16 __bitwise __le16; @@ -20,4 +22,5 @@ typedef __u32 __bitwise __wsum; #define __aligned_be64 __be64 __attribute__((aligned(8))) #define __aligned_le64 __le64 __attribute__((aligned(8))) +#endif /* __ASSEMBLER__ */ #endif /* _UAPI_LINUX_TYPES_H */ -- 2.51.0 From 0af2f6be1b4281385b618cb86ad946eded089ac8 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sun, 6 Apr 2025 13:11:33 -0700 Subject: [PATCH 09/16] Linux 6.15-rc1 --- Makefile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Makefile b/Makefile index e55726a71d95..38689a0c3605 100644 --- a/Makefile +++ b/Makefile @@ -1,8 +1,8 @@ # SPDX-License-Identifier: GPL-2.0 VERSION = 6 -PATCHLEVEL = 14 +PATCHLEVEL = 15 SUBLEVEL = 0 -EXTRAVERSION = +EXTRAVERSION = -rc1 NAME = Baby Opossum Posse # *DOCUMENTATION* -- 2.51.0 From 47a1a15645d558699765f0f7e82032302ae47a46 Mon Sep 17 00:00:00 2001 From: =?utf8?q?Christian=20G=C3=B6ttsche?= Date: Tue, 18 Mar 2025 09:33:34 +0100 Subject: [PATCH 10/16] selinux: constify network address pointer MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit The network address, either an IPv4 or IPv6 one, is not modified. Signed-off-by: Christian Göttsche Signed-off-by: Paul Moore --- security/selinux/include/netnode.h | 2 +- security/selinux/include/security.h | 2 +- security/selinux/netnode.c | 8 ++++---- security/selinux/ss/services.c | 4 ++-- 4 files changed, 8 insertions(+), 8 deletions(-) diff --git a/security/selinux/include/netnode.h b/security/selinux/include/netnode.h index 9b8b655a8cd3..e4dc904c3585 100644 --- a/security/selinux/include/netnode.h +++ b/security/selinux/include/netnode.h @@ -21,6 +21,6 @@ void sel_netnode_flush(void); -int sel_netnode_sid(void *addr, u16 family, u32 *sid); +int sel_netnode_sid(const void *addr, u16 family, u32 *sid); #endif diff --git a/security/selinux/include/security.h b/security/selinux/include/security.h index e7827ed7be5f..278c144c22d6 100644 --- a/security/selinux/include/security.h +++ b/security/selinux/include/security.h @@ -309,7 +309,7 @@ int security_ib_endport_sid(const char *dev_name, u8 port_num, u32 *out_sid); int security_netif_sid(const char *name, u32 *if_sid); -int security_node_sid(u16 domain, void *addr, u32 addrlen, u32 *out_sid); +int security_node_sid(u16 domain, const void *addr, u32 addrlen, u32 *out_sid); int security_validate_transition(u32 oldsid, u32 newsid, u32 tasksid, u16 tclass); diff --git a/security/selinux/netnode.c b/security/selinux/netnode.c index 5c8c77e50aad..b7900d5ae557 100644 --- a/security/selinux/netnode.c +++ b/security/selinux/netnode.c @@ -187,7 +187,7 @@ static void sel_netnode_insert(struct sel_netnode *node) * failure. * */ -static int sel_netnode_sid_slow(void *addr, u16 family, u32 *sid) +static int sel_netnode_sid_slow(const void *addr, u16 family, u32 *sid) { int ret; struct sel_netnode *node; @@ -207,13 +207,13 @@ static int sel_netnode_sid_slow(void *addr, u16 family, u32 *sid) ret = security_node_sid(PF_INET, addr, sizeof(struct in_addr), sid); if (new) - new->nsec.addr.ipv4 = *(__be32 *)addr; + new->nsec.addr.ipv4 = *(const __be32 *)addr; break; case PF_INET6: ret = security_node_sid(PF_INET6, addr, sizeof(struct in6_addr), sid); if (new) - new->nsec.addr.ipv6 = *(struct in6_addr *)addr; + new->nsec.addr.ipv6 = *(const struct in6_addr *)addr; break; default: BUG(); @@ -247,7 +247,7 @@ static int sel_netnode_sid_slow(void *addr, u16 family, u32 *sid) * on failure. * */ -int sel_netnode_sid(void *addr, u16 family, u32 *sid) +int sel_netnode_sid(const void *addr, u16 family, u32 *sid) { struct sel_netnode *node; diff --git a/security/selinux/ss/services.c b/security/selinux/ss/services.c index e431772c6168..ec9ddfccc7ee 100644 --- a/security/selinux/ss/services.c +++ b/security/selinux/ss/services.c @@ -2643,7 +2643,7 @@ static bool match_ipv6_addrmask(const u32 input[4], const u32 addr[4], const u32 * @out_sid: security identifier */ int security_node_sid(u16 domain, - void *addrp, + const void *addrp, u32 addrlen, u32 *out_sid) { @@ -2672,7 +2672,7 @@ retry: if (addrlen != sizeof(u32)) goto out; - addr = *((u32 *)addrp); + addr = *((const u32 *)addrp); c = policydb->ocontexts[OCON_NODE]; while (c) { -- 2.51.0 From 9cc034be10a52c30719f8b9436d81b981421bfb7 Mon Sep 17 00:00:00 2001 From: =?utf8?q?Christian=20G=C3=B6ttsche?= Date: Tue, 18 Mar 2025 09:33:29 +0100 Subject: [PATCH 11/16] selinux: contify network namespace pointer MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit The network namespace is not modified. Signed-off-by: Christian Göttsche Signed-off-by: Paul Moore --- security/selinux/include/objsec.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/security/selinux/include/objsec.h b/security/selinux/include/objsec.h index c88cae81ee4c..b11c97c9feed 100644 --- a/security/selinux/include/objsec.h +++ b/security/selinux/include/objsec.h @@ -82,7 +82,7 @@ struct ipc_security_struct { }; struct netif_security_struct { - struct net *ns; /* network namespace */ + const struct net *ns; /* network namespace */ int ifindex; /* device index */ u32 sid; /* SID for this interface */ }; -- 2.51.0 From e6fb56b2253d49d192d4fe790698462d5422c041 Mon Sep 17 00:00:00 2001 From: =?utf8?q?Christian=20G=C3=B6ttsche?= Date: Tue, 18 Mar 2025 09:33:30 +0100 Subject: [PATCH 12/16] selinux: add likely hints for fast paths MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit In the network hashtable lookup code add likely() compiler hints in the fast path, like already done in sel_netif_sid(). Signed-off-by: Christian Göttsche Signed-off-by: Paul Moore --- security/selinux/ibpkey.c | 2 +- security/selinux/netnode.c | 2 +- security/selinux/netport.c | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/security/selinux/ibpkey.c b/security/selinux/ibpkey.c index 48f537b41c58..94f3eef22bad 100644 --- a/security/selinux/ibpkey.c +++ b/security/selinux/ibpkey.c @@ -184,7 +184,7 @@ int sel_ib_pkey_sid(u64 subnet_prefix, u16 pkey_num, u32 *sid) rcu_read_lock(); pkey = sel_ib_pkey_find(subnet_prefix, pkey_num); - if (pkey) { + if (likely(pkey)) { *sid = pkey->psec.sid; rcu_read_unlock(); return 0; diff --git a/security/selinux/netnode.c b/security/selinux/netnode.c index b7900d5ae557..8bb456d80dd5 100644 --- a/security/selinux/netnode.c +++ b/security/selinux/netnode.c @@ -253,7 +253,7 @@ int sel_netnode_sid(const void *addr, u16 family, u32 *sid) rcu_read_lock(); node = sel_netnode_find(addr, family); - if (node != NULL) { + if (likely(node != NULL)) { *sid = node->nsec.sid; rcu_read_unlock(); return 0; diff --git a/security/selinux/netport.c b/security/selinux/netport.c index 2e22ad9c2bd0..7d2207384d40 100644 --- a/security/selinux/netport.c +++ b/security/selinux/netport.c @@ -186,7 +186,7 @@ int sel_netport_sid(u8 protocol, u16 pnum, u32 *sid) rcu_read_lock(); port = sel_netport_find(protocol, pnum); - if (port != NULL) { + if (likely(port != NULL)) { *sid = port->psec.sid; rcu_read_unlock(); return 0; -- 2.51.0 From cde3b1b66f2de9eba3ce1beadb91739d803a1018 Mon Sep 17 00:00:00 2001 From: =?utf8?q?Christian=20G=C3=B6ttsche?= Date: Tue, 18 Mar 2025 09:33:32 +0100 Subject: [PATCH 13/16] selinux: unify OOM handling in network hashtables MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit For network objects, like interfaces, nodes, port and InfiniBands, the object to SID lookup is cached in hashtables. OOM during such hashtable additions of new objects is considered non-fatal and the computed SID is simply returned without adding the compute result into the hash table. Actually ignore OOM in the InfiniBand code, despite the comment already suggesting to do so. This reverts commit c350f8bea271 ("selinux: Fix error return code in sel_ib_pkey_sid_slow()"). Add comments in the other places. Use kmalloc() instead of kzalloc(), since all members are initialized on success and the data is only used in internbal hash tables, so no risk of information leakage to userspace. Fixes: c350f8bea271 ("selinux: Fix error return code in sel_ib_pkey_sid_slow()") Signed-off-by: Christian Göttsche Signed-off-by: Paul Moore --- security/selinux/ibpkey.c | 11 +++++------ security/selinux/netif.c | 6 +++++- security/selinux/netnode.c | 5 ++++- security/selinux/netport.c | 6 +++++- 4 files changed, 19 insertions(+), 9 deletions(-) diff --git a/security/selinux/ibpkey.c b/security/selinux/ibpkey.c index 94f3eef22bad..470481cfe0e8 100644 --- a/security/selinux/ibpkey.c +++ b/security/selinux/ibpkey.c @@ -130,7 +130,7 @@ static int sel_ib_pkey_sid_slow(u64 subnet_prefix, u16 pkey_num, u32 *sid) { int ret; struct sel_ib_pkey *pkey; - struct sel_ib_pkey *new = NULL; + struct sel_ib_pkey *new; unsigned long flags; spin_lock_irqsave(&sel_ib_pkey_lock, flags); @@ -146,12 +146,11 @@ static int sel_ib_pkey_sid_slow(u64 subnet_prefix, u16 pkey_num, u32 *sid) if (ret) goto out; - /* If this memory allocation fails still return 0. The SID - * is valid, it just won't be added to the cache. - */ - new = kzalloc(sizeof(*new), GFP_ATOMIC); + new = kmalloc(sizeof(*new), GFP_ATOMIC); if (!new) { - ret = -ENOMEM; + /* If this memory allocation fails still return 0. The SID + * is valid, it just won't be added to the cache. + */ goto out; } diff --git a/security/selinux/netif.c b/security/selinux/netif.c index 43a0d3594b72..78afbecdbe57 100644 --- a/security/selinux/netif.c +++ b/security/selinux/netif.c @@ -156,7 +156,11 @@ static int sel_netif_sid_slow(struct net *ns, int ifindex, u32 *sid) ret = security_netif_sid(dev->name, sid); if (ret != 0) goto out; - new = kzalloc(sizeof(*new), GFP_ATOMIC); + + /* If this memory allocation fails still return 0. The SID + * is valid, it just won't be added to the cache. + */ + new = kmalloc(sizeof(*new), GFP_ATOMIC); if (new) { new->nsec.ns = ns; new->nsec.ifindex = ifindex; diff --git a/security/selinux/netnode.c b/security/selinux/netnode.c index 8bb456d80dd5..5d0ed08d46e5 100644 --- a/security/selinux/netnode.c +++ b/security/selinux/netnode.c @@ -201,7 +201,10 @@ static int sel_netnode_sid_slow(const void *addr, u16 family, u32 *sid) return 0; } - new = kzalloc(sizeof(*new), GFP_ATOMIC); + /* If this memory allocation fails still return 0. The SID + * is valid, it just won't be added to the cache. + */ + new = kmalloc(sizeof(*new), GFP_ATOMIC); switch (family) { case PF_INET: ret = security_node_sid(PF_INET, diff --git a/security/selinux/netport.c b/security/selinux/netport.c index 7d2207384d40..09ef75a18d82 100644 --- a/security/selinux/netport.c +++ b/security/selinux/netport.c @@ -151,7 +151,11 @@ static int sel_netport_sid_slow(u8 protocol, u16 pnum, u32 *sid) ret = security_port_sid(protocol, pnum, sid); if (ret != 0) goto out; - new = kzalloc(sizeof(*new), GFP_ATOMIC); + + /* If this memory allocation fails still return 0. The SID + * is valid, it just won't be added to the cache. + */ + new = kmalloc(sizeof(*new), GFP_ATOMIC); if (new) { new->psec.port = pnum; new->psec.protocol = protocol; -- 2.51.0 From 4926c3fd83d50987685a0c1131bd60b252a3d541 Mon Sep 17 00:00:00 2001 From: =?utf8?q?Christian=20G=C3=B6ttsche?= Date: Tue, 18 Mar 2025 09:50:05 +0100 Subject: [PATCH 14/16] selinux: drop copy-paste comment MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit Port labeling is based on port number and protocol (TCP/UDP/...) but not based on network family (IPv4/IPv6). Signed-off-by: Christian Göttsche Signed-off-by: Paul Moore --- security/selinux/netport.c | 6 ------ 1 file changed, 6 deletions(-) diff --git a/security/selinux/netport.c b/security/selinux/netport.c index 09ef75a18d82..6fd7da4b3576 100644 --- a/security/selinux/netport.c +++ b/security/selinux/netport.c @@ -47,12 +47,6 @@ struct sel_netport { struct rcu_head rcu; }; -/* NOTE: we are using a combined hash table for both IPv4 and IPv6, the reason - * for this is that I suspect most users will not make heavy use of both - * address families at the same time so one table will usually end up wasted, - * if this becomes a problem we can always add a hash table for each address - * family later */ - static DEFINE_SPINLOCK(sel_netport_lock); static struct sel_netport_bkt sel_netport_hash[SEL_NETPORT_HASH_SIZE]; -- 2.51.0 From 8716451a4e57cc82f3656d7a71b67d3b5831ef3f Mon Sep 17 00:00:00 2001 From: Takaya Saeki Date: Tue, 18 Mar 2025 08:31:39 +0000 Subject: [PATCH 15/16] selinux: support wildcard match in genfscon Currently, genfscon only supports string prefix match to label files. Thus, labeling numerous dynamic sysfs entries requires many specific path rules. For example, labeling device paths such as `/sys/devices/pci0000:00/0000:00:03.1/<...>/0000:04:00.1/wakeup` requires listing all specific PCI paths, which is challenging to maintain. While user-space restorecon can handle these paths with regular expression rules, relabeling thousands of paths under sysfs after it is mounted is inefficient compared to using genfscon. This commit adds wildcard matching to genfscon to make rules more efficient and expressive. This new behavior is enabled by genfs_seclabel_wildcard capability. With this capability, genfscon does wildcard matching instead of prefix matching. When multiple wildcard rules match against a path, then the longest rule (determined by the length of the rule string) will be applied. If multiple rules of the same length match, the first matching rule encountered in the given genfscon policy will be applied. Users are encouraged to write longer, more explicit path rules to avoid relying on this behavior. This change resulted in nice real-world performance improvements. For example, boot times on test Android devices were reduced by 15%. This improvement is due to the elimination of the "restorecon -R /sys" step during boot, which takes more than two seconds in the worst case. Signed-off-by: Takaya Saeki Signed-off-by: Paul Moore --- security/selinux/include/policycap.h | 1 + security/selinux/include/policycap_names.h | 1 + security/selinux/ss/services.c | 19 +++++++++++++++---- 3 files changed, 17 insertions(+), 4 deletions(-) diff --git a/security/selinux/include/policycap.h b/security/selinux/include/policycap.h index bd402d3fd3ae..7405154e6c42 100644 --- a/security/selinux/include/policycap.h +++ b/security/selinux/include/policycap.h @@ -16,6 +16,7 @@ enum { POLICYDB_CAP_USERSPACE_INITIAL_CONTEXT, POLICYDB_CAP_NETLINK_XPERM, POLICYDB_CAP_NETIF_WILDCARD, + POLICYDB_CAP_GENFS_SECLABEL_WILDCARD, __POLICYDB_CAP_MAX }; #define POLICYDB_CAP_MAX (__POLICYDB_CAP_MAX - 1) diff --git a/security/selinux/include/policycap_names.h b/security/selinux/include/policycap_names.h index ac1342d6d5bb..d8962fcf2ff9 100644 --- a/security/selinux/include/policycap_names.h +++ b/security/selinux/include/policycap_names.h @@ -19,6 +19,7 @@ const char *const selinux_policycap_names[__POLICYDB_CAP_MAX] = { "userspace_initial_context", "netlink_xperm", "netif_wildcard", + "genfs_seclabel_wildcard", }; /* clang-format on */ diff --git a/security/selinux/ss/services.c b/security/selinux/ss/services.c index ec9ddfccc7ee..5309bb885576 100644 --- a/security/selinux/ss/services.c +++ b/security/selinux/ss/services.c @@ -49,6 +49,7 @@ #include #include #include +#include #include #include "flask.h" @@ -2872,6 +2873,7 @@ static inline int __security_genfs_sid(struct selinux_policy *policy, struct genfs *genfs; struct ocontext *c; int cmp = 0; + bool wildcard; while (path[0] == '/' && path[1] == '/') path++; @@ -2888,11 +2890,20 @@ static inline int __security_genfs_sid(struct selinux_policy *policy, if (!genfs || cmp) return -ENOENT; + wildcard = ebitmap_get_bit(&policy->policydb.policycaps, + POLICYDB_CAP_GENFS_SECLABEL_WILDCARD); for (c = genfs->head; c; c = c->next) { - size_t len = strlen(c->u.name); - if ((!c->v.sclass || sclass == c->v.sclass) && - (strncmp(c->u.name, path, len) == 0)) - break; + if (!c->v.sclass || sclass == c->v.sclass) { + if (wildcard) { + if (match_wildcard(c->u.name, path)) + break; + } else { + size_t len = strlen(c->u.name); + + if ((strncmp(c->u.name, path, len)) == 0) + break; + } + } } if (!c) -- 2.51.0 From 5d7ddc59b3d89b724a5aa8f30d0db94ff8d2d93f Mon Sep 17 00:00:00 2001 From: Paul Moore Date: Thu, 10 Apr 2025 15:20:32 -0400 Subject: [PATCH 16/16] selinux: reduce path walk overhead Reduce the SELinux performance overhead during path walks through the use of a per-task directory access cache and some minor code optimizations. The directory access cache is per-task because it allows for a lockless cache while also fitting well with a common application pattern of heavily accessing a relatively small number of SELinux directory labels. The cache is inherited by child processes when the child runs with the same SELinux domain as the parent, and invalidated on changes to the task's SELinux domain or the loaded SELinux policy. A cache of four entries was chosen based on testing with the Fedora "targeted" policy, a SELinux Reference Policy variant, and 'make allmodconfig' on Linux v6.14. Code optimizations include better use of inline functions to reduce function calls in the common case, especially in the inode revalidation code paths, and elimination of redundant checks between the LSM and SELinux layers. As mentioned briefly above, aside from general use and regression testing with the selinux-testsuite, performance was measured using 'make allmodconfig' with Linux v6.14 as a base reference. As expected, there were variations from one test run to another, but the measurements below are a good representation of the test results seen on my test system. * Linux v6.14 REF 1.26% [k] __d_lookup_rcu SELINUX (1.31%) 0.58% [k] selinux_inode_permission 0.29% [k] avc_lookup 0.25% [k] avc_has_perm_noaudit 0.19% [k] __inode_security_revalidate * Linux v6.14 + patch REF 1.41% [k] __d_lookup_rcu SELINUX (0.89%) 0.65% [k] selinux_inode_permission 0.15% [k] avc_lookup 0.05% [k] avc_has_perm_noaudit 0.04% [k] avc_policy_seqno X.XX% [k] __inode_security_revalidate (now inline) In both cases the __d_lookup_rcu() function was used as a reference point to establish a context for the SELinux related functions. On a unpatched Linux v6.14 system we see the time spent in the combined SELinux functions exceeded that of __d_lookup_rcu(), 1.31% compared to 1.26%. However, with this patch applied the time spent in the combined SELinux functions dropped to roughly 65% of the time spent in __d_lookup_rcu(), 0.89% compared to 1.41%. Aside from the significant decrease in time spent in the SELinux AVC, it appears that any additional time spent searching and updating the cache is offset by other code improvements, e.g. time spent in selinux_inode_permission() + __inode_security_revalidate() + avc_policy_seqno() is less on the patched kernel than the unpatched kernel. It is worth noting that in this patch the use of the per-task cache is limited to the security_inode_permission() LSM callback, selinux_inode_permission(), but future work could expand the cache into inode_has_perm(), likely through consolidation of the two functions. While this would likely have little to no impact on path walks, it may benefit other operations. Reviewed-by: Stephen Smalley Tested-by: Stephen Smalley Signed-off-by: Paul Moore --- security/selinux/hooks.c | 225 +++++++++++++++++++++++------- security/selinux/include/objsec.h | 14 ++ 2 files changed, 185 insertions(+), 54 deletions(-) diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c index e7a7dcab81db..edc9ae81e202 100644 --- a/security/selinux/hooks.c +++ b/security/selinux/hooks.c @@ -213,8 +213,10 @@ static void cred_init_security(void) { struct task_security_struct *tsec; + /* NOTE: the lsm framework zeros out the buffer on allocation */ + tsec = selinux_cred(unrcu_pointer(current->real_cred)); - tsec->osid = tsec->sid = SECINITSID_KERNEL; + tsec->osid = tsec->sid = tsec->avdcache.sid = SECINITSID_KERNEL; } /* @@ -278,27 +280,21 @@ static int __inode_security_revalidate(struct inode *inode, struct dentry *dentry, bool may_sleep) { - struct inode_security_struct *isec = selinux_inode(inode); + if (!selinux_initialized()) + return 0; - might_sleep_if(may_sleep); + if (may_sleep) + might_sleep(); + else + return -ECHILD; /* - * The check of isec->initialized below is racy but - * inode_doinit_with_dentry() will recheck with - * isec->lock held. + * Check to ensure that an inode's SELinux state is valid and try + * reloading the inode security label if necessary. This will fail if + * @dentry is NULL and no dentry for this inode can be found; in that + * case, continue using the old label. */ - if (selinux_initialized() && - data_race(isec->initialized != LABEL_INITIALIZED)) { - if (!may_sleep) - return -ECHILD; - - /* - * Try reloading the inode security label. This will fail if - * @opt_dentry is NULL and no dentry for this inode can be - * found; in that case, continue using the old label. - */ - inode_doinit_with_dentry(inode, dentry); - } + inode_doinit_with_dentry(inode, dentry); return 0; } @@ -307,41 +303,53 @@ static struct inode_security_struct *inode_security_novalidate(struct inode *ino return selinux_inode(inode); } -static struct inode_security_struct *inode_security_rcu(struct inode *inode, bool rcu) +static inline struct inode_security_struct *inode_security_rcu(struct inode *inode, + bool rcu) { - int error; + int rc; + struct inode_security_struct *isec = selinux_inode(inode); - error = __inode_security_revalidate(inode, NULL, !rcu); - if (error) - return ERR_PTR(error); - return selinux_inode(inode); + /* check below is racy, but revalidate will recheck with lock held */ + if (data_race(likely(isec->initialized == LABEL_INITIALIZED))) + return isec; + rc = __inode_security_revalidate(inode, NULL, !rcu); + if (rc) + return ERR_PTR(rc); + return isec; } /* * Get the security label of an inode. */ -static struct inode_security_struct *inode_security(struct inode *inode) +static inline struct inode_security_struct *inode_security(struct inode *inode) { + struct inode_security_struct *isec = selinux_inode(inode); + + /* check below is racy, but revalidate will recheck with lock held */ + if (data_race(likely(isec->initialized == LABEL_INITIALIZED))) + return isec; __inode_security_revalidate(inode, NULL, true); - return selinux_inode(inode); + return isec; } -static struct inode_security_struct *backing_inode_security_novalidate(struct dentry *dentry) +static inline struct inode_security_struct *backing_inode_security_novalidate(struct dentry *dentry) { - struct inode *inode = d_backing_inode(dentry); - - return selinux_inode(inode); + return selinux_inode(d_backing_inode(dentry)); } /* * Get the security label of a dentry's backing inode. */ -static struct inode_security_struct *backing_inode_security(struct dentry *dentry) +static inline struct inode_security_struct *backing_inode_security(struct dentry *dentry) { struct inode *inode = d_backing_inode(dentry); + struct inode_security_struct *isec = selinux_inode(inode); + /* check below is racy, but revalidate will recheck with lock held */ + if (data_race(likely(isec->initialized == LABEL_INITIALIZED))) + return isec; __inode_security_revalidate(inode, dentry, true); - return selinux_inode(inode); + return isec; } static void inode_free_security(struct inode *inode) @@ -1683,12 +1691,15 @@ static inline int dentry_has_perm(const struct cred *cred, struct dentry *dentry, u32 av) { - struct inode *inode = d_backing_inode(dentry); struct common_audit_data ad; + struct inode *inode = d_backing_inode(dentry); + struct inode_security_struct *isec = selinux_inode(inode); ad.type = LSM_AUDIT_DATA_DENTRY; ad.u.dentry = dentry; - __inode_security_revalidate(inode, dentry, true); + /* check below is racy, but revalidate will recheck with lock held */ + if (data_race(unlikely(isec->initialized != LABEL_INITIALIZED))) + __inode_security_revalidate(inode, dentry, true); return inode_has_perm(cred, inode, av, &ad); } @@ -1699,12 +1710,15 @@ static inline int path_has_perm(const struct cred *cred, const struct path *path, u32 av) { - struct inode *inode = d_backing_inode(path->dentry); struct common_audit_data ad; + struct inode *inode = d_backing_inode(path->dentry); + struct inode_security_struct *isec = selinux_inode(inode); ad.type = LSM_AUDIT_DATA_PATH; ad.u.path = *path; - __inode_security_revalidate(inode, path->dentry, true); + /* check below is racy, but revalidate will recheck with lock held */ + if (data_race(unlikely(isec->initialized != LABEL_INITIALIZED))) + __inode_security_revalidate(inode, path->dentry, true); return inode_has_perm(cred, inode, av, &ad); } @@ -3088,44 +3102,147 @@ static noinline int audit_inode_permission(struct inode *inode, audited, denied, result, &ad); } -static int selinux_inode_permission(struct inode *inode, int mask) +/** + * task_avdcache_reset - Reset the task's AVD cache + * @tsec: the task's security state + * + * Clear the task's AVD cache in @tsec and reset it to the current policy's + * and task's info. + */ +static inline void task_avdcache_reset(struct task_security_struct *tsec) +{ + memset(&tsec->avdcache.dir, 0, sizeof(tsec->avdcache.dir)); + tsec->avdcache.sid = tsec->sid; + tsec->avdcache.seqno = avc_policy_seqno(); + tsec->avdcache.dir_spot = TSEC_AVDC_DIR_SIZE - 1; +} + +/** + * task_avdcache_search - Search the task's AVD cache + * @tsec: the task's security state + * @isec: the inode to search for in the cache + * @avdc: matching avd cache entry returned to the caller + * + * Search @tsec for a AVD cache entry that matches @isec and return it to the + * caller via @avdc. Returns 0 if a match is found, negative values otherwise. + */ +static inline int task_avdcache_search(struct task_security_struct *tsec, + struct inode_security_struct *isec, + struct avdc_entry **avdc) +{ + int orig, iter; + + /* focused on path walk optimization, only cache directories */ + if (isec->sclass != SECCLASS_DIR) + return -ENOENT; + + if (unlikely(tsec->sid != tsec->avdcache.sid || + tsec->avdcache.seqno != avc_policy_seqno())) { + task_avdcache_reset(tsec); + return -ENOENT; + } + + orig = iter = tsec->avdcache.dir_spot; + do { + if (tsec->avdcache.dir[iter].isid == isec->sid) { + /* cache hit */ + tsec->avdcache.dir_spot = iter; + *avdc = &tsec->avdcache.dir[iter]; + return 0; + } + iter = (iter - 1) & (TSEC_AVDC_DIR_SIZE - 1); + } while (iter != orig); + + return -ENOENT; +} + +/** + * task_avdcache_update - Update the task's AVD cache + * @tsec: the task's security state + * @isec: the inode associated with the cache entry + * @avdc: the AVD info to cache + * @audited: the permission audit bitmask to cache + * + * Update the AVD cache in @tsec with the @avdc and @audited info associated + * with @isec. + */ +static inline void task_avdcache_update(struct task_security_struct *tsec, + struct inode_security_struct *isec, + struct av_decision *avd, + u32 audited) { + int spot; + + /* focused on path walk optimization, only cache directories */ + if (isec->sclass != SECCLASS_DIR) + return; + + /* update cache */ + spot = (tsec->avdcache.dir_spot + 1) & (TSEC_AVDC_DIR_SIZE - 1); + tsec->avdcache.dir_spot = spot; + tsec->avdcache.dir[spot].isid = isec->sid; + tsec->avdcache.dir[spot].audited = audited; + tsec->avdcache.dir[spot].allowed = avd->allowed; + tsec->avdcache.dir[spot].permissive = avd->flags & AVD_FLAGS_PERMISSIVE; +} + +/** + * selinux_inode_permission - Check if the current task can access an inode + * @inode: the inode that is being accessed + * @requested: the accesses being requested + * + * Check if the current task is allowed to access @inode according to + * @requested. Returns 0 if allowed, negative values otherwise. + */ +static int selinux_inode_permission(struct inode *inode, int requested) +{ + int mask; u32 perms; - bool from_access; - bool no_block = mask & MAY_NOT_BLOCK; + struct task_security_struct *tsec; struct inode_security_struct *isec; - u32 sid = current_sid(); - struct av_decision avd; + struct avdc_entry *avdc; int rc, rc2; u32 audited, denied; - from_access = mask & MAY_ACCESS; - mask &= (MAY_READ|MAY_WRITE|MAY_EXEC|MAY_APPEND); + mask = requested & (MAY_READ|MAY_WRITE|MAY_EXEC|MAY_APPEND); /* No permission to check. Existence test. */ if (!mask) return 0; - if (unlikely(IS_PRIVATE(inode))) - return 0; - - perms = file_mask_to_av(inode->i_mode, mask); - - isec = inode_security_rcu(inode, no_block); + isec = inode_security_rcu(inode, requested & MAY_NOT_BLOCK); if (IS_ERR(isec)) return PTR_ERR(isec); + tsec = selinux_cred(current_cred()); + perms = file_mask_to_av(inode->i_mode, mask); + + rc = task_avdcache_search(tsec, isec, &avdc); + if (likely(!rc)) { + /* Cache hit. */ + audited = perms & avdc->audited; + denied = perms & ~avdc->allowed; + if (unlikely(denied && enforcing_enabled() && + !avdc->permissive)) + rc = -EACCES; + } else { + struct av_decision avd; + + /* Cache miss. */ + rc = avc_has_perm_noaudit(tsec->sid, isec->sid, isec->sclass, + perms, 0, &avd); + audited = avc_audit_required(perms, &avd, rc, + (requested & MAY_ACCESS) ? FILE__AUDIT_ACCESS : 0, + &denied); + task_avdcache_update(tsec, isec, &avd, audited); + } - rc = avc_has_perm_noaudit(sid, isec->sid, isec->sclass, perms, 0, - &avd); - audited = avc_audit_required(perms, &avd, rc, - from_access ? FILE__AUDIT_ACCESS : 0, - &denied); if (likely(!audited)) return rc; rc2 = audit_inode_permission(inode, perms, audited, denied, rc); if (rc2) return rc2; + return rc; } diff --git a/security/selinux/include/objsec.h b/security/selinux/include/objsec.h index b11c97c9feed..6ee7dc4dfd6e 100644 --- a/security/selinux/include/objsec.h +++ b/security/selinux/include/objsec.h @@ -29,6 +29,13 @@ #include "flask.h" #include "avc.h" +struct avdc_entry { + u32 isid; /* inode SID */ + u32 allowed; /* allowed permission bitmask */ + u32 audited; /* audited permission bitmask */ + bool permissive; /* AVC permissive flag */ +}; + struct task_security_struct { u32 osid; /* SID prior to last execve */ u32 sid; /* current SID */ @@ -36,6 +43,13 @@ struct task_security_struct { u32 create_sid; /* fscreate SID */ u32 keycreate_sid; /* keycreate SID */ u32 sockcreate_sid; /* fscreate SID */ +#define TSEC_AVDC_DIR_SIZE (1 << 2) + struct { + u32 sid; /* current SID for cached entries */ + u32 seqno; /* AVC sequence number */ + unsigned int dir_spot; /* dir cache index to check first */ + struct avdc_entry dir[TSEC_AVDC_DIR_SIZE]; /* dir entries */ + } avdcache; } __randomize_layout; enum label_initialized { -- 2.51.0