From d071004e623b7433573019d67cba79e345d83006 Mon Sep 17 00:00:00 2001 From: Zhang Rui Date: Thu, 14 Nov 2024 15:59:44 +0800 Subject: [PATCH 01/16] tools/power turbostat: Consolidate graphics sysfs access Currently, there is an inconsistency in how graphics sysfs knobs are accessed: graphics residency sysfs knobs are opened and closed for each read, while graphics frequency sysfs knobs are opened once and remain open until turbostat exits. This inconsistency is confusing and adds unnecessary code complexity. Consolidate the access method by opening the sysfs files once and reusing the file pointers for subsequent accesses. This approach simplifies the code and ensures a consistent method for accessing graphics sysfs knobs. Signed-off-by: Zhang Rui Signed-off-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index c0596ccf92cd..e5b100b8db24 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -5764,27 +5764,24 @@ int snapshot_proc_interrupts(void) */ int snapshot_graphics(int idx) { - FILE *fp; int retval; + if (gfx_info[idx].fp == NULL) + gfx_info[idx].fp = fopen_or_die(gfx_info[idx].path, "r"); + else + rewind(gfx_info[idx].fp); + switch (idx) { case GFX_rc6: case SAM_mc6: - fp = fopen_or_die(gfx_info[idx].path, "r"); - retval = fscanf(fp, "%lld", &gfx_info[idx].val_ull); + retval = fscanf(gfx_info[idx].fp, "%lld", &gfx_info[idx].val_ull); if (retval != 1) err(1, "rc6"); - fclose(fp); return 0; case GFX_MHz: case GFX_ACTMHz: case SAM_MHz: case SAM_ACTMHz: - if (gfx_info[idx].fp == NULL) - gfx_info[idx].fp = fopen_or_die(gfx_info[idx].path, "r"); - else - rewind(gfx_info[idx].fp); - retval = fscanf(gfx_info[idx].fp, "%d", &gfx_info[idx].val); if (retval != 1) err(1, "MHz"); -- 2.51.0 From c7538f33853b11d0ff2a81efb78bde125d1fc49f Mon Sep 17 00:00:00 2001 From: Zhang Rui Date: Thu, 14 Nov 2024 15:59:45 +0800 Subject: [PATCH 02/16] tools/power turbostat: Cache graphics sysfs file descriptors during probe Snapshots of the graphics sysfs knobs are taken based on file descriptors. To optimize this process, open the files and cache the file descriptors during the graphics probe phase. As a result, the previously cached pathnames become redundant and are removed. This change aims to streamline the code without altering its functionality. No functional change intended. Signed-off-by: Zhang Rui Signed-off-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 82 +++++++++++---------------- 1 file changed, 32 insertions(+), 50 deletions(-) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index e5b100b8db24..28513172ffce 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -376,7 +376,6 @@ enum gfx_sysfs_idx { }; struct gfx_sysfs_info { - const char *path; FILE *fp; unsigned int val; unsigned long long val_ull; @@ -5766,10 +5765,7 @@ int snapshot_graphics(int idx) { int retval; - if (gfx_info[idx].fp == NULL) - gfx_info[idx].fp = fopen_or_die(gfx_info[idx].path, "r"); - else - rewind(gfx_info[idx].fp); + rewind(gfx_info[idx].fp); switch (idx) { case GFX_rc6: @@ -6474,6 +6470,12 @@ static void probe_intel_uncore_frequency(void) probe_intel_uncore_frequency_legacy(); } +static void set_graphics_fp(char *path, int idx) +{ + if (!access(path, R_OK)) + gfx_info[idx].fp = fopen_or_die(path, "r"); +} + static void probe_graphics(void) { /* Xe graphics sysfs knobs */ @@ -6481,7 +6483,6 @@ static void probe_graphics(void) FILE *fp; char buf[8]; bool gt0_is_gt; - int idx; fp = fopen("/sys/class/drm/card0/device/tile0/gt0/gtidle/name", "r"); if (!fp) @@ -6500,28 +6501,17 @@ static void probe_graphics(void) else goto next; - idx = gt0_is_gt ? GFX_rc6 : SAM_mc6; - gfx_info[idx].path = "/sys/class/drm/card0/device/tile0/gt0/gtidle/idle_residency_ms"; + set_graphics_fp("/sys/class/drm/card0/device/tile0/gt0/gtidle/idle_residency_ms", gt0_is_gt ? GFX_rc6 : SAM_mc6); - idx = gt0_is_gt ? GFX_MHz : SAM_MHz; - if (!access("/sys/class/drm/card0/device/tile0/gt0/freq0/cur_freq", R_OK)) - gfx_info[idx].path = "/sys/class/drm/card0/device/tile0/gt0/freq0/cur_freq"; + set_graphics_fp("/sys/class/drm/card0/device/tile0/gt0/freq0/cur_freq", gt0_is_gt ? GFX_MHz : SAM_MHz); - idx = gt0_is_gt ? GFX_ACTMHz : SAM_ACTMHz; - if (!access("/sys/class/drm/card0/device/tile0/gt0/freq0/act_freq", R_OK)) - gfx_info[idx].path = "/sys/class/drm/card0/device/tile0/gt0/freq0/act_freq"; + set_graphics_fp("/sys/class/drm/card0/device/tile0/gt0/freq0/act_freq", gt0_is_gt ? GFX_ACTMHz : SAM_ACTMHz); - idx = gt0_is_gt ? SAM_mc6 : GFX_rc6; - if (!access("/sys/class/drm/card0/device/tile0/gt1/gtidle/idle_residency_ms", R_OK)) - gfx_info[idx].path = "/sys/class/drm/card0/device/tile0/gt1/gtidle/idle_residency_ms"; + set_graphics_fp("/sys/class/drm/card0/device/tile0/gt1/gtidle/idle_residency_ms", gt0_is_gt ? SAM_mc6 : GFX_rc6); - idx = gt0_is_gt ? SAM_MHz : GFX_MHz; - if (!access("/sys/class/drm/card0/device/tile0/gt1/freq0/cur_freq", R_OK)) - gfx_info[idx].path = "/sys/class/drm/card0/device/tile0/gt1/freq0/cur_freq"; + set_graphics_fp("/sys/class/drm/card0/device/tile0/gt1/freq0/cur_freq", gt0_is_gt ? SAM_MHz : GFX_MHz); - idx = gt0_is_gt ? SAM_ACTMHz : GFX_ACTMHz; - if (!access("/sys/class/drm/card0/device/tile0/gt1/freq0/act_freq", R_OK)) - gfx_info[idx].path = "/sys/class/drm/card0/device/tile0/gt1/freq0/act_freq"; + set_graphics_fp("/sys/class/drm/card0/device/tile0/gt1/freq0/act_freq", gt0_is_gt ? SAM_ACTMHz : GFX_ACTMHz); goto end; } @@ -6529,52 +6519,44 @@ static void probe_graphics(void) next: /* New i915 graphics sysfs knobs */ if (!access("/sys/class/drm/card0/gt/gt0/rc6_residency_ms", R_OK)) { - gfx_info[GFX_rc6].path = "/sys/class/drm/card0/gt/gt0/rc6_residency_ms"; + set_graphics_fp("/sys/class/drm/card0/gt/gt0/rc6_residency_ms", GFX_rc6); - if (!access("/sys/class/drm/card0/gt/gt0/rps_cur_freq_mhz", R_OK)) - gfx_info[GFX_MHz].path = "/sys/class/drm/card0/gt/gt0/rps_cur_freq_mhz"; + set_graphics_fp("/sys/class/drm/card0/gt/gt0/rps_cur_freq_mhz", GFX_MHz); - if (!access("/sys/class/drm/card0/gt/gt0/rps_act_freq_mhz", R_OK)) - gfx_info[GFX_ACTMHz].path = "/sys/class/drm/card0/gt/gt0/rps_act_freq_mhz"; + set_graphics_fp("/sys/class/drm/card0/gt/gt0/rps_act_freq_mhz", GFX_ACTMHz); - if (!access("/sys/class/drm/card0/gt/gt1/rc6_residency_ms", R_OK)) - gfx_info[SAM_mc6].path = "/sys/class/drm/card0/gt/gt1/rc6_residency_ms"; + set_graphics_fp("/sys/class/drm/card0/gt/gt1/rc6_residency_ms", SAM_mc6); - if (!access("/sys/class/drm/card0/gt/gt1/rps_cur_freq_mhz", R_OK)) - gfx_info[SAM_MHz].path = "/sys/class/drm/card0/gt/gt1/rps_cur_freq_mhz"; + set_graphics_fp("/sys/class/drm/card0/gt/gt1/rps_cur_freq_mhz", SAM_MHz); - if (!access("/sys/class/drm/card0/gt/gt1/rps_act_freq_mhz", R_OK)) - gfx_info[SAM_ACTMHz].path = "/sys/class/drm/card0/gt/gt1/rps_act_freq_mhz"; + set_graphics_fp("/sys/class/drm/card0/gt/gt1/rps_act_freq_mhz", SAM_ACTMHz); goto end; } /* Fall back to traditional i915 graphics sysfs knobs */ - if (!access("/sys/class/drm/card0/power/rc6_residency_ms", R_OK)) - gfx_info[GFX_rc6].path = "/sys/class/drm/card0/power/rc6_residency_ms"; + set_graphics_fp("/sys/class/drm/card0/power/rc6_residency_ms", GFX_rc6); - if (!access("/sys/class/drm/card0/gt_cur_freq_mhz", R_OK)) - gfx_info[GFX_MHz].path = "/sys/class/drm/card0/gt_cur_freq_mhz"; - else if (!access("/sys/class/graphics/fb0/device/drm/card0/gt_cur_freq_mhz", R_OK)) - gfx_info[GFX_MHz].path = "/sys/class/graphics/fb0/device/drm/card0/gt_cur_freq_mhz"; + set_graphics_fp("/sys/class/drm/card0/gt_cur_freq_mhz", GFX_MHz); + if (!gfx_info[GFX_MHz].fp) + set_graphics_fp("/sys/class/graphics/fb0/device/drm/card0/gt_cur_freq_mhz", GFX_MHz); - if (!access("/sys/class/drm/card0/gt_act_freq_mhz", R_OK)) - gfx_info[GFX_ACTMHz].path = "/sys/class/drm/card0/gt_act_freq_mhz"; - else if (!access("/sys/class/graphics/fb0/device/drm/card0/gt_act_freq_mhz", R_OK)) - gfx_info[GFX_ACTMHz].path = "/sys/class/graphics/fb0/device/drm/card0/gt_act_freq_mhz"; + set_graphics_fp("/sys/class/drm/card0/gt_act_freq_mhz", GFX_ACTMHz); + if (!gfx_info[GFX_ACTMHz].fp) + set_graphics_fp("/sys/class/graphics/fb0/device/drm/card0/gt_act_freq_mhz", GFX_ACTMHz); end: - if (gfx_info[GFX_rc6].path) + if (gfx_info[GFX_rc6].fp) BIC_PRESENT(BIC_GFX_rc6); - if (gfx_info[GFX_MHz].path) + if (gfx_info[GFX_MHz].fp) BIC_PRESENT(BIC_GFXMHz); - if (gfx_info[GFX_ACTMHz].path) + if (gfx_info[GFX_ACTMHz].fp) BIC_PRESENT(BIC_GFXACTMHz); - if (gfx_info[SAM_mc6].path) + if (gfx_info[SAM_mc6].fp) BIC_PRESENT(BIC_SAM_mc6); - if (gfx_info[SAM_MHz].path) + if (gfx_info[SAM_MHz].fp) BIC_PRESENT(BIC_SAMMHz); - if (gfx_info[SAM_ACTMHz].path) + if (gfx_info[SAM_ACTMHz].fp) BIC_PRESENT(BIC_SAMACTMHz); } -- 2.51.0 From 03109e2f0d18dcb84218bd91c4fbf864193ca934 Mon Sep 17 00:00:00 2001 From: Zhang Rui Date: Thu, 14 Nov 2024 15:59:46 +0800 Subject: [PATCH 03/16] tools/power turbostat: Add support for /sys/class/drm/card1 On some machines, the graphics device is enumerated as /sys/class/drm/card1 instead of /sys/class/drm/card0. The current implementation does not handle this scenario, resulting in the loss of graphics C6 residency and frequency information. Add support for /sys/class/drm/card1, ensuring that turbostat can retrieve and display the graphics columns for these platforms. Signed-off-by: Zhang Rui Signed-off-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 38 ++++++++++++++++++++------- 1 file changed, 29 insertions(+), 9 deletions(-) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index 28513172ffce..b250676c174e 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -6476,8 +6476,14 @@ static void set_graphics_fp(char *path, int idx) gfx_info[idx].fp = fopen_or_die(path, "r"); } +/* Enlarge this if there are /sys/class/drm/card2 ... */ +#define GFX_MAX_CARDS 2 + static void probe_graphics(void) { + char path[PATH_MAX]; + int i; + /* Xe graphics sysfs knobs */ if (!access("/sys/class/drm/card0/device/tile0/gt0/gtidle/idle_residency_ms", R_OK)) { FILE *fp; @@ -6518,22 +6524,36 @@ static void probe_graphics(void) next: /* New i915 graphics sysfs knobs */ - if (!access("/sys/class/drm/card0/gt/gt0/rc6_residency_ms", R_OK)) { - set_graphics_fp("/sys/class/drm/card0/gt/gt0/rc6_residency_ms", GFX_rc6); + for (i = 0; i < GFX_MAX_CARDS; i++) { + snprintf(path, PATH_MAX, "/sys/class/drm/card%d/gt/gt0/rc6_residency_ms", i); + if (!access(path, R_OK)) + break; + } - set_graphics_fp("/sys/class/drm/card0/gt/gt0/rps_cur_freq_mhz", GFX_MHz); + if (i == GFX_MAX_CARDS) + goto legacy_i915; - set_graphics_fp("/sys/class/drm/card0/gt/gt0/rps_act_freq_mhz", GFX_ACTMHz); + snprintf(path, PATH_MAX, "/sys/class/drm/card%d/gt/gt0/rc6_residency_ms", i); + set_graphics_fp(path, GFX_rc6); - set_graphics_fp("/sys/class/drm/card0/gt/gt1/rc6_residency_ms", SAM_mc6); + snprintf(path, PATH_MAX, "/sys/class/drm/card%d/gt/gt0/rps_cur_freq_mhz", i); + set_graphics_fp(path, GFX_MHz); - set_graphics_fp("/sys/class/drm/card0/gt/gt1/rps_cur_freq_mhz", SAM_MHz); + snprintf(path, PATH_MAX, "/sys/class/drm/card%d/gt/gt0/rps_act_freq_mhz", i); + set_graphics_fp(path, GFX_ACTMHz); - set_graphics_fp("/sys/class/drm/card0/gt/gt1/rps_act_freq_mhz", SAM_ACTMHz); + snprintf(path, PATH_MAX, "/sys/class/drm/card%d/gt/gt1/rc6_residency_ms", i); + set_graphics_fp(path, SAM_mc6); - goto end; - } + snprintf(path, PATH_MAX, "/sys/class/drm/card%d/gt/gt1/rps_cur_freq_mhz", i); + set_graphics_fp(path, SAM_MHz); + + snprintf(path, PATH_MAX, "/sys/class/drm/card%d/gt/gt1/rps_act_freq_mhz", i); + set_graphics_fp(path, SAM_ACTMHz); + + goto end; +legacy_i915: /* Fall back to traditional i915 graphics sysfs knobs */ set_graphics_fp("/sys/class/drm/card0/power/rc6_residency_ms", GFX_rc6); -- 2.51.0 From bcfab87108b33f20d847fd71a2a93114dd2ce83e Mon Sep 17 00:00:00 2001 From: Patryk Wlazlyn Date: Thu, 24 Oct 2024 15:17:45 +0200 Subject: [PATCH 04/16] tools/power turbostat: Force --no-perf in --dump mode Force the --no-perf early to prevent using it as a source. User asks for raw values, but perf returns them relative to the opening of the file descriptor. Signed-off-by: Patryk Wlazlyn Signed-off-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index b250676c174e..1fed799a5537 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -9897,6 +9897,12 @@ void cmdline(int argc, char **argv) break; case 'D': dump_only++; + /* + * Force the no_perf early to prevent using it as a source. + * User asks for raw values, but perf returns them relative + * to the opening of the file descriptor. + */ + no_perf = 1; break; case 'e': /* --enable specified counter */ -- 2.51.0 From 1da0daf746342dfdc114e4dc8fbf3ece28666d4f Mon Sep 17 00:00:00 2001 From: Patryk Wlazlyn Date: Wed, 13 Nov 2024 15:48:22 +0100 Subject: [PATCH 05/16] tools/power turbostat: Fix child's argument forwarding Add '+' to optstring when early scanning for --no-msr and --no-perf. It causes option processing to stop as soon as a nonoption argument is encountered, effectively skipping child's arguments. Fixes: 3e4048466c39 ("tools/power turbostat: Add --no-msr option") Signed-off-by: Patryk Wlazlyn Signed-off-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index 1fed799a5537..9025c2945737 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -9873,7 +9873,7 @@ void cmdline(int argc, char **argv) * Parse some options early, because they may make other options invalid, * like adding the MSR counter with --add and at the same time using --no-msr. */ - while ((opt = getopt_long_only(argc, argv, "MPn:", long_options, &option_index)) != -1) { + while ((opt = getopt_long_only(argc, argv, "+MPn:", long_options, &option_index)) != -1) { switch (opt) { case 'M': no_msr = 1; -- 2.51.0 From e5f687b89bc2a892256f48e14a568970b59a4812 Mon Sep 17 00:00:00 2001 From: Patryk Wlazlyn Date: Wed, 2 Oct 2024 15:05:15 +0200 Subject: [PATCH 06/16] tools/power turbostat: Add RAPL psys as a built-in counter Introduce the counter as a part of global, platform counters structure. We open the counter for only one cpu, but otherwise treat it as an ordinary RAPL counter, allowing for grouped perf read. The counter is disabled by default, because it's interpretation may require additional, platform specific information, making it unsuitable for general use. Signed-off-by: Patryk Wlazlyn Signed-off-by: Len Brown --- tools/power/x86/turbostat/turbostat.8 | 2 + tools/power/x86/turbostat/turbostat.c | 93 ++++++++++++++++++++++++--- 2 files changed, 85 insertions(+), 10 deletions(-) diff --git a/tools/power/x86/turbostat/turbostat.8 b/tools/power/x86/turbostat/turbostat.8 index 56c7ff6efcda..95eb02346d3a 100644 --- a/tools/power/x86/turbostat/turbostat.8 +++ b/tools/power/x86/turbostat/turbostat.8 @@ -190,6 +190,8 @@ The system configuration dump (if --quiet is not used) is followed by statistics .PP \fBRAMWatt\fP Watts consumed by the DRAM DIMMS -- available only on server processors. .PP +\fBSysWatt\fP Watts consumed by the whole platform (RAPL PSYS). Disabled by default. May require platform specific information to interpret the data, making it not suitable for general use. +.PP \fBPKG_%\fP percent of the interval that RAPL throttling was active on the Package. Note that the system summary is the sum of the package throttling time, and thus may be higher than 100% on a multi-package system. Note that the meaning of this field is model specific. For example, some hardware increments this counter when RAPL responds to thermal limits, but does not increment this counter when RAPL responds to power limits. Comparing PkgWatt and PkgTmp to system limits is necessary. .PP \fBRAM_%\fP percent of the interval that RAPL throttling was active on DRAM. diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index 9025c2945737..88c7f896c5b2 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -200,6 +200,8 @@ struct msr_counter bic[] = { { 0x0, "SAMMHz", NULL, 0, 0, 0, NULL, 0 }, { 0x0, "SAMAMHz", NULL, 0, 0, 0, NULL, 0 }, { 0x0, "Die%c6", NULL, 0, 0, 0, NULL, 0 }, + { 0x0, "SysWatt", NULL, 0, 0, 0, NULL, 0 }, + { 0x0, "Sys_J", NULL, 0, 0, 0, NULL, 0 }, }; #define MAX_BIC (sizeof(bic) / sizeof(struct msr_counter)) @@ -262,6 +264,8 @@ struct msr_counter bic[] = { #define BIC_SAMMHz (1ULL << 56) #define BIC_SAMACTMHz (1ULL << 57) #define BIC_Diec6 (1ULL << 58) +#define BIC_SysWatt (1ULL << 59) +#define BIC_Sys_J (1ULL << 60) #define BIC_TOPOLOGY (BIC_Package | BIC_Node | BIC_CoreCnt | BIC_PkgCnt | BIC_Core | BIC_CPU | BIC_Die ) #define BIC_THERMAL_PWR ( BIC_CoreTmp | BIC_PkgTmp | BIC_PkgWatt | BIC_CorWatt | BIC_GFXWatt | BIC_RAMWatt | BIC_PKG__ | BIC_RAM__) @@ -269,7 +273,7 @@ struct msr_counter bic[] = { #define BIC_IDLE (BIC_sysfs | BIC_CPU_c1 | BIC_CPU_c3 | BIC_CPU_c6 | BIC_CPU_c7 | BIC_GFX_rc6 | BIC_Pkgpc2 | BIC_Pkgpc3 | BIC_Pkgpc6 | BIC_Pkgpc7 | BIC_Pkgpc8 | BIC_Pkgpc9 | BIC_Pkgpc10 | BIC_CPU_LPI | BIC_SYS_LPI | BIC_Mod_c6 | BIC_Totl_c0 | BIC_Any_c0 | BIC_GFX_c0 | BIC_CPUGFX | BIC_SAM_mc6 | BIC_Diec6) #define BIC_OTHER ( BIC_IRQ | BIC_SMI | BIC_ThreadC | BIC_CoreTmp | BIC_IPC) -#define BIC_DISABLED_BY_DEFAULT (BIC_USEC | BIC_TOD | BIC_APIC | BIC_X2APIC) +#define BIC_DISABLED_BY_DEFAULT (BIC_USEC | BIC_TOD | BIC_APIC | BIC_X2APIC | BIC_SysWatt | BIC_Sys_J) unsigned long long bic_enabled = (0xFFFFFFFFFFFFFFFFULL & ~BIC_DISABLED_BY_DEFAULT); unsigned long long bic_present = BIC_USEC | BIC_TOD | BIC_sysfs | BIC_APIC | BIC_X2APIC; @@ -507,12 +511,15 @@ enum rapl_msrs { RAPL_AMD_PWR_UNIT = BIT(14), /* 0xc0010299 MSR_AMD_RAPL_POWER_UNIT */ RAPL_AMD_CORE_ENERGY_STAT = BIT(15), /* 0xc001029a MSR_AMD_CORE_ENERGY_STATUS */ RAPL_AMD_PKG_ENERGY_STAT = BIT(16), /* 0xc001029b MSR_AMD_PKG_ENERGY_STATUS */ + RAPL_PLATFORM_ENERGY_LIMIT = BIT(17), /* 0x64c MSR_PLATFORM_ENERGY_LIMIT */ + RAPL_PLATFORM_ENERGY_STATUS = BIT(18), /* 0x64d MSR_PLATFORM_ENERGY_STATUS */ }; #define RAPL_PKG (RAPL_PKG_ENERGY_STATUS | RAPL_PKG_POWER_LIMIT) #define RAPL_DRAM (RAPL_DRAM_ENERGY_STATUS | RAPL_DRAM_POWER_LIMIT) #define RAPL_CORE (RAPL_CORE_ENERGY_STATUS | RAPL_CORE_POWER_LIMIT) #define RAPL_GFX (RAPL_GFX_POWER_LIMIT | RAPL_GFX_ENERGY_STATUS) +#define RAPL_PSYS (RAPL_PLATFORM_ENERGY_STATUS | RAPL_PLATFORM_ENERGY_LIMIT) #define RAPL_PKG_ALL (RAPL_PKG | RAPL_PKG_PERF_STATUS | RAPL_PKG_POWER_INFO) #define RAPL_DRAM_ALL (RAPL_DRAM | RAPL_DRAM_PERF_STATUS | RAPL_DRAM_POWER_INFO) @@ -713,7 +720,7 @@ static const struct platform_features skl_features = { .has_ext_cst_msrs = 1, .trl_msrs = TRL_BASE, .tcc_offset_bits = 6, - .rapl_msrs = RAPL_PKG_ALL | RAPL_CORE_ALL | RAPL_DRAM | RAPL_DRAM_PERF_STATUS | RAPL_GFX, + .rapl_msrs = RAPL_PKG_ALL | RAPL_CORE_ALL | RAPL_DRAM | RAPL_DRAM_PERF_STATUS | RAPL_GFX | RAPL_PSYS, .enable_tsc_tweak = 1, }; @@ -730,7 +737,7 @@ static const struct platform_features cnl_features = { .has_ext_cst_msrs = 1, .trl_msrs = TRL_BASE, .tcc_offset_bits = 6, - .rapl_msrs = RAPL_PKG_ALL | RAPL_CORE_ALL | RAPL_DRAM | RAPL_DRAM_PERF_STATUS | RAPL_GFX, + .rapl_msrs = RAPL_PKG_ALL | RAPL_CORE_ALL | RAPL_DRAM | RAPL_DRAM_PERF_STATUS | RAPL_GFX | RAPL_PSYS, .enable_tsc_tweak = 1, }; @@ -797,7 +804,7 @@ static const struct platform_features icx_features = { .has_irtl_msrs = 1, .has_cst_prewake_bit = 1, .trl_msrs = TRL_BASE | TRL_CORECOUNT, - .rapl_msrs = RAPL_PKG_ALL | RAPL_DRAM_ALL, + .rapl_msrs = RAPL_PKG_ALL | RAPL_DRAM_ALL | RAPL_PSYS, .has_fixed_rapl_unit = 1, }; @@ -813,7 +820,7 @@ static const struct platform_features spr_features = { .has_irtl_msrs = 1, .has_cst_prewake_bit = 1, .trl_msrs = TRL_BASE | TRL_CORECOUNT, - .rapl_msrs = RAPL_PKG_ALL | RAPL_DRAM_ALL, + .rapl_msrs = RAPL_PKG_ALL | RAPL_DRAM_ALL | RAPL_PSYS, }; static const struct platform_features srf_features = { @@ -829,7 +836,7 @@ static const struct platform_features srf_features = { .has_irtl_msrs = 1, .has_cst_prewake_bit = 1, .trl_msrs = TRL_BASE | TRL_CORECOUNT, - .rapl_msrs = RAPL_PKG_ALL | RAPL_DRAM_ALL, + .rapl_msrs = RAPL_PKG_ALL | RAPL_DRAM_ALL | RAPL_PSYS, }; static const struct platform_features grr_features = { @@ -845,7 +852,7 @@ static const struct platform_features grr_features = { .has_irtl_msrs = 1, .has_cst_prewake_bit = 1, .trl_msrs = TRL_BASE | TRL_CORECOUNT, - .rapl_msrs = RAPL_PKG_ALL | RAPL_DRAM_ALL, + .rapl_msrs = RAPL_PKG_ALL | RAPL_DRAM_ALL | RAPL_PSYS, }; static const struct platform_features slv_features = { @@ -1108,6 +1115,7 @@ enum rapl_rci_index { RAPL_RCI_INDEX_PKG_PERF_STATUS = 4, RAPL_RCI_INDEX_DRAM_PERF_STATUS = 5, RAPL_RCI_INDEX_CORE_ENERGY = 6, + RAPL_RCI_INDEX_ENERGY_PLATFORM = 7, NUM_RAPL_COUNTERS, }; @@ -1134,6 +1142,7 @@ struct rapl_counter_info_t { struct rapl_counter_info_t *rapl_counter_info_perdomain; unsigned int rapl_counter_info_perdomain_size; +#define RAPL_COUNTER_FLAG_PLATFORM_COUNTER (1u << 0) #define RAPL_COUNTER_FLAG_USE_MSR_SUM (1u << 1) struct rapl_counter_arch_info { @@ -1255,6 +1264,19 @@ static const struct rapl_counter_arch_info rapl_counter_arch_infos[] = { .compat_scale = 1.0, .flags = 0, }, + { + .feature_mask = RAPL_PSYS, + .perf_subsys = "power", + .perf_name = "energy-psys", + .msr = MSR_PLATFORM_ENERGY_STATUS, + .msr_mask = 0x00000000FFFFFFFF, + .msr_shift = 0, + .platform_rapl_msr_scale = &rapl_energy_units, + .rci_index = RAPL_RCI_INDEX_ENERGY_PLATFORM, + .bic = BIC_SysWatt | BIC_Sys_J, + .compat_scale = 1.0, + .flags = RAPL_COUNTER_FLAG_PLATFORM_COUNTER | RAPL_COUNTER_FLAG_USE_MSR_SUM, + }, }; struct rapl_counter { @@ -1682,6 +1704,7 @@ enum { IDX_PP1_ENERGY, IDX_PKG_PERF, IDX_DRAM_PERF, + IDX_PSYS_ENERGY, IDX_COUNT, }; @@ -1726,6 +1749,9 @@ off_t idx_to_offset(int idx) case IDX_DRAM_PERF: offset = MSR_DRAM_PERF_STATUS; break; + case IDX_PSYS_ENERGY: + offset = MSR_PLATFORM_ENERGY_STATUS; + break; default: offset = -1; } @@ -1756,6 +1782,9 @@ int offset_to_idx(off_t offset) case MSR_DRAM_PERF_STATUS: idx = IDX_DRAM_PERF; break; + case MSR_PLATFORM_ENERGY_STATUS: + idx = IDX_PSYS_ENERGY; + break; default: idx = -1; } @@ -1777,6 +1806,8 @@ int idx_valid(int idx) return platform->rapl_msrs & RAPL_PKG_PERF_STATUS; case IDX_DRAM_PERF: return platform->rapl_msrs & RAPL_DRAM_PERF_STATUS; + case IDX_PSYS_ENERGY: + return platform->rapl_msrs & RAPL_PSYS; default: return 0; } @@ -1848,6 +1879,10 @@ struct system_summary { struct pkg_data packages; } average; +struct platform_counters { + struct rapl_counter energy_psys; /* MSR_PLATFORM_ENERGY_STATUS */ +} platform_counters_odd, platform_counters_even; + struct cpu_topology { int physical_package_id; int die_id; @@ -2512,6 +2547,11 @@ void print_header(char *delim) ppmt = ppmt->next; } + if (DO_BIC(BIC_SysWatt)) + outp += sprintf(outp, "%sSysWatt", (printed++ ? delim : "")); + if (DO_BIC(BIC_Sys_J)) + outp += sprintf(outp, "%sSys_J", (printed++ ? delim : "")); + outp += sprintf(outp, "\n"); } @@ -2519,6 +2559,7 @@ int dump_counters(struct thread_data *t, struct core_data *c, struct pkg_data *p { int i; struct msr_counter *mp; + struct platform_counters *pplat_cnt = p == package_odd ? &platform_counters_odd : &platform_counters_even; outp += sprintf(outp, "t %p, c %p, p %p\n", t, c, p); @@ -2590,6 +2631,7 @@ int dump_counters(struct thread_data *t, struct core_data *c, struct pkg_data *p outp += sprintf(outp, "Joules COR: %0llX\n", p->energy_cores.raw_value); outp += sprintf(outp, "Joules GFX: %0llX\n", p->energy_gfx.raw_value); outp += sprintf(outp, "Joules RAM: %0llX\n", p->energy_dram.raw_value); + outp += sprintf(outp, "Joules PSYS: %0llX\n", pplat_cnt->energy_psys.raw_value); outp += sprintf(outp, "Throttle PKG: %0llX\n", p->rapl_pkg_perf_status.raw_value); outp += sprintf(outp, "Throttle RAM: %0llX\n", p->rapl_dram_perf_status.raw_value); outp += sprintf(outp, "PTM: %dC\n", p->pkg_temp_c); @@ -2628,6 +2670,9 @@ double rapl_counter_get_value(const struct rapl_counter *c, enum rapl_unit desir */ int format_counters(struct thread_data *t, struct core_data *c, struct pkg_data *p) { + static int count; + + struct platform_counters *pplat_cnt = NULL; double interval_float, tsc; char *fmt8; int i; @@ -2637,6 +2682,11 @@ int format_counters(struct thread_data *t, struct core_data *c, struct pkg_data char *delim = "\t"; int printed = 0; + if (t == &average.threads) { + pplat_cnt = count & 1 ? &platform_counters_odd : &platform_counters_even; + ++count; + } + /* if showing only 1st thread in core and this isn't one, bail out */ if (show_core_only && !is_cpu_first_thread_in_core(t, c, p)) return 0; @@ -3093,6 +3143,13 @@ int format_counters(struct thread_data *t, struct core_data *c, struct pkg_data } } + if (DO_BIC(BIC_SysWatt) && (t == &average.threads)) + outp += sprintf(outp, fmt8, (printed++ ? delim : ""), + rapl_counter_get_value(&pplat_cnt->energy_psys, RAPL_UNIT_WATTS, interval_float)); + if (DO_BIC(BIC_Sys_J) && (t == &average.threads)) + outp += sprintf(outp, fmt8, (printed++ ? delim : ""), + rapl_counter_get_value(&pplat_cnt->energy_psys, RAPL_UNIT_JOULES, interval_float)); + done: if (*(outp - 1) != '\n') outp += sprintf(outp, "\n"); @@ -3400,6 +3457,11 @@ int delta_cpu(struct thread_data *t, struct core_data *c, return retval; } +void delta_platform(struct platform_counters *new, struct platform_counters *old) +{ + old->energy_psys.raw_value = new->energy_psys.raw_value - old->energy_psys.raw_value; +} + void rapl_counter_clear(struct rapl_counter *c) { c->raw_value = 0; @@ -4129,6 +4191,9 @@ static size_t cstate_counter_info_count_perf(const struct cstate_counter_info_t void write_rapl_counter(struct rapl_counter *rc, struct rapl_counter_info_t *rci, unsigned int idx) { + if (rci->source[idx] == COUNTER_SOURCE_NONE) + return; + rc->raw_value = rci->data[idx]; rc->unit = rci->unit[idx]; rc->scale = rci->scale[idx]; @@ -4136,6 +4201,7 @@ void write_rapl_counter(struct rapl_counter *rc, struct rapl_counter_info_t *rci int get_rapl_counters(int cpu, unsigned int domain, struct core_data *c, struct pkg_data *p) { + struct platform_counters *pplat_cnt = p == package_odd ? &platform_counters_odd : &platform_counters_even; unsigned long long perf_data[NUM_RAPL_COUNTERS + 1]; struct rapl_counter_info_t *rci; @@ -4163,6 +4229,7 @@ int get_rapl_counters(int cpu, unsigned int domain, struct core_data *c, struct for (unsigned int i = 0, pi = 1; i < NUM_RAPL_COUNTERS; ++i) { switch (rci->source[i]) { case COUNTER_SOURCE_NONE: + rci->data[i] = 0; break; case COUNTER_SOURCE_PERF: @@ -4201,7 +4268,7 @@ int get_rapl_counters(int cpu, unsigned int domain, struct core_data *c, struct } } - BUILD_BUG_ON(NUM_RAPL_COUNTERS != 7); + BUILD_BUG_ON(NUM_RAPL_COUNTERS != 8); write_rapl_counter(&p->energy_pkg, rci, RAPL_RCI_INDEX_ENERGY_PKG); write_rapl_counter(&p->energy_cores, rci, RAPL_RCI_INDEX_ENERGY_CORES); write_rapl_counter(&p->energy_dram, rci, RAPL_RCI_INDEX_DRAM); @@ -4209,6 +4276,7 @@ int get_rapl_counters(int cpu, unsigned int domain, struct core_data *c, struct write_rapl_counter(&p->rapl_pkg_perf_status, rci, RAPL_RCI_INDEX_PKG_PERF_STATUS); write_rapl_counter(&p->rapl_dram_perf_status, rci, RAPL_RCI_INDEX_DRAM_PERF_STATUS); write_rapl_counter(&c->core_energy, rci, RAPL_RCI_INDEX_CORE_ENERGY); + write_rapl_counter(&pplat_cnt->energy_psys, rci, RAPL_RCI_INDEX_ENERGY_PLATFORM); return 0; } @@ -6144,6 +6212,7 @@ restart: re_initialize(); goto restart; } + delta_platform(&platform_counters_odd, &platform_counters_even); compute_average(EVEN_COUNTERS); format_all_counters(EVEN_COUNTERS); flush_output_stdout(); @@ -6167,6 +6236,7 @@ restart: re_initialize(); goto restart; } + delta_platform(&platform_counters_even, &platform_counters_odd); compute_average(ODD_COUNTERS); format_all_counters(ODD_COUNTERS); flush_output_stdout(); @@ -6945,8 +7015,8 @@ void rapl_probe_intel(void) unsigned long long msr; unsigned int time_unit; double tdp; - const unsigned long long bic_watt_bits = BIC_PkgWatt | BIC_CorWatt | BIC_RAMWatt | BIC_GFXWatt; - const unsigned long long bic_joules_bits = BIC_Pkg_J | BIC_Cor_J | BIC_RAM_J | BIC_GFX_J; + const unsigned long long bic_watt_bits = BIC_SysWatt | BIC_PkgWatt | BIC_CorWatt | BIC_RAMWatt | BIC_GFXWatt; + const unsigned long long bic_joules_bits = BIC_Sys_J | BIC_Pkg_J | BIC_Cor_J | BIC_RAM_J | BIC_GFX_J; if (rapl_joules) bic_enabled &= ~bic_watt_bits; @@ -7606,6 +7676,9 @@ void rapl_perf_init(void) domain_visited[next_domain] = 1; + if ((cai->flags & RAPL_COUNTER_FLAG_PLATFORM_COUNTER) && (cpu != base_cpu)) + continue; + struct rapl_counter_info_t *rci = &rapl_counter_info_perdomain[next_domain]; /* Check if the counter is enabled and accessible */ -- 2.51.0 From 86d237734091201d2ab2c1d2e1063893621c770f Mon Sep 17 00:00:00 2001 From: Len Brown Date: Sat, 30 Nov 2024 16:22:00 -0500 Subject: [PATCH 07/16] tools/power turbostat: 2024.11.30 since 2024.07.26: assorted minor bug fixes assorted platform specific tweaks initial RAPL PSYS (SysWatt) support Signed-off-by: Len Brown --- tools/power/x86/turbostat/turbostat.8 | 2 +- tools/power/x86/turbostat/turbostat.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/power/x86/turbostat/turbostat.8 b/tools/power/x86/turbostat/turbostat.8 index 95eb02346d3a..a7f7ed01421c 100644 --- a/tools/power/x86/turbostat/turbostat.8 +++ b/tools/power/x86/turbostat/turbostat.8 @@ -190,7 +190,7 @@ The system configuration dump (if --quiet is not used) is followed by statistics .PP \fBRAMWatt\fP Watts consumed by the DRAM DIMMS -- available only on server processors. .PP -\fBSysWatt\fP Watts consumed by the whole platform (RAPL PSYS). Disabled by default. May require platform specific information to interpret the data, making it not suitable for general use. +\fBSysWatt\fP Watts consumed by the whole platform (RAPL PSYS). Disabled by default. Enable with --enable SysWatt. .PP \fBPKG_%\fP percent of the interval that RAPL throttling was active on the Package. Note that the system summary is the sum of the package throttling time, and thus may be higher than 100% on a multi-package system. Note that the meaning of this field is model specific. For example, some hardware increments this counter when RAPL responds to thermal limits, but does not increment this counter when RAPL responds to power limits. Comparing PkgWatt and PkgTmp to system limits is necessary. .PP diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index 88c7f896c5b2..58a487c225a7 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -9236,7 +9236,7 @@ int get_and_dump_counters(void) void print_version() { - fprintf(outf, "turbostat version 2024.07.26 - Len Brown \n"); + fprintf(outf, "turbostat version 2024.11.30 - Len Brown \n"); } #define COMMAND_LINE_SIZE 2048 -- 2.51.0 From f69e63756f7822fcdad8a34f9967e8b243e883ee Mon Sep 17 00:00:00 2001 From: "Dr. David Alan Gilbert" Date: Wed, 2 Oct 2024 18:31:47 +0100 Subject: [PATCH 08/16] printf: Remove unused 'bprintf' bprintf() is unused. Remove it. It was added in the commit 4370aa4aa753 ("vsprintf: add binary printf") but as far as I can see was never used, unlike the other two functions in that patch. Link: https://lore.kernel.org/20241002173147.210107-1-linux@treblig.org Reviewed-by: Andy Shevchenko Acked-by: Petr Mladek Signed-off-by: Dr. David Alan Gilbert Signed-off-by: Steven Rostedt (Google) --- include/linux/string.h | 1 - lib/vsprintf.c | 23 ----------------------- 2 files changed, 24 deletions(-) diff --git a/include/linux/string.h b/include/linux/string.h index 0dd27afcfaf7..493ac4862c77 100644 --- a/include/linux/string.h +++ b/include/linux/string.h @@ -335,7 +335,6 @@ int __sysfs_match_string(const char * const *array, size_t n, const char *s); #ifdef CONFIG_BINARY_PRINTF int vbin_printf(u32 *bin_buf, size_t size, const char *fmt, va_list args); int bstr_printf(char *buf, size_t size, const char *fmt, const u32 *bin_buf); -int bprintf(u32 *bin_buf, size_t size, const char *fmt, ...) __printf(3, 4); #endif extern ssize_t memory_read_from_buffer(void *to, size_t count, loff_t *ppos, diff --git a/lib/vsprintf.c b/lib/vsprintf.c index 6ac02bbb7df1..9d3dac38a3f4 100644 --- a/lib/vsprintf.c +++ b/lib/vsprintf.c @@ -3428,29 +3428,6 @@ out: } EXPORT_SYMBOL_GPL(bstr_printf); -/** - * bprintf - Parse a format string and place args' binary value in a buffer - * @bin_buf: The buffer to place args' binary value - * @size: The size of the buffer(by words(32bits), not characters) - * @fmt: The format string to use - * @...: Arguments for the format string - * - * The function returns the number of words(u32) written - * into @bin_buf. - */ -int bprintf(u32 *bin_buf, size_t size, const char *fmt, ...) -{ - va_list args; - int ret; - - va_start(args, fmt); - ret = vbin_printf(bin_buf, size, fmt, args); - va_end(args); - - return ret; -} -EXPORT_SYMBOL_GPL(bprintf); - #endif /* CONFIG_BINARY_PRINTF */ /** -- 2.51.0 From 9022ed0e7e65734d83a0648648589b9fbea8e8c9 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sun, 1 Dec 2024 09:23:33 -0800 Subject: [PATCH 09/16] strscpy: write destination buffer only once The point behind strscpy() was to once and for all avoid all the problems with 'strncpy()' and later broken "fixed" versions like strlcpy() that just made things worse. So strscpy not only guarantees NUL-termination (unlike strncpy), it also doesn't do unnecessary padding at the destination. But at the same time also avoids byte-at-a-time reads and writes by _allowing_ some extra NUL writes - within the size, of course - so that the whole copy can be done with word operations. It is also stable in the face of a mutable source string: it explicitly does not read the source buffer multiple times (so an implementation using "strnlen()+memcpy()" would be wrong), and does not read the source buffer past the size (like the mis-design that is strlcpy does). Finally, the return value is designed to be simple and unambiguous: if the string cannot be copied fully, it returns an actual negative error, making error handling clearer and simpler (and the caller already knows the size of the buffer). Otherwise it returns the string length of the result. However, there was one final stability issue that can be important to callers: the stability of the destination buffer. In particular, the same way we shouldn't read the source buffer more than once, we should avoid doing multiple writes to the destination buffer: first writing a potentially non-terminated string, and then terminating it with NUL at the end does not result in a stable result buffer. Yes, it gives the right result in the end, but if the rule for the destination buffer was that it is _always_ NUL-terminated even when accessed concurrently with updates, the final byte of the buffer needs to always _stay_ as a NUL byte. [ Note that "final byte is NUL" here is literally about the final byte in the destination array, not the terminating NUL at the end of the string itself. There is no attempt to try to make concurrent reads and writes give any kind of consistent string length or contents, but we do want to guarantee that there is always at least that final terminating NUL character at the end of the destination array if it existed before ] This is relevant in the kernel for the tsk->comm[] array, for example. Even without locking (for either readers or writers), we want to know that while the buffer contents may be garbled, it is always a valid C string and always has a NUL character at 'comm[TASK_COMM_LEN-1]' (and never has any "out of thin air" data). So avoid any "copy possibly non-terminated string, and terminate later" behavior, and write the destination buffer only once. Signed-off-by: Linus Torvalds --- lib/string.c | 23 +++++++++++++++++------ 1 file changed, 17 insertions(+), 6 deletions(-) diff --git a/lib/string.c b/lib/string.c index 76327b51e36f..eb4486ed40d2 100644 --- a/lib/string.c +++ b/lib/string.c @@ -104,6 +104,12 @@ char *strncpy(char *dest, const char *src, size_t count) EXPORT_SYMBOL(strncpy); #endif +#ifdef __BIG_ENDIAN +# define ALLBUTLAST_BYTE_MASK (~255ul) +#else +# define ALLBUTLAST_BYTE_MASK (~0ul >> 8) +#endif + ssize_t sized_strscpy(char *dest, const char *src, size_t count) { const struct word_at_a_time constants = WORD_AT_A_TIME_CONSTANTS; @@ -147,13 +153,18 @@ ssize_t sized_strscpy(char *dest, const char *src, size_t count) *(unsigned long *)(dest+res) = c & zero_bytemask(data); return res + find_zero(data); } + count -= sizeof(unsigned long); + if (unlikely(!count)) { + c &= ALLBUTLAST_BYTE_MASK; + *(unsigned long *)(dest+res) = c; + return -E2BIG; + } *(unsigned long *)(dest+res) = c; res += sizeof(unsigned long); - count -= sizeof(unsigned long); max -= sizeof(unsigned long); } - while (count) { + while (count > 1) { char c; c = src[res]; @@ -164,11 +175,11 @@ ssize_t sized_strscpy(char *dest, const char *src, size_t count) count--; } - /* Hit buffer length without finding a NUL; force NUL-termination. */ - if (res) - dest[res-1] = '\0'; + /* Force NUL-termination. */ + dest[res] = '\0'; - return -E2BIG; + /* Return E2BIG if the source didn't stop */ + return src[res] ? -E2BIG : res; } EXPORT_SYMBOL(sized_strscpy); -- 2.51.0 From 40384c840ea1944d7c5a392e8975ed088ecf0b37 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sun, 1 Dec 2024 14:28:56 -0800 Subject: [PATCH 10/16] Linux 6.13-rc1 --- Makefile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Makefile b/Makefile index e34a97473fb6..93ab62cef244 100644 --- a/Makefile +++ b/Makefile @@ -1,8 +1,8 @@ # SPDX-License-Identifier: GPL-2.0 VERSION = 6 -PATCHLEVEL = 12 +PATCHLEVEL = 13 SUBLEVEL = 0 -EXTRAVERSION = +EXTRAVERSION = -rc1 NAME = Baby Opossum Posse # *DOCUMENTATION* -- 2.51.0 From f2f7358c3890e7366cbcb7512b4bc8b4394b2d61 Mon Sep 17 00:00:00 2001 From: Remi Pommarel Date: Fri, 22 Nov 2024 16:52:48 +0100 Subject: [PATCH 11/16] batman-adv: Do not send uninitialized TT changes The number of TT changes can be less than initially expected in batadv_tt_tvlv_container_update() (changes can be removed by batadv_tt_local_event() in ADD+DEL sequence between reading tt_diff_entries_num and actually iterating the change list under lock). Thus tt_diff_len could be bigger than the actual changes size that need to be sent. Because batadv_send_my_tt_response sends the whole packet, uninitialized data can be interpreted as TT changes on other nodes leading to weird TT global entries on those nodes such as: * 00:00:00:00:00:00 -1 [....] ( 0) 88:12:4e:ad:7e:ba (179) (0x45845380) * 00:00:00:00:78:79 4092 [.W..] ( 0) 88:12:4e:ad:7e:3c (145) (0x8ebadb8b) All of the above also applies to OGM tvlv container buffer's tvlv_len. Remove the extra allocated space to avoid sending uninitialized TT changes in batadv_send_my_tt_response() and batadv_v_ogm_send_softif(). Fixes: e1bf0c14096f ("batman-adv: tvlv - convert tt data sent within OGMs") Signed-off-by: Remi Pommarel Signed-off-by: Sven Eckelmann Signed-off-by: Simon Wunderlich --- net/batman-adv/translation-table.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/net/batman-adv/translation-table.c b/net/batman-adv/translation-table.c index b44c382226a1..996d1f01171a 100644 --- a/net/batman-adv/translation-table.c +++ b/net/batman-adv/translation-table.c @@ -948,6 +948,7 @@ static void batadv_tt_tvlv_container_update(struct batadv_priv *bat_priv) int tt_diff_len, tt_change_len = 0; int tt_diff_entries_num = 0; int tt_diff_entries_count = 0; + size_t tt_extra_len = 0; u16 tvlv_len; tt_diff_entries_num = atomic_read(&bat_priv->tt.local_changes); @@ -985,6 +986,9 @@ static void batadv_tt_tvlv_container_update(struct batadv_priv *bat_priv) } spin_unlock_bh(&bat_priv->tt.changes_list_lock); + tt_extra_len = batadv_tt_len(tt_diff_entries_num - + tt_diff_entries_count); + /* Keep the buffer for possible tt_request */ spin_lock_bh(&bat_priv->tt.last_changeset_lock); kfree(bat_priv->tt.last_changeset); @@ -993,6 +997,7 @@ static void batadv_tt_tvlv_container_update(struct batadv_priv *bat_priv) tt_change_len = batadv_tt_len(tt_diff_entries_count); /* check whether this new OGM has no changes due to size problems */ if (tt_diff_entries_count > 0) { + tt_diff_len -= tt_extra_len; /* if kmalloc() fails we will reply with the full table * instead of providing the diff */ @@ -1005,6 +1010,8 @@ static void batadv_tt_tvlv_container_update(struct batadv_priv *bat_priv) } spin_unlock_bh(&bat_priv->tt.last_changeset_lock); + /* Remove extra packet space for OGM */ + tvlv_len -= tt_extra_len; container_register: batadv_tvlv_container_register(bat_priv, BATADV_TVLV_TT, 1, tt_data, tvlv_len); -- 2.51.0 From 8038806db64da15721775d6b834990cacbfcf0b2 Mon Sep 17 00:00:00 2001 From: Remi Pommarel Date: Fri, 22 Nov 2024 16:52:49 +0100 Subject: [PATCH 12/16] batman-adv: Remove uninitialized data in full table TT response The number of entries filled by batadv_tt_tvlv_generate() can be less than initially expected in batadv_tt_prepare_tvlv_{global,local}_data() (changes can be removed by batadv_tt_local_event() in ADD+DEL sequence in the meantime as the lock held during the whole tvlv global/local data generation). Thus tvlv_len could be bigger than the actual TT entry size that need to be sent so full table TT_RESPONSE could hold invalid TT entries such as below. * 00:00:00:00:00:00 -1 [....] ( 0) 88:12:4e:ad:7e:ba (179) (0x45845380) * 00:00:00:00:78:79 4092 [.W..] ( 0) 88:12:4e:ad:7e:3c (145) (0x8ebadb8b) Remove the extra allocated space to avoid sending uninitialized entries for full table TT_RESPONSE in both batadv_send_other_tt_response() and batadv_send_my_tt_response(). Fixes: 7ea7b4a14275 ("batman-adv: make the TT CRC logic VLAN specific") Signed-off-by: Remi Pommarel Signed-off-by: Sven Eckelmann Signed-off-by: Simon Wunderlich --- net/batman-adv/translation-table.c | 37 ++++++++++++++++++------------ 1 file changed, 22 insertions(+), 15 deletions(-) diff --git a/net/batman-adv/translation-table.c b/net/batman-adv/translation-table.c index 996d1f01171a..f5201738628c 100644 --- a/net/batman-adv/translation-table.c +++ b/net/batman-adv/translation-table.c @@ -2712,14 +2712,16 @@ static bool batadv_tt_global_valid(const void *entry_ptr, * * Fills the tvlv buff with the tt entries from the specified hash. If valid_cb * is not provided then this becomes a no-op. + * + * Return: Remaining unused length in tvlv_buff. */ -static void batadv_tt_tvlv_generate(struct batadv_priv *bat_priv, - struct batadv_hashtable *hash, - void *tvlv_buff, u16 tt_len, - bool (*valid_cb)(const void *, - const void *, - u8 *flags), - void *cb_data) +static u16 batadv_tt_tvlv_generate(struct batadv_priv *bat_priv, + struct batadv_hashtable *hash, + void *tvlv_buff, u16 tt_len, + bool (*valid_cb)(const void *, + const void *, + u8 *flags), + void *cb_data) { struct batadv_tt_common_entry *tt_common_entry; struct batadv_tvlv_tt_change *tt_change; @@ -2733,7 +2735,7 @@ static void batadv_tt_tvlv_generate(struct batadv_priv *bat_priv, tt_change = tvlv_buff; if (!valid_cb) - return; + return tt_len; rcu_read_lock(); for (i = 0; i < hash->size; i++) { @@ -2759,6 +2761,8 @@ static void batadv_tt_tvlv_generate(struct batadv_priv *bat_priv, } } rcu_read_unlock(); + + return batadv_tt_len(tt_tot - tt_num_entries); } /** @@ -3029,10 +3033,11 @@ static bool batadv_send_other_tt_response(struct batadv_priv *bat_priv, goto out; /* fill the rest of the tvlv with the real TT entries */ - batadv_tt_tvlv_generate(bat_priv, bat_priv->tt.global_hash, - tt_change, tt_len, - batadv_tt_global_valid, - req_dst_orig_node); + tvlv_len -= batadv_tt_tvlv_generate(bat_priv, + bat_priv->tt.global_hash, + tt_change, tt_len, + batadv_tt_global_valid, + req_dst_orig_node); } /* Don't send the response, if larger than fragmented packet. */ @@ -3156,9 +3161,11 @@ static bool batadv_send_my_tt_response(struct batadv_priv *bat_priv, goto out; /* fill the rest of the tvlv with the real TT entries */ - batadv_tt_tvlv_generate(bat_priv, bat_priv->tt.local_hash, - tt_change, tt_len, - batadv_tt_local_valid, NULL); + tvlv_len -= batadv_tt_tvlv_generate(bat_priv, + bat_priv->tt.local_hash, + tt_change, tt_len, + batadv_tt_local_valid, + NULL); } tvlv_tt_data->flags = BATADV_TT_RESPONSE; -- 2.51.0 From fff8f17c1a6fc802ca23bbd3a276abfde8cc58e6 Mon Sep 17 00:00:00 2001 From: Remi Pommarel Date: Fri, 22 Nov 2024 16:52:50 +0100 Subject: [PATCH 13/16] batman-adv: Do not let TT changes list grows indefinitely When TT changes list is too big to fit in packet due to MTU size, an empty OGM is sent expected other node to send TT request to get the changes. The issue is that tt.last_changeset was not built thus the originator was responding with previous changes to those TT requests (see batadv_send_my_tt_response). Also the changes list was never cleaned up effectively never ending growing from this point onwards, repeatedly sending the same TT response changes over and over, and creating a new empty OGM every OGM interval expecting for the local changes to be purged. When there is more TT changes that can fit in packet, drop all changes, send empty OGM and wait for TT request so we can respond with a full table instead. Fixes: e1bf0c14096f ("batman-adv: tvlv - convert tt data sent within OGMs") Signed-off-by: Remi Pommarel Acked-by: Antonio Quartulli Signed-off-by: Sven Eckelmann Signed-off-by: Simon Wunderlich --- net/batman-adv/translation-table.c | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/net/batman-adv/translation-table.c b/net/batman-adv/translation-table.c index f5201738628c..760d51fdbdf6 100644 --- a/net/batman-adv/translation-table.c +++ b/net/batman-adv/translation-table.c @@ -948,6 +948,7 @@ static void batadv_tt_tvlv_container_update(struct batadv_priv *bat_priv) int tt_diff_len, tt_change_len = 0; int tt_diff_entries_num = 0; int tt_diff_entries_count = 0; + bool drop_changes = false; size_t tt_extra_len = 0; u16 tvlv_len; @@ -955,10 +956,17 @@ static void batadv_tt_tvlv_container_update(struct batadv_priv *bat_priv) tt_diff_len = batadv_tt_len(tt_diff_entries_num); /* if we have too many changes for one packet don't send any - * and wait for the tt table request which will be fragmented + * and wait for the tt table request so we can reply with the full + * (fragmented) table. + * + * The local change history should still be cleaned up so the next + * TT round can start again with a clean state. */ - if (tt_diff_len > bat_priv->soft_iface->mtu) + if (tt_diff_len > bat_priv->soft_iface->mtu) { tt_diff_len = 0; + tt_diff_entries_num = 0; + drop_changes = true; + } tvlv_len = batadv_tt_prepare_tvlv_local_data(bat_priv, &tt_data, &tt_change, &tt_diff_len); @@ -967,7 +975,7 @@ static void batadv_tt_tvlv_container_update(struct batadv_priv *bat_priv) tt_data->flags = BATADV_TT_OGM_DIFF; - if (tt_diff_len == 0) + if (!drop_changes && tt_diff_len == 0) goto container_register; spin_lock_bh(&bat_priv->tt.changes_list_lock); -- 2.51.0 From ccb7276a6d26d6f8416e315b43b45e15ee7f29e2 Mon Sep 17 00:00:00 2001 From: Andy Strohman Date: Thu, 9 Jan 2025 02:27:56 +0000 Subject: [PATCH 14/16] batman-adv: fix panic during interface removal Reference counting is used to ensure that batadv_hardif_neigh_node and batadv_hard_iface are not freed before/during batadv_v_elp_throughput_metric_update work is finished. But there isn't a guarantee that the hard if will remain associated with a soft interface up until the work is finished. This fixes a crash triggered by reboot that looks like this: Call trace: batadv_v_mesh_free+0xd0/0x4dc [batman_adv] batadv_v_elp_throughput_metric_update+0x1c/0xa4 process_one_work+0x178/0x398 worker_thread+0x2e8/0x4d0 kthread+0xd8/0xdc ret_from_fork+0x10/0x20 (the batadv_v_mesh_free call is misleading, and does not actually happen) I was able to make the issue happen more reliably by changing hardif_neigh->bat_v.metric_work work to be delayed work. This allowed me to track down and confirm the fix. Cc: stable@vger.kernel.org Fixes: c833484e5f38 ("batman-adv: ELP - compute the metric based on the estimated throughput") Signed-off-by: Andy Strohman [sven@narfation.org: prevent entering batadv_v_elp_get_throughput without soft_iface] Signed-off-by: Sven Eckelmann Signed-off-by: Simon Wunderlich --- net/batman-adv/bat_v_elp.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/net/batman-adv/bat_v_elp.c b/net/batman-adv/bat_v_elp.c index 1d704574e6bf..fbf499bcc671 100644 --- a/net/batman-adv/bat_v_elp.c +++ b/net/batman-adv/bat_v_elp.c @@ -66,12 +66,19 @@ static void batadv_v_elp_start_timer(struct batadv_hard_iface *hard_iface) static u32 batadv_v_elp_get_throughput(struct batadv_hardif_neigh_node *neigh) { struct batadv_hard_iface *hard_iface = neigh->if_incoming; + struct net_device *soft_iface = hard_iface->soft_iface; struct ethtool_link_ksettings link_settings; struct net_device *real_netdev; struct station_info sinfo; u32 throughput; int ret; + /* don't query throughput when no longer associated with any + * batman-adv interface + */ + if (!soft_iface) + return BATADV_THROUGHPUT_DEFAULT_VALUE; + /* if the user specified a customised value for this interface, then * return it directly */ @@ -141,7 +148,7 @@ static u32 batadv_v_elp_get_throughput(struct batadv_hardif_neigh_node *neigh) default_throughput: if (!(hard_iface->bat_v.flags & BATADV_WARNING_DEFAULT)) { - batadv_info(hard_iface->soft_iface, + batadv_info(soft_iface, "WiFi driver or ethtool info does not provide information about link speeds on interface %s, therefore defaulting to hardcoded throughput values of %u.%1u Mbps. Consider overriding the throughput manually or checking your driver.\n", hard_iface->net_dev->name, BATADV_THROUGHPUT_DEFAULT_VALUE / 10, -- 2.51.0 From e7e34ffc976aaae4f465b7898303241b81ceefc3 Mon Sep 17 00:00:00 2001 From: Sven Eckelmann Date: Mon, 20 Jan 2025 20:35:28 +0100 Subject: [PATCH 15/16] batman-adv: Ignore neighbor throughput metrics in error case If a temporary error happened in the evaluation of the neighbor throughput information, then the invalid throughput result should not be stored in the throughtput EWMA. Cc: stable@vger.kernel.org Signed-off-by: Sven Eckelmann Signed-off-by: Simon Wunderlich --- net/batman-adv/bat_v_elp.c | 50 ++++++++++++++++++++++++++------------ 1 file changed, 34 insertions(+), 16 deletions(-) diff --git a/net/batman-adv/bat_v_elp.c b/net/batman-adv/bat_v_elp.c index fbf499bcc671..65e52de52bcd 100644 --- a/net/batman-adv/bat_v_elp.c +++ b/net/batman-adv/bat_v_elp.c @@ -59,11 +59,13 @@ static void batadv_v_elp_start_timer(struct batadv_hard_iface *hard_iface) /** * batadv_v_elp_get_throughput() - get the throughput towards a neighbour * @neigh: the neighbour for which the throughput has to be obtained + * @pthroughput: calculated throughput towards the given neighbour in multiples + * of 100kpbs (a value of '1' equals 0.1Mbps, '10' equals 1Mbps, etc). * - * Return: The throughput towards the given neighbour in multiples of 100kpbs - * (a value of '1' equals 0.1Mbps, '10' equals 1Mbps, etc). + * Return: true when value behind @pthroughput was set */ -static u32 batadv_v_elp_get_throughput(struct batadv_hardif_neigh_node *neigh) +static bool batadv_v_elp_get_throughput(struct batadv_hardif_neigh_node *neigh, + u32 *pthroughput) { struct batadv_hard_iface *hard_iface = neigh->if_incoming; struct net_device *soft_iface = hard_iface->soft_iface; @@ -77,14 +79,16 @@ static u32 batadv_v_elp_get_throughput(struct batadv_hardif_neigh_node *neigh) * batman-adv interface */ if (!soft_iface) - return BATADV_THROUGHPUT_DEFAULT_VALUE; + return false; /* if the user specified a customised value for this interface, then * return it directly */ throughput = atomic_read(&hard_iface->bat_v.throughput_override); - if (throughput != 0) - return throughput; + if (throughput != 0) { + *pthroughput = throughput; + return true; + } /* if this is a wireless device, then ask its throughput through * cfg80211 API @@ -111,19 +115,24 @@ static u32 batadv_v_elp_get_throughput(struct batadv_hardif_neigh_node *neigh) * possible to delete this neighbor. For now set * the throughput metric to 0. */ - return 0; + *pthroughput = 0; + return true; } if (ret) goto default_throughput; - if (sinfo.filled & BIT(NL80211_STA_INFO_EXPECTED_THROUGHPUT)) - return sinfo.expected_throughput / 100; + if (sinfo.filled & BIT(NL80211_STA_INFO_EXPECTED_THROUGHPUT)) { + *pthroughput = sinfo.expected_throughput / 100; + return true; + } /* try to estimate the expected throughput based on reported tx * rates */ - if (sinfo.filled & BIT(NL80211_STA_INFO_TX_BITRATE)) - return cfg80211_calculate_bitrate(&sinfo.txrate) / 3; + if (sinfo.filled & BIT(NL80211_STA_INFO_TX_BITRATE)) { + *pthroughput = cfg80211_calculate_bitrate(&sinfo.txrate) / 3; + return true; + } goto default_throughput; } @@ -142,8 +151,10 @@ static u32 batadv_v_elp_get_throughput(struct batadv_hardif_neigh_node *neigh) hard_iface->bat_v.flags &= ~BATADV_FULL_DUPLEX; throughput = link_settings.base.speed; - if (throughput && throughput != SPEED_UNKNOWN) - return throughput * 10; + if (throughput && throughput != SPEED_UNKNOWN) { + *pthroughput = throughput * 10; + return true; + } } default_throughput: @@ -157,7 +168,8 @@ default_throughput: } /* if none of the above cases apply, return the base_throughput */ - return BATADV_THROUGHPUT_DEFAULT_VALUE; + *pthroughput = BATADV_THROUGHPUT_DEFAULT_VALUE; + return true; } /** @@ -169,15 +181,21 @@ void batadv_v_elp_throughput_metric_update(struct work_struct *work) { struct batadv_hardif_neigh_node_bat_v *neigh_bat_v; struct batadv_hardif_neigh_node *neigh; + u32 throughput; + bool valid; neigh_bat_v = container_of(work, struct batadv_hardif_neigh_node_bat_v, metric_work); neigh = container_of(neigh_bat_v, struct batadv_hardif_neigh_node, bat_v); - ewma_throughput_add(&neigh->bat_v.throughput, - batadv_v_elp_get_throughput(neigh)); + valid = batadv_v_elp_get_throughput(neigh, &throughput); + if (!valid) + goto put_neigh; + + ewma_throughput_add(&neigh->bat_v.throughput, throughput); +put_neigh: /* decrement refcounter to balance increment performed before scheduling * this task */ -- 2.51.0 From 8c8ecc98f5c65947b0070a24bac11e12e47cc65d Mon Sep 17 00:00:00 2001 From: Sven Eckelmann Date: Mon, 20 Jan 2025 00:06:11 +0100 Subject: [PATCH 16/16] batman-adv: Drop unmanaged ELP metric worker The ELP worker needs to calculate new metric values for all neighbors "reachable" over an interface. Some of the used metric sources require locks which might need to sleep. This sleep is incompatible with the RCU list iterator used for the recorded neighbors. The initial approach to work around of this problem was to queue another work item per neighbor and then run this in a new context. Even when this solved the RCU vs might_sleep() conflict, it has a major problems: Nothing was stopping the work item in case it is not needed anymore - for example because one of the related interfaces was removed or the batman-adv module was unloaded - resulting in potential invalid memory accesses. Directly canceling the metric worker also has various problems: * cancel_work_sync for a to-be-deactivated interface is called with rtnl_lock held. But the code in the ELP metric worker also tries to use rtnl_lock() - which will never return in this case. This also means that cancel_work_sync would never return because it is waiting for the worker to finish. * iterating over the neighbor list for the to-be-deactivated interface is currently done using the RCU specific methods. Which means that it is possible to miss items when iterating over it without the associated spinlock - a behaviour which is acceptable for a periodic metric check but not for a cleanup routine (which must "stop" all still running workers) The better approch is to get rid of the per interface neighbor metric worker and handle everything in the interface worker. The original problems are solved by: * creating a list of neighbors which require new metric information inside the RCU protected context, gathering the metric according to the new list outside the RCU protected context * only use rcu_trylock inside metric gathering code to avoid a deadlock when the cancel_delayed_work_sync is called in the interface removal code (which is called with the rtnl_lock held) Cc: stable@vger.kernel.org Fixes: c833484e5f38 ("batman-adv: ELP - compute the metric based on the estimated throughput") Signed-off-by: Sven Eckelmann Signed-off-by: Simon Wunderlich --- net/batman-adv/bat_v.c | 2 -- net/batman-adv/bat_v_elp.c | 71 ++++++++++++++++++++++++++------------ net/batman-adv/bat_v_elp.h | 2 -- net/batman-adv/types.h | 3 -- 4 files changed, 48 insertions(+), 30 deletions(-) diff --git a/net/batman-adv/bat_v.c b/net/batman-adv/bat_v.c index ac11f1f08db0..d35479c465e2 100644 --- a/net/batman-adv/bat_v.c +++ b/net/batman-adv/bat_v.c @@ -113,8 +113,6 @@ static void batadv_v_hardif_neigh_init(struct batadv_hardif_neigh_node *hardif_neigh) { ewma_throughput_init(&hardif_neigh->bat_v.throughput); - INIT_WORK(&hardif_neigh->bat_v.metric_work, - batadv_v_elp_throughput_metric_update); } /** diff --git a/net/batman-adv/bat_v_elp.c b/net/batman-adv/bat_v_elp.c index 65e52de52bcd..b065578b4436 100644 --- a/net/batman-adv/bat_v_elp.c +++ b/net/batman-adv/bat_v_elp.c @@ -18,6 +18,7 @@ #include #include #include +#include #include #include #include @@ -26,6 +27,7 @@ #include #include #include +#include #include #include #include @@ -41,6 +43,18 @@ #include "routing.h" #include "send.h" +/** + * struct batadv_v_metric_queue_entry - list of hardif neighbors which require + * and metric update + */ +struct batadv_v_metric_queue_entry { + /** @hardif_neigh: hardif neighbor scheduled for metric update */ + struct batadv_hardif_neigh_node *hardif_neigh; + + /** @list: list node for metric_queue */ + struct list_head list; +}; + /** * batadv_v_elp_start_timer() - restart timer for ELP periodic work * @hard_iface: the interface for which the timer has to be reset @@ -137,10 +151,17 @@ static bool batadv_v_elp_get_throughput(struct batadv_hardif_neigh_node *neigh, goto default_throughput; } + /* only use rtnl_trylock because the elp worker will be cancelled while + * the rntl_lock is held. the cancel_delayed_work_sync() would otherwise + * wait forever when the elp work_item was started and it is then also + * trying to rtnl_lock + */ + if (!rtnl_trylock()) + return false; + /* if not a wifi interface, check if this device provides data via * ethtool (e.g. an Ethernet adapter) */ - rtnl_lock(); ret = __ethtool_get_link_ksettings(hard_iface->net_dev, &link_settings); rtnl_unlock(); if (ret == 0) { @@ -175,31 +196,19 @@ default_throughput: /** * batadv_v_elp_throughput_metric_update() - worker updating the throughput * metric of a single hop neighbour - * @work: the work queue item + * @neigh: the neighbour to probe */ -void batadv_v_elp_throughput_metric_update(struct work_struct *work) +static void +batadv_v_elp_throughput_metric_update(struct batadv_hardif_neigh_node *neigh) { - struct batadv_hardif_neigh_node_bat_v *neigh_bat_v; - struct batadv_hardif_neigh_node *neigh; u32 throughput; bool valid; - neigh_bat_v = container_of(work, struct batadv_hardif_neigh_node_bat_v, - metric_work); - neigh = container_of(neigh_bat_v, struct batadv_hardif_neigh_node, - bat_v); - valid = batadv_v_elp_get_throughput(neigh, &throughput); if (!valid) - goto put_neigh; + return; ewma_throughput_add(&neigh->bat_v.throughput, throughput); - -put_neigh: - /* decrement refcounter to balance increment performed before scheduling - * this task - */ - batadv_hardif_neigh_put(neigh); } /** @@ -273,14 +282,16 @@ batadv_v_elp_wifi_neigh_probe(struct batadv_hardif_neigh_node *neigh) */ static void batadv_v_elp_periodic_work(struct work_struct *work) { + struct batadv_v_metric_queue_entry *metric_entry; + struct batadv_v_metric_queue_entry *metric_safe; struct batadv_hardif_neigh_node *hardif_neigh; struct batadv_hard_iface *hard_iface; struct batadv_hard_iface_bat_v *bat_v; struct batadv_elp_packet *elp_packet; + struct list_head metric_queue; struct batadv_priv *bat_priv; struct sk_buff *skb; u32 elp_interval; - bool ret; bat_v = container_of(work, struct batadv_hard_iface_bat_v, elp_wq.work); hard_iface = container_of(bat_v, struct batadv_hard_iface, bat_v); @@ -316,6 +327,8 @@ static void batadv_v_elp_periodic_work(struct work_struct *work) atomic_inc(&hard_iface->bat_v.elp_seqno); + INIT_LIST_HEAD(&metric_queue); + /* The throughput metric is updated on each sent packet. This way, if a * node is dead and no longer sends packets, batman-adv is still able to * react timely to its death. @@ -340,16 +353,28 @@ static void batadv_v_elp_periodic_work(struct work_struct *work) /* Reading the estimated throughput from cfg80211 is a task that * may sleep and that is not allowed in an rcu protected - * context. Therefore schedule a task for that. + * context. Therefore add it to metric_queue and process it + * outside rcu protected context. */ - ret = queue_work(batadv_event_workqueue, - &hardif_neigh->bat_v.metric_work); - - if (!ret) + metric_entry = kzalloc(sizeof(*metric_entry), GFP_ATOMIC); + if (!metric_entry) { batadv_hardif_neigh_put(hardif_neigh); + continue; + } + + metric_entry->hardif_neigh = hardif_neigh; + list_add(&metric_entry->list, &metric_queue); } rcu_read_unlock(); + list_for_each_entry_safe(metric_entry, metric_safe, &metric_queue, list) { + batadv_v_elp_throughput_metric_update(metric_entry->hardif_neigh); + + batadv_hardif_neigh_put(metric_entry->hardif_neigh); + list_del(&metric_entry->list); + kfree(metric_entry); + } + restart_timer: batadv_v_elp_start_timer(hard_iface); out: diff --git a/net/batman-adv/bat_v_elp.h b/net/batman-adv/bat_v_elp.h index 9e2740195fa2..c9cb0a307100 100644 --- a/net/batman-adv/bat_v_elp.h +++ b/net/batman-adv/bat_v_elp.h @@ -10,7 +10,6 @@ #include "main.h" #include -#include int batadv_v_elp_iface_enable(struct batadv_hard_iface *hard_iface); void batadv_v_elp_iface_disable(struct batadv_hard_iface *hard_iface); @@ -19,6 +18,5 @@ void batadv_v_elp_iface_activate(struct batadv_hard_iface *primary_iface, void batadv_v_elp_primary_iface_set(struct batadv_hard_iface *primary_iface); int batadv_v_elp_packet_recv(struct sk_buff *skb, struct batadv_hard_iface *if_incoming); -void batadv_v_elp_throughput_metric_update(struct work_struct *work); #endif /* _NET_BATMAN_ADV_BAT_V_ELP_H_ */ diff --git a/net/batman-adv/types.h b/net/batman-adv/types.h index 04f6398b3a40..85a50096f5b2 100644 --- a/net/batman-adv/types.h +++ b/net/batman-adv/types.h @@ -596,9 +596,6 @@ struct batadv_hardif_neigh_node_bat_v { * neighbor */ unsigned long last_unicast_tx; - - /** @metric_work: work queue callback item for metric update */ - struct work_struct metric_work; }; /** -- 2.51.0