From d071004e623b7433573019d67cba79e345d83006 Mon Sep 17 00:00:00 2001 From: Zhang Rui Date: Thu, 14 Nov 2024 15:59:44 +0800 Subject: [PATCH 01/16] tools/power turbostat: Consolidate graphics sysfs access Currently, there is an inconsistency in how graphics sysfs knobs are accessed: graphics residency sysfs knobs are opened and closed for each read, while graphics frequency sysfs knobs are opened once and remain open until turbostat exits. This inconsistency is confusing and adds unnecessary code complexity. Consolidate the access method by opening the sysfs files once and reusing the file pointers for subsequent accesses. This approach simplifies the code and ensures a consistent method for accessing graphics sysfs knobs. Signed-off-by: Zhang Rui Signed-off-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index c0596ccf92cd..e5b100b8db24 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -5764,27 +5764,24 @@ int snapshot_proc_interrupts(void) */ int snapshot_graphics(int idx) { - FILE *fp; int retval; + if (gfx_info[idx].fp == NULL) + gfx_info[idx].fp = fopen_or_die(gfx_info[idx].path, "r"); + else + rewind(gfx_info[idx].fp); + switch (idx) { case GFX_rc6: case SAM_mc6: - fp = fopen_or_die(gfx_info[idx].path, "r"); - retval = fscanf(fp, "%lld", &gfx_info[idx].val_ull); + retval = fscanf(gfx_info[idx].fp, "%lld", &gfx_info[idx].val_ull); if (retval != 1) err(1, "rc6"); - fclose(fp); return 0; case GFX_MHz: case GFX_ACTMHz: case SAM_MHz: case SAM_ACTMHz: - if (gfx_info[idx].fp == NULL) - gfx_info[idx].fp = fopen_or_die(gfx_info[idx].path, "r"); - else - rewind(gfx_info[idx].fp); - retval = fscanf(gfx_info[idx].fp, "%d", &gfx_info[idx].val); if (retval != 1) err(1, "MHz"); -- 2.51.0 From c7538f33853b11d0ff2a81efb78bde125d1fc49f Mon Sep 17 00:00:00 2001 
From: Zhang Rui Date: Thu, 14 Nov 2024 15:59:45 +0800 Subject: [PATCH 02/16] tools/power turbostat: Cache graphics sysfs file descriptors during probe Snapshots of the graphics sysfs knobs are taken based on file descriptors. To optimize this process, open the files and cache the file descriptors during the graphics probe phase. As a result, the previously cached pathnames become redundant and are removed. This change aims to streamline the code without altering its functionality. No functional change intended. Signed-off-by: Zhang Rui Signed-off-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 82 +++++++++++---------------- 1 file changed, 32 insertions(+), 50 deletions(-) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index e5b100b8db24..28513172ffce 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -376,7 +376,6 @@ enum gfx_sysfs_idx { }; struct gfx_sysfs_info { - const char *path; FILE *fp; unsigned int val; unsigned long long val_ull; @@ -5766,10 +5765,7 @@ int snapshot_graphics(int idx) { int retval; - if (gfx_info[idx].fp == NULL) - gfx_info[idx].fp = fopen_or_die(gfx_info[idx].path, "r"); - else - rewind(gfx_info[idx].fp); + rewind(gfx_info[idx].fp); switch (idx) { case GFX_rc6: @@ -6474,6 +6470,12 @@ static void probe_intel_uncore_frequency(void) probe_intel_uncore_frequency_legacy(); } +static void set_graphics_fp(char *path, int idx) +{ + if (!access(path, R_OK)) + gfx_info[idx].fp = fopen_or_die(path, "r"); +} + static void probe_graphics(void) { /* Xe graphics sysfs knobs */ @@ -6481,7 +6483,6 @@ static void probe_graphics(void) FILE *fp; char buf[8]; bool gt0_is_gt; - int idx; fp = fopen("/sys/class/drm/card0/device/tile0/gt0/gtidle/name", "r"); if (!fp) @@ -6500,28 +6501,17 @@ static void probe_graphics(void) else goto next; - idx = gt0_is_gt ? 
GFX_rc6 : SAM_mc6; - gfx_info[idx].path = "/sys/class/drm/card0/device/tile0/gt0/gtidle/idle_residency_ms"; + set_graphics_fp("/sys/class/drm/card0/device/tile0/gt0/gtidle/idle_residency_ms", gt0_is_gt ? GFX_rc6 : SAM_mc6); - idx = gt0_is_gt ? GFX_MHz : SAM_MHz; - if (!access("/sys/class/drm/card0/device/tile0/gt0/freq0/cur_freq", R_OK)) - gfx_info[idx].path = "/sys/class/drm/card0/device/tile0/gt0/freq0/cur_freq"; + set_graphics_fp("/sys/class/drm/card0/device/tile0/gt0/freq0/cur_freq", gt0_is_gt ? GFX_MHz : SAM_MHz); - idx = gt0_is_gt ? GFX_ACTMHz : SAM_ACTMHz; - if (!access("/sys/class/drm/card0/device/tile0/gt0/freq0/act_freq", R_OK)) - gfx_info[idx].path = "/sys/class/drm/card0/device/tile0/gt0/freq0/act_freq"; + set_graphics_fp("/sys/class/drm/card0/device/tile0/gt0/freq0/act_freq", gt0_is_gt ? GFX_ACTMHz : SAM_ACTMHz); - idx = gt0_is_gt ? SAM_mc6 : GFX_rc6; - if (!access("/sys/class/drm/card0/device/tile0/gt1/gtidle/idle_residency_ms", R_OK)) - gfx_info[idx].path = "/sys/class/drm/card0/device/tile0/gt1/gtidle/idle_residency_ms"; + set_graphics_fp("/sys/class/drm/card0/device/tile0/gt1/gtidle/idle_residency_ms", gt0_is_gt ? SAM_mc6 : GFX_rc6); - idx = gt0_is_gt ? SAM_MHz : GFX_MHz; - if (!access("/sys/class/drm/card0/device/tile0/gt1/freq0/cur_freq", R_OK)) - gfx_info[idx].path = "/sys/class/drm/card0/device/tile0/gt1/freq0/cur_freq"; + set_graphics_fp("/sys/class/drm/card0/device/tile0/gt1/freq0/cur_freq", gt0_is_gt ? SAM_MHz : GFX_MHz); - idx = gt0_is_gt ? SAM_ACTMHz : GFX_ACTMHz; - if (!access("/sys/class/drm/card0/device/tile0/gt1/freq0/act_freq", R_OK)) - gfx_info[idx].path = "/sys/class/drm/card0/device/tile0/gt1/freq0/act_freq"; + set_graphics_fp("/sys/class/drm/card0/device/tile0/gt1/freq0/act_freq", gt0_is_gt ? 
SAM_ACTMHz : GFX_ACTMHz); goto end; } @@ -6529,52 +6519,44 @@ static void probe_graphics(void) next: /* New i915 graphics sysfs knobs */ if (!access("/sys/class/drm/card0/gt/gt0/rc6_residency_ms", R_OK)) { - gfx_info[GFX_rc6].path = "/sys/class/drm/card0/gt/gt0/rc6_residency_ms"; + set_graphics_fp("/sys/class/drm/card0/gt/gt0/rc6_residency_ms", GFX_rc6); - if (!access("/sys/class/drm/card0/gt/gt0/rps_cur_freq_mhz", R_OK)) - gfx_info[GFX_MHz].path = "/sys/class/drm/card0/gt/gt0/rps_cur_freq_mhz"; + set_graphics_fp("/sys/class/drm/card0/gt/gt0/rps_cur_freq_mhz", GFX_MHz); - if (!access("/sys/class/drm/card0/gt/gt0/rps_act_freq_mhz", R_OK)) - gfx_info[GFX_ACTMHz].path = "/sys/class/drm/card0/gt/gt0/rps_act_freq_mhz"; + set_graphics_fp("/sys/class/drm/card0/gt/gt0/rps_act_freq_mhz", GFX_ACTMHz); - if (!access("/sys/class/drm/card0/gt/gt1/rc6_residency_ms", R_OK)) - gfx_info[SAM_mc6].path = "/sys/class/drm/card0/gt/gt1/rc6_residency_ms"; + set_graphics_fp("/sys/class/drm/card0/gt/gt1/rc6_residency_ms", SAM_mc6); - if (!access("/sys/class/drm/card0/gt/gt1/rps_cur_freq_mhz", R_OK)) - gfx_info[SAM_MHz].path = "/sys/class/drm/card0/gt/gt1/rps_cur_freq_mhz"; + set_graphics_fp("/sys/class/drm/card0/gt/gt1/rps_cur_freq_mhz", SAM_MHz); - if (!access("/sys/class/drm/card0/gt/gt1/rps_act_freq_mhz", R_OK)) - gfx_info[SAM_ACTMHz].path = "/sys/class/drm/card0/gt/gt1/rps_act_freq_mhz"; + set_graphics_fp("/sys/class/drm/card0/gt/gt1/rps_act_freq_mhz", SAM_ACTMHz); goto end; } /* Fall back to traditional i915 graphics sysfs knobs */ - if (!access("/sys/class/drm/card0/power/rc6_residency_ms", R_OK)) - gfx_info[GFX_rc6].path = "/sys/class/drm/card0/power/rc6_residency_ms"; + set_graphics_fp("/sys/class/drm/card0/power/rc6_residency_ms", GFX_rc6); - if (!access("/sys/class/drm/card0/gt_cur_freq_mhz", R_OK)) - gfx_info[GFX_MHz].path = "/sys/class/drm/card0/gt_cur_freq_mhz"; - else if (!access("/sys/class/graphics/fb0/device/drm/card0/gt_cur_freq_mhz", R_OK)) - gfx_info[GFX_MHz].path = 
"/sys/class/graphics/fb0/device/drm/card0/gt_cur_freq_mhz"; + set_graphics_fp("/sys/class/drm/card0/gt_cur_freq_mhz", GFX_MHz); + if (!gfx_info[GFX_MHz].fp) + set_graphics_fp("/sys/class/graphics/fb0/device/drm/card0/gt_cur_freq_mhz", GFX_MHz); - if (!access("/sys/class/drm/card0/gt_act_freq_mhz", R_OK)) - gfx_info[GFX_ACTMHz].path = "/sys/class/drm/card0/gt_act_freq_mhz"; - else if (!access("/sys/class/graphics/fb0/device/drm/card0/gt_act_freq_mhz", R_OK)) - gfx_info[GFX_ACTMHz].path = "/sys/class/graphics/fb0/device/drm/card0/gt_act_freq_mhz"; + set_graphics_fp("/sys/class/drm/card0/gt_act_freq_mhz", GFX_ACTMHz); + if (!gfx_info[GFX_ACTMHz].fp) + set_graphics_fp("/sys/class/graphics/fb0/device/drm/card0/gt_act_freq_mhz", GFX_ACTMHz); end: - if (gfx_info[GFX_rc6].path) + if (gfx_info[GFX_rc6].fp) BIC_PRESENT(BIC_GFX_rc6); - if (gfx_info[GFX_MHz].path) + if (gfx_info[GFX_MHz].fp) BIC_PRESENT(BIC_GFXMHz); - if (gfx_info[GFX_ACTMHz].path) + if (gfx_info[GFX_ACTMHz].fp) BIC_PRESENT(BIC_GFXACTMHz); - if (gfx_info[SAM_mc6].path) + if (gfx_info[SAM_mc6].fp) BIC_PRESENT(BIC_SAM_mc6); - if (gfx_info[SAM_MHz].path) + if (gfx_info[SAM_MHz].fp) BIC_PRESENT(BIC_SAMMHz); - if (gfx_info[SAM_ACTMHz].path) + if (gfx_info[SAM_ACTMHz].fp) BIC_PRESENT(BIC_SAMACTMHz); } -- 2.51.0 From 03109e2f0d18dcb84218bd91c4fbf864193ca934 Mon Sep 17 00:00:00 2001 From: Zhang Rui Date: Thu, 14 Nov 2024 15:59:46 +0800 Subject: [PATCH 03/16] tools/power turbostat: Add support for /sys/class/drm/card1 On some machines, the graphics device is enumerated as /sys/class/drm/card1 instead of /sys/class/drm/card0. The current implementation does not handle this scenario, resulting in the loss of graphics C6 residency and frequency information. Add support for /sys/class/drm/card1, ensuring that turbostat can retrieve and display the graphics columns for these platforms. 
Signed-off-by: Zhang Rui Signed-off-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 38 ++++++++++++++++++++------- 1 file changed, 29 insertions(+), 9 deletions(-) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index 28513172ffce..b250676c174e 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -6476,8 +6476,14 @@ static void set_graphics_fp(char *path, int idx) gfx_info[idx].fp = fopen_or_die(path, "r"); } +/* Enlarge this if there are /sys/class/drm/card2 ... */ +#define GFX_MAX_CARDS 2 + static void probe_graphics(void) { + char path[PATH_MAX]; + int i; + /* Xe graphics sysfs knobs */ if (!access("/sys/class/drm/card0/device/tile0/gt0/gtidle/idle_residency_ms", R_OK)) { FILE *fp; @@ -6518,22 +6524,36 @@ static void probe_graphics(void) next: /* New i915 graphics sysfs knobs */ - if (!access("/sys/class/drm/card0/gt/gt0/rc6_residency_ms", R_OK)) { - set_graphics_fp("/sys/class/drm/card0/gt/gt0/rc6_residency_ms", GFX_rc6); + for (i = 0; i < GFX_MAX_CARDS; i++) { + snprintf(path, PATH_MAX, "/sys/class/drm/card%d/gt/gt0/rc6_residency_ms", i); + if (!access(path, R_OK)) + break; + } - set_graphics_fp("/sys/class/drm/card0/gt/gt0/rps_cur_freq_mhz", GFX_MHz); + if (i == GFX_MAX_CARDS) + goto legacy_i915; - set_graphics_fp("/sys/class/drm/card0/gt/gt0/rps_act_freq_mhz", GFX_ACTMHz); + snprintf(path, PATH_MAX, "/sys/class/drm/card%d/gt/gt0/rc6_residency_ms", i); + set_graphics_fp(path, GFX_rc6); - set_graphics_fp("/sys/class/drm/card0/gt/gt1/rc6_residency_ms", SAM_mc6); + snprintf(path, PATH_MAX, "/sys/class/drm/card%d/gt/gt0/rps_cur_freq_mhz", i); + set_graphics_fp(path, GFX_MHz); - set_graphics_fp("/sys/class/drm/card0/gt/gt1/rps_cur_freq_mhz", SAM_MHz); + snprintf(path, PATH_MAX, "/sys/class/drm/card%d/gt/gt0/rps_act_freq_mhz", i); + set_graphics_fp(path, GFX_ACTMHz); - set_graphics_fp("/sys/class/drm/card0/gt/gt1/rps_act_freq_mhz", SAM_ACTMHz); + snprintf(path, 
PATH_MAX, "/sys/class/drm/card%d/gt/gt1/rc6_residency_ms", i); + set_graphics_fp(path, SAM_mc6); - goto end; - } + snprintf(path, PATH_MAX, "/sys/class/drm/card%d/gt/gt1/rps_cur_freq_mhz", i); + set_graphics_fp(path, SAM_MHz); + + snprintf(path, PATH_MAX, "/sys/class/drm/card%d/gt/gt1/rps_act_freq_mhz", i); + set_graphics_fp(path, SAM_ACTMHz); + + goto end; +legacy_i915: /* Fall back to traditional i915 graphics sysfs knobs */ set_graphics_fp("/sys/class/drm/card0/power/rc6_residency_ms", GFX_rc6); -- 2.51.0 From bcfab87108b33f20d847fd71a2a93114dd2ce83e Mon Sep 17 00:00:00 2001 From: Patryk Wlazlyn Date: Thu, 24 Oct 2024 15:17:45 +0200 Subject: [PATCH 04/16] tools/power turbostat: Force --no-perf in --dump mode Force the --no-perf early to prevent using it as a source. User asks for raw values, but perf returns them relative to the opening of the file descriptor. Signed-off-by: Patryk Wlazlyn Signed-off-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index b250676c174e..1fed799a5537 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -9897,6 +9897,12 @@ void cmdline(int argc, char **argv) break; case 'D': dump_only++; + /* + * Force the no_perf early to prevent using it as a source. + * User asks for raw values, but perf returns them relative + * to the opening of the file descriptor. + */ + no_perf = 1; break; case 'e': /* --enable specified counter */ -- 2.51.0 From 1da0daf746342dfdc114e4dc8fbf3ece28666d4f Mon Sep 17 00:00:00 2001 From: Patryk Wlazlyn Date: Wed, 13 Nov 2024 15:48:22 +0100 Subject: [PATCH 05/16] tools/power turbostat: Fix child's argument forwarding Add '+' to optstring when early scanning for --no-msr and --no-perf. It causes option processing to stop as soon as a nonoption argument is encountered, effectively skipping child's arguments. 
Fixes: 3e4048466c39 ("tools/power turbostat: Add --no-msr option") Signed-off-by: Patryk Wlazlyn Signed-off-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index 1fed799a5537..9025c2945737 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -9873,7 +9873,7 @@ void cmdline(int argc, char **argv) * Parse some options early, because they may make other options invalid, * like adding the MSR counter with --add and at the same time using --no-msr. */ - while ((opt = getopt_long_only(argc, argv, "MPn:", long_options, &option_index)) != -1) { + while ((opt = getopt_long_only(argc, argv, "+MPn:", long_options, &option_index)) != -1) { switch (opt) { case 'M': no_msr = 1; -- 2.51.0 From e5f687b89bc2a892256f48e14a568970b59a4812 Mon Sep 17 00:00:00 2001 From: Patryk Wlazlyn Date: Wed, 2 Oct 2024 15:05:15 +0200 Subject: [PATCH 06/16] tools/power turbostat: Add RAPL psys as a built-in counter Introduce the counter as a part of global, platform counters structure. We open the counter for only one cpu, but otherwise treat it as an ordinary RAPL counter, allowing for grouped perf read. The counter is disabled by default, because its interpretation may require additional, platform specific information, making it unsuitable for general use. 
Signed-off-by: Patryk Wlazlyn Signed-off-by: Len Brown --- tools/power/x86/turbostat/turbostat.8 | 2 + tools/power/x86/turbostat/turbostat.c | 93 ++++++++++++++++++++++++--- 2 files changed, 85 insertions(+), 10 deletions(-) diff --git a/tools/power/x86/turbostat/turbostat.8 b/tools/power/x86/turbostat/turbostat.8 index 56c7ff6efcda..95eb02346d3a 100644 --- a/tools/power/x86/turbostat/turbostat.8 +++ b/tools/power/x86/turbostat/turbostat.8 @@ -190,6 +190,8 @@ The system configuration dump (if --quiet is not used) is followed by statistics .PP \fBRAMWatt\fP Watts consumed by the DRAM DIMMS -- available only on server processors. .PP +\fBSysWatt\fP Watts consumed by the whole platform (RAPL PSYS). Disabled by default. May require platform specific information to interpret the data, making it not suitable for general use. +.PP \fBPKG_%\fP percent of the interval that RAPL throttling was active on the Package. Note that the system summary is the sum of the package throttling time, and thus may be higher than 100% on a multi-package system. Note that the meaning of this field is model specific. For example, some hardware increments this counter when RAPL responds to thermal limits, but does not increment this counter when RAPL responds to power limits. Comparing PkgWatt and PkgTmp to system limits is necessary. .PP \fBRAM_%\fP percent of the interval that RAPL throttling was active on DRAM. 
diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index 9025c2945737..88c7f896c5b2 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -200,6 +200,8 @@ struct msr_counter bic[] = { { 0x0, "SAMMHz", NULL, 0, 0, 0, NULL, 0 }, { 0x0, "SAMAMHz", NULL, 0, 0, 0, NULL, 0 }, { 0x0, "Die%c6", NULL, 0, 0, 0, NULL, 0 }, + { 0x0, "SysWatt", NULL, 0, 0, 0, NULL, 0 }, + { 0x0, "Sys_J", NULL, 0, 0, 0, NULL, 0 }, }; #define MAX_BIC (sizeof(bic) / sizeof(struct msr_counter)) @@ -262,6 +264,8 @@ struct msr_counter bic[] = { #define BIC_SAMMHz (1ULL << 56) #define BIC_SAMACTMHz (1ULL << 57) #define BIC_Diec6 (1ULL << 58) +#define BIC_SysWatt (1ULL << 59) +#define BIC_Sys_J (1ULL << 60) #define BIC_TOPOLOGY (BIC_Package | BIC_Node | BIC_CoreCnt | BIC_PkgCnt | BIC_Core | BIC_CPU | BIC_Die ) #define BIC_THERMAL_PWR ( BIC_CoreTmp | BIC_PkgTmp | BIC_PkgWatt | BIC_CorWatt | BIC_GFXWatt | BIC_RAMWatt | BIC_PKG__ | BIC_RAM__) @@ -269,7 +273,7 @@ struct msr_counter bic[] = { #define BIC_IDLE (BIC_sysfs | BIC_CPU_c1 | BIC_CPU_c3 | BIC_CPU_c6 | BIC_CPU_c7 | BIC_GFX_rc6 | BIC_Pkgpc2 | BIC_Pkgpc3 | BIC_Pkgpc6 | BIC_Pkgpc7 | BIC_Pkgpc8 | BIC_Pkgpc9 | BIC_Pkgpc10 | BIC_CPU_LPI | BIC_SYS_LPI | BIC_Mod_c6 | BIC_Totl_c0 | BIC_Any_c0 | BIC_GFX_c0 | BIC_CPUGFX | BIC_SAM_mc6 | BIC_Diec6) #define BIC_OTHER ( BIC_IRQ | BIC_SMI | BIC_ThreadC | BIC_CoreTmp | BIC_IPC) -#define BIC_DISABLED_BY_DEFAULT (BIC_USEC | BIC_TOD | BIC_APIC | BIC_X2APIC) +#define BIC_DISABLED_BY_DEFAULT (BIC_USEC | BIC_TOD | BIC_APIC | BIC_X2APIC | BIC_SysWatt | BIC_Sys_J) unsigned long long bic_enabled = (0xFFFFFFFFFFFFFFFFULL & ~BIC_DISABLED_BY_DEFAULT); unsigned long long bic_present = BIC_USEC | BIC_TOD | BIC_sysfs | BIC_APIC | BIC_X2APIC; @@ -507,12 +511,15 @@ enum rapl_msrs { RAPL_AMD_PWR_UNIT = BIT(14), /* 0xc0010299 MSR_AMD_RAPL_POWER_UNIT */ RAPL_AMD_CORE_ENERGY_STAT = BIT(15), /* 0xc001029a MSR_AMD_CORE_ENERGY_STATUS */ 
RAPL_AMD_PKG_ENERGY_STAT = BIT(16), /* 0xc001029b MSR_AMD_PKG_ENERGY_STATUS */ + RAPL_PLATFORM_ENERGY_LIMIT = BIT(17), /* 0x64c MSR_PLATFORM_ENERGY_LIMIT */ + RAPL_PLATFORM_ENERGY_STATUS = BIT(18), /* 0x64d MSR_PLATFORM_ENERGY_STATUS */ }; #define RAPL_PKG (RAPL_PKG_ENERGY_STATUS | RAPL_PKG_POWER_LIMIT) #define RAPL_DRAM (RAPL_DRAM_ENERGY_STATUS | RAPL_DRAM_POWER_LIMIT) #define RAPL_CORE (RAPL_CORE_ENERGY_STATUS | RAPL_CORE_POWER_LIMIT) #define RAPL_GFX (RAPL_GFX_POWER_LIMIT | RAPL_GFX_ENERGY_STATUS) +#define RAPL_PSYS (RAPL_PLATFORM_ENERGY_STATUS | RAPL_PLATFORM_ENERGY_LIMIT) #define RAPL_PKG_ALL (RAPL_PKG | RAPL_PKG_PERF_STATUS | RAPL_PKG_POWER_INFO) #define RAPL_DRAM_ALL (RAPL_DRAM | RAPL_DRAM_PERF_STATUS | RAPL_DRAM_POWER_INFO) @@ -713,7 +720,7 @@ static const struct platform_features skl_features = { .has_ext_cst_msrs = 1, .trl_msrs = TRL_BASE, .tcc_offset_bits = 6, - .rapl_msrs = RAPL_PKG_ALL | RAPL_CORE_ALL | RAPL_DRAM | RAPL_DRAM_PERF_STATUS | RAPL_GFX, + .rapl_msrs = RAPL_PKG_ALL | RAPL_CORE_ALL | RAPL_DRAM | RAPL_DRAM_PERF_STATUS | RAPL_GFX | RAPL_PSYS, .enable_tsc_tweak = 1, }; @@ -730,7 +737,7 @@ static const struct platform_features cnl_features = { .has_ext_cst_msrs = 1, .trl_msrs = TRL_BASE, .tcc_offset_bits = 6, - .rapl_msrs = RAPL_PKG_ALL | RAPL_CORE_ALL | RAPL_DRAM | RAPL_DRAM_PERF_STATUS | RAPL_GFX, + .rapl_msrs = RAPL_PKG_ALL | RAPL_CORE_ALL | RAPL_DRAM | RAPL_DRAM_PERF_STATUS | RAPL_GFX | RAPL_PSYS, .enable_tsc_tweak = 1, }; @@ -797,7 +804,7 @@ static const struct platform_features icx_features = { .has_irtl_msrs = 1, .has_cst_prewake_bit = 1, .trl_msrs = TRL_BASE | TRL_CORECOUNT, - .rapl_msrs = RAPL_PKG_ALL | RAPL_DRAM_ALL, + .rapl_msrs = RAPL_PKG_ALL | RAPL_DRAM_ALL | RAPL_PSYS, .has_fixed_rapl_unit = 1, }; @@ -813,7 +820,7 @@ static const struct platform_features spr_features = { .has_irtl_msrs = 1, .has_cst_prewake_bit = 1, .trl_msrs = TRL_BASE | TRL_CORECOUNT, - .rapl_msrs = RAPL_PKG_ALL | RAPL_DRAM_ALL, + .rapl_msrs = RAPL_PKG_ALL | 
RAPL_DRAM_ALL | RAPL_PSYS, }; static const struct platform_features srf_features = { @@ -829,7 +836,7 @@ static const struct platform_features srf_features = { .has_irtl_msrs = 1, .has_cst_prewake_bit = 1, .trl_msrs = TRL_BASE | TRL_CORECOUNT, - .rapl_msrs = RAPL_PKG_ALL | RAPL_DRAM_ALL, + .rapl_msrs = RAPL_PKG_ALL | RAPL_DRAM_ALL | RAPL_PSYS, }; static const struct platform_features grr_features = { @@ -845,7 +852,7 @@ static const struct platform_features grr_features = { .has_irtl_msrs = 1, .has_cst_prewake_bit = 1, .trl_msrs = TRL_BASE | TRL_CORECOUNT, - .rapl_msrs = RAPL_PKG_ALL | RAPL_DRAM_ALL, + .rapl_msrs = RAPL_PKG_ALL | RAPL_DRAM_ALL | RAPL_PSYS, }; static const struct platform_features slv_features = { @@ -1108,6 +1115,7 @@ enum rapl_rci_index { RAPL_RCI_INDEX_PKG_PERF_STATUS = 4, RAPL_RCI_INDEX_DRAM_PERF_STATUS = 5, RAPL_RCI_INDEX_CORE_ENERGY = 6, + RAPL_RCI_INDEX_ENERGY_PLATFORM = 7, NUM_RAPL_COUNTERS, }; @@ -1134,6 +1142,7 @@ struct rapl_counter_info_t { struct rapl_counter_info_t *rapl_counter_info_perdomain; unsigned int rapl_counter_info_perdomain_size; +#define RAPL_COUNTER_FLAG_PLATFORM_COUNTER (1u << 0) #define RAPL_COUNTER_FLAG_USE_MSR_SUM (1u << 1) struct rapl_counter_arch_info { @@ -1255,6 +1264,19 @@ static const struct rapl_counter_arch_info rapl_counter_arch_infos[] = { .compat_scale = 1.0, .flags = 0, }, + { + .feature_mask = RAPL_PSYS, + .perf_subsys = "power", + .perf_name = "energy-psys", + .msr = MSR_PLATFORM_ENERGY_STATUS, + .msr_mask = 0x00000000FFFFFFFF, + .msr_shift = 0, + .platform_rapl_msr_scale = &rapl_energy_units, + .rci_index = RAPL_RCI_INDEX_ENERGY_PLATFORM, + .bic = BIC_SysWatt | BIC_Sys_J, + .compat_scale = 1.0, + .flags = RAPL_COUNTER_FLAG_PLATFORM_COUNTER | RAPL_COUNTER_FLAG_USE_MSR_SUM, + }, }; struct rapl_counter { @@ -1682,6 +1704,7 @@ enum { IDX_PP1_ENERGY, IDX_PKG_PERF, IDX_DRAM_PERF, + IDX_PSYS_ENERGY, IDX_COUNT, }; @@ -1726,6 +1749,9 @@ off_t idx_to_offset(int idx) case IDX_DRAM_PERF: offset = 
MSR_DRAM_PERF_STATUS; break; + case IDX_PSYS_ENERGY: + offset = MSR_PLATFORM_ENERGY_STATUS; + break; default: offset = -1; } @@ -1756,6 +1782,9 @@ int offset_to_idx(off_t offset) case MSR_DRAM_PERF_STATUS: idx = IDX_DRAM_PERF; break; + case MSR_PLATFORM_ENERGY_STATUS: + idx = IDX_PSYS_ENERGY; + break; default: idx = -1; } @@ -1777,6 +1806,8 @@ int idx_valid(int idx) return platform->rapl_msrs & RAPL_PKG_PERF_STATUS; case IDX_DRAM_PERF: return platform->rapl_msrs & RAPL_DRAM_PERF_STATUS; + case IDX_PSYS_ENERGY: + return platform->rapl_msrs & RAPL_PSYS; default: return 0; } @@ -1848,6 +1879,10 @@ struct system_summary { struct pkg_data packages; } average; +struct platform_counters { + struct rapl_counter energy_psys; /* MSR_PLATFORM_ENERGY_STATUS */ +} platform_counters_odd, platform_counters_even; + struct cpu_topology { int physical_package_id; int die_id; @@ -2512,6 +2547,11 @@ void print_header(char *delim) ppmt = ppmt->next; } + if (DO_BIC(BIC_SysWatt)) + outp += sprintf(outp, "%sSysWatt", (printed++ ? delim : "")); + if (DO_BIC(BIC_Sys_J)) + outp += sprintf(outp, "%sSys_J", (printed++ ? delim : "")); + outp += sprintf(outp, "\n"); } @@ -2519,6 +2559,7 @@ int dump_counters(struct thread_data *t, struct core_data *c, struct pkg_data *p { int i; struct msr_counter *mp; + struct platform_counters *pplat_cnt = p == package_odd ? 
&platform_counters_odd : &platform_counters_even; outp += sprintf(outp, "t %p, c %p, p %p\n", t, c, p); @@ -2590,6 +2631,7 @@ int dump_counters(struct thread_data *t, struct core_data *c, struct pkg_data *p outp += sprintf(outp, "Joules COR: %0llX\n", p->energy_cores.raw_value); outp += sprintf(outp, "Joules GFX: %0llX\n", p->energy_gfx.raw_value); outp += sprintf(outp, "Joules RAM: %0llX\n", p->energy_dram.raw_value); + outp += sprintf(outp, "Joules PSYS: %0llX\n", pplat_cnt->energy_psys.raw_value); outp += sprintf(outp, "Throttle PKG: %0llX\n", p->rapl_pkg_perf_status.raw_value); outp += sprintf(outp, "Throttle RAM: %0llX\n", p->rapl_dram_perf_status.raw_value); outp += sprintf(outp, "PTM: %dC\n", p->pkg_temp_c); @@ -2628,6 +2670,9 @@ double rapl_counter_get_value(const struct rapl_counter *c, enum rapl_unit desir */ int format_counters(struct thread_data *t, struct core_data *c, struct pkg_data *p) { + static int count; + + struct platform_counters *pplat_cnt = NULL; double interval_float, tsc; char *fmt8; int i; @@ -2637,6 +2682,11 @@ int format_counters(struct thread_data *t, struct core_data *c, struct pkg_data char *delim = "\t"; int printed = 0; + if (t == &average.threads) { + pplat_cnt = count & 1 ? &platform_counters_odd : &platform_counters_even; + ++count; + } + /* if showing only 1st thread in core and this isn't one, bail out */ if (show_core_only && !is_cpu_first_thread_in_core(t, c, p)) return 0; @@ -3093,6 +3143,13 @@ int format_counters(struct thread_data *t, struct core_data *c, struct pkg_data } } + if (DO_BIC(BIC_SysWatt) && (t == &average.threads)) + outp += sprintf(outp, fmt8, (printed++ ? delim : ""), + rapl_counter_get_value(&pplat_cnt->energy_psys, RAPL_UNIT_WATTS, interval_float)); + if (DO_BIC(BIC_Sys_J) && (t == &average.threads)) + outp += sprintf(outp, fmt8, (printed++ ? 
delim : ""), + rapl_counter_get_value(&pplat_cnt->energy_psys, RAPL_UNIT_JOULES, interval_float)); + done: if (*(outp - 1) != '\n') outp += sprintf(outp, "\n"); @@ -3400,6 +3457,11 @@ int delta_cpu(struct thread_data *t, struct core_data *c, return retval; } +void delta_platform(struct platform_counters *new, struct platform_counters *old) +{ + old->energy_psys.raw_value = new->energy_psys.raw_value - old->energy_psys.raw_value; +} + void rapl_counter_clear(struct rapl_counter *c) { c->raw_value = 0; @@ -4129,6 +4191,9 @@ static size_t cstate_counter_info_count_perf(const struct cstate_counter_info_t void write_rapl_counter(struct rapl_counter *rc, struct rapl_counter_info_t *rci, unsigned int idx) { + if (rci->source[idx] == COUNTER_SOURCE_NONE) + return; + rc->raw_value = rci->data[idx]; rc->unit = rci->unit[idx]; rc->scale = rci->scale[idx]; @@ -4136,6 +4201,7 @@ void write_rapl_counter(struct rapl_counter *rc, struct rapl_counter_info_t *rci int get_rapl_counters(int cpu, unsigned int domain, struct core_data *c, struct pkg_data *p) { + struct platform_counters *pplat_cnt = p == package_odd ? 
&platform_counters_odd : &platform_counters_even; unsigned long long perf_data[NUM_RAPL_COUNTERS + 1]; struct rapl_counter_info_t *rci; @@ -4163,6 +4229,7 @@ int get_rapl_counters(int cpu, unsigned int domain, struct core_data *c, struct for (unsigned int i = 0, pi = 1; i < NUM_RAPL_COUNTERS; ++i) { switch (rci->source[i]) { case COUNTER_SOURCE_NONE: + rci->data[i] = 0; break; case COUNTER_SOURCE_PERF: @@ -4201,7 +4268,7 @@ int get_rapl_counters(int cpu, unsigned int domain, struct core_data *c, struct } } - BUILD_BUG_ON(NUM_RAPL_COUNTERS != 7); + BUILD_BUG_ON(NUM_RAPL_COUNTERS != 8); write_rapl_counter(&p->energy_pkg, rci, RAPL_RCI_INDEX_ENERGY_PKG); write_rapl_counter(&p->energy_cores, rci, RAPL_RCI_INDEX_ENERGY_CORES); write_rapl_counter(&p->energy_dram, rci, RAPL_RCI_INDEX_DRAM); @@ -4209,6 +4276,7 @@ int get_rapl_counters(int cpu, unsigned int domain, struct core_data *c, struct write_rapl_counter(&p->rapl_pkg_perf_status, rci, RAPL_RCI_INDEX_PKG_PERF_STATUS); write_rapl_counter(&p->rapl_dram_perf_status, rci, RAPL_RCI_INDEX_DRAM_PERF_STATUS); write_rapl_counter(&c->core_energy, rci, RAPL_RCI_INDEX_CORE_ENERGY); + write_rapl_counter(&pplat_cnt->energy_psys, rci, RAPL_RCI_INDEX_ENERGY_PLATFORM); return 0; } @@ -6144,6 +6212,7 @@ restart: re_initialize(); goto restart; } + delta_platform(&platform_counters_odd, &platform_counters_even); compute_average(EVEN_COUNTERS); format_all_counters(EVEN_COUNTERS); flush_output_stdout(); @@ -6167,6 +6236,7 @@ restart: re_initialize(); goto restart; } + delta_platform(&platform_counters_even, &platform_counters_odd); compute_average(ODD_COUNTERS); format_all_counters(ODD_COUNTERS); flush_output_stdout(); @@ -6945,8 +7015,8 @@ void rapl_probe_intel(void) unsigned long long msr; unsigned int time_unit; double tdp; - const unsigned long long bic_watt_bits = BIC_PkgWatt | BIC_CorWatt | BIC_RAMWatt | BIC_GFXWatt; - const unsigned long long bic_joules_bits = BIC_Pkg_J | BIC_Cor_J | BIC_RAM_J | BIC_GFX_J; + const unsigned long long 
bic_watt_bits = BIC_SysWatt | BIC_PkgWatt | BIC_CorWatt | BIC_RAMWatt | BIC_GFXWatt; + const unsigned long long bic_joules_bits = BIC_Sys_J | BIC_Pkg_J | BIC_Cor_J | BIC_RAM_J | BIC_GFX_J; if (rapl_joules) bic_enabled &= ~bic_watt_bits; @@ -7606,6 +7676,9 @@ void rapl_perf_init(void) domain_visited[next_domain] = 1; + if ((cai->flags & RAPL_COUNTER_FLAG_PLATFORM_COUNTER) && (cpu != base_cpu)) + continue; + struct rapl_counter_info_t *rci = &rapl_counter_info_perdomain[next_domain]; /* Check if the counter is enabled and accessible */ -- 2.51.0 From 86d237734091201d2ab2c1d2e1063893621c770f Mon Sep 17 00:00:00 2001 From: Len Brown Date: Sat, 30 Nov 2024 16:22:00 -0500 Subject: [PATCH 07/16] tools/power turbostat: 2024.11.30 since 2024.07.26: assorted minor bug fixes assorted platform specific tweaks initial RAPL PSYS (SysWatt) support Signed-off-by: Len Brown --- tools/power/x86/turbostat/turbostat.8 | 2 +- tools/power/x86/turbostat/turbostat.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/power/x86/turbostat/turbostat.8 b/tools/power/x86/turbostat/turbostat.8 index 95eb02346d3a..a7f7ed01421c 100644 --- a/tools/power/x86/turbostat/turbostat.8 +++ b/tools/power/x86/turbostat/turbostat.8 @@ -190,7 +190,7 @@ The system configuration dump (if --quiet is not used) is followed by statistics .PP \fBRAMWatt\fP Watts consumed by the DRAM DIMMS -- available only on server processors. .PP -\fBSysWatt\fP Watts consumed by the whole platform (RAPL PSYS). Disabled by default. May require platform specific information to interpret the data, making it not suitable for general use. +\fBSysWatt\fP Watts consumed by the whole platform (RAPL PSYS). Disabled by default. Enable with --enable SysWatt. .PP \fBPKG_%\fP percent of the interval that RAPL throttling was active on the Package. Note that the system summary is the sum of the package throttling time, and thus may be higher than 100% on a multi-package system. 
Note that the meaning of this field is model specific. For example, some hardware increments this counter when RAPL responds to thermal limits, but does not increment this counter when RAPL responds to power limits. Comparing PkgWatt and PkgTmp to system limits is necessary. .PP diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index 88c7f896c5b2..58a487c225a7 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -9236,7 +9236,7 @@ int get_and_dump_counters(void) void print_version() { - fprintf(outf, "turbostat version 2024.07.26 - Len Brown \n"); + fprintf(outf, "turbostat version 2024.11.30 - Len Brown \n"); } #define COMMAND_LINE_SIZE 2048 -- 2.51.0 From f69e63756f7822fcdad8a34f9967e8b243e883ee Mon Sep 17 00:00:00 2001 From: "Dr. David Alan Gilbert" Date: Wed, 2 Oct 2024 18:31:47 +0100 Subject: [PATCH 08/16] printf: Remove unused 'bprintf' bprintf() is unused. Remove it. It was added in the commit 4370aa4aa753 ("vsprintf: add binary printf") but as far as I can see was never used, unlike the other two functions in that patch. Link: https://lore.kernel.org/20241002173147.210107-1-linux@treblig.org Reviewed-by: Andy Shevchenko Acked-by: Petr Mladek Signed-off-by: Dr. David Alan Gilbert Signed-off-by: Steven Rostedt (Google) --- include/linux/string.h | 1 - lib/vsprintf.c | 23 ----------------------- 2 files changed, 24 deletions(-) diff --git a/include/linux/string.h b/include/linux/string.h index 0dd27afcfaf7..493ac4862c77 100644 --- a/include/linux/string.h +++ b/include/linux/string.h @@ -335,7 +335,6 @@ int __sysfs_match_string(const char * const *array, size_t n, const char *s); #ifdef CONFIG_BINARY_PRINTF int vbin_printf(u32 *bin_buf, size_t size, const char *fmt, va_list args); int bstr_printf(char *buf, size_t size, const char *fmt, const u32 *bin_buf); -int bprintf(u32 *bin_buf, size_t size, const char *fmt, ...) 
__printf(3, 4); #endif extern ssize_t memory_read_from_buffer(void *to, size_t count, loff_t *ppos, diff --git a/lib/vsprintf.c b/lib/vsprintf.c index 6ac02bbb7df1..9d3dac38a3f4 100644 --- a/lib/vsprintf.c +++ b/lib/vsprintf.c @@ -3428,29 +3428,6 @@ out: } EXPORT_SYMBOL_GPL(bstr_printf); -/** - * bprintf - Parse a format string and place args' binary value in a buffer - * @bin_buf: The buffer to place args' binary value - * @size: The size of the buffer(by words(32bits), not characters) - * @fmt: The format string to use - * @...: Arguments for the format string - * - * The function returns the number of words(u32) written - * into @bin_buf. - */ -int bprintf(u32 *bin_buf, size_t size, const char *fmt, ...) -{ - va_list args; - int ret; - - va_start(args, fmt); - ret = vbin_printf(bin_buf, size, fmt, args); - va_end(args); - - return ret; -} -EXPORT_SYMBOL_GPL(bprintf); - #endif /* CONFIG_BINARY_PRINTF */ /** -- 2.51.0 From 9022ed0e7e65734d83a0648648589b9fbea8e8c9 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sun, 1 Dec 2024 09:23:33 -0800 Subject: [PATCH 09/16] strscpy: write destination buffer only once The point behind strscpy() was to once and for all avoid all the problems with 'strncpy()' and later broken "fixed" versions like strlcpy() that just made things worse. So strscpy not only guarantees NUL-termination (unlike strncpy), it also doesn't do unnecessary padding at the destination. But at the same time also avoids byte-at-a-time reads and writes by _allowing_ some extra NUL writes - within the size, of course - so that the whole copy can be done with word operations. It is also stable in the face of a mutable source string: it explicitly does not read the source buffer multiple times (so an implementation using "strnlen()+memcpy()" would be wrong), and does not read the source buffer past the size (like the mis-design that is strlcpy does). 
Finally, the return value is designed to be simple and unambiguous: if the string cannot be copied fully, it returns an actual negative error, making error handling clearer and simpler (and the caller already knows the size of the buffer). Otherwise it returns the string length of the result. However, there was one final stability issue that can be important to callers: the stability of the destination buffer. In particular, the same way we shouldn't read the source buffer more than once, we should avoid doing multiple writes to the destination buffer: first writing a potentially non-terminated string, and then terminating it with NUL at the end does not result in a stable result buffer. Yes, it gives the right result in the end, but if the rule for the destination buffer was that it is _always_ NUL-terminated even when accessed concurrently with updates, the final byte of the buffer needs to always _stay_ as a NUL byte. [ Note that "final byte is NUL" here is literally about the final byte in the destination array, not the terminating NUL at the end of the string itself. There is no attempt to try to make concurrent reads and writes give any kind of consistent string length or contents, but we do want to guarantee that there is always at least that final terminating NUL character at the end of the destination array if it existed before ] This is relevant in the kernel for the tsk->comm[] array, for example. Even without locking (for either readers or writers), we want to know that while the buffer contents may be garbled, it is always a valid C string and always has a NUL character at 'comm[TASK_COMM_LEN-1]' (and never has any "out of thin air" data). So avoid any "copy possibly non-terminated string, and terminate later" behavior, and write the destination buffer only once. 
Signed-off-by: Linus Torvalds --- lib/string.c | 23 +++++++++++++++++------ 1 file changed, 17 insertions(+), 6 deletions(-) diff --git a/lib/string.c b/lib/string.c index 76327b51e36f..eb4486ed40d2 100644 --- a/lib/string.c +++ b/lib/string.c @@ -104,6 +104,12 @@ char *strncpy(char *dest, const char *src, size_t count) EXPORT_SYMBOL(strncpy); #endif +#ifdef __BIG_ENDIAN +# define ALLBUTLAST_BYTE_MASK (~255ul) +#else +# define ALLBUTLAST_BYTE_MASK (~0ul >> 8) +#endif + ssize_t sized_strscpy(char *dest, const char *src, size_t count) { const struct word_at_a_time constants = WORD_AT_A_TIME_CONSTANTS; @@ -147,13 +153,18 @@ ssize_t sized_strscpy(char *dest, const char *src, size_t count) *(unsigned long *)(dest+res) = c & zero_bytemask(data); return res + find_zero(data); } + count -= sizeof(unsigned long); + if (unlikely(!count)) { + c &= ALLBUTLAST_BYTE_MASK; + *(unsigned long *)(dest+res) = c; + return -E2BIG; + } *(unsigned long *)(dest+res) = c; res += sizeof(unsigned long); - count -= sizeof(unsigned long); max -= sizeof(unsigned long); } - while (count) { + while (count > 1) { char c; c = src[res]; @@ -164,11 +175,11 @@ ssize_t sized_strscpy(char *dest, const char *src, size_t count) count--; } - /* Hit buffer length without finding a NUL; force NUL-termination. */ - if (res) - dest[res-1] = '\0'; + /* Force NUL-termination. */ + dest[res] = '\0'; - return -E2BIG; + /* Return E2BIG if the source didn't stop */ + return src[res] ? 
-E2BIG : res; } EXPORT_SYMBOL(sized_strscpy); -- 2.51.0 From 40384c840ea1944d7c5a392e8975ed088ecf0b37 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sun, 1 Dec 2024 14:28:56 -0800 Subject: [PATCH 10/16] Linux 6.13-rc1 --- Makefile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Makefile b/Makefile index e34a97473fb6..93ab62cef244 100644 --- a/Makefile +++ b/Makefile @@ -1,8 +1,8 @@ # SPDX-License-Identifier: GPL-2.0 VERSION = 6 -PATCHLEVEL = 12 +PATCHLEVEL = 13 SUBLEVEL = 0 -EXTRAVERSION = +EXTRAVERSION = -rc1 NAME = Baby Opossum Posse # *DOCUMENTATION* -- 2.51.0 From c43ec96e8d34399bd9dab2f2dc316b904892133f Mon Sep 17 00:00:00 2001 From: Chen Ridong Date: Tue, 29 Oct 2024 08:28:45 +0000 Subject: [PATCH 11/16] dmaengine: at_xdmac: avoid null_ptr_deref in at_xdmac_prep_dma_memset The at_xdmac_memset_create_desc may return NULL, which will lead to a null pointer dereference. For example, the len input may be invalid, or the atchan->free_descs_list is empty and memory is exhausted. Therefore, add a check to avoid this.
Fixes: b206d9a23ac7 ("dmaengine: xdmac: Add memset support") Signed-off-by: Chen Ridong Link: https://lore.kernel.org/r/20241029082845.1185380-1-chenridong@huaweicloud.com Signed-off-by: Vinod Koul --- drivers/dma/at_xdmac.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/dma/at_xdmac.c b/drivers/dma/at_xdmac.c index 9c7b40220004..ba25c23164e7 100644 --- a/drivers/dma/at_xdmac.c +++ b/drivers/dma/at_xdmac.c @@ -1363,6 +1363,8 @@ at_xdmac_prep_dma_memset(struct dma_chan *chan, dma_addr_t dest, int value, return NULL; desc = at_xdmac_memset_create_desc(chan, atchan, dest, len, value); + if (!desc) + return NULL; list_add_tail(&desc->desc_node, &desc->descs_list); desc->tx_dma_desc.cookie = -EBUSY; -- 2.51.0 From f0e870a0e9c5521f2952ea9f3ea9d3d122631a89 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Mon, 4 Nov 2024 11:50:50 +0200 Subject: [PATCH 12/16] dmaengine: dw: Select only supported masters for ACPI devices The recently submitted fix-commit revealed a problem in the iDMA 32-bit platform code. Even though the controller supported only a single master the dw_dma_acpi_filter() method hard-coded two master interfaces with IDs 0 and 1. As a result the sanity check implemented in the commit b336268dde75 ("dmaengine: dw: Add peripheral bus width verification") got incorrect interface data width and thus prevented the client drivers from configuring the DMA-channel with the EINVAL error returned. E.g., the next error was printed for the PXA2xx SPI controller driver trying to configure the requested channels: > [ 164.525604] pxa2xx_spi_pci 0000:00:07.1: DMA slave config failed > [ 164.536105] pxa2xx_spi_pci 0000:00:07.1: failed to get DMA TX descriptor > [ 164.543213] spidev spi-SPT0001:00: SPI transfer failed: -16 The problem would have been spotted much earlier if the iDMA 32-bit controller supported more than one master interfaces. 
But since it supports just a single master and the iDMA 32-bit specific code just ignores the master IDs in the CTLLO preparation method, the issue has gone unnoticed so far. Fix the problem by specifying the default master ID for both memory and peripheral devices in the driver data. Thus the issue noticed for the iDMA 32-bit controllers will be eliminated and the ACPI-probed DW DMA controllers will be configured with the correct master ID by default. Cc: stable@vger.kernel.org Fixes: b336268dde75 ("dmaengine: dw: Add peripheral bus width verification") Fixes: 199244d69458 ("dmaengine: dw: add support of iDMA 32-bit hardware") Reported-by: Ferry Toth Closes: https://lore.kernel.org/dmaengine/ZuXbCKUs1iOqFu51@black.fi.intel.com/ Reported-by: Andy Shevchenko Closes: https://lore.kernel.org/dmaengine/ZuXgI-VcHpMgbZ91@black.fi.intel.com/ Tested-by: Ferry Toth Signed-off-by: Andy Shevchenko Link: https://lore.kernel.org/r/20241104095142.157925-1-andriy.shevchenko@linux.intel.com Signed-off-by: Vinod Koul --- drivers/dma/dw/acpi.c | 6 ++++-- drivers/dma/dw/internal.h | 8 ++++++++ drivers/dma/dw/pci.c | 4 ++-- 3 files changed, 14 insertions(+), 4 deletions(-) diff --git a/drivers/dma/dw/acpi.c b/drivers/dma/dw/acpi.c index c510c109d2c3..b6452fffa657 100644 --- a/drivers/dma/dw/acpi.c +++ b/drivers/dma/dw/acpi.c @@ -8,13 +8,15 @@ static bool dw_dma_acpi_filter(struct dma_chan *chan, void *param) { + struct dw_dma *dw = to_dw_dma(chan->device); + struct dw_dma_chip_pdata *data = dev_get_drvdata(dw->dma.dev); struct acpi_dma_spec *dma_spec = param; struct dw_dma_slave slave = { .dma_dev = dma_spec->dev, .src_id = dma_spec->slave_id, .dst_id = dma_spec->slave_id, - .m_master = 0, - .p_master = 1, + .m_master = data->m_master, + .p_master = data->p_master, }; return dw_dma_filter(chan, &slave); diff --git a/drivers/dma/dw/internal.h b/drivers/dma/dw/internal.h index 563ce73488db..f1bd06a20cd6 100644 --- a/drivers/dma/dw/internal.h +++ b/drivers/dma/dw/internal.h @@
-51,11 +51,15 @@ struct dw_dma_chip_pdata { int (*probe)(struct dw_dma_chip *chip); int (*remove)(struct dw_dma_chip *chip); struct dw_dma_chip *chip; + u8 m_master; + u8 p_master; }; static __maybe_unused const struct dw_dma_chip_pdata dw_dma_chip_pdata = { .probe = dw_dma_probe, .remove = dw_dma_remove, + .m_master = 0, + .p_master = 1, }; static const struct dw_dma_platform_data idma32_pdata = { @@ -72,6 +76,8 @@ static __maybe_unused const struct dw_dma_chip_pdata idma32_chip_pdata = { .pdata = &idma32_pdata, .probe = idma32_dma_probe, .remove = idma32_dma_remove, + .m_master = 0, + .p_master = 0, }; static const struct dw_dma_platform_data xbar_pdata = { @@ -88,6 +94,8 @@ static __maybe_unused const struct dw_dma_chip_pdata xbar_chip_pdata = { .pdata = &xbar_pdata, .probe = idma32_dma_probe, .remove = idma32_dma_remove, + .m_master = 0, + .p_master = 0, }; #endif /* _DMA_DW_INTERNAL_H */ diff --git a/drivers/dma/dw/pci.c b/drivers/dma/dw/pci.c index ad2d4d012cf7..e8a0eb81726a 100644 --- a/drivers/dma/dw/pci.c +++ b/drivers/dma/dw/pci.c @@ -56,10 +56,10 @@ static int dw_pci_probe(struct pci_dev *pdev, const struct pci_device_id *pid) if (ret) return ret; - dw_dma_acpi_controller_register(chip->dw); - pci_set_drvdata(pdev, data); + dw_dma_acpi_controller_register(chip->dw); + return 0; } -- 2.51.0 From 4b65d5322e1d8994acfdb9b867aa00bdb30d177b Mon Sep 17 00:00:00 2001 From: Binbin Zhou Date: Mon, 28 Oct 2024 17:34:13 +0800 Subject: [PATCH 13/16] dmaengine: loongson2-apb: Change GENMASK to GENMASK_ULL Fix the following smatch static checker warning: drivers/dma/loongson2-apb-dma.c:189 ls2x_dma_write_cmd() warn: was expecting a 64 bit value instead of '~(((0)) + (((~((0))) - (((1)) << (0)) + 1) & (~((0)) >> ((8 * 4) - 1 - (4)))))' The GENMASK macro used "unsigned long", which caused build issues when using a 32-bit toolchain because it would try to access bits > 31. This patch switches GENMASK to GENMASK_ULL, which uses "unsigned long long". 
Fixes: 71e7d3cb6e55 ("dmaengine: ls2x-apb: New driver for the Loongson LS2X APB DMA controller") Reported-by: Dan Carpenter Closes: https://lore.kernel.org/all/87cdc025-7246-4548-85ca-3d36fdc2be2d@stanley.mountain/ Signed-off-by: Binbin Zhou Link: https://lore.kernel.org/r/20241028093413.1145820-1-zhoubinbin@loongson.cn Signed-off-by: Vinod Koul --- drivers/dma/loongson2-apb-dma.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/dma/loongson2-apb-dma.c b/drivers/dma/loongson2-apb-dma.c index 367ed34ce4da..c528f02b9f84 100644 --- a/drivers/dma/loongson2-apb-dma.c +++ b/drivers/dma/loongson2-apb-dma.c @@ -31,7 +31,7 @@ #define LDMA_ASK_VALID BIT(2) #define LDMA_START BIT(3) /* DMA start operation */ #define LDMA_STOP BIT(4) /* DMA stop operation */ -#define LDMA_CONFIG_MASK GENMASK(4, 0) /* DMA controller config bits mask */ +#define LDMA_CONFIG_MASK GENMASK_ULL(4, 0) /* DMA controller config bits mask */ /* Bitfields in ndesc_addr field of HW descriptor */ #define LDMA_DESC_EN BIT(0) /*1: The next descriptor is valid */ -- 2.51.0 From 790fb9956eead785b720ccc0851f09a5ca3a093e Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Mon, 2 Dec 2024 09:20:04 -0800 Subject: [PATCH 14/16] linux/dmaengine.h: fix a few kernel-doc warnings The comment block for "Interleaved Transfer Request" should not begin with "/**" since it is not in kernel-doc format. Fix doc name for enum sum_check_flags. Fix all (4) missing struct member warnings. Use "Warning:" for one "Note:" in enum dma_desc_metadata_mode since scripts/kernel-doc does not allow more than one Note: per function or identifier description. This leaves around 49 kernel-doc warnings like: include/linux/dmaengine.h:43: warning: Enum value 'DMA_OUT_OF_ORDER' not described in enum 'dma_status' and another scripts/kernel-doc problem with it not being able to parse some typedefs. 
Fixes: b14dab792dee ("DMAEngine: Define interleaved transfer request api") Fixes: ad283ea4a3ce ("async_tx: add sum check flags") Fixes: 272420214d26 ("dmaengine: Add DMA_CTRL_REUSE") Fixes: f067025bc676 ("dmaengine: add support to provide error result from a DMA transation") Fixes: d38a8c622a1b ("dmaengine: prepare for generic 'unmap' data") Fixes: 5878853fc938 ("dmaengine: Add API function dmaengine_prep_peripheral_dma_vec()") Signed-off-by: Randy Dunlap Cc: Dan Williams Cc: Dave Jiang Cc: Paul Cercueil Cc: Nuno Sa Cc: Vinod Koul Cc: dmaengine@vger.kernel.org Link: https://lore.kernel.org/r/20241202172004.76020-1-rdunlap@infradead.org Signed-off-by: Vinod Koul --- include/linux/dmaengine.h | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/include/linux/dmaengine.h b/include/linux/dmaengine.h index b137fdb56093..346251bf1026 100644 --- a/include/linux/dmaengine.h +++ b/include/linux/dmaengine.h @@ -84,7 +84,7 @@ enum dma_transfer_direction { DMA_TRANS_NONE, }; -/** +/* * Interleaved Transfer Request * ---------------------------- * A chunk is collection of contiguous bytes to be transferred. @@ -223,7 +223,7 @@ enum sum_check_bits { }; /** - * enum pq_check_flags - result of async_{xor,pq}_zero_sum operations + * enum sum_check_flags - result of async_{xor,pq}_zero_sum operations * @SUM_CHECK_P_RESULT - 1 if xor zero sum error, 0 otherwise * @SUM_CHECK_Q_RESULT - 1 if reed-solomon zero sum error, 0 otherwise */ @@ -286,7 +286,7 @@ typedef struct { DECLARE_BITMAP(bits, DMA_TX_TYPE_END); } dma_cap_mask_t; * pointer to the engine's metadata area * 4. Read out the metadata from the pointer * - * Note: the two mode is not compatible and clients must use one mode for a + * Warning: the two modes are not compatible and clients must use one mode for a * descriptor. 
*/ enum dma_desc_metadata_mode { @@ -594,9 +594,13 @@ struct dma_descriptor_metadata_ops { * @phys: physical address of the descriptor * @chan: target channel for this operation * @tx_submit: accept the descriptor, assign ordered cookie and mark the + * @desc_free: driver's callback function to free a resusable descriptor + * after completion * descriptor pending. To be pushed on .issue_pending() call * @callback: routine to call after this operation is complete + * @callback_result: error result from a DMA transaction * @callback_param: general parameter to pass to the callback routine + * @unmap: hook for generic DMA unmap data * @desc_metadata_mode: core managed metadata mode to protect mixed use of * DESC_METADATA_CLIENT or DESC_METADATA_ENGINE. Otherwise * DESC_METADATA_NONE @@ -827,6 +831,9 @@ struct dma_filter { * @device_prep_dma_memset: prepares a memset operation * @device_prep_dma_memset_sg: prepares a memset operation over a scatter list * @device_prep_dma_interrupt: prepares an end of chain interrupt operation + * @device_prep_peripheral_dma_vec: prepares a scatter-gather DMA transfer, + * where the address and size of each segment is located in one entry of + * the dma_vec array. * @device_prep_slave_sg: prepares a slave dma operation * @device_prep_dma_cyclic: prepare a cyclic dma operation suitable for audio. * The function takes a buffer of size buf_len. The callback function will -- 2.51.0 From 8d55e8a16f019211163f1180fd9f9fbe05901900 Mon Sep 17 00:00:00 2001 From: Sasha Finkelstein Date: Sun, 24 Nov 2024 16:48:28 +0100 Subject: [PATCH 15/16] dmaengine: apple-admac: Avoid accessing registers in probe The ADMAC attached to the AOP has complex power sequencing, and is power gated when the probe callback runs. Move the register reads to other functions, where we can guarantee that the hardware is switched on. 
Fixes: 568aa6dd641f ("dmaengine: apple-admac: Allocate cache SRAM to channels") Signed-off-by: Sasha Finkelstein Link: https://lore.kernel.org/r/20241124-admac-power-v1-1-58f2165a4d55@gmail.com Signed-off-by: Vinod Koul --- drivers/dma/apple-admac.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/drivers/dma/apple-admac.c b/drivers/dma/apple-admac.c index c499173d80b2..bd49f0374291 100644 --- a/drivers/dma/apple-admac.c +++ b/drivers/dma/apple-admac.c @@ -153,6 +153,8 @@ static int admac_alloc_sram_carveout(struct admac_data *ad, { struct admac_sram *sram; int i, ret = 0, nblocks; + ad->txcache.size = readl_relaxed(ad->base + REG_TX_SRAM_SIZE); + ad->rxcache.size = readl_relaxed(ad->base + REG_RX_SRAM_SIZE); if (dir == DMA_MEM_TO_DEV) sram = &ad->txcache; @@ -912,12 +914,7 @@ static int admac_probe(struct platform_device *pdev) goto free_irq; } - ad->txcache.size = readl_relaxed(ad->base + REG_TX_SRAM_SIZE); - ad->rxcache.size = readl_relaxed(ad->base + REG_RX_SRAM_SIZE); - dev_info(&pdev->dev, "Audio DMA Controller\n"); - dev_info(&pdev->dev, "imprint %x TX cache %u RX cache %u\n", - readl_relaxed(ad->base + REG_IMPRINT), ad->txcache.size, ad->rxcache.size); return 0; -- 2.51.0 From dcbef0798eb825cd584f7a93f62bed63f7fbbfc9 Mon Sep 17 00:00:00 2001 From: Lizhi Hou Date: Wed, 18 Sep 2024 11:10:22 -0700 Subject: [PATCH 16/16] dmaengine: amd: qdma: Remove using the private get and set dma_ops APIs The get_dma_ops and set_dma_ops APIs were never for driver to use. Remove these calls from QDMA driver. Instead, pass the DMA device pointer from the qdma_platdata structure. 
Fixes: 73d5fc92a11c ("dmaengine: amd: qdma: Add AMD QDMA driver") Signed-off-by: Lizhi Hou Reviewed-by: Christoph Hellwig Link: https://lore.kernel.org/r/20240918181022.2155715-1-lizhi.hou@amd.com Signed-off-by: Vinod Koul --- drivers/dma/amd/qdma/qdma.c | 28 +++++++++++--------------- include/linux/platform_data/amd_qdma.h | 2 ++ 2 files changed, 14 insertions(+), 16 deletions(-) diff --git a/drivers/dma/amd/qdma/qdma.c b/drivers/dma/amd/qdma/qdma.c index 6d9079458fe9..66f00ad67351 100644 --- a/drivers/dma/amd/qdma/qdma.c +++ b/drivers/dma/amd/qdma/qdma.c @@ -7,9 +7,9 @@ #include #include #include +#include #include #include -#include #include #include #include @@ -492,18 +492,9 @@ static int qdma_device_verify(struct qdma_device *qdev) static int qdma_device_setup(struct qdma_device *qdev) { - struct device *dev = &qdev->pdev->dev; u32 ring_sz = QDMA_DEFAULT_RING_SIZE; int ret = 0; - while (dev && get_dma_ops(dev)) - dev = dev->parent; - if (!dev) { - qdma_err(qdev, "dma device not found"); - return -EINVAL; - } - set_dma_ops(&qdev->pdev->dev, get_dma_ops(dev)); - ret = qdma_setup_fmap_context(qdev); if (ret) { qdma_err(qdev, "Failed setup fmap context"); @@ -548,11 +539,12 @@ static void qdma_free_queue_resources(struct dma_chan *chan) { struct qdma_queue *queue = to_qdma_queue(chan); struct qdma_device *qdev = queue->qdev; - struct device *dev = qdev->dma_dev.dev; + struct qdma_platdata *pdata; qdma_clear_queue_context(queue); vchan_free_chan_resources(&queue->vchan); - dma_free_coherent(dev, queue->ring_size * QDMA_MM_DESC_SIZE, + pdata = dev_get_platdata(&qdev->pdev->dev); + dma_free_coherent(pdata->dma_dev, queue->ring_size * QDMA_MM_DESC_SIZE, queue->desc_base, queue->dma_desc_base); } @@ -565,6 +557,7 @@ static int qdma_alloc_queue_resources(struct dma_chan *chan) struct qdma_queue *queue = to_qdma_queue(chan); struct qdma_device *qdev = queue->qdev; struct qdma_ctxt_sw_desc desc; + struct qdma_platdata *pdata; size_t size; int ret; @@ -572,8 +565,9 @@ 
static int qdma_alloc_queue_resources(struct dma_chan *chan) if (ret) return ret; + pdata = dev_get_platdata(&qdev->pdev->dev); size = queue->ring_size * QDMA_MM_DESC_SIZE; - queue->desc_base = dma_alloc_coherent(qdev->dma_dev.dev, size, + queue->desc_base = dma_alloc_coherent(pdata->dma_dev, size, &queue->dma_desc_base, GFP_KERNEL); if (!queue->desc_base) { @@ -588,7 +582,7 @@ static int qdma_alloc_queue_resources(struct dma_chan *chan) if (ret) { qdma_err(qdev, "Failed to setup SW desc ctxt for %s", chan->name); - dma_free_coherent(qdev->dma_dev.dev, size, queue->desc_base, + dma_free_coherent(pdata->dma_dev, size, queue->desc_base, queue->dma_desc_base); return ret; } @@ -948,8 +942,9 @@ static int qdma_init_error_irq(struct qdma_device *qdev) static int qdmam_alloc_qintr_rings(struct qdma_device *qdev) { - u32 ctxt[QDMA_CTXT_REGMAP_LEN]; + struct qdma_platdata *pdata = dev_get_platdata(&qdev->pdev->dev); struct device *dev = &qdev->pdev->dev; + u32 ctxt[QDMA_CTXT_REGMAP_LEN]; struct qdma_intr_ring *ring; struct qdma_ctxt_intr intr_ctxt; u32 vector; @@ -969,7 +964,8 @@ static int qdmam_alloc_qintr_rings(struct qdma_device *qdev) ring->msix_id = qdev->err_irq_idx + i + 1; ring->ridx = i; ring->color = 1; - ring->base = dmam_alloc_coherent(dev, QDMA_INTR_RING_SIZE, + ring->base = dmam_alloc_coherent(pdata->dma_dev, + QDMA_INTR_RING_SIZE, &ring->dev_base, GFP_KERNEL); if (!ring->base) { qdma_err(qdev, "Failed to alloc intr ring %d", i); diff --git a/include/linux/platform_data/amd_qdma.h b/include/linux/platform_data/amd_qdma.h index 576d952f97ed..967a6ef31cf9 100644 --- a/include/linux/platform_data/amd_qdma.h +++ b/include/linux/platform_data/amd_qdma.h @@ -26,11 +26,13 @@ struct dma_slave_map; * @max_mm_channels: Maximum number of MM DMA channels in each direction * @device_map: DMA slave map * @irq_index: The index of first IRQ + * @dma_dev: The device pointer for dma operations */ struct qdma_platdata { u32 max_mm_channels; u32 irq_index; struct dma_slave_map 
*device_map; + struct device *dma_dev; }; #endif /* _PLATDATA_AMD_QDMA_H */ -- 2.51.0