www.infradead.org Git - linux.git/commitdiff
perf mem/c2c amd: Add ldlat support
author Ravi Bangoria <ravi.bangoria@amd.com>
Tue, 29 Apr 2025 03:59:37 +0000 (03:59 +0000)
committer Arnaldo Carvalho de Melo <acme@redhat.com>
Wed, 30 Apr 2025 01:30:46 +0000 (22:30 -0300)
'perf mem/c2c' uses IBS Op PMU on AMD platforms.

IBS Op PMU on Zen5 uarch has added support for Load Latency filtering.

Implement 'perf mem/c2c' --ldlat using IBS Op Load Latency filtering
capability.

Some subtle differences between AMD and other arch:

o --ldlat is disabled by default on AMD

o Supported values are 128 to 2048.

Signed-off-by: Ravi Bangoria <ravi.bangoria@amd.com>
Cc: Ananth Narayan <ananth.narayan@amd.com>
Cc: Ian Rogers <irogers@google.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Joe Mario <jmario@redhat.com>
Cc: Kan Liang <kan.liang@linux.intel.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Sandipan Das <sandipan.das@amd.com>
Cc: Santosh Shukla <santosh.shukla@amd.com>
Cc: Stephane Eranian <eranian@google.com>
Link: https://lore.kernel.org/r/20250429035938.1301-4-ravi.bangoria@amd.com
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
tools/perf/Documentation/perf-c2c.txt
tools/perf/Documentation/perf-mem.txt
tools/perf/arch/x86/util/mem-events.c
tools/perf/arch/x86/util/mem-events.h
tools/perf/arch/x86/util/pmu.c
tools/perf/tests/shell/test_data_symbol.sh
tools/perf/util/pmu.c
tools/perf/util/pmu.h

index 856f0dfb8e5a30f228460911bd0f297aaa291ef6..f4af2dd6ab3185250056614d5bb6837f24a46dac 100644 (file)
@@ -54,8 +54,15 @@ RECORD OPTIONS
 
 -l::
 --ldlat::
-       Configure mem-loads latency. Supported on Intel and Arm64 processors
-       only. Ignored on other archs.
+       Configure mem-loads latency. Supported on Intel, Arm64 and some AMD
+       processors. Ignored on other archs.
+
+       On supported AMD processors:
+       - /sys/bus/event_source/devices/ibs_op/caps/ldlat file contains '1'.
+       - Supported latency values are 128 to 2048 (both inclusive).
+       - Latency value which is a multiple of 128 incurs a little less profiling
+         overhead compared to other values.
+       - Load latency filtering is disabled by default.
 
 -k::
 --all-kernel::
index 8a1bd9ff0f86dfb895a95c2b916c9b4f676f9f90..a9e3c71a220557d492432b224b336c150b52f85e 100644 (file)
@@ -28,6 +28,8 @@ and kernel support is required. See linkperf:perf-arm-spe[1] for a setup guide.
 Due to the statistical nature of SPE sampling, not every memory operation will
 be sampled.
 
+On AMD this uses IBS Op PMU to sample load-store operations.
+
 COMMON OPTIONS
 --------------
 -f::
@@ -67,8 +69,15 @@ RECORD OPTIONS
        Configure all used events to run in user space.
 
 --ldlat <n>::
-       Specify desired latency for loads event. Supported on Intel and Arm64
-       processors only. Ignored on other archs.
+       Specify desired latency for loads event. Supported on Intel, Arm64 and
+       some AMD processors. Ignored on other archs.
+
+       On supported AMD processors:
+       - /sys/bus/event_source/devices/ibs_op/caps/ldlat file contains '1'.
+       - Supported latency values are 128 to 2048 (both inclusive).
+       - Latency value which is a multiple of 128 incurs a little less profiling
+         overhead compared to other values.
+       - Load latency filtering is disabled by default.
 
 REPORT OPTIONS
 --------------
index 62df03e91c7e1ce535089138476442b53f04ddbc..b38f519020ff8c6f23b7250752d7e27f0714d86d 100644 (file)
@@ -26,3 +26,9 @@ struct perf_mem_event perf_mem_events_amd[PERF_MEM_EVENTS__MAX] = {
        E(NULL,         NULL,           NULL,   false,  0),
        E("mem-ldst",   "%s//",         NULL,   false,  0),
 };
+
+struct perf_mem_event perf_mem_events_amd_ldlat[PERF_MEM_EVENTS__MAX] = {
+       E(NULL,         NULL,           NULL,   false,  0),
+       E(NULL,         NULL,           NULL,   false,  0),
+       E("mem-ldst",   "%s/ldlat=%u/", NULL,   true,   0),
+};
index f55c8d3b7d5982ee7acff77e029eb344b877a130..11e09a256f5bb084b8c272f8830f512037616bc4 100644 (file)
@@ -6,5 +6,6 @@ extern struct perf_mem_event perf_mem_events_intel[PERF_MEM_EVENTS__MAX];
 extern struct perf_mem_event perf_mem_events_intel_aux[PERF_MEM_EVENTS__MAX];
 
 extern struct perf_mem_event perf_mem_events_amd[PERF_MEM_EVENTS__MAX];
+extern struct perf_mem_event perf_mem_events_amd_ldlat[PERF_MEM_EVENTS__MAX];
 
 #endif /* _X86_MEM_EVENTS_H */
index e0060dac2a9f9242ae80c0513a74483828da7cd4..8712cbbbc712239559a6e42a0249d3876fa9c68b 100644 (file)
 #include "mem-events.h"
 #include "util/env.h"
 
-void perf_pmu__arch_init(struct perf_pmu *pmu __maybe_unused)
+void perf_pmu__arch_init(struct perf_pmu *pmu)
 {
+       struct perf_pmu_caps *ldlat_cap;
+
 #ifdef HAVE_AUXTRACE_SUPPORT
        if (!strcmp(pmu->name, INTEL_PT_PMU_NAME)) {
                pmu->auxtrace = true;
@@ -33,8 +35,20 @@ void perf_pmu__arch_init(struct perf_pmu *pmu __maybe_unused)
 #endif
 
        if (x86__is_amd_cpu()) {
-               if (!strcmp(pmu->name, "ibs_op"))
-                       pmu->mem_events = perf_mem_events_amd;
+               if (strcmp(pmu->name, "ibs_op"))
+                       return;
+
+               pmu->mem_events = perf_mem_events_amd;
+
+               if (!perf_pmu__caps_parse(pmu))
+                       return;
+
+               ldlat_cap = perf_pmu__get_cap(pmu, "ldlat");
+               if (!ldlat_cap || strcmp(ldlat_cap->value, "1"))
+                       return;
+
+               perf_mem_events__loads_ldlat = 0;
+               pmu->mem_events = perf_mem_events_amd_ldlat;
        } else if (pmu->is_core) {
                if (perf_pmu__have_event(pmu, "mem-loads-aux"))
                        pmu->mem_events = perf_mem_events_intel_aux;
index bbe8277496aee5bb8e98e8bfc183341c1ddb8c5c..d61b5659a46d9a778f621ce71cb46b20ea1a8cde 100755 (executable)
@@ -54,11 +54,34 @@ trap cleanup_files exit term int
 
 echo "Recording workload..."
 
-# perf mem/c2c internally uses IBS PMU on AMD CPU which doesn't support
-# user/kernel filtering and per-process monitoring, spin program on
-# specific CPU and test in per-CPU mode.
 is_amd=$(grep -E -c 'vendor_id.*AuthenticAMD' /proc/cpuinfo)
 if (($is_amd >= 1)); then
+       mem_events="$(perf mem record -v -e list 2>&1)"
+       if ! [[ "$mem_events" =~ ^mem\-ldst.*ibs_op/(.*)/.*available ]]; then
+               echo "ERROR: mem-ldst event is not matching"
+               exit 1
+       fi
+
+       # --ldlat on AMD:
+       # o Zen4 and earlier uarch does not support ldlat
+       # o Even on supported platforms, it's disabled (--ldlat=0) by default.
+       ldlat=${BASH_REMATCH[1]}
+       if [[ -n $ldlat ]]; then
+               if ! [[ "$ldlat" =~ ldlat=0 ]]; then
+                       echo "ERROR: ldlat not initialized to 0?"
+                       exit 1
+               fi
+
+               mem_events="$(perf mem record -v --ldlat=150 -e list 2>&1)"
+               if ! [[ "$mem_events" =~ ^mem-ldst.*ibs_op/ldlat=150/.*available ]]; then
+                       echo "ERROR: --ldlat not honored?"
+                       exit 1
+               fi
+       fi
+
+       # perf mem/c2c internally uses IBS PMU on AMD CPU which doesn't
+       # support user/kernel filtering and per-process monitoring on older
+       # kernels, spin program on specific CPU and test in per-CPU mode.
        perf mem record -vvv -o ${PERF_DATA} -C 0 -- taskset -c 0 $TEST_PROGRAM 2>"${ERR_FILE}"
 else
        perf mem record -vvv --all-user -o ${PERF_DATA} -- $TEST_PROGRAM 2>"${ERR_FILE}"
index bbb906bb2159f2e37a71270d79aa05deb0d0e794..d08972aa461c6a6a01f202ef072e55ebe45f417a 100644 (file)
@@ -2259,6 +2259,17 @@ static void perf_pmu__del_caps(struct perf_pmu *pmu)
        }
 }
 
+struct perf_pmu_caps *perf_pmu__get_cap(struct perf_pmu *pmu, const char *name)
+{
+       struct perf_pmu_caps *caps;
+
+       list_for_each_entry(caps, &pmu->caps, list) {
+               if (!strcmp(caps->name, name))
+                       return caps;
+       }
+       return NULL;
+}
+
 /*
  * Reading/parsing the given pmu capabilities, which should be located at:
  * /sys/bus/event_source/devices/<dev>/caps as sysfs group attributes.
index 13dd3511f5042b543937ccd30a0b1f8101d1dd8d..a1fdd6d50c53ffbe63684c0560e2d1569422208f 100644 (file)
@@ -277,6 +277,8 @@ bool pmu_uncore_identifier_match(const char *compat, const char *id);
 
 int perf_pmu__convert_scale(const char *scale, char **end, double *sval);
 
+struct perf_pmu_caps *perf_pmu__get_cap(struct perf_pmu *pmu, const char *name);
+
 int perf_pmu__caps_parse(struct perf_pmu *pmu);
 
 void perf_pmu__warn_invalid_config(struct perf_pmu *pmu, __u64 config,