From 859fe0f4f2be73c1727fffaa4eed1a4e0d16cf5e Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Wed, 27 Jul 2022 15:08:13 -0700 Subject: [PATCH] perf vendor events: Update Intel haswellx Update to v25, the metrics are based on TMA 4.4 full. Use script at: https://github.com/intel/event-converter-for-linux-perf/blob/master/download_and_gen.py to download and generate the latest events and metrics. Manually copy the haswellx files into perf and update mapfile.csv. Tested with 'perf test': 10: PMU events : 10.1: PMU event table sanity : Ok 10.2: PMU event map aliases : Ok 10.3: Parsing of PMU event table metrics : Ok 10.4: Parsing of PMU event table metrics with fake PMUs : Ok 90: perf all metricgroups test : Ok 91: perf all metrics test : Failed 93: perf all PMU test : Ok The test 91 failure is a pre-existing failure on the test system with the metric Load_Miss_Real_Latency which is fixed by prefixing it with --metric-no-group. Signed-off-by: Ian Rogers Cc: Alexander Shishkin Cc: Alexandre Torgue Cc: Andi Kleen Cc: Caleb Biggers Cc: James Clark Cc: Jiri Olsa Cc: John Garry Cc: Kan Liang Cc: Kshipra Bopardikar Cc: Mark Rutland Cc: Maxime Coquelin Cc: Namhyung Kim Cc: Perry Taylor Cc: Peter Zijlstra Cc: Sedat Dilek Cc: Stephane Eranian Cc: Xing Zhengjun Link: http://lore.kernel.org/lkml/20220727220832.2865794-12-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- .../pmu-events/arch/x86/haswellx/cache.json | 9 +- .../arch/x86/haswellx/hsx-metrics.json | 85 +++++++++++++------ .../arch/x86/haswellx/pipeline.json | 5 -- .../arch/x86/haswellx/uncore-cache.json | 8 +- tools/perf/pmu-events/arch/x86/mapfile.csv | 2 +- 5 files changed, 67 insertions(+), 42 deletions(-) diff --git a/tools/perf/pmu-events/arch/x86/haswellx/cache.json b/tools/perf/pmu-events/arch/x86/haswellx/cache.json index 5760ffb1eaf46..7557a203a1b66 100644 --- a/tools/perf/pmu-events/arch/x86/haswellx/cache.json +++ b/tools/perf/pmu-events/arch/x86/haswellx/cache.json @@ -20,7 +20,7 @@ "UMask": "0x2" }, { - "BriefDescription": "L1D miss oustandings duration in cycles", + "BriefDescription": "L1D miss outstanding duration in cycles", "Counter": "2", "CounterHTOff": "2", "EventCode": "0x48", @@ -592,7 +592,7 @@ "UMask": "0x20" }, { - "BriefDescription": "All retired load uops.", + "BriefDescription": "Retired load uops.", "Counter": "0,1,2,3", "CounterHTOff": "0,1,2,3", "Data_LA": "1", @@ -600,11 +600,12 @@ "EventCode": "0xD0", "EventName": "MEM_UOPS_RETIRED.ALL_LOADS", "PEBS": "1", + "PublicDescription": "Counts all retired load uops. This event accounts for SW prefetch uops of PREFETCHNTA or PREFETCHT0/1/2 or PREFETCHW.", "SampleAfterValue": "2000003", "UMask": "0x81" }, { - "BriefDescription": "All retired store uops.", + "BriefDescription": "Retired store uops.", "Counter": "0,1,2,3", "CounterHTOff": "0,1,2,3", "Data_LA": "1", @@ -613,6 +614,7 @@ "EventName": "MEM_UOPS_RETIRED.ALL_STORES", "L1_Hit_Indication": "1", "PEBS": "1", + "PublicDescription": "Counts all retired store uops.", "SampleAfterValue": "2000003", "UMask": "0x82" }, @@ -1071,7 +1073,6 @@ "CounterHTOff": "0,1,2,3,4,5,6,7", "EventCode": "0xf4", "EventName": "SQ_MISC.SPLIT_LOCK", - "PublicDescription": "SQ_MISC.SPLIT_LOCK", "SampleAfterValue": "100003", "UMask": "0x10" } diff --git a/tools/perf/pmu-events/arch/x86/haswellx/hsx-metrics.json b/tools/perf/pmu-events/arch/x86/haswellx/hsx-metrics.json index c99734fd907d6..5c9e008ca995f 100644 --- a/tools/perf/pmu-events/arch/x86/haswellx/hsx-metrics.json +++ b/tools/perf/pmu-events/arch/x86/haswellx/hsx-metrics.json @@ -111,17 +111,11 @@ "MetricName": "CoreIPC_SMT" }, { - "BriefDescription": "Instruction-Level-Parallelism (average number of uops executed when there is at least 1 uop executed)", + "BriefDescription": "Instruction-Level-Parallelism (average number of uops executed when there is execution) per-core", "MetricExpr": "( UOPS_EXECUTED.CORE / 2 / (( cpu@UOPS_EXECUTED.CORE\\,cmask\\=1@ / 2 ) if #SMT_on else cpu@UOPS_EXECUTED.CORE\\,cmask\\=1@) ) if #SMT_on else UOPS_EXECUTED.CORE / (( cpu@UOPS_EXECUTED.CORE\\,cmask\\=1@ / 2 ) if #SMT_on else cpu@UOPS_EXECUTED.CORE\\,cmask\\=1@)", "MetricGroup": "Backend;Cor;Pipeline;PortsUtil", "MetricName": "ILP" }, - { - "BriefDescription": "Number of Instructions per non-speculative Branch Misprediction (JEClear)", - "MetricExpr": "INST_RETIRED.ANY / BR_MISP_RETIRED.ALL_BRANCHES", - "MetricGroup": "Bad;BadSpec;BrMispredicts", - "MetricName": "IpMispredict" - }, { "BriefDescription": "Core actual clocks when any Logical Processor is active on the Physical Core", "MetricExpr": "( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) )", @@ -170,6 +164,12 @@ "MetricGroup": "Summary;TmaL1", "MetricName": "Instructions" }, + { + "BriefDescription": "Average number of Uops retired in cycles where at least one uop has retired.", + "MetricExpr": "UOPS_RETIRED.RETIRE_SLOTS / cpu@UOPS_RETIRED.RETIRE_SLOTS\\,cmask\\=1@", + "MetricGroup": "Pipeline;Ret", + "MetricName": "Retire" + }, { "BriefDescription": "Fraction of Uops delivered by the DSB (aka Decoded ICache; or Uop Cache)", "MetricExpr": "IDQ.DSB_UOPS / (( IDQ.DSB_UOPS + LSD.UOPS + IDQ.MITE_UOPS + IDQ.MS_UOPS ) )", @@ -177,11 +177,16 @@ "MetricName": "DSB_Coverage" }, { - "BriefDescription": "Actual Average Latency for L1 data-cache miss demand load instructions (in core cycles)", + "BriefDescription": "Number of Instructions per non-speculative Branch Misprediction (JEClear) (lower number means higher occurrence rate)", + "MetricExpr": "INST_RETIRED.ANY / BR_MISP_RETIRED.ALL_BRANCHES", + "MetricGroup": "Bad;BadSpec;BrMispredicts", + "MetricName": "IpMispredict" + }, + { + "BriefDescription": "Actual Average Latency for L1 data-cache miss demand load operations (in core cycles)", "MetricExpr": "L1D_PEND_MISS.PENDING / ( MEM_LOAD_UOPS_RETIRED.L1_MISS + mem_load_uops_retired.hit_lfb )", "MetricGroup": "Mem;MemoryBound;MemoryLat", - "MetricName": "Load_Miss_Real_Latency", - "PublicDescription": "Actual Average Latency for L1 data-cache miss demand load instructions (in core cycles). Latency may be overestimated for multi-load instructions - e.g. repeat strings." + "MetricName": "Load_Miss_Real_Latency" }, { "BriefDescription": "Memory-Level-Parallelism (average number of L1 miss demand load when there is at least one such miss. Per-Logical Processor)", @@ -189,24 +194,6 @@ "MetricGroup": "Mem;MemoryBound;MemoryBW", "MetricName": "MLP" }, - { - "BriefDescription": "Average data fill bandwidth to the L1 data cache [GB / sec]", - "MetricExpr": "64 * L1D.REPLACEMENT / 1000000000 / duration_time", - "MetricGroup": "Mem;MemoryBW", - "MetricName": "L1D_Cache_Fill_BW" - }, - { - "BriefDescription": "Average data fill bandwidth to the L2 cache [GB / sec]", - "MetricExpr": "64 * L2_LINES_IN.ALL / 1000000000 / duration_time", - "MetricGroup": "Mem;MemoryBW", - "MetricName": "L2_Cache_Fill_BW" - }, - { - "BriefDescription": "Average per-core data fill bandwidth to the L3 cache [GB / sec]", - "MetricExpr": "64 * LONGEST_LAT_CACHE.MISS / 1000000000 / duration_time", - "MetricGroup": "Mem;MemoryBW", - "MetricName": "L3_Cache_Fill_BW" - }, { "BriefDescription": "L1 cache true misses per kilo instruction for retired demand loads", "MetricExpr": "1000 * MEM_LOAD_UOPS_RETIRED.L1_MISS / INST_RETIRED.ANY", @@ -238,6 +225,48 @@ "MetricGroup": "Mem;MemoryTLB_SMT", "MetricName": "Page_Walks_Utilization_SMT" }, + { + "BriefDescription": "Average per-core data fill bandwidth to the L1 data cache [GB / sec]", + "MetricExpr": "64 * L1D.REPLACEMENT / 1000000000 / duration_time", + "MetricGroup": "Mem;MemoryBW", + "MetricName": "L1D_Cache_Fill_BW" + }, + { + "BriefDescription": "Average per-core data fill bandwidth to the L2 cache [GB / sec]", + "MetricExpr": "64 * L2_LINES_IN.ALL / 1000000000 / duration_time", + "MetricGroup": "Mem;MemoryBW", + "MetricName": "L2_Cache_Fill_BW" + }, + { + "BriefDescription": "Average per-core data fill bandwidth to the L3 cache [GB / sec]", + "MetricExpr": "64 * LONGEST_LAT_CACHE.MISS / 1000000000 / duration_time", + "MetricGroup": "Mem;MemoryBW", + "MetricName": "L3_Cache_Fill_BW" + }, + { + "BriefDescription": "Average per-thread data fill bandwidth to the L1 data cache [GB / sec]", + "MetricExpr": "(64 * L1D.REPLACEMENT / 1000000000 / duration_time)", + "MetricGroup": "Mem;MemoryBW", + "MetricName": "L1D_Cache_Fill_BW_1T" + }, + { + "BriefDescription": "Average per-thread data fill bandwidth to the L2 cache [GB / sec]", + "MetricExpr": "(64 * L2_LINES_IN.ALL / 1000000000 / duration_time)", + "MetricGroup": "Mem;MemoryBW", + "MetricName": "L2_Cache_Fill_BW_1T" + }, + { + "BriefDescription": "Average per-thread data fill bandwidth to the L3 cache [GB / sec]", + "MetricExpr": "(64 * LONGEST_LAT_CACHE.MISS / 1000000000 / duration_time)", + "MetricGroup": "Mem;MemoryBW", + "MetricName": "L3_Cache_Fill_BW_1T" + }, + { + "BriefDescription": "Average per-thread data access bandwidth to the L3 cache [GB / sec]", + "MetricExpr": "0", + "MetricGroup": "Mem;MemoryBW;Offcore", + "MetricName": "L3_Cache_Access_BW_1T" + }, { "BriefDescription": "Average CPU Utilization", "MetricExpr": "CPU_CLK_UNHALTED.REF_TSC / msr@tsc@", diff --git a/tools/perf/pmu-events/arch/x86/haswellx/pipeline.json b/tools/perf/pmu-events/arch/x86/haswellx/pipeline.json index 6165933ee1a4a..42f6a81006613 100644 --- a/tools/perf/pmu-events/arch/x86/haswellx/pipeline.json +++ b/tools/perf/pmu-events/arch/x86/haswellx/pipeline.json @@ -1035,7 +1035,6 @@ "CounterHTOff": "0,1,2,3,4,5,6,7", "EventCode": "0xA1", "EventName": "UOPS_EXECUTED_PORT.PORT_0_CORE", - "PublicDescription": "Cycles per core when uops are exectuted in port 0.", "SampleAfterValue": "2000003", "UMask": "0x1" }, @@ -1056,7 +1055,6 @@ "CounterHTOff": "0,1,2,3,4,5,6,7", "EventCode": "0xA1", "EventName": "UOPS_EXECUTED_PORT.PORT_1_CORE", - "PublicDescription": "Cycles per core when uops are exectuted in port 1.", "SampleAfterValue": "2000003", "UMask": "0x2" }, @@ -1117,7 +1115,6 @@ "CounterHTOff": "0,1,2,3,4,5,6,7", "EventCode": "0xA1", "EventName": "UOPS_EXECUTED_PORT.PORT_4_CORE", - "PublicDescription": "Cycles per core when uops are exectuted in port 4.", "SampleAfterValue": "2000003", "UMask": "0x10" }, @@ -1138,7 +1135,6 @@ "CounterHTOff": "0,1,2,3,4,5,6,7", "EventCode": "0xA1", "EventName": "UOPS_EXECUTED_PORT.PORT_5_CORE", - "PublicDescription": "Cycles per core when uops are exectuted in port 5.", "SampleAfterValue": "2000003", "UMask": "0x20" }, @@ -1159,7 +1155,6 @@ "CounterHTOff": "0,1,2,3,4,5,6,7", "EventCode": "0xA1", "EventName": "UOPS_EXECUTED_PORT.PORT_6_CORE", - "PublicDescription": "Cycles per core when uops are exectuted in port 6.", "SampleAfterValue": "2000003", "UMask": "0x40" }, diff --git a/tools/perf/pmu-events/arch/x86/haswellx/uncore-cache.json b/tools/perf/pmu-events/arch/x86/haswellx/uncore-cache.json index b48833d1c170f..03598904d7468 100644 --- a/tools/perf/pmu-events/arch/x86/haswellx/uncore-cache.json +++ b/tools/perf/pmu-events/arch/x86/haswellx/uncore-cache.json @@ -511,7 +511,7 @@ "Unit": "CBO" }, { - "BriefDescription": "AD", + "BriefDescription": "UNC_C_RING_SINK_STARVED.AD", "Counter": "0,1,2,3", "EventCode": "0x6", "EventName": "UNC_C_RING_SINK_STARVED.AD", @@ -520,7 +520,7 @@ "Unit": "CBO" }, { - "BriefDescription": "AK", + "BriefDescription": "UNC_C_RING_SINK_STARVED.AK", "Counter": "0,1,2,3", "EventCode": "0x6", "EventName": "UNC_C_RING_SINK_STARVED.AK", @@ -529,7 +529,7 @@ "Unit": "CBO" }, { - "BriefDescription": "IV", + "BriefDescription": "UNC_C_RING_SINK_STARVED.IV", "Counter": "0,1,2,3", "EventCode": "0x6", "EventName": "UNC_C_RING_SINK_STARVED.IV", @@ -538,7 +538,7 @@ "Unit": "CBO" }, { - "BriefDescription": "BL", + "BriefDescription": "UNC_C_RING_SINK_STARVED.BL", "Counter": "0,1,2,3", "EventCode": "0x6", "EventName": "UNC_C_RING_SINK_STARVED.BL", diff --git a/tools/perf/pmu-events/arch/x86/mapfile.csv b/tools/perf/pmu-events/arch/x86/mapfile.csv index 39741e24f733b..b602d2da1b89a 100644 --- a/tools/perf/pmu-events/arch/x86/mapfile.csv +++ b/tools/perf/pmu-events/arch/x86/mapfile.csv @@ -9,7 +9,7 @@ GenuineIntel-6-96,v1.03,elkhartlake,core GenuineIntel-6-5[CF],v13,goldmont,core GenuineIntel-6-7A,v1.01,goldmontplus,core GenuineIntel-6-(3C|45|46),v31,haswell,core -GenuineIntel-6-3F,v17,haswellx,core +GenuineIntel-6-3F,v25,haswellx,core GenuineIntel-6-3A,v18,ivybridge,core GenuineIntel-6-3E,v19,ivytown,core GenuineIntel-6-2D,v20,jaketown,core -- 2.49.0