From 2570c02c3a5a8b27e2e14cd205ccb796e01f3308 Mon Sep 17 00:00:00 2001 From: Dmitry Vyukov Date: Thu, 13 Feb 2025 10:08:19 +0100 Subject: [PATCH] perf report: Add --latency flag Add record/report --latency flag that allows to capture and show latency-centric profiles rather than the default CPU-consumption-centric profiles. For latency profiles record captures context switch events, and report shows Latency as the first column. Signed-off-by: Dmitry Vyukov Reviewed-by: Andi Kleen Link: https://lore.kernel.org/r/e9640464bcbc47dde2cb557003f421052ebc9eec.1739437531.git.dvyukov@google.com Signed-off-by: Namhyung Kim --- tools/perf/Documentation/perf-record.txt | 4 ++ tools/perf/Documentation/perf-report.txt | 5 +++ tools/perf/builtin-record.c | 20 +++++++++ tools/perf/builtin-report.c | 32 ++++++++++++-- tools/perf/ui/hist.c | 54 ++++++++++++++++++++---- tools/perf/util/hist.h | 1 + tools/perf/util/sort.c | 33 ++++++++++++--- tools/perf/util/sort.h | 2 +- tools/perf/util/symbol_conf.h | 4 +- 9 files changed, 135 insertions(+), 20 deletions(-) diff --git a/tools/perf/Documentation/perf-record.txt b/tools/perf/Documentation/perf-record.txt index 80686d590de2..c7fc1ba265e2 100644 --- a/tools/perf/Documentation/perf-record.txt +++ b/tools/perf/Documentation/perf-record.txt @@ -227,6 +227,10 @@ OPTIONS '--filter' exists, the new filter expression will be combined with them by '&&'. +--latency:: + Enable data collection for latency profiling. + Use perf report --latency for latency-centric profile. + -a:: --all-cpus:: System-wide collection from all CPUs (default if no target is specified). diff --git a/tools/perf/Documentation/perf-report.txt b/tools/perf/Documentation/perf-report.txt index 87f864519406..66794131aec4 100644 --- a/tools/perf/Documentation/perf-report.txt +++ b/tools/perf/Documentation/perf-report.txt @@ -68,6 +68,11 @@ OPTIONS --hide-unresolved:: Only display entries resolved to a symbol. +--latency:: + Show latency-centric profile rather than the default + CPU-consumption-centric profile + (requires perf record --latency flag). + -s:: --sort=:: Sort histogram entries by given key(s) - multiple keys can be specified diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c index cda7e6a7b45d..9af3f21fd015 100644 --- a/tools/perf/builtin-record.c +++ b/tools/perf/builtin-record.c @@ -161,6 +161,7 @@ struct record { struct evlist *sb_evlist; pthread_t thread_id; int realtime_prio; + bool latency; bool switch_output_event_set; bool no_buildid; bool no_buildid_set; @@ -3373,6 +3374,9 @@ static struct option __record_options[] = { parse_events_option), OPT_CALLBACK(0, "filter", &record.evlist, "filter", "event filter", parse_filter), + OPT_BOOLEAN(0, "latency", &record.latency, + "Enable data collection for latency profiling.\n" + "\t\t\t Use perf report --latency for latency-centric profile."), OPT_CALLBACK_NOOPT(0, "exclude-perf", &record.evlist, NULL, "don't record events from perf itself", exclude_perf), @@ -4019,6 +4023,22 @@ int cmd_record(int argc, const char **argv) } + if (record.latency) { + /* + * There is no fundamental reason why latency profiling + * can't work for system-wide mode, but exact semantics + * and details are to be defined. + * See the following thread for details: + * https://lore.kernel.org/all/Z4XDJyvjiie3howF@google.com/ + */ + if (record.opts.target.system_wide) { + pr_err("Failed: latency profiling is not supported with system-wide collection.\n"); + err = -EINVAL; + goto out_opts; + } + record.opts.record_switch_events = true; + } + if (rec->buildid_mmap) { if (!perf_can_record_build_id()) { pr_err("Failed: no support to record build id in mmap events, update your kernel.\n"); diff --git a/tools/perf/builtin-report.c b/tools/perf/builtin-report.c index 2a19abdc869a..8e064b8bd589 100644 --- a/tools/perf/builtin-report.c +++ b/tools/perf/builtin-report.c @@ -112,6 +112,8 @@ struct report { u64 nr_entries; u64 queue_size; u64 total_cycles; + u64 total_samples; + u64 singlethreaded_samples; int socket_filter; DECLARE_BITMAP(cpu_bitmap, MAX_NR_CPUS); struct branch_type_stat brtype_stat; @@ -331,6 +333,10 @@ static int process_sample_event(const struct perf_tool *tool, &rep->total_cycles, evsel); } + rep->total_samples++; + if (al.parallelism == 1) + rep->singlethreaded_samples++; + ret = hist_entry_iter__add(&iter, &al, rep->max_stack, rep); if (ret < 0) pr_debug("problem adding hist entry, skipping event\n"); @@ -1079,6 +1085,11 @@ static int __cmd_report(struct report *rep) return ret; } + /* Don't show Latency column for non-parallel profiles by default. */ + if (!symbol_conf.prefer_latency && rep->total_samples && + rep->singlethreaded_samples * 100 / rep->total_samples >= 99) + perf_hpp__cancel_latency(); + evlist__check_mem_load_aux(session->evlist); if (rep->stats_mode) @@ -1468,6 +1479,10 @@ int cmd_report(int argc, const char **argv) "Disable raw trace ordering"), OPT_BOOLEAN(0, "skip-empty", &report.skip_empty, "Do not display empty (or dummy) events in the output"), + OPT_BOOLEAN(0, "latency", &symbol_conf.prefer_latency, + "Show latency-centric profile rather than the default\n" + "\t\t\t CPU-consumption-centric profile\n" + "\t\t\t (requires perf record --latency flag)."), OPT_END() }; struct perf_data data = { @@ -1722,16 +1737,25 @@ repeat: symbol_conf.annotate_data_sample = true; } + symbol_conf.enable_latency = true; if (report.disable_order || !perf_session__has_switch_events(session)) { if (symbol_conf.parallelism_list_str || - (sort_order && strstr(sort_order, "parallelism")) || - (field_order && strstr(field_order, "parallelism"))) { + symbol_conf.prefer_latency || + (sort_order && (strstr(sort_order, "latency") || + strstr(sort_order, "parallelism"))) || + (field_order && (strstr(field_order, "latency") || + strstr(field_order, "parallelism")))) { if (report.disable_order) - ui__error("Use of parallelism is incompatible with --disable-order.\n"); + ui__error("Use of latency profile or parallelism is incompatible with --disable-order.\n"); else - ui__error("Use of parallelism requires --switch-events during record.\n"); + ui__error("Use of latency profile or parallelism requires --latency flag during record.\n"); return -1; } + /* + * If user did not ask for anything related to + * latency/parallelism explicitly, just don't show it. + */ + symbol_conf.enable_latency = false; } if (sort_order && strstr(sort_order, "ipc")) { diff --git a/tools/perf/ui/hist.c b/tools/perf/ui/hist.c index 6de6309595f9..ae3b7fe1dadc 100644 --- a/tools/perf/ui/hist.c +++ b/tools/perf/ui/hist.c @@ -631,28 +631,48 @@ void perf_hpp__init(void) if (is_strict_order(field_order)) return; + /* + * Overhead and latency columns are added in setup_overhead(), + * so they are added implicitly here only if they were added + * by setup_overhead() before (have was_taken flag set). + * This is required because setup_overhead() has more complex + * logic, in particular it does not add "overhead" if user + * specified "latency" in sort order, and vise versa. + */ if (symbol_conf.cumulate_callchain) { - hpp_dimension__add_output(PERF_HPP__OVERHEAD_ACC); + /* + * Addition of fields is idempotent, so we add latency + * column twice to get desired order with simpler logic. + */ + if (symbol_conf.prefer_latency) + hpp_dimension__add_output(PERF_HPP__LATENCY_ACC, true); + hpp_dimension__add_output(PERF_HPP__OVERHEAD_ACC, true); + if (symbol_conf.enable_latency) + hpp_dimension__add_output(PERF_HPP__LATENCY_ACC, true); perf_hpp__format[PERF_HPP__OVERHEAD].name = "Self"; } - hpp_dimension__add_output(PERF_HPP__OVERHEAD); + if (symbol_conf.prefer_latency) + hpp_dimension__add_output(PERF_HPP__LATENCY, true); + hpp_dimension__add_output(PERF_HPP__OVERHEAD, true); + if (symbol_conf.enable_latency) + hpp_dimension__add_output(PERF_HPP__LATENCY, true); if (symbol_conf.show_cpu_utilization) { - hpp_dimension__add_output(PERF_HPP__OVERHEAD_SYS); - hpp_dimension__add_output(PERF_HPP__OVERHEAD_US); + hpp_dimension__add_output(PERF_HPP__OVERHEAD_SYS, false); + hpp_dimension__add_output(PERF_HPP__OVERHEAD_US, false); if (perf_guest) { - hpp_dimension__add_output(PERF_HPP__OVERHEAD_GUEST_SYS); - hpp_dimension__add_output(PERF_HPP__OVERHEAD_GUEST_US); + hpp_dimension__add_output(PERF_HPP__OVERHEAD_GUEST_SYS, false); + hpp_dimension__add_output(PERF_HPP__OVERHEAD_GUEST_US, false); } } if (symbol_conf.show_nr_samples) - hpp_dimension__add_output(PERF_HPP__SAMPLES); + hpp_dimension__add_output(PERF_HPP__SAMPLES, false); if (symbol_conf.show_total_period) - hpp_dimension__add_output(PERF_HPP__PERIOD); + hpp_dimension__add_output(PERF_HPP__PERIOD, false); } void perf_hpp_list__column_register(struct perf_hpp_list *list, @@ -701,6 +721,24 @@ void perf_hpp__cancel_cumulate(void) } } +void perf_hpp__cancel_latency(void) +{ + struct perf_hpp_fmt *fmt, *lat, *acc, *tmp; + + if (is_strict_order(field_order)) + return; + if (sort_order && strstr(sort_order, "latency")) + return; + + lat = &perf_hpp__format[PERF_HPP__LATENCY]; + acc = &perf_hpp__format[PERF_HPP__LATENCY_ACC]; + + perf_hpp_list__for_each_format_safe(&perf_hpp_list, fmt, tmp) { + if (fmt_equal(lat, fmt) || fmt_equal(acc, fmt)) + perf_hpp__column_unregister(fmt); + } +} + void perf_hpp__setup_output_field(struct perf_hpp_list *list) { struct perf_hpp_fmt *fmt; diff --git a/tools/perf/util/hist.h b/tools/perf/util/hist.h index 91159f16c60b..29d4c7a3d174 100644 --- a/tools/perf/util/hist.h +++ b/tools/perf/util/hist.h @@ -582,6 +582,7 @@ enum { void perf_hpp__init(void); void perf_hpp__cancel_cumulate(void); +void perf_hpp__cancel_latency(void); void perf_hpp__setup_output_field(struct perf_hpp_list *list); void perf_hpp__reset_output_field(struct perf_hpp_list *list); void perf_hpp__append_sort_keys(struct perf_hpp_list *list); diff --git a/tools/perf/util/sort.c b/tools/perf/util/sort.c index bc4c3acfe755..2b6023de7a53 100644 --- a/tools/perf/util/sort.c +++ b/tools/perf/util/sort.c @@ -2622,6 +2622,7 @@ struct hpp_dimension { const char *name; struct perf_hpp_fmt *fmt; int taken; + int was_taken; }; #define DIM(d, n) { .name = n, .fmt = &perf_hpp__format[d], } @@ -3513,6 +3514,7 @@ static int __hpp_dimension__add(struct hpp_dimension *hd, return -1; hd->taken = 1; + hd->was_taken = 1; perf_hpp_list__register_sort_field(list, fmt); return 0; } @@ -3547,10 +3549,15 @@ static int __hpp_dimension__add_output(struct perf_hpp_list *list, return 0; } -int hpp_dimension__add_output(unsigned col) +int hpp_dimension__add_output(unsigned col, bool implicit) { + struct hpp_dimension *hd; + BUG_ON(col >= PERF_HPP__MAX_INDEX); - return __hpp_dimension__add_output(&perf_hpp_list, &hpp_sort_dimensions[col]); + hd = &hpp_sort_dimensions[col]; + if (implicit && !hd->was_taken) + return 0; + return __hpp_dimension__add_output(&perf_hpp_list, hd); } int sort_dimension__add(struct perf_hpp_list *list, const char *tok, @@ -3809,10 +3816,24 @@ static char *setup_overhead(char *keys) if (sort__mode == SORT_MODE__DIFF) return keys; - keys = prefix_if_not_in("overhead", keys); - - if (symbol_conf.cumulate_callchain) - keys = prefix_if_not_in("overhead_children", keys); + if (symbol_conf.prefer_latency) { + keys = prefix_if_not_in("overhead", keys); + keys = prefix_if_not_in("latency", keys); + if (symbol_conf.cumulate_callchain) { + keys = prefix_if_not_in("overhead_children", keys); + keys = prefix_if_not_in("latency_children", keys); + } + } else if (!keys || (!strstr(keys, "overhead") && + !strstr(keys, "latency"))) { + if (symbol_conf.enable_latency) + keys = prefix_if_not_in("latency", keys); + keys = prefix_if_not_in("overhead", keys); + if (symbol_conf.cumulate_callchain) { + if (symbol_conf.enable_latency) + keys = prefix_if_not_in("latency_children", keys); + keys = prefix_if_not_in("overhead_children", keys); + } + } return keys; } diff --git a/tools/perf/util/sort.h b/tools/perf/util/sort.h index 11fb15f91409..180d36a2bea3 100644 --- a/tools/perf/util/sort.h +++ b/tools/perf/util/sort.h @@ -141,7 +141,7 @@ int report_parse_ignore_callees_opt(const struct option *opt, const char *arg, i bool is_strict_order(const char *order); -int hpp_dimension__add_output(unsigned col); +int hpp_dimension__add_output(unsigned col, bool implicit); void reset_dimensions(void); int sort_dimension__add(struct perf_hpp_list *list, const char *tok, struct evlist *evlist, diff --git a/tools/perf/util/symbol_conf.h b/tools/perf/util/symbol_conf.h index c5b2e56127e2..cd9aa82c7d5a 100644 --- a/tools/perf/util/symbol_conf.h +++ b/tools/perf/util/symbol_conf.h @@ -49,7 +49,9 @@ struct symbol_conf { keep_exited_threads, annotate_data_member, annotate_data_sample, - skip_empty; + skip_empty, + enable_latency, + prefer_latency; const char *vmlinux_name, *kallsyms_name, *source_prefix, -- 2.50.1