'--filter' exists, the new filter expression will be combined with
        them by '&&'.
 
+--latency::
+       Enable data collection for latency profiling.
+       Use perf report --latency for latency-centric profile.
+
 -a::
 --all-cpus::
         System-wide collection from all CPUs (default if no target is specified).
 
 --hide-unresolved::
         Only display entries resolved to a symbol.
 
+--latency::
+        Show latency-centric profile rather than the default
+        CPU-consumption-centric profile
+        (requires perf record --latency flag).
+
 -s::
 --sort=::
        Sort histogram entries by given key(s) - multiple keys can be specified
 
        struct evlist           *sb_evlist;
        pthread_t               thread_id;
        int                     realtime_prio;
+       bool                    latency;
        bool                    switch_output_event_set;
        bool                    no_buildid;
        bool                    no_buildid_set;
                     parse_events_option),
        OPT_CALLBACK(0, "filter", &record.evlist, "filter",
                     "event filter", parse_filter),
+       OPT_BOOLEAN(0, "latency", &record.latency,
+                   "Enable data collection for latency profiling.\n"
+                   "\t\t\t  Use perf report --latency for latency-centric profile."),
        OPT_CALLBACK_NOOPT(0, "exclude-perf", &record.evlist,
                           NULL, "don't record events from perf itself",
                           exclude_perf),
 
        }
 
+       if (record.latency) {
+               /*
+                * There is no fundamental reason why latency profiling
+                * can't work for system-wide mode, but exact semantics
+                * and details are to be defined.
+                * See the following thread for details:
+                * https://lore.kernel.org/all/Z4XDJyvjiie3howF@google.com/
+                */
+               if (record.opts.target.system_wide) {
+                       pr_err("Failed: latency profiling is not supported with system-wide collection.\n");
+                       err = -EINVAL;
+                       goto out_opts;
+               }
+               record.opts.record_switch_events = true;
+       }
+
        if (rec->buildid_mmap) {
                if (!perf_can_record_build_id()) {
                        pr_err("Failed: no support to record build id in mmap events, update your kernel.\n");
 
        u64                     nr_entries;
        u64                     queue_size;
        u64                     total_cycles;
+       u64                     total_samples;
+       u64                     singlethreaded_samples;
        int                     socket_filter;
        DECLARE_BITMAP(cpu_bitmap, MAX_NR_CPUS);
        struct branch_type_stat brtype_stat;
                                     &rep->total_cycles, evsel);
        }
 
+       rep->total_samples++;
+       if (al.parallelism == 1)
+               rep->singlethreaded_samples++;
+
        ret = hist_entry_iter__add(&iter, &al, rep->max_stack, rep);
        if (ret < 0)
                pr_debug("problem adding hist entry, skipping event\n");
                return ret;
        }
 
+       /* Don't show Latency column for non-parallel profiles by default. */
+       if (!symbol_conf.prefer_latency && rep->total_samples &&
+               rep->singlethreaded_samples * 100 / rep->total_samples >= 99)
+               perf_hpp__cancel_latency();
+
        evlist__check_mem_load_aux(session->evlist);
 
        if (rep->stats_mode)
                    "Disable raw trace ordering"),
        OPT_BOOLEAN(0, "skip-empty", &report.skip_empty,
                    "Do not display empty (or dummy) events in the output"),
+       OPT_BOOLEAN(0, "latency", &symbol_conf.prefer_latency,
+                   "Show latency-centric profile rather than the default\n"
+                   "\t\t\t  CPU-consumption-centric profile\n"
+                   "\t\t\t  (requires perf record --latency flag)."),
        OPT_END()
        };
        struct perf_data data = {
                symbol_conf.annotate_data_sample = true;
        }
 
+       symbol_conf.enable_latency = true;
        if (report.disable_order || !perf_session__has_switch_events(session)) {
                if (symbol_conf.parallelism_list_str ||
-                               (sort_order && strstr(sort_order, "parallelism")) ||
-                               (field_order && strstr(field_order, "parallelism"))) {
+                       symbol_conf.prefer_latency ||
+                       (sort_order && (strstr(sort_order, "latency") ||
+                               strstr(sort_order, "parallelism"))) ||
+                       (field_order && (strstr(field_order, "latency") ||
+                               strstr(field_order, "parallelism")))) {
                        if (report.disable_order)
-                               ui__error("Use of parallelism is incompatible with --disable-order.\n");
+                               ui__error("Use of latency profile or parallelism is incompatible with --disable-order.\n");
                        else
-                               ui__error("Use of parallelism requires --switch-events during record.\n");
+                               ui__error("Use of latency profile or parallelism requires --latency flag during record.\n");
                        return -1;
                }
+               /*
+                * If user did not ask for anything related to
+                * latency/parallelism explicitly, just don't show it.
+                */
+               symbol_conf.enable_latency = false;
        }
 
        if (sort_order && strstr(sort_order, "ipc")) {
 
        if (is_strict_order(field_order))
                return;
 
+       /*
+        * Overhead and latency columns are added in setup_overhead(),
+        * so they are added implicitly here only if they were added
+        * by setup_overhead() before (have was_taken flag set).
+        * This is required because setup_overhead() has more complex
+        * logic, in particular it does not add "overhead" if user
+        * specified "latency" in sort order, and vice versa.
+        */
        if (symbol_conf.cumulate_callchain) {
-               hpp_dimension__add_output(PERF_HPP__OVERHEAD_ACC);
+               /*
+                * Addition of fields is idempotent, so we add latency
+                * column twice to get desired order with simpler logic.
+                */
+               if (symbol_conf.prefer_latency)
+                       hpp_dimension__add_output(PERF_HPP__LATENCY_ACC, true);
+               hpp_dimension__add_output(PERF_HPP__OVERHEAD_ACC, true);
+               if (symbol_conf.enable_latency)
+                       hpp_dimension__add_output(PERF_HPP__LATENCY_ACC, true);
                perf_hpp__format[PERF_HPP__OVERHEAD].name = "Self";
        }
 
-       hpp_dimension__add_output(PERF_HPP__OVERHEAD);
+       if (symbol_conf.prefer_latency)
+               hpp_dimension__add_output(PERF_HPP__LATENCY, true);
+       hpp_dimension__add_output(PERF_HPP__OVERHEAD, true);
+       if (symbol_conf.enable_latency)
+               hpp_dimension__add_output(PERF_HPP__LATENCY, true);
 
        if (symbol_conf.show_cpu_utilization) {
-               hpp_dimension__add_output(PERF_HPP__OVERHEAD_SYS);
-               hpp_dimension__add_output(PERF_HPP__OVERHEAD_US);
+               hpp_dimension__add_output(PERF_HPP__OVERHEAD_SYS, false);
+               hpp_dimension__add_output(PERF_HPP__OVERHEAD_US, false);
 
                if (perf_guest) {
-                       hpp_dimension__add_output(PERF_HPP__OVERHEAD_GUEST_SYS);
-                       hpp_dimension__add_output(PERF_HPP__OVERHEAD_GUEST_US);
+                       hpp_dimension__add_output(PERF_HPP__OVERHEAD_GUEST_SYS, false);
+                       hpp_dimension__add_output(PERF_HPP__OVERHEAD_GUEST_US, false);
                }
        }
 
        if (symbol_conf.show_nr_samples)
-               hpp_dimension__add_output(PERF_HPP__SAMPLES);
+               hpp_dimension__add_output(PERF_HPP__SAMPLES, false);
 
        if (symbol_conf.show_total_period)
-               hpp_dimension__add_output(PERF_HPP__PERIOD);
+               hpp_dimension__add_output(PERF_HPP__PERIOD, false);
 }
 
 void perf_hpp_list__column_register(struct perf_hpp_list *list,
        }
 }
 
+/*
+ * Unregister the latency columns: every format in perf_hpp_list that
+ * matches PERF_HPP__LATENCY or PERF_HPP__LATENCY_ACC is passed to
+ * perf_hpp__column_unregister().
+ *
+ * Nothing is removed when the field order is strict or when the sort
+ * order mentions "latency"; the function returns early in both cases.
+ */
+void perf_hpp__cancel_latency(void)
+{
+       struct perf_hpp_fmt *fmt, *lat, *acc, *tmp;
+
+       if (is_strict_order(field_order))
+               return;
+       if (sort_order && strstr(sort_order, "latency"))
+               return;
+
+       lat = &perf_hpp__format[PERF_HPP__LATENCY];
+       acc = &perf_hpp__format[PERF_HPP__LATENCY_ACC];
+
+       /*
+        * Entries are unregistered while walking the list; presumably this
+        * is why the _safe iterator (with tmp as lookahead) is used.
+        */
+       perf_hpp_list__for_each_format_safe(&perf_hpp_list, fmt, tmp) {
+               if (fmt_equal(lat, fmt) || fmt_equal(acc, fmt))
+                       perf_hpp__column_unregister(fmt);
+       }
+}
+
 void perf_hpp__setup_output_field(struct perf_hpp_list *list)
 {
        struct perf_hpp_fmt *fmt;
 
 
 void perf_hpp__init(void);
 void perf_hpp__cancel_cumulate(void);
+void perf_hpp__cancel_latency(void);
 void perf_hpp__setup_output_field(struct perf_hpp_list *list);
 void perf_hpp__reset_output_field(struct perf_hpp_list *list);
 void perf_hpp__append_sort_keys(struct perf_hpp_list *list);
 
        const char              *name;
        struct perf_hpp_fmt     *fmt;
        int                     taken;
+       int                     was_taken;
 };
 
 #define DIM(d, n) { .name = n, .fmt = &perf_hpp__format[d], }
                return -1;
 
        hd->taken = 1;
+       hd->was_taken = 1;
        perf_hpp_list__register_sort_field(list, fmt);
        return 0;
 }
        return 0;
 }
 
-int hpp_dimension__add_output(unsigned col)
+/*
+ * Add output column @col to the global perf_hpp_list.
+ *
+ * When @implicit is true, the column is added only if its dimension has
+ * the was_taken flag set; otherwise nothing is added and 0 is returned.
+ * When @implicit is false, the request is passed straight to
+ * __hpp_dimension__add_output().
+ *
+ * @col must be below PERF_HPP__MAX_INDEX (enforced with BUG_ON).
+ * Returns 0 when skipped, else the result of __hpp_dimension__add_output().
+ */
+int hpp_dimension__add_output(unsigned col, bool implicit)
 {
+       struct hpp_dimension *hd;
+
        BUG_ON(col >= PERF_HPP__MAX_INDEX);
-       return __hpp_dimension__add_output(&perf_hpp_list, &hpp_sort_dimensions[col]);
+       hd = &hpp_sort_dimensions[col];
+       if (implicit && !hd->was_taken)
+               return 0;
+       return __hpp_dimension__add_output(&perf_hpp_list, hd);
 }
 
 int sort_dimension__add(struct perf_hpp_list *list, const char *tok,
        if (sort__mode == SORT_MODE__DIFF)
                return keys;
 
-       keys = prefix_if_not_in("overhead", keys);
-
-       if (symbol_conf.cumulate_callchain)
-               keys = prefix_if_not_in("overhead_children", keys);
+       if (symbol_conf.prefer_latency) {
+               keys = prefix_if_not_in("overhead", keys);
+               keys = prefix_if_not_in("latency", keys);
+               if (symbol_conf.cumulate_callchain) {
+                       keys = prefix_if_not_in("overhead_children", keys);
+                       keys = prefix_if_not_in("latency_children", keys);
+               }
+       } else if (!keys || (!strstr(keys, "overhead") &&
+                       !strstr(keys, "latency"))) {
+               if (symbol_conf.enable_latency)
+                       keys = prefix_if_not_in("latency", keys);
+               keys = prefix_if_not_in("overhead", keys);
+               if (symbol_conf.cumulate_callchain) {
+                       if (symbol_conf.enable_latency)
+                               keys = prefix_if_not_in("latency_children", keys);
+                       keys = prefix_if_not_in("overhead_children", keys);
+               }
+       }
 
        return keys;
 }
 
 
 bool is_strict_order(const char *order);
 
-int hpp_dimension__add_output(unsigned col);
+int hpp_dimension__add_output(unsigned col, bool implicit);
 void reset_dimensions(void);
 int sort_dimension__add(struct perf_hpp_list *list, const char *tok,
                        struct evlist *evlist,
 
                        keep_exited_threads,
                        annotate_data_member,
                        annotate_data_sample,
-                       skip_empty;
+                       skip_empty,
+                       enable_latency,
+                       prefer_latency;
        const char      *vmlinux_name,
                        *kallsyms_name,
                        *source_prefix,