Set the function name to get the histogram.  Unlike perf ftrace trace,
        it only allows a single function to calculate the histogram.
 
+-e::
+--events=::
+       Set the pair of events to get the histogram.  The histogram is calculated
+       from the time difference between the two events in the same thread.  This
+       requires the -b/--use-bpf option.
+
 -b::
 --use-bpf::
        Use BPF to measure function latency instead of using the ftrace (it
 
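For the new -e/--events option, a hypothetical invocation could look like the
following (the event names are placeholders for any pair of raw tracepoints
whose begin and end fire in the same thread; -b/--use-bpf is mandatory):

    # perf ftrace latency -b -e myprobe_begin,myprobe_end -- ./workload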
        }
 }
 
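+/*
+ * Parse a comma-separated list of event names (from -e/--events) into
+ * filter_entry nodes on the given list.  The requirement that exactly two
+ * events are given is checked later, when the BPF programs are prepared.
+ */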
+static int parse_filter_event(const struct option *opt, const char *str,
+                            int unset __maybe_unused)
+{
+       struct list_head *head = opt->value;
+       struct filter_entry *entry;
+       char *s, *p;
+       int ret = -ENOMEM;
+
+       s = strdup(str);
+       if (s == NULL)
+               return -ENOMEM;
+
+       while ((p = strsep(&s, ",")) != NULL) {
+               entry = malloc(sizeof(*entry) + strlen(p) + 1);
+               if (entry == NULL)
+                       goto out;
+
+               strcpy(entry->name, p);
+               list_add_tail(&entry->list, head);
+       }
+       ret = 0;
+
+out:
+       free(s);
+       return ret;
+}
+
 static int parse_buffer_size(const struct option *opt,
                             const char *str, int unset)
 {
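The malloc(sizeof(*entry) + strlen(p) + 1) plus strcpy(entry->name, p) pattern
above assumes that name is a flexible array member at the end of the entry.
A minimal sketch of the assumed layout (the real definition lives in perf's
ftrace header, outside this hunk):

    struct filter_entry {
            struct list_head        list;
            char                    name[];
    };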
        const struct option latency_options[] = {
        OPT_CALLBACK('T', "trace-funcs", &ftrace.filters, "func",
                     "Show latency of given function", parse_filter_func),
+       OPT_CALLBACK('e', "events", &ftrace.event_pair, "event1,event2",
+                    "Show latency between the two events", parse_filter_event),
 #ifdef HAVE_BPF_SKEL
        OPT_BOOLEAN('b', "use-bpf", &ftrace.target.use_bpf,
                    "Use BPF to measure function latency"),
        INIT_LIST_HEAD(&ftrace.notrace);
        INIT_LIST_HEAD(&ftrace.graph_funcs);
        INIT_LIST_HEAD(&ftrace.nograph_funcs);
+       INIT_LIST_HEAD(&ftrace.event_pair);
 
        signal(SIGINT, sig_handler);
        signal(SIGUSR1, sig_handler);
                cmd_func = __cmd_ftrace;
                break;
        case PERF_FTRACE_LATENCY:
-               if (list_empty(&ftrace.filters)) {
-                       pr_err("Should provide a function to measure\n");
+               if (list_empty(&ftrace.filters) && list_empty(&ftrace.event_pair)) {
+                       pr_err("Should provide a function or events to measure\n");
                        parse_options_usage(ftrace_usage, options, "T", 1);
+                       parse_options_usage(NULL, options, "e", 1);
+                       ret = -EINVAL;
+                       goto out_delete_filters;
+               }
+               if (!list_empty(&ftrace.filters) && !list_empty(&ftrace.event_pair)) {
+                       pr_err("Please specify either a function or events, not both\n");
+                       parse_options_usage(ftrace_usage, options, "T", 1);
+                       parse_options_usage(NULL, options, "e", 1);
+                       ret = -EINVAL;
+                       goto out_delete_filters;
+               }
+               if (!list_empty(&ftrace.event_pair) && !ftrace.target.use_bpf) {
+                       pr_err("Event processing needs BPF\n");
+                       parse_options_usage(ftrace_usage, options, "b", 1);
+                       parse_options_usage(NULL, options, "e", 1);
                        ret = -EINVAL;
                        goto out_delete_filters;
                }
        delete_filter_func(&ftrace.notrace);
        delete_filter_func(&ftrace.graph_funcs);
        delete_filter_func(&ftrace.nograph_funcs);
+       delete_filter_func(&ftrace.event_pair);
 
        return ret;
 }
 
 {
        int fd, err;
        int i, ncpus = 1, ntasks = 1;
-       struct filter_entry *func;
+       struct filter_entry *func = NULL;
 
-       if (!list_is_singular(&ftrace->filters)) {
-               pr_err("ERROR: %s target function(s).\n",
-                      list_empty(&ftrace->filters) ? "No" : "Too many");
-               return -1;
-       }
+       if (!list_empty(&ftrace->filters)) {
+               if (!list_is_singular(&ftrace->filters)) {
+                       pr_err("ERROR: Too many target functions.\n");
+                       return -1;
+               }
+               func = list_first_entry(&ftrace->filters, struct filter_entry, list);
+       } else {
+               int count = 0;
+               struct list_head *pos;
 
-       func = list_first_entry(&ftrace->filters, struct filter_entry, list);
+               list_for_each(pos, &ftrace->event_pair)
+                       count++;
+
+               if (count != 2) {
+                       pr_err("ERROR: Needs two target events.\n");
+                       return -1;
+               }
+       }
 
        skel = func_latency_bpf__open();
        if (!skel) {
 
        skel->bss->min = INT64_MAX;
 
-       skel->links.func_begin = bpf_program__attach_kprobe(skel->progs.func_begin,
-                                                           false, func->name);
-       if (IS_ERR(skel->links.func_begin)) {
-               pr_err("Failed to attach fentry program\n");
-               err = PTR_ERR(skel->links.func_begin);
-               goto out;
-       }
+       if (func) {
+               skel->links.func_begin = bpf_program__attach_kprobe(skel->progs.func_begin,
+                                                                   false, func->name);
+               if (IS_ERR(skel->links.func_begin)) {
+                       pr_err("Failed to attach fentry program\n");
+                       err = PTR_ERR(skel->links.func_begin);
+                       goto out;
+               }
 
-       skel->links.func_end = bpf_program__attach_kprobe(skel->progs.func_end,
-                                                         true, func->name);
-       if (IS_ERR(skel->links.func_end)) {
-               pr_err("Failed to attach fexit program\n");
-               err = PTR_ERR(skel->links.func_end);
-               goto out;
+               skel->links.func_end = bpf_program__attach_kprobe(skel->progs.func_end,
+                                                                 true, func->name);
+               if (IS_ERR(skel->links.func_end)) {
+                       pr_err("Failed to attach fexit program\n");
+                       err = PTR_ERR(skel->links.func_end);
+                       goto out;
+               }
+       } else {
+               struct filter_entry *event;
+
+               event = list_first_entry(&ftrace->event_pair, struct filter_entry, list);
+
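+               /*
+                * The event name from -e is passed to libbpf verbatim;
+                * bpf_program__attach_raw_tracepoint() expects the bare
+                * raw tracepoint name (e.g. "sched_switch", not
+                * "sched:sched_switch").
+                */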
+               skel->links.event_begin = bpf_program__attach_raw_tracepoint(skel->progs.event_begin,
+                                                                            event->name);
+               if (IS_ERR(skel->links.event_begin)) {
+                       pr_err("Failed to attach first tracepoint program\n");
+                       err = PTR_ERR(skel->links.event_begin);
+                       goto out;
+               }
+
+               event = list_next_entry(event, list);
+
+               skel->links.event_end = bpf_program__attach_raw_tracepoint(skel->progs.event_end,
+                                                                            event->name);
+               if (IS_ERR(skel->links.event_end)) {
+                       pr_err("Failed to attach second tracepoint program\n");
+                       err = PTR_ERR(skel->links.event_end);
+                       goto out;
+               }
        }
 
        /* XXX: we don't actually use this fd - just for poll() */
 
 const volatile unsigned int max_latency;
 const volatile unsigned int bucket_num = NUM_BUCKET;
 
-SEC("kprobe/func")
-int BPF_PROG(func_begin)
+static bool can_record(void)
 {
-       __u64 key, now;
-
-       if (!enabled)
-               return 0;
-
-       key = bpf_get_current_pid_tgid();
-
        if (has_cpu) {
                __u32 cpu = bpf_get_smp_processor_id();
                __u8 *ok;
 
                ok = bpf_map_lookup_elem(&cpu_filter, &cpu);
                if (!ok)
-                       return 0;
+                       return false;
        }
 
        if (has_task) {
-               __u32 pid = key & 0xffffffff;
+               __u32 pid = bpf_get_current_pid_tgid();
                __u8 *ok;
 
                ok = bpf_map_lookup_elem(&task_filter, &pid);
                if (!ok)
-                       return 0;
+                       return false;
        }
+       return true;
+}
+
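+/*
+ * Turn one latency sample into a histogram bucket and update the summary
+ * counters.  With bucket_range == 0 the buckets are powers of two of the
+ * base unit (usec by default, nsec with use_nsec): e.g. a 2500 ns delta
+ * with cmp_base == 1000 exits the loop at key == 2, the [2us, 4us) bucket.
+ */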
+static void update_latency(__s64 delta)
+{
+       __u64 val = delta;
+       __u32 key = 0;
+       __u64 *hist;
+       __u64 cmp_base = use_nsec ? 1 : 1000;
+
+       if (delta < 0)
+               return;
 
+       if (bucket_range != 0) {
+               val = delta / cmp_base;
+
+               if (min_latency > 0) {
+                       if (val > min_latency)
+                               val -= min_latency;
+                       else
+                               goto do_lookup;
+               }
+
+               // Less than 1 unit (us or ns), or, in the future,
+               // than the min latency desired.
+               if (val > 0) { // 1st entry: [ 1 unit .. bucket_range units )
+                       key = val / bucket_range + 1;
+                       if (key >= bucket_num)
+                               key = bucket_num - 1;
+               }
+
+               goto do_lookup;
+       }
+       // calculate index using delta
+       for (key = 0; key < (bucket_num - 1); key++) {
+               if (delta < (cmp_base << key))
+                       break;
+       }
+
+do_lookup:
+       hist = bpf_map_lookup_elem(&latency, &key);
+       if (!hist)
+               return;
+
+       __sync_fetch_and_add(hist, 1);
+
+       __sync_fetch_and_add(&total, delta); // always in nsec
+       __sync_fetch_and_add(&count, 1);
+
+       if (delta > max)
+               max = delta;
+       if (delta < min)
+               min = delta;
+}
+
+SEC("kprobe/func")
+int BPF_PROG(func_begin)
+{
+       __u64 key, now;
+
+       if (!enabled || !can_record())
+               return 0;
+
+       key = bpf_get_current_pid_tgid();
        now = bpf_ktime_get_ns();
 
        // overwrite timestamp for nested functions
 {
        __u64 tid;
        __u64 *start;
-       __u64 cmp_base = use_nsec ? 1 : 1000;
 
        if (!enabled)
                return 0;
 
        tid = bpf_get_current_pid_tgid();

        start = bpf_map_lookup_elem(&functime, &tid);
        if (start) {
-               __s64 delta = bpf_ktime_get_ns() - *start;
-               __u64 val = delta;
-               __u32 key = 0;
-               __u64 *hist;
-
+               update_latency(bpf_ktime_get_ns() - *start);
                bpf_map_delete_elem(&functime, &tid);
+       }
 
-               if (delta < 0)
-                       return 0;
+       return 0;
+}
 
-               if (bucket_range != 0) {
-                       val = delta / cmp_base;
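+/*
+ * The event pair reuses the functime map keyed by pid_tgid, so the begin
+ * and end events only produce a sample when both fire in the same thread.
+ */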
+SEC("raw_tp")
+int BPF_PROG(event_begin)
+{
+       __u64 key, now;
 
-                       if (min_latency > 0) {
-                               if (val > min_latency)
-                                       val -= min_latency;
-                               else
-                                       goto do_lookup;
-                       }
+       if (!enabled || !can_record())
+               return 0;
 
-                       // Less than 1 unit (ms or ns), or, in the future,
-                       // than the min latency desired.
-                       if (val > 0) { // 1st entry: [ 1 unit .. bucket_range units )
-                               key = val / bucket_range + 1;
-                               if (key >= bucket_num)
-                                       key = bucket_num - 1;
-                       }
+       key = bpf_get_current_pid_tgid();
+       now = bpf_ktime_get_ns();
 
-                       goto do_lookup;
-               }
-               // calculate index using delta
-               for (key = 0; key < (bucket_num - 1); key++) {
-                       if (delta < (cmp_base << key))
-                               break;
-               }
+       // overwrite timestamp for nested events
+       bpf_map_update_elem(&functime, &key, &now, BPF_ANY);
+       return 0;
+}
 
-do_lookup:
-               hist = bpf_map_lookup_elem(&latency, &key);
-               if (!hist)
-                       return 0;
+SEC("raw_tp")
+int BPF_PROG(event_end)
+{
+       __u64 tid;
+       __u64 *start;
 
-               __sync_fetch_and_add(hist, 1);
+       if (!enabled)
+               return 0;
 
-               __sync_fetch_and_add(&total, delta); // always in nsec
-               __sync_fetch_and_add(&count, 1);
+       tid = bpf_get_current_pid_tgid();
 
-               if (delta > max)
-                       max = delta;
-               if (delta < min)
-                       min = delta;
+       start = bpf_map_lookup_elem(&functime, &tid);
+       if (start) {
+               update_latency(bpf_ktime_get_ns() - *start);
+               bpf_map_delete_elem(&functime, &tid);
        }
 
        return 0;
 
        struct list_head        notrace;
        struct list_head        graph_funcs;
        struct list_head        nograph_funcs;
+       struct list_head        event_pair;
        struct hashmap          *profile_hash;
        unsigned long           percpu_buffer_size;
        bool                    inherit;