count = &zero;
 
        switch (aggr_mode) {
+       case AGGR_THREAD:
        case AGGR_CORE:
        case AGGR_SOCKET:
        case AGGR_NONE:
                        csv_output ? 0 : -4,
                        perf_evsel__cpus(evsel)->map[id], csv_sep);
                break;
+       case AGGR_THREAD:
+               fprintf(output, "%*s-%*d%s",
+                       csv_output ? 0 : 16,
+                       thread_map__comm(evsel->threads, id),
+                       csv_output ? 0 : -8,
+                       thread_map__pid(evsel->threads, id),
+                       csv_sep);
+               break;
        case AGGR_GLOBAL:
        default:
                break;
        }
 }
 
+/*
+ * Print one output line per thread of @counter.
+ *
+ * For each index in the counter's thread map, the val/ena/run values are
+ * summed over all of the counter's CPUs.  The summed value is multiplied
+ * by the counter's scale and printed, followed by the noise (non-CSV
+ * output only) and the run/enabled information.  When @prefix is
+ * non-NULL it is written at the start of every line.
+ */
+static void print_aggr_thread(struct perf_evsel *counter, char *prefix)
+{
+       int thread_cnt = thread_map__nr(counter->threads);
+       int cpu_cnt = cpu_map__nr(counter->cpus);
+       int t = 0;
+
+       while (t < thread_cnt) {
+               u64 val = 0, ena = 0, run = 0;
+               double scaled;
+               int c;
+
+               /* Fold this thread's counts across every CPU. */
+               for (c = 0; c < cpu_cnt; c++) {
+                       val += perf_counts(counter->counts, c, t)->val;
+                       ena += perf_counts(counter->counts, c, t)->ena;
+                       run += perf_counts(counter->counts, c, t)->run;
+               }
+               scaled = val * counter->scale;
+
+               if (prefix)
+                       fprintf(output, "%s", prefix);
+
+               /* The thread index is handed to the printout helper as the id. */
+               if (!nsec_counter(counter))
+                       abs_printout(t, 0, counter, scaled);
+               else
+                       nsec_printout(t, 0, counter, scaled);
+
+               if (!csv_output)
+                       print_noise(counter, 1.0);
+
+               print_running(run, ena);
+               fputc('\n', output);
+
+               t++;
+       }
+}
+
 /*
  * Print out the results of a single counter:
  * aggregated counts in system-wide mode
                case AGGR_NONE:
                        fprintf(output, "#           time CPU                counts %*s events\n", unit_width, "unit");
                        break;
+               case AGGR_THREAD:
+                       fprintf(output, "#           time             comm-pid                  counts %*s events\n", unit_width, "unit");
+                       break;
                case AGGR_GLOBAL:
                default:
                        fprintf(output, "#           time             counts %*s events\n", unit_width, "unit");
        case AGGR_SOCKET:
                print_aggr(prefix);
                break;
+       case AGGR_THREAD:
+               evlist__for_each(evsel_list, counter)
+                       print_aggr_thread(counter, prefix);
+               break;
        case AGGR_GLOBAL:
                evlist__for_each(evsel_list, counter)
                        print_counter_aggr(counter, prefix);
                break;
        case AGGR_NONE:
        case AGGR_GLOBAL:
+       case AGGR_THREAD:
        default:
                break;
        }
                     "aggregate counts per processor socket", AGGR_SOCKET),
        OPT_SET_UINT(0, "per-core", &aggr_mode,
                     "aggregate counts per physical processor core", AGGR_CORE),
+       OPT_SET_UINT(0, "per-thread", &aggr_mode,
+                    "aggregate counts per thread", AGGR_THREAD),
        OPT_UINTEGER('D', "delay", &initial_delay,
                     "ms to wait before starting measurement after program start"),
        OPT_END()
                run_count = 1;
        }
 
-       /* no_aggr, cgroup are for system-wide only */
-       if ((aggr_mode != AGGR_GLOBAL || nr_cgroups) &&
+       if ((aggr_mode == AGGR_THREAD) && !target__has_task(&target)) {
+               fprintf(stderr, "The --per-thread option is only available "
+                       "when monitoring via -p -t options.\n");
+               parse_options_usage(NULL, options, "p", 1);
+               parse_options_usage(NULL, options, "t", 1);
+               goto out;
+       }
+
+       /*
+        * no_aggr, cgroup are for system-wide only
+        * --per-thread is aggregated per thread, we don't mix it with cpu mode
+        */
+       if (((aggr_mode != AGGR_GLOBAL && aggr_mode != AGGR_THREAD) || nr_cgroups) &&
            !target__has_cpu(&target)) {
                fprintf(stderr, "both cgroup and no-aggregation "
                        "modes only available in system-wide mode\n");
                }
                goto out;
        }
+
+       /*
+        * Initialize thread_map with comm names,
+        * so that they can be printed in the output.
+        */
+       if (aggr_mode == AGGR_THREAD)
+               thread_map__read_comms(evsel_list->threads);
+
        if (interval && interval < 100) {
                pr_err("print interval must be >= 100ms\n");
                parse_options_usage(stat_usage, options, "I", 1);