                        continue;
                intel_bts_get_branch_type(btsq, branch);
                if (btsq->bts->synth_opts.thread_stack)
-                       thread_stack__event(thread, btsq->sample_flags,
+                       thread_stack__event(thread, btsq->cpu, btsq->sample_flags,
                                            le64_to_cpu(branch->from),
                                            le64_to_cpu(branch->to),
                                            btsq->intel_pt_insn.length,
            !btsq->bts->synth_opts.thread_stack && thread &&
            (!old_buffer || btsq->bts->sampling_mode ||
             (btsq->bts->snapshot_mode && !buffer->consecutive)))
-               thread_stack__set_trace_nr(thread, buffer->buffer_nr + 1);
+               thread_stack__set_trace_nr(thread, btsq->cpu, buffer->buffer_nr + 1);
 
        err = intel_bts_process_buffer(btsq, buffer, thread);
 
 
        intel_pt_prep_b_sample(pt, ptq, event, sample);
 
        if (pt->synth_opts.callchain) {
-               thread_stack__sample(ptq->thread, ptq->chain,
+               thread_stack__sample(ptq->thread, ptq->cpu, ptq->chain,
                                     pt->synth_opts.callchain_sz + 1,
                                     sample->ip, pt->kernel_start);
                sample->callchain = ptq->chain;
                return 0;
 
        if (pt->synth_opts.callchain || pt->synth_opts.thread_stack)
-               thread_stack__event(ptq->thread, ptq->flags, state->from_ip,
+               thread_stack__event(ptq->thread, ptq->cpu, ptq->flags, state->from_ip,
                                    state->to_ip, ptq->insn_len,
                                    state->trace_nr);
        else
-               thread_stack__set_trace_nr(ptq->thread, state->trace_nr);
+               thread_stack__set_trace_nr(ptq->thread, ptq->cpu, state->trace_nr);
 
        if (pt->sample_branches) {
                err = intel_pt_synth_branch_sample(ptq);
 
 
 #include <linux/rbtree.h>
 #include <linux/list.h>
+#include <linux/log2.h>
 #include <errno.h>
 #include "thread.h"
 #include "event.h"
        unsigned int arr_sz;
 };
 
+/*
+ * Assume pid == tid == 0 identifies the idle task as defined by
+ * perf_session__register_idle_thread(). The idle task is really 1 task per cpu,
+ * and therefore requires a stack for each cpu.
+ */
+static inline bool thread_stack__per_cpu(struct thread *thread)
+{
+       return !(thread->tid || thread->pid_);
+}
+
 static int thread_stack__grow(struct thread_stack *ts)
 {
        struct thread_stack_entry *new_stack;
        return 0;
 }
 
-static struct thread_stack *thread_stack__new(struct thread *thread,
+static struct thread_stack *thread_stack__new(struct thread *thread, int cpu,
                                              struct call_return_processor *crp)
 {
        struct thread_stack *ts = thread->ts, *new_ts;
        unsigned int old_sz = ts ? ts->arr_sz : 0;
        unsigned int new_sz = 1;
 
+       if (thread_stack__per_cpu(thread) && cpu > 0)
+               new_sz = roundup_pow_of_two(cpu + 1);
+
        if (!ts || new_sz > old_sz) {
                new_ts = calloc(new_sz, sizeof(*ts));
                if (!new_ts)
                ts = new_ts;
        }
 
+       if (thread_stack__per_cpu(thread) && cpu > 0 &&
+           (unsigned int)cpu < ts->arr_sz)
+               ts += cpu;
+
        if (!ts->stack &&
            thread_stack__init(ts, thread, crp))
                return NULL;
        return ts;
 }
 
-static inline struct thread_stack *thread__stack(struct thread *thread)
+static struct thread_stack *thread__cpu_stack(struct thread *thread, int cpu)
 {
-       return thread ? thread->ts : NULL;
+       struct thread_stack *ts = thread->ts;
+
+       if (cpu < 0)
+               cpu = 0;
+
+       if (!ts || (unsigned int)cpu >= ts->arr_sz)
+               return NULL;
+
+       ts += cpu;
+
+       if (!ts->stack)
+               return NULL;
+
+       return ts;
+}
+
+static inline struct thread_stack *thread__stack(struct thread *thread,
+                                                   int cpu)
+{
+       if (!thread)
+               return NULL;
+
+       if (thread_stack__per_cpu(thread))
+               return thread__cpu_stack(thread, cpu);
+
+       return thread->ts;
 }
 
 static int thread_stack__push(struct thread_stack *ts, u64 ret_addr,
        return err;
 }
 
-int thread_stack__event(struct thread *thread, u32 flags, u64 from_ip,
+int thread_stack__event(struct thread *thread, int cpu, u32 flags, u64 from_ip,
                        u64 to_ip, u16 insn_len, u64 trace_nr)
 {
-       struct thread_stack *ts = thread__stack(thread);
+       struct thread_stack *ts = thread__stack(thread, cpu);
 
        if (!thread)
                return -EINVAL;
 
        if (!ts) {
-               ts = thread_stack__new(thread, NULL);
+               ts = thread_stack__new(thread, cpu, NULL);
                if (!ts) {
                        pr_warning("Out of memory: no thread stack\n");
                        return -ENOMEM;
        return 0;
 }
 
-void thread_stack__set_trace_nr(struct thread *thread, u64 trace_nr)
+void thread_stack__set_trace_nr(struct thread *thread, int cpu, u64 trace_nr)
 {
-       struct thread_stack *ts = thread__stack(thread);
+       struct thread_stack *ts = thread__stack(thread, cpu);
 
        if (!ts)
                return;
        return ip < kernel_start ? PERF_CONTEXT_USER : PERF_CONTEXT_KERNEL;
 }
 
-void thread_stack__sample(struct thread *thread, struct ip_callchain *chain,
+void thread_stack__sample(struct thread *thread, int cpu,
+                         struct ip_callchain *chain,
                          size_t sz, u64 ip, u64 kernel_start)
 {
-       struct thread_stack *ts = thread__stack(thread);
+       struct thread_stack *ts = thread__stack(thread, cpu);
        u64 context = callchain_context(ip, kernel_start);
        u64 last_context;
        size_t i, j;
                          struct addr_location *to_al, u64 ref,
                          struct call_return_processor *crp)
 {
-       struct thread_stack *ts = thread__stack(thread);
+       struct thread_stack *ts = thread__stack(thread, sample->cpu);
        int err = 0;
 
        if (ts && !ts->crp) {
        }
 
        if (!ts) {
-               ts = thread_stack__new(thread, crp);
+               ts = thread_stack__new(thread, sample->cpu, crp);
                if (!ts)
                        return -ENOMEM;
                ts->comm = comm;
        return err;
 }
 
-size_t thread_stack__depth(struct thread *thread)
+size_t thread_stack__depth(struct thread *thread, int cpu)
 {
-       struct thread_stack *ts = thread__stack(thread);
+       struct thread_stack *ts = thread__stack(thread, cpu);
 
        if (!ts)
                return 0;
 
        void *data;
 };
 
-int thread_stack__event(struct thread *thread, u32 flags, u64 from_ip,
+int thread_stack__event(struct thread *thread, int cpu, u32 flags, u64 from_ip,
                        u64 to_ip, u16 insn_len, u64 trace_nr);
-void thread_stack__set_trace_nr(struct thread *thread, u64 trace_nr);
-void thread_stack__sample(struct thread *thread, struct ip_callchain *chain,
+void thread_stack__set_trace_nr(struct thread *thread, int cpu, u64 trace_nr);
+void thread_stack__sample(struct thread *thread, int cpu, struct ip_callchain *chain,
                          size_t sz, u64 ip, u64 kernel_start);
 int thread_stack__flush(struct thread *thread);
 void thread_stack__free(struct thread *thread);
-size_t thread_stack__depth(struct thread *thread);
+size_t thread_stack__depth(struct thread *thread, int cpu);
 
 struct call_return_processor *
 call_return_processor__new(int (*process)(struct call_return *cr, void *data),
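For reference, a minimal standalone sketch (not part of the patch) of how the idle-task stacks end up sized and indexed by thread_stack__new() and thread__cpu_stack() above; roundup_pow_of_two_ish() is a local stand-in for roundup_pow_of_two() from tools' <linux/log2.h>:

/*
 * Illustrative only. For the idle task (pid == tid == 0) the stack array
 * is grown to a power of two large enough to hold an entry per cpu, and
 * the cpu number indexes the array directly.
 */
#include <stdio.h>

static unsigned int roundup_pow_of_two_ish(unsigned int n)
{
	unsigned int p = 1;

	while (p < n)
		p <<= 1;
	return p;
}

int main(void)
{
	int cpus[] = { -1, 0, 1, 5, 8 };
	unsigned int i;

	for (i = 0; i < sizeof(cpus) / sizeof(cpus[0]); i++) {
		int cpu = cpus[i];
		unsigned int arr_sz = 1;

		/* Same rule as thread_stack__new(): grow only for cpu > 0 */
		if (cpu > 0)
			arr_sz = roundup_pow_of_two_ish(cpu + 1);

		/* thread__cpu_stack() treats cpu < 0 as cpu 0 */
		printf("cpu %2d -> arr_sz %u, slot %d\n",
		       cpu, arr_sz, cpu < 0 ? 0 : cpu);
	}
	return 0;
}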