struct thread *unknown_thread;
        struct perf_tsc_conversion tc;
 
+       /*
+        * Timeless has no timestamps in the trace so overlapping mmap lookups
+        * are less accurate, but the trace data is smaller. We use context IDs
+        * in the trace instead of matching timestamps with fork records, so
+        * timestamps aren't really needed in the general case. Overlapping
+        * mmaps happen in cases like between a fork and an exec.
+        */
        bool timeless_decoding;
+
+       /*
+        * Per-thread ignores the trace channel ID and instead assumes that
+        * everything in a buffer comes from the same process regardless of
+        * which CPU it ran on. It also implies no context IDs, so the TID is
+        * taken from the auxtrace buffer.
+        */
+       bool per_thread_decoding;
        bool snapshot_mode;
        bool data_queued;
        bool has_virtual_ts; /* Virtual/Kernel timestamps in the trace. */
 /* RB tree for quick conversion between traceID and metadata pointers */
 static struct intlist *traceid_list;
 
-static int cs_etm__process_queues(struct cs_etm_auxtrace *etm);
+static int cs_etm__process_timestamped_queues(struct cs_etm_auxtrace *etm);
 static int cs_etm__process_timeless_queues(struct cs_etm_auxtrace *etm,
                                           pid_t tid);
 static int cs_etm__get_data_block(struct cs_etm_queue *etmq);
        struct cs_etm_traceid_queue *tidq, **traceid_queues;
        struct cs_etm_auxtrace *etm = etmq->etm;
 
-       if (etm->timeless_decoding)
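+       /*
+        * In per-thread mode the trace channel ID is ignored and everything
+        * goes into the single CS_ETM_PER_THREAD_TRACEID queue.
+        */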
+       if (etm->per_thread_decoding)
                trace_chan_id = CS_ETM_PER_THREAD_TRACEID;
 
        traceid_queues_list = etmq->traceid_queues_list;
        if (!tool->ordered_events)
                return -EINVAL;
 
-       if (etm->timeless_decoding)
+       if (etm->timeless_decoding) {
+               /*
+                * Pass tid = -1 to process all queues, though they have likely
+                * already been processed on PERF_RECORD_EXIT anyway.
+                */
                return cs_etm__process_timeless_queues(etm, -1);
+       }
 
-       return cs_etm__process_queues(etm);
+       return cs_etm__process_timestamped_queues(etm);
 }
 
 static void cs_etm__free_traceid_queues(struct cs_etm_queue *etmq)
         * chronological order.
         *
         * Note that packets decoded above are still in the traceID's packet
-        * queue and will be processed in cs_etm__process_queues().
+        * queue and will be processed in cs_etm__process_timestamped_queues().
         */
        cs_queue_nr = TO_CS_QUEUE_NR(queue_nr, trace_chan_id);
        ret = auxtrace_heap__add(&etm->heap, cs_queue_nr, cs_timestamp);
        struct cs_etm_auxtrace *etm = etmq->etm;
        struct cs_etm_packet_queue *packet_queue = &tidq->packet_queue;
 
-       if (etm->timeless_decoding)
-               return 0;
-       else if (etm->has_virtual_ts)
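+       /*
+        * Use the decoded packet timestamp when the trace has usable
+        * (virtual) timestamps, otherwise fall back to the latest kernel
+        * timestamp seen on a PERF_RECORD_AUX event.
+        */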
+       if (!etm->timeless_decoding && etm->has_virtual_ts)
                return packet_queue->cs_timestamp;
        else
                return etm->latest_kernel_timestamp;
        }
 }
 
-static int cs_etm__run_decoder(struct cs_etm_queue *etmq)
+static int cs_etm__run_per_thread_timeless_decoder(struct cs_etm_queue *etmq)
 {
        int err = 0;
        struct cs_etm_traceid_queue *tidq;
        return err;
 }
 
+static int cs_etm__run_per_cpu_timeless_decoder(struct cs_etm_queue *etmq)
+{
+       int idx, err = 0;
+       struct cs_etm_traceid_queue *tidq;
+       struct int_node *inode;
+
+       /* Go through each buffer in the queue and decode them one by one */
+       while (1) {
+               err = cs_etm__get_data_block(etmq);
+               if (err <= 0)
+                       return err;
+
+               /* Run trace decoder until buffer consumed or end of trace */
+               do {
+                       err = cs_etm__decode_data_block(etmq);
+                       if (err)
+                               return err;
+
+                       /*
+                        * cs_etm__run_per_thread_timeless_decoder() runs on a
+                        * single traceID queue because each TID has a separate
+                        * buffer. But here in per-cpu mode we need to iterate
+                        * over each channel instead.
+                        */
+                       intlist__for_each_entry(inode,
+                                               etmq->traceid_queues_list) {
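+                               /* priv is the index into traceid_queues */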
+                               idx = (int)(intptr_t)inode->priv;
+                               tidq = etmq->traceid_queues[idx];
+                               cs_etm__process_traceid_queue(etmq, tidq);
+                       }
+               } while (etmq->buf_len);
+
+               intlist__for_each_entry(inode, etmq->traceid_queues_list) {
+                       idx = (int)(intptr_t)inode->priv;
+                       tidq = etmq->traceid_queues[idx];
+                       /* Flush any remaining branch stack entries */
+                       err = cs_etm__end_block(etmq, tidq);
+                       if (err)
+                               return err;
+               }
+       }
+
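+       /* Not reached: the loop above only exits via return */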
+       return err;
+}
+
 static int cs_etm__process_timeless_queues(struct cs_etm_auxtrace *etm,
                                           pid_t tid)
 {
                if (!etmq)
                        continue;
 
-               tidq = cs_etm__etmq_get_traceid_queue(etmq,
-                                               CS_ETM_PER_THREAD_TRACEID);
+               /*
+        * Per-cpu mode has context IDs in the trace and the decoder calls
+        * cs_etm__set_pid_tid_cpu() automatically, so there is no need to
+        * do it here.
+                */
+               if (etm->per_thread_decoding) {
+                       tidq = cs_etm__etmq_get_traceid_queue(
+                               etmq, CS_ETM_PER_THREAD_TRACEID);
 
-               if (!tidq)
-                       continue;
+                       if (!tidq)
+                               continue;
 
-               if ((tid == -1) || (tidq->tid == tid)) {
-                       cs_etm__set_pid_tid_cpu(etm, tidq);
-                       cs_etm__run_decoder(etmq);
-               }
+                       if ((tid == -1) || (tidq->tid == tid)) {
+                               cs_etm__set_pid_tid_cpu(etm, tidq);
+                               cs_etm__run_per_thread_timeless_decoder(etmq);
+                       }
+               } else {
+                       cs_etm__run_per_cpu_timeless_decoder(etmq);
+               }
        }
 
        return 0;
 }
 
-static int cs_etm__process_queues(struct cs_etm_auxtrace *etm)
+static int cs_etm__process_timestamped_queues(struct cs_etm_auxtrace *etm)
 {
        int ret = 0;
        unsigned int cs_queue_nr, queue_nr, i;
                                 struct perf_sample *sample,
                                 struct perf_tool *tool)
 {
-       u64 sample_kernel_timestamp;
        struct cs_etm_auxtrace *etm = container_of(session->auxtrace,
                                                   struct cs_etm_auxtrace,
                                                   auxtrace);
                return -EINVAL;
        }
 
-       if (sample->time && (sample->time != (u64) -1))
-               sample_kernel_timestamp = sample->time;
-       else
-               sample_kernel_timestamp = 0;
-
-       /*
-        * Don't wait for cs_etm__flush_events() in per-thread/timeless mode to start the decode. We
-        * need the tid of the PERF_RECORD_EXIT event to assign to the synthesised samples because
-        * ETM_OPT_CTXTID is not enabled.
-        */
-       if (etm->timeless_decoding &&
-           event->header.type == PERF_RECORD_EXIT)
-               return cs_etm__process_timeless_queues(etm,
-                                                      event->fork.tid);
+       switch (event->header.type) {
+       case PERF_RECORD_EXIT:
+               /*
+                * Don't need to wait for cs_etm__flush_events() in per-thread mode to
+                * start the decode because we know there will be no more trace from
+                * this thread. All this does is emit samples earlier than waiting for
+                * the flush in other modes, but with timestamps it makes sense to wait
+                * for flush so that events from different threads are interleaved
+                * for the flush so that events from different threads are interleaved
+                */
+               if (etm->per_thread_decoding && etm->timeless_decoding)
+                       return cs_etm__process_timeless_queues(etm,
+                                                              event->fork.tid);
+               break;
 
-       if (event->header.type == PERF_RECORD_ITRACE_START)
+       case PERF_RECORD_ITRACE_START:
                return cs_etm__process_itrace_start(etm, event);
-       else if (event->header.type == PERF_RECORD_SWITCH_CPU_WIDE)
+
+       case PERF_RECORD_SWITCH_CPU_WIDE:
                return cs_etm__process_switch_cpu_wide(etm, event);
 
-       if (!etm->timeless_decoding && event->header.type == PERF_RECORD_AUX) {
+       case PERF_RECORD_AUX:
                /*
                 * Record the latest kernel timestamp available in the header
                 * for samples so that synthesised samples occur from this point
                 * onwards.
                 */
-               etm->latest_kernel_timestamp = sample_kernel_timestamp;
+               if (sample->time && (sample->time != (u64)-1))
+                       etm->latest_kernel_timestamp = sample->time;
+               break;
+
+       default:
+               break;
        }
 
        return 0;
         * Return 'not found' if mismatch.
         */
        if (auxtrace_event->cpu == (__u32) -1) {
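+               /*
+                * Auxtrace buffers recorded with cpu == -1 come from
+                * per-thread mode, so latch that for the decode stage.
+                */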
+               etm->per_thread_decoding = true;
                if (auxtrace_event->tid != sample->tid)
                        return 1;
-       } else if (auxtrace_event->cpu != sample->cpu)
+       } else if (auxtrace_event->cpu != sample->cpu) {
+               if (etm->per_thread_decoding) {
+                       /*
+                        * Found a per-cpu buffer after a per-thread one was
+                        * already found
+                        */
+                       pr_err("CS ETM: Inconsistent per-thread/per-cpu mode.\n");
+                       return -EINVAL;
+               }
                return 1;
+       }
 
        if (aux_event->flags & PERF_AUX_FLAG_OVERWRITE) {
                /*