struct perf_event_attr *attr, FILE *fp)
 {
        struct branch_stack *br = sample->branch_stack;
+       struct branch_entry *entries = perf_sample__branch_entries(sample);
        struct addr_location alf, alt;
        u64 i, from, to;
        int printed = 0;
                return 0;
 
        for (i = 0; i < br->nr; i++) {
-               from = br->entries[i].from;
-               to   = br->entries[i].to;
+               from = entries[i].from;
+               to   = entries[i].to;
 
                if (PRINT_FIELD(DSO)) {
                        memset(&alf, 0, sizeof(alf));
                }
 
                printed += fprintf(fp, "/%c/%c/%c/%d ",
-                       mispred_str( br->entries + i),
-                       br->entries[i].flags.in_tx? 'X' : '-',
-                       br->entries[i].flags.abort? 'A' : '-',
-                       br->entries[i].flags.cycles);
+                       mispred_str(entries + i),
+                       entries[i].flags.in_tx ? 'X' : '-',
+                       entries[i].flags.abort ? 'A' : '-',
+                       entries[i].flags.cycles);
        }
 
        return printed;
                                           struct perf_event_attr *attr, FILE *fp)
 {
        struct branch_stack *br = sample->branch_stack;
+       struct branch_entry *entries = perf_sample__branch_entries(sample);
        struct addr_location alf, alt;
        u64 i, from, to;
        int printed = 0;
 
                memset(&alf, 0, sizeof(alf));
                memset(&alt, 0, sizeof(alt));
-               from = br->entries[i].from;
-               to   = br->entries[i].to;
+               from = entries[i].from;
+               to   = entries[i].to;
 
                thread__find_symbol_fb(thread, sample->cpumode, from, &alf);
                thread__find_symbol_fb(thread, sample->cpumode, to, &alt);
                        printed += fprintf(fp, ")");
                }
                printed += fprintf(fp, "/%c/%c/%c/%d ",
-                       mispred_str( br->entries + i),
-                       br->entries[i].flags.in_tx? 'X' : '-',
-                       br->entries[i].flags.abort? 'A' : '-',
-                       br->entries[i].flags.cycles);
+                       mispred_str(entries + i),
+                       entries[i].flags.in_tx ? 'X' : '-',
+                       entries[i].flags.abort ? 'A' : '-',
+                       entries[i].flags.cycles);
        }
 
        return printed;
                                           struct perf_event_attr *attr, FILE *fp)
 {
        struct branch_stack *br = sample->branch_stack;
+       struct branch_entry *entries = perf_sample__branch_entries(sample);
        struct addr_location alf, alt;
        u64 i, from, to;
        int printed = 0;
 
                memset(&alf, 0, sizeof(alf));
                memset(&alt, 0, sizeof(alt));
-               from = br->entries[i].from;
-               to   = br->entries[i].to;
+               from = entries[i].from;
+               to   = entries[i].to;
 
                if (thread__find_map_fb(thread, sample->cpumode, from, &alf) &&
                    !alf.map->dso->adjust_symbols)
                        printed += fprintf(fp, ")");
                }
                printed += fprintf(fp, "/%c/%c/%c/%d ",
-                       mispred_str(br->entries + i),
-                       br->entries[i].flags.in_tx ? 'X' : '-',
-                       br->entries[i].flags.abort ? 'A' : '-',
-                       br->entries[i].flags.cycles);
+                       mispred_str(entries + i),
+                       entries[i].flags.in_tx ? 'X' : '-',
+                       entries[i].flags.abort ? 'A' : '-',
+                       entries[i].flags.cycles);
        }
 
        return printed;
                                            struct machine *machine, FILE *fp)
 {
        struct branch_stack *br = sample->branch_stack;
+       struct branch_entry *entries = perf_sample__branch_entries(sample);
        u64 start, end;
        int i, insn, len, nr, ilen, printed = 0;
        struct perf_insn x;
        printed += fprintf(fp, "%c", '\n');
 
        /* Handle first from jump, of which we don't know the entry. */
-       len = grab_bb(buffer, br->entries[nr-1].from,
-                       br->entries[nr-1].from,
+       len = grab_bb(buffer, entries[nr-1].from,
+                       entries[nr-1].from,
                        machine, thread, &x.is64bit, &x.cpumode, false);
        if (len > 0) {
-               printed += ip__fprintf_sym(br->entries[nr - 1].from, thread,
+               printed += ip__fprintf_sym(entries[nr - 1].from, thread,
                                           x.cpumode, x.cpu, &lastsym, attr, fp);
-               printed += ip__fprintf_jump(br->entries[nr - 1].from, &br->entries[nr - 1],
+               printed += ip__fprintf_jump(entries[nr - 1].from, &entries[nr - 1],
                                            &x, buffer, len, 0, fp, &total_cycles);
                if (PRINT_FIELD(SRCCODE))
-                       printed += print_srccode(thread, x.cpumode, br->entries[nr - 1].from);
+                       printed += print_srccode(thread, x.cpumode, entries[nr - 1].from);
        }
 
        /* Print all blocks */
        for (i = nr - 2; i >= 0; i--) {
-               if (br->entries[i].from || br->entries[i].to)
+               if (entries[i].from || entries[i].to)
                        pr_debug("%d: %" PRIx64 "-%" PRIx64 "\n", i,
-                                br->entries[i].from,
-                                br->entries[i].to);
-               start = br->entries[i + 1].to;
-               end   = br->entries[i].from;
+                                entries[i].from,
+                                entries[i].to);
+               start = entries[i + 1].to;
+               end   = entries[i].from;
 
                len = grab_bb(buffer, start, end, machine, thread, &x.is64bit, &x.cpumode, false);
                /* Patch up missing kernel transfers due to ring filters */
                if (len == -ENXIO && i > 0) {
-                       end = br->entries[--i].from;
+                       end = entries[--i].from;
                        pr_debug("\tpatching up to %" PRIx64 "-%" PRIx64 "\n", start, end);
                        len = grab_bb(buffer, start, end, machine, thread, &x.is64bit, &x.cpumode, false);
                }
 
                        printed += ip__fprintf_sym(ip, thread, x.cpumode, x.cpu, &lastsym, attr, fp);
                        if (ip == end) {
-                               printed += ip__fprintf_jump(ip, &br->entries[i], &x, buffer + off, len - off, ++insn, fp,
+                               printed += ip__fprintf_jump(ip, &entries[i], &x, buffer + off, len - off, ++insn, fp,
                                                            &total_cycles);
                                if (PRINT_FIELD(SRCCODE))
                                        printed += print_srccode(thread, x.cpumode, ip);
         * Hit the branch? In this case we are already done, and the target
         * has not been executed yet.
         */
-       if (br->entries[0].from == sample->ip)
+       if (entries[0].from == sample->ip)
                goto out;
-       if (br->entries[0].flags.abort)
+       if (entries[0].flags.abort)
                goto out;
 
        /*
         * between final branch and sample. When this happens just
         * continue walking after the last TO until we hit a branch.
         */
-       start = br->entries[0].to;
+       start = entries[0].to;
        end = sample->ip;
        if (end < start) {
                /* Missing jump. Scan 128 bytes for the next branch */
 
 
        if (type & PERF_SAMPLE_BRANCH_STACK) {
                COMP(branch_stack->nr);
+               COMP(branch_stack->hw_idx);
                for (i = 0; i < s1->branch_stack->nr; i++)
                        MCOMP(branch_stack->entries[i]);
        }
                u64 data[64];
        } branch_stack = {
                /* 1 branch_entry */
-               .data = {1, 211, 212, 213},
+               .data = {1, -1ULL, 211, 212, 213},
        };
        u64 regs[64];
        const u64 raw_data[] = {0x123456780a0b0c0dULL, 0x1102030405060708ULL};
                .transaction    = 112,
                .raw_data       = (void *)raw_data,
                .callchain      = &callchain.callchain,
+               .no_hw_idx      = false,
                .branch_stack   = &branch_stack.branch_stack,
                .user_regs      = {
                        .abi    = PERF_SAMPLE_REGS_ABI_64,
        if (sample_type & PERF_SAMPLE_REGS_INTR)
                evsel.core.attr.sample_regs_intr = sample_regs;
 
+       if (sample_type & PERF_SAMPLE_BRANCH_STACK)
+               evsel.core.attr.branch_sample_type |= PERF_SAMPLE_BRANCH_HW_INDEX;
+
        for (i = 0; i < sizeof(regs); i++)
                *(i + (u8 *)regs) = i & 0xfe;
 
 
 #include <linux/stddef.h>
 #include <linux/perf_event.h>
 #include <linux/types.h>
+#include "event.h"
 
 struct branch_flags {
        u64 mispred:1;
 
 struct branch_stack {
        u64                     nr;
+       u64                     hw_idx;         /* only in the sample data when PERF_SAMPLE_BRANCH_HW_INDEX is applied */
        struct branch_entry     entries[0];
 };
 
+/*
+ * Locate the first branch entry of a sample's branch stack.
+ *
+ * The hw_idx field is only recorded when PERF_SAMPLE_BRANCH_HW_INDEX is
+ * applied. Without it, the branch stack of a sample is laid out as
+ * struct branch_stack {
+ *     u64                     nr;
+ *     struct branch_entry     entries[0];
+ * }
+ * so the entries begin one u64 (nr) into the data instead of two
+ * (nr, hw_idx). sample->no_hw_idx selects which of the two layouts is
+ * used to compute the returned pointer to entries[0].
+ */
+static inline struct branch_entry *perf_sample__branch_entries(struct perf_sample *sample)
+{
+       u64 *base = (u64 *)sample->branch_stack;
+       /* nr is always skipped; hw_idx is skipped only when it was collected */
+       int skip = sample->no_hw_idx ? 1 : 2;
+
+       return (struct branch_entry *)(base + skip);
+}
+
 struct branch_type_stat {
        bool    branch_to;
        u64     counts[PERF_BR_MAX];
 
        union perf_event *event = tidq->event_buf;
        struct dummy_branch_stack {
                u64                     nr;
+               u64                     hw_idx;
                struct branch_entry     entries;
        } dummy_bs;
        u64 ip;
        if (etm->synth_opts.last_branch) {
                dummy_bs = (struct dummy_branch_stack){
                        .nr = 1,
+                       .hw_idx = -1ULL,
                        .entries = {
                                .from = sample.ip,
                                .to = sample.addr,
 
        u16 insn_len;
        u8  cpumode;
        u16 misc;
+       bool no_hw_idx;         /* No hw_idx collected in branch_stack */
        char insn[MAX_INSN];
        void *raw_data;
        struct ip_callchain *callchain;
 
 
                if (data->branch_stack->nr > max_branch_nr)
                        return -EFAULT;
+
                sz = data->branch_stack->nr * sizeof(struct branch_entry);
+               if (perf_evsel__has_branch_hw_idx(evsel))
+                       sz += sizeof(u64);
+               else
+                       data->no_hw_idx = true;
                OVERFLOW_CHECK(array, sz, max_size);
                array = (void *)array + sz;
        }
 
        return evsel->core.attr.branch_sample_type & PERF_SAMPLE_BRANCH_CALL_STACK;
 }
 
+/*
+ * Report whether the PERF_SAMPLE_BRANCH_HW_INDEX bit is set in this
+ * evsel's attr.branch_sample_type.
+ */
+static inline bool perf_evsel__has_branch_hw_idx(const struct evsel *evsel)
+{
+       return (evsel->core.attr.branch_sample_type & PERF_SAMPLE_BRANCH_HW_INDEX) != 0;
+}
+
 static inline bool evsel__has_callchain(const struct evsel *evsel)
 {
        return (evsel->core.attr.sample_type & PERF_SAMPLE_CALLCHAIN) != 0;
 
                          u64 *total_cycles)
 {
        struct branch_info *bi;
+       struct branch_entry *entries = perf_sample__branch_entries(sample);
 
        /* If we have branch cycles always annotate them. */
-       if (bs && bs->nr && bs->entries[0].flags.cycles) {
+       if (bs && bs->nr && entries[0].flags.cycles) {
                int i;
 
                bi = sample__resolve_bstack(sample, al);
 
        struct perf_sample sample = { .ip = 0, };
        struct dummy_branch_stack {
                u64                     nr;
+               u64                     hw_idx;
                struct branch_entry     entries;
        } dummy_bs;
 
        if (pt->synth_opts.last_branch && sort__mode == SORT_MODE__BRANCH) {
                dummy_bs = (struct dummy_branch_stack){
                        .nr = 1,
+                       .hw_idx = -1ULL,
                        .entries = {
                                .from = sample.ip,
                                .to = sample.addr,
 
 {
        unsigned int i;
        const struct branch_stack *bs = sample->branch_stack;
+       struct branch_entry *entries = perf_sample__branch_entries(sample);
        struct branch_info *bi = calloc(bs->nr, sizeof(struct branch_info));
 
        if (!bi)
                return NULL;
 
        for (i = 0; i < bs->nr; i++) {
-               ip__resolve_ams(al->thread, &bi[i].to, bs->entries[i].to);
-               ip__resolve_ams(al->thread, &bi[i].from, bs->entries[i].from);
-               bi[i].flags = bs->entries[i].flags;
+               ip__resolve_ams(al->thread, &bi[i].to, entries[i].to);
+               ip__resolve_ams(al->thread, &bi[i].from, entries[i].from);
+               bi[i].flags = entries[i].flags;
        }
        return bi;
 }
        /* LBR only affects the user callchain */
        if (i != chain_nr) {
                struct branch_stack *lbr_stack = sample->branch_stack;
+               struct branch_entry *entries = perf_sample__branch_entries(sample);
                int lbr_nr = lbr_stack->nr, j, k;
                bool branch;
                struct branch_flags *flags;
                                        ip = chain->ips[j];
                                else if (j > i + 1) {
                                        k = j - i - 2;
-                                       ip = lbr_stack->entries[k].from;
+                                       ip = entries[k].from;
                                        branch = true;
-                                       flags = &lbr_stack->entries[k].flags;
+                                       flags = &entries[k].flags;
                                } else {
-                                       ip = lbr_stack->entries[0].to;
+                                       ip = entries[0].to;
                                        branch = true;
-                                       flags = &lbr_stack->entries[0].flags;
-                                       branch_from =
-                                               lbr_stack->entries[0].from;
+                                       flags = &entries[0].flags;
+                                       branch_from = entries[0].from;
                                }
                        } else {
                                if (j < lbr_nr) {
                                        k = lbr_nr - j - 1;
-                                       ip = lbr_stack->entries[k].from;
+                                       ip = entries[k].from;
                                        branch = true;
-                                       flags = &lbr_stack->entries[k].flags;
+                                       flags = &entries[k].flags;
                                }
                                else if (j > lbr_nr)
                                        ip = chain->ips[i + 1 - (j - lbr_nr)];
                                else {
-                                       ip = lbr_stack->entries[0].to;
+                                       ip = entries[0].to;
                                        branch = true;
-                                       flags = &lbr_stack->entries[0].flags;
-                                       branch_from =
-                                               lbr_stack->entries[0].from;
+                                       flags = &entries[0].flags;
+                                       branch_from = entries[0].from;
                                }
                        }
 
                                            int max_stack)
 {
        struct branch_stack *branch = sample->branch_stack;
+       struct branch_entry *entries = perf_sample__branch_entries(sample);
        struct ip_callchain *chain = sample->callchain;
        int chain_nr = 0;
        u8 cpumode = PERF_RECORD_MISC_USER;
 
                for (i = 0; i < nr; i++) {
                        if (callchain_param.order == ORDER_CALLEE) {
-                               be[i] = branch->entries[i];
+                               be[i] = entries[i];
 
                                if (chain == NULL)
                                        continue;
                                    be[i].from >= chain->ips[first_call] - 8)
                                        first_call++;
                        } else
-                               be[i] = branch->entries[branch->nr - i - 1];
+                               be[i] = entries[branch->nr - i - 1];
                }
 
                memset(iter, 0, sizeof(struct iterations) * nr);
 
                                        struct thread *thread)
 {
        struct branch_stack *br = sample->branch_stack;
+       struct branch_entry *entries = perf_sample__branch_entries(sample);
        PyObject *pylist;
        u64 i;
 
                        Py_FatalError("couldn't create Python dictionary");
 
                pydict_set_item_string_decref(pyelem, "from",
-                   PyLong_FromUnsignedLongLong(br->entries[i].from));
+                   PyLong_FromUnsignedLongLong(entries[i].from));
                pydict_set_item_string_decref(pyelem, "to",
-                   PyLong_FromUnsignedLongLong(br->entries[i].to));
+                   PyLong_FromUnsignedLongLong(entries[i].to));
                pydict_set_item_string_decref(pyelem, "mispred",
-                   PyBool_FromLong(br->entries[i].flags.mispred));
+                   PyBool_FromLong(entries[i].flags.mispred));
                pydict_set_item_string_decref(pyelem, "predicted",
-                   PyBool_FromLong(br->entries[i].flags.predicted));
+                   PyBool_FromLong(entries[i].flags.predicted));
                pydict_set_item_string_decref(pyelem, "in_tx",
-                   PyBool_FromLong(br->entries[i].flags.in_tx));
+                   PyBool_FromLong(entries[i].flags.in_tx));
                pydict_set_item_string_decref(pyelem, "abort",
-                   PyBool_FromLong(br->entries[i].flags.abort));
+                   PyBool_FromLong(entries[i].flags.abort));
                pydict_set_item_string_decref(pyelem, "cycles",
-                   PyLong_FromUnsignedLongLong(br->entries[i].flags.cycles));
+                   PyLong_FromUnsignedLongLong(entries[i].flags.cycles));
 
                thread__find_map_fb(thread, sample->cpumode,
-                                   br->entries[i].from, &al);
+                                   entries[i].from, &al);
                dsoname = get_dsoname(al.map);
                pydict_set_item_string_decref(pyelem, "from_dsoname",
                                              _PyUnicode_FromString(dsoname));
 
                thread__find_map_fb(thread, sample->cpumode,
-                                   br->entries[i].to, &al);
+                                   entries[i].to, &al);
                dsoname = get_dsoname(al.map);
                pydict_set_item_string_decref(pyelem, "to_dsoname",
                                              _PyUnicode_FromString(dsoname));
                                           struct thread *thread)
 {
        struct branch_stack *br = sample->branch_stack;
+       struct branch_entry *entries = perf_sample__branch_entries(sample);
        PyObject *pylist;
        u64 i;
        char bf[512];
                        Py_FatalError("couldn't create Python dictionary");
 
                thread__find_symbol_fb(thread, sample->cpumode,
-                                      br->entries[i].from, &al);
+                                      entries[i].from, &al);
                get_symoff(al.sym, &al, true, bf, sizeof(bf));
                pydict_set_item_string_decref(pyelem, "from",
                                              _PyUnicode_FromString(bf));
 
                thread__find_symbol_fb(thread, sample->cpumode,
-                                      br->entries[i].to, &al);
+                                      entries[i].to, &al);
                get_symoff(al.sym, &al, true, bf, sizeof(bf));
                pydict_set_item_string_decref(pyelem, "to",
                                              _PyUnicode_FromString(bf));
 
-               get_br_mspred(&br->entries[i].flags, bf, sizeof(bf));
+               get_br_mspred(&entries[i].flags, bf, sizeof(bf));
                pydict_set_item_string_decref(pyelem, "pred",
                                              _PyUnicode_FromString(bf));
 
-               if (br->entries[i].flags.in_tx) {
+               if (entries[i].flags.in_tx) {
                        pydict_set_item_string_decref(pyelem, "in_tx",
                                              _PyUnicode_FromString("X"));
                } else {
                                              _PyUnicode_FromString("-"));
                }
 
-               if (br->entries[i].flags.abort) {
+               if (entries[i].flags.abort) {
                        pydict_set_item_string_decref(pyelem, "abort",
                                              _PyUnicode_FromString("A"));
                } else {
 
 {
        struct ip_callchain *callchain = sample->callchain;
        struct branch_stack *lbr_stack = sample->branch_stack;
+       struct branch_entry *entries = perf_sample__branch_entries(sample);
        u64 kernel_callchain_nr = callchain->nr;
        unsigned int i;
 
                               i, callchain->ips[i]);
 
                printf("..... %2d: %016" PRIx64 "\n",
-                      (int)(kernel_callchain_nr), lbr_stack->entries[0].to);
+                      (int)(kernel_callchain_nr), entries[0].to);
                for (i = 0; i < lbr_stack->nr; i++)
                        printf("..... %2d: %016" PRIx64 "\n",
-                              (int)(i + kernel_callchain_nr + 1), lbr_stack->entries[i].from);
+                              (int)(i + kernel_callchain_nr + 1), entries[i].from);
        }
 }
 
 
 static void branch_stack__printf(struct perf_sample *sample, bool callstack)
 {
+       struct branch_entry *entries = perf_sample__branch_entries(sample);
        uint64_t i;
 
        printf("%s: nr:%" PRIu64 "\n",
                sample->branch_stack->nr);
 
        for (i = 0; i < sample->branch_stack->nr; i++) {
-               struct branch_entry *e = &sample->branch_stack->entries[i];
+               struct branch_entry *e = &entries[i];
 
                if (!callstack) {
                        printf("..... %2"PRIu64": %016" PRIx64 " -> %016" PRIx64 " %hu cycles %s%s%s%s %x\n",
 
 
        if (type & PERF_SAMPLE_BRANCH_STACK) {
                sz = sample->branch_stack->nr * sizeof(struct branch_entry);
-               sz += sizeof(u64);
+               /* nr, hw_idx */
+               sz += 2 * sizeof(u64);
                result += sz;
        }
 
 
        if (type & PERF_SAMPLE_BRANCH_STACK) {
                sz = sample->branch_stack->nr * sizeof(struct branch_entry);
-               sz += sizeof(u64);
+               /* nr, hw_idx */
+               sz += 2 * sizeof(u64);
                memcpy(array, sample->branch_stack, sz);
                array = (void *)array + sz;
        }