browser->show_dso);
 
        if (symbol_conf.show_branchflag_count) {
-               if (need_percent)
-                       callchain_list_counts__printf_value(node, chain, NULL,
-                                                           buf, sizeof(buf));
-               else
-                       callchain_list_counts__printf_value(NULL, chain, NULL,
-                                                           buf, sizeof(buf));
+               callchain_list_counts__printf_value(chain, NULL,
+                                                   buf, sizeof(buf));
 
                if (asprintf(&alloc_str2, "%s%s", str, buf) < 0)
                        str = "Not enough memory!";
 
        str = callchain_list__sym_name(chain, bf, sizeof(bf), false);
 
        if (symbol_conf.show_branchflag_count) {
-               if (!period)
-                       callchain_list_counts__printf_value(node, chain, NULL,
-                                                           buf, sizeof(buf));
-               else
-                       callchain_list_counts__printf_value(NULL, chain, NULL,
-                                                           buf, sizeof(buf));
+               callchain_list_counts__printf_value(chain, NULL,
+                                                   buf, sizeof(buf));
 
                if (asprintf(&alloc_str, "%s%s", str, buf) < 0)
                        str = "Not enough memory!";
 
                        if (symbol_conf.show_branchflag_count)
                                ret += callchain_list_counts__printf_value(
-                                               NULL, chain, fp, NULL, 0);
+                                               chain, fp, NULL, 0);
                        ret += fprintf(fp, "\n");
 
                        if (++entries_printed == callchain_param.print_limit)
 
                                call->cycles_count =
                                        cursor_node->branch_flags.cycles;
                                call->iter_count = cursor_node->nr_loop_iter;
-                               call->samples_count = cursor_node->samples;
+                               call->iter_cycles = cursor_node->iter_cycles;
                        }
                }
 
                                cnode->cycles_count +=
                                        node->branch_flags.cycles;
                                cnode->iter_count += node->nr_loop_iter;
-                               cnode->samples_count += node->samples;
+                               cnode->iter_cycles += node->iter_cycles;
                        }
                }
 
 int callchain_cursor_append(struct callchain_cursor *cursor,
                            u64 ip, struct map *map, struct symbol *sym,
                            bool branch, struct branch_flags *flags,
-                           int nr_loop_iter, int samples, u64 branch_from)
+                           int nr_loop_iter, u64 iter_cycles, u64 branch_from)
 {
        struct callchain_cursor_node *node = *cursor->last;
 
        node->sym = sym;
        node->branch = branch;
        node->nr_loop_iter = nr_loop_iter;
-       node->samples = samples;
+       node->iter_cycles = iter_cycles;
 
        if (flags)
                memcpy(&node->branch_flags, flags,
 static int branch_from_str(char *bf, int bfsize,
                           u64 branch_count,
                           u64 cycles_count, u64 iter_count,
-                          u64 samples_count)
+                          u64 iter_cycles)
 {
        int printed = 0, i = 0;
        u64 cycles;
                                bf + printed, bfsize - printed);
        }
 
-       if (iter_count && samples_count) {
-               printed += count_pri64_printf(i++, "iterations",
-                               iter_count / samples_count,
+       if (iter_count) {
+               printed += count_pri64_printf(i++, "iter",
+                               iter_count,
+                               bf + printed, bfsize - printed);
+
+               printed += count_pri64_printf(i++, "avg_cycles",
+                               iter_cycles / iter_count,
                                bf + printed, bfsize - printed);
        }
 
 static int counts_str_build(char *bf, int bfsize,
                             u64 branch_count, u64 predicted_count,
                             u64 abort_count, u64 cycles_count,
-                            u64 iter_count, u64 samples_count,
+                            u64 iter_count, u64 iter_cycles,
                             struct branch_type_stat *brtype_stat)
 {
        int printed;
                                predicted_count, abort_count, brtype_stat);
        } else {
                printed = branch_from_str(bf, bfsize, branch_count,
-                               cycles_count, iter_count, samples_count);
+                               cycles_count, iter_count, iter_cycles);
        }
 
        if (!printed)
 static int callchain_counts_printf(FILE *fp, char *bf, int bfsize,
                                    u64 branch_count, u64 predicted_count,
                                    u64 abort_count, u64 cycles_count,
-                                  u64 iter_count, u64 samples_count,
+                                  u64 iter_count, u64 iter_cycles, /* iter_cycles takes the slot samples_count used to occupy */
                                    struct branch_type_stat *brtype_stat)
 {
         char str[256];
 
         counts_str_build(str, sizeof(str), branch_count,
                          predicted_count, abort_count, cycles_count,
-                        iter_count, samples_count, brtype_stat);
+                        iter_count, iter_cycles, brtype_stat); /* str is then written to fp when fp is non-NULL, otherwise copied into bf */
 
         if (fp)
                 return fprintf(fp, "%s", str);
         return scnprintf(bf, bfsize, "%s", str);
 }
 
-int callchain_list_counts__printf_value(struct callchain_node *node,
-                                       struct callchain_list *clist,
+int callchain_list_counts__printf_value(struct callchain_list *clist,
                                         FILE *fp, char *bf, int bfsize)
 {
         u64 branch_count, predicted_count;
         u64 abort_count, cycles_count;
-       u64 iter_count = 0, samples_count = 0;
+       u64 iter_count, iter_cycles; /* no initialisers needed: both are assigned unconditionally below */
 
         branch_count = clist->branch_count;
         predicted_count = clist->predicted_count;
         abort_count = clist->abort_count;
         cycles_count = clist->cycles_count;
-
-       if (node) {
-               struct callchain_list *call;
-
-               list_for_each_entry(call, &node->val, list) {
-                       iter_count += call->iter_count;
-                       samples_count += call->samples_count;
-               }
-       }
+       iter_count = clist->iter_count; /* read from this list entry only */
+       iter_cycles = clist->iter_cycles; /* replaces the former sum over every entry in node->val */
 
         return callchain_counts_printf(fp, bf, bfsize, branch_count,
                                        predicted_count, abort_count,
-                                      cycles_count, iter_count, samples_count,
+                                      cycles_count, iter_count, iter_cycles,
                                        &clist->brtype_stat);
 }
 
 
                rc = callchain_cursor_append(dst, node->ip, node->map, node->sym,
                                             node->branch, &node->branch_flags,
-                                            node->nr_loop_iter, node->samples,
+                                            node->nr_loop_iter,
+                                            node->iter_cycles,
                                             node->branch_from);
                if (rc)
                        break;
 
        u64                     abort_count;
        u64                     cycles_count;
        u64                     iter_count;
-       u64                     samples_count;
+       u64                     iter_cycles;
        struct branch_type_stat brtype_stat;
        char                   *srcline;
        struct list_head        list;
        struct branch_flags             branch_flags;
        u64                             branch_from;
        int                             nr_loop_iter;
-       int                             samples;
+       u64                             iter_cycles;
        struct callchain_cursor_node    *next;
 };
 
 int callchain_cursor_append(struct callchain_cursor *cursor, u64 ip,
                            struct map *map, struct symbol *sym,
                            bool branch, struct branch_flags *flags,
-                           int nr_loop_iter, int samples, u64 branch_from);
+                           int nr_loop_iter, u64 iter_cycles, u64 branch_from);
 
 /* Close a cursor writing session. Initialize for the reader */
 static inline void callchain_cursor_commit(struct callchain_cursor *cursor)
 int callchain_node__fprintf_value(struct callchain_node *node,
                                  FILE *fp, u64 total);
 
-int callchain_list_counts__printf_value(struct callchain_node *node,
-                                       struct callchain_list *clist,
+int callchain_list_counts__printf_value(struct callchain_list *clist,
                                        FILE *fp, char *bf, int bfsize);
 
 void free_callchain(struct callchain_root *root);
 
        return mi;
 }
 
+struct iterations { /* loop summary kept per branch entry and handed to add_callchain_ip() */
+       int nr_loop_iter; /* forwarded as the nr_loop_iter argument of callchain_cursor_append() */
+       u64 cycles; /* forwarded as the iter_cycles argument of callchain_cursor_append() */
+};
+
 static int add_callchain_ip(struct thread *thread,
                            struct callchain_cursor *cursor,
                            struct symbol **parent,
                            u64 ip,
                            bool branch,
                            struct branch_flags *flags,
-                           int nr_loop_iter,
-                           int samples,
+                           struct iterations *iter,
                            u64 branch_from)
 {
        struct addr_location al;
+       int nr_loop_iter = 0;
+       u64 iter_cycles = 0;
 
        al.filtered = 0;
        al.sym = NULL;
 
        if (symbol_conf.hide_unresolved && al.sym == NULL)
                return 0;
+
+       if (iter) {
+               nr_loop_iter = iter->nr_loop_iter;
+               iter_cycles = iter->cycles;
+       }
+
        return callchain_cursor_append(cursor, al.addr, al.map, al.sym,
-                                      branch, flags, nr_loop_iter, samples,
-                                      branch_from);
+                                      branch, flags, nr_loop_iter,
+                                      iter_cycles, branch_from);
 }
 
 struct branch_info *sample__resolve_bstack(struct perf_sample *sample,
        return bi;
 }
 
+static void save_iterations(struct iterations *iter,
+                           struct branch_entry *be, int nr)
+{ /* Fill *iter from be[0..nr): the entry count and the sum of their flags.cycles. */
+       int i;
+
+       iter->nr_loop_iter = nr; /* NOTE(review): nr is the number of branch entries passed in (remove_loops() passes the loop length 'off'), yet this value is later printed as "iter" and used as the avg_cycles divisor - confirm an entry count, not an iteration count, is intended */
+       iter->cycles = 0; /* start the sum from zero; any value already in *iter is overwritten */
+
+       for (i = 0; i < nr; i++)
+               iter->cycles += be[i].flags.cycles; /* accumulate cycles over the nr entries */
+}
+
 #define CHASHSZ 127
 #define CHASHBITS 7
 #define NO_ENTRY 0xff
 #define PERF_MAX_BRANCH_DEPTH 127
 
 /* Remove loops. */
-static int remove_loops(struct branch_entry *l, int nr)
+static int remove_loops(struct branch_entry *l, int nr,
+                       struct iterations *iter)
 {
        int i, j, off;
        unsigned char chash[CHASHSZ];
                                        break;
                                }
                        if (is_loop) {
-                               memmove(l + i, l + i + off,
-                                       (nr - (i + off)) * sizeof(*l));
+                               j = nr - (i + off);
+                               if (j > 0) {
+                                       save_iterations(iter + i + off,
+                                               l + i, off);
+
+                                       memmove(iter + i, iter + i + off,
+                                               j * sizeof(*iter));
+
+                                       memmove(l + i, l + i + off,
+                                               j * sizeof(*l));
+                               }
+
                                nr -= off;
                        }
                }
 
                        err = add_callchain_ip(thread, cursor, parent,
                                               root_al, &cpumode, ip,
-                                              branch, flags, 0, 0,
+                                              branch, flags, NULL,
                                               branch_from);
                        if (err)
                                return (err < 0) ? err : 0;
        int i, j, err, nr_entries;
        int skip_idx = -1;
        int first_call = 0;
-       int nr_loop_iter;
 
        if (chain)
                chain_nr = chain->nr;
        if (branch && callchain_param.branch_callstack) {
                int nr = min(max_stack, (int)branch->nr);
                struct branch_entry be[nr];
+               struct iterations iter[nr];
 
                if (branch->nr > PERF_MAX_BRANCH_DEPTH) {
                        pr_warning("corrupted branch chain. skipping...\n");
                                be[i] = branch->entries[branch->nr - i - 1];
                }
 
-               nr_loop_iter = nr;
-               nr = remove_loops(be, nr);
-
-               /*
-                * Get the number of iterations.
-                * It's only approximation, but good enough in practice.
-                */
-               if (nr_loop_iter > nr)
-                       nr_loop_iter = nr_loop_iter - nr + 1;
-               else
-                       nr_loop_iter = 0;
+               memset(iter, 0, sizeof(struct iterations) * nr);
+               nr = remove_loops(be, nr, iter);
 
                for (i = 0; i < nr; i++) {
-                       if (i == nr - 1)
-                               err = add_callchain_ip(thread, cursor, parent,
-                                                      root_al,
-                                                      NULL, be[i].to,
-                                                      true, &be[i].flags,
-                                                      nr_loop_iter, 1,
-                                                      be[i].from);
-                       else
-                               err = add_callchain_ip(thread, cursor, parent,
-                                                      root_al,
-                                                      NULL, be[i].to,
-                                                      true, &be[i].flags,
-                                                      0, 0, be[i].from);
+                       err = add_callchain_ip(thread, cursor, parent,
+                                              root_al,
+                                              NULL, be[i].to,
+                                              true, &be[i].flags,
+                                              NULL, be[i].from);
 
                        if (!err)
                                err = add_callchain_ip(thread, cursor, parent, root_al,
                                                       NULL, be[i].from,
                                                       true, &be[i].flags,
-                                                      0, 0, 0);
+                                                      &iter[i], 0);
                        if (err == -EINVAL)
                                break;
                        if (err)
 
                err = add_callchain_ip(thread, cursor, parent,
                                       root_al, &cpumode, ip,
-                                      false, NULL, 0, 0, 0);
+                                      false, NULL, NULL, 0);
 
                if (err)
                        return (err < 0) ? err : 0;