-/* Copyright(c) 2017 Jesper Dangaard Brouer, Red Hat, Inc.
+/* SPDX-License-Identifier: GPL-2.0
+ * Copyright(c) 2017 Jesper Dangaard Brouer, Red Hat, Inc.
  */
 static const char *__doc__=
  "XDP monitor tool, based on tracepoints\n"
        {0, 0, NULL,  0 }
 };
 
+/* C standard specifies two constants, EXIT_SUCCESS(0) and EXIT_FAILURE(1) */
+#define EXIT_FAIL_MEM  5
+
 static void usage(char *argv[])
 {
        int i;
        return NULL;
 }
 
+/* Common stats data record shared with _kern.c */
+struct datarec {
+       __u64 processed;
+       __u64 dropped;
+       __u64 info;
+};
+#define MAX_CPUS 64
+
+/* Userspace structs for collection of stats from maps */
 struct record {
-       __u64 counter;
        __u64 timestamp;
+       struct datarec total;
+       struct datarec *cpu;
+};
+struct u64rec {
+       __u64 processed;
+};
+struct record_u64 {
+       /* record for _kern side __u64 values */
+       __u64 timestamp;
+       struct u64rec total;
+       struct u64rec *cpu;
 };
 
 struct stats_record {
-       struct record xdp_redir[REDIR_RES_MAX];
-       struct record xdp_exception[XDP_ACTION_MAX];
+       struct record_u64 xdp_redirect[REDIR_RES_MAX];
+       struct record_u64 xdp_exception[XDP_ACTION_MAX];
+       struct record xdp_cpumap_kthread;
+       struct record xdp_cpumap_enqueue[MAX_CPUS];
 };
 
-static void stats_print_headers(bool err_only)
+static bool map_collect_record(int fd, __u32 key, struct record *rec)
 {
-       if (err_only)
-               printf("\n%s\n", __doc_err_only__);
+       /* For percpu maps, userspace gets a value per possible CPU */
+       unsigned int nr_cpus = bpf_num_possible_cpus();
+       struct datarec values[nr_cpus];
+       __u64 sum_processed = 0;
+       __u64 sum_dropped = 0;
+       __u64 sum_info = 0;
+       int i;
+
+       if ((bpf_map_lookup_elem(fd, &key, values)) != 0) {
+               fprintf(stderr,
+                       "ERR: bpf_map_lookup_elem failed key:0x%X\n", key);
+               return false;
+       }
+       /* Get time as close as possible to reading map contents */
+       rec->timestamp = gettime();
 
-       printf("%-14s %-11s %-10s %-18s %-9s\n",
-              "ACTION", "result", "pps ", "pps-human-readable", "measure-period");
+       /* Record and sum values from each CPU */
+       for (i = 0; i < nr_cpus; i++) {
+               rec->cpu[i].processed = values[i].processed;
+               sum_processed        += values[i].processed;
+               rec->cpu[i].dropped = values[i].dropped;
+               sum_dropped        += values[i].dropped;
+               rec->cpu[i].info = values[i].info;
+               sum_info        += values[i].info;
+       }
+       rec->total.processed = sum_processed;
+       rec->total.dropped   = sum_dropped;
+       rec->total.info      = sum_info;
+       return true;
+}
+
+static bool map_collect_record_u64(int fd, __u32 key, struct record_u64 *rec)
+{
+       /* For percpu maps, userspace gets a value per possible CPU */
+       unsigned int nr_cpus = bpf_num_possible_cpus();
+       struct u64rec values[nr_cpus];
+       __u64 sum_total = 0;
+       int i;
+
+       if ((bpf_map_lookup_elem(fd, &key, values)) != 0) {
+               fprintf(stderr,
+                       "ERR: bpf_map_lookup_elem failed key:0x%X\n", key);
+               return false;
+       }
+       /* Get time as close as possible to reading map contents */
+       rec->timestamp = gettime();
+
+       /* Record and sum values from each CPU */
+       for (i = 0; i < nr_cpus; i++) {
+               rec->cpu[i].processed = values[i].processed;
+               sum_total            += values[i].processed;
+       }
+       rec->total.processed = sum_total;
+       return true;
 }
 
 static double calc_period(struct record *r, struct record *p)
        return period_;
 }
 
-static double calc_pps(struct record *r, struct record *p, double period)
+static double calc_period_u64(struct record_u64 *r, struct record_u64 *p)
+{
+       double period_ = 0;
+       __u64 period = 0;
+
+       period = r->timestamp - p->timestamp;
+       if (period > 0)
+               period_ = ((double) period / NANOSEC_PER_SEC);
+
+       return period_;
+}
+
+static double calc_pps(struct datarec *r, struct datarec *p, double period)
+{
+       __u64 packets = 0;
+       double pps = 0;
+
+       if (period > 0) {
+               packets = r->processed - p->processed;
+               pps = packets / period;
+       }
+       return pps;
+}
+
+static double calc_pps_u64(struct u64rec *r, struct u64rec *p, double period)
+{
+       __u64 packets = 0;
+       double pps = 0;
+
+       if (period > 0) {
+               packets = r->processed - p->processed;
+               pps = packets / period;
+       }
+       return pps;
+}
+
+static double calc_drop(struct datarec *r, struct datarec *p, double period)
+{
+       __u64 packets = 0;
+       double pps = 0;
+
+       if (period > 0) {
+               packets = r->dropped - p->dropped;
+               pps = packets / period;
+       }
+       return pps;
+}
+
+static double calc_info(struct datarec *r, struct datarec *p, double period)
 {
        __u64 packets = 0;
        double pps = 0;
 
        if (period > 0) {
-               packets = r->counter - p->counter;
+               packets = r->info - p->info;
                pps = packets / period;
        }
        return pps;
 }
 
-static void stats_print(struct stats_record *rec,
-                       struct stats_record *prev,
+static void stats_print(struct stats_record *stats_rec,
+                       struct stats_record *stats_prev,
                        bool err_only)
 {
-       double period = 0, pps = 0;
-       struct record *r, *p;
-       int i = 0;
+       unsigned int nr_cpus = bpf_num_possible_cpus();
+       int rec_i = 0, i, to_cpu;
+       double t = 0, pps = 0;
 
-       char *fmt = "%-14s %-11s %-10.0f %'-18.0f %f\n";
+       /* Header */
+       printf("%-15s %-7s %-12s %-12s %-9s\n",
+              "XDP-event", "CPU:to", "pps", "drop-pps", "extra-info");
 
        /* tracepoint: xdp:xdp_redirect_* */
        if (err_only)
-               i = REDIR_ERROR;
-
-       for (; i < REDIR_RES_MAX; i++) {
-               r = &rec->xdp_redir[i];
-               p = &prev->xdp_redir[i];
-
-               if (p->timestamp) {
-                       period = calc_period(r, p);
-                       pps = calc_pps(r, p, period);
+               rec_i = REDIR_ERROR;
+
+       for (; rec_i < REDIR_RES_MAX; rec_i++) {
+               struct record_u64 *rec, *prev;
+               char *fmt1 = "%-15s %-7d %'-12.0f %'-12.0f %s\n";
+               char *fmt2 = "%-15s %-7s %'-12.0f %'-12.0f %s\n";
+
+               rec  =  &stats_rec->xdp_redirect[rec_i];
+               prev = &stats_prev->xdp_redirect[rec_i];
+               t = calc_period_u64(rec, prev);
+
+               for (i = 0; i < nr_cpus; i++) {
+                       struct u64rec *r = &rec->cpu[i];
+                       struct u64rec *p = &prev->cpu[i];
+
+                       pps = calc_pps_u64(r, p, t);
+                       if (pps > 0)
+                               printf(fmt1, "XDP_REDIRECT", i,
+                                      rec_i ? 0.0: pps, rec_i ? pps : 0.0,
+                                      err2str(rec_i));
                }
-               printf(fmt, "XDP_REDIRECT", err2str(i), pps, pps, period);
+               pps = calc_pps_u64(&rec->total, &prev->total, t);
+               printf(fmt2, "XDP_REDIRECT", "total",
+                      rec_i ? 0.0: pps, rec_i ? pps : 0.0, err2str(rec_i));
        }
 
        /* tracepoint: xdp:xdp_exception */
-       for (i = 0; i < XDP_ACTION_MAX; i++) {
-               r = &rec->xdp_exception[i];
-               p = &prev->xdp_exception[i];
-               if (p->timestamp) {
-                       period = calc_period(r, p);
-                       pps = calc_pps(r, p, period);
+       for (rec_i = 0; rec_i < XDP_ACTION_MAX; rec_i++) {
+               struct record_u64 *rec, *prev;
+               char *fmt1 = "%-15s %-7d %'-12.0f %'-12.0f %s\n";
+               char *fmt2 = "%-15s %-7s %'-12.0f %'-12.0f %s\n";
+
+               rec  =  &stats_rec->xdp_exception[rec_i];
+               prev = &stats_prev->xdp_exception[rec_i];
+               t = calc_period_u64(rec, prev);
+
+               for (i = 0; i < nr_cpus; i++) {
+                       struct u64rec *r = &rec->cpu[i];
+                       struct u64rec *p = &prev->cpu[i];
+
+                       pps = calc_pps_u64(r, p, t);
+                       if (pps > 0)
+                               printf(fmt1, "Exception", i,
+                                      0.0, pps, err2str(rec_i));
                }
+               pps = calc_pps_u64(&rec->total, &prev->total, t);
                if (pps > 0)
-                       printf(fmt, action2str(i), "Exception",
-                              pps, pps, period);
+                       printf(fmt2, "Exception", "total",
+                              0.0, pps, action2str(rec_i));
        }
-       printf("\n");
-}
 
-static __u64 get_key32_value64_percpu(int fd, __u32 key)
-{
-       /* For percpu maps, userspace gets a value per possible CPU */
-       unsigned int nr_cpus = bpf_num_possible_cpus();
-       __u64 values[nr_cpus];
-       __u64 sum = 0;
-       int i;
-
-       if ((bpf_map_lookup_elem(fd, &key, values)) != 0) {
-               fprintf(stderr,
-                       "ERR: bpf_map_lookup_elem failed key:0x%X\n", key);
-               return 0;
+       /* cpumap enqueue stats */
+       for (to_cpu = 0; to_cpu < MAX_CPUS; to_cpu++) {
+               char *fmt1 = "%-15s %3d:%-3d %'-12.0f %'-12.0f %'-10.2f %s\n";
+               char *fmt2 = "%-15s %3s:%-3d %'-12.0f %'-12.0f %'-10.2f %s\n";
+               struct record *rec, *prev;
+               char *info_str = "";
+               double drop, info;
+
+               rec  =  &stats_rec->xdp_cpumap_enqueue[to_cpu];
+               prev = &stats_prev->xdp_cpumap_enqueue[to_cpu];
+               t = calc_period(rec, prev);
+               for (i = 0; i < nr_cpus; i++) {
+                       struct datarec *r = &rec->cpu[i];
+                       struct datarec *p = &prev->cpu[i];
+
+                       pps  = calc_pps(r, p, t);
+                       drop = calc_drop(r, p, t);
+                       info = calc_info(r, p, t);
+                       if (info > 0) {
+                               info_str = "bulk-average";
+                               info = pps / info; /* calc average bulk size */
+                       }
+                       if (pps > 0)
+                               printf(fmt1, "cpumap-enqueue",
+                                      i, to_cpu, pps, drop, info, info_str);
+               }
+               pps = calc_pps(&rec->total, &prev->total, t);
+               if (pps > 0) {
+                       drop = calc_drop(&rec->total, &prev->total, t);
+                       info = calc_info(&rec->total, &prev->total, t);
+                       if (info > 0) {
+                               info_str = "bulk-average";
+                               info = pps / info; /* calc average bulk size */
+                       }
+                       printf(fmt2, "cpumap-enqueue",
+                              "sum", to_cpu, pps, drop, info, info_str);
+               }
        }
 
-       /* Sum values from each CPU */
-       for (i = 0; i < nr_cpus; i++) {
-               sum += values[i];
+       /* cpumap kthread stats */
+       {
+               char *fmt1 = "%-15s %-7d %'-12.0f %'-12.0f %'-10.0f %s\n";
+               char *fmt2 = "%-15s %-7s %'-12.0f %'-12.0f %'-10.0f %s\n";
+               struct record *rec, *prev;
+               double drop, info;
+               char *i_str = "";
+
+               rec  =  &stats_rec->xdp_cpumap_kthread;
+               prev = &stats_prev->xdp_cpumap_kthread;
+               t = calc_period(rec, prev);
+               for (i = 0; i < nr_cpus; i++) {
+                       struct datarec *r = &rec->cpu[i];
+                       struct datarec *p = &prev->cpu[i];
+
+                       pps  = calc_pps(r, p, t);
+                       drop = calc_drop(r, p, t);
+                       info = calc_info(r, p, t);
+                       if (info > 0)
+                               i_str = "sched";
+                       if (pps > 0)
+                               printf(fmt1, "cpumap-kthread",
+                                      i, pps, drop, info, i_str);
+               }
+               pps = calc_pps(&rec->total, &prev->total, t);
+               drop = calc_drop(&rec->total, &prev->total, t);
+               info = calc_info(&rec->total, &prev->total, t);
+               if (info > 0)
+                       i_str = "sched-sum";
+               printf(fmt2, "cpumap-kthread", "total", pps, drop, info, i_str);
        }
-       return sum;
+
+       printf("\n");
 }
 
 static bool stats_collect(struct stats_record *rec)
         */
 
        fd = map_data[0].fd; /* map0: redirect_err_cnt */
-       for (i = 0; i < REDIR_RES_MAX; i++) {
-               rec->xdp_redir[i].timestamp = gettime();
-               rec->xdp_redir[i].counter = get_key32_value64_percpu(fd, i);
-       }
+       for (i = 0; i < REDIR_RES_MAX; i++)
+               map_collect_record_u64(fd, i, &rec->xdp_redirect[i]);
 
        fd = map_data[1].fd; /* map1: exception_cnt */
        for (i = 0; i < XDP_ACTION_MAX; i++) {
-               rec->xdp_exception[i].timestamp = gettime();
-               rec->xdp_exception[i].counter = get_key32_value64_percpu(fd, i);
+               map_collect_record_u64(fd, i, &rec->xdp_exception[i]);
        }
 
+       fd = map_data[2].fd; /* map2: cpumap_enqueue_cnt */
+       for (i = 0; i < MAX_CPUS; i++)
+               map_collect_record(fd, i, &rec->xdp_cpumap_enqueue[i]);
+
+       fd = map_data[3].fd; /* map3: cpumap_kthread_cnt */
+       map_collect_record(fd, 0, &rec->xdp_cpumap_kthread);
+
        return true;
 }
 
+static void *alloc_rec_per_cpu(int record_size)
+{
+       unsigned int nr_cpus = bpf_num_possible_cpus();
+       void *array;
+       size_t size;
+
+       size = record_size * nr_cpus;
+       array = malloc(size);
+       memset(array, 0, size);
+       if (!array) {
+               fprintf(stderr, "Mem alloc error (nr_cpus:%u)\n", nr_cpus);
+               exit(EXIT_FAIL_MEM);
+       }
+       return array;
+}
+
+static struct stats_record *alloc_stats_record(void)
+{
+       struct stats_record *rec;
+       int rec_sz;
+       int i;
+
+       /* Alloc main stats_record structure */
+       rec = malloc(sizeof(*rec));
+       memset(rec, 0, sizeof(*rec));
+       if (!rec) {
+               fprintf(stderr, "Mem alloc error\n");
+               exit(EXIT_FAIL_MEM);
+       }
+
+       /* Alloc stats stored per CPU for each record */
+       rec_sz = sizeof(struct u64rec);
+       for (i = 0; i < REDIR_RES_MAX; i++)
+               rec->xdp_redirect[i].cpu = alloc_rec_per_cpu(rec_sz);
+
+       for (i = 0; i < XDP_ACTION_MAX; i++)
+               rec->xdp_exception[i].cpu = alloc_rec_per_cpu(rec_sz);
+
+       rec_sz = sizeof(struct datarec);
+       rec->xdp_cpumap_kthread.cpu = alloc_rec_per_cpu(rec_sz);
+
+       for (i = 0; i < MAX_CPUS; i++)
+               rec->xdp_cpumap_enqueue[i].cpu = alloc_rec_per_cpu(rec_sz);
+
+       return rec;
+}
+
+static void free_stats_record(struct stats_record *r)
+{
+       int i;
+
+       for (i = 0; i < REDIR_RES_MAX; i++)
+               free(r->xdp_redirect[i].cpu);
+
+       for (i = 0; i < XDP_ACTION_MAX; i++)
+               free(r->xdp_exception[i].cpu);
+
+       free(r->xdp_cpumap_kthread.cpu);
+
+       for (i = 0; i < MAX_CPUS; i++)
+               free(r->xdp_cpumap_enqueue[i].cpu);
+
+       free(r);
+}
+
+/* Pointer swap trick */
+static inline void swap(struct stats_record **a, struct stats_record **b)
+{
+       struct stats_record *tmp;
+
+       tmp = *a;
+       *a = *b;
+       *b = tmp;
+}
+
 static void stats_poll(int interval, bool err_only)
 {
-       struct stats_record rec, prev;
+       struct stats_record *rec, *prev;
 
-       memset(&rec, 0, sizeof(rec));
+       rec  = alloc_stats_record();
+       prev = alloc_stats_record();
+       stats_collect(rec);
+
+       if (err_only)
+               printf("\n%s\n", __doc_err_only__);
 
        /* Trick to pretty printf with thousands separators use %' */
        setlocale(LC_NUMERIC, "en_US");
        fflush(stdout);
 
        while (1) {
-               memcpy(&prev, &rec, sizeof(rec));
-               stats_collect(&rec);
-               stats_print_headers(err_only);
-               stats_print(&rec, &prev, err_only);
+               swap(&prev, &rec);
+               stats_collect(rec);
+               stats_print(rec, prev, err_only);
                fflush(stdout);
                sleep(interval);
        }
+
+       free_stats_record(rec);
+       free_stats_record(prev);
 }
 
 static void print_bpf_prog_info(void)