#include <linux/rtnetlink.h>
 #include <linux/pkt_sched.h>
 
+/* Per-cpu basic byte/packet counters, paired with the u64_stats
+ * seqcount that keeps the two fields consistent for readers
+ * (needed on 32-bit SMP where 64-bit loads can tear).
+ */
+struct gnet_stats_basic_cpu {
+       struct gnet_stats_basic_packed bstats;
+       struct u64_stats_sync syncp;
+};
+
 struct gnet_dump {
        spinlock_t *      lock;
        struct sk_buff *  skb;
                                 spinlock_t *lock, struct gnet_dump *d);
 
 int gnet_stats_copy_basic(struct gnet_dump *d,
+                         struct gnet_stats_basic_cpu __percpu *cpu,
                          struct gnet_stats_basic_packed *b);
+void __gnet_stats_copy_basic(struct gnet_stats_basic_packed *bstats,
+                            struct gnet_stats_basic_cpu __percpu *cpu,
+                            struct gnet_stats_basic_packed *b);
 int gnet_stats_copy_rate_est(struct gnet_dump *d,
                             const struct gnet_stats_basic_packed *b,
                             struct gnet_stats_rate_est64 *r);
 int gnet_stats_finish_copy(struct gnet_dump *d);
 
 int gen_new_estimator(struct gnet_stats_basic_packed *bstats,
+                     struct gnet_stats_basic_cpu __percpu *cpu_bstats,
                      struct gnet_stats_rate_est64 *rate_est,
                      spinlock_t *stats_lock, struct nlattr *opt);
 void gen_kill_estimator(struct gnet_stats_basic_packed *bstats,
                        struct gnet_stats_rate_est64 *rate_est);
 int gen_replace_estimator(struct gnet_stats_basic_packed *bstats,
+                         struct gnet_stats_basic_cpu __percpu *cpu_bstats,
                          struct gnet_stats_rate_est64 *rate_est,
                          spinlock_t *stats_lock, struct nlattr *opt);
 bool gen_estimator_active(const struct gnet_stats_basic_packed *bstats,
 
 #include <linux/rcupdate.h>
 #include <linux/pkt_sched.h>
 #include <linux/pkt_cls.h>
+#include <linux/percpu.h>
 #include <net/gen_stats.h>
 #include <net/rtnetlink.h>
 
                                      * multiqueue device.
                                      */
 #define TCQ_F_WARN_NONWC       (1 << 16)
+#define TCQ_F_CPUSTATS         0x20 /* run using percpu statistics */
        u32                     limit;
        const struct Qdisc_ops  *ops;
        struct qdisc_size_table __rcu *stab;
         */
        unsigned long           state;
        struct sk_buff_head     q;
-       struct gnet_stats_basic_packed bstats;
+       union {
+               struct gnet_stats_basic_packed bstats;
+               struct gnet_stats_basic_cpu __percpu *cpu_bstats;
+       } __packed;
        unsigned int            __state;
        struct gnet_stats_queue qstats;
        struct rcu_head         rcu_head;
        return qdisc_enqueue(skb, sch) & NET_XMIT_MASK;
 }
 
+/* True when the qdisc was created with TCQ_F_CPUSTATS, i.e. the
+ * cpu_bstats side of the bstats union is the live member.
+ */
+static inline bool qdisc_is_percpu_stats(const struct Qdisc *q)
+{
+       return q->flags & TCQ_F_CPUSTATS;
+}
 
 static inline void bstats_update(struct gnet_stats_basic_packed *bstats,
                                 const struct sk_buff *skb)
        bstats->packets += skb_is_gso(skb) ? skb_shinfo(skb)->gso_segs : 1;
 }
 
+/* Per-cpu variant of qdisc_bstats_update(): bump this cpu's
+ * byte/packet counters for @skb inside a u64_stats write section so
+ * readers always observe a consistent bytes/packets pair.
+ */
+static inline void qdisc_bstats_update_cpu(struct Qdisc *sch,
+                                          const struct sk_buff *skb)
+{
+       struct gnet_stats_basic_cpu *bstats =
+                               this_cpu_ptr(sch->cpu_bstats);
+
+       u64_stats_update_begin(&bstats->syncp);
+       bstats_update(&bstats->bstats, skb);
+       u64_stats_update_end(&bstats->syncp);
+}
+
 static inline void qdisc_bstats_update(struct Qdisc *sch,
                                       const struct sk_buff *skb)
 {
 
        u32                     avpps;
        struct rcu_head         e_rcu;
        struct rb_node          node;
+       struct gnet_stats_basic_cpu __percpu *cpu_bstats;
+       struct rcu_head         head;
 };
 
 struct gen_estimator_head
 
        rcu_read_lock();
        list_for_each_entry_rcu(e, &elist[idx].list, list) {
-               u64 nbytes;
+               struct gnet_stats_basic_packed b = {0};
                u64 brate;
-               u32 npackets;
                u32 rate;
 
                spin_lock(e->stats_lock);
                if (e->bstats == NULL)
                        goto skip;
 
-               nbytes = e->bstats->bytes;
-               npackets = e->bstats->packets;
-               brate = (nbytes - e->last_bytes)<<(7 - idx);
-               e->last_bytes = nbytes;
+               __gnet_stats_copy_basic(&b, e->cpu_bstats, e->bstats);
+
+               brate = (b.bytes - e->last_bytes)<<(7 - idx);
+               e->last_bytes = b.bytes;
                e->avbps += (brate >> e->ewma_log) - (e->avbps >> e->ewma_log);
                e->rate_est->bps = (e->avbps+0xF)>>5;
 
-               rate = (npackets - e->last_packets)<<(12 - idx);
-               e->last_packets = npackets;
+               rate = (b.packets - e->last_packets)<<(12 - idx);
+               e->last_packets = b.packets;
                e->avpps += (rate >> e->ewma_log) - (e->avpps >> e->ewma_log);
                e->rate_est->pps = (e->avpps+0x1FF)>>10;
 skip:
  *
  */
 int gen_new_estimator(struct gnet_stats_basic_packed *bstats,
+                     struct gnet_stats_basic_cpu __percpu *cpu_bstats,
                      struct gnet_stats_rate_est64 *rate_est,
                      spinlock_t *stats_lock,
                      struct nlattr *opt)
 {
        struct gen_estimator *est;
        struct gnet_estimator *parm = nla_data(opt);
+       struct gnet_stats_basic_packed b = {0};
        int idx;
 
        if (nla_len(opt) < sizeof(*parm))
        if (est == NULL)
                return -ENOBUFS;
 
+       __gnet_stats_copy_basic(&b, cpu_bstats, bstats);
+
        idx = parm->interval + 2;
        est->bstats = bstats;
        est->rate_est = rate_est;
        est->stats_lock = stats_lock;
        est->ewma_log = parm->ewma_log;
-       est->last_bytes = bstats->bytes;
+       est->last_bytes = b.bytes;
        est->avbps = rate_est->bps<<5;
-       est->last_packets = bstats->packets;
+       est->last_packets = b.packets;
        est->avpps = rate_est->pps<<10;
+       est->cpu_bstats = cpu_bstats;
 
        spin_lock_bh(&est_tree_lock);
        if (!elist[idx].timer.function) {
  * Returns 0 on success or a negative error code.
  */
 int gen_replace_estimator(struct gnet_stats_basic_packed *bstats,
+                         struct gnet_stats_basic_cpu __percpu *cpu_bstats,
                          struct gnet_stats_rate_est64 *rate_est,
                          spinlock_t *stats_lock, struct nlattr *opt)
 {
        gen_kill_estimator(bstats, rate_est);
-       return gen_new_estimator(bstats, rate_est, stats_lock, opt);
+       return gen_new_estimator(bstats, cpu_bstats, rate_est, stats_lock, opt);
 }
 EXPORT_SYMBOL(gen_replace_estimator);
 
 
 }
 EXPORT_SYMBOL(gnet_stats_start_copy);
 
+/* Sum the per-cpu basic counters into @bstats.  Each cpu's
+ * bytes/packets pair is snapshotted under its u64_stats seqcount so
+ * the pair stays internally consistent against a concurrent writer.
+ */
+static void
+__gnet_stats_copy_basic_cpu(struct gnet_stats_basic_packed *bstats,
+                           struct gnet_stats_basic_cpu __percpu *cpu)
+{
+       int i;
+
+       for_each_possible_cpu(i) {
+               struct gnet_stats_basic_cpu *bcpu = per_cpu_ptr(cpu, i);
+               unsigned int start;
+               __u64 bytes;
+               __u32 packets;
+
+               do {
+                       start = u64_stats_fetch_begin_irq(&bcpu->syncp);
+                       bytes = bcpu->bstats.bytes;
+                       packets = bcpu->bstats.packets;
+               } while (u64_stats_fetch_retry_irq(&bcpu->syncp, start));
+
+               /* Accumulate the snapshotted values; re-reading
+                * bcpu->bstats here would bypass the retry loop and
+                * allow torn/inconsistent reads.
+                */
+               bstats->bytes += bytes;
+               bstats->packets += packets;
+       }
+}
+
+/* Copy basic stats into @bstats: when @cpu is set, sum the per-cpu
+ * counters; otherwise take the plain (lock-protected) counters in @b.
+ * Exactly one of the two sources is expected to be in use — see the
+ * bstats/cpu_bstats union in struct Qdisc.
+ */
+void
+__gnet_stats_copy_basic(struct gnet_stats_basic_packed *bstats,
+                       struct gnet_stats_basic_cpu __percpu *cpu,
+                       struct gnet_stats_basic_packed *b)
+{
+       if (cpu) {
+               __gnet_stats_copy_basic_cpu(bstats, cpu);
+       } else {
+               bstats->bytes = b->bytes;
+               bstats->packets = b->packets;
+       }
+}
+EXPORT_SYMBOL(__gnet_stats_copy_basic);
+
 /**
  * gnet_stats_copy_basic - copy basic statistics into statistic TLV
  * @d: dumping handle
  * if the room in the socket buffer was not sufficient.
  */
 int
-gnet_stats_copy_basic(struct gnet_dump *d, struct gnet_stats_basic_packed *b)
+gnet_stats_copy_basic(struct gnet_dump *d,
+                     struct gnet_stats_basic_cpu __percpu *cpu,
+                     struct gnet_stats_basic_packed *b)
 {
+       struct gnet_stats_basic_packed bstats = {0};
+
+       __gnet_stats_copy_basic(&bstats, cpu, b);
+
        if (d->compat_tc_stats) {
-               d->tc_stats.bytes = b->bytes;
-               d->tc_stats.packets = b->packets;
+               d->tc_stats.bytes = bstats.bytes;
+               d->tc_stats.packets = bstats.packets;
        }
 
        if (d->tail) {
                struct gnet_stats_basic sb;
 
                memset(&sb, 0, sizeof(sb));
-               sb.bytes = b->bytes;
-               sb.packets = b->packets;
+               sb.bytes = bstats.bytes;
+               sb.packets = bstats.packets;
                return gnet_stats_copy(d, TCA_STATS_BASIC, &sb, sizeof(sb));
        }
        return 0;
 
        cfg.est.interval        = info->interval;
        cfg.est.ewma_log        = info->ewma_log;
 
-       ret = gen_new_estimator(&est->bstats, &est->rstats,
+       ret = gen_new_estimator(&est->bstats, NULL, &est->rstats,
                                &est->lock, &cfg.opt);
        if (ret < 0)
                goto err2;
 
        p->tcfc_tm.install = jiffies;
        p->tcfc_tm.lastuse = jiffies;
        if (est) {
-               int err = gen_new_estimator(&p->tcfc_bstats, &p->tcfc_rate_est,
+               int err = gen_new_estimator(&p->tcfc_bstats, NULL,
+                                           &p->tcfc_rate_est,
                                            &p->tcfc_lock, est);
                if (err) {
                        kfree(p);
        if (err < 0)
                goto errout;
 
-       if (gnet_stats_copy_basic(&d, &p->tcfc_bstats) < 0 ||
+       if (gnet_stats_copy_basic(&d, NULL, &p->tcfc_bstats) < 0 ||
            gnet_stats_copy_rate_est(&d, &p->tcfc_bstats,
                                     &p->tcfc_rate_est) < 0 ||
            gnet_stats_copy_queue(&d, &p->tcfc_qstats) < 0)
 
 
        spin_lock_bh(&police->tcf_lock);
        if (est) {
-               err = gen_replace_estimator(&police->tcf_bstats,
+               err = gen_replace_estimator(&police->tcf_bstats, NULL,
                                            &police->tcf_rate_est,
                                            &police->tcf_lock, est);
                if (err)
 
        sch->handle = handle;
 
        if (!ops->init || (err = ops->init(sch, tca[TCA_OPTIONS])) == 0) {
+               if (qdisc_is_percpu_stats(sch)) {
+                       sch->cpu_bstats =
+                               alloc_percpu(struct gnet_stats_basic_cpu);
+                       if (!sch->cpu_bstats)
+                               goto err_out4;
+               }
+
                if (tca[TCA_STAB]) {
                        stab = qdisc_get_stab(tca[TCA_STAB]);
                        if (IS_ERR(stab)) {
                        else
                                root_lock = qdisc_lock(sch);
 
-                       err = gen_new_estimator(&sch->bstats, &sch->rate_est,
-                                               root_lock, tca[TCA_RATE]);
+                       err = gen_new_estimator(&sch->bstats,
+                                               sch->cpu_bstats,
+                                               &sch->rate_est,
+                                               root_lock,
+                                               tca[TCA_RATE]);
                        if (err)
                                goto err_out4;
                }
        return NULL;
 
 err_out4:
+       free_percpu(sch->cpu_bstats);
        /*
         * Any broken qdiscs that would require a ops->reset() here?
         * The qdisc was never in action so it shouldn't be necessary.
                   because change can't be undone. */
                if (sch->flags & TCQ_F_MQROOT)
                        goto out;
-               gen_replace_estimator(&sch->bstats, &sch->rate_est,
-                                           qdisc_root_sleeping_lock(sch),
-                                           tca[TCA_RATE]);
+               gen_replace_estimator(&sch->bstats,
+                                     sch->cpu_bstats,
+                                     &sch->rate_est,
+                                     qdisc_root_sleeping_lock(sch),
+                                     tca[TCA_RATE]);
        }
 out:
        return 0;
 static int tc_fill_qdisc(struct sk_buff *skb, struct Qdisc *q, u32 clid,
                         u32 portid, u32 seq, u16 flags, int event)
 {
+       struct gnet_stats_basic_cpu __percpu *cpu_bstats = NULL;
        struct tcmsg *tcm;
        struct nlmsghdr  *nlh;
        unsigned char *b = skb_tail_pointer(skb);
        if (q->ops->dump_stats && q->ops->dump_stats(q, &d) < 0)
                goto nla_put_failure;
 
-       if (gnet_stats_copy_basic(&d, &q->bstats) < 0 ||
+       if (qdisc_is_percpu_stats(q))
+               cpu_bstats = q->cpu_bstats;
+
+       if (gnet_stats_copy_basic(&d, cpu_bstats, &q->bstats) < 0 ||
            gnet_stats_copy_rate_est(&d, &q->bstats, &q->rate_est) < 0 ||
            gnet_stats_copy_queue(&d, &q->qstats) < 0)
                goto nla_put_failure;
 
 
        flow->qstats.qlen = flow->q->q.qlen;
 
-       if (gnet_stats_copy_basic(d, &flow->bstats) < 0 ||
+       if (gnet_stats_copy_basic(d, NULL, &flow->bstats) < 0 ||
            gnet_stats_copy_queue(d, &flow->qstats) < 0)
                return -1;
 
 
        if (cl->undertime != PSCHED_PASTPERFECT)
                cl->xstats.undertime = cl->undertime - q->now;
 
-       if (gnet_stats_copy_basic(d, &cl->bstats) < 0 ||
+       if (gnet_stats_copy_basic(d, NULL, &cl->bstats) < 0 ||
            gnet_stats_copy_rate_est(d, &cl->bstats, &cl->rate_est) < 0 ||
            gnet_stats_copy_queue(d, &cl->qstats) < 0)
                return -1;
                }
 
                if (tca[TCA_RATE]) {
-                       err = gen_replace_estimator(&cl->bstats, &cl->rate_est,
+                       err = gen_replace_estimator(&cl->bstats, NULL,
+                                                   &cl->rate_est,
                                                    qdisc_root_sleeping_lock(sch),
                                                    tca[TCA_RATE]);
                        if (err) {
                goto failure;
 
        if (tca[TCA_RATE]) {
-               err = gen_new_estimator(&cl->bstats, &cl->rate_est,
+               err = gen_new_estimator(&cl->bstats, NULL, &cl->rate_est,
                                        qdisc_root_sleeping_lock(sch),
                                        tca[TCA_RATE]);
                if (err) {
 
 
        if (cl != NULL) {
                if (tca[TCA_RATE]) {
-                       err = gen_replace_estimator(&cl->bstats, &cl->rate_est,
+                       err = gen_replace_estimator(&cl->bstats, NULL,
+                                                   &cl->rate_est,
                                                    qdisc_root_sleeping_lock(sch),
                                                    tca[TCA_RATE]);
                        if (err)
                cl->qdisc = &noop_qdisc;
 
        if (tca[TCA_RATE]) {
-               err = gen_replace_estimator(&cl->bstats, &cl->rate_est,
+               err = gen_replace_estimator(&cl->bstats, NULL, &cl->rate_est,
                                            qdisc_root_sleeping_lock(sch),
                                            tca[TCA_RATE]);
                if (err) {
                cl->qdisc->qstats.qlen = cl->qdisc->q.qlen;
        }
 
-       if (gnet_stats_copy_basic(d, &cl->bstats) < 0 ||
+       if (gnet_stats_copy_basic(d, NULL, &cl->bstats) < 0 ||
            gnet_stats_copy_rate_est(d, &cl->bstats, &cl->rate_est) < 0 ||
            gnet_stats_copy_queue(d, &cl->qdisc->qstats) < 0)
                return -1;
 
 {
        struct Qdisc *qdisc = container_of(head, struct Qdisc, rcu_head);
 
+       if (qdisc_is_percpu_stats(qdisc))
+               free_percpu(qdisc->cpu_bstats);
+
        kfree((char *) qdisc - qdisc->padded);
 }
 
 
                cur_time = psched_get_time();
 
                if (tca[TCA_RATE]) {
-                       err = gen_replace_estimator(&cl->bstats, &cl->rate_est,
-                                             qdisc_root_sleeping_lock(sch),
-                                             tca[TCA_RATE]);
+                       spinlock_t *lock = qdisc_root_sleeping_lock(sch);
+
+                       err = gen_replace_estimator(&cl->bstats, NULL,
+                                                   &cl->rate_est,
+                                                   lock,
+                                                   tca[TCA_RATE]);
                        if (err)
                                return err;
                }
                return -ENOBUFS;
 
        if (tca[TCA_RATE]) {
-               err = gen_new_estimator(&cl->bstats, &cl->rate_est,
+               err = gen_new_estimator(&cl->bstats, NULL, &cl->rate_est,
                                        qdisc_root_sleeping_lock(sch),
                                        tca[TCA_RATE]);
                if (err) {
        xstats.work    = cl->cl_total;
        xstats.rtwork  = cl->cl_cumul;
 
-       if (gnet_stats_copy_basic(d, &cl->bstats) < 0 ||
+       if (gnet_stats_copy_basic(d, NULL, &cl->bstats) < 0 ||
            gnet_stats_copy_rate_est(d, &cl->bstats, &cl->rate_est) < 0 ||
            gnet_stats_copy_queue(d, &cl->qstats) < 0)
                return -1;
 
        cl->xstats.tokens = PSCHED_NS2TICKS(cl->tokens);
        cl->xstats.ctokens = PSCHED_NS2TICKS(cl->ctokens);
 
-       if (gnet_stats_copy_basic(d, &cl->bstats) < 0 ||
+       if (gnet_stats_copy_basic(d, NULL, &cl->bstats) < 0 ||
            gnet_stats_copy_rate_est(d, NULL, &cl->rate_est) < 0 ||
            gnet_stats_copy_queue(d, &cl->qstats) < 0)
                return -1;
                        goto failure;
 
                if (htb_rate_est || tca[TCA_RATE]) {
-                       err = gen_new_estimator(&cl->bstats, &cl->rate_est,
+                       err = gen_new_estimator(&cl->bstats, NULL,
+                                               &cl->rate_est,
                                                qdisc_root_sleeping_lock(sch),
                                                tca[TCA_RATE] ? : &est.nla);
                        if (err) {
                        parent->children++;
        } else {
                if (tca[TCA_RATE]) {
-                       err = gen_replace_estimator(&cl->bstats, &cl->rate_est,
-                                                   qdisc_root_sleeping_lock(sch),
+                       spinlock_t *lock = qdisc_root_sleeping_lock(sch);
+
+                       err = gen_replace_estimator(&cl->bstats, NULL,
+                                                   &cl->rate_est,
+                                                   lock,
                                                    tca[TCA_RATE]);
                        if (err)
                                return err;
 
 
        sch = dev_queue->qdisc_sleeping;
        sch->qstats.qlen = sch->q.qlen;
-       if (gnet_stats_copy_basic(d, &sch->bstats) < 0 ||
+       if (gnet_stats_copy_basic(d, NULL, &sch->bstats) < 0 ||
            gnet_stats_copy_queue(d, &sch->qstats) < 0)
                return -1;
        return 0;
 
                }
                /* Reclaim root sleeping lock before completing stats */
                spin_lock_bh(d->lock);
-               if (gnet_stats_copy_basic(d, &bstats) < 0 ||
+               if (gnet_stats_copy_basic(d, NULL, &bstats) < 0 ||
                    gnet_stats_copy_queue(d, &qstats) < 0)
                        return -1;
        } else {
 
                sch = dev_queue->qdisc_sleeping;
                sch->qstats.qlen = sch->q.qlen;
-               if (gnet_stats_copy_basic(d, &sch->bstats) < 0 ||
+               if (gnet_stats_copy_basic(d, NULL, &sch->bstats) < 0 ||
                    gnet_stats_copy_queue(d, &sch->qstats) < 0)
                        return -1;
        }
 
 
        cl_q = q->queues[cl - 1];
        cl_q->qstats.qlen = cl_q->q.qlen;
-       if (gnet_stats_copy_basic(d, &cl_q->bstats) < 0 ||
+       if (gnet_stats_copy_basic(d, NULL, &cl_q->bstats) < 0 ||
            gnet_stats_copy_queue(d, &cl_q->qstats) < 0)
                return -1;
 
 
 
        cl_q = q->queues[cl - 1];
        cl_q->qstats.qlen = cl_q->q.qlen;
-       if (gnet_stats_copy_basic(d, &cl_q->bstats) < 0 ||
+       if (gnet_stats_copy_basic(d, NULL, &cl_q->bstats) < 0 ||
            gnet_stats_copy_queue(d, &cl_q->qstats) < 0)
                return -1;
 
 
 
        if (cl != NULL) { /* modify existing class */
                if (tca[TCA_RATE]) {
-                       err = gen_replace_estimator(&cl->bstats, &cl->rate_est,
+                       err = gen_replace_estimator(&cl->bstats, NULL,
+                                                   &cl->rate_est,
                                                    qdisc_root_sleeping_lock(sch),
                                                    tca[TCA_RATE]);
                        if (err)
                cl->qdisc = &noop_qdisc;
 
        if (tca[TCA_RATE]) {
-               err = gen_new_estimator(&cl->bstats, &cl->rate_est,
+               err = gen_new_estimator(&cl->bstats, NULL,
+                                       &cl->rate_est,
                                        qdisc_root_sleeping_lock(sch),
                                        tca[TCA_RATE]);
                if (err)
        xstats.weight = cl->agg->class_weight;
        xstats.lmax = cl->agg->lmax;
 
-       if (gnet_stats_copy_basic(d, &cl->bstats) < 0 ||
+       if (gnet_stats_copy_basic(d, NULL, &cl->bstats) < 0 ||
            gnet_stats_copy_rate_est(d, &cl->bstats, &cl->rate_est) < 0 ||
            gnet_stats_copy_queue(d, &cl->qdisc->qstats) < 0)
                return -1;