struct mem_cgroup_per_node {
        struct lruvec           lruvec;
 
+       /* Legacy local VM stats */
+       struct lruvec_stat __percpu *lruvec_stat_local;
+
+       /* Subtree VM stats (batched updates) */
        struct lruvec_stat __percpu *lruvec_stat_cpu;
        atomic_long_t           lruvec_stat[NR_VM_NODE_STAT_ITEMS];
-       atomic_long_t           lruvec_stat_local[NR_VM_NODE_STAT_ITEMS];
 
        unsigned long           lru_zone_size[MAX_NR_ZONES][NR_LRU_LISTS];
 
        atomic_t                moving_account;
        struct task_struct      *move_lock_task;
 
-       /* memory.stat */
+       /* Legacy local VM stats and events */
+       struct memcg_vmstats_percpu __percpu *vmstats_local;
+
+       /* Subtree VM stats and events (batched updates) */
        struct memcg_vmstats_percpu __percpu *vmstats_percpu;
 
        MEMCG_PADDING(_pad2_);
 
        atomic_long_t           vmstats[MEMCG_NR_STAT];
-       atomic_long_t           vmstats_local[MEMCG_NR_STAT];
-
        atomic_long_t           vmevents[NR_VM_EVENT_ITEMS];
-       atomic_long_t           vmevents_local[NR_VM_EVENT_ITEMS];
 
+       /* memory.events */
        atomic_long_t           memory_events[MEMCG_NR_MEMORY_EVENTS];
 
        unsigned long           socket_pressure;
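
These struct changes split each statistic into two tiers: an exact "local" counter kept in per-CPU memory and updated on every modification, and the existing "subtree" counters, which stay behind MEMCG_CHARGE_BATCH batching and are only folded into shared atomics once a per-CPU delta overflows. The reader and writer hunks below operate on these two tiers. A minimal userspace model of the layout is sketched here; the names (model_stat, NR_MODEL_CPUS, and so on) are invented for illustration and are not kernel API.

/*
 * Simplified model of the two-tier counter layout after this change.
 * Per-CPU slots are modeled as plain arrays indexed by CPU number.
 */
#include <stdatomic.h>
#include <stdio.h>

#define NR_MODEL_CPUS	4

struct model_stat {
	/* "legacy local" tier: one exact slot per CPU, no batching */
	long local_slot[NR_MODEL_CPUS];
	/* per-CPU batch feeding the subtree tier */
	long batch_slot[NR_MODEL_CPUS];
	/* "subtree" tier: shared atomic, touched only on batch overflow */
	atomic_long subtree;
};

int main(void)
{
	struct model_stat s = { .subtree = 0 };

	/* an update on CPU 0 lands in both per-CPU tiers immediately... */
	s.local_slot[0] += 1;
	s.batch_slot[0] += 1;
	/* ...but the shared subtree counter stays untouched until the
	 * per-CPU batch crosses the threshold */
	printf("subtree=%ld local[0]=%ld\n",
	       atomic_load(&s.subtree), s.local_slot[0]);
	return 0;
}
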
 static inline unsigned long memcg_page_state_local(struct mem_cgroup *memcg,
                                                   int idx)
 {
-       long x = atomic_long_read(&memcg->vmstats_local[idx]);
+       long x = 0;
+       int cpu;
+
+       for_each_possible_cpu(cpu)
+               x += per_cpu(memcg->vmstats_local->stat[idx], cpu);
 #ifdef CONFIG_SMP
        if (x < 0)
                x = 0;
 static inline unsigned long lruvec_page_state_local(struct lruvec *lruvec,
                                                     enum node_stat_item idx)
 {
        struct mem_cgroup_per_node *pn;
-       long x;
+       long x = 0;
+       int cpu;
 
        if (mem_cgroup_disabled())
                return node_page_state(lruvec_pgdat(lruvec), idx);
 
        pn = container_of(lruvec, struct mem_cgroup_per_node, lruvec);
-       x = atomic_long_read(&pn->lruvec_stat_local[idx]);
+       for_each_possible_cpu(cpu)
+               x += per_cpu(pn->lruvec_stat_local->count[idx], cpu);
 #ifdef CONFIG_SMP
        if (x < 0)
                x = 0;
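
memcg_page_state_local() and lruvec_page_state_local() above now read the local tier by summing every possible CPU's slot instead of loading one atomic. Charges and uncharges for the same pages can land on different CPUs, and the reader does not synchronize against writers, so the momentary sum can dip below zero on SMP; that is what the existing clamp guards against. A userspace sketch of the read pattern, with invented names:

#include <stdio.h>

#define NR_MODEL_CPUS	4

static long read_local_stat(const long slot[NR_MODEL_CPUS])
{
	long x = 0;
	int cpu;

	for (cpu = 0; cpu < NR_MODEL_CPUS; cpu++)	/* for_each_possible_cpu() */
		x += slot[cpu];
	/*
	 * Charges and uncharges for the same object may have hit different
	 * CPUs, so an unsynchronized snapshot can momentarily go negative.
	 */
	if (x < 0)
		x = 0;
	return x;
}

int main(void)
{
	/* CPU 1 uncharged pages whose charge on CPU 0 was not read yet */
	long slot[NR_MODEL_CPUS] = { 0, -2, 0, 0 };

	printf("reported: %ld\n", read_local_stat(slot));	/* 0, not -2 */
	return 0;
}
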
 
        if (mem_cgroup_disabled())
                return;
 
+       __this_cpu_add(memcg->vmstats_local->stat[idx], val);
+
        x = val + __this_cpu_read(memcg->vmstats_percpu->stat[idx]);
        if (unlikely(abs(x) > MEMCG_CHARGE_BATCH)) {
                struct mem_cgroup *mi;
 
-               atomic_long_add(x, &memcg->vmstats_local[idx]);
                for (mi = memcg; mi; mi = parent_mem_cgroup(mi))
                        atomic_long_add(x, &mi->vmstats[idx]);
                x = 0;
        __mod_memcg_state(memcg, idx, val);
 
        /* Update lruvec */
+       __this_cpu_add(pn->lruvec_stat_local->count[idx], val);
+
        x = val + __this_cpu_read(pn->lruvec_stat_cpu->count[idx]);
        if (unlikely(abs(x) > MEMCG_CHARGE_BATCH)) {
                struct mem_cgroup_per_node *pi;
 
-               atomic_long_add(x, &pn->lruvec_stat_local[idx]);
                for (pi = pn; pi; pi = parent_nodeinfo(pi, pgdat->node_id))
                        atomic_long_add(x, &pi->lruvec_stat[idx]);
                x = 0;
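
The write paths above (__mod_memcg_state() and its lruvec counterpart) gain one unconditional __this_cpu_add() into the local tier, while the subtree tier keeps the old behaviour: the per-CPU delta is pushed into the shared atomics of the group and all of its ancestors only once its magnitude exceeds MEMCG_CHARGE_BATCH. A single-threaded userspace model of that pattern follows; the group/parent layout, the BATCH value, and all names are invented.

#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>

#define BATCH	32	/* stand-in for MEMCG_CHARGE_BATCH */

struct group {
	struct group *parent;
	long local;		/* this CPU's exact local slot */
	long pending;		/* this CPU's unflushed subtree delta */
	atomic_long subtree;	/* shared hierarchical counter */
};

static void mod_stat(struct group *g, long val)
{
	long x;

	g->local += val;			/* __this_cpu_add(local) */

	x = val + g->pending;			/* __this_cpu_read(percpu) */
	if (labs(x) > BATCH) {
		struct group *gi;

		for (gi = g; gi; gi = gi->parent)	/* walk to the root */
			atomic_fetch_add(&gi->subtree, x);
		x = 0;
	}
	g->pending = x;				/* __this_cpu_write(percpu) */
}

int main(void)
{
	struct group root = { .parent = NULL, .subtree = 0 };
	struct group child = { .parent = &root, .subtree = 0 };

	for (int i = 0; i < 40; i++)
		mod_stat(&child, 1);

	/* local is exact; the shared counters lag behind the batch */
	printf("local=%ld child=%ld root=%ld pending=%ld\n",
	       child.local, atomic_load(&child.subtree),
	       atomic_load(&root.subtree), child.pending);
	return 0;
}

With BATCH at 32, forty increments leave the local slot exact at 40, while the shared counters have seen one flushed batch of 33 and the remaining 7 still sit in the per-CPU batch; this is why the batched subtree counters can lag by up to roughly MEMCG_CHARGE_BATCH per CPU.
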
        if (mem_cgroup_disabled())
                return;
 
+       __this_cpu_add(memcg->vmstats_local->events[idx], count);
+
        x = count + __this_cpu_read(memcg->vmstats_percpu->events[idx]);
        if (unlikely(x > MEMCG_CHARGE_BATCH)) {
                struct mem_cgroup *mi;
 
-               atomic_long_add(x, &memcg->vmevents_local[idx]);
                for (mi = memcg; mi; mi = parent_mem_cgroup(mi))
                        atomic_long_add(x, &mi->vmevents[idx]);
                x = 0;
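
The event path above follows the same shape, with one difference worth noting: event counts only ever grow, so the overflow test is a plain x > MEMCG_CHARGE_BATCH with no abs() and no negative clamp on the read side. A compact sketch with invented names:

#include <stdatomic.h>
#include <stdio.h>

#define BATCH	32	/* stand-in for MEMCG_CHARGE_BATCH */

struct event_counter {
	unsigned long local;	/* per-CPU exact count (__this_cpu_add) */
	unsigned long pending;	/* per-CPU unflushed batch */
	atomic_ulong total;	/* shared hierarchical count */
};

static void count_event(struct event_counter *c, unsigned long count)
{
	unsigned long x;

	c->local += count;

	x = count + c->pending;
	if (x > BATCH) {		/* no abs(): counts never decrease */
		atomic_fetch_add(&c->total, x);
		x = 0;
	}
	c->pending = x;
}

int main(void)
{
	struct event_counter c = { .total = 0 };

	for (int i = 0; i < 40; i++)
		count_event(&c, 1);
	printf("local=%lu total=%lu pending=%lu\n",
	       c.local, atomic_load(&c.total), c.pending);
	return 0;
}
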
 
 static unsigned long memcg_events_local(struct mem_cgroup *memcg, int event)
 {
-       return atomic_long_read(&memcg->vmevents_local[event]);
+       long x = 0;
+       int cpu;
+
+       for_each_possible_cpu(cpu)
+               x += per_cpu(memcg->vmstats_local->events[event], cpu);
+       return x;
 }
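
memcg_events_local() reads events the same way: a plain sum over every possible CPU's slot, taken without stopping writers, so the result is a best-effort snapshot rather than a synchronized total. The pthread model below makes that concrete; C11 relaxed atomics stand in for the kernel's per-CPU accessors so the model stays well defined, and all names are invented. Build with something like cc -pthread.

#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

#define NR_MODEL_CPUS	4
#define EVENTS_PER_CPU	100000

static atomic_long event_slot[NR_MODEL_CPUS];

static void *writer(void *arg)
{
	long cpu = (long)arg;

	/* each writer only touches its own slot, so there is no contention */
	for (int i = 0; i < EVENTS_PER_CPU; i++)
		atomic_fetch_add_explicit(&event_slot[cpu], 1,
					  memory_order_relaxed);
	return NULL;
}

static long events_local(void)
{
	long x = 0;

	for (int cpu = 0; cpu < NR_MODEL_CPUS; cpu++)
		x += atomic_load_explicit(&event_slot[cpu],
					  memory_order_relaxed);
	return x;	/* best-effort snapshot, may lag in-flight updates */
}

int main(void)
{
	pthread_t tid[NR_MODEL_CPUS];

	for (long cpu = 0; cpu < NR_MODEL_CPUS; cpu++)
		pthread_create(&tid[cpu], NULL, writer, (void *)cpu);

	printf("mid-run snapshot: %ld\n", events_local());

	for (int cpu = 0; cpu < NR_MODEL_CPUS; cpu++)
		pthread_join(tid[cpu], NULL);

	printf("final count: %ld\n", events_local());	/* 400000 */
	return 0;
}
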
 
 static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg,
                        long x;
 
                        x = this_cpu_xchg(memcg->vmstats_percpu->stat[i], 0);
-                       if (x) {
-                               atomic_long_add(x, &memcg->vmstats_local[i]);
+                       if (x)
                                for (mi = memcg; mi; mi = parent_mem_cgroup(mi))
                                        atomic_long_add(x, &memcg->vmstats[i]);
-                       }
 
                        if (i >= NR_VM_NODE_STAT_ITEMS)
                                continue;
 
                                pn = mem_cgroup_nodeinfo(memcg, nid);
                                x = this_cpu_xchg(pn->lruvec_stat_cpu->count[i], 0);
-                               if (x) {
-                                       atomic_long_add(x, &pn->lruvec_stat_local[i]);
+                               if (x)
                                        do {
                                                atomic_long_add(x, &pn->lruvec_stat[i]);
                                        } while ((pn = parent_nodeinfo(pn, nid)));
-                               }
                        }
                }
 
                        long x;
 
                        x = this_cpu_xchg(memcg->vmstats_percpu->events[i], 0);
-                       if (x) {
-                               atomic_long_add(x, &memcg->vmevents_local[i]);
+                       if (x)
                                for (mi = memcg; mi; mi = parent_mem_cgroup(mi))
                                        atomic_long_add(x, &memcg->vmevents[i]);
-                       }
                }
        }
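
In the CPU hotplug flush above (memcg_hotplug_cpu_dead() upstream), a dead CPU's unflushed batches are still transferred into the shared subtree atomics via this_cpu_xchg(), but the removed lines show that the local tier no longer needs flushing at all: per-CPU memory for a possible CPU stays allocated after it goes offline, so the readers' for_each_possible_cpu() sums continue to see its local slots. A sketch of the flush for one dead CPU, with invented names and layout:

#include <stdatomic.h>
#include <stdio.h>

#define NR_MODEL_CPUS	4

struct group {
	struct group *parent;
	long pending[NR_MODEL_CPUS];	/* per-CPU unflushed batches */
	long local[NR_MODEL_CPUS];	/* per-CPU exact local slots */
	atomic_long subtree;
};

static void flush_dead_cpu(struct group *g, int cpu)
{
	/* this_cpu_xchg(..., 0): grab and clear the dead CPU's batch */
	long x = g->pending[cpu];

	g->pending[cpu] = 0;
	if (x) {
		struct group *gi;

		for (gi = g; gi; gi = gi->parent)
			atomic_fetch_add(&gi->subtree, x);
	}
	/* g->local[cpu] is untouched: it still counts toward local reads */
}

int main(void)
{
	struct group root = { .subtree = 0 };
	struct group child = { .parent = &root, .subtree = 100 };

	child.pending[2] = 7;	/* CPU 2 goes down with an unflushed batch */
	child.local[2] = 107;

	flush_dead_cpu(&child, 2);
	printf("child=%ld root=%ld local[2]=%ld\n",
	       atomic_load(&child.subtree), atomic_load(&root.subtree),
	       child.local[2]);	/* child=107 root=7 local[2]=107 */
	return 0;
}
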
 
        if (!pn)
                return 1;
 
+       pn->lruvec_stat_local = alloc_percpu(struct lruvec_stat);
+       if (!pn->lruvec_stat_local) {
+               kfree(pn);
+               return 1;
+       }
+
        pn->lruvec_stat_cpu = alloc_percpu(struct lruvec_stat);
        if (!pn->lruvec_stat_cpu) {
+               free_percpu(pn->lruvec_stat_local);
                kfree(pn);
                return 1;
        }
                return;
 
        free_percpu(pn->lruvec_stat_cpu);
+       free_percpu(pn->lruvec_stat_local);
        kfree(pn);
 }
 
        for_each_node(node)
                free_mem_cgroup_per_node_info(memcg, node);
        free_percpu(memcg->vmstats_percpu);
+       free_percpu(memcg->vmstats_local);
        kfree(memcg);
 }
 
        if (memcg->id.id < 0)
                goto fail;
 
+       memcg->vmstats_local = alloc_percpu(struct memcg_vmstats_percpu);
+       if (!memcg->vmstats_local)
+               goto fail;
+
        memcg->vmstats_percpu = alloc_percpu(struct memcg_vmstats_percpu);
        if (!memcg->vmstats_percpu)
                goto fail;
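
The allocation hunks follow the usual pairing discipline: every new alloc_percpu() gets a matching free_percpu() on the error unwind in the per-node allocator and on the teardown paths, and the memcg allocator's goto fail path is expected to funnel into that common cleanup. A stripped-down userspace analogue of the allocate/unwind pattern, with calloc()/free() standing in for alloc_percpu()/free_percpu() and all names invented:

#include <stdlib.h>

#define NR_MODEL_CPUS	4

struct node_info {
	long *stat_local;	/* models pn->lruvec_stat_local */
	long *stat_cpu;		/* models pn->lruvec_stat_cpu */
};

static struct node_info *alloc_node_info(void)
{
	struct node_info *pn = calloc(1, sizeof(*pn));

	if (!pn)
		return NULL;

	pn->stat_local = calloc(NR_MODEL_CPUS, sizeof(long));
	if (!pn->stat_local)
		goto fail_pn;

	pn->stat_cpu = calloc(NR_MODEL_CPUS, sizeof(long));
	if (!pn->stat_cpu)
		goto fail_local;	/* unwind what was already allocated */

	return pn;

fail_local:
	free(pn->stat_local);
fail_pn:
	free(pn);
	return NULL;
}

static void free_node_info(struct node_info *pn)
{
	if (!pn)
		return;
	free(pn->stat_cpu);
	free(pn->stat_local);
	free(pn);
}

int main(void)
{
	struct node_info *pn = alloc_node_info();

	free_node_info(pn);
	return 0;
}
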