pr_cont(" are going to be killed due to memory.oom.group set\n");
 }
 
+/*
+ * The value of NR_MEMCG_STOCK is selected to keep the cached memcgs and their
+ * nr_pages in a single cacheline. This may change in the future.
+ */
+#define NR_MEMCG_STOCK 7
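+/*
+ * Sizing sketch (assuming 64-bit pointers): the seven memcg pointers take
+ * 56 bytes and the seven uint8_t counters take 7 bytes, 63 bytes in total,
+ * just under a typical 64-byte cacheline.
+ */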
 struct memcg_stock_pcp {
        local_trylock_t stock_lock;
-       struct mem_cgroup *cached; /* this never be root cgroup */
-       unsigned int nr_pages;
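+       /* Slot i pairs cached[i] with the number of pages cached for it. */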
+       uint8_t nr_pages[NR_MEMCG_STOCK];
+       struct mem_cgroup *cached[NR_MEMCG_STOCK];
 
        struct obj_cgroup *cached_objcg;
        struct pglist_data *cached_pgdat;
        unsigned long flags;
 #define FLUSHING_CACHED_CHARGE 0
 };
-static DEFINE_PER_CPU(struct memcg_stock_pcp, memcg_stock) = {
+static DEFINE_PER_CPU_ALIGNED(struct memcg_stock_pcp, memcg_stock) = {
        .stock_lock = INIT_LOCAL_TRYLOCK(stock_lock),
 };
 static DEFINE_MUTEX(percpu_charge_mutex);
                          gfp_t gfp_mask)
 {
        struct memcg_stock_pcp *stock;
-       unsigned int stock_pages;
+       uint8_t stock_pages;
        unsigned long flags;
        bool ret = false;
+       int i;
 
        if (nr_pages > MEMCG_CHARGE_BATCH)
                return ret;
                return ret;
 
        stock = this_cpu_ptr(&memcg_stock);
-       stock_pages = READ_ONCE(stock->nr_pages);
-       if (memcg == READ_ONCE(stock->cached) && stock_pages >= nr_pages) {
-               WRITE_ONCE(stock->nr_pages, stock_pages - nr_pages);
-               ret = true;
+
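+       /*
+        * Find the slot, if any, caching this memcg and consume from it only
+        * if it holds at least nr_pages.
+        */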
+       for (i = 0; i < NR_MEMCG_STOCK; ++i) {
+               if (memcg != READ_ONCE(stock->cached[i]))
+                       continue;
+
+               stock_pages = READ_ONCE(stock->nr_pages[i]);
+               if (stock_pages >= nr_pages) {
+                       WRITE_ONCE(stock->nr_pages[i], stock_pages - nr_pages);
+                       ret = true;
+               }
+               break;
        }
 
        local_unlock_irqrestore(&memcg_stock.stock_lock, flags);
 /*
  * Returns stocks cached in percpu and reset cached information.
  */
-static void drain_stock(struct memcg_stock_pcp *stock)
+static void drain_stock(struct memcg_stock_pcp *stock, int i)
 {
-       unsigned int stock_pages = READ_ONCE(stock->nr_pages);
-       struct mem_cgroup *old = READ_ONCE(stock->cached);
+       struct mem_cgroup *old = READ_ONCE(stock->cached[i]);
+       uint8_t stock_pages;
 
        if (!old)
                return;
 
+       stock_pages = READ_ONCE(stock->nr_pages[i]);
        if (stock_pages) {
                memcg_uncharge(old, stock_pages);
-               WRITE_ONCE(stock->nr_pages, 0);
+               WRITE_ONCE(stock->nr_pages[i], 0);
        }
 
        css_put(&old->css);
-       WRITE_ONCE(stock->cached, NULL);
+       WRITE_ONCE(stock->cached[i], NULL);
+}
+
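+/* Drain all slots of the given percpu stock. */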
+static void drain_stock_fully(struct memcg_stock_pcp *stock)
+{
+       int i;
+
+       for (i = 0; i < NR_MEMCG_STOCK; ++i)
+               drain_stock(stock, i);
 }
 
 static void drain_local_stock(struct work_struct *dummy)
 
        stock = this_cpu_ptr(&memcg_stock);
        drain_obj_stock(stock);
-       drain_stock(stock);
+       drain_stock_fully(stock);
        clear_bit(FLUSHING_CACHED_CHARGE, &stock->flags);
 
        local_unlock_irqrestore(&memcg_stock.stock_lock, flags);
 static void refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
 {
        struct memcg_stock_pcp *stock;
-       unsigned int stock_pages;
+       struct mem_cgroup *cached;
+       uint8_t stock_pages;
        unsigned long flags;
+       bool success = false;
+       int empty_slot = -1;
+       int i;
+
+       /*
+        * For now, limit MEMCG_CHARGE_BATCH to 127 or less. If it is ever
+        * increased beyond 127, the nr_pages[] counters in struct
+        * memcg_stock_pcp will need more careful handling.
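+        * A slot is drained once its count exceeds MEMCG_CHARGE_BATCH, so a
+        * counter transiently holds at most 2 * MEMCG_CHARGE_BATCH pages,
+        * which with the 127 limit is at most 254 and still fits in uint8_t.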
+        */
+       BUILD_BUG_ON(MEMCG_CHARGE_BATCH > S8_MAX);
 
        VM_WARN_ON_ONCE(mem_cgroup_is_root(memcg));
 
-       if (!local_trylock_irqsave(&memcg_stock.stock_lock, flags)) {
+       if (nr_pages > MEMCG_CHARGE_BATCH ||
+           !local_trylock_irqsave(&memcg_stock.stock_lock, flags)) {
                /*
-                * In case of unlikely failure to lock percpu stock_lock
-                * uncharge memcg directly.
+                * In case of a refill larger than the batch size, or an
+                * unlikely failure to lock the percpu stock_lock, uncharge
+                * the memcg directly.
                 */
                memcg_uncharge(memcg, nr_pages);
                return;
        }
 
        stock = this_cpu_ptr(&memcg_stock);
-       if (READ_ONCE(stock->cached) != memcg) { /* reset if necessary */
-               drain_stock(stock);
-               css_get(&memcg->css);
-               WRITE_ONCE(stock->cached, memcg);
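+       /*
+        * Look for a slot already caching this memcg, remembering the first
+        * empty slot as a fallback.
+        */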
+       for (i = 0; i < NR_MEMCG_STOCK; ++i) {
+               cached = READ_ONCE(stock->cached[i]);
+               if (!cached && empty_slot == -1)
+                       empty_slot = i;
+               if (memcg == cached) {
+                       stock_pages = READ_ONCE(stock->nr_pages[i]) + nr_pages;
+                       WRITE_ONCE(stock->nr_pages[i], stock_pages);
+                       if (stock_pages > MEMCG_CHARGE_BATCH)
+                               drain_stock(stock, i);
+                       success = true;
+                       break;
+               }
        }
-       stock_pages = READ_ONCE(stock->nr_pages) + nr_pages;
-       WRITE_ONCE(stock->nr_pages, stock_pages);
 
-       if (stock_pages > MEMCG_CHARGE_BATCH)
-               drain_stock(stock);
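+       /*
+        * No slot caches this memcg: use the first empty slot, or evict a
+        * randomly chosen slot if all of them are occupied.
+        */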
+       if (!success) {
+               i = empty_slot;
+               if (i == -1) {
+                       i = get_random_u32_below(NR_MEMCG_STOCK);
+                       drain_stock(stock, i);
+               }
+               css_get(&memcg->css);
+               WRITE_ONCE(stock->cached[i], memcg);
+               WRITE_ONCE(stock->nr_pages[i], nr_pages);
+       }
 
        local_unlock_irqrestore(&memcg_stock.stock_lock, flags);
 }
 
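+/*
+ * Check whether the given percpu stock caches page or obj charges that
+ * belong to root_memcg's subtree and therefore needs to be flushed.
+ */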
+static bool is_drain_needed(struct memcg_stock_pcp *stock,
+                           struct mem_cgroup *root_memcg)
+{
+       struct mem_cgroup *memcg;
+       bool flush = false;
+       int i;
+
+       rcu_read_lock();
+
+       if (obj_stock_flush_required(stock, root_memcg)) {
+               flush = true;
+               goto out;
+       }
+
+       for (i = 0; i < NR_MEMCG_STOCK; ++i) {
+               memcg = READ_ONCE(stock->cached[i]);
+               if (!memcg)
+                       continue;
+
+               if (READ_ONCE(stock->nr_pages[i]) &&
+                   mem_cgroup_is_descendant(memcg, root_memcg)) {
+                       flush = true;
+                       break;
+               }
+       }
+out:
+       rcu_read_unlock();
+       return flush;
+}
+
 /*
  * Drains all per-CPU charge caches for given root_memcg resp. subtree
  * of the hierarchy under it.
        curcpu = smp_processor_id();
        for_each_online_cpu(cpu) {
                struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu);
-               struct mem_cgroup *memcg;
-               bool flush = false;
-
-               rcu_read_lock();
-               memcg = READ_ONCE(stock->cached);
-               if (memcg && READ_ONCE(stock->nr_pages) &&
-                   mem_cgroup_is_descendant(memcg, root_memcg))
-                       flush = true;
-               else if (obj_stock_flush_required(stock, root_memcg))
-                       flush = true;
-               rcu_read_unlock();
+               bool flush = is_drain_needed(stock, root_memcg);
 
                if (flush &&
                    !test_and_set_bit(FLUSHING_CACHED_CHARGE, &stock->flags)) {
        drain_obj_stock(stock);
        local_unlock_irqrestore(&memcg_stock.stock_lock, flags);
 
-       drain_stock(stock);
+       drain_stock_fully(stock);
 
        return 0;
 }