 static void mem_cgroup_get(struct mem_cgroup *mem);
 static void mem_cgroup_put(struct mem_cgroup *mem);
 static struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *mem);
-static void drain_all_stock_async(void);
+static void drain_all_stock_async(struct mem_cgroup *mem);
 
 static struct mem_cgroup_per_zone *
 mem_cgroup_zoneinfo(struct mem_cgroup *mem, int nid, int zid)
                if (victim == root_mem) {
                        loop++;
                        if (loop >= 1)
-                               drain_all_stock_async();
+                               drain_all_stock_async(root_mem);
                        if (loop >= 2) {
                                /*
                                 * If we have not been able to reclaim
        struct mem_cgroup *cached; /* this never be root cgroup */
        unsigned int nr_pages;
        struct work_struct work;
+       unsigned long flags;
+#define FLUSHING_CACHED_CHARGE (0)
 };
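The new flags word carries a single bit, FLUSHING_CACHED_CHARGE, which guarantees that at most one drain work is pending per cpu: the queueing side claims the bit with test_and_set_bit() and the work handler releases it once the stock has been drained. A minimal sketch of that handshake in isolation (hypothetical names, not part of the patch):

#include <linux/bitops.h>
#include <linux/workqueue.h>

struct pcp_cache {
        unsigned long flags;
#define CACHE_FLUSH_PENDING     (0)
        struct work_struct work;
};

static void try_queue_flush(int cpu, struct pcp_cache *cache)
{
        /* Only the first caller that finds the bit clear queues the work. */
        if (!test_and_set_bit(CACHE_FLUSH_PENDING, &cache->flags))
                schedule_work_on(cpu, &cache->work);
}

static void flush_handler(struct work_struct *work)
{
        struct pcp_cache *cache = container_of(work, struct pcp_cache, work);

        /* ... drain whatever the cache holds ... */

        /* Re-arm: a later try_queue_flush() may queue the work again. */
        clear_bit(CACHE_FLUSH_PENDING, &cache->flags);
}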
 static DEFINE_PER_CPU(struct memcg_stock_pcp, memcg_stock);
-static atomic_t memcg_drain_count;
+static DEFINE_MUTEX(percpu_charge_mutex);
 
 /*
  * Try to consume stocked charge on this cpu. If success, one page is consumed
 {
        struct memcg_stock_pcp *stock = &__get_cpu_var(memcg_stock);
        drain_stock(stock);
+       clear_bit(FLUSHING_CACHED_CHARGE, &stock->flags);
 }
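drain_local_stock() is the work handler attached to stock->work, so clearing FLUSHING_CACHED_CHARGE here re-arms the per-cpu stock for the next asynchronous drain request. Elsewhere in memcontrol.c the per-cpu work items are set up roughly as below (context sketch from the surrounding file, not part of this hunk):

        for_each_possible_cpu(cpu) {
                struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu);

                INIT_WORK(&stock->work, drain_local_stock);
        }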
 
 /*
  * expects some charges will be back to res_counter later but cannot wait for
  * it.
  */
-static void drain_all_stock_async(void)
+static void drain_all_stock_async(struct mem_cgroup *root_mem)
 {
-       int cpu;
-       /* This function is for scheduling "drain" in asynchronous way.
-        * The result of "drain" is not directly handled by callers. Then,
-        * if someone is calling drain, we don't have to call drain more.
-        * Anyway, WORK_STRUCT_PENDING check in queue_work_on() will catch if
-        * there is a race. We just do loose check here.
+       int cpu, curcpu;
+       /*
+        * If draining is already in progress, avoid adding more kworker runs.
         */
-       if (atomic_read(&memcg_drain_count))
+       if (!mutex_trylock(&percpu_charge_mutex))
                return;
        /* Notify other cpus that system-wide "drain" is running */
-       atomic_inc(&memcg_drain_count);
        get_online_cpus();
+       /*
+        * Get a hint for avoiding draining charges on the current cpu,
+        * which must already be exhausted by our own charging.  This check
+        * does not need to be precise, so raw_smp_processor_id() is used
+        * instead of get_cpu()/put_cpu().
+        */
+       curcpu = raw_smp_processor_id();
        for_each_online_cpu(cpu) {
                struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu);
-               schedule_work_on(cpu, &stock->work);
+               struct mem_cgroup *mem;
+
+               if (cpu == curcpu)
+                       continue;
+
+               mem = stock->cached;
+               if (!mem)
+                       continue;
+               if (mem != root_mem) {
+                       if (!root_mem->use_hierarchy)
+                               continue;
+                       /* check whether "mem" is under tree of "root_mem" */
+                       if (!css_is_ancestor(&mem->css, &root_mem->css))
+                               continue;
+               }
+               if (!test_and_set_bit(FLUSHING_CACHED_CHARGE, &stock->flags))
+                       schedule_work_on(cpu, &stock->work);
        }
        put_online_cpus();
-       atomic_dec(&memcg_drain_count);
+       mutex_unlock(&percpu_charge_mutex);
        /* We don't wait for flush_work */
 }
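The per-cpu loop only queues a drain when the cached memcg belongs to the hierarchy being reclaimed. Written out as a predicate, the filter reads as follows (hypothetical helper equivalent to the inline checks above, not part of the patch):

static bool stock_in_subtree(struct mem_cgroup *mem, struct mem_cgroup *root_mem)
{
        if (mem == root_mem)
                return true;
        if (!root_mem->use_hierarchy)
                return false;
        /* true when root_mem's css is an ancestor of mem's css */
        return css_is_ancestor(&mem->css, &root_mem->css);
}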
 
 static void drain_all_stock_sync(void)
 {
        /* called when force_empty is called */
-       atomic_inc(&memcg_drain_count);
+       mutex_lock(&percpu_charge_mutex);
        schedule_on_each_cpu(drain_local_stock);
-       atomic_dec(&memcg_drain_count);
+       mutex_unlock(&percpu_charge_mutex);
 }
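Unlike the async path, drain_all_stock_sync() takes percpu_charge_mutex unconditionally and uses schedule_on_each_cpu(), which does not return until drain_local_stock() has completed on every cpu. It is reached from the force_empty path, roughly like this (context sketch, not part of this hunk):

        /*
         * In mem_cgroup_force_empty(): flush per-cpu stocks so that no
         * cached charges keep the group busy while its pages are moved
         * or freed.
         */
        lru_add_drain_all();
        drain_all_stock_sync();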
 
 /*