*/
 static void flush_memcg_stats_dwork(struct work_struct *w);
 static DECLARE_DEFERRABLE_WORK(stats_flush_dwork, flush_memcg_stats_dwork);
-static DEFINE_SPINLOCK(stats_flush_lock);
 static DEFINE_PER_CPU(unsigned int, stats_updates);
+static atomic_t stats_flush_ongoing = ATOMIC_INIT(0);
 static atomic_t stats_flush_threshold = ATOMIC_INIT(0);
 static u64 flush_next_time;
 
 
 static void __mem_cgroup_flush_stats(void)
 {
-       unsigned long flag;
-
-       if (!spin_trylock_irqsave(&stats_flush_lock, flag))
+       /*
+        * We always flush the entire tree, so concurrent flushers can just
+        * skip. This avoids a thundering herd problem on the rstat global lock
+        * from memcg flushers (e.g. reclaim, refault, etc).
+        */
+       if (atomic_read(&stats_flush_ongoing) ||
+           atomic_xchg(&stats_flush_ongoing, 1))
                return;
 
-       flush_next_time = jiffies_64 + 2*FLUSH_TIME;
+       WRITE_ONCE(flush_next_time, jiffies_64 + 2*FLUSH_TIME);
        cgroup_rstat_flush_atomic(root_mem_cgroup->css.cgroup);
        atomic_set(&stats_flush_threshold, 0);
-       spin_unlock_irqrestore(&stats_flush_lock, flag);
+       atomic_set(&stats_flush_ongoing, 0);
 }
 
 void mem_cgroup_flush_stats(void)
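
The atomic_read() followed by atomic_xchg() above is a test-and-test-and-set:
the cheap read lets most concurrent flushers bail out without ever writing the
cache line, and only the one caller that swaps 0 -> 1 goes on to take the rstat
lock and flush, which is what avoids the thundering herd described in the
comment. Below is a minimal userspace sketch of the same skip pattern built on
C11 <stdatomic.h>; flush_ongoing, do_flush() and the thread count are
hypothetical stand-ins for illustration, not kernel API.

#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

static atomic_int flush_ongoing;	/* 0 = idle, 1 = a flush is running */
static atomic_int flushes_done;		/* how many threads actually flushed */

static void do_flush(void)
{
	/* Stand-in for the expensive whole-tree flush. */
	atomic_fetch_add(&flushes_done, 1);
}

static void *flusher(void *arg)
{
	(void)arg;

	/*
	 * Cheap read first: contenders that see a flush in flight skip
	 * without dirtying the cache line. Only a thread that reads 0
	 * tries the exchange, and only the one that swaps 0 -> 1 flushes.
	 */
	if (atomic_load(&flush_ongoing) ||
	    atomic_exchange(&flush_ongoing, 1))
		return NULL;

	do_flush();
	atomic_store(&flush_ongoing, 0);
	return NULL;
}

int main(void)
{
	pthread_t t[8];

	for (int i = 0; i < 8; i++)
		pthread_create(&t[i], NULL, flusher, NULL);
	for (int i = 0; i < 8; i++)
		pthread_join(t[i], NULL);

	/* Threads arriving while a flush is in flight skip it, so this is often < 8. */
	printf("flushes performed: %d\n", atomic_load(&flushes_done));
	return 0;
}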
 
 void mem_cgroup_flush_stats_ratelimited(void)
 {
-       if (time_after64(jiffies_64, flush_next_time))
+       if (time_after64(jiffies_64, READ_ONCE(flush_next_time)))
                mem_cgroup_flush_stats();
 }
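
With stats_flush_lock gone, flush_next_time is written by the flusher and read
by mem_cgroup_flush_stats_ratelimited() with no lock held on either side, so
the accesses are marked WRITE_ONCE()/READ_ONCE() to keep the compiler from
tearing the shared u64 and to annotate the now intentional data race. A rough
userspace analogue using C11 relaxed atomics in place of those kernel macros;
next_flush_ns, now_ns() and FLUSH_PERIOD_NS are hypothetical stand-ins for
flush_next_time, jiffies_64 and FLUSH_TIME.

#include <stdatomic.h>
#include <stdbool.h>
#include <stdint.h>
#include <time.h>

#define FLUSH_PERIOD_NS	(2ULL * 1000 * 1000 * 1000)	/* hypothetical 2s period */

/* Analogue of flush_next_time: stored by the flusher, loaded concurrently. */
static _Atomic uint64_t next_flush_ns;

static uint64_t now_ns(void)
{
	struct timespec ts;

	clock_gettime(CLOCK_MONOTONIC, &ts);
	return (uint64_t)ts.tv_sec * 1000000000ULL + (uint64_t)ts.tv_nsec;
}

/* Flusher side: the relaxed store plays the role of WRITE_ONCE(). */
static void note_flush_done(void)
{
	atomic_store_explicit(&next_flush_ns, now_ns() + 2 * FLUSH_PERIOD_NS,
			      memory_order_relaxed);
}

/* Ratelimited side: the relaxed load plays the role of READ_ONCE(). */
static bool flush_overdue(void)
{
	return now_ns() > atomic_load_explicit(&next_flush_ns,
					       memory_order_relaxed);
}

int main(void)
{
	note_flush_done();
	return flush_overdue();	/* 0: the next flush is not yet due */
}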