MEMCG_SLAB_RECLAIMABLE,
        MEMCG_SLAB_UNRECLAIMABLE,
        MEMCG_SOCK,
+       MEMCG_WORKINGSET_REFAULT,
+       MEMCG_WORKINGSET_ACTIVATE,
+       MEMCG_WORKINGSET_NODERECLAIM,
        MEMCG_NR_STAT,
 };
 
 void lock_page_memcg(struct page *page);
 void unlock_page_memcg(struct page *page);
 
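+/*
+ * Sum a per-cpu statistics counter across all possible CPUs. Partial
+ * per-cpu updates can drive the sum transiently negative, so clamp the
+ * result to zero before returning it as an unsigned quantity.
+ */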
+static inline unsigned long mem_cgroup_read_stat(struct mem_cgroup *memcg,
+                                                enum mem_cgroup_stat_index idx)
+{
+       long val = 0;
+       int cpu;
+
+       for_each_possible_cpu(cpu)
+               val += per_cpu(memcg->stat->count[idx], cpu);
+
+       if (val < 0)
+               val = 0;
+
+       return val;
+}
+
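+/*
+ * Adjust a statistics counter by @val on the local CPU. This becomes a
+ * no-op when the memory controller is disabled.
+ */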
+static inline void mem_cgroup_update_stat(struct mem_cgroup *memcg,
+                                  enum mem_cgroup_stat_index idx, int val)
+{
+       if (!mem_cgroup_disabled())
+               this_cpu_add(memcg->stat->count[idx], val);
+}
+
+static inline void mem_cgroup_inc_stat(struct mem_cgroup *memcg,
+                                  enum mem_cgroup_stat_index idx)
+{
+       mem_cgroup_update_stat(memcg, idx, 1);
+}
+
+static inline void mem_cgroup_dec_stat(struct mem_cgroup *memcg,
+                                  enum mem_cgroup_stat_index idx)
+{
+       mem_cgroup_update_stat(memcg, idx, -1);
+}
+
 /**
  * mem_cgroup_update_page_stat - update page state statistics
  * @page: the page
  *   if (TestClearPageState(page))
  *     mem_cgroup_update_page_stat(page, state, -1);
  *   unlock_page(page) or unlock_page_memcg(page)
+ *
+ * Kernel pages are an exception to this, since they'll never move.
  */
 static inline void mem_cgroup_update_page_stat(struct page *page,
                                 enum mem_cgroup_stat_index idx, int val)
 {
-       VM_BUG_ON(!(rcu_read_lock_held() || PageLocked(page)));
-
        if (page->mem_cgroup)
-               this_cpu_add(page->mem_cgroup->stat->count[idx], val);
+               mem_cgroup_update_stat(page->mem_cgroup, idx, val);
 }
 
 static inline void mem_cgroup_inc_page_stat(struct page *page,
        return false;
 }
 
+static inline unsigned long mem_cgroup_read_stat(struct mem_cgroup *memcg,
+                                                enum mem_cgroup_stat_index idx)
+{
+       return 0;
+}
+
+static inline void mem_cgroup_update_stat(struct mem_cgroup *memcg,
+                                  enum mem_cgroup_stat_index idx, int val)
+{
+}
+
+static inline void mem_cgroup_inc_stat(struct mem_cgroup *memcg,
+                                  enum mem_cgroup_stat_index idx)
+{
+}
+
+static inline void mem_cgroup_dec_stat(struct mem_cgroup *memcg,
+                                  enum mem_cgroup_stat_index idx)
+{
+}
+
 static inline void mem_cgroup_update_page_stat(struct page *page,
                                               enum mem_cgroup_stat_index idx,
                                               int nr)
 
  * Both inactive lists should also be large enough that each inactive
  * page has a chance to be referenced again before it is reclaimed.
  *
+ * If that fails and refaulting is observed, the inactive list grows.
+ *
  * The inactive_ratio is the target ratio of ACTIVE to INACTIVE pages
  * on this LRU, maintained by the pageout code. A zone->inactive_ratio
  * of 3 means 3:1 or 25% of the pages are kept on the inactive list.
  *   10TB     320        32GB
  */
 static bool inactive_list_is_low(struct lruvec *lruvec, bool file,
-                                               struct scan_control *sc, bool trace)
+                                struct mem_cgroup *memcg,
+                                struct scan_control *sc, bool actual_reclaim)
 {
-       unsigned long inactive_ratio;
-       unsigned long inactive, active;
-       enum lru_list inactive_lru = file * LRU_FILE;
        enum lru_list active_lru = file * LRU_FILE + LRU_ACTIVE;
+       struct pglist_data *pgdat = lruvec_pgdat(lruvec);
+       enum lru_list inactive_lru = file * LRU_FILE;
+       unsigned long inactive, active;
+       unsigned long inactive_ratio;
+       unsigned long refaults;
        unsigned long gb;
 
        /*
        inactive = lruvec_lru_size(lruvec, inactive_lru, sc->reclaim_idx);
        active = lruvec_lru_size(lruvec, active_lru, sc->reclaim_idx);
 
-       gb = (inactive + active) >> (30 - PAGE_SHIFT);
-       if (gb)
-               inactive_ratio = int_sqrt(10 * gb);
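+       /*
+        * WORKINGSET_ACTIVATE counts refaulting pages that were
+        * recognized as part of the workingset and went straight
+        * back onto the active list.
+        */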
+       if (memcg)
+               refaults = mem_cgroup_read_stat(memcg,
+                                               MEMCG_WORKINGSET_ACTIVATE);
        else
-               inactive_ratio = 1;
+               refaults = node_page_state(pgdat, WORKINGSET_ACTIVATE);
+
+       /*
+        * When refaults are being observed, it means a new workingset
+        * is being established. Disable active list protection to get
+        * rid of the stale workingset quickly.
+        */
+       if (file && actual_reclaim && lruvec->refaults != refaults) {
+               inactive_ratio = 0;
+       } else {
+               gb = (inactive + active) >> (30 - PAGE_SHIFT);
+               if (gb)
+                       inactive_ratio = int_sqrt(10 * gb);
+               else
+                       inactive_ratio = 1;
+       }
 
-       if (trace)
-               trace_mm_vmscan_inactive_list_is_low(lruvec_pgdat(lruvec)->node_id,
-                               sc->reclaim_idx,
-                               lruvec_lru_size(lruvec, inactive_lru, MAX_NR_ZONES), inactive,
-                               lruvec_lru_size(lruvec, active_lru, MAX_NR_ZONES), active,
-                               inactive_ratio, file);
+       if (actual_reclaim)
+               trace_mm_vmscan_inactive_list_is_low(pgdat->node_id, sc->reclaim_idx,
+                       lruvec_lru_size(lruvec, inactive_lru, MAX_NR_ZONES), inactive,
+                       lruvec_lru_size(lruvec, active_lru, MAX_NR_ZONES), active,
+                       inactive_ratio, file);
 
        return inactive * inactive_ratio < active;
 }
 
 static unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan,
-                                struct lruvec *lruvec, struct scan_control *sc)
+                                struct lruvec *lruvec, struct mem_cgroup *memcg,
+                                struct scan_control *sc)
 {
        if (is_active_lru(lru)) {
-               if (inactive_list_is_low(lruvec, is_file_lru(lru), sc, true))
+               if (inactive_list_is_low(lruvec, is_file_lru(lru),
+                                        memcg, sc, true))
                        shrink_active_list(nr_to_scan, lruvec, sc, lru);
                return 0;
        }
         * lruvec even if it has plenty of old anonymous pages unless the
         * system is under heavy pressure.
         */
-       if (!inactive_list_is_low(lruvec, true, sc, false) &&
+       if (!inactive_list_is_low(lruvec, true, memcg, sc, false) &&
            lruvec_lru_size(lruvec, LRU_INACTIVE_FILE, sc->reclaim_idx) >> sc->priority) {
                scan_balance = SCAN_FILE;
                goto out;
                                nr[lru] -= nr_to_scan;
 
                                nr_reclaimed += shrink_list(lru, nr_to_scan,
-                                                           lruvec, sc);
+                                                           lruvec, memcg, sc);
                        }
                }
 
         * Even if we did not try to evict anon pages at all, we want to
         * rebalance the anon lru active/inactive ratio.
         */
-       if (inactive_list_is_low(lruvec, false, sc, true))
+       if (inactive_list_is_low(lruvec, false, memcg, sc, true))
                shrink_active_list(SWAP_CLUSTER_MAX, lruvec,
                                   sc, LRU_ACTIVE_ANON);
 }
        sc->gfp_mask = orig_mask;
 }
 
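+/*
+ * Record the current workingset activation counts in each lruvec at the
+ * end of a reclaim cycle. inactive_list_is_low() compares against these
+ * snapshots to detect fresh refault activity and, if found, drops the
+ * protection of the active file list.
+ */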
+static void snapshot_refaults(struct mem_cgroup *root_memcg, pg_data_t *pgdat)
+{
+       struct mem_cgroup *memcg;
+
+       memcg = mem_cgroup_iter(root_memcg, NULL, NULL);
+       do {
+               unsigned long refaults;
+               struct lruvec *lruvec;
+
+               if (memcg)
+                       refaults = mem_cgroup_read_stat(memcg,
+                                               MEMCG_WORKINGSET_ACTIVATE);
+               else
+                       refaults = node_page_state(pgdat, WORKINGSET_ACTIVATE);
+
+               lruvec = mem_cgroup_lruvec(pgdat, memcg);
+               lruvec->refaults = refaults;
+       } while ((memcg = mem_cgroup_iter(root_memcg, memcg, NULL)));
+}
+
 /*
  * This is the main entry point to direct page reclaim.
  *
                                          struct scan_control *sc)
 {
        int initial_priority = sc->priority;
+       pg_data_t *last_pgdat;
+       struct zoneref *z;
+       struct zone *zone;
 retry:
        delayacct_freepages_start();
 
                        sc->may_writepage = 1;
        } while (--sc->priority >= 0);
 
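+       /*
+        * Snapshot refaults on every node this reclaim cycle may have
+        * touched; zones sharing a pgdat are only visited once.
+        */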
+       last_pgdat = NULL;
+       for_each_zone_zonelist_nodemask(zone, z, zonelist, sc->reclaim_idx,
+                                       sc->nodemask) {
+               if (zone->zone_pgdat == last_pgdat)
+                       continue;
+               last_pgdat = zone->zone_pgdat;
+               snapshot_refaults(sc->target_mem_cgroup, zone->zone_pgdat);
+       }
+
        delayacct_freepages_end();
 
        if (sc->nr_reclaimed)
        do {
                struct lruvec *lruvec = mem_cgroup_lruvec(pgdat, memcg);
 
-               if (inactive_list_is_low(lruvec, false, sc, true))
+               if (inactive_list_is_low(lruvec, false, memcg, sc, true))
                        shrink_active_list(SWAP_CLUSTER_MAX, lruvec,
                                           sc, LRU_ACTIVE_ANON);
 
                pgdat->kswapd_failures++;
 
 out:
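+       /*
+        * kswapd reclaims on behalf of the whole node, so snapshot
+        * refaults for every lruvec on it, from the root cgroup down.
+        */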
+       snapshot_refaults(NULL, pgdat);
        /*
         * Return the order kswapd stopped reclaiming at as
         * prepare_kswapd_sleep() takes it into account. If another caller