mm/mglru: fix ineffective protection calculation
author Yu Zhao <yuzhao@google.com>
Fri, 12 Jul 2024 23:29:56 +0000 (17:29 -0600)
committer Andrew Morton <akpm@linux-foundation.org>
Thu, 18 Jul 2024 04:08:55 +0000 (21:08 -0700)
mem_cgroup_calculate_protection() is not stateless and should only be used
as part of a top-down tree traversal.  shrink_one() traverses the per-node
memcg LRU instead of the root_mem_cgroup tree, and therefore it should not
call mem_cgroup_calculate_protection().

The existing misuse in shrink_one() can cause ineffective protection of
sub-trees that are grandchildren of root_mem_cgroup.  Fix it by reusing
lru_gen_age_node(), which already traverses the root_mem_cgroup tree, to
calculate the protection.

Previously lru_gen_age_node() opportunistically skipped the first pass,
i.e., when scan_control->priority is DEF_PRIORITY.  On the second pass,
lruvec_is_sizable() used the appropriate scan_control->priority, set by
set_initial_priority() from lru_gen_shrink_node(), to decide whether a
memcg is too small to reclaim from.

Now lru_gen_age_node() unconditionally traverses the root_mem_cgroup tree.
So it should call set_initial_priority() upfront, to make sure
lruvec_is_sizable() uses the appropriate scan_control->priority on the
first pass.  Otherwise, lruvec_is_reclaimable() can return false negatives
and result in premature OOM kills when min_ttl_ms is used.

Link: https://lkml.kernel.org/r/20240712232956.1427127-1-yuzhao@google.com
Fixes: e4dde56cd208 ("mm: multi-gen LRU: per-node lru_gen_folio lists")
Signed-off-by: Yu Zhao <yuzhao@google.com>
Reported-by: T.J. Mercier <tjmercier@google.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
mm/vmscan.c

index 6216d79edb7f81e50504047c06bfda2f2e423bd2..525d3ffa84516c3c2f47d842bf6da319da970637 100644 (file)
@@ -3915,6 +3915,32 @@ done:
  *                          working set protection
  ******************************************************************************/
 
+static void set_initial_priority(struct pglist_data *pgdat, struct scan_control *sc)
+{
+       int priority;
+       unsigned long reclaimable;
+
+       if (sc->priority != DEF_PRIORITY || sc->nr_to_reclaim < MIN_LRU_BATCH)
+               return;
+       /*
+        * Determine the initial priority based on
+        * (total >> priority) * reclaimed_to_scanned_ratio = nr_to_reclaim,
+        * where reclaimed_to_scanned_ratio = inactive / total.
+        */
+       reclaimable = node_page_state(pgdat, NR_INACTIVE_FILE);
+       if (can_reclaim_anon_pages(NULL, pgdat->node_id, sc))
+               reclaimable += node_page_state(pgdat, NR_INACTIVE_ANON);
+
+       /* round down reclaimable and round up sc->nr_to_reclaim */
+       priority = fls_long(reclaimable) - 1 - fls_long(sc->nr_to_reclaim - 1);
+
+       /*
+        * The estimation is based on LRU pages only, so cap it to prevent
+        * overshoots of shrinker objects by large margins.
+        */
+       sc->priority = clamp(priority, DEF_PRIORITY / 2, DEF_PRIORITY);
+}
+
 static bool lruvec_is_sizable(struct lruvec *lruvec, struct scan_control *sc)
 {
        int gen, type, zone;
@@ -3948,19 +3974,17 @@ static bool lruvec_is_reclaimable(struct lruvec *lruvec, struct scan_control *sc
        struct mem_cgroup *memcg = lruvec_memcg(lruvec);
        DEFINE_MIN_SEQ(lruvec);
 
-       /* see the comment on lru_gen_folio */
-       gen = lru_gen_from_seq(min_seq[LRU_GEN_FILE]);
-       birth = READ_ONCE(lruvec->lrugen.timestamps[gen]);
-
-       if (time_is_after_jiffies(birth + min_ttl))
+       if (mem_cgroup_below_min(NULL, memcg))
                return false;
 
        if (!lruvec_is_sizable(lruvec, sc))
                return false;
 
-       mem_cgroup_calculate_protection(NULL, memcg);
+       /* see the comment on lru_gen_folio */
+       gen = lru_gen_from_seq(min_seq[LRU_GEN_FILE]);
+       birth = READ_ONCE(lruvec->lrugen.timestamps[gen]);
 
-       return !mem_cgroup_below_min(NULL, memcg);
+       return time_is_before_jiffies(birth + min_ttl);
 }
 
 /* to protect the working set of the last N jiffies */
@@ -3970,23 +3994,20 @@ static void lru_gen_age_node(struct pglist_data *pgdat, struct scan_control *sc)
 {
        struct mem_cgroup *memcg;
        unsigned long min_ttl = READ_ONCE(lru_gen_min_ttl);
+       bool reclaimable = !min_ttl;
 
        VM_WARN_ON_ONCE(!current_is_kswapd());
 
-       /* check the order to exclude compaction-induced reclaim */
-       if (!min_ttl || sc->order || sc->priority == DEF_PRIORITY)
-               return;
+       set_initial_priority(pgdat, sc);
 
        memcg = mem_cgroup_iter(NULL, NULL, NULL);
        do {
                struct lruvec *lruvec = mem_cgroup_lruvec(memcg, pgdat);
 
-               if (lruvec_is_reclaimable(lruvec, sc, min_ttl)) {
-                       mem_cgroup_iter_break(NULL, memcg);
-                       return;
-               }
+               mem_cgroup_calculate_protection(NULL, memcg);
 
-               cond_resched();
+               if (!reclaimable)
+                       reclaimable = lruvec_is_reclaimable(lruvec, sc, min_ttl);
        } while ((memcg = mem_cgroup_iter(NULL, memcg, NULL)));
 
        /*
@@ -3994,7 +4015,7 @@ static void lru_gen_age_node(struct pglist_data *pgdat, struct scan_control *sc)
         * younger than min_ttl. However, another possibility is all memcgs are
         * either too small or below min.
         */
-       if (mutex_trylock(&oom_lock)) {
+       if (!reclaimable && mutex_trylock(&oom_lock)) {
                struct oom_control oc = {
                        .gfp_mask = sc->gfp_mask,
                };
@@ -4786,8 +4807,7 @@ static int shrink_one(struct lruvec *lruvec, struct scan_control *sc)
        struct mem_cgroup *memcg = lruvec_memcg(lruvec);
        struct pglist_data *pgdat = lruvec_pgdat(lruvec);
 
-       mem_cgroup_calculate_protection(NULL, memcg);
-
+       /* lru_gen_age_node() called mem_cgroup_calculate_protection() */
        if (mem_cgroup_below_min(NULL, memcg))
                return MEMCG_LRU_YOUNG;
 
@@ -4911,32 +4931,6 @@ static void lru_gen_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc
        blk_finish_plug(&plug);
 }
 
-static void set_initial_priority(struct pglist_data *pgdat, struct scan_control *sc)
-{
-       int priority;
-       unsigned long reclaimable;
-
-       if (sc->priority != DEF_PRIORITY || sc->nr_to_reclaim < MIN_LRU_BATCH)
-               return;
-       /*
-        * Determine the initial priority based on
-        * (total >> priority) * reclaimed_to_scanned_ratio = nr_to_reclaim,
-        * where reclaimed_to_scanned_ratio = inactive / total.
-        */
-       reclaimable = node_page_state(pgdat, NR_INACTIVE_FILE);
-       if (can_reclaim_anon_pages(NULL, pgdat->node_id, sc))
-               reclaimable += node_page_state(pgdat, NR_INACTIVE_ANON);
-
-       /* round down reclaimable and round up sc->nr_to_reclaim */
-       priority = fls_long(reclaimable) - 1 - fls_long(sc->nr_to_reclaim - 1);
-
-       /*
-        * The estimation is based on LRU pages only, so cap it to prevent
-        * overshoots of shrinker objects by large margins.
-        */
-       sc->priority = clamp(priority, DEF_PRIORITY / 2, DEF_PRIORITY);
-}
-
 static void lru_gen_shrink_node(struct pglist_data *pgdat, struct scan_control *sc)
 {
        struct blk_plug plug;