mm: vmscan: block kswapd if it is encountering pages under writeback

author Mel Gorman <mgorman@suse.de>

Wed, 3 Jul 2013 22:01:51 +0000 (15:01 -0700)

committer Linus Torvalds <torvalds@linux-foundation.org>

Wed, 3 Jul 2013 23:07:28 +0000 (16:07 -0700)
author Mel Gorman <mgorman@suse.de>
Wed, 3 Jul 2013 22:01:51 +0000 (15:01 -0700)
committer Linus Torvalds <torvalds@linux-foundation.org>
Wed, 3 Jul 2013 23:07:28 +0000 (16:07 -0700)
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h

index 2aaf72f7e345f0841261db1a25a8c91e529f6f07..fce64afba0426851dee4bcd7dad4a78d6352e47a 100644 (file)
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -499,6 +499,9 @@ typedef enum {
                                          * many dirty file pages at the tail
                                          * of the LRU.
                                          */
+       ZONE_WRITEBACK,                 /* reclaim scanning has recently found
+                                        * many pages under writeback
+                                        */
  } zone_flags_t;
  
  static inline void zone_set_flag(struct zone *zone, zone_flags_t flag)
@@ -526,6 +529,11 @@ static inline int zone_is_reclaim_dirty(const struct zone *zone)
         return test_bit(ZONE_TAIL_LRU_DIRTY, &zone->flags);
  }
  
+static inline int zone_is_reclaim_writeback(const struct zone *zone)
+{
+       return test_bit(ZONE_WRITEBACK, &zone->flags);
+}
+
  static inline int zone_is_reclaim_locked(const struct zone *zone)
  {
         return test_bit(ZONE_RECLAIM_LOCKED, &zone->flags);
diff --git a/mm/vmscan.c b/mm/vmscan.c

index d6c916d808ba2248e112a3dc3767647abf857fc3..1109de0c35bf450605d9be2657706041d4a66cb7 100644 (file)
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -724,25 +724,55 @@ static unsigned long shrink_page_list(struct list_head *page_list,
                 may_enter_fs = (sc->gfp_mask & __GFP_FS) ||
                         (PageSwapCache(page) && (sc->gfp_mask & __GFP_IO));
  
+               /*
+                * If a page at the tail of the LRU is under writeback, there
+                * are three cases to consider.
+                *
+                * 1) If reclaim is encountering an excessive number of pages
+                *    under writeback and this page is both under writeback and
+                *    PageReclaim then it indicates that pages are being queued
+                *    for IO but are being recycled through the LRU before the
+                *    IO can complete. Waiting on the page itself risks an
+                *    indefinite stall if it is impossible to writeback the
+                *    page due to IO error or disconnected storage so instead
+                *    block for HZ/10 or until some IO completes then clear the
+                *    ZONE_WRITEBACK flag to recheck if the condition exists.
+                *
+                * 2) Global reclaim encounters a page, memcg encounters a
+                *    page that is not marked for immediate reclaim or
+                *    the caller does not have __GFP_IO. In this case mark
+                *    the page for immediate reclaim and continue scanning.
+                *
+                *    __GFP_IO is checked because a loop driver thread might
+                *    enter reclaim, and deadlock if it waits on a page for
+                *    which it is needed to do the write (loop masks off
+                *    __GFP_IO|__GFP_FS for this reason); but more thought
+                *    would probably show more reasons.
+                *
+                *    Don't require __GFP_FS, since we're not going into the
+                *    FS, just waiting on its writeback completion. Worryingly,
+                *    ext4 gfs2 and xfs allocate pages with
+                *    grab_cache_page_write_begin(,,AOP_FLAG_NOFS), so testing
+                *    may_enter_fs here is liable to OOM on them.
+                *
+                * 3) memcg encounters a page that is not already marked
+                *    PageReclaim. memcg does not have any dirty pages
+                *    throttling so we could easily OOM just because too many
+                *    pages are in writeback and there is nothing else to
+                *    reclaim. Wait for the writeback to complete.
+                */
                 if (PageWriteback(page)) {
-                       /*
-                        * memcg doesn't have any dirty pages throttling so we
-                        * could easily OOM just because too many pages are in
-                        * writeback and there is nothing else to reclaim.
-                        *
-                        * Check __GFP_IO, certainly because a loop driver
-                        * thread might enter reclaim, and deadlock if it waits
-                        * on a page for which it is needed to do the write
-                        * (loop masks off __GFP_IO|__GFP_FS for this reason);
-                        * but more thought would probably show more reasons.
-                        *
-                        * Don't require __GFP_FS, since we're not going into
-                        * the FS, just waiting on its writeback completion.
-                        * Worryingly, ext4 gfs2 and xfs allocate pages with
-                        * grab_cache_page_write_begin(,,AOP_FLAG_NOFS), so
-                        * testing may_enter_fs here is liable to OOM on them.
-                        */
-                       if (global_reclaim(sc) ||
+                       /* Case 1 above */
+                       if (current_is_kswapd() &&
+                           PageReclaim(page) &&
+                           zone_is_reclaim_writeback(zone)) {
+                               unlock_page(page);
+                               congestion_wait(BLK_RW_ASYNC, HZ/10);
+                               zone_clear_flag(zone, ZONE_WRITEBACK);
+                               goto keep;
+
+                       /* Case 2 above */
+                       } else if (global_reclaim(sc) ||
                             !PageReclaim(page) || !(sc->gfp_mask & __GFP_IO)) {
                                 /*
                                  * This is slightly racy - end_page_writeback()
@@ -757,9 +787,13 @@ static unsigned long shrink_page_list(struct list_head *page_list,
                                  */
                                 SetPageReclaim(page);
                                 nr_writeback++;
+
                                 goto keep_locked;
+
+                       /* Case 3 above */
+                       } else {
+                               wait_on_page_writeback(page);
                         }
-                       wait_on_page_writeback(page);
                 }
  
                 if (!force_reclaim)
@@ -1374,8 +1408,10 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
          *                     isolated page is PageWriteback
          */
         if (nr_writeback && nr_writeback >=
-                       (nr_taken >> (DEF_PRIORITY - sc->priority)))
+                       (nr_taken >> (DEF_PRIORITY - sc->priority))) {
                 wait_iff_congested(zone, BLK_RW_ASYNC, HZ/10);
+               zone_set_flag(zone, ZONE_WRITEBACK);
+       }
  
         /*
          * Similarly, if many dirty pages are encountered that are not
@@ -2669,8 +2705,8 @@ static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order, long remaining,
   * the high watermark.
   *
   * Returns true if kswapd scanned at least the requested number of pages to
- * reclaim. This is used to determine if the scanning priority needs to be
- * raised.
+ * reclaim or if the lack of progress was due to pages under writeback.
+ * This is used to determine if the scanning priority needs to be raised.
   */
  static bool kswapd_shrink_zone(struct zone *zone,
                                struct scan_control *sc,
@@ -2697,6 +2733,8 @@ static bool kswapd_shrink_zone(struct zone *zone,
         if (nr_slab == 0 && !zone_reclaimable(zone))
                 zone->all_unreclaimable = 1;
  
+       zone_clear_flag(zone, ZONE_WRITEBACK);
+
         return sc->nr_scanned >= sc->nr_to_reclaim;
  }
author	Mel Gorman <mgorman@suse.de>
	Wed, 3 Jul 2013 22:01:51 +0000 (15:01 -0700)
committer	Linus Torvalds <torvalds@linux-foundation.org>
	Wed, 3 Jul 2013 23:07:28 +0000 (16:07 -0700)
include/linux/mmzone.h		patch | blob | history
mm/vmscan.c		patch | blob | history