may_enter_fs = (sc->gfp_mask & __GFP_FS) ||
                        (PageSwapCache(page) && (sc->gfp_mask & __GFP_IO));
 
+               /*
+                * If a page at the tail of the LRU is under writeback, there
+                * are three cases to consider.
+                *
+                * 1) If reclaim is encountering an excessive number of pages
+                *    under writeback and this page is both under writeback and
+                *    PageReclaim then it indicates that pages are being queued
+                *    for IO but are being recycled through the LRU before the
+                *    IO can complete. Waiting on the page itself risks an
+                *    indefinite stall if it is impossible to writeback the
+                *    page due to IO error or disconnected storage so instead
+                *    block for HZ/10 or until some IO completes then clear the
+                *    ZONE_WRITEBACK flag to recheck if the condition exists.
+                *
+                * 2) Global reclaim encounters a page, memcg encounters a
+                *    page that is not marked for immediate reclaim or
+                *    the caller does not have __GFP_IO. In this case mark
+                *    the page for immediate reclaim and continue scanning.
+                *
+                *    __GFP_IO is checked  because a loop driver thread might
+                *    enter reclaim, and deadlock if it waits on a page for
+                *    which it is needed to do the write (loop masks off
+                *    __GFP_IO|__GFP_FS for this reason); but more thought
+                *    would probably show more reasons.
+                *
+                *    Don't require __GFP_FS, since we're not going into the
+                *    FS, just waiting on its writeback completion. Worryingly,
+                *    ext4 gfs2 and xfs allocate pages with
+                *    grab_cache_page_write_begin(,,AOP_FLAG_NOFS), so testing
+                *    may_enter_fs here is liable to OOM on them.
+                *
+                * 3) memcg encounters a page that is not already marked
+                *    PageReclaim. memcg does not have any dirty pages
+                *    throttling so we could easily OOM just because too many
+                *    pages are in writeback and there is nothing else to
+                *    reclaim. Wait for the writeback to complete.
+                */
                if (PageWriteback(page)) {
-                       /*
-                        * memcg doesn't have any dirty pages throttling so we
-                        * could easily OOM just because too many pages are in
-                        * writeback and there is nothing else to reclaim.
-                        *
-                        * Check __GFP_IO, certainly because a loop driver
-                        * thread might enter reclaim, and deadlock if it waits
-                        * on a page for which it is needed to do the write
-                        * (loop masks off __GFP_IO|__GFP_FS for this reason);
-                        * but more thought would probably show more reasons.
-                        *
-                        * Don't require __GFP_FS, since we're not going into
-                        * the FS, just waiting on its writeback completion.
-                        * Worryingly, ext4 gfs2 and xfs allocate pages with
-                        * grab_cache_page_write_begin(,,AOP_FLAG_NOFS), so
-                        * testing may_enter_fs here is liable to OOM on them.
-                        */
-                       if (global_reclaim(sc) ||
+                       /* Case 1 above */
+                       if (current_is_kswapd() &&
+                           PageReclaim(page) &&
+                           zone_is_reclaim_writeback(zone)) {
+                               unlock_page(page);
+                               congestion_wait(BLK_RW_ASYNC, HZ/10);
+                               zone_clear_flag(zone, ZONE_WRITEBACK);
+                               goto keep;
+
+                       /* Case 2 above */
+                       } else if (global_reclaim(sc) ||
                            !PageReclaim(page) || !(sc->gfp_mask & __GFP_IO)) {
                                /*
                                 * This is slightly racy - end_page_writeback()
                                 */
                                SetPageReclaim(page);
                                nr_writeback++;
+
                                goto keep_locked;
+
+                       /* Case 3 above */
+                       } else {
+                               wait_on_page_writeback(page);
                        }
-                       wait_on_page_writeback(page);
                }
 
                if (!force_reclaim)
         *                     isolated page is PageWriteback
         */
        if (nr_writeback && nr_writeback >=
-                       (nr_taken >> (DEF_PRIORITY - sc->priority)))
+                       (nr_taken >> (DEF_PRIORITY - sc->priority))) {
                wait_iff_congested(zone, BLK_RW_ASYNC, HZ/10);
+               zone_set_flag(zone, ZONE_WRITEBACK);
+       }
 
        /*
         * Similarly, if many dirty pages are encountered that are not
  * the high watermark.
  *
  * Returns true if kswapd scanned at least the requested number of pages to
- * reclaim. This is used to determine if the scanning priority needs to be
- * raised.
+ * reclaim or if the lack of progress was due to pages under writeback.
+ * This is used to determine if the scanning priority needs to be raised.
  */
 static bool kswapd_shrink_zone(struct zone *zone,
                               struct scan_control *sc,
        if (nr_slab == 0 && !zone_reclaimable(zone))
                zone->all_unreclaimable = 1;
 
+       zone_clear_flag(zone, ZONE_WRITEBACK);
+
        return sc->nr_scanned >= sc->nr_to_reclaim;
 }