 void clear_bdi_congested(struct backing_dev_info *bdi, int sync);
 void set_bdi_congested(struct backing_dev_info *bdi, int sync);
 long congestion_wait(int sync, long timeout);
-
+long wait_iff_congested(struct zone *zone, int sync, long timeout);
 
 static inline bool bdi_cap_writeback_dirty(struct backing_dev_info *bdi)
 {
 
 typedef enum {
        ZONE_RECLAIM_LOCKED,            /* prevents concurrent reclaim */
        ZONE_OOM_LOCKED,                /* zone is in OOM killer zonelist */
+       ZONE_CONGESTED,                 /* zone has many dirty pages backed by
+                                        * a congested BDI
+                                        */
 } zone_flags_t;
 
 static inline void zone_set_flag(struct zone *zone, zone_flags_t flag)
        clear_bit(flag, &zone->flags);
 }
 
+static inline int zone_is_reclaim_congested(const struct zone *zone)
+{
+       return test_bit(ZONE_CONGESTED, &zone->flags);
+}
+
 static inline int zone_is_reclaim_locked(const struct zone *zone)
 {
        return test_bit(ZONE_RECLAIM_LOCKED, &zone->flags);
 
        TP_ARGS(usec_timeout, usec_delayed)
 );
 
+DEFINE_EVENT(writeback_congest_waited_template, writeback_wait_iff_congested,
+
+       TP_PROTO(unsigned int usec_timeout, unsigned int usec_delayed),
+
+       TP_ARGS(usec_timeout, usec_delayed)
+);
+
 #endif /* _TRACE_WRITEBACK_H */
 
 /* This part must be outside protection */
 
                __WAIT_QUEUE_HEAD_INITIALIZER(congestion_wqh[0]),
                __WAIT_QUEUE_HEAD_INITIALIZER(congestion_wqh[1])
        };
+static atomic_t nr_bdi_congested[2];
 
 void clear_bdi_congested(struct backing_dev_info *bdi, int sync)
 {
        wait_queue_head_t *wqh = &congestion_wqh[sync];
 
        bit = sync ? BDI_sync_congested : BDI_async_congested;
-       clear_bit(bit, &bdi->state);
+       if (test_and_clear_bit(bit, &bdi->state))
+               atomic_dec(&nr_bdi_congested[sync]);
        smp_mb__after_clear_bit();
        if (waitqueue_active(wqh))
                wake_up(wqh);
        enum bdi_state bit;
 
        bit = sync ? BDI_sync_congested : BDI_async_congested;
-       set_bit(bit, &bdi->state);
+       if (!test_and_set_bit(bit, &bdi->state))
+               atomic_inc(&nr_bdi_congested[sync]);
 }
 EXPORT_SYMBOL(set_bdi_congested);
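
The test-and-modify bitops above are what keep nr_bdi_congested honest: both
helpers may be called repeatedly for a BDI that is already in the requested
state, so the counter must only move on a real 0->1 or 1->0 transition of the
congestion bit. A minimal userspace sketch of the same transition-counting
pattern (illustrative only, not part of the patch; all names are hypothetical):

	#include <stdatomic.h>
	#include <stdbool.h>
	#include <stdio.h>

	static atomic_int nr_congested;	/* analogue of nr_bdi_congested[sync] */

	/* Return the old value, like test_and_set_bit()/test_and_clear_bit() */
	static bool test_and_assign(bool *bit, bool val)
	{
		bool old = *bit;
		*bit = val;
		return old;
	}

	int main(void)
	{
		bool congested = false;

		/* Two "set" calls for one BDI: only the 0->1 transition counts */
		if (!test_and_assign(&congested, true))
			atomic_fetch_add(&nr_congested, 1);
		if (!test_and_assign(&congested, true))
			atomic_fetch_add(&nr_congested, 1);	/* skipped */

		/* One "clear" call: the 1->0 transition decrements */
		if (test_and_assign(&congested, false))
			atomic_fetch_sub(&nr_congested, 1);

		/* A plain set_bit()/inc pairing would print 1 here, not 0 */
		printf("%d\n", atomic_load(&nr_congested));
		return 0;
	}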
 
 }
 EXPORT_SYMBOL(congestion_wait);
 
+/**
+ * wait_iff_congested - Conditionally wait for a backing_dev to become uncongested or a zone to complete writes
+ * @zone: A zone to check if it is heavily congested
+ * @sync: SYNC or ASYNC IO
+ * @timeout: timeout in jiffies
+ *
+ * If any backing_dev is congested and the given @zone has experienced
+ * recent congestion, this waits for up to @timeout jiffies for either a
+ * BDI to exit congestion on the given @sync queue or for a write to
+ * complete.
+ *
+ * In the absence of zone congestion, cond_resched() is called to yield
+ * the processor if necessary but the function otherwise does not sleep.
+ *
+ * The return value is 0 if the sleep lasted the full timeout. Otherwise,
+ * it is the number of jiffies still remaining when the function returned.
+ * A return value equal to @timeout means the function did not sleep.
+ */
+long wait_iff_congested(struct zone *zone, int sync, long timeout)
+{
+       long ret;
+       unsigned long start = jiffies;
+       DEFINE_WAIT(wait);
+       wait_queue_head_t *wqh = &congestion_wqh[sync];
+
+       /*
+        * If there is no congestion, or heavy congestion is not being
+        * encountered in the current zone, yield if necessary instead
+        * of sleeping on the congestion queue
+        */
+       if (atomic_read(&nr_bdi_congested[sync]) == 0 ||
+                       !zone_is_reclaim_congested(zone)) {
+               cond_resched();
+
+               /* In case we scheduled, work out time remaining */
+               ret = timeout - (jiffies - start);
+               if (ret < 0)
+                       ret = 0;
+
+               goto out;
+       }
+
+       /* Sleep until uncongested or a write happens */
+       prepare_to_wait(wqh, &wait, TASK_UNINTERRUPTIBLE);
+       ret = io_schedule_timeout(timeout);
+       finish_wait(wqh, &wait);
+
+out:
+       trace_writeback_wait_iff_congested(jiffies_to_usecs(timeout),
+                                       jiffies_to_usecs(jiffies - start));
+
+       return ret;
+}
+EXPORT_SYMBOL(wait_iff_congested);
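
As a rough usage sketch (not part of the patch): per the comment above, a
caller can compare the return value against the timeout it passed in to tell
whether the function slept at all. preferred_zone stands for whatever zone
the caller is allocating from, as at the converted call sites below:

	long remaining;

	/* Back off only if preferred_zone has seen recent BDI congestion */
	remaining = wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/50);
	if (remaining == HZ/50) {
		/* Did not sleep: at most cond_resched() was called */
	}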
 
                        preferred_zone, migratetype);
 
                if (!page && gfp_mask & __GFP_NOFAIL)
-                       congestion_wait(BLK_RW_ASYNC, HZ/50);
+                       wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/50);
        } while (!page && (gfp_mask & __GFP_NOFAIL));
 
        return page;
        pages_reclaimed += did_some_progress;
        if (should_alloc_retry(gfp_mask, order, pages_reclaimed)) {
                /* Wait for some write requests to complete then retry */
-               congestion_wait(BLK_RW_ASYNC, HZ/50);
+               wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/50);
                goto rebalance;
        }
 
 
        }
        if (mapping->a_ops->writepage == NULL)
                return PAGE_ACTIVATE;
-       if (!may_write_to_queue(mapping->backing_dev_info, sc)) {
-               disable_lumpy_reclaim_mode(sc);
+       if (!may_write_to_queue(mapping->backing_dev_info, sc))
                return PAGE_KEEP;
-       }
 
        if (clear_page_dirty_for_io(page)) {
                int res;
  * shrink_page_list() returns the number of reclaimed pages
  */
 static unsigned long shrink_page_list(struct list_head *page_list,
+                                     struct zone *zone,
                                      struct scan_control *sc)
 {
        LIST_HEAD(ret_pages);
        LIST_HEAD(free_pages);
        int pgactivate = 0;
+       unsigned long nr_dirty = 0;
+       unsigned long nr_congested = 0;
        unsigned long nr_reclaimed = 0;
 
        cond_resched();
                        goto keep;
 
                VM_BUG_ON(PageActive(page));
+               VM_BUG_ON(page_zone(page) != zone);
 
                sc->nr_scanned++;
 
                }
 
                if (PageDirty(page)) {
+                       nr_dirty++;
+
                        if (references == PAGEREF_RECLAIM_CLEAN)
                                goto keep_locked;
                        if (!may_enter_fs)
                        /* Page is dirty, try to write it out here */
                        switch (pageout(page, mapping, sc)) {
                        case PAGE_KEEP:
+                               nr_congested++;
                                goto keep_locked;
                        case PAGE_ACTIVATE:
                                goto activate_locked;
                VM_BUG_ON(PageLRU(page) || PageUnevictable(page));
        }
 
+       /*
+        * Tag a zone as congested if all the dirty pages encountered were
+        * backed by a congested BDI. In this case, reclaimers should just
+        * back off and wait for congestion to clear because further reclaim
+        * will encounter the same problem
+        */
+       if (nr_dirty == nr_congested)
+               zone_set_flag(zone, ZONE_CONGESTED);
+
        free_page_list(&free_pages);
 
        list_splice(&ret_pages, page_list);
 
        spin_unlock_irq(&zone->lru_lock);
 
-       nr_reclaimed = shrink_page_list(&page_list, sc);
+       nr_reclaimed = shrink_page_list(&page_list, zone, sc);
 
        /* Check if we should synchronously wait for writeback */
        if (should_reclaim_stall(nr_taken, nr_reclaimed, priority, sc)) {
                set_lumpy_reclaim_mode(priority, sc, true);
-               nr_reclaimed += shrink_page_list(&page_list, sc);
+               nr_reclaimed += shrink_page_list(&page_list, zone, sc);
        }
 
        local_irq_disable();
 
                /* Take a nap, wait for some writeback to complete */
                if (!sc->hibernation_mode && sc->nr_scanned &&
-                   priority < DEF_PRIORITY - 2)
-                       congestion_wait(BLK_RW_ASYNC, HZ/10);
+                   priority < DEF_PRIORITY - 2) {
+                       struct zone *preferred_zone;
+
+                       first_zones_zonelist(zonelist, gfp_zone(sc->gfp_mask),
+                                                       NULL, &preferred_zone);
+                       wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/10);
+               }
        }
 
 out:
                                if (!zone_watermark_ok(zone, order,
                                            min_wmark_pages(zone), end_zone, 0))
                                        has_under_min_watermark_zone = 1;
+                       } else {
+                               /*
+                                * If a zone reaches its high watermark,
+                                * consider it to be no longer congested. It's
+                                * possible there are dirty pages backed by
+                                * congested BDIs but as pressure is relieved,
+                                * speculatively avoid congestion waits
+                                */
+                               zone_clear_flag(zone, ZONE_CONGESTED);
                        }
 
                }