extern int fragmentation_index(struct zone *zone, unsigned int order);
 extern unsigned long try_to_compact_pages(struct zonelist *zonelist,
                        int order, gfp_t gfp_mask, nodemask_t *mask);
+extern unsigned long compaction_suitable(struct zone *zone, int order);
+extern unsigned long compact_zone_order(struct zone *zone, int order,
+                                               gfp_t gfp_mask);
 
 /* Do not skip compaction more than 64 times */
 #define COMPACT_MAX_DEFER_SHIFT 6
        return COMPACT_CONTINUE;
 }
 
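+/*
+ * When CONFIG_COMPACTION is not set, report compaction as unsuitable and
+ * make compact_zone_order() a no-op.
+ */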
+static inline unsigned long compaction_suitable(struct zone *zone, int order)
+{
+       return COMPACT_SKIPPED;
+}
+
+static inline unsigned long compact_zone_order(struct zone *zone, int order,
+                                               gfp_t gfp_mask)
+{
+       return 0;
+}
+
 static inline void defer_compaction(struct zone *zone)
 {
 }
 
 #define NUMA_BUILD 0
 #endif
 
+/* This helps us avoid #ifdef CONFIG_COMPACTION */
+#ifdef CONFIG_COMPACTION
+#define COMPACTION_BUILD 1
+#else
+#define COMPACTION_BUILD 0
+#endif
+
 /* Rebuild everything on CONFIG_FTRACE_MCOUNT_RECORD */
 #ifdef CONFIG_FTRACE_MCOUNT_RECORD
 # define REBUILD_DUE_TO_FTRACE_MCOUNT_RECORD
 
        return COMPACT_CONTINUE;
 }
 
+/*
+ * compaction_suitable: Is this suitable to run compaction on this zone now?
+ * Returns
+ *   COMPACT_SKIPPED  - If there are too few free pages for compaction
+ *   COMPACT_PARTIAL  - If the allocation would succeed without compaction
+ *   COMPACT_CONTINUE - If compaction should run now
+ */
+unsigned long compaction_suitable(struct zone *zone, int order)
+{
+       int fragindex;
+       unsigned long watermark;
+
+       /*
+        * Watermarks for order-0 must be met for compaction. Note the 2UL.
+        * This is because during migration, copies of pages need to be
+        * allocated and for a short time, the footprint is higher
+        */
+       watermark = low_wmark_pages(zone) + (2UL << order);
+       if (!zone_watermark_ok(zone, 0, watermark, 0, 0))
+               return COMPACT_SKIPPED;
+
+       /*
+        * fragmentation index determines if allocation failures are due to
+        * low memory or external fragmentation
+        *
+        * index of -1 implies allocations might succeed depending on watermarks
+        * index towards 0 implies failure is due to lack of memory
+        * index towards 1000 implies failure is due to fragmentation
+        *
+        * Only compact if a failure would be due to fragmentation.
+        */
+       fragindex = fragmentation_index(zone, order);
+       if (fragindex >= 0 && fragindex <= sysctl_extfrag_threshold)
+               return COMPACT_SKIPPED;
+
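+       /*
+        * An index of -1 with the order-N watermark already met means the
+        * allocation should succeed without compaction
+        */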
+       if (fragindex == -1 && zone_watermark_ok(zone, order, watermark, 0, 0))
+               return COMPACT_PARTIAL;
+
+       return COMPACT_CONTINUE;
+}
+
 static int compact_zone(struct zone *zone, struct compact_control *cc)
 {
        int ret;
 
+       ret = compaction_suitable(zone, cc->order);
+       switch (ret) {
+       case COMPACT_PARTIAL:
+       case COMPACT_SKIPPED:
+               /* Compaction is likely to fail */
+               return ret;
+       case COMPACT_CONTINUE:
+               /* Fall through to compaction */
+               ;
+       }
+
        /* Setup to move all movable pages to the end of the zone */
        cc->migrate_pfn = zone->zone_start_pfn;
        cc->free_pfn = cc->migrate_pfn + zone->spanned_pages;
        return ret;
 }
 
-static unsigned long compact_zone_order(struct zone *zone,
+unsigned long compact_zone_order(struct zone *zone,
                                                int order, gfp_t gfp_mask)
 {
        struct compact_control cc = {
        enum zone_type high_zoneidx = gfp_zone(gfp_mask);
        int may_enter_fs = gfp_mask & __GFP_FS;
        int may_perform_io = gfp_mask & __GFP_IO;
-       unsigned long watermark;
        struct zoneref *z;
        struct zone *zone;
        int rc = COMPACT_SKIPPED;
        /* Compact each zone in the list */
        for_each_zone_zonelist_nodemask(zone, z, zonelist, high_zoneidx,
                                                                nodemask) {
-               int fragindex;
                int status;
 
-               /*
-                * Watermarks for order-0 must be met for compaction. Note
-                * the 2UL. This is because during migration, copies of
-                * pages need to be allocated and for a short time, the
-                * footprint is higher
-                */
-               watermark = low_wmark_pages(zone) + (2UL << order);
-               if (!zone_watermark_ok(zone, 0, watermark, 0, 0))
-                       continue;
-
-               /*
-                * fragmentation index determines if allocation failures are
-                * due to low memory or external fragmentation
-                *
-                * index of -1 implies allocations might succeed depending
-                *      on watermarks
-                * index towards 0 implies failure is due to lack of memory
-                * index towards 1000 implies failure is due to fragmentation
-                *
-                * Only compact if a failure would be due to fragmentation.
-                */
-               fragindex = fragmentation_index(zone, order);
-               if (fragindex >= 0 && fragindex <= sysctl_extfrag_threshold)
-                       continue;
-
-               if (fragindex == -1 && zone_watermark_ok(zone, order, watermark, 0, 0)) {
-                       rc = COMPACT_PARTIAL;
-                       break;
-               }
-
                status = compact_zone_order(zone, order, gfp_mask);
                rc = max(status, rc);
 
-               if (zone_watermark_ok(zone, order, watermark, 0, 0))
+               /* If a normal allocation would succeed, stop compacting */
+               if (zone_watermark_ok(zone, order, low_wmark_pages(zone), 0, 0))
                        break;
        }
 
 
        if (!trylock_page(page)) {
                if (!force)
                        goto move_newpage;
+
+               /*
+                * It's not safe for direct compaction to call lock_page.
+                * For example, during page readahead pages are added locked
+                * to the LRU. Later, when the IO completes the pages are
+                * marked uptodate and unlocked. However, the queueing
+                * could be merging multiple pages for one bio (e.g.
+                * mpage_readpages). If an allocation happens for the
+                * second or third page, the process can end up locking
+                * the same page twice and deadlocking. Rather than
+                * trying to be clever about what pages can be locked,
+                * avoid the use of lock_page for direct compaction
+                * altogether.
+                */
+               if (current->flags & PF_MEMALLOC)
+                       goto move_newpage;
+
                lock_page(page);
        }
 
 
        int migratetype, unsigned long *did_some_progress)
 {
        struct page *page;
+       struct task_struct *tsk = current;
 
        if (!order || compaction_deferred(preferred_zone))
                return NULL;
 
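+       /*
+        * Flag the task as being in direct compaction so that page
+        * migration can detect it and avoid calling lock_page()
+        */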
+       tsk->flags |= PF_MEMALLOC;
        *did_some_progress = try_to_compact_pages(zonelist, order, gfp_mask,
                                                                nodemask);
+       tsk->flags &= ~PF_MEMALLOC;
        if (*did_some_progress != COMPACT_SKIPPED) {
 
                /* Page migration frees to the PCP lists but we want merging */
                /* Wait for some write requests to complete then retry */
                wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/50);
                goto rebalance;
+       } else {
+               /*
+                * High-order allocations do not necessarily loop after
+                * direct reclaim, and reclaim/compaction depends on compaction
+                * being called after reclaim, so call directly if necessary
+                */
+               page = __alloc_pages_direct_compact(gfp_mask, order,
+                                       zonelist, high_zoneidx,
+                                       nodemask,
+                                       alloc_flags, preferred_zone,
+                                       migratetype, &did_some_progress);
+               if (page)
+                       goto got_pg;
        }
 
 nopage:
 
 #include <linux/topology.h>
 #include <linux/cpu.h>
 #include <linux/cpuset.h>
+#include <linux/compaction.h>
 #include <linux/notifier.h>
 #include <linux/rwsem.h>
 #include <linux/delay.h>
  * LUMPY_MODE_CONTIGRECLAIM: For high-order allocations, take a reference
  *                     page from the LRU and reclaim all pages within a
  *                     naturally aligned range
+ * LUMPY_MODE_COMPACTION: For high-order allocations, reclaim a number of
+ *                     order-0 pages and then compact the zone
  */
 typedef unsigned __bitwise__ lumpy_mode;
 #define LUMPY_MODE_SINGLE              ((__force lumpy_mode)0x01u)
 #define LUMPY_MODE_ASYNC               ((__force lumpy_mode)0x02u)
 #define LUMPY_MODE_SYNC                        ((__force lumpy_mode)0x04u)
 #define LUMPY_MODE_CONTIGRECLAIM       ((__force lumpy_mode)0x08u)
+#define LUMPY_MODE_COMPACTION          ((__force lumpy_mode)0x10u)
 
 struct scan_control {
        /* Incremented by the number of inactive pages that were scanned */
        lumpy_mode syncmode = sync ? LUMPY_MODE_SYNC : LUMPY_MODE_ASYNC;
 
        /*
-        * Some reclaim have alredy been failed. No worth to try synchronous
-        * lumpy reclaim.
+        * Initially assume we are entering either lumpy reclaim or
+        * reclaim/compaction. Depending on the order, we will either set the
+        * sync mode or just reclaim order-0 pages later.
         */
-       if (sync && sc->lumpy_reclaim_mode & LUMPY_MODE_SINGLE)
-               return;
+       if (COMPACTION_BUILD)
+               sc->lumpy_reclaim_mode = LUMPY_MODE_COMPACTION;
+       else
+               sc->lumpy_reclaim_mode = LUMPY_MODE_CONTIGRECLAIM;
 
        /*
-        * If we need a large contiguous chunk of memory, or have
-        * trouble getting a small set of contiguous pages, we
-        * will reclaim both active and inactive pages.
+        * Avoid using lumpy reclaim or reclaim/compaction if possible by
+        * restricting it to costly allocations or to cases where reclaim
+        * is under memory pressure
         */
-       sc->lumpy_reclaim_mode = LUMPY_MODE_CONTIGRECLAIM;
        if (sc->order > PAGE_ALLOC_COSTLY_ORDER)
                sc->lumpy_reclaim_mode |= syncmode;
        else if (sc->order && priority < DEF_PRIORITY - 2)
        if (scanning_global_lru(sc)) {
                nr_taken = isolate_pages_global(nr_to_scan,
                        &page_list, &nr_scanned, sc->order,
-                       sc->lumpy_reclaim_mode & LUMPY_MODE_SINGLE ?
-                                       ISOLATE_INACTIVE : ISOLATE_BOTH,
+                       sc->lumpy_reclaim_mode & LUMPY_MODE_CONTIGRECLAIM ?
+                                       ISOLATE_BOTH : ISOLATE_INACTIVE,
                        zone, 0, file);
                zone->pages_scanned += nr_scanned;
                if (current_is_kswapd())
        } else {
                nr_taken = mem_cgroup_isolate_pages(nr_to_scan,
                        &page_list, &nr_scanned, sc->order,
-                       sc->lumpy_reclaim_mode & LUMPY_MODE_SINGLE ?
-                                       ISOLATE_INACTIVE : ISOLATE_BOTH,
+                       sc->lumpy_reclaim_mode & LUMPY_MODE_CONTIGRECLAIM ?
+                                       ISOLATE_BOTH : ISOLATE_INACTIVE,
                        zone, sc->mem_cgroup,
                        0, file);
                /*
        }
 }
 
+/*
+ * Reclaim/compaction depends on a number of pages being freed. To avoid
+ * disruption to the system, a small number of order-0 pages continue to be
+ * rotated and reclaimed in the normal fashion. However, by the time we get
+ * back to the allocator and call try_to_compact_pages(), we ensure that
+ * there are enough free pages for compaction to be likely to succeed
+ */
+static inline bool should_continue_reclaim(struct zone *zone,
+                                       unsigned long nr_reclaimed,
+                                       unsigned long nr_scanned,
+                                       struct scan_control *sc)
+{
+       unsigned long pages_for_compaction;
+       unsigned long inactive_lru_pages;
+
+       /* If not in reclaim/compaction mode, stop */
+       if (!(sc->lumpy_reclaim_mode & LUMPY_MODE_COMPACTION))
+               return false;
+
+       /*
+        * If we failed to reclaim and have scanned the full list, stop.
+        * NOTE: Checking just nr_reclaimed would exit reclaim/compaction far
+        *       faster but would be less likely to satisfy the allocation.
+        *       If this is desirable, use __GFP_REPEAT to decide whether
+        *       both reclaimed and scanned should be checked or just
+        *       reclaimed
+        */
+       if (!nr_reclaimed && !nr_scanned)
+               return false;
+
+       /*
+        * If we have not reclaimed enough pages for compaction and the
+        * inactive lists are large enough, continue reclaiming
+        */
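+       /* 2UL << order matches the extra pages compaction_suitable() requires */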
+       pages_for_compaction = (2UL << sc->order);
+       inactive_lru_pages = zone_nr_lru_pages(zone, sc, LRU_INACTIVE_ANON) +
+                               zone_nr_lru_pages(zone, sc, LRU_INACTIVE_FILE);
+       if (sc->nr_reclaimed < pages_for_compaction &&
+                       inactive_lru_pages > pages_for_compaction)
+               return true;
+
+       /* If compaction would go ahead or the allocation would succeed, stop */
+       switch (compaction_suitable(zone, sc->order)) {
+       case COMPACT_PARTIAL:
+       case COMPACT_CONTINUE:
+               return false;
+       default:
+               return true;
+       }
+}
+
 /*
  * This is a basic per-zone page freer.  Used by both kswapd and direct reclaim.
  */
        unsigned long nr[NR_LRU_LISTS];
        unsigned long nr_to_scan;
        enum lru_list l;
-       unsigned long nr_reclaimed = sc->nr_reclaimed;
+       unsigned long nr_reclaimed;
        unsigned long nr_to_reclaim = sc->nr_to_reclaim;
+       unsigned long nr_scanned = sc->nr_scanned;
 
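+       /*
+        * Reclaim in rounds; should_continue_reclaim() decides whether to
+        * restart and free more pages for compaction
+        */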
+restart:
+       nr_reclaimed = 0;
        get_scan_count(zone, sc, nr, priority);
 
        while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] ||
                if (nr_reclaimed >= nr_to_reclaim && priority < DEF_PRIORITY)
                        break;
        }
-
-       sc->nr_reclaimed = nr_reclaimed;
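+       /* Accumulate progress across restarts of the reclaim loop */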
+       sc->nr_reclaimed += nr_reclaimed;
 
        /*
         * Even if we did not try to evict anon pages at all, we want to
        if (inactive_anon_is_low(zone, sc))
                shrink_active_list(SWAP_CLUSTER_MAX, zone, sc, priority, 0);
 
+       /* reclaim/compaction might need reclaim to continue */
+       if (should_continue_reclaim(zone, nr_reclaimed,
+                                       sc->nr_scanned - nr_scanned, sc))
+               goto restart;
+
        throttle_vm_writeout(sc->gfp_mask);
 }
 
                            total_scanned > sc.nr_reclaimed + sc.nr_reclaimed / 2)
                                sc.may_writepage = 1;
 
+                       /*
+                        * Compact the zone for higher orders to reduce
+                        * latencies for higher-order allocations that
+                        * would ordinarily call try_to_compact_pages()
+                        */
+                       if (sc.order > PAGE_ALLOC_COSTLY_ORDER)
+                               compact_zone_order(zone, sc.order, sc.gfp_mask);
+
                        if (!zone_watermark_ok_safe(zone, order,
                                        high_wmark_pages(zone), end_zone, 0)) {
                                all_zones_ok = 0;