extern int fragmentation_index(struct zone *zone, unsigned int order);
 extern unsigned long try_to_compact_pages(struct zonelist *zonelist,
                        int order, gfp_t gfp_mask, nodemask_t *mask);
+extern unsigned long compaction_suitable(struct zone *zone, int order);
+extern unsigned long compact_zone_order(struct zone *zone, int order,
+                                               gfp_t gfp_mask);
 
 /* Do not skip compaction more than 64 times */
 #define COMPACT_MAX_DEFER_SHIFT 6
        return COMPACT_CONTINUE;
 }
 
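+/*
+ * When CONFIG_COMPACTION is not set, report compaction as unsuitable and
+ * make compact_zone_order() a no-op.
+ */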
+static inline unsigned long compaction_suitable(struct zone *zone, int order)
+{
+       return COMPACT_SKIPPED;
+}
+
+static inline unsigned long compact_zone_order(struct zone *zone, int order,
+                                               gfp_t gfp_mask)
+{
+       return 0;
+}
+
 static inline void defer_compaction(struct zone *zone)
 {
 }
 
 #define NUMA_BUILD 0
 #endif
 
+/* This helps us avoid #ifdef CONFIG_COMPACTION */
+#ifdef CONFIG_COMPACTION
+#define COMPACTION_BUILD 1
+#else
+#define COMPACTION_BUILD 0
+#endif
+
 /* Rebuild everything on CONFIG_FTRACE_MCOUNT_RECORD */
 #ifdef CONFIG_FTRACE_MCOUNT_RECORD
 # define REBUILD_DUE_TO_FTRACE_MCOUNT_RECORD
 
        return COMPACT_CONTINUE;
 }
 
+/*
+ * compaction_suitable: Is this suitable to run compaction on this zone now?
+ * Returns
+ *   COMPACT_SKIPPED  - If there are too few free pages for compaction
+ *   COMPACT_PARTIAL  - If the allocation would succeed without compaction
+ *   COMPACT_CONTINUE - If compaction should run now
+ */
+unsigned long compaction_suitable(struct zone *zone, int order)
+{
+       int fragindex;
+       unsigned long watermark;
+
+       /*
+        * Watermarks for order-0 must be met for compaction. Note the 2UL.
+        * This is because during migration, copies of pages need to be
+        * allocated and for a short time, the footprint is higher
+        */
+       watermark = low_wmark_pages(zone) + (2UL << order);
+       if (!zone_watermark_ok(zone, 0, watermark, 0, 0))
+               return COMPACT_SKIPPED;
+
+       /*
+        * fragmentation index determines if allocation failures are due to
+        * low memory or external fragmentation
+        *
+        * index of -1 implies allocations might succeed depending on watermarks
+        * index towards 0 implies failure is due to lack of memory
+        * index towards 1000 implies failure is due to fragmentation
+        *
+        * Only compact if a failure would be due to fragmentation.
+        */
+       fragindex = fragmentation_index(zone, order);
+       if (fragindex >= 0 && fragindex <= sysctl_extfrag_threshold)
+               return COMPACT_SKIPPED;
+
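+       /*
+        * An index of -1 with the order-N watermark already met means the
+        * allocation should succeed without compaction
+        */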
+       if (fragindex == -1 && zone_watermark_ok(zone, order, watermark, 0, 0))
+               return COMPACT_PARTIAL;
+
+       return COMPACT_CONTINUE;
+}
+
 static int compact_zone(struct zone *zone, struct compact_control *cc)
 {
        int ret;
 
+       ret = compaction_suitable(zone, cc->order);
+       switch (ret) {
+       case COMPACT_PARTIAL:
+       case COMPACT_SKIPPED:
+               /* Compaction is likely to fail */
+               return ret;
+       case COMPACT_CONTINUE:
+               /* Fall through to compaction */
+               ;
+       }
+
        /* Setup to move all movable pages to the end of the zone */
        cc->migrate_pfn = zone->zone_start_pfn;
        cc->free_pfn = cc->migrate_pfn + zone->spanned_pages;
        return ret;
 }
 
-static unsigned long compact_zone_order(struct zone *zone,
+unsigned long compact_zone_order(struct zone *zone,
                                                int order, gfp_t gfp_mask)
 {
        struct compact_control cc = {
        enum zone_type high_zoneidx = gfp_zone(gfp_mask);
        int may_enter_fs = gfp_mask & __GFP_FS;
        int may_perform_io = gfp_mask & __GFP_IO;
-       unsigned long watermark;
        struct zoneref *z;
        struct zone *zone;
        int rc = COMPACT_SKIPPED;
        /* Compact each zone in the list */
        for_each_zone_zonelist_nodemask(zone, z, zonelist, high_zoneidx,
                                                                nodemask) {
-               int fragindex;
                int status;
 
-               /*
-                * Watermarks for order-0 must be met for compaction. Note
-                * the 2UL. This is because during migration, copies of
-                * pages need to be allocated and for a short time, the
-                * footprint is higher
-                */
-               watermark = low_wmark_pages(zone) + (2UL << order);
-               if (!zone_watermark_ok(zone, 0, watermark, 0, 0))
-                       continue;
-
-               /*
-                * fragmentation index determines if allocation failures are
-                * due to low memory or external fragmentation
-                *
-                * index of -1 implies allocations might succeed depending
-                *      on watermarks
-                * index towards 0 implies failure is due to lack of memory
-                * index towards 1000 implies failure is due to fragmentation
-                *
-                * Only compact if a failure would be due to fragmentation.
-                */
-               fragindex = fragmentation_index(zone, order);
-               if (fragindex >= 0 && fragindex <= sysctl_extfrag_threshold)
-                       continue;
-
-               if (fragindex == -1 && zone_watermark_ok(zone, order, watermark, 0, 0)) {
-                       rc = COMPACT_PARTIAL;
-                       break;
-               }
-
                status = compact_zone_order(zone, order, gfp_mask);
                rc = max(status, rc);
 
-               if (zone_watermark_ok(zone, order, watermark, 0, 0))
+               /* If a normal allocation would succeed, stop compacting */
+               if (zone_watermark_ok(zone, order, low_wmark_pages(zone), 0, 0))
                        break;
        }
 
 
        if (!trylock_page(page)) {
                if (!force)
                        goto move_newpage;
+
+               /*
+                * It's not safe for direct compaction to call lock_page.
+                * For example, during page readahead pages are added locked
+                * to the LRU. Later, when the IO completes the pages are
+                * marked uptodate and unlocked. However, the queueing
+                * could be merging multiple pages for one bio (e.g.
+                * mpage_readpages). If an allocation happens for the
+                * second or third page, the process can end up locking
+                * the same page twice and deadlocking. Rather than
+                * trying to be clever about what pages can be locked,
+                * avoid the use of lock_page for direct compaction
+                * altogether.
+                */
+               if (current->flags & PF_MEMALLOC)
+                       goto move_newpage;
+
                lock_page(page);
        }
 
 
        int migratetype, unsigned long *did_some_progress)
 {
        struct page *page;
+       struct task_struct *tsk = current;
 
        if (!order || compaction_deferred(preferred_zone))
                return NULL;
 
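+       /*
+        * Flag the task as being in direct compaction so that page
+        * migration can detect it and avoid calling lock_page()
+        */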
+       tsk->flags |= PF_MEMALLOC;
        *did_some_progress = try_to_compact_pages(zonelist, order, gfp_mask,
                                                                nodemask);
+       tsk->flags &= ~PF_MEMALLOC;
        if (*did_some_progress != COMPACT_SKIPPED) {
 
                /* Page migration frees to the PCP lists but we want merging */
                /* Wait for some write requests to complete then retry */
                wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/50);
                goto rebalance;
+       } else {
+               /*
+                * High-order allocations do not necessarily loop after
+                * direct reclaim, and reclaim/compaction depends on compaction
+                * being called after reclaim, so call directly if necessary
+                */
+               page = __alloc_pages_direct_compact(gfp_mask, order,
+                                       zonelist, high_zoneidx,
+                                       nodemask,
+                                       alloc_flags, preferred_zone,
+                                       migratetype, &did_some_progress);
+               if (page)
+                       goto got_pg;
        }
 
 nopage:
 
 #include <linux/topology.h>
 #include <linux/cpu.h>
 #include <linux/cpuset.h>
+#include <linux/compaction.h>
 #include <linux/notifier.h>
 #include <linux/rwsem.h>
 #include <linux/delay.h>
  * LUMPY_MODE_CONTIGRECLAIM: For high-order allocations, take a reference
  *                     page from the LRU and reclaim all pages within a
  *                     naturally aligned range
+ * LUMPY_MODE_COMPACTION: For high-order allocations, reclaim a number of
+ *                     order-0 pages and then compact the zone
  */
 typedef unsigned __bitwise__ lumpy_mode;
 #define LUMPY_MODE_SINGLE              ((__force lumpy_mode)0x01u)
 #define LUMPY_MODE_ASYNC               ((__force lumpy_mode)0x02u)
 #define LUMPY_MODE_SYNC                        ((__force lumpy_mode)0x04u)
 #define LUMPY_MODE_CONTIGRECLAIM       ((__force lumpy_mode)0x08u)
+#define LUMPY_MODE_COMPACTION          ((__force lumpy_mode)0x10u)
 
 struct scan_control {
        /* Incremented by the number of inactive pages that were scanned */
        lumpy_mode syncmode = sync ? LUMPY_MODE_SYNC : LUMPY_MODE_ASYNC;
 
        /*
-        * Some reclaim have alredy been failed. No worth to try synchronous
-        * lumpy reclaim.
+        * Initially assume we are entering either lumpy reclaim or
+        * reclaim/compaction. Depending on the order, we will either set the
+        * sync mode or just reclaim order-0 pages later.
         */
-       if (sync && sc->lumpy_reclaim_mode & LUMPY_MODE_SINGLE)
-               return;
+       if (COMPACTION_BUILD)
+               sc->lumpy_reclaim_mode = LUMPY_MODE_COMPACTION;
+       else
+               sc->lumpy_reclaim_mode = LUMPY_MODE_CONTIGRECLAIM;
 
        /*
-        * If we need a large contiguous chunk of memory, or have
-        * trouble getting a small set of contiguous pages, we
-        * will reclaim both active and inactive pages.
+        * Avoid using lumpy reclaim or reclaim/compaction if possible by
+        * restricting it to costly allocations or to cases where reclaim
+        * is under memory pressure
         */
-       sc->lumpy_reclaim_mode = LUMPY_MODE_CONTIGRECLAIM;
        if (sc->order > PAGE_ALLOC_COSTLY_ORDER)
                sc->lumpy_reclaim_mode |= syncmode;
        else if (sc->order && priority < DEF_PRIORITY - 2)
        if (scanning_global_lru(sc)) {
                nr_taken = isolate_pages_global(nr_to_scan,
                        &page_list, &nr_scanned, sc->order,
-                       sc->lumpy_reclaim_mode & LUMPY_MODE_SINGLE ?
-                                       ISOLATE_INACTIVE : ISOLATE_BOTH,
+                       sc->lumpy_reclaim_mode & LUMPY_MODE_CONTIGRECLAIM ?
+                                       ISOLATE_BOTH : ISOLATE_INACTIVE,
                        zone, 0, file);
                zone->pages_scanned += nr_scanned;
                if (current_is_kswapd())
        } else {
                nr_taken = mem_cgroup_isolate_pages(nr_to_scan,
                        &page_list, &nr_scanned, sc->order,
-                       sc->lumpy_reclaim_mode & LUMPY_MODE_SINGLE ?
-                                       ISOLATE_INACTIVE : ISOLATE_BOTH,
+                       sc->lumpy_reclaim_mode & LUMPY_MODE_CONTIGRECLAIM ?
+                                       ISOLATE_BOTH : ISOLATE_INACTIVE,
                        zone, sc->mem_cgroup,
                        0, file);
                /*
        }
 }
 
+/*
+ * Reclaim/compaction depends on a number of pages being freed. To avoid
+ * disruption to the system, a small number of order-0 pages continue to be
+ * rotated and reclaimed in the normal fashion. However, by the time we get
+ * back to the allocator and call try_to_compact_pages(), we ensure that
+ * there are enough free pages for compaction to be likely to succeed
+ */
+static inline bool should_continue_reclaim(struct zone *zone,
+                                       unsigned long nr_reclaimed,
+                                       unsigned long nr_scanned,
+                                       struct scan_control *sc)
+{
+       unsigned long pages_for_compaction;
+       unsigned long inactive_lru_pages;
+
+       /* If not in reclaim/compaction mode, stop */
+       if (!(sc->lumpy_reclaim_mode & LUMPY_MODE_COMPACTION))
+               return false;
+
+       /*
+        * If we failed to reclaim and have scanned the full list, stop.
+        * NOTE: Checking just nr_reclaimed would exit reclaim/compaction far
+        *       faster but would be less likely to satisfy the allocation.
+        *       If this is desirable, use __GFP_REPEAT to decide whether
+        *       both reclaimed and scanned should be checked or just
+        *       reclaimed
+        */
+       if (!nr_reclaimed && !nr_scanned)
+               return false;
+
+       /*
+        * If we have not reclaimed enough pages for compaction and the
+        * inactive lists are large enough, continue reclaiming
+        */
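+       /* 2UL << order matches the extra pages compaction_suitable() requires */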
+       pages_for_compaction = (2UL << sc->order);
+       inactive_lru_pages = zone_nr_lru_pages(zone, sc, LRU_INACTIVE_ANON) +
+                               zone_nr_lru_pages(zone, sc, LRU_INACTIVE_FILE);
+       if (sc->nr_reclaimed < pages_for_compaction &&
+                       inactive_lru_pages > pages_for_compaction)
+               return true;
+
+       /* If compaction would go ahead or the allocation would succeed, stop */
+       switch (compaction_suitable(zone, sc->order)) {
+       case COMPACT_PARTIAL:
+       case COMPACT_CONTINUE:
+               return false;
+       default:
+               return true;
+       }
+}
+
 /*
  * This is a basic per-zone page freer.  Used by both kswapd and direct reclaim.
  */
        unsigned long nr[NR_LRU_LISTS];
        unsigned long nr_to_scan;
        enum lru_list l;
-       unsigned long nr_reclaimed = sc->nr_reclaimed;
+       unsigned long nr_reclaimed;
        unsigned long nr_to_reclaim = sc->nr_to_reclaim;
+       unsigned long nr_scanned = sc->nr_scanned;
 
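+       /*
+        * Reclaim in rounds; should_continue_reclaim() decides whether to
+        * restart and free more pages for compaction
+        */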
+restart:
+       nr_reclaimed = 0;
        get_scan_count(zone, sc, nr, priority);
 
        while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] ||
                if (nr_reclaimed >= nr_to_reclaim && priority < DEF_PRIORITY)
                        break;
        }
-
-       sc->nr_reclaimed = nr_reclaimed;
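+       /* Accumulate progress across restarts of the reclaim loop */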
+       sc->nr_reclaimed += nr_reclaimed;
 
        /*
         * Even if we did not try to evict anon pages at all, we want to
        if (inactive_anon_is_low(zone, sc))
                shrink_active_list(SWAP_CLUSTER_MAX, zone, sc, priority, 0);
 
+       /* reclaim/compaction might need reclaim to continue */
+       if (should_continue_reclaim(zone, nr_reclaimed,
+                                       sc->nr_scanned - nr_scanned, sc))
+               goto restart;
+
        throttle_vm_writeout(sc->gfp_mask);
 }
 
                            total_scanned > sc.nr_reclaimed + sc.nr_reclaimed / 2)
                                sc.may_writepage = 1;
 
+                       /*
+                        * Compact the zone for higher orders to reduce
+                        * latencies for higher-order allocations that
+                        * would ordinarily call try_to_compact_pages()
+                        */
+                       if (sc.order > PAGE_ALLOC_COSTLY_ORDER)
+                               compact_zone_order(zone, sc.order, sc.gfp_mask);
+
                        if (!zone_watermark_ok_safe(zone, order,
                                        high_wmark_pages(zone), end_zone, 0)) {
                                all_zones_ok = 0;