/* zone watermarks, access with *_wmark_pages(zone) macros */
        unsigned long watermark[NR_WMARK];
 
+       /*
+        * When free pages are below this point, additional steps are taken
+        * when reading the number of free pages to avoid per-cpu counter
+        * drift allowing watermarks to be breached
+        */
+       unsigned long percpu_drift_mark;
+
        /*
         * We don't know if the memory that we're going to allocate will be freeable
         * or/and it will be released eventually, so to avoid totally wasting several
        return test_bit(ZONE_OOM_LOCKED, &zone->flags);
 }
 
+#ifdef CONFIG_SMP
+unsigned long zone_nr_free_pages(struct zone *zone);
+#else
+#define zone_nr_free_pages(zone) zone_page_state(zone, NR_FREE_PAGES)
+#endif /* CONFIG_SMP */
+
 /*
  * The "priority" of VM scanning is how much of the queues we will scan in one
  * go. A value of 12 for DEF_PRIORITY implies that we will scan 1/4096th of the
 
        return x;
 }
 
+/*
+ * More accurate version that also considers the currently pending
+ * deltas. For that we need to loop over all cpus to find the current
+ * deltas. There is no synchronization so the result cannot be
+ * exactly accurate either.
+ */
+static inline unsigned long zone_page_state_snapshot(struct zone *zone,
+                                       enum zone_stat_item item)
+{
+       long x = atomic_long_read(&zone->vm_stat[item]);
+
+#ifdef CONFIG_SMP
+       int cpu;
+       for_each_online_cpu(cpu)
+               x += per_cpu_ptr(zone->pageset, cpu)->vm_stat_diff[item];
+
+       if (x < 0)
+               x = 0;
+#endif
+       return x;
+}
+
 extern unsigned long global_reclaimable_pages(void);
 extern unsigned long zone_reclaimable_pages(struct zone *zone);
 
 
        return 1;
 }
 #endif /* CONFIG_ARCH_HAS_HOLES_MEMORYMODEL */
+
+#ifdef CONFIG_SMP
+/* Called when a more accurate view of NR_FREE_PAGES is needed */
+unsigned long zone_nr_free_pages(struct zone *zone)
+{
+       unsigned long nr_free_pages = zone_page_state(zone, NR_FREE_PAGES);
+
+       /*
+        * While kswapd is awake, it is considered the zone is under some
+        * memory pressure. Under pressure, there is a risk that
+        * per-cpu-counter-drift will allow the min watermark to be breached
+        * potentially causing a live-lock. While kswapd is awake and
+        * free pages are low, get a better estimate for free pages
+        */
+       if (nr_free_pages < zone->percpu_drift_mark &&
+                       !waitqueue_active(&zone->zone_pgdat->kswapd_wait))
+               return zone_page_state_snapshot(zone, NR_FREE_PAGES);
+
+       return nr_free_pages;
+}
+#endif /* CONFIG_SMP */
 
 {
        /* free_pages my go negative - that's OK */
        long min = mark;
-       long free_pages = zone_page_state(z, NR_FREE_PAGES) - (1 << order) + 1;
+       long free_pages = zone_nr_free_pages(z) - (1 << order) + 1;
        int o;
 
        if (alloc_flags & ALLOC_HIGH)
                        " all_unreclaimable? %s"
                        "\n",
                        zone->name,
-                       K(zone_page_state(zone, NR_FREE_PAGES)),
+                       K(zone_nr_free_pages(zone)),
                        K(min_wmark_pages(zone)),
                        K(low_wmark_pages(zone)),
                        K(high_wmark_pages(zone)),
 
        int threshold;
 
        for_each_populated_zone(zone) {
+               unsigned long max_drift, tolerate_drift;
+
                threshold = calculate_threshold(zone);
 
                for_each_online_cpu(cpu)
                        per_cpu_ptr(zone->pageset, cpu)->stat_threshold
                                                        = threshold;
+
+               /*
+                * Only set percpu_drift_mark if there is a danger that
+                * NR_FREE_PAGES reports the low watermark is ok when in fact
+                * the min watermark could be breached by an allocation
+                */
+               tolerate_drift = low_wmark_pages(zone) - min_wmark_pages(zone);
+               max_drift = num_online_cpus() * threshold;
+               if (max_drift > tolerate_drift)
+                       zone->percpu_drift_mark = high_wmark_pages(zone) +
+                                       max_drift;
        }
 }
 
                   "\n        scanned  %lu"
                   "\n        spanned  %lu"
                   "\n        present  %lu",
-                  zone_page_state(zone, NR_FREE_PAGES),
+                  zone_nr_free_pages(zone),
                   min_wmark_pages(zone),
                   low_wmark_pages(zone),
                   high_wmark_pages(zone),