Shared faults can lead to lots of unnecessary page migrations,
slowing down the system and causing private faults to hit the
per-pgdat migration ratelimit.
This patch adds the sysctl numa_balancing_migrate_deferred, which
specifies how many shared page migrations to skip unconditionally
after each page migration that is skipped because it is a shared fault.
This reduces the number of page migrations back and forth in
shared fault situations. It also gives a strong preference to
the tasks that are already running where most of the memory is,
and to moving the other tasks nearer to that memory.
Testing this with a much higher scan rate than the default
still seems to result in fewer page migrations than before.
In multi-instance SPECjbb runs on a 4-node system, memory also
seems to be somewhat better consolidated than previously.
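
For example, with the default value of 16, a single skipped shared
fault causes the task's next 16 shared-fault migrations to be skipped
unconditionally as well. The knob sits with the other NUMA balancing
sysctls, i.e. /proc/sys/kernel/numa_balancing_migrate_deferred on a
standard configuration.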
Signed-off-by: Rik van Riel <riel@redhat.com>
Signed-off-by: Mel Gorman <mgorman@suse.de>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Srikar Dronamraju <srikar@linux.vnet.ibm.com>
Signed-off-by: Peter Zijlstra <peterz@infradead.org>
Link: http://lkml.kernel.org/r/1381141781-10992-62-git-send-email-mgorman@suse.de
Signed-off-by: Ingo Molnar <mingo@kernel.org>
 feature is too high then the rate the kernel samples for NUMA hinting
 faults may be controlled by the numa_balancing_scan_period_min_ms,
 numa_balancing_scan_delay_ms, numa_balancing_scan_period_max_ms,
-numa_balancing_scan_size_mb and numa_balancing_settle_count sysctls.
+numa_balancing_scan_size_mb, numa_balancing_settle_count and
+numa_balancing_migrate_deferred sysctls.
 
 ==============================================================
 
 gives the scheduler a chance to place the task on an alternative node if the
 preferred node is overloaded.
 
+numa_balancing_migrate_deferred is how many page migrations get skipped
+unconditionally, after a page migration is skipped because the page is
+shared with other tasks. This reduces page migration overhead, and
+determines how much stronger the "move task near its memory" scheduler
+policy becomes, versus the "move memory near its task" memory management
+policy, for workloads with shared memory.
+
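+For example, to skip 64 migrations after each skipped shared fault
+(the default is 16):
+
+	echo 64 > /proc/sys/kernel/numa_balancing_migrate_deferred
+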
 ==============================================================
 
 osrelease, ostype & version:
 
        int numa_scan_seq;
        unsigned int numa_scan_period;
        unsigned int numa_scan_period_max;
+       int numa_preferred_nid;
+       int numa_migrate_deferred;
        unsigned long numa_migrate_retry;
        u64 node_stamp;                 /* migration stamp  */
        struct callback_head numa_work;
         */
        unsigned long numa_faults_locality[2];
 
-       int numa_preferred_nid;
        unsigned long numa_pages_migrated;
 #endif /* CONFIG_NUMA_BALANCING */
 
 extern pid_t task_numa_group_id(struct task_struct *p);
 extern void set_numabalancing_state(bool enabled);
 extern void task_numa_free(struct task_struct *p);
+
+extern unsigned int sysctl_numa_balancing_migrate_deferred;
 #else
 static inline void task_numa_fault(int last_node, int node, int pages,
                                   int flags)
 
 /* Scan @scan_size MB every @scan_period after an initial @scan_delay in ms */
 unsigned int sysctl_numa_balancing_scan_delay = 1000;
 
+/*
+ * After skipping a page migration on a shared page, skip N more NUMA page
+ * migrations unconditionally. This reduces the number of NUMA migrations
+ * in shared memory workloads, and has the effect of pulling tasks towards
+ * where their memory lives, rather than pulling the memory towards the task.
+ */
+unsigned int sysctl_numa_balancing_migrate_deferred = 16;
+
 static unsigned int task_nr_scan_windows(struct task_struct *p)
 {
        unsigned long rss = 0;
 
                .mode           = 0644,
                .proc_handler   = proc_dointvec,
        },
+       {
+               .procname       = "numa_balancing_migrate_deferred",
+               .data           = &sysctl_numa_balancing_migrate_deferred,
+               .maxlen         = sizeof(unsigned int),
+               .mode           = 0644,
+               .proc_handler   = proc_dointvec,
+       },
 #endif /* CONFIG_NUMA_BALANCING */
 #endif /* CONFIG_SCHED_DEBUG */
        {
 
        kmem_cache_free(sn_cache, n);
 }
 
+#ifdef CONFIG_NUMA_BALANCING
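+/*
+ * Should this shared fault skip page migration? Returns true while the
+ * task still has deferrals left from a previously skipped shared fault.
+ */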
+static bool numa_migrate_deferred(struct task_struct *p, int last_cpupid)
+{
+       /* Never defer a private fault */
+       if (cpupid_match_pid(p, last_cpupid))
+               return false;
+
+       if (p->numa_migrate_deferred) {
+               p->numa_migrate_deferred--;
+               return true;
+       }
+       return false;
+}
+
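+/* Arm the per-task deferral counter after skipping a shared fault. */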
+static inline void defer_numa_migrate(struct task_struct *p)
+{
+       p->numa_migrate_deferred = sysctl_numa_balancing_migrate_deferred;
+}
+#else
+static inline bool numa_migrate_deferred(struct task_struct *p, int last_cpupid)
+{
+       return false;
+}
+
+static inline void defer_numa_migrate(struct task_struct *p)
+{
+}
+#endif /* CONFIG_NUMA_BALANCING */
+
 /**
  * mpol_misplaced - check whether current page node is valid in policy
  *
                 * relation.
                 */
                last_cpupid = page_cpupid_xchg_last(page, this_cpupid);
-               if (!cpupid_pid_unset(last_cpupid) && cpupid_to_nid(last_cpupid) != thisnid)
+               if (!cpupid_pid_unset(last_cpupid) && cpupid_to_nid(last_cpupid) != thisnid) {
+
+                       /* See sysctl_numa_balancing_migrate_deferred comment */
+                       if (!cpupid_match_pid(current, last_cpupid))
+                               defer_numa_migrate(current);
+
+                       goto out;
+               }
+
+               /*
+                * The quadratic filter above reduces extraneous migration
+                * of shared pages somewhat. This code reduces it even more,
+                * cutting down the overhead of migrating shared pages.
+                * This makes workloads with shared pages rely more on
+                * "move task near its memory", and less on "move memory
+                * towards its task", which is exactly what we want.
+                */
+               if (numa_migrate_deferred(current, last_cpupid))
                        goto out;
        }