config NUMA
        bool "Non Uniform Memory Access (NUMA) Support"
        depends on MMU && SYS_SUPPORTS_NUMA && EXPERIMENTAL
+       select ARCH_WANT_NUMA_VARIABLE_LOCALITY
        default n
        help
          Some SH systems have various memories scattered around
 
        def_bool y
        select HAVE_AOUT if X86_32
        select HAVE_UNSTABLE_SCHED_CLOCK
+       select ARCH_SUPPORTS_NUMA_BALANCING
+       select ARCH_WANTS_PROT_NUMA_PROT_NONE
        select HAVE_IDE
        select HAVE_OPROFILE
        select HAVE_PCSPKR_PLATFORM
 
 #endif
 #ifdef CONFIG_CPUMASK_OFFSTACK
        struct cpumask cpumask_allocation;
+#endif
+#ifdef CONFIG_NUMA_BALANCING
+       /*
+        * numa_next_scan is the next time (in jiffies) when the PTEs will
+        * be marked pte_numa to gather statistics and migrate pages to
+        * new nodes if necessary.
+        */
+       unsigned long numa_next_scan;
+
+       /* numa_scan_seq prevents two threads from setting pte_numa */
+       int numa_scan_seq;
 #endif
        struct uprobes_state uprobes_state;
 };
 
        short il_next;
        short pref_node_fork;
 #endif
+#ifdef CONFIG_NUMA_BALANCING
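+       /*
+        * Per-task NUMA scan state: numa_scan_seq tracks mm->numa_scan_seq,
+        * numa_scan_period is the sampling interval in ms and node_stamp
+        * is the task runtime (in ns) at the last scan trigger.
+        */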
+       int numa_scan_seq;
+       int numa_migrate_seq;
+       unsigned int numa_scan_period;
+       u64 node_stamp;                 /* migration stamp  */
+       struct callback_head numa_work;
+#endif /* CONFIG_NUMA_BALANCING */
+
        struct rcu_head rcu;
 
        /*
 /* Future-safe accessor for struct task_struct's cpus_allowed. */
 #define tsk_cpus_allowed(tsk) (&(tsk)->cpus_allowed)
 
+#ifdef CONFIG_NUMA_BALANCING
+extern void task_numa_fault(int node, int pages);
+#else
+static inline void task_numa_fault(int node, int pages)
+{
+}
+#endif
+
 /*
  * Priority of a process goes from 0..MAX_PRIO-1, valid RT
  * priority is 0..MAX_RT_PRIO-1, and SCHED_NORMAL/SCHED_BATCH
 };
 extern enum sched_tunable_scaling sysctl_sched_tunable_scaling;
 
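+/* sysctl tunables for NUMA balancing; the scan periods are in ms */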
+extern unsigned int sysctl_numa_balancing_scan_period_min;
+extern unsigned int sysctl_numa_balancing_scan_period_max;
+extern unsigned int sysctl_numa_balancing_settle_count;
+
 #ifdef CONFIG_SCHED_DEBUG
 extern unsigned int sysctl_sched_migration_cost;
 extern unsigned int sysctl_sched_nr_migrate;
 
 #ifdef CONFIG_PREEMPT_NOTIFIERS
        INIT_HLIST_HEAD(&p->preempt_notifiers);
 #endif
+
+#ifdef CONFIG_NUMA_BALANCING
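+       /*
+        * A fresh mm (single user) starts its scan clock now; threads
+        * sharing an existing mm inherit its current scan state.
+        */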
+       if (p->mm && atomic_read(&p->mm->mm_users) == 1) {
+               p->mm->numa_next_scan = jiffies;
+               p->mm->numa_scan_seq = 0;
+       }
+
+       p->node_stamp = 0ULL;
+       p->numa_scan_seq = p->mm ? p->mm->numa_scan_seq : 0;
+       p->numa_migrate_seq = p->mm ? p->mm->numa_scan_seq - 1 : 0;
+       p->numa_scan_period = sysctl_numa_balancing_scan_period_min;
+       p->numa_work.next = &p->numa_work;      /* not queued; see task_tick_numa() */
+#endif /* CONFIG_NUMA_BALANCING */
 }
 
 /*
 
 #include <linux/slab.h>
 #include <linux/profile.h>
 #include <linux/interrupt.h>
+#include <linux/mempolicy.h>
+#include <linux/task_work.h>
 
 #include <trace/events/sched.h>
 
  * Scheduling class queueing methods:
  */
 
+#ifdef CONFIG_NUMA_BALANCING
+/*
+ * NUMA task sample period in ms: 5s min, 80s max by default.
+ */
+unsigned int sysctl_numa_balancing_scan_period_min = 5000;
+unsigned int sysctl_numa_balancing_scan_period_max = 5000*16;
+
+static void task_numa_placement(struct task_struct *p)
+{
+       int seq = ACCESS_ONCE(p->mm->numa_scan_seq);
+
+       if (p->numa_scan_seq == seq)
+               return;
+       p->numa_scan_seq = seq;
+
+       /* FIXME: Scheduling placement policy hints go here */
+}
+
+/*
+ * Got a PROT_NONE fault for a page on @node.
+ */
+void task_numa_fault(int node, int pages)
+{
+       struct task_struct *p = current;
+
+       /* FIXME: Allocate task-specific structure for placement policy here */
+
+       task_numa_placement(p);
+}
+
+/*
+ * The expensive part of numa migration is done from task_work context.
+ * Triggered from task_tick_numa().
+ */
+void task_numa_work(struct callback_head *work)
+{
+       unsigned long migrate, next_scan, now = jiffies;
+       struct task_struct *p = current;
+       struct mm_struct *mm = p->mm;
+
+       WARN_ON_ONCE(p != container_of(work, struct task_struct, numa_work));
+
+       work->next = work; /* protect against double add */
+       /*
+        * Who cares about NUMA placement when they're dying.
+        *
+        * NOTE: make sure not to dereference p->mm before this check,
+        * exit_task_work() happens _after_ exit_mm() so we could be called
+        * without p->mm even though we still had it when we enqueued this
+        * work.
+        */
+       if (p->flags & PF_EXITING)
+               return;
+
+       /*
+        * Enforce the maximal scan/migration frequency.
+        */
+       migrate = mm->numa_next_scan;
+       if (time_before(now, migrate))
+               return;
+
+       if (p->numa_scan_period == 0)
+               p->numa_scan_period = sysctl_numa_balancing_scan_period_min;
+
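+       /*
+        * Claim the scan window: only the thread whose cmpxchg() moves
+        * numa_next_scan forward goes on to mark the PTEs.
+        */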
+       next_scan = now + 2*msecs_to_jiffies(p->numa_scan_period);
+       if (cmpxchg(&mm->numa_next_scan, migrate, next_scan) != migrate)
+               return;
+
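+       /*
+        * ACCESS_ONCE() forces a real load/store of the shared counter;
+        * the increment itself is not atomic, which is tolerable for a
+        * counter that only sequences scan passes.
+        */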
+       ACCESS_ONCE(mm->numa_scan_seq)++;
+       {
+               struct vm_area_struct *vma;
+
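+               /*
+                * Mark the PTEs of every migratable VMA pte_numa so the
+                * next access faults into the NUMA handlers, which feed
+                * task_numa_fault().
+                */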
+               down_read(&mm->mmap_sem);
+               for (vma = mm->mmap; vma; vma = vma->vm_next) {
+                       if (!vma_migratable(vma))
+                               continue;
+                       change_prot_numa(vma, vma->vm_start, vma->vm_end);
+               }
+               up_read(&mm->mmap_sem);
+       }
+}
+
+/*
+ * Drive the periodic memory faults.
+ */
+void task_tick_numa(struct rq *rq, struct task_struct *curr)
+{
+       struct callback_head *work = &curr->numa_work;
+       u64 period, now;
+
+       /*
+        * We don't care about NUMA placement if we don't have memory.
+        */
+       if (!curr->mm || (curr->flags & PF_EXITING) || work->next != work)
+               return;
+
+       /*
+        * Using runtime rather than walltime has the dual advantage that
+        * we (mostly) drive the selection from busy threads and that the
+        * task needs to have done some actual work before we bother with
+        * NUMA placement.
+        */
+       now = curr->se.sum_exec_runtime;
+       period = (u64)curr->numa_scan_period * NSEC_PER_MSEC;
+
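+       /*
+        * Fire the deferred scan once the task has accumulated a full
+        * scan period's worth of CPU time since the previous trigger.
+        */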
+       if (now - curr->node_stamp > period) {
+               curr->node_stamp = now;
+
+               if (!time_before(jiffies, curr->mm->numa_next_scan)) {
+                       init_task_work(work, task_numa_work); /* TODO: move this into sched_fork() */
+                       task_work_add(curr, work, true);
+               }
+       }
+}
+#else
+static void task_tick_numa(struct rq *rq, struct task_struct *curr)
+{
+}
+#endif /* CONFIG_NUMA_BALANCING */
+
 static void
 account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
 {
                cfs_rq = cfs_rq_of(se);
                entity_tick(cfs_rq, se, queued);
        }
+
+       if (sched_feat_numa(NUMA))
+               task_tick_numa(rq, curr);
 }
 
 /*
 
 SCHED_FEAT(FORCE_SD_OVERLAP, false)
 SCHED_FEAT(RT_RUNTIME_SHARE, true)
 SCHED_FEAT(LB_MIN, false)
+
+/*
+ * Apply the automatic NUMA scheduling policy
+ */
+#ifdef CONFIG_NUMA_BALANCING
+SCHED_FEAT(NUMA,       true)
+#endif
 
 #define sched_feat(x) (sysctl_sched_features & (1UL << __SCHED_FEAT_##x))
 #endif /* SCHED_DEBUG && HAVE_JUMP_LABEL */
 
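+/*
+ * sched_feat_numa() tests the NUMA feature bit only when NUMA balancing
+ * is built in; otherwise it is constant 0 and the call sites compile out.
+ */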
+#ifdef CONFIG_NUMA_BALANCING
+#define sched_feat_numa(x) sched_feat(x)
+#else
+#define sched_feat_numa(x) (0)
+#endif
+
 static inline u64 global_rt_period(void)
 {
        return (u64)sysctl_sched_rt_period * NSEC_PER_USEC;
 
 static int max_sched_granularity_ns = NSEC_PER_SEC;    /* 1 second */
 static int min_wakeup_granularity_ns;                  /* 0 usecs */
 static int max_wakeup_granularity_ns = NSEC_PER_SEC;   /* 1 second */
+#ifdef CONFIG_SMP
 static int min_sched_tunable_scaling = SCHED_TUNABLESCALING_NONE;
 static int max_sched_tunable_scaling = SCHED_TUNABLESCALING_END-1;
-#endif
+#endif /* CONFIG_SMP */
+#endif /* CONFIG_SCHED_DEBUG */
 
 #ifdef CONFIG_COMPACTION
 static int min_extfrag_threshold;
                .extra1         = &min_wakeup_granularity_ns,
                .extra2         = &max_wakeup_granularity_ns,
        },
+#ifdef CONFIG_SMP
        {
                .procname       = "sched_tunable_scaling",
                .data           = &sysctl_sched_tunable_scaling,
                .extra1         = &zero,
                .extra2         = &one,
        },
-#endif
+#endif /* CONFIG_SMP */
+#ifdef CONFIG_NUMA_BALANCING
+       {
+               .procname       = "numa_balancing_scan_period_min_ms",
+               .data           = &sysctl_numa_balancing_scan_period_min,
+               .maxlen         = sizeof(unsigned int),
+               .mode           = 0644,
+               .proc_handler   = proc_dointvec,
+       },
+       {
+               .procname       = "numa_balancing_scan_period_max_ms",
+               .data           = &sysctl_numa_balancing_scan_period_max,
+               .maxlen         = sizeof(unsigned int),
+               .mode           = 0644,
+               .proc_handler   = proc_dointvec,
+       },
+#endif /* CONFIG_NUMA_BALANCING */
+#endif /* CONFIG_SCHED_DEBUG */
        {
                .procname       = "sched_rt_period_us",
                .data           = &sysctl_sched_rt_period,
 
         */
        split_huge_page(page);
        put_page(page);
+
        return 0;
 
 clear_pmdnuma:
 
 out_unlock:
        spin_unlock(&mm->page_table_lock);
-       if (page)
+       if (page) {
                put_page(page);
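+               /* account one NUMA fault per base page under the huge pmd */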
+               task_numa_fault(numa_node_id(), HPAGE_PMD_NR);
+       }
        return 0;
 }
 
 
 {
        struct page *page = NULL;
        spinlock_t *ptl;
-       int current_nid, target_nid;
+       int current_nid = -1;
+       int target_nid;
 
        /*
        * The "pte" at this point cannot be used safely without
                current_nid = target_nid;
 
 out:
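+       /*
+        * current_nid may still be -1 here; task_numa_fault() ignores the
+        * node for now (see the FIXME there), so this is harmless.
+        */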
+       task_numa_fault(current_nid, 1);
        return 0;
 }
 
        for (addr = _addr + offset; addr < _addr + PMD_SIZE; pte++, addr += PAGE_SIZE) {
                pte_t pteval = *pte;
                struct page *page;
+               int curr_nid;
                if (!pte_present(pteval))
                        continue;
                if (!pte_numa(pteval))
                page = vm_normal_page(vma, addr, pteval);
                if (unlikely(!page))
                        continue;
+               /* only check non-shared pages */
+               if (unlikely(page_mapcount(page) != 1))
+                       continue;
+               pte_unmap_unlock(pte, ptl);
+
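+               /*
+                * The PTE lock was dropped above and is retaken below, so
+                * the fault is accounted without ptl held.
+                */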
+               curr_nid = page_to_nid(page);
+               task_numa_fault(curr_nid, 1);
+
+               pte = pte_offset_map_lock(mm, pmdp, addr, &ptl);
        }
        pte_unmap_unlock(orig_pte, ptl);