place the hot pages in the fast memory.  This is implemented based on
 unmapping and page fault too.
 
+numa_balancing_promote_rate_limit_MBps
+======================================
+
+Too high promotion/demotion throughput between different memory types
+may hurt application latency.  This knob can be used to rate limit the
+promotion throughput.  The maximum promotion throughput of each node,
+in MB/s, will be limited to no more than the set value.
+
+A rule of thumb is to set this to less than 1/10 of the PMEM node
+write bandwidth.
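+
+For example, if the measured write bandwidth of a PMEM node is roughly
+2000 MB/s, a value below 200 follows this rule of thumb.  (The 2000
+MB/s figure is purely illustrative; measure the write bandwidth of the
+actual node.)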
+
 oops_all_cpu_backtrace
 ======================
 
 
 #endif
 #ifdef CONFIG_NUMA_BALANCING
        PGPROMOTE_SUCCESS,      /* promote successfully */
+       PGPROMOTE_CANDIDATE,    /* candidate pages to promote */
 #endif
        NR_VM_NODE_STAT_ITEMS
 };
        struct deferred_split deferred_split_queue;
 #endif
 
+#ifdef CONFIG_NUMA_BALANCING
+       /* start time in ms of current promote rate limit period */
+       unsigned int nbp_rl_start;
+       /* number of promote candidate pages at start time of current rate limit period */
+       unsigned long nbp_rl_nr_cand;
+#endif
        /* Fields commonly accessed by the page reclaim scanner */
 
        /*
 
 /* The page with hint page fault latency < threshold in ms is considered hot */
 unsigned int sysctl_numa_balancing_hot_threshold = MSEC_PER_SEC;
 
+/* Restrict the NUMA promotion throughput (MB/s) for each target node. */
+unsigned int sysctl_numa_balancing_promote_rate_limit = 65536;
+
 struct numa_group {
        refcount_t refcount;
 
        return (time - last_time) & PAGE_ACCESS_TIME_MASK;
 }
 
+/*
+ * For memory tiering mode, too high promotion/demotion throughput may
+ * hurt application latency.  So we provide a mechanism to rate limit
+ * the number of pages that we try to promote.
+ */
+static bool numa_promotion_rate_limit(struct pglist_data *pgdat,
+                                     unsigned long rate_limit, int nr)
+{
+       unsigned long nr_cand;
+       unsigned int now, start;
+
+       now = jiffies_to_msecs(jiffies);
+       mod_node_page_state(pgdat, PGPROMOTE_CANDIDATE, nr);
+       nr_cand = node_page_state(pgdat, PGPROMOTE_CANDIDATE);
+       start = pgdat->nbp_rl_start;
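+       /*
+        * If the current rate limit period is more than one second old,
+        * let a single CPU (the cmpxchg winner) start a new period from
+        * the current candidate count.
+        */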
+       if (now - start > MSEC_PER_SEC &&
+           cmpxchg(&pgdat->nbp_rl_start, start, now) == start)
+               pgdat->nbp_rl_nr_cand = nr_cand;
+       if (nr_cand - pgdat->nbp_rl_nr_cand >= rate_limit)
+               return true;
+       return false;
+}
+
 bool should_numa_migrate_memory(struct task_struct *p, struct page * page,
                                int src_nid, int dst_cpu)
 {
        if (sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING &&
            !node_is_toptier(src_nid)) {
                struct pglist_data *pgdat;
-               unsigned long latency, th;
+               unsigned long rate_limit, latency, th;
 
                pgdat = NODE_DATA(dst_nid);
                if (pgdat_free_space_enough(pgdat))
                if (latency >= th)
                        return false;
 
-               return true;
+               /* Convert the MB/s limit into base pages per rate limit period (1s) */
+               rate_limit = sysctl_numa_balancing_promote_rate_limit <<
+                       (20 - PAGE_SHIFT);
+               return !numa_promotion_rate_limit(pgdat, rate_limit,
+                                                 thp_nr_pages(page));
        }
 
        this_cpupid = cpu_pid_to_cpupid(dst_cpu, current->pid);