vmscan: Support multiple kswapd threads per node

author Buddy Lumpkin <buddy.lumpkin@oracle.com>

Thu, 15 Mar 2018 06:57:13 +0000 (06:57 +0000)

committer Brian Maly <brian.maly@oracle.com>

Tue, 8 May 2018 19:44:16 +0000 (15:44 -0400)
author Buddy Lumpkin <buddy.lumpkin@oracle.com>
Thu, 15 Mar 2018 06:57:13 +0000 (06:57 +0000)
committer Brian Maly <brian.maly@oracle.com>
Tue, 8 May 2018 19:44:16 +0000 (15:44 -0400)
diff --git a/Documentation/sysctl/vm.txt b/Documentation/sysctl/vm.txt

index 373ccff095323286912f456025703aed444d90f7..e4e4b0fc1821adbd3891ddc1252474d80c76cc09 100644 (file)
--- a/Documentation/sysctl/vm.txt
+++ b/Documentation/sysctl/vm.txt
@@ -32,6 +32,7 @@ Currently, these files are in /proc/sys/vm:
  - extfrag_threshold
  - hugepages_treat_as_movable
  - hugetlb_shm_group
+- kswapd_threads
  - laptop_mode
  - legacy_va_layout
  - lowmem_reserve_ratio
@@ -268,6 +269,23 @@ shared memory segment using hugetlb page.
  
  ==============================================================
  
+kswapd_threads
+
+kswapd_threads allows you to control the number of kswapd threads per node
+running on the system. This provides the ability to devote more CPU in
+exchange for more aggressive page replacement. More aggressive page
+replacement can reduce direct reclaims which cause latency for tasks
+and decrease throughput when doing filesystem IO through the pagecache.
+Direct reclaims are recorded using the allocstall counter in /proc/vmstat.
+
+The default value is 1 and the range of acceptable values is 1-16.
+Always start with lower values in the 3-6 range. Higher values should
+be justified with testing. If direct reclaims still occur in spite of
+high values, each direct reclaim can cost more (in latency) because of
+increased lock contention.
+
+==============================================================
+
  laptop_mode
  
  laptop_mode is a knob that controls "laptop mode". All the things that are
diff --git a/include/linux/mm.h b/include/linux/mm.h

index 2f115b63ef4e610a5921d44decd94c13afb36581..3d9952e28cd4191dc94eadedc30bed9c318760d7 100644 (file)
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1798,6 +1798,7 @@ extern int __meminit __early_pfn_to_nid(unsigned long pfn);
  extern void set_dma_reserve(unsigned long new_dma_reserve);
  extern void memmap_init_zone(unsigned long, int, unsigned long,
                                 unsigned long, enum memmap_context);
+void update_kswapd_threads(void);
  extern void setup_per_zone_wmarks(void);
  extern int __meminit init_per_zone_wmark_min(void);
  extern void mem_init(void);
@@ -1815,6 +1816,7 @@ extern void zone_pcp_update(struct zone *zone);
  extern void zone_pcp_reset(struct zone *zone);
  
  /* page_alloc.c */
+extern int kswapd_threads;
  extern int min_free_kbytes;
  extern int watermark_scale_factor;
  
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h

index 8871198822969b5d5885d9baf1a5ed108108d23a..e1be9d39a393ff333eb4337c02c128c44601274d 100644 (file)
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -34,6 +34,7 @@
   * will not.
   */
  #define PAGE_ALLOC_COSTLY_ORDER 3
+#define MAX_KSWAPD_THREADS 16
  
  enum {
         MIGRATE_UNMOVABLE,
@@ -909,6 +910,8 @@ static inline int is_highmem(struct zone *zone)
  
  /* These two functions are used to setup the per zone pages min values */
  struct ctl_table;
+int kswapd_threads_sysctl_handler(struct ctl_table *, int,
+                                 void __user *, size_t *, loff_t *);
  int min_free_kbytes_sysctl_handler(struct ctl_table *, int,
                                         void __user *, size_t *, loff_t *);
  int watermark_scale_factor_sysctl_handler(struct ctl_table *, int,
diff --git a/kernel/sysctl.c b/kernel/sysctl.c

index 0603cc4a3bbad9b7b1cc93f572cf89d4ba2aa6e8..1ef88116dc3d73c19adfe8501a907d5591d6dd6e 100644 (file)
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -134,6 +134,7 @@ static int one_thousand = 1000;
  #ifdef CONFIG_PRINTK
  static int ten_thousand = 10000;
  #endif
+static int max_kswapd_threads = MAX_KSWAPD_THREADS;
  
  /* this is needed for the proc_doulongvec_minmax of vm_dirty_bytes */
  static unsigned long dirty_bytes_min = 2 * PAGE_SIZE;
@@ -1395,6 +1396,15 @@ static struct ctl_table vm_table[] = {
                 .proc_handler   = min_free_kbytes_sysctl_handler,
                 .extra1         = &zero,
         },
+       {
+               .procname       = "kswapd_threads",
+               .data           = &kswapd_threads,
+               .maxlen         = sizeof(kswapd_threads),
+               .mode           = 0644,
+               .proc_handler   = kswapd_threads_sysctl_handler,
+               .extra1         = &one,
+               .extra2         = &max_kswapd_threads,
+       },
         {
                 .procname       = "watermark_scale_factor",
                 .data           = &watermark_scale_factor,
diff --git a/mm/page_alloc.c b/mm/page_alloc.c

index fc65fb646fee0f7912672d7c36d8c21891117f21..79943256004b0e7ec81e05ad39f466c4364ea8a6 100644 (file)
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -5952,6 +5952,21 @@ int min_free_kbytes_sysctl_handler(struct ctl_table *table, int write,
         return 0;
  }
  
+int kswapd_threads_sysctl_handler(struct ctl_table *table, int write,
+                                 void __user *buffer, size_t *length,
+                                 loff_t *ppos)
+{
+       int rc;
+
+       rc = proc_dointvec_minmax(table, write, buffer, length, ppos);
+       if (rc)
+               return rc;
+
+       if (write)
+               update_kswapd_threads();
+       return 0;
+}
+
  int watermark_scale_factor_sysctl_handler(struct ctl_table *table, int write,
         void __user *buffer, size_t *length, loff_t *ppos)
  {
diff --git a/mm/vmscan.c b/mm/vmscan.c

index b95fca3630f18fea07e8cb21de53e7789eea5e67..da2f51859e5e56f6c270f6af326ed04ed2212ca1 100644 (file)
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -108,6 +108,14 @@ struct scan_control {
  
  #define lru_to_page(_head) (list_entry((_head)->prev, struct page, lru))
  
+/*
+ * Requested and currently running kswapd thread counts, and their tasks
+ */
+#define DEF_KSWAPD_THREADS_PER_NODE 1
+int kswapd_threads = DEF_KSWAPD_THREADS_PER_NODE;
+static int kswapd_threads_current = DEF_KSWAPD_THREADS_PER_NODE;
+static struct task_struct *kswapd_list[MAX_NUMNODES][MAX_KSWAPD_THREADS];
+
  #ifdef ARCH_HAS_PREFETCH
  #define prefetch_prev_lru_page(_page, _base, _field)                   \
         do {                                                            \
@@ -3515,23 +3523,96 @@ unsigned long shrink_all_memory(unsigned long nr_to_reclaim)
  static int cpu_callback(struct notifier_block *nfb, unsigned long action,
                         void *hcpu)
  {
-       int nid;
+       int nid, hid;
+       int nr_threads = kswapd_threads_current;
  
-       if (action == CPU_ONLINE || action == CPU_ONLINE_FROZEN) {
-               for_each_node_state(nid, N_MEMORY) {
-                       pg_data_t *pgdat = NODE_DATA(nid);
-                       const struct cpumask *mask;
+       if (action != CPU_ONLINE && action != CPU_ONLINE_FROZEN)
+               return NOTIFY_OK;
  
-                       mask = cpumask_of_node(pgdat->node_id);
+       for_each_node_state(nid, N_MEMORY) {
+               pg_data_t *pgdat = NODE_DATA(nid);
+               const struct cpumask *mask = cpumask_of_node(pgdat->node_id);
+               struct task_struct *task;
  
-                       if (cpumask_any_and(cpu_online_mask, mask) < nr_cpu_ids)
-                               /* One of our CPUs online: restore mask */
-                               set_cpus_allowed_ptr(pgdat->kswapd, mask);
+               if (cpumask_any_and(cpu_online_mask, mask) < nr_cpu_ids) {
+                       for (hid = 0; hid < nr_threads; hid++) {
+                               struct task_struct *t = kswapd_list[nid][hid];
+                               /* CPU of ours online: restore mask */
+                               if (t)
+                                       set_cpus_allowed_ptr(t, mask);
+                       }
                 }
         }
         return NOTIFY_OK;
  }
  
+static void update_kswapd_threads_node(int nid)
+{
+       pg_data_t *pgdat;
+       int drop, increase;
+       int last_idx, start_idx, hid;
+       int nr_threads = kswapd_threads_current;
+
+       pgdat = NODE_DATA(nid);
+       last_idx = nr_threads - 1;
+       if (kswapd_threads < nr_threads) {
+               drop = nr_threads - kswapd_threads;
+               for (hid = last_idx; hid > (last_idx - drop); hid--) {
+                       if (kswapd_list[nid][hid]) {
+                               kthread_stop(kswapd_list[nid][hid]);
+                               kswapd_list[nid][hid] = NULL;
+                       }
+               }
+       } else {
+               increase = kswapd_threads - nr_threads;
+               start_idx = last_idx + 1;
+               for (hid = start_idx; hid < (start_idx + increase); hid++) {
+                       kswapd_list[nid][hid] = kthread_run(kswapd, pgdat,
+                                               "kswapd%d:%d", nid, hid);
+                       if (IS_ERR(kswapd_list[nid][hid])) {
+                               pr_err("Failed to start kswapd%d on node %d\n",
+                                      hid, nid);
+                               kswapd_list[nid][hid] = NULL;
+                               /*
+                                * We are out of resources. Do not start any
+                                * more threads.
+                                */
+                               break;
+                       }
+               }
+       }
+}
+
+/*
+ * When kswapd_threads is updated from userspace, this function is called
+ * to start required new kswapd threads or kill off extra kswapd threads.
+ */
+void update_kswapd_threads(void)
+{
+       int nid;
+
+       if (kswapd_threads_current == kswapd_threads)
+               return;
+
+       /*
+        * This function updates the number of currently running kswapd
+        * threads and kills off or starts up kswapd to match what has
+        * been requested from userspace. Memory hotplug functions also
+        * start up and kill off kswapd threads and use
+        * kswapd_threads_current to determine the number of threads to
+        * start or kill. To avoid race condition between the two, take
+        * memory hotplug lock.
+        */
+       mem_hotplug_begin();
+       for_each_node_state(nid, N_MEMORY)
+               update_kswapd_threads_node(nid);
+
+       pr_info("kswapd_thread count changed, old:%d new:%d\n",
+               kswapd_threads_current, kswapd_threads);
+       kswapd_threads_current = kswapd_threads;
+       mem_hotplug_done();
+}
+
  /*
   * This kswapd start function will be called by init and node-hot-add.
   * On node-hot-add, kswapd will moved to proper cpus if cpus are hot-added.
@@ -3540,18 +3621,27 @@ int kswapd_run(int nid)
  {
         pg_data_t *pgdat = NODE_DATA(nid);
         int ret = 0;
+       int hid, nr_threads;
  
-       if (pgdat->kswapd)
+       if (kswapd_list[nid][0])
                 return 0;
  
-       pgdat->kswapd = kthread_run(kswapd, pgdat, "kswapd%d", nid);
-       if (IS_ERR(pgdat->kswapd)) {
-               /* failure at boot is fatal */
-               BUG_ON(system_state == SYSTEM_BOOTING);
-               pr_err("Failed to start kswapd on node %d\n", nid);
-               ret = PTR_ERR(pgdat->kswapd);
-               pgdat->kswapd = NULL;
+       nr_threads = kswapd_threads;
+       for (hid = 0; hid < nr_threads; hid++) {
+               kswapd_list[nid][hid] = kthread_run(kswapd, pgdat,
+                                                   "kswapd%d:%d",
+                                                   nid, hid);
+               if (IS_ERR(kswapd_list[nid][hid])) {
+                       /* failure at boot is fatal */
+                       BUG_ON(system_state == SYSTEM_BOOTING);
+                       pr_err("Failed to start kswapd%d on node %d\n",
+                              hid, nid);
+                       ret = PTR_ERR(kswapd_list[nid][hid]);
+                       kswapd_list[nid][hid] = NULL;
+                       break;
+               }
         }
+       kswapd_threads_current = nr_threads;
         return ret;
  }
  
@@ -3561,11 +3651,14 @@ int kswapd_run(int nid)
   */
  void kswapd_stop(int nid)
  {
-       struct task_struct *kswapd = NODE_DATA(nid)->kswapd;
+       int hid;
+       int nr_threads = kswapd_threads_current;
  
-       if (kswapd) {
-               kthread_stop(kswapd);
-               NODE_DATA(nid)->kswapd = NULL;
+       for (hid = 0; hid < nr_threads; hid++) {
+               if (kswapd_list[nid][hid]) {
+                       kthread_stop(kswapd_list[nid][hid]);
+                       kswapd_list[nid][hid] = NULL;
+               }
         }
  }
author	Buddy Lumpkin <buddy.lumpkin@oracle.com>
	Thu, 15 Mar 2018 06:57:13 +0000 (06:57 +0000)
committer	Brian Maly <brian.maly@oracle.com>
	Tue, 8 May 2018 19:44:16 +0000 (15:44 -0400)
Documentation/sysctl/vm.txt		patch \| blob \| history
include/linux/mm.h		patch \| blob \| history
include/linux/mmzone.h		patch \| blob \| history
kernel/sysctl.c		patch \| blob \| history
mm/page_alloc.c		patch \| blob \| history
mm/vmscan.c		patch \| blob \| history