mm: hugetlb: improve parallel huge page allocation time

author Thomas Prescher <thomas.prescher@cyberus-technology.de>

Thu, 27 Feb 2025 22:45:05 +0000 (23:45 +0100)

committer Andrew Morton <akpm@linux-foundation.org>

Tue, 4 Mar 2025 05:50:52 +0000 (21:50 -0800)
author Thomas Prescher <thomas.prescher@cyberus-technology.de>
Thu, 27 Feb 2025 22:45:05 +0000 (23:45 +0100)
committer Andrew Morton <akpm@linux-foundation.org>
Tue, 4 Mar 2025 05:50:52 +0000 (21:50 -0800)
diff --git a/mm/hugetlb.c b/mm/hugetlb.c

index 8c6dbc664353ddf1e80e2c4e82cab0cb59a05e28..482bb91009112f346a9d14e198a8983b72d41a8b 100644 (file)
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -14,9 +14,11 @@
  #include <linux/pagemap.h>
  #include <linux/mempolicy.h>
  #include <linux/compiler.h>
+#include <linux/cpumask.h>
  #include <linux/cpuset.h>
  #include <linux/mutex.h>
  #include <linux/memblock.h>
+#include <linux/minmax.h>
  #include <linux/sysfs.h>
  #include <linux/slab.h>
  #include <linux/sched/mm.h>
@@ -3600,31 +3602,31 @@ static unsigned long __init hugetlb_pages_alloc_boot(struct hstate *h)
                 .numa_aware     = true
         };
  
+       unsigned int num_allocation_threads = max(num_online_cpus() / 4, 1);
+
         job.thread_fn   = hugetlb_pages_alloc_boot_node;
         job.start       = 0;
         job.size        = h->max_huge_pages;
  
         /*
-        * job.max_threads is twice the num_node_state(N_MEMORY),
+        * job.max_threads is 25% of the available cpu threads by default.
          *
-        * Tests below indicate that a multiplier of 2 significantly improves
-        * performance, and although larger values also provide improvements,
-        * the gains are marginal.
+        * On large servers with terabytes of memory, huge page allocation
+        * can consume a considerable amount of time.
          *
-        * Therefore, choosing 2 as the multiplier strikes a good balance between
-        * enhancing parallel processing capabilities and maintaining efficient
-        * resource management.
+        * Tests below show how long it takes to allocate 1 TiB of memory with
+        * 2MiB huge pages. Using more threads can significantly improve allocation time.
          *
-        * +------------+-------+-------+-------+-------+-------+
-        * | multiplier |   1   |   2   |   3   |   4   |   5   |
-        * +------------+-------+-------+-------+-------+-------+
-        * | 256G 2node | 358ms | 215ms | 157ms | 134ms | 126ms |
-        * | 2T   4node | 979ms | 679ms | 543ms | 489ms | 481ms |
-        * | 50G  2node | 71ms  | 44ms  | 37ms  | 30ms  | 31ms  |
-        * +------------+-------+-------+-------+-------+-------+
+        * +-----------------------+-------+-------+-------+-------+-------+
+        * | threads               |   8   |   16  |   32  |   64  |   128 |
+        * +-----------------------+-------+-------+-------+-------+-------+
+        * | skylake      144 cpus |   44s |   22s |   16s |   19s |   20s |
+        * | cascade lake 192 cpus |   39s |   20s |   11s |   10s |    9s |
+        * +-----------------------+-------+-------+-------+-------+-------+
          */
-       job.max_threads = num_node_state(N_MEMORY) * 2;
-       job.min_chunk   = h->max_huge_pages / num_node_state(N_MEMORY) / 2;
+
+       job.max_threads = num_allocation_threads;
+       job.min_chunk   = h->max_huge_pages / num_allocation_threads;
         padata_do_multithreaded(&job);
  
         return h->nr_huge_pages;
author	Thomas Prescher <thomas.prescher@cyberus-technology.de>
	Thu, 27 Feb 2025 22:45:05 +0000 (23:45 +0100)
committer	Andrew Morton <akpm@linux-foundation.org>
	Tue, 4 Mar 2025 05:50:52 +0000 (21:50 -0800)