mm/swap: do not choose swap device according to numa node

author Baoquan He <bhe@redhat.com>

Sat, 11 Oct 2025 08:16:23 +0000 (16:16 +0800)

committer Andrew Morton <akpm@linux-foundation.org>

Wed, 15 Oct 2025 04:28:51 +0000 (21:28 -0700)
author Baoquan He <bhe@redhat.com>
Sat, 11 Oct 2025 08:16:23 +0000 (16:16 +0800)
committer Andrew Morton <akpm@linux-foundation.org>
Wed, 15 Oct 2025 04:28:51 +0000 (21:28 -0700)
diff --git a/Documentation/admin-guide/mm/swap_numa.rst b/Documentation/admin-guide/mm/swap_numa.rst

deleted file mode 100644 (file)

index 2e63062..0000000
--- a/Documentation/admin-guide/mm/swap_numa.rst
+++ /dev/null
@@ -1,78 +0,0 @@
-===========================================
-Automatically bind swap device to numa node
-===========================================
-
-If the system has more than one swap device and swap device has the node
-information, we can make use of this information to decide which swap
-device to use in get_swap_pages() to get better performance.
-
-
-How to use this feature
-=======================
-
-Swap device has priority and that decides the order of it to be used. To make
-use of automatically binding, there is no need to manipulate priority settings
-for swap devices. e.g. on a 2 node machine, assume 2 swap devices swapA and
-swapB, with swapA attached to node 0 and swapB attached to node 1, are going
-to be swapped on. Simply swapping them on by doing::
-
-       # swapon /dev/swapA
-       # swapon /dev/swapB
-
-Then node 0 will use the two swap devices in the order of swapA then swapB and
-node 1 will use the two swap devices in the order of swapB then swapA. Note
-that the order of them being swapped on doesn't matter.
-
-A more complex example on a 4 node machine. Assume 6 swap devices are going to
-be swapped on: swapA and swapB are attached to node 0, swapC is attached to
-node 1, swapD and swapE are attached to node 2 and swapF is attached to node3.
-The way to swap them on is the same as above::
-
-       # swapon /dev/swapA
-       # swapon /dev/swapB
-       # swapon /dev/swapC
-       # swapon /dev/swapD
-       # swapon /dev/swapE
-       # swapon /dev/swapF
-
-Then node 0 will use them in the order of::
-
-       swapA/swapB -> swapC -> swapD -> swapE -> swapF
-
-swapA and swapB will be used in a round robin mode before any other swap device.
-
-node 1 will use them in the order of::
-
-       swapC -> swapA -> swapB -> swapD -> swapE -> swapF
-
-node 2 will use them in the order of::
-
-       swapD/swapE -> swapA -> swapB -> swapC -> swapF
-
-Similaly, swapD and swapE will be used in a round robin mode before any
-other swap devices.
-
-node 3 will use them in the order of::
-
-       swapF -> swapA -> swapB -> swapC -> swapD -> swapE
-
-
-Implementation details
-======================
-
-The current code uses a priority based list, swap_avail_list, to decide
-which swap device to use and if multiple swap devices share the same
-priority, they are used round robin. This change here replaces the single
-global swap_avail_list with a per-numa-node list, i.e. for each numa node,
-it sees its own priority based list of available swap devices. Swap
-device's priority can be promoted on its matching node's swap_avail_list.
-
-The current swap device's priority is set as: user can set a >=0 value,
-or the system will pick one starting from -1 then downwards. The priority
-value in the swap_avail_list is the negated value of the swap device's
-due to plist being sorted from low to high. The new policy doesn't change
-the semantics for priority >=0 cases, the previous starting from -1 then
-downwards now becomes starting from -2 then downwards and -1 is reserved
-as the promoted value. So if multiple swap devices are attached to the same
-node, they will all be promoted to priority -1 on that node's plist and will
-be used round robin before any other swap devices.
diff --git a/include/linux/swap.h b/include/linux/swap.h

index a4b2648177359c8fab102708d243297e3238d8af..38ca3df68716042946274c18a3a6695dda3b7b65 100644 (file)
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -301,16 +301,7 @@ struct swap_info_struct {
         struct work_struct discard_work; /* discard worker */
         struct work_struct reclaim_work; /* reclaim worker */
         struct list_head discard_clusters; /* discard clusters list */
-       struct plist_node avail_lists[]; /*
-                                          * entries in swap_avail_heads, one
-                                          * entry per node.
-                                          * Must be last as the number of the
-                                          * array is nr_node_ids, which is not
-                                          * a fixed value so have to allocate
-                                          * dynamically.
-                                          * And it has to be an array so that
-                                          * plist_for_each_* can work.
-                                          */
+       struct plist_node avail_list;   /* entry in swap_avail_head */
  };
  
  static inline swp_entry_t page_swap_entry(struct page *page)
diff --git a/mm/swapfile.c b/mm/swapfile.c

index 0c2174d6b9248d18c90377979f22a1faf599fe68..4a36ea15de2b7c66dce0c2a0946a49280a7d22e2 100644 (file)
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -74,7 +74,7 @@ atomic_long_t nr_swap_pages;
  EXPORT_SYMBOL_GPL(nr_swap_pages);
  /* protected with swap_lock. reading in vm_swap_full() doesn't need lock */
  long total_swap_pages;
-static int least_priority = -1;
+static int least_priority;
  unsigned long swapfile_maximum_size;
  #ifdef CONFIG_MIGRATION
  bool swap_migration_ad_supported;
@@ -103,7 +103,7 @@ static PLIST_HEAD(swap_active_head);
   * is held and the locking order requires swap_lock to be taken
   * before any swap_info_struct->lock.
   */
-static struct plist_head *swap_avail_heads;
+static PLIST_HEAD(swap_avail_head);
  static DEFINE_SPINLOCK(swap_avail_lock);
  
  struct swap_info_struct *swap_info[MAX_SWAPFILES];
@@ -1130,7 +1130,6 @@ done:
  /* SWAP_USAGE_OFFLIST_BIT can only be set by this helper. */
  static void del_from_avail_list(struct swap_info_struct *si, bool swapoff)
  {
-       int nid;
         unsigned long pages;
  
         spin_lock(&swap_avail_lock);
@@ -1159,8 +1158,7 @@ static void del_from_avail_list(struct swap_info_struct *si, bool swapoff)
                         goto skip;
         }
  
-       for_each_node(nid)
-               plist_del(&si->avail_lists[nid], &swap_avail_heads[nid]);
+       plist_del(&si->avail_list, &swap_avail_head);
  
  skip:
         spin_unlock(&swap_avail_lock);
@@ -1169,7 +1167,6 @@ skip:
  /* SWAP_USAGE_OFFLIST_BIT can only be cleared by this helper. */
  static void add_to_avail_list(struct swap_info_struct *si, bool swapon)
  {
-       int nid;
         long val;
         unsigned long pages;
  
@@ -1202,8 +1199,7 @@ static void add_to_avail_list(struct swap_info_struct *si, bool swapon)
                         goto skip;
         }
  
-       for_each_node(nid)
-               plist_add(&si->avail_lists[nid], &swap_avail_heads[nid]);
+       plist_add(&si->avail_list, &swap_avail_head);
  
  skip:
         spin_unlock(&swap_avail_lock);
@@ -1346,16 +1342,14 @@ static bool swap_alloc_fast(swp_entry_t *entry,
  static bool swap_alloc_slow(swp_entry_t *entry,
                             int order)
  {
-       int node;
         unsigned long offset;
         struct swap_info_struct *si, *next;
  
-       node = numa_node_id();
         spin_lock(&swap_avail_lock);
  start_over:
-       plist_for_each_entry_safe(si, next, &swap_avail_heads[node], avail_lists[node]) {
+       plist_for_each_entry_safe(si, next, &swap_avail_head, avail_list) {
                 /* Rotate the device and switch to a new cluster */
-               plist_requeue(&si->avail_lists[node], &swap_avail_heads[node]);
+               plist_requeue(&si->avail_list, &swap_avail_head);
                 spin_unlock(&swap_avail_lock);
                 if (get_swap_device_info(si)) {
                         offset = cluster_alloc_swap_entry(si, order, SWAP_HAS_CACHE);
@@ -1380,7 +1374,7 @@ start_over:
                  * still in the swap_avail_head list then try it, otherwise
                  * start over if we have not gotten any slots.
                  */
-               if (plist_node_empty(&next->avail_lists[node]))
+               if (plist_node_empty(&next->avail_list))
                         goto start_over;
         }
         spin_unlock(&swap_avail_lock);
@@ -2709,25 +2703,11 @@ static int setup_swap_extents(struct swap_info_struct *sis, sector_t *span)
         return generic_swapfile_activate(sis, swap_file, span);
  }
  
-static int swap_node(struct swap_info_struct *si)
-{
-       struct block_device *bdev;
-
-       if (si->bdev)
-               bdev = si->bdev;
-       else
-               bdev = si->swap_file->f_inode->i_sb->s_bdev;
-
-       return bdev ? bdev->bd_disk->node_id : NUMA_NO_NODE;
-}
-
  static void setup_swap_info(struct swap_info_struct *si, int prio,
                             unsigned char *swap_map,
                             struct swap_cluster_info *cluster_info,
                             unsigned long *zeromap)
  {
-       int i;
-
         if (prio >= 0)
                 si->prio = prio;
         else
@@ -2737,16 +2717,7 @@ static void setup_swap_info(struct swap_info_struct *si, int prio,
          * low-to-high, while swap ordering is high-to-low
          */
         si->list.prio = -si->prio;
-       for_each_node(i) {
-               if (si->prio >= 0)
-                       si->avail_lists[i].prio = -si->prio;
-               else {
-                       if (swap_node(si) == i)
-                               si->avail_lists[i].prio = 1;
-                       else
-                               si->avail_lists[i].prio = -si->prio;
-               }
-       }
+       si->avail_list.prio = -si->prio;
         si->swap_map = swap_map;
         si->cluster_info = cluster_info;
         si->zeromap = zeromap;
@@ -2924,10 +2895,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
                 plist_for_each_entry_continue(si, &swap_active_head, list) {
                         si->prio++;
                         si->list.prio--;
-                       for_each_node(nid) {
-                               if (si->avail_lists[nid].prio != 1)
-                                       si->avail_lists[nid].prio--;
-                       }
+                       si->avail_list.prio--;
                 }
                 least_priority++;
         }
@@ -3168,9 +3136,8 @@ static struct swap_info_struct *alloc_swap_info(void)
         struct swap_info_struct *p;
         struct swap_info_struct *defer = NULL;
         unsigned int type;
-       int i;
  
-       p = kvzalloc(struct_size(p, avail_lists, nr_node_ids), GFP_KERNEL);
+       p = kvzalloc(sizeof(struct swap_info_struct), GFP_KERNEL);
         if (!p)
                 return ERR_PTR(-ENOMEM);
  
@@ -3209,8 +3176,7 @@ static struct swap_info_struct *alloc_swap_info(void)
         }
         p->swap_extent_root = RB_ROOT;
         plist_node_init(&p->list, 0);
-       for_each_node(i)
-               plist_node_init(&p->avail_lists[i], 0);
+       plist_node_init(&p->avail_list, 0);
         p->flags = SWP_USED;
         spin_unlock(&swap_lock);
         if (defer) {
@@ -3467,9 +3433,6 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
         if (!capable(CAP_SYS_ADMIN))
                 return -EPERM;
  
-       if (!swap_avail_heads)
-               return -ENOMEM;
-
         si = alloc_swap_info();
         if (IS_ERR(si))
                 return PTR_ERR(si);
@@ -4079,7 +4042,6 @@ static bool __has_usable_swap(void)
  void __folio_throttle_swaprate(struct folio *folio, gfp_t gfp)
  {
         struct swap_info_struct *si, *next;
-       int nid = folio_nid(folio);
  
         if (!(gfp & __GFP_IO))
                 return;
@@ -4098,8 +4060,8 @@ void __folio_throttle_swaprate(struct folio *folio, gfp_t gfp)
                 return;
  
         spin_lock(&swap_avail_lock);
-       plist_for_each_entry_safe(si, next, &swap_avail_heads[nid],
-                                 avail_lists[nid]) {
+       plist_for_each_entry_safe(si, next, &swap_avail_head,
+                                 avail_list) {
                 if (si->bdev) {
                         blkcg_schedule_throttle(si->bdev->bd_disk, true);
                         break;
@@ -4111,18 +4073,6 @@ void __folio_throttle_swaprate(struct folio *folio, gfp_t gfp)
  
  static int __init swapfile_init(void)
  {
-       int nid;
-
-       swap_avail_heads = kmalloc_array(nr_node_ids, sizeof(struct plist_head),
-                                        GFP_KERNEL);
-       if (!swap_avail_heads) {
-               pr_emerg("Not enough memory for swap heads, swap is disabled\n");
-               return -ENOMEM;
-       }
-
-       for_each_node(nid)
-               plist_head_init(&swap_avail_heads[nid]);
-
         swapfile_maximum_size = arch_max_swapfile_size();
  
         /*
author	Baoquan He <bhe@redhat.com>
	Sat, 11 Oct 2025 08:16:23 +0000 (16:16 +0800)
committer	Andrew Morton <akpm@linux-foundation.org>
	Wed, 15 Oct 2025 04:28:51 +0000 (21:28 -0700)
Documentation/admin-guide/mm/swap_numa.rst	[deleted file]	patch \| blob \| history
include/linux/swap.h		patch \| blob \| history
mm/swapfile.c		patch \| blob \| history