mm, swap: implement dynamic allocation of swap table

author Kairui Song <kasong@tencent.com>

Wed, 10 Sep 2025 16:08:32 +0000 (00:08 +0800)

committer Andrew Morton <akpm@linux-foundation.org>

Fri, 12 Sep 2025 00:25:56 +0000 (17:25 -0700)
author Kairui Song <kasong@tencent.com>
Wed, 10 Sep 2025 16:08:32 +0000 (00:08 +0800)
committer Andrew Morton <akpm@linux-foundation.org>
Fri, 12 Sep 2025 00:25:56 +0000 (17:25 -0700)
diff --git a/mm/swap.h b/mm/swap.h

index fe5c209220828059d91fbf6a337092459550d14e..8d8efdf1297a7f7a6adb7fa71f175aeffe5f2984 100644 (file)
--- a/mm/swap.h
+++ b/mm/swap.h
@@ -36,7 +36,7 @@ struct swap_cluster_info {
         u16 count;
         u8 flags;
         u8 order;
-       atomic_long_t *table;   /* Swap table entries, see mm/swap_table.h */
+       atomic_long_t __rcu *table;     /* Swap table entries, see mm/swap_table.h */
         struct list_head list;
  };
  
diff --git a/mm/swap_state.c b/mm/swap_state.c

index 97372539a575d1554cd3f70b1b564c8955bf21b0..1fc3a9eff8f22d4416e906a98c11f5780acf341a 100644 (file)
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -91,8 +91,8 @@ struct folio *swap_cache_get_folio(swp_entry_t entry)
         struct folio *folio;
  
         for (;;) {
-               swp_tb = __swap_table_get(__swap_entry_to_cluster(entry),
-                                         swp_cluster_offset(entry));
+               swp_tb = swap_table_get(__swap_entry_to_cluster(entry),
+                                       swp_cluster_offset(entry));
                 if (!swp_tb_is_folio(swp_tb))
                         return NULL;
                 folio = swp_tb_to_folio(swp_tb);
@@ -115,11 +115,10 @@ void *swap_cache_get_shadow(swp_entry_t entry)
  {
         unsigned long swp_tb;
  
-       swp_tb = __swap_table_get(__swap_entry_to_cluster(entry),
-                                 swp_cluster_offset(entry));
+       swp_tb = swap_table_get(__swap_entry_to_cluster(entry),
+                               swp_cluster_offset(entry));
         if (swp_tb_is_shadow(swp_tb))
                 return swp_tb_to_shadow(swp_tb);
-
         return NULL;
  }
  
diff --git a/mm/swap_table.h b/mm/swap_table.h

index e1f7cc0097010dde513c0b7d0763017a8e851f7d..52254e455304c588ff5e62955df733cad780cdbb 100644 (file)
--- a/mm/swap_table.h
+++ b/mm/swap_table.h
@@ -2,8 +2,15 @@
  #ifndef _MM_SWAP_TABLE_H
  #define _MM_SWAP_TABLE_H
  
+#include <linux/rcupdate.h>
+#include <linux/atomic.h>
  #include "swap.h"
  
+/* A typical flat array in each cluster as swap table */
+struct swap_table {
+       atomic_long_t entries[SWAPFILE_CLUSTER];
+};
+
  /*
   * A swap table entry represents the status of a swap slot on a swap
   * (physical or virtual) device. The swap table in each cluster is a
@@ -76,22 +83,46 @@ static inline void *swp_tb_to_shadow(unsigned long swp_tb)
  static inline void __swap_table_set(struct swap_cluster_info *ci,
                                     unsigned int off, unsigned long swp_tb)
  {
+       atomic_long_t *table = rcu_dereference_protected(ci->table, true);
+
+       lockdep_assert_held(&ci->lock);
         VM_WARN_ON_ONCE(off >= SWAPFILE_CLUSTER);
-       atomic_long_set(&ci->table[off], swp_tb);
+       atomic_long_set(&table[off], swp_tb);
  }
  
  static inline unsigned long __swap_table_xchg(struct swap_cluster_info *ci,
                                               unsigned int off, unsigned long swp_tb)
  {
+       atomic_long_t *table = rcu_dereference_protected(ci->table, true);
+
+       lockdep_assert_held(&ci->lock);
         VM_WARN_ON_ONCE(off >= SWAPFILE_CLUSTER);
         /* Ordering is guaranteed by cluster lock, relax */
-       return atomic_long_xchg_relaxed(&ci->table[off], swp_tb);
+       return atomic_long_xchg_relaxed(&table[off], swp_tb);
  }
  
  static inline unsigned long __swap_table_get(struct swap_cluster_info *ci,
                                              unsigned int off)
  {
+       atomic_long_t *table;
+
         VM_WARN_ON_ONCE(off >= SWAPFILE_CLUSTER);
-       return atomic_long_read(&ci->table[off]);
+       table = rcu_dereference_check(ci->table, lockdep_is_held(&ci->lock));
+
+       return atomic_long_read(&table[off]);
+}
+
+static inline unsigned long swap_table_get(struct swap_cluster_info *ci,
+                                       unsigned int off)
+{
+       atomic_long_t *table;
+       unsigned long swp_tb;
+
+       rcu_read_lock();
+       table = rcu_dereference(ci->table);
+       swp_tb = table ? atomic_long_read(&table[off]) : null_to_swp_tb();
+       rcu_read_unlock();
+
+       return swp_tb;
  }
  #endif
diff --git a/mm/swapfile.c b/mm/swapfile.c

index 89659928465e728e6d26cdc39530e14a0a27219d..faf867a6c5c19e564d5a21fb29fd0108179a236e 100644 (file)
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -105,6 +105,8 @@ static DEFINE_SPINLOCK(swap_avail_lock);
  
  struct swap_info_struct *swap_info[MAX_SWAPFILES];
  
+static struct kmem_cache *swap_table_cachep;
+
  static DEFINE_MUTEX(swapon_mutex);
  
  static DECLARE_WAIT_QUEUE_HEAD(proc_poll_wait);
@@ -400,10 +402,17 @@ static inline bool cluster_is_discard(struct swap_cluster_info *info)
         return info->flags == CLUSTER_FLAG_DISCARD;
  }
  
+static inline bool cluster_table_is_alloced(struct swap_cluster_info *ci)
+{
+       return rcu_dereference_protected(ci->table, lockdep_is_held(&ci->lock));
+}
+
  static inline bool cluster_is_usable(struct swap_cluster_info *ci, int order)
  {
         if (unlikely(ci->flags > CLUSTER_FLAG_USABLE))
                 return false;
+       if (!cluster_table_is_alloced(ci))
+               return false;
         if (!order)
                 return true;
         return cluster_is_empty(ci) || order == ci->order;
@@ -421,32 +430,98 @@ static inline unsigned int cluster_offset(struct swap_info_struct *si,
         return cluster_index(si, ci) * SWAPFILE_CLUSTER;
  }
  
-static int swap_cluster_alloc_table(struct swap_cluster_info *ci)
+static void swap_cluster_free_table(struct swap_cluster_info *ci)
  {
-       WARN_ON(ci->table);
-       ci->table = kzalloc(sizeof(unsigned long) * SWAPFILE_CLUSTER, GFP_KERNEL);
-       if (!ci->table)
-               return -ENOMEM;
-       return 0;
+       unsigned int ci_off;
+       struct swap_table *table;
+
+       /* Only empty cluster's table is allow to be freed  */
+       lockdep_assert_held(&ci->lock);
+       VM_WARN_ON_ONCE(!cluster_is_empty(ci));
+       for (ci_off = 0; ci_off < SWAPFILE_CLUSTER; ci_off++)
+               VM_WARN_ON_ONCE(!swp_tb_is_null(__swap_table_get(ci, ci_off)));
+       table = (void *)rcu_dereference_protected(ci->table, true);
+       rcu_assign_pointer(ci->table, NULL);
+
+       kmem_cache_free(swap_table_cachep, table);
  }
  
-static void swap_cluster_free_table(struct swap_cluster_info *ci)
+/*
+ * Allocate a swap table may need to sleep, which leads to migration,
+ * so attempt an atomic allocation first then fallback and handle
+ * potential race.
+ */
+static struct swap_cluster_info *
+swap_cluster_alloc_table(struct swap_info_struct *si,
+                        struct swap_cluster_info *ci,
+                        int order)
  {
-       unsigned int ci_off;
-       unsigned long swp_tb;
+       struct swap_cluster_info *pcp_ci;
+       struct swap_table *table;
+       unsigned long offset;
  
-       if (!ci->table)
-               return;
+       /*
+        * Only cluster isolation from the allocator does table allocation.
+        * Swap allocator uses a percpu cluster and holds the local lock.
+        */
+       lockdep_assert_held(&ci->lock);
+       lockdep_assert_held(&this_cpu_ptr(&percpu_swap_cluster)->lock);
+
+       table = kmem_cache_zalloc(swap_table_cachep,
+                                 __GFP_HIGH | __GFP_NOMEMALLOC | __GFP_NOWARN);
+       if (table) {
+               rcu_assign_pointer(ci->table, table);
+               return ci;
+       }
+
+       /*
+        * Try a sleep allocation. Each isolated free cluster may cause
+        * a sleep allocation, but there is a limited number of them, so
+        * the potential recursive allocation should be limited.
+        */
+       spin_unlock(&ci->lock);
+       if (!(si->flags & SWP_SOLIDSTATE))
+               spin_unlock(&si->global_cluster_lock);
+       local_unlock(&percpu_swap_cluster.lock);
+       table = kmem_cache_zalloc(swap_table_cachep, __GFP_HIGH | GFP_KERNEL);
  
-       for (ci_off = 0; ci_off < SWAPFILE_CLUSTER; ci_off++) {
-               swp_tb = __swap_table_get(ci, ci_off);
-               if (!swp_tb_is_null(swp_tb))
-                       pr_err_once("swap: unclean swap space on swapoff: 0x%lx",
-                                   swp_tb);
+       local_lock(&percpu_swap_cluster.lock);
+       if (!(si->flags & SWP_SOLIDSTATE))
+               spin_lock(&si->global_cluster_lock);
+       /*
+        * Back to atomic context. First, check if we migrated to a new
+        * CPU with a usable percpu cluster. If so, try using that instead.
+        * No need to check it for the spinning device, as swap is
+        * serialized by the global lock on them.
+        *
+        * The is_usable check is a bit rough, but ensures order 0 success.
+        */
+       offset = this_cpu_read(percpu_swap_cluster.offset[order]);
+       if ((si->flags & SWP_SOLIDSTATE) && offset) {
+               pcp_ci = swap_cluster_lock(si, offset);
+               if (cluster_is_usable(pcp_ci, order) &&
+                   pcp_ci->count < SWAPFILE_CLUSTER) {
+                       ci = pcp_ci;
+                       goto free_table;
+               }
+               swap_cluster_unlock(pcp_ci);
         }
  
-       kfree(ci->table);
-       ci->table = NULL;
+       if (!table)
+               return NULL;
+
+       spin_lock(&ci->lock);
+       /* Nothing should have touched the dangling empty cluster. */
+       if (WARN_ON_ONCE(cluster_table_is_alloced(ci)))
+               goto free_table;
+
+       rcu_assign_pointer(ci->table, table);
+       return ci;
+
+free_table:
+       if (table)
+               kmem_cache_free(swap_table_cachep, table);
+       return ci;
  }
  
  static void move_cluster(struct swap_info_struct *si,
@@ -478,7 +553,7 @@ static void swap_cluster_schedule_discard(struct swap_info_struct *si,
  
  static void __free_cluster(struct swap_info_struct *si, struct swap_cluster_info *ci)
  {
-       lockdep_assert_held(&ci->lock);
+       swap_cluster_free_table(ci);
         move_cluster(si, ci, &si->free_clusters, CLUSTER_FLAG_FREE);
         ci->order = 0;
  }
@@ -493,15 +568,11 @@ static void __free_cluster(struct swap_info_struct *si, struct swap_cluster_info
   * this returns NULL for an non-empty list.
   */
  static struct swap_cluster_info *isolate_lock_cluster(
-               struct swap_info_struct *si, struct list_head *list)
+               struct swap_info_struct *si, struct list_head *list, int order)
  {
-       struct swap_cluster_info *ci, *ret = NULL;
+       struct swap_cluster_info *ci, *found = NULL;
  
         spin_lock(&si->lock);
-
-       if (unlikely(!(si->flags & SWP_WRITEOK)))
-               goto out;
-
         list_for_each_entry(ci, list, list) {
                 if (!spin_trylock(&ci->lock))
                         continue;
@@ -513,13 +584,19 @@ static struct swap_cluster_info *isolate_lock_cluster(
  
                 list_del(&ci->list);
                 ci->flags = CLUSTER_FLAG_NONE;
-               ret = ci;
+               found = ci;
                 break;
         }
-out:
         spin_unlock(&si->lock);
  
-       return ret;
+       if (found && !cluster_table_is_alloced(found)) {
+               /* Only an empty free cluster's swap table can be freed. */
+               VM_WARN_ON_ONCE(list != &si->free_clusters);
+               VM_WARN_ON_ONCE(!cluster_is_empty(found));
+               return swap_cluster_alloc_table(si, found, order);
+       }
+
+       return found;
  }
  
  /*
@@ -652,17 +729,27 @@ static void relocate_cluster(struct swap_info_struct *si,
   * added to free cluster list and its usage counter will be increased by 1.
   * Only used for initialization.
   */
-static void inc_cluster_info_page(struct swap_info_struct *si,
+static int inc_cluster_info_page(struct swap_info_struct *si,
         struct swap_cluster_info *cluster_info, unsigned long page_nr)
  {
         unsigned long idx = page_nr / SWAPFILE_CLUSTER;
+       struct swap_table *table;
         struct swap_cluster_info *ci;
  
         ci = cluster_info + idx;
+       if (!ci->table) {
+               table = kmem_cache_zalloc(swap_table_cachep, GFP_KERNEL);
+               if (!table)
+                       return -ENOMEM;
+               rcu_assign_pointer(ci->table, table);
+       }
+
         ci->count++;
  
         VM_BUG_ON(ci->count > SWAPFILE_CLUSTER);
         VM_BUG_ON(ci->flags);
+
+       return 0;
  }
  
  static bool cluster_reclaim_range(struct swap_info_struct *si,
@@ -844,7 +931,7 @@ static unsigned int alloc_swap_scan_list(struct swap_info_struct *si,
         unsigned int found = SWAP_ENTRY_INVALID;
  
         do {
-               struct swap_cluster_info *ci = isolate_lock_cluster(si, list);
+               struct swap_cluster_info *ci = isolate_lock_cluster(si, list, order);
                 unsigned long offset;
  
                 if (!ci)
@@ -869,7 +956,7 @@ static void swap_reclaim_full_clusters(struct swap_info_struct *si, bool force)
         if (force)
                 to_scan = swap_usage_in_pages(si) / SWAPFILE_CLUSTER;
  
-       while ((ci = isolate_lock_cluster(si, &si->full_clusters))) {
+       while ((ci = isolate_lock_cluster(si, &si->full_clusters, 0))) {
                 offset = cluster_offset(si, ci);
                 end = min(si->max, offset + SWAPFILE_CLUSTER);
                 to_scan--;
@@ -1017,6 +1104,7 @@ new_cluster:
  done:
         if (!(si->flags & SWP_SOLIDSTATE))
                 spin_unlock(&si->global_cluster_lock);
+
         return found;
  }
  
@@ -1884,7 +1972,13 @@ swp_entry_t get_swap_page_of_type(int type)
         /* This is called for allocating swap entry, not cache */
         if (get_swap_device_info(si)) {
                 if (si->flags & SWP_WRITEOK) {
+                       /*
+                        * Grab the local lock to be complaint
+                        * with swap table allocation.
+                        */
+                       local_lock(&percpu_swap_cluster.lock);
                         offset = cluster_alloc_swap_entry(si, 0, 1);
+                       local_unlock(&percpu_swap_cluster.lock);
                         if (offset) {
                                 entry = swp_entry(si->type, offset);
                                 atomic_long_dec(&nr_swap_pages);
@@ -2678,12 +2772,21 @@ static void wait_for_allocation(struct swap_info_struct *si)
  static void free_cluster_info(struct swap_cluster_info *cluster_info,
                               unsigned long maxpages)
  {
+       struct swap_cluster_info *ci;
         int i, nr_clusters = DIV_ROUND_UP(maxpages, SWAPFILE_CLUSTER);
  
         if (!cluster_info)
                 return;
-       for (i = 0; i < nr_clusters; i++)
-               swap_cluster_free_table(&cluster_info[i]);
+       for (i = 0; i < nr_clusters; i++) {
+               ci = cluster_info + i;
+               /* Cluster with bad marks count will have a remaining table */
+               spin_lock(&ci->lock);
+               if (rcu_dereference_protected(ci->table, true)) {
+                       ci->count = 0;
+                       swap_cluster_free_table(ci);
+               }
+               spin_unlock(&ci->lock);
+       }
         kvfree(cluster_info);
  }
  
@@ -2719,6 +2822,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
         struct address_space *mapping;
         struct inode *inode;
         struct filename *pathname;
+       unsigned int maxpages;
         int err, found = 0;
  
         if (!capable(CAP_SYS_ADMIN))
@@ -2825,8 +2929,8 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
         p->swap_map = NULL;
         zeromap = p->zeromap;
         p->zeromap = NULL;
+       maxpages = p->max;
         cluster_info = p->cluster_info;
-       free_cluster_info(cluster_info, p->max);
         p->max = 0;
         p->cluster_info = NULL;
         spin_unlock(&p->lock);
@@ -2838,6 +2942,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
         p->global_cluster = NULL;
         vfree(swap_map);
         kvfree(zeromap);
+       free_cluster_info(cluster_info, maxpages);
         /* Destroy swap account information */
         swap_cgroup_swapoff(p->type);
  
@@ -3216,11 +3321,8 @@ static struct swap_cluster_info *setup_clusters(struct swap_info_struct *si,
         if (!cluster_info)
                 goto err;
  
-       for (i = 0; i < nr_clusters; i++) {
+       for (i = 0; i < nr_clusters; i++)
                 spin_lock_init(&cluster_info[i].lock);
-               if (swap_cluster_alloc_table(&cluster_info[i]))
-                       goto err_free;
-       }
  
         if (!(si->flags & SWP_SOLIDSTATE)) {
                 si->global_cluster = kmalloc(sizeof(*si->global_cluster),
@@ -3239,16 +3341,23 @@ static struct swap_cluster_info *setup_clusters(struct swap_info_struct *si,
          * See setup_swap_map(): header page, bad pages,
          * and the EOF part of the last cluster.
          */
-       inc_cluster_info_page(si, cluster_info, 0);
+       err = inc_cluster_info_page(si, cluster_info, 0);
+       if (err)
+               goto err;
         for (i = 0; i < swap_header->info.nr_badpages; i++) {
                 unsigned int page_nr = swap_header->info.badpages[i];
  
                 if (page_nr >= maxpages)
                         continue;
-               inc_cluster_info_page(si, cluster_info, page_nr);
+               err = inc_cluster_info_page(si, cluster_info, page_nr);
+               if (err)
+                       goto err;
+       }
+       for (i = maxpages; i < round_up(maxpages, SWAPFILE_CLUSTER); i++) {
+               err = inc_cluster_info_page(si, cluster_info, i);
+               if (err)
+                       goto err;
         }
-       for (i = maxpages; i < round_up(maxpages, SWAPFILE_CLUSTER); i++)
-               inc_cluster_info_page(si, cluster_info, i);
  
         INIT_LIST_HEAD(&si->free_clusters);
         INIT_LIST_HEAD(&si->full_clusters);
@@ -3962,6 +4071,15 @@ static int __init swapfile_init(void)
  
         swapfile_maximum_size = arch_max_swapfile_size();
  
+       /*
+        * Once a cluster is freed, it's swap table content is read
+        * only, and all swap cache readers (swap_cache_*) verifies
+        * the content before use. So it's safe to use RCU slab here.
+        */
+       swap_table_cachep = kmem_cache_create("swap_table",
+                           sizeof(struct swap_table),
+                           0, SLAB_PANIC | SLAB_TYPESAFE_BY_RCU, NULL);
+
  #ifdef CONFIG_MIGRATION
         if (swapfile_maximum_size >= (1UL << SWP_MIG_TOTAL_BITS))
                 swap_migration_ad_supported = true;
author	Kairui Song <kasong@tencent.com>
	Wed, 10 Sep 2025 16:08:32 +0000 (00:08 +0800)
committer	Andrew Morton <akpm@linux-foundation.org>
	Fri, 12 Sep 2025 00:25:56 +0000 (17:25 -0700)
mm/swap.h		patch \| blob \| history
mm/swap_state.c		patch \| blob \| history
mm/swap_table.h		patch \| blob \| history
mm/swapfile.c		patch \| blob \| history