mm, swap: simplify folio swap allocation
author    Kairui Song <kasong@tencent.com>
          Mon, 24 Feb 2025 18:02:12 +0000 (02:02 +0800)
committer Andrew Morton <akpm@linux-foundation.org>
          Fri, 28 Feb 2025 01:00:32 +0000 (17:00 -0800)
With the slot cache gone, clean up the allocation helpers even more.
folio_alloc_swap will be the only entry point for allocating swap space
and adding the folio to the swap cache (except for suspend), making it
the opposite of folio_free_swap.
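
For reference, the reclaim-side calling convention introduced by the
mm/vmscan.c hunk below boils down to the following sketch. It is
illustrative only: the wrapper try_swap_out() is a hypothetical helper,
and the gfp flags are the ones this patch passes in vmscan.

/*
 * Illustrative sketch, not part of this patch: swapping out a locked,
 * uptodate folio with the new API. folio_alloc_swap() now reserves swap
 * space and inserts the folio into the swap cache in one step, returning
 * 0 on success or a negative errno (the old add_to_swap() returned bool
 * and took no gfp mask).
 */
static int try_swap_out(struct folio *folio)
{
        int err;

        /* The gfp mask only governs the swap cache (XArray node) allocation. */
        err = folio_alloc_swap(folio, __GFP_HIGH | __GFP_NOWARN);
        if (err)
                return err;     /* caller may split a large folio and retry */

        /*
         * Keep the folio dirty so reclaim writes it out, even for MADV_FREE
         * pages whose PTE dirty bit was already cleared.
         */
        folio_mark_dirty(folio);
        return 0;
}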

Link: https://lkml.kernel.org/r/20250224180212.22802-8-ryncsn@gmail.com
Signed-off-by: Kairui Song <kasong@tencent.com>
Reviewed-by: Baoquan He <bhe@redhat.com>
Cc: Baolin Wang <baolin.wang@linux.alibaba.com>
Cc: Barry Song <v-songbaohua@oppo.com>
Cc: Chris Li <chrisl@kernel.org>
Cc: "Huang, Ying" <ying.huang@linux.alibaba.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Kalesh Singh <kaleshsingh@google.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Nhat Pham <nphamcs@gmail.com>
Cc: Yosry Ahmed <yosryahmed@google.com>
Cc: Stephen Rothwell <sfr@canb.auug.org.au>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
include/linux/swap.h
mm/shmem.c
mm/swap.h
mm/swap_state.c
mm/swapfile.c
mm/vmscan.c
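
At a glance, the interface change carried through the hunks below (taken
from the include/linux/swap.h diff) is:

/* Before: returns a swp_entry_t; entry.val == 0 means failure, and the
 * caller must add the folio to the swap cache itself.
 */
swp_entry_t folio_alloc_swap(struct folio *folio);

/* After: allocates swap space and adds the folio to the swap cache in one
 * step; returns 0 on success or a negative errno on failure.
 */
int folio_alloc_swap(struct folio *folio, gfp_t gfp_mask);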

index a0a262bcaf4164a5051e88c8d8c54667be31d2b3..3a68da686c4ea5118416673e1460f8e69bde20c1 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -478,7 +478,7 @@ static inline long get_nr_swap_pages(void)
 }
 
 extern void si_swapinfo(struct sysinfo *);
-swp_entry_t folio_alloc_swap(struct folio *folio);
+int folio_alloc_swap(struct folio *folio, gfp_t gfp_mask);
 bool folio_free_swap(struct folio *folio);
 void put_swap_folio(struct folio *folio, swp_entry_t entry);
 extern swp_entry_t get_swap_page_of_type(int);
@@ -587,11 +587,9 @@ static inline int swp_swapcount(swp_entry_t entry)
        return 0;
 }
 
-static inline swp_entry_t folio_alloc_swap(struct folio *folio)
+static inline int folio_alloc_swap(struct folio *folio, gfp_t gfp_mask)
 {
-       swp_entry_t entry;
-       entry.val = 0;
-       return entry;
+       return -EINVAL;
 }
 
 static inline bool folio_free_swap(struct folio *folio)
index 8bedc8a99e011d9eb30ef36e9ac51e96f53556ce..e442014509e88048c6d113515e4e0e86274f326a 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -1546,7 +1546,6 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc)
        struct inode *inode = mapping->host;
        struct shmem_inode_info *info = SHMEM_I(inode);
        struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
-       swp_entry_t swap;
        pgoff_t index;
        int nr_pages;
        bool split = false;
@@ -1628,14 +1627,6 @@ try_split:
                folio_mark_uptodate(folio);
        }
 
-       swap = folio_alloc_swap(folio);
-       if (!swap.val) {
-               if (nr_pages > 1)
-                       goto try_split;
-
-               goto redirty;
-       }
-
        /*
         * Add inode to shmem_unuse()'s list of swapped-out inodes,
         * if it's not already there.  Do it now before the folio is
@@ -1648,20 +1639,20 @@ try_split:
        if (list_empty(&info->swaplist))
                list_add(&info->swaplist, &shmem_swaplist);
 
-       if (add_to_swap_cache(folio, swap,
-                       __GFP_HIGH | __GFP_NOMEMALLOC | __GFP_NOWARN,
-                       NULL) == 0) {
+       if (!folio_alloc_swap(folio, __GFP_HIGH | __GFP_NOMEMALLOC | __GFP_NOWARN)) {
                shmem_recalc_inode(inode, 0, nr_pages);
-               swap_shmem_alloc(swap, nr_pages);
-               shmem_delete_from_page_cache(folio, swp_to_radix_entry(swap));
+               swap_shmem_alloc(folio->swap, nr_pages);
+               shmem_delete_from_page_cache(folio, swp_to_radix_entry(folio->swap));
 
                mutex_unlock(&shmem_swaplist_mutex);
                BUG_ON(folio_mapped(folio));
                return swap_writepage(&folio->page, wbc);
        }
 
+       list_del_init(&info->swaplist);
        mutex_unlock(&shmem_swaplist_mutex);
-       put_swap_folio(folio, swap);
+       if (nr_pages > 1)
+               goto try_split;
 redirty:
        folio_mark_dirty(folio);
        if (wbc->for_reclaim)
index ad2f121de970d50647b3010e1ddf8d5af7daa1ac..0abb68091b4f1b6a1545dd73848688a8e565f98f 100644
--- a/mm/swap.h
+++ b/mm/swap.h
@@ -50,7 +50,6 @@ static inline pgoff_t swap_cache_index(swp_entry_t entry)
 }
 
 void show_swap_cache_info(void);
-bool add_to_swap(struct folio *folio);
 void *get_shadow_from_swap_cache(swp_entry_t entry);
 int add_to_swap_cache(struct folio *folio, swp_entry_t entry,
                      gfp_t gfp, void **shadowp);
@@ -163,11 +162,6 @@ struct folio *filemap_get_incore_folio(struct address_space *mapping,
        return filemap_get_folio(mapping, index);
 }
 
-static inline bool add_to_swap(struct folio *folio)
-{
-       return false;
-}
-
 static inline void *get_shadow_from_swap_cache(swp_entry_t entry)
 {
        return NULL;
index 2b5744e211cd6eaab8c0919533b5200375527b94..68fd981b514fef29a275c9dbf25b43db637b6a4f 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -166,63 +166,6 @@ void __delete_from_swap_cache(struct folio *folio,
        __lruvec_stat_mod_folio(folio, NR_SWAPCACHE, -nr);
 }
 
-/**
- * add_to_swap - allocate swap space for a folio
- * @folio: folio we want to move to swap
- *
- * Allocate swap space for the folio and add the folio to the
- * swap cache.
- *
- * Context: Caller needs to hold the folio lock.
- * Return: Whether the folio was added to the swap cache.
- */
-bool add_to_swap(struct folio *folio)
-{
-       swp_entry_t entry;
-       int err;
-
-       VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
-       VM_BUG_ON_FOLIO(!folio_test_uptodate(folio), folio);
-
-       entry = folio_alloc_swap(folio);
-       if (!entry.val)
-               return false;
-
-       /*
-        * XArray node allocations from PF_MEMALLOC contexts could
-        * completely exhaust the page allocator. __GFP_NOMEMALLOC
-        * stops emergency reserves from being allocated.
-        *
-        * TODO: this could cause a theoretical memory reclaim
-        * deadlock in the swap out path.
-        */
-       /*
-        * Add it to the swap cache.
-        */
-       err = add_to_swap_cache(folio, entry,
-                       __GFP_HIGH|__GFP_NOMEMALLOC|__GFP_NOWARN, NULL);
-       if (err)
-               goto fail;
-       /*
-        * Normally the folio will be dirtied in unmap because its
-        * pte should be dirty. A special case is MADV_FREE page. The
-        * page's pte could have dirty bit cleared but the folio's
-        * SwapBacked flag is still set because clearing the dirty bit
-        * and SwapBacked flag has no lock protected. For such folio,
-        * unmap will not set dirty bit for it, so folio reclaim will
-        * not write the folio out. This can cause data corruption when
-        * the folio is swapped in later. Always setting the dirty flag
-        * for the folio solves the problem.
-        */
-       folio_mark_dirty(folio);
-
-       return true;
-
-fail:
-       put_swap_folio(folio, entry);
-       return false;
-}
-
 /*
  * This must be called only on folios that have
  * been verified to be in the swap cache and locked.
index 1ba916109d993bdba20be18181afad08efa0d074..628f67974a7c1ba290fba4d6938d6eb98cf3e889 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -1174,9 +1174,9 @@ static bool get_swap_device_info(struct swap_info_struct *si)
  * Fast path try to get swap entries with specified order from current
  * CPU's swap entry pool (a cluster).
  */
-static int swap_alloc_fast(swp_entry_t *entry,
-                          unsigned char usage,
-                          int order)
+static bool swap_alloc_fast(swp_entry_t *entry,
+                           unsigned char usage,
+                           int order)
 {
        struct swap_cluster_info *ci;
        struct swap_info_struct *si;
@@ -1206,47 +1206,31 @@ static int swap_alloc_fast(swp_entry_t *entry,
        return !!found;
 }
 
-swp_entry_t folio_alloc_swap(struct folio *folio)
+/* Rotate the device and switch to a new cluster */
+static bool swap_alloc_slow(swp_entry_t *entry,
+                           unsigned char usage,
+                           int order)
 {
-       unsigned int order = folio_order(folio);
-       unsigned int size = 1 << order;
-       struct swap_info_struct *si, *next;
-       swp_entry_t entry = {};
-       unsigned long offset;
        int node;
+       unsigned long offset;
+       struct swap_info_struct *si, *next;
 
-       if (order) {
-               /*
-                * Should not even be attempting large allocations when huge
-                * page swap is disabled. Warn and fail the allocation.
-                */
-               if (!IS_ENABLED(CONFIG_THP_SWAP) || size > SWAPFILE_CLUSTER) {
-                       VM_WARN_ON_ONCE(1);
-                       return entry;
-               }
-       }
-
-       /* Fast path using percpu cluster */
-       local_lock(&percpu_swap_cluster.lock);
-       if (swap_alloc_fast(&entry, SWAP_HAS_CACHE, order))
-               goto out_alloced;
-
-       /* Rotate the device and switch to a new cluster */
+       node = numa_node_id();
        spin_lock(&swap_avail_lock);
 start_over:
-       node = numa_node_id();
        plist_for_each_entry_safe(si, next, &swap_avail_heads[node], avail_lists[node]) {
+               /* Rotate the device and switch to a new cluster */
                plist_requeue(&si->avail_lists[node], &swap_avail_heads[node]);
                spin_unlock(&swap_avail_lock);
                if (get_swap_device_info(si)) {
                        offset = cluster_alloc_swap_entry(si, order, SWAP_HAS_CACHE);
                        put_swap_device(si);
                        if (offset) {
-                               entry = swp_entry(si->type, offset);
-                               goto out_alloced;
+                               *entry = swp_entry(si->type, offset);
+                               return true;
                        }
                        if (order)
-                               goto out_failed;
+                               return false;
                }
 
                spin_lock(&swap_avail_lock);
@@ -1265,20 +1249,68 @@ start_over:
                        goto start_over;
        }
        spin_unlock(&swap_avail_lock);
-out_failed:
+       return false;
+}
+
+/**
+ * folio_alloc_swap - allocate swap space for a folio
+ * @folio: folio we want to move to swap
+ * @gfp: gfp mask for shadow nodes
+ *
+ * Allocate swap space for the folio and add the folio to the
+ * swap cache.
+ *
+ * Context: Caller needs to hold the folio lock.
+ * Return: 0 if the folio was added to the swap cache, -errno otherwise.
+ */
+int folio_alloc_swap(struct folio *folio, gfp_t gfp)
+{
+       unsigned int order = folio_order(folio);
+       unsigned int size = 1 << order;
+       swp_entry_t entry = {};
+
+       VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
+       VM_BUG_ON_FOLIO(!folio_test_uptodate(folio), folio);
+
+       /*
+        * Should not even be attempting large allocations when huge
+        * page swap is disabled. Warn and fail the allocation.
+        */
+       if (order && (!IS_ENABLED(CONFIG_THP_SWAP) || size > SWAPFILE_CLUSTER)) {
+               VM_WARN_ON_ONCE(1);
+               return -EINVAL;
+       }
+
+       local_lock(&percpu_swap_cluster.lock);
+       if (swap_alloc_fast(&entry, SWAP_HAS_CACHE, order))
+               goto out_alloced;
+       if (swap_alloc_slow(&entry, SWAP_HAS_CACHE, order))
+               goto out_alloced;
        local_unlock(&percpu_swap_cluster.lock);
-       return entry;
+       return -ENOMEM;
 
 out_alloced:
        local_unlock(&percpu_swap_cluster.lock);
-       if (mem_cgroup_try_charge_swap(folio, entry)) {
-               put_swap_folio(folio, entry);
-               entry.val = 0;
-       } else {
-               atomic_long_sub(size, &nr_swap_pages);
-       }
+       if (mem_cgroup_try_charge_swap(folio, entry))
+               goto out_free;
 
-       return entry;
+       /*
+        * XArray node allocations from PF_MEMALLOC contexts could
+        * completely exhaust the page allocator. __GFP_NOMEMALLOC
+        * stops emergency reserves from being allocated.
+        *
+        * TODO: this could cause a theoretical memory reclaim
+        * deadlock in the swap out path.
+        */
+       if (add_to_swap_cache(folio, entry, gfp | __GFP_NOMEMALLOC, NULL))
+               goto out_free;
+
+       atomic_long_sub(size, &nr_swap_pages);
+       return 0;
+
+out_free:
+       put_swap_folio(folio, entry);
+       return -ENOMEM;
 }
 
 static struct swap_info_struct *_swap_info_get(swp_entry_t entry)
index fcca38bc640f5ecb87b7000eac7ffbfb14646f3d..be00af3763b536483ab132060d028ab36389c58d 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1289,7 +1289,7 @@ retry:
                                            split_folio_to_list(folio, folio_list))
                                                goto activate_locked;
                                }
-                               if (!add_to_swap(folio)) {
+                               if (folio_alloc_swap(folio, __GFP_HIGH | __GFP_NOWARN)) {
                                        int __maybe_unused order = folio_order(folio);
 
                                        if (!folio_test_large(folio))
@@ -1305,9 +1305,21 @@ retry:
                                        }
 #endif
                                        count_mthp_stat(order, MTHP_STAT_SWPOUT_FALLBACK);
-                                       if (!add_to_swap(folio))
+                                       if (folio_alloc_swap(folio, __GFP_HIGH | __GFP_NOWARN))
                                                goto activate_locked_split;
                                }
+                               /*
+                                * Normally the folio will be dirtied in unmap because its
+                                * pte should be dirty. A special case is MADV_FREE page. The
+                                * page's pte could have dirty bit cleared but the folio's
+                                * SwapBacked flag is still set because clearing the dirty bit
+                                * and SwapBacked flag has no lock protected. For such folio,
+                                * unmap will not set dirty bit for it, so folio reclaim will
+                                * not write the folio out. This can cause data corruption when
+                                * the folio is swapped in later. Always setting the dirty flag
+                                * for the folio solves the problem.
+                                */
+                               folio_mark_dirty(folio);
                        }
                }