        return ERR_PTR(error);
}
+static struct folio *shmem_swap_alloc_folio(struct inode *inode,
+                struct vm_area_struct *vma, pgoff_t index,
+                swp_entry_t entry, int order, gfp_t gfp)
+{
+        struct shmem_inode_info *info = SHMEM_I(inode);
+        struct folio *new;
+        void *shadow;
+        int nr_pages;
+
+        /*
+         * We have arrived here because our zones are constrained, so don't
+         * limit chance of success with further cpuset and node constraints.
+         */
+        gfp &= ~GFP_CONSTRAINT_MASK;
+        if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && order > 0) {
+                gfp_t huge_gfp = vma_thp_gfp_mask(vma);
+
+                gfp = limit_gfp_mask(huge_gfp, gfp);
+        }
+
+        new = shmem_alloc_folio(gfp, order, info, index);
+        if (!new)
+                return ERR_PTR(-ENOMEM);
+
+        nr_pages = folio_nr_pages(new);
+        if (mem_cgroup_swapin_charge_folio(new, vma ? vma->vm_mm : NULL,
+                                           gfp, entry)) {
+                folio_put(new);
+                return ERR_PTR(-ENOMEM);
+        }
+
+        /*
+         * Prevent parallel swapin from proceeding with the swap cache flag.
+         *
+         * There is one more concurrent scenario to cover: the swap cache
+         * flag of a large folio may already have been set by
+         * swapcache_prepare() while another thread has already split the
+         * large swap entry stored in the shmem mapping. In that case,
+         * shmem_add_to_page_cache() will identify the concurrent swapin
+         * and return -EEXIST.
+         */
+        if (swapcache_prepare(entry, nr_pages)) {
+                folio_put(new);
+                return ERR_PTR(-EEXIST);
+        }
+
+        __folio_set_locked(new);
+        __folio_set_swapbacked(new);
+        new->swap = entry;
+
+        mem_cgroup_swapin_uncharge_swap(entry, nr_pages);
+        shadow = get_shadow_from_swap_cache(entry);
+        if (shadow)
+                workingset_refault(new, shadow);
+        folio_add_lru(new);
+        swap_read_folio(new, NULL);
+        return new;
+}
+
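
The swapcache_prepare() call above effectively claims the swap entry: the first
swapin of a given entry wins and goes on to do the read, while any racing
swapin sees the existing claim and backs out with -EEXIST so its caller can
retry the lookup. As a rough, self-contained sketch of that claim-before-I/O
idea (not the kernel's implementation; NSLOTS, claim_for_swapin() and
release_claim() are invented for the illustration), a per-slot atomic flag
gives the same first-caller-wins behaviour:

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

#define NSLOTS 64

/* One claim flag per swap slot; zero-initialized, i.e. unclaimed. */
static atomic_bool slot_claimed[NSLOTS];

/* Returns true if we won the claim and may perform the single read. */
static bool claim_for_swapin(unsigned int slot)
{
        bool expected = false;

        /* Plays the role of swapcache_prepare() succeeding. */
        return atomic_compare_exchange_strong(&slot_claimed[slot],
                                              &expected, true);
}

/* Plays the role of swapcache_clear() on the raced/error paths. */
static void release_claim(unsigned int slot)
{
        atomic_store(&slot_claimed[slot], false);
}

int main(void)
{
        if (claim_for_swapin(7)) {
                printf("won the claim: read the data, add it to the cache\n");
                release_claim(7);
        } else {
                printf("lost the race: return -EEXIST and retry the lookup\n");
        }
        return 0;
}

In the patch the claim is held until the folio is installed in the mapping or
an error path runs swapcache_clear(), which is exactly what the skip_swapcache
handling further down in shmem_swapin_folio() arranges.
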
/*
* When a page is moved from swapcache to shmem filecache (either by the
* usual swapin of shmem_get_folio_gfp(), or by the less common swapoff of
}
static void shmem_set_folio_swapin_error(struct inode *inode, pgoff_t index,
-                                         struct folio *folio, swp_entry_t swap)
+                                         struct folio *folio, swp_entry_t swap,
+                                         bool skip_swapcache)
{
        struct address_space *mapping = inode->i_mapping;
        swp_entry_t swapin_error;
        nr_pages = folio_nr_pages(folio);
        folio_wait_writeback(folio);
-        delete_from_swap_cache(folio);
+        if (!skip_swapcache)
+                delete_from_swap_cache(folio);
        /*
         * Don't treat swapin error folio as alloced. Otherwise inode->i_blocks
         * won't be 0 when inode is released and thus trigger WARN_ON(i_blocks)
        struct shmem_inode_info *info = SHMEM_I(inode);
        struct swap_info_struct *si;
        struct folio *folio = NULL;
+        bool skip_swapcache = false;
        swp_entry_t swap;
        int error, nr_pages;
        /* Look it up and read it in.. */
        folio = swap_cache_get_folio(swap, NULL, 0);
        if (!folio) {
+                int order = xa_get_order(&mapping->i_pages, index);
+                bool fallback_order0 = false;
                int split_order;
                /* Or update major stats only when swapin succeeds?? */
                        count_memcg_event_mm(fault_mm, PGMAJFAULT);
                }
+                /*
+                 * If uffd is active for the vma, we need per-page fault
+                 * fidelity to maintain the uffd semantics, so fall back to
+                 * swapping in an order-0 folio; do the same for the zswap
+                 * case.
+                 */
+                if (order > 0 && ((vma && unlikely(userfaultfd_armed(vma))) ||
+                                  !zswap_never_enabled()))
+                        fallback_order0 = true;
+
+                /* Skip swapcache for synchronous device. */
+                if (!fallback_order0 && data_race(si->flags & SWP_SYNCHRONOUS_IO)) {
+                        folio = shmem_swap_alloc_folio(inode, vma, index, swap, order, gfp);
+                        if (!IS_ERR(folio)) {
+                                skip_swapcache = true;
+                                goto alloced;
+                        }
+
+                        /*
+                         * Fall back to order-0 swapin unless the swap entry
+                         * already exists.
+                         */
+                        error = PTR_ERR(folio);
+                        folio = NULL;
+                        if (error == -EEXIST)
+                                goto failed;
+                }
+
                /*
                 * Now swap device can only swap in order 0 folio, then we
                 * should split the large swap entry stored in the pagecache
                }
        }
+alloced:
        /* We have to do this with folio locked to prevent races */
        folio_lock(folio);
-        if (!folio_test_swapcache(folio) ||
+        if ((!skip_swapcache && !folio_test_swapcache(folio)) ||
            folio->swap.val != swap.val ||
            !shmem_confirm_swap(mapping, index, swap)) {
                error = -EEXIST;
        if (sgp == SGP_WRITE)
                folio_mark_accessed(folio);
-        delete_from_swap_cache(folio);
+        if (skip_swapcache) {
+                folio->swap.val = 0;
+                swapcache_clear(si, swap, nr_pages);
+        } else {
+                delete_from_swap_cache(folio);
+        }
        folio_mark_dirty(folio);
        swap_free_nr(swap, nr_pages);
        put_swap_device(si);
        if (!shmem_confirm_swap(mapping, index, swap))
                error = -EEXIST;
        if (error == -EIO)
-                shmem_set_folio_swapin_error(inode, index, folio, swap);
+                shmem_set_folio_swapin_error(inode, index, folio, swap,
+                                             skip_swapcache);
unlock:
+        if (skip_swapcache)
+                swapcache_clear(si, swap, folio_nr_pages(folio));
        if (folio) {
                folio_unlock(folio);
                folio_put(folio);
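
A detail worth spelling out in the hunks above is how the skip-swapcache branch
tells its two failure modes apart: shmem_swap_alloc_folio() never returns NULL,
it returns either a folio or an errno encoded with ERR_PTR(), so -EEXIST
(another swapin already holds the claim) can be handled differently from
-ENOMEM (fall back to the order-0 path). A minimal userspace sketch of that
error-pointer convention (simplified from the kernel's include/linux/err.h;
alloc_or_err() is a made-up stand-in for the helper) looks like this:

#include <errno.h>
#include <stdio.h>

#define MAX_ERRNO 4095

/* Encode a small negative errno in an otherwise-invalid pointer value. */
static inline void *ERR_PTR(long error)
{
        return (void *)error;
}

static inline long PTR_ERR(const void *ptr)
{
        return (long)ptr;
}

static inline int IS_ERR(const void *ptr)
{
        return (unsigned long)ptr >= (unsigned long)-MAX_ERRNO;
}

/* Made-up stand-in for shmem_swap_alloc_folio()'s two failure modes. */
static void *alloc_or_err(int raced)
{
        return raced ? ERR_PTR(-EEXIST) : ERR_PTR(-ENOMEM);
}

int main(void)
{
        void *folio = alloc_or_err(1);

        if (IS_ERR(folio)) {
                long err = PTR_ERR(folio);

                if (err == -EEXIST)
                        printf("raced with another swapin: bail out\n");
                else
                        printf("allocation failed: fall back to order-0\n");
        }
        return 0;
}

This is why the branch only jumps to the failed label on -EEXIST and otherwise
resets folio to NULL and falls through to the existing order-0 swapin code.
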