mm: handle swap page faults under per-VMA lock
author     Suren Baghdasaryan <surenb@google.com>
           Fri, 30 Jun 2023 21:19:56 +0000 (14:19 -0700)
committer  Andrew Morton <akpm@linux-foundation.org>
           Thu, 24 Aug 2023 23:20:17 +0000 (16:20 -0700)
When a page fault is handled under per-VMA lock protection, all swap page
faults are retried with mmap_lock because folio_lock_or_retry has to drop
and reacquire mmap_lock if the folio cannot be locked immediately.  Follow
the same pattern as with mmap_lock: drop the per-VMA lock while waiting
for the folio and retry once the folio is available.
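For context, a minimal sketch of the caller side of this pattern, as seen
from an arch page fault handler (simplified from the shape of
arch/x86/mm/fault.c; error handling and accounting omitted): the fault is
first attempted under the per-VMA read lock, and VM_FAULT_RETRY sends it
down the mmap_lock path.

	/* Try the fault under the per-VMA read lock first. */
	vma = lock_vma_under_rcu(mm, address);
	if (vma) {
		fault = handle_mm_fault(vma, address,
					flags | FAULT_FLAG_VMA_LOCK, regs);
		if (!(fault & (VM_FAULT_RETRY | VM_FAULT_COMPLETED)))
			vma_end_read(vma); /* on retry the handler dropped it */
		if (!(fault & VM_FAULT_RETRY))
			goto done;
	}
	/* Fall back to the traditional mmap_lock-protected path. */
	mmap_read_lock(mm);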

With this obstacle removed, enable do_swap_page to operate under per-VMA
lock protection.  Drivers implementing ops->migrate_to_ram might still
rely on mmap_lock, therefore we have to fall back to mmap_lock in that
particular case.

Note that the only time do_swap_page calls synchronous swap_readpage is
when SWP_SYNCHRONOUS_IO is set, which is only set for
QUEUE_FLAG_SYNCHRONOUS devices: brd, zram and nvdimms (both btt and pmem).
Therefore we don't sleep in this path, and there's no need to drop the
mmap or per-VMA lock.
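For reference, a simplified excerpt of the gate in question in
do_swap_page (swap-cache and allocation details elided; see mm/memory.c
for the real code):

	/*
	 * The swap cache is bypassed only for SWP_SYNCHRONOUS_IO devices
	 * with a single swap reference; the read completes synchronously,
	 * so this path never sleeps on I/O.
	 */
	if (data_race(si->flags & SWP_SYNCHRONOUS_IO) &&
	    __swap_count(entry) == 1) {
		/* skip swapcache */
		...
		swap_readpage(page, true, NULL);
	}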

Link: https://lkml.kernel.org/r/20230630211957.1341547-6-surenb@google.com
Signed-off-by: Suren Baghdasaryan <surenb@google.com>
Tested-by: Alistair Popple <apopple@nvidia.com>
Reviewed-by: Alistair Popple <apopple@nvidia.com>
Acked-by: Peter Xu <peterx@redhat.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Christian Brauner <brauner@kernel.org>
Cc: Christoph Hellwig <hch@lst.de>
Cc: David Hildenbrand <david@redhat.com>
Cc: David Howells <dhowells@redhat.com>
Cc: Davidlohr Bueso <dave@stgolabs.net>
Cc: Hillf Danton <hdanton@sina.com>
Cc: "Huang, Ying" <ying.huang@intel.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Jan Kara <jack@suse.cz>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Josef Bacik <josef@toxicpanda.com>
Cc: Laurent Dufour <ldufour@linux.ibm.com>
Cc: Liam R. Howlett <Liam.Howlett@oracle.com>
Cc: Lorenzo Stoakes <lstoakes@gmail.com>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Michel Lespinasse <michel@lespinasse.org>
Cc: Minchan Kim <minchan@google.com>
Cc: Pavel Tatashin <pasha.tatashin@soleen.com>
Cc: Punit Agrawal <punit.agrawal@bytedance.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Yu Zhao <yuzhao@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
include/linux/mm.h
mm/filemap.c
mm/memory.c

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 939386e0aedab9f018d8fae4fe8ead037d08f7ec..0d16208178c7b6d18f7cfb14fe23c27a449f5a1e 100644
@@ -729,6 +729,14 @@ static inline void vma_mark_detached(struct vm_area_struct *vma, bool detached)
        vma->detached = detached;
 }
 
+static inline void release_fault_lock(struct vm_fault *vmf)
+{
+       if (vmf->flags & FAULT_FLAG_VMA_LOCK)
+               vma_end_read(vmf->vma);
+       else
+               mmap_read_unlock(vmf->vma->vm_mm);
+}
+
 struct vm_area_struct *lock_vma_under_rcu(struct mm_struct *mm,
                                          unsigned long address);
 
@@ -749,6 +757,11 @@ static inline struct vm_area_struct *lock_vma_under_rcu(struct mm_struct *mm,
        return NULL;
 }
 
+static inline void release_fault_lock(struct vm_fault *vmf)
+{
+       mmap_read_unlock(vmf->vma->vm_mm);
+}
+
 #endif /* CONFIG_PER_VMA_LOCK */
 
 extern const struct vm_operations_struct vma_dummy_vm_ops;
diff --git a/mm/filemap.c b/mm/filemap.c
index 40514493014a15c972708d2ab17fb016d519ade7..8040545954bc4175b26e1efd7331828a774333cd 100644
@@ -1671,27 +1671,26 @@ static int __folio_lock_async(struct folio *folio, struct wait_page_queue *wait)
  * Return values:
  * 0 - folio is locked.
  * non-zero - folio is not locked.
- *     mmap_lock has been released (mmap_read_unlock(), unless flags had both
- *     FAULT_FLAG_ALLOW_RETRY and FAULT_FLAG_RETRY_NOWAIT set, in
- *     which case mmap_lock is still held.
+ *     mmap_lock or per-VMA lock has been released (mmap_read_unlock() or
+ *     vma_end_read()), unless flags had both FAULT_FLAG_ALLOW_RETRY and
+ *     FAULT_FLAG_RETRY_NOWAIT set, in which case the lock is still held.
  *
  * If neither ALLOW_RETRY nor KILLABLE are set, will always return 0
- * with the folio locked and the mmap_lock unperturbed.
+ * with the folio locked and the mmap_lock/per-VMA lock is left unperturbed.
  */
 vm_fault_t __folio_lock_or_retry(struct folio *folio, struct vm_fault *vmf)
 {
-       struct mm_struct *mm = vmf->vma->vm_mm;
        unsigned int flags = vmf->flags;
 
        if (fault_flag_allow_retry_first(flags)) {
                /*
-                * CAUTION! In this case, mmap_lock is not released
-                * even though return VM_FAULT_RETRY.
+                * CAUTION! In this case, mmap_lock/per-VMA lock is not
+                * released even though returning VM_FAULT_RETRY.
                 */
                if (flags & FAULT_FLAG_RETRY_NOWAIT)
                        return VM_FAULT_RETRY;
 
-               mmap_read_unlock(mm);
+               release_fault_lock(vmf);
                if (flags & FAULT_FLAG_KILLABLE)
                        folio_wait_locked_killable(folio);
                else
@@ -1703,7 +1702,7 @@ vm_fault_t __folio_lock_or_retry(struct folio *folio, struct vm_fault *vmf)
 
                ret = __folio_lock_killable(folio);
                if (ret) {
-                       mmap_read_unlock(mm);
+                       release_fault_lock(vmf);
                        return VM_FAULT_RETRY;
                }
        } else {
diff --git a/mm/memory.c b/mm/memory.c
index 080e1d59d752e853aeb16894638a8a13035cec52..5748a41c164c5b553f839e3b176c3e26231e3e39 100644
@@ -3746,12 +3746,6 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
        if (!pte_unmap_same(vmf))
                goto out;
 
-       if (vmf->flags & FAULT_FLAG_VMA_LOCK) {
-               ret = VM_FAULT_RETRY;
-               vma_end_read(vma);
-               goto out;
-       }
-
        entry = pte_to_swp_entry(vmf->orig_pte);
        if (unlikely(non_swap_entry(entry))) {
                if (is_migration_entry(entry)) {
@@ -3761,6 +3755,16 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
                        vmf->page = pfn_swap_entry_to_page(entry);
                        ret = remove_device_exclusive_entry(vmf);
                } else if (is_device_private_entry(entry)) {
+                       if (vmf->flags & FAULT_FLAG_VMA_LOCK) {
+                               /*
+                                * migrate_to_ram is not yet ready to operate
+                                * under VMA lock.
+                                */
+                               vma_end_read(vma);
+                               ret = VM_FAULT_RETRY;
+                               goto out;
+                       }
+
                        vmf->page = pfn_swap_entry_to_page(entry);
                        vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd,
                                        vmf->address, &vmf->ptl);