                break;
        case SCAN_PMD_NONE:
                /*
-                * In MADV_COLLAPSE path, possible race with khugepaged where
-                * all pte entries have been removed and pmd cleared.  If so,
-                * skip all the pte checks and just update the pmd mapping.
+                * All pte entries have been removed and pmd cleared.
+                * Skip all the pte checks and just update the pmd mapping.
                 */
                goto maybe_install_pmd;
        default:

        mmap_write_unlock(mm);
 }
 
-static int retract_page_tables(struct address_space *mapping, pgoff_t pgoff,
-                              struct mm_struct *target_mm,
-                              unsigned long target_addr, struct page *hpage,
-                              struct collapse_control *cc)
+static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff)
 {
        struct vm_area_struct *vma;
-       int target_result = SCAN_FAIL;
 
-       i_mmap_lock_write(mapping);
+       i_mmap_lock_read(mapping);
        vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) {
-               int result = SCAN_FAIL;
-               struct mm_struct *mm = NULL;
-               unsigned long addr = 0;
-               pmd_t *pmd;
-               bool is_target = false;
+               struct mmu_notifier_range range;
+               struct mm_struct *mm;
+               unsigned long addr;
+               pmd_t *pmd, pgt_pmd;
+               spinlock_t *pml;
+               spinlock_t *ptl;
+               bool skipped_uffd = false;
 
                /*
                 * Check vma->anon_vma to exclude MAP_PRIVATE mappings that
-                * got written to. These VMAs are likely not worth investing
-                * mmap_write_lock(mm) as PMD-mapping is likely to be split
-                * later.
-                *
-                * Note that vma->anon_vma check is racy: it can be set up after
-                * the check but before we took mmap_lock by the fault path.
-                * But page lock would prevent establishing any new ptes of the
-                * page, so we are safe.
-                *
-                * An alternative would be drop the check, but check that page
-                * table is clear before calling pmdp_collapse_flush() under
-                * ptl. It has higher chance to recover THP for the VMA, but
-                * has higher cost too. It would also probably require locking
-                * the anon_vma.
+                * got written to. These VMAs are likely not worth removing
+                * page tables from, as PMD-mapping is likely to be split later.
                 */
-               if (READ_ONCE(vma->anon_vma)) {
-                       result = SCAN_PAGE_ANON;
-                       goto next;
-               }
+               if (READ_ONCE(vma->anon_vma))
+                       continue;
+
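+               /*
+                * The hugepage must map at a PMD-aligned address that lies
+                * wholly inside this vma, or it cannot be PMD-mapped here.
+                */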
                addr = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
                if (addr & ~HPAGE_PMD_MASK ||
-                   vma->vm_end < addr + HPAGE_PMD_SIZE) {
-                       result = SCAN_VMA_CHECK;
-                       goto next;
-               }
+                   vma->vm_end < addr + HPAGE_PMD_SIZE)
+                       continue;
+
                mm = vma->vm_mm;
-               is_target = mm == target_mm && addr == target_addr;
-               result = find_pmd_or_thp_or_none(mm, addr, &pmd);
-               if (result != SCAN_SUCCEED)
-                       goto next;
+               if (find_pmd_or_thp_or_none(mm, addr, &pmd) != SCAN_SUCCEED)
+                       continue;
+
+               if (hpage_collapse_test_exit(mm))
+                       continue;
                /*
-                * We need exclusive mmap_lock to retract page table.
-                *
-                * We use trylock due to lock inversion: we need to acquire
-                * mmap_lock while holding page lock. Fault path does it in
-                * reverse order. Trylock is a way to avoid deadlock.
-                *
-                * Also, it's not MADV_COLLAPSE's job to collapse other
-                * mappings - let khugepaged take care of them later.
+                * When a vma is registered with uffd-wp, we cannot recycle
+                * the page table because there may be pte markers installed.
+                * Other vmas can still have the same file mapped hugely, but
+                * skip this one: it will always be mapped in small page size
+                * for uffd-wp registered ranges.
                 */
-               result = SCAN_PTE_MAPPED_HUGEPAGE;
-               if ((cc->is_khugepaged || is_target) &&
-                   mmap_write_trylock(mm)) {
-                       /* trylock for the same lock inversion as above */
-                       if (!vma_try_start_write(vma))
-                               goto unlock_next;
+               if (userfaultfd_wp(vma))
+                       continue;
 
-                       /*
-                        * Re-check whether we have an ->anon_vma, because
-                        * collapse_and_free_pmd() requires that either no
-                        * ->anon_vma exists or the anon_vma is locked.
-                        * We already checked ->anon_vma above, but that check
-                        * is racy because ->anon_vma can be populated under the
-                        * mmap lock in read mode.
-                        */
-                       if (vma->anon_vma) {
-                               result = SCAN_PAGE_ANON;
-                               goto unlock_next;
-                       }
-                       /*
-                        * When a vma is registered with uffd-wp, we can't
-                        * recycle the pmd pgtable because there can be pte
-                        * markers installed.  Skip it only, so the rest mm/vma
-                        * can still have the same file mapped hugely, however
-                        * it'll always mapped in small page size for uffd-wp
-                        * registered ranges.
-                        */
-                       if (hpage_collapse_test_exit(mm)) {
-                               result = SCAN_ANY_PROCESS;
-                               goto unlock_next;
-                       }
-                       if (userfaultfd_wp(vma)) {
-                               result = SCAN_PTE_UFFD_WP;
-                               goto unlock_next;
-                       }
-                       collapse_and_free_pmd(mm, vma, addr, pmd);
-                       if (!cc->is_khugepaged && is_target)
-                               result = set_huge_pmd(vma, addr, pmd, hpage);
-                       else
-                               result = SCAN_SUCCEED;
+               /* PTEs were notified when unmapped; but now for the empty page table */
+               mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm,
+                                       addr, addr + HPAGE_PMD_SIZE);
+               mmu_notifier_invalidate_range_start(&range);
+
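+               /*
+                * With both the pmd lock and (if split ptlocks) the pte lock
+                * held, no racing fault or userfaultfd can modify the page
+                * table while it is detached.
+                */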
+               pml = pmd_lock(mm, pmd);
+               ptl = pte_lockptr(mm, pmd);
+               if (ptl != pml)
+                       spin_lock_nested(ptl, SINGLE_DEPTH_NESTING);
 
-unlock_next:
-                       mmap_write_unlock(mm);
-                       goto next;
-               }
                /*
-                * Calling context will handle target mm/addr. Otherwise, let
-                * khugepaged try again later.
+                * Huge page lock is still held, so normally the page table
+                * must remain empty; and we have already skipped anon_vma
+                * and userfaultfd_wp() vmas.  But since the mmap_lock is not
+                * held, it is still possible for a racing userfaultfd_ioctl()
+                * to have inserted ptes or markers.  Now that we hold ptlock,
+                * repeating the anon_vma check protects from one category,
+                * and repeating the userfaultfd_wp() check from another.
                 */
-               if (!is_target) {
-                       khugepaged_add_pte_mapped_thp(mm, addr);
-                       continue;
+               if (unlikely(vma->anon_vma || userfaultfd_wp(vma))) {
+                       skipped_uffd = true;
+               } else {
+                       pgt_pmd = pmdp_collapse_flush(vma, addr, pmd);
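+                       /* where the arch requires it, sync with lockless pmd readers */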
+                       pmdp_get_lockless_sync();
+               }
+
+               if (ptl != pml)
+                       spin_unlock(ptl);
+               spin_unlock(pml);
+
+               mmu_notifier_invalidate_range_end(&range);
+
+               if (!skipped_uffd) {
+                       mm_dec_nr_ptes(mm);
+                       page_table_check_pte_clear_range(mm, addr, pgt_pmd);
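+                       /* page table is freed via RCU, safe for lockless walkers */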
+                       pte_free_defer(mm, pmd_pgtable(pgt_pmd));
                }
-next:
-               if (is_target)
-                       target_result = result;
        }
-       i_mmap_unlock_write(mapping);
-       return target_result;
+       i_mmap_unlock_read(mapping);
 }
 
 /**
 
        /*
         * Remove pte page tables, so we can re-fault the page as huge.
+        * If MADV_COLLAPSE, adjust result to call collapse_pte_mapped_thp().
         */
-       result = retract_page_tables(mapping, start, mm, addr, hpage,
-                                    cc);
+       retract_page_tables(mapping, start);
+       if (cc && !cc->is_khugepaged)
+               result = SCAN_PTE_MAPPED_HUGEPAGE;
        unlock_page(hpage);
 
        /*