From 126bbf1a921f16cc21f0bb128e96b89aa29a377d Mon Sep 17 00:00:00 2001
From: "Liam R. Howlett"
Date: Thu, 23 Oct 2025 23:14:35 -0400
Subject: [PATCH] mm/hugetlb: Extract continue operation from
 hugetlb_mfill_atomic_pte()

Extract the continue operation to be used as a pointer in uffd ops.

Signed-off-by: Liam R. Howlett
---
 mm/hugetlb.c | 124 +++++++++++++++++++++++++++++++++++++++++++--------
 1 file changed, 106 insertions(+), 18 deletions(-)

diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 4339a31bac3b..78e9affb82c6 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -7046,6 +7046,101 @@ out:
 	return 0;
 }
 
+static int hugetlb_mfill_pte_continue(struct vm_area_struct *dst_vma,
+		unsigned long dst_addr, uffd_flags_t flags,
+		unsigned long increment)
+{
+	struct mm_struct *dst_mm = dst_vma->vm_mm;
+	bool wp_enabled = (flags & MFILL_ATOMIC_WP);
+	struct hstate *h = hstate_vma(dst_vma);
+	struct address_space *mapping = dst_vma->vm_file->f_mapping;
+	pgoff_t idx = vma_hugecache_offset(h, dst_vma, dst_addr);
+	unsigned long size = huge_page_size(h);
+	int vm_shared = dst_vma->vm_flags & VM_SHARED;
+	pte_t _dst_pte;
+	pte_t *dst_pte;
+	spinlock_t *ptl;
+	int ret;
+	struct folio *folio;
+	u32 hash;
+
+	ret = hugetlb_mfill_prepare(dst_vma, dst_addr, increment,
+			&dst_pte, &hash, flags);
+	if (ret)
+		return ret;
+
+	ret = -EFAULT;
+	folio = filemap_lock_hugetlb_folio(h, mapping, idx);
+	if (IS_ERR(folio))
+		goto out;
+
+	/*
+	 * If we just allocated a new page, we need a memory barrier to ensure
+	 * that preceding stores to the page become visible before the
+	 * set_pte_at() write. The memory barrier inside __folio_mark_uptodate
+	 * is what we need.
+	 *
+	 * In the case where we have not allocated a new page (is_continue),
+	 * the page must already be uptodate. UFFDIO_CONTINUE already includes
+	 * an earlier smp_wmb() to ensure that prior stores will be visible
+	 * before the set_pte_at() write.
+	 */
+	WARN_ON_ONCE(!folio_test_uptodate(folio));
+	ptl = huge_pte_lock(h, dst_mm, dst_pte);
+	ret = -EIO;
+	if (folio_test_hwpoison(folio))
+		goto out_release_unlock;
+
+	/*
+	 * We allow to overwrite a pte marker: consider when both MISSING|WP
+	 * registered, we firstly wr-protect a none pte which has no page cache
+	 * page backing it, then access the page.
+	 */
+	ret = -EEXIST;
+	if (!huge_pte_none_mostly(huge_ptep_get(dst_mm, dst_addr, dst_pte)))
+		goto out_release_unlock;
+
+	hugetlb_add_file_rmap(folio);
+
+	/*
+	 * For either: (1) CONTINUE on a non-shared VMA, or (2) UFFDIO_COPY
+	 * with wp flag set, don't set pte write bit.
+	 */
+	_dst_pte = make_huge_pte(dst_vma, folio, !wp_enabled && vm_shared);
+	/*
+	 * Always mark UFFDIO_COPY page dirty; note that this may not be
+	 * extremely important for hugetlbfs for now since swapping is not
+	 * supported, but we should still be clear in that this page cannot be
+	 * thrown away at will, even if write bit not set.
+	 */
+	_dst_pte = huge_pte_mkdirty(_dst_pte);
+	_dst_pte = pte_mkyoung(_dst_pte);
+
+	if (wp_enabled)
+		_dst_pte = huge_pte_mkuffd_wp(_dst_pte);
+
+	set_huge_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte, size);
+
+	hugetlb_count_add(pages_per_huge_page(h), dst_mm);
+
+	/* No need to invalidate - it was non-present before */
+	update_mmu_cache(dst_vma, dst_addr, dst_pte);
+
+	spin_unlock(ptl);
+	if (vm_shared)
+		folio_unlock(folio);
+	ret = 0;
+out:
+	hugetlb_vma_unlock_read(dst_vma);
+	mutex_unlock(&hugetlb_fault_mutex_table[hash]);
+	return ret;
+out_release_unlock:
+	spin_unlock(ptl);
+	if (vm_shared)
+		folio_unlock(folio);
+	folio_put(folio);
+	goto out;
+}
 /*
  * Used by userfaultfd UFFDIO_* ioctls. Based on userfaultfd's mfill_atomic_pte
  * with modifications for hugetlb pages.
@@ -7073,18 +7168,16 @@ int hugetlb_mfill_atomic_pte(struct vm_area_struct *dst_vma,
 	bool folio_in_pagecache = false;
 	u32 hash;
 
+	if (is_continue)
+		return hugetlb_mfill_pte_continue(dst_vma, dst_addr, flags,
+				increment);
+
 	ret = hugetlb_mfill_prepare(dst_vma, dst_addr, increment,
 			&dst_pte, &hash, flags);
 	if (ret)
 		return ret;
 
-	if (is_continue) {
-		ret = -EFAULT;
-		folio = filemap_lock_hugetlb_folio(h, mapping, idx);
-		if (IS_ERR(folio))
-			goto out;
-		folio_in_pagecache = true;
-	} else if (!*foliop) {
+	if (!*foliop) {
 		/* If a folio already exists, then it's UFFDIO_COPY for
 		 * a non-missing case. Return -EEXIST.
 		 */
@@ -7168,13 +7261,10 @@ int hugetlb_mfill_atomic_pte(struct vm_area_struct *dst_vma,
 	 * an earlier smp_wmb() to ensure that prior stores will be visible
 	 * before the set_pte_at() write.
 	 */
-	if (!is_continue)
-		__folio_mark_uptodate(folio);
-	else
-		WARN_ON_ONCE(!folio_test_uptodate(folio));
+	__folio_mark_uptodate(folio);
 
 	/* Add shared, newly allocated pages to the page cache. */
-	if (vm_shared && !is_continue) {
+	if (vm_shared) {
 		ret = -EFAULT;
 		if (idx >= (i_size_read(mapping->host) >> huge_page_shift(h)))
 			goto out_release_nounlock;
@@ -7215,8 +7305,7 @@ int hugetlb_mfill_atomic_pte(struct vm_area_struct *dst_vma,
 	 * For either: (1) CONTINUE on a non-shared VMA, or (2) UFFDIO_COPY
 	 * with wp flag set, don't set pte write bit.
 	 */
-	_dst_pte = make_huge_pte(dst_vma, folio,
-				 !wp_enabled && !(is_continue && !vm_shared));
+	_dst_pte = make_huge_pte(dst_vma, folio, !wp_enabled);
 	/*
 	 * Always mark UFFDIO_COPY page dirty; note that this may not be
 	 * extremely important for hugetlbfs for now since swapping is not
@@ -7237,9 +7326,8 @@ int hugetlb_mfill_atomic_pte(struct vm_area_struct *dst_vma,
 	update_mmu_cache(dst_vma, dst_addr, dst_pte);
 
 	spin_unlock(ptl);
-	if (!is_continue)
-		folio_set_hugetlb_migratable(folio);
-	if (vm_shared || is_continue)
+	folio_set_hugetlb_migratable(folio);
+	if (vm_shared)
 		folio_unlock(folio);
 	ret = 0;
 out:
@@ -7248,7 +7336,7 @@ out:
 	return ret;
 out_release_unlock:
 	spin_unlock(ptl);
-	if (vm_shared || is_continue)
+	if (vm_shared)
 		folio_unlock(folio);
 out_release_nounlock:
 	if (!folio_in_pagecache)
-- 
2.51.0