From feea9987f5395c9fd2c4a88d5ac2fdaf7e4e80ff Mon Sep 17 00:00:00 2001 From: Balbir Singh Date: Wed, 1 Oct 2025 16:56:54 +1000 Subject: [PATCH] mm/huge_memory: add device-private THP support to PMD operations MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit Extend core huge page management functions to handle device-private THP entries. This enables proper handling of large device-private folios in fundamental MM operations. The following functions have been updated: - copy_huge_pmd(): Handle device-private entries during fork/clone - zap_huge_pmd(): Properly free device-private THP during munmap - change_huge_pmd(): Support protection changes on device-private THP - ___pte_offset_map(): Refuse to map any non-present PMD, which now covers device-private entries as well as migration entries Link: https://lkml.kernel.org/r/20251001065707.920170-4-balbirs@nvidia.com Signed-off-by: Matthew Brost Signed-off-by: Balbir Singh Acked-by: Zi Yan Cc: David Hildenbrand Cc: Joshua Hahn Cc: Rakie Kim Cc: Byungchul Park Cc: Gregory Price Cc: Ying Huang Cc: Alistair Popple Cc: Oscar Salvador Cc: Lorenzo Stoakes Cc: Baolin Wang Cc: "Liam R. 
Howlett" Cc: Nico Pache Cc: Ryan Roberts Cc: Dev Jain Cc: Barry Song Cc: Lyude Paul Cc: Danilo Krummrich Cc: David Airlie Cc: Simona Vetter Cc: Ralph Campbell Cc: Mika Penttilä Cc: Francois Dugast Signed-off-by: Andrew Morton --- include/linux/swapops.h | 32 +++++++++++++++++++++++ mm/huge_memory.c | 56 ++++++++++++++++++++++++++++++++++------- mm/pgtable-generic.c | 2 +- 3 files changed, 80 insertions(+), 10 deletions(-) diff --git a/include/linux/swapops.h b/include/linux/swapops.h index 64ea151a7ae3..2687928a8146 100644 --- a/include/linux/swapops.h +++ b/include/linux/swapops.h @@ -594,10 +594,42 @@ static inline int is_pmd_migration_entry(pmd_t pmd) } #endif /* CONFIG_ARCH_ENABLE_THP_MIGRATION */ +#if defined(CONFIG_ZONE_DEVICE) && defined(CONFIG_ARCH_ENABLE_THP_MIGRATION) + +/** + * is_pmd_device_private_entry() - Check if PMD contains a device private swap entry + * @pmd: The PMD to check + * + * A device private PMD is non-present: it is encoded in swap format and its + * swap entry identifies a device private folio rather than swapped-out data. + * Without both CONFIG_ZONE_DEVICE and CONFIG_ARCH_ENABLE_THP_MIGRATION this + * helper is a stub that always returns 0. 
+ * + * Return: 1 if PMD contains device private entry, 0 otherwise + */ +static inline int is_pmd_device_private_entry(pmd_t pmd) +{ + return is_swap_pmd(pmd) && is_device_private_entry(pmd_to_swp_entry(pmd)); +} + +#else /* CONFIG_ZONE_DEVICE && CONFIG_ARCH_ENABLE_THP_MIGRATION */ + +static inline int is_pmd_device_private_entry(pmd_t pmd) +{ + return 0; +} + +#endif /* CONFIG_ZONE_DEVICE && CONFIG_ARCH_ENABLE_THP_MIGRATION */ + static inline int non_swap_entry(swp_entry_t entry) { return swp_type(entry) >= MAX_SWAPFILES; } +static inline int is_pmd_non_present_folio_entry(pmd_t pmd) +{ + return is_pmd_migration_entry(pmd) || is_pmd_device_private_entry(pmd); +} + #endif /* CONFIG_MMU */ #endif /* _LINUX_SWAPOPS_H */ diff --git a/mm/huge_memory.c b/mm/huge_memory.c index d56fee1fb5a7..63538a3e4976 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1703,17 +1703,45 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm, if (unlikely(is_swap_pmd(pmd))) { swp_entry_t entry = pmd_to_swp_entry(pmd); - VM_BUG_ON(!is_pmd_migration_entry(pmd)); - if (!is_readable_migration_entry(entry)) { - entry = make_readable_migration_entry( - swp_offset(entry)); + VM_WARN_ON(!is_pmd_non_present_folio_entry(pmd)); + + if (is_writable_migration_entry(entry) || + is_readable_exclusive_migration_entry(entry)) { + entry = make_readable_migration_entry(swp_offset(entry)); pmd = swp_entry_to_pmd(entry); if (pmd_swp_soft_dirty(*src_pmd)) pmd = pmd_swp_mksoft_dirty(pmd); if (pmd_swp_uffd_wp(*src_pmd)) pmd = pmd_swp_mkuffd_wp(pmd); set_pmd_at(src_mm, addr, src_pmd, pmd); + } else if (is_device_private_entry(entry)) { + /* + * For device private entries, since there are no + * read exclusive entries, writable = !readable + */ + if (is_writable_device_private_entry(entry)) { + entry = make_readable_device_private_entry(swp_offset(entry)); + pmd = swp_entry_to_pmd(entry); + + if (pmd_swp_soft_dirty(*src_pmd)) + pmd = pmd_swp_mksoft_dirty(pmd); + if 
(pmd_swp_uffd_wp(*src_pmd)) + pmd = pmd_swp_mkuffd_wp(pmd); + set_pmd_at(src_mm, addr, src_pmd, pmd); + } + + src_folio = pfn_swap_entry_folio(entry); + VM_WARN_ON(!folio_test_large(src_folio)); + + folio_get(src_folio); + /* + * folio_try_dup_anon_rmap_pmd does not fail for + * device private entries. + */ + folio_try_dup_anon_rmap_pmd(src_folio, &src_folio->page, + dst_vma, src_vma); } + add_mm_counter(dst_mm, MM_ANONPAGES, HPAGE_PMD_NR); mm_inc_nr_ptes(dst_mm); pgtable_trans_huge_deposit(dst_mm, dst_pmd, pgtable); @@ -2211,15 +2239,16 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, folio_remove_rmap_pmd(folio, page, vma); WARN_ON_ONCE(folio_mapcount(folio) < 0); VM_BUG_ON_PAGE(!PageHead(page), page); - } else if (thp_migration_supported()) { + } else if (is_pmd_non_present_folio_entry(orig_pmd)) { swp_entry_t entry; - VM_BUG_ON(!is_pmd_migration_entry(orig_pmd)); entry = pmd_to_swp_entry(orig_pmd); folio = pfn_swap_entry_folio(entry); flush_needed = 0; - } else - WARN_ONCE(1, "Non present huge pmd without pmd migration enabled!"); + + if (!thp_migration_supported()) + WARN_ONCE(1, "Non present huge pmd without pmd migration enabled!"); + } if (folio_test_anon(folio)) { zap_deposited_table(tlb->mm, pmd); @@ -2239,6 +2268,12 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, folio_mark_accessed(folio); } + if (folio_is_device_private(folio)) { + folio_remove_rmap_pmd(folio, &folio->page, vma); + WARN_ON_ONCE(folio_mapcount(folio) < 0); + folio_put(folio); + } + spin_unlock(ptl); if (flush_needed) tlb_remove_page_size(tlb, &folio->page, HPAGE_PMD_SIZE); @@ -2367,7 +2402,7 @@ int change_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, struct folio *folio = pfn_swap_entry_folio(entry); pmd_t newpmd; - VM_BUG_ON(!is_pmd_migration_entry(*pmd)); + VM_WARN_ON(!is_pmd_non_present_folio_entry(*pmd)); if (is_writable_migration_entry(entry)) { /* * A protection check is difficult so @@ -2380,6 +2415,9 @@ int 
change_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, newpmd = swp_entry_to_pmd(entry); if (pmd_swp_soft_dirty(*pmd)) newpmd = pmd_swp_mksoft_dirty(newpmd); + } else if (is_writable_device_private_entry(entry)) { + entry = make_readable_device_private_entry(swp_offset(entry)); + newpmd = swp_entry_to_pmd(entry); } else { newpmd = *pmd; } diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c index 567e2d084071..0c847cdf4fd3 100644 --- a/mm/pgtable-generic.c +++ b/mm/pgtable-generic.c @@ -290,7 +290,7 @@ pte_t *___pte_offset_map(pmd_t *pmd, unsigned long addr, pmd_t *pmdvalp) if (pmdvalp) *pmdvalp = pmdval; - if (unlikely(pmd_none(pmdval) || is_pmd_migration_entry(pmdval))) + if (unlikely(pmd_none(pmdval) || !pmd_present(pmdval))) goto nomap; if (unlikely(pmd_trans_huge(pmdval))) goto nomap; -- 2.51.0