mm/hugetlbfs: update hugetlbfs to use mmap_prepare
Author:     Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
AuthorDate: Wed, 17 Sep 2025 19:11:13 +0000 (20:11 +0100)
Commit:     Andrew Morton <akpm@linux-foundation.org>
CommitDate: Wed, 15 Oct 2025 04:28:25 +0000 (21:28 -0700)
Since we can now perform actions after the VMA is established via
mmap_prepare, use desc->action.success_hook to allocate the hugetlb VMA
lock once the VMA is in place.

We also make supporting changes throughout hugetlbfs and the hugetlb core
to make this possible.

Link: https://lkml.kernel.org/r/e5532a0aff1991a1b5435dcb358b7d35abc80f3b.1758135681.git.lorenzo.stoakes@oracle.com
Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Reviewed-by: Jason Gunthorpe <jgg@nvidia.com>
Cc: Alexander Gordeev <agordeev@linux.ibm.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Andreas Larsson <andreas@gaisler.com>
Cc: Andrey Konovalov <andreyknvl@gmail.com>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Baolin Wang <baolin.wang@linux.alibaba.com>
Cc: Baoquan He <bhe@redhat.com>
Cc: Chatre, Reinette <reinette.chatre@intel.com>
Cc: Christian Borntraeger <borntraeger@linux.ibm.com>
Cc: Christian Brauner <brauner@kernel.org>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: Dave Jiang <dave.jiang@intel.com>
Cc: Dave Martin <dave.martin@arm.com>
Cc: Dave Young <dyoung@redhat.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: David S. Miller <davem@davemloft.net>
Cc: Dmitriy Vyukov <dvyukov@google.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Guo Ren <guoren@kernel.org>
Cc: Heiko Carstens <hca@linux.ibm.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: James Morse <james.morse@arm.com>
Cc: Jan Kara <jack@suse.cz>
Cc: Jann Horn <jannh@google.com>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Kevin Tian <kevin.tian@intel.com>
Cc: Konstantin Komarov <almaz.alexandrovich@paragon-software.com>
Cc: Liam Howlett <liam.howlett@oracle.com>
Cc: "Luck, Tony" <tony.luck@intel.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Mike Rapoport <rppt@kernel.org>
Cc: Muchun Song <muchun.song@linux.dev>
Cc: Nicolas Pitre <nico@fluxnic.net>
Cc: Oscar Salvador <osalvador@suse.de>
Cc: Pedro Falcato <pfalcato@suse.de>
Cc: Robin Murphy <robin.murphy@arm.com>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Sven Schnelle <svens@linux.ibm.com>
Cc: Thomas Bogendoerfer <tsbogend@alpha.franken.de>
Cc: "Uladzislau Rezki (Sony)" <urezki@gmail.com>
Cc: Vasily Gorbik <gor@linux.ibm.com>
Cc: Vishal Verma <vishal.l.verma@intel.com>
Cc: Vivek Goyal <vgoyal@redhat.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Will Deacon <will@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
fs/hugetlbfs/inode.c
include/linux/hugetlb.h
include/linux/hugetlb_inline.h
mm/hugetlb.c
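
A minimal sketch of the mmap_prepare + success-hook pattern this patch adopts
(not part of the commit): the myfs_* names and the size limit are hypothetical,
while the vm_area_desc fields and the hook signature follow the hunks below.

#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/sizes.h>

#define MYFS_MAX_MAP_SIZE	SZ_1G	/* hypothetical per-mapping limit */

static const struct vm_operations_struct myfs_vm_ops = {
	/* .fault and friends would go here in a real filesystem. */
};

/* Runs only after the VMA has actually been inserted into the mm. */
static int myfs_mmap_success(const struct vm_area_struct *vma)
{
	/*
	 * Safe point for per-VMA setup that needs the final VMA, e.g.
	 * allocating a lock stashed in vma->vm_private_data, as hugetlbfs
	 * does via hugetlb_vma_lock_alloc().
	 */
	return 0;
}

static int myfs_mmap_prepare(struct vm_area_desc *desc)
{
	/* Validate and describe the mapping before any VMA exists. */
	if (vma_desc_size(desc) > MYFS_MAX_MAP_SIZE)
		return -EINVAL;

	desc->vm_flags |= VM_DONTEXPAND;
	desc->vm_ops = &myfs_vm_ops;

	/* Defer VMA-dependent setup until the mapping is established. */
	desc->action.success_hook = myfs_mmap_success;
	return 0;
}

static const struct file_operations myfs_file_operations = {
	.mmap_prepare	= myfs_mmap_prepare,
};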

diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index f42548ee9083c6bf4b20f9a75e069e5f69fdfc3a..9e0625167517450d87172b2217f6981037f7a91d 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -96,8 +96,15 @@ static const struct fs_parameter_spec hugetlb_fs_parameters[] = {
 #define PGOFF_LOFFT_MAX \
        (((1UL << (PAGE_SHIFT + 1)) - 1) <<  (BITS_PER_LONG - (PAGE_SHIFT + 1)))
 
-static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma)
+static int hugetlb_file_mmap_prepare_success(const struct vm_area_struct *vma)
 {
+       /* Unfortunately we have to reassign vma->vm_private_data. */
+       return hugetlb_vma_lock_alloc((struct vm_area_struct *)vma);
+}
+
+static int hugetlbfs_file_mmap_prepare(struct vm_area_desc *desc)
+{
+       struct file *file = desc->file;
        struct inode *inode = file_inode(file);
        loff_t len, vma_len;
        int ret;
@@ -112,8 +119,8 @@ static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma)
         * way when do_mmap unwinds (may be important on powerpc
         * and ia64).
         */
-       vm_flags_set(vma, VM_HUGETLB | VM_DONTEXPAND);
-       vma->vm_ops = &hugetlb_vm_ops;
+       desc->vm_flags |= VM_HUGETLB | VM_DONTEXPAND;
+       desc->vm_ops = &hugetlb_vm_ops;
 
        /*
         * page based offset in vm_pgoff could be sufficiently large to
@@ -122,16 +129,16 @@ static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma)
         * sizeof(unsigned long).  So, only check in those instances.
         */
        if (sizeof(unsigned long) == sizeof(loff_t)) {
-               if (vma->vm_pgoff & PGOFF_LOFFT_MAX)
+               if (desc->pgoff & PGOFF_LOFFT_MAX)
                        return -EINVAL;
        }
 
        /* must be huge page aligned */
-       if (vma->vm_pgoff & (~huge_page_mask(h) >> PAGE_SHIFT))
+       if (desc->pgoff & (~huge_page_mask(h) >> PAGE_SHIFT))
                return -EINVAL;
 
-       vma_len = (loff_t)(vma->vm_end - vma->vm_start);
-       len = vma_len + ((loff_t)vma->vm_pgoff << PAGE_SHIFT);
+       vma_len = (loff_t)vma_desc_size(desc);
+       len = vma_len + ((loff_t)desc->pgoff << PAGE_SHIFT);
        /* check for overflow */
        if (len < vma_len)
                return -EINVAL;
@@ -141,7 +148,7 @@ static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma)
 
        ret = -ENOMEM;
 
-       vm_flags = vma->vm_flags;
+       vm_flags = desc->vm_flags;
        /*
         * for SHM_HUGETLB, the pages are reserved in the shmget() call so skip
         * reserving here. Note: only for SHM hugetlbfs file, the inode
@@ -151,17 +158,20 @@ static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma)
                vm_flags |= VM_NORESERVE;
 
        if (hugetlb_reserve_pages(inode,
-                               vma->vm_pgoff >> huge_page_order(h),
-                               len >> huge_page_shift(h), vma,
-                               vm_flags) < 0)
+                       desc->pgoff >> huge_page_order(h),
+                       len >> huge_page_shift(h), desc,
+                       vm_flags) < 0)
                goto out;
 
        ret = 0;
-       if (vma->vm_flags & VM_WRITE && inode->i_size < len)
+       if ((desc->vm_flags & VM_WRITE) && inode->i_size < len)
                i_size_write(inode, len);
 out:
        inode_unlock(inode);
 
+       /* Allocate the VMA lock after we set it up. */
+       if (!ret)
+               desc->action.success_hook = hugetlb_file_mmap_prepare_success;
        return ret;
 }
 
@@ -1221,7 +1231,7 @@ static void init_once(void *foo)
 
 static const struct file_operations hugetlbfs_file_operations = {
        .read_iter              = hugetlbfs_read_iter,
-       .mmap                   = hugetlbfs_file_mmap,
+       .mmap_prepare           = hugetlbfs_file_mmap_prepare,
        .fsync                  = noop_fsync,
        .get_unmapped_area      = hugetlb_get_unmapped_area,
        .llseek                 = default_llseek,
diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index 8e63e46b8e1f0ea5d88ed728f6de72b93b8901de..2387513d6ae539a0305bdc37fa639c0ee0c5c5e1 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -150,8 +150,7 @@ int hugetlb_mfill_atomic_pte(pte_t *dst_pte,
                             struct folio **foliop);
 #endif /* CONFIG_USERFAULTFD */
 long hugetlb_reserve_pages(struct inode *inode, long from, long to,
-                                               struct vm_area_struct *vma,
-                                               vm_flags_t vm_flags);
+                          struct vm_area_desc *desc, vm_flags_t vm_flags);
 long hugetlb_unreserve_pages(struct inode *inode, long start, long end,
                                                long freed);
 bool folio_isolate_hugetlb(struct folio *folio, struct list_head *list);
@@ -280,6 +279,7 @@ bool is_hugetlb_entry_hwpoisoned(pte_t pte);
 void hugetlb_unshare_all_pmds(struct vm_area_struct *vma);
 void fixup_hugetlb_reservations(struct vm_area_struct *vma);
 void hugetlb_split(struct vm_area_struct *vma, unsigned long addr);
+int hugetlb_vma_lock_alloc(struct vm_area_struct *vma);
 
 #else /* !CONFIG_HUGETLB_PAGE */
 
@@ -466,6 +466,11 @@ static inline void fixup_hugetlb_reservations(struct vm_area_struct *vma)
 
 static inline void hugetlb_split(struct vm_area_struct *vma, unsigned long addr) {}
 
+static inline int hugetlb_vma_lock_alloc(struct vm_area_struct *vma)
+{
+       return 0;
+}
+
 #endif /* !CONFIG_HUGETLB_PAGE */
 
 #ifndef pgd_write
diff --git a/include/linux/hugetlb_inline.h b/include/linux/hugetlb_inline.h
index 0660a03d37d986c8167dbdb5086acaa96dc17a99..a27aa01629186dc95e90e1f6b3c0dc532ddbb998 100644
--- a/include/linux/hugetlb_inline.h
+++ b/include/linux/hugetlb_inline.h
@@ -2,22 +2,27 @@
 #ifndef _LINUX_HUGETLB_INLINE_H
 #define _LINUX_HUGETLB_INLINE_H
 
-#ifdef CONFIG_HUGETLB_PAGE
-
 #include <linux/mm.h>
 
-static inline bool is_vm_hugetlb_page(struct vm_area_struct *vma)
+#ifdef CONFIG_HUGETLB_PAGE
+
+static inline bool is_vm_hugetlb_flags(vm_flags_t vm_flags)
 {
-       return !!(vma->vm_flags & VM_HUGETLB);
+       return !!(vm_flags & VM_HUGETLB);
 }
 
 #else
 
-static inline bool is_vm_hugetlb_page(struct vm_area_struct *vma)
+static inline bool is_vm_hugetlb_flags(vm_flags_t vm_flags)
 {
        return false;
 }
 
 #endif
 
+static inline bool is_vm_hugetlb_page(struct vm_area_struct *vma)
+{
+       return is_vm_hugetlb_flags(vma->vm_flags);
+}
+
 #endif
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 0455119716ec0c57c779bc91c2b885d095ab4aad..b6e8b2b6e65a086c00db79ffc50e8da354bef0b7 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -119,7 +119,6 @@ struct mutex *hugetlb_fault_mutex_table __ro_after_init;
 /* Forward declaration */
 static int hugetlb_acct_memory(struct hstate *h, long delta);
 static void hugetlb_vma_lock_free(struct vm_area_struct *vma);
-static void hugetlb_vma_lock_alloc(struct vm_area_struct *vma);
 static void __hugetlb_vma_unlock_write_free(struct vm_area_struct *vma);
 static void hugetlb_unshare_pmds(struct vm_area_struct *vma,
                unsigned long start, unsigned long end, bool take_locks);
@@ -427,17 +426,21 @@ static void hugetlb_vma_lock_free(struct vm_area_struct *vma)
        }
 }
 
-static void hugetlb_vma_lock_alloc(struct vm_area_struct *vma)
+/*
+ * vma specific semaphore used for pmd sharing and fault/truncation
+ * synchronization
+ */
+int hugetlb_vma_lock_alloc(struct vm_area_struct *vma)
 {
        struct hugetlb_vma_lock *vma_lock;
 
        /* Only establish in (flags) sharable vmas */
        if (!vma || !(vma->vm_flags & VM_MAYSHARE))
-               return;
+               return 0;
 
        /* Should never get here with non-NULL vm_private_data */
        if (vma->vm_private_data)
-               return;
+               return -EINVAL;
 
        vma_lock = kmalloc(sizeof(*vma_lock), GFP_KERNEL);
        if (!vma_lock) {
@@ -452,13 +455,15 @@ static void hugetlb_vma_lock_alloc(struct vm_area_struct *vma)
                 * allocation failure.
                 */
                pr_warn_once("HugeTLB: unable to allocate vma specific lock\n");
-               return;
+               return -EINVAL;
        }
 
        kref_init(&vma_lock->refs);
        init_rwsem(&vma_lock->rw_sema);
        vma_lock->vma = vma;
        vma->vm_private_data = vma_lock;
+
+       return 0;
 }
 
 /* Helper that removes a struct file_region from the resv_map cache and returns
@@ -1190,20 +1195,28 @@ static struct resv_map *vma_resv_map(struct vm_area_struct *vma)
        }
 }
 
-static void set_vma_resv_map(struct vm_area_struct *vma, struct resv_map *map)
+static void set_vma_resv_flags(struct vm_area_struct *vma, unsigned long flags)
 {
-       VM_BUG_ON_VMA(!is_vm_hugetlb_page(vma), vma);
-       VM_BUG_ON_VMA(vma->vm_flags & VM_MAYSHARE, vma);
+       VM_WARN_ON_ONCE_VMA(!is_vm_hugetlb_page(vma), vma);
+       VM_WARN_ON_ONCE_VMA(vma->vm_flags & VM_MAYSHARE, vma);
 
-       set_vma_private_data(vma, (unsigned long)map);
+       set_vma_private_data(vma, get_vma_private_data(vma) | flags);
 }
 
-static void set_vma_resv_flags(struct vm_area_struct *vma, unsigned long flags)
+static void set_vma_desc_resv_map(struct vm_area_desc *desc, struct resv_map *map)
 {
-       VM_BUG_ON_VMA(!is_vm_hugetlb_page(vma), vma);
-       VM_BUG_ON_VMA(vma->vm_flags & VM_MAYSHARE, vma);
+       VM_WARN_ON_ONCE(!is_vm_hugetlb_flags(desc->vm_flags));
+       VM_WARN_ON_ONCE(desc->vm_flags & VM_MAYSHARE);
 
-       set_vma_private_data(vma, get_vma_private_data(vma) | flags);
+       desc->private_data = map;
+}
+
+static void set_vma_desc_resv_flags(struct vm_area_desc *desc, unsigned long flags)
+{
+       VM_WARN_ON_ONCE(!is_vm_hugetlb_flags(desc->vm_flags));
+       VM_WARN_ON_ONCE(desc->vm_flags & VM_MAYSHARE);
+
+       desc->private_data = (void *)((unsigned long)desc->private_data | flags);
 }
 
 static int is_vma_resv_set(struct vm_area_struct *vma, unsigned long flag)
@@ -1213,6 +1226,13 @@ static int is_vma_resv_set(struct vm_area_struct *vma, unsigned long flag)
        return (get_vma_private_data(vma) & flag) != 0;
 }
 
+static bool is_vma_desc_resv_set(struct vm_area_desc *desc, unsigned long flag)
+{
+       VM_WARN_ON_ONCE(!is_vm_hugetlb_flags(desc->vm_flags));
+
+       return ((unsigned long)desc->private_data) & flag;
+}
+
 bool __vma_private_lock(struct vm_area_struct *vma)
 {
        return !(vma->vm_flags & VM_MAYSHARE) &&
@@ -7259,9 +7279,9 @@ long hugetlb_change_protection(struct vm_area_struct *vma,
  */
 
 long hugetlb_reserve_pages(struct inode *inode,
-                                       long from, long to,
-                                       struct vm_area_struct *vma,
-                                       vm_flags_t vm_flags)
+               long from, long to,
+               struct vm_area_desc *desc,
+               vm_flags_t vm_flags)
 {
        long chg = -1, add = -1, spool_resv, gbl_resv;
        struct hstate *h = hstate_inode(inode);
@@ -7276,12 +7296,6 @@ long hugetlb_reserve_pages(struct inode *inode,
                return -EINVAL;
        }
 
-       /*
-        * vma specific semaphore used for pmd sharing and fault/truncation
-        * synchronization
-        */
-       hugetlb_vma_lock_alloc(vma);
-
        /*
         * Only apply hugepage reservation if asked. At fault time, an
         * attempt will be made for VM_NORESERVE to allocate a page
@@ -7294,9 +7308,9 @@ long hugetlb_reserve_pages(struct inode *inode,
         * Shared mappings base their reservation on the number of pages that
         * are already allocated on behalf of the file. Private mappings need
         * to reserve the full area even if read-only as mprotect() may be
-        * called to make the mapping read-write. Assume !vma is a shm mapping
+        * called to make the mapping read-write. Assume !desc is a shm mapping
         */
-       if (!vma || vma->vm_flags & VM_MAYSHARE) {
+       if (!desc || desc->vm_flags & VM_MAYSHARE) {
                /*
                 * resv_map can not be NULL as hugetlb_reserve_pages is only
                 * called for inodes for which resv_maps were created (see
@@ -7313,8 +7327,8 @@ long hugetlb_reserve_pages(struct inode *inode,
 
                chg = to - from;
 
-               set_vma_resv_map(vma, resv_map);
-               set_vma_resv_flags(vma, HPAGE_RESV_OWNER);
+               set_vma_desc_resv_map(desc, resv_map);
+               set_vma_desc_resv_flags(desc, HPAGE_RESV_OWNER);
        }
 
        if (chg < 0)
@@ -7324,7 +7338,7 @@ long hugetlb_reserve_pages(struct inode *inode,
                                chg * pages_per_huge_page(h), &h_cg) < 0)
                goto out_err;
 
-       if (vma && !(vma->vm_flags & VM_MAYSHARE) && h_cg) {
+       if (desc && !(desc->vm_flags & VM_MAYSHARE) && h_cg) {
                /* For private mappings, the hugetlb_cgroup uncharge info hangs
                 * of the resv_map.
                 */
@@ -7358,7 +7372,7 @@ long hugetlb_reserve_pages(struct inode *inode,
         * consumed reservations are stored in the map. Hence, nothing
         * else has to be done for private mappings here
         */
-       if (!vma || vma->vm_flags & VM_MAYSHARE) {
+       if (!desc || desc->vm_flags & VM_MAYSHARE) {
                add = region_add(resv_map, from, to, regions_needed, h, h_cg);
 
                if (unlikely(add < 0)) {
@@ -7412,16 +7426,15 @@ out_uncharge_cgroup:
        hugetlb_cgroup_uncharge_cgroup_rsvd(hstate_index(h),
                                            chg * pages_per_huge_page(h), h_cg);
 out_err:
-       hugetlb_vma_lock_free(vma);
-       if (!vma || vma->vm_flags & VM_MAYSHARE)
+       if (!desc || desc->vm_flags & VM_MAYSHARE)
                /* Only call region_abort if the region_chg succeeded but the
                 * region_add failed or didn't run.
                 */
                if (chg >= 0 && add < 0)
                        region_abort(resv_map, from, to, regions_needed);
-       if (vma && is_vma_resv_set(vma, HPAGE_RESV_OWNER)) {
+       if (desc && is_vma_desc_resv_set(desc, HPAGE_RESV_OWNER)) {
                kref_put(&resv_map->refs, resv_map_release);
-               set_vma_resv_map(vma, NULL);
+               set_vma_desc_resv_map(desc, NULL);
        }
        return chg < 0 ? chg : add < 0 ? add : -EINVAL;
 }