unsigned long hugetlb_total_pages(void);
vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned long address, unsigned int flags);
-#ifdef CONFIG_USERFAULTFD
-int hugetlb_mfill_atomic_pte(struct vm_area_struct *dst_vma,
- unsigned long dst_addr, unsigned long src_addr,
- uffd_flags_t flags, struct folio **foliop, unsigned long increment);
-#endif /* CONFIG_USERFAULTFD */
long hugetlb_reserve_pages(struct inode *inode, long from, long to,
struct vm_area_desc *desc, vm_flags_t vm_flags);
long hugetlb_unreserve_pages(struct inode *inode, long start, long end,
return 0;
}
-#ifdef CONFIG_USERFAULTFD
-static inline int hugetlb_mfill_atomic_pte(pte_t *dst_pte,
- struct vm_area_struct *dst_vma,
- unsigned long dst_addr,
- unsigned long src_addr,
- uffd_flags_t flags,
- struct folio **foliop)
-{
- BUG();
- return 0;
-}
-#endif /* CONFIG_USERFAULTFD */
-
static inline pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr,
unsigned long sz)
{
unsigned long src_addr);
struct vm_uffd_ops {
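+ /*
+  * copy: UFFDIO_COPY. On -ENOENT the caller retries the user copy
+  * outside the mmap_lock, with the folio returned in *foliop.
+  */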
+ int (*copy)(struct vm_area_struct *dst_vma, unsigned long dst_addr,
+ unsigned long src_addr, uffd_flags_t flags,
+ struct folio **foliop, unsigned long increment);
+ int (*zeropage)(struct vm_area_struct *dst_vma, unsigned long dst_addr);
int (*cont)(struct vm_area_struct *dst_vma, unsigned long dst_addr,
uffd_flags_t flags, unsigned long increment);
int (*poison)(struct vm_area_struct *dst_vma,
unsigned long dst_addr, uffd_flags_t flags,
unsigned long increment);
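+/*
+ * Default ->copy and ->zeropage implementations, used for anonymous
+ * memory via default_uffd_ops.
+ */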
+int mfill_atomic_pte_copy(struct vm_area_struct *dst_vma,
+ unsigned long dst_addr, unsigned long src_addr,
+ uffd_flags_t flags, struct folio **foliop,
+ unsigned long increment);
+
+int mfill_atomic_pte_zeropage(struct vm_area_struct *dst_vma,
+ unsigned long dst_addr);
+
static inline bool vma_can_userfault(struct vm_area_struct *vma,
vm_flags_t vm_flags,
bool wp_async)
unsigned long dst_addr, uffd_flags_t flags,
unsigned long increment);
+static int hugetlb_mfill_atomic_pte_copy(struct vm_area_struct *dst_vma,
+ unsigned long dst_addr, unsigned long src_addr,
+ uffd_flags_t flags, struct folio **foliop,
+ unsigned long increment);
+
static const struct vm_uffd_ops hugetlb_uffd_ops = {
+ .copy = hugetlb_mfill_atomic_pte_copy,
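+ /* hugetlb has no zero page; MFILL_ATOMIC_ZEROPAGE is rejected earlier */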
+ .zeropage = NULL,
.cont = hugetlb_mfill_pte_continue,
.poison = hugetlb_mfill_pte_poison,
.is_dst_valid = hugetlb_is_dst_valid,
folio_put(folio);
goto out;
}
+
/*
* Used by userfaultfd UFFDIO_* ioctls. Based on userfaultfd's mfill_atomic_pte
* with modifications for hugetlb pages.
*/
-int hugetlb_mfill_atomic_pte(struct vm_area_struct *dst_vma,
- unsigned long dst_addr,
- unsigned long src_addr,
- uffd_flags_t flags,
- struct folio **foliop,
- unsigned long increment)
+static int hugetlb_mfill_atomic_pte_copy(struct vm_area_struct *dst_vma,
+ unsigned long dst_addr, unsigned long src_addr,
+ uffd_flags_t flags, struct folio **foliop,
+ unsigned long increment)
{
struct mm_struct *dst_mm = dst_vma->vm_mm;
bool wp_enabled = (flags & MFILL_ATOMIC_WP);
return 0;
}
-int shmem_mfill_atomic_pte(struct vm_area_struct *dst_vma,
- unsigned long dst_addr,
- unsigned long src_addr,
- uffd_flags_t flags,
- struct folio **foliop)
+
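+/*
+ * UFFDIO_ZEROPAGE for shmem: allocate a folio, zero it, add it to the
+ * page cache and map it at dst_addr.
+ */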
+static int shmem_mfill_atomic_pte_zeropage(struct vm_area_struct *dst_vma,
+ unsigned long dst_addr)
+{
+ struct inode *inode = file_inode(dst_vma->vm_file);
+ struct shmem_inode_info *info = SHMEM_I(inode);
+ struct address_space *mapping = inode->i_mapping;
+ gfp_t gfp = mapping_gfp_mask(mapping);
+ pgoff_t pgoff = linear_page_index(dst_vma, dst_addr);
+ struct folio *folio;
+ int ret;
+ pgoff_t max_off;
+ pmd_t *dst_pmd;
+
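+ /* Find the destination PMD before any allocation or block accounting */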
+ ret = uffd_get_dst_pmd(dst_vma, dst_addr, &dst_pmd);
+ if (ret)
+ return ret;
+
+ if (shmem_inode_acct_blocks(inode, 1))
+ return -ENOMEM;
+
+ ret = -ENOMEM;
+ folio = shmem_alloc_folio(gfp, 0, info, pgoff);
+ if (!folio)
+ goto out_unacct_blocks;
+
+ clear_user_highpage(&folio->page, dst_addr);
+
+ VM_BUG_ON(folio_test_locked(folio));
+ VM_BUG_ON(folio_test_swapbacked(folio));
+ __folio_set_locked(folio);
+ __folio_set_swapbacked(folio);
+ __folio_mark_uptodate(folio);
+
+ ret = -EFAULT;
+ max_off = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE);
+ if (unlikely(pgoff >= max_off))
+ goto out_release;
+
+ ret = mem_cgroup_charge(folio, dst_vma->vm_mm, gfp);
+ if (ret)
+ goto out_release;
+ ret = shmem_add_to_page_cache(folio, mapping, pgoff, NULL, gfp);
+ if (ret)
+ goto out_release;
+
+ ret = mfill_atomic_install_pte(dst_pmd, dst_vma, dst_addr,
+ &folio->page, true, 0);
+ if (ret)
+ goto out_delete_from_cache;
+
+ shmem_recalc_inode(inode, 1, 0);
+ folio_unlock(folio);
+ return 0;
+out_delete_from_cache:
+ filemap_remove_folio(folio);
+out_release:
+ folio_unlock(folio);
+ folio_put(folio);
+out_unacct_blocks:
+ shmem_inode_unacct_blocks(inode, 1);
+ return ret;
+}
+
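+/*
+ * UFFDIO_COPY for shmem: copy from src_addr into a newly allocated folio
+ * and map it at dst_addr. Returns -ENOENT with *foliop set when the copy
+ * must be retried outside the mmap_lock.
+ */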
+static int shmem_mfill_atomic_pte_copy(struct vm_area_struct *dst_vma,
+ unsigned long dst_addr, unsigned long src_addr,
+ uffd_flags_t flags, struct folio **foliop,
+ unsigned long increment)
{
struct inode *inode = file_inode(dst_vma->vm_file);
struct shmem_inode_info *info = SHMEM_I(inode);
if (!folio)
goto out_unacct_blocks;
- if (uffd_flags_mode_is(flags, MFILL_ATOMIC_COPY)) {
- page_kaddr = kmap_local_folio(folio, 0);
- /*
- * The read mmap_lock is held here. Despite the
- * mmap_lock being read recursive a deadlock is still
- * possible if a writer has taken a lock. For example:
- *
- * process A thread 1 takes read lock on own mmap_lock
- * process A thread 2 calls mmap, blocks taking write lock
- * process B thread 1 takes page fault, read lock on own mmap lock
- * process B thread 2 calls mmap, blocks taking write lock
- * process A thread 1 blocks taking read lock on process B
- * process B thread 1 blocks taking read lock on process A
- *
- * Disable page faults to prevent potential deadlock
- * and retry the copy outside the mmap_lock.
- */
- pagefault_disable();
- ret = copy_from_user(page_kaddr,
- (const void __user *)src_addr,
- PAGE_SIZE);
- pagefault_enable();
- kunmap_local(page_kaddr);
-
- /* fallback to copy_from_user outside mmap_lock */
- if (unlikely(ret)) {
- *foliop = folio;
- ret = -ENOENT;
- /* don't free the page */
- goto out_unacct_blocks;
- }
-
- flush_dcache_folio(folio);
- } else { /* ZEROPAGE */
- clear_user_highpage(&folio->page, dst_addr);
+ page_kaddr = kmap_local_folio(folio, 0);
+ /*
+ * The read mmap_lock is held here. Despite the
+ * mmap_lock being read recursive a deadlock is still
+ * possible if a writer has taken a lock. For example:
+ *
+ * process A thread 1 takes read lock on own mmap_lock
+ * process A thread 2 calls mmap, blocks taking write lock
+ * process B thread 1 takes page fault, read lock on own mmap lock
+ * process B thread 2 calls mmap, blocks taking write lock
+ * process A thread 1 blocks taking read lock on process B
+ * process B thread 1 blocks taking read lock on process A
+ *
+ * Disable page faults to prevent potential deadlock
+ * and retry the copy outside the mmap_lock.
+ */
+ pagefault_disable();
+ ret = copy_from_user(page_kaddr, (const void __user *)src_addr,
+ PAGE_SIZE);
+ pagefault_enable();
+ kunmap_local(page_kaddr);
+
+ /* fallback to copy_from_user outside mmap_lock */
+ if (unlikely(ret)) {
+ *foliop = folio;
+ ret = -ENOENT;
+ /* don't free the page */
+ goto out_unacct_blocks;
}
+
+ flush_dcache_folio(folio);
} else {
folio = *foliop;
VM_BUG_ON_FOLIO(folio_test_large(folio), folio);
#ifdef CONFIG_USERFAULTFD
static const struct vm_uffd_ops shmem_uffd_ops = {
+ .copy = shmem_mfill_atomic_pte_copy,
+ .zeropage = shmem_mfill_atomic_pte_zeropage,
.cont = mfill_atomic_pte_continue,
.poison = mfill_atomic_pte_poison,
.is_dst_valid = shmem_is_dst_valid,
return ret;
}
-static int mfill_atomic_pte_copy(struct vm_area_struct *dst_vma,
- unsigned long dst_addr,
- unsigned long src_addr,
- uffd_flags_t flags,
- struct folio **foliop)
+int mfill_atomic_pte_copy(struct vm_area_struct *dst_vma,
+ unsigned long dst_addr, unsigned long src_addr,
+ uffd_flags_t flags, struct folio **foliop,
+ unsigned long increment)
{
void *kaddr;
int ret;
return ret;
}
-static int mfill_atomic_pte_zeropage(struct vm_area_struct *dst_vma,
- unsigned long dst_addr)
+int mfill_atomic_pte_zeropage(struct vm_area_struct *dst_vma,
+ unsigned long dst_addr)
{
pte_t _dst_pte, *dst_pte;
pmd_t *dst_pmd;
/* Anon vma ops */
static const struct vm_uffd_ops default_uffd_ops = {
+ .copy = mfill_atomic_pte_copy,
+ .zeropage = mfill_atomic_pte_zeropage,
.cont = mfill_atomic_pte_continue,
.poison = mfill_atomic_pte_poison,
.is_dst_valid = uffd_def_is_dst_valid,
if ((flags & MFILL_ATOMIC_WP) && !(dst_vma->vm_flags & VM_UFFD_WP))
return -EINVAL;
+ uffd_ops = vma_get_uffd_ops(dst_vma);
+ WARN_ON_ONCE(!uffd_ops || !uffd_ops->is_dst_valid);
+
/*
* There is no default zero huge page for all huge page sizes as
* supported by hugetlb. PMD_SIZE huge pages may exist as used
* by THP. Since we cannot reliably insert a zero page, this
* feature is not supported.
*/
- if (is_vm_hugetlb_page(dst_vma) &&
+ if (!uffd_ops->zeropage &&
uffd_flags_mode_is(flags, MFILL_ATOMIC_ZEROPAGE))
return -EINVAL;
- uffd_ops = vma_get_uffd_ops(dst_vma);
- WARN_ON_ONCE(!uffd_ops || !uffd_ops->is_dst_valid);
return uffd_ops->is_dst_valid(dst_vma, dst_start, len);
}
} else if (uffd_flags_mode_is(flags, MFILL_ATOMIC_CONTINUE)) {
err = uffd_ops->cont(dst_vma, dst_addr, flags,
increment);
- } else if (is_vm_hugetlb_page(dst_vma)) {
- err = hugetlb_mfill_atomic_pte(dst_vma, dst_addr,
- src_addr, flags, &folio, increment);
- } else if (!(dst_vma->vm_flags & VM_SHARED)) {
+ } else if (uffd_flags_mode_is(flags, MFILL_ATOMIC_COPY)) {
+ err = uffd_ops->copy(dst_vma, dst_addr, src_addr, flags,
+ &folio, increment);
/*
* The normal page fault path for a shmem will invoke
* the fault, fill the hole in the file and COW it right
* the pagetable (to verify it's still none) and not in
* the radix tree.
*/
- if (uffd_flags_mode_is(flags, MFILL_ATOMIC_COPY))
- err = mfill_atomic_pte_copy(dst_vma, dst_addr,
- src_addr, flags,
- &folio);
- else
- err = mfill_atomic_pte_zeropage(dst_vma,
- dst_addr);
} else {
- err = shmem_mfill_atomic_pte(dst_vma, dst_addr,
- src_addr, flags, &folio);
+ err = uffd_ops->zeropage(dst_vma, dst_addr);
}
cond_resched();