pmd_same(dst_pmdval, pmdp_get_lockless(dst_pmd));
}
-static int move_present_pte(struct mm_struct *mm,
- struct vm_area_struct *dst_vma,
- struct vm_area_struct *src_vma,
- unsigned long dst_addr, unsigned long src_addr,
- pte_t *dst_pte, pte_t *src_pte,
- pte_t orig_dst_pte, pte_t orig_src_pte,
- pmd_t *dst_pmd, pmd_t dst_pmdval,
- spinlock_t *dst_ptl, spinlock_t *src_ptl,
- struct folio *src_folio)
+/*
+ * Checks if the two ptes and the corresponding folio are eligible for a
+ * batched move. If so, returns a pointer to the locked folio. Otherwise,
+ * returns NULL.
+ *
+ * NOTE: a folio reference is not required as the whole operation runs
+ * within the PTL's critical section.
+ */
+static struct folio *check_ptes_for_batched_move(struct vm_area_struct *src_vma,
+ unsigned long src_addr,
+ pte_t *src_pte, pte_t *dst_pte,
+ struct anon_vma *src_anon_vma)
{
- int err = 0;
+ pte_t orig_dst_pte, orig_src_pte;
+ struct folio *folio;
+
+ orig_dst_pte = ptep_get(dst_pte);
+ if (!pte_none(orig_dst_pte))
+ return NULL;
+
+ orig_src_pte = ptep_get(src_pte);
+ if (!pte_present(orig_src_pte) || is_zero_pfn(pte_pfn(orig_src_pte)))
+ return NULL;
+
+ folio = vm_normal_folio(src_vma, src_addr, orig_src_pte);
+ if (!folio || !folio_trylock(folio))
+ return NULL;
+ if (!PageAnonExclusive(&folio->page) || folio_test_large(folio) ||
+ folio_anon_vma(folio) != src_anon_vma) {
+ folio_unlock(folio);
+ return NULL;
+ }
+ return folio;
+}
+/*
+ * Moves src folios to dst in a batch as long as they share the same
+ * anon_vma as the first folio, are not large, and can be locked via
+ * folio_trylock().
+ */
+static long move_present_ptes(struct mm_struct *mm,
+ struct vm_area_struct *dst_vma,
+ struct vm_area_struct *src_vma,
+ unsigned long dst_addr, unsigned long src_addr,
+ pte_t *dst_pte, pte_t *src_pte,
+ pte_t orig_dst_pte, pte_t orig_src_pte,
+ pmd_t *dst_pmd, pmd_t dst_pmdval,
+ spinlock_t *dst_ptl, spinlock_t *src_ptl,
+ struct folio **first_src_folio, unsigned long len,
+ struct anon_vma *src_anon_vma)
+{
+ int err = 0;
+ struct folio *src_folio = *first_src_folio;
+ unsigned long src_start = src_addr;
+ unsigned long src_end;
+
+ if (len > PAGE_SIZE) {
+ len = pmd_addr_end(dst_addr, dst_addr + len) - dst_addr;
+ src_end = pmd_addr_end(src_addr, src_addr + len);
+ } else
+ src_end = src_addr + len;
+ flush_cache_range(src_vma, src_addr, src_end);
double_pt_lock(dst_ptl, src_ptl);
if (!is_pte_pages_stable(dst_pte, src_pte, orig_dst_pte, orig_src_pte,
			 dst_pmd, dst_pmdval)) {
err = -EBUSY;
goto out;
}
+ /* It's safe to drop the reference now as the page-table is holding one. */
+ folio_put(*first_src_folio);
+ *first_src_folio = NULL;
+ arch_enter_lazy_mmu_mode();
+
+ while (true) {
+ orig_src_pte = ptep_get_and_clear(mm, src_addr, src_pte);
+ /* Folio got pinned from under us. Put it back and fail the move. */
+ if (folio_maybe_dma_pinned(src_folio)) {
+ set_pte_at(mm, src_addr, src_pte, orig_src_pte);
+ err = -EBUSY;
+ break;
+ }
- orig_src_pte = ptep_clear_flush(src_vma, src_addr, src_pte);
- /* Folio got pinned from under us. Put it back and fail the move. */
- if (folio_maybe_dma_pinned(src_folio)) {
- set_pte_at(mm, src_addr, src_pte, orig_src_pte);
- err = -EBUSY;
- goto out;
- }
-
- folio_move_anon_rmap(src_folio, dst_vma);
- src_folio->index = linear_page_index(dst_vma, dst_addr);
+ folio_move_anon_rmap(src_folio, dst_vma);
+ src_folio->index = linear_page_index(dst_vma, dst_addr);
- orig_dst_pte = folio_mk_pte(src_folio, dst_vma->vm_page_prot);
- /* Set soft dirty bit so userspace can notice the pte was moved */
+ orig_dst_pte = folio_mk_pte(src_folio, dst_vma->vm_page_prot);
+ /* Set soft dirty bit so userspace can notice the pte was moved */
#ifdef CONFIG_MEM_SOFT_DIRTY
- orig_dst_pte = pte_mksoft_dirty(orig_dst_pte);
+ orig_dst_pte = pte_mksoft_dirty(orig_dst_pte);
#endif
- if (pte_dirty(orig_src_pte))
- orig_dst_pte = pte_mkdirty(orig_dst_pte);
- orig_dst_pte = pte_mkwrite(orig_dst_pte, dst_vma);
+ if (pte_dirty(orig_src_pte))
+ orig_dst_pte = pte_mkdirty(orig_dst_pte);
+ orig_dst_pte = pte_mkwrite(orig_dst_pte, dst_vma);
+ set_pte_at(mm, dst_addr, dst_pte, orig_dst_pte);
+
+ src_addr += PAGE_SIZE;
+ if (src_addr == src_end)
+ break;
+ dst_addr += PAGE_SIZE;
+ dst_pte++;
+ src_pte++;
- set_pte_at(mm, dst_addr, dst_pte, orig_dst_pte);
+ folio_unlock(src_folio);
+ src_folio = check_ptes_for_batched_move(src_vma, src_addr, src_pte,
+ dst_pte, src_anon_vma);
+ if (!src_folio)
+ break;
+ }
+
+ arch_leave_lazy_mmu_mode();
+ if (src_addr > src_start)
+ flush_tlb_range(src_vma, src_start, src_addr);
+
+ if (src_folio)
+ folio_unlock(src_folio);
out:
double_pt_unlock(dst_ptl, src_ptl);
- return err;
+ return src_addr > src_start ? src_addr - src_start : err;
}
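
For reference, a small userspace-style sketch (not part of the patch) of the boundary clamping done at the top of move_present_ptes(), assuming 4 KiB pages and 2 MiB PMD coverage (x86_64 defaults); pmd_addr_end() is re-implemented here in a simplified form only for illustration, since the kernel macro is not usable outside the kernel:

/* Illustrative only: mimics the PMD-boundary clamp in move_present_ptes(). */
#include <stdio.h>

#define PAGE_SIZE	0x1000UL
#define PMD_SIZE	0x200000UL
#define PMD_MASK	(~(PMD_SIZE - 1))

/* Simplified stand-in for the kernel's pmd_addr_end() macro. */
static unsigned long pmd_addr_end(unsigned long addr, unsigned long end)
{
	unsigned long boundary = (addr + PMD_SIZE) & PMD_MASK;

	return boundary < end ? boundary : end;
}

int main(void)
{
	unsigned long dst_addr = 0x7f00001fe000UL;	/* 2 pages below a PMD boundary */
	unsigned long src_addr = 0x7f0000400000UL;	/* PMD-aligned */
	unsigned long len = 16 * PAGE_SIZE, src_end;

	/* Same clamping as move_present_ptes(): a batch never crosses a PMD boundary. */
	len = pmd_addr_end(dst_addr, dst_addr + len) - dst_addr;
	src_end = pmd_addr_end(src_addr, src_addr + len);

	printf("len = %lu pages, src_end = %#lx\n", len / PAGE_SIZE, src_end);
	/* Prints: len = 2 pages, src_end = 0x7f0000402000 */
	return 0;
}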
static int move_swap_pte(struct mm_struct *mm, struct vm_area_struct *dst_vma,
set_pte_at(mm, dst_addr, dst_pte, orig_src_pte);
double_pt_unlock(dst_ptl, src_ptl);
- return 0;
+ return PAGE_SIZE;
}
static int move_zeropage_pte(struct mm_struct *mm,
set_pte_at(mm, dst_addr, dst_pte, zero_pte);
double_pt_unlock(dst_ptl, src_ptl);
- return 0;
+ return PAGE_SIZE;
}
/*
- * The mmap_lock for reading is held by the caller. Just move the page
- * from src_pmd to dst_pmd if possible, and return true if succeeded
- * in moving the page.
+ * The mmap_lock for reading is held by the caller. Just move the page(s)
+ * from src_pmd to dst_pmd if possible, and return the number of bytes moved.
+ * On failure, a negative error code is returned.
*/
-static int move_pages_pte(struct mm_struct *mm, pmd_t *dst_pmd, pmd_t *src_pmd,
- struct vm_area_struct *dst_vma,
- struct vm_area_struct *src_vma,
- unsigned long dst_addr, unsigned long src_addr,
- __u64 mode)
+static long move_pages_ptes(struct mm_struct *mm, pmd_t *dst_pmd, pmd_t *src_pmd,
+ struct vm_area_struct *dst_vma,
+ struct vm_area_struct *src_vma,
+ unsigned long dst_addr, unsigned long src_addr,
+ unsigned long len, __u64 mode)
{
swp_entry_t entry;
struct swap_info_struct *si = NULL;
struct folio *src_folio = NULL;
struct anon_vma *src_anon_vma = NULL;
struct mmu_notifier_range range;
- int err = 0;
+ long ret = 0;
- flush_cache_range(src_vma, src_addr, src_addr + PAGE_SIZE);
mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm,
- src_addr, src_addr + PAGE_SIZE);
+ src_addr, src_addr + len);
mmu_notifier_invalidate_range_start(&range);
retry:
/*
/* Retry if a huge pmd materialized from under us */
if (unlikely(!dst_pte)) {
- err = -EAGAIN;
+ ret = -EAGAIN;
goto out;
}
* transparent huge pages under us.
*/
if (unlikely(!src_pte)) {
- err = -EAGAIN;
+ ret = -EAGAIN;
goto out;
}
/* Sanity checks before the operation */
if (pmd_none(*dst_pmd) || pmd_none(*src_pmd) ||
pmd_trans_huge(*dst_pmd) || pmd_trans_huge(*src_pmd)) {
- err = -EINVAL;
+ ret = -EINVAL;
goto out;
}
orig_dst_pte = ptep_get(dst_pte);
spin_unlock(dst_ptl);
if (!pte_none(orig_dst_pte)) {
- err = -EEXIST;
+ ret = -EEXIST;
goto out;
}
spin_unlock(src_ptl);
if (pte_none(orig_src_pte)) {
if (!(mode & UFFDIO_MOVE_MODE_ALLOW_SRC_HOLES))
- err = -ENOENT;
+ ret = -ENOENT;
else /* nothing to do to move a hole */
- err = 0;
+ ret = PAGE_SIZE;
goto out;
}
/* If PTE changed after we locked the folio then start over */
if (src_folio && unlikely(!pte_same(src_folio_pte, orig_src_pte))) {
- err = -EAGAIN;
+ ret = -EAGAIN;
goto out;
}
if (pte_present(orig_src_pte)) {
if (is_zero_pfn(pte_pfn(orig_src_pte))) {
- err = move_zeropage_pte(mm, dst_vma, src_vma,
+ ret = move_zeropage_pte(mm, dst_vma, src_vma,
dst_addr, src_addr, dst_pte, src_pte,
orig_dst_pte, orig_src_pte,
dst_pmd, dst_pmdval, dst_ptl, src_ptl);
spin_lock(src_ptl);
if (!pte_same(orig_src_pte, ptep_get(src_pte))) {
spin_unlock(src_ptl);
- err = -EAGAIN;
+ ret = -EAGAIN;
goto out;
}
folio = vm_normal_folio(src_vma, src_addr, orig_src_pte);
if (!folio || !PageAnonExclusive(&folio->page)) {
spin_unlock(src_ptl);
- err = -EBUSY;
+ ret = -EBUSY;
goto out;
}
*/
if (!locked && folio_test_large(folio)) {
spin_unlock(src_ptl);
- err = -EAGAIN;
+ ret = -EAGAIN;
goto out;
}
}
if (WARN_ON_ONCE(!folio_test_anon(src_folio))) {
- err = -EBUSY;
+ ret = -EBUSY;
goto out;
}
}
pte_unmap(src_pte);
pte_unmap(dst_pte);
src_pte = dst_pte = NULL;
- err = split_folio(src_folio);
- if (err)
+ ret = split_folio(src_folio);
+ if (ret)
goto out;
/* have to reacquire the folio after it got split */
folio_unlock(src_folio);
src_anon_vma = folio_get_anon_vma(src_folio);
if (!src_anon_vma) {
/* page was unmapped from under us */
- err = -EAGAIN;
+ ret = -EAGAIN;
goto out;
}
if (!anon_vma_trylock_write(src_anon_vma)) {
}
}
- err = move_present_pte(mm, dst_vma, src_vma,
- dst_addr, src_addr, dst_pte, src_pte,
- orig_dst_pte, orig_src_pte, dst_pmd,
- dst_pmdval, dst_ptl, src_ptl, src_folio);
+ ret = move_present_ptes(mm, dst_vma, src_vma,
+ dst_addr, src_addr, dst_pte, src_pte,
+ orig_dst_pte, orig_src_pte, dst_pmd,
+ dst_pmdval, dst_ptl, src_ptl, &src_folio,
+ len, src_anon_vma);
} else {
struct folio *folio = NULL;
pte_unmap(dst_pte);
src_pte = dst_pte = NULL;
migration_entry_wait(mm, src_pmd, src_addr);
- err = -EAGAIN;
+ ret = -EAGAIN;
} else
- err = -EFAULT;
+ ret = -EFAULT;
goto out;
}
if (!pte_swp_exclusive(orig_src_pte)) {
- err = -EBUSY;
+ ret = -EBUSY;
goto out;
}
si = get_swap_device(entry);
if (unlikely(!si)) {
- err = -EAGAIN;
+ ret = -EAGAIN;
goto out;
}
/*
swap_cache_index(entry));
if (!IS_ERR_OR_NULL(folio)) {
if (folio_test_large(folio)) {
- err = -EBUSY;
+ ret = -EBUSY;
folio_put(folio);
goto out;
}
goto retry;
}
}
- err = move_swap_pte(mm, dst_vma, dst_addr, src_addr, dst_pte, src_pte,
+ ret = move_swap_pte(mm, dst_vma, dst_addr, src_addr, dst_pte, src_pte,
orig_dst_pte, orig_src_pte, dst_pmd, dst_pmdval,
dst_ptl, src_ptl, src_folio, si, entry);
}
if (si)
put_swap_device(si);
- return err;
+ return ret;
}
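
For context, a minimal userspace sketch (not part of the patch) of the UFFDIO_MOVE call that ends up in move_pages_ptes(). It assumes a userfaultfd descriptor "uffd" already registered over both ranges and a kernel with UFFDIO_MOVE support (6.8+); error handling is trimmed for brevity:

#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include <linux/userfaultfd.h>

static long uffd_move(int uffd, unsigned long dst, unsigned long src,
		      unsigned long len)
{
	struct uffdio_move move;

	memset(&move, 0, sizeof(move));
	move.dst = dst;
	move.src = src;
	move.len = len;		/* whole range; present ptes can now be moved in batches */
	move.mode = 0;		/* or UFFDIO_MOVE_MODE_ALLOW_SRC_HOLES */

	if (ioctl(uffd, UFFDIO_MOVE, &move) == -1) {
		/* On failure, move.move holds the bytes moved so far or a negative errno. */
		fprintf(stderr, "UFFDIO_MOVE: moved %lld of %lu bytes\n",
			(long long)move.move, len);
		return -1;
	}
	return (long)move.move;
}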
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
{
struct mm_struct *mm = ctx->mm;
struct vm_area_struct *src_vma, *dst_vma;
- unsigned long src_addr, dst_addr;
+ unsigned long src_addr, dst_addr, src_end;
pmd_t *src_pmd, *dst_pmd;
long err = -EINVAL;
ssize_t moved = 0;
if (err)
goto out_unlock;
- for (src_addr = src_start, dst_addr = dst_start;
- src_addr < src_start + len;) {
+ for (src_addr = src_start, dst_addr = dst_start, src_end = src_start + len;
+ src_addr < src_end;) {
spinlock_t *ptl;
pmd_t dst_pmdval;
unsigned long step_size;
dst_addr, src_addr);
step_size = HPAGE_PMD_SIZE;
} else {
+ long ret;
+
if (pmd_none(*src_pmd)) {
if (!(mode & UFFDIO_MOVE_MODE_ALLOW_SRC_HOLES)) {
err = -ENOENT;
break;
}
- err = move_pages_pte(mm, dst_pmd, src_pmd,
- dst_vma, src_vma,
- dst_addr, src_addr, mode);
- step_size = PAGE_SIZE;
+ ret = move_pages_ptes(mm, dst_pmd, src_pmd,
+ dst_vma, src_vma, dst_addr,
+ src_addr, src_end - src_addr, mode);
+ if (ret < 0)
+ err = ret;
+ else
+ step_size = ret;
}
cond_resched();