The lock can now be released.
 
+Exclusive access memory
+=======================
+
+Some devices have features such as atomic PTE bits that can be used to implement
+atomic access to system memory. To support atomic operations to a shared virtual
+memory page such a device needs access to that page which is exclusive of any
+userspace access from the CPU. The ``make_device_exclusive_range()`` function
+can be used to make a memory range inaccessible from userspace.
+
+This replaces all mappings for pages in the given range with special swap
+entries. Any attempt to access the swap entry results in a fault which is
+resolved by replacing the entry with the original mapping. A driver gets
+notified that the mapping has been changed by MMU notifiers, after which point
+it will no longer have exclusive access to the page. Exclusive access is
+guaranteed to last until the driver drops the page lock and page reference, at
+which point any CPU faults on the page may proceed as described.
+
 Memory cgroup (memcg) and rss accounting
 ========================================
 
 
  * @MMU_NOTIFY_MIGRATE: used during migrate_vma_collect() invalidate to signal
  * a device driver to possibly ignore the invalidation if the
  * owner field matches the driver's device private pgmap owner.
+ *
+ * @MMU_NOTIFY_EXCLUSIVE: to signal a device driver that the device will no
+ * longer have exclusive access to the page. When sent during creation of an
+ * exclusive range the owner will be initialised to the value provided by the
+ * caller of make_device_exclusive_range(), otherwise the owner will be NULL.
  */
 enum mmu_notifier_event {
        MMU_NOTIFY_UNMAP = 0,
        MMU_NOTIFY_SOFT_DIRTY,
        MMU_NOTIFY_RELEASE,
        MMU_NOTIFY_MIGRATE,
+       MMU_NOTIFY_EXCLUSIVE,
 };
 
 #define MMU_NOTIFIER_RANGE_BLOCKABLE (1 << 0)
 
 void try_to_migrate(struct page *page, enum ttu_flags flags);
 void try_to_unmap(struct page *, enum ttu_flags flags);
 
+/*
+ * Replace the mappings of the pages in @mm between @start and @end with
+ * device exclusive swap entries. @owner is passed to the
+ * MMU_NOTIFY_EXCLUSIVE notifier callbacks so a driver can filter on it.
+ */
+int make_device_exclusive_range(struct mm_struct *mm, unsigned long start,
+                               unsigned long end, struct page **pages,
+                               void *owner);
+
 /* Avoid racy checks */
 #define PVMW_SYNC              (1 << 0)
 /* Look for migarion entries rather than present PTEs */
 
  * migrate part of a process memory to device memory.
  *
  * When a page is migrated from CPU to device, we set the CPU page table entry
- * to a special SWP_DEVICE_* entry.
+ * to a special SWP_DEVICE_{READ|WRITE} entry.
+ *
+ * When a page is mapped by the device for exclusive access we set the CPU page
+ * table entries to special SWP_DEVICE_EXCLUSIVE_* entries.
  */
 #ifdef CONFIG_DEVICE_PRIVATE
-#define SWP_DEVICE_NUM 2
+#define SWP_DEVICE_NUM 4
 #define SWP_DEVICE_WRITE (MAX_SWAPFILES+SWP_HWPOISON_NUM+SWP_MIGRATION_NUM)
 #define SWP_DEVICE_READ (MAX_SWAPFILES+SWP_HWPOISON_NUM+SWP_MIGRATION_NUM+1)
+#define SWP_DEVICE_EXCLUSIVE_WRITE (MAX_SWAPFILES+SWP_HWPOISON_NUM+SWP_MIGRATION_NUM+2)
+#define SWP_DEVICE_EXCLUSIVE_READ (MAX_SWAPFILES+SWP_HWPOISON_NUM+SWP_MIGRATION_NUM+3)
 #else
 #define SWP_DEVICE_NUM 0
 #endif
 
 {
        return unlikely(swp_type(entry) == SWP_DEVICE_WRITE);
 }
+
+/*
+ * Build a swap entry of type SWP_DEVICE_EXCLUSIVE_READ carrying @offset.
+ * The caller visible in this change passes the pfn of the page as @offset.
+ */
+static inline swp_entry_t make_readable_device_exclusive_entry(pgoff_t offset)
+{
+       return swp_entry(SWP_DEVICE_EXCLUSIVE_READ, offset);
+}
+
+/*
+ * Build a swap entry of type SWP_DEVICE_EXCLUSIVE_WRITE carrying @offset.
+ * The caller visible in this change passes the pfn of the page as @offset.
+ */
+static inline swp_entry_t make_writable_device_exclusive_entry(pgoff_t offset)
+{
+       return swp_entry(SWP_DEVICE_EXCLUSIVE_WRITE, offset);
+}
+
+/*
+ * Returns true if @entry is a device exclusive entry of either kind
+ * (SWP_DEVICE_EXCLUSIVE_READ or SWP_DEVICE_EXCLUSIVE_WRITE).
+ */
+static inline bool is_device_exclusive_entry(swp_entry_t entry)
+{
+       return swp_type(entry) == SWP_DEVICE_EXCLUSIVE_READ ||
+               swp_type(entry) == SWP_DEVICE_EXCLUSIVE_WRITE;
+}
+
+/*
+ * Returns true only if @entry has type SWP_DEVICE_EXCLUSIVE_WRITE. The
+ * comparison is annotated as unlikely().
+ */
+static inline bool is_writable_device_exclusive_entry(swp_entry_t entry)
+{
+       return unlikely(swp_type(entry) == SWP_DEVICE_EXCLUSIVE_WRITE);
+}
 #else /* CONFIG_DEVICE_PRIVATE */
 static inline swp_entry_t make_readable_device_private_entry(pgoff_t offset)
 {
 {
        return false;
 }
+
+/*
+ * Stub for !CONFIG_DEVICE_PRIVATE: @offset is ignored and swp_entry(0, 0)
+ * is returned.
+ */
+static inline swp_entry_t make_readable_device_exclusive_entry(pgoff_t offset)
+{
+       return swp_entry(0, 0);
+}
+
+/*
+ * Stub for !CONFIG_DEVICE_PRIVATE: @offset is ignored and swp_entry(0, 0)
+ * is returned.
+ */
+static inline swp_entry_t make_writable_device_exclusive_entry(pgoff_t offset)
+{
+       return swp_entry(0, 0);
+}
+
+/* Stub for !CONFIG_DEVICE_PRIVATE: no entry is ever device exclusive. */
+static inline bool is_device_exclusive_entry(swp_entry_t entry)
+{
+       return false;
+}
+
+/* Stub for !CONFIG_DEVICE_PRIVATE: always returns false. */
+static inline bool is_writable_device_exclusive_entry(swp_entry_t entry)
+{
+       return false;
+}
 #endif /* CONFIG_DEVICE_PRIVATE */
 
 #ifdef CONFIG_MIGRATION
  */
 static inline bool is_pfn_swap_entry(swp_entry_t entry)
 {
-       return is_migration_entry(entry) || is_device_private_entry(entry);
+       return is_migration_entry(entry) || is_device_private_entry(entry) ||
+              is_device_exclusive_entry(entry);
 }
 
 struct page_vma_mapped_walk;
 
 #include <linux/mmu_notifier.h>
 #include <linux/memory_hotplug.h>
 
+#include "internal.h"
+
 struct hmm_vma_walk {
        struct hmm_range        *range;
        unsigned long           last;
                if (!non_swap_entry(entry))
                        goto fault;
 
+               if (is_device_exclusive_entry(entry))
+                       goto fault;
+
                if (is_migration_entry(entry)) {
                        pte_unmap(ptep);
                        hmm_vma_walk->last = addr;
 
 }
 #endif
 
+/*
+ * Replace the device exclusive swap entry at @ptep with a present pte that
+ * maps @page at @address in @vma.
+ *
+ * The new pte is built from vma->vm_page_prot and marked old. The soft-dirty
+ * and uffd-wp bits of the swap entry are carried over. If the entry is not
+ * uffd-wp and was a writable exclusive entry, the pte is marked dirty and
+ * made writable where maybe_mkwrite() allows it for this vma.
+ *
+ * Both callers in this change invoke this with the page locked.
+ */
+static void restore_exclusive_pte(struct vm_area_struct *vma,
+                                 struct page *page, unsigned long address,
+                                 pte_t *ptep)
+{
+       pte_t pte;
+       swp_entry_t entry;
+
+       pte = pte_mkold(mk_pte(page, READ_ONCE(vma->vm_page_prot)));
+       if (pte_swp_soft_dirty(*ptep))
+               pte = pte_mksoft_dirty(pte);
+
+       entry = pte_to_swp_entry(*ptep);
+       /* uffd-wp takes precedence: such a pte is never made writable here. */
+       if (pte_swp_uffd_wp(*ptep))
+               pte = pte_mkuffd_wp(pte);
+       else if (is_writable_device_exclusive_entry(entry))
+               pte = maybe_mkwrite(pte_mkdirty(pte), vma);
+
+       set_pte_at(vma->vm_mm, address, ptep, pte);
+
+       /*
+        * No need to take a page reference as one was already
+        * created when the swap entry was made.
+        */
+       if (PageAnon(page))
+               page_add_anon_rmap(page, vma, address, false);
+       else
+               /*
+                * Currently device exclusive access only supports anonymous
+                * memory so the entry shouldn't point to a filebacked page.
+                */
+               WARN_ON_ONCE(!PageAnon(page));
+
+       if (vma->vm_flags & VM_LOCKED)
+               mlock_vma_page(page);
+
+       /*
+        * No need to invalidate - it was non-present before. However
+        * secondary CPUs may have mappings that need invalidating.
+        */
+       update_mmu_cache(vma, address, ptep);
+}
+
+/*
+ * Tries to restore an exclusive pte if the page lock can be acquired without
+ * sleeping.
+ *
+ * The page is looked up from the swap entry stored in @src_pte. Returns 0 if
+ * the pte was restored, or -EBUSY if the page lock could not be taken, in
+ * which case the pte is left untouched.
+ */
+static int
+try_restore_exclusive_pte(pte_t *src_pte, struct vm_area_struct *vma,
+                       unsigned long addr)
+{
+       swp_entry_t entry = pte_to_swp_entry(*src_pte);
+       struct page *page = pfn_swap_entry_to_page(entry);
+
+       if (trylock_page(page)) {
+               restore_exclusive_pte(vma, page, addr, src_pte);
+               unlock_page(page);
+               return 0;
+       }
+
+       return -EBUSY;
+}
+
 /*
  * copy one vm_area from one task to the other. Assumes the page tables
  * already present in the new task to be cleared in the whole range
                                pte = pte_swp_mkuffd_wp(pte);
                        set_pte_at(src_mm, addr, src_pte, pte);
                }
+       } else if (is_device_exclusive_entry(entry)) {
+               /*
+                * Make device exclusive entries present by restoring the
+                * original entry then copying as for a present pte. Device
+                * exclusive entries currently only support private writable
+                * (ie. COW) mappings.
+                */
+               VM_BUG_ON(!is_cow_mapping(src_vma->vm_flags));
+               if (try_restore_exclusive_pte(src_pte, src_vma, addr))
+                       return -EBUSY;
+               return -ENOENT;
        }
        if (!userfaultfd_wp(dst_vma))
                pte = pte_swp_clear_uffd_wp(pte);
                        if (ret == -EIO) {
                                entry = pte_to_swp_entry(*src_pte);
                                break;
+                       } else if (ret == -EBUSY) {
+                               break;
+                       } else if (!ret) {
+                               progress += 8;
+                               continue;
                        }
-                       progress += 8;
-                       continue;
+
+                       /*
+                        * Device exclusive entry restored, continue by copying
+                        * the now present pte.
+                        */
+                       WARN_ON_ONCE(ret != -ENOENT);
                }
                /* copy_present_pte() will clear `*prealloc' if consumed */
                ret = copy_present_pte(dst_vma, src_vma, dst_pte, src_pte,
                        goto out;
                }
                entry.val = 0;
+       } else if (ret == -EBUSY) {
+               goto out;
        } else if (ret ==  -EAGAIN) {
                prealloc = page_copy_prealloc(src_mm, src_vma, addr);
                if (!prealloc)
                }
 
                entry = pte_to_swp_entry(ptent);
-               if (is_device_private_entry(entry)) {
+               if (is_device_private_entry(entry) ||
+                   is_device_exclusive_entry(entry)) {
                        struct page *page = pfn_swap_entry_to_page(entry);
 
                        if (unlikely(details && details->check_mapping)) {
 
                        pte_clear_not_present_full(mm, addr, pte, tlb->fullmm);
                        rss[mm_counter(page)]--;
-                       page_remove_rmap(page, false);
+
+                       if (is_device_private_entry(entry))
+                               page_remove_rmap(page, false);
+
                        put_page(page);
                        continue;
                }
 }
 EXPORT_SYMBOL(unmap_mapping_range);
 
+/*
+ * Restore a potential device exclusive pte to a working pte entry.
+ *
+ * Called from the swap fault path with vmf->page set to the page named by the
+ * device exclusive entry. MMU_NOTIFY_EXCLUSIVE notifiers are invoked (with a
+ * NULL owner) for the faulting page so that a driver can drop its exclusive
+ * access, then the original mapping is restored if the pte has not changed
+ * since it was sampled into vmf->orig_pte.
+ *
+ * Returns VM_FAULT_RETRY if the page lock could not be taken, otherwise 0.
+ */
+static vm_fault_t remove_device_exclusive_entry(struct vm_fault *vmf)
+{
+       struct page *page = vmf->page;
+       struct vm_area_struct *vma = vmf->vma;
+       struct mmu_notifier_range range;
+
+       /*
+        * The page table lock is not held yet, so a racing thread may restore
+        * the entry and unmap (and free) the page before we lock it. Take our
+        * own reference so the page cannot be freed underneath us. If the
+        * refcount is already zero the entry must have been removed, so there
+        * is nothing left to do. If the page was freed and re-allocated in the
+        * meantime, the pte_same() check below fails and we merely lock and
+        * unlock it.
+        */
+       if (!get_page_unless_zero(page))
+               return 0;
+
+       if (!lock_page_or_retry(page, vma->vm_mm, vmf->flags)) {
+               put_page(page);
+               return VM_FAULT_RETRY;
+       }
+       mmu_notifier_range_init_owner(&range, MMU_NOTIFY_EXCLUSIVE, 0, vma,
+                               vma->vm_mm, vmf->address & PAGE_MASK,
+                               (vmf->address & PAGE_MASK) + PAGE_SIZE, NULL);
+       mmu_notifier_invalidate_range_start(&range);
+
+       vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address,
+                               &vmf->ptl);
+       /* Only restore if nobody changed the pte since the fault sampled it. */
+       if (likely(pte_same(*vmf->pte, vmf->orig_pte)))
+               restore_exclusive_pte(vma, page, vmf->address, vmf->pte);
+
+       pte_unmap_unlock(vmf->pte, vmf->ptl);
+       unlock_page(page);
+       /* Drop the temporary reference taken above. */
+       put_page(page);
+
+       mmu_notifier_invalidate_range_end(&range);
+       return 0;
+}
+
 /*
  * We enter with non-exclusive mmap_lock (to exclude vma changes,
  * but allow concurrent faults), and pte mapped but not yet locked.
                if (is_migration_entry(entry)) {
                        migration_entry_wait(vma->vm_mm, vmf->pmd,
                                             vmf->address);
+               } else if (is_device_exclusive_entry(entry)) {
+                       vmf->page = pfn_swap_entry_to_page(entry);
+                       ret = remove_device_exclusive_entry(vmf);
                } else if (is_device_private_entry(entry)) {
                        vmf->page = pfn_swap_entry_to_page(entry);
                        ret = vmf->page->pgmap->ops->migrate_to_ram(vmf);
 
                                newpte = swp_entry_to_pte(entry);
                                if (pte_swp_uffd_wp(oldpte))
                                        newpte = pte_swp_mkuffd_wp(newpte);
+                       } else if (is_writable_device_exclusive_entry(entry)) {
+                               entry = make_readable_device_exclusive_entry(
+                                                       swp_offset(entry));
+                               newpte = swp_entry_to_pte(entry);
+                               if (pte_swp_soft_dirty(oldpte))
+                                       newpte = pte_swp_mksoft_dirty(newpte);
+                               if (pte_swp_uffd_wp(oldpte))
+                                       newpte = pte_swp_mkuffd_wp(newpte);
                        } else {
                                newpte = oldpte;
                        }
 
 
                                /* Handle un-addressable ZONE_DEVICE memory */
                                entry = pte_to_swp_entry(*pvmw->pte);
-                               if (!is_device_private_entry(entry))
+                               if (!is_device_private_entry(entry) &&
+                                   !is_device_exclusive_entry(entry))
                                        return false;
                        } else if (!pte_present(*pvmw->pte))
                                return false;
                        return false;
                entry = pte_to_swp_entry(*pvmw->pte);
 
-               if (!is_migration_entry(entry))
+               if (!is_migration_entry(entry) &&
+                   !is_device_exclusive_entry(entry))
                        return false;
 
                pfn = swp_offset(entry);
 
                /* Handle un-addressable ZONE_DEVICE memory */
                entry = pte_to_swp_entry(*pvmw->pte);
-               if (!is_device_private_entry(entry))
+               if (!is_device_private_entry(entry) &&
+                   !is_device_exclusive_entry(entry))
                        return false;
 
                pfn = swp_offset(entry);
 
        rmap_walk(page, &rwc);
 }
 
+#ifdef CONFIG_DEVICE_PRIVATE
+/*
+ * Arguments passed from page_make_device_exclusive() to
+ * page_make_device_exclusive_one() through rmap_walk_control.arg.
+ */
+struct make_exclusive_args {
+       /* mm in which the page is expected to be mapped */
+       struct mm_struct *mm;
+       /* address at which the page is expected to be mapped in @mm */
+       unsigned long address;
+       /* passed as owner to the MMU_NOTIFY_EXCLUSIVE notifier range */
+       void *owner;
+       /* set once a writable pte at (@mm, @address) has been replaced */
+       bool valid;
+};
+
+/*
+ * rmap_one callback used by page_make_device_exclusive().
+ *
+ * For every pte in @vma that maps @page, the present pte is cleared and
+ * replaced by a device exclusive swap entry (writable or readable depending
+ * on the old pte), preserving the soft-dirty and uffd-wp bits. The whole
+ * operation is bracketed by an MMU_NOTIFY_EXCLUSIVE notifier range whose
+ * owner is taken from @priv (a struct make_exclusive_args).
+ *
+ * Returns false if a non-present pte was encountered, true otherwise.
+ */
+static bool page_make_device_exclusive_one(struct page *page,
+               struct vm_area_struct *vma, unsigned long address, void *priv)
+{
+       struct mm_struct *mm = vma->vm_mm;
+       struct page_vma_mapped_walk pvmw = {
+               .page = page,
+               .vma = vma,
+               .address = address,
+       };
+       struct make_exclusive_args *args = priv;
+       pte_t pteval;
+       struct page *subpage;
+       bool ret = true;
+       struct mmu_notifier_range range;
+       swp_entry_t entry;
+       pte_t swp_pte;
+
+       mmu_notifier_range_init_owner(&range, MMU_NOTIFY_EXCLUSIVE, 0, vma,
+                                     vma->vm_mm, address, min(vma->vm_end,
+                                     address + page_size(page)), args->owner);
+       mmu_notifier_invalidate_range_start(&range);
+
+       while (page_vma_mapped_walk(&pvmw)) {
+               /* Unexpected PMD-mapped THP? */
+               VM_BUG_ON_PAGE(!pvmw.pte, page);
+
+               if (!pte_present(*pvmw.pte)) {
+                       ret = false;
+                       page_vma_mapped_walk_done(&pvmw);
+                       break;
+               }
+
+               subpage = page - page_to_pfn(page) + pte_pfn(*pvmw.pte);
+               address = pvmw.address;
+
+               /* Nuke the page table entry. */
+               flush_cache_page(vma, address, pte_pfn(*pvmw.pte));
+               pteval = ptep_clear_flush(vma, address, pvmw.pte);
+
+               /* Move the dirty bit to the page. Now the pte is gone. */
+               if (pte_dirty(pteval))
+                       set_page_dirty(page);
+
+               /*
+                * Check that our target page is still mapped at the expected
+                * address. Only a writable mapping counts as valid.
+                */
+               if (args->mm == mm && args->address == address &&
+                   pte_write(pteval))
+                       args->valid = true;
+
+               /*
+                * Store the pfn of the page in a special device exclusive
+                * swap entry. A later CPU fault on this entry is resolved by
+                * remove_device_exclusive_entry(), which restores the
+                * original mapping.
+                */
+               if (pte_write(pteval))
+                       entry = make_writable_device_exclusive_entry(
+                                                       page_to_pfn(subpage));
+               else
+                       entry = make_readable_device_exclusive_entry(
+                                                       page_to_pfn(subpage));
+               swp_pte = swp_entry_to_pte(entry);
+               if (pte_soft_dirty(pteval))
+                       swp_pte = pte_swp_mksoft_dirty(swp_pte);
+               if (pte_uffd_wp(pteval))
+                       swp_pte = pte_swp_mkuffd_wp(swp_pte);
+
+               set_pte_at(mm, address, pvmw.pte, swp_pte);
+
+               /*
+                * The page reference that was held for the removed mapping is
+                * kept on behalf of the swap entry, so no additional reference
+                * is taken here; only the rmap is dropped.
+                */
+               page_remove_rmap(subpage, false);
+       }
+
+       mmu_notifier_invalidate_range_end(&range);
+
+       return ret;
+}
+
+/**
+ * page_make_device_exclusive - mark the page exclusively owned by a device
+ * @page: the page to replace page table entries for
+ * @mm: the mm_struct where the page is expected to be mapped
+ * @address: address where the page is expected to be mapped
+ * @owner: passed to MMU_NOTIFY_EXCLUSIVE range notifier callbacks
+ *
+ * Tries to remove all the page table entries which are mapping this page and
+ * replace them with special device exclusive swap entries to grant a device
+ * exclusive access to the page. Caller must hold the page lock.
+ *
+ * Returns false if the page is not anonymous or is a tail page, if it is
+ * still mapped after the walk, or if it could not be unmapped from a writable
+ * mapping at the expected address. Otherwise returns true (success).
+ */
+static bool page_make_device_exclusive(struct page *page, struct mm_struct *mm,
+                               unsigned long address, void *owner)
+{
+       struct make_exclusive_args args = {
+               .mm = mm,
+               .address = address,
+               .owner = owner,
+               .valid = false,
+       };
+       struct rmap_walk_control rwc = {
+               .rmap_one = page_make_device_exclusive_one,
+               .done = page_not_mapped,
+               .anon_lock = page_lock_anon_vma_read,
+               .arg = &args,
+       };
+
+       /*
+        * Restrict to anonymous pages for now to avoid potential writeback
+        * issues. Also tail pages shouldn't be passed to rmap_walk so skip
+        * those.
+        */
+       if (!PageAnon(page) || PageTail(page))
+               return false;
+
+       rmap_walk(page, &rwc);
+
+       /* Success needs the expected mapping replaced and no mappings left. */
+       return args.valid && !page_mapcount(page);
+}
+
+/**
+ * make_device_exclusive_range() - Mark a range for exclusive use by a device
+ * @mm: mm_struct of associated target process
+ * @start: start of the region to mark for exclusive device access
+ * @end: end address of region
+ * @pages: returns the pages which were successfully marked for exclusive access
+ * @owner: passed to MMU_NOTIFY_EXCLUSIVE range notifier to allow filtering
+ *
+ * Returns: number of pages found in the range by GUP, or the negative value
+ * returned by get_user_pages_remote() on failure. A page is marked for
+ * exclusive access only if the page pointer is non-NULL. Pages that could not
+ * be locked or converted have their entry in @pages set to NULL and their
+ * reference dropped; the remaining pages are returned locked and with a
+ * reference held.
+ *
+ * This function finds ptes mapping page(s) to the given address range, locks
+ * them and replaces mappings with special swap entries preventing userspace CPU
+ * access. On fault these entries are replaced with the original mapping after
+ * calling MMU notifiers.
+ *
+ * A driver using this to program access from a device must use a mmu notifier
+ * critical section to hold a device specific lock during programming. Once
+ * programming is complete it should drop the page lock and reference after
+ * which point CPU access to the page will revoke the exclusive access.
+ */
+int make_device_exclusive_range(struct mm_struct *mm, unsigned long start,
+                               unsigned long end, struct page **pages,
+                               void *owner)
+{
+       long npages = (end - start) >> PAGE_SHIFT;
+       long i;
+
+       npages = get_user_pages_remote(mm, start, npages,
+                                      FOLL_GET | FOLL_WRITE | FOLL_SPLIT_PMD,
+                                      pages, NULL, NULL);
+       if (npages < 0)
+               return npages;
+
+       for (i = 0; i < npages; i++, start += PAGE_SIZE) {
+               /* Never sleep on the page lock; give up on this page instead. */
+               if (!trylock_page(pages[i])) {
+                       put_page(pages[i]);
+                       pages[i] = NULL;
+                       continue;
+               }
+
+               if (!page_make_device_exclusive(pages[i], mm, start, owner)) {
+                       unlock_page(pages[i]);
+                       put_page(pages[i]);
+                       pages[i] = NULL;
+               }
+       }
+
+       return npages;
+}
+EXPORT_SYMBOL_GPL(make_device_exclusive_range);
+#endif
+
 void __put_anon_vma(struct anon_vma *anon_vma)
 {
        struct anon_vma *root = anon_vma->root;