From d360135cfdb07d3b529db2772c23999f4f8b643c Mon Sep 17 00:00:00 2001 From: Jerome Glisse Date: Wed, 5 Dec 2018 11:13:56 +1100 Subject: [PATCH] mm/mmu_notifier: contextual information for event triggering invalidation MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit CPU page table updates can happen for many reasons, not only as a result of a syscall (munmap(), mprotect(), mremap(), madvise(), ...) but also as a result of kernel activities (memory compression, reclaim, migration, ...). Users of the mmu notifier API track changes to the CPU page table and take specific actions for them. The current API only provides the range of virtual addresses affected by a change, not why the change is happening. This patchset adds event information so that users of mmu notifier can differentiate among broad categories: - UNMAP: munmap() or mremap() - CLEAR: page table is cleared (migration, compaction, reclaim, ...) - PROTECTION_VMA: change in access protections for the range - PROTECTION_PAGE: change in access protections for page in the range - SOFT_DIRTY: soft dirtiness tracking Being able to distinguish munmap() and mremap() from other reasons why the page table is cleared is important to allow users of the mmu notifier to update their own internal tracking structures accordingly (on munmap or mremap it is no longer necessary to track the range of virtual addresses as it becomes invalid). 
Link: http://lkml.kernel.org/r/20181203201817.10759-4-jglisse@redhat.com Signed-off-by: Jerome Glisse Acked-by: Christian König Cc: Matthew Wilcox Cc: Ross Zwisler Cc: Jan Kara Cc: Dan Williams Cc: Paolo Bonzini Cc: Radim Krcmar Cc: Michal Hocko Cc: Felix Kuehling Cc: Ralph Campbell Cc: John Hubbard Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- fs/dax.c | 1 + fs/proc/task_mmu.c | 1 + include/linux/mmu_notifier.h | 33 +++++++++++++++++++++++++++++++++ kernel/events/uprobes.c | 1 + mm/huge_memory.c | 4 ++++ mm/hugetlb.c | 4 ++++ mm/khugepaged.c | 1 + mm/ksm.c | 2 ++ mm/madvise.c | 1 + mm/memory.c | 5 +++++ mm/migrate.c | 2 ++ mm/mprotect.c | 1 + mm/mremap.c | 1 + mm/oom_kill.c | 1 + mm/rmap.c | 2 ++ 15 files changed, 60 insertions(+) diff --git a/fs/dax.c b/fs/dax.c index e22508ee19ec..83092c5ac5f0 100644 --- a/fs/dax.c +++ b/fs/dax.c @@ -761,6 +761,7 @@ static void dax_entry_mkclean(struct address_space *mapping, pgoff_t index, struct mmu_notifier_range range; unsigned long address; + range.event = MMU_NOTIFY_PROTECTION_PAGE; range.mm = vma->vm_mm; cond_resched(); diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index b417ac98f10a..823c5d9e8e93 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -1156,6 +1156,7 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf, range.start = 0; range.end = -1UL; range.mm = mm; + range.event = MMU_NOTIFY_SOFT_DIRTY; mmu_notifier_invalidate_range_start(&range); } walk_page_range(0, mm->highest_vm_end, &clear_refs_walk); diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h index a85b845ec99b..8555cf8946a6 100644 --- a/include/linux/mmu_notifier.h +++ b/include/linux/mmu_notifier.h @@ -23,10 +23,43 @@ struct mmu_notifier_mm { spinlock_t lock; }; +/* + * What event is triggering the invalidation: + * + * MMU_NOTIFY_UNMAP + * either munmap() that unmap the range or a mremap() that move the range + * + * MMU_NOTIFY_CLEAR + * clear page table entry (many reasons 
for this like madvise() or replacing + * a page by another one, ...). + * + * MMU_NOTIFY_PROTECTION_VMA + * update is due to protection change for the range ie using the vma access + * permission (vm_page_prot) to update the whole range is enough no need to + * inspect changes to the CPU page table (mprotect() syscall) + * + * MMU_NOTIFY_PROTECTION_PAGE + * update is due to change in read/write flag for pages in the range so to + * mirror those changes the user must inspect the CPU page table (from the + * end callback). + * + * + * MMU_NOTIFY_SOFT_DIRTY + * soft dirty accounting (still same page and same access flags) + */ +enum mmu_notifier_event { + MMU_NOTIFY_UNMAP = 0, + MMU_NOTIFY_CLEAR, + MMU_NOTIFY_PROTECTION_VMA, + MMU_NOTIFY_PROTECTION_PAGE, + MMU_NOTIFY_SOFT_DIRTY, +}; + struct mmu_notifier_range { struct mm_struct *mm; unsigned long start; unsigned long end; + enum mmu_notifier_event event; bool blockable; }; diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index aa7996ca361e..b6ef3be1c24e 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -174,6 +174,7 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr, struct mmu_notifier_range range; struct mem_cgroup *memcg; + range.event = MMU_NOTIFY_CLEAR; range.start = addr; range.end = addr + PAGE_SIZE; range.mm = mm; diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 9f9bc4dc585b..249581efb66a 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1182,6 +1182,7 @@ static vm_fault_t do_huge_pmd_wp_page_fallback(struct vm_fault *vmf, cond_resched(); } + range.event = MMU_NOTIFY_CLEAR; range.start = haddr; range.end = range.start + HPAGE_PMD_SIZE; range.mm = vma->vm_mm; @@ -1347,6 +1348,7 @@ alloc: vma, HPAGE_PMD_NR); __SetPageUptodate(new_page); + range.event = MMU_NOTIFY_CLEAR; range.start = haddr; range.end = range.start + HPAGE_PMD_SIZE; range.mm = vma->vm_mm; @@ -2027,6 +2029,7 @@ void __split_huge_pud(struct vm_area_struct *vma, pud_t *pud, 
struct mm_struct *mm = vma->vm_mm; struct mmu_notifier_range range; + range.event = MMU_NOTIFY_CLEAR; range.start = address & HPAGE_PUD_MASK; range.end = range.start + HPAGE_PUD_SIZE; range.mm = mm; @@ -2246,6 +2249,7 @@ void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, struct mm_struct *mm = vma->vm_mm; struct mmu_notifier_range range; + range.event = MMU_NOTIFY_CLEAR; range.start = address & HPAGE_PMD_MASK; range.end = range.start + HPAGE_PMD_SIZE; range.mm = mm; diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 2876356a386c..252e81e1f86e 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -3246,6 +3246,7 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src, cow = (vma->vm_flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE; + range.event = MMU_NOTIFY_CLEAR; range.start = vma->vm_start; range.end = vma->vm_end; range.mm = src; @@ -3362,6 +3363,7 @@ void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma, unsigned long sz = huge_page_size(h); struct mmu_notifier_range range; + range.event = MMU_NOTIFY_CLEAR; range.start = start; range.end = end; range.mm = mm; @@ -3647,6 +3649,7 @@ retry_avoidcopy: __SetPageUptodate(new_page); set_page_huge_active(new_page); + range.event = MMU_NOTIFY_CLEAR; range.start = haddr; range.end = range.start + huge_page_size(h); range.mm = mm; @@ -4384,6 +4387,7 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma, bool shared_pmd = false; struct mmu_notifier_range range; + range.event = MMU_NOTIFY_PROTECTION_VMA; range.start = start; range.end = end; range.mm = mm; diff --git a/mm/khugepaged.c b/mm/khugepaged.c index e9fe0c9a9f56..c5c78ba30b38 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -1016,6 +1016,7 @@ static void collapse_huge_page(struct mm_struct *mm, pte = pte_offset_map(pmd, address); pte_ptl = pte_lockptr(mm, pmd); + range.event = MMU_NOTIFY_CLEAR; range.start = address; range.end = range.start + HPAGE_PMD_SIZE; range.mm = mm; diff --git a/mm/ksm.c 
b/mm/ksm.c index 3b531c7e7b57..723bd32d4dd0 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -1064,6 +1064,7 @@ static int write_protect_page(struct vm_area_struct *vma, struct page *page, BUG_ON(PageTransCompound(page)); + range.event = MMU_NOTIFY_CLEAR; range.start = pvmw.address; range.end = range.start + PAGE_SIZE; range.mm = mm; @@ -1153,6 +1154,7 @@ static int replace_page(struct vm_area_struct *vma, struct page *page, if (!pmd) goto out; + range.event = MMU_NOTIFY_CLEAR; range.start = addr; range.end = addr + PAGE_SIZE; range.mm = mm; diff --git a/mm/madvise.c b/mm/madvise.c index f20dd80ca21b..c415985d6a04 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -466,6 +466,7 @@ static int madvise_free_single_vma(struct vm_area_struct *vma, if (!vma_is_anonymous(vma)) return -EINVAL; + range.event = MMU_NOTIFY_CLEAR; range.start = max(vma->vm_start, start_addr); if (range.start >= vma->vm_end) return -EINVAL; diff --git a/mm/memory.c b/mm/memory.c index cdc0c3840e29..4a67eb8aef55 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1007,6 +1007,7 @@ int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, * is_cow_mapping() returns true. 
*/ is_cow = is_cow_mapping(vma->vm_flags); + range.event = MMU_NOTIFY_PROTECTION_PAGE; range.start = addr; range.end = end; range.mm = src_mm; @@ -1334,6 +1335,7 @@ void unmap_vmas(struct mmu_gather *tlb, { struct mmu_notifier_range range; + range.event = MMU_NOTIFY_UNMAP; range.start = start_addr; range.end = end_addr; range.mm = vma->vm_mm; @@ -1358,6 +1360,7 @@ void zap_page_range(struct vm_area_struct *vma, unsigned long start, struct mmu_notifier_range range; struct mmu_gather tlb; + range.event = MMU_NOTIFY_CLEAR; range.start = start; range.end = range.start + size; range.mm = vma->vm_mm; @@ -1387,6 +1390,7 @@ static void zap_page_range_single(struct vm_area_struct *vma, unsigned long addr struct mmu_notifier_range range; struct mmu_gather tlb; + range.event = MMU_NOTIFY_CLEAR; range.start = address; range.end = range.start + size; range.mm = vma->vm_mm; @@ -2260,6 +2264,7 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf) struct mem_cgroup *memcg; struct mmu_notifier_range range; + range.event = MMU_NOTIFY_CLEAR; range.start = vmf->address & PAGE_MASK; range.end = range.start + PAGE_SIZE; range.mm = mm; diff --git a/mm/migrate.c b/mm/migrate.c index 83a21ce98bba..878a7be673fc 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -2313,6 +2313,7 @@ static void migrate_vma_collect(struct migrate_vma *migrate) struct mmu_notifier_range range; struct mm_walk mm_walk; + range.event = MMU_NOTIFY_CLEAR; range.start = migrate->start; range.end = migrate->end; range.mm = migrate->vma->vm_mm; @@ -2733,6 +2734,7 @@ static void migrate_vma_pages(struct migrate_vma *migrate) if (!notified) { notified = true; + range.event = MMU_NOTIFY_CLEAR; range.start = addr; range.end = migrate->end; range.mm = mm; diff --git a/mm/mprotect.c b/mm/mprotect.c index f466adf31e12..6d41321b2f3e 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -186,6 +186,7 @@ static inline unsigned long change_pmd_range(struct vm_area_struct *vma, /* invoke the mmu notifier if the pmd is populated */ if 
(!range.start) { + range.event = MMU_NOTIFY_PROTECTION_VMA; range.start = addr; range.end = end; range.mm = mm; diff --git a/mm/mremap.c b/mm/mremap.c index 2c2bd3100591..932acee5c910 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -249,6 +249,7 @@ unsigned long move_page_tables(struct vm_area_struct *vma, old_end = old_addr + len; flush_cache_range(vma, old_addr, old_end); + range.event = MMU_NOTIFY_UNMAP; range.start = old_addr; range.end = old_end; range.mm = vma->vm_mm; diff --git a/mm/oom_kill.c b/mm/oom_kill.c index d84201afcc1b..c07aa82015f7 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -531,6 +531,7 @@ bool __oom_reap_task_mm(struct mm_struct *mm) struct mmu_notifier_range range; struct mmu_gather tlb; + range.event = MMU_NOTIFY_CLEAR; range.start = vma->vm_start; range.end = vma->vm_end; range.mm = mm; diff --git a/mm/rmap.c b/mm/rmap.c index bc553dce7b5b..3e33f4ffc1f6 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -896,6 +896,7 @@ static bool page_mkclean_one(struct page *page, struct vm_area_struct *vma, * We have to assume the worse case ie pmd for invalidation. Note that * the page can not be free from this function. */ + range.event = MMU_NOTIFY_PROTECTION_PAGE; range.mm = vma->vm_mm; range.start = address; range.end = min(vma->vm_end, range.start + @@ -1372,6 +1373,7 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma, * Note that the page can not be free in this function as call of * try_to_unmap() must hold a reference on the page. */ + range.event = MMU_NOTIFY_CLEAR; range.mm = vma->vm_mm; range.start = vma->vm_start; range.end = min(vma->vm_end, -- 2.50.1