mm: handling Non-LRU pages returned by vm_normal_pages
author    Alex Sierra <alex.sierra@amd.com>
          Tue, 31 May 2022 20:00:30 +0000 (15:00 -0500)
committer Liam R. Howlett <Liam.Howlett@oracle.com>
          Wed, 20 Jul 2022 00:15:01 +0000 (20:15 -0400)
With DEVICE_COHERENT, we'll soon have vm_normal_page() return
device-managed anonymous pages that are not LRU pages.  Although they
behave like normal pages for purposes of mapping in CPU page tables and
for COW, they do not support LRU lists, NUMA migration or THP.
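
As a rough caller-side sketch of the pattern the hunks below apply inside
a pte-range loop (vma, addr and pte are assumed to come from the usual
page table walk):

	struct page *page;

	page = vm_normal_page(vma, addr, pte);
	/* NULL or a device-coherent page: skip, it is not on any LRU list. */
	if (!page || is_zone_device_page(page))
		continue;
	/* An ordinary LRU page: LRU, NUMA and THP handling are safe. */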

We also introduce a FOLL_LRU flag that adds the same filtering to
follow_page() and related APIs, to allow callers to specify that they
expect to put the returned pages on an LRU list.
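
For illustration, a follow_page() caller that expects an LRU page passes
FOLL_LRU and handles the error pointer: with FOLL_LRU set, a present but
non-LRU (device) page is reported as ERR_PTR(-EEXIST).  A sketch modelled
on the converted callers below:

	page = follow_page(vma, addr, FOLL_GET | FOLL_DUMP | FOLL_LRU);
	if (IS_ERR_OR_NULL(page))
		goto out;	/* covers ERR_PTR(-EEXIST) for non-LRU pages */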

Link: https://lkml.kernel.org/r/20220531200041.24904-3-alex.sierra@amd.com
Signed-off-by: Alex Sierra <alex.sierra@amd.com>
Acked-by: Felix Kuehling <Felix.Kuehling@amd.com>
Reviewed-by: Alistair Popple <apopple@nvidia.com>
Cc: Christoph Hellwig <hch@lst.de>
Cc: David Hildenbrand <david@redhat.com>
Cc: Jason Gunthorpe <jgg@nvidia.com>
Cc: Jerome Glisse <jglisse@redhat.com>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Ralph Campbell <rcampbell@nvidia.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
12 files changed:
fs/proc/task_mmu.c
include/linux/mm.h
mm/gup.c
mm/huge_memory.c
mm/khugepaged.c
mm/ksm.c
mm/madvise.c
mm/memory.c
mm/mempolicy.c
mm/migrate.c
mm/mlock.c
mm/mprotect.c

index f9c9abb50bb73f4d30c730f7877a56e1de1808c6..37ccb5c9f4f88bd38e6013efc6d2170a586f8137 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -1800,7 +1800,7 @@ static struct page *can_gather_numa_stats(pte_t pte, struct vm_area_struct *vma,
                return NULL;
 
        page = vm_normal_page(vma, addr, pte);
-       if (!page)
+       if (!page || is_zone_device_page(page))
                return NULL;
 
        if (PageReserved(page))
index 7f2d3ec50718a274e5597155f94b515ed314559e..62cf14d3295cf9ded4e28e619a7c9b2f01d651bc 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -601,7 +601,7 @@ struct vm_operations_struct {
 #endif
        /*
         * Called by vm_normal_page() for special PTEs to find the
-        * page for @addr.  This is useful if the default behavior
+        * page for @addr. This is useful if the default behavior
         * (using pte_page()) would not find the correct page.
         */
        struct page *(*find_special_page)(struct vm_area_struct *vma,
@@ -2950,6 +2950,7 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address,
 #define FOLL_NUMA      0x200   /* force NUMA hinting page fault */
 #define FOLL_MIGRATION 0x400   /* wait for page to replace migration entry */
 #define FOLL_TRIED     0x800   /* a retry, previous pass started an IO */
+#define FOLL_LRU        0x1000  /* return only LRU (anon or page cache) */
 #define FOLL_REMOTE    0x2000  /* we are working on non-current tsk/mm */
 #define FOLL_COW       0x4000  /* internal GUP flag */
 #define FOLL_ANON      0x8000  /* don't do file mappings */
index 4ec77f68c6c1157edb76ddf3c0e8f9a4acba3c72..67cc836eb1b52c55b7f33408459d571538e3b265 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -532,7 +532,11 @@ retry:
        }
 
        page = vm_normal_page(vma, address, pte);
-       if (!page && pte_devmap(pte) && (flags & (FOLL_GET | FOLL_PIN))) {
+       if ((flags & FOLL_LRU) && ((page && is_zone_device_page(page)) ||
+           (!page && pte_devmap(pte)))) {
+               page = ERR_PTR(-EEXIST);
+               goto out;
+       } else if (!page && pte_devmap(pte) && (flags & (FOLL_GET | FOLL_PIN))) {
                /*
                 * Only return device mapping pages in the FOLL_GET or FOLL_PIN
                 * case since they are only valid while holding the pgmap
index f44ffd3bbfae0d65146b70d4aa2da77b64308776..41264755c1b6a68e21b79e5b31ffbf5c1fc8f15f 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -2905,7 +2905,7 @@ static int split_huge_pages_pid(int pid, unsigned long vaddr_start,
                }
 
                /* FOLL_DUMP to ignore special (like zero) pages */
-               page = follow_page(vma, addr, FOLL_GET | FOLL_DUMP);
+               page = follow_page(vma, addr, FOLL_GET | FOLL_DUMP | FOLL_LRU);
 
                if (IS_ERR(page))
                        continue;
index 637bfecd6bf57f642ae6b7ed208ac895301d9f93..476d7936010173e9cd67ce8e4dd33d89d2daf111 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -618,7 +618,7 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
                        goto out;
                }
                page = vm_normal_page(vma, address, pteval);
-               if (unlikely(!page)) {
+               if (unlikely(!page) || unlikely(is_zone_device_page(page))) {
                        result = SCAN_PAGE_NULL;
                        goto out;
                }
@@ -1267,7 +1267,7 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
                        writable = true;
 
                page = vm_normal_page(vma, _address, pteval);
-               if (unlikely(!page)) {
+               if (unlikely(!page) || unlikely(is_zone_device_page(page))) {
                        result = SCAN_PAGE_NULL;
                        goto out_unmap;
                }
@@ -1479,7 +1479,8 @@ void collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr)
                        goto abort;
 
                page = vm_normal_page(vma, addr, *pte);
-
+               if (WARN_ON_ONCE(page && is_zone_device_page(page)))
+                       page = NULL;
                /*
                 * Note that uprobe, debugger, or MAP_PRIVATE may change the
                 * page table, but the new page will not be a subpage of hpage.
@@ -1497,6 +1498,8 @@ void collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr)
                if (pte_none(*pte))
                        continue;
                page = vm_normal_page(vma, addr, *pte);
+               if (WARN_ON_ONCE(page && is_zone_device_page(page)))
+                       goto abort;
                page_remove_rmap(page, vma, false);
        }
 
index 4e3d794f6b098d165f9784bf2254f4fbab610f7a..a5cd341f7870eb450597e2da298d0600b7936fcf 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -474,7 +474,7 @@ static int break_ksm(struct vm_area_struct *vma, unsigned long addr)
        do {
                cond_resched();
                page = follow_page(vma, addr,
-                               FOLL_GET | FOLL_MIGRATION | FOLL_REMOTE);
+                               FOLL_GET | FOLL_MIGRATION | FOLL_REMOTE | FOLL_LRU);
                if (IS_ERR_OR_NULL(page))
                        break;
                if (PageKsm(page))
@@ -559,7 +559,7 @@ static struct page *get_mergeable_page(struct rmap_item *rmap_item)
        if (!vma)
                goto out;
 
-       page = follow_page(vma, addr, FOLL_GET);
+       page = follow_page(vma, addr, FOLL_GET | FOLL_LRU);
        if (IS_ERR_OR_NULL(page))
                goto out;
        if (PageAnon(page)) {
@@ -2310,7 +2310,7 @@ next_mm:
                while (ksm_scan.address < vma->vm_end) {
                        if (ksm_test_exit(mm))
                                break;
-                       *page = follow_page(vma, ksm_scan.address, FOLL_GET);
+                       *page = follow_page(vma, ksm_scan.address, FOLL_GET | FOLL_LRU);
                        if (IS_ERR_OR_NULL(*page)) {
                                ksm_scan.address += PAGE_SIZE;
                                cond_resched();
index 12e8010c766c56ef81c92b42596b23f6ce9366db..7a8af04069b3b9d2d712622fbc1515558006ccb0 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -421,7 +421,7 @@ regular_page:
                        continue;
 
                page = vm_normal_page(vma, addr, ptent);
-               if (!page)
+               if (!page || is_zone_device_page(page))
                        continue;
 
                /*
@@ -639,7 +639,7 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr,
                }
 
                page = vm_normal_page(vma, addr, ptent);
-               if (!page)
+               if (!page || is_zone_device_page(page))
                        continue;
 
                /*
index fdfc119f181d6fe74eaf5210cc653bfb768e8f2b..c905fabddeec8d424d458a16b334c34d97c990dd 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -633,6 +633,13 @@ struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
                if (is_zero_pfn(pfn))
                        return NULL;
                if (pte_devmap(pte))
+/*
+ * NOTE: New users of ZONE_DEVICE will not set pte_devmap() and will have
+ * refcounts incremented on their struct pages when they are inserted into
+ * PTEs, thus they are safe to return here. Legacy ZONE_DEVICE pages that set
+ * pte_devmap() do not have refcounts. Example of legacy ZONE_DEVICE is
+ * MEMORY_DEVICE_FS_DAX type in pmem or virtio_fs drivers.
+ */
                        return NULL;
 
                print_bad_pte(vma, addr, pte, NULL);
@@ -4708,7 +4715,7 @@ static vm_fault_t do_numa_page(struct vm_fault *vmf)
        pte = pte_modify(old_pte, vma->vm_page_prot);
 
        page = vm_normal_page(vma, vmf->address, pte);
-       if (!page)
+       if (!page || is_zone_device_page(page))
                goto out_map;
 
        /* TODO: handle PTE-mapped THP */
index 201b043e96f111f5f3c382b2ec0d4a99d6e28c9a..b879a326bfb7155b07f4cfe67fc498223dc146ad 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -524,7 +524,7 @@ static int queue_pages_pte_range(pmd_t *pmd, unsigned long addr,
                if (!pte_present(*pte))
                        continue;
                page = vm_normal_page(vma, addr, *pte);
-               if (!page)
+               if (!page || is_zone_device_page(page))
                        continue;
                /*
                 * vm_normal_page() filters out zero pages, but there might
index d49797b3ad800abe678add32fe25cd31581dd838..085c8f10fb79ff1b4663fb275b58beb60e08d0aa 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -1622,7 +1622,7 @@ static int add_page_for_migration(struct mm_struct *mm, unsigned long addr,
                goto out;
 
        /* FOLL_DUMP to ignore special (like zero) pages */
-       page = follow_page(vma, addr, FOLL_GET | FOLL_DUMP);
+       page = follow_page(vma, addr, FOLL_GET | FOLL_DUMP | FOLL_LRU);
 
        err = PTR_ERR(page);
        if (IS_ERR(page))
@@ -1814,7 +1814,7 @@ static void do_pages_stat_array(struct mm_struct *mm, unsigned long nr_pages,
                        goto set_status;
 
                /* FOLL_DUMP to ignore special (like zero) pages */
-               page = follow_page(vma, addr, FOLL_GET | FOLL_DUMP);
+               page = follow_page(vma, addr, FOLL_GET | FOLL_DUMP | FOLL_LRU);
 
                err = PTR_ERR(page);
                if (IS_ERR(page))
index c41604ba5197d80c15716d8cd53d608d611359bc..43d19a1f28eb37cbc66a309e83a4fb31cff07019 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -333,7 +333,7 @@ static int mlock_pte_range(pmd_t *pmd, unsigned long addr,
                if (!pte_present(*pte))
                        continue;
                page = vm_normal_page(vma, addr, *pte);
-               if (!page)
+               if (!page || is_zone_device_page(page))
                        continue;
                if (PageTransCompound(page))
                        continue;
index 27fb652151e4b4577343c525a463b375d09cc00e..f7b31657d1daffb6797dda6d20a6313758a924c2 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -95,7 +95,7 @@ static unsigned long change_pte_range(struct mmu_gather *tlb,
                                        continue;
 
                                page = vm_normal_page(vma, addr, oldpte);
-                               if (!page || PageKsm(page))
+                               if (!page || is_zone_device_page(page) || PageKsm(page))
                                        continue;
 
                                /* Also skip shared copy-on-write pages */