.. SPDX-License-Identifier: GPL-2.0
 
-==================================
-Free some vmemmap pages of HugeTLB
-==================================
+=========================================
+A vmemmap diet for HugeTLB and Device DAX
+=========================================
+
+HugeTLB
+=======
 
 The struct page structures (page structs) are used to describe a physical
 page frame. By default, there is a one-to-one mapping from a page frame to
 its corresponding page struct. Once the tail vmemmap pages are remapped,
 however, there is more than one page struct with PG_head (e.g. 8 per 2 MB
 HugeTLB page) associated with each HugeTLB page. The compound_head() can
 handle this correctly (more details refer to the comment above
 compound_head()).
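+
+For reference, a tail page points at its head page through the low bit-tagged
+compound_head field. A minimal sketch of that lookup (head_of() is a
+hypothetical helper, not a kernel API, and the real compound_head() also
+handles the fake PG_head case mentioned above)::
+
+ static struct page *head_of(struct page *page)
+ {
+         unsigned long head = READ_ONCE(page->compound_head);
+
+         /* Bit 0 set: tail page, and head - 1 is the head page pointer. */
+         if (head & 1)
+                 return (struct page *)(head - 1);
+         return page;
+ }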
+
+Device DAX
+==========
+
+The device-dax interface uses the same tail deduplication technique explained
+in the previous chapter, except when used with the vmemmap in
+the device (altmap).
+
+The following page sizes are supported in DAX: PAGE_SIZE (4K on x86_64),
+PMD_SIZE (2M on x86_64) and PUD_SIZE (1G on x86_64).
+
+The differences with HugeTLB are relatively minor.
+
+It only uses 3 page structs for storing all information as opposed
+to 4 on HugeTLB pages.
+
+There's no remapping of vmemmap given that device-dax memory is not part of
+System RAM ranges initialized at boot. Thus the tail page deduplication
+happens at a later stage when we populate the sections. HugeTLB reuses the
+head vmemmap page, whereas device-dax reuses the tail vmemmap page. This
+results in only half of the savings compared to HugeTLB.
+
+Deduplicated tail pages are not mapped read-only.
+
+Here's how things look on device-dax after the sections are populated::
+
+ +-----------+ ---virt_to_page---> +-----------+   mapping to   +-----------+
+ |           |                     |     0     | -------------> |     0     |
+ |           |                     +-----------+                +-----------+
+ |           |                     |     1     | -------------> |     1     |
+ |           |                     +-----------+                +-----------+
+ |           |                     |     2     | ----------------^ ^ ^ ^ ^ ^
+ |           |                     +-----------+                   | | | | |
+ |           |                     |     3     | ------------------+ | | | |
+ |           |                     +-----------+                     | | | |
+ |           |                     |     4     | --------------------+ | | |
+ |    PMD    |                     +-----------+                       | | |
+ |   level   |                     |     5     | ----------------------+ | |
+ |  mapping  |                     +-----------+                         | |
+ |           |                     |     6     | ------------------------+ |
+ |           |                     +-----------+                           |
+ |           |                     |     7     | --------------------------+
+ |           |                     +-----------+
+ |           |
+ |           |
+ |           |
+ +-----------+
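+
+A ZONE_DEVICE user opts into compound pages, and therefore into this tail
+page deduplication, through the vmemmap_shift field of its dev_pagemap. A
+minimal sketch of such a setup (my_dev, my_range_start and my_range_end are
+hypothetical names; device-dax derives the shift from its configured
+alignment)::
+
+ struct dev_pagemap *pgmap = &my_dev->pgmap;
+ void *vaddr;
+
+ pgmap->type = MEMORY_DEVICE_GENERIC;
+ pgmap->nr_range = 1;
+ pgmap->range = (struct range) {
+         .start = my_range_start,
+         .end   = my_range_end,
+ };
+ /* e.g. 2M compound pages on x86_64: 512 base pages per compound page */
+ pgmap->vmemmap_shift = PMD_SHIFT - PAGE_SHIFT;
+
+ vaddr = devm_memremap_pages(&my_dev->dev, pgmap);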
 
 }
 
 pte_t * __meminit vmemmap_pte_populate(pmd_t *pmd, unsigned long addr, int node,
-                                      struct vmem_altmap *altmap)
+                                      struct vmem_altmap *altmap,
+                                      struct page *reuse)
 {
        pte_t *pte = pte_offset_kernel(pmd, addr);
        if (pte_none(*pte)) {
                pte_t entry;
                void *p;
 
-               p = vmemmap_alloc_block_buf(PAGE_SIZE, node, altmap);
-               if (!p)
-                       return NULL;
+               if (!reuse) {
+                       p = vmemmap_alloc_block_buf(PAGE_SIZE, node, altmap);
+                       if (!p)
+                               return NULL;
+               } else {
+                       /*
+                        * When a PTE/PMD entry is freed from the init_mm
+                        * there's a free_pages() call to this page allocated
+                        * above. Thus this get_page() is paired with the
+                        * put_page_testzero() on the freeing path.
+                        * This can only be called by certain ZONE_DEVICE paths,
+                        * and through vmemmap_populate_compound_pages() when
+                        * slab is available.
+                        */
+                       get_page(reuse);
+                       p = page_to_virt(reuse);
+               }
                entry = pfn_pte(__pa(p) >> PAGE_SHIFT, PAGE_KERNEL);
                set_pte_at(&init_mm, addr, pte, entry);
        }
 }
 
 static pte_t * __meminit vmemmap_populate_address(unsigned long addr, int node,
-                                             struct vmem_altmap *altmap)
+                                             struct vmem_altmap *altmap,
+                                             struct page *reuse)
 {
        pgd_t *pgd;
        p4d_t *p4d;
        pmd = vmemmap_pmd_populate(pud, addr, node);
        if (!pmd)
                return NULL;
-       pte = vmemmap_pte_populate(pmd, addr, node, altmap);
+       pte = vmemmap_pte_populate(pmd, addr, node, altmap, reuse);
        if (!pte)
                return NULL;
        vmemmap_verify(pte, node, addr, addr + PAGE_SIZE);
 
 static int __meminit vmemmap_populate_range(unsigned long start,
                                            unsigned long end, int node,
-                                           struct vmem_altmap *altmap)
+                                           struct vmem_altmap *altmap,
+                                           struct page *reuse)
 {
        unsigned long addr = start;
        pte_t *pte;
 
        for (; addr < end; addr += PAGE_SIZE) {
-               pte = vmemmap_populate_address(addr, node, altmap);
+               pte = vmemmap_populate_address(addr, node, altmap, reuse);
                if (!pte)
                        return -ENOMEM;
        }
 int __meminit vmemmap_populate_basepages(unsigned long start, unsigned long end,
                                         int node, struct vmem_altmap *altmap)
 {
-       return vmemmap_populate_range(start, end, node, altmap);
+       return vmemmap_populate_range(start, end, node, altmap, NULL);
+}
+
+/*
+ * For compound pages bigger than section size (e.g. x86 1G compound
+ * pages with 2M subsection size) fill the rest of sections as tail
+ * pages.
+ *
+ * Note that memremap_pages() resets @nr_range value and will increment
+ * it after each successful range onlining. Thus the value of @nr_range
+ * at section memmap populate corresponds to the in-progress range
+ * being onlined here.
+ */
+static bool __meminit reuse_compound_section(unsigned long start_pfn,
+                                            struct dev_pagemap *pgmap)
+{
+       unsigned long nr_pages = pgmap_vmemmap_nr(pgmap);
+       unsigned long offset = start_pfn -
+               PHYS_PFN(pgmap->ranges[pgmap->nr_range].start);
+
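+       /*
+        * Reuse is only possible when the compound page spans more than one
+        * subsection and this pfn range does not start a new compound page,
+        * i.e. the head and first tail vmemmap pages were already populated
+        * for an earlier part of the same compound page.
+        */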
+       return !IS_ALIGNED(offset, nr_pages) && nr_pages > PAGES_PER_SUBSECTION;
+}
+
+static pte_t * __meminit compound_section_tail_page(unsigned long addr)
+{
+       pte_t *pte;
+
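+       /* Step back to the last vmemmap page of the previous range. */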
+       addr -= PAGE_SIZE;
+
+       /*
+        * Assuming sections are populated sequentially, the previous section's
+        * page data can be reused.
+        */
+       pte = pte_offset_kernel(pmd_off_k(addr), addr);
+       if (!pte)
+               return NULL;
+
+       return pte;
+}
+
+static int __meminit vmemmap_populate_compound_pages(unsigned long start_pfn,
+                                                    unsigned long start,
+                                                    unsigned long end, int node,
+                                                    struct dev_pagemap *pgmap)
+{
+       unsigned long size, addr;
+       pte_t *pte;
+       int rc;
+
+       if (reuse_compound_section(start_pfn, pgmap)) {
+               pte = compound_section_tail_page(start);
+               if (!pte)
+                       return -ENOMEM;
+
+               /*
+                * Reuse the page that was populated in the prior iteration
+                * with just tail struct pages.
+                */
+               return vmemmap_populate_range(start, end, node, NULL,
+                                             pte_page(*pte));
+       }
+
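+       /*
+        * Bytes of vmemmap describing one compound page, capped at the
+        * extent of the range being populated.
+        */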
+       size = min(end - start, pgmap_vmemmap_nr(pgmap) * sizeof(struct page));
+       for (addr = start; addr < end; addr += size) {
+               unsigned long next = addr, last = addr + size;
+
+               /* Populate the head page vmemmap page */
+               pte = vmemmap_populate_address(addr, node, NULL, NULL);
+               if (!pte)
+                       return -ENOMEM;
+
+               /* Populate the tail pages vmemmap page */
+               next = addr + PAGE_SIZE;
+               pte = vmemmap_populate_address(next, node, NULL, NULL);
+               if (!pte)
+                       return -ENOMEM;
+
+               /*
+                * Reuse the previous page for the rest of tail pages
+                * See layout diagram in Documentation/vm/vmemmap_dedup.rst
+                */
+               next += PAGE_SIZE;
+               rc = vmemmap_populate_range(next, last, node, NULL,
+                                           pte_page(*pte));
+               if (rc)
+                       return -ENOMEM;
+       }
+
+       return 0;
 }
 
 struct page * __meminit __populate_section_memmap(unsigned long pfn,
 {
        unsigned long start = (unsigned long) pfn_to_page(pfn);
        unsigned long end = start + nr_pages * sizeof(struct page);
+       int r;
 
        if (WARN_ON_ONCE(!IS_ALIGNED(pfn, PAGES_PER_SUBSECTION) ||
                !IS_ALIGNED(nr_pages, PAGES_PER_SUBSECTION)))
                return NULL;
 
-       if (vmemmap_populate(start, end, nid, altmap))
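+       /*
+        * Reusing vmemmap pages for a compound devmap requires that
+        * sizeof(struct page) is a power of 2, so a compound page's struct
+        * pages stay naturally aligned to vmemmap pages. It is also skipped
+        * when the vmemmap sits in the device (altmap), as described in
+        * Documentation/vm/vmemmap_dedup.rst.
+        */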
+       if (is_power_of_2(sizeof(struct page)) &&
+           pgmap && pgmap_vmemmap_nr(pgmap) > 1 && !altmap)
+               r = vmemmap_populate_compound_pages(pfn, start, end, nid, pgmap);
+       else
+               r = vmemmap_populate(start, end, nid, altmap);
+
+       if (r < 0)
                return NULL;
 
        return pfn_to_page(pfn);