www.infradead.org Git - users/jedix/linux-maple.git/commitdiff
sparc64: Add 16GB hugepage support
author Nitin Gupta <nitin.m.gupta@oracle.com>
Fri, 5 May 2017 01:28:46 +0000 (18:28 -0700)
committer Allen Pais <allen.pais@oracle.com>
Fri, 16 Jun 2017 12:38:47 +0000 (18:08 +0530)
Add support for the 16GB hugepage size. To use this page size,
pass the following kernel parameters:

default_hugepagesz=16G hugepagesz=16G hugepages=10

Testing:

Tested with the stream benchmark, which allocates 48G of
arrays backed by 16G hugepages and performs read/write
operations on them in parallel.

Orabug: 25858371

Signed-off-by: Nitin Gupta <nitin.m.gupta@oracle.com>
Signed-off-by: Allen Pais <allen.pais@oracle.com>
arch/sparc/include/asm/page_64.h
arch/sparc/include/asm/pgtable_64.h
arch/sparc/include/asm/tsb.h
arch/sparc/kernel/tsb.S
arch/sparc/mm/gup.c
arch/sparc/mm/hugetlbpage.c
arch/sparc/mm/init_64.c

index e04f805f208c93459027e76320b6b7500639ced5..e45aba4254f04d736198a03121ecdc82948b2b7c 100644 (file)
@@ -17,6 +17,7 @@
 
 #define HPAGE_SHIFT            23
 #define REAL_HPAGE_SHIFT       22
+#define HPAGE_16GB_SHIFT       34
 #define HPAGE_2GB_SHIFT                31
 #define HPAGE_256MB_SHIFT      28
 #define HPAGE_64K_SHIFT                16
@@ -28,7 +29,7 @@
 #define HUGETLB_PAGE_ORDER     (HPAGE_SHIFT - PAGE_SHIFT)
 #define HAVE_ARCH_HUGETLB_UNMAPPED_AREA
 #define REAL_HPAGE_PER_HPAGE   (_AC(1,UL) << (HPAGE_SHIFT - REAL_HPAGE_SHIFT))
-#define HUGE_MAX_HSTATE                4
+#define HUGE_MAX_HSTATE                5
 #endif
 
 #ifndef __ASSEMBLY__
index 0377f573df6968aaa0e4a0b058de3704510c4595..2adb60264bce8137d5b286a63f2112361be3a1dd 100644 (file)
@@ -431,6 +431,11 @@ static inline bool is_hugetlb_pmd(pmd_t pmd)
        return !!(pmd_val(pmd) & _PAGE_PMD_HUGE);
 }
 
+/* Test whether @pud has the _PAGE_PUD_HUGE bit set; the !! collapses
+ * the masked value to a bool.
+ */
+static inline bool is_hugetlb_pud(pud_t pud)
+{
+       return !!(pud_val(pud) & _PAGE_PUD_HUGE);
+}
+
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
 static inline pmd_t pmd_mkhuge(pmd_t pmd)
 {
@@ -716,6 +721,8 @@ static inline unsigned long pmd_write(pmd_t pmd)
        return pte_write(pte);
 }
 
+#define pud_write(pud) pte_write(__pte(pud_val(pud)))
+
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
 static inline unsigned long pmd_dirty(pmd_t pmd)
 {
@@ -866,10 +873,19 @@ static inline unsigned long __pmd_page(pmd_t pmd)
        return ((unsigned long) __va(pfn << PAGE_SHIFT));
 }
 
+/* Return the kernel virtual address for the page frame encoded in
+ * @pud.  The raw PUD value is reinterpreted as a PTE so that
+ * pte_pfn() can extract the frame number; the frame number is then
+ * shifted by PAGE_SHIFT and converted with __va().
+ */
+static inline unsigned long pud_page_vaddr(pud_t pud)
+{
+       pte_t pte = __pte(pud_val(pud));
+       unsigned long pfn;
+
+       pfn = pte_pfn(pte);
+
+       return ((unsigned long) __va(pfn << PAGE_SHIFT));
+}
+
 #define pmd_page(pmd)                  virt_to_page((void *)__pmd_page(pmd))
 #define pmd_clear(pmdp)                        (pmd_val(*(pmdp)) = 0UL)
-#define pud_page_vaddr(pud)            \
-       ((unsigned long) __va(pud_val(pud)))
+
 #define pud_page(pud)                  virt_to_page((void *)pud_page_vaddr(pud))
 #define pud_present(pud)               (pud_val(pud) != 0U)
 #define pud_clear(pudp)                        (pud_val(*(pudp)) = 0UL)
index d2e79744d901842ed435f8e9098d39be74d81741..cc79862cacf9064640c38ec28f2526771169831d 100644 (file)
@@ -200,6 +200,35 @@ extern struct tsb_phys_patch_entry __tsb_phys_patch, __tsb_phys_patch_end;
         nop; \
 699:
 
+       /* PUD has been loaded into REG1, interpret the value, seeing
+        * if it is a HUGE PUD or a normal one.  If it is not valid
+        * then jump to FAIL_LABEL.  If it is a HUGE PUD, and it
+        * translates to a valid PTE, branch to PTE_LABEL.
+        *
+        * We have to propagate bits [32:22] from the virtual address
+        * to resolve at 4M granularity; mask = 0xffe00000 << 1.
+        */
+#if defined(CONFIG_HUGETLB_PAGE) || defined(CONFIG_TRANSPARENT_HUGEPAGE)
+#define USER_PGTABLE_CHECK_PUD_HUGE(VADDR, REG1, REG2, FAIL_LABEL, PTE_LABEL) \
+       brz,pn          REG1, FAIL_LABEL;               \
+        sethi          %uhi(_PAGE_PUD_HUGE), REG2;     \
+       sllx            REG2, 32, REG2;                 \
+       andcc           REG1, REG2, %g0;                \
+       be,pt           %xcc, 700f;                     \
+        sethi          %hi(0xffe00000), REG2;          \
+       sllx            REG2, 1, REG2;                  \
+       brgez,pn        REG1, FAIL_LABEL;               \
+        andn           REG1, REG2, REG1;               \
+       and             VADDR, REG2, REG2;              \
+       brlz,pt         REG1, PTE_LABEL;                \
+        or             REG1, REG2, REG1;               \
+700:
+#else
+#define USER_PGTABLE_CHECK_PUD_HUGE(VADDR, REG1, REG2, FAIL_LABEL, PTE_LABEL) \
+       brz,pn          REG1, FAIL_LABEL; \
+        nop;
+#endif
+
        /* PMD has been loaded into REG1, interpret the value, seeing
         * if it is a HUGE PMD or a normal one.  If it is not valid
         * then jump to FAIL_LABEL.  If it is a HUGE PMD, and it
@@ -252,6 +281,7 @@ extern struct tsb_phys_patch_entry __tsb_phys_patch, __tsb_phys_patch_end;
        srlx            REG2, 64 - PAGE_SHIFT, REG2; \
        andn            REG2, 0x7, REG2; \
        ldxa            [REG1 + REG2] ASI_PHYS_USE_EC, REG1; \
+       USER_PGTABLE_CHECK_PUD_HUGE(VADDR, REG1, REG2, FAIL_LABEL, 800f) \
        brz,pn          REG1, FAIL_LABEL; \
         sllx           VADDR, 64 - (PMD_SHIFT + PMD_BITS), REG2; \
        srlx            REG2, 64 - PAGE_SHIFT, REG2; \
index 07c0df92496034efd1262dd2b40e56ffd5486c0c..5f42ac099fcb72e5d19c84546c60fa9ccdfa80dd 100644 (file)
@@ -117,7 +117,7 @@ tsb_miss_page_table_walk_sun4v_fastpath:
        /* Valid PTE is now in %g5.  */
 
 #if defined(CONFIG_HUGETLB_PAGE) || defined(CONFIG_TRANSPARENT_HUGEPAGE)
-       sethi           %uhi(_PAGE_PMD_HUGE), %g7
+       sethi           %uhi(_PAGE_PMD_HUGE | _PAGE_PUD_HUGE), %g7
        sllx            %g7, 32, %g7
 
        andcc           %g5, %g7, %g0
index 9c962f905a9eda01af460208cfb2bf7eea54c581..408fcbbe7fec14862e8637407fae8c5f84180aa8 100644 (file)
@@ -83,6 +83,8 @@ static int gup_huge_pmd(pmd_t *pmdp, pmd_t pmd, unsigned long addr,
        refs = 0;
        head = pmd_page(pmd);
        page = head + ((addr & ~PMD_MASK) >> PAGE_SHIFT);
+       if (PageTail(head))
+               head = compound_head(head);
        tail = page;
        do {
                VM_BUG_ON(compound_head(page) != head);
@@ -116,6 +118,57 @@ static int gup_huge_pmd(pmd_t *pmdp, pmd_t pmd, unsigned long addr,
        return 1;
 }
 
+/* Collect the pages backing [addr, end) from a huge PUD mapping.
+ *
+ * @pudp:  pointer to the live PUD entry, re-read below to detect changes
+ * @pud:   snapshot of the PUD entry taken by the caller
+ * @addr:  first virtual address to cover
+ * @end:   address at which to stop (exclusive)
+ * @write: non-zero if the caller needs write access
+ * @pages: output array; one entry is stored per PAGE_SIZE step
+ * @nr:    in/out index into @pages, advanced once per stored page
+ *
+ * Returns 1 on success.  Returns 0, with *nr restored to its value on
+ * entry, when the PUD is not valid, write access is requested but the
+ * PUD is not writable, the speculative reference cannot be taken, or
+ * the PUD changed while the pages were being collected.
+ */
+static int gup_huge_pud(pud_t *pudp, pud_t pud, unsigned long addr,
+                       unsigned long end, int write, struct page **pages,
+                       int *nr)
+{
+       struct page *head, *page, *tail;
+       int refs;
+
+       if (!(pud_val(pud) & _PAGE_VALID))
+               return 0;
+
+       if (write && !pud_write(pud))
+               return 0;
+
+       refs = 0;
+       head = pud_page(pud);
+       /* First page to return: offset of addr within the PUD, in pages. */
+       page = head + ((addr & ~PUD_MASK) >> PAGE_SHIFT);
+       /* pud_page() may itself be a tail page; references go on the
+        * compound head.
+        */
+       if (PageTail(head))
+               head = compound_head(head);
+       tail = page;
+       do {
+               VM_BUG_ON(compound_head(page) != head);
+               pages[*nr] = page;
+               (*nr)++;
+               page++;
+               refs++;
+       } while (addr += PAGE_SIZE, addr != end);
+
+       /* Take all refs on the head in one go; on failure forget the
+        * pages recorded above.
+        */
+       if (!page_cache_add_speculative(head, refs)) {
+               *nr -= refs;
+               return 0;
+       }
+
+       /* The PUD changed since the caller's snapshot: drop every
+        * reference just taken and report failure.
+        */
+       if (unlikely(pud_val(pud) != pud_val(*pudp))) {
+               *nr -= refs;
+               while (refs--)
+                       put_page(head);
+               return 0;
+       }
+
+       /* Any tail page need their mapcount reference taken before we
+        * return.
+        */
+       while (refs--) {
+               if (PageTail(tail))
+                       get_huge_page_tail(tail);
+               tail++;
+       }
+
+       return 1;
+}
+
 static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end,
                int write, struct page **pages, int *nr)
 {
@@ -154,7 +207,11 @@ static int gup_pud_range(p4d_t p4d, unsigned long addr, unsigned long end,
                next = pud_addr_end(addr, end);
                if (pud_none(pud))
                        return 0;
-               if (!gup_pmd_range(pud, addr, next, write, pages, nr))
+               if (unlikely(pud_large(pud))) {
+                       if (!gup_huge_pud(pudp, pud, addr, next,
+                                         write, pages, nr))
+                               return 0;
+               } else if (!gup_pmd_range(pud, addr, next, write, pages, nr))
                        return 0;
        } while (pudp++, addr = next, addr != end);
 
index 3eab5ac44b2fa8e9e402f6e629a7aa5ba38ee656..a8e04c70c07d90ed9ff72a2bd3a7bc50a4235e32 100644 (file)
@@ -147,6 +147,10 @@ static pte_t sun4v_hugepage_shift_to_tte(pte_t entry, unsigned int shift)
        pte_val(entry) = pte_val(entry) & ~_PAGE_SZALL_4V;
 
        switch (shift) {
+       case HPAGE_16GB_SHIFT:
+               hugepage_size = _PAGE_SZ16GB_4V;
+               pte_val(entry) |= _PAGE_PUD_HUGE;
+               break;
        case HPAGE_2GB_SHIFT:
                hugepage_size = _PAGE_SZ2GB_4V;
                pte_val(entry) |= _PAGE_PMD_HUGE;
@@ -199,6 +203,9 @@ static unsigned int sun4v_huge_tte_to_shift(pte_t entry)
        unsigned int shift;
 
        switch (tte_szbits) {
+       case _PAGE_SZ16GB_4V:
+               shift = HPAGE_16GB_SHIFT;
+               break;
        case _PAGE_SZ2GB_4V:
                shift = HPAGE_2GB_SHIFT;
                break;
@@ -272,25 +279,22 @@ pte_t *huge_pte_alloc(struct mm_struct *mm,
        p4d_t *p4d;
        pud_t *pud;
        pmd_t *pmd;
-       pte_t *pte = NULL;
 
        pgd = pgd_offset(mm, addr);
        p4d = p4d_alloc(mm, pgd, addr);
-       if (p4d) {
-               pud = pud_alloc(mm, p4d, addr);
-               if (pud) {
-                       pmd = pmd_alloc(mm, pud, addr);
-                       if (!pmd)
-                               return NULL;
-
-                       if (sz >= PMD_SIZE)
-                               pte = (pte_t *)pmd;
-                       else
-                               pte = pte_alloc_map(mm, NULL, pmd, addr);
-               }
-       }
-
-       return pte;
+       if (!p4d)
+               return NULL;
+       pud = pud_alloc(mm, p4d, addr);
+       if (!pud)
+               return NULL;
+       if (sz >= PUD_SIZE)
+               return (pte_t *)pud;
+       pmd = pmd_alloc(mm, pud, addr);
+       if (!pmd)
+               return NULL;
+       if (sz >= PMD_SIZE)
+               return (pte_t *)pmd;
+       return pte_alloc_map(mm, NULL, pmd, addr);
 }
 
 pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr)
@@ -299,37 +303,43 @@ pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr)
        p4d_t *p4d;
        pud_t *pud;
        pmd_t *pmd;
-       pte_t *pte = NULL;
 
        pgd = pgd_offset(mm, addr);
-       if (!pgd_none(*pgd)) {
-               p4d = p4d_offset(pgd, addr);
-               if (!p4d_none(*p4d)) {
-                       pud = pud_offset(p4d, addr);
-                       if (!pud_none(*pud)) {
-                               pmd = pmd_offset(pud, addr);
-                               if (!pmd_none(*pmd)) {
-                                       if (is_hugetlb_pmd(*pmd))
-                                               pte = (pte_t *)pmd;
-                                       else
-                                               pte = pte_offset_map(pmd, addr);
-                               }
-                       }
-               }
-       }
-
-       return pte;
+       if (pgd_none(*pgd))
+               return NULL;
+       p4d = p4d_offset(pgd, addr);
+       if (p4d_none(*p4d))
+               return NULL;
+       pud = pud_offset(p4d, addr);
+       if (pud_none(*pud))
+               return NULL;
+       if (is_hugetlb_pud(*pud))
+               return (pte_t *)pud;
+       pmd = pmd_offset(pud, addr);
+       if (pmd_none(*pmd))
+               return NULL;
+       if (is_hugetlb_pmd(*pmd))
+               return (pte_t *)pmd;
+       return pte_offset_map(pmd, addr);
 }
 
 void set_huge_pte_at(struct mm_struct *mm, unsigned long addr,
                     pte_t *ptep, pte_t entry)
 {
-       unsigned int i, nptes, orig_shift, shift;
-       unsigned long size;
+       unsigned int nptes, orig_shift, shift;
+       unsigned long i, size;
        pte_t orig;
 
        size = huge_tte_to_size(entry);
-       shift = size >= HPAGE_SIZE ? PMD_SHIFT : PAGE_SHIFT;
+
+       shift = PAGE_SHIFT;
+       if (size >= PUD_SIZE)
+               shift = PUD_SHIFT;
+       else if (size >= PMD_SIZE)
+               shift = PMD_SHIFT;
+       else
+               shift = PAGE_SHIFT;
+
        nptes = size >> shift;
 
        if (!pte_present(*ptep) && pte_present(entry))
@@ -352,19 +362,23 @@ void set_huge_pte_at(struct mm_struct *mm, unsigned long addr,
 pte_t huge_ptep_get_and_clear(struct mm_struct *mm, unsigned long addr,
                              pte_t *ptep)
 {
-       unsigned int i, nptes, hugepage_shift;
+       unsigned int i, nptes, orig_shift, shift;
        unsigned long size;
        pte_t entry;
 
        entry = *ptep;
        size = huge_tte_to_size(entry);
-       if (size >= HPAGE_SIZE)
-               nptes = size >> PMD_SHIFT;
+
+       shift = PAGE_SHIFT;
+       if (size >= PUD_SIZE)
+               shift = PUD_SHIFT;
+       else if (size >= PMD_SIZE)
+               shift = PMD_SHIFT;
        else
-               nptes = size >> PAGE_SHIFT;
+               shift = PAGE_SHIFT;
 
-       hugepage_shift = pte_none(entry) ? PAGE_SHIFT :
-               huge_tte_to_shift(entry);
+       nptes = size >> shift;
+       orig_shift = pte_none(entry) ? PAGE_SHIFT : huge_tte_to_shift(entry);
 
        if (pte_present(entry))
                mm->context.hugetlb_pte_count -= nptes;
@@ -373,11 +387,11 @@ pte_t huge_ptep_get_and_clear(struct mm_struct *mm, unsigned long addr,
        for (i = 0; i < nptes; i++)
                ptep[i] = __pte(0UL);
 
-       maybe_tlb_batch_add(mm, addr, ptep, entry, 0, hugepage_shift);
+       maybe_tlb_batch_add(mm, addr, ptep, entry, 0, orig_shift);
        /* An HPAGE_SIZE'ed page is composed of two REAL_HPAGE_SIZE'ed pages */
        if (size == HPAGE_SIZE)
                maybe_tlb_batch_add(mm, addr + REAL_HPAGE_SIZE, ptep, entry, 0,
-                                   hugepage_shift);
+                                   orig_shift);
 
        return entry;
 }
@@ -390,7 +404,8 @@ int pmd_huge(pmd_t pmd)
 
+/* Report whether @pud is to be treated as a huge mapping: the entry
+ * must not be pud_none(), and its _PAGE_VALID/_PAGE_PUD_HUGE bits must
+ * be anything other than "_PAGE_VALID set, _PAGE_PUD_HUGE clear".
+ */
 int pud_huge(pud_t pud)
 {
-       return 0;
+       return !pud_none(pud) &&
+               (pud_val(pud) & (_PAGE_VALID|_PAGE_PUD_HUGE)) != _PAGE_VALID;
 }
 
 static void hugetlb_free_pte_range(struct mmu_gather *tlb, pmd_t *pmd,
@@ -454,8 +469,11 @@ static void hugetlb_free_pud_range(struct mmu_gather *tlb, p4d_t *p4d,
                next = pud_addr_end(addr, end);
                if (pud_none_or_clear_bad(pud))
                        continue;
-               hugetlb_free_pmd_range(tlb, pud, addr, next, floor,
-                                      ceiling);
+               if (is_hugetlb_pud(*pud))
+                       pud_clear(pud);
+               else
+                       hugetlb_free_pmd_range(tlb, pud, addr, next, floor,
+                                              ceiling);
        } while (pud++, addr = next, addr != end);
 
        start &= P4D_MASK;
index e3447a02c49aa00e40e384e29756783e6dcab724..1509da6d3ff36fdaa4e3f8d2dbb243b3c7657e8e 100644 (file)
@@ -453,6 +453,10 @@ static int __init setup_hugepagesz(char *string)
        hugepage_shift = ilog2(hugepage_size);
 
        switch (hugepage_shift) {
+       case HPAGE_16GB_SHIFT:
+               hv_pgsz_mask = HV_PGSZ_MASK_16GB;
+               hv_pgsz_idx = HV_PGSZ_IDX_16GB;
+               break;
        case HPAGE_2GB_SHIFT:
                hv_pgsz_mask = HV_PGSZ_MASK_2GB;
                hv_pgsz_idx = HV_PGSZ_IDX_2GB;
@@ -492,6 +496,7 @@ void update_mmu_cache(struct vm_area_struct *vma, unsigned long address, pte_t *
 {
        struct mm_struct *mm;
        unsigned long flags;
+       bool is_huge_tsb;
        pte_t pte = *ptep;
 
        if (tlb_type != hypervisor) {
@@ -509,15 +514,37 @@ void update_mmu_cache(struct vm_area_struct *vma, unsigned long address, pte_t *
 
        spin_lock_irqsave(&mm->context.lock, flags);
 
+       is_huge_tsb = false;
 #if defined(CONFIG_HUGETLB_PAGE) || defined(CONFIG_TRANSPARENT_HUGEPAGE)
-       if ((mm->context.hugetlb_pte_count || mm->context.thp_pte_count) &&
-           is_hugetlb_pmd(__pmd(pte_val(pte)))) {
-               /* We are fabricating 8MB pages using 4MB real hw pages.  */
-               pte_val(pte) |= (address & (1UL << REAL_HPAGE_SHIFT));
-               __update_mmu_tsb_insert(mm, MM_TSB_HUGE, REAL_HPAGE_SHIFT,
-                                       address, pte_val(pte));
-       } else
+       if (mm->context.hugetlb_pte_count || mm->context.thp_pte_count) {
+               unsigned long hugepage_size = PAGE_SIZE;
+
+               if (is_vm_hugetlb_page(vma))
+                       hugepage_size = huge_page_size(hstate_vma(vma));
+
+               if (hugepage_size >= PUD_SIZE) {
+                       unsigned long mask = 0x1ffc00000UL;
+
+                       /* Transfer bits [32:22] from address to resolve
+                        * at 4M granularity.
+                        */
+                       pte_val(pte) &= ~mask;
+                       pte_val(pte) |= (address & mask);
+               } else if (hugepage_size >= PMD_SIZE) {
+                       /* We are fabricating 8MB pages using 4MB
+                        * real hw pages.
+                        */
+                       pte_val(pte) |= (address & (1UL << REAL_HPAGE_SHIFT));
+               }
+
+               if (hugepage_size >= PMD_SIZE) {
+                       __update_mmu_tsb_insert(mm, MM_TSB_HUGE,
+                               REAL_HPAGE_SHIFT, address, pte_val(pte));
+                       is_huge_tsb = true;
+               }
+       }
 #endif
+       if (!is_huge_tsb)
                __update_mmu_tsb_insert(mm, MM_TSB_BASE, PAGE_SHIFT,
                                        address, pte_val(pte));