From 95963e355b32bbe4cb2f4b89d55cd3ea9fe7fd47 Mon Sep 17 00:00:00 2001
From: Mike Kravetz
Date: Thu, 22 Sep 2016 09:44:27 -0700
Subject: [PATCH] sparc64: Fix accounting issues used to size TSBs

Orabug: 24478985

As pages are allocated by a task, counters in the mm and mm_context
structures are used to track these allocations.  These counters are then
used to size the task's TSBs.  This patch addresses issues where the
counts are not maintained properly, and TSBs of the incorrect size are
created for the task.

- hugetlb pages are not included in a task's RSS calculations.  However,
  the routine do_sparc64_fault() calculates the size of the base TSB
  block by subtracting the total size of hugetlb pages from RSS.  Since
  the hugetlb size is likely larger than RSS, a negative value is passed
  as an unsigned value to the routine which allocates the TSB block.
  The 'negative unsigned' value appears as a very large value and
  results in a maximum-sized base TSB being allocated.  This is the case
  for almost all tasks using hugetlb pages.

  THP pages are also counted in huge_pte_count[MM_PTES_HUGE].  Unlike
  hugetlb pages, however, THP pages are included in a task's RSS.
  Therefore, hugetlb and THP pages cannot both be counted in
  huge_pte_count[MM_PTES_HUGE].  Add a new counter thp_pte_count for THP
  pages, and use this value to adjust RSS when sizing the base TSB.

- In order to save memory, THP makes use of a huge zero page.  This huge
  zero page does not count against a task's RSS, but it does consume TSB
  entries.  Therefore, count huge zero page entries in
  huge_pte_count[MM_PTES_HUGE].

- Accounting of THP pages is done in the routine set_pmd_at().
  Unfortunately, this does not catch the case where a THP page is split.
  To handle this case, decrement the count in pmdp_invalidate().
  pmdp_invalidate() is only called when splitting a THP.  However,
  sanity checks are added in case it is ever called for other purposes.

- huge_pte_count[MM_PTES_HUGE] tracks the number of HPAGE_SIZE (8M)
  pages used by the task.  This value is used to size the TSB for
  HPAGE_SIZE pages.  However, for each HPAGE_SIZE (8M) page there are
  two REAL_HPAGE_SIZE (4M) pages, and the TSB contains an entry for each
  REAL_HPAGE_SIZE page.  Therefore, the number of REAL_HPAGE_SIZE pages
  used by the task should be used to size the MM_TSB_HUGE TSB.  A new
  compile-time constant REAL_HPAGE_PER_HPAGE is used to multiply
  huge_pte_count[MM_PTES_HUGE] before sizing the TSB.
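For illustration, here is a minimal userspace sketch (not part of the
patch) of the 'negative unsigned' wraparound described in the first
bullet; the page counts are invented:

#include <stdio.h>

int main(void)
{
	/* Invented values: a task whose hugetlb footprint exceeds its
	 * base-page RSS, as is typical for hugetlb users.
	 */
	unsigned long mm_rss = 512;		/* base pages in RSS */
	unsigned long hugetlb_pages = 4096;	/* hugetlb pages, not in RSS */

	/* The old do_sparc64_fault() sizing: RSS minus hugetlb pages.
	 * Unsigned arithmetic cannot go negative, so this wraps.
	 */
	mm_rss -= hugetlb_pages;
	printf("%lu\n", mm_rss);	/* 18446744073709548032 on 64-bit */

	/* Passing this to the TSB allocator requests a TSB sized for
	 * roughly 2^64 pages, i.e. the maximum base TSB.
	 */
	return 0;
}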
Signed-off-by: Mike Kravetz
Reviewed-by: Vijay Kumar
Tested-by: Vijay Kumar
(cherry picked from commit 417fc85e759b6d4c4602fbdbdd5375ec5ddf2cb0)
Signed-off-by: Allen Pais
---
 arch/sparc/include/asm/mmu_64.h  |  1 +
 arch/sparc/include/asm/page_64.h |  1 +
 arch/sparc/mm/fault_64.c         | 28 ++++++++++------------------
 arch/sparc/mm/init_64.c          |  4 ++--
 arch/sparc/mm/tlb.c              | 36 ++++++++++++++++++++++++++++----
 arch/sparc/mm/tsb.c              | 21 +++++++++++++++++----
 6 files changed, 63 insertions(+), 28 deletions(-)

diff --git a/arch/sparc/include/asm/mmu_64.h b/arch/sparc/include/asm/mmu_64.h
index b7e86bd2d01c..4db2fa11e346 100644
--- a/arch/sparc/include/asm/mmu_64.h
+++ b/arch/sparc/include/asm/mmu_64.h
@@ -108,6 +108,7 @@ typedef struct {
 	struct hv_tsb_descr	tsb_descr[MM_NUM_TSBS];
 	void			*vdso;
 	unsigned long		huge_pte_count[MM_NUM_HUGEPAGE_SIZES];
+	unsigned long		thp_pte_count;
 } mm_context_t;
 
 #if defined(CONFIG_HUGETLB_PAGE) || defined(CONFIG_TRANSPARENT_HUGEPAGE)
diff --git a/arch/sparc/include/asm/page_64.h b/arch/sparc/include/asm/page_64.h
index a0c5f0556253..0992dbb6c127 100644
--- a/arch/sparc/include/asm/page_64.h
+++ b/arch/sparc/include/asm/page_64.h
@@ -28,6 +28,7 @@
 #define HUGETLB_PAGE_ORDER	(HPAGE_SHIFT - PAGE_SHIFT)
 #define HAVE_ARCH_HUGETLB_UNMAPPED_AREA
 #define HUGE_MAX_HSTATE		2
+#define REAL_HPAGE_PER_HPAGE	(_AC(1,UL) << (HPAGE_SHIFT - REAL_HPAGE_SHIFT))
 #endif
 
 #ifndef __ASSEMBLY__
diff --git a/arch/sparc/mm/fault_64.c b/arch/sparc/mm/fault_64.c
index f1a78f7aa650..41583015d6b5 100644
--- a/arch/sparc/mm/fault_64.c
+++ b/arch/sparc/mm/fault_64.c
@@ -287,19 +287,6 @@ static void noinline __kprobes bogus_32bit_fault_tpc(struct pt_regs *regs)
 
 #if defined(CONFIG_HUGETLB_PAGE) || defined(CONFIG_TRANSPARENT_HUGEPAGE)
 /* Put this here until there are more consumers.*/
-static unsigned long hugepage_pte_counts_to_pages(mm_context_t *mm_context)
-{
-	unsigned long hugepages_to_pages = 0UL;
-
-	if (xl_hugepage_shift)
-		hugepages_to_pages = xl_hugepage_pte_count(mm_context) <<
-			(xl_hugepage_shift - PAGE_SHIFT);
-	hugepages_to_pages = hugepages_to_pages +
-		(hugepage_pte_count(mm_context) << (HPAGE_SHIFT - PAGE_SHIFT));
-
-	return hugepages_to_pages;
-}
-
 static void sparc64_hugetlb_tsb_fault(struct pt_regs *regs,
 				      struct mm_struct *mm,
 				      unsigned int hugepage_shift)
@@ -316,6 +303,13 @@
 		hugepage_size_to_pte_count_idx(1UL << hugepage_shift);
 
 	mm_rss = mm->context.huge_pte_count[hugepage_pte_idx];
+	if (hugepage_idx == MM_TSB_HUGE) {
+#if defined(CONFIG_TRANSPARENT_HUGEPAGE)
+		mm_rss += mm->context.thp_pte_count;
+#endif
+		mm_rss *= REAL_HPAGE_PER_HPAGE;
+	}
+
 	if (unlikely(mm_rss >
 		     mm->context.tsb_block[hugepage_idx].tsb_rss_limit)) {
 		if (mm->context.tsb_block[hugepage_idx].tsb)
@@ -326,10 +320,6 @@
 	}
 }
 #else
-static unsigned long hugepage_pte_counts_to_pages(mm_context_t *mm_context)
-{
-	return 0UL;
-}
 static void sparc64_hugetlb_tsb_fault(struct pt_regs *regs,
 				      struct mm_struct *mm,
 				      unsigned int hugepage_shift)
@@ -542,7 +532,9 @@ good_area:
 	up_read(&mm->mmap_sem);
 
 	mm_rss = get_mm_rss(mm);
-	mm_rss = mm_rss - hugepage_pte_counts_to_pages(&mm->context);
+#if defined(CONFIG_TRANSPARENT_HUGEPAGE)
+	mm_rss -= (mm->context.thp_pte_count * (HPAGE_SIZE / PAGE_SIZE));
+#endif
 	if (unlikely(mm_rss >
 		     mm->context.tsb_block[MM_TSB_BASE].tsb_rss_limit))
 		tsb_grow(mm, MM_TSB_BASE, mm_rss);
diff --git a/arch/sparc/mm/init_64.c b/arch/sparc/mm/init_64.c
index 556ee725a17c..572a25f4b0a9 100644
--- a/arch/sparc/mm/init_64.c
+++ b/arch/sparc/mm/init_64.c
@@ -607,8 +607,8 @@ void update_mmu_cache(struct vm_area_struct *vma, unsigned long address, pte_t *
 	spin_lock_irqsave(&mm->context.lock, flags);
 
 #if defined(CONFIG_HUGETLB_PAGE) || defined(CONFIG_TRANSPARENT_HUGEPAGE)
-	if (mm->context.huge_pte_count[MM_PTES_HUGE] &&
-	    is_default_hugetlb_pte(pte)) {
+	if ((mm->context.huge_pte_count[MM_PTES_HUGE] ||
+	     mm->context.thp_pte_count) && is_default_hugetlb_pte(pte)) {
 		/* We are fabricating 8MB pages using 4MB real hw pages */
 		pte_val(pte) |= (address & (1UL << REAL_HPAGE_SHIFT));
 		__update_mmu_tsb_insert(mm, MM_TSB_HUGE, REAL_HPAGE_SHIFT,
diff --git a/arch/sparc/mm/tlb.c b/arch/sparc/mm/tlb.c
index 5b0c4738296e..91ab380fbc50 100644
--- a/arch/sparc/mm/tlb.c
+++ b/arch/sparc/mm/tlb.c
@@ -174,10 +174,25 @@ void set_pmd_at(struct mm_struct *mm, unsigned long addr,
 		return;
 
 	if ((pmd_val(pmd) ^ pmd_val(orig)) & _PAGE_PMD_HUGE) {
-		if (pmd_val(pmd) & _PAGE_PMD_HUGE)
-			mm->context.huge_pte_count[MM_PTES_HUGE]++;
-		else
-			mm->context.huge_pte_count[MM_PTES_HUGE]--;
+		/*
+		 * Note that this routine only sets pmds for THP pages.
+		 * Hugetlb pages are handled elsewhere.  We need to check
+		 * for huge zero page.  Huge zero pages are like hugetlb
+		 * pages in that there is no RSS, but there is the need
+		 * for TSB entries.  So, huge zero page counts go into
+		 * huge_pte_count[MM_PTES_HUGE].
+		 */
+		if (pmd_val(pmd) & _PAGE_PMD_HUGE) {
+			if (is_huge_zero_page(pmd_page(pmd)))
+				mm->context.huge_pte_count[MM_PTES_HUGE]++;
+			else
+				mm->context.thp_pte_count++;
+		} else {
+			if (is_huge_zero_page(pmd_page(orig)))
+				mm->context.huge_pte_count[MM_PTES_HUGE]--;
+			else
+				mm->context.thp_pte_count--;
+		}
 
 		/* Do not try to allocate the TSB hash table if we
 		 * don't have one already.  We have various locks held
@@ -204,6 +219,9 @@ void set_pmd_at(struct mm_struct *mm, unsigned long addr,
 	}
 }
 
+/*
+ * This routine is only called when splitting a THP
+ */
 void pmdp_invalidate(struct vm_area_struct *vma, unsigned long address,
 		     pmd_t *pmdp)
 {
@@ -213,6 +231,16 @@ void pmdp_invalidate(struct vm_area_struct *vma, unsigned long address,
 
 	set_pmd_at(vma->vm_mm, address, pmdp, entry);
 	flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
+
+	/*
+	 * set_pmd_at() will not be called in a way to decrement the
+	 * context.thp_pte_count when splitting a THP, so do it now.
+	 * Sanity check pmd before doing the actual decrement.
+	 */
+	if ((pmd_val(entry) & _PAGE_PMD_HUGE) &&
+	    !is_huge_zero_page(pmd_page(entry)))
+		(vma->vm_mm)->context.thp_pte_count--;
+
 }
 
 void pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp,
diff --git a/arch/sparc/mm/tsb.c b/arch/sparc/mm/tsb.c
index 19087ffed917..7c5ea6c783a4 100644
--- a/arch/sparc/mm/tsb.c
+++ b/arch/sparc/mm/tsb.c
@@ -502,10 +502,14 @@ retry_tsb_alloc:
 
 #if defined(CONFIG_HUGETLB_PAGE) || defined(CONFIG_TRANSPARENT_HUGEPAGE)
 static void capture_and_clear_huge_pte_counts(mm_context_t *mm_context,
+					      unsigned long *thp_pte_count,
 					      unsigned long *capture_array)
 {
 	unsigned int hugepage_idx;
 
+	*thp_pte_count = mm_context->thp_pte_count;
+	mm_context->thp_pte_count = 0UL;
+
 	for (hugepage_idx = 0UL; hugepage_idx != MM_NUM_HUGEPAGE_SIZES;
 	     hugepage_idx++) {
 		capture_array[hugepage_idx] =
@@ -516,11 +520,14 @@
 
 static void
 captured_hugepage_pte_count_grow_tsb(struct mm_struct *mm,
+				     unsigned long *thp_pte_count,
 				     unsigned long *capture_huge_pte_count)
 {
-	if (unlikely(capture_huge_pte_count[MM_PTES_HUGE]))
+	if (unlikely(capture_huge_pte_count[MM_PTES_HUGE]) ||
+	    *thp_pte_count)
 		tsb_grow(mm, MM_TSB_HUGE,
-			 capture_huge_pte_count[MM_PTES_HUGE]);
+			 (capture_huge_pte_count[MM_PTES_HUGE] +
+			  *thp_pte_count) * REAL_HPAGE_PER_HPAGE);
 
 	if (unlikely(capture_huge_pte_count[MM_PTES_XLHUGE]))
 		tsb_grow(mm, MM_TSB_XLHUGE,
@@ -528,15 +535,18 @@
 }
 #else
 static void capture_and_clear_huge_pte_counts(mm_context_t *mm_context,
+					      unsigned long *thp_pte_count,
 					      unsigned long *capture_array) {}
 static void
 captured_hugepage_pte_count_grow_tsb(struct mm_struct *mm,
+				     unsigned long *thp_pte_count,
 				     unsigned long *capture_huge_pte_count) {}
 #endif	/* CONFIG_HUGETLB_PAGE || CONFIG_TRANSPARENT_HUGEPAGE */
 
 int init_new_context(struct task_struct *tsk, struct mm_struct *mm)
 {
 	unsigned long capture_huge_pte_count[MM_NUM_HUGEPAGE_SIZES];
+	unsigned long saved_thp_pte_count;
 	unsigned int i;
 
 	spin_lock_init(&mm->context.lock);
@@ -547,7 +557,8 @@
 	 * will re-increment the counters as the parent PTEs are
 	 * copied into the child address space.
 	 */
-	capture_and_clear_huge_pte_counts(&mm->context, capture_huge_pte_count);
+	capture_and_clear_huge_pte_counts(&mm->context, &saved_thp_pte_count,
+					  capture_huge_pte_count);
 
 	/* copy_mm() copies over the parent's mm_struct before calling
 	 * us, so we need to zero out the TSB pointer or else tsb_grow()
@@ -559,9 +570,11 @@
 	/* If this is fork, inherit the parent's TSB size.  We would
 	 * grow it to that size on the first page fault anyways.
 	 */
-	tsb_grow(mm, MM_TSB_BASE, get_mm_rss(mm));
+	tsb_grow(mm, MM_TSB_BASE, get_mm_rss(mm) -
+		 saved_thp_pte_count * (HPAGE_SIZE / PAGE_SIZE));
 
-	captured_hugepage_pte_count_grow_tsb(mm, capture_huge_pte_count);
+	captured_hugepage_pte_count_grow_tsb(mm, &saved_thp_pte_count,
+					     capture_huge_pte_count);
 
 	if (unlikely(!mm->context.tsb_block[MM_TSB_BASE].tsb))
 		return -ENOMEM;
-- 
2.50.1
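
As a companion note to the last bullet of the commit message, here is a
standalone sketch of the REAL_HPAGE_PER_HPAGE arithmetic.  The shift
values below are assumptions derived from the 8M/4M page sizes named in
the commit message, not copied from the sparc64 headers, and the page
count is invented:

#include <stdio.h>

/* Assumed shifts: HPAGE_SIZE is 8M, REAL_HPAGE_SIZE is 4M.  The
 * kernel's _AC() wrapper reduces to plain arithmetic in C code.
 */
#define HPAGE_SHIFT		23	/* 8M */
#define REAL_HPAGE_SHIFT	22	/* 4M */
#define REAL_HPAGE_PER_HPAGE	(1UL << (HPAGE_SHIFT - REAL_HPAGE_SHIFT))

int main(void)
{
	unsigned long huge_pte_count = 100;	/* invented: 100 x 8M pages */

	/* Each 8M page is backed by two 4M real hw pages, and the TSB
	 * holds one entry per real page, so the huge TSB must be sized
	 * for count * REAL_HPAGE_PER_HPAGE entries.
	 */
	printf("REAL_HPAGE_PER_HPAGE = %lu\n", REAL_HPAGE_PER_HPAGE);	/* 2 */
	printf("TSB entries needed   = %lu\n",
	       huge_pte_count * REAL_HPAGE_PER_HPAGE);			/* 200 */
	return 0;
}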