#include <linux/threads.h>
 
-#define PTE_NONCACHE_NUM       0  /* dummy for now to share code w/ppc64 */
+/* For 32-bit, all levels of page tables are just drawn from get_free_page() */
+#define MAX_PGTABLE_INDEX_SIZE 0
 
 extern void __bad_pte(pmd_t *pmd);
 
 extern pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long addr);
 extern pgtable_t pte_alloc_one(struct mm_struct *mm, unsigned long addr);
 
-static inline void pgtable_free(pgtable_free_t pgf)
+static inline void pgtable_free(void *table, unsigned index_size)
 {
-       void *p = (void *)(pgf.val & ~PGF_CACHENUM_MASK);
-
-       free_page((unsigned long)p);
+       BUG_ON(index_size); /* 32-bit doesn't use this */
+       free_page((unsigned long)table);
 }
 
 #define check_pgt_cache()      do { } while (0)
 
 #include <linux/cpumask.h>
 #include <linux/percpu.h>
 
+/*
+ * Functions that deal with pagetables that could be at any level of
+ * the table need to be passed an "index_size" so they know how to
+ * handle allocation.  PTE pages (which are linked to a struct page
+ * for now, and drawn from the main get_free_pages() pool) use an
+ * index_size of 0.  For all other page tables, the allocation size
+ * will be (2^index_size * sizeof(pointer)) and allocations are drawn
+ * from the kmem_cache in PGT_CACHE(index_size).
+ *
+ * The maximum index size needs to be big enough to allow any
+ * pagetable sizes we need, but small enough to fit in the low bits of
+ * any page table pointer.  In other words all pagetables, even tiny
+ * ones, must be aligned to allow at least enough low 0 bits to
+ * contain this value.  This value is also used as a mask, so it must
+ * be one less than a power of two.
+ */
+#define MAX_PGTABLE_INDEX_SIZE 0xf
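Editorial aside, not part of the patch: the sketch below shows how a table
pointer and its index size can be packed into a single word and unpacked
again, using MAX_PGTABLE_INDEX_SIZE as the mask.  It mirrors what
pgtable_free_tlb() and pte_free_rcu_callback() do later in this patch; the
helper names pack_table()/unpack_table() are hypothetical and used only for
illustration.

static inline unsigned long pack_table(void *table, unsigned index_size)
{
        /* table alignment guarantees the low bits of the pointer are 0 */
        BUG_ON(index_size > MAX_PGTABLE_INDEX_SIZE);
        BUG_ON((unsigned long)table & MAX_PGTABLE_INDEX_SIZE);
        return (unsigned long)table | index_size;
}

static inline void unpack_table(unsigned long pgf, void **table,
                                unsigned *index_size)
{
        *table = (void *)(pgf & ~(unsigned long)MAX_PGTABLE_INDEX_SIZE);
        *index_size = pgf & MAX_PGTABLE_INDEX_SIZE;
}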
+
 #ifndef CONFIG_PPC_SUBPAGE_PROT
 static inline void subpage_prot_free(pgd_t *pgd) {}
 #endif
 
 extern struct kmem_cache *pgtable_cache[];
-
-#define PGD_CACHE_NUM          0
-#define PUD_CACHE_NUM          1
-#define PMD_CACHE_NUM          1
-#define HUGEPTE_CACHE_NUM      2
-#define PTE_NONCACHE_NUM       7  /* from GFP rather than kmem_cache */
+#define PGT_CACHE(shift) (pgtable_cache[(shift)-1])
 
 static inline pgd_t *pgd_alloc(struct mm_struct *mm)
 {
-       return kmem_cache_alloc(pgtable_cache[PGD_CACHE_NUM], GFP_KERNEL);
+       return kmem_cache_alloc(PGT_CACHE(PGD_INDEX_SIZE), GFP_KERNEL);
 }
 
 static inline void pgd_free(struct mm_struct *mm, pgd_t *pgd)
 {
        subpage_prot_free(pgd);
-       kmem_cache_free(pgtable_cache[PGD_CACHE_NUM], pgd);
+       kmem_cache_free(PGT_CACHE(PGD_INDEX_SIZE), pgd);
 }
 
 #ifndef CONFIG_PPC_64K_PAGES
 
 static inline pud_t *pud_alloc_one(struct mm_struct *mm, unsigned long addr)
 {
-       return kmem_cache_alloc(pgtable_cache[PUD_CACHE_NUM],
+       return kmem_cache_alloc(PGT_CACHE(PUD_INDEX_SIZE),
                                GFP_KERNEL|__GFP_REPEAT);
 }
 
 static inline void pud_free(struct mm_struct *mm, pud_t *pud)
 {
-       kmem_cache_free(pgtable_cache[PUD_CACHE_NUM], pud);
+       kmem_cache_free(PGT_CACHE(PUD_INDEX_SIZE), pud);
 }
 
 static inline void pud_populate(struct mm_struct *mm, pud_t *pud, pmd_t *pmd)
 
 static inline pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long addr)
 {
-       return kmem_cache_alloc(pgtable_cache[PMD_CACHE_NUM],
+       return kmem_cache_alloc(PGT_CACHE(PMD_INDEX_SIZE),
                                GFP_KERNEL|__GFP_REPEAT);
 }
 
 static inline void pmd_free(struct mm_struct *mm, pmd_t *pmd)
 {
-       kmem_cache_free(pgtable_cache[PMD_CACHE_NUM], pmd);
+       kmem_cache_free(PGT_CACHE(PMD_INDEX_SIZE), pmd);
 }
 
 static inline pte_t *pte_alloc_one_kernel(struct mm_struct *mm,
        return page;
 }
 
-static inline void pgtable_free(pgtable_free_t pgf)
+static inline void pgtable_free(void *table, unsigned index_size)
 {
-       void *p = (void *)(pgf.val & ~PGF_CACHENUM_MASK);
-       int cachenum = pgf.val & PGF_CACHENUM_MASK;
-
-       if (cachenum == PTE_NONCACHE_NUM)
-               free_page((unsigned long)p);
-       else
-               kmem_cache_free(pgtable_cache[cachenum], p);
+       if (!index_size)
+               free_page((unsigned long)table);
+       else {
+               BUG_ON(index_size > MAX_PGTABLE_INDEX_SIZE);
+               kmem_cache_free(PGT_CACHE(index_size), table);
+       }
 }
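Editorial aside: for reference, the two call shapes pgtable_free() now takes,
as used by the callers updated elsewhere in this patch; an index size of 0
means a PTE page from the normal page pool, anything else names a PGT_CACHE
slab.

        pgtable_free(page_address(ptepage), 0);   /* PTE page: free_page() */
        pgtable_free(pmd, PMD_INDEX_SIZE);        /* pmd table: its kmem_cache */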
 
-#define __pmd_free_tlb(tlb, pmd,addr)                \
-       pgtable_free_tlb(tlb, pgtable_free_cache(pmd, \
-               PMD_CACHE_NUM, PMD_TABLE_SIZE-1))
+#define __pmd_free_tlb(tlb, pmd, addr)               \
+       pgtable_free_tlb(tlb, pmd, PMD_INDEX_SIZE)
 #ifndef CONFIG_PPC_64K_PAGES
 #define __pud_free_tlb(tlb, pud, addr)               \
-       pgtable_free_tlb(tlb, pgtable_free_cache(pud, \
-               PUD_CACHE_NUM, PUD_TABLE_SIZE-1))
+       pgtable_free_tlb(tlb, pud, PUD_INDEX_SIZE)
+
 #endif /* CONFIG_PPC_64K_PAGES */
 
 #define check_pgt_cache()      do { } while (0)
 
        __free_page(ptepage);
 }
 
-typedef struct pgtable_free {
-       unsigned long val;
-} pgtable_free_t;
-
-/* This needs to be big enough to allow for MMU_PAGE_COUNT + 2 to be stored
- * and small enough to fit in the low bits of any naturally aligned page
- * table cache entry. Arbitrarily set to 0x1f, that should give us some
- * room to grow
- */
-#define PGF_CACHENUM_MASK      0x1f
-
-static inline pgtable_free_t pgtable_free_cache(void *p, int cachenum,
-                                               unsigned long mask)
-{
-       BUG_ON(cachenum > PGF_CACHENUM_MASK);
-
-       return (pgtable_free_t){.val = ((unsigned long) p & ~mask) | cachenum};
-}
-
 #ifdef CONFIG_PPC64
 #include <asm/pgalloc-64.h>
 #else
 #endif
 
 #ifdef CONFIG_SMP
-extern void pgtable_free_tlb(struct mmu_gather *tlb, pgtable_free_t pgf);
+extern void pgtable_free_tlb(struct mmu_gather *tlb, void *table, unsigned shift);
 extern void pte_free_finish(void);
 #else /* CONFIG_SMP */
-static inline void pgtable_free_tlb(struct mmu_gather *tlb, pgtable_free_t pgf)
+static inline void pgtable_free_tlb(struct mmu_gather *tlb, void *table, unsigned shift)
 {
-       pgtable_free(pgf);
+       pgtable_free(table, shift);
 }
 static inline void pte_free_finish(void) { }
 #endif /* !CONFIG_SMP */
 static inline void __pte_free_tlb(struct mmu_gather *tlb, struct page *ptepage,
                                  unsigned long address)
 {
-       pgtable_free_t pgf = pgtable_free_cache(page_address(ptepage),
-                                               PTE_NONCACHE_NUM,
-                                               PTE_TABLE_SIZE-1);
        tlb_flush_pgtable(tlb, address);
        pgtable_page_dtor(ptepage);
-       pgtable_free_tlb(tlb, pgf);
+       pgtable_free_tlb(tlb, page_address(ptepage), 0);
 }
 
 #endif /* __KERNEL__ */
 
 #define pgoff_to_pte(off)      ((pte_t) {((off) << PTE_RPN_SHIFT)|_PAGE_FILE})
 #define PTE_FILE_MAX_BITS      (BITS_PER_LONG - PTE_RPN_SHIFT)
 
+void pgtable_cache_add(unsigned shift, void (*ctor)(void *));
 void pgtable_cache_init(void);
 
 /*
 
 unsigned int mmu_huge_psizes[MMU_PAGE_COUNT] = { }; /* initialize all to 0 */
 
 #define hugepte_shift                  mmu_huge_psizes
-#define PTRS_PER_HUGEPTE(psize)                (1 << hugepte_shift[psize])
-#define HUGEPTE_TABLE_SIZE(psize)      (sizeof(pte_t) << hugepte_shift[psize])
+#define HUGEPTE_INDEX_SIZE(psize)      (mmu_huge_psizes[(psize)])
+#define PTRS_PER_HUGEPTE(psize)                (1 << mmu_huge_psizes[psize])
 
 #define HUGEPD_SHIFT(psize)            (mmu_psize_to_shift(psize) \
-                                               + hugepte_shift[psize])
+                                        + HUGEPTE_INDEX_SIZE(psize))
 #define HUGEPD_SIZE(psize)             (1UL << HUGEPD_SHIFT(psize))
 #define HUGEPD_MASK(psize)             (~(HUGEPD_SIZE(psize)-1))
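Editorial aside: a quick worked example of these macros, with made-up numbers
used purely for illustration.

/* If mmu_psize_to_shift(psize) == 24 (a 16M page) and
 * HUGEPTE_INDEX_SIZE(psize) == 8 (256 hugeptes per table), then:
 *   HUGEPD_SHIFT(psize) == 24 + 8 == 32
 *   HUGEPD_SIZE(psize)  == 1UL << 32    (each hugepte table maps 4G)
 *   HUGEPD_MASK(psize)  == ~(HUGEPD_SIZE(psize) - 1), a 4G-aligned mask
 */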
 
-/* Subtract one from array size because we don't need a cache for 4K since
- * is not a huge page size */
-#define HUGE_PGTABLE_INDEX(psize)      (HUGEPTE_CACHE_NUM + psize - 1)
-#define HUGEPTE_CACHE_NAME(psize)      (huge_pgtable_cache_name[psize])
-
-static const char *huge_pgtable_cache_name[MMU_PAGE_COUNT] = {
-       [MMU_PAGE_64K]  = "hugepte_cache_64K",
-       [MMU_PAGE_1M]   = "hugepte_cache_1M",
-       [MMU_PAGE_16M]  = "hugepte_cache_16M",
-       [MMU_PAGE_16G]  = "hugepte_cache_16G",
-};
-
 /* Flag to mark huge PD pointers.  This means pmd_bad() and pud_bad()
  * will choke on pointers to hugepte tables, which is handy for
  * catching screwups early. */
 static int __hugepte_alloc(struct mm_struct *mm, hugepd_t *hpdp,
                           unsigned long address, unsigned int psize)
 {
-       pte_t *new = kmem_cache_zalloc(pgtable_cache[HUGE_PGTABLE_INDEX(psize)],
-                                     GFP_KERNEL|__GFP_REPEAT);
+       pte_t *new = kmem_cache_zalloc(PGT_CACHE(hugepte_shift[psize]),
+                                      GFP_KERNEL|__GFP_REPEAT);
 
        if (! new)
                return -ENOMEM;
 
        spin_lock(&mm->page_table_lock);
        if (!hugepd_none(*hpdp))
-               kmem_cache_free(pgtable_cache[HUGE_PGTABLE_INDEX(psize)], new);
+               kmem_cache_free(PGT_CACHE(hugepte_shift[psize]), new);
        else
                hpdp->pd = (unsigned long)new | HUGEPD_OK;
        spin_unlock(&mm->page_table_lock);
 
        hpdp->pd = 0;
        tlb->need_flush = 1;
-       pgtable_free_tlb(tlb, pgtable_free_cache(hugepte,
-                                                HUGEPTE_CACHE_NUM+psize-1,
-                                                PGF_CACHENUM_MASK));
+       pgtable_free_tlb(tlb, hugepte, hugepte_shift[psize]);
 }
 
 static void hugetlb_free_pmd_range(struct mmu_gather *tlb, pud_t *pud,
                if (mmu_huge_psizes[psize] ||
                   mmu_psize_defs[psize].shift == PAGE_SHIFT)
                        return;
-               if (WARN_ON(HUGEPTE_CACHE_NAME(psize) == NULL))
-                       return;
                hugetlb_add_hstate(mmu_psize_defs[psize].shift - PAGE_SHIFT);
 
                switch (mmu_psize_defs[psize].shift) {
        if (!cpu_has_feature(CPU_FTR_16M_PAGE))
                return -ENODEV;
 
-       /* Add supported huge page sizes.  Need to change HUGE_MAX_HSTATE
-        * and adjust PTE_NONCACHE_NUM if the number of supported huge page
-        * sizes changes.
+       /* Add supported huge page sizes.  Need to change
+        * HUGE_MAX_HSTATE if the number of supported huge page sizes
+        * changes.
         */
        set_huge_psize(MMU_PAGE_16M);
        set_huge_psize(MMU_PAGE_16G);
 
        for (psize = 0; psize < MMU_PAGE_COUNT; ++psize) {
                if (mmu_huge_psizes[psize]) {
-                       pgtable_cache[HUGE_PGTABLE_INDEX(psize)] =
-                               kmem_cache_create(
-                                       HUGEPTE_CACHE_NAME(psize),
-                                       HUGEPTE_TABLE_SIZE(psize),
-                                       HUGEPTE_TABLE_SIZE(psize),
-                                       0,
-                                       NULL);
-                       if (!pgtable_cache[HUGE_PGTABLE_INDEX(psize)])
-                               panic("hugetlbpage_init(): could not create %s"\
-                                     "\n", HUGEPTE_CACHE_NAME(psize));
+                       pgtable_cache_add(hugepte_shift[psize], NULL);
+                       if (!PGT_CACHE(hugepte_shift[psize]))
+                               panic("hugetlbpage_init(): could not create "
+                                     "pgtable cache for %d bit pagesize\n",
+                                     mmu_psize_to_shift(psize));
                }
        }
 
 
        memset(addr, 0, PMD_TABLE_SIZE);
 }
 
-static const unsigned int pgtable_cache_size[2] = {
-       PGD_TABLE_SIZE, PMD_TABLE_SIZE
-};
-static const char *pgtable_cache_name[ARRAY_SIZE(pgtable_cache_size)] = {
-#ifdef CONFIG_PPC_64K_PAGES
-       "pgd_cache", "pmd_cache",
-#else
-       "pgd_cache", "pud_pmd_cache",
-#endif /* CONFIG_PPC_64K_PAGES */
-};
-
-#ifdef CONFIG_HUGETLB_PAGE
-/* Hugepages need an extra cache per hugepagesize, initialized in
- * hugetlbpage.c.  We can't put into the tables above, because HPAGE_SHIFT
- * is not compile time constant. */
-struct kmem_cache *pgtable_cache[ARRAY_SIZE(pgtable_cache_size)+MMU_PAGE_COUNT];
-#else
-struct kmem_cache *pgtable_cache[ARRAY_SIZE(pgtable_cache_size)];
-#endif
+struct kmem_cache *pgtable_cache[MAX_PGTABLE_INDEX_SIZE];
+
+/*
+ * Create a kmem_cache() for pagetables.  This is not used for PTE
+ * pages - they're linked to struct page, come from the normal free
+ * pages pool and have a different entry size (see real_pte_t) to
+ * everything else.  Caches created by this function are used for all
+ * the higher level pagetables, and for hugepage pagetables.
+ */
+void pgtable_cache_add(unsigned shift, void (*ctor)(void *))
+{
+       char *name;
+       unsigned long table_size = sizeof(void *) << shift;
+       unsigned long align = table_size;
+
+       /* When batching pgtable pointers for RCU freeing, we store
+        * the index size in the low bits.  Table alignment must be
+        * big enough to fit it */
+       unsigned long minalign = MAX_PGTABLE_INDEX_SIZE + 1;
+       struct kmem_cache *new;
+
+       /* It would be nice if this was a BUILD_BUG_ON(), but at the
+        * moment, gcc doesn't seem to recognize is_power_of_2 as a
+        * constant expression, so we check this at runtime instead. */
+       BUG_ON(!is_power_of_2(minalign));
+       BUG_ON((shift < 1) || (shift > MAX_PGTABLE_INDEX_SIZE));
+
+       if (PGT_CACHE(shift))
+               return; /* Already have a cache of this size */
+
+       align = max_t(unsigned long, align, minalign);
+       name = kasprintf(GFP_KERNEL, "pgtable-2^%d", shift);
+       new = kmem_cache_create(name, table_size, align, 0, ctor);
+       PGT_CACHE(shift) = new;
+
+       pr_debug("Allocated pgtable cache for order %d\n", shift);
+}
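Editorial aside: a quick sanity check of the alignment reasoning above,
assuming 64-bit pointers; not part of the patch.

/* The smallest allowed shift is 1, so table_size = sizeof(void *) << 1 = 16.
 * minalign = MAX_PGTABLE_INDEX_SIZE + 1 = 0x10, so align is at least 16 and
 * even the tiniest table has its low 4 bits clear, which is exactly the room
 * pgtable_free_tlb() needs to stash the index size.
 */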
+
 
 void pgtable_cache_init(void)
 {
-       pgtable_cache[0] = kmem_cache_create(pgtable_cache_name[0], PGD_TABLE_SIZE, PGD_TABLE_SIZE, SLAB_PANIC, pgd_ctor);
-       pgtable_cache[1] = kmem_cache_create(pgtable_cache_name[1], PMD_TABLE_SIZE, PMD_TABLE_SIZE, SLAB_PANIC, pmd_ctor);
+       pgtable_cache_add(PGD_INDEX_SIZE, pgd_ctor);
+       pgtable_cache_add(PMD_INDEX_SIZE, pmd_ctor);
+       if (!PGT_CACHE(PGD_INDEX_SIZE) || !PGT_CACHE(PMD_INDEX_SIZE))
+               panic("Couldn't allocate pgtable caches");
+
+       /* In all current configs, when the PUD index exists it's the
+        * same size as either the pgd or pmd index.  Verify that the
+        * initialization above has also created a PUD cache.  This
+        * will need re-examination if we add new possibilities for
+        * the pagetable layout. */
+       BUG_ON(PUD_INDEX_SIZE && !PGT_CACHE(PUD_INDEX_SIZE));
 }
 
 #ifdef CONFIG_SPARSEMEM_VMEMMAP
 
 {
        struct rcu_head rcu;
        unsigned int    index;
-       pgtable_free_t  tables[0];
+       unsigned long   tables[0];
 };
 
 #define PTE_FREELIST_SIZE \
        ((PAGE_SIZE - sizeof(struct pte_freelist_batch)) \
-         / sizeof(pgtable_free_t))
+         / sizeof(unsigned long))
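Editorial aside: a rough capacity estimate for the batch page, using assumed
structure sizes purely for illustration.

/* With PAGE_SIZE == 4096 and a struct pte_freelist_batch header of about
 * 24 bytes (a 16-byte rcu_head plus a padded unsigned int), this works out
 * to PTE_FREELIST_SIZE == (4096 - 24) / 8 == 509 packed pointer+shift words
 * per batch page.
 */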
 
 static void pte_free_smp_sync(void *arg)
 {
 /* This is only called when we are critically out of memory
  * (and fail to get a page in pte_free_tlb).
  */
-static void pgtable_free_now(pgtable_free_t pgf)
+static void pgtable_free_now(void *table, unsigned shift)
 {
        pte_freelist_forced_free++;
 
        smp_call_function(pte_free_smp_sync, NULL, 1);
 
-       pgtable_free(pgf);
+       pgtable_free(table, shift);
 }
 
 static void pte_free_rcu_callback(struct rcu_head *head)
                container_of(head, struct pte_freelist_batch, rcu);
        unsigned int i;
 
-       for (i = 0; i < batch->index; i++)
-               pgtable_free(batch->tables[i]);
+       for (i = 0; i < batch->index; i++) {
+               void *table = (void *)(batch->tables[i] & ~MAX_PGTABLE_INDEX_SIZE);
+               unsigned shift = batch->tables[i] & MAX_PGTABLE_INDEX_SIZE;
+
+               pgtable_free(table, shift);
+       }
 
        free_page((unsigned long)batch);
 }
        call_rcu(&batch->rcu, pte_free_rcu_callback);
 }
 
-void pgtable_free_tlb(struct mmu_gather *tlb, pgtable_free_t pgf)
+void pgtable_free_tlb(struct mmu_gather *tlb, void *table, unsigned shift)
 {
        /* This is safe since tlb_gather_mmu has disabled preemption */
        struct pte_freelist_batch **batchp = &__get_cpu_var(pte_freelist_cur);
+       unsigned long pgf;
 
        if (atomic_read(&tlb->mm->mm_users) < 2 ||
            cpumask_equal(mm_cpumask(tlb->mm), cpumask_of(smp_processor_id()))){
-               pgtable_free(pgf);
+               pgtable_free(table, shift);
                return;
        }
 
        if (*batchp == NULL) {
                *batchp = (struct pte_freelist_batch *)__get_free_page(GFP_ATOMIC);
                if (*batchp == NULL) {
-                       pgtable_free_now(pgf);
+                       pgtable_free_now(table, shift);
                        return;
                }
                (*batchp)->index = 0;
        }
+       BUG_ON(shift > MAX_PGTABLE_INDEX_SIZE);
+       pgf = (unsigned long)table | shift;
        (*batchp)->tables[(*batchp)->index++] = pgf;
        if ((*batchp)->index == PTE_FREELIST_SIZE) {
                pte_free_submit(*batchp);