#ifndef __MMU_H
 #define __MMU_H
 
+#include <linux/cpumask.h>
 #include <linux/errno.h>
 
 typedef struct {
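+       /* CPUs the mm is attached to (maintained only with MACHINE_HAS_TLB_LC) */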
+       cpumask_t cpu_attach_mask;
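+       /*
+        * Lower 16 bits: number of CPUs the mm is attached to,
+        * upper 16 bits: number of TLB flushes in progress.
+        */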
        atomic_t attach_count;
        unsigned int flush_mm;
        spinlock_t list_lock;
 
 static inline int init_new_context(struct task_struct *tsk,
                                   struct mm_struct *mm)
 {
+       cpumask_clear(&mm->context.cpu_attach_mask);
        atomic_set(&mm->context.attach_count, 0);
        mm->context.flush_mm = 0;
        mm->context.asce_bits = _ASCE_TABLE_LENGTH | _ASCE_USER_BITS;
 
        if (prev == next)
                return;
+       if (MACHINE_HAS_TLB_LC)
+               cpumask_set_cpu(cpu, &next->context.cpu_attach_mask);
        if (atomic_inc_return(&next->context.attach_count) >> 16) {
                /* Delay update_user_asce until all TLB flushes are done. */
                set_tsk_thread_flag(tsk, TIF_TLB_WAIT);
        }
        atomic_dec(&prev->context.attach_count);
        WARN_ON(atomic_read(&prev->context.attach_count) < 0);
+       if (MACHINE_HAS_TLB_LC)
+               cpumask_clear_cpu(cpu, &prev->context.cpu_attach_mask);
 }
 
 #define finish_arch_post_lock_switch finish_arch_post_lock_switch
 
                : "=m" (*ptep) : "m" (*ptep), "a" (pto), "a" (address));
 }
 
+static inline void __ptep_ipte_local(unsigned long address, pte_t *ptep)
+{
+       unsigned long pto = (unsigned long) ptep;
+
+#ifndef CONFIG_64BIT
+       /* pto in ESA mode must point to the start of the segment table */
+       pto &= 0x7ffffc00;
+#endif
+       /* Invalidation + local TLB flush for the pte */
+       asm volatile(
+               "       .insn rrf,0xb2210000,%2,%3,0,1"
+               : "=m" (*ptep) : "m" (*ptep), "a" (pto), "a" (address));
+}
+
 static inline void ptep_flush_direct(struct mm_struct *mm,
                                     unsigned long address, pte_t *ptep)
 {
+       int active, count;
+
        if (pte_val(*ptep) & _PAGE_INVALID)
                return;
-       __ptep_ipte(address, ptep);
+       active = (mm == current->active_mm) ? 1 : 0;
+       count = atomic_add_return(0x10000, &mm->context.attach_count);
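+       /*
+        * A local IPTE is enough if no other CPU has the mm attached
+        * (lower 16 bits of attach_count) and no other CPU can hold
+        * TLB entries for it (mm_cpumask).
+        */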
+       if (MACHINE_HAS_TLB_LC && (count & 0xffff) <= active &&
+           cpumask_equal(mm_cpumask(mm), cpumask_of(smp_processor_id())))
+               __ptep_ipte_local(address, ptep);
+       else
+               __ptep_ipte(address, ptep);
+       atomic_sub(0x10000, &mm->context.attach_count);
 }
 
 static inline void ptep_flush_lazy(struct mm_struct *mm,
 #define pte_offset_map(pmd, address) pte_offset_kernel(pmd, address)
 #define pte_unmap(pte) do { } while (0)
 
-static inline void __pmd_idte(unsigned long address, pmd_t *pmdp)
-{
-       unsigned long sto = (unsigned long) pmdp -
-                           pmd_index(address) * sizeof(pmd_t);
-
-       if (!(pmd_val(*pmdp) & _SEGMENT_ENTRY_INVALID)) {
-               asm volatile(
-                       "       .insn   rrf,0xb98e0000,%2,%3,0,0"
-                       : "=m" (*pmdp)
-                       : "m" (*pmdp), "a" (sto),
-                         "a" ((address & HPAGE_MASK))
-                       : "cc"
-               );
-       }
-}
-
-static inline void __pmd_csp(pmd_t *pmdp)
-{
-       register unsigned long reg2 asm("2") = pmd_val(*pmdp);
-       register unsigned long reg3 asm("3") = pmd_val(*pmdp) |
-                                              _SEGMENT_ENTRY_INVALID;
-       register unsigned long reg4 asm("4") = ((unsigned long) pmdp) + 5;
-
-       asm volatile(
-               "       csp %1,%3"
-               : "=m" (*pmdp)
-               : "d" (reg2), "d" (reg3), "d" (reg4), "m" (*pmdp) : "cc");
-}
-
 #if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_HUGETLB_PAGE)
 static inline unsigned long massage_pgprot_pmd(pgprot_t pgprot)
 {
 }
 #endif /* CONFIG_TRANSPARENT_HUGEPAGE || CONFIG_HUGETLB_PAGE */
 
+static inline void __pmdp_csp(pmd_t *pmdp)
+{
+       register unsigned long reg2 asm("2") = pmd_val(*pmdp);
+       register unsigned long reg3 asm("3") = pmd_val(*pmdp) |
+                                              _SEGMENT_ENTRY_INVALID;
+       register unsigned long reg4 asm("4") = ((unsigned long) pmdp) + 5;
+
+       asm volatile(
+               "       csp %1,%3"
+               : "=m" (*pmdp)
+               : "d" (reg2), "d" (reg3), "d" (reg4), "m" (*pmdp) : "cc");
+}
+
+static inline void __pmdp_idte(unsigned long address, pmd_t *pmdp)
+{
+       unsigned long sto;
+
+       sto = (unsigned long) pmdp - pmd_index(address) * sizeof(pmd_t);
+       asm volatile(
+               "       .insn   rrf,0xb98e0000,%2,%3,0,0"
+               : "=m" (*pmdp)
+               : "m" (*pmdp), "a" (sto), "a" ((address & HPAGE_MASK))
+               : "cc" );
+}
+
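+/* Same as __pmdp_idte, but the TLB entry is cleared on the local CPU only */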
+static inline void __pmdp_idte_local(unsigned long address, pmd_t *pmdp)
+{
+       unsigned long sto;
+
+       sto = (unsigned long) pmdp - pmd_index(address) * sizeof(pmd_t);
+       asm volatile(
+               "       .insn   rrf,0xb98e0000,%2,%3,0,1"
+               : "=m" (*pmdp)
+               : "m" (*pmdp), "a" (sto), "a" ((address & HPAGE_MASK))
+               : "cc" );
+}
+
+static inline void pmdp_flush_direct(struct mm_struct *mm,
+                                    unsigned long address, pmd_t *pmdp)
+{
+       int active, count;
+
+       if (pmd_val(*pmdp) & _SEGMENT_ENTRY_INVALID)
+               return;
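+       /* Without IDTE fall back to compare-and-swap-and-purge (global) */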
+       if (!MACHINE_HAS_IDTE) {
+               __pmdp_csp(pmdp);
+               return;
+       }
+       active = (mm == current->active_mm) ? 1 : 0;
+       count = atomic_add_return(0x10000, &mm->context.attach_count);
+       if (MACHINE_HAS_TLB_LC && (count & 0xffff) <= active &&
+           cpumask_equal(mm_cpumask(mm), cpumask_of(smp_processor_id())))
+               __pmdp_idte_local(address, pmdp);
+       else
+               __pmdp_idte(address, pmdp);
+       atomic_sub(0x10000, &mm->context.attach_count);
+}
+
 static inline void pmdp_flush_lazy(struct mm_struct *mm,
                                   unsigned long address, pmd_t *pmdp)
 {
        int active, count;
 
+       if (pmd_val(*pmdp) & _SEGMENT_ENTRY_INVALID)
+               return;
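+       /*
+        * If no other CPU is attached the flush can be deferred: mark the
+        * pmd invalid and let a later mm flush (flush_mm) do the TLB work.
+        */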
        active = (mm == current->active_mm) ? 1 : 0;
        count = atomic_add_return(0x10000, &mm->context.attach_count);
        if ((count & 0xffff) <= active) {
                pmd_val(*pmdp) |= _SEGMENT_ENTRY_INVALID;
                mm->context.flush_mm = 1;
-       } else
-               __pmd_idte(address, pmdp);
+       } else if (MACHINE_HAS_IDTE)
+               __pmdp_idte(address, pmdp);
+       else
+               __pmdp_csp(pmdp);
        atomic_sub(0x10000, &mm->context.attach_count);
 }
 
        pmd_t pmd;
 
        pmd = *pmdp;
-       __pmd_idte(address, pmdp);
+       pmdp_flush_direct(vma->vm_mm, address, pmdp);
        *pmdp = pmd_mkold(pmd);
        return pmd_young(pmd);
 }
 {
        pmd_t pmd = *pmdp;
 
-       __pmd_idte(address, pmdp);
+       pmdp_flush_direct(mm, address, pmdp);
        pmd_clear(pmdp);
        return pmd;
 }
 static inline void pmdp_invalidate(struct vm_area_struct *vma,
                                   unsigned long address, pmd_t *pmdp)
 {
-       __pmd_idte(address, pmdp);
+       pmdp_flush_direct(vma->vm_mm, address, pmdp);
 }
 
 #define __HAVE_ARCH_PMDP_SET_WRPROTECT
        pmd_t pmd = *pmdp;
 
        if (pmd_write(pmd)) {
-               __pmd_idte(address, pmdp);
+               pmdp_flush_direct(mm, address, pmdp);
                set_pmd_at(mm, address, pmdp, pmd_wrprotect(pmd));
        }
 }
 
 #define MACHINE_FLAG_TOPOLOGY  (1UL << 14)
 #define MACHINE_FLAG_TE                (1UL << 15)
 #define MACHINE_FLAG_RRBM      (1UL << 16)
+#define MACHINE_FLAG_TLB_LC    (1UL << 17)
 
 #define MACHINE_IS_VM          (S390_lowcore.machine_flags & MACHINE_FLAG_VM)
 #define MACHINE_IS_KVM         (S390_lowcore.machine_flags & MACHINE_FLAG_KVM)
 #define MACHINE_HAS_TOPOLOGY   (0)
 #define MACHINE_HAS_TE         (0)
 #define MACHINE_HAS_RRBM       (0)
+#define MACHINE_HAS_TLB_LC     (0)
 #else /* CONFIG_64BIT */
 #define MACHINE_HAS_IEEE       (1)
 #define MACHINE_HAS_CSP                (1)
 #define MACHINE_HAS_TOPOLOGY   (S390_lowcore.machine_flags & MACHINE_FLAG_TOPOLOGY)
 #define MACHINE_HAS_TE         (S390_lowcore.machine_flags & MACHINE_FLAG_TE)
 #define MACHINE_HAS_RRBM       (S390_lowcore.machine_flags & MACHINE_FLAG_RRBM)
+#define MACHINE_HAS_TLB_LC     (S390_lowcore.machine_flags & MACHINE_FLAG_TLB_LC)
 #endif /* CONFIG_64BIT */
 
 /*
 
 #include <asm/pgalloc.h>
 
 /*
- * Flush all tlb entries on the local cpu.
+ * Flush all TLB entries on the local CPU.
  */
 static inline void __tlb_flush_local(void)
 {
        asm volatile("ptlb" : : : "memory");
 }
 
-#ifdef CONFIG_SMP
 /*
- * Flush all tlb entries on all cpus.
+ * Flush TLB entries for a specific ASCE on all CPUs.
  */
+static inline void __tlb_flush_idte(unsigned long asce)
+{
+       /* Global TLB flush for the mm */
+       asm volatile(
+               "       .insn   rrf,0xb98e0000,0,%0,%1,0"
+               : : "a" (2048), "a" (asce) : "cc");
+}
+
+/*
+ * Flush TLB entries for a specific ASCE on the local CPU.
+ */
+static inline void __tlb_flush_idte_local(unsigned long asce)
+{
+       /* Local TLB flush for the mm */
+       asm volatile(
+               "       .insn   rrf,0xb98e0000,0,%0,%1,1"
+               : : "a" (2048), "a" (asce) : "cc");
+}
+
+#ifdef CONFIG_SMP
 void smp_ptlb_all(void);
 
+/*
+ * Flush all TLB entries on all CPUs.
+ */
 static inline void __tlb_flush_global(void)
 {
        register unsigned long reg2 asm("2");
                : : "d" (reg2), "d" (reg3), "d" (reg4), "m" (dummy) : "cc" );
 }
 
+/*
+ * Flush TLB entries for a specific mm on all CPUs (when gmap is used
+ * this involves multiple ASCEs!).
+ */
 static inline void __tlb_flush_full(struct mm_struct *mm)
 {
-       cpumask_t local_cpumask;
-
        preempt_disable();
-       /*
-        * If the process only ran on the local cpu, do a local flush.
-        */
-       cpumask_copy(&local_cpumask, cpumask_of(smp_processor_id()));
-       if (cpumask_equal(mm_cpumask(mm), &local_cpumask))
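+       /* Mark the flush in progress for concurrent switch_mm (TIF_TLB_WAIT) */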
+       atomic_add(0x10000, &mm->context.attach_count);
+       if (cpumask_equal(mm_cpumask(mm), cpumask_of(smp_processor_id()))) {
+               /* Local TLB flush */
                __tlb_flush_local();
-       else
+       } else {
+               /* Global TLB flush */
                __tlb_flush_global();
+               /* Reset TLB flush mask */
+               if (MACHINE_HAS_TLB_LC)
+                       cpumask_copy(mm_cpumask(mm),
+                                    &mm->context.cpu_attach_mask);
+       }
+       atomic_sub(0x10000, &mm->context.attach_count);
        preempt_enable();
 }
+
+/*
+ * Flush TLB entries for a specific ASCE on all CPUs.
+ */
+static inline void __tlb_flush_asce(struct mm_struct *mm, unsigned long asce)
+{
+       int active, count;
+
+       preempt_disable();
+       active = (mm == current->active_mm) ? 1 : 0;
+       count = atomic_add_return(0x10000, &mm->context.attach_count);
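+       /* Flush locally if no other CPU is attached or can hold TLB entries */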
+       if (MACHINE_HAS_TLB_LC && (count & 0xffff) <= active &&
+           cpumask_equal(mm_cpumask(mm), cpumask_of(smp_processor_id()))) {
+               __tlb_flush_idte_local(asce);
+       } else {
+               if (MACHINE_HAS_IDTE)
+                       __tlb_flush_idte(asce);
+               else
+                       __tlb_flush_global();
+               /* Reset TLB flush mask */
+               if (MACHINE_HAS_TLB_LC)
+                       cpumask_copy(mm_cpumask(mm),
+                                    &mm->context.cpu_attach_mask);
+       }
+       atomic_sub(0x10000, &mm->context.attach_count);
+       preempt_enable();
+}
+
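+/*
+ * Flush all TLB entries of the kernel address space (init_mm).
+ */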
+static inline void __tlb_flush_kernel(void)
+{
+       if (MACHINE_HAS_IDTE)
+               __tlb_flush_idte((unsigned long) init_mm.pgd |
+                                init_mm.context.asce_bits);
+       else
+               __tlb_flush_global();
+}
 #else
-#define __tlb_flush_full(mm)   __tlb_flush_local()
 #define __tlb_flush_global()   __tlb_flush_local()
-#endif
+#define __tlb_flush_full(mm)   __tlb_flush_local()
 
 /*
- * Flush all tlb entries of a page table on all cpus.
+ * Flush TLB entries for a specific ASCE on all CPUs.
  */
-static inline void __tlb_flush_idte(unsigned long asce)
+static inline void __tlb_flush_asce(struct mm_struct *mm, unsigned long asce)
 {
-       asm volatile(
-               "       .insn   rrf,0xb98e0000,0,%0,%1,0"
-               : : "a" (2048), "a" (asce) : "cc" );
+       if (MACHINE_HAS_TLB_LC)
+               __tlb_flush_idte_local(asce);
+       else
+               __tlb_flush_local();
 }
 
+static inline void __tlb_flush_kernel(void)
+{
+       if (MACHINE_HAS_TLB_LC)
+               __tlb_flush_idte_local((unsigned long) init_mm.pgd |
+                                      init_mm.context.asce_bits);
+       else
+               __tlb_flush_local();
+}
+#endif
+
 static inline void __tlb_flush_mm(struct mm_struct * mm)
 {
        /*
         * only ran on the local cpu.
         */
        if (MACHINE_HAS_IDTE && list_empty(&mm->context.gmap_list))
-               __tlb_flush_idte((unsigned long) mm->pgd |
+               __tlb_flush_asce(mm, (unsigned long) mm->pgd |
                                 mm->context.asce_bits);
        else
                __tlb_flush_full(mm);
 static inline void flush_tlb_kernel_range(unsigned long start,
                                          unsigned long end)
 {
-       __tlb_flush_mm(&init_mm);
+       __tlb_flush_kernel();
 }
 
 #endif /* _S390_TLBFLUSH_H */
 
                S390_lowcore.machine_flags |= MACHINE_FLAG_TE;
        if (test_facility(66))
                S390_lowcore.machine_flags |= MACHINE_FLAG_RRBM;
+       if (test_facility(51))
+               S390_lowcore.machine_flags |= MACHINE_FLAG_TLB_LC;
 #endif
 }
 
 
 {
        struct _lowcore *lc = pcpu->lowcore;
 
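+       /* The new CPU runs on init_mm: attach it before it is brought up */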
+       if (MACHINE_HAS_TLB_LC)
+               cpumask_set_cpu(cpu, &init_mm.context.cpu_attach_mask);
+       cpumask_set_cpu(cpu, mm_cpumask(&init_mm));
        atomic_inc(&init_mm.context.attach_count);
        lc->cpu_nr = cpu;
        lc->percpu_offset = __per_cpu_offset[cpu];
                cpu_relax();
        pcpu_free_lowcore(pcpu);
        atomic_dec(&init_mm.context.attach_count);
+       cpumask_clear_cpu(cpu, mm_cpumask(&init_mm));
+       if (MACHINE_HAS_TLB_LC)
+               cpumask_clear_cpu(cpu, &init_mm.context.cpu_attach_mask);
 }
 
 void __noreturn cpu_die(void)
 
        pmd_t *pmdp = (pmd_t *) ptep;
        pte_t pte = huge_ptep_get(ptep);
 
-       if (MACHINE_HAS_IDTE)
-               __pmd_idte(addr, pmdp);
-       else
-               __pmd_csp(pmdp);
+       pmdp_flush_direct(mm, addr, pmdp);
        pmd_val(*pmdp) = _SEGMENT_ENTRY_EMPTY;
        return pte;
 }
 
        __ctl_load(S390_lowcore.kernel_asce, 13, 13);
        arch_local_irq_restore(4UL << (BITS_PER_LONG - 8));
 
-       atomic_set(&init_mm.context.attach_count, 1);
-
        sparse_memory_present_with_active_regions(MAX_NUMNODES);
        sparse_init();
        memset(max_zone_pfns, 0, sizeof(max_zone_pfns));
 
 void __init mem_init(void)
 {
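+       /* The boot CPU is attached to init_mm from the start */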
+       if (MACHINE_HAS_TLB_LC)
+               cpumask_set_cpu(0, &init_mm.context.cpu_attach_mask);
+       cpumask_set_cpu(0, mm_cpumask(&init_mm));
+       atomic_set(&init_mm.context.attach_count, 1);
+
         max_mapnr = max_low_pfn;
         high_memory = (void *) __va(max_low_pfn * PAGE_SIZE);
 
 
 static void gmap_flush_tlb(struct gmap *gmap)
 {
        if (MACHINE_HAS_IDTE)
-               __tlb_flush_idte((unsigned long) gmap->table |
+               __tlb_flush_asce(gmap->mm, (unsigned long) gmap->table |
                                 _ASCE_TYPE_REGION1);
        else
                __tlb_flush_global();
 
        /* Flush tlb. */
        if (MACHINE_HAS_IDTE)
-               __tlb_flush_idte((unsigned long) gmap->table |
+               __tlb_flush_asce(gmap->mm, (unsigned long) gmap->table |
                                 _ASCE_TYPE_REGION1);
        else
                __tlb_flush_global();
 
        }
        ret = 0;
 out:
-       flush_tlb_kernel_range(start, end);
        return ret;
 }
 
        memset((void *)start, 0, end - start);
        ret = 0;
 out:
-       flush_tlb_kernel_range(start, end);
        return ret;
 }