return remap_pfn_range(vma, from, phys_base >> PAGE_SHIFT, size, prot);
 }
 
+#include <asm/tlbflush.h>
 #include <asm-generic/pgtable.h>
 
 /* We provide our own get_unmapped_area to cope with VA holes and
 
         * and 2 stores in this critical code path.  -DaveM
         */
 #define switch_to(prev, next, last)                                    \
-do {   flush_tlb_pending();                                            \
-       save_and_clear_fpu();                                           \
+do {   save_and_clear_fpu();                                           \
        /* If you are tempted to conditionalize the following */        \
        /* so that ASI is only written if it changes, think again. */   \
        __asm__ __volatile__("wr %%g0, %0, %%asi"                       \
 
+/* Bookkeeping for a batch of user TLB/TSB flushes: the address space the
+ * batch belongs to, the number of vaddrs[] slots in use, and the queued
+ * virtual addresses themselves (at most TLB_BATCH_NR of them).
+ */
 struct tlb_batch {
        struct mm_struct *mm;
        unsigned long tlb_nr;
+       /* Set to 1 by arch_enter_lazy_mmu_mode() and back to 0 by
+        * arch_leave_lazy_mmu_mode().
+        */
+       unsigned long active;
        unsigned long vaddrs[TLB_BATCH_NR];
 };
 
 extern void flush_tsb_kernel_range(unsigned long start, unsigned long end);
 extern void flush_tsb_user(struct tlb_batch *tb);
+extern void flush_tsb_user_page(struct mm_struct *mm, unsigned long vaddr);
 
 /* TLB flush operations. */
 
-extern void flush_tlb_pending(void);
+/* Deliberately empty.  Replaces the former flush_tlb_mm() macro, which
+ * expanded to flush_tlb_pending().
+ * NOTE(review): presumably the flush is now issued by the batching code
+ * or when lazy MMU mode is left -- confirm against the callers.
+ */
+static inline void flush_tlb_mm(struct mm_struct *mm)
+{
+}
+
+/* Deliberately empty.  Replaces the former flush_tlb_page() macro, which
+ * expanded to flush_tlb_pending(); neither @vma nor @vmaddr is used.
+ * NOTE(review): presumably the flush is now issued by the batching code
+ * or when lazy MMU mode is left -- confirm against the callers.
+ */
+static inline void flush_tlb_page(struct vm_area_struct *vma,
+                                 unsigned long vmaddr)
+{
+}
+
+/* Deliberately empty.  Replaces the former flush_tlb_range() macro, which
+ * evaluated @start and then called flush_tlb_pending(); no argument is
+ * used any more.
+ * NOTE(review): presumably the flush is now issued by the batching code
+ * or when lazy MMU mode is left -- confirm against the callers.
+ */
+static inline void flush_tlb_range(struct vm_area_struct *vma,
+                                  unsigned long start, unsigned long end)
+{
+}
+
+/* Advertise that this architecture provides its own lazy MMU mode hooks
+ * (declared below).
+ * NOTE(review): presumably tested by <asm-generic/pgtable.h> before it
+ * supplies default hooks -- confirm.
+ */
+#define __HAVE_ARCH_ENTER_LAZY_MMU_MODE
 
-#define flush_tlb_range(vma,start,end) \
-       do { (void)(start); flush_tlb_pending(); } while (0)
-#define flush_tlb_page(vma,addr)       flush_tlb_pending()
-#define flush_tlb_mm(mm)               flush_tlb_pending()
+extern void flush_tlb_pending(void);
+extern void arch_enter_lazy_mmu_mode(void);
+extern void arch_leave_lazy_mmu_mode(void);
+/* An explicit flush request in lazy MMU mode expands to nothing. */
+#define arch_flush_lazy_mmu_mode()      do {} while (0)
 
 /* Local cpu only.  */
 extern void __flush_tlb_all(void);
-
+extern void __flush_tlb_page(unsigned long context, unsigned long vaddr);
 extern void __flush_tlb_kernel_range(unsigned long start, unsigned long end);
 
 #ifndef CONFIG_SMP
        __flush_tlb_kernel_range(start,end); \
 } while (0)
 
+/* !CONFIG_SMP variant: flush the translation for @vaddr in @mm by calling
+ * __flush_tlb_page() directly with the hardware context bits of the mm.
+ * @vaddr is passed through unchanged.
+ */
+static inline void global_flush_tlb_page(struct mm_struct *mm, unsigned long vaddr)
+{
+       __flush_tlb_page(CTX_HWBITS(mm->context), vaddr);
+}
+
 #else /* CONFIG_SMP */
 
 extern void smp_flush_tlb_kernel_range(unsigned long start, unsigned long end);
+extern void smp_flush_tlb_page(struct mm_struct *mm, unsigned long vaddr);
 
 #define flush_tlb_kernel_range(start, end) \
 do {   flush_tsb_kernel_range(start,end); \
        smp_flush_tlb_kernel_range(start, end); \
 } while (0)
 
+/* CONFIG_SMP variant: hand the single-page flush to smp_flush_tlb_page(),
+ * which takes the same (mm, vaddr) arguments.
+ */
+#define global_flush_tlb_page(mm, vaddr) \
+       smp_flush_tlb_page(mm, vaddr)
+
 #endif /* ! CONFIG_SMP */
 
 #endif /* _SPARC64_TLBFLUSH_H */
 
 }
 
 extern unsigned long xcall_flush_tlb_mm;
-extern unsigned long xcall_flush_tlb_pending;
+extern unsigned long xcall_flush_tlb_page;
 extern unsigned long xcall_flush_tlb_kernel_range;
 extern unsigned long xcall_fetch_glob_regs;
 extern unsigned long xcall_fetch_glob_pmu;
        put_cpu();
 }
 
+/* Argument bundle handed to tlb_pending_func(): the three values that
+ * __flush_tlb_pending() takes (context, entry count, address array).
+ */
+struct tlb_pending_info {
+       unsigned long ctx;
+       unsigned long nr;
+       unsigned long *vaddrs;
+};
+
+/* Callback passed to smp_call_function_many() by smp_flush_tlb_pending().
+ * @info points at a struct tlb_pending_info; its fields are unpacked and
+ * forwarded to __flush_tlb_pending().
+ */
+static void tlb_pending_func(void *info)
+{
+       struct tlb_pending_info *t = info;
+
+       __flush_tlb_pending(t->ctx, t->nr, t->vaddrs);
+}
+
+/* Flush the @nr user translations listed in @vaddrs for @mm.
+ *
+ * The flush always runs on the calling cpu via __flush_tlb_pending().
+ * Other cpus in mm_cpumask(mm) are reached through
+ * smp_call_function_many(), unless @mm is the current mm and has a single
+ * user; in that case the cpumask is collapsed to the calling cpu and no
+ * cross call is made.  The whole sequence runs between get_cpu() and
+ * put_cpu().
+ */
 void smp_flush_tlb_pending(struct mm_struct *mm, unsigned long nr, unsigned long *vaddrs)
 {
        u32 ctx = CTX_HWBITS(mm->context);
+       struct tlb_pending_info info;
        int cpu = get_cpu();
 
+       /* Package the arguments for tlb_pending_func(). */
+       info.ctx = ctx;
+       info.nr = nr;
+       info.vaddrs = vaddrs;
+
        if (mm == current->mm && atomic_read(&mm->mm_users) == 1)
                cpumask_copy(mm_cpumask(mm), cpumask_of(cpu));
        else
-               smp_cross_call_masked(&xcall_flush_tlb_pending,
-                                     ctx, nr, (unsigned long) vaddrs,
-                                     mm_cpumask(mm));
+               /* Final argument 1 requests a synchronous call; 'info'
+                * lives on this stack frame, so it must stay valid until
+                * the remote callbacks have run.
+                */
+               smp_call_function_many(mm_cpumask(mm), tlb_pending_func,
+                                      &info, 1);
 
        __flush_tlb_pending(ctx, nr, vaddrs);
 
        put_cpu();
 }
 
+/* Flush the translation for a single @vaddr of @mm.
+ *
+ * Same structure as smp_flush_tlb_pending(): if @mm is the current mm and
+ * has a single user, mm_cpumask(mm) is collapsed to the calling cpu;
+ * otherwise xcall_flush_tlb_page is cross-called on the cpus in
+ * mm_cpumask(mm) with (context, vaddr, 0) as arguments.  The calling cpu
+ * then always flushes locally through __flush_tlb_page().
+ */
+void smp_flush_tlb_page(struct mm_struct *mm, unsigned long vaddr)
+{
+       unsigned long context = CTX_HWBITS(mm->context);
+       int cpu = get_cpu();
+
+       if (mm == current->mm && atomic_read(&mm->mm_users) == 1)
+               cpumask_copy(mm_cpumask(mm), cpumask_of(cpu));
+       else
+               smp_cross_call_masked(&xcall_flush_tlb_page,
+                                     context, vaddr, 0,
+                                     mm_cpumask(mm));
+       __flush_tlb_page(context, vaddr);
+
+       put_cpu();
+}
+
 void smp_flush_tlb_kernel_range(unsigned long start, unsigned long end)
 {
        start &= PAGE_MASK;
 
+/* Drain this cpu's tlb_batch.
+ *
+ * Does nothing when the batch is empty.  Otherwise the TSB entries are
+ * flushed first via flush_tsb_user(); then, if the mm still has a valid
+ * context, the TLB is flushed: a one-entry batch goes through
+ * global_flush_tlb_page(), larger batches through the pending-flush path.
+ * The batch count is reset afterwards.  The per-cpu batch is held with
+ * get_cpu_var()/put_cpu_var() for the duration.
+ * NOTE(review): the non-SMP (#else) arm of the CONFIG_SMP conditional is
+ * not fully visible in this excerpt.
+ */
 void flush_tlb_pending(void)
 {
        struct tlb_batch *tb = &get_cpu_var(tlb_batch);
+       struct mm_struct *mm = tb->mm;
 
-       if (tb->tlb_nr) {
-               flush_tsb_user(tb);
+       if (!tb->tlb_nr)
+               goto out;
 
-               if (CTX_VALID(tb->mm->context)) {
+       flush_tsb_user(tb);
+
+       if (CTX_VALID(mm->context)) {
+               if (tb->tlb_nr == 1) {
+                       global_flush_tlb_page(mm, tb->vaddrs[0]);
+               } else {
 #ifdef CONFIG_SMP
                        smp_flush_tlb_pending(tb->mm, tb->tlb_nr,
                                              &tb->vaddrs[0]);
                                            tb->tlb_nr, &tb->vaddrs[0]);
 #endif
                }
-               tb->tlb_nr = 0;
        }
 
+       tb->tlb_nr = 0;
+
+out:
        put_cpu_var(tlb_batch);
 }
 
+/* Mark this cpu's tlb_batch as active.  tlb_batch_add_one() checks this
+ * flag and flushes immediately, without queueing, when it is clear.
+ * NOTE(review): the per-cpu batch is accessed with __get_cpu_var(), i.e.
+ * without disabling preemption here -- presumably callers guarantee that;
+ * confirm.
+ */
+void arch_enter_lazy_mmu_mode(void)
+{
+       struct tlb_batch *tb = &__get_cpu_var(tlb_batch);
+
+       tb->active = 1;
+}
+
+/* Leave lazy MMU mode on this cpu: drain any queued flushes through
+ * flush_tlb_pending(), then mark the batch inactive.
+ * NOTE(review): the per-cpu batch is accessed with __get_cpu_var(), i.e.
+ * without disabling preemption here -- presumably callers guarantee that;
+ * confirm.
+ */
+void arch_leave_lazy_mmu_mode(void)
+{
+       struct tlb_batch *tb = &__get_cpu_var(tlb_batch);
+
+       if (tb->tlb_nr)
+               flush_tlb_pending();
+       tb->active = 0;
+}
+
 static void tlb_batch_add_one(struct mm_struct *mm, unsigned long vaddr,
                              bool exec)
 {
                nr = 0;
        }
 
+       if (!tb->active) {
+               global_flush_tlb_page(mm, vaddr);
+               flush_tsb_user_page(mm, vaddr);
+               return;
+       }
+
        if (nr == 0)
                tb->mm = mm;
 
 
 #include <linux/preempt.h>
 #include <linux/slab.h>
 #include <asm/page.h>
-#include <asm/tlbflush.h>
-#include <asm/tlb.h>
-#include <asm/mmu_context.h>
 #include <asm/pgtable.h>
+#include <asm/mmu_context.h>
 #include <asm/tsb.h>
+#include <asm/tlb.h>
 #include <asm/oplib.h>
 
 extern struct tsb swapper_tsb[KERNEL_TSB_NENTRIES];
        }
 }
 
+/* Flush the TSB slot that virtual address @v hashes to.
+ *
+ * @tsb is the base address of the table, @hash_shift and @nentries are
+ * passed to tsb_hash() to pick the slot, and the tag handed to
+ * tsb_flush() is @v shifted right by 22 bits.
+ */
-static void __flush_tsb_one(struct tlb_batch *tb, unsigned long hash_shift,
-                           unsigned long tsb, unsigned long nentries)
+static void __flush_tsb_one_entry(unsigned long tsb, unsigned long v,
+                                 unsigned long hash_shift,
+                                 unsigned long nentries)
 {
-       unsigned long i;
+       unsigned long tag, ent, hash;
 
-       for (i = 0; i < tb->tlb_nr; i++) {
-               unsigned long v = tb->vaddrs[i];
-               unsigned long tag, ent, hash;
+       /* Clear bit 0 before hashing.
+        * NOTE(review): bit 0 looks like a flag carried in the address
+        * (the assembly flush routines test it before an I-MMU demap).
+        */
+       v &= ~0x1UL;
+       hash = tsb_hash(v, hash_shift, nentries);
+       ent = tsb + (hash * sizeof(struct tsb));
+       tag = (v >> 22UL);
 
-               v &= ~0x1UL;
+       tsb_flush(ent, tag);
+}
 
-               hash = tsb_hash(v, hash_shift, nentries);
-               ent = tsb + (hash * sizeof(struct tsb));
-               tag = (v >> 22UL);
+/* Flush the TSB slot of every address queued in @tb by calling
+ * __flush_tsb_one_entry() once per vaddrs[] entry.
+ */
+static void __flush_tsb_one(struct tlb_batch *tb, unsigned long hash_shift,
+                           unsigned long tsb, unsigned long nentries)
+{
+       unsigned long i;
 
-               tsb_flush(ent, tag);
-       }
+       for (i = 0; i < tb->tlb_nr; i++)
+               __flush_tsb_one_entry(tsb, tb->vaddrs[i], hash_shift, nentries);
 }
 
 void flush_tsb_user(struct tlb_batch *tb)
        spin_unlock_irqrestore(&mm->context.lock, flags);
 }
 
+/* Flush the TSB entries for a single @vaddr of @mm.
+ *
+ * Runs under mm->context.lock with interrupts saved and disabled.  The
+ * base-page TSB is always flushed.  The huge-page TSB is flushed too when
+ * huge page support is configured and that TSB has been allocated.
+ */
+void flush_tsb_user_page(struct mm_struct *mm, unsigned long vaddr)
+{
+       unsigned long nentries, base, flags;
+
+       spin_lock_irqsave(&mm->context.lock, flags);
+
+       base = (unsigned long) mm->context.tsb_block[MM_TSB_BASE].tsb;
+       nentries = mm->context.tsb_block[MM_TSB_BASE].tsb_nentries;
+       /* On these TLB types the table is addressed by its physical
+        * address.
+        */
+       if (tlb_type == cheetah_plus || tlb_type == hypervisor)
+               base = __pa(base);
+       __flush_tsb_one_entry(base, vaddr, PAGE_SHIFT, nentries);
+
+#if defined(CONFIG_HUGETLB_PAGE) || defined(CONFIG_TRANSPARENT_HUGEPAGE)
+       /* Same steps for the huge-page TSB, hashed with HPAGE_SHIFT. */
+       if (mm->context.tsb_block[MM_TSB_HUGE].tsb) {
+               base = (unsigned long) mm->context.tsb_block[MM_TSB_HUGE].tsb;
+               nentries = mm->context.tsb_block[MM_TSB_HUGE].tsb_nentries;
+               if (tlb_type == cheetah_plus || tlb_type == hypervisor)
+                       base = __pa(base);
+               __flush_tsb_one_entry(base, vaddr, HPAGE_SHIFT, nentries);
+       }
+#endif
+       spin_unlock_irqrestore(&mm->context.lock, flags);
+}
+
 #define HV_PGSZ_IDX_BASE       HV_PGSZ_IDX_8K
 #define HV_PGSZ_MASK_BASE      HV_PGSZ_MASK_8K
 
 
        nop
        nop
 
+       /* Default single-page TLB flush.  Interrupts are disabled for the
+        * duration (PSTATE_IE cleared, old %pstate kept in %g7 and
+        * restored in the delay slot of the return).  The SECONDARY_CONTEXT
+        * register is saved in %g2, loaded with the caller's context for
+        * the demap, and restored afterwards.  The D-MMU entry is always
+        * demapped; the I-MMU entry only when bit 0 of vaddr is set.
+        * The trailing nops pad the routine to the 22 instructions that the
+        * patching code passes to tlb_patch_one for this symbol.
+        */
+       .align          32
+       .globl          __flush_tlb_page
+__flush_tlb_page:      /* 22 insns */
+       /* %o0 = context, %o1 = vaddr */
+       rdpr            %pstate, %g7
+       andn            %g7, PSTATE_IE, %g2
+       wrpr            %g2, %pstate
+       mov             SECONDARY_CONTEXT, %o4
+       ldxa            [%o4] ASI_DMMU, %g2
+       stxa            %o0, [%o4] ASI_DMMU
+       andcc           %o1, 1, %g0
+       andn            %o1, 1, %o3
+       be,pn           %icc, 1f
+       /* Delay slot, not annulled, so it runs on both paths.
+        * NOTE(review): 0x10 presumably selects the secondary context in
+        * the demap address -- confirm against the MMU documentation.
+        */
+        or             %o3, 0x10, %o3
+       stxa            %g0, [%o3] ASI_IMMU_DEMAP
+1:     stxa            %g0, [%o3] ASI_DMMU_DEMAP
+       membar          #Sync
+       stxa            %g2, [%o4] ASI_DMMU
+       sethi           %hi(KERNBASE), %o4
+       flush           %o4
+       retl
+        wrpr           %g7, 0x0, %pstate
+       nop
+       nop
+       nop
+       nop
+
        .align          32
        .globl          __flush_tlb_pending
 __flush_tlb_pending:   /* 26 insns */
        retl
         wrpr           %g7, 0x0, %pstate
 
+       /* Cheetah replacement for __flush_tlb_page; the patching code
+        * passes both symbols to tlb_patch_one with a count of 22.
+        * Differences from the default routine: %tl is raised to 1 for the
+        * duration, and the PRIMARY_CONTEXT register is used instead of the
+        * secondary one.  The old register value is kept in %g2 and its
+        * nucleus page size fields are merged into the new context value.
+        */
+__cheetah_flush_tlb_page:      /* 22 insns */
+       /* %o0 = context, %o1 = vaddr */
+       rdpr            %pstate, %g7
+       andn            %g7, PSTATE_IE, %g2
+       wrpr            %g2, 0x0, %pstate
+       wrpr            %g0, 1, %tl
+       mov             PRIMARY_CONTEXT, %o4
+       ldxa            [%o4] ASI_DMMU, %g2
+       srlx            %g2, CTX_PGSZ1_NUC_SHIFT, %o3
+       sllx            %o3, CTX_PGSZ1_NUC_SHIFT, %o3
+       or              %o0, %o3, %o0   /* Preserve nucleus page size fields */
+       stxa            %o0, [%o4] ASI_DMMU
+       andcc           %o1, 1, %g0
+       be,pn           %icc, 1f
+       /* Delay slot: strip the flag bit; runs on both paths. */
+        andn           %o1, 1, %o3
+       stxa            %g0, [%o3] ASI_IMMU_DEMAP
+1:     stxa            %g0, [%o3] ASI_DMMU_DEMAP       
+       membar          #Sync
+       /* Put the saved primary context back. */
+       stxa            %g2, [%o4] ASI_DMMU
+       sethi           %hi(KERNBASE), %o4
+       flush           %o4
+       wrpr            %g0, 0, %tl
+       retl
+        wrpr           %g7, 0x0, %pstate
+
 __cheetah_flush_tlb_pending:   /* 27 insns */
        /* %o0 = context, %o1 = nr, %o2 = vaddrs[] */
        rdpr            %pstate, %g7
        retl
         nop
 
+       /* Hypervisor replacement for __flush_tlb_page; the patching code
+        * passes both symbols to tlb_patch_one with a count of 11.
+        * The arguments are rearranged into the trap's order (vaddr,
+        * context, flags) and HV_MMU_UNMAP_ADDR_TRAP is taken.  A nonzero
+        * value left in %o0 after the trap branches to
+        * __hypervisor_tlb_tl0_error with the trap number in %o1.
+        */
+__hypervisor_flush_tlb_page: /* 11 insns */
+       /* %o0 = context, %o1 = vaddr */
+       mov             %o0, %g2
+       mov             %o1, %o0              /* ARG0: vaddr + IMMU-bit */
+       mov             %g2, %o1              /* ARG1: mmu context */
+       mov             HV_MMU_ALL, %o2       /* ARG2: flags */
+       /* Clear the low PAGE_SHIFT bits of the address, which also drops
+        * the flag in bit 0.
+        */
+       srlx            %o0, PAGE_SHIFT, %o0
+       sllx            %o0, PAGE_SHIFT, %o0
+       ta              HV_MMU_UNMAP_ADDR_TRAP
+       brnz,pn         %o0, __hypervisor_tlb_tl0_error
+        mov            HV_MMU_UNMAP_ADDR_TRAP, %o1
+       retl
+        nop
+
 __hypervisor_flush_tlb_pending: /* 16 insns */
        /* %o0 = context, %o1 = nr, %o2 = vaddrs[] */
        sllx            %o1, 3, %g1
        call            tlb_patch_one
         mov            19, %o2
 
+       sethi           %hi(__flush_tlb_page), %o0
+       or              %o0, %lo(__flush_tlb_page), %o0
+       sethi           %hi(__cheetah_flush_tlb_page), %o1
+       or              %o1, %lo(__cheetah_flush_tlb_page), %o1
+       call            tlb_patch_one
+        mov            22, %o2
+
        sethi           %hi(__flush_tlb_pending), %o0
        or              %o0, %lo(__flush_tlb_pending), %o0
        sethi           %hi(__cheetah_flush_tlb_pending), %o1
        nop
        nop
 
-       .globl          xcall_flush_tlb_pending
-xcall_flush_tlb_pending:       /* 21 insns */
-       /* %g5=context, %g1=nr, %g7=vaddrs[] */
-       sllx            %g1, 3, %g1
+       .globl          xcall_flush_tlb_page
+xcall_flush_tlb_page:  /* 17 insns */
+       /* %g5=context, %g1=vaddr */
        mov             PRIMARY_CONTEXT, %g4
        ldxa            [%g4] ASI_DMMU, %g2
        srlx            %g2, CTX_PGSZ1_NUC_SHIFT, %g4
        or              %g5, %g4, %g5
        mov             PRIMARY_CONTEXT, %g4
        stxa            %g5, [%g4] ASI_DMMU
-1:     sub             %g1, (1 << 3), %g1
-       ldx             [%g7 + %g1], %g5
-       andcc           %g5, 0x1, %g0
+       andcc           %g1, 0x1, %g0
        be,pn           %icc, 2f
-
-        andn           %g5, 0x1, %g5
+        andn           %g1, 0x1, %g5
        stxa            %g0, [%g5] ASI_IMMU_DEMAP
 2:     stxa            %g0, [%g5] ASI_DMMU_DEMAP
        membar          #Sync
-       brnz,pt         %g1, 1b
-        nop
        stxa            %g2, [%g4] ASI_DMMU
        retry
        nop
+       nop
 
        .globl          xcall_flush_tlb_kernel_range
 xcall_flush_tlb_kernel_range:  /* 25 insns */
        membar          #Sync
        retry
 
-       .globl          __hypervisor_xcall_flush_tlb_pending
-__hypervisor_xcall_flush_tlb_pending: /* 21 insns */
-       /* %g5=ctx, %g1=nr, %g7=vaddrs[], %g2,%g3,%g4,g6=scratch */
-       sllx            %g1, 3, %g1
+       .globl          __hypervisor_xcall_flush_tlb_page
+__hypervisor_xcall_flush_tlb_page: /* 17 insns */
+       /* %g5=ctx, %g1=vaddr */
        mov             %o0, %g2
        mov             %o1, %g3
        mov             %o2, %g4
-1:     sub             %g1, (1 << 3), %g1
-       ldx             [%g7 + %g1], %o0        /* ARG0: virtual address */
+       mov             %g1, %o0                /* ARG0: virtual address */
        mov             %g5, %o1                /* ARG1: mmu context */
        mov             HV_MMU_ALL, %o2         /* ARG2: flags */
        srlx            %o0, PAGE_SHIFT, %o0
        mov             HV_MMU_UNMAP_ADDR_TRAP, %g6
        brnz,a,pn       %o0, __hypervisor_tlb_xcall_error
         mov            %o0, %g5
-       brnz,pt         %g1, 1b
-        nop
        mov             %g2, %o0
        mov             %g3, %o1
        mov             %g4, %o2
        call            tlb_patch_one
         mov            10, %o2
 
+       sethi           %hi(__flush_tlb_page), %o0
+       or              %o0, %lo(__flush_tlb_page), %o0
+       sethi           %hi(__hypervisor_flush_tlb_page), %o1
+       or              %o1, %lo(__hypervisor_flush_tlb_page), %o1
+       call            tlb_patch_one
+        mov            11, %o2
+
        sethi           %hi(__flush_tlb_pending), %o0
        or              %o0, %lo(__flush_tlb_pending), %o0
        sethi           %hi(__hypervisor_flush_tlb_pending), %o1
        call            tlb_patch_one
         mov            21, %o2
 
-       sethi           %hi(xcall_flush_tlb_pending), %o0
-       or              %o0, %lo(xcall_flush_tlb_pending), %o0
-       sethi           %hi(__hypervisor_xcall_flush_tlb_pending), %o1
-       or              %o1, %lo(__hypervisor_xcall_flush_tlb_pending), %o1
+       sethi           %hi(xcall_flush_tlb_page), %o0
+       or              %o0, %lo(xcall_flush_tlb_page), %o0
+       sethi           %hi(__hypervisor_xcall_flush_tlb_page), %o1
+       or              %o1, %lo(__hypervisor_xcall_flush_tlb_page), %o1
        call            tlb_patch_one
-        mov            21, %o2
+        mov            17, %o2
 
        sethi           %hi(xcall_flush_tlb_kernel_range), %o0
        or              %o0, %lo(xcall_flush_tlb_kernel_range), %o0