         * Relying on flush_tlb_fix_spurious_fault would suffice, but
         * the extra traps reduce performance.  So, eagerly SFENCE.VMA.
         */
-       local_flush_tlb_page(address);
+       flush_tlb_page(vma, address);
 }
 
 #define __HAVE_ARCH_UPDATE_MMU_TLB
 
 static inline void local_flush_tlb_page(unsigned long addr)
 {
        ALT_FLUSH_TLB_PAGE(__asm__ __volatile__ ("sfence.vma %0" : : "r" (addr) : "memory"));
 }
+
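+/* Flush all non-global TLB entries tagged with @asid on the local hart. */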
+static inline void local_flush_tlb_all_asid(unsigned long asid)
+{
+       __asm__ __volatile__ ("sfence.vma x0, %0"
+                       :
+                       : "r" (asid)
+                       : "memory");
+}
+
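+/* Flush the local TLB entry for @addr in the address space tagged with @asid. */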
+static inline void local_flush_tlb_page_asid(unsigned long addr,
+               unsigned long asid)
+{
+       __asm__ __volatile__ ("sfence.vma %0, %1"
+                       :
+                       : "r" (addr), "r" (asid)
+                       : "memory");
+}
+
 #else /* CONFIG_MMU */
 #define local_flush_tlb_all()                  do { } while (0)
 #define local_flush_tlb_page(addr)             do { } while (0)
 
 
        if (need_flush_tlb)
                local_flush_tlb_all();
+#ifdef CONFIG_SMP
+       else {
+               cpumask_t *mask = &mm->context.tlb_stale_mask;
+
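+               /*
+                * A flush of this ASID may have been deferred while this mm
+                * was not running on this hart (see __sbi_tlb_flush_range()),
+                * so perform it locally before the mm starts running here.
+                */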
+               if (cpumask_test_cpu(cpu, mask)) {
+                       cpumask_clear_cpu(cpu, mask);
+                       local_flush_tlb_all_asid(cntx & asid_mask);
+               }
+       }
+#endif
 }
 
 static void set_mm_noasid(struct mm_struct *mm)
 
 #include <linux/sched.h>
 #include <asm/sbi.h>
 #include <asm/mmu_context.h>
-
-static inline void local_flush_tlb_all_asid(unsigned long asid)
-{
-       __asm__ __volatile__ ("sfence.vma x0, %0"
-                       :
-                       : "r" (asid)
-                       : "memory");
-}
-
-static inline void local_flush_tlb_page_asid(unsigned long addr,
-               unsigned long asid)
-{
-       __asm__ __volatile__ ("sfence.vma %0, %1"
-                       :
-                       : "r" (addr), "r" (asid)
-                       : "memory");
-}
+#include <asm/tlbflush.h>
 
 void flush_tlb_all(void)
 {
 static void __sbi_tlb_flush_range(struct mm_struct *mm, unsigned long start,
                                  unsigned long size, unsigned long stride)
 {
+       struct cpumask *pmask = &mm->context.tlb_stale_mask;
        struct cpumask *cmask = mm_cpumask(mm);
        unsigned int cpuid;
        bool broadcast;
        if (static_branch_unlikely(&use_asid_allocator)) {
                unsigned long asid = atomic_long_read(&mm->context.id);
 
+               /*
+                * TLB will be immediately flushed on harts concurrently
+                * executing this MM context. TLB flush on other harts
+                * is deferred until this MM context migrates there.
+                */
+               cpumask_setall(pmask);
+               cpumask_clear_cpu(cpuid, pmask);
+               cpumask_andnot(pmask, pmask, cmask);
+
                if (broadcast) {
                        sbi_remote_sfence_vma_asid(cmask, start, size, asid);
                } else if (size <= stride) {