        select ARCH_USE_CMPXCHG_LOCKREF         if X86_64
        select ARCH_USE_QUEUED_RWLOCKS
        select ARCH_USE_QUEUED_SPINLOCKS
+       select ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH if SMP
        select ARCH_WANTS_DYNAMIC_TASK_STRUCT
        select ARCH_WANT_FRAME_POINTERS
        select ARCH_WANT_IPC_PARSE_VERSION      if X86_32
 
 
 #endif /* SMP */
 
+/* Not inlined due to inc_irq_stat not being defined yet */
+#define flush_tlb_local() do {         \
+       inc_irq_stat(irq_tlb_count);    \
+       local_flush_tlb();              \
+} while (0)
+
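The do/while (0) wrapper matters for a statement-like macro: with a bare
brace block, the semicolon at a call site would terminate an enclosing
if-statement and orphan its else branch. A minimal illustration (not part
of the patch; do_something_else() is a placeholder):

        if (cond)
                flush_tlb_local();      /* expands to "do { ... } while (0);" */
        else
                do_something_else();    /* with a bare "{ ... };" body this
                                         * else would fail to compile */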
 #ifndef CONFIG_PARAVIRT
 #define flush_tlb_others(mask, mm, start, end) \
        native_flush_tlb_others(mask, mm, start, end)
 
        TTU_IGNORE_MLOCK = (1 << 8),    /* ignore mlock */
        TTU_IGNORE_ACCESS = (1 << 9),   /* don't age */
        TTU_IGNORE_HWPOISON = (1 << 10),/* corrupted page is recoverable */
+       TTU_BATCH_FLUSH = (1 << 11),    /* Batch TLB flushes where possible
+                                        * and caller guarantees they will
+                                        * do a final flush if necessary */
 };
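The comment on TTU_BATCH_FLUSH is the whole contract: a caller that passes
the flag owns the final flush. A minimal sketch of a conforming caller,
modelled on the shrink_page_list() changes later in this patch
(reclaim_batch_sketch() and its page list are illustrative only):

        static unsigned long reclaim_batch_sketch(struct list_head *page_list)
        {
                struct page *page, *next;
                unsigned long nr_unmapped = 0;

                list_for_each_entry_safe(page, next, page_list, lru) {
                        /* PTE is cleared; the TLB flush for a clean PTE
                         * is deferred into current->tlb_ubc */
                        if (try_to_unmap(page, TTU_UNMAP | TTU_BATCH_FLUSH) ==
                            SWAP_SUCCESS)
                                nr_unmapped++;
                }

                /* The caller-guaranteed final flush: must run before any
                 * of these pages are freed or submitted for I/O */
                try_to_unmap_flush();
                return nr_unmapped;
        }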
 
 #ifdef CONFIG_MMU
 
        perf_nr_task_contexts,
 };
 
+/* Track pages that require TLB flushes */
+struct tlbflush_unmap_batch {
+       /*
+        * Each bit set is a CPU that potentially has a TLB entry for one of
+        * the PFNs being flushed. See set_tlb_ubc_flush_pending().
+        */
+       struct cpumask cpumask;
+
+       /* True if any bit in cpumask is set */
+       bool flush_required;
+};
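Embedding struct cpumask directly, rather than an allocated cpumask_var_t,
keeps the unmap path allocation-free at the cost of a fixed NR_CPUS bits in
every task. A hedged size check (a sketch that would sit in any init
function, assuming the usual DECLARE_BITMAP layout of struct cpumask):

        /* e.g. CONFIG_NR_CPUS=512 adds 64 bytes plus flush_required */
        BUILD_BUG_ON(sizeof(struct cpumask) !=
                     BITS_TO_LONGS(NR_CPUS) * sizeof(long));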
+
 struct task_struct {
        volatile long state;    /* -1 unrunnable, 0 runnable, >0 stopped */
        void *stack;
        unsigned long numa_pages_migrated;
 #endif /* CONFIG_NUMA_BALANCING */
 
+#ifdef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH
+       struct tlbflush_unmap_batch tlb_ubc;
+#endif
+
        struct rcu_head rcu;
 
        /*
 
 config ARCH_SUPPORTS_NUMA_BALANCING
        bool
 
+#
+# For architectures that prefer to flush all TLBs after a number of pages
+# are unmapped instead of sending one IPI per page to flush. The architecture
+# must provide guarantees on what happens if a clean TLB cache entry is
+# written after the unmap. Details are in mm/rmap.c near the check for
+# should_defer_flush. The architecture should also consider if the full flush
+# and the refill costs are offset by the savings of sending fewer IPIs.
+config ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH
+       bool
+
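An architecture opts in from its own Kconfig, exactly as the x86 hunk at
the top of this patch does; a hypothetical port (MYARCH is a placeholder)
would carry:

        config MYARCH
                select ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH if SMP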
 #
 # For architectures that know their GCC __int128 support is sound
 #
 
 #define ALLOC_CMA              0x80 /* allow allocations from CMA areas */
 #define ALLOC_FAIR             0x100 /* fair zone allocation */
 
+enum ttu_flags;
+struct tlbflush_unmap_batch;
+
+#ifdef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH
+void try_to_unmap_flush(void);
+#else
+static inline void try_to_unmap_flush(void)
+{
+}
+
+#endif /* CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH */
 #endif /* __MM_INTERNAL_H */
 
 
 #include <asm/tlbflush.h>
 
+#include <trace/events/tlb.h>
+
 #include "internal.h"
 
 static struct kmem_cache *anon_vma_cachep;
        return address;
 }
 
+#ifdef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH
+static void percpu_flush_tlb_batch_pages(void *data)
+{
+       /*
+        * All TLB entries are flushed on the assumption that it is
+        * cheaper to flush all TLBs and let them be refilled than
+        * flushing individual PFNs. Note that we do not track mm's
+        * to flush as that might simply be multiple full TLB flushes
+        * for no gain.
+        */
+       count_vm_tlb_event(NR_TLB_REMOTE_FLUSH_RECEIVED);
+       flush_tlb_local();
+}
+
+/*
+ * Flush TLB entries for recently unmapped pages from remote CPUs. If a PTE
+ * was dirty when it was unmapped, it must be flushed before any I/O is
+ * initiated on the page to prevent lost writes. Similarly, it must be
+ * flushed before the page is freed to prevent data leakage.
+ */
+void try_to_unmap_flush(void)
+{
+       struct tlbflush_unmap_batch *tlb_ubc = &current->tlb_ubc;
+       int cpu;
+
+       if (!tlb_ubc->flush_required)
+               return;
+
+       cpu = get_cpu();
+
+       trace_tlb_flush(TLB_REMOTE_SHOOTDOWN, -1UL);
+
+       if (cpumask_test_cpu(cpu, &tlb_ubc->cpumask))
+               percpu_flush_tlb_batch_pages(&tlb_ubc->cpumask);
+
+       if (cpumask_any_but(&tlb_ubc->cpumask, cpu) < nr_cpu_ids) {
+               smp_call_function_many(&tlb_ubc->cpumask,
+                       percpu_flush_tlb_batch_pages, (void *)tlb_ubc, true);
+       }
+       cpumask_clear(&tlb_ubc->cpumask);
+       tlb_ubc->flush_required = false;
+       put_cpu();
+}
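A subtlety in try_to_unmap_flush() above: smp_call_function_many() never
runs the function on the calling CPU, so the local CPU must be tested and
flushed by hand. The same lines again, annotated (a restatement, not new
patch code):

        cpu = get_cpu();                /* pin; 'cpu' is excluded below */

        /* smp_call_function_many() skips the calling CPU, so flush the
         * local TLB directly when this CPU is in the pending mask ... */
        if (cpumask_test_cpu(cpu, &tlb_ubc->cpumask))
                percpu_flush_tlb_batch_pages(&tlb_ubc->cpumask);

        /* ... and send IPIs only when some *other* CPU is in the mask */
        if (cpumask_any_but(&tlb_ubc->cpumask, cpu) < nr_cpu_ids)
                smp_call_function_many(&tlb_ubc->cpumask,
                        percpu_flush_tlb_batch_pages, (void *)tlb_ubc, true);

        put_cpu();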
+
+static void set_tlb_ubc_flush_pending(struct mm_struct *mm,
+               struct page *page)
+{
+       struct tlbflush_unmap_batch *tlb_ubc = &current->tlb_ubc;
+
+       cpumask_or(&tlb_ubc->cpumask, &tlb_ubc->cpumask, mm_cpumask(mm));
+       tlb_ubc->flush_required = true;
+}
+
+/*
+ * Returns true if the TLB flush should be deferred to the end of a batch of
+ * unmap operations to reduce IPIs.
+ */
+static bool should_defer_flush(struct mm_struct *mm, enum ttu_flags flags)
+{
+       bool should_defer = false;
+
+       if (!(flags & TTU_BATCH_FLUSH))
+               return false;
+
+       /* If remote CPUs need to be flushed, defer the flush so it is batched */
+       if (cpumask_any_but(mm_cpumask(mm), get_cpu()) < nr_cpu_ids)
+               should_defer = true;
+       put_cpu();
+
+       return should_defer;
+}
+#else
+static void set_tlb_ubc_flush_pending(struct mm_struct *mm,
+               struct page *page)
+{
+}
+
+static bool should_defer_flush(struct mm_struct *mm, enum ttu_flags flags)
+{
+       return false;
+}
+#endif /* CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH */
+
 /*
  * At what user virtual address is page expected in vma?
  * Caller should check the page is actually part of the vma.
 
        /* Nuke the page table entry. */
        flush_cache_page(vma, address, page_to_pfn(page));
-       pteval = ptep_clear_flush(vma, address, pte);
+       if (should_defer_flush(mm, flags)) {
+               /*
+                * We clear the PTE but do not flush so potentially a remote
+                * CPU could still be writing to the page. If the entry was
+                * previously clean then the architecture must guarantee that
+                * a clear->dirty transition on a cached TLB entry is either
+                * written through to the PTE or traps if the PTE has been
+                * unmapped.
+                */
+               pteval = ptep_get_and_clear(mm, address, pte);
+
+               /* Potentially writable TLB entries must be flushed before I/O */
+               if (pte_dirty(pteval))
+                       flush_tlb_page(vma, address);
+               else
+                       set_tlb_ubc_flush_pending(mm, page);
+       } else {
+               pteval = ptep_clear_flush(vma, address, pte);
+       }
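The deferred branch above leans on the architecture guarantee named in the
new Kconfig help text. A sketch of the interleaving it must survive,
assuming x86-like hardware dirty-bit handling (illustrative, not from the
patch):

        /*
         *   CPU 0 (reclaim)                  CPU 1 (stale, clean TLB entry)
         *   ---------------                  ------------------------------
         *   pteval = ptep_get_and_clear();
         *   pte_dirty(pteval) == false
         *     -> set_tlb_ubc_flush_pending() store hits the clean entry; the
         *                                    CPU re-walks the page table to
         *                                    set the dirty bit, finds the
         *                                    cleared PTE and faults, so no
         *                                    write lands silently
         *   ...
         *   try_to_unmap_flush()          <- runs before I/O starts or the
         *                                    page is freed, purging stale
         *                                    clean entries everywhere
         */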
 
        /* Move the dirty bit to the physical page now the pte is gone. */
        if (pte_dirty(pteval))
 
                 * processes. Try to unmap it here.
                 */
                if (page_mapped(page) && mapping) {
-                       switch (try_to_unmap(page, ttu_flags)) {
+                       switch (try_to_unmap(page,
+                                       ttu_flags|TTU_BATCH_FLUSH)) {
                        case SWAP_FAIL:
                                goto activate_locked;
                        case SWAP_AGAIN:
        }
 
        mem_cgroup_uncharge_list(&free_pages);
+       try_to_unmap_flush();
        free_hot_cold_page_list(&free_pages, true);
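The pairing above is the ordering the whole series depends on; condensed
(a restatement of the surrounding hunks, not new code):

        /*
         * 1. try_to_unmap(page, ttu_flags | TTU_BATCH_FLUSH)
         *        PTEs cleared; flushes for clean PTEs deferred, dirty
         *        PTEs flushed immediately in try_to_unmap_one()
         * 2. pageout() / swap I/O
         *        safe: any potentially writable entry was flushed in 1
         * 3. try_to_unmap_flush()
         *        stale clean entries shot down on every CPU in the mask
         * 4. free_hot_cold_page_list()
         *        pages are reused with no CPU holding a stale mapping
         */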
 
        list_splice(&ret_pages, page_list);
        }
 }
 
+#ifdef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH
+static void init_tlb_ubc(void)
+{
+       /*
+        * This deliberately does not clear the cpumask as doing so is
+        * expensive and unnecessary. If stale data happens to be in the
+        * mask, the first SWAP_CLUSTER_MAX pages will send an unnecessary
+        * IPI, after which the mask is cleared.
+        */
+       current->tlb_ubc.flush_required = false;
+}
+#else
+static inline void init_tlb_ubc(void)
+{
+}
+#endif /* CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH */
+
 /*
  * This is a basic per-zone page freer.  Used by both kswapd and direct reclaim.
  */
        scan_adjusted = (global_reclaim(sc) && !current_is_kswapd() &&
                         sc->priority == DEF_PRIORITY);
 
+       init_tlb_ubc();
+
        blk_start_plug(&plug);
        while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] ||
                                        nr[LRU_INACTIVE_FILE]) {