return gva_n - offset;
 }
 
-static void hyperv_flush_tlb_others(const struct cpumask *cpus,
-                                   const struct flush_tlb_info *info)
+static void hyperv_flush_tlb_multi(const struct cpumask *cpus,
+                                  const struct flush_tlb_info *info)
 {
        int cpu, vcpu, gva_n, max_gvas;
        struct hv_tlb_flush **flush_pcpu;
        u64 status = U64_MAX;
        unsigned long flags;
 
-       trace_hyperv_mmu_flush_tlb_others(cpus, info);
+       trace_hyperv_mmu_flush_tlb_multi(cpus, info);
 
        if (!hv_hypercall_pg)
                goto do_native;
        if (!(status & HV_HYPERCALL_RESULT_MASK))
                return;
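+       /*
+        * Either the hypercall page is unavailable or the hypercall failed;
+        * fall back to the IPI-based native flush.
+        */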
 do_native:
-       native_flush_tlb_others(cpus, info);
+       native_flush_tlb_multi(cpus, info);
 }
 
 static u64 hyperv_flush_tlb_others_ex(const struct cpumask *cpus,
                return;
 
        pr_info("Using hypercall for remote TLB flush\n");
-       pv_ops.mmu.flush_tlb_others = hyperv_flush_tlb_others;
+       pv_ops.mmu.flush_tlb_multi = hyperv_flush_tlb_multi;
        pv_ops.mmu.tlb_remove_table = tlb_remove_table;
 }
 
 void native_flush_tlb_local(void);
 void native_flush_tlb_global(void);
 void native_flush_tlb_one_user(unsigned long addr);
-void native_flush_tlb_others(const struct cpumask *cpumask,
+void native_flush_tlb_multi(const struct cpumask *cpumask,
                             const struct flush_tlb_info *info);
 
 static inline void __flush_tlb_local(void)
        PVOP_VCALL1(mmu.flush_tlb_one_user, addr);
 }
 
-static inline void __flush_tlb_others(const struct cpumask *cpumask,
+static inline void __flush_tlb_multi(const struct cpumask *cpumask,
                                      const struct flush_tlb_info *info)
 {
-       PVOP_VCALL2(mmu.flush_tlb_others, cpumask, info);
+       PVOP_VCALL2(mmu.flush_tlb_multi, cpumask, info);
 }
 
 static inline void paravirt_tlb_remove_table(struct mmu_gather *tlb, void *table)
 
        void (*flush_tlb_user)(void);
        void (*flush_tlb_kernel)(void);
        void (*flush_tlb_one_user)(unsigned long addr);
-       void (*flush_tlb_others)(const struct cpumask *cpus,
-                                const struct flush_tlb_info *info);
+       void (*flush_tlb_multi)(const struct cpumask *cpus,
+                               const struct flush_tlb_info *info);
 
        void (*tlb_remove_table)(struct mmu_gather *tlb, void *table);
 
 
  *  - flush_tlb_page(vma, vmaddr) flushes one page
  *  - flush_tlb_range(vma, start, end) flushes a range of pages
  *  - flush_tlb_kernel_range(start, end) flushes a range of kernel pages
- *  - flush_tlb_others(cpumask, info) flushes TLBs on other cpus
+ *  - flush_tlb_multi(cpumask, info) flushes TLBs on multiple cpus,
+ *    the local CPU included when it is set in cpumask
  *
  * ..but the i386 has somewhat limited tlb flushing capabilities,
  * and page-granular flushes are available only on i486 and up.
 void flush_tlb_local(void);
 void flush_tlb_one_user(unsigned long addr);
 void flush_tlb_one_kernel(unsigned long addr);
-void flush_tlb_others(const struct cpumask *cpumask,
+void flush_tlb_multi(const struct cpumask *cpumask,
                      const struct flush_tlb_info *info);
 
 #ifdef CONFIG_PARAVIRT
 
 
 #if IS_ENABLED(CONFIG_HYPERV)
 
-TRACE_EVENT(hyperv_mmu_flush_tlb_others,
+TRACE_EVENT(hyperv_mmu_flush_tlb_multi,
            TP_PROTO(const struct cpumask *cpus,
                     const struct flush_tlb_info *info),
            TP_ARGS(cpus, info),
 
 }
 #endif
 
-static void kvm_flush_tlb_others(const struct cpumask *cpumask,
+static void kvm_flush_tlb_multi(const struct cpumask *cpumask,
                        const struct flush_tlb_info *info)
 {
        u8 state;
         * queue flush_on_enter for pre-empted vCPUs
         */
        for_each_cpu(cpu, flushmask) {
+               /*
+                * The local vCPU is never preempted, so we do not explicitly
+                * skip the check for the local vCPU - it will never be
+                * cleared from flushmask.
+                */
                src = &per_cpu(steal_time, cpu);
                state = READ_ONCE(src->preempted);
                if ((state & KVM_VCPU_PREEMPTED)) {
                }
        }
 
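+       /*
+        * vCPUs that were found preempted were cleared from flushmask above;
+        * the hypervisor will flush their TLBs the next time they run.
+        */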
-       native_flush_tlb_others(flushmask, info);
+       native_flush_tlb_multi(flushmask, info);
 }
 
 static void __init kvm_guest_init(void)
        }
 
        if (pv_tlb_flush_supported()) {
-               pv_ops.mmu.flush_tlb_others = kvm_flush_tlb_others;
+               pv_ops.mmu.flush_tlb_multi = kvm_flush_tlb_multi;
                pv_ops.mmu.tlb_remove_table = tlb_remove_table;
                pr_info("KVM setup pv remote TLB flush\n");
        }
 
        .mmu.flush_tlb_user     = native_flush_tlb_local,
        .mmu.flush_tlb_kernel   = native_flush_tlb_global,
        .mmu.flush_tlb_one_user = native_flush_tlb_one_user,
-       .mmu.flush_tlb_others   = native_flush_tlb_others,
+       .mmu.flush_tlb_multi    = native_flush_tlb_multi,
        .mmu.tlb_remove_table   =
                        (void (*)(struct mmu_gather *, void *))tlb_remove_page,
 
 
 # define __flush_tlb_local             native_flush_tlb_local
 # define __flush_tlb_global            native_flush_tlb_global
 # define __flush_tlb_one_user(addr)    native_flush_tlb_one_user(addr)
-# define __flush_tlb_others(msk, info) native_flush_tlb_others(msk, info)
+# define __flush_tlb_multi(msk, info)  native_flush_tlb_multi(msk, info)
 #endif
 
 /*
                /*
                 * Even in lazy TLB mode, the CPU should stay set in the
                 * mm_cpumask. The TLB shootdown code can figure out from
-                * from cpu_tlbstate.is_lazy whether or not to send an IPI.
+                * cpu_tlbstate.is_lazy whether or not to send an IPI.
                 */
                if (WARN_ON_ONCE(real_prev != &init_mm &&
                                 !cpumask_test_cpu(cpu, mm_cpumask(next))))
                 * garbage into our TLB.  Since switching to init_mm is barely
                 * slower than a minimal flush, just switch to init_mm.
                 *
-                * This should be rare, with native_flush_tlb_others skipping
+                * This should be rare, with native_flush_tlb_multi() skipping
                 * IPIs to lazy TLB mode CPUs.
                 */
                switch_mm_irqs_off(NULL, &init_mm, NULL);
 
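+/*
+ * Per-CPU scratch cpumask used by native_flush_tlb_multi() to build the
+ * subset of CPUs that are not in lazy TLB mode and therefore need an IPI.
+ */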
 static DEFINE_PER_CPU(cpumask_t, flush_tlb_mask);
 
-STATIC_NOPV void native_flush_tlb_others(const struct cpumask *cpumask,
+STATIC_NOPV void native_flush_tlb_multi(const struct cpumask *cpumask,
                                         const struct flush_tlb_info *info)
 {
+       /*
+        * Do accounting and tracing. Note that there are (and have always been)
+        * cases in which a remote TLB flush will be traced, but does not
+        * eventually happen.
+        */
        count_vm_tlb_event(NR_TLB_REMOTE_FLUSH);
        if (info->end == TLB_FLUSH_ALL)
                trace_tlb_flush(TLB_REMOTE_SEND_IPI, TLB_FLUSH_ALL);
         * doing a speculative memory access.
         */
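+       /*
+        * Unlike smp_call_function_many(), on_each_cpu_mask() also runs
+        * flush_tlb_func() on the local CPU when it is set in the mask, so
+        * the local and remote TLB flushes happen concurrently.
+        */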
        if (info->freed_tables) {
-               smp_call_function_many(cpumask, flush_tlb_func,
-                              (void *)info, 1);
+               on_each_cpu_mask(cpumask, flush_tlb_func, (void *)info, true);
        } else {
                /*
                 * Although we could have used on_each_cpu_cond_mask(),
                        if (tlb_is_not_lazy(cpu))
                                __cpumask_set_cpu(cpu, cond_cpumask);
                }
-               smp_call_function_many(cond_cpumask, flush_tlb_func, (void *)info, 1);
+               on_each_cpu_mask(cond_cpumask, flush_tlb_func, (void *)info, true);
        }
 }
 
-void flush_tlb_others(const struct cpumask *cpumask,
+void flush_tlb_multi(const struct cpumask *cpumask,
                      const struct flush_tlb_info *info)
 {
-       __flush_tlb_others(cpumask, info);
+       __flush_tlb_multi(cpumask, info);
 }
 
 /*
        info = get_flush_tlb_info(mm, start, end, stride_shift, freed_tables,
                                  new_tlb_gen);
 
-       if (mm == this_cpu_read(cpu_tlbstate.loaded_mm)) {
+       /*
+        * flush_tlb_multi() is not optimized for the common case in which only
+        * a local TLB flush is needed. Optimize that case by calling
+        * flush_tlb_func() directly.
+        */
+       if (cpumask_any_but(mm_cpumask(mm), cpu) < nr_cpu_ids) {
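+               /*
+                * At least one CPU other than the local one is set in
+                * mm_cpumask(): flush_tlb_multi() flushes every CPU in the
+                * mask, including the local CPU when it is set.
+                */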
+               flush_tlb_multi(mm_cpumask(mm), info);
+       } else if (mm == this_cpu_read(cpu_tlbstate.loaded_mm)) {
                lockdep_assert_irqs_enabled();
                local_irq_disable();
                flush_tlb_func(info);
                local_irq_enable();
        }
 
-       if (cpumask_any_but(mm_cpumask(mm), cpu) < nr_cpu_ids)
-               flush_tlb_others(mm_cpumask(mm), info);
-
        put_flush_tlb_info();
        put_cpu();
 }
        int cpu = get_cpu();
 
        info = get_flush_tlb_info(NULL, 0, TLB_FLUSH_ALL, 0, false, 0);
-       if (cpumask_test_cpu(cpu, &batch->cpumask)) {
+       /*
+        * flush_tlb_multi() is not optimized for the common case in which only
+        * a local TLB flush is needed. Optimize that case by calling
+        * flush_tlb_func() directly.
+        */
+       if (cpumask_any_but(&batch->cpumask, cpu) < nr_cpu_ids) {
+               flush_tlb_multi(&batch->cpumask, info);
+       } else if (cpumask_test_cpu(cpu, &batch->cpumask)) {
                lockdep_assert_irqs_enabled();
                local_irq_disable();
                flush_tlb_func(info);
                local_irq_enable();
        }
 
-       if (cpumask_any_but(&batch->cpumask, cpu) < nr_cpu_ids)
-               flush_tlb_others(&batch->cpumask, info);
-
        cpumask_clear(&batch->cpumask);
 
        put_flush_tlb_info();
 
        preempt_enable();
 }
 
-static void xen_flush_tlb_others(const struct cpumask *cpus,
-                                const struct flush_tlb_info *info)
+static void xen_flush_tlb_multi(const struct cpumask *cpus,
+                               const struct flush_tlb_info *info)
 {
        struct {
                struct mmuext_op op;
        const size_t mc_entry_size = sizeof(args->op) +
                sizeof(args->mask[0]) * BITS_TO_LONGS(num_possible_cpus());
 
-       trace_xen_mmu_flush_tlb_others(cpus, info->mm, info->start, info->end);
+       trace_xen_mmu_flush_tlb_multi(cpus, info->mm, info->start, info->end);
 
        if (cpumask_empty(cpus))
                return;         /* nothing to do */
        args = mcs.args;
        args->op.arg2.vcpumask = to_cpumask(args->mask);
 
-       /* Remove us, and any offline CPUS. */
+       /* Remove any offline CPUs */
        cpumask_and(to_cpumask(args->mask), cpus, cpu_online_mask);
-       cpumask_clear_cpu(smp_processor_id(), to_cpumask(args->mask));
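+       /*
+        * The local CPU is deliberately kept in the mask: flush_tlb_multi()
+        * expects the local TLB to be flushed as well when this CPU is
+        * included in @cpus.
+        */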
 
        args->op.cmd = MMUEXT_TLB_FLUSH_MULTI;
        if (info->end != TLB_FLUSH_ALL &&
        .flush_tlb_user = xen_flush_tlb,
        .flush_tlb_kernel = xen_flush_tlb,
        .flush_tlb_one_user = xen_flush_tlb_one_user,
-       .flush_tlb_others = xen_flush_tlb_others,
+       .flush_tlb_multi = xen_flush_tlb_multi,
        .tlb_remove_table = tlb_remove_table,
 
        .pgd_alloc = xen_pgd_alloc,
 
            TP_printk("addr %lx", __entry->addr)
        );
 
-TRACE_EVENT(xen_mmu_flush_tlb_others,
+TRACE_EVENT(xen_mmu_flush_tlb_multi,
            TP_PROTO(const struct cpumask *cpus, struct mm_struct *mm,
                     unsigned long addr, unsigned long end),
            TP_ARGS(cpus, mm, addr, end),