int using_mmu_notifiers;
        u32 hpt_order;
        atomic_t vcpus_running;
+       u32 online_vcores;
        unsigned long hpt_npte;
        unsigned long hpt_mask;
        atomic_t hpte_mod_interest;
        spinlock_t slot_phys_lock;
-       unsigned short last_vcpu[NR_CPUS];
+       cpumask_t need_tlb_flush;
        struct kvmppc_vcore *vcores[KVM_MAX_VCORES];
        struct kvmppc_linear_info *hpt_li;
 #endif /* CONFIG_KVM_BOOK3S_64_HV */
        int nap_count;
        int napping_threads;
        u16 pcpu;
+       u16 last_cpu;
        u8 vcore_state;
        u8 in_guest;
        struct list_head runnable_threads;
        u64 dec_jiffies;
        u64 dec_expires;
        unsigned long pending_exceptions;
-       u16 last_cpu;
        u8 ceded;
        u8 prodded;
        u32 last_inst;
 
        DEFINE(KVM_HOST_LPCR, offsetof(struct kvm, arch.host_lpcr));
        DEFINE(KVM_HOST_SDR1, offsetof(struct kvm, arch.host_sdr1));
        DEFINE(KVM_TLBIE_LOCK, offsetof(struct kvm, arch.tlbie_lock));
-       DEFINE(KVM_ONLINE_CPUS, offsetof(struct kvm, online_vcpus.counter));
-       DEFINE(KVM_LAST_VCPU, offsetof(struct kvm, arch.last_vcpu));
+       DEFINE(KVM_NEED_FLUSH, offsetof(struct kvm, arch.need_tlb_flush.bits));
        DEFINE(KVM_LPCR, offsetof(struct kvm, arch.lpcr));
        DEFINE(KVM_RMOR, offsetof(struct kvm, arch.rmor));
        DEFINE(KVM_VRMA_SLB_V, offsetof(struct kvm, arch.vrma_slb_v));
        DEFINE(VCPU_SLB, offsetof(struct kvm_vcpu, arch.slb));
        DEFINE(VCPU_SLB_MAX, offsetof(struct kvm_vcpu, arch.slb_max));
        DEFINE(VCPU_SLB_NR, offsetof(struct kvm_vcpu, arch.slb_nr));
-       DEFINE(VCPU_LAST_CPU, offsetof(struct kvm_vcpu, arch.last_cpu));
        DEFINE(VCPU_FAULT_DSISR, offsetof(struct kvm_vcpu, arch.fault_dsisr));
        DEFINE(VCPU_FAULT_DAR, offsetof(struct kvm_vcpu, arch.fault_dar));
        DEFINE(VCPU_LAST_INST, offsetof(struct kvm_vcpu, arch.last_inst));
 
                 * Reset all the reverse-mapping chains for all memslots
                 */
                kvmppc_rmap_reset(kvm);
-               /*
-                * Set the whole last_vcpu array to an invalid vcpu number.
-                * This ensures that each vcpu will flush its TLB on next entry.
-                */
-               memset(kvm->arch.last_vcpu, 0xff, sizeof(kvm->arch.last_vcpu));
+               /* Ensure that each vcpu will flush its TLB on next entry. */
+               cpumask_setall(&kvm->arch.need_tlb_flush);
                *htab_orderp = order;
                err = 0;
        } else {
 
                goto free_vcpu;
 
        vcpu->arch.shared = &vcpu->arch.shregs;
-       vcpu->arch.last_cpu = -1;
        vcpu->arch.mmcr[0] = MMCR0_FC;
        vcpu->arch.ctrl = CTRL_RUNLATCH;
        /* default to host PVR, since we can't spoof it */
                        vcore->preempt_tb = TB_NIL;
                }
                kvm->arch.vcores[core] = vcore;
+               kvm->arch.online_vcores++;
        }
        mutex_unlock(&kvm->lock);
 
                return -ENOMEM;
        kvm->arch.lpid = lpid;
 
+       /*
+        * Since we don't flush the TLB when tearing down a VM,
+        * and this lpid might have previously been used,
+        * make sure we flush on each core before running the new VM.
+        */
+       cpumask_setall(&kvm->arch.need_tlb_flush);
+
        INIT_LIST_HEAD(&kvm->arch.spapr_tce_tables);
 
        kvm->arch.rma = NULL;
 
        return __va(addr);
 }
 
+/* Return 1 if we need to do a global tlbie, 0 if we can use tlbiel */
+static int global_invalidates(struct kvm *kvm, unsigned long flags)
+{
+       int global;
+
+       /*
+        * If there is only one vcore, and it's currently running,
+        * we can use tlbiel as long as we mark all other physical
+        * cores as potentially having stale TLB entries for this lpid.
+        * If we're not using MMU notifiers, we never take pages away
+        * from the guest, so we can use tlbiel if requested.
+        * Otherwise, don't use tlbiel.
+        */
+       if (kvm->arch.online_vcores == 1 && local_paca->kvm_hstate.kvm_vcore)
+               global = 0;
+       else if (kvm->arch.using_mmu_notifiers)
+               global = 1;
+       else
+               global = !(flags & H_LOCAL);
+
+       if (!global) {
+               /* any other core might now have stale TLB entries... */
+               smp_wmb();
+               cpumask_setall(&kvm->arch.need_tlb_flush);
+               cpumask_clear_cpu(local_paca->kvm_hstate.kvm_vcore->pcpu,
+                                 &kvm->arch.need_tlb_flush);
+       }
+
+       return global;
+}
+
 /*
  * Add this HPTE into the chain for the real page.
  * Must be called with the chain locked; it unlocks the chain.
        if (v & HPTE_V_VALID) {
                hpte[0] &= ~HPTE_V_VALID;
                rb = compute_tlbie_rb(v, hpte[1], pte_index);
-               if (!(flags & H_LOCAL) && atomic_read(&kvm->online_vcpus) > 1) {
+               if (global_invalidates(kvm, flags)) {
                        while (!try_lock_tlbie(&kvm->arch.tlbie_lock))
                                cpu_relax();
                        asm volatile("ptesync" : : : "memory");
                return H_NOT_FOUND;
        }
 
-       if (atomic_read(&kvm->online_vcpus) == 1)
-               flags |= H_LOCAL;
        v = hpte[0];
        bits = (flags << 55) & HPTE_R_PP0;
        bits |= (flags << 48) & HPTE_R_KEY_HI;
        if (v & HPTE_V_VALID) {
                rb = compute_tlbie_rb(v, r, pte_index);
                hpte[0] = v & ~HPTE_V_VALID;
-               if (!(flags & H_LOCAL)) {
+               if (global_invalidates(kvm, flags)) {
                        while(!try_lock_tlbie(&kvm->arch.tlbie_lock))
                                cpu_relax();
                        asm volatile("ptesync" : : : "memory");
 
        mtspr   SPRN_SDR1,r6            /* switch to partition page table */
        mtspr   SPRN_LPID,r7
        isync
+
+       /* See if we need to flush the TLB */
+       lhz     r6,PACAPACAINDEX(r13)   /* test_bit(cpu, need_tlb_flush) */
+       clrldi  r7,r6,64-6              /* extract bit number (6 bits) */
+       srdi    r6,r6,6                 /* doubleword number */
+       sldi    r6,r6,3                 /* address offset */
+       add     r6,r6,r9
+       addi    r6,r6,KVM_NEED_FLUSH    /* dword in kvm->arch.need_tlb_flush */
        li      r0,1
+       sld     r0,r0,r7
+       ld      r7,0(r6)
+       and.    r7,r7,r0
+       beq     22f
+23:    ldarx   r7,0,r6                 /* if set, clear the bit */
+       andc    r7,r7,r0
+       stdcx.  r7,0,r6
+       bne     23b
+       li      r6,128                  /* and flush the TLB */
+       mtctr   r6
+       li      r7,0x800                /* IS field = 0b10 */
+       ptesync
+28:    tlbiel  r7
+       addi    r7,r7,0x1000
+       bdnz    28b
+       ptesync
+
+22:    li      r0,1
        stb     r0,VCORE_IN_GUEST(r5)   /* signal secondaries to continue */
        b       10f
 
        mr      r9,r4
        blt     hdec_soon
 
-       /*
-        * Invalidate the TLB if we could possibly have stale TLB
-        * entries for this partition on this core due to the use
-        * of tlbiel.
-        * XXX maybe only need this on primary thread?
-        */
-       ld      r9,VCPU_KVM(r4)         /* pointer to struct kvm */
-       lwz     r5,VCPU_VCPUID(r4)
-       lhz     r6,PACAPACAINDEX(r13)
-       rldimi  r6,r5,0,62              /* XXX map as if threads 1:1 p:v */
-       lhz     r8,VCPU_LAST_CPU(r4)
-       sldi    r7,r6,1                 /* see if this is the same vcpu */
-       add     r7,r7,r9                /* as last ran on this pcpu */
-       lhz     r0,KVM_LAST_VCPU(r7)
-       cmpw    r6,r8                   /* on the same cpu core as last time? */
-       bne     3f
-       cmpw    r0,r5                   /* same vcpu as this core last ran? */
-       beq     1f
-3:     sth     r6,VCPU_LAST_CPU(r4)    /* if not, invalidate partition TLB */
-       sth     r5,KVM_LAST_VCPU(r7)
-       li      r6,128
-       mtctr   r6
-       li      r7,0x800                /* IS field = 0b10 */
-       ptesync
-2:     tlbiel  r7
-       addi    r7,r7,0x1000
-       bdnz    2b
-       ptesync
-1:
-
        /* Save purr/spurr */
        mfspr   r5,SPRN_PURR
        mfspr   r6,SPRN_SPURR