*  - Performance monitors (MDCR_EL2_TPM/MDCR_EL2_TPMCR)
  *  - Debug ROM Address (MDCR_EL2_TDRA)
  *  - OS related registers (MDCR_EL2_TDOSA)
+ *  - Statistical profiler (MDCR_EL2_TPMS/MDCR_EL2_E2PB)
  *
  * Additionally, KVM only traps guest accesses to the debug registers if
  * the guest is not actively using them (see the KVM_ARM64_DEBUG_DIRTY
 
        trace_kvm_arm_setup_debug(vcpu, vcpu->guest_debug);
 
+       /*
+        * This also clears MDCR_EL2_E2PB_MASK to disable guest access
+        * to the profiling buffer.
+        */
        vcpu->arch.mdcr_el2 = __this_cpu_read(mdcr_el2) & MDCR_EL2_HPMN_MASK;
        vcpu->arch.mdcr_el2 |= (MDCR_EL2_TPM |
+                               MDCR_EL2_TPMS |
                                MDCR_EL2_TPMCR |
                                MDCR_EL2_TDRA |
                                MDCR_EL2_TDOSA);
 
        default:        write_debug(ptr[0], reg, 0);                    \
        }
 
+/*
+ * SPE (Statistical Profiling Extension) register encodings, spelled out
+ * with sys_reg() here -- presumably because the host sysreg headers do
+ * not (yet) expose them to hyp code. TODO(review): drop these once
+ * generic definitions exist.
+ */
+#define PMSCR_EL1              sys_reg(3, 0, 9, 9, 0)
+
+#define PMBLIMITR_EL1          sys_reg(3, 0, 9, 10, 0)
+/* Profiling Buffer enable bit */
+#define PMBLIMITR_EL1_E                BIT(0)
+
+#define PMBIDR_EL1             sys_reg(3, 0, 9, 10, 7)
+/* Buffer programming prohibited (owned by a higher exception level) */
+#define PMBIDR_EL1_P           BIT(4)
+
+/* PSB CSYNC, emitted via the HINT space so it assembles everywhere */
+#define psb_csync()            asm volatile("hint #17")
+
+/*
+ * VHE flavour of the SPE save: deliberately a no-op, selected at
+ * runtime by the hyp_alternate_select() below. Presumably a VHE host
+ * (running at EL2) keeps control of its SPE context across guest
+ * entry -- TODO(review): confirm against the SPE/VHE trap rules.
+ */
+static void __hyp_text __debug_save_spe_vhe(u64 *pmscr_el1)
+{
+       /* The vcpu can run, but it can't hide. */
+}
+
+/*
+ * Non-VHE flavour: if the host is actively using SPE, save its control
+ * register into *pmscr_el1, stop data generation and drain the buffer,
+ * so profiling is quiesced before the guest runs.
+ *
+ * All three early returns leave *pmscr_el1 untouched;
+ * __debug_restore_spe() treats zero as "nothing to restore", so this
+ * relies on the caller's storage holding 0 in those cases --
+ * NOTE(review): verify the backing field is zeroed/reset per entry.
+ */
+static void __hyp_text __debug_save_spe_nvhe(u64 *pmscr_el1)
+{
+       u64 reg;
+
+       /* SPE present on this CPU? */
+       if (!cpuid_feature_extract_unsigned_field(read_sysreg(id_aa64dfr0_el1),
+                                                 ID_AA64DFR0_PMSVER_SHIFT))
+               return;
+
+       /* Yes; is it owned by EL3? */
+       reg = read_sysreg_s(PMBIDR_EL1);
+       if (reg & PMBIDR_EL1_P)
+               return;
+
+       /* No; is the host actually using the thing? */
+       reg = read_sysreg_s(PMBLIMITR_EL1);
+       if (!(reg & PMBLIMITR_EL1_E))
+               return;
+
+       /* Yes; save the control register and disable data generation */
+       *pmscr_el1 = read_sysreg_s(PMSCR_EL1);
+       write_sysreg_s(0, PMSCR_EL1);
+       /* Make the PMSCR_EL1 write visible before draining the buffer */
+       isb();
+
+       /* Now drain all buffered data to memory */
+       psb_csync();
+       dsb(nsh);
+}
+
+/*
+ * Runtime-patched selector: __debug_save_spe() resolves to the nVHE
+ * save routine by default, or the VHE no-op when the CPU advertises
+ * ARM64_HAS_VIRT_HOST_EXTN.
+ */
+static hyp_alternate_select(__debug_save_spe,
+                           __debug_save_spe_nvhe, __debug_save_spe_vhe,
+                           ARM64_HAS_VIRT_HOST_EXTN);
+
+/*
+ * Re-enable host SPE data generation with the value saved on entry.
+ * Zero means nothing was saved (SPE absent, EL3-owned, or disabled),
+ * so there is nothing to do.
+ *
+ * NOTE(review): the caller's saved copy is not cleared after use;
+ * presumably it is rewritten on every guest entry -- confirm a stale
+ * value cannot be restored on a later exit that saved nothing.
+ */
+static void __hyp_text __debug_restore_spe(u64 pmscr_el1)
+{
+       if (!pmscr_el1)
+               return;
+
+       /* The host page table is installed, but not yet synchronised */
+       isb();
+
+       /* Re-enable data generation */
+       write_sysreg_s(pmscr_el1, PMSCR_EL1);
+}
+
 void __hyp_text __debug_save_state(struct kvm_vcpu *vcpu,
                                   struct kvm_guest_debug_arch *dbg,
                                   struct kvm_cpu_context *ctxt)
            (vcpu->arch.ctxt.sys_regs[MDSCR_EL1] & DBG_MDSCR_MDE))
                vcpu->arch.debug_flags |= KVM_ARM64_DEBUG_DIRTY;
 
-       __debug_save_state(vcpu, &vcpu->arch.host_debug_state,
+       __debug_save_state(vcpu, &vcpu->arch.host_debug_state.regs,
                           kern_hyp_va(vcpu->arch.host_cpu_context));
+       __debug_save_spe()(&vcpu->arch.host_debug_state.pmscr_el1);
 }
 
 void __hyp_text __debug_cond_restore_host_state(struct kvm_vcpu *vcpu)
 {
-       __debug_restore_state(vcpu, &vcpu->arch.host_debug_state,
+       __debug_restore_spe(vcpu->arch.host_debug_state.pmscr_el1);
+       __debug_restore_state(vcpu, &vcpu->arch.host_debug_state.regs,
                              kern_hyp_va(vcpu->arch.host_cpu_context));
 
        if (vcpu->arch.debug_flags & KVM_ARM64_DEBUG_DIRTY)
 
 static void __hyp_text __deactivate_traps_vhe(void)
 {
        extern char vectors[];  /* kernel exception vectors */
+       u64 mdcr_el2 = read_sysreg(mdcr_el2);
 
+       mdcr_el2 &= MDCR_EL2_HPMN_MASK |
+                   MDCR_EL2_E2PB_MASK << MDCR_EL2_E2PB_SHIFT |
+                   MDCR_EL2_TPMS;
+
+       write_sysreg(mdcr_el2, mdcr_el2);
        write_sysreg(HCR_HOST_VHE_FLAGS, hcr_el2);
        write_sysreg(CPACR_EL1_FPEN, cpacr_el1);
        write_sysreg(vectors, vbar_el1);
 
 static void __hyp_text __deactivate_traps_nvhe(void)
 {
+       u64 mdcr_el2 = read_sysreg(mdcr_el2);
+
+       /*
+        * Preserve HPMN and set E2PB, re-allowing EL1 (the non-VHE
+        * host) to access the profiling buffer; this mirrors the E2PB
+        * clear performed when the guest's mdcr_el2 was set up.
+        */
+       mdcr_el2 &= MDCR_EL2_HPMN_MASK;
+       mdcr_el2 |= MDCR_EL2_E2PB_MASK << MDCR_EL2_E2PB_SHIFT;
+
+       write_sysreg(mdcr_el2, mdcr_el2);
        write_sysreg(HCR_RW, hcr_el2);
        write_sysreg(CPTR_EL2_DEFAULT, cptr_el2);
 }
 
        __deactivate_traps_arch()();
        write_sysreg(0, hstr_el2);
-       write_sysreg(read_sysreg(mdcr_el2) & MDCR_EL2_HPMN_MASK, mdcr_el2);
        write_sysreg(0, pmuserenr_el0);
 }
 
        }
 
        __debug_save_state(vcpu, kern_hyp_va(vcpu->arch.debug_ptr), guest_ctxt);
+       /*
+        * This must come after restoring the host sysregs, since a non-VHE
+        * system may enable SPE here and make use of the TTBRs.
+        */
        __debug_cond_restore_host_state(vcpu);
 
        return exit_code;