 extern void __kvm_hyp_reset(unsigned long);
 
 extern u64 __vgic_v3_get_ich_vtr_el2(void);
+extern u64 __vgic_v3_read_vmcr(void);
+extern void __vgic_v3_write_vmcr(u32 vmcr);
 extern void __vgic_v3_init_lrs(void);
+
 #endif
 
 #endif /* __ARM_KVM_ASM_H__ */
 
        vcpu->arch.host_cpu_context = this_cpu_ptr(kvm_host_cpu_state);
 
        kvm_arm_set_running_vcpu(vcpu);
+
+       kvm_vgic_load(vcpu);
 }
 
 void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
 {
-       /*
-        * The arch-generic KVM code expects the cpu field of a vcpu to be -1
-        * if the vcpu is no longer assigned to a cpu.  This is used for the
-        * optimized make_all_cpus_request path.
-        */
+       kvm_vgic_put(vcpu);
+
        vcpu->cpu = -1;
 
        kvm_arm_set_running_vcpu(NULL);
                 * non-preemptible context.
                 */
                preempt_disable();
+
                kvm_pmu_flush_hwstate(vcpu);
+
                kvm_timer_flush_hwstate(vcpu);
                kvm_vgic_flush_hwstate(vcpu);
 
 
 extern int __kvm_vcpu_run(struct kvm_vcpu *vcpu);
 
 extern u64 __vgic_v3_get_ich_vtr_el2(void);
+extern u64 __vgic_v3_read_vmcr(void);
+extern void __vgic_v3_write_vmcr(u32 vmcr);
 extern void __vgic_v3_init_lrs(void);
 
 extern u32 __kvm_get_mdcr_el2(void);
 
 
 int kvm_vgic_vcpu_pending_irq(struct kvm_vcpu *vcpu);
 
+void kvm_vgic_load(struct kvm_vcpu *vcpu);
+void kvm_vgic_put(struct kvm_vcpu *vcpu);
+
 #define irqchip_in_kernel(k)   (!!((k)->arch.vgic.in_kernel))
 #define vgic_initialized(k)    ((k)->arch.vgic.initialized)
 #define vgic_ready(k)          ((k)->arch.vgic.ready)
 
        if (!base)
                return;
 
-       cpu_if->vgic_vmcr = readl_relaxed(base + GICH_VMCR);
-
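+       /* VMCR is now saved/restored in vgic_v2_put()/vgic_v2_load(). */
+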
        if (vcpu->arch.vgic_cpu.live_lrs) {
                cpu_if->vgic_apr = readl_relaxed(base + GICH_APR);
 
                }
        }
 
-       writel_relaxed(cpu_if->vgic_vmcr, base + GICH_VMCR);
        vcpu->arch.vgic_cpu.live_lrs = live_lrs;
 }
 
 
        if (!cpu_if->vgic_sre)
                dsb(st);
 
-       cpu_if->vgic_vmcr  = read_gicreg(ICH_VMCR_EL2);
-
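+       /* VMCR is now saved/restored in vgic_v3_put()/vgic_v3_load(). */
+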
        if (vcpu->arch.vgic_cpu.live_lrs) {
                int i;
                u32 max_lr_idx, nr_pri_bits;
                        live_lrs |= (1 << i);
        }
 
-       write_gicreg(cpu_if->vgic_vmcr, ICH_VMCR_EL2);
-
        if (live_lrs) {
                write_gicreg(cpu_if->vgic_hcr, ICH_HCR_EL2);
 
 {
        return read_gicreg(ICH_VTR_EL2);
 }
+
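+/*
+ * ICH_VMCR_EL2 is an EL2 system register, so it has to be read and
+ * written from hyp context; the host reaches these accessors through
+ * kvm_call_hyp().
+ */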
+u64 __hyp_text __vgic_v3_read_vmcr(void)
+{
+       return read_gicreg(ICH_VMCR_EL2);
+}
+
+void __hyp_text __vgic_v3_write_vmcr(u32 vmcr)
+{
+       write_gicreg(vmcr, ICH_VMCR_EL2);
+}
 
        vgic_debug_init(kvm);
 
        dist->initialized = true;
+
+       /*
+        * If we're initializing GICv2 on-demand when first running the
+        * VCPU, then we need to load the VGIC state onto the CPU.  We can
+        * detect this easily by checking whether we are between vcpu_load
+        * and vcpu_put when the VGIC was just initialized.
+        */
+       preempt_disable();
+       vcpu = kvm_arm_get_running_vcpu();
+       if (vcpu)
+               kvm_vgic_load(vcpu);
+       preempt_enable();
 out:
        return ret;
 }
 
 
 void vgic_v2_set_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcrp)
 {
+       struct vgic_v2_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v2;
        u32 vmcr;
 
        vmcr  = (vmcrp->ctlr << GICH_VMCR_CTRL_SHIFT) & GICH_VMCR_CTRL_MASK;
        vmcr |= (vmcrp->pmr << GICH_VMCR_PRIMASK_SHIFT) &
                GICH_VMCR_PRIMASK_MASK;
 
-       vcpu->arch.vgic_cpu.vgic_v2.vgic_vmcr = vmcr;
+       cpu_if->vgic_vmcr = vmcr;
 }
 
 void vgic_v2_get_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcrp)
 {
-       u32 vmcr = vcpu->arch.vgic_cpu.vgic_v2.vgic_vmcr;
+       struct vgic_v2_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v2;
+       u32 vmcr;
+
+       vmcr = cpu_if->vgic_vmcr;
 
        vmcrp->ctlr = (vmcr & GICH_VMCR_CTRL_MASK) >>
                        GICH_VMCR_CTRL_SHIFT;
 
        return ret;
 }
+
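+/* Restore the cached VMCR to the GICH interface when the vcpu is loaded. */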
+void vgic_v2_load(struct kvm_vcpu *vcpu)
+{
+       struct vgic_v2_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v2;
+       struct vgic_dist *vgic = &vcpu->kvm->arch.vgic;
+
+       writel_relaxed(cpu_if->vgic_vmcr, vgic->vctrl_base + GICH_VMCR);
+}
+
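+/* Snapshot the VMCR from the GICH interface when the vcpu is put. */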
+void vgic_v2_put(struct kvm_vcpu *vcpu)
+{
+       struct vgic_v2_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v2;
+       struct vgic_dist *vgic = &vcpu->kvm->arch.vgic;
+
+       cpu_if->vgic_vmcr = readl_relaxed(vgic->vctrl_base + GICH_VMCR);
+}
 
 
 void vgic_v3_set_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcrp)
 {
+       struct vgic_v3_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v3;
        u32 vmcr;
 
        /*
        vmcr |= (vmcrp->grpen0 << ICH_VMCR_ENG0_SHIFT) & ICH_VMCR_ENG0_MASK;
        vmcr |= (vmcrp->grpen1 << ICH_VMCR_ENG1_SHIFT) & ICH_VMCR_ENG1_MASK;
 
-       vcpu->arch.vgic_cpu.vgic_v3.vgic_vmcr = vmcr;
+       cpu_if->vgic_vmcr = vmcr;
 }
 
 void vgic_v3_get_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcrp)
 {
-       u32 vmcr = vcpu->arch.vgic_cpu.vgic_v3.vgic_vmcr;
+       struct vgic_v3_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v3;
+       u32 vmcr;
+
+       vmcr = cpu_if->vgic_vmcr;
 
        /*
         * Ignore the FIQen bit, because GIC emulation always implies
 
        return 0;
 }
+
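+/* Restore the cached VMCR to ICH_VMCR_EL2 via a call into hyp. */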
+void vgic_v3_load(struct kvm_vcpu *vcpu)
+{
+       struct vgic_v3_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v3;
+
+       kvm_call_hyp(__vgic_v3_write_vmcr, cpu_if->vgic_vmcr);
+}
+
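+/* Snapshot ICH_VMCR_EL2 via a call into hyp when the vcpu is put. */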
+void vgic_v3_put(struct kvm_vcpu *vcpu)
+{
+       struct vgic_v3_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v3;
+
+       cpu_if->vgic_vmcr = kvm_call_hyp(__vgic_v3_read_vmcr);
+}
 
        spin_unlock(&vcpu->arch.vgic_cpu.ap_list_lock);
 }
 
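+/*
+ * VMCR is no longer saved and restored on every world switch; instead it
+ * moves between memory and hardware at vcpu_load/vcpu_put time.  There is
+ * nothing to do before the VGIC has been initialized.
+ */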
+void kvm_vgic_load(struct kvm_vcpu *vcpu)
+{
+       if (unlikely(!vgic_initialized(vcpu->kvm)))
+               return;
+
+       if (kvm_vgic_global_state.type == VGIC_V2)
+               vgic_v2_load(vcpu);
+       else
+               vgic_v3_load(vcpu);
+}
+
+void kvm_vgic_put(struct kvm_vcpu *vcpu)
+{
+       if (unlikely(!vgic_initialized(vcpu->kvm)))
+               return;
+
+       if (kvm_vgic_global_state.type == VGIC_V2)
+               vgic_v2_put(vcpu);
+       else
+               vgic_v3_put(vcpu);
+}
+
 int kvm_vgic_vcpu_pending_irq(struct kvm_vcpu *vcpu)
 {
        struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu;
 
 int vgic_register_dist_iodev(struct kvm *kvm, gpa_t dist_base_address,
                             enum vgic_type);
 
+void vgic_v2_load(struct kvm_vcpu *vcpu);
+void vgic_v2_put(struct kvm_vcpu *vcpu);
+
 static inline void vgic_get_irq_kref(struct vgic_irq *irq)
 {
        if (irq->intid < VGIC_MIN_LPI)
 int vgic_v3_map_resources(struct kvm *kvm);
 int vgic_register_redist_iodevs(struct kvm *kvm, gpa_t dist_base_address);
 
+void vgic_v3_load(struct kvm_vcpu *vcpu);
+void vgic_v3_put(struct kvm_vcpu *vcpu);
+
 int vgic_register_its_iodevs(struct kvm *kvm);
 bool vgic_has_its(struct kvm *kvm);
 int kvm_vgic_register_its_device(void);