vgic_v2_write_lr(i, 0);
 }
 
+void vgic_v2_set_npie(struct kvm_vcpu *vcpu)
+{
+       struct vgic_v2_cpu_if *cpuif = &vcpu->arch.vgic_cpu.vgic_v2;
+
+       cpuif->vgic_hcr |= GICH_HCR_NPIE;
+}
+
 void vgic_v2_set_underflow(struct kvm_vcpu *vcpu)
 {
        struct vgic_v2_cpu_if *cpuif = &vcpu->arch.vgic_cpu.vgic_v2;
        int lr;
        unsigned long flags;
 
-       cpuif->vgic_hcr &= ~GICH_HCR_UIE;
+       cpuif->vgic_hcr &= ~(GICH_HCR_UIE | GICH_HCR_NPIE);
 
        for (lr = 0; lr < vgic_cpu->used_lrs; lr++) {
                u32 val = cpuif->vgic_lr[lr];
 
 static bool common_trap;
 static bool gicv4_enable;
 
+void vgic_v3_set_npie(struct kvm_vcpu *vcpu)
+{
+       struct vgic_v3_cpu_if *cpuif = &vcpu->arch.vgic_cpu.vgic_v3;
+
+       cpuif->vgic_hcr |= ICH_HCR_NPIE;
+}
+
 void vgic_v3_set_underflow(struct kvm_vcpu *vcpu)
 {
        struct vgic_v3_cpu_if *cpuif = &vcpu->arch.vgic_cpu.vgic_v3;
        int lr;
        unsigned long flags;
 
-       cpuif->vgic_hcr &= ~ICH_HCR_UIE;
+       cpuif->vgic_hcr &= ~(ICH_HCR_UIE | ICH_HCR_NPIE);
 
        for (lr = 0; lr < vgic_cpu->used_lrs; lr++) {
                u64 val = cpuif->vgic_lr[lr];
 
                vgic_v3_set_underflow(vcpu);
 }
 
+static inline void vgic_set_npie(struct kvm_vcpu *vcpu)
+{
+       if (kvm_vgic_global_state.type == VGIC_V2)
+               vgic_v2_set_npie(vcpu);
+       else
+               vgic_v3_set_npie(vcpu);
+}
+
 /* Requires the ap_list_lock to be held. */
-static int compute_ap_list_depth(struct kvm_vcpu *vcpu)
+static int compute_ap_list_depth(struct kvm_vcpu *vcpu,
+                                bool *multi_sgi)
 {
        struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu;
        struct vgic_irq *irq;
        int count = 0;
 
+       *multi_sgi = false;
+
        DEBUG_SPINLOCK_BUG_ON(!spin_is_locked(&vgic_cpu->ap_list_lock));
 
        list_for_each_entry(irq, &vgic_cpu->ap_list_head, ap_list) {
                spin_lock(&irq->irq_lock);
                /* GICv2 SGIs can count for more than one... */
-               if (vgic_irq_is_sgi(irq->intid) && irq->source)
-                       count += hweight8(irq->source);
-               else
+               if (vgic_irq_is_sgi(irq->intid) && irq->source) {
+                       int w = hweight8(irq->source);
+
+                       count += w;
+                       *multi_sgi |= (w > 1);
+               } else {
                        count++;
+               }
                spin_unlock(&irq->irq_lock);
        }
        return count;
 {
        struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu;
        struct vgic_irq *irq;
-       int count = 0;
+       int count;
+       bool npie = false;
+       bool multi_sgi;
+       u8 prio = 0xff;
 
        DEBUG_SPINLOCK_BUG_ON(!spin_is_locked(&vgic_cpu->ap_list_lock));
 
-       if (compute_ap_list_depth(vcpu) > kvm_vgic_global_state.nr_lr)
+       count = compute_ap_list_depth(vcpu, &multi_sgi);
+       if (count > kvm_vgic_global_state.nr_lr || multi_sgi)
                vgic_sort_ap_list(vcpu);
 
+       count = 0;
+
        list_for_each_entry(irq, &vgic_cpu->ap_list_head, ap_list) {
                spin_lock(&irq->irq_lock);
 
-               if (unlikely(vgic_target_oracle(irq) != vcpu))
-                       goto next;
-
                /*
-                * If we get an SGI with multiple sources, try to get
-                * them in all at once.
+                * If we have multi-SGIs in the pipeline, we need to
+                * guarantee that they are all seen before any IRQ of
+                * lower priority. In that case, we need to filter out
+                * these interrupts by exiting early. This is easy as
+                * the AP list has been sorted already.
                 */
-               do {
+               if (multi_sgi && irq->priority > prio) {
+                       spin_unlock(&irq->irq_lock);
+                       break;
+               }
+
+               if (likely(vgic_target_oracle(irq) == vcpu)) {
                        vgic_populate_lr(vcpu, irq, count++);
-               } while (irq->source && count < kvm_vgic_global_state.nr_lr);
 
-next:
+                       if (irq->source) {
+                               npie = true;
+                               prio = irq->priority;
+                       }
+               }
+
                spin_unlock(&irq->irq_lock);
 
                if (count == kvm_vgic_global_state.nr_lr) {
                }
        }
 
+       if (npie)
+               vgic_set_npie(vcpu);
+
        vcpu->arch.vgic_cpu.used_lrs = count;
 
        /* Nuke remaining LRs */
 
 void vgic_v2_populate_lr(struct kvm_vcpu *vcpu, struct vgic_irq *irq, int lr);
 void vgic_v2_clear_lr(struct kvm_vcpu *vcpu, int lr);
 void vgic_v2_set_underflow(struct kvm_vcpu *vcpu);
+void vgic_v2_set_npie(struct kvm_vcpu *vcpu);
 int vgic_v2_has_attr_regs(struct kvm_device *dev, struct kvm_device_attr *attr);
 int vgic_v2_dist_uaccess(struct kvm_vcpu *vcpu, bool is_write,
                         int offset, u32 *val);
 void vgic_v3_populate_lr(struct kvm_vcpu *vcpu, struct vgic_irq *irq, int lr);
 void vgic_v3_clear_lr(struct kvm_vcpu *vcpu, int lr);
 void vgic_v3_set_underflow(struct kvm_vcpu *vcpu);
+void vgic_v3_set_npie(struct kvm_vcpu *vcpu);
 void vgic_v3_set_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcr);
 void vgic_v3_get_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcr);
 void vgic_v3_enable(struct kvm_vcpu *vcpu);