upper_32_bits(addr), lower_32_bits(addr));
 
                if (i915.enable_execlists) {
+                       const u32 *hws = &engine->status_page.page_addr[I915_HWS_CSB_BUF0_INDEX];
                        u32 ptr, read, write;
                        unsigned int idx;
 
                                write += GEN8_CSB_ENTRIES;
                        while (read < write) {
                                idx = ++read % GEN8_CSB_ENTRIES;
-                               seq_printf(m, "\tExeclist CSB[%d]: 0x%08x, context: %d\n",
+                               seq_printf(m, "\tExeclist CSB[%d]: 0x%08x [0x%08x in hwsp], context: %d [%d in hwsp]\n",
                                           idx,
                                           I915_READ(RING_CONTEXT_STATUS_BUF_LO(engine, idx)),
-                                          I915_READ(RING_CONTEXT_STATUS_BUF_HI(engine, idx)));
+                                          hws[idx * 2],
+                                          I915_READ(RING_CONTEXT_STATUS_BUF_HI(engine, idx)),
+                                          hws[idx * 2 + 1]);
                        }
 
                        rcu_read_lock();
 
        while (test_bit(ENGINE_IRQ_EXECLIST, &engine->irq_posted)) {
                u32 __iomem *csb_mmio =
                        dev_priv->regs + i915_mmio_reg_offset(RING_CONTEXT_STATUS_PTR(engine));
-               u32 __iomem *buf =
-                       dev_priv->regs + i915_mmio_reg_offset(RING_CONTEXT_STATUS_BUF_LO(engine, 0));
+               /* The HWSP contains a (cacheable) mirror of the CSB */
+               const u32 *buf =
+                       &engine->status_page.page_addr[I915_HWS_CSB_BUF0_INDEX];
                unsigned int head, tail;
 
+               /* However GVT emulation depends upon intercepting CSB mmio */
+               if (unlikely(engine->csb_use_mmio)) {
+                       buf = (u32 * __force)
+                               (dev_priv->regs + i915_mmio_reg_offset(RING_CONTEXT_STATUS_BUF_LO(engine, 0)));
+               }
+
                /* The write will be ordered by the uncached read (itself
                 * a memory barrier), so we do not need another in the form
                 * of a locked instruction. The race between the interrupt
                         * status notifier.
                         */
 
-                       status = readl(buf + 2 * head);
+                       status = READ_ONCE(buf[2 * head]); /* maybe mmio! */
                        if (!(status & GEN8_CTX_STATUS_COMPLETED_MASK))
                                continue;
 
                        /* Check the context/desc id for this event matches */
-                       GEM_DEBUG_BUG_ON(readl(buf + 2 * head + 1) !=
-                                        port->context_id);
+                       GEM_DEBUG_BUG_ON(buf[2 * head + 1] != port->context_id);
 
                        rq = port_unpack(port, &count);
                        GEM_BUG_ON(count == 0);
        engine->irq_keep_mask = GT_CONTEXT_SWITCH_INTERRUPT << shift;
 }
 
+static bool irq_handler_force_mmio(struct drm_i915_private *i915)
+{
+       /*
+        * Decide whether the CSB must be read through mmio rather than from
+        * its mirror in the HWSP. Returns true if either check below
+        * requires mmio, false otherwise; the result is stored in
+        * engine->csb_use_mmio by logical_ring_setup().
+        *
+        * GVT emulation depends upon intercepting CSB mmio
+        */
+       if (intel_vgpu_active(i915))
+               return true;
+
+       /*
+        * IOMMU adds unpredictable latency causing the CSB write (from the
+        * GPU into the HWSP) to only be visible some time after the interrupt
+        * (missed breadcrumb syndrome).
+        */
+       if (intel_vtd_active())
+               return true;
+
+       return false;
+}
+
 static void
 logical_ring_setup(struct intel_engine_cs *engine)
 {
        /* Intentionally left blank. */
        engine->buffer = NULL;
 
+       engine->csb_use_mmio = irq_handler_force_mmio(dev_priv);
+
        fw_domains = intel_uncore_forcewake_for_reg(dev_priv,
                                                    RING_ELSP(engine),
                                                    FW_REG_WRITE);
 
        struct rb_root execlist_queue;
        struct rb_node *execlist_first;
        unsigned int fw_domains;
+       bool csb_use_mmio;
 
        /* Contexts are pinned whilst they are active on the GPU. The last
         * context executed remains active whilst the GPU is idle - the
 #define I915_GEM_HWS_SCRATCH_INDEX     0x40
 #define I915_GEM_HWS_SCRATCH_ADDR (I915_GEM_HWS_SCRATCH_INDEX << MI_STORE_DWORD_INDEX_SHIFT)
 
+#define I915_HWS_CSB_BUF0_INDEX                0x10 /* index into status_page.page_addr[] of the first CSB mirror entry */
+
 struct intel_ring *
 intel_engine_create_ring(struct intel_engine_cs *engine, int size);
 int intel_ring_pin(struct intel_ring *ring,