container_of(fence, typeof(*request), submit);
        struct intel_engine_cs *engine = request->engine;
 
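+       /* Only fence completion requires any work; FENCE_FREE needs no
+        * action on our part.
+        */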
+       if (state != FENCE_COMPLETE)
+               return NOTIFY_DONE;
+
        /* Will be called from irq-context when using foreign DMA fences */
 
-       switch (state) {
-       case FENCE_COMPLETE:
-               engine->timeline->last_submitted_seqno = request->fence.seqno;
-               engine->submit_request(request);
-               break;
+       engine->timeline->last_submitted_seqno = request->fence.seqno;
 
-       case FENCE_FREE:
-               break;
-       }
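+       /* Space for the breadcrumb was reserved at request->postfix by
+        * __i915_add_request(), so writing it here cannot fail.
+        */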
+       engine->emit_breadcrumb(request,
+                               request->ring->vaddr + request->postfix);
+       engine->submit_request(request);
 
        return NOTIFY_DONE;
 }
        struct intel_ring *ring = request->ring;
        struct intel_timeline *timeline = request->timeline;
        struct drm_i915_gem_request *prev;
-       u32 request_start;
-       u32 reserved_tail;
-       int ret;
+       int err;
 
        lockdep_assert_held(&request->i915->drm.struct_mutex);
        trace_i915_gem_request_add(request);
         * should already have been reserved in the ring buffer. Let the ring
         * know that it is time to use that space up.
         */
-       request_start = ring->tail;
-       reserved_tail = request->reserved_space;
        request->reserved_space = 0;
 
        /*
         * Emit any outstanding flushes - WRITE_DOMAIN_RENDER, WRITE_DOMAIN_I915;
         * after having emitted the batchbuffer command. Hence we need to fix
         * things up similar to emitting the lazy request. The difference here
         * is that the flush _must_ happen before the next request, no matter
         * what.
         */
        if (flush_caches) {
-               ret = engine->emit_flush(request, EMIT_FLUSH);
+               err = engine->emit_flush(request, EMIT_FLUSH);
 
                /* Not allowed to fail! */
-               WARN(ret, "engine->emit_flush() failed: %d!\n", ret);
+               WARN(err, "engine->emit_flush() failed: %d!\n", err);
        }
 
        /* Record the position of the start of the breadcrumb so that
         * should we detect the updated seqno part-way through the
         * GPU processing the request, we never over-estimate the
         * position of the ring's HEAD.
         */
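+       /* Space for the breadcrumb was set aside via request->reserved_space
+        * when the request was constructed, so intel_ring_begin() here must
+        * not fail.
+        */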
+       err = intel_ring_begin(request, engine->emit_breadcrumb_sz);
+       GEM_BUG_ON(err);
        request->postfix = ring->tail;
-
-       /* Not allowed to fail! */
-       ret = engine->emit_breadcrumb(request);
-       WARN(ret, "(%s)->emit_breadcrumb failed: %d!\n", engine->name, ret);
-
-       /* Sanity check that the reserved size was large enough. */
-       ret = ring->tail - request_start;
-       if (ret < 0)
-               ret += ring->size;
-       WARN_ONCE(ret > reserved_tail,
-                 "Not enough space reserved (%d bytes) "
-                 "for adding the request (%d bytes)\n",
-                 reserved_tail, ret);
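+       /* engine->emit_breadcrumb() will fill this reserved span from the
+        * submit fence callback, just before actual hardware submission.
+        */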
+       ring->tail += engine->emit_breadcrumb_sz * sizeof(u32);
 
        /* Seal the request and mark it as pending execution. Note that
         * we may inspect this state, without holding any locks, during
 
        struct i915_hw_ppgtt *ppgtt = rq->ctx->ppgtt;
        u32 *reg_state = ce->lrc_reg_state;
 
-       reg_state[CTX_RING_TAIL+1] = intel_ring_offset(rq->ring, rq->tail);
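+       /* rq->tail is now recorded as an offset into the ring by
+        * emit_breadcrumb(), so it can be written into the context image
+        * directly.
+        */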
+       reg_state[CTX_RING_TAIL+1] = rq->tail;
 
        /* True 32b PPGTT with dynamic page allocation: update PDP
         * registers and point the unallocated PDPs to scratch page.
 
        spin_lock_irqsave(&engine->execlist_lock, flags);
 
+       /* We keep the previous context alive until we retire the following
+        * request. This ensures that the context object is still pinned
+        * for any residual writes the HW makes into it on the context switch
+        * into the next object following the breadcrumb. Otherwise, we may
+        * retire the context too early.
+        */
+       request->previous_context = engine->last_context;
+       engine->last_context = request->ctx;
+
        list_add_tail(&request->execlist_link, &engine->execlist_queue);
        if (execlists_elsp_idle(engine))
                tasklet_hi_schedule(&engine->irq_tasklet);
        return ret;
 }
 
-/*
- * intel_logical_ring_advance() - advance the tail and prepare for submission
- * @request: Request to advance the logical ringbuffer of.
- *
- * The tail is updated in our logical ringbuffer struct, not in the actual context. What
- * really happens during submission is that the context and current tail will be placed
- * on a queue waiting for the ELSP to be ready to accept a new context submission. At that
- * point, the tail *inside* the context is updated and the ELSP written to.
- */
-static int
-intel_logical_ring_advance(struct drm_i915_gem_request *request)
-{
-       struct intel_ring *ring = request->ring;
-       struct intel_engine_cs *engine = request->engine;
-
-       intel_ring_advance(ring);
-       request->tail = ring->tail;
-
-       /*
-        * Here we add two extra NOOPs as padding to avoid
-        * lite restore of a context with HEAD==TAIL.
-        *
-        * Caller must reserve WA_TAIL_DWORDS for us!
-        */
-       intel_ring_emit(ring, MI_NOOP);
-       intel_ring_emit(ring, MI_NOOP);
-       intel_ring_advance(ring);
-       request->wa_tail = ring->tail;
-
-       /* We keep the previous context alive until we retire the following
-        * request. This ensures that any the context object is still pinned
-        * for any residual writes the HW makes into it on the context switch
-        * into the next object following the breadcrumb. Otherwise, we may
-        * retire the context too early.
-        */
-       request->previous_context = engine->last_context;
-       engine->last_context = request->ctx;
-       return 0;
-}
-
 static int intel_lr_context_pin(struct i915_gem_context *ctx,
                                struct intel_engine_cs *engine)
 {
  * used as a workaround for not being allowed to do lite
  * restore with HEAD==TAIL (WaIdleLiteRestore).
  */
-
-static int gen8_emit_breadcrumb(struct drm_i915_gem_request *request)
+static void gen8_emit_wa_tail(struct drm_i915_gem_request *request, u32 *out)
 {
-       struct intel_ring *ring = request->ring;
-       int ret;
-
-       ret = intel_ring_begin(request, 6 + WA_TAIL_DWORDS);
-       if (ret)
-               return ret;
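+       /* WaIdleLiteRestore: pad with two NOOPs (WA_TAIL_DWORDS) so that a
+        * resubmitted context never has HEAD == TAIL.
+        */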
+       *out++ = MI_NOOP;
+       *out++ = MI_NOOP;
+       request->wa_tail = intel_ring_offset(request->ring, out);
+}
 
+static void gen8_emit_breadcrumb(struct drm_i915_gem_request *request,
+                                u32 *out)
+{
        /* w/a: bit 5 needs to be zero for MI_FLUSH_DW address. */
        BUILD_BUG_ON(I915_GEM_HWS_INDEX_ADDR & (1 << 5));
 
-       intel_ring_emit(ring, (MI_FLUSH_DW + 1) | MI_FLUSH_DW_OP_STOREDW);
-       intel_ring_emit(ring,
-                       intel_hws_seqno_address(request->engine) |
-                       MI_FLUSH_DW_USE_GTT);
-       intel_ring_emit(ring, 0);
-       intel_ring_emit(ring, request->global_seqno);
-       intel_ring_emit(ring, MI_USER_INTERRUPT);
-       intel_ring_emit(ring, MI_NOOP);
-       return intel_logical_ring_advance(request);
+       *out++ = (MI_FLUSH_DW + 1) | MI_FLUSH_DW_OP_STOREDW;
+       *out++ = intel_hws_seqno_address(request->engine) | MI_FLUSH_DW_USE_GTT;
+       *out++ = 0;
+       *out++ = request->global_seqno;
+       *out++ = MI_USER_INTERRUPT;
+       *out++ = MI_NOOP;
+       request->tail = intel_ring_offset(request->ring, out);
+
+       gen8_emit_wa_tail(request, out);
 }
 
+static const int gen8_emit_breadcrumb_sz = 6 + WA_TAIL_DWORDS;
 
-static int gen8_emit_breadcrumb_render(struct drm_i915_gem_request *request)
+static void gen8_emit_breadcrumb_render(struct drm_i915_gem_request *request,
+                                       u32 *out)
 {
-       struct intel_ring *ring = request->ring;
-       int ret;
-
-       ret = intel_ring_begin(request, 8 + WA_TAIL_DWORDS);
-       if (ret)
-               return ret;
-
        /* We're using qword write, seqno should be aligned to 8 bytes. */
        BUILD_BUG_ON(I915_GEM_HWS_INDEX & 1);
 
         * need a prior CS_STALL, which is emitted by the flush
         * following the batch.
         */
-       intel_ring_emit(ring, GFX_OP_PIPE_CONTROL(6));
-       intel_ring_emit(ring,
-                       (PIPE_CONTROL_GLOBAL_GTT_IVB |
-                        PIPE_CONTROL_CS_STALL |
-                        PIPE_CONTROL_QW_WRITE));
-       intel_ring_emit(ring, intel_hws_seqno_address(request->engine));
-       intel_ring_emit(ring, 0);
-       intel_ring_emit(ring, request->global_seqno);
+       *out++ = GFX_OP_PIPE_CONTROL(6);
+       *out++ = (PIPE_CONTROL_GLOBAL_GTT_IVB |
+                 PIPE_CONTROL_CS_STALL |
+                 PIPE_CONTROL_QW_WRITE);
+       *out++ = intel_hws_seqno_address(request->engine);
+       *out++ = 0;
+       *out++ = request->global_seqno;
        /* We're thrashing one dword of HWS. */
-       intel_ring_emit(ring, 0);
-       intel_ring_emit(ring, MI_USER_INTERRUPT);
-       intel_ring_emit(ring, MI_NOOP);
-       return intel_logical_ring_advance(request);
+       *out++ = 0;
+       *out++ = MI_USER_INTERRUPT;
+       *out++ = MI_NOOP;
+       request->tail = intel_ring_offset(request->ring, out);
+
+       gen8_emit_wa_tail(request, out);
 }
 
+static const int gen8_emit_breadcrumb_render_sz = 8 + WA_TAIL_DWORDS;
 
        i915_vma_unpin_and_release(&dev_priv->semaphore);
 }
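 
+/* The dwords written by these signal helpers are included in the engine's
+ * emit_breadcrumb_sz reservation (set up at engine init, outside this
+ * excerpt), so they write through 'out' without reserving ring space
+ * themselves.
+ */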
 
-static int gen8_rcs_signal(struct drm_i915_gem_request *req)
+static u32 *gen8_rcs_signal(struct drm_i915_gem_request *req, u32 *out)
 {
-       struct intel_ring *ring = req->ring;
        struct drm_i915_private *dev_priv = req->i915;
        struct intel_engine_cs *waiter;
        enum intel_engine_id id;
-       int ret, num_rings;
-
-       num_rings = INTEL_INFO(dev_priv)->num_rings;
-       ret = intel_ring_begin(req, (num_rings-1) * 8);
-       if (ret)
-               return ret;
 
        for_each_engine(waiter, dev_priv, id) {
                u64 gtt_offset = req->engine->semaphore.signal_ggtt[id];
                if (gtt_offset == MI_SEMAPHORE_SYNC_INVALID)
                        continue;
 
-               intel_ring_emit(ring, GFX_OP_PIPE_CONTROL(6));
-               intel_ring_emit(ring,
-                               PIPE_CONTROL_GLOBAL_GTT_IVB |
-                               PIPE_CONTROL_QW_WRITE |
-                               PIPE_CONTROL_CS_STALL);
-               intel_ring_emit(ring, lower_32_bits(gtt_offset));
-               intel_ring_emit(ring, upper_32_bits(gtt_offset));
-               intel_ring_emit(ring, req->global_seqno);
-               intel_ring_emit(ring, 0);
-               intel_ring_emit(ring,
-                               MI_SEMAPHORE_SIGNAL |
-                               MI_SEMAPHORE_TARGET(waiter->hw_id));
-               intel_ring_emit(ring, 0);
+               *out++ = GFX_OP_PIPE_CONTROL(6);
+               *out++ = (PIPE_CONTROL_GLOBAL_GTT_IVB |
+                         PIPE_CONTROL_QW_WRITE |
+                         PIPE_CONTROL_CS_STALL);
+               *out++ = lower_32_bits(gtt_offset);
+               *out++ = upper_32_bits(gtt_offset);
+               *out++ = req->global_seqno;
+               *out++ = 0;
+               *out++ = (MI_SEMAPHORE_SIGNAL |
+                         MI_SEMAPHORE_TARGET(waiter->hw_id));
+               *out++ = 0;
        }
-       intel_ring_advance(ring);
 
-       return 0;
+       return out;
 }
 
-static int gen8_xcs_signal(struct drm_i915_gem_request *req)
+static u32 *gen8_xcs_signal(struct drm_i915_gem_request *req, u32 *out)
 {
-       struct intel_ring *ring = req->ring;
        struct drm_i915_private *dev_priv = req->i915;
        struct intel_engine_cs *waiter;
        enum intel_engine_id id;
-       int ret, num_rings;
-
-       num_rings = INTEL_INFO(dev_priv)->num_rings;
-       ret = intel_ring_begin(req, (num_rings-1) * 6);
-       if (ret)
-               return ret;
 
        for_each_engine(waiter, dev_priv, id) {
                u64 gtt_offset = req->engine->semaphore.signal_ggtt[id];
                if (gtt_offset == MI_SEMAPHORE_SYNC_INVALID)
                        continue;
 
-               intel_ring_emit(ring,
-                               (MI_FLUSH_DW + 1) | MI_FLUSH_DW_OP_STOREDW);
-               intel_ring_emit(ring,
-                               lower_32_bits(gtt_offset) |
-                               MI_FLUSH_DW_USE_GTT);
-               intel_ring_emit(ring, upper_32_bits(gtt_offset));
-               intel_ring_emit(ring, req->global_seqno);
-               intel_ring_emit(ring,
-                               MI_SEMAPHORE_SIGNAL |
-                               MI_SEMAPHORE_TARGET(waiter->hw_id));
-               intel_ring_emit(ring, 0);
+               *out++ = (MI_FLUSH_DW + 1) | MI_FLUSH_DW_OP_STOREDW;
+               *out++ = lower_32_bits(gtt_offset) | MI_FLUSH_DW_USE_GTT;
+               *out++ = upper_32_bits(gtt_offset);
+               *out++ = req->global_seqno;
+               *out++ = (MI_SEMAPHORE_SIGNAL |
+                         MI_SEMAPHORE_TARGET(waiter->hw_id));
+               *out++ = 0;
        }
-       intel_ring_advance(ring);
 
-       return 0;
+       return out;
 }
 
-static int gen6_signal(struct drm_i915_gem_request *req)
+static u32 *gen6_signal(struct drm_i915_gem_request *req, u32 *out)
 {
-       struct intel_ring *ring = req->ring;
        struct drm_i915_private *dev_priv = req->i915;
        struct intel_engine_cs *engine;
        enum intel_engine_id id;
-       int ret, num_rings;
-
-       num_rings = INTEL_INFO(dev_priv)->num_rings;
-       ret = intel_ring_begin(req, round_up((num_rings-1) * 3, 2));
-       if (ret)
-               return ret;
+       int num_rings = 0;
 
        for_each_engine(engine, dev_priv, id) {
                i915_reg_t mbox_reg;
 
                mbox_reg = req->engine->semaphore.mbox.signal[engine->hw_id];
                if (i915_mmio_reg_valid(mbox_reg)) {
-                       intel_ring_emit(ring, MI_LOAD_REGISTER_IMM(1));
-                       intel_ring_emit_reg(ring, mbox_reg);
-                       intel_ring_emit(ring, req->global_seqno);
+                       *out++ = MI_LOAD_REGISTER_IMM(1);
+                       *out++ = i915_mmio_reg_offset(mbox_reg);
+                       *out++ = req->global_seqno;
+                       num_rings++;
                }
        }
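+       /* Pad the payload to an even number of dwords so that the ring
+        * tail stays qword aligned.
+        */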
+       if (num_rings & 1)
+               *out++ = MI_NOOP;
 
-       /* If num_dwords was rounded, make sure the tail pointer is correct */
-       if (num_rings % 2 == 0)
-               intel_ring_emit(ring, MI_NOOP);
-       intel_ring_advance(ring);
-
-       return 0;
+       return out;
 }
 
 static void i9xx_submit_request(struct drm_i915_gem_request *request)
 {
        struct drm_i915_private *dev_priv = request->i915;
 
-       I915_WRITE_TAIL(request->engine,
-                       intel_ring_offset(request->ring, request->tail));
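+       /* request->tail is already stored as a ring offset (see
+        * i9xx_emit_breadcrumb()), so no conversion is needed here.
+        */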
+       I915_WRITE_TAIL(request->engine, request->tail);
 }
 
-static int i9xx_emit_breadcrumb(struct drm_i915_gem_request *req)
+static void i9xx_emit_breadcrumb(struct drm_i915_gem_request *req,
+                                u32 *out)
 {
-       struct intel_ring *ring = req->ring;
-       int ret;
-
-       ret = intel_ring_begin(req, 4);
-       if (ret)
-               return ret;
-
-       intel_ring_emit(ring, MI_STORE_DWORD_INDEX);
-       intel_ring_emit(ring, I915_GEM_HWS_INDEX << MI_STORE_DWORD_INDEX_SHIFT);
-       intel_ring_emit(ring, req->global_seqno);
-       intel_ring_emit(ring, MI_USER_INTERRUPT);
-       intel_ring_advance(ring);
-
-       req->tail = ring->tail;
+       *out++ = MI_STORE_DWORD_INDEX;
+       *out++ = I915_GEM_HWS_INDEX << MI_STORE_DWORD_INDEX_SHIFT;
+       *out++ = req->global_seqno;
+       *out++ = MI_USER_INTERRUPT;
 
-       return 0;
+       req->tail = intel_ring_offset(req->ring, out);
 }
 
+static const int i9xx_emit_breadcrumb_sz = 4;
  * Update the mailbox registers in the *other* rings with the current seqno.
  * This acts like a signal in the canonical semaphore.
  */
-static int gen6_sema_emit_breadcrumb(struct drm_i915_gem_request *req)
+static void gen6_sema_emit_breadcrumb(struct drm_i915_gem_request *req,
+                                     u32 *out)
 {
-       int ret;
-
-       ret = req->engine->semaphore.signal(req);
-       if (ret)
-               return ret;
-
-       return i9xx_emit_breadcrumb(req);
+       i9xx_emit_breadcrumb(req,
+                            req->engine->semaphore.signal(req, out));
 }
 
-static int gen8_render_emit_breadcrumb(struct drm_i915_gem_request *req)
+static void gen8_render_emit_breadcrumb(struct drm_i915_gem_request *req,
+                                       u32 *out)
 {
        struct intel_engine_cs *engine = req->engine;
-       struct intel_ring *ring = req->ring;
-       int ret;
 
-       if (engine->semaphore.signal) {
-               ret = engine->semaphore.signal(req);
-               if (ret)
-                       return ret;
-       }
-
-       ret = intel_ring_begin(req, 8);
-       if (ret)
-               return ret;
+       if (engine->semaphore.signal)
+               out = engine->semaphore.signal(req, out);
 
-       intel_ring_emit(ring, GFX_OP_PIPE_CONTROL(6));
-       intel_ring_emit(ring, (PIPE_CONTROL_GLOBAL_GTT_IVB |
-                              PIPE_CONTROL_CS_STALL |
-                              PIPE_CONTROL_QW_WRITE));
-       intel_ring_emit(ring, intel_hws_seqno_address(engine));
-       intel_ring_emit(ring, 0);
-       intel_ring_emit(ring, req->global_seqno);
+       *out++ = GFX_OP_PIPE_CONTROL(6);
+       *out++ = (PIPE_CONTROL_GLOBAL_GTT_IVB |
+                 PIPE_CONTROL_CS_STALL |
+                 PIPE_CONTROL_QW_WRITE);
+       *out++ = intel_hws_seqno_address(engine);
+       *out++ = 0;
+       *out++ = req->global_seqno;
        /* We're thrashing one dword of HWS. */
-       intel_ring_emit(ring, 0);
-       intel_ring_emit(ring, MI_USER_INTERRUPT);
-       intel_ring_emit(ring, MI_NOOP);
-       intel_ring_advance(ring);
-
-       req->tail = ring->tail;
+       *out++ = 0;
+       *out++ = MI_USER_INTERRUPT;
+       *out++ = MI_NOOP;
 
-       return 0;
+       req->tail = intel_ring_offset(req->ring, out);
 }
 
+static const int gen8_render_emit_breadcrumb_sz = 8;
 
 #define I915_DISPATCH_SECURE BIT(0)
 #define I915_DISPATCH_PINNED BIT(1)
 #define I915_DISPATCH_RS     BIT(2)
-       int             (*emit_breadcrumb)(struct drm_i915_gem_request *req);
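+       /* Write the breadcrumb (seqno store plus user interrupt) into the
+        * space reserved at request->postfix; exactly emit_breadcrumb_sz
+        * dwords must be emitted.
+        */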
+       void            (*emit_breadcrumb)(struct drm_i915_gem_request *req,
+                                          u32 *out);
+       int             emit_breadcrumb_sz;
 
        /* Pass the request to the hardware queue (e.g. directly into
                /* AKA wait() */
                int     (*sync_to)(struct drm_i915_gem_request *req,
                                   struct drm_i915_gem_request *signal);
-               int     (*signal)(struct drm_i915_gem_request *req);
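+               /* Write the signalling commands at 'out' and return the
+                * advanced pointer; the space consumed is included in the
+                * engine's breadcrumb reservation.
+                */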
+               u32     *(*signal)(struct drm_i915_gem_request *req, u32 *out);
        } semaphore;
 
        /* Execlists */
         */
 }
 
-static inline u32 intel_ring_offset(struct intel_ring *ring, u32 value)
+static inline u32 intel_ring_offset(struct intel_ring *ring, void *addr)
 {
        /* Don't write ring->size (equivalent to 0) as that hangs some GPUs. */
-       return value & (ring->size - 1);
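+       /* 'addr' points just past the last dword written into the ring's
+        * vmap, e.g. the 'out' cursor after emitting a breadcrumb.
+        */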
+       u32 offset = addr - ring->vaddr;
+       return offset & (ring->size - 1);
 }
 
 int __intel_ring_space(int head, int tail, int size);