return rq->engine->emit_flush(rq, EMIT_FLUSH);
 }
 
-static inline int mi_set_context(struct i915_request *rq, u32 flags)
+static inline int mi_set_context(struct i915_request *rq,
+                                struct intel_context *ce,
+                                u32 flags)
 {
        struct drm_i915_private *i915 = rq->i915;
        struct intel_engine_cs *engine = rq->engine;
 
        *cs++ = MI_NOOP;
        *cs++ = MI_SET_CONTEXT;
-       *cs++ = i915_ggtt_offset(rq->context->state) | flags;
+       *cs++ = i915_ggtt_offset(ce->state) | flags;
        /*
         * w/a: MI_SET_CONTEXT must always be followed by MI_NOOP
         * WaMiSetContext_Hang:snb,ivb,vlv
        return rq->engine->emit_flush(rq, EMIT_INVALIDATE);
 }
 
+/*
+ * Run the engine's context-switch wa_bb from the kernel context so that
+ * residual state from the previous user context can be cleared before
+ * the next context is loaded.
+ */
+static int clear_residuals(struct i915_request *rq)
+{
+       struct intel_engine_cs *engine = rq->engine;
+       int ret;
+
+       ret = switch_mm(rq, vm_alias(engine->kernel_context->vm));
+       if (ret)
+               return ret;
+
+       if (engine->kernel_context->state) {
+               ret = mi_set_context(rq,
+                                    engine->kernel_context,
+                                    MI_MM_SPACE_GTT | MI_RESTORE_INHIBIT);
+               if (ret)
+                       return ret;
+       }
+
+       ret = engine->emit_bb_start(rq,
+                                   engine->wa_ctx.vma->node.start, 0,
+                                   0);
+       if (ret)
+               return ret;
+
+       ret = engine->emit_flush(rq, EMIT_FLUSH);
+       if (ret)
+               return ret;
+
+       /* Always invalidate before the next switch_mm() */
+       return engine->emit_flush(rq, EMIT_INVALIDATE);
+}
+
 static int switch_context(struct i915_request *rq)
 {
+       struct intel_engine_cs *engine = rq->engine;
        struct intel_context *ce = rq->context;
+       void **residuals = NULL;
        int ret;
 
        GEM_BUG_ON(HAS_EXECLISTS(rq->i915));
 
+       /*
+        * The context-switch wa_bb only needs to run when the engine is
+        * handed to a different user context than the one that last ran
+        * on it (tracked in wa_ctx.vma->private); the kernel context is
+        * exempt.
+        */
+       if (engine->wa_ctx.vma && ce != engine->kernel_context) {
+               if (engine->wa_ctx.vma->private != ce) {
+                       ret = clear_residuals(rq);
+                       if (ret)
+                               return ret;
+
+                       residuals = &engine->wa_ctx.vma->private;
+               }
+       }
+
        ret = switch_mm(rq, vm_alias(ce->vm));
        if (ret)
                return ret;
        if (ce->state) {
                u32 flags;
 
-               GEM_BUG_ON(rq->engine->id != RCS0);
+               GEM_BUG_ON(engine->id != RCS0);
 
                /* For resource streamer on HSW+ and power context elsewhere */
                BUILD_BUG_ON(HSW_MI_RS_SAVE_STATE_EN != MI_SAVE_EXT_STATE_EN);
                else
                        flags |= MI_RESTORE_INHIBIT;
 
-               ret = mi_set_context(rq, flags);
+               ret = mi_set_context(rq, ce, flags);
                if (ret)
                        return ret;
        }
        if (ret)
                return ret;
 
+       /*
+        * Now past the point of no return, this request _will_ be emitted.
+        *
+        * Or at least this preamble will be emitted, the request may be
+        * interrupted prior to submitting the user payload. If so, we
+        * still submit the "empty" request in order to preserve global
+        * state tracking such as this, our tracking of the current
+        * dirty context.
+        */
+       if (residuals) {
+               intel_context_put(*residuals);
+               *residuals = intel_context_get(ce);
+       }
+
        return 0;
 }
 
 
        intel_engine_cleanup_common(engine);
 
+       if (engine->wa_ctx.vma) {
+               intel_context_put(engine->wa_ctx.vma->private);
+               i915_vma_unpin_and_release(&engine->wa_ctx.vma, 0);
+       }
+
        intel_ring_unpin(engine->legacy.ring);
        intel_ring_put(engine->legacy.ring);
 
        engine->emit_fini_breadcrumb = gen7_xcs_emit_breadcrumb;
 }
 
+static int gen7_ctx_switch_bb_setup(struct intel_engine_cs * const engine,
+                                   struct i915_vma * const vma)
+{
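+       /*
+        * No context-switch batch is emitted yet; returning 0 tells
+        * gen7_ctx_switch_bb_init() that no wa_bb object is required
+        * for this engine.
+        */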
+       return 0;
+}
+
+static int gen7_ctx_switch_bb_init(struct intel_engine_cs *engine)
+{
+       struct drm_i915_gem_object *obj;
+       struct i915_vma *vma;
+       int size;
+       int err;
+
+       size = gen7_ctx_switch_bb_setup(engine, NULL /* probe size */);
+       if (size <= 0)
+               return size;
+
+       size = ALIGN(size, PAGE_SIZE);
+       obj = i915_gem_object_create_internal(engine->i915, size);
+       if (IS_ERR(obj))
+               return PTR_ERR(obj);
+
+       vma = i915_vma_instance(obj, engine->gt->vm, NULL);
+       if (IS_ERR(vma)) {
+               err = PTR_ERR(vma);
+               goto err_obj;
+       }
+
+       vma->private = intel_context_create(engine); /* dummy residuals */
+       if (IS_ERR(vma->private)) {
+               err = PTR_ERR(vma->private);
+               goto err_obj;
+       }
+
+       err = i915_vma_pin(vma, 0, 0, PIN_USER | PIN_HIGH);
+       if (err)
+               goto err_private;
+
+       err = gen7_ctx_switch_bb_setup(engine, vma);
+       if (err)
+               goto err_unpin;
+
+       engine->wa_ctx.vma = vma;
+       return 0;
+
+err_unpin:
+       i915_vma_unpin(vma);
+err_private:
+       intel_context_put(vma->private);
+err_obj:
+       i915_gem_object_put(obj);
+       return err;
+}
+
 int intel_ring_submission_setup(struct intel_engine_cs *engine)
 {
        struct intel_timeline *timeline;
 
        GEM_BUG_ON(timeline->hwsp_ggtt != engine->status_page.vma);
 
+       if (IS_GEN(engine->i915, 7) && engine->class == RENDER_CLASS) {
+               err = gen7_ctx_switch_bb_init(engine);
+               if (err)
+                       goto err_ring_unpin;
+       }
+
        /* Finally, take ownership and responsibility for cleanup! */
        engine->release = ring_release;
 
        return 0;
 
+err_ring_unpin:
+       intel_ring_unpin(ring);
 err_ring:
        intel_ring_put(ring);
 err_timeline_unpin:
        intel_engine_cleanup_common(engine);
        return err;
 }
+
+#if IS_ENABLED(CONFIG_DRM_I915_SELFTEST)
+#include "selftest_ring_submission.c"
+#endif
 
--- /dev/null
+// SPDX-License-Identifier: MIT
+/*
+ * Copyright © 2020 Intel Corporation
+ */
+
+#include "intel_engine_pm.h"
+#include "selftests/igt_flush_test.h"
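+
+/*
+ * Build a small batch that stores STACK_MAGIC into its own backing page
+ * at byte offset 4000. Installing it in place of the engine's wa_bb lets
+ * the test observe exactly when the context-switch batch is executed.
+ */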
+
+static struct i915_vma *create_wally(struct intel_engine_cs *engine)
+{
+       struct drm_i915_gem_object *obj;
+       struct i915_vma *vma;
+       u32 *cs;
+       int err;
+
+       obj = i915_gem_object_create_internal(engine->i915, 4096);
+       if (IS_ERR(obj))
+               return ERR_CAST(obj);
+
+       vma = i915_vma_instance(obj, engine->gt->vm, NULL);
+       if (IS_ERR(vma)) {
+               i915_gem_object_put(obj);
+               return vma;
+       }
+
+       err = i915_vma_pin(vma, 0, 0, PIN_USER | PIN_HIGH);
+       if (err) {
+               i915_gem_object_put(obj);
+               return ERR_PTR(err);
+       }
+
+       cs = i915_gem_object_pin_map(obj, I915_MAP_WC);
+       if (IS_ERR(cs)) {
+               i915_gem_object_put(obj);
+               return ERR_CAST(cs);
+       }
+
+       /* Store STACK_MAGIC into this object at byte offset 4000 (dword 1000) */
+       if (INTEL_GEN(engine->i915) >= 6) {
+               *cs++ = MI_STORE_DWORD_IMM_GEN4;
+               *cs++ = 0;
+       } else if (INTEL_GEN(engine->i915) >= 4) {
+               *cs++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT;
+               *cs++ = 0;
+       } else {
+               *cs++ = MI_STORE_DWORD_IMM | MI_MEM_VIRTUAL;
+       }
+       *cs++ = vma->node.start + 4000;
+       *cs++ = STACK_MAGIC;
+
+       *cs++ = MI_BATCH_BUFFER_END;
+
+       /* Flush the CPU writes before the GPU samples the batch */
+       i915_gem_object_flush_map(obj);
+       i915_gem_object_unpin_map(obj);
+
+       vma->private = intel_context_create(engine); /* dummy residuals */
+       if (IS_ERR(vma->private)) {
+               vma = ERR_CAST(vma->private);
+               i915_gem_object_put(obj);
+       }
+
+       return vma;
+}
+
+static int context_sync(struct intel_context *ce)
+{
+       struct i915_request *rq;
+       int err = 0;
+
+       rq = intel_context_create_request(ce);
+       if (IS_ERR(rq))
+               return PTR_ERR(rq);
+
+       i915_request_get(rq);
+       i915_request_add(rq);
+
+       if (i915_request_wait(rq, 0, HZ / 5) < 0)
+               err = -ETIME;
+       i915_request_put(rq);
+
+       return err;
+}
+
+static int new_context_sync(struct intel_engine_cs *engine)
+{
+       struct intel_context *ce;
+       int err;
+
+       ce = intel_context_create(engine);
+       if (IS_ERR(ce))
+               return PTR_ERR(ce);
+
+       err = context_sync(ce);
+       intel_context_put(ce);
+
+       return err;
+}
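+
+/*
+ * Three checks per pass: switching to the kernel context must not run the
+ * wa_bb, but each switch to a fresh user context (whether coming from the
+ * kernel context or from another user context) must.
+ */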
+
+static int mixed_contexts_sync(struct intel_engine_cs *engine, u32 *result)
+{
+       int pass;
+       int err;
+
+       for (pass = 0; pass < 2; pass++) {
+               WRITE_ONCE(*result, 0);
+               err = context_sync(engine->kernel_context);
+               if (err || READ_ONCE(*result)) {
+                       if (!err) {
+                               pr_err("pass[%d] wa_bb emitted for the kernel context\n",
+                                      pass);
+                               err = -EINVAL;
+                       }
+                       return err;
+               }
+
+               WRITE_ONCE(*result, 0);
+               err = new_context_sync(engine);
+               if (READ_ONCE(*result) != STACK_MAGIC) {
+                       if (!err) {
+                               pr_err("pass[%d] wa_bb *NOT* emitted after the kernel context\n",
+                                      pass);
+                               err = -EINVAL;
+                       }
+                       return err;
+               }
+
+               WRITE_ONCE(*result, 0);
+               err = new_context_sync(engine);
+               if (READ_ONCE(*result) != STACK_MAGIC) {
+                       if (!err) {
+                               pr_err("pass[%d] wa_bb *NOT* emitted for the user context switch\n",
+                                      pass);
+                               err = -EINVAL;
+                       }
+                       return err;
+               }
+       }
+
+       return 0;
+}
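+
+/* Submitting the same user context twice in a row must not run the wa_bb. */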
+
+static int double_context_sync_00(struct intel_engine_cs *engine, u32 *result)
+{
+       struct intel_context *ce;
+       int err, i;
+
+       ce = intel_context_create(engine);
+       if (IS_ERR(ce))
+               return PTR_ERR(ce);
+
+       for (i = 0; i < 2; i++) {
+               WRITE_ONCE(*result, 0);
+               err = context_sync(ce);
+               if (err)
+                       break;
+       }
+       intel_context_put(ce);
+       if (err)
+               return err;
+
+       if (READ_ONCE(*result)) {
+               pr_err("wa_bb emitted between the same user context\n");
+               return -EINVAL;
+       }
+
+       return 0;
+}
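+
+/*
+ * A kernel-context submission in between two runs of the same user context
+ * must not force the wa_bb either; only a new user context counts.
+ */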
+
+static int kernel_context_sync_00(struct intel_engine_cs *engine, u32 *result)
+{
+       struct intel_context *ce;
+       int err, i;
+
+       ce = intel_context_create(engine);
+       if (IS_ERR(ce))
+               return PTR_ERR(ce);
+
+       for (i = 0; i < 2; i++) {
+               WRITE_ONCE(*result, 0);
+               err = context_sync(ce);
+               if (err)
+                       break;
+
+               err = context_sync(engine->kernel_context);
+               if (err)
+                       break;
+       }
+       intel_context_put(ce);
+       if (err)
+               return err;
+
+       if (READ_ONCE(*result)) {
+               pr_err("wa_bb emitted between the same user context [with intervening kernel]\n");
+               return -EINVAL;
+       }
+
+       return 0;
+}
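+
+/*
+ * Temporarily install the marker batch from create_wally() as the engine's
+ * wa_bb and run the context-switch scenarios against it.
+ */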
+
+static int __live_ctx_switch_wa(struct intel_engine_cs *engine)
+{
+       struct i915_vma *bb;
+       u32 *result;
+       int err;
+
+       bb = create_wally(engine);
+       if (IS_ERR(bb))
+               return PTR_ERR(bb);
+
+       result = i915_gem_object_pin_map(bb->obj, I915_MAP_WC);
+       if (IS_ERR(result)) {
+               intel_context_put(bb->private);
+               i915_vma_unpin_and_release(&bb, 0);
+               return PTR_ERR(result);
+       }
+       result += 1000; /* dword 1000 == byte 4000, the marker written by create_wally() */
+
+       engine->wa_ctx.vma = bb;
+
+       err = mixed_contexts_sync(engine, result);
+       if (err)
+               goto out;
+
+       err = double_context_sync_00(engine, result);
+       if (err)
+               goto out;
+
+       err = kernel_context_sync_00(engine, result);
+       if (err)
+               goto out;
+
+out:
+       intel_context_put(engine->wa_ctx.vma->private);
+       i915_vma_unpin_and_release(&engine->wa_ctx.vma, I915_VMA_RELEASE_MAP);
+       return err;
+}
+
+static int live_ctx_switch_wa(void *arg)
+{
+       struct intel_gt *gt = arg;
+       struct intel_engine_cs *engine;
+       enum intel_engine_id id;
+
+       /*
+        * Exercise the inter-context wa batch.
+        *
+        * Between consecutive user contexts we run a wa batch, and since it may
+        * have implications for user visible state, we have to check that
+        * we do actually execute it.
+        *
+        * The trick we use is to replace the normal wa batch with a custom
+        * one that writes to a marker within it, and we can then look for
+        * that marker to confirm if the batch was run when we expect it,
+        * and, equally important, that it was not run when we don't!
+        */
+
+       for_each_engine(engine, gt, id) {
+               struct i915_vma *saved_wa;
+               int err;
+
+               if (!intel_engine_can_store_dword(engine))
+                       continue;
+
+               if (IS_GEN_RANGE(gt->i915, 4, 5))
+                       continue; /* MI_STORE_DWORD is privileged! */
+
+               saved_wa = fetch_and_zero(&engine->wa_ctx.vma);
+
+               intel_engine_pm_get(engine);
+               err = __live_ctx_switch_wa(engine);
+               intel_engine_pm_put(engine);
+               if (igt_flush_test(gt->i915))
+                       err = -EIO;
+
+               engine->wa_ctx.vma = saved_wa;
+               if (err)
+                       return err;
+       }
+
+       return 0;
+}
+
+int intel_ring_submission_live_selftests(struct drm_i915_private *i915)
+{
+       static const struct i915_subtest tests[] = {
+               SUBTEST(live_ctx_switch_wa),
+       };
+
+       if (HAS_EXECLISTS(i915))
+               return 0;
+
+       return intel_gt_live_subtests(tests, &i915->gt);
+}