#define WA_TAIL_DWORDS 2
 #define WA_TAIL_BYTES (sizeof(u32) * WA_TAIL_DWORDS)
 
+struct virtual_engine {
+       struct intel_engine_cs base;
+       struct intel_context context;
+
+       /*
+        * We allow only a single request through the virtual engine at a time
+        * (each request in the timeline waits for the completion fence of
+        * the previous before being submitted). By restricting ourselves to
+        * only submitting a single request, each request is placed onto a
+        * physical engine to maximise load spreading (by virtue of the late
+        * greedy scheduling -- each real engine takes the next available
+        * request upon idling).
+        */
+       struct i915_request *request;
+
+       /*
+        * We keep an rbtree of available virtual engines inside each physical
+        * engine, sorted by priority. Here we preallocate the nodes we need
+        * for the virtual engine, indexed by physical_engine->id.
+        */
+       struct ve_node {
+               struct rb_node rb;
+               int prio;
+       } nodes[I915_NUM_ENGINES];
+
+       /* And finally, which physical engines this virtual engine maps onto. */
+       unsigned int num_siblings;
+       struct intel_engine_cs *siblings[0];
+};
+
+static struct virtual_engine *to_virtual_engine(struct intel_engine_cs *engine)
+{
+       GEM_BUG_ON(!intel_engine_is_virtual(engine));
+       return container_of(engine, struct virtual_engine, base);
+}
+
 static int execlists_context_deferred_alloc(struct intel_context *ce,
                                            struct intel_engine_cs *engine);
 static void execlists_init_reg_state(u32 *reg_state,
 }
 
 static inline bool need_preempt(const struct intel_engine_cs *engine,
-                               const struct i915_request *rq)
+                               const struct i915_request *rq,
+                               struct rb_node *rb)
 {
        int last_prio;
 
            rq_prio(list_next_entry(rq, link)) > last_prio)
                return true;
 
+       if (rb) {
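+               /*
+                * rb is the first eligible virtual request queued on this
+                * engine (found by execlists_dequeue); check whether it
+                * should preempt the currently executing context.
+                */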
+               struct virtual_engine *ve =
+                       rb_entry(rb, typeof(*ve), nodes[engine->id].rb);
+               bool preempt = false;
+
+               if (engine == ve->siblings[0]) { /* only preempt one sibling */
+                       struct i915_request *next;
+
+                       rcu_read_lock();
+                       next = READ_ONCE(ve->request);
+                       if (next)
+                               preempt = rq_prio(next) > last_prio;
+                       rcu_read_unlock();
+               }
+
+               if (preempt)
+                       return preempt;
+       }
+
        /*
         * If the inflight context did not trigger the preemption, then maybe
         * it was the set of queued requests? Pick the highest priority in
        list_for_each_entry_safe_reverse(rq, rn,
                                         &engine->timeline.requests,
                                         link) {
+               struct intel_engine_cs *owner;
+
                if (i915_request_completed(rq))
                        break;
 
 
                GEM_BUG_ON(rq->hw_context->active);
 
-               GEM_BUG_ON(rq_prio(rq) == I915_PRIORITY_INVALID);
-               if (rq_prio(rq) != prio) {
-                       prio = rq_prio(rq);
-                       pl = i915_sched_lookup_priolist(engine, prio);
-               }
-               GEM_BUG_ON(RB_EMPTY_ROOT(&engine->execlists.queue.rb_root));
-
-               list_add(&rq->sched.link, pl);
+               /*
+                * Push the request back into the queue for later resubmission.
+                * If this request is not native to this physical engine (i.e.
+                * it came from a virtual source), push it back onto the virtual
+                * engine so that it can be moved across onto another physical
+                * engine as load dictates.
+                */
+               owner = rq->hw_context->engine;
+               if (likely(owner == engine)) {
+                       GEM_BUG_ON(rq_prio(rq) == I915_PRIORITY_INVALID);
+                       if (rq_prio(rq) != prio) {
+                               prio = rq_prio(rq);
+                               pl = i915_sched_lookup_priolist(engine, prio);
+                       }
+                       GEM_BUG_ON(RB_EMPTY_ROOT(&engine->execlists.queue.rb_root));
 
-               active = rq;
+                       list_add(&rq->sched.link, pl);
+                       active = rq;
+               } else {
+                       rq->engine = owner;
+                       owner->submit_request(rq);
+                       active = NULL;
+               }
        }
 
        return active;
                                                  execlists));
 }
 
+static void virtual_update_register_offsets(u32 *regs,
+                                           struct intel_engine_cs *engine)
+{
+       u32 base = engine->mmio_base;
+
+       /* Must match execlists_init_reg_state()! */
+
+       regs[CTX_CONTEXT_CONTROL] =
+               i915_mmio_reg_offset(RING_CONTEXT_CONTROL(base));
+       regs[CTX_RING_HEAD] = i915_mmio_reg_offset(RING_HEAD(base));
+       regs[CTX_RING_TAIL] = i915_mmio_reg_offset(RING_TAIL(base));
+       regs[CTX_RING_BUFFER_START] = i915_mmio_reg_offset(RING_START(base));
+       regs[CTX_RING_BUFFER_CONTROL] = i915_mmio_reg_offset(RING_CTL(base));
+
+       regs[CTX_BB_HEAD_U] = i915_mmio_reg_offset(RING_BBADDR_UDW(base));
+       regs[CTX_BB_HEAD_L] = i915_mmio_reg_offset(RING_BBADDR(base));
+       regs[CTX_BB_STATE] = i915_mmio_reg_offset(RING_BBSTATE(base));
+       regs[CTX_SECOND_BB_HEAD_U] =
+               i915_mmio_reg_offset(RING_SBBADDR_UDW(base));
+       regs[CTX_SECOND_BB_HEAD_L] = i915_mmio_reg_offset(RING_SBBADDR(base));
+       regs[CTX_SECOND_BB_STATE] = i915_mmio_reg_offset(RING_SBBSTATE(base));
+
+       regs[CTX_CTX_TIMESTAMP] =
+               i915_mmio_reg_offset(RING_CTX_TIMESTAMP(base));
+       regs[CTX_PDP3_UDW] = i915_mmio_reg_offset(GEN8_RING_PDP_UDW(base, 3));
+       regs[CTX_PDP3_LDW] = i915_mmio_reg_offset(GEN8_RING_PDP_LDW(base, 3));
+       regs[CTX_PDP2_UDW] = i915_mmio_reg_offset(GEN8_RING_PDP_UDW(base, 2));
+       regs[CTX_PDP2_LDW] = i915_mmio_reg_offset(GEN8_RING_PDP_LDW(base, 2));
+       regs[CTX_PDP1_UDW] = i915_mmio_reg_offset(GEN8_RING_PDP_UDW(base, 1));
+       regs[CTX_PDP1_LDW] = i915_mmio_reg_offset(GEN8_RING_PDP_LDW(base, 1));
+       regs[CTX_PDP0_UDW] = i915_mmio_reg_offset(GEN8_RING_PDP_UDW(base, 0));
+       regs[CTX_PDP0_LDW] = i915_mmio_reg_offset(GEN8_RING_PDP_LDW(base, 0));
+
+       if (engine->class == RENDER_CLASS) {
+               regs[CTX_RCS_INDIRECT_CTX] =
+                       i915_mmio_reg_offset(RING_INDIRECT_CTX(base));
+               regs[CTX_RCS_INDIRECT_CTX_OFFSET] =
+                       i915_mmio_reg_offset(RING_INDIRECT_CTX_OFFSET(base));
+               regs[CTX_BB_PER_CTX_PTR] =
+                       i915_mmio_reg_offset(RING_BB_PER_CTX_PTR(base));
+
+               regs[CTX_R_PWR_CLK_STATE] =
+                       i915_mmio_reg_offset(GEN8_R_PWR_CLK_STATE);
+       }
+}
+
+static bool virtual_matches(const struct virtual_engine *ve,
+                           const struct i915_request *rq,
+                           const struct intel_engine_cs *engine)
+{
+       const struct intel_engine_cs *active;
+
+       /*
+        * We track when the HW has completed saving the context image
+        * (i.e. when we have seen the final CS event switching out of
+        * the context) and must not overwrite the context image before
+        * then. This restricts us to only using the active engine
+        * while the previous virtualized request is inflight (so
+        * we reuse the register offsets). This is a very small
+        * hysteresis on the greedy selection algorithm.
+        */
+       active = READ_ONCE(ve->context.active);
+       if (active && active != engine)
+               return false;
+
+       return true;
+}
+
+static void virtual_xfer_breadcrumbs(struct virtual_engine *ve,
+                                    struct intel_engine_cs *engine)
+{
+       struct intel_engine_cs *old = ve->siblings[0];
+
+       /* All unattached (rq->engine == old) must already be completed */
+
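+       /*
+        * Move our signaling list from the old sibling onto the new engine
+        * so that completion interrupts are tracked where the request will
+        * actually execute.
+        */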
+       spin_lock(&old->breadcrumbs.irq_lock);
+       if (!list_empty(&ve->context.signal_link)) {
+               list_move_tail(&ve->context.signal_link,
+                              &engine->breadcrumbs.signalers);
+               intel_engine_queue_breadcrumbs(engine);
+       }
+       spin_unlock(&old->breadcrumbs.irq_lock);
+}
+
 static void execlists_dequeue(struct intel_engine_cs *engine)
 {
        struct intel_engine_execlists * const execlists = &engine->execlists;
         * and context switches) submission.
         */
 
+       for (rb = rb_first_cached(&execlists->virtual); rb; ) {
+               struct virtual_engine *ve =
+                       rb_entry(rb, typeof(*ve), nodes[engine->id].rb);
+               struct i915_request *rq = READ_ONCE(ve->request);
+
+               if (!rq) { /* lazily clean up after another engine handled rq */
+                       rb_erase_cached(rb, &execlists->virtual);
+                       RB_CLEAR_NODE(rb);
+                       rb = rb_first_cached(&execlists->virtual);
+                       continue;
+               }
+
+               if (!virtual_matches(ve, rq, engine)) {
+                       rb = rb_next(rb);
+                       continue;
+               }
+
+               break;
+       }
+
        if (last) {
                /*
                 * Don't resubmit or switch until all outstanding
                if (!execlists_is_active(execlists, EXECLISTS_ACTIVE_HWACK))
                        return;
 
-               if (need_preempt(engine, last)) {
+               if (need_preempt(engine, last, rb)) {
                        inject_preempt_context(engine);
                        return;
                }
                last->tail = last->wa_tail;
        }
 
+       while (rb) { /* XXX virtual is always taking precedence */
+               struct virtual_engine *ve =
+                       rb_entry(rb, typeof(*ve), nodes[engine->id].rb);
+               struct i915_request *rq;
+
+               spin_lock(&ve->base.timeline.lock);
+
+               rq = ve->request;
+               if (unlikely(!rq)) { /* lost the race to a sibling */
+                       spin_unlock(&ve->base.timeline.lock);
+                       rb_erase_cached(rb, &execlists->virtual);
+                       RB_CLEAR_NODE(rb);
+                       rb = rb_first_cached(&execlists->virtual);
+                       continue;
+               }
+
+               GEM_BUG_ON(rq != ve->request);
+               GEM_BUG_ON(rq->engine != &ve->base);
+               GEM_BUG_ON(rq->hw_context != &ve->context);
+
+               if (rq_prio(rq) >= queue_prio(execlists)) {
+                       if (!virtual_matches(ve, rq, engine)) {
+                               spin_unlock(&ve->base.timeline.lock);
+                               rb = rb_next(rb);
+                               continue;
+                       }
+
+                       if (last && !can_merge_rq(last, rq)) {
+                               spin_unlock(&ve->base.timeline.lock);
+                               return; /* leave this rq for another engine */
+                       }
+
+                       GEM_TRACE("%s: virtual rq=%llx:%lld%s, new engine? %s\n",
+                                 engine->name,
+                                 rq->fence.context,
+                                 rq->fence.seqno,
+                                 i915_request_completed(rq) ? "!" :
+                                 i915_request_started(rq) ? "*" :
+                                 "",
+                                 yesno(engine != ve->siblings[0]));
+
+                       ve->request = NULL;
+                       ve->base.execlists.queue_priority_hint = INT_MIN;
+                       rb_erase_cached(rb, &execlists->virtual);
+                       RB_CLEAR_NODE(rb);
+
+                       rq->engine = engine;
+
+                       if (engine != ve->siblings[0]) {
+                               u32 *regs = ve->context.lrc_reg_state;
+                               unsigned int n;
+
+                               GEM_BUG_ON(READ_ONCE(ve->context.active));
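+                               /*
+                                * Retarget the saved register state at the
+                                * new engine's mmio base before execution.
+                                */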
+                               virtual_update_register_offsets(regs, engine);
+
+                               if (!list_empty(&ve->context.signals))
+                                       virtual_xfer_breadcrumbs(ve, engine);
+
+                               /*
+                                * Move the bound engine to the top of the list
+                                * for future execution. We then kick this
+                                * tasklet first before checking others, so that
+                                * we preferentially reuse this set of bound
+                                * registers.
+                                */
+                               for (n = 1; n < ve->num_siblings; n++) {
+                                       if (ve->siblings[n] == engine) {
+                                               swap(ve->siblings[n],
+                                                    ve->siblings[0]);
+                                               break;
+                                       }
+                               }
+
+                               GEM_BUG_ON(ve->siblings[0] != engine);
+                       }
+
+                       __i915_request_submit(rq);
+                       trace_i915_request_in(rq, port_index(port, execlists));
+                       submit = true;
+                       last = rq;
+               }
+
+               spin_unlock(&ve->base.timeline.lock);
+               break;
+       }
+
        while ((rb = rb_first_cached(&execlists->queue))) {
                struct i915_priolist *p = to_priolist(rb);
                struct i915_request *rq, *rn;
                               &execlists->csb_status[reset_value]);
 }
 
+static struct i915_request *active_request(struct i915_request *rq)
+{
+       const struct list_head * const list = &rq->engine->timeline.requests;
+       const struct intel_context * const context = rq->hw_context;
+       struct i915_request *active = NULL;
+
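+       /*
+        * Walk backwards along the engine timeline to find the oldest
+        * incomplete request still belonging to this context; that is the
+        * point from which we must replay after the reset.
+        */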
+       list_for_each_entry_from_reverse(rq, list, link) {
+               if (i915_request_completed(rq))
+                       break;
+
+               if (rq->hw_context != context)
+                       break;
+
+               active = rq;
+       }
+
+       return active;
+}
+
 static void __execlists_reset(struct intel_engine_cs *engine, bool stalled)
 {
        struct intel_engine_execlists * const execlists = &engine->execlists;
        if (!port_isset(execlists->port))
                goto out_clear;
 
-       ce = port_request(execlists->port)->hw_context;
+       rq = port_request(execlists->port);
+       ce = rq->hw_context;
 
        /*
         * Catch up with any missed context-switch interrupts.
         */
        execlists_cancel_port_requests(execlists);
 
-       /* Push back any incomplete requests for replay after the reset. */
-       rq = __unwind_incomplete_requests(engine);
+       rq = active_request(rq);
        if (!rq)
                goto out_replay;
 
-       if (rq->hw_context != ce) { /* caught just before a CS event */
-               rq = NULL;
-               goto out_replay;
-       }
-
        /*
         * If this request hasn't started yet, e.g. it is waiting on a
         * semaphore, we need to avoid skipping the request or else we
        }
        execlists_init_reg_state(regs, ce, engine, ce->ring);
 
-       /* Rerun the request; its payload has been neutered (if guilty). */
 out_replay:
+       /* Rerun the request; its payload has been neutered (if guilty). */
        ce->ring->head =
                rq ? intel_ring_wrap(ce->ring, rq->head) : ce->ring->tail;
        intel_ring_update_space(ce->ring);
        __execlists_update_reg_state(ce, engine);
 
+       /* Push back any incomplete requests for replay after the reset. */
+       __unwind_incomplete_requests(engine);
+
 out_clear:
        execlists_clear_all_active(execlists);
 }
                i915_priolist_free(p);
        }
 
+       /* Cancel all attached virtual engines */
+       while ((rb = rb_first_cached(&execlists->virtual))) {
+               struct virtual_engine *ve =
+                       rb_entry(rb, typeof(*ve), nodes[engine->id].rb);
+
+               rb_erase_cached(rb, &execlists->virtual);
+               RB_CLEAR_NODE(rb);
+
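+               /* Flush the queued virtual request as -EIO on this engine */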
+               spin_lock(&ve->base.timeline.lock);
+               if (ve->request) {
+                       ve->request->engine = engine;
+                       __i915_request_submit(ve->request);
+                       dma_fence_set_error(&ve->request->fence, -EIO);
+                       i915_request_mark_complete(ve->request);
+                       ve->base.execlists.queue_priority_hint = INT_MIN;
+                       ve->request = NULL;
+               }
+               spin_unlock(&ve->base.timeline.lock);
+       }
+
        /* Remaining _unready_ requests will be nop'ed when submitted */
 
        execlists->queue_priority_hint = INT_MIN;
        bool rcs = engine->class == RENDER_CLASS;
        u32 base = engine->mmio_base;
 
-       /* A context is actually a big batch buffer with several
+       /*
+        * A context is actually a big batch buffer with several
         * MI_LOAD_REGISTER_IMM commands followed by (reg, value) pairs. The
         * values we are setting here are only for the first context restore:
         * on a subsequent save, the GPU will recreate this batchbuffer with new
         * values (including all the missing MI_LOAD_REGISTER_IMM commands that
         * we are not initializing here).
+        *
+        * Must be kept consistent with virtual_update_register_offsets().
         */
        regs[CTX_LRI_HEADER_0] = MI_LOAD_REGISTER_IMM(rcs ? 14 : 11) |
                                 MI_LRI_FORCE_POSTED;
        return ret;
 }
 
+static void virtual_context_destroy(struct kref *kref)
+{
+       struct virtual_engine *ve =
+               container_of(kref, typeof(*ve), context.ref);
+       unsigned int n;
+
+       GEM_BUG_ON(ve->request);
+       GEM_BUG_ON(ve->context.active);
+
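+       /* Remove our preallocated nodes from each sibling's virtual rbtree */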
+       for (n = 0; n < ve->num_siblings; n++) {
+               struct intel_engine_cs *sibling = ve->siblings[n];
+               struct rb_node *node = &ve->nodes[sibling->id].rb;
+
+               if (RB_EMPTY_NODE(node))
+                       continue;
+
+               spin_lock_irq(&sibling->timeline.lock);
+
+               /* Detachment is lazily performed in the execlists tasklet */
+               if (!RB_EMPTY_NODE(node))
+                       rb_erase_cached(node, &sibling->execlists.virtual);
+
+               spin_unlock_irq(&sibling->timeline.lock);
+       }
+       GEM_BUG_ON(__tasklet_is_scheduled(&ve->base.execlists.tasklet));
+
+       if (ve->context.state)
+               __execlists_context_fini(&ve->context);
+
+       i915_timeline_fini(&ve->base.timeline);
+       kfree(ve);
+}
+
+static void virtual_engine_initial_hint(struct virtual_engine *ve)
+{
+       int swp;
+
+       /*
+        * Pick a random sibling on starting to help spread the load around.
+        *
+        * New contexts are typically created with exactly the same order
+        * of siblings, and often started in batches. Due to the way we iterate
+        * the array of siblings when submitting requests, sibling[0] is
+        * prioritised for dequeuing. If we make sure that sibling[0] is fairly
+        * randomised across the system, we also help spread the load by the
+        * first engine we inspect being different each time.
+        *
+        * NB: This does not force us to execute on this engine; it will just
+        * typically be the first we inspect for submission.
+        */
+       swp = prandom_u32_max(ve->num_siblings);
+       if (!swp)
+               return;
+
+       swap(ve->siblings[swp], ve->siblings[0]);
+       virtual_update_register_offsets(ve->context.lrc_reg_state,
+                                       ve->siblings[0]);
+}
+
+static int virtual_context_pin(struct intel_context *ce)
+{
+       struct virtual_engine *ve = container_of(ce, typeof(*ve), context);
+       int err;
+
+       /* Note: we must use a real engine class for setting up reg state */
+       err = __execlists_context_pin(ce, ve->siblings[0]);
+       if (err)
+               return err;
+
+       virtual_engine_initial_hint(ve);
+       return 0;
+}
+
+static void virtual_context_enter(struct intel_context *ce)
+{
+       struct virtual_engine *ve = container_of(ce, typeof(*ve), context);
+       unsigned int n;
+
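+       /* Keep every physical sibling awake while the virtual context runs */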
+       for (n = 0; n < ve->num_siblings; n++)
+               intel_engine_pm_get(ve->siblings[n]);
+}
+
+static void virtual_context_exit(struct intel_context *ce)
+{
+       struct virtual_engine *ve = container_of(ce, typeof(*ve), context);
+       unsigned int n;
+
+       ce->saturated = 0;
+       for (n = 0; n < ve->num_siblings; n++)
+               intel_engine_pm_put(ve->siblings[n]);
+}
+
+static const struct intel_context_ops virtual_context_ops = {
+       .pin = virtual_context_pin,
+       .unpin = execlists_context_unpin,
+
+       .enter = virtual_context_enter,
+       .exit = virtual_context_exit,
+
+       .destroy = virtual_context_destroy,
+};
+
+static void virtual_submission_tasklet(unsigned long data)
+{
+       struct virtual_engine * const ve = (struct virtual_engine *)data;
+       const int prio = ve->base.execlists.queue_priority_hint;
+       unsigned int n;
+
+       local_irq_disable();
+       for (n = 0; READ_ONCE(ve->request) && n < ve->num_siblings; n++) {
+               struct intel_engine_cs *sibling = ve->siblings[n];
+               struct ve_node * const node = &ve->nodes[sibling->id];
+               struct rb_node **parent, *rb;
+               bool first;
+
+               spin_lock(&sibling->timeline.lock);
+
+               if (!RB_EMPTY_NODE(&node->rb)) {
+                       /*
+                        * Cheat and avoid rebalancing the tree if we can
+                        * reuse this node in situ.
+                        */
+                       first = rb_first_cached(&sibling->execlists.virtual) ==
+                               &node->rb;
+                       if (prio == node->prio || (prio > node->prio && first))
+                               goto submit_engine;
+
+                       rb_erase_cached(&node->rb, &sibling->execlists.virtual);
+               }
+
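+               /*
+                * Insert our node into this sibling's rbtree, keyed by
+                * priority with the highest priority request leftmost.
+                */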
+               rb = NULL;
+               first = true;
+               parent = &sibling->execlists.virtual.rb_root.rb_node;
+               while (*parent) {
+                       struct ve_node *other;
+
+                       rb = *parent;
+                       other = rb_entry(rb, typeof(*other), rb);
+                       if (prio > other->prio) {
+                               parent = &rb->rb_left;
+                       } else {
+                               parent = &rb->rb_right;
+                               first = false;
+                       }
+               }
+
+               rb_link_node(&node->rb, rb, parent);
+               rb_insert_color_cached(&node->rb,
+                                      &sibling->execlists.virtual,
+                                      first);
+
+submit_engine:
+               GEM_BUG_ON(RB_EMPTY_NODE(&node->rb));
+               node->prio = prio;
+               if (first && prio > sibling->execlists.queue_priority_hint) {
+                       sibling->execlists.queue_priority_hint = prio;
+                       tasklet_hi_schedule(&sibling->execlists.tasklet);
+               }
+
+               spin_unlock(&sibling->timeline.lock);
+       }
+       local_irq_enable();
+}
+
+static void virtual_submit_request(struct i915_request *rq)
+{
+       struct virtual_engine *ve = to_virtual_engine(rq->engine);
+
+       GEM_TRACE("%s: rq=%llx:%lld\n",
+                 ve->base.name,
+                 rq->fence.context,
+                 rq->fence.seqno);
+
+       GEM_BUG_ON(ve->base.submit_request != virtual_submit_request);
+
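+       /*
+        * The virtual engine carries only a single request at a time; publish
+        * it and kick the tasklet to offer it to every physical sibling.
+        */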
+       GEM_BUG_ON(ve->request);
+       ve->base.execlists.queue_priority_hint = rq_prio(rq);
+       WRITE_ONCE(ve->request, rq);
+
+       tasklet_schedule(&ve->base.execlists.tasklet);
+}
+
+struct intel_context *
+intel_execlists_create_virtual(struct i915_gem_context *ctx,
+                              struct intel_engine_cs **siblings,
+                              unsigned int count)
+{
+       struct virtual_engine *ve;
+       unsigned int n;
+       int err;
+
+       if (count == 0)
+               return ERR_PTR(-EINVAL);
+
+       if (count == 1)
+               return intel_context_create(ctx, siblings[0]);
+
+       ve = kzalloc(struct_size(ve, siblings, count), GFP_KERNEL);
+       if (!ve)
+               return ERR_PTR(-ENOMEM);
+
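+       /*
+        * Set up the skeleton engine; the virtual engine has no physical id
+        * of its own and inherits its class and emitters from its siblings.
+        */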
+       ve->base.i915 = ctx->i915;
+       ve->base.id = -1;
+       ve->base.class = OTHER_CLASS;
+       ve->base.uabi_class = I915_ENGINE_CLASS_INVALID;
+       ve->base.instance = I915_ENGINE_CLASS_INVALID_VIRTUAL;
+       ve->base.flags = I915_ENGINE_IS_VIRTUAL;
+
+       snprintf(ve->base.name, sizeof(ve->base.name), "virtual");
+
+       err = i915_timeline_init(ctx->i915, &ve->base.timeline, NULL);
+       if (err)
+               goto err_put;
+       i915_timeline_set_subclass(&ve->base.timeline, TIMELINE_VIRTUAL);
+
+       intel_engine_init_execlists(&ve->base);
+
+       ve->base.cops = &virtual_context_ops;
+       ve->base.request_alloc = execlists_request_alloc;
+
+       ve->base.schedule = i915_schedule;
+       ve->base.submit_request = virtual_submit_request;
+
+       ve->base.execlists.queue_priority_hint = INT_MIN;
+       tasklet_init(&ve->base.execlists.tasklet,
+                    virtual_submission_tasklet,
+                    (unsigned long)ve);
+
+       intel_context_init(&ve->context, ctx, &ve->base);
+
+       for (n = 0; n < count; n++) {
+               struct intel_engine_cs *sibling = siblings[n];
+
+               GEM_BUG_ON(!is_power_of_2(sibling->mask));
+               if (sibling->mask & ve->base.mask) {
+                       DRM_DEBUG("duplicate %s entry in load balancer\n",
+                                 sibling->name);
+                       err = -EINVAL;
+                       goto err_put;
+               }
+
+               /*
+                * The virtual engine implementation is tightly coupled to
+                * the execlists backend -- we push our requests directly
+                * into a tree inside each physical engine. We could support
+                * layering if we handle cloning of the requests and
+                * submitting a copy into each backend.
+                */
+               if (sibling->execlists.tasklet.func !=
+                   execlists_submission_tasklet) {
+                       err = -ENODEV;
+                       goto err_put;
+               }
+
+               GEM_BUG_ON(RB_EMPTY_NODE(&ve->nodes[sibling->id].rb));
+               RB_CLEAR_NODE(&ve->nodes[sibling->id].rb);
+
+               ve->siblings[ve->num_siblings++] = sibling;
+               ve->base.mask |= sibling->mask;
+
+               /*
+                * All physical engines must be compatible for their emission
+                * functions (as we build the instructions during request
+                * construction and do not alter them before submission
+                * on the physical engine). We use the engine class as a guide
+                * here, although that could be refined.
+                */
+               if (ve->base.class != OTHER_CLASS) {
+                       if (ve->base.class != sibling->class) {
+                               DRM_DEBUG("invalid mixing of engine class, sibling %d, already %d\n",
+                                         sibling->class, ve->base.class);
+                               err = -EINVAL;
+                               goto err_put;
+                       }
+                       continue;
+               }
+
+               ve->base.class = sibling->class;
+               ve->base.uabi_class = sibling->uabi_class;
+               snprintf(ve->base.name, sizeof(ve->base.name),
+                        "v%dx%d", ve->base.class, count);
+               ve->base.context_size = sibling->context_size;
+
+               ve->base.emit_bb_start = sibling->emit_bb_start;
+               ve->base.emit_flush = sibling->emit_flush;
+               ve->base.emit_init_breadcrumb = sibling->emit_init_breadcrumb;
+               ve->base.emit_fini_breadcrumb = sibling->emit_fini_breadcrumb;
+               ve->base.emit_fini_breadcrumb_dw =
+                       sibling->emit_fini_breadcrumb_dw;
+       }
+
+       return &ve->context;
+
+err_put:
+       intel_context_put(&ve->context);
+       return ERR_PTR(err);
+}
+
+struct intel_context *
+intel_execlists_clone_virtual(struct i915_gem_context *ctx,
+                             struct intel_engine_cs *src)
+{
+       struct virtual_engine *se = to_virtual_engine(src);
+       struct intel_context *dst;
+
+       dst = intel_execlists_create_virtual(ctx,
+                                            se->siblings,
+                                            se->num_siblings);
+       if (IS_ERR(dst))
+               return dst;
+
+       return dst;
+}
+
 void intel_execlists_show_requests(struct intel_engine_cs *engine,
                                   struct drm_printer *m,
                                   void (*show_request)(struct drm_printer *m,
                show_request(m, last, "\t\tQ ");
        }
 
+       last = NULL;
+       count = 0;
+       for (rb = rb_first_cached(&execlists->virtual); rb; rb = rb_next(rb)) {
+               struct virtual_engine *ve =
+                       rb_entry(rb, typeof(*ve), nodes[engine->id].rb);
+               struct i915_request *rq = READ_ONCE(ve->request);
+
+               if (rq) {
+                       if (count++ < max - 1)
+                               show_request(m, rq, "\t\tV ");
+                       else
+                               last = rq;
+               }
+       }
+       if (last) {
+               if (count > max) {
+                       drm_printf(m,
+                                  "\t\t...skipping %d virtual requests...\n",
+                                  count - max);
+               }
+               show_request(m, last, "\t\tV ");
+       }
+
        spin_unlock_irqrestore(&engine->timeline.lock, flags);
 }
 
 
        return err;
 }
 
+static int nop_virtual_engine(struct drm_i915_private *i915,
+                             struct intel_engine_cs **siblings,
+                             unsigned int nsibling,
+                             unsigned int nctx,
+                             unsigned int flags)
+#define CHAIN BIT(0)
+{
+       IGT_TIMEOUT(end_time);
+       struct i915_request *request[16];
+       struct i915_gem_context *ctx[16];
+       struct intel_context *ve[16];
+       unsigned long n, prime, nc;
+       struct igt_live_test t;
+       ktime_t times[2] = {};
+       int err;
+
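+       /*
+        * Submit batches of empty requests through the virtual engine(s),
+        * either chaining all requests per context (CHAIN) or interleaving
+        * them across contexts, and report the latency to completion.
+        */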
+       GEM_BUG_ON(!nctx || nctx > ARRAY_SIZE(ctx));
+
+       for (n = 0; n < nctx; n++) {
+               ctx[n] = kernel_context(i915);
+               if (!ctx[n]) {
+                       err = -ENOMEM;
+                       nctx = n;
+                       goto out;
+               }
+
+               ve[n] = intel_execlists_create_virtual(ctx[n],
+                                                      siblings, nsibling);
+               if (IS_ERR(ve[n])) {
+                       kernel_context_close(ctx[n]);
+                       err = PTR_ERR(ve[n]);
+                       nctx = n;
+                       goto out;
+               }
+
+               err = intel_context_pin(ve[n]);
+               if (err) {
+                       intel_context_put(ve[n]);
+                       kernel_context_close(ctx[n]);
+                       nctx = n;
+                       goto out;
+               }
+       }
+
+       err = igt_live_test_begin(&t, i915, __func__, ve[0]->engine->name);
+       if (err)
+               goto out;
+
+       for_each_prime_number_from(prime, 1, 8192) {
+               times[1] = ktime_get_raw();
+
+               if (flags & CHAIN) {
+                       for (nc = 0; nc < nctx; nc++) {
+                               for (n = 0; n < prime; n++) {
+                                       request[nc] =
+                                               i915_request_create(ve[nc]);
+                                       if (IS_ERR(request[nc])) {
+                                               err = PTR_ERR(request[nc]);
+                                               goto out;
+                                       }
+
+                                       i915_request_add(request[nc]);
+                               }
+                       }
+               } else {
+                       for (n = 0; n < prime; n++) {
+                               for (nc = 0; nc < nctx; nc++) {
+                                       request[nc] =
+                                               i915_request_create(ve[nc]);
+                                       if (IS_ERR(request[nc])) {
+                                               err = PTR_ERR(request[nc]);
+                                               goto out;
+                                       }
+
+                                       i915_request_add(request[nc]);
+                               }
+                       }
+               }
+
+               for (nc = 0; nc < nctx; nc++) {
+                       if (i915_request_wait(request[nc],
+                                             I915_WAIT_LOCKED,
+                                             HZ / 10) < 0) {
+                               pr_err("%s(%s): wait for %llx:%lld timed out\n",
+                                      __func__, ve[0]->engine->name,
+                                      request[nc]->fence.context,
+                                      request[nc]->fence.seqno);
+
+                               GEM_TRACE("%s(%s) failed at request %llx:%lld\n",
+                                         __func__, ve[0]->engine->name,
+                                         request[nc]->fence.context,
+                                         request[nc]->fence.seqno);
+                               GEM_TRACE_DUMP();
+                               i915_gem_set_wedged(i915);
+                               break;
+                       }
+               }
+
+               times[1] = ktime_sub(ktime_get_raw(), times[1]);
+               if (prime == 1)
+                       times[0] = times[1];
+
+               if (__igt_timeout(end_time, NULL))
+                       break;
+       }
+
+       err = igt_live_test_end(&t);
+       if (err)
+               goto out;
+
+       pr_info("Requestx%d latencies on %s: 1 = %lluns, %lu = %lluns\n",
+               nctx, ve[0]->engine->name, ktime_to_ns(times[0]),
+               prime, div64_u64(ktime_to_ns(times[1]), prime));
+
+out:
+       if (igt_flush_test(i915, I915_WAIT_LOCKED))
+               err = -EIO;
+
+       for (nc = 0; nc < nctx; nc++) {
+               intel_context_unpin(ve[nc]);
+               intel_context_put(ve[nc]);
+               kernel_context_close(ctx[nc]);
+       }
+       return err;
+}
+
+static int live_virtual_engine(void *arg)
+{
+       struct drm_i915_private *i915 = arg;
+       struct intel_engine_cs *siblings[MAX_ENGINE_INSTANCE + 1];
+       struct intel_engine_cs *engine;
+       enum intel_engine_id id;
+       unsigned int class, inst;
+       int err = -ENODEV;
+
+       if (USES_GUC_SUBMISSION(i915))
+               return 0;
+
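+       /*
+        * First wrap each physical engine individually, then build a virtual
+        * engine over every class with multiple instances, ramping up the
+        * number of contexts submitting in parallel.
+        */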
+       mutex_lock(&i915->drm.struct_mutex);
+
+       for_each_engine(engine, i915, id) {
+               err = nop_virtual_engine(i915, &engine, 1, 1, 0);
+               if (err) {
+                       pr_err("Failed to wrap engine %s: err=%d\n",
+                              engine->name, err);
+                       goto out_unlock;
+               }
+       }
+
+       for (class = 0; class <= MAX_ENGINE_CLASS; class++) {
+               int nsibling, n;
+
+               nsibling = 0;
+               for (inst = 0; inst <= MAX_ENGINE_INSTANCE; inst++) {
+                       if (!i915->engine_class[class][inst])
+                               continue;
+
+                       siblings[nsibling++] = i915->engine_class[class][inst];
+               }
+               if (nsibling < 2)
+                       continue;
+
+               for (n = 1; n <= nsibling + 1; n++) {
+                       err = nop_virtual_engine(i915, siblings, nsibling,
+                                                n, 0);
+                       if (err)
+                               goto out_unlock;
+               }
+
+               err = nop_virtual_engine(i915, siblings, nsibling, n, CHAIN);
+               if (err)
+                       goto out_unlock;
+       }
+
+out_unlock:
+       mutex_unlock(&i915->drm.struct_mutex);
+       return err;
+}
+
 int intel_execlists_live_selftests(struct drm_i915_private *i915)
 {
        static const struct i915_subtest tests[] = {
                SUBTEST(live_chain_preempt),
                SUBTEST(live_preempt_hang),
                SUBTEST(live_preempt_smoke),
+               SUBTEST(live_virtual_engine),
        };
 
        if (!HAS_EXECLISTS(i915))