struct virtual_engine {
        struct intel_engine_cs base;
        struct intel_context context;
+       struct rcu_work rcu;
 
        /*
         * We allow only a single request through the virtual engine at a time
        return &ve->base.execlists.default_priolist.requests[0];
 }
 
-static void virtual_context_destroy(struct kref *kref)
+static void rcu_virtual_context_destroy(struct work_struct *wrk)
 {
        struct virtual_engine *ve =
-               container_of(kref, typeof(*ve), context.ref);
+               container_of(wrk, typeof(*ve), rcu.work);
        unsigned int n;
 
-       GEM_BUG_ON(!list_empty(virtual_queue(ve)));
-       GEM_BUG_ON(ve->request);
        GEM_BUG_ON(ve->context.inflight);
 
+       /* Preempt-to-busy may leave a stale request behind. */
+       if (unlikely(ve->request)) {
+               struct i915_request *old;
+
+               spin_lock_irq(&ve->base.active.lock);
+
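+               /*
+                * Preempt-to-busy unsubmitted this request back to us even
+                * though the HW had already completed it. Flush it through
+                * submission so it can be retired, dropping the reference
+                * that ve->request held.
+                */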
+               old = fetch_and_zero(&ve->request);
+               if (old) {
+                       GEM_BUG_ON(!i915_request_completed(old));
+                       __i915_request_submit(old);
+                       i915_request_put(old);
+               }
+
+               spin_unlock_irq(&ve->base.active.lock);
+       }
+
+       /*
+        * Flush the tasklet in case it is still running on another core.
+        *
+        * This must be done before we remove ourselves from the siblings'
+        * rbtrees: if the tasklet is running in parallel, it may reinsert
+        * the rb_node into a sibling's tree.
+        */
+       tasklet_kill(&ve->base.execlists.tasklet);
+
+       /* Decouple ourselves from the siblings, no more access allowed. */
        for (n = 0; n < ve->num_siblings; n++) {
                struct intel_engine_cs *sibling = ve->siblings[n];
                struct rb_node *node = &ve->nodes[sibling->id].rb;
-               unsigned long flags;
 
                if (RB_EMPTY_NODE(node))
                        continue;
 
-               spin_lock_irqsave(&sibling->active.lock, flags);
+               spin_lock_irq(&sibling->active.lock);
 
                /* Detachment is lazily performed in the execlists tasklet */
                if (!RB_EMPTY_NODE(node))
                        rb_erase_cached(node, &sibling->execlists.virtual);
 
-               spin_unlock_irqrestore(&sibling->active.lock, flags);
+               spin_unlock_irq(&sibling->active.lock);
        }
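+       /*
+        * Fully decoupled: the tasklet can no longer be scheduled against
+        * this engine, and no request can remain on the virtual queue.
+        */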
        GEM_BUG_ON(__tasklet_is_scheduled(&ve->base.execlists.tasklet));
+       GEM_BUG_ON(!list_empty(virtual_queue(ve)));
 
        if (ve->context.state)
                __execlists_context_fini(&ve->context);
        kfree(ve);
 }
 
+static void virtual_context_destroy(struct kref *kref)
+{
+       struct virtual_engine *ve =
+               container_of(kref, typeof(*ve), context.ref);
+
+       GEM_BUG_ON(!list_empty(&ve->context.signals));
+
+       /*
+        * When destroying the virtual engine, we have to be aware that
+        * it may still be in use from a hardirq/softirq context, causing
+        * the resubmission of a completed request (background completion
+        * due to preempt-to-busy). Before we can free the engine, we need
+        * to flush the submission code and tasklets that are still potentially
+        * accessing the engine. Flushing the tasklets requires process context,
+        * and since we can guard the resubmit onto the engine with an RCU read
+        * lock, we can delegate freeing the engine to an RCU worker.
+        */
+       INIT_RCU_WORK(&ve->rcu, rcu_virtual_context_destroy);
+       queue_rcu_work(system_wq, &ve->rcu);
+}
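
/*
 * For illustration only, not part of the diff above: a minimal sketch of
 * the read-side pairing the comment in virtual_context_destroy() relies
 * on. Any path that may resubmit onto the virtual engine runs either in
 * the execlists tasklet (softirq, an implicit RCU read-side critical
 * section) or under an explicit rcu_read_lock(), so queue_rcu_work()
 * guarantees a full grace period elapses before
 * rcu_virtual_context_destroy() can free the engine.
 * resubmit_onto_virtual() is a hypothetical name for such a path.
 */
static void resubmit_onto_virtual(struct virtual_engine *ve,
				  struct i915_request *rq)
{
	rcu_read_lock(); /* pin ve against the RCU-deferred kfree() */
	spin_lock_irq(&ve->base.active.lock);
	if (!i915_request_completed(rq))
		WRITE_ONCE(ve->request, i915_request_get(rq));
	spin_unlock_irq(&ve->base.active.lock);
	rcu_read_unlock();
}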
+
 static void virtual_engine_initial_hint(struct virtual_engine *ve)
 {
        int swp;