www.infradead.org Git - users/willy/xarray.git/commitdiff
drm/i915/gt: Allow failed resets without assertion
author: Chris Wilson <chris@chris-wilson.co.uk>
Mon, 4 Jan 2021 11:51:42 +0000 (11:51 +0000)
committer: Chris Wilson <chris@chris-wilson.co.uk>
Tue, 5 Jan 2021 09:17:22 +0000 (09:17 +0000)
If the engine reset fails, we will attempt to resume with the current
inflight submissions. When that happens, we cannot assert that the
engine reset cleared the pending submission, so do not.

Closes: https://gitlab.freedesktop.org/drm/intel/-/issues/2878
Fixes: 16f2941ad307 ("drm/i915/gt: Replace direct submit with direct call to tasklet")
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Mika Kuoppala <mika.kuoppala@linux.intel.com>
Reviewed-by: Andi Shyti <andi.shyti@intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20210104115145.24460-3-chris@chris-wilson.co.uk
drivers/gpu/drm/i915/gt/intel_engine_types.h
drivers/gpu/drm/i915/gt/intel_execlists_submission.c
drivers/gpu/drm/i915/gt/intel_reset.c
drivers/gpu/drm/i915/gt/selftest_execlists.c

index c28f4e190fe6907caa16aa7252a13f7bfb5b06e5..430066e5884cda08bda14f21571d32f2fa0dcb98 100644 (file)
@@ -561,6 +561,8 @@ struct intel_engine_cs {
                unsigned long stop_timeout_ms;
                unsigned long timeslice_duration_ms;
        } props, defaults;
+
+       I915_SELFTEST_DECLARE(struct fault_attr reset_timeout);
 };
 
 static inline bool
index 2afbc0a4ca031af126975e5a683f63f546a81ee7..f02e3ae10d282aec417e89d527ada50ccd97770b 100644 (file)
@@ -3047,9 +3047,13 @@ static void execlists_reset_finish(struct intel_engine_cs *engine)
         * After a GPU reset, we may have requests to replay. Do so now while
         * we still have the forcewake to be sure that the GPU is not allowed
         * to sleep before we restart and reload a context.
+        *
+        * If the GPU reset fails, the engine may still be alive with requests
+        * inflight. We expect those to complete, or for the device to be
+        * reset as the next level of recovery, and as a final resort we
+        * will declare the device wedged.
         */
        GEM_BUG_ON(!reset_in_progress(execlists));
-       GEM_BUG_ON(engine->execlists.pending[0]);
 
        /* And kick in case we missed a new request submission. */
        if (__tasklet_enable(&execlists->tasklet))
index e02775fc326de4fbbff72dab16c386ced3dda203..b516b2c0528dc2012889641b1b266d62c1c39e34 100644 (file)
@@ -497,6 +497,9 @@ static int gen8_engine_reset_prepare(struct intel_engine_cs *engine)
        u32 request, mask, ack;
        int ret;
 
+       if (I915_SELFTEST_ONLY(should_fail(&engine->reset_timeout, 1)))
+               return -ETIMEDOUT;
+
        ack = intel_uncore_read_fw(uncore, reg);
        if (ack & RESET_CTL_CAT_ERROR) {
                /*
index 3854da5a4e65cdd05b0c75631b5dfbeef18cf22c..bfa7fd5c2c915d0e78c7ba95b1a104953c7ec374 100644 (file)
@@ -2299,6 +2299,77 @@ out:
        return err;
 }
 
+static void force_reset_timeout(struct intel_engine_cs *engine)
+{
+       engine->reset_timeout.probability = 999;
+       atomic_set(&engine->reset_timeout.times, -1);
+}
+
+static void cancel_reset_timeout(struct intel_engine_cs *engine)
+{
+       memset(&engine->reset_timeout, 0, sizeof(engine->reset_timeout));
+}
+
+static int __cancel_fail(struct live_preempt_cancel *arg)
+{
+       struct intel_engine_cs *engine = arg->engine;
+       struct i915_request *rq;
+       int err;
+
+       if (!IS_ACTIVE(CONFIG_DRM_I915_PREEMPT_TIMEOUT))
+               return 0;
+
+       if (!intel_has_reset_engine(engine->gt))
+               return 0;
+
+       GEM_TRACE("%s(%s)\n", __func__, engine->name);
+       rq = spinner_create_request(&arg->a.spin,
+                                   arg->a.ctx, engine,
+                                   MI_NOOP); /* preemption disabled */
+       if (IS_ERR(rq))
+               return PTR_ERR(rq);
+
+       clear_bit(CONTEXT_BANNED, &rq->context->flags);
+       i915_request_get(rq);
+       i915_request_add(rq);
+       if (!igt_wait_for_spinner(&arg->a.spin, rq)) {
+               err = -EIO;
+               goto out;
+       }
+
+       intel_context_set_banned(rq->context);
+
+       err = intel_engine_pulse(engine);
+       if (err)
+               goto out;
+
+       force_reset_timeout(engine);
+
+       /* force preempt reset [failure] */
+       while (!engine->execlists.pending[0])
+               intel_engine_flush_submission(engine);
+       del_timer_sync(&engine->execlists.preempt);
+       intel_engine_flush_submission(engine);
+
+       cancel_reset_timeout(engine);
+
+       /* after failure, require heartbeats to reset device */
+       intel_engine_set_heartbeat(engine, 1);
+       err = wait_for_reset(engine, rq, HZ / 2);
+       intel_engine_set_heartbeat(engine,
+                                  engine->defaults.heartbeat_interval_ms);
+       if (err) {
+               pr_err("Cancelled inflight0 request did not reset\n");
+               goto out;
+       }
+
+out:
+       i915_request_put(rq);
+       if (igt_flush_test(engine->i915))
+               err = -EIO;
+       return err;
+}
+
 static int live_preempt_cancel(void *arg)
 {
        struct intel_gt *gt = arg;
@@ -2338,6 +2409,10 @@ static int live_preempt_cancel(void *arg)
                err = __cancel_hostile(&data);
                if (err)
                        goto err_wedged;
+
+               err = __cancel_fail(&data);
+               if (err)
+                       goto err_wedged;
        }
 
        err = 0;