seq_puts(m, "Wedged\n");
        if (test_bit(I915_RESET_BACKOFF, &dev_priv->gpu_error.flags))
                seq_puts(m, "Reset in progress: struct_mutex backoff\n");
-       if (test_bit(I915_RESET_HANDOFF, &dev_priv->gpu_error.flags))
-               seq_puts(m, "Reset in progress: reset handoff to waiter\n");
        if (waitqueue_active(&dev_priv->gpu_error.wait_queue))
                seq_puts(m, "Waiter holding struct mutex\n");
        if (waitqueue_active(&dev_priv->gpu_error.reset_queue))
                struct rb_node *rb;
 
                seq_printf(m, "%s:\n", engine->name);
-               seq_printf(m, "\tseqno = %x [current %x, last %x]\n",
+               seq_printf(m, "\tseqno = %x [current %x, last %x], %dms ago\n",
                           engine->hangcheck.seqno, seqno[id],
-                          intel_engine_last_submit(engine));
-               seq_printf(m, "\twaiters? %s, fake irq active? %s, stalled? %s, wedged? %s\n",
+                          intel_engine_last_submit(engine),
+                          jiffies_to_msecs(jiffies -
+                                           engine->hangcheck.action_timestamp));
+               seq_printf(m, "\twaiters? %s, fake irq active? %s\n",
                           yesno(intel_engine_has_waiter(engine)),
                           yesno(test_bit(engine->id,
-                                         &dev_priv->gpu_error.missed_irq_rings)),
-                          yesno(engine->hangcheck.stalled),
-                          yesno(engine->hangcheck.wedged));
+                                         &dev_priv->gpu_error.missed_irq_rings)));
 
                spin_lock_irq(&b->rb_lock);
                for (rb = rb_first(&b->waiters); rb; rb = rb_next(rb)) {
                seq_printf(m, "\tACTHD = 0x%08llx [current 0x%08llx]\n",
                           (long long)engine->hangcheck.acthd,
                           (long long)acthd[id]);
-               seq_printf(m, "\taction = %s(%d) %d ms ago\n",
-                          hangcheck_action_to_str(engine->hangcheck.action),
-                          engine->hangcheck.action,
-                          jiffies_to_msecs(jiffies -
-                                           engine->hangcheck.action_timestamp));
 
                if (engine->id == RCS) {
                        seq_puts(m, "\tinstdone read =\n");
 i915_wedged_set(void *data, u64 val)
 {
        struct drm_i915_private *i915 = data;
-       struct intel_engine_cs *engine;
-       unsigned int tmp;
 
        /*
         * There is no safeguard against this debugfs entry colliding
        if (i915_reset_backoff(&i915->gpu_error))
                return -EAGAIN;
 
-       for_each_engine_masked(engine, i915, val, tmp) {
-               engine->hangcheck.seqno = intel_engine_get_seqno(engine);
-               engine->hangcheck.stalled = true;
-       }
-
        i915_handle_error(i915, val, I915_ERROR_CAPTURE,
                          "Manually set wedged engine mask = %llx", val);
-
-       wait_on_bit(&i915->gpu_error.flags,
-                   I915_RESET_HANDOFF,
-                   TASK_UNINTERRUPTIBLE);
-
        return 0;
 }
 
                mutex_unlock(&i915->drm.struct_mutex);
        }
 
-       if (val & DROP_RESET_ACTIVE &&
-           i915_terminally_wedged(&i915->gpu_error)) {
+       if (val & DROP_RESET_ACTIVE && i915_terminally_wedged(&i915->gpu_error))
                i915_handle_error(i915, ALL_ENGINES, 0, NULL);
-               wait_on_bit(&i915->gpu_error.flags,
-                           I915_RESET_HANDOFF,
-                           TASK_UNINTERRUPTIBLE);
-       }
 
        fs_reclaim_acquire(GFP_KERNEL);
        if (val & DROP_BOUND)
 
        return unlikely(test_bit(I915_RESET_BACKOFF, &error->flags));
 }
 
-static inline bool i915_reset_handoff(struct i915_gpu_error *error)
-{
-       return unlikely(test_bit(I915_RESET_HANDOFF, &error->flags));
-}
-
 static inline bool i915_terminally_wedged(struct i915_gpu_error *error)
 {
        return unlikely(test_bit(I915_WEDGED, &error->flags));
 
                     struct intel_rps_client *rps_client)
 {
        might_sleep();
-#if IS_ENABLED(CONFIG_LOCKDEP)
-       GEM_BUG_ON(debug_locks &&
-                  !!lockdep_is_held(&obj->base.dev->struct_mutex) !=
-                  !!(flags & I915_WAIT_LOCKED));
-#endif
        GEM_BUG_ON(timeout < 0);
 
        timeout = i915_gem_object_wait_reservation(obj->resv,
 
        GEM_TRACE("\n");
 
-       mutex_lock(&i915->drm.struct_mutex);
-
        wakeref = intel_runtime_pm_get(i915);
        intel_uncore_forcewake_get(i915, FORCEWAKE_ALL);
 
        intel_uncore_forcewake_put(i915, FORCEWAKE_ALL);
        intel_runtime_pm_put(i915, wakeref);
 
+       mutex_lock(&i915->drm.struct_mutex);
        i915_gem_contexts_lost(i915);
        mutex_unlock(&i915->drm.struct_mutex);
 }
        wakeref = intel_runtime_pm_get(i915);
        intel_suspend_gt_powersave(i915);
 
+       flush_workqueue(i915->wq);
+
        mutex_lock(&i915->drm.struct_mutex);
 
        /*
        i915_retire_requests(i915); /* ensure we flush after wedging */
 
        mutex_unlock(&i915->drm.struct_mutex);
+       i915_reset_flush(i915);
 
-       intel_uc_suspend(i915);
-
-       cancel_delayed_work_sync(&i915->gpu_error.hangcheck_work);
-       cancel_delayed_work_sync(&i915->gt.retire_work);
+       drain_delayed_work(&i915->gt.retire_work);
 
        /*
         * As the idle_work is rearming if it detects a race, play safe and
         */
        drain_delayed_work(&i915->gt.idle_work);
 
+       intel_uc_suspend(i915);
+
        /*
         * Assert that we successfully flushed all the work and
         * reset the GPU back to its idle, low power state.
 
 #include <linux/pagevec.h>
 
 #include "i915_request.h"
+#include "i915_reset.h"
 #include "i915_selftest.h"
 #include "i915_timeline.h"
 
 
        err_printf(m, "  waiting: %s\n", yesno(ee->waiting));
        err_printf(m, "  ring->head: 0x%08x\n", ee->cpu_ring_head);
        err_printf(m, "  ring->tail: 0x%08x\n", ee->cpu_ring_tail);
-       err_printf(m, "  hangcheck stall: %s\n", yesno(ee->hangcheck_stalled));
-       err_printf(m, "  hangcheck action: %s\n",
-                  hangcheck_action_to_str(ee->hangcheck_action));
-       err_printf(m, "  hangcheck action timestamp: %dms (%lu%s)\n",
+       err_printf(m, "  hangcheck timestamp: %dms (%lu%s)\n",
                   jiffies_to_msecs(ee->hangcheck_timestamp - epoch),
                   ee->hangcheck_timestamp,
                   ee->hangcheck_timestamp == epoch ? "; epoch" : "");
                   jiffies_to_msecs(error->capture - error->epoch));
 
        for (i = 0; i < ARRAY_SIZE(error->engine); i++) {
-               if (error->engine[i].hangcheck_stalled &&
-                   error->engine[i].context.pid) {
-                       err_printf(m, "Active process (on ring %s): %s [%d], score %d%s\n",
-                                  engine_name(m->i915, i),
-                                  error->engine[i].context.comm,
-                                  error->engine[i].context.pid,
-                                  error->engine[i].context.ban_score,
-                                  bannable(&error->engine[i].context));
-               }
+               if (!error->engine[i].context.pid)
+                       continue;
+
+               err_printf(m, "Active process (on ring %s): %s [%d], score %d%s\n",
+                          engine_name(m->i915, i),
+                          error->engine[i].context.comm,
+                          error->engine[i].context.pid,
+                          error->engine[i].context.ban_score,
+                          bannable(&error->engine[i].context));
        }
        err_printf(m, "Reset count: %u\n", error->reset_count);
        err_printf(m, "Suspend count: %u\n", error->suspend_count);
        return i;
 }
 
-/* Generate a semi-unique error code. The code is not meant to have meaning, The
+/*
+ * Generate a semi-unique error code. The code is not meant to have meaning. The
  * code's only purpose is to try to prevent false duplicated bug reports by
  * grossly estimating a GPU error state.
  *
  *
  * It's only a small step better than a random number in its current form.
  */
-static u32 i915_error_generate_code(struct drm_i915_private *dev_priv,
-                                   struct i915_gpu_state *error,
-                                   int *engine_id)
+static u32 i915_error_generate_code(struct i915_gpu_state *error,
+                                   unsigned long engine_mask)
 {
-       u32 error_code = 0;
-       int i;
-
-       /* IPEHR would be an ideal way to detect errors, as it's the gross
+       /*
+        * IPEHR would be an ideal way to detect errors, as it's the gross
         * measure of "the command that hung." However, it has some very common
         * synchronization commands which almost always appear in the case of
         * strictly a client bug. Use instdone to differentiate some of those cases.
         */
-       for (i = 0; i < I915_NUM_ENGINES; i++) {
-               if (error->engine[i].hangcheck_stalled) {
-                       if (engine_id)
-                               *engine_id = i;
+       if (engine_mask) {
+               struct drm_i915_error_engine *ee =
+                       &error->engine[ffs(engine_mask) - 1];
 
-                       return error->engine[i].ipehr ^
-                              error->engine[i].instdone.instdone;
-               }
+               return ee->ipehr ^ ee->instdone.instdone;
        }
 
-       return error_code;
+       return 0;
 }
 
 static void gem_record_fences(struct i915_gpu_state *error)
        }
 
        ee->idle = intel_engine_is_idle(engine);
-       ee->hangcheck_timestamp = engine->hangcheck.action_timestamp;
-       ee->hangcheck_action = engine->hangcheck.action;
-       ee->hangcheck_stalled = engine->hangcheck.stalled;
+       if (!ee->idle)
+               ee->hangcheck_timestamp = engine->hangcheck.action_timestamp;
        ee->reset_count = i915_reset_engine_count(&dev_priv->gpu_error,
                                                  engine);
 
        error->pgtbl_er = I915_READ(PGTBL_ER);
 }
 
-static void i915_error_capture_msg(struct drm_i915_private *dev_priv,
-                                  struct i915_gpu_state *error,
-                                  u32 engine_mask,
-                                  const char *error_msg)
+static const char *
+error_msg(struct i915_gpu_state *error, unsigned long engines, const char *msg)
 {
-       u32 ecode;
-       int engine_id = -1, len;
+       int len;
+       int i;
 
-       ecode = i915_error_generate_code(dev_priv, error, &engine_id);
+       for (i = 0; i < ARRAY_SIZE(error->engine); i++)
+               if (!error->engine[i].context.pid)
+                       engines &= ~BIT(i);
 
        len = scnprintf(error->error_msg, sizeof(error->error_msg),
-                       "GPU HANG: ecode %d:%d:0x%08x",
-                       INTEL_GEN(dev_priv), engine_id, ecode);
-
-       if (engine_id != -1 && error->engine[engine_id].context.pid)
+                       "GPU HANG: ecode %d:%lx:0x%08x",
+                       INTEL_GEN(error->i915), engines,
+                       i915_error_generate_code(error, engines));
+       if (engines) {
+               /* Just show the first executing process, more is confusing */
+               i = ffs(engines) - 1;
                len += scnprintf(error->error_msg + len,
                                 sizeof(error->error_msg) - len,
                                 ", in %s [%d]",
-                                error->engine[engine_id].context.comm,
-                                error->engine[engine_id].context.pid);
+                                error->engine[i].context.comm,
+                                error->engine[i].context.pid);
+       }
+       if (msg)
+               len += scnprintf(error->error_msg + len,
+                                sizeof(error->error_msg) - len,
+                                ", %s", msg);
 
-       scnprintf(error->error_msg + len, sizeof(error->error_msg) - len,
-                 ", reason: %s, action: %s",
-                 error_msg,
-                 engine_mask ? "reset" : "continue");
+       return error->error_msg;
 }
 
 static void capture_gen_state(struct i915_gpu_state *error)
        for (i = 0; i < ARRAY_SIZE(error->engine); i++) {
                const struct drm_i915_error_engine *ee = &error->engine[i];
 
-               if (ee->hangcheck_stalled &&
+               if (ee->hangcheck_timestamp &&
                    time_before(ee->hangcheck_timestamp, epoch))
                        epoch = ee->hangcheck_timestamp;
        }
  * i915_capture_error_state - capture an error record for later analysis
  * @i915: i915 device
  * @engine_mask: the mask of engines triggering the hang
- * @error_msg: a message to insert into the error capture header
+ * @msg: a message to insert into the error capture header
  *
  * Should be called when an error is detected (either a hang or an error
  * interrupt) to capture error state from the time of the error.  Fills
  * to pick up.
  */
 void i915_capture_error_state(struct drm_i915_private *i915,
-                             u32 engine_mask,
-                             const char *error_msg)
+                             unsigned long engine_mask,
+                             const char *msg)
 {
        static bool warned;
        struct i915_gpu_state *error;
        if (IS_ERR(error))
                return;
 
-       i915_error_capture_msg(i915, error, engine_mask, error_msg);
-       DRM_INFO("%s\n", error->error_msg);
+       dev_info(i915->drm.dev, "%s\n", error_msg(error, engine_mask, msg));
 
        if (!error->simulated) {
                spin_lock_irqsave(&i915->gpu_error.lock, flags);
 
                bool waiting;
                int num_waiters;
                unsigned long hangcheck_timestamp;
-               bool hangcheck_stalled;
-               enum intel_engine_hangcheck_action hangcheck_action;
                struct i915_address_space *vm;
                int num_requests;
                u32 reset_count;
        struct scatterlist *sgl, *fit;
 };
 
+struct i915_gpu_restart;
+
 struct i915_gpu_error {
        /* For hangcheck timer */
 #define DRM_I915_HANGCHECK_PERIOD 1500 /* in ms */
         * i915_mutex_lock_interruptible()?). I915_RESET_BACKOFF serves a
         * secondary role in preventing two concurrent global reset attempts.
         *
-        * #I915_RESET_HANDOFF - To perform the actual GPU reset, we need the
-        * struct_mutex. We try to acquire the struct_mutex in the reset worker,
-        * but it may be held by some long running waiter (that we cannot
-        * interrupt without causing trouble). Once we are ready to do the GPU
-        * reset, we set the I915_RESET_HANDOFF bit and wakeup any waiters. If
-        * they already hold the struct_mutex and want to participate they can
-        * inspect the bit and do the reset directly, otherwise the worker
-        * waits for the struct_mutex.
-        *
         * #I915_RESET_ENGINE[num_engines] - Since the driver doesn't need to
         * acquire the struct_mutex to reset an engine, we need an explicit
         * flag to prevent two concurrent reset attempts in the same engine.
         */
        unsigned long flags;
 #define I915_RESET_BACKOFF     0
-#define I915_RESET_HANDOFF     1
-#define I915_RESET_MODESET     2
-#define I915_RESET_ENGINE      3
+#define I915_RESET_MODESET     1
+#define I915_RESET_ENGINE      2
 #define I915_WEDGED            (BITS_PER_LONG - 1)
 
        /** Number of times an engine has been reset */
        u32 reset_engine_count[I915_NUM_ENGINES];
 
-       /** Set of stalled engines with guilty requests, in the current reset */
-       u32 stalled_mask;
-
-       /** Reason for the current *global* reset */
-       const char *reason;
-
        struct mutex wedge_mutex; /* serialises wedging/unwedging */
 
        /**
 
        /* For missed irq/seqno simulation. */
        unsigned long test_irq_rings;
+
+       struct i915_gpu_restart *restart;
 };
 
 struct drm_i915_error_state_buf {
 
 struct i915_gpu_state *i915_capture_gpu_state(struct drm_i915_private *i915);
 void i915_capture_error_state(struct drm_i915_private *dev_priv,
-                             u32 engine_mask,
+                             unsigned long engine_mask,
                              const char *error_msg);
 
 static inline struct i915_gpu_state *
 
        return false;
 }
 
-static bool __i915_wait_request_check_and_reset(struct i915_request *request)
-{
-       struct i915_gpu_error *error = &request->i915->gpu_error;
-
-       if (likely(!i915_reset_handoff(error)))
-               return false;
-
-       __set_current_state(TASK_RUNNING);
-       i915_reset(request->i915, error->stalled_mask, error->reason);
-       return true;
-}
-
 /**
  * i915_request_wait - wait until execution of request has finished
  * @rq: the request to wait upon
 {
        const int state = flags & I915_WAIT_INTERRUPTIBLE ?
                TASK_INTERRUPTIBLE : TASK_UNINTERRUPTIBLE;
-       wait_queue_head_t *errq = &rq->i915->gpu_error.wait_queue;
-       DEFINE_WAIT_FUNC(reset, default_wake_function);
        DEFINE_WAIT_FUNC(exec, default_wake_function);
        struct intel_wait wait;
 
        might_sleep();
-#if IS_ENABLED(CONFIG_LOCKDEP)
-       GEM_BUG_ON(debug_locks &&
-                  !!lockdep_is_held(&rq->i915->drm.struct_mutex) !=
-                  !!(flags & I915_WAIT_LOCKED));
-#endif
        GEM_BUG_ON(timeout < 0);
 
        if (i915_request_completed(rq))
                return -ETIME;
 
        trace_i915_request_wait_begin(rq, flags);
-
        add_wait_queue(&rq->execute, &exec);
-       if (flags & I915_WAIT_LOCKED)
-               add_wait_queue(errq, &reset);
-
        intel_wait_init(&wait);
        if (flags & I915_WAIT_PRIORITY)
                i915_schedule_bump_priority(rq, I915_PRIORITY_WAIT);
                if (intel_wait_update_request(&wait, rq))
                        break;
 
-               if (flags & I915_WAIT_LOCKED &&
-                   __i915_wait_request_check_and_reset(rq))
-                       continue;
-
                if (signal_pending_state(state, current)) {
                        timeout = -ERESTARTSYS;
                        goto complete;
                 */
                goto wakeup;
 
-       if (flags & I915_WAIT_LOCKED)
-               __i915_wait_request_check_and_reset(rq);
-
        for (;;) {
                if (signal_pending_state(state, current)) {
                        timeout = -ERESTARTSYS;
                if (i915_request_completed(rq))
                        break;
 
-               /*
-                * If the GPU is hung, and we hold the lock, reset the GPU
-                * and then check for completion. On a full reset, the engine's
-                * HW seqno will be advanced passed us and we are complete.
-                * If we do a partial reset, we have to wait for the GPU to
-                * resume and update the breadcrumb.
-                *
-                * If we don't hold the mutex, we can just wait for the worker
-                * to come along and update the breadcrumb (either directly
-                * itself, or indirectly by recovering the GPU).
-                */
-               if (flags & I915_WAIT_LOCKED &&
-                   __i915_wait_request_check_and_reset(rq))
-                       continue;
-
                /* Only spin if we know the GPU is processing this request */
                if (__i915_spin_request(rq, wait.seqno, state, 2))
                        break;
        intel_engine_remove_wait(rq->engine, &wait);
 complete:
        __set_current_state(TASK_RUNNING);
-       if (flags & I915_WAIT_LOCKED)
-               remove_wait_queue(errq, &reset);
        remove_wait_queue(&rq->execute, &exec);
        trace_i915_request_wait_end(rq);
 
 
  */
 
 #include <linux/sched/mm.h>
+#include <linux/stop_machine.h>
 
 #include "i915_drv.h"
 #include "i915_gpu_error.h"
 
 #define RESET_MAX_RETRIES 3
 
+/* XXX How to handle concurrent GGTT updates using tiling registers? */
+#define RESET_UNDER_STOP_MACHINE 0
+
 static void engine_skip_context(struct i915_request *rq)
 {
        struct intel_engine_cs *engine = rq->engine;
        struct i915_gem_context *hung_ctx = rq->gem_context;
        struct i915_timeline *timeline = rq->timeline;
-       unsigned long flags;
 
+       lockdep_assert_held(&engine->timeline.lock);
        GEM_BUG_ON(timeline == &engine->timeline);
 
-       spin_lock_irqsave(&engine->timeline.lock, flags);
        spin_lock(&timeline->lock);
 
-       list_for_each_entry_continue(rq, &engine->timeline.requests, link)
-               if (rq->gem_context == hung_ctx)
-                       i915_request_skip(rq, -EIO);
+       if (rq->global_seqno) {
+               list_for_each_entry_continue(rq,
+                                            &engine->timeline.requests, link)
+                       if (rq->gem_context == hung_ctx)
+                               i915_request_skip(rq, -EIO);
+       }
 
        list_for_each_entry(rq, &timeline->requests, link)
                i915_request_skip(rq, -EIO);
 
        spin_unlock(&timeline->lock);
-       spin_unlock_irqrestore(&engine->timeline.lock, flags);
 }
 
 static void client_mark_guilty(struct drm_i915_file_private *file_priv,
        }
 }
 
-static void context_mark_guilty(struct i915_gem_context *ctx)
+static bool context_mark_guilty(struct i915_gem_context *ctx)
 {
        unsigned int score;
        bool banned, bannable;
 
        /* Cool contexts don't accumulate client ban score */
        if (!bannable)
-               return;
+               return false;
 
        if (banned) {
                DRM_DEBUG_DRIVER("context %s: guilty %d, score %u, banned\n",
 
        if (!IS_ERR_OR_NULL(ctx->file_priv))
                client_mark_guilty(ctx->file_priv, ctx);
+
+       return banned;
 }
 
 static void context_mark_innocent(struct i915_gem_context *ctx)
        atomic_inc(&ctx->active_count);
 }
 
+void i915_reset_request(struct i915_request *rq, bool guilty)
+{
+       lockdep_assert_held(&rq->engine->timeline.lock);
+       GEM_BUG_ON(i915_request_completed(rq));
+
+       if (guilty) {
+               i915_request_skip(rq, -EIO);
+               if (context_mark_guilty(rq->gem_context))
+                       engine_skip_context(rq);
+       } else {
+               dma_fence_set_error(&rq->fence, -EAGAIN);
+               context_mark_innocent(rq->gem_context);
+       }
+}
+
 static void gen3_stop_engine(struct intel_engine_cs *engine)
 {
        struct drm_i915_private *dev_priv = engine->i915;
  * Ensure irq handler finishes, and not run again.
  * Also return the active request so that we only search for it once.
  */
-static struct i915_request *
-reset_prepare_engine(struct intel_engine_cs *engine)
+static void reset_prepare_engine(struct intel_engine_cs *engine)
 {
-       struct i915_request *rq;
-
        /*
         * During the reset sequence, we must prevent the engine from
         * entering RC6. As the context state is undefined until we restart
         * GPU state upon resume, i.e. fail to restart after a reset.
         */
        intel_uncore_forcewake_get(engine->i915, FORCEWAKE_ALL);
-
-       rq = engine->reset.prepare(engine);
-       if (rq && rq->fence.error == -EIO)
-               rq = ERR_PTR(-EIO); /* Previous reset failed! */
-
-       return rq;
+       engine->reset.prepare(engine);
 }
 
-static int reset_prepare(struct drm_i915_private *i915)
+static void reset_prepare(struct drm_i915_private *i915)
 {
        struct intel_engine_cs *engine;
-       struct i915_request *rq;
        enum intel_engine_id id;
-       int err = 0;
 
-       for_each_engine(engine, i915, id) {
-               rq = reset_prepare_engine(engine);
-               if (IS_ERR(rq)) {
-                       err = PTR_ERR(rq);
-                       continue;
-               }
-
-               engine->hangcheck.active_request = rq;
-       }
+       for_each_engine(engine, i915, id)
+               reset_prepare_engine(engine);
 
-       i915_gem_revoke_fences(i915);
        intel_uc_sanitize(i915);
-
-       return err;
 }
 
-/* Returns the request if it was guilty of the hang */
-static struct i915_request *
-reset_request(struct intel_engine_cs *engine,
-             struct i915_request *rq,
-             bool stalled)
+static int gt_reset(struct drm_i915_private *i915, unsigned int stalled_mask)
 {
+       struct intel_engine_cs *engine;
+       enum intel_engine_id id;
+       int err;
+
        /*
-        * The guilty request will get skipped on a hung engine.
-        *
-        * Users of client default contexts do not rely on logical
-        * state preserved between batches so it is safe to execute
-        * queued requests following the hang. Non default contexts
-        * rely on preserved state, so skipping a batch loses the
-        * evolution of the state and it needs to be considered corrupted.
-        * Executing more queued batches on top of corrupted state is
-        * risky. But we take the risk by trying to advance through
-        * the queued requests in order to make the client behaviour
-        * more predictable around resets, by not throwing away random
-        * amount of batches it has prepared for execution. Sophisticated
-        * clients can use gem_reset_stats_ioctl and dma fence status
-        * (exported via sync_file info ioctl on explicit fences) to observe
-        * when it loses the context state and should rebuild accordingly.
-        *
-        * The context ban, and ultimately the client ban, mechanism are safety
-        * valves if client submission ends up resulting in nothing more than
-        * subsequent hangs.
+        * Everything depends on having the GTT running, so we need to start
+        * there.
         */
+       err = i915_ggtt_enable_hw(i915);
+       if (err)
+               return err;
 
-       if (i915_request_completed(rq)) {
-               GEM_TRACE("%s pardoned global=%d (fence %llx:%lld), current %d\n",
-                         engine->name, rq->global_seqno,
-                         rq->fence.context, rq->fence.seqno,
-                         intel_engine_get_seqno(engine));
-               stalled = false;
-       }
-
-       if (stalled) {
-               context_mark_guilty(rq->gem_context);
-               i915_request_skip(rq, -EIO);
+       for_each_engine(engine, i915, id)
+               intel_engine_reset(engine, stalled_mask & ENGINE_MASK(id));
 
-               /* If this context is now banned, skip all pending requests. */
-               if (i915_gem_context_is_banned(rq->gem_context))
-                       engine_skip_context(rq);
-       } else {
-               /*
-                * Since this is not the hung engine, it may have advanced
-                * since the hang declaration. Double check by refinding
-                * the active request at the time of the reset.
-                */
-               rq = i915_gem_find_active_request(engine);
-               if (rq) {
-                       unsigned long flags;
-
-                       context_mark_innocent(rq->gem_context);
-                       dma_fence_set_error(&rq->fence, -EAGAIN);
-
-                       /* Rewind the engine to replay the incomplete rq */
-                       spin_lock_irqsave(&engine->timeline.lock, flags);
-                       rq = list_prev_entry(rq, link);
-                       if (&rq->link == &engine->timeline.requests)
-                               rq = NULL;
-                       spin_unlock_irqrestore(&engine->timeline.lock, flags);
-               }
-       }
+       i915_gem_restore_fences(i915);
 
-       return rq;
+       return err;
 }
 
-static void reset_engine(struct intel_engine_cs *engine,
-                        struct i915_request *rq,
-                        bool stalled)
+static void reset_finish_engine(struct intel_engine_cs *engine)
 {
-       if (rq)
-               rq = reset_request(engine, rq, stalled);
-
-       /* Setup the CS to resume from the breadcrumb of the hung request */
-       engine->reset.reset(engine, rq);
+       engine->reset.finish(engine);
+       intel_uncore_forcewake_put(engine->i915, FORCEWAKE_ALL);
 }
 
-static void gt_reset(struct drm_i915_private *i915, unsigned int stalled_mask)
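+/* Deferred work used to reload a kernel context on idle engines after a reset */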
+struct i915_gpu_restart {
+       struct work_struct work;
+       struct drm_i915_private *i915;
+};
+
+static void restart_work(struct work_struct *work)
 {
+       struct i915_gpu_restart *arg = container_of(work, typeof(*arg), work);
+       struct drm_i915_private *i915 = arg->i915;
        struct intel_engine_cs *engine;
        enum intel_engine_id id;
+       intel_wakeref_t wakeref;
 
-       lockdep_assert_held(&i915->drm.struct_mutex);
-
-       i915_retire_requests(i915);
+       wakeref = intel_runtime_pm_get(i915);
+       mutex_lock(&i915->drm.struct_mutex);
+       WRITE_ONCE(i915->gpu_error.restart, NULL);
 
        for_each_engine(engine, i915, id) {
-               struct intel_context *ce;
-
-               reset_engine(engine,
-                            engine->hangcheck.active_request,
-                            stalled_mask & ENGINE_MASK(id));
-               ce = fetch_and_zero(&engine->last_retired_context);
-               if (ce)
-                       intel_context_unpin(ce);
+               struct i915_request *rq;
 
                /*
                 * Ostensibly, we always want a context loaded for powersaving,
                 * so if the engine is idle after the reset, send a request
                 * to load our scratch kernel_context.
-                *
-                * More mysteriously, if we leave the engine idle after a reset,
-                * the next userspace batch may hang, with what appears to be
-                * an incoherent read by the CS (presumably stale TLB). An
-                * empty request appears sufficient to paper over the glitch.
                 */
-               if (intel_engine_is_idle(engine)) {
-                       struct i915_request *rq;
+               if (!intel_engine_is_idle(engine))
+                       continue;
 
-                       rq = i915_request_alloc(engine, i915->kernel_context);
-                       if (!IS_ERR(rq))
-                               i915_request_add(rq);
-               }
+               rq = i915_request_alloc(engine, i915->kernel_context);
+               if (!IS_ERR(rq))
+                       i915_request_add(rq);
        }
 
-       i915_gem_restore_fences(i915);
-}
-
-static void reset_finish_engine(struct intel_engine_cs *engine)
-{
-       engine->reset.finish(engine);
+       mutex_unlock(&i915->drm.struct_mutex);
+       intel_runtime_pm_put(i915, wakeref);
 
-       intel_uncore_forcewake_put(engine->i915, FORCEWAKE_ALL);
+       kfree(arg);
 }
 
 static void reset_finish(struct drm_i915_private *i915)
        struct intel_engine_cs *engine;
        enum intel_engine_id id;
 
-       lockdep_assert_held(&i915->drm.struct_mutex);
-
-       for_each_engine(engine, i915, id) {
-               engine->hangcheck.active_request = NULL;
+       for_each_engine(engine, i915, id)
                reset_finish_engine(engine);
+}
+
+static void reset_restart(struct drm_i915_private *i915)
+{
+       struct i915_gpu_restart *arg;
+
+       /*
+        * Following the reset, ensure that we always reload context for
+        * powersaving, and to correct engine->last_retired_context. Since
+        * this requires us to submit a request, queue a worker to do that
+        * task for us to evade any locking here.
+        */
+       if (READ_ONCE(i915->gpu_error.restart))
+               return;
+
+       arg = kmalloc(sizeof(*arg), GFP_KERNEL);
+       if (arg) {
+               arg->i915 = i915;
+               INIT_WORK(&arg->work, restart_work);
+
+               WRITE_ONCE(i915->gpu_error.restart, arg);
+               queue_work(i915->wq, &arg->work);
        }
 }
 
        struct i915_timeline *tl;
        bool ret = false;
 
-       lockdep_assert_held(&i915->drm.struct_mutex);
-
        if (!test_bit(I915_WEDGED, &error->flags))
                return true;
 
         */
        list_for_each_entry(tl, &i915->gt.timelines, link) {
                struct i915_request *rq;
+               long timeout;
 
-               rq = i915_gem_active_peek(&tl->last_request,
-                                         &i915->drm.struct_mutex);
+               rq = i915_gem_active_get_unlocked(&tl->last_request);
                if (!rq)
                        continue;
 
                 * and when the seqno passes the fence, the signaler
                 * then signals the fence waking us up).
                 */
-               if (dma_fence_default_wait(&rq->fence, true,
-                                          MAX_SCHEDULE_TIMEOUT) < 0)
+               timeout = dma_fence_default_wait(&rq->fence, true,
+                                                MAX_SCHEDULE_TIMEOUT);
+               i915_request_put(rq);
+               if (timeout < 0)
                        goto unlock;
        }
-       i915_retire_requests(i915);
-       GEM_BUG_ON(i915->gt.active_requests);
 
        intel_engines_sanitize(i915, false);
 
         * context and do not require stop_machine().
         */
        intel_engines_reset_default_submission(i915);
-       i915_gem_contexts_lost(i915);
 
        GEM_TRACE("end\n");
 
        return ret;
 }
 
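+/* Arguments for __i915_reset__BKL(), which may be run under stop_machine() */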
+struct __i915_reset {
+       struct drm_i915_private *i915;
+       unsigned int stalled_mask;
+};
+
+static int __i915_reset__BKL(void *data)
+{
+       struct __i915_reset *arg = data;
+       int err;
+
+       err = intel_gpu_reset(arg->i915, ALL_ENGINES);
+       if (err)
+               return err;
+
+       return gt_reset(arg->i915, arg->stalled_mask);
+}
+
+#if RESET_UNDER_STOP_MACHINE
+/*
+ * XXX An alternative to using stop_machine would be to park only the
+ * processes that have a GGTT mmap. By remote parking the threads (SIGSTOP)
+ * we should be able to prevent their memory accesses via the lost fence
+ * registers over the course of the reset without the potential recursion
+ * of mutexes between the pagefault handler and reset.
+ *
+ * See igt/gem_mmap_gtt/hang
+ */
+#define __do_reset(fn, arg) stop_machine(fn, arg, NULL)
+#else
+#define __do_reset(fn, arg) fn(arg)
+#endif
+
+static int do_reset(struct drm_i915_private *i915, unsigned int stalled_mask)
+{
+       struct __i915_reset arg = { i915, stalled_mask };
+       int err, i;
+
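+       /* Retry the reset a few times, pausing briefly between attempts */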
+       err = __do_reset(__i915_reset__BKL, &arg);
+       for (i = 0; err && i < RESET_MAX_RETRIES; i++) {
+               msleep(100);
+               err = __do_reset(__i915_reset__BKL, &arg);
+       }
+
+       return err;
+}
+
 /**
  * i915_reset - reset chip after a hang
  * @i915: #drm_i915_private to reset
 {
        struct i915_gpu_error *error = &i915->gpu_error;
        int ret;
-       int i;
 
        GEM_TRACE("flags=%lx\n", error->flags);
 
        might_sleep();
-       lockdep_assert_held(&i915->drm.struct_mutex);
        assert_rpm_wakelock_held(i915);
        GEM_BUG_ON(!test_bit(I915_RESET_BACKOFF, &error->flags));
 
-       if (!test_bit(I915_RESET_HANDOFF, &error->flags))
-               return;
-
        /* Clear any previous failed attempts at recovery. Time to try again. */
        if (!i915_gem_unset_wedged(i915))
-               goto wakeup;
+               return;
 
        if (reason)
                dev_notice(i915->drm.dev, "Resetting chip for %s\n", reason);
        error->reset_count++;
 
-       ret = reset_prepare(i915);
-       if (ret) {
-               dev_err(i915->drm.dev, "GPU recovery failed\n");
-               goto taint;
-       }
+       reset_prepare(i915);
 
        if (!intel_has_gpu_reset(i915)) {
                if (i915_modparams.reset)
                goto error;
        }
 
-       for (i = 0; i < RESET_MAX_RETRIES; i++) {
-               ret = intel_gpu_reset(i915, ALL_ENGINES);
-               if (ret == 0)
-                       break;
-
-               msleep(100);
-       }
-       if (ret) {
+       if (do_reset(i915, stalled_mask)) {
                dev_err(i915->drm.dev, "Failed to reset chip\n");
                goto taint;
        }
 
-       /* Ok, now get things going again... */
-
-       /*
-        * Everything depends on having the GTT running, so we need to start
-        * there.
-        */
-       ret = i915_ggtt_enable_hw(i915);
-       if (ret) {
-               DRM_ERROR("Failed to re-enable GGTT following reset (%d)\n",
-                         ret);
-               goto error;
-       }
-
-       gt_reset(i915, stalled_mask);
        intel_overlay_reset(i915);
 
        /*
 
 finish:
        reset_finish(i915);
-wakeup:
-       clear_bit(I915_RESET_HANDOFF, &error->flags);
-       wake_up_bit(&error->flags, I915_RESET_HANDOFF);
+       if (!i915_terminally_wedged(error))
+               reset_restart(i915);
        return;
 
 taint:
        add_taint(TAINT_WARN, LOCKDEP_STILL_OK);
 error:
        i915_gem_set_wedged(i915);
-       i915_retire_requests(i915);
        goto finish;
 }
 
 int i915_reset_engine(struct intel_engine_cs *engine, const char *msg)
 {
        struct i915_gpu_error *error = &engine->i915->gpu_error;
-       struct i915_request *active_request;
        int ret;
 
        GEM_TRACE("%s flags=%lx\n", engine->name, error->flags);
        GEM_BUG_ON(!test_bit(I915_RESET_ENGINE + engine->id, &error->flags));
 
-       active_request = reset_prepare_engine(engine);
-       if (IS_ERR_OR_NULL(active_request)) {
-               /* Either the previous reset failed, or we pardon the reset. */
-               ret = PTR_ERR(active_request);
-               goto out;
-       }
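+       /* The engine has already advanced past its last submission; pardon it */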
+       if (i915_seqno_passed(intel_engine_get_seqno(engine),
+                             intel_engine_last_submit(engine)))
+               return 0;
+
+       reset_prepare_engine(engine);
 
        if (msg)
                dev_notice(engine->i915->drm.dev,
         * active request and can drop it, adjust head to skip the offending
         * request to resume executing remaining requests in the queue.
         */
-       reset_engine(engine, active_request, true);
+       intel_engine_reset(engine, true);
 
        /*
         * The engine and its registers (and workarounds in case of render)
        i915_wedge_on_timeout(&w, i915, 5 * HZ) {
                intel_prepare_reset(i915);
 
-               error->reason = reason;
-               error->stalled_mask = engine_mask;
-
-               /* Signal that locked waiters should reset the GPU */
-               smp_mb__before_atomic();
-               set_bit(I915_RESET_HANDOFF, &error->flags);
-               wake_up_all(&error->wait_queue);
-
-               /*
-                * Wait for anyone holding the lock to wakeup, without
-                * blocking indefinitely on struct_mutex.
-                */
-               do {
-                       if (mutex_trylock(&i915->drm.struct_mutex)) {
-                               i915_reset(i915, engine_mask, reason);
-                               mutex_unlock(&i915->drm.struct_mutex);
-                       }
-               } while (wait_on_bit_timeout(&error->flags,
-                                            I915_RESET_HANDOFF,
-                                            TASK_UNINTERRUPTIBLE,
-                                            1));
-
-               error->stalled_mask = 0;
-               error->reason = NULL;
+               i915_reset(i915, engine_mask, reason);
 
                intel_finish_reset(i915);
        }
        intel_runtime_pm_put(i915, wakeref);
 }
 
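+/* Flush any pending reset/restart work and wait for the GPU to become idle */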
+bool i915_reset_flush(struct drm_i915_private *i915)
+{
+       int err;
+
+       cancel_delayed_work_sync(&i915->gpu_error.hangcheck_work);
+
+       flush_workqueue(i915->wq);
+       GEM_BUG_ON(READ_ONCE(i915->gpu_error.restart));
+
+       mutex_lock(&i915->drm.struct_mutex);
+       err = i915_gem_wait_for_idle(i915,
+                                    I915_WAIT_LOCKED |
+                                    I915_WAIT_FOR_IDLE_BOOST,
+                                    MAX_SCHEDULE_TIMEOUT);
+       mutex_unlock(&i915->drm.struct_mutex);
+
+       return !err;
+}
+
 static void i915_wedge_me(struct work_struct *work)
 {
        struct i915_wedge_me *w = container_of(work, typeof(*w), work.work);
 
 int i915_reset_engine(struct intel_engine_cs *engine,
                      const char *reason);
 
+void i915_reset_request(struct i915_request *rq, bool guilty);
+bool i915_reset_flush(struct drm_i915_private *i915);
+
 bool intel_has_gpu_reset(struct drm_i915_private *i915);
 bool intel_has_reset_engine(struct drm_i915_private *i915);
 
 
        if (!reset_engines(i915) && !force)
                return;
 
-       for_each_engine(engine, i915, id) {
-               if (engine->reset.reset)
-                       engine->reset.reset(engine, NULL);
-       }
+       for_each_engine(engine, i915, id)
+               intel_engine_reset(engine, false);
 }
 
 /**
 
        spin_unlock_irqrestore(&engine->timeline.lock, flags);
 }
 
-static struct i915_request *
-guc_reset_prepare(struct intel_engine_cs *engine)
+static void guc_reset_prepare(struct intel_engine_cs *engine)
 {
        struct intel_engine_execlists * const execlists = &engine->execlists;
 
         */
        if (engine->i915->guc.preempt_wq)
                flush_workqueue(engine->i915->guc.preempt_wq);
-
-       return i915_gem_find_active_request(engine);
 }
 
 /*
 
 #include "i915_drv.h"
 #include "i915_reset.h"
 
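+/* Per-engine state sampled by the hangcheck worker on each iteration */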
+struct hangcheck {
+       u64 acthd;
+       u32 seqno;
+       enum intel_engine_hangcheck_action action;
+       unsigned long action_timestamp;
+       int deadlock;
+       struct intel_instdone instdone;
+       bool wedged:1;
+       bool stalled:1;
+};
+
 static bool instdone_unchanged(u32 current_instdone, u32 *old_instdone)
 {
        u32 tmp = current_instdone | *old_instdone;
 }
 
 static void hangcheck_load_sample(struct intel_engine_cs *engine,
-                                 struct intel_engine_hangcheck *hc)
+                                 struct hangcheck *hc)
 {
        hc->acthd = intel_engine_get_active_head(engine);
        hc->seqno = intel_engine_get_seqno(engine);
 }
 
 static void hangcheck_store_sample(struct intel_engine_cs *engine,
-                                  const struct intel_engine_hangcheck *hc)
+                                  const struct hangcheck *hc)
 {
        engine->hangcheck.acthd = hc->acthd;
        engine->hangcheck.seqno = hc->seqno;
-       engine->hangcheck.action = hc->action;
-       engine->hangcheck.stalled = hc->stalled;
-       engine->hangcheck.wedged = hc->wedged;
 }
 
 static enum intel_engine_hangcheck_action
 hangcheck_get_action(struct intel_engine_cs *engine,
-                    const struct intel_engine_hangcheck *hc)
+                    const struct hangcheck *hc)
 {
        if (engine->hangcheck.seqno != hc->seqno)
                return ENGINE_ACTIVE_SEQNO;
 }
 
 static void hangcheck_accumulate_sample(struct intel_engine_cs *engine,
-                                       struct intel_engine_hangcheck *hc)
+                                       struct hangcheck *hc)
 {
        unsigned long timeout = I915_ENGINE_DEAD_TIMEOUT;
 
        intel_uncore_arm_unclaimed_mmio_detection(dev_priv);
 
        for_each_engine(engine, dev_priv, id) {
-               struct intel_engine_hangcheck hc;
+               struct hangcheck hc;
 
                hangcheck_load_sample(engine, &hc);
                hangcheck_accumulate_sample(engine, &hc);
                hangcheck_store_sample(engine, &hc);
 
-               if (engine->hangcheck.stalled) {
+               if (hc.stalled) {
                        hung |= intel_engine_flag(engine);
                        if (hc.action != ENGINE_DEAD)
                                stuck |= intel_engine_flag(engine);
                }
 
-               if (engine->hangcheck.wedged)
+               if (hc.wedged)
                        wedged |= intel_engine_flag(engine);
        }
 
 
 #include <drm/i915_drm.h>
 #include "i915_drv.h"
 #include "i915_gem_render_state.h"
+#include "i915_reset.h"
 #include "i915_vgpu.h"
 #include "intel_lrc_reg.h"
 #include "intel_mocs.h"
        assert_ring_tail_valid(rq->ring, rq->tail);
 }
 
-static void __unwind_incomplete_requests(struct intel_engine_cs *engine)
+static struct i915_request *
+__unwind_incomplete_requests(struct intel_engine_cs *engine)
 {
        struct i915_request *rq, *rn, *active = NULL;
        struct list_head *uninitialized_var(pl);
                list_move_tail(&active->sched.link,
                               i915_sched_lookup_priolist(engine, prio));
        }
+
+       return active;
 }
 
 void
        return 0;
 }
 
-static struct i915_request *
-execlists_reset_prepare(struct intel_engine_cs *engine)
+static void execlists_reset_prepare(struct intel_engine_cs *engine)
 {
        struct intel_engine_execlists * const execlists = &engine->execlists;
-       struct i915_request *request, *active;
        unsigned long flags;
 
        GEM_TRACE("%s: depth<-%d\n", engine->name,
         * prevents the race.
         */
        __tasklet_disable_sync_once(&execlists->tasklet);
+       GEM_BUG_ON(!reset_in_progress(execlists));
 
+       /* And flush any current direct submission. */
        spin_lock_irqsave(&engine->timeline.lock, flags);
-
-       /*
-        * We want to flush the pending context switches, having disabled
-        * the tasklet above, we can assume exclusive access to the execlists.
-        * For this allows us to catch up with an inflight preemption event,
-        * and avoid blaming an innocent request if the stall was due to the
-        * preemption itself.
-        */
-       process_csb(engine);
-
-       /*
-        * The last active request can then be no later than the last request
-        * now in ELSP[0]. So search backwards from there, so that if the GPU
-        * has advanced beyond the last CSB update, it will be pardoned.
-        */
-       active = NULL;
-       request = port_request(execlists->port);
-       if (request) {
-               /*
-                * Prevent the breadcrumb from advancing before we decide
-                * which request is currently active.
-                */
-               intel_engine_stop_cs(engine);
-
-               list_for_each_entry_from_reverse(request,
-                                                &engine->timeline.requests,
-                                                link) {
-                       if (__i915_request_completed(request,
-                                                    request->global_seqno))
-                               break;
-
-                       active = request;
-               }
-       }
-
+       process_csb(engine); /* drain preemption events */
        spin_unlock_irqrestore(&engine->timeline.lock, flags);
-
-       return active;
 }
 
-static void execlists_reset(struct intel_engine_cs *engine,
-                           struct i915_request *request)
+static void execlists_reset(struct intel_engine_cs *engine, bool stalled)
 {
        struct intel_engine_execlists * const execlists = &engine->execlists;
+       struct i915_request *rq;
        unsigned long flags;
        u32 *regs;
 
-       GEM_TRACE("%s request global=%d, current=%d\n",
-                 engine->name, request ? request->global_seqno : 0,
-                 intel_engine_get_seqno(engine));
-
        spin_lock_irqsave(&engine->timeline.lock, flags);
 
        /*
        execlists_cancel_port_requests(execlists);
 
        /* Push back any incomplete requests for replay after the reset. */
-       __unwind_incomplete_requests(engine);
+       rq = __unwind_incomplete_requests(engine);
 
        /* Following the reset, we need to reload the CSB read/write pointers */
        reset_csb_pointers(&engine->execlists);
 
-       spin_unlock_irqrestore(&engine->timeline.lock, flags);
+       GEM_TRACE("%s seqno=%d, current=%d, stalled? %s\n",
+                 engine->name,
+                 rq ? rq->global_seqno : 0,
+                 intel_engine_get_seqno(engine),
+                 yesno(stalled));
+       if (!rq)
+               goto out_unlock;
 
        /*
         * If the request was innocent, we leave the request in the ELSP
         * and have to at least restore the RING register in the context
         * image back to the expected values to skip over the guilty request.
         */
-       if (!request || request->fence.error != -EIO)
-               return;
+       i915_reset_request(rq, stalled);
+       if (!stalled)
+               goto out_unlock;
 
        /*
         * We want a simple context + ring to execute the breadcrumb update.
         * future request will be after userspace has had the opportunity
         * to recreate its own state.
         */
-       regs = request->hw_context->lrc_reg_state;
+       regs = rq->hw_context->lrc_reg_state;
        if (engine->pinned_default_state) {
                memcpy(regs, /* skip restoring the vanilla PPHWSP */
                       engine->pinned_default_state + LRC_STATE_PN * PAGE_SIZE,
        }
 
        /* Move the RING_HEAD onto the breadcrumb, past the hanging batch */
-       request->ring->head = intel_ring_wrap(request->ring, request->postfix);
+       rq->ring->head = intel_ring_wrap(rq->ring, rq->postfix);
+       intel_ring_update_space(rq->ring);
 
-       execlists_init_reg_state(regs, request->gem_context, engine,
-                                request->ring);
+       execlists_init_reg_state(regs, rq->gem_context, engine, rq->ring);
+       __execlists_update_reg_state(engine, rq->hw_context);
 
-       __execlists_update_reg_state(engine, request->hw_context);
-
-       intel_ring_update_space(request->ring);
-
-       /* Reset WaIdleLiteRestore:bdw,skl as well */
-       unwind_wa_tail(request);
+out_unlock:
+       spin_unlock_irqrestore(&engine->timeline.lock, flags);
 }
 
 static void execlists_reset_finish(struct intel_engine_cs *engine)
         * to sleep before we restart and reload a context.
         *
         */
+       GEM_BUG_ON(!reset_in_progress(execlists));
        if (!RB_EMPTY_ROOT(&execlists->queue.rb_root))
                execlists->tasklet.func(execlists->tasklet.data);
 
 
        if (!overlay)
                return;
 
-       intel_overlay_release_old_vid(overlay);
-
        overlay->old_xscale = 0;
        overlay->old_yscale = 0;
        overlay->crtc = NULL;
 
 
 #include "i915_drv.h"
 #include "i915_gem_render_state.h"
+#include "i915_reset.h"
 #include "i915_trace.h"
 #include "intel_drv.h"
 #include "intel_workarounds.h"
        return ret;
 }
 
-static struct i915_request *reset_prepare(struct intel_engine_cs *engine)
+static void reset_prepare(struct intel_engine_cs *engine)
 {
        intel_engine_stop_cs(engine);
-       return i915_gem_find_active_request(engine);
 }
 
-static void skip_request(struct i915_request *rq)
+static void reset_ring(struct intel_engine_cs *engine, bool stalled)
 {
-       void *vaddr = rq->ring->vaddr;
+       struct i915_timeline *tl = &engine->timeline;
+       struct i915_request *pos, *rq;
+       unsigned long flags;
        u32 head;
 
-       head = rq->infix;
-       if (rq->postfix < head) {
-               memset32(vaddr + head, MI_NOOP,
-                        (rq->ring->size - head) / sizeof(u32));
-               head = 0;
+       rq = NULL;
+       spin_lock_irqsave(&tl->lock, flags);
+       list_for_each_entry(pos, &tl->requests, link) {
+               if (!__i915_request_completed(pos, pos->global_seqno)) {
+                       rq = pos;
+                       break;
+               }
        }
-       memset32(vaddr + head, MI_NOOP, (rq->postfix - head) / sizeof(u32));
-}
-
-static void reset_ring(struct intel_engine_cs *engine, struct i915_request *rq)
-{
-       GEM_TRACE("%s request global=%d, current=%d\n",
-                 engine->name, rq ? rq->global_seqno : 0,
-                 intel_engine_get_seqno(engine));
 
+       GEM_TRACE("%s seqno=%d, current=%d, stalled? %s\n",
+                 engine->name,
+                 rq ? rq->global_seqno : 0,
+                 intel_engine_get_seqno(engine),
+                 yesno(stalled));
        /*
-        * Try to restore the logical GPU state to match the continuation
-        * of the request queue. If we skip the context/PD restore, then
-        * the next request may try to execute assuming that its context
-        * is valid and loaded on the GPU and so may try to access invalid
-        * memory, prompting repeated GPU hangs.
+        * The guilty request will get skipped on a hung engine.
         *
-        * If the request was guilty, we still restore the logical state
-        * in case the next request requires it (e.g. the aliasing ppgtt),
-        * but skip over the hung batch.
+        * Users of client default contexts do not rely on logical
+        * state preserved between batches so it is safe to execute
+        * queued requests following the hang. Non default contexts
+        * rely on preserved state, so skipping a batch loses the
+        * evolution of the state and it needs to be considered corrupted.
+        * Executing more queued batches on top of corrupted state is
+        * risky. But we take the risk by trying to advance through
+        * the queued requests in order to make the client behaviour
+        * more predictable around resets, by not throwing away a random
+        * number of batches it has prepared for execution. Sophisticated
+        * clients can use gem_reset_stats_ioctl and dma fence status
+        * (exported via sync_file info ioctl on explicit fences) to observe
+        * when it loses the context state and should rebuild accordingly.
         *
-        * If the request was innocent, we try to replay the request with
-        * the restored context.
+        * The context ban, and ultimately the client ban, mechanism are safety
+        * valves if client submission ends up resulting in nothing more than
+        * subsequent hangs.
         */
+
        if (rq) {
-               /* If the rq hung, jump to its breadcrumb and skip the batch */
-               rq->ring->head = intel_ring_wrap(rq->ring, rq->head);
-               if (rq->fence.error == -EIO)
-                       skip_request(rq);
+               /*
+                * Try to restore the logical GPU state to match the
+                * continuation of the request queue. If we skip the
+                * context/PD restore, then the next request may try to execute
+                * assuming that its context is valid and loaded on the GPU and
+                * so may try to access invalid memory, prompting repeated GPU
+                * hangs.
+                *
+                * If the request was guilty, we still restore the logical
+                * state in case the next request requires it (e.g. the
+                * aliasing ppgtt), but skip over the hung batch.
+                *
+                * If the request was innocent, we try to replay the request
+                * with the restored context.
+                */
+               i915_reset_request(rq, stalled);
+
+               GEM_BUG_ON(rq->ring != engine->buffer);
+               head = rq->head;
+       } else {
+               head = engine->buffer->tail;
        }
+       engine->buffer->head = intel_ring_wrap(engine->buffer, head);
+
+       spin_unlock_irqrestore(&tl->lock, flags);
 }
 
 static void reset_finish(struct intel_engine_cs *engine)
 
 struct intel_engine_hangcheck {
        u64 acthd;
        u32 seqno;
-       enum intel_engine_hangcheck_action action;
        unsigned long action_timestamp;
-       int deadlock;
        struct intel_instdone instdone;
-       struct i915_request *active_request;
-       bool stalled:1;
-       bool wedged:1;
 };
 
 struct intel_ring {
        int             (*init_hw)(struct intel_engine_cs *engine);
 
        struct {
-               struct i915_request *(*prepare)(struct intel_engine_cs *engine);
-               void (*reset)(struct intel_engine_cs *engine,
-                             struct i915_request *rq);
+               void (*prepare)(struct intel_engine_cs *engine);
+               void (*reset)(struct intel_engine_cs *engine, bool stalled);
                void (*finish)(struct intel_engine_cs *engine);
        } reset;
 
        return cs;
 }
 
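+/* Ask the backend to reset the engine; @stalled indicates it holds the hung request */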
+static inline void intel_engine_reset(struct intel_engine_cs *engine,
+                                     bool stalled)
+{
+       if (engine->reset.reset)
+               engine->reset.reset(engine, stalled);
+}
+
 void intel_engines_sanitize(struct drm_i915_private *i915, bool force);
 
 bool intel_engine_is_idle(struct intel_engine_cs *engine);
 
        /* Check that we can issue a global GPU reset */
 
        igt_global_reset_lock(i915);
-       set_bit(I915_RESET_HANDOFF, &i915->gpu_error.flags);
 
-       mutex_lock(&i915->drm.struct_mutex);
        reset_count = i915_reset_count(&i915->gpu_error);
 
        i915_reset(i915, ALL_ENGINES, NULL);
                pr_err("No GPU reset recorded!\n");
                err = -EINVAL;
        }
-       mutex_unlock(&i915->drm.struct_mutex);
 
-       GEM_BUG_ON(test_bit(I915_RESET_HANDOFF, &i915->gpu_error.flags));
        igt_global_reset_unlock(i915);
 
        if (i915_terminally_wedged(&i915->gpu_error))
        i915_gem_set_wedged(i915);
        GEM_BUG_ON(!i915_terminally_wedged(&i915->gpu_error));
 
-       set_bit(I915_RESET_HANDOFF, &i915->gpu_error.flags);
        i915_reset(i915, ALL_ENGINES, NULL);
-       GEM_BUG_ON(test_bit(I915_RESET_HANDOFF, &i915->gpu_error.flags));
 
        intel_runtime_pm_put(i915, wakeref);
        mutex_unlock(&i915->drm.struct_mutex);
                                break;
                        }
 
-                       if (!wait_for_idle(engine)) {
+                       if (!i915_reset_flush(i915)) {
                                struct drm_printer p =
                                        drm_info_printer(i915->drm.dev);
 
        return 0;
 }
 
-static u32 fake_hangcheck(struct i915_request *rq, u32 mask)
+static u32 fake_hangcheck(struct drm_i915_private *i915, u32 mask)
 {
-       struct i915_gpu_error *error = &rq->i915->gpu_error;
-       u32 reset_count = i915_reset_count(error);
-
-       error->stalled_mask = mask;
-
-       /* set_bit() must be after we have setup the backchannel (mask) */
-       smp_mb__before_atomic();
-       set_bit(I915_RESET_HANDOFF, &error->flags);
+       u32 count = i915_reset_count(&i915->gpu_error);
 
-       wake_up_all(&error->wait_queue);
+       i915_reset(i915, mask, NULL);
 
-       return reset_count;
+       return count;
 }
 
 static int igt_reset_wait(void *arg)
                goto out_rq;
        }
 
-       reset_count = fake_hangcheck(rq, ALL_ENGINES);
+       reset_count = fake_hangcheck(i915, ALL_ENGINES);
 
        timeout = i915_request_wait(rq, I915_WAIT_LOCKED, 10);
        if (timeout < 0) {
                goto out_rq;
        }
 
-       GEM_BUG_ON(test_bit(I915_RESET_HANDOFF, &i915->gpu_error.flags));
        if (i915_reset_count(&i915->gpu_error) == reset_count) {
                pr_err("No GPU reset recorded!\n");
                err = -EINVAL;
        }
 
 out_reset:
-       fake_hangcheck(rq, intel_engine_flag(rq->engine));
+       fake_hangcheck(rq->i915, intel_engine_flag(rq->engine));
 
        if (tsk) {
                struct igt_wedge_me w;
                                goto fini;
                        }
 
-                       reset_count = fake_hangcheck(prev, ENGINE_MASK(id));
-
-                       i915_reset(i915, ENGINE_MASK(id), NULL);
-
-                       GEM_BUG_ON(test_bit(I915_RESET_HANDOFF,
-                                           &i915->gpu_error.flags));
+                       reset_count = fake_hangcheck(i915, ENGINE_MASK(id));
 
                        if (prev->fence.error != -EIO) {
                                pr_err("GPU reset not recorded on hanging request [fence.error=%d]!\n",
                pr_err("%s(%s): Failed to start request %llx, at %x\n",
                       __func__, engine->name,
                       rq->fence.seqno, hws_seqno(&h, rq));
+               i915_gem_set_wedged(i915);
                err = -EIO;
        }
 
 static void force_reset(struct drm_i915_private *i915)
 {
        i915_gem_set_wedged(i915);
-       set_bit(I915_RESET_HANDOFF, &i915->gpu_error.flags);
        i915_reset(i915, 0, NULL);
 }
 
        if (i915_terminally_wedged(&i915->gpu_error))
                goto unlock;
 
+       if (intel_has_gpu_reset(i915)) {
+               const typeof(*phases) *p;
+
+               for (p = phases; p->name; p++) {
+                       GEM_TRACE("intel_gpu_reset under %s\n", p->name);
+
+                       p->critical_section_begin();
+                       err = intel_gpu_reset(i915, ALL_ENGINES);
+                       p->critical_section_end();
+
+                       if (err) {
+                               pr_err("intel_gpu_reset failed under %s\n",
+                                      p->name);
+                               goto out;
+                       }
+               }
+
+               force_reset(i915);
+       }
+
        if (intel_has_reset_engine(i915)) {
                struct intel_engine_cs *engine;
                enum intel_engine_id id;
 
 
 static int do_device_reset(struct intel_engine_cs *engine)
 {
-       set_bit(I915_RESET_HANDOFF, &engine->i915->gpu_error.flags);
        i915_reset(engine->i915, ENGINE_MASK(engine->id), "live_workarounds");
        return 0;
 }
 live_gpu_reset_gt_engine_workarounds(void *arg)
 {
        struct drm_i915_private *i915 = arg;
-       struct i915_gpu_error *error = &i915->gpu_error;
        intel_wakeref_t wakeref;
        struct wa_lists lists;
        bool ok;
        if (!ok)
                goto out;
 
-       set_bit(I915_RESET_HANDOFF, &error->flags);
        i915_reset(i915, ALL_ENGINES, "live_workarounds");
 
        ok = verify_gt_engine_wa(i915, &lists, "after reset");
 
        i915_gem_contexts_lost(i915);
        mutex_unlock(&i915->drm.struct_mutex);
 
-       cancel_delayed_work_sync(&i915->gt.retire_work);
-       cancel_delayed_work_sync(&i915->gt.idle_work);
+       drain_delayed_work(&i915->gt.retire_work);
+       drain_delayed_work(&i915->gt.idle_work);
        i915_gem_drain_workqueue(i915);
 
        mutex_lock(&i915->drm.struct_mutex);