#include "xe_macros.h"
 #include "xe_map.h"
 #include "xe_mocs.h"
+#include "xe_module.h"
 #include "xe_ring_ops_types.h"
 #include "xe_sched_job.h"
 #include "xe_trace.h"
 #define ENGINE_STATE_SUSPENDED         (1 << 5)
 #define EXEC_QUEUE_STATE_RESET         (1 << 6)
 #define ENGINE_STATE_KILLED            (1 << 7)
+#define EXEC_QUEUE_STATE_WEDGED                (1 << 8)
 
 static void set_exec_queue_killed(struct xe_exec_queue *q)
 {
        atomic_or(ENGINE_STATE_KILLED, &q->guc->state);
 }
 
-static bool exec_queue_killed_or_banned(struct xe_exec_queue *q)
+static bool exec_queue_wedged(struct xe_exec_queue *q)
 {
-       return exec_queue_killed(q) || exec_queue_banned(q);
+       return atomic_read(&q->guc->state) & EXEC_QUEUE_STATE_WEDGED;
+}
+
+static void set_exec_queue_wedged(struct xe_exec_queue *q)
+{
+       atomic_or(EXEC_QUEUE_STATE_WEDGED, &q->guc->state);
+}
+
+static bool exec_queue_killed_or_banned_or_wedged(struct xe_exec_queue *q)
+{
+       return exec_queue_banned(q) || (atomic_read(&q->guc->state) &
+               (EXEC_QUEUE_STATE_WEDGED | ENGINE_STATE_KILLED));
 }
 
        free_submit_wq(guc);
 }
 
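+/*
+ * drmm cleanup action, registered from guc_submit_wedged(): drops the extra
+ * exec queue references taken when the device was declared wedged.
+ */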
+static void guc_submit_wedged_fini(struct drm_device *drm, void *arg)
+{
+       struct xe_guc *guc = arg;
+       struct xe_exec_queue *q;
+       unsigned long index;
+
+       xa_for_each(&guc->submission_state.exec_queue_lookup, index, q)
+               if (exec_queue_wedged(q))
+                       xe_exec_queue_put(q);
+}
+
 static const struct xe_exec_queue_ops guc_exec_queue_ops;
 
 static void primelockdep(struct xe_guc *guc)
 
        trace_xe_sched_job_run(job);
 
-       if (!exec_queue_killed_or_banned(q) && !xe_sched_job_is_error(job)) {
+       if (!exec_queue_killed_or_banned_or_wedged(q) && !xe_sched_job_is_error(job)) {
                if (!exec_queue_registered(q))
                        register_engine(q);
                if (!lr)        /* LR jobs are emitted in the exec IOCTL */
                xe_sched_tdr_queue_imm(&q->guc->sched);
 }
 
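+/*
+ * Declare the device wedged: stop submission and GuC CT communication, and
+ * pin every exec queue with an extra reference so its state is preserved
+ * until driver unload.
+ */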
+static void guc_submit_wedged(struct xe_guc *guc)
+{
+       struct xe_exec_queue *q;
+       unsigned long index;
+       int err;
+
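+	/* Mark the device wedged, then stop submission and CT communication */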
+       xe_device_declare_wedged(guc_to_xe(guc));
+       xe_guc_submit_reset_prepare(guc);
+       xe_guc_ct_stop(&guc->ct);
+
+       err = drmm_add_action_or_reset(&guc_to_xe(guc)->drm,
+                                      guc_submit_wedged_fini, guc);
+       if (err)
+               return;
+
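+	/*
+	 * Pin each queue with an extra reference, dropped again in
+	 * guc_submit_wedged_fini(), and mark it wedged.
+	 */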
+       mutex_lock(&guc->submission_state.lock);
+       xa_for_each(&guc->submission_state.exec_queue_lookup, index, q)
+               if (xe_exec_queue_get_unless_zero(q))
+                       set_exec_queue_wedged(q);
+       mutex_unlock(&guc->submission_state.lock);
+}
+
 static void xe_guc_exec_queue_lr_cleanup(struct work_struct *w)
 {
        struct xe_guc_exec_queue *ge =
        struct xe_guc *guc = exec_queue_to_guc(q);
        struct xe_device *xe = guc_to_xe(guc);
        struct xe_gpu_scheduler *sched = &ge->sched;
+       bool wedged = xe_device_wedged(xe);
 
        xe_assert(xe, xe_exec_queue_is_lr(q));
        trace_xe_exec_queue_lr_cleanup(q);
 
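+	/* wedged_mode 2 wedges the device on any hang */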
+       if (!wedged && xe_modparam.wedged_mode == 2) {
+               guc_submit_wedged(exec_queue_to_guc(q));
+               wedged = true;
+       }
+
        /* Kill the run_job / process_msg entry points */
        xe_sched_submission_stop(sched);
 
         * xe_guc_deregister_done_handler() which treats it as an unexpected
         * state.
         */
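+	/* When wedged, CT is stopped: skip the scheduling-disable handshake */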
-       if (exec_queue_registered(q) && !exec_queue_destroyed(q)) {
+       if (!wedged && exec_queue_registered(q) && !exec_queue_destroyed(q)) {
                struct xe_guc *guc = exec_queue_to_guc(q);
                int ret;
 
        struct xe_device *xe = guc_to_xe(exec_queue_to_guc(q));
        int err = -ETIME;
        int i = 0;
+       bool wedged = xe_device_wedged(xe);
 
        /*
         * TDR has fired before free job worker. Common if exec queue
 
        trace_xe_sched_job_timedout(job);
 
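+	/* In wedged_mode 2, the first job timeout wedges the entire device */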
+       if (!wedged && xe_modparam.wedged_mode == 2) {
+               guc_submit_wedged(exec_queue_to_guc(q));
+               wedged = true;
+       }
+
        /* Kill the run_job entry point */
        xe_sched_submission_stop(sched);
 
	 * Kernel jobs should never fail, nor should VM jobs; if they do,
	 * something has gone wrong and the GT needs a reset.
         */
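+	/* Once wedged, no GT reset is attempted; state is left for inspection */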
-       if (q->flags & EXEC_QUEUE_FLAG_KERNEL ||
-           (q->flags & EXEC_QUEUE_FLAG_VM && !exec_queue_killed(q))) {
+       if (!wedged && (q->flags & EXEC_QUEUE_FLAG_KERNEL ||
+                       (q->flags & EXEC_QUEUE_FLAG_VM && !exec_queue_killed(q)))) {
                if (!xe_sched_invalidate_job(job, 2)) {
                        xe_sched_add_pending_job(sched, job);
                        xe_sched_submission_start(sched);
        }
 
        /* Engine state now stable, disable scheduling if needed */
-       if (exec_queue_registered(q)) {
+       if (!wedged && exec_queue_registered(q)) {
                struct xe_guc *guc = exec_queue_to_guc(q);
                int ret;
 
         */
        xe_sched_add_pending_job(sched, job);
        xe_sched_submission_start(sched);
+
        xe_guc_exec_queue_trigger_cleanup(q);
 
        /* Mark all outstanding jobs as bad, thus completing them */
        INIT_WORK(&q->guc->fini_async, __guc_exec_queue_fini_async);
 
        /* We must block on kernel engines so slabs are empty on driver unload */
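+	/* Wedged queues drop their final reference at unload, block for them too */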
-       if (q->flags & EXEC_QUEUE_FLAG_PERMANENT)
+       if (q->flags & EXEC_QUEUE_FLAG_PERMANENT || exec_queue_wedged(q))
                __guc_exec_queue_fini_async(&q->guc->fini_async);
        else
                queue_work(system_wq, &q->guc->fini_async);
 
 static bool guc_exec_queue_allowed_to_change_state(struct xe_exec_queue *q)
 {
-       return !exec_queue_killed_or_banned(q) && exec_queue_registered(q);
+       return !exec_queue_killed_or_banned_or_wedged(q) && exec_queue_registered(q);
 }
 
 static void __guc_exec_queue_process_msg_set_sched_props(struct xe_sched_msg *msg)
 {
        struct xe_sched_msg *msg = q->guc->static_msgs + STATIC_MSG_CLEANUP;
 
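+	/* A wedged GuC cannot process CLEANUP messages; fini the queue directly */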
-       if (!(q->flags & EXEC_QUEUE_FLAG_PERMANENT))
+       if (!(q->flags & EXEC_QUEUE_FLAG_PERMANENT) && !exec_queue_wedged(q))
                guc_exec_queue_add_msg(q, msg, CLEANUP);
        else
                __guc_exec_queue_fini(exec_queue_to_guc(q), q);
 {
        struct xe_sched_msg *msg;
 
-       if (q->sched_props.priority == priority || exec_queue_killed_or_banned(q))
+       if (q->sched_props.priority == priority ||
+           exec_queue_killed_or_banned_or_wedged(q))
                return 0;
 
        msg = kmalloc(sizeof(*msg), GFP_KERNEL);
        struct xe_sched_msg *msg;
 
        if (q->sched_props.timeslice_us == timeslice_us ||
-           exec_queue_killed_or_banned(q))
+           exec_queue_killed_or_banned_or_wedged(q))
                return 0;
 
        msg = kmalloc(sizeof(*msg), GFP_KERNEL);
        struct xe_sched_msg *msg;
 
        if (q->sched_props.preempt_timeout_us == preempt_timeout_us ||
-           exec_queue_killed_or_banned(q))
+           exec_queue_killed_or_banned_or_wedged(q))
                return 0;
 
        msg = kmalloc(sizeof(*msg), GFP_KERNEL);
 {
        struct xe_sched_msg *msg = q->guc->static_msgs + STATIC_MSG_SUSPEND;
 
-       if (exec_queue_killed_or_banned(q) || q->guc->suspend_pending)
+       if (exec_queue_killed_or_banned_or_wedged(q) || q->guc->suspend_pending)
                return -EINVAL;
 
        q->guc->suspend_pending = true;
 {
        struct xe_gpu_scheduler *sched = &q->guc->sched;
 
-       if (!exec_queue_killed_or_banned(q)) {
+       if (!exec_queue_killed_or_banned_or_wedged(q)) {
                int i;
 
                trace_xe_exec_queue_resubmit(q);