www.infradead.org Git - users/willy/xarray.git/commitdiff
drm/xe: Fix early wedge on GuC load failure
author: Daniele Ceraolo Spurio <daniele.ceraolospurio@intel.com>
Wed, 11 Jun 2025 21:44:54 +0000 (14:44 -0700)
committer: Thomas Hellström <thomas.hellstrom@linux.intel.com>
Thu, 19 Jun 2025 15:24:30 +0000 (17:24 +0200)
When the GuC fails to load we declare the device wedged. However, the
very first GuC load attempt on GT0 (from xe_gt_init_hwconfig) is done
before the GT1 GuC objects are initialized, so things go bad when the
wedge code attempts to clean up GT1. To fix this, check the initialization
status in the functions called during wedge.

Fixes: 7dbe8af13c18 ("drm/xe: Wedge the entire device")
Signed-off-by: Daniele Ceraolo Spurio <daniele.ceraolospurio@intel.com>
Cc: Rodrigo Vivi <rodrigo.vivi@intel.com>
Cc: Matthew Brost <matthew.brost@intel.com>
Cc: Jonathan Cavitt <jonathan.cavitt@intel.com>
Cc: Lucas De Marchi <lucas.demarchi@intel.com>
Cc: Zhanjun Dong <zhanjun.dong@intel.com>
Cc: stable@vger.kernel.org # v6.12+: 1e1981b16bb1: drm/xe: Fix taking invalid lock on wedge
Cc: stable@vger.kernel.org # v6.12+
Reviewed-by: Jonathan Cavitt <jonathan.cavitt@intel.com>
Reviewed-by: Lucas De Marchi <lucas.demarchi@intel.com>
Link: https://lore.kernel.org/r/20250611214453.1159846-2-daniele.ceraolospurio@intel.com
Signed-off-by: Lucas De Marchi <lucas.demarchi@intel.com>
(cherry picked from commit 0b93b7dcd9eb888a6ac7546560877705d4ad61bf)
Signed-off-by: Thomas Hellström <thomas.hellstrom@linux.intel.com>
drivers/gpu/drm/xe/xe_gt_tlb_invalidation.c
drivers/gpu/drm/xe/xe_guc_ct.c
drivers/gpu/drm/xe/xe_guc_ct.h
drivers/gpu/drm/xe/xe_guc_submit.c

index 084cbdeba8eaa581e41ffab6a6a263fef7f2fa2d..e1362e608146b66a0354071c7bb666fe78b6be77 100644 (file)
@@ -137,6 +137,14 @@ void xe_gt_tlb_invalidation_reset(struct xe_gt *gt)
        struct xe_gt_tlb_invalidation_fence *fence, *next;
        int pending_seqno;
 
+       /*
+        * we can get here before the CTs are even initialized if we're wedging
+        * very early, in which case there are not going to be any pending
+        * fences so we can bail immediately.
+        */
+       if (!xe_guc_ct_initialized(&gt->uc.guc.ct))
+               return;
+
        /*
         * CT channel is already disabled at this point. No new TLB requests can
         * appear.
index 2447de0ebedf45759351fd6ce03a363a9459fe1a..d0ac48d8f4f799fbbd510d792b84167f7dff914b 100644 (file)
@@ -514,6 +514,9 @@ void xe_guc_ct_disable(struct xe_guc_ct *ct)
  */
 void xe_guc_ct_stop(struct xe_guc_ct *ct)
 {
+       if (!xe_guc_ct_initialized(ct))
+               return;
+
        xe_guc_ct_set_state(ct, XE_GUC_CT_STATE_STOPPED);
        stop_g2h_handler(ct);
 }
@@ -760,7 +763,7 @@ static int __guc_ct_send_locked(struct xe_guc_ct *ct, const u32 *action,
        u16 seqno;
        int ret;
 
-       xe_gt_assert(gt, ct->state != XE_GUC_CT_STATE_NOT_INITIALIZED);
+       xe_gt_assert(gt, xe_guc_ct_initialized(ct));
        xe_gt_assert(gt, !g2h_len || !g2h_fence);
        xe_gt_assert(gt, !num_g2h || !g2h_fence);
        xe_gt_assert(gt, !g2h_len || num_g2h);
@@ -1344,7 +1347,7 @@ static int g2h_read(struct xe_guc_ct *ct, u32 *msg, bool fast_path)
        u32 action;
        u32 *hxg;
 
-       xe_gt_assert(gt, ct->state != XE_GUC_CT_STATE_NOT_INITIALIZED);
+       xe_gt_assert(gt, xe_guc_ct_initialized(ct));
        lockdep_assert_held(&ct->fast_lock);
 
        if (ct->state == XE_GUC_CT_STATE_DISABLED)
index 82c4ae458dda396904fcefcabd3796eb6552e10b..582aac10646945c384fc2eeced151c4de2ddb319 100644 (file)
@@ -22,6 +22,11 @@ void xe_guc_ct_snapshot_print(struct xe_guc_ct_snapshot *snapshot, struct drm_pr
 void xe_guc_ct_snapshot_free(struct xe_guc_ct_snapshot *snapshot);
 void xe_guc_ct_print(struct xe_guc_ct *ct, struct drm_printer *p, bool want_ctb);
 
+static inline bool xe_guc_ct_initialized(struct xe_guc_ct *ct)
+{
+       return ct->state != XE_GUC_CT_STATE_NOT_INITIALIZED;
+}
+
 static inline bool xe_guc_ct_enabled(struct xe_guc_ct *ct)
 {
        return ct->state == XE_GUC_CT_STATE_ENABLED;
index 6d84a52b660acfa1eeb33ecc9b7021dac5fc280f..9567f6700cf21e46a9feba21bfc98825e496266e 100644 (file)
@@ -1762,6 +1762,9 @@ int xe_guc_submit_reset_prepare(struct xe_guc *guc)
 {
        int ret;
 
+       if (!guc->submission_state.initialized)
+               return 0;
+
        /*
         * Using an atomic here rather than submission_state.lock as this
         * function can be called while holding the CT lock (engine reset