drm/xe/guc: Dead CT helper

author John Harrison <John.C.Harrison@Intel.com>

Thu, 3 Oct 2024 00:46:08 +0000 (17:46 -0700)

committer John Harrison <John.C.Harrison@Intel.com>

Tue, 8 Oct 2024 01:34:59 +0000 (18:34 -0700)
author John Harrison <John.C.Harrison@Intel.com>
Thu, 3 Oct 2024 00:46:08 +0000 (17:46 -0700)
committer John Harrison <John.C.Harrison@Intel.com>
Tue, 8 Oct 2024 01:34:59 +0000 (18:34 -0700)
diff --git a/drivers/gpu/drm/xe/abi/guc_communication_ctb_abi.h b/drivers/gpu/drm/xe/abi/guc_communication_ctb_abi.h

index 8f86a16dc5777cfaccf18806af7cc8775d2120ca..f58198cf2cf639571cd796a48a4d6756378e33a7 100644 (file)
--- a/drivers/gpu/drm/xe/abi/guc_communication_ctb_abi.h
+++ b/drivers/gpu/drm/xe/abi/guc_communication_ctb_abi.h
@@ -52,6 +52,7 @@ struct guc_ct_buffer_desc {
  #define GUC_CTB_STATUS_OVERFLOW                                (1 << 0)
  #define GUC_CTB_STATUS_UNDERFLOW                       (1 << 1)
  #define GUC_CTB_STATUS_MISMATCH                                (1 << 2)
+#define GUC_CTB_STATUS_DISABLED                                (1 << 3)
         u32 reserved[13];
  } __packed;
  static_assert(sizeof(struct guc_ct_buffer_desc) == 64);
diff --git a/drivers/gpu/drm/xe/xe_guc.c b/drivers/gpu/drm/xe/xe_guc.c

index c2ddf883702b3b19d2ec52780f15d02c02f946a6..4abc790ed1cd2c43ffea30ef0a95af70a3bfae74 100644 (file)
--- a/drivers/gpu/drm/xe/xe_guc.c
+++ b/drivers/gpu/drm/xe/xe_guc.c
@@ -1180,7 +1180,7 @@ void xe_guc_print_info(struct xe_guc *guc, struct drm_printer *p)
  
         xe_force_wake_put(gt_to_fw(gt), XE_FW_GT);
  
-       xe_guc_ct_print(&guc->ct, p, false);
+       xe_guc_ct_print(&guc->ct, p);
         xe_guc_submit_print(guc, p);
  }
  
diff --git a/drivers/gpu/drm/xe/xe_guc_ct.c b/drivers/gpu/drm/xe/xe_guc_ct.c

index c0dded21621f42ef251743d240f87da582372ab0..79571c9d180e8171cdc0c6e74f3a363812060392 100644 (file)
--- a/drivers/gpu/drm/xe/xe_guc_ct.c
+++ b/drivers/gpu/drm/xe/xe_guc_ct.c
@@ -26,12 +26,48 @@
  #include "xe_gt_sriov_pf_monitor.h"
  #include "xe_gt_tlb_invalidation.h"
  #include "xe_guc.h"
+#include "xe_guc_log.h"
  #include "xe_guc_relay.h"
  #include "xe_guc_submit.h"
  #include "xe_map.h"
  #include "xe_pm.h"
  #include "xe_trace_guc.h"
  
+#if IS_ENABLED(CONFIG_DRM_XE_DEBUG)
+enum {
+       /* Internal states, not error conditions (enumerators are bit numbers of dead.reason) */
+       CT_DEAD_STATE_REARM,                    /* 0x0001 */
+       CT_DEAD_STATE_CAPTURE,                  /* 0x0002 */
+
+       /* Error conditions; the hex values are the (1 << enumerator) masks seen in dead.reason */
+       CT_DEAD_SETUP,                          /* 0x0004 */
+       CT_DEAD_H2G_WRITE,                      /* 0x0008 */
+       CT_DEAD_H2G_HAS_ROOM,                   /* 0x0010 */
+       CT_DEAD_G2H_READ,                       /* 0x0020 */
+       CT_DEAD_G2H_RECV,                       /* 0x0040 */
+       CT_DEAD_G2H_RELEASE,                    /* 0x0080 */
+       CT_DEAD_DEADLOCK,                       /* 0x0100 */
+       CT_DEAD_PROCESS_FAILED,                 /* 0x0200 */
+       CT_DEAD_FAST_G2H,                       /* 0x0400 */
+       CT_DEAD_PARSE_G2H_RESPONSE,             /* 0x0800 */
+       CT_DEAD_PARSE_G2H_UNKNOWN,              /* 0x1000 */
+       CT_DEAD_PARSE_G2H_ORIGIN,               /* 0x2000 */
+       CT_DEAD_PARSE_G2H_TYPE,                 /* 0x4000 */
+};
+
+static void ct_dead_worker_func(struct work_struct *w);
+static void ct_dead_capture(struct xe_guc_ct *ct, struct guc_ctb *ctb, u32 reason_code);
+
+#define CT_DEAD(ct, ctb, reason_code)          ct_dead_capture((ct), (ctb), CT_DEAD_##reason_code)
+#else /* !CONFIG_DRM_XE_DEBUG: only mark a non-NULL ctb as broken, reason is ignored */
+#define CT_DEAD(ct, ctb, reason)                       \
+       do {                                            \
+               struct guc_ctb *_ctb = (ctb);           \
+               if (_ctb)                               \
+                       _ctb->info.broken = true;       \
+       } while (0)
+#endif /* CONFIG_DRM_XE_DEBUG */
+
  /* Used when a CT send wants to block and / or receive data */
  struct g2h_fence {
         u32 *response_buffer;
@@ -184,6 +220,10 @@ int xe_guc_ct_init(struct xe_guc_ct *ct)
         xa_init(&ct->fence_lookup);
         INIT_WORK(&ct->g2h_worker, g2h_worker_func);
         INIT_DELAYED_WORK(&ct->safe_mode_worker, safe_mode_worker_func);
+#if IS_ENABLED(CONFIG_DRM_XE_DEBUG)
+       spin_lock_init(&ct->dead.lock);
+       INIT_WORK(&ct->dead.worker, ct_dead_worker_func);
+#endif
         init_waitqueue_head(&ct->wq);
         init_waitqueue_head(&ct->g2h_fence_wq);
  
@@ -421,10 +461,22 @@ int xe_guc_ct_enable(struct xe_guc_ct *ct)
         if (ct_needs_safe_mode(ct))
                 ct_enter_safe_mode(ct);
  
+#if IS_ENABLED(CONFIG_DRM_XE_DEBUG)
+       /*
+        * The CT has now been reset so the dumper can be re-armed
+        * after any existing dead state has been dumped.
+        */
+       spin_lock_irq(&ct->dead.lock);
+       if (ct->dead.reason)
+               ct->dead.reason |= (1 << CT_DEAD_STATE_REARM);
+       spin_unlock_irq(&ct->dead.lock);
+#endif
+
         return 0;
  
  err_out:
         xe_gt_err(gt, "Failed to enable GuC CT (%pe)\n", ERR_PTR(err));
+       CT_DEAD(ct, NULL, SETUP);
  
         return err;
  }
@@ -468,6 +520,19 @@ static bool h2g_has_room(struct xe_guc_ct *ct, u32 cmd_len)
  
         if (cmd_len > h2g->info.space) {
                 h2g->info.head = desc_read(ct_to_xe(ct), h2g, head);
+
+               if (h2g->info.head > h2g->info.size) {
+                       struct xe_device *xe = ct_to_xe(ct);
+                       u32 desc_status = desc_read(xe, h2g, status);
+
+                       desc_write(xe, h2g, status, desc_status | GUC_CTB_STATUS_OVERFLOW);
+
+                       xe_gt_err(ct_to_gt(ct), "CT: invalid head offset %u >= %u)\n",
+                                 h2g->info.head, h2g->info.size);
+                       CT_DEAD(ct, h2g, H2G_HAS_ROOM);
+                       return false;
+               }
+
                 h2g->info.space = CIRC_SPACE(h2g->info.tail, h2g->info.head,
                                              h2g->info.size) -
                                   h2g->info.resv_space;
@@ -523,10 +588,24 @@ static void __g2h_reserve_space(struct xe_guc_ct *ct, u32 g2h_len, u32 num_g2h)
  
  static void __g2h_release_space(struct xe_guc_ct *ct, u32 g2h_len)
  {
+       bool bad = false;
+
         lockdep_assert_held(&ct->fast_lock);
-       xe_gt_assert(ct_to_gt(ct), ct->ctbs.g2h.info.space + g2h_len <=
-                    ct->ctbs.g2h.info.size - ct->ctbs.g2h.info.resv_space);
-       xe_gt_assert(ct_to_gt(ct), ct->g2h_outstanding);
+
+       bad = ct->ctbs.g2h.info.space + g2h_len >
+                    ct->ctbs.g2h.info.size - ct->ctbs.g2h.info.resv_space;
+       bad |= !ct->g2h_outstanding;
+
+       if (bad) {
+               xe_gt_err(ct_to_gt(ct), "Invalid G2H release: %d + %d vs %d - %d -> %d vs %d, outstanding = %d!\n",
+                         ct->ctbs.g2h.info.space, g2h_len,
+                         ct->ctbs.g2h.info.size, ct->ctbs.g2h.info.resv_space,
+                         ct->ctbs.g2h.info.space + g2h_len,
+                         ct->ctbs.g2h.info.size - ct->ctbs.g2h.info.resv_space,
+                         ct->g2h_outstanding);
+               CT_DEAD(ct, &ct->ctbs.g2h, G2H_RELEASE);
+               return;
+       }
  
         ct->ctbs.g2h.info.space += g2h_len;
         if (!--ct->g2h_outstanding)
@@ -553,12 +632,43 @@ static int h2g_write(struct xe_guc_ct *ct, const u32 *action, u32 len,
         u32 full_len;
         struct iosys_map map = IOSYS_MAP_INIT_OFFSET(&h2g->cmds,
                                                          tail * sizeof(u32));
+       u32 desc_status;
  
         full_len = len + GUC_CTB_HDR_LEN;
  
         lockdep_assert_held(&ct->lock);
         xe_gt_assert(gt, full_len <= GUC_CTB_MSG_MAX_LEN);
-       xe_gt_assert(gt, tail <= h2g->info.size);
+
+       desc_status = desc_read(xe, h2g, status);
+       if (desc_status) {
+               xe_gt_err(gt, "CT write: non-zero status: %u\n", desc_status);
+               goto corrupted;
+       }
+
+       if (IS_ENABLED(CONFIG_DRM_XE_DEBUG)) {
+               u32 desc_tail = desc_read(xe, h2g, tail);
+               u32 desc_head = desc_read(xe, h2g, head);
+
+               if (tail != desc_tail) {
+                       desc_write(xe, h2g, status, desc_status | GUC_CTB_STATUS_MISMATCH);
+                       xe_gt_err(gt, "CT write: tail was modified %u != %u\n", desc_tail, tail);
+                       goto corrupted;
+               }
+
+               if (tail > h2g->info.size) {
+                       desc_write(xe, h2g, status, desc_status | GUC_CTB_STATUS_OVERFLOW);
+                       xe_gt_err(gt, "CT write: tail out of range: %u vs %u\n",
+                                 tail, h2g->info.size);
+                       goto corrupted;
+               }
+
+               if (desc_head >= h2g->info.size) {
+                       desc_write(xe, h2g, status, desc_status | GUC_CTB_STATUS_OVERFLOW);
+                       xe_gt_err(gt, "CT write: invalid head offset %u >= %u)\n",
+                                 desc_head, h2g->info.size);
+                       goto corrupted;
+               }
+       }
  
         /* Command will wrap, zero fill (NOPs), return and check credits again */
         if (tail + full_len > h2g->info.size) {
@@ -611,6 +721,10 @@ static int h2g_write(struct xe_guc_ct *ct, const u32 *action, u32 len,
                              desc_read(xe, h2g, head), h2g->info.tail);
  
         return 0;
+
+corrupted:
+       CT_DEAD(ct, &ct->ctbs.h2g, H2G_WRITE);
+       return -EPIPE;
  }
  
  /*
@@ -718,7 +832,6 @@ static int guc_ct_send_locked(struct xe_guc_ct *ct, const u32 *action, u32 len,
  {
         struct xe_device *xe = ct_to_xe(ct);
         struct xe_gt *gt = ct_to_gt(ct);
-       struct drm_printer p = xe_gt_info_printer(gt);
         unsigned int sleep_period_ms = 1;
         int ret;
  
@@ -771,8 +884,13 @@ try_again:
                         goto broken;
  #undef g2h_avail
  
-               if (dequeue_one_g2h(ct) < 0)
+               ret = dequeue_one_g2h(ct);
+               if (ret < 0) {
+                       if (ret != -ECANCELED)
+                               xe_gt_err(ct_to_gt(ct), "CTB receive failed (%pe)",
+                                         ERR_PTR(ret));
                         goto broken;
+               }
  
                 goto try_again;
         }
@@ -781,8 +899,7 @@ try_again:
  
  broken:
         xe_gt_err(gt, "No forward process on H2G, reset required\n");
-       xe_guc_ct_print(ct, &p, true);
-       ct->ctbs.h2g.info.broken = true;
+       CT_DEAD(ct, &ct->ctbs.h2g, DEADLOCK);
  
         return -EDEADLK;
  }
@@ -1021,6 +1138,7 @@ static int parse_g2h_response(struct xe_guc_ct *ct, u32 *msg, u32 len)
                 else
                         xe_gt_err(gt, "unexpected response %u for FAST_REQ H2G fence 0x%x!\n",
                                   type, fence);
+               CT_DEAD(ct, NULL, PARSE_G2H_RESPONSE);
  
                 return -EPROTO;
         }
@@ -1028,6 +1146,7 @@ static int parse_g2h_response(struct xe_guc_ct *ct, u32 *msg, u32 len)
         g2h_fence = xa_erase(&ct->fence_lookup, fence);
         if (unlikely(!g2h_fence)) {
                 /* Don't tear down channel, as send could've timed out */
+               /* CT_DEAD(ct, NULL, PARSE_G2H_UNKNOWN); */
                 xe_gt_warn(gt, "G2H fence (%u) not found!\n", fence);
                 g2h_release_space(ct, GUC_CTB_HXG_MSG_MAX_LEN);
                 return 0;
@@ -1072,7 +1191,7 @@ static int parse_g2h_msg(struct xe_guc_ct *ct, u32 *msg, u32 len)
         if (unlikely(origin != GUC_HXG_ORIGIN_GUC)) {
                 xe_gt_err(gt, "G2H channel broken on read, origin=%u, reset required\n",
                           origin);
-               ct->ctbs.g2h.info.broken = true;
+               CT_DEAD(ct, &ct->ctbs.g2h, PARSE_G2H_ORIGIN);
  
                 return -EPROTO;
         }
@@ -1090,7 +1209,7 @@ static int parse_g2h_msg(struct xe_guc_ct *ct, u32 *msg, u32 len)
         default:
                 xe_gt_err(gt, "G2H channel broken on read, type=%u, reset required\n",
                           type);
-               ct->ctbs.g2h.info.broken = true;
+               CT_DEAD(ct, &ct->ctbs.g2h, PARSE_G2H_TYPE);
  
                 ret = -EOPNOTSUPP;
         }
@@ -1167,9 +1286,11 @@ static int process_g2h_msg(struct xe_guc_ct *ct, u32 *msg, u32 len)
                 xe_gt_err(gt, "unexpected G2H action 0x%04x\n", action);
         }
  
-       if (ret)
+       if (ret) {
                 xe_gt_err(gt, "G2H action 0x%04x failed (%pe)\n",
                           action, ERR_PTR(ret));
+               CT_DEAD(ct, NULL, PROCESS_FAILED);
+       }
  
         return 0;
  }
@@ -1179,7 +1300,7 @@ static int g2h_read(struct xe_guc_ct *ct, u32 *msg, bool fast_path)
         struct xe_device *xe = ct_to_xe(ct);
         struct xe_gt *gt = ct_to_gt(ct);
         struct guc_ctb *g2h = &ct->ctbs.g2h;
-       u32 tail, head, len;
+       u32 tail, head, len, desc_status;
         s32 avail;
         u32 action;
         u32 *hxg;
@@ -1198,6 +1319,63 @@ static int g2h_read(struct xe_guc_ct *ct, u32 *msg, bool fast_path)
  
         xe_gt_assert(gt, xe_guc_ct_enabled(ct));
  
+       desc_status = desc_read(xe, g2h, status);
+       if (desc_status) {
+               if (desc_status & GUC_CTB_STATUS_DISABLED) {
+                       /*
+                        * Potentially valid if a CLIENT_RESET request resulted in
+                        * contexts/engines being reset. But should never happen as
+                        * no contexts should be active when CLIENT_RESET is sent.
+                        */
+                       xe_gt_err(gt, "CT read: unexpected G2H after GuC has stopped!\n");
+                       desc_status &= ~GUC_CTB_STATUS_DISABLED;
+               }
+
+               if (desc_status) {
+                       xe_gt_err(gt, "CT read: non-zero status: %u\n", desc_status);
+                       goto corrupted;
+               }
+       }
+
+       if (IS_ENABLED(CONFIG_DRM_XE_DEBUG)) {
+               u32 desc_tail = desc_read(xe, g2h, tail);
+               /*
+               u32 desc_head = desc_read(xe, g2h, head);
+
+                * info.head and desc_head are updated back-to-back at the end of
+                * this function and nowhere else. Hence, they cannot be different
+                * unless two g2h_read calls are running concurrently. Which is not
+                * possible because it is guarded by ct->fast_lock. And yet, some
+                * discrete platforms are regularly hitting this error :(.
+                *
+                * desc_head rolling backwards shouldn't cause any noticeable
+                * problems - just a delay in GuC being allowed to proceed past that
+                * point in the queue. So for now, just disable the error until it
+                * can be root caused.
+                *
+               if (g2h->info.head != desc_head) {
+                       desc_write(xe, g2h, status, desc_status | GUC_CTB_STATUS_MISMATCH);
+                       xe_gt_err(gt, "CT read: head was modified %u != %u\n",
+                                 desc_head, g2h->info.head);
+                       goto corrupted;
+               }
+                */
+
+               if (g2h->info.head > g2h->info.size) {
+                       desc_write(xe, g2h, status, desc_status | GUC_CTB_STATUS_OVERFLOW);
+                       xe_gt_err(gt, "CT read: head out of range: %u vs %u\n",
+                                 g2h->info.head, g2h->info.size);
+                       goto corrupted;
+               }
+
+               if (desc_tail >= g2h->info.size) {
+                       desc_write(xe, g2h, status, desc_status | GUC_CTB_STATUS_OVERFLOW);
+                       xe_gt_err(gt, "CT read: invalid tail offset %u >= %u)\n",
+                                 desc_tail, g2h->info.size);
+                       goto corrupted;
+               }
+       }
+
         /* Calculate DW available to read */
         tail = desc_read(xe, g2h, tail);
         avail = tail - g2h->info.head;
@@ -1214,9 +1392,7 @@ static int g2h_read(struct xe_guc_ct *ct, u32 *msg, bool fast_path)
         if (len > avail) {
                 xe_gt_err(gt, "G2H channel broken on read, avail=%d, len=%d, reset required\n",
                           avail, len);
-               g2h->info.broken = true;
-
-               return -EPROTO;
+               goto corrupted;
         }
  
         head = (g2h->info.head + 1) % g2h->info.size;
@@ -1262,6 +1438,10 @@ static int g2h_read(struct xe_guc_ct *ct, u32 *msg, bool fast_path)
                              action, len, g2h->info.head, tail);
  
         return len;
+
+corrupted:
+       CT_DEAD(ct, &ct->ctbs.g2h, G2H_READ);
+       return -EPROTO;
  }
  
  static void g2h_fast_path(struct xe_guc_ct *ct, u32 *msg, u32 len)
@@ -1288,9 +1468,11 @@ static void g2h_fast_path(struct xe_guc_ct *ct, u32 *msg, u32 len)
                 xe_gt_warn(gt, "NOT_POSSIBLE");
         }
  
-       if (ret)
+       if (ret) {
                 xe_gt_err(gt, "G2H action 0x%04x failed (%pe)\n",
                           action, ERR_PTR(ret));
+               CT_DEAD(ct, NULL, FAST_G2H);
+       }
  }
  
  /**
@@ -1350,7 +1532,6 @@ static int dequeue_one_g2h(struct xe_guc_ct *ct)
  
  static void receive_g2h(struct xe_guc_ct *ct)
  {
-       struct xe_gt *gt = ct_to_gt(ct);
         bool ongoing;
         int ret;
  
@@ -1387,9 +1568,8 @@ static void receive_g2h(struct xe_guc_ct *ct)
                 mutex_unlock(&ct->lock);
  
                 if (unlikely(ret == -EPROTO || ret == -EOPNOTSUPP)) {
-                       struct drm_printer p = xe_gt_info_printer(gt);
-
-                       xe_guc_ct_print(ct, &p, false);
+                       xe_gt_err(ct_to_gt(ct), "CT dequeue failed: %d", ret);
+                       CT_DEAD(ct, NULL, G2H_RECV);
                         kick_reset(ct);
                 }
         } while (ret == 1);
@@ -1417,9 +1597,8 @@ static void guc_ctb_snapshot_capture(struct xe_device *xe, struct guc_ctb *ctb,
  
         snapshot->cmds = kmalloc_array(ctb->info.size, sizeof(u32),
                                        atomic ? GFP_ATOMIC : GFP_KERNEL);
-
         if (!snapshot->cmds) {
-               drm_err(&xe->drm, "Skipping CTB commands snapshot. Only CTB info will be available.\n");
+               drm_err(&xe->drm, "Skipping CTB commands snapshot. Only CT info will be available.\n");
                 return;
         }
  
@@ -1500,7 +1679,7 @@ struct xe_guc_ct_snapshot *xe_guc_ct_snapshot_capture(struct xe_guc_ct *ct,
                            atomic ? GFP_ATOMIC : GFP_KERNEL);
  
         if (!snapshot) {
-               drm_err(&xe->drm, "Skipping CTB snapshot entirely.\n");
+               xe_gt_err(ct_to_gt(ct), "Skipping CTB snapshot entirely.\n");
                 return NULL;
         }
  
@@ -1564,16 +1743,119 @@ void xe_guc_ct_snapshot_free(struct xe_guc_ct_snapshot *snapshot)
   * xe_guc_ct_print - GuC CT Print.
   * @ct: GuC CT.
   * @p: drm_printer where it will be printed out.
- * @atomic: Boolean to indicate if this is called from atomic context like
- * reset or CTB handler or from some regular path like debugfs.
   *
   * This function quickly capture a snapshot and immediately print it out.
   */
-void xe_guc_ct_print(struct xe_guc_ct *ct, struct drm_printer *p, bool atomic)
+void xe_guc_ct_print(struct xe_guc_ct *ct, struct drm_printer *p)
  {
         struct xe_guc_ct_snapshot *snapshot;
  
-       snapshot = xe_guc_ct_snapshot_capture(ct, atomic);
+       snapshot = xe_guc_ct_snapshot_capture(ct, false);
         xe_guc_ct_snapshot_print(snapshot, p);
         xe_guc_ct_snapshot_free(snapshot);
  }
+
+#if IS_ENABLED(CONFIG_DRM_XE_DEBUG)
+static void ct_dead_capture(struct xe_guc_ct *ct, struct guc_ctb *ctb, u32 reason_code)
+{      /* Mark @ctb broken, latch @reason_code, snapshot GuC log + CT once, queue the worker */
+       struct xe_guc_log_snapshot *snapshot_log;
+       struct xe_guc_ct_snapshot *snapshot_ct;
+       struct xe_guc *guc = ct_to_guc(ct);
+       unsigned long flags;
+       bool have_capture;
+
+       if (ctb) /* callers pass NULL when the failure is not tied to a single CTB */
+               ctb->info.broken = true;
+
+       /* Ignore further errors after the first dump until a reset (reported is read unlocked) */
+       if (ct->dead.reported)
+               return;
+
+       spin_lock_irqsave(&ct->dead.lock, flags);
+
+       /* And only capture one dump at a time: whoever first sets CAPTURE takes the snapshots */
+       have_capture = ct->dead.reason & (1 << CT_DEAD_STATE_CAPTURE);
+       ct->dead.reason |= (1 << reason_code) |
+                          (1 << CT_DEAD_STATE_CAPTURE);
+
+       spin_unlock_irqrestore(&ct->dead.lock, flags);
+
+       if (have_capture) /* reason bit is recorded above, but no second snapshot is taken */
+               return;
+
+       snapshot_log = xe_guc_log_snapshot_capture(&guc->log, true);
+       snapshot_ct = xe_guc_ct_snapshot_capture((ct), true); /* second argument is 'atomic' */
+
+       spin_lock_irqsave(&ct->dead.lock, flags);
+
+       if (ct->dead.snapshot_log || ct->dead.snapshot_ct) {
+               xe_gt_err(ct_to_gt(ct), "Got unexpected dead CT capture!\n");
+               xe_guc_log_snapshot_free(snapshot_log);
+               xe_guc_ct_snapshot_free(snapshot_ct);
+       } else {
+               ct->dead.snapshot_log = snapshot_log;
+               ct->dead.snapshot_ct = snapshot_ct;
+       }
+
+       spin_unlock_irqrestore(&ct->dead.lock, flags);
+
+       queue_work(system_unbound_wq, &(ct)->dead.worker); /* runs ct_dead_worker_func() */
+}
+
+static void ct_dead_print(struct xe_dead_ct *dead)
+{      /* Print the stored dead-CT capture via a line printer layered on the GT info printer */
+       struct xe_guc_ct *ct = container_of(dead, struct xe_guc_ct, dead);
+       struct xe_device *xe = ct_to_xe(ct);
+       struct xe_gt *gt = ct_to_gt(ct);
+       static int g_count; /* bumped on every call, including the early return below */
+       struct drm_printer ip = xe_gt_info_printer(gt);
+       struct drm_printer lp = drm_line_printer(&ip, "Capture", ++g_count);
+
+       if (!dead->reason) {
+               xe_gt_err(gt, "CTB is dead for no reason!?\n");
+               return;
+       }
+
+       drm_printf(&lp, "CTB is dead - reason=0x%X\n", dead->reason);
+
+       /* Can't generate a genuine core dump at this point, so just do the good bits */
+       drm_puts(&lp, "**** Xe Device Coredump ****\n");
+       xe_device_snapshot_print(xe, &lp);
+
+       drm_printf(&lp, "**** GT #%d ****\n", gt->info.id);
+       drm_printf(&lp, "\tTile: %d\n", gt->tile->id);
+
+       drm_puts(&lp, "**** GuC Log ****\n");
+       xe_guc_log_snapshot_print(dead->snapshot_log, &lp);
+
+       drm_puts(&lp, "**** GuC CT ****\n");
+       xe_guc_ct_snapshot_print(dead->snapshot_ct, &lp);
+
+       drm_puts(&lp, "Done.\n");
+}
+
+static void ct_dead_worker_func(struct work_struct *w)
+{      /* Work item queued by ct_dead_capture(): print the capture once, then free it */
+       struct xe_guc_ct *ct = container_of(w, struct xe_guc_ct, dead.worker);
+
+       if (!ct->dead.reported) { /* print at most once until re-armed below */
+               ct->dead.reported = true;
+               ct_dead_print(&ct->dead);
+       }
+
+       spin_lock_irq(&ct->dead.lock);
+
+       xe_guc_log_snapshot_free(ct->dead.snapshot_log);
+       ct->dead.snapshot_log = NULL;
+       xe_guc_ct_snapshot_free(ct->dead.snapshot_ct);
+       ct->dead.snapshot_ct = NULL;
+
+       if (ct->dead.reason & (1 << CT_DEAD_STATE_REARM)) {
+               /* xe_guc_ct_enable() set REARM after a reset, so re-arm the error reporting */
+               ct->dead.reason = 0;
+               ct->dead.reported = false;
+       } /* otherwise reason keeps its CAPTURE bit, so ct_dead_capture() takes no new snapshot */
+
+       spin_unlock_irq(&ct->dead.lock);
+}
+#endif
diff --git a/drivers/gpu/drm/xe/xe_guc_ct.h b/drivers/gpu/drm/xe/xe_guc_ct.h

index 190202fce2d048adf83497157af5c7a8c1b20ca8..293041bed7ed0f0c285ca406da2000e8a65128e3 100644 (file)
--- a/drivers/gpu/drm/xe/xe_guc_ct.h
+++ b/drivers/gpu/drm/xe/xe_guc_ct.h
@@ -21,7 +21,7 @@ xe_guc_ct_snapshot_capture(struct xe_guc_ct *ct, bool atomic);
  void xe_guc_ct_snapshot_print(struct xe_guc_ct_snapshot *snapshot,
                               struct drm_printer *p);
  void xe_guc_ct_snapshot_free(struct xe_guc_ct_snapshot *snapshot);
-void xe_guc_ct_print(struct xe_guc_ct *ct, struct drm_printer *p, bool atomic);
+void xe_guc_ct_print(struct xe_guc_ct *ct, struct drm_printer *p);
  
  static inline bool xe_guc_ct_enabled(struct xe_guc_ct *ct)
  {
diff --git a/drivers/gpu/drm/xe/xe_guc_ct_types.h b/drivers/gpu/drm/xe/xe_guc_ct_types.h

index 761cb90312984325b4a875410b1a024b68951cb7..85e127ec91d7af22468ef7ebb9c6a9c9a0eeebc3 100644 (file)
--- a/drivers/gpu/drm/xe/xe_guc_ct_types.h
+++ b/drivers/gpu/drm/xe/xe_guc_ct_types.h
@@ -86,6 +86,24 @@ enum xe_guc_ct_state {
         XE_GUC_CT_STATE_ENABLED,
  };
  
+#if IS_ENABLED(CONFIG_DRM_XE_DEBUG)
+/** struct xe_dead_ct - Information for debugging a dead CT */
+struct xe_dead_ct {
+       /** @lock: protects the snapshot pointer assignment/free operations, and @reason updates */
+       spinlock_t lock;
+       /** @reason: bit mask of (1 << CT_DEAD_*) reason codes and internal state bits */
+       unsigned int reason;
+       /** @reported: for preventing multiple dumps per error sequence */
+       bool reported;
+       /** @worker: work item that prints and frees the capture outside the failing context */
+       struct work_struct worker;
+       /** @snapshot_ct: copy of CT state and CTB content at point of error */
+       struct xe_guc_ct_snapshot *snapshot_ct;
+       /** @snapshot_log: copy of GuC log at point of error */
+       struct xe_guc_log_snapshot *snapshot_log;
+};
+#endif
+
  /**
   * struct xe_guc_ct - GuC command transport (CT) layer
   *
@@ -128,6 +146,11 @@ struct xe_guc_ct {
         u32 msg[GUC_CTB_MSG_MAX_LEN];
         /** @fast_msg: Message buffer */
         u32 fast_msg[GUC_CTB_MSG_MAX_LEN];
+
+#if IS_ENABLED(CONFIG_DRM_XE_DEBUG)
+       /** @dead: information for debugging dead CTs */
+       struct xe_dead_ct dead;
+#endif
  };
  
  #endif
author	John Harrison <John.C.Harrison@Intel.com>
	Thu, 3 Oct 2024 00:46:08 +0000 (17:46 -0700)
committer	John Harrison <John.C.Harrison@Intel.com>
	Tue, 8 Oct 2024 01:34:59 +0000 (18:34 -0700)
drivers/gpu/drm/xe/abi/guc_communication_ctb_abi.h		patch \| blob \| history
drivers/gpu/drm/xe/xe_guc.c		patch \| blob \| history
drivers/gpu/drm/xe/xe_guc_ct.c		patch \| blob \| history
drivers/gpu/drm/xe/xe_guc_ct.h		patch \| blob \| history
drivers/gpu/drm/xe/xe_guc_ct_types.h		patch \| blob \| history