www.infradead.org Git - users/hch/misc.git/commitdiff
drm/i915/guc: Include the GuC registers in the error state
author Daniele Ceraolo Spurio <daniele.ceraolospurio@intel.com>
Tue, 9 Sep 2025 22:36:22 +0000 (15:36 -0700)
committer Daniele Ceraolo Spurio <daniele.ceraolospurio@intel.com>
Thu, 11 Sep 2025 18:28:02 +0000 (11:28 -0700)
If GuC hangs, the GuC logs might not contain enough information to
understand exactly why the hang occurred. In this case, we need to
look at the GuC HW state to try to understand where the GuC is stuck. It
is therefore useful to include the GuC HW state in the error capture.

The list of registers that are part of the GuC HW state can change based
on platform, but it is the same for all platforms from TGL to MTL so we
only need to support one version for i915.

v2: revised list
v3: remove confusing comment, use sizeof(u32) instead of 4 (John)

Signed-off-by: Daniele Ceraolo Spurio <daniele.ceraolospurio@intel.com>
Cc: John Harrison <John.C.Harrison@Intel.com>
Reviewed-by: John Harrison <John.C.Harrison@Intel.com>
Link: https://lore.kernel.org/r/20250909223621.3782625-2-daniele.ceraolospurio@intel.com
drivers/gpu/drm/i915/gt/uc/intel_guc_fw.c
drivers/gpu/drm/i915/gt/uc/intel_guc_reg.h
drivers/gpu/drm/i915/i915_gpu_error.c
drivers/gpu/drm/i915/i915_gpu_error.h

index e7ccfa520df3aec946afe6d9101d8ede43db35df..384d1400134dd315d6dbd67aca0b7f21ec7108cd 100644 (file)
@@ -46,6 +46,14 @@ static void guc_prepare_xfer(struct intel_gt *gt)
                /* allows for 5us (in 10ns units) before GT can go to RC6 */
                intel_uncore_write(uncore, GUC_ARAT_C6DIS, 0x1FF);
        }
+
+       /*
+        * Starting from IP 12.50 we need to enable the mirroring of GuC
+        * internal state to debug registers. This is always enabled on previous
+        * IPs.
+        */
+       if (GRAPHICS_VER_FULL(uncore->i915) >= IP_VER(12, 50))
+               intel_uncore_rmw(uncore, GUC_SHIM_CONTROL2, 0, GUC_ENABLE_DEBUG_REG);
 }
 
 static int guc_xfer_rsa_mmio(struct intel_uc_fw *guc_fw,
index 3fd7988375020f9ff436cbc2a4352e888d41d9fb..f73dab5275473ea309943af431aa70a8d0efe0d4 100644 (file)
@@ -96,6 +96,7 @@
 #define   GUC_GEN10_SHIM_WC_ENABLE             (1<<21)
 
 #define GUC_SHIM_CONTROL2              _MMIO(0xc068)
+#define   GUC_ENABLE_DEBUG_REG         (1<<11) /* mirror GuC internal state to debug regs */
 #define   GUC_IS_PRIVILEGED            (1<<29)
 #define   GSC_LOADS_HUC                        (1<<30)
 
index 4f785cdbd1553a7a033a5c4b87b16d616e196d88..01dd77de6d14d0d8144a4a941706bfe0523f817d 100644 (file)
@@ -685,6 +685,74 @@ static void err_print_guc_ctb(struct drm_i915_error_state_buf *m,
                   ctb->head, ctb->tail, ctb->desc_offset, ctb->cmds_offset, ctb->size);
 }
 
+/*
+ * This list includes registers that are useful in debugging GuC hangs.
+ *
+ * Each entry describes one contiguous range: @start is the MMIO offset of the
+ * first register and @count is the number of consecutive 32-bit registers to
+ * capture from there. Ranges are captured and printed in table order.
+ *
+ * The table is static: all users visible in this change live in this file, so
+ * there is no reason to export the symbol.
+ *
+ * NOTE(review): 0xc0b0 is listed as both the first and the last range, so it
+ * is sampled twice per capture - confirm this is intentional.
+ */
+static const struct {
+       u32 start;
+       u32 count;
+} guc_hw_reg_state[] = {
+       { 0xc0b0, 2 },
+       { 0xc000, 65 },
+       { 0xc140, 1 },
+       { 0xc180, 16 },
+       { 0xc1dc, 10 },
+       { 0xc300, 79 },
+       { 0xc4b4, 47 },
+       { 0xc574, 1 },
+       { 0xc57c, 1 },
+       { 0xc584, 11 },
+       { 0xc5c0, 8 },
+       { 0xc5e4, 1 },
+       { 0xc5ec, 103 },
+       { 0xc7c0, 1 },
+       { 0xc0b0, 2 }
+};
+
+/*
+ * Print a single line of register values, prefixed with the offset of the
+ * first register on that line. The line holds 8, 4, 2 or 1 values: the
+ * largest of those sizes that does not exceed @count.
+ *
+ * Returns the number of values consumed from @dump.
+ */
+static u32 print_range_line(struct drm_i915_error_state_buf *m, u32 start, u32 *dump, u32 count)
+{
+       if (count < 2) {
+               err_printf(m, "[0x%04x] 0x%08x\n", start, dump[0]);
+               return 1;
+       }
+
+       if (count < 4) {
+               err_printf(m, "[0x%04x] 0x%08x 0x%08x\n", start, dump[0], dump[1]);
+               return 2;
+       }
+
+       if (count < 8) {
+               err_printf(m, "[0x%04x] 0x%08x 0x%08x 0x%08x 0x%08x\n",
+                          start, dump[0], dump[1], dump[2], dump[3]);
+               return 4;
+       }
+
+       err_printf(m, "[0x%04x] 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x\n",
+                  start, dump[0], dump[1], dump[2], dump[3],
+                  dump[4], dump[5], dump[6], dump[7]);
+       return 8;
+}
+
+/*
+ * Print the captured GuC register values, if any. @hw_state holds the values
+ * of every range in guc_hw_reg_state back to back, in table order; nothing is
+ * printed when it is NULL.
+ */
+static void err_print_guc_hw_state(struct drm_i915_error_state_buf *m, u32 *hw_state)
+{
+       u32 offset = 0;
+       int range;
+
+       if (!hw_state)
+               return;
+
+       err_printf(m, "GuC Register State:\n");
+
+       for (range = 0; range < ARRAY_SIZE(guc_hw_reg_state); range++) {
+               const u32 base = guc_hw_reg_state[range].start;
+               const u32 len = guc_hw_reg_state[range].count;
+               /* first saved value belonging to this range */
+               u32 *values = hw_state + offset;
+               u32 done = 0;
+
+               /* emit lines until every register in the range is printed */
+               while (done < len) {
+                       done += print_range_line(m, base + done * sizeof(u32),
+                                                values + done, len - done);
+               }
+
+               GEM_BUG_ON(done != len);
+               offset += done;
+       }
+}
+
 static void err_print_uc(struct drm_i915_error_state_buf *m,
                         const struct intel_uc_coredump *error_uc)
 {
@@ -693,6 +761,7 @@ static void err_print_uc(struct drm_i915_error_state_buf *m,
        intel_uc_fw_dump(&error_uc->guc_fw, &p);
        intel_uc_fw_dump(&error_uc->huc_fw, &p);
        err_printf(m, "GuC timestamp: 0x%08x\n", error_uc->guc.timestamp);
+       err_print_guc_hw_state(m, error_uc->guc.hw_state);
        intel_gpu_error_print_vma(m, NULL, error_uc->guc.vma_log);
        err_printf(m, "GuC CTB fence: %d\n", error_uc->guc.last_fence);
        err_print_guc_ctb(m, "Send", error_uc->guc.ctb + 0);
@@ -1025,6 +1094,7 @@ static void cleanup_uc(struct intel_uc_coredump *uc)
        kfree(uc->huc_fw.file_wanted.path);
        i915_vma_coredump_free(uc->guc.vma_log);
        i915_vma_coredump_free(uc->guc.vma_ctb);
+       kfree(uc->guc.hw_state);
 
        kfree(uc);
 }
@@ -1721,6 +1791,37 @@ static void gt_record_guc_ctb(struct intel_ctb_coredump *saved,
        saved->cmds_offset = ((void *)ctb->cmds) - blob_ptr;
 }
 
+/*
+ * Read one register from the guc_hw_reg_state table: entry @count (counted
+ * in 32-bit registers from the range start) of range @range. Both indices
+ * are checked against the table with GEM_BUG_ON.
+ */
+static u32 read_guc_state_reg(struct intel_uncore *uncore, int range, int count)
+{
+       u32 offset;
+
+       GEM_BUG_ON(range >= ARRAY_SIZE(guc_hw_reg_state));
+       GEM_BUG_ON(count >= guc_hw_reg_state[range].count);
+
+       offset = guc_hw_reg_state[range].start + count * sizeof(u32);
+
+       return intel_uncore_read(uncore, _MMIO(offset));
+}
+
+/*
+ * Capture every register listed in guc_hw_reg_state into a newly allocated
+ * array and store it in @error_uc->guc.hw_state. Values are saved back to
+ * back in table order. If the allocation fails, nothing is stored.
+ */
+static void gt_record_guc_hw_state(struct intel_uncore *uncore,
+                                  struct intel_uc_coredump *error_uc)
+{
+       u32 total = 0;
+       u32 *regs, *out;
+       int range, idx;
+
+       /* size the buffer to hold all ranges */
+       for (range = 0; range < ARRAY_SIZE(guc_hw_reg_state); range++)
+               total += guc_hw_reg_state[range].count;
+
+       regs = kcalloc(total, sizeof(u32), ALLOW_FAIL);
+       if (!regs)
+               return;
+
+       out = regs;
+       for (range = 0; range < ARRAY_SIZE(guc_hw_reg_state); range++) {
+               for (idx = 0; idx < guc_hw_reg_state[range].count; idx++)
+                       *out++ = read_guc_state_reg(uncore, range, idx);
+       }
+
+       error_uc->guc.hw_state = regs;
+}
+
 static struct intel_uc_coredump *
 gt_record_uc(struct intel_gt_coredump *gt,
             struct i915_vma_compress *compress)
@@ -1755,6 +1856,7 @@ gt_record_uc(struct intel_gt_coredump *gt,
                          uc->guc.ct.ctbs.send.desc, (struct intel_guc *)&uc->guc);
        gt_record_guc_ctb(error_uc->guc.ctb + 1, &uc->guc.ct.ctbs.recv,
                          uc->guc.ct.ctbs.send.desc, (struct intel_guc *)&uc->guc);
+       gt_record_guc_hw_state(gt->_gt->uncore, error_uc);
 
        return error_uc;
 }
index 182324979278c661b77f5cc5ec6f552a23b11922..91b3df621a492f8de7f579425857ba0ba99b0229 100644 (file)
@@ -177,6 +177,7 @@ struct intel_gt_coredump {
                        struct intel_ctb_coredump ctb[2];
                        struct i915_vma_coredump *vma_ctb;
                        struct i915_vma_coredump *vma_log;
+                       u32 *hw_state;
                        u32 timestamp;
                        u16 last_fence;
                        bool is_guc_capture;