www.infradead.org Git - users/hch/misc.git/commitdiff
drm/xe/guc: Set upper limit of H2G retries over CTB
author Michal Wajdeczko <michal.wajdeczko@intel.com>
Wed, 3 Sep 2025 22:33:30 +0000 (00:33 +0200)
committer Michal Wajdeczko <michal.wajdeczko@intel.com>
Thu, 4 Sep 2025 20:24:51 +0000 (22:24 +0200)
The GuC communication protocol allows the GuC to send a NO_RESPONSE_RETRY
reply message to indicate that, due to some interim condition, it cannot
handle the incoming H2G request and the host shall resend it.

However, in some cases, due to errors, this unsatisfied condition might
be permanent, which could lead to endless retries, as was recently
seen in CI:

 [drm] GT0: PF: VF1 FLR didn't finish in 5000 ms (-ETIMEDOUT)
 [drm] GT0: PF: VF1 resource sanitizing failed (-ETIMEDOUT)
 [drm] GT0: PF: VF1 FLR failed!
 [drm:guc_ct_send_recv [xe]] GT0: H2G action 0x5503 retrying: reason 0x0
 [drm:guc_ct_send_recv [xe]] GT0: H2G action 0x5503 retrying: reason 0x0
 [drm:guc_ct_send_recv [xe]] GT0: H2G action 0x5503 retrying: reason 0x0
 [drm:guc_ct_send_recv [xe]] GT0: H2G action 0x5503 retrying: reason 0x0

To avoid such dangerous loops, allow only a limited number of retries
(for now 50) and add a delay before each retry (n * 5 ms, where n is the
retry count) to slow down the rate at which the request is resent.

Signed-off-by: Michal Wajdeczko <michal.wajdeczko@intel.com>
Cc: John Harrison <John.C.Harrison@Intel.com>
Cc: Matthew Brost <matthew.brost@intel.com>
Reviewed-by: Stuart Summers <stuart.summers@intel.com>
Reviewed-by: Julia Filipchuk <julia.filipchuk@intel.com>
Link: https://lore.kernel.org/r/20250903223330.6408-1-michal.wajdeczko@intel.com
drivers/gpu/drm/xe/xe_guc_ct.c

index e431ff73227cc34d1dfc179f238b4cfb100d4e81..f40543b040d6a8e2e57b16852f4affc96f850251 100644 (file)
@@ -1079,11 +1079,15 @@ static bool retry_failure(struct xe_guc_ct *ct, int ret)
        return true;
 }
 
+#define GUC_SEND_RETRY_LIMIT   50
+#define GUC_SEND_RETRY_MSLEEP  5
+
 static int guc_ct_send_recv(struct xe_guc_ct *ct, const u32 *action, u32 len,
                            u32 *response_buffer, bool no_fail)
 {
        struct xe_gt *gt = ct_to_gt(ct);
        struct g2h_fence g2h_fence;
+       unsigned int retries = 0;
        int ret = 0;
 
        /*
@@ -1148,6 +1152,12 @@ retry_same_fence:
                xe_gt_dbg(gt, "H2G action %#x retrying: reason %#x\n",
                          action[0], g2h_fence.reason);
                mutex_unlock(&ct->lock);
+               if (++retries > GUC_SEND_RETRY_LIMIT) {
+                       xe_gt_err(gt, "H2G action %#x reached retry limit=%u, aborting\n",
+                                 action[0], GUC_SEND_RETRY_LIMIT);
+                       return -ELOOP;
+               }
+               msleep(GUC_SEND_RETRY_MSLEEP * retries);
                goto retry;
        }
        if (g2h_fence.fail) {