drm/xe: Don't short circuit TDR on jobs not started

author Matthew Brost <matthew.brost@intel.com>
Fri, 25 Oct 2024 21:43:29 +0000 (14:43 -0700)
committer Lucas De Marchi <lucas.demarchi@intel.com>
Thu, 31 Oct 2024 14:03:14 +0000 (07:03 -0700)
diff --git a/drivers/gpu/drm/xe/xe_guc_submit.c b/drivers/gpu/drm/xe/xe_guc_submit.c

index d333be9c4227760d629dcd5414f71c89e015daa5..f903b077272259e475226da053db75b4efec61c3 100644 (file)
--- a/drivers/gpu/drm/xe/xe_guc_submit.c
+++ b/drivers/gpu/drm/xe/xe_guc_submit.c
@@ -916,12 +916,22 @@ static void xe_guc_exec_queue_lr_cleanup(struct work_struct *w)
  static bool check_timeout(struct xe_exec_queue *q, struct xe_sched_job *job)
  {
         struct xe_gt *gt = guc_to_gt(exec_queue_to_guc(q));
-       u32 ctx_timestamp = xe_lrc_ctx_timestamp(q->lrc[0]);
-       u32 ctx_job_timestamp = xe_lrc_ctx_job_timestamp(q->lrc[0]);
+       u32 ctx_timestamp, ctx_job_timestamp;
         u32 timeout_ms = q->sched_props.job_timeout_ms;
         u32 diff;
         u64 running_time_ms;
  
+       if (!xe_sched_job_started(job)) {
+               xe_gt_warn(gt, "Check job timeout: seqno=%u, lrc_seqno=%u, guc_id=%d, not started",
+                          xe_sched_job_seqno(job), xe_sched_job_lrc_seqno(job),
+                          q->guc->id);
+
+               return xe_sched_invalidate_job(job, 2);
+       }
+
+       ctx_timestamp = xe_lrc_ctx_timestamp(q->lrc[0]);
+       ctx_job_timestamp = xe_lrc_ctx_job_timestamp(q->lrc[0]);
+
         /*
          * Counter wraps at ~223s at the usual 19.2MHz, be paranoid catch
          * possible overflows with a high timeout.
@@ -1049,10 +1059,6 @@ guc_exec_queue_timedout_job(struct drm_sched_job *drm_job)
                 exec_queue_killed_or_banned_or_wedged(q) ||
                 exec_queue_destroyed(q);
  
-       /* Job hasn't started, can't be timed out */
-       if (!skip_timeout_check && !xe_sched_job_started(job))
-               goto rearm;
-
         /*
          * XXX: Sampling timeout doesn't work in wedged mode as we have to
          * modify scheduling state to read timestamp. We could read the
author	Matthew Brost <matthew.brost@intel.com>
	Fri, 25 Oct 2024 21:43:29 +0000 (14:43 -0700)
committer	Lucas De Marchi <lucas.demarchi@intel.com>
	Thu, 31 Oct 2024 14:03:14 +0000 (07:03 -0700)