drm/xe/vf: Post migration, repopulate ring area for pending request

author Tomasz Lis <tomasz.lis@intel.com>

Sat, 2 Aug 2025 03:10:43 +0000 (05:10 +0200)

committer Michał Winiarski <michal.winiarski@intel.com>

Mon, 4 Aug 2025 14:46:56 +0000 (16:46 +0200)
author Tomasz Lis <tomasz.lis@intel.com>
Sat, 2 Aug 2025 03:10:43 +0000 (05:10 +0200)
committer Michał Winiarski <michal.winiarski@intel.com>
Mon, 4 Aug 2025 14:46:56 +0000 (16:46 +0200)
diff --git a/drivers/gpu/drm/xe/xe_exec_queue.c b/drivers/gpu/drm/xe/xe_exec_queue.c

index 0beb6388acb02887a21cb3d9eff8ddd9ccf729df..05a1bd6b55bf42705f53a2e844b228572b5ad8ac 100644 (file)
--- a/drivers/gpu/drm/xe/xe_exec_queue.c
+++ b/drivers/gpu/drm/xe/xe_exec_queue.c
@@ -1092,3 +1092,27 @@ void xe_exec_queue_contexts_hwsp_rebase(struct xe_exec_queue *q, void *scratch)
                 xe_lrc_update_hwctx_regs_with_address(q->lrc[i]);
         }
  }
+
+/**
+ * xe_exec_queue_jobs_ring_restore - Re-emit ring commands of requests pending on given queue.
+ * @q: the &xe_exec_queue struct instance whose pending jobs are re-emitted
+ */
+void xe_exec_queue_jobs_ring_restore(struct xe_exec_queue *q)
+{
+       struct xe_gpu_scheduler *sched = &q->guc->sched;
+       struct xe_sched_job *job;
+
+       /*
+        * Used within VF migration recovery. job_list_lock is held across
+        * the whole loop, so we cannot wait for any GFX HW response here.
+        * Jobs for which xe_sched_job_is_error() is true are not re-emitted.
+        */
+       spin_lock(&sched->base.job_list_lock);
+       list_for_each_entry(job, &sched->base.pending_list, drm.list) {
+               if (xe_sched_job_is_error(job))
+                       continue;
+
+               q->ring_ops->emit_job(job);
+       }
+       spin_unlock(&sched->base.job_list_lock);
+}
diff --git a/drivers/gpu/drm/xe/xe_exec_queue.h b/drivers/gpu/drm/xe/xe_exec_queue.h

index da720197929bf59fd6781976aee1928dae17cd49..0ffc0cb03aa663f51164cd0e3a16e0ca655b2430 100644 (file)
--- a/drivers/gpu/drm/xe/xe_exec_queue.h
+++ b/drivers/gpu/drm/xe/xe_exec_queue.h
@@ -92,4 +92,6 @@ void xe_exec_queue_update_run_ticks(struct xe_exec_queue *q);
  
  void xe_exec_queue_contexts_hwsp_rebase(struct xe_exec_queue *q, void *scratch);
  
+void xe_exec_queue_jobs_ring_restore(struct xe_exec_queue *q);
+
  #endif
diff --git a/drivers/gpu/drm/xe/xe_guc_submit.c b/drivers/gpu/drm/xe/xe_guc_submit.c

index 300dac6dab31f900dfa49521c5953ae0ae4bb224..7a54890ce9d08597d4c1698d81047463e6b55328 100644 (file)
--- a/drivers/gpu/drm/xe/xe_guc_submit.c
+++ b/drivers/gpu/drm/xe/xe_guc_submit.c
@@ -781,6 +781,30 @@ guc_exec_queue_run_job(struct drm_sched_job *drm_job)
         return fence;
  }
  
+/**
+ * xe_guc_jobs_ring_rebase - Re-emit ring commands of requests pending
+ * on all queues under a guc.
+ * @guc: the &xe_guc struct instance whose exec queue lookup is walked
+ */
+void xe_guc_jobs_ring_rebase(struct xe_guc *guc)
+{
+       struct xe_exec_queue *q;
+       unsigned long index;
+
+       /*
+        * Used within VF migration recovery. submission_state.lock is held
+        * over the whole walk, so no waiting for GFX HW responses here.
+        * Queues reported as killed, banned or wedged are left untouched.
+        */
+       mutex_lock(&guc->submission_state.lock);
+       xa_for_each(&guc->submission_state.exec_queue_lookup, index, q) {
+               if (exec_queue_killed_or_banned_or_wedged(q))
+                       continue;
+               xe_exec_queue_jobs_ring_restore(q);
+       }
+       mutex_unlock(&guc->submission_state.lock);
+}
+
  static void guc_exec_queue_free_job(struct drm_sched_job *drm_job)
  {
         struct xe_sched_job *job = to_xe_sched_job(drm_job);
diff --git a/drivers/gpu/drm/xe/xe_guc_submit.h b/drivers/gpu/drm/xe/xe_guc_submit.h

index 9a2718c81d438bdfee178e95d3b9f2f90d237e51..92a6f0ade615a8b76ffeb5d394145af1a21921eb 100644 (file)
--- a/drivers/gpu/drm/xe/xe_guc_submit.h
+++ b/drivers/gpu/drm/xe/xe_guc_submit.h
@@ -34,6 +34,8 @@ int xe_guc_exec_queue_memory_cat_error_handler(struct xe_guc *guc, u32 *msg,
  int xe_guc_exec_queue_reset_failure_handler(struct xe_guc *guc, u32 *msg, u32 len);
  int xe_guc_error_capture_handler(struct xe_guc *guc, u32 *msg, u32 len);
  
+void xe_guc_jobs_ring_rebase(struct xe_guc *guc);
+
  struct xe_guc_submit_exec_queue_snapshot *
  xe_guc_exec_queue_snapshot_capture(struct xe_exec_queue *q);
  void
diff --git a/drivers/gpu/drm/xe/xe_sriov_vf.c b/drivers/gpu/drm/xe/xe_sriov_vf.c

index 43ac73e432d4bc68112cbe21e84d155934522a61..a219395c15de05fcef02038f95cef67cd37670cc 100644 (file)
--- a/drivers/gpu/drm/xe/xe_sriov_vf.c
+++ b/drivers/gpu/drm/xe/xe_sriov_vf.c
@@ -284,7 +284,7 @@ static int gt_vf_post_migration_fixups(struct xe_gt *gt)
                 xe_tile_sriov_vf_fixup_ggtt_nodes(gt_to_tile(gt), shift);
                 xe_gt_sriov_vf_default_lrcs_hwsp_rebase(gt);
                 xe_guc_contexts_hwsp_rebase(&gt->uc.guc, buf);
-               /* FIXME: add the recovery steps */
+               xe_guc_jobs_ring_rebase(&gt->uc.guc);
                 xe_guc_ct_fixup_messages_with_ggtt(&gt->uc.guc.ct, shift);
         }
author	Tomasz Lis <tomasz.lis@intel.com>
	Sat, 2 Aug 2025 03:10:43 +0000 (05:10 +0200)
committer	Michał Winiarski <michal.winiarski@intel.com>
	Mon, 4 Aug 2025 14:46:56 +0000 (16:46 +0200)
drivers/gpu/drm/xe/xe_exec_queue.c		patch \| blob \| history
drivers/gpu/drm/xe/xe_exec_queue.h		patch \| blob \| history
drivers/gpu/drm/xe/xe_guc_submit.c		patch \| blob \| history
drivers/gpu/drm/xe/xe_guc_submit.h		patch \| blob \| history
drivers/gpu/drm/xe/xe_sriov_vf.c		patch \| blob \| history