www.infradead.org Git - users/jedix/linux-maple.git/commitdiff
drm/xe/vf: Post migration, repopulate ring area for pending request
author Tomasz Lis <tomasz.lis@intel.com>
Sat, 2 Aug 2025 03:10:43 +0000 (05:10 +0200)
committer Michał Winiarski <michal.winiarski@intel.com>
Mon, 4 Aug 2025 14:46:56 +0000 (16:46 +0200)
The commands within ring area allocated for a request may contain
references to GGTT. These references require update after VF
migration, in order to continue any preempted LRCs, or jobs which
were emitted to the ring but not sent to GuC yet.

This change calls the emit function again for all such jobs,
as part of post-migration recovery.

v2: Moved a few functions to more suitable files
v3: Take job_list_lock
v4: Rephrased comments

Signed-off-by: Tomasz Lis <tomasz.lis@intel.com>
Cc: Michal Wajdeczko <michal.wajdeczko@intel.com>
Cc: Michal Winiarski <michal.winiarski@intel.com>
Reviewed-by: Michal Winiarski <michal.winiarski@intel.com>
Cc: Jonathan Cavitt <jonathan.cavitt@intel.com>
Reviewed-by: Jonathan Cavitt <jonathan.cavitt@intel.com>
Link: https://lore.kernel.org/r/20250802031045.1127138-7-tomasz.lis@intel.com
Signed-off-by: Michał Winiarski <michal.winiarski@intel.com>
drivers/gpu/drm/xe/xe_exec_queue.c
drivers/gpu/drm/xe/xe_exec_queue.h
drivers/gpu/drm/xe/xe_guc_submit.c
drivers/gpu/drm/xe/xe_guc_submit.h
drivers/gpu/drm/xe/xe_sriov_vf.c

index 0beb6388acb02887a21cb3d9eff8ddd9ccf729df..05a1bd6b55bf42705f53a2e844b228572b5ad8ac 100644 (file)
@@ -1092,3 +1092,27 @@ void xe_exec_queue_contexts_hwsp_rebase(struct xe_exec_queue *q, void *scratch)
                xe_lrc_update_hwctx_regs_with_address(q->lrc[i]);
        }
 }
+
+/**
+ * xe_exec_queue_jobs_ring_restore - Re-emit ring commands for jobs pending on a queue.
+ * @q: the &xe_exec_queue struct instance
+ */
+void xe_exec_queue_jobs_ring_restore(struct xe_exec_queue *q)
+{
+       struct xe_sched_job *pending;
+       struct xe_gpu_scheduler *sched = &q->guc->sched;
+
+       /*
+        * Called during VF migration recovery. The job list lock is
+        * held for the whole walk over the pending list, so nothing
+        * in here is allowed to wait for a response from the GFX
+        * hardware.
+        */
+       spin_lock(&sched->base.job_list_lock);
+       list_for_each_entry(pending, &sched->base.pending_list, drm.list) {
+               /* Only re-emit jobs not reported by xe_sched_job_is_error(). */
+               if (!xe_sched_job_is_error(pending))
+                       q->ring_ops->emit_job(pending);
+       }
+       spin_unlock(&sched->base.job_list_lock);
+}
index da720197929bf59fd6781976aee1928dae17cd49..0ffc0cb03aa663f51164cd0e3a16e0ca655b2430 100644 (file)
@@ -92,4 +92,6 @@ void xe_exec_queue_update_run_ticks(struct xe_exec_queue *q);
 
 void xe_exec_queue_contexts_hwsp_rebase(struct xe_exec_queue *q, void *scratch);
 
+void xe_exec_queue_jobs_ring_restore(struct xe_exec_queue *q);
+
 #endif
index 300dac6dab31f900dfa49521c5953ae0ae4bb224..7a54890ce9d08597d4c1698d81047463e6b55328 100644 (file)
@@ -781,6 +781,30 @@ guc_exec_queue_run_job(struct drm_sched_job *drm_job)
        return fence;
 }
 
+/**
+ * xe_guc_jobs_ring_rebase - Re-emit ring commands of pending requests
+ * for every exec queue registered with a guc.
+ * @guc: the &xe_guc struct instance
+ */
+void xe_guc_jobs_ring_rebase(struct xe_guc *guc)
+{
+       unsigned long id;
+       struct xe_exec_queue *queue;
+
+       /*
+        * Part of VF migration recovery. The submission state lock is
+        * held across the whole lookup walk, so nothing in here is
+        * allowed to wait for a response from the GFX hardware.
+        */
+       mutex_lock(&guc->submission_state.lock);
+       xa_for_each(&guc->submission_state.exec_queue_lookup, id, queue) {
+               /* Killed, banned or wedged queues are skipped. */
+               if (!exec_queue_killed_or_banned_or_wedged(queue))
+                       xe_exec_queue_jobs_ring_restore(queue);
+       }
+       mutex_unlock(&guc->submission_state.lock);
+}
+
 static void guc_exec_queue_free_job(struct drm_sched_job *drm_job)
 {
        struct xe_sched_job *job = to_xe_sched_job(drm_job);
index 9a2718c81d438bdfee178e95d3b9f2f90d237e51..92a6f0ade615a8b76ffeb5d394145af1a21921eb 100644 (file)
@@ -34,6 +34,8 @@ int xe_guc_exec_queue_memory_cat_error_handler(struct xe_guc *guc, u32 *msg,
 int xe_guc_exec_queue_reset_failure_handler(struct xe_guc *guc, u32 *msg, u32 len);
 int xe_guc_error_capture_handler(struct xe_guc *guc, u32 *msg, u32 len);
 
+void xe_guc_jobs_ring_rebase(struct xe_guc *guc);
+
 struct xe_guc_submit_exec_queue_snapshot *
 xe_guc_exec_queue_snapshot_capture(struct xe_exec_queue *q);
 void
index 43ac73e432d4bc68112cbe21e84d155934522a61..a219395c15de05fcef02038f95cef67cd37670cc 100644 (file)
@@ -284,7 +284,7 @@ static int gt_vf_post_migration_fixups(struct xe_gt *gt)
                xe_tile_sriov_vf_fixup_ggtt_nodes(gt_to_tile(gt), shift);
                xe_gt_sriov_vf_default_lrcs_hwsp_rebase(gt);
                xe_guc_contexts_hwsp_rebase(&gt->uc.guc, buf);
-               /* FIXME: add the recovery steps */
+               xe_guc_jobs_ring_rebase(&gt->uc.guc);
                xe_guc_ct_fixup_messages_with_ggtt(&gt->uc.guc.ct, shift);
        }