drm/amdgpu: rework how isolation is enforced v2

author Christian König <christian.koenig@amd.com>

Wed, 15 Jan 2025 12:44:26 +0000 (13:44 +0100)

committer Alex Deucher <alexander.deucher@amd.com>

Fri, 21 Mar 2025 16:16:34 +0000 (12:16 -0400)
author Christian König <christian.koenig@amd.com>
Wed, 15 Jan 2025 12:44:26 +0000 (13:44 +0100)
committer Alex Deucher <alexander.deucher@amd.com>
Fri, 21 Mar 2025 16:16:34 +0000 (12:16 -0400)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h

index 2a9a41f4e7485e2cccc4f182384f7deaeb476efa..6d83ccfa42eeb0b00ca18635a8c1225de88a06e4 100644 (file)
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -1194,9 +1194,15 @@ struct amdgpu_device {
         bool                            debug_exp_resets;
         bool                            debug_disable_gpu_ring_reset;
  
-       bool                            enforce_isolation[MAX_XCP];
-       /* Added this mutex for cleaner shader isolation between GFX and compute processes */
+       /* Protection for the following isolation structure */
         struct mutex                    enforce_isolation_mutex;
+       bool                            enforce_isolation[MAX_XCP];
+       struct amdgpu_isolation {
+               void                    *owner;
+               struct dma_fence        *spearhead;
+               struct amdgpu_sync      active;
+               struct amdgpu_sync      prev;
+       } isolation[MAX_XCP];
  
         struct amdgpu_init_level *init_lvl;
  
@@ -1482,6 +1488,9 @@ void amdgpu_device_pcie_port_wreg(struct amdgpu_device *adev,
  struct dma_fence *amdgpu_device_get_gang(struct amdgpu_device *adev);
  struct dma_fence *amdgpu_device_switch_gang(struct amdgpu_device *adev,
                                             struct dma_fence *gang);
+struct dma_fence *amdgpu_device_enforce_isolation(struct amdgpu_device *adev,
+                                                 struct amdgpu_ring *ring,
+                                                 struct amdgpu_job *job);
  bool amdgpu_device_has_display_hardware(struct amdgpu_device *adev);
  ssize_t amdgpu_get_soft_full_reset_mask(struct amdgpu_ring *ring);
  ssize_t amdgpu_show_reset_mask(char *buf, uint32_t supported_reset);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c

index 4ed5e26a13d52d3af62f0ae1147850a7aa446e36..662c83d72509ea223f5bbe0e34e6d2a1fefd3db5 100644 (file)
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -4294,6 +4294,11 @@ int amdgpu_device_init(struct amdgpu_device *adev,
         mutex_init(&adev->gfx.reset_sem_mutex);
         /* Initialize the mutex for cleaner shader isolation between GFX and compute processes */
         mutex_init(&adev->enforce_isolation_mutex);
+       for (i = 0; i < MAX_XCP; ++i) {
+               adev->isolation[i].spearhead = dma_fence_get_stub();
+               amdgpu_sync_create(&adev->isolation[i].active);
+               amdgpu_sync_create(&adev->isolation[i].prev);
+       }
         mutex_init(&adev->gfx.kfd_sch_mutex);
         mutex_init(&adev->gfx.workload_profile_mutex);
         mutex_init(&adev->vcn.workload_profile_mutex);
@@ -4799,7 +4804,7 @@ void amdgpu_device_fini_hw(struct amdgpu_device *adev)
  
  void amdgpu_device_fini_sw(struct amdgpu_device *adev)
  {
-       int idx;
+       int i, idx;
         bool px;
  
         amdgpu_device_ip_fini(adev);
@@ -4807,6 +4812,11 @@ void amdgpu_device_fini_sw(struct amdgpu_device *adev)
         amdgpu_ucode_release(&adev->firmware.gpu_info_fw);
         adev->accel_working = false;
         dma_fence_put(rcu_dereference_protected(adev->gang_submit, true));
+       for (i = 0; i < MAX_XCP; ++i) {
+               dma_fence_put(adev->isolation[i].spearhead);
+               amdgpu_sync_free(&adev->isolation[i].active);
+               amdgpu_sync_free(&adev->isolation[i].prev);
+       }
  
         amdgpu_reset_fini(adev);
  
@@ -6953,6 +6963,92 @@ struct dma_fence *amdgpu_device_switch_gang(struct amdgpu_device *adev,
         return NULL;
  }
  
+/**
+ * amdgpu_device_enforce_isolation - enforce HW isolation
+ * @adev: the amdgpu device pointer
+ * @ring: the HW ring the job is supposed to run on
+ * @job: the job which is about to be pushed to the HW ring
+ *
+ * Makes sure that only one client at a time can use the GFX block.
+ * Returns: The dependency to wait on before the job can be pushed to the HW.
+ * The function is called multiple times until NULL is returned.
+ */
+struct dma_fence *amdgpu_device_enforce_isolation(struct amdgpu_device *adev,
+                                                 struct amdgpu_ring *ring,
+                                                 struct amdgpu_job *job)
+{
+       struct amdgpu_isolation *isolation = &adev->isolation[ring->xcp_id];
+       struct drm_sched_fence *f = job->base.s_fence;
+       struct dma_fence *dep;
+       void *owner;
+       int r;
+
+       /*
+        * For now enforce isolation only for the GFX block since we only need
+        * the cleaner shader on those rings.
+        */
+       if (ring->funcs->type != AMDGPU_RING_TYPE_GFX &&
+           ring->funcs->type != AMDGPU_RING_TYPE_COMPUTE)
+               return NULL;
+
+       /*
+        * All submissions where enforce isolation is false are handled as if
+        * they come from a single client. Use ~0l as the owner to distinct it
+        * from kernel submissions where the owner is NULL.
+        */
+       owner = job->enforce_isolation ? f->owner : (void *)~0l;
+
+       mutex_lock(&adev->enforce_isolation_mutex);
+
+       /*
+        * The "spearhead" submission is the first one which changes the
+        * ownership to its client. We always need to wait for it to be
+        * pushed to the HW before proceeding with anything.
+        */
+       if (&f->scheduled != isolation->spearhead &&
+           !dma_fence_is_signaled(isolation->spearhead)) {
+               dep = isolation->spearhead;
+               goto out_grab_ref;
+       }
+
+       if (isolation->owner != owner) {
+
+               /*
+                * Wait for any gang to be assembled before switching to a
+                * different owner or otherwise we could deadlock the
+                * submissions.
+                */
+               if (!job->gang_submit) {
+                       dep = amdgpu_device_get_gang(adev);
+                       if (!dma_fence_is_signaled(dep))
+                               goto out_return_dep;
+                       dma_fence_put(dep);
+               }
+
+               dma_fence_put(isolation->spearhead);
+               isolation->spearhead = dma_fence_get(&f->scheduled);
+               amdgpu_sync_move(&isolation->active, &isolation->prev);
+               isolation->owner = owner;
+       }
+
+       /*
+        * Specifying the ring here helps to pipeline submissions even when
+        * isolation is enabled. If that is not desired for testing NULL can be
+        * used instead of the ring to enforce a CPU round trip while switching
+        * between clients.
+        */
+       dep = amdgpu_sync_peek_fence(&isolation->prev, ring);
+       r = amdgpu_sync_fence(&isolation->active, &f->finished, GFP_NOWAIT);
+       if (r)
+               DRM_WARN("OOM tracking isolation\n");
+
+out_grab_ref:
+       dma_fence_get(dep);
+out_return_dep:
+       mutex_unlock(&adev->enforce_isolation_mutex);
+       return dep;
+}
+
  bool amdgpu_device_has_display_hardware(struct amdgpu_device *adev)
  {
         switch (adev->asic_type) {
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ids.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ids.c

index 56d27cea052ed2a7b77a2fc1f494317762df5cf9..92ab821afc06ae561929cfed74c72ad0ac8d87b2 100644 (file)
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ids.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ids.c
@@ -287,40 +287,27 @@ static int amdgpu_vmid_grab_reserved(struct amdgpu_vm *vm,
             (*id)->flushed_updates < updates ||
             !(*id)->last_flush ||
             ((*id)->last_flush->context != fence_context &&
-            !dma_fence_is_signaled((*id)->last_flush))) {
+            !dma_fence_is_signaled((*id)->last_flush)))
+               needs_flush = true;
+
+       if ((*id)->owner != vm->immediate.fence_context ||
+           (!adev->vm_manager.concurrent_flush && needs_flush)) {
                 struct dma_fence *tmp;
  
-               /* Wait for the gang to be assembled before using a
-                * reserved VMID or otherwise the gang could deadlock.
+               /* Don't use per engine and per process VMID at the
+                * same time
                  */
-               tmp = amdgpu_device_get_gang(adev);
-               if (!dma_fence_is_signaled(tmp) && tmp != job->gang_submit) {
+               if (adev->vm_manager.concurrent_flush)
+                       ring = NULL;
+
+               /* to prevent one context starved by another context */
+               (*id)->pd_gpu_addr = 0;
+               tmp = amdgpu_sync_peek_fence(&(*id)->active, ring);
+               if (tmp) {
                         *id = NULL;
-                       *fence = tmp;
+                       *fence = dma_fence_get(tmp);
                         return 0;
                 }
-               dma_fence_put(tmp);
-
-               /* Make sure the id is owned by the gang before proceeding */
-               if (!job->gang_submit ||
-                   (*id)->owner != vm->immediate.fence_context) {
-
-                       /* Don't use per engine and per process VMID at the
-                        * same time
-                        */
-                       if (adev->vm_manager.concurrent_flush)
-                               ring = NULL;
-
-                       /* to prevent one context starved by another context */
-                       (*id)->pd_gpu_addr = 0;
-                       tmp = amdgpu_sync_peek_fence(&(*id)->active, ring);
-                       if (tmp) {
-                               *id = NULL;
-                               *fence = dma_fence_get(tmp);
-                               return 0;
-                       }
-               }
-               needs_flush = true;
         }
  
         /* Good we can use this VMID. Remember this submission as
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c

index 935df2cdcc16c873aaab6a5fba23ee321c11d45f..acb21fc8b3ce5d8c745e3e3384bd9ec24b91be3f 100644 (file)
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
@@ -361,17 +361,24 @@ amdgpu_job_prepare_job(struct drm_sched_job *sched_job,
  {
         struct amdgpu_ring *ring = to_amdgpu_ring(s_entity->rq->sched);
         struct amdgpu_job *job = to_amdgpu_job(sched_job);
-       struct dma_fence *fence = NULL;
+       struct dma_fence *fence;
         int r;
  
         r = drm_sched_entity_error(s_entity);
         if (r)
                 goto error;
  
-       if (job->gang_submit)
+       if (job->gang_submit) {
                 fence = amdgpu_device_switch_gang(ring->adev, job->gang_submit);
+               if (fence)
+                       return fence;
+       }
+
+       fence = amdgpu_device_enforce_isolation(ring->adev, ring, job);
+       if (fence)
+               return fence;
  
-       if (!fence && job->vm && !job->vmid) {
+       if (job->vm && !job->vmid) {
                 r = amdgpu_vmid_grab(job->vm, ring, job, &fence);
                 if (r) {
                         dev_err(ring->adev->dev, "Error getting VM ID (%d)\n", r);
@@ -384,9 +391,10 @@ amdgpu_job_prepare_job(struct drm_sched_job *sched_job,
                  */
                 if (!fence)
                         job->vm = NULL;
+               return fence;
         }
  
-       return fence;
+       return NULL;
  
  error:
         dma_fence_set_error(&job->base.s_fence->finished, r);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_sync.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_sync.c

index bfe12164d27da7546844afa8bdb30671c57becab..c5f9db6b32a496b2cb1b949a429b20240dfde9a8 100644 (file)
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_sync.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_sync.c
@@ -405,6 +405,25 @@ int amdgpu_sync_clone(struct amdgpu_sync *source, struct amdgpu_sync *clone)
         return 0;
  }
  
+/**
+ * amdgpu_sync_move - move all fences from src to dst
+ *
+ * @src: source of the fences, empty after function
+ * @dst: destination for the fences
+ *
+ * Moves all fences from source to destination. All fences in destination are
+ * freed and source is empty after the function call.
+ */
+void amdgpu_sync_move(struct amdgpu_sync *src, struct amdgpu_sync *dst)
+{
+       unsigned int i;
+
+       amdgpu_sync_free(dst);
+
+       for (i = 0; i < HASH_SIZE(src->fences); ++i)
+               hlist_move_list(&src->fences[i], &dst->fences[i]);
+}
+
  /**
   * amdgpu_sync_push_to_job - push fences into job
   * @sync: sync object to get the fences from
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_sync.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_sync.h

index 1504f5e7fc46598f7595250c0eefb38a6d5177cd..51eb4382c91ebd216b3be4ae80d8923ff9c5c8db 100644 (file)
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_sync.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_sync.h
@@ -57,6 +57,7 @@ struct dma_fence *amdgpu_sync_peek_fence(struct amdgpu_sync *sync,
                                      struct amdgpu_ring *ring);
  struct dma_fence *amdgpu_sync_get_fence(struct amdgpu_sync *sync);
  int amdgpu_sync_clone(struct amdgpu_sync *source, struct amdgpu_sync *clone);
+void amdgpu_sync_move(struct amdgpu_sync *src, struct amdgpu_sync *dst);
  int amdgpu_sync_push_to_job(struct amdgpu_sync *sync, struct amdgpu_job *job);
  int amdgpu_sync_wait(struct amdgpu_sync *sync, bool intr);
  void amdgpu_sync_free(struct amdgpu_sync *sync);
author	Christian König <christian.koenig@amd.com>
	Wed, 15 Jan 2025 12:44:26 +0000 (13:44 +0100)
committer	Alex Deucher <alexander.deucher@amd.com>
	Fri, 21 Mar 2025 16:16:34 +0000 (12:16 -0400)
drivers/gpu/drm/amd/amdgpu/amdgpu.h		patch \| blob \| history
drivers/gpu/drm/amd/amdgpu/amdgpu_device.c		patch \| blob \| history
drivers/gpu/drm/amd/amdgpu/amdgpu_ids.c		patch \| blob \| history
drivers/gpu/drm/amd/amdgpu/amdgpu_job.c		patch \| blob \| history
drivers/gpu/drm/amd/amdgpu/amdgpu_sync.c		patch \| blob \| history
drivers/gpu/drm/amd/amdgpu/amdgpu_sync.h		patch \| blob \| history