drm/amdkfd: Ensure user queue buffer residency
author     Philip Yang <Philip.Yang@amd.com>
           Thu, 20 Jun 2024 16:31:36 +0000 (12:31 -0400)
committer  Alex Deucher <alexander.deucher@amd.com>
           Tue, 23 Jul 2024 21:42:54 +0000 (17:42 -0400)
Add a queue_refcount to struct amdgpu_bo_va, protected by the VM
reservation lock, and return -EBUSY to fail unmapping a BO from the GPU
if the bo_va queue_refcount is not zero.

Queue creation increases the bo_va queue_refcount and queue destruction
decreases it, to ensure the queue buffers stay mapped on the GPU while
the queue is active.
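A minimal sketch of the lifetime rule this enforces, using simplified,
hypothetical types (the real count lives in struct amdgpu_bo_va and is
serialized by the VM reservation lock, not by an atomic type):

    #include <linux/errno.h>

    struct bo_va_sketch {
            unsigned int queue_refcount;    /* serialized by VM reservation lock */
    };

    /* Queue create: take a reference so the mapping cannot be torn down. */
    static void queue_buffer_get(struct bo_va_sketch *bo_va)
    {
            bo_va->queue_refcount++;
    }

    /* Queue destroy: drop the reference taken at create time. */
    static void queue_buffer_put(struct bo_va_sketch *bo_va)
    {
            bo_va->queue_refcount--;
    }

    /* Unmap: refuse while any active queue still uses the buffer. */
    static int try_unmap(struct bo_va_sketch *bo_va)
    {
            if (bo_va->queue_refcount)
                    return -EBUSY;  /* caller must destroy queues first */
            /* ... proceed with the actual unmap and page-table update ... */
            return 0;
    }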

Signed-off-by: Philip Yang <Philip.Yang@amd.com>
Reviewed-by: Felix Kuehling <felix.kuehling@amd.com>
Acked-by: Christian König <christian.koenig@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
drivers/gpu/drm/amd/amdgpu/amdgpu_object.h
drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
drivers/gpu/drm/amd/amdkfd/kfd_priv.h
drivers/gpu/drm/amd/amdkfd/kfd_queue.c

index 0ab37e7aec265625017b0e02bbe2a546a09dcd55..6d5fd371d5ce86b476c9597fdbba9ed48edaf940 100644
@@ -1252,7 +1252,7 @@ static int unreserve_bo_and_vms(struct bo_vm_reservation_context *ctx,
        return ret;
 }
 
-static void unmap_bo_from_gpuvm(struct kgd_mem *mem,
+static int unmap_bo_from_gpuvm(struct kgd_mem *mem,
                                struct kfd_mem_attachment *entry,
                                struct amdgpu_sync *sync)
 {
@@ -1260,11 +1260,18 @@ static void unmap_bo_from_gpuvm(struct kgd_mem *mem,
        struct amdgpu_device *adev = entry->adev;
        struct amdgpu_vm *vm = bo_va->base.vm;
 
+       if (bo_va->queue_refcount) {
+               pr_debug("bo_va->queue_refcount %d\n", bo_va->queue_refcount);
+               return -EBUSY;
+       }
+
        amdgpu_vm_bo_unmap(adev, bo_va, entry->va);
 
        amdgpu_vm_clear_freed(adev, vm, &bo_va->last_pt_update);
 
        amdgpu_sync_fence(sync, bo_va->last_pt_update);
+
+       return 0;
 }
 
 static int update_gpuvm_pte(struct kgd_mem *mem,
@@ -2191,7 +2198,10 @@ int amdgpu_amdkfd_gpuvm_unmap_memory_from_gpu(
                pr_debug("\t unmap VA 0x%llx - 0x%llx from entry %p\n",
                         entry->va, entry->va + bo_size, entry);
 
-               unmap_bo_from_gpuvm(mem, entry, ctx.sync);
+               ret = unmap_bo_from_gpuvm(mem, entry, ctx.sync);
+               if (ret)
+                       goto unreserve_out;
+
                entry->is_mapped = false;
 
                mem->mapped_to_gpu_memory--;
index bc42ccbde659ac5ef1854b3a90d5561916faf422..d7e27957013f32c95080143699707aa38b2a58f4 100644
@@ -90,6 +90,12 @@ struct amdgpu_bo_va {
        bool                            cleared;
 
        bool                            is_xgmi;
+
+       /*
+        * protected by vm reservation lock
+        * if non-zero, cannot unmap from GPU because user queues may still access it
+        */
+       unsigned int                    queue_refcount;
 };
 
 struct amdgpu_bo {
index 202f24ee4bd7dc0811a03e7087effee5e41b82d4..65a37ac5a0f0bb56bd990bdd46b0ad935b7a27e9 100644
@@ -1384,8 +1384,7 @@ static int kfd_ioctl_unmap_memory_from_gpu(struct file *filep,
                err = amdgpu_amdkfd_gpuvm_unmap_memory_from_gpu(
                        peer_pdd->dev->adev, (struct kgd_mem *)mem, peer_pdd->drm_priv);
                if (err) {
-                       pr_err("Failed to unmap from gpu %d/%d\n",
-                              i, args->n_devices);
+                       pr_debug("Failed to unmap from gpu %d/%d\n", i, args->n_devices);
                        goto unmap_memory_from_gpu_failed;
                }
                args->n_success = i+1;
index 80d8080c576434e39b4fdd69ef066b7ee0bcba48..c31589043d5bd21cc430f36f2220264f508442bb 100644
@@ -1292,6 +1292,7 @@ void print_queue_properties(struct queue_properties *q);
 void print_queue(struct queue *q);
 int kfd_queue_buffer_get(struct amdgpu_vm *vm, void __user *addr, struct amdgpu_bo **pbo,
                         u64 expected_size);
+void kfd_queue_buffer_put(struct amdgpu_vm *vm, struct amdgpu_bo **bo);
 int kfd_queue_acquire_buffers(struct kfd_process_device *pdd, struct queue_properties *properties);
 int kfd_queue_release_buffers(struct kfd_process_device *pdd, struct queue_properties *properties);
 
index 0e661160c295a04cfbd5366eb47ca4328760530d..3fd386dcb01163094b7ece46e2dff1f7ee713536 100644
@@ -106,6 +106,7 @@ int kfd_queue_buffer_get(struct amdgpu_vm *vm, void __user *addr, struct amdgpu_
        }
 
        *pbo = amdgpu_bo_ref(mapping->bo_va->base.bo);
+       mapping->bo_va->queue_refcount++;
        return 0;
 
 out_err:
@@ -113,6 +114,19 @@ out_err:
        return -EINVAL;
 }
 
+void kfd_queue_buffer_put(struct amdgpu_vm *vm, struct amdgpu_bo **bo)
+{
+       if (*bo) {
+               struct amdgpu_bo_va *bo_va;
+
+               bo_va = amdgpu_vm_bo_find(vm, *bo);
+               if (bo_va)
+                       bo_va->queue_refcount--;
+       }
+
+       amdgpu_bo_unref(bo);
+}
+
 int kfd_queue_acquire_buffers(struct kfd_process_device *pdd, struct queue_properties *properties)
 {
        struct amdgpu_vm *vm;
@@ -166,10 +180,20 @@ out_err_unreserve:
 
 int kfd_queue_release_buffers(struct kfd_process_device *pdd, struct queue_properties *properties)
 {
-       amdgpu_bo_unref(&properties->wptr_bo);
-       amdgpu_bo_unref(&properties->rptr_bo);
-       amdgpu_bo_unref(&properties->ring_bo);
-       amdgpu_bo_unref(&properties->eop_buf_bo);
-       amdgpu_bo_unref(&properties->cwsr_bo);
+       struct amdgpu_vm *vm;
+       int err;
+
+       vm = drm_priv_to_vm(pdd->drm_priv);
+       err = amdgpu_bo_reserve(vm->root.bo, false);
+       if (err)
+               return err;
+
+       kfd_queue_buffer_put(vm, &properties->wptr_bo);
+       kfd_queue_buffer_put(vm, &properties->rptr_bo);
+       kfd_queue_buffer_put(vm, &properties->ring_bo);
+       kfd_queue_buffer_put(vm, &properties->eop_buf_bo);
+       kfd_queue_buffer_put(vm, &properties->cwsr_bo);
+
+       amdgpu_bo_unreserve(vm->root.bo);
        return 0;
 }