www.infradead.org Git - users/dwmw2/linux.git/commitdiff
drm/amdkfd: pause autosuspend when creating pdd
author Jesse.zhang@amd.com <Jesse.zhang@amd.com>
Thu, 5 Dec 2024 09:41:26 +0000 (17:41 +0800)
committer Alex Deucher <alexander.deucher@amd.com>
Tue, 10 Dec 2024 15:26:18 +0000 (10:26 -0500)
When using MES, creating a pdd requires talking to the GPU to
set up the relevant context. The code here forgot to wake up the GPU
in case it was suspended; this causes KVM to EFAULT for a passthrough
GPU, for example. This issue can be masked if the GPU was woken up by
other things (e.g. opening the KMS node) first and has not yet gone to sleep.

v4: do the allocation of proc_ctx_bo in a lazy fashion
when the first queue is created in a process (Felix)

Signed-off-by: Jesse Zhang <jesse.zhang@amd.com>
Reviewed-by: Yunxiang Li <Yunxiang.Li@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
Cc: stable@vger.kernel.org
drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
drivers/gpu/drm/amd/amdkfd/kfd_process.c

index c79fe9069e220e46e9508785972458c8499393b7..16b5daaa272f13f24bef2d7f11e01270fc3e53ba 100644 (file)
@@ -207,6 +207,21 @@ static int add_queue_mes(struct device_queue_manager *dqm, struct queue *q,
        if (!down_read_trylock(&adev->reset_domain->sem))
                return -EIO;
 
+       if (!pdd->proc_ctx_cpu_ptr) {
+               r = amdgpu_amdkfd_alloc_gtt_mem(adev,
+                               AMDGPU_MES_PROC_CTX_SIZE,
+                               &pdd->proc_ctx_bo,
+                               &pdd->proc_ctx_gpu_addr,
+                               &pdd->proc_ctx_cpu_ptr,
+                               false);
+               if (r) {
+                       dev_err(adev->dev,
+                               "failed to allocate process context bo\n");
+                       return r;
+               }
+               memset(pdd->proc_ctx_cpu_ptr, 0, AMDGPU_MES_PROC_CTX_SIZE);
+       }
+
        memset(&queue_input, 0x0, sizeof(struct mes_add_queue_input));
        queue_input.process_id = qpd->pqm->process->pasid;
        queue_input.page_table_base_addr =  qpd->page_table_base;
index 87cd52cf4ee995918050b7bcd93838e894fff98c..d0ee173acf824603172ba0091e13970744498efe 100644 (file)
@@ -1076,7 +1076,8 @@ static void kfd_process_destroy_pdds(struct kfd_process *p)
 
                kfd_free_process_doorbells(pdd->dev->kfd, pdd);
 
-               if (pdd->dev->kfd->shared_resources.enable_mes)
+               if (pdd->dev->kfd->shared_resources.enable_mes &&
+                       pdd->proc_ctx_cpu_ptr)
                        amdgpu_amdkfd_free_gtt_mem(pdd->dev->adev,
                                                   &pdd->proc_ctx_bo);
                /*
@@ -1608,7 +1609,6 @@ struct kfd_process_device *kfd_create_process_device_data(struct kfd_node *dev,
                                                        struct kfd_process *p)
 {
        struct kfd_process_device *pdd = NULL;
-       int retval = 0;
 
        if (WARN_ON_ONCE(p->n_pdds >= MAX_GPU_INSTANCE))
                return NULL;
@@ -1632,21 +1632,6 @@ struct kfd_process_device *kfd_create_process_device_data(struct kfd_node *dev,
        pdd->user_gpu_id = dev->id;
        atomic64_set(&pdd->evict_duration_counter, 0);
 
-       if (dev->kfd->shared_resources.enable_mes) {
-               retval = amdgpu_amdkfd_alloc_gtt_mem(dev->adev,
-                                               AMDGPU_MES_PROC_CTX_SIZE,
-                                               &pdd->proc_ctx_bo,
-                                               &pdd->proc_ctx_gpu_addr,
-                                               &pdd->proc_ctx_cpu_ptr,
-                                               false);
-               if (retval) {
-                       dev_err(dev->adev->dev,
-                               "failed to allocate process context bo\n");
-                       goto err_free_pdd;
-               }
-               memset(pdd->proc_ctx_cpu_ptr, 0, AMDGPU_MES_PROC_CTX_SIZE);
-       }
-
        p->pdds[p->n_pdds++] = pdd;
        if (kfd_dbg_is_per_vmid_supported(pdd->dev))
                pdd->spi_dbg_override = pdd->dev->kfd2kgd->disable_debug_trap(
@@ -1658,10 +1643,6 @@ struct kfd_process_device *kfd_create_process_device_data(struct kfd_node *dev,
        idr_init(&pdd->alloc_idr);
 
        return pdd;
-
-err_free_pdd:
-       kfree(pdd);
-       return NULL;
 }
 
 /**