]> www.infradead.org Git - users/hch/misc.git/commitdiff
drm/amdgpu: fix task hang from failed job submission during process kill
author Liu01 Tong <Tong.Liu01@amd.com>
Mon, 11 Aug 2025 06:52:37 +0000 (14:52 +0800)
committer Alex Deucher <alexander.deucher@amd.com>
Tue, 12 Aug 2025 18:22:54 +0000 (14:22 -0400)
When a process is killed, drm_sched_entity_flush() stops the VM
entities. Subsequent job submissions from this process then fail, but
the resources of those jobs are not released and their fences are never
signalled, causing tasks to hang and time out.

Fix this by checking the entity status in amdgpu_vm_ready() and not
submitting jobs to a stopped entity.

v2: add amdgpu_vm_ready() check before amdgpu_vm_clear_freed() in
function amdgpu_cs_vm_handling().

Signed-off-by: Liu01 Tong <Tong.Liu01@amd.com>
Signed-off-by: Lin.Cao <lincao12@amd.com>
Reviewed-by: Christian König <christian.koenig@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c

index 27f8f316f6c2813136c37d368b2c198eb70eaf9c..2ac9729e4c86d126db85eaa91831db5189bf170a 100644 (file)
@@ -1139,6 +1139,9 @@ static int amdgpu_cs_vm_handling(struct amdgpu_cs_parser *p)
                }
        }
 
+       if (!amdgpu_vm_ready(vm))
+               return -EINVAL;
+
        r = amdgpu_vm_clear_freed(adev, vm, NULL);
        if (r)
                return r;
index 39b4250ede0ffb54e11c214628bd17af4eb372f6..bd12d8ff15a42bde92ff97169286aeab847016cc 100644 (file)
@@ -654,11 +654,10 @@ int amdgpu_vm_validate(struct amdgpu_device *adev, struct amdgpu_vm *vm,
  * Check if all VM PDs/PTs are ready for updates
  *
  * Returns:
- * True if VM is not evicting.
+ * True if VM is not evicting and all VM entities are not stopped
  */
 bool amdgpu_vm_ready(struct amdgpu_vm *vm)
 {
-       bool empty;
        bool ret;
 
        amdgpu_vm_eviction_lock(vm);
@@ -666,10 +665,18 @@ bool amdgpu_vm_ready(struct amdgpu_vm *vm)
        amdgpu_vm_eviction_unlock(vm);
 
        spin_lock(&vm->status_lock);
-       empty = list_empty(&vm->evicted);
+       ret &= list_empty(&vm->evicted);
        spin_unlock(&vm->status_lock);
 
-       return ret && empty;
+       spin_lock(&vm->immediate.lock);
+       ret &= !vm->immediate.stopped;
+       spin_unlock(&vm->immediate.lock);
+
+       spin_lock(&vm->delayed.lock);
+       ret &= !vm->delayed.stopped;
+       spin_unlock(&vm->delayed.lock);
+
+       return ret;
 }
 
 /**