From: Jesse.Zhang Date: Thu, 4 Sep 2025 01:54:36 +0000 (+0800) Subject: drm/amdgpu/userq: add a detect and reset callback X-Git-Url: https://www.infradead.org/git/?a=commitdiff_plain;h=54d18bc6003fad33eb901aa5fa3425449e53d5aa;p=users%2Fhch%2Fmisc.git drm/amdgpu/userq: add a detect and reset callback Add a detect and reset callback and add the implementation for mes. The callback will detect all hung queues of a particular ip type (e.g., GFX or compute or SDMA) and reset them. v2: increase reset counter and set fence force completion v3: Removed userq_mutex in mes_userq_detect_and_reset since the driver holds it when calling Reviewed-by: Alex Deucher Signed-off-by: Jesse Zhang Signed-off-by: Alex Deucher --- diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.h index 017efd2075df..1bd84f4cce78 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.h @@ -82,6 +82,8 @@ struct amdgpu_userq_funcs { struct amdgpu_usermode_queue *queue); int (*restore)(struct amdgpu_userq_mgr *uq_mgr, struct amdgpu_usermode_queue *queue); + int (*detect_and_reset)(struct amdgpu_device *adev, + int queue_type); }; /* Usermode queues for gfx */ diff --git a/drivers/gpu/drm/amd/amdgpu/mes_userqueue.c b/drivers/gpu/drm/amd/amdgpu/mes_userqueue.c index d6f50b13e2ba..aee26f80bd53 100644 --- a/drivers/gpu/drm/amd/amdgpu/mes_userqueue.c +++ b/drivers/gpu/drm/amd/amdgpu/mes_userqueue.c @@ -21,6 +21,7 @@ * OTHER DEALINGS IN THE SOFTWARE. * */ +#include #include "amdgpu.h" #include "amdgpu_gfx.h" #include "mes_userqueue.h" @@ -198,6 +199,53 @@ static int mes_userq_create_ctx_space(struct amdgpu_userq_mgr *uq_mgr, return 0; } +static int mes_userq_detect_and_reset(struct amdgpu_device *adev, + int queue_type) +{ + int db_array_size = amdgpu_mes_get_hung_queue_db_array_size(adev); + struct mes_detect_and_reset_queue_input input; + struct amdgpu_usermode_queue *queue; + struct amdgpu_userq_mgr *uqm, *tmp; + unsigned int hung_db_num = 0; + int queue_id, r, i; + u32 db_array[4]; + + if (db_array_size > 4) { + dev_err(adev->dev, "DB array size (%d vs 4) too small\n", + db_array_size); + return -EINVAL; + } + + memset(&input, 0x0, sizeof(struct mes_detect_and_reset_queue_input)); + + input.queue_type = queue_type; + + amdgpu_mes_lock(&adev->mes); + r = amdgpu_mes_detect_and_reset_hung_queues(adev, queue_type, false, + &hung_db_num, db_array); + amdgpu_mes_unlock(&adev->mes); + if (r) { + dev_err(adev->dev, "Failed to detect and reset queues, err (%d)\n", r); + } else if (hung_db_num) { + list_for_each_entry_safe(uqm, tmp, &adev->userq_mgr_list, list) { + idr_for_each_entry(&uqm->userq_idr, queue, queue_id) { + if (queue->queue_type == queue_type) { + for (i = 0; i < hung_db_num; i++) { + if (queue->doorbell_index == db_array[i]) { + queue->state = AMDGPU_USERQ_STATE_HUNG; + atomic_inc(&adev->gpu_reset_counter); + amdgpu_userq_fence_driver_force_completion(queue); + drm_dev_wedged_event(adev_to_drm(adev), DRM_WEDGE_RECOVERY_NONE, NULL); + } + } + } + } + } + } + + return r; +} + static int mes_userq_mqd_create(struct amdgpu_userq_mgr *uq_mgr, struct drm_amdgpu_userq_in *args_in, struct amdgpu_usermode_queue *queue) @@ -352,4 +400,5 @@ const struct amdgpu_userq_funcs userq_mes_funcs = { .mqd_destroy = mes_userq_mqd_destroy, .unmap = mes_userq_unmap, .map = mes_userq_map, + .detect_and_reset = mes_userq_detect_and_reset, };