www.infradead.org Git - users/hch/misc.git/commitdiff
drm/amdgpu/mes: add front end for detect and reset hung queue
author Jesse.Zhang <Jesse.Zhang@amd.com>
Thu, 4 Sep 2025 01:39:34 +0000 (09:39 +0800)
committer Alex Deucher <alexander.deucher@amd.com>
Fri, 5 Sep 2025 21:38:31 +0000 (17:38 -0400)
Add a helper function to detect and reset hung queues.  MES returns
an array of doorbell indices identifying the queues that are hung
and, optionally, were reset.

v2:  Clear the doorbell array before detection

Reviewed-by: Alex Deucher <alexander.deucher@amd.com>
Signed-off-by: Jesse Zhang <Jesse.Zhang@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c
drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h

index 135598502c8d01c0d65ebf741e5b8c754acc0c75..5bf9be073cddf35b80b1824ebf47529457ca1b8c 100644 (file)
@@ -191,6 +191,20 @@ int amdgpu_mes_init(struct amdgpu_device *adev)
        if (r)
                goto error_doorbell;
 
+       if (adev->mes.hung_queue_db_array_size) {
+               r = amdgpu_bo_create_kernel(adev,
+                                           adev->mes.hung_queue_db_array_size * sizeof(u32),
+                                           PAGE_SIZE,
+                                           AMDGPU_GEM_DOMAIN_GTT,
+                                           &adev->mes.hung_queue_db_array_gpu_obj,
+                                           &adev->mes.hung_queue_db_array_gpu_addr,
+                                           &adev->mes.hung_queue_db_array_cpu_addr);
+               if (r) {
+                       dev_warn(adev->dev, "failed to create MES hung db array buffer (%d)", r);
+                       goto error_doorbell;
+               }
+       }
+
        return 0;
 
 error_doorbell:
@@ -216,6 +230,10 @@ void amdgpu_mes_fini(struct amdgpu_device *adev)
 {
        int i;
 
+       amdgpu_bo_free_kernel(&adev->mes.hung_queue_db_array_gpu_obj,
+                             &adev->mes.hung_queue_db_array_gpu_addr,
+                             &adev->mes.hung_queue_db_array_cpu_addr);
+
        amdgpu_bo_free_kernel(&adev->mes.event_log_gpu_obj,
                              &adev->mes.event_log_gpu_addr,
                              &adev->mes.event_log_cpu_addr);
@@ -366,6 +384,53 @@ int amdgpu_mes_reset_legacy_queue(struct amdgpu_device *adev,
        return r;
 }
 
+/*
+ * Report the number of u32 entries in the MES hung-queue doorbell array,
+ * as recorded in adev->mes.hung_queue_db_array_size.
+ */
+int amdgpu_mes_get_hung_queue_db_array_size(struct amdgpu_device *adev)
+{
+       const struct amdgpu_mes *mes = &adev->mes;
+
+       return mes->hung_queue_db_array_size;
+}
+
+/*
+ * amdgpu_mes_detect_and_reset_hung_queues - query MES for hung queues
+ *
+ * @adev: amdgpu device
+ * @queue_type: AMDGPU_RING_TYPE_GFX, AMDGPU_RING_TYPE_COMPUTE or
+ *              AMDGPU_RING_TYPE_SDMA; anything else is rejected
+ * @detect_only: copied into the request handed to the MES backend
+ *               (presumably "report without resetting" - confirm against
+ *               the backend implementation)
+ * @hung_db_num: written on success only; number of entries stored in
+ *               @hung_db_array
+ * @hung_db_array: written on success only; receives the reported doorbell
+ *                 values packed from index 0.  Must have room for
+ *                 adev->mes.hung_queue_db_array_size entries.
+ *
+ * The shared doorbell array is pre-filled with AMDGPU_MES_INVALID_DB_OFFSET,
+ * the backend callback is invoked, and every slot that no longer holds the
+ * sentinel is copied out to the caller.
+ *
+ * Returns 0 on success, -EINVAL on a NULL output pointer or unsupported
+ * queue type, -EOPNOTSUPP when the backend provides no
+ * detect_and_reset_hung_queues callback, or the error returned by the
+ * callback.
+ */
+int amdgpu_mes_detect_and_reset_hung_queues(struct amdgpu_device *adev,
+                                           int queue_type,
+                                           bool detect_only,
+                                           unsigned int *hung_db_num,
+                                           u32 *hung_db_array)
+
+{
+       struct mes_detect_and_reset_queue_input input = {
+               .queue_type = queue_type,
+               .detect_only = detect_only,
+       };
+       u32 *db_array = adev->mes.hung_queue_db_array_cpu_addr;
+       unsigned int found = 0;
+       int r, i;
+
+       if (!hung_db_num || !hung_db_array)
+               return -EINVAL;
+
+       if ((queue_type != AMDGPU_RING_TYPE_GFX) &&
+           (queue_type != AMDGPU_RING_TYPE_COMPUTE) &&
+           (queue_type != AMDGPU_RING_TYPE_SDMA))
+               return -EINVAL;
+
+       /* The callback is optional per backend; do not call through NULL. */
+       if (!adev->mes.funcs->detect_and_reset_hung_queues)
+               return -EOPNOTSUPP;
+
+       /*
+        * Reset every slot to the "no doorbell" sentinel before detection.
+        * Clearing to 0 would make each untouched slot look like a hung
+        * queue at doorbell 0, because the scan below only skips
+        * AMDGPU_MES_INVALID_DB_OFFSET.  A plain loop also never touches
+        * db_array when the array size is 0 and no buffer was allocated.
+        */
+       for (i = 0; i < adev->mes.hung_queue_db_array_size; i++)
+               db_array[i] = AMDGPU_MES_INVALID_DB_OFFSET;
+
+       r = adev->mes.funcs->detect_and_reset_hung_queues(&adev->mes,
+                                                         &input);
+       if (r) {
+               dev_err(adev->dev, "failed to detect and reset\n");
+               return r;
+       }
+
+       /*
+        * Pack the hits from index 0 so that the first *hung_db_num entries
+        * of hung_db_array are all valid, wherever they sat in db_array.
+        */
+       for (i = 0; i < adev->mes.hung_queue_db_array_size; i++) {
+               if (db_array[i] == AMDGPU_MES_INVALID_DB_OFFSET)
+                       continue;
+               hung_db_array[found++] = db_array[i];
+       }
+       *hung_db_num = found;
+
+       return 0;
+}
+
 uint32_t amdgpu_mes_rreg(struct amdgpu_device *adev, uint32_t reg)
 {
        struct mes_misc_op_input op_input;
index 489a4a0f0610598e3ecc23471a4ca913e07b91ba..6b506fc72f58e073cc092d8841e605e397e2b565 100644 (file)
@@ -41,6 +41,7 @@
 #define AMDGPU_MES_API_VERSION_MASK    0x00fff000
 #define AMDGPU_MES_FEAT_VERSION_MASK   0xff000000
 #define AMDGPU_MES_MSCRATCH_SIZE       0x40000
+#define AMDGPU_MES_INVALID_DB_OFFSET   0xffffffff
 
 enum amdgpu_mes_priority_level {
        AMDGPU_MES_PRIORITY_LEVEL_LOW       = 0,
@@ -147,6 +148,10 @@ struct amdgpu_mes {
        uint64_t            resource_1_gpu_addr[AMDGPU_MAX_MES_PIPES];
        void                *resource_1_addr[AMDGPU_MAX_MES_PIPES];
 
+       int                             hung_queue_db_array_size;
+       struct amdgpu_bo                *hung_queue_db_array_gpu_obj;
+       uint64_t                        hung_queue_db_array_gpu_addr;
+       void                            *hung_queue_db_array_cpu_addr;
 };
 
 struct amdgpu_mes_gang {
@@ -280,6 +285,11 @@ struct mes_reset_queue_input {
        bool                               is_kq;
 };
 
+struct mes_detect_and_reset_queue_input { /* request for the detect_and_reset_hung_queues MES callback */
+       uint32_t                           queue_type;  /* ring type to examine; the front end passes an AMDGPU_RING_TYPE_* value */
+       bool                               detect_only; /* forwarded from the caller; presumably "detect without resetting" - confirm in backend */
+};
+
 struct mes_inv_tlbs_pasid_input {
        uint32_t        xcc_id;
        uint16_t        pasid;
@@ -375,6 +385,10 @@ struct amdgpu_mes_funcs {
        int (*reset_hw_queue)(struct amdgpu_mes *mes,
                              struct mes_reset_queue_input *input);
 
+       int (*detect_and_reset_hung_queues)(struct amdgpu_mes *mes,
+                             struct mes_detect_and_reset_queue_input *input);
+
+
        int (*invalidate_tlbs_pasid)(struct amdgpu_mes *mes,
                              struct mes_inv_tlbs_pasid_input *input);
 };
@@ -400,6 +414,13 @@ int amdgpu_mes_reset_legacy_queue(struct amdgpu_device *adev,
                                  unsigned int vmid,
                                  bool use_mmio);
 
+int amdgpu_mes_get_hung_queue_db_array_size(struct amdgpu_device *adev);
+int amdgpu_mes_detect_and_reset_hung_queues(struct amdgpu_device *adev,
+                                           int queue_type,
+                                           bool detect_only,
+                                           unsigned int *hung_db_num,
+                                           u32 *hung_db_array);
+
 uint32_t amdgpu_mes_rreg(struct amdgpu_device *adev, uint32_t reg);
 int amdgpu_mes_wreg(struct amdgpu_device *adev,
                    uint32_t reg, uint32_t val);