www.infradead.org Git - users/jedix/linux-maple.git/commitdiff
drm/amdgpu: add RAS poison consumption handler (v2)
author Tao Zhou <tao.zhou1@amd.com>
Tue, 19 Apr 2022 03:04:19 +0000 (11:04 +0800)
committer Alex Deucher <alexander.deucher@amd.com>
Fri, 22 Apr 2022 18:50:13 +0000 (14:50 -0400)
Add support for general RAS poison consumption handler.

v2: remove callback function for poison consumption.

Signed-off-by: Tao Zhou <tao.zhou1@amd.com>
Reviewed-by: Hawking Zhang <Hawking.Zhang@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h

index f10afc1ea017419079cc5e26aba1c5585b0d1909..1c86ec9ab1391cbb4329b065792f12f6d7fb649e 100644 (file)
@@ -1515,6 +1515,38 @@ static int amdgpu_ras_fs_fini(struct amdgpu_device *adev)
 /* ras fs end */
 
 /* ih begin */
+static void amdgpu_ras_interrupt_poison_consumption_handler(struct ras_manager *obj,
+                               struct amdgpu_iv_entry *entry)
+{
+       bool poison_stat = true, need_reset = true;
+       struct amdgpu_device *adev = obj->adev;
+       struct ras_err_data err_data = {0, 0, 0, NULL};
+       struct amdgpu_ras_block_object *block_obj =
+               amdgpu_ras_get_ras_block(adev, obj->head.block, 0);
+
+       if (!adev->gmc.xgmi.connected_to_cpu)
+               amdgpu_umc_poison_handler(adev, &err_data, false);
+
+       /* both query_poison_status and handle_poison_consumption are optional hooks, each called only when set; without a query hook poison_stat stays true */
+       if (block_obj && block_obj->hw_ops) {
+               if (block_obj->hw_ops->query_poison_status) {
+                       poison_stat = block_obj->hw_ops->query_poison_status(adev);
+                       if (!poison_stat)
+                               dev_info(adev->dev, "No RAS poison status in %s poison IH.\n",
+                                               block_obj->ras_comm.name);
+               }
+
+               if (poison_stat && block_obj->hw_ops->handle_poison_consumption) {
+                       poison_stat = block_obj->hw_ops->handle_poison_consumption(adev);
+                       need_reset = poison_stat;
+               }
+       }
+
+       /* gpu reset is the fallback for all failed cases: it is skipped only when handle_poison_consumption ran and returned false */
+       if (need_reset)
+               amdgpu_ras_reset_gpu(adev);
+}
+
 static void amdgpu_ras_interrupt_poison_creation_handler(struct ras_manager *obj,
                                struct amdgpu_iv_entry *entry)
 {
@@ -1567,6 +1599,8 @@ static void amdgpu_ras_interrupt_handler(struct ras_manager *obj)
                if (amdgpu_ras_is_poison_mode_supported(obj->adev)) {
                        if (obj->head.block == AMDGPU_RAS_BLOCK__UMC)
                                amdgpu_ras_interrupt_poison_creation_handler(obj, &entry);
+                       else
+                               amdgpu_ras_interrupt_poison_consumption_handler(obj, &entry);
                } else {
                        if (obj->head.block == AMDGPU_RAS_BLOCK__UMC)
                                amdgpu_ras_interrupt_umc_handler(obj, &entry);
index 606df8869b89382e95859de58eeed95d0a1ea81a..c4b61785ab5c9329ddcbef8df0e1930c1c2388f2 100644 (file)
@@ -509,6 +509,7 @@ struct amdgpu_ras_block_hw_ops {
        void (*reset_ras_error_count)(struct amdgpu_device *adev);
        void (*reset_ras_error_status)(struct amdgpu_device *adev);
        bool (*query_poison_status)(struct amdgpu_device *adev);
+       bool (*handle_poison_consumption)(struct amdgpu_device *adev);
 };
 
 /* work flow