www.infradead.org Git - users/willy/xarray.git/commitdiff
drm/amdgpu: harvest edc status when connected to host via xGMI
authorDennis Li <Dennis.Li@amd.com>
Thu, 4 Feb 2021 05:32:05 +0000 (13:32 +0800)
committerAlex Deucher <alexander.deucher@amd.com>
Wed, 24 Mar 2021 03:00:41 +0000 (23:00 -0400)
When connected to a host via xGMI, system fatal errors may trigger a
warm reset, so the driver has no chance to query edc status before reset.
Therefore in this case, the driver should harvest previous error logging
registers during boot, instead of only resetting them.

v2:
1. IP's ras_manager object is created when its ras feature is enabled,
so change to query edc status after amdgpu_ras_late_init called

2. change to enable watchdog timer after finishing gfx edc init

Signed-off-by: Dennis Li <Dennis.Li@amd.com>
Reviewed-by: Hawking Zhang <hawking.zhang@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
drivers/gpu/drm/amd/amdgpu/gfx_v9_4_2.c
drivers/gpu/drm/amd/amdgpu/gfx_v9_4_2.h

index 8e0a6c62322ec9d62d569d717982580abd58901f..689addb1520d26bbac50f74ab560c848b9670174 100644 (file)
@@ -601,6 +601,7 @@ int amdgpu_gfx_ras_late_init(struct amdgpu_device *adev)
        struct ras_ih_if ih_info = {
                .cb = amdgpu_gfx_process_ras_data_cb,
        };
+       struct ras_query_if info = { 0 };
 
        if (!adev->gfx.ras_if) {
                adev->gfx.ras_if = kmalloc(sizeof(struct ras_common_if), GFP_KERNEL);
@@ -612,13 +613,19 @@ int amdgpu_gfx_ras_late_init(struct amdgpu_device *adev)
                strcpy(adev->gfx.ras_if->name, "gfx");
        }
        fs_info.head = ih_info.head = *adev->gfx.ras_if;
-
        r = amdgpu_ras_late_init(adev, adev->gfx.ras_if,
                                 &fs_info, &ih_info);
        if (r)
                goto free;
 
        if (amdgpu_ras_is_supported(adev, adev->gfx.ras_if->block)) {
+               if (adev->gmc.xgmi.connected_to_cpu) {
+                       info.head = *adev->gfx.ras_if;
+                       amdgpu_ras_query_error_status(adev, &info);
+               } else {
+                       amdgpu_ras_reset_error_status(adev, AMDGPU_RAS_BLOCK__GFX);
+               }
+
                r = amdgpu_irq_get(adev, &adev->gfx.cp_ecc_error_irq, 0);
                if (r)
                        goto late_fini;
index d92f0f14cbebc5bf488101b25169f72d1c9bc160..38af93f501e1e030c16defe2244847d265fb5680 100644 (file)
@@ -225,9 +225,9 @@ struct amdgpu_gfx_funcs {
        void (*reset_ras_error_count) (struct amdgpu_device *adev);
        void (*init_spm_golden)(struct amdgpu_device *adev);
        void (*query_ras_error_status) (struct amdgpu_device *adev);
+       void (*reset_ras_error_status) (struct amdgpu_device *adev);
        void (*update_perfmon_mgcg)(struct amdgpu_device *adev, bool enable);
        void (*enable_watchdog_timer)(struct amdgpu_device *adev);
-       void (*query_sq_timeout_status)(struct amdgpu_device *adev);
 };
 
 struct sq_work {
index c1516d871881ba7679fccc8e79a62c6777b74dd6..ed83a32f6f30aefe93f295798b13eb0506af95d7 100644 (file)
@@ -109,7 +109,7 @@ static ssize_t amdgpu_ras_debugfs_read(struct file *f, char __user *buf,
        ssize_t s;
        char val[128];
 
-       if (amdgpu_ras_error_query(obj->adev, &info))
+       if (amdgpu_ras_query_error_status(obj->adev, &info))
                return -EINVAL;
 
        s = snprintf(val, sizeof(val), "%s: %lu\n%s: %lu\n",
@@ -434,7 +434,7 @@ static ssize_t amdgpu_ras_sysfs_read(struct device *dev,
                return snprintf(buf, PAGE_SIZE,
                                "Query currently inaccessible\n");
 
-       if (amdgpu_ras_error_query(obj->adev, &info))
+       if (amdgpu_ras_query_error_status(obj->adev, &info))
                return -EINVAL;
 
        return snprintf(buf, PAGE_SIZE, "%s: %lu\n%s: %lu\n",
@@ -757,8 +757,8 @@ static int amdgpu_ras_enable_all_features(struct amdgpu_device *adev,
 /* feature ctl end */
 
 /* query/inject/cure begin */
-int amdgpu_ras_error_query(struct amdgpu_device *adev,
-               struct ras_query_if *info)
+int amdgpu_ras_query_error_status(struct amdgpu_device *adev,
+       struct ras_query_if *info)
 {
        struct ras_manager *obj = amdgpu_ras_find_obj(adev, &info->head);
        struct ras_err_data err_data = {0, 0, 0, NULL};
@@ -787,10 +787,16 @@ int amdgpu_ras_error_query(struct amdgpu_device *adev,
        case AMDGPU_RAS_BLOCK__GFX:
                if (adev->gfx.funcs->query_ras_error_count)
                        adev->gfx.funcs->query_ras_error_count(adev, &err_data);
+
+               if (adev->gfx.funcs->query_ras_error_status)
+                       adev->gfx.funcs->query_ras_error_status(adev);
                break;
        case AMDGPU_RAS_BLOCK__MMHUB:
                if (adev->mmhub.funcs->query_ras_error_count)
                        adev->mmhub.funcs->query_ras_error_count(adev, &err_data);
+
+               if (adev->mmhub.funcs->query_ras_error_status)
+                       adev->mmhub.funcs->query_ras_error_status(adev);
                break;
        case AMDGPU_RAS_BLOCK__PCIE_BIF:
                if (adev->nbio.funcs->query_ras_error_count)
@@ -826,6 +832,35 @@ int amdgpu_ras_error_query(struct amdgpu_device *adev,
        return 0;
 }
 
+int amdgpu_ras_reset_error_status(struct amdgpu_device *adev,
+               enum amdgpu_ras_block block)
+{
+       if (!amdgpu_ras_is_supported(adev, block))
+               return -EINVAL;
+
+       switch (block) {
+       case AMDGPU_RAS_BLOCK__GFX:
+               if (adev->gfx.funcs->reset_ras_error_count)
+                       adev->gfx.funcs->reset_ras_error_count(adev);
+
+               if (adev->gfx.funcs->reset_ras_error_status)
+                       adev->gfx.funcs->reset_ras_error_status(adev);
+               break;
+       case AMDGPU_RAS_BLOCK__MMHUB:
+               if (adev->mmhub.funcs->reset_ras_error_count)
+                       adev->mmhub.funcs->reset_ras_error_count(adev);
+               break;
+       case AMDGPU_RAS_BLOCK__SDMA:
+               if (adev->sdma.funcs->reset_ras_error_count)
+                       adev->sdma.funcs->reset_ras_error_count(adev);
+               break;
+       default:
+               break;
+       }
+
+       return 0;
+}
+
 /* Trigger XGMI/WAFL error */
 static int amdgpu_ras_error_inject_xgmi(struct amdgpu_device *adev,
                                 struct ta_ras_trigger_error_input *block_info)
@@ -921,7 +956,7 @@ unsigned long amdgpu_ras_query_error_count(struct amdgpu_device *adev,
                        .head = obj->head,
                };
 
-               if (amdgpu_ras_error_query(adev, &info))
+               if (amdgpu_ras_query_error_status(adev, &info))
                        return 0;
 
                data.ce_count += info.ce_count;
@@ -1451,7 +1486,7 @@ static void amdgpu_ras_log_on_err_counter(struct amdgpu_device *adev)
                if (info.head.block == AMDGPU_RAS_BLOCK__PCIE_BIF)
                        continue;
 
-               amdgpu_ras_error_query(adev, &info);
+               amdgpu_ras_query_error_status(adev, &info);
        }
 }
 
@@ -1467,9 +1502,6 @@ static void amdgpu_ras_error_status_query(struct amdgpu_device *adev,
        case AMDGPU_RAS_BLOCK__GFX:
                if (adev->gfx.funcs->query_ras_error_status)
                        adev->gfx.funcs->query_ras_error_status(adev);
-
-               if (adev->gfx.funcs->query_sq_timeout_status)
-                       adev->gfx.funcs->query_sq_timeout_status(adev);
                break;
        case AMDGPU_RAS_BLOCK__MMHUB:
                if (adev->mmhub.funcs->query_ras_error_status)
index 42aab9adc26323cdc5b530c51e844fe578bcb440..a64bbb6dcfa43914a86dfa0c6d443d4ac2a3d3e2 100644 (file)
@@ -588,9 +588,12 @@ int amdgpu_ras_sysfs_remove(struct amdgpu_device *adev,
 
 void amdgpu_ras_debugfs_create_all(struct amdgpu_device *adev);
 
-int amdgpu_ras_error_query(struct amdgpu_device *adev,
+int amdgpu_ras_query_error_status(struct amdgpu_device *adev,
                struct ras_query_if *info);
 
+int amdgpu_ras_reset_error_status(struct amdgpu_device *adev,
+               enum amdgpu_ras_block block);
+
 int amdgpu_ras_error_inject(struct amdgpu_device *adev,
                struct ras_inject_if *info);
 
index 8b6ba1594f413a9ae7f8c740b80af4cb267c776b..652f71824a74ecab97494043bbb52ec862561e0f 100644 (file)
@@ -2124,8 +2124,8 @@ static const struct amdgpu_gfx_funcs gfx_v9_4_2_gfx_funcs = {
        .query_ras_error_count = &gfx_v9_4_2_query_ras_error_count,
        .reset_ras_error_count = &gfx_v9_4_2_reset_ras_error_count,
        .query_ras_error_status = &gfx_v9_4_2_query_ras_error_status,
+       .reset_ras_error_status = &gfx_v9_4_2_reset_ras_error_status,
        .enable_watchdog_timer = &gfx_v9_4_2_enable_watchdog_timer,
-       .query_sq_timeout_status = &gfx_v9_4_2_query_sq_timeout_status,
 };
 
 static int gfx_v9_0_gpu_early_init(struct amdgpu_device *adev)
@@ -3970,9 +3970,6 @@ static int gfx_v9_0_hw_init(void *handle)
        if (adev->asic_type == CHIP_ALDEBARAN)
                gfx_v9_4_2_set_power_brake_sequence(adev);
 
-       if (adev->gfx.funcs->enable_watchdog_timer)
-               adev->gfx.funcs->enable_watchdog_timer(adev);
-
        return r;
 }
 
@@ -4736,14 +4733,13 @@ static int gfx_v9_0_ecc_late_init(void *handle)
        if (r)
                return r;
 
-       if (adev->gfx.funcs &&
-           adev->gfx.funcs->reset_ras_error_count)
-               adev->gfx.funcs->reset_ras_error_count(adev);
-
        r = amdgpu_gfx_ras_late_init(adev);
        if (r)
                return r;
 
+       if (adev->gfx.funcs->enable_watchdog_timer)
+               adev->gfx.funcs->enable_watchdog_timer(adev);
+
        return 0;
 }
 
index 44024ab9357716431e745c8f3e1bb05a2d87167c..2e94998c98120904b1646b35f82dd44a6b84b429 100644 (file)
@@ -79,6 +79,9 @@ static const struct soc15_reg_golden golden_settings_gc_9_4_2_alde[] = {
        SOC15_REG_GOLDEN_VALUE(GC, 0, regTCI_CNTL_3, 0xff, 0x20),
 };
 
+static void gfx_v9_4_2_query_sq_timeout_status(struct amdgpu_device *adev);
+static void gfx_v9_4_2_reset_sq_timeout_status(struct amdgpu_device *adev);
+
 void gfx_v9_4_2_init_golden_registers(struct amdgpu_device *adev,
                                      uint32_t die_id)
 {
@@ -1055,8 +1058,6 @@ void gfx_v9_4_2_reset_ras_error_count(struct amdgpu_device *adev)
 
        gfx_v9_4_2_query_sram_edc_count(adev, NULL, NULL);
        gfx_v9_4_2_query_utc_edc_count(adev, NULL, NULL);
-       gfx_v9_4_2_reset_utc_err_status(adev);
-       gfx_v9_4_2_reset_ea_err_status(adev);
 }
 
 int gfx_v9_4_2_ras_error_inject(struct amdgpu_device *adev, void *inject_if)
@@ -1097,6 +1098,8 @@ static void gfx_v9_4_2_query_ea_err_status(struct amdgpu_device *adev)
                        if (reg_value)
                                dev_warn(adev->dev, "GCEA err detected at instance: %d, status: 0x%x!\n",
                                                j, reg_value);
+                       /* clear after read */
+                       WREG32(SOC15_REG_ENTRY_OFFSET(gfx_v9_4_2_rdrsp_status_regs), 0x10);
                }
        }
 
@@ -1109,16 +1112,22 @@ static void gfx_v9_4_2_query_utc_err_status(struct amdgpu_device *adev)
        uint32_t data;
 
        data = RREG32_SOC15(GC, 0, regUTCL2_MEM_ECC_STATUS);
-       if (!data)
+       if (!data) {
                dev_warn(adev->dev, "GFX UTCL2 Mem Ecc Status: 0x%x!\n", data);
+               WREG32_SOC15(GC, 0, regUTCL2_MEM_ECC_STATUS, 0x3);
+       }
 
        data = RREG32_SOC15(GC, 0, regVML2_MEM_ECC_STATUS);
-       if (!data)
+       if (!data) {
                dev_warn(adev->dev, "GFX VML2 Mem Ecc Status: 0x%x!\n", data);
+               WREG32_SOC15(GC, 0, regVML2_MEM_ECC_STATUS, 0x3);
+       }
 
        data = RREG32_SOC15(GC, 0, regVML2_WALKER_MEM_ECC_STATUS);
-       if (!data)
+       if (!data) {
                dev_warn(adev->dev, "GFX VML2 Walker Mem Ecc Status: 0x%x!\n", data);
+               WREG32_SOC15(GC, 0, regVML2_WALKER_MEM_ECC_STATUS, 0x3);
+       }
 }
 
 void gfx_v9_4_2_query_ras_error_status(struct amdgpu_device *adev)
@@ -1128,6 +1137,17 @@ void gfx_v9_4_2_query_ras_error_status(struct amdgpu_device *adev)
 
        gfx_v9_4_2_query_ea_err_status(adev);
        gfx_v9_4_2_query_utc_err_status(adev);
+       gfx_v9_4_2_query_sq_timeout_status(adev);
+}
+
+void gfx_v9_4_2_reset_ras_error_status(struct amdgpu_device *adev)
+{
+       if (!amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__GFX))
+               return;
+
+       gfx_v9_4_2_reset_utc_err_status(adev);
+       gfx_v9_4_2_reset_ea_err_status(adev);
+       gfx_v9_4_2_reset_sq_timeout_status(adev);
 }
 
 void gfx_v9_4_2_enable_watchdog_timer(struct amdgpu_device *adev)
@@ -1209,7 +1229,7 @@ static void gfx_v9_4_2_log_cu_timeout_status(struct amdgpu_device *adev,
        }
 }
 
-void gfx_v9_4_2_query_sq_timeout_status(struct amdgpu_device *adev)
+static void gfx_v9_4_2_query_sq_timeout_status(struct amdgpu_device *adev)
 {
        uint32_t se_idx, sh_idx, cu_idx;
        uint32_t status;
@@ -1241,4 +1261,26 @@ void gfx_v9_4_2_query_sq_timeout_status(struct amdgpu_device *adev)
        }
        gfx_v9_4_2_select_se_sh(adev, 0xffffffff, 0xffffffff, 0xffffffff);
        mutex_unlock(&adev->grbm_idx_mutex);
+}
+
+static void gfx_v9_4_2_reset_sq_timeout_status(struct amdgpu_device *adev)
+{
+       uint32_t se_idx, sh_idx, cu_idx;
+
+       mutex_lock(&adev->grbm_idx_mutex);
+       for (se_idx = 0; se_idx < adev->gfx.config.max_shader_engines;
+            se_idx++) {
+               for (sh_idx = 0; sh_idx < adev->gfx.config.max_sh_per_se;
+                    sh_idx++) {
+                       for (cu_idx = 0;
+                            cu_idx < adev->gfx.config.max_cu_per_sh;
+                            cu_idx++) {
+                               gfx_v9_4_2_select_se_sh(adev, se_idx, sh_idx,
+                                                       cu_idx);
+                               WREG32_SOC15(GC, 0, regSQ_TIMEOUT_STATUS, 0);
+                       }
+               }
+       }
+       gfx_v9_4_2_select_se_sh(adev, 0xffffffff, 0xffffffff, 0xffffffff);
+       mutex_unlock(&adev->grbm_idx_mutex);
 }
\ No newline at end of file
index e01fa6afa8e4f70a6a57ea2ed2cd4870a95dfa55..c143d178ef9803c98ec41b9d08f4e64d8a640443 100644 (file)
@@ -35,7 +35,6 @@ int gfx_v9_4_2_ras_error_inject(struct amdgpu_device *adev, void *inject_if);
 void gfx_v9_4_2_query_ras_error_status(struct amdgpu_device *adev);
 int gfx_v9_4_2_query_ras_error_count(struct amdgpu_device *adev,
                                   void *ras_error_status);
-
+void gfx_v9_4_2_reset_ras_error_status(struct amdgpu_device *adev);
 void gfx_v9_4_2_enable_watchdog_timer(struct amdgpu_device *adev);
-void gfx_v9_4_2_query_sq_timeout_status(struct amdgpu_device *adev);
 #endif /* __GFX_V9_4_2_H__ */