www.infradead.org Git - users/dwmw2/linux.git/commitdiff
drm/amdgpu:Support retiring multiple MCA error address pages
author: YiPeng Chai <YiPeng.Chai@amd.com>
Mon, 15 Jan 2024 03:02:52 +0000 (11:02 +0800)
committer: Alex Deucher <alexander.deucher@amd.com>
Mon, 22 Jan 2024 22:13:25 +0000 (17:13 -0500)
Support retiring multiple MCA error address pages in
one in-band query for umc v12_0.

Signed-off-by: YiPeng Chai <YiPeng.Chai@amd.com>
Reviewed-by: Hawking Zhang <Hawking.Zhang@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
drivers/gpu/drm/amd/amdgpu/umc_v12_0.c

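diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 349c810d23993dad0f3dbac7b5259556eb2ff7e9..3c1106cf9d80a6c2eefac47457315ce91c722423 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c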
@@ -3920,8 +3920,7 @@ static int ras_err_info_cmp(void *priv, const struct list_head *a, const struct
 }
 
 static struct ras_err_info *amdgpu_ras_error_get_info(struct ras_err_data *err_data,
-                               struct amdgpu_smuio_mcm_config_info *mcm_info,
-                               struct ras_err_addr *err_addr)
+                               struct amdgpu_smuio_mcm_config_info *mcm_info)
 {
        struct ras_err_node *err_node;
 
@@ -3933,10 +3932,9 @@ static struct ras_err_info *amdgpu_ras_error_get_info(struct ras_err_data *err_d
        if (!err_node)
                return NULL;
 
-       memcpy(&err_node->err_info.mcm_info, mcm_info, sizeof(*mcm_info));
+       INIT_LIST_HEAD(&err_node->err_info.err_addr_list);
 
-       if (err_addr)
-               memcpy(&err_node->err_info.err_addr, err_addr, sizeof(*err_addr));
+       memcpy(&err_node->err_info.mcm_info, mcm_info, sizeof(*mcm_info));
 
        err_data->err_list_count++;
        list_add_tail(&err_node->node, &err_data->err_node_list);
@@ -3945,6 +3943,29 @@ static struct ras_err_info *amdgpu_ras_error_get_info(struct ras_err_data *err_d
        return &err_node->err_info;
 }
 
+void amdgpu_ras_add_mca_err_addr(struct ras_err_info *err_info, struct ras_err_addr *err_addr)
+{
+       struct ras_err_addr *mca_err_addr;
+
+       mca_err_addr = kzalloc(sizeof(*mca_err_addr), GFP_KERNEL);
+       if (!mca_err_addr)
+               return;
+
+       INIT_LIST_HEAD(&mca_err_addr->node);
+
+       mca_err_addr->err_status = err_addr->err_status;
+       mca_err_addr->err_ipid = err_addr->err_ipid;
+       mca_err_addr->err_addr = err_addr->err_addr;
+
+       list_add_tail(&mca_err_addr->node, &err_info->err_addr_list);
+}
+
+void amdgpu_ras_del_mca_err_addr(struct ras_err_info *err_info, struct ras_err_addr *mca_err_addr)
+{
+       list_del(&mca_err_addr->node);
+       kfree(mca_err_addr);
+}
+
 int amdgpu_ras_error_statistic_ue_count(struct ras_err_data *err_data,
                struct amdgpu_smuio_mcm_config_info *mcm_info,
                struct ras_err_addr *err_addr, u64 count)
@@ -3957,10 +3978,13 @@ int amdgpu_ras_error_statistic_ue_count(struct ras_err_data *err_data,
        if (!count)
                return 0;
 
-       err_info = amdgpu_ras_error_get_info(err_data, mcm_info, err_addr);
+       err_info = amdgpu_ras_error_get_info(err_data, mcm_info);
        if (!err_info)
                return -EINVAL;
 
+       if (err_addr && err_addr->err_status)
+               amdgpu_ras_add_mca_err_addr(err_info, err_addr);
+
        err_info->ue_count += count;
        err_data->ue_count += count;
 
@@ -3979,7 +4003,7 @@ int amdgpu_ras_error_statistic_ce_count(struct ras_err_data *err_data,
        if (!count)
                return 0;
 
-       err_info = amdgpu_ras_error_get_info(err_data, mcm_info, err_addr);
+       err_info = amdgpu_ras_error_get_info(err_data, mcm_info);
        if (!err_info)
                return -EINVAL;
 
@@ -4001,10 +4025,13 @@ int amdgpu_ras_error_statistic_de_count(struct ras_err_data *err_data,
        if (!count)
                return 0;
 
-       err_info = amdgpu_ras_error_get_info(err_data, mcm_info, err_addr);
+       err_info = amdgpu_ras_error_get_info(err_data, mcm_info);
        if (!err_info)
                return -EINVAL;
 
+       if (err_addr && err_addr->err_status)
+               amdgpu_ras_add_mca_err_addr(err_info, err_addr);
+
        err_info->de_count += count;
        err_data->de_count += count;
 
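diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
index 72022e2c6655a039b83384491c13e37d8ecd0c5e..0b6ffae1e8bb50bfe5886e5ac67cb81295b6c8ef 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h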
@@ -480,6 +480,7 @@ struct ras_fs_data {
 };
 
 struct ras_err_addr {
+       struct list_head node;
        uint64_t err_status;
        uint64_t err_ipid;
        uint64_t err_addr;
@@ -490,7 +491,7 @@ struct ras_err_info {
        u64 ce_count;
        u64 ue_count;
        u64 de_count;
-       struct ras_err_addr err_addr;
+       struct list_head err_addr_list;
 };
 
 struct ras_err_node {
@@ -862,4 +863,9 @@ int amdgpu_ras_unbind_aca(struct amdgpu_device *adev, enum amdgpu_ras_block blk)
 ssize_t amdgpu_ras_aca_sysfs_read(struct device *dev, struct device_attribute *attr,
                                  struct aca_handle *handle, char *buf, void *data);
 
+void amdgpu_ras_add_mca_err_addr(struct ras_err_info *err_info,
+                       struct ras_err_addr *err_addr);
+
+void amdgpu_ras_del_mca_err_addr(struct ras_err_info *err_info,
+               struct ras_err_addr *mca_err_addr);
 #endif
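diff --git a/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c b/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c
index 5ca73fefe35819d77f9fa7de1ce19c4c0d53eeac..836a4cc1134e2d12fb13b6cab2a8ada1cde4b6d3 100644
--- a/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c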
@@ -382,42 +382,46 @@ static void umc_v12_0_ecc_info_query_ras_error_address(struct amdgpu_device *ade
 {
        struct ras_err_node *err_node;
        uint64_t mc_umc_status;
+       struct ras_err_info *err_info;
+       struct ras_err_addr *mca_err_addr, *tmp;
        struct ras_err_data *err_data = (struct ras_err_data *)ras_error_status;
 
        for_each_ras_error(err_node, err_data) {
-               mc_umc_status = err_node->err_info.err_addr.err_status;
-               if (!mc_umc_status)
+               err_info = &err_node->err_info;
+               if (list_empty(&err_info->err_addr_list))
                        continue;
 
-               if (umc_v12_0_is_uncorrectable_error(adev, mc_umc_status) ||
-                   umc_v12_0_is_deferred_error(adev, mc_umc_status)) {
-                       uint64_t mca_addr, err_addr, mca_ipid;
-                       uint32_t InstanceIdLo;
-                       struct amdgpu_smuio_mcm_config_info *mcm_info;
-
-                       mcm_info = &err_node->err_info.mcm_info;
-                       mca_addr = err_node->err_info.err_addr.err_addr;
-                       mca_ipid = err_node->err_info.err_addr.err_ipid;
-
-                       err_addr =  REG_GET_FIELD(mca_addr, MCA_UMC_UMC0_MCUMC_ADDRT0, ErrorAddr);
-                       InstanceIdLo = REG_GET_FIELD(mca_ipid, MCMP1_IPIDT0, InstanceIdLo);
-
-                       dev_info(adev->dev, "UMC:IPID:0x%llx, aid:%d, inst:%d, ch:%d, err_addr:0x%llx\n",
-                               mca_ipid,
-                               mcm_info->die_id,
-                               MCA_IPID_LO_2_UMC_INST(InstanceIdLo),
-                               MCA_IPID_LO_2_UMC_CH(InstanceIdLo),
-                               err_addr);
-
-                       umc_v12_0_convert_error_address(adev,
-                               err_data, err_addr,
-                               MCA_IPID_LO_2_UMC_CH(InstanceIdLo),
-                               MCA_IPID_LO_2_UMC_INST(InstanceIdLo),
-                               mcm_info->die_id);
-
-                       /* Clear umc error address content */
-                       memset(&err_node->err_info.err_addr,
-                               0, sizeof(err_node->err_info.err_addr));
+               list_for_each_entry_safe(mca_err_addr, tmp, &err_info->err_addr_list, node) {
+                       mc_umc_status = mca_err_addr->err_status;
+                       if (mc_umc_status &&
+                               (umc_v12_0_is_uncorrectable_error(adev, mc_umc_status) ||
+                                umc_v12_0_is_deferred_error(adev, mc_umc_status))) {
+                               uint64_t mca_addr, err_addr, mca_ipid;
+                               uint32_t InstanceIdLo;
+
+                               mca_addr = mca_err_addr->err_addr;
+                               mca_ipid = mca_err_addr->err_ipid;
+
+                               err_addr = REG_GET_FIELD(mca_addr,
+                                                       MCA_UMC_UMC0_MCUMC_ADDRT0, ErrorAddr);
+                               InstanceIdLo = REG_GET_FIELD(mca_ipid, MCMP1_IPIDT0, InstanceIdLo);
+
+                               dev_info(adev->dev, "UMC:IPID:0x%llx, aid:%d, inst:%d, ch:%d, err_addr:0x%llx\n",
+                                       mca_ipid,
+                                       err_info->mcm_info.die_id,
+                                       MCA_IPID_LO_2_UMC_INST(InstanceIdLo),
+                                       MCA_IPID_LO_2_UMC_CH(InstanceIdLo),
+                                       err_addr);
+
+                               umc_v12_0_convert_error_address(adev,
+                                       err_data, err_addr,
+                                       MCA_IPID_LO_2_UMC_CH(InstanceIdLo),
+                                       MCA_IPID_LO_2_UMC_INST(InstanceIdLo),
+                                       err_info->mcm_info.die_id);
+                       }
+
+                       /* Delete error address node from list and free memory */
+                       amdgpu_ras_del_mca_err_addr(err_info, mca_err_addr);
                }
        }
 }