]> www.infradead.org Git - users/dwmw2/linux.git/commitdiff
drm/amdgpu: save UMC global channel index to eeprom
author Tao Zhou <tao.zhou1@amd.com>
Tue, 29 Oct 2024 11:46:44 +0000 (19:46 +0800)
committer Alex Deucher <alexander.deucher@amd.com>
Tue, 10 Dec 2024 15:26:46 +0000 (10:26 -0500)
Save the global channel index returned by the RAS TA to the EEPROM.
We can get the memory physical address from the MCA address and the channel index.

Signed-off-by: Tao Zhou <tao.zhou1@amd.com>
Reviewed-by: Hawking Zhang <Hawking.Zhang@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h
drivers/gpu/drm/amd/amdgpu/umc_v12_0.c

index b13debcf48ee3d9dd1c4331d625c99abe78591d4..a6473c7cdeb65c9c64e57482db3ba31a048c80a2 100644 (file)
@@ -483,6 +483,8 @@ struct ras_ecc_err {
        uint64_t ipid;
        uint64_t addr;
        uint64_t pa_pfn;
+       /* save global channel index across all UMC instances */
+       uint32_t channel_idx;
        struct ras_err_pages err_pages;
 };
 
index 984af815fb38c0d859c917e7bd582649e5a9b469..4107b78d9dda952804af983717592d9c41aceb52 100644 (file)
@@ -495,10 +495,9 @@ out:
 int amdgpu_umc_mca_to_addr(struct amdgpu_device *adev,
                        uint64_t err_addr, uint32_t ch, uint32_t umc,
                        uint32_t node, uint32_t socket,
-                       uint64_t *addr, bool dump_addr)
+                       struct ta_ras_query_address_output *addr_out, bool dump_addr)
 {
        struct ta_ras_query_address_input addr_in;
-       struct ta_ras_query_address_output addr_out;
        int ret;
 
        memset(&addr_in, 0, sizeof(addr_in));
@@ -510,14 +509,12 @@ int amdgpu_umc_mca_to_addr(struct amdgpu_device *adev,
 
        if (adev->umc.ras && adev->umc.ras->convert_ras_err_addr) {
                ret = adev->umc.ras->convert_ras_err_addr(adev, NULL, &addr_in,
-                               &addr_out, dump_addr);
+                               addr_out, dump_addr);
                if (ret)
                        return ret;
        } else {
                return 0;
        }
 
-       *addr = addr_out.pa.pa;
-
        return 0;
 }
index ce1e4fb385b5af8291c7647570c500aec84653e3..2f71194d5da86cf344104650db642923235b5478 100644 (file)
@@ -146,5 +146,5 @@ int amdgpu_umc_lookup_bad_pages_in_a_row(struct amdgpu_device *adev,
 int amdgpu_umc_mca_to_addr(struct amdgpu_device *adev,
                        uint64_t err_addr, uint32_t ch, uint32_t umc,
                        uint32_t node, uint32_t socket,
-                       uint64_t *addr, bool dump_addr);
+                       struct ta_ras_query_address_output *addr_out, bool dump_addr);
 #endif
index 17ef9a6743f550c89cdc58487ea0672ff572e121..cce93b4ffb587e4e69cd2307b53510b414f0acc6 100644 (file)
@@ -180,7 +180,7 @@ static int umc_v12_0_convert_error_address(struct amdgpu_device *adev,
                                        bool dump_addr)
 {
        uint32_t col, col_lower, row, row_lower, bank;
-       uint32_t channel_index, umc_inst = 0;
+       uint32_t channel_index = 0, umc_inst = 0;
        uint32_t i, loop_bits[UMC_V12_0_RETIRE_LOOP_BITS];
        uint64_t soc_pa, column, err_addr;
        struct ta_ras_query_address_output addr_out_tmp;
@@ -193,7 +193,7 @@ static int umc_v12_0_convert_error_address(struct amdgpu_device *adev,
        else
                paddr_out = addr_out;
 
-       err_addr = bank = channel_index = 0;
+       err_addr = bank = 0;
        if (addr_in) {
                err_addr = addr_in->ma.err_addr;
                addr_in->addr_type = TA_RAS_MCA_TO_PA;
@@ -206,7 +206,6 @@ static int umc_v12_0_convert_error_address(struct amdgpu_device *adev,
                }
 
                bank = paddr_out->pa.bank;
-               channel_index = paddr_out->pa.channel_idx;
                /* no need to care about umc inst if addr_in is NULL */
                umc_inst = addr_in->ma.umc_inst;
        }
@@ -228,6 +227,7 @@ static int umc_v12_0_convert_error_address(struct amdgpu_device *adev,
        }
 
        soc_pa = paddr_out->pa.pa;
+       channel_index = paddr_out->pa.channel_idx;
        /* clear loop bits in soc physical address */
        for (i = 0; i < UMC_V12_0_RETIRE_LOOP_BITS; i++)
                soc_pa &= ~BIT_ULL(loop_bits[i]);
@@ -466,6 +466,7 @@ static int umc_v12_0_update_ecc_status(struct amdgpu_device *adev,
        uint64_t page_pfn[UMC_V12_0_BAD_PAGE_NUM_PER_CHANNEL];
        uint64_t err_addr, pa_addr = 0;
        struct ras_ecc_err *ecc_err;
+       struct ta_ras_query_address_output addr_out;
        int count, ret, i;
 
        hwid = REG_GET_FIELD(ipid, MCMP1_IPIDT0, HardwareID);
@@ -495,7 +496,7 @@ static int umc_v12_0_update_ecc_status(struct amdgpu_device *adev,
        ret = amdgpu_umc_mca_to_addr(adev,
                        err_addr, MCA_IPID_2_UMC_CH(ipid),
                        MCA_IPID_2_UMC_INST(ipid), MCA_IPID_2_DIE_ID(ipid),
-                       MCA_IPID_2_SOCKET_ID(ipid), &pa_addr, true);
+                       MCA_IPID_2_SOCKET_ID(ipid), &addr_out, true);
        if (ret)
                return ret;
 
@@ -503,10 +504,12 @@ static int umc_v12_0_update_ecc_status(struct amdgpu_device *adev,
        if (!ecc_err)
                return -ENOMEM;
 
+       pa_addr = addr_out.pa.pa;
        ecc_err->status = status;
        ecc_err->ipid = ipid;
        ecc_err->addr = addr;
        ecc_err->pa_pfn = pa_addr >> AMDGPU_GPU_PAGE_SHIFT;
+       ecc_err->channel_idx = addr_out.pa.channel_idx;
 
        /* If converted pa_pfn is 0, use pa C4 pfn. */
        if (!ecc_err->pa_pfn)
@@ -577,7 +580,7 @@ static int umc_v12_0_fill_error_record(struct amdgpu_device *adev,
                ret = amdgpu_umc_fill_error_record(err_data,
                                ecc_err->addr,
                                page_pfn[i] << AMDGPU_GPU_PAGE_SHIFT,
-                               MCA_IPID_2_UMC_CH(ecc_err->ipid),
+                               ecc_err->channel_idx,
                                MCA_IPID_2_UMC_INST(ecc_err->ipid));
                if (ret)
                        break;