From 71a0e9630027f77d7646c5b750593c9ecfaa27d3 Mon Sep 17 00:00:00 2001 From: Tao Zhou Date: Tue, 29 Oct 2024 19:46:44 +0800 Subject: [PATCH] drm/amdgpu: save UMC global channel index to eeprom Save the global channel index returned by RAS TA to eeprom. We can get memory physical address by MCA address and channel index. Signed-off-by: Tao Zhou Reviewed-by: Hawking Zhang Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h | 2 ++ drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c | 7 ++----- drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h | 2 +- drivers/gpu/drm/amd/amdgpu/umc_v12_0.c | 13 ++++++++----- 4 files changed, 13 insertions(+), 11 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h index b13debcf48ee3..a6473c7cdeb65 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h @@ -483,6 +483,8 @@ struct ras_ecc_err { uint64_t ipid; uint64_t addr; uint64_t pa_pfn; + /* save global channel index across all UMC instances */ + uint32_t channel_idx; struct ras_err_pages err_pages; }; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c index 984af815fb38c..4107b78d9dda9 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c @@ -495,10 +495,9 @@ out: int amdgpu_umc_mca_to_addr(struct amdgpu_device *adev, uint64_t err_addr, uint32_t ch, uint32_t umc, uint32_t node, uint32_t socket, - uint64_t *addr, bool dump_addr) + struct ta_ras_query_address_output *addr_out, bool dump_addr) { struct ta_ras_query_address_input addr_in; - struct ta_ras_query_address_output addr_out; int ret; memset(&addr_in, 0, sizeof(addr_in)); @@ -510,14 +509,12 @@ int amdgpu_umc_mca_to_addr(struct amdgpu_device *adev, if (adev->umc.ras && adev->umc.ras->convert_ras_err_addr) { ret = adev->umc.ras->convert_ras_err_addr(adev, NULL, &addr_in, - &addr_out, dump_addr); + addr_out, dump_addr); if (ret) return ret; } else { return 0; } - *addr = addr_out.pa.pa; - return 0; } diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h index ce1e4fb385b5a..2f71194d5da86 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h @@ -146,5 +146,5 @@ int amdgpu_umc_lookup_bad_pages_in_a_row(struct amdgpu_device *adev, int amdgpu_umc_mca_to_addr(struct amdgpu_device *adev, uint64_t err_addr, uint32_t ch, uint32_t umc, uint32_t node, uint32_t socket, - uint64_t *addr, bool dump_addr); + struct ta_ras_query_address_output *addr_out, bool dump_addr); #endif diff --git a/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c b/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c index 17ef9a6743f55..cce93b4ffb587 100644 --- a/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c +++ b/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c @@ -180,7 +180,7 @@ static int umc_v12_0_convert_error_address(struct amdgpu_device *adev, bool dump_addr) { uint32_t col, col_lower, row, row_lower, bank; - uint32_t channel_index, umc_inst = 0; + uint32_t channel_index = 0, umc_inst = 0; uint32_t i, loop_bits[UMC_V12_0_RETIRE_LOOP_BITS]; uint64_t soc_pa, column, err_addr; struct ta_ras_query_address_output addr_out_tmp; @@ -193,7 +193,7 @@ static int umc_v12_0_convert_error_address(struct amdgpu_device *adev, else paddr_out = addr_out; - err_addr = bank = channel_index = 0; + err_addr = bank = 0; if (addr_in) { err_addr = addr_in->ma.err_addr; addr_in->addr_type = TA_RAS_MCA_TO_PA; @@ -206,7 +206,6 @@ static int umc_v12_0_convert_error_address(struct amdgpu_device *adev, } bank = paddr_out->pa.bank; - channel_index = paddr_out->pa.channel_idx; /* no need to care about umc inst if addr_in is NULL */ umc_inst = addr_in->ma.umc_inst; } @@ -228,6 +227,7 @@ static int umc_v12_0_convert_error_address(struct amdgpu_device *adev, } soc_pa = paddr_out->pa.pa; + channel_index = paddr_out->pa.channel_idx; /* clear loop bits in soc physical address */ for (i = 0; i < UMC_V12_0_RETIRE_LOOP_BITS; i++) soc_pa &= ~BIT_ULL(loop_bits[i]); @@ -466,6 +466,7 @@ static int umc_v12_0_update_ecc_status(struct amdgpu_device *adev, uint64_t page_pfn[UMC_V12_0_BAD_PAGE_NUM_PER_CHANNEL]; uint64_t err_addr, pa_addr = 0; struct ras_ecc_err *ecc_err; + struct ta_ras_query_address_output addr_out; int count, ret, i; hwid = REG_GET_FIELD(ipid, MCMP1_IPIDT0, HardwareID); @@ -495,7 +496,7 @@ static int umc_v12_0_update_ecc_status(struct amdgpu_device *adev, ret = amdgpu_umc_mca_to_addr(adev, err_addr, MCA_IPID_2_UMC_CH(ipid), MCA_IPID_2_UMC_INST(ipid), MCA_IPID_2_DIE_ID(ipid), - MCA_IPID_2_SOCKET_ID(ipid), &pa_addr, true); + MCA_IPID_2_SOCKET_ID(ipid), &addr_out, true); if (ret) return ret; @@ -503,10 +504,12 @@ static int umc_v12_0_update_ecc_status(struct amdgpu_device *adev, if (!ecc_err) return -ENOMEM; + pa_addr = addr_out.pa.pa; ecc_err->status = status; ecc_err->ipid = ipid; ecc_err->addr = addr; ecc_err->pa_pfn = pa_addr >> AMDGPU_GPU_PAGE_SHIFT; + ecc_err->channel_idx = addr_out.pa.channel_idx; /* If converted pa_pfn is 0, use pa C4 pfn. */ if (!ecc_err->pa_pfn) @@ -577,7 +580,7 @@ static int umc_v12_0_fill_error_record(struct amdgpu_device *adev, ret = amdgpu_umc_fill_error_record(err_data, ecc_err->addr, page_pfn[i] << AMDGPU_GPU_PAGE_SHIFT, - MCA_IPID_2_UMC_CH(ecc_err->ipid), + ecc_err->channel_idx, MCA_IPID_2_UMC_INST(ecc_err->ipid)); if (ret) break; -- 2.49.0