From 03399d0bff2534e499878473bd0edc5dd8f99bbd Mon Sep 17 00:00:00 2001 From: Sathishkumar S Date: Wed, 29 Jan 2025 19:11:35 +0530 Subject: [PATCH 01/16] drm/amdgpu: Add ring reset callback for JPEG3_0_0 MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit Add ring reset function callback for JPEG3_0_0 to recover from job timeouts without a full gpu reset. Signed-off-by: Sathishkumar S Acked-by: Christian König Reviewed-by: Leo Liu Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/jpeg_v3_0.c | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/amdgpu/jpeg_v3_0.c b/drivers/gpu/drm/amd/amdgpu/jpeg_v3_0.c index a4acb7cb7ea0..9faa9c6809df 100644 --- a/drivers/gpu/drm/amd/amdgpu/jpeg_v3_0.c +++ b/drivers/gpu/drm/amd/amdgpu/jpeg_v3_0.c @@ -132,7 +132,10 @@ static int jpeg_v3_0_sw_init(struct amdgpu_ip_block *ip_block) if (r) return r; - return 0; + adev->jpeg.supported_reset = AMDGPU_RESET_TYPE_PER_QUEUE; + r = amdgpu_jpeg_sysfs_reset_mask_init(adev); + + return r; } /** @@ -151,6 +154,8 @@ static int jpeg_v3_0_sw_fini(struct amdgpu_ip_block *ip_block) if (r) return r; + amdgpu_jpeg_sysfs_reset_mask_fini(adev); + r = amdgpu_jpeg_sw_fini(adev); return r; @@ -550,6 +555,13 @@ static int jpeg_v3_0_process_interrupt(struct amdgpu_device *adev, return 0; } +static int jpeg_v3_0_ring_reset(struct amdgpu_ring *ring, unsigned int vmid) +{ + jpeg_v3_0_stop(ring->adev); + jpeg_v3_0_start(ring->adev); + return amdgpu_ring_test_helper(ring); +} + static const struct amd_ip_funcs jpeg_v3_0_ip_funcs = { .name = "jpeg_v3_0", .early_init = jpeg_v3_0_early_init, @@ -595,6 +607,7 @@ static const struct amdgpu_ring_funcs jpeg_v3_0_dec_ring_vm_funcs = { .emit_wreg = jpeg_v2_0_dec_ring_emit_wreg, .emit_reg_wait = jpeg_v2_0_dec_ring_emit_reg_wait, .emit_reg_write_reg_wait = amdgpu_ring_emit_reg_write_reg_wait_helper, + .reset = jpeg_v3_0_ring_reset, }; static void jpeg_v3_0_set_dec_ring_funcs(struct amdgpu_device *adev) -- 2.51.0 From cb493aee4d40e84a60e2e4eca55c745b0835ac30 Mon Sep 17 00:00:00 2001 From: Sathishkumar S Date: Tue, 28 Jan 2025 09:17:08 +0530 Subject: [PATCH 02/16] drm/amdgpu: Per-instance init func for JPEG2_5_0 MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit Add helper functions to handle per-instance initialization and deinitialization in JPEG2_5_0. Signed-off-by: Sathishkumar S Acked-by: Christian König Reviewed-by: Leo Liu Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/jpeg_v2_5.c | 102 +++++++++++++------------ 1 file changed, 55 insertions(+), 47 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/jpeg_v2_5.c b/drivers/gpu/drm/amd/amdgpu/jpeg_v2_5.c index b19724928ce4..0490b672d8d3 100644 --- a/drivers/gpu/drm/amd/amdgpu/jpeg_v2_5.c +++ b/drivers/gpu/drm/amd/amdgpu/jpeg_v2_5.c @@ -330,6 +330,44 @@ static void jpeg_v2_5_enable_clock_gating(struct amdgpu_device *adev, int inst) WREG32_SOC15(JPEG, inst, mmJPEG_CGC_GATE, data); } +static void jpeg_v2_5_start_inst(struct amdgpu_device *adev, int i) +{ + struct amdgpu_ring *ring = adev->jpeg.inst[i].ring_dec; + /* disable anti hang mechanism */ + WREG32_P(SOC15_REG_OFFSET(JPEG, i, mmUVD_JPEG_POWER_STATUS), 0, + ~UVD_JPEG_POWER_STATUS__JPEG_POWER_STATUS_MASK); + + /* JPEG disable CGC */ + jpeg_v2_5_disable_clock_gating(adev, i); + + /* MJPEG global tiling registers */ + WREG32_SOC15(JPEG, i, mmJPEG_DEC_GFX8_ADDR_CONFIG, + adev->gfx.config.gb_addr_config); + WREG32_SOC15(JPEG, i, mmJPEG_DEC_GFX10_ADDR_CONFIG, + adev->gfx.config.gb_addr_config); + + /* enable JMI channel */ + WREG32_P(SOC15_REG_OFFSET(JPEG, i, mmUVD_JMI_CNTL), 0, + ~UVD_JMI_CNTL__SOFT_RESET_MASK); + + /* enable System Interrupt for JRBC */ + WREG32_P(SOC15_REG_OFFSET(JPEG, i, mmJPEG_SYS_INT_EN), + JPEG_SYS_INT_EN__DJRBC_MASK, + ~JPEG_SYS_INT_EN__DJRBC_MASK); + + WREG32_SOC15(JPEG, i, mmUVD_LMI_JRBC_RB_VMID, 0); + WREG32_SOC15(JPEG, i, mmUVD_JRBC_RB_CNTL, (0x00000001L | 0x00000002L)); + WREG32_SOC15(JPEG, i, mmUVD_LMI_JRBC_RB_64BIT_BAR_LOW, + lower_32_bits(ring->gpu_addr)); + WREG32_SOC15(JPEG, i, mmUVD_LMI_JRBC_RB_64BIT_BAR_HIGH, + upper_32_bits(ring->gpu_addr)); + WREG32_SOC15(JPEG, i, mmUVD_JRBC_RB_RPTR, 0); + WREG32_SOC15(JPEG, i, mmUVD_JRBC_RB_WPTR, 0); + WREG32_SOC15(JPEG, i, mmUVD_JRBC_RB_CNTL, 0x00000002L); + WREG32_SOC15(JPEG, i, mmUVD_JRBC_RB_SIZE, ring->ring_size / 4); + ring->wptr = RREG32_SOC15(JPEG, i, mmUVD_JRBC_RB_WPTR); +} + /** * jpeg_v2_5_start - start JPEG block * @@ -339,52 +377,33 @@ static void jpeg_v2_5_enable_clock_gating(struct amdgpu_device *adev, int inst) */ static int jpeg_v2_5_start(struct amdgpu_device *adev) { - struct amdgpu_ring *ring; int i; for (i = 0; i < adev->jpeg.num_jpeg_inst; ++i) { if (adev->jpeg.harvest_config & (1 << i)) continue; + jpeg_v2_5_start_inst(adev, i); - ring = adev->jpeg.inst[i].ring_dec; - /* disable anti hang mechanism */ - WREG32_P(SOC15_REG_OFFSET(JPEG, i, mmUVD_JPEG_POWER_STATUS), 0, - ~UVD_JPEG_POWER_STATUS__JPEG_POWER_STATUS_MASK); - - /* JPEG disable CGC */ - jpeg_v2_5_disable_clock_gating(adev, i); - - /* MJPEG global tiling registers */ - WREG32_SOC15(JPEG, i, mmJPEG_DEC_GFX8_ADDR_CONFIG, - adev->gfx.config.gb_addr_config); - WREG32_SOC15(JPEG, i, mmJPEG_DEC_GFX10_ADDR_CONFIG, - adev->gfx.config.gb_addr_config); - - /* enable JMI channel */ - WREG32_P(SOC15_REG_OFFSET(JPEG, i, mmUVD_JMI_CNTL), 0, - ~UVD_JMI_CNTL__SOFT_RESET_MASK); - - /* enable System Interrupt for JRBC */ - WREG32_P(SOC15_REG_OFFSET(JPEG, i, mmJPEG_SYS_INT_EN), - JPEG_SYS_INT_EN__DJRBC_MASK, - ~JPEG_SYS_INT_EN__DJRBC_MASK); - - WREG32_SOC15(JPEG, i, mmUVD_LMI_JRBC_RB_VMID, 0); - WREG32_SOC15(JPEG, i, mmUVD_JRBC_RB_CNTL, (0x00000001L | 0x00000002L)); - WREG32_SOC15(JPEG, i, mmUVD_LMI_JRBC_RB_64BIT_BAR_LOW, - lower_32_bits(ring->gpu_addr)); - WREG32_SOC15(JPEG, i, mmUVD_LMI_JRBC_RB_64BIT_BAR_HIGH, - upper_32_bits(ring->gpu_addr)); - WREG32_SOC15(JPEG, i, mmUVD_JRBC_RB_RPTR, 0); - WREG32_SOC15(JPEG, i, mmUVD_JRBC_RB_WPTR, 0); - WREG32_SOC15(JPEG, i, mmUVD_JRBC_RB_CNTL, 0x00000002L); - WREG32_SOC15(JPEG, i, mmUVD_JRBC_RB_SIZE, ring->ring_size / 4); - ring->wptr = RREG32_SOC15(JPEG, i, mmUVD_JRBC_RB_WPTR); } return 0; } +static void jpeg_v2_5_stop_inst(struct amdgpu_device *adev, int i) +{ + /* reset JMI */ + WREG32_P(SOC15_REG_OFFSET(JPEG, i, mmUVD_JMI_CNTL), + UVD_JMI_CNTL__SOFT_RESET_MASK, + ~UVD_JMI_CNTL__SOFT_RESET_MASK); + + jpeg_v2_5_enable_clock_gating(adev, i); + + /* enable anti hang mechanism */ + WREG32_P(SOC15_REG_OFFSET(JPEG, i, mmUVD_JPEG_POWER_STATUS), + UVD_JPEG_POWER_STATUS__JPEG_POWER_STATUS_MASK, + ~UVD_JPEG_POWER_STATUS__JPEG_POWER_STATUS_MASK); +} + /** * jpeg_v2_5_stop - stop JPEG block * @@ -399,18 +418,7 @@ static int jpeg_v2_5_stop(struct amdgpu_device *adev) for (i = 0; i < adev->jpeg.num_jpeg_inst; ++i) { if (adev->jpeg.harvest_config & (1 << i)) continue; - - /* reset JMI */ - WREG32_P(SOC15_REG_OFFSET(JPEG, i, mmUVD_JMI_CNTL), - UVD_JMI_CNTL__SOFT_RESET_MASK, - ~UVD_JMI_CNTL__SOFT_RESET_MASK); - - jpeg_v2_5_enable_clock_gating(adev, i); - - /* enable anti hang mechanism */ - WREG32_P(SOC15_REG_OFFSET(JPEG, i, mmUVD_JPEG_POWER_STATUS), - UVD_JPEG_POWER_STATUS__JPEG_POWER_STATUS_MASK, - ~UVD_JPEG_POWER_STATUS__JPEG_POWER_STATUS_MASK); + jpeg_v2_5_stop_inst(adev, i); } return 0; -- 2.51.0 From 09e24a0b5243adf6e977288beb34497910a00cfe Mon Sep 17 00:00:00 2001 From: Sathishkumar S Date: Wed, 29 Jan 2025 19:18:11 +0530 Subject: [PATCH 03/16] drm/amdgpu: Add ring reset callback for JPEG2_5_0 MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit Add ring reset function callback for JPEG2_5_0 to recover from job timeouts without a full gpu reset. Signed-off-by: Sathishkumar S Acked-by: Christian König Reviewed-by: Leo Liu Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/jpeg_v2_5.c | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/amdgpu/jpeg_v2_5.c b/drivers/gpu/drm/amd/amdgpu/jpeg_v2_5.c index 0490b672d8d3..0a2c1dee2430 100644 --- a/drivers/gpu/drm/amd/amdgpu/jpeg_v2_5.c +++ b/drivers/gpu/drm/amd/amdgpu/jpeg_v2_5.c @@ -167,7 +167,10 @@ static int jpeg_v2_5_sw_init(struct amdgpu_ip_block *ip_block) if (r) return r; - return 0; + adev->jpeg.supported_reset = AMDGPU_RESET_TYPE_PER_QUEUE; + r = amdgpu_jpeg_sysfs_reset_mask_init(adev); + + return r; } /** @@ -186,6 +189,8 @@ static int jpeg_v2_5_sw_fini(struct amdgpu_ip_block *ip_block) if (r) return r; + amdgpu_jpeg_sysfs_reset_mask_fini(adev); + r = amdgpu_jpeg_sw_fini(adev); return r; @@ -638,6 +643,13 @@ static int jpeg_v2_5_process_interrupt(struct amdgpu_device *adev, return 0; } +static int jpeg_v2_5_ring_reset(struct amdgpu_ring *ring, unsigned int vmid) +{ + jpeg_v2_5_stop_inst(ring->adev, ring->me); + jpeg_v2_5_start_inst(ring->adev, ring->me); + return amdgpu_ring_test_helper(ring); +} + static const struct amd_ip_funcs jpeg_v2_5_ip_funcs = { .name = "jpeg_v2_5", .early_init = jpeg_v2_5_early_init, @@ -700,6 +712,7 @@ static const struct amdgpu_ring_funcs jpeg_v2_5_dec_ring_vm_funcs = { .emit_wreg = jpeg_v2_0_dec_ring_emit_wreg, .emit_reg_wait = jpeg_v2_0_dec_ring_emit_reg_wait, .emit_reg_write_reg_wait = amdgpu_ring_emit_reg_write_reg_wait_helper, + .reset = jpeg_v2_5_ring_reset, }; static const struct amdgpu_ring_funcs jpeg_v2_6_dec_ring_vm_funcs = { @@ -730,6 +743,7 @@ static const struct amdgpu_ring_funcs jpeg_v2_6_dec_ring_vm_funcs = { .emit_wreg = jpeg_v2_0_dec_ring_emit_wreg, .emit_reg_wait = jpeg_v2_0_dec_ring_emit_reg_wait, .emit_reg_write_reg_wait = amdgpu_ring_emit_reg_write_reg_wait_helper, + .reset = jpeg_v2_5_ring_reset, }; static void jpeg_v2_5_set_dec_ring_funcs(struct amdgpu_device *adev) -- 2.51.0 From 500c04d2a70876c9fd49070606d948e96d32760e Mon Sep 17 00:00:00 2001 From: Sathishkumar S Date: Wed, 29 Jan 2025 19:34:06 +0530 Subject: [PATCH 04/16] drm/amdgpu: Add ring reset callback for JPEG2_0_0 MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit Add ring reset function callback for JPEG2_0_0 to recover from job timeouts without a full gpu reset. Signed-off-by: Sathishkumar S Acked-by: Christian König Reviewed-by: Leo Liu Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/jpeg_v2_0.c | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/amdgpu/jpeg_v2_0.c b/drivers/gpu/drm/amd/amdgpu/jpeg_v2_0.c index 8c61081746c6..75843a0e3bfb 100644 --- a/drivers/gpu/drm/amd/amdgpu/jpeg_v2_0.c +++ b/drivers/gpu/drm/amd/amdgpu/jpeg_v2_0.c @@ -118,7 +118,10 @@ static int jpeg_v2_0_sw_init(struct amdgpu_ip_block *ip_block) if (r) return r; - return 0; + adev->jpeg.supported_reset = AMDGPU_RESET_TYPE_PER_QUEUE; + r = amdgpu_jpeg_sysfs_reset_mask_init(adev); + + return r; } /** @@ -137,6 +140,8 @@ static int jpeg_v2_0_sw_fini(struct amdgpu_ip_block *ip_block) if (r) return r; + amdgpu_jpeg_sysfs_reset_mask_fini(adev); + r = amdgpu_jpeg_sw_fini(adev); return r; @@ -759,6 +764,13 @@ static int jpeg_v2_0_process_interrupt(struct amdgpu_device *adev, return 0; } +static int jpeg_v2_0_ring_reset(struct amdgpu_ring *ring, unsigned int vmid) +{ + jpeg_v2_0_stop(ring->adev); + jpeg_v2_0_start(ring->adev); + return amdgpu_ring_test_helper(ring); +} + static const struct amd_ip_funcs jpeg_v2_0_ip_funcs = { .name = "jpeg_v2_0", .early_init = jpeg_v2_0_early_init, @@ -804,6 +816,7 @@ static const struct amdgpu_ring_funcs jpeg_v2_0_dec_ring_vm_funcs = { .emit_wreg = jpeg_v2_0_dec_ring_emit_wreg, .emit_reg_wait = jpeg_v2_0_dec_ring_emit_reg_wait, .emit_reg_write_reg_wait = amdgpu_ring_emit_reg_write_reg_wait_helper, + .reset = jpeg_v2_0_ring_reset, }; static void jpeg_v2_0_set_dec_ring_funcs(struct amdgpu_device *adev) -- 2.51.0 From 80513e389765c8f9543b26d8fa4bbdf0e59ff8bc Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Thu, 13 Feb 2025 13:37:01 -0500 Subject: [PATCH 05/16] drm/amdgpu/gfx: only call mes for enforce isolation if supported This should not be called on chips without MES so check if MES is enabled and if the cleaner shader is supported. Fixes: 8521e3c5f058 ("drm/amd/amdgpu: limit single process inside MES") Reviewed-by: Srinivasan Shanmugam Signed-off-by: Alex Deucher Cc: Shaoyun Liu Cc: Srinivasan Shanmugam --- drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c index 27f5318c3a26..b9bd6654f317 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c @@ -1670,11 +1670,13 @@ static ssize_t amdgpu_gfx_set_enforce_isolation(struct device *dev, if (adev->enforce_isolation[i] && !partition_values[i]) { /* Going from enabled to disabled */ amdgpu_vmid_free_reserved(adev, AMDGPU_GFXHUB(i)); - amdgpu_mes_set_enforce_isolation(adev, i, false); + if (adev->enable_mes && adev->gfx.enable_cleaner_shader) + amdgpu_mes_set_enforce_isolation(adev, i, false); } else if (!adev->enforce_isolation[i] && partition_values[i]) { /* Going from disabled to enabled */ amdgpu_vmid_alloc_reserved(adev, AMDGPU_GFXHUB(i)); - amdgpu_mes_set_enforce_isolation(adev, i, true); + if (adev->enable_mes && adev->gfx.enable_cleaner_shader) + amdgpu_mes_set_enforce_isolation(adev, i, true); } adev->enforce_isolation[i] = partition_values[i]; } -- 2.51.0 From fe652becdbfccf265f4cea0eb379418d08c6596a Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Wed, 12 Feb 2025 16:20:15 -0500 Subject: [PATCH 06/16] drm/amdgpu/umsch: declare umsch firmware Needed to be properly picked up for the initrd, etc. Fixes: 3488c79beafa ("drm/amdgpu: add initial support for UMSCH") Reviewed-by: Saleemkhan Jamadar Signed-off-by: Alex Deucher Cc: Lang Yu --- drivers/gpu/drm/amd/amdgpu/amdgpu_umsch_mm.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_umsch_mm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_umsch_mm.c index 2cfa2447d13e..78319988b054 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_umsch_mm.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_umsch_mm.c @@ -32,6 +32,8 @@ #include "amdgpu_umsch_mm.h" #include "umsch_mm_v4_0.h" +MODULE_FIRMWARE("amdgpu/umsch_mm_4_0_0.bin"); + int amdgpu_umsch_mm_submit_pkt(struct amdgpu_umsch_mm *umsch, void *pkt, int ndws) { struct amdgpu_ring *ring = &umsch->ring; -- 2.51.0 From 5183e69090f07585fa6c7ef71d4301bc16a15c76 Mon Sep 17 00:00:00 2001 From: Amber Lin Date: Wed, 12 Feb 2025 14:26:00 -0500 Subject: [PATCH 07/16] drm/amdgpu: Remove extra checks for CPX As far as the number of XCCs, the number of compute partitions, and the number of memory partitions qualify, CPX is valid. Signed-off-by: Amber Lin Reviewed-by: Lijo Lazar Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/aqua_vanjaram.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/amdgpu/aqua_vanjaram.c b/drivers/gpu/drm/amd/amdgpu/aqua_vanjaram.c index e157d6d857b6..2753f282e42d 100644 --- a/drivers/gpu/drm/amd/amdgpu/aqua_vanjaram.c +++ b/drivers/gpu/drm/amd/amdgpu/aqua_vanjaram.c @@ -559,8 +559,11 @@ static bool __aqua_vanjaram_is_valid_mode(struct amdgpu_xcp_mgr *xcp_mgr, adev->gmc.num_mem_partitions == 4) && (num_xccs_per_xcp >= 2); case AMDGPU_CPX_PARTITION_MODE: + /* (num_xcc > 1) because 1 XCC is considered SPX, not CPX. + * (num_xcc % adev->gmc.num_mem_partitions) == 0 because + * num_compute_partitions can't be less than num_mem_partitions + */ return ((num_xcc > 1) && - (adev->gmc.num_mem_partitions == 1 || adev->gmc.num_mem_partitions == 4) && (num_xcc % adev->gmc.num_mem_partitions) == 0); default: return false; -- 2.51.0 From c917e39cbdcd9fff421184db6cc461cc58d52c17 Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Wed, 12 Feb 2025 16:31:43 -0500 Subject: [PATCH 08/16] drm/amdgpu/umsch: fix ucode check Return an error if the IP version doesn't match otherwise we end up passing a NULL string to amdgpu_ucode_request. We should never hit this in practice today since we only enable the umsch code on the supported IP versions, but add a check to be safe. Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202502130406.iWQ0eBug-lkp@intel.com/ Fixes: 020620424b27 ("drm/amd: Use a constant format string for amdgpu_ucode_request") Reviewed-by: Saleemkhan Jamadar Signed-off-by: Alex Deucher Cc: Arnd Bergmann Cc: Lang Yu --- drivers/gpu/drm/amd/amdgpu/amdgpu_umsch_mm.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_umsch_mm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_umsch_mm.c index 78319988b054..a7f2648245ec 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_umsch_mm.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_umsch_mm.c @@ -129,7 +129,7 @@ int amdgpu_umsch_mm_init_microcode(struct amdgpu_umsch_mm *umsch) fw_name = "amdgpu/umsch_mm_4_0_0.bin"; break; default: - break; + return -EINVAL; } r = amdgpu_ucode_request(adev, &adev->umsch_mm.fw, AMDGPU_UCODE_REQUIRED, -- 2.51.0 From 56763be4009f3be178e534f9b3c10594abec5b6e Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Wed, 12 Feb 2025 16:43:13 -0500 Subject: [PATCH 09/16] drm/amdgpu/umsch: tidy up the ucode name string handling Make the constant parts of the name part of the string we pass to amdgpu_ucode_request(). Only the version number varies from IP to IP. Reviewed-by: Saleemkhan Jamadar Signed-off-by: Alex Deucher Cc: Lang Yu --- drivers/gpu/drm/amd/amdgpu/amdgpu_umsch_mm.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_umsch_mm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_umsch_mm.c index a7f2648245ec..cd707d70a0bf 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_umsch_mm.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_umsch_mm.c @@ -126,14 +126,14 @@ int amdgpu_umsch_mm_init_microcode(struct amdgpu_umsch_mm *umsch) switch (amdgpu_ip_version(adev, VCN_HWIP, 0)) { case IP_VERSION(4, 0, 5): case IP_VERSION(4, 0, 6): - fw_name = "amdgpu/umsch_mm_4_0_0.bin"; + fw_name = "4_0_0"; break; default: return -EINVAL; } r = amdgpu_ucode_request(adev, &adev->umsch_mm.fw, AMDGPU_UCODE_REQUIRED, - "%s", fw_name); + "amdgpu/umsch_mm_%s.bin", fw_name); if (r) { release_firmware(adev->umsch_mm.fw); adev->umsch_mm.fw = NULL; -- 2.51.0 From 0487f50310cfeec1bb4480a67294fc7081c5ed22 Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Mon, 10 Feb 2025 17:45:14 -0500 Subject: [PATCH 10/16] drm/amdgpu/vcn5.0.1: use correct dpm helper The VCN and UVD helpers were split in commit ff69bba05f08 ("drm/amd/pm: add inst to dpm_set_powergating_by_smu") However, this happened in parallel to the vcn 5.0.1 development so it was missed there. Fixes: 346492f30ce3 ("drm/amdgpu: Add VCN_5_0_1 support") Reviewed-by: Lijo Lazar Signed-off-by: Alex Deucher Cc: Sonny Jiang Cc: Boyuan Zhang --- drivers/gpu/drm/amd/amdgpu/vcn_v5_0_1.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/vcn_v5_0_1.c b/drivers/gpu/drm/amd/amdgpu/vcn_v5_0_1.c index 8b463c977d08..8b0b3739a537 100644 --- a/drivers/gpu/drm/amd/amdgpu/vcn_v5_0_1.c +++ b/drivers/gpu/drm/amd/amdgpu/vcn_v5_0_1.c @@ -575,8 +575,10 @@ static int vcn_v5_0_1_start(struct amdgpu_device *adev) uint32_t tmp; int i, j, k, r, vcn_inst; - if (adev->pm.dpm_enabled) - amdgpu_dpm_enable_uvd(adev, true); + for (i = 0; i < adev->vcn.num_vcn_inst; ++i) { + if (adev->pm.dpm_enabled) + amdgpu_dpm_enable_vcn(adev, true, i); + } for (i = 0; i < adev->vcn.num_vcn_inst; ++i) { fw_shared = adev->vcn.inst[i].fw_shared.cpu_addr; @@ -816,8 +818,10 @@ static int vcn_v5_0_1_stop(struct amdgpu_device *adev) WREG32_SOC15(VCN, vcn_inst, regUVD_STATUS, 0); } - if (adev->pm.dpm_enabled) - amdgpu_dpm_enable_uvd(adev, false); + for (i = 0; i < adev->vcn.num_vcn_inst; ++i) { + if (adev->pm.dpm_enabled) + amdgpu_dpm_enable_vcn(adev, false, i); + } return 0; } -- 2.51.0 From 77802398097a55b62030466dd38efaf21c32ee76 Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Wed, 12 Feb 2025 09:58:01 -0500 Subject: [PATCH 11/16] drm/amdgpu/vcn5.0.1: drop dpm power helpers VCN 5.0.1 doesn't support powergating so there is no need to call these. Reviewed-by: Lijo Lazar Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/vcn_v5_0_1.c | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/vcn_v5_0_1.c b/drivers/gpu/drm/amd/amdgpu/vcn_v5_0_1.c index 8b0b3739a537..288a77179036 100644 --- a/drivers/gpu/drm/amd/amdgpu/vcn_v5_0_1.c +++ b/drivers/gpu/drm/amd/amdgpu/vcn_v5_0_1.c @@ -575,11 +575,6 @@ static int vcn_v5_0_1_start(struct amdgpu_device *adev) uint32_t tmp; int i, j, k, r, vcn_inst; - for (i = 0; i < adev->vcn.num_vcn_inst; ++i) { - if (adev->pm.dpm_enabled) - amdgpu_dpm_enable_vcn(adev, true, i); - } - for (i = 0; i < adev->vcn.num_vcn_inst; ++i) { fw_shared = adev->vcn.inst[i].fw_shared.cpu_addr; @@ -818,11 +813,6 @@ static int vcn_v5_0_1_stop(struct amdgpu_device *adev) WREG32_SOC15(VCN, vcn_inst, regUVD_STATUS, 0); } - for (i = 0; i < adev->vcn.num_vcn_inst; ++i) { - if (adev->pm.dpm_enabled) - amdgpu_dpm_enable_vcn(adev, false, i); - } - return 0; } -- 2.51.0 From eda80f1c2a00e3b62060de031f5bc547003f288d Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Wed, 12 Feb 2025 10:05:04 -0500 Subject: [PATCH 12/16] drm/amdgpu/vcn4.0.3: drop dpm power helpers VCN 4.0.3 doesn't support powergating so there is no need to call these. Reviewed-by: Lijo Lazar Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/vcn_v4_0_3.c | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/vcn_v4_0_3.c b/drivers/gpu/drm/amd/amdgpu/vcn_v4_0_3.c index f0716c10f23e..f1f07badb9b7 100644 --- a/drivers/gpu/drm/amd/amdgpu/vcn_v4_0_3.c +++ b/drivers/gpu/drm/amd/amdgpu/vcn_v4_0_3.c @@ -1127,11 +1127,6 @@ static int vcn_v4_0_3_start(struct amdgpu_device *adev) int i, j, k, r, vcn_inst; uint32_t tmp; - for (i = 0; i < adev->vcn.num_vcn_inst; ++i) { - if (adev->pm.dpm_enabled) - amdgpu_dpm_enable_vcn(adev, true, i); - } - for (i = 0; i < adev->vcn.num_vcn_inst; ++i) { if (adev->pg_flags & AMD_PG_SUPPORT_VCN_DPG) { r = vcn_v4_0_3_start_dpg_mode(adev, i, adev->vcn.indirect_sram); @@ -1403,11 +1398,6 @@ static int vcn_v4_0_3_stop(struct amdgpu_device *adev) vcn_v4_0_3_enable_clock_gating(adev, i); } Done: - for (i = 0; i < adev->vcn.num_vcn_inst; ++i) { - if (adev->pm.dpm_enabled) - amdgpu_dpm_enable_vcn(adev, false, i); - } - return 0; } -- 2.51.0 From 2012aff9815e8a0b6439a2fe73aedef12ba4595e Mon Sep 17 00:00:00 2001 From: Srinivasan Shanmugam Date: Thu, 13 Feb 2025 23:43:46 +0530 Subject: [PATCH 13/16] drm/amdgpu: Rename VCN clock gating function for consistency MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit Change the function name from vcn_v2_5_enable_clock_gating_inst to vcn_v2_5_enable_clock_gating to ensure consistency in naming. Fixes the below with gcc W=1: drivers/gpu/drm/amd/amdgpu/vcn_v2_5.c:781: warning: expecting prototype for vcn_v2_5_enable_clock_gating_inst(). Prototype was for vcn_v2_5_enable_clock_gating() instead Cc: Leo Liu Cc: Christian König Cc: Alex Deucher Signed-off-by: Srinivasan Shanmugam Reviewed-by: Alex Deucher Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/vcn_v2_5.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/amdgpu/vcn_v2_5.c b/drivers/gpu/drm/amd/amdgpu/vcn_v2_5.c index b9be304aa294..105e59f6132b 100644 --- a/drivers/gpu/drm/amd/amdgpu/vcn_v2_5.c +++ b/drivers/gpu/drm/amd/amdgpu/vcn_v2_5.c @@ -770,7 +770,7 @@ static void vcn_v2_5_clock_gating_dpg_mode(struct amdgpu_device *adev, } /** - * vcn_v2_5_enable_clock_gating_inst - enable VCN clock gating + * vcn_v2_5_enable_clock_gating - enable VCN clock gating * * @adev: amdgpu_device pointer * @i: instance to enable clockgating on -- 2.51.0 From 523b69c6544554ed7d4b2a009cea0c001b4077b3 Mon Sep 17 00:00:00 2001 From: Hawking Zhang Date: Fri, 24 Jan 2025 23:31:10 +0800 Subject: [PATCH 14/16] drm/amd/include: Add amd cper header AMD is using Common Platform Error Record (CPER) format to report all gpu hardware errors. v2: add program attribute Signed-off-by: Hawking Zhang Signed-off-by: Xiang Liu Reviewed-by: Tao Zhou Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/include/amd_cper.h | 269 +++++++++++++++++++++++++ 1 file changed, 269 insertions(+) create mode 100644 drivers/gpu/drm/amd/include/amd_cper.h diff --git a/drivers/gpu/drm/amd/include/amd_cper.h b/drivers/gpu/drm/amd/include/amd_cper.h new file mode 100644 index 000000000000..086869264425 --- /dev/null +++ b/drivers/gpu/drm/amd/include/amd_cper.h @@ -0,0 +1,269 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright 2025 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + * + */ +#ifndef __AMD_CPER_H__ +#define __AMD_CPER_H__ + +#include + +#define CPER_HDR_REV_1 (0x100) +#define CPER_SEC_MINOR_REV_1 (0x01) +#define CPER_SEC_MAJOR_REV_22 (0x22) +#define CPER_MAX_OAM_COUNT (8) + +#define CPER_CTX_TYPE_CRASH (1) +#define CPER_CTX_TYPE_BOOT (9) + +#define CPER_CREATOR_ID_AMDGPU "amdgpu" + +#define CPER_NOTIFY_MCE \ + GUID_INIT(0xE8F56FFE, 0x919C, 0x4cc5, 0xBA, 0x88, 0x65, 0xAB, \ + 0xE1, 0x49, 0x13, 0xBB) +#define CPER_NOTIFY_CMC \ + GUID_INIT(0x2DCE8BB1, 0xBDD7, 0x450e, 0xB9, 0xAD, 0x9C, 0xF4, \ + 0xEB, 0xD4, 0xF8, 0x90) +#define BOOT_TYPE \ + GUID_INIT(0x3D61A466, 0xAB40, 0x409a, 0xA6, 0x98, 0xF3, 0x62, \ + 0xD4, 0x64, 0xB3, 0x8F) + +#define AMD_CRASHDUMP \ + GUID_INIT(0x32AC0C78, 0x2623, 0x48F6, 0xB0, 0xD0, 0x73, 0x65, \ + 0x72, 0x5F, 0xD6, 0xAE) +#define AMD_GPU_NONSTANDARD_ERROR \ + GUID_INIT(0x32AC0C78, 0x2623, 0x48F6, 0x81, 0xA2, 0xAC, 0x69, \ + 0x17, 0x80, 0x55, 0x1D) +#define PROC_ERR_SECTION_TYPE \ + GUID_INIT(0xDC3EA0B0, 0xA144, 0x4797, 0xB9, 0x5B, 0x53, 0xFA, \ + 0x24, 0x2B, 0x6E, 0x1D) + +enum cper_error_severity { + CPER_SEV_NON_FATAL_UNCORRECTED = 0, + CPER_SEV_FATAL = 1, + CPER_SEV_NON_FATAL_CORRECTED = 2, + CPER_SEV_NUM = 3, + + CPER_SEV_UNUSED = 10, +}; + +enum cper_aca_reg { + CPER_ACA_REG_CTL_LO = 0, + CPER_ACA_REG_CTL_HI = 1, + CPER_ACA_REG_STATUS_LO = 2, + CPER_ACA_REG_STATUS_HI = 3, + CPER_ACA_REG_ADDR_LO = 4, + CPER_ACA_REG_ADDR_HI = 5, + CPER_ACA_REG_MISC0_LO = 6, + CPER_ACA_REG_MISC0_HI = 7, + CPER_ACA_REG_CONFIG_LO = 8, + CPER_ACA_REG_CONFIG_HI = 9, + CPER_ACA_REG_IPID_LO = 10, + CPER_ACA_REG_IPID_HI = 11, + CPER_ACA_REG_SYND_LO = 12, + CPER_ACA_REG_SYND_HI = 13, + + CPER_ACA_REG_COUNT = 32, +}; + +#pragma pack(push, 1) + +struct cper_timestamp { + uint8_t seconds; + uint8_t minutes; + uint8_t hours; + uint8_t flag; + uint8_t day; + uint8_t month; + uint8_t year; + uint8_t century; +}; + +struct cper_hdr { + char signature[4]; /* "CPER" */ + uint16_t revision; + uint32_t signature_end; /* 0xFFFFFFFF */ + uint16_t sec_cnt; + enum cper_error_severity error_severity; + union { + struct { + uint32_t platform_id : 1; + uint32_t timestamp : 1; + uint32_t partition_id : 1; + uint32_t reserved : 29; + } valid_bits; + uint32_t valid_mask; + }; + uint32_t record_length; /* Total size of CPER Entry */ + struct cper_timestamp timestamp; + char platform_id[16]; + guid_t partition_id; /* Reserved */ + char creator_id[16]; + guid_t notify_type; /* CMC, MCE */ + char record_id[8]; /* Unique CPER Entry ID */ + uint32_t flags; /* Reserved */ + uint64_t persistence_info; /* Reserved */ + uint8_t reserved[12]; /* Reserved */ +}; + +struct cper_sec_desc { + uint32_t sec_offset; /* Offset from the start of CPER entry */ + uint32_t sec_length; + uint8_t revision_minor; /* CPER_SEC_MINOR_REV_1 */ + uint8_t revision_major; /* CPER_SEC_MAJOR_REV_22 */ + union { + struct { + uint8_t fru_id : 1; + uint8_t fru_text : 1; + uint8_t reserved : 6; + } valid_bits; + uint8_t valid_mask; + }; + uint8_t reserved; + union { + struct { + uint32_t primary : 1; + uint32_t reserved1 : 2; + uint32_t exceed_err_threshold : 1; + uint32_t latent_err : 1; + uint32_t reserved2 : 27; + } flag_bits; + uint32_t flag_mask; + }; + guid_t sec_type; + char fru_id[16]; + enum cper_error_severity severity; + char fru_text[20]; +}; + +struct cper_sec_nonstd_err_hdr { + union { + struct { + uint64_t apic_id : 1; + uint64_t fw_id : 1; + uint64_t err_info_cnt : 6; + uint64_t err_context_cnt : 6; + } valid_bits; + uint64_t valid_mask; + }; + uint64_t apic_id; + char fw_id[48]; +}; + +struct cper_sec_nonstd_err_info { + guid_t error_type; + union { + struct { + uint64_t ms_chk : 1; + uint64_t target_addr_id : 1; + uint64_t req_id : 1; + uint64_t resp_id : 1; + uint64_t instr_ptr : 1; + uint64_t reserved : 59; + } valid_bits; + uint64_t valid_mask; + }; + union { + struct { + uint64_t err_type_valid : 1; + uint64_t pcc_valid : 1; + uint64_t uncorr_valid : 1; + uint64_t precise_ip_valid : 1; + uint64_t restartable_ip_valid : 1; + uint64_t overflow_valid : 1; + uint64_t reserved1 : 10; + uint64_t err_type : 2; + uint64_t pcc : 1; + uint64_t uncorr : 1; + uint64_t precised_ip : 1; + uint64_t restartable_ip : 1; + uint64_t overflow : 1; + uint64_t reserved2 : 41; + } ms_chk_bits; + uint64_t ms_chk_mask; + }; + uint64_t target_addr_id; + uint64_t req_id; + uint64_t resp_id; + uint64_t instr_ptr; +}; + +struct cper_sec_nonstd_err_ctx { + uint16_t reg_ctx_type; + uint16_t reg_arr_size; + uint32_t msr_addr; + uint64_t mm_reg_addr; + uint32_t reg_dump[CPER_ACA_REG_COUNT]; +}; + +struct cper_sec_nonstd_err { + struct cper_sec_nonstd_err_hdr hdr; + struct cper_sec_nonstd_err_info info; + struct cper_sec_nonstd_err_ctx ctx; +}; + +struct cper_sec_crashdump_hdr { + uint64_t reserved1; + uint64_t reserved2; + char fw_id[48]; + uint64_t reserved3[8]; +}; + +struct cper_sec_crashdump_reg_data { + uint32_t status_lo; + uint32_t status_hi; + uint32_t addr_lo; + uint32_t addr_hi; + uint32_t ipid_lo; + uint32_t ipid_hi; + uint32_t synd_lo; + uint32_t synd_hi; +}; + +struct cper_sec_crashdump_body_fatal { + uint16_t reg_ctx_type; + uint16_t reg_arr_size; + uint32_t reserved1; + uint64_t reserved2; + struct cper_sec_crashdump_reg_data data; +}; + +struct cper_sec_crashdump_body_boot { + uint16_t reg_ctx_type; + uint16_t reg_arr_size; + uint32_t reserved1; + uint64_t reserved2; + uint64_t msg[CPER_MAX_OAM_COUNT]; +}; + +struct cper_sec_crashdump_fatal { + struct cper_sec_crashdump_hdr hdr; + struct cper_sec_crashdump_body_fatal body; +}; + +struct cper_sec_crashdump_boot { + struct cper_sec_crashdump_hdr hdr; + struct cper_sec_crashdump_body_boot body; +}; + +#pragma pack(pop) + +#endif -- 2.51.0 From 92d5d2a09de16706fca340b6c1c197e562690da4 Mon Sep 17 00:00:00 2001 From: Hawking Zhang Date: Fri, 24 Jan 2025 23:37:33 +0800 Subject: [PATCH 15/16] drm/amdgpu: Introduce funcs for populating CPER Introduce utility functions designed to assist in populating CPER records. v2: call cper_init/fini in device_ip_init/fini. Signed-off-by: Hawking Zhang Reviewed-by: Tao Zhou Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/Makefile | 3 +- drivers/gpu/drm/amd/amdgpu/amdgpu.h | 4 + drivers/gpu/drm/amd/amdgpu/amdgpu_cper.c | 281 +++++++++++++++++++++ drivers/gpu/drm/amd/amdgpu/amdgpu_cper.h | 91 +++++++ drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 4 + 5 files changed, 382 insertions(+), 1 deletion(-) create mode 100644 drivers/gpu/drm/amd/amdgpu/amdgpu_cper.c create mode 100644 drivers/gpu/drm/amd/amdgpu/amdgpu_cper.h diff --git a/drivers/gpu/drm/amd/amdgpu/Makefile b/drivers/gpu/drm/amd/amdgpu/Makefile index 5b21674b07fb..aacc810cabb3 100644 --- a/drivers/gpu/drm/amd/amdgpu/Makefile +++ b/drivers/gpu/drm/amd/amdgpu/Makefile @@ -65,7 +65,8 @@ amdgpu-y += amdgpu_device.o amdgpu_doorbell_mgr.o amdgpu_kms.o \ amdgpu_umc.o smu_v11_0_i2c.o amdgpu_fru_eeprom.o amdgpu_rap.o \ amdgpu_fw_attestation.o amdgpu_securedisplay.o \ amdgpu_eeprom.o amdgpu_mca.o amdgpu_psp_ta.o amdgpu_lsdma.o \ - amdgpu_ring_mux.o amdgpu_xcp.o amdgpu_seq64.o amdgpu_aca.o amdgpu_dev_coredump.o + amdgpu_ring_mux.o amdgpu_xcp.o amdgpu_seq64.o amdgpu_aca.o amdgpu_dev_coredump.o \ + amdgpu_cper.o amdgpu-$(CONFIG_PROC_FS) += amdgpu_fdinfo.o diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h index 0dbea25ab888..cb24b464be2b 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h @@ -109,6 +109,7 @@ #include "amdgpu_mca.h" #include "amdgpu_aca.h" #include "amdgpu_ras.h" +#include "amdgpu_cper.h" #include "amdgpu_xcp.h" #include "amdgpu_seq64.h" #include "amdgpu_reg_state.h" @@ -1091,6 +1092,9 @@ struct amdgpu_device { /* ACA */ struct amdgpu_aca aca; + /* CPER */ + struct amdgpu_cper cper; + struct amdgpu_ip_block ip_blocks[AMDGPU_MAX_IP_NUM]; uint32_t harvest_ip_mask; int num_ip_blocks; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cper.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_cper.c new file mode 100644 index 000000000000..8ce5dc6efcf9 --- /dev/null +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cper.c @@ -0,0 +1,281 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright 2025 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + * + */ +#include "amdgpu.h" + +static const guid_t MCE = CPER_NOTIFY_MCE; +static const guid_t CMC = CPER_NOTIFY_CMC; +static const guid_t BOOT = BOOT_TYPE; + +static const guid_t CRASHDUMP = AMD_CRASHDUMP; +static const guid_t RUNTIME = AMD_GPU_NONSTANDARD_ERROR; + +static void __inc_entry_length(struct cper_hdr *hdr, uint32_t size) +{ + hdr->record_length += size; +} + +void amdgpu_cper_entry_fill_hdr(struct amdgpu_device *adev, + struct cper_hdr *hdr, + enum amdgpu_cper_type type, + enum cper_error_severity sev) +{ + hdr->signature[0] = 'C'; + hdr->signature[1] = 'P'; + hdr->signature[2] = 'E'; + hdr->signature[3] = 'R'; + hdr->revision = CPER_HDR_REV_1; + hdr->signature_end = 0xFFFFFFFF; + hdr->error_severity = sev; + + hdr->valid_bits.platform_id = 1; + hdr->valid_bits.partition_id = 1; + hdr->valid_bits.timestamp = 1; + /*TODO need to initialize hdr->timestamp */ + + snprintf(hdr->record_id, 8, "%d", atomic_inc_return(&adev->cper.unique_id)); + snprintf(hdr->platform_id, 16, "0x%04X:0x%04X", + adev->pdev->vendor, adev->pdev->device); + /* pmfw version should be part of creator_id according to CPER spec */ + snprintf(hdr->creator_id, 16, "%s", CPER_CREATOR_ID_AMDGPU); + + switch (type) { + case AMDGPU_CPER_TYPE_BOOT: + hdr->notify_type = BOOT; + break; + case AMDGPU_CPER_TYPE_FATAL: + case AMDGPU_CPER_TYPE_BP_THRESHOLD: + hdr->notify_type = MCE; + break; + case AMDGPU_CPER_TYPE_RUNTIME: + if (sev == CPER_SEV_NON_FATAL_CORRECTED) + hdr->notify_type = CMC; + else + hdr->notify_type = MCE; + break; + default: + dev_err(adev->dev, "Unknown CPER Type\n"); + break; + } + + __inc_entry_length(hdr, HDR_LEN); +} + +static int amdgpu_cper_entry_fill_section_desc(struct amdgpu_device *adev, + struct cper_sec_desc *section_desc, + bool bp_threshold, + bool poison, + enum cper_error_severity sev, + guid_t sec_type, + uint32_t section_length, + uint32_t section_offset) +{ + section_desc->revision_minor = CPER_SEC_MINOR_REV_1; + section_desc->revision_major = CPER_SEC_MAJOR_REV_22; + section_desc->sec_offset = section_offset; + section_desc->sec_length = section_length; + section_desc->valid_bits.fru_id = 1; + section_desc->valid_bits.fru_text = 1; + section_desc->flag_bits.primary = 1; + section_desc->severity = sev; + section_desc->sec_type = sec_type; + + if (adev->smuio.funcs && + adev->smuio.funcs->get_socket_id) + snprintf(section_desc->fru_text, 20, "OAM%d", + adev->smuio.funcs->get_socket_id(adev)); + /* TODO: fru_id is 16 bytes in CPER spec, but driver defines it as 20 bytes */ + snprintf(section_desc->fru_id, 16, "%llx", adev->unique_id); + + if (bp_threshold) + section_desc->flag_bits.exceed_err_threshold = 1; + if (poison) + section_desc->flag_bits.latent_err = 1; + + return 0; +} + +int amdgpu_cper_entry_fill_fatal_section(struct amdgpu_device *adev, + struct cper_hdr *hdr, + uint32_t idx, + struct cper_sec_crashdump_reg_data reg_data) +{ + struct cper_sec_desc *section_desc; + struct cper_sec_crashdump_fatal *section; + + section_desc = (struct cper_sec_desc *)((uint8_t *)hdr + SEC_DESC_OFFSET(idx)); + section = (struct cper_sec_crashdump_fatal *)((uint8_t *)hdr + + FATAL_SEC_OFFSET(hdr->sec_cnt, idx)); + + amdgpu_cper_entry_fill_section_desc(adev, section_desc, false, false, + CPER_SEV_FATAL, CRASHDUMP, FATAL_SEC_LEN, + FATAL_SEC_OFFSET(hdr->sec_cnt, idx)); + + section->body.reg_ctx_type = CPER_CTX_TYPE_CRASH; + section->body.reg_arr_size = sizeof(reg_data); + section->body.data = reg_data; + + __inc_entry_length(hdr, SEC_DESC_LEN + FATAL_SEC_LEN); + + return 0; +} + +int amdgpu_cper_entry_fill_runtime_section(struct amdgpu_device *adev, + struct cper_hdr *hdr, + uint32_t idx, + enum cper_error_severity sev, + uint32_t *reg_dump, + uint32_t reg_count) +{ + struct cper_sec_desc *section_desc; + struct cper_sec_nonstd_err *section; + bool poison; + + poison = (sev == CPER_SEV_NON_FATAL_CORRECTED) ? false : true; + section_desc = (struct cper_sec_desc *)((uint8_t *)hdr + SEC_DESC_OFFSET(idx)); + section = (struct cper_sec_nonstd_err *)((uint8_t *)hdr + + NONSTD_SEC_OFFSET(hdr->sec_cnt, idx)); + + amdgpu_cper_entry_fill_section_desc(adev, section_desc, false, poison, + sev, RUNTIME, NONSTD_SEC_LEN, + NONSTD_SEC_OFFSET(hdr->sec_cnt, idx)); + + reg_count = min(reg_count, CPER_ACA_REG_COUNT); + + section->hdr.valid_bits.err_info_cnt = 1; + section->hdr.valid_bits.err_context_cnt = 1; + + section->info.error_type = RUNTIME; + section->info.ms_chk_bits.err_type_valid = 1; + section->ctx.reg_ctx_type = CPER_CTX_TYPE_CRASH; + section->ctx.reg_arr_size = sizeof(section->ctx.reg_dump); + + memcpy(section->ctx.reg_dump, reg_dump, reg_count * sizeof(uint32_t)); + + __inc_entry_length(hdr, SEC_DESC_LEN + NONSTD_SEC_LEN); + + return 0; +} + +int amdgpu_cper_entry_fill_bad_page_threshold_section(struct amdgpu_device *adev, + struct cper_hdr *hdr, + uint32_t idx) +{ + struct cper_sec_desc *section_desc; + struct cper_sec_nonstd_err *section; + + section_desc = (struct cper_sec_desc *)((uint8_t *)hdr + SEC_DESC_OFFSET(idx)); + section = (struct cper_sec_nonstd_err *)((uint8_t *)hdr + + NONSTD_SEC_OFFSET(hdr->sec_cnt, idx)); + + amdgpu_cper_entry_fill_section_desc(adev, section_desc, true, false, + CPER_SEV_FATAL, RUNTIME, NONSTD_SEC_LEN, + NONSTD_SEC_OFFSET(hdr->sec_cnt, idx)); + + section->hdr.valid_bits.err_info_cnt = 1; + section->hdr.valid_bits.err_context_cnt = 1; + + section->info.error_type = RUNTIME; + section->info.ms_chk_bits.err_type_valid = 1; + section->ctx.reg_ctx_type = CPER_CTX_TYPE_CRASH; + section->ctx.reg_arr_size = sizeof(section->ctx.reg_dump); + + /* Hardcoded Reg dump for bad page threshold CPER */ + section->ctx.reg_dump[CPER_ACA_REG_CTL_LO] = 0x1; + section->ctx.reg_dump[CPER_ACA_REG_CTL_HI] = 0x0; + section->ctx.reg_dump[CPER_ACA_REG_STATUS_LO] = 0x137; + section->ctx.reg_dump[CPER_ACA_REG_STATUS_HI] = 0xB0000000; + section->ctx.reg_dump[CPER_ACA_REG_ADDR_LO] = 0x0; + section->ctx.reg_dump[CPER_ACA_REG_ADDR_HI] = 0x0; + section->ctx.reg_dump[CPER_ACA_REG_MISC0_LO] = 0x0; + section->ctx.reg_dump[CPER_ACA_REG_MISC0_HI] = 0x0; + section->ctx.reg_dump[CPER_ACA_REG_CONFIG_LO] = 0x2; + section->ctx.reg_dump[CPER_ACA_REG_CONFIG_HI] = 0x1ff; + section->ctx.reg_dump[CPER_ACA_REG_IPID_LO] = 0x0; + section->ctx.reg_dump[CPER_ACA_REG_IPID_HI] = 0x96; + section->ctx.reg_dump[CPER_ACA_REG_SYND_LO] = 0x0; + section->ctx.reg_dump[CPER_ACA_REG_SYND_HI] = 0x0; + + __inc_entry_length(hdr, SEC_DESC_LEN + NONSTD_SEC_LEN); + + return 0; +} + +struct cper_hdr *amdgpu_cper_alloc_entry(struct amdgpu_device *adev, + enum amdgpu_cper_type type, + uint16_t section_count) +{ + struct cper_hdr *hdr; + uint32_t size = 0; + + size += HDR_LEN; + size += (SEC_DESC_LEN * section_count); + + switch (type) { + case AMDGPU_CPER_TYPE_RUNTIME: + case AMDGPU_CPER_TYPE_BP_THRESHOLD: + size += (NONSTD_SEC_LEN * section_count); + break; + case AMDGPU_CPER_TYPE_FATAL: + size += (FATAL_SEC_LEN * section_count); + break; + case AMDGPU_CPER_TYPE_BOOT: + size += (BOOT_SEC_LEN * section_count); + break; + default: + dev_err(adev->dev, "Unknown CPER Type!\n"); + return NULL; + } + + hdr = kzalloc(size, GFP_KERNEL); + if (!hdr) + return NULL; + + /* Save this early */ + hdr->sec_cnt = section_count; + + return hdr; +} + +int amdgpu_cper_init(struct amdgpu_device *adev) +{ + mutex_init(&adev->cper.cper_lock); + + adev->cper.enabled = true; + adev->cper.max_count = CPER_MAX_ALLOWED_COUNT; + + /*TODO: initialize cper ring*/ + + return 0; +} + +int amdgpu_cper_fini(struct amdgpu_device *adev) +{ + adev->cper.enabled = false; + + /*TODO: free cper ring */ + adev->cper.count = 0; + adev->cper.wptr = 0; + + return 0; +} diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cper.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_cper.h new file mode 100644 index 000000000000..0ae845420983 --- /dev/null +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cper.h @@ -0,0 +1,91 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright 2025 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + * + */ + +#ifndef __AMDGPU_CPER_H__ +#define __AMDGPU_CPER_H__ + +#include "amd_cper.h" + +#define CPER_MAX_ALLOWED_COUNT 0x1000 +#define HDR_LEN (sizeof(struct cper_hdr)) +#define SEC_DESC_LEN (sizeof(struct cper_sec_desc)) + +#define BOOT_SEC_LEN (sizeof(struct cper_sec_crashdump_boot)) +#define FATAL_SEC_LEN (sizeof(struct cper_sec_crashdump_fatal)) +#define NONSTD_SEC_LEN (sizeof(struct cper_sec_nonstd_err)) + +#define SEC_DESC_OFFSET(idx) (HDR_LEN + (SEC_DESC_LEN * idx)) + +#define BOOT_SEC_OFFSET(count, idx) (HDR_LEN + (SEC_DESC_LEN * count) + (BOOT_SEC_LEN * idx)) +#define FATAL_SEC_OFFSET(count, idx) (HDR_LEN + (SEC_DESC_LEN * count) + (FATAL_SEC_LEN * idx)) +#define NONSTD_SEC_OFFSET(count, idx) (HDR_LEN + (SEC_DESC_LEN * count) + (NONSTD_SEC_LEN * idx)) + +enum amdgpu_cper_type { + AMDGPU_CPER_TYPE_RUNTIME, + AMDGPU_CPER_TYPE_FATAL, + AMDGPU_CPER_TYPE_BOOT, + AMDGPU_CPER_TYPE_BP_THRESHOLD, +}; + +struct amdgpu_cper { + bool enabled; + + atomic_t unique_id; + struct mutex cper_lock; + + /* Lifetime CPERs generated */ + uint32_t count; + uint32_t max_count; + + uint32_t wptr; + + void *ring[CPER_MAX_ALLOWED_COUNT]; +}; + +void amdgpu_cper_entry_fill_hdr(struct amdgpu_device *adev, + struct cper_hdr *hdr, + enum amdgpu_cper_type type, + enum cper_error_severity sev); +int amdgpu_cper_entry_fill_fatal_section(struct amdgpu_device *adev, + struct cper_hdr *hdr, + uint32_t idx, + struct cper_sec_crashdump_reg_data reg_data); +int amdgpu_cper_entry_fill_runtime_section(struct amdgpu_device *adev, + struct cper_hdr *hdr, + uint32_t idx, + enum cper_error_severity sev, + uint32_t *reg_dump, + uint32_t reg_count); +int amdgpu_cper_entry_fill_bad_page_threshold_section(struct amdgpu_device *adev, + struct cper_hdr *hdr, + uint32_t section_idx); + +struct cper_hdr *amdgpu_cper_alloc_entry(struct amdgpu_device *adev, + enum amdgpu_cper_type type, + uint16_t section_count); + +int amdgpu_cper_init(struct amdgpu_device *adev); +int amdgpu_cper_fini(struct amdgpu_device *adev); + +#endif diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index e1fc020a99bd..0de2476c2ee7 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -3091,6 +3091,8 @@ static int amdgpu_device_ip_init(struct amdgpu_device *adev) amdgpu_fru_get_product_info(adev); + r = amdgpu_cper_init(adev); + init_failed: return r; @@ -3451,6 +3453,8 @@ static int amdgpu_device_ip_fini(struct amdgpu_device *adev) { int i, r; + amdgpu_cper_fini(adev); + if (amdgpu_sriov_vf(adev) && adev->virt.ras_init_done) amdgpu_virt_release_ras_err_handler_data(adev); -- 2.51.0 From 76b1f8b32dc1ad8bb5f6fe2faa669e2858d35932 Mon Sep 17 00:00:00 2001 From: Candice Li Date: Tue, 11 Feb 2025 09:58:24 +0800 Subject: [PATCH 16/16] drm/amdgpu: Optimize the enablement of GECC Enable GECC only when the default memory ECC mode or the module parameter amdgpu_ras_enable is activated. v2: Add kernel message to remind users explicitly set amdgpu_ras_enable=1 before driver loading to enable GECC and set amdgpu_ras_enable=0 to disable GECC when GECC is currently enabled if needed. Signed-off-by: Candice Li Reviewed-by: Hawking Zhang Acked-by: Alex Deucher Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu.h | 1 + .../gpu/drm/amd/amdgpu/amdgpu_atomfirmware.c | 18 +++-- drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c | 65 +++++++++++-------- 3 files changed, 52 insertions(+), 32 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h index cb24b464be2b..2b1990ea9639 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h @@ -1154,6 +1154,7 @@ struct amdgpu_device { struct ratelimit_state throttling_logging_rs; uint32_t ras_hw_enabled; uint32_t ras_enabled; + bool ras_default_ecc_enabled; bool no_hw_access; struct pci_saved_state *pci_state; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_atomfirmware.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_atomfirmware.c index f873dd3cae16..eb015bdda8a7 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_atomfirmware.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_atomfirmware.c @@ -549,9 +549,10 @@ bool amdgpu_atomfirmware_mem_ecc_supported(struct amdgpu_device *adev) u16 data_offset, size; union umc_info *umc_info; u8 frev, crev; - bool ecc_default_enabled = false; + bool mem_ecc_enabled = false; u8 umc_config; u32 umc_config1; + adev->ras_default_ecc_enabled = false; index = get_index_into_master_table(atom_master_list_of_data_tables_v2_1, umc_info); @@ -563,20 +564,22 @@ bool amdgpu_atomfirmware_mem_ecc_supported(struct amdgpu_device *adev) switch (crev) { case 1: umc_config = le32_to_cpu(umc_info->v31.umc_config); - ecc_default_enabled = + mem_ecc_enabled = (umc_config & UMC_CONFIG__DEFAULT_MEM_ECC_ENABLE) ? true : false; break; case 2: umc_config = le32_to_cpu(umc_info->v32.umc_config); - ecc_default_enabled = + mem_ecc_enabled = (umc_config & UMC_CONFIG__DEFAULT_MEM_ECC_ENABLE) ? true : false; break; case 3: umc_config = le32_to_cpu(umc_info->v33.umc_config); umc_config1 = le32_to_cpu(umc_info->v33.umc_config1); - ecc_default_enabled = + mem_ecc_enabled = ((umc_config & UMC_CONFIG__DEFAULT_MEM_ECC_ENABLE) || (umc_config1 & UMC_CONFIG1__ENABLE_ECC_CAPABLE)) ? true : false; + adev->ras_default_ecc_enabled = + (umc_config & UMC_CONFIG__DEFAULT_MEM_ECC_ENABLE) ? true : false; break; default: /* unsupported crev */ @@ -585,9 +588,12 @@ bool amdgpu_atomfirmware_mem_ecc_supported(struct amdgpu_device *adev) } else if (frev == 4) { switch (crev) { case 0: + umc_config = le32_to_cpu(umc_info->v40.umc_config); umc_config1 = le32_to_cpu(umc_info->v40.umc_config1); - ecc_default_enabled = + mem_ecc_enabled = (umc_config1 & UMC_CONFIG1__ENABLE_ECC_CAPABLE) ? true : false; + adev->ras_default_ecc_enabled = + (umc_config & UMC_CONFIG__DEFAULT_MEM_ECC_ENABLE) ? true : false; break; default: /* unsupported crev */ @@ -599,7 +605,7 @@ bool amdgpu_atomfirmware_mem_ecc_supported(struct amdgpu_device *adev) } } - return ecc_default_enabled; + return mem_ecc_enabled; } /* diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c index 94f5e5e0d1d6..0f2eb69ad715 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c @@ -1794,34 +1794,47 @@ int psp_ras_initialize(struct psp_context *psp) if (ret) dev_warn(adev->dev, "PSP get boot config failed\n"); - if (!amdgpu_ras_is_supported(psp->adev, AMDGPU_RAS_BLOCK__UMC)) { - if (!boot_cfg) { - dev_info(adev->dev, "GECC is disabled\n"); - } else { - /* disable GECC in next boot cycle if ras is - * disabled by module parameter amdgpu_ras_enable - * and/or amdgpu_ras_mask, or boot_config_get call - * is failed - */ - ret = psp_boot_config_set(adev, 0); - if (ret) - dev_warn(adev->dev, "PSP set boot config failed\n"); - else - dev_warn(adev->dev, "GECC will be disabled in next boot cycle if set amdgpu_ras_enable and/or amdgpu_ras_mask to 0x0\n"); - } + if (boot_cfg == 1 && !adev->ras_default_ecc_enabled && + amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__UMC)) { + dev_warn(adev->dev, "GECC is currently enabled, which may affect performance\n"); + dev_warn(adev->dev, + "To disable GECC, please reboot the system and load the amdgpu driver with the parameter amdgpu_ras_enable=0\n"); } else { - if (boot_cfg == 1) { - dev_info(adev->dev, "GECC is enabled\n"); + if ((adev->ras_default_ecc_enabled || amdgpu_ras_enable == 1) && + amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__UMC)) { + if (boot_cfg == 1) { + dev_info(adev->dev, "GECC is enabled\n"); + } else { + /* enable GECC in next boot cycle if it is disabled + * in boot config, or force enable GECC if failed to + * get boot configuration + */ + ret = psp_boot_config_set(adev, BOOT_CONFIG_GECC); + if (ret) + dev_warn(adev->dev, "PSP set boot config failed\n"); + else + dev_warn(adev->dev, "GECC will be enabled in next boot cycle\n"); + } } else { - /* enable GECC in next boot cycle if it is disabled - * in boot config, or force enable GECC if failed to - * get boot configuration - */ - ret = psp_boot_config_set(adev, BOOT_CONFIG_GECC); - if (ret) - dev_warn(adev->dev, "PSP set boot config failed\n"); - else - dev_warn(adev->dev, "GECC will be enabled in next boot cycle\n"); + if (!boot_cfg) { + if (!adev->ras_default_ecc_enabled && + amdgpu_ras_enable != 1 && + amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__UMC)) + dev_warn(adev->dev, "GECC is disabled, set amdgpu_ras_enable=1 to enable GECC in next boot cycle if needed\n"); + else + dev_info(adev->dev, "GECC is disabled\n"); + } else { + /* disable GECC in next boot cycle if ras is + * disabled by module parameter amdgpu_ras_enable + * and/or amdgpu_ras_mask, or boot_config_get call + * is failed + */ + ret = psp_boot_config_set(adev, 0); + if (ret) + dev_warn(adev->dev, "PSP set boot config failed\n"); + else + dev_warn(adev->dev, "GECC will be disabled in next boot cycle if set amdgpu_ras_enable and/or amdgpu_ras_mask to 0x0\n"); + } } } } -- 2.51.0