From a86e0c0e94373aebc39c2efedaefc408f6a49fe3 Mon Sep 17 00:00:00 2001 From: Lijo Lazar Date: Fri, 15 Nov 2024 11:08:02 +0530 Subject: [PATCH 01/16] drm/amdgpu: Add init level for post reset reinit When device needs to be reset before initialization, it's not required for all IPs to be initialized before a reset. In such cases, it needs to identify whether the IP/feature is initialized for the first time or whether it's reinitialized after a reset. Add RESET_RECOVERY init level to identify post reset reinitialization phase. This only provides a device level identification, IP/features may choose to track their state independently also. Signed-off-by: Lijo Lazar Acked-by: Tao Zhou Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/aldebaran.c | 4 ++++ drivers/gpu/drm/amd/amdgpu/amdgpu.h | 1 + drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 25 ++++++++++++++++++--- drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c | 5 +++++ drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h | 2 ++ drivers/gpu/drm/amd/amdgpu/sienna_cichlid.c | 2 ++ drivers/gpu/drm/amd/amdgpu/smu_v13_0_10.c | 2 ++ 7 files changed, 38 insertions(+), 3 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/aldebaran.c b/drivers/gpu/drm/amd/amdgpu/aldebaran.c index 3a588fecb0c5..f44de9d4b6a1 100644 --- a/drivers/gpu/drm/amd/amdgpu/aldebaran.c +++ b/drivers/gpu/drm/amd/amdgpu/aldebaran.c @@ -330,6 +330,8 @@ aldebaran_mode2_restore_hwcontext(struct amdgpu_reset_control *reset_ctl, } list_for_each_entry(tmp_adev, reset_device_list, reset_list) { + amdgpu_set_init_level(tmp_adev, + AMDGPU_INIT_LEVEL_RESET_RECOVERY); dev_info(tmp_adev->dev, "GPU reset succeeded, trying to resume\n"); r = aldebaran_mode2_restore_ip(tmp_adev); @@ -375,6 +377,8 @@ aldebaran_mode2_restore_hwcontext(struct amdgpu_reset_control *reset_ctl, tmp_adev); if (!r) { + amdgpu_set_init_level(tmp_adev, + AMDGPU_INIT_LEVEL_DEFAULT); amdgpu_irq_gpu_reset_resume_helper(tmp_adev); r = amdgpu_ib_ring_tests(tmp_adev); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h index d8bc6da50016..4653a8d2823a 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h @@ -839,6 +839,7 @@ struct amdgpu_mqd { enum amdgpu_init_lvl_id { AMDGPU_INIT_LEVEL_DEFAULT, AMDGPU_INIT_LEVEL_MINIMAL_XGMI, + AMDGPU_INIT_LEVEL_RESET_RECOVERY, }; struct amdgpu_init_level { diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index 0171d240fcb0..5ef95161e632 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -156,6 +156,11 @@ struct amdgpu_init_level amdgpu_init_default = { .hwini_ip_block_mask = AMDGPU_IP_BLK_MASK_ALL, }; +struct amdgpu_init_level amdgpu_init_recovery = { + .level = AMDGPU_INIT_LEVEL_RESET_RECOVERY, + .hwini_ip_block_mask = AMDGPU_IP_BLK_MASK_ALL, +}; + /* * Minimal blocks needed to be initialized before a XGMI hive can be reset. 
This * is used for cases like reset on initialization where the entire hive needs to @@ -182,6 +187,9 @@ void amdgpu_set_init_level(struct amdgpu_device *adev, case AMDGPU_INIT_LEVEL_MINIMAL_XGMI: adev->init_lvl = &amdgpu_init_minimal_xgmi; break; + case AMDGPU_INIT_LEVEL_RESET_RECOVERY: + adev->init_lvl = &amdgpu_init_recovery; + break; case AMDGPU_INIT_LEVEL_DEFAULT: fallthrough; default: @@ -5419,7 +5427,7 @@ int amdgpu_device_reinit_after_reset(struct amdgpu_reset_context *reset_context) struct list_head *device_list_handle; bool full_reset, vram_lost = false; struct amdgpu_device *tmp_adev; - int r; + int r, init_level; device_list_handle = reset_context->reset_device_list; @@ -5428,10 +5436,18 @@ int amdgpu_device_reinit_after_reset(struct amdgpu_reset_context *reset_context) full_reset = test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); + /** + * If it's reset on init, it's default init level, otherwise keep level + * as recovery level. + */ + if (reset_context->method == AMD_RESET_METHOD_ON_INIT) + init_level = AMDGPU_INIT_LEVEL_DEFAULT; + else + init_level = AMDGPU_INIT_LEVEL_RESET_RECOVERY; + r = 0; list_for_each_entry(tmp_adev, device_list_handle, reset_list) { - /* After reset, it's default init level */ - amdgpu_set_init_level(tmp_adev, AMDGPU_INIT_LEVEL_DEFAULT); + amdgpu_set_init_level(tmp_adev, init_level); if (full_reset) { /* post card */ amdgpu_ras_set_fed(tmp_adev, false); @@ -5518,6 +5534,9 @@ int amdgpu_device_reinit_after_reset(struct amdgpu_reset_context *reset_context) out: if (!r) { + /* IP init is complete now, set level as default */ + amdgpu_set_init_level(tmp_adev, + AMDGPU_INIT_LEVEL_DEFAULT); amdgpu_irq_gpu_reset_resume_helper(tmp_adev); r = amdgpu_ib_ring_tests(tmp_adev); if (r) { diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c index 24dae7cdbe95..a0acb65f4b40 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c @@ -342,3 +342,8 @@ void amdgpu_reset_get_desc(struct amdgpu_reset_context *rst_ctxt, char *buf, strscpy(buf, "unknown", len); } } + +bool amdgpu_reset_in_recovery(struct amdgpu_device *adev) +{ + return (adev->init_lvl->level == AMDGPU_INIT_LEVEL_RESET_RECOVERY); +} diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h index f8628bc898df..4d9b9701139b 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h @@ -158,4 +158,6 @@ extern struct amdgpu_reset_handler xgmi_reset_on_init_handler; int amdgpu_reset_do_xgmi_reset_on_init( struct amdgpu_reset_context *reset_context); +bool amdgpu_reset_in_recovery(struct amdgpu_device *adev); + #endif diff --git a/drivers/gpu/drm/amd/amdgpu/sienna_cichlid.c b/drivers/gpu/drm/amd/amdgpu/sienna_cichlid.c index 9b01e074af47..2594467bdd87 100644 --- a/drivers/gpu/drm/amd/amdgpu/sienna_cichlid.c +++ b/drivers/gpu/drm/amd/amdgpu/sienna_cichlid.c @@ -220,6 +220,7 @@ sienna_cichlid_mode2_restore_hwcontext(struct amdgpu_reset_control *reset_ctl, int r; struct amdgpu_device *tmp_adev = (struct amdgpu_device *)reset_ctl->handle; + amdgpu_set_init_level(tmp_adev, AMDGPU_INIT_LEVEL_RESET_RECOVERY); dev_info(tmp_adev->dev, "GPU reset succeeded, trying to resume\n"); r = sienna_cichlid_mode2_restore_ip(tmp_adev); @@ -237,6 +238,7 @@ sienna_cichlid_mode2_restore_hwcontext(struct amdgpu_reset_control *reset_ctl, amdgpu_irq_gpu_reset_resume_helper(tmp_adev); + amdgpu_set_init_level(tmp_adev, AMDGPU_INIT_LEVEL_DEFAULT); r = 
amdgpu_ib_ring_tests(tmp_adev); if (r) { dev_err(tmp_adev->dev, diff --git a/drivers/gpu/drm/amd/amdgpu/smu_v13_0_10.c b/drivers/gpu/drm/amd/amdgpu/smu_v13_0_10.c index e70ebad3f9fa..70569ea906bc 100644 --- a/drivers/gpu/drm/amd/amdgpu/smu_v13_0_10.c +++ b/drivers/gpu/drm/amd/amdgpu/smu_v13_0_10.c @@ -221,6 +221,7 @@ smu_v13_0_10_mode2_restore_hwcontext(struct amdgpu_reset_control *reset_ctl, int r; struct amdgpu_device *tmp_adev = (struct amdgpu_device *)reset_ctl->handle; + amdgpu_set_init_level(tmp_adev, AMDGPU_INIT_LEVEL_RESET_RECOVERY); dev_info(tmp_adev->dev, "GPU reset succeeded, trying to resume\n"); r = smu_v13_0_10_mode2_restore_ip(tmp_adev); @@ -234,6 +235,7 @@ smu_v13_0_10_mode2_restore_hwcontext(struct amdgpu_reset_control *reset_ctl, amdgpu_irq_gpu_reset_resume_helper(tmp_adev); + amdgpu_set_init_level(tmp_adev, AMDGPU_INIT_LEVEL_DEFAULT); r = amdgpu_ib_ring_tests(tmp_adev); if (r) { dev_err(tmp_adev->dev, -- 2.50.1 From e283f4fb0862647f4bb02e78d728bc8fb9eef18d Mon Sep 17 00:00:00 2001 From: Lijo Lazar Date: Fri, 15 Nov 2024 11:35:50 +0530 Subject: [PATCH 02/16] drm/amdgpu: Use reset recovery state checks Some in_reset checks are infact checking whether the state is reinitialization after reset. Replace with reset_in_recovery calls to identify that it's really checking for recovery stage after reset. Signed-off-by: Lijo Lazar Acked-by: Tao Zhou Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 2 +- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 10 +++++----- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index 5ef95161e632..714d2e586eac 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -3258,7 +3258,7 @@ static int amdgpu_device_ip_late_init(struct amdgpu_device *adev) return r; } - if (!amdgpu_in_reset(adev)) + if (!amdgpu_reset_in_recovery(adev)) amdgpu_ras_set_error_query_ready(adev, true); amdgpu_device_set_cg_state(adev, AMD_CG_STATE_GATE); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index 1bc95b0cdbb8..4c9fa24dd972 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -1298,7 +1298,7 @@ int amdgpu_ras_bind_aca(struct amdgpu_device *adev, enum amdgpu_ras_block blk, struct ras_manager *obj; /* in resume phase, no need to create aca fs node */ - if (adev->in_suspend || amdgpu_in_reset(adev)) + if (adev->in_suspend || amdgpu_reset_in_recovery(adev)) return 0; obj = get_ras_manager(adev, blk); @@ -3610,7 +3610,7 @@ static void amdgpu_ras_event_mgr_init(struct amdgpu_device *adev) ras->event_mgr = hive ? 
&hive->event_mgr : &ras->__event_mgr; /* init event manager with node 0 on xgmi system */ - if (!amdgpu_in_reset(adev)) { + if (!amdgpu_reset_in_recovery(adev)) { if (!hive || adev->gmc.xgmi.node_id == 0) ras_event_mgr_init(ras->event_mgr); } @@ -3825,7 +3825,7 @@ int amdgpu_ras_block_late_init(struct amdgpu_device *adev, r = amdgpu_ras_feature_enable_on_boot(adev, ras_block, 1); if (r) { - if (adev->in_suspend || amdgpu_in_reset(adev)) { + if (adev->in_suspend || amdgpu_reset_in_recovery(adev)) { /* in resume phase, if fail to enable ras, * clean up all ras fs nodes, and disable ras */ goto cleanup; @@ -3837,7 +3837,7 @@ int amdgpu_ras_block_late_init(struct amdgpu_device *adev, amdgpu_persistent_edc_harvesting(adev, ras_block); /* in resume phase, no need to create ras fs node */ - if (adev->in_suspend || amdgpu_in_reset(adev)) + if (adev->in_suspend || amdgpu_reset_in_recovery(adev)) return 0; ras_obj = container_of(ras_block, struct amdgpu_ras_block_object, ras_comm); @@ -3967,7 +3967,7 @@ int amdgpu_ras_late_init(struct amdgpu_device *adev) amdgpu_ras_event_mgr_init(adev); if (amdgpu_ras_aca_is_supported(adev)) { - if (amdgpu_in_reset(adev)) { + if (amdgpu_reset_in_recovery(adev)) { if (amdgpu_aca_is_enabled(adev)) r = amdgpu_aca_reset(adev); else -- 2.50.1 From c3e3c1aac0bf25e0f3f9b1557766fc9b89fb318b Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Fri, 15 Nov 2024 18:26:06 +0100 Subject: [PATCH 03/16] drm/radeon: Constify struct pci_device_id MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit 'struct pci_device_id' is not modified in this driver. Constifying this structure moves some data to a read-only section, so increase overall security. On a x86_64, with allmodconfig: Before: ====== text data bss dec hex filename 11984 28672 44 40700 9efc drivers/gpu/drm/radeon/radeon_drv.o After: ===== text data bss dec hex filename 40000 664 44 40708 9f04 drivers/gpu/drm/radeon/radeon_drv.o Acked-by: Christian König Signed-off-by: Christophe JAILLET Signed-off-by: Alex Deucher --- drivers/gpu/drm/radeon/radeon_drv.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/gpu/drm/radeon/radeon_drv.c b/drivers/gpu/drm/radeon/radeon_drv.c index 23d6d1a2586d..5e958cc223f4 100644 --- a/drivers/gpu/drm/radeon/radeon_drv.c +++ b/drivers/gpu/drm/radeon/radeon_drv.c @@ -248,10 +248,9 @@ int radeon_cik_support = 1; MODULE_PARM_DESC(cik_support, "CIK support (1 = enabled (default), 0 = disabled)"); module_param_named(cik_support, radeon_cik_support, int, 0444); -static struct pci_device_id pciidlist[] = { +static const struct pci_device_id pciidlist[] = { radeon_PCI_IDS }; - MODULE_DEVICE_TABLE(pci, pciidlist); static const struct drm_driver kms_driver; -- 2.50.1 From 7037bb04265ef05c6ffad56d884b0df76f57b095 Mon Sep 17 00:00:00 2001 From: Steven 'Steve' Kendall Date: Fri, 15 Nov 2024 21:17:58 +0000 Subject: [PATCH 04/16] drm/radeon: Fix spurious unplug event on radeon HDMI On several HP models (tested on HP 3125 and HP Probook 455 G2), spurious unplug events are emitted upon login on Chrome OS. This is likely due to the way Chrome OS restarts graphics upon login, so it's possible it's an issue on other distributions but not as common, though I haven't reproduced the issue elsewhere. Use logic from an earlier version of the merged change (see link below) which iterates over connectors and finds matching encoders, rather than the other way around. Also fixes an issue with screen mirroring on Chrome OS. 
I've deployed this patch on Fedora and did not observe any regression on these devices. Link: https://gitlab.freedesktop.org/drm/amd/-/issues/1569#note_1603002 Link: https://gitlab.freedesktop.org/drm/amd/-/issues/3771 Fixes: 20ea34710f7b ("drm/radeon: Add HD-audio component notifier support (v6)") Signed-off-by: Steven 'Steve' Kendall Signed-off-by: Alex Deucher --- drivers/gpu/drm/radeon/radeon_audio.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/drivers/gpu/drm/radeon/radeon_audio.c b/drivers/gpu/drm/radeon/radeon_audio.c index 47aa06a9a942..5b69cc8011b4 100644 --- a/drivers/gpu/drm/radeon/radeon_audio.c +++ b/drivers/gpu/drm/radeon/radeon_audio.c @@ -760,16 +760,20 @@ static int radeon_audio_component_get_eld(struct device *kdev, int port, if (!rdev->audio.enabled || !rdev->mode_info.mode_config_initialized) return 0; - list_for_each_entry(encoder, &rdev_to_drm(rdev)->mode_config.encoder_list, head) { + list_for_each_entry(connector, &dev->mode_config.connector_list, head) { + const struct drm_connector_helper_funcs *connector_funcs = + connector->helper_private; + encoder = connector_funcs->best_encoder(connector); + + if (!encoder) + continue; + if (!radeon_encoder_is_digital(encoder)) continue; radeon_encoder = to_radeon_encoder(encoder); dig = radeon_encoder->enc_priv; if (!dig->pin || dig->pin->id != port) continue; - connector = radeon_get_connector_for_encoder(encoder); - if (!connector) - continue; *enabled = true; ret = drm_eld_size(connector->eld); memcpy(buf, connector->eld, min(max_bytes, ret)); -- 2.50.1 From 6a057072ddd127255350357dd880903e8fa23f36 Mon Sep 17 00:00:00 2001 From: Zicheng Qu Date: Tue, 5 Nov 2024 14:01:36 +0000 Subject: [PATCH 05/16] drm/amd/display: Fix null check for pipe_ctx->plane_state in dcn20_program_pipe This commit addresses a null pointer dereference issue in dcn20_program_pipe(). Previously, commit 8e4ed3cf1642 ("drm/amd/display: Add null check for pipe_ctx->plane_state in dcn20_program_pipe") partially fixed the null pointer dereference issue. However, in dcn20_update_dchubp_dpp(), the variable pipe_ctx is passed in, and plane_state is accessed again through pipe_ctx. Multiple if statements directly call attributes of plane_state, leading to potential null pointer dereference issues. This patch adds necessary null checks to ensure stability. 
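As an illustration of why the caller-side check matters, here is a simplified, hypothetical sketch of the access pattern inside dcn20_update_dchubp_dpp() (not the exact code; the member uses are only examples): once entered, the helper dereferences plane_state without further checks, so the caller must not enter it with a NULL plane_state.

	static void dcn20_update_dchubp_dpp(struct dc *dc,
					    struct pipe_ctx *pipe_ctx,
					    struct dc_state *context)
	{
		struct dc_plane_state *plane_state = pipe_ctx->plane_state;

		/* plane_state members (update_flags, format, rotation, ...)
		 * are read here without a NULL check, so the caller must
		 * guarantee plane_state is valid before calling. */
		if (plane_state->update_flags.bits.full_update)
			dc->hwss.update_plane_addr(dc, pipe_ctx);
	}

Hence dcn20_program_pipe() below only calls the helper when pipe_ctx->plane_state is non-NULL.
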
Fixes: 8e4ed3cf1642 ("drm/amd/display: Add null check for pipe_ctx->plane_state in dcn20_program_pipe") Reviewed-by: Tom Chung Signed-off-by: Zicheng Qu Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/display/dc/hwss/dcn20/dcn20_hwseq.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/gpu/drm/amd/display/dc/hwss/dcn20/dcn20_hwseq.c b/drivers/gpu/drm/amd/display/dc/hwss/dcn20/dcn20_hwseq.c index 05424a9af58b..b029ec1b26d3 100644 --- a/drivers/gpu/drm/amd/display/dc/hwss/dcn20/dcn20_hwseq.c +++ b/drivers/gpu/drm/amd/display/dc/hwss/dcn20/dcn20_hwseq.c @@ -1925,9 +1925,9 @@ static void dcn20_program_pipe( dc->res_pool->hubbub, pipe_ctx->plane_res.hubp->inst, pipe_ctx->hubp_regs.det_size); } - if (pipe_ctx->update_flags.raw || - (pipe_ctx->plane_state && pipe_ctx->plane_state->update_flags.raw) || - pipe_ctx->stream->update_flags.raw) + if (pipe_ctx->plane_state && (pipe_ctx->update_flags.raw || + pipe_ctx->plane_state->update_flags.raw || + pipe_ctx->stream->update_flags.raw)) dcn20_update_dchubp_dpp(dc, pipe_ctx, context); if (pipe_ctx->plane_state && (pipe_ctx->update_flags.bits.enable || -- 2.50.1 From 2bc96c95070571c6c824e0d4c7783bee25a37876 Mon Sep 17 00:00:00 2001 From: Zicheng Qu Date: Tue, 5 Nov 2024 14:01:37 +0000 Subject: [PATCH 06/16] drm/amd/display: Fix null check for pipe_ctx->plane_state in hwss_setup_dpp This commit addresses a null pointer dereference issue in hwss_setup_dpp(). The issue could occur when pipe_ctx->plane_state is null. The fix adds a check to ensure `pipe_ctx->plane_state` is not null before accessing. This prevents a null pointer dereference. Fixes: 0baae6246307 ("drm/amd/display: Refactor fast update to use new HWSS build sequence") Reviewed-by: Tom Chung Signed-off-by: Zicheng Qu Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/display/dc/core/dc_hw_sequencer.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/gpu/drm/amd/display/dc/core/dc_hw_sequencer.c b/drivers/gpu/drm/amd/display/dc/core/dc_hw_sequencer.c index 0419ee7f22a5..252af83e34a5 100644 --- a/drivers/gpu/drm/amd/display/dc/core/dc_hw_sequencer.c +++ b/drivers/gpu/drm/amd/display/dc/core/dc_hw_sequencer.c @@ -898,6 +898,9 @@ void hwss_setup_dpp(union block_sequence_params *params) struct dpp *dpp = pipe_ctx->plane_res.dpp; struct dc_plane_state *plane_state = pipe_ctx->plane_state; + if (!plane_state) + return; + if (dpp && dpp->funcs->dpp_setup) { // program the input csc dpp->funcs->dpp_setup(dpp, -- 2.50.1 From 4217ef9ab763dbf8af2b0ecd3f74c0caa135668c Mon Sep 17 00:00:00 2001 From: Huacai Chen Date: Fri, 15 Nov 2024 23:02:25 +0800 Subject: [PATCH 07/16] drm/amd/display: Allow building DC with clang on LoongArch Clang on LoongArch (18+) appears to be unaffected by the bug causing excessive stack usage in calculate_bandwidth(). But when building DC_FP support the stack frame size can be as large as 2816 bytes, which causes the FRAME_WARN build warnings. So on LoongArch we allow building DC with clang, but disable DC_FP by default. The help message is also updated. 
Tested-by: Rui Wang Signed-off-by: Huacai Chen Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/display/Kconfig | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/drivers/gpu/drm/amd/display/Kconfig b/drivers/gpu/drm/amd/display/Kconfig index df17e79c45c7..11e3f2f3b174 100644 --- a/drivers/gpu/drm/amd/display/Kconfig +++ b/drivers/gpu/drm/amd/display/Kconfig @@ -7,20 +7,21 @@ menu "Display Engine Configuration" config DRM_AMD_DC bool "AMD DC - Enable new display engine" default y - depends on BROKEN || !CC_IS_CLANG || ARM64 || RISCV || SPARC64 || X86_64 + depends on BROKEN || !CC_IS_CLANG || ARM64 || LOONGARCH || RISCV || SPARC64 || X86_64 select SND_HDA_COMPONENT if SND_HDA_CORE # !CC_IS_CLANG: https://github.com/ClangBuiltLinux/linux/issues/1752 - select DRM_AMD_DC_FP if ARCH_HAS_KERNEL_FPU_SUPPORT && !(CC_IS_CLANG && (ARM64 || RISCV)) + select DRM_AMD_DC_FP if ARCH_HAS_KERNEL_FPU_SUPPORT && !(CC_IS_CLANG && (ARM64 || LOONGARCH || RISCV)) help Choose this option if you want to use the new display engine support for AMDGPU. This adds required support for Vega and Raven ASICs. - calculate_bandwidth() is presently broken on all !(X86_64 || SPARC64 || ARM64) - architectures built with Clang (all released versions), whereby the stack - frame gets blown up to well over 5k. This would cause an immediate kernel - panic on most architectures. We'll revert this when the following bug report - has been resolved: https://github.com/llvm/llvm-project/issues/41896. + calculate_bandwidth() is presently broken on all !(X86_64 || SPARC64 || + ARM64 || LOONGARCH || RISCV) architectures built with Clang (all released + versions), whereby the stack frame gets blown up to well over 5k. This + would cause an immediate kernel panic on most architectures. We'll revert + this when the following bug report has been resolved: + https://github.com/llvm/llvm-project/issues/41896. config DRM_AMD_DC_FP def_bool n -- 2.50.1 From cdc6705f98ea3f854a60ba8c9b19228e197ae384 Mon Sep 17 00:00:00 2001 From: Lijo Lazar Date: Mon, 11 Nov 2024 20:11:38 +0530 Subject: [PATCH 08/16] drm/amdkfd: Use the correct wptr size Write pointer could be 32-bit or 64-bit. Use the correct size during initialization. Signed-off-by: Lijo Lazar Acked-by: Alex Deucher Signed-off-by: Alex Deucher Cc: stable@vger.kernel.org --- drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c index 55d18aed257b..2b0a830f5b29 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c @@ -125,7 +125,7 @@ static bool kq_initialize(struct kernel_queue *kq, struct kfd_node *dev, memset(kq->pq_kernel_addr, 0, queue_size); memset(kq->rptr_kernel, 0, sizeof(*kq->rptr_kernel)); - memset(kq->wptr_kernel, 0, sizeof(*kq->wptr_kernel)); + memset(kq->wptr_kernel, 0, dev->kfd->device_info.doorbell_size); prop.queue_size = queue_size; prop.is_interop = false; -- 2.50.1 From b0df0e777874549c128b43f7bf4989a2ed24b37a Mon Sep 17 00:00:00 2001 From: Kenneth Feng Date: Tue, 19 Nov 2024 14:26:58 +0800 Subject: [PATCH 09/16] drm/amd/pm: disable pcie speed switching on Intel platform for smu v14.0.2/3 disable pcie speed switching on Intel platform for smu v14.0.2/3 based on Intel's requirement. v2: align the setting with smu v13. 
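For reference, each link level is reported to the SMU as a packed argument (this packing already exists in the function and appears at the end of the hunk below): the level index goes in bits 16 and up, the gen value in bits 8-15 and the lane value in bits 0-7. As a worked example with made-up values, level 1 with gen value 3 and lane value 6 would be sent as (1 << 16) | (3 << 8) | 6 = 0x10306. With PCIe DPM masked off, every level is first forced to the same gen/lane values, so these messages no longer ask the SMU to switch link speed at runtime.
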
Signed-off-by: Kenneth Feng Reviewed-by: Lijo Lazar Signed-off-by: Alex Deucher Cc: stable@vger.kernel.org # 6.11.x --- .../drm/amd/pm/swsmu/smu14/smu_v14_0_2_ppt.c | 26 ++++++++++++++++--- 1 file changed, 23 insertions(+), 3 deletions(-) diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu14/smu_v14_0_2_ppt.c b/drivers/gpu/drm/amd/pm/swsmu/smu14/smu_v14_0_2_ppt.c index 5e2629219280..79609da2dd80 100644 --- a/drivers/gpu/drm/amd/pm/swsmu/smu14/smu_v14_0_2_ppt.c +++ b/drivers/gpu/drm/amd/pm/swsmu/smu14/smu_v14_0_2_ppt.c @@ -1465,15 +1465,35 @@ static int smu_v14_0_2_update_pcie_parameters(struct smu_context *smu, struct smu_14_0_dpm_context *dpm_context = smu->smu_dpm.dpm_context; struct smu_14_0_pcie_table *pcie_table = &dpm_context->dpm_tables.pcie_table; + int num_of_levels = pcie_table->num_of_link_levels; uint32_t smu_pcie_arg; int ret, i; - for (i = 0; i < pcie_table->num_of_link_levels; i++) { - if (pcie_table->pcie_gen[i] > pcie_gen_cap) + if (!num_of_levels) + return 0; + + if (!(smu->adev->pm.pp_feature & PP_PCIE_DPM_MASK)) { + if (pcie_table->pcie_gen[num_of_levels - 1] < pcie_gen_cap) + pcie_gen_cap = pcie_table->pcie_gen[num_of_levels - 1]; + + if (pcie_table->pcie_lane[num_of_levels - 1] < pcie_width_cap) + pcie_width_cap = pcie_table->pcie_lane[num_of_levels - 1]; + + /* Force all levels to use the same settings */ + for (i = 0; i < num_of_levels; i++) { pcie_table->pcie_gen[i] = pcie_gen_cap; - if (pcie_table->pcie_lane[i] > pcie_width_cap) pcie_table->pcie_lane[i] = pcie_width_cap; + } + } else { + for (i = 0; i < num_of_levels; i++) { + if (pcie_table->pcie_gen[i] > pcie_gen_cap) + pcie_table->pcie_gen[i] = pcie_gen_cap; + if (pcie_table->pcie_lane[i] > pcie_width_cap) + pcie_table->pcie_lane[i] = pcie_width_cap; + } + } + for (i = 0; i < num_of_levels; i++) { smu_pcie_arg = i << 16; smu_pcie_arg |= pcie_table->pcie_gen[i] << 8; smu_pcie_arg |= pcie_table->pcie_lane[i]; -- 2.50.1 From 76c7f08094767b5df3b60e18d1bdecddd4a5c844 Mon Sep 17 00:00:00 2001 From: Kenneth Feng Date: Tue, 19 Nov 2024 15:03:22 +0800 Subject: [PATCH 10/16] drm/amd/pm: skip setting the power source on smu v14.0.2/3 skip setting power source on smu v14.0.2/3 Signed-off-by: Kenneth Feng Reviewed-by: Lijo Lazar Signed-off-by: Alex Deucher Cc: stable@vger.kernel.org # 6.11.x --- drivers/gpu/drm/amd/pm/swsmu/smu14/smu_v14_0_2_ppt.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu14/smu_v14_0_2_ppt.c b/drivers/gpu/drm/amd/pm/swsmu/smu14/smu_v14_0_2_ppt.c index 79609da2dd80..687a0f5ac94f 100644 --- a/drivers/gpu/drm/amd/pm/swsmu/smu14/smu_v14_0_2_ppt.c +++ b/drivers/gpu/drm/amd/pm/swsmu/smu14/smu_v14_0_2_ppt.c @@ -2775,7 +2775,6 @@ static const struct pptable_funcs smu_v14_0_2_ppt_funcs = { .get_unique_id = smu_v14_0_2_get_unique_id, .get_power_limit = smu_v14_0_2_get_power_limit, .set_power_limit = smu_v14_0_2_set_power_limit, - .set_power_source = smu_v14_0_set_power_source, .get_power_profile_mode = smu_v14_0_2_get_power_profile_mode, .set_power_profile_mode = smu_v14_0_2_set_power_profile_mode, .run_btc = smu_v14_0_run_btc, -- 2.50.1 From da868898cf4c5ddbd1f7406e356edce5d7211eb5 Mon Sep 17 00:00:00 2001 From: Lijo Lazar Date: Wed, 20 Nov 2024 08:34:39 +0530 Subject: [PATCH 11/16] drm/amd/pm: Remove arcturus min power limit As per power team, there is no need to impose a lower bound on arcturus power limit. Any unreasonable limit set will result in frequent throttling. 
Signed-off-by: Lijo Lazar Reviewed-by: Kenneth Feng Signed-off-by: Alex Deucher Cc: stable@vger.kernel.org --- drivers/gpu/drm/amd/pm/swsmu/smu11/arcturus_ppt.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu11/arcturus_ppt.c b/drivers/gpu/drm/amd/pm/swsmu/smu11/arcturus_ppt.c index 4b36c230e43a..12125303bb79 100644 --- a/drivers/gpu/drm/amd/pm/swsmu/smu11/arcturus_ppt.c +++ b/drivers/gpu/drm/amd/pm/swsmu/smu11/arcturus_ppt.c @@ -1344,8 +1344,12 @@ static int arcturus_get_power_limit(struct smu_context *smu, *default_power_limit = power_limit; if (max_power_limit) *max_power_limit = power_limit; + /** + * No lower bound is imposed on the limit. Any unreasonable limit set + * will result in frequent throttling. + */ if (min_power_limit) - *min_power_limit = power_limit; + *min_power_limit = 0; return 0; } -- 2.50.1 From 4c28e645aa3e4d697a02fc291b363702b8a6c921 Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Tue, 19 Nov 2024 14:19:07 -0500 Subject: [PATCH 12/16] drm/amdgpu/gmc7: fix wait_for_idle callers MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit The wait_for_idle signature was changed, but the callers were not. Reviewed-by: Sunil Khatri Reported-by: Michel Dänzer Fixes: 82ae6619a450 ("drm/amdgpu: update the handle ptr in wait_for_idle") Signed-off-by: Alex Deucher Cc: Sunil Khatri --- drivers/gpu/drm/amd/amdgpu/gmc_v7_0.c | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v7_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v7_0.c index 07f45f1a503a..b6016f11956e 100644 --- a/drivers/gpu/drm/amd/amdgpu/gmc_v7_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v7_0.c @@ -87,9 +87,14 @@ static void gmc_v7_0_init_golden_registers(struct amdgpu_device *adev) static void gmc_v7_0_mc_stop(struct amdgpu_device *adev) { + struct amdgpu_ip_block *ip_block; u32 blackout; - gmc_v7_0_wait_for_idle((void *)adev); + ip_block = amdgpu_device_ip_get_ip_block(adev, AMD_IP_BLOCK_TYPE_GMC); + if (!ip_block) + return; + + gmc_v7_0_wait_for_idle(ip_block); blackout = RREG32(mmMC_SHARED_BLACKOUT_CNTL); if (REG_GET_FIELD(blackout, MC_SHARED_BLACKOUT_CNTL, BLACKOUT_MODE) != 1) { @@ -251,9 +256,14 @@ static void gmc_v7_0_vram_gtt_location(struct amdgpu_device *adev, */ static void gmc_v7_0_mc_program(struct amdgpu_device *adev) { + struct amdgpu_ip_block *ip_block; u32 tmp; int i, j; + ip_block = amdgpu_device_ip_get_ip_block(adev, AMD_IP_BLOCK_TYPE_GMC); + if (!ip_block) + return; + /* Initialize HDP */ for (i = 0, j = 0; i < 32; i++, j += 0x6) { WREG32((0xb05 + j), 0x00000000); @@ -264,7 +274,7 @@ static void gmc_v7_0_mc_program(struct amdgpu_device *adev) } WREG32(mmHDP_REG_COHERENCY_FLUSH_CNTL, 0); - if (gmc_v7_0_wait_for_idle((void *)adev)) + if (gmc_v7_0_wait_for_idle(ip_block)) dev_warn(adev->dev, "Wait for MC idle timedout !\n"); if (adev->mode_info.num_crtc) { @@ -288,7 +298,7 @@ static void gmc_v7_0_mc_program(struct amdgpu_device *adev) WREG32(mmMC_VM_AGP_BASE, 0); WREG32(mmMC_VM_AGP_TOP, adev->gmc.agp_end >> 22); WREG32(mmMC_VM_AGP_BOT, adev->gmc.agp_start >> 22); - if (gmc_v7_0_wait_for_idle((void *)adev)) + if (gmc_v7_0_wait_for_idle(ip_block)) dev_warn(adev->dev, "Wait for MC idle timedout !\n"); WREG32(mmBIF_FB_EN, BIF_FB_EN__FB_READ_EN_MASK | BIF_FB_EN__FB_WRITE_EN_MASK); @@ -1183,7 +1193,7 @@ static int gmc_v7_0_soft_reset(struct amdgpu_ip_block *ip_block) if (srbm_soft_reset) { gmc_v7_0_mc_stop(adev); - if (gmc_v7_0_wait_for_idle((void *)adev)) + if 
(gmc_v7_0_wait_for_idle(ip_block)) dev_warn(adev->dev, "Wait for GMC idle timed out !\n"); tmp = RREG32(mmSRBM_SOFT_RESET); -- 2.50.1 From fb9898243a7b8133c969c9bbd5d5470f7c2e1374 Mon Sep 17 00:00:00 2001 From: "Jesse.zhang@amd.com" Date: Thu, 7 Nov 2024 22:53:47 +0800 Subject: [PATCH 13/16] drm/amdgpu: Add sysfs interface for vcn reset mask MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit Add the sysfs interface for vcn: vcn_reset_mask The interface is read-only and show the resets supported by the IP. For example, full adapter reset (mode1/mode2/BACO/etc), soft reset, queue reset, and pipe reset. V2: the sysfs node returns a text string instead of some flags (Christian) V2: the sysfs node returns a text string instead of some flags (Christian) v3: add a generic helper which takes the ring as parameter and print the strings in the order they are applied (Christian) check amdgpu_gpu_recovery before creating sysfs file itself, and initialize supported_reset_types in IP version files (Lijo) v4: s/sdma/vcn/ in the reset mask setup Acked-by: Christian König Signed-off-by: Jesse Zhang Suggested-by: Alex Deucher Reviewed-by: Tim Huang Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c | 35 +++++++++++++++++++++++++ drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h | 4 +++ drivers/gpu/drm/amd/amdgpu/vcn_v4_0.c | 9 +++++++ drivers/gpu/drm/amd/amdgpu/vcn_v4_0_3.c | 9 +++++++ drivers/gpu/drm/amd/amdgpu/vcn_v5_0_0.c | 10 +++++++ 5 files changed, 67 insertions(+) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c index aecb78e0519f..5ced59bf0f00 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c @@ -1283,3 +1283,38 @@ int amdgpu_vcn_psp_update_sram(struct amdgpu_device *adev, int inst_idx, return psp_execute_ip_fw_load(&adev->psp, &ucode); } + +static ssize_t amdgpu_get_vcn_reset_mask(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct drm_device *ddev = dev_get_drvdata(dev); + struct amdgpu_device *adev = drm_to_adev(ddev); + + if (!adev) + return -ENODEV; + + return amdgpu_show_reset_mask(buf, adev->vcn.supported_reset); +} + +static DEVICE_ATTR(vcn_reset_mask, 0444, + amdgpu_get_vcn_reset_mask, NULL); + +int amdgpu_vcn_sysfs_reset_mask_init(struct amdgpu_device *adev) +{ + int r = 0; + + if (adev->vcn.num_vcn_inst) { + r = device_create_file(adev->dev, &dev_attr_vcn_reset_mask); + if (r) + return r; + } + + return r; +} + +void amdgpu_vcn_sysfs_reset_mask_fini(struct amdgpu_device *adev) +{ + if (adev->vcn.num_vcn_inst) + device_remove_file(adev->dev, &dev_attr_vcn_reset_mask); +} diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h index 765b809d48a2..1e32311c1dff 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h @@ -333,6 +333,8 @@ struct amdgpu_vcn { /* IP reg dump */ uint32_t *ip_dump; + + uint32_t supported_reset; }; struct amdgpu_fw_shared_rb_ptrs_struct { @@ -519,5 +521,7 @@ int amdgpu_vcn_ras_sw_init(struct amdgpu_device *adev); int amdgpu_vcn_psp_update_sram(struct amdgpu_device *adev, int inst_idx, enum AMDGPU_UCODE_ID ucode_id); int amdgpu_vcn_save_vcpu_bo(struct amdgpu_device *adev); +int amdgpu_vcn_sysfs_reset_mask_init(struct amdgpu_device *adev); +void amdgpu_vcn_sysfs_reset_mask_fini(struct amdgpu_device *adev); #endif diff --git a/drivers/gpu/drm/amd/amdgpu/vcn_v4_0.c b/drivers/gpu/drm/amd/amdgpu/vcn_v4_0.c index 
5512259cac79..fcc8511e91ee 100644 --- a/drivers/gpu/drm/amd/amdgpu/vcn_v4_0.c +++ b/drivers/gpu/drm/amd/amdgpu/vcn_v4_0.c @@ -225,6 +225,10 @@ static int vcn_v4_0_sw_init(struct amdgpu_ip_block *ip_block) vcn_v4_0_fw_shared_init(adev, i); } + /* TODO: Add queue reset mask when FW fully supports it */ + adev->vcn.supported_reset = + amdgpu_get_soft_full_reset_mask(&adev->vcn.inst[0].ring_enc[0]); + if (amdgpu_sriov_vf(adev)) { r = amdgpu_virt_alloc_mm_table(adev); if (r) @@ -247,6 +251,10 @@ static int vcn_v4_0_sw_init(struct amdgpu_ip_block *ip_block) adev->vcn.ip_dump = ptr; } + r = amdgpu_vcn_sysfs_reset_mask_init(adev); + if (r) + return r; + return 0; } @@ -284,6 +292,7 @@ static int vcn_v4_0_sw_fini(struct amdgpu_ip_block *ip_block) if (r) return r; + amdgpu_vcn_sysfs_reset_mask_fini(adev); r = amdgpu_vcn_sw_fini(adev); kfree(adev->vcn.ip_dump); diff --git a/drivers/gpu/drm/amd/amdgpu/vcn_v4_0_3.c b/drivers/gpu/drm/amd/amdgpu/vcn_v4_0_3.c index cf808a153fce..63e837f83646 100644 --- a/drivers/gpu/drm/amd/amdgpu/vcn_v4_0_3.c +++ b/drivers/gpu/drm/amd/amdgpu/vcn_v4_0_3.c @@ -187,6 +187,10 @@ static int vcn_v4_0_3_sw_init(struct amdgpu_ip_block *ip_block) amdgpu_vcn_fwlog_init(&adev->vcn.inst[i]); } + /* TODO: Add queue reset mask when FW fully supports it */ + adev->vcn.supported_reset = + amdgpu_get_soft_full_reset_mask(&adev->vcn.inst[0].ring_enc[0]); + if (amdgpu_sriov_vf(adev)) { r = amdgpu_virt_alloc_mm_table(adev); if (r) @@ -213,6 +217,10 @@ static int vcn_v4_0_3_sw_init(struct amdgpu_ip_block *ip_block) adev->vcn.ip_dump = ptr; } + r = amdgpu_vcn_sysfs_reset_mask_init(adev); + if (r) + return r; + return 0; } @@ -246,6 +254,7 @@ static int vcn_v4_0_3_sw_fini(struct amdgpu_ip_block *ip_block) if (r) return r; + amdgpu_vcn_sysfs_reset_mask_fini(adev); r = amdgpu_vcn_sw_fini(adev); kfree(adev->vcn.ip_dump); diff --git a/drivers/gpu/drm/amd/amdgpu/vcn_v5_0_0.c b/drivers/gpu/drm/amd/amdgpu/vcn_v5_0_0.c index fe2cc1a80c13..bd3d2bbdc16b 100644 --- a/drivers/gpu/drm/amd/amdgpu/vcn_v5_0_0.c +++ b/drivers/gpu/drm/amd/amdgpu/vcn_v5_0_0.c @@ -170,6 +170,10 @@ static int vcn_v5_0_0_sw_init(struct amdgpu_ip_block *ip_block) amdgpu_vcn_fwlog_init(&adev->vcn.inst[i]); } + /* TODO: Add queue reset mask when FW fully supports it */ + adev->vcn.supported_reset = + amdgpu_get_soft_full_reset_mask(&adev->vcn.inst[0].ring_enc[0]); + if (adev->pg_flags & AMD_PG_SUPPORT_VCN_DPG) adev->vcn.pause_dpg_mode = vcn_v5_0_0_pause_dpg_mode; @@ -181,6 +185,11 @@ static int vcn_v5_0_0_sw_init(struct amdgpu_ip_block *ip_block) } else { adev->vcn.ip_dump = ptr; } + + r = amdgpu_vcn_sysfs_reset_mask_init(adev); + if (r) + return r; + return 0; } @@ -215,6 +224,7 @@ static int vcn_v5_0_0_sw_fini(struct amdgpu_ip_block *ip_block) if (r) return r; + amdgpu_vcn_sysfs_reset_mask_fini(adev); r = amdgpu_vcn_sw_fini(adev); kfree(adev->vcn.ip_dump); -- 2.50.1 From 2f1b13521d2a64967530623dc0a3ecd8fd653722 Mon Sep 17 00:00:00 2001 From: "Jesse.zhang@amd.com" Date: Mon, 18 Nov 2024 12:06:30 +0800 Subject: [PATCH 14/16] drm/amdgpu: Fix sysfs warning when hotplugging Fix the similar warning when hotplugging: [ 155.585721] kernfs: can not remove 'enforce_isolation', no directory [ 155.592201] WARNING: CPU: 3 PID: 6960 at fs/kernfs/dir.c:1683 kernfs_remove_by_name_ns+0xb9/0xc0 [ 155.601145] Modules linked in: xt_MASQUERADE xt_comment nft_compat veth bridge stp llc overlay nft_fib_inet nft_fib_ipv4 nft_fib_ipv6 nft_fib nft_reject_inet nf_reject_ipv4 nf_reject_ipv6 nft_reject nft_ct nft_chain_nat nf_nat nf_conntrack nf_defrag_ipv6 
nf_defrag_ipv4 ip_set nf_tables nfnetlink qrtr intel_rapl_msr amd_atl intel_rapl_common amd64_edac edac_mce_amd amdgpu kvm_amd kvm ipmi_ssif amdxcp rapl drm_exec gpu_sched drm_buddy i2c_algo_bit drm_suballoc_helper drm_ttm_helper ttm pcspkr drm_display_helper acpi_cpufreq drm_kms_helper video wmi k10temp i2c_piix4 acpi_ipmi ipmi_si drm zram ip_tables loop squashfs dm_multipath crct10dif_pclmul crc32_pclmul crc32c_intel ghash_clmulni_intel sha512_ssse3 sha256_ssse3 sha1_ssse3 sp5100_tco ixgbe rfkill ccp dca sunrpc be2iscsi bnx2i cnic uio cxgb4i cxgb4 tls cxgb3i cxgb3 mdio libcxgbi libcxgb qla4xxx iscsi_boot_sysfs iscsi_tcp libiscsi_tcp libiscsi scsi_transport_iscsi ipmi_devintf ipmi_msghandler fuse [ 155.685224] systemd-journald[1354]: Compressed data object 957 -> 524 using ZSTD [ 155.685687] CPU: 3 PID: 6960 Comm: amd_pci_unplug Not tainted 6.10.0-1148853.1.zuul.164395107d6642bdb451071313e9378d #1 [ 155.704149] Hardware name: TYAN B8021G88V2HR-2T/S8021GM2NR-2T, BIOS V1.03.B10 04/01/2019 [ 155.712383] RIP: 0010:kernfs_remove_by_name_ns+0xb9/0xc0 [ 155.717805] Code: a0 00 48 89 ef e8 37 96 c7 ff 5b b8 fe ff ff ff 5d 41 5c 41 5d e9 f7 96 a0 00 0f 0b eb ab 48 c7 c7 48 ba 7e 8f e8 f7 66 bf ff <0f> 0b eb dc 0f 1f 00 90 90 90 90 90 90 90 90 90 90 90 90 90 90 90 [ 155.736766] RSP: 0018:ffffb1685d7a3e20 EFLAGS: 00010296 [ 155.742108] RAX: 0000000000000038 RBX: ffff929e94c80000 RCX: 0000000000000000 [ 155.749363] RDX: ffff928e1efaf200 RSI: ffff928e1efa18c0 RDI: ffff928e1efa18c0 [ 155.756612] RBP: 0000000000000008 R08: 0000000000000000 R09: 0000000000000003 [ 155.763855] R10: ffffb1685d7a3cd8 R11: ffffffff8fb3e1c8 R12: ffffffffc1ef5341 [ 155.771104] R13: ffff929e94cc5530 R14: 0000000000000000 R15: 0000000000000000 [ 155.778357] FS: 00007fd9dd8d9c40(0000) GS:ffff928e1ef80000(0000) knlGS:0000000000000000 [ 155.786594] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 155.792450] CR2: 0000561245ceee38 CR3: 0000000113018000 CR4: 00000000003506f0 [ 155.799702] Call Trace: [ 155.802254] [ 155.804460] ? __warn+0x80/0x120 [ 155.807798] ? kernfs_remove_by_name_ns+0xb9/0xc0 [ 155.812617] ? report_bug+0x164/0x190 [ 155.816393] ? handle_bug+0x3c/0x80 [ 155.819994] ? exc_invalid_op+0x17/0x70 [ 155.823939] ? asm_exc_invalid_op+0x1a/0x20 [ 155.828235] ? 
kernfs_remove_by_name_ns+0xb9/0xc0 [ 155.833058] amdgpu_gfx_sysfs_fini+0x59/0xd0 [amdgpu] [ 155.838637] gfx_v9_0_sw_fini+0x123/0x1c0 [amdgpu] [ 155.843887] amdgpu_device_fini_sw+0xbc/0x3e0 [amdgpu] [ 155.849432] amdgpu_driver_release_kms+0x16/0x30 [amdgpu] [ 155.855235] drm_dev_put.part.0+0x3c/0x60 [drm] [ 155.859914] drm_release+0x8b/0xc0 [drm] [ 155.863978] __fput+0xf1/0x2c0 [ 155.867141] __x64_sys_close+0x3c/0x80 [ 155.870998] do_syscall_64+0x64/0x170 V2: Add details in comments (Tim) Signed-off-by: Jesse Zhang Reported-by: Andy Dong Reviewed-by: Tim Huang Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c | 8 +++++--- drivers/gpu/drm/amd/amdgpu/amdgpu_jpeg.c | 6 ++++-- drivers/gpu/drm/amd/amdgpu/amdgpu_preempt_mgr.c | 3 ++- drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c | 6 ++++-- drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c | 6 ++++-- drivers/gpu/drm/amd/amdgpu/amdgpu_vpe.c | 6 ++++-- drivers/gpu/drm/amd/amdgpu/df_v3_6.c | 4 ++-- 7 files changed, 25 insertions(+), 14 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c index f57cc72c43cf..69a6b6dba0a5 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c @@ -1778,9 +1778,11 @@ int amdgpu_gfx_sysfs_init(struct amdgpu_device *adev) void amdgpu_gfx_sysfs_fini(struct amdgpu_device *adev) { - amdgpu_gfx_sysfs_xcp_fini(adev); - amdgpu_gfx_sysfs_isolation_shader_fini(adev); - amdgpu_gfx_sysfs_reset_mask_fini(adev); + if (adev->dev->kobj.sd) { + amdgpu_gfx_sysfs_xcp_fini(adev); + amdgpu_gfx_sysfs_isolation_shader_fini(adev); + amdgpu_gfx_sysfs_reset_mask_fini(adev); + } } int amdgpu_gfx_cleaner_shader_sw_init(struct amdgpu_device *adev, diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_jpeg.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_jpeg.c index 04eb51674596..b6d2eb049f54 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_jpeg.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_jpeg.c @@ -447,6 +447,8 @@ int amdgpu_jpeg_sysfs_reset_mask_init(struct amdgpu_device *adev) void amdgpu_jpeg_sysfs_reset_mask_fini(struct amdgpu_device *adev) { - if (adev->jpeg.num_jpeg_inst) - device_remove_file(adev->dev, &dev_attr_jpeg_reset_mask); + if (adev->dev->kobj.sd) { + if (adev->jpeg.num_jpeg_inst) + device_remove_file(adev->dev, &dev_attr_jpeg_reset_mask); + } } diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_preempt_mgr.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_preempt_mgr.c index e8adfd0a570a..34b5e22b44e5 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_preempt_mgr.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_preempt_mgr.c @@ -137,7 +137,8 @@ void amdgpu_preempt_mgr_fini(struct amdgpu_device *adev) if (ret) return; - device_remove_file(adev->dev, &dev_attr_mem_info_preempt_used); + if (adev->dev->kobj.sd) + device_remove_file(adev->dev, &dev_attr_mem_info_preempt_used); ttm_resource_manager_cleanup(man); ttm_set_driver_manager(&adev->mman.bdev, AMDGPU_PL_PREEMPT, NULL); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c index 8c89b69edc20..113f0d242618 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c @@ -451,6 +451,8 @@ void amdgpu_sdma_sysfs_reset_mask_fini(struct amdgpu_device *adev) if (!amdgpu_gpu_recovery) return; - if (adev->sdma.num_instances) - device_remove_file(adev->dev, &dev_attr_sdma_reset_mask); + if (adev->dev->kobj.sd) { + if (adev->sdma.num_instances) + device_remove_file(adev->dev, &dev_attr_sdma_reset_mask); + } } diff --git 
a/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c index 5ced59bf0f00..3e94c3ba1ba2 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c @@ -1315,6 +1315,8 @@ int amdgpu_vcn_sysfs_reset_mask_init(struct amdgpu_device *adev) void amdgpu_vcn_sysfs_reset_mask_fini(struct amdgpu_device *adev) { - if (adev->vcn.num_vcn_inst) - device_remove_file(adev->dev, &dev_attr_vcn_reset_mask); + if (adev->dev->kobj.sd) { + if (adev->vcn.num_vcn_inst) + device_remove_file(adev->dev, &dev_attr_vcn_reset_mask); + } } diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vpe.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vpe.c index 3e6f9dfb61bb..110b120d7375 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vpe.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vpe.c @@ -904,8 +904,10 @@ int amdgpu_vpe_sysfs_reset_mask_init(struct amdgpu_device *adev) void amdgpu_vpe_sysfs_reset_mask_fini(struct amdgpu_device *adev) { - if (adev->vpe.num_instances) - device_remove_file(adev->dev, &dev_attr_vpe_reset_mask); + if (adev->dev->kobj.sd) { + if (adev->vpe.num_instances) + device_remove_file(adev->dev, &dev_attr_vpe_reset_mask); + } } static const struct amdgpu_ring_funcs vpe_ring_funcs = { diff --git a/drivers/gpu/drm/amd/amdgpu/df_v3_6.c b/drivers/gpu/drm/amd/amdgpu/df_v3_6.c index 483a441b46aa..621aeca53880 100644 --- a/drivers/gpu/drm/amd/amdgpu/df_v3_6.c +++ b/drivers/gpu/drm/amd/amdgpu/df_v3_6.c @@ -254,8 +254,8 @@ static void df_v3_6_sw_init(struct amdgpu_device *adev) static void df_v3_6_sw_fini(struct amdgpu_device *adev) { - - device_remove_file(adev->dev, &dev_attr_df_cntr_avail); + if (adev->dev->kobj.sd) + device_remove_file(adev->dev, &dev_attr_df_cntr_avail); } -- 2.50.1 From 928cd772e18ffbd7723cb2361db4a8ccf2222235 Mon Sep 17 00:00:00 2001 From: Xiang Liu Date: Fri, 15 Nov 2024 16:59:30 +0800 Subject: [PATCH 15/16] drm/amdgpu/vcn: reset fw_shared when VCPU buffers corrupted on vcn v4.0.3 MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit It is not necessarily corrupted. When there is RAS fatal error, device memory access is blocked. Hence vcpu bo cannot be saved to system memory as in a regular suspend sequence before going for reset. In other full device reset cases, that gets saved and restored during resume. 
v2: Remove redundant code like vcn_v4_0 did v2: Refine commit message v3: Drop the volatile v3: Refine commit message Signed-off-by: Xiang Liu Acked-by: Christian König Reviewed-by: Stanley.Yang Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/vcn_v4_0_3.c | 30 ++++++++++++++++++------- 1 file changed, 22 insertions(+), 8 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/vcn_v4_0_3.c b/drivers/gpu/drm/amd/amdgpu/vcn_v4_0_3.c index 63e837f83646..3f69b9b2bcd0 100644 --- a/drivers/gpu/drm/amd/amdgpu/vcn_v4_0_3.c +++ b/drivers/gpu/drm/amd/amdgpu/vcn_v4_0_3.c @@ -123,6 +123,20 @@ static int vcn_v4_0_3_early_init(struct amdgpu_ip_block *ip_block) return amdgpu_vcn_early_init(adev); } +static int vcn_v4_0_3_fw_shared_init(struct amdgpu_device *adev, int inst_idx) +{ + struct amdgpu_vcn4_fw_shared *fw_shared; + + fw_shared = adev->vcn.inst[inst_idx].fw_shared.cpu_addr; + fw_shared->present_flag_0 = cpu_to_le32(AMDGPU_FW_SHARED_FLAG_0_UNIFIED_QUEUE); + fw_shared->sq.is_enabled = 1; + + if (amdgpu_vcnfw_log) + amdgpu_vcn_fwlog_init(&adev->vcn.inst[inst_idx]); + + return 0; +} + /** * vcn_v4_0_3_sw_init - sw init for VCN block * @@ -155,8 +169,6 @@ static int vcn_v4_0_3_sw_init(struct amdgpu_ip_block *ip_block) return r; for (i = 0; i < adev->vcn.num_vcn_inst; i++) { - volatile struct amdgpu_vcn4_fw_shared *fw_shared; - vcn_inst = GET_INST(VCN, i); ring = &adev->vcn.inst[i].ring_enc[0]; @@ -179,12 +191,7 @@ static int vcn_v4_0_3_sw_init(struct amdgpu_ip_block *ip_block) if (r) return r; - fw_shared = adev->vcn.inst[i].fw_shared.cpu_addr; - fw_shared->present_flag_0 = cpu_to_le32(AMDGPU_FW_SHARED_FLAG_0_UNIFIED_QUEUE); - fw_shared->sq.is_enabled = true; - - if (amdgpu_vcnfw_log) - amdgpu_vcn_fwlog_init(&adev->vcn.inst[i]); + vcn_v4_0_3_fw_shared_init(adev, i); } /* TODO: Add queue reset mask when FW fully supports it */ @@ -289,6 +296,8 @@ static int vcn_v4_0_3_hw_init(struct amdgpu_ip_block *ip_block) } } else { for (i = 0; i < adev->vcn.num_vcn_inst; ++i) { + struct amdgpu_vcn4_fw_shared *fw_shared; + vcn_inst = GET_INST(VCN, i); ring = &adev->vcn.inst[i].ring_enc[0]; @@ -312,6 +321,11 @@ static int vcn_v4_0_3_hw_init(struct amdgpu_ip_block *ip_block) regVCN_RB1_DB_CTRL); } + /* Re-init fw_shared when RAS fatal error occurred */ + fw_shared = adev->vcn.inst[i].fw_shared.cpu_addr; + if (!fw_shared->sq.is_enabled) + vcn_v4_0_3_fw_shared_init(adev, i); + r = amdgpu_ring_test_helper(ring); if (r) return r; -- 2.50.1 From b61badd20b443eabe132314669bb51a263982e5c Mon Sep 17 00:00:00 2001 From: Vitaly Prosyak Date: Mon, 11 Nov 2024 17:24:08 -0500 Subject: [PATCH 16/16] drm/amdgpu: fix usage slab after free MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit [ +0.000021] BUG: KASAN: slab-use-after-free in drm_sched_entity_flush+0x6cb/0x7a0 [gpu_sched] [ +0.000027] Read of size 8 at addr ffff8881b8605f88 by task amd_pci_unplug/2147 [ +0.000023] CPU: 6 PID: 2147 Comm: amd_pci_unplug Not tainted 6.10.0+ #1 [ +0.000016] Hardware name: ASUS System Product Name/ROG STRIX B550-F GAMING (WI-FI), BIOS 1401 12/03/2020 [ +0.000016] Call Trace: [ +0.000008] [ +0.000009] dump_stack_lvl+0x76/0xa0 [ +0.000017] print_report+0xce/0x5f0 [ +0.000017] ? drm_sched_entity_flush+0x6cb/0x7a0 [gpu_sched] [ +0.000019] ? srso_return_thunk+0x5/0x5f [ +0.000015] ? kasan_complete_mode_report_info+0x72/0x200 [ +0.000016] ? drm_sched_entity_flush+0x6cb/0x7a0 [gpu_sched] [ +0.000019] kasan_report+0xbe/0x110 [ +0.000015] ? 
drm_sched_entity_flush+0x6cb/0x7a0 [gpu_sched] [ +0.000023] __asan_report_load8_noabort+0x14/0x30 [ +0.000014] drm_sched_entity_flush+0x6cb/0x7a0 [gpu_sched] [ +0.000020] ? srso_return_thunk+0x5/0x5f [ +0.000013] ? __kasan_check_write+0x14/0x30 [ +0.000016] ? __pfx_drm_sched_entity_flush+0x10/0x10 [gpu_sched] [ +0.000020] ? srso_return_thunk+0x5/0x5f [ +0.000013] ? __kasan_check_write+0x14/0x30 [ +0.000013] ? srso_return_thunk+0x5/0x5f [ +0.000013] ? enable_work+0x124/0x220 [ +0.000015] ? __pfx_enable_work+0x10/0x10 [ +0.000013] ? srso_return_thunk+0x5/0x5f [ +0.000014] ? free_large_kmalloc+0x85/0xf0 [ +0.000016] drm_sched_entity_destroy+0x18/0x30 [gpu_sched] [ +0.000020] amdgpu_vce_sw_fini+0x55/0x170 [amdgpu] [ +0.000735] ? __kasan_check_read+0x11/0x20 [ +0.000016] vce_v4_0_sw_fini+0x80/0x110 [amdgpu] [ +0.000726] amdgpu_device_fini_sw+0x331/0xfc0 [amdgpu] [ +0.000679] ? mutex_unlock+0x80/0xe0 [ +0.000017] ? __pfx_amdgpu_device_fini_sw+0x10/0x10 [amdgpu] [ +0.000662] ? srso_return_thunk+0x5/0x5f [ +0.000014] ? __kasan_check_write+0x14/0x30 [ +0.000013] ? srso_return_thunk+0x5/0x5f [ +0.000013] ? mutex_unlock+0x80/0xe0 [ +0.000016] amdgpu_driver_release_kms+0x16/0x80 [amdgpu] [ +0.000663] drm_minor_release+0xc9/0x140 [drm] [ +0.000081] drm_release+0x1fd/0x390 [drm] [ +0.000082] __fput+0x36c/0xad0 [ +0.000018] __fput_sync+0x3c/0x50 [ +0.000014] __x64_sys_close+0x7d/0xe0 [ +0.000014] x64_sys_call+0x1bc6/0x2680 [ +0.000014] do_syscall_64+0x70/0x130 [ +0.000014] ? srso_return_thunk+0x5/0x5f [ +0.000014] ? irqentry_exit_to_user_mode+0x60/0x190 [ +0.000015] ? srso_return_thunk+0x5/0x5f [ +0.000014] ? irqentry_exit+0x43/0x50 [ +0.000012] ? srso_return_thunk+0x5/0x5f [ +0.000013] ? exc_page_fault+0x7c/0x110 [ +0.000015] entry_SYSCALL_64_after_hwframe+0x76/0x7e [ +0.000014] RIP: 0033:0x7ffff7b14f67 [ +0.000013] Code: ff e8 0d 16 02 00 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 00 f3 0f 1e fa 64 8b 04 25 18 00 00 00 85 c0 75 10 b8 03 00 00 00 0f 05 <48> 3d 00 f0 ff ff 77 41 c3 48 83 ec 18 89 7c 24 0c e8 73 ba f7 ff [ +0.000026] RSP: 002b:00007fffffffe378 EFLAGS: 00000246 ORIG_RAX: 0000000000000003 [ +0.000019] RAX: ffffffffffffffda RBX: 0000000000000000 RCX: 00007ffff7b14f67 [ +0.000014] RDX: 0000000000000000 RSI: 00007ffff7f6f47a RDI: 0000000000000003 [ +0.000014] RBP: 00007fffffffe3a0 R08: 0000555555569890 R09: 0000000000000000 [ +0.000014] R10: 0000000000000000 R11: 0000000000000246 R12: 00007fffffffe5c8 [ +0.000013] R13: 00005555555552a9 R14: 0000555555557d48 R15: 00007ffff7ffd040 [ +0.000020] [ +0.000016] Allocated by task 383 on cpu 7 at 26.880319s: [ +0.000014] kasan_save_stack+0x28/0x60 [ +0.000008] kasan_save_track+0x18/0x70 [ +0.000007] kasan_save_alloc_info+0x38/0x60 [ +0.000007] __kasan_kmalloc+0xc1/0xd0 [ +0.000007] kmalloc_trace_noprof+0x180/0x380 [ +0.000007] drm_sched_init+0x411/0xec0 [gpu_sched] [ +0.000012] amdgpu_device_init+0x695f/0xa610 [amdgpu] [ +0.000658] amdgpu_driver_load_kms+0x1a/0x120 [amdgpu] [ +0.000662] amdgpu_pci_probe+0x361/0xf30 [amdgpu] [ +0.000651] local_pci_probe+0xe7/0x1b0 [ +0.000009] pci_device_probe+0x248/0x890 [ +0.000008] really_probe+0x1fd/0x950 [ +0.000008] __driver_probe_device+0x307/0x410 [ +0.000007] driver_probe_device+0x4e/0x150 [ +0.000007] __driver_attach+0x223/0x510 [ +0.000006] bus_for_each_dev+0x102/0x1a0 [ +0.000007] driver_attach+0x3d/0x60 [ +0.000006] bus_add_driver+0x2ac/0x5f0 [ +0.000006] driver_register+0x13d/0x490 [ +0.000008] __pci_register_driver+0x1ee/0x2b0 [ +0.000007] llc_sap_close+0xb0/0x160 [llc] [ +0.000009] do_one_initcall+0x9c/0x3e0 [ 
+0.000008] do_init_module+0x241/0x760 [ +0.000008] load_module+0x51ac/0x6c30 [ +0.000006] __do_sys_init_module+0x234/0x270 [ +0.000007] __x64_sys_init_module+0x73/0xc0 [ +0.000006] x64_sys_call+0xe3/0x2680 [ +0.000006] do_syscall_64+0x70/0x130 [ +0.000007] entry_SYSCALL_64_after_hwframe+0x76/0x7e [ +0.000015] Freed by task 2147 on cpu 6 at 160.507651s: [ +0.000013] kasan_save_stack+0x28/0x60 [ +0.000007] kasan_save_track+0x18/0x70 [ +0.000007] kasan_save_free_info+0x3b/0x60 [ +0.000007] poison_slab_object+0x115/0x1c0 [ +0.000007] __kasan_slab_free+0x34/0x60 [ +0.000007] kfree+0xfa/0x2f0 [ +0.000007] drm_sched_fini+0x19d/0x410 [gpu_sched] [ +0.000012] amdgpu_fence_driver_sw_fini+0xc4/0x2f0 [amdgpu] [ +0.000662] amdgpu_device_fini_sw+0x77/0xfc0 [amdgpu] [ +0.000653] amdgpu_driver_release_kms+0x16/0x80 [amdgpu] [ +0.000655] drm_minor_release+0xc9/0x140 [drm] [ +0.000071] drm_release+0x1fd/0x390 [drm] [ +0.000071] __fput+0x36c/0xad0 [ +0.000008] __fput_sync+0x3c/0x50 [ +0.000007] __x64_sys_close+0x7d/0xe0 [ +0.000007] x64_sys_call+0x1bc6/0x2680 [ +0.000007] do_syscall_64+0x70/0x130 [ +0.000007] entry_SYSCALL_64_after_hwframe+0x76/0x7e [ +0.000014] The buggy address belongs to the object at ffff8881b8605f80 which belongs to the cache kmalloc-64 of size 64 [ +0.000020] The buggy address is located 8 bytes inside of freed 64-byte region [ffff8881b8605f80, ffff8881b8605fc0) [ +0.000028] The buggy address belongs to the physical page: [ +0.000011] page: refcount:1 mapcount:0 mapping:0000000000000000 index:0x0 pfn:0x1b8605 [ +0.000008] anon flags: 0x17ffffc0000000(node=0|zone=2|lastcpupid=0x1fffff) [ +0.000007] page_type: 0xffffefff(slab) [ +0.000009] raw: 0017ffffc0000000 ffff8881000428c0 0000000000000000 dead000000000001 [ +0.000006] raw: 0000000000000000 0000000000200020 00000001ffffefff 0000000000000000 [ +0.000006] page dumped because: kasan: bad access detected [ +0.000012] Memory state around the buggy address: [ +0.000011] ffff8881b8605e80: fa fb fb fb fb fb fb fb fc fc fc fc fc fc fc fc [ +0.000015] ffff8881b8605f00: 00 00 00 00 00 00 00 00 fc fc fc fc fc fc fc fc [ +0.000015] >ffff8881b8605f80: fa fb fb fb fb fb fb fb fc fc fc fc fc fc fc fc [ +0.000013] ^ [ +0.000011] ffff8881b8606000: fa fb fb fb fb fb fb fb fb fb fb fb fb fb fb fc [ +0.000014] ffff8881b8606080: fc fc fc fc fc fc fc fa fb fb fb fb fb fb fb fb [ +0.000013] ================================================================== The issue reproduced on VG20 during the IGT pci_unplug test. The root cause of the issue is that the function drm_sched_fini is called before drm_sched_entity_kill. In drm_sched_fini, the drm_sched_rq structure is freed, but this structure is later accessed by each entity within the run queue, leading to invalid memory access. To resolve this, the order of cleanup calls is updated: Before: amdgpu_fence_driver_sw_fini amdgpu_device_ip_fini After: amdgpu_device_ip_fini amdgpu_fence_driver_sw_fini This updated order ensures that all entities in the IPs are cleaned up first, followed by proper cleanup of the schedulers. Additional Investigation: During debugging, another issue was identified in the amdgpu_vce_sw_fini function. The vce.vcpu_bo buffer must be freed only as the final step in the cleanup process to prevent any premature access during earlier cleanup stages. v2: Using Christian suggestion call drm_sched_entity_destroy before drm_sched_fini. 
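To make the ordering hazard concrete, a minimal hypothetical sketch with generic names (not the exact call sites in the driver):

	/* Broken order: drm_sched_fini() frees the scheduler's run queues,
	 * but the entity still points into them, so the flush performed by
	 * drm_sched_entity_destroy() reads freed memory. */
	drm_sched_fini(&ring->sched);
	drm_sched_entity_destroy(&adev->vce.entity);

	/* Fixed order: entities are destroyed from the IP .sw_fini callbacks
	 * first, then the fence driver tears the schedulers down. */
	drm_sched_entity_destroy(&adev->vce.entity);
	drm_sched_fini(&ring->sched);
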
Cc: Christian König Cc: Alex Deucher Signed-off-by: Vitaly Prosyak Reviewed-by: Christian König Signed-off-by: Alex Deucher Cc: stable@vger.kernel.org --- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 2 +- drivers/gpu/drm/amd/amdgpu/amdgpu_vce.c | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index 714d2e586eac..9095c05e0269 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -4677,8 +4677,8 @@ void amdgpu_device_fini_sw(struct amdgpu_device *adev) int idx; bool px; - amdgpu_fence_driver_sw_fini(adev); amdgpu_device_ip_fini(adev); + amdgpu_fence_driver_sw_fini(adev); amdgpu_ucode_release(&adev->firmware.gpu_info_fw); adev->accel_working = false; dma_fence_put(rcu_dereference_protected(adev->gang_submit, true)); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vce.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vce.c index 74fdbf71d95b..599d3ca4e0ef 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vce.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vce.c @@ -214,15 +214,15 @@ int amdgpu_vce_sw_fini(struct amdgpu_device *adev) drm_sched_entity_destroy(&adev->vce.entity); - amdgpu_bo_free_kernel(&adev->vce.vcpu_bo, &adev->vce.gpu_addr, - (void **)&adev->vce.cpu_addr); - for (i = 0; i < adev->vce.num_rings; i++) amdgpu_ring_fini(&adev->vce.ring[i]); amdgpu_ucode_release(&adev->vce.fw); mutex_destroy(&adev->vce.idle_mutex); + amdgpu_bo_free_kernel(&adev->vce.vcpu_bo, &adev->vce.gpu_addr, + (void **)&adev->vce.cpu_addr); + return 0; } -- 2.50.1