From 83626efdce0be2eb80696110fe55e9290c72b1f1 Mon Sep 17 00:00:00 2001 From: George Shen Date: Thu, 5 Dec 2024 16:58:21 -0500 Subject: [PATCH 01/16] drm/amd/display: Disable MPC rate control on ODM pipe update [Why] Seamless boot skips MPC init for the active pipe, resulting in stale MPC rate control state being retained. This will cause issues since other logic assumes it is disabled (as DCN30 and newer does not need it). [How] Disable MPC rate control on ODM pipe update to cover the seamless boot case. Tested-by: Daniel Wheeler Reviewed-by: Alvin Lee Signed-off-by: George Shen Signed-off-by: Rodrigo Siqueira Signed-off-by: Alex Deucher --- .../amd/display/dc/hwss/dcn314/dcn314_hwseq.c | 12 ++++++++++++ .../amd/display/dc/hwss/dcn35/dcn35_hwseq.c | 12 ++++++++++++ .../drm/amd/display/dc/mpc/dcn30/dcn30_mpc.c | 18 ++++++++++++++++++ .../drm/amd/display/dc/mpc/dcn30/dcn30_mpc.h | 7 +++++++ 4 files changed, 49 insertions(+) diff --git a/drivers/gpu/drm/amd/display/dc/hwss/dcn314/dcn314_hwseq.c b/drivers/gpu/drm/amd/display/dc/hwss/dcn314/dcn314_hwseq.c index 9b88eb72086d..be26c925fdfa 100644 --- a/drivers/gpu/drm/amd/display/dc/hwss/dcn314/dcn314_hwseq.c +++ b/drivers/gpu/drm/amd/display/dc/hwss/dcn314/dcn314_hwseq.c @@ -162,6 +162,8 @@ void dcn314_update_odm(struct dc *dc, struct dc_state *context, struct pipe_ctx int opp_inst[MAX_PIPES] = {0}; int odm_slice_width = resource_get_odm_slice_dst_width(pipe_ctx, false); int last_odm_slice_width = resource_get_odm_slice_dst_width(pipe_ctx, true); + struct mpc *mpc = dc->res_pool->mpc; + int i; opp_cnt = get_odm_config(pipe_ctx, opp_inst); @@ -174,6 +176,16 @@ void dcn314_update_odm(struct dc *dc, struct dc_state *context, struct pipe_ctx pipe_ctx->stream_res.tg->funcs->set_odm_bypass( pipe_ctx->stream_res.tg, &pipe_ctx->stream->timing); + if (mpc->funcs->set_out_rate_control) { + for (i = 0; i < opp_cnt; ++i) { + mpc->funcs->set_out_rate_control( + mpc, opp_inst[i], + false, + 0, + NULL); + } + } + for (odm_pipe = pipe_ctx->next_odm_pipe; odm_pipe; odm_pipe = odm_pipe->next_odm_pipe) { odm_pipe->stream_res.opp->funcs->opp_pipe_clock_control( odm_pipe->stream_res.opp, diff --git a/drivers/gpu/drm/amd/display/dc/hwss/dcn35/dcn35_hwseq.c b/drivers/gpu/drm/amd/display/dc/hwss/dcn35/dcn35_hwseq.c index e599cdc465bf..d5f76cc69c73 100644 --- a/drivers/gpu/drm/amd/display/dc/hwss/dcn35/dcn35_hwseq.c +++ b/drivers/gpu/drm/amd/display/dc/hwss/dcn35/dcn35_hwseq.c @@ -426,6 +426,8 @@ void dcn35_update_odm(struct dc *dc, struct dc_state *context, struct pipe_ctx * int opp_inst[MAX_PIPES] = {0}; int odm_slice_width = resource_get_odm_slice_dst_width(pipe_ctx, false); int last_odm_slice_width = resource_get_odm_slice_dst_width(pipe_ctx, true); + struct mpc *mpc = dc->res_pool->mpc; + int i; opp_cnt = get_odm_config(pipe_ctx, opp_inst); @@ -438,6 +440,16 @@ void dcn35_update_odm(struct dc *dc, struct dc_state *context, struct pipe_ctx * pipe_ctx->stream_res.tg->funcs->set_odm_bypass( pipe_ctx->stream_res.tg, &pipe_ctx->stream->timing); + if (mpc->funcs->set_out_rate_control) { + for (i = 0; i < opp_cnt; ++i) { + mpc->funcs->set_out_rate_control( + mpc, opp_inst[i], + false, + 0, + NULL); + } + } + for (odm_pipe = pipe_ctx->next_odm_pipe; odm_pipe; odm_pipe = odm_pipe->next_odm_pipe) { odm_pipe->stream_res.opp->funcs->opp_pipe_clock_control( odm_pipe->stream_res.opp, diff --git a/drivers/gpu/drm/amd/display/dc/mpc/dcn30/dcn30_mpc.c b/drivers/gpu/drm/amd/display/dc/mpc/dcn30/dcn30_mpc.c index fe26fde12eeb..85298b8a1b5e 100644 --- a/drivers/gpu/drm/amd/display/dc/mpc/dcn30/dcn30_mpc.c +++ b/drivers/gpu/drm/amd/display/dc/mpc/dcn30/dcn30_mpc.c @@ -110,6 +110,23 @@ void mpc3_disable_dwb_mux( MPC_DWB0_MUX, 0xf); } +void mpc3_set_out_rate_control( + struct mpc *mpc, + int opp_id, + bool enable, + bool rate_2x_mode, + struct mpc_dwb_flow_control *flow_control) +{ + struct dcn30_mpc *mpc30 = TO_DCN30_MPC(mpc); + + /* Always disable mpc out rate and flow control. + * MPC flow rate control is not needed for DCN30 and above. + */ + REG_UPDATE_2(MUX[opp_id], + MPC_OUT_RATE_CONTROL_DISABLE, 1, + MPC_OUT_RATE_CONTROL, 0); +} + enum dc_lut_mode mpc3_get_ogam_current(struct mpc *mpc, int mpcc_id) { /*Contrary to DCN2 and DCN1 wherein a single status register field holds this info; @@ -1519,6 +1536,7 @@ static const struct mpc_funcs dcn30_mpc_funcs = { .set_dwb_mux = mpc3_set_dwb_mux, .disable_dwb_mux = mpc3_disable_dwb_mux, .is_dwb_idle = mpc3_is_dwb_idle, + .set_out_rate_control = mpc3_set_out_rate_control, .set_gamut_remap = mpc3_set_gamut_remap, .program_shaper = mpc3_program_shaper, .acquire_rmu = mpcc3_acquire_rmu, diff --git a/drivers/gpu/drm/amd/display/dc/mpc/dcn30/dcn30_mpc.h b/drivers/gpu/drm/amd/display/dc/mpc/dcn30/dcn30_mpc.h index ce93003dae01..103f29900a2c 100644 --- a/drivers/gpu/drm/amd/display/dc/mpc/dcn30/dcn30_mpc.h +++ b/drivers/gpu/drm/amd/display/dc/mpc/dcn30/dcn30_mpc.h @@ -1085,6 +1085,13 @@ bool mpc3_is_dwb_idle( struct mpc *mpc, int dwb_id); +void mpc3_set_out_rate_control( + struct mpc *mpc, + int opp_id, + bool enable, + bool rate_2x_mode, + struct mpc_dwb_flow_control *flow_control); + void mpc3_power_on_ogam_lut( struct mpc *mpc, int mpcc_id, bool power_on); -- 2.51.0 From 824ed4cb629c87b0b8aec997d3b7f6f77143ad25 Mon Sep 17 00:00:00 2001 From: Aric Cyr Date: Sun, 8 Dec 2024 22:01:45 -0500 Subject: [PATCH 02/16] drm/amd/display: 3.2.314 DC 3.2.314 contains some improvements as summarized below: * Update DML21 code. * Fixes for FAMS2 interface. * HDMI fixes. * Compilation warning fixes. Signed-off-by: Aric Cyr Acked-by: Rodrigo Siqueira Signed-off-by: Rodrigo Siqueira Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/display/dc/dc.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/display/dc/dc.h b/drivers/gpu/drm/amd/display/dc/dc.h index aef70bcde355..8c6347413038 100644 --- a/drivers/gpu/drm/amd/display/dc/dc.h +++ b/drivers/gpu/drm/amd/display/dc/dc.h @@ -55,7 +55,7 @@ struct aux_payload; struct set_config_cmd_payload; struct dmub_notification; -#define DC_VER "3.2.313" +#define DC_VER "3.2.314" #define MAX_SURFACES 3 #define MAX_PLANES 6 -- 2.51.0 From 22b9555bc90df22b585bdd1f161b61584b13af51 Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Thu, 12 Dec 2024 16:47:48 -0500 Subject: [PATCH 03/16] drm/amdgpu/nbio7.7: fix IP version check Use the helper function rather than reading it directly. Reviewed-by: Yang Wang Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/nbio_v7_7.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/amdgpu/nbio_v7_7.c b/drivers/gpu/drm/amd/amdgpu/nbio_v7_7.c index 1ac730328516..3fb6d2aa7e3b 100644 --- a/drivers/gpu/drm/amd/amdgpu/nbio_v7_7.c +++ b/drivers/gpu/drm/amd/amdgpu/nbio_v7_7.c @@ -247,7 +247,7 @@ static void nbio_v7_7_init_registers(struct amdgpu_device *adev) if (def != data) WREG32_SOC15(NBIO, 0, regBIF0_PCIE_MST_CTRL_3, data); - switch (adev->ip_versions[NBIO_HWIP][0]) { + switch (amdgpu_ip_version(adev, NBIO_HWIP, 0)) { case IP_VERSION(7, 7, 0): data = RREG32_SOC15(NBIO, 0, regRCC_DEV0_EPF5_STRAP4) & ~BIT(23); WREG32_SOC15(NBIO, 0, regRCC_DEV0_EPF5_STRAP4, data); -- 2.51.0 From 0ec43fbece784215d3c4469973e4556d70bce915 Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Thu, 12 Dec 2024 16:49:20 -0500 Subject: [PATCH 04/16] drm/amdgpu/nbio7.0: fix IP version check Use the helper function rather than reading it directly. Reviewed-by: Yang Wang Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/nbio_v7_0.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/amdgpu/nbio_v7_0.c b/drivers/gpu/drm/amd/amdgpu/nbio_v7_0.c index 49e953f86ced..d1032e9992b4 100644 --- a/drivers/gpu/drm/amd/amdgpu/nbio_v7_0.c +++ b/drivers/gpu/drm/amd/amdgpu/nbio_v7_0.c @@ -278,7 +278,7 @@ static void nbio_v7_0_init_registers(struct amdgpu_device *adev) { uint32_t data; - switch (adev->ip_versions[NBIO_HWIP][0]) { + switch (amdgpu_ip_version(adev, NBIO_HWIP, 0)) { case IP_VERSION(2, 5, 0): data = RREG32_SOC15(NBIO, 0, regRCC_DEV0_EPF6_STRAP4) & ~BIT(23); WREG32_SOC15(NBIO, 0, regRCC_DEV0_EPF6_STRAP4, data); -- 2.51.0 From 2c8eeaaa0fe5841ccf07a0eb51b1426f34ef39f7 Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Thu, 12 Dec 2024 17:00:07 -0500 Subject: [PATCH 05/16] drm/amdgpu/nbio7.11: fix IP version check Use the helper function rather than reading it directly. Reviewed-by: Yang Wang Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/nbio_v7_11.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/amdgpu/nbio_v7_11.c b/drivers/gpu/drm/amd/amdgpu/nbio_v7_11.c index 814ab59fdd4a..41421da63a08 100644 --- a/drivers/gpu/drm/amd/amdgpu/nbio_v7_11.c +++ b/drivers/gpu/drm/amd/amdgpu/nbio_v7_11.c @@ -275,7 +275,7 @@ static void nbio_v7_11_init_registers(struct amdgpu_device *adev) if (def != data) WREG32_SOC15(NBIO, 0, regBIF_BIF256_CI256_RC3X4_USB4_PCIE_MST_CTRL_3, data); - switch (adev->ip_versions[NBIO_HWIP][0]) { + switch (amdgpu_ip_version(adev, NBIO_HWIP, 0)) { case IP_VERSION(7, 11, 0): case IP_VERSION(7, 11, 1): case IP_VERSION(7, 11, 2): -- 2.51.0 From 63bfd24088b42c6f55c2096bfc41b50213d419b2 Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Thu, 12 Dec 2024 17:03:20 -0500 Subject: [PATCH 06/16] drm/amdgpu/mmhub4.1: fix IP version check Use the helper function rather than reading it directly. Reviewed-by: Yang Wang Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/mmhub_v4_1_0.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/amdgpu/mmhub_v4_1_0.c b/drivers/gpu/drm/amd/amdgpu/mmhub_v4_1_0.c index 0fbc3be81f14..f2ab5001b492 100644 --- a/drivers/gpu/drm/amd/amdgpu/mmhub_v4_1_0.c +++ b/drivers/gpu/drm/amd/amdgpu/mmhub_v4_1_0.c @@ -108,7 +108,7 @@ mmhub_v4_1_0_print_l2_protection_fault_status(struct amdgpu_device *adev, dev_err(adev->dev, "MMVM_L2_PROTECTION_FAULT_STATUS_LO32:0x%08X\n", status); - switch (adev->ip_versions[MMHUB_HWIP][0]) { + switch (amdgpu_ip_version(adev, MMHUB_HWIP, 0)) { case IP_VERSION(4, 1, 0): mmhub_cid = mmhub_client_ids_v4_1_0[cid][rw]; break; -- 2.51.0 From f1fd1d0f40272948aa6ab82a3a82ecbbc76dff53 Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Thu, 12 Dec 2024 17:04:58 -0500 Subject: [PATCH 07/16] drm/amdgpu/gfx12: fix IP version check Use the helper function rather than reading it directly. Reviewed-by: Yang Wang Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/gfx_v12_0.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v12_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v12_0.c index 6ca9d0461e4c..40c47a2dd060 100644 --- a/drivers/gpu/drm/amd/amdgpu/gfx_v12_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v12_0.c @@ -4126,7 +4126,7 @@ static int gfx_v12_0_set_clockgating_state(struct amdgpu_ip_block *ip_block, if (amdgpu_sriov_vf(adev)) return 0; - switch (adev->ip_versions[GC_HWIP][0]) { + switch (amdgpu_ip_version(adev, GC_HWIP, 0)) { case IP_VERSION(12, 0, 0): case IP_VERSION(12, 0, 1): gfx_v12_0_update_gfx_clock_gating(adev, -- 2.51.0 From 8f2cd1067afe68372a1723e05e19b68ed187676a Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Thu, 12 Dec 2024 17:06:26 -0500 Subject: [PATCH 08/16] drm/amdgpu/smu14.0.2: fix IP version check Use the helper function rather than reading it directly. Reviewed-by: Yang Wang Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/pm/swsmu/smu14/smu_v14_0_2_ppt.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu14/smu_v14_0_2_ppt.c b/drivers/gpu/drm/amd/pm/swsmu/smu14/smu_v14_0_2_ppt.c index 6a565ce74d5b..5cad09c5f2ff 100644 --- a/drivers/gpu/drm/amd/pm/swsmu/smu14/smu_v14_0_2_ppt.c +++ b/drivers/gpu/drm/amd/pm/swsmu/smu14/smu_v14_0_2_ppt.c @@ -2096,7 +2096,7 @@ static int smu_v14_0_2_enable_gfx_features(struct smu_context *smu) { struct amdgpu_device *adev = smu->adev; - if (adev->ip_versions[MP1_HWIP][0] == IP_VERSION(14, 0, 2)) + if (amdgpu_ip_version(adev, MP1_HWIP, 0) == IP_VERSION(14, 0, 2)) return smu_cmn_send_smc_msg_with_param(smu, SMU_MSG_EnableAllSmuFeatures, FEATURE_PWR_GFX, NULL); else -- 2.51.0 From 34c4eb7d4e0cd443399a0f114d467d2b3ff05419 Mon Sep 17 00:00:00 2001 From: Karol Przybylski Date: Sun, 15 Dec 2024 13:28:57 +0100 Subject: [PATCH 09/16] drm/amdgpu: Fix potential integer overflow in scheduler mask calculations The use of 1 << i in scheduler mask calculations can result in an unintentional integer overflow due to the expression being evaluated as a 32-bit signed integer. This patch replaces 1 << i with 1ULL << i to ensure the operation is performed as a 64-bit unsigned integer, preventing overflow Discovered in coverity scan, CID 1636393, 1636175, 1636007, 1635853 Fixes: c5c63d9cb5d3 ("drm/amdgpu: add amdgpu_gfx_sched_mask and amdgpu_compute_sched_mask debugfs") Signed-off-by: Karol Przybylski Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c index a4dde54512b1..4a4e40dd25d6 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c @@ -2124,7 +2124,7 @@ static int amdgpu_debugfs_gfx_sched_mask_set(void *data, u64 val) if (!adev) return -ENODEV; - mask = (1 << adev->gfx.num_gfx_rings) - 1; + mask = (1ULL << adev->gfx.num_gfx_rings) - 1; if ((val & mask) == 0) return -EINVAL; @@ -2152,7 +2152,7 @@ static int amdgpu_debugfs_gfx_sched_mask_get(void *data, u64 *val) for (i = 0; i < adev->gfx.num_gfx_rings; ++i) { ring = &adev->gfx.gfx_ring[i]; if (ring->sched.ready) - mask |= 1 << i; + mask |= 1ULL << i; } *val = mask; @@ -2194,7 +2194,7 @@ static int amdgpu_debugfs_compute_sched_mask_set(void *data, u64 val) if (!adev) return -ENODEV; - mask = (1 << adev->gfx.num_compute_rings) - 1; + mask = (1ULL << adev->gfx.num_compute_rings) - 1; if ((val & mask) == 0) return -EINVAL; @@ -2223,7 +2223,7 @@ static int amdgpu_debugfs_compute_sched_mask_get(void *data, u64 *val) for (i = 0; i < adev->gfx.num_compute_rings; ++i) { ring = &adev->gfx.compute_ring[i]; if (ring->sched.ready) - mask |= 1 << i; + mask |= 1ULL << i; } *val = mask; -- 2.51.0 From b4b7271e5ca95b581f2fcc4ae852c4079215e92d Mon Sep 17 00:00:00 2001 From: Philip Yang Date: Tue, 26 Nov 2024 15:45:32 -0500 Subject: [PATCH 10/16] drm/amdgpu: Don't enable sdma 4.4.5 CTXEMPTY interrupt The sdma context empty interrupt is dropped in amdgpu_irq_dispatch as unregistered interrupt src_id 243, this interrupt accounts to 1/3 of total interrupts and causes IH primary ring overflow when running stressful benchmark application. Disable this interrupt has no side effect found. Signed-off-by: Philip Yang Reviewed-by: Felix Kuehling Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c b/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c index 4c8308b2878b..56507ae919b0 100644 --- a/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c +++ b/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c @@ -973,10 +973,12 @@ static int sdma_v4_4_2_inst_start(struct amdgpu_device *adev, /* set utc l1 enable flag always to 1 */ temp = RREG32_SDMA(i, regSDMA_CNTL); temp = REG_SET_FIELD(temp, SDMA_CNTL, UTC_L1_ENABLE, 1); - /* enable context empty interrupt during initialization */ - temp = REG_SET_FIELD(temp, SDMA_CNTL, CTXEMPTY_INT_ENABLE, 1); - WREG32_SDMA(i, regSDMA_CNTL, temp); + if (amdgpu_ip_version(adev, SDMA0_HWIP, 0) < IP_VERSION(4, 4, 5)) { + /* enable context empty interrupt during initialization */ + temp = REG_SET_FIELD(temp, SDMA_CNTL, CTXEMPTY_INT_ENABLE, 1); + WREG32_SDMA(i, regSDMA_CNTL, temp); + } if (!amdgpu_sriov_vf(adev)) { if (adev->firmware.load_type != AMDGPU_FW_LOAD_PSP) { /* unhalt engine */ -- 2.51.0 From d1ebe307b44bbc9a98578c8f8089bb8789c5ecd7 Mon Sep 17 00:00:00 2001 From: Candice Li Date: Mon, 16 Dec 2024 17:20:12 +0800 Subject: [PATCH 11/16] drm/amdgpu: Enable psp v14_0_3 RAS support for non-SRIOV configurations. Enable psp v14_0_3 RAS support for non-SRIOV configurations. Signed-off-by: Candice Li Reviewed-by: Hawking Zhang Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index db081618e85c..01c947066a2e 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -3572,7 +3572,6 @@ static bool amdgpu_ras_asic_supported(struct amdgpu_device *adev) case IP_VERSION(13, 0, 6): case IP_VERSION(13, 0, 12): case IP_VERSION(13, 0, 14): - case IP_VERSION(14, 0, 3): return true; default: return false; @@ -3586,6 +3585,7 @@ static bool amdgpu_ras_asic_supported(struct amdgpu_device *adev) case IP_VERSION(13, 0, 10): case IP_VERSION(13, 0, 12): case IP_VERSION(13, 0, 14): + case IP_VERSION(14, 0, 3): return true; default: return false; -- 2.51.0 From 57f812d171af4ba233d3ed7c94dfa5b8e92dcc04 Mon Sep 17 00:00:00 2001 From: =?utf8?q?Christian=20K=C3=B6nig?= Date: Thu, 12 Dec 2024 16:29:18 +0100 Subject: [PATCH 12/16] drm/amdgpu: fix amdgpu_coredump MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit The VM pointer might already be outdated when that function is called. Use the PASID instead to gather the information instead. Signed-off-by: Christian König Reviewed-by: Alex Deucher Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_dev_coredump.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_dev_coredump.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_dev_coredump.c index 946c48829f19..824f9da5b6ce 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_dev_coredump.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_dev_coredump.c @@ -343,11 +343,10 @@ void amdgpu_coredump(struct amdgpu_device *adev, bool skip_vram_check, coredump->skip_vram_check = skip_vram_check; coredump->reset_vram_lost = vram_lost; - if (job && job->vm) { - struct amdgpu_vm *vm = job->vm; + if (job && job->pasid) { struct amdgpu_task_info *ti; - ti = amdgpu_vm_get_task_info_vm(vm); + ti = amdgpu_vm_get_task_info_pasid(adev, job->pasid); if (ti) { coredump->reset_task_info = *ti; amdgpu_vm_put_task_info(ti); -- 2.51.0 From 26c95e838e6301b0230430ec2fadeabfcb07aeda Mon Sep 17 00:00:00 2001 From: =?utf8?q?Christian=20K=C3=B6nig?= Date: Thu, 12 Dec 2024 16:43:45 +0100 Subject: [PATCH 13/16] drm/amdgpu: set the VM pointer to NULL in amdgpu_job_prepare MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit As soon as the prepare phase is completed the VM might be released, better set it to NULL. Signed-off-by: Christian König Reviewed-by: Alex Deucher Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_job.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c index 2e7e1c7dfe7c..04691753cabf 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c @@ -361,6 +361,13 @@ amdgpu_job_prepare_job(struct drm_sched_job *sched_job, dev_err(ring->adev->dev, "Error getting VM ID (%d)\n", r); goto error; } + /* + * The VM structure might be released after the VMID is + * assigned, we had multiple problems with people trying to use + * the VM pointer so better set it to NULL. + */ + if (!fence) + job->vm = NULL; } return fence; -- 2.51.0 From 11815bb0e30966321ff4351b55ad7b6f2e0a63bf Mon Sep 17 00:00:00 2001 From: =?utf8?q?Christian=20K=C3=B6nig?= Date: Thu, 12 Dec 2024 16:51:04 +0100 Subject: [PATCH 14/16] drm/amdgpu: partially revert "reduce reset time" MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit This partially reverts commit 194eb174cbe4fe2b3376ac30acca2dc8c8beca00. This commit introduced a new state variable into adev without even remotely worrying about CPU barriers. Since we already have the amdgpu_in_reset() function exactly for this use case partially revert that. Signed-off-by: Christian König Reviewed-by: Alex Deucher Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu.h | 1 - drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c | 2 +- drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c | 4 ++-- drivers/gpu/drm/amd/amdgpu/amdgpu_job.c | 3 --- drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c | 2 +- 5 files changed, 4 insertions(+), 8 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h index 9f5351c013ff..69895fccb474 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h @@ -1175,7 +1175,6 @@ struct amdgpu_device { struct work_struct reset_work; - bool job_hang; bool dc_enabled; /* Mask of active clusters */ uint32_t aid_mask; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c index de30143ea51b..2e5732dfd425 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c @@ -836,7 +836,7 @@ int amdgpu_amdkfd_unmap_hiq(struct amdgpu_device *adev, u32 doorbell_off, if (!kiq->pmf || !kiq->pmf->kiq_unmap_queues) return -EINVAL; - if (!kiq_ring->sched.ready || adev->job_hang) + if (!kiq_ring->sched.ready || amdgpu_in_reset(adev)) return 0; ring_funcs = kzalloc(sizeof(*ring_funcs), GFP_KERNEL); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c index 4a4e40dd25d6..6d5d81f0dc4e 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c @@ -515,7 +515,7 @@ int amdgpu_gfx_disable_kcq(struct amdgpu_device *adev, int xcc_id) if (!kiq->pmf || !kiq->pmf->kiq_unmap_queues) return -EINVAL; - if (!kiq_ring->sched.ready || adev->job_hang || amdgpu_in_reset(adev)) + if (!kiq_ring->sched.ready || amdgpu_in_reset(adev)) return 0; spin_lock(&kiq->ring_lock); @@ -567,7 +567,7 @@ int amdgpu_gfx_disable_kgq(struct amdgpu_device *adev, int xcc_id) if (!kiq->pmf || !kiq->pmf->kiq_unmap_queues) return -EINVAL; - if (!adev->gfx.kiq[0].ring.sched.ready || adev->job_hang) + if (!adev->gfx.kiq[0].ring.sched.ready || amdgpu_in_reset(adev)) return 0; if (amdgpu_gfx_is_master_xcc(adev, xcc_id)) { diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c index 04691753cabf..100f04475943 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c @@ -102,8 +102,6 @@ static enum drm_gpu_sched_stat amdgpu_job_timedout(struct drm_sched_job *s_job) return DRM_GPU_SCHED_STAT_ENODEV; } - adev->job_hang = true; - /* * Do the coredump immediately after a job timeout to get a very * close dump/snapshot/representation of GPU's current error status @@ -181,7 +179,6 @@ static enum drm_gpu_sched_stat amdgpu_job_timedout(struct drm_sched_job *s_job) } exit: - adev->job_hang = false; drm_dev_exit(idx); return DRM_GPU_SCHED_STAT_NOMINAL; } diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c index 9484f3b5a9b7..003522c2d902 100644 --- a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c @@ -5957,7 +5957,7 @@ static int gfx_v10_0_cp_gfx_enable(struct amdgpu_device *adev, bool enable) else WREG32_SOC15(GC, 0, mmCP_ME_CNTL, tmp); - if (adev->job_hang && !enable) + if (amdgpu_in_reset(adev) && !enable) return 0; for (i = 0; i < adev->usec_timeout; i++) { -- 2.51.0 From f607b2b867bbef8a3a76de8d0eccf7429782bdca Mon Sep 17 00:00:00 2001 From: Philip Yang Date: Fri, 22 Nov 2024 17:36:15 -0500 Subject: [PATCH 15/16] drm/amdkfd: KFD interrupt access ih_fifo data in-place To handle 40000 to 80000 interrupts per second running CPX mode with 4 streams/queues per KFD node, KFD interrupt handler becomes the performance bottleneck. Remove the kfifo_out memcpy overhead by accessing ih_fifo data in-place and updating rptr with kfifo_skip_count. Signed-off-by: Philip Yang Reviewed-by: Felix Kuehling Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdkfd/kfd_interrupt.c | 35 +++++++++------------- 1 file changed, 14 insertions(+), 21 deletions(-) diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_interrupt.c b/drivers/gpu/drm/amd/amdkfd/kfd_interrupt.c index 9b6b6e882593..e7412de9a0ac 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_interrupt.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_interrupt.c @@ -114,50 +114,43 @@ void kfd_interrupt_exit(struct kfd_node *node) */ bool enqueue_ih_ring_entry(struct kfd_node *node, const void *ih_ring_entry) { - int count; - - count = kfifo_in(&node->ih_fifo, ih_ring_entry, - node->kfd->device_info.ih_ring_entry_size); - if (count != node->kfd->device_info.ih_ring_entry_size) { + if (kfifo_is_full(&node->ih_fifo)) { dev_dbg_ratelimited(node->adev->dev, - "Interrupt ring overflow, dropping interrupt %d\n", - count); + "Interrupt ring overflow, dropping interrupt\n"); return false; } + kfifo_in(&node->ih_fifo, ih_ring_entry, node->kfd->device_info.ih_ring_entry_size); return true; } /* * Assumption: single reader/writer. This function is not re-entrant */ -static bool dequeue_ih_ring_entry(struct kfd_node *node, void *ih_ring_entry) +static bool dequeue_ih_ring_entry(struct kfd_node *node, u32 **ih_ring_entry) { int count; - count = kfifo_out(&node->ih_fifo, ih_ring_entry, - node->kfd->device_info.ih_ring_entry_size); - - WARN_ON(count && count != node->kfd->device_info.ih_ring_entry_size); + if (kfifo_is_empty(&node->ih_fifo)) + return false; + count = kfifo_out_linear_ptr(&node->ih_fifo, ih_ring_entry, + node->kfd->device_info.ih_ring_entry_size); + WARN_ON(count != node->kfd->device_info.ih_ring_entry_size); return count == node->kfd->device_info.ih_ring_entry_size; } static void interrupt_wq(struct work_struct *work) { - struct kfd_node *dev = container_of(work, struct kfd_node, - interrupt_work); - uint32_t ih_ring_entry[KFD_MAX_RING_ENTRY_SIZE]; + struct kfd_node *dev = container_of(work, struct kfd_node, interrupt_work); + uint32_t *ih_ring_entry; unsigned long start_jiffies = jiffies; - if (dev->kfd->device_info.ih_ring_entry_size > sizeof(ih_ring_entry)) { - dev_err_once(dev->adev->dev, "Ring entry too small\n"); - return; - } - - while (dequeue_ih_ring_entry(dev, ih_ring_entry)) { + while (dequeue_ih_ring_entry(dev, &ih_ring_entry)) { dev->kfd->device_info.event_interrupt_class->interrupt_wq(dev, ih_ring_entry); + kfifo_skip_count(&dev->ih_fifo, dev->kfd->device_info.ih_ring_entry_size); + if (time_is_before_jiffies(start_jiffies + HZ)) { /* If we spent more than a second processing signals, * reschedule the worker to avoid soft-lockup warnings -- 2.51.0 From 1b00143231d3e6f4b76f88f4edd6bb8a1332ef9b Mon Sep 17 00:00:00 2001 From: Philip Yang Date: Tue, 12 Nov 2024 22:07:33 -0500 Subject: [PATCH 16/16] drm/amdgpu: Optimize gfx v9 GPU page fault handling After GPU page fault, there are lots of page fault interrupts generated at short period even with CAM filter enabled because the fault address is different. Each page fault copy to KFD ih fifo to send event to user space by KFD interrupt worker, this could cause KFD ih fifo overflow while other processes generate events at same time. KFD process is aborted after GPU page fault, we only need one GPU page fault interrupt sent to KFD ih fifo to send memory exception event to user space. Incease KFD ih fifo size to 2 times of IH primary ring size, to handle the burst events case. This patch handle the gfx v9 path, cover retry on/off and CAM filter on/off cases. Signed-off-by: Philip Yang Reviewed-by: Felix Kuehling Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h | 10 ++++ drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c | 3 + drivers/gpu/drm/amd/amdkfd/kfd_device.c | 67 ++++++++++++++++++++++ drivers/gpu/drm/amd/amdkfd/kfd_interrupt.c | 2 +- drivers/gpu/drm/amd/amdkfd/kfd_priv.h | 3 + 5 files changed, 84 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h index 4b80ad860639..8af67f18500a 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h @@ -433,6 +433,9 @@ void kgd2kfd_unlock_kfd(void); int kgd2kfd_start_sched(struct kfd_dev *kfd, uint32_t node_id); int kgd2kfd_stop_sched(struct kfd_dev *kfd, uint32_t node_id); bool kgd2kfd_compute_active(struct kfd_dev *kfd, uint32_t node_id); +bool kgd2kfd_vmfault_fast_path(struct amdgpu_device *adev, struct amdgpu_iv_entry *entry, + bool retry_fault); + #else static inline int kgd2kfd_init(void) { @@ -518,5 +521,12 @@ static inline bool kgd2kfd_compute_active(struct kfd_dev *kfd, uint32_t node_id) { return false; } + +static inline bool kgd2kfd_vmfault_fast_path(struct amdgpu_device *adev, struct amdgpu_iv_entry *entry, + bool retry_fault) +{ + return false; +} + #endif #endif /* AMDGPU_AMDKFD_H_INCLUDED */ diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c index 48f1b9cf1f88..291549765c38 100644 --- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c @@ -623,6 +623,9 @@ static int gmc_v9_0_process_interrupt(struct amdgpu_device *adev, } } + if (kgd2kfd_vmfault_fast_path(adev, entry, retry_fault)) + return 1; + if (!printk_ratelimit()) return 0; diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device.c b/drivers/gpu/drm/amd/amdkfd/kfd_device.c index ac0fdaa1ea23..9b49563f2c42 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_device.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device.c @@ -1521,6 +1521,73 @@ bool kgd2kfd_compute_active(struct kfd_dev *kfd, uint32_t node_id) return kfd_compute_active(node); } +/** + * kgd2kfd_vmfault_fast_path() - KFD vm page fault interrupt handling fast path for gmc v9 + * @adev: amdgpu device + * @entry: vm fault interrupt vector + * @retry_fault: if this is retry fault + * + * retry fault - + * with CAM enabled, adev primary ring + * | gmc_v9_0_process_interrupt() + * adev soft_ring + * | gmc_v9_0_process_interrupt() worker failed to recover page fault + * KFD node ih_fifo + * | KFD interrupt_wq worker + * kfd_signal_vm_fault_event + * + * without CAM, adev primary ring1 + * | gmc_v9_0_process_interrupt worker failed to recvoer page fault + * KFD node ih_fifo + * | KFD interrupt_wq worker + * kfd_signal_vm_fault_event + * + * no-retry fault - + * adev primary ring + * | gmc_v9_0_process_interrupt() + * KFD node ih_fifo + * | KFD interrupt_wq worker + * kfd_signal_vm_fault_event + * + * fast path - After kfd_signal_vm_fault_event, gmc_v9_0_process_interrupt drop the page fault + * of same process, don't copy interrupt to KFD node ih_fifo. + * With gdb debugger enabled, need convert the retry fault to no-retry fault for + * debugger, cannot use the fast path. + * + * Return: + * true - use the fast path to handle this fault + * false - use normal path to handle it + */ +bool kgd2kfd_vmfault_fast_path(struct amdgpu_device *adev, struct amdgpu_iv_entry *entry, + bool retry_fault) +{ + struct kfd_process *p; + u32 cam_index; + + if (entry->ih == &adev->irq.ih_soft || entry->ih == &adev->irq.ih1) { + p = kfd_lookup_process_by_pasid(entry->pasid); + if (!p) + return true; + + if (p->gpu_page_fault && !p->debug_trap_enabled) { + if (retry_fault && adev->irq.retry_cam_enabled) { + cam_index = entry->src_data[2] & 0x3ff; + WDOORBELL32(adev->irq.retry_cam_doorbell_index, cam_index); + } + + kfd_unref_process(p); + return true; + } + + /* + * This is the first page fault, set flag and then signal user space + */ + p->gpu_page_fault = true; + kfd_unref_process(p); + } + return false; +} + #if defined(CONFIG_DEBUG_FS) /* This function will send a package to HIQ to hang the HWS diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_interrupt.c b/drivers/gpu/drm/amd/amdkfd/kfd_interrupt.c index e7412de9a0ac..8e00800f3207 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_interrupt.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_interrupt.c @@ -46,7 +46,7 @@ #include #include "kfd_priv.h" -#define KFD_IH_NUM_ENTRIES 8192 +#define KFD_IH_NUM_ENTRIES 16384 static void interrupt_wq(struct work_struct *); diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h index 800e4ae5b573..e529fdc1b422 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h @@ -1003,6 +1003,9 @@ struct kfd_process { struct semaphore runtime_enable_sema; bool is_runtime_retry; struct kfd_runtime_info runtime_info; + + /* if gpu page fault sent to KFD */ + bool gpu_page_fault; }; #define KFD_PROCESS_TABLE_SIZE 5 /* bits: 32 entries */ -- 2.51.0