From 5b3922222cea2d1e95a72994bd79abd02e0f3f5e Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Tue, 11 Mar 2025 15:53:55 -0400 Subject: [PATCH 01/16] drm/amdgpu/pm: enable vcn busy sysfs for GC 12.x Make it visible for the all GC 12.x chips that support it. Reviewed-by: Lijo Lazar Reviewed-by: Kenneth Feng Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/pm/amdgpu_pm.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/pm/amdgpu_pm.c b/drivers/gpu/drm/amd/pm/amdgpu_pm.c index c1f5ef7384b4..74e75585a43b 100644 --- a/drivers/gpu/drm/amd/pm/amdgpu_pm.c +++ b/drivers/gpu/drm/amd/pm/amdgpu_pm.c @@ -2326,7 +2326,9 @@ static int default_attr_update(struct amdgpu_device *adev, struct amdgpu_device_ gc_ver == IP_VERSION(11, 5, 0) || gc_ver == IP_VERSION(11, 5, 1) || gc_ver == IP_VERSION(11, 5, 2) || - gc_ver == IP_VERSION(11, 5, 3))) + gc_ver == IP_VERSION(11, 5, 3) || + gc_ver == IP_VERSION(12, 0, 0) || + gc_ver == IP_VERSION(12, 0, 1))) *states = ATTR_STATE_UNSUPPORTED; } else if (DEVICE_ATTR_IS(pcie_bw)) { /* PCIe Perf counters won't work on APU nodes */ -- 2.51.0 From 15030aeec3934d32622f73909d54944c8cacf227 Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Tue, 11 Mar 2025 16:34:45 -0400 Subject: [PATCH 02/16] drm/amdgpu/pm: enable vcn busy sysfs for GC 9.3.0 Make it visible for the all GC 9.3.0 chips that support it. Reviewed-by: Lijo Lazar Reviewed-by: Kenneth Feng Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/pm/amdgpu_pm.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/pm/amdgpu_pm.c b/drivers/gpu/drm/amd/pm/amdgpu_pm.c index 74e75585a43b..744a72f40dec 100644 --- a/drivers/gpu/drm/amd/pm/amdgpu_pm.c +++ b/drivers/gpu/drm/amd/pm/amdgpu_pm.c @@ -2314,7 +2314,8 @@ static int default_attr_update(struct amdgpu_device *adev, struct amdgpu_device_ gc_ver == IP_VERSION(9, 0, 1)) *states = ATTR_STATE_UNSUPPORTED; } else if (DEVICE_ATTR_IS(vcn_busy_percent)) { - if (!(gc_ver == IP_VERSION(10, 3, 1) || + if (!(gc_ver == IP_VERSION(9, 3, 0) || + gc_ver == IP_VERSION(10, 3, 1) || gc_ver == IP_VERSION(10, 3, 3) || gc_ver == IP_VERSION(10, 3, 6) || gc_ver == IP_VERSION(10, 3, 7) || -- 2.51.0 From 8d5e70ba5da21452735474b70322446aeb442c94 Mon Sep 17 00:00:00 2001 From: Emily Deng Date: Thu, 6 Feb 2025 13:09:00 +0800 Subject: [PATCH 03/16] drm/amdgpu: Add amdgpu_sriov_multi_vf_mode function Use amdgpu_sriov_multi_vf_mode to replace amdgpu_sriov_vf(adev) && !amdgpu_sriov_is_pp_one_vf(adev). Signed-off-by: Emily Deng Reviewed-by: Lijo Lazar Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c | 2 +- drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h | 2 ++ drivers/gpu/drm/amd/pm/amdgpu_pm.c | 4 ++-- drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c | 8 ++++---- 4 files changed, 9 insertions(+), 7 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c index 49ca8c814455..a1450f13d963 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c @@ -1990,7 +1990,7 @@ static int amdgpu_debugfs_sclk_set(void *data, u64 val) uint32_t max_freq, min_freq; struct amdgpu_device *adev = (struct amdgpu_device *)data; - if (amdgpu_sriov_vf(adev) && !amdgpu_sriov_is_pp_one_vf(adev)) + if (amdgpu_sriov_multi_vf_mode(adev)) return -EINVAL; ret = pm_runtime_get_sync(adev_to_drm(adev)->dev); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h index 9f65487e60f5..df03dba67ab8 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h @@ -364,6 +364,8 @@ static inline bool is_virtual_machine(void) #define amdgpu_sriov_is_pp_one_vf(adev) \ ((adev)->virt.gim_feature & AMDGIM_FEATURE_PP_ONE_VF) +#define amdgpu_sriov_multi_vf_mode(adev) \ + (amdgpu_sriov_vf(adev) && !amdgpu_sriov_is_pp_one_vf(adev)) #define amdgpu_sriov_is_debug(adev) \ ((!amdgpu_in_reset(adev)) && adev->virt.tdr_debug) #define amdgpu_sriov_is_normal(adev) \ diff --git a/drivers/gpu/drm/amd/pm/amdgpu_pm.c b/drivers/gpu/drm/amd/pm/amdgpu_pm.c index 744a72f40dec..922def51685b 100644 --- a/drivers/gpu/drm/amd/pm/amdgpu_pm.c +++ b/drivers/gpu/drm/amd/pm/amdgpu_pm.c @@ -1936,7 +1936,7 @@ static int pp_od_clk_voltage_attr_update(struct amdgpu_device *adev, struct amdg if (gc_ver == IP_VERSION(9, 4, 3) || gc_ver == IP_VERSION(9, 4, 4) || gc_ver == IP_VERSION(9, 5, 0)) { - if (amdgpu_sriov_vf(adev) && !amdgpu_sriov_is_pp_one_vf(adev)) + if (amdgpu_sriov_multi_vf_mode(adev)) *states = ATTR_STATE_UNSUPPORTED; return 0; } @@ -1971,7 +1971,7 @@ static int pp_dpm_dcefclk_attr_update(struct amdgpu_device *adev, struct amdgpu_ * setting should not be allowed from VF if not in one VF mode. */ if (gc_ver >= IP_VERSION(10, 0, 0) || - (amdgpu_sriov_vf(adev) && !amdgpu_sriov_is_pp_one_vf(adev))) { + (amdgpu_sriov_multi_vf_mode(adev))) { dev_attr->attr.mode &= ~S_IWUGO; dev_attr->store = NULL; } diff --git a/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c b/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c index 8cfb07549f54..d1aa6075c4f4 100644 --- a/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c +++ b/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c @@ -1814,7 +1814,7 @@ static int smu_hw_init(struct amdgpu_ip_block *ip_block) struct amdgpu_device *adev = ip_block->adev; struct smu_context *smu = adev->powerplay.pp_handle; - if (amdgpu_sriov_vf(adev) && !amdgpu_sriov_is_pp_one_vf(adev)) { + if (amdgpu_sriov_multi_vf_mode(adev)) { smu->pm_enabled = false; return 0; } @@ -2038,7 +2038,7 @@ static int smu_hw_fini(struct amdgpu_ip_block *ip_block) struct smu_context *smu = adev->powerplay.pp_handle; int i, ret; - if (amdgpu_sriov_vf(adev) && !amdgpu_sriov_is_pp_one_vf(adev)) + if (amdgpu_sriov_multi_vf_mode(adev)) return 0; for (i = 0; i < adev->vcn.num_vcn_inst; i++) { @@ -2106,7 +2106,7 @@ static int smu_suspend(struct amdgpu_ip_block *ip_block) int ret; uint64_t count; - if (amdgpu_sriov_vf(adev) && !amdgpu_sriov_is_pp_one_vf(adev)) + if (amdgpu_sriov_multi_vf_mode(adev)) return 0; if (!smu->pm_enabled) @@ -2142,7 +2142,7 @@ static int smu_resume(struct amdgpu_ip_block *ip_block) struct amdgpu_device *adev = ip_block->adev; struct smu_context *smu = adev->powerplay.pp_handle; - if (amdgpu_sriov_vf(adev)&& !amdgpu_sriov_is_pp_one_vf(adev)) + if (amdgpu_sriov_multi_vf_mode(adev)) return 0; if (!smu->pm_enabled) -- 2.51.0 From 2da3af5f0b4deda735898f5c587be4324dda3fbd Mon Sep 17 00:00:00 2001 From: Emily Deng Date: Fri, 7 Feb 2025 14:00:00 +0800 Subject: [PATCH 04/16] drm/amdgpu: set CP_HQD_PQ_DOORBELL_CONTROL.DOORBELL_MODE to 1 for sriov multiple vf. In sriov multiple vf, Set CP_HQD_PQ_DOORBELL_CONTROL.DOORBELL_MODE to 1 to read WPTR from MQD. Signed-off-by: Emily Deng Acked-by: Lijo Lazar Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c | 2 +- .../gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c | 27 ++++++++++++++++--- 2 files changed, 25 insertions(+), 4 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c b/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c index 476542b6e7b5..efe45e4edfd7 100644 --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c @@ -1821,7 +1821,7 @@ static int gfx_v9_4_3_xcc_mqd_init(struct amdgpu_ring *ring, int xcc_id) DOORBELL_SOURCE, 0); tmp = REG_SET_FIELD(tmp, CP_HQD_PQ_DOORBELL_CONTROL, DOORBELL_HIT, 0); - if (amdgpu_sriov_vf(adev)) + if (amdgpu_sriov_multi_vf_mode(adev)) tmp = REG_SET_FIELD(tmp, CP_HQD_PQ_DOORBELL_CONTROL, DOORBELL_MODE, 1); } else { diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c index 3014925d95ff..80320a6c8854 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c @@ -554,7 +554,7 @@ static void init_mqd_hiq_v9_4_3(struct mqd_manager *mm, void **mqd, m->cp_hqd_pq_control |= CP_HQD_PQ_CONTROL__NO_UPDATE_RPTR_MASK | 1 << CP_HQD_PQ_CONTROL__PRIV_STATE__SHIFT | 1 << CP_HQD_PQ_CONTROL__KMD_QUEUE__SHIFT; - if (amdgpu_sriov_vf(mm->dev->adev)) + if (amdgpu_sriov_multi_vf_mode(mm->dev->adev)) m->cp_hqd_pq_doorbell_control |= 1 << CP_HQD_PQ_DOORBELL_CONTROL__DOORBELL_MODE__SHIFT; m->cp_mqd_stride_size = kfd_hiq_mqd_stride(mm->dev); @@ -667,7 +667,9 @@ static void init_mqd_v9_4_3(struct mqd_manager *mm, void **mqd, get_xcc_mqd(mqd_mem_obj, &xcc_mqd_mem_obj, offset*xcc); init_mqd(mm, (void **)&m, &xcc_mqd_mem_obj, &xcc_gart_addr, q); - + if (amdgpu_sriov_multi_vf_mode(mm->dev->adev)) + m->cp_hqd_pq_doorbell_control |= 1 << + CP_HQD_PQ_DOORBELL_CONTROL__DOORBELL_MODE__SHIFT; m->cp_mqd_stride_size = offset; /* @@ -727,6 +729,9 @@ static void update_mqd_v9_4_3(struct mqd_manager *mm, void *mqd, m = get_mqd(mqd + size * xcc); update_mqd(mm, m, q, minfo); + if (amdgpu_sriov_multi_vf_mode(mm->dev->adev)) + m->cp_hqd_pq_doorbell_control |= 1 << + CP_HQD_PQ_DOORBELL_CONTROL__DOORBELL_MODE__SHIFT; update_cu_mask(mm, m, minfo, xcc); if (q->format == KFD_QUEUE_FORMAT_AQL) { @@ -749,6 +754,21 @@ static void update_mqd_v9_4_3(struct mqd_manager *mm, void *mqd, } } +static void restore_mqd_v9_4_3(struct mqd_manager *mm, void **mqd, + struct kfd_mem_obj *mqd_mem_obj, uint64_t *gart_addr, + struct queue_properties *qp, + const void *mqd_src, + const void *ctl_stack_src, u32 ctl_stack_size) +{ + restore_mqd(mm, mqd, mqd_mem_obj, gart_addr, qp, mqd_src, ctl_stack_src, ctl_stack_size); + if (amdgpu_sriov_multi_vf_mode(mm->dev->adev)) { + struct v9_mqd *m; + + m = (struct v9_mqd *) mqd_mem_obj->cpu_ptr; + m->cp_hqd_pq_doorbell_control |= 1 << + CP_HQD_PQ_DOORBELL_CONTROL__DOORBELL_MODE__SHIFT; + } +} static int destroy_mqd_v9_4_3(struct mqd_manager *mm, void *mqd, enum kfd_preempt_type type, unsigned int timeout, uint32_t pipe_id, uint32_t queue_id) @@ -883,7 +903,6 @@ struct mqd_manager *mqd_manager_init_v9(enum KFD_MQD_TYPE type, mqd->is_occupied = kfd_is_occupied_cp; mqd->get_checkpoint_info = get_checkpoint_info; mqd->checkpoint_mqd = checkpoint_mqd; - mqd->restore_mqd = restore_mqd; mqd->mqd_size = sizeof(struct v9_mqd); mqd->mqd_stride = mqd_stride_v9; #if defined(CONFIG_DEBUG_FS) @@ -895,12 +914,14 @@ struct mqd_manager *mqd_manager_init_v9(enum KFD_MQD_TYPE type, mqd->init_mqd = init_mqd_v9_4_3; mqd->load_mqd = load_mqd_v9_4_3; mqd->update_mqd = update_mqd_v9_4_3; + mqd->restore_mqd = restore_mqd_v9_4_3; mqd->destroy_mqd = destroy_mqd_v9_4_3; mqd->get_wave_state = get_wave_state_v9_4_3; } else { mqd->init_mqd = init_mqd; mqd->load_mqd = load_mqd; mqd->update_mqd = update_mqd; + mqd->restore_mqd = restore_mqd; mqd->destroy_mqd = kfd_destroy_mqd_cp; mqd->get_wave_state = get_wave_state; } -- 2.51.0 From a4b6e990d788ba0835476a9a9b0dfad01113ed32 Mon Sep 17 00:00:00 2001 From: ganglxie Date: Tue, 11 Mar 2025 18:35:44 +0800 Subject: [PATCH 05/16] drm/amdgpu: Save PA of bad pages for old asics for old asics that do not support mca translating, we just save PA for them Signed-off-by: ganglxie Reviewed-by: Tao Zhou Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 24 ++++++++++++++++--- .../gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c | 9 +++++-- 2 files changed, 28 insertions(+), 5 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index 285e3aa2bb2f..7cf8a3036828 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -2836,6 +2836,13 @@ static int __amdgpu_ras_convert_rec_array_from_rom(struct amdgpu_device *adev, save_nps = (bps[0].retired_page >> UMC_NPS_SHIFT) & UMC_NPS_MASK; + /*old asics just have pa in eeprom*/ + if (IP_VERSION_MAJ(amdgpu_ip_version(adev, UMC_HWIP, 0)) < 12) { + memcpy(err_data->err_addr, bps, + sizeof(struct eeprom_table_record) * adev->umc.retire_unit); + goto out; + } + for (i = 0; i < adev->umc.retire_unit; i++) bps[i].retired_page &= ~(UMC_NPS_MASK << UMC_NPS_SHIFT); @@ -2858,6 +2865,7 @@ static int __amdgpu_ras_convert_rec_array_from_rom(struct amdgpu_device *adev, } } +out: return __amdgpu_ras_restore_bad_pages(adev, err_data->err_addr, adev->umc.retire_unit); } @@ -2981,14 +2989,24 @@ int amdgpu_ras_save_bad_pages(struct amdgpu_device *adev, /* only new entries are saved */ if (save_count > 0) { - for (i = 0; i < unit_num; i++) { + /*old asics only save pa to eeprom like before*/ + if (IP_VERSION_MAJ(amdgpu_ip_version(adev, UMC_HWIP, 0)) < 12) { if (amdgpu_ras_eeprom_append(control, - &data->bps[bad_page_num + i * adev->umc.retire_unit], - 1)) { + &data->bps[bad_page_num], save_count)) { dev_err(adev->dev, "Failed to save EEPROM table data!"); return -EIO; } + } else { + for (i = 0; i < unit_num; i++) { + if (amdgpu_ras_eeprom_append(control, + &data->bps[bad_page_num + + i * adev->umc.retire_unit], 1)) { + dev_err(adev->dev, "Failed to save EEPROM table data!"); + return -EIO; + } + } } + dev_info(adev->dev, "Saved %d pages to EEPROM table.\n", save_count); } diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c index 09a6f8bc1a5a..3597ecd9baca 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c @@ -727,9 +727,14 @@ amdgpu_ras_eeprom_append_table(struct amdgpu_ras_eeprom_control *control, - control->ras_fri) % control->ras_max_record_count; - control->ras_num_mca_recs += num; - control->ras_num_bad_pages += num * adev->umc.retire_unit; + /*old asics only save pa to eeprom like before*/ + if (IP_VERSION_MAJ(amdgpu_ip_version(adev, UMC_HWIP, 0)) < 12) + control->ras_num_pa_recs += num; + else + control->ras_num_mca_recs += num; + control->ras_num_bad_pages = control->ras_num_pa_recs + + control->ras_num_mca_recs * adev->umc.retire_unit; Out: kfree(buf); return res; -- 2.51.0 From 0c7e053448945e5a4379dc4396c762d7422b11ca Mon Sep 17 00:00:00 2001 From: Amber Lin Date: Wed, 12 Mar 2025 21:14:43 -0400 Subject: [PATCH 06/16] drm/amdkfd: Correct F8_MODE for gfx950 Correct F8_MODE setting for gfx950 that was removed Fixes: 61972cd93af7 ("drm/amdkfd: Set per-process flags only once for gfx9/10/11/12") Signed-off-by: Amber Lin Reviewed-by: Harish Kasiviswanathan Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_v9.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_v9.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_v9.c index d794c8172b40..9fcc8c6e57b7 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_v9.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_v9.c @@ -71,8 +71,7 @@ static bool set_cache_memory_policy_v9(struct device_queue_manager *dqm, qpd->sh_mem_config |= 1 << SH_MEM_CONFIG__RETRY_DISABLE__SHIFT; if (KFD_GC_VERSION(dqm->dev->kfd) == IP_VERSION(9, 4, 3) || - KFD_GC_VERSION(dqm->dev->kfd) == IP_VERSION(9, 4, 4) || - KFD_GC_VERSION(dqm->dev->kfd) == IP_VERSION(9, 5, 0)) + KFD_GC_VERSION(dqm->dev->kfd) == IP_VERSION(9, 4, 4)) qpd->sh_mem_config |= (1 << SH_MEM_CONFIG__F8_MODE__SHIFT); if (KFD_GC_VERSION(dqm->dev->kfd) == IP_VERSION(9, 5, 0)) { -- 2.51.0 From 13c13bdd1b014eb8261326fe1d62cb4675f3c795 Mon Sep 17 00:00:00 2001 From: Xiang Liu Date: Fri, 28 Feb 2025 14:56:45 +0800 Subject: [PATCH 07/16] drm/amdgpu: Enable ACA by default for psp v13_0_6/v13_0_14 Enable ACA by default for psp v13_0_6/v13_0_14. Signed-off-by: Xiang Liu Reviewed-by: Hawking Zhang Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index 7cf8a3036828..cfec29835634 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -3785,9 +3785,11 @@ init_ras_enabled_flag: adev->ras_enabled = amdgpu_ras_enable == 0 ? 0 : adev->ras_hw_enabled & amdgpu_ras_mask; - /* aca is disabled by default except for psp v13_0_12 */ + /* aca is disabled by default except for psp v13_0_6/v13_0_12/v13_0_14 */ adev->aca.is_enabled = - (amdgpu_ip_version(adev, MP0_HWIP, 0) == IP_VERSION(13, 0, 12)); + (amdgpu_ip_version(adev, MP0_HWIP, 0) == IP_VERSION(13, 0, 6) || + amdgpu_ip_version(adev, MP0_HWIP, 0) == IP_VERSION(13, 0, 12) || + amdgpu_ip_version(adev, MP0_HWIP, 0) == IP_VERSION(13, 0, 14)); /* bad page feature is not applicable to specific app platform */ if (adev->gmc.is_app_apu && -- 2.51.0 From f81cd793119e7f4b426a825435d49cc10a081c7a Mon Sep 17 00:00:00 2001 From: Shaoyun Liu Date: Mon, 10 Mar 2025 12:38:12 -0400 Subject: [PATCH 08/16] drm/amd/amdgpu: Fix MES init sequence When MES is been used , the set_hw_resource_1 API is required to initialize MES internal context correctly Signed-off-by: Shaoyun Liu Acked-by: Alex Deucher Reviewed-by: Srinivasan Shanmugam Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h | 6 +-- drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c | 9 ++-- drivers/gpu/drm/amd/amdgpu/mes_v11_0.c | 59 ++++++++++++------------ drivers/gpu/drm/amd/amdgpu/mes_v12_0.c | 43 ++++++++--------- 4 files changed, 57 insertions(+), 60 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h index 68d640aaa2e1..da2c9a8cb3e0 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h @@ -143,9 +143,9 @@ struct amdgpu_mes { const struct amdgpu_mes_funcs *funcs; /* mes resource_1 bo*/ - struct amdgpu_bo *resource_1; - uint64_t resource_1_gpu_addr; - void *resource_1_addr; + struct amdgpu_bo *resource_1[AMDGPU_MAX_MES_PIPES]; + uint64_t resource_1_gpu_addr[AMDGPU_MAX_MES_PIPES]; + void *resource_1_addr[AMDGPU_MAX_MES_PIPES]; }; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c index ab7e73d0e7b1..0bb8cbe0dcc0 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c @@ -614,10 +614,11 @@ static int amdgpu_virt_write_vf2pf_data(struct amdgpu_device *adev) vf2pf_info->decode_usage = 0; vf2pf_info->dummy_page_addr = (uint64_t)adev->dummy_page_addr; - vf2pf_info->mes_info_addr = (uint64_t)adev->mes.resource_1_gpu_addr; - - if (adev->mes.resource_1) { - vf2pf_info->mes_info_size = adev->mes.resource_1->tbo.base.size; + if (amdgpu_sriov_is_mes_info_enable(adev)) { + vf2pf_info->mes_info_addr = + (uint64_t)(adev->mes.resource_1_gpu_addr[0] + AMDGPU_GPU_PAGE_SIZE); + vf2pf_info->mes_info_size = + adev->mes.resource_1[0]->tbo.base.size - AMDGPU_GPU_PAGE_SIZE; } vf2pf_info->checksum = amd_sriov_msg_checksum( diff --git a/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c b/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c index 7eee41187b7c..e65916ada23b 100644 --- a/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c +++ b/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c @@ -740,10 +740,13 @@ static int mes_v11_0_set_hw_resources_1(struct amdgpu_mes *mes) mes_set_hw_res_pkt.header.opcode = MES_SCH_API_SET_HW_RSRC_1; mes_set_hw_res_pkt.header.dwsize = API_FRAME_SIZE_IN_DWORDS; mes_set_hw_res_pkt.enable_mes_info_ctx = 1; - mes_set_hw_res_pkt.mes_info_ctx_mc_addr = mes->resource_1_gpu_addr; - mes_set_hw_res_pkt.mes_info_ctx_size = MES11_HW_RESOURCE_1_SIZE; - mes_set_hw_res_pkt.cleaner_shader_fence_mc_addr = - mes->resource_1_gpu_addr + MES11_HW_RESOURCE_1_SIZE; + + mes_set_hw_res_pkt.cleaner_shader_fence_mc_addr = mes->resource_1_gpu_addr[0]; + if (amdgpu_sriov_is_mes_info_enable(mes->adev)) { + mes_set_hw_res_pkt.mes_info_ctx_mc_addr = + mes->resource_1_gpu_addr[0] + AMDGPU_GPU_PAGE_SIZE; + mes_set_hw_res_pkt.mes_info_ctx_size = MES11_HW_RESOURCE_1_SIZE; + } return mes_v11_0_submit_pkt_and_poll_completion(mes, &mes_set_hw_res_pkt, sizeof(mes_set_hw_res_pkt), @@ -1381,7 +1384,7 @@ static int mes_v11_0_mqd_sw_init(struct amdgpu_device *adev, static int mes_v11_0_sw_init(struct amdgpu_ip_block *ip_block) { struct amdgpu_device *adev = ip_block->adev; - int pipe, r; + int pipe, r, bo_size; adev->mes.funcs = &mes_v11_0_funcs; adev->mes.kiq_hw_init = &mes_v11_0_kiq_hw_init; @@ -1416,19 +1419,21 @@ static int mes_v11_0_sw_init(struct amdgpu_ip_block *ip_block) if (r) return r; - if (amdgpu_sriov_is_mes_info_enable(adev) || - adev->gfx.enable_cleaner_shader) { - r = amdgpu_bo_create_kernel(adev, - MES11_HW_RESOURCE_1_SIZE + AMDGPU_GPU_PAGE_SIZE, - PAGE_SIZE, - AMDGPU_GEM_DOMAIN_VRAM, - &adev->mes.resource_1, - &adev->mes.resource_1_gpu_addr, - &adev->mes.resource_1_addr); - if (r) { - dev_err(adev->dev, "(%d) failed to create mes resource_1 bo\n", r); - return r; - } + bo_size = AMDGPU_GPU_PAGE_SIZE; + if (amdgpu_sriov_is_mes_info_enable(adev)) + bo_size += MES11_HW_RESOURCE_1_SIZE; + + /* Only needed for AMDGPU_MES_SCHED_PIPE on MES 11*/ + r = amdgpu_bo_create_kernel(adev, + bo_size, + PAGE_SIZE, + AMDGPU_GEM_DOMAIN_VRAM, + &adev->mes.resource_1[0], + &adev->mes.resource_1_gpu_addr[0], + &adev->mes.resource_1_addr[0]); + if (r) { + dev_err(adev->dev, "(%d) failed to create mes resource_1 bo\n", r); + return r; } return 0; @@ -1439,11 +1444,8 @@ static int mes_v11_0_sw_fini(struct amdgpu_ip_block *ip_block) struct amdgpu_device *adev = ip_block->adev; int pipe; - if (amdgpu_sriov_is_mes_info_enable(adev) || - adev->gfx.enable_cleaner_shader) { - amdgpu_bo_free_kernel(&adev->mes.resource_1, &adev->mes.resource_1_gpu_addr, - &adev->mes.resource_1_addr); - } + amdgpu_bo_free_kernel(&adev->mes.resource_1[0], &adev->mes.resource_1_gpu_addr[0], + &adev->mes.resource_1_addr[0]); for (pipe = 0; pipe < AMDGPU_MAX_MES_PIPES; pipe++) { kfree(adev->mes.mqd_backup[pipe]); @@ -1632,13 +1634,10 @@ static int mes_v11_0_hw_init(struct amdgpu_ip_block *ip_block) if (r) goto failure; - if (amdgpu_sriov_is_mes_info_enable(adev) || - adev->gfx.enable_cleaner_shader) { - r = mes_v11_0_set_hw_resources_1(&adev->mes); - if (r) { - DRM_ERROR("failed mes_v11_0_set_hw_resources_1, r=%d\n", r); - goto failure; - } + r = mes_v11_0_set_hw_resources_1(&adev->mes); + if (r) { + DRM_ERROR("failed mes_v11_0_set_hw_resources_1, r=%d\n", r); + goto failure; } r = mes_v11_0_query_sched_status(&adev->mes); diff --git a/drivers/gpu/drm/amd/amdgpu/mes_v12_0.c b/drivers/gpu/drm/amd/amdgpu/mes_v12_0.c index fdc435b62012..183dd3346da5 100644 --- a/drivers/gpu/drm/amd/amdgpu/mes_v12_0.c +++ b/drivers/gpu/drm/amd/amdgpu/mes_v12_0.c @@ -687,7 +687,7 @@ static int mes_v12_0_set_hw_resources_1(struct amdgpu_mes *mes, int pipe) mes_set_hw_res_1_pkt.header.dwsize = API_FRAME_SIZE_IN_DWORDS; mes_set_hw_res_1_pkt.mes_kiq_unmap_timeout = 0xa; mes_set_hw_res_1_pkt.cleaner_shader_fence_mc_addr = - mes->resource_1_gpu_addr; + mes->resource_1_gpu_addr[pipe]; return mes_v12_0_submit_pkt_and_poll_completion(mes, pipe, &mes_set_hw_res_1_pkt, sizeof(mes_set_hw_res_1_pkt), @@ -1519,23 +1519,22 @@ static int mes_v12_0_sw_init(struct amdgpu_ip_block *ip_block) if (r) return r; - if (!adev->enable_uni_mes && pipe == AMDGPU_MES_KIQ_PIPE) + if (!adev->enable_uni_mes && pipe == AMDGPU_MES_KIQ_PIPE) { r = mes_v12_0_kiq_ring_init(adev); - else + } + else { r = mes_v12_0_ring_init(adev, pipe); - if (r) - return r; - } - - if (adev->enable_uni_mes) { - r = amdgpu_bo_create_kernel(adev, AMDGPU_GPU_PAGE_SIZE, PAGE_SIZE, - AMDGPU_GEM_DOMAIN_VRAM, - &adev->mes.resource_1, - &adev->mes.resource_1_gpu_addr, - &adev->mes.resource_1_addr); - if (r) { - dev_err(adev->dev, "(%d) failed to create mes resource_1 bo\n", r); - return r; + if (r) + return r; + r = amdgpu_bo_create_kernel(adev, AMDGPU_GPU_PAGE_SIZE, PAGE_SIZE, + AMDGPU_GEM_DOMAIN_VRAM, + &adev->mes.resource_1[pipe], + &adev->mes.resource_1_gpu_addr[pipe], + &adev->mes.resource_1_addr[pipe]); + if (r) { + dev_err(adev->dev, "(%d) failed to create mes resource_1 bo pipe[%d]\n", r, pipe); + return r; + } } } @@ -1547,12 +1546,11 @@ static int mes_v12_0_sw_fini(struct amdgpu_ip_block *ip_block) struct amdgpu_device *adev = ip_block->adev; int pipe; - if (adev->enable_uni_mes) - amdgpu_bo_free_kernel(&adev->mes.resource_1, - &adev->mes.resource_1_gpu_addr, - &adev->mes.resource_1_addr); - for (pipe = 0; pipe < AMDGPU_MAX_MES_PIPES; pipe++) { + amdgpu_bo_free_kernel(&adev->mes.resource_1[pipe], + &adev->mes.resource_1_gpu_addr[pipe], + &adev->mes.resource_1_addr[pipe]); + kfree(adev->mes.mqd_backup[pipe]); amdgpu_bo_free_kernel(&adev->mes.eop_gpu_obj[pipe], @@ -1751,8 +1749,7 @@ static int mes_v12_0_hw_init(struct amdgpu_ip_block *ip_block) if (r) goto failure; - if (adev->enable_uni_mes) - mes_v12_0_set_hw_resources_1(&adev->mes, AMDGPU_MES_SCHED_PIPE); + mes_v12_0_set_hw_resources_1(&adev->mes, AMDGPU_MES_SCHED_PIPE); mes_v12_0_init_aggregated_doorbell(&adev->mes); -- 2.51.0 From 42d9d7bed270247f134190ba0cb05bbd072f58c2 Mon Sep 17 00:00:00 2001 From: Thadeu Lima de Souza Cascardo Date: Wed, 5 Feb 2025 10:06:38 -0300 Subject: [PATCH 09/16] drm/amd/display: avoid NPD when ASIC does not support DMUB ctx->dmub_srv will de NULL if the ASIC does not support DMUB, which is tested in dm_dmub_sw_init. However, it will be dereferenced in dmub_hw_lock_mgr_cmd if should_use_dmub_lock returns true. This has been the case since dmub support has been added for PSR1. Fix this by checking for dmub_srv in should_use_dmub_lock. [ 37.440832] BUG: kernel NULL pointer dereference, address: 0000000000000058 [ 37.447808] #PF: supervisor read access in kernel mode [ 37.452959] #PF: error_code(0x0000) - not-present page [ 37.458112] PGD 0 P4D 0 [ 37.460662] Oops: Oops: 0000 [#1] PREEMPT SMP NOPTI [ 37.465553] CPU: 2 UID: 1000 PID: 1745 Comm: DrmThread Not tainted 6.14.0-rc1-00003-gd62e938120f0 #23 99720e1cb1e0fc4773b8513150932a07de3c6e88 [ 37.478324] Hardware name: Google Morphius/Morphius, BIOS Google_Morphius.13434.858.0 10/26/2023 [ 37.487103] RIP: 0010:dmub_hw_lock_mgr_cmd+0x77/0xb0 [ 37.492074] Code: 44 24 0e 00 00 00 00 48 c7 04 24 45 00 00 0c 40 88 74 24 0d 0f b6 02 88 44 24 0c 8b 01 89 44 24 08 85 f6 75 05 c6 44 24 0e 01 <48> 8b 7f 58 48 89 e6 ba 01 00 00 00 e8 08 3c 2a 00 65 48 8b 04 5 [ 37.510822] RSP: 0018:ffff969442853300 EFLAGS: 00010202 [ 37.516052] RAX: 0000000000000000 RBX: ffff92db03000000 RCX: ffff969442853358 [ 37.523185] RDX: ffff969442853368 RSI: 0000000000000001 RDI: 0000000000000000 [ 37.530322] RBP: 0000000000000001 R08: 00000000000004a7 R09: 00000000000004a5 [ 37.537453] R10: 0000000000000476 R11: 0000000000000062 R12: ffff92db0ade8000 [ 37.544589] R13: ffff92da01180ae0 R14: ffff92da011802a8 R15: ffff92db03000000 [ 37.551725] FS: 0000784a9cdfc6c0(0000) GS:ffff92db2af00000(0000) knlGS:0000000000000000 [ 37.559814] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 37.565562] CR2: 0000000000000058 CR3: 0000000112b1c000 CR4: 00000000003506f0 [ 37.572697] Call Trace: [ 37.575152] [ 37.577258] ? __die_body+0x66/0xb0 [ 37.580756] ? page_fault_oops+0x3e7/0x4a0 [ 37.584861] ? exc_page_fault+0x3e/0xe0 [ 37.588706] ? exc_page_fault+0x5c/0xe0 [ 37.592550] ? asm_exc_page_fault+0x22/0x30 [ 37.596742] ? dmub_hw_lock_mgr_cmd+0x77/0xb0 [ 37.601107] dcn10_cursor_lock+0x1e1/0x240 [ 37.605211] program_cursor_attributes+0x81/0x190 [ 37.609923] commit_planes_for_stream+0x998/0x1ef0 [ 37.614722] update_planes_and_stream_v2+0x41e/0x5c0 [ 37.619703] dc_update_planes_and_stream+0x78/0x140 [ 37.624588] amdgpu_dm_atomic_commit_tail+0x4362/0x49f0 [ 37.629832] ? srso_return_thunk+0x5/0x5f [ 37.633847] ? mark_held_locks+0x6d/0xd0 [ 37.637774] ? _raw_spin_unlock_irq+0x24/0x50 [ 37.642135] ? srso_return_thunk+0x5/0x5f [ 37.646148] ? lockdep_hardirqs_on+0x95/0x150 [ 37.650510] ? srso_return_thunk+0x5/0x5f [ 37.654522] ? _raw_spin_unlock_irq+0x2f/0x50 [ 37.658883] ? srso_return_thunk+0x5/0x5f [ 37.662897] ? wait_for_common+0x186/0x1c0 [ 37.666998] ? srso_return_thunk+0x5/0x5f [ 37.671009] ? drm_crtc_next_vblank_start+0xc3/0x170 [ 37.675983] commit_tail+0xf5/0x1c0 [ 37.679478] drm_atomic_helper_commit+0x2a2/0x2b0 [ 37.684186] drm_atomic_commit+0xd6/0x100 [ 37.688199] ? __cfi___drm_printfn_info+0x10/0x10 [ 37.692911] drm_atomic_helper_update_plane+0xe5/0x130 [ 37.698054] drm_mode_cursor_common+0x501/0x670 [ 37.702600] ? __cfi_drm_mode_cursor_ioctl+0x10/0x10 [ 37.707572] drm_mode_cursor_ioctl+0x48/0x70 [ 37.711851] drm_ioctl_kernel+0xf2/0x150 [ 37.715781] drm_ioctl+0x363/0x590 [ 37.719189] ? __cfi_drm_mode_cursor_ioctl+0x10/0x10 [ 37.724165] amdgpu_drm_ioctl+0x41/0x80 [ 37.728013] __se_sys_ioctl+0x7f/0xd0 [ 37.731685] do_syscall_64+0x87/0x100 [ 37.735355] ? vma_end_read+0x12/0xe0 [ 37.739024] ? srso_return_thunk+0x5/0x5f [ 37.743041] ? find_held_lock+0x47/0xf0 [ 37.746884] ? vma_end_read+0x12/0xe0 [ 37.750552] ? srso_return_thunk+0x5/0x5f [ 37.754565] ? lock_release+0x1c4/0x2e0 [ 37.758406] ? vma_end_read+0x12/0xe0 [ 37.762079] ? exc_page_fault+0x84/0xe0 [ 37.765921] ? srso_return_thunk+0x5/0x5f [ 37.769938] ? lockdep_hardirqs_on+0x95/0x150 [ 37.774303] ? srso_return_thunk+0x5/0x5f [ 37.778317] ? exc_page_fault+0x84/0xe0 [ 37.782163] entry_SYSCALL_64_after_hwframe+0x55/0x5d [ 37.787218] RIP: 0033:0x784aa5ec3059 [ 37.790803] Code: 04 25 28 00 00 00 48 89 45 c8 31 c0 48 8d 45 10 c7 45 b0 10 00 00 00 48 89 45 b8 48 8d 45 d0 48 89 45 c0 b8 10 00 00 00 0f 05 <41> 89 c0 3d 00 f0 ff ff 77 1d 48 8b 45 c8 64 48 2b 04 25 28 00 0 [ 37.809553] RSP: 002b:0000784a9cdf90e0 EFLAGS: 00000246 ORIG_RAX: 0000000000000010 [ 37.817121] RAX: ffffffffffffffda RBX: 0000784a9cdf917c RCX: 0000784aa5ec3059 [ 37.824256] RDX: 0000784a9cdf917c RSI: 00000000c01c64a3 RDI: 0000000000000020 [ 37.831391] RBP: 0000784a9cdf9130 R08: 0000000000000100 R09: 0000000000ff0000 [ 37.838525] R10: 0000000000000000 R11: 0000000000000246 R12: 0000025c01606ed0 [ 37.845657] R13: 0000025c00030200 R14: 00000000c01c64a3 R15: 0000000000000020 [ 37.852799] [ 37.854992] Modules linked in: [ 37.864546] gsmi: Log Shutdown Reason 0x03 [ 37.868656] CR2: 0000000000000058 [ 37.871979] ---[ end trace 0000000000000000 ]--- [ 37.880976] RIP: 0010:dmub_hw_lock_mgr_cmd+0x77/0xb0 [ 37.885954] Code: 44 24 0e 00 00 00 00 48 c7 04 24 45 00 00 0c 40 88 74 24 0d 0f b6 02 88 44 24 0c 8b 01 89 44 24 08 85 f6 75 05 c6 44 24 0e 01 <48> 8b 7f 58 48 89 e6 ba 01 00 00 00 e8 08 3c 2a 00 65 48 8b 04 5 [ 37.904703] RSP: 0018:ffff969442853300 EFLAGS: 00010202 [ 37.909933] RAX: 0000000000000000 RBX: ffff92db03000000 RCX: ffff969442853358 [ 37.917068] RDX: ffff969442853368 RSI: 0000000000000001 RDI: 0000000000000000 [ 37.924201] RBP: 0000000000000001 R08: 00000000000004a7 R09: 00000000000004a5 [ 37.931336] R10: 0000000000000476 R11: 0000000000000062 R12: ffff92db0ade8000 [ 37.938469] R13: ffff92da01180ae0 R14: ffff92da011802a8 R15: ffff92db03000000 [ 37.945602] FS: 0000784a9cdfc6c0(0000) GS:ffff92db2af00000(0000) knlGS:0000000000000000 [ 37.953689] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 37.959435] CR2: 0000000000000058 CR3: 0000000112b1c000 CR4: 00000000003506f0 [ 37.966570] Kernel panic - not syncing: Fatal exception [ 37.971901] Kernel Offset: 0x30200000 from 0xffffffff81000000 (relocation range: 0xffffffff80000000-0xffffffffbfffffff) [ 37.982840] gsmi: Log Shutdown Reason 0x02 Fixes: b5c764d6ed55 ("drm/amd/display: Use HW lock mgr for PSR1") Signed-off-by: Thadeu Lima de Souza Cascardo Cc: Sun peng Li Cc: Tom Chung Cc: Daniel Wheeler Cc: Alex Deucher Reviewed-by: Rodrigo Siqueira Reviewed-by: Leo Li Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/display/dc/dce/dmub_hw_lock_mgr.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/gpu/drm/amd/display/dc/dce/dmub_hw_lock_mgr.c b/drivers/gpu/drm/amd/display/dc/dce/dmub_hw_lock_mgr.c index bf636b28e3e1..a262598aa676 100644 --- a/drivers/gpu/drm/amd/display/dc/dce/dmub_hw_lock_mgr.c +++ b/drivers/gpu/drm/amd/display/dc/dce/dmub_hw_lock_mgr.c @@ -63,6 +63,10 @@ void dmub_hw_lock_mgr_inbox0_cmd(struct dc_dmub_srv *dmub_srv, bool should_use_dmub_lock(struct dc_link *link) { + /* ASIC doesn't support DMUB */ + if (!link->ctx->dmub_srv) + return false; + if (link->psr_settings.psr_version == DC_PSR_VERSION_SU_1) return true; -- 2.51.0 From ebdc52607a46cda08972888178c6aa9cd6965141 Mon Sep 17 00:00:00 2001 From: Wentao Liang Date: Wed, 12 Mar 2025 14:31:06 +0800 Subject: [PATCH 10/16] drm/amdgpu/gfx12: correct cleanup of 'me' field with gfx_v12_0_me_fini() In gfx_v12_0_cp_gfx_load_me_microcode_rs64(), gfx_v12_0_pfp_fini() is incorrectly used to free 'me' field of 'gfx', since gfx_v12_0_pfp_fini() can only release 'pfp' field of 'gfx'. The release function of 'me' field should be gfx_v12_0_me_fini(). Fixes: 52cb80c12e8a ("drm/amdgpu: Add gfx v12_0 ip block support (v6)") Signed-off-by: Wentao Liang Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/gfx_v12_0.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v12_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v12_0.c index 87dc4195192c..da364e04f09c 100644 --- a/drivers/gpu/drm/amd/amdgpu/gfx_v12_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v12_0.c @@ -2455,7 +2455,7 @@ static int gfx_v12_0_cp_gfx_load_me_microcode_rs64(struct amdgpu_device *adev) (void **)&adev->gfx.me.me_fw_data_ptr); if (r) { dev_err(adev->dev, "(%d) failed to create me data bo\n", r); - gfx_v12_0_pfp_fini(adev); + gfx_v12_0_me_fini(adev); return r; } -- 2.51.0 From eb6cdfb807d038d9b9986b5c87188f28a4071eae Mon Sep 17 00:00:00 2001 From: David Belanger Date: Tue, 2 Jul 2024 17:56:41 -0400 Subject: [PATCH 11/16] drm/amdgpu: Restore uncached behaviour on GFX12 Always use MTYPE_UC if UNCACHED flag is specified. This makes kernarg region uncached and it restores usermode cache disable debug flag functionality. Do not set MTYPE_UC for COHERENT flag, on GFX12 coherence is handled by shader code. Signed-off-by: David Belanger Reviewed-by: Felix Kuehling Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/gmc_v12_0.c | 22 ++-------------------- drivers/gpu/drm/amd/amdkfd/kfd_svm.c | 8 +------- 2 files changed, 3 insertions(+), 27 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v12_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v12_0.c index bf8d01da8815..05c026d0b0d9 100644 --- a/drivers/gpu/drm/amd/amdgpu/gmc_v12_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v12_0.c @@ -501,9 +501,6 @@ static void gmc_v12_0_get_vm_pte(struct amdgpu_device *adev, uint64_t *flags) { struct amdgpu_bo *bo = mapping->bo_va->base.bo; - struct amdgpu_device *bo_adev; - bool coherent, is_system; - *flags &= ~AMDGPU_PTE_EXECUTABLE; *flags |= mapping->flags & AMDGPU_PTE_EXECUTABLE; @@ -519,26 +516,11 @@ static void gmc_v12_0_get_vm_pte(struct amdgpu_device *adev, *flags &= ~AMDGPU_PTE_VALID; } - if (!bo) - return; - - if (bo->flags & (AMDGPU_GEM_CREATE_COHERENT | - AMDGPU_GEM_CREATE_UNCACHED)) - *flags = AMDGPU_PTE_MTYPE_GFX12(*flags, MTYPE_UC); - - bo_adev = amdgpu_ttm_adev(bo->tbo.bdev); - coherent = bo->flags & AMDGPU_GEM_CREATE_COHERENT; - is_system = bo->tbo.resource && - (bo->tbo.resource->mem_type == TTM_PL_TT || - bo->tbo.resource->mem_type == AMDGPU_PL_PREEMPT); - if (bo && bo->flags & AMDGPU_GEM_CREATE_GFX12_DCC) *flags |= AMDGPU_PTE_DCC; - /* WA for HW bug */ - if (is_system || ((bo_adev != adev) && coherent)) - *flags = AMDGPU_PTE_MTYPE_GFX12(*flags, MTYPE_NC); - + if (bo && bo->flags & AMDGPU_GEM_CREATE_UNCACHED) + *flags = AMDGPU_PTE_MTYPE_GFX12(*flags, MTYPE_UC); } static unsigned gmc_v12_0_get_vbios_fb_size(struct amdgpu_device *adev) diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c index 63eeef2a75ba..100717a98ec1 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c @@ -1287,13 +1287,7 @@ svm_range_get_pte_flags(struct kfd_node *node, break; case IP_VERSION(12, 0, 0): case IP_VERSION(12, 0, 1): - if (domain == SVM_RANGE_VRAM_DOMAIN) { - if (bo_node != node) - mapping_flags |= AMDGPU_VM_MTYPE_NC; - } else { - mapping_flags |= coherent ? - AMDGPU_VM_MTYPE_UC : AMDGPU_VM_MTYPE_NC; - } + mapping_flags |= AMDGPU_VM_MTYPE_NC; break; default: mapping_flags |= coherent ? -- 2.51.0 From 4db4c82d4db7aeded480d905624396dc284db090 Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Thu, 13 Mar 2025 16:57:50 -0400 Subject: [PATCH 12/16] drm/amdgpu: drop drm_firmware_drivers_only() There are a number of systems and cloud providers out there that have nomodeset hardcoded in their kernel parameters to block nouveau for the nvidia driver. This prevents the amdgpu driver from loading. Unfortunately the end user cannot easily change this. The preferred way to block modules from loading is to use modprobe.blacklist=. That is what providers should be using to block specific drivers. Drop the check to allow the driver to load even when nomodeset is specified on the kernel command line. Reviewed-by: Kent Russell Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c index 653f2bc77530..1e77fdc9c7d1 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c @@ -2990,9 +2990,6 @@ static int __init amdgpu_init(void) { int r; - if (drm_firmware_drivers_only()) - return -EINVAL; - r = amdgpu_sync_init(); if (r) goto error_sync; -- 2.51.0 From e00e5c223878a60e391e5422d173c3382d378f87 Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Thu, 13 Mar 2025 20:52:38 -0400 Subject: [PATCH 13/16] drm/amdgpu: adjust drm_firmware_drivers_only() handling Move to probe so we can check the PCI device type and only apply the drm_firmware_drivers_only() check for PCI DISPLAY classes. Also add a module parameter to override the nomodeset kernel parameter as a workaround for platforms that have this hardcoded on their kernel command lines. Reviewed-by: Kent Russell Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c index 1e77fdc9c7d1..f62456da0ba5 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c @@ -176,6 +176,7 @@ uint amdgpu_sdma_phase_quantum = 32; char *amdgpu_disable_cu; char *amdgpu_virtual_display; bool enforce_isolation; +int amdgpu_modeset = -1; /* Specifies the default granularity for SVM, used in buffer * migration and restoration of backing memory when handling @@ -1037,6 +1038,13 @@ module_param_named(user_partt_mode, amdgpu_user_partt_mode, uint, 0444); module_param(enforce_isolation, bool, 0444); MODULE_PARM_DESC(enforce_isolation, "enforce process isolation between graphics and compute . enforce_isolation = on"); +/** + * DOC: modeset (int) + * Override nomodeset (1 = override, -1 = auto). The default is -1 (auto). + */ +MODULE_PARM_DESC(modeset, "Override nomodeset (1 = enable, -1 = auto)"); +module_param_named(modeset, amdgpu_modeset, int, 0444); + /** * DOC: seamless (int) * Seamless boot will keep the image on the screen during the boot process. @@ -2257,6 +2265,12 @@ static int amdgpu_pci_probe(struct pci_dev *pdev, int ret, retry = 0, i; bool supports_atomic = false; + if ((pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA || + (pdev->class >> 8) == PCI_CLASS_DISPLAY_OTHER) { + if (drm_firmware_drivers_only() && amdgpu_modeset == -1) + return -EINVAL; + } + /* skip devices which are owned by radeon */ for (i = 0; i < ARRAY_SIZE(amdgpu_unsupported_pciidlist); i++) { if (amdgpu_unsupported_pciidlist[i] == pdev->device) -- 2.51.0 From 9deacd6c55f1b31e5ab20db79df2e14ac480203c Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Wed, 25 Sep 2024 10:29:31 -0400 Subject: [PATCH 14/16] drm/amdgpu: don't free conflicting apertures for non-display devices PCI_CLASS_ACCELERATOR_PROCESSING devices won't ever be the sysfb, so there is no need to free conflicting apertures. Reviewed-by: Kent Russell Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index 17ce1677378e..cc0b6c70fcb8 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -4399,10 +4399,17 @@ int amdgpu_device_init(struct amdgpu_device *adev, if (r) return r; - /* Get rid of things like offb */ - r = aperture_remove_conflicting_pci_devices(adev->pdev, amdgpu_kms_driver.name); - if (r) - return r; + /* + * No need to remove conflicting FBs for non-display class devices. + * This prevents the sysfb from being freed accidently. + */ + if ((pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA || + (pdev->class >> 8) == PCI_CLASS_DISPLAY_OTHER) { + /* Get rid of things like offb */ + r = aperture_remove_conflicting_pci_devices(adev->pdev, amdgpu_kms_driver.name); + if (r) + return r; + } /* Enable TMZ based on IP_VERSION */ amdgpu_gmc_tmz_set(adev); -- 2.51.0 From 05d50ea3ea4123536ca461ff54fa04b94adc65c1 Mon Sep 17 00:00:00 2001 From: Tao Zhou Date: Tue, 4 Mar 2025 15:05:53 +0800 Subject: [PATCH 15/16] drm/amdgpu: format old RAS eeprom data into V3 version Clear old data and save it in V3 format. v2: only format eeprom data for new ASICs. Signed-off-by: Tao Zhou Reviewed-by: Hawking Zhang Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 7 +++++ .../gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c | 26 ++++++++++--------- .../gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h | 1 + 3 files changed, 22 insertions(+), 12 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index cfec29835634..bed2603ae4c4 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -3473,6 +3473,13 @@ int amdgpu_ras_init_badpage_info(struct amdgpu_device *adev) adev, control->bad_channel_bitmap); con->update_channel_flag = false; } + + /* The format action is only applied to new ASICs */ + if (IP_VERSION_MAJ(amdgpu_ip_version(adev, UMC_HWIP, 0)) >= 12 && + control->tbl_hdr.version < RAS_TABLE_VER_V3) + if (!amdgpu_ras_eeprom_reset_table(control)) + if (amdgpu_ras_save_bad_pages(adev, NULL)) + dev_warn(adev->dev, "Failed to format RAS EEPROM data in V3 version!\n"); } return ret; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c index 3597ecd9baca..f6c0c7d3a19d 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c @@ -413,9 +413,11 @@ static void amdgpu_ras_set_eeprom_table_version(struct amdgpu_ras_eeprom_control switch (amdgpu_ip_version(adev, UMC_HWIP, 0)) { case IP_VERSION(8, 10, 0): - case IP_VERSION(12, 0, 0): hdr->version = RAS_TABLE_VER_V2_1; return; + case IP_VERSION(12, 0, 0): + hdr->version = RAS_TABLE_VER_V3; + return; default: hdr->version = RAS_TABLE_VER_V1; return; @@ -443,7 +445,7 @@ int amdgpu_ras_eeprom_reset_table(struct amdgpu_ras_eeprom_control *control) hdr->header = RAS_TABLE_HDR_VAL; amdgpu_ras_set_eeprom_table_version(control); - if (hdr->version == RAS_TABLE_VER_V2_1) { + if (hdr->version >= RAS_TABLE_VER_V2_1) { hdr->first_rec_offset = RAS_RECORD_START_V2_1; hdr->tbl_size = RAS_TABLE_HEADER_SIZE + RAS_TABLE_V2_1_INFO_SIZE; @@ -461,7 +463,7 @@ int amdgpu_ras_eeprom_reset_table(struct amdgpu_ras_eeprom_control *control) } csum = __calc_hdr_byte_sum(control); - if (hdr->version == RAS_TABLE_VER_V2_1) + if (hdr->version >= RAS_TABLE_VER_V2_1) csum += __calc_ras_info_byte_sum(control); csum = -csum; hdr->checksum = csum; @@ -757,7 +759,7 @@ amdgpu_ras_eeprom_update_header(struct amdgpu_ras_eeprom_control *control) "Saved bad pages %d reaches threshold value %d\n", control->ras_num_bad_pages, ras->bad_page_cnt_threshold); control->tbl_hdr.header = RAS_TABLE_HDR_BAD; - if (control->tbl_hdr.version == RAS_TABLE_VER_V2_1) { + if (control->tbl_hdr.version >= RAS_TABLE_VER_V2_1) { control->tbl_rai.rma_status = GPU_RETIRED__ECC_REACH_THRESHOLD; control->tbl_rai.health_percent = 0; } @@ -770,7 +772,7 @@ amdgpu_ras_eeprom_update_header(struct amdgpu_ras_eeprom_control *control) amdgpu_dpm_send_rma_reason(adev); } - if (control->tbl_hdr.version == RAS_TABLE_VER_V2_1) + if (control->tbl_hdr.version >= RAS_TABLE_VER_V2_1) control->tbl_hdr.tbl_size = RAS_TABLE_HEADER_SIZE + RAS_TABLE_V2_1_INFO_SIZE + control->ras_num_recs * RAS_TABLE_RECORD_SIZE; @@ -810,7 +812,7 @@ amdgpu_ras_eeprom_update_header(struct amdgpu_ras_eeprom_control *control) * now calculate gpu health percent */ if (amdgpu_bad_page_threshold != 0 && - control->tbl_hdr.version == RAS_TABLE_VER_V2_1 && + control->tbl_hdr.version >= RAS_TABLE_VER_V2_1 && control->ras_num_bad_pages <= ras->bad_page_cnt_threshold) control->tbl_rai.health_percent = ((ras->bad_page_cnt_threshold - control->ras_num_bad_pages) * 100) / @@ -823,7 +825,7 @@ amdgpu_ras_eeprom_update_header(struct amdgpu_ras_eeprom_control *control) csum += *pp; csum += __calc_hdr_byte_sum(control); - if (control->tbl_hdr.version == RAS_TABLE_VER_V2_1) + if (control->tbl_hdr.version >= RAS_TABLE_VER_V2_1) csum += __calc_ras_info_byte_sum(control); /* avoid sign extension when assigning to "checksum" */ csum = -csum; @@ -1040,7 +1042,7 @@ uint32_t amdgpu_ras_eeprom_max_record_count(struct amdgpu_ras_eeprom_control *co /* get available eeprom table version first before eeprom table init */ amdgpu_ras_set_eeprom_table_version(control); - if (control->tbl_hdr.version == RAS_TABLE_VER_V2_1) + if (control->tbl_hdr.version >= RAS_TABLE_VER_V2_1) return RAS_MAX_RECORD_COUNT_V2_1; else return RAS_MAX_RECORD_COUNT; @@ -1285,7 +1287,7 @@ static int __verify_ras_table_checksum(struct amdgpu_ras_eeprom_control *control int buf_size, res; u8 csum, *buf, *pp; - if (control->tbl_hdr.version == RAS_TABLE_VER_V2_1) + if (control->tbl_hdr.version >= RAS_TABLE_VER_V2_1) buf_size = RAS_TABLE_HEADER_SIZE + RAS_TABLE_V2_1_INFO_SIZE + control->ras_num_recs * RAS_TABLE_RECORD_SIZE; @@ -1388,7 +1390,7 @@ int amdgpu_ras_eeprom_init(struct amdgpu_ras_eeprom_control *control) __decode_table_header_from_buf(hdr, buf); - if (hdr->version == RAS_TABLE_VER_V2_1) { + if (hdr->version >= RAS_TABLE_VER_V2_1) { control->ras_num_recs = RAS_NUM_RECS_V2_1(hdr); control->ras_record_offset = RAS_RECORD_START_V2_1; control->ras_max_record_count = RAS_MAX_RECORD_COUNT_V2_1; @@ -1428,7 +1430,7 @@ int amdgpu_ras_eeprom_check(struct amdgpu_ras_eeprom_control *control) DRM_DEBUG_DRIVER("Found existing EEPROM table with %d records", control->ras_num_bad_pages); - if (hdr->version == RAS_TABLE_VER_V2_1) { + if (hdr->version >= RAS_TABLE_VER_V2_1) { res = __read_table_ras_info(control); if (res) return res; @@ -1448,7 +1450,7 @@ int amdgpu_ras_eeprom_check(struct amdgpu_ras_eeprom_control *control) ras->bad_page_cnt_threshold); } else if (hdr->header == RAS_TABLE_HDR_BAD && amdgpu_bad_page_threshold != 0) { - if (hdr->version == RAS_TABLE_VER_V2_1) { + if (hdr->version >= RAS_TABLE_VER_V2_1) { res = __read_table_ras_info(control); if (res) return res; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h index 13f7eda9a696..ec6d7ea37ad0 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h @@ -28,6 +28,7 @@ #define RAS_TABLE_VER_V1 0x00010000 #define RAS_TABLE_VER_V2_1 0x00021000 +#define RAS_TABLE_VER_V3 0x00030000 struct amdgpu_device; -- 2.51.0 From 55ff973fe1c053de143969cfc8b34baff084084a Mon Sep 17 00:00:00 2001 From: Kenneth Feng Date: Tue, 11 Mar 2025 16:46:39 +0800 Subject: [PATCH 16/16] drm/amd/amdgpu: shorten the gfx idle worker timeout Shorten the gfx idle worker timeout. This is to sync with DAL when there is no activity on the screen. Original 1 second can not sync with DAL, so DAL can not apply MALL when the workload type is not bootup default. Signed-off-by: Kenneth Feng Reviewed-by: Yang Wang Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h index 4b1675f79caa..3c030d3f39d4 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h @@ -57,8 +57,8 @@ enum amdgpu_gfx_pipe_priority { #define AMDGPU_GFX_QUEUE_PRIORITY_MINIMUM 0 #define AMDGPU_GFX_QUEUE_PRIORITY_MAXIMUM 15 -/* 1 second timeout */ -#define GFX_PROFILE_IDLE_TIMEOUT msecs_to_jiffies(1000) +/* 10 millisecond timeout */ +#define GFX_PROFILE_IDLE_TIMEOUT msecs_to_jiffies(10) enum amdgpu_gfx_partition { AMDGPU_SPX_PARTITION_MODE = 0, -- 2.51.0