From 057fef20b8401110a7bc1c2fe9d804a8a0bf0d24 Mon Sep 17 00:00:00 2001 From: Victor Lu Date: Thu, 13 Feb 2025 18:38:28 -0500 Subject: [PATCH 01/16] drm/amdgpu: Do not program AGP BAR regs under SRIOV in gfxhub_v1_0.c SRIOV VF does not have write access to AGP BAR regs. Skip the writes to avoid a dmesg warning. Signed-off-by: Victor Lu Acked-by: Alex Deucher Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/gfxhub_v1_0.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/gfxhub_v1_0.c b/drivers/gpu/drm/amd/amdgpu/gfxhub_v1_0.c index 0e3ddea7b8e0..a7bfc9f41d0e 100644 --- a/drivers/gpu/drm/amd/amdgpu/gfxhub_v1_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gfxhub_v1_0.c @@ -92,12 +92,12 @@ static void gfxhub_v1_0_init_system_aperture_regs(struct amdgpu_device *adev) { uint64_t value; - /* Program the AGP BAR */ - WREG32_SOC15_RLC(GC, 0, mmMC_VM_AGP_BASE, 0); - WREG32_SOC15_RLC(GC, 0, mmMC_VM_AGP_BOT, adev->gmc.agp_start >> 24); - WREG32_SOC15_RLC(GC, 0, mmMC_VM_AGP_TOP, adev->gmc.agp_end >> 24); - if (!amdgpu_sriov_vf(adev) || adev->asic_type <= CHIP_VEGA10) { + /* Program the AGP BAR */ + WREG32_SOC15_RLC(GC, 0, mmMC_VM_AGP_BASE, 0); + WREG32_SOC15_RLC(GC, 0, mmMC_VM_AGP_BOT, adev->gmc.agp_start >> 24); + WREG32_SOC15_RLC(GC, 0, mmMC_VM_AGP_TOP, adev->gmc.agp_end >> 24); + /* Program the system aperture low logical page number. */ WREG32_SOC15_RLC(GC, 0, mmMC_VM_SYSTEM_APERTURE_LOW_ADDR, min(adev->gmc.fb_start, adev->gmc.agp_start) >> 18); -- 2.51.0 From bac38ca8c4755452fcd7e9f2603dea944bcfe76e Mon Sep 17 00:00:00 2001 From: Jonathan Kim Date: Wed, 15 Jan 2025 15:29:34 -0500 Subject: [PATCH 02/16] drm/amdkfd: implement per queue sdma reset for gfx 9.4+ To reset hung SDMA queues on GFX 9.4+ for the GFX9 family, a soft reset must be issued through SMU. 
Since soft resets will reset an entire SDMA engine, use a common KGD call to do the reset as the KGD will handle avoiding a reset of in flight GFX and paging queues on that engine. In addition, create a common call for all reset types to simplify the handling of module parameter settings that block gpu resets. Signed-off-by: Jonathan Kim Reviewed-by: Harish Kasiviswanathan Signed-off-by: Alex Deucher --- .../drm/amd/amdgpu/amdgpu_amdkfd_aldebaran.c | 1 + .../drm/amd/amdgpu/amdgpu_amdkfd_arcturus.c | 3 +- .../drm/amd/amdgpu/amdgpu_amdkfd_gc_9_4_3.c | 14 +- .../drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c | 9 +- .../drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.h | 2 + .../drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10_3.c | 3 +- .../drm/amd/amdgpu/amdgpu_amdkfd_gfx_v11.c | 9 +- .../drm/amd/amdgpu/amdgpu_amdkfd_gfx_v12.c | 7 + .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c | 16 ++- .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.h | 2 + .../drm/amd/amdkfd/kfd_device_queue_manager.c | 128 ++++++++++++++++-- .../gpu/drm/amd/include/kgd_kfd_interface.h | 2 + 12 files changed, 171 insertions(+), 25 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_aldebaran.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_aldebaran.c index 8dfdb18197c4..6e861d08d044 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_aldebaran.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_aldebaran.c @@ -193,4 +193,5 @@ const struct kfd2kgd_calls aldebaran_kfd2kgd = { .program_trap_handler_settings = kgd_gfx_v9_program_trap_handler_settings, .hqd_get_pq_addr = kgd_gfx_v9_hqd_get_pq_addr, .hqd_reset = kgd_gfx_v9_hqd_reset, + .hqd_sdma_get_doorbell = kgd_gfx_v9_hqd_sdma_get_doorbell }; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_arcturus.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_arcturus.c index 9abf29b58ac7..c820418e8ccd 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_arcturus.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_arcturus.c @@ -419,5 +419,6 @@ const struct kfd2kgd_calls arcturus_kfd2kgd 
= { .get_cu_occupancy = kgd_gfx_v9_get_cu_occupancy, .program_trap_handler_settings = kgd_gfx_v9_program_trap_handler_settings, .hqd_get_pq_addr = kgd_gfx_v9_hqd_get_pq_addr, - .hqd_reset = kgd_gfx_v9_hqd_reset + .hqd_reset = kgd_gfx_v9_hqd_reset, + .hqd_sdma_get_doorbell = kgd_gfx_v9_hqd_sdma_get_doorbell }; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gc_9_4_3.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gc_9_4_3.c index e2ae714a700f..0c0998477598 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gc_9_4_3.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gc_9_4_3.c @@ -509,6 +509,17 @@ static uint32_t kgd_gfx_v9_4_3_clear_address_watch(struct amdgpu_device *adev, return 0; } +static uint32_t kgd_gfx_v9_4_3_hqd_sdma_get_doorbell(struct amdgpu_device *adev, + int engine, int queue) +{ + uint32_t reg_offset = get_sdma_rlc_reg_offset(adev, engine, queue); + uint32_t status = RREG32(regSDMA_RLC0_CONTEXT_STATUS + reg_offset); + uint32_t doorbell_off = RREG32(regSDMA_RLC0_DOORBELL_OFFSET + reg_offset); + bool is_active = !!REG_GET_FIELD(status, SDMA_RLC0_CONTEXT_STATUS, SELECTED); + + return is_active ? 
doorbell_off >> 2 : 0; +} + const struct kfd2kgd_calls gc_9_4_3_kfd2kgd = { .program_sh_mem_settings = kgd_gfx_v9_program_sh_mem_settings, .set_pasid_vmid_mapping = kgd_gfx_v9_4_3_set_pasid_vmid_mapping, @@ -543,5 +554,6 @@ const struct kfd2kgd_calls gc_9_4_3_kfd2kgd = { .set_address_watch = kgd_gfx_v9_4_3_set_address_watch, .clear_address_watch = kgd_gfx_v9_4_3_clear_address_watch, .hqd_get_pq_addr = kgd_gfx_v9_hqd_get_pq_addr, - .hqd_reset = kgd_gfx_v9_hqd_reset + .hqd_reset = kgd_gfx_v9_hqd_reset, + .hqd_sdma_get_doorbell = kgd_gfx_v9_4_3_hqd_sdma_get_doorbell }; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c index 62176d607bef..2887b6f3eaa2 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c @@ -1084,6 +1084,12 @@ uint64_t kgd_gfx_v10_hqd_reset(struct amdgpu_device *adev, return 0; } +uint32_t kgd_gfx_v10_hqd_sdma_get_doorbell(struct amdgpu_device *adev, + int engine, int queue) +{ + return 0; +} + const struct kfd2kgd_calls gfx_v10_kfd2kgd = { .program_sh_mem_settings = kgd_program_sh_mem_settings, .set_pasid_vmid_mapping = kgd_set_pasid_vmid_mapping, @@ -1112,5 +1118,6 @@ const struct kfd2kgd_calls gfx_v10_kfd2kgd = { .build_grace_period_packet_info = kgd_gfx_v10_build_grace_period_packet_info, .program_trap_handler_settings = program_trap_handler_settings, .hqd_get_pq_addr = kgd_gfx_v10_hqd_get_pq_addr, - .hqd_reset = kgd_gfx_v10_hqd_reset + .hqd_reset = kgd_gfx_v10_hqd_reset, + .hqd_sdma_get_doorbell = kgd_gfx_v10_hqd_sdma_get_doorbell }; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.h index 9efd2dd4fdd7..db577c2a847a 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.h @@ -65,3 +65,5 @@ uint64_t kgd_gfx_v10_hqd_reset(struct amdgpu_device *adev, uint32_t queue_id, uint32_t inst, 
unsigned int utimeout); +uint32_t kgd_gfx_v10_hqd_sdma_get_doorbell(struct amdgpu_device *adev, + int engine, int queue); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10_3.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10_3.c index c718bedda0ca..ac9ad505f9d7 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10_3.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10_3.c @@ -682,5 +682,6 @@ const struct kfd2kgd_calls gfx_v10_3_kfd2kgd = { .set_address_watch = kgd_gfx_v10_set_address_watch, .clear_address_watch = kgd_gfx_v10_clear_address_watch, .hqd_get_pq_addr = kgd_gfx_v10_hqd_get_pq_addr, - .hqd_reset = kgd_gfx_v10_hqd_reset + .hqd_reset = kgd_gfx_v10_hqd_reset, + .hqd_sdma_get_doorbell = kgd_gfx_v10_hqd_sdma_get_doorbell }; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v11.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v11.c index a4ba49cb22db..e0e6a6a49d90 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v11.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v11.c @@ -800,6 +800,12 @@ static uint64_t kgd_gfx_v11_hqd_reset(struct amdgpu_device *adev, return 0; } +static uint32_t kgd_gfx_v11_hqd_sdma_get_doorbell(struct amdgpu_device *adev, + int engine, int queue) +{ + return 0; +} + const struct kfd2kgd_calls gfx_v11_kfd2kgd = { .program_sh_mem_settings = program_sh_mem_settings_v11, .set_pasid_vmid_mapping = set_pasid_vmid_mapping_v11, @@ -824,5 +830,6 @@ const struct kfd2kgd_calls gfx_v11_kfd2kgd = { .set_address_watch = kgd_gfx_v11_set_address_watch, .clear_address_watch = kgd_gfx_v11_clear_address_watch, .hqd_get_pq_addr = kgd_gfx_v11_hqd_get_pq_addr, - .hqd_reset = kgd_gfx_v11_hqd_reset + .hqd_reset = kgd_gfx_v11_hqd_reset, + .hqd_sdma_get_doorbell = kgd_gfx_v11_hqd_sdma_get_doorbell }; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v12.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v12.c index 0dfe7093bd8a..6f0dc23c901b 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v12.c +++ 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v12.c @@ -361,6 +361,12 @@ static uint32_t kgd_gfx_v12_clear_address_watch(struct amdgpu_device *adev, return 0; } +static uint32_t kgd_gfx_v12_hqd_sdma_get_doorbell(struct amdgpu_device *adev, + int engine, int queue) +{ + return 0; +} + const struct kfd2kgd_calls gfx_v12_kfd2kgd = { .init_interrupts = init_interrupts_v12, .hqd_dump = hqd_dump_v12, @@ -374,4 +380,5 @@ const struct kfd2kgd_calls gfx_v12_kfd2kgd = { .set_wave_launch_mode = kgd_gfx_v12_set_wave_launch_mode, .set_address_watch = kgd_gfx_v12_set_address_watch, .clear_address_watch = kgd_gfx_v12_clear_address_watch, + .hqd_sdma_get_doorbell = kgd_gfx_v12_hqd_sdma_get_doorbell }; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c index 441568163e20..84135eb90660 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c @@ -1131,9 +1131,6 @@ uint64_t kgd_gfx_v9_hqd_get_pq_addr(struct amdgpu_device *adev, uint32_t low, high; uint64_t queue_addr = 0; - if (!amdgpu_gpu_recovery) - return 0; - kgd_gfx_v9_acquire_queue(adev, pipe_id, queue_id, inst); amdgpu_gfx_rlc_enter_safe_mode(adev, inst); @@ -1182,9 +1179,6 @@ uint64_t kgd_gfx_v9_hqd_reset(struct amdgpu_device *adev, uint32_t low, high, pipe_reset_data = 0; uint64_t queue_addr = 0; - if (!amdgpu_gpu_recovery) - return 0; - kgd_gfx_v9_acquire_queue(adev, pipe_id, queue_id, inst); amdgpu_gfx_rlc_enter_safe_mode(adev, inst); @@ -1229,6 +1223,13 @@ unlock_out: return queue_addr; } +uint32_t kgd_gfx_v9_hqd_sdma_get_doorbell(struct amdgpu_device *adev, + int engine, int queue) + +{ + return 0; +} + const struct kfd2kgd_calls gfx_v9_kfd2kgd = { .program_sh_mem_settings = kgd_gfx_v9_program_sh_mem_settings, .set_pasid_vmid_mapping = kgd_gfx_v9_set_pasid_vmid_mapping, @@ -1258,5 +1259,6 @@ const struct kfd2kgd_calls gfx_v9_kfd2kgd = { .get_cu_occupancy = kgd_gfx_v9_get_cu_occupancy, 
.program_trap_handler_settings = kgd_gfx_v9_program_trap_handler_settings, .hqd_get_pq_addr = kgd_gfx_v9_hqd_get_pq_addr, - .hqd_reset = kgd_gfx_v9_hqd_reset + .hqd_reset = kgd_gfx_v9_hqd_reset, + .hqd_sdma_get_doorbell = kgd_gfx_v9_hqd_sdma_get_doorbell }; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.h index b6a91a552aa4..90c8fa13d519 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.h @@ -111,3 +111,5 @@ uint64_t kgd_gfx_v9_hqd_reset(struct amdgpu_device *adev, uint32_t queue_id, uint32_t inst, unsigned int utimeout); +uint32_t kgd_gfx_v9_hqd_sdma_get_doorbell(struct amdgpu_device *adev, + int engine, int queue); diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c index 91e4988dc1e3..f3f2fd6ee65c 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c @@ -36,6 +36,7 @@ #include "kfd_kernel_queue.h" #include "amdgpu_amdkfd.h" #include "amdgpu_reset.h" +#include "amdgpu_sdma.h" #include "mes_v11_api_def.h" #include "kfd_debug.h" @@ -67,6 +68,8 @@ static int allocate_hqd(struct device_queue_manager *dqm, struct queue *q); static int allocate_sdma_queue(struct device_queue_manager *dqm, struct queue *q, const uint32_t *restore_sdma_id); +static int reset_queues_on_hws_hang(struct device_queue_manager *dqm, bool is_sdma); + static inline enum KFD_MQD_TYPE get_mqd_type_from_queue_type(enum kfd_queue_type type) { @@ -2205,8 +2208,7 @@ static struct queue *find_queue_by_address(struct device_queue_manager *dqm, uin return NULL; } -/* only for compute queue */ -static int reset_queues_on_hws_hang(struct device_queue_manager *dqm) +static int reset_hung_queues(struct device_queue_manager *dqm) { int r = 0, reset_count = 0, i; @@ -2259,6 +2261,104 @@ reset_fail: return r; } +static bool 
sdma_has_hang(struct device_queue_manager *dqm) +{ + int engine_start = dqm->dev->node_id * get_num_all_sdma_engines(dqm); + int engine_end = engine_start + get_num_all_sdma_engines(dqm); + int num_queues_per_eng = dqm->dev->kfd->device_info.num_sdma_queues_per_engine; + int i, j; + + for (i = engine_start; i < engine_end; i++) { + for (j = 0; j < num_queues_per_eng; j++) { + if (!dqm->dev->kfd2kgd->hqd_sdma_get_doorbell(dqm->dev->adev, i, j)) + continue; + + return true; + } + } + + return false; +} + +static bool set_sdma_queue_as_reset(struct device_queue_manager *dqm, + uint32_t doorbell_off) +{ + struct device_process_node *cur; + struct qcm_process_device *qpd; + struct queue *q; + + list_for_each_entry(cur, &dqm->queues, list) { + qpd = cur->qpd; + list_for_each_entry(q, &qpd->queues_list, list) { + if ((q->properties.type == KFD_QUEUE_TYPE_SDMA || + q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI) && + q->properties.doorbell_off == doorbell_off) { + set_queue_as_reset(dqm, q, qpd); + return true; + } + } + } + + return false; +} + +static int reset_hung_queues_sdma(struct device_queue_manager *dqm) +{ + int engine_start = dqm->dev->node_id * get_num_all_sdma_engines(dqm); + int engine_end = engine_start + get_num_all_sdma_engines(dqm); + int num_queues_per_eng = dqm->dev->kfd->device_info.num_sdma_queues_per_engine; + int r = 0, i, j; + + if (dqm->is_hws_hang) + return -EIO; + + /* Scan for hung HW queues and reset engine. */ + dqm->detect_hang_count = 0; + for (i = engine_start; i < engine_end; i++) { + for (j = 0; j < num_queues_per_eng; j++) { + uint32_t doorbell_off = + dqm->dev->kfd2kgd->hqd_sdma_get_doorbell(dqm->dev->adev, i, j); + + if (!doorbell_off) + continue; + + /* Reset engine and check. 
*/ + if (amdgpu_sdma_reset_engine(dqm->dev->adev, i, false) || + dqm->dev->kfd2kgd->hqd_sdma_get_doorbell(dqm->dev->adev, i, j) || + !set_sdma_queue_as_reset(dqm, doorbell_off)) { + r = -ENOTRECOVERABLE; + goto reset_fail; + } + + /* Should only expect one queue active per engine */ + dqm->detect_hang_count++; + break; + } + } + + /* Signal process reset */ + if (dqm->detect_hang_count) + kfd_signal_reset_event(dqm->dev); + else + r = -ENOTRECOVERABLE; + +reset_fail: + dqm->detect_hang_count = 0; + + return r; +} + +static int reset_queues_on_hws_hang(struct device_queue_manager *dqm, bool is_sdma) +{ + while (halt_if_hws_hang) + schedule(); + + if (!amdgpu_gpu_recovery) + return -ENOTRECOVERABLE; + + return is_sdma ? reset_hung_queues_sdma(dqm) : reset_hung_queues(dqm); +} + /* dqm->lock mutex has to be locked before calling this function */ static int unmap_queues_cpsch(struct device_queue_manager *dqm, enum kfd_unmap_queues_filter filter, @@ -2309,16 +2409,13 @@ static int unmap_queues_cpsch(struct device_queue_manager *dqm, * check those fields */ mqd_mgr = dqm->mqd_mgrs[KFD_MQD_TYPE_HIQ]; - if (mqd_mgr->check_preemption_failed(mqd_mgr, dqm->packet_mgr.priv_queue->queue->mqd)) { - while (halt_if_hws_hang) - schedule(); - if (reset_queues_on_hws_hang(dqm)) { - dqm->is_hws_hang = true; - kfd_hws_hang(dqm); - retval = -ETIME; - goto out; - } - } + if (mqd_mgr->check_preemption_failed(mqd_mgr, dqm->packet_mgr.priv_queue->queue->mqd) && + reset_queues_on_hws_hang(dqm, false)) + goto reset_fail; + + /* Check for SDMA hang and attempt SDMA reset */ + if (sdma_has_hang(dqm) && reset_queues_on_hws_hang(dqm, true)) + goto reset_fail; /* We need to reset the grace period value for this device */ if (grace_period != USE_DEFAULT_GRACE_PERIOD) { @@ -2329,10 +2426,15 @@ static int unmap_queues_cpsch(struct device_queue_manager *dqm, pm_release_ib(&dqm->packet_mgr); dqm->active_runlist = false; - out: up_read(&dqm->dev->adev->reset_domain->sem); return retval; + +reset_fail: + 
dqm->is_hws_hang = true; + kfd_hws_hang(dqm); + up_read(&dqm->dev->adev->reset_domain->sem); + return -ETIME; } /* only for compute queue */ diff --git a/drivers/gpu/drm/amd/include/kgd_kfd_interface.h b/drivers/gpu/drm/amd/include/kgd_kfd_interface.h index e3e635a31b8a..1e8dfa6c0dc8 100644 --- a/drivers/gpu/drm/amd/include/kgd_kfd_interface.h +++ b/drivers/gpu/drm/amd/include/kgd_kfd_interface.h @@ -330,6 +330,8 @@ struct kfd2kgd_calls { uint64_t (*hqd_reset)(struct amdgpu_device *adev, uint32_t pipe_id, uint32_t queue_id, uint32_t inst, unsigned int utimeout); + uint32_t (*hqd_sdma_get_doorbell)(struct amdgpu_device *adev, + int engine, int queue); }; #endif /* KGD_KFD_INTERFACE_H_INCLUDED */ -- 2.51.0 From ceb7114c961bd8d8605dfff8e18d1a39d99cdd30 Mon Sep 17 00:00:00 2001 From: Jonathan Kim Date: Wed, 26 Feb 2025 14:22:02 -0500 Subject: [PATCH 03/16] drm/amdkfd: flag per-sdma queue reset supported to user space Similar to compute queue reset, flag SDMA queue reset capabilities to user space for safe testing. 
Signed-off-by: Jonathan Kim Reviewed-by: Harish Kasiviswanathan Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdkfd/kfd_topology.c | 5 +++++ drivers/gpu/drm/amd/amdkfd/kfd_topology.h | 1 + include/uapi/linux/kfd_sysfs.h | 3 +++ 3 files changed, 9 insertions(+) diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_topology.c b/drivers/gpu/drm/amd/amdkfd/kfd_topology.c index dbc5595e999a..27e7356eed6f 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_topology.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_topology.c @@ -519,6 +519,8 @@ static ssize_t node_show(struct kobject *kobj, struct attribute *attr, dev->gpu->kfd->mec_fw_version); sysfs_show_32bit_prop(buffer, offs, "capability", dev->node_props.capability); + sysfs_show_32bit_prop(buffer, offs, "capability2", + dev->node_props.capability2); sysfs_show_64bit_prop(buffer, offs, "debug_prop", dev->node_props.debug_prop); sysfs_show_32bit_prop(buffer, offs, "sdma_fw_version", @@ -1981,6 +1983,9 @@ static void kfd_topology_set_capabilities(struct kfd_topology_device *dev) if (kfd_dbg_has_ttmps_always_setup(dev->gpu)) dev->node_props.debug_prop |= HSA_DBG_DISPATCH_INFO_ALWAYS_VALID; + if (dev->gpu->adev->sdma.supported_reset & AMDGPU_RESET_TYPE_PER_QUEUE) + dev->node_props.capability2 |= HSA_CAP2_PER_SDMA_QUEUE_RESET_SUPPORTED; + if (KFD_GC_VERSION(dev->gpu) < IP_VERSION(10, 0, 0)) { if (KFD_GC_VERSION(dev->gpu) == IP_VERSION(9, 4, 3) || KFD_GC_VERSION(dev->gpu) == IP_VERSION(9, 4, 4)) diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_topology.h b/drivers/gpu/drm/amd/amdkfd/kfd_topology.h index f06c9db7ddde..3de8ec0043bb 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_topology.h +++ b/drivers/gpu/drm/amd/amdkfd/kfd_topology.h @@ -51,6 +51,7 @@ struct kfd_node_properties { uint32_t cpu_core_id_base; uint32_t simd_id_base; uint32_t capability; + uint32_t capability2; uint64_t debug_prop; uint32_t max_waves_per_simd; uint32_t lds_size_in_kb; diff --git a/include/uapi/linux/kfd_sysfs.h b/include/uapi/linux/kfd_sysfs.h index 
859b8e91d4d3..1125fe47959f 100644 --- a/include/uapi/linux/kfd_sysfs.h +++ b/include/uapi/linux/kfd_sysfs.h @@ -63,6 +63,9 @@ #define HSA_CAP_PER_QUEUE_RESET_SUPPORTED 0x80000000 #define HSA_CAP_RESERVED 0x000f8000 +#define HSA_CAP2_PER_SDMA_QUEUE_RESET_SUPPORTED 0x00000001 +#define HSA_CAP2_RESERVED 0xfffffffe + /* debug_prop bits in node properties */ #define HSA_DBG_WATCH_ADDR_MASK_LO_BIT_MASK 0x0000000f #define HSA_DBG_WATCH_ADDR_MASK_LO_BIT_SHIFT 0 -- 2.51.0 From a29936bcd21eea7ac87546e2107313cd0f62c4d7 Mon Sep 17 00:00:00 2001 From: Sathishkumar S Date: Wed, 26 Feb 2025 15:59:47 +0530 Subject: [PATCH 04/16] drm/amdgpu: Fix core reset sequence for JPEG5_0_1 For cores 1 through 9 repair the core reset sequence by adjusting offsets to access the expected registers. Signed-off-by: Sathishkumar S Reviewed-by: Leo Liu Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/jpeg_v5_0_1.c | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/jpeg_v5_0_1.c b/drivers/gpu/drm/amd/amdgpu/jpeg_v5_0_1.c index 56c01d207e20..218e16b68f1d 100644 --- a/drivers/gpu/drm/amd/amdgpu/jpeg_v5_0_1.c +++ b/drivers/gpu/drm/amd/amdgpu/jpeg_v5_0_1.c @@ -672,24 +672,20 @@ static void jpeg_v5_0_1_core_stall_reset(struct amdgpu_ring *ring) WREG32_SOC15_OFFSET(JPEG, jpeg_inst, regUVD_JMI0_UVD_JMI_CLIENT_STALL, reg_offset, 0x1F); - SOC15_WAIT_ON_RREG(JPEG, jpeg_inst, - regUVD_JMI0_UVD_JMI_CLIENT_CLEAN_STATUS, - 0x1F, 0x1F); + SOC15_WAIT_ON_RREG_OFFSET(JPEG, jpeg_inst, + regUVD_JMI0_UVD_JMI_CLIENT_CLEAN_STATUS, + reg_offset, 0x1F, 0x1F); WREG32_SOC15_OFFSET(JPEG, jpeg_inst, regUVD_JMI0_JPEG_LMI_DROP, reg_offset, 0x1F); - WREG32_SOC15_OFFSET(JPEG, jpeg_inst, - regJPEG_CORE_RST_CTRL, - reg_offset, 1 << ring->pipe); + WREG32_SOC15(JPEG, jpeg_inst, regJPEG_CORE_RST_CTRL, 1 << ring->pipe); WREG32_SOC15_OFFSET(JPEG, jpeg_inst, regUVD_JMI0_UVD_JMI_CLIENT_STALL, reg_offset, 0x00); WREG32_SOC15_OFFSET(JPEG, jpeg_inst, regUVD_JMI0_JPEG_LMI_DROP, 
reg_offset, 0x00); - WREG32_SOC15_OFFSET(JPEG, jpeg_inst, - regJPEG_CORE_RST_CTRL, - reg_offset, 0x00); + WREG32_SOC15(JPEG, jpeg_inst, regJPEG_CORE_RST_CTRL, 0x00); } static int jpeg_v5_0_1_ring_reset(struct amdgpu_ring *ring, unsigned int vmid) -- 2.51.0 From 3646cc65e2747ff112d7de1a05a2e756414b771e Mon Sep 17 00:00:00 2001 From: Victor Lu Date: Thu, 13 Feb 2025 18:41:26 -0500 Subject: [PATCH 05/16] drm/amdgpu: Do not write to GRBM_CNTL if Aldebaran SRIOV Aldebaran SRIOV VF does not have write permissions to GRBM_CNTL. This access can be skipped to avoid a dmesg warning. v2: Use GC IP version check instead of asic check Signed-off-by: Victor Lu Acked-by: Alex Deucher Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c index f7938d318f26..1f0f03108a82 100644 --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c @@ -2637,7 +2637,10 @@ static void gfx_v9_0_constants_init(struct amdgpu_device *adev) u32 tmp; int i; - WREG32_FIELD15_RLC(GC, 0, GRBM_CNTL, READ_TIMEOUT, 0xff); + if (!amdgpu_sriov_vf(adev) || + amdgpu_ip_version(adev, GC_HWIP, 0) != IP_VERSION(9, 4, 2)) { + WREG32_FIELD15_RLC(GC, 0, GRBM_CNTL, READ_TIMEOUT, 0xff); + } gfx_v9_0_tiling_mode_table_init(adev); -- 2.51.0 From 571d36837c84707ea36fa37ab1373a124e328ed4 Mon Sep 17 00:00:00 2001 From: Charles Han Date: Wed, 5 Mar 2025 18:40:57 +0800 Subject: [PATCH 06/16] drm/amdgpu: fix inconsistent indenting warning Fix below inconsistent indenting smatch warning. 
smatch warnings: drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c:582 amdgpu_sdma_reset_engine() warn: inconsistent indenting Signed-off-by: Charles Han Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c index 39669f8788a7..3a4cef896018 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c @@ -621,5 +621,5 @@ exit: if (suspend_user_queues) amdgpu_amdkfd_resume(adev, false); - return ret; + return ret; } -- 2.51.0 From 14c8097ba4db1b6e1c28b2ed65186b9199fe9155 Mon Sep 17 00:00:00 2001 From: Jonathan Kim Date: Thu, 27 Feb 2025 12:25:25 -0500 Subject: [PATCH 07/16] drm/amdkfd: remove unused debug gws support status variable Remove unused declaration of gws_debug_workaround. Signed-off-by: Jonathan Kim Reviewed-by: Amber Lin Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdkfd/kfd_priv.h | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h index 966d1c484d9f..bb09c873a9a5 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h @@ -289,7 +289,6 @@ struct kfd_node { /* Global GWS resource shared between processes */ void *gws; - bool gws_debug_workaround; /* Clients watching SMI events */ struct list_head smi_clients; -- 2.51.0 From 94b0908b85524d467a00c6aa2a277ef98fd8b152 Mon Sep 17 00:00:00 2001 From: Victor Lu Date: Thu, 13 Feb 2025 18:49:46 -0500 Subject: [PATCH 08/16] drm/amdgpu: Do not set power brake sequence for Aldebaran SRIOV Aldebaran SRIOV VF cannot access the power brake feature regs. The accesses can be skipped to avoid a dmesg warning. 
v2: Remove redundant asic type check Signed-off-by: Victor Lu Acked-by: Alex Deucher Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c index 1f0f03108a82..d345285ea885 100644 --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c @@ -4045,7 +4045,8 @@ static int gfx_v9_0_hw_init(struct amdgpu_ip_block *ip_block) if (r) return r; - if (amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 2)) + if (amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 2) && + !amdgpu_sriov_vf(adev)) gfx_v9_4_2_set_power_brake_sequence(adev); return r; -- 2.51.0 From fe2fa3be3d59ba67d6de54a0064441ec233cb50c Mon Sep 17 00:00:00 2001 From: Emily Deng Date: Mon, 3 Mar 2025 15:10:22 +0800 Subject: [PATCH 09/16] drm/amdgpu: Fix missing drain retry fault the last entry When the entry obtained in svm_range_unmap_from_cpu is the last entry, and that entry is a page fault, it also needs to be dropped. So in the equal case, the fault also needs to be dropped. v2: Only modify the svm_range_restore_pages. 
Signed-off-by: Emily Deng Reviewed-by: Xiaogang Chen Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_ih.h | 3 +++ drivers/gpu/drm/amd/amdkfd/kfd_svm.c | 2 +- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.h index 7d4395a5d8ac..b0a88f92cd82 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.h @@ -78,6 +78,9 @@ struct amdgpu_ih_ring { #define amdgpu_ih_ts_after(t1, t2) \ (((int64_t)((t2) << 16) - (int64_t)((t1) << 16)) > 0LL) +#define amdgpu_ih_ts_after_or_equal(t1, t2) \ + (((int64_t)((t2) << 16) - (int64_t)((t1) << 16)) >= 0LL) + /* provided by the ih block */ struct amdgpu_ih_funcs { /* ring read/write ptr handling, called from interrupt context */ diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c index db3034b00dac..1a38ac75abbd 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c @@ -3011,7 +3011,7 @@ svm_range_restore_pages(struct amdgpu_device *adev, unsigned int pasid, /* check if this page fault time stamp is before svms->checkpoint_ts */ if (svms->checkpoint_ts[gpuidx] != 0) { - if (amdgpu_ih_ts_after(ts, svms->checkpoint_ts[gpuidx])) { + if (amdgpu_ih_ts_after_or_equal(ts, svms->checkpoint_ts[gpuidx])) { pr_debug("draining retry fault, drop fault 0x%llx\n", addr); r = 0; goto out; -- 2.51.0 From 334dc5fcc3f177823115ec4e075259997c16d4a7 Mon Sep 17 00:00:00 2001 From: Tao Zhou Date: Thu, 6 Mar 2025 11:36:49 +0800 Subject: [PATCH 10/16] drm/amdgpu: increase RAS bad page threshold For default policy, driver will issue an RMA event when the number of bad pages is greater than 8 physical rows, rather than reaches 8 physical rows, don't rely on threshold configurable parameters in default mode. 
Signed-off-by: Tao Zhou Reviewed-by: Hawking Zhang Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c index ab27cecb5519..09a6f8bc1a5a 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c @@ -747,7 +747,7 @@ amdgpu_ras_eeprom_update_header(struct amdgpu_ras_eeprom_control *control) /* Modify the header if it exceeds. */ if (amdgpu_bad_page_threshold != 0 && - control->ras_num_bad_pages >= ras->bad_page_cnt_threshold) { + control->ras_num_bad_pages > ras->bad_page_cnt_threshold) { dev_warn(adev->dev, "Saved bad pages %d reaches threshold value %d\n", control->ras_num_bad_pages, ras->bad_page_cnt_threshold); @@ -806,7 +806,7 @@ amdgpu_ras_eeprom_update_header(struct amdgpu_ras_eeprom_control *control) */ if (amdgpu_bad_page_threshold != 0 && control->tbl_hdr.version == RAS_TABLE_VER_V2_1 && - control->ras_num_bad_pages < ras->bad_page_cnt_threshold) + control->ras_num_bad_pages <= ras->bad_page_cnt_threshold) control->tbl_rai.health_percent = ((ras->bad_page_cnt_threshold - control->ras_num_bad_pages) * 100) / ras->bad_page_cnt_threshold; @@ -1456,7 +1456,7 @@ int amdgpu_ras_eeprom_check(struct amdgpu_ras_eeprom_control *control) res); return -EINVAL; } - if (ras->bad_page_cnt_threshold > control->ras_num_bad_pages) { + if (ras->bad_page_cnt_threshold >= control->ras_num_bad_pages) { /* This means that, the threshold was increased since * the last time the system was booted, and now, * ras->bad_page_cnt_threshold - control->num_recs > 0, -- 2.51.0 From 3bc7bc73af7d167e564eb09ed17af0eed24b5110 Mon Sep 17 00:00:00 2001 From: Shiwu Zhang Date: Mon, 3 Mar 2025 21:03:03 +0800 Subject: [PATCH 11/16] drm/amdgpu: retire ip init code specific for A0 rev For aqua_vanjaram, A0 HW is retired so remove the code specific for it 
in gfx ip init. Signed-off-by: Shiwu Zhang Reviewed-by: Lijo Lazar Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c | 13 +------------ 1 file changed, 1 insertion(+), 12 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c b/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c index 87add6274b98..b276a16a8121 100644 --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c @@ -349,18 +349,7 @@ static void gfx_v9_4_3_init_golden_registers(struct amdgpu_device *adev) WREG32_SOC15(GC, dev_inst, regGB_ADDR_CONFIG, GOLDEN_GB_ADDR_CONFIG); - if (amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 5, 0)) { - WREG32_FIELD15_PREREG(GC, dev_inst, TCP_UTCL1_CNTL2, SPARE, 0x1); - } else { - /* Golden settings applied by driver for ASIC with rev_id 0 */ - if (adev->rev_id == 0) { - WREG32_FIELD15_PREREG(GC, dev_inst, TCP_UTCL1_CNTL1, - REDUCE_FIFO_DEPTH_BY_2, 2); - } else { - WREG32_FIELD15_PREREG(GC, dev_inst, TCP_UTCL1_CNTL2, - SPARE, 0x1); - } - } + WREG32_FIELD15_PREREG(GC, dev_inst, TCP_UTCL1_CNTL2, SPARE, 0x1); } } -- 2.51.0 From 216be476f14a8a129f1e3210d3c97b9a94942fea Mon Sep 17 00:00:00 2001 From: Shiwu Zhang Date: Tue, 4 Mar 2025 11:13:48 +0800 Subject: [PATCH 12/16] drm/amdgpu: fix the gb_addr_config_fields init value mismatch For gfx_v9_4_3 specifically, before regGB_ADDR_CONFIG is overwritten in gfx hw_init it is read out to populate the gb_addr_config_fields in the sw_init stage, which causes a mismatch. Fix it by using the golden value in sw_init as well. 
v2: This is a driver-set golden reg and keep as it is (Lijo) Signed-off-by: Shiwu Zhang Reviewed-by: Lijo Lazar Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c b/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c index b276a16a8121..476542b6e7b5 100644 --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c @@ -916,8 +916,6 @@ static const struct aca_info gfx_v9_4_3_aca_info = { static int gfx_v9_4_3_gpu_early_init(struct amdgpu_device *adev) { - u32 gb_addr_config; - adev->gfx.funcs = &gfx_v9_4_3_gfx_funcs; adev->gfx.ras = &gfx_v9_4_3_ras; @@ -926,9 +924,7 @@ static int gfx_v9_4_3_gpu_early_init(struct amdgpu_device *adev) adev->gfx.config.sc_prim_fifo_size_backend = 0x100; adev->gfx.config.sc_hiz_tile_fifo_size = 0x30; adev->gfx.config.sc_earlyz_tile_fifo_size = 0x4C0; - gb_addr_config = RREG32_SOC15(GC, GET_INST(GC, 0), regGB_ADDR_CONFIG); - - adev->gfx.config.gb_addr_config = gb_addr_config; + adev->gfx.config.gb_addr_config = GOLDEN_GB_ADDR_CONFIG; adev->gfx.config.gb_addr_config_fields.num_pipes = 1 << REG_GET_FIELD( -- 2.51.0 From 148084bbb1e5131b3f1200c72c2b60d85e73aa75 Mon Sep 17 00:00:00 2001 From: Xiang Liu Date: Thu, 6 Mar 2025 15:23:34 +0800 Subject: [PATCH 13/16] drm/amdgpu: Use unique CPER record id across devices Encode socket id to CPER record id to be unique across devices. 
v2: add pointer check for adev->smuio.funcs->get_socket_id v2: set 0 if adev->smuio.funcs->get_socket_id is NULL Signed-off-by: Xiang Liu Reviewed-by: Tao Zhou Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_cper.c | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cper.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_cper.c index 0415ed222342..3f291b30b79f 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cper.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cper.c @@ -57,6 +57,8 @@ void amdgpu_cper_entry_fill_hdr(struct amdgpu_device *adev, enum amdgpu_cper_type type, enum cper_error_severity sev) { + char record_id[16]; + hdr->signature[0] = 'C'; hdr->signature[1] = 'P'; hdr->signature[2] = 'E'; @@ -71,7 +73,13 @@ void amdgpu_cper_entry_fill_hdr(struct amdgpu_device *adev, amdgpu_cper_get_timestamp(&hdr->timestamp); - snprintf(hdr->record_id, 8, "%d", atomic_inc_return(&adev->cper.unique_id)); + snprintf(record_id, 9, "%d:%X", + (adev->smuio.funcs && adev->smuio.funcs->get_socket_id) ? + adev->smuio.funcs->get_socket_id(adev) : + 0, + atomic_inc_return(&adev->cper.unique_id)); + memcpy(hdr->record_id, record_id, 8); + snprintf(hdr->platform_id, 16, "0x%04X:0x%04X", adev->pdev->vendor, adev->pdev->device); /* pmfw version should be part of creator_id according to CPER spec */ @@ -117,10 +125,10 @@ static int amdgpu_cper_entry_fill_section_desc(struct amdgpu_device *adev, section_desc->severity = sev; section_desc->sec_type = sec_type; - if (adev->smuio.funcs && - adev->smuio.funcs->get_socket_id) - snprintf(section_desc->fru_text, 20, "OAM%d", - adev->smuio.funcs->get_socket_id(adev)); + snprintf(section_desc->fru_text, 20, "OAM%d", + (adev->smuio.funcs && adev->smuio.funcs->get_socket_id) ? 
+ adev->smuio.funcs->get_socket_id(adev) : + 0); if (bp_threshold) section_desc->flag_bits.exceed_err_threshold = 1; -- 2.51.0 From ba795235a2b99ba9bbef647ab003b2f3145d9bbb Mon Sep 17 00:00:00 2001 From: David Rosca Date: Thu, 13 Feb 2025 15:30:37 +0100 Subject: [PATCH 14/16] drm/amdgpu/display: Allow DCC for video formats on GFX12 We advertise DCC as supported for NV12/P010 formats on GFX12, but it would fail on this check on atomic commit. Signed-off-by: David Rosca Reviewed-by: Ruijing Dong Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_plane.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_plane.c b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_plane.c index dcf2b98566ea..e1c1e71ac899 100644 --- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_plane.c +++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_plane.c @@ -277,8 +277,11 @@ static int amdgpu_dm_plane_validate_dcc(struct amdgpu_device *adev, if (!dcc->enable) return 0; - if (format >= SURFACE_PIXEL_FORMAT_VIDEO_BEGIN || - !dc->cap_funcs.get_dcc_compression_cap) + if (adev->family < AMDGPU_FAMILY_GC_12_0_0 && + format >= SURFACE_PIXEL_FORMAT_VIDEO_BEGIN) + return -EINVAL; + + if (!dc->cap_funcs.get_dcc_compression_cap) return -EINVAL; input.format = format; -- 2.51.0 From bd4b125eb949785c6f8a53b0494e32795421209d Mon Sep 17 00:00:00 2001 From: Aliaksei Urbanski Date: Thu, 6 Mar 2025 13:36:03 +0300 Subject: [PATCH 15/16] drm/amd/display: fix missing .is_two_pixels_per_container Starting from 6.11, AMDGPU driver, while being loaded with amdgpu.dc=1, due to lack of .is_two_pixels_per_container function in dce60_tg_funcs, causes a NULL pointer dereference on PCs with old GPUs, such as R9 280X. So this fix adds missing .is_two_pixels_per_container to dce60_tg_funcs. 
Reported-by: Rosen Penev Closes: https://gitlab.freedesktop.org/drm/amd/-/issues/3942 Fixes: e6a901a00822 ("drm/amd/display: use even ODM slice width for two pixels per container") Signed-off-by: Aliaksei Urbanski Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/display/dc/dce60/dce60_timing_generator.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/gpu/drm/amd/display/dc/dce60/dce60_timing_generator.c b/drivers/gpu/drm/amd/display/dc/dce60/dce60_timing_generator.c index e5fb0e8333e4..e691a1cf3356 100644 --- a/drivers/gpu/drm/amd/display/dc/dce60/dce60_timing_generator.c +++ b/drivers/gpu/drm/amd/display/dc/dce60/dce60_timing_generator.c @@ -239,6 +239,7 @@ static const struct timing_generator_funcs dce60_tg_funcs = { dce60_timing_generator_enable_advanced_request, .configure_crc = dce60_configure_crc, .get_crc = dce110_get_crc, + .is_two_pixels_per_container = dce110_is_two_pixels_per_container, }; void dce60_timing_generator_construct( -- 2.51.0 From 1435e895d4fc967d64e9f5bf81e992ac32f5ac76 Mon Sep 17 00:00:00 2001 From: Wentao Liang Date: Thu, 6 Mar 2025 15:51:48 +0800 Subject: [PATCH 16/16] drm/amdgpu: handle amdgpu_cgs_create_device() errors in amd_powerplay_create() Add error handling to propagate amdgpu_cgs_create_device() failures to the caller. When amdgpu_cgs_create_device() fails, release hwmgr and return -ENOMEM to prevent null pointer dereference. [v1]->[v2]: Change error code from -EINVAL to -ENOMEM. Free hwmgr. 
Signed-off-by: Wentao Liang Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/pm/powerplay/amd_powerplay.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/drivers/gpu/drm/amd/pm/powerplay/amd_powerplay.c b/drivers/gpu/drm/amd/pm/powerplay/amd_powerplay.c index be22ed30a3c1..b48a031cbba0 100644 --- a/drivers/gpu/drm/amd/pm/powerplay/amd_powerplay.c +++ b/drivers/gpu/drm/amd/pm/powerplay/amd_powerplay.c @@ -51,6 +51,11 @@ static int amd_powerplay_create(struct amdgpu_device *adev) hwmgr->adev = adev; hwmgr->not_vf = !amdgpu_sriov_vf(adev); hwmgr->device = amdgpu_cgs_create_device(adev); + if (!hwmgr->device) { + kfree(hwmgr); + return -ENOMEM; + } + mutex_init(&hwmgr->msg_lock); hwmgr->chip_family = adev->family; hwmgr->chip_id = adev->asic_type; -- 2.51.0