From d01ca8708d95a561f6462a15cad94a2c0bec7042 Mon Sep 17 00:00:00 2001 From: Harry Wentland Date: Mon, 28 Apr 2025 16:26:20 -0400 Subject: [PATCH 01/16] drm/amd/display: Don't check for NULL divisor in fixpt code [Why] We check for a NULL divisor but don't act on it. This check does nothing other than throw a warning. It does confuse static checkers though: See https://lkml.org/lkml/2025/4/26/371 [How] Drop the ASSERTs in both DC and SPL variants. Fixes: 4562236b3bc0 ("drm/amd/dc: Add dc display driver (v2)") Fixes: 6efc0ab3b05d ("drm/amd/display: add back quality EASF and ISHARP and dc dependency changes") Signed-off-by: Harry Wentland Cc: Linus Torvalds Cc: Leo Li Cc: Alex Deucher Acked-by: Alex Deucher Reviewed-by: Alex Hung Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/display/dc/basics/fixpt31_32.c | 5 ----- drivers/gpu/drm/amd/display/dc/sspl/spl_fixpt31_32.c | 4 ---- 2 files changed, 9 deletions(-) diff --git a/drivers/gpu/drm/amd/display/dc/basics/fixpt31_32.c b/drivers/gpu/drm/amd/display/dc/basics/fixpt31_32.c index 88d3f9d7dd55..452206b5095e 100644 --- a/drivers/gpu/drm/amd/display/dc/basics/fixpt31_32.c +++ b/drivers/gpu/drm/amd/display/dc/basics/fixpt31_32.c @@ -51,8 +51,6 @@ static inline unsigned long long complete_integer_division_u64( { unsigned long long result; - ASSERT(divisor); - result = div64_u64_rem(dividend, divisor, remainder); return result; @@ -213,9 +211,6 @@ struct fixed31_32 dc_fixpt_recip(struct fixed31_32 arg) * @note * Good idea to use Newton's method */ - - ASSERT(arg.value); - return dc_fixpt_from_fraction( dc_fixpt_one.value, arg.value); diff --git a/drivers/gpu/drm/amd/display/dc/sspl/spl_fixpt31_32.c b/drivers/gpu/drm/amd/display/dc/sspl/spl_fixpt31_32.c index 52d97918a3bd..ebf0287417e0 100644 --- a/drivers/gpu/drm/amd/display/dc/sspl/spl_fixpt31_32.c +++ b/drivers/gpu/drm/amd/display/dc/sspl/spl_fixpt31_32.c @@ -29,8 +29,6 @@ static inline unsigned long long spl_complete_integer_division_u64( { unsigned long long result; - SPL_ASSERT(divisor); - result = spl_div64_u64_rem(dividend, divisor, remainder); return result; @@ -196,8 +194,6 @@ struct spl_fixed31_32 spl_fixpt_recip(struct spl_fixed31_32 arg) * Good idea to use Newton's method */ - SPL_ASSERT(arg.value); - return spl_fixpt_from_fraction( spl_fixpt_one.value, arg.value); -- 2.51.0 From f0be138691d9294760b59e034a9594bc23c2884e Mon Sep 17 00:00:00 2001 From: Eric Huang Date: Fri, 2 May 2025 14:58:02 -0400 Subject: [PATCH 02/16] drm/amdkfd: change error to warning message for SDMA queues creation SDMA doesn't support oversubsciption, it is the user matter to create queues over HW limit, but not supposed to be a KFD error. Signed-off-by: Eric Huang Reviewed-by: Harish Kasiviswanathan Signed-off-by: Alex Deucher --- .../gpu/drm/amd/amdkfd/kfd_device_queue_manager.c | 14 ++++++++------ .../gpu/drm/amd/amdkfd/kfd_process_queue_manager.c | 11 +++++++++-- 2 files changed, 17 insertions(+), 8 deletions(-) diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c index c610e172a2b8..76359c6a3f3a 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c @@ -1576,8 +1576,9 @@ static int allocate_sdma_queue(struct device_queue_manager *dqm, int bit; if (q->properties.type == KFD_QUEUE_TYPE_SDMA) { - if (bitmap_empty(dqm->sdma_bitmap, KFD_MAX_SDMA_QUEUES)) { - dev_err(dev, "No more SDMA queue to allocate\n"); + if (bitmap_empty(dqm->sdma_bitmap, get_num_sdma_queues(dqm))) { + dev_warn(dev, "No more SDMA queue to allocate (%d total queues)\n", + get_num_sdma_queues(dqm)); return -ENOMEM; } @@ -1602,8 +1603,9 @@ static int allocate_sdma_queue(struct device_queue_manager *dqm, q->properties.sdma_queue_id = q->sdma_id / kfd_get_num_sdma_engines(dqm->dev); } else if (q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI) { - if (bitmap_empty(dqm->xgmi_sdma_bitmap, KFD_MAX_SDMA_QUEUES)) { - dev_err(dev, "No more XGMI SDMA queue to allocate\n"); + if (bitmap_empty(dqm->xgmi_sdma_bitmap, get_num_xgmi_sdma_queues(dqm))) { + dev_warn(dev, "No more XGMI SDMA queue to allocate (%d total queues)\n", + get_num_xgmi_sdma_queues(dqm)); return -ENOMEM; } if (restore_sdma_id) { @@ -1662,8 +1664,8 @@ static int allocate_sdma_queue(struct device_queue_manager *dqm, } if (!free_bit_found) { - dev_err(dev, "No more SDMA queue to allocate for target ID %i\n", - q->properties.sdma_engine_id); + dev_warn(dev, "No more SDMA queue to allocate for target ID %i (%d total queues)\n", + q->properties.sdma_engine_id, num_queues); return -ENOMEM; } } diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c index 7eb370b68159..6d5fa57d4a23 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c @@ -451,8 +451,15 @@ int pqm_create_queue(struct process_queue_manager *pqm, } if (retval != 0) { - pr_err("process pid %d DQM create queue type %d failed. ret %d\n", - pqm->process->lead_thread->pid, type, retval); + if ((type == KFD_QUEUE_TYPE_SDMA || + type == KFD_QUEUE_TYPE_SDMA_XGMI || + type == KFD_QUEUE_TYPE_SDMA_BY_ENG_ID) && + retval == -ENOMEM) + pr_warn("process pid %d DQM create queue type %d failed. ret %d\n", + pqm->process->lead_thread->pid, type, retval); + else + pr_err("process pid %d DQM create queue type %d failed. ret %d\n", + pqm->process->lead_thread->pid, type, retval); goto err_create_queue; } -- 2.51.0 From c8305c6327b4c2ef812a6eca544d8e7c5e8a0b30 Mon Sep 17 00:00:00 2001 From: Rodrigo Siqueira Date: Sat, 3 May 2025 14:38:42 -0600 Subject: [PATCH 03/16] drm/amdgpu: Add documentation to some parts of the AMDGPU ring and wb Add some random documentation associated with the ring buffer manipulations and writeback. Signed-off-by: Rodrigo Siqueira Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu.h | 52 ++++++++++++++++- drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h | 74 ++++++++++++++++++++++++ 2 files changed, 125 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h index 7e5ae8f1f0a9..de57ab032cc7 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h @@ -520,12 +520,62 @@ int amdgpu_file_to_fpriv(struct file *filp, struct amdgpu_fpriv **fpriv); */ #define AMDGPU_MAX_WB 1024 /* Reserve at most 1024 WB slots for amdgpu-owned rings. */ +/** + * amdgpu_wb - This struct is used for small GPU memory allocation. + * + * This struct is used to allocate a small amount of GPU memory that can be + * used to shadow certain states into the memory. This is especially useful for + * providing easy CPU access to some states without requiring register access + * (e.g., if some block is power gated, reading register may be problematic). + * + * Note: the term writeback was initially used because many of the amdgpu + * components had some level of writeback memory, and this struct initially + * described those components. + */ struct amdgpu_wb { + + /** + * @wb_obj: + * + * Buffer Object used for the writeback memory. + */ struct amdgpu_bo *wb_obj; + + /** + * @wb: + * + * Pointer to the first writeback slot. In terms of CPU address + * this value can be accessed directly by using the offset as an index. + * For the GPU address, it is necessary to use gpu_addr and the offset. + */ volatile uint32_t *wb; + + /** + * @gpu_addr: + * + * Writeback base address in the GPU. + */ uint64_t gpu_addr; - u32 num_wb; /* Number of wb slots actually reserved for amdgpu. */ + + /** + * @num_wb: + * + * Number of writeback slots reserved for amdgpu. + */ + u32 num_wb; + + /** + * @used: + * + * Track the writeback slot already used. + */ unsigned long used[DIV_ROUND_UP(AMDGPU_MAX_WB, BITS_PER_LONG)]; + + /** + * @lock: + * + * Protects read and write of the used field array. + */ spinlock_t lock; }; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h index ec4de8df34e7..b95b47110769 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h @@ -164,8 +164,24 @@ void amdgpu_fence_update_start_timestamp(struct amdgpu_ring *ring, uint32_t seq, /* provided by hw blocks that expose a ring buffer for commands */ struct amdgpu_ring_funcs { + /** + * @type: + * + * GFX, Compute, SDMA, UVD, VCE, VCN, VPE, KIQ, MES, UMSCH, and CPER + * use ring buffers. The type field just identifies which component the + * ring buffer is associated with. + */ enum amdgpu_ring_type type; uint32_t align_mask; + + /** + * @nop: + * + * Every block in the amdgpu has no-op instructions (e.g., GFX 10 + * uses PACKET3(PACKET3_NOP, 0x3FFF), VCN 5 uses VCN_ENC_CMD_NO_OP, + * etc). This field receives the specific no-op for the component + * that initializes the ring. + */ u32 nop; bool support_64bit_ptrs; bool no_user_fence; @@ -241,6 +257,9 @@ struct amdgpu_ring_funcs { bool (*is_guilty)(struct amdgpu_ring *ring); }; +/** + * amdgpu_ring - Holds ring information + */ struct amdgpu_ring { struct amdgpu_device *adev; const struct amdgpu_ring_funcs *funcs; @@ -252,13 +271,61 @@ struct amdgpu_ring { unsigned rptr_offs; u64 rptr_gpu_addr; volatile u32 *rptr_cpu_addr; + + /** + * @wptr: + * + * This is part of the Ring buffer implementation and represents the + * write pointer. The wptr determines where the host has written. + */ u64 wptr; + + /** + * @wptr_old: + * + * Before update wptr with the new value, usually the old value is + * stored in the wptr_old. + */ u64 wptr_old; unsigned ring_size; + + /** + * @max_dw: + * + * Maximum number of DWords for ring allocation. This information is + * provided at the ring initialization time, and each IP block can + * specify a specific value. Check places that invoke + * amdgpu_ring_init() to see the maximum size per block. + */ unsigned max_dw; + + /** + * @count_dw: + * + * This value starts with the maximum amount of DWords supported by the + * ring. This value is updated based on the ring manipulation. + */ int count_dw; uint64_t gpu_addr; + + /** + * @ptr_mask: + * + * Some IPs provide support for 64-bit pointers and others for 32-bit + * only; this behavior is component-specific and defined by the field + * support_64bit_ptr. If the IP block supports 64-bits, the mask + * 0xffffffffffffffff is set; otherwise, this value assumes buf_mask. + * Notice that this field is used to keep wptr under a valid range. + */ uint64_t ptr_mask; + + /** + * @buf_mask: + * + * Buffer mask is a value used to keep wptr count under its + * thresholding. Buffer mask initialized during the ring buffer + * initialization time, and it is defined as (ring_size / 4) -1. + */ uint32_t buf_mask; u32 idx; u32 xcc_id; @@ -276,6 +343,13 @@ struct amdgpu_ring { bool use_pollmem; unsigned wptr_offs; u64 wptr_gpu_addr; + + /** + * @wptr_cpu_addr: + * + * This is the CPU address pointer in the writeback slot. This is used + * to commit changes to the GPU. + */ volatile u32 *wptr_cpu_addr; unsigned fence_offs; u64 fence_gpu_addr; -- 2.51.0 From dd3d035a78384f7389020810ac2882de50efe934 Mon Sep 17 00:00:00 2001 From: Rodrigo Siqueira Date: Sat, 3 May 2025 14:38:43 -0600 Subject: [PATCH 04/16] Documentation/gpu: Add new entries to amdgpu glossary Add some additional entries. Signed-off-by: Rodrigo Siqueira Signed-off-by: Alex Deucher --- Documentation/gpu/amdgpu/amdgpu-glossary.rst | 33 ++++++++++++++++++++ 1 file changed, 33 insertions(+) diff --git a/Documentation/gpu/amdgpu/amdgpu-glossary.rst b/Documentation/gpu/amdgpu/amdgpu-glossary.rst index 8e6af7cc76c2..2040da593b14 100644 --- a/Documentation/gpu/amdgpu/amdgpu-glossary.rst +++ b/Documentation/gpu/amdgpu/amdgpu-glossary.rst @@ -24,6 +24,9 @@ we have a dedicated glossary for Display Core at CIK Sea Islands + CB + Color Buffer + CP Command Processor @@ -39,6 +42,9 @@ we have a dedicated glossary for Display Core at CU Compute Unit + DB + Depth Buffer + DFS Digital Frequency Synthesizer @@ -63,6 +69,12 @@ we have a dedicated glossary for Display Core at GC Graphics and Compute + GDS + Global Data Share + + GE + Geometry Engine + GMC Graphic Memory Controller @@ -128,6 +140,9 @@ we have a dedicated glossary for Display Core at MQD Memory Queue Descriptor + PA + Primitive Assembler / Physical Address + PFP Pre-Fetch Parser (Graphics) @@ -146,6 +161,9 @@ we have a dedicated glossary for Display Core at the GFX block. It's involved in GFX power management and SR-IOV, among other things. + SC + Scan Converter + SDMA System DMA @@ -164,6 +182,9 @@ we have a dedicated glossary for Display Core at SMU/SMC System Management Unit / System Management Controller + SPI + Shader Processor Input + SRLC Save/Restore List Control @@ -176,6 +197,9 @@ we have a dedicated glossary for Display Core at SS Spread Spectrum + SX + Shader Export + TA Trusted Application @@ -185,6 +209,9 @@ we have a dedicated glossary for Display Core at TOC Table of Contents + UMSCH + User Mode Scheduler + UVD Unified Video Decoder @@ -202,3 +229,9 @@ we have a dedicated glossary for Display Core at VPE Video Processing Engine + + XCC + Accelerator Core Complex + + XCP + Accelerator Core Partition -- 2.51.0 From 6615f1ad34d379c3ddb67f924702bbbcc602507b Mon Sep 17 00:00:00 2001 From: Ellen Pan Date: Tue, 29 Apr 2025 16:22:53 -0400 Subject: [PATCH 05/16] drm/amdgpu: Add Runtime Bad Page message definitions for VFs Currently VFs rely on poison consumption interrupt from HW to kick off the bad page retirement process. Part of this process includes a VF reset. This patch adds the following: 1) Host Bad Pages notification message. 2) Guest request bad pages message. When combined, VFs are able to reserve the pages early, and potentially avoid future poison consumption that will disrupt user services from consequent FLR. Reviewed-by: Shravan Kumar Gande Signed-off-by: Victor Skvortsov Signed-off-by: Ellen Pan Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgv_sriovmsg.h | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgv_sriovmsg.h b/drivers/gpu/drm/amd/amdgpu/amdgv_sriovmsg.h index bea724981309..3b0c55f67fe4 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgv_sriovmsg.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgv_sriovmsg.h @@ -331,6 +331,7 @@ enum amd_sriov_mailbox_request_message { MB_REQ_MSG_RAS_POISON = 202, MB_REQ_RAS_ERROR_COUNT = 203, MB_REQ_RAS_CPER_DUMP = 204, + MB_REQ_RAS_BAD_PAGES = 205, }; /* mailbox message send from host to guest */ @@ -348,6 +349,8 @@ enum amd_sriov_mailbox_response_message { MB_RES_MSG_GPU_RMA = 10, MB_RES_MSG_RAS_ERROR_COUNT_READY = 11, MB_REQ_RAS_CPER_DUMP_READY = 14, + MB_RES_MSG_RAS_BAD_PAGES_READY = 15, + MB_RES_MSG_RAS_BAD_PAGES_NOTIFICATION = 16, MB_RES_MSG_TEXT_MESSAGE = 255 }; -- 2.51.0 From 5da3d8820dd3202d5ae871050e45facb2787235d Mon Sep 17 00:00:00 2001 From: Ellen Pan Date: Tue, 29 Apr 2025 16:48:09 -0400 Subject: [PATCH 06/16] drm/amdgpu: Implement Runtime Bad Page query for VFs Host will send a notification when new bad pages are available. Uopn guest request, the first 256 bad page addresses will be placed into the PF2VF region. Guest should pause the PF2VF worker thread while the copy is in progress. Reviewed-by: Shravan Kumar Gande Signed-off-by: Victor Skvortsov Signed-off-by: Ellen Pan Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c | 13 +++++++ drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h | 5 +++ drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c | 46 ++++++++++++++++-------- drivers/gpu/drm/amd/amdgpu/mxgpu_ai.h | 3 ++ drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c | 28 +++++++++++++++ drivers/gpu/drm/amd/amdgpu/mxgpu_nv.h | 3 ++ 6 files changed, 84 insertions(+), 14 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c index 83f3334b3931..13f0cdeb59c4 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c @@ -1488,3 +1488,16 @@ bool amdgpu_virt_ras_telemetry_block_en(struct amdgpu_device *adev, return true; } + +/* + * amdgpu_virt_request_bad_pages() - request bad pages + * @adev: amdgpu device. + * Send command to GPU hypervisor to write new bad pages into the shared PF2VF region + */ +void amdgpu_virt_request_bad_pages(struct amdgpu_device *adev) +{ + struct amdgpu_virt *virt = &adev->virt; + + if (virt->ops && virt->ops->req_bad_pages) + virt->ops->req_bad_pages(adev); +} diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h index b6ec6b7969f0..577c6194db78 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h @@ -97,6 +97,7 @@ struct amdgpu_virt_ops { bool (*rcvd_ras_intr)(struct amdgpu_device *adev); int (*req_ras_err_count)(struct amdgpu_device *adev); int (*req_ras_cper_dump)(struct amdgpu_device *adev, u64 vf_rptr); + int (*req_bad_pages)(struct amdgpu_device *adev); }; /* @@ -262,7 +263,10 @@ struct amdgpu_virt { uint32_t reg_val_offs; struct amdgpu_irq_src ack_irq; struct amdgpu_irq_src rcv_irq; + struct work_struct flr_work; + struct work_struct bad_pages_work; + struct amdgpu_mm_table mm_table; const struct amdgpu_virt_ops *ops; struct amdgpu_vf_error_buffer vf_errors; @@ -429,4 +433,5 @@ int amdgpu_virt_req_ras_cper_dump(struct amdgpu_device *adev, bool force_update) int amdgpu_virt_ras_telemetry_post_reset(struct amdgpu_device *adev); bool amdgpu_virt_ras_telemetry_block_en(struct amdgpu_device *adev, enum amdgpu_ras_block block); +void amdgpu_virt_request_bad_pages(struct amdgpu_device *adev); #endif diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c index f5411b798e11..f2a74aa76b56 100644 --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c @@ -274,6 +274,7 @@ static void xgpu_ai_mailbox_flr_work(struct work_struct *work) { struct amdgpu_virt *virt = container_of(work, struct amdgpu_virt, flr_work); struct amdgpu_device *adev = container_of(virt, struct amdgpu_device, virt); + struct amdgpu_reset_context reset_context = { 0 }; amdgpu_virt_fini_data_exchange(adev); @@ -281,8 +282,6 @@ static void xgpu_ai_mailbox_flr_work(struct work_struct *work) if (amdgpu_device_should_recover_gpu(adev) && (!amdgpu_device_has_job_running(adev) || adev->sdma_timeout == MAX_SCHEDULE_TIMEOUT)) { - struct amdgpu_reset_context reset_context; - memset(&reset_context, 0, sizeof(reset_context)); reset_context.method = AMD_RESET_METHOD_NONE; reset_context.reset_req_dev = adev; @@ -293,6 +292,19 @@ static void xgpu_ai_mailbox_flr_work(struct work_struct *work) } } +static void xgpu_ai_mailbox_bad_pages_work(struct work_struct *work) +{ + struct amdgpu_virt *virt = container_of(work, struct amdgpu_virt, bad_pages_work); + struct amdgpu_device *adev = container_of(virt, struct amdgpu_device, virt); + + if (down_read_trylock(&adev->reset_domain->sem)) { + amdgpu_virt_fini_data_exchange(adev); + amdgpu_virt_request_bad_pages(adev); + amdgpu_virt_init_data_exchange(adev); + up_read(&adev->reset_domain->sem); + } +} + static int xgpu_ai_set_mailbox_rcv_irq(struct amdgpu_device *adev, struct amdgpu_irq_src *src, unsigned type, @@ -314,24 +326,29 @@ static int xgpu_ai_mailbox_rcv_irq(struct amdgpu_device *adev, enum idh_event event = xgpu_ai_mailbox_peek_msg(adev); switch (event) { - case IDH_FLR_NOTIFICATION: + case IDH_RAS_BAD_PAGES_NOTIFICATION: + xgpu_ai_mailbox_send_ack(adev); + if (amdgpu_sriov_runtime(adev)) + schedule_work(&adev->virt.bad_pages_work); + break; + case IDH_FLR_NOTIFICATION: if (amdgpu_sriov_runtime(adev)) WARN_ONCE(!amdgpu_reset_domain_schedule(adev->reset_domain, &adev->virt.flr_work), "Failed to queue work! at %s", __func__); break; - case IDH_QUERY_ALIVE: - xgpu_ai_mailbox_send_ack(adev); - break; - /* READY_TO_ACCESS_GPU is fetched by kernel polling, IRQ can ignore - * it byfar since that polling thread will handle it, - * other msg like flr complete is not handled here. - */ - case IDH_CLR_MSG_BUF: - case IDH_FLR_NOTIFICATION_CMPL: - case IDH_READY_TO_ACCESS_GPU: - default: + case IDH_QUERY_ALIVE: + xgpu_ai_mailbox_send_ack(adev); + break; + /* READY_TO_ACCESS_GPU is fetched by kernel polling, IRQ can ignore + * it byfar since that polling thread will handle it, + * other msg like flr complete is not handled here. + */ + case IDH_CLR_MSG_BUF: + case IDH_FLR_NOTIFICATION_CMPL: + case IDH_READY_TO_ACCESS_GPU: + default: break; } @@ -387,6 +404,7 @@ int xgpu_ai_mailbox_get_irq(struct amdgpu_device *adev) } INIT_WORK(&adev->virt.flr_work, xgpu_ai_mailbox_flr_work); + INIT_WORK(&adev->virt.bad_pages_work, xgpu_ai_mailbox_bad_pages_work); return 0; } diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.h b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.h index ed57cbc150af..efb452ad1700 100644 --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.h +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.h @@ -40,6 +40,7 @@ enum idh_request { IDH_LOG_VF_ERROR = 200, IDH_READY_TO_RESET = 201, IDH_RAS_POISON = 202, + IDH_REQ_RAS_BAD_PAGES = 205, }; enum idh_event { @@ -54,6 +55,8 @@ enum idh_event { IDH_RAS_POISON_READY, IDH_PF_SOFT_FLR_NOTIFICATION, IDH_RAS_ERROR_DETECTED, + IDH_RAS_BAD_PAGES_READY = 15, + IDH_RAS_BAD_PAGES_NOTIFICATION = 16, IDH_TEXT_MESSAGE = 255, }; diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c index 5aadf24cb202..74a50c0036ef 100644 --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c @@ -187,6 +187,9 @@ send_request: case IDH_REQ_RAS_CPER_DUMP: event = IDH_RAS_CPER_DUMP_READY; break; + case IDH_REQ_RAS_BAD_PAGES: + event = IDH_RAS_BAD_PAGES_READY; + break; default: break; } @@ -342,6 +345,19 @@ static void xgpu_nv_mailbox_flr_work(struct work_struct *work) } } +static void xgpu_nv_mailbox_bad_pages_work(struct work_struct *work) +{ + struct amdgpu_virt *virt = container_of(work, struct amdgpu_virt, bad_pages_work); + struct amdgpu_device *adev = container_of(virt, struct amdgpu_device, virt); + + if (down_read_trylock(&adev->reset_domain->sem)) { + amdgpu_virt_fini_data_exchange(adev); + amdgpu_virt_request_bad_pages(adev); + amdgpu_virt_init_data_exchange(adev); + up_read(&adev->reset_domain->sem); + } +} + static int xgpu_nv_set_mailbox_rcv_irq(struct amdgpu_device *adev, struct amdgpu_irq_src *src, unsigned type, @@ -366,6 +382,11 @@ static int xgpu_nv_mailbox_rcv_irq(struct amdgpu_device *adev, enum idh_event event = xgpu_nv_mailbox_peek_msg(adev); switch (event) { + case IDH_RAS_BAD_PAGES_NOTIFICATION: + xgpu_nv_mailbox_send_ack(adev); + if (amdgpu_sriov_runtime(adev)) + schedule_work(&adev->virt.bad_pages_work); + break; case IDH_FLR_NOTIFICATION: if (amdgpu_sriov_runtime(adev)) WARN_ONCE(!amdgpu_reset_domain_schedule(adev->reset_domain, @@ -436,6 +457,7 @@ int xgpu_nv_mailbox_get_irq(struct amdgpu_device *adev) } INIT_WORK(&adev->virt.flr_work, xgpu_nv_mailbox_flr_work); + INIT_WORK(&adev->virt.bad_pages_work, xgpu_nv_mailbox_bad_pages_work); return 0; } @@ -480,6 +502,11 @@ static int xgpu_nv_req_ras_cper_dump(struct amdgpu_device *adev, u64 vf_rptr) adev, IDH_REQ_RAS_CPER_DUMP, vf_rptr_hi, vf_rptr_lo, 0); } +static int xgpu_nv_req_ras_bad_pages(struct amdgpu_device *adev) +{ + return xgpu_nv_send_access_requests(adev, IDH_REQ_RAS_BAD_PAGES); +} + const struct amdgpu_virt_ops xgpu_nv_virt_ops = { .req_full_gpu = xgpu_nv_request_full_gpu_access, .rel_full_gpu = xgpu_nv_release_full_gpu_access, @@ -492,4 +519,5 @@ const struct amdgpu_virt_ops xgpu_nv_virt_ops = { .rcvd_ras_intr = xgpu_nv_rcvd_ras_intr, .req_ras_err_count = xgpu_nv_req_ras_err_count, .req_ras_cper_dump = xgpu_nv_req_ras_cper_dump, + .req_bad_pages = xgpu_nv_req_ras_bad_pages, }; diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.h b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.h index 72c9fceb9d79..6d292a537c1b 100644 --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.h +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.h @@ -42,6 +42,7 @@ enum idh_request { IDH_RAS_POISON = 202, IDH_REQ_RAS_ERROR_COUNT = 203, IDH_REQ_RAS_CPER_DUMP = 204, + IDH_REQ_RAS_BAD_PAGES = 205, }; enum idh_event { @@ -58,6 +59,8 @@ enum idh_event { IDH_RAS_ERROR_DETECTED, IDH_RAS_ERROR_COUNT_READY = 11, IDH_RAS_CPER_DUMP_READY = 14, + IDH_RAS_BAD_PAGES_READY = 15, + IDH_RAS_BAD_PAGES_NOTIFICATION = 16, IDH_TEXT_MESSAGE = 255, }; -- 2.51.0 From af7160c25c68520299db40d608892810904fa064 Mon Sep 17 00:00:00 2001 From: Prike Liang Date: Tue, 15 Apr 2025 10:42:19 +0800 Subject: [PATCH 07/16] drm/amdgpu: fix the eviction fence dereference MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit The dma_resv_add_fence() already refers to the added fence. So when attaching the evciton fence to the gem bo, it needn't refer to it anymore. Signed-off-by: Prike Liang Reviewed-by: Christian König Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_eviction_fence.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_eviction_fence.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_eviction_fence.c index d86e611a9ff4..1a7469543db5 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_eviction_fence.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_eviction_fence.c @@ -189,7 +189,6 @@ void amdgpu_eviction_fence_destroy(struct amdgpu_eviction_fence_mgr *evf_mgr) int amdgpu_eviction_fence_attach(struct amdgpu_eviction_fence_mgr *evf_mgr, struct amdgpu_bo *bo) { - struct dma_fence *ef; struct amdgpu_eviction_fence *ev_fence; struct dma_resv *resv = bo->tbo.base.resv; int ret; @@ -205,10 +204,8 @@ int amdgpu_eviction_fence_attach(struct amdgpu_eviction_fence_mgr *evf_mgr, spin_lock(&evf_mgr->ev_fence_lock); ev_fence = evf_mgr->ev_fence; - if (ev_fence) { - ef = dma_fence_get(&ev_fence->base); - dma_resv_add_fence(resv, ef, DMA_RESV_USAGE_BOOKKEEP); - } + if (ev_fence) + dma_resv_add_fence(resv, &ev_fence->base, DMA_RESV_USAGE_BOOKKEEP); spin_unlock(&evf_mgr->ev_fence_lock); return 0; -- 2.51.0 From 5c89ceda9984498b28716944633a9a01cbb2c90d Mon Sep 17 00:00:00 2001 From: Ruijing Dong Date: Fri, 2 May 2025 11:19:26 -0400 Subject: [PATCH 08/16] drm/amdgpu/vcn: using separate VCN1_AON_SOC offset VCN1_AON_SOC_ADDRESS_3_0 offset varies on different VCN generations, the issue in vcn4.0.5 is caused by a different VCN1_AON_SOC_ADDRESS_3_0 offset. This patch does the following: 1. use the same offset for other VCN generations. 2. use the vcn4.0.5 special offset 3. update vcn_4_0 and vcn_5_0 Acked-by: Saleemkhan Jamadar Reviewed-by: Leo Liu Signed-off-by: Ruijing Dong Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h | 1 - drivers/gpu/drm/amd/amdgpu/vcn_v2_0.c | 1 + drivers/gpu/drm/amd/amdgpu/vcn_v2_5.c | 1 + drivers/gpu/drm/amd/amdgpu/vcn_v3_0.c | 1 + drivers/gpu/drm/amd/amdgpu/vcn_v4_0.c | 4 +++- drivers/gpu/drm/amd/amdgpu/vcn_v4_0_3.c | 1 + drivers/gpu/drm/amd/amdgpu/vcn_v4_0_5.c | 1 + drivers/gpu/drm/amd/amdgpu/vcn_v5_0_0.c | 3 ++- 8 files changed, 10 insertions(+), 3 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h index cdcdae7f71ce..83adf81defc7 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h @@ -66,7 +66,6 @@ #define VCN_ENC_CMD_REG_WAIT 0x0000000c #define VCN_AON_SOC_ADDRESS_2_0 0x1f800 -#define VCN1_AON_SOC_ADDRESS_3_0 0x48000 #define VCN_VID_IP_ADDRESS_2_0 0x0 #define VCN_AON_IP_ADDRESS_2_0 0x30000 diff --git a/drivers/gpu/drm/amd/amdgpu/vcn_v2_0.c b/drivers/gpu/drm/amd/amdgpu/vcn_v2_0.c index 8e7a36f26e9c..b8d835c9e17e 100644 --- a/drivers/gpu/drm/amd/amdgpu/vcn_v2_0.c +++ b/drivers/gpu/drm/amd/amdgpu/vcn_v2_0.c @@ -39,6 +39,7 @@ #define VCN_VID_SOC_ADDRESS_2_0 0x1fa00 #define VCN1_VID_SOC_ADDRESS_3_0 0x48200 +#define VCN1_AON_SOC_ADDRESS_3_0 0x48000 #define mmUVD_CONTEXT_ID_INTERNAL_OFFSET 0x1fd #define mmUVD_GPCOM_VCPU_CMD_INTERNAL_OFFSET 0x503 diff --git a/drivers/gpu/drm/amd/amdgpu/vcn_v2_5.c b/drivers/gpu/drm/amd/amdgpu/vcn_v2_5.c index d716510b8dd6..3eec1b8feaee 100644 --- a/drivers/gpu/drm/amd/amdgpu/vcn_v2_5.c +++ b/drivers/gpu/drm/amd/amdgpu/vcn_v2_5.c @@ -39,6 +39,7 @@ #define VCN_VID_SOC_ADDRESS_2_0 0x1fa00 #define VCN1_VID_SOC_ADDRESS_3_0 0x48200 +#define VCN1_AON_SOC_ADDRESS_3_0 0x48000 #define mmUVD_CONTEXT_ID_INTERNAL_OFFSET 0x27 #define mmUVD_GPCOM_VCPU_CMD_INTERNAL_OFFSET 0x0f diff --git a/drivers/gpu/drm/amd/amdgpu/vcn_v3_0.c b/drivers/gpu/drm/amd/amdgpu/vcn_v3_0.c index 22ae1939476f..0b19f0ab4480 100644 --- a/drivers/gpu/drm/amd/amdgpu/vcn_v3_0.c +++ b/drivers/gpu/drm/amd/amdgpu/vcn_v3_0.c @@ -40,6 +40,7 @@ #define VCN_VID_SOC_ADDRESS_2_0 0x1fa00 #define VCN1_VID_SOC_ADDRESS_3_0 0x48200 +#define VCN1_AON_SOC_ADDRESS_3_0 0x48000 #define mmUVD_CONTEXT_ID_INTERNAL_OFFSET 0x27 #define mmUVD_GPCOM_VCPU_CMD_INTERNAL_OFFSET 0x0f diff --git a/drivers/gpu/drm/amd/amdgpu/vcn_v4_0.c b/drivers/gpu/drm/amd/amdgpu/vcn_v4_0.c index c6f6392c1c20..1f777c125b00 100644 --- a/drivers/gpu/drm/amd/amdgpu/vcn_v4_0.c +++ b/drivers/gpu/drm/amd/amdgpu/vcn_v4_0.c @@ -46,6 +46,7 @@ #define VCN_VID_SOC_ADDRESS_2_0 0x1fb00 #define VCN1_VID_SOC_ADDRESS_3_0 0x48300 +#define VCN1_AON_SOC_ADDRESS_3_0 0x48000 #define VCN_HARVEST_MMSCH 0 @@ -614,7 +615,8 @@ static void vcn_v4_0_mc_resume_dpg_mode(struct amdgpu_vcn_inst *vinst, /* VCN global tiling registers */ WREG32_SOC15_DPG_MODE(inst_idx, SOC15_DPG_MODE_OFFSET( - VCN, 0, regUVD_GFX10_ADDR_CONFIG), adev->gfx.config.gb_addr_config, 0, indirect); + VCN, inst_idx, regUVD_GFX10_ADDR_CONFIG), + adev->gfx.config.gb_addr_config, 0, indirect); } /** diff --git a/drivers/gpu/drm/amd/amdgpu/vcn_v4_0_3.c b/drivers/gpu/drm/amd/amdgpu/vcn_v4_0_3.c index 139c83bd165e..712e1fba33ce 100644 --- a/drivers/gpu/drm/amd/amdgpu/vcn_v4_0_3.c +++ b/drivers/gpu/drm/amd/amdgpu/vcn_v4_0_3.c @@ -45,6 +45,7 @@ #define VCN_VID_SOC_ADDRESS_2_0 0x1fb00 #define VCN1_VID_SOC_ADDRESS_3_0 0x48300 +#define VCN1_AON_SOC_ADDRESS_3_0 0x48000 static const struct amdgpu_hwip_reg_entry vcn_reg_list_4_0_3[] = { SOC15_REG_ENTRY_STR(VCN, 0, regUVD_POWER_STATUS), diff --git a/drivers/gpu/drm/amd/amdgpu/vcn_v4_0_5.c b/drivers/gpu/drm/amd/amdgpu/vcn_v4_0_5.c index a8cfc63713ad..558469744f3a 100644 --- a/drivers/gpu/drm/amd/amdgpu/vcn_v4_0_5.c +++ b/drivers/gpu/drm/amd/amdgpu/vcn_v4_0_5.c @@ -46,6 +46,7 @@ #define VCN_VID_SOC_ADDRESS_2_0 0x1fb00 #define VCN1_VID_SOC_ADDRESS_3_0 (0x48300 + 0x38000) +#define VCN1_AON_SOC_ADDRESS_3_0 (0x48000 + 0x38000) #define VCN_HARVEST_MMSCH 0 diff --git a/drivers/gpu/drm/amd/amdgpu/vcn_v5_0_0.c b/drivers/gpu/drm/amd/amdgpu/vcn_v5_0_0.c index d99d05f42f1d..b90da3d3e140 100644 --- a/drivers/gpu/drm/amd/amdgpu/vcn_v5_0_0.c +++ b/drivers/gpu/drm/amd/amdgpu/vcn_v5_0_0.c @@ -533,7 +533,8 @@ static void vcn_v5_0_0_mc_resume_dpg_mode(struct amdgpu_vcn_inst *vinst, /* VCN global tiling registers */ WREG32_SOC24_DPG_MODE(inst_idx, SOC24_DPG_MODE_OFFSET( - VCN, 0, regUVD_GFX10_ADDR_CONFIG), adev->gfx.config.gb_addr_config, 0, indirect); + VCN, inst_idx, regUVD_GFX10_ADDR_CONFIG), + adev->gfx.config.gb_addr_config, 0, indirect); return; } -- 2.51.0 From ce8f7d95899c2869b47ea6ce0b3e5bf304b2fff4 Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Thu, 1 May 2025 13:00:16 -0400 Subject: [PATCH 09/16] Revert "drm/amd: Stop evicting resources on APUs in suspend" This reverts commit 3a9626c816db901def438dc2513622e281186d39. This breaks S4 because we end up setting the s3/s0ix flags even when we are entering s4 since prepare is used by both flows. The causes both the S3/s0ix and s4 flags to be set which breaks several checks in the driver which assume they are mutually exclusive. Closes: https://gitlab.freedesktop.org/drm/amd/-/issues/3634 Cc: Mario Limonciello Reviewed-by: Mario Limonciello Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu.h | 2 -- drivers/gpu/drm/amd/amdgpu/amdgpu_acpi.c | 18 ------------------ drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 11 ++--------- 3 files changed, 2 insertions(+), 29 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h index de57ab032cc7..f59ba0960a77 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h @@ -1707,11 +1707,9 @@ static inline void amdgpu_acpi_get_backlight_caps(struct amdgpu_dm_backlight_cap #if defined(CONFIG_ACPI) && defined(CONFIG_SUSPEND) bool amdgpu_acpi_is_s3_active(struct amdgpu_device *adev); bool amdgpu_acpi_is_s0ix_active(struct amdgpu_device *adev); -void amdgpu_choose_low_power_state(struct amdgpu_device *adev); #else static inline bool amdgpu_acpi_is_s0ix_active(struct amdgpu_device *adev) { return false; } static inline bool amdgpu_acpi_is_s3_active(struct amdgpu_device *adev) { return false; } -static inline void amdgpu_choose_low_power_state(struct amdgpu_device *adev) { } #endif void amdgpu_register_gpu_instance(struct amdgpu_device *adev); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_acpi.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_acpi.c index b7f8f2ff143d..707e131f89d2 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_acpi.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_acpi.c @@ -1533,22 +1533,4 @@ bool amdgpu_acpi_is_s0ix_active(struct amdgpu_device *adev) #endif /* CONFIG_AMD_PMC */ } -/** - * amdgpu_choose_low_power_state - * - * @adev: amdgpu_device_pointer - * - * Choose the target low power state for the GPU - */ -void amdgpu_choose_low_power_state(struct amdgpu_device *adev) -{ - if (adev->in_runpm) - return; - - if (amdgpu_acpi_is_s0ix_active(adev)) - adev->in_s0ix = true; - else if (amdgpu_acpi_is_s3_active(adev)) - adev->in_s3 = true; -} - #endif /* CONFIG_SUSPEND */ diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index 8330e30f0caf..ccc6217761bf 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -5018,15 +5018,13 @@ int amdgpu_device_prepare(struct drm_device *dev) struct amdgpu_device *adev = drm_to_adev(dev); int i, r; - amdgpu_choose_low_power_state(adev); - if (dev->switch_power_state == DRM_SWITCH_POWER_OFF) return 0; /* Evict the majority of BOs before starting suspend sequence */ r = amdgpu_device_evict_resources(adev); if (r) - goto unprepare; + return r; flush_delayed_work(&adev->gfx.gfx_off_delay_work); @@ -5037,15 +5035,10 @@ int amdgpu_device_prepare(struct drm_device *dev) continue; r = adev->ip_blocks[i].version->funcs->prepare_suspend(&adev->ip_blocks[i]); if (r) - goto unprepare; + return r; } return 0; - -unprepare: - adev->in_s0ix = adev->in_s3 = adev->in_s4 = false; - - return r; } /** -- 2.51.0 From 6be34e1d1f0efc624a7505ac98609d8c6e732b85 Mon Sep 17 00:00:00 2001 From: Ellen Pan Date: Tue, 29 Apr 2025 17:00:50 -0400 Subject: [PATCH 10/16] drm/amdgpu: Add unrecoverable error message definitions for VFs Host may stop runtime services after reaching a bad page threshold. This notification will indicate to the VF that it no longer has access to the GPU. Reviewed-by: Shravan Kumar Gande Signed-off-by: Victor Skvortsov Signed-off-by: Ellen Pan Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgv_sriovmsg.h | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgv_sriovmsg.h b/drivers/gpu/drm/amd/amdgpu/amdgv_sriovmsg.h index 3b0c55f67fe4..92ca13097aaa 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgv_sriovmsg.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgv_sriovmsg.h @@ -351,6 +351,7 @@ enum amd_sriov_mailbox_response_message { MB_REQ_RAS_CPER_DUMP_READY = 14, MB_RES_MSG_RAS_BAD_PAGES_READY = 15, MB_RES_MSG_RAS_BAD_PAGES_NOTIFICATION = 16, + MB_RES_MSG_UNRECOV_ERR_NOTIFICATION = 17, MB_RES_MSG_TEXT_MESSAGE = 255 }; -- 2.51.0 From 086809c82c96116aed78f762e4f1d61a4e0210a7 Mon Sep 17 00:00:00 2001 From: Ellen Pan Date: Tue, 29 Apr 2025 17:18:44 -0400 Subject: [PATCH 11/16] drm/amdgpu: Implement unrecoverable error message handling for VFs This notification may arrive in VF mailbox while polling for response from another event. This patches covers the following scenarios: - If VF is already in RMA state, then do not attempt to contact the host. Host will ignore the VF after sending the notification. - If the notification is detected during polling, then set the RMA status, and return error to caller. - If the notification arrives by interrupt, then set the RMA status and queue a reset. This reset will fail and VF will stop runtime services. Reviewed-by: Shravan Kumar Gande Signed-off-by: Victor Skvortsov Signed-off-by: Ellen Pan Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 5 ++++ drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c | 17 +++++++++-- drivers/gpu/drm/amd/amdgpu/mxgpu_ai.h | 1 + drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c | 34 ++++++++++++++++++++-- drivers/gpu/drm/amd/amdgpu/mxgpu_nv.h | 1 + 5 files changed, 52 insertions(+), 6 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index ccc6217761bf..a2997ededd0e 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -6150,6 +6150,11 @@ retry: /* Rest of adevs pre asic reset from XGMI hive. */ /* Actual ASIC resets if needed.*/ /* Host driver will handle XGMI hive reset for SRIOV */ if (amdgpu_sriov_vf(adev)) { + + /* Bail out of reset early */ + if (amdgpu_ras_is_rma(adev)) + return -ENODEV; + if (amdgpu_ras_get_fed_status(adev) || amdgpu_virt_rcvd_ras_interrupt(adev)) { dev_dbg(adev->dev, "Detected RAS error, wait for FLR completion\n"); amdgpu_ras_set_fed(adev, true); diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c index f2a74aa76b56..48101a34e049 100644 --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c @@ -324,6 +324,7 @@ static int xgpu_ai_mailbox_rcv_irq(struct amdgpu_device *adev, struct amdgpu_iv_entry *entry) { enum idh_event event = xgpu_ai_mailbox_peek_msg(adev); + struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); switch (event) { case IDH_RAS_BAD_PAGES_NOTIFICATION: @@ -331,12 +332,22 @@ static int xgpu_ai_mailbox_rcv_irq(struct amdgpu_device *adev, if (amdgpu_sriov_runtime(adev)) schedule_work(&adev->virt.bad_pages_work); break; + case IDH_UNRECOV_ERR_NOTIFICATION: + xgpu_ai_mailbox_send_ack(adev); + ras->is_rma = true; + dev_err(adev->dev, "VF is in an unrecoverable state. Runtime Services are halted.\n"); + if (amdgpu_sriov_runtime(adev)) + WARN_ONCE(!amdgpu_reset_domain_schedule(adev->reset_domain, + &adev->virt.flr_work), + "Failed to queue work! at %s", + __func__); + break; case IDH_FLR_NOTIFICATION: if (amdgpu_sriov_runtime(adev)) WARN_ONCE(!amdgpu_reset_domain_schedule(adev->reset_domain, - &adev->virt.flr_work), - "Failed to queue work! at %s", - __func__); + &adev->virt.flr_work), + "Failed to queue work! at %s", + __func__); break; case IDH_QUERY_ALIVE: xgpu_ai_mailbox_send_ack(adev); diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.h b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.h index efb452ad1700..874b9f8f9804 100644 --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.h +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.h @@ -57,6 +57,7 @@ enum idh_event { IDH_RAS_ERROR_DETECTED, IDH_RAS_BAD_PAGES_READY = 15, IDH_RAS_BAD_PAGES_NOTIFICATION = 16, + IDH_UNRECOV_ERR_NOTIFICATION = 17, IDH_TEXT_MESSAGE = 255, }; diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c index 74a50c0036ef..f6d8597452ed 100644 --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c @@ -67,6 +67,8 @@ static int xgpu_nv_mailbox_rcv_msg(struct amdgpu_device *adev, reg = RREG32_NO_KIQ(mmMAILBOX_MSGBUF_RCV_DW0); if (reg == IDH_FAIL) r = -EINVAL; + if (reg == IDH_UNRECOV_ERR_NOTIFICATION) + r = -ENODEV; else if (reg != event) return -ENOENT; @@ -103,6 +105,7 @@ static int xgpu_nv_poll_msg(struct amdgpu_device *adev, enum idh_event event) { int r; uint64_t timeout, now; + struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); now = (uint64_t)ktime_to_ms(ktime_get()); timeout = now + NV_MAILBOX_POLL_MSG_TIMEDOUT; @@ -110,8 +113,16 @@ static int xgpu_nv_poll_msg(struct amdgpu_device *adev, enum idh_event event) do { r = xgpu_nv_mailbox_rcv_msg(adev, event); if (!r) { - dev_dbg(adev->dev, "rcv_msg 0x%x after %llu ms\n", event, NV_MAILBOX_POLL_MSG_TIMEDOUT - timeout + now); + dev_dbg(adev->dev, "rcv_msg 0x%x after %llu ms\n", + event, NV_MAILBOX_POLL_MSG_TIMEDOUT - timeout + now); return 0; + } else if (r == -ENODEV) { + if (!amdgpu_ras_is_rma(adev)) { + ras->is_rma = true; + dev_err(adev->dev, "VF is in an unrecoverable state. " + "Runtime Services are halted.\n"); + } + return r; } msleep(10); @@ -166,6 +177,10 @@ static int xgpu_nv_send_access_requests_with_param(struct amdgpu_device *adev, enum idh_event event = -1; send_request: + + if (amdgpu_ras_is_rma(adev)) + return -ENODEV; + xgpu_nv_mailbox_trans_msg(adev, req, data1, data2, data3); switch (req) { @@ -323,6 +338,7 @@ static void xgpu_nv_mailbox_flr_work(struct work_struct *work) { struct amdgpu_virt *virt = container_of(work, struct amdgpu_virt, flr_work); struct amdgpu_device *adev = container_of(virt, struct amdgpu_device, virt); + struct amdgpu_reset_context reset_context = { 0 }; amdgpu_virt_fini_data_exchange(adev); @@ -333,8 +349,6 @@ static void xgpu_nv_mailbox_flr_work(struct work_struct *work) adev->gfx_timeout == MAX_SCHEDULE_TIMEOUT || adev->compute_timeout == MAX_SCHEDULE_TIMEOUT || adev->video_timeout == MAX_SCHEDULE_TIMEOUT)) { - struct amdgpu_reset_context reset_context; - memset(&reset_context, 0, sizeof(reset_context)); reset_context.method = AMD_RESET_METHOD_NONE; reset_context.reset_req_dev = adev; @@ -380,6 +394,7 @@ static int xgpu_nv_mailbox_rcv_irq(struct amdgpu_device *adev, struct amdgpu_iv_entry *entry) { enum idh_event event = xgpu_nv_mailbox_peek_msg(adev); + struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); switch (event) { case IDH_RAS_BAD_PAGES_NOTIFICATION: @@ -387,6 +402,19 @@ static int xgpu_nv_mailbox_rcv_irq(struct amdgpu_device *adev, if (amdgpu_sriov_runtime(adev)) schedule_work(&adev->virt.bad_pages_work); break; + case IDH_UNRECOV_ERR_NOTIFICATION: + xgpu_nv_mailbox_send_ack(adev); + if (!amdgpu_ras_is_rma(adev)) { + ras->is_rma = true; + dev_err(adev->dev, "VF is in an unrecoverable state. Runtime Services are halted.\n"); + } + + if (amdgpu_sriov_runtime(adev)) + WARN_ONCE(!amdgpu_reset_domain_schedule(adev->reset_domain, + &adev->virt.flr_work), + "Failed to queue work! at %s", + __func__); + break; case IDH_FLR_NOTIFICATION: if (amdgpu_sriov_runtime(adev)) WARN_ONCE(!amdgpu_reset_domain_schedule(adev->reset_domain, diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.h b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.h index 6d292a537c1b..5808689562cc 100644 --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.h +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.h @@ -61,6 +61,7 @@ enum idh_event { IDH_RAS_CPER_DUMP_READY = 14, IDH_RAS_BAD_PAGES_READY = 15, IDH_RAS_BAD_PAGES_NOTIFICATION = 16, + IDH_UNRECOV_ERR_NOTIFICATION = 17, IDH_TEXT_MESSAGE = 255, }; -- 2.51.0 From 06f2dcc241e7e5c681f81fbc46cacdf4bfd7d6d7 Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Thu, 1 May 2025 13:46:46 -0400 Subject: [PATCH 12/16] drm/amdgpu: fix pm notifier handling Set the s3/s0ix and s4 flags in the pm notifier so that we can skip the resource evictions properly in pm prepare based on whether we are suspending or hibernating. Drop the eviction as processes are not frozen at this time, we we can end up getting stuck trying to evict VRAM while applications continue to submit work which causes the buffers to get pulled back into VRAM. v2: Move suspend flags out of pm notifier (Mario) Closes: https://gitlab.freedesktop.org/drm/amd/-/issues/4178 Fixes: 2965e6355dcd ("drm/amd: Add Suspend/Hibernate notification callback support") Cc: Mario Limonciello Reviewed-by: Mario Limonciello Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 18 +++++------------- drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c | 10 +--------- 2 files changed, 6 insertions(+), 22 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index a2997ededd0e..5d47c36c8280 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -4976,28 +4976,20 @@ static int amdgpu_device_evict_resources(struct amdgpu_device *adev) * @data: data * * This function is called when the system is about to suspend or hibernate. - * It is used to evict resources from the device before the system goes to - * sleep while there is still access to swap. + * It is used to set the appropriate flags so that eviction can be optimized + * in the pm prepare callback. */ static int amdgpu_device_pm_notifier(struct notifier_block *nb, unsigned long mode, void *data) { struct amdgpu_device *adev = container_of(nb, struct amdgpu_device, pm_nb); - int r; switch (mode) { case PM_HIBERNATION_PREPARE: adev->in_s4 = true; - fallthrough; - case PM_SUSPEND_PREPARE: - r = amdgpu_device_evict_resources(adev); - /* - * This is considered non-fatal at this time because - * amdgpu_device_prepare() will also fatally evict resources. - * See https://gitlab.freedesktop.org/drm/amd/-/issues/3781 - */ - if (r) - drm_warn(adev_to_drm(adev), "Failed to evict resources, freeze active processes if problems occur: %d\n", r); + break; + case PM_POST_HIBERNATION: + adev->in_s4 = false; break; } diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c index 8e45f18ad470..7f437aea3d06 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c @@ -2644,13 +2644,8 @@ static int amdgpu_pmops_freeze(struct device *dev) static int amdgpu_pmops_thaw(struct device *dev) { struct drm_device *drm_dev = dev_get_drvdata(dev); - struct amdgpu_device *adev = drm_to_adev(drm_dev); - int r; - - r = amdgpu_device_resume(drm_dev, true); - adev->in_s4 = false; - return r; + return amdgpu_device_resume(drm_dev, true); } static int amdgpu_pmops_poweroff(struct device *dev) @@ -2663,9 +2658,6 @@ static int amdgpu_pmops_poweroff(struct device *dev) static int amdgpu_pmops_restore(struct device *dev) { struct drm_device *drm_dev = dev_get_drvdata(dev); - struct amdgpu_device *adev = drm_to_adev(drm_dev); - - adev->in_s4 = false; return amdgpu_device_resume(drm_dev, true); } -- 2.51.0 From 6edc89645c01ae9bd9407f84608564e7521f0117 Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Tue, 29 Apr 2025 09:45:03 -0400 Subject: [PATCH 13/16] drm/amdgpu/psp: mark securedisplay TA as optional This is an optional TA which is only available on certain embedded systems. Mark it as optional to avoid user confusion. This mirrors what we already do for other optional TAs. Closes: https://gitlab.freedesktop.org/drm/amd/-/issues/4181 Reviewed-by: Hawking Zhang Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c index d54bb1377262..1c843b888475 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c @@ -2214,7 +2214,8 @@ static int psp_securedisplay_initialize(struct psp_context *psp) if (!psp->securedisplay_context.context.bin_desc.size_bytes || !psp->securedisplay_context.context.bin_desc.start_addr) { - dev_info(psp->adev->dev, "SECUREDISPLAY: securedisplay ta ucode is not available\n"); + dev_info(psp->adev->dev, + "SECUREDISPLAY: optional securedisplay ta ucode is not available\n"); return 0; } -- 2.51.0 From 926c79ad6ecde21e7e2d7415f873d57c364611f1 Mon Sep 17 00:00:00 2001 From: Prike Liang Date: Wed, 23 Apr 2025 20:26:48 +0800 Subject: [PATCH 14/16] drm/amdgpu: promote the implicit sync to the dependent read fences MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit The driver doesn't want to implicitly sync on the DMA_RESV_USAGE_BOOKKEEP usage fences, and the BOOKEEP fences should be synced explicitly. So, as the VM implicit syncing only need to return and sync the dependent read fences. Signed-off-by: Prike Liang Reviewed-by: Christian König Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_sync.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_sync.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_sync.c index 5576ed0b508f..d6ae9974c952 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_sync.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_sync.c @@ -249,9 +249,8 @@ int amdgpu_sync_resv(struct amdgpu_device *adev, struct amdgpu_sync *sync, if (resv == NULL) return -EINVAL; - - /* TODO: Use DMA_RESV_USAGE_READ here */ - dma_resv_for_each_fence(&cursor, resv, DMA_RESV_USAGE_BOOKKEEP, f) { + /* Implicitly sync only to KERNEL, WRITE and READ */ + dma_resv_for_each_fence(&cursor, resv, DMA_RESV_USAGE_READ, f) { dma_fence_chain_for_each(f, f) { struct dma_fence *tmp = dma_fence_chain_contained(f); -- 2.51.0 From def41146b96a9e292a17cddb1ac97007f41c44bc Mon Sep 17 00:00:00 2001 From: Prike Liang Date: Tue, 6 May 2025 16:32:23 +0800 Subject: [PATCH 15/16] drm/amdgpu: unreserve the gem BO before returning from attach error MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit It requires unlocking the reserved gem BO before returning from attaching the eviction fence error. Signed-off-by: Prike Liang Reviewed-by: Christian König Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c index f03fc3cf4d50..2c68118fe9fd 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c @@ -298,6 +298,7 @@ static int amdgpu_gem_object_open(struct drm_gem_object *obj, r = amdgpu_eviction_fence_attach(&fpriv->evf_mgr, abo); if (r) { DRM_DEBUG_DRIVER("Failed to attach eviction fence to BO\n"); + amdgpu_bo_unreserve(abo); return r; } -- 2.51.0 From 8e320f67d49d6bb28eba9c8bded66ad04b0acf0b Mon Sep 17 00:00:00 2001 From: Shane Xiao Date: Thu, 24 Apr 2025 13:15:01 +0800 Subject: [PATCH 16/16] drm/amdgpu: Add debug bit for userptr usage MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit In VM debug mode, it is desirable to notify the application to correct the freeing sequence by unmapping the memory before destroying the userptr in the old userptr path. Add a bitmask to decide whether to send gpu vm fault to the applition. Signed-off-by: Shane Xiao Reviewed-by: Christian König Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu.h | 1 + drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c | 5 +++++ 2 files changed, 6 insertions(+) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h index f59ba0960a77..836ea081088a 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h @@ -1281,6 +1281,7 @@ struct amdgpu_device { bool debug_enable_ras_aca; bool debug_exp_resets; bool debug_disable_gpu_ring_reset; + bool debug_vm_userptr; /* Protection for the following isolation structure */ struct mutex enforce_isolation_mutex; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c index 7f437aea3d06..4ddd08ce8885 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c @@ -143,6 +143,7 @@ enum AMDGPU_DEBUG_MASK { AMDGPU_DEBUG_ENABLE_EXP_RESETS = BIT(5), AMDGPU_DEBUG_DISABLE_GPU_RING_RESET = BIT(6), AMDGPU_DEBUG_SMU_POOL = BIT(7), + AMDGPU_DEBUG_VM_USERPTR = BIT(8), }; unsigned int amdgpu_vram_limit = UINT_MAX; @@ -2273,6 +2274,10 @@ static void amdgpu_init_debug_options(struct amdgpu_device *adev) pr_info("debug: use vram for smu pool\n"); adev->pm.smu_debug_mask |= SMU_DEBUG_POOL_USE_VRAM; } + if (amdgpu_debug_mask & AMDGPU_DEBUG_VM_USERPTR) { + pr_info("debug: VM mode debug for userptr is enabled\n"); + adev->debug_vm_userptr = true; + } } static unsigned long amdgpu_fix_asic_type(struct pci_dev *pdev, unsigned long flags) -- 2.51.0