www.infradead.org Git - users/hch/configfs.git/commitdiff
drm/amdgpu/mes: fix mes ring buffer overflow
authorJack Xiao <Jack.Xiao@amd.com>
Thu, 18 Jul 2024 08:38:50 +0000 (16:38 +0800)
committerAlex Deucher <alexander.deucher@amd.com>
Tue, 13 Aug 2024 13:57:52 +0000 (09:57 -0400)
Wait until there is enough room in the ring buffer before writing MES
packets, to avoid a ring buffer overflow.

v2: squash in sched_hw_submission fix

Fixes: de3246254156 ("drm/amdgpu: cleanup MES11 command submission")
Fixes: fffe347e1478 ("drm/amdgpu: cleanup MES12 command submission")
Signed-off-by: Jack Xiao <Jack.Xiao@amd.com>
Acked-by: Alex Deucher <alexander.deucher@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
drivers/gpu/drm/amd/amdgpu/mes_v11_0.c
drivers/gpu/drm/amd/amdgpu/mes_v12_0.c

index 8c39bf7e1fac945c54f85de3913deb6b1e44aa68..690976665cf699526ede8d8ea70520cc47b83bb0 100644 (file)
@@ -214,6 +214,8 @@ int amdgpu_ring_init(struct amdgpu_device *adev, struct amdgpu_ring *ring,
         */
        if (ring->funcs->type == AMDGPU_RING_TYPE_KIQ)
                sched_hw_submission = max(sched_hw_submission, 256);
+       if (ring->funcs->type == AMDGPU_RING_TYPE_MES)
+               sched_hw_submission = 8;
        else if (ring == &adev->sdma.instance[0].page)
                sched_hw_submission = 256;
 
index f9343642ae7e41135344b874256d745ac6af8aa0..1a5ad5be33bfc98828b2db1e0b4b8b0b718fb473 100644 (file)
@@ -168,7 +168,7 @@ static int mes_v11_0_submit_pkt_and_poll_completion(struct amdgpu_mes *mes,
        const char *op_str, *misc_op_str;
        unsigned long flags;
        u64 status_gpu_addr;
-       u32 status_offset;
+       u32 seq, status_offset;
        u64 *status_ptr;
        signed long r;
        int ret;
@@ -196,6 +196,13 @@ static int mes_v11_0_submit_pkt_and_poll_completion(struct amdgpu_mes *mes,
        if (r)
                goto error_unlock_free;
 
+       seq = ++ring->fence_drv.sync_seq;
+       r = amdgpu_fence_wait_polling(ring,
+                                     seq - ring->fence_drv.num_fences_mask,
+                                     timeout);
+       if (r < 1)
+               goto error_undo;
+
        api_status = (struct MES_API_STATUS *)((char *)pkt + api_status_off);
        api_status->api_completion_fence_addr = status_gpu_addr;
        api_status->api_completion_fence_value = 1;
@@ -208,8 +215,7 @@ static int mes_v11_0_submit_pkt_and_poll_completion(struct amdgpu_mes *mes,
        mes_status_pkt.header.dwsize = API_FRAME_SIZE_IN_DWORDS;
        mes_status_pkt.api_status.api_completion_fence_addr =
                ring->fence_drv.gpu_addr;
-       mes_status_pkt.api_status.api_completion_fence_value =
-               ++ring->fence_drv.sync_seq;
+       mes_status_pkt.api_status.api_completion_fence_value = seq;
 
        amdgpu_ring_write_multiple(ring, &mes_status_pkt,
                                   sizeof(mes_status_pkt) / 4);
@@ -229,7 +235,7 @@ static int mes_v11_0_submit_pkt_and_poll_completion(struct amdgpu_mes *mes,
                dev_dbg(adev->dev, "MES msg=%d was emitted\n",
                        x_pkt->header.opcode);
 
-       r = amdgpu_fence_wait_polling(ring, ring->fence_drv.sync_seq, timeout);
+       r = amdgpu_fence_wait_polling(ring, seq, timeout);
        if (r < 1 || !*status_ptr) {
 
                if (misc_op_str)
@@ -252,6 +258,10 @@ static int mes_v11_0_submit_pkt_and_poll_completion(struct amdgpu_mes *mes,
        amdgpu_device_wb_free(adev, status_offset);
        return 0;
 
+error_undo:
+       dev_err(adev->dev, "MES ring buffer is full.\n");
+       amdgpu_ring_undo(ring);
+
 error_unlock_free:
        spin_unlock_irqrestore(&mes->ring_lock, flags);
 
index 0713bc3eb263ea2a1589a9e5e0a77259408088fc..249e5a66205c279b338c1a61a27f77ed1ba431fa 100644 (file)
@@ -154,7 +154,7 @@ static int mes_v12_0_submit_pkt_and_poll_completion(struct amdgpu_mes *mes,
        const char *op_str, *misc_op_str;
        unsigned long flags;
        u64 status_gpu_addr;
-       u32 status_offset;
+       u32 seq, status_offset;
        u64 *status_ptr;
        signed long r;
        int ret;
@@ -182,6 +182,13 @@ static int mes_v12_0_submit_pkt_and_poll_completion(struct amdgpu_mes *mes,
        if (r)
                goto error_unlock_free;
 
+       seq = ++ring->fence_drv.sync_seq;
+       r = amdgpu_fence_wait_polling(ring,
+                                     seq - ring->fence_drv.num_fences_mask,
+                                     timeout);
+       if (r < 1)
+               goto error_undo;
+
        api_status = (struct MES_API_STATUS *)((char *)pkt + api_status_off);
        api_status->api_completion_fence_addr = status_gpu_addr;
        api_status->api_completion_fence_value = 1;
@@ -194,8 +201,7 @@ static int mes_v12_0_submit_pkt_and_poll_completion(struct amdgpu_mes *mes,
        mes_status_pkt.header.dwsize = API_FRAME_SIZE_IN_DWORDS;
        mes_status_pkt.api_status.api_completion_fence_addr =
                ring->fence_drv.gpu_addr;
-       mes_status_pkt.api_status.api_completion_fence_value =
-               ++ring->fence_drv.sync_seq;
+       mes_status_pkt.api_status.api_completion_fence_value = seq;
 
        amdgpu_ring_write_multiple(ring, &mes_status_pkt,
                                   sizeof(mes_status_pkt) / 4);
@@ -215,7 +221,7 @@ static int mes_v12_0_submit_pkt_and_poll_completion(struct amdgpu_mes *mes,
                dev_dbg(adev->dev, "MES msg=%d was emitted\n",
                        x_pkt->header.opcode);
 
-       r = amdgpu_fence_wait_polling(ring, ring->fence_drv.sync_seq, timeout);
+       r = amdgpu_fence_wait_polling(ring, seq, timeout);
        if (r < 1 || !*status_ptr) {
 
                if (misc_op_str)
@@ -238,6 +244,10 @@ static int mes_v12_0_submit_pkt_and_poll_completion(struct amdgpu_mes *mes,
        amdgpu_device_wb_free(adev, status_offset);
        return 0;
 
+error_undo:
+       dev_err(adev->dev, "MES ring buffer is full.\n");
+       amdgpu_ring_undo(ring);
+
 error_unlock_free:
        spin_unlock_irqrestore(&mes->ring_lock, flags);