drm/amdgpu/sdma: guilty tracking is per instance
author     Alex Deucher <alexander.deucher@amd.com>
           Mon, 17 Mar 2025 13:45:02 +0000 (09:45 -0400)
committer  Alex Deucher <alexander.deucher@amd.com>
           Fri, 21 Mar 2025 16:16:34 +0000 (12:16 -0400)
The gfx and page queues are per SDMA instance, so track their
guilty state per instance rather than in the shared device struct.

v2: drop extra parameter (Lijo)

Fixes: fdbfaaaae06b ("drm/amdgpu: Improve SDMA reset logic with guilty queue tracking")
Reviewed-by: Lijo Lazar <lijo.lazar@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
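
For orientation before the diff: the patch moves two booleans out of the
device-wide struct amdgpu_sdma and into struct amdgpu_sdma_instance. A
minimal standalone sketch of the before/after layout, with simplified
stand-in types (the flag names mirror the patch; the surrounding members
and the instance-array bound are placeholders, not the real amdgpu
headers):

#include <stdbool.h>

#define MAX_SDMA_INSTANCES 16 /* placeholder bound for the sketch */

struct sdma_instance {
        /* ... ring state, firmware addresses, engine_reset_mutex ... */
        bool gfx_guilty;  /* after the patch: one flag per instance */
        bool page_guilty;
};

struct sdma {
        struct sdma_instance instance[MAX_SDMA_INSTANCES];
        int num_instances;
        /* before the patch, gfx_guilty and page_guilty lived here and
         * were shared by every instance */
};

/* Marking one engine guilty no longer clobbers its siblings. */
static inline void mark_gfx_guilty(struct sdma *s, int id)
{
        s->instance[id].gfx_guilty = true;
}

Each SDMA engine can now be marked guilty independently, which is what
the per-instance reset paths in the diff below rely on.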
drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.h
drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.h
index 8e7e794ff136f79414ab0968b8d69ff1e5295388..dc1a81c2f9af7a8246dc5da59a48f772af3daed7 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.h
@@ -65,6 +65,10 @@ struct amdgpu_sdma_instance {
        uint64_t                sdma_fw_gpu_addr;
        uint32_t                *sdma_fw_ptr;
        struct mutex            engine_reset_mutex;
+       /* track guilty state of GFX and PAGE queues */
+       bool                    gfx_guilty;
+       bool                    page_guilty;
+
 };
 
 enum amdgpu_sdma_ras_memory_id {
@@ -127,9 +131,6 @@ struct amdgpu_sdma {
        uint32_t                *ip_dump;
        uint32_t                supported_reset;
        struct list_head        reset_callback_list;
-       /* track guilty state of GFX and PAGE queues */
-       bool gfx_guilty;
-       bool page_guilty;
 };
 
 /*
diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c b/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c
index aedfd22b44456f317c29a81790eb467794d6dc49..ea2d8f28b126ad8b46162a5f8a16c8fd40a60c50 100644
--- a/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c
+++ b/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c
@@ -672,12 +672,11 @@ static uint32_t sdma_v4_4_2_rb_cntl(struct amdgpu_ring *ring, uint32_t rb_cntl)
  * @adev: amdgpu_device pointer
  * @i: instance to resume
  * @restore: whether to restore the wptr on restart
- * @guilty: boolean indicating whether this queue is the guilty one (caused the timeout/error)
  *
  * Set up the gfx DMA ring buffers and enable them.
  * Returns nothing.
  */
-static void sdma_v4_4_2_gfx_resume(struct amdgpu_device *adev, unsigned int i, bool restore, bool guilty)
+static void sdma_v4_4_2_gfx_resume(struct amdgpu_device *adev, unsigned int i, bool restore)
 {
        struct amdgpu_ring *ring = &adev->sdma.instance[i].ring;
        u32 rb_cntl, ib_cntl, wptr_poll_cntl;
@@ -714,7 +713,7 @@ static void sdma_v4_4_2_gfx_resume(struct amdgpu_device *adev, unsigned int i, b
        /* For the guilty queue, set RPTR to the current wptr to skip bad commands.
         * If it is not the guilty queue, restore cached_rptr and continue execution.
         */
-       if (guilty)
+       if (adev->sdma.instance[i].gfx_guilty)
                rwptr = ring->wptr;
        else
                rwptr = ring->cached_rptr;
@@ -779,12 +778,11 @@ static void sdma_v4_4_2_gfx_resume(struct amdgpu_device *adev, unsigned int i, b
  * @adev: amdgpu_device pointer
  * @i: instance to resume
  * @restore: whether a restore is needed
- * @guilty: boolean indicating whether this queue is the guilty one (caused the timeout/error)
  *
  * Set up the page DMA ring buffers and enable them.
  * Returns nothing.
  */
-static void sdma_v4_4_2_page_resume(struct amdgpu_device *adev, unsigned int i, bool restore, bool guilty)
+static void sdma_v4_4_2_page_resume(struct amdgpu_device *adev, unsigned int i, bool restore)
 {
        struct amdgpu_ring *ring = &adev->sdma.instance[i].page;
        u32 rb_cntl, ib_cntl, wptr_poll_cntl;
@@ -803,7 +801,7 @@ static void sdma_v4_4_2_page_resume(struct amdgpu_device *adev, unsigned int i,
        /* For the guilty queue, set RPTR to the current wptr to skip bad commands.
         * If it is not the guilty queue, restore cached_rptr and continue execution.
         */
-       if (guilty)
+       if (adev->sdma.instance[i].page_guilty)
                rwptr = ring->wptr;
        else
                rwptr = ring->cached_rptr;
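
Both resume paths apply the same recovery rule, now keyed off the
instance's own flag rather than a parameter threaded in from the caller.
A hedged sketch of just that selection, with simplified types
(pick_rwptr() is an illustrative helper, not a function in the driver):

#include <stdbool.h>
#include <stdint.h>

struct ring_state {
        uint64_t wptr;        /* write pointer at the time of the hang */
        uint64_t cached_rptr; /* read pointer cached before the reset */
};

/* Guilty queue: jump RPTR to WPTR so the offending commands are
 * skipped. Innocent queue: resume from the cached RPTR so no pending
 * work is lost. */
static uint64_t pick_rwptr(const struct ring_state *ring, bool guilty)
{
        return guilty ? ring->wptr : ring->cached_rptr;
}

This is also why the extra parameter could be dropped in v2:
sdma_v4_4_2_gfx_resume() and sdma_v4_4_2_page_resume() read
adev->sdma.instance[i].gfx_guilty and .page_guilty directly, so the call
sites in sdma_v4_4_2_inst_start() no longer pass device-level state.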
@@ -989,9 +987,9 @@ static int sdma_v4_4_2_inst_start(struct amdgpu_device *adev,
                uint32_t temp;
 
                WREG32_SDMA(i, regSDMA_SEM_WAIT_FAIL_TIMER_CNTL, 0);
-               sdma_v4_4_2_gfx_resume(adev, i, restore, adev->sdma.gfx_guilty);
+               sdma_v4_4_2_gfx_resume(adev, i, restore);
                if (adev->sdma.has_page_queue)
-                       sdma_v4_4_2_page_resume(adev, i, restore, adev->sdma.page_guilty);
+                       sdma_v4_4_2_page_resume(adev, i, restore);
 
                /* set utc l1 enable flag always to 1 */
                temp = RREG32_SDMA(i, regSDMA_CNTL);
@@ -1446,6 +1444,10 @@ static int sdma_v4_4_2_sw_init(struct amdgpu_ip_block *ip_block)
 
        for (i = 0; i < adev->sdma.num_instances; i++) {
                mutex_init(&adev->sdma.instance[i].engine_reset_mutex);
+               /* Initialize guilty flags for GFX and PAGE queues */
+               adev->sdma.instance[i].gfx_guilty = false;
+               adev->sdma.instance[i].page_guilty = false;
+
                ring = &adev->sdma.instance[i].ring;
                ring->ring_obj = NULL;
                ring->use_doorbell = true;
@@ -1507,9 +1509,6 @@ static int sdma_v4_4_2_sw_init(struct amdgpu_ip_block *ip_block)
        r = amdgpu_sdma_sysfs_reset_mask_init(adev);
        if (r)
                return r;
-       /* Initialize guilty flags for GFX and PAGE queues */
-       adev->sdma.gfx_guilty = false;
-       adev->sdma.page_guilty = false;
 
        return r;
 }
@@ -1689,9 +1688,11 @@ static int sdma_v4_4_2_stop_queue(struct amdgpu_device *adev, uint32_t instance_
                return -EINVAL;
 
        /* Check if this queue is the guilty one */
-       adev->sdma.gfx_guilty = sdma_v4_4_2_is_queue_selected(adev, instance_id, false);
+       adev->sdma.instance[instance_id].gfx_guilty =
+               sdma_v4_4_2_is_queue_selected(adev, instance_id, false);
        if (adev->sdma.has_page_queue)
-               adev->sdma.page_guilty = sdma_v4_4_2_is_queue_selected(adev, instance_id, true);
+               adev->sdma.instance[instance_id].page_guilty =
+                       sdma_v4_4_2_is_queue_selected(adev, instance_id, true);
 
        /* Cache the rptr before the reset; after the reset,
         * all of the registers will be reset to 0.
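
On the detection side, sdma_v4_4_2_stop_queue() now records guilt on the
instance being reset instead of on the device. A standalone sketch under
the same simplifications (is_queue_selected() is a stand-in for
sdma_v4_4_2_is_queue_selected(), which queries the hardware's
queue-selection status; the register read is faked here):

#include <stdbool.h>

struct sdma_instance {
        bool gfx_guilty;
        bool page_guilty;
};

/* Stand-in for the hardware query: page = false selects the GFX queue,
 * page = true the PAGE queue. A real driver would read the instance's
 * SDMA registers here. */
static bool is_queue_selected(int instance_id, bool page)
{
        (void)instance_id;
        (void)page;
        return false; /* placeholder result for the sketch */
}

static void record_guilty(struct sdma_instance *inst, int instance_id,
                          bool has_page_queue)
{
        /* Per-instance flags: resetting one engine leaves the
         * bookkeeping of every other instance untouched. */
        inst->gfx_guilty = is_queue_selected(instance_id, false);
        if (has_page_queue)
                inst->page_guilty = is_queue_selected(instance_id, true);
}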