]> www.infradead.org Git - users/dwmw2/linux.git/commitdiff
drm/amdgpu: Fix map/unmap queue logic
authorLijo Lazar <lijo.lazar@amd.com>
Tue, 5 Nov 2024 05:00:20 +0000 (10:30 +0530)
committerAlex Deucher <alexander.deucher@amd.com>
Fri, 8 Nov 2024 16:10:00 +0000 (11:10 -0500)
In current logic, it calls ring_alloc followed by a ring_test. ring_test
in turn will call another ring_alloc. This is illegal usage as a
ring_alloc is expected to be closed properly with a ring_commit. Change
to commit the map/unmap queue packet first followed by a ring_test. Add a
comment about the usage of ring_test.

Also, reorder the current pre-condition checks of job hang or kiq ring
scheduler not ready. Without them being met, it is not useful to attempt
ring or memory allocations.

Fixes tag refers to the original patch which introduced this issue which
then got carried over into newer code.

Signed-off-by: Lijo Lazar <lijo.lazar@amd.com>
Reviewed-by: Le Ma <le.ma@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
Fixes: 6c10b5cc4eaa ("drm/amdgpu: Remove duplicate code in gfx_v8_0.c")
drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c

index 24343c312480a064a7ac7a9bd97986416ff374c0..3afcd1e8aa543534808eacd78ff9d63a99bdf163 100644 (file)
@@ -834,6 +834,9 @@ int amdgpu_amdkfd_unmap_hiq(struct amdgpu_device *adev, u32 doorbell_off,
        if (!kiq->pmf || !kiq->pmf->kiq_unmap_queues)
                return -EINVAL;
 
+       if (!kiq_ring->sched.ready || adev->job_hang)
+               return 0;
+
        ring_funcs = kzalloc(sizeof(*ring_funcs), GFP_KERNEL);
        if (!ring_funcs)
                return -ENOMEM;
@@ -858,8 +861,14 @@ int amdgpu_amdkfd_unmap_hiq(struct amdgpu_device *adev, u32 doorbell_off,
 
        kiq->pmf->kiq_unmap_queues(kiq_ring, ring, RESET_QUEUES, 0, 0);
 
-       if (kiq_ring->sched.ready && !adev->job_hang)
-               r = amdgpu_ring_test_helper(kiq_ring);
+       /* Submit unmap queue packet */
+       amdgpu_ring_commit(kiq_ring);
+       /*
+        * Ring test will do a basic scratch register change check. Just run
+        * this to ensure that unmap queues that is submitted before got
+        * processed successfully before returning.
+        */
+       r = amdgpu_ring_test_helper(kiq_ring);
 
        spin_unlock(&kiq->ring_lock);
 
index 6cc6484bde06695899bef350bda408eb9a57e19e..5136cb3bdd4b21c78a34448b66a6f12845e41438 100644 (file)
@@ -515,6 +515,17 @@ int amdgpu_gfx_disable_kcq(struct amdgpu_device *adev, int xcc_id)
        if (!kiq->pmf || !kiq->pmf->kiq_unmap_queues)
                return -EINVAL;
 
+       if (!kiq_ring->sched.ready || adev->job_hang)
+               return 0;
+       /**
+        * This is workaround: only skip kiq_ring test
+        * during ras recovery in suspend stage for gfx9.4.3
+        */
+       if ((amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 3) ||
+            amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 4)) &&
+           amdgpu_ras_in_recovery(adev))
+               return 0;
+
        spin_lock(&kiq->ring_lock);
        if (amdgpu_ring_alloc(kiq_ring, kiq->pmf->unmap_queues_size *
                                        adev->gfx.num_compute_rings)) {
@@ -528,20 +539,15 @@ int amdgpu_gfx_disable_kcq(struct amdgpu_device *adev, int xcc_id)
                                           &adev->gfx.compute_ring[j],
                                           RESET_QUEUES, 0, 0);
        }
-
-       /**
-        * This is workaround: only skip kiq_ring test
-        * during ras recovery in suspend stage for gfx9.4.3
+       /* Submit unmap queue packet */
+       amdgpu_ring_commit(kiq_ring);
+       /*
+        * Ring test will do a basic scratch register change check. Just run
+        * this to ensure that unmap queues that is submitted before got
+        * processed successfully before returning.
         */
-       if ((amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 3) ||
-           amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 4)) &&
-           amdgpu_ras_in_recovery(adev)) {
-               spin_unlock(&kiq->ring_lock);
-               return 0;
-       }
+       r = amdgpu_ring_test_helper(kiq_ring);
 
-       if (kiq_ring->sched.ready && !adev->job_hang)
-               r = amdgpu_ring_test_helper(kiq_ring);
        spin_unlock(&kiq->ring_lock);
 
        return r;
@@ -569,8 +575,11 @@ int amdgpu_gfx_disable_kgq(struct amdgpu_device *adev, int xcc_id)
        if (!kiq->pmf || !kiq->pmf->kiq_unmap_queues)
                return -EINVAL;
 
-       spin_lock(&kiq->ring_lock);
+       if (!adev->gfx.kiq[0].ring.sched.ready || adev->job_hang)
+               return 0;
+
        if (amdgpu_gfx_is_master_xcc(adev, xcc_id)) {
+               spin_lock(&kiq->ring_lock);
                if (amdgpu_ring_alloc(kiq_ring, kiq->pmf->unmap_queues_size *
                                                adev->gfx.num_gfx_rings)) {
                        spin_unlock(&kiq->ring_lock);
@@ -583,11 +592,17 @@ int amdgpu_gfx_disable_kgq(struct amdgpu_device *adev, int xcc_id)
                                                   &adev->gfx.gfx_ring[j],
                                                   PREEMPT_QUEUES, 0, 0);
                }
-       }
+               /* Submit unmap queue packet */
+               amdgpu_ring_commit(kiq_ring);
 
-       if (adev->gfx.kiq[0].ring.sched.ready && !adev->job_hang)
+               /*
+                * Ring test will do a basic scratch register change check.
+                * Just run this to ensure that unmap queues that is submitted
+                * before got processed successfully before returning.
+                */
                r = amdgpu_ring_test_helper(kiq_ring);
-       spin_unlock(&kiq->ring_lock);
+               spin_unlock(&kiq->ring_lock);
+       }
 
        return r;
 }
@@ -692,7 +707,13 @@ int amdgpu_gfx_enable_kcq(struct amdgpu_device *adev, int xcc_id)
                kiq->pmf->kiq_map_queues(kiq_ring,
                                         &adev->gfx.compute_ring[j]);
        }
-
+       /* Submit map queue packet */
+       amdgpu_ring_commit(kiq_ring);
+       /*
+        * Ring test will do a basic scratch register change check. Just run
+        * this to ensure that map queues that is submitted before got
+        * processed successfully before returning.
+        */
        r = amdgpu_ring_test_helper(kiq_ring);
        spin_unlock(&kiq->ring_lock);
        if (r)
@@ -743,7 +764,13 @@ int amdgpu_gfx_enable_kgq(struct amdgpu_device *adev, int xcc_id)
                                                 &adev->gfx.gfx_ring[j]);
                }
        }
-
+       /* Submit map queue packet */
+       amdgpu_ring_commit(kiq_ring);
+       /*
+        * Ring test will do a basic scratch register change check. Just run
+        * this to ensure that map queues that is submitted before got
+        * processed successfully before returning.
+        */
        r = amdgpu_ring_test_helper(kiq_ring);
        spin_unlock(&kiq->ring_lock);
        if (r)
index 480c41ee947eb177ef7100acbff8cbc9572dd211..b7006c41e270b6d2900bf0e1f3499fa3cd53a52b 100644 (file)
@@ -4823,6 +4823,13 @@ static int gfx_v8_0_kcq_disable(struct amdgpu_device *adev)
                amdgpu_ring_write(kiq_ring, 0);
                amdgpu_ring_write(kiq_ring, 0);
        }
+       /* Submit unmap queue packet */
+       amdgpu_ring_commit(kiq_ring);
+       /*
+        * Ring test will do a basic scratch register change check. Just run
+        * this to ensure that unmap queues that is submitted before got
+        * processed successfully before returning.
+        */
        r = amdgpu_ring_test_helper(kiq_ring);
        if (r)
                DRM_ERROR("KCQ disable failed\n");