www.infradead.org Git - users/hch/dma-mapping.git/commitdiff
drm/amdgpu: let mode2 reset fallback to default when failure
author Victor Zhao <Victor.Zhao@amd.com>
Thu, 28 Jul 2022 02:39:23 +0000 (10:39 +0800)
committer Alex Deucher <alexander.deucher@amd.com>
Tue, 16 Aug 2022 22:14:31 +0000 (18:14 -0400)
- introduce AMDGPU_SKIP_MODE2_RESET flag
- let mode2 reset fallback to default reset method if failed

v2: move this part out from the asic specific part

Signed-off-by: Victor Zhao <Victor.Zhao@amd.com>
Acked-by: Andrey Grodzovsky <andrey.grodzovsky@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c
drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h
drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c

index 5e53a5293935622049ba44844b08e0fe7fffd434..091415a4abf02b203c019f36e504168062bbfb20 100644 (file)
@@ -135,6 +135,7 @@ static void amdgpu_amdkfd_reset_work(struct work_struct *work)
        reset_context.method = AMD_RESET_METHOD_NONE;
        reset_context.reset_req_dev = adev;
        clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
+       clear_bit(AMDGPU_SKIP_MODE2_RESET, &reset_context.flags);
 
        amdgpu_device_gpu_recover(adev, NULL, &reset_context);
 }
index e8a0b19b7398538411d42160315adde676c3ee55..c1ec4b653ca4955331b3a25d23855e737c77280f 100644 (file)
@@ -5148,6 +5148,7 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
 
        reset_context->job = job;
        reset_context->hive = hive;
+
        /*
         * Build list of devices to reset.
         * In case we are in XGMI hive mode, resort the device list
@@ -5267,8 +5268,11 @@ retry:   /* Rest of adevs pre asic reset from XGMI hive. */
                        amdgpu_ras_resume(adev);
        } else {
                r = amdgpu_do_asic_reset(device_list_handle, reset_context);
-               if (r && r == -EAGAIN)
+               if (r && r == -EAGAIN) {
+                       set_bit(AMDGPU_SKIP_MODE2_RESET, &reset_context->flags);
+                       adev->asic_reset_res = 0;
                        goto retry;
+               }
        }
 
 skip_hw_reset:
@@ -5699,6 +5703,7 @@ pci_ers_result_t amdgpu_pci_slot_reset(struct pci_dev *pdev)
        reset_context.reset_req_dev = adev;
        set_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
        set_bit(AMDGPU_SKIP_HW_RESET, &reset_context.flags);
+       set_bit(AMDGPU_SKIP_MODE2_RESET, &reset_context.flags);
 
        adev->no_hw_access = true;
        r = amdgpu_device_pre_asic_reset(adev, &reset_context);
index b1099ee79c50b0bce57a031683dafa7912ff6532..36f49da15800f699162c5a1609b90fb27f6e094c 100644 (file)
@@ -71,6 +71,7 @@ static enum drm_gpu_sched_stat amdgpu_job_timedout(struct drm_sched_job *s_job)
                reset_context.method = AMD_RESET_METHOD_NONE;
                reset_context.reset_req_dev = adev;
                clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
+               clear_bit(AMDGPU_SKIP_MODE2_RESET, &reset_context.flags);
 
                r = amdgpu_device_gpu_recover(ring->adev, job, &reset_context);
                if (r)
index ff5361f5c2d4f2746a3db6fc00877733434d6cc9..ab9ba5a9c33dbebbe146918ca5e4a5775f37b5dc 100644 (file)
@@ -1949,6 +1949,7 @@ static void amdgpu_ras_do_recovery(struct work_struct *work)
                reset_context.method = AMD_RESET_METHOD_NONE;
                reset_context.reset_req_dev = adev;
                clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
+               clear_bit(AMDGPU_SKIP_MODE2_RESET, &reset_context.flags);
 
                amdgpu_device_gpu_recover(ras->adev, NULL, &reset_context);
        }
index f778466bb9dbdf3311ebdf053aada97d94531f6d..831fb222139c6c232e4b9074d9e2047d80d18d69 100644 (file)
@@ -74,6 +74,9 @@ int amdgpu_reset_prepare_hwcontext(struct amdgpu_device *adev,
 {
        struct amdgpu_reset_handler *reset_handler = NULL;
 
+       if (test_bit(AMDGPU_SKIP_MODE2_RESET, &reset_context->flags))
+               return -ENOSYS;
+
        if (adev->reset_cntl && adev->reset_cntl->get_reset_handler)
                reset_handler = adev->reset_cntl->get_reset_handler(
                        adev->reset_cntl, reset_context);
@@ -90,6 +93,9 @@ int amdgpu_reset_perform_reset(struct amdgpu_device *adev,
        int ret;
        struct amdgpu_reset_handler *reset_handler = NULL;
 
+       if (test_bit(AMDGPU_SKIP_MODE2_RESET, &reset_context->flags))
+               return -ENOSYS;
+
        if (adev->reset_cntl)
                reset_handler = adev->reset_cntl->get_reset_handler(
                        adev->reset_cntl, reset_context);
index ffda1560c6481d6476fe0ee081224f2ba879bc6c..f71b83c425908e970f35b9bb54333fa8deee86aa 100644 (file)
@@ -30,6 +30,7 @@ enum AMDGPU_RESET_FLAGS {
 
        AMDGPU_NEED_FULL_RESET = 0,
        AMDGPU_SKIP_HW_RESET = 1,
+       AMDGPU_SKIP_MODE2_RESET = 2,
 };
 
 struct amdgpu_reset_context {
index 12906ba74462fb65669392bc826663e8fbb60d09..a2f04b24913299fecee52d837289467f501e357b 100644 (file)
@@ -290,6 +290,7 @@ flr_done:
                reset_context.method = AMD_RESET_METHOD_NONE;
                reset_context.reset_req_dev = adev;
                clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
+               clear_bit(AMDGPU_SKIP_MODE2_RESET, &reset_context.flags);
 
                amdgpu_device_gpu_recover(adev, NULL, &reset_context);
        }
index e07757eea7adf95bb43b1a330166b8e84a75468b..a977f0027928d0cd121cf65bae42051ae6442bc5 100644 (file)
@@ -317,6 +317,7 @@ flr_done:
                reset_context.method = AMD_RESET_METHOD_NONE;
                reset_context.reset_req_dev = adev;
                clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
+               clear_bit(AMDGPU_SKIP_MODE2_RESET, &reset_context.flags);
 
                amdgpu_device_gpu_recover(adev, NULL, &reset_context);
        }
index 288c414babdfa740b598ab49142666b2586beca1..fd14fa9b9cd7cb0fd8c85cca16bf14b50dcead20 100644 (file)
@@ -529,6 +529,7 @@ static void xgpu_vi_mailbox_flr_work(struct work_struct *work)
                reset_context.method = AMD_RESET_METHOD_NONE;
                reset_context.reset_req_dev = adev;
                clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
+               clear_bit(AMDGPU_SKIP_MODE2_RESET, &reset_context.flags);
 
                amdgpu_device_gpu_recover(adev, NULL, &reset_context);
        }