Sometimes a hung GPU causes multiple reset sources to schedule resets.
The second source will be able to trigger an unnecessary reset if it
schedules its reset after we call amdgpu_device_stop_pending_resets().
Move amdgpu_device_stop_pending_resets to after the reset is done. Since
at this point the GPU is supposedly in a good state, any reset scheduled
after this point would be a legitimate reset.
Remove the unnecessary and incorrect checks for amdgpu_in_reset() that
were loosely serving this purpose.
Signed-off-by: Yunxiang Li <Yunxiang.Li@amd.com>
Reviewed-by: Lijo Lazar <lijo.lazar@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
 retry:
        amdgpu_amdkfd_pre_reset(adev);
 
-       amdgpu_device_stop_pending_resets(adev);
-
        if (from_hypervisor)
                r = amdgpu_virt_request_full_gpu(adev, true);
        else
                                  r, adev_to_drm(tmp_adev)->unique);
                        tmp_adev->asic_reset_res = r;
                }
-
-               if (!amdgpu_sriov_vf(tmp_adev))
-                       /*
-                       * Drop all pending non scheduler resets. Scheduler resets
-                       * were already dropped during drm_sched_stop
-                       */
-                       amdgpu_device_stop_pending_resets(tmp_adev);
        }
 
        /* Actual ASIC resets if needed.*/
                        goto retry;
        }
 
+       list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
+               /*
+                * Drop any pending non scheduler resets queued before reset is done.
+                * Any reset scheduled after this point would be valid. Scheduler resets
+                * were already dropped during drm_sched_stop and no new ones can come
+                * in before drm_sched_start.
+                */
+               amdgpu_device_stop_pending_resets(tmp_adev);
+       }
+
 skip_hw_reset:
 
        /* Post ASIC reset for all devs .*/
 
        if (ret) {
                adev->virt.vf2pf_update_retry_cnt++;
                if ((adev->virt.vf2pf_update_retry_cnt >= AMDGPU_VF2PF_UPDATE_MAX_RETRY_LIMIT) &&
-                   amdgpu_sriov_runtime(adev) && !amdgpu_in_reset(adev)) {
+                   amdgpu_sriov_runtime(adev)) {
                        amdgpu_ras_set_fed(adev, true);
                        if (amdgpu_reset_domain_schedule(adev->reset_domain,
                                                          &adev->virt.flr_work))
 
 
        switch (event) {
                case IDH_FLR_NOTIFICATION:
-               if (amdgpu_sriov_runtime(adev) && !amdgpu_in_reset(adev))
+               if (amdgpu_sriov_runtime(adev))
                        WARN_ONCE(!amdgpu_reset_domain_schedule(adev->reset_domain,
                                                                &adev->virt.flr_work),
                                  "Failed to queue work! at %s",
 
 
        switch (event) {
        case IDH_FLR_NOTIFICATION:
-               if (amdgpu_sriov_runtime(adev) && !amdgpu_in_reset(adev))
+               if (amdgpu_sriov_runtime(adev))
                        WARN_ONCE(!amdgpu_reset_domain_schedule(adev->reset_domain,
                                   &adev->virt.flr_work),
                                  "Failed to queue work! at %s",
 
                r = xgpu_vi_mailbox_rcv_msg(adev, IDH_FLR_NOTIFICATION);
 
                /* only handle FLR_NOTIFY now */
-               if (!r && !amdgpu_in_reset(adev))
+               if (!r)
                        WARN_ONCE(!amdgpu_reset_domain_schedule(adev->reset_domain,
                                                                &adev->virt.flr_work),
                                  "Failed to queue work! at %s",