www.infradead.org Git - users/hch/misc.git/commitdiff
drm/amdgpu: Report individual reset error
author: Lijo Lazar <lijo.lazar@amd.com>
Mon, 6 Oct 2025 05:09:03 +0000 (10:39 +0530)
committer: Alex Deucher <alexander.deucher@amd.com>
Tue, 7 Oct 2025 18:09:19 +0000 (14:09 -0400)
If reinitialization of one of the GPUs fails after reset, a failure is
logged for all subsequent GPUs even though they have resumed
successfully.

A sample log where only the device at 0000:95:00.0 had a failure:

amdgpu 0000:15:00.0: amdgpu: GPU reset(19) succeeded!
amdgpu 0000:65:00.0: amdgpu: GPU reset(19) succeeded!
amdgpu 0000:75:00.0: amdgpu: GPU reset(19) succeeded!
amdgpu 0000:85:00.0: amdgpu: GPU reset(19) succeeded!
amdgpu 0000:95:00.0: amdgpu: GPU reset(19) failed
amdgpu 0000:e5:00.0: amdgpu: GPU reset(19) failed
amdgpu 0000:f5:00.0: amdgpu: GPU reset(19) failed
amdgpu 0000:05:00.0: amdgpu: GPU reset(19) failed
amdgpu 0000:15:00.0: amdgpu: GPU reset end with ret = -5

To avoid confusion, report the error for each device
separately and return the first error as the overall result.

Signed-off-by: Lijo Lazar <lijo.lazar@amd.com>
Reviewed-by: Asad Kamal <asad.kamal@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
drivers/gpu/drm/amd/amdgpu/amdgpu_device.c

index 929936c8d87caa120864d9a470ef3ada1cc9cac1..7a899fb4de29cb02f9a6ce267973cbaf8a0a22b7 100644 (file)
@@ -6389,23 +6389,28 @@ static int amdgpu_device_sched_resume(struct list_head *device_list,
                if (!drm_drv_uses_atomic_modeset(adev_to_drm(tmp_adev)) && !job_signaled)
                        drm_helper_resume_force_mode(adev_to_drm(tmp_adev));
 
-               if (tmp_adev->asic_reset_res)
-                       r = tmp_adev->asic_reset_res;
-
-               tmp_adev->asic_reset_res = 0;
-
-               if (r) {
+               if (tmp_adev->asic_reset_res) {
                        /* bad news, how to tell it to userspace ?
                         * for ras error, we should report GPU bad status instead of
                         * reset failure
                         */
                        if (reset_context->src != AMDGPU_RESET_SRC_RAS ||
                            !amdgpu_ras_eeprom_check_err_threshold(tmp_adev))
-                               dev_info(tmp_adev->dev, "GPU reset(%d) failed\n",
-                                       atomic_read(&tmp_adev->gpu_reset_counter));
-                       amdgpu_vf_error_put(tmp_adev, AMDGIM_ERROR_VF_GPU_RESET_FAIL, 0, r);
+                               dev_info(
+                                       tmp_adev->dev,
+                                       "GPU reset(%d) failed with error %d \n",
+                                       atomic_read(
+                                               &tmp_adev->gpu_reset_counter),
+                                       tmp_adev->asic_reset_res);
+                       amdgpu_vf_error_put(tmp_adev,
+                                           AMDGIM_ERROR_VF_GPU_RESET_FAIL, 0,
+                                           tmp_adev->asic_reset_res);
+                       if (!r)
+                               r = tmp_adev->asic_reset_res;
+                       tmp_adev->asic_reset_res = 0;
                } else {
-                       dev_info(tmp_adev->dev, "GPU reset(%d) succeeded!\n", atomic_read(&tmp_adev->gpu_reset_counter));
+                       dev_info(tmp_adev->dev, "GPU reset(%d) succeeded!\n",
+                                atomic_read(&tmp_adev->gpu_reset_counter));
                        if (amdgpu_acpi_smart_shift_update(tmp_adev,
                                                           AMDGPU_SS_DEV_D0))
                                dev_warn(tmp_adev->dev,