www.infradead.org Git - users/dwmw2/linux.git/commitdiff
drm/amdgpu: Do a basic health check before reset
author: Lijo Lazar <lijo.lazar@amd.com>
Wed, 13 Mar 2024 09:37:10 +0000 (15:07 +0530)
committer: Alex Deucher <alexander.deucher@amd.com>
Wed, 20 Mar 2024 17:37:38 +0000 (13:37 -0400)
Check if the device is present on the bus before trying to recover. It
could be that the device itself is lost from the bus in some hang
situations.

Signed-off-by: Lijo Lazar <lijo.lazar@amd.com>
Reviewed-by: Asad Kamal <asad.kamal@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
drivers/gpu/drm/amd/amdgpu/amdgpu_device.c

index 5dc24c971b41f0a93c7463fbbf397d33e08c5563..efb3b7e74b80202bb8c1333225fc0438842140ff 100644 (file)
@@ -5532,6 +5532,23 @@ static inline void amdgpu_device_stop_pending_resets(struct amdgpu_device *adev)
 
 }
 
+/**
+ * amdgpu_device_health_check - check that devices still respond on the bus
+ *
+ * @device_list_handle: list of devices, linked through reset_list, to check
+ *
+ * Reads the config dword at PCI_COMMAND for every device on the list and
+ * treats a value flagged by PCI_POSSIBLE_ERROR() as the device being lost
+ * from the bus. The walk does not stop at the first failure, so every lost
+ * device gets its own log message.
+ *
+ * Returns 0 if no device was flagged, -ENODEV if at least one was.
+ */
+static int amdgpu_device_health_check(struct list_head *device_list_handle)
+{
+       struct amdgpu_device *tmp_adev;
+       int ret = 0;
+       u32 status;
+
+       list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
+               /*
+                * The return code of the config read is not checked here;
+                * only the value read back is inspected.
+                */
+               pci_read_config_dword(tmp_adev->pdev, PCI_COMMAND, &status);
+               if (PCI_POSSIBLE_ERROR(status)) {
+                       /* Terminate the message so the log record is complete. */
+                       dev_err(tmp_adev->dev, "device lost from bus!\n");
+                       ret = -ENODEV;
+               }
+       }
+
+       return ret;
+}
+
 /**
  * amdgpu_device_gpu_recover - reset the asic and recover scheduler
  *
@@ -5603,6 +5620,12 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
                device_list_handle = &device_list;
        }
 
+       if (!amdgpu_sriov_vf(adev)) {
+               r = amdgpu_device_health_check(device_list_handle);
+               if (r)
+                       goto end_reset;
+       }
+
        /* We need to lock reset domain only once both for XGMI and single device */
        tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device,
                                    reset_list);
@@ -5768,6 +5791,7 @@ skip_sched_resume:
                                            reset_list);
        amdgpu_device_unlock_reset_domain(tmp_adev->reset_domain);
 
+end_reset:
        if (hive) {
                mutex_unlock(&hive->hive_lock);
                amdgpu_put_xgmi_hive(hive);