drm/amdgpu: timely save bad pages to eeprom after gpu ras reset is completed

author YiPeng Chai <YiPeng.Chai@amd.com>

Tue, 2 Jul 2024 09:53:02 +0000 (17:53 +0800)

committer Alex Deucher <alexander.deucher@amd.com>

Wed, 10 Jul 2024 14:13:41 +0000 (10:13 -0400)
author YiPeng Chai <YiPeng.Chai@amd.com>
Tue, 2 Jul 2024 09:53:02 +0000 (17:53 +0800)
committer Alex Deucher <alexander.deucher@amd.com>
Wed, 10 Jul 2024 14:13:41 +0000 (10:13 -0400)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c

index 64bee125f17a0d10ff7b3e5c0bda61303bea7254..d0307c55da5092227992ff215bcf8c712f3a2d48 100644 (file)
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -2934,8 +2934,12 @@ static void amdgpu_ras_do_page_retirement(struct work_struct *work)
         struct ras_err_data err_data;
         unsigned long err_cnt;
  
-       if (amdgpu_in_reset(adev) || amdgpu_ras_in_recovery(adev))
+       /* If gpu reset is ongoing, delay retiring the bad pages */
+       if (amdgpu_in_reset(adev) || amdgpu_ras_in_recovery(adev)) {
+               amdgpu_ras_schedule_retirement_dwork(con,
+                               AMDGPU_RAS_RETIRE_PAGE_INTERVAL * 3);
                 return;
+       }
  
         amdgpu_ras_error_data_init(&err_data);
  
diff --git a/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c b/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c

index 0faa21d8a7b44ff628ef9d8d71fb8b535eadf2de..9dbb13adb6613d4abadc5d93aaaac0f5657501fc 100644 (file)
--- a/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c
@@ -29,6 +29,7 @@
  #include "mp/mp_13_0_6_sh_mask.h"
  
  #define MAX_ECC_NUM_PER_RETIREMENT  32
+#define DELAYED_TIME_FOR_GPU_RESET  1000  //ms
  
  static inline uint64_t get_umc_v12_0_reg_offset(struct amdgpu_device *adev,
                                             uint32_t node_inst,
@@ -568,6 +569,23 @@ static int umc_v12_0_update_ecc_status(struct amdgpu_device *adev,
  
         con->umc_ecc_log.de_queried_count++;
  
+       /* The problem case is as follows:
+        * 1. GPU A triggers a gpu ras reset, and GPU A drives
+        *    GPU B to also perform a gpu ras reset.
+        * 2. After GPU B's ras reset has started, GPU B queries
+        *    DE data. Since the DE data is queried in the ras reset
+        *    thread instead of the page retirement thread, bad
+        *    page retirement work is not triggered. Then, even
+        *    after all gpu resets are completed, the bad pages
+        *    stay cached in RAM until GPU B's bad page retirement
+        *    work is triggered again and saves them to eeprom.
+        * Trigger delayed work to save the bad pages to eeprom in time
+        * after gpu ras reset is completed.
+        */
+       if (amdgpu_ras_in_recovery(adev))
+               schedule_delayed_work(&con->page_retirement_dwork,
+                       msecs_to_jiffies(DELAYED_TIME_FOR_GPU_RESET));
+
         return 0;
  }
author	YiPeng Chai <YiPeng.Chai@amd.com>
	Tue, 2 Jul 2024 09:53:02 +0000 (17:53 +0800)
committer	Alex Deucher <alexander.deucher@amd.com>
	Wed, 10 Jul 2024 14:13:41 +0000 (10:13 -0400)
drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c		patch \| blob \| history
drivers/gpu/drm/amd/amdgpu/umc_v12_0.c		patch \| blob \| history