drm/amdgpu: do RAS MCA2PA conversion in device init phase
author     Tao Zhou <tao.zhou1@amd.com>
           Fri, 18 Oct 2024 06:49:00 +0000 (14:49 +0800)
committer  Alex Deucher <alexander.deucher@amd.com>
           Tue, 10 Dec 2024 15:26:46 +0000 (10:26 -0500)
With the introduction of NPS mode, the memory physical address (PA)
corresponding to an MCA address varies per NPS mode. We need to rely on
the MCA address and convert it into a PA according to the current NPS
mode.

Signed-off-by: Tao Zhou <tao.zhou1@amd.com>
Reviewed-by: Hawking Zhang <Hawking.Zhang@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
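
For illustration, a minimal sketch of how the new path is meant to be
driven: the caller below is hypothetical (it is not part of this patch)
and only shows the intended data flow, in which records loaded from the
bad-page EEPROM at device init still carry MCA addresses, and
amdgpu_ras_add_bad_pages() expands each of them into the physical pages
for the current NPS mode:

    /* hypothetical init-time caller, for illustration only */
    static int example_restore_bad_pages(struct amdgpu_device *adev,
                                         struct eeprom_table_record *records,
                                         int count)
    {
            /* each MCA-format record is converted into up to
             * adev->umc.retire_unit physical pages internally via
             * amdgpu_ras_mca2pa() before being reserved and tracked
             */
            return amdgpu_ras_add_bad_pages(adev, records, count);
    }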
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index e7eccc30c6920d89d09748926c7121f70da84e5d..f22242ab240727c642dc4dd86e2a69f5616270fa 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -2717,40 +2717,110 @@ static int amdgpu_ras_realloc_eh_data_space(struct amdgpu_device *adev,
        return 0;
 }
 
+static int amdgpu_ras_mca2pa(struct amdgpu_device *adev,
+                       struct eeprom_table_record *bps,
+                       struct ras_err_data *err_data)
+{
+       struct ta_ras_query_address_input addr_in;
+       uint32_t socket = 0;
+       int ret = 0;
+
+       if (adev->smuio.funcs && adev->smuio.funcs->get_socket_id)
+               socket = adev->smuio.funcs->get_socket_id(adev);
+
+       /* reinit err_data */
+       err_data->err_addr_cnt = 0;
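+       /* a single MCA address can map to up to retire_unit page records */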
+       err_data->err_addr_len = adev->umc.retire_unit;
+
+       memset(&addr_in, 0, sizeof(addr_in));
+       addr_in.ma.err_addr = bps->address;
+       addr_in.ma.socket_id = socket;
+       addr_in.ma.ch_inst = bps->mem_channel;
+       /* tell RAS TA the node instance is not used */
+       addr_in.ma.node_inst = TA_RAS_INV_NODE;
+
+       if (adev->umc.ras && adev->umc.ras->convert_ras_err_addr)
+               ret = adev->umc.ras->convert_ras_err_addr(adev, err_data,
+                               &addr_in, NULL, false);
+
+       return ret;
+}
+
 /* it deals with vram only. */
 int amdgpu_ras_add_bad_pages(struct amdgpu_device *adev,
                struct eeprom_table_record *bps, int pages)
 {
        struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
        struct ras_err_handler_data *data;
+       struct ras_err_data err_data;
+       struct eeprom_table_record *err_rec;
        int ret = 0;
-       uint32_t i;
+       uint32_t i, j, loop_cnt = 1;
+       bool is_mca_add = true;
 
        if (!con || !con->eh_data || !bps || pages <= 0)
                return 0;
 
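+       /* only treat the records as MCA addresses if the UMC block can
+        * convert them; consecutive entries that already share one MCA
+        * address and channel were saved as per-page records and need
+        * no conversion
+        */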
+       if (!adev->umc.ras || !adev->umc.ras->convert_ras_err_addr) {
+               is_mca_add = false;
+       } else {
+               if ((pages > 1) &&
+                   (bps[0].address == bps[1].address) &&
+                   (bps[0].mem_channel == bps[1].mem_channel))
+                       is_mca_add = false;
+       }
+
        mutex_lock(&con->recovery_lock);
        data = con->eh_data;
        if (!data)
                goto out;
 
-       for (i = 0; i < pages; i++) {
-               if (amdgpu_ras_check_bad_page_unlock(con,
-                       bps[i].retired_page << AMDGPU_GPU_PAGE_SHIFT))
-                       continue;
-
-               if (!data->space_left &&
-                       amdgpu_ras_realloc_eh_data_space(adev, data, 256)) {
+       if (is_mca_add) {
+               err_data.err_addr =
+                       kcalloc(adev->umc.retire_unit,
+                               sizeof(struct eeprom_table_record), GFP_KERNEL);
+               if (!err_data.err_addr) {
+                       dev_warn(adev->dev, "Failed to alloc UMC error address record in mca2pa conversion!\n");
                        ret = -ENOMEM;
                        goto out;
                }
 
-               amdgpu_ras_reserve_page(adev, bps[i].retired_page);
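+               /* each MCA record expands into retire_unit retired pages */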
+               loop_cnt = adev->umc.retire_unit;
+       }
+
+       for (i = 0; i < pages; i++) {
+               if (is_mca_add) {
+                       if (amdgpu_ras_mca2pa(adev, &bps[i], &err_data))
+                               goto free;
+
+                       err_rec = err_data.err_addr;
+               } else {
+                       err_rec = &bps[i];
+               }
 
-               memcpy(&data->bps[data->count], &bps[i], sizeof(*data->bps));
-               data->count++;
-               data->space_left--;
+               for (j = 0; j < loop_cnt; j++) {
+                       if (amdgpu_ras_check_bad_page_unlock(con,
+                               err_rec[j].retired_page << AMDGPU_GPU_PAGE_SHIFT))
+                               continue;
+
+                       if (!data->space_left &&
+                           amdgpu_ras_realloc_eh_data_space(adev, data, 256)) {
+                               ret = -ENOMEM;
+                               goto free;
+                       }
+
+                       amdgpu_ras_reserve_page(adev, err_rec[j].retired_page);
+
+                       memcpy(&data->bps[data->count], &(err_rec[j]),
+                                       sizeof(struct eeprom_table_record));
+                       data->count++;
+                       data->space_left--;
+               }
        }
+
+free:
+       if (is_mca_add)
+               kfree(err_data.err_addr);
 out:
        mutex_unlock(&con->recovery_lock);