]> www.infradead.org Git - users/hch/misc.git/commitdiff
drm/amdgpu: format old RAS eeprom data into V3 version
authorTao Zhou <tao.zhou1@amd.com>
Tue, 4 Mar 2025 07:05:53 +0000 (15:05 +0800)
committerAlex Deucher <alexander.deucher@amd.com>
Tue, 18 Mar 2025 18:03:38 +0000 (14:03 -0400)
Clear old data and save it in V3 format.

v2: only format eeprom data for new ASICs.

Signed-off-by: Tao Zhou <tao.zhou1@amd.com>
Reviewed-by: Hawking Zhang <Hawking.Zhang@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h

index cfec29835634c10db736a6b83c9e460c5267a2e9..bed2603ae4c4386cfa645daee820fb317022c141 100644 (file)
@@ -3473,6 +3473,13 @@ int amdgpu_ras_init_badpage_info(struct amdgpu_device *adev)
                                adev, control->bad_channel_bitmap);
                        con->update_channel_flag = false;
                }
+
+               /* The format action is only applied to new ASICs */
+               if (IP_VERSION_MAJ(amdgpu_ip_version(adev, UMC_HWIP, 0)) >= 12 &&
+                   control->tbl_hdr.version < RAS_TABLE_VER_V3)
+                       if (!amdgpu_ras_eeprom_reset_table(control))
+                               if (amdgpu_ras_save_bad_pages(adev, NULL))
+                                       dev_warn(adev->dev, "Failed to format RAS EEPROM data in V3 version!\n");
        }
 
        return ret;
index 3597ecd9baca34ceacc1120397d672c8c214a0a4..f6c0c7d3a19d4206688a8dbc24ee73aee56e5ee7 100644 (file)
@@ -413,9 +413,11 @@ static void amdgpu_ras_set_eeprom_table_version(struct amdgpu_ras_eeprom_control
 
        switch (amdgpu_ip_version(adev, UMC_HWIP, 0)) {
        case IP_VERSION(8, 10, 0):
-       case IP_VERSION(12, 0, 0):
                hdr->version = RAS_TABLE_VER_V2_1;
                return;
+       case IP_VERSION(12, 0, 0):
+               hdr->version = RAS_TABLE_VER_V3;
+               return;
        default:
                hdr->version = RAS_TABLE_VER_V1;
                return;
@@ -443,7 +445,7 @@ int amdgpu_ras_eeprom_reset_table(struct amdgpu_ras_eeprom_control *control)
        hdr->header = RAS_TABLE_HDR_VAL;
        amdgpu_ras_set_eeprom_table_version(control);
 
-       if (hdr->version == RAS_TABLE_VER_V2_1) {
+       if (hdr->version >= RAS_TABLE_VER_V2_1) {
                hdr->first_rec_offset = RAS_RECORD_START_V2_1;
                hdr->tbl_size = RAS_TABLE_HEADER_SIZE +
                                RAS_TABLE_V2_1_INFO_SIZE;
@@ -461,7 +463,7 @@ int amdgpu_ras_eeprom_reset_table(struct amdgpu_ras_eeprom_control *control)
        }
 
        csum = __calc_hdr_byte_sum(control);
-       if (hdr->version == RAS_TABLE_VER_V2_1)
+       if (hdr->version >= RAS_TABLE_VER_V2_1)
                csum += __calc_ras_info_byte_sum(control);
        csum = -csum;
        hdr->checksum = csum;
@@ -757,7 +759,7 @@ amdgpu_ras_eeprom_update_header(struct amdgpu_ras_eeprom_control *control)
                        "Saved bad pages %d reaches threshold value %d\n",
                        control->ras_num_bad_pages, ras->bad_page_cnt_threshold);
                control->tbl_hdr.header = RAS_TABLE_HDR_BAD;
-               if (control->tbl_hdr.version == RAS_TABLE_VER_V2_1) {
+               if (control->tbl_hdr.version >= RAS_TABLE_VER_V2_1) {
                        control->tbl_rai.rma_status = GPU_RETIRED__ECC_REACH_THRESHOLD;
                        control->tbl_rai.health_percent = 0;
                }
@@ -770,7 +772,7 @@ amdgpu_ras_eeprom_update_header(struct amdgpu_ras_eeprom_control *control)
                amdgpu_dpm_send_rma_reason(adev);
        }
 
-       if (control->tbl_hdr.version == RAS_TABLE_VER_V2_1)
+       if (control->tbl_hdr.version >= RAS_TABLE_VER_V2_1)
                control->tbl_hdr.tbl_size = RAS_TABLE_HEADER_SIZE +
                                            RAS_TABLE_V2_1_INFO_SIZE +
                                            control->ras_num_recs * RAS_TABLE_RECORD_SIZE;
@@ -810,7 +812,7 @@ amdgpu_ras_eeprom_update_header(struct amdgpu_ras_eeprom_control *control)
         * now calculate gpu health percent
         */
        if (amdgpu_bad_page_threshold != 0 &&
-           control->tbl_hdr.version == RAS_TABLE_VER_V2_1 &&
+           control->tbl_hdr.version >= RAS_TABLE_VER_V2_1 &&
            control->ras_num_bad_pages <= ras->bad_page_cnt_threshold)
                control->tbl_rai.health_percent = ((ras->bad_page_cnt_threshold -
                                                   control->ras_num_bad_pages) * 100) /
@@ -823,7 +825,7 @@ amdgpu_ras_eeprom_update_header(struct amdgpu_ras_eeprom_control *control)
                csum += *pp;
 
        csum += __calc_hdr_byte_sum(control);
-       if (control->tbl_hdr.version == RAS_TABLE_VER_V2_1)
+       if (control->tbl_hdr.version >= RAS_TABLE_VER_V2_1)
                csum += __calc_ras_info_byte_sum(control);
        /* avoid sign extension when assigning to "checksum" */
        csum = -csum;
@@ -1040,7 +1042,7 @@ uint32_t amdgpu_ras_eeprom_max_record_count(struct amdgpu_ras_eeprom_control *co
        /* get available eeprom table version first before eeprom table init */
        amdgpu_ras_set_eeprom_table_version(control);
 
-       if (control->tbl_hdr.version == RAS_TABLE_VER_V2_1)
+       if (control->tbl_hdr.version >= RAS_TABLE_VER_V2_1)
                return RAS_MAX_RECORD_COUNT_V2_1;
        else
                return RAS_MAX_RECORD_COUNT;
@@ -1285,7 +1287,7 @@ static int __verify_ras_table_checksum(struct amdgpu_ras_eeprom_control *control
        int buf_size, res;
        u8  csum, *buf, *pp;
 
-       if (control->tbl_hdr.version == RAS_TABLE_VER_V2_1)
+       if (control->tbl_hdr.version >= RAS_TABLE_VER_V2_1)
                buf_size = RAS_TABLE_HEADER_SIZE +
                           RAS_TABLE_V2_1_INFO_SIZE +
                           control->ras_num_recs * RAS_TABLE_RECORD_SIZE;
@@ -1388,7 +1390,7 @@ int amdgpu_ras_eeprom_init(struct amdgpu_ras_eeprom_control *control)
 
        __decode_table_header_from_buf(hdr, buf);
 
-       if (hdr->version == RAS_TABLE_VER_V2_1) {
+       if (hdr->version >= RAS_TABLE_VER_V2_1) {
                control->ras_num_recs = RAS_NUM_RECS_V2_1(hdr);
                control->ras_record_offset = RAS_RECORD_START_V2_1;
                control->ras_max_record_count = RAS_MAX_RECORD_COUNT_V2_1;
@@ -1428,7 +1430,7 @@ int amdgpu_ras_eeprom_check(struct amdgpu_ras_eeprom_control *control)
                DRM_DEBUG_DRIVER("Found existing EEPROM table with %d records",
                                 control->ras_num_bad_pages);
 
-               if (hdr->version == RAS_TABLE_VER_V2_1) {
+               if (hdr->version >= RAS_TABLE_VER_V2_1) {
                        res = __read_table_ras_info(control);
                        if (res)
                                return res;
@@ -1448,7 +1450,7 @@ int amdgpu_ras_eeprom_check(struct amdgpu_ras_eeprom_control *control)
                                        ras->bad_page_cnt_threshold);
        } else if (hdr->header == RAS_TABLE_HDR_BAD &&
                   amdgpu_bad_page_threshold != 0) {
-               if (hdr->version == RAS_TABLE_VER_V2_1) {
+               if (hdr->version >= RAS_TABLE_VER_V2_1) {
                        res = __read_table_ras_info(control);
                        if (res)
                                return res;
index 13f7eda9a6960517e3e3a383ddd443829c66b628..ec6d7ea37ad071d102e25277e2562f059b808f8e 100644 (file)
@@ -28,6 +28,7 @@
 
 #define RAS_TABLE_VER_V1           0x00010000
 #define RAS_TABLE_VER_V2_1         0x00021000
+#define RAS_TABLE_VER_V3           0x00030000
 
 struct amdgpu_device;