www.infradead.org Git - users/dwmw2/linux.git/commitdiff
drm/amdgpu: add ras event id support
author Yang Wang <kevinyang.wang@amd.com>
Wed, 13 Mar 2024 04:50:43 +0000 (12:50 +0800)
committer Alex Deucher <alexander.deucher@amd.com>
Wed, 20 Mar 2024 17:38:13 +0000 (13:38 -0400)
Add amdgpu RAS event id support to better distinguish the different
sources of error information in dmesg logs.

The following log messages will be identified by event id:
{event_id} interrupt to inform RAS event
{event_id} ACA logs
{event_id} error statistics since the current injection/error query
{event_id} error statistics since GPU load

Signed-off-by: Yang Wang <kevinyang.wang@amd.com>
Reviewed-by: Hawking Zhang <Hawking.Zhang@amd.com>
Reviewed-by: Tao Zhou <tao.zhou1@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
drivers/gpu/drm/amd/amdgpu/amdgpu_mca.c
drivers/gpu/drm/amd/amdgpu/amdgpu_mca.h
drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h
drivers/gpu/drm/amd/amdgpu/umc_v12_0.c

index 24ad4b97177b5cff7840cb434115f61018f09f74..0734490347db5918c526107d91568e6219f61410 100644 (file)
@@ -210,22 +210,26 @@ int amdgpu_mca_smu_set_debug_mode(struct amdgpu_device *adev, bool enable)
        return -EOPNOTSUPP;
 }
 
-static void amdgpu_mca_smu_mca_bank_dump(struct amdgpu_device *adev, int idx, struct mca_bank_entry *entry)
+static void amdgpu_mca_smu_mca_bank_dump(struct amdgpu_device *adev, int idx, struct mca_bank_entry *entry,
+                                        struct ras_query_context *qctx)
 {
-       dev_info(adev->dev, HW_ERR "Accelerator Check Architecture events logged\n");
-       dev_info(adev->dev, HW_ERR "aca entry[%02d].STATUS=0x%016llx\n",
-                idx, entry->regs[MCA_REG_IDX_STATUS]);
-       dev_info(adev->dev, HW_ERR "aca entry[%02d].ADDR=0x%016llx\n",
-                idx, entry->regs[MCA_REG_IDX_ADDR]);
-       dev_info(adev->dev, HW_ERR "aca entry[%02d].MISC0=0x%016llx\n",
-                idx, entry->regs[MCA_REG_IDX_MISC0]);
-       dev_info(adev->dev, HW_ERR "aca entry[%02d].IPID=0x%016llx\n",
-                idx, entry->regs[MCA_REG_IDX_IPID]);
-       dev_info(adev->dev, HW_ERR "aca entry[%02d].SYND=0x%016llx\n",
-                idx, entry->regs[MCA_REG_IDX_SYND]);
+       u64 event_id = qctx->event_id;
+
+       RAS_EVENT_LOG(adev, event_id, HW_ERR "Accelerator Check Architecture events logged\n");
+       RAS_EVENT_LOG(adev, event_id, HW_ERR "aca entry[%02d].STATUS=0x%016llx\n",
+                     idx, entry->regs[MCA_REG_IDX_STATUS]);
+       RAS_EVENT_LOG(adev, event_id, HW_ERR "aca entry[%02d].ADDR=0x%016llx\n",
+                     idx, entry->regs[MCA_REG_IDX_ADDR]);
+       RAS_EVENT_LOG(adev, event_id, HW_ERR "aca entry[%02d].MISC0=0x%016llx\n",
+                     idx, entry->regs[MCA_REG_IDX_MISC0]);
+       RAS_EVENT_LOG(adev, event_id, HW_ERR "aca entry[%02d].IPID=0x%016llx\n",
+                     idx, entry->regs[MCA_REG_IDX_IPID]);
+       RAS_EVENT_LOG(adev, event_id, HW_ERR "aca entry[%02d].SYND=0x%016llx\n",
+                     idx, entry->regs[MCA_REG_IDX_SYND]);
 }
 
-int amdgpu_mca_smu_log_ras_error(struct amdgpu_device *adev, enum amdgpu_ras_block blk, enum amdgpu_mca_error_type type, struct ras_err_data *err_data)
+int amdgpu_mca_smu_log_ras_error(struct amdgpu_device *adev, enum amdgpu_ras_block blk, enum amdgpu_mca_error_type type,
+                                struct ras_err_data *err_data, struct ras_query_context *qctx)
 {
        struct amdgpu_smuio_mcm_config_info mcm_info;
        struct ras_err_addr err_addr = {0};
@@ -244,7 +248,7 @@ int amdgpu_mca_smu_log_ras_error(struct amdgpu_device *adev, enum amdgpu_ras_blo
        list_for_each_entry(node, &mca_set.list, node) {
                entry = &node->entry;
 
-               amdgpu_mca_smu_mca_bank_dump(adev, i++, entry);
+               amdgpu_mca_smu_mca_bank_dump(adev, i++, entry, qctx);
 
                count = 0;
                ret = amdgpu_mca_smu_parse_mca_error_count(adev, blk, type, entry, &count);
index b964110ed1e05e4f1a55e2659837fe1c3cb601af..e5bf07ce3451a174743edb360582fb58a61ded5c 100644 (file)
@@ -169,6 +169,7 @@ void amdgpu_mca_smu_debugfs_init(struct amdgpu_device *adev, struct dentry *root
 void amdgpu_mca_bank_set_init(struct mca_bank_set *mca_set);
 int amdgpu_mca_bank_set_add_entry(struct mca_bank_set *mca_set, struct mca_bank_entry *entry);
 void amdgpu_mca_bank_set_release(struct mca_bank_set *mca_set);
-int amdgpu_mca_smu_log_ras_error(struct amdgpu_device *adev, enum amdgpu_ras_block blk, enum amdgpu_mca_error_type type, struct ras_err_data *err_data);
+int amdgpu_mca_smu_log_ras_error(struct amdgpu_device *adev, enum amdgpu_ras_block blk, enum amdgpu_mca_error_type type,
+                                struct ras_err_data *err_data, struct ras_query_context *qctx);
 
 #endif
index 8ebab6f22e5a59079603acd27d8e50c766f12321..26662c76b293d313d53b2928ce91c96e87562f21 100644 (file)
@@ -1045,6 +1045,7 @@ static void amdgpu_ras_get_ecc_info(struct amdgpu_device *adev, struct ras_err_d
 static void amdgpu_ras_error_print_error_data(struct amdgpu_device *adev,
                                              struct ras_manager *ras_mgr,
                                              struct ras_err_data *err_data,
+                                             struct ras_query_context *qctx,
                                              const char *blk_name,
                                              bool is_ue,
                                              bool is_de)
@@ -1052,27 +1053,28 @@ static void amdgpu_ras_error_print_error_data(struct amdgpu_device *adev,
        struct amdgpu_smuio_mcm_config_info *mcm_info;
        struct ras_err_node *err_node;
        struct ras_err_info *err_info;
+       u64 event_id = qctx->event_id;
 
        if (is_ue) {
                for_each_ras_error(err_node, err_data) {
                        err_info = &err_node->err_info;
                        mcm_info = &err_info->mcm_info;
                        if (err_info->ue_count) {
-                               dev_info(adev->dev, "socket: %d, die: %d, "
-                                        "%lld new uncorrectable hardware errors detected in %s block\n",
-                                        mcm_info->socket_id,
-                                        mcm_info->die_id,
-                                        err_info->ue_count,
-                                        blk_name);
+                               RAS_EVENT_LOG(adev, event_id, "socket: %d, die: %d, "
+                                             "%lld new uncorrectable hardware errors detected in %s block\n",
+                                             mcm_info->socket_id,
+                                             mcm_info->die_id,
+                                             err_info->ue_count,
+                                             blk_name);
                        }
                }
 
                for_each_ras_error(err_node, &ras_mgr->err_data) {
                        err_info = &err_node->err_info;
                        mcm_info = &err_info->mcm_info;
-                       dev_info(adev->dev, "socket: %d, die: %d, "
-                                "%lld uncorrectable hardware errors detected in total in %s block\n",
-                                mcm_info->socket_id, mcm_info->die_id, err_info->ue_count, blk_name);
+                       RAS_EVENT_LOG(adev, event_id, "socket: %d, die: %d, "
+                                     "%lld uncorrectable hardware errors detected in total in %s block\n",
+                                     mcm_info->socket_id, mcm_info->die_id, err_info->ue_count, blk_name);
                }
 
        } else {
@@ -1081,44 +1083,44 @@ static void amdgpu_ras_error_print_error_data(struct amdgpu_device *adev,
                                err_info = &err_node->err_info;
                                mcm_info = &err_info->mcm_info;
                                if (err_info->de_count) {
-                                       dev_info(adev->dev, "socket: %d, die: %d, "
-                                               "%lld new deferred hardware errors detected in %s block\n",
-                                               mcm_info->socket_id,
-                                               mcm_info->die_id,
-                                               err_info->de_count,
-                                               blk_name);
+                                       RAS_EVENT_LOG(adev, event_id, "socket: %d, die: %d, "
+                                                     "%lld new deferred hardware errors detected in %s block\n",
+                                                     mcm_info->socket_id,
+                                                     mcm_info->die_id,
+                                                     err_info->de_count,
+                                                     blk_name);
                                }
                        }
 
                        for_each_ras_error(err_node, &ras_mgr->err_data) {
                                err_info = &err_node->err_info;
                                mcm_info = &err_info->mcm_info;
-                               dev_info(adev->dev, "socket: %d, die: %d, "
-                                       "%lld deferred hardware errors detected in total in %s block\n",
-                                       mcm_info->socket_id, mcm_info->die_id,
-                                       err_info->de_count, blk_name);
+                               RAS_EVENT_LOG(adev, event_id, "socket: %d, die: %d, "
+                                             "%lld deferred hardware errors detected in total in %s block\n",
+                                             mcm_info->socket_id, mcm_info->die_id,
+                                             err_info->de_count, blk_name);
                        }
                } else {
                        for_each_ras_error(err_node, err_data) {
                                err_info = &err_node->err_info;
                                mcm_info = &err_info->mcm_info;
                                if (err_info->ce_count) {
-                                       dev_info(adev->dev, "socket: %d, die: %d, "
-                                               "%lld new correctable hardware errors detected in %s block\n",
-                                               mcm_info->socket_id,
-                                               mcm_info->die_id,
-                                               err_info->ce_count,
-                                               blk_name);
+                                       RAS_EVENT_LOG(adev, event_id, "socket: %d, die: %d, "
+                                                     "%lld new correctable hardware errors detected in %s block\n",
+                                                     mcm_info->socket_id,
+                                                     mcm_info->die_id,
+                                                     err_info->ce_count,
+                                                     blk_name);
                                }
                        }
 
                        for_each_ras_error(err_node, &ras_mgr->err_data) {
                                err_info = &err_node->err_info;
                                mcm_info = &err_info->mcm_info;
-                               dev_info(adev->dev, "socket: %d, die: %d, "
-                                       "%lld correctable hardware errors detected in total in %s block\n",
-                                       mcm_info->socket_id, mcm_info->die_id,
-                                       err_info->ce_count, blk_name);
+                               RAS_EVENT_LOG(adev, event_id, "socket: %d, die: %d, "
+                                             "%lld correctable hardware errors detected in total in %s block\n",
+                                             mcm_info->socket_id, mcm_info->die_id,
+                                             err_info->ce_count, blk_name);
                        }
                }
        }
@@ -1131,77 +1133,79 @@ static inline bool err_data_has_source_info(struct ras_err_data *data)
 
 static void amdgpu_ras_error_generate_report(struct amdgpu_device *adev,
                                             struct ras_query_if *query_if,
-                                            struct ras_err_data *err_data)
+                                            struct ras_err_data *err_data,
+                                            struct ras_query_context *qctx)
 {
        struct ras_manager *ras_mgr = amdgpu_ras_find_obj(adev, &query_if->head);
        const char *blk_name = get_ras_block_str(&query_if->head);
+       u64 event_id = qctx->event_id;
 
        if (err_data->ce_count) {
                if (err_data_has_source_info(err_data)) {
-                       amdgpu_ras_error_print_error_data(adev, ras_mgr, err_data,
+                       amdgpu_ras_error_print_error_data(adev, ras_mgr, err_data, qctx,
                                                          blk_name, false, false);
                } else if (!adev->aid_mask &&
                           adev->smuio.funcs &&
                           adev->smuio.funcs->get_socket_id &&
                           adev->smuio.funcs->get_die_id) {
-                       dev_info(adev->dev, "socket: %d, die: %d "
-                                "%ld correctable hardware errors "
-                                "detected in %s block\n",
-                                adev->smuio.funcs->get_socket_id(adev),
-                                adev->smuio.funcs->get_die_id(adev),
-                                ras_mgr->err_data.ce_count,
-                                blk_name);
+                       RAS_EVENT_LOG(adev, event_id, "socket: %d, die: %d "
+                                     "%ld correctable hardware errors "
+                                     "detected in %s block\n",
+                                     adev->smuio.funcs->get_socket_id(adev),
+                                     adev->smuio.funcs->get_die_id(adev),
+                                     ras_mgr->err_data.ce_count,
+                                     blk_name);
                } else {
-                       dev_info(adev->dev, "%ld correctable hardware errors "
-                                "detected in %s block\n",
-                                ras_mgr->err_data.ce_count,
-                                blk_name);
+                       RAS_EVENT_LOG(adev, event_id, "%ld correctable hardware errors "
+                                     "detected in %s block\n",
+                                     ras_mgr->err_data.ce_count,
+                                     blk_name);
                }
        }
 
        if (err_data->ue_count) {
                if (err_data_has_source_info(err_data)) {
-                       amdgpu_ras_error_print_error_data(adev, ras_mgr, err_data,
+                       amdgpu_ras_error_print_error_data(adev, ras_mgr, err_data, qctx,
                                                          blk_name, true, false);
                } else if (!adev->aid_mask &&
                           adev->smuio.funcs &&
                           adev->smuio.funcs->get_socket_id &&
                           adev->smuio.funcs->get_die_id) {
-                       dev_info(adev->dev, "socket: %d, die: %d "
-                                "%ld uncorrectable hardware errors "
-                                "detected in %s block\n",
-                                adev->smuio.funcs->get_socket_id(adev),
-                                adev->smuio.funcs->get_die_id(adev),
-                                ras_mgr->err_data.ue_count,
-                                blk_name);
+                       RAS_EVENT_LOG(adev, event_id, "socket: %d, die: %d "
+                                     "%ld uncorrectable hardware errors "
+                                     "detected in %s block\n",
+                                     adev->smuio.funcs->get_socket_id(adev),
+                                     adev->smuio.funcs->get_die_id(adev),
+                                     ras_mgr->err_data.ue_count,
+                                     blk_name);
                } else {
-                       dev_info(adev->dev, "%ld uncorrectable hardware errors "
-                                "detected in %s block\n",
-                                ras_mgr->err_data.ue_count,
-                                blk_name);
+                       RAS_EVENT_LOG(adev, event_id, "%ld uncorrectable hardware errors "
+                                     "detected in %s block\n",
+                                     ras_mgr->err_data.ue_count,
+                                     blk_name);
                }
        }
 
        if (err_data->de_count) {
                if (err_data_has_source_info(err_data)) {
-                       amdgpu_ras_error_print_error_data(adev, ras_mgr, err_data,
+                       amdgpu_ras_error_print_error_data(adev, ras_mgr, err_data, qctx,
                                                          blk_name, false, true);
                } else if (!adev->aid_mask &&
                           adev->smuio.funcs &&
                           adev->smuio.funcs->get_socket_id &&
                           adev->smuio.funcs->get_die_id) {
-                       dev_info(adev->dev, "socket: %d, die: %d "
-                                "%ld deferred hardware errors "
-                                "detected in %s block\n",
-                                adev->smuio.funcs->get_socket_id(adev),
-                                adev->smuio.funcs->get_die_id(adev),
-                                ras_mgr->err_data.de_count,
-                                blk_name);
+                       RAS_EVENT_LOG(adev, event_id, "socket: %d, die: %d "
+                                     "%ld deferred hardware errors "
+                                     "detected in %s block\n",
+                                     adev->smuio.funcs->get_socket_id(adev),
+                                     adev->smuio.funcs->get_die_id(adev),
+                                     ras_mgr->err_data.de_count,
+                                     blk_name);
                } else {
-                       dev_info(adev->dev, "%ld deferred hardware errors "
-                                "detected in %s block\n",
-                                ras_mgr->err_data.de_count,
-                                blk_name);
+                       RAS_EVENT_LOG(adev, event_id, "%ld deferred hardware errors "
+                                     "detected in %s block\n",
+                                     ras_mgr->err_data.de_count,
+                                     blk_name);
                }
        }
 }
@@ -1294,6 +1298,7 @@ ssize_t amdgpu_ras_aca_sysfs_read(struct device *dev, struct device_attribute *a
 static int amdgpu_ras_query_error_status_helper(struct amdgpu_device *adev,
                                                struct ras_query_if *info,
                                                struct ras_err_data *err_data,
+                                               struct ras_query_context *qctx,
                                                unsigned int error_query_mode)
 {
        enum amdgpu_ras_block blk = info ? info->head.block : AMDGPU_RAS_BLOCK_COUNT;
@@ -1338,8 +1343,8 @@ static int amdgpu_ras_query_error_status_helper(struct amdgpu_device *adev,
                                return ret;
                } else {
                        /* FIXME: add code to check return value later */
-                       amdgpu_mca_smu_log_ras_error(adev, blk, AMDGPU_MCA_ERROR_TYPE_UE, err_data);
-                       amdgpu_mca_smu_log_ras_error(adev, blk, AMDGPU_MCA_ERROR_TYPE_CE, err_data);
+                       amdgpu_mca_smu_log_ras_error(adev, blk, AMDGPU_MCA_ERROR_TYPE_UE, err_data, qctx);
+                       amdgpu_mca_smu_log_ras_error(adev, blk, AMDGPU_MCA_ERROR_TYPE_CE, err_data, qctx);
                }
        }
 
@@ -1351,6 +1356,7 @@ int amdgpu_ras_query_error_status(struct amdgpu_device *adev, struct ras_query_i
 {
        struct ras_manager *obj = amdgpu_ras_find_obj(adev, &info->head);
        struct ras_err_data err_data;
+       struct ras_query_context qctx;
        unsigned int error_query_mode;
        int ret;
 
@@ -1364,8 +1370,12 @@ int amdgpu_ras_query_error_status(struct amdgpu_device *adev, struct ras_query_i
        if (!amdgpu_ras_get_error_query_mode(adev, &error_query_mode))
                return -EINVAL;
 
+       memset(&qctx, 0, sizeof(qctx));
+       qctx.event_id = amdgpu_ras_acquire_event_id(adev, amdgpu_ras_intr_triggered() ?
+                                                  RAS_EVENT_TYPE_ISR : RAS_EVENT_TYPE_INVALID);
        ret = amdgpu_ras_query_error_status_helper(adev, info,
                                                   &err_data,
+                                                  &qctx,
                                                   error_query_mode);
        if (ret)
                goto out_fini_err_data;
@@ -1376,7 +1386,7 @@ int amdgpu_ras_query_error_status(struct amdgpu_device *adev, struct ras_query_i
        info->ce_count = obj->err_data.ce_count;
        info->de_count = obj->err_data.de_count;
 
-       amdgpu_ras_error_generate_report(adev, info, &err_data);
+       amdgpu_ras_error_generate_report(adev, info, &err_data, &qctx);
 
 out_fini_err_data:
        amdgpu_ras_error_data_fini(&err_data);
@@ -3036,6 +3046,35 @@ static int amdgpu_get_ras_schema(struct amdgpu_device *adev)
                        AMDGPU_RAS_ERROR__PARITY;
 }
 
+static void ras_event_mgr_init(struct ras_event_manager *mgr)
+{
+       int i;
+
+       for (i = 0; i < ARRAY_SIZE(mgr->seqnos); i++)
+               atomic64_set(&mgr->seqnos[i], 0);
+}
+
+static void amdgpu_ras_event_mgr_init(struct amdgpu_device *adev)
+{
+       struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
+       struct amdgpu_hive_info *hive;
+
+       if (!ras)
+               return;
+
+       hive = amdgpu_get_xgmi_hive(adev);
+       ras->event_mgr = hive ? &hive->event_mgr : &ras->__event_mgr;
+
+       /* init event manager with node 0 on xgmi system */
+       if (!amdgpu_in_reset(adev)) {
+               if (!hive || adev->gmc.xgmi.node_id == 0)
+                       ras_event_mgr_init(ras->event_mgr);
+       }
+
+       if (hive)
+               amdgpu_put_xgmi_hive(hive);
+}
+
 int amdgpu_ras_init(struct amdgpu_device *adev)
 {
        struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
@@ -3356,6 +3395,8 @@ int amdgpu_ras_late_init(struct amdgpu_device *adev)
        if (amdgpu_sriov_vf(adev))
                return 0;
 
+       amdgpu_ras_event_mgr_init(adev);
+
        if (amdgpu_aca_is_enabled(adev)) {
                if (amdgpu_in_reset(adev))
                        r = amdgpu_aca_reset(adev);
@@ -3472,13 +3513,37 @@ void amdgpu_ras_set_fed(struct amdgpu_device *adev, bool status)
                atomic_set(&ras->fed, !!status);
 }
 
+bool amdgpu_ras_event_id_is_valid(struct amdgpu_device *adev, u64 id)
+{
+       return !(id & BIT_ULL(63));
+}
+
+u64 amdgpu_ras_acquire_event_id(struct amdgpu_device *adev, enum ras_event_type type)
+{
+       struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
+       u64 id;
+
+       switch (type) {
+       case RAS_EVENT_TYPE_ISR:
+               id = (u64)atomic64_read(&ras->event_mgr->seqnos[type]);
+               break;
+       case RAS_EVENT_TYPE_INVALID:
+       default:
+               id = BIT_ULL(63) | 0ULL;
+               break;
+       }
+
+       return id;
+}
+
 void amdgpu_ras_global_ras_isr(struct amdgpu_device *adev)
 {
        if (atomic_cmpxchg(&amdgpu_ras_in_intr, 0, 1) == 0) {
                struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
+               u64 event_id = (u64)atomic64_inc_return(&ras->event_mgr->seqnos[RAS_EVENT_TYPE_ISR]);
 
-               dev_info(adev->dev, "uncorrectable hardware error"
-                       "(ERREVENT_ATHUB_INTERRUPT) detected!\n");
+               RAS_EVENT_LOG(adev, event_id, "uncorrectable hardware error"
+                             "(ERREVENT_ATHUB_INTERRUPT) detected!\n");
 
                ras->gpu_reset_flags |= AMDGPU_RAS_GPU_RESET_MODE1_RESET;
                amdgpu_ras_reset_gpu(adev);
index e0f8ce9d844060e95355a4d2015429b6427e1993..8d26989c75c8dc0db98dfb5e1045a9709d35e673 100644 (file)
@@ -64,6 +64,14 @@ struct amdgpu_iv_entry;
 /* The high three bits indicates socketid */
 #define AMDGPU_RAS_GET_FEATURES(val)  ((val) & ~AMDGPU_RAS_FEATURES_SOCKETID_MASK)
 
+#define RAS_EVENT_LOG(_adev, _id, _fmt, ...)                           \
+do {                                                                   \
+       if (amdgpu_ras_event_id_is_valid((_adev), (_id)))                       \
+           dev_info((_adev)->dev, "{%llu}" _fmt, (_id), ##__VA_ARGS__);        \
+       else                                                            \
+           dev_info((_adev)->dev, _fmt, ##__VA_ARGS__);                        \
+} while (0)
+
 enum amdgpu_ras_block {
        AMDGPU_RAS_BLOCK__UMC = 0,
        AMDGPU_RAS_BLOCK__SDMA,
@@ -419,6 +427,21 @@ struct umc_ecc_info {
        int record_ce_addr_supported;
 };
 
+enum ras_event_type {
+       RAS_EVENT_TYPE_INVALID = -1,
+       RAS_EVENT_TYPE_ISR = 0,
+       RAS_EVENT_TYPE_COUNT,
+};
+
+struct ras_event_manager {
+       atomic64_t seqnos[RAS_EVENT_TYPE_COUNT];
+};
+
+struct ras_query_context {
+       enum ras_event_type type;
+       u64 event_id;
+};
+
 struct amdgpu_ras {
        /* ras infrastructure */
        /* for ras itself. */
@@ -479,6 +502,11 @@ struct amdgpu_ras {
        atomic_t page_retirement_req_cnt;
        /* Fatal error detected flag */
        atomic_t fed;
+
+       /* RAS event manager */
+       struct ras_event_manager __event_mgr;
+       struct ras_event_manager *event_mgr;
+
 };
 
 struct ras_fs_data {
@@ -879,4 +907,6 @@ void amdgpu_ras_del_mca_err_addr(struct ras_err_info *err_info,
 void amdgpu_ras_set_fed(struct amdgpu_device *adev, bool status);
 bool amdgpu_ras_get_fed_status(struct amdgpu_device *adev);
 
+bool amdgpu_ras_event_id_is_valid(struct amdgpu_device *adev, u64 id);
+u64 amdgpu_ras_acquire_event_id(struct amdgpu_device *adev, enum ras_event_type type);
 #endif
index 1592c63b3099b982d0b9bdda596919da8ec14f5f..a3bfc16de6d4961b03bb4750a1c9ead360ba210c 100644 (file)
@@ -44,6 +44,7 @@ struct amdgpu_hive_info {
 
        struct amdgpu_reset_domain *reset_domain;
        atomic_t ras_recovery;
+       struct ras_event_manager event_mgr;
 };
 
 struct amdgpu_pcs_ras_field {
index 77af4e25ff465c8e760fa533e8271f5c83965e01..4a02e1f041dae3bdd5194201608650708e7ac25b 100644 (file)
@@ -404,10 +404,16 @@ static int umc_v12_0_err_cnt_init_per_channel(struct amdgpu_device *adev,
 static void umc_v12_0_ecc_info_query_ras_error_count(struct amdgpu_device *adev,
                                        void *ras_error_status)
 {
+       struct ras_query_context qctx;
+
+       memset(&qctx, 0, sizeof(qctx));
+       qctx.event_id = amdgpu_ras_acquire_event_id(adev, amdgpu_ras_intr_triggered() ?
+                                                   RAS_EVENT_TYPE_ISR : RAS_EVENT_TYPE_INVALID);
+
        amdgpu_mca_smu_log_ras_error(adev,
-               AMDGPU_RAS_BLOCK__UMC, AMDGPU_MCA_ERROR_TYPE_CE, ras_error_status);
+               AMDGPU_RAS_BLOCK__UMC, AMDGPU_MCA_ERROR_TYPE_CE, ras_error_status, &qctx);
        amdgpu_mca_smu_log_ras_error(adev,
-               AMDGPU_RAS_BLOCK__UMC, AMDGPU_MCA_ERROR_TYPE_UE, ras_error_status);
+               AMDGPU_RAS_BLOCK__UMC, AMDGPU_MCA_ERROR_TYPE_UE, ras_error_status, &qctx);
 }
 
 static void umc_v12_0_ecc_info_query_ras_error_address(struct amdgpu_device *adev,