ras-arm-handler: Parse and log ARM Processor Error Info table
author Shiju Jose <shiju.jose@huawei.com>
Tue, 16 Jul 2024 16:36:59 +0000 (17:36 +0100)
committer Mauro Carvalho Chehab <mchehab+huawei@kernel.org>
Wed, 17 Jul 2024 06:02:42 +0000 (08:02 +0200)
Parse and log the ARM Processor Error Info table data, as described in
section N.2.4.4.1 of the UEFI 2.9A/2.10 specifications.

[mchehab: fix a typo]
Suggested-by: Mauro Carvalho Chehab <mchehab+huawei@kernel.org>
Signed-off-by: Shiju Jose <shiju.jose@huawei.com>
Signed-off-by: Mauro Carvalho Chehab <mchehab+huawei@kernel.org>
ras-arm-handler.c
ras-record.c
ras-record.h

index 3f52ab8d51a2572d5d003e68bf8a08a73833d0be..01122f3a70c7522198062b797ff3623fb1d1854e 100644 (file)
 #define ARM_ERR_VALID_FLAGS BIT(1)
 #define BIT2 2
 
+/* Bits of the validation_bits field of one processor error info entry */
+#define ARM_INFO_VALID_MULTI_ERR       BIT(0)
+#define ARM_INFO_VALID_FLAGS           BIT(1)
+#define ARM_INFO_VALID_ERR_INFO                BIT(2)
+#define ARM_INFO_VALID_VIRT_ADDR       BIT(3)
+#define ARM_INFO_VALID_PHYSICAL_ADDR   BIT(4)
+
+/* Bits of the flags field; positions line up with arm_proc_error_flags_strs */
+#define ARM_INFO_FLAGS_FIRST           BIT(0)
+#define ARM_INFO_FLAGS_LAST            BIT(1)
+#define ARM_INFO_FLAGS_PROPAGATED      BIT(2)
+#define ARM_INFO_FLAGS_OVERFLOW                BIT(3)
+
+/* Bits of the type field, tested by parse_arm_err_info() */
+#define ARM_ERR_TYPE_MASK              0x1E /* GENMASK(4,1) */
+#define ARM_CACHE_ERROR                        BIT(1)
+#define ARM_TLB_ERROR                  BIT(2)
+#define ARM_BUS_ERROR                  BIT(3)
+#define ARM_VENDOR_ERROR               BIT(4)
+
+/* Validity bits tested against the 64-bit error_info value itself */
+#define ARM_ERR_VALID_TRANSACTION_TYPE         BIT(0)
+#define ARM_ERR_VALID_OPERATION_TYPE           BIT(1)
+#define ARM_ERR_VALID_LEVEL                    BIT(2)
+#define ARM_ERR_VALID_PROC_CONTEXT_CORRUPT     BIT(3)
+#define ARM_ERR_VALID_CORRECTED                        BIT(4)
+#define ARM_ERR_VALID_PRECISE_PC               BIT(5)
+#define ARM_ERR_VALID_RESTARTABLE_PC           BIT(6)
+#define ARM_ERR_VALID_PARTICIPATION_TYPE       BIT(7)
+#define ARM_ERR_VALID_TIME_OUT                 BIT(8)
+#define ARM_ERR_VALID_ADDRESS_SPACE            BIT(9)
+#define ARM_ERR_VALID_MEM_ATTRIBUTES           BIT(10)
+#define ARM_ERR_VALID_ACCESS_MODE              BIT(11)
+
+/*
+ * Field layout of error_info: a field is read as
+ * (error_info >> <FIELD>_SHIFT) & <FIELD>_MASK.
+ */
+#define ARM_ERR_TRANSACTION_SHIFT              16
+#define ARM_ERR_TRANSACTION_MASK               0x3 /* GENMASK(1,0) */
+#define ARM_ERR_OPERATION_SHIFT                        18
+#define ARM_ERR_OPERATION_MASK                 0xF /* GENMASK(3,0) */
+#define ARM_ERR_LEVEL_SHIFT                    22
+#define ARM_ERR_LEVEL_MASK                     0x7 /* GENMASK(2,0) */
+#define ARM_ERR_PC_CORRUPT_SHIFT               25
+#define ARM_ERR_PC_CORRUPT_MASK                        0x1 /* GENMASK(0,0) */
+#define ARM_ERR_CORRECTED_SHIFT                        26
+#define ARM_ERR_CORRECTED_MASK                 0x1 /* GENMASK(0,0) */
+#define ARM_ERR_PRECISE_PC_SHIFT               27
+#define ARM_ERR_PRECISE_PC_MASK                        0x1 /* GENMASK(0,0) */
+#define ARM_ERR_RESTARTABLE_PC_SHIFT           28
+#define ARM_ERR_RESTARTABLE_PC_MASK            0x1 /* GENMASK(0,0) */
+#define ARM_ERR_PARTICIPATION_TYPE_SHIFT       29
+#define ARM_ERR_PARTICIPATION_TYPE_MASK                0x3 /* GENMASK(1,0) */
+#define ARM_ERR_TIME_OUT_SHIFT                 31
+#define ARM_ERR_TIME_OUT_MASK                  0x1 /* GENMASK(0,0) */
+#define ARM_ERR_ADDRESS_SPACE_SHIFT            32
+#define ARM_ERR_ADDRESS_SPACE_MASK             0x3 /* GENMASK(1,0) */
+#define ARM_ERR_MEM_ATTRIBUTES_SHIFT           34
+#define ARM_ERR_MEM_ATTRIBUTES_MASK            0x1FF /* GENMASK(8,0) */
+#define ARM_ERR_ACCESS_MODE_SHIFT              43
+#define ARM_ERR_ACCESS_MODE_MASK               0x1 /* GENMASK(0,0) */
+
 void display_raw_data(struct trace_seq *s,
                      const uint8_t *buf,
                      uint32_t datalen)
@@ -48,6 +103,283 @@ void display_raw_data(struct trace_seq *s,
        }
 }
 
+/*
+ * Error type names, indexed by bit position in the type field
+ * (bit 1 = ARM_CACHE_ERROR ... bit 4 = ARM_VENDOR_ERROR). Bit 0 lies
+ * outside ARM_ERR_TYPE_MASK, hence the empty first entry.
+ */
+static const char * const arm_proc_error_type_strs[] = {
+       "",
+       "cache error",
+       "TLB error",
+       "bus error",
+       "micro-architectural error",
+};
+
+/*
+ * Flag names, indexed by bit position in the flags field
+ * (ARM_INFO_FLAGS_FIRST ... ARM_INFO_FLAGS_OVERFLOW).
+ */
+static const char * const arm_proc_error_flags_strs[] = {
+       "first error",
+       "last error",
+       "propagated error",
+       "overflow",
+};
+
+/*
+ * Transaction type names, indexed by the transaction type field of
+ * error_info. The field is two bits wide but only values 0-2 have a name;
+ * parse_arm_err_info() bounds-checks the index and prints nothing otherwise.
+ */
+static const char * const arm_err_trans_type_strs[] = {
+       "Instruction",
+       "Data Access",
+       "Generic",
+};
+
+/*
+ * Operation type names for bus errors, indexed by the operation type field
+ * of error_info. Values without an entry are not printed.
+ */
+static const char * const arm_bus_err_op_strs[] = {
+       "Generic error (type cannot be determined)",
+       "Generic read (type of instruction or data request cannot be determined)",
+       "Generic write (type of instruction or data request cannot be determined)",
+       "Data read",
+       "Data write",
+       "Instruction fetch",
+       "Prefetch",
+};
+
+/*
+ * Operation type names for cache errors, indexed by the operation type field
+ * of error_info. Values without an entry are not printed.
+ */
+static const char * const arm_cache_err_op_strs[] = {
+       "Generic error (type cannot be determined)",
+       "Generic read (type of instruction or data request cannot be determined)",
+       "Generic write (type of instruction or data request cannot be determined)",
+       "Data read",
+       "Data write",
+       "Instruction fetch",
+       "Prefetch",
+       "Eviction",
+       "Snooping (processor initiated a cache snoop that resulted in an error)",
+       "Snooped (processor raised a cache error caused by another processor or device snooping its cache)",
+       "Management",
+};
+
+/*
+ * Operation type names for TLB errors, indexed by the operation type field
+ * of error_info. Values without an entry are not printed.
+ */
+static const char * const arm_tlb_err_op_strs[] = {
+       "Generic error (type cannot be determined)",
+       "Generic read (type of instruction or data request cannot be determined)",
+       "Generic write (type of instruction or data request cannot be determined)",
+       "Data read",
+       "Data write",
+       "Instruction fetch",
+       "Prefetch",
+       "Local management operation (processor initiated a TLB management operation that resulted in an error)",
+       "External management operation (processor raised a TLB error caused by another processor or device broadcasting TLB operations)",
+};
+
+/*
+ * Participation type names, indexed by the two-bit participation type field
+ * of error_info; parse_arm_err_info() prints it for bus errors only.
+ */
+static const char * const arm_bus_err_part_type_strs[] = {
+       "Local processor originated request",
+       "Local processor responded to request",
+       "Local processor observed",
+       "Generic",
+};
+
+/*
+ * Address space names, indexed by the two-bit address space field of
+ * error_info; parse_arm_err_info() prints it for bus errors only.
+ */
+static const char * const arm_bus_err_addr_space_strs[] = {
+       "External Memory Access",
+       "Internal Memory Access",
+       "Unknown",
+       "Device Memory Access",
+};
+
+/*
+ * Format the name of every bit set in @data into @buf via mce_snprintf().
+ *
+ * @buf:      NUL-terminated string buffer handed to mce_snprintf()
+ * @data:     bit field to decode
+ * @data_str: table of names; entry N names bit N
+ * @str_size: number of entries in @data_str
+ *
+ * Returns 0 on success, -1 if @buf or @data_str is NULL or @str_size is 0.
+ *
+ * NOTE(review): @buf is a plain pointer here, so any sizeof(buf) inside
+ * mce_snprintf() would measure the pointer rather than the caller's array -
+ * confirm against the macro definition.
+ */
+static int decode_err_data_bits(char *buf, unsigned long data,
+                               const char **data_str, size_t str_size)
+{
+       size_t bit;     /* same type as str_size: no signed/unsigned compare */
+
+       if (!buf || !data_str || !str_size)
+               return -1;
+
+       for (bit = 0; bit < str_size; bit++) {
+               if (data & BIT(bit))
+                       mce_snprintf(buf, " %s", data_str[bit]);
+       }
+       return 0;
+}
+
+/*
+ * Decode the fields packed into an ARM processor error_info value and print
+ * every field whose validity bit is set.
+ *
+ * @s:          trace sequence that receives the decoded text
+ * @type:       error type bits (ARM_CACHE_ERROR, ARM_TLB_ERROR, ARM_BUS_ERROR,
+ *              ARM_VENDOR_ERROR)
+ * @error_info: raw 64-bit error information value
+ *
+ * Nothing is printed when ARM_VENDOR_ERROR is set in @type. The trailing
+ * fields (participation type onwards) are printed only when @type equals
+ * ARM_BUS_ERROR exactly.
+ */
+static void parse_arm_err_info(struct trace_seq *s, uint32_t type, uint64_t error_info)
+{
+       const bool is_cache = type & ARM_CACHE_ERROR;
+       const bool is_tlb = type & ARM_TLB_ERROR;
+       const bool is_bus = type & ARM_BUS_ERROR;
+
+       /* Vendor-defined error information has no common layout to decode. */
+       if (type & ARM_VENDOR_ERROR)
+               return;
+
+       if (error_info & ARM_ERR_VALID_TRANSACTION_TYPE) {
+               uint8_t trans = (error_info >> ARM_ERR_TRANSACTION_SHIFT) &
+                               ARM_ERR_TRANSACTION_MASK;
+
+               if (trans < ARRAY_SIZE(arm_err_trans_type_strs))
+                       trace_seq_printf(s, " transaction type:%s",
+                                        arm_err_trans_type_strs[trans]);
+       }
+
+       if (error_info & ARM_ERR_VALID_OPERATION_TYPE) {
+               uint8_t op = (error_info >> ARM_ERR_OPERATION_SHIFT) &
+                            ARM_ERR_OPERATION_MASK;
+
+               /* Each error type has its own table of operation names. */
+               if (is_cache && op < ARRAY_SIZE(arm_cache_err_op_strs))
+                       trace_seq_printf(s, " cache error, operation type:%s",
+                                        arm_cache_err_op_strs[op]);
+               if (is_tlb && op < ARRAY_SIZE(arm_tlb_err_op_strs))
+                       trace_seq_printf(s, " TLB error, operation type: %s",
+                                        arm_tlb_err_op_strs[op]);
+               if (is_bus && op < ARRAY_SIZE(arm_bus_err_op_strs))
+                       trace_seq_printf(s, " bus error, operation type: %s",
+                                        arm_bus_err_op_strs[op]);
+       }
+
+       if (error_info & ARM_ERR_VALID_LEVEL) {
+               uint8_t lvl = (error_info >> ARM_ERR_LEVEL_SHIFT) &
+                             ARM_ERR_LEVEL_MASK;
+
+               if (is_cache)
+                       trace_seq_printf(s, " cache level: %d", lvl);
+               if (is_tlb)
+                       trace_seq_printf(s, " TLB level: %d", lvl);
+               if (is_bus)
+                       trace_seq_printf(s, " affinity level at which the bus error occurred: %d",
+                                        lvl);
+       }
+
+       if (error_info & ARM_ERR_VALID_PROC_CONTEXT_CORRUPT) {
+               if (!((error_info >> ARM_ERR_PC_CORRUPT_SHIFT) & ARM_ERR_PC_CORRUPT_MASK))
+                       trace_seq_printf(s, " processor context not corrupted");
+               else
+                       trace_seq_printf(s, " processor context corrupted");
+       }
+
+       if (error_info & ARM_ERR_VALID_CORRECTED) {
+               if (!((error_info >> ARM_ERR_CORRECTED_SHIFT) & ARM_ERR_CORRECTED_MASK))
+                       trace_seq_printf(s, " the error has not been corrected");
+               else
+                       trace_seq_printf(s, " the error has been corrected");
+       }
+
+       if (error_info & ARM_ERR_VALID_PRECISE_PC) {
+               if (!((error_info >> ARM_ERR_PRECISE_PC_SHIFT) & ARM_ERR_PRECISE_PC_MASK))
+                       trace_seq_printf(s, " PC is imprecise");
+               else
+                       trace_seq_printf(s, " PC is precise");
+       }
+
+       if ((error_info & ARM_ERR_VALID_RESTARTABLE_PC) &&
+           ((error_info >> ARM_ERR_RESTARTABLE_PC_SHIFT) & ARM_ERR_RESTARTABLE_PC_MASK))
+               trace_seq_printf(s, " Program execution can be restarted reliably at the PC associated with the error");
+
+       /* Everything below is only laid out this way for bus errors. */
+       if (type != ARM_BUS_ERROR)
+               return;
+
+       if (error_info & ARM_ERR_VALID_PARTICIPATION_TYPE) {
+               uint8_t part = (error_info >> ARM_ERR_PARTICIPATION_TYPE_SHIFT) &
+                              ARM_ERR_PARTICIPATION_TYPE_MASK;
+
+               if (part < ARRAY_SIZE(arm_bus_err_part_type_strs))
+                       trace_seq_printf(s, " participation type: %s",
+                                        arm_bus_err_part_type_strs[part]);
+       }
+
+       if ((error_info & ARM_ERR_VALID_TIME_OUT) &&
+           ((error_info >> ARM_ERR_TIME_OUT_SHIFT) & ARM_ERR_TIME_OUT_MASK))
+               trace_seq_printf(s, " request timed out");
+
+       if (error_info & ARM_ERR_VALID_ADDRESS_SPACE) {
+               uint8_t space = (error_info >> ARM_ERR_ADDRESS_SPACE_SHIFT) &
+                               ARM_ERR_ADDRESS_SPACE_MASK;
+
+               if (space < ARRAY_SIZE(arm_bus_err_addr_space_strs))
+                       trace_seq_printf(s, " address space: %s",
+                                        arm_bus_err_addr_space_strs[space]);
+       }
+
+       if (error_info & ARM_ERR_VALID_MEM_ATTRIBUTES) {
+               uint16_t attrs = (error_info >> ARM_ERR_MEM_ATTRIBUTES_SHIFT) &
+                                ARM_ERR_MEM_ATTRIBUTES_MASK;
+
+               trace_seq_printf(s, " memory access attributes:0x%x", attrs);
+       }
+
+       if (error_info & ARM_ERR_VALID_ACCESS_MODE) {
+               /* A set access mode bit is reported as normal, a clear one as secure. */
+               if (!((error_info >> ARM_ERR_ACCESS_MODE_SHIFT) & ARM_ERR_ACCESS_MODE_MASK))
+                       trace_seq_printf(s, " access mode: secure");
+               else
+                       trace_seq_printf(s, " access mode: normal");
+       }
+}
+
+/*
+ * Walk the ARM Processor Error Information entries of an event, print each
+ * entry to @s and record the decoded values in @ev.
+ *
+ * @s:  trace sequence that receives the decoded text
+ * @ev: event whose pei_error/pei_len describe the raw entries
+ *
+ * Returns 0 on success, -1 if pei_len is not a whole number of
+ * struct ras_arm_err_info entries.
+ *
+ * ev->error_count, ev->error_info, ev->virt_fault_addr and
+ * ev->phy_fault_addr are overwritten by every entry that carries them, so
+ * they end up holding the values of the last such entry.
+ *
+ * NOTE(review): ev->error_types and ev->error_flags are not reset between
+ * entries; whether text from earlier entries persists depends on
+ * mce_snprintf() - confirm.
+ */
+static int parse_arm_processor_err_info(struct trace_seq *s, struct ras_arm_event *ev)
+{
+       int err_info_size = sizeof(struct ras_arm_err_info);
+       struct ras_arm_err_info *err_info;
+       int i, num_pei;
+
+       if (ev->pei_len % err_info_size != 0) {
+               log(TERM, LOG_ERR,
+                   "The event data does not match to the ARM Processor Error Information Structure\n");
+               return -1;
+       }
+       num_pei = ev->pei_len / err_info_size;
+       err_info = (struct ras_arm_err_info *)(ev->pei_error);
+
+       trace_seq_printf(s, "\nARM processor error info:\n");
+       for (i = 0; i < num_pei; ++i) {
+               decode_err_data_bits(ev->error_types, err_info->type,
+                                    (const char **)arm_proc_error_type_strs,
+                                    ARRAY_SIZE(arm_proc_error_type_strs));
+               trace_seq_printf(s, " error_types:%s", ev->error_types);
+
+               /* Use the ARM_INFO_VALID_* bits for every check in this loop. */
+               if (err_info->validation_bits & ARM_INFO_VALID_MULTI_ERR) {
+                       ev->error_count = err_info->multiple_error + 1;
+                       trace_seq_printf(s, " error_count:%d", ev->error_count);
+               }
+               if (err_info->validation_bits & ARM_INFO_VALID_FLAGS) {
+                       decode_err_data_bits(ev->error_flags, err_info->flags,
+                                            (const char **)arm_proc_error_flags_strs,
+                                            ARRAY_SIZE(arm_proc_error_flags_strs));
+                       trace_seq_printf(s, " error_flags:%s", ev->error_flags);
+               }
+               if (err_info->validation_bits & ARM_INFO_VALID_ERR_INFO) {
+                       ev->error_info = err_info->error_info;
+                       trace_seq_printf(s, " error_info: 0x%016llx",
+                                        (unsigned long long)ev->error_info);
+                       parse_arm_err_info(s, err_info->type, ev->error_info);
+               }
+               if (err_info->validation_bits & ARM_INFO_VALID_VIRT_ADDR) {
+                       ev->virt_fault_addr = err_info->virt_fault_addr;
+                       trace_seq_printf(s, " virtual fault address: 0x%016llx",
+                                        (unsigned long long)err_info->virt_fault_addr);
+               }
+               if (err_info->validation_bits & ARM_INFO_VALID_PHYSICAL_ADDR) {
+                       ev->phy_fault_addr = err_info->physical_fault_addr;
+                       trace_seq_printf(s, " physical fault address: 0x%016llx",
+                                        (unsigned long long)err_info->physical_fault_addr);
+               }
+               trace_seq_printf(s, "\n");
+               err_info += 1;
+       }
+
+       return 0;
+}
+
 #ifdef HAVE_CPU_FAULT_ISOLATION
 static int is_core_failure(struct ras_arm_err_info *err_info)
 {
@@ -226,6 +558,8 @@ int ras_arm_event_handler(struct trace_seq *s,
                }
                display_raw_data(s, ev.pei_error, ev.pei_len);
 
+               parse_arm_processor_err_info(s, &ev);
+
                if (tep_get_field_val(s, event, "ctx_len", record, &val, 1) < 0)
                        return -1;
                ev.ctx_len = val;
index 490934e1887500d9a950eedbb60376f8150fc419..8341ae817e487f127f6611412f947d8d248e5a28 100644 (file)
@@ -215,6 +215,11 @@ static const struct db_fields arm_event_fields[] = {
                { .name = "err_info",           .type = "BLOB"  },
                { .name = "context_info",               .type = "BLOB"  },
                { .name = "vendor_info",                .type = "BLOB"  },
+               { .name = "error_type",         .type = "TEXT" },
+               { .name = "error_flags",        .type = "TEXT" },
+               { .name = "error_info",         .type = "INTEGER" },
+               { .name = "virt_fault_addr",    .type = "INTEGER" },
+               { .name = "phy_fault_addr",     .type = "INTEGER" },
 };
 
 static const struct db_table_descriptor arm_event_tab = {
@@ -244,6 +249,11 @@ int ras_store_arm_record(struct ras_events *ras, struct ras_arm_event *ev)
                          ev->ctx_error, ev->ctx_len, NULL);
        sqlite3_bind_blob(priv->stmt_arm_record,  9,
                          ev->vsei_error, ev->oem_len, NULL);
+       sqlite3_bind_text(priv->stmt_arm_record,  10, ev->error_types, -1, NULL);
+       sqlite3_bind_text(priv->stmt_arm_record, 11, ev->error_flags, -1, NULL);
+       sqlite3_bind_int64(priv->stmt_arm_record,  12,  ev->error_info);
+       sqlite3_bind_int64(priv->stmt_arm_record,  13,  ev->virt_fault_addr);
+       sqlite3_bind_int64(priv->stmt_arm_record,  14,  ev->phy_fault_addr);
 
        rc = sqlite3_step(priv->stmt_arm_record);
        if (rc != SQLITE_OK && rc != SQLITE_DONE)
index e8ed26326e04a57fc2c1c8aea75d2ec1776737fe..06777a28ee833dd25efb0c85b65388a21a3895fd 100644 (file)
@@ -96,6 +96,11 @@ struct ras_arm_event {
        uint32_t ctx_len;
        const uint8_t *vsei_error;
        uint32_t oem_len;
+       char error_types[512];
+       char error_flags[512];
+       uint64_t error_info;
+       uint64_t virt_fault_addr;
+       uint64_t phy_fault_addr;
 };
 
 struct devlink_event {