From d0773a8e434a6ca5e87c5202d5189cfdd020aec2 Mon Sep 17 00:00:00 2001
From: Shiju Jose
Date: Tue, 16 Jul 2024 17:36:59 +0100
Subject: [PATCH] ras-arm-handler: Parse and log ARM Processor Error Info table

Parse and log ARM Processor Error Info table data, UEFI 2.9A/2.10 specs
section N2.4.4.1.

[mchehab: fix a typo]
[review: test the multiple-error validation bit with the
 ARM_INFO_VALID_MULTI_ERR macro added by this patch, use a size_t bit
 index in decode_err_data_bits(), fix an unbalanced parenthesis in a
 comment and document the new helpers]
Suggested-by: Mauro Carvalho Chehab
Signed-off-by: Shiju Jose
Signed-off-by: Mauro Carvalho Chehab
---
 ras-arm-handler.c | 349 ++++++++++++++++++++++++++++++++++++++++++++++
 ras-record.c      |  10 ++
 ras-record.h      |   5 +
 3 files changed, 364 insertions(+)

diff --git a/ras-arm-handler.c b/ras-arm-handler.c
index 3f52ab8..01122f3 100644
--- a/ras-arm-handler.c
+++ b/ras-arm-handler.c
@@ -28,6 +28,61 @@
 #define ARM_ERR_VALID_FLAGS BIT(1)
 #define BIT2 2
 
+#define ARM_INFO_VALID_MULTI_ERR	BIT(0)
+#define ARM_INFO_VALID_FLAGS	BIT(1)
+#define ARM_INFO_VALID_ERR_INFO	BIT(2)
+#define ARM_INFO_VALID_VIRT_ADDR	BIT(3)
+#define ARM_INFO_VALID_PHYSICAL_ADDR	BIT(4)
+
+#define ARM_INFO_FLAGS_FIRST	BIT(0)
+#define ARM_INFO_FLAGS_LAST	BIT(1)
+#define ARM_INFO_FLAGS_PROPAGATED	BIT(2)
+#define ARM_INFO_FLAGS_OVERFLOW	BIT(3)
+
+#define ARM_ERR_TYPE_MASK	0x1E /* GENMASK(4,1) */
+#define ARM_CACHE_ERROR	BIT(1)
+#define ARM_TLB_ERROR	BIT(2)
+#define ARM_BUS_ERROR	BIT(3)
+#define ARM_VENDOR_ERROR	BIT(4)
+
+#define ARM_ERR_VALID_TRANSACTION_TYPE	BIT(0)
+#define ARM_ERR_VALID_OPERATION_TYPE	BIT(1)
+#define ARM_ERR_VALID_LEVEL	BIT(2)
+#define ARM_ERR_VALID_PROC_CONTEXT_CORRUPT	BIT(3)
+#define ARM_ERR_VALID_CORRECTED	BIT(4)
+#define ARM_ERR_VALID_PRECISE_PC	BIT(5)
+#define ARM_ERR_VALID_RESTARTABLE_PC	BIT(6)
+#define ARM_ERR_VALID_PARTICIPATION_TYPE	BIT(7)
+#define ARM_ERR_VALID_TIME_OUT	BIT(8)
+#define ARM_ERR_VALID_ADDRESS_SPACE	BIT(9)
+#define ARM_ERR_VALID_MEM_ATTRIBUTES	BIT(10)
+#define ARM_ERR_VALID_ACCESS_MODE	BIT(11)
+
+#define ARM_ERR_TRANSACTION_SHIFT	16
+#define ARM_ERR_TRANSACTION_MASK	0x3 /* GENMASK(1,0) */
+#define ARM_ERR_OPERATION_SHIFT	18
+#define ARM_ERR_OPERATION_MASK	0xF /* GENMASK(3,0) */
+#define ARM_ERR_LEVEL_SHIFT	22
+#define ARM_ERR_LEVEL_MASK	0x7 /* GENMASK(2,0) */
+#define ARM_ERR_PC_CORRUPT_SHIFT	25
+#define ARM_ERR_PC_CORRUPT_MASK	0x1 /* GENMASK(0,0) */
+#define ARM_ERR_CORRECTED_SHIFT	26
+#define ARM_ERR_CORRECTED_MASK	0x1 /* GENMASK(0,0) */
+#define ARM_ERR_PRECISE_PC_SHIFT	27
+#define ARM_ERR_PRECISE_PC_MASK	0x1 /* GENMASK(0,0) */
+#define ARM_ERR_RESTARTABLE_PC_SHIFT	28
+#define ARM_ERR_RESTARTABLE_PC_MASK	0x1 /* GENMASK(0,0) */
+#define ARM_ERR_PARTICIPATION_TYPE_SHIFT	29
+#define ARM_ERR_PARTICIPATION_TYPE_MASK	0x3 /* GENMASK(1,0) */
+#define ARM_ERR_TIME_OUT_SHIFT	31
+#define ARM_ERR_TIME_OUT_MASK	0x1 /* GENMASK(0,0) */
+#define ARM_ERR_ADDRESS_SPACE_SHIFT	32
+#define ARM_ERR_ADDRESS_SPACE_MASK	0x3 /* GENMASK(1,0) */
+#define ARM_ERR_MEM_ATTRIBUTES_SHIFT	34
+#define ARM_ERR_MEM_ATTRIBUTES_MASK	0x1FF /* GENMASK(8,0) */
+#define ARM_ERR_ACCESS_MODE_SHIFT	43
+#define ARM_ERR_ACCESS_MODE_MASK	0x1 /* GENMASK(0,0) */
+
 void display_raw_data(struct trace_seq *s,
 		      const uint8_t *buf,
 		      uint32_t datalen)
@@ -48,6 +103,298 @@ void display_raw_data(struct trace_seq *s,
 	}
 }
 
+static const char * const arm_proc_error_type_strs[] = {
+	"",
+	"cache error",
+	"TLB error",
+	"bus error",
+	"micro-architectural error",
+};
+
+static const char * const arm_proc_error_flags_strs[] = {
+	"first error ",
+	"last error",
+	"propagated error",
+	"overflow",
+};
+
+static const char * const arm_err_trans_type_strs[] = {
+	"Instruction",
+	"Data Access",
+	"Generic",
+};
+
+static const char * const arm_bus_err_op_strs[] = {
+	"Generic error (type cannot be determined)",
+	"Generic read (type of instruction or data request cannot be determined)",
+	"Generic write (type of instruction of data request cannot be determined)",
+	"Data read",
+	"Data write",
+	"Instruction fetch",
+	"Prefetch",
+};
+
+static const char * const arm_cache_err_op_strs[] = {
+	"Generic error (type cannot be determined)",
+	"Generic read (type of instruction or data request cannot be determined)",
+	"Generic write (type of instruction of data request cannot be determined)",
+	"Data read",
+	"Data write",
+	"Instruction fetch",
+	"Prefetch",
+	"Eviction",
+	"Snooping (processor initiated a cache snoop that resulted in an error)",
+	"Snooped (processor raised a cache error caused by another processor or device snooping its cache)",
+	"Management",
+};
+
+static const char * const arm_tlb_err_op_strs[] = {
+	"Generic error (type cannot be determined)",
+	"Generic read (type of instruction or data request cannot be determined)",
+	"Generic write (type of instruction of data request cannot be determined)",
+	"Data read",
+	"Data write",
+	"Instruction fetch",
+	"Prefetch",
+	"Local management operation (processor initiated a TLB management operation that resulted in an error)",
+	"External management operation (processor raised a TLB error caused by another processor or device broadcasting TLB operations)",
+};
+
+static const char * const arm_bus_err_part_type_strs[] = {
+	"Local processor originated request",
+	"Local processor responded to request",
+	"Local processor observed",
+	"Generic",
+};
+
+static const char * const arm_bus_err_addr_space_strs[] = {
+	"External Memory Access",
+	"Internal Memory Access",
+	"Unknown",
+	"Device Memory Access",
+};
+
+/*
+ * Format into buf the name of every bit set in data, taking the names from
+ * data_str[], which has str_size entries. Returns -1 when buf or data_str
+ * is NULL or str_size is zero, 0 otherwise.
+ */
+static int decode_err_data_bits(char *buf, unsigned long data,
+				const char **data_str, size_t str_size)
+{
+	size_t bit;
+
+	if (!buf || !data_str || !str_size)
+		return -1;
+
+	for (bit = 0; bit < str_size; bit++)
+		if (data & BIT(bit))
+			mce_snprintf(buf, " %s", ((char *)data_str[bit]));
+	return 0;
+}
+
+/*
+ * Decode the error_info word of one Processor Error Information entry and
+ * print to s the fields whose validation bits are set in error_info.
+ * type is the entry's error type bitmask; vendor type errors are skipped.
+ */
+static void parse_arm_err_info(struct trace_seq *s, uint32_t type, uint64_t error_info)
+{
+	uint8_t trans_type, op_type, level, participation_type, address_space;
+	uint16_t mem_attributes;
+	bool proc_context_corrupt, corrected, precise_pc, restartable_pc;
+	bool time_out, access_mode;
+
+	/*
+	 * Vendor type errors have error information values that are vendor
+	 * specific.
+	 */
+	if (type & ARM_VENDOR_ERROR)
+		return;
+
+	if (error_info & ARM_ERR_VALID_TRANSACTION_TYPE) {
+		trans_type = ((error_info >> ARM_ERR_TRANSACTION_SHIFT)
+			      & ARM_ERR_TRANSACTION_MASK);
+		if (trans_type < ARRAY_SIZE(arm_err_trans_type_strs))
+			trace_seq_printf(s, " transaction type:%s",
+					 arm_err_trans_type_strs[trans_type]);
+	}
+
+	if (error_info & ARM_ERR_VALID_OPERATION_TYPE) {
+		op_type = ((error_info >> ARM_ERR_OPERATION_SHIFT)
+			   & ARM_ERR_OPERATION_MASK);
+		if (type & ARM_CACHE_ERROR) {
+			if (op_type < ARRAY_SIZE(arm_cache_err_op_strs))
+				trace_seq_printf(s, " cache error, operation type:%s",
+						 arm_cache_err_op_strs[op_type]);
+		}
+		if (type & ARM_TLB_ERROR) {
+			if (op_type < ARRAY_SIZE(arm_tlb_err_op_strs)) {
+				trace_seq_printf(s, " TLB error, operation type: %s",
+						 arm_tlb_err_op_strs[op_type]);
+			}
+		}
+		if (type & ARM_BUS_ERROR) {
+			if (op_type < ARRAY_SIZE(arm_bus_err_op_strs)) {
+				trace_seq_printf(s, " bus error, operation type: %s",
+						 arm_bus_err_op_strs[op_type]);
+			}
+		}
+	}
+
+	if (error_info & ARM_ERR_VALID_LEVEL) {
+		level = ((error_info >> ARM_ERR_LEVEL_SHIFT)
+			 & ARM_ERR_LEVEL_MASK);
+		if (type & ARM_CACHE_ERROR)
+			trace_seq_printf(s, " cache level: %d", level);
+
+		if (type & ARM_TLB_ERROR)
+			trace_seq_printf(s, " TLB level: %d", level);
+
+		if (type & ARM_BUS_ERROR)
+			trace_seq_printf(s, " affinity level at which the bus error occurred: %d",
+					 level);
+	}
+
+	if (error_info & ARM_ERR_VALID_PROC_CONTEXT_CORRUPT) {
+		proc_context_corrupt = ((error_info >> ARM_ERR_PC_CORRUPT_SHIFT)
+					& ARM_ERR_PC_CORRUPT_MASK);
+		if (proc_context_corrupt)
+			trace_seq_printf(s, " processor context corrupted");
+		else
+			trace_seq_printf(s, " processor context not corrupted");
+	}
+
+	if (error_info & ARM_ERR_VALID_CORRECTED) {
+		corrected = ((error_info >> ARM_ERR_CORRECTED_SHIFT)
+			     & ARM_ERR_CORRECTED_MASK);
+		if (corrected)
+			trace_seq_printf(s, " the error has been corrected");
+		else
+			trace_seq_printf(s, " the error has not been corrected");
+	}
+
+	if (error_info & ARM_ERR_VALID_PRECISE_PC) {
+		precise_pc = ((error_info >> ARM_ERR_PRECISE_PC_SHIFT)
+			      & ARM_ERR_PRECISE_PC_MASK);
+		if (precise_pc)
+			trace_seq_printf(s, " PC is precise");
+		else
+			trace_seq_printf(s, " PC is imprecise");
+	}
+
+	if (error_info & ARM_ERR_VALID_RESTARTABLE_PC) {
+		restartable_pc = ((error_info >> ARM_ERR_RESTARTABLE_PC_SHIFT)
+				  & ARM_ERR_RESTARTABLE_PC_MASK);
+		if (restartable_pc)
+			trace_seq_printf(s, " Program execution can be restarted reliably at the PC associated with the error");
+	}
+
+	/* The rest of the fields are specific to bus errors */
+	if (type != ARM_BUS_ERROR)
+		return;
+
+	if (error_info & ARM_ERR_VALID_PARTICIPATION_TYPE) {
+		participation_type = ((error_info >> ARM_ERR_PARTICIPATION_TYPE_SHIFT)
+				      & ARM_ERR_PARTICIPATION_TYPE_MASK);
+		if (participation_type < ARRAY_SIZE(arm_bus_err_part_type_strs)) {
+			trace_seq_printf(s, " participation type: %s",
+					 arm_bus_err_part_type_strs[participation_type]);
+		}
+	}
+
+	if (error_info & ARM_ERR_VALID_TIME_OUT) {
+		time_out = ((error_info >> ARM_ERR_TIME_OUT_SHIFT)
+			    & ARM_ERR_TIME_OUT_MASK);
+		if (time_out)
+			trace_seq_printf(s, " request timed out");
+	}
+
+	if (error_info & ARM_ERR_VALID_ADDRESS_SPACE) {
+		address_space = ((error_info >> ARM_ERR_ADDRESS_SPACE_SHIFT)
+				 & ARM_ERR_ADDRESS_SPACE_MASK);
+		if (address_space < ARRAY_SIZE(arm_bus_err_addr_space_strs)) {
+			trace_seq_printf(s, " address space: %s",
+					 arm_bus_err_addr_space_strs[address_space]);
+		}
+	}
+
+	if (error_info & ARM_ERR_VALID_MEM_ATTRIBUTES) {
+		mem_attributes = ((error_info >> ARM_ERR_MEM_ATTRIBUTES_SHIFT)
+				  & ARM_ERR_MEM_ATTRIBUTES_MASK);
+		trace_seq_printf(s, " memory access attributes:0x%x",
+				 mem_attributes);
+	}
+
+	if (error_info & ARM_ERR_VALID_ACCESS_MODE) {
+		access_mode = ((error_info >> ARM_ERR_ACCESS_MODE_SHIFT)
+			       & ARM_ERR_ACCESS_MODE_MASK);
+		if (access_mode)
+			trace_seq_printf(s, " access mode: normal");
+		else
+			trace_seq_printf(s, " access mode: secure");
+	}
+}
+
+/*
+ * Walk the ras_arm_err_info entries in ev->pei_error, print each one to s
+ * and copy the decoded values into ev. Returns -1 when ev->pei_len is not
+ * a multiple of the entry size, 0 otherwise.
+ */
+static int parse_arm_processor_err_info(struct trace_seq *s, struct ras_arm_event *ev)
+{
+	int err_info_size = sizeof(struct ras_arm_err_info);
+	struct ras_arm_err_info *err_info;
+	int i, num_pei;
+
+	if (ev->pei_len % err_info_size != 0) {
+		log(TERM, LOG_ERR,
+		    "The event data does not match to the ARM Processor Error Information Structure\n");
+		return -1;
+	}
+	num_pei = ev->pei_len / err_info_size;
+	err_info = (struct ras_arm_err_info *)(ev->pei_error);
+
+	trace_seq_printf(s, "\nARM processor error info:\n");
+	for (i = 0; i < num_pei; ++i) {
+		decode_err_data_bits(ev->error_types, err_info->type,
+				     (const char **)arm_proc_error_type_strs,
+				     ARRAY_SIZE(arm_proc_error_type_strs));
+		trace_seq_printf(s, " error_types:%s", ev->error_types);
+
+		if (err_info->validation_bits & ARM_INFO_VALID_MULTI_ERR) {
+			ev->error_count = err_info->multiple_error + 1;
+			trace_seq_printf(s, " error_count:%d", ev->error_count);
+		}
+		if (err_info->validation_bits & ARM_INFO_VALID_FLAGS) {
+			decode_err_data_bits(ev->error_flags, err_info->flags,
+					     (const char **)arm_proc_error_flags_strs,
+					     ARRAY_SIZE(arm_proc_error_flags_strs));
+			trace_seq_printf(s, " error_flags:%s", ev->error_flags);
+		}
+		if (err_info->validation_bits & ARM_INFO_VALID_ERR_INFO) {
+			ev->error_info = err_info->error_info;
+			trace_seq_printf(s, " error_info: 0x%016llx",
+					 (unsigned long long)ev->error_info);
+			parse_arm_err_info(s, err_info->type, ev->error_info);
+		}
+		if (err_info->validation_bits & ARM_INFO_VALID_VIRT_ADDR) {
+			ev->virt_fault_addr = err_info->virt_fault_addr;
+			trace_seq_printf(s, " virtual fault address: 0x%016llx",
+					 (unsigned long long)err_info->virt_fault_addr);
+		}
+		if (err_info->validation_bits & ARM_INFO_VALID_PHYSICAL_ADDR) {
+			ev->phy_fault_addr = err_info->physical_fault_addr;
+			trace_seq_printf(s, " physical fault address: 0x%016llx",
+					 (unsigned long long)err_info->physical_fault_addr);
+		}
+		trace_seq_printf(s, "\n");
+		err_info += 1;
+	}
+
+	return 0;
+}
+
 #ifdef HAVE_CPU_FAULT_ISOLATION
 static int is_core_failure(struct ras_arm_err_info *err_info)
 {
@@ -226,6 +573,8 @@ int ras_arm_event_handler(struct trace_seq *s,
 	}
 	display_raw_data(s, ev.pei_error, ev.pei_len);
 
+	parse_arm_processor_err_info(s, &ev);
+
 	if (tep_get_field_val(s, event, "ctx_len", record, &val, 1) < 0)
 		return -1;
 	ev.ctx_len = val;
diff --git a/ras-record.c b/ras-record.c
index 490934e..8341ae8 100644
--- a/ras-record.c
+++ b/ras-record.c
@@ -215,6 +215,11 @@ static const struct db_fields arm_event_fields[] = {
 	{ .name = "err_info", .type = "BLOB" },
 	{ .name = "context_info", .type = "BLOB" },
 	{ .name = "vendor_info", .type = "BLOB" },
+	{ .name = "error_type", .type = "TEXT" },
+	{ .name = "error_flags", .type = "TEXT" },
+	{ .name = "error_info", .type = "INTEGER" },
+	{ .name = "virt_fault_addr", .type = "INTEGER" },
+	{ .name = "phy_fault_addr", .type = "INTEGER" },
 };
 
 static const struct db_table_descriptor arm_event_tab = {
@@ -244,6 +249,11 @@ int ras_store_arm_record(struct ras_events *ras, struct ras_arm_event *ev)
 			  ev->ctx_error, ev->ctx_len, NULL);
 	sqlite3_bind_blob(priv->stmt_arm_record, 9,
 			  ev->vsei_error, ev->oem_len, NULL);
+	sqlite3_bind_text(priv->stmt_arm_record, 10, ev->error_types, -1, NULL);
+	sqlite3_bind_text(priv->stmt_arm_record, 11, ev->error_flags, -1, NULL);
+	sqlite3_bind_int64(priv->stmt_arm_record, 12, ev->error_info);
+	sqlite3_bind_int64(priv->stmt_arm_record, 13, ev->virt_fault_addr);
+	sqlite3_bind_int64(priv->stmt_arm_record, 14, ev->phy_fault_addr);
 
 	rc = sqlite3_step(priv->stmt_arm_record);
 	if (rc != SQLITE_OK && rc != SQLITE_DONE)
diff --git a/ras-record.h b/ras-record.h
index e8ed263..06777a2 100644
--- a/ras-record.h
+++ b/ras-record.h
@@ -96,6 +96,11 @@ struct ras_arm_event {
 	uint32_t ctx_len;
 	const uint8_t *vsei_error;
 	uint32_t oem_len;
+	char error_types[512];
+	char error_flags[512];
+	uint64_t error_info;
+	uint64_t virt_fault_addr;
+	uint64_t phy_fault_addr;
 };
 
 struct devlink_event {
-- 
2.50.1