From: Shengwei Luo <luoshengwei@huawei.com> Date: Wed, 23 Feb 2022 09:23:27 +0000 (+0800) Subject: rasdaemon: Support cpu fault isolation for recoverable errors X-Git-Tag: v0.7.0~16 X-Git-Url: https://www.infradead.org/git/?a=commitdiff_plain;h=b8f6c6200a276081079b56db1114ca77d25dcc59;p=users%2Fmchehab%2Frasdaemon.git rasdaemon: Support cpu fault isolation for recoverable errors When recoverable errors occur in a cpu core, try to offline the related cpu core. Signed-off-by: Shengwei Luo <luoshengwei@huawei.com> Signed-off-by: Junchong Pan <panjunchong@hisilicon.com> Signed-off-by: Lei Feng <fenglei47@h-partners.com> Signed-off-by: Shiju Jose <shiju.jose@huawei.com> Signed-off-by: Mauro Carvalho Chehab <mchehab@kernel.org> --- diff --git a/ras-arm-handler.c b/ras-arm-handler.c index 9c7a3c3..a0dfc51 100644 --- a/ras-arm-handler.c +++ b/ras-arm-handler.c @@ -26,6 +26,7 @@ #define ARM_ERR_VALID_ERROR_COUNT BIT(0) #define ARM_ERR_VALID_FLAGS BIT(1) +#define BIT2 2 void display_raw_data(struct trace_seq *s, const uint8_t *buf, @@ -47,7 +48,20 @@ void display_raw_data(struct trace_seq *s, } #ifdef HAVE_CPU_FAULT_ISOLATION -static int count_errors(struct ras_arm_event *ev) +static int is_core_failure(struct ras_arm_err_info *err_info) +{ + if (err_info->validation_bits & ARM_ERR_VALID_FLAGS) { + /* + * core failure: + * Bit 0\1\3: (at least 1) + * Bit 2: 0 + */ + return (err_info->flags & 0xf) && !(err_info->flags & (0x1 << BIT2)); + } + return 0; +} + +static int count_errors(struct ras_arm_event *ev, int sev) { struct ras_arm_err_info *err_info; int num_pei; @@ -75,6 +89,8 @@ static int count_errors(struct ras_arm_event *ev) */ error_count = err_info->multiple_error + 1; } + if (sev == GHES_SEV_RECOVERABLE && !is_core_failure(err_info)) + error_count = 0; num += error_count; err_info += 1; @@ -118,8 +134,8 @@ static int ras_handle_cpu_error(struct trace_seq *s, } trace_seq_printf(s, "\n severity: %s", severity); - if (val == GHES_SEV_CORRECTED) { - int nums = 
count_errors(ev); + if (val == GHES_SEV_CORRECTED || val == GHES_SEV_RECOVERABLE) { + int nums = count_errors(ev, val); if (nums > 0) { err_info.nums = nums; diff --git a/ras-cpu-isolation.c b/ras-cpu-isolation.c index 1694a08..90633fd 100644 --- a/ras-cpu-isolation.c +++ b/ras-cpu-isolation.c @@ -126,6 +126,7 @@ static int init_cpu_info(unsigned int cpus) for (unsigned int i = 0; i < cpus; ++i) { cpu_infos[i].ce_nums = 0; + cpu_infos[i].uce_nums = 0; cpu_infos[i].state = get_cpu_status(i); cpu_infos[i].ce_queue = init_queue(); @@ -306,6 +307,15 @@ static int do_ce_handler(unsigned int cpu) return HANDLE_NOTHING; } +static int do_uce_handler(unsigned int cpu) +{ + if (cpu_infos[cpu].uce_nums > 0) { + log(TERM, LOG_INFO, "Uncorrected Errors occurred, try to offline cpu%u\n", cpu); + return do_cpu_offline(cpu); + } + return HANDLE_NOTHING; +} + static int error_handler(unsigned int cpu, struct error_info *err_info) { int ret = HANDLE_NOTHING; @@ -314,6 +324,9 @@ static int error_handler(unsigned int cpu, struct error_info *err_info) case CE: ret = do_ce_handler(cpu); break; + case UCE: + ret = do_uce_handler(cpu); + break; default: break; } @@ -336,6 +349,9 @@ static void record_error_info(unsigned int cpu, struct error_info *err_info) cpu_infos[cpu].ce_nums += err_info->nums; break; } + case UCE: + cpu_infos[cpu].uce_nums++; + break; default: break; } @@ -382,6 +398,7 @@ void ras_record_cpu_error(struct error_info *err_info, int cpu) cpu, cpu_state[cpu_infos[cpu].state]); clear_queue(cpu_infos[cpu].ce_queue); cpu_infos[cpu].ce_nums = 0; + cpu_infos[cpu].uce_nums = 0; } else log(TERM, LOG_WARNING, "Offline cpu%d fail, the state is %s\n", cpu, cpu_state[cpu_infos[cpu].state]); diff --git a/ras-cpu-isolation.h b/ras-cpu-isolation.h index 35b5225..5682106 100644 --- a/ras-cpu-isolation.h +++ b/ras-cpu-isolation.h @@ -45,10 +45,12 @@ enum error_handle_result { }; enum error_type { - CE = 1 + CE = 1, + UCE }; struct cpu_info { + unsigned long uce_nums; unsigned long 
ce_nums; struct link_queue *ce_queue; enum cpu_state state;