]> www.infradead.org Git - users/mchehab/rasdaemon.git/commitdiff
rasdaemon: Support cpu fault isolation for recoverable errors
author Shengwei Luo <luoshengwei@huawei.com>
Wed, 23 Feb 2022 09:23:27 +0000 (17:23 +0800)
committer Mauro Carvalho Chehab <mchehab@kernel.org>
Sat, 21 Jan 2023 06:30:37 +0000 (07:30 +0100)
When a recoverable error occurs in a CPU core, try to offline
the affected CPU core.

Signed-off-by: Shengwei Luo <luoshengwei@huawei.com>
Signed-off-by: Junchong Pan <panjunchong@hisilicon.com>
Signed-off-by: Lei Feng <fenglei47@h-partners.com>
Signed-off-by: Shiju Jose <shiju.jose@huawei.com>
Signed-off-by: Mauro Carvalho Chehab <mchehab@kernel.org>
ras-arm-handler.c
ras-cpu-isolation.c
ras-cpu-isolation.h

index 9c7a3c38f2fa4a17d43d237537a93adaaf4e38b8..a0dfc512d8bcce3aae2e339c621938ff70c98feb 100644 (file)
@@ -26,6 +26,7 @@
 
 #define ARM_ERR_VALID_ERROR_COUNT BIT(0)
 #define ARM_ERR_VALID_FLAGS BIT(1)
+#define BIT2 2
 
 void display_raw_data(struct trace_seq *s,
                const uint8_t *buf,
@@ -47,7 +48,20 @@ void display_raw_data(struct trace_seq *s,
 }
 
 #ifdef HAVE_CPU_FAULT_ISOLATION
-static int count_errors(struct ras_arm_event *ev)
+static int is_core_failure(struct ras_arm_err_info *err_info)
+{
+       if (err_info->validation_bits & ARM_ERR_VALID_FLAGS) {
+               /*
+                * Core failure, judged from the low four bits of flags:
+                * Bit 0/1/3: at least one of them is set
+                * Bit 2: must be 0
+                */
+               return (err_info->flags & 0xf) && !(err_info->flags & (0x1 << BIT2));
+       }
+       return 0; /* ARM_ERR_VALID_FLAGS not set in validation_bits: flags is not consulted */
+}
+
+static int count_errors(struct ras_arm_event *ev, int sev)
 {
        struct ras_arm_err_info *err_info;
        int num_pei;
@@ -75,6 +89,8 @@ static int count_errors(struct ras_arm_event *ev)
                         */
                        error_count = err_info->multiple_error + 1;
                }
+               if (sev == GHES_SEV_RECOVERABLE && !is_core_failure(err_info))
+                       error_count = 0;
 
                num += error_count;
                err_info += 1;
@@ -118,8 +134,8 @@ static int ras_handle_cpu_error(struct trace_seq *s,
        }
        trace_seq_printf(s, "\n severity: %s", severity);
 
-       if (val == GHES_SEV_CORRECTED) {
-               int nums = count_errors(ev);
+       if (val == GHES_SEV_CORRECTED || val == GHES_SEV_RECOVERABLE) {
+               int nums = count_errors(ev, val);
 
                if (nums > 0) {
                        err_info.nums = nums;
index 1694a08342bda02aa8853c30e4897ee91c8102d0..90633fdda78b62ff4349557430c958405ab20d64 100644 (file)
@@ -126,6 +126,7 @@ static int init_cpu_info(unsigned int cpus)
 
        for (unsigned int i = 0; i < cpus; ++i) {
                cpu_infos[i].ce_nums = 0;
+               cpu_infos[i].uce_nums = 0;
                cpu_infos[i].state = get_cpu_status(i);
                cpu_infos[i].ce_queue = init_queue();
 
@@ -306,6 +307,15 @@ static int do_ce_handler(unsigned int cpu)
        return HANDLE_NOTHING;
 }
 
+static int do_uce_handler(unsigned int cpu)
+{
+       if (cpu_infos[cpu].uce_nums > 0) { /* any recorded uncorrected error triggers an offline attempt */
+               log(TERM, LOG_INFO, "Uncorrected Errors occurred, try to offline cpu%u\n", cpu);
+               return do_cpu_offline(cpu);
+       }
+       return HANDLE_NOTHING; /* no uncorrected error recorded for this cpu */
+}
+
 static int error_handler(unsigned int cpu, struct error_info *err_info)
 {
        int ret = HANDLE_NOTHING;
@@ -314,6 +324,9 @@ static int error_handler(unsigned int cpu, struct error_info *err_info)
        case CE:
                ret = do_ce_handler(cpu);
                break;
+       case UCE:
+               ret = do_uce_handler(cpu);
+               break;
        default:
                break;
        }
@@ -336,6 +349,9 @@ static void record_error_info(unsigned int cpu, struct error_info *err_info)
                cpu_infos[cpu].ce_nums += err_info->nums;
                break;
        }
+       case UCE:
+               cpu_infos[cpu].uce_nums++;
+               break;
        default:
                break;
        }
@@ -382,6 +398,7 @@ void ras_record_cpu_error(struct error_info *err_info, int cpu)
                        cpu, cpu_state[cpu_infos[cpu].state]);
                clear_queue(cpu_infos[cpu].ce_queue);
                cpu_infos[cpu].ce_nums = 0;
+               cpu_infos[cpu].uce_nums = 0;
        } else
                log(TERM, LOG_WARNING, "Offline cpu%d fail, the state is %s\n",
                        cpu, cpu_state[cpu_infos[cpu].state]);
index 35b522566be6bd3eaf36685cea6870e9e1af1755..5682106a7089abbc4850f8b5b8f2a47cd8553ba5 100644 (file)
@@ -45,10 +45,12 @@ enum error_handle_result {
 };
 
 enum error_type {
-       CE = 1
+       CE = 1,
+       UCE     /* uncorrected error */
 };
 
 struct cpu_info {
+       unsigned long uce_nums;
        unsigned long ce_nums;
        struct link_queue *ce_queue;
        enum cpu_state state;