From 46133bfad7ade5af9c3e1c0cc4a1c9b18b492c78 Mon Sep 17 00:00:00 2001 From: Ashok Vairavan Date: Mon, 26 Sep 2016 19:50:51 -0700 Subject: [PATCH] Allow mce to reset instead of panic on UE The intent of this patch is to ensure that the mce stack is not put in the panic stack trace when the kernel reboots due to the Uncorrectable Error. The mce stack in the panic trace confuses the administrator and falsely implicates mce module as a culprit. Hence a synchronization flag is added to machine restart the system when it experience uncorrectable error. Orabug: 24745271 Signed-off-by: Ashok Vairavan Reviewed-by: Dan Duval --- arch/x86/kernel/cpu/mcheck/mce.c | 42 ++++++++++++++++++++++++++++---- 1 file changed, 37 insertions(+), 5 deletions(-) diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 20190bdac9d5..528ddd1ff354 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -41,6 +41,7 @@ #include #include #include +#include #include #include @@ -293,6 +294,7 @@ static atomic_t mce_panicked; static int fake_panic; static atomic_t mce_fake_panicked; +static atomic_t mce_reboot; /* Panic in progress. Enable interrupts and wait for final IPI */ static void wait_for_panic(void) @@ -301,8 +303,29 @@ static void wait_for_panic(void) preempt_disable(); local_irq_enable(); + + /* + * If two cpus get into mce_panic() very close together, + * the second CPU will wait for mce_reboot flag. The lead CPU + * should set the flag within PANIC_TIMEOUT seconds. + */ while (timeout-- > 0) + { + /* + * The intent of this patch is to ensure that the mce stack + * is not put in the panic stack trace when the kernel reboots + * due to the Uncorrectable Error. The mce stack in the panic trace + * confuses the administrator and falsely implicates mce module + * as a culprit. + */ + if(atomic_read(&mce_reboot)) + machine_emergency_restart(); udelay(1); + } + /* + * If the mce_reboot flag is not set within PANIC_TIMEOUT secs + * call kernel panic. + */ if (panic_timeout == 0) panic_timeout = mca_cfg.panic_timeout; panic("Panicing machine check CPU died"); @@ -311,6 +334,7 @@ static void wait_for_panic(void) static void mce_panic(const char *msg, struct mce *final, char *exp) { int i, apei_err = 0; + bool uncorrectable = false; if (!fake_panic) { /* @@ -350,6 +374,13 @@ static void mce_panic(const char *msg, struct mce *final, char *exp) if (!apei_err) apei_err = apei_write_mce(m); } + /* + * Set the flag for any uncorreactable errors. But turn on + * mce_reboot flag only after printing MCE errors. Otherwise, + * we may end up restarting the system without printing mce + * logs. + */ + uncorrectable = true; } if (final) { print_mce(final); @@ -360,11 +391,11 @@ static void mce_panic(const char *msg, struct mce *final, char *exp) pr_emerg(HW_ERR "Some CPUs didn't answer in synchronization\n"); if (exp) pr_emerg(HW_ERR "Machine check: %s\n", exp); - if (!fake_panic) { - if (panic_timeout == 0) - panic_timeout = mca_cfg.panic_timeout; - panic(msg); - } else + if(uncorrectable) + atomic_inc(&mce_reboot); + if (!fake_panic) + wait_for_panic(); + else pr_emerg(HW_ERR "Fake kernel panic: %s\n", msg); } @@ -2527,6 +2558,7 @@ static void mce_reset(void) { cpu_missing = 0; atomic_set(&mce_fake_panicked, 0); + atomic_set(&mce_reboot, 0); atomic_set(&mce_executing, 0); atomic_set(&mce_callin, 0); atomic_set(&global_nwo, 0); -- 2.50.1