From 08ca0acc9dd66efc76a3d2d51758ab080562994a Mon Sep 17 00:00:00 2001 From: Ashok Raj Date: Thu, 4 Jun 2015 18:55:24 +0200 Subject: [PATCH] x86/mce: Handle Local MCE events Orabug: 25384378 Add the necessary changes to do_machine_check() to be able to process MCEs signaled as local MCEs. Typically, only recoverable errors (SRAR type) will be Signaled as LMCE. The architecture does not restrict to only those errors, however. When errors are signaled as LMCE, there is no need for the MCE handler to perform rendezvous with other logical processors unlike earlier processors that would broadcast machine check errors. Signed-off-by: Ashok Raj Signed-off-by: Borislav Petkov Cc: Andrew Morton Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Tony Luck Cc: linux-edac Link: http://lkml.kernel.org/r/1433436928-31903-17-git-send-email-bp@alien8.de Signed-off-by: Ingo Molnar (cherry picked from commit 243d657eaf540db882f73497060da5a4f7d86a90) Signed-off-by: Somasundaram Krishnasamy Reviewed-by: Jack Vogel --- arch/x86/kernel/cpu/mcheck/mce.c | 32 +++++++++++++++++++++----- arch/x86/kernel/cpu/mcheck/mce_intel.c | 1 + 2 files changed, 27 insertions(+), 6 deletions(-) diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 329ea34b7bb9..80d5fc61ec1f 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -1050,6 +1050,7 @@ void do_machine_check(struct pt_regs *regs, long error_code) char *msg = "Unknown"; u64 recover_paddr = ~0ull; int flags = MF_ACTION_REQUIRED; + int lmce = 0; /* If this CPU is offline, just bail out. */ if (cpu_is_offline(smp_processor_id())) { @@ -1088,11 +1089,20 @@ void do_machine_check(struct pt_regs *regs, long error_code) kill_it = 1; /* - * Go through all the banks in exclusion of the other CPUs. - * This way we don't report duplicated events on shared banks - * because the first one to see it will clear it. + * Check if this MCE is signaled to only this logical processor */ - order = mce_start(&no_way_out); + if (m.mcgstatus & MCG_STATUS_LMCES) + lmce = 1; + else { + /* + * Go through all the banks in exclusion of the other CPUs. + * This way we don't report duplicated events on shared banks + * because the first one to see it will clear it. + * If this is a Local MCE, then no need to perform rendezvous. + */ + order = mce_start(&no_way_out); + } + for (i = 0; i < cfg->banks; i++) { __clear_bit(i, toclear); if (!test_bit(i, valid_banks)) @@ -1169,8 +1179,18 @@ void do_machine_check(struct pt_regs *regs, long error_code) * Do most of the synchronization with other CPUs. * When there's any problem use only local no_way_out state. */ - if (mce_end(order) < 0) - no_way_out = worst >= MCE_PANIC_SEVERITY; + if (!lmce) { + if (mce_end(order) < 0) + no_way_out = worst >= MCE_PANIC_SEVERITY; + } else { + /* + * Local MCE skipped calling mce_reign() + * If we found a fatal error, we need to panic here. + */ + if (worst >= MCE_PANIC_SEVERITY && mca_cfg.tolerant < 3) + mce_panic("Machine check from unknown source", + NULL, NULL); + } /* * At insane "tolerant" levels we take no action. Otherwise diff --git a/arch/x86/kernel/cpu/mcheck/mce_intel.c b/arch/x86/kernel/cpu/mcheck/mce_intel.c index 2d872deb2c50..844f56c5616d 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_intel.c +++ b/arch/x86/kernel/cpu/mcheck/mce_intel.c @@ -452,4 +452,5 @@ void mce_intel_feature_init(struct cpuinfo_x86 *c) { intel_init_thermal(c); intel_init_cmci(); + intel_init_lmce(); } -- 2.50.1