]> www.infradead.org Git - users/jedix/linux-maple.git/commitdiff
x86/ras/therm_throt: Do not log a fake MCE for thermal events
authorBorislav Petkov <bp@suse.de>
Mon, 23 Jan 2017 18:35:07 +0000 (19:35 +0100)
committerEthan Zhao <ethan.zhao@oracle.com>
Wed, 28 Jun 2017 22:51:35 +0000 (07:51 +0900)
We log a fake bank 128 MCE to note that we're handling a CPU thermal
event. However, this confuses people into thinking that their hardware
generates MCEs. Hijacking MCA for logging thermal events is a gross
misuse anyway and it shouldn't have been done in the first place. And
besides we have other means for dealing with thermal events which are
much more suitable.

So let's kill the MCE logging part.

Signed-off-by: Borislav Petkov <bp@suse.de>
Acked-by: Ashok Raj <ashok.raj@intel.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Tony Luck <tony.luck@intel.com>
Cc: Yazen Ghannam <Yazen.Ghannam@amd.com>
Cc: linux-edac <linux-edac@vger.kernel.org>
Link: http://lkml.kernel.org/r/20170105213846.GA12024@gmail.com
Link: http://lkml.kernel.org/r/20170123183514.13356-3-bp@alien8.de
Signed-off-by: Ingo Molnar <mingo@kernel.org>
(cherry picked from commit 9b052ea4ced0fa1ad30a2eafe86984a16297e6f1)

Orabug: 26361336

Signed-off-by: Ethan Zhao <ethan.zhao@oracle.com>
Reviewed-by: Jack Vogel <jack.vogel@oracle.com>
arch/x86/include/asm/mce.h
arch/x86/kernel/cpu/mcheck/mce.c
arch/x86/kernel/cpu/mcheck/therm_throt.c

index 7968b3ba48730bf5f444d7f59521f99e59409b54..db62cd30eb7d34e67437dadfa42cfca5bba86a93 100644 (file)
 
 #define MCE_OVERFLOW 0         /* bit 0 in flags means overflow */
 
-/* Software defined banks */
-#define MCE_EXTENDED_BANK      128
-#define MCE_THERMAL_BANK       (MCE_EXTENDED_BANK + 0)
-
 #define MCE_LOG_LEN 32
 #define MCE_LOG_SIGNATURE      "MACHINECHECK"
 
@@ -239,8 +235,6 @@ extern void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu);
 
 void intel_init_thermal(struct cpuinfo_x86 *c);
 
-void mce_log_therm_throt_event(__u64 status);
-
 /* Interrupt Handler for core thermal thresholds */
 extern int (*platform_thermal_notify)(__u64 msr_val);
 
index f28540c1f008b4b9544a9d1ff45c08892272326d..ff78767c42ee2bcb868c1137fd327403e63bf788 100644 (file)
@@ -1304,31 +1304,6 @@ static void mce_process_work(struct work_struct *dummy)
                memory_failure(pfn, MCE_VECTOR, 0);
 }
 
-#ifdef CONFIG_X86_MCE_INTEL
-/***
- * mce_log_therm_throt_event - Logs the thermal throttling event to mcelog
- * @cpu: The CPU on which the event occurred.
- * @status: Event status information
- *
- * This function should be called by the thermal interrupt after the
- * event has been processed and the decision was made to log the event
- * further.
- *
- * The status parameter will be saved to the 'status' field of 'struct mce'
- * and historically has been the register value of the
- * MSR_IA32_THERMAL_STATUS (Intel) msr.
- */
-void mce_log_therm_throt_event(__u64 status)
-{
-       struct mce m;
-
-       mce_setup(&m);
-       m.bank = MCE_THERMAL_BANK;
-       m.status = status;
-       mce_log(&m);
-}
-#endif /* CONFIG_X86_MCE_INTEL */
-
 /*
  * Periodic polling timer for "silent" machine check errors.  If the
  * poller finds an MCE, poll 2x faster.  When the poller finds no more
index 56270f0f05e615d2e6d190410bd9f5cdf320e060..c9442c63d3c70c0e748e8cc46e1a7662d72f2845 100644 (file)
@@ -6,7 +6,7 @@
  *
  * Maintains a counter in /sys that keeps track of the number of thermal
  * events, such that the user knows how bad the thermal problem might be
- * (since the logging to syslog and mcelog is rate limited).
+ * (since the logging to syslog is rate limited).
  *
  * Author: Dmitriy Zavin (dmitriyz@google.com)
  *
@@ -142,13 +142,8 @@ static struct attribute_group thermal_attr_group = {
  * IRQ has been acknowledged.
  *
  * It will take care of rate limiting and printing messages to the syslog.
- *
- * Returns: 0 : Event should NOT be further logged, i.e. still in
- *              "timeout" from previous log message.
- *          1 : Event should be logged further, and a message has been
- *              printed to the syslog.
  */
-static int therm_throt_process(bool new_event, int event, int level)
+static void therm_throt_process(bool new_event, int event, int level)
 {
        struct _thermal_state *state;
        unsigned int this_cpu = smp_processor_id();
@@ -163,16 +158,16 @@ static int therm_throt_process(bool new_event, int event, int level)
                else if (event == POWER_LIMIT_EVENT)
                        state = &pstate->core_power_limit;
                else
-                        return 0;
+                       return;
        } else if (level == PACKAGE_LEVEL) {
                if (event == THERMAL_THROTTLING_EVENT)
                        state = &pstate->package_throttle;
                else if (event == POWER_LIMIT_EVENT)
                        state = &pstate->package_power_limit;
                else
-                       return 0;
+                       return;
        } else
-               return 0;
+               return;
 
        old_event = state->new_event;
        state->new_event = new_event;
@@ -182,7 +177,7 @@ static int therm_throt_process(bool new_event, int event, int level)
 
        if (time_before64(now, state->next_check) &&
                        state->count != state->last_count)
-               return 0;
+               return;
 
        state->next_check = now + CHECK_INTERVAL;
        state->last_count = state->count;
@@ -194,17 +189,15 @@ static int therm_throt_process(bool new_event, int event, int level)
                                this_cpu,
                                level == CORE_LEVEL ? "Core" : "Package",
                                state->count);
-               return 1;
+               return;
        }
        if (old_event) {
                if (event == THERMAL_THROTTLING_EVENT)
                        printk(KERN_INFO "CPU%d: %s temperature/speed normal\n",
                                this_cpu,
                                level == CORE_LEVEL ? "Core" : "Package");
-               return 1;
+               return;
        }
-
-       return 0;
 }
 
 static int thresh_event_valid(int level, int event)
@@ -393,10 +386,9 @@ static void intel_thermal_interrupt(void)
        /* Check for violation of core thermal thresholds*/
        notify_thresholds(msr_val);
 
-       if (therm_throt_process(msr_val & THERM_STATUS_PROCHOT,
-                               THERMAL_THROTTLING_EVENT,
-                               CORE_LEVEL) != 0)
-               mce_log_therm_throt_event(msr_val);
+       therm_throt_process(msr_val & THERM_STATUS_PROCHOT,
+                           THERMAL_THROTTLING_EVENT,
+                           CORE_LEVEL);
 
        if (this_cpu_has(X86_FEATURE_PLN) && int_pln_enable)
                therm_throt_process(msr_val & THERM_STATUS_POWER_LIMIT,