]> www.infradead.org Git - users/jedix/linux-maple.git/commitdiff
kernel/watchdog.c: perform all-CPU backtrace in case of hard lockup
author: Jiri Kosina <jkosina@suse.cz>
Fri, 6 Nov 2015 02:44:41 +0000 (18:44 -0800)
committer: Chuck Anderson <chuck.anderson@oracle.com>
Mon, 31 Oct 2016 10:40:06 +0000 (03:40 -0700)
In many cases of hardlockup reports, it's actually not possible to know
why it triggered, because the CPU that got stuck is usually waiting on a
resource (with IRQs disabled) that some other CPU is holding.

IOW, we are often looking at the stacktrace of the victim and not the
actual offender.

Introduce sysctl / cmdline parameter that makes it possible to have
hardlockup detector perform all-CPU backtrace.

Signed-off-by: Jiri Kosina <jkosina@suse.cz>
Reviewed-by: Aaron Tomlin <atomlin@redhat.com>
Cc: Ulrich Obergfell <uobergfe@redhat.com>
Acked-by: Don Zickus <dzickus@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
(cherry picked from commit 55537871ef666b4153fd1ef8782e4a13fee142cc)

Signed-off-by: Vijay Kumar <vijay.ac.kumar@oracle.com>
Orabug: 24327572

Documentation/kernel-parameters.txt
Documentation/sysctl/kernel.txt
include/linux/nmi.h
kernel/sysctl.c
kernel/watchdog.c

index 6726139bd2899038e77ae15f9901351773dd324b..787abaa80a6acda485f2ffd7ef58d4ecc6c94ca6 100644 (file)
@@ -1223,6 +1223,11 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
                        Format: <unsigned int> such that (rxsize & ~0x1fffc0) == 0.
                        Default: 1024
 
+       hardlockup_all_cpu_backtrace=
+                       [KNL] Should the hard-lockup detector generate
+                       backtraces on all cpus.
+                       Format: <integer>
+
        hashdist=       [KNL,NUMA] Large hashes allocated during boot
                        are distributed across NUMA nodes.  Defaults on
                        for 64-bit NUMA, off otherwise.
index c831001c45f1162334b7a30544853a8baa47c6a5..5a4962cd5e1fe7ea154f14b4a277912a787ae013 100644 (file)
@@ -33,6 +33,7 @@ show up in /proc/sys/kernel:
 - domainname
 - hostname
 - hotplug
+- hardlockup_all_cpu_backtrace
 - hung_task_panic
 - hung_task_check_count
 - hung_task_timeout_secs
@@ -292,6 +293,17 @@ Information Service) or YP (Yellow Pages) domainname. These two
 domain names are in general different. For a detailed discussion
 see the hostname(1) man page.
 
+==============================================================
+hardlockup_all_cpu_backtrace:
+
+This value controls the hard lockup detector behavior when a hard
+lockup condition is detected as to whether or not to gather further
+debug information. If enabled, arch-specific all-CPU stack dumping
+will be initiated.
+
+0: do nothing. This is the default behavior.
+
+1: on detection capture more debug information.
 ==============================================================
 
 hotplug:
index 3d46fb4708e051ef78f4be83651f4a707a14b56d..d087348a841c58c4b6fae32d864f1359be2fe4f2 100644 (file)
@@ -68,6 +68,7 @@ extern int soft_watchdog_enabled;
 extern int watchdog_user_enabled;
 extern int watchdog_thresh;
 extern int sysctl_softlockup_all_cpu_backtrace;
+extern int sysctl_hardlockup_all_cpu_backtrace;
 struct ctl_table;
 extern int proc_watchdog(struct ctl_table *, int ,
                         void __user *, size_t *, loff_t *);
index 1cb32c89972c1419c120eaa9f18c95e8479cafd0..5edf47efaedee435bbe605845ad3e5a27129074d 100644 (file)
@@ -899,6 +899,15 @@ static struct ctl_table kern_table[] = {
                .extra1         = &zero,
                .extra2         = &one,
        },
+       {
+               .procname       = "hardlockup_all_cpu_backtrace",
+               .data           = &sysctl_hardlockup_all_cpu_backtrace,
+               .maxlen         = sizeof(int),
+               .mode           = 0644,
+               .proc_handler   = proc_dointvec_minmax,
+               .extra1         = &zero,
+               .extra2         = &one,
+       },
 #endif /* CONFIG_SMP */
 #endif
 #if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86)
index 581a68a04c64089b847d3b76d1abc138a83bb209..07a03dcd0630b9edfd52447300d691c9fa625445 100644 (file)
@@ -55,8 +55,10 @@ int __read_mostly watchdog_thresh = 10;
 
 #ifdef CONFIG_SMP
 int __read_mostly sysctl_softlockup_all_cpu_backtrace;
+int __read_mostly sysctl_hardlockup_all_cpu_backtrace;
 #else
 #define sysctl_softlockup_all_cpu_backtrace 0
+#define sysctl_hardlockup_all_cpu_backtrace 0
 #endif
 
 static int __read_mostly watchdog_running;
@@ -85,6 +87,7 @@ static unsigned long soft_lockup_nmi_warn;
 #ifdef CONFIG_HARDLOCKUP_DETECTOR
 static int hardlockup_panic =
                        CONFIG_BOOTPARAM_HARDLOCKUP_PANIC_VALUE;
+static unsigned long hardlockup_allcpu_dumped;
 /*
  * We may not want to enable hard lockup detection by default in all cases,
  * for example when running the kernel as a guest on a hypervisor. In these
@@ -146,6 +149,13 @@ static int __init softlockup_all_cpu_backtrace_setup(char *str)
        return 1;
 }
 __setup("softlockup_all_cpu_backtrace=", softlockup_all_cpu_backtrace_setup);
+static int __init hardlockup_all_cpu_backtrace_setup(char *str)
+{
+       sysctl_hardlockup_all_cpu_backtrace =
+               !!simple_strtol(str, NULL, 0);
+       return 1;
+}
+__setup("hardlockup_all_cpu_backtrace=", hardlockup_all_cpu_backtrace_setup);
 #endif
 
 /*
@@ -291,17 +301,30 @@ static void watchdog_overflow_callback(struct perf_event *event,
         */
        if (is_hardlockup()) {
                int this_cpu = smp_processor_id();
+               struct pt_regs *regs = get_irq_regs();
 
                /* only print hardlockups once */
                if (__this_cpu_read(hard_watchdog_warn) == true)
                        return;
 
-               if (hardlockup_panic)
-                       panic("Watchdog detected hard LOCKUP on cpu %d",
-                             this_cpu);
+               pr_emerg("Watchdog detected hard LOCKUP on cpu %d", this_cpu);
+               print_modules();
+               print_irqtrace_events(current);
+               if (regs)
+                       show_regs(regs);
                else
-                       WARN(1, "Watchdog detected hard LOCKUP on cpu %d",
-                            this_cpu);
+                       dump_stack();
+
+               /*
+                * Perform all-CPU dump only once to avoid multiple hardlockups
+                * generating interleaving traces
+                */
+               if (sysctl_hardlockup_all_cpu_backtrace &&
+                               !test_and_set_bit(0, &hardlockup_allcpu_dumped))
+                       trigger_allbutself_cpu_backtrace();
+
+               if (hardlockup_panic)
+                       panic("Hard LOCKUP");
 
                __this_cpu_write(hard_watchdog_warn, true);
                return;