From 5fedaad27c600b2824b1eb2d589df0a7280ad9eb Mon Sep 17 00:00:00 2001 From: Jiri Kosina Date: Thu, 5 Nov 2015 18:44:41 -0800 Subject: [PATCH] kernel/watchdog.c: perform all-CPU backtrace in case of hard lockup In many cases of hardlockup reports, it's actually not possible to know why it triggered, because the CPU that got stuck is usually waiting on a resource (with IRQs disabled) in posession of some other CPU is holding. IOW, we are often looking at the stacktrace of the victim and not the actual offender. Introduce sysctl / cmdline parameter that makes it possible to have hardlockup detector perform all-CPU backtrace. Signed-off-by: Jiri Kosina Reviewed-by: Aaron Tomlin Cc: Ulrich Obergfell Acked-by: Don Zickus Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds (cherry picked from commit 55537871ef666b4153fd1ef8782e4a13fee142cc) Signed-off-by: Vijay Kumar Orabug: 24327572 --- Documentation/kernel-parameters.txt | 5 +++++ Documentation/sysctl/kernel.txt | 12 +++++++++++ include/linux/nmi.h | 1 + kernel/sysctl.c | 9 ++++++++ kernel/watchdog.c | 33 ++++++++++++++++++++++++----- 5 files changed, 55 insertions(+), 5 deletions(-) diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index 6726139bd289..787abaa80a6a 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -1223,6 +1223,11 @@ bytes respectively. Such letter suffixes can also be entirely omitted. Format: such that (rxsize & ~0x1fffc0) == 0. Default: 1024 + hardlockup_all_cpu_backtrace= + [KNL] Should the hard-lockup detector generate + backtraces on all cpus. + Format: + hashdist= [KNL,NUMA] Large hashes allocated during boot are distributed across NUMA nodes. Defaults on for 64-bit NUMA, off otherwise. diff --git a/Documentation/sysctl/kernel.txt b/Documentation/sysctl/kernel.txt index c831001c45f1..5a4962cd5e1f 100644 --- a/Documentation/sysctl/kernel.txt +++ b/Documentation/sysctl/kernel.txt @@ -33,6 +33,7 @@ show up in /proc/sys/kernel: - domainname - hostname - hotplug +- hardlockup_all_cpu_backtrace - hung_task_panic - hung_task_check_count - hung_task_timeout_secs @@ -292,6 +293,17 @@ Information Service) or YP (Yellow Pages) domainname. These two domain names are in general different. For a detailed discussion see the hostname(1) man page. +============================================================== +hardlockup_all_cpu_backtrace: + +This value controls the hard lockup detector behavior when a hard +lockup condition is detected as to whether or not to gather further +debug information. If enabled, arch-specific all-CPU stack dumping +will be initiated. + +0: do nothing. This is the default behavior. + +1: on detection capture more debug information. ============================================================== hotplug: diff --git a/include/linux/nmi.h b/include/linux/nmi.h index 3d46fb4708e0..d087348a841c 100644 --- a/include/linux/nmi.h +++ b/include/linux/nmi.h @@ -68,6 +68,7 @@ extern int soft_watchdog_enabled; extern int watchdog_user_enabled; extern int watchdog_thresh; extern int sysctl_softlockup_all_cpu_backtrace; +extern int sysctl_hardlockup_all_cpu_backtrace; struct ctl_table; extern int proc_watchdog(struct ctl_table *, int , void __user *, size_t *, loff_t *); diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 1cb32c89972c..5edf47efaede 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -899,6 +899,15 @@ static struct ctl_table kern_table[] = { .extra1 = &zero, .extra2 = &one, }, + { + .procname = "hardlockup_all_cpu_backtrace", + .data = &sysctl_hardlockup_all_cpu_backtrace, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &zero, + .extra2 = &one, + }, #endif /* CONFIG_SMP */ #endif #if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86) diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 581a68a04c64..07a03dcd0630 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -55,8 +55,10 @@ int __read_mostly watchdog_thresh = 10; #ifdef CONFIG_SMP int __read_mostly sysctl_softlockup_all_cpu_backtrace; +int __read_mostly sysctl_hardlockup_all_cpu_backtrace; #else #define sysctl_softlockup_all_cpu_backtrace 0 +#define sysctl_hardlockup_all_cpu_backtrace 0 #endif static int __read_mostly watchdog_running; @@ -85,6 +87,7 @@ static unsigned long soft_lockup_nmi_warn; #ifdef CONFIG_HARDLOCKUP_DETECTOR static int hardlockup_panic = CONFIG_BOOTPARAM_HARDLOCKUP_PANIC_VALUE; +static unsigned long hardlockup_allcpu_dumped; /* * We may not want to enable hard lockup detection by default in all cases, * for example when running the kernel as a guest on a hypervisor. In these @@ -146,6 +149,13 @@ static int __init softlockup_all_cpu_backtrace_setup(char *str) return 1; } __setup("softlockup_all_cpu_backtrace=", softlockup_all_cpu_backtrace_setup); +static int __init hardlockup_all_cpu_backtrace_setup(char *str) +{ + sysctl_hardlockup_all_cpu_backtrace = + !!simple_strtol(str, NULL, 0); + return 1; +} +__setup("hardlockup_all_cpu_backtrace=", hardlockup_all_cpu_backtrace_setup); #endif /* @@ -291,17 +301,30 @@ static void watchdog_overflow_callback(struct perf_event *event, */ if (is_hardlockup()) { int this_cpu = smp_processor_id(); + struct pt_regs *regs = get_irq_regs(); /* only print hardlockups once */ if (__this_cpu_read(hard_watchdog_warn) == true) return; - if (hardlockup_panic) - panic("Watchdog detected hard LOCKUP on cpu %d", - this_cpu); + pr_emerg("Watchdog detected hard LOCKUP on cpu %d", this_cpu); + print_modules(); + print_irqtrace_events(current); + if (regs) + show_regs(regs); else - WARN(1, "Watchdog detected hard LOCKUP on cpu %d", - this_cpu); + dump_stack(); + + /* + * Perform all-CPU dump only once to avoid multiple hardlockups + * generating interleaving traces + */ + if (sysctl_hardlockup_all_cpu_backtrace && + !test_and_set_bit(0, &hardlockup_allcpu_dumped)) + trigger_allbutself_cpu_backtrace(); + + if (hardlockup_panic) + panic("Hard LOCKUP"); __this_cpu_write(hard_watchdog_warn, true); return; -- 2.50.1