Add a function to check that an offline CPU left the tracing infrastructure
in a sane state. The acpi_idle_play_dead() function was recently observed¹
calling safe_halt() instead of raw_safe_halt(), which had the side-effect
of setting the hardirqs_enabled flag for the offline CPU. On x86 this
triggered lockdep warnings when the CPU came back online, but too early
for the exception to be handled correctly, leading to a triple-fault.
Add lockdep_cleanup_dead_cpu() to check for this kind of failure mode,
print the events leading up to it, and correct it so that the CPU can
come online again correctly.
[ 61.556652] smpboot: CPU 1 is now offline
[ 61.556769] CPU 1 left hardirqs enabled!
[ 61.556915] irq event stamp: 128149
[ 61.556965] hardirqs last enabled at (128149): [<
ffffffff81720a36>] acpi_idle_play_dead+0x46/0x70
[ 61.557055] hardirqs last disabled at (128148): [<
ffffffff81124d50>] do_idle+0x90/0xe0
[ 61.557117] softirqs last enabled at (128078): [<
ffffffff81cec74c>] __do_softirq+0x31c/0x423
[ 61.557199] softirqs last disabled at (128065): [<
ffffffff810baae1>] __irq_exit_rcu+0x91/0x100
¹ https://lore.kernel.org/lkml/
a079bba5a0e47d6534b307553fc3772d26ce911b.camel@infradead.org/
Signed-off-by: David Woodhouse <dwmw@amazon.co.uk>
Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
#include <asm/irqflags.h>
#include <asm/percpu.h>
+struct task_struct;
+
/* Currently lockdep_softirqs_on/off is used only by lockdep */
#ifdef CONFIG_PROVE_LOCKING
extern void lockdep_softirqs_on(unsigned long ip);
extern void lockdep_hardirqs_on_prepare(void);
extern void lockdep_hardirqs_on(unsigned long ip);
extern void lockdep_hardirqs_off(unsigned long ip);
+ extern void lockdep_cleanup_dead_cpu(unsigned int cpu,
+ struct task_struct *idle);
#else
static inline void lockdep_softirqs_on(unsigned long ip) { }
static inline void lockdep_softirqs_off(unsigned long ip) { }
static inline void lockdep_hardirqs_on_prepare(void) { }
static inline void lockdep_hardirqs_on(unsigned long ip) { }
static inline void lockdep_hardirqs_off(unsigned long ip) { }
+ static inline void lockdep_cleanup_dead_cpu(unsigned int cpu,
+ struct task_struct *idle) {}
#endif
#ifdef CONFIG_TRACE_IRQFLAGS
cpuhp_bp_sync_dead(cpu);
+ lockdep_cleanup_dead_cpu(cpu, idle_thread_get(cpu));
tick_cleanup_dead_cpu(cpu);
/*
debug_atomic_inc(redundant_softirqs_off);
}
+/**
+ * lockdep_cleanup_dead_cpu - Ensure CPU lockdep state is cleanly stopped
+ *
+ * @cpu: index of offlined CPU
+ * @idle: task pointer for offlined CPU's idle thread
+ *
+ * Invoked after the CPU is dead. Ensures that the tracing infrastructure
+ * is left in a suitable state for the CPU to be subsequently brought
+ * online again.
+ */
+void lockdep_cleanup_dead_cpu(unsigned int cpu, struct task_struct *idle)
+{
+ if (unlikely(!debug_locks))
+ return;
+
+ if (unlikely(per_cpu(hardirqs_enabled, cpu))) {
+ pr_warn("CPU %u left hardirqs enabled!", cpu);
+ if (idle)
+ print_irqtrace_events(idle);
+ /* Clean it up for when the CPU comes online again. */
+ per_cpu(hardirqs_enabled, cpu) = 0;
+ }
+}
+
static int
mark_usage(struct task_struct *curr, struct held_lock *hlock, int check)
{