www.infradead.org Git - users/dwmw2/linux.git/commitdiff
lockdep: add lockdep_cleanup_dead_cpu()
author David Woodhouse <dwmw@amazon.co.uk>
Sat, 28 Oct 2023 10:11:51 +0000 (11:11 +0100)
committer David Woodhouse <dwmw@amazon.co.uk>
Tue, 24 Sep 2024 14:14:42 +0000 (15:14 +0100)
Add a function to check that an offline CPU left the tracing infrastructure
in a sane state. The acpi_idle_play_dead() function was recently observed¹
calling safe_halt() instead of raw_safe_halt(), which had the side-effect
of setting the hardirqs_enabled flag for the offline CPU. On x86 this
triggered lockdep warnings when the CPU came back online, but too early
for the exception to be handled correctly, leading to a triple-fault.

Add lockdep_cleanup_dead_cpu() to check for this kind of failure mode,
print the events leading up to it, and correct it so that the CPU can
come online again correctly.

[   61.556652] smpboot: CPU 1 is now offline
[   61.556769] CPU 1 left hardirqs enabled!
[   61.556915] irq event stamp: 128149
[   61.556965] hardirqs last  enabled at (128149): [<ffffffff81720a36>] acpi_idle_play_dead+0x46/0x70
[   61.557055] hardirqs last disabled at (128148): [<ffffffff81124d50>] do_idle+0x90/0xe0
[   61.557117] softirqs last  enabled at (128078): [<ffffffff81cec74c>] __do_softirq+0x31c/0x423
[   61.557199] softirqs last disabled at (128065): [<ffffffff810baae1>] __irq_exit_rcu+0x91/0x100

¹ https://lore.kernel.org/lkml/a079bba5a0e47d6534b307553fc3772d26ce911b.camel@infradead.org/

Signed-off-by: David Woodhouse <dwmw@amazon.co.uk>
Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
include/linux/irqflags.h
kernel/cpu.c
kernel/locking/lockdep.c

index 3f003d5fde5341bd789d0d1286109563624090d3..57b074e0cfbbb3ac2a1384ead68c2f074e4c9a2a 100644 (file)
@@ -18,6 +18,8 @@
 #include <asm/irqflags.h>
 #include <asm/percpu.h>
 
+struct task_struct;
+
 /* Currently lockdep_softirqs_on/off is used only by lockdep */
 #ifdef CONFIG_PROVE_LOCKING
   extern void lockdep_softirqs_on(unsigned long ip);
   extern void lockdep_hardirqs_on_prepare(void);
   extern void lockdep_hardirqs_on(unsigned long ip);
   extern void lockdep_hardirqs_off(unsigned long ip);
+  extern void lockdep_cleanup_dead_cpu(unsigned int cpu,
+                                      struct task_struct *idle);
 #else
   static inline void lockdep_softirqs_on(unsigned long ip) { }
   static inline void lockdep_softirqs_off(unsigned long ip) { }
   static inline void lockdep_hardirqs_on_prepare(void) { }
   static inline void lockdep_hardirqs_on(unsigned long ip) { }
   static inline void lockdep_hardirqs_off(unsigned long ip) { }
+  static inline void lockdep_cleanup_dead_cpu(unsigned int cpu,
+                                             struct task_struct *idle) {}
 #endif
 
 #ifdef CONFIG_TRACE_IRQFLAGS
index d293d52a3e00e1e8d7c82bdf2f1365bb459f8bb6..c4aaf73dec9e606e031273a9f80583512c9c6a4c 100644 (file)
@@ -1338,6 +1338,7 @@ static int takedown_cpu(unsigned int cpu)
 
        cpuhp_bp_sync_dead(cpu);
 
+       lockdep_cleanup_dead_cpu(cpu, idle_thread_get(cpu));
        tick_cleanup_dead_cpu(cpu);
 
        /*
index 7963deac33c31e763bb51e8d2c02266e2c3cce21..42b07c3b8862d3d4cfaf86c43b129fcfae58aa93 100644 (file)
@@ -4583,6 +4583,30 @@ void lockdep_softirqs_off(unsigned long ip)
                debug_atomic_inc(redundant_softirqs_off);
 }
 
+/**
+ * lockdep_cleanup_dead_cpu - Ensure CPU lockdep state is cleanly stopped
+ *
+ * @cpu: index of offlined CPU
+ * @idle: task pointer for offlined CPU's idle thread
+ *
+ * Invoked after the CPU is dead. Ensures that the tracing infrastructure
+ * is left in a suitable state for the CPU to be subsequently brought
+ * online again.
+ */
+void lockdep_cleanup_dead_cpu(unsigned int cpu, struct task_struct *idle)
+{
+       if (unlikely(!debug_locks))
+               return;
+
+       if (unlikely(per_cpu(hardirqs_enabled, cpu))) {
+               pr_warn("CPU %u left hardirqs enabled!\n", cpu);
+               if (idle)
+                       print_irqtrace_events(idle);
+               /* Clean it up for when the CPU comes online again. */
+               per_cpu(hardirqs_enabled, cpu) = 0;
+       }
+}
+
 static int
 mark_usage(struct task_struct *curr, struct held_lock *hlock, int check)
 {