From b35108a51cf7bab58d7eace1267d7965978bcdb8 Mon Sep 17 00:00:00 2001 From: Easwar Hariharan Date: Wed, 30 Oct 2024 17:47:35 +0000 Subject: [PATCH 01/16] jiffies: Define secs_to_jiffies() secs_to_jiffies() is defined in hci_event.c and cannot be reused by other call sites. Hoist it into the core code to allow conversion of the ~1150 usages of msecs_to_jiffies() that either: - use a multiplier value of 1000 or equivalently MSEC_PER_SEC, or - have timeouts that are denominated in seconds (i.e. end in 000) It's implemented as a macro to allow usage in static initializers. This will also allow conversion of yet more sites that use (sec * HZ) directly, and improve their readability. Suggested-by: Michael Kelley Signed-off-by: Easwar Hariharan Signed-off-by: Thomas Gleixner Reviewed-by: Luiz Augusto von Dentz Link: https://lore.kernel.org/all/20241030-open-coded-timeouts-v3-1-9ba123facf88@linux.microsoft.com --- include/linux/jiffies.h | 13 +++++++++++++ net/bluetooth/hci_event.c | 2 -- 2 files changed, 13 insertions(+), 2 deletions(-) diff --git a/include/linux/jiffies.h b/include/linux/jiffies.h index 5d21dacd62bc..ed945f42e064 100644 --- a/include/linux/jiffies.h +++ b/include/linux/jiffies.h @@ -526,6 +526,19 @@ static __always_inline unsigned long msecs_to_jiffies(const unsigned int m) } } +/** + * secs_to_jiffies: - convert seconds to jiffies + * @_secs: time in seconds + * + * Conversion is done by simple multiplication with HZ + * + * secs_to_jiffies() is defined as a macro rather than a static inline + * function so it can be used in static initializers. + * + * Return: jiffies value + */ +#define secs_to_jiffies(_secs) ((_secs) * HZ) + extern unsigned long __usecs_to_jiffies(const unsigned int u); #if !(USEC_PER_SEC % HZ) static inline unsigned long _usecs_to_jiffies(const unsigned int u) diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c index 1c82dcdf6e8f..4bd94d432bcf 100644 --- a/net/bluetooth/hci_event.c +++ b/net/bluetooth/hci_event.c @@ -42,8 +42,6 @@ #define ZERO_KEY "\x00\x00\x00\x00\x00\x00\x00\x00" \ "\x00\x00\x00\x00\x00\x00\x00\x00" -#define secs_to_jiffies(_secs) msecs_to_jiffies((_secs) * 1000) - /* Handle HCI Event packets */ static void *hci_ev_skb_pull(struct hci_dev *hdev, struct sk_buff *skb, -- 2.51.0 From 17a8945f369ce2de2532ba8abdb93bb5b2d1c118 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Tue, 29 Oct 2024 13:54:42 +0100 Subject: [PATCH 02/16] clockevents: Improve clockevents_notify_released() comment When a new clockevent device is added and replaces a previous device, the latter is put into the released list. Then the released list is added back. This may look counter-intuitive but the reason is that released device might be suitable for other uses. For example a released CPU regular clockevent can be a better replacement for the current broadcast event. Similarly a released broadcast clockevent can be a better replacement for the current regular clockevent of a given CPU. Improve comments stating about these subtleties. 
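A minimal sketch of the call-site conversion the secs_to_jiffies() patch at the top of this series enables; the helper below, its variable names and the 5 second timeout are hypothetical, not taken from any converted driver:

	#include <linux/bug.h>
	#include <linux/jiffies.h>

	/* Usable in a static initializer because secs_to_jiffies() is a macro */
	static unsigned long example_poll_interval = secs_to_jiffies(5);

	static unsigned long example_timeout(void)
	{
		/* Old style: a 5 second timeout hidden behind a millisecond multiplier */
		unsigned long old_style = msecs_to_jiffies(5 * MSEC_PER_SEC);

		/* New style: the same jiffies value, with the intent visible at the call site */
		unsigned long new_style = secs_to_jiffies(5);

		WARN_ON(old_style != new_style);
		return new_style;
	}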
Signed-off-by: Frederic Weisbecker Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/all/20241029125451.54574-2-frederic@kernel.org --- kernel/time/clockevents.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c index 78c7bd64d0dd..4af27994db93 100644 --- a/kernel/time/clockevents.c +++ b/kernel/time/clockevents.c @@ -337,13 +337,21 @@ int clockevents_program_event(struct clock_event_device *dev, ktime_t expires, } /* - * Called after a notify add to make devices available which were - * released from the notifier call. + * Called after a clockevent has been added which might + * have replaced a current regular or broadcast device. A + * released normal device might be a suitable replacement + * for the current broadcast device. Similarly a released + * broadcast device might be a suitable replacement for a + * normal device. */ static void clockevents_notify_released(void) { struct clock_event_device *dev; + /* + * Keep iterating as long as tick_check_new_device() + * replaces a device. + */ while (!list_empty(&clockevents_released)) { dev = list_entry(clockevents_released.next, struct clock_event_device, list); -- 2.51.0 From 3b1596a21fbf210f5b763fd3c0be280650475b52 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Tue, 29 Oct 2024 13:54:43 +0100 Subject: [PATCH 03/16] clockevents: Shutdown and unregister current clockevents at CPUHP_AP_TICK_DYING The way the clockevent devices are finally stopped while a CPU is offlining is currently chaotic. The layout being by order: 1) tick_sched_timer_dying() stops the tick and the underlying clockevent but only for oneshot case. The periodic tick and its related clockevent still runs. 2) tick_broadcast_offline() detaches and stops the per-cpu oneshot broadcast and append it to the released list. 3) Some individual clockevent drivers stop the clockevents (a second time if the tick is oneshot) 4) Once the CPU is dead, a control CPU remotely detaches and stops (a 3rd time if oneshot mode) the CPU clockevent and adds it to the released list. 5) The released list containing the broadcast device released on step 2) and the remotely detached clockevent from step 4) are unregistered. These random events can be factorized if the current clockevent is detached and stopped by the dying CPU at the generic layer, that is from the dying CPU: a) Stop the tick b) Stop/detach the underlying per-cpu oneshot broadcast clockevent c) Stop/detach the underlying clockevent d) Release / unregister the clockevents from b) and c) e) Release / unregister the remaining clockevents from the dying CPU. This part could be performed by the dying CPU This way the drivers and the tick layer don't need to care about clockevent operations during cpuhotplug down. This also unifies the tick behaviour on offline CPUs between oneshot and periodic modes, avoiding offline ticks altogether for sanity. Adopt the simplification. 
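Condensed from the diff below, the whole sequence now runs under clockevents_lock on the dying CPU (a stripped-down sketch of the new tick_offline_cpu(), not the complete function):

	void tick_offline_cpu(unsigned int cpu)
	{
		raw_spin_lock(&clockevents_lock);
		tick_broadcast_offline(cpu);	/* b) stop/detach the per-CPU broadcast device */
		tick_shutdown(cpu);		/* c) stop/detach the per-CPU tick clockevent */
		/* d) + e) unregister the released and remaining per-CPU devices, see the diff */
		raw_spin_unlock(&clockevents_lock);
	}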
[ tglx: Remove the WARN_ON() in clockevents_register_device() as that is called from an upcoming CPU before the CPU is marked online ] Signed-off-by: Frederic Weisbecker Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/all/20241029125451.54574-3-frederic@kernel.org --- include/linux/tick.h | 2 -- kernel/cpu.c | 2 -- kernel/time/clockevents.c | 30 +++++++++++------------------- kernel/time/tick-internal.h | 3 +-- 4 files changed, 12 insertions(+), 25 deletions(-) diff --git a/include/linux/tick.h b/include/linux/tick.h index 72744638c5b0..b0c74bfe0600 100644 --- a/include/linux/tick.h +++ b/include/linux/tick.h @@ -20,12 +20,10 @@ extern void __init tick_init(void); extern void tick_suspend_local(void); /* Should be core only, but XEN resume magic and ARM BL switcher require it */ extern void tick_resume_local(void); -extern void tick_cleanup_dead_cpu(int cpu); #else /* CONFIG_GENERIC_CLOCKEVENTS */ static inline void tick_init(void) { } static inline void tick_suspend_local(void) { } static inline void tick_resume_local(void) { } -static inline void tick_cleanup_dead_cpu(int cpu) { } #endif /* !CONFIG_GENERIC_CLOCKEVENTS */ #if defined(CONFIG_GENERIC_CLOCKEVENTS) && defined(CONFIG_HOTPLUG_CPU) diff --git a/kernel/cpu.c b/kernel/cpu.c index d293d52a3e00..895f3287e3f3 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -1338,8 +1338,6 @@ static int takedown_cpu(unsigned int cpu) cpuhp_bp_sync_dead(cpu); - tick_cleanup_dead_cpu(cpu); - /* * Callbacks must be re-integrated right away to the RCU state machine. * Otherwise an RCU callback could block a further teardown function diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c index 4af27994db93..f3e831f62906 100644 --- a/kernel/time/clockevents.c +++ b/kernel/time/clockevents.c @@ -618,39 +618,30 @@ void clockevents_resume(void) #ifdef CONFIG_HOTPLUG_CPU -# ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST /** - * tick_offline_cpu - Take CPU out of the broadcast mechanism + * tick_offline_cpu - Shutdown all clock events related + * to this CPU and take it out of the + * broadcast mechanism. * @cpu: The outgoing CPU * - * Called on the outgoing CPU after it took itself offline. + * Called by the dying CPU during teardown. */ void tick_offline_cpu(unsigned int cpu) -{ - raw_spin_lock(&clockevents_lock); - tick_broadcast_offline(cpu); - raw_spin_unlock(&clockevents_lock); -} -# endif - -/** - * tick_cleanup_dead_cpu - Cleanup the tick and clockevents of a dead cpu - * @cpu: The dead CPU - */ -void tick_cleanup_dead_cpu(int cpu) { struct clock_event_device *dev, *tmp; - unsigned long flags; - raw_spin_lock_irqsave(&clockevents_lock, flags); + raw_spin_lock(&clockevents_lock); + tick_broadcast_offline(cpu); tick_shutdown(cpu); + /* * Unregister the clock event devices which were - * released from the users in the notify chain. + * released above. 
*/ list_for_each_entry_safe(dev, tmp, &clockevents_released, list) list_del(&dev->list); + /* * Now check whether the CPU has left unused per cpu devices */ @@ -662,7 +653,8 @@ void tick_cleanup_dead_cpu(int cpu) list_del(&dev->list); } } - raw_spin_unlock_irqrestore(&clockevents_lock, flags); + + raw_spin_unlock(&clockevents_lock); } #endif diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h index 5f2105e637bd..faac36de35b9 100644 --- a/kernel/time/tick-internal.h +++ b/kernel/time/tick-internal.h @@ -25,6 +25,7 @@ extern int tick_do_timer_cpu __read_mostly; extern void tick_setup_periodic(struct clock_event_device *dev, int broadcast); extern void tick_handle_periodic(struct clock_event_device *dev); extern void tick_check_new_device(struct clock_event_device *dev); +extern void tick_offline_cpu(unsigned int cpu); extern void tick_shutdown(unsigned int cpu); extern void tick_suspend(void); extern void tick_resume(void); @@ -142,10 +143,8 @@ static inline bool tick_broadcast_oneshot_available(void) { return tick_oneshot_ #endif /* !(BROADCAST && ONESHOT) */ #if defined(CONFIG_GENERIC_CLOCKEVENTS_BROADCAST) && defined(CONFIG_HOTPLUG_CPU) -extern void tick_offline_cpu(unsigned int cpu); extern void tick_broadcast_offline(unsigned int cpu); #else -static inline void tick_offline_cpu(unsigned int cpu) { } static inline void tick_broadcast_offline(unsigned int cpu) { } #endif -- 2.51.0 From a6347864d97506a021c469dad35875088edc03fc Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Tue, 29 Oct 2024 13:54:44 +0100 Subject: [PATCH 04/16] tick: Remove now unneeded low-res tick stop on CPUHP_AP_TICK_DYING The generic clockevent layer now detaches and stops the underlying clockevent from the dying CPU, unifying the tick behaviour for both periodic and oneshot mode on offline CPUs. There is no more need for the tick layer to care about that. Signed-off-by: Frederic Weisbecker Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/all/20241029125451.54574-4-frederic@kernel.org --- kernel/time/tick-sched.c | 25 ++++++------------------- 1 file changed, 6 insertions(+), 19 deletions(-) diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 753a184c7090..9f90c7333b1d 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -311,14 +311,6 @@ static enum hrtimer_restart tick_nohz_handler(struct hrtimer *timer) return HRTIMER_RESTART; } -static void tick_sched_timer_cancel(struct tick_sched *ts) -{ - if (tick_sched_flag_test(ts, TS_FLAG_HIGHRES)) - hrtimer_cancel(&ts->sched_timer); - else if (tick_sched_flag_test(ts, TS_FLAG_NOHZ)) - tick_program_event(KTIME_MAX, 1); -} - #ifdef CONFIG_NO_HZ_FULL cpumask_var_t tick_nohz_full_mask; EXPORT_SYMBOL_GPL(tick_nohz_full_mask); @@ -1055,7 +1047,10 @@ static void tick_nohz_stop_tick(struct tick_sched *ts, int cpu) * the tick timer. */ if (unlikely(expires == KTIME_MAX)) { - tick_sched_timer_cancel(ts); + if (tick_sched_flag_test(ts, TS_FLAG_HIGHRES)) + hrtimer_cancel(&ts->sched_timer); + else + tick_program_event(KTIME_MAX, 1); return; } @@ -1604,21 +1599,13 @@ void tick_setup_sched_timer(bool hrtimer) */ void tick_sched_timer_dying(int cpu) { - struct tick_device *td = &per_cpu(tick_cpu_device, cpu); struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); - struct clock_event_device *dev = td->evtdev; ktime_t idle_sleeptime, iowait_sleeptime; unsigned long idle_calls, idle_sleeps; /* This must happen before hrtimers are migrated! 
*/ - tick_sched_timer_cancel(ts); - - /* - * If the clockevents doesn't support CLOCK_EVT_STATE_ONESHOT_STOPPED, - * make sure not to call low-res tick handler. - */ - if (tick_sched_flag_test(ts, TS_FLAG_NOHZ)) - dev->event_handler = clockevents_handle_noop; + if (tick_sched_flag_test(ts, TS_FLAG_HIGHRES)) + hrtimer_cancel(&ts->sched_timer); idle_sleeptime = ts->idle_sleeptime; iowait_sleeptime = ts->iowait_sleeptime; -- 2.51.0 From 900053d9eedfc3f731e59a27d24da938907f5407 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Tue, 29 Oct 2024 13:54:45 +0100 Subject: [PATCH 05/16] ARM: smp_twd: Remove clockevents shutdown call on offlining The clockevents core already detached and unregistered it at this stage. Signed-off-by: Frederic Weisbecker Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/all/20241029125451.54574-5-frederic@kernel.org --- arch/arm/kernel/smp_twd.c | 1 - 1 file changed, 1 deletion(-) diff --git a/arch/arm/kernel/smp_twd.c b/arch/arm/kernel/smp_twd.c index 9a14f721a2b0..42a3706e16a6 100644 --- a/arch/arm/kernel/smp_twd.c +++ b/arch/arm/kernel/smp_twd.c @@ -93,7 +93,6 @@ static void twd_timer_stop(void) { struct clock_event_device *clk = raw_cpu_ptr(twd_evt); - twd_shutdown(clk); disable_percpu_irq(clk->irq); } -- 2.51.0 From 78b5c2ca5f27534dc04fbbe0b491dd3bd4ec814b Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Tue, 29 Oct 2024 13:54:46 +0100 Subject: [PATCH 06/16] clocksource/drivers/arm_arch_timer: Remove clockevents shutdown call on offlining The clockevents core already detached and unregistered it at this stage. Signed-off-by: Frederic Weisbecker Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/all/20241029125451.54574-6-frederic@kernel.org --- drivers/clocksource/arm_arch_timer.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/drivers/clocksource/arm_arch_timer.c b/drivers/clocksource/arm_arch_timer.c index 03733101e231..2bba81e25aa2 100644 --- a/drivers/clocksource/arm_arch_timer.c +++ b/drivers/clocksource/arm_arch_timer.c @@ -1179,8 +1179,6 @@ static void arch_timer_stop(struct clock_event_device *clk) disable_percpu_irq(arch_timer_ppi[arch_timer_uses_ppi]); if (arch_timer_has_nonsecure_ppi()) disable_percpu_irq(arch_timer_ppi[ARCH_TIMER_PHYS_NONSECURE_PPI]); - - clk->set_state_shutdown(clk); } static int arch_timer_dying_cpu(unsigned int cpu) -- 2.51.0 From 15b810e0496eba62ca5a70d1545d1e4757c0a1ee Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Tue, 29 Oct 2024 13:54:47 +0100 Subject: [PATCH 07/16] clocksource/drivers/arm_global_timer: Remove clockevents shutdown call on offlining The clockevents core already detached and unregistered it at this stage. 
Signed-off-by: Frederic Weisbecker Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/all/20241029125451.54574-7-frederic@kernel.org --- drivers/clocksource/arm_global_timer.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/clocksource/arm_global_timer.c b/drivers/clocksource/arm_global_timer.c index a05cfaab5f84..2d86bbc2764a 100644 --- a/drivers/clocksource/arm_global_timer.c +++ b/drivers/clocksource/arm_global_timer.c @@ -195,7 +195,6 @@ static int gt_dying_cpu(unsigned int cpu) { struct clock_event_device *clk = this_cpu_ptr(gt_evt); - gt_clockevent_shutdown(clk); disable_percpu_irq(clk->irq); return 0; } -- 2.51.0 From ba23b6c7f97428dc5dd1898edbae397f1a524b13 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Tue, 29 Oct 2024 13:54:48 +0100 Subject: [PATCH 08/16] clocksource/drivers/exynos_mct: Remove clockevents shutdown call on offlining The clockevents core already detached and unregistered it at this stage. Signed-off-by: Frederic Weisbecker Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/all/20241029125451.54574-8-frederic@kernel.org --- drivers/clocksource/exynos_mct.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/clocksource/exynos_mct.c b/drivers/clocksource/exynos_mct.c index ef8cb1b71be4..e6a02e351d77 100644 --- a/drivers/clocksource/exynos_mct.c +++ b/drivers/clocksource/exynos_mct.c @@ -496,7 +496,6 @@ static int exynos4_mct_dying_cpu(unsigned int cpu) per_cpu_ptr(&percpu_mct_tick, cpu); struct clock_event_device *evt = &mevt->evt; - evt->set_state_shutdown(evt); if (mct_int_type == MCT_INT_SPI) { if (evt->irq != -1) disable_irq_nosync(evt->irq); -- 2.51.0 From 30f8c70a85bcb756b9247c27fff5f0fabf6d5c6e Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Tue, 29 Oct 2024 13:54:49 +0100 Subject: [PATCH 09/16] clocksource/drivers/armada-370-xp: Remove clockevents shutdown call on offlining The clockevents core already detached and unregistered it at this stage. Signed-off-by: Frederic Weisbecker Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/all/20241029125451.54574-9-frederic@kernel.org --- drivers/clocksource/timer-armada-370-xp.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/clocksource/timer-armada-370-xp.c b/drivers/clocksource/timer-armada-370-xp.c index 6ec565d6939a..54284c1c0651 100644 --- a/drivers/clocksource/timer-armada-370-xp.c +++ b/drivers/clocksource/timer-armada-370-xp.c @@ -201,7 +201,6 @@ static int armada_370_xp_timer_dying_cpu(unsigned int cpu) { struct clock_event_device *evt = per_cpu_ptr(armada_370_xp_evt, cpu); - evt->set_state_shutdown(evt); disable_percpu_irq(evt->irq); return 0; } -- 2.51.0 From cd165ce8314f8b91b171c1f0d4cf144c0f88f757 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Tue, 29 Oct 2024 13:54:50 +0100 Subject: [PATCH 10/16] clocksource/drivers/qcom: Remove clockevents shutdown call on offlining The clockevents core already detached and unregistered it at this stage. 
Signed-off-by: Frederic Weisbecker Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/all/20241029125451.54574-10-frederic@kernel.org --- drivers/clocksource/timer-qcom.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/clocksource/timer-qcom.c b/drivers/clocksource/timer-qcom.c index eac4c95c6127..ddb1debe6a6b 100644 --- a/drivers/clocksource/timer-qcom.c +++ b/drivers/clocksource/timer-qcom.c @@ -130,7 +130,6 @@ static int msm_local_timer_dying_cpu(unsigned int cpu) { struct clock_event_device *evt = per_cpu_ptr(msm_evt, cpu); - evt->set_state_shutdown(evt); disable_percpu_irq(evt->irq); return 0; } -- 2.51.0 From bf9a001fb8e46a23c43d4964523963e717d9e972 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Tue, 29 Oct 2024 13:54:51 +0100 Subject: [PATCH 11/16] clocksource/drivers/timer-tegra: Remove clockevents shutdown call on offlining The clockevents core already detached and unregistered it at this stage. Signed-off-by: Frederic Weisbecker Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/all/20241029125451.54574-11-frederic@kernel.org --- drivers/clocksource/timer-tegra.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/clocksource/timer-tegra.c b/drivers/clocksource/timer-tegra.c index e9635c25eef4..35b6ce9deffa 100644 --- a/drivers/clocksource/timer-tegra.c +++ b/drivers/clocksource/timer-tegra.c @@ -158,7 +158,6 @@ static int tegra_timer_stop(unsigned int cpu) { struct timer_of *to = per_cpu_ptr(&tegra_to, cpu); - to->clkevt.set_state_shutdown(&to->clkevt); disable_irq_nosync(to->clkevt.irq); return 0; -- 2.51.0 From 1d4199cbbe95efaba51304cfd844bd0ccd224e61 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 30 Oct 2024 08:53:51 +0100 Subject: [PATCH 12/16] timers: Add missing READ_ONCE() in __run_timer_base() __run_timer_base() checks base::next_expiry without holding base::lock. That can race with a remote CPU updating next_expiry under the lock. This is an intentional and harmless data race, but lacks a READ_ONCE(), so KCSAN complains about this. Add the missing READ_ONCE(). All other places are covered already. Fixes: 79f8b28e85f8 ("timers: Annotate possible non critical data race of next_expiry") Reported-by: kernel test robot Signed-off-by: Thomas Gleixner Reviewed-by: Frederic Weisbecker Link: https://lore.kernel.org/all/87a5emyqk0.ffs@tglx Closes: https://lore.kernel.org/oe-lkp/202410301205.ef8e9743-lkp@intel.com --- kernel/time/timer.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/kernel/time/timer.c b/kernel/time/timer.c index 02355b275bab..a283e524835d 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -2421,7 +2421,8 @@ static inline void __run_timers(struct timer_base *base) static void __run_timer_base(struct timer_base *base) { - if (time_before(jiffies, base->next_expiry)) + /* Can race against a remote CPU updating next_expiry under the lock */ + if (time_before(jiffies, READ_ONCE(base->next_expiry))) return; timer_base_lock_expiry(base); -- 2.51.0 From d44d26987bb3df6d76556827097fc9ce17565cb8 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 31 Oct 2024 13:04:07 +0100 Subject: [PATCH 13/16] timekeeping: Remove CONFIG_DEBUG_TIMEKEEPING Since 135225a363ae timekeeping_cycles_to_ns() handles large offsets which would lead to 64bit multiplication overflows correctly. It's also protected against negative motion of the clocksource unconditionally, which was exclusive to x86 before. timekeeping_advance() handles large offsets already correctly. 
That means the value of CONFIG_DEBUG_TIMEKEEPING which analyzed these cases is very close to zero. Remove all of it. Signed-off-by: Thomas Gleixner Acked-by: John Stultz Link: https://lore.kernel.org/all/20241031120328.536010148@linutronix.de --- arch/riscv/configs/defconfig | 1 - include/linux/timekeeper_internal.h | 16 --- kernel/time/timekeeping.c | 108 +----------------- lib/Kconfig.debug | 13 --- .../selftests/wireguard/qemu/debug.config | 1 - 5 files changed, 3 insertions(+), 136 deletions(-) diff --git a/arch/riscv/configs/defconfig b/arch/riscv/configs/defconfig index 2341393cfac1..26c01b9e3434 100644 --- a/arch/riscv/configs/defconfig +++ b/arch/riscv/configs/defconfig @@ -301,7 +301,6 @@ CONFIG_DEBUG_MEMORY_INIT=y CONFIG_DEBUG_PER_CPU_MAPS=y CONFIG_SOFTLOCKUP_DETECTOR=y CONFIG_WQ_WATCHDOG=y -CONFIG_DEBUG_TIMEKEEPING=y CONFIG_DEBUG_RT_MUTEXES=y CONFIG_DEBUG_SPINLOCK=y CONFIG_DEBUG_MUTEXES=y diff --git a/include/linux/timekeeper_internal.h b/include/linux/timekeeper_internal.h index a3b6380a7777..e39d4d563b19 100644 --- a/include/linux/timekeeper_internal.h +++ b/include/linux/timekeeper_internal.h @@ -76,9 +76,6 @@ struct tk_read_base { * ntp shifted nano seconds. * @ntp_err_mult: Multiplication factor for scaled math conversion * @skip_second_overflow: Flag used to avoid updating NTP twice with same second - * @last_warning: Warning ratelimiter (DEBUG_TIMEKEEPING) - * @underflow_seen: Underflow warning flag (DEBUG_TIMEKEEPING) - * @overflow_seen: Overflow warning flag (DEBUG_TIMEKEEPING) * * Note: For timespec(64) based interfaces wall_to_monotonic is what * we need to add to xtime (or xtime corrected for sub jiffy times) @@ -147,19 +144,6 @@ struct timekeeper { u32 ntp_error_shift; u32 ntp_err_mult; u32 skip_second_overflow; - -#ifdef CONFIG_DEBUG_TIMEKEEPING - long last_warning; - /* - * These simple flag variables are managed - * without locks, which is racy, but they are - * ok since we don't really care about being - * super precise about how many events were - * seen, just that a problem was observed. 
- */ - int underflow_seen; - int overflow_seen; -#endif }; #ifdef CONFIG_GENERIC_TIME_VSYSCALL diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 17cae886ca82..d115adebc418 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -226,97 +226,6 @@ static inline u64 tk_clock_read(const struct tk_read_base *tkr) return clock->read(clock); } -#ifdef CONFIG_DEBUG_TIMEKEEPING -#define WARNING_FREQ (HZ*300) /* 5 minute rate-limiting */ - -static void timekeeping_check_update(struct timekeeper *tk, u64 offset) -{ - - u64 max_cycles = tk->tkr_mono.clock->max_cycles; - const char *name = tk->tkr_mono.clock->name; - - if (offset > max_cycles) { - printk_deferred("WARNING: timekeeping: Cycle offset (%lld) is larger than allowed by the '%s' clock's max_cycles value (%lld): time overflow danger\n", - offset, name, max_cycles); - printk_deferred(" timekeeping: Your kernel is sick, but tries to cope by capping time updates\n"); - } else { - if (offset > (max_cycles >> 1)) { - printk_deferred("INFO: timekeeping: Cycle offset (%lld) is larger than the '%s' clock's 50%% safety margin (%lld)\n", - offset, name, max_cycles >> 1); - printk_deferred(" timekeeping: Your kernel is still fine, but is feeling a bit nervous\n"); - } - } - - if (tk->underflow_seen) { - if (jiffies - tk->last_warning > WARNING_FREQ) { - printk_deferred("WARNING: Underflow in clocksource '%s' observed, time update ignored.\n", name); - printk_deferred(" Please report this, consider using a different clocksource, if possible.\n"); - printk_deferred(" Your kernel is probably still fine.\n"); - tk->last_warning = jiffies; - } - tk->underflow_seen = 0; - } - - if (tk->overflow_seen) { - if (jiffies - tk->last_warning > WARNING_FREQ) { - printk_deferred("WARNING: Overflow in clocksource '%s' observed, time update capped.\n", name); - printk_deferred(" Please report this, consider using a different clocksource, if possible.\n"); - printk_deferred(" Your kernel is probably still fine.\n"); - tk->last_warning = jiffies; - } - tk->overflow_seen = 0; - } -} - -static inline u64 timekeeping_cycles_to_ns(const struct tk_read_base *tkr, u64 cycles); - -static inline u64 timekeeping_debug_get_ns(const struct tk_read_base *tkr) -{ - struct timekeeper *tk = &tk_core.timekeeper; - u64 now, last, mask, max, delta; - unsigned int seq; - - /* - * Since we're called holding a seqcount, the data may shift - * under us while we're doing the calculation. This can cause - * false positives, since we'd note a problem but throw the - * results away. So nest another seqcount here to atomically - * grab the points we are checking with. - */ - do { - seq = read_seqcount_begin(&tk_core.seq); - now = tk_clock_read(tkr); - last = tkr->cycle_last; - mask = tkr->mask; - max = tkr->clock->max_cycles; - } while (read_seqcount_retry(&tk_core.seq, seq)); - - delta = clocksource_delta(now, last, mask); - - /* - * Try to catch underflows by checking if we are seeing small - * mask-relative negative values. 
- */ - if (unlikely((~delta & mask) < (mask >> 3))) - tk->underflow_seen = 1; - - /* Check for multiplication overflows */ - if (unlikely(delta > max)) - tk->overflow_seen = 1; - - /* timekeeping_cycles_to_ns() handles both under and overflow */ - return timekeeping_cycles_to_ns(tkr, now); -} -#else -static inline void timekeeping_check_update(struct timekeeper *tk, u64 offset) -{ -} -static inline u64 timekeeping_debug_get_ns(const struct tk_read_base *tkr) -{ - BUG(); -} -#endif - /** * tk_setup_internals - Set up internals to use clocksource clock. * @@ -421,19 +330,11 @@ static inline u64 timekeeping_cycles_to_ns(const struct tk_read_base *tkr, u64 c return ((delta * tkr->mult) + tkr->xtime_nsec) >> tkr->shift; } -static __always_inline u64 __timekeeping_get_ns(const struct tk_read_base *tkr) +static __always_inline u64 timekeeping_get_ns(const struct tk_read_base *tkr) { return timekeeping_cycles_to_ns(tkr, tk_clock_read(tkr)); } -static inline u64 timekeeping_get_ns(const struct tk_read_base *tkr) -{ - if (IS_ENABLED(CONFIG_DEBUG_TIMEKEEPING)) - return timekeeping_debug_get_ns(tkr); - - return __timekeeping_get_ns(tkr); -} - /** * update_fast_timekeeper - Update the fast and NMI safe monotonic timekeeper. * @tkr: Timekeeping readout base from which we take the update @@ -477,7 +378,7 @@ static __always_inline u64 __ktime_get_fast_ns(struct tk_fast *tkf) seq = raw_read_seqcount_latch(&tkf->seq); tkr = tkf->base + (seq & 0x01); now = ktime_to_ns(tkr->base); - now += __timekeeping_get_ns(tkr); + now += timekeeping_get_ns(tkr); } while (raw_read_seqcount_latch_retry(&tkf->seq, seq)); return now; @@ -593,7 +494,7 @@ static __always_inline u64 __ktime_get_real_fast(struct tk_fast *tkf, u64 *mono) tkr = tkf->base + (seq & 0x01); basem = ktime_to_ns(tkr->base); baser = ktime_to_ns(tkr->base_real); - delta = __timekeeping_get_ns(tkr); + delta = timekeeping_get_ns(tkr); } while (raw_read_seqcount_latch_retry(&tkf->seq, seq)); if (mono) @@ -2333,9 +2234,6 @@ static bool timekeeping_advance(enum timekeeping_adv_mode mode) if (offset < real_tk->cycle_interval && mode == TK_ADV_TICK) return false; - /* Do some additional sanity checking */ - timekeeping_check_update(tk, offset); - /* * With NO_HZ we may have to accumulate many cycle_intervals * (think "ticks") worth of time at once. To do this efficiently, diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index 7315f643817a..14977b9fc254 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -1328,19 +1328,6 @@ config SCHEDSTATS endmenu -config DEBUG_TIMEKEEPING - bool "Enable extra timekeeping sanity checking" - help - This option will enable additional timekeeping sanity checks - which may be helpful when diagnosing issues where timekeeping - problems are suspected. - - This may include checks in the timekeeping hotpaths, so this - option may have a (very small) performance impact to some - workloads. - - If unsure, say N. 
- config DEBUG_PREEMPT bool "Debug preemptible kernel" depends on DEBUG_KERNEL && PREEMPTION && TRACE_IRQFLAGS_SUPPORT diff --git a/tools/testing/selftests/wireguard/qemu/debug.config b/tools/testing/selftests/wireguard/qemu/debug.config index 9d172210e2c6..139fd9aa8b12 100644 --- a/tools/testing/selftests/wireguard/qemu/debug.config +++ b/tools/testing/selftests/wireguard/qemu/debug.config @@ -31,7 +31,6 @@ CONFIG_SCHED_DEBUG=y CONFIG_SCHED_INFO=y CONFIG_SCHEDSTATS=y CONFIG_SCHED_STACK_END_CHECK=y -CONFIG_DEBUG_TIMEKEEPING=y CONFIG_DEBUG_PREEMPT=y CONFIG_DEBUG_RT_MUTEXES=y CONFIG_DEBUG_SPINLOCK=y -- 2.51.0 From c163e40af9b2331b2c629fd4ec8b703ed4d4ae39 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 31 Oct 2024 13:04:08 +0100 Subject: [PATCH 14/16] timekeeping: Always check for negative motion clocksource_delta() has two variants. One with a check for negative motion, which is only selected by x86. This is a historic leftover as this function was previously used in the time getter hot paths. Since 135225a363ae timekeeping_cycles_to_ns() has unconditional protection against this as a by-product of the protection against 64bit math overflow. clocksource_delta() is only used in the clocksource watchdog and in timekeeping_advance(). The extra conditional there is not hurting anyone. Remove the config option and unconditionally prevent negative motion of the readout. Signed-off-by: Thomas Gleixner Acked-by: John Stultz Link: https://lore.kernel.org/all/20241031120328.599430157@linutronix.de --- arch/x86/Kconfig | 1 - kernel/time/Kconfig | 5 ----- kernel/time/timekeeping_internal.h | 7 ------- 3 files changed, 13 deletions(-) diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 2852fcd82cbd..53a5eda8219c 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -145,7 +145,6 @@ config X86 select ARCH_HAS_PARANOID_L1D_FLUSH select BUILDTIME_TABLE_SORT select CLKEVT_I8253 - select CLOCKSOURCE_VALIDATE_LAST_CYCLE select CLOCKSOURCE_WATCHDOG # Word-size accesses may read uninitialized data past the trailing \0 # in strings and cause false KMSAN reports. diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig index 8ebb6d5a106b..b0b97a60aaa6 100644 --- a/kernel/time/Kconfig +++ b/kernel/time/Kconfig @@ -17,11 +17,6 @@ config ARCH_CLOCKSOURCE_DATA config ARCH_CLOCKSOURCE_INIT bool -# Clocksources require validation of the clocksource against the last -# cycle update - x86/TSC misfeature -config CLOCKSOURCE_VALIDATE_LAST_CYCLE - bool - # Timekeeping vsyscall support config GENERIC_TIME_VSYSCALL bool diff --git a/kernel/time/timekeeping_internal.h b/kernel/time/timekeeping_internal.h index b3dca834f48c..63e600e943a7 100644 --- a/kernel/time/timekeeping_internal.h +++ b/kernel/time/timekeeping_internal.h @@ -30,7 +30,6 @@ static inline void timekeeping_inc_mg_floor_swaps(void) #endif -#ifdef CONFIG_CLOCKSOURCE_VALIDATE_LAST_CYCLE static inline u64 clocksource_delta(u64 now, u64 last, u64 mask) { u64 ret = (now - last) & mask; @@ -41,12 +40,6 @@ static inline u64 clocksource_delta(u64 now, u64 last, u64 mask) */ return ret & ~(mask >> 1) ? 0 : ret; } -#else -static inline u64 clocksource_delta(u64 now, u64 last, u64 mask) -{ - return (now - last) & mask; -} -#endif /* Semi public for serialization of non timekeeper VDSO updates. 
*/ unsigned long timekeeper_lock_irqsave(void); -- 2.51.0 From 15cbfb92efee5c7f09e531a331e19759dbe0ac3c Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 5 Nov 2024 09:14:29 +0100 Subject: [PATCH 15/16] posix-cpu-timers: Correctly update timer status in posix_cpu_timer_del() If posix_cpu_timer_del() exits early due to task not found or sighand invalid, it fails to clear the state of the timer. That's harmless but inconsistent. These early exits are accounted as successful delete. Move the update of the timer state into the success return path, so all "successful" deletions are handled. Reported-by: Frederic Weisbecker Signed-off-by: Thomas Gleixner Reviewed-by: Frederic Weisbecker Link: https://lore.kernel.org/all/20241105064212.974053438@linutronix.de --- kernel/time/posix-cpu-timers.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c index 12f828d704b1..5f444e372464 100644 --- a/kernel/time/posix-cpu-timers.c +++ b/kernel/time/posix-cpu-timers.c @@ -493,20 +493,20 @@ static int posix_cpu_timer_del(struct k_itimer *timer) */ WARN_ON_ONCE(ctmr->head || timerqueue_node_queued(&ctmr->node)); } else { - if (timer->it.cpu.firing) { + if (timer->it.cpu.firing) ret = TIMER_RETRY; - } else { + else disarm_timer(timer, p); - timer->it_status = POSIX_TIMER_DISARMED; - } unlock_task_sighand(p, &flags); } out: rcu_read_unlock(); - if (!ret) - put_pid(ctmr->pid); + if (!ret) { + put_pid(ctmr->pid); + timer->it_status = POSIX_TIMER_DISARMED; + } return ret; } -- 2.51.0 From 513793bc6ab331b947111e8efaf8fcef33fb83e5 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 5 Nov 2024 09:14:31 +0100 Subject: [PATCH 16/16] posix-timers: Make signal delivery consistent Signals of timers which are reprogammed, disarmed or deleted can deliver signals related to the past. The POSIX spec is blury about this: - "The effect of disarming or resetting a timer with pending expiration notifications is unspecified." - "The disposition of pending signals for the deleted timer is unspecified." In both cases it is reasonable to expect that pending signals are discarded. Especially in the reprogramming case it does not make sense to account for previous overruns or to deliver a signal for a timer which has been disarmed. This makes the behaviour consistent and understandable. Remove the si_sys_private check from the signal delivery code and invoke posix_timer_deliver_signal() unconditionally for posix timer related signals. Change posix_timer_deliver_signal() so it controls the actual signal delivery via the return value. It now instructs the signal code to drop the signal when: 1) The timer does not longer exist in the hash table 2) The timer signal_seq value is not the same as the si_sys_private value which was set when the signal was queued. This is also a preparatory change to embed the sigqueue into the k_itimer structure, which in turn allows to remove the si_sys_private magic. 
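A rough userspace-style model of that sequence check (simplified names, not the kernel implementation): the timer's signal_seq is bumped whenever the timer is set or deleted, each queued signal carries a snapshot of it, and delivery drops the signal if the snapshot is stale.

	#include <stdbool.h>
	#include <stdio.h>

	struct example_timer {
		unsigned int signal_seq;	/* bumped by timer_set()/timer_delete() */
	};

	struct example_signal {
		unsigned int queued_seq;	/* snapshot taken when the signal was queued */
	};

	static bool should_deliver(const struct example_timer *t, const struct example_signal *s)
	{
		/* A reprogrammed or deleted timer advanced signal_seq, making the stamp stale */
		return s->queued_seq == t->signal_seq;
	}

	int main(void)
	{
		struct example_timer t = { .signal_seq = 1 };
		struct example_signal s = { .queued_seq = t.signal_seq };	/* queued now */

		t.signal_seq++;		/* timer reprogrammed or deleted before delivery */

		printf("%s\n", should_deliver(&t, &s) ? "deliver" : "drop");	/* prints "drop" */
		return 0;
	}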
Signed-off-by: Thomas Gleixner Reviewed-by: Frederic Weisbecker Link: https://lore.kernel.org/all/20241105064213.040348644@linutronix.de --- include/linux/posix-timers.h | 2 -- kernel/signal.c | 6 ++---- kernel/time/posix-cpu-timers.c | 2 +- kernel/time/posix-timers.c | 28 ++++++++++++++++------------ 4 files changed, 19 insertions(+), 19 deletions(-) diff --git a/include/linux/posix-timers.h b/include/linux/posix-timers.h index 02afbb4da7f7..8c6d97412526 100644 --- a/include/linux/posix-timers.h +++ b/include/linux/posix-timers.h @@ -137,8 +137,6 @@ static inline void clear_posix_cputimers_work(struct task_struct *p) { } static inline void posix_cputimers_init_work(void) { } #endif -#define REQUEUE_PENDING 1 - /** * struct k_itimer - POSIX.1b interval timer structure. * @list: List head for binding the timer to signals->posix_timers diff --git a/kernel/signal.c b/kernel/signal.c index df34aa47181e..68e6bc70ccf2 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -550,10 +550,8 @@ still_pending: list_del_init(&first->list); copy_siginfo(info, &first->info); - *resched_timer = - (first->flags & SIGQUEUE_PREALLOC) && - (info->si_code == SI_TIMER) && - (info->si_sys_private); + *resched_timer = (first->flags & SIGQUEUE_PREALLOC) && + (info->si_code == SI_TIMER); __sigqueue_free(first); } else { diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c index 5f444e372464..4305c003c8d4 100644 --- a/kernel/time/posix-cpu-timers.c +++ b/kernel/time/posix-cpu-timers.c @@ -746,7 +746,7 @@ static void __posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec64 *i * - Timers which expired, but the signal has not yet been * delivered */ - if (iv && ((timer->it_signal_seq & REQUEUE_PENDING) || sigev_none)) + if (iv && timer->it_status != POSIX_TIMER_ARMED) expires = bump_cpu_timer(timer, now); else expires = cpu_timer_getexpires(&timer->it.cpu); diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c index dd72b8e72697..b380e25d4947 100644 --- a/kernel/time/posix-timers.c +++ b/kernel/time/posix-timers.c @@ -269,7 +269,10 @@ bool posixtimer_deliver_signal(struct kernel_siginfo *info) if (!timr) goto out; - if (timr->it_interval && timr->it_signal_seq == info->si_sys_private) { + if (timr->it_signal_seq != info->si_sys_private) + goto out_unlock; + + if (timr->it_interval && !WARN_ON_ONCE(timr->it_status != POSIX_TIMER_REQUEUE_PENDING)) { timr->kclock->timer_rearm(timr); timr->it_status = POSIX_TIMER_ARMED; @@ -281,6 +284,7 @@ bool posixtimer_deliver_signal(struct kernel_siginfo *info) } ret = true; +out_unlock: unlock_timer(timr, flags); out: spin_lock(¤t->sighand->siglock); @@ -293,19 +297,18 @@ out: int posix_timer_queue_signal(struct k_itimer *timr) { enum posix_timer_state state = POSIX_TIMER_DISARMED; - int ret, si_private = 0; enum pid_type type; + int ret; lockdep_assert_held(&timr->it_lock); - if (timr->it_interval) { + if (timr->it_interval) state = POSIX_TIMER_REQUEUE_PENDING; - si_private = ++timr->it_signal_seq; - } + timr->it_status = state; type = !(timr->it_sigev_notify & SIGEV_THREAD_ID) ? PIDTYPE_TGID : PIDTYPE_PID; - ret = send_sigqueue(timr->sigq, timr->it_pid, type, si_private); + ret = send_sigqueue(timr->sigq, timr->it_pid, type, timr->it_signal_seq); /* If we failed to send the signal the timer stops. */ return ret > 0; } @@ -663,7 +666,7 @@ void common_timer_get(struct k_itimer *timr, struct itimerspec64 *cur_setting) * is a SIGEV_NONE timer move the expiry time forward by intervals, * so expiry is > now. 
*/ - if (iv && (timr->it_signal_seq & REQUEUE_PENDING || sig_none)) + if (iv && timr->it_status != POSIX_TIMER_ARMED) timr->it_overrun += kc->timer_forward(timr, now); remaining = kc->timer_remaining(timr, now); @@ -863,8 +866,6 @@ void posix_timer_set_common(struct k_itimer *timer, struct itimerspec64 *new_set else timer->it_interval = 0; - /* Prevent reloading in case there is a signal pending */ - timer->it_signal_seq = (timer->it_signal_seq + 2) & ~REQUEUE_PENDING; /* Reset overrun accounting */ timer->it_overrun_last = 0; timer->it_overrun = -1LL; @@ -882,8 +883,6 @@ int common_timer_set(struct k_itimer *timr, int flags, if (old_setting) common_timer_get(timr, old_setting); - /* Prevent rearming by clearing the interval */ - timr->it_interval = 0; /* * Careful here. On SMP systems the timer expiry function could be * active and spinning on timr->it_lock. @@ -933,6 +932,9 @@ retry: if (old_spec64) old_spec64->it_interval = ktime_to_timespec64(timr->it_interval); + /* Prevent signal delivery and rearming. */ + timr->it_signal_seq++; + kc = timr->kclock; if (WARN_ON_ONCE(!kc || !kc->timer_set)) error = -EINVAL; @@ -1001,7 +1003,6 @@ int common_timer_del(struct k_itimer *timer) { const struct k_clock *kc = timer->kclock; - timer->it_interval = 0; if (kc->timer_try_to_cancel(timer) < 0) return TIMER_RETRY; timer->it_status = POSIX_TIMER_DISARMED; @@ -1012,6 +1013,9 @@ static inline int timer_delete_hook(struct k_itimer *timer) { const struct k_clock *kc = timer->kclock; + /* Prevent signal delivery and rearming. */ + timer->it_signal_seq++; + if (WARN_ON_ONCE(!kc || !kc->timer_del)) return -EINVAL; return kc->timer_del(timer); -- 2.51.0
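Looping back to the "Always check for negative motion" patch earlier in this series, a standalone illustration of the masked-delta guard that clocksource_delta() now applies unconditionally; the 32-bit mask and the counter values are made up for the demonstration:

	#include <stdint.h>
	#include <stdio.h>

	static uint64_t clocksource_delta(uint64_t now, uint64_t last, uint64_t mask)
	{
		uint64_t ret = (now - last) & mask;

		/* A delta in the upper half of the mask means the counter moved backwards */
		return ret & ~(mask >> 1) ? 0 : ret;
	}

	int main(void)
	{
		uint64_t mask = 0xffffffffULL;	/* hypothetical 32-bit free-running counter */

		/* Forward motion: plain masked difference */
		printf("forward:  %llu\n", (unsigned long long)clocksource_delta(150, 100, mask));

		/* Backward motion: clamped to 0 instead of a huge bogus delta */
		printf("backward: %llu\n", (unsigned long long)clocksource_delta(90, 100, mask));
		return 0;
	}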