From db45e9bce8df2396740c0c03906ad6ed63948a8b Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 11 Sep 2024 15:17:54 +0200 Subject: [PATCH 01/16] ntp: Move pps_fbase into ntp_data Continue the conversion from static variables to struct based data. No functional change. Signed-off-by: Thomas Gleixner Signed-off-by: Anna-Maria Behnsen Signed-off-by: Thomas Gleixner Acked-by: John Stultz Link: https://lore.kernel.org/all/20240911-devel-anna-maria-b4-timers-ptp-ntp-v1-18-2d52f4e13476@linutronix.de --- kernel/time/ntp.c | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index 576f86a6a4f1..4bde69c4841d 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c @@ -43,6 +43,7 @@ * @pps_valid: PPS signal watchdog counter * @pps_tf: PPS phase median filter * @pps_jitter: PPS current jitter in nanoseconds + * @pps_fbase: PPS beginning of the last freq interval * * Protected by the timekeeping locks. */ @@ -65,6 +66,7 @@ struct ntp_data { int pps_valid; long pps_tf[3]; long pps_jitter; + struct timespec64 pps_fbase; #endif }; @@ -100,7 +102,6 @@ static struct ntp_data tk_ntp_data = { intervals to decrease it */ #define PPS_MAXWANDER 100000 /* max PPS freq wander (ns/s) */ -static struct timespec64 pps_fbase; /* beginning of the last freq interval */ static int pps_shift; /* current interval duration (s) (shift) */ static int pps_intcnt; /* interval counter */ static s64 pps_freq; /* frequency offset (scaled ns/s) */ @@ -144,7 +145,7 @@ static inline void pps_clear(struct ntp_data *ntpdata) ntpdata->pps_tf[0] = 0; ntpdata->pps_tf[1] = 0; ntpdata->pps_tf[2] = 0; - pps_fbase.tv_sec = pps_fbase.tv_nsec = 0; + ntpdata->pps_fbase.tv_sec = ntpdata->pps_fbase.tv_nsec = 0; pps_freq = 0; } @@ -1045,13 +1046,13 @@ void __hardpps(const struct timespec64 *phase_ts, const struct timespec64 *raw_t * When called for the first time, just start the frequency * interval */ - if (unlikely(pps_fbase.tv_sec == 0)) { - pps_fbase = *raw_ts; + if (unlikely(ntpdata->pps_fbase.tv_sec == 0)) { + ntpdata->pps_fbase = *raw_ts; return; } /* Ok, now we have a base for frequency calculation */ - freq_norm = pps_normalize_ts(timespec64_sub(*raw_ts, pps_fbase)); + freq_norm = pps_normalize_ts(timespec64_sub(*raw_ts, ntpdata->pps_fbase)); /* * Check that the signal is in the range @@ -1061,7 +1062,7 @@ void __hardpps(const struct timespec64 *phase_ts, const struct timespec64 *raw_t (freq_norm.nsec < -MAXFREQ * freq_norm.sec)) { ntpdata->time_status |= STA_PPSJITTER; /* Restart the frequency calibration interval */ - pps_fbase = *raw_ts; + ntpdata->pps_fbase = *raw_ts; printk_deferred(KERN_ERR "hardpps: PPSJITTER: bad pulse\n"); return; } @@ -1070,7 +1071,7 @@ void __hardpps(const struct timespec64 *phase_ts, const struct timespec64 *raw_t if (freq_norm.sec >= (1 << pps_shift)) { pps_calcnt++; /* Restart the frequency calibration interval */ - pps_fbase = *raw_ts; + ntpdata->pps_fbase = *raw_ts; hardpps_update_freq(ntpdata, freq_norm); } -- 2.51.0 From b1c89a762f753bedd5a62be4a5a586281be6f3c3 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 11 Sep 2024 15:17:55 +0200 Subject: [PATCH 02/16] ntp: Move pps_shift/intcnt into ntp_data Continue the conversion from static variables to struct based data. No functional change. 
Signed-off-by: Thomas Gleixner Signed-off-by: Anna-Maria Behnsen Signed-off-by: Thomas Gleixner Acked-by: John Stultz Link: https://lore.kernel.org/all/20240911-devel-anna-maria-b4-timers-ptp-ntp-v1-19-2d52f4e13476@linutronix.de --- kernel/time/ntp.c | 54 ++++++++++++++++++++++++----------------------- 1 file changed, 28 insertions(+), 26 deletions(-) diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index 4bde69c4841d..bebff6c69c18 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c @@ -44,6 +44,8 @@ * @pps_tf: PPS phase median filter * @pps_jitter: PPS current jitter in nanoseconds * @pps_fbase: PPS beginning of the last freq interval + * @pps_shift: PPS current interval duration in seconds (shift value) + * @pps_intcnt: PPS interval counter * * Protected by the timekeeping locks. */ @@ -67,6 +69,8 @@ struct ntp_data { long pps_tf[3]; long pps_jitter; struct timespec64 pps_fbase; + int pps_shift; + int pps_intcnt; #endif }; @@ -102,8 +106,6 @@ static struct ntp_data tk_ntp_data = { intervals to decrease it */ #define PPS_MAXWANDER 100000 /* max PPS freq wander (ns/s) */ -static int pps_shift; /* current interval duration (s) (shift) */ -static int pps_intcnt; /* interval counter */ static s64 pps_freq; /* frequency offset (scaled ns/s) */ static long pps_stabil; /* current stability (scaled ns/s) */ @@ -128,11 +130,11 @@ static inline s64 ntp_offset_chunk(struct ntp_data *ntpdata, s64 offset) return shift_right(offset, SHIFT_PLL + ntpdata->time_constant); } -static inline void pps_reset_freq_interval(void) +static inline void pps_reset_freq_interval(struct ntp_data *ntpdata) { /* The PPS calibration interval may end surprisingly early */ - pps_shift = PPS_INTMIN; - pps_intcnt = 0; + ntpdata->pps_shift = PPS_INTMIN; + ntpdata->pps_intcnt = 0; } /** @@ -141,7 +143,7 @@ static inline void pps_reset_freq_interval(void) */ static inline void pps_clear(struct ntp_data *ntpdata) { - pps_reset_freq_interval(); + pps_reset_freq_interval(ntpdata); ntpdata->pps_tf[0] = 0; ntpdata->pps_tf[1] = 0; ntpdata->pps_tf[2] = 0; @@ -199,7 +201,7 @@ static inline void pps_fill_timex(struct ntp_data *ntpdata, struct __kernel_time txc->jitter = ntpdata->pps_jitter; if (!(ntpdata->time_status & STA_NANO)) txc->jitter = ntpdata->pps_jitter / NSEC_PER_USEC; - txc->shift = pps_shift; + txc->shift = ntpdata->pps_shift; txc->stabil = pps_stabil; txc->jitcnt = pps_jitcnt; txc->calcnt = pps_calcnt; @@ -214,7 +216,7 @@ static inline s64 ntp_offset_chunk(struct ntp_data *ntpdata, s64 offset) return shift_right(offset, SHIFT_PLL + ntpdata->time_constant); } -static inline void pps_reset_freq_interval(void) {} +static inline void pps_reset_freq_interval(struct ntp_data *ntpdata) {} static inline void pps_clear(struct ntp_data *ntpdata) {} static inline void pps_dec_valid(struct ntp_data *ntpdata) {} static inline void pps_set_freq(s64 freq) {} @@ -693,7 +695,7 @@ static inline void process_adj_status(struct ntp_data *ntpdata, const struct __k ntpdata->time_status = STA_UNSYNC; ntpdata->ntp_next_leap_sec = TIME64_MAX; /* Restart PPS frequency calibration */ - pps_reset_freq_interval(); + pps_reset_freq_interval(ntpdata); } /* @@ -896,13 +898,13 @@ static inline void pps_phase_filter_add(struct ntp_data *ntpdata, long err) * Decrease frequency calibration interval length. It is halved after four * consecutive unstable intervals. 
*/ -static inline void pps_dec_freq_interval(void) +static inline void pps_dec_freq_interval(struct ntp_data *ntpdata) { - if (--pps_intcnt <= -PPS_INTCOUNT) { - pps_intcnt = -PPS_INTCOUNT; - if (pps_shift > PPS_INTMIN) { - pps_shift--; - pps_intcnt = 0; + if (--ntpdata->pps_intcnt <= -PPS_INTCOUNT) { + ntpdata->pps_intcnt = -PPS_INTCOUNT; + if (ntpdata->pps_shift > PPS_INTMIN) { + ntpdata->pps_shift--; + ntpdata->pps_intcnt = 0; } } } @@ -911,13 +913,13 @@ static inline void pps_dec_freq_interval(void) * Increase frequency calibration interval length. It is doubled after * four consecutive stable intervals. */ -static inline void pps_inc_freq_interval(void) +static inline void pps_inc_freq_interval(struct ntp_data *ntpdata) { - if (++pps_intcnt >= PPS_INTCOUNT) { - pps_intcnt = PPS_INTCOUNT; - if (pps_shift < PPS_INTMAX) { - pps_shift++; - pps_intcnt = 0; + if (++ntpdata->pps_intcnt >= PPS_INTCOUNT) { + ntpdata->pps_intcnt = PPS_INTCOUNT; + if (ntpdata->pps_shift < PPS_INTMAX) { + ntpdata->pps_shift++; + ntpdata->pps_intcnt = 0; } } } @@ -938,10 +940,10 @@ static long hardpps_update_freq(struct ntp_data *ntpdata, struct pps_normtime fr s64 ftemp; /* Check if the frequency interval was too long */ - if (freq_norm.sec > (2 << pps_shift)) { + if (freq_norm.sec > (2 << ntpdata->pps_shift)) { ntpdata->time_status |= STA_PPSERROR; pps_errcnt++; - pps_dec_freq_interval(); + pps_dec_freq_interval(ntpdata); printk_deferred(KERN_ERR "hardpps: PPSERROR: interval too long - %lld s\n", freq_norm.sec); return 0; @@ -960,10 +962,10 @@ static long hardpps_update_freq(struct ntp_data *ntpdata, struct pps_normtime fr printk_deferred(KERN_WARNING "hardpps: PPSWANDER: change=%ld\n", delta); ntpdata->time_status |= STA_PPSWANDER; pps_stbcnt++; - pps_dec_freq_interval(); + pps_dec_freq_interval(ntpdata); } else { /* Good sample */ - pps_inc_freq_interval(); + pps_inc_freq_interval(ntpdata); } /* @@ -1068,7 +1070,7 @@ void __hardpps(const struct timespec64 *phase_ts, const struct timespec64 *raw_t } /* Signal is ok. Check if the current frequency interval is finished */ - if (freq_norm.sec >= (1 << pps_shift)) { + if (freq_norm.sec >= (1 << ntpdata->pps_shift)) { pps_calcnt++; /* Restart the frequency calibration interval */ ntpdata->pps_fbase = *raw_ts; -- 2.51.0 From 12850b46583440911a2789355d25d8eb9fe8157d Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 11 Sep 2024 15:17:56 +0200 Subject: [PATCH 03/16] ntp: Move pps_freq/stabil into ntp_data Continue the conversion from static variables to struct based data. No functional change. Signed-off-by: Thomas Gleixner Signed-off-by: Anna-Maria Behnsen Signed-off-by: Thomas Gleixner Acked-by: John Stultz Link: https://lore.kernel.org/all/20240911-devel-anna-maria-b4-timers-ptp-ntp-v1-20-2d52f4e13476@linutronix.de --- kernel/time/ntp.c | 31 ++++++++++++++++--------------- 1 file changed, 16 insertions(+), 15 deletions(-) diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index bebff6c69c18..533367d7cccc 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c @@ -46,6 +46,8 @@ * @pps_fbase: PPS beginning of the last freq interval * @pps_shift: PPS current interval duration in seconds (shift value) * @pps_intcnt: PPS interval counter + * @pps_freq: PPS frequency offset in scaled ns/s + * @pps_stabil: PPS current stability in scaled ns/s * * Protected by the timekeeping locks. 
*/ @@ -71,6 +73,8 @@ struct ntp_data { struct timespec64 pps_fbase; int pps_shift; int pps_intcnt; + s64 pps_freq; + long pps_stabil; #endif }; @@ -106,9 +110,6 @@ static struct ntp_data tk_ntp_data = { intervals to decrease it */ #define PPS_MAXWANDER 100000 /* max PPS freq wander (ns/s) */ -static s64 pps_freq; /* frequency offset (scaled ns/s) */ -static long pps_stabil; /* current stability (scaled ns/s) */ - /* * PPS signal quality monitors */ @@ -148,7 +149,7 @@ static inline void pps_clear(struct ntp_data *ntpdata) ntpdata->pps_tf[1] = 0; ntpdata->pps_tf[2] = 0; ntpdata->pps_fbase.tv_sec = ntpdata->pps_fbase.tv_nsec = 0; - pps_freq = 0; + ntpdata->pps_freq = 0; } /* @@ -166,9 +167,9 @@ static inline void pps_dec_valid(struct ntp_data *ntpdata) } } -static inline void pps_set_freq(s64 freq) +static inline void pps_set_freq(struct ntp_data *ntpdata) { - pps_freq = freq; + ntpdata->pps_freq = ntpdata->time_freq; } static inline bool is_error_status(int status) @@ -196,13 +197,13 @@ static inline bool is_error_status(int status) static inline void pps_fill_timex(struct ntp_data *ntpdata, struct __kernel_timex *txc) { - txc->ppsfreq = shift_right((pps_freq >> PPM_SCALE_INV_SHIFT) * + txc->ppsfreq = shift_right((ntpdata->pps_freq >> PPM_SCALE_INV_SHIFT) * PPM_SCALE_INV, NTP_SCALE_SHIFT); txc->jitter = ntpdata->pps_jitter; if (!(ntpdata->time_status & STA_NANO)) txc->jitter = ntpdata->pps_jitter / NSEC_PER_USEC; txc->shift = ntpdata->pps_shift; - txc->stabil = pps_stabil; + txc->stabil = ntpdata->pps_stabil; txc->jitcnt = pps_jitcnt; txc->calcnt = pps_calcnt; txc->errcnt = pps_errcnt; @@ -219,7 +220,7 @@ static inline s64 ntp_offset_chunk(struct ntp_data *ntpdata, s64 offset) static inline void pps_reset_freq_interval(struct ntp_data *ntpdata) {} static inline void pps_clear(struct ntp_data *ntpdata) {} static inline void pps_dec_valid(struct ntp_data *ntpdata) {} -static inline void pps_set_freq(s64 freq) {} +static inline void pps_set_freq(struct ntp_data *ntpdata) {} static inline bool is_error_status(int status) { @@ -727,7 +728,7 @@ static inline void process_adjtimex_modes(struct ntp_data *ntpdata, const struct ntpdata->time_freq = min(ntpdata->time_freq, MAXFREQ_SCALED); ntpdata->time_freq = max(ntpdata->time_freq, -MAXFREQ_SCALED); /* Update pps_freq */ - pps_set_freq(ntpdata->time_freq); + pps_set_freq(ntpdata); } if (txc->modes & ADJ_MAXERROR) @@ -956,8 +957,8 @@ static long hardpps_update_freq(struct ntp_data *ntpdata, struct pps_normtime fr */ ftemp = div_s64(((s64)(-freq_norm.nsec)) << NTP_SCALE_SHIFT, freq_norm.sec); - delta = shift_right(ftemp - pps_freq, NTP_SCALE_SHIFT); - pps_freq = ftemp; + delta = shift_right(ftemp - ntpdata->pps_freq, NTP_SCALE_SHIFT); + ntpdata->pps_freq = ftemp; if (delta > PPS_MAXWANDER || delta < -PPS_MAXWANDER) { printk_deferred(KERN_WARNING "hardpps: PPSWANDER: change=%ld\n", delta); ntpdata->time_status |= STA_PPSWANDER; @@ -975,12 +976,12 @@ static long hardpps_update_freq(struct ntp_data *ntpdata, struct pps_normtime fr delta_mod = delta; if (delta_mod < 0) delta_mod = -delta_mod; - pps_stabil += (div_s64(((s64)delta_mod) << (NTP_SCALE_SHIFT - SHIFT_USEC), - NSEC_PER_USEC) - pps_stabil) >> PPS_INTMIN; + ntpdata->pps_stabil += (div_s64(((s64)delta_mod) << (NTP_SCALE_SHIFT - SHIFT_USEC), + NSEC_PER_USEC) - ntpdata->pps_stabil) >> PPS_INTMIN; /* If enabled, the system clock frequency is updated */ if ((ntpdata->time_status & STA_PPSFREQ) && !(ntpdata->time_status & STA_FREQHOLD)) { - ntpdata->time_freq = pps_freq; + ntpdata->time_freq = 
ntpdata->pps_freq; ntp_update_frequency(ntpdata); } -- 2.51.0 From 6fadb4a61d3fd4cdc6ede38a911b4abbfb43eed4 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 11 Sep 2024 15:17:57 +0200 Subject: [PATCH 04/16] ntp: Move pps monitors into ntp_data Finalize the conversion from static variables to struct based data. No functional change. Signed-off-by: Thomas Gleixner Signed-off-by: Anna-Maria Behnsen Signed-off-by: Thomas Gleixner Acked-by: John Stultz Link: https://lore.kernel.org/all/20240911-devel-anna-maria-b4-timers-ptp-ntp-v1-21-2d52f4e13476@linutronix.de --- kernel/time/ntp.c | 33 ++++++++++++++++----------------- 1 file changed, 16 insertions(+), 17 deletions(-) diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index 533367d7cccc..b550ebe0f03b 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c @@ -48,6 +48,10 @@ * @pps_intcnt: PPS interval counter * @pps_freq: PPS frequency offset in scaled ns/s * @pps_stabil: PPS current stability in scaled ns/s + * @pps_calcnt: PPS monitor: calibration intervals + * @pps_jitcnt: PPS monitor: jitter limit exceeded + * @pps_stbcnt: PPS monitor: stability limit exceeded + * @pps_errcnt: PPS monitor: calibration errors * * Protected by the timekeeping locks. */ @@ -75,6 +79,10 @@ struct ntp_data { int pps_intcnt; s64 pps_freq; long pps_stabil; + long pps_calcnt; + long pps_jitcnt; + long pps_stbcnt; + long pps_errcnt; #endif }; @@ -110,15 +118,6 @@ static struct ntp_data tk_ntp_data = { intervals to decrease it */ #define PPS_MAXWANDER 100000 /* max PPS freq wander (ns/s) */ -/* - * PPS signal quality monitors - */ -static long pps_calcnt; /* calibration intervals */ -static long pps_jitcnt; /* jitter limit exceeded */ -static long pps_stbcnt; /* stability limit exceeded */ -static long pps_errcnt; /* calibration errors */ - - /* * PPS kernel consumer compensates the whole phase error immediately. * Otherwise, reduce the offset by a fixed factor times the time constant. 
@@ -204,10 +203,10 @@ static inline void pps_fill_timex(struct ntp_data *ntpdata, struct __kernel_time txc->jitter = ntpdata->pps_jitter / NSEC_PER_USEC; txc->shift = ntpdata->pps_shift; txc->stabil = ntpdata->pps_stabil; - txc->jitcnt = pps_jitcnt; - txc->calcnt = pps_calcnt; - txc->errcnt = pps_errcnt; - txc->stbcnt = pps_stbcnt; + txc->jitcnt = ntpdata->pps_jitcnt; + txc->calcnt = ntpdata->pps_calcnt; + txc->errcnt = ntpdata->pps_errcnt; + txc->stbcnt = ntpdata->pps_stbcnt; } #else /* !CONFIG_NTP_PPS */ @@ -943,7 +942,7 @@ static long hardpps_update_freq(struct ntp_data *ntpdata, struct pps_normtime fr /* Check if the frequency interval was too long */ if (freq_norm.sec > (2 << ntpdata->pps_shift)) { ntpdata->time_status |= STA_PPSERROR; - pps_errcnt++; + ntpdata->pps_errcnt++; pps_dec_freq_interval(ntpdata); printk_deferred(KERN_ERR "hardpps: PPSERROR: interval too long - %lld s\n", freq_norm.sec); @@ -962,7 +961,7 @@ static long hardpps_update_freq(struct ntp_data *ntpdata, struct pps_normtime fr if (delta > PPS_MAXWANDER || delta < -PPS_MAXWANDER) { printk_deferred(KERN_WARNING "hardpps: PPSWANDER: change=%ld\n", delta); ntpdata->time_status |= STA_PPSWANDER; - pps_stbcnt++; + ntpdata->pps_stbcnt++; pps_dec_freq_interval(ntpdata); } else { /* Good sample */ @@ -1007,7 +1006,7 @@ static void hardpps_update_phase(struct ntp_data *ntpdata, long error) printk_deferred(KERN_WARNING "hardpps: PPSJITTER: jitter=%ld, limit=%ld\n", jitter, (ntpdata->pps_jitter << PPS_POPCORN)); ntpdata->time_status |= STA_PPSJITTER; - pps_jitcnt++; + ntpdata->pps_jitcnt++; } else if (ntpdata->time_status & STA_PPSTIME) { /* Correct the time using the phase offset */ ntpdata->time_offset = div_s64(((s64)correction) << NTP_SCALE_SHIFT, @@ -1072,7 +1071,7 @@ void __hardpps(const struct timespec64 *phase_ts, const struct timespec64 *raw_t /* Signal is ok. Check if the current frequency interval is finished */ if (freq_norm.sec >= (1 << ntpdata->pps_shift)) { - pps_calcnt++; + ntpdata->pps_calcnt++; /* Restart the frequency calibration interval */ ntpdata->pps_fbase = *raw_ts; hardpps_update_freq(ntpdata, freq_norm); -- 2.51.0 From 8102c4daf44ab86c2d2226a8136bec905d6e2bd1 Mon Sep 17 00:00:00 2001 From: Vincent Donnefort Date: Wed, 11 Sep 2024 10:30:20 +0100 Subject: [PATCH 05/16] timekeeping: Add the boot clock to system time snapshot For tracing purpose, the boot clock is interesting as it doesn't stop on suspend. Export it as part of the time snapshot. This will later allow the hypervisor to add boot clock timestamps to its events. 
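As a rough illustration (not part of this patch), a consumer could read the new member like this. ktime_get_snapshot() and the @boot field come from the change below; the logging helper itself is made up:

	#include <linux/timekeeping.h>
	#include <linux/ktime.h>

	/*
	 * Example only: take a snapshot and print realtime, boot and raw
	 * time together. Only the @boot member is new in this patch.
	 */
	static void example_log_snapshot(void)
	{
		struct system_time_snapshot snap;

		ktime_get_snapshot(&snap);
		pr_info("real=%lld ns boot=%lld ns raw=%lld ns\n",
			ktime_to_ns(snap.real), ktime_to_ns(snap.boot),
			ktime_to_ns(snap.raw));
	}
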
Signed-off-by: Vincent Donnefort Signed-off-by: Thomas Gleixner Acked-by: John Stultz Link: https://lore.kernel.org/all/20240911093029.3279154-5-vdonnefort@google.com --- include/linux/timekeeping.h | 2 ++ kernel/time/timekeeping.c | 4 ++++ 2 files changed, 6 insertions(+) diff --git a/include/linux/timekeeping.h b/include/linux/timekeeping.h index fc12a9ba2c88..e85c27347e44 100644 --- a/include/linux/timekeeping.h +++ b/include/linux/timekeeping.h @@ -275,6 +275,7 @@ struct ktime_timestamps { * counter value * @cycles: Clocksource counter value to produce the system times * @real: Realtime system time + * @boot: Boot time * @raw: Monotonic raw system time * @cs_id: Clocksource ID * @clock_was_set_seq: The sequence number of clock-was-set events @@ -283,6 +284,7 @@ struct ktime_timestamps { struct system_time_snapshot { u64 cycles; ktime_t real; + ktime_t boot; ktime_t raw; enum clocksource_ids cs_id; unsigned int clock_was_set_seq; diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 7e6f409bf311..47e44b9d2671 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -1060,6 +1060,7 @@ void ktime_get_snapshot(struct system_time_snapshot *systime_snapshot) unsigned int seq; ktime_t base_raw; ktime_t base_real; + ktime_t base_boot; u64 nsec_raw; u64 nsec_real; u64 now; @@ -1074,6 +1075,8 @@ void ktime_get_snapshot(struct system_time_snapshot *systime_snapshot) systime_snapshot->clock_was_set_seq = tk->clock_was_set_seq; base_real = ktime_add(tk->tkr_mono.base, tk_core.timekeeper.offs_real); + base_boot = ktime_add(tk->tkr_mono.base, + tk_core.timekeeper.offs_boot); base_raw = tk->tkr_raw.base; nsec_real = timekeeping_cycles_to_ns(&tk->tkr_mono, now); nsec_raw = timekeeping_cycles_to_ns(&tk->tkr_raw, now); @@ -1081,6 +1084,7 @@ void ktime_get_snapshot(struct system_time_snapshot *systime_snapshot) systime_snapshot->cycles = now; systime_snapshot->real = ktime_add_ns(base_real, nsec_real); + systime_snapshot->boot = ktime_add_ns(base_boot, nsec_real); systime_snapshot->raw = ktime_add_ns(base_raw, nsec_raw); } EXPORT_SYMBOL_GPL(ktime_get_snapshot); -- 2.51.0 From 8c111f1b967687f47bb0cfbedf2863b62c23223c Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Tue, 10 Sep 2024 13:43:34 -0400 Subject: [PATCH 06/16] timekeeping: Don't use seqcount loop in ktime_mono_to_any() on 64-bit systems ktime_mono_to_any() only fetches the offset inside the loop. This is a single word on 64-bit CPUs, and seqcount_read_begin() implies a full SMP barrier. Use READ_ONCE() to fetch the offset instead of doing a seqcount loop on 64-bit and add the matching WRITE_ONCE()'s to update the offsets in tk_set_wall_to_mono() and tk_update_sleep_time(). 
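The pattern boils down to the following simplified sketch (generic names, not the real timekeeper fields): on 64-bit a single aligned load of the offset cannot be torn, so the seqcount retry loop is only kept for 32-bit.

	#include <linux/ktime.h>
	#include <linux/seqlock.h>

	/* Illustrative only: stand-ins for the timekeeper's offset and seqcount. */
	static DEFINE_SEQLOCK(example_lock);
	static ktime_t example_offset;

	static ktime_t example_mono_to_other(ktime_t tmono)
	{
		unsigned int seq;
		ktime_t tconv;

		if (IS_ENABLED(CONFIG_64BIT)) {
			/*
			 * A single aligned 64-bit load cannot be torn.
			 * Pairs with WRITE_ONCE() on the update side.
			 */
			return ktime_add(tmono, READ_ONCE(example_offset));
		}

		/* 32-bit: retry if a writer changed the offset mid-read */
		do {
			seq = read_seqbegin(&example_lock);
			tconv = ktime_add(tmono, example_offset);
		} while (read_seqretry(&example_lock, seq));

		return tconv;
	}
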
[ tglx: Get rid of the #ifdeffery ] Signed-off-by: Jeff Layton Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/all/20240910-mgtime-v3-1-84406ed53fad@kernel.org --- kernel/time/timekeeping.c | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 47e44b9d2671..a57f2eed2ce6 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -161,13 +161,15 @@ static void tk_set_wall_to_mono(struct timekeeper *tk, struct timespec64 wtm) WARN_ON_ONCE(tk->offs_real != timespec64_to_ktime(tmp)); tk->wall_to_monotonic = wtm; set_normalized_timespec64(&tmp, -wtm.tv_sec, -wtm.tv_nsec); - tk->offs_real = timespec64_to_ktime(tmp); - tk->offs_tai = ktime_add(tk->offs_real, ktime_set(tk->tai_offset, 0)); + /* Paired with READ_ONCE() in ktime_mono_to_any() */ + WRITE_ONCE(tk->offs_real, timespec64_to_ktime(tmp)); + WRITE_ONCE(tk->offs_tai, ktime_add(tk->offs_real, ktime_set(tk->tai_offset, 0))); } static inline void tk_update_sleep_time(struct timekeeper *tk, ktime_t delta) { - tk->offs_boot = ktime_add(tk->offs_boot, delta); + /* Paired with READ_ONCE() in ktime_mono_to_any() */ + WRITE_ONCE(tk->offs_boot, ktime_add(tk->offs_boot, delta)); /* * Timespec representation for VDSO update to avoid 64bit division * on every update. @@ -930,6 +932,14 @@ ktime_t ktime_mono_to_any(ktime_t tmono, enum tk_offsets offs) unsigned int seq; ktime_t tconv; + if (IS_ENABLED(CONFIG_64BIT)) { + /* + * Paired with WRITE_ONCE()s in tk_set_wall_to_mono() and + * tk_update_sleep_time(). + */ + return ktime_add(tmono, READ_ONCE(*offset)); + } + do { seq = read_seqcount_begin(&tk_core.seq); tconv = ktime_add(tmono, *offset); -- 2.51.0 From 70c8fd00a9bd0509bbf7bccd9baea8bbd5ddc756 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Wed, 2 Oct 2024 17:27:16 -0400 Subject: [PATCH 07/16] timekeeping: Add interfaces for handling timestamps with a floor value Multigrain timestamps allow the kernel to use fine-grained timestamps when an inode's attributes is being actively observed via ->getattr(). With this support, it's possible for a file to get a fine-grained timestamp, and another modified after it to get a coarse-grained stamp that is earlier than the fine-grained time. If this happens then the files can appear to have been modified in reverse order, which breaks VFS ordering guarantees [1]. To prevent this, maintain a floor value for multigrain timestamps. Whenever a fine-grained timestamp is handed out, record it, and when later coarse-grained stamps are handed out, ensure they are not earlier than that value. If the coarse-grained timestamp is earlier than the fine-grained floor, return the floor value instead. Add a static singleton atomic64_t into timekeeper.c that is used to keep track of the latest fine-grained time ever handed out. This is tracked as a monotonic ktime_t value to ensure that it isn't affected by clock jumps. Because it is updated at different times than the rest of the timekeeper object, the floor value is managed independently of the timekeeper via a cmpxchg() operation, and sits on its own cacheline. Add two new public interfaces: - ktime_get_coarse_real_ts64_mg() fills a timespec64 with the later of the coarse-grained clock and the floor time - ktime_get_real_ts64_mg() gets the fine-grained clock value, and tries to swap it into the floor. A timespec64 is filled with the result. The floor value is global and updated via a single try_cmpxchg(). 
If that fails then the operation raced with a concurrent update. Any concurrent update must be later than the existing floor value, so any racing tasks can accept any resulting floor value without retrying. [1]: POSIX requires that files be stamped with realtime clock values, and makes no provision for dealing with backward clock jumps. If a backward realtime clock jump occurs, then files can appear to have been modified in reverse order. Signed-off-by: Jeff Layton Signed-off-by: Thomas Gleixner Tested-by: Randy Dunlap # documentation bits Acked-by: John Stultz Link: https://lore.kernel.org/all/20241002-mgtime-v10-1-d1c4717f5284@kernel.org --- include/linux/timekeeping.h | 4 ++ kernel/time/timekeeping.c | 104 ++++++++++++++++++++++++++++++++++++ 2 files changed, 108 insertions(+) diff --git a/include/linux/timekeeping.h b/include/linux/timekeeping.h index fc12a9ba2c88..7aa85246c183 100644 --- a/include/linux/timekeeping.h +++ b/include/linux/timekeeping.h @@ -45,6 +45,10 @@ extern void ktime_get_real_ts64(struct timespec64 *tv); extern void ktime_get_coarse_ts64(struct timespec64 *ts); extern void ktime_get_coarse_real_ts64(struct timespec64 *ts); +/* Multigrain timestamp interfaces */ +extern void ktime_get_coarse_real_ts64_mg(struct timespec64 *ts); +extern void ktime_get_real_ts64_mg(struct timespec64 *ts); + void getboottime64(struct timespec64 *ts); /* diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 7e6f409bf311..441792c907fa 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -114,6 +114,23 @@ static struct tk_fast tk_fast_raw ____cacheline_aligned = { .base[1] = FAST_TK_INIT, }; +/* + * Multigrain timestamps require tracking the latest fine-grained timestamp + * that has been issued, and never returning a coarse-grained timestamp that is + * earlier than that value. + * + * mg_floor represents the latest fine-grained time that has been handed out as + * a file timestamp on the system. This is tracked as a monotonic ktime_t, and + * converted to a realtime clock value on an as-needed basis. + * + * Maintaining mg_floor ensures the multigrain interfaces never issue a + * timestamp earlier than one that has been previously issued. + * + * The exception to this rule is when there is a backward realtime clock jump. If + * such an event occurs, a timestamp can appear to be earlier than a previous one. + */ +static __cacheline_aligned_in_smp atomic64_t mg_floor; + static inline void tk_normalize_xtime(struct timekeeper *tk) { while (tk->tkr_mono.xtime_nsec >= ((u64)NSEC_PER_SEC << tk->tkr_mono.shift)) { @@ -2394,6 +2411,93 @@ void ktime_get_coarse_real_ts64(struct timespec64 *ts) } EXPORT_SYMBOL(ktime_get_coarse_real_ts64); +/** + * ktime_get_coarse_real_ts64_mg - return latter of coarse grained time or floor + * @ts: timespec64 to be filled + * + * Fetch the global mg_floor value, convert it to realtime and compare it + * to the current coarse-grained time. Fill @ts with whichever is + * latest. Note that this is a filesystem-specific interface and should be + * avoided outside of that context. 
+ */ +void ktime_get_coarse_real_ts64_mg(struct timespec64 *ts) +{ + struct timekeeper *tk = &tk_core.timekeeper; + u64 floor = atomic64_read(&mg_floor); + ktime_t f_real, offset, coarse; + unsigned int seq; + + do { + seq = read_seqcount_begin(&tk_core.seq); + *ts = tk_xtime(tk); + offset = tk_core.timekeeper.offs_real; + } while (read_seqcount_retry(&tk_core.seq, seq)); + + coarse = timespec64_to_ktime(*ts); + f_real = ktime_add(floor, offset); + if (ktime_after(f_real, coarse)) + *ts = ktime_to_timespec64(f_real); +} + +/** + * ktime_get_real_ts64_mg - attempt to update floor value and return result + * @ts: pointer to the timespec to be set + * + * Get a monotonic fine-grained time value and attempt to swap it into + * mg_floor. If that succeeds then accept the new floor value. If it fails + * then another task raced in during the interim time and updated the + * floor. Since any update to the floor must be later than the previous + * floor, either outcome is acceptable. + * + * Typically this will be called after calling ktime_get_coarse_real_ts64_mg(), + * and determining that the resulting coarse-grained timestamp did not effect + * a change in ctime. Any more recent floor value would effect a change to + * ctime, so there is no need to retry the atomic64_try_cmpxchg() on failure. + * + * @ts will be filled with the latest floor value, regardless of the outcome of + * the cmpxchg. Note that this is a filesystem specific interface and should be + * avoided outside of that context. + */ +void ktime_get_real_ts64_mg(struct timespec64 *ts) +{ + struct timekeeper *tk = &tk_core.timekeeper; + ktime_t old = atomic64_read(&mg_floor); + ktime_t offset, mono; + unsigned int seq; + u64 nsecs; + + do { + seq = read_seqcount_begin(&tk_core.seq); + + ts->tv_sec = tk->xtime_sec; + mono = tk->tkr_mono.base; + nsecs = timekeeping_get_ns(&tk->tkr_mono); + offset = tk_core.timekeeper.offs_real; + } while (read_seqcount_retry(&tk_core.seq, seq)); + + mono = ktime_add_ns(mono, nsecs); + + /* + * Attempt to update the floor with the new time value. As any + * update must be later then the existing floor, and would effect + * a change to ctime from the perspective of the current task, + * accept the resulting floor value regardless of the outcome of + * the swap. + */ + if (atomic64_try_cmpxchg(&mg_floor, &old, mono)) { + ts->tv_nsec = 0; + timespec64_add_ns(ts, nsecs); + } else { + /* + * Another task changed mg_floor since "old" was fetched. + * "old" has been updated with the latest value of "mg_floor". + * That value is newer than the previous floor value, which + * is enough to effect a change to ctime. Accept it. + */ + *ts = ktime_to_timespec64(ktime_add(old, offset)); + } +} + void ktime_get_coarse_ts64(struct timespec64 *ts) { struct timekeeper *tk = &tk_core.timekeeper; -- 2.51.0 From 96f9a366ec8abe027326d7aab84d64370019f0f1 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Wed, 2 Oct 2024 17:27:17 -0400 Subject: [PATCH 08/16] timekeeping: Add percpu counter for tracking floor swap events The mgtime_floor value is a global variable for tracking the latest fine-grained timestamp handed out. Because it's a global, track the number of times that a new floor value is assigned. Add a new percpu counter to the timekeeping code to track the number of floor swap events that have occurred. A later patch will add a debugfs file to display this counter alongside other stats involving multigrain timestamps. 
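For context, a filesystem caller would combine the two interfaces added in the previous patch roughly as follows. This is a hypothetical, simplified sketch; the actual VFS plumbing lands in separate patches and the helper name below is made up:

	#include <linux/timekeeping.h>
	#include <linux/time64.h>

	/*
	 * Hypothetical example: pick a ctime value for an inode update.
	 * If the coarse time (clamped to the fine-grained floor) already
	 * differs from the old ctime, use it. Only fall back to a
	 * fine-grained stamp, which advances the global floor, when the
	 * coarse value would not change ctime.
	 */
	static struct timespec64 example_pick_ctime(struct timespec64 old_ctime)
	{
		struct timespec64 now;

		ktime_get_coarse_real_ts64_mg(&now);
		if (timespec64_compare(&now, &old_ctime) > 0)
			return now;

		ktime_get_real_ts64_mg(&now);
		return now;
	}
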
Signed-off-by: Jeff Layton Signed-off-by: Thomas Gleixner Tested-by: Randy Dunlap # documentation bits Link: https://lore.kernel.org/all/20241002-mgtime-v10-2-d1c4717f5284@kernel.org --- include/linux/timekeeping.h | 1 + kernel/time/timekeeping.c | 1 + kernel/time/timekeeping_debug.c | 13 +++++++++++++ kernel/time/timekeeping_internal.h | 15 +++++++++++++++ 4 files changed, 30 insertions(+) diff --git a/include/linux/timekeeping.h b/include/linux/timekeeping.h index 7aa85246c183..84a035e86ac8 100644 --- a/include/linux/timekeeping.h +++ b/include/linux/timekeeping.h @@ -48,6 +48,7 @@ extern void ktime_get_coarse_real_ts64(struct timespec64 *ts); /* Multigrain timestamp interfaces */ extern void ktime_get_coarse_real_ts64_mg(struct timespec64 *ts); extern void ktime_get_real_ts64_mg(struct timespec64 *ts); +extern unsigned long timekeeping_get_mg_floor_swaps(void); void getboottime64(struct timespec64 *ts); diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 441792c907fa..962b2a31f015 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -2487,6 +2487,7 @@ void ktime_get_real_ts64_mg(struct timespec64 *ts) if (atomic64_try_cmpxchg(&mg_floor, &old, mono)) { ts->tv_nsec = 0; timespec64_add_ns(ts, nsecs); + timekeeping_inc_mg_floor_swaps(); } else { /* * Another task changed mg_floor since "old" was fetched. diff --git a/kernel/time/timekeeping_debug.c b/kernel/time/timekeeping_debug.c index b73e8850e58d..badeb222eab9 100644 --- a/kernel/time/timekeeping_debug.c +++ b/kernel/time/timekeeping_debug.c @@ -17,6 +17,9 @@ #define NUM_BINS 32 +/* Incremented every time mg_floor is updated */ +DEFINE_PER_CPU(unsigned long, timekeeping_mg_floor_swaps); + static unsigned int sleep_time_bin[NUM_BINS] = {0}; static int tk_debug_sleep_time_show(struct seq_file *s, void *data) @@ -53,3 +56,13 @@ void tk_debug_account_sleep_time(const struct timespec64 *t) (s64)t->tv_sec, t->tv_nsec / NSEC_PER_MSEC); } +unsigned long timekeeping_get_mg_floor_swaps(void) +{ + unsigned long sum = 0; + int cpu; + + for_each_possible_cpu(cpu) + sum += data_race(per_cpu(timekeeping_mg_floor_swaps, cpu)); + + return sum; +} diff --git a/kernel/time/timekeeping_internal.h b/kernel/time/timekeeping_internal.h index 4ca2787d1642..0bbae825bc02 100644 --- a/kernel/time/timekeeping_internal.h +++ b/kernel/time/timekeeping_internal.h @@ -10,9 +10,24 @@ * timekeeping debug functions */ #ifdef CONFIG_DEBUG_FS + +DECLARE_PER_CPU(unsigned long, timekeeping_mg_floor_swaps); + +static inline void timekeeping_inc_mg_floor_swaps(void) +{ + this_cpu_inc(timekeeping_mg_floor_swaps); +} + extern void tk_debug_account_sleep_time(const struct timespec64 *t); + #else + #define tk_debug_account_sleep_time(x) + +static inline void timekeeping_inc_mg_floor_swaps(void) +{ +} + #endif #ifdef CONFIG_CLOCKSOURCE_VALIDATE_LAST_CYCLE -- 2.51.0 From bafffd56c608106d11e7aec851f114dcd66b2091 Mon Sep 17 00:00:00 2001 From: "Dr. David Alan Gilbert" Date: Thu, 10 Oct 2024 14:54:46 +0100 Subject: [PATCH 09/16] clocksource: Remove unused clocksource_change_rating clocksource_change_rating() has been unused since 2017's commit 63ed4e0c67df ("Drivers: hv: vmbus: Consolidate all Hyper-V specific clocksource code") Remove it. __clocksource_change_rating now only has one use which is ifdef'd. Move it into the ifdef'd section. Signed-off-by: Dr. 
David Alan Gilbert Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/all/20241010135446.213098-1-linux@treblig.org --- include/linux/clocksource.h | 1 - kernel/time/clocksource.c | 40 ++++++++++--------------------------- 2 files changed, 10 insertions(+), 31 deletions(-) diff --git a/include/linux/clocksource.h b/include/linux/clocksource.h index d35b677b08fe..ef1b16da6ad5 100644 --- a/include/linux/clocksource.h +++ b/include/linux/clocksource.h @@ -215,7 +215,6 @@ static inline s64 clocksource_cyc2ns(u64 cycles, u32 mult, u32 shift) extern int clocksource_unregister(struct clocksource*); extern void clocksource_touch_watchdog(void); -extern void clocksource_change_rating(struct clocksource *cs, int rating); extern void clocksource_suspend(void); extern void clocksource_resume(void); extern struct clocksource * __init clocksource_default_clock(void); diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index 23336eecb4f4..aab6472853fa 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -20,6 +20,8 @@ #include "tick-internal.h" #include "timekeeping_internal.h" +static void clocksource_enqueue(struct clocksource *cs); + static noinline u64 cycles_to_nsec_safe(struct clocksource *cs, u64 start, u64 end) { u64 delta = clocksource_delta(end, start, cs->mask); @@ -171,7 +173,6 @@ static inline void clocksource_watchdog_unlock(unsigned long *flags) } static int clocksource_watchdog_kthread(void *data); -static void __clocksource_change_rating(struct clocksource *cs, int rating); static void clocksource_watchdog_work(struct work_struct *work) { @@ -191,6 +192,13 @@ static void clocksource_watchdog_work(struct work_struct *work) kthread_run(clocksource_watchdog_kthread, NULL, "kwatchdog"); } +static void clocksource_change_rating(struct clocksource *cs, int rating) +{ + list_del(&cs->list); + cs->rating = rating; + clocksource_enqueue(cs); +} + static void __clocksource_unstable(struct clocksource *cs) { cs->flags &= ~(CLOCK_SOURCE_VALID_FOR_HRES | CLOCK_SOURCE_WATCHDOG); @@ -697,7 +705,7 @@ static int __clocksource_watchdog_kthread(void) list_for_each_entry_safe(cs, tmp, &watchdog_list, wd_list) { if (cs->flags & CLOCK_SOURCE_UNSTABLE) { list_del_init(&cs->wd_list); - __clocksource_change_rating(cs, 0); + clocksource_change_rating(cs, 0); select = 1; } if (cs->flags & CLOCK_SOURCE_RESELECT) { @@ -1255,34 +1263,6 @@ int __clocksource_register_scale(struct clocksource *cs, u32 scale, u32 freq) } EXPORT_SYMBOL_GPL(__clocksource_register_scale); -static void __clocksource_change_rating(struct clocksource *cs, int rating) -{ - list_del(&cs->list); - cs->rating = rating; - clocksource_enqueue(cs); -} - -/** - * clocksource_change_rating - Change the rating of a registered clocksource - * @cs: clocksource to be changed - * @rating: new rating - */ -void clocksource_change_rating(struct clocksource *cs, int rating) -{ - unsigned long flags; - - mutex_lock(&clocksource_mutex); - clocksource_watchdog_lock(&flags); - __clocksource_change_rating(cs, rating); - clocksource_watchdog_unlock(&flags); - - clocksource_select(); - clocksource_select_watchdog(false); - clocksource_suspend_select(false); - mutex_unlock(&clocksource_mutex); -} -EXPORT_SYMBOL(clocksource_change_rating); - /* * Unbind clocksource @cs. 
Called with clocksource_mutex held */ -- 2.51.0 From a849881a9e5426cb4fa00660529bc501718ef85b Mon Sep 17 00:00:00 2001 From: Wang Jinchao Date: Wed, 9 Oct 2024 10:21:35 +0800 Subject: [PATCH 10/16] time: Remove '%' from numeric constant in kernel-doc comment Change %0 to 0 in kernel-doc comments. %0 is not valid. Signed-off-by: Wang Jinchao Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/all/20241009022135.92400-2-wangjinchao@xfusion.com --- kernel/time/time.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/kernel/time/time.c b/kernel/time/time.c index 642647f5046b..5984d4a5639b 100644 --- a/kernel/time/time.c +++ b/kernel/time/time.c @@ -866,7 +866,7 @@ struct timespec64 timespec64_add_safe(const struct timespec64 lhs, * * Handles compat or 32-bit modes. * - * Return: %0 on success or negative errno on error + * Return: 0 on success or negative errno on error */ int get_timespec64(struct timespec64 *ts, const struct __kernel_timespec __user *uts) @@ -897,7 +897,7 @@ EXPORT_SYMBOL_GPL(get_timespec64); * @ts: input &struct timespec64 * @uts: user's &struct __kernel_timespec * - * Return: %0 on success or negative errno on error + * Return: 0 on success or negative errno on error */ int put_timespec64(const struct timespec64 *ts, struct __kernel_timespec __user *uts) @@ -944,7 +944,7 @@ static int __put_old_timespec32(const struct timespec64 *ts64, * * Handles X86_X32_ABI compatibility conversion. * - * Return: %0 on success or negative errno on error + * Return: 0 on success or negative errno on error */ int get_old_timespec32(struct timespec64 *ts, const void __user *uts) { @@ -963,7 +963,7 @@ EXPORT_SYMBOL_GPL(get_old_timespec32); * * Handles X86_X32_ABI compatibility conversion. * - * Return: %0 on success or negative errno on error + * Return: 0 on success or negative errno on error */ int put_old_timespec32(const struct timespec64 *ts, void __user *uts) { @@ -979,7 +979,7 @@ EXPORT_SYMBOL_GPL(put_old_timespec32); * @it: destination &struct itimerspec64 * @uit: user's &struct __kernel_itimerspec * - * Return: %0 on success or negative errno on error + * Return: 0 on success or negative errno on error */ int get_itimerspec64(struct itimerspec64 *it, const struct __kernel_itimerspec __user *uit) @@ -1002,7 +1002,7 @@ EXPORT_SYMBOL_GPL(get_itimerspec64); * @it: input &struct itimerspec64 * @uit: user's &struct __kernel_itimerspec * - * Return: %0 on success or negative errno on error + * Return: 0 on success or negative errno on error */ int put_itimerspec64(const struct itimerspec64 *it, struct __kernel_itimerspec __user *uit) @@ -1024,7 +1024,7 @@ EXPORT_SYMBOL_GPL(put_itimerspec64); * @its: destination &struct itimerspec64 * @uits: user's &struct old_itimerspec32 * - * Return: %0 on success or negative errno on error + * Return: 0 on success or negative errno on error */ int get_old_itimerspec32(struct itimerspec64 *its, const struct old_itimerspec32 __user *uits) @@ -1043,7 +1043,7 @@ EXPORT_SYMBOL_GPL(get_old_itimerspec32); * @its: input &struct itimerspec64 * @uits: user's &struct old_itimerspec32 * - * Return: %0 on success or negative errno on error + * Return: 0 on success or negative errno on error */ int put_old_itimerspec32(const struct itimerspec64 *its, struct old_itimerspec32 __user *uits) -- 2.51.0 From 3a2e83d350950a84dddb0094c92e380f31fd5333 Mon Sep 17 00:00:00 2001 From: Anna-Maria Behnsen Date: Mon, 14 Oct 2024 10:22:18 +0200 Subject: [PATCH 11/16] MAINTAINERS: Add missing file include/linux/delay.h include/linux/delay.h 
is not covered by MAINTAINERS file. Add it to the "HIGH-RESOLUTION TIMERS, TIMER WHEEL, CLOCKEVENTS" section. Signed-off-by: Anna-Maria Behnsen Signed-off-by: Thomas Gleixner Acked-by: Frederic Weisbecker Link: https://lore.kernel.org/all/20241014-devel-anna-maria-b4-timers-flseep-v3-1-dc8b907cb62f@linutronix.de --- MAINTAINERS | 1 + 1 file changed, 1 insertion(+) diff --git a/MAINTAINERS b/MAINTAINERS index c27f3190737f..b52362566629 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -10162,6 +10162,7 @@ S: Maintained T: git git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git timers/core F: Documentation/timers/ F: include/linux/clockchips.h +F: include/linux/delay.h F: include/linux/hrtimer.h F: include/linux/timer.h F: kernel/time/clockevents.c -- 2.51.0 From da7bd0a9e0fce9f293b6e30c003f8f3978cee923 Mon Sep 17 00:00:00 2001 From: Anna-Maria Behnsen Date: Mon, 14 Oct 2024 10:22:19 +0200 Subject: [PATCH 12/16] timers: Move *sleep*() and timeout functions into a separate file All schedule_timeout() and *sleep*() related functions are interfaces on top of timer list timers and hrtimers to add a sleep to the code. As they are built on top of the timer list timers and hrtimers, the [hr]timer interfaces are already used except when queuing the timer in schedule_timeout(). But there exists the appropriate interface add_timer() which does the same job with an extra check for an already pending timer. Split all those functions as they are into a separate file and use add_timer() instead of __mod_timer() in schedule_timeout(). While at it fix minor formatting issues and a multi line printk function call in schedule_timeout(). Signed-off-by: Anna-Maria Behnsen Signed-off-by: Thomas Gleixner Acked-by: Frederic Weisbecker Link: https://lore.kernel.org/all/20241014-devel-anna-maria-b4-timers-flseep-v3-2-dc8b907cb62f@linutronix.de --- MAINTAINERS | 1 + kernel/time/Makefile | 2 +- kernel/time/hrtimer.c | 120 -------------- kernel/time/sleep_timeout.c | 317 ++++++++++++++++++++++++++++++++++++ kernel/time/timer.c | 192 ---------------------- 5 files changed, 319 insertions(+), 313 deletions(-) create mode 100644 kernel/time/sleep_timeout.c diff --git a/MAINTAINERS b/MAINTAINERS index b52362566629..2250eb10ece1 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -10167,6 +10167,7 @@ F: include/linux/hrtimer.h F: include/linux/timer.h F: kernel/time/clockevents.c F: kernel/time/hrtimer.c +F: kernel/time/sleep_timeout.c F: kernel/time/timer.c F: kernel/time/timer_list.c F: kernel/time/timer_migration.* diff --git a/kernel/time/Makefile b/kernel/time/Makefile index 4af2a264a160..fe0ae82124fe 100644 --- a/kernel/time/Makefile +++ b/kernel/time/Makefile @@ -1,5 +1,5 @@ # SPDX-License-Identifier: GPL-2.0 -obj-y += time.o timer.o hrtimer.o +obj-y += time.o timer.o hrtimer.o sleep_timeout.o obj-y += timekeeping.o ntp.o clocksource.o jiffies.o timer_list.o obj-y += timeconv.o timecounter.o alarmtimer.o diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index cddcd08ea827..04f7d8a392c3 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -2242,123 +2242,3 @@ void __init hrtimers_init(void) hrtimers_prepare_cpu(smp_processor_id()); open_softirq(HRTIMER_SOFTIRQ, hrtimer_run_softirq); } - -/** - * schedule_hrtimeout_range_clock - sleep until timeout - * @expires: timeout value (ktime_t) - * @delta: slack in expires timeout (ktime_t) - * @mode: timer mode - * @clock_id: timer clock to be used - */ -int __sched -schedule_hrtimeout_range_clock(ktime_t *expires, u64 delta, - const enum hrtimer_mode mode, 
clockid_t clock_id) -{ - struct hrtimer_sleeper t; - - /* - * Optimize when a zero timeout value is given. It does not - * matter whether this is an absolute or a relative time. - */ - if (expires && *expires == 0) { - __set_current_state(TASK_RUNNING); - return 0; - } - - /* - * A NULL parameter means "infinite" - */ - if (!expires) { - schedule(); - return -EINTR; - } - - hrtimer_init_sleeper_on_stack(&t, clock_id, mode); - hrtimer_set_expires_range_ns(&t.timer, *expires, delta); - hrtimer_sleeper_start_expires(&t, mode); - - if (likely(t.task)) - schedule(); - - hrtimer_cancel(&t.timer); - destroy_hrtimer_on_stack(&t.timer); - - __set_current_state(TASK_RUNNING); - - return !t.task ? 0 : -EINTR; -} -EXPORT_SYMBOL_GPL(schedule_hrtimeout_range_clock); - -/** - * schedule_hrtimeout_range - sleep until timeout - * @expires: timeout value (ktime_t) - * @delta: slack in expires timeout (ktime_t) - * @mode: timer mode - * - * Make the current task sleep until the given expiry time has - * elapsed. The routine will return immediately unless - * the current task state has been set (see set_current_state()). - * - * The @delta argument gives the kernel the freedom to schedule the - * actual wakeup to a time that is both power and performance friendly - * for regular (non RT/DL) tasks. - * The kernel give the normal best effort behavior for "@expires+@delta", - * but may decide to fire the timer earlier, but no earlier than @expires. - * - * You can set the task state as follows - - * - * %TASK_UNINTERRUPTIBLE - at least @timeout time is guaranteed to - * pass before the routine returns unless the current task is explicitly - * woken up, (e.g. by wake_up_process()). - * - * %TASK_INTERRUPTIBLE - the routine may return early if a signal is - * delivered to the current task or the current task is explicitly woken - * up. - * - * The current task state is guaranteed to be TASK_RUNNING when this - * routine returns. - * - * Returns 0 when the timer has expired. If the task was woken before the - * timer expired by a signal (only possible in state TASK_INTERRUPTIBLE) or - * by an explicit wakeup, it returns -EINTR. - */ -int __sched schedule_hrtimeout_range(ktime_t *expires, u64 delta, - const enum hrtimer_mode mode) -{ - return schedule_hrtimeout_range_clock(expires, delta, mode, - CLOCK_MONOTONIC); -} -EXPORT_SYMBOL_GPL(schedule_hrtimeout_range); - -/** - * schedule_hrtimeout - sleep until timeout - * @expires: timeout value (ktime_t) - * @mode: timer mode - * - * Make the current task sleep until the given expiry time has - * elapsed. The routine will return immediately unless - * the current task state has been set (see set_current_state()). - * - * You can set the task state as follows - - * - * %TASK_UNINTERRUPTIBLE - at least @timeout time is guaranteed to - * pass before the routine returns unless the current task is explicitly - * woken up, (e.g. by wake_up_process()). - * - * %TASK_INTERRUPTIBLE - the routine may return early if a signal is - * delivered to the current task or the current task is explicitly woken - * up. - * - * The current task state is guaranteed to be TASK_RUNNING when this - * routine returns. - * - * Returns 0 when the timer has expired. If the task was woken before the - * timer expired by a signal (only possible in state TASK_INTERRUPTIBLE) or - * by an explicit wakeup, it returns -EINTR. 
- */ -int __sched schedule_hrtimeout(ktime_t *expires, - const enum hrtimer_mode mode) -{ - return schedule_hrtimeout_range(expires, 0, mode); -} -EXPORT_SYMBOL_GPL(schedule_hrtimeout); diff --git a/kernel/time/sleep_timeout.c b/kernel/time/sleep_timeout.c new file mode 100644 index 000000000000..78b2e7e30b1e --- /dev/null +++ b/kernel/time/sleep_timeout.c @@ -0,0 +1,317 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Kernel internal schedule timeout and sleeping functions + */ + +#include +#include +#include +#include +#include + +#include "tick-internal.h" + +/* + * Since schedule_timeout()'s timer is defined on the stack, it must store + * the target task on the stack as well. + */ +struct process_timer { + struct timer_list timer; + struct task_struct *task; +}; + +static void process_timeout(struct timer_list *t) +{ + struct process_timer *timeout = from_timer(timeout, t, timer); + + wake_up_process(timeout->task); +} + +/** + * schedule_timeout - sleep until timeout + * @timeout: timeout value in jiffies + * + * Make the current task sleep until @timeout jiffies have elapsed. + * The function behavior depends on the current task state + * (see also set_current_state() description): + * + * %TASK_RUNNING - the scheduler is called, but the task does not sleep + * at all. That happens because sched_submit_work() does nothing for + * tasks in %TASK_RUNNING state. + * + * %TASK_UNINTERRUPTIBLE - at least @timeout jiffies are guaranteed to + * pass before the routine returns unless the current task is explicitly + * woken up, (e.g. by wake_up_process()). + * + * %TASK_INTERRUPTIBLE - the routine may return early if a signal is + * delivered to the current task or the current task is explicitly woken + * up. + * + * The current task state is guaranteed to be %TASK_RUNNING when this + * routine returns. + * + * Specifying a @timeout value of %MAX_SCHEDULE_TIMEOUT will schedule + * the CPU away without a bound on the timeout. In this case the return + * value will be %MAX_SCHEDULE_TIMEOUT. + * + * Returns: 0 when the timer has expired otherwise the remaining time in + * jiffies will be returned. In all cases the return value is guaranteed + * to be non-negative. + */ +signed long __sched schedule_timeout(signed long timeout) +{ + struct process_timer timer; + unsigned long expire; + + switch (timeout) { + case MAX_SCHEDULE_TIMEOUT: + /* + * These two special cases are useful to be comfortable + * in the caller. Nothing more. We could take + * MAX_SCHEDULE_TIMEOUT from one of the negative value + * but I' d like to return a valid offset (>=0) to allow + * the caller to do everything it want with the retval. + */ + schedule(); + goto out; + default: + /* + * Another bit of PARANOID. Note that the retval will be + * 0 since no piece of kernel is supposed to do a check + * for a negative retval of schedule_timeout() (since it + * should never happens anyway). You just have the printk() + * that will tell you if something is gone wrong and where. + */ + if (timeout < 0) { + pr_err("%s: wrong timeout value %lx\n", __func__, timeout); + dump_stack(); + __set_current_state(TASK_RUNNING); + goto out; + } + } + + expire = timeout + jiffies; + + timer.task = current; + timer_setup_on_stack(&timer.timer, process_timeout, 0); + timer.timer.expires = expire; + add_timer(&timer.timer); + schedule(); + del_timer_sync(&timer.timer); + + /* Remove the timer from the object tracker */ + destroy_timer_on_stack(&timer.timer); + + timeout = expire - jiffies; + + out: + return timeout < 0 ? 
0 : timeout; +} +EXPORT_SYMBOL(schedule_timeout); + +/* + * We can use __set_current_state() here because schedule_timeout() calls + * schedule() unconditionally. + */ +signed long __sched schedule_timeout_interruptible(signed long timeout) +{ + __set_current_state(TASK_INTERRUPTIBLE); + return schedule_timeout(timeout); +} +EXPORT_SYMBOL(schedule_timeout_interruptible); + +signed long __sched schedule_timeout_killable(signed long timeout) +{ + __set_current_state(TASK_KILLABLE); + return schedule_timeout(timeout); +} +EXPORT_SYMBOL(schedule_timeout_killable); + +signed long __sched schedule_timeout_uninterruptible(signed long timeout) +{ + __set_current_state(TASK_UNINTERRUPTIBLE); + return schedule_timeout(timeout); +} +EXPORT_SYMBOL(schedule_timeout_uninterruptible); + +/* + * Like schedule_timeout_uninterruptible(), except this task will not contribute + * to load average. + */ +signed long __sched schedule_timeout_idle(signed long timeout) +{ + __set_current_state(TASK_IDLE); + return schedule_timeout(timeout); +} +EXPORT_SYMBOL(schedule_timeout_idle); + +/** + * schedule_hrtimeout_range_clock - sleep until timeout + * @expires: timeout value (ktime_t) + * @delta: slack in expires timeout (ktime_t) + * @mode: timer mode + * @clock_id: timer clock to be used + */ +int __sched schedule_hrtimeout_range_clock(ktime_t *expires, u64 delta, + const enum hrtimer_mode mode, clockid_t clock_id) +{ + struct hrtimer_sleeper t; + + /* + * Optimize when a zero timeout value is given. It does not + * matter whether this is an absolute or a relative time. + */ + if (expires && *expires == 0) { + __set_current_state(TASK_RUNNING); + return 0; + } + + /* + * A NULL parameter means "infinite" + */ + if (!expires) { + schedule(); + return -EINTR; + } + + hrtimer_init_sleeper_on_stack(&t, clock_id, mode); + hrtimer_set_expires_range_ns(&t.timer, *expires, delta); + hrtimer_sleeper_start_expires(&t, mode); + + if (likely(t.task)) + schedule(); + + hrtimer_cancel(&t.timer); + destroy_hrtimer_on_stack(&t.timer); + + __set_current_state(TASK_RUNNING); + + return !t.task ? 0 : -EINTR; +} +EXPORT_SYMBOL_GPL(schedule_hrtimeout_range_clock); + +/** + * schedule_hrtimeout_range - sleep until timeout + * @expires: timeout value (ktime_t) + * @delta: slack in expires timeout (ktime_t) + * @mode: timer mode + * + * Make the current task sleep until the given expiry time has + * elapsed. The routine will return immediately unless + * the current task state has been set (see set_current_state()). + * + * The @delta argument gives the kernel the freedom to schedule the + * actual wakeup to a time that is both power and performance friendly + * for regular (non RT/DL) tasks. + * The kernel give the normal best effort behavior for "@expires+@delta", + * but may decide to fire the timer earlier, but no earlier than @expires. + * + * You can set the task state as follows - + * + * %TASK_UNINTERRUPTIBLE - at least @timeout time is guaranteed to + * pass before the routine returns unless the current task is explicitly + * woken up, (e.g. by wake_up_process()). + * + * %TASK_INTERRUPTIBLE - the routine may return early if a signal is + * delivered to the current task or the current task is explicitly woken + * up. + * + * The current task state is guaranteed to be TASK_RUNNING when this + * routine returns. + * + * Returns: 0 when the timer has expired. If the task was woken before the + * timer expired by a signal (only possible in state TASK_INTERRUPTIBLE) or + * by an explicit wakeup, it returns -EINTR. 
+ */ +int __sched schedule_hrtimeout_range(ktime_t *expires, u64 delta, + const enum hrtimer_mode mode) +{ + return schedule_hrtimeout_range_clock(expires, delta, mode, + CLOCK_MONOTONIC); +} +EXPORT_SYMBOL_GPL(schedule_hrtimeout_range); + +/** + * schedule_hrtimeout - sleep until timeout + * @expires: timeout value (ktime_t) + * @mode: timer mode + * + * Make the current task sleep until the given expiry time has + * elapsed. The routine will return immediately unless + * the current task state has been set (see set_current_state()). + * + * You can set the task state as follows - + * + * %TASK_UNINTERRUPTIBLE - at least @timeout time is guaranteed to + * pass before the routine returns unless the current task is explicitly + * woken up, (e.g. by wake_up_process()). + * + * %TASK_INTERRUPTIBLE - the routine may return early if a signal is + * delivered to the current task or the current task is explicitly woken + * up. + * + * The current task state is guaranteed to be TASK_RUNNING when this + * routine returns. + * + * Returns: 0 when the timer has expired. If the task was woken before the + * timer expired by a signal (only possible in state TASK_INTERRUPTIBLE) or + * by an explicit wakeup, it returns -EINTR. + */ +int __sched schedule_hrtimeout(ktime_t *expires, const enum hrtimer_mode mode) +{ + return schedule_hrtimeout_range(expires, 0, mode); +} +EXPORT_SYMBOL_GPL(schedule_hrtimeout); + +/** + * msleep - sleep safely even with waitqueue interruptions + * @msecs: Time in milliseconds to sleep for + */ +void msleep(unsigned int msecs) +{ + unsigned long timeout = msecs_to_jiffies(msecs); + + while (timeout) + timeout = schedule_timeout_uninterruptible(timeout); +} +EXPORT_SYMBOL(msleep); + +/** + * msleep_interruptible - sleep waiting for signals + * @msecs: Time in milliseconds to sleep for + */ +unsigned long msleep_interruptible(unsigned int msecs) +{ + unsigned long timeout = msecs_to_jiffies(msecs); + + while (timeout && !signal_pending(current)) + timeout = schedule_timeout_interruptible(timeout); + return jiffies_to_msecs(timeout); +} +EXPORT_SYMBOL(msleep_interruptible); + +/** + * usleep_range_state - Sleep for an approximate time in a given state + * @min: Minimum time in usecs to sleep + * @max: Maximum time in usecs to sleep + * @state: State of the current task that will be while sleeping + * + * In non-atomic context where the exact wakeup time is flexible, use + * usleep_range_state() instead of udelay(). The sleep improves responsiveness + * by avoiding the CPU-hogging busy-wait of udelay(), and the range reduces + * power usage by allowing hrtimers to take advantage of an already- + * scheduled interrupt instead of scheduling a new one just for this sleep. 
+ */ +void __sched usleep_range_state(unsigned long min, unsigned long max, unsigned int state) +{ + ktime_t exp = ktime_add_us(ktime_get(), min); + u64 delta = (u64)(max - min) * NSEC_PER_USEC; + + for (;;) { + __set_current_state(state); + /* Do not return before the requested sleep time has elapsed */ + if (!schedule_hrtimeout_range(&exp, delta, HRTIMER_MODE_ABS)) + break; + } +} +EXPORT_SYMBOL(usleep_range_state); diff --git a/kernel/time/timer.c b/kernel/time/timer.c index 0fc9d066a7be..02355b275bab 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -37,7 +37,6 @@ #include #include #include -#include #include #include #include @@ -2526,141 +2525,6 @@ void update_process_times(int user_tick) run_posix_cpu_timers(); } -/* - * Since schedule_timeout()'s timer is defined on the stack, it must store - * the target task on the stack as well. - */ -struct process_timer { - struct timer_list timer; - struct task_struct *task; -}; - -static void process_timeout(struct timer_list *t) -{ - struct process_timer *timeout = from_timer(timeout, t, timer); - - wake_up_process(timeout->task); -} - -/** - * schedule_timeout - sleep until timeout - * @timeout: timeout value in jiffies - * - * Make the current task sleep until @timeout jiffies have elapsed. - * The function behavior depends on the current task state - * (see also set_current_state() description): - * - * %TASK_RUNNING - the scheduler is called, but the task does not sleep - * at all. That happens because sched_submit_work() does nothing for - * tasks in %TASK_RUNNING state. - * - * %TASK_UNINTERRUPTIBLE - at least @timeout jiffies are guaranteed to - * pass before the routine returns unless the current task is explicitly - * woken up, (e.g. by wake_up_process()). - * - * %TASK_INTERRUPTIBLE - the routine may return early if a signal is - * delivered to the current task or the current task is explicitly woken - * up. - * - * The current task state is guaranteed to be %TASK_RUNNING when this - * routine returns. - * - * Specifying a @timeout value of %MAX_SCHEDULE_TIMEOUT will schedule - * the CPU away without a bound on the timeout. In this case the return - * value will be %MAX_SCHEDULE_TIMEOUT. - * - * Returns 0 when the timer has expired otherwise the remaining time in - * jiffies will be returned. In all cases the return value is guaranteed - * to be non-negative. - */ -signed long __sched schedule_timeout(signed long timeout) -{ - struct process_timer timer; - unsigned long expire; - - switch (timeout) - { - case MAX_SCHEDULE_TIMEOUT: - /* - * These two special cases are useful to be comfortable - * in the caller. Nothing more. We could take - * MAX_SCHEDULE_TIMEOUT from one of the negative value - * but I' d like to return a valid offset (>=0) to allow - * the caller to do everything it want with the retval. - */ - schedule(); - goto out; - default: - /* - * Another bit of PARANOID. Note that the retval will be - * 0 since no piece of kernel is supposed to do a check - * for a negative retval of schedule_timeout() (since it - * should never happens anyway). You just have the printk() - * that will tell you if something is gone wrong and where. 
- */ - if (timeout < 0) { - printk(KERN_ERR "schedule_timeout: wrong timeout " - "value %lx\n", timeout); - dump_stack(); - __set_current_state(TASK_RUNNING); - goto out; - } - } - - expire = timeout + jiffies; - - timer.task = current; - timer_setup_on_stack(&timer.timer, process_timeout, 0); - __mod_timer(&timer.timer, expire, MOD_TIMER_NOTPENDING); - schedule(); - del_timer_sync(&timer.timer); - - /* Remove the timer from the object tracker */ - destroy_timer_on_stack(&timer.timer); - - timeout = expire - jiffies; - - out: - return timeout < 0 ? 0 : timeout; -} -EXPORT_SYMBOL(schedule_timeout); - -/* - * We can use __set_current_state() here because schedule_timeout() calls - * schedule() unconditionally. - */ -signed long __sched schedule_timeout_interruptible(signed long timeout) -{ - __set_current_state(TASK_INTERRUPTIBLE); - return schedule_timeout(timeout); -} -EXPORT_SYMBOL(schedule_timeout_interruptible); - -signed long __sched schedule_timeout_killable(signed long timeout) -{ - __set_current_state(TASK_KILLABLE); - return schedule_timeout(timeout); -} -EXPORT_SYMBOL(schedule_timeout_killable); - -signed long __sched schedule_timeout_uninterruptible(signed long timeout) -{ - __set_current_state(TASK_UNINTERRUPTIBLE); - return schedule_timeout(timeout); -} -EXPORT_SYMBOL(schedule_timeout_uninterruptible); - -/* - * Like schedule_timeout_uninterruptible(), except this task will not contribute - * to load average. - */ -signed long __sched schedule_timeout_idle(signed long timeout) -{ - __set_current_state(TASK_IDLE); - return schedule_timeout(timeout); -} -EXPORT_SYMBOL(schedule_timeout_idle); - #ifdef CONFIG_HOTPLUG_CPU static void migrate_timer_list(struct timer_base *new_base, struct hlist_head *head) { @@ -2757,59 +2621,3 @@ void __init init_timers(void) posix_cputimers_init_work(); open_softirq(TIMER_SOFTIRQ, run_timer_softirq); } - -/** - * msleep - sleep safely even with waitqueue interruptions - * @msecs: Time in milliseconds to sleep for - */ -void msleep(unsigned int msecs) -{ - unsigned long timeout = msecs_to_jiffies(msecs); - - while (timeout) - timeout = schedule_timeout_uninterruptible(timeout); -} - -EXPORT_SYMBOL(msleep); - -/** - * msleep_interruptible - sleep waiting for signals - * @msecs: Time in milliseconds to sleep for - */ -unsigned long msleep_interruptible(unsigned int msecs) -{ - unsigned long timeout = msecs_to_jiffies(msecs); - - while (timeout && !signal_pending(current)) - timeout = schedule_timeout_interruptible(timeout); - return jiffies_to_msecs(timeout); -} - -EXPORT_SYMBOL(msleep_interruptible); - -/** - * usleep_range_state - Sleep for an approximate time in a given state - * @min: Minimum time in usecs to sleep - * @max: Maximum time in usecs to sleep - * @state: State of the current task that will be while sleeping - * - * In non-atomic context where the exact wakeup time is flexible, use - * usleep_range_state() instead of udelay(). The sleep improves responsiveness - * by avoiding the CPU-hogging busy-wait of udelay(), and the range reduces - * power usage by allowing hrtimers to take advantage of an already- - * scheduled interrupt instead of scheduling a new one just for this sleep. 
- */ -void __sched usleep_range_state(unsigned long min, unsigned long max, - unsigned int state) -{ - ktime_t exp = ktime_add_us(ktime_get(), min); - u64 delta = (u64)(max - min) * NSEC_PER_USEC; - - for (;;) { - __set_current_state(state); - /* Do not return before the requested sleep time has elapsed */ - if (!schedule_hrtimeout_range(&exp, delta, HRTIMER_MODE_ABS)) - break; - } -} -EXPORT_SYMBOL(usleep_range_state); -- 2.51.0 From cf5b6ef0c36be3489972966b8a18aa5c48559661 Mon Sep 17 00:00:00 2001 From: Anna-Maria Behnsen Date: Mon, 14 Oct 2024 10:22:20 +0200 Subject: [PATCH 13/16] timers: Update schedule_[hr]timeout*() related function descriptions schedule_timeout*() functions do not have proper kernel-doc formatted function descriptions. schedule_hrtimeout() and schedule_hrtimeout_range() have a almost identical description. Add missing function descriptions. Remove copy of function description and add a pointer to the existing description instead. Signed-off-by: Anna-Maria Behnsen Signed-off-by: Thomas Gleixner Acked-by: Frederic Weisbecker Link: https://lore.kernel.org/all/20241014-devel-anna-maria-b4-timers-flseep-v3-3-dc8b907cb62f@linutronix.de --- kernel/time/sleep_timeout.c | 66 +++++++++++++++++++++++-------------- 1 file changed, 41 insertions(+), 25 deletions(-) diff --git a/kernel/time/sleep_timeout.c b/kernel/time/sleep_timeout.c index 78b2e7e30b1e..560d17c30aa5 100644 --- a/kernel/time/sleep_timeout.c +++ b/kernel/time/sleep_timeout.c @@ -110,8 +110,17 @@ signed long __sched schedule_timeout(signed long timeout) EXPORT_SYMBOL(schedule_timeout); /* - * We can use __set_current_state() here because schedule_timeout() calls - * schedule() unconditionally. + * __set_current_state() can be used in schedule_timeout_*() functions, because + * schedule_timeout() calls schedule() unconditionally. + */ + +/** + * schedule_timeout_interruptible - sleep until timeout (interruptible) + * @timeout: timeout value in jiffies + * + * See schedule_timeout() for details. + * + * Task state is set to TASK_INTERRUPTIBLE before starting the timeout. */ signed long __sched schedule_timeout_interruptible(signed long timeout) { @@ -120,6 +129,14 @@ signed long __sched schedule_timeout_interruptible(signed long timeout) } EXPORT_SYMBOL(schedule_timeout_interruptible); +/** + * schedule_timeout_killable - sleep until timeout (killable) + * @timeout: timeout value in jiffies + * + * See schedule_timeout() for details. + * + * Task state is set to TASK_KILLABLE before starting the timeout. + */ signed long __sched schedule_timeout_killable(signed long timeout) { __set_current_state(TASK_KILLABLE); @@ -127,6 +144,14 @@ signed long __sched schedule_timeout_killable(signed long timeout) } EXPORT_SYMBOL(schedule_timeout_killable); +/** + * schedule_timeout_uninterruptible - sleep until timeout (uninterruptible) + * @timeout: timeout value in jiffies + * + * See schedule_timeout() for details. + * + * Task state is set to TASK_UNINTERRUPTIBLE before starting the timeout. + */ signed long __sched schedule_timeout_uninterruptible(signed long timeout) { __set_current_state(TASK_UNINTERRUPTIBLE); @@ -134,9 +159,15 @@ signed long __sched schedule_timeout_uninterruptible(signed long timeout) } EXPORT_SYMBOL(schedule_timeout_uninterruptible); -/* - * Like schedule_timeout_uninterruptible(), except this task will not contribute - * to load average. +/** + * schedule_timeout_idle - sleep until timeout (idle) + * @timeout: timeout value in jiffies + * + * See schedule_timeout() for details. 
+ * + * Task state is set to TASK_IDLE before starting the timeout. It is similar to + * schedule_timeout_uninterruptible(), except this task will not contribute to + * load average. */ signed long __sched schedule_timeout_idle(signed long timeout) { @@ -151,6 +182,9 @@ EXPORT_SYMBOL(schedule_timeout_idle); * @delta: slack in expires timeout (ktime_t) * @mode: timer mode * @clock_id: timer clock to be used + * + * Details are explained in schedule_hrtimeout_range() function description as + * this function is commonly used. */ int __sched schedule_hrtimeout_range_clock(ktime_t *expires, u64 delta, const enum hrtimer_mode mode, clockid_t clock_id) @@ -236,26 +270,8 @@ EXPORT_SYMBOL_GPL(schedule_hrtimeout_range); * @expires: timeout value (ktime_t) * @mode: timer mode * - * Make the current task sleep until the given expiry time has - * elapsed. The routine will return immediately unless - * the current task state has been set (see set_current_state()). - * - * You can set the task state as follows - - * - * %TASK_UNINTERRUPTIBLE - at least @timeout time is guaranteed to - * pass before the routine returns unless the current task is explicitly - * woken up, (e.g. by wake_up_process()). - * - * %TASK_INTERRUPTIBLE - the routine may return early if a signal is - * delivered to the current task or the current task is explicitly woken - * up. - * - * The current task state is guaranteed to be TASK_RUNNING when this - * routine returns. - * - * Returns: 0 when the timer has expired. If the task was woken before the - * timer expired by a signal (only possible in state TASK_INTERRUPTIBLE) or - * by an explicit wakeup, it returns -EINTR. + * See schedule_hrtimeout_range() for details. @delta argument of + * schedule_hrtimeout_range() is set to 0 and has therefore no impact. */ int __sched schedule_hrtimeout(ktime_t *expires, const enum hrtimer_mode mode) { -- 2.51.0 From 102f085d84607462234ac60f6027973b45a9bde2 Mon Sep 17 00:00:00 2001 From: Anna-Maria Behnsen Date: Mon, 14 Oct 2024 10:22:21 +0200 Subject: [PATCH 14/16] timers: Rename usleep_idle_range() to usleep_range_idle() usleep_idle_range() is a variant of usleep_range(). Both are using usleep_range_state() as a base. To be able to find all the related functions in one go, rename it usleep_idle_range() to usleep_range_idle(). No functional change. 
Signed-off-by: Anna-Maria Behnsen Signed-off-by: Thomas Gleixner Reviewed-by: Frederic Weisbecker Reviewed-by: SeongJae Park Link: https://lore.kernel.org/all/20241014-devel-anna-maria-b4-timers-flseep-v3-4-dc8b907cb62f@linutronix.de --- include/linux/delay.h | 2 +- mm/damon/core.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/include/linux/delay.h b/include/linux/delay.h index ff9cda975e30..2bc586aa2068 100644 --- a/include/linux/delay.h +++ b/include/linux/delay.h @@ -68,7 +68,7 @@ static inline void usleep_range(unsigned long min, unsigned long max) usleep_range_state(min, max, TASK_UNINTERRUPTIBLE); } -static inline void usleep_idle_range(unsigned long min, unsigned long max) +static inline void usleep_range_idle(unsigned long min, unsigned long max) { usleep_range_state(min, max, TASK_IDLE); } diff --git a/mm/damon/core.c b/mm/damon/core.c index a83f3b736d51..c725c78b43f0 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -1896,7 +1896,7 @@ static void kdamond_usleep(unsigned long usecs) if (usecs > 20 * USEC_PER_MSEC) schedule_timeout_idle(usecs_to_jiffies(usecs)); else - usleep_idle_range(usecs, usecs + 1); + usleep_range_idle(usecs, usecs + 1); } /* Returns negative error code if it's not activated but should return */ -- 2.51.0 From f36eb171410839325fff9cd9b7b7400f7e606962 Mon Sep 17 00:00:00 2001 From: Anna-Maria Behnsen Date: Mon, 14 Oct 2024 10:22:22 +0200 Subject: [PATCH 15/16] timers: Update function descriptions of sleep/delay related functions A lot of commonly used functions for inserting a sleep or delay lack a proper function description. Add function descriptions to all of them to have important information in a central place close to the code. No functional change. Signed-off-by: Anna-Maria Behnsen Signed-off-by: Thomas Gleixner Reviewed-by: Frederic Weisbecker Link: https://lore.kernel.org/all/20241014-devel-anna-maria-b4-timers-flseep-v3-5-dc8b907cb62f@linutronix.de --- include/asm-generic/delay.h | 41 +++++++++++++++++++++++++--- include/linux/delay.h | 48 ++++++++++++++++++++++++--------- kernel/time/sleep_timeout.c | 53 ++++++++++++++++++++++++++++++++----- 3 files changed, 120 insertions(+), 22 deletions(-) diff --git a/include/asm-generic/delay.h b/include/asm-generic/delay.h index e448ac61430c..a8cee41cc51b 100644 --- a/include/asm-generic/delay.h +++ b/include/asm-generic/delay.h @@ -12,11 +12,39 @@ extern void __const_udelay(unsigned long xloops); extern void __delay(unsigned long loops); /* - * The weird n/20000 thing suppresses a "comparison is always false due to - * limited range of data type" warning with non-const 8-bit arguments. + * Implementation details: + * + * * The weird n/20000 thing suppresses a "comparison is always false due to + * limited range of data type" warning with non-const 8-bit arguments. + * * 0x10c7 is 2**32 / 1000000 (rounded up) -> udelay + * * 0x5 is 2**32 / 1000000000 (rounded up) -> ndelay */ -/* 0x10c7 is 2**32 / 1000000 (rounded up) */ +/** + * udelay - Inserting a delay based on microseconds with busy waiting + * @usec: requested delay in microseconds + * + * When delaying in an atomic context ndelay(), udelay() and mdelay() are the + * only valid variants of delaying/sleeping to go with. + * + * When inserting delays in non atomic context which are shorter than the time + * which is required to queue e.g. an hrtimer and to enter then the scheduler, + * it is also valuable to use udelay(). But it is not simple to specify a + * generic threshold for this which will fit for all systems. 
An approximation + * is a threshold for all delays up to 10 microseconds. + * + * When having a delay which is larger than the architecture specific + * %MAX_UDELAY_MS value, please make sure mdelay() is used. Otherwise a overflow + * risk is given. + * + * Please note that ndelay(), udelay() and mdelay() may return early for several + * reasons (https://lists.openwall.net/linux-kernel/2011/01/09/56): + * + * #. computed loops_per_jiffy too low (due to the time taken to execute the + * timer interrupt.) + * #. cache behaviour affecting the time it takes to execute the loop function. + * #. CPU clock rate changes. + */ #define udelay(n) \ ({ \ if (__builtin_constant_p(n)) { \ @@ -29,7 +57,12 @@ extern void __delay(unsigned long loops); } \ }) -/* 0x5 is 2**32 / 1000000000 (rounded up) */ +/** + * ndelay - Inserting a delay based on nanoseconds with busy waiting + * @nsec: requested delay in nanoseconds + * + * See udelay() for basic information about ndelay() and it's variants. + */ #define ndelay(n) \ ({ \ if (__builtin_constant_p(n)) { \ diff --git a/include/linux/delay.h b/include/linux/delay.h index 2bc586aa2068..2de509e4adce 100644 --- a/include/linux/delay.h +++ b/include/linux/delay.h @@ -6,17 +6,7 @@ * Copyright (C) 1993 Linus Torvalds * * Delay routines, using a pre-computed "loops_per_jiffy" value. - * - * Please note that ndelay(), udelay() and mdelay() may return early for - * several reasons: - * 1. computed loops_per_jiffy too low (due to the time taken to - * execute the timer interrupt.) - * 2. cache behaviour affecting the time it takes to execute the - * loop function. - * 3. CPU clock rate changes. - * - * Please see this thread: - * https://lists.openwall.net/linux-kernel/2011/01/09/56 + * Sleep routines using timer list timers or hrtimers. */ #include @@ -35,12 +25,21 @@ extern unsigned long loops_per_jiffy; * The 2nd mdelay() definition ensures GCC will optimize away the * while loop for the common cases where n <= MAX_UDELAY_MS -- Paul G. */ - #ifndef MAX_UDELAY_MS #define MAX_UDELAY_MS 5 #endif #ifndef mdelay +/** + * mdelay - Inserting a delay based on milliseconds with busy waiting + * @n: requested delay in milliseconds + * + * See udelay() for basic information about mdelay() and it's variants. + * + * Please double check, whether mdelay() is the right way to go or whether a + * refactoring of the code is the better variant to be able to use msleep() + * instead. + */ #define mdelay(n) (\ (__builtin_constant_p(n) && (n)<=MAX_UDELAY_MS) ? udelay((n)*1000) : \ ({unsigned long __ms=(n); while (__ms--) udelay(1000);})) @@ -63,16 +62,41 @@ unsigned long msleep_interruptible(unsigned int msecs); void usleep_range_state(unsigned long min, unsigned long max, unsigned int state); +/** + * usleep_range - Sleep for an approximate time + * @min: Minimum time in microseconds to sleep + * @max: Maximum time in microseconds to sleep + * + * For basic information please refere to usleep_range_state(). + * + * The task will be in the state TASK_UNINTERRUPTIBLE during the sleep. + */ static inline void usleep_range(unsigned long min, unsigned long max) { usleep_range_state(min, max, TASK_UNINTERRUPTIBLE); } +/** + * usleep_range_idle - Sleep for an approximate time with idle time accounting + * @min: Minimum time in microseconds to sleep + * @max: Maximum time in microseconds to sleep + * + * For basic information please refere to usleep_range_state(). + * + * The sleeping task has the state TASK_IDLE during the sleep to prevent + * contribution to the load avarage. 
+ */ static inline void usleep_range_idle(unsigned long min, unsigned long max) { usleep_range_state(min, max, TASK_IDLE); } +/** + * ssleep - wrapper for seconds around msleep + * @seconds: Requested sleep duration in seconds + * + * Please refere to msleep() for detailed information. + */ static inline void ssleep(unsigned int seconds) { msleep(seconds * 1000); diff --git a/kernel/time/sleep_timeout.c b/kernel/time/sleep_timeout.c index 560d17c30aa5..f3f246e4c8d1 100644 --- a/kernel/time/sleep_timeout.c +++ b/kernel/time/sleep_timeout.c @@ -281,7 +281,34 @@ EXPORT_SYMBOL_GPL(schedule_hrtimeout); /** * msleep - sleep safely even with waitqueue interruptions - * @msecs: Time in milliseconds to sleep for + * @msecs: Requested sleep duration in milliseconds + * + * msleep() uses jiffy based timeouts for the sleep duration. Because of the + * design of the timer wheel, the maximum additional percentage delay (slack) is + * 12.5%. This is only valid for timers which will end up in level 1 or a higher + * level of the timer wheel. For explanation of those 12.5% please check the + * detailed description about the basics of the timer wheel. + * + * The slack of timers which will end up in level 0 depends on sleep duration + * (msecs) and HZ configuration and can be calculated in the following way (with + * the timer wheel design restriction that the slack is not less than 12.5%): + * + * ``slack = MSECS_PER_TICK / msecs`` + * + * When the allowed slack of the callsite is known, the calculation could be + * turned around to find the minimal allowed sleep duration to meet the + * constraints. For example: + * + * * ``HZ=1000`` with ``slack=25%``: ``MSECS_PER_TICK / slack = 1 / (1/4) = 4``: + * all sleep durations greater or equal 4ms will meet the constraints. + * * ``HZ=1000`` with ``slack=12.5%``: ``MSECS_PER_TICK / slack = 1 / (1/8) = 8``: + * all sleep durations greater or equal 8ms will meet the constraints. + * * ``HZ=250`` with ``slack=25%``: ``MSECS_PER_TICK / slack = 4 / (1/4) = 16``: + * all sleep durations greater or equal 16ms will meet the constraints. + * * ``HZ=250`` with ``slack=12.5%``: ``MSECS_PER_TICK / slack = 4 / (1/8) = 32``: + * all sleep durations greater or equal 32ms will meet the constraints. + * + * See also the signal aware variant msleep_interruptible(). */ void msleep(unsigned int msecs) { @@ -294,7 +321,15 @@ EXPORT_SYMBOL(msleep); /** * msleep_interruptible - sleep waiting for signals - * @msecs: Time in milliseconds to sleep for + * @msecs: Requested sleep duration in milliseconds + * + * See msleep() for some basic information. + * + * The difference between msleep() and msleep_interruptible() is that the sleep + * could be interrupted by a signal delivery and then returns early. + * + * Returns: The remaining time of the sleep duration transformed to msecs (see + * schedule_timeout() for details). */ unsigned long msleep_interruptible(unsigned int msecs) { @@ -312,11 +347,17 @@ EXPORT_SYMBOL(msleep_interruptible); * @max: Maximum time in usecs to sleep * @state: State of the current task that will be while sleeping * + * usleep_range_state() sleeps at least for the minimum specified time but not + * longer than the maximum specified amount of time. The range might reduce + * power usage by allowing hrtimers to coalesce an already scheduled interrupt + * with this hrtimer. In the worst case, an interrupt is scheduled for the upper + * bound. + * + * The sleeping task is set to the specified state before starting the sleep. 
+ * * In non-atomic context where the exact wakeup time is flexible, use - * usleep_range_state() instead of udelay(). The sleep improves responsiveness - * by avoiding the CPU-hogging busy-wait of udelay(), and the range reduces - * power usage by allowing hrtimers to take advantage of an already- - * scheduled interrupt instead of scheduling a new one just for this sleep. + * usleep_range() or its variants instead of udelay(). The sleep improves + * responsiveness by avoiding the CPU-hogging busy-wait of udelay(). */ void __sched usleep_range_state(unsigned long min, unsigned long max, unsigned int state) { -- 2.51.0 From 19e2d91d8cb1f333adf04731f2788ff6ca06cebd Mon Sep 17 00:00:00 2001 From: Anna-Maria Behnsen Date: Mon, 14 Oct 2024 10:22:23 +0200 Subject: [PATCH 16/16] delay: Rework udelay and ndelay udelay() as well as ndelay() are defines and no functions and are using constants to be able to transform a sleep time into loops and to prevent too long udelays/ndelays. There was a compiler error with non-const 8 bit arguments which was fixed by commit a87e553fabe8 ("asm-generic: delay.h fix udelay and ndelay for 8 bit args"). When using a function, the non-const 8 bit argument is type casted and the problem would be gone. Transform udelay() and ndelay() into proper functions, remove the no longer and confusing division, add defines for the magic values and add some explanations as well. Suggested-by: Thomas Gleixner Signed-off-by: Anna-Maria Behnsen Signed-off-by: Thomas Gleixner Reviewed-by: Frederic Weisbecker Link: https://lore.kernel.org/all/20241014-devel-anna-maria-b4-timers-flseep-v3-6-dc8b907cb62f@linutronix.de --- include/asm-generic/delay.h | 65 +++++++++++++++++++++---------------- 1 file changed, 37 insertions(+), 28 deletions(-) diff --git a/include/asm-generic/delay.h b/include/asm-generic/delay.h index a8cee41cc51b..76cf237b6e4c 100644 --- a/include/asm-generic/delay.h +++ b/include/asm-generic/delay.h @@ -2,6 +2,9 @@ #ifndef __ASM_GENERIC_DELAY_H #define __ASM_GENERIC_DELAY_H +#include +#include + /* Undefined functions to get compile-time errors */ extern void __bad_udelay(void); extern void __bad_ndelay(void); @@ -12,13 +15,18 @@ extern void __const_udelay(unsigned long xloops); extern void __delay(unsigned long loops); /* - * Implementation details: - * - * * The weird n/20000 thing suppresses a "comparison is always false due to - * limited range of data type" warning with non-const 8-bit arguments. - * * 0x10c7 is 2**32 / 1000000 (rounded up) -> udelay - * * 0x5 is 2**32 / 1000000000 (rounded up) -> ndelay + * The microseconds/nanosecond delay multiplicators are used to convert a + * constant microseconds/nanoseconds value to a value which can be used by the + * architectures specific implementation to transform it into loops. + */ +#define UDELAY_CONST_MULT ((unsigned long)DIV_ROUND_UP(1ULL << 32, USEC_PER_SEC)) +#define NDELAY_CONST_MULT ((unsigned long)DIV_ROUND_UP(1ULL << 32, NSEC_PER_SEC)) + +/* + * The maximum constant udelay/ndelay value picked out of thin air to prevent + * too long constant udelays/ndelays. */ +#define DELAY_CONST_MAX 20000 /** * udelay - Inserting a delay based on microseconds with busy waiting @@ -45,17 +53,17 @@ extern void __delay(unsigned long loops); * #. cache behaviour affecting the time it takes to execute the loop function. * #. CPU clock rate changes. 
 */
-#define udelay(n)							\
-	({								\
-		if (__builtin_constant_p(n)) {				\
-			if ((n) / 20000 >= 1)				\
-				__bad_udelay();				\
-			else						\
-				__const_udelay((n) * 0x10c7ul);		\
-		} else {						\
-			__udelay(n);					\
-		}							\
-	})
+static __always_inline void udelay(unsigned long usec)
+{
+	if (__builtin_constant_p(usec)) {
+		if (usec >= DELAY_CONST_MAX)
+			__bad_udelay();
+		else
+			__const_udelay(usec * UDELAY_CONST_MULT);
+	} else {
+		__udelay(usec);
+	}
+}
 
 /**
  * ndelay - Inserting a delay based on nanoseconds with busy waiting
@@ -63,16 +71,17 @@ extern void __delay(unsigned long loops);
  *
  * See udelay() for basic information about ndelay() and it's variants.
  */
-#define ndelay(n)							\
-	({								\
-		if (__builtin_constant_p(n)) {				\
-			if ((n) / 20000 >= 1)				\
-				__bad_ndelay();				\
-			else						\
-				__const_udelay((n) * 5ul);		\
-		} else {						\
-			__ndelay(n);					\
-		}							\
-	})
+static __always_inline void ndelay(unsigned long nsec)
+{
+	if (__builtin_constant_p(nsec)) {
+		if (nsec >= DELAY_CONST_MAX)
+			__bad_ndelay();
+		else
+			__const_udelay(nsec * NDELAY_CONST_MULT);
+	} else {
+		__ndelay(nsec);
+	}
+}
+#define ndelay(x) ndelay(x)
 
 #endif /* __ASM_GENERIC_DELAY_H */
-- 2.51.0
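
As a rough illustration of the selection guidance in the udelay()/usleep_range()/msleep() descriptions above (busy-wait only in atomic context or for delays up to roughly 10 microseconds, hrtimer based sleeping with a slack range for short non-atomic waits, and jiffy based msleep() once timer wheel slack is acceptable), a hypothetical helper might look like the sketch below. The function name, the atomic_ctx parameter and the 20 ms cut-off follow the kdamond_usleep() pattern touched by the rename patch above; they are illustrative and not part of any patch.

/* Illustrative sketch only, not taken from the patches above. */
#include <linux/delay.h>
#include <linux/time64.h>
#include <linux/types.h>

static void example_wait_us(unsigned long usecs, bool atomic_ctx)
{
	if (atomic_ctx || usecs <= 10) {
		/*
		 * Busy wait: the only option in atomic context, and cheaper
		 * than arming an hrtimer for very short delays.
		 */
		udelay(usecs);
	} else if (usecs < 20 * USEC_PER_MSEC) {
		/*
		 * hrtimer based sleep with a slack window, so the wakeup can
		 * be coalesced with an already scheduled interrupt.
		 */
		usleep_range(usecs, usecs + usecs / 4);
	} else {
		/* Long waits: timer wheel slack (up to ~12.5%) is acceptable. */
		msleep(usecs / USEC_PER_MSEC);
	}
}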
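
Similarly, the schedule_hrtimeout_range() description above requires the caller to set the task state before the call and to pass the expiry by pointer. A minimal, hypothetical usage could look like the following; the 2 ms expiry and 500 us slack are made-up values chosen only for the example.

/* Illustrative sketch only: interruptible sleep of about 2 ms. */
#include <linux/hrtimer.h>
#include <linux/ktime.h>
#include <linux/sched.h>

static int example_hrtimeout_sleep(void)
{
	ktime_t expires = ktime_add_ms(ktime_get(), 2);

	set_current_state(TASK_INTERRUPTIBLE);
	/* Returns 0 on timer expiry, -EINTR on signal delivery or explicit wakeup. */
	return schedule_hrtimeout_range(&expires, 500 * NSEC_PER_USEC,
					HRTIMER_MODE_ABS);
}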