From ef0245582e5bccd8b4c480a58bd4da91ee276397 Mon Sep 17 00:00:00 2001 From: Anna-Maria Behnsen Date: Mon, 14 Oct 2024 10:22:25 +0200 Subject: [PATCH 01/16] mm/damon/core: Use generic upper bound recommondation for usleep_range() The upper bound for usleep_range_idle() was taken from the outdated documentation. As a recommondation for the upper bound of usleep_range() depends on HZ configuration it is not possible to hard code it. Use the define "USLEEP_RANGE_UPPER_BOUND" instead. Signed-off-by: Anna-Maria Behnsen Signed-off-by: Thomas Gleixner Reviewed-by: SeongJae Park Reviewed-by: Frederic Weisbecker Link: https://lore.kernel.org/all/20241014-devel-anna-maria-b4-timers-flseep-v3-8-dc8b907cb62f@linutronix.de --- mm/damon/core.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/mm/damon/core.c b/mm/damon/core.c index c725c78b43f0..79efd8089d6c 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -1892,8 +1892,7 @@ static unsigned long damos_wmark_wait_us(struct damos *scheme) static void kdamond_usleep(unsigned long usecs) { - /* See Documentation/timers/timers-howto.rst for the thresholds */ - if (usecs > 20 * USEC_PER_MSEC) + if (usecs >= USLEEP_RANGE_UPPER_BOUND) schedule_timeout_idle(usecs_to_jiffies(usecs)); else usleep_range_idle(usecs, usecs + 1); -- 2.51.0 From 6279abf16a014474fba3de2e28b6ede871141cde Mon Sep 17 00:00:00 2001 From: Anna-Maria Behnsen Date: Mon, 14 Oct 2024 10:22:26 +0200 Subject: [PATCH 02/16] timers: Add a warning to usleep_range_state() for wrong order of arguments There is a warning in checkpatch script that triggers, when min and max arguments of usleep_range_state() are in reverse order. This check does only cover callsites which uses constants. Add this check into the code as a WARN_ON_ONCE() to also cover callsites not using constants and fix the mis-usage by resetting the delta to 0. Signed-off-by: Anna-Maria Behnsen Signed-off-by: Thomas Gleixner Reviewed-by: Frederic Weisbecker Link: https://lore.kernel.org/all/20241014-devel-anna-maria-b4-timers-flseep-v3-9-dc8b907cb62f@linutronix.de --- kernel/time/sleep_timeout.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/kernel/time/sleep_timeout.c b/kernel/time/sleep_timeout.c index f3f246e4c8d1..3054e5232d20 100644 --- a/kernel/time/sleep_timeout.c +++ b/kernel/time/sleep_timeout.c @@ -364,6 +364,9 @@ void __sched usleep_range_state(unsigned long min, unsigned long max, unsigned i ktime_t exp = ktime_add_us(ktime_get(), min); u64 delta = (u64)(max - min) * NSEC_PER_USEC; + if (WARN_ON_ONCE(max < min)) + delta = 0; + for (;;) { __set_current_state(state); /* Do not return before the requested sleep time has elapsed */ -- 2.51.0 From 6534086aa684248f779944a2ac9253d6d637eec6 Mon Sep 17 00:00:00 2001 From: Anna-Maria Behnsen Date: Mon, 14 Oct 2024 10:22:27 +0200 Subject: [PATCH 03/16] checkpatch: Remove links to outdated documentation checkpatch.pl checks for several things related to sleep and delay functions. In all warnings the outdated documentation is referenced. Also in checkpatch kernel documentation the outdated documentation is referenced. Replace the links to the outdated documentation with links to the function description. Note: Update of the outdated checkpatch checks is done in a second step. Signed-off-by: Anna-Maria Behnsen Signed-off-by: Thomas Gleixner Reviewed-by: Frederic Weisbecker Link: https://lore.kernel.org/all/20241014-devel-anna-maria-b4-timers-flseep-v3-10-dc8b907cb62f@linutronix.de --- Documentation/dev-tools/checkpatch.rst | 2 -- scripts/checkpatch.pl | 10 +++++----- 2 files changed, 5 insertions(+), 7 deletions(-) diff --git a/Documentation/dev-tools/checkpatch.rst b/Documentation/dev-tools/checkpatch.rst index a9fac978a525..abb3ff682076 100644 --- a/Documentation/dev-tools/checkpatch.rst +++ b/Documentation/dev-tools/checkpatch.rst @@ -470,8 +470,6 @@ API usage usleep_range() should be preferred over udelay(). The proper way of using usleep_range() is mentioned in the kernel docs. - See: https://www.kernel.org/doc/html/latest/timers/timers-howto.html#delays-information-on-the-various-kernel-delay-sleep-mechanisms - Comments -------- diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index 4427572b2477..98790fe5115d 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl @@ -6597,11 +6597,11 @@ sub process { # ignore udelay's < 10, however if (! ($delay < 10) ) { CHK("USLEEP_RANGE", - "usleep_range is preferred over udelay; see Documentation/timers/timers-howto.rst\n" . $herecurr); + "usleep_range is preferred over udelay; see function description of usleep_range() and udelay().\n" . $herecurr); } if ($delay > 2000) { WARN("LONG_UDELAY", - "long udelay - prefer mdelay; see arch/arm/include/asm/delay.h\n" . $herecurr); + "long udelay - prefer mdelay; see function description of mdelay().\n" . $herecurr); } } @@ -6609,7 +6609,7 @@ sub process { if ($line =~ /\bmsleep\s*\((\d+)\);/) { if ($1 < 20) { WARN("MSLEEP", - "msleep < 20ms can sleep for up to 20ms; see Documentation/timers/timers-howto.rst\n" . $herecurr); + "msleep < 20ms can sleep for up to 20ms; see function description of msleep().\n" . $herecurr); } } @@ -7077,11 +7077,11 @@ sub process { my $max = $7; if ($min eq $max) { WARN("USLEEP_RANGE", - "usleep_range should not use min == max args; see Documentation/timers/timers-howto.rst\n" . "$here\n$stat\n"); + "usleep_range should not use min == max args; see function description of usleep_range().\n" . "$here\n$stat\n"); } elsif ($min =~ /^\d+$/ && $max =~ /^\d+$/ && $min > $max) { WARN("USLEEP_RANGE", - "usleep_range args reversed, use min then max; see Documentation/timers/timers-howto.rst\n" . "$here\n$stat\n"); + "usleep_range args reversed, use min then max; see function description of usleep_range().\n" . "$here\n$stat\n"); } } -- 2.51.0 From 89124747f096fc0fe44be0162c7b4fb3271739e8 Mon Sep 17 00:00:00 2001 From: Anna-Maria Behnsen Date: Mon, 14 Oct 2024 10:22:29 +0200 Subject: [PATCH 04/16] iopoll/regmap/phy/snd: Fix comment referencing outdated timer documentation Function descriptions in iopoll.h, regmap.h, phy.h and sound/soc/sof/ops.h copied all the same outdated documentation about sleep/delay function limitations. In those comments, the generic (and still outdated) timer documentation file is referenced. As proper function descriptions for used delay and sleep functions are in place, simply update the descriptions to reference to them. While at it fix missing colon after "Returns" in function description and move return value description to the end of the function description. Signed-off-by: Anna-Maria Behnsen Signed-off-by: Thomas Gleixner Reviewed-by: Andrew Lunn # for phy.h Reviewed-by: Frederic Weisbecker Link: https://lore.kernel.org/all/20241014-devel-anna-maria-b4-timers-flseep-v3-12-dc8b907cb62f@linutronix.de --- include/linux/iopoll.h | 52 +++++++++++++++++++++--------------------- include/linux/phy.h | 9 ++++---- include/linux/regmap.h | 38 +++++++++++++++--------------- sound/soc/sof/ops.h | 8 +++---- 4 files changed, 54 insertions(+), 53 deletions(-) diff --git a/include/linux/iopoll.h b/include/linux/iopoll.h index 19a7b00baff4..91324c331a4b 100644 --- a/include/linux/iopoll.h +++ b/include/linux/iopoll.h @@ -19,19 +19,19 @@ * @op: accessor function (takes @args as its arguments) * @val: Variable to read the value into * @cond: Break condition (usually involving @val) - * @sleep_us: Maximum time to sleep between reads in us (0 - * tight-loops). Should be less than ~20ms since usleep_range - * is used (see Documentation/timers/timers-howto.rst). + * @sleep_us: Maximum time to sleep between reads in us (0 tight-loops). Please + * read usleep_range() function description for details and + * limitations. * @timeout_us: Timeout in us, 0 means never timeout * @sleep_before_read: if it is true, sleep @sleep_us before read. * @args: arguments for @op poll * - * Returns 0 on success and -ETIMEDOUT upon a timeout. In either - * case, the last read value at @args is stored in @val. Must not - * be called from atomic context if sleep_us or timeout_us are used. - * * When available, you'll probably want to use one of the specialized * macros defined below rather than this macro directly. + * + * Returns: 0 on success and -ETIMEDOUT upon a timeout. In either + * case, the last read value at @args is stored in @val. Must not + * be called from atomic context if sleep_us or timeout_us are used. */ #define read_poll_timeout(op, val, cond, sleep_us, timeout_us, \ sleep_before_read, args...) \ @@ -64,22 +64,22 @@ * @op: accessor function (takes @args as its arguments) * @val: Variable to read the value into * @cond: Break condition (usually involving @val) - * @delay_us: Time to udelay between reads in us (0 tight-loops). Should - * be less than ~10us since udelay is used (see - * Documentation/timers/timers-howto.rst). + * @delay_us: Time to udelay between reads in us (0 tight-loops). Please + * read udelay() function description for details and + * limitations. * @timeout_us: Timeout in us, 0 means never timeout * @delay_before_read: if it is true, delay @delay_us before read. * @args: arguments for @op poll * - * Returns 0 on success and -ETIMEDOUT upon a timeout. In either - * case, the last read value at @args is stored in @val. - * * This macro does not rely on timekeeping. Hence it is safe to call even when * timekeeping is suspended, at the expense of an underestimation of wall clock * time, which is rather minimal with a non-zero delay_us. * * When available, you'll probably want to use one of the specialized * macros defined below rather than this macro directly. + * + * Returns: 0 on success and -ETIMEDOUT upon a timeout. In either + * case, the last read value at @args is stored in @val. */ #define read_poll_timeout_atomic(op, val, cond, delay_us, timeout_us, \ delay_before_read, args...) \ @@ -119,17 +119,17 @@ * @addr: Address to poll * @val: Variable to read the value into * @cond: Break condition (usually involving @val) - * @sleep_us: Maximum time to sleep between reads in us (0 - * tight-loops). Should be less than ~20ms since usleep_range - * is used (see Documentation/timers/timers-howto.rst). + * @sleep_us: Maximum time to sleep between reads in us (0 tight-loops). Please + * read usleep_range() function description for details and + * limitations. * @timeout_us: Timeout in us, 0 means never timeout * - * Returns 0 on success and -ETIMEDOUT upon a timeout. In either - * case, the last read value at @addr is stored in @val. Must not - * be called from atomic context if sleep_us or timeout_us are used. - * * When available, you'll probably want to use one of the specialized * macros defined below rather than this macro directly. + * + * Returns: 0 on success and -ETIMEDOUT upon a timeout. In either + * case, the last read value at @addr is stored in @val. Must not + * be called from atomic context if sleep_us or timeout_us are used. */ #define readx_poll_timeout(op, addr, val, cond, sleep_us, timeout_us) \ read_poll_timeout(op, val, cond, sleep_us, timeout_us, false, addr) @@ -140,16 +140,16 @@ * @addr: Address to poll * @val: Variable to read the value into * @cond: Break condition (usually involving @val) - * @delay_us: Time to udelay between reads in us (0 tight-loops). Should - * be less than ~10us since udelay is used (see - * Documentation/timers/timers-howto.rst). + * @delay_us: Time to udelay between reads in us (0 tight-loops). Please + * read udelay() function description for details and + * limitations. * @timeout_us: Timeout in us, 0 means never timeout * - * Returns 0 on success and -ETIMEDOUT upon a timeout. In either - * case, the last read value at @addr is stored in @val. - * * When available, you'll probably want to use one of the specialized * macros defined below rather than this macro directly. + * + * Returns: 0 on success and -ETIMEDOUT upon a timeout. In either + * case, the last read value at @addr is stored in @val. */ #define readx_poll_timeout_atomic(op, addr, val, cond, delay_us, timeout_us) \ read_poll_timeout_atomic(op, val, cond, delay_us, timeout_us, false, addr) diff --git a/include/linux/phy.h b/include/linux/phy.h index a98bc91a0cde..504766d4b2d5 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -1378,12 +1378,13 @@ int phy_read_mmd(struct phy_device *phydev, int devad, u32 regnum); * @regnum: The register on the MMD to read * @val: Variable to read the register into * @cond: Break condition (usually involving @val) - * @sleep_us: Maximum time to sleep between reads in us (0 - * tight-loops). Should be less than ~20ms since usleep_range - * is used (see Documentation/timers/timers-howto.rst). + * @sleep_us: Maximum time to sleep between reads in us (0 tight-loops). Please + * read usleep_range() function description for details and + * limitations. * @timeout_us: Timeout in us, 0 means never timeout * @sleep_before_read: if it is true, sleep @sleep_us before read. - * Returns 0 on success and -ETIMEDOUT upon a timeout. In either + * + * Returns: 0 on success and -ETIMEDOUT upon a timeout. In either * case, the last read value at @args is stored in @val. Must not * be called from atomic context if sleep_us or timeout_us are used. */ diff --git a/include/linux/regmap.h b/include/linux/regmap.h index f9ccad32fc5c..75f162b60ba1 100644 --- a/include/linux/regmap.h +++ b/include/linux/regmap.h @@ -106,17 +106,17 @@ struct reg_sequence { * @addr: Address to poll * @val: Unsigned integer variable to read the value into * @cond: Break condition (usually involving @val) - * @sleep_us: Maximum time to sleep between reads in us (0 - * tight-loops). Should be less than ~20ms since usleep_range - * is used (see Documentation/timers/timers-howto.rst). + * @sleep_us: Maximum time to sleep between reads in us (0 tight-loops). Please + * read usleep_range() function description for details and + * limitations. * @timeout_us: Timeout in us, 0 means never timeout * - * Returns 0 on success and -ETIMEDOUT upon a timeout or the regmap_read + * This is modelled after the readx_poll_timeout macros in linux/iopoll.h. + * + * Returns: 0 on success and -ETIMEDOUT upon a timeout or the regmap_read * error return value in case of a error read. In the two former cases, * the last read value at @addr is stored in @val. Must not be called * from atomic context if sleep_us or timeout_us are used. - * - * This is modelled after the readx_poll_timeout macros in linux/iopoll.h. */ #define regmap_read_poll_timeout(map, addr, val, cond, sleep_us, timeout_us) \ ({ \ @@ -133,20 +133,20 @@ struct reg_sequence { * @addr: Address to poll * @val: Unsigned integer variable to read the value into * @cond: Break condition (usually involving @val) - * @delay_us: Time to udelay between reads in us (0 tight-loops). - * Should be less than ~10us since udelay is used - * (see Documentation/timers/timers-howto.rst). + * @delay_us: Time to udelay between reads in us (0 tight-loops). Please + * read udelay() function description for details and + * limitations. * @timeout_us: Timeout in us, 0 means never timeout * - * Returns 0 on success and -ETIMEDOUT upon a timeout or the regmap_read - * error return value in case of a error read. In the two former cases, - * the last read value at @addr is stored in @val. - * * This is modelled after the readx_poll_timeout_atomic macros in linux/iopoll.h. * * Note: In general regmap cannot be used in atomic context. If you want to use * this macro then first setup your regmap for atomic use (flat or no cache * and MMIO regmap). + * + * Returns: 0 on success and -ETIMEDOUT upon a timeout or the regmap_read + * error return value in case of a error read. In the two former cases, + * the last read value at @addr is stored in @val. */ #define regmap_read_poll_timeout_atomic(map, addr, val, cond, delay_us, timeout_us) \ ({ \ @@ -177,17 +177,17 @@ struct reg_sequence { * @field: Regmap field to read from * @val: Unsigned integer variable to read the value into * @cond: Break condition (usually involving @val) - * @sleep_us: Maximum time to sleep between reads in us (0 - * tight-loops). Should be less than ~20ms since usleep_range - * is used (see Documentation/timers/timers-howto.rst). + * @sleep_us: Maximum time to sleep between reads in us (0 tight-loops). Please + * read usleep_range() function description for details and + * limitations. * @timeout_us: Timeout in us, 0 means never timeout * - * Returns 0 on success and -ETIMEDOUT upon a timeout or the regmap_field_read + * This is modelled after the readx_poll_timeout macros in linux/iopoll.h. + * + * Returns: 0 on success and -ETIMEDOUT upon a timeout or the regmap_field_read * error return value in case of a error read. In the two former cases, * the last read value at @addr is stored in @val. Must not be called * from atomic context if sleep_us or timeout_us are used. - * - * This is modelled after the readx_poll_timeout macros in linux/iopoll.h. */ #define regmap_field_read_poll_timeout(field, val, cond, sleep_us, timeout_us) \ ({ \ diff --git a/sound/soc/sof/ops.h b/sound/soc/sof/ops.h index 2584621c3b2d..d73644e85b6e 100644 --- a/sound/soc/sof/ops.h +++ b/sound/soc/sof/ops.h @@ -597,12 +597,12 @@ snd_sof_is_chain_dma_supported(struct snd_sof_dev *sdev, u32 dai_type) * @addr: Address to poll * @val: Variable to read the value into * @cond: Break condition (usually involving @val) - * @sleep_us: Maximum time to sleep between reads in us (0 - * tight-loops). Should be less than ~20ms since usleep_range - * is used (see Documentation/timers/timers-howto.rst). + * @sleep_us: Maximum time to sleep between reads in us (0 tight-loops). Please + * read usleep_range() function description for details and + * limitations. * @timeout_us: Timeout in us, 0 means never timeout * - * Returns 0 on success and -ETIMEDOUT upon a timeout. In either + * Returns: 0 on success and -ETIMEDOUT upon a timeout. In either * case, the last read value at @addr is stored in @val. Must not * be called from atomic context if sleep_us or timeout_us are used. * -- 2.51.0 From b7f0eb8c9bc8662ca78082e82856fcb0cf16d7c6 Mon Sep 17 00:00:00 2001 From: Anna-Maria Behnsen Date: Mon, 14 Oct 2024 10:22:30 +0200 Subject: [PATCH 05/16] powerpc/rtas: Use fsleep() to minimize additional sleep duration When commit 38f7b7067dae ("powerpc/rtas: rtas_busy_delay() improvements") was introduced, documentation about proper usage of sleep related functions was outdated. The commit message references the usage of a HZ=100 system. When using a 20ms sleep duration on such a system and therefore using msleep(), the possible additional slack will be +10ms. When the system is configured with HZ=100 the granularity of a jiffy and of a bucket of the lowest timer wheel level is 10ms. To make sure a timer will not expire early (when queueing of the timer races with an concurrent update of jiffies), timers are always queued into the next bucket. This is the reason for the maximal possible slack of 10ms. fsleep() limits the maximal possible slack to 25% by making threshold between usleep_range() and msleep() HZ dependent. As soon as the accuracy of msleep() is sufficient, the less expensive timer list timer based sleeping function is used instead of the more expensive hrtimer based usleep_range() function. The udelay() will not be used in this specific usecase as the lowest sleep length is larger than 1 millisecond. Use fsleep() directly instead of using an own heuristic for the best sleeping mechanism to use. Signed-off-by: Anna-Maria Behnsen Signed-off-by: Thomas Gleixner Reviewed-by: Frederic Weisbecker Acked-by: Michael Ellerman (powerpc) Link: https://lore.kernel.org/all/20241014-devel-anna-maria-b4-timers-flseep-v3-13-dc8b907cb62f@linutronix.de --- arch/powerpc/kernel/rtas.c | 21 +++++++-------------- 1 file changed, 7 insertions(+), 14 deletions(-) diff --git a/arch/powerpc/kernel/rtas.c b/arch/powerpc/kernel/rtas.c index f7e86e09c49f..d31c9799cab2 100644 --- a/arch/powerpc/kernel/rtas.c +++ b/arch/powerpc/kernel/rtas.c @@ -1390,21 +1390,14 @@ bool __ref rtas_busy_delay(int status) */ ms = clamp(ms, 1U, 1000U); /* - * The delay hint is an order-of-magnitude suggestion, not - * a minimum. It is fine, possibly even advantageous, for - * us to pause for less time than hinted. For small values, - * use usleep_range() to ensure we don't sleep much longer - * than actually needed. - * - * See Documentation/timers/timers-howto.rst for - * explanation of the threshold used here. In effect we use - * usleep_range() for 9900 and 9901, msleep() for - * 9902-9905. + * The delay hint is an order-of-magnitude suggestion, not a + * minimum. It is fine, possibly even advantageous, for us to + * pause for less time than hinted. To make sure pause time will + * not be way longer than requested independent of HZ + * configuration, use fsleep(). See fsleep() for details of + * used sleeping functions. */ - if (ms <= 20) - usleep_range(ms * 100, ms * 1000); - else - msleep(ms); + fsleep(ms * 1000); break; case RTAS_BUSY: ret = true; -- 2.51.0 From d2af954f225db2ccf446a4b174a5281dff171d41 Mon Sep 17 00:00:00 2001 From: Anna-Maria Behnsen Date: Mon, 14 Oct 2024 10:22:31 +0200 Subject: [PATCH 06/16] media: anysee: Fix and remove outdated comment anysee driver was transformed to use usbv2 years ago. The comments in anysee_ctrl_msg() still are referencing the old interfaces where msleep() was used. The v2 interfaces also changed over the years and with commit 1162c7b383a6 ("[media] dvb_usb_v2: refactor dvb_usbv2_generic_rw()") the usage of msleep() was gone anyway. Remove FIXME comment and update also comment before call to dvb_usbv2_generic_rw_locked(). Signed-off-by: Anna-Maria Behnsen Signed-off-by: Thomas Gleixner Reviewed-by: Sebastian Andrzej Siewior Acked-by: Frederic Weisbecker Link: https://lore.kernel.org/all/20241014-devel-anna-maria-b4-timers-flseep-v3-14-dc8b907cb62f@linutronix.de --- drivers/media/usb/dvb-usb-v2/anysee.c | 17 ++++------------- 1 file changed, 4 insertions(+), 13 deletions(-) diff --git a/drivers/media/usb/dvb-usb-v2/anysee.c b/drivers/media/usb/dvb-usb-v2/anysee.c index 8699846eb416..bea12cdc85e8 100644 --- a/drivers/media/usb/dvb-usb-v2/anysee.c +++ b/drivers/media/usb/dvb-usb-v2/anysee.c @@ -46,24 +46,15 @@ static int anysee_ctrl_msg(struct dvb_usb_device *d, dev_dbg(&d->udev->dev, "%s: >>> %*ph\n", __func__, slen, state->buf); - /* We need receive one message more after dvb_usb_generic_rw due - to weird transaction flow, which is 1 x send + 2 x receive. */ + /* + * We need receive one message more after dvb_usbv2_generic_rw_locked() + * due to weird transaction flow, which is 1 x send + 2 x receive. + */ ret = dvb_usbv2_generic_rw_locked(d, state->buf, sizeof(state->buf), state->buf, sizeof(state->buf)); if (ret) goto error_unlock; - /* TODO FIXME: dvb_usb_generic_rw() fails rarely with error code -32 - * (EPIPE, Broken pipe). Function supports currently msleep() as a - * parameter but I would not like to use it, since according to - * Documentation/timers/timers-howto.rst it should not be used such - * short, under < 20ms, sleeps. Repeating failed message would be - * better choice as not to add unwanted delays... - * Fixing that correctly is one of those or both; - * 1) use repeat if possible - * 2) add suitable delay - */ - /* get answer, retry few times if error returned */ for (i = 0; i < 3; i++) { /* receive 2nd answer */ -- 2.51.0 From 1f455f601e2060497f9883991e8d5e79fbc7b047 Mon Sep 17 00:00:00 2001 From: Anna-Maria Behnsen Date: Mon, 14 Oct 2024 10:22:32 +0200 Subject: [PATCH 07/16] timers/Documentation: Cleanup delay/sleep documentation The documentation which tries to give advices how to properly inserting delays or sleeps is outdated. The file name is 'timers-howto.rst' which might be misleading as it is only about delay and sleep mechanisms and not how to use timers. Update the documentation by integrating the important parts from the related function descriptions and move it all into a self explaining file with the name "delay_sleep_functions.rst". Signed-off-by: Anna-Maria Behnsen Signed-off-by: Thomas Gleixner Reviewed-by: Frederic Weisbecker Link: https://lore.kernel.org/all/20241014-devel-anna-maria-b4-timers-flseep-v3-15-dc8b907cb62f@linutronix.de --- .../timers/delay_sleep_functions.rst | 121 ++++++++++++++++++ Documentation/timers/index.rst | 2 +- Documentation/timers/timers-howto.rst | 115 ----------------- 3 files changed, 122 insertions(+), 116 deletions(-) create mode 100644 Documentation/timers/delay_sleep_functions.rst delete mode 100644 Documentation/timers/timers-howto.rst diff --git a/Documentation/timers/delay_sleep_functions.rst b/Documentation/timers/delay_sleep_functions.rst new file mode 100644 index 000000000000..49d603a3f113 --- /dev/null +++ b/Documentation/timers/delay_sleep_functions.rst @@ -0,0 +1,121 @@ +.. SPDX-License-Identifier: GPL-2.0 + +Delay and sleep mechanisms +========================== + +This document seeks to answer the common question: "What is the +RightWay (TM) to insert a delay?" + +This question is most often faced by driver writers who have to +deal with hardware delays and who may not be the most intimately +familiar with the inner workings of the Linux Kernel. + +The following table gives a rough overview about the existing function +'families' and their limitations. This overview table does not replace the +reading of the function description before usage! + +.. list-table:: + :widths: 20 20 20 20 20 + :header-rows: 2 + + * - + - `*delay()` + - `usleep_range*()` + - `*sleep()` + - `fsleep()` + * - + - busy-wait loop + - hrtimers based + - timer list timers based + - combines the others + * - Usage in atomic Context + - yes + - no + - no + - no + * - precise on "short intervals" + - yes + - yes + - depends + - yes + * - precise on "long intervals" + - Do not use! + - yes + - max 12.5% slack + - yes + * - interruptible variant + - no + - yes + - yes + - no + +A generic advice for non atomic contexts could be: + +#. Use `fsleep()` whenever unsure (as it combines all the advantages of the + others) +#. Use `*sleep()` whenever possible +#. Use `usleep_range*()` whenever accuracy of `*sleep()` is not sufficient +#. Use `*delay()` for very, very short delays + +Find some more detailed information about the function 'families' in the next +sections. + +`*delay()` family of functions +------------------------------ + +These functions use the jiffy estimation of clock speed and will busy wait for +enough loop cycles to achieve the desired delay. udelay() is the basic +implementation and ndelay() as well as mdelay() are variants. + +These functions are mainly used to add a delay in atomic context. Please make +sure to ask yourself before adding a delay in atomic context: Is this really +required? + +.. kernel-doc:: include/asm-generic/delay.h + :identifiers: udelay ndelay + +.. kernel-doc:: include/linux/delay.h + :identifiers: mdelay + + +`usleep_range*()` and `*sleep()` family of functions +---------------------------------------------------- + +These functions use hrtimers or timer list timers to provide the requested +sleeping duration. In order to decide which function is the right one to use, +take some basic information into account: + +#. hrtimers are more expensive as they are using an rb-tree (instead of hashing) +#. hrtimers are more expensive when the requested sleeping duration is the first + timer which means real hardware has to be programmed +#. timer list timers always provide some sort of slack as they are jiffy based + +The generic advice is repeated here: + +#. Use `fsleep()` whenever unsure (as it combines all the advantages of the + others) +#. Use `*sleep()` whenever possible +#. Use `usleep_range*()` whenever accuracy of `*sleep()` is not sufficient + +First check fsleep() function description and to learn more about accuracy, +please check msleep() function description. + + +`usleep_range*()` +~~~~~~~~~~~~~~~~~ + +.. kernel-doc:: include/linux/delay.h + :identifiers: usleep_range usleep_range_idle + +.. kernel-doc:: kernel/time/sleep_timeout.c + :identifiers: usleep_range_state + + +`*sleep()` +~~~~~~~~~~ + +.. kernel-doc:: kernel/time/sleep_timeout.c + :identifiers: msleep msleep_interruptible + +.. kernel-doc:: include/linux/delay.h + :identifiers: ssleep fsleep diff --git a/Documentation/timers/index.rst b/Documentation/timers/index.rst index 983f91f8f023..4e88116e4dcf 100644 --- a/Documentation/timers/index.rst +++ b/Documentation/timers/index.rst @@ -12,7 +12,7 @@ Timers hrtimers no_hz timekeeping - timers-howto + delay_sleep_functions .. only:: subproject and html diff --git a/Documentation/timers/timers-howto.rst b/Documentation/timers/timers-howto.rst deleted file mode 100644 index ef7a4652ccc9..000000000000 --- a/Documentation/timers/timers-howto.rst +++ /dev/null @@ -1,115 +0,0 @@ -=================================================================== -delays - Information on the various kernel delay / sleep mechanisms -=================================================================== - -This document seeks to answer the common question: "What is the -RightWay (TM) to insert a delay?" - -This question is most often faced by driver writers who have to -deal with hardware delays and who may not be the most intimately -familiar with the inner workings of the Linux Kernel. - - -Inserting Delays ----------------- - -The first, and most important, question you need to ask is "Is my -code in an atomic context?" This should be followed closely by "Does -it really need to delay in atomic context?" If so... - -ATOMIC CONTEXT: - You must use the `*delay` family of functions. These - functions use the jiffy estimation of clock speed - and will busy wait for enough loop cycles to achieve - the desired delay: - - ndelay(unsigned long nsecs) - udelay(unsigned long usecs) - mdelay(unsigned long msecs) - - udelay is the generally preferred API; ndelay-level - precision may not actually exist on many non-PC devices. - - mdelay is macro wrapper around udelay, to account for - possible overflow when passing large arguments to udelay. - In general, use of mdelay is discouraged and code should - be refactored to allow for the use of msleep. - -NON-ATOMIC CONTEXT: - You should use the `*sleep[_range]` family of functions. - There are a few more options here, while any of them may - work correctly, using the "right" sleep function will - help the scheduler, power management, and just make your - driver better :) - - -- Backed by busy-wait loop: - - udelay(unsigned long usecs) - - -- Backed by hrtimers: - - usleep_range(unsigned long min, unsigned long max) - - -- Backed by jiffies / legacy_timers - - msleep(unsigned long msecs) - msleep_interruptible(unsigned long msecs) - - Unlike the `*delay` family, the underlying mechanism - driving each of these calls varies, thus there are - quirks you should be aware of. - - - SLEEPING FOR "A FEW" USECS ( < ~10us? ): - * Use udelay - - - Why not usleep? - On slower systems, (embedded, OR perhaps a speed- - stepped PC!) the overhead of setting up the hrtimers - for usleep *may* not be worth it. Such an evaluation - will obviously depend on your specific situation, but - it is something to be aware of. - - SLEEPING FOR ~USECS OR SMALL MSECS ( 10us - 20ms): - * Use usleep_range - - - Why not msleep for (1ms - 20ms)? - Explained originally here: - https://lore.kernel.org/r/15327.1186166232@lwn.net - - msleep(1~20) may not do what the caller intends, and - will often sleep longer (~20 ms actual sleep for any - value given in the 1~20ms range). In many cases this - is not the desired behavior. - - - Why is there no "usleep" / What is a good range? - Since usleep_range is built on top of hrtimers, the - wakeup will be very precise (ish), thus a simple - usleep function would likely introduce a large number - of undesired interrupts. - - With the introduction of a range, the scheduler is - free to coalesce your wakeup with any other wakeup - that may have happened for other reasons, or at the - worst case, fire an interrupt for your upper bound. - - The larger a range you supply, the greater a chance - that you will not trigger an interrupt; this should - be balanced with what is an acceptable upper bound on - delay / performance for your specific code path. Exact - tolerances here are very situation specific, thus it - is left to the caller to determine a reasonable range. - - SLEEPING FOR LARGER MSECS ( 10ms+ ) - * Use msleep or possibly msleep_interruptible - - - What's the difference? - msleep sets the current task to TASK_UNINTERRUPTIBLE - whereas msleep_interruptible sets the current task to - TASK_INTERRUPTIBLE before scheduling the sleep. In - short, the difference is whether the sleep can be ended - early by a signal. In general, just use msleep unless - you know you have a need for the interruptible variant. - - FLEXIBLE SLEEPING (any delay, uninterruptible) - * Use fsleep -- 2.51.0 From 2e529e637cef39057d9cf199a1ecb915d97ffcd9 Mon Sep 17 00:00:00 2001 From: Julia Lawall Date: Sun, 13 Oct 2024 22:16:58 +0200 Subject: [PATCH 08/16] posix-timers: Replace call_rcu() by kfree_rcu() for simple kmem_cache_free() callback Since SLOB was removed and since commit 6c6c47b063b5 ("mm, slab: call kvfree_rcu_barrier() from kmem_cache_destroy()"), it is not longer necessary to use call_rcu() when the callback only performs kmem_cache_free(). Use kfree_rcu() directly. The changes were made using Coccinelle. Signed-off-by: Julia Lawall Signed-off-by: Thomas Gleixner Reviewed-by: Uladzislau Rezki (Sony) Link: https://lore.kernel.org/all/20241013201704.49576-12-Julia.Lawall@inria.fr --- kernel/time/posix-timers.c | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c index 4576aaed13b2..fc40dacabe78 100644 --- a/kernel/time/posix-timers.c +++ b/kernel/time/posix-timers.c @@ -413,18 +413,11 @@ static struct k_itimer * alloc_posix_timer(void) return tmr; } -static void k_itimer_rcu_free(struct rcu_head *head) -{ - struct k_itimer *tmr = container_of(head, struct k_itimer, rcu); - - kmem_cache_free(posix_timers_cache, tmr); -} - static void posix_timer_free(struct k_itimer *tmr) { put_pid(tmr->it_pid); sigqueue_free(tmr->sigq); - call_rcu(&tmr->rcu, k_itimer_rcu_free); + kfree_rcu(tmr, rcu); } static void posix_timer_unhash_and_free(struct k_itimer *tmr) -- 2.51.0 From 14f1e3b3dfc7fc8b61fcb79f956f05625af6f049 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 9 Oct 2024 10:28:54 +0200 Subject: [PATCH 09/16] timekeeping: Read NTP tick length only once No point in reading it a second time when the comparison fails. Signed-off-by: Thomas Gleixner Signed-off-by: Anna-Maria Behnsen Signed-off-by: Thomas Gleixner Acked-by: John Stultz Link: https://lore.kernel.org/all/20241009-devel-anna-maria-b4-timers-ptp-timekeeping-v2-1-554456a44a15@linutronix.de --- kernel/time/timekeeping.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 1427c58e9802..2bc3542f29a2 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -2161,16 +2161,17 @@ static __always_inline void timekeeping_apply_adjustment(struct timekeeper *tk, */ static void timekeeping_adjust(struct timekeeper *tk, s64 offset) { + u64 ntp_tl = ntp_tick_length(); u32 mult; /* * Determine the multiplier from the current NTP tick length. * Avoid expensive division when the tick length doesn't change. */ - if (likely(tk->ntp_tick == ntp_tick_length())) { + if (likely(tk->ntp_tick == ntp_tl)) { mult = tk->tkr_mono.mult - tk->ntp_err_mult; } else { - tk->ntp_tick = ntp_tick_length(); + tk->ntp_tick = ntp_tl; mult = div64_u64((tk->ntp_tick >> tk->ntp_error_shift) - tk->xtime_remainder, tk->cycle_interval); } -- 2.51.0 From 886150fb4f19505b8f9d26201d7671b25c233a9f Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 9 Oct 2024 10:28:55 +0200 Subject: [PATCH 10/16] timekeeping: Don't stop time readers across hard_pps() update hard_pps() update does not modify anything which might be required by time readers so forcing readers out of the way during the update is a pointless exercise. The interaction with adjtimex() and timekeeper updates which call into the NTP code is properly serialized by timekeeper_lock. Signed-off-by: Thomas Gleixner Signed-off-by: Anna-Maria Behnsen Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/all/20241009-devel-anna-maria-b4-timers-ptp-timekeeping-v2-2-554456a44a15@linutronix.de --- kernel/time/timekeeping.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 2bc3542f29a2..ff98a0b54b54 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -2746,11 +2746,7 @@ void hardpps(const struct timespec64 *phase_ts, const struct timespec64 *raw_ts) unsigned long flags; raw_spin_lock_irqsave(&timekeeper_lock, flags); - write_seqcount_begin(&tk_core.seq); - __hardpps(phase_ts, raw_ts); - - write_seqcount_end(&tk_core.seq); raw_spin_unlock_irqrestore(&timekeeper_lock, flags); } EXPORT_SYMBOL(hardpps); -- 2.51.0 From 9fe7d9a984f2309ceb9f53bc89eb4885994e5052 Mon Sep 17 00:00:00 2001 From: Anna-Maria Behnsen Date: Wed, 9 Oct 2024 10:28:56 +0200 Subject: [PATCH 11/16] timekeeping: Avoid duplicate leap state update do_adjtimex() invokes tk_update_leap_state() unconditionally even when a previous invocation of timekeeping_update() already did that update. Put it into the else path which is invoked when timekeeping_update() is not called. Signed-off-by: Anna-Maria Behnsen Signed-off-by: Thomas Gleixner Acked-by: John Stultz Link: https://lore.kernel.org/all/20241009-devel-anna-maria-b4-timers-ptp-timekeeping-v2-3-554456a44a15@linutronix.de --- kernel/time/timekeeping.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index ff98a0b54b54..14aaa44104eb 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -2715,8 +2715,9 @@ int do_adjtimex(struct __kernel_timex *txc) __timekeeping_set_tai_offset(tk, tai); timekeeping_update(tk, TK_MIRROR | TK_CLOCK_WAS_SET); clock_set = true; + } else { + tk_update_leap_state(tk); } - tk_update_leap_state(tk); write_seqcount_end(&tk_core.seq); raw_spin_unlock_irqrestore(&timekeeper_lock, flags); -- 2.51.0 From 1f7226b1e70a0e2ca3b305808cc7f1ae3acbd127 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 9 Oct 2024 10:28:57 +0200 Subject: [PATCH 12/16] timekeeping: Abort clocksource change in case of failure There is no point to go through a full timekeeping update when acquiring a module reference or enabling the new clocksource fails. Signed-off-by: Thomas Gleixner Signed-off-by: Anna-Maria Behnsen Signed-off-by: Thomas Gleixner Acked-by: John Stultz Link: https://lore.kernel.org/all/20241009-devel-anna-maria-b4-timers-ptp-timekeeping-v2-4-554456a44a15@linutronix.de --- kernel/time/timekeeping.c | 31 +++++++++++++------------------ 1 file changed, 13 insertions(+), 18 deletions(-) diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 14aaa44104eb..a9550f6a7f12 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -1608,33 +1608,29 @@ static void __timekeeping_set_tai_offset(struct timekeeper *tk, s32 tai_offset) static int change_clocksource(void *data) { struct timekeeper *tk = &tk_core.timekeeper; - struct clocksource *new, *old = NULL; + struct clocksource *new = data, *old = NULL; unsigned long flags; - bool change = false; - - new = (struct clocksource *) data; /* - * If the cs is in module, get a module reference. Succeeds - * for built-in code (owner == NULL) as well. + * If the clocksource is in a module, get a module reference. + * Succeeds for built-in code (owner == NULL) as well. Abort if the + * reference can't be acquired. */ - if (try_module_get(new->owner)) { - if (!new->enable || new->enable(new) == 0) - change = true; - else - module_put(new->owner); + if (!try_module_get(new->owner)) + return 0; + + /* Abort if the device can't be enabled */ + if (new->enable && new->enable(new) != 0) { + module_put(new->owner); + return 0; } raw_spin_lock_irqsave(&timekeeper_lock, flags); write_seqcount_begin(&tk_core.seq); timekeeping_forward_now(tk); - - if (change) { - old = tk->tkr_mono.clock; - tk_setup_internals(tk, new); - } - + old = tk->tkr_mono.clock; + tk_setup_internals(tk, new); timekeeping_update(tk, TK_CLEAR_NTP | TK_MIRROR | TK_CLOCK_WAS_SET); write_seqcount_end(&tk_core.seq); @@ -1643,7 +1639,6 @@ static int change_clocksource(void *data) if (old) { if (old->disable) old->disable(old); - module_put(old->owner); } -- 2.51.0 From c2a329566a3d5a638061733f232c40379235931d Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 9 Oct 2024 10:28:58 +0200 Subject: [PATCH 13/16] timekeeping: Simplify code in timekeeping_advance() timekeeping_advance() takes the timekeeper_lock and releases it before returning. When an early return is required, goto statements are used to make sure the lock is realeased properly. When the code was written the locking guard() was not yet available. Use the guard() to simplify the code and while at it cleanup ordering of function variables. No functional change. Signed-off-by: Thomas Gleixner Signed-off-by: Anna-Maria Behnsen Signed-off-by: Thomas Gleixner Acked-by: John Stultz Link: https://lore.kernel.org/all/20241009-devel-anna-maria-b4-timers-ptp-timekeeping-v2-5-554456a44a15@linutronix.de --- kernel/time/timekeeping.c | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index a9550f6a7f12..cfb718dec737 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -2307,23 +2307,22 @@ static bool timekeeping_advance(enum timekeeping_adv_mode mode) { struct timekeeper *real_tk = &tk_core.timekeeper; struct timekeeper *tk = &shadow_timekeeper; - u64 offset; - int shift = 0, maxshift; unsigned int clock_set = 0; - unsigned long flags; + int shift = 0, maxshift; + u64 offset; - raw_spin_lock_irqsave(&timekeeper_lock, flags); + guard(raw_spinlock_irqsave)(&timekeeper_lock); /* Make sure we're fully resumed: */ if (unlikely(timekeeping_suspended)) - goto out; + return false; offset = clocksource_delta(tk_clock_read(&tk->tkr_mono), tk->tkr_mono.cycle_last, tk->tkr_mono.mask); /* Check if there's really nothing to do */ if (offset < real_tk->cycle_interval && mode == TK_ADV_TICK) - goto out; + return false; /* Do some additional sanity checking */ timekeeping_check_update(tk, offset); @@ -2342,8 +2341,7 @@ static bool timekeeping_advance(enum timekeeping_adv_mode mode) maxshift = (64 - (ilog2(ntp_tick_length())+1)) - 1; shift = min(shift, maxshift); while (offset >= tk->cycle_interval) { - offset = logarithmic_accumulation(tk, offset, shift, - &clock_set); + offset = logarithmic_accumulation(tk, offset, shift, &clock_set); if (offset < tk->cycle_interval< Date: Tue, 15 Oct 2024 12:08:39 +0200 Subject: [PATCH 14/16] timekeeping: Reorder struct timekeeper struct timekeeper is ordered suboptimal vs. cachelines. The layout, including the preceding seqcount (see struct tk_core in timekeeper.c) is: cacheline 0: seqcount, tkr_mono cacheline 1: tkr_raw, xtime_sec cacheline 2: ktime_sec ... tai_offset, internal variables cacheline 3: next_leap_ktime, raw_sec, internal variables cacheline 4: internal variables So any access to via ktime_get*() except for access to CLOCK_MONOTONIC_RAW will use either cachelines 0 + 1 or cachelines 0 + 2. Access to CLOCK_MONOTONIC_RAW uses cachelines 0 + 1 + 3. Reorder the members so that the result is more efficient: cacheline 0: seqcount, tkr_mono cacheline 1: xtime_sec, ktime_sec ... tai_offset cacheline 2: tkr_raw, raw_sec cacheline 3: internal variables cacheline 4: internal variables That means ktime_get*() will access cacheline 0 + 1 and CLOCK_MONOTONIC_RAW access will use cachelines 0 + 2. Update kernel-doc and fix formatting issues while at it. Also fix a typo in struct tk_read_base kernel-doc. Signed-off-by: Thomas Gleixner Signed-off-by: Anna-Maria Behnsen Signed-off-by: Thomas Gleixner Acked-by: John Stultz Link: https://lore.kernel.org/all/20241015100839.12702-1-anna-maria@linutronix.de --- include/linux/timekeeper_internal.h | 106 +++++++++++++++++----------- 1 file changed, 65 insertions(+), 41 deletions(-) diff --git a/include/linux/timekeeper_internal.h b/include/linux/timekeeper_internal.h index 902c20ef495a..a3b6380a7777 100644 --- a/include/linux/timekeeper_internal.h +++ b/include/linux/timekeeper_internal.h @@ -26,7 +26,7 @@ * occupies a single 64byte cache line. * * The struct is separate from struct timekeeper as it is also used - * for a fast NMI safe accessors. + * for the fast NMI safe accessors. * * @base_real is for the fast NMI safe accessor to allow reading clock * realtime from any context. @@ -44,33 +44,41 @@ struct tk_read_base { /** * struct timekeeper - Structure holding internal timekeeping values. - * @tkr_mono: The readout base structure for CLOCK_MONOTONIC - * @tkr_raw: The readout base structure for CLOCK_MONOTONIC_RAW - * @xtime_sec: Current CLOCK_REALTIME time in seconds - * @ktime_sec: Current CLOCK_MONOTONIC time in seconds - * @wall_to_monotonic: CLOCK_REALTIME to CLOCK_MONOTONIC offset - * @offs_real: Offset clock monotonic -> clock realtime - * @offs_boot: Offset clock monotonic -> clock boottime - * @offs_tai: Offset clock monotonic -> clock tai - * @tai_offset: The current UTC to TAI offset in seconds - * @clock_was_set_seq: The sequence number of clock was set events - * @cs_was_changed_seq: The sequence number of clocksource change events - * @next_leap_ktime: CLOCK_MONOTONIC time value of a pending leap-second - * @raw_sec: CLOCK_MONOTONIC_RAW time in seconds - * @monotonic_to_boot: CLOCK_MONOTONIC to CLOCK_BOOTTIME offset - * @cycle_interval: Number of clock cycles in one NTP interval - * @xtime_interval: Number of clock shifted nano seconds in one NTP - * interval. - * @xtime_remainder: Shifted nano seconds left over when rounding - * @cycle_interval - * @raw_interval: Shifted raw nano seconds accumulated per NTP interval. - * @ntp_error: Difference between accumulated time and NTP time in ntp - * shifted nano seconds. - * @ntp_error_shift: Shift conversion between clock shifted nano seconds and - * ntp shifted nano seconds. - * @last_warning: Warning ratelimiter (DEBUG_TIMEKEEPING) - * @underflow_seen: Underflow warning flag (DEBUG_TIMEKEEPING) - * @overflow_seen: Overflow warning flag (DEBUG_TIMEKEEPING) + * @tkr_mono: The readout base structure for CLOCK_MONOTONIC + * @xtime_sec: Current CLOCK_REALTIME time in seconds + * @ktime_sec: Current CLOCK_MONOTONIC time in seconds + * @wall_to_monotonic: CLOCK_REALTIME to CLOCK_MONOTONIC offset + * @offs_real: Offset clock monotonic -> clock realtime + * @offs_boot: Offset clock monotonic -> clock boottime + * @offs_tai: Offset clock monotonic -> clock tai + * @tai_offset: The current UTC to TAI offset in seconds + * @tkr_raw: The readout base structure for CLOCK_MONOTONIC_RAW + * @raw_sec: CLOCK_MONOTONIC_RAW time in seconds + * @clock_was_set_seq: The sequence number of clock was set events + * @cs_was_changed_seq: The sequence number of clocksource change events + * @monotonic_to_boot: CLOCK_MONOTONIC to CLOCK_BOOTTIME offset + * @cycle_interval: Number of clock cycles in one NTP interval + * @xtime_interval: Number of clock shifted nano seconds in one NTP + * interval. + * @xtime_remainder: Shifted nano seconds left over when rounding + * @cycle_interval + * @raw_interval: Shifted raw nano seconds accumulated per NTP interval. + * @next_leap_ktime: CLOCK_MONOTONIC time value of a pending leap-second + * @ntp_tick: The ntp_tick_length() value currently being + * used. This cached copy ensures we consistently + * apply the tick length for an entire tick, as + * ntp_tick_length may change mid-tick, and we don't + * want to apply that new value to the tick in + * progress. + * @ntp_error: Difference between accumulated time and NTP time in ntp + * shifted nano seconds. + * @ntp_error_shift: Shift conversion between clock shifted nano seconds and + * ntp shifted nano seconds. + * @ntp_err_mult: Multiplication factor for scaled math conversion + * @skip_second_overflow: Flag used to avoid updating NTP twice with same second + * @last_warning: Warning ratelimiter (DEBUG_TIMEKEEPING) + * @underflow_seen: Underflow warning flag (DEBUG_TIMEKEEPING) + * @overflow_seen: Overflow warning flag (DEBUG_TIMEKEEPING) * * Note: For timespec(64) based interfaces wall_to_monotonic is what * we need to add to xtime (or xtime corrected for sub jiffy times) @@ -88,10 +96,28 @@ struct tk_read_base { * * @monotonic_to_boottime is a timespec64 representation of @offs_boot to * accelerate the VDSO update for CLOCK_BOOTTIME. + * + * The cacheline ordering of the structure is optimized for in kernel usage of + * the ktime_get() and ktime_get_ts64() family of time accessors. Struct + * timekeeper is prepended in the core timekeeping code with a sequence count, + * which results in the following cacheline layout: + * + * 0: seqcount, tkr_mono + * 1: xtime_sec ... tai_offset + * 2: tkr_raw, raw_sec + * 3,4: Internal variables + * + * Cacheline 0,1 contain the data which is used for accessing + * CLOCK_MONOTONIC/REALTIME/BOOTTIME/TAI, while cacheline 2 contains the + * data for accessing CLOCK_MONOTONIC_RAW. Cacheline 3,4 are internal + * variables which are only accessed during timekeeper updates once per + * tick. */ struct timekeeper { + /* Cacheline 0 (together with prepended seqcount of timekeeper core): */ struct tk_read_base tkr_mono; - struct tk_read_base tkr_raw; + + /* Cacheline 1: */ u64 xtime_sec; unsigned long ktime_sec; struct timespec64 wall_to_monotonic; @@ -99,31 +125,29 @@ struct timekeeper { ktime_t offs_boot; ktime_t offs_tai; s32 tai_offset; + + /* Cacheline 2: */ + struct tk_read_base tkr_raw; + u64 raw_sec; + + /* Cachline 3 and 4 (timekeeping internal variables): */ unsigned int clock_was_set_seq; u8 cs_was_changed_seq; - ktime_t next_leap_ktime; - u64 raw_sec; + struct timespec64 monotonic_to_boot; - /* The following members are for timekeeping internal use */ u64 cycle_interval; u64 xtime_interval; s64 xtime_remainder; u64 raw_interval; - /* The ntp_tick_length() value currently being used. - * This cached copy ensures we consistently apply the tick - * length for an entire tick, as ntp_tick_length may change - * mid-tick, and we don't want to apply that new value to - * the tick in progress. - */ + + ktime_t next_leap_ktime; u64 ntp_tick; - /* Difference between accumulated time and NTP time in ntp - * shifted nano seconds. */ s64 ntp_error; u32 ntp_error_shift; u32 ntp_err_mult; - /* Flag used to avoid updating NTP twice with same second */ u32 skip_second_overflow; + #ifdef CONFIG_DEBUG_TIMEKEEPING long last_warning; /* -- 2.51.0 From 20c7b582e88b8a72832637cd1754e5622aa8a92d Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 9 Oct 2024 10:29:00 +0200 Subject: [PATCH 15/16] timekeeping: Move shadow_timekeeper into tk_core tk_core requires shadow_timekeeper to allow timekeeping_advance() updating without holding the timekeeper sequence count write locked. This allows the readers to make progress up to the actual update where the shadow timekeeper is copied over to the real timekeeper. As long as there is only a single timekeeper, having them separate is fine. But when the timekeeper infrastructure will be reused for per ptp clock timekeepers, shadow_timekeeper needs to be part of tk_core. No functional change. Signed-off-by: Thomas Gleixner Signed-off-by: Anna-Maria Behnsen Signed-off-by: Thomas Gleixner Acked-by: John Stultz Link: https://lore.kernel.org/all/20241009-devel-anna-maria-b4-timers-ptp-timekeeping-v2-7-554456a44a15@linutronix.de --- kernel/time/timekeeping.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index cfb718dec737..848d2b18f800 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -50,11 +50,11 @@ DEFINE_RAW_SPINLOCK(timekeeper_lock); static struct { seqcount_raw_spinlock_t seq; struct timekeeper timekeeper; + struct timekeeper shadow_timekeeper; } tk_core ____cacheline_aligned = { .seq = SEQCNT_RAW_SPINLOCK_ZERO(tk_core.seq, &timekeeper_lock), }; -static struct timekeeper shadow_timekeeper; /* flag for if timekeeping is suspended */ int __read_mostly timekeeping_suspended; @@ -795,8 +795,7 @@ static void timekeeping_update(struct timekeeper *tk, unsigned int action) * timekeeper structure on the next update with stale data */ if (action & TK_MIRROR) - memcpy(&shadow_timekeeper, &tk_core.timekeeper, - sizeof(tk_core.timekeeper)); + memcpy(&tk_core.shadow_timekeeper, &tk_core.timekeeper, sizeof(tk_core.timekeeper)); } /** @@ -2305,8 +2304,8 @@ static u64 logarithmic_accumulation(struct timekeeper *tk, u64 offset, */ static bool timekeeping_advance(enum timekeeping_adv_mode mode) { + struct timekeeper *tk = &tk_core.shadow_timekeeper; struct timekeeper *real_tk = &tk_core.timekeeper; - struct timekeeper *tk = &shadow_timekeeper; unsigned int clock_set = 0; int shift = 0, maxshift; u64 offset; -- 2.51.0 From dbdcf8c4caeca8192daa43429ccf23a1feec126c Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 9 Oct 2024 10:29:01 +0200 Subject: [PATCH 16/16] timekeeping: Encapsulate locking/unlocking of timekeeper_lock timekeeper_lock protects updates of timekeeper (tk_core). It is also used by vdso_update_begin/end() and not only internally by the timekeeper code. As long as there is only a single timekeeper, this works fine. But when the timekeeper infrastructure will be reused for per ptp clock timekeepers, timekeeper_lock needs to be part of tk_core.. Therefore encapuslate locking/unlocking of timekeeper_lock and make the lock static. Signed-off-by: Thomas Gleixner Signed-off-by: Anna-Maria Behnsen Signed-off-by: Thomas Gleixner Acked-by: John Stultz Link: https://lore.kernel.org/all/20241009-devel-anna-maria-b4-timers-ptp-timekeeping-v2-8-554456a44a15@linutronix.de --- kernel/time/timekeeping.c | 15 ++++++++++++++- kernel/time/timekeeping_internal.h | 3 ++- kernel/time/vsyscall.c | 5 ++--- 3 files changed, 18 insertions(+), 5 deletions(-) diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 848d2b18f800..77e0a0fe7771 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -41,7 +41,7 @@ enum timekeeping_adv_mode { TK_ADV_FREQ }; -DEFINE_RAW_SPINLOCK(timekeeper_lock); +static DEFINE_RAW_SPINLOCK(timekeeper_lock); /* * The most important data for readout fits into a single 64 byte @@ -114,6 +114,19 @@ static struct tk_fast tk_fast_raw ____cacheline_aligned = { .base[1] = FAST_TK_INIT, }; +unsigned long timekeeper_lock_irqsave(void) +{ + unsigned long flags; + + raw_spin_lock_irqsave(&timekeeper_lock, flags); + return flags; +} + +void timekeeper_unlock_irqrestore(unsigned long flags) +{ + raw_spin_unlock_irqrestore(&timekeeper_lock, flags); +} + /* * Multigrain timestamps require tracking the latest fine-grained timestamp * that has been issued, and never returning a coarse-grained timestamp that is diff --git a/kernel/time/timekeeping_internal.h b/kernel/time/timekeeping_internal.h index 0bbae825bc02..b3dca834f48c 100644 --- a/kernel/time/timekeeping_internal.h +++ b/kernel/time/timekeeping_internal.h @@ -49,6 +49,7 @@ static inline u64 clocksource_delta(u64 now, u64 last, u64 mask) #endif /* Semi public for serialization of non timekeeper VDSO updates. */ -extern raw_spinlock_t timekeeper_lock; +unsigned long timekeeper_lock_irqsave(void); +void timekeeper_unlock_irqrestore(unsigned long flags); #endif /* _TIMEKEEPING_INTERNAL_H */ diff --git a/kernel/time/vsyscall.c b/kernel/time/vsyscall.c index 9193d6133e5d..98488b20b594 100644 --- a/kernel/time/vsyscall.c +++ b/kernel/time/vsyscall.c @@ -151,9 +151,8 @@ void update_vsyscall_tz(void) unsigned long vdso_update_begin(void) { struct vdso_data *vdata = __arch_get_k_vdso_data(); - unsigned long flags; + unsigned long flags = timekeeper_lock_irqsave(); - raw_spin_lock_irqsave(&timekeeper_lock, flags); vdso_write_begin(vdata); return flags; } @@ -172,5 +171,5 @@ void vdso_update_end(unsigned long flags) vdso_write_end(vdata); __arch_sync_vdso_data(vdata); - raw_spin_unlock_irqrestore(&timekeeper_lock, flags); + timekeeper_unlock_irqrestore(flags); } -- 2.51.0