From 2de01e0e57f3ebe7f90b08f6bca5ce0f3da3829f Mon Sep 17 00:00:00 2001 From: Nikita Travkin Date: Fri, 4 Oct 2024 21:17:30 +0500 Subject: [PATCH 01/16] Input: zinitix - don't fail if linux,keycodes prop is absent When initially adding the touchkey support, a mistake was made in the property parsing code. The possible negative errno from device_property_count_u32() was never checked, which was an oversight left from converting to it from the of_property as part of the review fixes. Re-add the correct handling of the absent property, in which case zero touchkeys should be assumed, which would disable the feature. Reported-by: Jakob Hauser Tested-by: Jakob Hauser Fixes: 075d9b22c8fe ("Input: zinitix - add touchkey support") Reviewed-by: Linus Walleij Signed-off-by: Nikita Travkin Tested-by: Yassine Oudjana Link: https://lore.kernel.org/r/20241004-zinitix-no-keycodes-v2-1-876dc9fea4b6@trvn.ru Signed-off-by: Dmitry Torokhov --- drivers/input/touchscreen/zinitix.c | 34 +++++++++++++++++++---------- 1 file changed, 22 insertions(+), 12 deletions(-) diff --git a/drivers/input/touchscreen/zinitix.c b/drivers/input/touchscreen/zinitix.c index 52b3950460e2..716d6fa60f86 100644 --- a/drivers/input/touchscreen/zinitix.c +++ b/drivers/input/touchscreen/zinitix.c @@ -645,19 +645,29 @@ static int zinitix_ts_probe(struct i2c_client *client) return error; } - bt541->num_keycodes = device_property_count_u32(&client->dev, "linux,keycodes"); - if (bt541->num_keycodes > ARRAY_SIZE(bt541->keycodes)) { - dev_err(&client->dev, "too many keys defined (%d)\n", bt541->num_keycodes); - return -EINVAL; - } + if (device_property_present(&client->dev, "linux,keycodes")) { + bt541->num_keycodes = device_property_count_u32(&client->dev, + "linux,keycodes"); + if (bt541->num_keycodes < 0) { + dev_err(&client->dev, "Failed to count keys (%d)\n", + bt541->num_keycodes); + return bt541->num_keycodes; + } else if (bt541->num_keycodes > ARRAY_SIZE(bt541->keycodes)) { + dev_err(&client->dev, "Too many keys defined (%d)\n", + bt541->num_keycodes); + return -EINVAL; + } - error = device_property_read_u32_array(&client->dev, "linux,keycodes", - bt541->keycodes, - bt541->num_keycodes); - if (error) { - dev_err(&client->dev, - "Unable to parse \"linux,keycodes\" property: %d\n", error); - return error; + error = device_property_read_u32_array(&client->dev, + "linux,keycodes", + bt541->keycodes, + bt541->num_keycodes); + if (error) { + dev_err(&client->dev, + "Unable to parse \"linux,keycodes\" property: %d\n", + error); + return error; + } } error = zinitix_init_input_dev(bt541); -- 2.51.0 From 2c02f7375e658ae93d57a31a66f91b62754ef8f1 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 18 Oct 2024 21:43:00 -0400 Subject: [PATCH 02/16] fgraph: Use CPU hotplug mechanism to initialize idle shadow stacks The function graph infrastructure allocates a shadow stack for every task when enabled. This includes the idle tasks. The first time the function graph is invoked, the shadow stacks are created and never freed until the task exits. This includes the idle tasks. Only the idle tasks that were for online CPUs had their shadow stacks created when function graph tracing started. If function graph tracing is enabled and a CPU comes online, the idle task representing that CPU will not have its shadow stack created, and all function graph tracing for that idle task will be silently dropped. Instead, use the CPU hotplug mechanism to allocate the idle shadow stacks. This will include idle tasks for CPUs that come online during tracing. This issue can be reproduced by: # cd /sys/kernel/tracing # echo 0 > /sys/devices/system/cpu/cpu1/online # echo 0 > set_ftrace_pid # echo function_graph > current_tracer # echo 1 > options/funcgraph-proc # echo 1 > /sys/devices/system/cpu/cpu1 # grep '' per_cpu/cpu1/trace | head Before, nothing would show up. After: 1) -0 | 0.811 us | __enqueue_entity(); 1) -0 | 5.626 us | } /* enqueue_entity */ 1) -0 | | dl_server_update_idle_time() { 1) -0 | | dl_scaled_delta_exec() { 1) -0 | 0.450 us | arch_scale_cpu_capacity(); 1) -0 | 1.242 us | } 1) -0 | 1.908 us | } 1) -0 | | dl_server_start() { 1) -0 | | enqueue_dl_entity() { 1) -0 | | task_contending() { Note, if tracing stops and restarts, the old way would then initialize the onlined CPUs. Cc: stable@vger.kernel.org Cc: Masami Hiramatsu Cc: Mathieu Desnoyers Cc: Mark Rutland Cc: Thomas Gleixner Link: https://lore.kernel.org/20241018214300.6df82178@rorschach Fixes: 868baf07b1a25 ("ftrace: Fix memory leak with function graph and cpu hotplug") Signed-off-by: Steven Rostedt (Google) --- kernel/trace/fgraph.c | 28 +++++++++++++++++++++------- 1 file changed, 21 insertions(+), 7 deletions(-) diff --git a/kernel/trace/fgraph.c b/kernel/trace/fgraph.c index d7d4fb403f6f..43f4e3f57438 100644 --- a/kernel/trace/fgraph.c +++ b/kernel/trace/fgraph.c @@ -1160,19 +1160,13 @@ void fgraph_update_pid_func(void) static int start_graph_tracing(void) { unsigned long **ret_stack_list; - int ret, cpu; + int ret; ret_stack_list = kmalloc(SHADOW_STACK_SIZE, GFP_KERNEL); if (!ret_stack_list) return -ENOMEM; - /* The cpu_boot init_task->ret_stack will never be freed */ - for_each_online_cpu(cpu) { - if (!idle_task(cpu)->ret_stack) - ftrace_graph_init_idle_task(idle_task(cpu), cpu); - } - do { ret = alloc_retstack_tasklist(ret_stack_list); } while (ret == -EAGAIN); @@ -1242,14 +1236,34 @@ static void ftrace_graph_disable_direct(bool disable_branch) fgraph_direct_gops = &fgraph_stub; } +/* The cpu_boot init_task->ret_stack will never be freed */ +static int fgraph_cpu_init(unsigned int cpu) +{ + if (!idle_task(cpu)->ret_stack) + ftrace_graph_init_idle_task(idle_task(cpu), cpu); + return 0; +} + int register_ftrace_graph(struct fgraph_ops *gops) { + static bool fgraph_initialized; int command = 0; int ret = 0; int i = -1; mutex_lock(&ftrace_lock); + if (!fgraph_initialized) { + ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "fgraph_idle_init", + fgraph_cpu_init, NULL); + if (ret < 0) { + pr_warn("fgraph: Error to init cpu hotplug support\n"); + return ret; + } + fgraph_initialized = true; + ret = 0; + } + if (!fgraph_array[0]) { /* The array must always have real data on it */ for (i = 0; i < FGRAPH_ARRAY_SIZE; i++) -- 2.51.0 From fae4078c289a2f24229c0de652249948b1cd6bdb Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 18 Oct 2024 21:52:12 -0400 Subject: [PATCH 03/16] fgraph: Allocate ret_stack_list with proper size The ret_stack_list is an array of ret_stack shadow stacks for the function graph usage. When the first function graph is enabled, all tasks in the system get a shadow stack. The ret_stack_list is a 32 element array of pointers to these shadow stacks. It allocates the shadow stack in batches (32 stacks at a time), assigns them to running tasks, and continues until all tasks are covered. When the function graph shadow stack changed from an array of ftrace_ret_stack structures to an array of longs, the allocation of ret_stack_list went from allocating an array of 32 elements to just a block defined by SHADOW_STACK_SIZE. Luckily, that's defined as PAGE_SIZE and is much more than enough to hold 32 pointers. But it is way overkill for the amount needed to allocate. Change the allocation of ret_stack_list back to a kcalloc() of FTRACE_RETSTACK_ALLOC_SIZE pointers. Cc: Masami Hiramatsu Cc: Mark Rutland Cc: Mathieu Desnoyers Link: https://lore.kernel.org/20241018215212.23f13f40@rorschach Fixes: 42675b723b484 ("function_graph: Convert ret_stack to a series of longs") Signed-off-by: Steven Rostedt (Google) --- kernel/trace/fgraph.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/kernel/trace/fgraph.c b/kernel/trace/fgraph.c index 43f4e3f57438..41e7a15dcb50 100644 --- a/kernel/trace/fgraph.c +++ b/kernel/trace/fgraph.c @@ -1162,7 +1162,8 @@ static int start_graph_tracing(void) unsigned long **ret_stack_list; int ret; - ret_stack_list = kmalloc(SHADOW_STACK_SIZE, GFP_KERNEL); + ret_stack_list = kcalloc(FTRACE_RETSTACK_ALLOC_SIZE, + sizeof(*ret_stack_list), GFP_KERNEL); if (!ret_stack_list) return -ENOMEM; -- 2.51.0 From ae6a888a4357131c01d85f4c91fb32552dd0bf70 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Sat, 19 Oct 2024 09:16:51 -0600 Subject: [PATCH 04/16] io_uring/rw: fix wrong NOWAIT check in io_rw_init_file() A previous commit improved how !FMODE_NOWAIT is dealt with, but inadvertently negated a check whilst doing so. This caused -EAGAIN to be returned from reading files with O_NONBLOCK set. Fix up the check for REQ_F_SUPPORT_NOWAIT. Reported-by: Julian Orth Link: https://github.com/axboe/liburing/issues/1270 Fixes: f7c913438533 ("io_uring/rw: allow pollable non-blocking attempts for !FMODE_NOWAIT") Signed-off-by: Jens Axboe --- io_uring/rw.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/io_uring/rw.c b/io_uring/rw.c index 80ae3c2ebb70..354c4e175654 100644 --- a/io_uring/rw.c +++ b/io_uring/rw.c @@ -807,7 +807,7 @@ static int io_rw_init_file(struct io_kiocb *req, fmode_t mode, int rw_type) * reliably. If not, or it IOCB_NOWAIT is set, don't retry. */ if (kiocb->ki_flags & IOCB_NOWAIT || - ((file->f_flags & O_NONBLOCK && (req->flags & REQ_F_SUPPORT_NOWAIT)))) + ((file->f_flags & O_NONBLOCK && !(req->flags & REQ_F_SUPPORT_NOWAIT)))) req->flags |= REQ_F_NOWAIT; if (ctx->flags & IORING_SETUP_IOPOLL) { -- 2.51.0 From 42f7652d3eb527d03665b09edac47f85fb600924 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sun, 20 Oct 2024 15:19:38 -0700 Subject: [PATCH 05/16] Linux 6.12-rc4 --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 8cf3cf528892..a9a7d9ffaa98 100644 --- a/Makefile +++ b/Makefile @@ -2,7 +2,7 @@ VERSION = 6 PATCHLEVEL = 12 SUBLEVEL = 0 -EXTRAVERSION = -rc3 +EXTRAVERSION = -rc4 NAME = Baby Opossum Posse # *DOCUMENTATION* -- 2.51.0 From 23f1178ad706a1aa69ac3dfaa6559f1fb876c14e Mon Sep 17 00:00:00 2001 From: Christian Loehle Date: Fri, 25 Oct 2024 11:53:17 +0100 Subject: [PATCH 06/16] sched/uclamp: Fix unnused variable warning uclamp_mutex is only used for CONFIG_SYSCTL or CONFIG_UCLAMP_TASK_GROUP so declare it __maybe_unused. Closes: https://lore.kernel.org/oe-kbuild-all/202410060258.bPl2ZoUo-lkp@intel.com/ Closes: https://lore.kernel.org/oe-kbuild-all/202410250459.EJe6PJI5-lkp@intel.com/ Reported-by: kernel test robot Signed-off-by: Christian Loehle Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/a1e9c342-01c9-44f0-a789-2c908e57942b@arm.com --- kernel/sched/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 114adac5a9c8..9bad282e7950 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1399,7 +1399,7 @@ void set_load_weight(struct task_struct *p, bool update_load) * requests are serialized using a mutex to reduce the risk of conflicting * updates or API abuses. */ -static DEFINE_MUTEX(uclamp_mutex); +static __maybe_unused DEFINE_MUTEX(uclamp_mutex); /* Max allowed minimum utilization */ static unsigned int __maybe_unused sysctl_sched_uclamp_util_min = SCHED_CAPACITY_SCALE; -- 2.51.0 From 1a6151017ee5a30cb2d959f110ab18fc49646467 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Mon, 14 Oct 2024 10:43:58 -0400 Subject: [PATCH 07/16] sched: psi: pass enqueue/dequeue flags to psi callbacks directly What psi needs to do on each enqueue and dequeue has gotten more subtle, and the generic sched code trying to distill this into a bool for the callbacks is awkward. Pass the flags directly and let psi parse them. For that to work, the #include "stats.h" (which has the psi callback implementations) needs to be below the flag definitions in "sched.h". Move that section further down, next to some of the other accounting stuff. This also puts the ENQUEUE_SAVE/RESTORE branch behind the psi jump label, slightly reducing overhead when PSI=y but runtime disabled. Suggested-by: Peter Zijlstra Signed-off-by: Johannes Weiner Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20241014144358.GB1021@cmpxchg.org --- kernel/sched/core.c | 12 +++++----- kernel/sched/sched.h | 56 ++++++++++++++++++++++---------------------- kernel/sched/stats.h | 29 +++++++++++++++-------- 3 files changed, 53 insertions(+), 44 deletions(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 9bad282e7950..c57a79e34911 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2024,10 +2024,10 @@ void enqueue_task(struct rq *rq, struct task_struct *p, int flags) */ uclamp_rq_inc(rq, p); - if (!(flags & ENQUEUE_RESTORE)) { + psi_enqueue(p, flags); + + if (!(flags & ENQUEUE_RESTORE)) sched_info_enqueue(rq, p); - psi_enqueue(p, flags & ENQUEUE_MIGRATED); - } if (sched_core_enabled(rq)) sched_core_enqueue(rq, p); @@ -2044,10 +2044,10 @@ inline bool dequeue_task(struct rq *rq, struct task_struct *p, int flags) if (!(flags & DEQUEUE_NOCLOCK)) update_rq_clock(rq); - if (!(flags & DEQUEUE_SAVE)) { + if (!(flags & DEQUEUE_SAVE)) sched_info_dequeue(rq, p); - psi_dequeue(p, !(flags & DEQUEUE_SLEEP)); - } + + psi_dequeue(p, flags); /* * Must be before ->dequeue_task() because ->dequeue_task() can 'fail' diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 7b139016cbd9..e51bf5a344d3 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -2093,34 +2093,6 @@ static inline const struct cpumask *task_user_cpus(struct task_struct *p) #endif /* CONFIG_SMP */ -#include "stats.h" - -#if defined(CONFIG_SCHED_CORE) && defined(CONFIG_SCHEDSTATS) - -extern void __sched_core_account_forceidle(struct rq *rq); - -static inline void sched_core_account_forceidle(struct rq *rq) -{ - if (schedstat_enabled()) - __sched_core_account_forceidle(rq); -} - -extern void __sched_core_tick(struct rq *rq); - -static inline void sched_core_tick(struct rq *rq) -{ - if (sched_core_enabled(rq) && schedstat_enabled()) - __sched_core_tick(rq); -} - -#else /* !(CONFIG_SCHED_CORE && CONFIG_SCHEDSTATS): */ - -static inline void sched_core_account_forceidle(struct rq *rq) { } - -static inline void sched_core_tick(struct rq *rq) { } - -#endif /* !(CONFIG_SCHED_CORE && CONFIG_SCHEDSTATS) */ - #ifdef CONFIG_CGROUP_SCHED /* @@ -3191,6 +3163,34 @@ extern void nohz_run_idle_balance(int cpu); static inline void nohz_run_idle_balance(int cpu) { } #endif +#include "stats.h" + +#if defined(CONFIG_SCHED_CORE) && defined(CONFIG_SCHEDSTATS) + +extern void __sched_core_account_forceidle(struct rq *rq); + +static inline void sched_core_account_forceidle(struct rq *rq) +{ + if (schedstat_enabled()) + __sched_core_account_forceidle(rq); +} + +extern void __sched_core_tick(struct rq *rq); + +static inline void sched_core_tick(struct rq *rq) +{ + if (sched_core_enabled(rq) && schedstat_enabled()) + __sched_core_tick(rq); +} + +#else /* !(CONFIG_SCHED_CORE && CONFIG_SCHEDSTATS): */ + +static inline void sched_core_account_forceidle(struct rq *rq) { } + +static inline void sched_core_tick(struct rq *rq) { } + +#endif /* !(CONFIG_SCHED_CORE && CONFIG_SCHEDSTATS) */ + #ifdef CONFIG_IRQ_TIME_ACCOUNTING struct irqtime { diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h index 767e098a3bd1..8ee0add5a48a 100644 --- a/kernel/sched/stats.h +++ b/kernel/sched/stats.h @@ -127,21 +127,25 @@ static inline void psi_account_irqtime(struct rq *rq, struct task_struct *curr, * go through migration requeues. In this case, *sleeping* states need * to be transferred. */ -static inline void psi_enqueue(struct task_struct *p, bool migrate) +static inline void psi_enqueue(struct task_struct *p, int flags) { int clear = 0, set = 0; if (static_branch_likely(&psi_disabled)) return; + /* Same runqueue, nothing changed for psi */ + if (flags & ENQUEUE_RESTORE) + return; + if (p->se.sched_delayed) { /* CPU migration of "sleeping" task */ - SCHED_WARN_ON(!migrate); + SCHED_WARN_ON(!(flags & ENQUEUE_MIGRATED)); if (p->in_memstall) set |= TSK_MEMSTALL; if (p->in_iowait) set |= TSK_IOWAIT; - } else if (migrate) { + } else if (flags & ENQUEUE_MIGRATED) { /* CPU migration of runnable task */ set = TSK_RUNNING; if (p->in_memstall) @@ -158,17 +162,14 @@ static inline void psi_enqueue(struct task_struct *p, bool migrate) psi_task_change(p, clear, set); } -static inline void psi_dequeue(struct task_struct *p, bool migrate) +static inline void psi_dequeue(struct task_struct *p, int flags) { if (static_branch_likely(&psi_disabled)) return; - /* - * When migrating a task to another CPU, clear all psi - * state. The enqueue callback above will work it out. - */ - if (migrate) - psi_task_change(p, p->psi_flags, 0); + /* Same runqueue, nothing changed for psi */ + if (flags & DEQUEUE_SAVE) + return; /* * A voluntary sleep is a dequeue followed by a task switch. To @@ -176,6 +177,14 @@ static inline void psi_dequeue(struct task_struct *p, bool migrate) * TSK_RUNNING and TSK_IOWAIT for us when it moves TSK_ONCPU. * Do nothing here. */ + if (flags & DEQUEUE_SLEEP) + return; + + /* + * When migrating a task to another CPU, clear all psi + * state. The enqueue callback above will work it out. + */ + psi_task_change(p, p->psi_flags, 0); } static inline void psi_ttwu_dequeue(struct task_struct *p) -- 2.51.0 From b23decf8ac9102fc52c4de5196f4dc0a5f3eb80b Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 28 Oct 2024 11:43:42 +0100 Subject: [PATCH 08/16] sched: Initialize idle tasks only once Idle tasks are initialized via __sched_fork() twice: fork_idle() copy_process() sched_fork() __sched_fork() init_idle() __sched_fork() Instead of cleaning this up, sched_ext hacked around it. Even when analyis and solution were provided in a discussion, nobody cared to clean this up. init_idle() is also invoked from sched_init() to initialize the boot CPU's idle task, which requires the __sched_fork() invocation. But this can be trivially solved by invoking __sched_fork() before init_idle() in sched_init() and removing the __sched_fork() invocation from init_idle(). Do so and clean up the comments explaining this historical leftover. Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20241028103142.359584747@linutronix.de --- kernel/sched/core.c | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index c57a79e34911..aad48850c1ef 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -4423,7 +4423,8 @@ int wake_up_state(struct task_struct *p, unsigned int state) * Perform scheduler related setup for a newly forked process p. * p is forked by current. * - * __sched_fork() is basic setup used by init_idle() too: + * __sched_fork() is basic setup which is also used by sched_init() to + * initialize the boot CPU's idle task. */ static void __sched_fork(unsigned long clone_flags, struct task_struct *p) { @@ -7697,8 +7698,6 @@ void __init init_idle(struct task_struct *idle, int cpu) struct rq *rq = cpu_rq(cpu); unsigned long flags; - __sched_fork(0, idle); - raw_spin_lock_irqsave(&idle->pi_lock, flags); raw_spin_rq_lock(rq); @@ -7713,10 +7712,8 @@ void __init init_idle(struct task_struct *idle, int cpu) #ifdef CONFIG_SMP /* - * It's possible that init_idle() gets called multiple times on a task, - * in that case do_set_cpus_allowed() will not do the right thing. - * - * And since this is boot we can forgo the serialization. + * No validation and serialization required at boot time and for + * setting up the idle tasks of not yet online CPUs. */ set_cpus_allowed_common(idle, &ac); #endif @@ -8561,6 +8558,7 @@ void __init sched_init(void) * but because we are the idle thread, we just pick up running again * when this runqueue becomes "idle". */ + __sched_fork(0, current); init_idle(current, smp_processor_id()); calc_load_update = jiffies + LOAD_FREQ; -- 2.51.0 From 0f0d1b8e5010bfe1feeb4d78d137e41946a5370d Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 28 Oct 2024 14:20:35 +0100 Subject: [PATCH 09/16] sched/ext: Remove sched_fork() hack Instead of solving the underlying problem of the double invocation of __sched_fork() for idle tasks, sched-ext decided to hack around the issue by partially clearing out the entity struct to preserve the already enqueued node. A provided analysis and solution has been ignored for four months. Now that someone else has taken care of cleaning it up, remove the disgusting hack and clear out the full structure. Remove the comment in the structure declaration as well, as there is no requirement for @node being the last element anymore. Fixes: f0e1a0643a59 ("sched_ext: Implement BPF extensible scheduler class") Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Acked-by: Tejun Heo Link: https://lore.kernel.org/r/87ldy82wkc.ffs@tglx --- include/linux/sched/ext.h | 1 - kernel/sched/ext.c | 7 +------ 2 files changed, 1 insertion(+), 7 deletions(-) diff --git a/include/linux/sched/ext.h b/include/linux/sched/ext.h index 1ddbde64a31b..2799e7284fff 100644 --- a/include/linux/sched/ext.h +++ b/include/linux/sched/ext.h @@ -199,7 +199,6 @@ struct sched_ext_entity { #ifdef CONFIG_EXT_GROUP_SCHED struct cgroup *cgrp_moving_from; #endif - /* must be the last field, see init_scx_entity() */ struct list_head tasks_node; }; diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index 5900b06fd036..f6e9a14042d5 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -3548,12 +3548,7 @@ static void scx_ops_exit_task(struct task_struct *p) void init_scx_entity(struct sched_ext_entity *scx) { - /* - * init_idle() calls this function again after fork sequence is - * complete. Don't touch ->tasks_node as it's already linked. - */ - memset(scx, 0, offsetof(struct sched_ext_entity, tasks_node)); - + memset(scx, 0, sizeof(*scx)); INIT_LIST_HEAD(&scx->dsq_list.node); RB_CLEAR_NODE(&scx->dsq_priq); scx->sticky_cpu = -1; -- 2.51.0 From 26baa1f1c4bdc34b8d698c1900b407d863ad0e69 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 4 Oct 2024 14:47:02 +0200 Subject: [PATCH 10/16] sched: Add TIF_NEED_RESCHED_LAZY infrastructure Add the basic infrastructure to split the TIF_NEED_RESCHED bit in two. Either bit will cause a resched on return-to-user, but only TIF_NEED_RESCHED will drive IRQ preemption. No behavioural change intended. Suggested-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Sebastian Andrzej Siewior Link: https://lkml.kernel.org/r/20241007075055.219540785@infradead.org --- include/linux/entry-common.h | 3 ++- include/linux/entry-kvm.h | 5 +++-- include/linux/sched.h | 3 ++- include/linux/thread_info.h | 21 +++++++++++++++++---- kernel/entry/common.c | 2 +- kernel/entry/kvm.c | 4 ++-- kernel/sched/core.c | 34 +++++++++++++++++++++------------- 7 files changed, 48 insertions(+), 24 deletions(-) diff --git a/include/linux/entry-common.h b/include/linux/entry-common.h index 1e50cdb83ae5..fc61d0205c97 100644 --- a/include/linux/entry-common.h +++ b/include/linux/entry-common.h @@ -64,7 +64,8 @@ #define EXIT_TO_USER_MODE_WORK \ (_TIF_SIGPENDING | _TIF_NOTIFY_RESUME | _TIF_UPROBE | \ - _TIF_NEED_RESCHED | _TIF_PATCH_PENDING | _TIF_NOTIFY_SIGNAL | \ + _TIF_NEED_RESCHED | _TIF_NEED_RESCHED_LAZY | \ + _TIF_PATCH_PENDING | _TIF_NOTIFY_SIGNAL | \ ARCH_EXIT_TO_USER_MODE_WORK) /** diff --git a/include/linux/entry-kvm.h b/include/linux/entry-kvm.h index 6813171afccb..16149f6625e4 100644 --- a/include/linux/entry-kvm.h +++ b/include/linux/entry-kvm.h @@ -17,8 +17,9 @@ #endif #define XFER_TO_GUEST_MODE_WORK \ - (_TIF_NEED_RESCHED | _TIF_SIGPENDING | _TIF_NOTIFY_SIGNAL | \ - _TIF_NOTIFY_RESUME | ARCH_XFER_TO_GUEST_MODE_WORK) + (_TIF_NEED_RESCHED | _TIF_NEED_RESCHED_LAZY | _TIF_SIGPENDING | \ + _TIF_NOTIFY_SIGNAL | _TIF_NOTIFY_RESUME | \ + ARCH_XFER_TO_GUEST_MODE_WORK) struct kvm_vcpu; diff --git a/include/linux/sched.h b/include/linux/sched.h index a76e3d074a2a..1d5cc3e50884 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2002,7 +2002,8 @@ static inline void set_tsk_need_resched(struct task_struct *tsk) static inline void clear_tsk_need_resched(struct task_struct *tsk) { - clear_tsk_thread_flag(tsk,TIF_NEED_RESCHED); + atomic_long_andnot(_TIF_NEED_RESCHED | _TIF_NEED_RESCHED_LAZY, + (atomic_long_t *)&task_thread_info(tsk)->flags); } static inline int test_tsk_need_resched(struct task_struct *tsk) diff --git a/include/linux/thread_info.h b/include/linux/thread_info.h index 9ea0b28068f4..cf2446c9c30d 100644 --- a/include/linux/thread_info.h +++ b/include/linux/thread_info.h @@ -59,6 +59,14 @@ enum syscall_work_bit { #include +#ifndef TIF_NEED_RESCHED_LAZY +#ifdef CONFIG_ARCH_HAS_PREEMPT_LAZY +#error Inconsistent PREEMPT_LAZY +#endif +#define TIF_NEED_RESCHED_LAZY TIF_NEED_RESCHED +#define _TIF_NEED_RESCHED_LAZY _TIF_NEED_RESCHED +#endif + #ifdef __KERNEL__ #ifndef arch_set_restart_data @@ -179,22 +187,27 @@ static __always_inline unsigned long read_ti_thread_flags(struct thread_info *ti #ifdef _ASM_GENERIC_BITOPS_INSTRUMENTED_NON_ATOMIC_H -static __always_inline bool tif_need_resched(void) +static __always_inline bool tif_test_bit(int bit) { - return arch_test_bit(TIF_NEED_RESCHED, + return arch_test_bit(bit, (unsigned long *)(¤t_thread_info()->flags)); } #else -static __always_inline bool tif_need_resched(void) +static __always_inline bool tif_test_bit(int bit) { - return test_bit(TIF_NEED_RESCHED, + return test_bit(bit, (unsigned long *)(¤t_thread_info()->flags)); } #endif /* _ASM_GENERIC_BITOPS_INSTRUMENTED_NON_ATOMIC_H */ +static __always_inline bool tif_need_resched(void) +{ + return tif_test_bit(TIF_NEED_RESCHED); +} + #ifndef CONFIG_HAVE_ARCH_WITHIN_STACK_FRAMES static inline int arch_within_stack_frames(const void * const stack, const void * const stackend, diff --git a/kernel/entry/common.c b/kernel/entry/common.c index 5b6934e23c21..e33691d5adf7 100644 --- a/kernel/entry/common.c +++ b/kernel/entry/common.c @@ -98,7 +98,7 @@ __always_inline unsigned long exit_to_user_mode_loop(struct pt_regs *regs, local_irq_enable_exit_to_user(ti_work); - if (ti_work & _TIF_NEED_RESCHED) + if (ti_work & (_TIF_NEED_RESCHED | _TIF_NEED_RESCHED_LAZY)) schedule(); if (ti_work & _TIF_UPROBE) diff --git a/kernel/entry/kvm.c b/kernel/entry/kvm.c index 2e0f75bcb7fd..8485f63863af 100644 --- a/kernel/entry/kvm.c +++ b/kernel/entry/kvm.c @@ -13,7 +13,7 @@ static int xfer_to_guest_mode_work(struct kvm_vcpu *vcpu, unsigned long ti_work) return -EINTR; } - if (ti_work & _TIF_NEED_RESCHED) + if (ti_work & (_TIF_NEED_RESCHED | _TIF_NEED_RESCHED_LAZY)) schedule(); if (ti_work & _TIF_NOTIFY_RESUME) @@ -24,7 +24,7 @@ static int xfer_to_guest_mode_work(struct kvm_vcpu *vcpu, unsigned long ti_work) return ret; ti_work = read_thread_flags(); - } while (ti_work & XFER_TO_GUEST_MODE_WORK || need_resched()); + } while (ti_work & XFER_TO_GUEST_MODE_WORK); return 0; } diff --git a/kernel/sched/core.c b/kernel/sched/core.c index aad48850c1ef..0cd05e36b6b6 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -941,10 +941,9 @@ static inline void hrtick_rq_init(struct rq *rq) * this avoids any races wrt polling state changes and thereby avoids * spurious IPIs. */ -static inline bool set_nr_and_not_polling(struct task_struct *p) +static inline bool set_nr_and_not_polling(struct thread_info *ti, int tif) { - struct thread_info *ti = task_thread_info(p); - return !(fetch_or(&ti->flags, _TIF_NEED_RESCHED) & _TIF_POLLING_NRFLAG); + return !(fetch_or(&ti->flags, 1 << tif) & _TIF_POLLING_NRFLAG); } /* @@ -969,9 +968,9 @@ static bool set_nr_if_polling(struct task_struct *p) } #else -static inline bool set_nr_and_not_polling(struct task_struct *p) +static inline bool set_nr_and_not_polling(struct thread_info *ti, int tif) { - set_tsk_need_resched(p); + set_ti_thread_flag(ti, tif); return true; } @@ -1076,28 +1075,37 @@ void wake_up_q(struct wake_q_head *head) * might also involve a cross-CPU call to trigger the scheduler on * the target CPU. */ -void resched_curr(struct rq *rq) +static void __resched_curr(struct rq *rq, int tif) { struct task_struct *curr = rq->curr; + struct thread_info *cti = task_thread_info(curr); int cpu; lockdep_assert_rq_held(rq); - if (test_tsk_need_resched(curr)) + if (cti->flags & ((1 << tif) | _TIF_NEED_RESCHED)) return; cpu = cpu_of(rq); if (cpu == smp_processor_id()) { - set_tsk_need_resched(curr); - set_preempt_need_resched(); + set_ti_thread_flag(cti, tif); + if (tif == TIF_NEED_RESCHED) + set_preempt_need_resched(); return; } - if (set_nr_and_not_polling(curr)) - smp_send_reschedule(cpu); - else + if (set_nr_and_not_polling(cti, tif)) { + if (tif == TIF_NEED_RESCHED) + smp_send_reschedule(cpu); + } else { trace_sched_wake_idle_without_ipi(cpu); + } +} + +void resched_curr(struct rq *rq) +{ + __resched_curr(rq, TIF_NEED_RESCHED); } void resched_cpu(int cpu) @@ -1192,7 +1200,7 @@ static void wake_up_idle_cpu(int cpu) * and testing of the above solutions didn't appear to report * much benefits. */ - if (set_nr_and_not_polling(rq->idle)) + if (set_nr_and_not_polling(task_thread_info(rq->idle), TIF_NEED_RESCHED)) smp_send_reschedule(cpu); else trace_sched_wake_idle_without_ipi(cpu); -- 2.51.0 From 7c70cb94d29cd325fabe4a818c18613e3b9919a1 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 4 Oct 2024 14:46:58 +0200 Subject: [PATCH 11/16] sched: Add Lazy preemption model Change fair to use resched_curr_lazy(), which, when the lazy preemption model is selected, will set TIF_NEED_RESCHED_LAZY. This LAZY bit will be promoted to the full NEED_RESCHED bit on tick. As such, the average delay between setting LAZY and actually rescheduling will be TICK_NSEC/2. In short, Lazy preemption will delay preemption for fair class but will function as Full preemption for all the other classes, most notably the realtime (RR/FIFO/DEADLINE) classes. The goal is to bridge the performance gap with Voluntary, such that we might eventually remove that option entirely. Suggested-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Sebastian Andrzej Siewior Link: https://lkml.kernel.org/r/20241007075055.331243614@infradead.org --- include/linux/preempt.h | 8 ++++- kernel/Kconfig.preempt | 15 ++++++++ kernel/sched/core.c | 80 +++++++++++++++++++++++++++++++++++++++-- kernel/sched/debug.c | 5 +-- kernel/sched/fair.c | 6 ++-- kernel/sched/sched.h | 1 + 6 files changed, 107 insertions(+), 8 deletions(-) diff --git a/include/linux/preempt.h b/include/linux/preempt.h index ce76f1a45722..ca86235ac15c 100644 --- a/include/linux/preempt.h +++ b/include/linux/preempt.h @@ -486,6 +486,7 @@ DEFINE_LOCK_GUARD_0(migrate, migrate_disable(), migrate_enable()) extern bool preempt_model_none(void); extern bool preempt_model_voluntary(void); extern bool preempt_model_full(void); +extern bool preempt_model_lazy(void); #else @@ -502,6 +503,11 @@ static inline bool preempt_model_full(void) return IS_ENABLED(CONFIG_PREEMPT); } +static inline bool preempt_model_lazy(void) +{ + return IS_ENABLED(CONFIG_PREEMPT_LAZY); +} + #endif static inline bool preempt_model_rt(void) @@ -519,7 +525,7 @@ static inline bool preempt_model_rt(void) */ static inline bool preempt_model_preemptible(void) { - return preempt_model_full() || preempt_model_rt(); + return preempt_model_full() || preempt_model_lazy() || preempt_model_rt(); } #endif /* __LINUX_PREEMPT_H */ diff --git a/kernel/Kconfig.preempt b/kernel/Kconfig.preempt index fe782cd77388..09f06d8964cf 100644 --- a/kernel/Kconfig.preempt +++ b/kernel/Kconfig.preempt @@ -11,6 +11,9 @@ config PREEMPT_BUILD select PREEMPTION select UNINLINE_SPIN_UNLOCK if !ARCH_INLINE_SPIN_UNLOCK +config ARCH_HAS_PREEMPT_LAZY + bool + choice prompt "Preemption Model" default PREEMPT_NONE @@ -67,6 +70,18 @@ config PREEMPT embedded system with latency requirements in the milliseconds range. +config PREEMPT_LAZY + bool "Scheduler controlled preemption model" + depends on !ARCH_NO_PREEMPT + depends on ARCH_HAS_PREEMPT_LAZY + select PREEMPT_BUILD + help + This option provides a scheduler driven preemption model that + is fundamentally similar to full preemption, but is less + eager to preempt SCHED_NORMAL tasks in an attempt to + reduce lock holder preemption and recover some of the performance + gains seen from using Voluntary preemption. + config PREEMPT_RT bool "Fully Preemptible Kernel (Real-Time)" depends on EXPERT && ARCH_SUPPORTS_RT diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 0cd05e36b6b6..df6a34d27d2b 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1083,6 +1083,13 @@ static void __resched_curr(struct rq *rq, int tif) lockdep_assert_rq_held(rq); + /* + * Always immediately preempt the idle task; no point in delaying doing + * actual work. + */ + if (is_idle_task(curr) && tif == TIF_NEED_RESCHED_LAZY) + tif = TIF_NEED_RESCHED; + if (cti->flags & ((1 << tif) | _TIF_NEED_RESCHED)) return; @@ -1108,6 +1115,32 @@ void resched_curr(struct rq *rq) __resched_curr(rq, TIF_NEED_RESCHED); } +#ifdef CONFIG_PREEMPT_DYNAMIC +static DEFINE_STATIC_KEY_FALSE(sk_dynamic_preempt_lazy); +static __always_inline bool dynamic_preempt_lazy(void) +{ + return static_branch_unlikely(&sk_dynamic_preempt_lazy); +} +#else +static __always_inline bool dynamic_preempt_lazy(void) +{ + return IS_ENABLED(CONFIG_PREEMPT_LAZY); +} +#endif + +static __always_inline int get_lazy_tif_bit(void) +{ + if (dynamic_preempt_lazy()) + return TIF_NEED_RESCHED_LAZY; + + return TIF_NEED_RESCHED; +} + +void resched_curr_lazy(struct rq *rq) +{ + __resched_curr(rq, get_lazy_tif_bit()); +} + void resched_cpu(int cpu) { struct rq *rq = cpu_rq(cpu); @@ -5612,6 +5645,10 @@ void sched_tick(void) update_rq_clock(rq); hw_pressure = arch_scale_hw_pressure(cpu_of(rq)); update_hw_load_avg(rq_clock_task(rq), rq, hw_pressure); + + if (dynamic_preempt_lazy() && tif_test_bit(TIF_NEED_RESCHED_LAZY)) + resched_curr(rq); + donor->sched_class->task_tick(rq, donor, 0); if (sched_feat(LATENCY_WARN)) resched_latency = cpu_resched_latency(rq); @@ -7374,6 +7411,7 @@ EXPORT_SYMBOL(__cond_resched_rwlock_write); * preempt_schedule <- NOP * preempt_schedule_notrace <- NOP * irqentry_exit_cond_resched <- NOP + * dynamic_preempt_lazy <- false * * VOLUNTARY: * cond_resched <- __cond_resched @@ -7381,6 +7419,7 @@ EXPORT_SYMBOL(__cond_resched_rwlock_write); * preempt_schedule <- NOP * preempt_schedule_notrace <- NOP * irqentry_exit_cond_resched <- NOP + * dynamic_preempt_lazy <- false * * FULL: * cond_resched <- RET0 @@ -7388,6 +7427,15 @@ EXPORT_SYMBOL(__cond_resched_rwlock_write); * preempt_schedule <- preempt_schedule * preempt_schedule_notrace <- preempt_schedule_notrace * irqentry_exit_cond_resched <- irqentry_exit_cond_resched + * dynamic_preempt_lazy <- false + * + * LAZY: + * cond_resched <- RET0 + * might_resched <- RET0 + * preempt_schedule <- preempt_schedule + * preempt_schedule_notrace <- preempt_schedule_notrace + * irqentry_exit_cond_resched <- irqentry_exit_cond_resched + * dynamic_preempt_lazy <- true */ enum { @@ -7395,6 +7443,7 @@ enum { preempt_dynamic_none, preempt_dynamic_voluntary, preempt_dynamic_full, + preempt_dynamic_lazy, }; int preempt_dynamic_mode = preempt_dynamic_undefined; @@ -7410,15 +7459,23 @@ int sched_dynamic_mode(const char *str) if (!strcmp(str, "full")) return preempt_dynamic_full; +#ifdef CONFIG_ARCH_HAS_PREEMPT_LAZY + if (!strcmp(str, "lazy")) + return preempt_dynamic_lazy; +#endif + return -EINVAL; } +#define preempt_dynamic_key_enable(f) static_key_enable(&sk_dynamic_##f.key) +#define preempt_dynamic_key_disable(f) static_key_disable(&sk_dynamic_##f.key) + #if defined(CONFIG_HAVE_PREEMPT_DYNAMIC_CALL) #define preempt_dynamic_enable(f) static_call_update(f, f##_dynamic_enabled) #define preempt_dynamic_disable(f) static_call_update(f, f##_dynamic_disabled) #elif defined(CONFIG_HAVE_PREEMPT_DYNAMIC_KEY) -#define preempt_dynamic_enable(f) static_key_enable(&sk_dynamic_##f.key) -#define preempt_dynamic_disable(f) static_key_disable(&sk_dynamic_##f.key) +#define preempt_dynamic_enable(f) preempt_dynamic_key_enable(f) +#define preempt_dynamic_disable(f) preempt_dynamic_key_disable(f) #else #error "Unsupported PREEMPT_DYNAMIC mechanism" #endif @@ -7438,6 +7495,7 @@ static void __sched_dynamic_update(int mode) preempt_dynamic_enable(preempt_schedule); preempt_dynamic_enable(preempt_schedule_notrace); preempt_dynamic_enable(irqentry_exit_cond_resched); + preempt_dynamic_key_disable(preempt_lazy); switch (mode) { case preempt_dynamic_none: @@ -7447,6 +7505,7 @@ static void __sched_dynamic_update(int mode) preempt_dynamic_disable(preempt_schedule); preempt_dynamic_disable(preempt_schedule_notrace); preempt_dynamic_disable(irqentry_exit_cond_resched); + preempt_dynamic_key_disable(preempt_lazy); if (mode != preempt_dynamic_mode) pr_info("Dynamic Preempt: none\n"); break; @@ -7458,6 +7517,7 @@ static void __sched_dynamic_update(int mode) preempt_dynamic_disable(preempt_schedule); preempt_dynamic_disable(preempt_schedule_notrace); preempt_dynamic_disable(irqentry_exit_cond_resched); + preempt_dynamic_key_disable(preempt_lazy); if (mode != preempt_dynamic_mode) pr_info("Dynamic Preempt: voluntary\n"); break; @@ -7469,9 +7529,22 @@ static void __sched_dynamic_update(int mode) preempt_dynamic_enable(preempt_schedule); preempt_dynamic_enable(preempt_schedule_notrace); preempt_dynamic_enable(irqentry_exit_cond_resched); + preempt_dynamic_key_disable(preempt_lazy); if (mode != preempt_dynamic_mode) pr_info("Dynamic Preempt: full\n"); break; + + case preempt_dynamic_lazy: + if (!klp_override) + preempt_dynamic_disable(cond_resched); + preempt_dynamic_disable(might_resched); + preempt_dynamic_enable(preempt_schedule); + preempt_dynamic_enable(preempt_schedule_notrace); + preempt_dynamic_enable(irqentry_exit_cond_resched); + preempt_dynamic_key_enable(preempt_lazy); + if (mode != preempt_dynamic_mode) + pr_info("Dynamic Preempt: lazy\n"); + break; } preempt_dynamic_mode = mode; @@ -7534,6 +7607,8 @@ static void __init preempt_dynamic_init(void) sched_dynamic_update(preempt_dynamic_none); } else if (IS_ENABLED(CONFIG_PREEMPT_VOLUNTARY)) { sched_dynamic_update(preempt_dynamic_voluntary); + } else if (IS_ENABLED(CONFIG_PREEMPT_LAZY)) { + sched_dynamic_update(preempt_dynamic_lazy); } else { /* Default static call setting, nothing to do */ WARN_ON_ONCE(!IS_ENABLED(CONFIG_PREEMPT)); @@ -7554,6 +7629,7 @@ static void __init preempt_dynamic_init(void) PREEMPT_MODEL_ACCESSOR(none); PREEMPT_MODEL_ACCESSOR(voluntary); PREEMPT_MODEL_ACCESSOR(full); +PREEMPT_MODEL_ACCESSOR(lazy); #else /* !CONFIG_PREEMPT_DYNAMIC: */ diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index f4035c7a0fa1..44a49f90b05f 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -245,11 +245,12 @@ static ssize_t sched_dynamic_write(struct file *filp, const char __user *ubuf, static int sched_dynamic_show(struct seq_file *m, void *v) { static const char * preempt_modes[] = { - "none", "voluntary", "full" + "none", "voluntary", "full", "lazy", }; + int j = ARRAY_SIZE(preempt_modes) - !IS_ENABLED(CONFIG_ARCH_HAS_PREEMPT_LAZY); int i; - for (i = 0; i < ARRAY_SIZE(preempt_modes); i++) { + for (i = 0; i < j; i++) { if (preempt_dynamic_mode == i) seq_puts(m, "("); seq_puts(m, preempt_modes[i]); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 6512258dc71f..3356315d7e64 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1251,7 +1251,7 @@ static void update_curr(struct cfs_rq *cfs_rq) return; if (resched || did_preempt_short(cfs_rq, curr)) { - resched_curr(rq); + resched_curr_lazy(rq); clear_buddies(cfs_rq, curr); } } @@ -5677,7 +5677,7 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued) * validating it and just reschedule. */ if (queued) { - resched_curr(rq_of(cfs_rq)); + resched_curr_lazy(rq_of(cfs_rq)); return; } #endif @@ -8829,7 +8829,7 @@ static void check_preempt_wakeup_fair(struct rq *rq, struct task_struct *p, int return; preempt: - resched_curr(rq); + resched_curr_lazy(rq); } static struct task_struct *pick_task_fair(struct rq *rq) diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index e51bf5a344d3..090dd4b38fa2 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -2689,6 +2689,7 @@ extern void init_sched_rt_class(void); extern void init_sched_fair_class(void); extern void resched_curr(struct rq *rq); +extern void resched_curr_lazy(struct rq *rq); extern void resched_cpu(int cpu); extern void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime); -- 2.51.0 From 35772d627b55cc7fb4f33bae57c564a25b3121a9 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 4 Oct 2024 14:46:56 +0200 Subject: [PATCH 12/16] sched: Enable PREEMPT_DYNAMIC for PREEMPT_RT In order to enable PREEMPT_DYNAMIC for PREEMPT_RT, remove PREEMPT_RT from the 'Preemption Model' choice. Strictly speaking PREEMPT_RT is not a change in how preemption works, but rather it makes a ton more code preemptible. Notably, take away NONE and VOLUNTARY options for PREEMPT_RT, they make no sense (but are techincally possible). Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Sebastian Andrzej Siewior Link: https://lkml.kernel.org/r/20241007075055.441622332@infradead.org --- kernel/Kconfig.preempt | 12 +++++++----- kernel/sched/core.c | 2 ++ kernel/sched/debug.c | 4 ++-- 3 files changed, 11 insertions(+), 7 deletions(-) diff --git a/kernel/Kconfig.preempt b/kernel/Kconfig.preempt index 09f06d8964cf..7c1b29a3a491 100644 --- a/kernel/Kconfig.preempt +++ b/kernel/Kconfig.preempt @@ -20,6 +20,7 @@ choice config PREEMPT_NONE bool "No Forced Preemption (Server)" + depends on !PREEMPT_RT select PREEMPT_NONE_BUILD if !PREEMPT_DYNAMIC help This is the traditional Linux preemption model, geared towards @@ -35,6 +36,7 @@ config PREEMPT_NONE config PREEMPT_VOLUNTARY bool "Voluntary Kernel Preemption (Desktop)" depends on !ARCH_NO_PREEMPT + depends on !PREEMPT_RT select PREEMPT_VOLUNTARY_BUILD if !PREEMPT_DYNAMIC help This option reduces the latency of the kernel by adding more @@ -54,7 +56,7 @@ config PREEMPT_VOLUNTARY config PREEMPT bool "Preemptible Kernel (Low-Latency Desktop)" depends on !ARCH_NO_PREEMPT - select PREEMPT_BUILD + select PREEMPT_BUILD if !PREEMPT_DYNAMIC help This option reduces the latency of the kernel by making all kernel code (that is not executing in a critical section) @@ -74,7 +76,7 @@ config PREEMPT_LAZY bool "Scheduler controlled preemption model" depends on !ARCH_NO_PREEMPT depends on ARCH_HAS_PREEMPT_LAZY - select PREEMPT_BUILD + select PREEMPT_BUILD if !PREEMPT_DYNAMIC help This option provides a scheduler driven preemption model that is fundamentally similar to full preemption, but is less @@ -82,6 +84,8 @@ config PREEMPT_LAZY reduce lock holder preemption and recover some of the performance gains seen from using Voluntary preemption. +endchoice + config PREEMPT_RT bool "Fully Preemptible Kernel (Real-Time)" depends on EXPERT && ARCH_SUPPORTS_RT @@ -99,8 +103,6 @@ config PREEMPT_RT Select this if you are building a kernel for systems which require real-time guarantees. -endchoice - config PREEMPT_COUNT bool @@ -110,7 +112,7 @@ config PREEMPTION config PREEMPT_DYNAMIC bool "Preemption behaviour defined on boot" - depends on HAVE_PREEMPT_DYNAMIC && !PREEMPT_RT + depends on HAVE_PREEMPT_DYNAMIC select JUMP_LABEL if HAVE_PREEMPT_DYNAMIC_KEY select PREEMPT_BUILD default y if HAVE_PREEMPT_DYNAMIC_CALL diff --git a/kernel/sched/core.c b/kernel/sched/core.c index df6a34d27d2b..5c47d70f4204 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -7450,11 +7450,13 @@ int preempt_dynamic_mode = preempt_dynamic_undefined; int sched_dynamic_mode(const char *str) { +#ifndef CONFIG_PREEMPT_RT if (!strcmp(str, "none")) return preempt_dynamic_none; if (!strcmp(str, "voluntary")) return preempt_dynamic_voluntary; +#endif if (!strcmp(str, "full")) return preempt_dynamic_full; diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 44a49f90b05f..a48b2a701ec2 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -248,9 +248,9 @@ static int sched_dynamic_show(struct seq_file *m, void *v) "none", "voluntary", "full", "lazy", }; int j = ARRAY_SIZE(preempt_modes) - !IS_ENABLED(CONFIG_ARCH_HAS_PREEMPT_LAZY); - int i; + int i = IS_ENABLED(CONFIG_PREEMPT_RT) * 2; - for (i = 0; i < j; i++) { + for (; i < j; i++) { if (preempt_dynamic_mode == i) seq_puts(m, "("); seq_puts(m, preempt_modes[i]); -- 2.51.0 From 476e8583ca16eecec0a3a28b6ee7130f4e369389 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 4 Oct 2024 14:46:54 +0200 Subject: [PATCH 13/16] sched, x86: Enable Lazy preemption Add the TIF bit and select the Kconfig symbol to make it go. Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Sebastian Andrzej Siewior Link: https://lkml.kernel.org/r/20241007075055.555778919@infradead.org --- arch/x86/Kconfig | 1 + arch/x86/include/asm/thread_info.h | 6 ++++-- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 2852fcd82cbd..b76aa7f20710 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -93,6 +93,7 @@ config X86 select ARCH_HAS_NMI_SAFE_THIS_CPU_OPS select ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE select ARCH_HAS_PMEM_API if X86_64 + select ARCH_HAS_PREEMPT_LAZY select ARCH_HAS_PTE_DEVMAP if X86_64 select ARCH_HAS_PTE_SPECIAL select ARCH_HAS_HW_PTE_YOUNG diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h index 12da7dfd5ef1..75bb390f7baf 100644 --- a/arch/x86/include/asm/thread_info.h +++ b/arch/x86/include/asm/thread_info.h @@ -87,8 +87,9 @@ struct thread_info { #define TIF_NOTIFY_RESUME 1 /* callback before returning to user */ #define TIF_SIGPENDING 2 /* signal pending */ #define TIF_NEED_RESCHED 3 /* rescheduling necessary */ -#define TIF_SINGLESTEP 4 /* reenable singlestep on user return*/ -#define TIF_SSBD 5 /* Speculative store bypass disable */ +#define TIF_NEED_RESCHED_LAZY 4 /* rescheduling necessary */ +#define TIF_SINGLESTEP 5 /* reenable singlestep on user return*/ +#define TIF_SSBD 6 /* Speculative store bypass disable */ #define TIF_SPEC_IB 9 /* Indirect branch speculation mitigation */ #define TIF_SPEC_L1D_FLUSH 10 /* Flush L1D on mm switches (processes) */ #define TIF_USER_RETURN_NOTIFY 11 /* notify kernel of userspace return */ @@ -110,6 +111,7 @@ struct thread_info { #define _TIF_NOTIFY_RESUME (1 << TIF_NOTIFY_RESUME) #define _TIF_SIGPENDING (1 << TIF_SIGPENDING) #define _TIF_NEED_RESCHED (1 << TIF_NEED_RESCHED) +#define _TIF_NEED_RESCHED_LAZY (1 << TIF_NEED_RESCHED_LAZY) #define _TIF_SINGLESTEP (1 << TIF_SINGLESTEP) #define _TIF_SSBD (1 << TIF_SSBD) #define _TIF_SPEC_IB (1 << TIF_SPEC_IB) -- 2.51.0 From 22aaec357c1ff85b72c105c90503e3b4187384b8 Mon Sep 17 00:00:00 2001 From: Jisheng Zhang Date: Mon, 21 Oct 2024 17:08:42 +0200 Subject: [PATCH 14/16] riscv: add PREEMPT_LAZY support riscv has switched to GENERIC_ENTRY, so adding PREEMPT_LAZY is as simple as adding TIF_NEED_RESCHED_LAZY related definitions and enabling ARCH_HAS_PREEMPT_LAZY. [bigeasy: Replace old PREEMPT_AUTO bits with new PREEMPT_LAZY ] Signed-off-by: Jisheng Zhang Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Peter Zijlstra (Intel) Acked-by: Palmer Dabbelt Link: https://lkml.kernel.org/r/20241021151257.102296-4-bigeasy@linutronix.de --- arch/riscv/Kconfig | 1 + arch/riscv/include/asm/thread_info.h | 10 ++++++---- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig index 62545946ecf4..3516c5848061 100644 --- a/arch/riscv/Kconfig +++ b/arch/riscv/Kconfig @@ -39,6 +39,7 @@ config RISCV select ARCH_HAS_MMIOWB select ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE select ARCH_HAS_PMEM_API + select ARCH_HAS_PREEMPT_LAZY select ARCH_HAS_PREPARE_SYNC_CORE_CMD select ARCH_HAS_PTE_DEVMAP if 64BIT && MMU select ARCH_HAS_PTE_SPECIAL diff --git a/arch/riscv/include/asm/thread_info.h b/arch/riscv/include/asm/thread_info.h index 9c10fb180f43..f5916a70879a 100644 --- a/arch/riscv/include/asm/thread_info.h +++ b/arch/riscv/include/asm/thread_info.h @@ -107,9 +107,10 @@ int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src); * - pending work-to-be-done flags are in lowest half-word * - other flags in upper half-word(s) */ -#define TIF_NOTIFY_RESUME 1 /* callback before returning to user */ -#define TIF_SIGPENDING 2 /* signal pending */ -#define TIF_NEED_RESCHED 3 /* rescheduling necessary */ +#define TIF_NEED_RESCHED 0 /* rescheduling necessary */ +#define TIF_NEED_RESCHED_LAZY 1 /* Lazy rescheduling needed */ +#define TIF_NOTIFY_RESUME 2 /* callback before returning to user */ +#define TIF_SIGPENDING 3 /* signal pending */ #define TIF_RESTORE_SIGMASK 4 /* restore signal mask in do_signal() */ #define TIF_MEMDIE 5 /* is terminating due to OOM killer */ #define TIF_NOTIFY_SIGNAL 9 /* signal notifications exist */ @@ -117,9 +118,10 @@ int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src); #define TIF_32BIT 11 /* compat-mode 32bit process */ #define TIF_RISCV_V_DEFER_RESTORE 12 /* restore Vector before returing to user */ +#define _TIF_NEED_RESCHED (1 << TIF_NEED_RESCHED) +#define _TIF_NEED_RESCHED_LAZY (1 << TIF_NEED_RESCHED_LAZY) #define _TIF_NOTIFY_RESUME (1 << TIF_NOTIFY_RESUME) #define _TIF_SIGPENDING (1 << TIF_SIGPENDING) -#define _TIF_NEED_RESCHED (1 << TIF_NEED_RESCHED) #define _TIF_NOTIFY_SIGNAL (1 << TIF_NOTIFY_SIGNAL) #define _TIF_UPROBE (1 << TIF_UPROBE) #define _TIF_RISCV_V_DEFER_RESTORE (1 << TIF_RISCV_V_DEFER_RESTORE) -- 2.51.0 From fe9beaaa802d44d881b165430b3239a9d7bebf30 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 7 Nov 2024 15:21:54 +0100 Subject: [PATCH 15/16] sched: No PREEMPT_RT=y for all{yes,mod}config While PREEMPT_RT is undoubtedly totally awesome, it does not, at this time, make sense to have all{yes,mod}config select it. Reported-by: Stephen Rothwell Fixes: 35772d627b55 ("sched: Enable PREEMPT_DYNAMIC for PREEMPT_RT") Signed-off-by: Peter Zijlstra (Intel) --- kernel/Kconfig.preempt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/Kconfig.preempt b/kernel/Kconfig.preempt index 7c1b29a3a491..54ea59ff8fbe 100644 --- a/kernel/Kconfig.preempt +++ b/kernel/Kconfig.preempt @@ -88,7 +88,7 @@ endchoice config PREEMPT_RT bool "Fully Preemptible Kernel (Real-Time)" - depends on EXPERT && ARCH_SUPPORTS_RT + depends on EXPERT && ARCH_SUPPORTS_RT && !COMPILE_TEST select PREEMPTION help This option turns the kernel into a real-time kernel by replacing -- 2.51.0 From 771d271b2b908cf660d6789bb4355ed553250edc Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Wed, 6 Nov 2024 17:24:49 +0100 Subject: [PATCH 16/16] sched, x86: Update the comment for TIF_NEED_RESCHED_LAZY. Add the "Lazy" part to the comment for TIF_NEED_RESCHED_LAZY so it is not the same as TIF_NEED_RESCHED. Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20241106162449.sk6rDddk@linutronix.de --- arch/x86/include/asm/thread_info.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h index 75bb390f7baf..a55c214f3ba6 100644 --- a/arch/x86/include/asm/thread_info.h +++ b/arch/x86/include/asm/thread_info.h @@ -87,7 +87,7 @@ struct thread_info { #define TIF_NOTIFY_RESUME 1 /* callback before returning to user */ #define TIF_SIGPENDING 2 /* signal pending */ #define TIF_NEED_RESCHED 3 /* rescheduling necessary */ -#define TIF_NEED_RESCHED_LAZY 4 /* rescheduling necessary */ +#define TIF_NEED_RESCHED_LAZY 4 /* Lazy rescheduling needed */ #define TIF_SINGLESTEP 5 /* reenable singlestep on user return*/ #define TIF_SSBD 6 /* Speculative store bypass disable */ #define TIF_SPEC_IB 9 /* Indirect branch speculation mitigation */ -- 2.51.0