From b1b46751671be5a426982f037a47ae05f37ff80b Mon Sep 17 00:00:00 2001
From: Linus Torvalds
Date: Fri, 18 Oct 2024 09:50:05 -0700
Subject: [PATCH 01/16] mm: fix follow_pfnmap API lockdep assert

The lockdep assert for the new follow_pfnmap() API "knows" that a
pfnmap always has a vma->vm_file, since that's the only way to create
such a mapping.

And that's actually true for all the normal cases. But not for the
mmap failure case, where the incomplete mapping is torn down and we
have cleared vma->vm_file because the failure occurred before the file
was linked to the vma.

So this codepath does actually need to check for vm_file being NULL.

Reported-by: Jann Horn
Fixes: 6da8e9634bb7 ("mm: new follow_pfnmap API")
Cc: Peter Xu
Cc: Andrew Morton
Signed-off-by: Linus Torvalds
---
 mm/memory.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/mm/memory.c b/mm/memory.c
index 30feedabc932..3ccee51adfbb 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -6350,7 +6350,8 @@ static inline void pfnmap_args_setup(struct follow_pfnmap_args *args,
 static inline void pfnmap_lockdep_assert(struct vm_area_struct *vma)
 {
 #ifdef CONFIG_LOCKDEP
-	struct address_space *mapping = vma->vm_file->f_mapping;
+	struct file *file = vma->vm_file;
+	struct address_space *mapping = file ? file->f_mapping : NULL;
 
 	if (mapping)
 		lockdep_assert(lockdep_is_held(&vma->vm_file->f_mapping->i_mmap_rwsem) ||
-- 
2.51.0

From f40998a8e6bbf0314b8416350183a537f9b59ca9 Mon Sep 17 00:00:00 2001
From: Luca Boccassi
Date: Fri, 27 Sep 2024 10:23:44 +0200
Subject: [PATCH 02/16] ipe: fallback to platform keyring also if key in
 trusted keyring is rejected

If enabled, we fall back to the platform keyring if the trusted
keyring doesn't have the key used to sign the ipe policy. But if
pkcs7_verify() rejects the key for other reasons, such as usage
restrictions, we do not fall back. Do so, following the same change in
dm-verity.

Signed-off-by: Luca Boccassi
Suggested-by: Serge Hallyn
[FW: fixed some line length issues and a typo in the commit message]
Signed-off-by: Fan Wu
---
 security/ipe/policy.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/security/ipe/policy.c b/security/ipe/policy.c
index 45f7d6a0ed23..b628f696e32b 100644
--- a/security/ipe/policy.c
+++ b/security/ipe/policy.c
@@ -178,7 +178,7 @@ struct ipe_policy *ipe_new_policy(const char *text, size_t textlen,
 				      VERIFYING_UNSPECIFIED_SIGNATURE,
 				      set_pkcs7_data, new);
 #ifdef CONFIG_IPE_POLICY_SIG_PLATFORM_KEYRING
-	if (rc == -ENOKEY)
+	if (rc == -ENOKEY || rc == -EKEYREJECTED)
 		rc = verify_pkcs7_signature(NULL, 0, new->pkcs7, pkcs7len,
 					    VERIFY_USE_PLATFORM_KEYRING,
 					    VERIFYING_UNSPECIFIED_SIGNATURE,
-- 
2.51.0

From 917a15c37d371bc40b5ad13df366e29bd49c04a1 Mon Sep 17 00:00:00 2001
From: Fan Wu
Date: Wed, 16 Oct 2024 16:43:05 -0700
Subject: [PATCH 03/16] MAINTAINERS: update IPE tree url and Fan Wu's email

Update the Integrity Policy Enforcement (IPE) LSM tree URL and the
maintainer's email to the newly issued kernel.org tree/email.

Signed-off-by: Fan Wu
---
 MAINTAINERS | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/MAINTAINERS b/MAINTAINERS
index 7ad507f49324..33b158cf52b4 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -11283,10 +11283,10 @@ F: security/integrity/
 F:	security/integrity/ima/
 
 INTEGRITY POLICY ENFORCEMENT (IPE)
-M:	Fan Wu
+M:	Fan Wu
 L:	linux-security-module@vger.kernel.org
 S:	Supported
-T:	git https://github.com/microsoft/ipe.git
+T:	git git://git.kernel.org/pub/scm/linux/kernel/git/wufan/ipe.git
 F:	Documentation/admin-guide/LSM/ipe.rst
 F:	Documentation/security/ipe.rst
 F:	scripts/ipe/
-- 
2.51.0

From 22a18935d7d96bbb1a28076f843c1926d0ba189e Mon Sep 17 00:00:00 2001
From: John Edwards
Date: Thu, 10 Oct 2024 23:09:23 +0000
Subject: [PATCH 04/16] Input: xpad - add support for MSI Claw A1M

Add MSI Claw A1M controller to xpad_device match table when in xinput
mode. Add MSI VID as XPAD_XBOX360_VENDOR.

Signed-off-by: John Edwards
Reviewed-by: Derek J. Clark
Reviewed-by: Christopher Snowhill
Link: https://lore.kernel.org/r/20241010232020.3292284-4-uejji@uejji.net
Cc: stable@vger.kernel.org
Signed-off-by: Dmitry Torokhov
---
 drivers/input/joystick/xpad.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/drivers/input/joystick/xpad.c b/drivers/input/joystick/xpad.c
index 30b4cca8b69f..22ea58bf76cb 100644
--- a/drivers/input/joystick/xpad.c
+++ b/drivers/input/joystick/xpad.c
@@ -218,6 +218,7 @@ static const struct xpad_device {
 	{ 0x0c12, 0x8810, "Zeroplus Xbox Controller", 0, XTYPE_XBOX },
 	{ 0x0c12, 0x9902, "HAMA VibraX - *FAULTY HARDWARE*", 0, XTYPE_XBOX },
 	{ 0x0d2f, 0x0002, "Andamiro Pump It Up pad", MAP_DPAD_TO_BUTTONS, XTYPE_XBOX },
+	{ 0x0db0, 0x1901, "Micro Star International Xbox360 Controller for Windows", 0, XTYPE_XBOX360 },
 	{ 0x0e4c, 0x1097, "Radica Gamester Controller", 0, XTYPE_XBOX },
 	{ 0x0e4c, 0x1103, "Radica Gamester Reflex", MAP_TRIGGERS_TO_BUTTONS, XTYPE_XBOX },
 	{ 0x0e4c, 0x2390, "Radica Games Jtech Controller", 0, XTYPE_XBOX },
@@ -493,6 +494,7 @@ static const struct usb_device_id xpad_table[] = {
 	XPAD_XBOX360_VENDOR(0x07ff),		/* Mad Catz Gamepad */
 	XPAD_XBOXONE_VENDOR(0x0b05),		/* ASUS controllers */
 	XPAD_XBOX360_VENDOR(0x0c12),		/* Zeroplus X-Box 360 controllers */
+	XPAD_XBOX360_VENDOR(0x0db0),		/* Micro Star International X-Box 360 controllers */
 	XPAD_XBOX360_VENDOR(0x0e6f),		/* 0x0e6f Xbox 360 controllers */
 	XPAD_XBOXONE_VENDOR(0x0e6f),		/* 0x0e6f Xbox One controllers */
 	XPAD_XBOX360_VENDOR(0x0f0d),		/* Hori controllers */
-- 
2.51.0

From 2de01e0e57f3ebe7f90b08f6bca5ce0f3da3829f Mon Sep 17 00:00:00 2001
From: Nikita Travkin
Date: Fri, 4 Oct 2024 21:17:30 +0500
Subject: [PATCH 05/16] Input: zinitix - don't fail if linux,keycodes prop is
 absent

When initially adding the touchkey support, a mistake was made in the
property parsing code. The possible negative errno from
device_property_count_u32() was never checked, which was an oversight
left from converting to it from the of_property as part of the review
fixes.

Re-add the correct handling of the absent property, in which case zero
touchkeys should be assumed, which would disable the feature.

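For illustration, the resulting parsing logic reduces to the following
sketch (hypothetical names: "dev" stands in for the driver's device
pointer and MAX_KEYS for the size of the driver's keycode array):

	/* An absent optional property means "no touchkeys", not an error. */
	int nr_keys = 0;

	if (device_property_present(dev, "linux,keycodes")) {
		nr_keys = device_property_count_u32(dev, "linux,keycodes");
		if (nr_keys < 0)		/* present but malformed */
			return nr_keys;
		if (nr_keys > MAX_KEYS)		/* more keys than supported */
			return -EINVAL;
	}
	/* nr_keys == 0 leaves the touchkey feature disabled. */
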
Reported-by: Jakob Hauser
Tested-by: Jakob Hauser
Fixes: 075d9b22c8fe ("Input: zinitix - add touchkey support")
Reviewed-by: Linus Walleij
Signed-off-by: Nikita Travkin
Tested-by: Yassine Oudjana
Link: https://lore.kernel.org/r/20241004-zinitix-no-keycodes-v2-1-876dc9fea4b6@trvn.ru
Signed-off-by: Dmitry Torokhov
---
 drivers/input/touchscreen/zinitix.c | 34 +++++++++++++++++++----------
 1 file changed, 22 insertions(+), 12 deletions(-)

diff --git a/drivers/input/touchscreen/zinitix.c b/drivers/input/touchscreen/zinitix.c
index 52b3950460e2..716d6fa60f86 100644
--- a/drivers/input/touchscreen/zinitix.c
+++ b/drivers/input/touchscreen/zinitix.c
@@ -645,19 +645,29 @@ static int zinitix_ts_probe(struct i2c_client *client)
 		return error;
 	}
 
-	bt541->num_keycodes = device_property_count_u32(&client->dev, "linux,keycodes");
-	if (bt541->num_keycodes > ARRAY_SIZE(bt541->keycodes)) {
-		dev_err(&client->dev, "too many keys defined (%d)\n", bt541->num_keycodes);
-		return -EINVAL;
-	}
+	if (device_property_present(&client->dev, "linux,keycodes")) {
+		bt541->num_keycodes = device_property_count_u32(&client->dev,
+								"linux,keycodes");
+		if (bt541->num_keycodes < 0) {
+			dev_err(&client->dev, "Failed to count keys (%d)\n",
+				bt541->num_keycodes);
+			return bt541->num_keycodes;
+		} else if (bt541->num_keycodes > ARRAY_SIZE(bt541->keycodes)) {
+			dev_err(&client->dev, "Too many keys defined (%d)\n",
+				bt541->num_keycodes);
+			return -EINVAL;
+		}
 
-	error = device_property_read_u32_array(&client->dev, "linux,keycodes",
-					       bt541->keycodes,
-					       bt541->num_keycodes);
-	if (error) {
-		dev_err(&client->dev,
-			"Unable to parse \"linux,keycodes\" property: %d\n", error);
-		return error;
+		error = device_property_read_u32_array(&client->dev,
+						       "linux,keycodes",
+						       bt541->keycodes,
+						       bt541->num_keycodes);
+		if (error) {
+			dev_err(&client->dev,
+				"Unable to parse \"linux,keycodes\" property: %d\n",
+				error);
+			return error;
+		}
 	}
 
 	error = zinitix_init_input_dev(bt541);
-- 
2.51.0

From 2c02f7375e658ae93d57a31a66f91b62754ef8f1 Mon Sep 17 00:00:00 2001
From: Steven Rostedt
Date: Fri, 18 Oct 2024 21:43:00 -0400
Subject: [PATCH 06/16] fgraph: Use CPU hotplug mechanism to initialize idle
 shadow stacks

The function graph infrastructure allocates a shadow stack for every
task when enabled. This includes the idle tasks. The first time the
function graph is invoked, the shadow stacks are created and never
freed until the task exits.

Only the idle tasks that were for online CPUs had their shadow stacks
created when function graph tracing started. If function graph tracing
is enabled and a CPU comes online, the idle task representing that CPU
will not have its shadow stack created, and all function graph tracing
for that idle task will be silently dropped.

Instead, use the CPU hotplug mechanism to allocate the idle shadow
stacks. This will include idle tasks for CPUs that come online during
tracing.

This issue can be reproduced by:

 # cd /sys/kernel/tracing
 # echo 0 > /sys/devices/system/cpu/cpu1/online
 # echo 0 > set_ftrace_pid
 # echo function_graph > current_tracer
 # echo 1 > options/funcgraph-proc
 # echo 1 > /sys/devices/system/cpu/cpu1/online
 # grep '' per_cpu/cpu1/trace | head

Before, nothing would show up.

After:

 1)    <idle>-0    |   0.811 us    |  __enqueue_entity();
 1)    <idle>-0    |   5.626 us    |  } /* enqueue_entity */
 1)    <idle>-0    |               |  dl_server_update_idle_time() {
 1)    <idle>-0    |               |    dl_scaled_delta_exec() {
 1)    <idle>-0    |   0.450 us    |      arch_scale_cpu_capacity();
 1)    <idle>-0    |   1.242 us    |    }
 1)    <idle>-0    |   1.908 us    |  }
 1)    <idle>-0    |               |  dl_server_start() {
 1)    <idle>-0    |               |    enqueue_dl_entity() {
 1)    <idle>-0    |               |      task_contending() {

Note, if tracing stops and restarts, the old way would then initialize
the onlined CPUs.

Cc: stable@vger.kernel.org
Cc: Masami Hiramatsu
Cc: Mathieu Desnoyers
Cc: Mark Rutland
Cc: Thomas Gleixner
Link: https://lore.kernel.org/20241018214300.6df82178@rorschach
Fixes: 868baf07b1a25 ("ftrace: Fix memory leak with function graph and cpu hotplug")
Signed-off-by: Steven Rostedt (Google)
---
 kernel/trace/fgraph.c | 28 +++++++++++++++++++++-------
 1 file changed, 21 insertions(+), 7 deletions(-)

diff --git a/kernel/trace/fgraph.c b/kernel/trace/fgraph.c
index d7d4fb403f6f..43f4e3f57438 100644
--- a/kernel/trace/fgraph.c
+++ b/kernel/trace/fgraph.c
@@ -1160,19 +1160,13 @@ void fgraph_update_pid_func(void)
 static int start_graph_tracing(void)
 {
 	unsigned long **ret_stack_list;
-	int ret, cpu;
+	int ret;
 
 	ret_stack_list = kmalloc(SHADOW_STACK_SIZE, GFP_KERNEL);
 
 	if (!ret_stack_list)
 		return -ENOMEM;
 
-	/* The cpu_boot init_task->ret_stack will never be freed */
-	for_each_online_cpu(cpu) {
-		if (!idle_task(cpu)->ret_stack)
-			ftrace_graph_init_idle_task(idle_task(cpu), cpu);
-	}
-
 	do {
 		ret = alloc_retstack_tasklist(ret_stack_list);
 	} while (ret == -EAGAIN);
@@ -1242,14 +1236,34 @@ static void ftrace_graph_disable_direct(bool disable_branch)
 	fgraph_direct_gops = &fgraph_stub;
 }
 
+/* The cpu_boot init_task->ret_stack will never be freed */
+static int fgraph_cpu_init(unsigned int cpu)
+{
+	if (!idle_task(cpu)->ret_stack)
+		ftrace_graph_init_idle_task(idle_task(cpu), cpu);
+	return 0;
+}
+
 int register_ftrace_graph(struct fgraph_ops *gops)
 {
+	static bool fgraph_initialized;
 	int command = 0;
 	int ret = 0;
 	int i = -1;
 
 	mutex_lock(&ftrace_lock);
 
+	if (!fgraph_initialized) {
+		ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "fgraph_idle_init",
+					fgraph_cpu_init, NULL);
+		if (ret < 0) {
+			pr_warn("fgraph: Error to init cpu hotplug support\n");
+			return ret;
+		}
+		fgraph_initialized = true;
+		ret = 0;
+	}
+
 	if (!fgraph_array[0]) {
 		/* The array must always have real data on it */
 		for (i = 0; i < FGRAPH_ARRAY_SIZE; i++)
-- 
2.51.0

From fae4078c289a2f24229c0de652249948b1cd6bdb Mon Sep 17 00:00:00 2001
From: Steven Rostedt
Date: Fri, 18 Oct 2024 21:52:12 -0400
Subject: [PATCH 07/16] fgraph: Allocate ret_stack_list with proper size

The ret_stack_list is an array of ret_stack shadow stacks for the
function graph usage. When the first function graph is enabled, all
tasks in the system get a shadow stack. The ret_stack_list is a 32
element array of pointers to these shadow stacks. It allocates the
shadow stack in batches (32 stacks at a time), assigns them to running
tasks, and continues until all tasks are covered.

When the function graph shadow stack changed from an array of
ftrace_ret_stack structures to an array of longs, the allocation of
ret_stack_list went from allocating an array of 32 elements to just a
block defined by SHADOW_STACK_SIZE. Luckily, that's defined as
PAGE_SIZE and is much more than enough to hold 32 pointers. But it is
way overkill for the amount needed to allocate.

Change the allocation of ret_stack_list back to a kcalloc() of
FTRACE_RETSTACK_ALLOC_SIZE pointers.

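For scale, a worked comparison of the two allocation sizes (assuming a
64-bit build where PAGE_SIZE is 4096 bytes and pointers are 8 bytes;
FTRACE_RETSTACK_ALLOC_SIZE is 32):

  kmalloc(SHADOW_STACK_SIZE)                 = PAGE_SIZE = 4096 bytes
  kcalloc(FTRACE_RETSTACK_ALLOC_SIZE,
          sizeof(unsigned long *))           =   32 * 8  =  256 bytes

Both are large enough for the 32 pointers the batching loop uses; the
former simply wastes the rest of the page.
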
Cc: Masami Hiramatsu
Cc: Mark Rutland
Cc: Mathieu Desnoyers
Link: https://lore.kernel.org/20241018215212.23f13f40@rorschach
Fixes: 42675b723b484 ("function_graph: Convert ret_stack to a series of longs")
Signed-off-by: Steven Rostedt (Google)
---
 kernel/trace/fgraph.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/kernel/trace/fgraph.c b/kernel/trace/fgraph.c
index 43f4e3f57438..41e7a15dcb50 100644
--- a/kernel/trace/fgraph.c
+++ b/kernel/trace/fgraph.c
@@ -1162,7 +1162,8 @@ static int start_graph_tracing(void)
 	unsigned long **ret_stack_list;
 	int ret;
 
-	ret_stack_list = kmalloc(SHADOW_STACK_SIZE, GFP_KERNEL);
+	ret_stack_list = kcalloc(FTRACE_RETSTACK_ALLOC_SIZE,
+				 sizeof(*ret_stack_list), GFP_KERNEL);
 
 	if (!ret_stack_list)
 		return -ENOMEM;
 
-- 
2.51.0

From ae6a888a4357131c01d85f4c91fb32552dd0bf70 Mon Sep 17 00:00:00 2001
From: Jens Axboe
Date: Sat, 19 Oct 2024 09:16:51 -0600
Subject: [PATCH 08/16] io_uring/rw: fix wrong NOWAIT check in io_rw_init_file()

A previous commit improved how !FMODE_NOWAIT is dealt with, but
inadvertently negated a check whilst doing so. This caused -EAGAIN to
be returned from reading files with O_NONBLOCK set.

Fix up the check for REQ_F_SUPPORT_NOWAIT.

Reported-by: Julian Orth
Link: https://github.com/axboe/liburing/issues/1270
Fixes: f7c913438533 ("io_uring/rw: allow pollable non-blocking attempts for !FMODE_NOWAIT")
Signed-off-by: Jens Axboe
---
 io_uring/rw.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/io_uring/rw.c b/io_uring/rw.c
index 80ae3c2ebb70..354c4e175654 100644
--- a/io_uring/rw.c
+++ b/io_uring/rw.c
@@ -807,7 +807,7 @@ static int io_rw_init_file(struct io_kiocb *req, fmode_t mode, int rw_type)
 	 * reliably. If not, or it IOCB_NOWAIT is set, don't retry.
 	 */
 	if (kiocb->ki_flags & IOCB_NOWAIT ||
-	    ((file->f_flags & O_NONBLOCK && (req->flags & REQ_F_SUPPORT_NOWAIT))))
+	    ((file->f_flags & O_NONBLOCK && !(req->flags & REQ_F_SUPPORT_NOWAIT))))
 		req->flags |= REQ_F_NOWAIT;
 
 	if (ctx->flags & IORING_SETUP_IOPOLL) {
-- 
2.51.0

From 42f7652d3eb527d03665b09edac47f85fb600924 Mon Sep 17 00:00:00 2001
From: Linus Torvalds
Date: Sun, 20 Oct 2024 15:19:38 -0700
Subject: [PATCH 09/16] Linux 6.12-rc4

---
 Makefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Makefile b/Makefile
index 8cf3cf528892..a9a7d9ffaa98 100644
--- a/Makefile
+++ b/Makefile
@@ -2,7 +2,7 @@
 VERSION = 6
 PATCHLEVEL = 12
 SUBLEVEL = 0
-EXTRAVERSION = -rc3
+EXTRAVERSION = -rc4
 NAME = Baby Opossum Posse
 
 # *DOCUMENTATION*
-- 
2.51.0

From 165126dc5e23979721122dc5c7cfb28b1ca234cc Mon Sep 17 00:00:00 2001
From: Jens Axboe
Date: Sat, 21 Sep 2024 01:59:47 -0600
Subject: [PATCH 10/16] io_uring/eventfd: abstract out ev_fd put helper

We call this in two spots, so add a helper for it. In preparation for
extending this part.

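For readers new to the idiom, a minimal sketch of the refcount-plus-RCU
release pattern the new helper wraps (struct obj and its functions are
hypothetical stand-ins, not the io_uring types):

	#include <linux/rcupdate.h>
	#include <linux/refcount.h>
	#include <linux/slab.h>

	struct obj {
		refcount_t refs;
		struct rcu_head rcu;
	};

	static void obj_free(struct rcu_head *rcu)
	{
		kfree(container_of(rcu, struct obj, rcu));
	}

	/* Drop a reference; free only once concurrent RCU readers are done. */
	static void obj_put(struct obj *o)
	{
		if (refcount_dec_and_test(&o->refs))
			call_rcu(&o->rcu, obj_free);
	}
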
Link: https://lore.kernel.org/r/20240921080307.185186-2-axboe@kernel.dk
Signed-off-by: Jens Axboe
---
 io_uring/eventfd.c | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/io_uring/eventfd.c b/io_uring/eventfd.c
index e37fddd5d9ce..8b628ab6bbff 100644
--- a/io_uring/eventfd.c
+++ b/io_uring/eventfd.c
@@ -41,6 +41,12 @@ static void io_eventfd_do_signal(struct rcu_head *rcu)
 	io_eventfd_free(rcu);
 }
 
+static void io_eventfd_put(struct io_ev_fd *ev_fd)
+{
+	if (refcount_dec_and_test(&ev_fd->refs))
+		call_rcu(&ev_fd->rcu, io_eventfd_free);
+}
+
 void io_eventfd_signal(struct io_ring_ctx *ctx)
 {
 	struct io_ev_fd *ev_fd = NULL;
@@ -77,8 +83,7 @@ void io_eventfd_signal(struct io_ring_ctx *ctx)
 		}
 	}
 out:
-	if (refcount_dec_and_test(&ev_fd->refs))
-		call_rcu(&ev_fd->rcu, io_eventfd_free);
+	io_eventfd_put(ev_fd);
 }
 
 void io_eventfd_flush_signal(struct io_ring_ctx *ctx)
@@ -152,8 +157,7 @@ int io_eventfd_unregister(struct io_ring_ctx *ctx)
 	if (ev_fd) {
 		ctx->has_evfd = false;
 		rcu_assign_pointer(ctx->io_ev_fd, NULL);
-		if (refcount_dec_and_test(&ev_fd->refs))
-			call_rcu(&ev_fd->rcu, io_eventfd_free);
+		io_eventfd_put(ev_fd);
 		return 0;
 	}
 
-- 
2.51.0

From 3c90b80df5b574c2c61626fd40fa3b23be21fa26 Mon Sep 17 00:00:00 2001
From: Jens Axboe
Date: Sat, 21 Sep 2024 01:59:48 -0600
Subject: [PATCH 11/16] io_uring/eventfd: check for the need to async notify
 earlier

It's not necessary to do this after grabbing a reference. With that,
we can drop the out goto path as well.

Link: https://lore.kernel.org/r/20240921080307.185186-3-axboe@kernel.dk
Signed-off-by: Jens Axboe
---
 io_uring/eventfd.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/io_uring/eventfd.c b/io_uring/eventfd.c
index 8b628ab6bbff..829873806f9f 100644
--- a/io_uring/eventfd.c
+++ b/io_uring/eventfd.c
@@ -69,10 +69,10 @@ void io_eventfd_signal(struct io_ring_ctx *ctx)
 	 */
 	if (unlikely(!ev_fd))
 		return;
+	if (ev_fd->eventfd_async && !io_wq_current_is_worker())
+		return;
 	if (!refcount_inc_not_zero(&ev_fd->refs))
 		return;
-	if (ev_fd->eventfd_async && !io_wq_current_is_worker())
-		goto out;
 
 	if (likely(eventfd_signal_allowed())) {
 		eventfd_signal_mask(ev_fd->cq_ev_fd, EPOLL_URING_WAKE);
@@ -82,7 +82,6 @@ void io_eventfd_signal(struct io_ring_ctx *ctx)
 			return;
 		}
 	}
-out:
 	io_eventfd_put(ev_fd);
 }
 
-- 
2.51.0

From 60c5f15800f21883615689e2423217a9c8a1b502 Mon Sep 17 00:00:00 2001
From: Jens Axboe
Date: Sat, 21 Sep 2024 01:59:49 -0600
Subject: [PATCH 12/16] io_uring/eventfd: move actual signaling part into
 separate helper

In preparation for using this from multiple spots, move the signaling
into a helper.

Link: https://lore.kernel.org/r/20240921080307.185186-4-axboe@kernel.dk
Signed-off-by: Jens Axboe
---
 io_uring/eventfd.c | 28 ++++++++++++++++++----------
 1 file changed, 18 insertions(+), 10 deletions(-)

diff --git a/io_uring/eventfd.c b/io_uring/eventfd.c
index 829873806f9f..58e76f4d1e00 100644
--- a/io_uring/eventfd.c
+++ b/io_uring/eventfd.c
@@ -47,6 +47,22 @@ static void io_eventfd_put(struct io_ev_fd *ev_fd)
 	call_rcu(&ev_fd->rcu, io_eventfd_free);
 }
 
+/*
+ * Returns true if the caller should put the ev_fd reference, false if not.
+ */
+static bool __io_eventfd_signal(struct io_ev_fd *ev_fd)
+{
+	if (eventfd_signal_allowed()) {
+		eventfd_signal_mask(ev_fd->cq_ev_fd, EPOLL_URING_WAKE);
+		return true;
+	}
+	if (!atomic_fetch_or(BIT(IO_EVENTFD_OP_SIGNAL_BIT), &ev_fd->ops)) {
+		call_rcu_hurry(&ev_fd->rcu, io_eventfd_do_signal);
+		return false;
+	}
+	return true;
+}
+
 void io_eventfd_signal(struct io_ring_ctx *ctx)
 {
 	struct io_ev_fd *ev_fd = NULL;
@@ -73,16 +89,8 @@ void io_eventfd_signal(struct io_ring_ctx *ctx)
 		return;
 	if (!refcount_inc_not_zero(&ev_fd->refs))
 		return;
-
-	if (likely(eventfd_signal_allowed())) {
-		eventfd_signal_mask(ev_fd->cq_ev_fd, EPOLL_URING_WAKE);
-	} else {
-		if (!atomic_fetch_or(BIT(IO_EVENTFD_OP_SIGNAL_BIT), &ev_fd->ops)) {
-			call_rcu_hurry(&ev_fd->rcu, io_eventfd_do_signal);
-			return;
-		}
-	}
-	io_eventfd_put(ev_fd);
+	if (__io_eventfd_signal(ev_fd))
+		io_eventfd_put(ev_fd);
 }
 
 void io_eventfd_flush_signal(struct io_ring_ctx *ctx)
-- 
2.51.0

From 3ca5a356041438534ecbb74159df91736238c6b1 Mon Sep 17 00:00:00 2001
From: Jens Axboe
Date: Sat, 21 Sep 2024 01:59:50 -0600
Subject: [PATCH 13/16] io_uring/eventfd: move trigger check into a helper

It's a bit hard to read what guards the triggering; move it into a
helper and add a comment explaining it too. This additionally moves
the ev_fd == NULL check in there as well.

Link: https://lore.kernel.org/r/20240921080307.185186-5-axboe@kernel.dk
Signed-off-by: Jens Axboe
---
 io_uring/eventfd.c | 15 ++++++++++++---
 1 file changed, 12 insertions(+), 3 deletions(-)

diff --git a/io_uring/eventfd.c b/io_uring/eventfd.c
index 58e76f4d1e00..0946d3da88d3 100644
--- a/io_uring/eventfd.c
+++ b/io_uring/eventfd.c
@@ -63,6 +63,17 @@ static bool __io_eventfd_signal(struct io_ev_fd *ev_fd)
 	return true;
 }
 
+/*
+ * Trigger if eventfd_async isn't set, or if it's set and the caller is
+ * an async worker. If ev_fd isn't valid, obviously return false.
+ */
+static bool io_eventfd_trigger(struct io_ev_fd *ev_fd)
+{
+	if (ev_fd)
+		return !ev_fd->eventfd_async || io_wq_current_is_worker();
+	return false;
+}
+
 void io_eventfd_signal(struct io_ring_ctx *ctx)
 {
 	struct io_ev_fd *ev_fd = NULL;
@@ -83,9 +94,7 @@ void io_eventfd_signal(struct io_ring_ctx *ctx)
 	 * completed between the NULL check of ctx->io_ev_fd at the start of
 	 * the function and rcu_read_lock.
 	 */
-	if (unlikely(!ev_fd))
-		return;
-	if (ev_fd->eventfd_async && !io_wq_current_is_worker())
+	if (!io_eventfd_trigger(ev_fd))
 		return;
 	if (!refcount_inc_not_zero(&ev_fd->refs))
 		return;
-- 
2.51.0

From 83a4f865e273b83426eafdd3aa51334cc21ac0fd Mon Sep 17 00:00:00 2001
From: Jens Axboe
Date: Sat, 21 Sep 2024 01:59:51 -0600
Subject: [PATCH 14/16] io_uring/eventfd: abstract out ev_fd grab + release
 helpers

In preparation for needing the ev_fd grabbing (and releasing) from
another path, abstract out two helpers for that.

Link: https://lore.kernel.org/r/20240921080307.185186-6-axboe@kernel.dk
Signed-off-by: Jens Axboe
---
 io_uring/eventfd.c | 41 ++++++++++++++++++++++++++++++-----------
 1 file changed, 30 insertions(+), 11 deletions(-)

diff --git a/io_uring/eventfd.c b/io_uring/eventfd.c
index 0946d3da88d3..d1fdecd0c458 100644
--- a/io_uring/eventfd.c
+++ b/io_uring/eventfd.c
@@ -47,6 +47,13 @@ static void io_eventfd_put(struct io_ev_fd *ev_fd)
 	call_rcu(&ev_fd->rcu, io_eventfd_free);
 }
 
+static void io_eventfd_release(struct io_ev_fd *ev_fd, bool put_ref)
+{
+	if (put_ref)
+		io_eventfd_put(ev_fd);
+	rcu_read_unlock();
+}
+
 /*
  * Returns true if the caller should put the ev_fd reference, false if not.
 */
@@ -74,14 +81,18 @@ static bool io_eventfd_trigger(struct io_ev_fd *ev_fd)
 	return false;
 }
 
-void io_eventfd_signal(struct io_ring_ctx *ctx)
+/*
+ * On success, returns with an ev_fd reference grabbed and the RCU read
+ * lock held.
+ */
+static struct io_ev_fd *io_eventfd_grab(struct io_ring_ctx *ctx)
 {
-	struct io_ev_fd *ev_fd = NULL;
+	struct io_ev_fd *ev_fd;
 
 	if (READ_ONCE(ctx->rings->cq_flags) & IORING_CQ_EVENTFD_DISABLED)
-		return;
+		return NULL;
 
-	guard(rcu)();
+	rcu_read_lock();
 
 	/*
 	 * rcu_dereference ctx->io_ev_fd once and use it for both for checking
@@ -90,16 +101,24 @@ void io_eventfd_signal(struct io_ring_ctx *ctx)
 	ev_fd = rcu_dereference(ctx->io_ev_fd);
 
 	/*
-	 * Check again if ev_fd exists incase an io_eventfd_unregister call
+	 * Check again if ev_fd exists in case an io_eventfd_unregister call
 	 * completed between the NULL check of ctx->io_ev_fd at the start of
 	 * the function and rcu_read_lock.
 	 */
-	if (!io_eventfd_trigger(ev_fd))
-		return;
-	if (!refcount_inc_not_zero(&ev_fd->refs))
-		return;
-	if (__io_eventfd_signal(ev_fd))
-		io_eventfd_put(ev_fd);
+	if (io_eventfd_trigger(ev_fd) && refcount_inc_not_zero(&ev_fd->refs))
+		return ev_fd;
+
+	rcu_read_unlock();
+	return NULL;
+}
+
+void io_eventfd_signal(struct io_ring_ctx *ctx)
+{
+	struct io_ev_fd *ev_fd;
+
+	ev_fd = io_eventfd_grab(ctx);
+	if (ev_fd)
+		io_eventfd_release(ev_fd, __io_eventfd_signal(ev_fd));
 }
 
 void io_eventfd_flush_signal(struct io_ring_ctx *ctx)
-- 
2.51.0

From f4bb2f65bb8154c1a2c2d7e01db0c98dffb5918f Mon Sep 17 00:00:00 2001
From: Jens Axboe
Date: Sat, 21 Sep 2024 01:59:52 -0600
Subject: [PATCH 15/16] io_uring/eventfd: move ctx->evfd_last_cq_tail into
 io_ev_fd

Everything else about the io_uring eventfd support is nicely kept
private to that code, except the cached_cq_tail tracking. With
everything else in place, move io_eventfd_flush_signal() to using the
ev_fd grab+release helpers, which then enables the direct use of
io_ev_fd for this tracking too.

Link: https://lore.kernel.org/r/20240921080307.185186-7-axboe@kernel.dk
Signed-off-by: Jens Axboe
---
 io_uring/eventfd.c | 50 +++++++++++++++++++++++++++------------------
 1 file changed, 29 insertions(+), 21 deletions(-)

diff --git a/io_uring/eventfd.c b/io_uring/eventfd.c
index d1fdecd0c458..fab936d31ba8 100644
--- a/io_uring/eventfd.c
+++ b/io_uring/eventfd.c
@@ -13,10 +13,12 @@
 struct io_ev_fd {
 	struct eventfd_ctx	*cq_ev_fd;
-	unsigned int		eventfd_async: 1;
-	struct rcu_head		rcu;
+	unsigned int		eventfd_async;
+	/* protected by ->completion_lock */
+	unsigned		last_cq_tail;
 	refcount_t		refs;
 	atomic_t		ops;
+	struct rcu_head		rcu;
 };
 
 enum {
@@ -123,25 +125,31 @@ void io_eventfd_signal(struct io_ring_ctx *ctx)
 
 void io_eventfd_flush_signal(struct io_ring_ctx *ctx)
 {
-	bool skip;
-
-	spin_lock(&ctx->completion_lock);
-
-	/*
-	 * Eventfd should only get triggered when at least one event has been
-	 * posted. Some applications rely on the eventfd notification count
-	 * only changing IFF a new CQE has been added to the CQ ring. There's
-	 * no depedency on 1:1 relationship between how many times this
-	 * function is called (and hence the eventfd count) and number of CQEs
-	 * posted to the CQ ring.
-	 */
-	skip = ctx->cached_cq_tail == ctx->evfd_last_cq_tail;
-	ctx->evfd_last_cq_tail = ctx->cached_cq_tail;
-	spin_unlock(&ctx->completion_lock);
-	if (skip)
-		return;
+	struct io_ev_fd *ev_fd;
 
-	io_eventfd_signal(ctx);
+	ev_fd = io_eventfd_grab(ctx);
+	if (ev_fd) {
+		bool skip, put_ref = true;
+
+		/*
+		 * Eventfd should only get triggered when at least one event
+		 * has been posted. Some applications rely on the eventfd
+		 * notification count only changing IFF a new CQE has been
+		 * added to the CQ ring. There's no dependency on 1:1
+		 * relationship between how many times this function is called
+		 * (and hence the eventfd count) and number of CQEs posted to
+		 * the CQ ring.
+		 */
+		spin_lock(&ctx->completion_lock);
+		skip = ctx->cached_cq_tail == ev_fd->last_cq_tail;
+		ev_fd->last_cq_tail = ctx->cached_cq_tail;
+		spin_unlock(&ctx->completion_lock);
+
+		if (!skip)
+			put_ref = __io_eventfd_signal(ev_fd);
+
+		io_eventfd_release(ev_fd, put_ref);
+	}
 }
 
 int io_eventfd_register(struct io_ring_ctx *ctx, void __user *arg,
@@ -172,7 +180,7 @@ int io_eventfd_register(struct io_ring_ctx *ctx, void __user *arg,
 	}
 
 	spin_lock(&ctx->completion_lock);
-	ctx->evfd_last_cq_tail = ctx->cached_cq_tail;
+	ev_fd->last_cq_tail = ctx->cached_cq_tail;
 	spin_unlock(&ctx->completion_lock);
 
 	ev_fd->eventfd_async = eventfd_async;
-- 
2.51.0

From 95d6c9229a04cc12d39034cd6be6446a55a85d6d Mon Sep 17 00:00:00 2001
From: Jens Axboe
Date: Tue, 24 Sep 2024 05:57:30 -0600
Subject: [PATCH 16/16] io_uring/msg_ring: refactor a few helper functions

Mostly just to skip them taking an io_kiocb, rather just pass in the
ctx and io_msg directly. In preparation for being able to issue a
MSG_RING request without having an io_kiocb. No functional changes in
this patch.

Link: https://lore.kernel.org/r/20240924115932.116167-2-axboe@kernel.dk
Signed-off-by: Jens Axboe
---
 io_uring/msg_ring.c | 31 ++++++++++++++++++++-----------
 1 file changed, 20 insertions(+), 11 deletions(-)

diff --git a/io_uring/msg_ring.c b/io_uring/msg_ring.c
index 7fd9badcfaf8..b8c527f08cd5 100644
--- a/io_uring/msg_ring.c
+++ b/io_uring/msg_ring.c
@@ -116,14 +116,13 @@ static struct io_kiocb *io_msg_get_kiocb(struct io_ring_ctx *ctx)
 	return kmem_cache_alloc(req_cachep, GFP_KERNEL | __GFP_NOWARN | __GFP_ZERO);
 }
 
-static int io_msg_data_remote(struct io_kiocb *req)
+static int io_msg_data_remote(struct io_ring_ctx *target_ctx,
+			      struct io_msg *msg)
 {
-	struct io_ring_ctx *target_ctx = req->file->private_data;
-	struct io_msg *msg = io_kiocb_to_cmd(req, struct io_msg);
 	struct io_kiocb *target;
 	u32 flags = 0;
 
-	target = io_msg_get_kiocb(req->ctx);
+	target = io_msg_get_kiocb(target_ctx);
 	if (unlikely(!target))
 		return -ENOMEM;
 
@@ -134,10 +133,9 @@ static int io_msg_data_remote(struct io_kiocb *req)
 			msg->user_data);
 }
 
-static int io_msg_ring_data(struct io_kiocb *req, unsigned int issue_flags)
+static int __io_msg_ring_data(struct io_ring_ctx *target_ctx,
+			      struct io_msg *msg, unsigned int issue_flags)
 {
-	struct io_ring_ctx *target_ctx = req->file->private_data;
-	struct io_msg *msg = io_kiocb_to_cmd(req, struct io_msg);
 	u32 flags = 0;
 	int ret;
 
@@ -149,7 +147,7 @@ static int io_msg_ring_data(struct io_kiocb *req, unsigned int issue_flags)
 		return -EBADFD;
 
 	if (io_msg_need_remote(target_ctx))
-		return io_msg_data_remote(req);
+		return io_msg_data_remote(target_ctx, msg);
 
 	if (msg->flags & IORING_MSG_RING_FLAGS_PASS)
 		flags = msg->cqe_flags;
@@ -166,6 +164,14 @@ static int io_msg_ring_data(struct io_kiocb *req, unsigned int issue_flags)
 	return ret;
 }
 
+static int io_msg_ring_data(struct io_kiocb *req, unsigned int issue_flags)
+{
+	struct io_ring_ctx *target_ctx = req->file->private_data;
+	struct io_msg *msg = io_kiocb_to_cmd(req, struct io_msg);
+
+	return __io_msg_ring_data(target_ctx, msg, issue_flags);
+}
+
 static struct file *io_msg_grab_file(struct io_kiocb *req, unsigned int issue_flags)
 {
 	struct io_msg *msg = io_kiocb_to_cmd(req, struct io_msg);
@@ -271,10 +277,8 @@ static int io_msg_send_fd(struct io_kiocb *req, unsigned int issue_flags)
 	return io_msg_install_complete(req, issue_flags);
 }
 
-int io_msg_ring_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
+static int __io_msg_ring_prep(struct io_msg *msg, const struct io_uring_sqe *sqe)
 {
-	struct io_msg *msg = io_kiocb_to_cmd(req, struct io_msg);
-
 	if (unlikely(sqe->buf_index || sqe->personality))
 		return -EINVAL;
 
@@ -291,6 +295,11 @@ int io_msg_ring_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 	return 0;
 }
 
+int io_msg_ring_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
+{
+	return __io_msg_ring_prep(io_kiocb_to_cmd(req, struct io_msg), sqe);
+}
+
 int io_msg_ring(struct io_kiocb *req, unsigned int issue_flags)
 {
 	struct io_msg *msg = io_kiocb_to_cmd(req, struct io_msg);
-- 
2.51.0
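
As closing context for this series, a minimal userspace sketch of the
MSG_RING operation these helpers back (assumes liburing 2.2 or newer;
error handling abbreviated): ring A posts a CQE directly into ring B's
completion queue.

	/* Build with: cc msg_ring_demo.c -luring */
	#include <liburing.h>
	#include <stdio.h>

	int main(void)
	{
		struct io_uring src, dst;
		struct io_uring_sqe *sqe;
		struct io_uring_cqe *cqe;

		if (io_uring_queue_init(8, &src, 0) || io_uring_queue_init(8, &dst, 0))
			return 1;

		sqe = io_uring_get_sqe(&src);
		/* target ring fd, 'res' for the remote CQE, its user_data, flags */
		io_uring_prep_msg_ring(sqe, dst.ring_fd, 0x10, 0xcafe, 0);
		io_uring_submit(&src);

		if (!io_uring_wait_cqe(&dst, &cqe)) {
			printf("remote CQE: user_data=0x%llx res=0x%x\n",
			       (unsigned long long)cqe->user_data, cqe->res);
			io_uring_cqe_seen(&dst, cqe);
		}

		io_uring_queue_exit(&src);
		io_uring_queue_exit(&dst);
		return 0;
	}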