From f40998a8e6bbf0314b8416350183a537f9b59ca9 Mon Sep 17 00:00:00 2001
From: Luca Boccassi
Date: Fri, 27 Sep 2024 10:23:44 +0200
Subject: [PATCH 01/16] ipe: fallback to platform keyring also if key in
 trusted keyring is rejected

If enabled, we fall back to the platform keyring if the trusted keyring
doesn't have the key used to sign the ipe policy. But if pkcs7_verify()
rejects the key for other reasons, such as usage restrictions, we do
not fall back. Do so, following the same change in dm-verity.

Signed-off-by: Luca Boccassi
Suggested-by: Serge Hallyn
[FW: fixed some line length issues and a typo in the commit message]
Signed-off-by: Fan Wu
---
 security/ipe/policy.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/security/ipe/policy.c b/security/ipe/policy.c
index 45f7d6a0ed23..b628f696e32b 100644
--- a/security/ipe/policy.c
+++ b/security/ipe/policy.c
@@ -178,7 +178,7 @@ struct ipe_policy *ipe_new_policy(const char *text, size_t textlen,
                                     VERIFYING_UNSPECIFIED_SIGNATURE,
                                     set_pkcs7_data, new);
 #ifdef CONFIG_IPE_POLICY_SIG_PLATFORM_KEYRING
-        if (rc == -ENOKEY)
+        if (rc == -ENOKEY || rc == -EKEYREJECTED)
                 rc = verify_pkcs7_signature(NULL, 0, new->pkcs7, pkcs7len,
                                             VERIFY_USE_PLATFORM_KEYRING,
                                             VERIFYING_UNSPECIFIED_SIGNATURE,
-- 
2.51.0

From 917a15c37d371bc40b5ad13df366e29bd49c04a1 Mon Sep 17 00:00:00 2001
From: Fan Wu
Date: Wed, 16 Oct 2024 16:43:05 -0700
Subject: [PATCH 02/16] MAINTAINERS: update IPE tree url and Fan Wu's email

Update the Integrity Policy Enforcement (IPE) LSM tree URL and the
maintainer's email to the newly issued kernel.org tree/email.

Signed-off-by: Fan Wu
---
 MAINTAINERS | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/MAINTAINERS b/MAINTAINERS
index 7ad507f49324..33b158cf52b4 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -11283,10 +11283,10 @@ F:      security/integrity/
 F:      security/integrity/ima/
 
 INTEGRITY POLICY ENFORCEMENT (IPE)
-M:      Fan Wu
+M:      Fan Wu
 L:      linux-security-module@vger.kernel.org
 S:      Supported
-T:      git https://github.com/microsoft/ipe.git
+T:      git git://git.kernel.org/pub/scm/linux/kernel/git/wufan/ipe.git
 F:      Documentation/admin-guide/LSM/ipe.rst
 F:      Documentation/security/ipe.rst
 F:      scripts/ipe/
-- 
2.51.0

From 22a18935d7d96bbb1a28076f843c1926d0ba189e Mon Sep 17 00:00:00 2001
From: John Edwards
Date: Thu, 10 Oct 2024 23:09:23 +0000
Subject: [PATCH 03/16] Input: xpad - add support for MSI Claw A1M

Add MSI Claw A1M controller to xpad_device match table when in xinput
mode. Add MSI VID as XPAD_XBOX360_VENDOR.

Signed-off-by: John Edwards
Reviewed-by: Derek J. Clark
Reviewed-by: Christopher Snowhill
Link: https://lore.kernel.org/r/20241010232020.3292284-4-uejji@uejji.net
Cc: stable@vger.kernel.org
Signed-off-by: Dmitry Torokhov
---
 drivers/input/joystick/xpad.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/drivers/input/joystick/xpad.c b/drivers/input/joystick/xpad.c
index 30b4cca8b69f..22ea58bf76cb 100644
--- a/drivers/input/joystick/xpad.c
+++ b/drivers/input/joystick/xpad.c
@@ -218,6 +218,7 @@ static const struct xpad_device {
         { 0x0c12, 0x8810, "Zeroplus Xbox Controller", 0, XTYPE_XBOX },
         { 0x0c12, 0x9902, "HAMA VibraX - *FAULTY HARDWARE*", 0, XTYPE_XBOX },
         { 0x0d2f, 0x0002, "Andamiro Pump It Up pad", MAP_DPAD_TO_BUTTONS, XTYPE_XBOX },
+        { 0x0db0, 0x1901, "Micro Star International Xbox360 Controller for Windows", 0, XTYPE_XBOX360 },
         { 0x0e4c, 0x1097, "Radica Gamester Controller", 0, XTYPE_XBOX },
         { 0x0e4c, 0x1103, "Radica Gamester Reflex", MAP_TRIGGERS_TO_BUTTONS, XTYPE_XBOX },
         { 0x0e4c, 0x2390, "Radica Games Jtech Controller", 0, XTYPE_XBOX },
@@ -493,6 +494,7 @@ static const struct usb_device_id xpad_table[] = {
         XPAD_XBOX360_VENDOR(0x07ff),            /* Mad Catz Gamepad */
         XPAD_XBOXONE_VENDOR(0x0b05),            /* ASUS controllers */
         XPAD_XBOX360_VENDOR(0x0c12),            /* Zeroplus X-Box 360 controllers */
+        XPAD_XBOX360_VENDOR(0x0db0),            /* Micro Star International X-Box 360 controllers */
         XPAD_XBOX360_VENDOR(0x0e6f),            /* 0x0e6f Xbox 360 controllers */
         XPAD_XBOXONE_VENDOR(0x0e6f),            /* 0x0e6f Xbox One controllers */
         XPAD_XBOX360_VENDOR(0x0f0d),            /* Hori controllers */
-- 
2.51.0

From 2de01e0e57f3ebe7f90b08f6bca5ce0f3da3829f Mon Sep 17 00:00:00 2001
From: Nikita Travkin
Date: Fri, 4 Oct 2024 21:17:30 +0500
Subject: [PATCH 04/16] Input: zinitix - don't fail if linux,keycodes prop is
 absent

When initially adding the touchkey support, a mistake was made in the
property parsing code. The possible negative errno from
device_property_count_u32() was never checked, an oversight left over
from converting to it from the of_property counterpart as part of the
review fixes.

Re-add the correct handling of the absent property, in which case zero
touchkeys should be assumed, which would disable the feature.
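
The failure mode is worth spelling out: the count is stored in a signed
int, and comparing it against ARRAY_SIZE() promotes it to size_t, so a
negative errno looked like "too many keys" and failed the probe. A
minimal stand-alone sketch of the same trap (plain C; count_keys() is a
hypothetical stand-in for device_property_count_u32() returning
-EINVAL, which is -22 on Linux):

  #include <stdio.h>

  /* Pretend the property is absent: return -EINVAL like the fwnode API */
  static int count_keys(void)
  {
          return -22;
  }

  int main(void)
  {
          int keycodes[8];
          int num_keycodes = count_keys();

          /*
           * The signed count is converted to size_t for the comparison,
           * so -22 becomes a huge unsigned value and this branch is taken.
           */
          if (num_keycodes > sizeof(keycodes) / sizeof(keycodes[0]))
                  printf("rejected: %d keys\n", num_keycodes);
          return 0;
  }
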
Reported-by: Jakob Hauser
Tested-by: Jakob Hauser
Fixes: 075d9b22c8fe ("Input: zinitix - add touchkey support")
Reviewed-by: Linus Walleij
Signed-off-by: Nikita Travkin
Tested-by: Yassine Oudjana
Link: https://lore.kernel.org/r/20241004-zinitix-no-keycodes-v2-1-876dc9fea4b6@trvn.ru
Signed-off-by: Dmitry Torokhov
---
 drivers/input/touchscreen/zinitix.c | 34 +++++++++++++++++++----------
 1 file changed, 22 insertions(+), 12 deletions(-)

diff --git a/drivers/input/touchscreen/zinitix.c b/drivers/input/touchscreen/zinitix.c
index 52b3950460e2..716d6fa60f86 100644
--- a/drivers/input/touchscreen/zinitix.c
+++ b/drivers/input/touchscreen/zinitix.c
@@ -645,19 +645,29 @@ static int zinitix_ts_probe(struct i2c_client *client)
                 return error;
         }
 
-        bt541->num_keycodes = device_property_count_u32(&client->dev, "linux,keycodes");
-        if (bt541->num_keycodes > ARRAY_SIZE(bt541->keycodes)) {
-                dev_err(&client->dev, "too many keys defined (%d)\n", bt541->num_keycodes);
-                return -EINVAL;
-        }
+        if (device_property_present(&client->dev, "linux,keycodes")) {
+                bt541->num_keycodes = device_property_count_u32(&client->dev,
+                                                                "linux,keycodes");
+                if (bt541->num_keycodes < 0) {
+                        dev_err(&client->dev, "Failed to count keys (%d)\n",
+                                bt541->num_keycodes);
+                        return bt541->num_keycodes;
+                } else if (bt541->num_keycodes > ARRAY_SIZE(bt541->keycodes)) {
+                        dev_err(&client->dev, "Too many keys defined (%d)\n",
+                                bt541->num_keycodes);
+                        return -EINVAL;
+                }
 
-        error = device_property_read_u32_array(&client->dev, "linux,keycodes",
-                                               bt541->keycodes,
-                                               bt541->num_keycodes);
-        if (error) {
-                dev_err(&client->dev,
-                        "Unable to parse \"linux,keycodes\" property: %d\n", error);
-                return error;
+                error = device_property_read_u32_array(&client->dev,
+                                                       "linux,keycodes",
+                                                       bt541->keycodes,
+                                                       bt541->num_keycodes);
+                if (error) {
+                        dev_err(&client->dev,
+                                "Unable to parse \"linux,keycodes\" property: %d\n",
+                                error);
+                        return error;
+                }
         }
 
         error = zinitix_init_input_dev(bt541);
-- 
2.51.0

From 2c02f7375e658ae93d57a31a66f91b62754ef8f1 Mon Sep 17 00:00:00 2001
From: Steven Rostedt
Date: Fri, 18 Oct 2024 21:43:00 -0400
Subject: [PATCH 05/16] fgraph: Use CPU hotplug mechanism to initialize idle
 shadow stacks

The function graph infrastructure allocates a shadow stack for every
task when enabled. This includes the idle tasks. The first time the
function graph is invoked, the shadow stacks are created and never
freed until the task exits.

Only the idle tasks that were for online CPUs had their shadow stacks
created when function graph tracing started. If function graph tracing
is enabled and a CPU comes online, the idle task representing that CPU
will not have its shadow stack created, and all function graph tracing
for that idle task will be silently dropped.

Instead, use the CPU hotplug mechanism to allocate the idle shadow
stacks. This will include idle tasks for CPUs that come online during
tracing.

This issue can be reproduced by:

 # cd /sys/kernel/tracing
 # echo 0 > /sys/devices/system/cpu/cpu1/online
 # echo 0 > set_ftrace_pid
 # echo function_graph > current_tracer
 # echo 1 > options/funcgraph-proc
 # echo 1 > /sys/devices/system/cpu/cpu1/online
 # grep '<idle>' per_cpu/cpu1/trace | head

Before, nothing would show up.
After:

 1)    <idle>-0    |   0.811 us  |  __enqueue_entity();
 1)    <idle>-0    |   5.626 us  |  } /* enqueue_entity */
 1)    <idle>-0    |             |  dl_server_update_idle_time() {
 1)    <idle>-0    |             |    dl_scaled_delta_exec() {
 1)    <idle>-0    |   0.450 us  |      arch_scale_cpu_capacity();
 1)    <idle>-0    |   1.242 us  |    }
 1)    <idle>-0    |   1.908 us  |  }
 1)    <idle>-0    |             |  dl_server_start() {
 1)    <idle>-0    |             |    enqueue_dl_entity() {
 1)    <idle>-0    |             |      task_contending() {

Note, if tracing stops and restarts, the old way would then initialize
the onlined CPUs.

Cc: stable@vger.kernel.org
Cc: Masami Hiramatsu
Cc: Mathieu Desnoyers
Cc: Mark Rutland
Cc: Thomas Gleixner
Link: https://lore.kernel.org/20241018214300.6df82178@rorschach
Fixes: 868baf07b1a25 ("ftrace: Fix memory leak with function graph and cpu hotplug")
Signed-off-by: Steven Rostedt (Google)
---
 kernel/trace/fgraph.c | 28 +++++++++++++++++++++-------
 1 file changed, 21 insertions(+), 7 deletions(-)

diff --git a/kernel/trace/fgraph.c b/kernel/trace/fgraph.c
index d7d4fb403f6f..43f4e3f57438 100644
--- a/kernel/trace/fgraph.c
+++ b/kernel/trace/fgraph.c
@@ -1160,19 +1160,13 @@ void fgraph_update_pid_func(void)
 static int start_graph_tracing(void)
 {
         unsigned long **ret_stack_list;
-        int ret, cpu;
+        int ret;
 
         ret_stack_list = kmalloc(SHADOW_STACK_SIZE, GFP_KERNEL);
         if (!ret_stack_list)
                 return -ENOMEM;
 
-        /* The cpu_boot init_task->ret_stack will never be freed */
-        for_each_online_cpu(cpu) {
-                if (!idle_task(cpu)->ret_stack)
-                        ftrace_graph_init_idle_task(idle_task(cpu), cpu);
-        }
-
         do {
                 ret = alloc_retstack_tasklist(ret_stack_list);
         } while (ret == -EAGAIN);
 
@@ -1242,14 +1236,34 @@ static void ftrace_graph_disable_direct(bool disable_branch)
         fgraph_direct_gops = &fgraph_stub;
 }
 
+/* The cpu_boot init_task->ret_stack will never be freed */
+static int fgraph_cpu_init(unsigned int cpu)
+{
+        if (!idle_task(cpu)->ret_stack)
+                ftrace_graph_init_idle_task(idle_task(cpu), cpu);
+        return 0;
+}
+
 int register_ftrace_graph(struct fgraph_ops *gops)
 {
+        static bool fgraph_initialized;
         int command = 0;
         int ret = 0;
         int i = -1;
 
         mutex_lock(&ftrace_lock);
 
+        if (!fgraph_initialized) {
+                ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "fgraph_idle_init",
+                                        fgraph_cpu_init, NULL);
+                if (ret < 0) {
+                        pr_warn("fgraph: Error to init cpu hotplug support\n");
+                        return ret;
+                }
+                fgraph_initialized = true;
+                ret = 0;
+        }
+
         if (!fgraph_array[0]) {
                 /* The array must always have real data on it */
                 for (i = 0; i < FGRAPH_ARRAY_SIZE; i++)
-- 
2.51.0

From fae4078c289a2f24229c0de652249948b1cd6bdb Mon Sep 17 00:00:00 2001
From: Steven Rostedt
Date: Fri, 18 Oct 2024 21:52:12 -0400
Subject: [PATCH 06/16] fgraph: Allocate ret_stack_list with proper size

The ret_stack_list is an array of ret_stack shadow stacks for the
function graph usage. When the first function graph is enabled, all
tasks in the system get a shadow stack. The ret_stack_list is a 32
element array of pointers to these shadow stacks. It allocates the
shadow stack in batches (32 stacks at a time), assigns them to running
tasks, and continues until all tasks are covered.

When the function graph shadow stack changed from an array of
ftrace_ret_stack structures to an array of longs, the allocation of
ret_stack_list went from allocating an array of 32 elements to just a
block defined by SHADOW_STACK_SIZE. Luckily, that's defined as
PAGE_SIZE and is much more than enough to hold 32 pointers. But it is
way overkill for the amount needed to allocate.

Change the allocation of ret_stack_list back to a kcalloc() of
FTRACE_RETSTACK_ALLOC_SIZE pointers.
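
Back-of-the-envelope, assuming a 4K PAGE_SIZE and 8-byte pointers (both
config/arch dependent), the sketch below shows what the old and new
allocations come out to; FTRACE_RETSTACK_ALLOC_SIZE is the 32-entry
batch size mentioned above:

  #include <stdio.h>

  #define PAGE_SIZE                     4096    /* assumed, arch dependent */
  #define FTRACE_RETSTACK_ALLOC_SIZE    32

  int main(void)
  {
          /* Old: kmalloc(SHADOW_STACK_SIZE, ...), i.e. a whole page */
          printf("old: %d bytes\n", PAGE_SIZE);
          /* New: kcalloc() of 32 pointers, what the list actually needs */
          printf("new: %zu bytes\n",
                 FTRACE_RETSTACK_ALLOC_SIZE * sizeof(unsigned long *));
          return 0;
  }
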
Cc: Masami Hiramatsu
Cc: Mark Rutland
Cc: Mathieu Desnoyers
Link: https://lore.kernel.org/20241018215212.23f13f40@rorschach
Fixes: 42675b723b484 ("function_graph: Convert ret_stack to a series of longs")
Signed-off-by: Steven Rostedt (Google)
---
 kernel/trace/fgraph.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/kernel/trace/fgraph.c b/kernel/trace/fgraph.c
index 43f4e3f57438..41e7a15dcb50 100644
--- a/kernel/trace/fgraph.c
+++ b/kernel/trace/fgraph.c
@@ -1162,7 +1162,8 @@ static int start_graph_tracing(void)
         unsigned long **ret_stack_list;
         int ret;
 
-        ret_stack_list = kmalloc(SHADOW_STACK_SIZE, GFP_KERNEL);
+        ret_stack_list = kcalloc(FTRACE_RETSTACK_ALLOC_SIZE,
+                                 sizeof(*ret_stack_list), GFP_KERNEL);
         if (!ret_stack_list)
                 return -ENOMEM;
 
-- 
2.51.0

From ae6a888a4357131c01d85f4c91fb32552dd0bf70 Mon Sep 17 00:00:00 2001
From: Jens Axboe
Date: Sat, 19 Oct 2024 09:16:51 -0600
Subject: [PATCH 07/16] io_uring/rw: fix wrong NOWAIT check in io_rw_init_file()

A previous commit improved how !FMODE_NOWAIT is dealt with, but
inadvertently negated a check whilst doing so. This caused -EAGAIN to
be returned from reading files with O_NONBLOCK set.

Fix up the check for REQ_F_SUPPORT_NOWAIT.

Reported-by: Julian Orth
Link: https://github.com/axboe/liburing/issues/1270
Fixes: f7c913438533 ("io_uring/rw: allow pollable non-blocking attempts for !FMODE_NOWAIT")
Signed-off-by: Jens Axboe
---
 io_uring/rw.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/io_uring/rw.c b/io_uring/rw.c
index 80ae3c2ebb70..354c4e175654 100644
--- a/io_uring/rw.c
+++ b/io_uring/rw.c
@@ -807,7 +807,7 @@ static int io_rw_init_file(struct io_kiocb *req, fmode_t mode, int rw_type)
          * reliably. If not, or it IOCB_NOWAIT is set, don't retry.
          */
         if (kiocb->ki_flags & IOCB_NOWAIT ||
-            ((file->f_flags & O_NONBLOCK && (req->flags & REQ_F_SUPPORT_NOWAIT))))
+            ((file->f_flags & O_NONBLOCK && !(req->flags & REQ_F_SUPPORT_NOWAIT))))
                 req->flags |= REQ_F_NOWAIT;
 
         if (ctx->flags & IORING_SETUP_IOPOLL) {
-- 
2.51.0

From 42f7652d3eb527d03665b09edac47f85fb600924 Mon Sep 17 00:00:00 2001
From: Linus Torvalds
Date: Sun, 20 Oct 2024 15:19:38 -0700
Subject: [PATCH 08/16] Linux 6.12-rc4

---
 Makefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Makefile b/Makefile
index 8cf3cf528892..a9a7d9ffaa98 100644
--- a/Makefile
+++ b/Makefile
@@ -2,7 +2,7 @@
 VERSION = 6
 PATCHLEVEL = 12
 SUBLEVEL = 0
-EXTRAVERSION = -rc3
+EXTRAVERSION = -rc4
 NAME = Baby Opossum Posse
 
 # *DOCUMENTATION*
-- 
2.51.0

From 165126dc5e23979721122dc5c7cfb28b1ca234cc Mon Sep 17 00:00:00 2001
From: Jens Axboe
Date: Sat, 21 Sep 2024 01:59:47 -0600
Subject: [PATCH 09/16] io_uring/eventfd: abstract out ev_fd put helper

We call this in two spots, have a helper for it. In preparation for
extending this part.
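
The helper wraps the common "drop a reference, free via RCU once the
last one is gone" idiom. In isolation the pattern looks like the
following sketch (generic names, not the io_uring types; kfree() as a
stand-in for the real free routine):

  #include <linux/rcupdate.h>
  #include <linux/refcount.h>
  #include <linux/slab.h>

  struct obj {
          refcount_t refs;
          struct rcu_head rcu;
  };

  static void obj_free(struct rcu_head *rcu)
  {
          kfree(container_of(rcu, struct obj, rcu));
  }

  static void obj_put(struct obj *o)
  {
          /* Last reference dropped: defer the free past RCU readers */
          if (refcount_dec_and_test(&o->refs))
                  call_rcu(&o->rcu, obj_free);
  }
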
Link: https://lore.kernel.org/r/20240921080307.185186-2-axboe@kernel.dk
Signed-off-by: Jens Axboe
---
 io_uring/eventfd.c | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/io_uring/eventfd.c b/io_uring/eventfd.c
index e37fddd5d9ce..8b628ab6bbff 100644
--- a/io_uring/eventfd.c
+++ b/io_uring/eventfd.c
@@ -41,6 +41,12 @@ static void io_eventfd_do_signal(struct rcu_head *rcu)
         io_eventfd_free(rcu);
 }
 
+static void io_eventfd_put(struct io_ev_fd *ev_fd)
+{
+        if (refcount_dec_and_test(&ev_fd->refs))
+                call_rcu(&ev_fd->rcu, io_eventfd_free);
+}
+
 void io_eventfd_signal(struct io_ring_ctx *ctx)
 {
         struct io_ev_fd *ev_fd = NULL;
@@ -77,8 +83,7 @@ void io_eventfd_signal(struct io_ring_ctx *ctx)
                 }
         }
 out:
-        if (refcount_dec_and_test(&ev_fd->refs))
-                call_rcu(&ev_fd->rcu, io_eventfd_free);
+        io_eventfd_put(ev_fd);
 }
 
 void io_eventfd_flush_signal(struct io_ring_ctx *ctx)
@@ -152,8 +157,7 @@ int io_eventfd_unregister(struct io_ring_ctx *ctx)
         if (ev_fd) {
                 ctx->has_evfd = false;
                 rcu_assign_pointer(ctx->io_ev_fd, NULL);
-                if (refcount_dec_and_test(&ev_fd->refs))
-                        call_rcu(&ev_fd->rcu, io_eventfd_free);
+                io_eventfd_put(ev_fd);
                 return 0;
         }
 
-- 
2.51.0

From 3c90b80df5b574c2c61626fd40fa3b23be21fa26 Mon Sep 17 00:00:00 2001
From: Jens Axboe
Date: Sat, 21 Sep 2024 01:59:48 -0600
Subject: [PATCH 10/16] io_uring/eventfd: check for the need to async notify
 earlier

It's not necessary to do this after grabbing a reference. With that,
we can drop the out goto path as well.

Link: https://lore.kernel.org/r/20240921080307.185186-3-axboe@kernel.dk
Signed-off-by: Jens Axboe
---
 io_uring/eventfd.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/io_uring/eventfd.c b/io_uring/eventfd.c
index 8b628ab6bbff..829873806f9f 100644
--- a/io_uring/eventfd.c
+++ b/io_uring/eventfd.c
@@ -69,10 +69,10 @@ void io_eventfd_signal(struct io_ring_ctx *ctx)
          */
         if (unlikely(!ev_fd))
                 return;
+        if (ev_fd->eventfd_async && !io_wq_current_is_worker())
+                return;
         if (!refcount_inc_not_zero(&ev_fd->refs))
                 return;
-        if (ev_fd->eventfd_async && !io_wq_current_is_worker())
-                goto out;
 
         if (likely(eventfd_signal_allowed())) {
                 eventfd_signal_mask(ev_fd->cq_ev_fd, EPOLL_URING_WAKE);
@@ -82,7 +82,6 @@ void io_eventfd_signal(struct io_ring_ctx *ctx)
                         return;
                 }
         }
-out:
         io_eventfd_put(ev_fd);
 }
 
-- 
2.51.0

From 60c5f15800f21883615689e2423217a9c8a1b502 Mon Sep 17 00:00:00 2001
From: Jens Axboe
Date: Sat, 21 Sep 2024 01:59:49 -0600
Subject: [PATCH 11/16] io_uring/eventfd: move actual signaling part into
 separate helper

In preparation for using this from multiple spots, move the signaling
into a helper.

Link: https://lore.kernel.org/r/20240921080307.185186-4-axboe@kernel.dk
Signed-off-by: Jens Axboe
---
 io_uring/eventfd.c | 28 ++++++++++++++++++----------
 1 file changed, 18 insertions(+), 10 deletions(-)

diff --git a/io_uring/eventfd.c b/io_uring/eventfd.c
index 829873806f9f..58e76f4d1e00 100644
--- a/io_uring/eventfd.c
+++ b/io_uring/eventfd.c
@@ -47,6 +47,22 @@ static void io_eventfd_put(struct io_ev_fd *ev_fd)
         call_rcu(&ev_fd->rcu, io_eventfd_free);
 }
 
+/*
+ * Returns true if the caller should put the ev_fd reference, false if not.
+ */
+static bool __io_eventfd_signal(struct io_ev_fd *ev_fd)
+{
+        if (eventfd_signal_allowed()) {
+                eventfd_signal_mask(ev_fd->cq_ev_fd, EPOLL_URING_WAKE);
+                return true;
+        }
+        if (!atomic_fetch_or(BIT(IO_EVENTFD_OP_SIGNAL_BIT), &ev_fd->ops)) {
+                call_rcu_hurry(&ev_fd->rcu, io_eventfd_do_signal);
+                return false;
+        }
+        return true;
+}
+
 void io_eventfd_signal(struct io_ring_ctx *ctx)
 {
         struct io_ev_fd *ev_fd = NULL;
@@ -73,16 +89,8 @@ void io_eventfd_signal(struct io_ring_ctx *ctx)
                 return;
         if (!refcount_inc_not_zero(&ev_fd->refs))
                 return;
-
-        if (likely(eventfd_signal_allowed())) {
-                eventfd_signal_mask(ev_fd->cq_ev_fd, EPOLL_URING_WAKE);
-        } else {
-                if (!atomic_fetch_or(BIT(IO_EVENTFD_OP_SIGNAL_BIT), &ev_fd->ops)) {
-                        call_rcu_hurry(&ev_fd->rcu, io_eventfd_do_signal);
-                        return;
-                }
-        }
-        io_eventfd_put(ev_fd);
+        if (__io_eventfd_signal(ev_fd))
+                io_eventfd_put(ev_fd);
 }
 
 void io_eventfd_flush_signal(struct io_ring_ctx *ctx)
-- 
2.51.0

From 3ca5a356041438534ecbb74159df91736238c6b1 Mon Sep 17 00:00:00 2001
From: Jens Axboe
Date: Sat, 21 Sep 2024 01:59:50 -0600
Subject: [PATCH 12/16] io_uring/eventfd: move trigger check into a helper

It's a bit hard to read what guards the triggering, move it into a
helper and add a comment explaining it too. This moves the
ev_fd == NULL check in there as well.

Link: https://lore.kernel.org/r/20240921080307.185186-5-axboe@kernel.dk
Signed-off-by: Jens Axboe
---
 io_uring/eventfd.c | 15 ++++++++++++---
 1 file changed, 12 insertions(+), 3 deletions(-)

diff --git a/io_uring/eventfd.c b/io_uring/eventfd.c
index 58e76f4d1e00..0946d3da88d3 100644
--- a/io_uring/eventfd.c
+++ b/io_uring/eventfd.c
@@ -63,6 +63,17 @@ static bool __io_eventfd_signal(struct io_ev_fd *ev_fd)
         return true;
 }
 
+/*
+ * Trigger if eventfd_async isn't set, or if it's set and the caller is
+ * an async worker. If ev_fd isn't valid, obviously return false.
+ */
+static bool io_eventfd_trigger(struct io_ev_fd *ev_fd)
+{
+        if (ev_fd)
+                return !ev_fd->eventfd_async || io_wq_current_is_worker();
+        return false;
+}
+
 void io_eventfd_signal(struct io_ring_ctx *ctx)
 {
         struct io_ev_fd *ev_fd = NULL;
@@ -83,9 +94,7 @@ void io_eventfd_signal(struct io_ring_ctx *ctx)
          * completed between the NULL check of ctx->io_ev_fd at the start of
          * the function and rcu_read_lock.
          */
-        if (unlikely(!ev_fd))
-                return;
-        if (ev_fd->eventfd_async && !io_wq_current_is_worker())
+        if (!io_eventfd_trigger(ev_fd))
                 return;
         if (!refcount_inc_not_zero(&ev_fd->refs))
                 return;
-- 
2.51.0

From 83a4f865e273b83426eafdd3aa51334cc21ac0fd Mon Sep 17 00:00:00 2001
From: Jens Axboe
Date: Sat, 21 Sep 2024 01:59:51 -0600
Subject: [PATCH 13/16] io_uring/eventfd: abstract out ev_fd grab + release
 helpers

In preparation for needing the ev_fd grabbing (and releasing) from
another path, abstract out two helpers for that.

Link: https://lore.kernel.org/r/20240921080307.185186-6-axboe@kernel.dk
Signed-off-by: Jens Axboe
---
 io_uring/eventfd.c | 41 ++++++++++++++++++++++++++++++-----------
 1 file changed, 30 insertions(+), 11 deletions(-)

diff --git a/io_uring/eventfd.c b/io_uring/eventfd.c
index 0946d3da88d3..d1fdecd0c458 100644
--- a/io_uring/eventfd.c
+++ b/io_uring/eventfd.c
@@ -47,6 +47,13 @@ static void io_eventfd_put(struct io_ev_fd *ev_fd)
         call_rcu(&ev_fd->rcu, io_eventfd_free);
 }
 
+static void io_eventfd_release(struct io_ev_fd *ev_fd, bool put_ref)
+{
+        if (put_ref)
+                io_eventfd_put(ev_fd);
+        rcu_read_unlock();
+}
+
 /*
  * Returns true if the caller should put the ev_fd reference, false if not.
  */
@@ -74,14 +81,18 @@ static bool io_eventfd_trigger(struct io_ev_fd *ev_fd)
         return false;
 }
 
-void io_eventfd_signal(struct io_ring_ctx *ctx)
+/*
+ * On success, returns with an ev_fd reference grabbed and the RCU read
+ * lock held.
+ */
+static struct io_ev_fd *io_eventfd_grab(struct io_ring_ctx *ctx)
 {
-        struct io_ev_fd *ev_fd = NULL;
+        struct io_ev_fd *ev_fd;
 
         if (READ_ONCE(ctx->rings->cq_flags) & IORING_CQ_EVENTFD_DISABLED)
-                return;
+                return NULL;
 
-        guard(rcu)();
+        rcu_read_lock();
 
         /*
          * rcu_dereference ctx->io_ev_fd once and use it for both for checking
@@ -90,16 +101,24 @@ void io_eventfd_signal(struct io_ring_ctx *ctx)
         ev_fd = rcu_dereference(ctx->io_ev_fd);
 
         /*
-         * Check again if ev_fd exists incase an io_eventfd_unregister call
+         * Check again if ev_fd exists in case an io_eventfd_unregister call
          * completed between the NULL check of ctx->io_ev_fd at the start of
         * the function and rcu_read_lock.
          */
-        if (!io_eventfd_trigger(ev_fd))
-                return;
-        if (!refcount_inc_not_zero(&ev_fd->refs))
-                return;
-        if (__io_eventfd_signal(ev_fd))
-                io_eventfd_put(ev_fd);
+        if (io_eventfd_trigger(ev_fd) && refcount_inc_not_zero(&ev_fd->refs))
+                return ev_fd;
+
+        rcu_read_unlock();
+        return NULL;
+}
+
+void io_eventfd_signal(struct io_ring_ctx *ctx)
+{
+        struct io_ev_fd *ev_fd;
+
+        ev_fd = io_eventfd_grab(ctx);
+        if (ev_fd)
+                io_eventfd_release(ev_fd, __io_eventfd_signal(ev_fd));
 }
 
 void io_eventfd_flush_signal(struct io_ring_ctx *ctx)
-- 
2.51.0

From f4bb2f65bb8154c1a2c2d7e01db0c98dffb5918f Mon Sep 17 00:00:00 2001
From: Jens Axboe
Date: Sat, 21 Sep 2024 01:59:52 -0600
Subject: [PATCH 14/16] io_uring/eventfd: move ctx->evfd_last_cq_tail into
 io_ev_fd

Everything else about the io_uring eventfd support is nicely kept
private to that code, except the cached_cq_tail tracking. With
everything else in place, move io_eventfd_flush_signal() to using the
ev_fd grab+release helpers, which then enables the direct use of
io_ev_fd for this tracking too.

Link: https://lore.kernel.org/r/20240921080307.185186-7-axboe@kernel.dk
Signed-off-by: Jens Axboe
---
 io_uring/eventfd.c | 50 +++++++++++++++++++++++++++------------------
 1 file changed, 29 insertions(+), 21 deletions(-)

diff --git a/io_uring/eventfd.c b/io_uring/eventfd.c
index d1fdecd0c458..fab936d31ba8 100644
--- a/io_uring/eventfd.c
+++ b/io_uring/eventfd.c
@@ -13,10 +13,12 @@
 struct io_ev_fd {
         struct eventfd_ctx      *cq_ev_fd;
-        unsigned int            eventfd_async: 1;
-        struct rcu_head         rcu;
+        unsigned int            eventfd_async;
+        /* protected by ->completion_lock */
+        unsigned                last_cq_tail;
         refcount_t              refs;
         atomic_t                ops;
+        struct rcu_head         rcu;
 };
 
 enum {
@@ -123,25 +125,31 @@ void io_eventfd_signal(struct io_ring_ctx *ctx)
 
 void io_eventfd_flush_signal(struct io_ring_ctx *ctx)
 {
-        bool skip;
-
-        spin_lock(&ctx->completion_lock);
-
-        /*
-         * Eventfd should only get triggered when at least one event has been
-         * posted. Some applications rely on the eventfd notification count
-         * only changing IFF a new CQE has been added to the CQ ring. There's
-         * no depedency on 1:1 relationship between how many times this
-         * function is called (and hence the eventfd count) and number of CQEs
-         * posted to the CQ ring.
-         */
-        skip = ctx->cached_cq_tail == ctx->evfd_last_cq_tail;
-        ctx->evfd_last_cq_tail = ctx->cached_cq_tail;
-        spin_unlock(&ctx->completion_lock);
-        if (skip)
-                return;
+        struct io_ev_fd *ev_fd;
 
-        io_eventfd_signal(ctx);
+        ev_fd = io_eventfd_grab(ctx);
+        if (ev_fd) {
+                bool skip, put_ref = true;
+
+                /*
+                 * Eventfd should only get triggered when at least one event
+                 * has been posted. Some applications rely on the eventfd
+                 * notification count only changing IFF a new CQE has been
+                 * added to the CQ ring. There's no dependency on 1:1
+                 * relationship between how many times this function is called
+                 * (and hence the eventfd count) and number of CQEs posted to
+                 * the CQ ring.
+                 */
+                spin_lock(&ctx->completion_lock);
+                skip = ctx->cached_cq_tail == ev_fd->last_cq_tail;
+                ev_fd->last_cq_tail = ctx->cached_cq_tail;
+                spin_unlock(&ctx->completion_lock);
+
+                if (!skip)
+                        put_ref = __io_eventfd_signal(ev_fd);
+
+                io_eventfd_release(ev_fd, put_ref);
+        }
 }
 
 int io_eventfd_register(struct io_ring_ctx *ctx, void __user *arg,
@@ -172,7 +180,7 @@ int io_eventfd_register(struct io_ring_ctx *ctx, void __user *arg,
         }
 
         spin_lock(&ctx->completion_lock);
-        ctx->evfd_last_cq_tail = ctx->cached_cq_tail;
+        ev_fd->last_cq_tail = ctx->cached_cq_tail;
         spin_unlock(&ctx->completion_lock);
 
         ev_fd->eventfd_async = eventfd_async;
-- 
2.51.0

From 95d6c9229a04cc12d39034cd6be6446a55a85d6d Mon Sep 17 00:00:00 2001
From: Jens Axboe
Date: Tue, 24 Sep 2024 05:57:30 -0600
Subject: [PATCH 15/16] io_uring/msg_ring: refactor a few helper functions

Mostly just to skip having them take an io_kiocb, and instead just
pass in the ctx and io_msg directly. In preparation for being able to
issue a MSG_RING request without having an io_kiocb.

No functional changes in this patch.

Link: https://lore.kernel.org/r/20240924115932.116167-2-axboe@kernel.dk
Signed-off-by: Jens Axboe
---
 io_uring/msg_ring.c | 31 ++++++++++++++++++++-----------
 1 file changed, 20 insertions(+), 11 deletions(-)

diff --git a/io_uring/msg_ring.c b/io_uring/msg_ring.c
index 7fd9badcfaf8..b8c527f08cd5 100644
--- a/io_uring/msg_ring.c
+++ b/io_uring/msg_ring.c
@@ -116,14 +116,13 @@ static struct io_kiocb *io_msg_get_kiocb(struct io_ring_ctx *ctx)
         return kmem_cache_alloc(req_cachep, GFP_KERNEL | __GFP_NOWARN | __GFP_ZERO);
 }
 
-static int io_msg_data_remote(struct io_kiocb *req)
+static int io_msg_data_remote(struct io_ring_ctx *target_ctx,
+                              struct io_msg *msg)
 {
-        struct io_ring_ctx *target_ctx = req->file->private_data;
-        struct io_msg *msg = io_kiocb_to_cmd(req, struct io_msg);
         struct io_kiocb *target;
         u32 flags = 0;
 
-        target = io_msg_get_kiocb(req->ctx);
+        target = io_msg_get_kiocb(target_ctx);
         if (unlikely(!target))
                 return -ENOMEM;
 
@@ -134,10 +133,9 @@ static int io_msg_data_remote(struct io_ring_ctx *target_ctx,
                         msg->user_data);
 }
 
-static int io_msg_ring_data(struct io_kiocb *req, unsigned int issue_flags)
+static int __io_msg_ring_data(struct io_ring_ctx *target_ctx,
+                              struct io_msg *msg, unsigned int issue_flags)
 {
-        struct io_ring_ctx *target_ctx = req->file->private_data;
-        struct io_msg *msg = io_kiocb_to_cmd(req, struct io_msg);
         u32 flags = 0;
         int ret;
 
@@ -149,7 +147,7 @@ static int __io_msg_ring_data(struct io_ring_ctx *target_ctx,
                 return -EBADFD;
 
         if (io_msg_need_remote(target_ctx))
-                return io_msg_data_remote(req);
+                return io_msg_data_remote(target_ctx, msg);
 
         if (msg->flags & IORING_MSG_RING_FLAGS_PASS)
                 flags = msg->cqe_flags;
@@ -166,6 +164,14 @@ static int __io_msg_ring_data(struct io_ring_ctx *target_ctx,
         return ret;
 }
 
+static int io_msg_ring_data(struct io_kiocb *req, unsigned int issue_flags)
+{
+        struct io_ring_ctx *target_ctx = req->file->private_data;
+        struct io_msg *msg = io_kiocb_to_cmd(req, struct io_msg);
+
+        return __io_msg_ring_data(target_ctx, msg, issue_flags);
+}
+
 static struct file *io_msg_grab_file(struct io_kiocb *req, unsigned int issue_flags)
 {
         struct io_msg *msg = io_kiocb_to_cmd(req, struct io_msg);
@@ -271,10 +277,8 @@ static int io_msg_send_fd(struct io_kiocb *req, unsigned int issue_flags)
         return io_msg_install_complete(req, issue_flags);
 }
 
-int io_msg_ring_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
+static int __io_msg_ring_prep(struct io_msg *msg, const struct io_uring_sqe *sqe)
 {
-        struct io_msg *msg = io_kiocb_to_cmd(req, struct io_msg);
-
         if (unlikely(sqe->buf_index || sqe->personality))
                 return -EINVAL;
 
@@ -291,6 +295,11 @@ static int __io_msg_ring_prep(struct io_msg *msg, const struct io_uring_sqe *sqe)
         return 0;
 }
 
+int io_msg_ring_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
+{
+        return __io_msg_ring_prep(io_kiocb_to_cmd(req, struct io_msg), sqe);
+}
+
 int io_msg_ring(struct io_kiocb *req, unsigned int issue_flags)
 {
         struct io_msg *msg = io_kiocb_to_cmd(req, struct io_msg);
-- 
2.51.0

From a377132154ab8404dafcc52e8bc0c73050a954c2 Mon Sep 17 00:00:00 2001
From: Jens Axboe
Date: Tue, 24 Sep 2024 05:57:31 -0600
Subject: [PATCH 16/16] io_uring/msg_ring: add support for sending a sync
 message

Normally MSG_RING requires both a source and a destination ring. But
some users don't always have a ring available to send a message from,
yet they still need to notify a target ring.

Add support for using io_uring_register(2) without having a source
ring, using a file descriptor of -1 for that. Internally those are
called blind registration opcodes. Implement
IORING_REGISTER_SEND_MSG_RING as a blind opcode, which simply takes an
sqe that the application can put on the stack and use the normal
liburing helpers to initialize it. Then the app can call:

io_uring_register(-1, IORING_REGISTER_SEND_MSG_RING, &sqe, 1);

and get the same behavior in terms of the target, where a CQE is
posted with the details given in the sqe.

For now this takes a single sqe pointer argument, and hence arg must
be set to that, and nr_args must be 1. Could easily be extended to
take an array of sqes, but for now let's keep it simple.
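
Put together, a ring-less sender could look like the sketch below
(liburing is used only to fill in the sqe via io_uring_prep_msg_ring();
the raw syscall is used since libc has no wrapper, and the opcode only
exists with this patch applied):

  #include <liburing.h>
  #include <string.h>
  #include <sys/syscall.h>
  #include <unistd.h>

  /* Post a CQE (res = len, user_data = data) to the ring behind
   * target_fd without owning a source ring ourselves.
   */
  static int send_msg_no_ring(int target_fd, unsigned int len, __u64 data)
  {
          struct io_uring_sqe sqe;

          memset(&sqe, 0, sizeof(sqe));
          io_uring_prep_msg_ring(&sqe, target_fd, len, data, 0);

          /* fd == -1 selects the blind registration path */
          return syscall(__NR_io_uring_register, -1,
                         IORING_REGISTER_SEND_MSG_RING, &sqe, 1);
  }
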
Link: https://lore.kernel.org/r/20240924115932.116167-3-axboe@kernel.dk
Signed-off-by: Jens Axboe
---
 include/uapi/linux/io_uring.h |  3 +++
 io_uring/msg_ring.c           | 29 +++++++++++++++++++++++++++++
 io_uring/msg_ring.h           |  1 +
 io_uring/register.c           | 30 ++++++++++++++++++++++++++++++
 4 files changed, 63 insertions(+)

diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index 1fe79e750470..86cb385fe0b5 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -612,6 +612,9 @@ enum io_uring_register_op {
         /* clone registered buffers from source ring to current ring */
         IORING_REGISTER_CLONE_BUFFERS           = 30,
 
+        /* send MSG_RING without having a ring */
+        IORING_REGISTER_SEND_MSG_RING           = 31,
+
         /* this goes last */
         IORING_REGISTER_LAST,
 
diff --git a/io_uring/msg_ring.c b/io_uring/msg_ring.c
index b8c527f08cd5..edea1ffd501c 100644
--- a/io_uring/msg_ring.c
+++ b/io_uring/msg_ring.c
@@ -331,6 +331,35 @@ done:
         return IOU_OK;
 }
 
+int io_uring_sync_msg_ring(struct io_uring_sqe *sqe)
+{
+        struct io_msg io_msg = { };
+        struct fd f;
+        int ret;
+
+        ret = __io_msg_ring_prep(&io_msg, sqe);
+        if (unlikely(ret))
+                return ret;
+
+        /*
+         * Only data sending supported, not IORING_MSG_SEND_FD as that one
+         * doesn't make sense without a source ring to send files from.
+         */
+        if (io_msg.cmd != IORING_MSG_DATA)
+                return -EINVAL;
+
+        ret = -EBADF;
+        f = fdget(sqe->fd);
+        if (fd_file(f)) {
+                ret = -EBADFD;
+                if (io_is_uring_fops(fd_file(f)))
+                        ret = __io_msg_ring_data(fd_file(f)->private_data,
+                                                 &io_msg, IO_URING_F_UNLOCKED);
+                fdput(f);
+        }
+        return ret;
+}
+
 void io_msg_cache_free(const void *entry)
 {
         struct io_kiocb *req = (struct io_kiocb *) entry;
diff --git a/io_uring/msg_ring.h b/io_uring/msg_ring.h
index 3030f3942f0f..38e7f8f0c944 100644
--- a/io_uring/msg_ring.h
+++ b/io_uring/msg_ring.h
@@ -1,5 +1,6 @@
 // SPDX-License-Identifier: GPL-2.0
 
+int io_uring_sync_msg_ring(struct io_uring_sqe *sqe);
 int io_msg_ring_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe);
 int io_msg_ring(struct io_kiocb *req, unsigned int issue_flags);
 void io_msg_ring_cleanup(struct io_kiocb *req);
diff --git a/io_uring/register.c b/io_uring/register.c
index eca26d4884d9..52b2f9b74af8 100644
--- a/io_uring/register.c
+++ b/io_uring/register.c
@@ -28,6 +28,7 @@
 #include "kbuf.h"
 #include "napi.h"
 #include "eventfd.h"
+#include "msg_ring.h"
 
 #define IORING_MAX_RESTRICTIONS  (IORING_RESTRICTION_LAST + \
                                   IORING_REGISTER_LAST + IORING_OP_LAST)
@@ -588,6 +589,32 @@ struct file *io_uring_register_get_file(unsigned int fd, bool registered)
         return ERR_PTR(-EOPNOTSUPP);
 }
 
+/*
+ * "blind" registration opcodes are ones where there's no ring given, and
+ * hence the source fd must be -1.
+ */
+static int io_uring_register_blind(unsigned int opcode, void __user *arg,
+                                   unsigned int nr_args)
+{
+        switch (opcode) {
+        case IORING_REGISTER_SEND_MSG_RING: {
+                struct io_uring_sqe sqe;
+
+                if (!arg || nr_args != 1)
+                        return -EINVAL;
+                if (copy_from_user(&sqe, arg, sizeof(sqe)))
+                        return -EFAULT;
+                /* no flags supported */
+                if (sqe.flags)
+                        return -EINVAL;
+                if (sqe.opcode == IORING_OP_MSG_RING)
+                        return io_uring_sync_msg_ring(&sqe);
+                }
+        }
+
+        return -EINVAL;
+}
+
 SYSCALL_DEFINE4(io_uring_register, unsigned int, fd, unsigned int, opcode,
                 void __user *, arg, unsigned int, nr_args)
 {
@@ -602,6 +629,9 @@ SYSCALL_DEFINE4(io_uring_register, unsigned int, fd, unsigned int, opcode,
         if (opcode >= IORING_REGISTER_LAST)
                 return -EINVAL;
 
+        if (fd == -1)
+                return io_uring_register_blind(opcode, arg, nr_args);
+
         file = io_uring_register_get_file(fd, use_registered_ring);
         if (IS_ERR(file))
                 return PTR_ERR(file);
-- 
2.51.0