From a07d2d7930c75e6bf88683b376d09ab1f3fed2aa Mon Sep 17 00:00:00 2001 From: Bernd Schubert Date: Tue, 3 Dec 2024 11:31:05 +0100 Subject: [PATCH 01/16] io_uring: Change res2 parameter type in io_uring_cmd_done Change the type of the res2 parameter in io_uring_cmd_done from ssize_t to u64. This aligns the parameter type with io_req_set_cqe32_extra, which expects u64 arguments. The change eliminates potential issues on 32-bit architectures where ssize_t might be 32-bit. Only user of passing res2 is drivers/nvme/host/ioctl.c and it actually passes u64. Fixes: ee692a21e9bf ("fs,io_uring: add infrastructure for uring-cmd") Cc: stable@vger.kernel.org Reviewed-by: Kanchan Joshi Tested-by: Li Zetao Reviewed-by: Li Zetao Signed-off-by: Bernd Schubert Link: https://lore.kernel.org/r/20241203-io_uring_cmd_done-res2-as-u64-v2-1-5e59ae617151@ddn.com Signed-off-by: Jens Axboe --- include/linux/io_uring/cmd.h | 4 ++-- io_uring/uring_cmd.c | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/include/linux/io_uring/cmd.h b/include/linux/io_uring/cmd.h index 578a3fdf5c71..0d5448c0b86c 100644 --- a/include/linux/io_uring/cmd.h +++ b/include/linux/io_uring/cmd.h @@ -43,7 +43,7 @@ int io_uring_cmd_import_fixed(u64 ubuf, unsigned long len, int rw, * Note: the caller should never hard code @issue_flags and is only allowed * to pass the mask provided by the core io_uring code. */ -void io_uring_cmd_done(struct io_uring_cmd *cmd, ssize_t ret, ssize_t res2, +void io_uring_cmd_done(struct io_uring_cmd *cmd, ssize_t ret, u64 res2, unsigned issue_flags); void __io_uring_cmd_do_in_task(struct io_uring_cmd *ioucmd, @@ -67,7 +67,7 @@ static inline int io_uring_cmd_import_fixed(u64 ubuf, unsigned long len, int rw, return -EOPNOTSUPP; } static inline void io_uring_cmd_done(struct io_uring_cmd *cmd, ssize_t ret, - ssize_t ret2, unsigned issue_flags) + u64 ret2, unsigned issue_flags) { } static inline void __io_uring_cmd_do_in_task(struct io_uring_cmd *ioucmd, diff --git a/io_uring/uring_cmd.c b/io_uring/uring_cmd.c index d9fb2143f56f..af842e9b4eb9 100644 --- a/io_uring/uring_cmd.c +++ b/io_uring/uring_cmd.c @@ -151,7 +151,7 @@ static inline void io_req_set_cqe32_extra(struct io_kiocb *req, * Called by consumers of io_uring_cmd, if they originally returned * -EIOCBQUEUED upon receiving the command. */ -void io_uring_cmd_done(struct io_uring_cmd *ioucmd, ssize_t ret, ssize_t res2, +void io_uring_cmd_done(struct io_uring_cmd *ioucmd, ssize_t ret, u64 res2, unsigned issue_flags) { struct io_kiocb *req = cmd_to_io_kiocb(ioucmd); -- 2.51.0 From 99d6af6e8a22b792e1845b186f943cd10bb4b7b0 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 12 Dec 2024 08:01:52 -0700 Subject: [PATCH 02/16] io_uring/rsrc: don't put/free empty buffers If cloning of buffers fail and we have to put the ones already grabbed, check for NULL buffers and skip those. They used to be dummy ubufs, but now they are just NULL and that should be checked before reaping them. 
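As an aside, the hunk below follows the usual unwind pattern for a partially populated table: walk backwards and skip slots that were never filled. A minimal user-space sketch of that pattern (plain C, purely illustrative, not the kernel code; the caller is assumed to pass a zero-initialized array):

#include <stdlib.h>

struct node { void *data; };

static int populate(struct node **nodes, int nr)
{
        int i;

        for (i = 0; i < nr; i++) {
                if (i & 1)              /* pretend every other slot stays empty */
                        continue;
                nodes[i] = calloc(1, sizeof(**nodes));
                if (!nodes[i])
                        goto err;
        }
        return 0;
err:
        /* only reap slots that were actually set up */
        while (i--) {
                if (nodes[i])
                        free(nodes[i]);
        }
        return -1;
}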
Reported-by: chase xd Link: https://lore.kernel.org/io-uring/CADZouDQ7TcKn8gz8_efnyAEp1JvU1ktRk8PWz-tO0FXUoh8VGQ@mail.gmail.com/ Fixes: d50f94d761a5 ("io_uring/rsrc: get rid of the empty node and dummy_ubuf") Signed-off-by: Jens Axboe --- io_uring/rsrc.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c index adaae8630932..077f84684c18 100644 --- a/io_uring/rsrc.c +++ b/io_uring/rsrc.c @@ -1036,8 +1036,10 @@ static int io_clone_buffers(struct io_ring_ctx *ctx, struct io_ring_ctx *src_ctx out_put_free: i = data.nr; while (i--) { - io_buffer_unmap(src_ctx, data.nodes[i]); - kfree(data.nodes[i]); + if (data.nodes[i]) { + io_buffer_unmap(src_ctx, data.nodes[i]); + kfree(data.nodes[i]); + } } out_unlock: io_rsrc_data_free(ctx, &data); -- 2.51.0 From 020b40f3562495f3c703a283ece145ffec19e82d Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 17 Dec 2024 08:21:46 -0700 Subject: [PATCH 03/16] io_uring: make ctx->timeout_lock a raw spinlock Chase reports that their tester complaints about a locking context mismatch: ============================= [ BUG: Invalid wait context ] 6.13.0-rc1-gf137f14b7ccb-dirty #9 Not tainted ----------------------------- syz.1.25198/182604 is trying to lock: ffff88805e66a358 (&ctx->timeout_lock){-.-.}-{3:3}, at: spin_lock_irq include/linux/spinlock.h:376 [inline] ffff88805e66a358 (&ctx->timeout_lock){-.-.}-{3:3}, at: io_match_task_safe io_uring/io_uring.c:218 [inline] ffff88805e66a358 (&ctx->timeout_lock){-.-.}-{3:3}, at: io_match_task_safe+0x187/0x250 io_uring/io_uring.c:204 other info that might help us debug this: context-{5:5} 1 lock held by syz.1.25198/182604: #0: ffff88802b7d48c0 (&acct->lock){+.+.}-{2:2}, at: io_acct_cancel_pending_work+0x2d/0x6b0 io_uring/io-wq.c:1049 stack backtrace: CPU: 0 UID: 0 PID: 182604 Comm: syz.1.25198 Not tainted 6.13.0-rc1-gf137f14b7ccb-dirty #9 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.15.0-1 04/01/2014 Call Trace: __dump_stack lib/dump_stack.c:94 [inline] dump_stack_lvl+0x82/0xd0 lib/dump_stack.c:120 print_lock_invalid_wait_context kernel/locking/lockdep.c:4826 [inline] check_wait_context kernel/locking/lockdep.c:4898 [inline] __lock_acquire+0x883/0x3c80 kernel/locking/lockdep.c:5176 lock_acquire.part.0+0x11b/0x370 kernel/locking/lockdep.c:5849 __raw_spin_lock_irq include/linux/spinlock_api_smp.h:119 [inline] _raw_spin_lock_irq+0x36/0x50 kernel/locking/spinlock.c:170 spin_lock_irq include/linux/spinlock.h:376 [inline] io_match_task_safe io_uring/io_uring.c:218 [inline] io_match_task_safe+0x187/0x250 io_uring/io_uring.c:204 io_acct_cancel_pending_work+0xb8/0x6b0 io_uring/io-wq.c:1052 io_wq_cancel_pending_work io_uring/io-wq.c:1074 [inline] io_wq_cancel_cb+0xb0/0x390 io_uring/io-wq.c:1112 io_uring_try_cancel_requests+0x15e/0xd70 io_uring/io_uring.c:3062 io_uring_cancel_generic+0x6ec/0x8c0 io_uring/io_uring.c:3140 io_uring_files_cancel include/linux/io_uring.h:20 [inline] do_exit+0x494/0x27a0 kernel/exit.c:894 do_group_exit+0xb3/0x250 kernel/exit.c:1087 get_signal+0x1d77/0x1ef0 kernel/signal.c:3017 arch_do_signal_or_restart+0x79/0x5b0 arch/x86/kernel/signal.c:337 exit_to_user_mode_loop kernel/entry/common.c:111 [inline] exit_to_user_mode_prepare include/linux/entry-common.h:329 [inline] __syscall_exit_to_user_mode_work kernel/entry/common.c:207 [inline] syscall_exit_to_user_mode+0x150/0x2a0 kernel/entry/common.c:218 do_syscall_64+0xd8/0x250 arch/x86/entry/common.c:89 entry_SYSCALL_64_after_hwframe+0x77/0x7f which is because io_uring has ctx->timeout_lock 
nesting inside the io-wq acct lock, the latter of which is used from inside the scheduler and hence is a raw spinlock, while the former is a "normal" spinlock and can hence be sleeping on PREEMPT_RT. Change ctx->timeout_lock to be a raw spinlock to solve this nesting dependency on PREEMPT_RT=y. Reported-by: chase xd Signed-off-by: Jens Axboe --- include/linux/io_uring_types.h | 2 +- io_uring/io_uring.c | 10 ++++----- io_uring/timeout.c | 40 +++++++++++++++++----------------- 3 files changed, 26 insertions(+), 26 deletions(-) diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h index 011860ade268..fd4cdb0860a2 100644 --- a/include/linux/io_uring_types.h +++ b/include/linux/io_uring_types.h @@ -345,7 +345,7 @@ struct io_ring_ctx { /* timeouts */ struct { - spinlock_t timeout_lock; + raw_spinlock_t timeout_lock; struct list_head timeout_list; struct list_head ltimeout_list; unsigned cq_last_tm_flush; diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 06ff41484e29..605625e932eb 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -215,9 +215,9 @@ bool io_match_task_safe(struct io_kiocb *head, struct io_uring_task *tctx, struct io_ring_ctx *ctx = head->ctx; /* protect against races with linked timeouts */ - spin_lock_irq(&ctx->timeout_lock); + raw_spin_lock_irq(&ctx->timeout_lock); matched = io_match_linked(head); - spin_unlock_irq(&ctx->timeout_lock); + raw_spin_unlock_irq(&ctx->timeout_lock); } else { matched = io_match_linked(head); } @@ -333,7 +333,7 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p) init_waitqueue_head(&ctx->cq_wait); init_waitqueue_head(&ctx->poll_wq); spin_lock_init(&ctx->completion_lock); - spin_lock_init(&ctx->timeout_lock); + raw_spin_lock_init(&ctx->timeout_lock); INIT_WQ_LIST(&ctx->iopoll_list); INIT_LIST_HEAD(&ctx->io_buffers_comp); INIT_LIST_HEAD(&ctx->defer_list); @@ -498,10 +498,10 @@ static void io_prep_async_link(struct io_kiocb *req) if (req->flags & REQ_F_LINK_TIMEOUT) { struct io_ring_ctx *ctx = req->ctx; - spin_lock_irq(&ctx->timeout_lock); + raw_spin_lock_irq(&ctx->timeout_lock); io_for_each_link(cur, req) io_prep_async_work(cur); - spin_unlock_irq(&ctx->timeout_lock); + raw_spin_unlock_irq(&ctx->timeout_lock); } else { io_for_each_link(cur, req) io_prep_async_work(cur); diff --git a/io_uring/timeout.c b/io_uring/timeout.c index f3d502717aeb..bbe58638eca7 100644 --- a/io_uring/timeout.c +++ b/io_uring/timeout.c @@ -74,10 +74,10 @@ static void io_timeout_complete(struct io_kiocb *req, struct io_tw_state *ts) if (!io_timeout_finish(timeout, data)) { if (io_req_post_cqe(req, -ETIME, IORING_CQE_F_MORE)) { /* re-arm timer */ - spin_lock_irq(&ctx->timeout_lock); + raw_spin_lock_irq(&ctx->timeout_lock); list_add(&timeout->list, ctx->timeout_list.prev); hrtimer_start(&data->timer, timespec64_to_ktime(data->ts), data->mode); - spin_unlock_irq(&ctx->timeout_lock); + raw_spin_unlock_irq(&ctx->timeout_lock); return; } } @@ -109,7 +109,7 @@ __cold void io_flush_timeouts(struct io_ring_ctx *ctx) u32 seq; struct io_timeout *timeout, *tmp; - spin_lock_irq(&ctx->timeout_lock); + raw_spin_lock_irq(&ctx->timeout_lock); seq = ctx->cached_cq_tail - atomic_read(&ctx->cq_timeouts); list_for_each_entry_safe(timeout, tmp, &ctx->timeout_list, list) { @@ -134,7 +134,7 @@ __cold void io_flush_timeouts(struct io_ring_ctx *ctx) io_kill_timeout(req, 0); } ctx->cq_last_tm_flush = seq; - spin_unlock_irq(&ctx->timeout_lock); + raw_spin_unlock_irq(&ctx->timeout_lock); } static void io_req_tw_fail_links(struct 
io_kiocb *link, struct io_tw_state *ts) @@ -200,9 +200,9 @@ void io_disarm_next(struct io_kiocb *req) } else if (req->flags & REQ_F_LINK_TIMEOUT) { struct io_ring_ctx *ctx = req->ctx; - spin_lock_irq(&ctx->timeout_lock); + raw_spin_lock_irq(&ctx->timeout_lock); link = io_disarm_linked_timeout(req); - spin_unlock_irq(&ctx->timeout_lock); + raw_spin_unlock_irq(&ctx->timeout_lock); if (link) io_req_queue_tw_complete(link, -ECANCELED); } @@ -238,11 +238,11 @@ static enum hrtimer_restart io_timeout_fn(struct hrtimer *timer) struct io_ring_ctx *ctx = req->ctx; unsigned long flags; - spin_lock_irqsave(&ctx->timeout_lock, flags); + raw_spin_lock_irqsave(&ctx->timeout_lock, flags); list_del_init(&timeout->list); atomic_set(&req->ctx->cq_timeouts, atomic_read(&req->ctx->cq_timeouts) + 1); - spin_unlock_irqrestore(&ctx->timeout_lock, flags); + raw_spin_unlock_irqrestore(&ctx->timeout_lock, flags); if (!(data->flags & IORING_TIMEOUT_ETIME_SUCCESS)) req_set_fail(req); @@ -285,9 +285,9 @@ int io_timeout_cancel(struct io_ring_ctx *ctx, struct io_cancel_data *cd) { struct io_kiocb *req; - spin_lock_irq(&ctx->timeout_lock); + raw_spin_lock_irq(&ctx->timeout_lock); req = io_timeout_extract(ctx, cd); - spin_unlock_irq(&ctx->timeout_lock); + raw_spin_unlock_irq(&ctx->timeout_lock); if (IS_ERR(req)) return PTR_ERR(req); @@ -330,7 +330,7 @@ static enum hrtimer_restart io_link_timeout_fn(struct hrtimer *timer) struct io_ring_ctx *ctx = req->ctx; unsigned long flags; - spin_lock_irqsave(&ctx->timeout_lock, flags); + raw_spin_lock_irqsave(&ctx->timeout_lock, flags); prev = timeout->head; timeout->head = NULL; @@ -345,7 +345,7 @@ static enum hrtimer_restart io_link_timeout_fn(struct hrtimer *timer) } list_del(&timeout->list); timeout->prev = prev; - spin_unlock_irqrestore(&ctx->timeout_lock, flags); + raw_spin_unlock_irqrestore(&ctx->timeout_lock, flags); req->io_task_work.func = io_req_task_link_timeout; io_req_task_work_add(req); @@ -472,12 +472,12 @@ int io_timeout_remove(struct io_kiocb *req, unsigned int issue_flags) } else { enum hrtimer_mode mode = io_translate_timeout_mode(tr->flags); - spin_lock_irq(&ctx->timeout_lock); + raw_spin_lock_irq(&ctx->timeout_lock); if (tr->ltimeout) ret = io_linked_timeout_update(ctx, tr->addr, &tr->ts, mode); else ret = io_timeout_update(ctx, tr->addr, &tr->ts, mode); - spin_unlock_irq(&ctx->timeout_lock); + raw_spin_unlock_irq(&ctx->timeout_lock); } if (ret < 0) @@ -572,7 +572,7 @@ int io_timeout(struct io_kiocb *req, unsigned int issue_flags) struct list_head *entry; u32 tail, off = timeout->off; - spin_lock_irq(&ctx->timeout_lock); + raw_spin_lock_irq(&ctx->timeout_lock); /* * sqe->off holds how many events that need to occur for this @@ -611,7 +611,7 @@ add: list_add(&timeout->list, entry); data->timer.function = io_timeout_fn; hrtimer_start(&data->timer, timespec64_to_ktime(data->ts), data->mode); - spin_unlock_irq(&ctx->timeout_lock); + raw_spin_unlock_irq(&ctx->timeout_lock); return IOU_ISSUE_SKIP_COMPLETE; } @@ -620,7 +620,7 @@ void io_queue_linked_timeout(struct io_kiocb *req) struct io_timeout *timeout = io_kiocb_to_cmd(req, struct io_timeout); struct io_ring_ctx *ctx = req->ctx; - spin_lock_irq(&ctx->timeout_lock); + raw_spin_lock_irq(&ctx->timeout_lock); /* * If the back reference is NULL, then our linked request finished * before we got a chance to setup the timer @@ -633,7 +633,7 @@ void io_queue_linked_timeout(struct io_kiocb *req) data->mode); list_add_tail(&timeout->list, &ctx->ltimeout_list); } - spin_unlock_irq(&ctx->timeout_lock); + 
raw_spin_unlock_irq(&ctx->timeout_lock); /* drop submission reference */ io_put_req(req); } @@ -668,7 +668,7 @@ __cold bool io_kill_timeouts(struct io_ring_ctx *ctx, struct io_uring_task *tctx * timeout_lockfirst to keep locking ordering. */ spin_lock(&ctx->completion_lock); - spin_lock_irq(&ctx->timeout_lock); + raw_spin_lock_irq(&ctx->timeout_lock); list_for_each_entry_safe(timeout, tmp, &ctx->timeout_list, list) { struct io_kiocb *req = cmd_to_io_kiocb(timeout); @@ -676,7 +676,7 @@ __cold bool io_kill_timeouts(struct io_ring_ctx *ctx, struct io_uring_task *tctx io_kill_timeout(req, -ECANCELED)) canceled++; } - spin_unlock_irq(&ctx->timeout_lock); + raw_spin_unlock_irq(&ctx->timeout_lock); spin_unlock(&ctx->completion_lock); return canceled != 0; } -- 2.51.0 From 12d908116f7efd34f255a482b9afc729d7a5fb78 Mon Sep 17 00:00:00 2001 From: Jann Horn Date: Wed, 18 Dec 2024 17:56:25 +0100 Subject: [PATCH 04/16] io_uring: Fix registered ring file refcount leak Currently, io_uring_unreg_ringfd() (which cleans up registered rings) is only called on exit, but __io_uring_free (which frees the tctx in which the registered ring pointers are stored) is also called on execve (via begin_new_exec -> io_uring_task_cancel -> __io_uring_cancel -> io_uring_cancel_generic -> __io_uring_free). This means: A process going through execve while having registered rings will leak references to the rings' `struct file`. Fix it by zapping registered rings on execve(). This is implemented by moving the io_uring_unreg_ringfd() from io_uring_files_cancel() into its callee __io_uring_cancel(), which is called from io_uring_task_cancel() on execve. This could probably be exploited *on 32-bit kernels* by leaking 2^32 references to the same ring, because the file refcount is stored in a pointer-sized field and get_file() doesn't have protection against refcount overflow, just a WARN_ONCE(); but on 64-bit it should have no impact beyond a memory leak. Cc: stable@vger.kernel.org Fixes: e7a6c00dc77a ("io_uring: add support for registering ring file descriptors") Signed-off-by: Jann Horn Link: https://lore.kernel.org/r/20241218-uring-reg-ring-cleanup-v1-1-8f63e999045b@google.com Signed-off-by: Jens Axboe --- include/linux/io_uring.h | 4 +--- io_uring/io_uring.c | 1 + 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/include/linux/io_uring.h b/include/linux/io_uring.h index e123d5e17b52..85fe4e6b275c 100644 --- a/include/linux/io_uring.h +++ b/include/linux/io_uring.h @@ -15,10 +15,8 @@ bool io_is_uring_fops(struct file *file); static inline void io_uring_files_cancel(void) { - if (current->io_uring) { - io_uring_unreg_ringfd(); + if (current->io_uring) __io_uring_cancel(false); - } } static inline void io_uring_task_cancel(void) { diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 605625e932eb..432b95ca9c85 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -3214,6 +3214,7 @@ end_wait: void __io_uring_cancel(bool cancel_all) { + io_uring_unreg_ringfd(); io_uring_cancel_generic(cancel_all, NULL); } -- 2.51.0 From c261e4f1dd29fabab54b325bc1da8769a3998be1 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 19 Dec 2024 09:32:26 -0700 Subject: [PATCH 05/16] io_uring/register: limit ring resizing to DEFER_TASKRUN With DEFER_TASKRUN, we know the ring can't be both waited upon and resized at the same time. This is important for CQ resizing. Allowing SQ ring resizing is more trivial, but isn't the interesting use case. Hence limit ring resizing in general to DEFER_TASKRUN only for now. 
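The user-visible requirement can be sketched with liburing (a hedged sketch assuming a kernel that supports IORING_REGISTER_RESIZE_RINGS; the resize registration itself is left out): a ring that wants to stay resizable now has to be created with DEFER_TASKRUN, which in turn requires SINGLE_ISSUER, otherwise the resize opcode fails with -EINVAL.

#include <liburing.h>

/* Create a ring that remains eligible for IORING_REGISTER_RESIZE_RINGS:
 * DEFER_TASKRUN (which itself requires SINGLE_ISSUER) is now mandatory. */
static int setup_resizable_ring(struct io_uring *ring, unsigned entries)
{
        struct io_uring_params p = { };

        p.flags = IORING_SETUP_SINGLE_ISSUER | IORING_SETUP_DEFER_TASKRUN;
        return io_uring_queue_init_params(entries, ring, &p);
}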
This isn't a huge problem as CQ ring resizing is generally the most useful on networking type of workloads where it can be hard to size the ring appropriately upfront, and those should be using DEFER_TASKRUN for better performance. Fixes: 79cfe9e59c2a ("io_uring/register: add IORING_REGISTER_RESIZE_RINGS") Signed-off-by: Jens Axboe --- io_uring/register.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/io_uring/register.c b/io_uring/register.c index 1e99c783abdf..fdd44914c39c 100644 --- a/io_uring/register.c +++ b/io_uring/register.c @@ -414,6 +414,9 @@ static int io_register_resize_rings(struct io_ring_ctx *ctx, void __user *arg) if (ctx->flags & IORING_SETUP_SINGLE_ISSUER && current != ctx->submitter_task) return -EEXIST; + /* limited to DEFER_TASKRUN for now */ + if (!(ctx->flags & IORING_SETUP_DEFER_TASKRUN)) + return -EINVAL; if (copy_from_user(&p, arg, sizeof(p))) return -EFAULT; if (p.flags & ~RESIZE_FLAGS) -- 2.51.0 From dbd2ca9367eb19bc5e269b8c58b0b1514ada9156 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 19 Dec 2024 19:52:58 +0000 Subject: [PATCH 06/16] io_uring: check if iowq is killed before queuing task work can be executed after the task has gone through io_uring termination, whether it's the final task_work run or the fallback path. In this case, task work will find ->io_wq being already killed and null'ed, which is a problem if it then tries to forward the request to io_queue_iowq(). Make io_queue_iowq() fail requests in this case. Note that it also checks PF_KTHREAD, because the user can first close a DEFER_TASKRUN ring and shortly after kill the task, in which case ->iowq check would race. Cc: stable@vger.kernel.org Fixes: 50c52250e2d74 ("block: implement async io_uring discard cmd") Fixes: 773af69121ecc ("io_uring: always reissue from task_work context") Reported-by: Will Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/63312b4a2c2bb67ad67b857d17a300e1d3b078e8.1734637909.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 432b95ca9c85..d3403c8216db 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -514,7 +514,11 @@ static void io_queue_iowq(struct io_kiocb *req) struct io_uring_task *tctx = req->tctx; BUG_ON(!tctx); - BUG_ON(!tctx->io_wq); + + if ((current->flags & PF_KTHREAD) || !tctx->io_wq) { + io_req_task_queue_fail(req, -ECANCELED); + return; + } /* init ->work of the whole link before punting */ io_prep_async_link(req); -- 2.51.0 From e33ac68e5e21ec1292490dfe061e75c0dbdd3bd4 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 26 Dec 2024 16:49:23 +0000 Subject: [PATCH 07/16] io_uring/sqpoll: fix sqpoll error handling races BUG: KASAN: slab-use-after-free in __lock_acquire+0x370b/0x4a10 kernel/locking/lockdep.c:5089 Call Trace: ... _raw_spin_lock_irqsave+0x3d/0x60 kernel/locking/spinlock.c:162 class_raw_spinlock_irqsave_constructor include/linux/spinlock.h:551 [inline] try_to_wake_up+0xb5/0x23c0 kernel/sched/core.c:4205 io_sq_thread_park+0xac/0xe0 io_uring/sqpoll.c:55 io_sq_thread_finish+0x6b/0x310 io_uring/sqpoll.c:96 io_sq_offload_create+0x162/0x11d0 io_uring/sqpoll.c:497 io_uring_create io_uring/io_uring.c:3724 [inline] io_uring_setup+0x1728/0x3230 io_uring/io_uring.c:3806 ... 
Kun Hu reports that the SQPOLL creating error path has UAF, which happens if io_uring_alloc_task_context() fails and then io_sq_thread() manages to run and complete before the rest of error handling code, which means io_sq_thread_finish() is looking at already killed task. Note that this is mostly theoretical, requiring fault injection on the allocation side to trigger in practice. Cc: stable@vger.kernel.org Reported-by: Kun Hu Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/0f2f1aa5729332612bd01fe0f2f385fd1f06ce7c.1735231717.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/sqpoll.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/io_uring/sqpoll.c b/io_uring/sqpoll.c index 6df5e649c413..9e5bd79fd2b5 100644 --- a/io_uring/sqpoll.c +++ b/io_uring/sqpoll.c @@ -405,6 +405,7 @@ void io_sqpoll_wait_sq(struct io_ring_ctx *ctx) __cold int io_sq_offload_create(struct io_ring_ctx *ctx, struct io_uring_params *p) { + struct task_struct *task_to_put = NULL; int ret; /* Retain compatibility with failing for an invalid attach attempt */ @@ -480,6 +481,7 @@ __cold int io_sq_offload_create(struct io_ring_ctx *ctx, } sqd->thread = tsk; + task_to_put = get_task_struct(tsk); ret = io_uring_alloc_task_context(tsk, ctx); wake_up_new_task(tsk); if (ret) @@ -490,11 +492,15 @@ __cold int io_sq_offload_create(struct io_ring_ctx *ctx, goto err; } + if (task_to_put) + put_task_struct(task_to_put); return 0; err_sqpoll: complete(&ctx->sq_data->exited); err: io_sq_thread_finish(ctx); + if (task_to_put) + put_task_struct(task_to_put); return ret; } -- 2.51.0 From 38fc96a58ce40257aec79b32e9b310c86907c63c Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Sat, 28 Dec 2024 17:44:52 +0000 Subject: [PATCH 08/16] io_uring/rw: fix downgraded mshot read The io-wq path can downgrade a multishot request to oneshot mode, however io_read_mshot() doesn't handle that and would still post multiple CQEs. That's not allowed, because io_req_post_cqe() requires stricter context requirements. The described can only happen with pollable files that don't support FMODE_NOWAIT, which is an odd combination, so if even allowed it should be fairly rare. 
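From user space the downgrade is visible only as a completion without IORING_CQE_F_MORE set, after which the multishot read has to be re-armed. A hedged liburing sketch of the reaping side (the multishot read is assumed to have been prepped and submitted elsewhere):

#include <liburing.h>

/* Drain CQEs for a multishot request. A CQE without IORING_CQE_F_MORE
 * means the kernel stopped the multishot (for instance after an io-wq
 * downgrade) and the caller must submit a new request to continue. */
static int reap_multishot(struct io_uring *ring)
{
        struct io_uring_cqe *cqe;
        int more, ret;

        do {
                ret = io_uring_wait_cqe(ring, &cqe);
                if (ret < 0)
                        return ret;
                /* consume cqe->res bytes from the selected buffer here */
                more = cqe->flags & IORING_CQE_F_MORE;
                io_uring_cqe_seen(ring, cqe);
        } while (more);

        return 0;               /* multishot ended, re-arm if still wanted */
}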
Cc: stable@vger.kernel.org Reported-by: chase xd Fixes: bee1d5becdf5b ("io_uring: disable io-wq execution of multishot NOWAIT requests") Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/c5c8c4a50a882fd581257b81bf52eee260ac29fd.1735407848.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/rw.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/io_uring/rw.c b/io_uring/rw.c index 0bcb83e4ce3c..29bb3010f9c0 100644 --- a/io_uring/rw.c +++ b/io_uring/rw.c @@ -983,6 +983,8 @@ int io_read_mshot(struct io_kiocb *req, unsigned int issue_flags) io_kbuf_recycle(req, issue_flags); if (ret < 0) req_set_fail(req); + } else if (!(req->flags & REQ_F_APOLL_MULTISHOT)) { + cflags = io_put_kbuf(req, ret, issue_flags); } else { /* * Any successful return value will keep the multishot read -- 2.51.0 From a9c83a0ab66a5b02e914daed502fb8d3a8d3d619 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 30 Dec 2024 14:15:17 -0700 Subject: [PATCH 09/16] io_uring/timeout: flush timeouts outside of the timeout lock syzbot reports that a recent fix causes nesting issues between the (now) raw timeoutlock and the eventfd locking: ============================= [ BUG: Invalid wait context ] 6.13.0-rc4-00080-g9828a4c0901f #29 Not tainted ----------------------------- kworker/u32:0/68094 is trying to lock: ffff000014d7a520 (&ctx->wqh#2){..-.}-{3:3}, at: eventfd_signal_mask+0x64/0x180 other info that might help us debug this: context-{5:5} 6 locks held by kworker/u32:0/68094: #0: ffff0000c1d98148 ((wq_completion)iou_exit){+.+.}-{0:0}, at: process_one_work+0x4e8/0xfc0 #1: ffff80008d927c78 ((work_completion)(&ctx->exit_work)){+.+.}-{0:0}, at: process_one_work+0x53c/0xfc0 #2: ffff0000c59bc3d8 (&ctx->completion_lock){+.+.}-{3:3}, at: io_kill_timeouts+0x40/0x180 #3: ffff0000c59bc358 (&ctx->timeout_lock){-.-.}-{2:2}, at: io_kill_timeouts+0x48/0x180 #4: ffff800085127aa0 (rcu_read_lock){....}-{1:3}, at: rcu_lock_acquire+0x8/0x38 #5: ffff800085127aa0 (rcu_read_lock){....}-{1:3}, at: rcu_lock_acquire+0x8/0x38 stack backtrace: CPU: 7 UID: 0 PID: 68094 Comm: kworker/u32:0 Not tainted 6.13.0-rc4-00080-g9828a4c0901f #29 Hardware name: linux,dummy-virt (DT) Workqueue: iou_exit io_ring_exit_work Call trace: show_stack+0x1c/0x30 (C) __dump_stack+0x24/0x30 dump_stack_lvl+0x60/0x80 dump_stack+0x14/0x20 __lock_acquire+0x19f8/0x60c8 lock_acquire+0x1a4/0x540 _raw_spin_lock_irqsave+0x90/0xd0 eventfd_signal_mask+0x64/0x180 io_eventfd_signal+0x64/0x108 io_req_local_work_add+0x294/0x430 __io_req_task_work_add+0x1c0/0x270 io_kill_timeout+0x1f0/0x288 io_kill_timeouts+0xd4/0x180 io_uring_try_cancel_requests+0x2e8/0x388 io_ring_exit_work+0x150/0x550 process_one_work+0x5e8/0xfc0 worker_thread+0x7ec/0xc80 kthread+0x24c/0x300 ret_from_fork+0x10/0x20 because after the preempt-rt fix for the timeout lock nesting inside the io-wq lock, we now have the eventfd spinlock nesting inside the raw timeout spinlock. Rather than play whack-a-mole with other nesting on the timeout lock, split the deletion and killing of timeouts so queueing the task_work for the timeout cancelations can get done outside of the timeout lock. 
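The shape of that split is the generic collect-under-lock, complete-after-unlock pattern; a small stand-alone sketch in plain C (pthread locking, purely illustrative, not the kernel code):

#include <pthread.h>
#include <stddef.h>

struct entry {
        struct entry *next;
        void (*complete)(struct entry *);
};

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static struct entry *pending;

/* Detach the whole list while holding the lock, then run the completion
 * callbacks only after dropping it, so whatever those callbacks take can
 * never nest inside 'lock'. */
static void kill_pending(void)
{
        struct entry *local, *e;

        pthread_mutex_lock(&lock);
        local = pending;
        pending = NULL;
        pthread_mutex_unlock(&lock);

        while ((e = local) != NULL) {
                local = e->next;
                e->complete(e);
        }
}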
Reported-by: syzbot+b1fc199a40b65d601b65@syzkaller.appspotmail.com Fixes: 020b40f35624 ("io_uring: make ctx->timeout_lock a raw spinlock") Signed-off-by: Jens Axboe --- io_uring/timeout.c | 45 +++++++++++++++++++++++++++++++-------------- 1 file changed, 31 insertions(+), 14 deletions(-) diff --git a/io_uring/timeout.c b/io_uring/timeout.c index bbe58638eca7..362689b17ccc 100644 --- a/io_uring/timeout.c +++ b/io_uring/timeout.c @@ -85,7 +85,27 @@ static void io_timeout_complete(struct io_kiocb *req, struct io_tw_state *ts) io_req_task_complete(req, ts); } -static bool io_kill_timeout(struct io_kiocb *req, int status) +static __cold bool io_flush_killed_timeouts(struct list_head *list, int err) +{ + if (list_empty(list)) + return false; + + while (!list_empty(list)) { + struct io_timeout *timeout; + struct io_kiocb *req; + + timeout = list_first_entry(list, struct io_timeout, list); + list_del_init(&timeout->list); + req = cmd_to_io_kiocb(timeout); + if (err) + req_set_fail(req); + io_req_queue_tw_complete(req, err); + } + + return true; +} + +static void io_kill_timeout(struct io_kiocb *req, struct list_head *list) __must_hold(&req->ctx->timeout_lock) { struct io_timeout_data *io = req->async_data; @@ -93,21 +113,17 @@ static bool io_kill_timeout(struct io_kiocb *req, int status) if (hrtimer_try_to_cancel(&io->timer) != -1) { struct io_timeout *timeout = io_kiocb_to_cmd(req, struct io_timeout); - if (status) - req_set_fail(req); atomic_set(&req->ctx->cq_timeouts, atomic_read(&req->ctx->cq_timeouts) + 1); - list_del_init(&timeout->list); - io_req_queue_tw_complete(req, status); - return true; + list_move_tail(&timeout->list, list); } - return false; } __cold void io_flush_timeouts(struct io_ring_ctx *ctx) { - u32 seq; struct io_timeout *timeout, *tmp; + LIST_HEAD(list); + u32 seq; raw_spin_lock_irq(&ctx->timeout_lock); seq = ctx->cached_cq_tail - atomic_read(&ctx->cq_timeouts); @@ -131,10 +147,11 @@ __cold void io_flush_timeouts(struct io_ring_ctx *ctx) if (events_got < events_needed) break; - io_kill_timeout(req, 0); + io_kill_timeout(req, &list); } ctx->cq_last_tm_flush = seq; raw_spin_unlock_irq(&ctx->timeout_lock); + io_flush_killed_timeouts(&list, 0); } static void io_req_tw_fail_links(struct io_kiocb *link, struct io_tw_state *ts) @@ -661,7 +678,7 @@ __cold bool io_kill_timeouts(struct io_ring_ctx *ctx, struct io_uring_task *tctx bool cancel_all) { struct io_timeout *timeout, *tmp; - int canceled = 0; + LIST_HEAD(list); /* * completion_lock is needed for io_match_task(). 
Take it before @@ -672,11 +689,11 @@ __cold bool io_kill_timeouts(struct io_ring_ctx *ctx, struct io_uring_task *tctx list_for_each_entry_safe(timeout, tmp, &ctx->timeout_list, list) { struct io_kiocb *req = cmd_to_io_kiocb(timeout); - if (io_match_task(req, tctx, cancel_all) && - io_kill_timeout(req, -ECANCELED)) - canceled++; + if (io_match_task(req, tctx, cancel_all)) + io_kill_timeout(req, &list); } raw_spin_unlock_irq(&ctx->timeout_lock); spin_unlock(&ctx->completion_lock); - return canceled != 0; + + return io_flush_killed_timeouts(&list, -ECANCELED); } -- 2.51.0 From c6e60a0a68b7e6b3c7e33863a16e8e88ba9eee6f Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 2 Jan 2025 16:32:51 -0700 Subject: [PATCH 10/16] io_uring/net: always initialize kmsg->msg.msg_inq upfront syzbot reports that ->msg_inq may get used uinitialized from the following path: BUG: KMSAN: uninit-value in io_recv_buf_select io_uring/net.c:1094 [inline] BUG: KMSAN: uninit-value in io_recv+0x930/0x1f90 io_uring/net.c:1158 io_recv_buf_select io_uring/net.c:1094 [inline] io_recv+0x930/0x1f90 io_uring/net.c:1158 io_issue_sqe+0x420/0x2130 io_uring/io_uring.c:1740 io_queue_sqe io_uring/io_uring.c:1950 [inline] io_req_task_submit+0xfa/0x1d0 io_uring/io_uring.c:1374 io_handle_tw_list+0x55f/0x5c0 io_uring/io_uring.c:1057 tctx_task_work_run+0x109/0x3e0 io_uring/io_uring.c:1121 tctx_task_work+0x6d/0xc0 io_uring/io_uring.c:1139 task_work_run+0x268/0x310 kernel/task_work.c:239 io_run_task_work+0x43a/0x4a0 io_uring/io_uring.h:343 io_cqring_wait io_uring/io_uring.c:2527 [inline] __do_sys_io_uring_enter io_uring/io_uring.c:3439 [inline] __se_sys_io_uring_enter+0x204f/0x4ce0 io_uring/io_uring.c:3330 __x64_sys_io_uring_enter+0x11f/0x1a0 io_uring/io_uring.c:3330 x64_sys_call+0xce5/0x3c30 arch/x86/include/generated/asm/syscalls_64.h:427 do_syscall_x64 arch/x86/entry/common.c:52 [inline] do_syscall_64+0xcd/0x1e0 arch/x86/entry/common.c:83 entry_SYSCALL_64_after_hwframe+0x77/0x7f and it is correct, as it's never initialized upfront. Hence the first submission can end up using it uninitialized, if the recv wasn't successful and the networking stack didn't honor ->msg_get_inq being set and filling in the output value of ->msg_inq as requested. Set it to 0 upfront when it's allocated, just to silence this KMSAN warning. There's no side effect of using it uninitialized, it'll just potentially cause the next receive to use a recv value hint that's not accurate. Fixes: c6f32c7d9e09 ("io_uring/net: get rid of ->prep_async() for receive side") Reported-by: syzbot+068ff190354d2f74892f@syzkaller.appspotmail.com Signed-off-by: Jens Axboe --- io_uring/net.c | 1 + 1 file changed, 1 insertion(+) diff --git a/io_uring/net.c b/io_uring/net.c index df1f7dc6f1c8..c6cd38cc5dc4 100644 --- a/io_uring/net.c +++ b/io_uring/net.c @@ -754,6 +754,7 @@ static int io_recvmsg_prep_setup(struct io_kiocb *req) if (req->opcode == IORING_OP_RECV) { kmsg->msg.msg_name = NULL; kmsg->msg.msg_namelen = 0; + kmsg->msg.msg_inq = 0; kmsg->msg.msg_control = NULL; kmsg->msg.msg_get_inq = 1; kmsg->msg.msg_controllen = 0; -- 2.51.0 From ed123c948d06688d10f3b10a7bce1d6fbfd1ed07 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 3 Jan 2025 09:29:09 -0700 Subject: [PATCH 11/16] io_uring/kbuf: use pre-committed buffer address for non-pollable file For non-pollable files, buffer ring consumption will commit upfront. This is fine, but io_ring_buffer_select() will return the address of the buffer after having committed it. 
For incrementally consumed buffers, this is incorrect as it will modify the buffer address. Store the pre-committed value and return that. If that isn't done, then the initial part of the buffer is not used and the application will correctly assume the content arrived at the start of the userspace buffer, but the kernel will have put it later in the buffer. Or it can cause a spurious -EFAULT returned in the CQE, depending on the buffer size. As bounds are suitably checked for doing the actual IO, no adverse side effects are possible - it's just a data misplacement within the existing buffer. Reported-by: Gwendal Fernet Cc: stable@vger.kernel.org Fixes: ae98dbf43d75 ("io_uring/kbuf: add support for incremental buffer consumption") Signed-off-by: Jens Axboe --- io_uring/kbuf.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/io_uring/kbuf.c b/io_uring/kbuf.c index d407576ddfb7..eec5eb7de843 100644 --- a/io_uring/kbuf.c +++ b/io_uring/kbuf.c @@ -139,6 +139,7 @@ static void __user *io_ring_buffer_select(struct io_kiocb *req, size_t *len, struct io_uring_buf_ring *br = bl->buf_ring; __u16 tail, head = bl->head; struct io_uring_buf *buf; + void __user *ret; tail = smp_load_acquire(&br->tail); if (unlikely(tail == head)) @@ -153,6 +154,7 @@ static void __user *io_ring_buffer_select(struct io_kiocb *req, size_t *len, req->flags |= REQ_F_BUFFER_RING | REQ_F_BUFFERS_COMMIT; req->buf_list = bl; req->buf_index = buf->bid; + ret = u64_to_user_ptr(buf->addr); if (issue_flags & IO_URING_F_UNLOCKED || !io_file_can_poll(req)) { /* @@ -168,7 +170,7 @@ static void __user *io_ring_buffer_select(struct io_kiocb *req, size_t *len, io_kbuf_commit(req, bl, *len, 1); req->buf_list = NULL; } - return u64_to_user_ptr(buf->addr); + return ret; } void __user *io_buffer_select(struct io_kiocb *req, size_t *len, -- 2.51.0 From c83c846231db8b153bfcb44d552d373c34f78245 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Sat, 4 Jan 2025 18:29:02 +0000 Subject: [PATCH 12/16] io_uring/timeout: fix multishot updates After update only the first shot of a multishot timeout request adheres to the new timeout value while all subsequent retries continue to use the old value. Don't forget to update the timeout stored in struct io_timeout_data. 
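A liburing usage sketch of the case being fixed (error handling elided; the helpers are standard liburing, but check that the installed version and kernel support IORING_TIMEOUT_MULTISHOT):

#include <liburing.h>

/* Arm a 1s multishot timeout, then shorten it to 100ms with a timeout
 * update. With this fix every later expiration uses the 100ms period,
 * not only the first one after the update. */
static int update_multishot_timeout(struct io_uring *ring)
{
        struct __kernel_timespec period  = { .tv_sec = 1 };
        struct __kernel_timespec shorter = { .tv_nsec = 100 * 1000 * 1000 };
        struct io_uring_sqe *sqe;

        sqe = io_uring_get_sqe(ring);
        io_uring_prep_timeout(sqe, &period, 0, IORING_TIMEOUT_MULTISHOT);
        sqe->user_data = 0x1234;

        sqe = io_uring_get_sqe(ring);
        io_uring_prep_timeout_update(sqe, &shorter, 0x1234, 0);

        return io_uring_submit(ring);
}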
Cc: stable@vger.kernel.org Fixes: ea97f6c8558e8 ("io_uring: add support for multishot timeouts") Reported-by: Christian Mazakas Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/e6516c3304eb654ec234cfa65c88a9579861e597.1736015288.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/timeout.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/io_uring/timeout.c b/io_uring/timeout.c index 362689b17ccc..e9cec9e4dc2f 100644 --- a/io_uring/timeout.c +++ b/io_uring/timeout.c @@ -427,10 +427,12 @@ static int io_timeout_update(struct io_ring_ctx *ctx, __u64 user_data, timeout->off = 0; /* noseq */ data = req->async_data; + data->ts = *ts; + list_add_tail(&timeout->list, &ctx->timeout_list); hrtimer_init(&data->timer, io_timeout_get_clock(data), mode); data->timer.function = io_timeout_fn; - hrtimer_start(&data->timer, timespec64_to_ktime(*ts), mode); + hrtimer_start(&data->timer, timespec64_to_ktime(data->ts), mode); return 0; } -- 2.51.0 From 60495b08cf7a6920035c5172a22655ca2001270b Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Tue, 7 Jan 2025 14:11:32 +0000 Subject: [PATCH 13/16] io_uring: silence false positive warnings If we kill a ring and then immediately exit the task, we'll get cancellattion running by the task and a kthread in io_ring_exit_work. For DEFER_TASKRUN, we do want to limit it to only one entity executing it, however it's currently not an issue as it's protected by uring_lock. Silence lockdep assertions for now, we'll return to it later. Reported-by: syzbot+1bcb75613069ad4957fc@syzkaller.appspotmail.com Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/7e5f68281acb0f081f65fde435833c68a3b7e02f.1736257837.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/io_uring.h | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h index 12abee607e4a..492cbbf2c23b 100644 --- a/io_uring/io_uring.h +++ b/io_uring/io_uring.h @@ -125,6 +125,9 @@ static inline void io_lockdep_assert_cq_locked(struct io_ring_ctx *ctx) #if defined(CONFIG_PROVE_LOCKING) lockdep_assert(in_task()); + if (ctx->flags & IORING_SETUP_DEFER_TASKRUN) + lockdep_assert_held(&ctx->uring_lock); + if (ctx->flags & IORING_SETUP_IOPOLL) { lockdep_assert_held(&ctx->uring_lock); } else if (!ctx->task_complete) { @@ -136,9 +139,7 @@ static inline void io_lockdep_assert_cq_locked(struct io_ring_ctx *ctx) * Not from an SQE, as those cannot be submitted, but via * updating tagged resources. */ - if (percpu_ref_is_dying(&ctx->refs)) - lockdep_assert(current_work()); - else + if (!percpu_ref_is_dying(&ctx->refs)) lockdep_assert(current == ctx->submitter_task); } #endif -- 2.51.0 From c9a40292a44e78f71258b8522655bffaf5753bdb Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 8 Jan 2025 10:28:05 -0700 Subject: [PATCH 14/16] io_uring/eventfd: ensure io_eventfd_signal() defers another RCU period io_eventfd_do_signal() is invoked from an RCU callback, but when dropping the reference to the io_ev_fd, it calls io_eventfd_free() directly if the refcount drops to zero. This isn't correct, as any potential freeing of the io_ev_fd should be deferred another RCU grace period. Just call io_eventfd_put() rather than open-code the dec-and-test and free, which will correctly defer it another RCU grace period. 
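For context, the user-facing feature behind this path is eventfd-based completion notification; a minimal liburing sketch of that registration (illustrative, error paths shortened):

#include <liburing.h>
#include <sys/eventfd.h>
#include <unistd.h>
#include <stdint.h>

/* Register an eventfd so the kernel signals it when CQEs are posted.
 * The RCU lifetime handling fixed here is internal to this registration
 * and invisible to the application. */
static int wait_via_eventfd(struct io_uring *ring)
{
        uint64_t count;
        int efd = eventfd(0, EFD_CLOEXEC);

        if (efd < 0)
                return -1;
        if (io_uring_register_eventfd(ring, efd) < 0) {
                close(efd);
                return -1;
        }
        /* blocks until io_uring signals at least one completion */
        if (read(efd, &count, sizeof(count)) != sizeof(count))
                return -1;
        return (int)count;
}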
Fixes: 21a091b970cd ("io_uring: signal registered eventfd to process deferred task work") Reported-by: Jann Horn Cc: stable@vger.kernel.org Tested-by: Li Zetao Reviewed-by: Li Zetao Reviewed-by: Prasanna Kumar T S M Signed-off-by: Jens Axboe --- io_uring/eventfd.c | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/io_uring/eventfd.c b/io_uring/eventfd.c index fab936d31ba8..100d5da94cb9 100644 --- a/io_uring/eventfd.c +++ b/io_uring/eventfd.c @@ -33,20 +33,18 @@ static void io_eventfd_free(struct rcu_head *rcu) kfree(ev_fd); } -static void io_eventfd_do_signal(struct rcu_head *rcu) +static void io_eventfd_put(struct io_ev_fd *ev_fd) { - struct io_ev_fd *ev_fd = container_of(rcu, struct io_ev_fd, rcu); - - eventfd_signal_mask(ev_fd->cq_ev_fd, EPOLL_URING_WAKE); - if (refcount_dec_and_test(&ev_fd->refs)) - io_eventfd_free(rcu); + call_rcu(&ev_fd->rcu, io_eventfd_free); } -static void io_eventfd_put(struct io_ev_fd *ev_fd) +static void io_eventfd_do_signal(struct rcu_head *rcu) { - if (refcount_dec_and_test(&ev_fd->refs)) - call_rcu(&ev_fd->rcu, io_eventfd_free); + struct io_ev_fd *ev_fd = container_of(rcu, struct io_ev_fd, rcu); + + eventfd_signal_mask(ev_fd->cq_ev_fd, EPOLL_URING_WAKE); + io_eventfd_put(ev_fd); } static void io_eventfd_release(struct io_ev_fd *ev_fd, bool put_ref) -- 2.51.0 From 4b7cfa8b6c28a9fa22b86894166a1a34f6d630ba Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Fri, 10 Jan 2025 14:31:23 +0000 Subject: [PATCH 15/16] io_uring/sqpoll: zero sqd->thread on tctx errors Syzkeller reports: BUG: KASAN: slab-use-after-free in thread_group_cputime+0x409/0x700 kernel/sched/cputime.c:341 Read of size 8 at addr ffff88803578c510 by task syz.2.3223/27552 Call Trace: ... kasan_report+0x143/0x180 mm/kasan/report.c:602 thread_group_cputime+0x409/0x700 kernel/sched/cputime.c:341 thread_group_cputime_adjusted+0xa6/0x340 kernel/sched/cputime.c:639 getrusage+0x1000/0x1340 kernel/sys.c:1863 io_uring_show_fdinfo+0xdfe/0x1770 io_uring/fdinfo.c:197 seq_show+0x608/0x770 fs/proc/fd.c:68 ... That's due to sqd->task not being cleared properly in cases where SQPOLL task tctx setup fails, which can essentially only happen with fault injection to insert allocation errors. Cc: stable@vger.kernel.org Fixes: 1251d2025c3e1 ("io_uring/sqpoll: early exit thread if task_context wasn't allocated") Reported-by: syzbot+3d92cfcfa84070b0a470@syzkaller.appspotmail.com Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/efc7ec7010784463b2e7466d7b5c02c2cb381635.1736519461.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/sqpoll.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/io_uring/sqpoll.c b/io_uring/sqpoll.c index 9e5bd79fd2b5..8961a3c1e73c 100644 --- a/io_uring/sqpoll.c +++ b/io_uring/sqpoll.c @@ -268,8 +268,12 @@ static int io_sq_thread(void *data) DEFINE_WAIT(wait); /* offload context creation failed, just exit */ - if (!current->io_uring) + if (!current->io_uring) { + mutex_lock(&sqd->lock); + sqd->thread = NULL; + mutex_unlock(&sqd->lock); goto err_out; + } snprintf(buf, sizeof(buf), "iou-sqp-%d", sqd->task_pid); set_task_comm(current, buf); -- 2.51.0 From bd2703b42decebdcddf76e277ba76b4c4a142d73 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Fri, 10 Jan 2025 20:36:45 +0000 Subject: [PATCH 16/16] io_uring: don't touch sqd->thread off tw add With IORING_SETUP_SQPOLL all requests are created by the SQPOLL task, which means that req->task should always match sqd->thread. 
Since accesses to sqd->thread should be separately protected, use req->task in io_req_normal_work_add() instead. Note, in the eyes of io_req_normal_work_add(), the SQPOLL task struct is always pinned and alive, and sqd->thread can either be the task or NULL. It's only problematic if the compiler decides to reload the value after the null check, which is not so likely. Cc: stable@vger.kernel.org Cc: Bui Quang Minh Reported-by: lizetao Fixes: 78f9b61bd8e54 ("io_uring: wake SQPOLL task when task_work is added to an empty queue") Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/1cbbe72cf32c45a8fee96026463024cd8564a7d7.1736541357.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index d3403c8216db..5eb119002099 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -1226,10 +1226,7 @@ static void io_req_normal_work_add(struct io_kiocb *req) /* SQPOLL doesn't need the task_work added, it'll run it itself */ if (ctx->flags & IORING_SETUP_SQPOLL) { - struct io_sq_data *sqd = ctx->sq_data; - - if (sqd->thread) - __set_notify_signal(sqd->thread); + __set_notify_signal(tctx->task); return; } -- 2.51.0
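For reference, the setup the last patch concerns is an SQPOLL ring, where a kernel thread owns submission. A minimal liburing sketch (values illustrative):

#include <liburing.h>

/* With IORING_SETUP_SQPOLL a kernel thread issues the SQEs, so every
 * request's task is that thread; the patch above relies on exactly that
 * to avoid dereferencing sqd->thread from the task_work add path. */
static int setup_sqpoll_ring(struct io_uring *ring)
{
        struct io_uring_params p = { };

        p.flags = IORING_SETUP_SQPOLL;
        p.sq_thread_idle = 1000;        /* ms of idle before the SQ thread sleeps */
        return io_uring_queue_init_params(8, ring, &p);
}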