From 475a8d30371604a6363da8e304a608a5959afc40 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Tue, 13 May 2025 18:26:46 +0100 Subject: [PATCH 01/16] io_uring/kbuf: account ring io_buffer_list memory Follow the non-ringed pbuf struct io_buffer_list allocations and account it against the memcg. There is low chance of that being an actual problem as ring provided buffer should either pin user memory or allocate it, which is already accounted. Cc: stable@vger.kernel.org # 6.1 Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/3985218b50d341273cafff7234e1a7e6d0db9808.1747150490.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/kbuf.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/io_uring/kbuf.c b/io_uring/kbuf.c index 1cf0d2c01287..446207db1edf 100644 --- a/io_uring/kbuf.c +++ b/io_uring/kbuf.c @@ -621,7 +621,7 @@ int io_register_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg) io_destroy_bl(ctx, bl); } - free_bl = bl = kzalloc(sizeof(*bl), GFP_KERNEL); + free_bl = bl = kzalloc(sizeof(*bl), GFP_KERNEL_ACCOUNT); if (!bl) return -ENOMEM; -- 2.51.0 From 1724849072854a66861d461b298b04612702d685 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Tue, 13 May 2025 18:26:47 +0100 Subject: [PATCH 02/16] io_uring/kbuf: use mem_is_zero() Make use of mem_is_zero() for reserved fields checking. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/11fe27b7a831329bcdb4ea087317ef123ba7c171.1747150490.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/kbuf.c | 14 ++++---------- 1 file changed, 4 insertions(+), 10 deletions(-) diff --git a/io_uring/kbuf.c b/io_uring/kbuf.c index 446207db1edf..344517d1d921 100644 --- a/io_uring/kbuf.c +++ b/io_uring/kbuf.c @@ -602,8 +602,7 @@ int io_register_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg) if (copy_from_user(®, arg, sizeof(reg))) return -EFAULT; - - if (reg.resv[0] || reg.resv[1] || reg.resv[2]) + if (!mem_is_zero(reg.resv, sizeof(reg.resv))) return -EINVAL; if (reg.flags & ~(IOU_PBUF_RING_MMAP | IOU_PBUF_RING_INC)) return -EINVAL; @@ -679,9 +678,7 @@ int io_unregister_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg) if (copy_from_user(®, arg, sizeof(reg))) return -EFAULT; - if (reg.resv[0] || reg.resv[1] || reg.resv[2]) - return -EINVAL; - if (reg.flags) + if (!mem_is_zero(reg.resv, sizeof(reg.resv)) || reg.flags) return -EINVAL; bl = io_buffer_get_list(ctx, reg.bgid); @@ -701,14 +698,11 @@ int io_register_pbuf_status(struct io_ring_ctx *ctx, void __user *arg) { struct io_uring_buf_status buf_status; struct io_buffer_list *bl; - int i; if (copy_from_user(&buf_status, arg, sizeof(buf_status))) return -EFAULT; - - for (i = 0; i < ARRAY_SIZE(buf_status.resv); i++) - if (buf_status.resv[i]) - return -EINVAL; + if (!mem_is_zero(buf_status.resv, sizeof(buf_status.resv))) + return -EINVAL; bl = io_buffer_get_list(ctx, buf_status.buf_group); if (!bl) -- 2.51.0 From 4e9fda29d66b06caf5c81b8acbe0a504effc73fb Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Tue, 13 May 2025 18:26:48 +0100 Subject: [PATCH 03/16] io_uring/kbuf: drop extra vars in io_register_pbuf_ring bl and free_bl variables in io_register_pbuf_ring() always point to the same list since we started to reallocate the pre-existent list. Drop free_bl. 
Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/d45c3342d74c9030f99376c777a4b3d59089074d.1747150490.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/kbuf.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/io_uring/kbuf.c b/io_uring/kbuf.c index 344517d1d921..406e8a9b42c3 100644 --- a/io_uring/kbuf.c +++ b/io_uring/kbuf.c @@ -591,7 +591,7 @@ err: int io_register_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg) { struct io_uring_buf_reg reg; - struct io_buffer_list *bl, *free_bl = NULL; + struct io_buffer_list *bl; struct io_uring_region_desc rd; struct io_uring_buf_ring *br; unsigned long mmap_offset; @@ -620,7 +620,7 @@ int io_register_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg) io_destroy_bl(ctx, bl); } - free_bl = bl = kzalloc(sizeof(*bl), GFP_KERNEL_ACCOUNT); + bl = kzalloc(sizeof(*bl), GFP_KERNEL_ACCOUNT); if (!bl) return -ENOMEM; @@ -665,7 +665,7 @@ int io_register_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg) return 0; fail: io_free_region(ctx, &bl->region); - kfree(free_bl); + kfree(bl); return ret; } -- 2.51.0 From 52a05d0cf8f3b4569c525153132a90661c32fe11 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Tue, 13 May 2025 18:26:49 +0100 Subject: [PATCH 04/16] io_uring/kbuf: don't compute size twice on prep The size in prep is calculated by io_provide_buffers_prep(), so remove the recomputation a few lines after. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/7c97206561b74fce245cb22449c6082d2e066844.1747150490.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/kbuf.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/io_uring/kbuf.c b/io_uring/kbuf.c index 406e8a9b42c3..eb666c02f488 100644 --- a/io_uring/kbuf.c +++ b/io_uring/kbuf.c @@ -509,8 +509,6 @@ int io_provide_buffers_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe return -EOVERFLOW; if (check_add_overflow((unsigned long)p->addr, size, &tmp_check)) return -EOVERFLOW; - - size = (unsigned long)p->len * p->nbufs; if (!access_ok(u64_to_user_ptr(p->addr), size)) return -EFAULT; -- 2.51.0 From c724e801239ffc3714afe65cf6e721ddd04199d0 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Tue, 13 May 2025 18:26:50 +0100 Subject: [PATCH 05/16] io_uring/kbuf: refactor __io_remove_buffers __io_remove_buffers used for two purposes, the first is removing buffers for non ring based lists, which implies that it can be called multiple times for the same list. And the second is for destroying lists, which is not perfectly reentrable for ring based lists. It's confusing, so just have a helper for the legacy pbuf buffer removal, make sure it's not called for ring pbuf, and open code all ring pbuf destruction into io_put_bl(). 
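For reference, the "ring based lists" above are the IOBL_BUF_RING case: userspace registers a shared buffer ring (IORING_REGISTER_PBUF_RING), and with this patch its teardown lives entirely in io_put_bl(). A minimal userspace sketch with liburing, illustrative only and not part of the patch, assuming liburing >= 2.4 for the setup/free helpers:

#include <liburing.h>
#include <stdlib.h>

#define NBUFS   8
#define BUFSIZE 4096
#define BGID    0

int main(void)
{
        struct io_uring ring;
        struct io_uring_buf_ring *br;
        char *mem;
        int i, ret;

        if (io_uring_queue_init(8, &ring, 0))
                return 1;

        /* kernel allocates and maps the buffer ring: the IOBL_BUF_RING case */
        br = io_uring_setup_buf_ring(&ring, NBUFS, BGID, 0, &ret);
        mem = malloc(NBUFS * BUFSIZE);
        if (!br || !mem)
                return 1;

        /* publish NBUFS buffers to the shared ring */
        for (i = 0; i < NBUFS; i++)
                io_uring_buf_ring_add(br, mem + i * BUFSIZE, BUFSIZE, i,
                                      io_uring_buf_ring_mask(NBUFS), i);
        io_uring_buf_ring_advance(br, NBUFS);

        /* ... issue IOSQE_BUFFER_SELECT requests against group BGID ... */

        /* unregister: kernel side frees the region and drops the list */
        io_uring_free_buf_ring(&ring, br, NBUFS, BGID);
        io_uring_queue_exit(&ring);
        free(mem);
        return 0;
}
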
Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/0ae416b099d311ad23f285cea02f2c94c8ae9a6c.1747150490.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/kbuf.c | 38 +++++++++++++------------------------- 1 file changed, 13 insertions(+), 25 deletions(-) diff --git a/io_uring/kbuf.c b/io_uring/kbuf.c index eb666c02f488..df8aeb42e910 100644 --- a/io_uring/kbuf.c +++ b/io_uring/kbuf.c @@ -376,45 +376,33 @@ unsigned int __io_put_kbufs(struct io_kiocb *req, int len, int nbufs) return ret; } -static int __io_remove_buffers(struct io_ring_ctx *ctx, - struct io_buffer_list *bl, unsigned nbufs) +static int io_remove_buffers_legacy(struct io_ring_ctx *ctx, + struct io_buffer_list *bl, + unsigned long nbufs) { - unsigned i = 0; - - /* shouldn't happen */ - if (!nbufs) - return 0; - - if (bl->flags & IOBL_BUF_RING) { - i = bl->buf_ring->tail - bl->head; - io_free_region(ctx, &bl->region); - /* make sure it's seen as empty */ - INIT_LIST_HEAD(&bl->buf_list); - bl->flags &= ~IOBL_BUF_RING; - return i; - } + unsigned long i = 0; + struct io_buffer *nxt; /* protects io_buffers_cache */ lockdep_assert_held(&ctx->uring_lock); + WARN_ON_ONCE(bl->flags & IOBL_BUF_RING); - while (!list_empty(&bl->buf_list)) { - struct io_buffer *nxt; - + for (i = 0; i < nbufs && !list_empty(&bl->buf_list); i++) { nxt = list_first_entry(&bl->buf_list, struct io_buffer, list); list_del(&nxt->list); kfree(nxt); - - if (++i == nbufs) - return i; cond_resched(); } - return i; } static void io_put_bl(struct io_ring_ctx *ctx, struct io_buffer_list *bl) { - __io_remove_buffers(ctx, bl, -1U); + if (bl->flags & IOBL_BUF_RING) + io_free_region(ctx, &bl->region); + else + io_remove_buffers_legacy(ctx, bl, -1U); + kfree(bl); } @@ -477,7 +465,7 @@ int io_remove_buffers(struct io_kiocb *req, unsigned int issue_flags) ret = -EINVAL; /* can't use provide/remove buffers command on mapped buffers */ if (!(bl->flags & IOBL_BUF_RING)) - ret = __io_remove_buffers(ctx, bl, p->nbufs); + ret = io_remove_buffers_legacy(ctx, bl, p->nbufs); } io_ring_submit_unlock(ctx, issue_flags); if (ret < 0) -- 2.51.0 From 2b61bb1d9aa601ec393054a61be0a707a5bea928 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Tue, 13 May 2025 18:26:51 +0100 Subject: [PATCH 06/16] io_uring/kbuf: unify legacy buf provision and removal Combine IORING_OP_PROVIDE_BUFFERS and IORING_OP_REMOVE_BUFFERS ->issue(), so that we can deduplicate ring locking and list lookups. This way we further reduce code for legacy provided buffers. Locking is also separated from buffer related handling, which makes it a bit simpler with label jumps. 
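Userspace-visible behaviour of the two opcodes does not change; only the kernel-side plumbing is merged. A small liburing sketch of the provide/remove pair that io_manage_buffers_legacy() now serves, illustrative only and with error handling trimmed:

#include <liburing.h>
#include <stdlib.h>

int main(void)
{
        struct io_uring ring;
        struct io_uring_sqe *sqe;
        struct io_uring_cqe *cqe;
        char *bufs = malloc(4 * 4096);
        int ret = 1;

        if (io_uring_queue_init(8, &ring, 0) || !bufs)
                return 1;

        /* IORING_OP_PROVIDE_BUFFERS: 4 buffers of 4096 bytes, group 0, ids 0..3 */
        sqe = io_uring_get_sqe(&ring);
        io_uring_prep_provide_buffers(sqe, bufs, 4096, 4, 0, 0);
        io_uring_submit(&ring);
        if (!io_uring_wait_cqe(&ring, &cqe)) {
                ret = cqe->res < 0;     /* negative res means the provide failed */
                io_uring_cqe_seen(&ring, cqe);
        }

        /* IORING_OP_REMOVE_BUFFERS: take up to 4 buffers back out of group 0 */
        sqe = io_uring_get_sqe(&ring);
        io_uring_prep_remove_buffers(sqe, 4, 0);
        io_uring_submit(&ring);
        if (!io_uring_wait_cqe(&ring, &cqe))
                io_uring_cqe_seen(&ring, cqe);

        io_uring_queue_exit(&ring);
        free(bufs);
        return ret;
}
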
Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/f61af131622ad4337c2fb9f7c453d5b0102c7b90.1747150490.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/kbuf.c | 73 +++++++++++++++++++----------------------------- io_uring/kbuf.h | 4 +-- io_uring/opdef.c | 4 +-- 3 files changed, 31 insertions(+), 50 deletions(-) diff --git a/io_uring/kbuf.c b/io_uring/kbuf.c index df8aeb42e910..823e7eb15fb2 100644 --- a/io_uring/kbuf.c +++ b/io_uring/kbuf.c @@ -450,30 +450,6 @@ int io_remove_buffers_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) return 0; } -int io_remove_buffers(struct io_kiocb *req, unsigned int issue_flags) -{ - struct io_provide_buf *p = io_kiocb_to_cmd(req, struct io_provide_buf); - struct io_ring_ctx *ctx = req->ctx; - struct io_buffer_list *bl; - int ret = 0; - - io_ring_submit_lock(ctx, issue_flags); - - ret = -ENOENT; - bl = io_buffer_get_list(ctx, p->bgid); - if (bl) { - ret = -EINVAL; - /* can't use provide/remove buffers command on mapped buffers */ - if (!(bl->flags & IOBL_BUF_RING)) - ret = io_remove_buffers_legacy(ctx, bl, p->nbufs); - } - io_ring_submit_unlock(ctx, issue_flags); - if (ret < 0) - req_set_fail(req); - io_req_set_res(req, ret, 0); - return IOU_OK; -} - int io_provide_buffers_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { unsigned long size, tmp_check; @@ -535,37 +511,44 @@ static int io_add_buffers(struct io_ring_ctx *ctx, struct io_provide_buf *pbuf, return i ? 0 : -ENOMEM; } -int io_provide_buffers(struct io_kiocb *req, unsigned int issue_flags) +static int __io_manage_buffers_legacy(struct io_kiocb *req, + struct io_buffer_list *bl) { struct io_provide_buf *p = io_kiocb_to_cmd(req, struct io_provide_buf); - struct io_ring_ctx *ctx = req->ctx; - struct io_buffer_list *bl; - int ret = 0; - - io_ring_submit_lock(ctx, issue_flags); + int ret; - bl = io_buffer_get_list(ctx, p->bgid); - if (unlikely(!bl)) { + if (!bl) { + if (req->opcode != IORING_OP_PROVIDE_BUFFERS) + return -ENOENT; bl = kzalloc(sizeof(*bl), GFP_KERNEL_ACCOUNT); - if (!bl) { - ret = -ENOMEM; - goto err; - } + if (!bl) + return -ENOMEM; + INIT_LIST_HEAD(&bl->buf_list); - ret = io_buffer_add_list(ctx, bl, p->bgid); + ret = io_buffer_add_list(req->ctx, bl, p->bgid); if (ret) { kfree(bl); - goto err; + return ret; } } - /* can't add buffers via this command for a mapped buffer ring */ - if (bl->flags & IOBL_BUF_RING) { - ret = -EINVAL; - goto err; - } + /* can't use provide/remove buffers command on mapped buffers */ + if (bl->flags & IOBL_BUF_RING) + return -EINVAL; + if (req->opcode == IORING_OP_PROVIDE_BUFFERS) + return io_add_buffers(req->ctx, p, bl); + return io_remove_buffers_legacy(req->ctx, bl, p->nbufs); +} + +int io_manage_buffers_legacy(struct io_kiocb *req, unsigned int issue_flags) +{ + struct io_provide_buf *p = io_kiocb_to_cmd(req, struct io_provide_buf); + struct io_ring_ctx *ctx = req->ctx; + struct io_buffer_list *bl; + int ret; - ret = io_add_buffers(ctx, p, bl); -err: + io_ring_submit_lock(ctx, issue_flags); + bl = io_buffer_get_list(ctx, p->bgid); + ret = __io_manage_buffers_legacy(req, bl); io_ring_submit_unlock(ctx, issue_flags); if (ret < 0) diff --git a/io_uring/kbuf.h b/io_uring/kbuf.h index 0798a732e6cb..4d2c209d1a41 100644 --- a/io_uring/kbuf.h +++ b/io_uring/kbuf.h @@ -66,10 +66,8 @@ int io_buffers_peek(struct io_kiocb *req, struct buf_sel_arg *arg); void io_destroy_buffers(struct io_ring_ctx *ctx); int io_remove_buffers_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); -int io_remove_buffers(struct io_kiocb 
*req, unsigned int issue_flags); - int io_provide_buffers_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); -int io_provide_buffers(struct io_kiocb *req, unsigned int issue_flags); +int io_manage_buffers_legacy(struct io_kiocb *req, unsigned int issue_flags); int io_register_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg); int io_unregister_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg); diff --git a/io_uring/opdef.c b/io_uring/opdef.c index db36433c2294..6e0882b051f9 100644 --- a/io_uring/opdef.c +++ b/io_uring/opdef.c @@ -333,13 +333,13 @@ const struct io_issue_def io_issue_defs[] = { .audit_skip = 1, .iopoll = 1, .prep = io_provide_buffers_prep, - .issue = io_provide_buffers, + .issue = io_manage_buffers_legacy, }, [IORING_OP_REMOVE_BUFFERS] = { .audit_skip = 1, .iopoll = 1, .prep = io_remove_buffers_prep, - .issue = io_remove_buffers, + .issue = io_manage_buffers_legacy, }, [IORING_OP_TEE] = { .needs_file = 1, -- 2.51.0 From d871198ee431d90f5308d53998c1ba1d5db5619a Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 13 May 2025 15:02:23 -0600 Subject: [PATCH 07/16] io_uring/fdinfo: grab ctx->uring_lock around io_uring_show_fdinfo() Not everything requires locking in there, which is why the 'has_lock' variable exists. But enough does that it's a bit unwieldy to manage. Wrap the whole thing in a ->uring_lock trylock, and just return with no output if we fail to grab it. The existing trylock() will already have greatly diminished utility/output for the failure case. This fixes an issue with reading the SQE fields, if the ring is being actively resized at the same time. Reported-by: Jann Horn Fixes: 79cfe9e59c2a ("io_uring/register: add IORING_REGISTER_RESIZE_RINGS") Signed-off-by: Jens Axboe --- io_uring/fdinfo.c | 48 ++++++++++++++++++++++++----------------------- 1 file changed, 25 insertions(+), 23 deletions(-) diff --git a/io_uring/fdinfo.c b/io_uring/fdinfo.c index 9414ca6d101c..e0d6a59a89fa 100644 --- a/io_uring/fdinfo.c +++ b/io_uring/fdinfo.c @@ -86,13 +86,8 @@ static inline void napi_show_fdinfo(struct io_ring_ctx *ctx, } #endif -/* - * Caller holds a reference to the file already, we don't need to do - * anything else to get an extra reference. - */ -__cold void io_uring_show_fdinfo(struct seq_file *m, struct file *file) +static void __io_uring_show_fdinfo(struct io_ring_ctx *ctx, struct seq_file *m) { - struct io_ring_ctx *ctx = file->private_data; struct io_overflow_cqe *ocqe; struct io_rings *r = ctx->rings; struct rusage sq_usage; @@ -106,7 +101,6 @@ __cold void io_uring_show_fdinfo(struct seq_file *m, struct file *file) unsigned int sq_entries, cq_entries; int sq_pid = -1, sq_cpu = -1; u64 sq_total_time = 0, sq_work_time = 0; - bool has_lock; unsigned int i; if (ctx->flags & IORING_SETUP_CQE32) @@ -176,15 +170,7 @@ __cold void io_uring_show_fdinfo(struct seq_file *m, struct file *file) seq_printf(m, "\n"); } - /* - * Avoid ABBA deadlock between the seq lock and the io_uring mutex, - * since fdinfo case grabs it in the opposite direction of normal use - * cases. If we fail to get the lock, we just don't iterate any - * structures that could be going away outside the io_uring mutex. 
- */ - has_lock = mutex_trylock(&ctx->uring_lock); - - if (has_lock && (ctx->flags & IORING_SETUP_SQPOLL)) { + if (ctx->flags & IORING_SETUP_SQPOLL) { struct io_sq_data *sq = ctx->sq_data; /* @@ -206,7 +192,7 @@ __cold void io_uring_show_fdinfo(struct seq_file *m, struct file *file) seq_printf(m, "SqTotalTime:\t%llu\n", sq_total_time); seq_printf(m, "SqWorkTime:\t%llu\n", sq_work_time); seq_printf(m, "UserFiles:\t%u\n", ctx->file_table.data.nr); - for (i = 0; has_lock && i < ctx->file_table.data.nr; i++) { + for (i = 0; i < ctx->file_table.data.nr; i++) { struct file *f = NULL; if (ctx->file_table.data.nodes[i]) @@ -218,7 +204,7 @@ __cold void io_uring_show_fdinfo(struct seq_file *m, struct file *file) } } seq_printf(m, "UserBufs:\t%u\n", ctx->buf_table.nr); - for (i = 0; has_lock && i < ctx->buf_table.nr; i++) { + for (i = 0; i < ctx->buf_table.nr; i++) { struct io_mapped_ubuf *buf = NULL; if (ctx->buf_table.nodes[i]) @@ -228,7 +214,7 @@ __cold void io_uring_show_fdinfo(struct seq_file *m, struct file *file) else seq_printf(m, "%5u: \n", i); } - if (has_lock && !xa_empty(&ctx->personalities)) { + if (!xa_empty(&ctx->personalities)) { unsigned long index; const struct cred *cred; @@ -238,7 +224,7 @@ __cold void io_uring_show_fdinfo(struct seq_file *m, struct file *file) } seq_puts(m, "PollList:\n"); - for (i = 0; has_lock && i < (1U << ctx->cancel_table.hash_bits); i++) { + for (i = 0; i < (1U << ctx->cancel_table.hash_bits); i++) { struct io_hash_bucket *hb = &ctx->cancel_table.hbs[i]; struct io_kiocb *req; @@ -247,9 +233,6 @@ __cold void io_uring_show_fdinfo(struct seq_file *m, struct file *file) task_work_pending(req->tctx->task)); } - if (has_lock) - mutex_unlock(&ctx->uring_lock); - seq_puts(m, "CqOverflowList:\n"); spin_lock(&ctx->completion_lock); list_for_each_entry(ocqe, &ctx->cq_overflow_list, list) { @@ -262,4 +245,23 @@ __cold void io_uring_show_fdinfo(struct seq_file *m, struct file *file) spin_unlock(&ctx->completion_lock); napi_show_fdinfo(ctx, m); } + +/* + * Caller holds a reference to the file already, we don't need to do + * anything else to get an extra reference. + */ +__cold void io_uring_show_fdinfo(struct seq_file *m, struct file *file) +{ + struct io_ring_ctx *ctx = file->private_data; + + /* + * Avoid ABBA deadlock between the seq lock and the io_uring mutex, + * since fdinfo case grabs it in the opposite direction of normal use + * cases. + */ + if (mutex_trylock(&ctx->uring_lock)) { + __io_uring_show_fdinfo(ctx, m); + mutex_unlock(&ctx->uring_lock); + } +} #endif -- 2.51.0 From 9a109266278f200ae0b64508273fea3db8af7a9e Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 16 May 2025 12:10:12 -0600 Subject: [PATCH 08/16] io_uring/fdinfo: only compile if CONFIG_PROC_FS is set Rather than wrap fdinfo.c in one big if, handle it on the Makefile side instead. io_uring.c already conditionally sets fops->fdinfo() anyway. 
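The output in question is the text exposed under /proc/<pid>/fdinfo/<ring fd>. A quick userspace sketch for dumping it, illustrative only and not part of the patch; it needs a kernel with CONFIG_PROC_FS and liburing:

#include <liburing.h>
#include <stdio.h>

int main(void)
{
        struct io_uring ring;
        char path[64], line[256];
        FILE *f;

        if (io_uring_queue_init(8, &ring, 0))
                return 1;

        snprintf(path, sizeof(path), "/proc/self/fdinfo/%d", ring.ring_fd);
        f = fopen(path, "r");
        if (f) {
                /* SqHead, CqHead, SqThread info, UserFiles, UserBufs, ... */
                while (fgets(line, sizeof(line), f))
                        fputs(line, stdout);
                fclose(f);
        }

        io_uring_queue_exit(&ring);
        return 0;
}
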
Signed-off-by: Jens Axboe --- io_uring/Makefile | 3 ++- io_uring/fdinfo.c | 2 -- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/io_uring/Makefile b/io_uring/Makefile index 11a739927a62..d97c6b51d584 100644 --- a/io_uring/Makefile +++ b/io_uring/Makefile @@ -11,7 +11,7 @@ obj-$(CONFIG_IO_URING) += io_uring.o opdef.o kbuf.o rsrc.o notif.o \ eventfd.o uring_cmd.o openclose.o \ sqpoll.o xattr.o nop.o fs.o splice.o \ sync.o msg_ring.o advise.o openclose.o \ - statx.o timeout.o fdinfo.o cancel.o \ + statx.o timeout.o cancel.o \ waitid.o register.o truncate.o \ memmap.o alloc_cache.o obj-$(CONFIG_IO_URING_ZCRX) += zcrx.o @@ -20,3 +20,4 @@ obj-$(CONFIG_FUTEX) += futex.o obj-$(CONFIG_EPOLL) += epoll.o obj-$(CONFIG_NET_RX_BUSY_POLL) += napi.o obj-$(CONFIG_NET) += net.o cmd_net.o +obj-$(CONFIG_PROC_FS) += fdinfo.o diff --git a/io_uring/fdinfo.c b/io_uring/fdinfo.c index e0d6a59a89fa..b83296eee5f8 100644 --- a/io_uring/fdinfo.c +++ b/io_uring/fdinfo.c @@ -15,7 +15,6 @@ #include "cancel.h" #include "rsrc.h" -#ifdef CONFIG_PROC_FS static __cold int io_uring_show_cred(struct seq_file *m, unsigned int id, const struct cred *cred) { @@ -264,4 +263,3 @@ __cold void io_uring_show_fdinfo(struct seq_file *m, struct file *file) mutex_unlock(&ctx->uring_lock); } } -#endif -- 2.51.0 From 16256648cd0877aed9ede41d5d4ad3c1d65d9b2b Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 16 May 2025 12:33:28 -0600 Subject: [PATCH 09/16] io_uring/fdinfo: get rid of dumping credentials It's a faily obscure feature, and registered credentials would for that mostly be a static thing. Don't bother including code to dump the personalities indices. Signed-off-by: Jens Axboe --- io_uring/fdinfo.c | 38 -------------------------------------- 1 file changed, 38 deletions(-) diff --git a/io_uring/fdinfo.c b/io_uring/fdinfo.c index b83296eee5f8..e9355276ab5d 100644 --- a/io_uring/fdinfo.c +++ b/io_uring/fdinfo.c @@ -15,36 +15,6 @@ #include "cancel.h" #include "rsrc.h" -static __cold int io_uring_show_cred(struct seq_file *m, unsigned int id, - const struct cred *cred) -{ - struct user_namespace *uns = seq_user_ns(m); - struct group_info *gi; - kernel_cap_t cap; - int g; - - seq_printf(m, "%5d\n", id); - seq_put_decimal_ull(m, "\tUid:\t", from_kuid_munged(uns, cred->uid)); - seq_put_decimal_ull(m, "\t\t", from_kuid_munged(uns, cred->euid)); - seq_put_decimal_ull(m, "\t\t", from_kuid_munged(uns, cred->suid)); - seq_put_decimal_ull(m, "\t\t", from_kuid_munged(uns, cred->fsuid)); - seq_put_decimal_ull(m, "\n\tGid:\t", from_kgid_munged(uns, cred->gid)); - seq_put_decimal_ull(m, "\t\t", from_kgid_munged(uns, cred->egid)); - seq_put_decimal_ull(m, "\t\t", from_kgid_munged(uns, cred->sgid)); - seq_put_decimal_ull(m, "\t\t", from_kgid_munged(uns, cred->fsgid)); - seq_puts(m, "\n\tGroups:\t"); - gi = cred->group_info; - for (g = 0; g < gi->ngroups; g++) { - seq_put_decimal_ull(m, g ? 
" " : "", - from_kgid_munged(uns, gi->gid[g])); - } - seq_puts(m, "\n\tCapEff:\t"); - cap = cred->cap_effective; - seq_put_hex_ll(m, NULL, cap.val, 16); - seq_putc(m, '\n'); - return 0; -} - #ifdef CONFIG_NET_RX_BUSY_POLL static __cold void common_tracking_show_fdinfo(struct io_ring_ctx *ctx, struct seq_file *m, @@ -213,14 +183,6 @@ static void __io_uring_show_fdinfo(struct io_ring_ctx *ctx, struct seq_file *m) else seq_printf(m, "%5u: \n", i); } - if (!xa_empty(&ctx->personalities)) { - unsigned long index; - const struct cred *cred; - - seq_printf(m, "Personalities:\n"); - xa_for_each(&ctx->personalities, index, cred) - io_uring_show_cred(m, index, cred); - } seq_puts(m, "PollList:\n"); for (i = 0; i < (1U << ctx->cancel_table.hash_bits); i++) { -- 2.51.0 From 5288b9e28f8a6f464746ddabcf9bf49d1323acfc Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Wed, 14 May 2025 09:07:20 +0100 Subject: [PATCH 10/16] io_uring: open code io_req_cqe_overflow() A preparation patch, just open code io_req_cqe_overflow(). Signed-off-by: Pavel Begunkov Reviewed-by: Caleb Sander Mateos Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 43c285cd2294..e4d6e572eabc 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -739,14 +739,6 @@ static bool io_cqring_event_overflow(struct io_ring_ctx *ctx, u64 user_data, return true; } -static void io_req_cqe_overflow(struct io_kiocb *req) -{ - io_cqring_event_overflow(req->ctx, req->cqe.user_data, - req->cqe.res, req->cqe.flags, - req->big_cqe.extra1, req->big_cqe.extra2); - memset(&req->big_cqe, 0, sizeof(req->big_cqe)); -} - /* * writes to the cq entry need to come after reading head; the * control dependency is enough as we're using WRITE_ONCE to @@ -1435,11 +1427,19 @@ void __io_submit_flush_completions(struct io_ring_ctx *ctx) unlikely(!io_fill_cqe_req(ctx, req))) { if (ctx->lockless_cq) { spin_lock(&ctx->completion_lock); - io_req_cqe_overflow(req); + io_cqring_event_overflow(req->ctx, req->cqe.user_data, + req->cqe.res, req->cqe.flags, + req->big_cqe.extra1, + req->big_cqe.extra2); spin_unlock(&ctx->completion_lock); } else { - io_req_cqe_overflow(req); + io_cqring_event_overflow(req->ctx, req->cqe.user_data, + req->cqe.res, req->cqe.flags, + req->big_cqe.extra1, + req->big_cqe.extra2); } + + memset(&req->big_cqe, 0, sizeof(req->big_cqe)); } } __io_cq_unlock_post(ctx); -- 2.51.0 From 10f466abc404443cb72ab3384f297345ac7415e1 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 16 May 2025 09:39:14 -0600 Subject: [PATCH 11/16] io_uring: split alloc and add of overflow Add a new helper, io_alloc_ocqe(), that simply allocates and fills an overflow entry. Then it can get done outside of the locking section, and hence use more appropriate gfp_t allocation flags rather than always default to GFP_ATOMIC. 
Inspired by a previous series from Pavel: https://lore.kernel.org/io-uring/cover.1747209332.git.asml.silence@gmail.com/ Reviewed-by: Caleb Sander Mateos Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 74 +++++++++++++++++++++++++++------------------ 1 file changed, 45 insertions(+), 29 deletions(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index e4d6e572eabc..c2f9610fa891 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -697,20 +697,11 @@ static __cold void io_uring_drop_tctx_refs(struct task_struct *task) } } -static bool io_cqring_event_overflow(struct io_ring_ctx *ctx, u64 user_data, - s32 res, u32 cflags, u64 extra1, u64 extra2) +static bool io_cqring_add_overflow(struct io_ring_ctx *ctx, + struct io_overflow_cqe *ocqe) { - struct io_overflow_cqe *ocqe; - size_t ocq_size = sizeof(struct io_overflow_cqe); - bool is_cqe32 = (ctx->flags & IORING_SETUP_CQE32); - lockdep_assert_held(&ctx->completion_lock); - if (is_cqe32) - ocq_size += sizeof(struct io_uring_cqe); - - ocqe = kmalloc(ocq_size, GFP_ATOMIC | __GFP_ACCOUNT); - trace_io_uring_cqe_overflow(ctx, user_data, res, cflags, ocqe); if (!ocqe) { struct io_rings *r = ctx->rings; @@ -728,17 +719,35 @@ static bool io_cqring_event_overflow(struct io_ring_ctx *ctx, u64 user_data, atomic_or(IORING_SQ_CQ_OVERFLOW, &ctx->rings->sq_flags); } - ocqe->cqe.user_data = user_data; - ocqe->cqe.res = res; - ocqe->cqe.flags = cflags; - if (is_cqe32) { - ocqe->cqe.big_cqe[0] = extra1; - ocqe->cqe.big_cqe[1] = extra2; - } list_add_tail(&ocqe->list, &ctx->cq_overflow_list); return true; } +static struct io_overflow_cqe *io_alloc_ocqe(struct io_ring_ctx *ctx, + u64 user_data, s32 res, u32 cflags, + u64 extra1, u64 extra2, gfp_t gfp) +{ + struct io_overflow_cqe *ocqe; + size_t ocq_size = sizeof(struct io_overflow_cqe); + bool is_cqe32 = (ctx->flags & IORING_SETUP_CQE32); + + if (is_cqe32) + ocq_size += sizeof(struct io_uring_cqe); + + ocqe = kmalloc(ocq_size, gfp | __GFP_ACCOUNT); + trace_io_uring_cqe_overflow(ctx, user_data, res, cflags, ocqe); + if (ocqe) { + ocqe->cqe.user_data = user_data; + ocqe->cqe.res = res; + ocqe->cqe.flags = cflags; + if (is_cqe32) { + ocqe->cqe.big_cqe[0] = extra1; + ocqe->cqe.big_cqe[1] = extra2; + } + } + return ocqe; +} + /* * writes to the cq entry need to come after reading head; the * control dependency is enough as we're using WRITE_ONCE to @@ -803,8 +812,12 @@ bool io_post_aux_cqe(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags io_cq_lock(ctx); filled = io_fill_cqe_aux(ctx, user_data, res, cflags); - if (!filled) - filled = io_cqring_event_overflow(ctx, user_data, res, cflags, 0, 0); + if (unlikely(!filled)) { + struct io_overflow_cqe *ocqe; + + ocqe = io_alloc_ocqe(ctx, user_data, res, cflags, 0, 0, GFP_ATOMIC); + filled = io_cqring_add_overflow(ctx, ocqe); + } io_cq_unlock_post(ctx); return filled; } @@ -819,8 +832,11 @@ void io_add_aux_cqe(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags) lockdep_assert(ctx->lockless_cq); if (!io_fill_cqe_aux(ctx, user_data, res, cflags)) { + struct io_overflow_cqe *ocqe; + + ocqe = io_alloc_ocqe(ctx, user_data, res, cflags, 0, 0, GFP_KERNEL); spin_lock(&ctx->completion_lock); - io_cqring_event_overflow(ctx, user_data, res, cflags, 0, 0); + io_cqring_add_overflow(ctx, ocqe); spin_unlock(&ctx->completion_lock); } ctx->submit_state.cq_flush = true; @@ -1425,18 +1441,18 @@ void __io_submit_flush_completions(struct io_ring_ctx *ctx) */ if (!(req->flags & (REQ_F_CQE_SKIP | REQ_F_REISSUE)) && unlikely(!io_fill_cqe_req(ctx, req))) { + gfp_t 
gfp = ctx->lockless_cq ? GFP_KERNEL : GFP_ATOMIC; + struct io_overflow_cqe *ocqe; + + ocqe = io_alloc_ocqe(ctx, req->cqe.user_data, req->cqe.res, + req->cqe.flags, req->big_cqe.extra1, + req->big_cqe.extra2, gfp); if (ctx->lockless_cq) { spin_lock(&ctx->completion_lock); - io_cqring_event_overflow(req->ctx, req->cqe.user_data, - req->cqe.res, req->cqe.flags, - req->big_cqe.extra1, - req->big_cqe.extra2); + io_cqring_add_overflow(ctx, ocqe); spin_unlock(&ctx->completion_lock); } else { - io_cqring_event_overflow(req->ctx, req->cqe.user_data, - req->cqe.res, req->cqe.flags, - req->big_cqe.extra1, - req->big_cqe.extra2); + io_cqring_add_overflow(ctx, ocqe); } memset(&req->big_cqe, 0, sizeof(req->big_cqe)); -- 2.51.0 From 072d37b52c914271319b9f7e596ff3cba02e249c Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 16 May 2025 09:45:24 -0600 Subject: [PATCH 12/16] io_uring: make io_alloc_ocqe() take a struct io_cqe pointer The number of arguments to io_alloc_ocqe() is a bit unwieldy. Make it take a struct io_cqe pointer rather than three separate CQE args. One path already has that readily available, add an io_init_cqe() helper for the remaining two. Reviewed-by: Caleb Sander Mateos Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 26 ++++++++++++++++---------- 1 file changed, 16 insertions(+), 10 deletions(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index c2f9610fa891..02d597716467 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -724,8 +724,8 @@ static bool io_cqring_add_overflow(struct io_ring_ctx *ctx, } static struct io_overflow_cqe *io_alloc_ocqe(struct io_ring_ctx *ctx, - u64 user_data, s32 res, u32 cflags, - u64 extra1, u64 extra2, gfp_t gfp) + struct io_cqe *cqe, u64 extra1, + u64 extra2, gfp_t gfp) { struct io_overflow_cqe *ocqe; size_t ocq_size = sizeof(struct io_overflow_cqe); @@ -735,11 +735,11 @@ static struct io_overflow_cqe *io_alloc_ocqe(struct io_ring_ctx *ctx, ocq_size += sizeof(struct io_uring_cqe); ocqe = kmalloc(ocq_size, gfp | __GFP_ACCOUNT); - trace_io_uring_cqe_overflow(ctx, user_data, res, cflags, ocqe); + trace_io_uring_cqe_overflow(ctx, cqe->user_data, cqe->res, cqe->flags, ocqe); if (ocqe) { - ocqe->cqe.user_data = user_data; - ocqe->cqe.res = res; - ocqe->cqe.flags = cflags; + ocqe->cqe.user_data = cqe->user_data; + ocqe->cqe.res = cqe->res; + ocqe->cqe.flags = cqe->flags; if (is_cqe32) { ocqe->cqe.big_cqe[0] = extra1; ocqe->cqe.big_cqe[1] = extra2; @@ -806,6 +806,11 @@ static bool io_fill_cqe_aux(struct io_ring_ctx *ctx, u64 user_data, s32 res, return false; } +static inline struct io_cqe io_init_cqe(u64 user_data, s32 res, u32 cflags) +{ + return (struct io_cqe) { .user_data = user_data, .res = res, .flags = cflags }; +} + bool io_post_aux_cqe(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags) { bool filled; @@ -814,8 +819,9 @@ bool io_post_aux_cqe(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags filled = io_fill_cqe_aux(ctx, user_data, res, cflags); if (unlikely(!filled)) { struct io_overflow_cqe *ocqe; + struct io_cqe cqe = io_init_cqe(user_data, res, cflags); - ocqe = io_alloc_ocqe(ctx, user_data, res, cflags, 0, 0, GFP_ATOMIC); + ocqe = io_alloc_ocqe(ctx, &cqe, 0, 0, GFP_ATOMIC); filled = io_cqring_add_overflow(ctx, ocqe); } io_cq_unlock_post(ctx); @@ -833,8 +839,9 @@ void io_add_aux_cqe(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags) if (!io_fill_cqe_aux(ctx, user_data, res, cflags)) { struct io_overflow_cqe *ocqe; + struct io_cqe cqe = io_init_cqe(user_data, res, cflags); - ocqe = 
io_alloc_ocqe(ctx, user_data, res, cflags, 0, 0, GFP_KERNEL); + ocqe = io_alloc_ocqe(ctx, &cqe, 0, 0, GFP_KERNEL); spin_lock(&ctx->completion_lock); io_cqring_add_overflow(ctx, ocqe); spin_unlock(&ctx->completion_lock); @@ -1444,8 +1451,7 @@ void __io_submit_flush_completions(struct io_ring_ctx *ctx) gfp_t gfp = ctx->lockless_cq ? GFP_KERNEL : GFP_ATOMIC; struct io_overflow_cqe *ocqe; - ocqe = io_alloc_ocqe(ctx, req->cqe.user_data, req->cqe.res, - req->cqe.flags, req->big_cqe.extra1, + ocqe = io_alloc_ocqe(ctx, &req->cqe, req->big_cqe.extra1, req->big_cqe.extra2, gfp); if (ctx->lockless_cq) { spin_lock(&ctx->completion_lock); -- 2.51.0 From c80bdb1c55719cd6308d648a7920272a3be09e34 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 16 May 2025 13:16:44 -0600 Subject: [PATCH 13/16] io_uring: pass in struct io_big_cqe to io_alloc_ocqe() Rather than pass extra1/extra2 separately, just pass in the (now) named io_big_cqe struct instead. The callers that don't use/support CQE32 will now just pass a single NULL, rather than two seperate mystery zero values. Move the clearing of the big_cqe elements into io_alloc_ocqe() as well, so it can get moved out of the generic code. Reviewed-by: Caleb Sander Mateos Signed-off-by: Jens Axboe --- include/linux/io_uring_types.h | 2 +- io_uring/io_uring.c | 23 +++++++++++------------ 2 files changed, 12 insertions(+), 13 deletions(-) diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h index 00dbd7cd0e7d..2922635986f5 100644 --- a/include/linux/io_uring_types.h +++ b/include/linux/io_uring_types.h @@ -710,7 +710,7 @@ struct io_kiocb { const struct cred *creds; struct io_wq_work work; - struct { + struct io_big_cqe { u64 extra1; u64 extra2; } big_cqe; diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 02d597716467..4081ffd890af 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -724,8 +724,8 @@ static bool io_cqring_add_overflow(struct io_ring_ctx *ctx, } static struct io_overflow_cqe *io_alloc_ocqe(struct io_ring_ctx *ctx, - struct io_cqe *cqe, u64 extra1, - u64 extra2, gfp_t gfp) + struct io_cqe *cqe, + struct io_big_cqe *big_cqe, gfp_t gfp) { struct io_overflow_cqe *ocqe; size_t ocq_size = sizeof(struct io_overflow_cqe); @@ -734,17 +734,19 @@ static struct io_overflow_cqe *io_alloc_ocqe(struct io_ring_ctx *ctx, if (is_cqe32) ocq_size += sizeof(struct io_uring_cqe); - ocqe = kmalloc(ocq_size, gfp | __GFP_ACCOUNT); + ocqe = kzalloc(ocq_size, gfp | __GFP_ACCOUNT); trace_io_uring_cqe_overflow(ctx, cqe->user_data, cqe->res, cqe->flags, ocqe); if (ocqe) { ocqe->cqe.user_data = cqe->user_data; ocqe->cqe.res = cqe->res; ocqe->cqe.flags = cqe->flags; - if (is_cqe32) { - ocqe->cqe.big_cqe[0] = extra1; - ocqe->cqe.big_cqe[1] = extra2; + if (is_cqe32 && big_cqe) { + ocqe->cqe.big_cqe[0] = big_cqe->extra1; + ocqe->cqe.big_cqe[1] = big_cqe->extra2; } } + if (big_cqe) + big_cqe->extra1 = big_cqe->extra2 = 0; return ocqe; } @@ -821,7 +823,7 @@ bool io_post_aux_cqe(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags struct io_overflow_cqe *ocqe; struct io_cqe cqe = io_init_cqe(user_data, res, cflags); - ocqe = io_alloc_ocqe(ctx, &cqe, 0, 0, GFP_ATOMIC); + ocqe = io_alloc_ocqe(ctx, &cqe, NULL, GFP_ATOMIC); filled = io_cqring_add_overflow(ctx, ocqe); } io_cq_unlock_post(ctx); @@ -841,7 +843,7 @@ void io_add_aux_cqe(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags) struct io_overflow_cqe *ocqe; struct io_cqe cqe = io_init_cqe(user_data, res, cflags); - ocqe = io_alloc_ocqe(ctx, &cqe, 0, 0, GFP_KERNEL); + ocqe = 
io_alloc_ocqe(ctx, &cqe, NULL, GFP_KERNEL); spin_lock(&ctx->completion_lock); io_cqring_add_overflow(ctx, ocqe); spin_unlock(&ctx->completion_lock); @@ -1451,8 +1453,7 @@ void __io_submit_flush_completions(struct io_ring_ctx *ctx) gfp_t gfp = ctx->lockless_cq ? GFP_KERNEL : GFP_ATOMIC; struct io_overflow_cqe *ocqe; - ocqe = io_alloc_ocqe(ctx, &req->cqe, req->big_cqe.extra1, - req->big_cqe.extra2, gfp); + ocqe = io_alloc_ocqe(ctx, &req->cqe, &req->big_cqe, gfp); if (ctx->lockless_cq) { spin_lock(&ctx->completion_lock); io_cqring_add_overflow(ctx, ocqe); @@ -1460,8 +1461,6 @@ void __io_submit_flush_completions(struct io_ring_ctx *ctx) } else { io_cqring_add_overflow(ctx, ocqe); } - - memset(&req->big_cqe, 0, sizeof(req->big_cqe)); } } __io_cq_unlock_post(ctx); -- 2.51.0 From f660fd2ca15a3743f65f6110ae60d5b80500d856 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 16 May 2025 13:39:32 -0600 Subject: [PATCH 14/16] io_uring: add new helpers for posting overflows Add two helpers, one for posting overflows for lockless_cq rings, and one for non-lockless_cq rings. The former can allocate sanely with GFP_KERNEL, but needs to grab the completion lock for posting, while the latter must do non-sleeping allocs as it already holds the completion lock. While at it, mark the overflow handling functions as __cold as well, as they should not generally be called during normal operations of the ring. Reviewed-by: Caleb Sander Mateos Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 50 ++++++++++++++++++++++++++------------------- 1 file changed, 29 insertions(+), 21 deletions(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 4081ffd890af..3c4a9561941f 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -697,8 +697,8 @@ static __cold void io_uring_drop_tctx_refs(struct task_struct *task) } } -static bool io_cqring_add_overflow(struct io_ring_ctx *ctx, - struct io_overflow_cqe *ocqe) +static __cold bool io_cqring_add_overflow(struct io_ring_ctx *ctx, + struct io_overflow_cqe *ocqe) { lockdep_assert_held(&ctx->completion_lock); @@ -813,6 +813,27 @@ static inline struct io_cqe io_init_cqe(u64 user_data, s32 res, u32 cflags) return (struct io_cqe) { .user_data = user_data, .res = res, .flags = cflags }; } +static __cold void io_cqe_overflow(struct io_ring_ctx *ctx, struct io_cqe *cqe, + struct io_big_cqe *big_cqe) +{ + struct io_overflow_cqe *ocqe; + + ocqe = io_alloc_ocqe(ctx, cqe, big_cqe, GFP_KERNEL); + spin_lock(&ctx->completion_lock); + io_cqring_add_overflow(ctx, ocqe); + spin_unlock(&ctx->completion_lock); +} + +static __cold bool io_cqe_overflow_locked(struct io_ring_ctx *ctx, + struct io_cqe *cqe, + struct io_big_cqe *big_cqe) +{ + struct io_overflow_cqe *ocqe; + + ocqe = io_alloc_ocqe(ctx, cqe, big_cqe, GFP_ATOMIC); + return io_cqring_add_overflow(ctx, ocqe); +} + bool io_post_aux_cqe(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags) { bool filled; @@ -820,11 +841,9 @@ bool io_post_aux_cqe(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags io_cq_lock(ctx); filled = io_fill_cqe_aux(ctx, user_data, res, cflags); if (unlikely(!filled)) { - struct io_overflow_cqe *ocqe; struct io_cqe cqe = io_init_cqe(user_data, res, cflags); - ocqe = io_alloc_ocqe(ctx, &cqe, NULL, GFP_ATOMIC); - filled = io_cqring_add_overflow(ctx, ocqe); + filled = io_cqe_overflow_locked(ctx, &cqe, NULL); } io_cq_unlock_post(ctx); return filled; @@ -840,13 +859,9 @@ void io_add_aux_cqe(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags) lockdep_assert(ctx->lockless_cq); if 
(!io_fill_cqe_aux(ctx, user_data, res, cflags)) { - struct io_overflow_cqe *ocqe; struct io_cqe cqe = io_init_cqe(user_data, res, cflags); - ocqe = io_alloc_ocqe(ctx, &cqe, NULL, GFP_KERNEL); - spin_lock(&ctx->completion_lock); - io_cqring_add_overflow(ctx, ocqe); - spin_unlock(&ctx->completion_lock); + io_cqe_overflow(ctx, &cqe, NULL); } ctx->submit_state.cq_flush = true; } @@ -1450,17 +1465,10 @@ void __io_submit_flush_completions(struct io_ring_ctx *ctx) */ if (!(req->flags & (REQ_F_CQE_SKIP | REQ_F_REISSUE)) && unlikely(!io_fill_cqe_req(ctx, req))) { - gfp_t gfp = ctx->lockless_cq ? GFP_KERNEL : GFP_ATOMIC; - struct io_overflow_cqe *ocqe; - - ocqe = io_alloc_ocqe(ctx, &req->cqe, &req->big_cqe, gfp); - if (ctx->lockless_cq) { - spin_lock(&ctx->completion_lock); - io_cqring_add_overflow(ctx, ocqe); - spin_unlock(&ctx->completion_lock); - } else { - io_cqring_add_overflow(ctx, ocqe); - } + if (ctx->lockless_cq) + io_cqe_overflow(ctx, &req->cqe, &req->big_cqe); + else + io_cqe_overflow_locked(ctx, &req->cqe, &req->big_cqe); } } __io_cq_unlock_post(ctx); -- 2.51.0 From 8bb9d6ccd36062d16baa707b759809e1f494017e Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 8 May 2025 14:48:33 -0600 Subject: [PATCH 15/16] io_uring: finish IOU_OK -> IOU_COMPLETE transition IOU_COMPLETE is more descriptive, in that it explicitly says that the return value means "please post a completion for this request". This patch completes the transition from IOU_OK to IOU_COMPLETE, replacing existing IOU_OK users. This is a purely mechanical change. Signed-off-by: Jens Axboe --- io_uring/advise.c | 4 ++-- io_uring/cancel.c | 2 +- io_uring/epoll.c | 4 ++-- io_uring/fs.c | 10 +++++----- io_uring/futex.c | 6 +++--- io_uring/io_uring.c | 2 +- io_uring/io_uring.h | 1 - io_uring/kbuf.c | 2 +- io_uring/msg_ring.c | 2 +- io_uring/net.c | 14 +++++++------- io_uring/nop.c | 2 +- io_uring/openclose.c | 8 ++++---- io_uring/poll.c | 4 ++-- io_uring/rsrc.c | 2 +- io_uring/rw.c | 2 +- io_uring/splice.c | 4 ++-- io_uring/statx.c | 2 +- io_uring/sync.c | 6 +++--- io_uring/timeout.c | 2 +- io_uring/truncate.c | 2 +- io_uring/uring_cmd.c | 2 +- io_uring/waitid.c | 2 +- io_uring/xattr.c | 8 ++++---- 23 files changed, 46 insertions(+), 47 deletions(-) diff --git a/io_uring/advise.c b/io_uring/advise.c index cb7b881665e5..0073f74e3658 100644 --- a/io_uring/advise.c +++ b/io_uring/advise.c @@ -58,7 +58,7 @@ int io_madvise(struct io_kiocb *req, unsigned int issue_flags) ret = do_madvise(current->mm, ma->addr, ma->len, ma->advice); io_req_set_res(req, ret, 0); - return IOU_OK; + return IOU_COMPLETE; #else return -EOPNOTSUPP; #endif @@ -104,5 +104,5 @@ int io_fadvise(struct io_kiocb *req, unsigned int issue_flags) if (ret < 0) req_set_fail(req); io_req_set_res(req, ret, 0); - return IOU_OK; + return IOU_COMPLETE; } diff --git a/io_uring/cancel.c b/io_uring/cancel.c index 0870060bac7c..6d57602304df 100644 --- a/io_uring/cancel.c +++ b/io_uring/cancel.c @@ -229,7 +229,7 @@ done: if (ret < 0) req_set_fail(req); io_req_set_res(req, ret, 0); - return IOU_OK; + return IOU_COMPLETE; } static int __io_sync_cancel(struct io_uring_task *tctx, diff --git a/io_uring/epoll.c b/io_uring/epoll.c index 6d2c48ba1923..8d4610246ba0 100644 --- a/io_uring/epoll.c +++ b/io_uring/epoll.c @@ -61,7 +61,7 @@ int io_epoll_ctl(struct io_kiocb *req, unsigned int issue_flags) if (ret < 0) req_set_fail(req); io_req_set_res(req, ret, 0); - return IOU_OK; + return IOU_COMPLETE; } int io_epoll_wait_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) @@ -88,5 +88,5 @@ 
int io_epoll_wait(struct io_kiocb *req, unsigned int issue_flags) req_set_fail(req); io_req_set_res(req, ret, 0); - return IOU_OK; + return IOU_COMPLETE; } diff --git a/io_uring/fs.c b/io_uring/fs.c index eccea851dd5a..37079a414eab 100644 --- a/io_uring/fs.c +++ b/io_uring/fs.c @@ -90,7 +90,7 @@ int io_renameat(struct io_kiocb *req, unsigned int issue_flags) req->flags &= ~REQ_F_NEED_CLEANUP; io_req_set_res(req, ret, 0); - return IOU_OK; + return IOU_COMPLETE; } void io_renameat_cleanup(struct io_kiocb *req) @@ -141,7 +141,7 @@ int io_unlinkat(struct io_kiocb *req, unsigned int issue_flags) req->flags &= ~REQ_F_NEED_CLEANUP; io_req_set_res(req, ret, 0); - return IOU_OK; + return IOU_COMPLETE; } void io_unlinkat_cleanup(struct io_kiocb *req) @@ -185,7 +185,7 @@ int io_mkdirat(struct io_kiocb *req, unsigned int issue_flags) req->flags &= ~REQ_F_NEED_CLEANUP; io_req_set_res(req, ret, 0); - return IOU_OK; + return IOU_COMPLETE; } void io_mkdirat_cleanup(struct io_kiocb *req) @@ -235,7 +235,7 @@ int io_symlinkat(struct io_kiocb *req, unsigned int issue_flags) req->flags &= ~REQ_F_NEED_CLEANUP; io_req_set_res(req, ret, 0); - return IOU_OK; + return IOU_COMPLETE; } int io_linkat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) @@ -281,7 +281,7 @@ int io_linkat(struct io_kiocb *req, unsigned int issue_flags) req->flags &= ~REQ_F_NEED_CLEANUP; io_req_set_res(req, ret, 0); - return IOU_OK; + return IOU_COMPLETE; } void io_link_cleanup(struct io_kiocb *req) diff --git a/io_uring/futex.c b/io_uring/futex.c index 0ea4820cd8ff..b34695022baa 100644 --- a/io_uring/futex.c +++ b/io_uring/futex.c @@ -234,7 +234,7 @@ int io_futexv_wait(struct io_kiocb *req, unsigned int issue_flags) kfree(futexv); req->async_data = NULL; req->flags &= ~REQ_F_ASYNC_DATA; - return IOU_OK; + return IOU_COMPLETE; } /* @@ -311,7 +311,7 @@ done: req_set_fail(req); io_req_set_res(req, ret, 0); kfree(ifd); - return IOU_OK; + return IOU_COMPLETE; } int io_futex_wake(struct io_kiocb *req, unsigned int issue_flags) @@ -328,5 +328,5 @@ int io_futex_wake(struct io_kiocb *req, unsigned int issue_flags) if (ret < 0) req_set_fail(req); io_req_set_res(req, ret, 0); - return IOU_OK; + return IOU_COMPLETE; } diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 3c4a9561941f..5cdccf65c652 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -1751,7 +1751,7 @@ static int io_issue_sqe(struct io_kiocb *req, unsigned int issue_flags) ret = __io_issue_sqe(req, issue_flags, def); - if (ret == IOU_OK) { + if (ret == IOU_COMPLETE) { if (issue_flags & IO_URING_F_COMPLETE_DEFER) io_req_complete_defer(req); else diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h index 81f22196a57d..0ea7a435d1de 100644 --- a/io_uring/io_uring.h +++ b/io_uring/io_uring.h @@ -19,7 +19,6 @@ #endif enum { - IOU_OK = 0, /* deprecated, use IOU_COMPLETE */ IOU_COMPLETE = 0, IOU_ISSUE_SKIP_COMPLETE = -EIOCBQUEUED, diff --git a/io_uring/kbuf.c b/io_uring/kbuf.c index 823e7eb15fb2..8cce3ebd813f 100644 --- a/io_uring/kbuf.c +++ b/io_uring/kbuf.c @@ -554,7 +554,7 @@ int io_manage_buffers_legacy(struct io_kiocb *req, unsigned int issue_flags) if (ret < 0) req_set_fail(req); io_req_set_res(req, ret, 0); - return IOU_OK; + return IOU_COMPLETE; } int io_register_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg) diff --git a/io_uring/msg_ring.c b/io_uring/msg_ring.c index 50a958e9c921..71400d6cefc8 100644 --- a/io_uring/msg_ring.c +++ b/io_uring/msg_ring.c @@ -328,7 +328,7 @@ done: req_set_fail(req); } io_req_set_res(req, ret, 0); - return IOU_OK; + return 
IOU_COMPLETE; } int io_uring_sync_msg_ring(struct io_uring_sqe *sqe) diff --git a/io_uring/net.c b/io_uring/net.c index 1fbdb2bbb3f3..ee3f721ad758 100644 --- a/io_uring/net.c +++ b/io_uring/net.c @@ -128,7 +128,7 @@ int io_shutdown(struct io_kiocb *req, unsigned int issue_flags) ret = __sys_shutdown_sock(sock, shutdown->how); io_req_set_res(req, ret, 0); - return IOU_OK; + return IOU_COMPLETE; } static bool io_net_retry(struct socket *sock, int flags) @@ -502,7 +502,7 @@ static inline bool io_send_finish(struct io_kiocb *req, int *ret, /* Otherwise stop bundle and use the current result. */ finish: io_req_set_res(req, *ret, cflags); - *ret = IOU_OK; + *ret = IOU_COMPLETE; return true; } @@ -553,7 +553,7 @@ int io_sendmsg(struct io_kiocb *req, unsigned int issue_flags) else if (sr->done_io) ret = sr->done_io; io_req_set_res(req, ret, 0); - return IOU_OK; + return IOU_COMPLETE; } static int io_send_select_buffer(struct io_kiocb *req, unsigned int issue_flags, @@ -1459,7 +1459,7 @@ int io_send_zc(struct io_kiocb *req, unsigned int issue_flags) io_req_msg_cleanup(req, 0); } io_req_set_res(req, ret, IORING_CQE_F_MORE); - return IOU_OK; + return IOU_COMPLETE; } int io_sendmsg_zc(struct io_kiocb *req, unsigned int issue_flags) @@ -1530,7 +1530,7 @@ int io_sendmsg_zc(struct io_kiocb *req, unsigned int issue_flags) io_req_msg_cleanup(req, 0); } io_req_set_res(req, ret, IORING_CQE_F_MORE); - return IOU_OK; + return IOU_COMPLETE; } void io_sendrecv_fail(struct io_kiocb *req) @@ -1694,7 +1694,7 @@ int io_socket(struct io_kiocb *req, unsigned int issue_flags) sock->file_slot); } io_req_set_res(req, ret, 0); - return IOU_OK; + return IOU_COMPLETE; } int io_connect_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) @@ -1761,7 +1761,7 @@ out: req_set_fail(req); io_req_msg_cleanup(req, issue_flags); io_req_set_res(req, ret, 0); - return IOU_OK; + return IOU_COMPLETE; } int io_bind_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) diff --git a/io_uring/nop.c b/io_uring/nop.c index 28f06285fdc2..6ac2de761fd3 100644 --- a/io_uring/nop.c +++ b/io_uring/nop.c @@ -68,5 +68,5 @@ done: if (ret < 0) req_set_fail(req); io_req_set_res(req, nop->result, 0); - return IOU_OK; + return IOU_COMPLETE; } diff --git a/io_uring/openclose.c b/io_uring/openclose.c index 4dd461163457..83e36ad4e31b 100644 --- a/io_uring/openclose.c +++ b/io_uring/openclose.c @@ -171,7 +171,7 @@ err: if (ret < 0) req_set_fail(req); io_req_set_res(req, ret, 0); - return IOU_OK; + return IOU_COMPLETE; } int io_openat(struct io_kiocb *req, unsigned int issue_flags) @@ -259,7 +259,7 @@ err: if (ret < 0) req_set_fail(req); io_req_set_res(req, ret, 0); - return IOU_OK; + return IOU_COMPLETE; } int io_install_fixed_fd_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) @@ -302,7 +302,7 @@ int io_install_fixed_fd(struct io_kiocb *req, unsigned int issue_flags) if (ret < 0) req_set_fail(req); io_req_set_res(req, ret, 0); - return IOU_OK; + return IOU_COMPLETE; } struct io_pipe { @@ -426,7 +426,7 @@ int io_pipe(struct io_kiocb *req, unsigned int issue_flags) io_req_set_res(req, ret, 0); if (!ret) - return IOU_OK; + return IOU_COMPLETE; req_set_fail(req); if (files[0]) diff --git a/io_uring/poll.c b/io_uring/poll.c index 8eb744eb9f4c..0526062e2f81 100644 --- a/io_uring/poll.c +++ b/io_uring/poll.c @@ -893,7 +893,7 @@ int io_poll_add(struct io_kiocb *req, unsigned int issue_flags) ret = __io_arm_poll_handler(req, poll, &ipt, poll->events, issue_flags); if (ret > 0) { io_req_set_res(req, ipt.result_mask, 0); - return IOU_OK; + return 
IOU_COMPLETE; } return ret ?: IOU_ISSUE_SKIP_COMPLETE; } @@ -948,5 +948,5 @@ out: } /* complete update request, we're done with it */ io_req_set_res(req, ret, 0); - return IOU_OK; + return IOU_COMPLETE; } diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c index 1657d775c8ba..c592ceace97d 100644 --- a/io_uring/rsrc.c +++ b/io_uring/rsrc.c @@ -500,7 +500,7 @@ int io_files_update(struct io_kiocb *req, unsigned int issue_flags) if (ret < 0) req_set_fail(req); io_req_set_res(req, ret, 0); - return IOU_OK; + return IOU_COMPLETE; } void io_free_rsrc_node(struct io_ring_ctx *ctx, struct io_rsrc_node *node) diff --git a/io_uring/rw.c b/io_uring/rw.c index 17a12a1cf3a6..8857b8445e46 100644 --- a/io_uring/rw.c +++ b/io_uring/rw.c @@ -660,7 +660,7 @@ static int kiocb_done(struct io_kiocb *req, ssize_t ret, io_req_io_end(req); io_req_set_res(req, final_ret, io_put_kbuf(req, ret, issue_flags)); io_req_rw_cleanup(req, issue_flags); - return IOU_OK; + return IOU_COMPLETE; } else { io_rw_done(req, ret); } diff --git a/io_uring/splice.c b/io_uring/splice.c index 7b89bd84d486..35ce4e60b495 100644 --- a/io_uring/splice.c +++ b/io_uring/splice.c @@ -103,7 +103,7 @@ done: if (ret != sp->len) req_set_fail(req); io_req_set_res(req, ret, 0); - return IOU_OK; + return IOU_COMPLETE; } int io_splice_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) @@ -144,5 +144,5 @@ done: if (ret != sp->len) req_set_fail(req); io_req_set_res(req, ret, 0); - return IOU_OK; + return IOU_COMPLETE; } diff --git a/io_uring/statx.c b/io_uring/statx.c index 6bc4651700a2..5111e9befbfe 100644 --- a/io_uring/statx.c +++ b/io_uring/statx.c @@ -59,7 +59,7 @@ int io_statx(struct io_kiocb *req, unsigned int issue_flags) ret = do_statx(sx->dfd, sx->filename, sx->flags, sx->mask, sx->buffer); io_req_set_res(req, ret, 0); - return IOU_OK; + return IOU_COMPLETE; } void io_statx_cleanup(struct io_kiocb *req) diff --git a/io_uring/sync.c b/io_uring/sync.c index 255f68c37e55..cea2d381ffd2 100644 --- a/io_uring/sync.c +++ b/io_uring/sync.c @@ -47,7 +47,7 @@ int io_sync_file_range(struct io_kiocb *req, unsigned int issue_flags) ret = sync_file_range(req->file, sync->off, sync->len, sync->flags); io_req_set_res(req, ret, 0); - return IOU_OK; + return IOU_COMPLETE; } int io_fsync_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) @@ -79,7 +79,7 @@ int io_fsync(struct io_kiocb *req, unsigned int issue_flags) ret = vfs_fsync_range(req->file, sync->off, end > 0 ? 
end : LLONG_MAX, sync->flags & IORING_FSYNC_DATASYNC); io_req_set_res(req, ret, 0); - return IOU_OK; + return IOU_COMPLETE; } int io_fallocate_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) @@ -108,5 +108,5 @@ int io_fallocate(struct io_kiocb *req, unsigned int issue_flags) if (ret >= 0) fsnotify_modify(req->file); io_req_set_res(req, ret, 0); - return IOU_OK; + return IOU_COMPLETE; } diff --git a/io_uring/timeout.c b/io_uring/timeout.c index a6ff8c026b1f..7f13bfa9f2b6 100644 --- a/io_uring/timeout.c +++ b/io_uring/timeout.c @@ -505,7 +505,7 @@ int io_timeout_remove(struct io_kiocb *req, unsigned int issue_flags) if (ret < 0) req_set_fail(req); io_req_set_res(req, ret, 0); - return IOU_OK; + return IOU_COMPLETE; } static int __io_timeout_prep(struct io_kiocb *req, diff --git a/io_uring/truncate.c b/io_uring/truncate.c index 62ee73d34d72..487baf23b44e 100644 --- a/io_uring/truncate.c +++ b/io_uring/truncate.c @@ -44,5 +44,5 @@ int io_ftruncate(struct io_kiocb *req, unsigned int issue_flags) ret = do_ftruncate(req->file, ft->len, 1); io_req_set_res(req, ret, 0); - return IOU_OK; + return IOU_COMPLETE; } diff --git a/io_uring/uring_cmd.c b/io_uring/uring_cmd.c index 8a6b0ddef796..fe84c934734e 100644 --- a/io_uring/uring_cmd.c +++ b/io_uring/uring_cmd.c @@ -265,7 +265,7 @@ int io_uring_cmd(struct io_kiocb *req, unsigned int issue_flags) req_set_fail(req); io_req_uring_cleanup(req, issue_flags); io_req_set_res(req, ret, 0); - return IOU_OK; + return IOU_COMPLETE; } int io_uring_cmd_import_fixed(u64 ubuf, unsigned long len, int rw, diff --git a/io_uring/waitid.c b/io_uring/waitid.c index 54e69984cd8a..e07a94694397 100644 --- a/io_uring/waitid.c +++ b/io_uring/waitid.c @@ -323,5 +323,5 @@ done: if (ret < 0) req_set_fail(req); io_req_set_res(req, ret, 0); - return IOU_OK; + return IOU_COMPLETE; } diff --git a/io_uring/xattr.c b/io_uring/xattr.c index de5064fcae8a..322b94ff9e4b 100644 --- a/io_uring/xattr.c +++ b/io_uring/xattr.c @@ -109,7 +109,7 @@ int io_fgetxattr(struct io_kiocb *req, unsigned int issue_flags) ret = file_getxattr(req->file, &ix->ctx); io_xattr_finish(req, ret); - return IOU_OK; + return IOU_COMPLETE; } int io_getxattr(struct io_kiocb *req, unsigned int issue_flags) @@ -122,7 +122,7 @@ int io_getxattr(struct io_kiocb *req, unsigned int issue_flags) ret = filename_getxattr(AT_FDCWD, ix->filename, LOOKUP_FOLLOW, &ix->ctx); ix->filename = NULL; io_xattr_finish(req, ret); - return IOU_OK; + return IOU_COMPLETE; } static int __io_setxattr_prep(struct io_kiocb *req, @@ -190,7 +190,7 @@ int io_fsetxattr(struct io_kiocb *req, unsigned int issue_flags) ret = file_setxattr(req->file, &ix->ctx); io_xattr_finish(req, ret); - return IOU_OK; + return IOU_COMPLETE; } int io_setxattr(struct io_kiocb *req, unsigned int issue_flags) @@ -203,5 +203,5 @@ int io_setxattr(struct io_kiocb *req, unsigned int issue_flags) ret = filename_setxattr(AT_FDCWD, ix->filename, LOOKUP_FOLLOW, &ix->ctx); ix->filename = NULL; io_xattr_finish(req, ret); - return IOU_OK; + return IOU_COMPLETE; } -- 2.51.0 From 28be240c763a44932bfe573f09e145d182e52609 Mon Sep 17 00:00:00 2001 From: Caleb Sander Mateos Date: Thu, 22 May 2025 09:04:50 -0600 Subject: [PATCH 16/16] trace/io_uring: fix io_uring_local_work_run ctx documentation The comment for the tracepoint io_uring_local_work_run refers to a field "tctx" and a type "io_uring_ctx", neither of which exist. "tctx" looks to mean "ctx" and "io_uring_ctx" should be "io_ring_ctx". 
Signed-off-by: Caleb Sander Mateos Link: https://lore.kernel.org/r/20250522150451.2385652-1-csander@purestorage.com Signed-off-by: Jens Axboe --- include/trace/events/io_uring.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/trace/events/io_uring.h b/include/trace/events/io_uring.h index fb81c533b310..178ab6f611be 100644 --- a/include/trace/events/io_uring.h +++ b/include/trace/events/io_uring.h @@ -645,7 +645,7 @@ TRACE_EVENT(io_uring_short_write, /* * io_uring_local_work_run - ran ring local task work * - * @tctx: pointer to a io_uring_ctx + * @ctx: pointer to an io_ring_ctx * @count: how many functions it ran * @loops: how many loops it ran * -- 2.51.0