From 63de899cb6220357dea9d0f4e5aa459ff5193bb0 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Fri, 9 May 2025 12:12:53 +0100 Subject: [PATCH 01/16] io_uring: count allocated requests Keep track of the number requests a ring currently has allocated (and not freed), it'll be needed in the next patch. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/c8f8308294dc2a1cb8925d984d937d4fc14ab5d4.1746788718.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- include/linux/io_uring_types.h | 1 + io_uring/io_uring.c | 9 ++++++++- 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h index 7e23e993280e..73b289b48280 100644 --- a/include/linux/io_uring_types.h +++ b/include/linux/io_uring_types.h @@ -435,6 +435,7 @@ struct io_ring_ctx { /* protected by ->completion_lock */ unsigned evfd_last_cq_tail; + unsigned nr_req_allocated; /* * Protection for resize vs mmap races - both the mmap and resize diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 6efecb46c828..714b66ab34b0 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -957,6 +957,8 @@ __cold bool __io_alloc_req_refill(struct io_ring_ctx *ctx) } percpu_ref_get_many(&ctx->refs, ret); + ctx->nr_req_allocated += ret; + while (ret--) { struct io_kiocb *req = reqs[ret]; @@ -2694,8 +2696,10 @@ static void io_req_caches_free(struct io_ring_ctx *ctx) kmem_cache_free(req_cachep, req); nr++; } - if (nr) + if (nr) { + ctx->nr_req_allocated -= nr; percpu_ref_put_many(&ctx->refs, nr); + } mutex_unlock(&ctx->uring_lock); } @@ -2732,6 +2736,9 @@ static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx) percpu_ref_exit(&ctx->refs); free_uid(ctx->user); io_req_caches_free(ctx); + + WARN_ON_ONCE(ctx->nr_req_allocated); + if (ctx->hash_map) io_wq_put_hash(ctx->hash_map); io_napi_free(ctx); -- 2.51.0 From 63166b815dc163b2e46426cecf707dc5923d6d13 Mon Sep 17 00:00:00 2001 From: hexue Date: Mon, 12 May 2025 13:20:25 +0800 Subject: [PATCH 02/16] io_uring/uring_cmd: fix hybrid polling initialization issue Modify the check for whether the timer is initialized during IO transfer when passthrough is used with hybrid polling, to ensure that it's always setup correctly. Cc: stable@vger.kernel.org Fixes: 01ee194d1aba ("io_uring: add support for hybrid IOPOLL") Signed-off-by: hexue Link: https://lore.kernel.org/r/20250512052025.293031-1-xue01.he@samsung.com Signed-off-by: Jens Axboe --- io_uring/uring_cmd.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/io_uring/uring_cmd.c b/io_uring/uring_cmd.c index a9ea7d29cdd9..430ed620ddfe 100644 --- a/io_uring/uring_cmd.c +++ b/io_uring/uring_cmd.c @@ -254,6 +254,11 @@ int io_uring_cmd(struct io_kiocb *req, unsigned int issue_flags) return -EOPNOTSUPP; issue_flags |= IO_URING_F_IOPOLL; req->iopoll_completed = 0; + if (ctx->flags & IORING_SETUP_HYBRID_IOPOLL) { + /* make sure every req only blocks once */ + req->flags &= ~REQ_F_IOPOLL_STATE; + req->iopoll_start = ktime_get_ns(); + } } ret = file->f_op->uring_cmd(ioucmd, issue_flags); -- 2.51.0 From 8fb7aee05591fd4d3dca1460448a59e95fa821c3 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Fri, 9 May 2025 12:12:54 +0100 Subject: [PATCH 03/16] io_uring: drain based on allocates reqs Don't rely on CQ sequence numbers for draining, as it has become messy and needs cq_extra adjustments. Instead, base it on the number of allocated requests and only allow flushing when all requests are in the drain list. 
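For context, draining is requested from userspace by flagging an SQE with IOSQE_IO_DRAIN; a minimal liburing sketch of that trigger (illustrative only, not part of this patch; assumes an already initialized ring and an open fd):

    #include <liburing.h>

    /* An IOSQE_IO_DRAIN request is only issued once every request submitted
     * before it has completed -- the condition the allocated-vs-drained
     * accounting introduced here has to detect. */
    static void submit_drained_fsync(struct io_uring *ring, int fd)
    {
            struct io_uring_sqe *sqe = io_uring_get_sqe(ring);

            io_uring_prep_fsync(sqe, fd, 0);
            sqe->flags |= IOSQE_IO_DRAIN;
            io_uring_submit(ring);
    }
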
As a result, cq_extra is gone, no overhead for its accounting in aux cqe posting, less bloating as it was inlined before, and it's in general simpler than trying to track where we should bump it and where it should be put back like in cases of overflow. Also, it'll likely help with cleaning and unifying some of the CQ posting helpers. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/46ece1e34320b046c06fee2498d6b4cd12a700f2.1746788718.git.asml.silence@gmail.com Link: https://lore.kernel.org/r/24497b04b004bceada496033d3c9d09ff8e81ae9.1746944903.git.asml.silence@gmail.com [axboe: fold in fix from link2] Signed-off-by: Jens Axboe --- include/linux/io_uring_types.h | 2 +- io_uring/io_uring.c | 79 ++++++++++++++-------------------- io_uring/io_uring.h | 3 +- 3 files changed, 34 insertions(+), 50 deletions(-) diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h index 73b289b48280..00dbd7cd0e7d 100644 --- a/include/linux/io_uring_types.h +++ b/include/linux/io_uring_types.h @@ -341,7 +341,6 @@ struct io_ring_ctx { unsigned cached_cq_tail; unsigned cq_entries; struct io_ev_fd __rcu *io_ev_fd; - unsigned cq_extra; void *cq_wait_arg; size_t cq_wait_size; @@ -417,6 +416,7 @@ struct io_ring_ctx { struct callback_head poll_wq_task_work; struct list_head defer_list; + unsigned nr_drained; struct io_alloc_cache msg_cache; spinlock_t msg_lock; diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 714b66ab34b0..9a9b8d35349b 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -129,7 +129,6 @@ struct io_defer_entry { struct list_head list; struct io_kiocb *req; - u32 seq; }; /* requests with any of those set should undergo io_disarm_next() */ @@ -149,6 +148,7 @@ static bool io_uring_try_cancel_requests(struct io_ring_ctx *ctx, bool is_sqpoll_thread); static void io_queue_sqe(struct io_kiocb *req); +static void __io_req_caches_free(struct io_ring_ctx *ctx); static __read_mostly DEFINE_STATIC_KEY_FALSE(io_key_has_sqarray); @@ -540,46 +540,45 @@ void io_req_queue_iowq(struct io_kiocb *req) io_req_task_work_add(req); } -static bool io_drain_defer_seq(struct io_kiocb *req, u32 seq) +static unsigned io_linked_nr(struct io_kiocb *req) { - struct io_ring_ctx *ctx = req->ctx; + struct io_kiocb *tmp; + unsigned nr = 0; - return seq + READ_ONCE(ctx->cq_extra) != ctx->cached_cq_tail; + io_for_each_link(tmp, req) + nr++; + return nr; } -static __cold noinline void __io_queue_deferred(struct io_ring_ctx *ctx) +static __cold noinline void io_queue_deferred(struct io_ring_ctx *ctx) { bool drain_seen = false, first = true; + lockdep_assert_held(&ctx->uring_lock); + __io_req_caches_free(ctx); + while (!list_empty(&ctx->defer_list)) { struct io_defer_entry *de = list_first_entry(&ctx->defer_list, struct io_defer_entry, list); drain_seen |= de->req->flags & REQ_F_IO_DRAIN; - if ((drain_seen || first) && io_drain_defer_seq(de->req, de->seq)) - break; + if ((drain_seen || first) && ctx->nr_req_allocated != ctx->nr_drained) + return; list_del_init(&de->list); + ctx->nr_drained -= io_linked_nr(de->req); io_req_task_queue(de->req); kfree(de); first = false; } } -static __cold noinline void io_queue_deferred(struct io_ring_ctx *ctx) -{ - guard(spinlock)(&ctx->completion_lock); - __io_queue_deferred(ctx); -} - void __io_commit_cqring_flush(struct io_ring_ctx *ctx) { if (ctx->poll_activated) io_poll_wq_wake(ctx); if (ctx->off_timeout_used) io_flush_timeouts(ctx); - if (ctx->drain_active) - io_queue_deferred(ctx); if (ctx->has_evfd) io_eventfd_signal(ctx, true); } @@ -742,7 +741,6 @@ 
static bool io_cqring_event_overflow(struct io_ring_ctx *ctx, u64 user_data, * on the floor. */ WRITE_ONCE(r->cq_overflow, READ_ONCE(r->cq_overflow) + 1); - ctx->cq_extra--; set_bit(IO_CHECK_CQ_DROPPED_BIT, &ctx->check_cq); return false; } @@ -812,8 +810,6 @@ static bool io_fill_cqe_aux(struct io_ring_ctx *ctx, u64 user_data, s32 res, { struct io_uring_cqe *cqe; - ctx->cq_extra++; - if (likely(io_get_cqe(ctx, &cqe))) { WRITE_ONCE(cqe->user_data, user_data); WRITE_ONCE(cqe->res, res); @@ -1459,6 +1455,10 @@ void __io_submit_flush_completions(struct io_ring_ctx *ctx) io_free_batch_list(ctx, state->compl_reqs.first); INIT_WQ_LIST(&state->compl_reqs); } + + if (unlikely(ctx->drain_active)) + io_queue_deferred(ctx); + ctx->submit_state.cq_flush = false; } @@ -1646,17 +1646,6 @@ io_req_flags_t io_file_get_flags(struct file *file) return res; } -static u32 io_get_sequence(struct io_kiocb *req) -{ - u32 seq = req->ctx->cached_sq_head; - struct io_kiocb *cur; - - /* need original cached_sq_head, but it was increased for each req */ - io_for_each_link(cur, req) - seq--; - return seq; -} - static __cold void io_drain_req(struct io_kiocb *req) __must_hold(&ctx->uring_lock) { @@ -1673,14 +1662,12 @@ static __cold void io_drain_req(struct io_kiocb *req) io_prep_async_link(req); trace_io_uring_defer(req); de->req = req; - de->seq = io_get_sequence(req); - scoped_guard(spinlock, &ctx->completion_lock) { - list_add_tail(&de->list, &ctx->defer_list); - __io_queue_deferred(ctx); - if (!drain && list_empty(&ctx->defer_list)) - ctx->drain_active = false; - } + ctx->nr_drained += io_linked_nr(req); + list_add_tail(&de->list, &ctx->defer_list); + io_queue_deferred(ctx); + if (!drain && list_empty(&ctx->defer_list)) + ctx->drain_active = false; } static bool io_assign_file(struct io_kiocb *req, const struct io_issue_def *def, @@ -2263,10 +2250,6 @@ static bool io_get_sqe(struct io_ring_ctx *ctx, const struct io_uring_sqe **sqe) (!(ctx->flags & IORING_SETUP_NO_SQARRAY))) { head = READ_ONCE(ctx->sq_array[head]); if (unlikely(head >= ctx->sq_entries)) { - /* drop invalid entries */ - spin_lock(&ctx->completion_lock); - ctx->cq_extra--; - spin_unlock(&ctx->completion_lock); WRITE_ONCE(ctx->rings->sq_dropped, READ_ONCE(ctx->rings->sq_dropped) + 1); return false; @@ -2684,13 +2667,11 @@ unsigned long rings_size(unsigned int flags, unsigned int sq_entries, return off; } -static void io_req_caches_free(struct io_ring_ctx *ctx) +static __cold void __io_req_caches_free(struct io_ring_ctx *ctx) { struct io_kiocb *req; int nr = 0; - mutex_lock(&ctx->uring_lock); - while (!io_req_cache_empty(ctx)) { req = io_extract_req(ctx); kmem_cache_free(req_cachep, req); @@ -2700,7 +2681,12 @@ static void io_req_caches_free(struct io_ring_ctx *ctx) ctx->nr_req_allocated -= nr; percpu_ref_put_many(&ctx->refs, nr); } - mutex_unlock(&ctx->uring_lock); +} + +static __cold void io_req_caches_free(struct io_ring_ctx *ctx) +{ + guard(mutex)(&ctx->uring_lock); + __io_req_caches_free(ctx); } static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx) @@ -3005,20 +2991,19 @@ static __cold bool io_cancel_defer_files(struct io_ring_ctx *ctx, struct io_defer_entry *de; LIST_HEAD(list); - spin_lock(&ctx->completion_lock); list_for_each_entry_reverse(de, &ctx->defer_list, list) { if (io_match_task_safe(de->req, tctx, cancel_all)) { list_cut_position(&list, &ctx->defer_list, &de->list); break; } } - spin_unlock(&ctx->completion_lock); if (list_empty(&list)) return false; while (!list_empty(&list)) { de = list_first_entry(&list, struct io_defer_entry, 
list); list_del_init(&de->list); + ctx->nr_drained -= io_linked_nr(de->req); io_req_task_queue_fail(de->req, -ECANCELED); kfree(de); } @@ -3093,8 +3078,8 @@ static __cold bool io_uring_try_cancel_requests(struct io_ring_ctx *ctx, if ((ctx->flags & IORING_SETUP_DEFER_TASKRUN) && io_allowed_defer_tw_run(ctx)) ret |= io_run_local_work(ctx, INT_MAX, INT_MAX) > 0; - ret |= io_cancel_defer_files(ctx, tctx, cancel_all); mutex_lock(&ctx->uring_lock); + ret |= io_cancel_defer_files(ctx, tctx, cancel_all); ret |= io_poll_remove_all(ctx, tctx, cancel_all); ret |= io_waitid_remove_all(ctx, tctx, cancel_all); ret |= io_futex_remove_all(ctx, tctx, cancel_all); diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h index e4050b2d0821..81f22196a57d 100644 --- a/io_uring/io_uring.h +++ b/io_uring/io_uring.h @@ -196,7 +196,6 @@ static inline bool io_defer_get_uncommited_cqe(struct io_ring_ctx *ctx, { io_lockdep_assert_cq_locked(ctx); - ctx->cq_extra++; ctx->submit_state.cq_flush = true; return io_get_cqe(ctx, cqe_ret); } @@ -414,7 +413,7 @@ static inline void io_req_complete_defer(struct io_kiocb *req) static inline void io_commit_cqring_flush(struct io_ring_ctx *ctx) { - if (unlikely(ctx->off_timeout_used || ctx->drain_active || + if (unlikely(ctx->off_timeout_used || ctx->has_evfd || ctx->poll_activated)) __io_commit_cqring_flush(ctx); } -- 2.51.0 From f446c6311e86618a1f81eb576b56a6266307238f Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 12 May 2025 09:06:06 -0600 Subject: [PATCH 04/16] io_uring/memmap: don't use page_address() on a highmem page For older/32-bit systems with highmem, don't assume that the pages in a mapped region are always going to be mapped. If io_region_init_ptr() finds that the pages are coalescable, also check if the first page is a HighMem page or not. If it is, fall through to the usual vmap() mapping rather than attempt to get the unmapped page address. Cc: stable@vger.kernel.org Fixes: c4d0ac1c1567 ("io_uring/memmap: optimise single folio regions") Link: https://lore.kernel.org/all/681fe2fb.050a0220.f2294.001a.GAE@google.com/ Reported-by: syzbot+5b8c4abafcb1d791ccfc@syzkaller.appspotmail.com Link: https://lore.kernel.org/all/681fed0a.050a0220.f2294.001c.GAE@google.com/ Reported-by: syzbot+6456a99dfdc2e78c4feb@syzkaller.appspotmail.com Tested-by: syzbot+6456a99dfdc2e78c4feb@syzkaller.appspotmail.com Signed-off-by: Jens Axboe --- io_uring/memmap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/io_uring/memmap.c b/io_uring/memmap.c index 76fcc79656b0..07f8a5cbd37e 100644 --- a/io_uring/memmap.c +++ b/io_uring/memmap.c @@ -116,7 +116,7 @@ static int io_region_init_ptr(struct io_mapped_region *mr) void *ptr; if (io_check_coalesce_buffer(mr->pages, mr->nr_pages, &ifd)) { - if (ifd.nr_folios == 1) { + if (ifd.nr_folios == 1 && !PageHighMem(mr->pages[0])) { mr->ptr = page_address(mr->pages[0]); return 0; } -- 2.51.0 From 475a8d30371604a6363da8e304a608a5959afc40 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Tue, 13 May 2025 18:26:46 +0100 Subject: [PATCH 05/16] io_uring/kbuf: account ring io_buffer_list memory Follow the non-ringed pbuf struct io_buffer_list allocations and account it against the memcg. There is low chance of that being an actual problem as ring provided buffer should either pin user memory or allocate it, which is already accounted. 
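For reference, the allocation being accounted here is the bookkeeping struct the kernel creates when userspace registers a provided-buffer ring; a liburing sketch of that trigger (illustrative only, assumes an already initialized ring):

    #include <liburing.h>

    /* Registering a buffer ring is what makes the kernel allocate the
     * struct io_buffer_list that this patch now charges to the memcg. */
    static int register_buf_ring(struct io_uring *ring)
    {
            struct io_uring_buf_ring *br;
            int err;

            br = io_uring_setup_buf_ring(ring, 256, /* bgid */ 0, 0, &err);
            if (!br)
                    return err;     /* negative errno */
            io_uring_free_buf_ring(ring, br, 256, 0);
            return 0;
    }
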
Cc: stable@vger.kernel.org # 6.1 Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/3985218b50d341273cafff7234e1a7e6d0db9808.1747150490.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/kbuf.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/io_uring/kbuf.c b/io_uring/kbuf.c index 1cf0d2c01287..446207db1edf 100644 --- a/io_uring/kbuf.c +++ b/io_uring/kbuf.c @@ -621,7 +621,7 @@ int io_register_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg) io_destroy_bl(ctx, bl); } - free_bl = bl = kzalloc(sizeof(*bl), GFP_KERNEL); + free_bl = bl = kzalloc(sizeof(*bl), GFP_KERNEL_ACCOUNT); if (!bl) return -ENOMEM; -- 2.51.0 From 1724849072854a66861d461b298b04612702d685 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Tue, 13 May 2025 18:26:47 +0100 Subject: [PATCH 06/16] io_uring/kbuf: use mem_is_zero() Make use of mem_is_zero() for reserved fields checking. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/11fe27b7a831329bcdb4ea087317ef123ba7c171.1747150490.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/kbuf.c | 14 ++++---------- 1 file changed, 4 insertions(+), 10 deletions(-) diff --git a/io_uring/kbuf.c b/io_uring/kbuf.c index 446207db1edf..344517d1d921 100644 --- a/io_uring/kbuf.c +++ b/io_uring/kbuf.c @@ -602,8 +602,7 @@ int io_register_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg) if (copy_from_user(®, arg, sizeof(reg))) return -EFAULT; - - if (reg.resv[0] || reg.resv[1] || reg.resv[2]) + if (!mem_is_zero(reg.resv, sizeof(reg.resv))) return -EINVAL; if (reg.flags & ~(IOU_PBUF_RING_MMAP | IOU_PBUF_RING_INC)) return -EINVAL; @@ -679,9 +678,7 @@ int io_unregister_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg) if (copy_from_user(®, arg, sizeof(reg))) return -EFAULT; - if (reg.resv[0] || reg.resv[1] || reg.resv[2]) - return -EINVAL; - if (reg.flags) + if (!mem_is_zero(reg.resv, sizeof(reg.resv)) || reg.flags) return -EINVAL; bl = io_buffer_get_list(ctx, reg.bgid); @@ -701,14 +698,11 @@ int io_register_pbuf_status(struct io_ring_ctx *ctx, void __user *arg) { struct io_uring_buf_status buf_status; struct io_buffer_list *bl; - int i; if (copy_from_user(&buf_status, arg, sizeof(buf_status))) return -EFAULT; - - for (i = 0; i < ARRAY_SIZE(buf_status.resv); i++) - if (buf_status.resv[i]) - return -EINVAL; + if (!mem_is_zero(buf_status.resv, sizeof(buf_status.resv))) + return -EINVAL; bl = io_buffer_get_list(ctx, buf_status.buf_group); if (!bl) -- 2.51.0 From 4e9fda29d66b06caf5c81b8acbe0a504effc73fb Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Tue, 13 May 2025 18:26:48 +0100 Subject: [PATCH 07/16] io_uring/kbuf: drop extra vars in io_register_pbuf_ring bl and free_bl variables in io_register_pbuf_ring() always point to the same list since we started to reallocate the pre-existent list. Drop free_bl. 
Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/d45c3342d74c9030f99376c777a4b3d59089074d.1747150490.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/kbuf.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/io_uring/kbuf.c b/io_uring/kbuf.c index 344517d1d921..406e8a9b42c3 100644 --- a/io_uring/kbuf.c +++ b/io_uring/kbuf.c @@ -591,7 +591,7 @@ err: int io_register_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg) { struct io_uring_buf_reg reg; - struct io_buffer_list *bl, *free_bl = NULL; + struct io_buffer_list *bl; struct io_uring_region_desc rd; struct io_uring_buf_ring *br; unsigned long mmap_offset; @@ -620,7 +620,7 @@ int io_register_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg) io_destroy_bl(ctx, bl); } - free_bl = bl = kzalloc(sizeof(*bl), GFP_KERNEL_ACCOUNT); + bl = kzalloc(sizeof(*bl), GFP_KERNEL_ACCOUNT); if (!bl) return -ENOMEM; @@ -665,7 +665,7 @@ int io_register_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg) return 0; fail: io_free_region(ctx, &bl->region); - kfree(free_bl); + kfree(bl); return ret; } -- 2.51.0 From 52a05d0cf8f3b4569c525153132a90661c32fe11 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Tue, 13 May 2025 18:26:49 +0100 Subject: [PATCH 08/16] io_uring/kbuf: don't compute size twice on prep The size in prep is calculated by io_provide_buffers_prep(), so remove the recomputation a few lines after. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/7c97206561b74fce245cb22449c6082d2e066844.1747150490.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/kbuf.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/io_uring/kbuf.c b/io_uring/kbuf.c index 406e8a9b42c3..eb666c02f488 100644 --- a/io_uring/kbuf.c +++ b/io_uring/kbuf.c @@ -509,8 +509,6 @@ int io_provide_buffers_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe return -EOVERFLOW; if (check_add_overflow((unsigned long)p->addr, size, &tmp_check)) return -EOVERFLOW; - - size = (unsigned long)p->len * p->nbufs; if (!access_ok(u64_to_user_ptr(p->addr), size)) return -EFAULT; -- 2.51.0 From c724e801239ffc3714afe65cf6e721ddd04199d0 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Tue, 13 May 2025 18:26:50 +0100 Subject: [PATCH 09/16] io_uring/kbuf: refactor __io_remove_buffers __io_remove_buffers used for two purposes, the first is removing buffers for non ring based lists, which implies that it can be called multiple times for the same list. And the second is for destroying lists, which is not perfectly reentrable for ring based lists. It's confusing, so just have a helper for the legacy pbuf buffer removal, make sure it's not called for ring pbuf, and open code all ring pbuf destruction into io_put_bl(). 
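For contrast, the legacy (non-ring) lists that io_remove_buffers_legacy() operates on are populated and drained by ordinary requests, which is why removal may legitimately run more than once on the same list; a liburing sketch (illustrative only, assumes an initialized ring and a bufs area of 8 * 4096 bytes):

    #include <liburing.h>

    static void provide_then_remove(struct io_uring *ring, void *bufs)
    {
            struct io_uring_sqe *sqe;

            /* add 8 buffers of 4096 bytes each to legacy buffer group 1 */
            sqe = io_uring_get_sqe(ring);
            io_uring_prep_provide_buffers(sqe, bufs, 4096, 8, 1, 0);
            io_uring_submit(ring);

            /* later: removal is just another request against the same group */
            sqe = io_uring_get_sqe(ring);
            io_uring_prep_remove_buffers(sqe, 8, 1);
            io_uring_submit(ring);
    }
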
Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/0ae416b099d311ad23f285cea02f2c94c8ae9a6c.1747150490.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/kbuf.c | 38 +++++++++++++------------------------- 1 file changed, 13 insertions(+), 25 deletions(-) diff --git a/io_uring/kbuf.c b/io_uring/kbuf.c index eb666c02f488..df8aeb42e910 100644 --- a/io_uring/kbuf.c +++ b/io_uring/kbuf.c @@ -376,45 +376,33 @@ unsigned int __io_put_kbufs(struct io_kiocb *req, int len, int nbufs) return ret; } -static int __io_remove_buffers(struct io_ring_ctx *ctx, - struct io_buffer_list *bl, unsigned nbufs) +static int io_remove_buffers_legacy(struct io_ring_ctx *ctx, + struct io_buffer_list *bl, + unsigned long nbufs) { - unsigned i = 0; - - /* shouldn't happen */ - if (!nbufs) - return 0; - - if (bl->flags & IOBL_BUF_RING) { - i = bl->buf_ring->tail - bl->head; - io_free_region(ctx, &bl->region); - /* make sure it's seen as empty */ - INIT_LIST_HEAD(&bl->buf_list); - bl->flags &= ~IOBL_BUF_RING; - return i; - } + unsigned long i = 0; + struct io_buffer *nxt; /* protects io_buffers_cache */ lockdep_assert_held(&ctx->uring_lock); + WARN_ON_ONCE(bl->flags & IOBL_BUF_RING); - while (!list_empty(&bl->buf_list)) { - struct io_buffer *nxt; - + for (i = 0; i < nbufs && !list_empty(&bl->buf_list); i++) { nxt = list_first_entry(&bl->buf_list, struct io_buffer, list); list_del(&nxt->list); kfree(nxt); - - if (++i == nbufs) - return i; cond_resched(); } - return i; } static void io_put_bl(struct io_ring_ctx *ctx, struct io_buffer_list *bl) { - __io_remove_buffers(ctx, bl, -1U); + if (bl->flags & IOBL_BUF_RING) + io_free_region(ctx, &bl->region); + else + io_remove_buffers_legacy(ctx, bl, -1U); + kfree(bl); } @@ -477,7 +465,7 @@ int io_remove_buffers(struct io_kiocb *req, unsigned int issue_flags) ret = -EINVAL; /* can't use provide/remove buffers command on mapped buffers */ if (!(bl->flags & IOBL_BUF_RING)) - ret = __io_remove_buffers(ctx, bl, p->nbufs); + ret = io_remove_buffers_legacy(ctx, bl, p->nbufs); } io_ring_submit_unlock(ctx, issue_flags); if (ret < 0) -- 2.51.0 From 2b61bb1d9aa601ec393054a61be0a707a5bea928 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Tue, 13 May 2025 18:26:51 +0100 Subject: [PATCH 10/16] io_uring/kbuf: unify legacy buf provision and removal Combine IORING_OP_PROVIDE_BUFFERS and IORING_OP_REMOVE_BUFFERS ->issue(), so that we can deduplicate ring locking and list lookups. This way we further reduce code for legacy provided buffers. Locking is also separated from buffer related handling, which makes it a bit simpler with label jumps. 
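Condensed from the diff below, the final shape is a single ->issue() handler that takes the ring lock and resolves the list once, then branches on the opcode:

    io_ring_submit_lock(ctx, issue_flags);
    bl = io_buffer_get_list(ctx, p->bgid);
    ret = __io_manage_buffers_legacy(req, bl);  /* PROVIDE or REMOVE */
    io_ring_submit_unlock(ctx, issue_flags);
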
Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/f61af131622ad4337c2fb9f7c453d5b0102c7b90.1747150490.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/kbuf.c | 73 +++++++++++++++++++----------------------------- io_uring/kbuf.h | 4 +-- io_uring/opdef.c | 4 +-- 3 files changed, 31 insertions(+), 50 deletions(-) diff --git a/io_uring/kbuf.c b/io_uring/kbuf.c index df8aeb42e910..823e7eb15fb2 100644 --- a/io_uring/kbuf.c +++ b/io_uring/kbuf.c @@ -450,30 +450,6 @@ int io_remove_buffers_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) return 0; } -int io_remove_buffers(struct io_kiocb *req, unsigned int issue_flags) -{ - struct io_provide_buf *p = io_kiocb_to_cmd(req, struct io_provide_buf); - struct io_ring_ctx *ctx = req->ctx; - struct io_buffer_list *bl; - int ret = 0; - - io_ring_submit_lock(ctx, issue_flags); - - ret = -ENOENT; - bl = io_buffer_get_list(ctx, p->bgid); - if (bl) { - ret = -EINVAL; - /* can't use provide/remove buffers command on mapped buffers */ - if (!(bl->flags & IOBL_BUF_RING)) - ret = io_remove_buffers_legacy(ctx, bl, p->nbufs); - } - io_ring_submit_unlock(ctx, issue_flags); - if (ret < 0) - req_set_fail(req); - io_req_set_res(req, ret, 0); - return IOU_OK; -} - int io_provide_buffers_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { unsigned long size, tmp_check; @@ -535,37 +511,44 @@ static int io_add_buffers(struct io_ring_ctx *ctx, struct io_provide_buf *pbuf, return i ? 0 : -ENOMEM; } -int io_provide_buffers(struct io_kiocb *req, unsigned int issue_flags) +static int __io_manage_buffers_legacy(struct io_kiocb *req, + struct io_buffer_list *bl) { struct io_provide_buf *p = io_kiocb_to_cmd(req, struct io_provide_buf); - struct io_ring_ctx *ctx = req->ctx; - struct io_buffer_list *bl; - int ret = 0; - - io_ring_submit_lock(ctx, issue_flags); + int ret; - bl = io_buffer_get_list(ctx, p->bgid); - if (unlikely(!bl)) { + if (!bl) { + if (req->opcode != IORING_OP_PROVIDE_BUFFERS) + return -ENOENT; bl = kzalloc(sizeof(*bl), GFP_KERNEL_ACCOUNT); - if (!bl) { - ret = -ENOMEM; - goto err; - } + if (!bl) + return -ENOMEM; + INIT_LIST_HEAD(&bl->buf_list); - ret = io_buffer_add_list(ctx, bl, p->bgid); + ret = io_buffer_add_list(req->ctx, bl, p->bgid); if (ret) { kfree(bl); - goto err; + return ret; } } - /* can't add buffers via this command for a mapped buffer ring */ - if (bl->flags & IOBL_BUF_RING) { - ret = -EINVAL; - goto err; - } + /* can't use provide/remove buffers command on mapped buffers */ + if (bl->flags & IOBL_BUF_RING) + return -EINVAL; + if (req->opcode == IORING_OP_PROVIDE_BUFFERS) + return io_add_buffers(req->ctx, p, bl); + return io_remove_buffers_legacy(req->ctx, bl, p->nbufs); +} + +int io_manage_buffers_legacy(struct io_kiocb *req, unsigned int issue_flags) +{ + struct io_provide_buf *p = io_kiocb_to_cmd(req, struct io_provide_buf); + struct io_ring_ctx *ctx = req->ctx; + struct io_buffer_list *bl; + int ret; - ret = io_add_buffers(ctx, p, bl); -err: + io_ring_submit_lock(ctx, issue_flags); + bl = io_buffer_get_list(ctx, p->bgid); + ret = __io_manage_buffers_legacy(req, bl); io_ring_submit_unlock(ctx, issue_flags); if (ret < 0) diff --git a/io_uring/kbuf.h b/io_uring/kbuf.h index 0798a732e6cb..4d2c209d1a41 100644 --- a/io_uring/kbuf.h +++ b/io_uring/kbuf.h @@ -66,10 +66,8 @@ int io_buffers_peek(struct io_kiocb *req, struct buf_sel_arg *arg); void io_destroy_buffers(struct io_ring_ctx *ctx); int io_remove_buffers_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); -int io_remove_buffers(struct io_kiocb 
*req, unsigned int issue_flags); - int io_provide_buffers_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); -int io_provide_buffers(struct io_kiocb *req, unsigned int issue_flags); +int io_manage_buffers_legacy(struct io_kiocb *req, unsigned int issue_flags); int io_register_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg); int io_unregister_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg); diff --git a/io_uring/opdef.c b/io_uring/opdef.c index db36433c2294..6e0882b051f9 100644 --- a/io_uring/opdef.c +++ b/io_uring/opdef.c @@ -333,13 +333,13 @@ const struct io_issue_def io_issue_defs[] = { .audit_skip = 1, .iopoll = 1, .prep = io_provide_buffers_prep, - .issue = io_provide_buffers, + .issue = io_manage_buffers_legacy, }, [IORING_OP_REMOVE_BUFFERS] = { .audit_skip = 1, .iopoll = 1, .prep = io_remove_buffers_prep, - .issue = io_remove_buffers, + .issue = io_manage_buffers_legacy, }, [IORING_OP_TEE] = { .needs_file = 1, -- 2.51.0 From d871198ee431d90f5308d53998c1ba1d5db5619a Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 13 May 2025 15:02:23 -0600 Subject: [PATCH 11/16] io_uring/fdinfo: grab ctx->uring_lock around io_uring_show_fdinfo() Not everything requires locking in there, which is why the 'has_lock' variable exists. But enough does that it's a bit unwieldy to manage. Wrap the whole thing in a ->uring_lock trylock, and just return with no output if we fail to grab it. The existing trylock() will already have greatly diminished utility/output for the failure case. This fixes an issue with reading the SQE fields, if the ring is being actively resized at the same time. Reported-by: Jann Horn Fixes: 79cfe9e59c2a ("io_uring/register: add IORING_REGISTER_RESIZE_RINGS") Signed-off-by: Jens Axboe --- io_uring/fdinfo.c | 48 ++++++++++++++++++++++++----------------------- 1 file changed, 25 insertions(+), 23 deletions(-) diff --git a/io_uring/fdinfo.c b/io_uring/fdinfo.c index 9414ca6d101c..e0d6a59a89fa 100644 --- a/io_uring/fdinfo.c +++ b/io_uring/fdinfo.c @@ -86,13 +86,8 @@ static inline void napi_show_fdinfo(struct io_ring_ctx *ctx, } #endif -/* - * Caller holds a reference to the file already, we don't need to do - * anything else to get an extra reference. - */ -__cold void io_uring_show_fdinfo(struct seq_file *m, struct file *file) +static void __io_uring_show_fdinfo(struct io_ring_ctx *ctx, struct seq_file *m) { - struct io_ring_ctx *ctx = file->private_data; struct io_overflow_cqe *ocqe; struct io_rings *r = ctx->rings; struct rusage sq_usage; @@ -106,7 +101,6 @@ __cold void io_uring_show_fdinfo(struct seq_file *m, struct file *file) unsigned int sq_entries, cq_entries; int sq_pid = -1, sq_cpu = -1; u64 sq_total_time = 0, sq_work_time = 0; - bool has_lock; unsigned int i; if (ctx->flags & IORING_SETUP_CQE32) @@ -176,15 +170,7 @@ __cold void io_uring_show_fdinfo(struct seq_file *m, struct file *file) seq_printf(m, "\n"); } - /* - * Avoid ABBA deadlock between the seq lock and the io_uring mutex, - * since fdinfo case grabs it in the opposite direction of normal use - * cases. If we fail to get the lock, we just don't iterate any - * structures that could be going away outside the io_uring mutex. 
- */ - has_lock = mutex_trylock(&ctx->uring_lock); - - if (has_lock && (ctx->flags & IORING_SETUP_SQPOLL)) { + if (ctx->flags & IORING_SETUP_SQPOLL) { struct io_sq_data *sq = ctx->sq_data; /* @@ -206,7 +192,7 @@ __cold void io_uring_show_fdinfo(struct seq_file *m, struct file *file) seq_printf(m, "SqTotalTime:\t%llu\n", sq_total_time); seq_printf(m, "SqWorkTime:\t%llu\n", sq_work_time); seq_printf(m, "UserFiles:\t%u\n", ctx->file_table.data.nr); - for (i = 0; has_lock && i < ctx->file_table.data.nr; i++) { + for (i = 0; i < ctx->file_table.data.nr; i++) { struct file *f = NULL; if (ctx->file_table.data.nodes[i]) @@ -218,7 +204,7 @@ __cold void io_uring_show_fdinfo(struct seq_file *m, struct file *file) } } seq_printf(m, "UserBufs:\t%u\n", ctx->buf_table.nr); - for (i = 0; has_lock && i < ctx->buf_table.nr; i++) { + for (i = 0; i < ctx->buf_table.nr; i++) { struct io_mapped_ubuf *buf = NULL; if (ctx->buf_table.nodes[i]) @@ -228,7 +214,7 @@ __cold void io_uring_show_fdinfo(struct seq_file *m, struct file *file) else seq_printf(m, "%5u: \n", i); } - if (has_lock && !xa_empty(&ctx->personalities)) { + if (!xa_empty(&ctx->personalities)) { unsigned long index; const struct cred *cred; @@ -238,7 +224,7 @@ __cold void io_uring_show_fdinfo(struct seq_file *m, struct file *file) } seq_puts(m, "PollList:\n"); - for (i = 0; has_lock && i < (1U << ctx->cancel_table.hash_bits); i++) { + for (i = 0; i < (1U << ctx->cancel_table.hash_bits); i++) { struct io_hash_bucket *hb = &ctx->cancel_table.hbs[i]; struct io_kiocb *req; @@ -247,9 +233,6 @@ __cold void io_uring_show_fdinfo(struct seq_file *m, struct file *file) task_work_pending(req->tctx->task)); } - if (has_lock) - mutex_unlock(&ctx->uring_lock); - seq_puts(m, "CqOverflowList:\n"); spin_lock(&ctx->completion_lock); list_for_each_entry(ocqe, &ctx->cq_overflow_list, list) { @@ -262,4 +245,23 @@ __cold void io_uring_show_fdinfo(struct seq_file *m, struct file *file) spin_unlock(&ctx->completion_lock); napi_show_fdinfo(ctx, m); } + +/* + * Caller holds a reference to the file already, we don't need to do + * anything else to get an extra reference. + */ +__cold void io_uring_show_fdinfo(struct seq_file *m, struct file *file) +{ + struct io_ring_ctx *ctx = file->private_data; + + /* + * Avoid ABBA deadlock between the seq lock and the io_uring mutex, + * since fdinfo case grabs it in the opposite direction of normal use + * cases. + */ + if (mutex_trylock(&ctx->uring_lock)) { + __io_uring_show_fdinfo(ctx, m); + mutex_unlock(&ctx->uring_lock); + } +} #endif -- 2.51.0 From 9a109266278f200ae0b64508273fea3db8af7a9e Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 16 May 2025 12:10:12 -0600 Subject: [PATCH 12/16] io_uring/fdinfo: only compile if CONFIG_PROC_FS is set Rather than wrap fdinfo.c in one big if, handle it on the Makefile side instead. io_uring.c already conditionally sets fops->fdinfo() anyway. 
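The conditional referred to here looks roughly like this (abridged from io_uring.c, member list trimmed):

    static const struct file_operations io_uring_fops = {
            .release        = io_uring_release,
            .mmap           = io_uring_mmap,
            .poll           = io_uring_poll,
    #ifdef CONFIG_PROC_FS
            .show_fdinfo    = io_uring_show_fdinfo,
    #endif
    };

so with CONFIG_PROC_FS unset nothing references io_uring_show_fdinfo(), and fdinfo.o can simply be dropped from the object list.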
Signed-off-by: Jens Axboe --- io_uring/Makefile | 3 ++- io_uring/fdinfo.c | 2 -- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/io_uring/Makefile b/io_uring/Makefile index 11a739927a62..d97c6b51d584 100644 --- a/io_uring/Makefile +++ b/io_uring/Makefile @@ -11,7 +11,7 @@ obj-$(CONFIG_IO_URING) += io_uring.o opdef.o kbuf.o rsrc.o notif.o \ eventfd.o uring_cmd.o openclose.o \ sqpoll.o xattr.o nop.o fs.o splice.o \ sync.o msg_ring.o advise.o openclose.o \ - statx.o timeout.o fdinfo.o cancel.o \ + statx.o timeout.o cancel.o \ waitid.o register.o truncate.o \ memmap.o alloc_cache.o obj-$(CONFIG_IO_URING_ZCRX) += zcrx.o @@ -20,3 +20,4 @@ obj-$(CONFIG_FUTEX) += futex.o obj-$(CONFIG_EPOLL) += epoll.o obj-$(CONFIG_NET_RX_BUSY_POLL) += napi.o obj-$(CONFIG_NET) += net.o cmd_net.o +obj-$(CONFIG_PROC_FS) += fdinfo.o diff --git a/io_uring/fdinfo.c b/io_uring/fdinfo.c index e0d6a59a89fa..b83296eee5f8 100644 --- a/io_uring/fdinfo.c +++ b/io_uring/fdinfo.c @@ -15,7 +15,6 @@ #include "cancel.h" #include "rsrc.h" -#ifdef CONFIG_PROC_FS static __cold int io_uring_show_cred(struct seq_file *m, unsigned int id, const struct cred *cred) { @@ -264,4 +263,3 @@ __cold void io_uring_show_fdinfo(struct seq_file *m, struct file *file) mutex_unlock(&ctx->uring_lock); } } -#endif -- 2.51.0 From 16256648cd0877aed9ede41d5d4ad3c1d65d9b2b Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 16 May 2025 12:33:28 -0600 Subject: [PATCH 13/16] io_uring/fdinfo: get rid of dumping credentials It's a faily obscure feature, and registered credentials would for that mostly be a static thing. Don't bother including code to dump the personalities indices. Signed-off-by: Jens Axboe --- io_uring/fdinfo.c | 38 -------------------------------------- 1 file changed, 38 deletions(-) diff --git a/io_uring/fdinfo.c b/io_uring/fdinfo.c index b83296eee5f8..e9355276ab5d 100644 --- a/io_uring/fdinfo.c +++ b/io_uring/fdinfo.c @@ -15,36 +15,6 @@ #include "cancel.h" #include "rsrc.h" -static __cold int io_uring_show_cred(struct seq_file *m, unsigned int id, - const struct cred *cred) -{ - struct user_namespace *uns = seq_user_ns(m); - struct group_info *gi; - kernel_cap_t cap; - int g; - - seq_printf(m, "%5d\n", id); - seq_put_decimal_ull(m, "\tUid:\t", from_kuid_munged(uns, cred->uid)); - seq_put_decimal_ull(m, "\t\t", from_kuid_munged(uns, cred->euid)); - seq_put_decimal_ull(m, "\t\t", from_kuid_munged(uns, cred->suid)); - seq_put_decimal_ull(m, "\t\t", from_kuid_munged(uns, cred->fsuid)); - seq_put_decimal_ull(m, "\n\tGid:\t", from_kgid_munged(uns, cred->gid)); - seq_put_decimal_ull(m, "\t\t", from_kgid_munged(uns, cred->egid)); - seq_put_decimal_ull(m, "\t\t", from_kgid_munged(uns, cred->sgid)); - seq_put_decimal_ull(m, "\t\t", from_kgid_munged(uns, cred->fsgid)); - seq_puts(m, "\n\tGroups:\t"); - gi = cred->group_info; - for (g = 0; g < gi->ngroups; g++) { - seq_put_decimal_ull(m, g ? 
" " : "", - from_kgid_munged(uns, gi->gid[g])); - } - seq_puts(m, "\n\tCapEff:\t"); - cap = cred->cap_effective; - seq_put_hex_ll(m, NULL, cap.val, 16); - seq_putc(m, '\n'); - return 0; -} - #ifdef CONFIG_NET_RX_BUSY_POLL static __cold void common_tracking_show_fdinfo(struct io_ring_ctx *ctx, struct seq_file *m, @@ -213,14 +183,6 @@ static void __io_uring_show_fdinfo(struct io_ring_ctx *ctx, struct seq_file *m) else seq_printf(m, "%5u: \n", i); } - if (!xa_empty(&ctx->personalities)) { - unsigned long index; - const struct cred *cred; - - seq_printf(m, "Personalities:\n"); - xa_for_each(&ctx->personalities, index, cred) - io_uring_show_cred(m, index, cred); - } seq_puts(m, "PollList:\n"); for (i = 0; i < (1U << ctx->cancel_table.hash_bits); i++) { -- 2.51.0 From 5288b9e28f8a6f464746ddabcf9bf49d1323acfc Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Wed, 14 May 2025 09:07:20 +0100 Subject: [PATCH 14/16] io_uring: open code io_req_cqe_overflow() A preparation patch, just open code io_req_cqe_overflow(). Signed-off-by: Pavel Begunkov Reviewed-by: Caleb Sander Mateos Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 43c285cd2294..e4d6e572eabc 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -739,14 +739,6 @@ static bool io_cqring_event_overflow(struct io_ring_ctx *ctx, u64 user_data, return true; } -static void io_req_cqe_overflow(struct io_kiocb *req) -{ - io_cqring_event_overflow(req->ctx, req->cqe.user_data, - req->cqe.res, req->cqe.flags, - req->big_cqe.extra1, req->big_cqe.extra2); - memset(&req->big_cqe, 0, sizeof(req->big_cqe)); -} - /* * writes to the cq entry need to come after reading head; the * control dependency is enough as we're using WRITE_ONCE to @@ -1435,11 +1427,19 @@ void __io_submit_flush_completions(struct io_ring_ctx *ctx) unlikely(!io_fill_cqe_req(ctx, req))) { if (ctx->lockless_cq) { spin_lock(&ctx->completion_lock); - io_req_cqe_overflow(req); + io_cqring_event_overflow(req->ctx, req->cqe.user_data, + req->cqe.res, req->cqe.flags, + req->big_cqe.extra1, + req->big_cqe.extra2); spin_unlock(&ctx->completion_lock); } else { - io_req_cqe_overflow(req); + io_cqring_event_overflow(req->ctx, req->cqe.user_data, + req->cqe.res, req->cqe.flags, + req->big_cqe.extra1, + req->big_cqe.extra2); } + + memset(&req->big_cqe, 0, sizeof(req->big_cqe)); } } __io_cq_unlock_post(ctx); -- 2.51.0 From 10f466abc404443cb72ab3384f297345ac7415e1 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 16 May 2025 09:39:14 -0600 Subject: [PATCH 15/16] io_uring: split alloc and add of overflow Add a new helper, io_alloc_ocqe(), that simply allocates and fills an overflow entry. Then it can get done outside of the locking section, and hence use more appropriate gfp_t allocation flags rather than always default to GFP_ATOMIC. 
Inspired by a previous series from Pavel: https://lore.kernel.org/io-uring/cover.1747209332.git.asml.silence@gmail.com/ Reviewed-by: Caleb Sander Mateos Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 74 +++++++++++++++++++++++++++------------------ 1 file changed, 45 insertions(+), 29 deletions(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index e4d6e572eabc..c2f9610fa891 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -697,20 +697,11 @@ static __cold void io_uring_drop_tctx_refs(struct task_struct *task) } } -static bool io_cqring_event_overflow(struct io_ring_ctx *ctx, u64 user_data, - s32 res, u32 cflags, u64 extra1, u64 extra2) +static bool io_cqring_add_overflow(struct io_ring_ctx *ctx, + struct io_overflow_cqe *ocqe) { - struct io_overflow_cqe *ocqe; - size_t ocq_size = sizeof(struct io_overflow_cqe); - bool is_cqe32 = (ctx->flags & IORING_SETUP_CQE32); - lockdep_assert_held(&ctx->completion_lock); - if (is_cqe32) - ocq_size += sizeof(struct io_uring_cqe); - - ocqe = kmalloc(ocq_size, GFP_ATOMIC | __GFP_ACCOUNT); - trace_io_uring_cqe_overflow(ctx, user_data, res, cflags, ocqe); if (!ocqe) { struct io_rings *r = ctx->rings; @@ -728,17 +719,35 @@ static bool io_cqring_event_overflow(struct io_ring_ctx *ctx, u64 user_data, atomic_or(IORING_SQ_CQ_OVERFLOW, &ctx->rings->sq_flags); } - ocqe->cqe.user_data = user_data; - ocqe->cqe.res = res; - ocqe->cqe.flags = cflags; - if (is_cqe32) { - ocqe->cqe.big_cqe[0] = extra1; - ocqe->cqe.big_cqe[1] = extra2; - } list_add_tail(&ocqe->list, &ctx->cq_overflow_list); return true; } +static struct io_overflow_cqe *io_alloc_ocqe(struct io_ring_ctx *ctx, + u64 user_data, s32 res, u32 cflags, + u64 extra1, u64 extra2, gfp_t gfp) +{ + struct io_overflow_cqe *ocqe; + size_t ocq_size = sizeof(struct io_overflow_cqe); + bool is_cqe32 = (ctx->flags & IORING_SETUP_CQE32); + + if (is_cqe32) + ocq_size += sizeof(struct io_uring_cqe); + + ocqe = kmalloc(ocq_size, gfp | __GFP_ACCOUNT); + trace_io_uring_cqe_overflow(ctx, user_data, res, cflags, ocqe); + if (ocqe) { + ocqe->cqe.user_data = user_data; + ocqe->cqe.res = res; + ocqe->cqe.flags = cflags; + if (is_cqe32) { + ocqe->cqe.big_cqe[0] = extra1; + ocqe->cqe.big_cqe[1] = extra2; + } + } + return ocqe; +} + /* * writes to the cq entry need to come after reading head; the * control dependency is enough as we're using WRITE_ONCE to @@ -803,8 +812,12 @@ bool io_post_aux_cqe(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags io_cq_lock(ctx); filled = io_fill_cqe_aux(ctx, user_data, res, cflags); - if (!filled) - filled = io_cqring_event_overflow(ctx, user_data, res, cflags, 0, 0); + if (unlikely(!filled)) { + struct io_overflow_cqe *ocqe; + + ocqe = io_alloc_ocqe(ctx, user_data, res, cflags, 0, 0, GFP_ATOMIC); + filled = io_cqring_add_overflow(ctx, ocqe); + } io_cq_unlock_post(ctx); return filled; } @@ -819,8 +832,11 @@ void io_add_aux_cqe(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags) lockdep_assert(ctx->lockless_cq); if (!io_fill_cqe_aux(ctx, user_data, res, cflags)) { + struct io_overflow_cqe *ocqe; + + ocqe = io_alloc_ocqe(ctx, user_data, res, cflags, 0, 0, GFP_KERNEL); spin_lock(&ctx->completion_lock); - io_cqring_event_overflow(ctx, user_data, res, cflags, 0, 0); + io_cqring_add_overflow(ctx, ocqe); spin_unlock(&ctx->completion_lock); } ctx->submit_state.cq_flush = true; @@ -1425,18 +1441,18 @@ void __io_submit_flush_completions(struct io_ring_ctx *ctx) */ if (!(req->flags & (REQ_F_CQE_SKIP | REQ_F_REISSUE)) && unlikely(!io_fill_cqe_req(ctx, req))) { + gfp_t 
gfp = ctx->lockless_cq ? GFP_KERNEL : GFP_ATOMIC; + struct io_overflow_cqe *ocqe; + + ocqe = io_alloc_ocqe(ctx, req->cqe.user_data, req->cqe.res, + req->cqe.flags, req->big_cqe.extra1, + req->big_cqe.extra2, gfp); if (ctx->lockless_cq) { spin_lock(&ctx->completion_lock); - io_cqring_event_overflow(req->ctx, req->cqe.user_data, - req->cqe.res, req->cqe.flags, - req->big_cqe.extra1, - req->big_cqe.extra2); + io_cqring_add_overflow(ctx, ocqe); spin_unlock(&ctx->completion_lock); } else { - io_cqring_event_overflow(req->ctx, req->cqe.user_data, - req->cqe.res, req->cqe.flags, - req->big_cqe.extra1, - req->big_cqe.extra2); + io_cqring_add_overflow(ctx, ocqe); } memset(&req->big_cqe, 0, sizeof(req->big_cqe)); -- 2.51.0 From 072d37b52c914271319b9f7e596ff3cba02e249c Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 16 May 2025 09:45:24 -0600 Subject: [PATCH 16/16] io_uring: make io_alloc_ocqe() take a struct io_cqe pointer The number of arguments to io_alloc_ocqe() is a bit unwieldy. Make it take a struct io_cqe pointer rather than three separate CQE args. One path already has that readily available, add an io_init_cqe() helper for the remaining two. Reviewed-by: Caleb Sander Mateos Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 26 ++++++++++++++++---------- 1 file changed, 16 insertions(+), 10 deletions(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index c2f9610fa891..02d597716467 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -724,8 +724,8 @@ static bool io_cqring_add_overflow(struct io_ring_ctx *ctx, } static struct io_overflow_cqe *io_alloc_ocqe(struct io_ring_ctx *ctx, - u64 user_data, s32 res, u32 cflags, - u64 extra1, u64 extra2, gfp_t gfp) + struct io_cqe *cqe, u64 extra1, + u64 extra2, gfp_t gfp) { struct io_overflow_cqe *ocqe; size_t ocq_size = sizeof(struct io_overflow_cqe); @@ -735,11 +735,11 @@ static struct io_overflow_cqe *io_alloc_ocqe(struct io_ring_ctx *ctx, ocq_size += sizeof(struct io_uring_cqe); ocqe = kmalloc(ocq_size, gfp | __GFP_ACCOUNT); - trace_io_uring_cqe_overflow(ctx, user_data, res, cflags, ocqe); + trace_io_uring_cqe_overflow(ctx, cqe->user_data, cqe->res, cqe->flags, ocqe); if (ocqe) { - ocqe->cqe.user_data = user_data; - ocqe->cqe.res = res; - ocqe->cqe.flags = cflags; + ocqe->cqe.user_data = cqe->user_data; + ocqe->cqe.res = cqe->res; + ocqe->cqe.flags = cqe->flags; if (is_cqe32) { ocqe->cqe.big_cqe[0] = extra1; ocqe->cqe.big_cqe[1] = extra2; @@ -806,6 +806,11 @@ static bool io_fill_cqe_aux(struct io_ring_ctx *ctx, u64 user_data, s32 res, return false; } +static inline struct io_cqe io_init_cqe(u64 user_data, s32 res, u32 cflags) +{ + return (struct io_cqe) { .user_data = user_data, .res = res, .flags = cflags }; +} + bool io_post_aux_cqe(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags) { bool filled; @@ -814,8 +819,9 @@ bool io_post_aux_cqe(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags filled = io_fill_cqe_aux(ctx, user_data, res, cflags); if (unlikely(!filled)) { struct io_overflow_cqe *ocqe; + struct io_cqe cqe = io_init_cqe(user_data, res, cflags); - ocqe = io_alloc_ocqe(ctx, user_data, res, cflags, 0, 0, GFP_ATOMIC); + ocqe = io_alloc_ocqe(ctx, &cqe, 0, 0, GFP_ATOMIC); filled = io_cqring_add_overflow(ctx, ocqe); } io_cq_unlock_post(ctx); @@ -833,8 +839,9 @@ void io_add_aux_cqe(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags) if (!io_fill_cqe_aux(ctx, user_data, res, cflags)) { struct io_overflow_cqe *ocqe; + struct io_cqe cqe = io_init_cqe(user_data, res, cflags); - ocqe = 
io_alloc_ocqe(ctx, user_data, res, cflags, 0, 0, GFP_KERNEL); + ocqe = io_alloc_ocqe(ctx, &cqe, 0, 0, GFP_KERNEL); spin_lock(&ctx->completion_lock); io_cqring_add_overflow(ctx, ocqe); spin_unlock(&ctx->completion_lock); @@ -1444,8 +1451,7 @@ void __io_submit_flush_completions(struct io_ring_ctx *ctx) gfp_t gfp = ctx->lockless_cq ? GFP_KERNEL : GFP_ATOMIC; struct io_overflow_cqe *ocqe; - ocqe = io_alloc_ocqe(ctx, req->cqe.user_data, req->cqe.res, - req->cqe.flags, req->big_cqe.extra1, + ocqe = io_alloc_ocqe(ctx, &req->cqe, req->big_cqe.extra1, req->big_cqe.extra2, gfp); if (ctx->lockless_cq) { spin_lock(&ctx->completion_lock); -- 2.51.0
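After this change, a caller that does not already carry a struct io_cqe builds one on the stack, as in the io_post_aux_cqe() hunk above; a condensed sketch of the call pattern (names as introduced by this patch):

    struct io_cqe cqe = io_init_cqe(user_data, res, cflags);
    struct io_overflow_cqe *ocqe;

    ocqe = io_alloc_ocqe(ctx, &cqe, 0, 0, GFP_ATOMIC);
    filled = io_cqring_add_overflow(ctx, ocqe);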