From 10f466abc404443cb72ab3384f297345ac7415e1 Mon Sep 17 00:00:00 2001
From: Jens Axboe
Date: Fri, 16 May 2025 09:39:14 -0600
Subject: [PATCH] io_uring: split alloc and add of overflow

Add a new helper, io_alloc_ocqe(), that simply allocates and fills an
overflow entry. The allocation can then be done outside of the locking
section, and hence use more appropriate gfp_t allocation flags rather
than always defaulting to GFP_ATOMIC.

Inspired by a previous series from Pavel:

https://lore.kernel.org/io-uring/cover.1747209332.git.asml.silence@gmail.com/

Reviewed-by: Caleb Sander Mateos
Signed-off-by: Jens Axboe
---
 io_uring/io_uring.c | 74 +++++++++++++++++++++++++++------------------
 1 file changed, 45 insertions(+), 29 deletions(-)

diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index e4d6e572eabc..c2f9610fa891 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -697,20 +697,11 @@ static __cold void io_uring_drop_tctx_refs(struct task_struct *task)
 	}
 }
 
-static bool io_cqring_event_overflow(struct io_ring_ctx *ctx, u64 user_data,
-				     s32 res, u32 cflags, u64 extra1, u64 extra2)
+static bool io_cqring_add_overflow(struct io_ring_ctx *ctx,
+				   struct io_overflow_cqe *ocqe)
 {
-	struct io_overflow_cqe *ocqe;
-	size_t ocq_size = sizeof(struct io_overflow_cqe);
-	bool is_cqe32 = (ctx->flags & IORING_SETUP_CQE32);
-
 	lockdep_assert_held(&ctx->completion_lock);
 
-	if (is_cqe32)
-		ocq_size += sizeof(struct io_uring_cqe);
-
-	ocqe = kmalloc(ocq_size, GFP_ATOMIC | __GFP_ACCOUNT);
-	trace_io_uring_cqe_overflow(ctx, user_data, res, cflags, ocqe);
 	if (!ocqe) {
 		struct io_rings *r = ctx->rings;
 
@@ -728,17 +719,35 @@ static bool io_cqring_event_overflow(struct io_ring_ctx *ctx, u64 user_data,
 		atomic_or(IORING_SQ_CQ_OVERFLOW, &ctx->rings->sq_flags);
 
 	}
-	ocqe->cqe.user_data = user_data;
-	ocqe->cqe.res = res;
-	ocqe->cqe.flags = cflags;
-	if (is_cqe32) {
-		ocqe->cqe.big_cqe[0] = extra1;
-		ocqe->cqe.big_cqe[1] = extra2;
-	}
 	list_add_tail(&ocqe->list, &ctx->cq_overflow_list);
 	return true;
 }
 
+static struct io_overflow_cqe *io_alloc_ocqe(struct io_ring_ctx *ctx,
+					     u64 user_data, s32 res, u32 cflags,
+					     u64 extra1, u64 extra2, gfp_t gfp)
+{
+	struct io_overflow_cqe *ocqe;
+	size_t ocq_size = sizeof(struct io_overflow_cqe);
+	bool is_cqe32 = (ctx->flags & IORING_SETUP_CQE32);
+
+	if (is_cqe32)
+		ocq_size += sizeof(struct io_uring_cqe);
+
+	ocqe = kmalloc(ocq_size, gfp | __GFP_ACCOUNT);
+	trace_io_uring_cqe_overflow(ctx, user_data, res, cflags, ocqe);
+	if (ocqe) {
+		ocqe->cqe.user_data = user_data;
+		ocqe->cqe.res = res;
+		ocqe->cqe.flags = cflags;
+		if (is_cqe32) {
+			ocqe->cqe.big_cqe[0] = extra1;
+			ocqe->cqe.big_cqe[1] = extra2;
+		}
+	}
+	return ocqe;
+}
+
 /*
  * writes to the cq entry need to come after reading head; the
  * control dependency is enough as we're using WRITE_ONCE to
@@ -803,8 +812,12 @@ bool io_post_aux_cqe(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags
 
 	io_cq_lock(ctx);
 	filled = io_fill_cqe_aux(ctx, user_data, res, cflags);
-	if (!filled)
-		filled = io_cqring_event_overflow(ctx, user_data, res, cflags, 0, 0);
+	if (unlikely(!filled)) {
+		struct io_overflow_cqe *ocqe;
+
+		ocqe = io_alloc_ocqe(ctx, user_data, res, cflags, 0, 0, GFP_ATOMIC);
+		filled = io_cqring_add_overflow(ctx, ocqe);
+	}
 	io_cq_unlock_post(ctx);
 	return filled;
 }
@@ -819,8 +832,11 @@ void io_add_aux_cqe(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags)
 	lockdep_assert(ctx->lockless_cq);
 
 	if (!io_fill_cqe_aux(ctx, user_data, res, cflags)) {
+		struct io_overflow_cqe *ocqe;
+
+		ocqe = io_alloc_ocqe(ctx, user_data, res, cflags, 0, 0, GFP_KERNEL);
 		spin_lock(&ctx->completion_lock);
-		io_cqring_event_overflow(ctx, user_data, res, cflags, 0, 0);
+		io_cqring_add_overflow(ctx, ocqe);
 		spin_unlock(&ctx->completion_lock);
 	}
 	ctx->submit_state.cq_flush = true;
@@ -1425,18 +1441,18 @@ void __io_submit_flush_completions(struct io_ring_ctx *ctx)
 		 */
 		if (!(req->flags & (REQ_F_CQE_SKIP | REQ_F_REISSUE)) &&
 		    unlikely(!io_fill_cqe_req(ctx, req))) {
+			gfp_t gfp = ctx->lockless_cq ? GFP_KERNEL : GFP_ATOMIC;
+			struct io_overflow_cqe *ocqe;
+
+			ocqe = io_alloc_ocqe(ctx, req->cqe.user_data, req->cqe.res,
+					     req->cqe.flags, req->big_cqe.extra1,
+					     req->big_cqe.extra2, gfp);
 			if (ctx->lockless_cq) {
 				spin_lock(&ctx->completion_lock);
-				io_cqring_event_overflow(req->ctx, req->cqe.user_data,
-							req->cqe.res, req->cqe.flags,
-							req->big_cqe.extra1,
-							req->big_cqe.extra2);
+				io_cqring_add_overflow(ctx, ocqe);
 				spin_unlock(&ctx->completion_lock);
 			} else {
-				io_cqring_event_overflow(req->ctx, req->cqe.user_data,
-							req->cqe.res, req->cqe.flags,
-							req->big_cqe.extra1,
-							req->big_cqe.extra2);
+				io_cqring_add_overflow(ctx, ocqe);
 			}
 
 			memset(&req->big_cqe, 0, sizeof(req->big_cqe));
-- 
2.50.1
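
For readers skimming the hunks above, here is a minimal illustrative
sketch (not part of the patch) of the call pattern the split enables,
modeled on the io_add_aux_cqe() hunk: the overflow entry is allocated
and filled with GFP_KERNEL before ->completion_lock is taken, and only
the list add runs under the lock. The wrapper name
queue_overflow_sketch() is hypothetical; io_alloc_ocqe() and
io_cqring_add_overflow() are the helpers introduced by this patch.

/*
 * Sketch only: assumes a sleepable caller that does not yet hold
 * ->completion_lock, as in io_add_aux_cqe() above.
 */
static void queue_overflow_sketch(struct io_ring_ctx *ctx, u64 user_data,
				  s32 res, u32 cflags)
{
	struct io_overflow_cqe *ocqe;

	/* allocate and fill the overflow entry outside the lock */
	ocqe = io_alloc_ocqe(ctx, user_data, res, cflags, 0, 0, GFP_KERNEL);

	/* a NULL ocqe is handled (accounted and dropped) by the add helper */
	spin_lock(&ctx->completion_lock);
	io_cqring_add_overflow(ctx, ocqe);
	spin_unlock(&ctx->completion_lock);
}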