From 8abf47a8d61c9e8314ae4cfa27e18c8df67c37bc Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Mon, 30 Sep 2024 14:30:39 -0600
Subject: [PATCH 01/16] io_uring/cancel: get rid of init_hash_table() helper

All it does is initialize the lists; just move the INIT_HLIST_HEAD()
into the one caller.

Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 io_uring/cancel.c   | 8 --------
 io_uring/cancel.h   | 1 -
 io_uring/io_uring.c | 4 +++-
 3 files changed, 3 insertions(+), 10 deletions(-)

diff --git a/io_uring/cancel.c b/io_uring/cancel.c
index 755dd5506a5f..cc3475b22ae5 100644
--- a/io_uring/cancel.c
+++ b/io_uring/cancel.c
@@ -232,14 +232,6 @@ done:
 	return IOU_OK;
 }
 
-void init_hash_table(struct io_hash_table *table, unsigned size)
-{
-	unsigned int i;
-
-	for (i = 0; i < size; i++)
-		INIT_HLIST_HEAD(&table->hbs[i].list);
-}
-
 static int __io_sync_cancel(struct io_uring_task *tctx,
 			    struct io_cancel_data *cd, int fd)
 {
diff --git a/io_uring/cancel.h b/io_uring/cancel.h
index b33995e00ba9..bbfea2cd00ea 100644
--- a/io_uring/cancel.h
+++ b/io_uring/cancel.h
@@ -20,7 +20,6 @@ int io_async_cancel(struct io_kiocb *req, unsigned int issue_flags);
 
 int io_try_cancel(struct io_uring_task *tctx, struct io_cancel_data *cd,
 		  unsigned int issue_flags);
-void init_hash_table(struct io_hash_table *table, unsigned size);
 
 int io_sync_cancel(struct io_ring_ctx *ctx, void __user *arg);
 bool io_cancel_req_match(struct io_kiocb *req, struct io_cancel_data *cd);
diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index f4e069cd03a5..6aac72b2958f 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -263,13 +263,15 @@ static int io_alloc_hash_table(struct io_hash_table *table, unsigned bits)
 {
 	unsigned hash_buckets = 1U << bits;
 	size_t hash_size = hash_buckets * sizeof(table->hbs[0]);
+	int i;
 
 	table->hbs = kmalloc(hash_size, GFP_KERNEL);
 	if (!table->hbs)
 		return -ENOMEM;
 	table->hash_bits = bits;
-	init_hash_table(table, hash_buckets);
+	for (i = 0; i < hash_buckets; i++)
+		INIT_HLIST_HEAD(&table->hbs[i].list);
 	return 0;
 }
-- 
2.51.0

From b6b3eb19dd86ecc3f188bd419f12cdfcfbeda5e7 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Mon, 30 Sep 2024 17:11:32 -0600
Subject: [PATCH 02/16] io_uring: move cancel hash tables to kvmalloc/kvfree

Convert to kvmalloc/kvfree() for the hash tables, and while at it, make
it handle low memory situations better.
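As a rough illustration (hypothetical caller; the bucket count is made
up for the example), the fallback is transparent to the caller, which
simply ends up with a smaller table:

	/* e.g. ask for 7 bits worth of buckets (128) */
	ret = io_alloc_hash_table(&ctx->cancel_table, 7);
	/*
	 * Under memory pressure the allocation retries with fewer bits,
	 * so table->hash_bits may come back anywhere down to 1 (two
	 * buckets); only if even that fails is -ENOMEM returned.
	 */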
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 io_uring/io_uring.c | 20 +++++++++++++-------
 1 file changed, 13 insertions(+), 7 deletions(-)

diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index 6aac72b2958f..d7ad4ea5f40b 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -261,13 +261,19 @@ static __cold void io_fallback_req_func(struct work_struct *work)
 
 static int io_alloc_hash_table(struct io_hash_table *table, unsigned bits)
 {
-	unsigned hash_buckets = 1U << bits;
-	size_t hash_size = hash_buckets * sizeof(table->hbs[0]);
+	unsigned int hash_buckets;
 	int i;
 
-	table->hbs = kmalloc(hash_size, GFP_KERNEL);
-	if (!table->hbs)
-		return -ENOMEM;
+	do {
+		hash_buckets = 1U << bits;
+		table->hbs = kvmalloc_array(hash_buckets, sizeof(table->hbs[0]),
+					    GFP_KERNEL_ACCOUNT);
+		if (table->hbs)
+			break;
+		if (bits == 1)
+			return -ENOMEM;
+		bits--;
+	} while (1);
 	table->hash_bits = bits;
 	for (i = 0; i < hash_buckets; i++)
@@ -360,7 +366,7 @@ err:
 	io_alloc_cache_free(&ctx->uring_cache, kfree);
 	io_alloc_cache_free(&ctx->msg_cache, io_msg_cache_free);
 	io_futex_cache_free(ctx);
-	kfree(ctx->cancel_table.hbs);
+	kvfree(ctx->cancel_table.hbs);
 	xa_destroy(&ctx->io_bl_xa);
 	kfree(ctx);
 	return NULL;
@@ -2772,7 +2778,7 @@ static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx)
 	if (ctx->hash_map)
 		io_wq_put_hash(ctx->hash_map);
 	io_napi_free(ctx);
-	kfree(ctx->cancel_table.hbs);
+	kvfree(ctx->cancel_table.hbs);
 	xa_destroy(&ctx->io_bl_xa);
 	kfree(ctx);
 }
-- 
2.51.0

From 1e6e7602cc9fdeaf7e2593755409e8d50545ed69 Mon Sep 17 00:00:00 2001
From: Pavel Begunkov <asml.silence@gmail.com>
Date: Fri, 18 Oct 2024 17:07:31 +0100
Subject: [PATCH 03/16] io_uring: kill io_llist_xchg

io_llist_xchg is only used to set the list to NULL, which can also be
done with llist_del_all(). Use the latter and kill io_llist_xchg.

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Link: https://lore.kernel.org/r/d6765112680d2e86a58b76166b7513391ff4e5d7.1729264960.git.asml.silence@gmail.com
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 io_uring/io_uring.c | 16 +---------------
 1 file changed, 1 insertion(+), 15 deletions(-)

diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index d7ad4ea5f40b..c0358a8d85d2 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -1081,20 +1081,6 @@ struct llist_node *io_handle_tw_list(struct llist_node *node,
 	return node;
 }
 
-/**
- * io_llist_xchg - swap all entries in a lock-less list
- * @head:	the head of lock-less list to delete all entries
- * @new:	new entry as the head of the list
- *
- * If list is empty, return NULL, otherwise, return the pointer to the first entry.
- * The order of entries returned is from the newest to the oldest added one.
- */
-static inline struct llist_node *io_llist_xchg(struct llist_head *head,
-					       struct llist_node *new)
-{
-	return xchg(&head->first, new);
-}
-
 static __cold void io_fallback_tw(struct io_uring_task *tctx, bool sync)
 {
 	struct llist_node *node = llist_del_all(&tctx->task_list);
@@ -1316,7 +1302,7 @@ again:
 	 * llists are in reverse order, flip it back the right way before
 	 * running the pending items.
	 */
-	node = llist_reverse_order(io_llist_xchg(&ctx->work_llist, NULL));
+	node = llist_reverse_order(llist_del_all(&ctx->work_llist));
 	while (node) {
 		struct llist_node *next = node->next;
 		struct io_kiocb *req = container_of(node, struct io_kiocb,
-- 
2.51.0

From 9b296c625ac1d2ca9b129743c3f886bf7a0f471d Mon Sep 17 00:00:00 2001
From: Pavel Begunkov <asml.silence@gmail.com>
Date: Fri, 18 Oct 2024 17:07:59 +0100
Subject: [PATCH 04/16] io_uring: static_key for !IORING_SETUP_NO_SQARRAY

IORING_SETUP_NO_SQARRAY should be preferred and used by default by
liburing, so optimise the flag checking in io_get_sqe() with a static
key.

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Link: https://lore.kernel.org/r/c164a48542fbb080115e2377ecf160c758562742.1729264988.git.asml.silence@gmail.com
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 io_uring/io_uring.c | 12 +++++++++++-
 1 file changed, 11 insertions(+), 1 deletion(-)

diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index c0358a8d85d2..fa9d31034c62 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -70,6 +70,7 @@
 #include <linux/io_uring/cmd.h>
 #include <linux/audit.h>
 #include <linux/security.h>
+#include <linux/jump_label.h>
 #include <asm/shmparam.h>
 
 #define CREATE_TRACE_POINTS
@@ -149,6 +150,8 @@ static bool io_uring_try_cancel_requests(struct io_ring_ctx *ctx,
 
 static void io_queue_sqe(struct io_kiocb *req);
 
+static __read_mostly DEFINE_STATIC_KEY_FALSE(io_key_has_sqarray);
+
 struct kmem_cache *req_cachep;
 
 static struct workqueue_struct *iou_wq __ro_after_init;
@@ -2254,7 +2257,8 @@ static bool io_get_sqe(struct io_ring_ctx *ctx, const struct io_uring_sqe **sqe)
 	unsigned mask = ctx->sq_entries - 1;
 	unsigned head = ctx->cached_sq_head++ & mask;
 
-	if (!(ctx->flags & IORING_SETUP_NO_SQARRAY)) {
+	if (static_branch_unlikely(&io_key_has_sqarray) &&
+	    (!(ctx->flags & IORING_SETUP_NO_SQARRAY))) {
 		head = READ_ONCE(ctx->sq_array[head]);
 		if (unlikely(head >= ctx->sq_entries)) {
 			/* drop invalid entries */
@@ -2762,6 +2766,9 @@ static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx)
 	}
 	io_rings_free(ctx);
 
+	if (!(ctx->flags & IORING_SETUP_NO_SQARRAY))
+		static_branch_dec(&io_key_has_sqarray);
+
 	percpu_ref_exit(&ctx->refs);
 	free_uid(ctx->user);
 	io_req_caches_free(ctx);
@@ -3549,6 +3556,9 @@ static __cold int io_uring_create(unsigned entries, struct io_uring_params *p,
 	ctx->clockid = CLOCK_MONOTONIC;
 	ctx->clock_offset = 0;
 
+	if (!(ctx->flags & IORING_SETUP_NO_SQARRAY))
+		static_branch_inc(&io_key_has_sqarray);
+
 	if ((ctx->flags & IORING_SETUP_DEFER_TASKRUN) &&
 	    !(ctx->flags & IORING_SETUP_IOPOLL) &&
 	    !(ctx->flags & IORING_SETUP_SQPOLL))
-- 
2.51.0

From 2946f08ae9ed650b94e0ffebcdfdda8de76bd926 Mon Sep 17 00:00:00 2001
From: Pavel Begunkov <asml.silence@gmail.com>
Date: Fri, 18 Oct 2024 17:14:00 +0100
Subject: [PATCH 05/16] io_uring: clean up cqe trace points

We have too many helpers posting CQEs. Instead of tracing completion
events before filling in a CQE, and thus having to pass all the data,
set the CQE first, pass it to the tracing helper, and let it extract
everything it needs.
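Sketched out, a posting site then looks like this (illustrative only;
the hunks below are authoritative):

	struct io_uring_cqe *cqe;

	if (io_get_cqe(ctx, &cqe)) {
		WRITE_ONCE(cqe->user_data, user_data);
		WRITE_ONCE(cqe->res, res);
		WRITE_ONCE(cqe->flags, cflags);
		/* trace last, reading everything back from the CQE */
		trace_io_uring_complete(ctx, NULL, cqe);
	}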
Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Link: https://lore.kernel.org/r/b83c1ca9ee5aed2df0f3bb743bf5ed699cce4c86.1729267437.git.asml.silence@gmail.com
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/io_uring_types.h  |  5 +++++
 include/trace/events/io_uring.h | 24 +++++++++---------------
 io_uring/io_uring.c             |  4 ++--
 io_uring/io_uring.h             |  7 +++----
 4 files changed, 19 insertions(+), 21 deletions(-)

diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h
index 9c7e1d3f06e5..391087144666 100644
--- a/include/linux/io_uring_types.h
+++ b/include/linux/io_uring_types.h
@@ -662,4 +662,9 @@ struct io_overflow_cqe {
 	struct io_uring_cqe cqe;
 };
 
+static inline bool io_ctx_cqe32(struct io_ring_ctx *ctx)
+{
+	return ctx->flags & IORING_SETUP_CQE32;
+}
+
 #endif
diff --git a/include/trace/events/io_uring.h b/include/trace/events/io_uring.h
index 412c9c210a32..fb81c533b310 100644
--- a/include/trace/events/io_uring.h
+++ b/include/trace/events/io_uring.h
@@ -315,20 +315,14 @@ TRACE_EVENT(io_uring_fail_link,
  * io_uring_complete - called when completing an SQE
  *
  * @ctx:		pointer to a ring context structure
- * @req:		pointer to a submitted request
- * @user_data:		user data associated with the request
- * @res:		result of the request
- * @cflags:		completion flags
- * @extra1:		extra 64-bit data for CQE32
- * @extra2:		extra 64-bit data for CQE32
- *
+ * @req:		(optional) pointer to a submitted request
+ * @cqe:		pointer to the filled in CQE being posted
  */
 TRACE_EVENT(io_uring_complete,
 
-	TP_PROTO(void *ctx, void *req, u64 user_data, int res, unsigned cflags,
-		 u64 extra1, u64 extra2),
+TP_PROTO(struct io_ring_ctx *ctx, void *req, struct io_uring_cqe *cqe),
 
-	TP_ARGS(ctx, req, user_data, res, cflags, extra1, extra2),
+	TP_ARGS(ctx, req, cqe),
 
 	TP_STRUCT__entry (
 		__field(  void *,	ctx		)
@@ -343,11 +337,11 @@ TRACE_EVENT(io_uring_complete,
 	TP_fast_assign(
 		__entry->ctx		= ctx;
 		__entry->req		= req;
-		__entry->user_data	= user_data;
-		__entry->res		= res;
-		__entry->cflags		= cflags;
-		__entry->extra1		= extra1;
-		__entry->extra2		= extra2;
+		__entry->user_data	= cqe->user_data;
+		__entry->res		= cqe->res;
+		__entry->cflags		= cqe->flags;
+		__entry->extra1		= io_ctx_cqe32(ctx) ? cqe->big_cqe[0] : 0;
+		__entry->extra2		= io_ctx_cqe32(ctx) ? cqe->big_cqe[1] : 0;
 	),
 
 	TP_printk("ring %p, req %p, user_data 0x%llx, result %d, cflags 0x%x "
diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index fa9d31034c62..58b401900b41 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -828,8 +828,6 @@ static bool io_fill_cqe_aux(struct io_ring_ctx *ctx, u64 user_data, s32 res,
 	 * the ring.
	 */
 	if (likely(io_get_cqe(ctx, &cqe))) {
-		trace_io_uring_complete(ctx, NULL, user_data, res, cflags, 0, 0);
-
 		WRITE_ONCE(cqe->user_data, user_data);
 		WRITE_ONCE(cqe->res, res);
 		WRITE_ONCE(cqe->flags, cflags);
@@ -838,6 +836,8 @@ static bool io_fill_cqe_aux(struct io_ring_ctx *ctx, u64 user_data, s32 res,
 			WRITE_ONCE(cqe->big_cqe[0], 0);
 			WRITE_ONCE(cqe->big_cqe[1], 0);
 		}
+
+		trace_io_uring_complete(ctx, NULL, cqe);
 		return true;
 	}
 	return false;
diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h
index 70b6675941ff..9cd9a127e9ed 100644
--- a/io_uring/io_uring.h
+++ b/io_uring/io_uring.h
@@ -189,16 +189,15 @@ static __always_inline bool io_fill_cqe_req(struct io_ring_ctx *ctx,
 	if (unlikely(!io_get_cqe(ctx, &cqe)))
 		return false;
 
-	if (trace_io_uring_complete_enabled())
-		trace_io_uring_complete(req->ctx, req, req->cqe.user_data,
-					req->cqe.res, req->cqe.flags,
-					req->big_cqe.extra1, req->big_cqe.extra2);
 
 	memcpy(cqe, &req->cqe, sizeof(*cqe));
 	if (ctx->flags & IORING_SETUP_CQE32) {
 		memcpy(cqe->big_cqe, &req->big_cqe, sizeof(*cqe));
 		memset(&req->big_cqe, 0, sizeof(req->big_cqe));
 	}
+
+	if (trace_io_uring_complete_enabled())
+		trace_io_uring_complete(req->ctx, req, cqe);
 	return true;
 }
-- 
2.51.0

From c919790060230ac2b1824bbf4d3b64eb51f471ff Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Wed, 16 Oct 2024 15:04:55 -0600
Subject: [PATCH 06/16] io_uring/rsrc: don't assign bvec twice in
 io_import_fixed()

iter->bvec is already set to imu->bvec - remove the one dead assignment
and turn the other one into an addition instead.

Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 io_uring/rsrc.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c
index 6f3b6de230bd..ca2ec8a018be 100644
--- a/io_uring/rsrc.c
+++ b/io_uring/rsrc.c
@@ -1127,7 +1127,6 @@ int io_import_fixed(int ddir, struct iov_iter *iter,
 		const struct bio_vec *bvec = imu->bvec;
 
 		if (offset < bvec->bv_len) {
-			iter->bvec = bvec;
 			iter->count -= offset;
 			iter->iov_offset = offset;
 		} else {
@@ -1137,7 +1136,7 @@ int io_import_fixed(int ddir, struct iov_iter *iter,
 
 			offset -= bvec->bv_len;
 			seg_skip = 1 + (offset >> imu->folio_shift);
-			iter->bvec = bvec + seg_skip;
+			iter->bvec += seg_skip;
 			iter->nr_segs -= seg_skip;
 			iter->count -= bvec->bv_len + offset;
 			iter->iov_offset = offset & ((1UL << imu->folio_shift) - 1);
-- 
2.51.0

From 892d3e80e1b9fc09aefdfd4d31f10f3d018863a0 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Wed, 16 Oct 2024 15:48:38 -0600
Subject: [PATCH 07/16] io_uring/uring_cmd: get rid of using req->imu

It's pretty pointless to use io_kiocb as intermediate storage for this,
so split the validity check and the actual usage. The resource node is
assigned upfront at prep time, to prevent it from going away. The
actual import is never called with the ctx->uring_lock held, so grab it
for the import.
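From the driver side (a sketch of a passthrough-style caller; not part
of this patch), the exported API is unchanged, only the internal lookup
moves:

	struct iov_iter iter;
	int ret;

	/* resolves the registered buffer pinned at prep time */
	ret = io_uring_cmd_import_fixed(ubuf, len, WRITE, &iter, ioucmd);
	if (ret)
		return ret;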
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 io_uring/uring_cmd.c | 23 ++++++++++++++++++-----
 1 file changed, 18 insertions(+), 5 deletions(-)

diff --git a/io_uring/uring_cmd.c b/io_uring/uring_cmd.c
index 39c3c816ec78..58d0b817d6ea 100644
--- a/io_uring/uring_cmd.c
+++ b/io_uring/uring_cmd.c
@@ -211,11 +211,15 @@ int io_uring_cmd_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 		struct io_ring_ctx *ctx = req->ctx;
 		u16 index;
 
-		req->buf_index = READ_ONCE(sqe->buf_index);
-		if (unlikely(req->buf_index >= ctx->nr_user_bufs))
+		index = READ_ONCE(sqe->buf_index);
+		if (unlikely(index >= ctx->nr_user_bufs))
 			return -EFAULT;
-		index = array_index_nospec(req->buf_index, ctx->nr_user_bufs);
-		req->imu = ctx->user_bufs[index];
+		req->buf_index = array_index_nospec(index, ctx->nr_user_bufs);
+		/*
+		 * Pin the node upfront, prior to io_uring_cmd_import_fixed()
+		 * being called. This prevents destruction of the mapped buffer
+		 * we'll need at actual import time.
+		 */
 		io_req_set_rsrc_node(req, ctx, 0);
 	}
 	ioucmd->cmd_op = READ_ONCE(sqe->cmd_op);
@@ -272,8 +276,17 @@ int io_uring_cmd_import_fixed(u64 ubuf, unsigned long len, int rw,
 			      struct iov_iter *iter, void *ioucmd)
 {
 	struct io_kiocb *req = cmd_to_io_kiocb(ioucmd);
+	struct io_ring_ctx *ctx = req->ctx;
+
+	/* Must have had rsrc_node assigned at prep time */
+	if (req->rsrc_node) {
+		struct io_mapped_ubuf *imu;
+
+		imu = READ_ONCE(ctx->user_bufs[req->buf_index]);
+		return io_import_fixed(rw, iter, imu, ubuf, len);
+	}
 
-	return io_import_fixed(rw, iter, req->imu, ubuf, len);
+	return -EFAULT;
 }
 EXPORT_SYMBOL_GPL(io_uring_cmd_import_fixed);
-- 
2.51.0

From 003f82b58c99146dfb0c9ce1ee7ed59bc572959b Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Wed, 16 Oct 2024 15:49:49 -0600
Subject: [PATCH 08/16] io_uring/rw: get rid of using req->imu

It's assigned in the same function in which it's used, so get rid of
it. A local variable will do just fine.

Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 io_uring/rw.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/io_uring/rw.c b/io_uring/rw.c
index 354c4e175654..d8b9e7a712f6 100644
--- a/io_uring/rw.c
+++ b/io_uring/rw.c
@@ -330,6 +330,7 @@ static int io_prep_rw_fixed(struct io_kiocb *req, const struct io_uring_sqe *sqe
 {
 	struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw);
 	struct io_ring_ctx *ctx = req->ctx;
+	struct io_mapped_ubuf *imu;
 	struct io_async_rw *io;
 	u16 index;
 	int ret;
@@ -341,11 +342,11 @@ static int io_prep_rw_fixed(struct io_kiocb *req, const struct io_uring_sqe *sqe
 	if (unlikely(req->buf_index >= ctx->nr_user_bufs))
 		return -EFAULT;
 	index = array_index_nospec(req->buf_index, ctx->nr_user_bufs);
-	req->imu = ctx->user_bufs[index];
+	imu = ctx->user_bufs[index];
 	io_req_set_rsrc_node(req, ctx, 0);
 
 	io = req->async_data;
-	ret = io_import_fixed(ddir, &io->iter, req->imu, rw->addr, rw->len);
+	ret = io_import_fixed(ddir, &io->iter, imu, rw->addr, rw->len);
 	iov_iter_save_state(&io->iter, &io->iter_state);
 	return ret;
 }
-- 
2.51.0

From 1caa00d6b61651e04c04c2b50b3e149f24c6764d Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Wed, 23 Oct 2024 07:14:22 -0600
Subject: [PATCH 09/16] io_uring: remove 'issue_flags' argument for
 io_req_set_rsrc_node()

All callers already hold the ring lock and hence are passing '0';
remove the argument and the conditional locking that it controlled.
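Since the helper now hard-assumes the ring lock, a lockdep annotation
would document the contract; a hypothetical variant (not part of this
patch):

	static inline void io_req_set_rsrc_node(struct io_kiocb *req,
						struct io_ring_ctx *ctx)
	{
		lockdep_assert_held(&ctx->uring_lock);
		if (!req->rsrc_node)
			__io_req_set_rsrc_node(req, ctx);
	}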
Suggested-by: Pavel Begunkov <asml.silence@gmail.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 io_uring/net.c       | 2 +-
 io_uring/rsrc.h      | 8 ++------
 io_uring/rw.c        | 2 +-
 io_uring/uring_cmd.c | 2 +-
 4 files changed, 5 insertions(+), 9 deletions(-)

diff --git a/io_uring/net.c b/io_uring/net.c
index 18507658a921..fb1f2c37f7d1 100644
--- a/io_uring/net.c
+++ b/io_uring/net.c
@@ -1261,7 +1261,7 @@ int io_send_zc_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 			return -EFAULT;
 		idx = array_index_nospec(idx, ctx->nr_user_bufs);
 		req->imu = READ_ONCE(ctx->user_bufs[idx]);
-		io_req_set_rsrc_node(notif, ctx, 0);
+		io_req_set_rsrc_node(notif, ctx);
 	}
 
 	if (req->opcode == IORING_OP_SEND_ZC) {
diff --git a/io_uring/rsrc.h b/io_uring/rsrc.h
index 8ed588036210..c50d4be4aa6d 100644
--- a/io_uring/rsrc.h
+++ b/io_uring/rsrc.h
@@ -107,14 +107,10 @@ static inline void __io_req_set_rsrc_node(struct io_kiocb *req,
 }
 
 static inline void io_req_set_rsrc_node(struct io_kiocb *req,
-					struct io_ring_ctx *ctx,
-					unsigned int issue_flags)
+					struct io_ring_ctx *ctx)
 {
-	if (!req->rsrc_node) {
-		io_ring_submit_lock(ctx, issue_flags);
+	if (!req->rsrc_node)
 		__io_req_set_rsrc_node(req, ctx);
-		io_ring_submit_unlock(ctx, issue_flags);
-	}
 }
 
 static inline u64 *io_get_tag_slot(struct io_rsrc_data *data, unsigned int idx)
diff --git a/io_uring/rw.c b/io_uring/rw.c
index d8b9e7a712f6..8080ffd6d571 100644
--- a/io_uring/rw.c
+++ b/io_uring/rw.c
@@ -343,7 +343,7 @@ static int io_prep_rw_fixed(struct io_kiocb *req, const struct io_uring_sqe *sqe
 		return -EFAULT;
 	index = array_index_nospec(req->buf_index, ctx->nr_user_bufs);
 	imu = ctx->user_bufs[index];
-	io_req_set_rsrc_node(req, ctx, 0);
+	io_req_set_rsrc_node(req, ctx);
 
 	io = req->async_data;
 	ret = io_import_fixed(ddir, &io->iter, imu, rw->addr, rw->len);
diff --git a/io_uring/uring_cmd.c b/io_uring/uring_cmd.c
index 58d0b817d6ea..6994f60d7ec7 100644
--- a/io_uring/uring_cmd.c
+++ b/io_uring/uring_cmd.c
@@ -220,7 +220,7 @@ int io_uring_cmd_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 		 * being called. This prevents destruction of the mapped buffer
 		 * we'll need at actual import time.
 		 */
-		io_req_set_rsrc_node(req, ctx, 0);
+		io_req_set_rsrc_node(req, ctx);
 	}
 	ioucmd->cmd_op = READ_ONCE(sqe->cmd_op);
-- 
2.51.0

From 51c967c6c9ea6c4d480e4778ace5243db22aa27b Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Wed, 16 Oct 2024 07:39:31 -0600
Subject: [PATCH 10/16] io_uring/net: move send zc fixed buffer import to
 issue path

Let's keep it close to the actual import; there's no reason to do this
on the prep side. With that, we can drop one of the branches checking
for whether or not IORING_RECVSEND_FIXED_BUF is set.

As a side-effect, get rid of req->imu usage.
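Userspace is unaffected; a fixed-buffer zerocopy send is still set up
the same way, e.g. with liburing (sketch, assuming a buffer registered
at buf_index):

	struct io_uring_sqe *sqe = io_uring_get_sqe(&ring);

	io_uring_prep_send_zc_fixed(sqe, sockfd, buf, len, 0, 0, buf_index);
	/* the kernel now resolves buf_index at issue time, not at prep */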
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 io_uring/net.c | 36 +++++++++++++++++++++++-------------
 1 file changed, 23 insertions(+), 13 deletions(-)

diff --git a/io_uring/net.c b/io_uring/net.c
index fb1f2c37f7d1..b9e7e496ae85 100644
--- a/io_uring/net.c
+++ b/io_uring/net.c
@@ -76,6 +76,7 @@ struct io_sr_msg {
 	/* initialised and used only by !msg send variants */
 	u16				addr_len;
 	u16				buf_group;
+	u16				buf_index;
 	void __user			*addr;
 	void __user			*msg_control;
 	/* used only for send zerocopy */
 	struct io_kiocb			*notif;
@@ -1254,16 +1255,6 @@ int io_send_zc_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 		}
 	}
 
-	if (zc->flags & IORING_RECVSEND_FIXED_BUF) {
-		unsigned idx = READ_ONCE(sqe->buf_index);
-
-		if (unlikely(idx >= ctx->nr_user_bufs))
-			return -EFAULT;
-		idx = array_index_nospec(idx, ctx->nr_user_bufs);
-		req->imu = READ_ONCE(ctx->user_bufs[idx]);
-		io_req_set_rsrc_node(notif, ctx);
-	}
-
 	if (req->opcode == IORING_OP_SEND_ZC) {
 		if (READ_ONCE(sqe->__pad3[0]))
 			return -EINVAL;
@@ -1279,6 +1270,7 @@ int io_send_zc_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 	zc->buf = u64_to_user_ptr(READ_ONCE(sqe->addr));
 	zc->len = READ_ONCE(sqe->len);
 	zc->msg_flags = READ_ONCE(sqe->msg_flags) | MSG_NOSIGNAL | MSG_ZEROCOPY;
+	zc->buf_index = READ_ONCE(sqe->buf_index);
 	if (zc->msg_flags & MSG_DONTWAIT)
 		req->flags |= REQ_F_NOWAIT;
 
@@ -1339,13 +1331,31 @@ static int io_sg_from_iter(struct sk_buff *skb,
 	return ret;
 }
 
-static int io_send_zc_import(struct io_kiocb *req, struct io_async_msghdr *kmsg)
+static int io_send_zc_import(struct io_kiocb *req, unsigned int issue_flags)
 {
 	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
+	struct io_async_msghdr *kmsg = req->async_data;
 	int ret;
 
 	if (sr->flags & IORING_RECVSEND_FIXED_BUF) {
-		ret = io_import_fixed(ITER_SOURCE, &kmsg->msg.msg_iter, req->imu,
+		struct io_ring_ctx *ctx = req->ctx;
+		struct io_mapped_ubuf *imu;
+		int idx;
+
+		ret = -EFAULT;
+		io_ring_submit_lock(ctx, issue_flags);
+		if (sr->buf_index < ctx->nr_user_bufs) {
+			idx = array_index_nospec(sr->buf_index, ctx->nr_user_bufs);
+			imu = READ_ONCE(ctx->user_bufs[idx]);
+			io_req_set_rsrc_node(sr->notif, ctx);
+			ret = 0;
+		}
+		io_ring_submit_unlock(ctx, issue_flags);
+
+		if (unlikely(ret))
+			return ret;
+
+		ret = io_import_fixed(ITER_SOURCE, &kmsg->msg.msg_iter, imu,
 				      (u64)(uintptr_t)sr->buf, sr->len);
 		if (unlikely(ret))
 			return ret;
@@ -1382,7 +1392,7 @@ int io_send_zc(struct io_kiocb *req, unsigned int issue_flags)
 		return -EAGAIN;
 
 	if (!zc->done_io) {
-		ret = io_send_zc_import(req, kmsg);
+		ret = io_send_zc_import(req, issue_flags);
 		if (unlikely(ret))
 			return ret;
 	}
-- 
2.51.0

From e6d43739d0ee49a39505d696ba6a656f47c2bd39 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Wed, 16 Oct 2024 15:54:06 -0600
Subject: [PATCH 11/16] io_uring: kill 'imu' from struct io_kiocb

It's no longer being used, remove it.
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/io_uring_types.h | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h
index 391087144666..6d3ee71bd832 100644
--- a/include/linux/io_uring_types.h
+++ b/include/linux/io_uring_types.h
@@ -613,9 +613,6 @@ struct io_kiocb {
 	struct task_struct		*task;
 
 	union {
-		/* store used ubuf, so we can prevent reloading */
-		struct io_mapped_ubuf	*imu;
-
 		/* stores selected buf, valid IFF REQ_F_BUFFER_SELECTED is set */
 		struct io_buffer	*kbuf;
-- 
2.51.0

From 93db98f6f1d62c9e58787f6beb62245ddb91f354 Mon Sep 17 00:00:00 2001
From: Pavel Begunkov <asml.silence@gmail.com>
Date: Tue, 22 Oct 2024 15:43:12 +0100
Subject: [PATCH 12/16] io_uring/net: split send and sendmsg prep helpers

A preparation patch splitting io_sendmsg_prep_setup into two separate
helpers for send and sendmsg variants.

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Link: https://lore.kernel.org/r/1a2319471ba040e053b7f1d22f4af510d1118eca.1729607201.git.asml.silence@gmail.com
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 io_uring/net.c | 21 ++++++++++++---------
 1 file changed, 12 insertions(+), 9 deletions(-)

diff --git a/io_uring/net.c b/io_uring/net.c
index b9e7e496ae85..aa256aa46409 100644
--- a/io_uring/net.c
+++ b/io_uring/net.c
@@ -385,16 +385,11 @@ static int io_send_setup(struct io_kiocb *req)
 	return 0;
 }
 
-static int io_sendmsg_prep_setup(struct io_kiocb *req, int is_msg)
+static int io_sendmsg_setup(struct io_kiocb *req)
 {
-	struct io_async_msghdr *kmsg;
+	struct io_async_msghdr *kmsg = req->async_data;
 	int ret;
 
-	kmsg = io_msg_alloc_async(req);
-	if (unlikely(!kmsg))
-		return -ENOMEM;
-	if (!is_msg)
-		return io_send_setup(req);
 	ret = io_sendmsg_copy_hdr(req, kmsg);
 	if (!ret)
 		req->flags |= REQ_F_NEED_CLEANUP;
@@ -440,7 +435,11 @@ int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 	if (req->ctx->compat)
 		sr->msg_flags |= MSG_CMSG_COMPAT;
 #endif
-	return io_sendmsg_prep_setup(req, req->opcode == IORING_OP_SENDMSG);
+	if (unlikely(!io_msg_alloc_async(req)))
+		return -ENOMEM;
+	if (req->opcode != IORING_OP_SENDMSG)
+		return io_send_setup(req);
+	return io_sendmsg_setup(req);
 }
 
 static void io_req_msg_cleanup(struct io_kiocb *req,
@@ -1278,7 +1277,11 @@ int io_send_zc_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 	if (req->ctx->compat)
 		zc->msg_flags |= MSG_CMSG_COMPAT;
 #endif
-	return io_sendmsg_prep_setup(req, req->opcode == IORING_OP_SENDMSG_ZC);
+	if (unlikely(!io_msg_alloc_async(req)))
+		return -ENOMEM;
+	if (req->opcode != IORING_OP_SENDMSG_ZC)
+		return io_send_setup(req);
+	return io_sendmsg_setup(req);
 }
-- 
2.51.0

From ad438d070a3bf2a3ae45b59a885a5d7b0dbbc465 Mon Sep 17 00:00:00 2001
From: Pavel Begunkov <asml.silence@gmail.com>
Date: Tue, 22 Oct 2024 15:43:13 +0100
Subject: [PATCH 13/16] io_uring/net: don't store send address ptr

For non "msg" requests we copy the address at the prep stage, and there
is no need to store the address user pointer long term. Pass the SQE
into io_send_setup(), let it parse it, and remove the struct io_sr_msg
addr and addr_len fields. It saves some space and is also less
confusing.
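The user-visible ABI is untouched: the destination still travels in
sqe->addr2/addr_len, e.g. via liburing (sketch):

	io_uring_prep_send(sqe, sockfd, buf, len, 0);
	io_uring_prep_send_set_addr(sqe, (const struct sockaddr *)&dst,
				    sizeof(dst));
	/* only the kernel-side storage of that pointer goes away */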
Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Link: https://lore.kernel.org/r/db3dce544e17ca9d4b17d2506fbbac1da8a87824.1729607201.git.asml.silence@gmail.com
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 io_uring/net.c | 36 ++++++++++++++++--------------------
 1 file changed, 16 insertions(+), 20 deletions(-)

diff --git a/io_uring/net.c b/io_uring/net.c
index aa256aa46409..ad34c99930be 100644
--- a/io_uring/net.c
+++ b/io_uring/net.c
@@ -74,10 +74,8 @@ struct io_sr_msg {
 	unsigned			nr_multishot_loops;
 	u16				flags;
 	/* initialised and used only by !msg send variants */
-	u16				addr_len;
 	u16				buf_group;
 	u16				buf_index;
-	void __user			*addr;
 	void __user			*msg_control;
 	/* used only for send zerocopy */
 	struct io_kiocb			*notif;
@@ -357,24 +355,31 @@ void io_sendmsg_recvmsg_cleanup(struct io_kiocb *req)
 	io_netmsg_iovec_free(io);
 }
 
-static int io_send_setup(struct io_kiocb *req)
+static int io_send_setup(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 {
 	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
 	struct io_async_msghdr *kmsg = req->async_data;
+	void __user *addr;
+	u16 addr_len;
 	int ret;
 
+	if (READ_ONCE(sqe->__pad3[0]))
+		return -EINVAL;
+
 	kmsg->msg.msg_name = NULL;
 	kmsg->msg.msg_namelen = 0;
 	kmsg->msg.msg_control = NULL;
 	kmsg->msg.msg_controllen = 0;
 	kmsg->msg.msg_ubuf = NULL;
 
-	if (sr->addr) {
-		ret = move_addr_to_kernel(sr->addr, sr->addr_len, &kmsg->addr);
+	addr = u64_to_user_ptr(READ_ONCE(sqe->addr2));
+	addr_len = READ_ONCE(sqe->addr_len);
+	if (addr) {
+		ret = move_addr_to_kernel(addr, addr_len, &kmsg->addr);
 		if (unlikely(ret < 0))
 			return ret;
 		kmsg->msg.msg_name = &kmsg->addr;
-		kmsg->msg.msg_namelen = sr->addr_len;
+		kmsg->msg.msg_namelen = addr_len;
 	}
 	if (!io_do_buffer_select(req)) {
 		ret = import_ubuf(ITER_SOURCE, sr->buf, sr->len,
@@ -404,13 +409,9 @@ int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 
 	sr->done_io = 0;
 
-	if (req->opcode == IORING_OP_SEND) {
-		if (READ_ONCE(sqe->__pad3[0]))
+	if (req->opcode != IORING_OP_SEND) {
+		if (sqe->addr2 || sqe->file_index)
 			return -EINVAL;
-		sr->addr = u64_to_user_ptr(READ_ONCE(sqe->addr2));
-		sr->addr_len = READ_ONCE(sqe->addr_len);
-	} else if (sqe->addr2 || sqe->file_index) {
-		return -EINVAL;
 	}
 
 	sr->umsg = u64_to_user_ptr(READ_ONCE(sqe->addr));
@@ -438,7 +439,7 @@ int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 	if (unlikely(!io_msg_alloc_async(req)))
 		return -ENOMEM;
 	if (req->opcode != IORING_OP_SENDMSG)
-		return io_send_setup(req);
+		return io_send_setup(req, sqe);
 	return io_sendmsg_setup(req);
 }
 
@@ -1254,12 +1255,7 @@ int io_send_zc_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 		}
 	}
 
-	if (req->opcode == IORING_OP_SEND_ZC) {
-		if (READ_ONCE(sqe->__pad3[0]))
-			return -EINVAL;
-		zc->addr = u64_to_user_ptr(READ_ONCE(sqe->addr2));
-		zc->addr_len = READ_ONCE(sqe->addr_len);
-	} else {
+	if (req->opcode != IORING_OP_SEND_ZC) {
 		if (unlikely(sqe->addr2 || sqe->file_index))
 			return -EINVAL;
 		if (unlikely(zc->flags & IORING_RECVSEND_FIXED_BUF))
@@ -1280,7 +1276,7 @@ int io_send_zc_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 	if (unlikely(!io_msg_alloc_async(req)))
 		return -ENOMEM;
 	if (req->opcode != IORING_OP_SENDMSG_ZC)
-		return io_send_setup(req);
+		return io_send_setup(req, sqe);
 	return io_sendmsg_setup(req);
 }
-- 
2.51.0

From 52838787350d4ea8132804940d5308d95ce5e035 Mon Sep 17 00:00:00 2001
From: Pavel Begunkov <asml.silence@gmail.com>
Date: Tue, 22 Oct 2024 15:43:14 +0100
Subject: [PATCH 14/16] io_uring/net: don't alias send user pointer reads

We keep user pointers in a union, which could be a user buffer or a user
pointer to msghdr. What is confusing is that it potentially reads and
assigns sqe->addr as one type but then uses it as another via the
union. What's more, it's not even consistent across the copy and
zerocopy versions.

Make send and sendmsg setup helpers read sqe->addr and treat it as the
right type from the beginning.

The end goal would be to get rid of the use of struct io_sr_msg::umsg
for send requests as we only need it at the prep side.

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Link: https://lore.kernel.org/r/685d788605f5d78af18802fcabf61ba65cfd8002.1729607201.git.asml.silence@gmail.com
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 io_uring/net.c | 13 ++++++++-----
 1 file changed, 8 insertions(+), 5 deletions(-)

diff --git a/io_uring/net.c b/io_uring/net.c
index ad34c99930be..5e7263846243 100644
--- a/io_uring/net.c
+++ b/io_uring/net.c
@@ -363,6 +363,8 @@ static int io_send_setup(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 	u16 addr_len;
 	int ret;
 
+	sr->buf = u64_to_user_ptr(READ_ONCE(sqe->addr));
+
 	if (READ_ONCE(sqe->__pad3[0]))
 		return -EINVAL;
 
@@ -390,11 +392,14 @@ static int io_send_setup(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 	return 0;
 }
 
-static int io_sendmsg_setup(struct io_kiocb *req)
+static int io_sendmsg_setup(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 {
+	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
 	struct io_async_msghdr *kmsg = req->async_data;
 	int ret;
 
+	sr->umsg = u64_to_user_ptr(READ_ONCE(sqe->addr));
+
 	ret = io_sendmsg_copy_hdr(req, kmsg);
 	if (!ret)
 		req->flags |= REQ_F_NEED_CLEANUP;
@@ -414,7 +419,6 @@ int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 		return -EINVAL;
 	}
 
-	sr->umsg = u64_to_user_ptr(READ_ONCE(sqe->addr));
 	sr->len = READ_ONCE(sqe->len);
 	sr->flags = READ_ONCE(sqe->ioprio);
 	if (sr->flags & ~SENDMSG_FLAGS)
@@ -440,7 +444,7 @@ int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 		return -ENOMEM;
 	if (req->opcode != IORING_OP_SENDMSG)
 		return io_send_setup(req, sqe);
-	return io_sendmsg_setup(req);
+	return io_sendmsg_setup(req, sqe);
 }
 
@@ -1262,7 +1266,6 @@ int io_send_zc_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 		return -EINVAL;
 	}
 
-	zc->buf = u64_to_user_ptr(READ_ONCE(sqe->addr));
 	zc->len = READ_ONCE(sqe->len);
 	zc->msg_flags = READ_ONCE(sqe->msg_flags) | MSG_NOSIGNAL | MSG_ZEROCOPY;
 	zc->buf_index = READ_ONCE(sqe->buf_index);
@@ -1277,7 +1280,7 @@ int io_send_zc_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 		return -ENOMEM;
 	if (req->opcode != IORING_OP_SENDMSG_ZC)
 		return io_send_setup(req, sqe);
-	return io_sendmsg_setup(req);
+	return io_sendmsg_setup(req, sqe);
 }
-- 
2.51.0

From 882dec6c39c40c13dd03e418952c4af38d91bb38 Mon Sep 17 00:00:00 2001
From: Pavel Begunkov <asml.silence@gmail.com>
Date: Tue, 22 Oct 2024 15:43:15 +0100
Subject: [PATCH 15/16] io_uring/net: clean up io_msg_copy_hdr

Put sr->umsg into a local variable, so it doesn't repeat "sr->umsg->"
for every field. It looks nicer, and without the patch it likely
compiles into a bunch of umsg memory reads.
Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Link: https://lore.kernel.org/r/26c2f30b491ea7998bfdb5bb290662572a61064d.1729607201.git.asml.silence@gmail.com
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 io_uring/net.c | 15 ++++++++-------
 1 file changed, 8 insertions(+), 7 deletions(-)

diff --git a/io_uring/net.c b/io_uring/net.c
index 5e7263846243..2040195e33ab 100644
--- a/io_uring/net.c
+++ b/io_uring/net.c
@@ -262,6 +262,7 @@ static int io_msg_copy_hdr(struct io_kiocb *req, struct io_async_msghdr *iomsg,
 			   struct user_msghdr *msg, int ddir)
 {
 	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
+	struct user_msghdr __user *umsg = sr->umsg;
 	struct iovec *iov;
 	int ret, nr_segs;
 
@@ -273,16 +274,16 @@ static int io_msg_copy_hdr(struct io_kiocb *req, struct io_async_msghdr *iomsg,
 		nr_segs = 1;
 	}
 
-	if (!user_access_begin(sr->umsg, sizeof(*sr->umsg)))
+	if (!user_access_begin(umsg, sizeof(*umsg)))
 		return -EFAULT;
 	ret = -EFAULT;
-	unsafe_get_user(msg->msg_name, &sr->umsg->msg_name, ua_end);
-	unsafe_get_user(msg->msg_namelen, &sr->umsg->msg_namelen, ua_end);
-	unsafe_get_user(msg->msg_iov, &sr->umsg->msg_iov, ua_end);
-	unsafe_get_user(msg->msg_iovlen, &sr->umsg->msg_iovlen, ua_end);
-	unsafe_get_user(msg->msg_control, &sr->umsg->msg_control, ua_end);
-	unsafe_get_user(msg->msg_controllen, &sr->umsg->msg_controllen, ua_end);
+	unsafe_get_user(msg->msg_name, &umsg->msg_name, ua_end);
+	unsafe_get_user(msg->msg_namelen, &umsg->msg_namelen, ua_end);
+	unsafe_get_user(msg->msg_iov, &umsg->msg_iov, ua_end);
+	unsafe_get_user(msg->msg_iovlen, &umsg->msg_iovlen, ua_end);
+	unsafe_get_user(msg->msg_control, &umsg->msg_control, ua_end);
+	unsafe_get_user(msg->msg_controllen, &umsg->msg_controllen, ua_end);
 	msg->msg_flags = 0;
 
 	if (req->flags & REQ_F_BUFFER_SELECT) {
-- 
2.51.0

From 09d0a8ea7facc8b1581c9bd85c3ea6f5aa62ab7d Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Mon, 21 Oct 2024 13:29:39 -0600
Subject: [PATCH 16/16] io_uring: move max entry definition and ring sizing
 into header

In preparation for needing this somewhere else, move the definitions
for the maximum CQ and SQ ring size into io_uring.h. Make the
rings_size() helper available as well, and have it take just the setup
flags argument rather than the full ring pointer. That's all that is
needed.
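A future caller outside io_uring.c can then size the rings from the
setup flags alone (hypothetical usage, for illustration):

	size_t sq_off;
	unsigned long size;

	size = rings_size(p->flags, p->sq_entries, p->cq_entries, &sq_off);
	if (size == SIZE_MAX)
		return -EOVERFLOW;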
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 io_uring/io_uring.c | 14 ++++++--------
 io_uring/io_uring.h |  5 +++++
 2 files changed, 11 insertions(+), 8 deletions(-)

diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index 58b401900b41..6dea5242d666 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -105,9 +105,6 @@
 #include "alloc_cache.h"
 #include "eventfd.h"
 
-#define IORING_MAX_ENTRIES	32768
-#define IORING_MAX_CQ_ENTRIES	(2 * IORING_MAX_ENTRIES)
-
 #define SQE_COMMON_FLAGS (IOSQE_FIXED_FILE | IOSQE_IO_LINK | \
 			  IOSQE_IO_HARDLINK | IOSQE_ASYNC)
 
@@ -2667,8 +2664,8 @@ static void io_rings_free(struct io_ring_ctx *ctx)
 	ctx->sq_sqes = NULL;
 }
 
-static unsigned long rings_size(struct io_ring_ctx *ctx, unsigned int sq_entries,
-				unsigned int cq_entries, size_t *sq_offset)
+unsigned long rings_size(unsigned int flags, unsigned int sq_entries,
+			 unsigned int cq_entries, size_t *sq_offset)
 {
 	struct io_rings *rings;
 	size_t off, sq_array_size;
@@ -2676,7 +2673,7 @@ static unsigned long rings_size(struct io_ring_ctx *ctx, unsigned int sq_entries
 	off = struct_size(rings, cqes, cq_entries);
 	if (off == SIZE_MAX)
 		return SIZE_MAX;
-	if (ctx->flags & IORING_SETUP_CQE32) {
+	if (flags & IORING_SETUP_CQE32) {
 		if (check_shl_overflow(off, 1, &off))
 			return SIZE_MAX;
 	}
@@ -2687,7 +2684,7 @@ static unsigned long rings_size(struct io_ring_ctx *ctx, unsigned int sq_entries
 		return SIZE_MAX;
 #endif
 
-	if (ctx->flags & IORING_SETUP_NO_SQARRAY) {
+	if (flags & IORING_SETUP_NO_SQARRAY) {
 		*sq_offset = SIZE_MAX;
 		return off;
 	}
@@ -3434,7 +3431,8 @@ static __cold int io_allocate_scq_urings(struct io_ring_ctx *ctx,
 	ctx->sq_entries = p->sq_entries;
 	ctx->cq_entries = p->cq_entries;
 
-	size = rings_size(ctx, p->sq_entries, p->cq_entries, &sq_array_offset);
+	size = rings_size(ctx->flags, p->sq_entries, p->cq_entries,
+			  &sq_array_offset);
 	if (size == SIZE_MAX)
 		return -EOVERFLOW;
 
diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h
index 9cd9a127e9ed..4a471a810f02 100644
--- a/io_uring/io_uring.h
+++ b/io_uring/io_uring.h
@@ -65,6 +65,11 @@ static inline bool io_should_wake(struct io_wait_queue *iowq)
 	return dist >= 0 || atomic_read(&ctx->cq_timeouts) != iowq->nr_timeouts;
 }
 
+#define IORING_MAX_ENTRIES	32768
+#define IORING_MAX_CQ_ENTRIES	(2 * IORING_MAX_ENTRIES)
+
+unsigned long rings_size(unsigned int flags, unsigned int sq_entries,
+			 unsigned int cq_entries, size_t *sq_offset);
 bool io_cqe_cache_refill(struct io_ring_ctx *ctx, bool overflow);
 int io_run_task_work_sig(struct io_ring_ctx *ctx);
 void io_req_defer_failed(struct io_kiocb *req, s32 res);
-- 
2.51.0