From e6d43739d0ee49a39505d696ba6a656f47c2bd39 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 16 Oct 2024 15:54:06 -0600 Subject: [PATCH 01/16] io_uring: kill 'imu' from struct io_kiocb It's no longer being used, remove it. Signed-off-by: Jens Axboe --- include/linux/io_uring_types.h | 3 --- 1 file changed, 3 deletions(-) diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h index 391087144666..6d3ee71bd832 100644 --- a/include/linux/io_uring_types.h +++ b/include/linux/io_uring_types.h @@ -613,9 +613,6 @@ struct io_kiocb { struct task_struct *task; union { - /* store used ubuf, so we can prevent reloading */ - struct io_mapped_ubuf *imu; - /* stores selected buf, valid IFF REQ_F_BUFFER_SELECTED is set */ struct io_buffer *kbuf; -- 2.51.0 From 93db98f6f1d62c9e58787f6beb62245ddb91f354 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Tue, 22 Oct 2024 15:43:12 +0100 Subject: [PATCH 02/16] io_uring/net: split send and sendmsg prep helpers A preparation patch splitting io_sendmsg_prep_setup into two separate helpers for send and sendmsg variants. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/1a2319471ba040e053b7f1d22f4af510d1118eca.1729607201.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/net.c | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/io_uring/net.c b/io_uring/net.c index b9e7e496ae85..aa256aa46409 100644 --- a/io_uring/net.c +++ b/io_uring/net.c @@ -385,16 +385,11 @@ static int io_send_setup(struct io_kiocb *req) return 0; } -static int io_sendmsg_prep_setup(struct io_kiocb *req, int is_msg) +static int io_sendmsg_setup(struct io_kiocb *req) { - struct io_async_msghdr *kmsg; + struct io_async_msghdr *kmsg = req->async_data; int ret; - kmsg = io_msg_alloc_async(req); - if (unlikely(!kmsg)) - return -ENOMEM; - if (!is_msg) - return io_send_setup(req); ret = io_sendmsg_copy_hdr(req, kmsg); if (!ret) req->flags |= REQ_F_NEED_CLEANUP; @@ -440,7 +435,11 @@ int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) if (req->ctx->compat) sr->msg_flags |= MSG_CMSG_COMPAT; #endif - return io_sendmsg_prep_setup(req, req->opcode == IORING_OP_SENDMSG); + if (unlikely(!io_msg_alloc_async(req))) + return -ENOMEM; + if (req->opcode != IORING_OP_SENDMSG) + return io_send_setup(req); + return io_sendmsg_setup(req); } static void io_req_msg_cleanup(struct io_kiocb *req, @@ -1278,7 +1277,11 @@ int io_send_zc_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) if (req->ctx->compat) zc->msg_flags |= MSG_CMSG_COMPAT; #endif - return io_sendmsg_prep_setup(req, req->opcode == IORING_OP_SENDMSG_ZC); + if (unlikely(!io_msg_alloc_async(req))) + return -ENOMEM; + if (req->opcode != IORING_OP_SENDMSG_ZC) + return io_send_setup(req); + return io_sendmsg_setup(req); } static int io_sg_from_iter_iovec(struct sk_buff *skb, -- 2.51.0 From ad438d070a3bf2a3ae45b59a885a5d7b0dbbc465 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Tue, 22 Oct 2024 15:43:13 +0100 Subject: [PATCH 03/16] io_uring/net: don't store send address ptr For non "msg" requests we copy the address at the prep stage and there is no need to store the address user pointer long term. Pass the SQE into io_send_setup(), let it parse it, and remove struct io_sr_msg addr addr_len fields. It saves some space and also less confusing. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/db3dce544e17ca9d4b17d2506fbbac1da8a87824.1729607201.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/net.c | 36 ++++++++++++++++-------------------- 1 file changed, 16 insertions(+), 20 deletions(-) diff --git a/io_uring/net.c b/io_uring/net.c index aa256aa46409..ad34c99930be 100644 --- a/io_uring/net.c +++ b/io_uring/net.c @@ -74,10 +74,8 @@ struct io_sr_msg { unsigned nr_multishot_loops; u16 flags; /* initialised and used only by !msg send variants */ - u16 addr_len; u16 buf_group; u16 buf_index; - void __user *addr; void __user *msg_control; /* used only for send zerocopy */ struct io_kiocb *notif; @@ -357,24 +355,31 @@ void io_sendmsg_recvmsg_cleanup(struct io_kiocb *req) io_netmsg_iovec_free(io); } -static int io_send_setup(struct io_kiocb *req) +static int io_send_setup(struct io_kiocb *req, const struct io_uring_sqe *sqe) { struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg); struct io_async_msghdr *kmsg = req->async_data; + void __user *addr; + u16 addr_len; int ret; + if (READ_ONCE(sqe->__pad3[0])) + return -EINVAL; + kmsg->msg.msg_name = NULL; kmsg->msg.msg_namelen = 0; kmsg->msg.msg_control = NULL; kmsg->msg.msg_controllen = 0; kmsg->msg.msg_ubuf = NULL; - if (sr->addr) { - ret = move_addr_to_kernel(sr->addr, sr->addr_len, &kmsg->addr); + addr = u64_to_user_ptr(READ_ONCE(sqe->addr2)); + addr_len = READ_ONCE(sqe->addr_len); + if (addr) { + ret = move_addr_to_kernel(addr, addr_len, &kmsg->addr); if (unlikely(ret < 0)) return ret; kmsg->msg.msg_name = &kmsg->addr; - kmsg->msg.msg_namelen = sr->addr_len; + kmsg->msg.msg_namelen = addr_len; } if (!io_do_buffer_select(req)) { ret = import_ubuf(ITER_SOURCE, sr->buf, sr->len, @@ -404,13 +409,9 @@ int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) sr->done_io = 0; - if (req->opcode == IORING_OP_SEND) { - if (READ_ONCE(sqe->__pad3[0])) + if (req->opcode != IORING_OP_SEND) { + if (sqe->addr2 || sqe->file_index) return -EINVAL; - sr->addr = u64_to_user_ptr(READ_ONCE(sqe->addr2)); - sr->addr_len = READ_ONCE(sqe->addr_len); - } else if (sqe->addr2 || sqe->file_index) { - return -EINVAL; } sr->umsg = u64_to_user_ptr(READ_ONCE(sqe->addr)); @@ -438,7 +439,7 @@ int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) if (unlikely(!io_msg_alloc_async(req))) return -ENOMEM; if (req->opcode != IORING_OP_SENDMSG) - return io_send_setup(req); + return io_send_setup(req, sqe); return io_sendmsg_setup(req); } @@ -1254,12 +1255,7 @@ int io_send_zc_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) } } - if (req->opcode == IORING_OP_SEND_ZC) { - if (READ_ONCE(sqe->__pad3[0])) - return -EINVAL; - zc->addr = u64_to_user_ptr(READ_ONCE(sqe->addr2)); - zc->addr_len = READ_ONCE(sqe->addr_len); - } else { + if (req->opcode != IORING_OP_SEND_ZC) { if (unlikely(sqe->addr2 || sqe->file_index)) return -EINVAL; if (unlikely(zc->flags & IORING_RECVSEND_FIXED_BUF)) @@ -1280,7 +1276,7 @@ int io_send_zc_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) if (unlikely(!io_msg_alloc_async(req))) return -ENOMEM; if (req->opcode != IORING_OP_SENDMSG_ZC) - return io_send_setup(req); + return io_send_setup(req, sqe); return io_sendmsg_setup(req); } -- 2.51.0 From 52838787350d4ea8132804940d5308d95ce5e035 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Tue, 22 Oct 2024 15:43:14 +0100 Subject: [PATCH 04/16] io_uring/net: don't alias send user pointer reads We keep user pointers in an union, which could be a user buffer or a user pointer to msghdr. What is confusing is that it potenitally reads and assigns sqe->addr as one type but then uses it as another via the union. Even more, it's not even consistent across copy and zerocopy versions. Make send and sendmsg setup helpers read sqe->addr and treat it as the right type from the beginning. The end goal would be to get rid of the use of struct io_sr_msg::umsg for send requests as we only need it at the prep side. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/685d788605f5d78af18802fcabf61ba65cfd8002.1729607201.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/net.c | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/io_uring/net.c b/io_uring/net.c index ad34c99930be..5e7263846243 100644 --- a/io_uring/net.c +++ b/io_uring/net.c @@ -363,6 +363,8 @@ static int io_send_setup(struct io_kiocb *req, const struct io_uring_sqe *sqe) u16 addr_len; int ret; + sr->buf = u64_to_user_ptr(READ_ONCE(sqe->addr)); + if (READ_ONCE(sqe->__pad3[0])) return -EINVAL; @@ -390,11 +392,14 @@ static int io_send_setup(struct io_kiocb *req, const struct io_uring_sqe *sqe) return 0; } -static int io_sendmsg_setup(struct io_kiocb *req) +static int io_sendmsg_setup(struct io_kiocb *req, const struct io_uring_sqe *sqe) { + struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg); struct io_async_msghdr *kmsg = req->async_data; int ret; + sr->umsg = u64_to_user_ptr(READ_ONCE(sqe->addr)); + ret = io_sendmsg_copy_hdr(req, kmsg); if (!ret) req->flags |= REQ_F_NEED_CLEANUP; @@ -414,7 +419,6 @@ int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) return -EINVAL; } - sr->umsg = u64_to_user_ptr(READ_ONCE(sqe->addr)); sr->len = READ_ONCE(sqe->len); sr->flags = READ_ONCE(sqe->ioprio); if (sr->flags & ~SENDMSG_FLAGS) @@ -440,7 +444,7 @@ int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) return -ENOMEM; if (req->opcode != IORING_OP_SENDMSG) return io_send_setup(req, sqe); - return io_sendmsg_setup(req); + return io_sendmsg_setup(req, sqe); } static void io_req_msg_cleanup(struct io_kiocb *req, @@ -1262,7 +1266,6 @@ int io_send_zc_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) return -EINVAL; } - zc->buf = u64_to_user_ptr(READ_ONCE(sqe->addr)); zc->len = READ_ONCE(sqe->len); zc->msg_flags = READ_ONCE(sqe->msg_flags) | MSG_NOSIGNAL | MSG_ZEROCOPY; zc->buf_index = READ_ONCE(sqe->buf_index); @@ -1277,7 +1280,7 @@ int io_send_zc_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) return -ENOMEM; if (req->opcode != IORING_OP_SENDMSG_ZC) return io_send_setup(req, sqe); - return io_sendmsg_setup(req); + return io_sendmsg_setup(req, sqe); } static int io_sg_from_iter_iovec(struct sk_buff *skb, -- 2.51.0 From 882dec6c39c40c13dd03e418952c4af38d91bb38 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Tue, 22 Oct 2024 15:43:15 +0100 Subject: [PATCH 05/16] io_uring/net: clean up io_msg_copy_hdr Put sr->umsg into a local variable, so it doesn't repeat "sr->umsg->" for every field. It looks nicer, and likely without the patch it compiles into a bunch of umsg memory reads. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/26c2f30b491ea7998bfdb5bb290662572a61064d.1729607201.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/net.c | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/io_uring/net.c b/io_uring/net.c index 5e7263846243..2040195e33ab 100644 --- a/io_uring/net.c +++ b/io_uring/net.c @@ -262,6 +262,7 @@ static int io_msg_copy_hdr(struct io_kiocb *req, struct io_async_msghdr *iomsg, struct user_msghdr *msg, int ddir) { struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg); + struct user_msghdr __user *umsg = sr->umsg; struct iovec *iov; int ret, nr_segs; @@ -273,16 +274,16 @@ static int io_msg_copy_hdr(struct io_kiocb *req, struct io_async_msghdr *iomsg, nr_segs = 1; } - if (!user_access_begin(sr->umsg, sizeof(*sr->umsg))) + if (!user_access_begin(umsg, sizeof(*umsg))) return -EFAULT; ret = -EFAULT; - unsafe_get_user(msg->msg_name, &sr->umsg->msg_name, ua_end); - unsafe_get_user(msg->msg_namelen, &sr->umsg->msg_namelen, ua_end); - unsafe_get_user(msg->msg_iov, &sr->umsg->msg_iov, ua_end); - unsafe_get_user(msg->msg_iovlen, &sr->umsg->msg_iovlen, ua_end); - unsafe_get_user(msg->msg_control, &sr->umsg->msg_control, ua_end); - unsafe_get_user(msg->msg_controllen, &sr->umsg->msg_controllen, ua_end); + unsafe_get_user(msg->msg_name, &umsg->msg_name, ua_end); + unsafe_get_user(msg->msg_namelen, &umsg->msg_namelen, ua_end); + unsafe_get_user(msg->msg_iov, &umsg->msg_iov, ua_end); + unsafe_get_user(msg->msg_iovlen, &umsg->msg_iovlen, ua_end); + unsafe_get_user(msg->msg_control, &umsg->msg_control, ua_end); + unsafe_get_user(msg->msg_controllen, &umsg->msg_controllen, ua_end); msg->msg_flags = 0; if (req->flags & REQ_F_BUFFER_SELECT) { -- 2.51.0 From 09d0a8ea7facc8b1581c9bd85c3ea6f5aa62ab7d Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 21 Oct 2024 13:29:39 -0600 Subject: [PATCH 06/16] io_uring: move max entry definition and ring sizing into header In preparation for needing this somewhere else, move the definitions for the maximum CQ and SQ ring size into io_uring.h. Make the rings_size() helper available as well, and have it take just the setup flags argument rather than the fill ring pointer. That's all that is needed. Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 14 ++++++-------- io_uring/io_uring.h | 5 +++++ 2 files changed, 11 insertions(+), 8 deletions(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 58b401900b41..6dea5242d666 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -105,9 +105,6 @@ #include "alloc_cache.h" #include "eventfd.h" -#define IORING_MAX_ENTRIES 32768 -#define IORING_MAX_CQ_ENTRIES (2 * IORING_MAX_ENTRIES) - #define SQE_COMMON_FLAGS (IOSQE_FIXED_FILE | IOSQE_IO_LINK | \ IOSQE_IO_HARDLINK | IOSQE_ASYNC) @@ -2667,8 +2664,8 @@ static void io_rings_free(struct io_ring_ctx *ctx) ctx->sq_sqes = NULL; } -static unsigned long rings_size(struct io_ring_ctx *ctx, unsigned int sq_entries, - unsigned int cq_entries, size_t *sq_offset) +unsigned long rings_size(unsigned int flags, unsigned int sq_entries, + unsigned int cq_entries, size_t *sq_offset) { struct io_rings *rings; size_t off, sq_array_size; @@ -2676,7 +2673,7 @@ static unsigned long rings_size(struct io_ring_ctx *ctx, unsigned int sq_entries off = struct_size(rings, cqes, cq_entries); if (off == SIZE_MAX) return SIZE_MAX; - if (ctx->flags & IORING_SETUP_CQE32) { + if (flags & IORING_SETUP_CQE32) { if (check_shl_overflow(off, 1, &off)) return SIZE_MAX; } @@ -2687,7 +2684,7 @@ static unsigned long rings_size(struct io_ring_ctx *ctx, unsigned int sq_entries return SIZE_MAX; #endif - if (ctx->flags & IORING_SETUP_NO_SQARRAY) { + if (flags & IORING_SETUP_NO_SQARRAY) { *sq_offset = SIZE_MAX; return off; } @@ -3434,7 +3431,8 @@ static __cold int io_allocate_scq_urings(struct io_ring_ctx *ctx, ctx->sq_entries = p->sq_entries; ctx->cq_entries = p->cq_entries; - size = rings_size(ctx, p->sq_entries, p->cq_entries, &sq_array_offset); + size = rings_size(ctx->flags, p->sq_entries, p->cq_entries, + &sq_array_offset); if (size == SIZE_MAX) return -EOVERFLOW; diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h index 9cd9a127e9ed..4a471a810f02 100644 --- a/io_uring/io_uring.h +++ b/io_uring/io_uring.h @@ -65,6 +65,11 @@ static inline bool io_should_wake(struct io_wait_queue *iowq) return dist >= 0 || atomic_read(&ctx->cq_timeouts) != iowq->nr_timeouts; } +#define IORING_MAX_ENTRIES 32768 +#define IORING_MAX_CQ_ENTRIES (2 * IORING_MAX_ENTRIES) + +unsigned long rings_size(unsigned int flags, unsigned int sq_entries, + unsigned int cq_entries, size_t *sq_offset); bool io_cqe_cache_refill(struct io_ring_ctx *ctx, bool overflow); int io_run_task_work_sig(struct io_ring_ctx *ctx); void io_req_defer_failed(struct io_kiocb *req, s32 res); -- 2.51.0 From 81d8191eb99d95b32e55d09d74f682d40d3e74e9 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 21 Oct 2024 13:32:19 -0600 Subject: [PATCH 07/16] io_uring: abstract out a bit of the ring filling logic Abstract out a io_uring_fill_params() helper, which fills out the necessary bits of struct io_uring_params. Add it to io_uring.h as well, in preparation for having another internal user of it. Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 70 ++++++++++++++++++++++++++------------------- io_uring/io_uring.h | 1 + 2 files changed, 41 insertions(+), 30 deletions(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 6dea5242d666..b5974bdad48b 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -3498,14 +3498,8 @@ static struct file *io_uring_get_file(struct io_ring_ctx *ctx) O_RDWR | O_CLOEXEC, NULL); } -static __cold int io_uring_create(unsigned entries, struct io_uring_params *p, - struct io_uring_params __user *params) +int io_uring_fill_params(unsigned entries, struct io_uring_params *p) { - struct io_ring_ctx *ctx; - struct io_uring_task *tctx; - struct file *file; - int ret; - if (!entries) return -EINVAL; if (entries > IORING_MAX_ENTRIES) { @@ -3547,6 +3541,42 @@ static __cold int io_uring_create(unsigned entries, struct io_uring_params *p, p->cq_entries = 2 * p->sq_entries; } + p->sq_off.head = offsetof(struct io_rings, sq.head); + p->sq_off.tail = offsetof(struct io_rings, sq.tail); + p->sq_off.ring_mask = offsetof(struct io_rings, sq_ring_mask); + p->sq_off.ring_entries = offsetof(struct io_rings, sq_ring_entries); + p->sq_off.flags = offsetof(struct io_rings, sq_flags); + p->sq_off.dropped = offsetof(struct io_rings, sq_dropped); + p->sq_off.resv1 = 0; + if (!(p->flags & IORING_SETUP_NO_MMAP)) + p->sq_off.user_addr = 0; + + p->cq_off.head = offsetof(struct io_rings, cq.head); + p->cq_off.tail = offsetof(struct io_rings, cq.tail); + p->cq_off.ring_mask = offsetof(struct io_rings, cq_ring_mask); + p->cq_off.ring_entries = offsetof(struct io_rings, cq_ring_entries); + p->cq_off.overflow = offsetof(struct io_rings, cq_overflow); + p->cq_off.cqes = offsetof(struct io_rings, cqes); + p->cq_off.flags = offsetof(struct io_rings, cq_flags); + p->cq_off.resv1 = 0; + if (!(p->flags & IORING_SETUP_NO_MMAP)) + p->cq_off.user_addr = 0; + + return 0; +} + +static __cold int io_uring_create(unsigned entries, struct io_uring_params *p, + struct io_uring_params __user *params) +{ + struct io_ring_ctx *ctx; + struct io_uring_task *tctx; + struct file *file; + int ret; + + ret = io_uring_fill_params(entries, p); + if (unlikely(ret)) + return ret; + ctx = io_ring_ctx_alloc(p); if (!ctx) return -ENOMEM; @@ -3630,6 +3660,9 @@ static __cold int io_uring_create(unsigned entries, struct io_uring_params *p, if (ret) goto err; + if (!(p->flags & IORING_SETUP_NO_SQARRAY)) + p->sq_off.array = (char *)ctx->sq_array - (char *)ctx->rings; + ret = io_sq_offload_create(ctx, p); if (ret) goto err; @@ -3638,29 +3671,6 @@ static __cold int io_uring_create(unsigned entries, struct io_uring_params *p, if (ret) goto err; - p->sq_off.head = offsetof(struct io_rings, sq.head); - p->sq_off.tail = offsetof(struct io_rings, sq.tail); - p->sq_off.ring_mask = offsetof(struct io_rings, sq_ring_mask); - p->sq_off.ring_entries = offsetof(struct io_rings, sq_ring_entries); - p->sq_off.flags = offsetof(struct io_rings, sq_flags); - p->sq_off.dropped = offsetof(struct io_rings, sq_dropped); - if (!(ctx->flags & IORING_SETUP_NO_SQARRAY)) - p->sq_off.array = (char *)ctx->sq_array - (char *)ctx->rings; - p->sq_off.resv1 = 0; - if (!(ctx->flags & IORING_SETUP_NO_MMAP)) - p->sq_off.user_addr = 0; - - p->cq_off.head = offsetof(struct io_rings, cq.head); - p->cq_off.tail = offsetof(struct io_rings, cq.tail); - p->cq_off.ring_mask = offsetof(struct io_rings, cq_ring_mask); - p->cq_off.ring_entries = offsetof(struct io_rings, cq_ring_entries); - p->cq_off.overflow = offsetof(struct io_rings, cq_overflow); - p->cq_off.cqes = offsetof(struct io_rings, cqes); - p->cq_off.flags = offsetof(struct io_rings, cq_flags); - p->cq_off.resv1 = 0; - if (!(ctx->flags & IORING_SETUP_NO_MMAP)) - p->cq_off.user_addr = 0; - p->features = IORING_FEAT_SINGLE_MMAP | IORING_FEAT_NODROP | IORING_FEAT_SUBMIT_STABLE | IORING_FEAT_RW_CUR_POS | IORING_FEAT_CUR_PERSONALITY | IORING_FEAT_FAST_POLL | diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h index 4a471a810f02..e3e6cb14de5d 100644 --- a/io_uring/io_uring.h +++ b/io_uring/io_uring.h @@ -70,6 +70,7 @@ static inline bool io_should_wake(struct io_wait_queue *iowq) unsigned long rings_size(unsigned int flags, unsigned int sq_entries, unsigned int cq_entries, size_t *sq_offset); +int io_uring_fill_params(unsigned entries, struct io_uring_params *p); bool io_cqe_cache_refill(struct io_ring_ctx *ctx, bool overflow); int io_run_task_work_sig(struct io_ring_ctx *ctx); void io_req_defer_failed(struct io_kiocb *req, s32 res); -- 2.51.0 From d090bffab609762af06dec295a305ce270941b42 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 24 Oct 2024 10:52:02 -0600 Subject: [PATCH 08/16] io_uring/memmap: explicitly return -EFAULT for mmap on NULL rings The later mapping will actually check this too, but in terms of code clarify, explicitly check for whether or not the rings and sqes are valid during validation. That makes it explicit that if they are non-NULL, they are valid and can get mapped. Signed-off-by: Jens Axboe --- io_uring/memmap.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/io_uring/memmap.c b/io_uring/memmap.c index a0f32a255fd1..d614824e17bd 100644 --- a/io_uring/memmap.c +++ b/io_uring/memmap.c @@ -204,11 +204,15 @@ static void *io_uring_validate_mmap_request(struct file *file, loff_t pgoff, /* Don't allow mmap if the ring was setup without it */ if (ctx->flags & IORING_SETUP_NO_MMAP) return ERR_PTR(-EINVAL); + if (!ctx->rings) + return ERR_PTR(-EFAULT); return ctx->rings; case IORING_OFF_SQES: /* Don't allow mmap if the ring was setup without it */ if (ctx->flags & IORING_SETUP_NO_MMAP) return ERR_PTR(-EINVAL); + if (!ctx->sq_sqes) + return ERR_PTR(-EFAULT); return ctx->sq_sqes; case IORING_OFF_PBUF_RING: { struct io_buffer_list *bl; -- 2.51.0 From 79cfe9e59c2a12c3b3faeeefe38d23f3d8030972 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 21 Oct 2024 13:34:10 -0600 Subject: [PATCH 09/16] io_uring/register: add IORING_REGISTER_RESIZE_RINGS Once a ring has been created, the size of the CQ and SQ rings are fixed. Usually this isn't a problem on the SQ ring side, as it merely controls the available number of requests that can be submitted in a single system call, and there's rarely a need to change that. For the CQ ring, it's a different story. For most efficient use of io_uring, it's important that the CQ ring never overflows. This means that applications must size it for the worst case scenario, which can be wasteful. Add IORING_REGISTER_RESIZE_RINGS, which allows an application to resize the existing rings. It takes a struct io_uring_params argument, the same one which is used to setup the ring initially, and resizes rings according to the sizes given. Certain properties are always inherited from the original ring setup, like SQE128/CQE32 and other setup options. The implementation only allows flag associated with how the CQ ring is sized and clamped. Existing unconsumed SQE and CQE entries are copied as part of the process. If either the SQ or CQ resized destination ring cannot hold the entries already present in the source rings, then the operation is failed with -EOVERFLOW. Any register op holds ->uring_lock, which prevents new submissions, and the internal mapping holds the completion lock as well across moving CQ ring state. To prevent races between mmap and ring resizing, add a mutex that's solely used to serialize ring resize and mmap. mmap_sem can't be used here, as as fork'ed process may be doing mmaps on the ring as well. The ctx->resize_lock is held across mmap operations, and the resize will grab it before swapping out the already mapped new data. Signed-off-by: Jens Axboe --- include/linux/io_uring_types.h | 7 ++ include/uapi/linux/io_uring.h | 5 + io_uring/io_uring.c | 1 + io_uring/memmap.c | 8 ++ io_uring/register.c | 215 +++++++++++++++++++++++++++++++++ 5 files changed, 236 insertions(+) diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h index 6d3ee71bd832..841579dcdae9 100644 --- a/include/linux/io_uring_types.h +++ b/include/linux/io_uring_types.h @@ -415,6 +415,13 @@ struct io_ring_ctx { /* protected by ->completion_lock */ unsigned evfd_last_cq_tail; + /* + * Protection for resize vs mmap races - both the mmap and resize + * side will need to grab this lock, to prevent either side from + * being run concurrently with the other. + */ + struct mutex resize_lock; + /* * If IORING_SETUP_NO_MMAP is used, then the below holds * the gup'ed pages for the two rings, and the sqes. diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index 86cb385fe0b5..60b9c98595fa 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -615,6 +615,11 @@ enum io_uring_register_op { /* send MSG_RING without having a ring */ IORING_REGISTER_SEND_MSG_RING = 31, + /* 32 reserved for zc rx */ + + /* resize CQ ring */ + IORING_REGISTER_RESIZE_RINGS = 33, + /* this goes last */ IORING_REGISTER_LAST, diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index b5974bdad48b..140cd47fbdb3 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -353,6 +353,7 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p) INIT_WQ_LIST(&ctx->submit_state.compl_reqs); INIT_HLIST_HEAD(&ctx->cancelable_uring_cmd); io_napi_init(ctx); + mutex_init(&ctx->resize_lock); return ctx; diff --git a/io_uring/memmap.c b/io_uring/memmap.c index d614824e17bd..85c66fa54956 100644 --- a/io_uring/memmap.c +++ b/io_uring/memmap.c @@ -251,6 +251,8 @@ __cold int io_uring_mmap(struct file *file, struct vm_area_struct *vma) unsigned int npages; void *ptr; + guard(mutex)(&ctx->resize_lock); + ptr = io_uring_validate_mmap_request(file, vma->vm_pgoff, sz); if (IS_ERR(ptr)) return PTR_ERR(ptr); @@ -274,6 +276,7 @@ unsigned long io_uring_get_unmapped_area(struct file *filp, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags) { + struct io_ring_ctx *ctx = filp->private_data; void *ptr; /* @@ -284,6 +287,8 @@ unsigned long io_uring_get_unmapped_area(struct file *filp, unsigned long addr, if (addr) return -EINVAL; + guard(mutex)(&ctx->resize_lock); + ptr = io_uring_validate_mmap_request(filp, pgoff, len); if (IS_ERR(ptr)) return -ENOMEM; @@ -329,8 +334,11 @@ unsigned long io_uring_get_unmapped_area(struct file *file, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags) { + struct io_ring_ctx *ctx = file->private_data; void *ptr; + guard(mutex)(&ctx->resize_lock); + ptr = io_uring_validate_mmap_request(file, pgoff, len); if (IS_ERR(ptr)) return PTR_ERR(ptr); diff --git a/io_uring/register.c b/io_uring/register.c index 52b2f9b74af8..fc6c94d694b2 100644 --- a/io_uring/register.c +++ b/io_uring/register.c @@ -29,6 +29,7 @@ #include "napi.h" #include "eventfd.h" #include "msg_ring.h" +#include "memmap.h" #define IORING_MAX_RESTRICTIONS (IORING_RESTRICTION_LAST + \ IORING_REGISTER_LAST + IORING_OP_LAST) @@ -361,6 +362,214 @@ static int io_register_clock(struct io_ring_ctx *ctx, return 0; } +/* + * State to maintain until we can swap. Both new and old state, used for + * either mapping or freeing. + */ +struct io_ring_ctx_rings { + unsigned short n_ring_pages; + unsigned short n_sqe_pages; + struct page **ring_pages; + struct page **sqe_pages; + struct io_uring_sqe *sq_sqes; + struct io_rings *rings; +}; + +static void io_register_free_rings(struct io_uring_params *p, + struct io_ring_ctx_rings *r) +{ + if (!(p->flags & IORING_SETUP_NO_MMAP)) { + io_pages_unmap(r->rings, &r->ring_pages, &r->n_ring_pages, + true); + io_pages_unmap(r->sq_sqes, &r->sqe_pages, &r->n_sqe_pages, + true); + } else { + io_pages_free(&r->ring_pages, r->n_ring_pages); + io_pages_free(&r->sqe_pages, r->n_sqe_pages); + vunmap(r->rings); + vunmap(r->sq_sqes); + } +} + +#define swap_old(ctx, o, n, field) \ + do { \ + (o).field = (ctx)->field; \ + (ctx)->field = (n).field; \ + } while (0) + +#define RESIZE_FLAGS (IORING_SETUP_CQSIZE | IORING_SETUP_CLAMP) +#define COPY_FLAGS (IORING_SETUP_NO_SQARRAY | IORING_SETUP_SQE128 | \ + IORING_SETUP_CQE32 | IORING_SETUP_NO_MMAP) + +static int io_register_resize_rings(struct io_ring_ctx *ctx, void __user *arg) +{ + struct io_ring_ctx_rings o = { }, n = { }, *to_free = NULL; + size_t size, sq_array_offset; + struct io_uring_params p; + unsigned i, tail; + void *ptr; + int ret; + + /* for single issuer, must be owner resizing */ + if (ctx->flags & IORING_SETUP_SINGLE_ISSUER && + current != ctx->submitter_task) + return -EEXIST; + if (copy_from_user(&p, arg, sizeof(p))) + return -EFAULT; + if (p.flags & ~RESIZE_FLAGS) + return -EINVAL; + + /* properties that are always inherited */ + p.flags |= (ctx->flags & COPY_FLAGS); + + ret = io_uring_fill_params(p.sq_entries, &p); + if (unlikely(ret)) + return ret; + + /* nothing to do, but copy params back */ + if (p.sq_entries == ctx->sq_entries && p.cq_entries == ctx->cq_entries) { + if (copy_to_user(arg, &p, sizeof(p))) + return -EFAULT; + return 0; + } + + size = rings_size(p.flags, p.sq_entries, p.cq_entries, + &sq_array_offset); + if (size == SIZE_MAX) + return -EOVERFLOW; + + if (!(p.flags & IORING_SETUP_NO_MMAP)) + n.rings = io_pages_map(&n.ring_pages, &n.n_ring_pages, size); + else + n.rings = __io_uaddr_map(&n.ring_pages, &n.n_ring_pages, + p.cq_off.user_addr, size); + if (IS_ERR(n.rings)) + return PTR_ERR(n.rings); + + n.rings->sq_ring_mask = p.sq_entries - 1; + n.rings->cq_ring_mask = p.cq_entries - 1; + n.rings->sq_ring_entries = p.sq_entries; + n.rings->cq_ring_entries = p.cq_entries; + + if (copy_to_user(arg, &p, sizeof(p))) { + io_register_free_rings(&p, &n); + return -EFAULT; + } + + if (p.flags & IORING_SETUP_SQE128) + size = array_size(2 * sizeof(struct io_uring_sqe), p.sq_entries); + else + size = array_size(sizeof(struct io_uring_sqe), p.sq_entries); + if (size == SIZE_MAX) { + io_register_free_rings(&p, &n); + return -EOVERFLOW; + } + + if (!(p.flags & IORING_SETUP_NO_MMAP)) + ptr = io_pages_map(&n.sqe_pages, &n.n_sqe_pages, size); + else + ptr = __io_uaddr_map(&n.sqe_pages, &n.n_sqe_pages, + p.sq_off.user_addr, + size); + if (IS_ERR(ptr)) { + io_register_free_rings(&p, &n); + return PTR_ERR(ptr); + } + + /* + * If using SQPOLL, park the thread + */ + if (ctx->sq_data) { + mutex_unlock(&ctx->uring_lock); + io_sq_thread_park(ctx->sq_data); + mutex_lock(&ctx->uring_lock); + } + + /* + * We'll do the swap. Grab the ctx->resize_lock, which will exclude + * any new mmap's on the ring fd. Clear out existing mappings to prevent + * mmap from seeing them, as we'll unmap them. Any attempt to mmap + * existing rings beyond this point will fail. Not that it could proceed + * at this point anyway, as the io_uring mmap side needs go grab the + * ctx->resize_lock as well. Likewise, hold the completion lock over the + * duration of the actual swap. + */ + mutex_lock(&ctx->resize_lock); + spin_lock(&ctx->completion_lock); + o.rings = ctx->rings; + ctx->rings = NULL; + o.sq_sqes = ctx->sq_sqes; + ctx->sq_sqes = NULL; + + /* + * Now copy SQ and CQ entries, if any. If either of the destination + * rings can't hold what is already there, then fail the operation. + */ + n.sq_sqes = ptr; + tail = o.rings->sq.tail; + if (tail - o.rings->sq.head > p.sq_entries) + goto overflow; + for (i = o.rings->sq.head; i < tail; i++) { + unsigned src_head = i & (ctx->sq_entries - 1); + unsigned dst_head = i & n.rings->sq_ring_mask; + + n.sq_sqes[dst_head] = o.sq_sqes[src_head]; + } + n.rings->sq.head = o.rings->sq.head; + n.rings->sq.tail = o.rings->sq.tail; + + tail = o.rings->cq.tail; + if (tail - o.rings->cq.head > p.cq_entries) { +overflow: + /* restore old rings, and return -EOVERFLOW via cleanup path */ + ctx->rings = o.rings; + ctx->sq_sqes = o.sq_sqes; + to_free = &n; + ret = -EOVERFLOW; + goto out; + } + for (i = o.rings->cq.head; i < tail; i++) { + unsigned src_head = i & (ctx->cq_entries - 1); + unsigned dst_head = i & n.rings->cq_ring_mask; + + n.rings->cqes[dst_head] = o.rings->cqes[src_head]; + } + n.rings->cq.head = o.rings->cq.head; + n.rings->cq.tail = o.rings->cq.tail; + /* invalidate cached cqe refill */ + ctx->cqe_cached = ctx->cqe_sentinel = NULL; + + n.rings->sq_dropped = o.rings->sq_dropped; + n.rings->sq_flags = o.rings->sq_flags; + n.rings->cq_flags = o.rings->cq_flags; + n.rings->cq_overflow = o.rings->cq_overflow; + + /* all done, store old pointers and assign new ones */ + if (!(ctx->flags & IORING_SETUP_NO_SQARRAY)) + ctx->sq_array = (u32 *)((char *)n.rings + sq_array_offset); + + ctx->sq_entries = p.sq_entries; + ctx->cq_entries = p.cq_entries; + + ctx->rings = n.rings; + ctx->sq_sqes = n.sq_sqes; + swap_old(ctx, o, n, n_ring_pages); + swap_old(ctx, o, n, n_sqe_pages); + swap_old(ctx, o, n, ring_pages); + swap_old(ctx, o, n, sqe_pages); + to_free = &o; + ret = 0; +out: + spin_unlock(&ctx->completion_lock); + mutex_unlock(&ctx->resize_lock); + io_register_free_rings(&p, to_free); + + if (ctx->sq_data) + io_sq_thread_unpark(ctx->sq_data); + + return ret; +} + static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode, void __user *arg, unsigned nr_args) __releases(ctx->uring_lock) @@ -549,6 +758,12 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode, break; ret = io_register_clone_buffers(ctx, arg); break; + case IORING_REGISTER_RESIZE_RINGS: + ret = -EINVAL; + if (!arg || nr_args != 1) + break; + ret = io_register_resize_rings(ctx, arg); + break; default: ret = -EINVAL; break; -- 2.51.0 From b898b8c99ead1ce8bee95083bba296e4a86a6c05 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 28 Oct 2024 13:18:27 -0600 Subject: [PATCH 10/16] io_uring/sqpoll: wait on sqd->wait for thread parking io_sqd_handle_event() just does a mutex unlock/lock dance when it's supposed to park, somewhat relying on full ordering with the thread trying to park it which does a similar unlock/lock dance on sqd->lock. However, with adaptive spinning on mutexes, this can waste an awful lot of time. Normally this isn't very noticeable, as parking and unparking the thread isn't a common (or fast path) occurence. However, in testing ring resizing, it's testing exactly that, as each resize will require the SQPOLL to safely park and unpark. Have io_sq_thread_park() explicitly wait on sqd->park_pending being zero before attempting to grab the sqd->lock again. In a resize test, this brings the runtime of SQPOLL down from about 60 seconds to a few seconds, just like the !SQPOLL tests. And saves a ton of spinning time on the mutex, on both sides. Signed-off-by: Jens Axboe --- io_uring/sqpoll.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/io_uring/sqpoll.c b/io_uring/sqpoll.c index a26593979887..1f18b642fbd4 100644 --- a/io_uring/sqpoll.c +++ b/io_uring/sqpoll.c @@ -40,6 +40,7 @@ void io_sq_thread_unpark(struct io_sq_data *sqd) if (atomic_dec_return(&sqd->park_pending)) set_bit(IO_SQ_THREAD_SHOULD_PARK, &sqd->state); mutex_unlock(&sqd->lock); + wake_up(&sqd->wait); } void io_sq_thread_park(struct io_sq_data *sqd) @@ -215,7 +216,7 @@ static bool io_sqd_handle_event(struct io_sq_data *sqd) mutex_unlock(&sqd->lock); if (signal_pending(current)) did_sig = get_signal(&ksig); - cond_resched(); + wait_event(sqd->wait, !atomic_read(&sqd->park_pending)); mutex_lock(&sqd->lock); sqd->sq_cpu = raw_smp_processor_id(); } -- 2.51.0 From 0a54a7dd0a12b777721f5ca55c9d6331d2a46b01 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 22 Oct 2024 13:37:00 -0600 Subject: [PATCH 11/16] io_uring: switch struct ext_arg from __kernel_timespec to timespec64 This avoids intermediate storage for turning a __kernel_timespec user pointer into an on-stack struct timespec64, only then to turn it into a ktime_t. Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 140cd47fbdb3..8f0e0749a581 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -2495,9 +2495,10 @@ static inline int io_cqring_wait_schedule(struct io_ring_ctx *ctx, struct ext_arg { size_t argsz; - struct __kernel_timespec __user *ts; + struct timespec64 ts; const sigset_t __user *sig; ktime_t min_time; + bool ts_set; }; /* @@ -2535,13 +2536,8 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, u32 flags, iowq.timeout = KTIME_MAX; start_time = io_get_time(ctx); - if (ext_arg->ts) { - struct timespec64 ts; - - if (get_timespec64(&ts, ext_arg->ts)) - return -EFAULT; - - iowq.timeout = timespec64_to_ktime(ts); + if (ext_arg->ts_set) { + iowq.timeout = timespec64_to_ktime(ext_arg->ts); if (!(flags & IORING_ENTER_ABS_TIMER)) iowq.timeout = ktime_add(iowq.timeout, start_time); } @@ -3252,7 +3248,6 @@ static int io_get_ext_arg(unsigned flags, const void __user *argp, */ if (!(flags & IORING_ENTER_EXT_ARG)) { ext_arg->sig = (const sigset_t __user *) argp; - ext_arg->ts = NULL; return 0; } @@ -3267,7 +3262,11 @@ static int io_get_ext_arg(unsigned flags, const void __user *argp, ext_arg->min_time = arg.min_wait_usec * NSEC_PER_USEC; ext_arg->sig = u64_to_user_ptr(arg.sigmask); ext_arg->argsz = arg.sigmask_sz; - ext_arg->ts = u64_to_user_ptr(arg.ts); + if (arg.ts) { + if (get_timespec64(&ext_arg->ts, u64_to_user_ptr(arg.ts))) + return -EFAULT; + ext_arg->ts_set = true; + } return 0; } -- 2.51.0 From 371b47da25e1f7a1a6323f84c776bd9fa079a490 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 22 Oct 2024 13:41:42 -0600 Subject: [PATCH 12/16] io_uring: change io_get_ext_arg() to use uaccess begin + end In scenarios where a high frequency of wait events are seen, the copy of the struct io_uring_getevents_arg is quite noticeable in the profiles in terms of time spent. It can be seen as up to 3.5-4.5%. Rewrite the copy-in logic, saving about 0.5% of the time. Reviewed-by: Keith Busch Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 8f0e0749a581..4cd0ee52710d 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -3240,6 +3240,7 @@ static int io_validate_ext_arg(unsigned flags, const void __user *argp, size_t a static int io_get_ext_arg(unsigned flags, const void __user *argp, struct ext_arg *ext_arg) { + const struct io_uring_getevents_arg __user *uarg = argp; struct io_uring_getevents_arg arg; /* @@ -3257,8 +3258,18 @@ static int io_get_ext_arg(unsigned flags, const void __user *argp, */ if (ext_arg->argsz != sizeof(arg)) return -EINVAL; - if (copy_from_user(&arg, argp, sizeof(arg))) +#ifdef CONFIG_64BIT + if (!user_access_begin(uarg, sizeof(*uarg))) return -EFAULT; + unsafe_get_user(arg.sigmask, &uarg->sigmask, uaccess_end); + unsafe_get_user(arg.sigmask_sz, &uarg->sigmask_sz, uaccess_end); + unsafe_get_user(arg.min_wait_usec, &uarg->min_wait_usec, uaccess_end); + unsafe_get_user(arg.ts, &uarg->ts, uaccess_end); + user_access_end(); +#else + if (copy_from_user(&arg, uarg, sizeof(arg))) + return -EFAULT; +#endif ext_arg->min_time = arg.min_wait_usec * NSEC_PER_USEC; ext_arg->sig = u64_to_user_ptr(arg.sigmask); ext_arg->argsz = arg.sigmask_sz; @@ -3268,6 +3279,11 @@ static int io_get_ext_arg(unsigned flags, const void __user *argp, ext_arg->ts_set = true; } return 0; +#ifdef CONFIG_64BIT +uaccess_end: + user_access_end(); + return -EFAULT; +#endif } SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit, -- 2.51.0 From aa00f67adc2c0d6439f81b5a81ff181377c47a7e Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 22 Oct 2024 13:47:00 -0600 Subject: [PATCH 13/16] io_uring: add support for fixed wait regions Generally applications have 1 or a few waits of waiting, yet they pass in a struct io_uring_getevents_arg every time. This needs to get copied and, in turn, the timeout value needs to get copied. Rather than do this for every invocation, allow the application to register a fixed set of wait regions that can simply be indexed when asking the kernel to wait on events. At ring setup time, the application can register a number of these wait regions and initialize region/index 0 upfront: struct io_uring_reg_wait *reg; reg = io_uring_setup_reg_wait(ring, nr_regions, &ret); /* set timeout and mark as set, sigmask/sigmask_sz as needed */ reg->ts.tv_sec = 0; reg->ts.tv_nsec = 100000; reg->flags = IORING_REG_WAIT_TS; where nr_regions >= 1 && nr_regions <= PAGE_SIZE / sizeof(*reg). The above initializes index 0, but 63 other regions can be initialized, if needed. Now, instead of doing: struct __kernel_timespec timeout = { .tv_nsec = 100000, }; io_uring_submit_and_wait_timeout(ring, &cqe, nr, &t, NULL); to wait for events for each submit_and_wait, or just wait, operation, it can just reference the above region at offset 0 and do: io_uring_submit_and_wait_reg(ring, &cqe, nr, 0); to achieve the same goal of waiting 100usec without needing to copy both struct io_uring_getevents_arg (24b) and struct __kernel_timeout (16b) for each invocation. Struct io_uring_reg_wait looks as follows: struct io_uring_reg_wait { struct __kernel_timespec ts; __u32 min_wait_usec; __u32 flags; __u64 sigmask; __u32 sigmask_sz; __u32 pad[3]; __u64 pad2[2]; }; embedding the timeout itself in the region, rather than passing it as a pointer as well. Note that the signal mask is still passed as a pointer, both for compatability reasons, but also because there doesn't seem to be a lot of high frequency waits scenarios that involve setting and resetting the signal mask for each wait. The application is free to modify any region before a wait call, or it can use keep multiple regions with different settings to avoid needing to modify the same one for wait calls. Up to a page size of regions is mapped by default, allowing PAGE_SIZE / 64 available regions for use. The registered region must fit within a page. On a 4kb page size system, that allows for 64 wait regions if a full page is used, as the size of struct io_uring_reg_wait is 64b. The region registered must be aligned to io_uring_reg_wait in size. It's valid to register less than 64 entries. In network performance testing with zero-copy, this reduced the time spent waiting on the TX side from 3.12% to 0.3% and the RX side from 4.4% to 0.3%. Wait regions are fixed for the lifetime of the ring - once registered, they are persistent until the ring is torn down. The regions support minimum wait timeout as well as the regular waits. Signed-off-by: Jens Axboe --- include/linux/io_uring_types.h | 10 +++++ include/uapi/linux/io_uring.h | 41 +++++++++++++++++ io_uring/io_uring.c | 68 +++++++++++++++++++++++----- io_uring/register.c | 82 ++++++++++++++++++++++++++++++++++ io_uring/register.h | 1 + 5 files changed, 191 insertions(+), 11 deletions(-) diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h index 841579dcdae9..2f12828b22a4 100644 --- a/include/linux/io_uring_types.h +++ b/include/linux/io_uring_types.h @@ -327,6 +327,14 @@ struct io_ring_ctx { atomic_t cq_wait_nr; atomic_t cq_timeouts; struct wait_queue_head cq_wait; + + /* + * If registered with IORING_REGISTER_CQWAIT_REG, a single + * page holds N entries, mapped in cq_wait_arg. cq_wait_index + * is the maximum allowable index. + */ + struct io_uring_reg_wait *cq_wait_arg; + unsigned char cq_wait_index; } ____cacheline_aligned_in_smp; /* timeouts */ @@ -430,6 +438,8 @@ struct io_ring_ctx { unsigned short n_sqe_pages; struct page **ring_pages; struct page **sqe_pages; + + struct page **cq_wait_page; }; struct io_tw_state { diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index 60b9c98595fa..65b7417c1b05 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -518,6 +518,7 @@ struct io_cqring_offsets { #define IORING_ENTER_EXT_ARG (1U << 3) #define IORING_ENTER_REGISTERED_RING (1U << 4) #define IORING_ENTER_ABS_TIMER (1U << 5) +#define IORING_ENTER_EXT_ARG_REG (1U << 6) /* * Passed in for io_uring_setup(2). Copied back with updated info on success @@ -620,6 +621,9 @@ enum io_uring_register_op { /* resize CQ ring */ IORING_REGISTER_RESIZE_RINGS = 33, + /* register fixed io_uring_reg_wait arguments */ + IORING_REGISTER_CQWAIT_REG = 34, + /* this goes last */ IORING_REGISTER_LAST, @@ -803,6 +807,43 @@ enum io_uring_register_restriction_op { IORING_RESTRICTION_LAST }; +enum { + IORING_REG_WAIT_TS = (1U << 0), +}; + +/* + * Argument for IORING_REGISTER_CQWAIT_REG, registering a region of + * struct io_uring_reg_wait that can be indexed when io_uring_enter(2) is + * called rather than pass in a wait argument structure separately. + */ +struct io_uring_cqwait_reg_arg { + __u32 flags; + __u32 struct_size; + __u32 nr_entries; + __u32 pad; + __u64 user_addr; + __u64 pad2[3]; +}; + +/* + * Argument for io_uring_enter(2) with + * IORING_GETEVENTS | IORING_ENTER_EXT_ARG_REG set, where the actual argument + * is an index into a previously registered fixed wait region described by + * the below structure. + */ +struct io_uring_reg_wait { + struct __kernel_timespec ts; + __u32 min_wait_usec; + __u32 flags; + __u64 sigmask; + __u32 sigmask_sz; + __u32 pad[3]; + __u64 pad2[2]; +}; + +/* + * Argument for io_uring_enter(2) with IORING_GETEVENTS | IORING_ENTER_EXT_ARG + */ struct io_uring_getevents_arg { __u64 sigmask; __u32 sigmask_sz; diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 4cd0ee52710d..2863b957e373 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -2736,6 +2736,7 @@ static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx) io_alloc_cache_free(&ctx->msg_cache, io_msg_cache_free); io_futex_cache_free(ctx); io_destroy_buffers(ctx); + io_unregister_cqwait_reg(ctx); mutex_unlock(&ctx->uring_lock); if (ctx->sq_creds) put_cred(ctx->sq_creds); @@ -3224,21 +3225,43 @@ void __io_uring_cancel(bool cancel_all) io_uring_cancel_generic(cancel_all, NULL); } -static int io_validate_ext_arg(unsigned flags, const void __user *argp, size_t argsz) +static struct io_uring_reg_wait *io_get_ext_arg_reg(struct io_ring_ctx *ctx, + const struct io_uring_getevents_arg __user *uarg) { - if (flags & IORING_ENTER_EXT_ARG) { - struct io_uring_getevents_arg arg; + struct io_uring_reg_wait *arg = READ_ONCE(ctx->cq_wait_arg); - if (argsz != sizeof(arg)) + if (arg) { + unsigned int index = (unsigned int) (uintptr_t) uarg; + + if (index <= ctx->cq_wait_index) + return arg + index; + } + + return ERR_PTR(-EFAULT); +} + +static int io_validate_ext_arg(struct io_ring_ctx *ctx, unsigned flags, + const void __user *argp, size_t argsz) +{ + struct io_uring_getevents_arg arg; + + if (!(flags & IORING_ENTER_EXT_ARG)) + return 0; + + if (flags & IORING_ENTER_EXT_ARG_REG) { + if (argsz != sizeof(struct io_uring_reg_wait)) return -EINVAL; - if (copy_from_user(&arg, argp, sizeof(arg))) - return -EFAULT; + return PTR_ERR(io_get_ext_arg_reg(ctx, argp)); } + if (argsz != sizeof(arg)) + return -EINVAL; + if (copy_from_user(&arg, argp, sizeof(arg))) + return -EFAULT; return 0; } -static int io_get_ext_arg(unsigned flags, const void __user *argp, - struct ext_arg *ext_arg) +static int io_get_ext_arg(struct io_ring_ctx *ctx, unsigned flags, + const void __user *argp, struct ext_arg *ext_arg) { const struct io_uring_getevents_arg __user *uarg = argp; struct io_uring_getevents_arg arg; @@ -3252,6 +3275,28 @@ static int io_get_ext_arg(unsigned flags, const void __user *argp, return 0; } + if (flags & IORING_ENTER_EXT_ARG_REG) { + struct io_uring_reg_wait *w; + + if (ext_arg->argsz != sizeof(struct io_uring_reg_wait)) + return -EINVAL; + w = io_get_ext_arg_reg(ctx, argp); + if (IS_ERR(w)) + return PTR_ERR(w); + + if (w->flags & ~IORING_REG_WAIT_TS) + return -EINVAL; + ext_arg->min_time = READ_ONCE(w->min_wait_usec) * NSEC_PER_USEC; + ext_arg->sig = u64_to_user_ptr(READ_ONCE(w->sigmask)); + ext_arg->argsz = READ_ONCE(w->sigmask_sz); + if (w->flags & IORING_REG_WAIT_TS) { + ext_arg->ts.tv_sec = READ_ONCE(w->ts.tv_sec); + ext_arg->ts.tv_nsec = READ_ONCE(w->ts.tv_nsec); + ext_arg->ts_set = true; + } + return 0; + } + /* * EXT_ARG is set - ensure we agree on the size of it and copy in our * timespec and sigset_t pointers if good. @@ -3297,7 +3342,8 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit, if (unlikely(flags & ~(IORING_ENTER_GETEVENTS | IORING_ENTER_SQ_WAKEUP | IORING_ENTER_SQ_WAIT | IORING_ENTER_EXT_ARG | IORING_ENTER_REGISTERED_RING | - IORING_ENTER_ABS_TIMER))) + IORING_ENTER_ABS_TIMER | + IORING_ENTER_EXT_ARG_REG))) return -EINVAL; /* @@ -3380,7 +3426,7 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit, */ mutex_lock(&ctx->uring_lock); iopoll_locked: - ret2 = io_validate_ext_arg(flags, argp, argsz); + ret2 = io_validate_ext_arg(ctx, flags, argp, argsz); if (likely(!ret2)) { min_complete = min(min_complete, ctx->cq_entries); @@ -3390,7 +3436,7 @@ iopoll_locked: } else { struct ext_arg ext_arg = { .argsz = argsz }; - ret2 = io_get_ext_arg(flags, argp, &ext_arg); + ret2 = io_get_ext_arg(ctx, flags, argp, &ext_arg); if (likely(!ret2)) { min_complete = min(min_complete, ctx->cq_entries); diff --git a/io_uring/register.c b/io_uring/register.c index fc6c94d694b2..1eb686eaa310 100644 --- a/io_uring/register.c +++ b/io_uring/register.c @@ -570,6 +570,82 @@ out: return ret; } +void io_unregister_cqwait_reg(struct io_ring_ctx *ctx) +{ + unsigned short npages = 1; + + if (!ctx->cq_wait_page) + return; + + io_pages_unmap(ctx->cq_wait_arg, &ctx->cq_wait_page, &npages, true); + ctx->cq_wait_arg = NULL; + if (ctx->user) + __io_unaccount_mem(ctx->user, 1); +} + +/* + * Register a page holding N entries of struct io_uring_reg_wait, which can + * be used via io_uring_enter(2) if IORING_GETEVENTS_EXT_ARG_REG is set. + * If that is set with IORING_GETEVENTS_EXT_ARG, then instead of passing + * in a pointer for a struct io_uring_getevents_arg, an index into this + * registered array is passed, avoiding two (arg + timeout) copies per + * invocation. + */ +static int io_register_cqwait_reg(struct io_ring_ctx *ctx, void __user *uarg) +{ + struct io_uring_cqwait_reg_arg arg; + struct io_uring_reg_wait *reg; + struct page **pages; + unsigned long len; + int nr_pages, poff; + int ret; + + if (ctx->cq_wait_page || ctx->cq_wait_arg) + return -EBUSY; + if (copy_from_user(&arg, uarg, sizeof(arg))) + return -EFAULT; + if (!arg.nr_entries || arg.flags) + return -EINVAL; + if (arg.struct_size != sizeof(*reg)) + return -EINVAL; + if (check_mul_overflow(arg.struct_size, arg.nr_entries, &len)) + return -EOVERFLOW; + if (len > PAGE_SIZE) + return -EINVAL; + /* offset + len must fit within a page, and must be reg_wait aligned */ + poff = arg.user_addr & ~PAGE_MASK; + if (len + poff > PAGE_SIZE) + return -EINVAL; + if (poff % arg.struct_size) + return -EINVAL; + + pages = io_pin_pages(arg.user_addr, len, &nr_pages); + if (IS_ERR(pages)) + return PTR_ERR(pages); + ret = -EINVAL; + if (nr_pages != 1) + goto out_free; + if (ctx->user) { + ret = __io_account_mem(ctx->user, 1); + if (ret) + goto out_free; + } + + reg = vmap(pages, 1, VM_MAP, PAGE_KERNEL); + if (reg) { + ctx->cq_wait_index = arg.nr_entries - 1; + WRITE_ONCE(ctx->cq_wait_page, pages); + WRITE_ONCE(ctx->cq_wait_arg, (void *) reg + poff); + return 0; + } + ret = -ENOMEM; + if (ctx->user) + __io_unaccount_mem(ctx->user, 1); +out_free: + io_pages_free(&pages, nr_pages); + return ret; +} + static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode, void __user *arg, unsigned nr_args) __releases(ctx->uring_lock) @@ -764,6 +840,12 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode, break; ret = io_register_resize_rings(ctx, arg); break; + case IORING_REGISTER_CQWAIT_REG: + ret = -EINVAL; + if (!arg || nr_args != 1) + break; + ret = io_register_cqwait_reg(ctx, arg); + break; default: ret = -EINVAL; break; diff --git a/io_uring/register.h b/io_uring/register.h index a5f39d5ef9e0..3e935e8fa4b2 100644 --- a/io_uring/register.h +++ b/io_uring/register.h @@ -5,5 +5,6 @@ int io_eventfd_unregister(struct io_ring_ctx *ctx); int io_unregister_personality(struct io_ring_ctx *ctx, unsigned id); struct file *io_uring_register_get_file(unsigned int fd, bool registered); +void io_unregister_cqwait_reg(struct io_ring_ctx *ctx); #endif -- 2.51.0 From a85f31052bce52111b4e9d5a536003481d0421d0 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Sun, 27 Oct 2024 08:59:10 -0600 Subject: [PATCH 14/16] io_uring/nop: add support for testing registered files and buffers Useful for testing performance/efficiency impact of registered files and buffers, vs (particularly) non-registered files. Signed-off-by: Jens Axboe --- include/uapi/linux/io_uring.h | 3 +++ io_uring/nop.c | 49 +++++++++++++++++++++++++++++++---- 2 files changed, 47 insertions(+), 5 deletions(-) diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index 65b7417c1b05..024745283783 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -416,6 +416,9 @@ enum io_uring_msg_ring_flags { * IORING_NOP_INJECT_RESULT Inject result from sqe->result */ #define IORING_NOP_INJECT_RESULT (1U << 0) +#define IORING_NOP_FILE (1U << 1) +#define IORING_NOP_FIXED_FILE (1U << 2) +#define IORING_NOP_FIXED_BUFFER (1U << 3) /* * IO completion data structure (Completion Queue Entry) diff --git a/io_uring/nop.c b/io_uring/nop.c index a5bcf3d6984f..2c7a22ba4053 100644 --- a/io_uring/nop.c +++ b/io_uring/nop.c @@ -8,35 +8,74 @@ #include #include "io_uring.h" +#include "rsrc.h" #include "nop.h" struct io_nop { /* NOTE: kiocb has the file as the first member, so don't do it here */ struct file *file; int result; + int fd; + int buffer; + unsigned int flags; }; +#define NOP_FLAGS (IORING_NOP_INJECT_RESULT | IORING_NOP_FIXED_FILE | \ + IORING_NOP_FIXED_BUFFER | IORING_NOP_FILE) + int io_nop_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { - unsigned int flags; struct io_nop *nop = io_kiocb_to_cmd(req, struct io_nop); - flags = READ_ONCE(sqe->nop_flags); - if (flags & ~IORING_NOP_INJECT_RESULT) + nop->flags = READ_ONCE(sqe->nop_flags); + if (nop->flags & ~NOP_FLAGS) return -EINVAL; - if (flags & IORING_NOP_INJECT_RESULT) + if (nop->flags & IORING_NOP_INJECT_RESULT) nop->result = READ_ONCE(sqe->len); else nop->result = 0; + if (nop->flags & IORING_NOP_FIXED_FILE) + nop->fd = READ_ONCE(sqe->fd); + if (nop->flags & IORING_NOP_FIXED_BUFFER) + nop->buffer = READ_ONCE(sqe->buf_index); return 0; } int io_nop(struct io_kiocb *req, unsigned int issue_flags) { struct io_nop *nop = io_kiocb_to_cmd(req, struct io_nop); + int ret = nop->result; + + if (nop->flags & IORING_NOP_FILE) { + if (nop->flags & IORING_NOP_FIXED_FILE) { + req->file = io_file_get_fixed(req, nop->fd, issue_flags); + req->flags |= REQ_F_FIXED_FILE; + } else { + req->file = io_file_get_normal(req, nop->fd); + } + if (!req->file) { + ret = -EBADF; + goto done; + } + } + if (nop->flags & IORING_NOP_FIXED_BUFFER) { + struct io_ring_ctx *ctx = req->ctx; + struct io_mapped_ubuf *imu; + int idx; - if (nop->result < 0) + ret = -EFAULT; + io_ring_submit_lock(ctx, issue_flags); + if (nop->buffer < ctx->nr_user_bufs) { + idx = array_index_nospec(nop->buffer, ctx->nr_user_bufs); + imu = READ_ONCE(ctx->user_bufs[idx]); + io_req_set_rsrc_node(req, ctx); + ret = 0; + } + io_ring_submit_unlock(ctx, issue_flags); + } +done: + if (ret < 0) req_set_fail(req); io_req_set_res(req, nop->result, 0); return IOU_OK; -- 2.51.0 From ff1256b8f3c45f222bce19fbfc1e1bc498b31d03 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 25 Oct 2024 08:54:28 -0600 Subject: [PATCH 15/16] io_uring/rsrc: move struct io_fixed_file to rsrc.h header There's no need for this internal structure to be visible, move it to the private rsrc.h header instead. Signed-off-by: Jens Axboe --- include/linux/io_uring_types.h | 5 ----- io_uring/filetable.h | 1 + io_uring/rsrc.h | 5 +++++ 3 files changed, 6 insertions(+), 5 deletions(-) diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h index 2f12828b22a4..d4ba4ae480d6 100644 --- a/include/linux/io_uring_types.h +++ b/include/linux/io_uring_types.h @@ -55,11 +55,6 @@ struct io_wq_work { int cancel_seq; }; -struct io_fixed_file { - /* file * with additional FFS_* flags */ - unsigned long file_ptr; -}; - struct io_file_table { struct io_fixed_file *files; unsigned long *bitmap; diff --git a/io_uring/filetable.h b/io_uring/filetable.h index b2435c4dca1f..c027ed4ad68d 100644 --- a/io_uring/filetable.h +++ b/io_uring/filetable.h @@ -4,6 +4,7 @@ #include #include +#include "rsrc.h" bool io_alloc_file_tables(struct io_file_table *table, unsigned nr_files); void io_free_file_tables(struct io_file_table *table); diff --git a/io_uring/rsrc.h b/io_uring/rsrc.h index c50d4be4aa6d..e072fb3ee351 100644 --- a/io_uring/rsrc.h +++ b/io_uring/rsrc.h @@ -40,6 +40,11 @@ struct io_rsrc_node { struct io_rsrc_put item; }; +struct io_fixed_file { + /* file * with additional FFS_* flags */ + unsigned long file_ptr; +}; + struct io_mapped_ubuf { u64 ubuf; unsigned int len; -- 2.51.0 From aaa736b186239b7dc7778ae94c75f26c96972796 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 15 Oct 2024 12:19:33 -0600 Subject: [PATCH 16/16] io_uring: specify freeptr usage for SLAB_TYPESAFE_BY_RCU io_kiocb cache Doesn't matter right now as there's still some bytes left for it, but let's prepare for the io_kiocb potentially growing and add a specific freeptr offset for it. Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 2863b957e373..a09c67b38c1b 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -3846,6 +3846,8 @@ static int __init io_uring_init(void) struct kmem_cache_args kmem_args = { .useroffset = offsetof(struct io_kiocb, cmd.data), .usersize = sizeof_field(struct io_kiocb, cmd.data), + .freeptr_offset = offsetof(struct io_kiocb, work), + .use_freeptr_offset = true, }; #define __BUILD_BUG_VERIFY_OFFSET_SIZE(stype, eoffset, esize, ename) do { \ -- 2.51.0