From 039c878db7add23c1c9ea18424c442cce76670f9 Mon Sep 17 00:00:00 2001
From: Ming Lei
Date: Thu, 7 Nov 2024 19:01:36 +0800
Subject: [PATCH 01/16] io_uring/rsrc: add & apply io_req_assign_buf_node()

The following pattern is becoming more and more common:

+	io_req_assign_rsrc_node(&req->buf_node, node);
+	req->flags |= REQ_F_BUF_NODE;

so make it a helper, which is less fragile to use than the open-coded
version; for example, the BUF_NODE flag is even missing in the current
io_uring_cmd_prep().

Signed-off-by: Ming Lei
Link: https://lore.kernel.org/r/20241107110149.890530-4-ming.lei@redhat.com
Signed-off-by: Jens Axboe
---
 io_uring/net.c       | 3 +--
 io_uring/nop.c       | 3 +--
 io_uring/rsrc.h      | 7 +++++++
 io_uring/rw.c        | 3 +--
 io_uring/uring_cmd.c | 2 +-
 5 files changed, 11 insertions(+), 7 deletions(-)

diff --git a/io_uring/net.c b/io_uring/net.c
index 2ccc2b409431..df1f7dc6f1c8 100644
--- a/io_uring/net.c
+++ b/io_uring/net.c
@@ -1348,8 +1348,7 @@ static int io_send_zc_import(struct io_kiocb *req, unsigned int issue_flags)
 	io_ring_submit_lock(ctx, issue_flags);
 	node = io_rsrc_node_lookup(&ctx->buf_table, sr->buf_index);
 	if (node) {
-		io_req_assign_rsrc_node(&sr->notif->buf_node, node);
-		sr->notif->flags |= REQ_F_BUF_NODE;
+		io_req_assign_buf_node(sr->notif, node);
 		ret = 0;
 	}
 	io_ring_submit_unlock(ctx, issue_flags);
diff --git a/io_uring/nop.c b/io_uring/nop.c
index bc22bcc739f3..6d470d4251ee 100644
--- a/io_uring/nop.c
+++ b/io_uring/nop.c
@@ -67,8 +67,7 @@ int io_nop(struct io_kiocb *req, unsigned int issue_flags)
 	io_ring_submit_lock(ctx, issue_flags);
 	node = io_rsrc_node_lookup(&ctx->buf_table, nop->buffer);
 	if (node) {
-		io_req_assign_rsrc_node(&req->buf_node, node);
-		req->flags |= REQ_F_BUF_NODE;
+		io_req_assign_buf_node(req, node);
 		ret = 0;
 	}
 	io_ring_submit_unlock(ctx, issue_flags);
diff --git a/io_uring/rsrc.h b/io_uring/rsrc.h
index c8a64a9ed5b9..7a4668deaa1a 100644
--- a/io_uring/rsrc.h
+++ b/io_uring/rsrc.h
@@ -111,6 +111,13 @@ static inline void io_req_assign_rsrc_node(struct io_rsrc_node **dst_node,
 	*dst_node = node;
 }
 
+static inline void io_req_assign_buf_node(struct io_kiocb *req,
+					  struct io_rsrc_node *node)
+{
+	io_req_assign_rsrc_node(&req->buf_node, node);
+	req->flags |= REQ_F_BUF_NODE;
+}
+
 int io_files_update(struct io_kiocb *req, unsigned int issue_flags);
 int io_files_update_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe);
 
diff --git a/io_uring/rw.c b/io_uring/rw.c
index e368b9afde03..b62cdb5fc936 100644
--- a/io_uring/rw.c
+++ b/io_uring/rw.c
@@ -341,8 +341,7 @@ static int io_prep_rw_fixed(struct io_kiocb *req, const struct io_uring_sqe *sqe
 	node = io_rsrc_node_lookup(&ctx->buf_table, req->buf_index);
 	if (!node)
 		return -EFAULT;
-	io_req_assign_rsrc_node(&req->buf_node, node);
-	req->flags |= REQ_F_BUF_NODE;
+	io_req_assign_buf_node(req, node);
 	io = req->async_data;
 	ret = io_import_fixed(ddir, &io->iter, node->buf, rw->addr, rw->len);
diff --git a/io_uring/uring_cmd.c b/io_uring/uring_cmd.c
index 40b8b777ba12..b62965f58f30 100644
--- a/io_uring/uring_cmd.c
+++ b/io_uring/uring_cmd.c
@@ -219,7 +219,7 @@ int io_uring_cmd_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 		 * being called. This prevents destruction of the mapped buffer
 		 * we'll need at actual import time.
 		 */
-		io_req_assign_rsrc_node(&req->buf_node, node);
+		io_req_assign_buf_node(req, node);
 	}
 
 	ioucmd->cmd_op = READ_ONCE(sqe->cmd_op);
-- 
2.51.0

From a43e236fb9aef4528f2bd24095d1f348030f5d9d Mon Sep 17 00:00:00 2001
From: Ming Lei
Date: Mon, 11 Nov 2024 18:13:18 +0800
Subject: [PATCH 02/16] io_uring/uring_cmd: fix buffer index retrieval

Add back buffer index retrieval for IORING_URING_CMD_FIXED.

Reported-by: Guangwu Zhang
Cc: Jeff Moyer
Fixes: b54a14041ee6 ("io_uring/rsrc: add io_rsrc_node_lookup() helper")
Signed-off-by: Ming Lei
Reviewed-by: Kanchan Joshi
Reviewed-by: Anuj Gupta
Tested-by: Guangwu Zhang
Link: https://lore.kernel.org/r/20241111101318.1387557-1-ming.lei@redhat.com
Signed-off-by: Jens Axboe
---
 io_uring/uring_cmd.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/io_uring/uring_cmd.c b/io_uring/uring_cmd.c
index b62965f58f30..e9d99d3ecc34 100644
--- a/io_uring/uring_cmd.c
+++ b/io_uring/uring_cmd.c
@@ -210,8 +210,9 @@ int io_uring_cmd_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 	if (ioucmd->flags & IORING_URING_CMD_FIXED) {
 		struct io_ring_ctx *ctx = req->ctx;
 		struct io_rsrc_node *node;
+		u16 index = READ_ONCE(sqe->buf_index);
 
-		node = io_rsrc_node_lookup(&ctx->buf_table, req->buf_index);
+		node = io_rsrc_node_lookup(&ctx->buf_table, index);
 		if (unlikely(!node))
 			return -EFAULT;
 		/*
-- 
2.51.0

From b9d69371e8fa90fa3ab100f4fcb4815b13b3673a Mon Sep 17 00:00:00 2001
From: Pavel Begunkov
Date: Wed, 13 Nov 2024 03:26:01 +0000
Subject: [PATCH 03/16] io_uring: fix invalid hybrid polling ctx leaks

The ctx has already been allocated by the point where the hybrid poll
configuration is checked; a plain return leaks the memory.

Fixes: 01ee194d1aba1 ("io_uring: add support for hybrid IOPOLL")
Signed-off-by: Pavel Begunkov
Reviewed-by: Anuj Gupta
Link: https://lore.kernel.org/r/b57f2608088020501d352fcdeebdb949e281d65b.1731468230.git.asml.silence@gmail.com
Signed-off-by: Jens Axboe
---
 io_uring/io_uring.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index 042a65d38d0c..bd71782057de 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -3616,11 +3616,6 @@ static __cold int io_uring_create(unsigned entries, struct io_uring_params *p,
 	if (!(ctx->flags & IORING_SETUP_NO_SQARRAY))
 		static_branch_inc(&io_key_has_sqarray);
 
-	/* HYBRID_IOPOLL only valid with IOPOLL */
-	if ((ctx->flags & (IORING_SETUP_IOPOLL|IORING_SETUP_HYBRID_IOPOLL)) ==
-	    IORING_SETUP_HYBRID_IOPOLL)
-		return -EINVAL;
-
 	if ((ctx->flags & IORING_SETUP_DEFER_TASKRUN) &&
 	    !(ctx->flags & IORING_SETUP_IOPOLL) &&
 	    !(ctx->flags & IORING_SETUP_SQPOLL))
@@ -3671,6 +3666,11 @@ static __cold int io_uring_create(unsigned entries, struct io_uring_params *p,
 		ctx->notify_method = TWA_SIGNAL;
 	}
 
+	/* HYBRID_IOPOLL only valid with IOPOLL */
+	if ((ctx->flags & (IORING_SETUP_IOPOLL|IORING_SETUP_HYBRID_IOPOLL)) ==
+	    IORING_SETUP_HYBRID_IOPOLL)
+		goto err;
+
 	/*
 	 * For DEFER_TASKRUN we require the completion task to be the same as the
 	 * submission task. This implies that there is only one submitter, so enforce
-- 
2.51.0

From 56cec28dc4da396d6032c59ae9614c5a6ae7d7a8 Mon Sep 17 00:00:00 2001
From: Al Viro
Date: Fri, 15 Nov 2024 03:49:02 +0000
Subject: [PATCH 04/16] switch io_msg_ring() to CLASS(fd)

Use CLASS(fd) to get the file for sync message ring requests, rather
than open-coding the file retrieval dance.
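
For readers new to the scope-based cleanup helpers: CLASS(fd, f)(fd)
declares a struct fd whose release runs automatically when f goes out
of scope, so every early return drops the file reference without an
explicit fdput(). A self-contained userspace analogue of the mechanism,
built on the compiler cleanup attribute the kernel macros use (a sketch
for illustration only, not the actual definitions in linux/file.h):

	#include <stdio.h>

	static void closep(FILE **f)
	{
		if (*f)
			fclose(*f);
	}

	int use_file(const char *path)
	{
		/* cleanup runs on every path out of this scope */
		FILE *f __attribute__((cleanup(closep))) = fopen(path, "r");

		if (!f)
			return -1;	/* nothing acquired, nothing to drop */
		/* ... use f; fclose() happens automatically on return ... */
		return 0;
	}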
Signed-off-by: Al Viro
Link: https://lore.kernel.org/r/20241115034902.GP3387508@ZenIV
[axboe: make a more coherent commit message]
Signed-off-by: Jens Axboe
---
 io_uring/msg_ring.c | 18 +++++++-----------
 1 file changed, 7 insertions(+), 11 deletions(-)

diff --git a/io_uring/msg_ring.c b/io_uring/msg_ring.c
index e63af34004b7..333c220d322a 100644
--- a/io_uring/msg_ring.c
+++ b/io_uring/msg_ring.c
@@ -333,7 +333,6 @@ done:
 int io_uring_sync_msg_ring(struct io_uring_sqe *sqe)
 {
 	struct io_msg io_msg = { };
-	struct fd f;
 	int ret;
 
 	ret = __io_msg_ring_prep(&io_msg, sqe);
@@ -347,16 +346,13 @@ int io_uring_sync_msg_ring(struct io_uring_sqe *sqe)
 	if (io_msg.cmd != IORING_MSG_DATA)
 		return -EINVAL;
 
-	ret = -EBADF;
-	f = fdget(sqe->fd);
-	if (fd_file(f)) {
-		ret = -EBADFD;
-		if (io_is_uring_fops(fd_file(f)))
-			ret = __io_msg_ring_data(fd_file(f)->private_data,
-						 &io_msg, IO_URING_F_UNLOCKED);
-		fdput(f);
-	}
-	return ret;
+	CLASS(fd, f)(sqe->fd);
+	if (fd_empty(f))
+		return -EBADF;
+	if (!io_is_uring_fops(fd_file(f)))
+		return -EBADFD;
+	return __io_msg_ring_data(fd_file(f)->private_data,
+				  &io_msg, IO_URING_F_UNLOCKED);
 }
 
 void io_msg_cache_free(const void *entry)
-- 
2.51.0

From 68685fa20edc5307fc893a06473c19661c236f29 Mon Sep 17 00:00:00 2001
From: Pavel Begunkov
Date: Fri, 15 Nov 2024 16:54:38 +0000
Subject: [PATCH 05/16] io_uring: fortify io_pin_pages with a warning

We're a bit too frivolous with the types of the nr_pages arguments,
converting it to long and back to int, passing an unsigned int pointer
as an int pointer and so on. It shouldn't cause any problems but should
be carefully reviewed; until then, let's add a WARN_ON_ONCE check to be
more confident that callers don't pass poorly checked arguments.

Signed-off-by: Pavel Begunkov
Link: https://lore.kernel.org/r/d48e0c097cbd90fb47acaddb6c247596510d8cfc.1731689588.git.asml.silence@gmail.com
Signed-off-by: Jens Axboe
---
 io_uring/memmap.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/io_uring/memmap.c b/io_uring/memmap.c
index 85c66fa54956..6ab59c60dfd0 100644
--- a/io_uring/memmap.c
+++ b/io_uring/memmap.c
@@ -140,6 +140,8 @@ struct page **io_pin_pages(unsigned long uaddr, unsigned long len, int *npages)
 	nr_pages = end - start;
 	if (WARN_ON_ONCE(!nr_pages))
 		return ERR_PTR(-EINVAL);
+	if (WARN_ON_ONCE(nr_pages > INT_MAX))
+		return ERR_PTR(-EOVERFLOW);
 
 	pages = kvmalloc_array(nr_pages, sizeof(struct page *), GFP_KERNEL);
 	if (!pages)
-- 
2.51.0

From 3730aebbdac8770f64ab66eb5e7129bc8dae731d Mon Sep 17 00:00:00 2001
From: Pavel Begunkov
Date: Fri, 15 Nov 2024 16:54:39 +0000
Subject: [PATCH 06/16] io_uring: disable ENTER_EXT_ARG_REG for IOPOLL

IOPOLL doesn't use the extended arguments, so there is no need for it
to support IORING_ENTER_EXT_ARG_REG. Let's disable it for IOPOLL; if
anything, it leaves more space for future extensions.
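
To make the userspace-visible effect concrete: on a ring created with
IORING_SETUP_IOPOLL, a wait with IORING_ENTER_EXT_ARG_REG now fails
validation before the argument is even looked at. A hedged sketch using
the raw syscall (ring setup and error handling elided; ring_fd is an
assumed IOPOLL ring):

	#include <unistd.h>
	#include <sys/syscall.h>
	#include <linux/io_uring.h>

	static int poke(int ring_fd)
	{
		/* after this patch: returns -1 with errno == EINVAL */
		return syscall(__NR_io_uring_enter, ring_fd, 0, 1,
			       IORING_ENTER_GETEVENTS | IORING_ENTER_EXT_ARG |
			       IORING_ENTER_EXT_ARG_REG,
			       NULL, sizeof(struct io_uring_reg_wait));
	}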
Signed-off-by: Pavel Begunkov
Link: https://lore.kernel.org/r/a35ecd919dbdc17bd5b7932273e317832c531b45.1731689588.git.asml.silence@gmail.com
Signed-off-by: Jens Axboe
---
 io_uring/io_uring.c | 8 ++------
 1 file changed, 2 insertions(+), 6 deletions(-)

diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index bd71782057de..464a70bde7e6 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -3214,12 +3214,8 @@ static int io_validate_ext_arg(struct io_ring_ctx *ctx, unsigned flags,
 
 	if (!(flags & IORING_ENTER_EXT_ARG))
 		return 0;
-
-	if (flags & IORING_ENTER_EXT_ARG_REG) {
-		if (argsz != sizeof(struct io_uring_reg_wait))
-			return -EINVAL;
-		return PTR_ERR(io_get_ext_arg_reg(ctx, argp));
-	}
+	if (flags & IORING_ENTER_EXT_ARG_REG)
+		return -EINVAL;
 	if (argsz != sizeof(arg))
 		return -EINVAL;
 	if (copy_from_user(&arg, argp, sizeof(arg)))
-- 
2.51.0

From 83e041522eb9c45479f4490b212687cf1e7e9999 Mon Sep 17 00:00:00 2001
From: Pavel Begunkov
Date: Fri, 15 Nov 2024 16:54:40 +0000
Subject: [PATCH 07/16] io_uring: temporarily disable registered waits

Disable wait argument registration as it'll be replaced with a more
generic feature. We'll still need IORING_ENTER_EXT_ARG_REG parsing in
a few commits, so leave it be.

Signed-off-by: Pavel Begunkov
Link: https://lore.kernel.org/r/70b1d1d218c41ba77a76d1789c8641dab0b0563e.1731689588.git.asml.silence@gmail.com
Signed-off-by: Jens Axboe
---
 include/linux/io_uring_types.h | 10 -----
 include/uapi/linux/io_uring.h  |  3 --
 io_uring/io_uring.c            | 10 -----
 io_uring/register.c            | 82 ----------------------------------
 io_uring/register.h            |  1 -
 5 files changed, 106 deletions(-)

diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h
index 072e65e93105..52a5da99a205 100644
--- a/include/linux/io_uring_types.h
+++ b/include/linux/io_uring_types.h
@@ -330,14 +330,6 @@ struct io_ring_ctx {
 		atomic_t		cq_wait_nr;
 		atomic_t		cq_timeouts;
 		struct wait_queue_head	cq_wait;
-
-		/*
-		 * If registered with IORING_REGISTER_CQWAIT_REG, a single
-		 * page holds N entries, mapped in cq_wait_arg. cq_wait_index
-		 * is the maximum allowable index.
-		 */
-		struct io_uring_reg_wait	*cq_wait_arg;
-		unsigned char			cq_wait_index;
 	} ____cacheline_aligned_in_smp;
 
 	/* timeouts */
@@ -431,8 +423,6 @@ struct io_ring_ctx {
 	unsigned short			n_sqe_pages;
 	struct page			**ring_pages;
 	struct page			**sqe_pages;
-
-	struct page			**cq_wait_page;
 };
 
 struct io_tw_state {
diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index 5d08435b95a8..132f5db3d4e8 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -627,9 +627,6 @@ enum io_uring_register_op {
 	/* resize CQ ring */
 	IORING_REGISTER_RESIZE_RINGS		= 33,
 
-	/* register fixed io_uring_reg_wait arguments */
-	IORING_REGISTER_CQWAIT_REG		= 34,
-
 	/* this goes last */
 	IORING_REGISTER_LAST,
 
diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index 464a70bde7e6..286b7bb73978 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -2709,7 +2709,6 @@ static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx)
 	io_alloc_cache_free(&ctx->msg_cache, io_msg_cache_free);
 	io_futex_cache_free(ctx);
 	io_destroy_buffers(ctx);
-	io_unregister_cqwait_reg(ctx);
 	mutex_unlock(&ctx->uring_lock);
 	if (ctx->sq_creds)
 		put_cred(ctx->sq_creds);
@@ -3195,15 +3194,6 @@ void __io_uring_cancel(bool cancel_all)
 static struct io_uring_reg_wait *io_get_ext_arg_reg(struct io_ring_ctx *ctx,
 			const struct io_uring_getevents_arg __user *uarg)
 {
-	struct io_uring_reg_wait *arg = READ_ONCE(ctx->cq_wait_arg);
-
-	if (arg) {
-		unsigned int index = (unsigned int) (uintptr_t) uarg;
-
-		if (index <= ctx->cq_wait_index)
-			return arg + index;
-	}
-
 	return ERR_PTR(-EFAULT);
 }
diff --git a/io_uring/register.c b/io_uring/register.c
index 45edfc57963a..3c5a3cfb186b 100644
--- a/io_uring/register.c
+++ b/io_uring/register.c
@@ -570,82 +570,6 @@ out:
 	return ret;
 }
 
-void io_unregister_cqwait_reg(struct io_ring_ctx *ctx)
-{
-	unsigned short npages = 1;
-
-	if (!ctx->cq_wait_page)
-		return;
-
-	io_pages_unmap(ctx->cq_wait_arg, &ctx->cq_wait_page, &npages, true);
-	ctx->cq_wait_arg = NULL;
-	if (ctx->user)
-		__io_unaccount_mem(ctx->user, 1);
-}
-
-/*
- * Register a page holding N entries of struct io_uring_reg_wait, which can
- * be used via io_uring_enter(2) if IORING_GETEVENTS_EXT_ARG_REG is set.
- * If that is set with IORING_GETEVENTS_EXT_ARG, then instead of passing
- * in a pointer for a struct io_uring_getevents_arg, an index into this
- * registered array is passed, avoiding two (arg + timeout) copies per
- * invocation.
- */
-static int io_register_cqwait_reg(struct io_ring_ctx *ctx, void __user *uarg)
-{
-	struct io_uring_cqwait_reg_arg arg;
-	struct io_uring_reg_wait *reg;
-	struct page **pages;
-	unsigned long len;
-	int nr_pages, poff;
-	int ret;
-
-	if (ctx->cq_wait_page || ctx->cq_wait_arg)
-		return -EBUSY;
-	if (copy_from_user(&arg, uarg, sizeof(arg)))
-		return -EFAULT;
-	if (!arg.nr_entries || arg.flags)
-		return -EINVAL;
-	if (arg.struct_size != sizeof(*reg))
-		return -EINVAL;
-	if (check_mul_overflow(arg.struct_size, arg.nr_entries, &len))
-		return -EOVERFLOW;
-	if (len > PAGE_SIZE)
-		return -EINVAL;
-	/* offset + len must fit within a page, and must be reg_wait aligned */
-	poff = arg.user_addr & ~PAGE_MASK;
-	if (len + poff > PAGE_SIZE)
-		return -EINVAL;
-	if (poff % arg.struct_size)
-		return -EINVAL;
-
-	pages = io_pin_pages(arg.user_addr, len, &nr_pages);
-	if (IS_ERR(pages))
-		return PTR_ERR(pages);
-	ret = -EINVAL;
-	if (nr_pages != 1)
-		goto out_free;
-	if (ctx->user) {
-		ret = __io_account_mem(ctx->user, 1);
-		if (ret)
-			goto out_free;
-	}
-
-	reg = vmap(pages, 1, VM_MAP, PAGE_KERNEL);
-	if (reg) {
-		ctx->cq_wait_index = arg.nr_entries - 1;
-		WRITE_ONCE(ctx->cq_wait_page, pages);
-		WRITE_ONCE(ctx->cq_wait_arg, (void *) reg + poff);
-		return 0;
-	}
-	ret = -ENOMEM;
-	if (ctx->user)
-		__io_unaccount_mem(ctx->user, 1);
-out_free:
-	io_pages_free(&pages, nr_pages);
-	return ret;
-}
-
 static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
 			       void __user *arg, unsigned nr_args)
 	__releases(ctx->uring_lock)
@@ -840,12 +764,6 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
 			break;
 		ret = io_register_resize_rings(ctx, arg);
 		break;
-	case IORING_REGISTER_CQWAIT_REG:
-		ret = -EINVAL;
-		if (!arg || nr_args != 1)
-			break;
-		ret = io_register_cqwait_reg(ctx, arg);
-		break;
 	default:
 		ret = -EINVAL;
 		break;
diff --git a/io_uring/register.h b/io_uring/register.h
index 3e935e8fa4b2..a5f39d5ef9e0 100644
--- a/io_uring/register.h
+++ b/io_uring/register.h
@@ -5,6 +5,5 @@
 int io_eventfd_unregister(struct io_ring_ctx *ctx);
 int io_unregister_personality(struct io_ring_ctx *ctx, unsigned id);
 struct file *io_uring_register_get_file(unsigned int fd, bool registered);
-void io_unregister_cqwait_reg(struct io_ring_ctx *ctx);
 
 #endif
-- 
2.51.0

From dfbbfbf191878e8dd422768ce009858d8b5b761e Mon Sep 17 00:00:00 2001
From: Pavel Begunkov
Date: Fri, 15 Nov 2024 16:54:41 +0000
Subject: [PATCH 08/16] io_uring: introduce concept of memory regions

We've got a good number of mappings we share with userspace, including
the main rings, provided buffer rings, upcoming rings for zerocopy rx
and more. All of them duplicate user argument parsing and some internal
details as well (page pinning, huge page optimisations, mmap'ing, etc.).

Introduce a notion of regions. For userspace, for now, it's just a new
structure called struct io_uring_region_desc which is supposed to
parameterise all such mapping / queue creations. A region either
represents a user-provided chunk of memory, in which case the user_addr
field should point to it, or a request for the kernel to allocate the
memory, in which case the user would need to mmap it after using the
offset returned in the mmap_offset field. With a uniform userspace API
we can avoid additional boilerplate code and apply future optimisations
to all of them at once.

Internally, there is a new structure struct io_mapped_region holding
all relevant runtime information and some helpers to work with it.
This patch limits it to user-provided regions.
Signed-off-by: Pavel Begunkov
Link: https://lore.kernel.org/r/0e6fe25818dfbaebd1bd90b870a6cac503fe1a24.1731689588.git.asml.silence@gmail.com
Signed-off-by: Jens Axboe
---
 include/linux/io_uring_types.h |  6 +++
 include/uapi/linux/io_uring.h  | 14 +++++++
 io_uring/memmap.c              | 67 ++++++++++++++++++++++++++++++++++
 io_uring/memmap.h              | 14 +++++++
 4 files changed, 101 insertions(+)

diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h
index 52a5da99a205..1d3a37234ace 100644
--- a/include/linux/io_uring_types.h
+++ b/include/linux/io_uring_types.h
@@ -75,6 +75,12 @@ struct io_hash_table {
 	unsigned		hash_bits;
 };
 
+struct io_mapped_region {
+	struct page		**pages;
+	void			*vmap_ptr;
+	size_t			nr_pages;
+};
+
 /*
  * Arbitrary limit, can be raised if need be
  */
diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index 132f5db3d4e8..5cbfd330c688 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -647,6 +647,20 @@ struct io_uring_files_update {
 	__aligned_u64 /* __s32 * */ fds;
 };
 
+enum {
+	/* initialise with user provided memory pointed by user_addr */
+	IORING_MEM_REGION_TYPE_USER		= 1,
+};
+
+struct io_uring_region_desc {
+	__u64 user_addr;
+	__u64 size;
+	__u32 flags;
+	__u32 id;
+	__u64 mmap_offset;
+	__u64 __resv[4];
+};
+
 /*
  * Register a fully sparse file space, rather than pass in an array of all
  * -1 file descriptors.
diff --git a/io_uring/memmap.c b/io_uring/memmap.c
index 6ab59c60dfd0..bbd9569a0120 100644
--- a/io_uring/memmap.c
+++ b/io_uring/memmap.c
@@ -12,6 +12,7 @@
 
 #include "memmap.h"
 #include "kbuf.h"
+#include "rsrc.h"
 
 static void *io_mem_alloc_compound(struct page **pages, int nr_pages,
 				   size_t size, gfp_t gfp)
@@ -194,6 +195,72 @@ void *__io_uaddr_map(struct page ***pages, unsigned short *npages,
 	return ERR_PTR(-ENOMEM);
 }
 
+void io_free_region(struct io_ring_ctx *ctx, struct io_mapped_region *mr)
+{
+	if (mr->pages) {
+		unpin_user_pages(mr->pages, mr->nr_pages);
+		kvfree(mr->pages);
+	}
+	if (mr->vmap_ptr)
+		vunmap(mr->vmap_ptr);
+	if (mr->nr_pages && ctx->user)
+		__io_unaccount_mem(ctx->user, mr->nr_pages);
+
+	memset(mr, 0, sizeof(*mr));
+}
+
+int io_create_region(struct io_ring_ctx *ctx, struct io_mapped_region *mr,
+		     struct io_uring_region_desc *reg)
+{
+	int pages_accounted = 0;
+	struct page **pages;
+	int nr_pages, ret;
+	void *vptr;
+	u64 end;
+
+	if (WARN_ON_ONCE(mr->pages || mr->vmap_ptr || mr->nr_pages))
+		return -EFAULT;
+	if (memchr_inv(&reg->__resv, 0, sizeof(reg->__resv)))
+		return -EINVAL;
+	if (reg->flags != IORING_MEM_REGION_TYPE_USER)
+		return -EINVAL;
+	if (!reg->user_addr)
+		return -EFAULT;
+	if (!reg->size || reg->mmap_offset || reg->id)
+		return -EINVAL;
+	if ((reg->size >> PAGE_SHIFT) > INT_MAX)
+		return E2BIG;
+	if ((reg->user_addr | reg->size) & ~PAGE_MASK)
+		return -EINVAL;
+	if (check_add_overflow(reg->user_addr, reg->size, &end))
+		return -EOVERFLOW;
+
+	pages = io_pin_pages(reg->user_addr, reg->size, &nr_pages);
+	if (IS_ERR(pages))
+		return PTR_ERR(pages);
+
+	if (ctx->user) {
+		ret = __io_account_mem(ctx->user, nr_pages);
+		if (ret)
+			goto out_free;
+		pages_accounted = nr_pages;
+	}
+
+	vptr = vmap(pages, nr_pages, VM_MAP, PAGE_KERNEL);
+	if (!vptr)
+		goto out_free;
+
+	mr->pages = pages;
+	mr->vmap_ptr = vptr;
+	mr->nr_pages = nr_pages;
+	return 0;
+out_free:
+	if (pages_accounted)
+		__io_unaccount_mem(ctx->user, pages_accounted);
+	io_pages_free(&pages, nr_pages);
+	return ret;
+}
+
 static void *io_uring_validate_mmap_request(struct file *file, loff_t pgoff,
 					    size_t sz)
 {
diff --git a/io_uring/memmap.h b/io_uring/memmap.h
index 5cec5b7ac49a..f361a635b6c7 100644
--- a/io_uring/memmap.h
+++ b/io_uring/memmap.h
@@ -22,4 +22,18 @@ unsigned long io_uring_get_unmapped_area(struct file *file, unsigned long addr,
 					 unsigned long flags);
 int io_uring_mmap(struct file *file, struct vm_area_struct *vma);
 
+void io_free_region(struct io_ring_ctx *ctx, struct io_mapped_region *mr);
+int io_create_region(struct io_ring_ctx *ctx, struct io_mapped_region *mr,
+		     struct io_uring_region_desc *reg);
+
+static inline void *io_region_get_ptr(struct io_mapped_region *mr)
+{
+	return mr->vmap_ptr;
+}
+
+static inline bool io_region_is_set(struct io_mapped_region *mr)
+{
+	return !!mr->nr_pages;
+}
+
 #endif
-- 
2.51.0

From 93238e66185524aad925acefb2312203b9e26d63 Mon Sep 17 00:00:00 2001
From: Pavel Begunkov
Date: Fri, 15 Nov 2024 16:54:42 +0000
Subject: [PATCH 09/16] io_uring: add memory region registration

Regions will serve multiple purposes. First, with them we can decouple
ring/etc. object creation from registration / mapping of the memory
they will be placed in. We already have hacks that allow putting both
SQ and CQ into the same huge page; in the future we should be able to
do:

	region = create_region(io_ring);
	create_pbuf_ring(io_uring, region, offset=0);
	create_pbuf_ring(io_uring, region, offset=N);

The second use case is efficiently passing parameters. The following
patch re-enables IORING_ENTER_EXT_ARG_REG on top of regions, which
optimises wait arguments. It'll also be useful for request arguments
replacing iovecs, msghdr, etc. pointers. Eventually it would be handy
for BPF as well, if that comes to fruition.

Signed-off-by: Pavel Begunkov
Link: https://lore.kernel.org/r/0798cf3a14fad19cfc96fc9feca5f3e11481691d.1731689588.git.asml.silence@gmail.com
Signed-off-by: Jens Axboe
---
 include/linux/io_uring_types.h |  3 +++
 include/uapi/linux/io_uring.h  |  8 ++++++++
 io_uring/io_uring.c            |  1 +
 io_uring/register.c            | 37 ++++++++++++++++++++++++++++++++++
 4 files changed, 49 insertions(+)

diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h
index 1d3a37234ace..e1d69123e164 100644
--- a/include/linux/io_uring_types.h
+++ b/include/linux/io_uring_types.h
@@ -429,6 +429,9 @@ struct io_ring_ctx {
 	unsigned short			n_sqe_pages;
 	struct page			**ring_pages;
 	struct page			**sqe_pages;
+
+	/* used for optimised request parameter and wait argument passing */
+	struct io_mapped_region		param_region;
 };
diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index 5cbfd330c688..1ee35890125b 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -627,6 +627,8 @@ enum io_uring_register_op {
 	/* resize CQ ring */
 	IORING_REGISTER_RESIZE_RINGS		= 33,
 
+	IORING_REGISTER_MEM_REGION		= 34,
+
 	/* this goes last */
 	IORING_REGISTER_LAST,
 
@@ -661,6 +663,12 @@ struct io_uring_region_desc {
 	__u64 __resv[4];
 };
 
+struct io_uring_mem_region_reg {
+	__u64 region_uptr; /* struct io_uring_region_desc * */
+	__u64 flags;
+	__u64 __resv[2];
+};
+
 /*
  * Register a fully sparse file space, rather than pass in an array of all
  * -1 file descriptors.
diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index 286b7bb73978..c640b8a4ceee 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -2709,6 +2709,7 @@ static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx)
 	io_alloc_cache_free(&ctx->msg_cache, io_msg_cache_free);
 	io_futex_cache_free(ctx);
 	io_destroy_buffers(ctx);
+	io_free_region(ctx, &ctx->param_region);
 	mutex_unlock(&ctx->uring_lock);
 	if (ctx->sq_creds)
 		put_cred(ctx->sq_creds);
diff --git a/io_uring/register.c b/io_uring/register.c
index 3c5a3cfb186b..2cbac3d9b288 100644
--- a/io_uring/register.c
+++ b/io_uring/register.c
@@ -570,6 +570,37 @@ out:
 	return ret;
 }
 
+static int io_register_mem_region(struct io_ring_ctx *ctx, void __user *uarg)
+{
+	struct io_uring_mem_region_reg __user *reg_uptr = uarg;
+	struct io_uring_mem_region_reg reg;
+	struct io_uring_region_desc __user *rd_uptr;
+	struct io_uring_region_desc rd;
+	int ret;
+
+	if (io_region_is_set(&ctx->param_region))
+		return -EBUSY;
+	if (copy_from_user(&reg, reg_uptr, sizeof(reg)))
+		return -EFAULT;
+	rd_uptr = u64_to_user_ptr(reg.region_uptr);
+	if (copy_from_user(&rd, rd_uptr, sizeof(rd)))
+		return -EFAULT;
+
+	if (memchr_inv(&reg.__resv, 0, sizeof(reg.__resv)))
+		return -EINVAL;
+	if (reg.flags)
+		return -EINVAL;
+
+	ret = io_create_region(ctx, &ctx->param_region, &rd);
+	if (ret)
+		return ret;
+	if (copy_to_user(rd_uptr, &rd, sizeof(rd))) {
+		io_free_region(ctx, &ctx->param_region);
+		return -EFAULT;
+	}
+	return 0;
+}
+
 static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
 			       void __user *arg, unsigned nr_args)
 	__releases(ctx->uring_lock)
@@ -764,6 +795,12 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
 			break;
 		ret = io_register_resize_rings(ctx, arg);
 		break;
+	case IORING_REGISTER_MEM_REGION:
+		ret = -EINVAL;
+		if (!arg || nr_args != 1)
+			break;
+		ret = io_register_mem_region(ctx, arg);
+		break;
 	default:
 		ret = -EINVAL;
 		break;
-- 
2.51.0

From d617b3147d54c42351eac63b5398d4ddf4f4011b Mon Sep 17 00:00:00 2001
From: Pavel Begunkov
Date: Fri, 15 Nov 2024 16:54:43 +0000
Subject: [PATCH 10/16] io_uring: restore back registered wait arguments

Now that we've got a more generic region registration API, wire
IORING_ENTER_EXT_ARG_REG up to it and re-enable the feature.

First, the user has to register a region with the
IORING_MEM_REGION_REG_WAIT_ARG flag set. It can only be done for a
ring in a disabled state, aka IORING_SETUP_R_DISABLED, to avoid races
with already running waiters. With that we should have stable constant
values for ctx->cq_wait_{size,arg} in io_get_ext_arg_reg() and hence
no READ_ONCE required.

The other API difference is that we're now passing byte offsets
instead of indexes. The user _must_ align all offsets / pointers to
the native word size; failing to do so may (but does not necessarily)
lead to a failure, usually returned as -EFAULT. liburing will hide
these details from users.
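
Putting the pieces together, the intended flow looks roughly like this
(a hedged userspace sketch with raw syscalls; page is an assumed
page-aligned unsigned char buffer, ring_fd was created with
IORING_SETUP_R_DISABLED, and the ring is enabled afterwards via
IORING_REGISTER_ENABLE_RINGS):

	struct io_uring_region_desc rd = {
		.user_addr = (__u64)(uintptr_t)page,
		.size      = 4096,
		.flags     = IORING_MEM_REGION_TYPE_USER,
	};
	struct io_uring_mem_region_reg mr = {
		.region_uptr = (__u64)(uintptr_t)&rd,
		.flags       = IORING_MEM_REGION_REG_WAIT_ARG,
	};

	syscall(__NR_io_uring_register, ring_fd,
		IORING_REGISTER_MEM_REGION, &mr, 1);

	/* wait arguments now live at word-aligned byte offsets inside
	 * the region; what used to be an index is a byte offset */
	size_t off = 0;	/* must satisfy off % sizeof(long) == 0 */
	struct io_uring_reg_wait *w = (void *)(page + off);
	/* ... fill in w (timeout, flags, etc.) ... */

	syscall(__NR_io_uring_enter, ring_fd, 0, 1,
		IORING_ENTER_GETEVENTS | IORING_ENTER_EXT_ARG |
		IORING_ENTER_EXT_ARG_REG,
		(void *)off, sizeof(*w));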
Signed-off-by: Pavel Begunkov
Link: https://lore.kernel.org/r/81822c1b4ffbe8ad391b4f9ad1564def0d26d990.1731689588.git.asml.silence@gmail.com
Signed-off-by: Jens Axboe
---
 include/linux/io_uring_types.h |  3 +++
 include/uapi/linux/io_uring.h  |  5 +++++
 io_uring/io_uring.c            | 14 +++++++++++++-
 io_uring/register.c            | 16 +++++++++++++++-
 4 files changed, 36 insertions(+), 2 deletions(-)

diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h
index e1d69123e164..aa5f5ea98076 100644
--- a/include/linux/io_uring_types.h
+++ b/include/linux/io_uring_types.h
@@ -324,6 +324,9 @@ struct io_ring_ctx {
 		unsigned		cq_entries;
 		struct io_ev_fd	__rcu	*io_ev_fd;
 		unsigned		cq_extra;
+
+		void			*cq_wait_arg;
+		size_t			cq_wait_size;
 	} ____cacheline_aligned_in_smp;
 
 	/*
diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index 1ee35890125b..4418d0192959 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -663,6 +663,11 @@ struct io_uring_region_desc {
 	__u64 __resv[4];
 };
 
+enum {
+	/* expose the region as registered wait arguments */
+	IORING_MEM_REGION_REG_WAIT_ARG		= 1,
+};
+
 struct io_uring_mem_region_reg {
 	__u64 region_uptr; /* struct io_uring_region_desc * */
 	__u64 flags;
diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index c640b8a4ceee..da8fd460977b 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -3195,7 +3195,19 @@ void __io_uring_cancel(bool cancel_all)
 static struct io_uring_reg_wait *io_get_ext_arg_reg(struct io_ring_ctx *ctx,
 			const struct io_uring_getevents_arg __user *uarg)
 {
-	return ERR_PTR(-EFAULT);
+	unsigned long size = sizeof(struct io_uring_reg_wait);
+	unsigned long offset = (uintptr_t)uarg;
+	unsigned long end;
+
+	if (unlikely(offset % sizeof(long)))
+		return ERR_PTR(-EFAULT);
+
+	/* also protects from NULL ->cq_wait_arg as the size would be 0 */
+	if (unlikely(check_add_overflow(offset, size, &end) ||
+		     end > ctx->cq_wait_size))
+		return ERR_PTR(-EFAULT);
+
+	return ctx->cq_wait_arg + offset;
 }
 
 static int io_validate_ext_arg(struct io_ring_ctx *ctx, unsigned flags,
diff --git a/io_uring/register.c b/io_uring/register.c
index 2cbac3d9b288..1a60f4916649 100644
--- a/io_uring/register.c
+++ b/io_uring/register.c
@@ -588,7 +588,16 @@ static int io_register_mem_region(struct io_ring_ctx *ctx, void __user *uarg)
 
 	if (memchr_inv(&reg.__resv, 0, sizeof(reg.__resv)))
 		return -EINVAL;
-	if (reg.flags)
+	if (reg.flags & ~IORING_MEM_REGION_REG_WAIT_ARG)
+		return -EINVAL;
+
+	/*
+	 * This ensures there are no waiters. Waiters are unlocked and it's
+	 * hard to synchronise with them, especially if we need to initialise
+	 * the region.
+	 */
+	if ((reg.flags & IORING_MEM_REGION_REG_WAIT_ARG) &&
+	    !(ctx->flags & IORING_SETUP_R_DISABLED))
 		return -EINVAL;
 
 	ret = io_create_region(ctx, &ctx->param_region, &rd);
@@ -598,6 +607,11 @@ static int io_register_mem_region(struct io_ring_ctx *ctx, void __user *uarg)
 		io_free_region(ctx, &ctx->param_region);
 		return -EFAULT;
 	}
+
+	if (reg.flags & IORING_MEM_REGION_REG_WAIT_ARG) {
+		ctx->cq_wait_arg = io_region_get_ptr(&ctx->param_region);
+		ctx->cq_wait_size = rd.size;
+	}
 	return 0;
 }
-- 
2.51.0

From a652958888fb1ada3e4f6b548576c2d2c1b60d66 Mon Sep 17 00:00:00 2001
From: Pavel Begunkov
Date: Sun, 17 Nov 2024 00:38:33 +0000
Subject: [PATCH 11/16] io_uring/region: fix error codes after failed vmap

io_create_region() jumps to the error path after a vmap failure
without setting the return code; it could be 0 or just uninitialised.
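
Reduced to a self-contained sketch, the bug class looks like this
(plain C, not the kernel code itself):

	#include <stdlib.h>

	int create_sketch(void **out)
	{
		int ret;		/* note: never initialised */
		void *p = malloc(64);	/* stand-in for vmap() */

		if (!p) {
			/* the fix: assign ret = -ENOMEM before jumping */
			goto out_free;
		}
		*out = p;
		return 0;
	out_free:
		return ret;	/* without the assignment, this is garbage */
	}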
Fixes: dfbbfbf191878 ("io_uring: introduce concept of memory regions")
Signed-off-by: Pavel Begunkov
Link: https://lore.kernel.org/r/0abac19dbf81c061cffaa9534a2471ed5460ad3e.1731803848.git.asml.silence@gmail.com
Signed-off-by: Jens Axboe
---
 io_uring/memmap.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/io_uring/memmap.c b/io_uring/memmap.c
index bbd9569a0120..6e6ee79ba94f 100644
--- a/io_uring/memmap.c
+++ b/io_uring/memmap.c
@@ -247,8 +247,10 @@ int io_create_region(struct io_ring_ctx *ctx, struct io_mapped_region *mr,
 	}
 
 	vptr = vmap(pages, nr_pages, VM_MAP, PAGE_KERNEL);
-	if (!vptr)
+	if (!vptr) {
+		ret = -ENOMEM;
 		goto out_free;
+	}
 
 	mr->pages = pages;
 	mr->vmap_ptr = vptr;
-- 
2.51.0

From c750629caeca01979da3403f4bebecda88713233 Mon Sep 17 00:00:00 2001
From: Pavel Begunkov
Date: Mon, 18 Nov 2024 15:14:34 +0000
Subject: [PATCH 12/16] io_uring: remove io_uring_cqwait_reg_arg

A separate wait argument registration API was removed; also delete the
leftover uapi definitions.

Signed-off-by: Pavel Begunkov
Link: https://lore.kernel.org/r/143b6a53591badac23632d3e6fa3e5db4b342ee2.1731942445.git.asml.silence@gmail.com
Signed-off-by: Jens Axboe
---
 include/uapi/linux/io_uring.h | 14 --------------
 1 file changed, 14 deletions(-)

diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index 4418d0192959..aac9a4f8fa9a 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -873,20 +873,6 @@ enum {
 	IORING_REG_WAIT_TS		= (1U << 0),
 };
 
-/*
- * Argument for IORING_REGISTER_CQWAIT_REG, registering a region of
- * struct io_uring_reg_wait that can be indexed when io_uring_enter(2) is
- * called rather than pass in a wait argument structure separately.
- */
-struct io_uring_cqwait_reg_arg {
-	__u32		flags;
-	__u32		struct_size;
-	__u32		nr_entries;
-	__u32		pad;
-	__u64		user_addr;
-	__u64		pad2[3];
-};
-
 /*
  * Argument for io_uring_enter(2) with
  * IORING_GETEVENTS | IORING_ENTER_EXT_ARG_REG set, where the actual argument
-- 
2.51.0

From e358e09a894dbcd51fdbbcf62bec1df249915834 Mon Sep 17 00:00:00 2001
From: Pavel Begunkov
Date: Mon, 18 Nov 2024 15:14:50 +0000
Subject: [PATCH 13/16] io_uring: protect register tracing

Syz reports:

BUG: KCSAN: data-race in __se_sys_io_uring_register / io_sqe_files_register

read-write to 0xffff8881021940b8 of 4 bytes by task 5923 on cpu 1:
 io_sqe_files_register+0x2c4/0x3b0 io_uring/rsrc.c:713
 __io_uring_register io_uring/register.c:403 [inline]
 __do_sys_io_uring_register io_uring/register.c:611 [inline]
 __se_sys_io_uring_register+0x8d0/0x1280 io_uring/register.c:591
 __x64_sys_io_uring_register+0x55/0x70 io_uring/register.c:591
 x64_sys_call+0x202/0x2d60 arch/x86/include/generated/asm/syscalls_64.h:428
 do_syscall_x64 arch/x86/entry/common.c:52 [inline]
 do_syscall_64+0xc9/0x1c0 arch/x86/entry/common.c:83
 entry_SYSCALL_64_after_hwframe+0x77/0x7f

read to 0xffff8881021940b8 of 4 bytes by task 5924 on cpu 0:
 __do_sys_io_uring_register io_uring/register.c:613 [inline]
 __se_sys_io_uring_register+0xe4a/0x1280 io_uring/register.c:591
 __x64_sys_io_uring_register+0x55/0x70 io_uring/register.c:591
 x64_sys_call+0x202/0x2d60 arch/x86/include/generated/asm/syscalls_64.h:428
 do_syscall_x64 arch/x86/entry/common.c:52 [inline]
 do_syscall_64+0xc9/0x1c0 arch/x86/entry/common.c:83
 entry_SYSCALL_64_after_hwframe+0x77/0x7f

This should be due to reading the table size after the unlock. We
don't care much, as it's only printed in the trace, but we might as
well do the read under the lock.
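
The shape of this race class, as a self-contained userspace sketch
(POSIX threads standing in for the kernel mutex):

	#include <pthread.h>

	static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
	static unsigned int nr_files;	/* protected by lock */

	void register_and_trace(void)
	{
		unsigned int snapshot;

		pthread_mutex_lock(&lock);
		nr_files += 1;		/* the "register" update */
		snapshot = nr_files;	/* read under the lock, as the patch does */
		pthread_mutex_unlock(&lock);

		/* tracing with snapshot here is race-free; reading nr_files
		 * directly at this point would be the reported data race */
	}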
Reported-by: syzbot+5a486fef3de40e0d8c76@syzkaller.appspotmail.com
Signed-off-by: Pavel Begunkov
Link: https://lore.kernel.org/r/8233af2886a37b57f79e444e3db88fcfda1817ac.1731942203.git.asml.silence@gmail.com
Signed-off-by: Jens Axboe
---
 io_uring/register.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/io_uring/register.c b/io_uring/register.c
index 1a60f4916649..1e99c783abdf 100644
--- a/io_uring/register.c
+++ b/io_uring/register.c
@@ -905,9 +905,10 @@ SYSCALL_DEFINE4(io_uring_register, unsigned int, fd, unsigned int, opcode,
 
 	mutex_lock(&ctx->uring_lock);
 	ret = __io_uring_register(ctx, opcode, arg, nr_args);
-	mutex_unlock(&ctx->uring_lock);
+
 	trace_io_uring_register(ctx, opcode, ctx->file_table.data.nr,
 							ctx->buf_table.nr, ret);
+	mutex_unlock(&ctx->uring_lock);
 	if (!use_registered_ring)
 		fput(file);
 	return ret;
-- 
2.51.0

From 2ae6bdb1e145af0a47253953132decbd2d52f4b2 Mon Sep 17 00:00:00 2001
From: Dan Carpenter
Date: Wed, 20 Nov 2024 12:15:05 +0300
Subject: [PATCH 14/16] io_uring/region: return negative -E2BIG in
 io_create_region()

This code accidentally returns positive E2BIG instead of negative
-E2BIG. The callers treat negatives and positives the same, so this
doesn't affect the kernel, but the error code is returned to userspace
via the system call.

Fixes: dfbbfbf19187 ("io_uring: introduce concept of memory regions")
Signed-off-by: Dan Carpenter
Link: https://lore.kernel.org/r/d8ea3bef-74d8-4f77-8223-6d36464dd4dc@stanley.mountain
Signed-off-by: Jens Axboe
---
 io_uring/memmap.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/io_uring/memmap.c b/io_uring/memmap.c
index 6e6ee79ba94f..3d71756bc598 100644
--- a/io_uring/memmap.c
+++ b/io_uring/memmap.c
@@ -229,7 +229,7 @@ int io_create_region(struct io_ring_ctx *ctx, struct io_mapped_region *mr,
 	if (!reg->size || reg->mmap_offset || reg->id)
 		return -EINVAL;
 	if ((reg->size >> PAGE_SHIFT) > INT_MAX)
-		return E2BIG;
+		return -E2BIG;
 	if ((reg->user_addr | reg->size) & ~PAGE_MASK)
 		return -EINVAL;
 	if (check_add_overflow(reg->user_addr, reg->size, &end))
-- 
2.51.0

From 40cfe553240b32333b42652370ef5232e6ac59e1 Mon Sep 17 00:00:00 2001
From: David Wei
Date: Wed, 20 Nov 2024 14:14:51 -0800
Subject: [PATCH 15/16] io_uring: add io_local_work_pending()

In preparation for adding a new llist of tw to retry due to hitting
the tw limit, add a helper io_local_work_pending(). This function
returns true if there is any local tw pending. For now it only checks
ctx->work_llist.
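
The indirection pays off immediately: the next patch extends the
helper to also cover the new retry list, and every call site converted
here picks that up for free:

	/* from the following patch */
	static inline bool io_local_work_pending(struct io_ring_ctx *ctx)
	{
		return !llist_empty(&ctx->work_llist) ||
		       !llist_empty(&ctx->retry_llist);
	}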
Signed-off-by: David Wei
Reviewed-by: Pavel Begunkov
Link: https://lore.kernel.org/r/20241120221452.3762588-2-dw@davidwei.uk
Signed-off-by: Jens Axboe
---
 io_uring/io_uring.c | 14 +++++++-------
 io_uring/io_uring.h |  9 +++++++--
 2 files changed, 14 insertions(+), 9 deletions(-)

diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index da8fd460977b..55e3618b726d 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -1261,7 +1261,7 @@ static void __cold io_move_task_work_from_local(struct io_ring_ctx *ctx)
 static bool io_run_local_work_continue(struct io_ring_ctx *ctx, int events,
 				       int min_events)
 {
-	if (llist_empty(&ctx->work_llist))
+	if (!io_local_work_pending(ctx))
 		return false;
 	if (events < min_events)
 		return true;
@@ -1314,7 +1314,7 @@ static inline int io_run_local_work_locked(struct io_ring_ctx *ctx,
 {
 	struct io_tw_state ts = {};
 
-	if (llist_empty(&ctx->work_llist))
+	if (!io_local_work_pending(ctx))
 		return 0;
 	return __io_run_local_work(ctx, &ts, min_events);
 }
@@ -2329,7 +2329,7 @@ static int io_wake_function(struct wait_queue_entry *curr, unsigned int mode,
 
 int io_run_task_work_sig(struct io_ring_ctx *ctx)
 {
-	if (!llist_empty(&ctx->work_llist)) {
+	if (io_local_work_pending(ctx)) {
 		__set_current_state(TASK_RUNNING);
 		if (io_run_local_work(ctx, INT_MAX) > 0)
 			return 0;
@@ -2459,7 +2459,7 @@ static inline int io_cqring_wait_schedule(struct io_ring_ctx *ctx,
 {
 	if (unlikely(READ_ONCE(ctx->check_cq)))
 		return 1;
-	if (unlikely(!llist_empty(&ctx->work_llist)))
+	if (unlikely(io_local_work_pending(ctx)))
 		return 1;
 	if (unlikely(task_work_pending(current)))
 		return 1;
@@ -2493,7 +2493,7 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, u32 flags,
 	if (!io_allowed_run_tw(ctx))
 		return -EEXIST;
 
-	if (!llist_empty(&ctx->work_llist))
+	if (io_local_work_pending(ctx))
 		io_run_local_work(ctx, min_events);
 	io_run_task_work();
 
@@ -2564,7 +2564,7 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, u32 flags,
 	 * If we got woken because of task_work being processed, run it
 	 * now rather than let the caller do another wait loop.
 	 */
-	if (!llist_empty(&ctx->work_llist))
+	if (io_local_work_pending(ctx))
 		io_run_local_work(ctx, nr_wait);
 	io_run_task_work();
 
@@ -3158,7 +3158,7 @@ __cold void io_uring_cancel_generic(bool cancel_all, struct io_sq_data *sqd)
 		io_run_task_work();
 		io_uring_drop_tctx_refs(current);
 		xa_for_each(&tctx->xa, index, node) {
-			if (!llist_empty(&node->ctx->work_llist)) {
+			if (io_local_work_pending(node->ctx)) {
 				WARN_ON_ONCE(node->ctx->submitter_task &&
 					     node->ctx->submitter_task != current);
 				goto end_wait;
diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h
index 4070d4c8ef97..69eb3b23a5a0 100644
--- a/io_uring/io_uring.h
+++ b/io_uring/io_uring.h
@@ -347,9 +347,14 @@ static inline int io_run_task_work(void)
 	return ret;
 }
 
+static inline bool io_local_work_pending(struct io_ring_ctx *ctx)
+{
+	return !llist_empty(&ctx->work_llist);
+}
+
 static inline bool io_task_work_pending(struct io_ring_ctx *ctx)
 {
-	return task_work_pending(current) || !llist_empty(&ctx->work_llist);
+	return task_work_pending(current) || io_local_work_pending(ctx);
 }
 
 static inline void io_tw_lock(struct io_ring_ctx *ctx, struct io_tw_state *ts)
@@ -484,6 +489,6 @@ enum {
 static inline bool io_has_work(struct io_ring_ctx *ctx)
 {
 	return test_bit(IO_CHECK_CQ_OVERFLOW_BIT, &ctx->check_cq) ||
-	       !llist_empty(&ctx->work_llist);
+	       io_local_work_pending(ctx);
 }
 #endif
-- 
2.51.0

From f46b9cdb22f7a167c36b6bcddaef7e8aee2598fa Mon Sep 17 00:00:00 2001
From: David Wei
Date: Wed, 20 Nov 2024 14:14:52 -0800
Subject: [PATCH 16/16] io_uring: limit local tw done

Instead of eagerly running all available local tw, limit the amount of
local tw done to the max of IO_LOCAL_TW_DEFAULT_MAX (20) or wait_nr.
The value of 20 is chosen as a reasonable heuristic to allow enough
work batching but also keep latency down.

Add a retry_llist that maintains a list of local tw that couldn't be
done in time. No synchronisation is needed since it is only modified
within the task context.
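
The core idea, reduced to a self-contained sketch (plain C, simplified
from the llist-based code below): consume at most a fixed budget of
nodes per pass and return the unprocessed tail, which becomes the
retry list for the next pass:

	struct node { struct node *next; };

	static struct node *run_limited(struct node *head, int budget)
	{
		while (head && budget-- > 0) {
			struct node *next = head->next;
			/* process one task-work item here */
			head = next;
		}
		return head;	/* unprocessed tail: the retry list */
	}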
Signed-off-by: David Wei
Link: https://lore.kernel.org/r/20241120221452.3762588-3-dw@davidwei.uk
Signed-off-by: Jens Axboe
---
 include/linux/io_uring_types.h |  1 +
 io_uring/io_uring.c            | 43 +++++++++++++++++++++++++---------
 io_uring/io_uring.h            |  2 +-
 3 files changed, 34 insertions(+), 12 deletions(-)

diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h
index aa5f5ea98076..3e934feb3187 100644
--- a/include/linux/io_uring_types.h
+++ b/include/linux/io_uring_types.h
@@ -335,6 +335,7 @@ struct io_ring_ctx {
 	 */
 	struct {
 		struct llist_head	work_llist;
+		struct llist_head	retry_llist;
 		unsigned long		check_cq;
 		atomic_t		cq_wait_nr;
 		atomic_t		cq_timeouts;
diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index 55e3618b726d..bfa93888f862 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -122,6 +122,7 @@
 
 #define IO_COMPL_BATCH			32
 #define IO_REQ_ALLOC_BATCH		8
+#define IO_LOCAL_TW_DEFAULT_MAX		20
 
 struct io_defer_entry {
 	struct list_head	list;
@@ -1256,6 +1257,8 @@ static void __cold io_move_task_work_from_local(struct io_ring_ctx *ctx)
 	struct llist_node *node = llist_del_all(&ctx->work_llist);
 
 	__io_fallback_tw(node, false);
+	node = llist_del_all(&ctx->retry_llist);
+	__io_fallback_tw(node, false);
 }
 
 static bool io_run_local_work_continue(struct io_ring_ctx *ctx, int events,
@@ -1270,37 +1273,55 @@ static bool io_run_local_work_continue(struct io_ring_ctx *ctx, int events,
 	return false;
 }
 
+static int __io_run_local_work_loop(struct llist_node **node,
+				    struct io_tw_state *ts,
+				    int events)
+{
+	while (*node) {
+		struct llist_node *next = (*node)->next;
+		struct io_kiocb *req = container_of(*node, struct io_kiocb,
+						    io_task_work.node);
+		INDIRECT_CALL_2(req->io_task_work.func,
+				io_poll_task_func, io_req_rw_complete,
+				req, ts);
+		*node = next;
+		if (--events <= 0)
+			break;
+	}
+
+	return events;
+}
+
 static int __io_run_local_work(struct io_ring_ctx *ctx, struct io_tw_state *ts,
 			       int min_events)
 {
 	struct llist_node *node;
 	unsigned int loops = 0;
-	int ret = 0;
+	int ret, limit;
 
 	if (WARN_ON_ONCE(ctx->submitter_task != current))
 		return -EEXIST;
 	if (ctx->flags & IORING_SETUP_TASKRUN_FLAG)
 		atomic_andnot(IORING_SQ_TASKRUN, &ctx->rings->sq_flags);
+	limit = max(IO_LOCAL_TW_DEFAULT_MAX, min_events);
 again:
+	ret = __io_run_local_work_loop(&ctx->retry_llist.first, ts, limit);
+	if (ctx->retry_llist.first)
+		goto retry_done;
+
 	/*
 	 * llists are in reverse order, flip it back the right way before
 	 * running the pending items.
 	 */
 	node = llist_reverse_order(llist_del_all(&ctx->work_llist));
-	while (node) {
-		struct llist_node *next = node->next;
-		struct io_kiocb *req = container_of(node, struct io_kiocb,
-						    io_task_work.node);
-		INDIRECT_CALL_2(req->io_task_work.func,
-				io_poll_task_func, io_req_rw_complete,
-				req, ts);
-		ret++;
-		node = next;
-	}
+	ret = __io_run_local_work_loop(&node, ts, ret);
+	ctx->retry_llist.first = node;
 	loops++;
+	ret = limit - ret;
 
 	if (io_run_local_work_continue(ctx, ret, min_events))
 		goto again;
+retry_done:
 	io_submit_flush_completions(ctx);
 	if (io_run_local_work_continue(ctx, ret, min_events))
 		goto again;
diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h
index 69eb3b23a5a0..12abee607e4a 100644
--- a/io_uring/io_uring.h
+++ b/io_uring/io_uring.h
@@ -349,7 +349,7 @@ static inline int io_run_task_work(void)
 
 static inline bool io_local_work_pending(struct io_ring_ctx *ctx)
 {
-	return !llist_empty(&ctx->work_llist);
+	return !llist_empty(&ctx->work_llist) || !llist_empty(&ctx->retry_llist);
 }
 
 static inline bool io_task_work_pending(struct io_ring_ctx *ctx)
-- 
2.51.0