From 68685fa20edc5307fc893a06473c19661c236f29 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Fri, 15 Nov 2024 16:54:38 +0000 Subject: [PATCH 01/16] io_uring: fortify io_pin_pages with a warning We're a bit too frivolous with types of nr_pages arguments, converting it to long and back to int, passing an unsigned int pointer as an int pointer and so on. Shouldn't cause any problem but should be carefully reviewed, but until then let's add a WARN_ON_ONCE check to be more confident callers don't pass poorely checked arguents. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/d48e0c097cbd90fb47acaddb6c247596510d8cfc.1731689588.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/memmap.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/io_uring/memmap.c b/io_uring/memmap.c index 85c66fa54956..6ab59c60dfd0 100644 --- a/io_uring/memmap.c +++ b/io_uring/memmap.c @@ -140,6 +140,8 @@ struct page **io_pin_pages(unsigned long uaddr, unsigned long len, int *npages) nr_pages = end - start; if (WARN_ON_ONCE(!nr_pages)) return ERR_PTR(-EINVAL); + if (WARN_ON_ONCE(nr_pages > INT_MAX)) + return ERR_PTR(-EOVERFLOW); pages = kvmalloc_array(nr_pages, sizeof(struct page *), GFP_KERNEL); if (!pages) -- 2.51.0 From 3730aebbdac8770f64ab66eb5e7129bc8dae731d Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Fri, 15 Nov 2024 16:54:39 +0000 Subject: [PATCH 02/16] io_uring: disable ENTER_EXT_ARG_REG for IOPOLL IOPOLL doesn't use the extended arguments, no need for it to support IORING_ENTER_EXT_ARG_REG. Let's disable it for IOPOLL, if anything it leaves more space for future extensions. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/a35ecd919dbdc17bd5b7932273e317832c531b45.1731689588.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index bd71782057de..464a70bde7e6 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -3214,12 +3214,8 @@ static int io_validate_ext_arg(struct io_ring_ctx *ctx, unsigned flags, if (!(flags & IORING_ENTER_EXT_ARG)) return 0; - - if (flags & IORING_ENTER_EXT_ARG_REG) { - if (argsz != sizeof(struct io_uring_reg_wait)) - return -EINVAL; - return PTR_ERR(io_get_ext_arg_reg(ctx, argp)); - } + if (flags & IORING_ENTER_EXT_ARG_REG) + return -EINVAL; if (argsz != sizeof(arg)) return -EINVAL; if (copy_from_user(&arg, argp, sizeof(arg))) -- 2.51.0 From 83e041522eb9c45479f4490b212687cf1e7e9999 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Fri, 15 Nov 2024 16:54:40 +0000 Subject: [PATCH 03/16] io_uring: temporarily disable registered waits Disable wait argument registration as it'll be replaced with a more generic feature. We'll still need IORING_ENTER_EXT_ARG_REG parsing in a few commits so leave it be. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/70b1d1d218c41ba77a76d1789c8641dab0b0563e.1731689588.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- include/linux/io_uring_types.h | 10 ----- include/uapi/linux/io_uring.h | 3 -- io_uring/io_uring.c | 10 ----- io_uring/register.c | 82 ---------------------------------- io_uring/register.h | 1 - 5 files changed, 106 deletions(-) diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h index 072e65e93105..52a5da99a205 100644 --- a/include/linux/io_uring_types.h +++ b/include/linux/io_uring_types.h @@ -330,14 +330,6 @@ struct io_ring_ctx { atomic_t cq_wait_nr; atomic_t cq_timeouts; struct wait_queue_head cq_wait; - - /* - * If registered with IORING_REGISTER_CQWAIT_REG, a single - * page holds N entries, mapped in cq_wait_arg. cq_wait_index - * is the maximum allowable index. - */ - struct io_uring_reg_wait *cq_wait_arg; - unsigned char cq_wait_index; } ____cacheline_aligned_in_smp; /* timeouts */ @@ -431,8 +423,6 @@ struct io_ring_ctx { unsigned short n_sqe_pages; struct page **ring_pages; struct page **sqe_pages; - - struct page **cq_wait_page; }; struct io_tw_state { diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index 5d08435b95a8..132f5db3d4e8 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -627,9 +627,6 @@ enum io_uring_register_op { /* resize CQ ring */ IORING_REGISTER_RESIZE_RINGS = 33, - /* register fixed io_uring_reg_wait arguments */ - IORING_REGISTER_CQWAIT_REG = 34, - /* this goes last */ IORING_REGISTER_LAST, diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 464a70bde7e6..286b7bb73978 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -2709,7 +2709,6 @@ static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx) io_alloc_cache_free(&ctx->msg_cache, io_msg_cache_free); io_futex_cache_free(ctx); io_destroy_buffers(ctx); - io_unregister_cqwait_reg(ctx); mutex_unlock(&ctx->uring_lock); if (ctx->sq_creds) put_cred(ctx->sq_creds); @@ -3195,15 +3194,6 @@ void __io_uring_cancel(bool cancel_all) static struct io_uring_reg_wait *io_get_ext_arg_reg(struct io_ring_ctx *ctx, const struct io_uring_getevents_arg __user *uarg) { - struct io_uring_reg_wait *arg = READ_ONCE(ctx->cq_wait_arg); - - if (arg) { - unsigned int index = (unsigned int) (uintptr_t) uarg; - - if (index <= ctx->cq_wait_index) - return arg + index; - } - return ERR_PTR(-EFAULT); } diff --git a/io_uring/register.c b/io_uring/register.c index 45edfc57963a..3c5a3cfb186b 100644 --- a/io_uring/register.c +++ b/io_uring/register.c @@ -570,82 +570,6 @@ out: return ret; } -void io_unregister_cqwait_reg(struct io_ring_ctx *ctx) -{ - unsigned short npages = 1; - - if (!ctx->cq_wait_page) - return; - - io_pages_unmap(ctx->cq_wait_arg, &ctx->cq_wait_page, &npages, true); - ctx->cq_wait_arg = NULL; - if (ctx->user) - __io_unaccount_mem(ctx->user, 1); -} - -/* - * Register a page holding N entries of struct io_uring_reg_wait, which can - * be used via io_uring_enter(2) if IORING_GETEVENTS_EXT_ARG_REG is set. - * If that is set with IORING_GETEVENTS_EXT_ARG, then instead of passing - * in a pointer for a struct io_uring_getevents_arg, an index into this - * registered array is passed, avoiding two (arg + timeout) copies per - * invocation. - */ -static int io_register_cqwait_reg(struct io_ring_ctx *ctx, void __user *uarg) -{ - struct io_uring_cqwait_reg_arg arg; - struct io_uring_reg_wait *reg; - struct page **pages; - unsigned long len; - int nr_pages, poff; - int ret; - - if (ctx->cq_wait_page || ctx->cq_wait_arg) - return -EBUSY; - if (copy_from_user(&arg, uarg, sizeof(arg))) - return -EFAULT; - if (!arg.nr_entries || arg.flags) - return -EINVAL; - if (arg.struct_size != sizeof(*reg)) - return -EINVAL; - if (check_mul_overflow(arg.struct_size, arg.nr_entries, &len)) - return -EOVERFLOW; - if (len > PAGE_SIZE) - return -EINVAL; - /* offset + len must fit within a page, and must be reg_wait aligned */ - poff = arg.user_addr & ~PAGE_MASK; - if (len + poff > PAGE_SIZE) - return -EINVAL; - if (poff % arg.struct_size) - return -EINVAL; - - pages = io_pin_pages(arg.user_addr, len, &nr_pages); - if (IS_ERR(pages)) - return PTR_ERR(pages); - ret = -EINVAL; - if (nr_pages != 1) - goto out_free; - if (ctx->user) { - ret = __io_account_mem(ctx->user, 1); - if (ret) - goto out_free; - } - - reg = vmap(pages, 1, VM_MAP, PAGE_KERNEL); - if (reg) { - ctx->cq_wait_index = arg.nr_entries - 1; - WRITE_ONCE(ctx->cq_wait_page, pages); - WRITE_ONCE(ctx->cq_wait_arg, (void *) reg + poff); - return 0; - } - ret = -ENOMEM; - if (ctx->user) - __io_unaccount_mem(ctx->user, 1); -out_free: - io_pages_free(&pages, nr_pages); - return ret; -} - static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode, void __user *arg, unsigned nr_args) __releases(ctx->uring_lock) @@ -840,12 +764,6 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode, break; ret = io_register_resize_rings(ctx, arg); break; - case IORING_REGISTER_CQWAIT_REG: - ret = -EINVAL; - if (!arg || nr_args != 1) - break; - ret = io_register_cqwait_reg(ctx, arg); - break; default: ret = -EINVAL; break; diff --git a/io_uring/register.h b/io_uring/register.h index 3e935e8fa4b2..a5f39d5ef9e0 100644 --- a/io_uring/register.h +++ b/io_uring/register.h @@ -5,6 +5,5 @@ int io_eventfd_unregister(struct io_ring_ctx *ctx); int io_unregister_personality(struct io_ring_ctx *ctx, unsigned id); struct file *io_uring_register_get_file(unsigned int fd, bool registered); -void io_unregister_cqwait_reg(struct io_ring_ctx *ctx); #endif -- 2.51.0 From dfbbfbf191878e8dd422768ce009858d8b5b761e Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Fri, 15 Nov 2024 16:54:41 +0000 Subject: [PATCH 04/16] io_uring: introduce concept of memory regions We've got a good number of mappings we share with the userspace, that includes the main rings, provided buffer rings, upcoming rings for zerocopy rx and more. All of them duplicate user argument parsing and some internal details as well (page pinnning, huge page optimisations, mmap'ing, etc.) Introduce a notion of regions. For userspace for now it's just a new structure called struct io_uring_region_desc which is supposed to parameterise all such mapping / queue creations. A region either represents a user provided chunk of memory, in which case the user_addr field should point to it, or a request for the kernel to allocate the memory, in which case the user would need to mmap it after using the offset returned in the mmap_offset field. With a uniform userspace API we can avoid additional boiler plate code and apply future optimisation to all of them at once. Internally, there is a new structure struct io_mapped_region holding all relevant runtime information and some helpers to work with it. This patch limits it to user provided regions. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/0e6fe25818dfbaebd1bd90b870a6cac503fe1a24.1731689588.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- include/linux/io_uring_types.h | 6 +++ include/uapi/linux/io_uring.h | 14 +++++++ io_uring/memmap.c | 67 ++++++++++++++++++++++++++++++++++ io_uring/memmap.h | 14 +++++++ 4 files changed, 101 insertions(+) diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h index 52a5da99a205..1d3a37234ace 100644 --- a/include/linux/io_uring_types.h +++ b/include/linux/io_uring_types.h @@ -75,6 +75,12 @@ struct io_hash_table { unsigned hash_bits; }; +struct io_mapped_region { + struct page **pages; + void *vmap_ptr; + size_t nr_pages; +}; + /* * Arbitrary limit, can be raised if need be */ diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index 132f5db3d4e8..5cbfd330c688 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -647,6 +647,20 @@ struct io_uring_files_update { __aligned_u64 /* __s32 * */ fds; }; +enum { + /* initialise with user provided memory pointed by user_addr */ + IORING_MEM_REGION_TYPE_USER = 1, +}; + +struct io_uring_region_desc { + __u64 user_addr; + __u64 size; + __u32 flags; + __u32 id; + __u64 mmap_offset; + __u64 __resv[4]; +}; + /* * Register a fully sparse file space, rather than pass in an array of all * -1 file descriptors. diff --git a/io_uring/memmap.c b/io_uring/memmap.c index 6ab59c60dfd0..bbd9569a0120 100644 --- a/io_uring/memmap.c +++ b/io_uring/memmap.c @@ -12,6 +12,7 @@ #include "memmap.h" #include "kbuf.h" +#include "rsrc.h" static void *io_mem_alloc_compound(struct page **pages, int nr_pages, size_t size, gfp_t gfp) @@ -194,6 +195,72 @@ void *__io_uaddr_map(struct page ***pages, unsigned short *npages, return ERR_PTR(-ENOMEM); } +void io_free_region(struct io_ring_ctx *ctx, struct io_mapped_region *mr) +{ + if (mr->pages) { + unpin_user_pages(mr->pages, mr->nr_pages); + kvfree(mr->pages); + } + if (mr->vmap_ptr) + vunmap(mr->vmap_ptr); + if (mr->nr_pages && ctx->user) + __io_unaccount_mem(ctx->user, mr->nr_pages); + + memset(mr, 0, sizeof(*mr)); +} + +int io_create_region(struct io_ring_ctx *ctx, struct io_mapped_region *mr, + struct io_uring_region_desc *reg) +{ + int pages_accounted = 0; + struct page **pages; + int nr_pages, ret; + void *vptr; + u64 end; + + if (WARN_ON_ONCE(mr->pages || mr->vmap_ptr || mr->nr_pages)) + return -EFAULT; + if (memchr_inv(®->__resv, 0, sizeof(reg->__resv))) + return -EINVAL; + if (reg->flags != IORING_MEM_REGION_TYPE_USER) + return -EINVAL; + if (!reg->user_addr) + return -EFAULT; + if (!reg->size || reg->mmap_offset || reg->id) + return -EINVAL; + if ((reg->size >> PAGE_SHIFT) > INT_MAX) + return E2BIG; + if ((reg->user_addr | reg->size) & ~PAGE_MASK) + return -EINVAL; + if (check_add_overflow(reg->user_addr, reg->size, &end)) + return -EOVERFLOW; + + pages = io_pin_pages(reg->user_addr, reg->size, &nr_pages); + if (IS_ERR(pages)) + return PTR_ERR(pages); + + if (ctx->user) { + ret = __io_account_mem(ctx->user, nr_pages); + if (ret) + goto out_free; + pages_accounted = nr_pages; + } + + vptr = vmap(pages, nr_pages, VM_MAP, PAGE_KERNEL); + if (!vptr) + goto out_free; + + mr->pages = pages; + mr->vmap_ptr = vptr; + mr->nr_pages = nr_pages; + return 0; +out_free: + if (pages_accounted) + __io_unaccount_mem(ctx->user, pages_accounted); + io_pages_free(&pages, nr_pages); + return ret; +} + static void *io_uring_validate_mmap_request(struct file *file, loff_t pgoff, size_t sz) { diff --git a/io_uring/memmap.h b/io_uring/memmap.h index 5cec5b7ac49a..f361a635b6c7 100644 --- a/io_uring/memmap.h +++ b/io_uring/memmap.h @@ -22,4 +22,18 @@ unsigned long io_uring_get_unmapped_area(struct file *file, unsigned long addr, unsigned long flags); int io_uring_mmap(struct file *file, struct vm_area_struct *vma); +void io_free_region(struct io_ring_ctx *ctx, struct io_mapped_region *mr); +int io_create_region(struct io_ring_ctx *ctx, struct io_mapped_region *mr, + struct io_uring_region_desc *reg); + +static inline void *io_region_get_ptr(struct io_mapped_region *mr) +{ + return mr->vmap_ptr; +} + +static inline bool io_region_is_set(struct io_mapped_region *mr) +{ + return !!mr->nr_pages; +} + #endif -- 2.51.0 From 93238e66185524aad925acefb2312203b9e26d63 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Fri, 15 Nov 2024 16:54:42 +0000 Subject: [PATCH 05/16] io_uring: add memory region registration Regions will serve multiple purposes. First, with it we can decouple ring/etc. object creation from registration / mapping of the memory they will be placed in. We already have hacks that allow to put both SQ and CQ into the same huge page, in the future we should be able to: region = create_region(io_ring); create_pbuf_ring(io_uring, region, offset=0); create_pbuf_ring(io_uring, region, offset=N); The second use case is efficiently passing parameters. The following patch enables back on top of regions IORING_ENTER_EXT_ARG_REG, which optimises wait arguments. It'll also be useful for request arguments replacing iovecs, msghdr, etc. pointers. Eventually it would also be handy for BPF as well if it comes to fruition. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/0798cf3a14fad19cfc96fc9feca5f3e11481691d.1731689588.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- include/linux/io_uring_types.h | 3 +++ include/uapi/linux/io_uring.h | 8 ++++++++ io_uring/io_uring.c | 1 + io_uring/register.c | 37 ++++++++++++++++++++++++++++++++++ 4 files changed, 49 insertions(+) diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h index 1d3a37234ace..e1d69123e164 100644 --- a/include/linux/io_uring_types.h +++ b/include/linux/io_uring_types.h @@ -429,6 +429,9 @@ struct io_ring_ctx { unsigned short n_sqe_pages; struct page **ring_pages; struct page **sqe_pages; + + /* used for optimised request parameter and wait argument passing */ + struct io_mapped_region param_region; }; struct io_tw_state { diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index 5cbfd330c688..1ee35890125b 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -627,6 +627,8 @@ enum io_uring_register_op { /* resize CQ ring */ IORING_REGISTER_RESIZE_RINGS = 33, + IORING_REGISTER_MEM_REGION = 34, + /* this goes last */ IORING_REGISTER_LAST, @@ -661,6 +663,12 @@ struct io_uring_region_desc { __u64 __resv[4]; }; +struct io_uring_mem_region_reg { + __u64 region_uptr; /* struct io_uring_region_desc * */ + __u64 flags; + __u64 __resv[2]; +}; + /* * Register a fully sparse file space, rather than pass in an array of all * -1 file descriptors. diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 286b7bb73978..c640b8a4ceee 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -2709,6 +2709,7 @@ static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx) io_alloc_cache_free(&ctx->msg_cache, io_msg_cache_free); io_futex_cache_free(ctx); io_destroy_buffers(ctx); + io_free_region(ctx, &ctx->param_region); mutex_unlock(&ctx->uring_lock); if (ctx->sq_creds) put_cred(ctx->sq_creds); diff --git a/io_uring/register.c b/io_uring/register.c index 3c5a3cfb186b..2cbac3d9b288 100644 --- a/io_uring/register.c +++ b/io_uring/register.c @@ -570,6 +570,37 @@ out: return ret; } +static int io_register_mem_region(struct io_ring_ctx *ctx, void __user *uarg) +{ + struct io_uring_mem_region_reg __user *reg_uptr = uarg; + struct io_uring_mem_region_reg reg; + struct io_uring_region_desc __user *rd_uptr; + struct io_uring_region_desc rd; + int ret; + + if (io_region_is_set(&ctx->param_region)) + return -EBUSY; + if (copy_from_user(®, reg_uptr, sizeof(reg))) + return -EFAULT; + rd_uptr = u64_to_user_ptr(reg.region_uptr); + if (copy_from_user(&rd, rd_uptr, sizeof(rd))) + return -EFAULT; + + if (memchr_inv(®.__resv, 0, sizeof(reg.__resv))) + return -EINVAL; + if (reg.flags) + return -EINVAL; + + ret = io_create_region(ctx, &ctx->param_region, &rd); + if (ret) + return ret; + if (copy_to_user(rd_uptr, &rd, sizeof(rd))) { + io_free_region(ctx, &ctx->param_region); + return -EFAULT; + } + return 0; +} + static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode, void __user *arg, unsigned nr_args) __releases(ctx->uring_lock) @@ -764,6 +795,12 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode, break; ret = io_register_resize_rings(ctx, arg); break; + case IORING_REGISTER_MEM_REGION: + ret = -EINVAL; + if (!arg || nr_args != 1) + break; + ret = io_register_mem_region(ctx, arg); + break; default: ret = -EINVAL; break; -- 2.51.0 From d617b3147d54c42351eac63b5398d4ddf4f4011b Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Fri, 15 Nov 2024 16:54:43 +0000 Subject: [PATCH 06/16] io_uring: restore back registered wait arguments Now we've got a more generic region registration API, place IORING_ENTER_EXT_ARG_REG and re-enable it. First, the user has to register a region with the IORING_MEM_REGION_REG_WAIT_ARG flag set. It can only be done for a ring in a disabled state, aka IORING_SETUP_R_DISABLED, to avoid races with already running waiters. With that we should have stable constant values for ctx->cq_wait_{size,arg} in io_get_ext_arg_reg() and hence no READ_ONCE required. The other API difference is that we're now passing byte offsets instead of indexes. The user _must_ align all offsets / pointers to the native word size, failing to do so might but not necessarily has to lead to a failure usually returned as -EFAULT. liburing will be hiding this details from users. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/81822c1b4ffbe8ad391b4f9ad1564def0d26d990.1731689588.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- include/linux/io_uring_types.h | 3 +++ include/uapi/linux/io_uring.h | 5 +++++ io_uring/io_uring.c | 14 +++++++++++++- io_uring/register.c | 16 +++++++++++++++- 4 files changed, 36 insertions(+), 2 deletions(-) diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h index e1d69123e164..aa5f5ea98076 100644 --- a/include/linux/io_uring_types.h +++ b/include/linux/io_uring_types.h @@ -324,6 +324,9 @@ struct io_ring_ctx { unsigned cq_entries; struct io_ev_fd __rcu *io_ev_fd; unsigned cq_extra; + + void *cq_wait_arg; + size_t cq_wait_size; } ____cacheline_aligned_in_smp; /* diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index 1ee35890125b..4418d0192959 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -663,6 +663,11 @@ struct io_uring_region_desc { __u64 __resv[4]; }; +enum { + /* expose the region as registered wait arguments */ + IORING_MEM_REGION_REG_WAIT_ARG = 1, +}; + struct io_uring_mem_region_reg { __u64 region_uptr; /* struct io_uring_region_desc * */ __u64 flags; diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index c640b8a4ceee..da8fd460977b 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -3195,7 +3195,19 @@ void __io_uring_cancel(bool cancel_all) static struct io_uring_reg_wait *io_get_ext_arg_reg(struct io_ring_ctx *ctx, const struct io_uring_getevents_arg __user *uarg) { - return ERR_PTR(-EFAULT); + unsigned long size = sizeof(struct io_uring_reg_wait); + unsigned long offset = (uintptr_t)uarg; + unsigned long end; + + if (unlikely(offset % sizeof(long))) + return ERR_PTR(-EFAULT); + + /* also protects from NULL ->cq_wait_arg as the size would be 0 */ + if (unlikely(check_add_overflow(offset, size, &end) || + end > ctx->cq_wait_size)) + return ERR_PTR(-EFAULT); + + return ctx->cq_wait_arg + offset; } static int io_validate_ext_arg(struct io_ring_ctx *ctx, unsigned flags, diff --git a/io_uring/register.c b/io_uring/register.c index 2cbac3d9b288..1a60f4916649 100644 --- a/io_uring/register.c +++ b/io_uring/register.c @@ -588,7 +588,16 @@ static int io_register_mem_region(struct io_ring_ctx *ctx, void __user *uarg) if (memchr_inv(®.__resv, 0, sizeof(reg.__resv))) return -EINVAL; - if (reg.flags) + if (reg.flags & ~IORING_MEM_REGION_REG_WAIT_ARG) + return -EINVAL; + + /* + * This ensures there are no waiters. Waiters are unlocked and it's + * hard to synchronise with them, especially if we need to initialise + * the region. + */ + if ((reg.flags & IORING_MEM_REGION_REG_WAIT_ARG) && + !(ctx->flags & IORING_SETUP_R_DISABLED)) return -EINVAL; ret = io_create_region(ctx, &ctx->param_region, &rd); @@ -598,6 +607,11 @@ static int io_register_mem_region(struct io_ring_ctx *ctx, void __user *uarg) io_free_region(ctx, &ctx->param_region); return -EFAULT; } + + if (reg.flags & IORING_MEM_REGION_REG_WAIT_ARG) { + ctx->cq_wait_arg = io_region_get_ptr(&ctx->param_region); + ctx->cq_wait_size = rd.size; + } return 0; } -- 2.51.0 From a652958888fb1ada3e4f6b548576c2d2c1b60d66 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Sun, 17 Nov 2024 00:38:33 +0000 Subject: [PATCH 07/16] io_uring/region: fix error codes after failed vmap io_create_region() jumps after a vmap failure without setting the return code, it could be 0 or just uninitialised. Fixes: dfbbfbf191878 ("io_uring: introduce concept of memory regions") Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/0abac19dbf81c061cffaa9534a2471ed5460ad3e.1731803848.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/memmap.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/io_uring/memmap.c b/io_uring/memmap.c index bbd9569a0120..6e6ee79ba94f 100644 --- a/io_uring/memmap.c +++ b/io_uring/memmap.c @@ -247,8 +247,10 @@ int io_create_region(struct io_ring_ctx *ctx, struct io_mapped_region *mr, } vptr = vmap(pages, nr_pages, VM_MAP, PAGE_KERNEL); - if (!vptr) + if (!vptr) { + ret = -ENOMEM; goto out_free; + } mr->pages = pages; mr->vmap_ptr = vptr; -- 2.51.0 From c750629caeca01979da3403f4bebecda88713233 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 18 Nov 2024 15:14:34 +0000 Subject: [PATCH 08/16] io_uring: remove io_uring_cqwait_reg_arg A separate wait argument registration API was removed, also delete leftover uapi definitions. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/143b6a53591badac23632d3e6fa3e5db4b342ee2.1731942445.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- include/uapi/linux/io_uring.h | 14 -------------- 1 file changed, 14 deletions(-) diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index 4418d0192959..aac9a4f8fa9a 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -873,20 +873,6 @@ enum { IORING_REG_WAIT_TS = (1U << 0), }; -/* - * Argument for IORING_REGISTER_CQWAIT_REG, registering a region of - * struct io_uring_reg_wait that can be indexed when io_uring_enter(2) is - * called rather than pass in a wait argument structure separately. - */ -struct io_uring_cqwait_reg_arg { - __u32 flags; - __u32 struct_size; - __u32 nr_entries; - __u32 pad; - __u64 user_addr; - __u64 pad2[3]; -}; - /* * Argument for io_uring_enter(2) with * IORING_GETEVENTS | IORING_ENTER_EXT_ARG_REG set, where the actual argument -- 2.51.0 From e358e09a894dbcd51fdbbcf62bec1df249915834 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 18 Nov 2024 15:14:50 +0000 Subject: [PATCH 09/16] io_uring: protect register tracing Syz reports: BUG: KCSAN: data-race in __se_sys_io_uring_register / io_sqe_files_register read-write to 0xffff8881021940b8 of 4 bytes by task 5923 on cpu 1: io_sqe_files_register+0x2c4/0x3b0 io_uring/rsrc.c:713 __io_uring_register io_uring/register.c:403 [inline] __do_sys_io_uring_register io_uring/register.c:611 [inline] __se_sys_io_uring_register+0x8d0/0x1280 io_uring/register.c:591 __x64_sys_io_uring_register+0x55/0x70 io_uring/register.c:591 x64_sys_call+0x202/0x2d60 arch/x86/include/generated/asm/syscalls_64.h:428 do_syscall_x64 arch/x86/entry/common.c:52 [inline] do_syscall_64+0xc9/0x1c0 arch/x86/entry/common.c:83 entry_SYSCALL_64_after_hwframe+0x77/0x7f read to 0xffff8881021940b8 of 4 bytes by task 5924 on cpu 0: __do_sys_io_uring_register io_uring/register.c:613 [inline] __se_sys_io_uring_register+0xe4a/0x1280 io_uring/register.c:591 __x64_sys_io_uring_register+0x55/0x70 io_uring/register.c:591 x64_sys_call+0x202/0x2d60 arch/x86/include/generated/asm/syscalls_64.h:428 do_syscall_x64 arch/x86/entry/common.c:52 [inline] do_syscall_64+0xc9/0x1c0 arch/x86/entry/common.c:83 entry_SYSCALL_64_after_hwframe+0x77/0x7f Which should be due to reading the table size after unlock. We don't care much as it's just to print it in trace, but we might as well do it under the lock. Reported-by: syzbot+5a486fef3de40e0d8c76@syzkaller.appspotmail.com Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/8233af2886a37b57f79e444e3db88fcfda1817ac.1731942203.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/register.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/io_uring/register.c b/io_uring/register.c index 1a60f4916649..1e99c783abdf 100644 --- a/io_uring/register.c +++ b/io_uring/register.c @@ -905,9 +905,10 @@ SYSCALL_DEFINE4(io_uring_register, unsigned int, fd, unsigned int, opcode, mutex_lock(&ctx->uring_lock); ret = __io_uring_register(ctx, opcode, arg, nr_args); - mutex_unlock(&ctx->uring_lock); + trace_io_uring_register(ctx, opcode, ctx->file_table.data.nr, ctx->buf_table.nr, ret); + mutex_unlock(&ctx->uring_lock); if (!use_registered_ring) fput(file); return ret; -- 2.51.0 From 2ae6bdb1e145af0a47253953132decbd2d52f4b2 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Wed, 20 Nov 2024 12:15:05 +0300 Subject: [PATCH 10/16] io_uring/region: return negative -E2BIG in io_create_region() This code accidentally returns positivie E2BIG instead of negative -E2BIG. The callers treat negatives and positives the same so this doesn't affect the kernel. The error code is returned to userspace via the system call. Fixes: dfbbfbf19187 ("io_uring: introduce concept of memory regions") Signed-off-by: Dan Carpenter Link: https://lore.kernel.org/r/d8ea3bef-74d8-4f77-8223-6d36464dd4dc@stanley.mountain Signed-off-by: Jens Axboe --- io_uring/memmap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/io_uring/memmap.c b/io_uring/memmap.c index 6e6ee79ba94f..3d71756bc598 100644 --- a/io_uring/memmap.c +++ b/io_uring/memmap.c @@ -229,7 +229,7 @@ int io_create_region(struct io_ring_ctx *ctx, struct io_mapped_region *mr, if (!reg->size || reg->mmap_offset || reg->id) return -EINVAL; if ((reg->size >> PAGE_SHIFT) > INT_MAX) - return E2BIG; + return -E2BIG; if ((reg->user_addr | reg->size) & ~PAGE_MASK) return -EINVAL; if (check_add_overflow(reg->user_addr, reg->size, &end)) -- 2.51.0 From 40cfe553240b32333b42652370ef5232e6ac59e1 Mon Sep 17 00:00:00 2001 From: David Wei Date: Wed, 20 Nov 2024 14:14:51 -0800 Subject: [PATCH 11/16] io_uring: add io_local_work_pending() In preparation for adding a new llist of tw to retry due to hitting the tw limit, add a helper io_local_work_pending(). This function returns true if there is any local tw pending. For now it only checks ctx->work_llist. Signed-off-by: David Wei Reviewed-by: Pavel Begunkov Link: https://lore.kernel.org/r/20241120221452.3762588-2-dw@davidwei.uk Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 14 +++++++------- io_uring/io_uring.h | 9 +++++++-- 2 files changed, 14 insertions(+), 9 deletions(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index da8fd460977b..55e3618b726d 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -1261,7 +1261,7 @@ static void __cold io_move_task_work_from_local(struct io_ring_ctx *ctx) static bool io_run_local_work_continue(struct io_ring_ctx *ctx, int events, int min_events) { - if (llist_empty(&ctx->work_llist)) + if (!io_local_work_pending(ctx)) return false; if (events < min_events) return true; @@ -1314,7 +1314,7 @@ static inline int io_run_local_work_locked(struct io_ring_ctx *ctx, { struct io_tw_state ts = {}; - if (llist_empty(&ctx->work_llist)) + if (!io_local_work_pending(ctx)) return 0; return __io_run_local_work(ctx, &ts, min_events); } @@ -2329,7 +2329,7 @@ static int io_wake_function(struct wait_queue_entry *curr, unsigned int mode, int io_run_task_work_sig(struct io_ring_ctx *ctx) { - if (!llist_empty(&ctx->work_llist)) { + if (io_local_work_pending(ctx)) { __set_current_state(TASK_RUNNING); if (io_run_local_work(ctx, INT_MAX) > 0) return 0; @@ -2459,7 +2459,7 @@ static inline int io_cqring_wait_schedule(struct io_ring_ctx *ctx, { if (unlikely(READ_ONCE(ctx->check_cq))) return 1; - if (unlikely(!llist_empty(&ctx->work_llist))) + if (unlikely(io_local_work_pending(ctx))) return 1; if (unlikely(task_work_pending(current))) return 1; @@ -2493,7 +2493,7 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, u32 flags, if (!io_allowed_run_tw(ctx)) return -EEXIST; - if (!llist_empty(&ctx->work_llist)) + if (io_local_work_pending(ctx)) io_run_local_work(ctx, min_events); io_run_task_work(); @@ -2564,7 +2564,7 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, u32 flags, * If we got woken because of task_work being processed, run it * now rather than let the caller do another wait loop. */ - if (!llist_empty(&ctx->work_llist)) + if (io_local_work_pending(ctx)) io_run_local_work(ctx, nr_wait); io_run_task_work(); @@ -3158,7 +3158,7 @@ __cold void io_uring_cancel_generic(bool cancel_all, struct io_sq_data *sqd) io_run_task_work(); io_uring_drop_tctx_refs(current); xa_for_each(&tctx->xa, index, node) { - if (!llist_empty(&node->ctx->work_llist)) { + if (io_local_work_pending(node->ctx)) { WARN_ON_ONCE(node->ctx->submitter_task && node->ctx->submitter_task != current); goto end_wait; diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h index 4070d4c8ef97..69eb3b23a5a0 100644 --- a/io_uring/io_uring.h +++ b/io_uring/io_uring.h @@ -347,9 +347,14 @@ static inline int io_run_task_work(void) return ret; } +static inline bool io_local_work_pending(struct io_ring_ctx *ctx) +{ + return !llist_empty(&ctx->work_llist); +} + static inline bool io_task_work_pending(struct io_ring_ctx *ctx) { - return task_work_pending(current) || !llist_empty(&ctx->work_llist); + return task_work_pending(current) || io_local_work_pending(ctx); } static inline void io_tw_lock(struct io_ring_ctx *ctx, struct io_tw_state *ts) @@ -484,6 +489,6 @@ enum { static inline bool io_has_work(struct io_ring_ctx *ctx) { return test_bit(IO_CHECK_CQ_OVERFLOW_BIT, &ctx->check_cq) || - !llist_empty(&ctx->work_llist); + io_local_work_pending(ctx); } #endif -- 2.51.0 From f46b9cdb22f7a167c36b6bcddaef7e8aee2598fa Mon Sep 17 00:00:00 2001 From: David Wei Date: Wed, 20 Nov 2024 14:14:52 -0800 Subject: [PATCH 12/16] io_uring: limit local tw done Instead of eagerly running all available local tw, limit the amount of local tw done to the max of IO_LOCAL_TW_DEFAULT_MAX (20) or wait_nr. The value of 20 is chosen as a reasonable heuristic to allow enough work batching but also keep latency down. Add a retry_llist that maintains a list of local tw that couldn't be done in time. No synchronisation is needed since it is only modified within the task context. Signed-off-by: David Wei Link: https://lore.kernel.org/r/20241120221452.3762588-3-dw@davidwei.uk Signed-off-by: Jens Axboe --- include/linux/io_uring_types.h | 1 + io_uring/io_uring.c | 43 +++++++++++++++++++++++++--------- io_uring/io_uring.h | 2 +- 3 files changed, 34 insertions(+), 12 deletions(-) diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h index aa5f5ea98076..3e934feb3187 100644 --- a/include/linux/io_uring_types.h +++ b/include/linux/io_uring_types.h @@ -335,6 +335,7 @@ struct io_ring_ctx { */ struct { struct llist_head work_llist; + struct llist_head retry_llist; unsigned long check_cq; atomic_t cq_wait_nr; atomic_t cq_timeouts; diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 55e3618b726d..bfa93888f862 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -122,6 +122,7 @@ #define IO_COMPL_BATCH 32 #define IO_REQ_ALLOC_BATCH 8 +#define IO_LOCAL_TW_DEFAULT_MAX 20 struct io_defer_entry { struct list_head list; @@ -1256,6 +1257,8 @@ static void __cold io_move_task_work_from_local(struct io_ring_ctx *ctx) struct llist_node *node = llist_del_all(&ctx->work_llist); __io_fallback_tw(node, false); + node = llist_del_all(&ctx->retry_llist); + __io_fallback_tw(node, false); } static bool io_run_local_work_continue(struct io_ring_ctx *ctx, int events, @@ -1270,37 +1273,55 @@ static bool io_run_local_work_continue(struct io_ring_ctx *ctx, int events, return false; } +static int __io_run_local_work_loop(struct llist_node **node, + struct io_tw_state *ts, + int events) +{ + while (*node) { + struct llist_node *next = (*node)->next; + struct io_kiocb *req = container_of(*node, struct io_kiocb, + io_task_work.node); + INDIRECT_CALL_2(req->io_task_work.func, + io_poll_task_func, io_req_rw_complete, + req, ts); + *node = next; + if (--events <= 0) + break; + } + + return events; +} + static int __io_run_local_work(struct io_ring_ctx *ctx, struct io_tw_state *ts, int min_events) { struct llist_node *node; unsigned int loops = 0; - int ret = 0; + int ret, limit; if (WARN_ON_ONCE(ctx->submitter_task != current)) return -EEXIST; if (ctx->flags & IORING_SETUP_TASKRUN_FLAG) atomic_andnot(IORING_SQ_TASKRUN, &ctx->rings->sq_flags); + limit = max(IO_LOCAL_TW_DEFAULT_MAX, min_events); again: + ret = __io_run_local_work_loop(&ctx->retry_llist.first, ts, limit); + if (ctx->retry_llist.first) + goto retry_done; + /* * llists are in reverse order, flip it back the right way before * running the pending items. */ node = llist_reverse_order(llist_del_all(&ctx->work_llist)); - while (node) { - struct llist_node *next = node->next; - struct io_kiocb *req = container_of(node, struct io_kiocb, - io_task_work.node); - INDIRECT_CALL_2(req->io_task_work.func, - io_poll_task_func, io_req_rw_complete, - req, ts); - ret++; - node = next; - } + ret = __io_run_local_work_loop(&node, ts, ret); + ctx->retry_llist.first = node; loops++; + ret = limit - ret; if (io_run_local_work_continue(ctx, ret, min_events)) goto again; +retry_done: io_submit_flush_completions(ctx); if (io_run_local_work_continue(ctx, ret, min_events)) goto again; diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h index 69eb3b23a5a0..12abee607e4a 100644 --- a/io_uring/io_uring.h +++ b/io_uring/io_uring.h @@ -349,7 +349,7 @@ static inline int io_run_task_work(void) static inline bool io_local_work_pending(struct io_ring_ctx *ctx) { - return !llist_empty(&ctx->work_llist); + return !llist_empty(&ctx->work_llist) || !llist_empty(&ctx->retry_llist); } static inline bool io_task_work_pending(struct io_ring_ctx *ctx) -- 2.51.0 From ee116574de8415b0673c466e6cd28ba5f70c41a2 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 21 Nov 2024 07:12:17 -0700 Subject: [PATCH 13/16] io_uring/nop: ensure nop->fd is always initialized A previous commit added file support for nop, but it only initializes nop->fd if IORING_NOP_FIXED_FILE is set. That check should be IORING_NOP_FILE. Fix up the condition in nop preparation, and initialize it to a sane value even if we're not going to be directly using it. While in there, do the same thing for the nop->buffer field. Reported-by: syzbot+9a8500a45c2cabdf9577@syzkaller.appspotmail.com Fixes: a85f31052bce ("io_uring/nop: add support for testing registered files and buffers") Signed-off-by: Jens Axboe --- io_uring/nop.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/io_uring/nop.c b/io_uring/nop.c index 6d470d4251ee..5e5196df650a 100644 --- a/io_uring/nop.c +++ b/io_uring/nop.c @@ -35,10 +35,14 @@ int io_nop_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) nop->result = READ_ONCE(sqe->len); else nop->result = 0; - if (nop->flags & IORING_NOP_FIXED_FILE) + if (nop->flags & IORING_NOP_FILE) nop->fd = READ_ONCE(sqe->fd); + else + nop->fd = -1; if (nop->flags & IORING_NOP_FIXED_BUFFER) nop->buffer = READ_ONCE(sqe->buf_index); + else + nop->buffer = -1; return 0; } -- 2.51.0 From 0c0a4eae26ac78379d0c1db053de168a8febc6c9 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Tue, 26 Nov 2024 00:34:18 +0000 Subject: [PATCH 14/16] io_uring: check for overflows in io_pin_pages WARNING: CPU: 0 PID: 5834 at io_uring/memmap.c:144 io_pin_pages+0x149/0x180 io_uring/memmap.c:144 CPU: 0 UID: 0 PID: 5834 Comm: syz-executor825 Not tainted 6.12.0-next-20241118-syzkaller #0 Call Trace: __io_uaddr_map+0xfb/0x2d0 io_uring/memmap.c:183 io_rings_map io_uring/io_uring.c:2611 [inline] io_allocate_scq_urings+0x1c0/0x650 io_uring/io_uring.c:3470 io_uring_create+0x5b5/0xc00 io_uring/io_uring.c:3692 io_uring_setup io_uring/io_uring.c:3781 [inline] ... io_pin_pages()'s uaddr parameter came directly from the user and can be garbage. Don't just add size to it as it can overflow. Cc: stable@vger.kernel.org Reported-by: syzbot+2159cbb522b02847c053@syzkaller.appspotmail.com Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/1b7520ddb168e1d537d64be47414a0629d0d8f8f.1732581026.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/memmap.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/io_uring/memmap.c b/io_uring/memmap.c index 3d71756bc598..ea08f19dc648 100644 --- a/io_uring/memmap.c +++ b/io_uring/memmap.c @@ -136,7 +136,12 @@ struct page **io_pin_pages(unsigned long uaddr, unsigned long len, int *npages) struct page **pages; int ret; - end = (uaddr + len + PAGE_SIZE - 1) >> PAGE_SHIFT; + if (check_add_overflow(uaddr, len, &end)) + return ERR_PTR(-EOVERFLOW); + if (check_add_overflow(end, PAGE_SIZE - 1, &end)) + return ERR_PTR(-EOVERFLOW); + + end = end >> PAGE_SHIFT; start = uaddr >> PAGE_SHIFT; nr_pages = end - start; if (WARN_ON_ONCE(!nr_pages)) -- 2.51.0 From 49c5c63d48eb5b110580e4c4b937f0006fcc9b10 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 26 Nov 2024 13:42:27 -0700 Subject: [PATCH 15/16] io_uring: fix task_work cap overshooting A previous commit fixed task_work overrunning by a lot more than what the user asked for, by adding a retry list. However, it didn't cap the overall count, hence for multiple task_work runs inside the same wait loop, it'd still overshoot the target by potentially a large amount. Cap it generally inside the wait path. Note that this will still overshoot the default limit of 20, but should overshoot by no more than limit-1 in addition to the limit. That still provides a ceiling over how much task_work will be run, rather than still having gaps where it was uncapped essentially. Fixes: f46b9cdb22f7 ("io_uring: limit local tw done") Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 34 +++++++++++++++++++--------------- 1 file changed, 19 insertions(+), 15 deletions(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index bfa93888f862..ae199e44da57 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -1277,6 +1277,8 @@ static int __io_run_local_work_loop(struct llist_node **node, struct io_tw_state *ts, int events) { + int ret = 0; + while (*node) { struct llist_node *next = (*node)->next; struct io_kiocb *req = container_of(*node, struct io_kiocb, @@ -1285,27 +1287,27 @@ static int __io_run_local_work_loop(struct llist_node **node, io_poll_task_func, io_req_rw_complete, req, ts); *node = next; - if (--events <= 0) + if (++ret >= events) break; } - return events; + return ret; } static int __io_run_local_work(struct io_ring_ctx *ctx, struct io_tw_state *ts, - int min_events) + int min_events, int max_events) { struct llist_node *node; unsigned int loops = 0; - int ret, limit; + int ret = 0; if (WARN_ON_ONCE(ctx->submitter_task != current)) return -EEXIST; if (ctx->flags & IORING_SETUP_TASKRUN_FLAG) atomic_andnot(IORING_SQ_TASKRUN, &ctx->rings->sq_flags); - limit = max(IO_LOCAL_TW_DEFAULT_MAX, min_events); again: - ret = __io_run_local_work_loop(&ctx->retry_llist.first, ts, limit); + min_events -= ret; + ret = __io_run_local_work_loop(&ctx->retry_llist.first, ts, max_events); if (ctx->retry_llist.first) goto retry_done; @@ -1314,11 +1316,10 @@ again: * running the pending items. */ node = llist_reverse_order(llist_del_all(&ctx->work_llist)); - ret = __io_run_local_work_loop(&node, ts, ret); + ret += __io_run_local_work_loop(&node, ts, max_events - ret); ctx->retry_llist.first = node; loops++; - ret = limit - ret; if (io_run_local_work_continue(ctx, ret, min_events)) goto again; retry_done: @@ -1337,16 +1338,18 @@ static inline int io_run_local_work_locked(struct io_ring_ctx *ctx, if (!io_local_work_pending(ctx)) return 0; - return __io_run_local_work(ctx, &ts, min_events); + return __io_run_local_work(ctx, &ts, min_events, + max(IO_LOCAL_TW_DEFAULT_MAX, min_events)); } -static int io_run_local_work(struct io_ring_ctx *ctx, int min_events) +static int io_run_local_work(struct io_ring_ctx *ctx, int min_events, + int max_events) { struct io_tw_state ts = {}; int ret; mutex_lock(&ctx->uring_lock); - ret = __io_run_local_work(ctx, &ts, min_events); + ret = __io_run_local_work(ctx, &ts, min_events, max_events); mutex_unlock(&ctx->uring_lock); return ret; } @@ -2352,7 +2355,7 @@ int io_run_task_work_sig(struct io_ring_ctx *ctx) { if (io_local_work_pending(ctx)) { __set_current_state(TASK_RUNNING); - if (io_run_local_work(ctx, INT_MAX) > 0) + if (io_run_local_work(ctx, INT_MAX, IO_LOCAL_TW_DEFAULT_MAX) > 0) return 0; } if (io_run_task_work() > 0) @@ -2515,7 +2518,8 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, u32 flags, if (!io_allowed_run_tw(ctx)) return -EEXIST; if (io_local_work_pending(ctx)) - io_run_local_work(ctx, min_events); + io_run_local_work(ctx, min_events, + max(IO_LOCAL_TW_DEFAULT_MAX, min_events)); io_run_task_work(); if (unlikely(test_bit(IO_CHECK_CQ_OVERFLOW_BIT, &ctx->check_cq))) @@ -2586,7 +2590,7 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, u32 flags, * now rather than let the caller do another wait loop. */ if (io_local_work_pending(ctx)) - io_run_local_work(ctx, nr_wait); + io_run_local_work(ctx, nr_wait, nr_wait); io_run_task_work(); /* @@ -3098,7 +3102,7 @@ static __cold bool io_uring_try_cancel_requests(struct io_ring_ctx *ctx, if ((ctx->flags & IORING_SETUP_DEFER_TASKRUN) && io_allowed_defer_tw_run(ctx)) - ret |= io_run_local_work(ctx, INT_MAX) > 0; + ret |= io_run_local_work(ctx, INT_MAX, INT_MAX) > 0; ret |= io_cancel_defer_files(ctx, tctx, cancel_all); mutex_lock(&ctx->uring_lock); ret |= io_poll_remove_all(ctx, tctx, cancel_all); -- 2.51.0 From 43eef70e7e2ac74e7767731dd806720c7fb5e010 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 25 Nov 2024 23:10:31 +0000 Subject: [PATCH 16/16] io_uring: fix corner case forgetting to vunmap io_pages_unmap() is a bit tricky in trying to figure whether the pages were previously vmap'ed or not. In particular If there is juts one page it belives there is no need to vunmap. Paired io_pages_map(), however, could've failed io_mem_alloc_compound() and attempted to io_mem_alloc_single(), which does vmap, and that leads to unpaired vmap. The solution is to fail if io_mem_alloc_compound() can't allocate a single page. That's the easiest way to deal with it, and those two functions are getting removed soon, so no need to overcomplicate it. Cc: stable@vger.kernel.org Fixes: 3ab1db3c6039e ("io_uring: get rid of remap_pfn_range() for mapping rings/sqes") Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/477e75a3907a2fe83249e49c0a92cd480b2c60e0.1732569842.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/memmap.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/io_uring/memmap.c b/io_uring/memmap.c index ea08f19dc648..57de9bccbf50 100644 --- a/io_uring/memmap.c +++ b/io_uring/memmap.c @@ -73,6 +73,8 @@ void *io_pages_map(struct page ***out_pages, unsigned short *npages, ret = io_mem_alloc_compound(pages, nr_pages, size, gfp); if (!IS_ERR(ret)) goto done; + if (nr_pages == 1) + goto fail; ret = io_mem_alloc_single(pages, nr_pages, size, gfp); if (!IS_ERR(ret)) { @@ -81,7 +83,7 @@ done: *npages = nr_pages; return ret; } - +fail: kvfree(pages); *out_pages = NULL; *npages = 0; -- 2.51.0