From 19a94da447f832ee614f8f5532d31c1c70061520 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Fri, 9 May 2025 12:12:51 +0100 Subject: [PATCH 01/16] io_uring: consolidate drain seq checking We check sequences when queuing drained requests as well when flushing them. Instead, always queue and immediately try to flush, so that all seq handling can be kept contained in the flushing code. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/d4651f742e671af5b3216581e539ea5d31bc7125.1746788718.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 45 +++++++++++++++++---------------------------- 1 file changed, 17 insertions(+), 28 deletions(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index f83abdf8a056..3d1f4b2e4536 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -389,17 +389,6 @@ static void io_account_cq_overflow(struct io_ring_ctx *ctx) ctx->cq_extra--; } -static bool req_need_defer(struct io_kiocb *req, u32 seq) -{ - if (unlikely(req->flags & REQ_F_IO_DRAIN)) { - struct io_ring_ctx *ctx = req->ctx; - - return seq + READ_ONCE(ctx->cq_extra) != ctx->cached_cq_tail; - } - - return false; -} - static void io_clean_op(struct io_kiocb *req) { if (unlikely(req->flags & REQ_F_BUFFER_SELECTED)) @@ -566,11 +555,10 @@ static bool io_drain_defer_seq(struct io_kiocb *req, u32 seq) return seq + READ_ONCE(ctx->cq_extra) != ctx->cached_cq_tail; } -static __cold noinline void io_queue_deferred(struct io_ring_ctx *ctx) +static __cold noinline void __io_queue_deferred(struct io_ring_ctx *ctx) { bool drain_seen = false, first = true; - spin_lock(&ctx->completion_lock); while (!list_empty(&ctx->defer_list)) { struct io_defer_entry *de = list_first_entry(&ctx->defer_list, struct io_defer_entry, list); @@ -584,7 +572,12 @@ static __cold noinline void io_queue_deferred(struct io_ring_ctx *ctx) kfree(de); first = false; } - spin_unlock(&ctx->completion_lock); +} + +static __cold noinline void io_queue_deferred(struct io_ring_ctx *ctx) +{ + guard(spinlock)(&ctx->completion_lock); + __io_queue_deferred(ctx); } void __io_commit_cqring_flush(struct io_ring_ctx *ctx) @@ -1671,30 +1664,26 @@ static __cold void io_drain_req(struct io_kiocb *req) __must_hold(&ctx->uring_lock) { struct io_ring_ctx *ctx = req->ctx; + bool drain = req->flags & IOSQE_IO_DRAIN; struct io_defer_entry *de; - u32 seq = io_get_sequence(req); - io_prep_async_link(req); de = kmalloc(sizeof(*de), GFP_KERNEL_ACCOUNT); if (!de) { io_req_defer_failed(req, -ENOMEM); return; } - spin_lock(&ctx->completion_lock); - if (!req_need_defer(req, seq) && list_empty(&ctx->defer_list)) { - spin_unlock(&ctx->completion_lock); - kfree(de); - ctx->drain_active = false; - io_req_task_queue(req); - return; - } - + io_prep_async_link(req); trace_io_uring_defer(req); de->req = req; - de->seq = seq; - list_add_tail(&de->list, &ctx->defer_list); - spin_unlock(&ctx->completion_lock); + de->seq = io_get_sequence(req); + + scoped_guard(spinlock, &ctx->completion_lock) { + list_add_tail(&de->list, &ctx->defer_list); + __io_queue_deferred(ctx); + if (!drain && list_empty(&ctx->defer_list)) + ctx->drain_active = false; + } } static bool io_assign_file(struct io_kiocb *req, const struct io_issue_def *def, -- 2.51.0 From b0c8a6401fbca91da4fe0dc10d61a770f1581e45 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Fri, 9 May 2025 12:12:52 +0100 Subject: [PATCH 02/16] io_uring: open code io_account_cq_overflow() io_account_cq_overflow() doesn't help explaining what's going on in there, and it'll become even smaller with 
following patches, so open code it. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/e4333fa0d371f519e52a71148ebdffed4b8d3aa9.1746788718.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 3d1f4b2e4536..6efecb46c828 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -381,14 +381,6 @@ err: return NULL; } -static void io_account_cq_overflow(struct io_ring_ctx *ctx) -{ - struct io_rings *r = ctx->rings; - - WRITE_ONCE(r->cq_overflow, READ_ONCE(r->cq_overflow) + 1); - ctx->cq_extra--; -} - static void io_clean_op(struct io_kiocb *req) { if (unlikely(req->flags & REQ_F_BUFFER_SELECTED)) @@ -742,12 +734,15 @@ static bool io_cqring_event_overflow(struct io_ring_ctx *ctx, u64 user_data, ocqe = kmalloc(ocq_size, GFP_ATOMIC | __GFP_ACCOUNT); trace_io_uring_cqe_overflow(ctx, user_data, res, cflags, ocqe); if (!ocqe) { + struct io_rings *r = ctx->rings; + /* * If we're in ring overflow flush mode, or in task cancel mode, * or cannot allocate an overflow entry, then we need to drop it * on the floor. */ - io_account_cq_overflow(ctx); + WRITE_ONCE(r->cq_overflow, READ_ONCE(r->cq_overflow) + 1); + ctx->cq_extra--; set_bit(IO_CHECK_CQ_DROPPED_BIT, &ctx->check_cq); return false; } -- 2.51.0 From 63de899cb6220357dea9d0f4e5aa459ff5193bb0 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Fri, 9 May 2025 12:12:53 +0100 Subject: [PATCH 03/16] io_uring: count allocated requests Keep track of the number requests a ring currently has allocated (and not freed), it'll be needed in the next patch. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/c8f8308294dc2a1cb8925d984d937d4fc14ab5d4.1746788718.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- include/linux/io_uring_types.h | 1 + io_uring/io_uring.c | 9 ++++++++- 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h index 7e23e993280e..73b289b48280 100644 --- a/include/linux/io_uring_types.h +++ b/include/linux/io_uring_types.h @@ -435,6 +435,7 @@ struct io_ring_ctx { /* protected by ->completion_lock */ unsigned evfd_last_cq_tail; + unsigned nr_req_allocated; /* * Protection for resize vs mmap races - both the mmap and resize diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 6efecb46c828..714b66ab34b0 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -957,6 +957,8 @@ __cold bool __io_alloc_req_refill(struct io_ring_ctx *ctx) } percpu_ref_get_many(&ctx->refs, ret); + ctx->nr_req_allocated += ret; + while (ret--) { struct io_kiocb *req = reqs[ret]; @@ -2694,8 +2696,10 @@ static void io_req_caches_free(struct io_ring_ctx *ctx) kmem_cache_free(req_cachep, req); nr++; } - if (nr) + if (nr) { + ctx->nr_req_allocated -= nr; percpu_ref_put_many(&ctx->refs, nr); + } mutex_unlock(&ctx->uring_lock); } @@ -2732,6 +2736,9 @@ static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx) percpu_ref_exit(&ctx->refs); free_uid(ctx->user); io_req_caches_free(ctx); + + WARN_ON_ONCE(ctx->nr_req_allocated); + if (ctx->hash_map) io_wq_put_hash(ctx->hash_map); io_napi_free(ctx); -- 2.51.0 From 63166b815dc163b2e46426cecf707dc5923d6d13 Mon Sep 17 00:00:00 2001 From: hexue Date: Mon, 12 May 2025 13:20:25 +0800 Subject: [PATCH 04/16] io_uring/uring_cmd: fix hybrid polling initialization issue Modify the check for whether the timer is initialized during IO transfer when passthrough is used with hybrid 
polling, to ensure that it's always setup correctly. Cc: stable@vger.kernel.org Fixes: 01ee194d1aba ("io_uring: add support for hybrid IOPOLL") Signed-off-by: hexue Link: https://lore.kernel.org/r/20250512052025.293031-1-xue01.he@samsung.com Signed-off-by: Jens Axboe --- io_uring/uring_cmd.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/io_uring/uring_cmd.c b/io_uring/uring_cmd.c index a9ea7d29cdd9..430ed620ddfe 100644 --- a/io_uring/uring_cmd.c +++ b/io_uring/uring_cmd.c @@ -254,6 +254,11 @@ int io_uring_cmd(struct io_kiocb *req, unsigned int issue_flags) return -EOPNOTSUPP; issue_flags |= IO_URING_F_IOPOLL; req->iopoll_completed = 0; + if (ctx->flags & IORING_SETUP_HYBRID_IOPOLL) { + /* make sure every req only blocks once */ + req->flags &= ~REQ_F_IOPOLL_STATE; + req->iopoll_start = ktime_get_ns(); + } } ret = file->f_op->uring_cmd(ioucmd, issue_flags); -- 2.51.0 From 8fb7aee05591fd4d3dca1460448a59e95fa821c3 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Fri, 9 May 2025 12:12:54 +0100 Subject: [PATCH 05/16] io_uring: drain based on allocates reqs Don't rely on CQ sequence numbers for draining, as it has become messy and needs cq_extra adjustments. Instead, base it on the number of allocated requests and only allow flushing when all requests are in the drain list. As a result, cq_extra is gone, no overhead for its accounting in aux cqe posting, less bloating as it was inlined before, and it's in general simpler than trying to track where we should bump it and where it should be put back like in cases of overflow. Also, it'll likely help with cleaning and unifying some of the CQ posting helpers. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/46ece1e34320b046c06fee2498d6b4cd12a700f2.1746788718.git.asml.silence@gmail.com Link: https://lore.kernel.org/r/24497b04b004bceada496033d3c9d09ff8e81ae9.1746944903.git.asml.silence@gmail.com [axboe: fold in fix from link2] Signed-off-by: Jens Axboe --- include/linux/io_uring_types.h | 2 +- io_uring/io_uring.c | 79 ++++++++++++++-------------------- io_uring/io_uring.h | 3 +- 3 files changed, 34 insertions(+), 50 deletions(-) diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h index 73b289b48280..00dbd7cd0e7d 100644 --- a/include/linux/io_uring_types.h +++ b/include/linux/io_uring_types.h @@ -341,7 +341,6 @@ struct io_ring_ctx { unsigned cached_cq_tail; unsigned cq_entries; struct io_ev_fd __rcu *io_ev_fd; - unsigned cq_extra; void *cq_wait_arg; size_t cq_wait_size; @@ -417,6 +416,7 @@ struct io_ring_ctx { struct callback_head poll_wq_task_work; struct list_head defer_list; + unsigned nr_drained; struct io_alloc_cache msg_cache; spinlock_t msg_lock; diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 714b66ab34b0..9a9b8d35349b 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -129,7 +129,6 @@ struct io_defer_entry { struct list_head list; struct io_kiocb *req; - u32 seq; }; /* requests with any of those set should undergo io_disarm_next() */ @@ -149,6 +148,7 @@ static bool io_uring_try_cancel_requests(struct io_ring_ctx *ctx, bool is_sqpoll_thread); static void io_queue_sqe(struct io_kiocb *req); +static void __io_req_caches_free(struct io_ring_ctx *ctx); static __read_mostly DEFINE_STATIC_KEY_FALSE(io_key_has_sqarray); @@ -540,46 +540,45 @@ void io_req_queue_iowq(struct io_kiocb *req) io_req_task_work_add(req); } -static bool io_drain_defer_seq(struct io_kiocb *req, u32 seq) +static unsigned io_linked_nr(struct io_kiocb *req) { - struct io_ring_ctx *ctx = req->ctx; + struct 
io_kiocb *tmp; + unsigned nr = 0; - return seq + READ_ONCE(ctx->cq_extra) != ctx->cached_cq_tail; + io_for_each_link(tmp, req) + nr++; + return nr; } -static __cold noinline void __io_queue_deferred(struct io_ring_ctx *ctx) +static __cold noinline void io_queue_deferred(struct io_ring_ctx *ctx) { bool drain_seen = false, first = true; + lockdep_assert_held(&ctx->uring_lock); + __io_req_caches_free(ctx); + while (!list_empty(&ctx->defer_list)) { struct io_defer_entry *de = list_first_entry(&ctx->defer_list, struct io_defer_entry, list); drain_seen |= de->req->flags & REQ_F_IO_DRAIN; - if ((drain_seen || first) && io_drain_defer_seq(de->req, de->seq)) - break; + if ((drain_seen || first) && ctx->nr_req_allocated != ctx->nr_drained) + return; list_del_init(&de->list); + ctx->nr_drained -= io_linked_nr(de->req); io_req_task_queue(de->req); kfree(de); first = false; } } -static __cold noinline void io_queue_deferred(struct io_ring_ctx *ctx) -{ - guard(spinlock)(&ctx->completion_lock); - __io_queue_deferred(ctx); -} - void __io_commit_cqring_flush(struct io_ring_ctx *ctx) { if (ctx->poll_activated) io_poll_wq_wake(ctx); if (ctx->off_timeout_used) io_flush_timeouts(ctx); - if (ctx->drain_active) - io_queue_deferred(ctx); if (ctx->has_evfd) io_eventfd_signal(ctx, true); } @@ -742,7 +741,6 @@ static bool io_cqring_event_overflow(struct io_ring_ctx *ctx, u64 user_data, * on the floor. */ WRITE_ONCE(r->cq_overflow, READ_ONCE(r->cq_overflow) + 1); - ctx->cq_extra--; set_bit(IO_CHECK_CQ_DROPPED_BIT, &ctx->check_cq); return false; } @@ -812,8 +810,6 @@ static bool io_fill_cqe_aux(struct io_ring_ctx *ctx, u64 user_data, s32 res, { struct io_uring_cqe *cqe; - ctx->cq_extra++; - if (likely(io_get_cqe(ctx, &cqe))) { WRITE_ONCE(cqe->user_data, user_data); WRITE_ONCE(cqe->res, res); @@ -1459,6 +1455,10 @@ void __io_submit_flush_completions(struct io_ring_ctx *ctx) io_free_batch_list(ctx, state->compl_reqs.first); INIT_WQ_LIST(&state->compl_reqs); } + + if (unlikely(ctx->drain_active)) + io_queue_deferred(ctx); + ctx->submit_state.cq_flush = false; } @@ -1646,17 +1646,6 @@ io_req_flags_t io_file_get_flags(struct file *file) return res; } -static u32 io_get_sequence(struct io_kiocb *req) -{ - u32 seq = req->ctx->cached_sq_head; - struct io_kiocb *cur; - - /* need original cached_sq_head, but it was increased for each req */ - io_for_each_link(cur, req) - seq--; - return seq; -} - static __cold void io_drain_req(struct io_kiocb *req) __must_hold(&ctx->uring_lock) { @@ -1673,14 +1662,12 @@ static __cold void io_drain_req(struct io_kiocb *req) io_prep_async_link(req); trace_io_uring_defer(req); de->req = req; - de->seq = io_get_sequence(req); - scoped_guard(spinlock, &ctx->completion_lock) { - list_add_tail(&de->list, &ctx->defer_list); - __io_queue_deferred(ctx); - if (!drain && list_empty(&ctx->defer_list)) - ctx->drain_active = false; - } + ctx->nr_drained += io_linked_nr(req); + list_add_tail(&de->list, &ctx->defer_list); + io_queue_deferred(ctx); + if (!drain && list_empty(&ctx->defer_list)) + ctx->drain_active = false; } static bool io_assign_file(struct io_kiocb *req, const struct io_issue_def *def, @@ -2263,10 +2250,6 @@ static bool io_get_sqe(struct io_ring_ctx *ctx, const struct io_uring_sqe **sqe) (!(ctx->flags & IORING_SETUP_NO_SQARRAY))) { head = READ_ONCE(ctx->sq_array[head]); if (unlikely(head >= ctx->sq_entries)) { - /* drop invalid entries */ - spin_lock(&ctx->completion_lock); - ctx->cq_extra--; - spin_unlock(&ctx->completion_lock); WRITE_ONCE(ctx->rings->sq_dropped, 
READ_ONCE(ctx->rings->sq_dropped) + 1); return false; @@ -2684,13 +2667,11 @@ unsigned long rings_size(unsigned int flags, unsigned int sq_entries, return off; } -static void io_req_caches_free(struct io_ring_ctx *ctx) +static __cold void __io_req_caches_free(struct io_ring_ctx *ctx) { struct io_kiocb *req; int nr = 0; - mutex_lock(&ctx->uring_lock); - while (!io_req_cache_empty(ctx)) { req = io_extract_req(ctx); kmem_cache_free(req_cachep, req); @@ -2700,7 +2681,12 @@ static void io_req_caches_free(struct io_ring_ctx *ctx) ctx->nr_req_allocated -= nr; percpu_ref_put_many(&ctx->refs, nr); } - mutex_unlock(&ctx->uring_lock); +} + +static __cold void io_req_caches_free(struct io_ring_ctx *ctx) +{ + guard(mutex)(&ctx->uring_lock); + __io_req_caches_free(ctx); } static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx) @@ -3005,20 +2991,19 @@ static __cold bool io_cancel_defer_files(struct io_ring_ctx *ctx, struct io_defer_entry *de; LIST_HEAD(list); - spin_lock(&ctx->completion_lock); list_for_each_entry_reverse(de, &ctx->defer_list, list) { if (io_match_task_safe(de->req, tctx, cancel_all)) { list_cut_position(&list, &ctx->defer_list, &de->list); break; } } - spin_unlock(&ctx->completion_lock); if (list_empty(&list)) return false; while (!list_empty(&list)) { de = list_first_entry(&list, struct io_defer_entry, list); list_del_init(&de->list); + ctx->nr_drained -= io_linked_nr(de->req); io_req_task_queue_fail(de->req, -ECANCELED); kfree(de); } @@ -3093,8 +3078,8 @@ static __cold bool io_uring_try_cancel_requests(struct io_ring_ctx *ctx, if ((ctx->flags & IORING_SETUP_DEFER_TASKRUN) && io_allowed_defer_tw_run(ctx)) ret |= io_run_local_work(ctx, INT_MAX, INT_MAX) > 0; - ret |= io_cancel_defer_files(ctx, tctx, cancel_all); mutex_lock(&ctx->uring_lock); + ret |= io_cancel_defer_files(ctx, tctx, cancel_all); ret |= io_poll_remove_all(ctx, tctx, cancel_all); ret |= io_waitid_remove_all(ctx, tctx, cancel_all); ret |= io_futex_remove_all(ctx, tctx, cancel_all); diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h index e4050b2d0821..81f22196a57d 100644 --- a/io_uring/io_uring.h +++ b/io_uring/io_uring.h @@ -196,7 +196,6 @@ static inline bool io_defer_get_uncommited_cqe(struct io_ring_ctx *ctx, { io_lockdep_assert_cq_locked(ctx); - ctx->cq_extra++; ctx->submit_state.cq_flush = true; return io_get_cqe(ctx, cqe_ret); } @@ -414,7 +413,7 @@ static inline void io_req_complete_defer(struct io_kiocb *req) static inline void io_commit_cqring_flush(struct io_ring_ctx *ctx) { - if (unlikely(ctx->off_timeout_used || ctx->drain_active || + if (unlikely(ctx->off_timeout_used || ctx->has_evfd || ctx->poll_activated)) __io_commit_cqring_flush(ctx); } -- 2.51.0 From f446c6311e86618a1f81eb576b56a6266307238f Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 12 May 2025 09:06:06 -0600 Subject: [PATCH 06/16] io_uring/memmap: don't use page_address() on a highmem page For older/32-bit systems with highmem, don't assume that the pages in a mapped region are always going to be mapped. If io_region_init_ptr() finds that the pages are coalescable, also check if the first page is a HighMem page or not. If it is, fall through to the usual vmap() mapping rather than attempt to get the unmapped page address. 
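For illustration, a minimal sketch of the resulting mapping decision using the stock mm helpers; the function below is a simplified stand-in (it checks pages rather than folios), not the actual io_region_init_ptr():

#include <linux/highmem.h>
#include <linux/mm.h>
#include <linux/vmalloc.h>

/* Sketch: direct-address a single lowmem page, vmap() everything else. */
static void *demo_map_region(struct page **pages, unsigned int nr_pages)
{
	if (nr_pages == 1 && !PageHighMem(pages[0]))
		return page_address(pages[0]);

	/* Highmem or multi-page regions need an explicit kernel mapping. */
	return vmap(pages, nr_pages, VM_MAP, PAGE_KERNEL);
}

The point of the fix is the PageHighMem() test: page_address() is only valid for pages that are permanently mapped into the kernel's linear map.
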
Cc: stable@vger.kernel.org Fixes: c4d0ac1c1567 ("io_uring/memmap: optimise single folio regions") Link: https://lore.kernel.org/all/681fe2fb.050a0220.f2294.001a.GAE@google.com/ Reported-by: syzbot+5b8c4abafcb1d791ccfc@syzkaller.appspotmail.com Link: https://lore.kernel.org/all/681fed0a.050a0220.f2294.001c.GAE@google.com/ Reported-by: syzbot+6456a99dfdc2e78c4feb@syzkaller.appspotmail.com Tested-by: syzbot+6456a99dfdc2e78c4feb@syzkaller.appspotmail.com Signed-off-by: Jens Axboe --- io_uring/memmap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/io_uring/memmap.c b/io_uring/memmap.c index 76fcc79656b0..07f8a5cbd37e 100644 --- a/io_uring/memmap.c +++ b/io_uring/memmap.c @@ -116,7 +116,7 @@ static int io_region_init_ptr(struct io_mapped_region *mr) void *ptr; if (io_check_coalesce_buffer(mr->pages, mr->nr_pages, &ifd)) { - if (ifd.nr_folios == 1) { + if (ifd.nr_folios == 1 && !PageHighMem(mr->pages[0])) { mr->ptr = page_address(mr->pages[0]); return 0; } -- 2.51.0 From 475a8d30371604a6363da8e304a608a5959afc40 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Tue, 13 May 2025 18:26:46 +0100 Subject: [PATCH 07/16] io_uring/kbuf: account ring io_buffer_list memory Follow the non-ringed pbuf struct io_buffer_list allocations and account it against the memcg. There is low chance of that being an actual problem as ring provided buffer should either pin user memory or allocate it, which is already accounted. Cc: stable@vger.kernel.org # 6.1 Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/3985218b50d341273cafff7234e1a7e6d0db9808.1747150490.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/kbuf.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/io_uring/kbuf.c b/io_uring/kbuf.c index 1cf0d2c01287..446207db1edf 100644 --- a/io_uring/kbuf.c +++ b/io_uring/kbuf.c @@ -621,7 +621,7 @@ int io_register_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg) io_destroy_bl(ctx, bl); } - free_bl = bl = kzalloc(sizeof(*bl), GFP_KERNEL); + free_bl = bl = kzalloc(sizeof(*bl), GFP_KERNEL_ACCOUNT); if (!bl) return -ENOMEM; -- 2.51.0 From 1724849072854a66861d461b298b04612702d685 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Tue, 13 May 2025 18:26:47 +0100 Subject: [PATCH 08/16] io_uring/kbuf: use mem_is_zero() Make use of mem_is_zero() for reserved fields checking. 
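For illustration, a self-contained sketch of what the conversion looks like; the struct and function below are made-up examples, only mem_is_zero() itself is the real helper from <linux/string.h>:

#include <linux/errno.h>
#include <linux/string.h>
#include <linux/types.h>

struct demo_reg {		/* illustrative uAPI-style registration struct */
	__u64	ring_addr;
	__u32	flags;
	__u32	resv[3];	/* reserved, must be zero */
};

static int demo_validate_reg(const struct demo_reg *reg)
{
	/* replaces open-coded reg->resv[0] || reg->resv[1] || ... checks */
	if (!mem_is_zero(reg->resv, sizeof(reg->resv)))
		return -EINVAL;
	return 0;
}
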
Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/11fe27b7a831329bcdb4ea087317ef123ba7c171.1747150490.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/kbuf.c | 14 ++++---------- 1 file changed, 4 insertions(+), 10 deletions(-) diff --git a/io_uring/kbuf.c b/io_uring/kbuf.c index 446207db1edf..344517d1d921 100644 --- a/io_uring/kbuf.c +++ b/io_uring/kbuf.c @@ -602,8 +602,7 @@ int io_register_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg) if (copy_from_user(®, arg, sizeof(reg))) return -EFAULT; - - if (reg.resv[0] || reg.resv[1] || reg.resv[2]) + if (!mem_is_zero(reg.resv, sizeof(reg.resv))) return -EINVAL; if (reg.flags & ~(IOU_PBUF_RING_MMAP | IOU_PBUF_RING_INC)) return -EINVAL; @@ -679,9 +678,7 @@ int io_unregister_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg) if (copy_from_user(®, arg, sizeof(reg))) return -EFAULT; - if (reg.resv[0] || reg.resv[1] || reg.resv[2]) - return -EINVAL; - if (reg.flags) + if (!mem_is_zero(reg.resv, sizeof(reg.resv)) || reg.flags) return -EINVAL; bl = io_buffer_get_list(ctx, reg.bgid); @@ -701,14 +698,11 @@ int io_register_pbuf_status(struct io_ring_ctx *ctx, void __user *arg) { struct io_uring_buf_status buf_status; struct io_buffer_list *bl; - int i; if (copy_from_user(&buf_status, arg, sizeof(buf_status))) return -EFAULT; - - for (i = 0; i < ARRAY_SIZE(buf_status.resv); i++) - if (buf_status.resv[i]) - return -EINVAL; + if (!mem_is_zero(buf_status.resv, sizeof(buf_status.resv))) + return -EINVAL; bl = io_buffer_get_list(ctx, buf_status.buf_group); if (!bl) -- 2.51.0 From 4e9fda29d66b06caf5c81b8acbe0a504effc73fb Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Tue, 13 May 2025 18:26:48 +0100 Subject: [PATCH 09/16] io_uring/kbuf: drop extra vars in io_register_pbuf_ring bl and free_bl variables in io_register_pbuf_ring() always point to the same list since we started to reallocate the pre-existent list. Drop free_bl. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/d45c3342d74c9030f99376c777a4b3d59089074d.1747150490.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/kbuf.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/io_uring/kbuf.c b/io_uring/kbuf.c index 344517d1d921..406e8a9b42c3 100644 --- a/io_uring/kbuf.c +++ b/io_uring/kbuf.c @@ -591,7 +591,7 @@ err: int io_register_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg) { struct io_uring_buf_reg reg; - struct io_buffer_list *bl, *free_bl = NULL; + struct io_buffer_list *bl; struct io_uring_region_desc rd; struct io_uring_buf_ring *br; unsigned long mmap_offset; @@ -620,7 +620,7 @@ int io_register_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg) io_destroy_bl(ctx, bl); } - free_bl = bl = kzalloc(sizeof(*bl), GFP_KERNEL_ACCOUNT); + bl = kzalloc(sizeof(*bl), GFP_KERNEL_ACCOUNT); if (!bl) return -ENOMEM; @@ -665,7 +665,7 @@ int io_register_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg) return 0; fail: io_free_region(ctx, &bl->region); - kfree(free_bl); + kfree(bl); return ret; } -- 2.51.0 From 52a05d0cf8f3b4569c525153132a90661c32fe11 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Tue, 13 May 2025 18:26:49 +0100 Subject: [PATCH 10/16] io_uring/kbuf: don't compute size twice on prep The size in prep is calculated by io_provide_buffers_prep(), so remove the recomputation a few lines after. 
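The check that remains relies on the overflow-safe helpers; a standalone sketch of that validation pattern is below (demo_* names are illustrative and the real prep code differs slightly in how it forms the size):

#include <linux/errno.h>
#include <linux/kernel.h>
#include <linux/overflow.h>
#include <linux/types.h>
#include <linux/uaccess.h>

/* Sketch: reject ranges whose size or end address would overflow. */
static int demo_check_user_range(u64 addr, u32 len, u32 nbufs)
{
	unsigned long size, end;

	if (check_mul_overflow((unsigned long)len, (unsigned long)nbufs, &size))
		return -EOVERFLOW;
	if (check_add_overflow((unsigned long)addr, size, &end))
		return -EOVERFLOW;
	if (!access_ok(u64_to_user_ptr(addr), size))
		return -EFAULT;
	return 0;
}
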
Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/7c97206561b74fce245cb22449c6082d2e066844.1747150490.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/kbuf.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/io_uring/kbuf.c b/io_uring/kbuf.c index 406e8a9b42c3..eb666c02f488 100644 --- a/io_uring/kbuf.c +++ b/io_uring/kbuf.c @@ -509,8 +509,6 @@ int io_provide_buffers_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe return -EOVERFLOW; if (check_add_overflow((unsigned long)p->addr, size, &tmp_check)) return -EOVERFLOW; - - size = (unsigned long)p->len * p->nbufs; if (!access_ok(u64_to_user_ptr(p->addr), size)) return -EFAULT; -- 2.51.0 From c724e801239ffc3714afe65cf6e721ddd04199d0 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Tue, 13 May 2025 18:26:50 +0100 Subject: [PATCH 11/16] io_uring/kbuf: refactor __io_remove_buffers __io_remove_buffers used for two purposes, the first is removing buffers for non ring based lists, which implies that it can be called multiple times for the same list. And the second is for destroying lists, which is not perfectly reentrable for ring based lists. It's confusing, so just have a helper for the legacy pbuf buffer removal, make sure it's not called for ring pbuf, and open code all ring pbuf destruction into io_put_bl(). Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/0ae416b099d311ad23f285cea02f2c94c8ae9a6c.1747150490.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/kbuf.c | 38 +++++++++++++------------------------- 1 file changed, 13 insertions(+), 25 deletions(-) diff --git a/io_uring/kbuf.c b/io_uring/kbuf.c index eb666c02f488..df8aeb42e910 100644 --- a/io_uring/kbuf.c +++ b/io_uring/kbuf.c @@ -376,45 +376,33 @@ unsigned int __io_put_kbufs(struct io_kiocb *req, int len, int nbufs) return ret; } -static int __io_remove_buffers(struct io_ring_ctx *ctx, - struct io_buffer_list *bl, unsigned nbufs) +static int io_remove_buffers_legacy(struct io_ring_ctx *ctx, + struct io_buffer_list *bl, + unsigned long nbufs) { - unsigned i = 0; - - /* shouldn't happen */ - if (!nbufs) - return 0; - - if (bl->flags & IOBL_BUF_RING) { - i = bl->buf_ring->tail - bl->head; - io_free_region(ctx, &bl->region); - /* make sure it's seen as empty */ - INIT_LIST_HEAD(&bl->buf_list); - bl->flags &= ~IOBL_BUF_RING; - return i; - } + unsigned long i = 0; + struct io_buffer *nxt; /* protects io_buffers_cache */ lockdep_assert_held(&ctx->uring_lock); + WARN_ON_ONCE(bl->flags & IOBL_BUF_RING); - while (!list_empty(&bl->buf_list)) { - struct io_buffer *nxt; - + for (i = 0; i < nbufs && !list_empty(&bl->buf_list); i++) { nxt = list_first_entry(&bl->buf_list, struct io_buffer, list); list_del(&nxt->list); kfree(nxt); - - if (++i == nbufs) - return i; cond_resched(); } - return i; } static void io_put_bl(struct io_ring_ctx *ctx, struct io_buffer_list *bl) { - __io_remove_buffers(ctx, bl, -1U); + if (bl->flags & IOBL_BUF_RING) + io_free_region(ctx, &bl->region); + else + io_remove_buffers_legacy(ctx, bl, -1U); + kfree(bl); } @@ -477,7 +465,7 @@ int io_remove_buffers(struct io_kiocb *req, unsigned int issue_flags) ret = -EINVAL; /* can't use provide/remove buffers command on mapped buffers */ if (!(bl->flags & IOBL_BUF_RING)) - ret = __io_remove_buffers(ctx, bl, p->nbufs); + ret = io_remove_buffers_legacy(ctx, bl, p->nbufs); } io_ring_submit_unlock(ctx, issue_flags); if (ret < 0) -- 2.51.0 From 2b61bb1d9aa601ec393054a61be0a707a5bea928 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Tue, 13 May 2025 18:26:51 +0100 
Subject: [PATCH 12/16] io_uring/kbuf: unify legacy buf provision and removal Combine IORING_OP_PROVIDE_BUFFERS and IORING_OP_REMOVE_BUFFERS ->issue(), so that we can deduplicate ring locking and list lookups. This way we further reduce code for legacy provided buffers. Locking is also separated from buffer related handling, which makes it a bit simpler with label jumps. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/f61af131622ad4337c2fb9f7c453d5b0102c7b90.1747150490.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/kbuf.c | 73 +++++++++++++++++++----------------------------- io_uring/kbuf.h | 4 +-- io_uring/opdef.c | 4 +-- 3 files changed, 31 insertions(+), 50 deletions(-) diff --git a/io_uring/kbuf.c b/io_uring/kbuf.c index df8aeb42e910..823e7eb15fb2 100644 --- a/io_uring/kbuf.c +++ b/io_uring/kbuf.c @@ -450,30 +450,6 @@ int io_remove_buffers_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) return 0; } -int io_remove_buffers(struct io_kiocb *req, unsigned int issue_flags) -{ - struct io_provide_buf *p = io_kiocb_to_cmd(req, struct io_provide_buf); - struct io_ring_ctx *ctx = req->ctx; - struct io_buffer_list *bl; - int ret = 0; - - io_ring_submit_lock(ctx, issue_flags); - - ret = -ENOENT; - bl = io_buffer_get_list(ctx, p->bgid); - if (bl) { - ret = -EINVAL; - /* can't use provide/remove buffers command on mapped buffers */ - if (!(bl->flags & IOBL_BUF_RING)) - ret = io_remove_buffers_legacy(ctx, bl, p->nbufs); - } - io_ring_submit_unlock(ctx, issue_flags); - if (ret < 0) - req_set_fail(req); - io_req_set_res(req, ret, 0); - return IOU_OK; -} - int io_provide_buffers_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { unsigned long size, tmp_check; @@ -535,37 +511,44 @@ static int io_add_buffers(struct io_ring_ctx *ctx, struct io_provide_buf *pbuf, return i ? 
0 : -ENOMEM; } -int io_provide_buffers(struct io_kiocb *req, unsigned int issue_flags) +static int __io_manage_buffers_legacy(struct io_kiocb *req, + struct io_buffer_list *bl) { struct io_provide_buf *p = io_kiocb_to_cmd(req, struct io_provide_buf); - struct io_ring_ctx *ctx = req->ctx; - struct io_buffer_list *bl; - int ret = 0; - - io_ring_submit_lock(ctx, issue_flags); + int ret; - bl = io_buffer_get_list(ctx, p->bgid); - if (unlikely(!bl)) { + if (!bl) { + if (req->opcode != IORING_OP_PROVIDE_BUFFERS) + return -ENOENT; bl = kzalloc(sizeof(*bl), GFP_KERNEL_ACCOUNT); - if (!bl) { - ret = -ENOMEM; - goto err; - } + if (!bl) + return -ENOMEM; + INIT_LIST_HEAD(&bl->buf_list); - ret = io_buffer_add_list(ctx, bl, p->bgid); + ret = io_buffer_add_list(req->ctx, bl, p->bgid); if (ret) { kfree(bl); - goto err; + return ret; } } - /* can't add buffers via this command for a mapped buffer ring */ - if (bl->flags & IOBL_BUF_RING) { - ret = -EINVAL; - goto err; - } + /* can't use provide/remove buffers command on mapped buffers */ + if (bl->flags & IOBL_BUF_RING) + return -EINVAL; + if (req->opcode == IORING_OP_PROVIDE_BUFFERS) + return io_add_buffers(req->ctx, p, bl); + return io_remove_buffers_legacy(req->ctx, bl, p->nbufs); +} + +int io_manage_buffers_legacy(struct io_kiocb *req, unsigned int issue_flags) +{ + struct io_provide_buf *p = io_kiocb_to_cmd(req, struct io_provide_buf); + struct io_ring_ctx *ctx = req->ctx; + struct io_buffer_list *bl; + int ret; - ret = io_add_buffers(ctx, p, bl); -err: + io_ring_submit_lock(ctx, issue_flags); + bl = io_buffer_get_list(ctx, p->bgid); + ret = __io_manage_buffers_legacy(req, bl); io_ring_submit_unlock(ctx, issue_flags); if (ret < 0) diff --git a/io_uring/kbuf.h b/io_uring/kbuf.h index 0798a732e6cb..4d2c209d1a41 100644 --- a/io_uring/kbuf.h +++ b/io_uring/kbuf.h @@ -66,10 +66,8 @@ int io_buffers_peek(struct io_kiocb *req, struct buf_sel_arg *arg); void io_destroy_buffers(struct io_ring_ctx *ctx); int io_remove_buffers_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); -int io_remove_buffers(struct io_kiocb *req, unsigned int issue_flags); - int io_provide_buffers_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); -int io_provide_buffers(struct io_kiocb *req, unsigned int issue_flags); +int io_manage_buffers_legacy(struct io_kiocb *req, unsigned int issue_flags); int io_register_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg); int io_unregister_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg); diff --git a/io_uring/opdef.c b/io_uring/opdef.c index db36433c2294..6e0882b051f9 100644 --- a/io_uring/opdef.c +++ b/io_uring/opdef.c @@ -333,13 +333,13 @@ const struct io_issue_def io_issue_defs[] = { .audit_skip = 1, .iopoll = 1, .prep = io_provide_buffers_prep, - .issue = io_provide_buffers, + .issue = io_manage_buffers_legacy, }, [IORING_OP_REMOVE_BUFFERS] = { .audit_skip = 1, .iopoll = 1, .prep = io_remove_buffers_prep, - .issue = io_remove_buffers, + .issue = io_manage_buffers_legacy, }, [IORING_OP_TEE] = { .needs_file = 1, -- 2.51.0 From d871198ee431d90f5308d53998c1ba1d5db5619a Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 13 May 2025 15:02:23 -0600 Subject: [PATCH 13/16] io_uring/fdinfo: grab ctx->uring_lock around io_uring_show_fdinfo() Not everything requires locking in there, which is why the 'has_lock' variable exists. But enough does that it's a bit unwieldy to manage. Wrap the whole thing in a ->uring_lock trylock, and just return with no output if we fail to grab it. 
The existing trylock() will already have greatly diminished utility/output for the failure case. This fixes an issue with reading the SQE fields, if the ring is being actively resized at the same time. Reported-by: Jann Horn Fixes: 79cfe9e59c2a ("io_uring/register: add IORING_REGISTER_RESIZE_RINGS") Signed-off-by: Jens Axboe --- io_uring/fdinfo.c | 48 ++++++++++++++++++++++++----------------------- 1 file changed, 25 insertions(+), 23 deletions(-) diff --git a/io_uring/fdinfo.c b/io_uring/fdinfo.c index 9414ca6d101c..e0d6a59a89fa 100644 --- a/io_uring/fdinfo.c +++ b/io_uring/fdinfo.c @@ -86,13 +86,8 @@ static inline void napi_show_fdinfo(struct io_ring_ctx *ctx, } #endif -/* - * Caller holds a reference to the file already, we don't need to do - * anything else to get an extra reference. - */ -__cold void io_uring_show_fdinfo(struct seq_file *m, struct file *file) +static void __io_uring_show_fdinfo(struct io_ring_ctx *ctx, struct seq_file *m) { - struct io_ring_ctx *ctx = file->private_data; struct io_overflow_cqe *ocqe; struct io_rings *r = ctx->rings; struct rusage sq_usage; @@ -106,7 +101,6 @@ __cold void io_uring_show_fdinfo(struct seq_file *m, struct file *file) unsigned int sq_entries, cq_entries; int sq_pid = -1, sq_cpu = -1; u64 sq_total_time = 0, sq_work_time = 0; - bool has_lock; unsigned int i; if (ctx->flags & IORING_SETUP_CQE32) @@ -176,15 +170,7 @@ __cold void io_uring_show_fdinfo(struct seq_file *m, struct file *file) seq_printf(m, "\n"); } - /* - * Avoid ABBA deadlock between the seq lock and the io_uring mutex, - * since fdinfo case grabs it in the opposite direction of normal use - * cases. If we fail to get the lock, we just don't iterate any - * structures that could be going away outside the io_uring mutex. - */ - has_lock = mutex_trylock(&ctx->uring_lock); - - if (has_lock && (ctx->flags & IORING_SETUP_SQPOLL)) { + if (ctx->flags & IORING_SETUP_SQPOLL) { struct io_sq_data *sq = ctx->sq_data; /* @@ -206,7 +192,7 @@ __cold void io_uring_show_fdinfo(struct seq_file *m, struct file *file) seq_printf(m, "SqTotalTime:\t%llu\n", sq_total_time); seq_printf(m, "SqWorkTime:\t%llu\n", sq_work_time); seq_printf(m, "UserFiles:\t%u\n", ctx->file_table.data.nr); - for (i = 0; has_lock && i < ctx->file_table.data.nr; i++) { + for (i = 0; i < ctx->file_table.data.nr; i++) { struct file *f = NULL; if (ctx->file_table.data.nodes[i]) @@ -218,7 +204,7 @@ __cold void io_uring_show_fdinfo(struct seq_file *m, struct file *file) } } seq_printf(m, "UserBufs:\t%u\n", ctx->buf_table.nr); - for (i = 0; has_lock && i < ctx->buf_table.nr; i++) { + for (i = 0; i < ctx->buf_table.nr; i++) { struct io_mapped_ubuf *buf = NULL; if (ctx->buf_table.nodes[i]) @@ -228,7 +214,7 @@ __cold void io_uring_show_fdinfo(struct seq_file *m, struct file *file) else seq_printf(m, "%5u: \n", i); } - if (has_lock && !xa_empty(&ctx->personalities)) { + if (!xa_empty(&ctx->personalities)) { unsigned long index; const struct cred *cred; @@ -238,7 +224,7 @@ __cold void io_uring_show_fdinfo(struct seq_file *m, struct file *file) } seq_puts(m, "PollList:\n"); - for (i = 0; has_lock && i < (1U << ctx->cancel_table.hash_bits); i++) { + for (i = 0; i < (1U << ctx->cancel_table.hash_bits); i++) { struct io_hash_bucket *hb = &ctx->cancel_table.hbs[i]; struct io_kiocb *req; @@ -247,9 +233,6 @@ __cold void io_uring_show_fdinfo(struct seq_file *m, struct file *file) task_work_pending(req->tctx->task)); } - if (has_lock) - mutex_unlock(&ctx->uring_lock); - seq_puts(m, "CqOverflowList:\n"); spin_lock(&ctx->completion_lock); 
list_for_each_entry(ocqe, &ctx->cq_overflow_list, list) { @@ -262,4 +245,23 @@ __cold void io_uring_show_fdinfo(struct seq_file *m, struct file *file) spin_unlock(&ctx->completion_lock); napi_show_fdinfo(ctx, m); } + +/* + * Caller holds a reference to the file already, we don't need to do + * anything else to get an extra reference. + */ +__cold void io_uring_show_fdinfo(struct seq_file *m, struct file *file) +{ + struct io_ring_ctx *ctx = file->private_data; + + /* + * Avoid ABBA deadlock between the seq lock and the io_uring mutex, + * since fdinfo case grabs it in the opposite direction of normal use + * cases. + */ + if (mutex_trylock(&ctx->uring_lock)) { + __io_uring_show_fdinfo(ctx, m); + mutex_unlock(&ctx->uring_lock); + } +} #endif -- 2.51.0 From 9a109266278f200ae0b64508273fea3db8af7a9e Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 16 May 2025 12:10:12 -0600 Subject: [PATCH 14/16] io_uring/fdinfo: only compile if CONFIG_PROC_FS is set Rather than wrap fdinfo.c in one big if, handle it on the Makefile side instead. io_uring.c already conditionally sets fops->fdinfo() anyway. Signed-off-by: Jens Axboe --- io_uring/Makefile | 3 ++- io_uring/fdinfo.c | 2 -- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/io_uring/Makefile b/io_uring/Makefile index 11a739927a62..d97c6b51d584 100644 --- a/io_uring/Makefile +++ b/io_uring/Makefile @@ -11,7 +11,7 @@ obj-$(CONFIG_IO_URING) += io_uring.o opdef.o kbuf.o rsrc.o notif.o \ eventfd.o uring_cmd.o openclose.o \ sqpoll.o xattr.o nop.o fs.o splice.o \ sync.o msg_ring.o advise.o openclose.o \ - statx.o timeout.o fdinfo.o cancel.o \ + statx.o timeout.o cancel.o \ waitid.o register.o truncate.o \ memmap.o alloc_cache.o obj-$(CONFIG_IO_URING_ZCRX) += zcrx.o @@ -20,3 +20,4 @@ obj-$(CONFIG_FUTEX) += futex.o obj-$(CONFIG_EPOLL) += epoll.o obj-$(CONFIG_NET_RX_BUSY_POLL) += napi.o obj-$(CONFIG_NET) += net.o cmd_net.o +obj-$(CONFIG_PROC_FS) += fdinfo.o diff --git a/io_uring/fdinfo.c b/io_uring/fdinfo.c index e0d6a59a89fa..b83296eee5f8 100644 --- a/io_uring/fdinfo.c +++ b/io_uring/fdinfo.c @@ -15,7 +15,6 @@ #include "cancel.h" #include "rsrc.h" -#ifdef CONFIG_PROC_FS static __cold int io_uring_show_cred(struct seq_file *m, unsigned int id, const struct cred *cred) { @@ -264,4 +263,3 @@ __cold void io_uring_show_fdinfo(struct seq_file *m, struct file *file) mutex_unlock(&ctx->uring_lock); } } -#endif -- 2.51.0 From 16256648cd0877aed9ede41d5d4ad3c1d65d9b2b Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 16 May 2025 12:33:28 -0600 Subject: [PATCH 15/16] io_uring/fdinfo: get rid of dumping credentials It's a faily obscure feature, and registered credentials would for that mostly be a static thing. Don't bother including code to dump the personalities indices. 
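Relatedly, the Makefile-side CONFIG_PROC_FS handling in patch 14 above works because the fdinfo hook itself is only wired up when procfs is built in; a generic sketch of that pattern (demo_* names are illustrative, the real io_uring fops has more members):

#include <linux/fs.h>
#include <linux/module.h>
#include <linux/seq_file.h>

#ifdef CONFIG_PROC_FS
static void demo_show_fdinfo(struct seq_file *m, struct file *f)
{
	seq_puts(m, "DemoState:\tidle\n");
}
#endif

static const struct file_operations demo_fops = {
	.owner		= THIS_MODULE,
#ifdef CONFIG_PROC_FS
	/* only referenced when procfs (and thus fdinfo) exists */
	.show_fdinfo	= demo_show_fdinfo,
#endif
};
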
Signed-off-by: Jens Axboe --- io_uring/fdinfo.c | 38 -------------------------------------- 1 file changed, 38 deletions(-) diff --git a/io_uring/fdinfo.c b/io_uring/fdinfo.c index b83296eee5f8..e9355276ab5d 100644 --- a/io_uring/fdinfo.c +++ b/io_uring/fdinfo.c @@ -15,36 +15,6 @@ #include "cancel.h" #include "rsrc.h" -static __cold int io_uring_show_cred(struct seq_file *m, unsigned int id, - const struct cred *cred) -{ - struct user_namespace *uns = seq_user_ns(m); - struct group_info *gi; - kernel_cap_t cap; - int g; - - seq_printf(m, "%5d\n", id); - seq_put_decimal_ull(m, "\tUid:\t", from_kuid_munged(uns, cred->uid)); - seq_put_decimal_ull(m, "\t\t", from_kuid_munged(uns, cred->euid)); - seq_put_decimal_ull(m, "\t\t", from_kuid_munged(uns, cred->suid)); - seq_put_decimal_ull(m, "\t\t", from_kuid_munged(uns, cred->fsuid)); - seq_put_decimal_ull(m, "\n\tGid:\t", from_kgid_munged(uns, cred->gid)); - seq_put_decimal_ull(m, "\t\t", from_kgid_munged(uns, cred->egid)); - seq_put_decimal_ull(m, "\t\t", from_kgid_munged(uns, cred->sgid)); - seq_put_decimal_ull(m, "\t\t", from_kgid_munged(uns, cred->fsgid)); - seq_puts(m, "\n\tGroups:\t"); - gi = cred->group_info; - for (g = 0; g < gi->ngroups; g++) { - seq_put_decimal_ull(m, g ? " " : "", - from_kgid_munged(uns, gi->gid[g])); - } - seq_puts(m, "\n\tCapEff:\t"); - cap = cred->cap_effective; - seq_put_hex_ll(m, NULL, cap.val, 16); - seq_putc(m, '\n'); - return 0; -} - #ifdef CONFIG_NET_RX_BUSY_POLL static __cold void common_tracking_show_fdinfo(struct io_ring_ctx *ctx, struct seq_file *m, @@ -213,14 +183,6 @@ static void __io_uring_show_fdinfo(struct io_ring_ctx *ctx, struct seq_file *m) else seq_printf(m, "%5u: \n", i); } - if (!xa_empty(&ctx->personalities)) { - unsigned long index; - const struct cred *cred; - - seq_printf(m, "Personalities:\n"); - xa_for_each(&ctx->personalities, index, cred) - io_uring_show_cred(m, index, cred); - } seq_puts(m, "PollList:\n"); for (i = 0; i < (1U << ctx->cancel_table.hash_bits); i++) { -- 2.51.0 From 5288b9e28f8a6f464746ddabcf9bf49d1323acfc Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Wed, 14 May 2025 09:07:20 +0100 Subject: [PATCH 16/16] io_uring: open code io_req_cqe_overflow() A preparation patch, just open code io_req_cqe_overflow(). 
Signed-off-by: Pavel Begunkov Reviewed-by: Caleb Sander Mateos Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 43c285cd2294..e4d6e572eabc 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -739,14 +739,6 @@ static bool io_cqring_event_overflow(struct io_ring_ctx *ctx, u64 user_data, return true; } -static void io_req_cqe_overflow(struct io_kiocb *req) -{ - io_cqring_event_overflow(req->ctx, req->cqe.user_data, - req->cqe.res, req->cqe.flags, - req->big_cqe.extra1, req->big_cqe.extra2); - memset(&req->big_cqe, 0, sizeof(req->big_cqe)); -} - /* * writes to the cq entry need to come after reading head; the * control dependency is enough as we're using WRITE_ONCE to @@ -1435,11 +1427,19 @@ void __io_submit_flush_completions(struct io_ring_ctx *ctx) unlikely(!io_fill_cqe_req(ctx, req))) { if (ctx->lockless_cq) { spin_lock(&ctx->completion_lock); - io_req_cqe_overflow(req); + io_cqring_event_overflow(req->ctx, req->cqe.user_data, + req->cqe.res, req->cqe.flags, + req->big_cqe.extra1, + req->big_cqe.extra2); spin_unlock(&ctx->completion_lock); } else { - io_req_cqe_overflow(req); + io_cqring_event_overflow(req->ctx, req->cqe.user_data, + req->cqe.res, req->cqe.flags, + req->big_cqe.extra1, + req->big_cqe.extra2); } + + memset(&req->big_cqe, 0, sizeof(req->big_cqe)); } } __io_cq_unlock_post(ctx); -- 2.51.0
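
A closing note on the locking style used in patches 01 and 05 of this series: guard() and scoped_guard() come from <linux/cleanup.h> and drop the lock automatically when the scope ends. A self-contained usage sketch, with made-up demo_* names:

#include <linux/cleanup.h>
#include <linux/mutex.h>
#include <linux/spinlock.h>

static DEFINE_MUTEX(demo_mutex);
static DEFINE_SPINLOCK(demo_lock);
static int demo_count;

static void demo_guards(void)
{
	/* held until demo_guards() returns, no explicit unlock needed */
	guard(mutex)(&demo_mutex);
	demo_count++;

	/* held only for the braced scope */
	scoped_guard(spinlock, &demo_lock) {
		demo_count++;
	}
}

Compared with open-coded lock/unlock pairs, the guards remove the unlock-before-every-return bookkeeping, which is what lets io_req_caches_free() and io_drain_req() above shed their explicit spin_unlock()/mutex_unlock() calls.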