From 6af82f7614a2e31e7ef23e5e160697aef31e8edd Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Sun, 3 Nov 2024 08:17:28 -0700 Subject: [PATCH 01/16] io_uring/rsrc: encode node type and ctx together Rather than keep the type field separate from ctx, use the fact that we can encode up to 4 types of nodes in the 2 LSBs of the ctx pointer. Doesn't reclaim any space right now on 64-bit archs, but it leaves a full int for future use. Signed-off-by: Jens Axboe --- io_uring/rsrc.c | 11 +++++------ io_uring/rsrc.h | 17 ++++++++++++++--- 2 files changed, 19 insertions(+), 9 deletions(-) diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c index 60fa857985cb..2fb1791d7255 100644 --- a/io_uring/rsrc.c +++ b/io_uring/rsrc.c @@ -124,9 +124,8 @@ struct io_rsrc_node *io_rsrc_node_alloc(struct io_ring_ctx *ctx, int type) node = kzalloc(sizeof(*node), GFP_KERNEL); if (node) { - node->ctx = ctx; + node->ctx_ptr = (unsigned long) ctx | type; node->refs = 1; - node->type = type; } return node; } @@ -445,21 +444,21 @@ int io_files_update(struct io_kiocb *req, unsigned int issue_flags) void io_free_rsrc_node(struct io_rsrc_node *node) { - struct io_ring_ctx *ctx = node->ctx; + struct io_ring_ctx *ctx = io_rsrc_node_ctx(node); lockdep_assert_held(&ctx->uring_lock); if (node->tag) - io_post_aux_cqe(node->ctx, node->tag, 0, 0); + io_post_aux_cqe(ctx, node->tag, 0, 0); - switch (node->type) { + switch (io_rsrc_node_type(node)) { case IORING_RSRC_FILE: if (io_slot_file(node)) fput(io_slot_file(node)); break; case IORING_RSRC_BUFFER: if (node->buf) - io_buffer_unmap(node->ctx, node); + io_buffer_unmap(ctx, node); break; default: WARN_ON_ONCE(1); diff --git a/io_uring/rsrc.h b/io_uring/rsrc.h index a40fad783a69..9a8fac31fa49 100644 --- a/io_uring/rsrc.h +++ b/io_uring/rsrc.h @@ -11,12 +11,13 @@ enum { IORING_RSRC_FILE = 0, IORING_RSRC_BUFFER = 1, + + IORING_RSRC_TYPE_MASK = 0x3UL, }; struct io_rsrc_node { - struct io_ring_ctx *ctx; + unsigned long ctx_ptr; int refs; - u16 type; u64 tag; union { @@ -100,11 +101,21 @@ static inline void io_req_put_rsrc_nodes(struct io_kiocb *req) req->rsrc_nodes[IORING_RSRC_BUFFER] = NULL; } +static inline struct io_ring_ctx *io_rsrc_node_ctx(struct io_rsrc_node *node) +{ + return (struct io_ring_ctx *) (node->ctx_ptr & ~IORING_RSRC_TYPE_MASK); +} + +static inline int io_rsrc_node_type(struct io_rsrc_node *node) +{ + return node->ctx_ptr & IORING_RSRC_TYPE_MASK; +} + static inline void io_req_assign_rsrc_node(struct io_kiocb *req, struct io_rsrc_node *node) { node->refs++; - req->rsrc_nodes[node->type] = node; + req->rsrc_nodes[io_rsrc_node_type(node)] = node; } int io_files_update(struct io_kiocb *req, unsigned int issue_flags); -- 2.51.0 From 6f94cbc29adacc15007c5a16295052e674099282 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Sun, 3 Nov 2024 08:46:07 -0700 Subject: [PATCH 02/16] io_uring/rsrc: split io_kiocb node type assignments Currently the io_rsrc_node assignment in io_kiocb is an array of two pointers, as two nodes may be assigned to a request - one file node, and one buffer node. However, the buffer node can co-exist with the provided buffers, as currently it's not supported to use both provided and registered buffers at the same time. This crucially brings struct io_kiocb down to 4 cache lines again, as before it spilled into the 5th cacheline.
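A note on the ctx/type encoding from patch 01: kzalloc'ed nodes are aligned to at least 4 bytes, so the two least significant bits of the ctx pointer are always zero and can carry a 2-bit node type (hence IORING_RSRC_TYPE_MASK = 0x3, i.e. up to 4 types). Below is a minimal, self-contained userspace sketch of the scheme, assuming only that alignment guarantee; it mirrors the io_rsrc_node_ctx()/io_rsrc_node_type() helpers above but is not the kernel code itself.

#include <assert.h>
#include <stdio.h>
#include <stdlib.h>

#define TYPE_MASK 0x3UL		/* the two low bits carry the node type */

struct ring_ctx { int id; };	/* stand-in for struct io_ring_ctx */

/* pack a pointer and a 2-bit type into one word */
static unsigned long encode(struct ring_ctx *ctx, unsigned int type)
{
	assert(type <= TYPE_MASK);
	/* alignment guarantees the low bits are free for the tag */
	assert(((unsigned long)ctx & TYPE_MASK) == 0);
	return (unsigned long)ctx | type;
}

static struct ring_ctx *decode_ctx(unsigned long v)
{
	return (struct ring_ctx *)(v & ~TYPE_MASK);
}

static unsigned int decode_type(unsigned long v)
{
	return v & TYPE_MASK;
}

int main(void)
{
	struct ring_ctx *ctx = malloc(sizeof(*ctx));
	unsigned long v = encode(ctx, 1);	/* 1 == buffer node */

	assert(decode_ctx(v) == ctx);
	assert(decode_type(v) == 1);
	printf("ctx=%p type=%u\n", (void *)decode_ctx(v), decode_type(v));
	free(ctx);
	return 0;
}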
Signed-off-by: Jens Axboe --- include/linux/io_uring_types.h | 7 ++++++- io_uring/io_uring.c | 6 +++--- io_uring/net.c | 3 ++- io_uring/nop.c | 3 ++- io_uring/notif.c | 4 ++-- io_uring/rsrc.h | 16 ++++++++++------ io_uring/rw.c | 3 ++- io_uring/uring_cmd.c | 4 ++-- 8 files changed, 29 insertions(+), 17 deletions(-) diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h index d52fec533c51..01e7fb9fcfe2 100644 --- a/include/linux/io_uring_types.h +++ b/include/linux/io_uring_types.h @@ -475,6 +475,7 @@ enum { REQ_F_BL_EMPTY_BIT, REQ_F_BL_NO_RECYCLE_BIT, REQ_F_BUFFERS_COMMIT_BIT, + REQ_F_BUF_NODE_BIT, /* not a real bit, just to check we're not overflowing the space */ __REQ_F_LAST_BIT, @@ -553,6 +554,8 @@ enum { REQ_F_BL_NO_RECYCLE = IO_REQ_FLAG(REQ_F_BL_NO_RECYCLE_BIT), /* buffer ring head needs incrementing on put */ REQ_F_BUFFERS_COMMIT = IO_REQ_FLAG(REQ_F_BUFFERS_COMMIT_BIT), + /* buf node is valid */ + REQ_F_BUF_NODE = IO_REQ_FLAG(REQ_F_BUF_NODE_BIT), }; typedef void (*io_req_tw_func_t)(struct io_kiocb *req, struct io_tw_state *ts); @@ -633,6 +636,8 @@ struct io_kiocb { * REQ_F_BUFFER_RING is set. */ struct io_buffer_list *buf_list; + + struct io_rsrc_node *buf_node; }; union { @@ -642,7 +647,7 @@ struct io_kiocb { __poll_t apoll_events; }; - struct io_rsrc_node *rsrc_nodes[2]; + struct io_rsrc_node *file_node; atomic_t refs; bool cancel_seq_set; diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index f08ea7fd5998..5bab8a3b0456 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -948,8 +948,8 @@ void io_req_defer_failed(struct io_kiocb *req, s32 res) static void io_preinit_req(struct io_kiocb *req, struct io_ring_ctx *ctx) { req->ctx = ctx; - req->rsrc_nodes[IORING_RSRC_FILE] = NULL; - req->rsrc_nodes[IORING_RSRC_BUFFER] = NULL; + req->buf_node = NULL; + req->file_node = NULL; req->link = NULL; req->async_data = NULL; /* not necessary, but safer to zero */ @@ -1882,7 +1882,7 @@ inline struct file *io_file_get_fixed(struct io_kiocb *req, int fd, io_ring_submit_lock(ctx, issue_flags); node = io_rsrc_node_lookup(&ctx->file_table.data, fd); if (node) { - io_req_assign_rsrc_node(req, node); + io_req_assign_rsrc_node(&req->file_node, node); req->flags |= io_slot_flags(node); file = io_slot_file(node); } diff --git a/io_uring/net.c b/io_uring/net.c index 2f7b334ed708..2ccc2b409431 100644 --- a/io_uring/net.c +++ b/io_uring/net.c @@ -1348,7 +1348,8 @@ static int io_send_zc_import(struct io_kiocb *req, unsigned int issue_flags) io_ring_submit_lock(ctx, issue_flags); node = io_rsrc_node_lookup(&ctx->buf_table, sr->buf_index); if (node) { - io_req_assign_rsrc_node(sr->notif, node); + io_req_assign_rsrc_node(&sr->notif->buf_node, node); + sr->notif->flags |= REQ_F_BUF_NODE; ret = 0; } io_ring_submit_unlock(ctx, issue_flags); diff --git a/io_uring/nop.c b/io_uring/nop.c index 149dbdc53607..bc22bcc739f3 100644 --- a/io_uring/nop.c +++ b/io_uring/nop.c @@ -67,7 +67,8 @@ int io_nop(struct io_kiocb *req, unsigned int issue_flags) io_ring_submit_lock(ctx, issue_flags); node = io_rsrc_node_lookup(&ctx->buf_table, nop->buffer); if (node) { - io_req_assign_rsrc_node(req, node); + io_req_assign_rsrc_node(&req->buf_node, node); + req->flags |= REQ_F_BUF_NODE; ret = 0; } io_ring_submit_unlock(ctx, issue_flags); diff --git a/io_uring/notif.c b/io_uring/notif.c index 4f02e969cf08..8dfbb0bd8e4d 100644 --- a/io_uring/notif.c +++ b/io_uring/notif.c @@ -117,8 +117,8 @@ struct io_kiocb *io_alloc_notif(struct io_ring_ctx *ctx) notif->file = NULL; notif->task = current; 
io_get_task_refs(1); - notif->rsrc_nodes[IORING_RSRC_FILE] = NULL; - notif->rsrc_nodes[IORING_RSRC_BUFFER] = NULL; + notif->file_node = NULL; + notif->buf_node = NULL; nd = io_notif_to_data(notif); nd->zc_report = false; diff --git a/io_uring/rsrc.h b/io_uring/rsrc.h index 9a8fac31fa49..bc3a863b14bb 100644 --- a/io_uring/rsrc.h +++ b/io_uring/rsrc.h @@ -95,10 +95,14 @@ static inline bool io_reset_rsrc_node(struct io_rsrc_data *data, int index) static inline void io_req_put_rsrc_nodes(struct io_kiocb *req) { - io_put_rsrc_node(req->rsrc_nodes[IORING_RSRC_FILE]); - io_put_rsrc_node(req->rsrc_nodes[IORING_RSRC_BUFFER]); - req->rsrc_nodes[IORING_RSRC_FILE] = NULL; - req->rsrc_nodes[IORING_RSRC_BUFFER] = NULL; + if (req->file_node) { + io_put_rsrc_node(req->file_node); + req->file_node = NULL; + } + if (req->flags & REQ_F_BUF_NODE) { + io_put_rsrc_node(req->buf_node); + req->buf_node = NULL; + } } static inline struct io_ring_ctx *io_rsrc_node_ctx(struct io_rsrc_node *node) @@ -111,11 +115,11 @@ static inline int io_rsrc_node_type(struct io_rsrc_node *node) return node->ctx_ptr & IORING_RSRC_TYPE_MASK; } -static inline void io_req_assign_rsrc_node(struct io_kiocb *req, +static inline void io_req_assign_rsrc_node(struct io_rsrc_node **dst_node, struct io_rsrc_node *node) { node->refs++; - req->rsrc_nodes[io_rsrc_node_type(node)] = node; + *dst_node = node; } int io_files_update(struct io_kiocb *req, unsigned int issue_flags); diff --git a/io_uring/rw.c b/io_uring/rw.c index 1ea6be2ccc90..144730344c0f 100644 --- a/io_uring/rw.c +++ b/io_uring/rw.c @@ -341,7 +341,8 @@ static int io_prep_rw_fixed(struct io_kiocb *req, const struct io_uring_sqe *sqe node = io_rsrc_node_lookup(&ctx->buf_table, req->buf_index); if (!node) return -EFAULT; - io_req_assign_rsrc_node(req, node); + io_req_assign_rsrc_node(&req->buf_node, node); + req->flags |= REQ_F_BUF_NODE; io = req->async_data; ret = io_import_fixed(ddir, &io->iter, node->buf, rw->addr, rw->len); diff --git a/io_uring/uring_cmd.c b/io_uring/uring_cmd.c index 535909a38e76..88a73d21fc0b 100644 --- a/io_uring/uring_cmd.c +++ b/io_uring/uring_cmd.c @@ -219,7 +219,7 @@ int io_uring_cmd_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) * being called. This prevents destruction of the mapped buffer * we'll need at actual import time. */ - io_req_assign_rsrc_node(req, node); + io_req_assign_rsrc_node(&req->buf_node, node); } ioucmd->cmd_op = READ_ONCE(sqe->cmd_op); @@ -275,7 +275,7 @@ int io_uring_cmd_import_fixed(u64 ubuf, unsigned long len, int rw, struct iov_iter *iter, void *ioucmd) { struct io_kiocb *req = cmd_to_io_kiocb(ioucmd); - struct io_rsrc_node *node = req->rsrc_nodes[IORING_RSRC_BUFFER]; + struct io_rsrc_node *node = req->buf_node; /* Must have had rsrc_node assigned at prep time */ if (node) -- 2.51.0 From f03baece08188f2e239c0ca0c098c14c71739ffb Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Sun, 3 Nov 2024 10:22:43 -0700 Subject: [PATCH 03/16] io_uring: move cancelations to be io_uring_task based Right now the task_struct pointer is used as the key to match a task, but in preparation for some io_kiocb changes, move it to using struct io_uring_task instead. No functional changes intended in this patch. 
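Concretely, every cancelation site that compared ownership via a task_struct pointer now compares the per-task io_uring context. A compact sketch of the resulting match rule, using hypothetical stand-in types rather than the kernel's structs (in this patch the context is still derived via task->io_uring; a later patch in the series stores it in the request directly):

#include <stdbool.h>
#include <stddef.h>

struct io_uring_task;			/* opaque per-task io_uring context */

struct request {
	struct io_uring_task *tctx;	/* owning context (was a task_struct *) */
	bool tracked;			/* e.g. pinned a file (REQ_F_INFLIGHT) */
};

/*
 * tctx == NULL means "match every request" (ring teardown). Otherwise
 * only requests owned by tctx match, and cancel_all widens the match
 * from tracked requests to all of that task's requests.
 */
static bool matches(const struct request *req,
		    const struct io_uring_task *tctx, bool cancel_all)
{
	if (tctx && req->tctx != tctx)
		return false;
	return cancel_all || req->tracked;
}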
Signed-off-by: Jens Axboe --- io_uring/futex.c | 4 ++-- io_uring/futex.h | 4 ++-- io_uring/io_uring.c | 42 +++++++++++++++++++++--------------------- io_uring/io_uring.h | 2 +- io_uring/poll.c | 4 ++-- io_uring/poll.h | 2 +- io_uring/timeout.c | 8 ++++---- io_uring/timeout.h | 2 +- io_uring/uring_cmd.c | 4 ++-- io_uring/uring_cmd.h | 2 +- io_uring/waitid.c | 4 ++-- io_uring/waitid.h | 2 +- 12 files changed, 40 insertions(+), 40 deletions(-) diff --git a/io_uring/futex.c b/io_uring/futex.c index 914848f46beb..e29662f039e1 100644 --- a/io_uring/futex.c +++ b/io_uring/futex.c @@ -141,7 +141,7 @@ int io_futex_cancel(struct io_ring_ctx *ctx, struct io_cancel_data *cd, return -ENOENT; } -bool io_futex_remove_all(struct io_ring_ctx *ctx, struct task_struct *task, +bool io_futex_remove_all(struct io_ring_ctx *ctx, struct io_uring_task *tctx, bool cancel_all) { struct hlist_node *tmp; @@ -151,7 +151,7 @@ bool io_futex_remove_all(struct io_ring_ctx *ctx, struct task_struct *task, lockdep_assert_held(&ctx->uring_lock); hlist_for_each_entry_safe(req, tmp, &ctx->futex_list, hash_node) { - if (!io_match_task_safe(req, task, cancel_all)) + if (!io_match_task_safe(req, tctx, cancel_all)) continue; hlist_del_init(&req->hash_node); __io_futex_cancel(ctx, req); diff --git a/io_uring/futex.h b/io_uring/futex.h index b8bb09873d57..d789fcf715e3 100644 --- a/io_uring/futex.h +++ b/io_uring/futex.h @@ -11,7 +11,7 @@ int io_futex_wake(struct io_kiocb *req, unsigned int issue_flags); #if defined(CONFIG_FUTEX) int io_futex_cancel(struct io_ring_ctx *ctx, struct io_cancel_data *cd, unsigned int issue_flags); -bool io_futex_remove_all(struct io_ring_ctx *ctx, struct task_struct *task, +bool io_futex_remove_all(struct io_ring_ctx *ctx, struct io_uring_task *tctx, bool cancel_all); bool io_futex_cache_init(struct io_ring_ctx *ctx); void io_futex_cache_free(struct io_ring_ctx *ctx); @@ -23,7 +23,7 @@ static inline int io_futex_cancel(struct io_ring_ctx *ctx, return 0; } static inline bool io_futex_remove_all(struct io_ring_ctx *ctx, - struct task_struct *task, bool cancel_all) + struct io_uring_task *tctx, bool cancel_all) { return false; } diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 5bab8a3b0456..4a2282c85464 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -142,7 +142,7 @@ struct io_defer_entry { #define IO_CQ_WAKE_FORCE (IO_CQ_WAKE_INIT >> 1) static bool io_uring_try_cancel_requests(struct io_ring_ctx *ctx, - struct task_struct *task, + struct io_uring_task *tctx, bool cancel_all); static void io_queue_sqe(struct io_kiocb *req); @@ -201,12 +201,12 @@ static bool io_match_linked(struct io_kiocb *head) * As io_match_task() but protected against racing with linked timeouts. * User must not hold timeout_lock. 
*/ -bool io_match_task_safe(struct io_kiocb *head, struct task_struct *task, +bool io_match_task_safe(struct io_kiocb *head, struct io_uring_task *tctx, bool cancel_all) { bool matched; - if (task && head->task != task) + if (tctx && head->task->io_uring != tctx) return false; if (cancel_all) return true; @@ -2987,7 +2987,7 @@ static int io_uring_release(struct inode *inode, struct file *file) } struct io_task_cancel { - struct task_struct *task; + struct io_uring_task *tctx; bool all; }; @@ -2996,11 +2996,11 @@ static bool io_cancel_task_cb(struct io_wq_work *work, void *data) struct io_kiocb *req = container_of(work, struct io_kiocb, work); struct io_task_cancel *cancel = data; - return io_match_task_safe(req, cancel->task, cancel->all); + return io_match_task_safe(req, cancel->tctx, cancel->all); } static __cold bool io_cancel_defer_files(struct io_ring_ctx *ctx, - struct task_struct *task, + struct io_uring_task *tctx, bool cancel_all) { struct io_defer_entry *de; @@ -3008,7 +3008,7 @@ static __cold bool io_cancel_defer_files(struct io_ring_ctx *ctx, spin_lock(&ctx->completion_lock); list_for_each_entry_reverse(de, &ctx->defer_list, list) { - if (io_match_task_safe(de->req, task, cancel_all)) { + if (io_match_task_safe(de->req, tctx, cancel_all)) { list_cut_position(&list, &ctx->defer_list, &de->list); break; } @@ -3051,11 +3051,10 @@ static __cold bool io_uring_try_cancel_iowq(struct io_ring_ctx *ctx) } static __cold bool io_uring_try_cancel_requests(struct io_ring_ctx *ctx, - struct task_struct *task, + struct io_uring_task *tctx, bool cancel_all) { - struct io_task_cancel cancel = { .task = task, .all = cancel_all, }; - struct io_uring_task *tctx = task ? task->io_uring : NULL; + struct io_task_cancel cancel = { .tctx = tctx, .all = cancel_all, }; enum io_wq_cancel cret; bool ret = false; @@ -3069,9 +3068,9 @@ static __cold bool io_uring_try_cancel_requests(struct io_ring_ctx *ctx, if (!ctx->rings) return false; - if (!task) { + if (!tctx) { ret |= io_uring_try_cancel_iowq(ctx); - } else if (tctx && tctx->io_wq) { + } else if (tctx->io_wq) { /* * Cancels requests of all rings, not only @ctx, but * it's fine as the task is in exit/exec. 
@@ -3094,15 +3093,15 @@ static __cold bool io_uring_try_cancel_requests(struct io_ring_ctx *ctx, if ((ctx->flags & IORING_SETUP_DEFER_TASKRUN) && io_allowed_defer_tw_run(ctx)) ret |= io_run_local_work(ctx, INT_MAX) > 0; - ret |= io_cancel_defer_files(ctx, task, cancel_all); + ret |= io_cancel_defer_files(ctx, tctx, cancel_all); mutex_lock(&ctx->uring_lock); - ret |= io_poll_remove_all(ctx, task, cancel_all); - ret |= io_waitid_remove_all(ctx, task, cancel_all); - ret |= io_futex_remove_all(ctx, task, cancel_all); - ret |= io_uring_try_cancel_uring_cmd(ctx, task, cancel_all); + ret |= io_poll_remove_all(ctx, tctx, cancel_all); + ret |= io_waitid_remove_all(ctx, tctx, cancel_all); + ret |= io_futex_remove_all(ctx, tctx, cancel_all); + ret |= io_uring_try_cancel_uring_cmd(ctx, tctx, cancel_all); mutex_unlock(&ctx->uring_lock); - ret |= io_kill_timeouts(ctx, task, cancel_all); - if (task) + ret |= io_kill_timeouts(ctx, tctx, cancel_all); + if (tctx) ret |= io_run_task_work() > 0; else ret |= flush_delayed_work(&ctx->fallback_work); @@ -3155,12 +3154,13 @@ __cold void io_uring_cancel_generic(bool cancel_all, struct io_sq_data *sqd) if (node->ctx->sq_data) continue; loop |= io_uring_try_cancel_requests(node->ctx, - current, cancel_all); + current->io_uring, + cancel_all); } } else { list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) loop |= io_uring_try_cancel_requests(ctx, - current, + current->io_uring, cancel_all); } diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h index e3e6cb14de5d..17ffdb1e41c5 100644 --- a/io_uring/io_uring.h +++ b/io_uring/io_uring.h @@ -115,7 +115,7 @@ void io_queue_next(struct io_kiocb *req); void io_task_refs_refill(struct io_uring_task *tctx); bool __io_alloc_req_refill(struct io_ring_ctx *ctx); -bool io_match_task_safe(struct io_kiocb *head, struct task_struct *task, +bool io_match_task_safe(struct io_kiocb *head, struct io_uring_task *tctx, bool cancel_all); void io_activate_pollwq(struct io_ring_ctx *ctx); diff --git a/io_uring/poll.c b/io_uring/poll.c index 2d6698fb7400..7db3010b5733 100644 --- a/io_uring/poll.c +++ b/io_uring/poll.c @@ -714,7 +714,7 @@ int io_arm_poll_handler(struct io_kiocb *req, unsigned issue_flags) /* * Returns true if we found and killed one or more poll requests */ -__cold bool io_poll_remove_all(struct io_ring_ctx *ctx, struct task_struct *tsk, +__cold bool io_poll_remove_all(struct io_ring_ctx *ctx, struct io_uring_task *tctx, bool cancel_all) { unsigned nr_buckets = 1U << ctx->cancel_table.hash_bits; @@ -729,7 +729,7 @@ __cold bool io_poll_remove_all(struct io_ring_ctx *ctx, struct task_struct *tsk, struct io_hash_bucket *hb = &ctx->cancel_table.hbs[i]; hlist_for_each_entry_safe(req, tmp, &hb->list, hash_node) { - if (io_match_task_safe(req, tsk, cancel_all)) { + if (io_match_task_safe(req, tctx, cancel_all)) { hlist_del_init(&req->hash_node); io_poll_cancel_req(req); found = true; diff --git a/io_uring/poll.h b/io_uring/poll.h index b0e3745f5a29..04ede93113dc 100644 --- a/io_uring/poll.h +++ b/io_uring/poll.h @@ -40,7 +40,7 @@ struct io_cancel_data; int io_poll_cancel(struct io_ring_ctx *ctx, struct io_cancel_data *cd, unsigned issue_flags); int io_arm_poll_handler(struct io_kiocb *req, unsigned issue_flags); -bool io_poll_remove_all(struct io_ring_ctx *ctx, struct task_struct *tsk, +bool io_poll_remove_all(struct io_ring_ctx *ctx, struct io_uring_task *tctx, bool cancel_all); void io_poll_task_func(struct io_kiocb *req, struct io_tw_state *ts); diff --git a/io_uring/timeout.c b/io_uring/timeout.c index 9973876d91b0..18286cb53a69 
100644 --- a/io_uring/timeout.c +++ b/io_uring/timeout.c @@ -637,13 +637,13 @@ void io_queue_linked_timeout(struct io_kiocb *req) io_put_req(req); } -static bool io_match_task(struct io_kiocb *head, struct task_struct *task, +static bool io_match_task(struct io_kiocb *head, struct io_uring_task *tctx, bool cancel_all) __must_hold(&head->ctx->timeout_lock) { struct io_kiocb *req; - if (task && head->task != task) + if (tctx && head->task->io_uring != tctx) return false; if (cancel_all) return true; @@ -656,7 +656,7 @@ static bool io_match_task(struct io_kiocb *head, struct task_struct *task, } /* Returns true if we found and killed one or more timeouts */ -__cold bool io_kill_timeouts(struct io_ring_ctx *ctx, struct task_struct *tsk, +__cold bool io_kill_timeouts(struct io_ring_ctx *ctx, struct io_uring_task *tctx, bool cancel_all) { struct io_timeout *timeout, *tmp; @@ -671,7 +671,7 @@ __cold bool io_kill_timeouts(struct io_ring_ctx *ctx, struct task_struct *tsk, list_for_each_entry_safe(timeout, tmp, &ctx->timeout_list, list) { struct io_kiocb *req = cmd_to_io_kiocb(timeout); - if (io_match_task(req, tsk, cancel_all) && + if (io_match_task(req, tctx, cancel_all) && io_kill_timeout(req, -ECANCELED)) canceled++; } diff --git a/io_uring/timeout.h b/io_uring/timeout.h index a6939f18313e..e91b32448dcf 100644 --- a/io_uring/timeout.h +++ b/io_uring/timeout.h @@ -24,7 +24,7 @@ static inline struct io_kiocb *io_disarm_linked_timeout(struct io_kiocb *req) __cold void io_flush_timeouts(struct io_ring_ctx *ctx); struct io_cancel_data; int io_timeout_cancel(struct io_ring_ctx *ctx, struct io_cancel_data *cd); -__cold bool io_kill_timeouts(struct io_ring_ctx *ctx, struct task_struct *tsk, +__cold bool io_kill_timeouts(struct io_ring_ctx *ctx, struct io_uring_task *tctx, bool cancel_all); void io_queue_linked_timeout(struct io_kiocb *req); void io_disarm_next(struct io_kiocb *req); diff --git a/io_uring/uring_cmd.c b/io_uring/uring_cmd.c index 88a73d21fc0b..f88fbc9869d0 100644 --- a/io_uring/uring_cmd.c +++ b/io_uring/uring_cmd.c @@ -47,7 +47,7 @@ static void io_req_uring_cleanup(struct io_kiocb *req, unsigned int issue_flags) } bool io_uring_try_cancel_uring_cmd(struct io_ring_ctx *ctx, - struct task_struct *task, bool cancel_all) + struct io_uring_task *tctx, bool cancel_all) { struct hlist_node *tmp; struct io_kiocb *req; @@ -61,7 +61,7 @@ bool io_uring_try_cancel_uring_cmd(struct io_ring_ctx *ctx, struct io_uring_cmd); struct file *file = req->file; - if (!cancel_all && req->task != task) + if (!cancel_all && req->task->io_uring != tctx) continue; if (cmd->flags & IORING_URING_CMD_CANCELABLE) { diff --git a/io_uring/uring_cmd.h b/io_uring/uring_cmd.h index a361f98664d2..7dba0f1efc58 100644 --- a/io_uring/uring_cmd.h +++ b/io_uring/uring_cmd.h @@ -8,4 +8,4 @@ int io_uring_cmd(struct io_kiocb *req, unsigned int issue_flags); int io_uring_cmd_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); bool io_uring_try_cancel_uring_cmd(struct io_ring_ctx *ctx, - struct task_struct *task, bool cancel_all); + struct io_uring_task *tctx, bool cancel_all); diff --git a/io_uring/waitid.c b/io_uring/waitid.c index 6362ec20abc0..9b7c23f96c47 100644 --- a/io_uring/waitid.c +++ b/io_uring/waitid.c @@ -184,7 +184,7 @@ int io_waitid_cancel(struct io_ring_ctx *ctx, struct io_cancel_data *cd, return -ENOENT; } -bool io_waitid_remove_all(struct io_ring_ctx *ctx, struct task_struct *task, +bool io_waitid_remove_all(struct io_ring_ctx *ctx, struct io_uring_task *tctx, bool cancel_all) { struct hlist_node *tmp; @@ 
-194,7 +194,7 @@ bool io_waitid_remove_all(struct io_ring_ctx *ctx, struct task_struct *task, lockdep_assert_held(&ctx->uring_lock); hlist_for_each_entry_safe(req, tmp, &ctx->waitid_list, hash_node) { - if (!io_match_task_safe(req, task, cancel_all)) + if (!io_match_task_safe(req, tctx, cancel_all)) continue; hlist_del_init(&req->hash_node); __io_waitid_cancel(ctx, req); diff --git a/io_uring/waitid.h b/io_uring/waitid.h index 956a8adafe8c..d5544aaf302a 100644 --- a/io_uring/waitid.h +++ b/io_uring/waitid.h @@ -11,5 +11,5 @@ int io_waitid_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); int io_waitid(struct io_kiocb *req, unsigned int issue_flags); int io_waitid_cancel(struct io_ring_ctx *ctx, struct io_cancel_data *cd, unsigned int issue_flags); -bool io_waitid_remove_all(struct io_ring_ctx *ctx, struct task_struct *task, +bool io_waitid_remove_all(struct io_ring_ctx *ctx, struct io_uring_task *tctx, bool cancel_all); -- 2.51.0 From 6ed368cc5d5d255ffffad33cfa02ecf2b77b7c44 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Sat, 2 Nov 2024 21:26:16 -0600 Subject: [PATCH 04/16] io_uring: remove task ref helpers They are only used right where they are defined, just open-code them inside io_put_task(). Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 31 ++++++++++--------------------- 1 file changed, 10 insertions(+), 21 deletions(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 4a2282c85464..43afd9da7d07 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -677,30 +677,19 @@ static void io_cqring_do_overflow_flush(struct io_ring_ctx *ctx) mutex_unlock(&ctx->uring_lock); } -/* can be called by any task */ -static void io_put_task_remote(struct task_struct *task) -{ - struct io_uring_task *tctx = task->io_uring; - - percpu_counter_sub(&tctx->inflight, 1); - if (unlikely(atomic_read(&tctx->in_cancel))) - wake_up(&tctx->wait); - put_task_struct(task); -} - -/* used by a task to put its own references */ -static void io_put_task_local(struct task_struct *task) -{ - task->io_uring->cached_refs++; -} - /* must to be called somewhat shortly after putting a request */ static inline void io_put_task(struct task_struct *task) { - if (likely(task == current)) - io_put_task_local(task); - else - io_put_task_remote(task); + struct io_uring_task *tctx = task->io_uring; + + if (likely(task == current)) { + tctx->cached_refs++; + } else { + percpu_counter_sub(&tctx->inflight, 1); + if (unlikely(atomic_read(&tctx->in_cancel))) + wake_up(&tctx->wait); + put_task_struct(task); + } } void io_task_refs_refill(struct io_uring_task *tctx) -- 2.51.0 From b6f58a3f4aa8dba424356c7a69388a81f4459300 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Sun, 3 Nov 2024 10:23:38 -0700 Subject: [PATCH 05/16] io_uring: move struct io_kiocb from task_struct to io_uring_task Rather than store the task_struct itself in struct io_kiocb, store the io_uring specific io_uring_task. The lifetimes are the same in terms of io_uring, and this avoids doing some dereferences through the task_struct. For the hot path of putting local task references, we can deref req->tctx instead, which we'll need anyway in that function regardless of whether it's local or remote references. This is mostly straightforward, except the original task PF_EXITING check needs a bit of tweaking. task_work is _always_ run from the originating task, except in the fallback case, where it's run from a kernel thread. Replace the potentially racy (in case of fallback work) checks for req->task->flags with current->flags.
It's either still the original task, in which case PF_EXITING will be sane, or it has PF_KTHREAD set, in which case it's fallback work. Both cases should prevent moving forward with the given request. Signed-off-by: Jens Axboe --- include/linux/io_uring/cmd.h | 2 +- include/linux/io_uring_types.h | 3 ++- io_uring/cancel.c | 2 +- io_uring/fdinfo.c | 2 +- io_uring/io_uring.c | 34 +++++++++++++++------------------- io_uring/io_uring.h | 13 +++++++++++++ io_uring/msg_ring.c | 4 ++-- io_uring/notif.c | 4 ++-- io_uring/poll.c | 3 +-- io_uring/rw.c | 2 +- io_uring/tctx.c | 1 + io_uring/timeout.c | 10 ++++++---- io_uring/uring_cmd.c | 2 +- io_uring/waitid.c | 2 +- 14 files changed, 48 insertions(+), 36 deletions(-) diff --git a/include/linux/io_uring/cmd.h b/include/linux/io_uring/cmd.h index c189d36ad55e..578a3fdf5c71 100644 --- a/include/linux/io_uring/cmd.h +++ b/include/linux/io_uring/cmd.h @@ -110,7 +110,7 @@ static inline void io_uring_cmd_complete_in_task(struct io_uring_cmd *ioucmd, static inline struct task_struct *io_uring_cmd_get_task(struct io_uring_cmd *cmd) { - return cmd_to_io_kiocb(cmd)->task; + return cmd_to_io_kiocb(cmd)->tctx->task; } #endif /* _LINUX_IO_URING_CMD_H */ diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h index 01e7fb9fcfe2..fba2988accc3 100644 --- a/include/linux/io_uring_types.h +++ b/include/linux/io_uring_types.h @@ -84,6 +84,7 @@ struct io_uring_task { /* submission side */ int cached_refs; const struct io_ring_ctx *last; + struct task_struct *task; struct io_wq *io_wq; struct file *registered_rings[IO_RINGFD_REG_MAX]; @@ -625,7 +626,7 @@ struct io_kiocb { struct io_cqe cqe; struct io_ring_ctx *ctx; - struct task_struct *task; + struct io_uring_task *tctx; union { /* stores selected buf, valid IFF REQ_F_BUFFER_SELECTED is set */ diff --git a/io_uring/cancel.c b/io_uring/cancel.c index bbca5cb69cb5..484193567839 100644 --- a/io_uring/cancel.c +++ b/io_uring/cancel.c @@ -205,7 +205,7 @@ int io_async_cancel(struct io_kiocb *req, unsigned int issue_flags) .opcode = cancel->opcode, .seq = atomic_inc_return(&req->ctx->cancel_seq), }; - struct io_uring_task *tctx = req->task->io_uring; + struct io_uring_task *tctx = req->tctx; int ret; if (cd.flags & IORING_ASYNC_CANCEL_FD) { diff --git a/io_uring/fdinfo.c b/io_uring/fdinfo.c index 8da0d9e4533a..efbec34ccb18 100644 --- a/io_uring/fdinfo.c +++ b/io_uring/fdinfo.c @@ -203,7 +203,7 @@ __cold void io_uring_show_fdinfo(struct seq_file *m, struct file *file) hlist_for_each_entry(req, &hb->list, hash_node) seq_printf(m, " op=%d, task_works=%d\n", req->opcode, - task_work_pending(req->task)); + task_work_pending(req->tctx->task)); } if (has_lock) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 43afd9da7d07..7c1ca36b117b 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -206,7 +206,7 @@ bool io_match_task_safe(struct io_kiocb *head, struct io_uring_task *tctx, { bool matched; - if (tctx && head->task->io_uring != tctx) + if (tctx && head->tctx != tctx) return false; if (cancel_all) return true; @@ -407,11 +407,8 @@ static void io_clean_op(struct io_kiocb *req) kfree(req->apoll); req->apoll = NULL; } - if (req->flags & REQ_F_INFLIGHT) { - struct io_uring_task *tctx = req->task->io_uring; - - atomic_dec(&tctx->inflight_tracked); - } + if (req->flags & REQ_F_INFLIGHT) + atomic_dec(&req->tctx->inflight_tracked); if (req->flags & REQ_F_CREDS) put_cred(req->creds); if (req->flags & REQ_F_ASYNC_DATA) { @@ -425,7 +422,7 @@ static inline void io_req_track_inflight(struct io_kiocb
*req) { if (!(req->flags & REQ_F_INFLIGHT)) { req->flags |= REQ_F_INFLIGHT; - atomic_inc(&req->task->io_uring->inflight_tracked); + atomic_inc(&req->tctx->inflight_tracked); } } @@ -514,7 +511,7 @@ static void io_prep_async_link(struct io_kiocb *req) static void io_queue_iowq(struct io_kiocb *req) { struct io_kiocb *link = io_prep_linked_timeout(req); - struct io_uring_task *tctx = req->task->io_uring; + struct io_uring_task *tctx = req->tctx; BUG_ON(!tctx); BUG_ON(!tctx->io_wq); @@ -529,7 +526,7 @@ static void io_queue_iowq(struct io_kiocb *req) * procedure rather than attempt to run this request (or create a new * worker for it). */ - if (WARN_ON_ONCE(!same_thread_group(req->task, current))) + if (WARN_ON_ONCE(!same_thread_group(tctx->task, current))) atomic_or(IO_WQ_WORK_CANCEL, &req->work.flags); trace_io_uring_queue_async_work(req, io_wq_is_hashed(&req->work)); @@ -678,17 +675,17 @@ static void io_cqring_do_overflow_flush(struct io_ring_ctx *ctx) } /* must to be called somewhat shortly after putting a request */ -static inline void io_put_task(struct task_struct *task) +static inline void io_put_task(struct io_kiocb *req) { - struct io_uring_task *tctx = task->io_uring; + struct io_uring_task *tctx = req->tctx; - if (likely(task == current)) { + if (likely(tctx->task == current)) { tctx->cached_refs++; } else { percpu_counter_sub(&tctx->inflight, 1); if (unlikely(atomic_read(&tctx->in_cancel))) wake_up(&tctx->wait); - put_task_struct(task); + put_task_struct(tctx->task); } } @@ -1207,7 +1204,7 @@ static inline void io_req_local_work_add(struct io_kiocb *req, static void io_req_normal_work_add(struct io_kiocb *req) { - struct io_uring_task *tctx = req->task->io_uring; + struct io_uring_task *tctx = req->tctx; struct io_ring_ctx *ctx = req->ctx; /* task_work already pending, we're done */ @@ -1226,7 +1223,7 @@ static void io_req_normal_work_add(struct io_kiocb *req) return; } - if (likely(!task_work_add(req->task, &tctx->task_work, ctx->notify_method))) + if (likely(!task_work_add(tctx->task, &tctx->task_work, ctx->notify_method))) return; io_fallback_tw(tctx, false); @@ -1343,8 +1340,7 @@ static void io_req_task_cancel(struct io_kiocb *req, struct io_tw_state *ts) void io_req_task_submit(struct io_kiocb *req, struct io_tw_state *ts) { io_tw_lock(req->ctx, ts); - /* req->task == current here, checking PF_EXITING is safe */ - if (unlikely(req->task->flags & PF_EXITING)) + if (unlikely(io_should_terminate_tw())) io_req_defer_failed(req, -EFAULT); else if (req->flags & REQ_F_FORCE_ASYNC) io_queue_iowq(req); @@ -1403,7 +1399,7 @@ static void io_free_batch_list(struct io_ring_ctx *ctx, } io_put_file(req); io_req_put_rsrc_nodes(req); - io_put_task(req->task); + io_put_task(req); node = req->comp_list.next; io_req_add_to_cache(req, ctx); @@ -2019,7 +2015,7 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req, req->flags = (__force io_req_flags_t) sqe_flags; req->cqe.user_data = READ_ONCE(sqe->user_data); req->file = NULL; - req->task = current; + req->tctx = current->io_uring; req->cancel_seq_set = false; if (unlikely(opcode >= IORING_OP_LAST)) { diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h index 17ffdb1e41c5..702c8e987430 100644 --- a/io_uring/io_uring.h +++ b/io_uring/io_uring.h @@ -426,6 +426,19 @@ static inline bool io_allowed_run_tw(struct io_ring_ctx *ctx) ctx->submitter_task == current); } +/* + * Terminate the request if either of these conditions are true: + * + * 1) It's being executed by the original task, but that task is marked + * with PF_EXITING as 
it's exiting. + * 2) PF_KTHREAD is set, in which case the invoker of the task_work is + * our fallback task_work. + */ +static inline bool io_should_terminate_tw(void) +{ + return current->flags & (PF_KTHREAD | PF_EXITING); +} + static inline void io_req_queue_tw_complete(struct io_kiocb *req, s32 res) { io_req_set_res(req, res, 0); diff --git a/io_uring/msg_ring.c b/io_uring/msg_ring.c index 99af39e1d0fb..e63af34004b7 100644 --- a/io_uring/msg_ring.c +++ b/io_uring/msg_ring.c @@ -89,8 +89,8 @@ static void io_msg_tw_complete(struct io_kiocb *req, struct io_tw_state *ts) static int io_msg_remote_post(struct io_ring_ctx *ctx, struct io_kiocb *req, int res, u32 cflags, u64 user_data) { - req->task = READ_ONCE(ctx->submitter_task); - if (!req->task) { + req->tctx = READ_ONCE(ctx->submitter_task->io_uring); + if (!req->tctx) { kmem_cache_free(req_cachep, req); return -EOWNERDEAD; } diff --git a/io_uring/notif.c b/io_uring/notif.c index 8dfbb0bd8e4d..ee3a33510b3c 100644 --- a/io_uring/notif.c +++ b/io_uring/notif.c @@ -89,7 +89,7 @@ static int io_link_skb(struct sk_buff *skb, struct ubuf_info *uarg) /* make sure all noifications can be finished in the same task_work */ if (unlikely(notif->ctx != prev_notif->ctx || - notif->task != prev_notif->task)) + notif->tctx != prev_notif->tctx)) return -EEXIST; nd->head = prev_nd->head; @@ -115,7 +115,7 @@ struct io_kiocb *io_alloc_notif(struct io_ring_ctx *ctx) notif->opcode = IORING_OP_NOP; notif->flags = 0; notif->file = NULL; - notif->task = current; + notif->tctx = current->io_uring; io_get_task_refs(1); notif->file_node = NULL; notif->buf_node = NULL; diff --git a/io_uring/poll.c b/io_uring/poll.c index 7db3010b5733..bced9edd5233 100644 --- a/io_uring/poll.c +++ b/io_uring/poll.c @@ -224,8 +224,7 @@ static int io_poll_check_events(struct io_kiocb *req, struct io_tw_state *ts) { int v; - /* req->task == current here, checking PF_EXITING is safe */ - if (unlikely(req->task->flags & PF_EXITING)) + if (unlikely(io_should_terminate_tw())) return -ECANCELED; do { diff --git a/io_uring/rw.c b/io_uring/rw.c index 144730344c0f..e368b9afde03 100644 --- a/io_uring/rw.c +++ b/io_uring/rw.c @@ -435,7 +435,7 @@ static bool io_rw_should_reissue(struct io_kiocb *req) * Play it safe and assume not safe to re-import and reissue if we're * not in the original thread group (or in task context). 
*/ - if (!same_thread_group(req->task, current) || !in_task()) + if (!same_thread_group(req->tctx->task, current) || !in_task()) return false; return true; } diff --git a/io_uring/tctx.c b/io_uring/tctx.c index c043fe93a3f2..503f3ff8bc4f 100644 --- a/io_uring/tctx.c +++ b/io_uring/tctx.c @@ -81,6 +81,7 @@ __cold int io_uring_alloc_task_context(struct task_struct *task, return ret; } + tctx->task = task; xa_init(&tctx->xa); init_waitqueue_head(&tctx->wait); atomic_set(&tctx->in_cancel, 0); diff --git a/io_uring/timeout.c b/io_uring/timeout.c index 18286cb53a69..5b12bd6a804c 100644 --- a/io_uring/timeout.c +++ b/io_uring/timeout.c @@ -300,16 +300,18 @@ static void io_req_task_link_timeout(struct io_kiocb *req, struct io_tw_state *t { struct io_timeout *timeout = io_kiocb_to_cmd(req, struct io_timeout); struct io_kiocb *prev = timeout->prev; - int ret = -ENOENT; + int ret; if (prev) { - if (!(req->task->flags & PF_EXITING)) { + if (!io_should_terminate_tw()) { struct io_cancel_data cd = { .ctx = req->ctx, .data = prev->cqe.user_data, }; - ret = io_try_cancel(req->task->io_uring, &cd, 0); + ret = io_try_cancel(req->tctx, &cd, 0); + } else { + ret = -ECANCELED; } io_req_set_res(req, ret ?: -ETIME, 0); io_req_task_complete(req, ts); @@ -643,7 +645,7 @@ static bool io_match_task(struct io_kiocb *head, struct io_uring_task *tctx, { struct io_kiocb *req; - if (tctx && head->task->io_uring != tctx) + if (tctx && head->tctx != tctx) return false; if (cancel_all) return true; diff --git a/io_uring/uring_cmd.c b/io_uring/uring_cmd.c index f88fbc9869d0..40b8b777ba12 100644 --- a/io_uring/uring_cmd.c +++ b/io_uring/uring_cmd.c @@ -61,7 +61,7 @@ bool io_uring_try_cancel_uring_cmd(struct io_ring_ctx *ctx, struct io_uring_cmd); struct file *file = req->file; - if (!cancel_all && req->task->io_uring != tctx) + if (!cancel_all && req->tctx != tctx) continue; if (cmd->flags & IORING_URING_CMD_CANCELABLE) { diff --git a/io_uring/waitid.c b/io_uring/waitid.c index 9b7c23f96c47..daef5dd644f0 100644 --- a/io_uring/waitid.c +++ b/io_uring/waitid.c @@ -331,7 +331,7 @@ int io_waitid(struct io_kiocb *req, unsigned int issue_flags) hlist_add_head(&req->hash_node, &ctx->waitid_list); init_waitqueue_func_entry(&iwa->wo.child_wait, io_waitid_wait); - iwa->wo.child_wait.private = req->task; + iwa->wo.child_wait.private = req->tctx->task; iw->head = ¤t->signal->wait_chldexit; add_wait_queue(iw->head, &iwa->wo.child_wait); -- 2.51.0 From 483242714fcc853f3f5ef728116f5ec168468bca Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 4 Nov 2024 12:02:47 +0000 Subject: [PATCH 06/16] io_uring: prevent speculating sq_array indexing The SQ index array consists of user provided indexes, which io_uring then uses to index the SQ, and so it's susceptible to speculation. For all other queues io_uring tracks heads and tails in kernel, and they shouldn't need any special care. 
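The fix below is the kernel's standard Spectre-v1 pattern: a bounds check alone does not keep the CPU from speculatively loading with an out-of-range index before the branch resolves, so the checked value is additionally clamped with array_index_nospec() from include/linux/nospec.h. An illustrative sketch of the shape of the fix, with assumed surrounding names rather than the actual io_uring code (it also assumes a power-of-two ring size for the masking):

#include <linux/compiler.h>
#include <linux/nospec.h>
#include <linux/types.h>

/*
 * head is read from the user-mapped SQ index array, so treat it as
 * attacker-controlled even after validating it architecturally.
 */
static bool fetch_sq_index(const u32 *sq_array, u32 tail, u32 sq_entries,
			   u32 *head)
{
	u32 h = READ_ONCE(sq_array[tail & (sq_entries - 1)]);

	if (h >= sq_entries)
		return false;	/* bogus user index: drop the entry */
	/* clamp so speculation cannot index past sq_entries */
	*head = array_index_nospec(h, sq_entries);
	return true;
}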
Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/c6c7a25962924a55869e317e4fdb682dfdc6b279.1730687889.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 1 + 1 file changed, 1 insertion(+) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 7c1ca36b117b..751b9e19da6e 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -2245,6 +2245,7 @@ static bool io_get_sqe(struct io_ring_ctx *ctx, const struct io_uring_sqe **sqe) READ_ONCE(ctx->rings->sq_dropped) + 1); return false; } + head = array_index_nospec(head, ctx->sq_entries); } /* -- 2.51.0 From 2f3cc8e441c9f657ff036c56baaab7dddbd0b350 Mon Sep 17 00:00:00 2001 From: Olivier Langlois Date: Sun, 13 Oct 2024 14:28:24 -0400 Subject: [PATCH 07/16] io_uring/napi: protect concurrent io_napi_entry timeout accesses The io_napi_entry timeout value can be updated while it is being accessed from the poll functions. Its concurrent accesses are wrapped with READ_ONCE()/WRITE_ONCE() macros to avoid incorrect compiler optimizations. Signed-off-by: Olivier Langlois Link: https://lore.kernel.org/r/3de3087563cf98f75266fd9f85fdba063a8720db.1728828877.git.olivier@trillion01.com Signed-off-by: Jens Axboe --- io_uring/napi.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/io_uring/napi.c b/io_uring/napi.c index d0cf694d0172..dda2e083fb5d 100644 --- a/io_uring/napi.c +++ b/io_uring/napi.c @@ -60,7 +60,7 @@ void __io_napi_add(struct io_ring_ctx *ctx, struct socket *sock) rcu_read_lock(); e = io_napi_hash_find(hash_list, napi_id); if (e) { - e->timeout = jiffies + NAPI_TIMEOUT; + WRITE_ONCE(e->timeout, jiffies + NAPI_TIMEOUT); rcu_read_unlock(); return; } @@ -92,7 +92,7 @@ static void __io_napi_remove_stale(struct io_ring_ctx *ctx) spin_lock(&ctx->napi_lock); hash_for_each(ctx->napi_ht, i, e, node) { - if (time_after(jiffies, e->timeout)) { + if (time_after(jiffies, READ_ONCE(e->timeout))) { list_del(&e->list); hash_del_rcu(&e->node); kfree_rcu(e, rcu); } @@ -150,7 +150,7 @@ static bool __io_napi_do_busy_loop(struct io_ring_ctx *ctx, napi_busy_loop_rcu(e->napi_id, loop_end, loop_end_arg, ctx->napi_prefer_busy_poll, BUSY_POLL_BUDGET); - if (time_after(jiffies, e->timeout)) + if (time_after(jiffies, READ_ONCE(e->timeout))) is_stale = true; } -- 2.51.0 From 45b3941d09d13b3503309be1f023b83deaf69b4d Mon Sep 17 00:00:00 2001 From: Olivier Langlois Date: Sun, 13 Oct 2024 14:28:38 -0400 Subject: [PATCH 08/16] io_uring/napi: fix io_napi_entry RCU accesses Correct three RCU structure modifications that were not using the RCU functions to make their updates. Signed-off-by: Olivier Langlois Link: https://lore.kernel.org/r/9f53b5169afa8c7bf3665a0b19dc2f7061173530.1728828877.git.olivier@trillion01.com Signed-off-by: Jens Axboe --- io_uring/napi.c | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/io_uring/napi.c b/io_uring/napi.c index dda2e083fb5d..921de9de8d75 100644 --- a/io_uring/napi.c +++ b/io_uring/napi.c @@ -81,19 +81,24 @@ void __io_napi_add(struct io_ring_ctx *ctx, struct socket *sock) } hlist_add_tail_rcu(&e->node, hash_list); - list_add_tail(&e->list, &ctx->napi_list); + list_add_tail_rcu(&e->list, &ctx->napi_list); spin_unlock(&ctx->napi_lock); } static void __io_napi_remove_stale(struct io_ring_ctx *ctx) { struct io_napi_entry *e; - unsigned int i; spin_lock(&ctx->napi_lock); - hash_for_each(ctx->napi_ht, i, e, node) { + /* + * list_for_each_entry_safe() is not required as long as: + * 1. list_del_rcu() does not reset the deleted node next pointer + * 2.
kfree_rcu() delays the memory freeing until the next quiescent + * state + */ list_for_each_entry(e, &ctx->napi_list, list) { if (time_after(jiffies, READ_ONCE(e->timeout))) { - list_del(&e->list); + list_del_rcu(&e->list); hash_del_rcu(&e->node); kfree_rcu(e, rcu); } @@ -204,13 +209,13 @@ void io_napi_init(struct io_ring_ctx *ctx) void io_napi_free(struct io_ring_ctx *ctx) { struct io_napi_entry *e; - unsigned int i; spin_lock(&ctx->napi_lock); - hash_for_each(ctx->napi_ht, i, e, node) { + list_for_each_entry(e, &ctx->napi_list, list) { hash_del_rcu(&e->node); kfree_rcu(e, rcu); } + INIT_LIST_HEAD_RCU(&ctx->napi_list); spin_unlock(&ctx->napi_lock); } -- 2.51.0 From a5e26f49fef9485bc4ae24666d984a6de11e058c Mon Sep 17 00:00:00 2001 From: Olivier Langlois Date: Sun, 13 Oct 2024 14:28:50 -0400 Subject: [PATCH 09/16] io_uring/napi: improve __io_napi_add 1. move the sock->sk pointer validity test outside the function to avoid the function call overhead and to make the function more reusable 2. change its name to __io_napi_add_id to be more precise about what it is doing 3. return an error code to report errors Signed-off-by: Olivier Langlois Link: https://lore.kernel.org/r/d637fa3b437d753c0f4e44ff6a7b5bf2c2611270.1728828877.git.olivier@trillion01.com Signed-off-by: Jens Axboe --- io_uring/napi.c | 19 ++++++------------- io_uring/napi.h | 6 +++--- 2 files changed, 9 insertions(+), 16 deletions(-) diff --git a/io_uring/napi.c b/io_uring/napi.c index 921de9de8d75..5e2299e7ff8e 100644 --- a/io_uring/napi.c +++ b/io_uring/napi.c @@ -38,22 +38,14 @@ static inline ktime_t net_to_ktime(unsigned long t) return ns_to_ktime(t << 10); } -void __io_napi_add(struct io_ring_ctx *ctx, struct socket *sock) +int __io_napi_add_id(struct io_ring_ctx *ctx, unsigned int napi_id) { struct hlist_head *hash_list; - unsigned int napi_id; - struct sock *sk; struct io_napi_entry *e; - sk = sock->sk; - if (!sk) - return; - - napi_id = READ_ONCE(sk->sk_napi_id); - /* Non-NAPI IDs can be rejected.
*/ if (napi_id < MIN_NAPI_ID) - return; + return -EINVAL; hash_list = &ctx->napi_ht[hash_min(napi_id, HASH_BITS(ctx->napi_ht))]; @@ -62,13 +54,13 @@ void __io_napi_add(struct io_ring_ctx *ctx, struct socket *sock) if (e) { WRITE_ONCE(e->timeout, jiffies + NAPI_TIMEOUT); rcu_read_unlock(); - return; + return -EEXIST; } rcu_read_unlock(); e = kmalloc(sizeof(*e), GFP_NOWAIT); if (!e) - return; + return -ENOMEM; e->napi_id = napi_id; e->timeout = jiffies + NAPI_TIMEOUT; @@ -77,12 +69,13 @@ void __io_napi_add(struct io_ring_ctx *ctx, struct socket *sock) if (unlikely(io_napi_hash_find(hash_list, napi_id))) { spin_unlock(&ctx->napi_lock); kfree(e); - return; + return -EEXIST; } hlist_add_tail_rcu(&e->node, hash_list); list_add_tail_rcu(&e->list, &ctx->napi_list); spin_unlock(&ctx->napi_lock); + return 0; } static void __io_napi_remove_stale(struct io_ring_ctx *ctx) diff --git a/io_uring/napi.h b/io_uring/napi.h index fd275ef0456d..4ae622f37b30 100644 --- a/io_uring/napi.h +++ b/io_uring/napi.h @@ -15,7 +15,7 @@ void io_napi_free(struct io_ring_ctx *ctx); int io_register_napi(struct io_ring_ctx *ctx, void __user *arg); int io_unregister_napi(struct io_ring_ctx *ctx, void __user *arg); -void __io_napi_add(struct io_ring_ctx *ctx, struct socket *sock); +int __io_napi_add_id(struct io_ring_ctx *ctx, unsigned int napi_id); void __io_napi_busy_loop(struct io_ring_ctx *ctx, struct io_wait_queue *iowq); int io_napi_sqpoll_busy_poll(struct io_ring_ctx *ctx); @@ -48,8 +48,8 @@ static inline void io_napi_add(struct io_kiocb *req) return; sock = sock_from_file(req->file); - if (sock) - __io_napi_add(ctx, sock); + if (sock && sock->sk) + __io_napi_add_id(ctx, READ_ONCE(sock->sk->sk_napi_id)); } #else -- 2.51.0 From db1e1adf6f993b1c2cef605d86eff709a8db5052 Mon Sep 17 00:00:00 2001 From: Olivier Langlois Date: Sun, 13 Oct 2024 14:29:02 -0400 Subject: [PATCH 10/16] io_uring/napi: Use lock guards Convert napi locks to use the shiny new Scope-Based Resource Management machinery. Signed-off-by: Olivier Langlois Link: https://lore.kernel.org/r/2680ca47ee183cfdb89d1a40c84d349edeb620ab.1728828877.git.olivier@trillion01.com Signed-off-by: Jens Axboe --- io_uring/napi.c | 40 +++++++++++++++++++++------------------- 1 file changed, 21 insertions(+), 19 deletions(-) diff --git a/io_uring/napi.c b/io_uring/napi.c index 5e2299e7ff8e..6d5fdd397f2f 100644 --- a/io_uring/napi.c +++ b/io_uring/napi.c @@ -49,14 +49,13 @@ int __io_napi_add_id(struct io_ring_ctx *ctx, unsigned int napi_id) hash_list = &ctx->napi_ht[hash_min(napi_id, HASH_BITS(ctx->napi_ht))]; - rcu_read_lock(); - e = io_napi_hash_find(hash_list, napi_id); - if (e) { - WRITE_ONCE(e->timeout, jiffies + NAPI_TIMEOUT); - rcu_read_unlock(); - return -EEXIST; + scoped_guard(rcu) { + e = io_napi_hash_find(hash_list, napi_id); + if (e) { + WRITE_ONCE(e->timeout, jiffies + NAPI_TIMEOUT); + return -EEXIST; + } } - rcu_read_unlock(); e = kmalloc(sizeof(*e), GFP_NOWAIT); if (!e) @@ -65,6 +64,10 @@ int __io_napi_add_id(struct io_ring_ctx *ctx, unsigned int napi_id) e->napi_id = napi_id; e->timeout = jiffies + NAPI_TIMEOUT; + /* + * guard(spinlock) is not used to manually unlock it before calling + * kfree() + */ spin_lock(&ctx->napi_lock); if (unlikely(io_napi_hash_find(hash_list, napi_id))) { spin_unlock(&ctx->napi_lock); @@ -82,7 +85,7 @@ static void __io_napi_remove_stale(struct io_ring_ctx *ctx) { struct io_napi_entry *e; - spin_lock(&ctx->napi_lock); + guard(spinlock)(&ctx->napi_lock); /* * list_for_each_entry_safe() is not required as long as: * 1. 
list_del_rcu() does not reset the deleted node next pointer @@ -96,7 +99,6 @@ static void __io_napi_remove_stale(struct io_ring_ctx *ctx) kfree_rcu(e, rcu); } } - spin_unlock(&ctx->napi_lock); } static inline void io_napi_remove_stale(struct io_ring_ctx *ctx, bool is_stale) @@ -168,11 +170,12 @@ static void io_napi_blocking_busy_loop(struct io_ring_ctx *ctx, if (list_is_singular(&ctx->napi_list)) loop_end_arg = iowq; - rcu_read_lock(); - do { - is_stale = __io_napi_do_busy_loop(ctx, loop_end_arg); - } while (!io_napi_busy_loop_should_end(iowq, start_time) && !loop_end_arg); - rcu_read_unlock(); + scoped_guard(rcu) { + do { + is_stale = __io_napi_do_busy_loop(ctx, loop_end_arg); + } while (!io_napi_busy_loop_should_end(iowq, start_time) && + !loop_end_arg); + } io_napi_remove_stale(ctx, is_stale); } @@ -203,13 +206,12 @@ void io_napi_free(struct io_ring_ctx *ctx) { struct io_napi_entry *e; - spin_lock(&ctx->napi_lock); + guard(spinlock)(&ctx->napi_lock); list_for_each_entry(e, &ctx->napi_list, list) { hash_del_rcu(&e->node); kfree_rcu(e, rcu); } INIT_LIST_HEAD_RCU(&ctx->napi_list); - spin_unlock(&ctx->napi_lock); } /* @@ -305,9 +307,9 @@ int io_napi_sqpoll_busy_poll(struct io_ring_ctx *ctx) if (list_empty_careful(&ctx->napi_list)) return 0; - rcu_read_lock(); - is_stale = __io_napi_do_busy_loop(ctx, NULL); - rcu_read_unlock(); + scoped_guard(rcu) { + is_stale = __io_napi_do_busy_loop(ctx, NULL); + } io_napi_remove_stale(ctx, is_stale); return 1; -- 2.51.0 From 71afd926f292bb8f3ca86e6c3c40123037f109c6 Mon Sep 17 00:00:00 2001 From: Olivier Langlois Date: Sun, 13 Oct 2024 14:29:12 -0400 Subject: [PATCH 11/16] io_uring/napi: clean up __io_napi_do_busy_loop __io_napi_do_busy_loop() now requires loop_end to be passed in as a parameter. This makes the code cleaner and also has the benefit of removing a branch, since the only caller not passing NULL for loop_end_arg is also setting the value conditionally. Signed-off-by: Olivier Langlois Link: https://lore.kernel.org/r/d5b9bb91b1a08fff50525e1c18d7b4709b9ca100.1728828877.git.olivier@trillion01.com Signed-off-by: Jens Axboe --- io_uring/napi.c | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/io_uring/napi.c b/io_uring/napi.c index 6d5fdd397f2f..1de1543d8034 100644 --- a/io_uring/napi.c +++ b/io_uring/napi.c @@ -137,15 +137,12 @@ static bool io_napi_busy_loop_should_end(void *data, return false; } static bool __io_napi_do_busy_loop(struct io_ring_ctx *ctx, + bool (*loop_end)(void *, unsigned long), void *loop_end_arg) { struct io_napi_entry *e; - bool (*loop_end)(void *, unsigned long) = NULL; bool is_stale = false; - if (loop_end_arg) - loop_end = io_napi_busy_loop_should_end; list_for_each_entry_rcu(e, &ctx->napi_list, list) { napi_busy_loop_rcu(e->napi_id, loop_end, loop_end_arg, ctx->napi_prefer_busy_poll, BUSY_POLL_BUDGET); @@ -161,18 +158,22 @@ static void io_napi_blocking_busy_loop(struct io_ring_ctx *ctx, struct io_wait_queue *iowq) { unsigned long start_time = busy_loop_current_time(); + bool (*loop_end)(void *, unsigned long) = NULL; void *loop_end_arg = NULL; bool is_stale = false; /* Singular lists use a different napi loop end check function and are * only executed once.
*/ - if (list_is_singular(&ctx->napi_list)) + if (list_is_singular(&ctx->napi_list)) { + loop_end = io_napi_busy_loop_should_end; loop_end_arg = iowq; + } scoped_guard(rcu) { do { - is_stale = __io_napi_do_busy_loop(ctx, loop_end_arg); + is_stale = __io_napi_do_busy_loop(ctx, loop_end, + loop_end_arg); } while (!io_napi_busy_loop_should_end(iowq, start_time) && !loop_end_arg); } @@ -308,7 +309,7 @@ int io_napi_sqpoll_busy_poll(struct io_ring_ctx *ctx) return 0; scoped_guard(rcu) { - is_stale = __io_napi_do_busy_loop(ctx, NULL); + is_stale = __io_napi_do_busy_loop(ctx, NULL, NULL); } io_napi_remove_stale(ctx, is_stale); -- 2.51.0 From 6bf90bd8c58a305994948eb3409d91a7d8f2edae Mon Sep 17 00:00:00 2001 From: Olivier Langlois Date: Sun, 13 Oct 2024 14:29:24 -0400 Subject: [PATCH 12/16] io_uring/napi: add static napi tracking strategy Add the static napi tracking strategy. This allows the user to manually manage the napi ids list for busy polling and eliminates the overhead of dynamically updating the list from the fast path. Signed-off-by: Olivier Langlois Link: https://lore.kernel.org/r/96943de14968c35a5c599352259ad98f3c0770ba.1728828877.git.olivier@trillion01.com Signed-off-by: Jens Axboe --- include/linux/io_uring_types.h | 2 +- include/uapi/linux/io_uring.h | 32 ++++++++++- io_uring/fdinfo.c | 54 ++++++++++++++----- io_uring/napi.c | 97 ++++++++++++++++++++++++++++++---- io_uring/napi.h | 2 +- 5 files changed, 160 insertions(+), 27 deletions(-) diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h index fba2988accc3..072e65e93105 100644 --- a/include/linux/io_uring_types.h +++ b/include/linux/io_uring_types.h @@ -408,7 +408,7 @@ struct io_ring_ctx { /* napi busy poll default timeout */ ktime_t napi_busy_poll_dt; bool napi_prefer_busy_poll; - bool napi_enabled; + u8 napi_track_mode; DECLARE_HASHTABLE(napi_ht, 4); #endif diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index 47977a5c65f5..5d08435b95a8 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -790,12 +790,40 @@ struct io_uring_buf_status { __u32 resv[8]; }; +enum io_uring_napi_op { + /* register/unregister backward compatible opcode */ + IO_URING_NAPI_REGISTER_OP = 0, + + /* opcodes to update napi_list when static tracking is used */ + IO_URING_NAPI_STATIC_ADD_ID = 1, + IO_URING_NAPI_STATIC_DEL_ID = 2 +}; + +enum io_uring_napi_tracking_strategy { + /* value must be 0 for backward compatibility */ + IO_URING_NAPI_TRACKING_DYNAMIC = 0, + IO_URING_NAPI_TRACKING_STATIC = 1, + IO_URING_NAPI_TRACKING_INACTIVE = 255 +}; + /* argument for IORING_(UN)REGISTER_NAPI */ struct io_uring_napi { __u32 busy_poll_to; __u8 prefer_busy_poll; - __u8 pad[3]; - __u64 resv; + + /* a io_uring_napi_op value */ + __u8 opcode; + __u8 pad[2]; + + /* + * for IO_URING_NAPI_REGISTER_OP, it is a + * io_uring_napi_tracking_strategy value. + * + * for IO_URING_NAPI_STATIC_ADD_ID/IO_URING_NAPI_STATIC_DEL_ID + * it is the napi id to add/del from napi_list.
+ */ + __u32 op_param; + __u32 resv; }; /* diff --git a/io_uring/fdinfo.c b/io_uring/fdinfo.c index efbec34ccb18..b214e5a407b5 100644 --- a/io_uring/fdinfo.c +++ b/io_uring/fdinfo.c @@ -46,6 +46,46 @@ static __cold int io_uring_show_cred(struct seq_file *m, unsigned int id, return 0; } +#ifdef CONFIG_NET_RX_BUSY_POLL +static __cold void common_tracking_show_fdinfo(struct io_ring_ctx *ctx, + struct seq_file *m, + const char *tracking_strategy) +{ + seq_puts(m, "NAPI:\tenabled\n"); + seq_printf(m, "napi tracking:\t%s\n", tracking_strategy); + seq_printf(m, "napi_busy_poll_dt:\t%llu\n", ctx->napi_busy_poll_dt); + if (ctx->napi_prefer_busy_poll) + seq_puts(m, "napi_prefer_busy_poll:\ttrue\n"); + else + seq_puts(m, "napi_prefer_busy_poll:\tfalse\n"); +} + +static __cold void napi_show_fdinfo(struct io_ring_ctx *ctx, + struct seq_file *m) +{ + unsigned int mode = READ_ONCE(ctx->napi_track_mode); + + switch (mode) { + case IO_URING_NAPI_TRACKING_INACTIVE: + seq_puts(m, "NAPI:\tdisabled\n"); + break; + case IO_URING_NAPI_TRACKING_DYNAMIC: + common_tracking_show_fdinfo(ctx, m, "dynamic"); + break; + case IO_URING_NAPI_TRACKING_STATIC: + common_tracking_show_fdinfo(ctx, m, "static"); + break; + default: + seq_printf(m, "NAPI:\tunknown mode (%u)\n", mode); + } +} +#else +static inline void napi_show_fdinfo(struct io_ring_ctx *ctx, + struct seq_file *m) +{ +} +#endif + /* * Caller holds a reference to the file already, we don't need to do * anything else to get an extra reference. @@ -219,18 +259,6 @@ __cold void io_uring_show_fdinfo(struct seq_file *m, struct file *file) } spin_unlock(&ctx->completion_lock); - -#ifdef CONFIG_NET_RX_BUSY_POLL - if (ctx->napi_enabled) { - seq_puts(m, "NAPI:\tenabled\n"); - seq_printf(m, "napi_busy_poll_dt:\t%llu\n", ctx->napi_busy_poll_dt); - if (ctx->napi_prefer_busy_poll) - seq_puts(m, "napi_prefer_busy_poll:\ttrue\n"); - else - seq_puts(m, "napi_prefer_busy_poll:\tfalse\n"); - } else { - seq_puts(m, "NAPI:\tdisabled\n"); - } -#endif + napi_show_fdinfo(ctx, m); } #endif diff --git a/io_uring/napi.c b/io_uring/napi.c index 1de1543d8034..b1ade3fda30f 100644 --- a/io_uring/napi.c +++ b/io_uring/napi.c @@ -81,6 +81,27 @@ int __io_napi_add_id(struct io_ring_ctx *ctx, unsigned int napi_id) return 0; } +static int __io_napi_del_id(struct io_ring_ctx *ctx, unsigned int napi_id) +{ + struct hlist_head *hash_list; + struct io_napi_entry *e; + + /* Non-NAPI IDs can be rejected. 
*/ + if (napi_id < MIN_NAPI_ID) + return -EINVAL; + + hash_list = &ctx->napi_ht[hash_min(napi_id, HASH_BITS(ctx->napi_ht))]; + guard(spinlock)(&ctx->napi_lock); + e = io_napi_hash_find(hash_list, napi_id); + if (!e) + return -ENOENT; + + list_del_rcu(&e->list); + hash_del_rcu(&e->node); + kfree_rcu(e, rcu); + return 0; +} + static void __io_napi_remove_stale(struct io_ring_ctx *ctx) { struct io_napi_entry *e; @@ -136,9 +157,25 @@ static bool io_napi_busy_loop_should_end(void *data, return false; } -static bool __io_napi_do_busy_loop(struct io_ring_ctx *ctx, - bool (*loop_end)(void *, unsigned long), - void *loop_end_arg) +/* + * never report stale entries + */ +static bool static_tracking_do_busy_loop(struct io_ring_ctx *ctx, + bool (*loop_end)(void *, unsigned long), + void *loop_end_arg) +{ + struct io_napi_entry *e; + + list_for_each_entry_rcu(e, &ctx->napi_list, list) + napi_busy_loop_rcu(e->napi_id, loop_end, loop_end_arg, + ctx->napi_prefer_busy_poll, BUSY_POLL_BUDGET); + return false; +} + +static bool +dynamic_tracking_do_busy_loop(struct io_ring_ctx *ctx, + bool (*loop_end)(void *, unsigned long), + void *loop_end_arg) { struct io_napi_entry *e; bool is_stale = false; @@ -154,6 +191,16 @@ static bool __io_napi_do_busy_loop(struct io_ring_ctx *ctx, return is_stale; } +static inline bool +__io_napi_do_busy_loop(struct io_ring_ctx *ctx, + bool (*loop_end)(void *, unsigned long), + void *loop_end_arg) +{ + if (READ_ONCE(ctx->napi_track_mode) == IO_URING_NAPI_TRACKING_STATIC) + return static_tracking_do_busy_loop(ctx, loop_end, loop_end_arg); + return dynamic_tracking_do_busy_loop(ctx, loop_end, loop_end_arg); +} + static void io_napi_blocking_busy_loop(struct io_ring_ctx *ctx, struct io_wait_queue *iowq) { @@ -195,6 +242,7 @@ void io_napi_init(struct io_ring_ctx *ctx) spin_lock_init(&ctx->napi_lock); ctx->napi_prefer_busy_poll = false; ctx->napi_busy_poll_dt = ns_to_ktime(sys_dt); + ctx->napi_track_mode = IO_URING_NAPI_TRACKING_INACTIVE; } /* @@ -215,6 +263,24 @@ void io_napi_free(struct io_ring_ctx *ctx) INIT_LIST_HEAD_RCU(&ctx->napi_list); } +static int io_napi_register_napi(struct io_ring_ctx *ctx, + struct io_uring_napi *napi) +{ + switch (napi->op_param) { + case IO_URING_NAPI_TRACKING_DYNAMIC: + case IO_URING_NAPI_TRACKING_STATIC: + break; + default: + return -EINVAL; + } + /* clean the napi list for new settings */ + io_napi_free(ctx); + WRITE_ONCE(ctx->napi_track_mode, napi->op_param); + WRITE_ONCE(ctx->napi_busy_poll_dt, napi->busy_poll_to * NSEC_PER_USEC); + WRITE_ONCE(ctx->napi_prefer_busy_poll, !!napi->prefer_busy_poll); + return 0; +} + /* * io_napi_register() - Register napi with io-uring * @ctx: pointer to io-uring context structure @@ -226,7 +292,8 @@ int io_register_napi(struct io_ring_ctx *ctx, void __user *arg) { const struct io_uring_napi curr = { .busy_poll_to = ktime_to_us(ctx->napi_busy_poll_dt), - .prefer_busy_poll = ctx->napi_prefer_busy_poll + .prefer_busy_poll = ctx->napi_prefer_busy_poll, + .op_param = ctx->napi_track_mode }; struct io_uring_napi napi; @@ -234,16 +301,26 @@ int io_register_napi(struct io_ring_ctx *ctx, void __user *arg) return -EINVAL; if (copy_from_user(&napi, arg, sizeof(napi))) return -EFAULT; - if (napi.pad[0] || napi.pad[1] || napi.pad[2] || napi.resv) + if (napi.pad[0] || napi.pad[1] || napi.resv) return -EINVAL; if (copy_to_user(arg, &curr, sizeof(curr))) return -EFAULT; - WRITE_ONCE(ctx->napi_busy_poll_dt, napi.busy_poll_to * NSEC_PER_USEC); - WRITE_ONCE(ctx->napi_prefer_busy_poll, !!napi.prefer_busy_poll); - 
WRITE_ONCE(ctx->napi_enabled, true);
- return 0;
+ switch (napi.opcode) {
+ case IO_URING_NAPI_REGISTER_OP:
+ return io_napi_register_napi(ctx, &napi);
+ case IO_URING_NAPI_STATIC_ADD_ID:
+ if (curr.op_param != IO_URING_NAPI_TRACKING_STATIC)
+ return -EINVAL;
+ return __io_napi_add_id(ctx, napi.op_param);
+ case IO_URING_NAPI_STATIC_DEL_ID:
+ if (curr.op_param != IO_URING_NAPI_TRACKING_STATIC)
+ return -EINVAL;
+ return __io_napi_del_id(ctx, napi.op_param);
+ default:
+ return -EINVAL;
+ }
 }

 /*
@@ -266,7 +343,7 @@ int io_unregister_napi(struct io_ring_ctx *ctx, void __user *arg)

 WRITE_ONCE(ctx->napi_busy_poll_dt, 0);
 WRITE_ONCE(ctx->napi_prefer_busy_poll, false);
- WRITE_ONCE(ctx->napi_enabled, false);
+ WRITE_ONCE(ctx->napi_track_mode, IO_URING_NAPI_TRACKING_INACTIVE);
 return 0;
 }

diff --git a/io_uring/napi.h b/io_uring/napi.h
index 4ae622f37b30..fa742f42e09b 100644
--- a/io_uring/napi.h
+++ b/io_uring/napi.h
@@ -44,7 +44,7 @@ static inline void io_napi_add(struct io_kiocb *req)
 struct io_ring_ctx *ctx = req->ctx;
 struct socket *sock;

- if (!READ_ONCE(ctx->napi_enabled))
+ if (READ_ONCE(ctx->napi_track_mode) != IO_URING_NAPI_TRACKING_DYNAMIC)
 return;

 sock = sock_from_file(req->file);
-- 
2.51.0

From af0a2ffef0e6d23412dd55df29f5caef8f3583f2 Mon Sep 17 00:00:00 2001
From: Pavel Begunkov 
Date: Tue, 5 Nov 2024 02:12:33 +0000
Subject: [PATCH 13/16] io_uring: avoid normal tw intermediate fallback

When a DEFER_TASKRUN io_uring is terminating, it requeues deferred task
work items as normal tw, which can then fall back to kthread execution.
Avoid this extra step and always push them directly to the fallback
kthread.

Signed-off-by: Pavel Begunkov 
Link: https://lore.kernel.org/r/d1cd472cec2230c66bd1c8d412a5833f0af75384.1730772720.git.asml.silence@gmail.com
Signed-off-by: Jens Axboe 
---
 io_uring/io_uring.c | 21 ++++++++++-----------
 io_uring/io_uring.h | 2 +-
 2 files changed, 11 insertions(+), 12 deletions(-)

diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index 751b9e19da6e..042a65d38d0c 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -1066,9 +1066,8 @@ struct llist_node *io_handle_tw_list(struct llist_node *node,
 return node;
 }

-static __cold void io_fallback_tw(struct io_uring_task *tctx, bool sync)
+static __cold void __io_fallback_tw(struct llist_node *node, bool sync)
 {
- struct llist_node *node = llist_del_all(&tctx->task_list);
 struct io_ring_ctx *last_ctx = NULL;
 struct io_kiocb *req;

@@ -1094,6 +1093,13 @@ static __cold void io_fallback_tw(struct io_uring_task *tctx, bool sync)
 }
 }

+static void io_fallback_tw(struct io_uring_task *tctx, bool sync)
+{
+ struct llist_node *node = llist_del_all(&tctx->task_list);
+
+ __io_fallback_tw(node, sync);
+}
+
 struct llist_node *tctx_task_work_run(struct io_uring_task *tctx,
 unsigned int max_entries,
 unsigned int *count)
@@ -1247,16 +1253,9 @@ void io_req_task_work_add_remote(struct io_kiocb *req, struct io_ring_ctx *ctx,

 static void __cold io_move_task_work_from_local(struct io_ring_ctx *ctx)
 {
- struct llist_node *node;
+ struct llist_node *node = llist_del_all(&ctx->work_llist);

- node = llist_del_all(&ctx->work_llist);
- while (node) {
- struct io_kiocb *req = container_of(node, struct io_kiocb,
- io_task_work.node);
-
- node = node->next;
- io_req_normal_work_add(req);
- }
+ __io_fallback_tw(node, false);
 }

 static bool io_run_local_work_continue(struct io_ring_ctx *ctx, int events,
diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h
index 702c8e987430..4070d4c8ef97 100644
--- a/io_uring/io_uring.h
+++ b/io_uring/io_uring.h
@@ -136,7 +136,7 @@ static inline void io_lockdep_assert_cq_locked(struct io_ring_ctx *ctx)
 * Not from an SQE, as those cannot be submitted, but via
 * updating tagged resources.
 */
- if (ctx->submitter_task->flags & PF_EXITING)
+ if (percpu_ref_is_dying(&ctx->refs))
 lockdep_assert(current_work());
 else
 lockdep_assert(current == ctx->submitter_task);
-- 
2.51.0

From 0d98c509086837a8cf5a32f82f2a58f39a539192 Mon Sep 17 00:00:00 2001
From: Ming Lei 
Date: Thu, 7 Nov 2024 19:01:34 +0800
Subject: [PATCH 14/16] io_uring/rsrc: pass 'struct io_ring_ctx' reference to rsrc helpers

An `io_rsrc_node` instance is never shared among different io_uring ctxs,
and its allocating 'ctx' is always the same as the user's 'ctx', so it is
safe to pass the user's 'ctx' reference to the rsrc helpers. Even in
io_clone_buffers(), the `io_rsrc_node` instance is actually allocated for
the destination io_uring ctx.

io_rsrc_node_ctx() can then be removed, and the 8-byte `ctx` pointer will
be dropped from `io_rsrc_node` in the following patch.

Signed-off-by: Ming Lei 
Link: https://lore.kernel.org/r/20241107110149.890530-2-ming.lei@redhat.com
Signed-off-by: Jens Axboe 
---
 io_uring/filetable.c | 13 +++++++------
 io_uring/filetable.h | 4 ++--
 io_uring/rsrc.c | 24 +++++++++++-------------
 io_uring/rsrc.h | 22 +++++++++-------------
 io_uring/splice.c | 2 +-
 5 files changed, 30 insertions(+), 35 deletions(-)

diff --git a/io_uring/filetable.c b/io_uring/filetable.c
index 45f005f5db42..a21660e3145a 100644
--- a/io_uring/filetable.c
+++ b/io_uring/filetable.c
@@ -36,20 +36,21 @@ static int io_file_bitmap_get(struct io_ring_ctx *ctx)
 return -ENFILE;
 }

-bool io_alloc_file_tables(struct io_file_table *table, unsigned nr_files)
+bool io_alloc_file_tables(struct io_ring_ctx *ctx, struct io_file_table *table,
+ unsigned nr_files)
 {
 if (io_rsrc_data_alloc(&table->data, nr_files))
 return false;
 table->bitmap = bitmap_zalloc(nr_files, GFP_KERNEL_ACCOUNT);
 if (table->bitmap)
 return true;
- io_rsrc_data_free(&table->data);
+ io_rsrc_data_free(ctx, &table->data);
 return false;
 }

-void io_free_file_tables(struct io_file_table *table)
+void io_free_file_tables(struct io_ring_ctx *ctx, struct io_file_table *table)
 {
- io_rsrc_data_free(&table->data);
+ io_rsrc_data_free(ctx, &table->data);
 bitmap_free(table->bitmap);
 table->bitmap = NULL;
 }
@@ -71,7 +72,7 @@ static int io_install_fixed_file(struct io_ring_ctx *ctx, struct file *file,
 if (!node)
 return -ENOMEM;

- if (!io_reset_rsrc_node(&ctx->file_table.data, slot_index))
+ if (!io_reset_rsrc_node(ctx, &ctx->file_table.data, slot_index))
 io_file_bitmap_set(&ctx->file_table, slot_index);

 ctx->file_table.data.nodes[slot_index] = node;
@@ -130,7 +131,7 @@ int io_fixed_fd_remove(struct io_ring_ctx *ctx, unsigned int offset)
 node = io_rsrc_node_lookup(&ctx->file_table.data, offset);
 if (!node)
 return -EBADF;
- io_reset_rsrc_node(&ctx->file_table.data, offset);
+ io_reset_rsrc_node(ctx, &ctx->file_table.data, offset);
 io_file_bitmap_clear(&ctx->file_table, offset);
 return 0;
 }
diff --git a/io_uring/filetable.h b/io_uring/filetable.h
index bfacadb8d089..7717ea9efd0e 100644
--- a/io_uring/filetable.h
+++ b/io_uring/filetable.h
@@ -6,8 +6,8 @@
 #include 
 #include "rsrc.h"

-bool io_alloc_file_tables(struct io_file_table *table, unsigned nr_files);
-void io_free_file_tables(struct io_file_table *table);
+bool io_alloc_file_tables(struct io_ring_ctx *ctx, struct io_file_table *table, unsigned nr_files);
+void io_free_file_tables(struct io_ring_ctx *ctx, struct io_file_table *table);

 int io_fixed_fd_install(struct io_kiocb *req, unsigned int
issue_flags, struct file *file, unsigned int file_slot); diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c index 2fb1791d7255..d7db36a2c66e 100644 --- a/io_uring/rsrc.c +++ b/io_uring/rsrc.c @@ -130,13 +130,13 @@ struct io_rsrc_node *io_rsrc_node_alloc(struct io_ring_ctx *ctx, int type) return node; } -__cold void io_rsrc_data_free(struct io_rsrc_data *data) +__cold void io_rsrc_data_free(struct io_ring_ctx *ctx, struct io_rsrc_data *data) { if (!data->nr) return; while (data->nr--) { if (data->nodes[data->nr]) - io_put_rsrc_node(data->nodes[data->nr]); + io_put_rsrc_node(ctx, data->nodes[data->nr]); } kvfree(data->nodes); data->nodes = NULL; @@ -184,7 +184,7 @@ static int __io_sqe_files_update(struct io_ring_ctx *ctx, continue; i = up->offset + done; - if (io_reset_rsrc_node(&ctx->file_table.data, i)) + if (io_reset_rsrc_node(ctx, &ctx->file_table.data, i)) io_file_bitmap_clear(&ctx->file_table, i); if (fd != -1) { @@ -266,7 +266,7 @@ static int __io_sqe_buffers_update(struct io_ring_ctx *ctx, node->tag = tag; } i = array_index_nospec(up->offset + done, ctx->buf_table.nr); - io_reset_rsrc_node(&ctx->buf_table, i); + io_reset_rsrc_node(ctx, &ctx->buf_table, i); ctx->buf_table.nodes[i] = node; if (ctx->compat) user_data += sizeof(struct compat_iovec); @@ -442,10 +442,8 @@ int io_files_update(struct io_kiocb *req, unsigned int issue_flags) return IOU_OK; } -void io_free_rsrc_node(struct io_rsrc_node *node) +void io_free_rsrc_node(struct io_ring_ctx *ctx, struct io_rsrc_node *node) { - struct io_ring_ctx *ctx = io_rsrc_node_ctx(node); - lockdep_assert_held(&ctx->uring_lock); if (node->tag) @@ -473,7 +471,7 @@ int io_sqe_files_unregister(struct io_ring_ctx *ctx) if (!ctx->file_table.data.nr) return -ENXIO; - io_free_file_tables(&ctx->file_table); + io_free_file_tables(ctx, &ctx->file_table); io_file_table_set_alloc_range(ctx, 0, 0); return 0; } @@ -494,7 +492,7 @@ int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg, return -EMFILE; if (nr_args > rlimit(RLIMIT_NOFILE)) return -EMFILE; - if (!io_alloc_file_tables(&ctx->file_table, nr_args)) + if (!io_alloc_file_tables(ctx, &ctx->file_table, nr_args)) return -ENOMEM; for (i = 0; i < nr_args; i++) { @@ -551,7 +549,7 @@ int io_sqe_buffers_unregister(struct io_ring_ctx *ctx) { if (!ctx->buf_table.nr) return -ENXIO; - io_rsrc_data_free(&ctx->buf_table); + io_rsrc_data_free(ctx, &ctx->buf_table); return 0; } @@ -788,7 +786,7 @@ done: if (ret) { kvfree(imu); if (node) - io_put_rsrc_node(node); + io_put_rsrc_node(ctx, node); node = ERR_PTR(ret); } kvfree(pages); @@ -1018,7 +1016,7 @@ static int io_clone_buffers(struct io_ring_ctx *ctx, struct io_ring_ctx *src_ctx * old and new nodes at this point. 
*/
 if (arg->flags & IORING_REGISTER_DST_REPLACE)
- io_rsrc_data_free(&ctx->buf_table);
+ io_rsrc_data_free(ctx, &ctx->buf_table);

 /*
 * ctx->buf_table should be empty now - either the contents are being
@@ -1042,7 +1040,7 @@ out_put_free:
 kfree(data.nodes[i]);
 }
 out_unlock:
- io_rsrc_data_free(&data);
+ io_rsrc_data_free(ctx, &data);
 mutex_unlock(&src_ctx->uring_lock);
 mutex_lock(&ctx->uring_lock);
 return ret;
diff --git a/io_uring/rsrc.h b/io_uring/rsrc.h
index bc3a863b14bb..c9057f7a06f5 100644
--- a/io_uring/rsrc.h
+++ b/io_uring/rsrc.h
@@ -45,8 +45,8 @@ struct io_imu_folio_data {
 };

 struct io_rsrc_node *io_rsrc_node_alloc(struct io_ring_ctx *ctx, int type);
-void io_free_rsrc_node(struct io_rsrc_node *node);
-void io_rsrc_data_free(struct io_rsrc_data *data);
+void io_free_rsrc_node(struct io_ring_ctx *ctx, struct io_rsrc_node *node);
+void io_rsrc_data_free(struct io_ring_ctx *ctx, struct io_rsrc_data *data);
 int io_rsrc_data_alloc(struct io_rsrc_data *data, unsigned nr);

 int io_import_fixed(int ddir, struct iov_iter *iter,
@@ -76,19 +76,20 @@ static inline struct io_rsrc_node *io_rsrc_node_lookup(struct io_rsrc_data *data
 return NULL;
 }

-static inline void io_put_rsrc_node(struct io_rsrc_node *node)
+static inline void io_put_rsrc_node(struct io_ring_ctx *ctx, struct io_rsrc_node *node)
 {
 if (node && !--node->refs)
- io_free_rsrc_node(node);
+ io_free_rsrc_node(ctx, node);
 }

-static inline bool io_reset_rsrc_node(struct io_rsrc_data *data, int index)
+static inline bool io_reset_rsrc_node(struct io_ring_ctx *ctx,
+ struct io_rsrc_data *data, int index)
 {
 struct io_rsrc_node *node = data->nodes[index];

 if (!node)
 return false;
- io_put_rsrc_node(node);
+ io_put_rsrc_node(ctx, node);
 data->nodes[index] = NULL;
 return true;
 }
@@ -96,20 +97,15 @@ static inline bool io_reset_rsrc_node(struct io_rsrc_data *data, int index)
 static inline void io_req_put_rsrc_nodes(struct io_kiocb *req)
 {
 if (req->file_node) {
- io_put_rsrc_node(req->file_node);
+ io_put_rsrc_node(req->ctx, req->file_node);
 req->file_node = NULL;
 }
 if (req->flags & REQ_F_BUF_NODE) {
- io_put_rsrc_node(req->buf_node);
+ io_put_rsrc_node(req->ctx, req->buf_node);
 req->buf_node = NULL;
 }
 }

-static inline struct io_ring_ctx *io_rsrc_node_ctx(struct io_rsrc_node *node)
-{
- return (struct io_ring_ctx *) (node->ctx_ptr & ~IORING_RSRC_TYPE_MASK);
-}
-
 static inline int io_rsrc_node_type(struct io_rsrc_node *node)
 {
 return node->ctx_ptr & IORING_RSRC_TYPE_MASK;
diff --git a/io_uring/splice.c b/io_uring/splice.c
index e8ed15f4ea1a..5b84f1630611 100644
--- a/io_uring/splice.c
+++ b/io_uring/splice.c
@@ -51,7 +51,7 @@ void io_splice_cleanup(struct io_kiocb *req)
 {
 struct io_splice *sp = io_kiocb_to_cmd(req, struct io_splice);

- io_put_rsrc_node(sp->rsrc_node);
+ io_put_rsrc_node(req->ctx, sp->rsrc_node);
 }

 static struct file *io_splice_get_file(struct io_kiocb *req,
-- 
2.51.0

From 4f219fcce5e4366cc121fc98270beb1fbbb3df2b Mon Sep 17 00:00:00 2001
From: Ming Lei 
Date: Thu, 7 Nov 2024 19:01:35 +0800
Subject: [PATCH 15/16] io_uring/rsrc: remove '->ctx_ptr' of 'struct io_rsrc_node'

Remove the '->ctx_ptr' field of 'struct io_rsrc_node', add a plain 'type'
field in its place, and remove the now-unused io_rsrc_node_type() helper.
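[Editor's note: the '->ctx_ptr' trick being retired here is classic low-bit
pointer tagging: a kzalloc()'d structure is at least 8-byte aligned, so the
two least significant bits of its address are always zero and can carry a
small type value, which is exactly what IORING_RSRC_TYPE_MASK masked out.
Below is a minimal, self-contained user-space sketch of the same idea; the
names (tag_encode, node_ctx, node_type) are illustrative stand-ins, not
kernel APIs.

#include <assert.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

#define TYPE_MASK	0x3UL	/* two low bits: room for four node types */

struct ctx { int dummy; };

static uintptr_t tag_encode(struct ctx *ctx, unsigned int type)
{
	/* malloc() memory is at least 8-byte aligned on mainstream ABIs */
	assert(((uintptr_t)ctx & TYPE_MASK) == 0 && type <= TYPE_MASK);
	return (uintptr_t)ctx | type;
}

static struct ctx *node_ctx(uintptr_t tagged)
{
	return (struct ctx *)(tagged & ~TYPE_MASK);	/* strip the tag */
}

static unsigned int node_type(uintptr_t tagged)
{
	return tagged & TYPE_MASK;	/* recover the tag */
}

int main(void)
{
	struct ctx *ctx = malloc(sizeof(*ctx));
	uintptr_t tagged = tag_encode(ctx, 1);	/* e.g. a "buffer" node */

	printf("ctx=%p type=%u\n", (void *)node_ctx(tagged), node_type(tagged));
	free(ctx);
	return 0;
}

Once the previous patch started passing 'ctx' to every helper explicitly,
the decode step above became dead weight, which is why a plain 'type' byte
is the simpler representation.]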
Signed-off-by: Ming Lei 
Link: https://lore.kernel.org/r/20241107110149.890530-3-ming.lei@redhat.com
Signed-off-by: Jens Axboe 
---
 io_uring/rsrc.c | 4 ++--
 io_uring/rsrc.h | 9 +--------
 2 files changed, 3 insertions(+), 10 deletions(-)

diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c
index d7db36a2c66e..adaae8630932 100644
--- a/io_uring/rsrc.c
+++ b/io_uring/rsrc.c
@@ -124,7 +124,7 @@ struct io_rsrc_node *io_rsrc_node_alloc(struct io_ring_ctx *ctx, int type)
 node = kzalloc(sizeof(*node), GFP_KERNEL);
 if (node) {
- node->ctx_ptr = (unsigned long) ctx | type;
+ node->type = type;
 node->refs = 1;
 }
 return node;
@@ -449,7 +449,7 @@ void io_free_rsrc_node(struct io_ring_ctx *ctx, struct io_rsrc_node *node)
 if (node->tag)
 io_post_aux_cqe(ctx, node->tag, 0, 0);

- switch (io_rsrc_node_type(node)) {
+ switch (node->type) {
 case IORING_RSRC_FILE:
 if (io_slot_file(node))
 fput(io_slot_file(node));
diff --git a/io_uring/rsrc.h b/io_uring/rsrc.h
index c9057f7a06f5..c8a64a9ed5b9 100644
--- a/io_uring/rsrc.h
+++ b/io_uring/rsrc.h
@@ -11,12 +11,10 @@ enum {
 IORING_RSRC_FILE = 0,
 IORING_RSRC_BUFFER = 1,
-
- IORING_RSRC_TYPE_MASK = 0x3UL,
 };

 struct io_rsrc_node {
- unsigned long ctx_ptr;
+ unsigned char type;
 int refs;

 u64 tag;
@@ -106,11 +104,6 @@ static inline void io_req_put_rsrc_nodes(struct io_kiocb *req)
 }
 }

-static inline int io_rsrc_node_type(struct io_rsrc_node *node)
-{
- return node->ctx_ptr & IORING_RSRC_TYPE_MASK;
-}
-
 static inline void io_req_assign_rsrc_node(struct io_rsrc_node **dst_node,
 struct io_rsrc_node *node)
 {
-- 
2.51.0

From 039c878db7add23c1c9ea18424c442cce76670f9 Mon Sep 17 00:00:00 2001
From: Ming Lei 
Date: Thu, 7 Nov 2024 19:01:36 +0800
Subject: [PATCH 16/16] io_uring/rsrc: add & apply io_req_assign_buf_node()

The following pattern is showing up more and more often:

+ io_req_assign_rsrc_node(&req->buf_node, node);
+ req->flags |= REQ_F_BUF_NODE;

so make it a helper, which is less fragile to use than the open-coded
pair above; for example, the BUF_NODE flag is missing from the current
io_uring_cmd_prep().
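[Editor's note: the fragility argument is easy to reproduce outside the
kernel. Here is a stripped-down sketch with simplified stand-in types, not
the real io_uring structures; it only illustrates why pairing the two
steps behind one name helps.

#include <stdio.h>

#define REQ_F_BUF_NODE	(1u << 0)

struct node { int refs; };
struct req {
	unsigned int flags;
	struct node *buf_node;
};

/* generic step: take a reference and store the node */
static void assign_rsrc_node(struct node **dst, struct node *node)
{
	node->refs++;
	*dst = node;
}

/*
 * combined helper: the flag and the assignment can no longer be
 * separated, so a caller cannot forget one of the two
 */
static void assign_buf_node(struct req *req, struct node *node)
{
	assign_rsrc_node(&req->buf_node, node);
	req->flags |= REQ_F_BUF_NODE;
}

int main(void)
{
	struct node n = { .refs = 1 };
	struct req r = { 0 };

	assign_buf_node(&r, &n);
	printf("refs=%d buf-node flag set=%d\n",
	       n.refs, !!(r.flags & REQ_F_BUF_NODE));
	return 0;
}

Putting both steps behind a single helper turns a "remember to do both"
convention into something a call site cannot get wrong, which is the same
motivation as this patch.]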
Signed-off-by: Ming Lei Link: https://lore.kernel.org/r/20241107110149.890530-4-ming.lei@redhat.com Signed-off-by: Jens Axboe --- io_uring/net.c | 3 +-- io_uring/nop.c | 3 +-- io_uring/rsrc.h | 7 +++++++ io_uring/rw.c | 3 +-- io_uring/uring_cmd.c | 2 +- 5 files changed, 11 insertions(+), 7 deletions(-) diff --git a/io_uring/net.c b/io_uring/net.c index 2ccc2b409431..df1f7dc6f1c8 100644 --- a/io_uring/net.c +++ b/io_uring/net.c @@ -1348,8 +1348,7 @@ static int io_send_zc_import(struct io_kiocb *req, unsigned int issue_flags) io_ring_submit_lock(ctx, issue_flags); node = io_rsrc_node_lookup(&ctx->buf_table, sr->buf_index); if (node) { - io_req_assign_rsrc_node(&sr->notif->buf_node, node); - sr->notif->flags |= REQ_F_BUF_NODE; + io_req_assign_buf_node(sr->notif, node); ret = 0; } io_ring_submit_unlock(ctx, issue_flags); diff --git a/io_uring/nop.c b/io_uring/nop.c index bc22bcc739f3..6d470d4251ee 100644 --- a/io_uring/nop.c +++ b/io_uring/nop.c @@ -67,8 +67,7 @@ int io_nop(struct io_kiocb *req, unsigned int issue_flags) io_ring_submit_lock(ctx, issue_flags); node = io_rsrc_node_lookup(&ctx->buf_table, nop->buffer); if (node) { - io_req_assign_rsrc_node(&req->buf_node, node); - req->flags |= REQ_F_BUF_NODE; + io_req_assign_buf_node(req, node); ret = 0; } io_ring_submit_unlock(ctx, issue_flags); diff --git a/io_uring/rsrc.h b/io_uring/rsrc.h index c8a64a9ed5b9..7a4668deaa1a 100644 --- a/io_uring/rsrc.h +++ b/io_uring/rsrc.h @@ -111,6 +111,13 @@ static inline void io_req_assign_rsrc_node(struct io_rsrc_node **dst_node, *dst_node = node; } +static inline void io_req_assign_buf_node(struct io_kiocb *req, + struct io_rsrc_node *node) +{ + io_req_assign_rsrc_node(&req->buf_node, node); + req->flags |= REQ_F_BUF_NODE; +} + int io_files_update(struct io_kiocb *req, unsigned int issue_flags); int io_files_update_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); diff --git a/io_uring/rw.c b/io_uring/rw.c index e368b9afde03..b62cdb5fc936 100644 --- a/io_uring/rw.c +++ b/io_uring/rw.c @@ -341,8 +341,7 @@ static int io_prep_rw_fixed(struct io_kiocb *req, const struct io_uring_sqe *sqe node = io_rsrc_node_lookup(&ctx->buf_table, req->buf_index); if (!node) return -EFAULT; - io_req_assign_rsrc_node(&req->buf_node, node); - req->flags |= REQ_F_BUF_NODE; + io_req_assign_buf_node(req, node); io = req->async_data; ret = io_import_fixed(ddir, &io->iter, node->buf, rw->addr, rw->len); diff --git a/io_uring/uring_cmd.c b/io_uring/uring_cmd.c index 40b8b777ba12..b62965f58f30 100644 --- a/io_uring/uring_cmd.c +++ b/io_uring/uring_cmd.c @@ -219,7 +219,7 @@ int io_uring_cmd_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) * being called. This prevents destruction of the mapped buffer * we'll need at actual import time. */ - io_req_assign_rsrc_node(&req->buf_node, node); + io_req_assign_buf_node(req, node); } ioucmd->cmd_op = READ_ONCE(sqe->cmd_op); -- 2.51.0
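[Editor's note: to close the loop on the NAPI changes earlier in this
series, here is a hedged user-space sketch of driving the new static
tracking mode. It assumes liburing and uapi headers new enough to expose
io_uring_register_napi() together with the 'opcode'/'op_param' fields this
series adds; error handling is deliberately minimal.

#include <liburing.h>

/* register static NAPI tracking, then pin one NAPI ID to busy poll */
static int enable_static_napi(struct io_uring *ring, unsigned int napi_id)
{
	struct io_uring_napi napi = {
		.busy_poll_to = 50,	/* busy poll for up to 50 usec */
		.prefer_busy_poll = 1,
		.opcode = IO_URING_NAPI_REGISTER_OP,
		.op_param = IO_URING_NAPI_TRACKING_STATIC,
	};
	int ret;

	/* switch the ring from dynamic to static tracking */
	ret = io_uring_register_napi(ring, &napi);
	if (ret)
		return ret;

	/* add the single NAPI ID of interest; IDs below MIN_NAPI_ID get -EINVAL */
	napi.opcode = IO_URING_NAPI_STATIC_ADD_ID;
	napi.op_param = napi_id;
	return io_uring_register_napi(ring, &napi);
}

Note that io_register_napi() copies the ring's previous settings back to
user space on every call, so the struct is rewritten between the two
invocations; the sketch therefore reassigns 'opcode' and 'op_param' before
reusing it.]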