From 9fe99eed91e8273d3750367af759fe11e9512759 Mon Sep 17 00:00:00 2001 From: Caleb Sander Mateos Date: Sat, 29 Mar 2025 10:15:24 -0600 Subject: [PATCH 01/16] io_uring/wq: avoid indirect do_work/free_work calls struct io_wq stores do_work and free_work function pointers which are called on each work item. But these function pointers are always set to io_wq_submit_work and io_wq_free_work, respectively. So remove these function pointers and just call the functions directly. Signed-off-by: Caleb Sander Mateos Link: https://lore.kernel.org/r/20250329161527.3281314-1-csander@purestorage.com Signed-off-by: Jens Axboe --- io_uring/io-wq.c | 15 ++++----------- io_uring/io-wq.h | 5 ----- io_uring/io_uring.c | 2 +- io_uring/tctx.c | 2 -- 4 files changed, 5 insertions(+), 19 deletions(-) diff --git a/io_uring/io-wq.c b/io_uring/io-wq.c index 04a75d666195..d52069b1177b 100644 --- a/io_uring/io-wq.c +++ b/io_uring/io-wq.c @@ -114,9 +114,6 @@ enum { struct io_wq { unsigned long state; - free_work_fn *free_work; - io_wq_work_fn *do_work; - struct io_wq_hash *hash; atomic_t worker_refs; @@ -612,10 +609,10 @@ static void io_worker_handle_work(struct io_wq_acct *acct, if (do_kill && (work_flags & IO_WQ_WORK_UNBOUND)) atomic_or(IO_WQ_WORK_CANCEL, &work->flags); - wq->do_work(work); + io_wq_submit_work(work); io_assign_current_work(worker, NULL); - linked = wq->free_work(work); + linked = io_wq_free_work(work); work = next_hashed; if (!work && linked && !io_wq_is_hashed(linked)) { work = linked; @@ -934,8 +931,8 @@ static void io_run_cancel(struct io_wq_work *work, struct io_wq *wq) { do { atomic_or(IO_WQ_WORK_CANCEL, &work->flags); - wq->do_work(work); - work = wq->free_work(work); + io_wq_submit_work(work); + work = io_wq_free_work(work); } while (work); } @@ -1195,8 +1192,6 @@ struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data) int ret, i; struct io_wq *wq; - if (WARN_ON_ONCE(!data->free_work || !data->do_work)) - return ERR_PTR(-EINVAL); if (WARN_ON_ONCE(!bounded)) return ERR_PTR(-EINVAL); @@ -1206,8 +1201,6 @@ struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data) refcount_inc(&data->hash->refs); wq->hash = data->hash; - wq->free_work = data->free_work; - wq->do_work = data->do_work; ret = -ENOMEM; diff --git a/io_uring/io-wq.h b/io_uring/io-wq.h index d4fb2940e435..774abab54732 100644 --- a/io_uring/io-wq.h +++ b/io_uring/io-wq.h @@ -21,9 +21,6 @@ enum io_wq_cancel { IO_WQ_CANCEL_NOTFOUND, /* work not found */ }; -typedef struct io_wq_work *(free_work_fn)(struct io_wq_work *); -typedef void (io_wq_work_fn)(struct io_wq_work *); - struct io_wq_hash { refcount_t refs; unsigned long map; @@ -39,8 +36,6 @@ static inline void io_wq_put_hash(struct io_wq_hash *hash) struct io_wq_data { struct io_wq_hash *hash; struct task_struct *task; - io_wq_work_fn *do_work; - free_work_fn *free_work; }; struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data); diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index c6209fe44cb1..61514b14ee3f 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -1812,7 +1812,7 @@ void io_wq_submit_work(struct io_wq_work *work) bool needs_poll = false; int ret = 0, err = -ECANCELED; - /* one will be dropped by ->io_wq_free_work() after returning to io-wq */ + /* one will be dropped by io_wq_free_work() after returning to io-wq */ if (!(req->flags & REQ_F_REFCOUNT)) __io_req_set_refcount(req, 2); else diff --git a/io_uring/tctx.c b/io_uring/tctx.c index adc6e42c14df..5b66755579c0 100644 --- a/io_uring/tctx.c +++ b/io_uring/tctx.c @@ 
-35,8 +35,6 @@ static struct io_wq *io_init_wq_offload(struct io_ring_ctx *ctx, data.hash = hash; data.task = task; - data.free_work = io_wq_free_work; - data.do_work = io_wq_submit_work; /* Do QD, or 4 * CPUS, whatever is smallest */ concurrency = min(ctx->sq_entries, 4 * num_online_cpus()); -- 2.51.0 From e9ff9ae103573c7393d80533214a567654987ed5 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 31 Mar 2025 17:17:59 +0100 Subject: [PATCH 02/16] io_uring/net: don't use io_do_buffer_select at prep Prep code is interested in whether it's a selected buffer request, not whether a buffer has already been selected, which is what io_do_buffer_select() returns. Check for REQ_F_BUFFER_SELECT directly. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/4488a029ac698554bebf732263fe19d7734affa6.1743437358.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/net.c | 22 +++++++--------------- 1 file changed, 7 insertions(+), 15 deletions(-) diff --git a/io_uring/net.c b/io_uring/net.c index 24040bc3916a..b9a03aeda4bb 100644 --- a/io_uring/net.c +++ b/io_uring/net.c @@ -361,13 +361,9 @@ static int io_send_setup(struct io_kiocb *req, const struct io_uring_sqe *sqe) } if (sr->flags & IORING_RECVSEND_FIXED_BUF) return 0; - if (!io_do_buffer_select(req)) { - ret = import_ubuf(ITER_SOURCE, sr->buf, sr->len, - &kmsg->msg.msg_iter); - if (unlikely(ret < 0)) - return ret; - } - return 0; + if (req->flags & REQ_F_BUFFER_SELECT) + return 0; + return import_ubuf(ITER_SOURCE, sr->buf, sr->len, &kmsg->msg.msg_iter); } static int io_sendmsg_setup(struct io_kiocb *req, const struct io_uring_sqe *sqe) @@ -723,7 +719,6 @@ static int io_recvmsg_prep_setup(struct io_kiocb *req) { struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg); struct io_async_msghdr *kmsg; - int ret; kmsg = io_msg_alloc_async(req); if (unlikely(!kmsg)) @@ -739,13 +734,10 @@ static int io_recvmsg_prep_setup(struct io_kiocb *req) kmsg->msg.msg_iocb = NULL; kmsg->msg.msg_ubuf = NULL; - if (!io_do_buffer_select(req)) { - ret = import_ubuf(ITER_DEST, sr->buf, sr->len, - &kmsg->msg.msg_iter); - if (unlikely(ret)) - return ret; - } - return 0; + if (req->flags & REQ_F_BUFFER_SELECT) + return 0; + return import_ubuf(ITER_DEST, sr->buf, sr->len, + &kmsg->msg.msg_iter); } return io_recvmsg_copy_hdr(req, kmsg); -- 2.51.0 From e6f74fd67d50c8a938fcfa83c97cd06995f6aaa1 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 31 Mar 2025 17:18:00 +0100 Subject: [PATCH 03/16] io_uring: set IMPORT_BUFFER in generic send setup Move REQ_F_IMPORT_BUFFER to the common send setup. Currently, the only user is send zc, but we'll want normal sends to support that in the future.
Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/18b74edfc61d8a1b6c9fed3b78a9276fe80f8ced.1743437358.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/net.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/io_uring/net.c b/io_uring/net.c index b9a03aeda4bb..eb2130112e0e 100644 --- a/io_uring/net.c +++ b/io_uring/net.c @@ -359,8 +359,10 @@ static int io_send_setup(struct io_kiocb *req, const struct io_uring_sqe *sqe) kmsg->msg.msg_name = &kmsg->addr; kmsg->msg.msg_namelen = addr_len; } - if (sr->flags & IORING_RECVSEND_FIXED_BUF) + if (sr->flags & IORING_RECVSEND_FIXED_BUF) { + req->flags |= REQ_F_IMPORT_BUFFER; return 0; + } if (req->flags & REQ_F_BUFFER_SELECT) return 0; return import_ubuf(ITER_SOURCE, sr->buf, sr->len, &kmsg->msg.msg_iter); @@ -1313,8 +1315,6 @@ int io_send_zc_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) return -ENOMEM; if (req->opcode == IORING_OP_SEND_ZC) { - if (zc->flags & IORING_RECVSEND_FIXED_BUF) - req->flags |= REQ_F_IMPORT_BUFFER; ret = io_send_setup(req, sqe); } else { if (unlikely(sqe->addr2 || sqe->file_index)) -- 2.51.0 From c0e965052149c883317774711205456d08285741 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 31 Mar 2025 17:18:01 +0100 Subject: [PATCH 04/16] io_uring/kbuf: pass bgid to io_buffer_select() The current situation with buffer group id juggling is not ideal. req->buf_index first stores the bgid, then it's overwritten by a buffer id, and then it can get restored back on recycling, etc. It's not so easy to control, and it's not handled consistently across request types, with receive requests saving and restoring the bgid by hand. It's a prep patch that adds a buffer group id argument to io_buffer_select(). The caller will be responsible for stashing a copy somewhere and passing it into the function.
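To make the new calling convention concrete, here is a minimal sketch (illustrative only, not code from the series; io_example_prep()/io_example_issue() and struct io_example are made-up names modelled on how net.c uses the helper after this patch): the handler stashes the group id from sqe->buf_index at prep time and passes it to io_buffer_select() at issue time.

struct io_example {
	struct file *file;
	size_t len;
	unsigned buf_group;
};

/* prep: sqe->buf_index carries the buffer group id for provided buffers */
static int io_example_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
	struct io_example *ex = io_kiocb_to_cmd(req, struct io_example);

	ex->len = READ_ONCE(sqe->len);
	if (req->flags & REQ_F_BUFFER_SELECT)
		ex->buf_group = req->buf_index;	/* stash a copy for issue time */
	return 0;
}

/* issue: the stashed group id is now passed to io_buffer_select() explicitly */
static int io_example_issue(struct io_kiocb *req, unsigned int issue_flags)
{
	struct io_example *ex = io_kiocb_to_cmd(req, struct io_example);
	size_t len = ex->len;
	void __user *buf;

	buf = io_buffer_select(req, &len, ex->buf_group, issue_flags);
	if (!buf)
		return -ENOBUFS;
	/* ... import buf/len and perform the I/O ... */
	return 0;
}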
Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/a210d6427cc3f4f42271a6853274cd5a50e56820.1743437358.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/kbuf.c | 4 ++-- io_uring/kbuf.h | 2 +- io_uring/net.c | 9 ++++----- io_uring/rw.c | 5 ++++- io_uring/rw.h | 2 ++ 5 files changed, 13 insertions(+), 9 deletions(-) diff --git a/io_uring/kbuf.c b/io_uring/kbuf.c index 953d5e742569..f195876732be 100644 --- a/io_uring/kbuf.c +++ b/io_uring/kbuf.c @@ -193,7 +193,7 @@ static void __user *io_ring_buffer_select(struct io_kiocb *req, size_t *len, } void __user *io_buffer_select(struct io_kiocb *req, size_t *len, - unsigned int issue_flags) + unsigned buf_group, unsigned int issue_flags) { struct io_ring_ctx *ctx = req->ctx; struct io_buffer_list *bl; @@ -201,7 +201,7 @@ void __user *io_buffer_select(struct io_kiocb *req, size_t *len, io_ring_submit_lock(req->ctx, issue_flags); - bl = io_buffer_get_list(ctx, req->buf_index); + bl = io_buffer_get_list(ctx, buf_group); if (likely(bl)) { if (bl->flags & IOBL_BUF_RING) ret = io_ring_buffer_select(req, len, bl, issue_flags); diff --git a/io_uring/kbuf.h b/io_uring/kbuf.h index 2ec0b983ce24..09129115f3ef 100644 --- a/io_uring/kbuf.h +++ b/io_uring/kbuf.h @@ -58,7 +58,7 @@ struct buf_sel_arg { }; void __user *io_buffer_select(struct io_kiocb *req, size_t *len, - unsigned int issue_flags); + unsigned buf_group, unsigned int issue_flags); int io_buffers_select(struct io_kiocb *req, struct buf_sel_arg *arg, unsigned int issue_flags); int io_buffers_peek(struct io_kiocb *req, struct buf_sel_arg *arg); diff --git a/io_uring/net.c b/io_uring/net.c index eb2130112e0e..6314b1583c8c 100644 --- a/io_uring/net.c +++ b/io_uring/net.c @@ -407,13 +407,12 @@ int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) sr->msg_flags = READ_ONCE(sqe->msg_flags) | MSG_NOSIGNAL; if (sr->msg_flags & MSG_DONTWAIT) req->flags |= REQ_F_NOWAIT; + if (req->flags & REQ_F_BUFFER_SELECT) + sr->buf_group = req->buf_index; if (sr->flags & IORING_RECVSEND_BUNDLE) { if (req->opcode == IORING_OP_SENDMSG) return -EINVAL; - if (!(req->flags & REQ_F_BUFFER_SELECT)) - return -EINVAL; sr->msg_flags |= MSG_WAITALL; - sr->buf_group = req->buf_index; req->buf_list = NULL; req->flags |= REQ_F_MULTISHOT; } @@ -979,7 +978,7 @@ retry_multishot: void __user *buf; size_t len = sr->len; - buf = io_buffer_select(req, &len, issue_flags); + buf = io_buffer_select(req, &len, sr->buf_group, issue_flags); if (!buf) return -ENOBUFS; @@ -1089,7 +1088,7 @@ static int io_recv_buf_select(struct io_kiocb *req, struct io_async_msghdr *kmsg void __user *buf; *len = sr->len; - buf = io_buffer_select(req, len, issue_flags); + buf = io_buffer_select(req, len, sr->buf_group, issue_flags); if (!buf) return -ENOBUFS; sr->buf = buf; diff --git a/io_uring/rw.c b/io_uring/rw.c index 039e063f7091..17a12a1cf3a6 100644 --- a/io_uring/rw.c +++ b/io_uring/rw.c @@ -119,7 +119,7 @@ static int __io_import_rw_buffer(int ddir, struct io_kiocb *req, return io_import_vec(ddir, req, io, buf, sqe_len); if (io_do_buffer_select(req)) { - buf = io_buffer_select(req, &sqe_len, issue_flags); + buf = io_buffer_select(req, &sqe_len, io->buf_group, issue_flags); if (!buf) return -ENOBUFS; rw->addr = (unsigned long) buf; @@ -253,16 +253,19 @@ static int __io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe, int ddir) { struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw); + struct io_async_rw *io; unsigned ioprio; u64 attr_type_mask; int ret; if (io_rw_alloc_async(req)) return -ENOMEM; + io = 
req->async_data; rw->kiocb.ki_pos = READ_ONCE(sqe->off); /* used for fixed read/write too - just read unconditionally */ req->buf_index = READ_ONCE(sqe->buf_index); + io->buf_group = req->buf_index; ioprio = READ_ONCE(sqe->ioprio); if (ioprio) { diff --git a/io_uring/rw.h b/io_uring/rw.h index 81d6d9a8cf69..129a53fe5482 100644 --- a/io_uring/rw.h +++ b/io_uring/rw.h @@ -16,6 +16,8 @@ struct io_async_rw { struct iov_iter iter; struct iov_iter_state iter_state; struct iovec fast_iov; + unsigned buf_group; + /* * wpq is for buffered io, while meta fields are used with * direct io -- 2.51.0 From bd32923e5f02fa7b04d487ec265dc8080d27a257 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 31 Mar 2025 17:18:02 +0100 Subject: [PATCH 05/16] io_uring: don't store bgid in req->buf_index Pass buffer group id into the rest of helpers via struct buf_sel_arg and remove all reassignments of req->buf_index back to bgid. Now, it only stores buffer indexes, and the group is provided by callers. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/3ea9fa08113ecb4d9224b943e7806e80a324bdf9.1743437358.git.asml.silence@gmail.com Link: https://lore.kernel.org/io-uring/0c01d76ff12986c2f48614db8610caff8f78c869.1743500909.git.asml.silence@gmail.com/ [axboe: fold in patch from second link] Signed-off-by: Jens Axboe --- include/linux/io_uring_types.h | 3 +-- io_uring/kbuf.c | 11 ++++------- io_uring/kbuf.h | 2 +- io_uring/net.c | 3 ++- 4 files changed, 8 insertions(+), 11 deletions(-) diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h index b44d201520d8..3b467879bca8 100644 --- a/include/linux/io_uring_types.h +++ b/include/linux/io_uring_types.h @@ -653,8 +653,7 @@ struct io_kiocb { u8 iopoll_completed; /* * Can be either a fixed buffer index, or used with provided buffers. - * For the latter, before issue it points to the buffer group ID, - * and after selection it points to the buffer ID itself. + * For the latter, it points to the selected buffer ID. 
*/ u16 buf_index; diff --git a/io_uring/kbuf.c b/io_uring/kbuf.c index f195876732be..1cf0d2c01287 100644 --- a/io_uring/kbuf.c +++ b/io_uring/kbuf.c @@ -92,7 +92,6 @@ void io_kbuf_drop_legacy(struct io_kiocb *req) { if (WARN_ON_ONCE(!(req->flags & REQ_F_BUFFER_SELECTED))) return; - req->buf_index = req->kbuf->bgid; req->flags &= ~REQ_F_BUFFER_SELECTED; kfree(req->kbuf); req->kbuf = NULL; @@ -110,7 +109,6 @@ bool io_kbuf_recycle_legacy(struct io_kiocb *req, unsigned issue_flags) bl = io_buffer_get_list(ctx, buf->bgid); list_add(&buf->list, &bl->buf_list); req->flags &= ~REQ_F_BUFFER_SELECTED; - req->buf_index = buf->bgid; io_ring_submit_unlock(ctx, issue_flags); return true; @@ -302,7 +300,7 @@ int io_buffers_select(struct io_kiocb *req, struct buf_sel_arg *arg, int ret = -ENOENT; io_ring_submit_lock(ctx, issue_flags); - bl = io_buffer_get_list(ctx, req->buf_index); + bl = io_buffer_get_list(ctx, arg->buf_group); if (unlikely(!bl)) goto out_unlock; @@ -335,7 +333,7 @@ int io_buffers_peek(struct io_kiocb *req, struct buf_sel_arg *arg) lockdep_assert_held(&ctx->uring_lock); - bl = io_buffer_get_list(ctx, req->buf_index); + bl = io_buffer_get_list(ctx, arg->buf_group); if (unlikely(!bl)) return -ENOENT; @@ -355,10 +353,9 @@ static inline bool __io_put_kbuf_ring(struct io_kiocb *req, int len, int nr) struct io_buffer_list *bl = req->buf_list; bool ret = true; - if (bl) { + if (bl) ret = io_kbuf_commit(req, bl, len, nr); - req->buf_index = bl->bgid; - } + req->flags &= ~REQ_F_BUFFER_RING; return ret; } diff --git a/io_uring/kbuf.h b/io_uring/kbuf.h index 09129115f3ef..0798a732e6cb 100644 --- a/io_uring/kbuf.h +++ b/io_uring/kbuf.h @@ -55,6 +55,7 @@ struct buf_sel_arg { size_t max_len; unsigned short nr_iovs; unsigned short mode; + unsigned buf_group; }; void __user *io_buffer_select(struct io_kiocb *req, size_t *len, @@ -94,7 +95,6 @@ static inline bool io_kbuf_recycle_ring(struct io_kiocb *req) * to monopolize the buffer. */ if (req->buf_list) { - req->buf_index = req->buf_list->bgid; req->flags &= ~(REQ_F_BUFFER_RING|REQ_F_BUFFERS_COMMIT); return true; } diff --git a/io_uring/net.c b/io_uring/net.c index 6314b1583c8c..5f1a519d1fc6 100644 --- a/io_uring/net.c +++ b/io_uring/net.c @@ -190,7 +190,6 @@ static inline void io_mshot_prep_retry(struct io_kiocb *req, sr->done_io = 0; sr->retry = false; sr->len = 0; /* get from the provided buffer */ - req->buf_index = sr->buf_group; } static int io_net_import_vec(struct io_kiocb *req, struct io_async_msghdr *iomsg, @@ -568,6 +567,7 @@ static int io_send_select_buffer(struct io_kiocb *req, unsigned int issue_flags, .iovs = &kmsg->fast_iov, .max_len = min_not_zero(sr->len, INT_MAX), .nr_iovs = 1, + .buf_group = sr->buf_group, }; if (kmsg->vec.iovec) { @@ -1056,6 +1056,7 @@ static int io_recv_buf_select(struct io_kiocb *req, struct io_async_msghdr *kmsg .iovs = &kmsg->fast_iov, .nr_iovs = 1, .mode = KBUF_MODE_EXPAND, + .buf_group = sr->buf_group, }; if (kmsg->vec.iovec) { -- 2.51.0 From 53db8a71ecb42c2ec5e9c6925269a750255f9af5 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 4 Apr 2025 14:50:59 -0600 Subject: [PATCH 06/16] io_uring: add support for IORING_OP_PIPE This works just like pipe2(2), except it also supports fixed file descriptors. Used in a similar fashion as for other fd instantiating opcodes (like accept, socket, open, etc), where sqe->file_slot is set appropriately if two direct descriptors are desired rather than a set of normal file descriptors. 
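To make the submission interface concrete, here is a minimal userspace sketch (not part of this series; it assumes liburing for ring setup, uses the raw SQE fields added by this patch, and omits most error handling):

#include <fcntl.h>
#include <string.h>
#include <liburing.h>

/* queue an IORING_OP_PIPE request and wait for its completion */
static int ring_pipe(struct io_uring *ring, int fds[2])
{
	struct io_uring_sqe *sqe = io_uring_get_sqe(ring);
	struct io_uring_cqe *cqe;
	int ret;

	memset(sqe, 0, sizeof(*sqe));
	sqe->opcode = IORING_OP_PIPE;
	sqe->addr = (unsigned long) fds;	/* kernel copies the two fds here */
	sqe->pipe_flags = O_CLOEXEC;		/* pipe2(2)-style flags */
	/* setting sqe->file_index instead requests two direct descriptors */

	io_uring_submit(ring);
	ret = io_uring_wait_cqe(ring, &cqe);
	if (ret)
		return ret;
	ret = cqe->res;				/* 0 on success */
	io_uring_cqe_seen(ring, cqe);
	return ret;
}

A dedicated liburing prep helper may exist or be added later; the raw fields are shown here only to mirror what io_pipe_prep() reads.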
sqe->addr must be set to a pointer to an array of 2 integers, which is where the fixed/normal file descriptors are copied to. sqe->pipe_flags contains flags, same as what is allowed for pipe2(2). Future expansion of per-op private flags can go in sqe->ioprio, like we do for other opcodes that take both a "syscall" flag set and an io_uring opcode specific flag set. Signed-off-by: Jens Axboe --- include/uapi/linux/io_uring.h | 2 + io_uring/opdef.c | 7 ++ io_uring/openclose.c | 133 ++++++++++++++++++++++++++++++++++ io_uring/openclose.h | 3 + 4 files changed, 145 insertions(+) diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index 8f1fc12bac46..130f3bc71a69 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -73,6 +73,7 @@ struct io_uring_sqe { __u32 futex_flags; __u32 install_fd_flags; __u32 nop_flags; + __u32 pipe_flags; }; __u64 user_data; /* data to be passed back at completion time */ /* pack this to avoid bogus arm OABI complaints */ @@ -283,6 +284,7 @@ enum io_uring_op { IORING_OP_EPOLL_WAIT, IORING_OP_READV_FIXED, IORING_OP_WRITEV_FIXED, + IORING_OP_PIPE, /* this goes last, obviously */ IORING_OP_LAST, diff --git a/io_uring/opdef.c b/io_uring/opdef.c index 489384c0438b..db36433c2294 100644 --- a/io_uring/opdef.c +++ b/io_uring/opdef.c @@ -569,6 +569,10 @@ const struct io_issue_def io_issue_defs[] = { .prep = io_prep_writev_fixed, .issue = io_write, }, + [IORING_OP_PIPE] = { + .prep = io_pipe_prep, + .issue = io_pipe, + }, }; const struct io_cold_def io_cold_defs[] = { @@ -815,6 +819,9 @@ const struct io_cold_def io_cold_defs[] = { .cleanup = io_readv_writev_cleanup, .fail = io_rw_fail, }, + [IORING_OP_PIPE] = { + .name = "PIPE", + }, }; const char *io_uring_get_opcode(u8 opcode) diff --git a/io_uring/openclose.c b/io_uring/openclose.c index e3357dfa14ca..4dd461163457 100644 --- a/io_uring/openclose.c +++ b/io_uring/openclose.c @@ -6,6 +6,8 @@ #include #include #include +#include +#include #include #include @@ -302,3 +304,134 @@ int io_install_fixed_fd(struct io_kiocb *req, unsigned int issue_flags) io_req_set_res(req, ret, 0); return IOU_OK; } + +struct io_pipe { + struct file *file; + int __user *fds; + int flags; + int file_slot; + unsigned long nofile; +}; + +int io_pipe_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) +{ + struct io_pipe *p = io_kiocb_to_cmd(req, struct io_pipe); + + if (sqe->fd || sqe->off || sqe->addr3) + return -EINVAL; + + p->fds = u64_to_user_ptr(READ_ONCE(sqe->addr)); + p->flags = READ_ONCE(sqe->pipe_flags); + if (p->flags & ~(O_CLOEXEC | O_NONBLOCK | O_DIRECT | O_NOTIFICATION_PIPE)) + return -EINVAL; + + p->file_slot = READ_ONCE(sqe->file_index); + p->nofile = rlimit(RLIMIT_NOFILE); + return 0; +} + +static int io_pipe_fixed(struct io_kiocb *req, struct file **files, + unsigned int issue_flags) +{ + struct io_pipe *p = io_kiocb_to_cmd(req, struct io_pipe); + struct io_ring_ctx *ctx = req->ctx; + int ret, fds[2] = { -1, -1 }; + int slot = p->file_slot; + + if (p->flags & O_CLOEXEC) + return -EINVAL; + + io_ring_submit_lock(ctx, issue_flags); + + ret = __io_fixed_fd_install(ctx, files[0], slot); + if (ret < 0) + goto err; + fds[0] = ret; + files[0] = NULL; + + /* + * If a specific slot is given, next one will be used for + * the write side. 
+ */ + if (slot != IORING_FILE_INDEX_ALLOC) + slot++; + + ret = __io_fixed_fd_install(ctx, files[1], slot); + if (ret < 0) + goto err; + fds[1] = ret; + files[1] = NULL; + + io_ring_submit_unlock(ctx, issue_flags); + + if (!copy_to_user(p->fds, fds, sizeof(fds))) + return 0; + + ret = -EFAULT; + io_ring_submit_lock(ctx, issue_flags); +err: + if (fds[0] != -1) + io_fixed_fd_remove(ctx, fds[0]); + if (fds[1] != -1) + io_fixed_fd_remove(ctx, fds[1]); + io_ring_submit_unlock(ctx, issue_flags); + return ret; +} + +static int io_pipe_fd(struct io_kiocb *req, struct file **files) +{ + struct io_pipe *p = io_kiocb_to_cmd(req, struct io_pipe); + int ret, fds[2] = { -1, -1 }; + + ret = __get_unused_fd_flags(p->flags, p->nofile); + if (ret < 0) + goto err; + fds[0] = ret; + + ret = __get_unused_fd_flags(p->flags, p->nofile); + if (ret < 0) + goto err; + fds[1] = ret; + + if (!copy_to_user(p->fds, fds, sizeof(fds))) { + fd_install(fds[0], files[0]); + fd_install(fds[1], files[1]); + return 0; + } + ret = -EFAULT; +err: + if (fds[0] != -1) + put_unused_fd(fds[0]); + if (fds[1] != -1) + put_unused_fd(fds[1]); + return ret; +} + +int io_pipe(struct io_kiocb *req, unsigned int issue_flags) +{ + struct io_pipe *p = io_kiocb_to_cmd(req, struct io_pipe); + struct file *files[2]; + int ret; + + ret = create_pipe_files(files, p->flags); + if (ret) + return ret; + files[0]->f_mode |= FMODE_NOWAIT; + files[1]->f_mode |= FMODE_NOWAIT; + + if (!!p->file_slot) + ret = io_pipe_fixed(req, files, issue_flags); + else + ret = io_pipe_fd(req, files); + + io_req_set_res(req, ret, 0); + if (!ret) + return IOU_OK; + + req_set_fail(req); + if (files[0]) + fput(files[0]); + if (files[1]) + fput(files[1]); + return ret; +} diff --git a/io_uring/openclose.h b/io_uring/openclose.h index 8a93c98ad0ad..4ca2a9935abc 100644 --- a/io_uring/openclose.h +++ b/io_uring/openclose.h @@ -13,5 +13,8 @@ int io_openat2(struct io_kiocb *req, unsigned int issue_flags); int io_close_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); int io_close(struct io_kiocb *req, unsigned int issue_flags); +int io_pipe_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); +int io_pipe(struct io_kiocb *req, unsigned int issue_flags); + int io_install_fixed_fd_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); int io_install_fixed_fd(struct io_kiocb *req, unsigned int issue_flags); -- 2.51.0 From 8a2dacd49f1d19f4e3a4ebc99d11622cafd1659e Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 16 Apr 2025 13:25:03 -0600 Subject: [PATCH 07/16] io_uring/rsrc: remove node assignment helpers There are two helpers here, one assigns and increments the node ref count, and the other is simply a wrapper around that for the buffer node handling. The buffer node assignment benefits from checking and setting REQ_F_BUF_NODE together, otherwise stalls have been observed on setting that flag later in the process. Hence re-do it so that it's set when checked, and cleared in case of (unlikely) failure. With that, the buffer node helper can go, and then drop the generic io_req_assign_rsrc_node() helper as well as there's only a single user of it left at that point. 
Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 3 ++- io_uring/rsrc.c | 12 +++++++++--- io_uring/rsrc.h | 14 -------------- 3 files changed, 11 insertions(+), 18 deletions(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 61514b14ee3f..75c022526548 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -1912,7 +1912,8 @@ inline struct file *io_file_get_fixed(struct io_kiocb *req, int fd, io_ring_submit_lock(ctx, issue_flags); node = io_rsrc_node_lookup(&ctx->file_table.data, fd); if (node) { - io_req_assign_rsrc_node(&req->file_node, node); + node->refs++; + req->file_node = node; req->flags |= io_slot_flags(node); file = io_slot_file(node); } diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c index f80a77c4973f..107064c9c6fc 100644 --- a/io_uring/rsrc.c +++ b/io_uring/rsrc.c @@ -1110,13 +1110,19 @@ inline struct io_rsrc_node *io_find_buf_node(struct io_kiocb *req, if (req->flags & REQ_F_BUF_NODE) return req->buf_node; + req->flags |= REQ_F_BUF_NODE; io_ring_submit_lock(ctx, issue_flags); node = io_rsrc_node_lookup(&ctx->buf_table, req->buf_index); - if (node) - io_req_assign_buf_node(req, node); + if (node) { + node->refs++; + req->buf_node = node; + io_ring_submit_unlock(ctx, issue_flags); + return node; + } + req->flags &= ~REQ_F_BUF_NODE; io_ring_submit_unlock(ctx, issue_flags); - return node; + return NULL; } int io_import_reg_buf(struct io_kiocb *req, struct iov_iter *iter, diff --git a/io_uring/rsrc.h b/io_uring/rsrc.h index b52242852ff3..6008ad2e6d9e 100644 --- a/io_uring/rsrc.h +++ b/io_uring/rsrc.h @@ -127,20 +127,6 @@ static inline void io_req_put_rsrc_nodes(struct io_kiocb *req) } } -static inline void io_req_assign_rsrc_node(struct io_rsrc_node **dst_node, - struct io_rsrc_node *node) -{ - node->refs++; - *dst_node = node; -} - -static inline void io_req_assign_buf_node(struct io_kiocb *req, - struct io_rsrc_node *node) -{ - io_req_assign_rsrc_node(&req->buf_node, node); - req->flags |= REQ_F_BUF_NODE; -} - int io_files_update(struct io_kiocb *req, unsigned int issue_flags); int io_files_update_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); -- 2.51.0 From ea76925614189bdcb6571e2ea8de68af409ebd56 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Sat, 19 Apr 2025 18:47:04 +0100 Subject: [PATCH 08/16] io_uring/rsrc: use unpin_user_folio We want to have a full folio to be left pinned but with only one reference, for that we "unpin" all but the first page with unpin_user_pages(), which can be confusing. There is a new helper to achieve that called unpin_user_folio(), so use that. Signed-off-by: Pavel Begunkov Reviewed-by: Anuj Gupta Link: https://lore.kernel.org/r/e0b2be8f9ea68f6b351ec3bb046f04f437f68491.1745083025.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/rsrc.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c index 107064c9c6fc..6bf8dff4adf3 100644 --- a/io_uring/rsrc.c +++ b/io_uring/rsrc.c @@ -699,10 +699,9 @@ static bool io_coalesce_buffer(struct page ***pages, int *nr_pages, * The pages are bound to the folio, it doesn't * actually unpin them but drops all but one reference, * which is usually put down by io_buffer_unmap(). - * Note, needs a better helper. 
*/ if (data->nr_pages_head > 1) - unpin_user_pages(&page_array[1], data->nr_pages_head - 1); + unpin_user_folio(page_folio(new_array[0]), data->nr_pages_head - 1); j = data->nr_pages_head; nr_pages_left -= data->nr_pages_head; @@ -713,7 +712,7 @@ static bool io_coalesce_buffer(struct page ***pages, int *nr_pages, nr_unpin = min_t(unsigned int, nr_pages_left - 1, data->nr_pages_mid - 1); if (nr_unpin) - unpin_user_pages(&page_array[j+1], nr_unpin); + unpin_user_folio(page_folio(new_array[i]), nr_unpin); j += data->nr_pages_mid; nr_pages_left -= data->nr_pages_mid; } -- 2.51.0 From 9cebcf7b0c38bca1b501d8716163aa254b230559 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Sat, 19 Apr 2025 18:47:05 +0100 Subject: [PATCH 09/16] io_uring/rsrc: clean up io_coalesce_buffer() We don't need special handling for the first page in io_coalesce_buffer(), move it inside the loop. Signed-off-by: Pavel Begunkov Reviewed-by: Anuj Gupta Link: https://lore.kernel.org/r/ad698cddc1eadb3d92a7515e95bb13f79420323d.1745083025.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/rsrc.c | 47 ++++++++++++++++++++++------------------------- 1 file changed, 22 insertions(+), 25 deletions(-) diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c index 6bf8dff4adf3..5d25f3391650 100644 --- a/io_uring/rsrc.c +++ b/io_uring/rsrc.c @@ -685,37 +685,34 @@ static bool io_coalesce_buffer(struct page ***pages, int *nr_pages, struct io_imu_folio_data *data) { struct page **page_array = *pages, **new_array = NULL; - int nr_pages_left = *nr_pages, i, j; - int nr_folios = data->nr_folios; + unsigned nr_pages_left = *nr_pages; + unsigned nr_folios = data->nr_folios; + unsigned i, j; /* Store head pages only*/ - new_array = kvmalloc_array(nr_folios, sizeof(struct page *), - GFP_KERNEL); + new_array = kvmalloc_array(nr_folios, sizeof(struct page *), GFP_KERNEL); if (!new_array) return false; - new_array[0] = compound_head(page_array[0]); - /* - * The pages are bound to the folio, it doesn't - * actually unpin them but drops all but one reference, - * which is usually put down by io_buffer_unmap(). - */ - if (data->nr_pages_head > 1) - unpin_user_folio(page_folio(new_array[0]), data->nr_pages_head - 1); - - j = data->nr_pages_head; - nr_pages_left -= data->nr_pages_head; - for (i = 1; i < nr_folios; i++) { - unsigned int nr_unpin; - - new_array[i] = page_array[j]; - nr_unpin = min_t(unsigned int, nr_pages_left - 1, - data->nr_pages_mid - 1); - if (nr_unpin) - unpin_user_folio(page_folio(new_array[i]), nr_unpin); - j += data->nr_pages_mid; - nr_pages_left -= data->nr_pages_mid; + for (i = 0, j = 0; i < nr_folios; i++) { + struct page *p = compound_head(page_array[j]); + struct folio *folio = page_folio(p); + unsigned int nr; + + WARN_ON_ONCE(i > 0 && p != page_array[j]); + + nr = i ? data->nr_pages_mid : data->nr_pages_head; + nr = min(nr, nr_pages_left); + /* Drop all but one ref, the entire folio will remain pinned. */ + if (nr > 1) + unpin_user_folio(folio, nr - 1); + j += nr; + nr_pages_left -= nr; + new_array[i] = p; } + + WARN_ON_ONCE(j != *nr_pages); + kvfree(page_array); *pages = new_array; *nr_pages = nr_folios; -- 2.51.0 From be6bad57b217491733754ae8113eec94a90a2769 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Sat, 19 Apr 2025 18:47:06 +0100 Subject: [PATCH 10/16] io_uring/rsrc: remove null check on import WARN_ON_ONCE() checking imu for NULL in io_import_fixed() is in the hot path and too protective, it's time to get rid of it. 
Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/3782c01e311dbd474bca45aefaf79a2f2822fafb.1745083025.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/rsrc.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c index 5d25f3391650..b4c5f3ee8855 100644 --- a/io_uring/rsrc.c +++ b/io_uring/rsrc.c @@ -1058,8 +1058,6 @@ static int io_import_fixed(int ddir, struct iov_iter *iter, size_t offset; int ret; - if (WARN_ON_ONCE(!imu)) - return -EFAULT; ret = validate_fixed_range(buf_addr, len, imu); if (unlikely(ret)) return ret; -- 2.51.0 From 37d26edd6bb4984849a71e21c6824c961fcd19db Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Sun, 20 Apr 2025 10:31:15 +0100 Subject: [PATCH 11/16] io_uring/zcrx: remove duplicated freelist init Several lines below we already initialise the freelist, don't do it twice. Reviewed-by: David Wei Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/71b07c4e00d1db65899d1ed8d603d961fe7d1c28.1745141261.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/zcrx.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/io_uring/zcrx.c b/io_uring/zcrx.c index fe86606b9f30..79f58c94494b 100644 --- a/io_uring/zcrx.c +++ b/io_uring/zcrx.c @@ -245,9 +245,6 @@ static int io_zcrx_create_area(struct io_zcrx_ifq *ifq, if (!area->freelist) goto err; - for (i = 0; i < nr_iovs; i++) - area->freelist[i] = i; - area->user_refs = kvmalloc_array(nr_iovs, sizeof(area->user_refs[0]), GFP_KERNEL | __GFP_ZERO); if (!area->user_refs) -- 2.51.0 From a79154ae5df9e21dbacb1eb77fad984fd4c45cca Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Sun, 20 Apr 2025 10:31:16 +0100 Subject: [PATCH 12/16] io_uring/zcrx: move io_zcrx_iov_page We'll need io_zcrx_iov_page at the top to keep offset calculations closer together, move it there. 
Reviewed-by: David Wei Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/575617033a8b84a5985c7eb760b7121efdbe7e56.1745141261.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/zcrx.c | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/io_uring/zcrx.c b/io_uring/zcrx.c index 79f58c94494b..ecfe080b24c5 100644 --- a/io_uring/zcrx.c +++ b/io_uring/zcrx.c @@ -31,6 +31,20 @@ static inline struct io_zcrx_ifq *io_pp_to_ifq(struct page_pool *pp) return pp->mp_priv; } +static inline struct io_zcrx_area *io_zcrx_iov_to_area(const struct net_iov *niov) +{ + struct net_iov_area *owner = net_iov_owner(niov); + + return container_of(owner, struct io_zcrx_area, nia); +} + +static inline struct page *io_zcrx_iov_page(const struct net_iov *niov) +{ + struct io_zcrx_area *area = io_zcrx_iov_to_area(niov); + + return area->pages[net_iov_idx(niov)]; +} + #define IO_DMA_ATTR (DMA_ATTR_SKIP_CPU_SYNC | DMA_ATTR_WEAK_ORDERING) static void __io_zcrx_unmap_area(struct io_zcrx_ifq *ifq, @@ -118,13 +132,6 @@ struct io_zcrx_args { static const struct memory_provider_ops io_uring_pp_zc_ops; -static inline struct io_zcrx_area *io_zcrx_iov_to_area(const struct net_iov *niov) -{ - struct net_iov_area *owner = net_iov_owner(niov); - - return container_of(owner, struct io_zcrx_area, nia); -} - static inline atomic_t *io_get_user_counter(struct net_iov *niov) { struct io_zcrx_area *area = io_zcrx_iov_to_area(niov); @@ -147,13 +154,6 @@ static void io_zcrx_get_niov_uref(struct net_iov *niov) atomic_inc(io_get_user_counter(niov)); } -static inline struct page *io_zcrx_iov_page(const struct net_iov *niov) -{ - struct io_zcrx_area *area = io_zcrx_iov_to_area(niov); - - return area->pages[net_iov_idx(niov)]; -} - static int io_allocate_rbuf_ring(struct io_zcrx_ifq *ifq, struct io_uring_zcrx_ifq_reg *reg, struct io_uring_region_desc *rd) -- 2.51.0 From 59bc1ab922bbb36558292c204e56ab951e833384 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Sun, 20 Apr 2025 10:31:17 +0100 Subject: [PATCH 13/16] io_uring/zcrx: remove sqe->file_index check sqe->file_index and sqe->zcrx_ifq_idx are aliased, so even though io_recvzc_prep() has all necessary handling and checking for ->zcrx_ifq_idx, it's already rejected a couple of lines above. It's not a real problem, however, as we can only have one ifq at the moment, which index is always zero, and it returns correct error codes to the user. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/b51acedd326d1a87bf53e33303e6fc87661a3e3f.1745141261.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/net.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/io_uring/net.c b/io_uring/net.c index 5f1a519d1fc6..782f8e76c5c7 100644 --- a/io_uring/net.c +++ b/io_uring/net.c @@ -1185,8 +1185,7 @@ int io_recvzc_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) struct io_recvzc *zc = io_kiocb_to_cmd(req, struct io_recvzc); unsigned ifq_idx; - if (unlikely(sqe->file_index || sqe->addr2 || sqe->addr || - sqe->addr3)) + if (unlikely(sqe->addr2 || sqe->addr || sqe->addr3)) return -EINVAL; ifq_idx = READ_ONCE(sqe->zcrx_ifq_idx); -- 2.51.0 From 77231d4e46555d30289b1909c2a2f26bcf00f08c Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Sun, 20 Apr 2025 10:31:18 +0100 Subject: [PATCH 14/16] io_uring/zcrx: let zcrx choose region for mmaping In preparation for adding multiple ifqs, add a helper returning a region for mmaping zcrx refill queue. 
For now it's trivial and returns the same ctx global ->zcrx_region. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/d9006e2ef8cd5e5b337c2ba2228ede8409eb60f2.1745141261.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/memmap.c | 11 +++++++---- io_uring/memmap.h | 2 ++ io_uring/zcrx.c | 10 ++++++++++ io_uring/zcrx.h | 7 +++++++ 4 files changed, 26 insertions(+), 4 deletions(-) diff --git a/io_uring/memmap.c b/io_uring/memmap.c index 76fcc79656b0..5cf3f23e751b 100644 --- a/io_uring/memmap.c +++ b/io_uring/memmap.c @@ -13,6 +13,7 @@ #include "memmap.h" #include "kbuf.h" #include "rsrc.h" +#include "zcrx.h" static void *io_mem_alloc_compound(struct page **pages, int nr_pages, size_t size, gfp_t gfp) @@ -258,7 +259,8 @@ static struct io_mapped_region *io_mmap_get_region(struct io_ring_ctx *ctx, loff_t pgoff) { loff_t offset = pgoff << PAGE_SHIFT; - unsigned int bgid; + unsigned int id; + switch (offset & IORING_OFF_MMAP_MASK) { case IORING_OFF_SQ_RING: @@ -267,12 +269,13 @@ static struct io_mapped_region *io_mmap_get_region(struct io_ring_ctx *ctx, case IORING_OFF_SQES: return &ctx->sq_region; case IORING_OFF_PBUF_RING: - bgid = (offset & ~IORING_OFF_MMAP_MASK) >> IORING_OFF_PBUF_SHIFT; - return io_pbuf_get_region(ctx, bgid); + id = (offset & ~IORING_OFF_MMAP_MASK) >> IORING_OFF_PBUF_SHIFT; + return io_pbuf_get_region(ctx, id); case IORING_MAP_OFF_PARAM_REGION: return &ctx->param_region; case IORING_MAP_OFF_ZCRX_REGION: - return &ctx->zcrx_region; + id = (offset & ~IORING_OFF_MMAP_MASK) >> IORING_OFF_ZCRX_SHIFT; + return io_zcrx_get_region(ctx, id); } return NULL; } diff --git a/io_uring/memmap.h b/io_uring/memmap.h index dad0aa5b1b45..24afb298e974 100644 --- a/io_uring/memmap.h +++ b/io_uring/memmap.h @@ -4,6 +4,8 @@ #define IORING_MAP_OFF_PARAM_REGION 0x20000000ULL #define IORING_MAP_OFF_ZCRX_REGION 0x30000000ULL +#define IORING_OFF_ZCRX_SHIFT 16 + struct page **io_pin_pages(unsigned long ubuf, unsigned long len, int *npages); #ifndef CONFIG_MMU diff --git a/io_uring/zcrx.c b/io_uring/zcrx.c index ecfe080b24c5..b7e65ae413b6 100644 --- a/io_uring/zcrx.c +++ b/io_uring/zcrx.c @@ -338,6 +338,16 @@ static void io_zcrx_ifq_free(struct io_zcrx_ifq *ifq) kfree(ifq); } +struct io_mapped_region *io_zcrx_get_region(struct io_ring_ctx *ctx, + unsigned int id) +{ + lockdep_assert_held(&ctx->mmap_lock); + + if (id != 0) + return NULL; + return &ctx->zcrx_region; +} + int io_register_zcrx_ifq(struct io_ring_ctx *ctx, struct io_uring_zcrx_ifq_reg __user *arg) { diff --git a/io_uring/zcrx.h b/io_uring/zcrx.h index f2bc811f022c..43134e5c9213 100644 --- a/io_uring/zcrx.h +++ b/io_uring/zcrx.h @@ -49,6 +49,8 @@ void io_shutdown_zcrx_ifqs(struct io_ring_ctx *ctx); int io_zcrx_recv(struct io_kiocb *req, struct io_zcrx_ifq *ifq, struct socket *sock, unsigned int flags, unsigned issue_flags, unsigned int *len); +struct io_mapped_region *io_zcrx_get_region(struct io_ring_ctx *ctx, + unsigned int id); #else static inline int io_register_zcrx_ifq(struct io_ring_ctx *ctx, struct io_uring_zcrx_ifq_reg __user *arg) @@ -67,6 +69,11 @@ static inline int io_zcrx_recv(struct io_kiocb *req, struct io_zcrx_ifq *ifq, { return -EOPNOTSUPP; } +static inline struct io_mapped_region *io_zcrx_get_region(struct io_ring_ctx *ctx, + unsigned int id) +{ + return NULL; +} #endif int io_recvzc(struct io_kiocb *req, unsigned int issue_flags); -- 2.51.0 From 632b3186726984319e2337987de86a442407f30e Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Sun, 20 Apr 2025 10:31:19 +0100 Subject: [PATCH 15/16] 
io_uring/zcrx: move zcrx region to struct io_zcrx_ifq Refill queue region is a part of zcrx and should stay in struct io_zcrx_ifq. We can't have multiple queues without it, so move it there. As a result there is no context global zcrx region anymore, and the region is looked up together with its ifq. To protect a concurrent mmap from seeing an inconsistent region we were protecting changes to ->zcrx_region with mmap_lock, but now it protect the publishing of the ifq. Reviewed-by: David Wei Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/24f1a728fc03d0166f16d099575457e10d9d90f2.1745141261.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- include/linux/io_uring_types.h | 2 -- io_uring/zcrx.c | 20 ++++++++++++-------- io_uring/zcrx.h | 1 + 3 files changed, 13 insertions(+), 10 deletions(-) diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h index 3b467879bca8..06d722289fc5 100644 --- a/include/linux/io_uring_types.h +++ b/include/linux/io_uring_types.h @@ -448,8 +448,6 @@ struct io_ring_ctx { struct io_mapped_region ring_region; /* used for optimised request parameter and wait argument passing */ struct io_mapped_region param_region; - /* just one zcrx per ring for now, will move to io_zcrx_ifq eventually */ - struct io_mapped_region zcrx_region; }; /* diff --git a/io_uring/zcrx.c b/io_uring/zcrx.c index b7e65ae413b6..033284b695c7 100644 --- a/io_uring/zcrx.c +++ b/io_uring/zcrx.c @@ -167,12 +167,11 @@ static int io_allocate_rbuf_ring(struct io_zcrx_ifq *ifq, if (size > rd->size) return -EINVAL; - ret = io_create_region_mmap_safe(ifq->ctx, &ifq->ctx->zcrx_region, rd, - IORING_MAP_OFF_ZCRX_REGION); + ret = io_create_region(ifq->ctx, &ifq->region, rd, IORING_MAP_OFF_ZCRX_REGION); if (ret < 0) return ret; - ptr = io_region_get_ptr(&ifq->ctx->zcrx_region); + ptr = io_region_get_ptr(&ifq->region); ifq->rq_ring = (struct io_uring *)ptr; ifq->rqes = (struct io_uring_zcrx_rqe *)(ptr + off); return 0; @@ -180,7 +179,10 @@ static int io_allocate_rbuf_ring(struct io_zcrx_ifq *ifq, static void io_free_rbuf_ring(struct io_zcrx_ifq *ifq) { - io_free_region(ifq->ctx, &ifq->ctx->zcrx_region); + if (WARN_ON_ONCE(ifq->ctx->ifq)) + return; + + io_free_region(ifq->ctx, &ifq->region); ifq->rq_ring = NULL; ifq->rqes = NULL; } @@ -343,9 +345,9 @@ struct io_mapped_region *io_zcrx_get_region(struct io_ring_ctx *ctx, { lockdep_assert_held(&ctx->mmap_lock); - if (id != 0) + if (id != 0 || !ctx->ifq) return NULL; - return &ctx->zcrx_region; + return &ctx->ifq->region; } int io_register_zcrx_ifq(struct io_ring_ctx *ctx, @@ -433,7 +435,8 @@ int io_register_zcrx_ifq(struct io_ring_ctx *ctx, ret = -EFAULT; goto err; } - ctx->ifq = ifq; + scoped_guard(mutex, &ctx->mmap_lock) + ctx->ifq = ifq; return 0; err: io_zcrx_ifq_free(ifq); @@ -449,7 +452,8 @@ void io_unregister_zcrx_ifqs(struct io_ring_ctx *ctx) if (!ifq) return; - ctx->ifq = NULL; + scoped_guard(mutex, &ctx->mmap_lock) + ctx->ifq = NULL; io_zcrx_ifq_free(ifq); } diff --git a/io_uring/zcrx.h b/io_uring/zcrx.h index 43134e5c9213..e3c7c4e647f1 100644 --- a/io_uring/zcrx.h +++ b/io_uring/zcrx.h @@ -39,6 +39,7 @@ struct io_zcrx_ifq { netdevice_tracker netdev_tracker; spinlock_t lock; struct mutex dma_lock; + struct io_mapped_region region; }; #if defined(CONFIG_IO_URING_ZCRX) -- 2.51.0 From 76f1cc98b23cefd1f0ae90c51f1fb837e5f46528 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Sun, 20 Apr 2025 10:31:20 +0100 Subject: [PATCH 16/16] io_uring/zcrx: add support for multiple ifqs Allow the user to register multiple ifqs / zcrx 
contexts. With that we can use multiple interfaces / interface queues in a single io_uring instance. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/668b03bee03b5216564482edcfefbc2ee337dd30.1745141261.git.asml.silence@gmail.com [axboe: fold in fix] Signed-off-by: Jens Axboe --- include/linux/io_uring_types.h | 5 +-- io_uring/io_uring.c | 3 +- io_uring/net.c | 5 +-- io_uring/zcrx.c | 73 +++++++++++++++++++++++----------- 4 files changed, 56 insertions(+), 30 deletions(-) diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h index 06d722289fc5..7e23e993280e 100644 --- a/include/linux/io_uring_types.h +++ b/include/linux/io_uring_types.h @@ -40,8 +40,6 @@ enum io_uring_cmd_flags { IO_URING_F_TASK_DEAD = (1 << 13), }; -struct io_zcrx_ifq; - struct io_wq_work_node { struct io_wq_work_node *next; }; @@ -394,7 +392,8 @@ struct io_ring_ctx { struct wait_queue_head poll_wq; struct io_restriction restrictions; - struct io_zcrx_ifq *ifq; + /* Stores zcrx object pointers of type struct io_zcrx_ifq */ + struct xarray zcrx_ctxs; u32 pers_next; struct xarray personalities; diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 75c022526548..0dc6c2f1295e 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -359,6 +359,7 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p) INIT_LIST_HEAD(&ctx->tctx_list); ctx->submit_state.free_list.next = NULL; INIT_HLIST_HEAD(&ctx->waitid_list); + xa_init_flags(&ctx->zcrx_ctxs, XA_FLAGS_ALLOC); #ifdef CONFIG_FUTEX INIT_HLIST_HEAD(&ctx->futex_list); #endif @@ -2889,7 +2890,7 @@ static __cold void io_ring_exit_work(struct work_struct *work) io_cqring_overflow_kill(ctx); mutex_unlock(&ctx->uring_lock); } - if (ctx->ifq) { + if (!xa_empty(&ctx->zcrx_ctxs)) { mutex_lock(&ctx->uring_lock); io_shutdown_zcrx_ifqs(ctx); mutex_unlock(&ctx->uring_lock); diff --git a/io_uring/net.c b/io_uring/net.c index 782f8e76c5c7..b3a643675ce8 100644 --- a/io_uring/net.c +++ b/io_uring/net.c @@ -1189,11 +1189,10 @@ int io_recvzc_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) return -EINVAL; ifq_idx = READ_ONCE(sqe->zcrx_ifq_idx); - if (ifq_idx != 0) - return -EINVAL; - zc->ifq = req->ctx->ifq; + zc->ifq = xa_load(&req->ctx->zcrx_ctxs, ifq_idx); if (!zc->ifq) return -EINVAL; + zc->len = READ_ONCE(sqe->len); zc->flags = READ_ONCE(sqe->ioprio); zc->msg_flags = READ_ONCE(sqe->msg_flags); diff --git a/io_uring/zcrx.c b/io_uring/zcrx.c index 033284b695c7..22f420d6fbb9 100644 --- a/io_uring/zcrx.c +++ b/io_uring/zcrx.c @@ -156,8 +156,10 @@ static void io_zcrx_get_niov_uref(struct net_iov *niov) static int io_allocate_rbuf_ring(struct io_zcrx_ifq *ifq, struct io_uring_zcrx_ifq_reg *reg, - struct io_uring_region_desc *rd) + struct io_uring_region_desc *rd, + u32 id) { + u64 mmap_offset; size_t off, size; void *ptr; int ret; @@ -167,7 +169,10 @@ static int io_allocate_rbuf_ring(struct io_zcrx_ifq *ifq, if (size > rd->size) return -EINVAL; - ret = io_create_region(ifq->ctx, &ifq->region, rd, IORING_MAP_OFF_ZCRX_REGION); + mmap_offset = IORING_MAP_OFF_ZCRX_REGION; + mmap_offset += id << IORING_OFF_PBUF_SHIFT; + + ret = io_create_region(ifq->ctx, &ifq->region, rd, mmap_offset); if (ret < 0) return ret; @@ -179,9 +184,6 @@ static int io_allocate_rbuf_ring(struct io_zcrx_ifq *ifq, static void io_free_rbuf_ring(struct io_zcrx_ifq *ifq) { - if (WARN_ON_ONCE(ifq->ctx->ifq)) - return; - io_free_region(ifq->ctx, &ifq->region); ifq->rq_ring = NULL; ifq->rqes = NULL; @@ -343,11 +345,11 @@ static void io_zcrx_ifq_free(struct 
io_zcrx_ifq *ifq) struct io_mapped_region *io_zcrx_get_region(struct io_ring_ctx *ctx, unsigned int id) { + struct io_zcrx_ifq *ifq = xa_load(&ctx->zcrx_ctxs, id); + lockdep_assert_held(&ctx->mmap_lock); - if (id != 0 || !ctx->ifq) - return NULL; - return &ctx->ifq->region; + return ifq ? &ifq->region : NULL; } int io_register_zcrx_ifq(struct io_ring_ctx *ctx, @@ -359,6 +361,7 @@ int io_register_zcrx_ifq(struct io_ring_ctx *ctx, struct io_uring_region_desc rd; struct io_zcrx_ifq *ifq; int ret; + u32 id; /* * 1. Interface queue allocation. @@ -371,8 +374,6 @@ int io_register_zcrx_ifq(struct io_ring_ctx *ctx, if (!(ctx->flags & IORING_SETUP_DEFER_TASKRUN && ctx->flags & IORING_SETUP_CQE32)) return -EINVAL; - if (ctx->ifq) - return -EBUSY; if (copy_from_user(®, arg, sizeof(reg))) return -EFAULT; if (copy_from_user(&rd, u64_to_user_ptr(reg.region_ptr), sizeof(rd))) @@ -396,7 +397,14 @@ int io_register_zcrx_ifq(struct io_ring_ctx *ctx, if (!ifq) return -ENOMEM; - ret = io_allocate_rbuf_ring(ifq, ®, &rd); + scoped_guard(mutex, &ctx->mmap_lock) { + /* preallocate id */ + ret = xa_alloc(&ctx->zcrx_ctxs, &id, NULL, xa_limit_31b, GFP_KERNEL); + if (ret) + goto ifq_free; + } + + ret = io_allocate_rbuf_ring(ifq, ®, &rd, id); if (ret) goto err; @@ -428,6 +436,14 @@ int io_register_zcrx_ifq(struct io_ring_ctx *ctx, reg.offsets.rqes = sizeof(struct io_uring); reg.offsets.head = offsetof(struct io_uring, head); reg.offsets.tail = offsetof(struct io_uring, tail); + reg.zcrx_id = id; + + scoped_guard(mutex, &ctx->mmap_lock) { + /* publish ifq */ + ret = -ENOMEM; + if (xa_store(&ctx->zcrx_ctxs, id, ifq, GFP_KERNEL)) + goto err; + } if (copy_to_user(arg, ®, sizeof(reg)) || copy_to_user(u64_to_user_ptr(reg.region_ptr), &rd, sizeof(rd)) || @@ -435,26 +451,34 @@ int io_register_zcrx_ifq(struct io_ring_ctx *ctx, ret = -EFAULT; goto err; } - scoped_guard(mutex, &ctx->mmap_lock) - ctx->ifq = ifq; return 0; err: + scoped_guard(mutex, &ctx->mmap_lock) + xa_erase(&ctx->zcrx_ctxs, id); +ifq_free: io_zcrx_ifq_free(ifq); return ret; } void io_unregister_zcrx_ifqs(struct io_ring_ctx *ctx) { - struct io_zcrx_ifq *ifq = ctx->ifq; + struct io_zcrx_ifq *ifq; + unsigned long id; lockdep_assert_held(&ctx->uring_lock); - if (!ifq) - return; + while (1) { + scoped_guard(mutex, &ctx->mmap_lock) { + ifq = xa_find(&ctx->zcrx_ctxs, &id, ULONG_MAX, XA_PRESENT); + if (ifq) + xa_erase(&ctx->zcrx_ctxs, id); + } + if (!ifq) + break; + io_zcrx_ifq_free(ifq); + } - scoped_guard(mutex, &ctx->mmap_lock) - ctx->ifq = NULL; - io_zcrx_ifq_free(ifq); + xa_destroy(&ctx->zcrx_ctxs); } static struct net_iov *__io_zcrx_get_free_niov(struct io_zcrx_area *area) @@ -511,12 +535,15 @@ static void io_zcrx_scrub(struct io_zcrx_ifq *ifq) void io_shutdown_zcrx_ifqs(struct io_ring_ctx *ctx) { + struct io_zcrx_ifq *ifq; + unsigned long index; + lockdep_assert_held(&ctx->uring_lock); - if (!ctx->ifq) - return; - io_zcrx_scrub(ctx->ifq); - io_close_queue(ctx->ifq); + xa_for_each(&ctx->zcrx_ctxs, index, ifq) { + io_zcrx_scrub(ifq); + io_close_queue(ifq); + } } static inline u32 io_zcrx_rqring_entries(struct io_zcrx_ifq *ifq) -- 2.51.0
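To illustrate what multiple ifqs enable from userspace, here is a minimal sketch (an assumption-laden example, not part of the series): the ring is set up with IORING_SETUP_DEFER_TASKRUN and IORING_SETUP_CQE32 as the registration path requires, one or more ifqs have been registered (registration and refill-ring handling are omitted), zcrx_id is the id the kernel wrote back as reg.zcrx_id, and the IORING_RECV_MULTISHOT flag reflects how existing recvzc prep is understood to behave.

#include <string.h>
#include <liburing.h>

/* queue a zero-copy receive against one specific registered ifq */
static int queue_recvzc(struct io_uring *ring, int sockfd, __u32 zcrx_id)
{
	struct io_uring_sqe *sqe = io_uring_get_sqe(ring);

	memset(sqe, 0, sizeof(*sqe));
	sqe->opcode = IORING_OP_RECV_ZC;
	sqe->fd = sockfd;			/* connected socket */
	sqe->zcrx_ifq_idx = zcrx_id;		/* pick one of the registered ifqs */
	sqe->ioprio = IORING_RECV_MULTISHOT;	/* recvzc requests are multishot */
	/* sqe->len left at 0: no read limit (assumption) */

	return io_uring_submit(ring);
}

Each such request is then serviced from the refill ring and area belonging to that specific ifq, which is what lets a single ring drive several interfaces or interface queues.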