From 76f1cc98b23cefd1f0ae90c51f1fb837e5f46528 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Sun, 20 Apr 2025 10:31:20 +0100 Subject: [PATCH 01/16] io_uring/zcrx: add support for multiple ifqs Allow the user to register multiple ifqs / zcrx contexts. With that we can use multiple interfaces / interface queues in a single io_uring instance. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/668b03bee03b5216564482edcfefbc2ee337dd30.1745141261.git.asml.silence@gmail.com [axboe: fold in fix] Signed-off-by: Jens Axboe --- include/linux/io_uring_types.h | 5 +-- io_uring/io_uring.c | 3 +- io_uring/net.c | 5 +-- io_uring/zcrx.c | 73 +++++++++++++++++++++++----------- 4 files changed, 56 insertions(+), 30 deletions(-) diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h index 06d722289fc5..7e23e993280e 100644 --- a/include/linux/io_uring_types.h +++ b/include/linux/io_uring_types.h @@ -40,8 +40,6 @@ enum io_uring_cmd_flags { IO_URING_F_TASK_DEAD = (1 << 13), }; -struct io_zcrx_ifq; - struct io_wq_work_node { struct io_wq_work_node *next; }; @@ -394,7 +392,8 @@ struct io_ring_ctx { struct wait_queue_head poll_wq; struct io_restriction restrictions; - struct io_zcrx_ifq *ifq; + /* Stores zcrx object pointers of type struct io_zcrx_ifq */ + struct xarray zcrx_ctxs; u32 pers_next; struct xarray personalities; diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 75c022526548..0dc6c2f1295e 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -359,6 +359,7 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p) INIT_LIST_HEAD(&ctx->tctx_list); ctx->submit_state.free_list.next = NULL; INIT_HLIST_HEAD(&ctx->waitid_list); + xa_init_flags(&ctx->zcrx_ctxs, XA_FLAGS_ALLOC); #ifdef CONFIG_FUTEX INIT_HLIST_HEAD(&ctx->futex_list); #endif @@ -2889,7 +2890,7 @@ static __cold void io_ring_exit_work(struct work_struct *work) io_cqring_overflow_kill(ctx); mutex_unlock(&ctx->uring_lock); } - if (ctx->ifq) { + if (!xa_empty(&ctx->zcrx_ctxs)) { mutex_lock(&ctx->uring_lock); io_shutdown_zcrx_ifqs(ctx); mutex_unlock(&ctx->uring_lock); diff --git a/io_uring/net.c b/io_uring/net.c index 782f8e76c5c7..b3a643675ce8 100644 --- a/io_uring/net.c +++ b/io_uring/net.c @@ -1189,11 +1189,10 @@ int io_recvzc_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) return -EINVAL; ifq_idx = READ_ONCE(sqe->zcrx_ifq_idx); - if (ifq_idx != 0) - return -EINVAL; - zc->ifq = req->ctx->ifq; + zc->ifq = xa_load(&req->ctx->zcrx_ctxs, ifq_idx); if (!zc->ifq) return -EINVAL; + zc->len = READ_ONCE(sqe->len); zc->flags = READ_ONCE(sqe->ioprio); zc->msg_flags = READ_ONCE(sqe->msg_flags); diff --git a/io_uring/zcrx.c b/io_uring/zcrx.c index 033284b695c7..22f420d6fbb9 100644 --- a/io_uring/zcrx.c +++ b/io_uring/zcrx.c @@ -156,8 +156,10 @@ static void io_zcrx_get_niov_uref(struct net_iov *niov) static int io_allocate_rbuf_ring(struct io_zcrx_ifq *ifq, struct io_uring_zcrx_ifq_reg *reg, - struct io_uring_region_desc *rd) + struct io_uring_region_desc *rd, + u32 id) { + u64 mmap_offset; size_t off, size; void *ptr; int ret; @@ -167,7 +169,10 @@ static int io_allocate_rbuf_ring(struct io_zcrx_ifq *ifq, if (size > rd->size) return -EINVAL; - ret = io_create_region(ifq->ctx, &ifq->region, rd, IORING_MAP_OFF_ZCRX_REGION); + mmap_offset = IORING_MAP_OFF_ZCRX_REGION; + mmap_offset += id << IORING_OFF_PBUF_SHIFT; + + ret = io_create_region(ifq->ctx, &ifq->region, rd, mmap_offset); if (ret < 0) return ret; @@ -179,9 +184,6 @@ static int io_allocate_rbuf_ring(struct io_zcrx_ifq 
*ifq, static void io_free_rbuf_ring(struct io_zcrx_ifq *ifq) { - if (WARN_ON_ONCE(ifq->ctx->ifq)) - return; - io_free_region(ifq->ctx, &ifq->region); ifq->rq_ring = NULL; ifq->rqes = NULL; @@ -343,11 +345,11 @@ static void io_zcrx_ifq_free(struct io_zcrx_ifq *ifq) struct io_mapped_region *io_zcrx_get_region(struct io_ring_ctx *ctx, unsigned int id) { + struct io_zcrx_ifq *ifq = xa_load(&ctx->zcrx_ctxs, id); + lockdep_assert_held(&ctx->mmap_lock); - if (id != 0 || !ctx->ifq) - return NULL; - return &ctx->ifq->region; + return ifq ? &ifq->region : NULL; } int io_register_zcrx_ifq(struct io_ring_ctx *ctx, @@ -359,6 +361,7 @@ int io_register_zcrx_ifq(struct io_ring_ctx *ctx, struct io_uring_region_desc rd; struct io_zcrx_ifq *ifq; int ret; + u32 id; /* * 1. Interface queue allocation. @@ -371,8 +374,6 @@ int io_register_zcrx_ifq(struct io_ring_ctx *ctx, if (!(ctx->flags & IORING_SETUP_DEFER_TASKRUN && ctx->flags & IORING_SETUP_CQE32)) return -EINVAL; - if (ctx->ifq) - return -EBUSY; if (copy_from_user(&reg, arg, sizeof(reg))) return -EFAULT; if (copy_from_user(&rd, u64_to_user_ptr(reg.region_ptr), sizeof(rd))) @@ -396,7 +397,14 @@ int io_register_zcrx_ifq(struct io_ring_ctx *ctx, if (!ifq) return -ENOMEM; - ret = io_allocate_rbuf_ring(ifq, &reg, &rd); + scoped_guard(mutex, &ctx->mmap_lock) { + /* preallocate id */ + ret = xa_alloc(&ctx->zcrx_ctxs, &id, NULL, xa_limit_31b, GFP_KERNEL); + if (ret) + goto ifq_free; + } + + ret = io_allocate_rbuf_ring(ifq, &reg, &rd, id); if (ret) goto err; @@ -428,6 +436,14 @@ int io_register_zcrx_ifq(struct io_ring_ctx *ctx, reg.offsets.rqes = sizeof(struct io_uring); reg.offsets.head = offsetof(struct io_uring, head); reg.offsets.tail = offsetof(struct io_uring, tail); + reg.zcrx_id = id; + + scoped_guard(mutex, &ctx->mmap_lock) { + /* publish ifq */ + ret = -ENOMEM; + if (xa_store(&ctx->zcrx_ctxs, id, ifq, GFP_KERNEL)) + goto err; + } if (copy_to_user(arg, &reg, sizeof(reg)) || copy_to_user(u64_to_user_ptr(reg.region_ptr), &rd, sizeof(rd)) || @@ -435,26 +451,34 @@ int io_register_zcrx_ifq(struct io_ring_ctx *ctx, ret = -EFAULT; goto err; } - scoped_guard(mutex, &ctx->mmap_lock) - ctx->ifq = ifq; return 0; err: + scoped_guard(mutex, &ctx->mmap_lock) + xa_erase(&ctx->zcrx_ctxs, id); +ifq_free: io_zcrx_ifq_free(ifq); return ret; } void io_unregister_zcrx_ifqs(struct io_ring_ctx *ctx) { - struct io_zcrx_ifq *ifq = ctx->ifq; + struct io_zcrx_ifq *ifq; + unsigned long id; lockdep_assert_held(&ctx->uring_lock); - if (!ifq) - return; + while (1) { + scoped_guard(mutex, &ctx->mmap_lock) { + ifq = xa_find(&ctx->zcrx_ctxs, &id, ULONG_MAX, XA_PRESENT); + if (ifq) + xa_erase(&ctx->zcrx_ctxs, id); + } + if (!ifq) + break; + io_zcrx_ifq_free(ifq); + } - scoped_guard(mutex, &ctx->mmap_lock) - ctx->ifq = NULL; - io_zcrx_ifq_free(ifq); + xa_destroy(&ctx->zcrx_ctxs); } static struct net_iov *__io_zcrx_get_free_niov(struct io_zcrx_area *area) @@ -511,12 +535,15 @@ static void io_zcrx_scrub(struct io_zcrx_ifq *ifq) void io_shutdown_zcrx_ifqs(struct io_ring_ctx *ctx) { + struct io_zcrx_ifq *ifq; + unsigned long index; + lockdep_assert_held(&ctx->uring_lock); - if (!ctx->ifq) - return; - io_zcrx_scrub(ctx->ifq); - io_close_queue(ctx->ifq); + xa_for_each(&ctx->zcrx_ctxs, index, ifq) { + io_zcrx_scrub(ifq); + io_close_queue(ifq); + } } static inline u32 io_zcrx_rqring_entries(struct io_zcrx_ifq *ifq) -- 2.51.0 From 62f666df765ecaf9cc1892ca056d5c071a335d85 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 24 Apr 2025 12:31:16 +0100 Subject: [PATCH 02/16] io_uring/eventfd: dedup signalling
helpers Consolidate io_eventfd_flush_signal() and io_eventfd_signal(). Not much of a difference for now, but it prepares it for following changes. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/5beecd4da65d8d2d83df499196f84b329387f6a2.1745493845.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/eventfd.c | 26 +++++++++----------------- io_uring/eventfd.h | 3 +-- io_uring/io_uring.c | 4 ++-- 3 files changed, 12 insertions(+), 21 deletions(-) diff --git a/io_uring/eventfd.c b/io_uring/eventfd.c index 100d5da94cb9..a9da2d0d7510 100644 --- a/io_uring/eventfd.c +++ b/io_uring/eventfd.c @@ -112,23 +112,16 @@ static struct io_ev_fd *io_eventfd_grab(struct io_ring_ctx *ctx) return NULL; } -void io_eventfd_signal(struct io_ring_ctx *ctx) +void io_eventfd_signal(struct io_ring_ctx *ctx, bool cqe_event) { + bool skip = false, put_ref = true; struct io_ev_fd *ev_fd; ev_fd = io_eventfd_grab(ctx); - if (ev_fd) - io_eventfd_release(ev_fd, __io_eventfd_signal(ev_fd)); -} - -void io_eventfd_flush_signal(struct io_ring_ctx *ctx) -{ - struct io_ev_fd *ev_fd; - - ev_fd = io_eventfd_grab(ctx); - if (ev_fd) { - bool skip, put_ref = true; + if (!ev_fd) + return; + if (cqe_event) { /* * Eventfd should only get triggered when at least one event * has been posted. Some applications rely on the eventfd @@ -142,12 +135,11 @@ void io_eventfd_flush_signal(struct io_ring_ctx *ctx) skip = ctx->cached_cq_tail == ev_fd->last_cq_tail; ev_fd->last_cq_tail = ctx->cached_cq_tail; spin_unlock(&ctx->completion_lock); - - if (!skip) - put_ref = __io_eventfd_signal(ev_fd); - - io_eventfd_release(ev_fd, put_ref); } + + if (!skip) + put_ref = __io_eventfd_signal(ev_fd); + io_eventfd_release(ev_fd, put_ref); } int io_eventfd_register(struct io_ring_ctx *ctx, void __user *arg, diff --git a/io_uring/eventfd.h b/io_uring/eventfd.h index d394f49c6321..e2f1985c2cf9 100644 --- a/io_uring/eventfd.h +++ b/io_uring/eventfd.h @@ -4,5 +4,4 @@ int io_eventfd_register(struct io_ring_ctx *ctx, void __user *arg, unsigned int eventfd_async); int io_eventfd_unregister(struct io_ring_ctx *ctx); -void io_eventfd_flush_signal(struct io_ring_ctx *ctx); -void io_eventfd_signal(struct io_ring_ctx *ctx); +void io_eventfd_signal(struct io_ring_ctx *ctx, bool cqe_event); diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 0dc6c2f1295e..2e4d8e76316a 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -584,7 +584,7 @@ void __io_commit_cqring_flush(struct io_ring_ctx *ctx) if (ctx->drain_active) io_queue_deferred(ctx); if (ctx->has_evfd) - io_eventfd_flush_signal(ctx); + io_eventfd_signal(ctx, true); } static inline void __io_cq_lock(struct io_ring_ctx *ctx) @@ -1199,7 +1199,7 @@ static void io_req_local_work_add(struct io_kiocb *req, unsigned flags) if (ctx->flags & IORING_SETUP_TASKRUN_FLAG) atomic_or(IORING_SQ_TASKRUN, &ctx->rings->sq_flags); if (ctx->has_evfd) - io_eventfd_signal(ctx); + io_eventfd_signal(ctx, false); } nr_wait = atomic_read(&ctx->cq_wait_nr); -- 2.51.0 From da01f60f8ad144a8a0844833a8d0f0005b0a7c51 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 24 Apr 2025 12:31:17 +0100 Subject: [PATCH 03/16] io_uring/eventfd: clean up rcu locking Conditional locking is never welcome if there are better options. Move rcu locking into io_eventfd_signal(), make it unconditional and use guards. It also helps with sparse warnings. 
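For reference, a minimal sketch of the scope-based pattern this converts to. guard(rcu)() is the kernel's cleanup.h-based RCU guard used in the hunk below; the reader function and consume() here are made up for illustration:

	static void example_reader(struct io_ring_ctx *ctx)
	{
		guard(rcu)();	/* takes rcu_read_lock(), drops it on every scope exit */

		struct io_ev_fd *ev_fd = rcu_dereference(ctx->io_ev_fd);

		if (!ev_fd)
			return;	/* early returns need no explicit rcu_read_unlock() */
		consume(ev_fd);	/* hypothetical consumer */
	}

Because the unlock is implicit, no conditionally-locked path remains for sparse to warn about.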
Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/91a925e708ca8a5aa7fee61f96d29b24ea9adeaf.1745493845.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/eventfd.c | 24 +++++++----------------- 1 file changed, 7 insertions(+), 17 deletions(-) diff --git a/io_uring/eventfd.c b/io_uring/eventfd.c index a9da2d0d7510..8c2835ac17a0 100644 --- a/io_uring/eventfd.c +++ b/io_uring/eventfd.c @@ -47,13 +47,6 @@ static void io_eventfd_do_signal(struct rcu_head *rcu) io_eventfd_put(ev_fd); } -static void io_eventfd_release(struct io_ev_fd *ev_fd, bool put_ref) -{ - if (put_ref) - io_eventfd_put(ev_fd); - rcu_read_unlock(); -} - /* * Returns true if the caller should put the ev_fd reference, false if not. */ @@ -89,11 +82,6 @@ static struct io_ev_fd *io_eventfd_grab(struct io_ring_ctx *ctx) { struct io_ev_fd *ev_fd; - if (READ_ONCE(ctx->rings->cq_flags) & IORING_CQ_EVENTFD_DISABLED) - return NULL; - - rcu_read_lock(); - /* * rcu_dereference ctx->io_ev_fd once and use it for both for checking * and eventfd_signal */ @@ -108,15 +96,18 @@ static struct io_ev_fd *io_eventfd_grab(struct io_ring_ctx *ctx) if (io_eventfd_trigger(ev_fd) && refcount_inc_not_zero(&ev_fd->refs)) return ev_fd; - rcu_read_unlock(); return NULL; } void io_eventfd_signal(struct io_ring_ctx *ctx, bool cqe_event) { - bool skip = false, put_ref = true; + bool skip = false; struct io_ev_fd *ev_fd; + if (READ_ONCE(ctx->rings->cq_flags) & IORING_CQ_EVENTFD_DISABLED) + return; + + guard(rcu)(); ev_fd = io_eventfd_grab(ctx); if (!ev_fd) return; @@ -137,9 +128,8 @@ void io_eventfd_signal(struct io_ring_ctx *ctx, bool cqe_event) spin_unlock(&ctx->completion_lock); } - if (!skip) - put_ref = __io_eventfd_signal(ev_fd); - io_eventfd_release(ev_fd, put_ref); + if (skip || __io_eventfd_signal(ev_fd)) + io_eventfd_put(ev_fd); } int io_eventfd_register(struct io_ring_ctx *ctx, void __user *arg, -- 2.51.0 From f6da4fee69860d4a02d14b016021ffb516a25f38 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 24 Apr 2025 12:31:18 +0100 Subject: [PATCH 04/16] io_uring/eventfd: open code io_eventfd_grab() io_eventfd_grab() doesn't help with understanding the path; it'll be simpler to keep the helper open-coded. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/5cb53ce3876c2819db9e8055cf41dca4398521db.1745493845.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/eventfd.c | 40 ++++++++++------------------------------ 1 file changed, 10 insertions(+), 30 deletions(-) diff --git a/io_uring/eventfd.c b/io_uring/eventfd.c index 8c2835ac17a0..78f8ab7db104 100644 --- a/io_uring/eventfd.c +++ b/io_uring/eventfd.c @@ -65,38 +65,11 @@ static bool __io_eventfd_signal(struct io_ev_fd *ev_fd) /* * Trigger if eventfd_async isn't set, or if it's set and the caller is - * an async worker. If ev_fd isn't valid, obviously return false. + * an async worker. */ static bool io_eventfd_trigger(struct io_ev_fd *ev_fd) { - if (ev_fd) - return !ev_fd->eventfd_async || io_wq_current_is_worker(); - return false; -} - -/* - * On success, returns with an ev_fd reference grabbed and the RCU read - * lock held. - */ -static struct io_ev_fd *io_eventfd_grab(struct io_ring_ctx *ctx) -{ - struct io_ev_fd *ev_fd; - - /* - * rcu_dereference ctx->io_ev_fd once and use it for both for checking - * and eventfd_signal - */ - ev_fd = rcu_dereference(ctx->io_ev_fd); - - /* - * Check again if ev_fd exists in case an io_eventfd_unregister call - * completed between the NULL check of ctx->io_ev_fd at the start of - * the function and rcu_read_lock.
- */ - if (io_eventfd_trigger(ev_fd) && refcount_inc_not_zero(&ev_fd->refs)) - return ev_fd; - - return NULL; + return !ev_fd->eventfd_async || io_wq_current_is_worker(); } void io_eventfd_signal(struct io_ring_ctx *ctx, bool cqe_event) @@ -108,9 +81,16 @@ void io_eventfd_signal(struct io_ring_ctx *ctx, bool cqe_event) return; guard(rcu)(); - ev_fd = io_eventfd_grab(ctx); + ev_fd = rcu_dereference(ctx->io_ev_fd); + /* + * Check again if ev_fd exists in case an io_eventfd_unregister call + * completed between the NULL check of ctx->io_ev_fd at the start of + * the function and rcu_read_lock. + */ if (!ev_fd) return; + if (!io_eventfd_trigger(ev_fd) || !refcount_inc_not_zero(&ev_fd->refs)) + return; if (cqe_event) { /* -- 2.51.0 From 27d2fed790ce6407e321e89aac3c8c0e28986fff Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 28 Apr 2025 13:52:32 +0100 Subject: [PATCH 05/16] io_uring: delete misleading comment in io_fill_cqe_aux() io_fill_cqe_aux() doesn't overflow completions, however it might fail them and lets the caller handle it. Remove the comment, which doesn't make any sense. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/021aa8c1d8f20ef2b66da6aeabb6b511938fd2c5.1745843119.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 5 ----- 1 file changed, 5 deletions(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 2e4d8e76316a..703251f6f4d8 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -814,11 +814,6 @@ static bool io_fill_cqe_aux(struct io_ring_ctx *ctx, u64 user_data, s32 res, ctx->cq_extra++; - /* - * If we can't get a cq entry, userspace overflowed the - * submission (by quite a lot). Increment the overflow count in - * the ring. - */ if (likely(io_get_cqe(ctx, &cqe))) { WRITE_ONCE(cqe->user_data, user_data); WRITE_ONCE(cqe->res, res); -- 2.51.0 From 91db6edc573bf238c277602b2ea4b4f4688fdedc Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 28 Apr 2025 13:52:33 +0100 Subject: [PATCH 06/16] io_uring/cmd: move net cmd into a separate file We keep socket io_uring command implementation in io_uring/uring_cmd.c. Separate it from generic command code into a separate file. 
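As a usage note (not part of the patch): these socket commands are driven from userspace through IORING_OP_URING_CMD. A rough sketch, assuming liburing's io_uring_prep_cmd_sock() helper and an already-connected sockfd:

	struct io_uring ring;
	struct io_uring_sqe *sqe;
	struct io_uring_cqe *cqe;

	io_uring_queue_init(8, &ring, 0);
	sqe = io_uring_get_sqe(&ring);
	/* level/optname/optval/optlen are unused by SIOCINQ, pass zeros */
	io_uring_prep_cmd_sock(sqe, SOCKET_URING_OP_SIOCINQ, sockfd, 0, 0, NULL, 0);
	io_uring_submit(&ring);
	io_uring_wait_cqe(&ring, &cqe);
	/* cqe->res: negative -errno, otherwise the number of unread bytes */
	io_uring_cqe_seen(&ring, cqe);

The kernel-side dispatch for these ops is what moves into cmd_net.c below.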
Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/747d0519a2255bd055ae76b691d38d2b4c311001.1745843119.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/Makefile | 1 + io_uring/cmd_net.c | 83 ++++++++++++++++++++++++++++++++++++++++++++ io_uring/uring_cmd.c | 83 -------------------------------------------- 3 files changed, 84 insertions(+), 83 deletions(-) create mode 100644 io_uring/cmd_net.c diff --git a/io_uring/Makefile b/io_uring/Makefile index 3e28a741ca15..75e0ca795685 100644 --- a/io_uring/Makefile +++ b/io_uring/Makefile @@ -19,3 +19,4 @@ obj-$(CONFIG_IO_WQ) += io-wq.o obj-$(CONFIG_FUTEX) += futex.o obj-$(CONFIG_EPOLL) += epoll.o obj-$(CONFIG_NET_RX_BUSY_POLL) += napi.o +obj-$(CONFIG_NET) += cmd_net.o diff --git a/io_uring/cmd_net.c b/io_uring/cmd_net.c new file mode 100644 index 000000000000..e99170c7d41a --- /dev/null +++ b/io_uring/cmd_net.c @@ -0,0 +1,83 @@ +#include <asm/ioctls.h> +#include <linux/io_uring/net.h> +#include <net/sock.h> + +#include "uring_cmd.h" + +static inline int io_uring_cmd_getsockopt(struct socket *sock, + struct io_uring_cmd *cmd, + unsigned int issue_flags) +{ + const struct io_uring_sqe *sqe = cmd->sqe; + bool compat = !!(issue_flags & IO_URING_F_COMPAT); + int optlen, optname, level, err; + void __user *optval; + + level = READ_ONCE(sqe->level); + if (level != SOL_SOCKET) + return -EOPNOTSUPP; + + optval = u64_to_user_ptr(READ_ONCE(sqe->optval)); + optname = READ_ONCE(sqe->optname); + optlen = READ_ONCE(sqe->optlen); + + err = do_sock_getsockopt(sock, compat, level, optname, + USER_SOCKPTR(optval), + KERNEL_SOCKPTR(&optlen)); + if (err) + return err; + + /* On success, return optlen */ + return optlen; +} + +static inline int io_uring_cmd_setsockopt(struct socket *sock, + struct io_uring_cmd *cmd, + unsigned int issue_flags) +{ + const struct io_uring_sqe *sqe = cmd->sqe; + bool compat = !!(issue_flags & IO_URING_F_COMPAT); + int optname, optlen, level; + void __user *optval; + sockptr_t optval_s; + + optval = u64_to_user_ptr(READ_ONCE(sqe->optval)); + optname = READ_ONCE(sqe->optname); + optlen = READ_ONCE(sqe->optlen); + level = READ_ONCE(sqe->level); + optval_s = USER_SOCKPTR(optval); + + return do_sock_setsockopt(sock, compat, level, optname, optval_s, + optlen); +} + +int io_uring_cmd_sock(struct io_uring_cmd *cmd, unsigned int issue_flags) +{ + struct socket *sock = cmd->file->private_data; + struct sock *sk = sock->sk; + struct proto *prot = READ_ONCE(sk->sk_prot); + int ret, arg = 0; + + if (!prot || !prot->ioctl) + return -EOPNOTSUPP; + + switch (cmd->cmd_op) { + case SOCKET_URING_OP_SIOCINQ: + ret = prot->ioctl(sk, SIOCINQ, &arg); + if (ret) + return ret; + return arg; + case SOCKET_URING_OP_SIOCOUTQ: + ret = prot->ioctl(sk, SIOCOUTQ, &arg); + if (ret) + return ret; + return arg; + case SOCKET_URING_OP_GETSOCKOPT: + return io_uring_cmd_getsockopt(sock, cmd, issue_flags); + case SOCKET_URING_OP_SETSOCKOPT: + return io_uring_cmd_setsockopt(sock, cmd, issue_flags); + default: + return -EOPNOTSUPP; + } +} +EXPORT_SYMBOL_GPL(io_uring_cmd_sock); diff --git a/io_uring/uring_cmd.c b/io_uring/uring_cmd.c index a9ea7d29cdd9..34b450c78e2b 100644 --- a/io_uring/uring_cmd.c +++ b/io_uring/uring_cmd.c @@ -3,13 +3,10 @@ #include <linux/errno.h> #include <linux/file.h> #include <linux/io_uring/cmd.h> -#include <linux/io_uring/net.h> #include <linux/security.h> #include <linux/nospec.h> -#include <net/sock.h> #include <uapi/linux/io_uring.h> -#include <asm/ioctls.h> #include "io_uring.h" #include "alloc_cache.h" @@ -302,83 +299,3 @@ void io_uring_cmd_issue_blocking(struct io_uring_cmd *ioucmd) io_req_queue_iowq(req); } - -static inline int io_uring_cmd_getsockopt(struct socket *sock, - struct io_uring_cmd *cmd, - unsigned int issue_flags) -{
- const struct io_uring_sqe *sqe = cmd->sqe; - bool compat = !!(issue_flags & IO_URING_F_COMPAT); - int optlen, optname, level, err; - void __user *optval; - - level = READ_ONCE(sqe->level); - if (level != SOL_SOCKET) - return -EOPNOTSUPP; - - optval = u64_to_user_ptr(READ_ONCE(sqe->optval)); - optname = READ_ONCE(sqe->optname); - optlen = READ_ONCE(sqe->optlen); - - err = do_sock_getsockopt(sock, compat, level, optname, - USER_SOCKPTR(optval), - KERNEL_SOCKPTR(&optlen)); - if (err) - return err; - - /* On success, return optlen */ - return optlen; -} - -static inline int io_uring_cmd_setsockopt(struct socket *sock, - struct io_uring_cmd *cmd, - unsigned int issue_flags) -{ - const struct io_uring_sqe *sqe = cmd->sqe; - bool compat = !!(issue_flags & IO_URING_F_COMPAT); - int optname, optlen, level; - void __user *optval; - sockptr_t optval_s; - - optval = u64_to_user_ptr(READ_ONCE(sqe->optval)); - optname = READ_ONCE(sqe->optname); - optlen = READ_ONCE(sqe->optlen); - level = READ_ONCE(sqe->level); - optval_s = USER_SOCKPTR(optval); - - return do_sock_setsockopt(sock, compat, level, optname, optval_s, - optlen); -} - -#if defined(CONFIG_NET) -int io_uring_cmd_sock(struct io_uring_cmd *cmd, unsigned int issue_flags) -{ - struct socket *sock = cmd->file->private_data; - struct sock *sk = sock->sk; - struct proto *prot = READ_ONCE(sk->sk_prot); - int ret, arg = 0; - - if (!prot || !prot->ioctl) - return -EOPNOTSUPP; - - switch (cmd->cmd_op) { - case SOCKET_URING_OP_SIOCINQ: - ret = prot->ioctl(sk, SIOCINQ, &arg); - if (ret) - return ret; - return arg; - case SOCKET_URING_OP_SIOCOUTQ: - ret = prot->ioctl(sk, SIOCOUTQ, &arg); - if (ret) - return ret; - return arg; - case SOCKET_URING_OP_GETSOCKOPT: - return io_uring_cmd_getsockopt(sock, cmd, issue_flags); - case SOCKET_URING_OP_SETSOCKOPT: - return io_uring_cmd_setsockopt(sock, cmd, issue_flags); - default: - return -EOPNOTSUPP; - } -} -EXPORT_SYMBOL_GPL(io_uring_cmd_sock); -#endif -- 2.51.0 From d760d3f59f0d8d0df2895db30d36cf23106d6b05 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 1 May 2025 13:17:14 +0100 Subject: [PATCH 07/16] io_uring/zcrx: improve area validation dmabuf backed area will be taking an offset instead of addresses, and io_buffer_validate() is not flexible enough to facilitate it. It also takes an iovec, which may truncate the u64 length zcrx takes. Add a new helper function for validation. 
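The truncation the message refers to follows from the uapi type; quoting the iovec definition, with a hypothetical value for illustration:

	struct iovec {
		void __user	*iov_base;
		__kernel_size_t	iov_len;	/* 32 bits on 32-bit kernels */
	};

	/* e.g. a 6GiB area length of 0x180000000 would be silently cut down
	 * to 0x80000000 when squeezed through iov_len on a 32-bit build;
	 * io_validate_user_buf_range() below takes the length as a u64. */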
Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/0b3b735391a0a8f8971bf0121c19765131fddd3b.1746097431.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/rsrc.c | 27 +++++++++++++++------------ io_uring/rsrc.h | 2 +- io_uring/zcrx.c | 7 +++---- 3 files changed, 19 insertions(+), 17 deletions(-) diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c index b4c5f3ee8855..1657d775c8ba 100644 --- a/io_uring/rsrc.c +++ b/io_uring/rsrc.c @@ -80,10 +80,21 @@ static int io_account_mem(struct io_ring_ctx *ctx, unsigned long nr_pages) return 0; } -int io_buffer_validate(struct iovec *iov) +int io_validate_user_buf_range(u64 uaddr, u64 ulen) { - unsigned long tmp, acct_len = iov->iov_len + (PAGE_SIZE - 1); + unsigned long tmp, base = (unsigned long)uaddr; + unsigned long acct_len = (unsigned long)PAGE_ALIGN(ulen); + /* arbitrary limit, but we need something */ + if (ulen > SZ_1G || !ulen) + return -EFAULT; + if (check_add_overflow(base, acct_len, &tmp)) + return -EOVERFLOW; + return 0; +} + +static int io_buffer_validate(struct iovec *iov) +{ /* * Don't impose further limits on the size and buffer * constraints here, we'll -EINVAL later when IO is @@ -91,17 +102,9 @@ int io_buffer_validate(struct iovec *iov) */ if (!iov->iov_base) return iov->iov_len ? -EFAULT : 0; - if (!iov->iov_len) - return -EFAULT; - - /* arbitrary limit, but we need something */ - if (iov->iov_len > SZ_1G) - return -EFAULT; - if (check_add_overflow((unsigned long)iov->iov_base, acct_len, &tmp)) - return -EOVERFLOW; - - return 0; + return io_validate_user_buf_range((unsigned long)iov->iov_base, + iov->iov_len); } static void io_release_ubuf(void *priv) diff --git a/io_uring/rsrc.h b/io_uring/rsrc.h index 6008ad2e6d9e..2818aa0d0472 100644 --- a/io_uring/rsrc.h +++ b/io_uring/rsrc.h @@ -83,7 +83,7 @@ int io_register_rsrc_update(struct io_ring_ctx *ctx, void __user *arg, unsigned size, unsigned type); int io_register_rsrc(struct io_ring_ctx *ctx, void __user *arg, unsigned int size, unsigned int type); -int io_buffer_validate(struct iovec *iov); +int io_validate_user_buf_range(u64 uaddr, u64 ulen); bool io_check_coalesce_buffer(struct page **page_array, int nr_pages, struct io_imu_folio_data *data); diff --git a/io_uring/zcrx.c b/io_uring/zcrx.c index 22f420d6fbb9..5e918587fdc5 100644 --- a/io_uring/zcrx.c +++ b/io_uring/zcrx.c @@ -209,7 +209,6 @@ static int io_zcrx_create_area(struct io_zcrx_ifq *ifq, { struct io_zcrx_area *area; int i, ret, nr_pages, nr_iovs; - struct iovec iov; if (area_reg->flags || area_reg->rq_area_token) return -EINVAL; @@ -218,11 +217,11 @@ static int io_zcrx_create_area(struct io_zcrx_ifq *ifq, if (area_reg->addr & ~PAGE_MASK || area_reg->len & ~PAGE_MASK) return -EINVAL; - iov.iov_base = u64_to_user_ptr(area_reg->addr); - iov.iov_len = area_reg->len; - ret = io_buffer_validate(&iov); + ret = io_validate_user_buf_range(area_reg->addr, area_reg->len); if (ret) return ret; + if (!area_reg->addr) + return -EFAULT; ret = -ENOMEM; area = kzalloc(sizeof(*area), GFP_KERNEL); -- 2.51.0 From 6c9589aa08471f8984cdb5e743d2a2c048dc2403 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 1 May 2025 13:17:15 +0100 Subject: [PATCH 08/16] io_uring/zcrx: resolve netdev before area creation Some area types will require a valid struct device to be created, so resolve netdev and struct device before creating an area. 
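Condensed, the registration order after this patch looks as follows (sketch assembled from the hunk below, NULL checks and error unwinding elided):

	ifq->netdev = netdev_get_by_index(current->nsproxy->net_ns, reg.if_idx,
					  &ifq->netdev_tracker, GFP_KERNEL);
	ifq->dev = ifq->netdev->dev.parent;	/* DMA-capable struct device */
	get_device(ifq->dev);
	ret = io_zcrx_create_area(ifq, &ifq->area, &area);	/* may now use ifq->dev */

so an area type that needs a device at import time (e.g. the dmabuf attach later in the series) finds ifq->dev already valid.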
Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/ac8c1482be22acfe9ca788d2c3ce31b7451ce488.1746097431.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/zcrx.c | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/io_uring/zcrx.c b/io_uring/zcrx.c index 5e918587fdc5..b5335dd4f5b1 100644 --- a/io_uring/zcrx.c +++ b/io_uring/zcrx.c @@ -395,6 +395,7 @@ int io_register_zcrx_ifq(struct io_ring_ctx *ctx, ifq = io_zcrx_ifq_alloc(ctx); if (!ifq) return -ENOMEM; + ifq->rq_entries = reg.rq_entries; scoped_guard(mutex, &ctx->mmap_lock) { /* preallocate id */ @@ -407,24 +408,24 @@ int io_register_zcrx_ifq(struct io_ring_ctx *ctx, if (ret) goto err; - ret = io_zcrx_create_area(ifq, &ifq->area, &area); - if (ret) - goto err; - - ifq->rq_entries = reg.rq_entries; - - ret = -ENODEV; ifq->netdev = netdev_get_by_index(current->nsproxy->net_ns, reg.if_idx, &ifq->netdev_tracker, GFP_KERNEL); - if (!ifq->netdev) + if (!ifq->netdev) { + ret = -ENODEV; goto err; + } ifq->dev = ifq->netdev->dev.parent; - ret = -EOPNOTSUPP; - if (!ifq->dev) + if (!ifq->dev) { + ret = -EOPNOTSUPP; goto err; + } get_device(ifq->dev); + ret = io_zcrx_create_area(ifq, &ifq->area, &area); + if (ret) + goto err; + mp_param.mp_ops = &io_uring_pp_zc_ops; mp_param.mp_priv = ifq; ret = net_mp_open_rxq(ifq->netdev, reg.if_rxq, &mp_param); -- 2.51.0 From 782dfa329ac9d1b5ca7b6df56a7696bac58cb829 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 1 May 2025 13:17:16 +0100 Subject: [PATCH 09/16] io_uring/zcrx: split out memory holders from area In the data path, users of struct io_zcrx_area don't need to know what kind of memory it's backed by. Only keep the generic bits in there and split out memory-type dependent fields into a new structure. It also logically separates the step that actually imports the memory, e.g. pinning user pages, from the generic area initialisation.
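Abridged from the zcrx.h hunk below, the resulting split looks like:

	struct io_zcrx_mem {			/* setup/teardown-only state */
		unsigned long	size;
		struct page	**pages;	/* pinned user pages */
		unsigned long	nr_folios;
	};

	struct io_zcrx_area {
		struct net_iov_area	nia;	/* what the data path touches */
		/* ... freelist, user refs, area id ... */
		struct io_zcrx_mem	mem;	/* memory-type specific holder */
	};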
Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/b60fc09c76921bf69e77eb17e07eb4decedb3bf4.1746097431.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/zcrx.c | 71 ++++++++++++++++++++++++++++++++----------------- io_uring/zcrx.h | 11 ++++++-- 2 files changed, 56 insertions(+), 26 deletions(-) diff --git a/io_uring/zcrx.c b/io_uring/zcrx.c index b5335dd4f5b1..8d4cfd957e38 100644 --- a/io_uring/zcrx.c +++ b/io_uring/zcrx.c @@ -26,6 +26,8 @@ #include "zcrx.h" #include "rsrc.h" +#define IO_DMA_ATTR (DMA_ATTR_SKIP_CPU_SYNC | DMA_ATTR_WEAK_ORDERING) + static inline struct io_zcrx_ifq *io_pp_to_ifq(struct page_pool *pp) { return pp->mp_priv; @@ -42,10 +44,43 @@ static inline struct page *io_zcrx_iov_page(const struct net_iov *niov) { struct io_zcrx_area *area = io_zcrx_iov_to_area(niov); - return area->pages[net_iov_idx(niov)]; + return area->mem.pages[net_iov_idx(niov)]; } -#define IO_DMA_ATTR (DMA_ATTR_SKIP_CPU_SYNC | DMA_ATTR_WEAK_ORDERING) +static void io_release_area_mem(struct io_zcrx_mem *mem) +{ + if (mem->pages) { + unpin_user_pages(mem->pages, mem->nr_folios); + kvfree(mem->pages); + } +} + +static int io_import_area(struct io_zcrx_ifq *ifq, + struct io_zcrx_mem *mem, + struct io_uring_zcrx_area_reg *area_reg) +{ + struct page **pages; + int nr_pages; + int ret; + + ret = io_validate_user_buf_range(area_reg->addr, area_reg->len); + if (ret) + return ret; + if (!area_reg->addr) + return -EFAULT; + if (area_reg->addr & ~PAGE_MASK || area_reg->len & ~PAGE_MASK) + return -EINVAL; + + pages = io_pin_pages((unsigned long)area_reg->addr, area_reg->len, + &nr_pages); + if (IS_ERR(pages)) + return PTR_ERR(pages); + + mem->pages = pages; + mem->nr_folios = nr_pages; + mem->size = area_reg->len; + return 0; +} static void __io_zcrx_unmap_area(struct io_zcrx_ifq *ifq, struct io_zcrx_area *area, int nr_mapped) @@ -84,8 +119,8 @@ static int io_zcrx_map_area(struct io_zcrx_ifq *ifq, struct io_zcrx_area *area) struct net_iov *niov = &area->nia.niovs[i]; dma_addr_t dma; - dma = dma_map_page_attrs(ifq->dev, area->pages[i], 0, PAGE_SIZE, - DMA_FROM_DEVICE, IO_DMA_ATTR); + dma = dma_map_page_attrs(ifq->dev, area->mem.pages[i], 0, + PAGE_SIZE, DMA_FROM_DEVICE, IO_DMA_ATTR); if (dma_mapping_error(ifq->dev, dma)) break; if (net_mp_niov_set_dma_addr(niov, dma)) { @@ -192,14 +227,11 @@ static void io_free_rbuf_ring(struct io_zcrx_ifq *ifq) static void io_zcrx_free_area(struct io_zcrx_area *area) { io_zcrx_unmap_area(area->ifq, area); + io_release_area_mem(&area->mem); kvfree(area->freelist); kvfree(area->nia.niovs); kvfree(area->user_refs); - if (area->pages) { - unpin_user_pages(area->pages, area->nr_folios); - kvfree(area->pages); - } kfree(area); } @@ -208,36 +240,27 @@ static int io_zcrx_create_area(struct io_zcrx_ifq *ifq, struct io_uring_zcrx_area_reg *area_reg) { struct io_zcrx_area *area; - int i, ret, nr_pages, nr_iovs; + unsigned nr_iovs; + int i, ret; if (area_reg->flags || area_reg->rq_area_token) return -EINVAL; if (area_reg->__resv1 || area_reg->__resv2[0] || area_reg->__resv2[1]) return -EINVAL; - if (area_reg->addr & ~PAGE_MASK || area_reg->len & ~PAGE_MASK) - return -EINVAL; - - ret = io_validate_user_buf_range(area_reg->addr, area_reg->len); - if (ret) - return ret; - if (!area_reg->addr) - return -EFAULT; ret = -ENOMEM; area = kzalloc(sizeof(*area), GFP_KERNEL); if (!area) goto err; - area->pages = io_pin_pages((unsigned long)area_reg->addr, area_reg->len, - &nr_pages); - if (IS_ERR(area->pages)) { - ret = PTR_ERR(area->pages); - area->pages = NULL; + ret = 
io_import_area(ifq, &area->mem, area_reg); + if (ret) goto err; - } - area->nr_folios = nr_iovs = nr_pages; + + nr_iovs = area->mem.size >> PAGE_SHIFT; area->nia.num_niovs = nr_iovs; + ret = -ENOMEM; area->nia.niovs = kvmalloc_array(nr_iovs, sizeof(area->nia.niovs[0]), GFP_KERNEL | __GFP_ZERO); if (!area->nia.niovs) diff --git a/io_uring/zcrx.h b/io_uring/zcrx.h index e3c7c4e647f1..9c22807af807 100644 --- a/io_uring/zcrx.h +++ b/io_uring/zcrx.h @@ -7,6 +7,13 @@ #include <net/page_pool/types.h> #include <net/net_trackers.h> +struct io_zcrx_mem { + unsigned long size; + + struct page **pages; + unsigned long nr_folios; +}; + struct io_zcrx_area { struct net_iov_area nia; struct io_zcrx_ifq *ifq; @@ -14,13 +21,13 @@ struct io_zcrx_area { bool is_mapped; u16 area_id; - struct page **pages; - unsigned long nr_folios; /* freelist */ spinlock_t freelist_lock ____cacheline_aligned_in_smp; u32 free_count; u32 *freelist; + + struct io_zcrx_mem mem; }; struct io_zcrx_ifq { -- 2.51.0 From 8a62804248fff77749048a0f5511649b2569bba9 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 1 May 2025 13:17:17 +0100 Subject: [PATCH 10/16] io_uring/zcrx: split common area map/unmap parts Extract area type dependent parts of io_zcrx_[un]map_area from the generic path. It'll be helpful once there are more area memory types and not only user pages. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/50f6e893e2d20f937e628196cbf528d15f81c289.1746097431.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/zcrx.c | 42 +++++++++++++++++++++++++++++------------- 1 file changed, 29 insertions(+), 13 deletions(-) diff --git a/io_uring/zcrx.c b/io_uring/zcrx.c index 8d4cfd957e38..34b09beba992 100644 --- a/io_uring/zcrx.c +++ b/io_uring/zcrx.c @@ -82,22 +82,31 @@ static int io_import_area(struct io_zcrx_ifq *ifq, return 0; } -static void __io_zcrx_unmap_area(struct io_zcrx_ifq *ifq, - struct io_zcrx_area *area, int nr_mapped) +static void io_zcrx_unmap_umem(struct io_zcrx_ifq *ifq, + struct io_zcrx_area *area, int nr_mapped) { int i; for (i = 0; i < nr_mapped; i++) { - struct net_iov *niov = &area->nia.niovs[i]; - dma_addr_t dma; + netmem_ref netmem = net_iov_to_netmem(&area->nia.niovs[i]); + dma_addr_t dma = page_pool_get_dma_addr_netmem(netmem); - dma = page_pool_get_dma_addr_netmem(net_iov_to_netmem(niov)); dma_unmap_page_attrs(ifq->dev, dma, PAGE_SIZE, DMA_FROM_DEVICE, IO_DMA_ATTR); - net_mp_niov_set_dma_addr(niov, 0); } } +static void __io_zcrx_unmap_area(struct io_zcrx_ifq *ifq, + struct io_zcrx_area *area, int nr_mapped) +{ + int i; + + io_zcrx_unmap_umem(ifq, area, nr_mapped); + + for (i = 0; i < area->nia.num_niovs; i++) + net_mp_niov_set_dma_addr(&area->nia.niovs[i], 0); +} + static void io_zcrx_unmap_area(struct io_zcrx_ifq *ifq, struct io_zcrx_area *area) { guard(mutex)(&ifq->dma_lock); @@ -107,14 +116,10 @@ static void io_zcrx_unmap_area(struct io_zcrx_ifq *ifq, struct io_zcrx_area *are area->is_mapped = false; } -static int io_zcrx_map_area(struct io_zcrx_ifq *ifq, struct io_zcrx_area *area) +static int io_zcrx_map_area_umem(struct io_zcrx_ifq *ifq, struct io_zcrx_area *area) { int i; - guard(mutex)(&ifq->dma_lock); - if (area->is_mapped) - return 0; - for (i = 0; i < area->nia.num_niovs; i++) { struct net_iov *niov = &area->nia.niovs[i]; dma_addr_t dma; @@ -129,9 +134,20 @@ static int io_zcrx_map_area(struct io_zcrx_ifq *ifq, struct io_zcrx_area *area) break; } } + return i; +} + +static int io_zcrx_map_area(struct io_zcrx_ifq *ifq, struct io_zcrx_area *area) +{ + unsigned nr; + + guard(mutex)(&ifq->dma_lock); + if
(area->is_mapped) + return 0; - if (i != area->nia.num_niovs) { - __io_zcrx_unmap_area(ifq, area, i); + nr = io_zcrx_map_area_umem(ifq, area); + if (nr != area->nia.num_niovs) { + __io_zcrx_unmap_area(ifq, area, nr); return -EINVAL; } -- 2.51.0 From a5c98e9424573649e59988199a3356a79c9e1fd9 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 1 May 2025 13:17:18 +0100 Subject: [PATCH 11/16] io_uring/zcrx: dmabuf backed zerocopy receive Add support for dmabuf backed zcrx areas. To use it, the user should pass IORING_ZCRX_AREA_DMABUF in the struct io_uring_zcrx_area_reg flags field and pass a dmabuf fd in the dmabuf_fd field. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/20bb1890e60a82ec945ab36370d1fd54be414ab6.1746097431.git.asml.silence@gmail.com Link: https://lore.kernel.org/io-uring/6e37db97303212bbd8955f9501cf99b579f8aece.1746547722.git.asml.silence@gmail.com [axboe: fold in fixup] Signed-off-by: Jens Axboe --- include/uapi/linux/io_uring.h | 6 +- io_uring/zcrx.c | 163 ++++++++++++++++++++++++++++++---- io_uring/zcrx.h | 7 ++ 3 files changed, 159 insertions(+), 17 deletions(-) diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index 130f3bc71a69..5ce096090b0c 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -990,12 +990,16 @@ struct io_uring_zcrx_offsets { __u64 __resv[2]; }; +enum io_uring_zcrx_area_flags { + IORING_ZCRX_AREA_DMABUF = 1, +}; + struct io_uring_zcrx_area_reg { __u64 addr; __u64 len; __u64 rq_area_token; __u32 flags; - __u32 __resv1; + __u32 dmabuf_fd; __u64 __resv2[2]; }; diff --git a/io_uring/zcrx.c b/io_uring/zcrx.c index 34b09beba992..9a568d049204 100644 --- a/io_uring/zcrx.c +++ b/io_uring/zcrx.c @@ -47,30 +47,118 @@ static inline struct page *io_zcrx_iov_page(const struct net_iov *niov) return area->mem.pages[net_iov_idx(niov)]; } -static void io_release_area_mem(struct io_zcrx_mem *mem) +static void io_release_dmabuf(struct io_zcrx_mem *mem) { - if (mem->pages) { - unpin_user_pages(mem->pages, mem->nr_folios); - kvfree(mem->pages); + if (!IS_ENABLED(CONFIG_DMA_SHARED_BUFFER)) + return; + + if (mem->sgt) + dma_buf_unmap_attachment_unlocked(mem->attach, mem->sgt, + DMA_FROM_DEVICE); + if (mem->attach) + dma_buf_detach(mem->dmabuf, mem->attach); + if (mem->dmabuf) + dma_buf_put(mem->dmabuf); + + mem->sgt = NULL; + mem->attach = NULL; + mem->dmabuf = NULL; +} + +static int io_import_dmabuf(struct io_zcrx_ifq *ifq, + struct io_zcrx_mem *mem, + struct io_uring_zcrx_area_reg *area_reg) +{ + unsigned long off = (unsigned long)area_reg->addr; + unsigned long len = (unsigned long)area_reg->len; + unsigned long total_size = 0; + struct scatterlist *sg; + int dmabuf_fd = area_reg->dmabuf_fd; + int i, ret; + + if (WARN_ON_ONCE(!ifq->dev)) + return -EFAULT; + if (!IS_ENABLED(CONFIG_DMA_SHARED_BUFFER)) + return -EINVAL; + + mem->is_dmabuf = true; + mem->dmabuf = dma_buf_get(dmabuf_fd); + if (IS_ERR(mem->dmabuf)) { + ret = PTR_ERR(mem->dmabuf); + mem->dmabuf = NULL; + goto err; } + + mem->attach = dma_buf_attach(mem->dmabuf, ifq->dev); + if (IS_ERR(mem->attach)) { + ret = PTR_ERR(mem->attach); + mem->attach = NULL; + goto err; + } + + mem->sgt = dma_buf_map_attachment_unlocked(mem->attach, DMA_FROM_DEVICE); + if (IS_ERR(mem->sgt)) { + ret = PTR_ERR(mem->sgt); + mem->sgt = NULL; + goto err; + } + + for_each_sgtable_dma_sg(mem->sgt, sg, i) + total_size += sg_dma_len(sg); + + if (total_size < off + len) + return -EINVAL; + + mem->dmabuf_offset = off; + mem->size = len; + return 0; +err: + 
io_release_dmabuf(mem); + return ret; } -static int io_import_area(struct io_zcrx_ifq *ifq, +static int io_zcrx_map_area_dmabuf(struct io_zcrx_ifq *ifq, struct io_zcrx_area *area) +{ + unsigned long off = area->mem.dmabuf_offset; + struct scatterlist *sg; + unsigned i, niov_idx = 0; + + if (!IS_ENABLED(CONFIG_DMA_SHARED_BUFFER)) + return -EINVAL; + + for_each_sgtable_dma_sg(area->mem.sgt, sg, i) { + dma_addr_t dma = sg_dma_address(sg); + unsigned long sg_len = sg_dma_len(sg); + unsigned long sg_off = min(sg_len, off); + + off -= sg_off; + sg_len -= sg_off; + dma += sg_off; + + while (sg_len && niov_idx < area->nia.num_niovs) { + struct net_iov *niov = &area->nia.niovs[niov_idx]; + + if (net_mp_niov_set_dma_addr(niov, dma)) + return 0; + sg_len -= PAGE_SIZE; + dma += PAGE_SIZE; + niov_idx++; + } + } + return niov_idx; +} + +static int io_import_umem(struct io_zcrx_ifq *ifq, struct io_zcrx_mem *mem, struct io_uring_zcrx_area_reg *area_reg) { struct page **pages; int nr_pages; - int ret; - ret = io_validate_user_buf_range(area_reg->addr, area_reg->len); - if (ret) - return ret; + if (area_reg->dmabuf_fd) + return -EINVAL; if (!area_reg->addr) return -EFAULT; - if (area_reg->addr & ~PAGE_MASK || area_reg->len & ~PAGE_MASK) - return -EINVAL; - pages = io_pin_pages((unsigned long)area_reg->addr, area_reg->len, &nr_pages); if (IS_ERR(pages)) @@ -82,6 +170,35 @@ static int io_import_area(struct io_zcrx_ifq *ifq, return 0; } +static void io_release_area_mem(struct io_zcrx_mem *mem) +{ + if (mem->is_dmabuf) { + io_release_dmabuf(mem); + return; + } + if (mem->pages) { + unpin_user_pages(mem->pages, mem->nr_folios); + kvfree(mem->pages); + } +} + +static int io_import_area(struct io_zcrx_ifq *ifq, + struct io_zcrx_mem *mem, + struct io_uring_zcrx_area_reg *area_reg) +{ + int ret; + + ret = io_validate_user_buf_range(area_reg->addr, area_reg->len); + if (ret) + return ret; + if (area_reg->addr & ~PAGE_MASK || area_reg->len & ~PAGE_MASK) + return -EINVAL; + + if (area_reg->flags & IORING_ZCRX_AREA_DMABUF) + return io_import_dmabuf(ifq, mem, area_reg); + return io_import_umem(ifq, mem, area_reg); +} + static void io_zcrx_unmap_umem(struct io_zcrx_ifq *ifq, struct io_zcrx_area *area, int nr_mapped) { @@ -101,7 +218,10 @@ static void __io_zcrx_unmap_area(struct io_zcrx_ifq *ifq, { int i; - io_zcrx_unmap_umem(ifq, area, nr_mapped); + if (area->mem.is_dmabuf) + io_release_dmabuf(&area->mem); + else + io_zcrx_unmap_umem(ifq, area, nr_mapped); for (i = 0; i < area->nia.num_niovs; i++) net_mp_niov_set_dma_addr(&area->nia.niovs[i], 0); @@ -145,7 +265,11 @@ static int io_zcrx_map_area(struct io_zcrx_ifq *ifq, struct io_zcrx_area *area) if (area->is_mapped) return 0; - nr = io_zcrx_map_area_umem(ifq, area); + if (area->mem.is_dmabuf) + nr = io_zcrx_map_area_dmabuf(ifq, area); + else + nr = io_zcrx_map_area_umem(ifq, area); + if (nr != area->nia.num_niovs) { __io_zcrx_unmap_area(ifq, area, nr); return -EINVAL; @@ -251,6 +375,8 @@ static void io_zcrx_free_area(struct io_zcrx_area *area) kfree(area); } +#define IO_ZCRX_AREA_SUPPORTED_FLAGS (IORING_ZCRX_AREA_DMABUF) + static int io_zcrx_create_area(struct io_zcrx_ifq *ifq, struct io_zcrx_area **res, struct io_uring_zcrx_area_reg *area_reg) @@ -259,9 +385,11 @@ static int io_zcrx_create_area(struct io_zcrx_ifq *ifq, unsigned nr_iovs; int i, ret; - if (area_reg->flags || area_reg->rq_area_token) + if (area_reg->flags & ~IO_ZCRX_AREA_SUPPORTED_FLAGS) + return -EINVAL; + if (area_reg->rq_area_token) return -EINVAL; - if (area_reg->__resv1 || area_reg->__resv2[0] || 
area_reg->__resv2[1]) + if (area_reg->__resv2[0] || area_reg->__resv2[1]) return -EINVAL; ret = -ENOMEM; @@ -819,6 +947,9 @@ static ssize_t io_zcrx_copy_chunk(struct io_kiocb *req, struct io_zcrx_ifq *ifq, size_t copied = 0; int ret = 0; + if (area->mem.is_dmabuf) + return -EFAULT; + while (len) { size_t copy_size = min_t(size_t, PAGE_SIZE, len); const int dst_off = 0; diff --git a/io_uring/zcrx.h b/io_uring/zcrx.h index 9c22807af807..2f5e26389f22 100644 --- a/io_uring/zcrx.h +++ b/io_uring/zcrx.h @@ -3,15 +3,22 @@ #define IOU_ZC_RX_H #include <linux/io_uring_types.h> +#include <linux/dma-buf.h> #include <linux/socket.h> #include <net/page_pool/types.h> #include <net/net_trackers.h> struct io_zcrx_mem { unsigned long size; + bool is_dmabuf; struct page **pages; unsigned long nr_folios; + + struct dma_buf_attachment *attach; + struct dma_buf *dmabuf; + struct sg_table *sgt; + unsigned long dmabuf_offset; }; struct io_zcrx_area { -- 2.51.0 From 78967aabf6138bd43798c966a75167579ce42955 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Tue, 6 May 2025 13:30:47 +0100 Subject: [PATCH 12/16] io_uring/timeout: don't export link t-out disarm helper [__]io_disarm_linked_timeout() are only used inside timeout.c, so confine them inside the file. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/1eb200911255e643bf252a8e65fb2c787340cf18.1746533800.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/timeout.c | 11 ++++++++--- io_uring/timeout.h | 13 ------------- 2 files changed, 8 insertions(+), 16 deletions(-) diff --git a/io_uring/timeout.c b/io_uring/timeout.c index 2a107665230b..a6ff8c026b1f 100644 --- a/io_uring/timeout.c +++ b/io_uring/timeout.c @@ -35,6 +35,9 @@ struct io_timeout_rem { bool ltimeout; }; +static struct io_kiocb *__io_disarm_linked_timeout(struct io_kiocb *req, + struct io_kiocb *link); + static inline bool io_is_timeout_noseq(struct io_kiocb *req) { struct io_timeout *timeout = io_kiocb_to_cmd(req, struct io_timeout); @@ -218,7 +221,9 @@ void io_disarm_next(struct io_kiocb *req) struct io_ring_ctx *ctx = req->ctx; raw_spin_lock_irq(&ctx->timeout_lock); - link = io_disarm_linked_timeout(req); + if (req->link && req->link->opcode == IORING_OP_LINK_TIMEOUT) + link = __io_disarm_linked_timeout(req, req->link); + raw_spin_unlock_irq(&ctx->timeout_lock); if (link) io_req_queue_tw_complete(link, -ECANCELED); @@ -228,8 +233,8 @@ void io_disarm_next(struct io_kiocb *req) io_fail_links(req); } -struct io_kiocb *__io_disarm_linked_timeout(struct io_kiocb *req, - struct io_kiocb *link) +static struct io_kiocb *__io_disarm_linked_timeout(struct io_kiocb *req, + struct io_kiocb *link) __must_hold(&req->ctx->completion_lock) __must_hold(&req->ctx->timeout_lock) { diff --git a/io_uring/timeout.h b/io_uring/timeout.h index e91b32448dcf..2b7c9ad72992 100644 --- a/io_uring/timeout.h +++ b/io_uring/timeout.h @@ -8,19 +8,6 @@ struct io_timeout_data { u32 flags; }; -struct io_kiocb *__io_disarm_linked_timeout(struct io_kiocb *req, - struct io_kiocb *link); - -static inline struct io_kiocb *io_disarm_linked_timeout(struct io_kiocb *req) -{ - struct io_kiocb *link = req->link; - - if (link && link->opcode == IORING_OP_LINK_TIMEOUT) - return __io_disarm_linked_timeout(req, link); - - return NULL; -} - __cold void io_flush_timeouts(struct io_ring_ctx *ctx); struct io_cancel_data; int io_timeout_cancel(struct io_ring_ctx *ctx, struct io_cancel_data *cd); -- 2.51.0 From 9c2ff3f9b5e0202d1cc1f6193b1e96df203ae4a4 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Tue, 6 May 2025 13:31:07 +0100 Subject: [PATCH 13/16] io_uring: remove io_preinit_req() Apart from setting ->ctx,
io_preinit_req() zeroes a bunch of fields of a request, of which only ->file_node is mandatory. Remove the function and zero the entire request on first allocation. With that, we also need to initialise ->ctx every time, which might be a good thing for performance as now we're likely overwriting the entire cache line, and so it can be write-combined and avoid RMW. Suggested-by: Jens Axboe Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/ba5485dc913f1e275862ce88f5169d4ac4a33836.1746533807.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 21 ++------------------- io_uring/notif.c | 1 + 2 files changed, 3 insertions(+), 19 deletions(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 703251f6f4d8..3d20f3b63443 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -927,22 +927,6 @@ void io_req_defer_failed(struct io_kiocb *req, s32 res) io_req_complete_defer(req); } -/* - * Don't initialise the fields below on every allocation, but do that in - * advance and keep them valid across allocations. - */ -static void io_preinit_req(struct io_kiocb *req, struct io_ring_ctx *ctx) -{ - req->ctx = ctx; - req->buf_node = NULL; - req->file_node = NULL; - req->link = NULL; - req->async_data = NULL; - /* not necessary, but safer to zero */ - memset(&req->cqe, 0, sizeof(req->cqe)); - memset(&req->big_cqe, 0, sizeof(req->big_cqe)); -} - /* * A request might get retired back into the request caches even before opcode * handlers and io_issue_sqe() are done with it, e.g. inline completion path. @@ -952,7 +936,7 @@ static void io_preinit_req(struct io_kiocb *req, struct io_ring_ctx *ctx) __cold bool __io_alloc_req_refill(struct io_ring_ctx *ctx) __must_hold(&ctx->uring_lock) { - gfp_t gfp = GFP_KERNEL | __GFP_NOWARN; + gfp_t gfp = GFP_KERNEL | __GFP_NOWARN | __GFP_ZERO; void *reqs[IO_REQ_ALLOC_BATCH]; int ret; @@ -973,7 +957,6 @@ __cold bool __io_alloc_req_refill(struct io_ring_ctx *ctx) while (ret--) { struct io_kiocb *req = reqs[ret]; - io_preinit_req(req, ctx); io_req_add_to_cache(req, ctx); } return true; @@ -2049,7 +2032,7 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req, int personality; u8 opcode; - /* req is partially pre-initialised, see io_preinit_req() */ + req->ctx = ctx; req->opcode = opcode = READ_ONCE(sqe->opcode); /* same numerical values with corresponding REQ_F_*, safe to copy */ sqe_flags = READ_ONCE(sqe->flags); diff --git a/io_uring/notif.c b/io_uring/notif.c index 7bd92538dccb..9a6f6e92d742 100644 --- a/io_uring/notif.c +++ b/io_uring/notif.c @@ -112,6 +112,7 @@ struct io_kiocb *io_alloc_notif(struct io_ring_ctx *ctx) if (unlikely(!io_alloc_req(ctx, &notif))) return NULL; + notif->ctx = ctx; notif->opcode = IORING_OP_NOP; notif->flags = 0; notif->file = NULL; -- 2.51.0 From 35adea1d018ab1e450ea2304e58dc2f987a639d3 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Tue, 6 May 2025 13:31:16 +0100 Subject: [PATCH 14/16] io_uring: move io_req_put_rsrc_nodes() It'd be nice to hide details of how rsrc nodes are used by a request from rsrc.c, specifically which request fields store them, and which bits signify whether there is a node in a request. It rather belongs to generic request handling, so move the helper to io_uring.c. While doing so remove clearing of ->buf_node as it's controlled by REQ_F_BUF_NODE and doesn't require zeroing.
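The invariant that makes the dropped zeroing safe, sketched (hypothetical reader, not code from the patch):

	/* consumers gate on the flag, never on the pointer itself */
	if (req->flags & REQ_F_BUF_NODE)
		node = req->buf_node;	/* only meaningful under REQ_F_BUF_NODE */

A stale ->buf_node left behind after the put is therefore never dereferenced: the flag is gone once the request is recycled and reinitialised.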
Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/bb73fb42baf825edb39344365aff48cdfdd4c692.1746533789.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 10 ++++++++++ io_uring/rsrc.h | 12 ------------ 2 files changed, 10 insertions(+), 12 deletions(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 3d20f3b63443..0d051476008c 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -1369,6 +1369,16 @@ void io_queue_next(struct io_kiocb *req) io_req_task_queue(nxt); } +static inline void io_req_put_rsrc_nodes(struct io_kiocb *req) +{ + if (req->file_node) { + io_put_rsrc_node(req->ctx, req->file_node); + req->file_node = NULL; + } + if (req->flags & REQ_F_BUF_NODE) + io_put_rsrc_node(req->ctx, req->buf_node); +} + static void io_free_batch_list(struct io_ring_ctx *ctx, struct io_wq_work_node *node) __must_hold(&ctx->uring_lock) diff --git a/io_uring/rsrc.h b/io_uring/rsrc.h index 2818aa0d0472..0d2138f16322 100644 --- a/io_uring/rsrc.h +++ b/io_uring/rsrc.h @@ -115,18 +115,6 @@ static inline bool io_reset_rsrc_node(struct io_ring_ctx *ctx, return true; } -static inline void io_req_put_rsrc_nodes(struct io_kiocb *req) -{ - if (req->file_node) { - io_put_rsrc_node(req->ctx, req->file_node); - req->file_node = NULL; - } - if (req->flags & REQ_F_BUF_NODE) { - io_put_rsrc_node(req->ctx, req->buf_node); - req->buf_node = NULL; - } -} - int io_files_update(struct io_kiocb *req, unsigned int issue_flags); int io_files_update_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); -- 2.51.0 From 6ae4308116f1033ceb11b419c01e9c5f17a35633 Mon Sep 17 00:00:00 2001 From: Long Li Date: Fri, 9 May 2025 14:30:15 +0800 Subject: [PATCH 15/16] io_uring: update parameter name in io_pin_pages function declaration Rename first parameter in io_pin_pages from ubuf to uaddr for consistency between declaration and implementation. Signed-off-by: Long Li Link: https://lore.kernel.org/r/20250509063015.3799255-1-leo.lilong@huaweicloud.com Signed-off-by: Jens Axboe --- io_uring/memmap.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/io_uring/memmap.h b/io_uring/memmap.h index 24afb298e974..08419684e4bc 100644 --- a/io_uring/memmap.h +++ b/io_uring/memmap.h @@ -6,7 +6,7 @@ #define IORING_OFF_ZCRX_SHIFT 16 -struct page **io_pin_pages(unsigned long ubuf, unsigned long len, int *npages); +struct page **io_pin_pages(unsigned long uaddr, unsigned long len, int *npages); #ifndef CONFIG_MMU unsigned int io_uring_nommu_mmap_capabilities(struct file *file); -- 2.51.0 From 28b8cd864da516ea87162015e490b1dea444888f Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Fri, 9 May 2025 12:03:28 +0100 Subject: [PATCH 16/16] io_uring/net: move CONFIG_NET guards to Makefile Instruct Makefile to never try to compile net.c without CONFIG_NET and kill ifdefs in the file. 
Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/f466400e20c3f536191bfd559b1f3cd2a2ab5a1e.1746788579.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/Makefile | 4 ++-- io_uring/net.c | 2 -- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/io_uring/Makefile b/io_uring/Makefile index 75e0ca795685..11a739927a62 100644 --- a/io_uring/Makefile +++ b/io_uring/Makefile @@ -7,7 +7,7 @@ GCOV_PROFILE := y endif obj-$(CONFIG_IO_URING) += io_uring.o opdef.o kbuf.o rsrc.o notif.o \ - tctx.o filetable.o rw.o net.o poll.o \ + tctx.o filetable.o rw.o poll.o \ eventfd.o uring_cmd.o openclose.o \ sqpoll.o xattr.o nop.o fs.o splice.o \ sync.o msg_ring.o advise.o openclose.o \ @@ -19,4 +19,4 @@ obj-$(CONFIG_IO_WQ) += io-wq.o obj-$(CONFIG_FUTEX) += futex.o obj-$(CONFIG_EPOLL) += epoll.o obj-$(CONFIG_NET_RX_BUSY_POLL) += napi.o -obj-$(CONFIG_NET) += cmd_net.o +obj-$(CONFIG_NET) += net.o cmd_net.o diff --git a/io_uring/net.c b/io_uring/net.c index b3a643675ce8..1fbdb2bbb3f3 100644 --- a/io_uring/net.c +++ b/io_uring/net.c @@ -18,7 +18,6 @@ #include "rsrc.h" #include "zcrx.h" -#if defined(CONFIG_NET) struct io_shutdown { struct file *file; int how; @@ -1836,4 +1835,3 @@ void io_netmsg_cache_free(const void *entry) io_vec_free(&kmsg->vec); kfree(kmsg); } -#endif -- 2.51.0
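Closing usage note for the zcrx side of the series: with patches 1 and 11 applied, userspace describes a dmabuf-backed area roughly as below. This is a sketch against the uapi added above; area_len and dmabuf_fd are illustrative, and the ifq_reg wiring is abbreviated:

	struct io_uring_zcrx_area_reg area_reg = {
		.addr		= 0,			/* offset into the dmabuf, not a VA */
		.len		= area_len,		/* page-aligned length to expose */
		.flags		= IORING_ZCRX_AREA_DMABUF,
		.dmabuf_fd	= dmabuf_fd,		/* e.g. from udmabuf or a GPU driver */
	};

	/* area_reg is pointed to by struct io_uring_zcrx_ifq_reg::area_ptr and
	 * registered with io_uring_register(ring_fd, IORING_REGISTER_ZCRX_IFQ,
	 * &reg, 1); with patch 1, each additional ifq gets its id back in
	 * reg.zcrx_id. */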