IOSQE_IO_HARDLINK | IOSQE_ASYNC)
#define SQE_VALID_FLAGS (SQE_COMMON_FLAGS | IOSQE_BUFFER_SELECT | \
- IOSQE_IO_DRAIN | IOSQE_CQE_SKIP_SUCCESS)
+ IOSQE_IO_DRAIN | IOSQE_CQE_SKIP_SUCCESS | \
+ IOSQE_GROUP_LINK)
#define IO_REQ_CLEAN_FLAGS (REQ_F_BUFFER_SELECTED | REQ_F_NEED_CLEANUP | \
REQ_F_POLLED | REQ_F_INFLIGHT | REQ_F_CREDS | \
REQ_F_ASYNC_DATA)
#define IO_REQ_CLEAN_SLOW_FLAGS (REQ_F_REFCOUNT | REQ_F_LINK | REQ_F_HARDLINK |\
- IO_REQ_CLEAN_FLAGS)
+ REQ_F_GROUP | IO_REQ_CLEAN_FLAGS)
#define IO_TCTX_REFS_CACHE_NR (1U << 10)
}
}
+/* Can only be called after this request is issued */
+static inline struct io_kiocb *get_group_leader(struct io_kiocb *req)
+{
+ if (!(req->flags & REQ_F_GROUP))
+ return NULL;
+ if (req_is_group_leader(req))
+ return req;
+ return req->grp_leader;
+}
+
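+/*
+ * Walk the leader's member list and mark every member that hasn't
+ * failed yet with REQ_F_FAIL and a -ECANCELED result.
+ */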
+void io_fail_group_members(struct io_kiocb *req)
+{
+ struct io_kiocb *member = req->grp_link;
+
+ while (member) {
+ struct io_kiocb *next = member->grp_link;
+
+ if (!(member->flags & REQ_F_FAIL)) {
+ req_set_fail(member);
+ io_req_set_res(member, -ECANCELED, 0);
+ }
+ member = next;
+ }
+}
+
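+/*
+ * Queue the leader's members for issuing: clear the leader's member
+ * list, point each member back at the leader and queue it via
+ * task_work, failing it if either the member or the leader has
+ * already failed.
+ */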
+static void io_queue_group_members(struct io_kiocb *req)
+{
+ struct io_kiocb *member = req->grp_link;
+
+ req->grp_link = NULL;
+ while (member) {
+ struct io_kiocb *next = member->grp_link;
+
+ member->grp_leader = req;
+ if (unlikely(member->flags & REQ_F_FAIL))
+ io_req_task_queue_fail(member, member->cqe.res);
+ else if (unlikely(req->flags & REQ_F_FAIL))
+ io_req_task_queue_fail(member, -ECANCELED);
+ else
+ io_req_task_queue(member);
+ member = next;
+ }
+}
+
+/* called only after the request is completed */
+static bool req_is_last_group_member(struct io_kiocb *req)
+{
+ return req->grp_leader != NULL;
+}
+
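+/*
+ * Per-request group completion: the leader just drops one of its group
+ * references, while a member posts its own CQE (unless skipped),
+ * propagates failure to the leader and drops the leader's reference;
+ * the member that drops the last reference also posts the leader's CQE.
+ */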
+static void io_complete_group_req(struct io_kiocb *req)
+{
+ struct io_kiocb *lead;
+
+ if (req_is_group_leader(req)) {
+ req->grp_refs--;
+ return;
+ }
+
+ lead = get_group_leader(req);
+
+ /* member CQE needs to be posted first */
+ if (!(req->flags & REQ_F_CQE_SKIP))
+ io_req_commit_cqe(req->ctx, req);
+
+	/* Mark the leader as failed if any member failed */
+	if (unlikely(req->flags & REQ_F_FAIL))
+		req_set_fail(lead);
+
+ WARN_ON_ONCE(lead->grp_refs <= 0);
+ if (!--lead->grp_refs) {
+		/*
+		 * We are the last member and ->grp_leader isn't cleared,
+		 * so the leader can still be found and freed together
+		 * with this last member.
+		 */
+ if (!(lead->flags & REQ_F_CQE_SKIP))
+ io_req_commit_cqe(lead->ctx, lead);
+	} else {
+		/*
+		 * Other members are still pending: clear ->grp_leader so
+		 * that this request isn't treated as the last member when
+		 * it is freed.
+		 */
+		req->grp_leader = NULL;
+	}
+}
+
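+/* role a group request plays when it is being freed */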
+enum group_mem {
+ GROUP_LEADER,
+ GROUP_LAST_MEMBER,
+ GROUP_OTHER_MEMBER,
+};
+
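+/*
+ * Prepare a group request for freeing and report its role to the
+ * caller: a leader queues its members first, and the last member hands
+ * the leader back so that it can be freed along with this member.
+ */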
+static enum group_mem io_prep_free_group_req(struct io_kiocb *req,
+ struct io_kiocb **leader)
+{
+	/*
+	 * Group completion is done, so clear the flag to avoid handling
+	 * it twice in case of io-wq.
+	 */
+ req->flags &= ~REQ_F_GROUP;
+
+ if (req_is_group_leader(req)) {
+ /* Queue members now */
+ if (req->grp_link)
+ io_queue_group_members(req);
+ return GROUP_LEADER;
+ }
+ if (!req_is_last_group_member(req))
+ return GROUP_OTHER_MEMBER;
+
+	/*
+	 * Prepare for freeing the leader, which can only be found from
+	 * the last member.
+	 */
+ *leader = req->grp_leader;
+ (*leader)->flags &= ~REQ_F_GROUP_LEADER;
+ req->grp_leader = NULL;
+ return GROUP_LAST_MEMBER;
+}
+
static void io_req_complete_post(struct io_kiocb *req, unsigned issue_flags)
{
struct io_ring_ctx *ctx = req->ctx;
* Handle special CQ sync cases via task_work. DEFER_TASKRUN requires
* the submitter task context, IOPOLL protects with uring_lock.
*/
- if (ctx->task_complete || (ctx->flags & IORING_SETUP_IOPOLL)) {
+ if (ctx->task_complete || (ctx->flags & IORING_SETUP_IOPOLL) ||
+ (req->flags & REQ_F_GROUP)) {
req->io_task_work.func = io_req_task_complete;
io_req_task_work_add(req);
return;
io_req_task_queue(nxt);
}
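+/*
+ * Batched-free handling for group requests: returns true if the request
+ * is a leader that must be skipped until its members have finished; for
+ * the last member the leader is chained into the free list right after
+ * it so the iterator frees both.
+ */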
+static bool io_group_complete(struct io_kiocb *req)
+{
+ struct io_kiocb *leader = NULL;
+ enum group_mem type = io_prep_free_group_req(req, &leader);
+
+ if (type == GROUP_LEADER) {
+ return true;
+ } else if (type == GROUP_LAST_MEMBER) {
+		/*
+		 * Link the leader in as the current request's next node;
+		 * this works because the iterator only ever looks at the
+		 * next node.
+		 *
+		 * Be careful if the iterator is changed in the future.
+		 */
+ wq_stack_add_head(&leader->comp_list, &req->comp_list);
+ }
+ return false;
+}
+
static void io_free_batch_list(struct io_ring_ctx *ctx,
struct io_wq_work_node *node)
__must_hold(&ctx->uring_lock)
comp_list);
if (unlikely(req->flags & IO_REQ_CLEAN_SLOW_FLAGS)) {
+ if (req->flags & REQ_F_GROUP) {
+ if (io_group_complete(req)) {
+ node = req->comp_list.next;
+ continue;
+ }
+ }
if (req->flags & REQ_F_REFCOUNT) {
node = req->comp_list.next;
if (!req_ref_put_and_test(req))
struct io_kiocb *req = container_of(node, struct io_kiocb,
comp_list);
- if (!(req->flags & REQ_F_CQE_SKIP))
- io_req_commit_cqe(ctx, req);
+ if (unlikely(req->flags & (REQ_F_CQE_SKIP | REQ_F_GROUP))) {
+ if (req->flags & REQ_F_GROUP) {
+ io_complete_group_req(req);
+ continue;
+ }
+
+ if (req->flags & REQ_F_CQE_SKIP)
+ continue;
+ }
+ io_req_commit_cqe(ctx, req);
}
__io_cq_unlock_post(ctx);
struct io_kiocb *cur;
/* need original cached_sq_head, but it was increased for each req */
- io_for_each_link(cur, req)
- seq--;
+ io_for_each_link(cur, req) {
+ if (req_is_group_leader(cur))
+ seq -= cur->grp_refs;
+ else
+ seq--;
+ }
return seq;
}
return def->prep(req, sqe);
}
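+/*
+ * Assemble a group at submission time, mirroring io_link_sqe(): the
+ * first SQE of a group becomes the leader, the following SQEs are
+ * appended to its member list, and the first SQE submitted without the
+ * group flag closes the group and returns the leader for queueing.
+ */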
+static struct io_kiocb *io_group_sqe(struct io_submit_link *group,
+ struct io_kiocb *req)
+{
+	/*
+	 * A group chain is similar to a link chain: it starts with the
+	 * first sqe that has REQ_F_GROUP set and ends with the first sqe
+	 * that does not.
+	 */
+ if (group->head) {
+ struct io_kiocb *lead = group->head;
+
+		/*
+		 * Members can't be in a link chain and can't be drained,
+		 * but the whole group can be linked or drained by setting
+		 * the flags on the group leader.
+		 *
+		 * IOSQE_CQE_SKIP_SUCCESS can't be set on members, for the
+		 * sake of simplicity.
+		 */
+ if (req->flags & (IO_REQ_LINK_FLAGS | REQ_F_IO_DRAIN |
+ REQ_F_CQE_SKIP))
+ req_fail_link_node(lead, -EINVAL);
+
+ lead->grp_refs += 1;
+ group->last->grp_link = req;
+ group->last = req;
+
+ if (req->flags & REQ_F_GROUP)
+ return NULL;
+
+ req->grp_link = NULL;
+ req->flags |= REQ_F_GROUP;
+ group->head = NULL;
+ return lead;
+ }
+
+ if (WARN_ON_ONCE(!(req->flags & REQ_F_GROUP)))
+ return req;
+ group->head = req;
+ group->last = req;
+ req->grp_refs = 1;
+ req->flags |= REQ_F_GROUP_LEADER;
+ return NULL;
+}
+
+static __cold struct io_kiocb *io_submit_fail_group(
+ struct io_submit_link *link, struct io_kiocb *req)
+{
+ struct io_kiocb *lead = link->head;
+
+	/*
+	 * Instead of failing eagerly, keep assembling the group if
+	 * applicable and mark the leader with REQ_F_FAIL; the group
+	 * flushing code will find the flag and handle the rest.
+	 */
+ if (lead && !(lead->flags & REQ_F_FAIL))
+ req_fail_link_node(lead, -ECANCELED);
+
+ return io_group_sqe(link, req);
+}
+
static __cold int io_submit_fail_link(struct io_submit_link *link,
struct io_kiocb *req, int ret)
{
{
struct io_ring_ctx *ctx = req->ctx;
struct io_submit_link *link = &ctx->submit_state.link;
+ struct io_submit_link *group = &ctx->submit_state.group;
trace_io_uring_req_failed(sqe, req, ret);
req_fail_link_node(req, ret);
+ if (group->head || (req->flags & REQ_F_GROUP)) {
+ req = io_submit_fail_group(group, req);
+ if (!req)
+ return 0;
+ }
+
/* cover both linked and non-linked request */
return io_submit_fail_link(link, req, ret);
}
return req;
}
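+/* true while a group is being assembled or this request starts one */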
+static inline bool io_group_assembling(const struct io_submit_state *state,
+				       const struct io_kiocb *req)
+{
+	return state->group.head || (req->flags & REQ_F_GROUP);
+}
+
+/* A failed request is also routed through the link path */
+static inline bool io_link_assembling(const struct io_submit_state *state,
+				      const struct io_kiocb *req)
+{
+	return state->link.head ||
+	       (req->flags & (IO_REQ_LINK_FLAGS | REQ_F_FORCE_ASYNC |
+			      REQ_F_FAIL));
+}
+
static inline int io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req,
const struct io_uring_sqe *sqe)
__must_hold(&ctx->uring_lock)
{
- struct io_submit_link *link = &ctx->submit_state.link;
+ struct io_submit_state *state = &ctx->submit_state;
int ret;
ret = io_init_req(ctx, req, sqe);
trace_io_uring_submit_req(req);
- if (unlikely(link->head || (req->flags & (IO_REQ_LINK_FLAGS |
- REQ_F_FORCE_ASYNC | REQ_F_FAIL)))) {
- req = io_link_sqe(link, req);
- if (!req)
- return 0;
+ if (unlikely(io_link_assembling(state, req) ||
+ io_group_assembling(state, req))) {
+ if (io_group_assembling(state, req)) {
+ req = io_group_sqe(&state->group, req);
+ if (!req)
+ return 0;
+ }
+
+ /* covers non-linked failed request too */
+ if (io_link_assembling(state, req)) {
+ req = io_link_sqe(&state->link, req);
+ if (!req)
+ return 0;
+ }
}
io_queue_sqe(req);
return 0;
{
struct io_submit_state *state = &ctx->submit_state;
- if (unlikely(state->link.head))
- io_queue_sqe_fallback(state->link.head);
+ if (unlikely(state->group.head || state->link.head)) {
+		/*
+		 * Submission ended mid-group: the last member still has
+		 * REQ_F_GROUP set; flush the unterminated group below.
+		 */
+ if (state->group.head) {
+ struct io_kiocb *lead = state->group.head;
+ struct io_kiocb *last = state->group.last;
+
+			/* a group consisting of only the leader is invalid */
+ if (unlikely(last == lead))
+ req_fail_link_node(lead, -EINVAL);
+
+ last->grp_link = NULL;
+ if (state->link.head)
+ io_link_sqe(&state->link, lead);
+ else
+ io_queue_sqe_fallback(lead);
+ }
+
+ if (unlikely(state->link.head))
+ io_queue_sqe_fallback(state->link.head);
+ }
+
/* flush only after queuing links as they can generate completions */
io_submit_flush_completions(ctx);
if (state->plug_started)
state->submit_nr = max_ios;
/* set only head, no need to init link_last in advance */
state->link.head = NULL;
+ state->group.head = NULL;
}
static void io_commit_sqring(struct io_ring_ctx *ctx)
IORING_FEAT_EXT_ARG | IORING_FEAT_NATIVE_WORKERS |
IORING_FEAT_RSRC_TAGS | IORING_FEAT_CQE_SKIP |
IORING_FEAT_LINKED_FILE | IORING_FEAT_REG_REG_RING |
- IORING_FEAT_RECVSEND_BUNDLE | IORING_FEAT_MIN_TIMEOUT;
+ IORING_FEAT_RECVSEND_BUNDLE | IORING_FEAT_MIN_TIMEOUT |
+ IORING_FEAT_SQE_GROUP;
if (copy_to_user(params, p, sizeof(*p))) {
ret = -EFAULT;