#include <linux/splice.h>
 #include <linux/task_work.h>
 #include <linux/pagemap.h>
+#include <linux/io_uring.h>
 
 #define CREATE_TRACE_POINTS
 #include <trace/events/io_uring.h>
         */
        struct fixed_file_data  *file_data;
        unsigned                nr_user_files;
-       int                     ring_fd;
-       struct file             *ring_file;
 
        /* if used, fixed mapped user buffers */
        unsigned                nr_user_bufs;
                WRITE_ONCE(cqe->user_data, req->user_data);
                WRITE_ONCE(cqe->res, res);
                WRITE_ONCE(cqe->flags, cflags);
-       } else if (ctx->cq_overflow_flushed) {
+       } else if (ctx->cq_overflow_flushed || req->task->io_uring->in_idle) {
+               /*
+                * If we're in ring overflow flush mode, or in task cancel mode,
+                * then we cannot store the request for later flushing; we have
+                * to drop it on the floor.
+                */
                WRITE_ONCE(ctx->rings->cq_overflow,
                                atomic_inc_return(&ctx->cached_cq_overflow));
        } else {
 
 static void __io_free_req_finish(struct io_kiocb *req)
 {
+       struct io_uring_task *tctx = req->task->io_uring;
        struct io_ring_ctx *ctx = req->ctx;
 
+       atomic_long_inc(&tctx->req_complete);
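+       /* wake a task waiting in __io_uring_task_cancel() to drain requests */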
+       if (tctx->in_idle)
+               wake_up(&tctx->wait);
        put_task_struct(req->task);
 
        if (likely(!io_is_fallback_req(req)))
        if (rb->to_free)
                __io_req_free_batch_flush(ctx, rb);
        if (rb->task) {
+               atomic_long_add(rb->task_refs, &rb->task->io_uring->req_complete);
                put_task_struct_many(rb->task, rb->task_refs);
                rb->task = NULL;
        }
                io_queue_next(req);
 
        if (req->task != rb->task) {
-               if (rb->task)
+               if (rb->task) {
+                       atomic_long_add(rb->task_refs, &rb->task->io_uring->req_complete);
                        put_task_struct_many(rb->task, rb->task_refs);
+               }
                rb->task = req->task;
                rb->task_refs = 0;
        }
                return -EBADF;
 
        req->close.fd = READ_ONCE(sqe->fd);
-       if ((req->file && req->file->f_op == &io_uring_fops) ||
-           req->close.fd == req->ctx->ring_fd)
+       if (req->file && req->file->f_op == &io_uring_fops)
                return -EBADF;
 
        req->close.put_file = NULL;
                wake_up(&ctx->inflight_wait);
        spin_unlock_irqrestore(&ctx->inflight_lock, flags);
        req->flags &= ~REQ_F_INFLIGHT;
+       put_files_struct(req->work.files);
        req->work.files = NULL;
 }
 
 
 static int io_grab_files(struct io_kiocb *req)
 {
-       int ret = -EBADF;
        struct io_ring_ctx *ctx = req->ctx;
 
        io_req_init_async(req);
 
        if (req->work.files || (req->flags & REQ_F_NO_FILE_TABLE))
                return 0;
-       if (!ctx->ring_file)
-               return -EBADF;
 
-       rcu_read_lock();
+       req->work.files = get_files_struct(current);
+       req->flags |= REQ_F_INFLIGHT;
+
        spin_lock_irq(&ctx->inflight_lock);
-       /*
-        * We use the f_ops->flush() handler to ensure that we can flush
-        * out work accessing these files if the fd is closed. Check if
-        * the fd has changed since we started down this path, and disallow
-        * this operation if it has.
-        */
-       if (fcheck(ctx->ring_fd) == ctx->ring_file) {
-               list_add(&req->inflight_entry, &ctx->inflight_list);
-               req->flags |= REQ_F_INFLIGHT;
-               req->work.files = current->files;
-               ret = 0;
-       }
+       list_add(&req->inflight_entry, &ctx->inflight_list);
        spin_unlock_irq(&ctx->inflight_lock);
-       rcu_read_unlock();
-
-       return ret;
+       return 0;
 }
 
 static inline int io_prep_work_files(struct io_kiocb *req)
        refcount_set(&req->refs, 2);
        req->task = current;
        get_task_struct(req->task);
+       atomic_long_inc(&req->task->io_uring->req_issue);
        req->result = 0;
 
        if (unlikely(req->opcode >= IORING_OP_LAST))
        return io_req_set_file(state, req, READ_ONCE(sqe->fd));
 }
 
-static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr,
-                         struct file *ring_file, int ring_fd)
+static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr)
 {
        struct io_submit_state state;
        struct io_kiocb *link = NULL;
 
        io_submit_state_start(&state, ctx, nr);
 
-       ctx->ring_fd = ring_fd;
-       ctx->ring_file = ring_file;
-
        for (i = 0; i < nr; i++) {
                const struct io_uring_sqe *sqe;
                struct io_kiocb *req;
 
                mutex_lock(&ctx->uring_lock);
                if (likely(!percpu_ref_is_dying(&ctx->refs)))
-                       ret = io_submit_sqes(ctx, to_submit, NULL, -1);
+                       ret = io_submit_sqes(ctx, to_submit);
                mutex_unlock(&ctx->uring_lock);
                timeout = jiffies + ctx->sq_thread_idle;
        }
        return ret;
 }
 
+static int io_uring_alloc_task_context(struct task_struct *task)
+{
+       struct io_uring_task *tctx;
+
+       tctx = kmalloc(sizeof(*tctx), GFP_KERNEL);
+       if (unlikely(!tctx))
+               return -ENOMEM;
+
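+       /*
+        * The xarray tracks which io_uring files this task has submitted to,
+        * and the issue/complete counters let cancelation wait for the task's
+        * inflight requests to finish.
+        */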
+       xa_init(&tctx->xa);
+       init_waitqueue_head(&tctx->wait);
+       tctx->last = NULL;
+       tctx->in_idle = 0;
+       atomic_long_set(&tctx->req_issue, 0);
+       atomic_long_set(&tctx->req_complete, 0);
+       task->io_uring = tctx;
+       return 0;
+}
+
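+/*
+ * Free the per-task io_uring context. By the time this runs, all task file
+ * notes should already have been dropped, hence the WARN_ON_ONCE() on a
+ * non-empty xarray.
+ */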
+void __io_uring_free(struct task_struct *tsk)
+{
+       struct io_uring_task *tctx = tsk->io_uring;
+
+       WARN_ON_ONCE(!xa_empty(&tctx->xa));
+       xa_destroy(&tctx->xa);
+       kfree(tctx);
+       tsk->io_uring = NULL;
+}
+
 static int io_sq_offload_start(struct io_ring_ctx *ctx,
                               struct io_uring_params *p)
 {
                        ctx->sqo_thread = NULL;
                        goto err;
                }
+               ret = io_uring_alloc_task_context(ctx->sqo_thread);
+               if (ret)
+                       goto err;
                wake_up_process(ctx->sqo_thread);
        } else if (p->flags & IORING_SETUP_SQ_AFF) {
                /* Can't have SQ_AFF without SQPOLL */
 {
        struct files_struct *files = data;
 
-       return work->files == files;
+       return !files || work->files == files;
 }
 
 /*
 
                spin_lock_irq(&ctx->inflight_lock);
                list_for_each_entry(req, &ctx->inflight_list, inflight_entry) {
-                       if (req->work.files != files)
+                       if (files && req->work.files != files)
                                continue;
                        /* req is being completed, ignore */
                        if (!refcount_inc_not_zero(&req->refs))
        return io_task_match(req, task);
 }
 
+static bool __io_uring_cancel_task_requests(struct io_ring_ctx *ctx,
+                                           struct task_struct *task,
+                                           struct files_struct *files)
+{
+       bool ret;
+
+       ret = io_uring_cancel_files(ctx, files);
+       if (!files) {
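+               /*
+                * No specific files to match: cancel io-wq work owned by the
+                * task, reap iopoll completions, and kill the task's poll and
+                * timeout requests.
+                */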
+               enum io_wq_cancel cret;
+
+               cret = io_wq_cancel_cb(ctx->io_wq, io_cancel_task_cb, task, true);
+               if (cret != IO_WQ_CANCEL_NOTFOUND)
+                       ret = true;
+
+               /* SQPOLL thread does its own polling */
+               if (!(ctx->flags & IORING_SETUP_SQPOLL)) {
+                       while (!list_empty_careful(&ctx->iopoll_list)) {
+                               io_iopoll_try_reap_events(ctx);
+                               ret = true;
+                       }
+               }
+
+               ret |= io_poll_remove_all(ctx, task);
+               ret |= io_kill_timeouts(ctx, task);
+       }
+
+       return ret;
+}
+
+/*
+ * We need to cancel requests iteratively, in case a request has dependent
+ * hard links. These persist even when cancelation fails, so keep looping
+ * until none are found.
+ */
+static void io_uring_cancel_task_requests(struct io_ring_ctx *ctx,
+                                         struct files_struct *files)
+{
+       struct task_struct *task = current;
+
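+       /* with SQPOLL, requests are issued by the sq thread, not by us */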
+       if (ctx->flags & IORING_SETUP_SQPOLL)
+               task = ctx->sqo_thread;
+
+       io_cqring_overflow_flush(ctx, true, task, files);
+
+       while (__io_uring_cancel_task_requests(ctx, task, files)) {
+               io_run_task_work();
+               cond_resched();
+       }
+}
+
+/*
+ * Note that this task has used io_uring. We use it for cancelation purposes.
+ */
+static int io_uring_add_task_file(struct file *file)
+{
+       if (unlikely(!current->io_uring)) {
+               int ret;
+
+               ret = io_uring_alloc_task_context(current);
+               if (unlikely(ret))
+                       return ret;
+       }
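+       /*
+        * tctx->last caches the most recently used ring file, so the xarray
+        * only needs to be consulted when the task switches rings.
+        */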
+       if (current->io_uring->last != file) {
+               XA_STATE(xas, &current->io_uring->xa, (unsigned long) file);
+               void *old;
+
+               rcu_read_lock();
+               old = xas_load(&xas);
+               if (old != file) {
+                       get_file(file);
+                       xas_lock(&xas);
+                       xas_store(&xas, file);
+                       xas_unlock(&xas);
+               }
+               rcu_read_unlock();
+               current->io_uring->last = file;
+       }
+
+       return 0;
+}
+
+/*
+ * Remove this io_uring_file -> task mapping.
+ */
+static void io_uring_del_task_file(struct file *file)
+{
+       struct io_uring_task *tctx = current->io_uring;
+       XA_STATE(xas, &tctx->xa, (unsigned long) file);
+
+       if (tctx->last == file)
+               tctx->last = NULL;
+
+       xas_lock(&xas);
+       file = xas_store(&xas, NULL);
+       xas_unlock(&xas);
+
+       if (file)
+               fput(file);
+}
+
+static void __io_uring_attempt_task_drop(struct file *file)
+{
+       XA_STATE(xas, &current->io_uring->xa, (unsigned long) file);
+       struct file *old;
+
+       rcu_read_lock();
+       old = xas_load(&xas);
+       rcu_read_unlock();
+
+       if (old == file)
+               io_uring_del_task_file(file);
+}
+
+/*
+ * Drop the task's note for this file if we're the only ones that would still
+ * hold it once the pending fput() completes.
+ */
+static void io_uring_attempt_task_drop(struct file *file, bool exiting)
+{
+       if (!current->io_uring)
+               return;
+       /*
+        * An fput() is pending, so f_count will be 2 if the only other
+        * reference is our potential task file note. If the task is exiting,
+        * drop the note regardless of the count.
+        */
+       if (!exiting && atomic_long_read(&file->f_count) != 2)
+               return;
+
+       __io_uring_attempt_task_drop(file);
+}
+
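+/*
+ * Cancel requests on every ring this task has a file note for. With a
+ * non-NULL @files, only requests using those files are canceled and each
+ * ring's task file note is dropped; a NULL @files cancels all of the task's
+ * requests.
+ */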
+void __io_uring_files_cancel(struct files_struct *files)
+{
+       struct io_uring_task *tctx = current->io_uring;
+       XA_STATE(xas, &tctx->xa, 0);
+
+       /* make sure overflow events are dropped */
+       tctx->in_idle = true;
+
+       do {
+               struct io_ring_ctx *ctx;
+               struct file *file;
+
+               xas_lock(&xas);
+               file = xas_next_entry(&xas, ULONG_MAX);
+               xas_unlock(&xas);
+
+               if (!file)
+                       break;
+
+               ctx = file->private_data;
+
+               io_uring_cancel_task_requests(ctx, files);
+               if (files)
+                       io_uring_del_task_file(file);
+       } while (1);
+}
+
+static inline bool io_uring_task_idle(struct io_uring_task *tctx)
+{
+       return atomic_long_read(&tctx->req_issue) ==
+               atomic_long_read(&tctx->req_complete);
+}
+
+/*
+ * Find any io_uring fd that this task has registered or done IO on, and cancel
+ * requests.
+ */
+void __io_uring_task_cancel(void)
+{
+       struct io_uring_task *tctx = current->io_uring;
+       DEFINE_WAIT(wait);
+       long completions;
+
+       /* make sure overflow events are dropped */
+       tctx->in_idle = true;
+
+       while (!io_uring_task_idle(tctx)) {
+               /* read completions before cancelations */
+               completions = atomic_long_read(&tctx->req_complete);
+               __io_uring_files_cancel(NULL);
+
+               prepare_to_wait(&tctx->wait, &wait, TASK_UNINTERRUPTIBLE);
+
+               /*
+                * If we've seen completions, retry. This avoids a race where
+                * a completion comes in before we did prepare_to_wait().
+                */
+               if (completions != atomic_long_read(&tctx->req_complete))
+                       continue;
+               if (io_uring_task_idle(tctx))
+                       break;
+               schedule();
+       }
+
+       finish_wait(&tctx->wait, &wait);
+       tctx->in_idle = false;
+}
+
 static int io_uring_flush(struct file *file, void *data)
 {
        struct io_ring_ctx *ctx = file->private_data;
 
-       io_uring_cancel_files(ctx, data);
-
        /*
         * If the task is going away, cancel work it may have pending
         */
        if (fatal_signal_pending(current) || (current->flags & PF_EXITING))
-               io_wq_cancel_cb(ctx->io_wq, io_cancel_task_cb, current, true);
+               data = NULL;
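+       /* a NULL files pointer cancels all of the task's pending requests */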
 
+       io_uring_cancel_task_requests(ctx, data);
+       io_uring_attempt_task_drop(file, !data);
        return 0;
 }
 
                        wake_up(&ctx->sqo_wait);
                submitted = to_submit;
        } else if (to_submit) {
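+               /* note the ring file in our task context before submitting */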
+               ret = io_uring_add_task_file(f.file);
+               if (unlikely(ret))
+                       goto out;
                mutex_lock(&ctx->uring_lock);
-               submitted = io_submit_sqes(ctx, to_submit, f.file, fd);
+               submitted = io_submit_sqes(ctx, to_submit);
                mutex_unlock(&ctx->uring_lock);
 
                if (submitted != to_submit)
        file = anon_inode_getfile("[io_uring]", &io_uring_fops, ctx,
                                        O_RDWR | O_CLOEXEC);
        if (IS_ERR(file)) {
+err_fd:
                put_unused_fd(ret);
                ret = PTR_ERR(file);
                goto err;
 #if defined(CONFIG_UNIX)
        ctx->ring_sock->file = file;
 #endif
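+       /* record the new ring file in the creating task's io_uring context */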
+       if (unlikely(io_uring_add_task_file(file))) {
+               file = ERR_PTR(-ENOMEM);
+               goto err_fd;
+       }
        fd_install(ret, file);
        return ret;
 err: