struct file             **files;
 };
 
+enum {
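+       /* a switch of ->refs to atomic mode is in progress */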
+       FFD_F_ATOMIC,
+};
+
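+/*
+ * Backing data for the fixed (registered) file set. ->refs tracks how
+ * many inflight requests use the table; removed files are queued on
+ * ->put_llist and put from ->ref_work once the ref, switched to atomic
+ * mode, drops to zero.
+ */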
+struct fixed_file_data {
+       struct fixed_file_table         *table;
+       struct io_ring_ctx              *ctx;
+
+       struct percpu_ref               refs;
+       struct llist_head               put_llist;
+       unsigned long                   state;
+       struct work_struct              ref_work;
+       struct completion               done;
+};
+
 struct io_ring_ctx {
        struct {
                struct percpu_ref       refs;
         * readers must ensure that ->refs is alive as long as the file* is
         * used. Only updated through io_uring_register(2).
         */
-       struct fixed_file_table *file_table;
+       struct fixed_file_data  *file_data;
        unsigned                nr_user_files;
 
        /* if used, fixed mapped user buffers */
        int                             flags;
 };
 
+struct io_files_update {
+       struct file                     *file;
+       u64                             arg;
+       u32                             nr_args;
+       u32                             offset;
+};
+
 struct io_async_connect {
        struct sockaddr_storage         address;
 };
                struct io_sr_msg        sr_msg;
                struct io_open          open;
                struct io_close         close;
+               struct io_files_update  files_update;
        };
 
        struct io_async_ctx             *io;
 static void __io_double_put_req(struct io_kiocb *req);
 static struct io_kiocb *io_prep_linked_timeout(struct io_kiocb *req);
 static void io_queue_linked_timeout(struct io_kiocb *req);
+static int __io_sqe_files_update(struct io_ring_ctx *ctx,
+                                struct io_uring_files_update *up,
+                                unsigned nr_args);
 
 static struct kmem_cache *req_cachep;
 
        if (*nr) {
                kmem_cache_free_bulk(req_cachep, *nr, reqs);
                percpu_ref_put_many(&ctx->refs, *nr);
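+               /*
+                * every batched req is assumed to hold a ref on the fixed
+                * file set; drop them all in one go
+                */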
+               percpu_ref_put_many(&ctx->file_data->refs, *nr);
                *nr = 0;
        }
 }
 
        if (req->io)
                kfree(req->io);
-       if (req->file && !(req->flags & REQ_F_FIXED_FILE))
-               fput(req->file);
+       if (req->file) {
+               if (req->flags & REQ_F_FIXED_FILE)
+                       percpu_ref_put(&ctx->file_data->refs);
+               else
+                       fput(req->file);
+       }
        if (req->flags & REQ_F_INFLIGHT) {
                unsigned long flags;
 
        return 0;
 }
 
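+/*
+ * IORING_OP_FILES_UPDATE: update the registered file set from an sqe.
+ * sqe->addr points to an array of __s32 descriptors, sqe->len holds the
+ * number of entries, and sqe->off is the offset into the fixed file
+ * table to start updating at. An fd of -1 clears the given slot.
+ */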
+static int io_files_update_prep(struct io_kiocb *req,
+                               const struct io_uring_sqe *sqe)
+{
+       if (sqe->flags || sqe->ioprio || sqe->rw_flags)
+               return -EINVAL;
+
+       req->files_update.offset = READ_ONCE(sqe->off);
+       req->files_update.nr_args = READ_ONCE(sqe->len);
+       if (!req->files_update.nr_args)
+               return -EINVAL;
+       req->files_update.arg = READ_ONCE(sqe->addr);
+       return 0;
+}
+
+static int io_files_update(struct io_kiocb *req, bool force_nonblock)
+{
+       struct io_ring_ctx *ctx = req->ctx;
+       struct io_uring_files_update up;
+       int ret;
+
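+       /*
+        * __io_sqe_files_update() takes ->uring_lock and may sleep, so we
+        * can't run the update from the non-blocking submission path; punt
+        * to async context instead.
+        */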
+       if (force_nonblock) {
+               req->work.flags |= IO_WQ_WORK_NEEDS_FILES;
+               return -EAGAIN;
+       }
+
+       up.offset = req->files_update.offset;
+       up.fds = req->files_update.arg;
+
+       mutex_lock(&ctx->uring_lock);
+       ret = __io_sqe_files_update(ctx, &up, req->files_update.nr_args);
+       mutex_unlock(&ctx->uring_lock);
+
+       if (ret < 0)
+               req_set_fail_links(req);
+       io_cqring_add_event(req, ret);
+       io_put_req(req);
+       return 0;
+}
+
 static int io_req_defer_prep(struct io_kiocb *req,
                             const struct io_uring_sqe *sqe)
 {
        case IORING_OP_CLOSE:
                ret = io_close_prep(req, sqe);
                break;
+       case IORING_OP_FILES_UPDATE:
+               ret = io_files_update_prep(req, sqe);
+               break;
        default:
                printk_once(KERN_WARNING "io_uring: unhandled opcode %d\n",
                                req->opcode);
                }
                ret = io_close(req, nxt, force_nonblock);
                break;
+       case IORING_OP_FILES_UPDATE:
+               if (sqe) {
+                       ret = io_files_update_prep(req, sqe);
+                       if (ret)
+                               break;
+               }
+               ret = io_files_update(req, force_nonblock);
+               break;
        default:
                ret = -EINVAL;
                break;
 {
        struct fixed_file_table *table;
 
-       table = &ctx->file_table[index >> IORING_FILE_TABLE_SHIFT];
-       return table->files[index & IORING_FILE_TABLE_MASK];
+       table = &ctx->file_data->table[index >> IORING_FILE_TABLE_SHIFT];
+       return table->files[index & IORING_FILE_TABLE_MASK];
 }
 
 static int io_req_set_file(struct io_submit_state *state, struct io_kiocb *req,
                return ret;
 
        if (flags & IOSQE_FIXED_FILE) {
-               if (unlikely(!ctx->file_table ||
+               if (unlikely(!ctx->file_data ||
                    (unsigned) fd >= ctx->nr_user_files))
                        return -EBADF;
                fd = array_index_nospec(fd, ctx->nr_user_files);
                if (!req->file)
                        return -EBADF;
                req->flags |= REQ_F_FIXED_FILE;
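+               /*
+                * pin the fixed file set for the lifetime of this request;
+                * the ref is dropped in the request free path
+                */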
+               percpu_ref_get(&ctx->file_data->refs);
        } else {
                if (req->needs_fixed_file)
                        return -EBADF;
 #endif
 }
 
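+/*
+ * percpu_ref kill confirmation: wakes io_sqe_files_unregister(), which
+ * waits on ->done before tearing the table down.
+ */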
+static void io_file_ref_kill(struct percpu_ref *ref)
+{
+       struct fixed_file_data *data;
+
+       data = container_of(ref, struct fixed_file_data, refs);
+       complete(&data->done);
+}
+
 static int io_sqe_files_unregister(struct io_ring_ctx *ctx)
 {
+       struct fixed_file_data *data = ctx->file_data;
        unsigned nr_tables, i;
 
-       if (!ctx->file_table)
+       if (!data)
                return -ENXIO;
 
+       /* protect against inflight atomic switch, which drops the ref */
+       flush_work(&data->ref_work);
+       percpu_ref_get(&data->refs);
+       percpu_ref_kill_and_confirm(&data->refs, io_file_ref_kill);
+       wait_for_completion(&data->done);
+       percpu_ref_put(&data->refs);
+       percpu_ref_exit(&data->refs);
+
        __io_sqe_files_unregister(ctx);
        nr_tables = DIV_ROUND_UP(ctx->nr_user_files, IORING_MAX_FILES_TABLE);
        for (i = 0; i < nr_tables; i++)
-               kfree(ctx->file_table[i].files);
-       kfree(ctx->file_table);
-       ctx->file_table = NULL;
+               kfree(data->table[i].files);
+       kfree(data->table);
+       kfree(data);
+       ctx->file_data = NULL;
        ctx->nr_user_files = 0;
        return 0;
 }
 }
 
 #if defined(CONFIG_UNIX)
-static void io_destruct_skb(struct sk_buff *skb)
-{
-       struct io_ring_ctx *ctx = skb->sk->sk_user_data;
-
-       if (ctx->io_wq)
-               io_wq_flush(ctx->io_wq);
-
-       unix_destruct_scm(skb);
-}
-
 /*
  * Ensure the UNIX gc is aware of our file set, so we are certain that
  * the io_uring can be safely unregistered on process exit, even if we have
                fpl->max = SCM_MAX_FD;
                fpl->count = nr_files;
                UNIXCB(skb).fp = fpl;
-               skb->destructor = io_destruct_skb;
+               skb->destructor = unix_destruct_scm;
                refcount_add(skb->truesize, &sk->sk_wmem_alloc);
                skb_queue_head(&sk->sk_receive_queue, skb);
 
        int i;
 
        for (i = 0; i < nr_tables; i++) {
-               struct fixed_file_table *table = &ctx->file_table[i];
+               struct fixed_file_table *table = &ctx->file_data->table[i];
                unsigned this_files;
 
                this_files = min(nr_files, IORING_MAX_FILES_TABLE);
                return 0;
 
        for (i = 0; i < nr_tables; i++) {
-               struct fixed_file_table *table = &ctx->file_table[i];
+               struct fixed_file_table *table = &ctx->file_data->table[i];
                kfree(table->files);
        }
        return 1;
 }
 
+static void io_ring_file_put(struct io_ring_ctx *ctx, struct file *file)
+{
+#if defined(CONFIG_UNIX)
+       struct sock *sock = ctx->ring_sock->sk;
+       struct sk_buff_head list, *head = &sock->sk_receive_queue;
+       struct sk_buff *skb;
+       int i;
+
+       __skb_queue_head_init(&list);
+
+       /*
+        * Find the skb that holds this file in its SCM_RIGHTS. When found,
+        * remove this entry and rearrange the file array.
+        */
+       skb = skb_dequeue(head);
+       while (skb) {
+               struct scm_fp_list *fp;
+
+               fp = UNIXCB(skb).fp;
+               for (i = 0; i < fp->count; i++) {
+                       int left;
+
+                       if (fp->fp[i] != file)
+                               continue;
+
+                       unix_notinflight(fp->user, fp->fp[i]);
+                       left = fp->count - 1 - i;
+                       if (left) {
+                               memmove(&fp->fp[i], &fp->fp[i + 1],
+                                               left * sizeof(struct file *));
+                       }
+                       fp->count--;
+                       if (!fp->count) {
+                               kfree_skb(skb);
+                               skb = NULL;
+                       } else {
+                               __skb_queue_tail(&list, skb);
+                       }
+                       fput(file);
+                       file = NULL;
+                       break;
+               }
+
+               if (!file)
+                       break;
+
+               __skb_queue_tail(&list, skb);
+
+               skb = skb_dequeue(head);
+       }
+
+       if (skb_peek(&list)) {
+               spin_lock_irq(&head->lock);
+               while ((skb = __skb_dequeue(&list)) != NULL)
+                       __skb_queue_tail(head, skb);
+               spin_unlock_irq(&head->lock);
+       }
+#else
+       fput(file);
+#endif
+}
+
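+/* one pending file put; ->done is only set on the on-stack fallback path */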
+struct io_file_put {
+       struct llist_node llist;
+       struct file *file;
+       struct completion *done;
+};
+
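+/*
+ * Runs off the system workqueue once ->refs hits zero in atomic mode:
+ * drain every queued file removal, then take a new ref and flip the ref
+ * back to percpu mode for the fast path.
+ */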
+static void io_ring_file_ref_switch(struct work_struct *work)
+{
+       struct io_file_put *pfile, *tmp;
+       struct fixed_file_data *data;
+       struct llist_node *node;
+
+       data = container_of(work, struct fixed_file_data, ref_work);
+
+       while ((node = llist_del_all(&data->put_llist)) != NULL) {
+               llist_for_each_entry_safe(pfile, tmp, node, llist) {
+                       io_ring_file_put(data->ctx, pfile->file);
+                       if (pfile->done)
+                               complete(pfile->done);
+                       else
+                               kfree(pfile);
+               }
+       }
+
+       percpu_ref_get(&data->refs);
+       percpu_ref_switch_to_percpu(&data->refs);
+}
+
+static void io_file_data_ref_zero(struct percpu_ref *ref)
+{
+       struct fixed_file_data *data;
+
+       data = container_of(ref, struct fixed_file_data, refs);
+
+       /* we can't safely switch from inside this context, punt to wq */
+       queue_work(system_wq, &data->ref_work);
+}
+
 static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,
                                 unsigned nr_args)
 {
        __s32 __user *fds = (__s32 __user *) arg;
        unsigned nr_tables;
+       struct file *file;
        int fd, ret = 0;
        unsigned i;
 
-       if (ctx->file_table)
+       if (ctx->file_data)
                return -EBUSY;
        if (!nr_args)
                return -EINVAL;
        if (nr_args > IORING_MAX_FIXED_FILES)
                return -EMFILE;
 
+       ctx->file_data = kzalloc(sizeof(*ctx->file_data), GFP_KERNEL);
+       if (!ctx->file_data)
+               return -ENOMEM;
+       ctx->file_data->ctx = ctx;
+       init_completion(&ctx->file_data->done);
+
        nr_tables = DIV_ROUND_UP(nr_args, IORING_MAX_FILES_TABLE);
-       ctx->file_table = kcalloc(nr_tables, sizeof(struct fixed_file_table),
+       ctx->file_data->table = kcalloc(nr_tables,
+                                       sizeof(struct fixed_file_table),
                                        GFP_KERNEL);
-       if (!ctx->file_table)
+       if (!ctx->file_data->table) {
+               kfree(ctx->file_data);
+               ctx->file_data = NULL;
                return -ENOMEM;
+       }
+
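+       /*
+        * ALLOW_REINIT keeps the percpu counters around so ->refs can be
+        * switched back to percpu mode after each update cycle drains it.
+        */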
+       if (percpu_ref_init(&ctx->file_data->refs, io_file_data_ref_zero,
+                               PERCPU_REF_ALLOW_REINIT, GFP_KERNEL)) {
+               kfree(ctx->file_data->table);
+               kfree(ctx->file_data);
+               ctx->file_data = NULL;
+               return -ENOMEM;
+       }
+       init_llist_head(&ctx->file_data->put_llist);
+       INIT_WORK(&ctx->file_data->ref_work, io_ring_file_ref_switch);
 
        if (io_sqe_alloc_file_tables(ctx, nr_tables, nr_args)) {
-               kfree(ctx->file_table);
-               ctx->file_table = NULL;
+               percpu_ref_exit(&ctx->file_data->refs);
+               kfree(ctx->file_data->table);
+               kfree(ctx->file_data);
+               ctx->file_data = NULL;
                return -ENOMEM;
        }
 
                        continue;
                }
 
-               table = &ctx->file_table[i >> IORING_FILE_TABLE_SHIFT];
+               table = &ctx->file_data->table[i >> IORING_FILE_TABLE_SHIFT];
                index = i & IORING_FILE_TABLE_MASK;
-               table->files[index] = fget(fd);
+               file = fget(fd);
 
                ret = -EBADF;
-               if (!table->files[index])
+               if (!file)
                        break;
+
                /*
                 * Don't allow io_uring instances to be registered. If UNIX
                 * isn't enabled, then this causes a reference cycle and this
                 * handle it just fine, but there's still no point in allowing
                 * a ring fd as it doesn't support regular read/write anyway.
                 */
-               if (table->files[index]->f_op == &io_uring_fops) {
-                       fput(table->files[index]);
+               if (file->f_op == &io_uring_fops) {
+                       fput(file);
                        break;
                }
                ret = 0;
+               table->files[index] = file;
        }
 
        if (ret) {
                for (i = 0; i < ctx->nr_user_files; i++) {
-                       struct file *file;
-
                        file = io_file_from_index(ctx, i);
                        if (file)
                                fput(file);
                }
                for (i = 0; i < nr_tables; i++)
-                       kfree(ctx->file_table[i].files);
+                       kfree(ctx->file_data->table[i].files);
 
-               kfree(ctx->file_table);
-               ctx->file_table = NULL;
+               kfree(ctx->file_data->table);
+               kfree(ctx->file_data);
+               ctx->file_data = NULL;
                ctx->nr_user_files = 0;
                return ret;
        }
        return ret;
 }
 
-static void io_sqe_file_unregister(struct io_ring_ctx *ctx, int index)
-{
-#if defined(CONFIG_UNIX)
-       struct file *file = io_file_from_index(ctx, index);
-       struct sock *sock = ctx->ring_sock->sk;
-       struct sk_buff_head list, *head = &sock->sk_receive_queue;
-       struct sk_buff *skb;
-       int i;
-
-       __skb_queue_head_init(&list);
-
-       /*
-        * Find the skb that holds this file in its SCM_RIGHTS. When found,
-        * remove this entry and rearrange the file array.
-        */
-       skb = skb_dequeue(head);
-       while (skb) {
-               struct scm_fp_list *fp;
-
-               fp = UNIXCB(skb).fp;
-               for (i = 0; i < fp->count; i++) {
-                       int left;
-
-                       if (fp->fp[i] != file)
-                               continue;
-
-                       unix_notinflight(fp->user, fp->fp[i]);
-                       left = fp->count - 1 - i;
-                       if (left) {
-                               memmove(&fp->fp[i], &fp->fp[i + 1],
-                                               left * sizeof(struct file *));
-                       }
-                       fp->count--;
-                       if (!fp->count) {
-                               kfree_skb(skb);
-                               skb = NULL;
-                       } else {
-                               __skb_queue_tail(&list, skb);
-                       }
-                       fput(file);
-                       file = NULL;
-                       break;
-               }
-
-               if (!file)
-                       break;
-
-               __skb_queue_tail(&list, skb);
-
-               skb = skb_dequeue(head);
-       }
-
-       if (skb_peek(&list)) {
-               spin_lock_irq(&head->lock);
-               while ((skb = __skb_dequeue(&list)) != NULL)
-                       __skb_queue_tail(head, skb);
-               spin_unlock_irq(&head->lock);
-       }
-#else
-       fput(io_file_from_index(ctx, index));
-#endif
-}
-
 static int io_sqe_file_register(struct io_ring_ctx *ctx, struct file *file,
                                int index)
 {
 #endif
 }
 
-static int io_sqe_files_update(struct io_ring_ctx *ctx, void __user *arg,
-                              unsigned nr_args)
+static void io_atomic_switch(struct percpu_ref *ref)
 {
-       struct io_uring_files_update up;
+       struct fixed_file_data *data;
+
+       data = container_of(ref, struct fixed_file_data, refs);
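+       /* the atomic switch has completed, allow a new one to be started */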
+       clear_bit(FFD_F_ATOMIC, &data->state);
+}
+
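+/*
+ * Queue @file so it's put once all fixed file refs are gone. Returns
+ * true if the caller still needs to trigger the atomic ref switch,
+ * false if we fell back to waiting for the removal synchronously.
+ */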
+static bool io_queue_file_removal(struct fixed_file_data *data,
+                                 struct file *file)
+{
+       struct io_file_put *pfile, pfile_stack;
+       DECLARE_COMPLETION_ONSTACK(done);
+
+       /*
+        * If we fail allocating the struct we need for doing async removal
+        * of this file, just punt to sync and wait for it.
+        */
+       pfile = kzalloc(sizeof(*pfile), GFP_KERNEL);
+       if (!pfile) {
+               pfile = &pfile_stack;
+               pfile->done = &done;
+       }
+
+       pfile->file = file;
+       llist_add(&pfile->llist, &data->put_llist);
+
+       if (pfile == &pfile_stack) {
+               if (!test_and_set_bit(FFD_F_ATOMIC, &data->state)) {
+                       percpu_ref_put(&data->refs);
+                       percpu_ref_switch_to_atomic(&data->refs,
+                                                       io_atomic_switch);
+               }
+               wait_for_completion(&done);
+               flush_work(&data->ref_work);
+               return false;
+       }
+
+       return true;
+}
+
+static int __io_sqe_files_update(struct io_ring_ctx *ctx,
+                                struct io_uring_files_update *up,
+                                unsigned nr_args)
+{
+       struct fixed_file_data *data = ctx->file_data;
+       bool ref_switch = false;
+       struct file *file;
        __s32 __user *fds;
        int fd, i, err;
        __u32 done;
 
-       if (!ctx->file_table)
-               return -ENXIO;
-       if (!nr_args)
-               return -EINVAL;
-       if (copy_from_user(&up, arg, sizeof(up)))
-               return -EFAULT;
-       if (up.resv)
-               return -EINVAL;
-       if (check_add_overflow(up.offset, nr_args, &done))
+       if (check_add_overflow(up->offset, nr_args, &done))
                return -EOVERFLOW;
        if (done > ctx->nr_user_files)
                return -EINVAL;
 
        done = 0;
-       fds = u64_to_user_ptr(up.fds);
+       fds = u64_to_user_ptr(up->fds);
        while (nr_args) {
                struct fixed_file_table *table;
                unsigned index;
                        err = -EFAULT;
                        break;
                }
-               i = array_index_nospec(up.offset, ctx->nr_user_files);
-               table = &ctx->file_table[i >> IORING_FILE_TABLE_SHIFT];
+               i = array_index_nospec(up->offset, ctx->nr_user_files);
+               table = &ctx->file_data->table[i >> IORING_FILE_TABLE_SHIFT];
                index = i & IORING_FILE_TABLE_MASK;
                if (table->files[index]) {
-                       io_sqe_file_unregister(ctx, i);
+                       file = io_file_from_index(ctx, i);
                        table->files[index] = NULL;
+                       if (io_queue_file_removal(data, file))
+                               ref_switch = true;
                }
                if (fd != -1) {
-                       struct file *file;
-
                        file = fget(fd);
                        if (!file) {
                                err = -EBADF;
                }
                nr_args--;
                done++;
-               up.offset++;
+               up->offset++;
+       }
+
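+       /*
+        * Switching to atomic mode lets ->refs hit zero once the inflight
+        * requests are done, which queues the work that actually puts the
+        * removed files.
+        */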
+       if (ref_switch && !test_and_set_bit(FFD_F_ATOMIC, &data->state)) {
+               percpu_ref_put(&data->refs);
+               percpu_ref_switch_to_atomic(&data->refs, io_atomic_switch);
        }
 
        return done ? done : err;
 }
+
+static int io_sqe_files_update(struct io_ring_ctx *ctx, void __user *arg,
+                              unsigned nr_args)
+{
+       struct io_uring_files_update up;
+
+       if (!ctx->file_data)
+               return -ENXIO;
+       if (!nr_args)
+               return -EINVAL;
+       if (copy_from_user(&up, arg, sizeof(up)))
+               return -EFAULT;
+       if (up.resv)
+               return -EINVAL;
+
+       return __io_sqe_files_update(ctx, &up, nr_args);
+}
 
 static void io_put_work(struct io_wq_work *work)
 {
 
 #if defined(CONFIG_UNIX)
        ctx->ring_sock->file = file;
-       ctx->ring_sock->sk->sk_user_data = ctx;
 #endif
        fd_install(ret, file);
        return ret;
        if (percpu_ref_is_dying(&ctx->refs))
                return -ENXIO;
 
-       percpu_ref_kill(&ctx->refs);
+       if (opcode != IORING_UNREGISTER_FILES &&
+           opcode != IORING_REGISTER_FILES_UPDATE) {
+               percpu_ref_kill(&ctx->refs);
 
-       /*
-        * Drop uring mutex before waiting for references to exit. If another
-        * thread is currently inside io_uring_enter() it might need to grab
-        * the uring_lock to make progress. If we hold it here across the drain
-        * wait, then we can deadlock. It's safe to drop the mutex here, since
-        * no new references will come in after we've killed the percpu ref.
-        */
-       mutex_unlock(&ctx->uring_lock);
-       wait_for_completion(&ctx->completions[0]);
-       mutex_lock(&ctx->uring_lock);
+               /*
+                * Drop uring mutex before waiting for references to exit. If
+                * another thread is currently inside io_uring_enter() it might
+                * need to grab the uring_lock to make progress. If we hold it
+                * here across the drain wait, then we can deadlock. It's safe
+                * to drop the mutex here, since no new references will come in
+                * after we've killed the percpu ref.
+                */
+               mutex_unlock(&ctx->uring_lock);
+               wait_for_completion(&ctx->completions[0]);
+               mutex_lock(&ctx->uring_lock);
+       }
 
        switch (opcode) {
        case IORING_REGISTER_BUFFERS:
                break;
        }
 
-       /* bring the ctx back to life */
-       reinit_completion(&ctx->completions[0]);
-       percpu_ref_reinit(&ctx->refs);
+       if (opcode != IORING_UNREGISTER_FILES &&
+           opcode != IORING_REGISTER_FILES_UPDATE) {
+               /* bring the ctx back to life */
+               reinit_completion(&ctx->completions[0]);
+               percpu_ref_reinit(&ctx->refs);
+       }
        return ret;
 }