        struct completion ref_done;
 
+       unsigned long create_state;
+       struct callback_head create_work;
+       int create_index;
+
        struct rcu_head rcu;
 };
 
 static void io_wqe_inc_running(struct io_worker *worker)
 {
        struct io_wqe_acct *acct = io_wqe_get_acct(worker);
 
        atomic_inc(&acct->nr_running);
 }
 
-struct create_worker_data {
-       struct callback_head work;
-       struct io_wqe *wqe;
-       int index;
-};
-
 static void create_worker_cb(struct callback_head *cb)
 {
-       struct create_worker_data *cwd;
+       struct io_worker *worker;
        struct io_wq *wq;
        struct io_wqe *wqe;
        struct io_wqe_acct *acct;
        bool do_create = false, first = false;
 
-       cwd = container_of(cb, struct create_worker_data, work);
-       wqe = cwd->wqe;
+       worker = container_of(cb, struct io_worker, create_work);
+       wqe = worker->wqe;
        wq = wqe->wq;
-       acct = &wqe->acct[cwd->index];
+       acct = &wqe->acct[worker->create_index];
        raw_spin_lock_irq(&wqe->lock);
        if (acct->nr_workers < acct->max_workers) {
                if (!acct->nr_workers)
                        first = true;
                acct->nr_workers++;
                do_create = true;
        }
        raw_spin_unlock_irq(&wqe->lock);
        if (do_create) {
-               create_io_worker(wq, wqe, cwd->index, first);
+               create_io_worker(wq, wqe, worker->create_index, first);
        } else {
                atomic_dec(&acct->nr_running);
                io_worker_ref_put(wq);
        }
-       kfree(cwd);
+       clear_bit_unlock(0, &worker->create_state);
+       io_worker_release(worker);
 }
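
The container_of() conversion above works because create_work is now embedded directly in struct io_worker: subtracting the member's offset from the callback pointer recovers the enclosing worker, so no separate allocation is needed. A minimal standalone sketch of that mechanism, with a local container_of definition mirroring the kernel macro and a pared-down callback_head (both simplified stand-ins here, not the kernel definitions):

#include <stddef.h>
#include <stdio.h>

#define container_of(ptr, type, member) \
        ((type *)((char *)(ptr) - offsetof(type, member)))

struct callback_head {
        void (*func)(struct callback_head *);
};

struct io_worker {
        int create_index;
        struct callback_head create_work;       /* embedded, as in the patch */
};

static void cb(struct callback_head *head)
{
        /* Recover the io_worker that embeds this callback_head. */
        struct io_worker *worker = container_of(head, struct io_worker, create_work);

        printf("create_index = %d\n", worker->create_index);
}

int main(void)
{
        struct io_worker worker = { .create_index = 42 };

        worker.create_work.func = cb;
        worker.create_work.func(&worker.create_work);   /* prints 42 */
        return 0;
}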
 
-static void io_queue_worker_create(struct io_wqe *wqe, struct io_wqe_acct *acct)
+static void io_queue_worker_create(struct io_wqe *wqe, struct io_worker *worker,
+                                  struct io_wqe_acct *acct)
 {
-       struct create_worker_data *cwd;
        struct io_wq *wq = wqe->wq;
 
        /* raced with exit, just ignore create call */
        if (test_bit(IO_WQ_BIT_EXIT, &wq->state))
                goto fail;
+       if (!io_worker_get(worker))
+               goto fail;
+       /*
+        * create_state manages ownership of create_work/index. We should
+        * only need one entry per worker, as the worker going to sleep
+        * will trigger the condition, and waking will clear it once it
+        * runs the task_work.
+        */
+       if (test_bit(0, &worker->create_state) ||
+           test_and_set_bit_lock(0, &worker->create_state))
+               goto fail_release;
 
-       cwd = kmalloc(sizeof(*cwd), GFP_ATOMIC);
-       if (cwd) {
-               init_task_work(&cwd->work, create_worker_cb);
-               cwd->wqe = wqe;
-               cwd->index = acct->index;
-               if (!task_work_add(wq->task, &cwd->work, TWA_SIGNAL))
-                       return;
-
-               kfree(cwd);
-       }
+       init_task_work(&worker->create_work, create_worker_cb);
+       worker->create_index = acct->index;
+       if (!task_work_add(wq->task, &worker->create_work, TWA_SIGNAL))
+               return;
+       clear_bit_unlock(0, &worker->create_state);
+fail_release:
+       io_worker_release(worker);
 fail:
        atomic_dec(&acct->nr_running);
        io_worker_ref_put(wq);
 }
 
 static void io_wqe_dec_running(struct io_worker *worker)
        __must_hold(wqe->lock)
 {
        struct io_wqe_acct *acct = io_wqe_get_acct(worker);
        struct io_wqe *wqe = worker->wqe;
 
        if (!(worker->flags & IO_WORKER_F_UP))
                return;
 
        if (atomic_dec_and_test(&acct->nr_running) && io_wqe_run_queue(wqe)) {
                atomic_inc(&acct->nr_running);
                atomic_inc(&wqe->wq->worker_refs);
-               io_queue_worker_create(wqe, acct);
+               io_queue_worker_create(wqe, worker, acct);
        }
 }
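
The create_state comment above describes the kernel bit-lock idiom: a plain test_bit() as an unlocked fast path, test_and_set_bit_lock() with acquire semantics to claim ownership, and clear_bit_unlock() with release semantics to hand it back. A rough userspace sketch of the same handshake, assuming C11 atomics in place of the kernel bitops (try_claim() and release() are illustrative names, not from the patch):

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

/* Bit 0 plays the role of worker->create_state; zero-initialized. */
static atomic_ulong create_state;

/* test_bit() fast path + test_and_set_bit_lock(): returns true if we
 * now own the pending-create slot, false if someone else holds it. */
static bool try_claim(void)
{
        if (atomic_load_explicit(&create_state, memory_order_relaxed) & 1UL)
                return false;   /* already claimed, skip the atomic RMW */
        return !(atomic_fetch_or_explicit(&create_state, 1UL,
                                          memory_order_acquire) & 1UL);
}

/* clear_bit_unlock(): drop ownership so the next claim can succeed. */
static void release(void)
{
        atomic_fetch_and_explicit(&create_state, ~1UL, memory_order_release);
}

int main(void)
{
        printf("first claim:   %d\n", try_claim());     /* 1: we own it */
        printf("second claim:  %d\n", try_claim());     /* 0: still held */
        release();
        printf("after release: %d\n", try_claim());     /* 1: ours again */
        return 0;
}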
 
 
 static bool io_task_work_match(struct callback_head *cb, void *data)
 {
-       struct create_worker_data *cwd;
+       struct io_worker *worker;
 
        if (cb->func != create_worker_cb)
                return false;
-       cwd = container_of(cb, struct create_worker_data, work);
-       return cwd->wqe->wq == data;
+       worker = container_of(cb, struct io_worker, create_work);
+       return worker->wqe->wq == data;
 }
 
 void io_wq_exit_start(struct io_wq *wq)
 {
        set_bit(IO_WQ_BIT_EXIT, &wq->state);
 }
 
 static void io_wq_exit_workers(struct io_wq *wq)
 {
        struct callback_head *cb;
        int node;
 
        if (!wq->task)
                return;
 
        while ((cb = task_work_cancel_match(wq->task, io_task_work_match, wq)) != NULL) {
-               struct create_worker_data *cwd;
+               struct io_worker *worker;
 
-               cwd = container_of(cb, struct create_worker_data, work);
-               atomic_dec(&cwd->wqe->acct[cwd->index].nr_running);
+               worker = container_of(cb, struct io_worker, create_work);
+               atomic_dec(&worker->wqe->acct[worker->create_index].nr_running);
                io_worker_ref_put(wq);
-               kfree(cwd);
+               clear_bit_unlock(0, &worker->create_state);
+               io_worker_release(worker);
        }
 
        rcu_read_lock();