         * The application needs a full memory barrier before checking
         * for IORING_SQ_NEED_WAKEUP after updating the sq tail.
         */
-       u32                     sq_flags;
+       atomic_t                sq_flags;
        /*
         * Runtime CQ flags
         *
        all_flushed = list_empty(&ctx->cq_overflow_list);
        if (all_flushed) {
                clear_bit(IO_CHECK_CQ_OVERFLOW_BIT, &ctx->check_cq);
-               WRITE_ONCE(ctx->rings->sq_flags,
-                          ctx->rings->sq_flags & ~IORING_SQ_CQ_OVERFLOW);
+               atomic_andnot(IORING_SQ_CQ_OVERFLOW, &ctx->rings->sq_flags);
        }
 
        io_commit_cqring(ctx);
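
The IORING_SQ_CQ_OVERFLOW bit cleared here and set in the next hunk is consumed through the same shared flags word. A hedged sketch of the consumer side (flush_overflowed_cqes() is an illustrative helper, not liburing API): once the bit is observed, entering the kernel with IORING_ENTER_GETEVENTS lets it flush the overflow list back into the CQ ring.

#include <linux/io_uring.h>
#include <sys/syscall.h>
#include <unistd.h>

static void flush_overflowed_cqes(int ring_fd, unsigned int *kflags)
{
	/* the kernel parked completions on its overflow list; ask for a flush */
	if (__atomic_load_n(kflags, __ATOMIC_RELAXED) & IORING_SQ_CQ_OVERFLOW)
		syscall(__NR_io_uring_enter, ring_fd, 0, 0,
			IORING_ENTER_GETEVENTS, NULL, 0);
}
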
        }
        if (list_empty(&ctx->cq_overflow_list)) {
                set_bit(IO_CHECK_CQ_OVERFLOW_BIT, &ctx->check_cq);
-               WRITE_ONCE(ctx->rings->sq_flags,
-                          ctx->rings->sq_flags | IORING_SQ_CQ_OVERFLOW);
+               atomic_or(IORING_SQ_CQ_OVERFLOW, &ctx->rings->sq_flags);
 
        }
        ocqe->cqe.user_data = user_data;
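
The common thread in these conversions: the old code did a plain read-modify-write of rings->sq_flags and relied on ctx->completion_lock to keep concurrent updates from clobbering each other, while atomic_or()/atomic_andnot() turn each update into a single atomic RMW that needs no lock. A small userspace analogue of that property, using C11 atomics purely for illustration (no relation to kernel internals):

#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

#define FLAG_A (1u << 0)
#define FLAG_B (1u << 1)

static _Atomic unsigned int flags = FLAG_B;

static void *set_a(void *arg)
{
	(void)arg;
	/* single atomic RMW: cannot lose a concurrent clear of FLAG_B */
	atomic_fetch_or(&flags, FLAG_A);
	return NULL;
}

static void *clear_b(void *arg)
{
	(void)arg;
	/* single atomic RMW: cannot lose a concurrent set of FLAG_A */
	atomic_fetch_and(&flags, ~FLAG_B);
	return NULL;
}

int main(void)
{
	pthread_t t1, t2;

	pthread_create(&t1, NULL, set_a, NULL);
	pthread_create(&t2, NULL, clear_b, NULL);
	pthread_join(t1, NULL);
	pthread_join(t2, NULL);

	/* always 0x1; an unlocked load/or/store pair could leave 0x0 or 0x3 */
	printf("flags = 0x%x\n", atomic_load(&flags));
	return 0;
}
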
        return READ_ONCE(sqd->state);
 }
 
-static inline void io_ring_set_wakeup_flag(struct io_ring_ctx *ctx)
-{
-       /* Tell userspace we may need a wakeup call */
-       spin_lock(&ctx->completion_lock);
-       WRITE_ONCE(ctx->rings->sq_flags,
-                  ctx->rings->sq_flags | IORING_SQ_NEED_WAKEUP);
-       spin_unlock(&ctx->completion_lock);
-}
-
-static inline void io_ring_clear_wakeup_flag(struct io_ring_ctx *ctx)
-{
-       spin_lock(&ctx->completion_lock);
-       WRITE_ONCE(ctx->rings->sq_flags,
-                  ctx->rings->sq_flags & ~IORING_SQ_NEED_WAKEUP);
-       spin_unlock(&ctx->completion_lock);
-}
-
 static int __io_sq_thread(struct io_ring_ctx *ctx, bool cap_entries)
 {
        unsigned int to_submit;
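
With sq_flags now an atomic_t, the dropped helpers above no longer need ctx->completion_lock, and the patch open-codes the atomic calls at each site. A hedged sketch of what the helpers would look like if they were kept instead (illustrative only; not what this diff does):

static inline void io_ring_set_wakeup_flag(struct io_ring_ctx *ctx)
{
	/* tell userspace it may need to wake the SQPOLL thread */
	atomic_or(IORING_SQ_NEED_WAKEUP, &ctx->rings->sq_flags);
}

static inline void io_ring_clear_wakeup_flag(struct io_ring_ctx *ctx)
{
	atomic_andnot(IORING_SQ_NEED_WAKEUP, &ctx->rings->sq_flags);
}
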
                        bool needs_sched = true;
 
                        list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) {
-                               io_ring_set_wakeup_flag(ctx);
-
+                               atomic_or(IORING_SQ_NEED_WAKEUP,
+                                               &ctx->rings->sq_flags);
                                if ((ctx->flags & IORING_SETUP_IOPOLL) &&
                                    !wq_list_empty(&ctx->iopoll_list)) {
                                        needs_sched = false;
                                mutex_lock(&sqd->lock);
                        }
                        list_for_each_entry(ctx, &sqd->ctx_list, sqd_list)
-                               io_ring_clear_wakeup_flag(ctx);
+                               atomic_andnot(IORING_SQ_NEED_WAKEUP,
+                                               &ctx->rings->sq_flags);
                }
 
                finish_wait(&sqd->wait, &wait);
        io_uring_cancel_generic(true, sqd);
        sqd->thread = NULL;
        list_for_each_entry(ctx, &sqd->ctx_list, sqd_list)
-               io_ring_set_wakeup_flag(ctx);
+               atomic_or(IORING_SQ_NEED_WAKEUP, &ctx->rings->sq_flags);
        io_run_task_work();
        mutex_unlock(&sqd->lock);
 
        BUILD_BUG_ON(ARRAY_SIZE(io_op_defs) != IORING_OP_LAST);
        BUILD_BUG_ON(__REQ_F_LAST_BIT > 8 * sizeof(int));
 
+       BUILD_BUG_ON(sizeof(atomic_t) != sizeof(u32));
+
        req_cachep = KMEM_CACHE(io_kiocb, SLAB_HWCACHE_ALIGN | SLAB_PANIC |
                                SLAB_ACCOUNT);
        return 0;
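
The new BUILD_BUG_ON guards the user-visible layout: rings->sq_flags lives in the mmap'd SQ ring and userspace still reads it as a plain 32-bit word at io_sqring_offsets.flags, so atomic_t must stay the same size as u32. A hedged userspace-side illustration of that view (read_sq_flags() is an illustrative helper, not liburing API):

#include <linux/io_uring.h>
#include <stdint.h>

static inline uint32_t read_sq_flags(const void *sq_ring,
				     const struct io_sqring_offsets *off)
{
	/* userspace view: an ordinary 32-bit flags word inside the ring */
	return __atomic_load_n(
		(const uint32_t *)((const char *)sq_ring + off->flags),
		__ATOMIC_RELAXED);
}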