if (cqe) {
                clear_bit(0, &ctx->sq_check_overflow);
                clear_bit(0, &ctx->cq_check_overflow);
+               ctx->rings->sq_flags &= ~IORING_SQ_CQ_OVERFLOW;
        }
        spin_unlock_irqrestore(&ctx->completion_lock, flags);
        io_cqring_ev_posted(ctx);
                if (list_empty(&ctx->cq_overflow_list)) {
                        set_bit(0, &ctx->sq_check_overflow);
                        set_bit(0, &ctx->cq_check_overflow);
+                       ctx->rings->sq_flags |= IORING_SQ_CQ_OVERFLOW;
                }
                req->flags |= REQ_F_OVERFLOW;
                refcount_inc(&req->refs);
                        }
 
                        /* Tell userspace we may need a wakeup call */
+                       spin_lock_irq(&ctx->completion_lock);
                        ctx->rings->sq_flags |= IORING_SQ_NEED_WAKEUP;
-                       /* make sure to read SQ tail after writing flags */
-                       smp_mb();
+                       spin_unlock_irq(&ctx->completion_lock);
 
                        to_submit = io_sqring_entries(ctx);
                        if (!to_submit || ret == -EBUSY) {
                                schedule();
                                finish_wait(&ctx->sqo_wait, &wait);
 
+                               spin_lock_irq(&ctx->completion_lock);
                                ctx->rings->sq_flags &= ~IORING_SQ_NEED_WAKEUP;
+                               spin_unlock_irq(&ctx->completion_lock);
                                ret = 0;
                                continue;
                        }
                        finish_wait(&ctx->sqo_wait, &wait);
 
+                       spin_lock_irq(&ctx->completion_lock);
                        ctx->rings->sq_flags &= ~IORING_SQ_NEED_WAKEUP;
+                       spin_unlock_irq(&ctx->completion_lock);
                }
 
                mutex_lock(&ctx->uring_lock);
                        if (list_empty(&ctx->cq_overflow_list)) {
                                clear_bit(0, &ctx->sq_check_overflow);
                                clear_bit(0, &ctx->cq_check_overflow);
+                               ctx->rings->sq_flags &= ~IORING_SQ_CQ_OVERFLOW;
                        }
                        spin_unlock_irq(&ctx->completion_lock);