error = fq->rq_status;
 
        hctx = flush_rq->mq_hctx;
-       if (!q->elevator)
-               flush_rq->tag = BLK_MQ_NO_TAG;
-       else
-               flush_rq->internal_tag = BLK_MQ_NO_TAG;
+       if (!q->elevator) {
+               blk_mq_tag_set_rq(hctx, flush_rq->tag, fq->orig_rq);
+               flush_rq->tag = -1;
+       } else {
+               blk_mq_put_driver_tag(flush_rq);
+               flush_rq->internal_tag = -1;
+       }
 
        running = &fq->flush_queue[fq->flush_running_idx];
        BUG_ON(fq->flush_pending_idx == fq->flush_running_idx);
        flush_rq->mq_ctx = first_rq->mq_ctx;
        flush_rq->mq_hctx = first_rq->mq_hctx;
 
-       if (!q->elevator)
+       if (!q->elevator) {
+               fq->orig_rq = first_rq;
                flush_rq->tag = first_rq->tag;
-       else
+               blk_mq_tag_set_rq(flush_rq->mq_hctx, first_rq->tag, flush_rq);
+       } else {
                flush_rq->internal_tag = first_rq->internal_tag;
+       }
 
        flush_rq->cmd_flags = REQ_OP_FLUSH | REQ_PREFLUSH;
        flush_rq->cmd_flags |= (flags & REQ_DRV) | (flags & REQ_FAILFAST_MASK);
        unsigned long flags;
        struct blk_flush_queue *fq = blk_get_flush_queue(q, ctx);
 
+       if (q->elevator) {
+               WARN_ON(rq->tag < 0);
+               blk_mq_put_driver_tag(rq);
+       }
+
        /*
         * After populating an empty queue, kick it to avoid stall.  Read
         * the comment in flush_end_io().
 
        return atomic_read(&hctx->nr_active) < depth;
 }
 
+/*
+ * Store @rq in the hctx->tags->rqs[] slot indexed by @tag. @tag is used
+ * as an array index without any range check.
+ *
+ * This helper should only be used for a flush request that shares its
+ * tag with the request it was cloned from; the two requests can't be
+ * in flight at the same time. The caller has to make sure the tag
+ * can't be freed.
+ */
+static inline void blk_mq_tag_set_rq(struct blk_mq_hw_ctx *hctx,
+               unsigned int tag, struct request *rq)
+{
+       hctx->tags->rqs[tag] = rq;
+}
+
 static inline bool blk_mq_tag_is_reserved(struct blk_mq_tags *tags,
                                          unsigned int tag)
 {
 
 {
        struct blk_mq_tags *tags = blk_mq_tags_from_data(data);
        struct request *rq = tags->static_rqs[tag];
+       req_flags_t rq_flags = 0;
 
        if (data->q->elevator) {
                rq->tag = BLK_MQ_NO_TAG;
                rq->internal_tag = tag;
        } else {
+               if (data->hctx->flags & BLK_MQ_F_TAG_SHARED) {
+                       rq_flags = RQF_MQ_INFLIGHT;
+                       atomic_inc(&data->hctx->nr_active);
+               }
                rq->tag = tag;
                rq->internal_tag = BLK_MQ_NO_TAG;
+               data->hctx->tags->rqs[rq->tag] = rq;
        }
 
        /* csd/requeue_work/fifo_time is initialized before use */
        rq->q = data->q;
        rq->mq_ctx = data->ctx;
        rq->mq_hctx = data->hctx;
-       rq->rq_flags = 0;
+       rq->rq_flags = rq_flags;
        rq->cmd_flags = data->cmd_flags;
        if (data->flags & BLK_MQ_REQ_PREEMPT)
                rq->rq_flags |= RQF_PREEMPT;
        return cpu_online(rq->mq_ctx->cpu);
 }
 
-static void __blk_mq_put_driver_tag(struct blk_mq_hw_ctx *hctx,
-               struct request *rq)
-{
-       blk_mq_put_tag(hctx->tags, rq->mq_ctx, rq->tag);
-       rq->tag = BLK_MQ_NO_TAG;
-
-       if (rq->rq_flags & RQF_MQ_INFLIGHT) {
-               rq->rq_flags &= ~RQF_MQ_INFLIGHT;
-               atomic_dec(&hctx->nr_active);
-       }
-}
-
-static inline void blk_mq_put_driver_tag(struct request *rq)
-{
-       if (rq->tag == BLK_MQ_NO_TAG || rq->internal_tag == BLK_MQ_NO_TAG)
-               return;
-
-       __blk_mq_put_driver_tag(rq->mq_hctx, rq);
-}
-
 bool blk_mq_complete_request_remote(struct request *rq)
 {
        WRITE_ONCE(rq->state, MQ_RQ_COMPLETE);
 
-       blk_mq_put_driver_tag(rq);
-
        /*
         * For a polled request, always complete locallly, it's pointless
         * to redirect the completion.
 {
        struct sbitmap_queue *bt = &rq->mq_hctx->tags->bitmap_tags;
        unsigned int tag_offset = rq->mq_hctx->tags->nr_reserved_tags;
+       bool shared = blk_mq_tag_busy(rq->mq_hctx);
        int tag;
 
-       blk_mq_tag_busy(rq->mq_hctx);
-
        if (blk_mq_tag_is_reserved(rq->mq_hctx->sched_tags, rq->internal_tag)) {
                bt = &rq->mq_hctx->tags->breserved_tags;
                tag_offset = 0;
                return false;
 
        rq->tag = tag + tag_offset;
+       if (shared) {
+               rq->rq_flags |= RQF_MQ_INFLIGHT;
+               atomic_inc(&rq->mq_hctx->nr_active);
+       }
+       rq->mq_hctx->tags->rqs[rq->tag] = rq;
        return true;
 }
 
 static bool blk_mq_get_driver_tag(struct request *rq)
 {
-       struct blk_mq_hw_ctx *hctx = rq->mq_hctx;
-
-       if (rq->tag == BLK_MQ_NO_TAG && !__blk_mq_get_driver_tag(rq))
-               return false;
-
-       if (hctx->flags & BLK_MQ_F_TAG_SHARED) {
-               rq->rq_flags |= RQF_MQ_INFLIGHT;
-               atomic_inc(&hctx->nr_active);
-       }
-       hctx->tags->rqs[rq->tag] = rq;
-       return true;
+       /* rq->tag is already set, so there is nothing to allocate. */
+       if (rq->tag != BLK_MQ_NO_TAG)
+               return true;
+       /* Otherwise the result of __blk_mq_get_driver_tag() is returned as-is. */
+       return __blk_mq_get_driver_tag(rq);
 }
 
 static int blk_mq_dispatch_wake(wait_queue_entry_t *wait, unsigned mode,
 
        return true;
 }
 
+/*
+ * Pass rq->tag to blk_mq_put_tag() for hctx->tags, then reset rq->tag to
+ * BLK_MQ_NO_TAG. If the request carries RQF_MQ_INFLIGHT, the flag is
+ * cleared and hctx->nr_active is decremented.
+ *
+ * rq->tag is not validated here; blk_mq_put_driver_tag() checks it
+ * against BLK_MQ_NO_TAG before calling in.
+ */
+static inline void __blk_mq_put_driver_tag(struct blk_mq_hw_ctx *hctx,
+                                          struct request *rq)
+{
+       blk_mq_put_tag(hctx->tags, rq->mq_ctx, rq->tag);
+       rq->tag = BLK_MQ_NO_TAG;
+
+       if (rq->rq_flags & RQF_MQ_INFLIGHT) {
+               rq->rq_flags &= ~RQF_MQ_INFLIGHT;
+               atomic_dec(&hctx->nr_active);
+       }
+}
+
+/*
+ * Drop the tag held in rq->tag through __blk_mq_put_driver_tag(), using
+ * rq->mq_hctx as the hardware context. Does nothing when either rq->tag
+ * or rq->internal_tag is BLK_MQ_NO_TAG.
+ *
+ * NOTE(review): request setup leaves internal_tag at BLK_MQ_NO_TAG when
+ * the queue has no elevator, so presumably this is a no-op in that
+ * configuration - confirm against the callers.
+ */
+static inline void blk_mq_put_driver_tag(struct request *rq)
+{
+       if (rq->tag == BLK_MQ_NO_TAG || rq->internal_tag == BLK_MQ_NO_TAG)
+               return;
+
+       __blk_mq_put_driver_tag(rq->mq_hctx, rq);
+}
+
 static inline void blk_mq_clear_mq_map(struct blk_mq_queue_map *qmap)
 {
        int cpu;