]> www.infradead.org Git - users/jedix/linux-maple.git/commitdiff
blk-mq: fix freeze queue race
author Shan Hai <shan.hai@oracle.com>
Sun, 9 Oct 2016 15:13:10 +0000 (08:13 -0700)
committer Chuck Anderson <chuck.anderson@oracle.com>
Wed, 19 Oct 2016 23:00:45 +0000 (16:00 -0700)
Orabug: 24914952

There are several race conditions while freezing queue.

When unfreezing queue, there is a small window between decrementing
q->mq_freeze_depth to zero and percpu_ref_reinit() call with
q->mq_usage_counter.  If another caller invokes blk_mq_freeze_queue_start()
in the window, q->mq_freeze_depth is increased from zero to one and
percpu_ref_kill() is called with q->mq_usage_counter which is already
killed.  The percpu refcount should be re-initialized before being killed again.

Also, there is a race condition while switching to percpu mode.
percpu_ref_switch_to_percpu() and percpu_ref_kill() must not be
executed at the same time as the following scenario is possible:

1. q->mq_usage_counter is initialized in atomic mode.
   (atomic counter: 1)

2. After the disk registration, a process like systemd-udev starts
   accessing the disk, and successfully increases the refcount
   by percpu_ref_tryget_live() in blk_mq_queue_enter().
   (atomic counter: 2)

3. In the final stage of initialization, q->mq_usage_counter is being
   switched to percpu mode by percpu_ref_switch_to_percpu() in
   blk_mq_finish_init().  But if CONFIG_PREEMPT_VOLUNTARY is enabled,
   the process is rescheduled in the middle of switching when calling
   wait_event() in __percpu_ref_switch_to_percpu().
   (atomic counter: 2)

4. CPU hotplug handling for blk-mq calls percpu_ref_kill() to freeze
   request queue.  q->mq_usage_counter is decreased and marked as
   DEAD.  Wait until all requests have finished.
   (atomic counter: 1)

5. The process rescheduled in the step 3. is resumed and finishes
   all remaining work in __percpu_ref_switch_to_percpu().
   A bias value is added to atomic counter of q->mq_usage_counter.
   (atomic counter: PERCPU_COUNT_BIAS + 1)

6. A request issued in step 2 is finished and q->mq_usage_counter
   is decreased by blk_mq_queue_exit().  q->mq_usage_counter is DEAD,
   so atomic counter is decreased and no release handler is called.
   (atomic counter: PERCPU_COUNT_BIAS)

7. CPU hotplug handling in the step 4. will wait forever as
   q->mq_usage_counter will never be zero.

Also, percpu_ref_reinit() and percpu_ref_kill() must not be executed
at the same time, because both functions could call
__percpu_ref_switch_to_percpu(), which adds the bias value and
initializes the percpu counter.

Fix those races by serializing with per-queue mutex.

Signed-off-by: Akinobu Mita <akinobu.mita@gmail.com>
Acked-by: Tejun Heo <tj@kernel.org>
Reviewed-by: Ming Lei <tom.leiming@gmail.com>
(cherry picked from https://patchwork.kernel.org/patch/7269471/)

Signed-off-by: Shan Hai <shan.hai@oracle.com>
Reviewed-by: Martin K. Petersen <martin.petersen@oracle.com>
block/blk-core.c
block/blk-mq-sysfs.c
block/blk-mq.c
include/linux/blkdev.h

index b4815d0e7ffd50ee66bccd926b6330e3321595b7..33cd386f00c41d9fd800a1fc4ebc8e136da28d2d 100644 (file)
@@ -662,6 +662,7 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id)
        __set_bit(QUEUE_FLAG_BYPASS, &q->queue_flags);
 
        init_waitqueue_head(&q->mq_freeze_wq);
+       mutex_init(&q->mq_freeze_lock);
 
        if (blkcg_init_queue(q))
                goto fail_bdi;
index 140cc965c4dc05546c4617716180a121cb98041c..cd14b11c1d179f5769afa1c9cd719bd286fb5704 100644 (file)
@@ -405,7 +405,9 @@ static void blk_mq_sysfs_init(struct request_queue *q)
 /* see blk_register_queue() */
 void blk_mq_finish_init(struct request_queue *q)
 {
+       mutex_lock(&q->mq_freeze_lock);
        percpu_ref_switch_to_percpu(&q->mq_usage_counter);
+       mutex_unlock(&q->mq_freeze_lock);
 }
 
 int blk_mq_register_disk(struct gendisk *disk)
index b88e59e4d80584e85bfbf2ab08986c0458ce7aed..6c50a5eb717ba43a096863a0b4606730bddaf37b 100644 (file)
@@ -114,14 +114,13 @@ void blk_mq_freeze_queue_start(struct request_queue *q)
 {
        bool freeze;
 
-       spin_lock_irq(q->queue_lock);
+       mutex_lock(&q->mq_freeze_lock);
        freeze = !q->mq_freeze_depth++;
-       spin_unlock_irq(q->queue_lock);
-
        if (freeze) {
                percpu_ref_kill(&q->mq_usage_counter);
                blk_mq_run_hw_queues(q, false);
        }
+       mutex_unlock(&q->mq_freeze_lock);
 }
 EXPORT_SYMBOL_GPL(blk_mq_freeze_queue_start);
 
@@ -145,14 +144,14 @@ void blk_mq_unfreeze_queue(struct request_queue *q)
 {
        bool wake;
 
-       spin_lock_irq(q->queue_lock);
+       mutex_lock(&q->mq_freeze_lock);
        wake = !--q->mq_freeze_depth;
        WARN_ON_ONCE(q->mq_freeze_depth < 0);
-       spin_unlock_irq(q->queue_lock);
        if (wake) {
                percpu_ref_reinit(&q->mq_usage_counter);
                wake_up_all(&q->mq_freeze_wq);
        }
+       mutex_unlock(&q->mq_freeze_lock);
 }
 EXPORT_SYMBOL_GPL(blk_mq_unfreeze_queue);
 
index 21d97265871eafdfa15200d90f511efcfeaf4dc9..a0bc6f30983ba07a0a79dbbff655cc2d9095abb1 100644 (file)
@@ -488,6 +488,14 @@ struct request_queue {
 
        struct blk_mq_tag_set   *tag_set;
        struct list_head        tag_set_list;
+#ifndef __GENKSYMS__
+       /*
+        * Protect concurrent access to mq_usage_counter by
+        * percpu_ref_switch_to_percpu(), percpu_ref_kill(), and
+        * percpu_ref_reinit().
+        */
+       struct mutex            mq_freeze_lock;
+#endif
 };
 
 #define QUEUE_FLAG_QUEUED      1       /* uses generic tag queueing */