Move scheduler tags (sched_tags) allocation and deallocation outside
both the ->elevator_lock and ->freeze_lock when updating nr_hw_queues.
This change breaks the dependency chain from the percpu allocator lock
to the elevator lock, helping to prevent potential deadlocks, as
observed in the reported lockdep splat[1].
This commit introduces batch allocation and deallocation helpers for
sched_tags, which are now used from within __blk_mq_update_nr_hw_queues
routine while iterating through the tagset.
With this change, all sched_tags memory management is handled entirely
outside the ->elevator_lock and the ->freeze_lock context, thereby
eliminating the lock dependency that could otherwise manifest during
nr_hw_queues updates.
[1] https://lore.kernel.org/all/
0659ea8d-a463-47c8-9180-
43c719e106eb@linux.ibm.com/
Reported-by: Stefan Haberland <sth@linux.ibm.com>
Closes: https://lore.kernel.org/all/0659ea8d-a463-47c8-9180-43c719e106eb@linux.ibm.com/
Reviewed-by: Ming Lei <ming.lei@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Hannes Reinecke <hare@suse.de>
Signed-off-by: Nilay Shroff <nilay@linux.ibm.com>
Link: https://lore.kernel.org/r/20250730074614.2537382-4-nilay@linux.ibm.com
Signed-off-by: Jens Axboe <axboe@kernel.dk>
        kfree(et);
 }
 
+void blk_mq_free_sched_tags_batch(struct xarray *et_table,
+               struct blk_mq_tag_set *set)
+{
+       struct request_queue *q;
+       struct elevator_tags *et;
+
+       lockdep_assert_held_write(&set->update_nr_hwq_lock);
+
+       list_for_each_entry(q, &set->tag_list, tag_set_list) {
+               /*
+                * Accessing q->elevator without holding q->elevator_lock is
+                * safe because we're holding here set->update_nr_hwq_lock in
+                * the writer context. So, scheduler update/switch code (which
+                * acquires the same lock but in the reader context) can't run
+                * concurrently.
+                */
+               if (q->elevator) {
+                       et = xa_load(et_table, q->id);
+                       if (unlikely(!et))
+                               WARN_ON_ONCE(1);
+                       else
+                               blk_mq_free_sched_tags(et, set);
+               }
+       }
+}
+
 struct elevator_tags *blk_mq_alloc_sched_tags(struct blk_mq_tag_set *set,
                unsigned int nr_hw_queues)
 {
        return NULL;
 }
 
+int blk_mq_alloc_sched_tags_batch(struct xarray *et_table,
+               struct blk_mq_tag_set *set, unsigned int nr_hw_queues)
+{
+       struct request_queue *q;
+       struct elevator_tags *et;
+       gfp_t gfp = GFP_NOIO | __GFP_ZERO | __GFP_NOWARN | __GFP_NORETRY;
+
+       lockdep_assert_held_write(&set->update_nr_hwq_lock);
+
+       list_for_each_entry(q, &set->tag_list, tag_set_list) {
+               /*
+                * Accessing q->elevator without holding q->elevator_lock is
+                * safe because we're holding here set->update_nr_hwq_lock in
+                * the writer context. So, scheduler update/switch code (which
+                * acquires the same lock but in the reader context) can't run
+                * concurrently.
+                */
+               if (q->elevator) {
+                       et = blk_mq_alloc_sched_tags(set, nr_hw_queues);
+                       if (!et)
+                               goto out_unwind;
+                       if (xa_insert(et_table, q->id, et, gfp))
+                               goto out_free_tags;
+               }
+       }
+       return 0;
+out_free_tags:
+       blk_mq_free_sched_tags(et, set);
+out_unwind:
+       list_for_each_entry_continue_reverse(q, &set->tag_list, tag_set_list) {
+               if (q->elevator) {
+                       et = xa_load(et_table, q->id);
+                       if (et)
+                               blk_mq_free_sched_tags(et, set);
+               }
+       }
+       return -ENOMEM;
+}
+
 /* caller must have a reference to @e, will grab another one if successful */
 int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e,
                struct elevator_tags *et)
 
 
 struct elevator_tags *blk_mq_alloc_sched_tags(struct blk_mq_tag_set *set,
                unsigned int nr_hw_queues);
+int blk_mq_alloc_sched_tags_batch(struct xarray *et_table,
+               struct blk_mq_tag_set *set, unsigned int nr_hw_queues);
 void blk_mq_free_sched_tags(struct elevator_tags *et,
                struct blk_mq_tag_set *set);
+void blk_mq_free_sched_tags_batch(struct xarray *et_table,
+               struct blk_mq_tag_set *set);
 
 static inline void blk_mq_sched_restart(struct blk_mq_hw_ctx *hctx)
 {
 
  * Switch back to the elevator type stored in the xarray.
  */
 static void blk_mq_elv_switch_back(struct request_queue *q,
-               struct xarray *elv_tbl)
+               struct xarray *elv_tbl, struct xarray *et_tbl)
 {
        struct elevator_type *e = xa_load(elv_tbl, q->id);
+       struct elevator_tags *t = xa_load(et_tbl, q->id);
 
        /* The elv_update_nr_hw_queues unfreezes the queue. */
-       elv_update_nr_hw_queues(q, e);
+       elv_update_nr_hw_queues(q, e, t);
 
        /* Drop the reference acquired in blk_mq_elv_switch_none. */
        if (e)
        int prev_nr_hw_queues = set->nr_hw_queues;
        unsigned int memflags;
        int i;
-       struct xarray elv_tbl;
+       struct xarray elv_tbl, et_tbl;
 
        lockdep_assert_held(&set->tag_list_lock);
 
 
        memflags = memalloc_noio_save();
 
+       xa_init(&et_tbl);
+       if (blk_mq_alloc_sched_tags_batch(&et_tbl, set, nr_hw_queues) < 0)
+               goto out_memalloc_restore;
+
        xa_init(&elv_tbl);
 
        list_for_each_entry(q, &set->tag_list, tag_set_list) {
 switch_back:
        /* The blk_mq_elv_switch_back unfreezes queue for us. */
        list_for_each_entry(q, &set->tag_list, tag_set_list)
-               blk_mq_elv_switch_back(q, &elv_tbl);
+               blk_mq_elv_switch_back(q, &elv_tbl, &et_tbl);
 
        list_for_each_entry(q, &set->tag_list, tag_set_list) {
                blk_mq_sysfs_register_hctxs(q);
        }
 
        xa_destroy(&elv_tbl);
-
+       xa_destroy(&et_tbl);
+out_memalloc_restore:
        memalloc_noio_restore(memflags);
 
        /* Free the excess tags when nr_hw_queues shrink. */
 
 #include "blk-crypto-internal.h"
 
 struct elevator_type;
+struct elevator_tags;
 
 /*
  * Default upper limit for the software max_sectors limit used for regular I/Os.
 
 bool blk_insert_flush(struct request *rq);
 
-void elv_update_nr_hw_queues(struct request_queue *q, struct elevator_type *e);
+void elv_update_nr_hw_queues(struct request_queue *q, struct elevator_type *e,
+               struct elevator_tags *t);
 void elevator_set_default(struct request_queue *q);
 void elevator_set_none(struct request_queue *q);
 
 
  * The I/O scheduler depends on the number of hardware queues, this forces a
  * reattachment when nr_hw_queues changes.
  */
-void elv_update_nr_hw_queues(struct request_queue *q, struct elevator_type *e)
+void elv_update_nr_hw_queues(struct request_queue *q, struct elevator_type *e,
+               struct elevator_tags *t)
 {
        struct blk_mq_tag_set *set = q->tag_set;
        struct elv_change_ctx ctx = {};
 
        if (e && !blk_queue_dying(q) && blk_queue_registered(q)) {
                ctx.name = e->elevator_name;
-               ctx.et = blk_mq_alloc_sched_tags(set, set->nr_hw_queues);
-               if (!ctx.et) {
-                       WARN_ON_ONCE(1);
-                       goto unfreeze;
-               }
+               ctx.et = t;
+
                mutex_lock(&q->elevator_lock);
                /* force to reattach elevator after nr_hw_queue is updated */
                ret = elevator_switch(q, &ctx);
                mutex_unlock(&q->elevator_lock);
        }
-unfreeze:
        blk_mq_unfreeze_queue_nomemrestore(q);
        if (!ret)
                WARN_ON_ONCE(elevator_change_done(q, &ctx));
        /*
         * Free sched tags if it's allocated but we couldn't switch elevator.
         */
-       if (ctx.et && !ctx.new)
-               blk_mq_free_sched_tags(ctx.et, set);
+       if (t && !ctx.new)
+               blk_mq_free_sched_tags(t, set);
 }
 
 /*