From 27b5d4170cda9a7ad0596d540d57e063bf5da26e Mon Sep 17 00:00:00 2001 From: Uday Shankar Date: Mon, 7 Oct 2024 12:24:16 -0600 Subject: [PATCH 01/16] ublk: merge stop_work and quiesce_work Save some lines by merging stop_work and quiesce_work into nosrv_work, which looks at the recovery flags and does the right thing when the "no ublk server" condition is detected. Signed-off-by: Uday Shankar Reviewed-by: Ming Lei Link: https://lore.kernel.org/r/20241007182419.3263186-4-ushankar@purestorage.com Signed-off-by: Jens Axboe --- drivers/block/ublk_drv.c | 64 ++++++++++++++++------------------------ 1 file changed, 25 insertions(+), 39 deletions(-) diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c index 50eb608b98a4..0e75283e3bda 100644 --- a/drivers/block/ublk_drv.c +++ b/drivers/block/ublk_drv.c @@ -182,8 +182,7 @@ struct ublk_device { unsigned int nr_queues_ready; unsigned int nr_privileged_daemon; - struct work_struct quiesce_work; - struct work_struct stop_work; + struct work_struct nosrv_work; }; /* header of ublk_params */ @@ -1261,10 +1260,7 @@ static enum blk_eh_timer_return ublk_timeout(struct request *rq) struct ublk_device *ub = ubq->dev; if (ublk_abort_requests(ub, ubq)) { - if (ublk_nosrv_should_stop_dev(ub)) - schedule_work(&ub->stop_work); - else - schedule_work(&ub->quiesce_work); + schedule_work(&ub->nosrv_work); } return BLK_EH_DONE; } @@ -1514,10 +1510,7 @@ static void ublk_uring_cmd_cancel_fn(struct io_uring_cmd *cmd, ublk_cancel_cmd(ubq, io, issue_flags); if (need_schedule) { - if (ublk_nosrv_should_stop_dev(ub)) - schedule_work(&ub->stop_work); - else - schedule_work(&ub->quiesce_work); + schedule_work(&ub->nosrv_work); } } @@ -1580,20 +1573,6 @@ static void __ublk_quiesce_dev(struct ublk_device *ub) ub->dev_info.state = UBLK_S_DEV_QUIESCED; } -static void ublk_quiesce_work_fn(struct work_struct *work) -{ - struct ublk_device *ub = - container_of(work, struct ublk_device, quiesce_work); - - mutex_lock(&ub->mutex); - if (ub->dev_info.state != UBLK_S_DEV_LIVE) - goto unlock; - __ublk_quiesce_dev(ub); - unlock: - mutex_unlock(&ub->mutex); - ublk_cancel_dev(ub); -} - static void ublk_unquiesce_dev(struct ublk_device *ub) { int i; @@ -1642,6 +1621,25 @@ static void ublk_stop_dev(struct ublk_device *ub) ublk_cancel_dev(ub); } +static void ublk_nosrv_work(struct work_struct *work) +{ + struct ublk_device *ub = + container_of(work, struct ublk_device, nosrv_work); + + if (ublk_nosrv_should_stop_dev(ub)) { + ublk_stop_dev(ub); + return; + } + + mutex_lock(&ub->mutex); + if (ub->dev_info.state != UBLK_S_DEV_LIVE) + goto unlock; + __ublk_quiesce_dev(ub); + unlock: + mutex_unlock(&ub->mutex); + ublk_cancel_dev(ub); +} + /* device can only be started after all IOs are ready */ static void ublk_mark_io_ready(struct ublk_device *ub, struct ublk_queue *ubq) { @@ -2155,14 +2153,6 @@ static int ublk_add_chdev(struct ublk_device *ub) return ret; } -static void ublk_stop_work_fn(struct work_struct *work) -{ - struct ublk_device *ub = - container_of(work, struct ublk_device, stop_work); - - ublk_stop_dev(ub); -} - /* align max io buffer size with PAGE_SIZE */ static void ublk_align_max_io_size(struct ublk_device *ub) { @@ -2187,8 +2177,7 @@ static int ublk_add_tag_set(struct ublk_device *ub) static void ublk_remove(struct ublk_device *ub) { ublk_stop_dev(ub); - cancel_work_sync(&ub->stop_work); - cancel_work_sync(&ub->quiesce_work); + cancel_work_sync(&ub->nosrv_work); cdev_device_del(&ub->cdev, &ub->cdev_dev); ublk_put_device(ub); ublks_added--; @@ -2457,8 +2446,7 @@ static int 
ublk_ctrl_add_dev(struct io_uring_cmd *cmd) goto out_unlock; mutex_init(&ub->mutex); spin_lock_init(&ub->lock); - INIT_WORK(&ub->quiesce_work, ublk_quiesce_work_fn); - INIT_WORK(&ub->stop_work, ublk_stop_work_fn); + INIT_WORK(&ub->nosrv_work, ublk_nosrv_work); ret = ublk_alloc_dev_number(ub, header->dev_id); if (ret < 0) @@ -2593,9 +2581,7 @@ static inline void ublk_ctrl_cmd_dump(struct io_uring_cmd *cmd) static int ublk_ctrl_stop_dev(struct ublk_device *ub) { ublk_stop_dev(ub); - cancel_work_sync(&ub->stop_work); - cancel_work_sync(&ub->quiesce_work); - + cancel_work_sync(&ub->nosrv_work); return 0; } -- 2.50.1 From 59eaa01ce7a6cbc5c36b928f52888f99fca6b295 Mon Sep 17 00:00:00 2001 From: Uday Shankar Date: Mon, 7 Oct 2024 12:24:17 -0600 Subject: [PATCH 02/16] ublk: support device recovery without I/O queueing ublk currently supports the following behaviors on ublk server exit: A: outstanding I/Os get errors, subsequently issued I/Os get errors B: outstanding I/Os get errors, subsequently issued I/Os queue C: outstanding I/Os get reissued, subsequently issued I/Os queue and the following behaviors for recovery of preexisting block devices by a future incarnation of the ublk server: 1: ublk devices stopped on ublk server exit (no recovery possible) 2: ublk devices are recoverable using start/end_recovery commands The userspace interface allows selection of combinations of these behaviors using flags specified at device creation time, namely: default behavior: A + 1 UBLK_F_USER_RECOVERY: B + 2 UBLK_F_USER_RECOVERY|UBLK_F_USER_RECOVERY_REISSUE: C + 2 The behavior A + 2 is currently unsupported. Add support for this behavior under the new flag combination UBLK_F_USER_RECOVERY|UBLK_F_USER_RECOVERY_FAIL_IO. Signed-off-by: Uday Shankar Reviewed-by: Ming Lei Link: https://lore.kernel.org/r/20241007182419.3263186-5-ushankar@purestorage.com Signed-off-by: Jens Axboe --- drivers/block/ublk_drv.c | 78 ++++++++++++++++++++++++++++------- include/uapi/linux/ublk_cmd.h | 18 ++++++++ 2 files changed, 81 insertions(+), 15 deletions(-) diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c index 0e75283e3bda..59951e7c2593 100644 --- a/drivers/block/ublk_drv.c +++ b/drivers/block/ublk_drv.c @@ -60,10 +60,12 @@ | UBLK_F_UNPRIVILEGED_DEV \ | UBLK_F_CMD_IOCTL_ENCODE \ | UBLK_F_USER_COPY \ - | UBLK_F_ZONED) + | UBLK_F_ZONED \ + | UBLK_F_USER_RECOVERY_FAIL_IO) #define UBLK_F_ALL_RECOVERY_FLAGS (UBLK_F_USER_RECOVERY \ - | UBLK_F_USER_RECOVERY_REISSUE) + | UBLK_F_USER_RECOVERY_REISSUE \ + | UBLK_F_USER_RECOVERY_FAIL_IO) /* All UBLK_PARAM_TYPE_* should be included here */ #define UBLK_PARAM_TYPE_ALL \ @@ -146,6 +148,7 @@ struct ublk_queue { bool force_abort; bool timeout; bool canceling; + bool fail_io; /* copy of dev->state == UBLK_S_DEV_FAIL_IO */ unsigned short nr_io_ready; /* how many ios setup */ spinlock_t cancel_lock; struct ublk_device *dev; @@ -690,7 +693,8 @@ static inline bool ublk_nosrv_should_reissue_outstanding(struct ublk_device *ub) */ static inline bool ublk_nosrv_dev_should_queue_io(struct ublk_device *ub) { - return ub->dev_info.flags & UBLK_F_USER_RECOVERY; + return (ub->dev_info.flags & UBLK_F_USER_RECOVERY) && + !(ub->dev_info.flags & UBLK_F_USER_RECOVERY_FAIL_IO); } /* @@ -700,7 +704,8 @@ static inline bool ublk_nosrv_dev_should_queue_io(struct ublk_device *ub) */ static inline bool ublk_nosrv_should_queue_io(struct ublk_queue *ubq) { - return ubq->flags & UBLK_F_USER_RECOVERY; + return (ubq->flags & UBLK_F_USER_RECOVERY) && + !(ubq->flags & UBLK_F_USER_RECOVERY_FAIL_IO); } /* @@ 
-714,6 +719,12 @@ static inline bool ublk_nosrv_should_stop_dev(struct ublk_device *ub) return !(ub->dev_info.flags & UBLK_F_USER_RECOVERY); } +static inline bool ublk_dev_in_recoverable_state(struct ublk_device *ub) +{ + return ub->dev_info.state == UBLK_S_DEV_QUIESCED || + ub->dev_info.state == UBLK_S_DEV_FAIL_IO; +} + static void ublk_free_disk(struct gendisk *disk) { struct ublk_device *ub = disk->private_data; @@ -1275,6 +1286,10 @@ static blk_status_t ublk_queue_rq(struct blk_mq_hw_ctx *hctx, struct request *rq = bd->rq; blk_status_t res; + if (unlikely(ubq->fail_io)) { + return BLK_STS_TARGET; + } + /* fill iod to slot in io cmd buffer */ res = ublk_setup_iod(ubq, rq); if (unlikely(res != BLK_STS_OK)) @@ -1625,6 +1640,7 @@ static void ublk_nosrv_work(struct work_struct *work) { struct ublk_device *ub = container_of(work, struct ublk_device, nosrv_work); + int i; if (ublk_nosrv_should_stop_dev(ub)) { ublk_stop_dev(ub); @@ -1634,7 +1650,18 @@ static void ublk_nosrv_work(struct work_struct *work) mutex_lock(&ub->mutex); if (ub->dev_info.state != UBLK_S_DEV_LIVE) goto unlock; - __ublk_quiesce_dev(ub); + + if (ublk_nosrv_dev_should_queue_io(ub)) { + __ublk_quiesce_dev(ub); + } else { + blk_mq_quiesce_queue(ub->ub_disk->queue); + ub->dev_info.state = UBLK_S_DEV_FAIL_IO; + for (i = 0; i < ub->dev_info.nr_hw_queues; i++) { + ublk_get_queue(ub, i)->fail_io = true; + } + blk_mq_unquiesce_queue(ub->ub_disk->queue); + } + unlock: mutex_unlock(&ub->mutex); ublk_cancel_dev(ub); @@ -2387,8 +2414,13 @@ static int ublk_ctrl_add_dev(struct io_uring_cmd *cmd) return -EPERM; /* forbid nonsense combinations of recovery flags */ - if ((info.flags & UBLK_F_USER_RECOVERY_REISSUE) && - !(info.flags & UBLK_F_USER_RECOVERY)) { + switch (info.flags & UBLK_F_ALL_RECOVERY_FLAGS) { + case 0: + case UBLK_F_USER_RECOVERY: + case (UBLK_F_USER_RECOVERY | UBLK_F_USER_RECOVERY_REISSUE): + case (UBLK_F_USER_RECOVERY | UBLK_F_USER_RECOVERY_FAIL_IO): + break; + default: pr_warn("%s: invalid recovery flags %llx\n", __func__, info.flags & UBLK_F_ALL_RECOVERY_FLAGS); return -EINVAL; @@ -2729,14 +2761,18 @@ static int ublk_ctrl_start_recovery(struct ublk_device *ub, * and related io_uring ctx is freed so file struct of /dev/ublkcX is * released. 
* + * and one of the following holds + * * (2) UBLK_S_DEV_QUIESCED is set, which means the quiesce_work: * (a)has quiesced request queue * (b)has requeued every inflight rqs whose io_flags is ACTIVE * (c)has requeued/aborted every inflight rqs whose io_flags is NOT ACTIVE * (d)has completed/camceled all ioucmds owned by ther dying process + * + * (3) UBLK_S_DEV_FAIL_IO is set, which means the queue is not + * quiesced, but all I/O is being immediately errored */ - if (test_bit(UB_STATE_OPEN, &ub->state) || - ub->dev_info.state != UBLK_S_DEV_QUIESCED) { + if (test_bit(UB_STATE_OPEN, &ub->state) || !ublk_dev_in_recoverable_state(ub)) { ret = -EBUSY; goto out_unlock; } @@ -2760,6 +2796,7 @@ static int ublk_ctrl_end_recovery(struct ublk_device *ub, const struct ublksrv_ctrl_cmd *header = io_uring_sqe_cmd(cmd->sqe); int ublksrv_pid = (int)header->data[0]; int ret = -EINVAL; + int i; pr_devel("%s: Waiting for new ubq_daemons(nr: %d) are ready, dev id %d...\n", __func__, ub->dev_info.nr_hw_queues, header->dev_id); @@ -2774,18 +2811,29 @@ static int ublk_ctrl_end_recovery(struct ublk_device *ub, if (ublk_nosrv_should_stop_dev(ub)) goto out_unlock; - if (ub->dev_info.state != UBLK_S_DEV_QUIESCED) { + if (!ublk_dev_in_recoverable_state(ub)) { ret = -EBUSY; goto out_unlock; } ub->dev_info.ublksrv_pid = ublksrv_pid; pr_devel("%s: new ublksrv_pid %d, dev id %d\n", __func__, ublksrv_pid, header->dev_id); - blk_mq_unquiesce_queue(ub->ub_disk->queue); - pr_devel("%s: queue unquiesced, dev id %d.\n", - __func__, header->dev_id); - blk_mq_kick_requeue_list(ub->ub_disk->queue); - ub->dev_info.state = UBLK_S_DEV_LIVE; + + if (ublk_nosrv_dev_should_queue_io(ub)) { + ub->dev_info.state = UBLK_S_DEV_LIVE; + blk_mq_unquiesce_queue(ub->ub_disk->queue); + pr_devel("%s: queue unquiesced, dev id %d.\n", + __func__, header->dev_id); + blk_mq_kick_requeue_list(ub->ub_disk->queue); + } else { + blk_mq_quiesce_queue(ub->ub_disk->queue); + ub->dev_info.state = UBLK_S_DEV_LIVE; + for (i = 0; i < ub->dev_info.nr_hw_queues; i++) { + ublk_get_queue(ub, i)->fail_io = false; + } + blk_mq_unquiesce_queue(ub->ub_disk->queue); + } + ret = 0; out_unlock: mutex_unlock(&ub->mutex); diff --git a/include/uapi/linux/ublk_cmd.h b/include/uapi/linux/ublk_cmd.h index 12873639ea96..a8bc98bb69fc 100644 --- a/include/uapi/linux/ublk_cmd.h +++ b/include/uapi/linux/ublk_cmd.h @@ -147,8 +147,18 @@ */ #define UBLK_F_NEED_GET_DATA (1UL << 2) +/* + * - Block devices are recoverable if ublk server exits and restarts + * - Outstanding I/O when ublk server exits is met with errors + * - I/O issued while there is no ublk server queues + */ #define UBLK_F_USER_RECOVERY (1UL << 3) +/* + * - Block devices are recoverable if ublk server exits and restarts + * - Outstanding I/O when ublk server exits is reissued + * - I/O issued while there is no ublk server queues + */ #define UBLK_F_USER_RECOVERY_REISSUE (1UL << 4) /* @@ -190,10 +200,18 @@ */ #define UBLK_F_ZONED (1ULL << 8) +/* + * - Block devices are recoverable if ublk server exits and restarts + * - Outstanding I/O when ublk server exits is met with errors + * - I/O issued while there is no ublk server is met with errors + */ +#define UBLK_F_USER_RECOVERY_FAIL_IO (1ULL << 9) + /* device state */ #define UBLK_S_DEV_DEAD 0 #define UBLK_S_DEV_LIVE 1 #define UBLK_S_DEV_QUIESCED 2 +#define UBLK_S_DEV_FAIL_IO 3 /* shipped via sqe->cmd of io_uring command */ struct ublksrv_ctrl_cmd { -- 2.50.1 From 69f407ee8dc0f9c09209c9883bd25cd8194e94a3 Mon Sep 17 00:00:00 2001 From: Uday Shankar Date: Mon, 7 Oct 2024 
12:24:18 -0600 Subject: [PATCH 03/16] Documentation: ublk: document UBLK_F_USER_RECOVERY_FAIL_IO Signed-off-by: Uday Shankar Reviewed-by: Ming Lei Link: https://lore.kernel.org/r/20241007182419.3263186-6-ushankar@purestorage.com Signed-off-by: Jens Axboe --- Documentation/block/ublk.rst | 24 ++++++++++++++++++------ 1 file changed, 18 insertions(+), 6 deletions(-) diff --git a/Documentation/block/ublk.rst b/Documentation/block/ublk.rst index ff74b3ec4a98..51665a3e6a50 100644 --- a/Documentation/block/ublk.rst +++ b/Documentation/block/ublk.rst @@ -199,24 +199,36 @@ managing and controlling ublk devices with help of several control commands: - user recovery feature description - Two new features are added for user recovery: ``UBLK_F_USER_RECOVERY`` and - ``UBLK_F_USER_RECOVERY_REISSUE``. - - With ``UBLK_F_USER_RECOVERY`` set, after one ubq_daemon(ublk server's io + Three new features are added for user recovery: ``UBLK_F_USER_RECOVERY``, + ``UBLK_F_USER_RECOVERY_REISSUE``, and ``UBLK_F_USER_RECOVERY_FAIL_IO``. To + enable recovery of ublk devices after the ublk server exits, the ublk server + should specify the ``UBLK_F_USER_RECOVERY`` flag when creating the device. The + ublk server may additionally specify at most one of + ``UBLK_F_USER_RECOVERY_REISSUE`` and ``UBLK_F_USER_RECOVERY_FAIL_IO`` to + modify how I/O is handled while the ublk server is dying/dead (this is called + the ``nosrv`` case in the driver code). + + With just ``UBLK_F_USER_RECOVERY`` set, after one ubq_daemon(ublk server's io handler) is dying, ublk does not delete ``/dev/ublkb*`` during the whole recovery stage and ublk device ID is kept. It is ublk server's responsibility to recover the device context by its own knowledge. Requests which have not been issued to userspace are requeued. Requests which have been issued to userspace are aborted. - With ``UBLK_F_USER_RECOVERY_REISSUE`` set, after one ubq_daemon(ublk - server's io handler) is dying, contrary to ``UBLK_F_USER_RECOVERY``, + With ``UBLK_F_USER_RECOVERY_REISSUE`` additionally set, after one ubq_daemon + (ublk server's io handler) is dying, contrary to ``UBLK_F_USER_RECOVERY``, requests which have been issued to userspace are requeued and will be re-issued to the new process after handling ``UBLK_CMD_END_USER_RECOVERY``. ``UBLK_F_USER_RECOVERY_REISSUE`` is designed for backends who tolerate double-write since the driver may issue the same I/O request twice. It might be useful to a read-only FS or a VM backend. + With ``UBLK_F_USER_RECOVERY_FAIL_IO`` additionally set, after the ublk server + exits, requests which have issued to userspace are failed, as are any + subsequently issued requests. Applications continuously issuing I/O against + devices with this flag set will see a stream of I/O errors until a new ublk + server recovers the device. + Unprivileged ublk device is supported by passing ``UBLK_F_UNPRIVILEGED_DEV``. Once the flag is set, all control commands can be sent by unprivileged user. Except for command of ``UBLK_CMD_ADD_DEV``, permission check on -- 2.50.1 From b21d948f4cc73e3296f2365c7afca721dd6893fa Mon Sep 17 00:00:00 2001 From: Greg Joyce Date: Thu, 29 Aug 2024 12:56:11 -0500 Subject: [PATCH 04/16] block: sed-opal: add ioctl IOC_OPAL_SET_SID_PW After a SED drive is provisioned, there is no way to change the SID password via the ioctl() interface. A new ioctl IOC_OPAL_SET_SID_PW will allow the password to be changed. The valid current password is required. 
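As a rough sketch of the intended userspace usage (illustrative only, not part of the patch; the set_sid_pw() helper name and device path are made up, and the struct layout is assumed to follow include/uapi/linux/sed-opal.h): the valid current SID password goes in session.opal_key and the new password in new_user_pw.opal_key, mirroring the opal_set_new_sid_pw() helper added below. Other fields are left zero and error handling is minimal.

  #include <fcntl.h>
  #include <string.h>
  #include <unistd.h>
  #include <sys/ioctl.h>
  #include <linux/sed-opal.h>

  /* Sketch: change the SID password on an already-provisioned SED drive.
   * Password lengths are assumed to fit in opal_key.key (OPAL_KEY_MAX).
   */
  static int set_sid_pw(const char *dev, const char *cur_pw, const char *new_pw)
  {
          struct opal_new_pw pw;
          int fd, ret;

          memset(&pw, 0, sizeof(pw));

          fd = open(dev, O_RDWR);
          if (fd < 0)
                  return -1;

          /* Valid current SID password, as required by the command. */
          pw.session.opal_key.key_len = strlen(cur_pw);
          memcpy(pw.session.opal_key.key, cur_pw, pw.session.opal_key.key_len);

          /* New SID password. */
          pw.new_user_pw.opal_key.key_len = strlen(new_pw);
          memcpy(pw.new_user_pw.opal_key.key, new_pw, pw.new_user_pw.opal_key.key_len);

          ret = ioctl(fd, IOC_OPAL_SET_SID_PW, &pw);
          close(fd);
          return ret;
  }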
Signed-off-by: Greg Joyce Reviewed-by: Daniel Wagner Link: https://lore.kernel.org/r/20240829175639.6478-2-gjoyce@linux.ibm.com Signed-off-by: Jens Axboe --- block/sed-opal.c | 26 ++++++++++++++++++++++++++ include/linux/sed-opal.h | 1 + include/uapi/linux/sed-opal.h | 1 + 3 files changed, 28 insertions(+) diff --git a/block/sed-opal.c b/block/sed-opal.c index 598fd3e7fcc8..5a28f23f7f22 100644 --- a/block/sed-opal.c +++ b/block/sed-opal.c @@ -3037,6 +3037,29 @@ static int opal_set_new_pw(struct opal_dev *dev, struct opal_new_pw *opal_pw) return ret; } +static int opal_set_new_sid_pw(struct opal_dev *dev, struct opal_new_pw *opal_pw) +{ + int ret; + struct opal_key *newkey = &opal_pw->new_user_pw.opal_key; + struct opal_key *oldkey = &opal_pw->session.opal_key; + + const struct opal_step pw_steps[] = { + { start_SIDASP_opal_session, oldkey }, + { set_sid_cpin_pin, newkey }, + { end_opal_session, } + }; + + if (!dev) + return -ENODEV; + + mutex_lock(&dev->dev_lock); + setup_opal_dev(dev); + ret = execute_steps(dev, pw_steps, ARRAY_SIZE(pw_steps)); + mutex_unlock(&dev->dev_lock); + + return ret; +} + static int opal_activate_user(struct opal_dev *dev, struct opal_session_info *opal_session) { @@ -3286,6 +3309,9 @@ int sed_ioctl(struct opal_dev *dev, unsigned int cmd, void __user *arg) case IOC_OPAL_DISCOVERY: ret = opal_get_discv(dev, p); break; + case IOC_OPAL_SET_SID_PW: + ret = opal_set_new_sid_pw(dev, p); + break; default: break; diff --git a/include/linux/sed-opal.h b/include/linux/sed-opal.h index 2ac50822554e..80f33a93f944 100644 --- a/include/linux/sed-opal.h +++ b/include/linux/sed-opal.h @@ -52,6 +52,7 @@ static inline bool is_sed_ioctl(unsigned int cmd) case IOC_OPAL_GET_GEOMETRY: case IOC_OPAL_DISCOVERY: case IOC_OPAL_REVERT_LSP: + case IOC_OPAL_SET_SID_PW: return true; } return false; diff --git a/include/uapi/linux/sed-opal.h b/include/uapi/linux/sed-opal.h index d3994b7716bc..9025dd5a4f0f 100644 --- a/include/uapi/linux/sed-opal.h +++ b/include/uapi/linux/sed-opal.h @@ -215,5 +215,6 @@ struct opal_revert_lsp { #define IOC_OPAL_GET_GEOMETRY _IOR('p', 238, struct opal_geometry) #define IOC_OPAL_DISCOVERY _IOW('p', 239, struct opal_discovery) #define IOC_OPAL_REVERT_LSP _IOW('p', 240, struct opal_revert_lsp) +#define IOC_OPAL_SET_SID_PW _IOW('p', 241, struct opal_new_pw) #endif /* _UAPI_SED_OPAL_H */ -- 2.50.1 From 28878733ca5abbce0d0e66476605746d9c4c9765 Mon Sep 17 00:00:00 2001 From: Julia Lawall Date: Sun, 13 Oct 2024 22:16:56 +0200 Subject: [PATCH 05/16] block: replace call_rcu by kfree_rcu for simple kmem_cache_free callback Since SLOB was removed and since commit 6c6c47b063b5 ("mm, slab: call kvfree_rcu_barrier() from kmem_cache_destroy()"), it is not necessary to use call_rcu when the callback only performs kmem_cache_free. Use kfree_rcu() directly. The changes were made using Coccinelle. Signed-off-by: Julia Lawall Link: https://lore.kernel.org/r/20241013201704.49576-10-Julia.Lawall@inria.fr Signed-off-by: Jens Axboe --- block/blk-ioc.c | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/block/blk-ioc.c b/block/blk-ioc.c index 25dd4db11121..ce82770c72ab 100644 --- a/block/blk-ioc.c +++ b/block/blk-ioc.c @@ -32,13 +32,6 @@ static void get_io_context(struct io_context *ioc) atomic_long_inc(&ioc->refcount); } -static void icq_free_icq_rcu(struct rcu_head *head) -{ - struct io_cq *icq = container_of(head, struct io_cq, __rcu_head); - - kmem_cache_free(icq->__rcu_icq_cache, icq); -} - /* * Exit an icq. 
Called with ioc locked for blk-mq, and with both ioc * and queue locked for legacy. @@ -102,7 +95,7 @@ static void ioc_destroy_icq(struct io_cq *icq) */ icq->__rcu_icq_cache = et->icq_cache; icq->flags |= ICQ_DESTROYED; - call_rcu(&icq->__rcu_head, icq_free_icq_rcu); + kfree_rcu(icq, __rcu_head); } /* -- 2.50.1 From 732312e1836b9bd80828cda6417df203c1d94218 Mon Sep 17 00:00:00 2001 From: Xiuhong Wang Date: Wed, 16 Oct 2024 10:45:08 +0800 Subject: [PATCH 06/16] Revert "blk-throttle: Fix IO hang for a corner case" This reverts commit 5b7048b89745c3c5fb4b3080fb7bced61dba2a2b. The main purpose of this patch is cleanup. The throtl_adjusted_limit function was removed after commit bf20ab538c81 ("blk-throttle: remove CONFIG_BLK_DEV_THROTTLING_LOW"), so the problem of not being able to scale after setting bps or iops to 1 will not occur. So revert this commit that bps/iops can be set to 1. Cc: Baolin Wang Cc: Yu Kuai Signed-off-by: Xiuhong Wang Signed-off-by: Zhiguo Niu Acked-by: Tejun Heo Reviewed-by: Yu Kuai Link: https://lore.kernel.org/r/20241016024508.3340330-1-xiuhong.wang@unisoc.com Signed-off-by: Jens Axboe --- block/blk-throttle.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/block/blk-throttle.c b/block/blk-throttle.c index 2c4192e12efa..443d1f47c2ce 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -1485,13 +1485,13 @@ static ssize_t tg_set_limit(struct kernfs_open_file *of, goto out_finish; ret = -EINVAL; - if (!strcmp(tok, "rbps") && val > 1) + if (!strcmp(tok, "rbps")) v[0] = val; - else if (!strcmp(tok, "wbps") && val > 1) + else if (!strcmp(tok, "wbps")) v[1] = val; - else if (!strcmp(tok, "riops") && val > 1) + else if (!strcmp(tok, "riops")) v[2] = min_t(u64, val, UINT_MAX); - else if (!strcmp(tok, "wiops") && val > 1) + else if (!strcmp(tok, "wiops")) v[3] = min_t(u64, val, UINT_MAX); else goto out_finish; -- 2.50.1 From 2003ee8a9aa14d766b06088156978d53c2e9be3d Mon Sep 17 00:00:00 2001 From: Muchun Song Date: Mon, 14 Oct 2024 17:29:32 +0800 Subject: [PATCH 07/16] block: fix missing dispatching request when queue is started or unquiesced Supposing the following scenario with a virtio_blk driver. CPU0 CPU1 CPU2 blk_mq_try_issue_directly() __blk_mq_issue_directly() q->mq_ops->queue_rq() virtio_queue_rq() blk_mq_stop_hw_queue() virtblk_done() blk_mq_try_issue_directly() if (blk_mq_hctx_stopped()) blk_mq_request_bypass_insert() blk_mq_run_hw_queue() blk_mq_run_hw_queue() blk_mq_run_hw_queue() blk_mq_insert_request() return After CPU0 has marked the queue as stopped, CPU1 will see the queue is stopped. But before CPU1 puts the request on the dispatch list, CPU2 receives the interrupt of completion of request, so it will run the hardware queue and marks the queue as non-stopped. Meanwhile, CPU1 also runs the same hardware queue. After both CPU1 and CPU2 complete blk_mq_run_hw_queue(), CPU1 just puts the request to the same hardware queue and returns. It misses dispatching a request. Fix it by running the hardware queue explicitly. And blk_mq_request_issue_directly() should handle a similar situation. Fix it as well. 
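The shape of the fix, condensed into a sketch (not a verbatim copy of the hunks below): when the direct-issue path falls back to inserting the request because the hctx looks stopped or the queue quiesced, it must also run the queue itself, because a concurrent restart may already have run the queue and found nothing to dispatch.

  if (blk_mq_hctx_stopped(hctx) || blk_queue_quiesced(rq->q)) {
          blk_mq_insert_request(rq, 0);
          /*
           * A completion on another CPU may have restarted and run the
           * queue before the insert above; run it again here so this
           * request is not left stranded on the dispatch list.
           */
          blk_mq_run_hw_queue(hctx, false);
          return;
  }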
Fixes: d964f04a8fde ("blk-mq: fix direct issue") Cc: stable@vger.kernel.org Cc: Muchun Song Signed-off-by: Muchun Song Reviewed-by: Ming Lei Link: https://lore.kernel.org/r/20241014092934.53630-2-songmuchun@bytedance.com Signed-off-by: Jens Axboe --- block/blk-mq.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/block/blk-mq.c b/block/blk-mq.c index 7d05a56e3639..5deb9dffca0a 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -2647,6 +2647,7 @@ static void blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx, if (blk_mq_hctx_stopped(hctx) || blk_queue_quiesced(rq->q)) { blk_mq_insert_request(rq, 0); + blk_mq_run_hw_queue(hctx, false); return; } @@ -2677,6 +2678,7 @@ static blk_status_t blk_mq_request_issue_directly(struct request *rq, bool last) if (blk_mq_hctx_stopped(hctx) || blk_queue_quiesced(rq->q)) { blk_mq_insert_request(rq, 0); + blk_mq_run_hw_queue(hctx, false); return BLK_STS_OK; } -- 2.50.1 From 6bda857bcbb86fb9d0e54fbef93a093d51172acc Mon Sep 17 00:00:00 2001 From: Muchun Song Date: Mon, 14 Oct 2024 17:29:33 +0800 Subject: [PATCH 08/16] block: fix ordering between checking QUEUE_FLAG_QUIESCED request adding Supposing the following scenario. CPU0 CPU1 blk_mq_insert_request() 1) store blk_mq_unquiesce_queue() blk_queue_flag_clear() 3) store blk_mq_run_hw_queues() blk_mq_run_hw_queue() if (!blk_mq_hctx_has_pending()) 4) load return blk_mq_run_hw_queue() if (blk_queue_quiesced()) 2) load return blk_mq_sched_dispatch_requests() The full memory barrier should be inserted between 1) and 2), as well as between 3) and 4) to make sure that either CPU0 sees QUEUE_FLAG_QUIESCED is cleared or CPU1 sees dispatch list or setting of bitmap of software queue. Otherwise, either CPU will not rerun the hardware queue causing starvation. So the first solution is to 1) add a pair of memory barrier to fix the problem, another solution is to 2) use hctx->queue->queue_lock to synchronize QUEUE_FLAG_QUIESCED. Here, we chose 2) to fix it since memory barrier is not easy to be maintained. Fixes: f4560ffe8cec ("blk-mq: use QUEUE_FLAG_QUIESCED to quiesce queue") Cc: stable@vger.kernel.org Cc: Muchun Song Signed-off-by: Muchun Song Reviewed-by: Ming Lei Link: https://lore.kernel.org/r/20241014092934.53630-3-songmuchun@bytedance.com Signed-off-by: Jens Axboe --- block/blk-mq.c | 47 ++++++++++++++++++++++++++++++++++------------- 1 file changed, 34 insertions(+), 13 deletions(-) diff --git a/block/blk-mq.c b/block/blk-mq.c index 5deb9dffca0a..bb4ee2380dce 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -2227,6 +2227,24 @@ void blk_mq_delay_run_hw_queue(struct blk_mq_hw_ctx *hctx, unsigned long msecs) } EXPORT_SYMBOL(blk_mq_delay_run_hw_queue); +static inline bool blk_mq_hw_queue_need_run(struct blk_mq_hw_ctx *hctx) +{ + bool need_run; + + /* + * When queue is quiesced, we may be switching io scheduler, or + * updating nr_hw_queues, or other things, and we can't run queue + * any more, even blk_mq_hctx_has_pending() can't be called safely. + * + * And queue will be rerun in blk_mq_unquiesce_queue() if it is + * quiesced. + */ + __blk_mq_run_dispatch_ops(hctx->queue, false, + need_run = !blk_queue_quiesced(hctx->queue) && + blk_mq_hctx_has_pending(hctx)); + return need_run; +} + /** * blk_mq_run_hw_queue - Start to run a hardware queue. * @hctx: Pointer to the hardware queue to run. 
@@ -2247,20 +2265,23 @@ void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async) might_sleep_if(!async && hctx->flags & BLK_MQ_F_BLOCKING); - /* - * When queue is quiesced, we may be switching io scheduler, or - * updating nr_hw_queues, or other things, and we can't run queue - * any more, even __blk_mq_hctx_has_pending() can't be called safely. - * - * And queue will be rerun in blk_mq_unquiesce_queue() if it is - * quiesced. - */ - __blk_mq_run_dispatch_ops(hctx->queue, false, - need_run = !blk_queue_quiesced(hctx->queue) && - blk_mq_hctx_has_pending(hctx)); + need_run = blk_mq_hw_queue_need_run(hctx); + if (!need_run) { + unsigned long flags; - if (!need_run) - return; + /* + * Synchronize with blk_mq_unquiesce_queue(), because we check + * if hw queue is quiesced locklessly above, we need the use + * ->queue_lock to make sure we see the up-to-date status to + * not miss rerunning the hw queue. + */ + spin_lock_irqsave(&hctx->queue->queue_lock, flags); + need_run = blk_mq_hw_queue_need_run(hctx); + spin_unlock_irqrestore(&hctx->queue->queue_lock, flags); + + if (!need_run) + return; + } if (async || !cpumask_test_cpu(raw_smp_processor_id(), hctx->cpumask)) { blk_mq_delay_run_hw_queue(hctx, 0); -- 2.50.1 From 96a9fe64bfd486ebeeacf1e6011801ffe89dae18 Mon Sep 17 00:00:00 2001 From: Muchun Song Date: Mon, 14 Oct 2024 17:29:34 +0800 Subject: [PATCH 09/16] block: fix ordering between checking BLK_MQ_S_STOPPED request adding Supposing first scenario with a virtio_blk driver. CPU0 CPU1 blk_mq_try_issue_directly() __blk_mq_issue_directly() q->mq_ops->queue_rq() virtio_queue_rq() blk_mq_stop_hw_queue() virtblk_done() blk_mq_request_bypass_insert() 1) store blk_mq_start_stopped_hw_queue() clear_bit(BLK_MQ_S_STOPPED) 3) store blk_mq_run_hw_queue() if (!blk_mq_hctx_has_pending()) 4) load return blk_mq_sched_dispatch_requests() blk_mq_run_hw_queue() if (!blk_mq_hctx_has_pending()) return blk_mq_sched_dispatch_requests() if (blk_mq_hctx_stopped()) 2) load return __blk_mq_sched_dispatch_requests() Supposing another scenario. CPU0 CPU1 blk_mq_requeue_work() blk_mq_insert_request() 1) store virtblk_done() blk_mq_start_stopped_hw_queue() blk_mq_run_hw_queues() clear_bit(BLK_MQ_S_STOPPED) 3) store blk_mq_run_hw_queue() if (!blk_mq_hctx_has_pending()) 4) load return blk_mq_sched_dispatch_requests() if (blk_mq_hctx_stopped()) 2) load continue blk_mq_run_hw_queue() Both scenarios are similar, the full memory barrier should be inserted between 1) and 2), as well as between 3) and 4) to make sure that either CPU0 sees BLK_MQ_S_STOPPED is cleared or CPU1 sees dispatch list. Otherwise, either CPU will not rerun the hardware queue causing starvation of the request. The easy way to fix it is to add the essential full memory barrier into helper of blk_mq_hctx_stopped(). In order to not affect the fast path (hardware queue is not stopped most of the time), we only insert the barrier into the slow path. Actually, only slow path needs to care about missing of dispatching the request to the low-level device driver. 
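The resulting barrier pairing, condensed from the two hunks below into one picture (sketch only, using the 1)-4) labels from the scenarios above):

  /* Restart side, blk_mq_start_stopped_hw_queue(): */
  clear_bit(BLK_MQ_S_STOPPED, &hctx->state);
  /* order 3) the clear above against 4) the dispatch-list check */
  smp_mb__after_atomic();
  blk_mq_run_hw_queue(hctx, async);

  /* Submit side, blk_mq_hctx_stopped(): */
  if (likely(!test_bit(BLK_MQ_S_STOPPED, &hctx->state)))
          return false;   /* fast path: no barrier cost when not stopped */
  /* order 1) the dispatch-list add against 2) this re-check of STOPPED */
  smp_mb();
  return test_bit(BLK_MQ_S_STOPPED, &hctx->state);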
Fixes: 320ae51feed5 ("blk-mq: new multi-queue block IO queueing mechanism") Cc: stable@vger.kernel.org Cc: Muchun Song Signed-off-by: Muchun Song Reviewed-by: Ming Lei Link: https://lore.kernel.org/r/20241014092934.53630-4-songmuchun@bytedance.com Signed-off-by: Jens Axboe --- block/blk-mq.c | 6 ++++++ block/blk-mq.h | 13 +++++++++++++ 2 files changed, 19 insertions(+) diff --git a/block/blk-mq.c b/block/blk-mq.c index bb4ee2380dce..022653320c18 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -2438,6 +2438,12 @@ void blk_mq_start_stopped_hw_queue(struct blk_mq_hw_ctx *hctx, bool async) return; clear_bit(BLK_MQ_S_STOPPED, &hctx->state); + /* + * Pairs with the smp_mb() in blk_mq_hctx_stopped() to order the + * clearing of BLK_MQ_S_STOPPED above and the checking of dispatch + * list in the subsequent routine. + */ + smp_mb__after_atomic(); blk_mq_run_hw_queue(hctx, async); } EXPORT_SYMBOL_GPL(blk_mq_start_stopped_hw_queue); diff --git a/block/blk-mq.h b/block/blk-mq.h index 3bd43b10032f..f4ac1af77a26 100644 --- a/block/blk-mq.h +++ b/block/blk-mq.h @@ -230,6 +230,19 @@ static inline struct blk_mq_tags *blk_mq_tags_from_data(struct blk_mq_alloc_data static inline bool blk_mq_hctx_stopped(struct blk_mq_hw_ctx *hctx) { + /* Fast path: hardware queue is not stopped most of the time. */ + if (likely(!test_bit(BLK_MQ_S_STOPPED, &hctx->state))) + return false; + + /* + * This barrier is used to order adding of dispatch list before and + * the test of BLK_MQ_S_STOPPED below. Pairs with the memory barrier + * in blk_mq_start_stopped_hw_queue() so that dispatch code could + * either see BLK_MQ_S_STOPPED is cleared or dispatch list is not + * empty to avoid missing dispatching requests. + */ + smp_mb(); + return test_bit(BLK_MQ_S_STOPPED, &hctx->state); } -- 2.50.1 From 919b5139bd1d557a4d4cd4b2466e2440dda65484 Mon Sep 17 00:00:00 2001 From: Li Lingfeng Date: Sat, 17 Aug 2024 15:11:08 +0800 Subject: [PATCH 10/16] block: flush all throttled bios when deleting the cgroup When a process migrates to another cgroup and the original cgroup is deleted, the restrictions of throttled bios cannot be removed. If the restrictions are set too low, it will take a long time to complete these bios. Refer to the process of deleting a disk to remove the restrictions and issue bios when deleting the cgroup. This makes difference on the behavior of throttled bios: Before: the limit of the throttled bios can't be changed and the bios will complete under this limit; Now: the limit will be canceled and the throttled bios will be flushed immediately. 
References: [1] https://lore.kernel.org/r/20220318130144.1066064-4-ming.lei@redhat.com [2] https://lore.kernel.org/all/da861d63-58c6-3ca0-2535-9089993e9e28@huaweicloud.com/ Signed-off-by: Li Lingfeng Acked-by: Tejun Heo Link: https://lore.kernel.org/r/20240817071108.1919729-1-lilingfeng@huaweicloud.com Signed-off-by: Jens Axboe --- block/blk-throttle.c | 68 ++++++++++++++++++++++++++++---------------- 1 file changed, 44 insertions(+), 24 deletions(-) diff --git a/block/blk-throttle.c b/block/blk-throttle.c index 443d1f47c2ce..82dbaefcfa3b 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -1526,6 +1526,42 @@ static void throtl_shutdown_wq(struct request_queue *q) cancel_work_sync(&td->dispatch_work); } +static void tg_flush_bios(struct throtl_grp *tg) +{ + struct throtl_service_queue *sq = &tg->service_queue; + + if (tg->flags & THROTL_TG_CANCELING) + return; + /* + * Set the flag to make sure throtl_pending_timer_fn() won't + * stop until all throttled bios are dispatched. + */ + tg->flags |= THROTL_TG_CANCELING; + + /* + * Do not dispatch cgroup without THROTL_TG_PENDING or cgroup + * will be inserted to service queue without THROTL_TG_PENDING + * set in tg_update_disptime below. Then IO dispatched from + * child in tg_dispatch_one_bio will trigger double insertion + * and corrupt the tree. + */ + if (!(tg->flags & THROTL_TG_PENDING)) + return; + + /* + * Update disptime after setting the above flag to make sure + * throtl_select_dispatch() won't exit without dispatching. + */ + tg_update_disptime(tg); + + throtl_schedule_pending_timer(sq, jiffies + 1); +} + +static void throtl_pd_offline(struct blkg_policy_data *pd) +{ + tg_flush_bios(pd_to_tg(pd)); +} + struct blkcg_policy blkcg_policy_throtl = { .dfl_cftypes = throtl_files, .legacy_cftypes = throtl_legacy_files, @@ -1533,6 +1569,7 @@ struct blkcg_policy blkcg_policy_throtl = { .pd_alloc_fn = throtl_pd_alloc, .pd_init_fn = throtl_pd_init, .pd_online_fn = throtl_pd_online, + .pd_offline_fn = throtl_pd_offline, .pd_free_fn = throtl_pd_free, }; @@ -1553,32 +1590,15 @@ void blk_throtl_cancel_bios(struct gendisk *disk) */ rcu_read_lock(); blkg_for_each_descendant_post(blkg, pos_css, q->root_blkg) { - struct throtl_grp *tg = blkg_to_tg(blkg); - struct throtl_service_queue *sq = &tg->service_queue; - - /* - * Set the flag to make sure throtl_pending_timer_fn() won't - * stop until all throttled bios are dispatched. - */ - tg->flags |= THROTL_TG_CANCELING; - /* - * Do not dispatch cgroup without THROTL_TG_PENDING or cgroup - * will be inserted to service queue without THROTL_TG_PENDING - * set in tg_update_disptime below. Then IO dispatched from - * child in tg_dispatch_one_bio will trigger double insertion - * and corrupt the tree. + * disk_release will call pd_offline_fn to cancel bios. + * However, disk_release can't be called if someone get + * the refcount of device and issued bios which are + * inflight after del_gendisk. + * Cancel bios here to ensure no bios are inflight after + * del_gendisk. */ - if (!(tg->flags & THROTL_TG_PENDING)) - continue; - - /* - * Update disptime after setting the above flag to make sure - * throtl_select_dispatch() won't exit without dispatching. 
- */ - tg_update_disptime(tg); - - throtl_schedule_pending_timer(sq, jiffies + 1); + tg_flush_bios(blkg_to_tg(blkg)); } rcu_read_unlock(); spin_unlock_irq(&q->queue_lock); -- 2.50.1 From 904ebd2527c507752f5ddb358f887d2e0dab96a0 Mon Sep 17 00:00:00 2001 From: Muchun Song Date: Mon, 21 Oct 2024 16:52:51 +0800 Subject: [PATCH 11/16] block: remove redundant explicit memory barrier from rq_qos waiter and waker The memory barriers in list_del_init_careful() and list_empty_careful() in pairs already handle the proper ordering between data.got_token and data.wq.entry. So remove the redundant explicit barriers. And also change a "break" statement to "return" to avoid redundant calling of finish_wait(). Signed-off-by: Muchun Song Reviewed-by: Chengming Zhou Reviewed-by: Omar Sandoval Link: https://lore.kernel.org/r/20241021085251.73353-1-songmuchun@bytedance.com Signed-off-by: Jens Axboe --- block/blk-rq-qos.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/block/blk-rq-qos.c b/block/blk-rq-qos.c index 058f92c4f9d5..eb9618cd68ad 100644 --- a/block/blk-rq-qos.c +++ b/block/blk-rq-qos.c @@ -218,7 +218,6 @@ static int rq_qos_wake_function(struct wait_queue_entry *curr, return -1; data->got_token = true; - smp_wmb(); wake_up_process(data->task); list_del_init_careful(&curr->entry); return 1; @@ -274,10 +273,9 @@ void rq_qos_wait(struct rq_wait *rqw, void *private_data, * which means we now have two. Put our local token * and wake anyone else potentially waiting for one. */ - smp_rmb(); if (data.got_token) cleanup_cb(rqw, private_data); - break; + return; } io_schedule(); has_sleeper = true; -- 2.50.1 From ccd9e252c515ac5a3ed04a414c95d1307d17f159 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Tue, 22 Oct 2024 11:16:17 -0700 Subject: [PATCH 12/16] blk-mq: Make blk_mq_quiesce_tagset() hold the tag list mutex less long Make sure that the tag_list_lock mutex is not held any longer than necessary. This change reduces latency if e.g. blk_mq_quiesce_tagset() is called concurrently from more than one thread. This function is used by the NVMe core and also by the UFS driver. Reported-by: Peter Wang Cc: Chao Leng Cc: Ming Lei Cc: stable@vger.kernel.org Fixes: 414dd48e882c ("blk-mq: add tagset quiesce interface") Signed-off-by: Bart Van Assche Reviewed-by: Keith Busch Link: https://lore.kernel.org/r/20241022181617.2716173-1-bvanassche@acm.org Signed-off-by: Jens Axboe --- block/blk-mq.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/block/blk-mq.c b/block/blk-mq.c index 022653320c18..e39319ff4ecb 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -283,8 +283,9 @@ void blk_mq_quiesce_tagset(struct blk_mq_tag_set *set) if (!blk_queue_skip_tagset_quiesce(q)) blk_mq_quiesce_queue_nowait(q); } - blk_mq_wait_quiesce_done(set); mutex_unlock(&set->tag_list_lock); + + blk_mq_wait_quiesce_done(set); } EXPORT_SYMBOL_GPL(blk_mq_quiesce_tagset); -- 2.50.1 From e203e20a8b2b67eb12f87aa3ee625cda5e686434 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Wed, 23 Oct 2024 13:28:50 -0700 Subject: [PATCH 13/16] blk-mq: Unexport blk_mq_flush_busy_ctxs() Commit a6088845c2bf ("block: kyber: make kyber more friendly with merging") removed the only blk_mq_flush_busy_ctxs() call from outside the block layer core. Hence unexport blk_mq_flush_busy_ctxs(). 
Signed-off-by: Bart Van Assche Reviewed-by: Chaitanya Kulkarni Link: https://lore.kernel.org/r/20241023202850.3469279-1-bvanassche@acm.org Signed-off-by: Jens Axboe --- block/blk-mq.c | 1 - 1 file changed, 1 deletion(-) diff --git a/block/blk-mq.c b/block/blk-mq.c index e39319ff4ecb..aae9a112c913 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -1726,7 +1726,6 @@ void blk_mq_flush_busy_ctxs(struct blk_mq_hw_ctx *hctx, struct list_head *list) sbitmap_for_each_set(&hctx->ctx_map, flush_busy_ctx, &data); } -EXPORT_SYMBOL_GPL(blk_mq_flush_busy_ctxs); struct dispatch_rq_data { struct blk_mq_hw_ctx *hctx; -- 2.50.1 From 8acdd0e7bfadda6b5103f2960d293581954454ed Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Fri, 25 Oct 2024 08:37:18 +0800 Subject: [PATCH 14/16] blk-mq: add non_owner variant of start_freeze/unfreeze queue APIs Add non_owner variant of start_freeze/unfreeze queue APIs, so that the caller knows that what they are doing, and we can skip lockdep support for non_owner variant in per-call level. Prepare for supporting lockdep for freezing/unfreezing queue. Reviewed-by: Christoph Hellwig Suggested-by: Christoph Hellwig Signed-off-by: Ming Lei Link: https://lore.kernel.org/r/20241025003722.3630252-2-ming.lei@redhat.com Signed-off-by: Jens Axboe --- block/blk-mq.c | 20 ++++++++++++++++++++ include/linux/blk-mq.h | 2 ++ 2 files changed, 22 insertions(+) diff --git a/block/blk-mq.c b/block/blk-mq.c index aae9a112c913..770276815507 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -196,6 +196,26 @@ void blk_mq_unfreeze_queue(struct request_queue *q) } EXPORT_SYMBOL_GPL(blk_mq_unfreeze_queue); +/* + * non_owner variant of blk_freeze_queue_start + * + * Unlike blk_freeze_queue_start, the queue doesn't need to be unfrozen + * by the same task. This is fragile and should not be used if at all + * possible. + */ +void blk_freeze_queue_start_non_owner(struct request_queue *q) +{ + blk_freeze_queue_start(q); +} +EXPORT_SYMBOL_GPL(blk_freeze_queue_start_non_owner); + +/* non_owner variant of blk_mq_unfreeze_queue */ +void blk_mq_unfreeze_queue_non_owner(struct request_queue *q) +{ + __blk_mq_unfreeze_queue(q, false); +} +EXPORT_SYMBOL_GPL(blk_mq_unfreeze_queue_non_owner); + /* * FIXME: replace the scsi_internal_device_*block_nowait() calls in the * mpt3sas driver such that this function can be removed. diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index 59e9adf815a4..2035fad3131f 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -919,6 +919,8 @@ void blk_freeze_queue_start(struct request_queue *q); void blk_mq_freeze_queue_wait(struct request_queue *q); int blk_mq_freeze_queue_wait_timeout(struct request_queue *q, unsigned long timeout); +void blk_mq_unfreeze_queue_non_owner(struct request_queue *q); +void blk_freeze_queue_start_non_owner(struct request_queue *q); void blk_mq_map_queues(struct blk_mq_queue_map *qmap); void blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set, int nr_hw_queues); -- 2.50.1 From 6b6f6c41c8ac9b5ef758f16b793e1fd998cd25b4 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Fri, 25 Oct 2024 08:37:19 +0800 Subject: [PATCH 15/16] nvme: core: switch to non_owner variant of start_freeze/unfreeze queue nvme_start_freeze() and nvme_unfreeze() may be called from same context, so switch them to call non_owner variant of start_freeze/unfreeze queue. 
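To make that calling pattern concrete, here is a sketch of the driver-side shape that motivates the non_owner variants added in the previous patch (hypothetical drv_* functions, not nvme code): the freeze is started by one task and released by another, so the owner-tracked variants would be the wrong annotation once lockdep support is added.

  /* e.g. from a timeout handler: this task only starts the freeze ... */
  static void drv_handle_timeout(struct request_queue *q)
  {
          blk_freeze_queue_start_non_owner(q);
          /* schedule reset work, which runs in a different task */
  }

  /* ... and the reset work, in that other task, ends it. */
  static void drv_reset_work(struct request_queue *q)
  {
          /* re-initialize the controller, requeue or fail outstanding I/O */
          blk_mq_unfreeze_queue_non_owner(q);
  }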
Reviewed-by: Christoph Hellwig Signed-off-by: Ming Lei Link: https://lore.kernel.org/r/20241025003722.3630252-3-ming.lei@redhat.com Signed-off-by: Jens Axboe --- drivers/nvme/host/core.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 84cb859a911d..3de7555a7de7 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -4864,7 +4864,7 @@ void nvme_unfreeze(struct nvme_ctrl *ctrl) srcu_idx = srcu_read_lock(&ctrl->srcu); list_for_each_entry_rcu(ns, &ctrl->namespaces, list) - blk_mq_unfreeze_queue(ns->queue); + blk_mq_unfreeze_queue_non_owner(ns->queue); srcu_read_unlock(&ctrl->srcu, srcu_idx); clear_bit(NVME_CTRL_FROZEN, &ctrl->flags); } @@ -4906,7 +4906,12 @@ void nvme_start_freeze(struct nvme_ctrl *ctrl) set_bit(NVME_CTRL_FROZEN, &ctrl->flags); srcu_idx = srcu_read_lock(&ctrl->srcu); list_for_each_entry_rcu(ns, &ctrl->namespaces, list) - blk_freeze_queue_start(ns->queue); + /* + * Typical non_owner use case is from pci driver, in which + * start_freeze is called from timeout work function, but + * unfreeze is done in reset work context + */ + blk_freeze_queue_start_non_owner(ns->queue); srcu_read_unlock(&ctrl->srcu, srcu_idx); } EXPORT_SYMBOL_GPL(nvme_start_freeze); -- 2.50.1 From f1be1788a32e8fa63416ad4518bbd1a85a825c9d Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Fri, 25 Oct 2024 08:37:20 +0800 Subject: [PATCH 16/16] block: model freeze & enter queue as lock for supporting lockdep Recently we got several deadlock report[1][2][3] caused by blk_mq_freeze_queue and blk_enter_queue(). Turns out the two are just like acquiring read/write lock, so model them as read/write lock for supporting lockdep: 1) model q->q_usage_counter as two locks(io and queue lock) - queue lock covers sync with blk_enter_queue() - io lock covers sync with bio_enter_queue() 2) make the lockdep class/key as per-queue: - different subsystem has very different lock use pattern, shared lock class causes false positive easily - freeze_queue degrades to no lock in case that disk state becomes DEAD because bio_enter_queue() won't be blocked any more - freeze_queue degrades to no lock in case that request queue becomes dying because blk_enter_queue() won't be blocked any more 3) model blk_mq_freeze_queue() as acquire_exclusive & try_lock - it is exclusive lock, so dependency with blk_enter_queue() is covered - it is trylock because blk_mq_freeze_queue() are allowed to run concurrently 4) model blk_enter_queue() & bio_enter_queue() as acquire_read() - nested blk_enter_queue() are allowed - dependency with blk_mq_freeze_queue() is covered - blk_queue_exit() is often called from other contexts(such as irq), and it can't be annotated as lock_release(), so simply do it in blk_enter_queue(), this way still covered cases as many as possible With lockdep support, such kind of reports may be reported asap and needn't wait until the real deadlock is triggered. For example, lockdep report can be triggered in the report[3] with this patch applied. 
[1] occasional block layer hang when setting 'echo noop > /sys/block/sda/queue/scheduler' https://bugzilla.kernel.org/show_bug.cgi?id=219166 [2] del_gendisk() vs blk_queue_enter() race condition https://lore.kernel.org/linux-block/20241003085610.GK11458@google.com/ [3] queue_freeze & queue_enter deadlock in scsi https://lore.kernel.org/linux-block/ZxG38G9BuFdBpBHZ@fedora/T/#u Reviewed-by: Christoph Hellwig Signed-off-by: Ming Lei Link: https://lore.kernel.org/r/20241025003722.3630252-4-ming.lei@redhat.com Signed-off-by: Jens Axboe --- block/blk-core.c | 18 ++++++++++++++++-- block/blk-mq.c | 26 ++++++++++++++++++++++---- block/blk.h | 29 ++++++++++++++++++++++++++--- block/genhd.c | 15 +++++++++++---- include/linux/blkdev.h | 6 ++++++ 5 files changed, 81 insertions(+), 13 deletions(-) diff --git a/block/blk-core.c b/block/blk-core.c index bc5e8c5eaac9..09d10bb95fda 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -261,6 +261,8 @@ static void blk_free_queue(struct request_queue *q) blk_mq_release(q); ida_free(&blk_queue_ida, q->id); + lockdep_unregister_key(&q->io_lock_cls_key); + lockdep_unregister_key(&q->q_lock_cls_key); call_rcu(&q->rcu_head, blk_free_queue_rcu); } @@ -278,18 +280,20 @@ void blk_put_queue(struct request_queue *q) } EXPORT_SYMBOL(blk_put_queue); -void blk_queue_start_drain(struct request_queue *q) +bool blk_queue_start_drain(struct request_queue *q) { /* * When queue DYING flag is set, we need to block new req * entering queue, so we call blk_freeze_queue_start() to * prevent I/O from crossing blk_queue_enter(). */ - blk_freeze_queue_start(q); + bool freeze = __blk_freeze_queue_start(q); if (queue_is_mq(q)) blk_mq_wake_waiters(q); /* Make blk_queue_enter() reexamine the DYING flag. */ wake_up_all(&q->mq_freeze_wq); + + return freeze; } /** @@ -321,6 +325,8 @@ int blk_queue_enter(struct request_queue *q, blk_mq_req_flags_t flags) return -ENODEV; } + rwsem_acquire_read(&q->q_lockdep_map, 0, 0, _RET_IP_); + rwsem_release(&q->q_lockdep_map, _RET_IP_); return 0; } @@ -352,6 +358,8 @@ int __bio_queue_enter(struct request_queue *q, struct bio *bio) goto dead; } + rwsem_acquire_read(&q->io_lockdep_map, 0, 0, _RET_IP_); + rwsem_release(&q->io_lockdep_map, _RET_IP_); return 0; dead: bio_io_error(bio); @@ -441,6 +449,12 @@ struct request_queue *blk_alloc_queue(struct queue_limits *lim, int node_id) PERCPU_REF_INIT_ATOMIC, GFP_KERNEL); if (error) goto fail_stats; + lockdep_register_key(&q->io_lock_cls_key); + lockdep_register_key(&q->q_lock_cls_key); + lockdep_init_map(&q->io_lockdep_map, "&q->q_usage_counter(io)", + &q->io_lock_cls_key, 0); + lockdep_init_map(&q->q_lockdep_map, "&q->q_usage_counter(queue)", + &q->q_lock_cls_key, 0); q->nr_requests = BLKDEV_DEFAULT_RQ; diff --git a/block/blk-mq.c b/block/blk-mq.c index 770276815507..4ae7eb335fbd 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -120,17 +120,29 @@ void blk_mq_in_flight_rw(struct request_queue *q, struct block_device *part, inflight[1] = mi.inflight[1]; } -void blk_freeze_queue_start(struct request_queue *q) +bool __blk_freeze_queue_start(struct request_queue *q) { + int freeze; + mutex_lock(&q->mq_freeze_lock); if (++q->mq_freeze_depth == 1) { percpu_ref_kill(&q->q_usage_counter); mutex_unlock(&q->mq_freeze_lock); if (queue_is_mq(q)) blk_mq_run_hw_queues(q, false); + freeze = true; } else { mutex_unlock(&q->mq_freeze_lock); + freeze = false; } + + return freeze; +} + +void blk_freeze_queue_start(struct request_queue *q) +{ + if (__blk_freeze_queue_start(q)) + blk_freeze_acquire_lock(q, false, false); } 
EXPORT_SYMBOL_GPL(blk_freeze_queue_start); @@ -176,8 +188,10 @@ void blk_mq_freeze_queue(struct request_queue *q) } EXPORT_SYMBOL_GPL(blk_mq_freeze_queue); -void __blk_mq_unfreeze_queue(struct request_queue *q, bool force_atomic) +bool __blk_mq_unfreeze_queue(struct request_queue *q, bool force_atomic) { + int unfreeze = false; + mutex_lock(&q->mq_freeze_lock); if (force_atomic) q->q_usage_counter.data->force_atomic = true; @@ -186,13 +200,17 @@ void __blk_mq_unfreeze_queue(struct request_queue *q, bool force_atomic) if (!q->mq_freeze_depth) { percpu_ref_resurrect(&q->q_usage_counter); wake_up_all(&q->mq_freeze_wq); + unfreeze = true; } mutex_unlock(&q->mq_freeze_lock); + + return unfreeze; } void blk_mq_unfreeze_queue(struct request_queue *q) { - __blk_mq_unfreeze_queue(q, false); + if (__blk_mq_unfreeze_queue(q, false)) + blk_unfreeze_release_lock(q, false, false); } EXPORT_SYMBOL_GPL(blk_mq_unfreeze_queue); @@ -205,7 +223,7 @@ EXPORT_SYMBOL_GPL(blk_mq_unfreeze_queue); */ void blk_freeze_queue_start_non_owner(struct request_queue *q) { - blk_freeze_queue_start(q); + __blk_freeze_queue_start(q); } EXPORT_SYMBOL_GPL(blk_freeze_queue_start_non_owner); diff --git a/block/blk.h b/block/blk.h index 8fddaf6eae49..63d5df0dc29c 100644 --- a/block/blk.h +++ b/block/blk.h @@ -4,6 +4,7 @@ #include #include +#include #include /* for max_pfn/max_low_pfn */ #include #include @@ -35,8 +36,9 @@ struct blk_flush_queue *blk_alloc_flush_queue(int node, int cmd_size, void blk_free_flush_queue(struct blk_flush_queue *q); void blk_freeze_queue(struct request_queue *q); -void __blk_mq_unfreeze_queue(struct request_queue *q, bool force_atomic); -void blk_queue_start_drain(struct request_queue *q); +bool __blk_mq_unfreeze_queue(struct request_queue *q, bool force_atomic); +bool blk_queue_start_drain(struct request_queue *q); +bool __blk_freeze_queue_start(struct request_queue *q); int __bio_queue_enter(struct request_queue *q, struct bio *bio); void submit_bio_noacct_nocheck(struct bio *bio); void bio_await_chain(struct bio *bio); @@ -69,8 +71,11 @@ static inline int bio_queue_enter(struct bio *bio) { struct request_queue *q = bdev_get_queue(bio->bi_bdev); - if (blk_try_enter_queue(q, false)) + if (blk_try_enter_queue(q, false)) { + rwsem_acquire_read(&q->io_lockdep_map, 0, 0, _RET_IP_); + rwsem_release(&q->io_lockdep_map, _RET_IP_); return 0; + } return __bio_queue_enter(q, bio); } @@ -724,4 +729,22 @@ void blk_integrity_verify(struct bio *bio); void blk_integrity_prepare(struct request *rq); void blk_integrity_complete(struct request *rq, unsigned int nr_bytes); +static inline void blk_freeze_acquire_lock(struct request_queue *q, bool + disk_dead, bool queue_dying) +{ + if (!disk_dead) + rwsem_acquire(&q->io_lockdep_map, 0, 1, _RET_IP_); + if (!queue_dying) + rwsem_acquire(&q->q_lockdep_map, 0, 1, _RET_IP_); +} + +static inline void blk_unfreeze_release_lock(struct request_queue *q, bool + disk_dead, bool queue_dying) +{ + if (!queue_dying) + rwsem_release(&q->q_lockdep_map, _RET_IP_); + if (!disk_dead) + rwsem_release(&q->io_lockdep_map, _RET_IP_); +} + #endif /* BLK_INTERNAL_H */ diff --git a/block/genhd.c b/block/genhd.c index bc30eee7ab16..dfee66146bd1 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -601,13 +601,13 @@ static void blk_report_disk_dead(struct gendisk *disk, bool surprise) rcu_read_unlock(); } -static void __blk_mark_disk_dead(struct gendisk *disk) +static bool __blk_mark_disk_dead(struct gendisk *disk) { /* * Fail any new I/O. 
*/ if (test_and_set_bit(GD_DEAD, &disk->state)) - return; + return false; if (test_bit(GD_OWNS_QUEUE, &disk->state)) blk_queue_flag_set(QUEUE_FLAG_DYING, disk->queue); @@ -620,7 +620,7 @@ static void __blk_mark_disk_dead(struct gendisk *disk) /* * Prevent new I/O from crossing bio_queue_enter(). */ - blk_queue_start_drain(disk->queue); + return blk_queue_start_drain(disk->queue); } /** @@ -661,6 +661,7 @@ void del_gendisk(struct gendisk *disk) struct request_queue *q = disk->queue; struct block_device *part; unsigned long idx; + bool start_drain, queue_dying; might_sleep(); @@ -688,7 +689,10 @@ void del_gendisk(struct gendisk *disk) * Drop all partitions now that the disk is marked dead. */ mutex_lock(&disk->open_mutex); - __blk_mark_disk_dead(disk); + start_drain = __blk_mark_disk_dead(disk); + queue_dying = blk_queue_dying(q); + if (start_drain) + blk_freeze_acquire_lock(q, true, queue_dying); xa_for_each_start(&disk->part_tbl, idx, part, 1) drop_partition(part); mutex_unlock(&disk->open_mutex); @@ -745,6 +749,9 @@ void del_gendisk(struct gendisk *disk) if (queue_is_mq(q)) blk_mq_exit_queue(q); } + + if (start_drain) + blk_unfreeze_release_lock(q, true, queue_dying); } EXPORT_SYMBOL(del_gendisk); diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 55bec14fe55f..d0a52ed05e60 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -25,6 +25,7 @@ #include #include #include +#include struct module; struct request_queue; @@ -474,6 +475,11 @@ struct request_queue { struct xarray hctx_table; struct percpu_ref q_usage_counter; + struct lock_class_key io_lock_cls_key; + struct lockdep_map io_lockdep_map; + + struct lock_class_key q_lock_cls_key; + struct lockdep_map q_lockdep_map; struct request *last_merge; -- 2.50.1
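To illustrate the class of bug the final patch lets lockdep catch, a contrived sketch (hypothetical driver code and drv_* names, not taken from any of the cited reports): one path freezes the queue and then takes a driver mutex; another path submits I/O while holding the same mutex. Without the patch this inversion only shows up as an occasional hang; with freeze/enter modelled as the write/read sides of a lock, lockdep can report the circular dependency directly.

  #include <linux/blk-mq.h>
  #include <linux/bio.h>
  #include <linux/mutex.h>

  static DEFINE_MUTEX(drv_lock);

  /* Path A: freeze first (exclusive side of q_usage_counter), then take
   * drv_lock while the queue is frozen.
   */
  static void drv_reconfigure(struct request_queue *q)
  {
          blk_mq_freeze_queue(q);
          mutex_lock(&drv_lock);
          /* ... apply new settings ... */
          mutex_unlock(&drv_lock);
          blk_mq_unfreeze_queue(q);
  }

  /* Path B: submit I/O while holding drv_lock; bio_queue_enter() takes the
   * read side of q_usage_counter with drv_lock held, i.e. the opposite
   * order. If path A sleeps in mutex_lock() while path B waits in
   * bio_queue_enter() for the unfreeze, both hang; with the lockdep
   * modelling the inversion is flagged without hitting the hang.
   */
  static void drv_issue_internal_io(struct bio *bio)
  {
          mutex_lock(&drv_lock);
          submit_bio_wait(bio);
          mutex_unlock(&drv_lock);
  }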