From ccd9e252c515ac5a3ed04a414c95d1307d17f159 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Tue, 22 Oct 2024 11:16:17 -0700 Subject: [PATCH 01/16] blk-mq: Make blk_mq_quiesce_tagset() hold the tag list mutex less long Make sure that the tag_list_lock mutex is not held any longer than necessary. This change reduces latency if e.g. blk_mq_quiesce_tagset() is called concurrently from more than one thread. This function is used by the NVMe core and also by the UFS driver. Reported-by: Peter Wang Cc: Chao Leng Cc: Ming Lei Cc: stable@vger.kernel.org Fixes: 414dd48e882c ("blk-mq: add tagset quiesce interface") Signed-off-by: Bart Van Assche Reviewed-by: Keith Busch Link: https://lore.kernel.org/r/20241022181617.2716173-1-bvanassche@acm.org Signed-off-by: Jens Axboe --- block/blk-mq.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/block/blk-mq.c b/block/blk-mq.c index 022653320c18..e39319ff4ecb 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -283,8 +283,9 @@ void blk_mq_quiesce_tagset(struct blk_mq_tag_set *set) if (!blk_queue_skip_tagset_quiesce(q)) blk_mq_quiesce_queue_nowait(q); } - blk_mq_wait_quiesce_done(set); mutex_unlock(&set->tag_list_lock); + + blk_mq_wait_quiesce_done(set); } EXPORT_SYMBOL_GPL(blk_mq_quiesce_tagset); -- 2.51.0 From e203e20a8b2b67eb12f87aa3ee625cda5e686434 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Wed, 23 Oct 2024 13:28:50 -0700 Subject: [PATCH 02/16] blk-mq: Unexport blk_mq_flush_busy_ctxs() Commit a6088845c2bf ("block: kyber: make kyber more friendly with merging") removed the only blk_mq_flush_busy_ctxs() call from outside the block layer core. Hence unexport blk_mq_flush_busy_ctxs(). Signed-off-by: Bart Van Assche Reviewed-by: Chaitanya Kulkarni Link: https://lore.kernel.org/r/20241023202850.3469279-1-bvanassche@acm.org Signed-off-by: Jens Axboe --- block/blk-mq.c | 1 - 1 file changed, 1 deletion(-) diff --git a/block/blk-mq.c b/block/blk-mq.c index e39319ff4ecb..aae9a112c913 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -1726,7 +1726,6 @@ void blk_mq_flush_busy_ctxs(struct blk_mq_hw_ctx *hctx, struct list_head *list) sbitmap_for_each_set(&hctx->ctx_map, flush_busy_ctx, &data); } -EXPORT_SYMBOL_GPL(blk_mq_flush_busy_ctxs); struct dispatch_rq_data { struct blk_mq_hw_ctx *hctx; -- 2.51.0 From 8acdd0e7bfadda6b5103f2960d293581954454ed Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Fri, 25 Oct 2024 08:37:18 +0800 Subject: [PATCH 03/16] blk-mq: add non_owner variant of start_freeze/unfreeze queue APIs Add non_owner variant of start_freeze/unfreeze queue APIs, so that the caller knows that what they are doing, and we can skip lockdep support for non_owner variant in per-call level. Prepare for supporting lockdep for freezing/unfreezing queue. Reviewed-by: Christoph Hellwig Suggested-by: Christoph Hellwig Signed-off-by: Ming Lei Link: https://lore.kernel.org/r/20241025003722.3630252-2-ming.lei@redhat.com Signed-off-by: Jens Axboe --- block/blk-mq.c | 20 ++++++++++++++++++++ include/linux/blk-mq.h | 2 ++ 2 files changed, 22 insertions(+) diff --git a/block/blk-mq.c b/block/blk-mq.c index aae9a112c913..770276815507 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -196,6 +196,26 @@ void blk_mq_unfreeze_queue(struct request_queue *q) } EXPORT_SYMBOL_GPL(blk_mq_unfreeze_queue); +/* + * non_owner variant of blk_freeze_queue_start + * + * Unlike blk_freeze_queue_start, the queue doesn't need to be unfrozen + * by the same task. This is fragile and should not be used if at all + * possible. + */ +void blk_freeze_queue_start_non_owner(struct request_queue *q) +{ + blk_freeze_queue_start(q); +} +EXPORT_SYMBOL_GPL(blk_freeze_queue_start_non_owner); + +/* non_owner variant of blk_mq_unfreeze_queue */ +void blk_mq_unfreeze_queue_non_owner(struct request_queue *q) +{ + __blk_mq_unfreeze_queue(q, false); +} +EXPORT_SYMBOL_GPL(blk_mq_unfreeze_queue_non_owner); + /* * FIXME: replace the scsi_internal_device_*block_nowait() calls in the * mpt3sas driver such that this function can be removed. diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index 59e9adf815a4..2035fad3131f 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -919,6 +919,8 @@ void blk_freeze_queue_start(struct request_queue *q); void blk_mq_freeze_queue_wait(struct request_queue *q); int blk_mq_freeze_queue_wait_timeout(struct request_queue *q, unsigned long timeout); +void blk_mq_unfreeze_queue_non_owner(struct request_queue *q); +void blk_freeze_queue_start_non_owner(struct request_queue *q); void blk_mq_map_queues(struct blk_mq_queue_map *qmap); void blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set, int nr_hw_queues); -- 2.51.0 From 6b6f6c41c8ac9b5ef758f16b793e1fd998cd25b4 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Fri, 25 Oct 2024 08:37:19 +0800 Subject: [PATCH 04/16] nvme: core: switch to non_owner variant of start_freeze/unfreeze queue nvme_start_freeze() and nvme_unfreeze() may be called from same context, so switch them to call non_owner variant of start_freeze/unfreeze queue. Reviewed-by: Christoph Hellwig Signed-off-by: Ming Lei Link: https://lore.kernel.org/r/20241025003722.3630252-3-ming.lei@redhat.com Signed-off-by: Jens Axboe --- drivers/nvme/host/core.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 84cb859a911d..3de7555a7de7 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -4864,7 +4864,7 @@ void nvme_unfreeze(struct nvme_ctrl *ctrl) srcu_idx = srcu_read_lock(&ctrl->srcu); list_for_each_entry_rcu(ns, &ctrl->namespaces, list) - blk_mq_unfreeze_queue(ns->queue); + blk_mq_unfreeze_queue_non_owner(ns->queue); srcu_read_unlock(&ctrl->srcu, srcu_idx); clear_bit(NVME_CTRL_FROZEN, &ctrl->flags); } @@ -4906,7 +4906,12 @@ void nvme_start_freeze(struct nvme_ctrl *ctrl) set_bit(NVME_CTRL_FROZEN, &ctrl->flags); srcu_idx = srcu_read_lock(&ctrl->srcu); list_for_each_entry_rcu(ns, &ctrl->namespaces, list) - blk_freeze_queue_start(ns->queue); + /* + * Typical non_owner use case is from pci driver, in which + * start_freeze is called from timeout work function, but + * unfreeze is done in reset work context + */ + blk_freeze_queue_start_non_owner(ns->queue); srcu_read_unlock(&ctrl->srcu, srcu_idx); } EXPORT_SYMBOL_GPL(nvme_start_freeze); -- 2.51.0 From f1be1788a32e8fa63416ad4518bbd1a85a825c9d Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Fri, 25 Oct 2024 08:37:20 +0800 Subject: [PATCH 05/16] block: model freeze & enter queue as lock for supporting lockdep Recently we got several deadlock report[1][2][3] caused by blk_mq_freeze_queue and blk_enter_queue(). Turns out the two are just like acquiring read/write lock, so model them as read/write lock for supporting lockdep: 1) model q->q_usage_counter as two locks(io and queue lock) - queue lock covers sync with blk_enter_queue() - io lock covers sync with bio_enter_queue() 2) make the lockdep class/key as per-queue: - different subsystem has very different lock use pattern, shared lock class causes false positive easily - freeze_queue degrades to no lock in case that disk state becomes DEAD because bio_enter_queue() won't be blocked any more - freeze_queue degrades to no lock in case that request queue becomes dying because blk_enter_queue() won't be blocked any more 3) model blk_mq_freeze_queue() as acquire_exclusive & try_lock - it is exclusive lock, so dependency with blk_enter_queue() is covered - it is trylock because blk_mq_freeze_queue() are allowed to run concurrently 4) model blk_enter_queue() & bio_enter_queue() as acquire_read() - nested blk_enter_queue() are allowed - dependency with blk_mq_freeze_queue() is covered - blk_queue_exit() is often called from other contexts(such as irq), and it can't be annotated as lock_release(), so simply do it in blk_enter_queue(), this way still covered cases as many as possible With lockdep support, such kind of reports may be reported asap and needn't wait until the real deadlock is triggered. For example, lockdep report can be triggered in the report[3] with this patch applied. [1] occasional block layer hang when setting 'echo noop > /sys/block/sda/queue/scheduler' https://bugzilla.kernel.org/show_bug.cgi?id=219166 [2] del_gendisk() vs blk_queue_enter() race condition https://lore.kernel.org/linux-block/20241003085610.GK11458@google.com/ [3] queue_freeze & queue_enter deadlock in scsi https://lore.kernel.org/linux-block/ZxG38G9BuFdBpBHZ@fedora/T/#u Reviewed-by: Christoph Hellwig Signed-off-by: Ming Lei Link: https://lore.kernel.org/r/20241025003722.3630252-4-ming.lei@redhat.com Signed-off-by: Jens Axboe --- block/blk-core.c | 18 ++++++++++++++++-- block/blk-mq.c | 26 ++++++++++++++++++++++---- block/blk.h | 29 ++++++++++++++++++++++++++--- block/genhd.c | 15 +++++++++++---- include/linux/blkdev.h | 6 ++++++ 5 files changed, 81 insertions(+), 13 deletions(-) diff --git a/block/blk-core.c b/block/blk-core.c index bc5e8c5eaac9..09d10bb95fda 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -261,6 +261,8 @@ static void blk_free_queue(struct request_queue *q) blk_mq_release(q); ida_free(&blk_queue_ida, q->id); + lockdep_unregister_key(&q->io_lock_cls_key); + lockdep_unregister_key(&q->q_lock_cls_key); call_rcu(&q->rcu_head, blk_free_queue_rcu); } @@ -278,18 +280,20 @@ void blk_put_queue(struct request_queue *q) } EXPORT_SYMBOL(blk_put_queue); -void blk_queue_start_drain(struct request_queue *q) +bool blk_queue_start_drain(struct request_queue *q) { /* * When queue DYING flag is set, we need to block new req * entering queue, so we call blk_freeze_queue_start() to * prevent I/O from crossing blk_queue_enter(). */ - blk_freeze_queue_start(q); + bool freeze = __blk_freeze_queue_start(q); if (queue_is_mq(q)) blk_mq_wake_waiters(q); /* Make blk_queue_enter() reexamine the DYING flag. */ wake_up_all(&q->mq_freeze_wq); + + return freeze; } /** @@ -321,6 +325,8 @@ int blk_queue_enter(struct request_queue *q, blk_mq_req_flags_t flags) return -ENODEV; } + rwsem_acquire_read(&q->q_lockdep_map, 0, 0, _RET_IP_); + rwsem_release(&q->q_lockdep_map, _RET_IP_); return 0; } @@ -352,6 +358,8 @@ int __bio_queue_enter(struct request_queue *q, struct bio *bio) goto dead; } + rwsem_acquire_read(&q->io_lockdep_map, 0, 0, _RET_IP_); + rwsem_release(&q->io_lockdep_map, _RET_IP_); return 0; dead: bio_io_error(bio); @@ -441,6 +449,12 @@ struct request_queue *blk_alloc_queue(struct queue_limits *lim, int node_id) PERCPU_REF_INIT_ATOMIC, GFP_KERNEL); if (error) goto fail_stats; + lockdep_register_key(&q->io_lock_cls_key); + lockdep_register_key(&q->q_lock_cls_key); + lockdep_init_map(&q->io_lockdep_map, "&q->q_usage_counter(io)", + &q->io_lock_cls_key, 0); + lockdep_init_map(&q->q_lockdep_map, "&q->q_usage_counter(queue)", + &q->q_lock_cls_key, 0); q->nr_requests = BLKDEV_DEFAULT_RQ; diff --git a/block/blk-mq.c b/block/blk-mq.c index 770276815507..4ae7eb335fbd 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -120,17 +120,29 @@ void blk_mq_in_flight_rw(struct request_queue *q, struct block_device *part, inflight[1] = mi.inflight[1]; } -void blk_freeze_queue_start(struct request_queue *q) +bool __blk_freeze_queue_start(struct request_queue *q) { + int freeze; + mutex_lock(&q->mq_freeze_lock); if (++q->mq_freeze_depth == 1) { percpu_ref_kill(&q->q_usage_counter); mutex_unlock(&q->mq_freeze_lock); if (queue_is_mq(q)) blk_mq_run_hw_queues(q, false); + freeze = true; } else { mutex_unlock(&q->mq_freeze_lock); + freeze = false; } + + return freeze; +} + +void blk_freeze_queue_start(struct request_queue *q) +{ + if (__blk_freeze_queue_start(q)) + blk_freeze_acquire_lock(q, false, false); } EXPORT_SYMBOL_GPL(blk_freeze_queue_start); @@ -176,8 +188,10 @@ void blk_mq_freeze_queue(struct request_queue *q) } EXPORT_SYMBOL_GPL(blk_mq_freeze_queue); -void __blk_mq_unfreeze_queue(struct request_queue *q, bool force_atomic) +bool __blk_mq_unfreeze_queue(struct request_queue *q, bool force_atomic) { + int unfreeze = false; + mutex_lock(&q->mq_freeze_lock); if (force_atomic) q->q_usage_counter.data->force_atomic = true; @@ -186,13 +200,17 @@ void __blk_mq_unfreeze_queue(struct request_queue *q, bool force_atomic) if (!q->mq_freeze_depth) { percpu_ref_resurrect(&q->q_usage_counter); wake_up_all(&q->mq_freeze_wq); + unfreeze = true; } mutex_unlock(&q->mq_freeze_lock); + + return unfreeze; } void blk_mq_unfreeze_queue(struct request_queue *q) { - __blk_mq_unfreeze_queue(q, false); + if (__blk_mq_unfreeze_queue(q, false)) + blk_unfreeze_release_lock(q, false, false); } EXPORT_SYMBOL_GPL(blk_mq_unfreeze_queue); @@ -205,7 +223,7 @@ EXPORT_SYMBOL_GPL(blk_mq_unfreeze_queue); */ void blk_freeze_queue_start_non_owner(struct request_queue *q) { - blk_freeze_queue_start(q); + __blk_freeze_queue_start(q); } EXPORT_SYMBOL_GPL(blk_freeze_queue_start_non_owner); diff --git a/block/blk.h b/block/blk.h index 8fddaf6eae49..63d5df0dc29c 100644 --- a/block/blk.h +++ b/block/blk.h @@ -4,6 +4,7 @@ #include #include +#include #include /* for max_pfn/max_low_pfn */ #include #include @@ -35,8 +36,9 @@ struct blk_flush_queue *blk_alloc_flush_queue(int node, int cmd_size, void blk_free_flush_queue(struct blk_flush_queue *q); void blk_freeze_queue(struct request_queue *q); -void __blk_mq_unfreeze_queue(struct request_queue *q, bool force_atomic); -void blk_queue_start_drain(struct request_queue *q); +bool __blk_mq_unfreeze_queue(struct request_queue *q, bool force_atomic); +bool blk_queue_start_drain(struct request_queue *q); +bool __blk_freeze_queue_start(struct request_queue *q); int __bio_queue_enter(struct request_queue *q, struct bio *bio); void submit_bio_noacct_nocheck(struct bio *bio); void bio_await_chain(struct bio *bio); @@ -69,8 +71,11 @@ static inline int bio_queue_enter(struct bio *bio) { struct request_queue *q = bdev_get_queue(bio->bi_bdev); - if (blk_try_enter_queue(q, false)) + if (blk_try_enter_queue(q, false)) { + rwsem_acquire_read(&q->io_lockdep_map, 0, 0, _RET_IP_); + rwsem_release(&q->io_lockdep_map, _RET_IP_); return 0; + } return __bio_queue_enter(q, bio); } @@ -724,4 +729,22 @@ void blk_integrity_verify(struct bio *bio); void blk_integrity_prepare(struct request *rq); void blk_integrity_complete(struct request *rq, unsigned int nr_bytes); +static inline void blk_freeze_acquire_lock(struct request_queue *q, bool + disk_dead, bool queue_dying) +{ + if (!disk_dead) + rwsem_acquire(&q->io_lockdep_map, 0, 1, _RET_IP_); + if (!queue_dying) + rwsem_acquire(&q->q_lockdep_map, 0, 1, _RET_IP_); +} + +static inline void blk_unfreeze_release_lock(struct request_queue *q, bool + disk_dead, bool queue_dying) +{ + if (!queue_dying) + rwsem_release(&q->q_lockdep_map, _RET_IP_); + if (!disk_dead) + rwsem_release(&q->io_lockdep_map, _RET_IP_); +} + #endif /* BLK_INTERNAL_H */ diff --git a/block/genhd.c b/block/genhd.c index bc30eee7ab16..dfee66146bd1 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -601,13 +601,13 @@ static void blk_report_disk_dead(struct gendisk *disk, bool surprise) rcu_read_unlock(); } -static void __blk_mark_disk_dead(struct gendisk *disk) +static bool __blk_mark_disk_dead(struct gendisk *disk) { /* * Fail any new I/O. */ if (test_and_set_bit(GD_DEAD, &disk->state)) - return; + return false; if (test_bit(GD_OWNS_QUEUE, &disk->state)) blk_queue_flag_set(QUEUE_FLAG_DYING, disk->queue); @@ -620,7 +620,7 @@ static void __blk_mark_disk_dead(struct gendisk *disk) /* * Prevent new I/O from crossing bio_queue_enter(). */ - blk_queue_start_drain(disk->queue); + return blk_queue_start_drain(disk->queue); } /** @@ -661,6 +661,7 @@ void del_gendisk(struct gendisk *disk) struct request_queue *q = disk->queue; struct block_device *part; unsigned long idx; + bool start_drain, queue_dying; might_sleep(); @@ -688,7 +689,10 @@ void del_gendisk(struct gendisk *disk) * Drop all partitions now that the disk is marked dead. */ mutex_lock(&disk->open_mutex); - __blk_mark_disk_dead(disk); + start_drain = __blk_mark_disk_dead(disk); + queue_dying = blk_queue_dying(q); + if (start_drain) + blk_freeze_acquire_lock(q, true, queue_dying); xa_for_each_start(&disk->part_tbl, idx, part, 1) drop_partition(part); mutex_unlock(&disk->open_mutex); @@ -745,6 +749,9 @@ void del_gendisk(struct gendisk *disk) if (queue_is_mq(q)) blk_mq_exit_queue(q); } + + if (start_drain) + blk_unfreeze_release_lock(q, true, queue_dying); } EXPORT_SYMBOL(del_gendisk); diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 55bec14fe55f..d0a52ed05e60 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -25,6 +25,7 @@ #include #include #include +#include struct module; struct request_queue; @@ -474,6 +475,11 @@ struct request_queue { struct xarray hctx_table; struct percpu_ref q_usage_counter; + struct lock_class_key io_lock_cls_key; + struct lockdep_map io_lockdep_map; + + struct lock_class_key q_lock_cls_key; + struct lockdep_map q_lockdep_map; struct request *last_merge; -- 2.51.0 From e4e535bff2bc82bb49a633775f9834beeaa527db Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Thu, 24 Oct 2024 07:00:15 +0200 Subject: [PATCH 06/16] iov_iter: don't require contiguous pages in iov_iter_extract_bvec_pages The iov_iter_extract_pages interface allows to return physically discontiguous pages, as long as all but the first and last page in the array are page aligned and page size. Rewrite iov_iter_extract_bvec_pages to take advantage of that instead of only returning ranges of physically contiguous pages. Signed-off-by: Ming Lei [hch: minor cleanups, new commit log] Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20241024050021.627350-1-hch@lst.de Signed-off-by: Jens Axboe --- lib/iov_iter.c | 67 +++++++++++++++++++++++++++++++++----------------- 1 file changed, 45 insertions(+), 22 deletions(-) diff --git a/lib/iov_iter.c b/lib/iov_iter.c index 1abb32c0da50..9fc06f5fb748 100644 --- a/lib/iov_iter.c +++ b/lib/iov_iter.c @@ -1677,8 +1677,8 @@ static ssize_t iov_iter_extract_xarray_pages(struct iov_iter *i, } /* - * Extract a list of contiguous pages from an ITER_BVEC iterator. This does - * not get references on the pages, nor does it get a pin on them. + * Extract a list of virtually contiguous pages from an ITER_BVEC iterator. + * This does not get references on the pages, nor does it get a pin on them. */ static ssize_t iov_iter_extract_bvec_pages(struct iov_iter *i, struct page ***pages, size_t maxsize, @@ -1686,35 +1686,58 @@ static ssize_t iov_iter_extract_bvec_pages(struct iov_iter *i, iov_iter_extraction_t extraction_flags, size_t *offset0) { - struct page **p, *page; - size_t skip = i->iov_offset, offset, size; - int k; + size_t skip = i->iov_offset, size = 0; + struct bvec_iter bi; + int k = 0; - for (;;) { - if (i->nr_segs == 0) - return 0; - size = min(maxsize, i->bvec->bv_len - skip); - if (size) - break; + if (i->nr_segs == 0) + return 0; + + if (i->iov_offset == i->bvec->bv_len) { i->iov_offset = 0; i->nr_segs--; i->bvec++; skip = 0; } + bi.bi_size = maxsize + skip; + bi.bi_bvec_done = skip; + + maxpages = want_pages_array(pages, maxsize, skip, maxpages); + + while (bi.bi_size && bi.bi_idx < i->nr_segs) { + struct bio_vec bv = bvec_iter_bvec(i->bvec, bi); + + /* + * The iov_iter_extract_pages interface only allows an offset + * into the first page. Break out of the loop if we see an + * offset into subsequent pages, the caller will have to call + * iov_iter_extract_pages again for the reminder. + */ + if (k) { + if (bv.bv_offset) + break; + } else { + *offset0 = bv.bv_offset; + } - skip += i->bvec->bv_offset; - page = i->bvec->bv_page + skip / PAGE_SIZE; - offset = skip % PAGE_SIZE; - *offset0 = offset; + (*pages)[k++] = bv.bv_page; + size += bv.bv_len; - maxpages = want_pages_array(pages, size, offset, maxpages); - if (!maxpages) - return -ENOMEM; - p = *pages; - for (k = 0; k < maxpages; k++) - p[k] = page + k; + if (k >= maxpages) + break; + + /* + * We are done when the end of the bvec doesn't align to a page + * boundary as that would create a hole in the returned space. + * The caller will handle this with another call to + * iov_iter_extract_pages. + */ + if (bv.bv_offset + bv.bv_len != PAGE_SIZE) + break; + + bvec_iter_advance_single(i->bvec, &bi, bv.bv_len); + } - size = min_t(size_t, size, maxpages * PAGE_SIZE - offset); iov_iter_advance(i, size); return size; } -- 2.51.0 From 2f5a65ef30a636d5030917eebd283ac447a212af Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 29 Oct 2024 15:19:37 +0100 Subject: [PATCH 07/16] block: add a bdev_limits helper Add a helper to get the queue_limits from the bdev without having to poke into the request_queue. Signed-off-by: Christoph Hellwig Reviewed-by: John Garry Link: https://lore.kernel.org/r/20241029141937.249920-1-hch@lst.de Signed-off-by: Jens Axboe --- block/blk-merge.c | 3 +-- block/blk-settings.c | 2 +- drivers/md/dm-cache-target.c | 4 ++-- drivers/md/dm-clone-target.c | 4 ++-- drivers/md/dm-thin.c | 2 +- fs/btrfs/zoned.c | 7 ++----- include/linux/blkdev.h | 15 ++++++++++----- 7 files changed, 19 insertions(+), 18 deletions(-) diff --git a/block/blk-merge.c b/block/blk-merge.c index 8b9a9646aed8..d813d799cee7 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -411,10 +411,9 @@ struct bio *bio_split_zone_append(struct bio *bio, */ struct bio *bio_split_to_limits(struct bio *bio) { - const struct queue_limits *lim = &bdev_get_queue(bio->bi_bdev)->limits; unsigned int nr_segs; - return __bio_split_to_limits(bio, lim, &nr_segs); + return __bio_split_to_limits(bio, bdev_limits(bio->bi_bdev), &nr_segs); } EXPORT_SYMBOL(bio_split_to_limits); diff --git a/block/blk-settings.c b/block/blk-settings.c index a446654ddee5..95fc39d09872 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c @@ -661,7 +661,7 @@ EXPORT_SYMBOL(blk_stack_limits); void queue_limits_stack_bdev(struct queue_limits *t, struct block_device *bdev, sector_t offset, const char *pfx) { - if (blk_stack_limits(t, &bdev_get_queue(bdev)->limits, + if (blk_stack_limits(t, bdev_limits(bdev), get_start_sect(bdev) + offset)) pr_notice("%s: Warning: Device %pg is misaligned\n", pfx, bdev); diff --git a/drivers/md/dm-cache-target.c b/drivers/md/dm-cache-target.c index aaeeabfab09b..c4520ff7fe1a 100644 --- a/drivers/md/dm-cache-target.c +++ b/drivers/md/dm-cache-target.c @@ -3360,7 +3360,7 @@ static int cache_iterate_devices(struct dm_target *ti, static void disable_passdown_if_not_supported(struct cache *cache) { struct block_device *origin_bdev = cache->origin_dev->bdev; - struct queue_limits *origin_limits = &bdev_get_queue(origin_bdev)->limits; + struct queue_limits *origin_limits = bdev_limits(origin_bdev); const char *reason = NULL; if (!cache->features.discard_passdown) @@ -3382,7 +3382,7 @@ static void disable_passdown_if_not_supported(struct cache *cache) static void set_discard_limits(struct cache *cache, struct queue_limits *limits) { struct block_device *origin_bdev = cache->origin_dev->bdev; - struct queue_limits *origin_limits = &bdev_get_queue(origin_bdev)->limits; + struct queue_limits *origin_limits = bdev_limits(origin_bdev); if (!cache->features.discard_passdown) { /* No passdown is done so setting own virtual limits */ diff --git a/drivers/md/dm-clone-target.c b/drivers/md/dm-clone-target.c index 12bbe487a4c8..e956d980672c 100644 --- a/drivers/md/dm-clone-target.c +++ b/drivers/md/dm-clone-target.c @@ -2020,7 +2020,7 @@ static void clone_resume(struct dm_target *ti) static void disable_passdown_if_not_supported(struct clone *clone) { struct block_device *dest_dev = clone->dest_dev->bdev; - struct queue_limits *dest_limits = &bdev_get_queue(dest_dev)->limits; + struct queue_limits *dest_limits = bdev_limits(dest_dev); const char *reason = NULL; if (!test_bit(DM_CLONE_DISCARD_PASSDOWN, &clone->flags)) @@ -2041,7 +2041,7 @@ static void disable_passdown_if_not_supported(struct clone *clone) static void set_discard_limits(struct clone *clone, struct queue_limits *limits) { struct block_device *dest_bdev = clone->dest_dev->bdev; - struct queue_limits *dest_limits = &bdev_get_queue(dest_bdev)->limits; + struct queue_limits *dest_limits = bdev_limits(dest_bdev); if (!test_bit(DM_CLONE_DISCARD_PASSDOWN, &clone->flags)) { /* No passdown is done so we set our own virtual limits */ diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c index 89632ce97760..9095f19a84f3 100644 --- a/drivers/md/dm-thin.c +++ b/drivers/md/dm-thin.c @@ -2842,7 +2842,7 @@ static void disable_discard_passdown_if_not_supported(struct pool_c *pt) { struct pool *pool = pt->pool; struct block_device *data_bdev = pt->data_dev->bdev; - struct queue_limits *data_limits = &bdev_get_queue(data_bdev)->limits; + struct queue_limits *data_limits = bdev_limits(data_bdev); const char *reason = NULL; if (!pt->adjusted_pf.discard_passdown) diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c index 69d03feea4e0..46b9386957e6 100644 --- a/fs/btrfs/zoned.c +++ b/fs/btrfs/zoned.c @@ -707,11 +707,8 @@ int btrfs_check_zoned_mode(struct btrfs_fs_info *fs_info) * zoned mode. In this case, we don't have a valid max zone * append size. */ - if (bdev_is_zoned(device->bdev)) { - blk_stack_limits(lim, - &bdev_get_queue(device->bdev)->limits, - 0); - } + if (bdev_is_zoned(device->bdev)) + blk_stack_limits(lim, bdev_limits(device->bdev), 0); } /* diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index d0a52ed05e60..7bfc877e159e 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1159,6 +1159,11 @@ enum blk_default_limits { */ #define BLK_DEF_MAX_SECTORS_CAP 2560u +static inline struct queue_limits *bdev_limits(struct block_device *bdev) +{ + return &bdev_get_queue(bdev)->limits; +} + static inline unsigned long queue_segment_boundary(const struct request_queue *q) { return q->limits.seg_boundary_mask; @@ -1293,23 +1298,23 @@ unsigned int bdev_discard_alignment(struct block_device *bdev); static inline unsigned int bdev_max_discard_sectors(struct block_device *bdev) { - return bdev_get_queue(bdev)->limits.max_discard_sectors; + return bdev_limits(bdev)->max_discard_sectors; } static inline unsigned int bdev_discard_granularity(struct block_device *bdev) { - return bdev_get_queue(bdev)->limits.discard_granularity; + return bdev_limits(bdev)->discard_granularity; } static inline unsigned int bdev_max_secure_erase_sectors(struct block_device *bdev) { - return bdev_get_queue(bdev)->limits.max_secure_erase_sectors; + return bdev_limits(bdev)->max_secure_erase_sectors; } static inline unsigned int bdev_write_zeroes_sectors(struct block_device *bdev) { - return bdev_get_queue(bdev)->limits.max_write_zeroes_sectors; + return bdev_limits(bdev)->max_write_zeroes_sectors; } static inline bool bdev_nonrot(struct block_device *bdev) @@ -1345,7 +1350,7 @@ static inline bool bdev_write_cache(struct block_device *bdev) static inline bool bdev_fua(struct block_device *bdev) { - return bdev_get_queue(bdev)->limits.features & BLK_FEAT_FUA; + return bdev_limits(bdev)->features & BLK_FEAT_FUA; } static inline bool bdev_nowait(struct block_device *bdev) -- 2.51.0 From 8d3fd059dd289e6c322e5741ad56794bcce699a2 Mon Sep 17 00:00:00 2001 From: John Garry Date: Wed, 30 Oct 2024 11:19:00 +0000 Subject: [PATCH 08/16] loop: Use bdev limit helpers for configuring discard Instead of directly looking at the request_queue limits, use the bdev limits helpers, which is preferable. Signed-off-by: John Garry Link: https://lore.kernel.org/r/20241030111900.3981223-1-john.g.garry@oracle.com Signed-off-by: Jens Axboe --- drivers/block/loop.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/block/loop.c b/drivers/block/loop.c index 78a7bb28defe..7719858c49bb 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -786,11 +786,11 @@ static void loop_config_discard(struct loop_device *lo, * file-backed loop devices: discarded regions read back as zero. */ if (S_ISBLK(inode->i_mode)) { - struct request_queue *backingq = bdev_get_queue(I_BDEV(inode)); + struct block_device *bdev = I_BDEV(inode); - max_discard_sectors = backingq->limits.max_write_zeroes_sectors; - granularity = bdev_discard_granularity(I_BDEV(inode)) ?: - queue_physical_block_size(backingq); + max_discard_sectors = bdev_write_zeroes_sectors(bdev); + granularity = bdev_discard_granularity(bdev) ?: + bdev_physical_block_size(bdev); /* * We use punch hole to reclaim the free space used by the -- 2.51.0 From 826cc42adf44930a633d11a5993676d85ddb0842 Mon Sep 17 00:00:00 2001 From: Yang Erkun Date: Wed, 30 Oct 2024 11:49:14 +0800 Subject: [PATCH 09/16] brd: defer automatic disk creation until module initialization succeeds My colleague Wupeng found the following problems during fault injection: BUG: unable to handle page fault for address: fffffbfff809d073 PGD 6e648067 P4D 123ec8067 PUD 123ec4067 PMD 100e38067 PTE 0 Oops: Oops: 0000 [#1] PREEMPT SMP KASAN NOPTI CPU: 5 UID: 0 PID: 755 Comm: modprobe Not tainted 6.12.0-rc3+ #17 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.16.1-2.fc37 04/01/2014 RIP: 0010:__asan_load8+0x4c/0xa0 ... Call Trace: blkdev_put_whole+0x41/0x70 bdev_release+0x1a3/0x250 blkdev_release+0x11/0x20 __fput+0x1d7/0x4a0 task_work_run+0xfc/0x180 syscall_exit_to_user_mode+0x1de/0x1f0 do_syscall_64+0x6b/0x170 entry_SYSCALL_64_after_hwframe+0x76/0x7e loop_init() is calling loop_add() after __register_blkdev() succeeds and is ignoring disk_add() failure from loop_add(), for loop_add() failure is not fatal and successfully created disks are already visible to bdev_open(). brd_init() is currently calling brd_alloc() before __register_blkdev() succeeds and is releasing successfully created disks when brd_init() returns an error. This can cause UAF for the latter two case: case 1: T1: modprobe brd brd_init brd_alloc(0) // success add_disk disk_scan_partitions bdev_file_open_by_dev // alloc file fput // won't free until back to userspace brd_alloc(1) // failed since mem alloc error inject // error path for modprobe will release code segment // back to userspace __fput blkdev_release bdev_release blkdev_put_whole bdev->bd_disk->fops->release // fops is freed now, UAF! case 2: T1: T2: modprobe brd brd_init brd_alloc(0) // success open(/dev/ram0) brd_alloc(1) // fail // error path for modprobe close(/dev/ram0) ... /* UAF! */ bdev->bd_disk->fops->release Fix this problem by following what loop_init() does. Besides, reintroduce brd_devices_mutex to help serialize modifications to brd_list. Fixes: 7f9b348cb5e9 ("brd: convert to blk_alloc_disk/blk_cleanup_disk") Reported-by: Wupeng Ma Signed-off-by: Yang Erkun Reviewed-by: Christoph Hellwig Link: https://lore.kernel.org/r/20241030034914.907829-1-yangerkun@huaweicloud.com Signed-off-by: Jens Axboe --- drivers/block/brd.c | 66 ++++++++++++++++++++++++++++++--------------- 1 file changed, 44 insertions(+), 22 deletions(-) diff --git a/drivers/block/brd.c b/drivers/block/brd.c index 2fd1ed101748..5a95671d8151 100644 --- a/drivers/block/brd.c +++ b/drivers/block/brd.c @@ -316,8 +316,40 @@ __setup("ramdisk_size=", ramdisk_size); * (should share code eventually). */ static LIST_HEAD(brd_devices); +static DEFINE_MUTEX(brd_devices_mutex); static struct dentry *brd_debugfs_dir; +static struct brd_device *brd_find_or_alloc_device(int i) +{ + struct brd_device *brd; + + mutex_lock(&brd_devices_mutex); + list_for_each_entry(brd, &brd_devices, brd_list) { + if (brd->brd_number == i) { + mutex_unlock(&brd_devices_mutex); + return ERR_PTR(-EEXIST); + } + } + + brd = kzalloc(sizeof(*brd), GFP_KERNEL); + if (!brd) { + mutex_unlock(&brd_devices_mutex); + return ERR_PTR(-ENOMEM); + } + brd->brd_number = i; + list_add_tail(&brd->brd_list, &brd_devices); + mutex_unlock(&brd_devices_mutex); + return brd; +} + +static void brd_free_device(struct brd_device *brd) +{ + mutex_lock(&brd_devices_mutex); + list_del(&brd->brd_list); + mutex_unlock(&brd_devices_mutex); + kfree(brd); +} + static int brd_alloc(int i) { struct brd_device *brd; @@ -340,14 +372,9 @@ static int brd_alloc(int i) BLK_FEAT_NOWAIT, }; - list_for_each_entry(brd, &brd_devices, brd_list) - if (brd->brd_number == i) - return -EEXIST; - brd = kzalloc(sizeof(*brd), GFP_KERNEL); - if (!brd) - return -ENOMEM; - brd->brd_number = i; - list_add_tail(&brd->brd_list, &brd_devices); + brd = brd_find_or_alloc_device(i); + if (IS_ERR(brd)) + return PTR_ERR(brd); xa_init(&brd->brd_pages); @@ -378,8 +405,7 @@ static int brd_alloc(int i) out_cleanup_disk: put_disk(disk); out_free_dev: - list_del(&brd->brd_list); - kfree(brd); + brd_free_device(brd); return err; } @@ -398,8 +424,7 @@ static void brd_cleanup(void) del_gendisk(brd->brd_disk); put_disk(brd->brd_disk); brd_free_pages(brd); - list_del(&brd->brd_list); - kfree(brd); + brd_free_device(brd); } } @@ -426,16 +451,6 @@ static int __init brd_init(void) { int err, i; - brd_check_and_reset_par(); - - brd_debugfs_dir = debugfs_create_dir("ramdisk_pages", NULL); - - for (i = 0; i < rd_nr; i++) { - err = brd_alloc(i); - if (err) - goto out_free; - } - /* * brd module now has a feature to instantiate underlying device * structure on-demand, provided that there is an access dev node. @@ -451,11 +466,18 @@ static int __init brd_init(void) * dynamically. */ + brd_check_and_reset_par(); + + brd_debugfs_dir = debugfs_create_dir("ramdisk_pages", NULL); + if (__register_blkdev(RAMDISK_MAJOR, "ramdisk", brd_probe)) { err = -EIO; goto out_free; } + for (i = 0; i < rd_nr; i++) + brd_alloc(i); + pr_info("brd: module loaded\n"); return 0; -- 2.51.0 From 133008e84b99e4f5f8cf3d8b768c995732df9406 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Wed, 16 Oct 2024 13:13:09 -0700 Subject: [PATCH 10/16] blk-integrity: remove seed for user mapped buffers The seed is only used for kernel generation and verification. That doesn't happen for user buffers, so passing the seed around doesn't accomplish anything. Signed-off-by: Keith Busch Reviewed-by: Christoph Hellwig Reviewed-by: Anuj Gupta Reviewed-by: Kanchan Joshi Link: https://lore.kernel.org/r/20241016201309.1090320-1-kbusch@meta.com Signed-off-by: Jens Axboe --- block/bio-integrity.c | 13 +++++-------- block/blk-integrity.c | 4 ++-- drivers/nvme/host/ioctl.c | 17 ++++++++--------- include/linux/bio-integrity.h | 4 ++-- include/linux/blk-integrity.h | 5 ++--- 5 files changed, 19 insertions(+), 24 deletions(-) diff --git a/block/bio-integrity.c b/block/bio-integrity.c index 88e3ad73c385..2a4bd6611692 100644 --- a/block/bio-integrity.c +++ b/block/bio-integrity.c @@ -199,7 +199,7 @@ EXPORT_SYMBOL(bio_integrity_add_page); static int bio_integrity_copy_user(struct bio *bio, struct bio_vec *bvec, int nr_vecs, unsigned int len, - unsigned int direction, u32 seed) + unsigned int direction) { bool write = direction == ITER_SOURCE; struct bio_integrity_payload *bip; @@ -247,7 +247,6 @@ static int bio_integrity_copy_user(struct bio *bio, struct bio_vec *bvec, } bip->bip_flags |= BIP_COPY_USER; - bip->bip_iter.bi_sector = seed; bip->bip_vcnt = nr_vecs; return 0; free_bip: @@ -258,7 +257,7 @@ free_buf: } static int bio_integrity_init_user(struct bio *bio, struct bio_vec *bvec, - int nr_vecs, unsigned int len, u32 seed) + int nr_vecs, unsigned int len) { struct bio_integrity_payload *bip; @@ -267,7 +266,6 @@ static int bio_integrity_init_user(struct bio *bio, struct bio_vec *bvec, return PTR_ERR(bip); memcpy(bip->bip_vec, bvec, nr_vecs * sizeof(*bvec)); - bip->bip_iter.bi_sector = seed; bip->bip_iter.bi_size = len; bip->bip_vcnt = nr_vecs; return 0; @@ -303,8 +301,7 @@ static unsigned int bvec_from_pages(struct bio_vec *bvec, struct page **pages, return nr_bvecs; } -int bio_integrity_map_user(struct bio *bio, void __user *ubuf, ssize_t bytes, - u32 seed) +int bio_integrity_map_user(struct bio *bio, void __user *ubuf, ssize_t bytes) { struct request_queue *q = bdev_get_queue(bio->bi_bdev); unsigned int align = blk_lim_dma_alignment_and_pad(&q->limits); @@ -350,9 +347,9 @@ int bio_integrity_map_user(struct bio *bio, void __user *ubuf, ssize_t bytes, if (copy) ret = bio_integrity_copy_user(bio, bvec, nr_bvecs, bytes, - direction, seed); + direction); else - ret = bio_integrity_init_user(bio, bvec, nr_bvecs, bytes, seed); + ret = bio_integrity_init_user(bio, bvec, nr_bvecs, bytes); if (ret) goto release_pages; if (bvec != stack_vec) diff --git a/block/blk-integrity.c b/block/blk-integrity.c index 83b696ba0cac..b180cac61a9d 100644 --- a/block/blk-integrity.c +++ b/block/blk-integrity.c @@ -113,9 +113,9 @@ new_segment: EXPORT_SYMBOL(blk_rq_map_integrity_sg); int blk_rq_integrity_map_user(struct request *rq, void __user *ubuf, - ssize_t bytes, u32 seed) + ssize_t bytes) { - int ret = bio_integrity_map_user(rq->bio, ubuf, bytes, seed); + int ret = bio_integrity_map_user(rq->bio, ubuf, bytes); if (ret) return ret; diff --git a/drivers/nvme/host/ioctl.c b/drivers/nvme/host/ioctl.c index b9b79ccfabf8..f697d2d1d7e4 100644 --- a/drivers/nvme/host/ioctl.c +++ b/drivers/nvme/host/ioctl.c @@ -114,7 +114,7 @@ static struct request *nvme_alloc_user_request(struct request_queue *q, static int nvme_map_user_request(struct request *req, u64 ubuffer, unsigned bufflen, void __user *meta_buffer, unsigned meta_len, - u32 meta_seed, struct io_uring_cmd *ioucmd, unsigned int flags) + struct io_uring_cmd *ioucmd, unsigned int flags) { struct request_queue *q = req->q; struct nvme_ns *ns = q->queuedata; @@ -152,8 +152,7 @@ static int nvme_map_user_request(struct request *req, u64 ubuffer, bio_set_dev(bio, bdev); if (has_metadata) { - ret = blk_rq_integrity_map_user(req, meta_buffer, meta_len, - meta_seed); + ret = blk_rq_integrity_map_user(req, meta_buffer, meta_len); if (ret) goto out_unmap; } @@ -170,7 +169,7 @@ out: static int nvme_submit_user_cmd(struct request_queue *q, struct nvme_command *cmd, u64 ubuffer, unsigned bufflen, - void __user *meta_buffer, unsigned meta_len, u32 meta_seed, + void __user *meta_buffer, unsigned meta_len, u64 *result, unsigned timeout, unsigned int flags) { struct nvme_ns *ns = q->queuedata; @@ -187,7 +186,7 @@ static int nvme_submit_user_cmd(struct request_queue *q, req->timeout = timeout; if (ubuffer && bufflen) { ret = nvme_map_user_request(req, ubuffer, bufflen, meta_buffer, - meta_len, meta_seed, NULL, flags); + meta_len, NULL, flags); if (ret) return ret; } @@ -268,7 +267,7 @@ static int nvme_submit_io(struct nvme_ns *ns, struct nvme_user_io __user *uio) c.rw.lbatm = cpu_to_le16(io.appmask); return nvme_submit_user_cmd(ns->queue, &c, io.addr, length, metadata, - meta_len, lower_32_bits(io.slba), NULL, 0, 0); + meta_len, NULL, 0, 0); } static bool nvme_validate_passthru_nsid(struct nvme_ctrl *ctrl, @@ -323,7 +322,7 @@ static int nvme_user_cmd(struct nvme_ctrl *ctrl, struct nvme_ns *ns, status = nvme_submit_user_cmd(ns ? ns->queue : ctrl->admin_q, &c, cmd.addr, cmd.data_len, nvme_to_user_ptr(cmd.metadata), - cmd.metadata_len, 0, &result, timeout, 0); + cmd.metadata_len, &result, timeout, 0); if (status >= 0) { if (put_user(result, &ucmd->result)) @@ -370,7 +369,7 @@ static int nvme_user_cmd64(struct nvme_ctrl *ctrl, struct nvme_ns *ns, status = nvme_submit_user_cmd(ns ? ns->queue : ctrl->admin_q, &c, cmd.addr, cmd.data_len, nvme_to_user_ptr(cmd.metadata), - cmd.metadata_len, 0, &cmd.result, timeout, flags); + cmd.metadata_len, &cmd.result, timeout, flags); if (status >= 0) { if (put_user(cmd.result, &ucmd->result)) @@ -504,7 +503,7 @@ static int nvme_uring_cmd_io(struct nvme_ctrl *ctrl, struct nvme_ns *ns, if (d.addr && d.data_len) { ret = nvme_map_user_request(req, d.addr, d.data_len, nvme_to_user_ptr(d.metadata), - d.metadata_len, 0, ioucmd, vec); + d.metadata_len, ioucmd, vec); if (ret) return ret; } diff --git a/include/linux/bio-integrity.h b/include/linux/bio-integrity.h index dd831c269e99..dbf0f74c1529 100644 --- a/include/linux/bio-integrity.h +++ b/include/linux/bio-integrity.h @@ -72,7 +72,7 @@ struct bio_integrity_payload *bio_integrity_alloc(struct bio *bio, gfp_t gfp, unsigned int nr); int bio_integrity_add_page(struct bio *bio, struct page *page, unsigned int len, unsigned int offset); -int bio_integrity_map_user(struct bio *bio, void __user *ubuf, ssize_t len, u32 seed); +int bio_integrity_map_user(struct bio *bio, void __user *ubuf, ssize_t len); void bio_integrity_unmap_user(struct bio *bio); bool bio_integrity_prep(struct bio *bio); void bio_integrity_advance(struct bio *bio, unsigned int bytes_done); @@ -99,7 +99,7 @@ static inline void bioset_integrity_free(struct bio_set *bs) } static inline int bio_integrity_map_user(struct bio *bio, void __user *ubuf, - ssize_t len, u32 seed) + ssize_t len) { return -EINVAL; } diff --git a/include/linux/blk-integrity.h b/include/linux/blk-integrity.h index 676f8f860c47..c7eae0bfb013 100644 --- a/include/linux/blk-integrity.h +++ b/include/linux/blk-integrity.h @@ -28,7 +28,7 @@ static inline bool queue_limits_stack_integrity_bdev(struct queue_limits *t, int blk_rq_map_integrity_sg(struct request *, struct scatterlist *); int blk_rq_count_integrity_sg(struct request_queue *, struct bio *); int blk_rq_integrity_map_user(struct request *rq, void __user *ubuf, - ssize_t bytes, u32 seed); + ssize_t bytes); static inline bool blk_integrity_queue_supports_integrity(struct request_queue *q) @@ -104,8 +104,7 @@ static inline int blk_rq_map_integrity_sg(struct request *q, } static inline int blk_rq_integrity_map_user(struct request *rq, void __user *ubuf, - ssize_t bytes, - u32 seed) + ssize_t bytes) { return -EINVAL; } -- 2.51.0 From 496a51b37143c690a06612a6bd58827ef2341761 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Thu, 31 Oct 2024 19:02:24 +0800 Subject: [PATCH 11/16] lib/iov_iter.c: initialize bi.bi_idx before iterating over bvec Initialize bi.bi_idx as 0 before iterating over bvec, otherwise garbage data can be used as ->bi_idx. Cc: Christoph Hellwig Reported-and-tested-by: Klara Modin Fixes: e4e535bff2bc ("iov_iter: don't require contiguous pages in iov_iter_extract_bvec_pages") Reviewed-by: Christoph Hellwig Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- lib/iov_iter.c | 1 + 1 file changed, 1 insertion(+) diff --git a/lib/iov_iter.c b/lib/iov_iter.c index 9fc06f5fb748..c761f6db3cb4 100644 --- a/lib/iov_iter.c +++ b/lib/iov_iter.c @@ -1699,6 +1699,7 @@ static ssize_t iov_iter_extract_bvec_pages(struct iov_iter *i, i->bvec++; skip = 0; } + bi.bi_idx = 0; bi.bi_size = maxsize + skip; bi.bi_bvec_done = skip; -- 2.51.0 From cafd00d0e90956c1c570a0a96cd86298897d247b Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 30 Oct 2024 06:18:51 +0100 Subject: [PATCH 12/16] block: remove zone append special casing from the direct I/O path This code is unused, and all future zoned file systems should follow the btrfs lead of splitting the bios themselves to the zoned limits in the I/O submission handler, because if they didn't they would be hit by commit ed9832bc08db ("block: introduce folio awareness and add a bigger size from folio") breaking this code when the zone append limit (that is usually the max_hw_sectors limit) is smaller than the largest possible folio size. Signed-off-by: Christoph Hellwig Reviewed-by: Chaitanya Kulkarni Link: https://lore.kernel.org/r/20241030051859.280923-2-hch@lst.de Signed-off-by: Jens Axboe --- block/bio.c | 34 ++-------------------------------- 1 file changed, 2 insertions(+), 32 deletions(-) diff --git a/block/bio.c b/block/bio.c index ac4d77c88932..6a60d62a529d 100644 --- a/block/bio.c +++ b/block/bio.c @@ -1206,21 +1206,12 @@ EXPORT_SYMBOL_GPL(__bio_release_pages); void bio_iov_bvec_set(struct bio *bio, struct iov_iter *iter) { - size_t size = iov_iter_count(iter); - WARN_ON_ONCE(bio->bi_max_vecs); - if (bio_op(bio) == REQ_OP_ZONE_APPEND) { - struct request_queue *q = bdev_get_queue(bio->bi_bdev); - size_t max_sectors = queue_max_zone_append_sectors(q); - - size = min(size, max_sectors << SECTOR_SHIFT); - } - bio->bi_vcnt = iter->nr_segs; bio->bi_io_vec = (struct bio_vec *)iter->bvec; bio->bi_iter.bi_bvec_done = iter->iov_offset; - bio->bi_iter.bi_size = size; + bio->bi_iter.bi_size = iov_iter_count(iter); bio_set_flag(bio, BIO_CLONED); } @@ -1245,20 +1236,6 @@ static int bio_iov_add_folio(struct bio *bio, struct folio *folio, size_t len, return 0; } -static int bio_iov_add_zone_append_folio(struct bio *bio, struct folio *folio, - size_t len, size_t offset) -{ - struct request_queue *q = bdev_get_queue(bio->bi_bdev); - bool same_page = false; - - if (bio_add_hw_folio(q, bio, folio, len, offset, - queue_max_zone_append_sectors(q), &same_page) != len) - return -EINVAL; - if (same_page && bio_flagged(bio, BIO_PAGE_PINNED)) - unpin_user_folio(folio, 1); - return 0; -} - static unsigned int get_contig_folio_len(unsigned int *num_pages, struct page **pages, unsigned int i, struct folio *folio, size_t left, @@ -1365,14 +1342,7 @@ static int __bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter) len = get_contig_folio_len(&num_pages, pages, i, folio, left, offset); - if (bio_op(bio) == REQ_OP_ZONE_APPEND) { - ret = bio_iov_add_zone_append_folio(bio, folio, len, - folio_offset); - if (ret) - break; - } else - bio_iov_add_folio(bio, folio, len, folio_offset); - + bio_iov_add_folio(bio, folio, len, folio_offset); offset = 0; } -- 2.51.0 From f187b9bf1a639090893c31030ddb60f9beae23f0 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 30 Oct 2024 06:18:52 +0100 Subject: [PATCH 13/16] block: remove bio_add_zone_append_page This is only used by the nvmet zns passthrough code, which can trivially just use bio_add_pc_page and do the sanity check for the max zone append limit itself. All future zoned file systems should follow the btrfs lead and let the upper layers fill up bios unlimited by hardware constraints and split them to the limits in the I/O submission handler. Signed-off-by: Christoph Hellwig Reviewed-by: Chaitanya Kulkarni Link: https://lore.kernel.org/r/20241030051859.280923-3-hch@lst.de Signed-off-by: Jens Axboe --- block/bio.c | 33 --------------------------------- drivers/nvme/target/zns.c | 21 +++++++++++++-------- include/linux/bio.h | 2 -- 3 files changed, 13 insertions(+), 43 deletions(-) diff --git a/block/bio.c b/block/bio.c index 6a60d62a529d..daceb0a5c1d7 100644 --- a/block/bio.c +++ b/block/bio.c @@ -1064,39 +1064,6 @@ int bio_add_pc_page(struct request_queue *q, struct bio *bio, } EXPORT_SYMBOL(bio_add_pc_page); -/** - * bio_add_zone_append_page - attempt to add page to zone-append bio - * @bio: destination bio - * @page: page to add - * @len: vec entry length - * @offset: vec entry offset - * - * Attempt to add a page to the bio_vec maplist of a bio that will be submitted - * for a zone-append request. This can fail for a number of reasons, such as the - * bio being full or the target block device is not a zoned block device or - * other limitations of the target block device. The target block device must - * allow bio's up to PAGE_SIZE, so it is always possible to add a single page - * to an empty bio. - * - * Returns: number of bytes added to the bio, or 0 in case of a failure. - */ -int bio_add_zone_append_page(struct bio *bio, struct page *page, - unsigned int len, unsigned int offset) -{ - struct request_queue *q = bdev_get_queue(bio->bi_bdev); - bool same_page = false; - - if (WARN_ON_ONCE(bio_op(bio) != REQ_OP_ZONE_APPEND)) - return 0; - - if (WARN_ON_ONCE(!bdev_is_zoned(bio->bi_bdev))) - return 0; - - return bio_add_hw_page(q, bio, page, len, offset, - queue_max_zone_append_sectors(q), &same_page); -} -EXPORT_SYMBOL_GPL(bio_add_zone_append_page); - /** * __bio_add_page - add page(s) to a bio in a new segment * @bio: destination bio diff --git a/drivers/nvme/target/zns.c b/drivers/nvme/target/zns.c index af9e13be7678..3aef35b05111 100644 --- a/drivers/nvme/target/zns.c +++ b/drivers/nvme/target/zns.c @@ -537,6 +537,7 @@ void nvmet_bdev_execute_zone_append(struct nvmet_req *req) u16 status = NVME_SC_SUCCESS; unsigned int total_len = 0; struct scatterlist *sg; + u32 data_len = nvmet_rw_data_len(req); struct bio *bio; int sg_cnt; @@ -544,6 +545,13 @@ void nvmet_bdev_execute_zone_append(struct nvmet_req *req) if (!nvmet_check_transfer_len(req, nvmet_rw_data_len(req))) return; + if (data_len > + bdev_max_zone_append_sectors(req->ns->bdev) << SECTOR_SHIFT) { + req->error_loc = offsetof(struct nvme_rw_command, length); + status = NVME_SC_INVALID_FIELD | NVME_STATUS_DNR; + goto out; + } + if (!req->sg_cnt) { nvmet_req_complete(req, 0); return; @@ -576,20 +584,17 @@ void nvmet_bdev_execute_zone_append(struct nvmet_req *req) bio->bi_opf |= REQ_FUA; for_each_sg(req->sg, sg, req->sg_cnt, sg_cnt) { - struct page *p = sg_page(sg); - unsigned int l = sg->length; - unsigned int o = sg->offset; - unsigned int ret; + unsigned int len = sg->length; - ret = bio_add_zone_append_page(bio, p, l, o); - if (ret != sg->length) { + if (bio_add_pc_page(bdev_get_queue(bio->bi_bdev), bio, + sg_page(sg), len, sg->offset) != len) { status = NVME_SC_INTERNAL; goto out_put_bio; } - total_len += sg->length; + total_len += len; } - if (total_len != nvmet_rw_data_len(req)) { + if (total_len != data_len) { status = NVME_SC_INTERNAL | NVME_STATUS_DNR; goto out_put_bio; } diff --git a/include/linux/bio.h b/include/linux/bio.h index faceadb040f9..4a1bf43ca53d 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -418,8 +418,6 @@ bool __must_check bio_add_folio(struct bio *bio, struct folio *folio, size_t len, size_t off); extern int bio_add_pc_page(struct request_queue *, struct bio *, struct page *, unsigned int, unsigned int); -int bio_add_zone_append_page(struct bio *bio, struct page *page, - unsigned int len, unsigned int offset); void __bio_add_page(struct bio *bio, struct page *page, unsigned int len, unsigned int off); void bio_add_folio_nofail(struct bio *bio, struct folio *folio, size_t len, -- 2.51.0 From d47de6ac8842327ae1c782670283450159c55d5b Mon Sep 17 00:00:00 2001 From: John Garry Date: Fri, 1 Nov 2024 09:22:15 +0000 Subject: [PATCH 14/16] loop: Simplify discard granularity calc A bdev discard granularity is always at least SECTOR_SIZE, so don't check for a zero value. Suggested-by: Christoph Hellwig Signed-off-by: John Garry Link: https://lore.kernel.org/r/20241101092215.422428-1-john.g.garry@oracle.com Signed-off-by: Jens Axboe --- drivers/block/loop.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/block/loop.c b/drivers/block/loop.c index 7719858c49bb..f21f4254b038 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -789,8 +789,7 @@ static void loop_config_discard(struct loop_device *lo, struct block_device *bdev = I_BDEV(inode); max_discard_sectors = bdev_write_zeroes_sectors(bdev); - granularity = bdev_discard_granularity(bdev) ?: - bdev_physical_block_size(bdev); + granularity = bdev_discard_granularity(bdev); /* * We use punch hole to reclaim the free space used by the -- 2.51.0 From 341468e0ab4bb1b26ad0b0cee7c6db4bf283043a Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Sat, 2 Nov 2024 09:42:11 +0800 Subject: [PATCH 15/16] lib/iov_iter: fix bvec iterator setup .bi_size of bvec iterator should be initialized as real max size for walking, and .bi_bvec_done just counts how many bytes need to be skipped in the 1st bvec, so .bi_size isn't related with .bi_bvec_done. This patch fixes bvec iterator initialization, and the inner `size` check isn't needed any more, so revert Eric Dumazet's commit 7bc802acf193 ("iov-iter: do not return more bytes than requested in iov_iter_extract_bvec_pages()"). Cc: Eric Dumazet Fixes: e4e535bff2bc ("iov_iter: don't require contiguous pages in iov_iter_extract_bvec_pages") Reported-by: syzbot+71abe7ab2b70bca770fd@syzkaller.appspotmail.com Tested-by: syzbot+71abe7ab2b70bca770fd@syzkaller.appspotmail.com Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- lib/iov_iter.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/iov_iter.c b/lib/iov_iter.c index c761f6db3cb4..4a54c7af62c0 100644 --- a/lib/iov_iter.c +++ b/lib/iov_iter.c @@ -1700,7 +1700,7 @@ static ssize_t iov_iter_extract_bvec_pages(struct iov_iter *i, skip = 0; } bi.bi_idx = 0; - bi.bi_size = maxsize + skip; + bi.bi_size = maxsize; bi.bi_bvec_done = skip; maxpages = want_pages_array(pages, maxsize, skip, maxpages); -- 2.51.0 From 05df01668490c670aeafe002cfa63334687b012f Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 4 Nov 2024 06:42:18 +0100 Subject: [PATCH 16/16] block: update blk_stack_limits documentation Listing every single features that needs to be pre-set by stacking drivers does not scale. Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20241104054218.45596-1-hch@lst.de Signed-off-by: Jens Axboe --- block/blk-settings.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/block/blk-settings.c b/block/blk-settings.c index 95fc39d09872..5ee3d6d1448d 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c @@ -508,10 +508,10 @@ int blk_stack_limits(struct queue_limits *t, struct queue_limits *b, t->features |= (b->features & BLK_FEAT_INHERIT_MASK); /* - * BLK_FEAT_NOWAIT and BLK_FEAT_POLL need to be supported both by the - * stacking driver and all underlying devices. The stacking driver sets - * the flags before stacking the limits, and this will clear the flags - * if any of the underlying devices does not support it. + * Some feaures need to be supported both by the stacking driver and all + * underlying devices. The stacking driver sets these flags before + * stacking the limits, and this will clear the flags if any of the + * underlying devices does not support it. */ if (!(b->features & BLK_FEAT_NOWAIT)) t->features &= ~BLK_FEAT_NOWAIT; -- 2.51.0