From 05df01668490c670aeafe002cfa63334687b012f Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 4 Nov 2024 06:42:18 +0100 Subject: [PATCH 01/16] block: update blk_stack_limits documentation Listing every single feature that needs to be pre-set by stacking drivers does not scale. Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20241104054218.45596-1-hch@lst.de Signed-off-by: Jens Axboe --- block/blk-settings.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/block/blk-settings.c b/block/blk-settings.c index 95fc39d09872..5ee3d6d1448d 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c @@ -508,10 +508,10 @@ int blk_stack_limits(struct queue_limits *t, struct queue_limits *b, t->features |= (b->features & BLK_FEAT_INHERIT_MASK); /* - * BLK_FEAT_NOWAIT and BLK_FEAT_POLL need to be supported both by the - * stacking driver and all underlying devices. The stacking driver sets - * the flags before stacking the limits, and this will clear the flags - * if any of the underlying devices does not support it. + * Some features need to be supported both by the stacking driver and all + * underlying devices. The stacking driver sets these flags before + * stacking the limits, and this will clear the flags if any of the + * underlying devices does not support it. */ if (!(b->features & BLK_FEAT_NOWAIT)) t->features &= ~BLK_FEAT_NOWAIT; -- 2.51.0 From e494c3dce6981c0b19fee79928b293a1cf0af867 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 4 Nov 2024 08:39:31 +0100 Subject: [PATCH 02/16] block: remove the max_zone_append_sectors check in blk_revalidate_disk_zones With the block layer zone append emulation, we are now always setting a max_zone_append_sectors value for zoned devices and this check can't ever trigger. Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20241104073955.112324-2-hch@lst.de Signed-off-by: Jens Axboe --- block/blk-zoned.c | 6 ------ 1 file changed, 6 deletions(-) diff --git a/block/blk-zoned.c b/block/blk-zoned.c index af19296fa50d..a287577d1ad6 100644 --- a/block/blk-zoned.c +++ b/block/blk-zoned.c @@ -1774,12 +1774,6 @@ int blk_revalidate_disk_zones(struct gendisk *disk) return -ENODEV; } - if (!queue_max_zone_append_sectors(q)) { - pr_warn("%s: Invalid 0 maximum zone append limit\n", - disk->disk_name); - return -ENODEV; - } - /* * Ensure that all memory allocations in this context are done as if * GFP_NOIO was specified. -- 2.51.0 From 2a8f6153e1c2db06a537a5c9d61102eb591776f1 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 4 Nov 2024 08:39:32 +0100 Subject: [PATCH 03/16] block: pre-calculate max_zone_append_sectors max_zone_append_sectors differs from all other queue limits in that the final value used is not stored in the queue_limits but needs to be obtained using the queue_limits_max_zone_append_sectors helper. This not only adds (tiny) extra overhead to the I/O path, but also can be easily forgotten in file system code. Add a new max_hw_zone_append_sectors value to queue_limits which is set by the driver, and calculate max_zone_append_sectors from that and the other inputs in blk_validate_zoned_limits, similar to how max_sectors is calculated, to fix this.
Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20241104073955.112324-3-hch@lst.de Signed-off-by: Jens Axboe --- block/blk-core.c | 2 +- block/blk-merge.c | 3 +-- block/blk-settings.c | 25 ++++++++++++------------- block/blk-sysfs.c | 17 +++-------------- drivers/block/null_blk/zoned.c | 2 +- drivers/block/ublk_drv.c | 2 +- drivers/block/virtio_blk.c | 2 +- drivers/md/dm-zone.c | 4 ++-- drivers/nvme/host/multipath.c | 2 +- drivers/nvme/host/zns.c | 2 +- drivers/scsi/sd_zbc.c | 2 -- include/linux/blkdev.h | 21 +++------------------ 12 files changed, 27 insertions(+), 57 deletions(-) diff --git a/block/blk-core.c b/block/blk-core.c index 09d10bb95fda..5df4607321ca 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -607,7 +607,7 @@ static inline blk_status_t blk_check_zone_append(struct request_queue *q, return BLK_STS_IOERR; /* Make sure the BIO is small enough and will not get split */ - if (nr_sectors > queue_max_zone_append_sectors(q)) + if (nr_sectors > q->limits.max_zone_append_sectors) return BLK_STS_IOERR; bio->bi_opf |= REQ_NOMERGE; diff --git a/block/blk-merge.c b/block/blk-merge.c index d813d799cee7..7c1375a080ad 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -388,11 +388,10 @@ struct bio *bio_split_rw(struct bio *bio, const struct queue_limits *lim, struct bio *bio_split_zone_append(struct bio *bio, const struct queue_limits *lim, unsigned *nr_segs) { - unsigned int max_sectors = queue_limits_max_zone_append_sectors(lim); int split_sectors; split_sectors = bio_split_rw_at(bio, lim, nr_segs, - max_sectors << SECTOR_SHIFT); + lim->max_zone_append_sectors << SECTOR_SHIFT); if (WARN_ON_ONCE(split_sectors > 0)) split_sectors = -EINVAL; return bio_submit_split(bio, split_sectors); diff --git a/block/blk-settings.c b/block/blk-settings.c index 5ee3d6d1448d..5cb69d85af0e 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c @@ -91,17 +91,16 @@ static int blk_validate_zoned_limits(struct queue_limits *lim) if (lim->zone_write_granularity < lim->logical_block_size) lim->zone_write_granularity = lim->logical_block_size; - if (lim->max_zone_append_sectors) { - /* - * The Zone Append size is limited by the maximum I/O size - * and the zone size given that it can't span zones. - */ - lim->max_zone_append_sectors = - min3(lim->max_hw_sectors, - lim->max_zone_append_sectors, - lim->chunk_sectors); - } - + /* + * The Zone Append size is limited by the maximum I/O size and the zone + * size given that it can't span zones. + * + * If no max_hw_zone_append_sectors limit is provided, the block layer + * will emulated it, else we're also bound by the hardware limit. 
+ */ + lim->max_zone_append_sectors = + min_not_zero(lim->max_hw_zone_append_sectors, + min(lim->chunk_sectors, lim->max_hw_sectors)); return 0; } @@ -527,8 +526,8 @@ int blk_stack_limits(struct queue_limits *t, struct queue_limits *b, t->max_dev_sectors = min_not_zero(t->max_dev_sectors, b->max_dev_sectors); t->max_write_zeroes_sectors = min(t->max_write_zeroes_sectors, b->max_write_zeroes_sectors); - t->max_zone_append_sectors = min(queue_limits_max_zone_append_sectors(t), - queue_limits_max_zone_append_sectors(b)); + t->max_hw_zone_append_sectors = min(t->max_hw_zone_append_sectors, + b->max_hw_zone_append_sectors); t->seg_boundary_mask = min_not_zero(t->seg_boundary_mask, b->seg_boundary_mask); diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index 741b95dfdbf6..d9f22122ae2f 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -131,6 +131,7 @@ QUEUE_SYSFS_LIMIT_SHOW_SECTORS_TO_BYTES(max_hw_discard_sectors) QUEUE_SYSFS_LIMIT_SHOW_SECTORS_TO_BYTES(max_write_zeroes_sectors) QUEUE_SYSFS_LIMIT_SHOW_SECTORS_TO_BYTES(atomic_write_max_sectors) QUEUE_SYSFS_LIMIT_SHOW_SECTORS_TO_BYTES(atomic_write_boundary_sectors) +QUEUE_SYSFS_LIMIT_SHOW_SECTORS_TO_BYTES(max_zone_append_sectors) #define QUEUE_SYSFS_LIMIT_SHOW_SECTORS_TO_KB(_field) \ static ssize_t queue_##_field##_show(struct gendisk *disk, char *page) \ @@ -178,18 +179,6 @@ static ssize_t queue_max_discard_sectors_store(struct gendisk *disk, return ret; } -/* - * For zone append queue_max_zone_append_sectors does not just return the - * underlying queue limits, but actually contains a calculation. Because of - * that we can't simply use QUEUE_SYSFS_LIMIT_SHOW_SECTORS_TO_BYTES here. - */ -static ssize_t queue_zone_append_max_show(struct gendisk *disk, char *page) -{ - return sprintf(page, "%llu\n", - (u64)queue_max_zone_append_sectors(disk->queue) << - SECTOR_SHIFT); -} - static ssize_t queue_max_sectors_store(struct gendisk *disk, const char *page, size_t count) { @@ -479,7 +468,7 @@ QUEUE_RO_ENTRY(queue_atomic_write_unit_min, "atomic_write_unit_min_bytes"); QUEUE_RO_ENTRY(queue_write_same_max, "write_same_max_bytes"); QUEUE_RO_ENTRY(queue_max_write_zeroes_sectors, "write_zeroes_max_bytes"); -QUEUE_RO_ENTRY(queue_zone_append_max, "zone_append_max_bytes"); +QUEUE_RO_ENTRY(queue_max_zone_append_sectors, "zone_append_max_bytes"); QUEUE_RO_ENTRY(queue_zone_write_granularity, "zone_write_granularity"); QUEUE_RO_ENTRY(queue_zoned, "zoned"); @@ -607,7 +596,7 @@ static struct attribute *queue_attrs[] = { &queue_atomic_write_unit_max_entry.attr, &queue_write_same_max_entry.attr, &queue_max_write_zeroes_sectors_entry.attr, - &queue_zone_append_max_entry.attr, + &queue_max_zone_append_sectors_entry.attr, &queue_zone_write_granularity_entry.attr, &queue_rotational_entry.attr, &queue_zoned_entry.attr, diff --git a/drivers/block/null_blk/zoned.c b/drivers/block/null_blk/zoned.c index 9bc768b2ca56..0d5f9bf95229 100644 --- a/drivers/block/null_blk/zoned.c +++ b/drivers/block/null_blk/zoned.c @@ -166,7 +166,7 @@ int null_init_zoned_dev(struct nullb_device *dev, lim->features |= BLK_FEAT_ZONED; lim->chunk_sectors = dev->zone_size_sects; - lim->max_zone_append_sectors = dev->zone_append_max_sectors; + lim->max_hw_zone_append_sectors = dev->zone_append_max_sectors; lim->max_open_zones = dev->zone_max_open; lim->max_active_zones = dev->zone_max_active; return 0; diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c index 59951e7c2593..8d938b2b41ee 100644 --- a/drivers/block/ublk_drv.c +++ b/drivers/block/ublk_drv.c @@ -2270,7 +2270,7 @@ static 
int ublk_ctrl_start_dev(struct ublk_device *ub, struct io_uring_cmd *cmd) lim.features |= BLK_FEAT_ZONED; lim.max_active_zones = p->max_active_zones; lim.max_open_zones = p->max_open_zones; - lim.max_zone_append_sectors = p->max_zone_append_sectors; + lim.max_hw_zone_append_sectors = p->max_zone_append_sectors; } if (ub->params.basic.attrs & UBLK_ATTR_VOLATILE_CACHE) { diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c index 194417abc105..0e99a4714928 100644 --- a/drivers/block/virtio_blk.c +++ b/drivers/block/virtio_blk.c @@ -784,7 +784,7 @@ static int virtblk_read_zoned_limits(struct virtio_blk *vblk, wg, v); return -ENODEV; } - lim->max_zone_append_sectors = v; + lim->max_hw_zone_append_sectors = v; dev_dbg(&vdev->dev, "max append sectors = %u\n", v); return 0; diff --git a/drivers/md/dm-zone.c b/drivers/md/dm-zone.c index c0d41c36e06e..20edd3fabbab 100644 --- a/drivers/md/dm-zone.c +++ b/drivers/md/dm-zone.c @@ -344,7 +344,7 @@ int dm_set_zones_restrictions(struct dm_table *t, struct request_queue *q, clear_bit(DMF_EMULATE_ZONE_APPEND, &md->flags); } else { set_bit(DMF_EMULATE_ZONE_APPEND, &md->flags); - lim->max_zone_append_sectors = 0; + lim->max_hw_zone_append_sectors = 0; } /* @@ -379,7 +379,7 @@ int dm_set_zones_restrictions(struct dm_table *t, struct request_queue *q, if (!zlim.mapped_nr_seq_zones) { lim->max_open_zones = 0; lim->max_active_zones = 0; - lim->max_zone_append_sectors = 0; + lim->max_hw_zone_append_sectors = 0; lim->zone_write_granularity = 0; lim->chunk_sectors = 0; lim->features &= ~BLK_FEAT_ZONED; diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c index 6a15873055b9..c26cb7d3a2e5 100644 --- a/drivers/nvme/host/multipath.c +++ b/drivers/nvme/host/multipath.c @@ -636,7 +636,7 @@ int nvme_mpath_alloc_disk(struct nvme_ctrl *ctrl, struct nvme_ns_head *head) if (head->ids.csi == NVME_CSI_ZNS) lim.features |= BLK_FEAT_ZONED; else - lim.max_zone_append_sectors = 0; + lim.max_hw_zone_append_sectors = 0; head->disk = blk_alloc_disk(&lim, ctrl->numa_node); if (IS_ERR(head->disk)) diff --git a/drivers/nvme/host/zns.c b/drivers/nvme/host/zns.c index 9a06f9d98cd6..382949e18c6a 100644 --- a/drivers/nvme/host/zns.c +++ b/drivers/nvme/host/zns.c @@ -111,7 +111,7 @@ void nvme_update_zone_info(struct nvme_ns *ns, struct queue_limits *lim, lim->features |= BLK_FEAT_ZONED; lim->max_open_zones = zi->max_open_zones; lim->max_active_zones = zi->max_active_zones; - lim->max_zone_append_sectors = ns->ctrl->max_zone_append; + lim->max_hw_zone_append_sectors = ns->ctrl->max_zone_append; lim->chunk_sectors = ns->head->zsze = nvme_lba_to_sect(ns->head, zi->zone_size); } diff --git a/drivers/scsi/sd_zbc.c b/drivers/scsi/sd_zbc.c index ee2b74238758..de5c54c057ec 100644 --- a/drivers/scsi/sd_zbc.c +++ b/drivers/scsi/sd_zbc.c @@ -634,8 +634,6 @@ int sd_zbc_read_zones(struct scsi_disk *sdkp, struct queue_limits *lim, lim->max_open_zones = sdkp->zones_max_open; lim->max_active_zones = 0; lim->chunk_sectors = logical_to_sectors(sdkp->device, zone_blocks); - /* Enable block layer zone append emulation */ - lim->max_zone_append_sectors = 0; return 0; diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 7bfc877e159e..6d1413bd69a5 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -375,6 +375,7 @@ struct queue_limits { unsigned int max_user_discard_sectors; unsigned int max_secure_erase_sectors; unsigned int max_write_zeroes_sectors; + unsigned int max_hw_zone_append_sectors; unsigned int max_zone_append_sectors; unsigned int 
discard_granularity; unsigned int discard_alignment; @@ -1204,25 +1205,9 @@ static inline unsigned int queue_max_segment_size(const struct request_queue *q) return q->limits.max_segment_size; } -static inline unsigned int -queue_limits_max_zone_append_sectors(const struct queue_limits *l) -{ - unsigned int max_sectors = min(l->chunk_sectors, l->max_hw_sectors); - - return min_not_zero(l->max_zone_append_sectors, max_sectors); -} - -static inline unsigned int queue_max_zone_append_sectors(struct request_queue *q) -{ - if (!blk_queue_is_zoned(q)) - return 0; - - return queue_limits_max_zone_append_sectors(&q->limits); -} - static inline bool queue_emulates_zone_append(struct request_queue *q) { - return blk_queue_is_zoned(q) && !q->limits.max_zone_append_sectors; + return blk_queue_is_zoned(q) && !q->limits.max_hw_zone_append_sectors; } static inline bool bdev_emulates_zone_append(struct block_device *bdev) @@ -1233,7 +1218,7 @@ static inline bool bdev_emulates_zone_append(struct block_device *bdev) static inline unsigned int bdev_max_zone_append_sectors(struct block_device *bdev) { - return queue_max_zone_append_sectors(bdev_get_queue(bdev)); + return bdev_limits(bdev)->max_zone_append_sectors; } static inline unsigned int bdev_max_segments(struct block_device *bdev) -- 2.51.0 From 1e79892e76a78f33deecc895086bef92d8147183 Mon Sep 17 00:00:00 2001 From: Uros Bizjak Date: Mon, 7 Oct 2024 10:48:04 +0200 Subject: [PATCH 04/16] md/raid5-ppl: Use atomic64_inc_return() in ppl_new_iounit() Use atomic64_inc_return(&ref) instead of atomic64_add_return(1, &ref) to use the optimized implementation and ease register pressure around the primitive for targets that implement an optimized variant. Signed-off-by: Uros Bizjak Cc: Song Liu Cc: Yu Kuai Reviewed-by: Yu Kuai Reviewed-by: Artur Paszkiewicz Link: https://lore.kernel.org/r/20241007084831.48067-1-ubizjak@gmail.com Signed-off-by: Song Liu --- drivers/md/raid5-ppl.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/md/raid5-ppl.c b/drivers/md/raid5-ppl.c index a70cbec12ed0..37c4da5311ca 100644 --- a/drivers/md/raid5-ppl.c +++ b/drivers/md/raid5-ppl.c @@ -258,7 +258,7 @@ static struct ppl_io_unit *ppl_new_iounit(struct ppl_log *log, memset(pplhdr->reserved, 0xff, PPL_HDR_RESERVED); pplhdr->signature = cpu_to_le32(ppl_conf->signature); - io->seq = atomic64_add_return(1, &ppl_conf->seq); + io->seq = atomic64_inc_return(&ppl_conf->seq); pplhdr->generation = cpu_to_le64(io->seq); return io; -- 2.51.0 From 4abfce19c7fb090c93e6677e26b2c709ca6f6fd8 Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Thu, 31 Oct 2024 11:31:08 +0800 Subject: [PATCH 05/16] md: add a new helper rdev_blocked() The helper will be used in later patches for raid1/raid10/raid5; the difference is that a Faulty rdev with an unacknowledged bad block will not be considered blocked.
Signed-off-by: Yu Kuai Tested-by: Mariusz Tkaczyk Link: https://lore.kernel.org/r/20241031033114.3845582-2-yukuai1@huaweicloud.com Signed-off-by: Song Liu --- drivers/md/md.h | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/drivers/md/md.h b/drivers/md/md.h index 5d2e6bd58e4d..4ba93af36126 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h @@ -1002,6 +1002,30 @@ static inline void mddev_trace_remap(struct mddev *mddev, struct bio *bio, trace_block_bio_remap(bio, disk_devt(mddev->gendisk), sector); } +static inline bool rdev_blocked(struct md_rdev *rdev) +{ + /* + * Blocked will be set by error handler and cleared by daemon after + * updating superblock, meanwhile write IO should be blocked to prevent + * reading old data after power failure. + */ + if (test_bit(Blocked, &rdev->flags)) + return true; + + /* + * Faulty device should not be accessed anymore, there is no need to + * wait for bad block to be acknowledged. + */ + if (test_bit(Faulty, &rdev->flags)) + return false; + + /* rdev is blocked by badblocks. */ + if (test_bit(BlockedBadBlocks, &rdev->flags)) + return true; + + return false; +} + #define mddev_add_trace_msg(mddev, fmt, args...) \ do { \ if (!mddev_is_dm(mddev)) \ -- 2.51.0 From 50e8274855e7ab5499ff8296e09802874a3f03b1 Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Thu, 31 Oct 2024 11:31:09 +0800 Subject: [PATCH 06/16] md: don't wait faulty rdev in md_wait_for_blocked_rdev() md_wait_for_blocked_rdev() is called for write IO while the rdev is blocked; however, the rdev can become faulty after choosing this rdev to write, and a faulty rdev should never be accessed anymore, hence there is no point in waiting for a faulty rdev to be unblocked. Signed-off-by: Yu Kuai Tested-by: Mariusz Tkaczyk Link: https://lore.kernel.org/r/20241031033114.3845582-3-yukuai1@huaweicloud.com Signed-off-by: Song Liu --- drivers/md/md.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/drivers/md/md.c b/drivers/md/md.c index 179ee4afe937..b2a0e0a84309 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -9762,9 +9762,7 @@ EXPORT_SYMBOL(md_reap_sync_thread); void md_wait_for_blocked_rdev(struct md_rdev *rdev, struct mddev *mddev) { sysfs_notify_dirent_safe(rdev->sysfs_state); - wait_event_timeout(rdev->blocked_wait, - !test_bit(Blocked, &rdev->flags) && - !test_bit(BlockedBadBlocks, &rdev->flags), + wait_event_timeout(rdev->blocked_wait, !rdev_blocked(rdev), msecs_to_jiffies(5000)); rdev_dec_pending(rdev, mddev); } -- 2.51.0 From 29967332ced51a15a22f11381eeebbc500ba1858 Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Thu, 31 Oct 2024 11:31:10 +0800 Subject: [PATCH 07/16] md: don't record new badblocks for faulty rdev Faulty is checked before issuing IO to the rdev; however, the rdev can become faulty at any time, hence it's possible that rdev_set_badblocks() will be called for a faulty rdev. In this case, mddev->sb_flags will be set and some other path can be blocked by updating the super block. Since a faulty rdev will not be accessed anymore, there is no need to record new badblocks for a faulty rdev and force a super block update. Note that this is not a bugfix, it just prevents updating the superblock in some corner cases, and will help to slice a bug related to external metadata[1]; testing also shows that devices are removed faster in the case of IO errors.
[1] https://lore.kernel.org/all/f34452df-810b-48b2-a9b4-7f925699a9e7@linux.intel.com/ Signed-off-by: Yu Kuai Tested-by: Mariusz Tkaczyk Link: https://lore.kernel.org/r/20241031033114.3845582-4-yukuai1@huaweicloud.com Signed-off-by: Song Liu --- drivers/md/md.c | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/drivers/md/md.c b/drivers/md/md.c index b2a0e0a84309..bbe002ebd584 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -9791,6 +9791,17 @@ int rdev_set_badblocks(struct md_rdev *rdev, sector_t s, int sectors, { struct mddev *mddev = rdev->mddev; int rv; + + /* + * Recording new badblocks for faulty rdev will force unnecessary + * super block updating. This is fragile for external management because + * userspace daemon may be trying to remove this device and deadlock may + * occur. This will probably be solved in mdadm, but it is safer to + * avoid it. + */ + if (test_bit(Faulty, &rdev->flags)) + return 1; + if (is_new) s += rdev->new_data_offset; else -- 2.51.0 From 88ed59c4cc6c2dbdf03345bce54e0d7a272937bc Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Thu, 31 Oct 2024 11:31:11 +0800 Subject: [PATCH 08/16] md/raid1: factor out helper to handle blocked rdev from raid1_write_request() Currently raid1 prepares IO for the underlying disks while checking if any disk is blocked; if so, the allocated resources must be released, then raid1 waits for the rdev to be unblocked and tries to prepare the IO again. Make the code cleaner by checking for blocked rdevs first; it doesn't matter if an rdev is blocked while issuing IO, as the IO will wait for the rdev to be unblocked or not. Signed-off-by: Yu Kuai Tested-by: Mariusz Tkaczyk Link: https://lore.kernel.org/r/20241031033114.3845582-5-yukuai1@huaweicloud.com Signed-off-by: Song Liu --- drivers/md/raid1.c | 84 ++++++++++++++++++++++++++-------------------- 1 file changed, 48 insertions(+), 36 deletions(-) diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index 6c9d24203f39..1679c1e9b3d5 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -1412,6 +1412,49 @@ static void raid1_read_request(struct mddev *mddev, struct bio *bio, submit_bio_noacct(read_bio); } +static bool wait_blocked_rdev(struct mddev *mddev, struct bio *bio) +{ + struct r1conf *conf = mddev->private; + int disks = conf->raid_disks * 2; + int i; + +retry: + for (i = 0; i < disks; i++) { + struct md_rdev *rdev = conf->mirrors[i].rdev; + + if (!rdev) + continue; + + if (test_bit(Blocked, &rdev->flags)) { + if (bio->bi_opf & REQ_NOWAIT) + return false; + + mddev_add_trace_msg(rdev->mddev, "raid1 wait rdev %d blocked", + rdev->raid_disk); + atomic_inc(&rdev->nr_pending); + md_wait_for_blocked_rdev(rdev, rdev->mddev); + goto retry; + } + + /* don't write here until the bad block is acknowledged */ + if (test_bit(WriteErrorSeen, &rdev->flags) && + rdev_has_badblock(rdev, bio->bi_iter.bi_sector, + bio_sectors(bio)) < 0) { + if (bio->bi_opf & REQ_NOWAIT) + return false; + + set_bit(BlockedBadBlocks, &rdev->flags); + mddev_add_trace_msg(rdev->mddev, "raid1 wait rdev %d blocked", + rdev->raid_disk); + atomic_inc(&rdev->nr_pending); + md_wait_for_blocked_rdev(rdev, rdev->mddev); + goto retry; + } + } + + return true; +} + static void raid1_write_request(struct mddev *mddev, struct bio *bio, int max_write_sectors) { @@ -1419,7 +1462,6 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio, struct r1bio *r1_bio; int i, disks; unsigned long flags; - struct md_rdev *blocked_rdev; int first_clone; int max_sectors; bool write_behind = false; @@ -1457,7 +1499,11 @@ static void 
raid1_write_request(struct mddev *mddev, struct bio *bio, return; } - retry_write: + if (!wait_blocked_rdev(mddev, bio)) { + bio_wouldblock_error(bio); + return; + } + r1_bio = alloc_r1bio(mddev, bio); r1_bio->sectors = max_write_sectors; @@ -1473,7 +1519,6 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio, */ disks = conf->raid_disks * 2; - blocked_rdev = NULL; max_sectors = r1_bio->sectors; for (i = 0; i < disks; i++) { struct md_rdev *rdev = conf->mirrors[i].rdev; @@ -1486,11 +1531,6 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio, if (!is_discard && rdev && test_bit(WriteMostly, &rdev->flags)) write_behind = true; - if (rdev && unlikely(test_bit(Blocked, &rdev->flags))) { - atomic_inc(&rdev->nr_pending); - blocked_rdev = rdev; - break; - } r1_bio->bios[i] = NULL; if (!rdev || test_bit(Faulty, &rdev->flags)) { if (i < conf->raid_disks) @@ -1506,13 +1546,6 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio, is_bad = is_badblock(rdev, r1_bio->sector, max_sectors, &first_bad, &bad_sectors); - if (is_bad < 0) { - /* mustn't write here until the bad block is - * acknowledged*/ - set_bit(BlockedBadBlocks, &rdev->flags); - blocked_rdev = rdev; - break; - } if (is_bad && first_bad <= r1_bio->sector) { /* Cannot write here at all */ bad_sectors -= (r1_bio->sector - first_bad); @@ -1543,27 +1576,6 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio, r1_bio->bios[i] = bio; } - if (unlikely(blocked_rdev)) { - /* Wait for this device to become unblocked */ - int j; - - for (j = 0; j < i; j++) - if (r1_bio->bios[j]) - rdev_dec_pending(conf->mirrors[j].rdev, mddev); - mempool_free(r1_bio, &conf->r1bio_pool); - allow_barrier(conf, bio->bi_iter.bi_sector); - - if (bio->bi_opf & REQ_NOWAIT) { - bio_wouldblock_error(bio); - return; - } - mddev_add_trace_msg(mddev, "raid1 wait rdev %d blocked", - blocked_rdev->raid_disk); - md_wait_for_blocked_rdev(blocked_rdev, mddev); - wait_barrier(conf, bio->bi_iter.bi_sector, false); - goto retry_write; - } - /* * When using a bitmap, we may call alloc_behind_master_bio below. * alloc_behind_master_bio allocates a copy of the data payload a page -- 2.51.0 From ff31a7ef2b13aae27203d7fc29280ab0a2f8bf18 Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Thu, 31 Oct 2024 11:31:12 +0800 Subject: [PATCH 09/16] md/raid1: don't wait for Faulty rdev in wait_blocked_rdev() Faulty rdev should never be accessed anymore, hence there is no point to wait for bad block to be acknowledged in this case while handling write request. 
Signed-off-by: Yu Kuai Tested-by: Mariusz Tkaczyk Link: https://lore.kernel.org/r/20241031033114.3845582-6-yukuai1@huaweicloud.com Signed-off-by: Song Liu --- drivers/md/raid1.c | 17 ++++------------- 1 file changed, 4 insertions(+), 13 deletions(-) diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index 1679c1e9b3d5..cd3e94dceabc 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -1425,25 +1425,16 @@ retry: if (!rdev) continue; - if (test_bit(Blocked, &rdev->flags)) { - if (bio->bi_opf & REQ_NOWAIT) - return false; - - mddev_add_trace_msg(rdev->mddev, "raid1 wait rdev %d blocked", - rdev->raid_disk); - atomic_inc(&rdev->nr_pending); - md_wait_for_blocked_rdev(rdev, rdev->mddev); - goto retry; - } - /* don't write here until the bad block is acknowledged */ if (test_bit(WriteErrorSeen, &rdev->flags) && rdev_has_badblock(rdev, bio->bi_iter.bi_sector, - bio_sectors(bio)) < 0) { + bio_sectors(bio)) < 0) + set_bit(BlockedBadBlocks, &rdev->flags); + + if (rdev_blocked(rdev)) { if (bio->bi_opf & REQ_NOWAIT) return false; - set_bit(BlockedBadBlocks, &rdev->flags); mddev_add_trace_msg(rdev->mddev, "raid1 wait rdev %d blocked", rdev->raid_disk); atomic_inc(&rdev->nr_pending); -- 2.51.0 From d419284c95d369f2b77f71fb20f2d61850aa61b8 Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Thu, 31 Oct 2024 11:31:13 +0800 Subject: [PATCH 10/16] md/raid10: don't wait for Faulty rdev in wait_blocked_rdev() Faulty rdev should never be accessed anymore, hence there is no point to wait for bad block to be acknowledged in this case while handling write request. Signed-off-by: Yu Kuai Tested-by: Mariusz Tkaczyk Link: https://lore.kernel.org/r/20241031033114.3845582-7-yukuai1@huaweicloud.com Signed-off-by: Song Liu --- drivers/md/raid10.c | 40 ++++++++++++++++++---------------------- 1 file changed, 18 insertions(+), 22 deletions(-) diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index f3bf1116794a..ff73db2f6c41 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -1285,9 +1285,9 @@ static void raid10_write_one_disk(struct mddev *mddev, struct r10bio *r10_bio, static void wait_blocked_dev(struct mddev *mddev, struct r10bio *r10_bio) { - int i; struct r10conf *conf = mddev->private; struct md_rdev *blocked_rdev; + int i; retry_wait: blocked_rdev = NULL; @@ -1295,40 +1295,36 @@ retry_wait: struct md_rdev *rdev, *rrdev; rdev = conf->mirrors[i].rdev; - rrdev = conf->mirrors[i].replacement; - if (rdev && unlikely(test_bit(Blocked, &rdev->flags))) { - atomic_inc(&rdev->nr_pending); - blocked_rdev = rdev; - break; - } - if (rrdev && unlikely(test_bit(Blocked, &rrdev->flags))) { - atomic_inc(&rrdev->nr_pending); - blocked_rdev = rrdev; - break; - } - - if (rdev && test_bit(WriteErrorSeen, &rdev->flags)) { + if (rdev) { sector_t dev_sector = r10_bio->devs[i].addr; /* * Discard request doesn't care the write result * so it doesn't need to wait blocked disk here. 
*/ - if (!r10_bio->sectors) - continue; - - if (rdev_has_badblock(rdev, dev_sector, - r10_bio->sectors) < 0) { + if (test_bit(WriteErrorSeen, &rdev->flags) && + r10_bio->sectors && + rdev_has_badblock(rdev, dev_sector, + r10_bio->sectors) < 0) /* - * Mustn't write here until the bad block - * is acknowledged + * Mustn't write here until the bad + * block is acknowledged */ - atomic_inc(&rdev->nr_pending); set_bit(BlockedBadBlocks, &rdev->flags); + + if (rdev_blocked(rdev)) { blocked_rdev = rdev; + atomic_inc(&rdev->nr_pending); break; } } + + rrdev = conf->mirrors[i].replacement; + if (rrdev && rdev_blocked(rrdev)) { + atomic_inc(&rrdev->nr_pending); + blocked_rdev = rrdev; + break; + } } if (unlikely(blocked_rdev)) { -- 2.51.0 From 649bfec6908bd2365008db79b7328c6c22e662d8 Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Thu, 31 Oct 2024 11:31:14 +0800 Subject: [PATCH 11/16] md/raid5: don't set Faulty rdev for blocked_rdev Faulty rdev should never be accessed anymore, hence there is no point to wait for bad block to be acknowledged in this case while handling write request. Signed-off-by: Yu Kuai Tested-by: Mariusz Tkaczyk Link: https://lore.kernel.org/r/20241031033114.3845582-8-yukuai1@huaweicloud.com Signed-off-by: Song Liu --- drivers/md/raid5.c | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index dc2ea636d173..f5ac81dd21b2 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -4724,14 +4724,13 @@ static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s) if (rdev) { is_bad = rdev_has_badblock(rdev, sh->sector, RAID5_STRIPE_SECTORS(conf)); - if (s->blocked_rdev == NULL - && (test_bit(Blocked, &rdev->flags) - || is_bad < 0)) { + if (s->blocked_rdev == NULL) { if (is_bad < 0) - set_bit(BlockedBadBlocks, - &rdev->flags); - s->blocked_rdev = rdev; - atomic_inc(&rdev->nr_pending); + set_bit(BlockedBadBlocks, &rdev->flags); + if (rdev_blocked(rdev)) { + s->blocked_rdev = rdev; + atomic_inc(&rdev->nr_pending); + } } } clear_bit(R5_Insync, &dev->flags); -- 2.51.0 From 6012169e8aae9c0eda38bbedcd7a1540a81220ae Mon Sep 17 00:00:00 2001 From: Yuan Can Date: Tue, 5 Nov 2024 21:01:05 +0800 Subject: [PATCH 12/16] md/md-bitmap: Add missing destroy_work_on_stack() This commit add missed destroy_work_on_stack() operations for unplug_work.work in bitmap_unplug_async(). 
Fixes: a022325ab970 ("md/md-bitmap: add a new helper to unplug bitmap asynchrously") Cc: stable@vger.kernel.org Signed-off-by: Yuan Can Reviewed-by: Yu Kuai Link: https://lore.kernel.org/r/20241105130105.127336-1-yuancan@huawei.com Signed-off-by: Song Liu --- drivers/md/md-bitmap.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/md/md-bitmap.c b/drivers/md/md-bitmap.c index 29da10e6f703..c3a42dd66ce5 100644 --- a/drivers/md/md-bitmap.c +++ b/drivers/md/md-bitmap.c @@ -1285,6 +1285,7 @@ static void bitmap_unplug_async(struct bitmap *bitmap) queue_work(md_bitmap_wq, &unplug_work.work); wait_for_completion(&done); + destroy_work_on_stack(&unplug_work.work); } static void bitmap_unplug(struct mddev *mddev, bool sync) -- 2.51.0 From 91ff97a7225996db1071cfacc209a4fccce2246f Mon Sep 17 00:00:00 2001 From: Philipp Stanner Date: Wed, 6 Nov 2024 15:52:50 +0100 Subject: [PATCH 13/16] mtip32xx: Replace deprecated PCI functions pcim_iomap_table() and pcim_request_regions() have been deprecated in commit e354bb84a4c1 ("PCI: Deprecate pcim_iomap_table(), pcim_iomap_regions_request_all()") and commit d140f80f60358 ("PCI: Deprecate pcim_iomap_regions() in favor of pcim_iomap_region()"), respectively. Replace these functions with pcim_iomap_region(). Signed-off-by: Philipp Stanner Link: https://lore.kernel.org/r/20241106145249.108996-2-pstanner@redhat.com Signed-off-by: Jens Axboe --- drivers/block/mtip32xx/mtip32xx.c | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/drivers/block/mtip32xx/mtip32xx.c b/drivers/block/mtip32xx/mtip32xx.c index 223faa9d5ffd..43701b7b10a7 100644 --- a/drivers/block/mtip32xx/mtip32xx.c +++ b/drivers/block/mtip32xx/mtip32xx.c @@ -2701,7 +2701,12 @@ static int mtip_hw_init(struct driver_data *dd) int rv; unsigned long timeout, timetaken; - dd->mmio = pcim_iomap_table(dd->pdev)[MTIP_ABAR]; + dd->mmio = pcim_iomap_region(dd->pdev, MTIP_ABAR, MTIP_DRV_NAME); + if (IS_ERR(dd->mmio)) { + dev_err(&dd->pdev->dev, "Unable to request / ioremap PCI region\n"); + return PTR_ERR(dd->mmio); + } + mtip_detect_product(dd); if (dd->product_type == MTIP_PRODUCT_UNKNOWN) { @@ -3710,13 +3715,6 @@ static int mtip_pci_probe(struct pci_dev *pdev, goto iomap_err; } - /* Map BAR5 to memory. */ - rv = pcim_iomap_regions(pdev, 1 << MTIP_ABAR, MTIP_DRV_NAME); - if (rv < 0) { - dev_err(&pdev->dev, "Unable to map regions\n"); - goto iomap_err; - } - rv = dma_set_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(64)); if (rv) { dev_warn(&pdev->dev, "64-bit DMA enable failed\n"); -- 2.51.0 From ab9bc81c1cf0efc7fc5a3aa4e562aa88d09ada57 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 7 Nov 2024 05:45:34 -0700 Subject: [PATCH 14/16] Revert "block: pre-calculate max_zone_append_sectors" This causes issue on, at least, nvme-mpath where my boot fails with: WARNING: CPU: 354 PID: 2729 at block/blk-settings.c:75 blk_validate_limits+0x356/0x380 Modules linked in: tg3(+) nvme usbcore scsi_mod ptp i2c_piix4 libphy nvme_core crc32c_intel scsi_common usb_common pps_core i2c_smbus CPU: 354 UID: 0 PID: 2729 Comm: kworker/u2061:1 Not tainted 6.12.0-rc6+ #181 Hardware name: Dell Inc. 
PowerEdge R7625/06444F, BIOS 1.8.3 04/02/2024 Workqueue: async async_run_entry_fn RIP: 0010:blk_validate_limits+0x356/0x380 Code: f6 47 01 04 75 28 83 bf 94 00 00 00 00 75 39 83 bf 98 00 00 00 00 75 34 83 7f 68 00 75 32 31 c0 83 7f 5c 00 0f 84 9b fd ff ff <0f> 0b eb 13 0f 0b eb 0f 48 c7 c0 74 12 58 92 48 89 c7 e8 13 76 46 RSP: 0018:ffffa8a1dfb93b30 EFLAGS: 00010286 RAX: 0000000000000000 RBX: ffff9232829c8388 RCX: 0000000000000088 RDX: 0000000000000080 RSI: 0000000000000200 RDI: ffffa8a1dfb93c38 RBP: 000000000000000c R08: 00000000ffffffff R09: 000000000000ffff R10: 0000000000000000 R11: 0000000000000000 R12: ffff9232829b9000 R13: ffff9232829b9010 R14: ffffa8a1dfb93c38 R15: ffffa8a1dfb93c38 FS: 0000000000000000(0000) GS:ffff923867c80000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 000055c1b92480a8 CR3: 0000002484ff0002 CR4: 0000000000370ef0 Call Trace: ? __warn+0xca/0x1a0 ? blk_validate_limits+0x356/0x380 ? report_bug+0x11a/0x1a0 ? handle_bug+0x5e/0x90 ? exc_invalid_op+0x16/0x40 ? asm_exc_invalid_op+0x16/0x20 ? blk_validate_limits+0x356/0x380 blk_alloc_queue+0x7a/0x250 __blk_alloc_disk+0x39/0x80 nvme_mpath_alloc_disk+0x13d/0x1b0 [nvme_core] nvme_scan_ns+0xcc7/0x1010 [nvme_core] async_run_entry_fn+0x27/0x120 process_scheduled_works+0x1a0/0x360 worker_thread+0x2bc/0x350 ? pr_cont_work+0x1b0/0x1b0 kthread+0x111/0x120 ? kthread_unuse_mm+0x90/0x90 ret_from_fork+0x30/0x40 ? kthread_unuse_mm+0x90/0x90 ret_from_fork_asm+0x11/0x20 ---[ end trace 0000000000000000 ]--- presumably due to max_zone_append_sectors not being cleared to zero, resulting in blk_validate_zoned_limits() complaining and failing. This reverts commit 2a8f6153e1c2db06a537a5c9d61102eb591776f1. Signed-off-by: Jens Axboe --- block/blk-core.c | 2 +- block/blk-merge.c | 3 ++- block/blk-settings.c | 25 +++++++++++++------------ block/blk-sysfs.c | 17 ++++++++++++++--- drivers/block/null_blk/zoned.c | 2 +- drivers/block/ublk_drv.c | 2 +- drivers/block/virtio_blk.c | 2 +- drivers/md/dm-zone.c | 4 ++-- drivers/nvme/host/multipath.c | 2 +- drivers/nvme/host/zns.c | 2 +- drivers/scsi/sd_zbc.c | 2 ++ include/linux/blkdev.h | 21 ++++++++++++++++++--- 12 files changed, 57 insertions(+), 27 deletions(-) diff --git a/block/blk-core.c b/block/blk-core.c index 5df4607321ca..09d10bb95fda 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -607,7 +607,7 @@ static inline blk_status_t blk_check_zone_append(struct request_queue *q, return BLK_STS_IOERR; /* Make sure the BIO is small enough and will not get split */ - if (nr_sectors > q->limits.max_zone_append_sectors) + if (nr_sectors > queue_max_zone_append_sectors(q)) return BLK_STS_IOERR; bio->bi_opf |= REQ_NOMERGE; diff --git a/block/blk-merge.c b/block/blk-merge.c index 7c1375a080ad..d813d799cee7 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -388,10 +388,11 @@ struct bio *bio_split_rw(struct bio *bio, const struct queue_limits *lim, struct bio *bio_split_zone_append(struct bio *bio, const struct queue_limits *lim, unsigned *nr_segs) { + unsigned int max_sectors = queue_limits_max_zone_append_sectors(lim); int split_sectors; split_sectors = bio_split_rw_at(bio, lim, nr_segs, - lim->max_zone_append_sectors << SECTOR_SHIFT); + max_sectors << SECTOR_SHIFT); if (WARN_ON_ONCE(split_sectors > 0)) split_sectors = -EINVAL; return bio_submit_split(bio, split_sectors); diff --git a/block/blk-settings.c b/block/blk-settings.c index 5cb69d85af0e..5ee3d6d1448d 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c @@ -91,16 +91,17 @@ static int 
blk_validate_zoned_limits(struct queue_limits *lim) if (lim->zone_write_granularity < lim->logical_block_size) lim->zone_write_granularity = lim->logical_block_size; - /* - * The Zone Append size is limited by the maximum I/O size and the zone - * size given that it can't span zones. - * - * If no max_hw_zone_append_sectors limit is provided, the block layer - * will emulated it, else we're also bound by the hardware limit. - */ - lim->max_zone_append_sectors = - min_not_zero(lim->max_hw_zone_append_sectors, - min(lim->chunk_sectors, lim->max_hw_sectors)); + if (lim->max_zone_append_sectors) { + /* + * The Zone Append size is limited by the maximum I/O size + * and the zone size given that it can't span zones. + */ + lim->max_zone_append_sectors = + min3(lim->max_hw_sectors, + lim->max_zone_append_sectors, + lim->chunk_sectors); + } + return 0; } @@ -526,8 +527,8 @@ int blk_stack_limits(struct queue_limits *t, struct queue_limits *b, t->max_dev_sectors = min_not_zero(t->max_dev_sectors, b->max_dev_sectors); t->max_write_zeroes_sectors = min(t->max_write_zeroes_sectors, b->max_write_zeroes_sectors); - t->max_hw_zone_append_sectors = min(t->max_hw_zone_append_sectors, - b->max_hw_zone_append_sectors); + t->max_zone_append_sectors = min(queue_limits_max_zone_append_sectors(t), + queue_limits_max_zone_append_sectors(b)); t->seg_boundary_mask = min_not_zero(t->seg_boundary_mask, b->seg_boundary_mask); diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index d9f22122ae2f..741b95dfdbf6 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -131,7 +131,6 @@ QUEUE_SYSFS_LIMIT_SHOW_SECTORS_TO_BYTES(max_hw_discard_sectors) QUEUE_SYSFS_LIMIT_SHOW_SECTORS_TO_BYTES(max_write_zeroes_sectors) QUEUE_SYSFS_LIMIT_SHOW_SECTORS_TO_BYTES(atomic_write_max_sectors) QUEUE_SYSFS_LIMIT_SHOW_SECTORS_TO_BYTES(atomic_write_boundary_sectors) -QUEUE_SYSFS_LIMIT_SHOW_SECTORS_TO_BYTES(max_zone_append_sectors) #define QUEUE_SYSFS_LIMIT_SHOW_SECTORS_TO_KB(_field) \ static ssize_t queue_##_field##_show(struct gendisk *disk, char *page) \ @@ -179,6 +178,18 @@ static ssize_t queue_max_discard_sectors_store(struct gendisk *disk, return ret; } +/* + * For zone append queue_max_zone_append_sectors does not just return the + * underlying queue limits, but actually contains a calculation. Because of + * that we can't simply use QUEUE_SYSFS_LIMIT_SHOW_SECTORS_TO_BYTES here. 
+ */ +static ssize_t queue_zone_append_max_show(struct gendisk *disk, char *page) +{ + return sprintf(page, "%llu\n", + (u64)queue_max_zone_append_sectors(disk->queue) << + SECTOR_SHIFT); +} + static ssize_t queue_max_sectors_store(struct gendisk *disk, const char *page, size_t count) { @@ -468,7 +479,7 @@ QUEUE_RO_ENTRY(queue_atomic_write_unit_min, "atomic_write_unit_min_bytes"); QUEUE_RO_ENTRY(queue_write_same_max, "write_same_max_bytes"); QUEUE_RO_ENTRY(queue_max_write_zeroes_sectors, "write_zeroes_max_bytes"); -QUEUE_RO_ENTRY(queue_max_zone_append_sectors, "zone_append_max_bytes"); +QUEUE_RO_ENTRY(queue_zone_append_max, "zone_append_max_bytes"); QUEUE_RO_ENTRY(queue_zone_write_granularity, "zone_write_granularity"); QUEUE_RO_ENTRY(queue_zoned, "zoned"); @@ -596,7 +607,7 @@ static struct attribute *queue_attrs[] = { &queue_atomic_write_unit_max_entry.attr, &queue_write_same_max_entry.attr, &queue_max_write_zeroes_sectors_entry.attr, - &queue_max_zone_append_sectors_entry.attr, + &queue_zone_append_max_entry.attr, &queue_zone_write_granularity_entry.attr, &queue_rotational_entry.attr, &queue_zoned_entry.attr, diff --git a/drivers/block/null_blk/zoned.c b/drivers/block/null_blk/zoned.c index 0d5f9bf95229..9bc768b2ca56 100644 --- a/drivers/block/null_blk/zoned.c +++ b/drivers/block/null_blk/zoned.c @@ -166,7 +166,7 @@ int null_init_zoned_dev(struct nullb_device *dev, lim->features |= BLK_FEAT_ZONED; lim->chunk_sectors = dev->zone_size_sects; - lim->max_hw_zone_append_sectors = dev->zone_append_max_sectors; + lim->max_zone_append_sectors = dev->zone_append_max_sectors; lim->max_open_zones = dev->zone_max_open; lim->max_active_zones = dev->zone_max_active; return 0; diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c index 8d938b2b41ee..59951e7c2593 100644 --- a/drivers/block/ublk_drv.c +++ b/drivers/block/ublk_drv.c @@ -2270,7 +2270,7 @@ static int ublk_ctrl_start_dev(struct ublk_device *ub, struct io_uring_cmd *cmd) lim.features |= BLK_FEAT_ZONED; lim.max_active_zones = p->max_active_zones; lim.max_open_zones = p->max_open_zones; - lim.max_hw_zone_append_sectors = p->max_zone_append_sectors; + lim.max_zone_append_sectors = p->max_zone_append_sectors; } if (ub->params.basic.attrs & UBLK_ATTR_VOLATILE_CACHE) { diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c index 0e99a4714928..194417abc105 100644 --- a/drivers/block/virtio_blk.c +++ b/drivers/block/virtio_blk.c @@ -784,7 +784,7 @@ static int virtblk_read_zoned_limits(struct virtio_blk *vblk, wg, v); return -ENODEV; } - lim->max_hw_zone_append_sectors = v; + lim->max_zone_append_sectors = v; dev_dbg(&vdev->dev, "max append sectors = %u\n", v); return 0; diff --git a/drivers/md/dm-zone.c b/drivers/md/dm-zone.c index 20edd3fabbab..c0d41c36e06e 100644 --- a/drivers/md/dm-zone.c +++ b/drivers/md/dm-zone.c @@ -344,7 +344,7 @@ int dm_set_zones_restrictions(struct dm_table *t, struct request_queue *q, clear_bit(DMF_EMULATE_ZONE_APPEND, &md->flags); } else { set_bit(DMF_EMULATE_ZONE_APPEND, &md->flags); - lim->max_hw_zone_append_sectors = 0; + lim->max_zone_append_sectors = 0; } /* @@ -379,7 +379,7 @@ int dm_set_zones_restrictions(struct dm_table *t, struct request_queue *q, if (!zlim.mapped_nr_seq_zones) { lim->max_open_zones = 0; lim->max_active_zones = 0; - lim->max_hw_zone_append_sectors = 0; + lim->max_zone_append_sectors = 0; lim->zone_write_granularity = 0; lim->chunk_sectors = 0; lim->features &= ~BLK_FEAT_ZONED; diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c index 
c26cb7d3a2e5..6a15873055b9 100644 --- a/drivers/nvme/host/multipath.c +++ b/drivers/nvme/host/multipath.c @@ -636,7 +636,7 @@ int nvme_mpath_alloc_disk(struct nvme_ctrl *ctrl, struct nvme_ns_head *head) if (head->ids.csi == NVME_CSI_ZNS) lim.features |= BLK_FEAT_ZONED; else - lim.max_hw_zone_append_sectors = 0; + lim.max_zone_append_sectors = 0; head->disk = blk_alloc_disk(&lim, ctrl->numa_node); if (IS_ERR(head->disk)) diff --git a/drivers/nvme/host/zns.c b/drivers/nvme/host/zns.c index 382949e18c6a..9a06f9d98cd6 100644 --- a/drivers/nvme/host/zns.c +++ b/drivers/nvme/host/zns.c @@ -111,7 +111,7 @@ void nvme_update_zone_info(struct nvme_ns *ns, struct queue_limits *lim, lim->features |= BLK_FEAT_ZONED; lim->max_open_zones = zi->max_open_zones; lim->max_active_zones = zi->max_active_zones; - lim->max_hw_zone_append_sectors = ns->ctrl->max_zone_append; + lim->max_zone_append_sectors = ns->ctrl->max_zone_append; lim->chunk_sectors = ns->head->zsze = nvme_lba_to_sect(ns->head, zi->zone_size); } diff --git a/drivers/scsi/sd_zbc.c b/drivers/scsi/sd_zbc.c index de5c54c057ec..ee2b74238758 100644 --- a/drivers/scsi/sd_zbc.c +++ b/drivers/scsi/sd_zbc.c @@ -634,6 +634,8 @@ int sd_zbc_read_zones(struct scsi_disk *sdkp, struct queue_limits *lim, lim->max_open_zones = sdkp->zones_max_open; lim->max_active_zones = 0; lim->chunk_sectors = logical_to_sectors(sdkp->device, zone_blocks); + /* Enable block layer zone append emulation */ + lim->max_zone_append_sectors = 0; return 0; diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 6d1413bd69a5..7bfc877e159e 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -375,7 +375,6 @@ struct queue_limits { unsigned int max_user_discard_sectors; unsigned int max_secure_erase_sectors; unsigned int max_write_zeroes_sectors; - unsigned int max_hw_zone_append_sectors; unsigned int max_zone_append_sectors; unsigned int discard_granularity; unsigned int discard_alignment; @@ -1205,9 +1204,25 @@ static inline unsigned int queue_max_segment_size(const struct request_queue *q) return q->limits.max_segment_size; } +static inline unsigned int +queue_limits_max_zone_append_sectors(const struct queue_limits *l) +{ + unsigned int max_sectors = min(l->chunk_sectors, l->max_hw_sectors); + + return min_not_zero(l->max_zone_append_sectors, max_sectors); +} + +static inline unsigned int queue_max_zone_append_sectors(struct request_queue *q) +{ + if (!blk_queue_is_zoned(q)) + return 0; + + return queue_limits_max_zone_append_sectors(&q->limits); +} + static inline bool queue_emulates_zone_append(struct request_queue *q) { - return blk_queue_is_zoned(q) && !q->limits.max_hw_zone_append_sectors; + return blk_queue_is_zoned(q) && !q->limits.max_zone_append_sectors; } static inline bool bdev_emulates_zone_append(struct block_device *bdev) @@ -1218,7 +1233,7 @@ static inline bool bdev_emulates_zone_append(struct block_device *bdev) static inline unsigned int bdev_max_zone_append_sectors(struct block_device *bdev) { - return bdev_limits(bdev)->max_zone_append_sectors; + return queue_max_zone_append_sectors(bdev_get_queue(bdev)); } static inline unsigned int bdev_max_segments(struct block_device *bdev) -- 2.51.0 From 4122fef16b172f7c1838fcf74340268c86ed96db Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Thu, 7 Nov 2024 15:54:38 +0900 Subject: [PATCH 15/16] block: Switch to using refcount_t for zone write plugs Replace the raw atomic_t reference counting of zone write plugs with a refcount_t. No functional changes. 
Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202411050650.ilIZa8S7-lkp@intel.com/ Signed-off-by: Damien Le Moal Reviewed-by: Christoph Hellwig Link: https://lore.kernel.org/r/20241107065438.236348-1-dlemoal@kernel.org Signed-off-by: Jens Axboe --- block/blk-zoned.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/block/blk-zoned.c b/block/blk-zoned.c index a287577d1ad6..f7e151f665c7 100644 --- a/block/blk-zoned.c +++ b/block/blk-zoned.c @@ -18,7 +18,7 @@ #include #include #include -#include +#include #include #include "blk.h" @@ -64,7 +64,7 @@ static const char *const zone_cond_name[] = { struct blk_zone_wplug { struct hlist_node node; struct list_head link; - atomic_t ref; + refcount_t ref; spinlock_t lock; unsigned int flags; unsigned int zone_no; @@ -411,7 +411,7 @@ static struct blk_zone_wplug *disk_get_zone_wplug(struct gendisk *disk, hlist_for_each_entry_rcu(zwplug, &disk->zone_wplugs_hash[idx], node) { if (zwplug->zone_no == zno && - atomic_inc_not_zero(&zwplug->ref)) { + refcount_inc_not_zero(&zwplug->ref)) { rcu_read_unlock(); return zwplug; } @@ -432,7 +432,7 @@ static void disk_free_zone_wplug_rcu(struct rcu_head *rcu_head) static inline void disk_put_zone_wplug(struct blk_zone_wplug *zwplug) { - if (atomic_dec_and_test(&zwplug->ref)) { + if (refcount_dec_and_test(&zwplug->ref)) { WARN_ON_ONCE(!bio_list_empty(&zwplug->bio_list)); WARN_ON_ONCE(!list_empty(&zwplug->link)); WARN_ON_ONCE(!(zwplug->flags & BLK_ZONE_WPLUG_UNHASHED)); @@ -463,7 +463,7 @@ static inline bool disk_should_remove_zone_wplug(struct gendisk *disk, * taken when the plug was allocated and another reference taken by the * caller context). */ - if (atomic_read(&zwplug->ref) > 2) + if (refcount_read(&zwplug->ref) > 2) return false; /* We can remove zone write plugs for zones that are empty or full. */ @@ -533,7 +533,7 @@ again: INIT_HLIST_NODE(&zwplug->node); INIT_LIST_HEAD(&zwplug->link); - atomic_set(&zwplug->ref, 2); + refcount_set(&zwplug->ref, 2); spin_lock_init(&zwplug->lock); zwplug->flags = 0; zwplug->zone_no = zno; @@ -624,7 +624,7 @@ static inline void disk_zone_wplug_set_error(struct gendisk *disk, * finished. */ zwplug->flags |= BLK_ZONE_WPLUG_ERROR; - atomic_inc(&zwplug->ref); + refcount_inc(&zwplug->ref); spin_lock_irqsave(&disk->zone_wplugs_lock, flags); list_add_tail(&zwplug->link, &disk->zone_wplugs_err_list); @@ -1099,7 +1099,7 @@ static void disk_zone_wplug_schedule_bio_work(struct gendisk *disk, * reference we take here. 
*/ WARN_ON_ONCE(!(zwplug->flags & BLK_ZONE_WPLUG_PLUGGED)); - atomic_inc(&zwplug->ref); + refcount_inc(&zwplug->ref); queue_work(disk->zone_wplugs_wq, &zwplug->bio_work); } @@ -1444,7 +1444,7 @@ static void disk_destroy_zone_wplugs_hash_table(struct gendisk *disk) while (!hlist_empty(&disk->zone_wplugs_hash[i])) { zwplug = hlist_entry(disk->zone_wplugs_hash[i].first, struct blk_zone_wplug, node); - atomic_inc(&zwplug->ref); + refcount_inc(&zwplug->ref); disk_remove_zone_wplug(disk, zwplug); disk_put_zone_wplug(zwplug); } @@ -1845,7 +1845,7 @@ int queue_zone_wplugs_show(void *data, struct seq_file *m) spin_lock_irqsave(&zwplug->lock, flags); zwp_zone_no = zwplug->zone_no; zwp_flags = zwplug->flags; - zwp_ref = atomic_read(&zwplug->ref); + zwp_ref = refcount_read(&zwplug->ref); zwp_wp_offset = zwplug->wp_offset; zwp_bio_list_size = bio_list_size(&zwplug->bio_list); spin_unlock_irqrestore(&zwplug->lock, flags); -- 2.51.0 From 8e71afb94d6ed59055b67dadbc423c70104f21a9 Mon Sep 17 00:00:00 2001 From: zhangguopeng Date: Thu, 7 Nov 2024 18:42:58 +0800 Subject: [PATCH 16/16] block: Replace sprintf() with sysfs_emit() Per Documentation/filesystems/sysfs.rst, show() should only use sysfs_emit() or sysfs_emit_at() when formatting the value to be returned to user space. No functional change intended. Signed-off-by: zhangguopeng Suggested-by: Christoph Hellwig Reviewed-by: Christoph Hellwig Link: https://lore.kernel.org/r/20241107104258.29742-1-zhangguopeng@kylinos.cn Signed-off-by: Jens Axboe --- block/blk-sysfs.c | 24 ++++++++++++------------ block/genhd.c | 30 +++++++++++++++--------------- 2 files changed, 27 insertions(+), 27 deletions(-) diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index 741b95dfdbf6..0ef4e13e247d 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -30,7 +30,7 @@ struct queue_sysfs_entry { static ssize_t queue_var_show(unsigned long var, char *page) { - return sprintf(page, "%lu\n", var); + return sysfs_emit(page, "%lu\n", var); } static ssize_t @@ -121,7 +121,7 @@ QUEUE_SYSFS_LIMIT_SHOW(atomic_write_unit_max) #define QUEUE_SYSFS_LIMIT_SHOW_SECTORS_TO_BYTES(_field) \ static ssize_t queue_##_field##_show(struct gendisk *disk, char *page) \ { \ - return sprintf(page, "%llu\n", \ + return sysfs_emit(page, "%llu\n", \ (unsigned long long)disk->queue->limits._field << \ SECTOR_SHIFT); \ } @@ -144,7 +144,7 @@ QUEUE_SYSFS_LIMIT_SHOW_SECTORS_TO_KB(max_hw_sectors) #define QUEUE_SYSFS_SHOW_CONST(_name, _val) \ static ssize_t queue_##_name##_show(struct gendisk *disk, char *page) \ { \ - return sprintf(page, "%d\n", _val); \ + return sysfs_emit(page, "%d\n", _val); \ } /* deprecated fields */ @@ -235,7 +235,7 @@ static ssize_t queue_feature_store(struct gendisk *disk, const char *page, #define QUEUE_SYSFS_FEATURE(_name, _feature) \ static ssize_t queue_##_name##_show(struct gendisk *disk, char *page) \ { \ - return sprintf(page, "%u\n", \ + return sysfs_emit(page, "%u\n", \ !!(disk->queue->limits.features & _feature)); \ } \ static ssize_t queue_##_name##_store(struct gendisk *disk, \ @@ -252,7 +252,7 @@ QUEUE_SYSFS_FEATURE(stable_writes, BLK_FEAT_STABLE_WRITES); #define QUEUE_SYSFS_FEATURE_SHOW(_name, _feature) \ static ssize_t queue_##_name##_show(struct gendisk *disk, char *page) \ { \ - return sprintf(page, "%u\n", \ + return sysfs_emit(page, "%u\n", \ !!(disk->queue->limits.features & _feature)); \ } @@ -263,8 +263,8 @@ QUEUE_SYSFS_FEATURE_SHOW(dax, BLK_FEAT_DAX); static ssize_t queue_zoned_show(struct gendisk *disk, char *page) { if (blk_queue_is_zoned(disk->queue)) - return 
sprintf(page, "host-managed\n"); - return sprintf(page, "none\n"); + return sysfs_emit(page, "host-managed\n"); + return sysfs_emit(page, "none\n"); } static ssize_t queue_nr_zones_show(struct gendisk *disk, char *page) @@ -377,7 +377,7 @@ static ssize_t queue_poll_store(struct gendisk *disk, const char *page, static ssize_t queue_io_timeout_show(struct gendisk *disk, char *page) { - return sprintf(page, "%u\n", jiffies_to_msecs(disk->queue->rq_timeout)); + return sysfs_emit(page, "%u\n", jiffies_to_msecs(disk->queue->rq_timeout)); } static ssize_t queue_io_timeout_store(struct gendisk *disk, const char *page, @@ -398,8 +398,8 @@ static ssize_t queue_io_timeout_store(struct gendisk *disk, const char *page, static ssize_t queue_wc_show(struct gendisk *disk, char *page) { if (blk_queue_write_cache(disk->queue)) - return sprintf(page, "write back\n"); - return sprintf(page, "write through\n"); + return sysfs_emit(page, "write back\n"); + return sysfs_emit(page, "write through\n"); } static ssize_t queue_wc_store(struct gendisk *disk, const char *page, @@ -530,9 +530,9 @@ static ssize_t queue_wb_lat_show(struct gendisk *disk, char *page) return -EINVAL; if (wbt_disabled(disk->queue)) - return sprintf(page, "0\n"); + return sysfs_emit(page, "0\n"); - return sprintf(page, "%llu\n", + return sysfs_emit(page, "%llu\n", div_u64(wbt_get_min_lat(disk->queue), 1000)); } diff --git a/block/genhd.c b/block/genhd.c index dfee66146bd1..1971c91d6f72 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -783,7 +783,7 @@ static ssize_t disk_badblocks_show(struct device *dev, struct gendisk *disk = dev_to_disk(dev); if (!disk->bb) - return sprintf(page, "\n"); + return sysfs_emit(page, "\n"); return badblocks_show(disk->bb, page, 0); } @@ -931,7 +931,7 @@ static ssize_t disk_range_show(struct device *dev, { struct gendisk *disk = dev_to_disk(dev); - return sprintf(buf, "%d\n", disk->minors); + return sysfs_emit(buf, "%d\n", disk->minors); } static ssize_t disk_ext_range_show(struct device *dev, @@ -939,7 +939,7 @@ static ssize_t disk_ext_range_show(struct device *dev, { struct gendisk *disk = dev_to_disk(dev); - return sprintf(buf, "%d\n", + return sysfs_emit(buf, "%d\n", (disk->flags & GENHD_FL_NO_PART) ? 1 : DISK_MAX_PARTS); } @@ -948,7 +948,7 @@ static ssize_t disk_removable_show(struct device *dev, { struct gendisk *disk = dev_to_disk(dev); - return sprintf(buf, "%d\n", + return sysfs_emit(buf, "%d\n", (disk->flags & GENHD_FL_REMOVABLE ? 1 : 0)); } @@ -957,7 +957,7 @@ static ssize_t disk_hidden_show(struct device *dev, { struct gendisk *disk = dev_to_disk(dev); - return sprintf(buf, "%d\n", + return sysfs_emit(buf, "%d\n", (disk->flags & GENHD_FL_HIDDEN ? 1 : 0)); } @@ -966,13 +966,13 @@ static ssize_t disk_ro_show(struct device *dev, { struct gendisk *disk = dev_to_disk(dev); - return sprintf(buf, "%d\n", get_disk_ro(disk) ? 1 : 0); + return sysfs_emit(buf, "%d\n", get_disk_ro(disk) ? 
1 : 0); } ssize_t part_size_show(struct device *dev, struct device_attribute *attr, char *buf) { - return sprintf(buf, "%llu\n", bdev_nr_sectors(dev_to_bdev(dev))); + return sysfs_emit(buf, "%llu\n", bdev_nr_sectors(dev_to_bdev(dev))); } ssize_t part_stat_show(struct device *dev, @@ -989,7 +989,7 @@ ssize_t part_stat_show(struct device *dev, part_stat_unlock(); } part_stat_read_all(bdev, &stat); - return sprintf(buf, + return sysfs_emit(buf, "%8lu %8lu %8llu %8u " "%8lu %8lu %8llu %8u " "%8u %8u %8u " @@ -1031,14 +1031,14 @@ ssize_t part_inflight_show(struct device *dev, struct device_attribute *attr, else part_in_flight_rw(bdev, inflight); - return sprintf(buf, "%8u %8u\n", inflight[0], inflight[1]); + return sysfs_emit(buf, "%8u %8u\n", inflight[0], inflight[1]); } static ssize_t disk_capability_show(struct device *dev, struct device_attribute *attr, char *buf) { dev_warn_once(dev, "the capability attribute has been deprecated.\n"); - return sprintf(buf, "0\n"); + return sysfs_emit(buf, "0\n"); } static ssize_t disk_alignment_offset_show(struct device *dev, @@ -1047,7 +1047,7 @@ static ssize_t disk_alignment_offset_show(struct device *dev, { struct gendisk *disk = dev_to_disk(dev); - return sprintf(buf, "%d\n", bdev_alignment_offset(disk->part0)); + return sysfs_emit(buf, "%d\n", bdev_alignment_offset(disk->part0)); } static ssize_t disk_discard_alignment_show(struct device *dev, @@ -1056,7 +1056,7 @@ static ssize_t disk_discard_alignment_show(struct device *dev, { struct gendisk *disk = dev_to_disk(dev); - return sprintf(buf, "%d\n", bdev_alignment_offset(disk->part0)); + return sysfs_emit(buf, "%d\n", bdev_alignment_offset(disk->part0)); } static ssize_t diskseq_show(struct device *dev, @@ -1064,13 +1064,13 @@ static ssize_t diskseq_show(struct device *dev, { struct gendisk *disk = dev_to_disk(dev); - return sprintf(buf, "%llu\n", disk->diskseq); + return sysfs_emit(buf, "%llu\n", disk->diskseq); } static ssize_t partscan_show(struct device *dev, struct device_attribute *attr, char *buf) { - return sprintf(buf, "%u\n", disk_has_partscan(dev_to_disk(dev))); + return sysfs_emit(buf, "%u\n", disk_has_partscan(dev_to_disk(dev))); } static DEVICE_ATTR(range, 0444, disk_range_show, NULL); @@ -1092,7 +1092,7 @@ static DEVICE_ATTR(partscan, 0444, partscan_show, NULL); ssize_t part_fail_show(struct device *dev, struct device_attribute *attr, char *buf) { - return sprintf(buf, "%d\n", + return sysfs_emit(buf, "%d\n", bdev_test_flag(dev_to_bdev(dev), BD_MAKE_IT_FAIL)); } -- 2.51.0