From 357e1b7f730bd85a383e7afa75a3caba329c5707 Mon Sep 17 00:00:00 2001
From: Ming Lei
Date: Thu, 31 Oct 2024 21:37:20 +0800
Subject: [PATCH 01/16] block: don't verify IO lock for freeze/unfreeze in elevator_init_mq()

elevator_init_mq() is only called at the entry of add_disk_fwnode(), when disk IO isn't allowed yet. So don't verify the io lock (q->io_lockdep_map) for freeze & unfreeze in elevator_init_mq().

Reported-by: Marek Szyprowski
Reported-by: Lai Yi
Fixes: f1be1788a32e ("block: model freeze & enter queue as lock for supporting lockdep")
Signed-off-by: Ming Lei
Link: https://lore.kernel.org/r/20241031133723.303835-5-ming.lei@redhat.com
Signed-off-by: Jens Axboe
---
 block/elevator.c | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/block/elevator.c b/block/elevator.c
index f169f4bae917..7c3ba80e5ff4 100644
--- a/block/elevator.c
+++ b/block/elevator.c
@@ -598,13 +598,19 @@ void elevator_init_mq(struct request_queue *q)
 	 * drain any dispatch activities originated from passthrough
 	 * requests, then no need to quiesce queue which may add long boot
 	 * latency, especially when lots of disks are involved.
+	 *
+	 * The disk isn't added yet, so verify the queue lock manually.
 	 */
-	blk_mq_freeze_queue(q);
+	blk_freeze_queue_start_non_owner(q);
+	blk_freeze_acquire_lock(q, true, false);
+	blk_mq_freeze_queue_wait(q);
+	blk_mq_cancel_work_sync(q);

 	err = blk_mq_init_sched(q, e);
-	blk_mq_unfreeze_queue(q);
+	blk_unfreeze_release_lock(q, true, false);
+	blk_mq_unfreeze_queue_non_owner(q);

 	if (err) {
 		pr_warn("\"%s\" elevator initialization failed, "
-- 
2.51.0

From fa1944bbe6220eb929e2c02e5e8706b908565711 Mon Sep 17 00:00:00 2001
From: Xiao Ni
Date: Wed, 6 Nov 2024 17:51:24 +0800
Subject: [PATCH 02/16] md/raid5: Wait sync io to finish before changing group cnt

A customer reported a bug: raid5 hangs when the thread cnt is changed while resync is running. The stripes are all in conf->handle_list and the new threads can't handle them.

Commit b39f35ebe86d ("md: don't quiesce in mddev_suspend()") removed pers->quiesce from mddev_suspend/resume. Before that commit, mddev_suspend waited for all ios, including sync io, to finish. Now it only waits for normal io. Fix this by calling raid5_quiesce from raid5_store_group_thread_cnt directly, to wait for all sync requests to finish before changing the group cnt.

Fixes: b39f35ebe86d ("md: don't quiesce in mddev_suspend()")
Cc: stable@vger.kernel.org
Signed-off-by: Xiao Ni
Reviewed-by: Yu Kuai
Link: https://lore.kernel.org/r/20241106095124.74577-1-xni@redhat.com
Signed-off-by: Song Liu
---
 drivers/md/raid5.c | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index f5ac81dd21b2..f09e7677ee9f 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -7176,6 +7176,8 @@ raid5_store_group_thread_cnt(struct mddev *mddev, const char *page, size_t len)
 	err = mddev_suspend_and_lock(mddev);
 	if (err)
 		return err;
+	raid5_quiesce(mddev, true);
+
 	conf = mddev->private;
 	if (!conf)
 		err = -ENODEV;
@@ -7197,6 +7199,8 @@ raid5_store_group_thread_cnt(struct mddev *mddev, const char *page, size_t len)
 			kfree(old_groups);
 		}
 	}
+
+	raid5_quiesce(mddev, false);
 	mddev_unlock_and_resume(mddev);

 	return err ?: len;
-- 
2.51.0

From c13c2d2a4b52eacab1c093e5b993c0a6f82c438e Mon Sep 17 00:00:00 2001
From: Song Liu
Date: Thu, 7 Nov 2024 17:41:12 -0800
Subject: [PATCH 03/16] MAINTAINERS: Make Yu Kuai co-maintainer of md/raid subsystem

In the past couple of years, Yu Kuai has been making solid contributions to the md/raid subsystem.
Make Yu Kuai a co-maintainer.

Reviewed-by: Yu Kuai
Link: https://lore.kernel.org/r/20241108014112.2098079-1-song@kernel.org
Signed-off-by: Song Liu
---
 MAINTAINERS | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/MAINTAINERS b/MAINTAINERS
index e9659a5a7fb3..eeaa9f59dfe3 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -21303,7 +21303,7 @@ F: include/linux/property.h

 SOFTWARE RAID (Multiple Disks) SUPPORT
 M: Song Liu
-R: Yu Kuai
+M: Yu Kuai
 L: linux-raid@vger.kernel.org
 S: Supported
 Q: https://patchwork.kernel.org/project/linux-raid/list/
-- 
2.51.0

From 8e604cac499248c75ad3a26695a743ff879ded5c Mon Sep 17 00:00:00 2001
From: Li Wang
Date: Sat, 9 Nov 2024 10:27:44 +0800
Subject: [PATCH 04/16] loop: fix type of block size

PAGE_SIZE may be 64K, and the max block size can be PAGE_SIZE, so any variable for holding the block size can't be defined as 'unsigned short'.

Unfortunately commit 473516b36193 ("loop: use the atomic queue limits update API") passes 'bsize' with the type 'unsigned short' to loop_reconfigure_limits(), which causes an LTP/ioctl_loop06 test failure:

  12 ioctl_loop06.c:76: TINFO: Using LOOP_SET_BLOCK_SIZE with arg > PAGE_SIZE
  13 ioctl_loop06.c:59: TFAIL: Set block size succeed unexpectedly
  ...
  18 ioctl_loop06.c:76: TINFO: Using LOOP_CONFIGURE with block_size > PAGE_SIZE
  19 ioctl_loop06.c:59: TFAIL: Set block size succeed unexpectedly

Fix the issue by defining the block size variables as 'unsigned int', which is aligned with the block layer's definition.

(improve commit log & add fixes tag)

Fixes: 473516b36193 ("loop: use the atomic queue limits update API")
Cc: John Garry
Cc: Stefan Hajnoczi
Cc: Christoph Hellwig
Reviewed-by: Damien Le Moal
Reviewed-by: Jan Stancek
Signed-off-by: Li Wang
Signed-off-by: Ming Lei
Reviewed-by: John Garry
Link: https://lore.kernel.org/r/20241109022744.1126003-1-ming.lei@redhat.com
Signed-off-by: Jens Axboe
---
 drivers/block/loop.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index f21f4254b038..fe9bb4fb5f1b 100644
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -173,7 +173,7 @@ static loff_t get_loop_size(struct loop_device *lo, struct file *file)
 static bool lo_bdev_can_use_dio(struct loop_device *lo,
 		struct block_device *backing_bdev)
 {
-	unsigned short sb_bsize = bdev_logical_block_size(backing_bdev);
+	unsigned int sb_bsize = bdev_logical_block_size(backing_bdev);

 	if (queue_logical_block_size(lo->lo_queue) < sb_bsize)
 		return false;
@@ -976,7 +976,7 @@ loop_set_status_from_info(struct loop_device *lo,
 	return 0;
 }

-static unsigned short loop_default_blocksize(struct loop_device *lo,
+static unsigned int loop_default_blocksize(struct loop_device *lo,
 		struct block_device *backing_bdev)
 {
 	/* In case of direct I/O, match underlying block size */
@@ -985,7 +985,7 @@ static unsigned short loop_default_blocksize(struct loop_device *lo,
 	return SECTOR_SIZE;
 }

-static int loop_reconfigure_limits(struct loop_device *lo, unsigned short bsize)
+static int loop_reconfigure_limits(struct loop_device *lo, unsigned int bsize)
 {
 	struct file *file = lo->lo_backing_file;
 	struct inode *inode = file->f_mapping->host;
-- 
2.51.0

From 7f5435b2a5ce6de8f53b65900a0e57a158e2e027 Mon Sep 17 00:00:00 2001
From: Miroslav Franc
Date: Fri, 8 Nov 2024 14:39:12 +0100
Subject: [PATCH 05/16] s390/dasd: fix redundant /proc/dasd* entries removal

In case of an early failure in dasd_init, dasd_proc_init is never called and the /proc/dasd* files are never created.
That can happen, for example, if an incompatible or incorrect argument is provided to the dasd_mod.dasd= kernel parameter.

However, the attempted removal of the /proc/dasd* files causes 8 warnings with backtraces in this case: 4 on the error path within dasd_init and 4 when the dasd module is unloaded. Notice the "removing permanent /proc entry 'devices'" message that is caused by the dasd_proc_exit function trying to remove /proc/devices instead of /proc/dasd/devices, since dasd_proc_root_entry is NULL and /proc/devices is indeed permanent.

Example:
------------[ cut here ]------------
removing permanent /proc entry 'devices'
WARNING: CPU: 6 PID: 557 at fs/proc/generic.c:701 remove_proc_entry+0x22e/0x240
CPU: 6 PID: 557 Comm: modprobe Not tainted 6.10.5-1-default #1 openSUSE Tumbleweed f6917bfd6e5a5c7a7e900e0e3b517786fb5c6301
Hardware name: QEMU 8561 QEMU (KVM/Linux)
Krnl PSW : 0704c00180000000 000003fffed0e9f2 (remove_proc_entry+0x232/0x240)
           R:0 T:1 IO:1 EX:1 Key:0 M:1 W:0 P:0 AS:3 CC:0 PM:0 RI:0 EA:3
Krnl GPRS: 000003ff00000027 000003ff00000023 0000000000000028 000002f200000000
           000002f3f05bec20 0000037ffecfb7d0 000003ffffdabab0 000003ff7ee4ec72
           000003ff7ee4ec72 0000000000000007 000002f280e22600 000002f280e22688
           000003ffa252cfa0 0000000000010000 000003fffed0e9ee 0000037ffecfba38
Krnl Code: 000003fffed0e9e2: c020004e7017 larl %r2,000003ffff6dca10
           000003fffed0e9e8: c0e5ffdfad24 brasl %r14,000003fffe904430
          #000003fffed0e9ee: af000000 mc 0,0
          >000003fffed0e9f2: a7f4ff4c brc 15,000003fffed0e88a
           000003fffed0e9f6: 0707 bcr 0,%r7
           000003fffed0e9f8: 0707 bcr 0,%r7
           000003fffed0e9fa: 0707 bcr 0,%r7
           000003fffed0e9fc: 0707 bcr 0,%r7
Call Trace:
 [<000003fffed0e9f2>] remove_proc_entry+0x232/0x240
 ([<000003fffed0e9ee>] remove_proc_entry+0x22e/0x240)
 [<000003ff7ef5a084>] dasd_proc_exit+0x34/0x60 [dasd_mod]
 [<000003ff7ef560c2>] dasd_exit+0x22/0xc0 [dasd_mod]
 [<000003ff7ee5a26e>] dasd_init+0x26e/0x280 [dasd_mod]
 [<000003fffe8ac9d0>] do_one_initcall+0x40/0x220
 [<000003fffe9bc758>] do_init_module+0x78/0x260
 [<000003fffe9bf3a6>] __do_sys_init_module+0x216/0x250
 [<000003ffff37ac9e>] __do_syscall+0x24e/0x2d0
 [<000003ffff38cca8>] system_call+0x70/0x98
Last Breaking-Event-Address:
 [<000003fffef7ea20>] __s390_indirect_jump_r14+0x0/0x10
---[ end trace 0000000000000000 ]---
------------[ cut here ]------------

While the cause is a user failure, the dasd module should handle the situation more gracefully. One of the simplest solutions is to make removal of the /proc/dasd* entries idempotent.
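The shape of that idea, as a sketch (the real change is in the diff below, which additionally resets the pointer on the dasd_proc_init error path):

	/* Sketch only: tear-down that tolerates a failed or skipped init. */
	void dasd_proc_exit(void)
	{
		/* Nothing to remove if dasd_proc_init() never ran or failed early. */
		if (!dasd_proc_root_entry)
			return;

		remove_proc_entry("devices", dasd_proc_root_entry);
		remove_proc_entry("statistics", dasd_proc_root_entry);
		remove_proc_entry("dasd", NULL);
		/* Make a repeated call a no-op instead of a WARN on /proc/devices. */
		dasd_proc_root_entry = NULL;
	}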
Signed-off-by: Miroslav Franc
[ sth: shortened if clause ]
Signed-off-by: Stefan Haberland
Link: https://lore.kernel.org/r/20241108133913.3068782-2-sth@linux.ibm.com
Signed-off-by: Jens Axboe
---
 drivers/s390/block/dasd_proc.c | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/drivers/s390/block/dasd_proc.c b/drivers/s390/block/dasd_proc.c
index 0faaa437d9be..48e12e81df00 100644
--- a/drivers/s390/block/dasd_proc.c
+++ b/drivers/s390/block/dasd_proc.c
@@ -350,6 +350,7 @@ dasd_proc_init(void)
 	remove_proc_entry("devices", dasd_proc_root_entry);
  out_nodevices:
 	remove_proc_entry("dasd", NULL);
+	dasd_proc_root_entry = NULL;
  out_nodasd:
 	return -ENOENT;
 }
@@ -357,7 +358,11 @@ dasd_proc_init(void)
 void
 dasd_proc_exit(void)
 {
+	if (!dasd_proc_root_entry)
+		return;
+
 	remove_proc_entry("devices", dasd_proc_root_entry);
 	remove_proc_entry("statistics", dasd_proc_root_entry);
 	remove_proc_entry("dasd", NULL);
+	dasd_proc_root_entry = NULL;
 }
-- 
2.51.0

From b2113edaa9afa1c405609b3dca18a9434d28b6c5 Mon Sep 17 00:00:00 2001
From: Yu Jiaoliang
Date: Fri, 8 Nov 2024 14:39:13 +0100
Subject: [PATCH 06/16] s390/dasd: Fix typo in comment

Fix typos in comments: requeust->request, Removve->Remove, notthing->nothing.

Signed-off-by: Yu Jiaoliang
Signed-off-by: Stefan Haberland
Link: https://lore.kernel.org/r/20241108133913.3068782-3-sth@linux.ibm.com
Signed-off-by: Jens Axboe
---
 drivers/s390/block/dasd.c        | 2 +-
 drivers/s390/block/dasd_devmap.c | 2 +-
 drivers/s390/block/dasd_eckd.c   | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/drivers/s390/block/dasd.c b/drivers/s390/block/dasd.c
index 42a4a996defb..3ed642f4f00d 100644
--- a/drivers/s390/block/dasd.c
+++ b/drivers/s390/block/dasd.c
@@ -2117,7 +2117,7 @@ int dasd_flush_device_queue(struct dasd_device *device)
 		case DASD_CQR_IN_IO:
 			rc = device->discipline->term_IO(cqr);
 			if (rc) {
-				/* unable to terminate requeust */
+				/* unable to terminate request */
 				dev_err(&device->cdev->dev,
 					"Flushing the DASD request queue failed\n");
 				/* stop flush processing */
diff --git a/drivers/s390/block/dasd_devmap.c b/drivers/s390/block/dasd_devmap.c
index 6adaeb985dde..71d8fb86139d 100644
--- a/drivers/s390/block/dasd_devmap.c
+++ b/drivers/s390/block/dasd_devmap.c
@@ -855,7 +855,7 @@ dasd_delete_device(struct dasd_device *device)
 	dev_set_drvdata(&device->cdev->dev, NULL);
 	spin_unlock_irqrestore(get_ccwdev_lock(device->cdev), flags);

-	/* Removve copy relation */
+	/* Remove copy relation */
 	dasd_devmap_delete_copy_relation_device(device);
 	/*
 	 * Drop ref_count by 3, one for the devmap reference, one for
diff --git a/drivers/s390/block/dasd_eckd.c b/drivers/s390/block/dasd_eckd.c
index 90b106408992..1ebe589b5185 100644
--- a/drivers/s390/block/dasd_eckd.c
+++ b/drivers/s390/block/dasd_eckd.c
@@ -2405,7 +2405,7 @@ static int dasd_eckd_end_analysis(struct dasd_block *block)
 	}

 	if (count_area != NULL && count_area->kl == 0) {
-		/* we found notthing violating our disk layout */
+		/* we found nothing violating our disk layout */
 		if (dasd_check_blocksize(count_area->dl) == 0)
 			block->bp_block = count_area->dl;
 	}
-- 
2.51.0

From d369735e02ef122d19d4c3d093028da0eb400636 Mon Sep 17 00:00:00 2001
From: Ming Lei
Date: Mon, 11 Nov 2024 19:07:18 +0800
Subject: [PATCH 07/16] ublk: fix ublk_ch_mmap() for 64K page size

In ublk_ch_mmap(), the queue id is calculated in the following way:

	(vma->vm_pgoff << PAGE_SHIFT) / `max_cmd_buf_size`

'max_cmd_buf_size' is equal to `UBLK_MAX_QUEUE_DEPTH * sizeof(struct ublksrv_io_desc)`, and UBLK_MAX_QUEUE_DEPTH is 4096 and part of the UAPI, so
'max_cmd_buf_size' is always page aligned on a 4K page size kernel. However, that isn't true on a 64K page size kernel.

Fix the issue by always rounding up 'max_cmd_buf_size' to PAGE_SIZE.

Cc: stable@vger.kernel.org
Fixes: 71f28f3136af ("ublk_drv: add io_uring based userspace block driver")
Signed-off-by: Ming Lei
Link: https://lore.kernel.org/r/20241111110718.1394001-1-ming.lei@redhat.com
Signed-off-by: Jens Axboe
---
 drivers/block/ublk_drv.c | 15 ++++++++++++---
 1 file changed, 12 insertions(+), 3 deletions(-)

diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c
index 59951e7c2593..4ae4fdb8bb7f 100644
--- a/drivers/block/ublk_drv.c
+++ b/drivers/block/ublk_drv.c
@@ -669,12 +669,21 @@ static inline char *ublk_queue_cmd_buf(struct ublk_device *ub, int q_id)
 	return ublk_get_queue(ub, q_id)->io_cmd_buf;
 }

+static inline int __ublk_queue_cmd_buf_size(int depth)
+{
+	return round_up(depth * sizeof(struct ublksrv_io_desc), PAGE_SIZE);
+}
+
 static inline int ublk_queue_cmd_buf_size(struct ublk_device *ub, int q_id)
 {
 	struct ublk_queue *ubq = ublk_get_queue(ub, q_id);

-	return round_up(ubq->q_depth * sizeof(struct ublksrv_io_desc),
-			PAGE_SIZE);
+	return __ublk_queue_cmd_buf_size(ubq->q_depth);
+}
+
+static int ublk_max_cmd_buf_size(void)
+{
+	return __ublk_queue_cmd_buf_size(UBLK_MAX_QUEUE_DEPTH);
 }

 /*
@@ -1358,7 +1367,7 @@ static int ublk_ch_mmap(struct file *filp, struct vm_area_struct *vma)
 {
 	struct ublk_device *ub = filp->private_data;
 	size_t sz = vma->vm_end - vma->vm_start;
-	unsigned max_sz = UBLK_MAX_QUEUE_DEPTH * sizeof(struct ublksrv_io_desc);
+	unsigned max_sz = ublk_max_cmd_buf_size();
 	unsigned long pfn, end, phys_off = vma->vm_pgoff << PAGE_SHIFT;
 	int q_id, ret = 0;
-- 
2.51.0

From e546fe1da9bd47a6fddce6b37c17b1aa1811f7d3 Mon Sep 17 00:00:00 2001
From: John Garry
Date: Mon, 11 Nov 2024 11:21:45 +0000
Subject: [PATCH 08/16] block: Rework bio_split() return value

Instead of returning an inconclusive value of NULL for an error in calling bio_split(), always return an ERR_PTR(). Also remove the BUG_ON() calls and use WARN_ON_ONCE() instead. Indeed, since almost all callers don't check the return code from bio_split(), we'll crash anyway (for those failures).

Fix up the only user which checks the bio_split() return code today (directly or indirectly), blk_crypto_fallback_split_bio_if_needed(). The md/bcache code does check the return code in cached_dev_cache_miss() -> bio_next_split() -> bio_split(), but only to see if there was a split, so there would be no change in behaviour here (when returning an ERR_PTR()).
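For illustration, the caller-side pattern this conversion enables looks like the following (sketch only, not code from this series):

	struct bio *split = bio_split(bio, sectors, GFP_NOIO, bs);

	if (IS_ERR(split)) {
		/*
		 * PTR_ERR() now says why: -EINVAL for bad input or an
		 * unsplittable bio, -ENOMEM for a failed clone.
		 */
		return PTR_ERR(split);
	}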
Reviewed-by: Christoph Hellwig
Reviewed-by: Johannes Thumshirn
Signed-off-by: John Garry
Link: https://lore.kernel.org/r/20241111112150.3756529-2-john.g.garry@oracle.com
Signed-off-by: Jens Axboe
---
 block/bio.c                 | 10 ++++++----
 block/blk-crypto-fallback.c |  2 +-
 2 files changed, 7 insertions(+), 5 deletions(-)

diff --git a/block/bio.c b/block/bio.c
index daceb0a5c1d7..948b22825510 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -1665,16 +1665,18 @@ struct bio *bio_split(struct bio *bio, int sectors,
 {
 	struct bio *split;

-	BUG_ON(sectors <= 0);
-	BUG_ON(sectors >= bio_sectors(bio));
+	if (WARN_ON_ONCE(sectors <= 0))
+		return ERR_PTR(-EINVAL);
+	if (WARN_ON_ONCE(sectors >= bio_sectors(bio)))
+		return ERR_PTR(-EINVAL);

 	/* Zone append commands cannot be split */
 	if (WARN_ON_ONCE(bio_op(bio) == REQ_OP_ZONE_APPEND))
-		return NULL;
+		return ERR_PTR(-EINVAL);

 	split = bio_alloc_clone(bio->bi_bdev, bio, gfp, bs);
 	if (!split)
-		return NULL;
+		return ERR_PTR(-ENOMEM);

 	split->bi_iter.bi_size = sectors << 9;

diff --git a/block/blk-crypto-fallback.c b/block/blk-crypto-fallback.c
index b1e7415f8439..29a205482617 100644
--- a/block/blk-crypto-fallback.c
+++ b/block/blk-crypto-fallback.c
@@ -226,7 +226,7 @@ static bool blk_crypto_fallback_split_bio_if_needed(struct bio **bio_ptr)
 		split_bio = bio_split(bio, num_sectors, GFP_NOIO,
 				      &crypto_bio_split);
-		if (!split_bio) {
+		if (IS_ERR(split_bio)) {
 			bio->bi_status = BLK_STS_RESOURCE;
 			return false;
 		}
-- 
2.51.0

From 27b26f09a7e6ae3ecae460299349b31fe0b5452f Mon Sep 17 00:00:00 2001
From: John Garry
Date: Mon, 11 Nov 2024 11:21:46 +0000
Subject: [PATCH 09/16] block: Error an attempt to split an atomic write in bio_split()

This is disallowed.

Reviewed-by: Christoph Hellwig
Reviewed-by: Johannes Thumshirn
Reviewed-by: Hannes Reinecke
Signed-off-by: John Garry
Link: https://lore.kernel.org/r/20241111112150.3756529-3-john.g.garry@oracle.com
Signed-off-by: Jens Axboe
---
 block/bio.c | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/block/bio.c b/block/bio.c
index 948b22825510..699a78c85c75 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -1674,6 +1674,10 @@ struct bio *bio_split(struct bio *bio, int sectors,
 	if (WARN_ON_ONCE(bio_op(bio) == REQ_OP_ZONE_APPEND))
 		return ERR_PTR(-EINVAL);

+	/* atomic writes cannot be split */
+	if (bio->bi_opf & REQ_ATOMIC)
+		return ERR_PTR(-EINVAL);
+
 	split = bio_alloc_clone(bio->bi_bdev, bio, gfp, bs);
 	if (!split)
 		return ERR_PTR(-ENOMEM);
-- 
2.51.0

From 6eb09685885a4445da31097aa6418ee1875f9cec Mon Sep 17 00:00:00 2001
From: John Garry
Date: Mon, 11 Nov 2024 11:21:47 +0000
Subject: [PATCH 10/16] block: Handle bio_split() errors in bio_submit_split()

bio_split() may error, so check this.
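Condensed, the new error path stores the errno carried by the ERR_PTR and reuses the existing failure handling (a sketch of the diff below):

	split = bio_split(bio, split_sectors, GFP_NOIO,
			&bio->bi_bdev->bd_disk->bio_split);
	if (IS_ERR(split)) {
		split_sectors = PTR_ERR(split);
		goto error;
	}
	...
error:
	bio->bi_status = errno_to_blk_status(split_sectors);
	bio_endio(bio);
	return NULL;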
Reviewed-by: Christoph Hellwig
Reviewed-by: Johannes Thumshirn
Reviewed-by: Hannes Reinecke
Signed-off-by: John Garry
Link: https://lore.kernel.org/r/20241111112150.3756529-4-john.g.garry@oracle.com
Signed-off-by: Jens Axboe
---
 block/blk-merge.c | 15 ++++++++++-----
 1 file changed, 10 insertions(+), 5 deletions(-)

diff --git a/block/blk-merge.c b/block/blk-merge.c
index d813d799cee7..4cbccdbba638 100644
--- a/block/blk-merge.c
+++ b/block/blk-merge.c
@@ -107,17 +107,18 @@ static unsigned int bio_allowed_max_sectors(const struct queue_limits *lim)
 static struct bio *bio_submit_split(struct bio *bio, int split_sectors)
 {
-	if (unlikely(split_sectors < 0)) {
-		bio->bi_status = errno_to_blk_status(split_sectors);
-		bio_endio(bio);
-		return NULL;
-	}
+	if (unlikely(split_sectors < 0))
+		goto error;

 	if (split_sectors) {
 		struct bio *split;

 		split = bio_split(bio, split_sectors, GFP_NOIO,
 				&bio->bi_bdev->bd_disk->bio_split);
+		if (IS_ERR(split)) {
+			split_sectors = PTR_ERR(split);
+			goto error;
+		}
 		split->bi_opf |= REQ_NOMERGE;
 		blkcg_bio_issue_init(split);
 		bio_chain(split, bio);
@@ -128,6 +129,10 @@ static struct bio *bio_submit_split(struct bio *bio, int split_sectors)
 	}

 	return bio;
+error:
+	bio->bi_status = errno_to_blk_status(split_sectors);
+	bio_endio(bio);
+	return NULL;
 }

 struct bio *bio_split_discard(struct bio *bio, const struct queue_limits *lim,
-- 
2.51.0

From 74538fdac3e85aae55eb4ed786478ed2384cb85d Mon Sep 17 00:00:00 2001
From: John Garry
Date: Mon, 11 Nov 2024 11:21:48 +0000
Subject: [PATCH 11/16] md/raid0: Handle bio_split() errors

Add proper bio_split() error handling. For any error, set bi_status, end the bio, and return.

Reviewed-by: Yu Kuai
Reviewed-by: Hannes Reinecke
Signed-off-by: John Garry
Link: https://lore.kernel.org/r/20241111112150.3756529-5-john.g.garry@oracle.com
Signed-off-by: Jens Axboe
---
 drivers/md/raid0.c | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c
index 32d587524778..baaf5f8b80ae 100644
--- a/drivers/md/raid0.c
+++ b/drivers/md/raid0.c
@@ -466,6 +466,12 @@ static void raid0_handle_discard(struct mddev *mddev, struct bio *bio)
 		struct bio *split = bio_split(bio,
 			zone->zone_end - bio->bi_iter.bi_sector, GFP_NOIO,
 			&mddev->bio_set);
+
+		if (IS_ERR(split)) {
+			bio->bi_status = errno_to_blk_status(PTR_ERR(split));
+			bio_endio(bio);
+			return;
+		}
 		bio_chain(split, bio);
 		submit_bio_noacct(bio);
 		bio = split;
@@ -608,6 +614,12 @@ static bool raid0_make_request(struct mddev *mddev, struct bio *bio)
 	if (sectors < bio_sectors(bio)) {
 		struct bio *split = bio_split(bio, sectors, GFP_NOIO,
 					      &mddev->bio_set);
+
+		if (IS_ERR(split)) {
+			bio->bi_status = errno_to_blk_status(PTR_ERR(split));
+			bio_endio(bio);
+			return true;
+		}
 		bio_chain(split, bio);
 		raid0_map_submit_bio(mddev, bio);
 		bio = split;
-- 
2.51.0

From b1a7ad8b5c4fa28325ee7b369a2d545d3e16ccde Mon Sep 17 00:00:00 2001
From: John Garry
Date: Mon, 11 Nov 2024 11:21:49 +0000
Subject: [PATCH 12/16] md/raid1: Handle bio_split() errors

Add proper bio_split() error handling. For any error, call raid_end_bio_io() and return. For the case of an error in the write path, we need to undo the increment in the rdev pending count and NULLify the r1_bio->bios[] pointers. For a read path failure, we need to undo the rdev pending count increment from the earlier read_balance() call.
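Condensed, the write-path unwind looks like this (sketch; the full context is in the diff below). Only disks 0..i-1 had their pending count raised before the split failed, so only those references are dropped:

	err_handle:
		for (k = 0; k < i; k++) {
			if (r1_bio->bios[k]) {
				rdev_dec_pending(conf->mirrors[k].rdev, mddev);
				r1_bio->bios[k] = NULL;
			}
		}
		bio->bi_status = errno_to_blk_status(error);
		raid_end_bio_io(r1_bio);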
Reviewed-by: Yu Kuai
Reviewed-by: Hannes Reinecke
Signed-off-by: John Garry
Link: https://lore.kernel.org/r/20241111112150.3756529-6-john.g.garry@oracle.com
Signed-off-by: Jens Axboe
---
 drivers/md/raid1.c | 33 +++++++++++++++++++++++++++++++--
 1 file changed, 31 insertions(+), 2 deletions(-)

diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index cd3e94dceabc..a5adf08ee174 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -1322,7 +1322,7 @@ static void raid1_read_request(struct mddev *mddev, struct bio *bio,
 	const enum req_op op = bio_op(bio);
 	const blk_opf_t do_sync = bio->bi_opf & REQ_SYNC;
 	int max_sectors;
-	int rdisk;
+	int rdisk, error;
 	bool r1bio_existed = !!r1_bio;

 	/*
@@ -1383,6 +1383,11 @@ static void raid1_read_request(struct mddev *mddev, struct bio *bio,
 	if (max_sectors < bio_sectors(bio)) {
 		struct bio *split = bio_split(bio, max_sectors,
 					      gfp, &conf->bio_split);
+
+		if (IS_ERR(split)) {
+			error = PTR_ERR(split);
+			goto err_handle;
+		}
 		bio_chain(split, bio);
 		submit_bio_noacct(bio);
 		bio = split;
@@ -1410,6 +1415,13 @@ static void raid1_read_request(struct mddev *mddev, struct bio *bio,
 	read_bio->bi_private = r1_bio;
 	mddev_trace_remap(mddev, read_bio, r1_bio->sector);
 	submit_bio_noacct(read_bio);
+	return;
+
+err_handle:
+	atomic_dec(&mirror->rdev->nr_pending);
+	bio->bi_status = errno_to_blk_status(error);
+	set_bit(R1BIO_Uptodate, &r1_bio->state);
+	raid_end_bio_io(r1_bio);
 }

 static bool wait_blocked_rdev(struct mddev *mddev, struct bio *bio)
@@ -1451,7 +1463,7 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio,
 {
 	struct r1conf *conf = mddev->private;
 	struct r1bio *r1_bio;
-	int i, disks;
+	int i, disks, k, error;
 	unsigned long flags;
 	int first_clone;
 	int max_sectors;
@@ -1579,6 +1591,11 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio,
 	if (max_sectors < bio_sectors(bio)) {
 		struct bio *split = bio_split(bio, max_sectors,
 					      GFP_NOIO, &conf->bio_split);
+
+		if (IS_ERR(split)) {
+			error = PTR_ERR(split);
+			goto err_handle;
+		}
 		bio_chain(split, bio);
 		submit_bio_noacct(bio);
 		bio = split;
@@ -1663,6 +1680,18 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio,

 	/* In case raid1d snuck in to freeze_array */
 	wake_up_barrier(conf);
+	return;
+err_handle:
+	for (k = 0; k < i; k++) {
+		if (r1_bio->bios[k]) {
+			rdev_dec_pending(conf->mirrors[k].rdev, mddev);
+			r1_bio->bios[k] = NULL;
+		}
+	}
+
+	bio->bi_status = errno_to_blk_status(error);
+	set_bit(R1BIO_Uptodate, &r1_bio->state);
+	raid_end_bio_io(r1_bio);
 }

 static bool raid1_make_request(struct mddev *mddev, struct bio *bio)
-- 
2.51.0

From 4cf58d9529097328b669e3c8693ed21e3a041903 Mon Sep 17 00:00:00 2001
From: John Garry
Date: Mon, 11 Nov 2024 11:21:50 +0000
Subject: [PATCH 13/16] md/raid10: Handle bio_split() errors

Add proper bio_split() error handling. For any error, call raid_end_bio_io() and return, except for discard, where we end the bio directly.
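The discard path is the exception because no r10bio has been set up at its split points, so there is nothing for raid_end_bio_io() to finish there; condensed from the diff below:

	split = bio_split(bio, split_size, GFP_NOIO, &conf->bio_split);
	if (IS_ERR(split)) {
		/* No r10bio to complete: fail the original bio directly. */
		bio->bi_status = errno_to_blk_status(PTR_ERR(split));
		bio_endio(bio);
		return 0;
	}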
Reviewed-by: Yu Kuai
Reviewed-by: Hannes Reinecke
Signed-off-by: John Garry
Link: https://lore.kernel.org/r/20241111112150.3756529-7-john.g.garry@oracle.com
Signed-off-by: Jens Axboe
---
 drivers/md/raid10.c | 47 ++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 46 insertions(+), 1 deletion(-)

diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index ff73db2f6c41..8c7f5daa073a 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -1159,6 +1159,7 @@ static void raid10_read_request(struct mddev *mddev, struct bio *bio,
 	int slot = r10_bio->read_slot;
 	struct md_rdev *err_rdev = NULL;
 	gfp_t gfp = GFP_NOIO;
+	int error;

 	if (slot >= 0 && r10_bio->devs[slot].rdev) {
 		/*
@@ -1206,6 +1207,10 @@ static void raid10_read_request(struct mddev *mddev, struct bio *bio,
 	if (max_sectors < bio_sectors(bio)) {
 		struct bio *split = bio_split(bio, max_sectors,
 					      gfp, &conf->bio_split);
+		if (IS_ERR(split)) {
+			error = PTR_ERR(split);
+			goto err_handle;
+		}
 		bio_chain(split, bio);
 		allow_barrier(conf);
 		submit_bio_noacct(bio);
@@ -1236,6 +1241,11 @@ static void raid10_read_request(struct mddev *mddev, struct bio *bio,
 	mddev_trace_remap(mddev, read_bio, r10_bio->sector);
 	submit_bio_noacct(read_bio);
 	return;
+err_handle:
+	atomic_dec(&rdev->nr_pending);
+	bio->bi_status = errno_to_blk_status(error);
+	set_bit(R10BIO_Uptodate, &r10_bio->state);
+	raid_end_bio_io(r10_bio);
 }

 static void raid10_write_one_disk(struct mddev *mddev, struct r10bio *r10_bio,
@@ -1343,9 +1353,10 @@ static void raid10_write_request(struct mddev *mddev, struct bio *bio,
 				 struct r10bio *r10_bio)
 {
 	struct r10conf *conf = mddev->private;
-	int i;
+	int i, k;
 	sector_t sectors;
 	int max_sectors;
+	int error;

 	if ((mddev_is_clustered(mddev) &&
 	     md_cluster_ops->area_resyncing(mddev, WRITE,
@@ -1478,6 +1489,10 @@ static void raid10_write_request(struct mddev *mddev, struct bio *bio,
 	if (r10_bio->sectors < bio_sectors(bio)) {
 		struct bio *split = bio_split(bio, r10_bio->sectors,
 					      GFP_NOIO, &conf->bio_split);
+		if (IS_ERR(split)) {
+			error = PTR_ERR(split);
+			goto err_handle;
+		}
 		bio_chain(split, bio);
 		allow_barrier(conf);
 		submit_bio_noacct(bio);
@@ -1499,6 +1514,26 @@ static void raid10_write_request(struct mddev *mddev, struct bio *bio,
 			raid10_write_one_disk(mddev, r10_bio, bio, true, i);
 	}
 	one_write_done(r10_bio);
+	return;
+err_handle:
+	for (k = 0; k < i; k++) {
+		int d = r10_bio->devs[k].devnum;
+		struct md_rdev *rdev = conf->mirrors[d].rdev;
+		struct md_rdev *rrdev = conf->mirrors[d].replacement;
+
+		if (r10_bio->devs[k].bio) {
+			rdev_dec_pending(rdev, mddev);
+			r10_bio->devs[k].bio = NULL;
+		}
+		if (r10_bio->devs[k].repl_bio) {
+			rdev_dec_pending(rrdev, mddev);
+			r10_bio->devs[k].repl_bio = NULL;
+		}
+	}
+
+	bio->bi_status = errno_to_blk_status(error);
+	set_bit(R10BIO_Uptodate, &r10_bio->state);
+	raid_end_bio_io(r10_bio);
 }

 static void __make_request(struct mddev *mddev, struct bio *bio, int sectors)
@@ -1640,6 +1675,11 @@ static int raid10_handle_discard(struct mddev *mddev, struct bio *bio)
 		if (remainder) {
 			split_size = stripe_size - remainder;
 			split = bio_split(bio, split_size, GFP_NOIO, &conf->bio_split);
+			if (IS_ERR(split)) {
+				bio->bi_status = errno_to_blk_status(PTR_ERR(split));
+				bio_endio(bio);
+				return 0;
+			}
 			bio_chain(split, bio);
 			allow_barrier(conf);
 			/* Resend the fist split part */
@@ -1650,6 +1690,11 @@ static int raid10_handle_discard(struct mddev *mddev, struct bio *bio)
 		if (remainder) {
 			split_size = bio_sectors(bio) - remainder;
 			split = bio_split(bio, split_size, GFP_NOIO, &conf->bio_split);
+			if (IS_ERR(split)) {
+ bio->bi_status = errno_to_blk_status(PTR_ERR(split)); + bio_endio(bio); + return 0; + } bio_chain(split, bio); allow_barrier(conf); /* Resend the second split part */ -- 2.51.0 From 60dc5ea6bcfd078b71419640d49afa649acf9450 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 4 Nov 2024 07:26:29 +0100 Subject: [PATCH 14/16] block: take chunk_sectors into account in bio_split_write_zeroes For zoned devices, write zeroes must be split at the zone boundary which is represented as chunk_sectors. For other uses like the internally RAIDed NVMe devices it is probably at least useful. Enhance get_max_io_size to know about write zeroes and use it in bio_split_write_zeroes. Also add a comment about the seemingly nonsensical zero max_write_zeroes limit. Fixes: 885fa13f6559 ("block: implement splitting of REQ_OP_WRITE_ZEROES bios") Signed-off-by: Christoph Hellwig Reviewed-by: Damien Le Moal Link: https://lore.kernel.org/r/20241104062647.91160-2-hch@lst.de Signed-off-by: Jens Axboe --- block/blk-merge.c | 35 +++++++++++++++++++++++------------ 1 file changed, 23 insertions(+), 12 deletions(-) diff --git a/block/blk-merge.c b/block/blk-merge.c index 4cbccdbba638..cc320d9aed1a 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -171,17 +171,6 @@ struct bio *bio_split_discard(struct bio *bio, const struct queue_limits *lim, return bio_submit_split(bio, split_sectors); } -struct bio *bio_split_write_zeroes(struct bio *bio, - const struct queue_limits *lim, unsigned *nsegs) -{ - *nsegs = 0; - if (!lim->max_write_zeroes_sectors) - return bio; - if (bio_sectors(bio) <= lim->max_write_zeroes_sectors) - return bio; - return bio_submit_split(bio, lim->max_write_zeroes_sectors); -} - static inline unsigned int blk_boundary_sectors(const struct queue_limits *lim, bool is_atomic) { @@ -216,7 +205,9 @@ static inline unsigned get_max_io_size(struct bio *bio, * We ignore lim->max_sectors for atomic writes because it may less * than the actual bio size, which we cannot tolerate. */ - if (is_atomic) + if (bio_op(bio) == REQ_OP_WRITE_ZEROES) + max_sectors = lim->max_write_zeroes_sectors; + else if (is_atomic) max_sectors = lim->atomic_write_max_sectors; else max_sectors = lim->max_sectors; @@ -403,6 +394,26 @@ struct bio *bio_split_zone_append(struct bio *bio, return bio_submit_split(bio, split_sectors); } +struct bio *bio_split_write_zeroes(struct bio *bio, + const struct queue_limits *lim, unsigned *nsegs) +{ + unsigned int max_sectors = get_max_io_size(bio, lim); + + *nsegs = 0; + + /* + * An unset limit should normally not happen, as bio submission is keyed + * off having a non-zero limit. But SCSI can clear the limit in the + * I/O completion handler, and we can race and see this. Splitting to a + * zero limit obviously doesn't make sense, so band-aid it here. + */ + if (!max_sectors) + return bio; + if (bio_sectors(bio) <= max_sectors) + return bio; + return bio_submit_split(bio, max_sectors); +} + /** * bio_split_to_limits - split a bio to fit the queue limits * @bio: bio to be split -- 2.51.0 From 7ecd2cd4fae3e8410c0a6620f3a83dcdbb254f02 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 4 Nov 2024 07:26:30 +0100 Subject: [PATCH 15/16] block: fix bio_split_rw_at to take zone_write_granularity into account Otherwise it can create unaligned writes on zoned devices. 
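A worked example with illustrative numbers: with logical_block_size = 512, zone_write_granularity = 4096, and a candidate split of 7680 bytes:

	ALIGN_DOWN(7680, 512)  = 7680	/* 512-byte aligned, but 7680 % 4096 = 3584 */
	ALIGN_DOWN(7680, 4096) = 4096	/* aligned to the zone write granularity */

The first result leaves the follow-on write starting mid-granule on the zoned device; aligning the split of writes to zone_write_granularity instead avoids that.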
Fixes: a805a4fa4fa3 ("block: introduce zone_write_granularity limit")
Signed-off-by: Christoph Hellwig
Reviewed-by: Damien Le Moal
Reviewed-by: Johannes Thumshirn
Link: https://lore.kernel.org/r/20241104062647.91160-3-hch@lst.de
Signed-off-by: Jens Axboe
---
 block/blk-merge.c | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/block/blk-merge.c b/block/blk-merge.c
index cc320d9aed1a..859875a5ee90 100644
--- a/block/blk-merge.c
+++ b/block/blk-merge.c
@@ -292,6 +292,14 @@ static bool bvec_split_segs(const struct queue_limits *lim,
 	return len > 0 || bv->bv_len > max_len;
 }

+static unsigned int bio_split_alignment(struct bio *bio,
+		const struct queue_limits *lim)
+{
+	if (op_is_write(bio_op(bio)) && lim->zone_write_granularity)
+		return lim->zone_write_granularity;
+	return lim->logical_block_size;
+}
+
 /**
  * bio_split_rw_at - check if and where to split a read/write bio
  * @bio:  [in] bio to be split
@@ -354,7 +362,7 @@ split:
 	 * split size so that each bio is properly block size aligned, even if
 	 * we do not use the full hardware limits.
 	 */
-	bytes = ALIGN_DOWN(bytes, lim->logical_block_size);
+	bytes = ALIGN_DOWN(bytes, bio_split_alignment(bio, lim));

 	/*
 	 * Bio splitting may cause subtle trouble such as hang when doing sync
-- 
2.51.0

From 0ef2b9e698dbf9ba78f67952a747f35eb7060470 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig
Date: Mon, 4 Nov 2024 07:26:31 +0100
Subject: [PATCH 16/16] block: lift bio_is_zone_append to bio.h

Make bio_is_zone_append globally available, because file systems need to use it to check for a zone append bio in their end_io handlers to deal with the block layer emulation.

Signed-off-by: Christoph Hellwig
Reviewed-by: Damien Le Moal
Reviewed-by: Johannes Thumshirn
Link: https://lore.kernel.org/r/20241104062647.91160-4-hch@lst.de
Signed-off-by: Jens Axboe
---
 block/blk.h         |  9 ---------
 include/linux/bio.h | 17 +++++++++++++++++
 2 files changed, 17 insertions(+), 9 deletions(-)

diff --git a/block/blk.h b/block/blk.h
index 57fc035620d6..2c26abf505b8 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -457,11 +457,6 @@ static inline bool bio_zone_write_plugging(struct bio *bio)
 {
 	return bio_flagged(bio, BIO_ZONE_WRITE_PLUGGING);
 }
-static inline bool bio_is_zone_append(struct bio *bio)
-{
-	return bio_op(bio) == REQ_OP_ZONE_APPEND ||
-		bio_flagged(bio, BIO_EMULATES_ZONE_APPEND);
-}
 void blk_zone_write_plug_bio_merged(struct bio *bio);
 void blk_zone_write_plug_init_request(struct request *rq);
 static inline void blk_zone_update_request_bio(struct request *rq,
@@ -510,10 +505,6 @@ static inline bool bio_zone_write_plugging(struct bio *bio)
 {
 	return false;
 }
-static inline bool bio_is_zone_append(struct bio *bio)
-{
-	return false;
-}
 static inline void blk_zone_write_plug_bio_merged(struct bio *bio)
 {
 }
diff --git a/include/linux/bio.h b/include/linux/bio.h
index 4a1bf43ca53d..60830a6a5939 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -675,6 +675,23 @@ static inline void bio_clear_polled(struct bio *bio)
 	bio->bi_opf &= ~REQ_POLLED;
 }

+/**
+ * bio_is_zone_append - is this a zone append bio?
+ * @bio:	bio to check
+ *
+ * Check if @bio is a zone append operation.  Core block layer code and end_io
+ * handlers must use this instead of an open coded REQ_OP_ZONE_APPEND check
+ * because the block layer can rewrite REQ_OP_ZONE_APPEND to REQ_OP_WRITE if
+ * it is not natively supported.
+ */ +static inline bool bio_is_zone_append(struct bio *bio) +{ + if (!IS_ENABLED(CONFIG_BLK_DEV_ZONED)) + return false; + return bio_op(bio) == REQ_OP_ZONE_APPEND || + bio_flagged(bio, BIO_EMULATES_ZONE_APPEND); +} + struct bio *blk_next_bio(struct bio *bio, struct block_device *bdev, unsigned int nr_pages, blk_opf_t opf, gfp_t gfp); struct bio *bio_chain_and_submit(struct bio *prev, struct bio *new); -- 2.51.0
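For context, a hypothetical filesystem end_io handler using the lifted helper might look as follows (illustrative only; the myfs_* names are made up and are not part of this series):

	static void myfs_end_io(struct bio *bio)
	{
		/*
		 * True both for native REQ_OP_ZONE_APPEND bios and for writes
		 * that emulate zone append (BIO_EMULATES_ZONE_APPEND), which an
		 * open coded opcode check would miss.
		 */
		if (bio_is_zone_append(bio))
			myfs_record_append_location(bio);
		bio_put(bio);
	}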