From 957860cbc1dc89f79f2acc193470224e350dfd03 Mon Sep 17 00:00:00 2001
From: Jens Axboe
Date: Fri, 15 Nov 2024 07:14:03 -0700
Subject: [PATCH 01/16] block: make struct rq_list available for !CONFIG_BLOCK

A previous commit changed how requests are linked in the plug
structure; unlike the previous method, it links them through a new
struct rq_list type rather than through struct request. The latter is
available even for !CONFIG_BLOCK, while struct rq_list is not. Move the
rq_list definition outside the CONFIG_BLOCK ifdef.

Reported-by: Nathan Chancellor
Fixes: a3396b99990d ("block: add a rq_list type")
Signed-off-by: Jens Axboe
---
 include/linux/blkdev.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 00212e96261a..a1fd0ddce5cf 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -1006,12 +1006,12 @@ extern void blk_put_queue(struct request_queue *);
 void blk_mark_disk_dead(struct gendisk *disk);
 
-#ifdef CONFIG_BLOCK
 struct rq_list {
 	struct request *head;
 	struct request *tail;
 };
 
+#ifdef CONFIG_BLOCK
 /*
  * blk_plug permits building a queue of related requests by holding the I/O
  * fragments for a short period. This allows merging of sequential requests
-- 
2.51.0

From 886e4757f42e5775b7c2a6e0e5e2dc9a78177b0a Mon Sep 17 00:00:00 2001
From: Song Liu
Date: Fri, 15 Nov 2024 10:25:53 -0800
Subject: [PATCH 02/16] MAINTAINERS: Update git tree for mdraid subsystem

Move the official git tree to the MDRAID group account.

Signed-off-by: Song Liu
---
 MAINTAINERS | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/MAINTAINERS b/MAINTAINERS
index eeaa9f59dfe3..d9f35571da0f 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -21307,7 +21307,7 @@ M:	Yu Kuai
 L:	linux-raid@vger.kernel.org
 S:	Supported
 Q:	https://patchwork.kernel.org/project/linux-raid/list/
-T:	git git://git.kernel.org/pub/scm/linux/kernel/git/song/md.git
+T:	git git://git.kernel.org/pub/scm/linux/kernel/git/mdraid/linux.git
 F:	drivers/md/Kconfig
 F:	drivers/md/Makefile
 F:	drivers/md/md*
-- 
2.51.0

From a3f143c461444c0b56360bbf468615fa814a8372 Mon Sep 17 00:00:00 2001
From: Manas
Date: Mon, 18 Nov 2024 20:06:58 +0530
Subject: [PATCH 03/16] rust: block: simplify Result<()> in validate_block_size return

`Result` is used in place of `Result<()>` because its default type
parameters are the unit type `()` and `Error`, which are inferred
automatically. Thus keep the usage consistent throughout the codebase.

Suggested-by: Miguel Ojeda
Link: https://github.com/Rust-for-Linux/linux/issues/1128
Signed-off-by: Manas
Reviewed-by: Miguel Ojeda
Link: https://lore.kernel.org/r/20241118-simplify-result-v3-1-6b1566a77eab@iiitd.ac.in
Signed-off-by: Jens Axboe
---
 rust/kernel/block/mq/gen_disk.rs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/rust/kernel/block/mq/gen_disk.rs b/rust/kernel/block/mq/gen_disk.rs
index 708125dce96a..798c4ae0bded 100644
--- a/rust/kernel/block/mq/gen_disk.rs
+++ b/rust/kernel/block/mq/gen_disk.rs
@@ -45,7 +45,7 @@ impl GenDiskBuilder {
 
     /// Validate block size by verifying that it is between 512 and `PAGE_SIZE`,
     /// and that it is a power of two.
-    fn validate_block_size(size: u32) -> Result<()> {
+    fn validate_block_size(size: u32) -> Result {
         if !(512..=bindings::PAGE_SIZE as u32).contains(&size) || !size.is_power_of_two() {
             Err(error::code::EINVAL)
         } else {
-- 
2.51.0

From 9c0ba14828d64744ccd195c610594ba254a1a9ab Mon Sep 17 00:00:00 2001
From: Mikulas Patocka
Date: Mon, 18 Nov 2024 15:52:50 +0100
Subject: [PATCH 04/16] blk-settings: round down io_opt to physical_block_size

There was a bug report [1] where the user got a warning about an
alignment inconsistency. The user had an optimal I/O size of 16776704
(0xFFFE00) and a physical block size of 4096.

Note that the optimal I/O size may be set by DMA engines or SCSI
controllers, which have no knowledge of the disks attached to them, so
an optimal I/O size that is not aligned to the physical block size can
happen.

This commit makes blk_validate_limits round down the optimal I/O size
to the physical block size of the block device. For the reported case,
16776704 is rounded down to 16773120 (0xFFF000), a multiple of 4096.

Closes: https://lore.kernel.org/dm-devel/1426ad71-79b4-4062-b2bf-84278be66a5d@redhat.com/T/ [1]
Signed-off-by: Mikulas Patocka
Fixes: a23634644afc ("block: take io_opt and io_min into account for max_sectors")
Cc: stable@vger.kernel.org # v6.11+
Reviewed-by: Christoph Hellwig
Link: https://lore.kernel.org/r/3dc0014b-9690-dc38-81c9-4a316a2d4fb2@redhat.com
Signed-off-by: Jens Axboe
---
 block/blk-settings.c | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/block/blk-settings.c b/block/blk-settings.c
index f1d4dfdc37a7..0fe16f987cde 100644
--- a/block/blk-settings.c
+++ b/block/blk-settings.c
@@ -248,6 +248,13 @@ int blk_validate_limits(struct queue_limits *lim)
 	if (lim->io_min < lim->physical_block_size)
 		lim->io_min = lim->physical_block_size;
 
+	/*
+	 * The optimal I/O size may not be aligned to physical block size
+	 * (because it may be limited by dma engines which have no clue about
+	 * block size of the disks attached to them), so we round it down here.
+	 */
+	lim->io_opt = round_down(lim->io_opt, lim->physical_block_size);
+
 	/*
 	 * max_hw_sectors has a somewhat weird default for historical reason,
 	 * but driver really should set their own instead of relying on this
-- 
2.51.0

From 3802f73bd80766d70f319658f334754164075bc3 Mon Sep 17 00:00:00 2001
From: Yu Kuai
Date: Mon, 4 Nov 2024 19:00:05 +0800
Subject: [PATCH 05/16] block: fix uaf for flush rq while iterating tags

blk_mq_clear_flush_rq_mapping() is not called during SCSI probe, which
is detected by checking blk_queue_init_done().
However, QUEUE_FLAG_INIT_DONE is cleared in del_gendisk by commit
aec89dc5d421 ("block: keep q_usage_counter in atomic mode after
del_gendisk"), hence for disks like SCSI a following
blk_mq_destroy_queue() will not clear the flush rq from tags->rqs[]
either, causing the following UAF found by our syzkaller on v6.6:

==================================================================
BUG: KASAN: slab-use-after-free in blk_mq_find_and_get_req+0x16e/0x1a0 block/blk-mq-tag.c:261
Read of size 4 at addr ffff88811c969c20 by task kworker/1:2H/224909

CPU: 1 PID: 224909 Comm: kworker/1:2H Not tainted 6.6.0-ga836a5060850 #32
Workqueue: kblockd blk_mq_timeout_work
Call Trace:
 __dump_stack lib/dump_stack.c:88 [inline]
 dump_stack_lvl+0x91/0xf0 lib/dump_stack.c:106
 print_address_description.constprop.0+0x66/0x300 mm/kasan/report.c:364
 print_report+0x3e/0x70 mm/kasan/report.c:475
 kasan_report+0xb8/0xf0 mm/kasan/report.c:588
 blk_mq_find_and_get_req+0x16e/0x1a0 block/blk-mq-tag.c:261
 bt_iter block/blk-mq-tag.c:288 [inline]
 __sbitmap_for_each_set include/linux/sbitmap.h:295 [inline]
 sbitmap_for_each_set include/linux/sbitmap.h:316 [inline]
 bt_for_each+0x455/0x790 block/blk-mq-tag.c:325
 blk_mq_queue_tag_busy_iter+0x320/0x740 block/blk-mq-tag.c:534
 blk_mq_timeout_work+0x1a3/0x7b0 block/blk-mq.c:1673
 process_one_work+0x7c4/0x1450 kernel/workqueue.c:2631
 process_scheduled_works kernel/workqueue.c:2704 [inline]
 worker_thread+0x804/0xe40 kernel/workqueue.c:2785
 kthread+0x346/0x450 kernel/kthread.c:388
 ret_from_fork+0x4d/0x80 arch/x86/kernel/process.c:147
 ret_from_fork_asm+0x1b/0x30 arch/x86/entry/entry_64.S:293

Allocated by task 942:
 kasan_save_stack+0x22/0x50 mm/kasan/common.c:45
 kasan_set_track+0x25/0x30 mm/kasan/common.c:52
 ____kasan_kmalloc mm/kasan/common.c:374 [inline]
 __kasan_kmalloc mm/kasan/common.c:383 [inline]
 __kasan_kmalloc+0xaa/0xb0 mm/kasan/common.c:380
 kasan_kmalloc include/linux/kasan.h:198 [inline]
 __do_kmalloc_node mm/slab_common.c:1007 [inline]
 __kmalloc_node+0x69/0x170 mm/slab_common.c:1014
 kmalloc_node include/linux/slab.h:620 [inline]
 kzalloc_node include/linux/slab.h:732 [inline]
 blk_alloc_flush_queue+0x144/0x2f0 block/blk-flush.c:499
 blk_mq_alloc_hctx+0x601/0x940 block/blk-mq.c:3788
 blk_mq_alloc_and_init_hctx+0x27f/0x330 block/blk-mq.c:4261
 blk_mq_realloc_hw_ctxs+0x488/0x5e0 block/blk-mq.c:4294
 blk_mq_init_allocated_queue+0x188/0x860 block/blk-mq.c:4350
 blk_mq_init_queue_data block/blk-mq.c:4166 [inline]
 blk_mq_init_queue+0x8d/0x100 block/blk-mq.c:4176
 scsi_alloc_sdev+0x843/0xd50 drivers/scsi/scsi_scan.c:335
 scsi_probe_and_add_lun+0x77c/0xde0 drivers/scsi/scsi_scan.c:1189
 __scsi_scan_target+0x1fc/0x5a0 drivers/scsi/scsi_scan.c:1727
 scsi_scan_channel drivers/scsi/scsi_scan.c:1815 [inline]
 scsi_scan_channel+0x14b/0x1e0 drivers/scsi/scsi_scan.c:1791
 scsi_scan_host_selected+0x2fe/0x400 drivers/scsi/scsi_scan.c:1844
 scsi_scan+0x3a0/0x3f0 drivers/scsi/scsi_sysfs.c:151
 store_scan+0x2a/0x60 drivers/scsi/scsi_sysfs.c:191
 dev_attr_store+0x5c/0x90 drivers/base/core.c:2388
 sysfs_kf_write+0x11c/0x170 fs/sysfs/file.c:136
 kernfs_fop_write_iter+0x3fc/0x610 fs/kernfs/file.c:338
 call_write_iter include/linux/fs.h:2083 [inline]
 new_sync_write+0x1b4/0x2d0 fs/read_write.c:493
 vfs_write+0x76c/0xb00 fs/read_write.c:586
 ksys_write+0x127/0x250 fs/read_write.c:639
 do_syscall_x64 arch/x86/entry/common.c:51 [inline]
 do_syscall_64+0x70/0x120 arch/x86/entry/common.c:81
 entry_SYSCALL_64_after_hwframe+0x78/0xe2

Freed by task 244687:
 kasan_save_stack+0x22/0x50 mm/kasan/common.c:45
 kasan_set_track+0x25/0x30 mm/kasan/common.c:52
 kasan_save_free_info+0x2b/0x50 mm/kasan/generic.c:522
 ____kasan_slab_free mm/kasan/common.c:236 [inline]
 __kasan_slab_free+0x12a/0x1b0 mm/kasan/common.c:244
 kasan_slab_free include/linux/kasan.h:164 [inline]
 slab_free_hook mm/slub.c:1815 [inline]
 slab_free_freelist_hook mm/slub.c:1841 [inline]
 slab_free mm/slub.c:3807 [inline]
 __kmem_cache_free+0xe4/0x520 mm/slub.c:3820
 blk_free_flush_queue+0x40/0x60 block/blk-flush.c:520
 blk_mq_hw_sysfs_release+0x4a/0x170 block/blk-mq-sysfs.c:37
 kobject_cleanup+0x136/0x410 lib/kobject.c:689
 kobject_release lib/kobject.c:720 [inline]
 kref_put include/linux/kref.h:65 [inline]
 kobject_put+0x119/0x140 lib/kobject.c:737
 blk_mq_release+0x24f/0x3f0 block/blk-mq.c:4144
 blk_free_queue block/blk-core.c:298 [inline]
 blk_put_queue+0xe2/0x180 block/blk-core.c:314
 blkg_free_workfn+0x376/0x6e0 block/blk-cgroup.c:144
 process_one_work+0x7c4/0x1450 kernel/workqueue.c:2631
 process_scheduled_works kernel/workqueue.c:2704 [inline]
 worker_thread+0x804/0xe40 kernel/workqueue.c:2785
 kthread+0x346/0x450 kernel/kthread.c:388
 ret_from_fork+0x4d/0x80 arch/x86/kernel/process.c:147
 ret_from_fork_asm+0x1b/0x30 arch/x86/entry/entry_64.S:293

Other than blk_mq_clear_flush_rq_mapping(), the flag is only used in
blk_register_queue() from the initialization path, hence it's safe not
to clear the flag in del_gendisk. And since QUEUE_FLAG_REGISTERED
already makes sure that a queue is only registered once, there is no
need to test the flag there either.

Fixes: 6cfeadbff3f8 ("blk-mq: don't clear flush_rq from tags->rqs[]")
Depends-on: commit aec89dc5d421 ("block: keep q_usage_counter in atomic mode after del_gendisk")
Signed-off-by: Yu Kuai
Reviewed-by: Ming Lei
Link: https://lore.kernel.org/r/20241104110005.1412161-1-yukuai1@huaweicloud.com
Signed-off-by: Jens Axboe
---
 block/blk-sysfs.c | 6 ++----
 block/genhd.c     | 9 +++------
 2 files changed, 5 insertions(+), 10 deletions(-)

diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index d80a202cd170..4241aea84161 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -810,10 +810,8 @@ int blk_register_queue(struct gendisk *disk)
	 * faster to shut down and is made fully functional here as
	 * request_queues for non-existent devices never get registered.
	 */
-	if (!blk_queue_init_done(q)) {
-		blk_queue_flag_set(QUEUE_FLAG_INIT_DONE, q);
-		percpu_ref_switch_to_percpu(&q->q_usage_counter);
-	}
+	blk_queue_flag_set(QUEUE_FLAG_INIT_DONE, q);
+	percpu_ref_switch_to_percpu(&q->q_usage_counter);
 
 	return ret;
 
diff --git a/block/genhd.c b/block/genhd.c
index 9130e163e191..79230c109fca 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -742,13 +742,10 @@ void del_gendisk(struct gendisk *disk)
	 * If the disk does not own the queue, allow using passthrough requests
	 * again. Else leave the queue frozen to fail all I/O.
	 */
-	if (!test_bit(GD_OWNS_QUEUE, &disk->state)) {
-		blk_queue_flag_clear(QUEUE_FLAG_INIT_DONE, q);
+	if (!test_bit(GD_OWNS_QUEUE, &disk->state))
 		__blk_mq_unfreeze_queue(q, true);
-	} else {
-		if (queue_is_mq(q))
-			blk_mq_exit_queue(q);
-	}
+	else if (queue_is_mq(q))
+		blk_mq_exit_queue(q);
 
	if (start_drain)
		blk_unfreeze_release_lock(q, true, queue_dying);
-- 
2.51.0

From 46fd48ab3ea3eb3bb215684bd66ea3d260b091a9 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig
Date: Tue, 19 Nov 2024 08:26:02 +0100
Subject: [PATCH 06/16] block: return unsigned int from bdev_io_min

The underlying limit is defined as an unsigned int, so return that from
bdev_io_min as well.
Fixes: ac481c20ef8f ("block: Topology ioctls")
Signed-off-by: Christoph Hellwig
Reviewed-by: Martin K. Petersen
Reviewed-by: John Garry
Link: https://lore.kernel.org/r/20241119072602.1059488-1-hch@lst.de
Signed-off-by: Jens Axboe
---
 include/linux/blkdev.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index a1fd0ddce5cf..195db38fda16 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -1261,7 +1261,7 @@ static inline unsigned int queue_io_min(const struct request_queue *q)
 	return q->limits.io_min;
 }
 
-static inline int bdev_io_min(struct block_device *bdev)
+static inline unsigned int bdev_io_min(struct block_device *bdev)
 {
 	return queue_io_min(bdev_get_queue(bdev));
 }
-- 
2.51.0

From b49125574cae26458d4aa02ce8f4523ba9a2a328 Mon Sep 17 00:00:00 2001
From: OGAWA Hirofumi
Date: Tue, 19 Nov 2024 23:42:23 +0900
Subject: [PATCH 07/16] loop: Fix ABBA locking race

Currently, loop calls vfs_statfs() while holding q->limits_lock. If the
filesystem takes locks in its vfs_statfs callback, this may lead to an
ABBA deadlock (at least FAT actually has this issue). So call
vfs_statfs() outside q->limits_lock instead; there is no reason to hold
q->limits_lock while getting the discard configuration.

Chain exists of:
  &sbi->fat_lock --> &q->q_usage_counter(io)#17 --> &q->limits_lock

 Possible unsafe locking scenario:

       CPU0                    CPU1
       ----                    ----
  lock(&q->limits_lock);
                               lock(&q->q_usage_counter(io)#17);
                               lock(&q->limits_lock);
  lock(&sbi->fat_lock);

 *** DEADLOCK ***

Reported-by: syzbot+a5d8c609c02f508672cc@syzkaller.appspotmail.com
Closes: https://syzkaller.appspot.com/bug?extid=a5d8c609c02f508672cc
Reviewed-by: Ming Lei
Signed-off-by: OGAWA Hirofumi
Signed-off-by: Jens Axboe
---
 drivers/block/loop.c | 30 +++++++++++++++---------------
 1 file changed, 15 insertions(+), 15 deletions(-)

diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index fe9bb4fb5f1b..8f6761c27c68 100644
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -770,12 +770,11 @@ static void loop_sysfs_exit(struct loop_device *lo)
 				   &loop_attribute_group);
 }
 
-static void loop_config_discard(struct loop_device *lo,
-		struct queue_limits *lim)
+static void loop_get_discard_config(struct loop_device *lo,
+		u32 *granularity, u32 *max_discard_sectors)
 {
 	struct file *file = lo->lo_backing_file;
 	struct inode *inode = file->f_mapping->host;
-	u32 granularity = 0, max_discard_sectors = 0;
 	struct kstatfs sbuf;
 
 	/*
@@ -788,24 +787,17 @@ static void loop_config_discard(struct loop_device *lo,
 	if (S_ISBLK(inode->i_mode)) {
 		struct block_device *bdev = I_BDEV(inode);
 
-		max_discard_sectors = bdev_write_zeroes_sectors(bdev);
-		granularity = bdev_discard_granularity(bdev);
+		*max_discard_sectors = bdev_write_zeroes_sectors(bdev);
+		*granularity = bdev_discard_granularity(bdev);
 
 	/*
 	 * We use punch hole to reclaim the free space used by the
 	 * image a.k.a. discard.
*/ } else if (file->f_op->fallocate && !vfs_statfs(&file->f_path, &sbuf)) { - max_discard_sectors = UINT_MAX >> 9; - granularity = sbuf.f_bsize; + *max_discard_sectors = UINT_MAX >> 9; + *granularity = sbuf.f_bsize; } - - lim->max_hw_discard_sectors = max_discard_sectors; - lim->max_write_zeroes_sectors = max_discard_sectors; - if (max_discard_sectors) - lim->discard_granularity = granularity; - else - lim->discard_granularity = 0; } struct loop_worker { @@ -991,6 +983,7 @@ static int loop_reconfigure_limits(struct loop_device *lo, unsigned int bsize) struct inode *inode = file->f_mapping->host; struct block_device *backing_bdev = NULL; struct queue_limits lim; + u32 granularity = 0, max_discard_sectors = 0; if (S_ISBLK(inode->i_mode)) backing_bdev = I_BDEV(inode); @@ -1000,6 +993,8 @@ static int loop_reconfigure_limits(struct loop_device *lo, unsigned int bsize) if (!bsize) bsize = loop_default_blocksize(lo, backing_bdev); + loop_get_discard_config(lo, &granularity, &max_discard_sectors); + lim = queue_limits_start_update(lo->lo_queue); lim.logical_block_size = bsize; lim.physical_block_size = bsize; @@ -1009,7 +1004,12 @@ static int loop_reconfigure_limits(struct loop_device *lo, unsigned int bsize) lim.features |= BLK_FEAT_WRITE_CACHE; if (backing_bdev && !bdev_nonrot(backing_bdev)) lim.features |= BLK_FEAT_ROTATIONAL; - loop_config_discard(lo, &lim); + lim.max_hw_discard_sectors = max_discard_sectors; + lim.max_write_zeroes_sectors = max_discard_sectors; + if (max_discard_sectors) + lim.discard_granularity = granularity; + else + lim.discard_granularity = 0; return queue_limits_commit_update(lo->lo_queue, &lim); } -- 2.51.0 From e924da7d6622b72f9eee78a3aad3e75b859da341 Mon Sep 17 00:00:00 2001 From: John Garry Date: Tue, 12 Nov 2024 09:21:44 +0000 Subject: [PATCH 08/16] block: Drop granularity check in queue_limit_discard_alignment() lim->discard_granularity is always at least SECTOR_SIZE, so drop the pointless check for granularity less than SECTOR_SIZE. Signed-off-by: John Garry Reviewed-by: Christoph Hellwig Link: https://lore.kernel.org/r/20241112092144.4059847-1-john.g.garry@oracle.com Signed-off-by: Jens Axboe --- block/blk-settings.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/block/blk-settings.c b/block/blk-settings.c index 0fe16f987cde..2f1005da530c 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c @@ -465,8 +465,6 @@ static unsigned int queue_limit_discard_alignment( /* Why are these in bytes, not sectors? */ alignment = lim->discard_alignment >> SECTOR_SHIFT; granularity = lim->discard_granularity >> SECTOR_SHIFT; - if (!granularity) - return 0; /* Offset of the partition start in 'granularity' sectors */ offset = sector_div(sector, granularity); -- 2.51.0 From 34c1227035b3ab930a1ae6ab6f22fec1af8ab09e Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Tue, 19 Nov 2024 11:06:46 +0800 Subject: [PATCH 09/16] ublk: fix error code for unsupported command ENOTSUPP is for kernel use only, and shouldn't be sent to userspace. Fix it by replacing it with EOPNOTSUPP. 
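For illustration of the user-visible difference, here is a hypothetical
user-space probe (not part of this patch): ENOTSUPP is the
kernel-internal value 524, which C libraries cannot translate into a
message, while EOPNOTSUPP is a standard errno. On glibc:

  #include <errno.h>
  #include <stdio.h>
  #include <string.h>

  int main(void)
  {
          /* 524 is ENOTSUPP in include/linux/errno.h; it is not in uapi */
          printf("524 (kernel ENOTSUPP): %s\n", strerror(524));
          printf("EOPNOTSUPP:            %s\n", strerror(EOPNOTSUPP));
          return 0;
  }

This prints "Unknown error 524" for the former and "Operation not
supported" for the latter, which is exactly why leaking ENOTSUPP to
user space is a bug.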
Cc: stable@vger.kernel.org
Fixes: bfbcef036396 ("ublk_drv: move ublk_get_device_from_id into ublk_ctrl_uring_cmd")
Signed-off-by: Ming Lei
Link: https://lore.kernel.org/r/20241119030646.2319030-1-ming.lei@redhat.com
Signed-off-by: Jens Axboe
---
 drivers/block/ublk_drv.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c
index c6d18cd8af44..d4aed12dd436 100644
--- a/drivers/block/ublk_drv.c
+++ b/drivers/block/ublk_drv.c
@@ -3041,7 +3041,7 @@ static int ublk_ctrl_uring_cmd(struct io_uring_cmd *cmd,
 		ret = ublk_ctrl_end_recovery(ub, cmd);
 		break;
 	default:
-		ret = -ENOTSUPP;
+		ret = -EOPNOTSUPP;
 		break;
 	}
 
-- 
2.51.0

From d00eea91deaf363f83599532cb49fa528ab8e00e Mon Sep 17 00:00:00 2001
From: John Garry
Date: Mon, 18 Nov 2024 10:50:14 +0000
Subject: [PATCH 10/16] block: Add extra checks in blk_validate_atomic_write_limits()

So far it is expected that the limits passed are valid. In the future,
atomic writes will be supported for stacked block devices, and
calculating the limits there will be complicated, so add extra sanity
checks to ensure that the values are always valid.

Reviewed-by: Christoph Hellwig
Signed-off-by: John Garry
Reviewed-by: Martin K. Petersen
Link: https://lore.kernel.org/r/20241118105018.1870052-2-john.g.garry@oracle.com
Signed-off-by: Jens Axboe
---
 block/blk-settings.c | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)

diff --git a/block/blk-settings.c b/block/blk-settings.c
index 2f1005da530c..03c67620f98a 100644
--- a/block/blk-settings.c
+++ b/block/blk-settings.c
@@ -178,9 +178,26 @@ static void blk_validate_atomic_write_limits(struct queue_limits *lim)
 	if (!lim->atomic_write_hw_max)
 		goto unsupported;
 
+	if (WARN_ON_ONCE(!is_power_of_2(lim->atomic_write_hw_unit_min)))
+		goto unsupported;
+
+	if (WARN_ON_ONCE(!is_power_of_2(lim->atomic_write_hw_unit_max)))
+		goto unsupported;
+
+	if (WARN_ON_ONCE(lim->atomic_write_hw_unit_min >
+			 lim->atomic_write_hw_unit_max))
+		goto unsupported;
+
+	if (WARN_ON_ONCE(lim->atomic_write_hw_unit_max >
+			 lim->atomic_write_hw_max))
+		goto unsupported;
+
 	boundary_sectors = lim->atomic_write_hw_boundary >> SECTOR_SHIFT;
 
 	if (boundary_sectors) {
+		if (WARN_ON_ONCE(lim->atomic_write_hw_max >
+				 lim->atomic_write_hw_boundary))
+			goto unsupported;
 		/*
 		 * A feature of boundary support is that it disallows bios to
 		 * be merged which would result in a merged request which
-- 
2.51.0

From d7f36dc446e894e0f57b5f05c5628f03c5f9e2d2 Mon Sep 17 00:00:00 2001
From: John Garry
Date: Mon, 18 Nov 2024 10:50:15 +0000
Subject: [PATCH 11/16] block: Support atomic writes limits for stacked devices

Allow stacked devices to support atomic writes by aggregating the
minimum capability of all bottom devices.

Flag BLK_FEAT_ATOMIC_WRITES_STACKED is set for stacked devices which
have been enabled to support atomic writes.

Some things to note on the implementation:
- For simplicity, all bottom devices must have the same atomic write
  boundary value (if any).
- The atomic write boundary must already be a power-of-2, but this
  restriction could be relaxed. Furthermore, it is now required that
  the chunk sectors for a top device be aligned with this boundary.
- If a bottom device's atomic write unit min/max are not aligned with
  the top device's chunk sectors, the top device's atomic write unit
  min/max are reduced to a value which works for the chunk sectors,
  as illustrated in the sketch below.
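As a rough standalone illustration of that reduction (a user-space
sketch with toy values and a made-up helper name, not the kernel code
itself): starting from the bottom device's power-of-2 unit_max, halve
it until it divides the chunk size, so a 16K unit_max against a 24K
chunk settles at 8K.

  #include <assert.h>
  #include <stdio.h>

  /* Largest power-of-2 value <= unit_max that divides the chunk size. */
  static unsigned int fit_unit_to_chunk(unsigned int unit_max,
                                        unsigned int chunk)
  {
          assert(unit_max && !(unit_max & (unit_max - 1))); /* power of 2 */
          while (chunk % unit_max)
                  unit_max /= 2;
          return unit_max;
  }

  int main(void)
  {
          /* scenario from the code comment: 16K unit_max, 24K chunk */
          printf("%uK\n", fit_unit_to_chunk(16 * 1024, 24 * 1024) / 1024);
          return 0;   /* prints 8K */
  }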
Reviewed-by: Christoph Hellwig
Signed-off-by: John Garry
Reviewed-by: Martin K. Petersen
Link: https://lore.kernel.org/r/20241118105018.1870052-3-john.g.garry@oracle.com
Signed-off-by: Jens Axboe
---
 block/blk-settings.c   | 115 +++++++++++++++++++++++++++++++++++++++++
 include/linux/blkdev.h |   4 ++
 2 files changed, 119 insertions(+)

diff --git a/block/blk-settings.c b/block/blk-settings.c
index 03c67620f98a..8f09e33f41f6 100644
--- a/block/blk-settings.c
+++ b/block/blk-settings.c
@@ -501,6 +501,119 @@ static unsigned int blk_round_down_sectors(unsigned int sectors, unsigned int lb
 	return sectors;
 }
 
+/* Check if second and later bottom devices are compliant */
+static bool blk_stack_atomic_writes_tail(struct queue_limits *t,
+				struct queue_limits *b)
+{
+	/* We're not going to support different boundary sizes.. yet */
+	if (t->atomic_write_hw_boundary != b->atomic_write_hw_boundary)
+		return false;
+
+	/* Can't support this */
+	if (t->atomic_write_hw_unit_min > b->atomic_write_hw_unit_max)
+		return false;
+
+	/* Or this */
+	if (t->atomic_write_hw_unit_max < b->atomic_write_hw_unit_min)
+		return false;
+
+	t->atomic_write_hw_max = min(t->atomic_write_hw_max,
+				b->atomic_write_hw_max);
+	t->atomic_write_hw_unit_min = max(t->atomic_write_hw_unit_min,
+				b->atomic_write_hw_unit_min);
+	t->atomic_write_hw_unit_max = min(t->atomic_write_hw_unit_max,
+				b->atomic_write_hw_unit_max);
+	return true;
+}
+
+/* Check for valid boundary of first bottom device */
+static bool blk_stack_atomic_writes_boundary_head(struct queue_limits *t,
+				struct queue_limits *b)
+{
+	/*
+	 * Ensure atomic write boundary is aligned with chunk sectors. Stacked
+	 * devices store chunk sectors in t->io_min.
+	 */
+	if (b->atomic_write_hw_boundary > t->io_min &&
+	    b->atomic_write_hw_boundary % t->io_min)
+		return false;
+	if (t->io_min > b->atomic_write_hw_boundary &&
+	    t->io_min % b->atomic_write_hw_boundary)
+		return false;
+
+	t->atomic_write_hw_boundary = b->atomic_write_hw_boundary;
+	return true;
+}
+
+
+/* Check stacking of first bottom device */
+static bool blk_stack_atomic_writes_head(struct queue_limits *t,
+				struct queue_limits *b)
+{
+	if (b->atomic_write_hw_boundary &&
+	    !blk_stack_atomic_writes_boundary_head(t, b))
+		return false;
+
+	if (t->io_min <= SECTOR_SIZE) {
+		/* No chunk sectors, so use bottom device values directly */
+		t->atomic_write_hw_unit_max = b->atomic_write_hw_unit_max;
+		t->atomic_write_hw_unit_min = b->atomic_write_hw_unit_min;
+		t->atomic_write_hw_max = b->atomic_write_hw_max;
+		return true;
+	}
+
+	/*
+	 * Find values for limits which work for chunk size.
+	 * b->atomic_write_hw_unit_{min, max} may not be aligned with chunk
+	 * size (t->io_min), as chunk size is not restricted to a power-of-2.
+	 * So we need to find highest power-of-2 which works for the chunk
+	 * size.
+	 * As an example scenario, we could have b->unit_max = 16K and
+	 * t->io_min = 24K. For this case, reduce t->unit_max to a value
+	 * aligned with both limits, i.e. 8K in this example.
+	 */
+	t->atomic_write_hw_unit_max = b->atomic_write_hw_unit_max;
+	while (t->io_min % t->atomic_write_hw_unit_max)
+		t->atomic_write_hw_unit_max /= 2;
+
+	t->atomic_write_hw_unit_min = min(b->atomic_write_hw_unit_min,
+					  t->atomic_write_hw_unit_max);
+	t->atomic_write_hw_max = min(b->atomic_write_hw_max, t->io_min);
+
+	return true;
+}
+
+static void blk_stack_atomic_writes_limits(struct queue_limits *t,
+				struct queue_limits *b)
+{
+	if (!(t->features & BLK_FEAT_ATOMIC_WRITES_STACKED))
+		goto unsupported;
+
+	if (!b->atomic_write_unit_min)
+		goto unsupported;
+
+	/*
+	 * If atomic_write_hw_max is set, we have already stacked 1x bottom
+	 * device, so check for compliance.
+	 */
+	if (t->atomic_write_hw_max) {
+		if (!blk_stack_atomic_writes_tail(t, b))
+			goto unsupported;
+		return;
+	}
+
+	if (!blk_stack_atomic_writes_head(t, b))
+		goto unsupported;
+	return;
+
+unsupported:
+	t->atomic_write_hw_max = 0;
+	t->atomic_write_hw_unit_max = 0;
+	t->atomic_write_hw_unit_min = 0;
+	t->atomic_write_hw_boundary = 0;
+	t->features &= ~BLK_FEAT_ATOMIC_WRITES_STACKED;
+}
+
 /**
  * blk_stack_limits - adjust queue_limits for stacked devices
  * @t:	the stacking driver limits (top device)
@@ -661,6 +774,8 @@ int blk_stack_limits(struct queue_limits *t, struct queue_limits *b,
 		t->zone_write_granularity = 0;
 		t->max_zone_append_sectors = 0;
 	}
+	blk_stack_atomic_writes_limits(t, b);
+
 	return ret;
 }
 EXPORT_SYMBOL(blk_stack_limits);
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 195db38fda16..31867de88213 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -333,6 +333,10 @@ typedef unsigned int __bitwise blk_features_t;
 #define BLK_FEAT_RAID_PARTIAL_STRIPES_EXPENSIVE \
 	((__force blk_features_t)(1u << 15))
 
+/* stacked device can/does support atomic writes */
+#define BLK_FEAT_ATOMIC_WRITES_STACKED \
+	((__force blk_features_t)(1u << 16))
+
 /*
  * Flags automatically inherited when stacking limits.
  */
-- 
2.51.0

From fa6fec82811bc6ebd3c4337ae4dae36c802c0fc1 Mon Sep 17 00:00:00 2001
From: John Garry
Date: Mon, 18 Nov 2024 10:50:16 +0000
Subject: [PATCH 12/16] md/raid0: Atomic write support

Set BLK_FEAT_ATOMIC_WRITES_STACKED to enable atomic writes. All other
stacked device request queue limits should automatically be set
properly. As for the atomic write max bytes limit, it will be set at
hw_max_sectors, and that is limited by the stripe width, which is what
we want.

Reviewed-by: Yu Kuai
Signed-off-by: John Garry
Reviewed-by: Martin K. Petersen
Link: https://lore.kernel.org/r/20241118105018.1870052-4-john.g.garry@oracle.com
Signed-off-by: Jens Axboe
---
 drivers/md/raid0.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c
index baaf5f8b80ae..7049ec7fb8eb 100644
--- a/drivers/md/raid0.c
+++ b/drivers/md/raid0.c
@@ -384,6 +384,7 @@ static int raid0_set_limits(struct mddev *mddev)
 	lim.max_write_zeroes_sectors = mddev->chunk_sectors;
 	lim.io_min = mddev->chunk_sectors << 9;
 	lim.io_opt = lim.io_min * mddev->raid_disks;
+	lim.features |= BLK_FEAT_ATOMIC_WRITES_STACKED;
 	err = mddev_stack_rdev_limits(mddev, &lim, MDDEV_STACK_INTEGRITY);
 	if (err) {
 		queue_limits_cancel_update(mddev->gendisk->queue);
-- 
2.51.0

From f2a38abf5f1c5aeb3be8e9f4d3d815c867fff7ca Mon Sep 17 00:00:00 2001
From: John Garry
Date: Mon, 18 Nov 2024 10:50:17 +0000
Subject: [PATCH 13/16] md/raid1: Atomic write support

Set BLK_FEAT_ATOMIC_WRITES_STACKED to enable atomic writes. For an
attempt to atomically write to a region which has bad blocks, error the
write, as we just cannot do this. It is unlikely to find devices which
support atomic writes and also have bad blocks.
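For context, a minimal sketch of how an application would exercise this
path once the array advertises atomic write support (the device path
and I/O size are assumptions, and RWF_ATOMIC requires v6.11+ uapi
headers):

  #define _GNU_SOURCE
  #include <fcntl.h>
  #include <stdio.h>
  #include <string.h>
  #include <sys/uio.h>
  #include <unistd.h>

  int main(void)
  {
          static char buf[4096] __attribute__((aligned(4096)));
          struct iovec iov = { .iov_base = buf, .iov_len = sizeof(buf) };
          int fd = open("/dev/md0", O_WRONLY | O_DIRECT); /* assumed array */

          if (fd < 0)
                  return 1;
          memset(buf, 0xab, sizeof(buf));
          /* all-or-nothing write; fails if the advertised limits aren't met */
          if (pwritev2(fd, &iov, 1, 0, RWF_ATOMIC) < 0)
                  perror("pwritev2(RWF_ATOMIC)");
          close(fd);
          return 0;
  }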
Reviewed-by: Yu Kuai
Signed-off-by: John Garry
Reviewed-by: Martin K. Petersen
Link: https://lore.kernel.org/r/20241118105018.1870052-5-john.g.garry@oracle.com
Signed-off-by: Jens Axboe
---
 drivers/md/raid1.c | 20 ++++++++++++++++++--
 1 file changed, 18 insertions(+), 2 deletions(-)

diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index a5adf08ee174..519c56f0ee3d 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -1571,7 +1571,21 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio,
 				continue;
 			}
 			if (is_bad) {
-				int good_sectors = first_bad - r1_bio->sector;
+				int good_sectors;
+
+				/*
+				 * We cannot atomically write this, so just
+				 * error in that case. It could be possible to
+				 * atomically write other mirrors, but the
+				 * complexity of supporting that is not worth
+				 * the benefit.
+				 */
+				if (bio->bi_opf & REQ_ATOMIC) {
+					error = -EIO;
+					goto err_handle;
+				}
+
+				good_sectors = first_bad - r1_bio->sector;
 				if (good_sectors < max_sectors)
 					max_sectors = good_sectors;
 			}
@@ -1657,7 +1671,8 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio,
 		mbio->bi_iter.bi_sector	= (r1_bio->sector + rdev->data_offset);
 		mbio->bi_end_io	= raid1_end_write_request;
-		mbio->bi_opf = bio_op(bio) | (bio->bi_opf & (REQ_SYNC | REQ_FUA));
+		mbio->bi_opf = bio_op(bio) |
+			(bio->bi_opf & (REQ_SYNC | REQ_FUA | REQ_ATOMIC));
 		if (test_bit(FailFast, &rdev->flags) &&
 		    !test_bit(WriteMostly, &rdev->flags) &&
 		    conf->raid_disks - mddev->degraded > 1)
@@ -3224,6 +3239,7 @@ static int raid1_set_limits(struct mddev *mddev)
 
 	md_init_stacking_limits(&lim);
 	lim.max_write_zeroes_sectors = 0;
+	lim.features |= BLK_FEAT_ATOMIC_WRITES_STACKED;
 	err = mddev_stack_rdev_limits(mddev, &lim, MDDEV_STACK_INTEGRITY);
 	if (err) {
 		queue_limits_cancel_update(mddev->gendisk->queue);
-- 
2.51.0

From a1d9b4fd42d93f46c11e7e9d919a55a3f6ca6126 Mon Sep 17 00:00:00 2001
From: John Garry
Date: Mon, 18 Nov 2024 10:50:18 +0000
Subject: [PATCH 14/16] md/raid10: Atomic write support

Set BLK_FEAT_ATOMIC_WRITES_STACKED to enable atomic writes. For an
attempt to atomically write to a region which has bad blocks, error the
write, as we just cannot do this. It is unlikely to find devices which
support atomic writes and also have bad blocks.

Reviewed-by: Yu Kuai
Signed-off-by: John Garry
Reviewed-by: Martin K.
Petersen Link: https://lore.kernel.org/r/20241118105018.1870052-6-john.g.garry@oracle.com Signed-off-by: Jens Axboe --- drivers/md/raid10.c | 20 ++++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 8c7f5daa073a..f779d8225667 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -1255,6 +1255,7 @@ static void raid10_write_one_disk(struct mddev *mddev, struct r10bio *r10_bio, const enum req_op op = bio_op(bio); const blk_opf_t do_sync = bio->bi_opf & REQ_SYNC; const blk_opf_t do_fua = bio->bi_opf & REQ_FUA; + const blk_opf_t do_atomic = bio->bi_opf & REQ_ATOMIC; unsigned long flags; struct r10conf *conf = mddev->private; struct md_rdev *rdev; @@ -1273,7 +1274,7 @@ static void raid10_write_one_disk(struct mddev *mddev, struct r10bio *r10_bio, mbio->bi_iter.bi_sector = (r10_bio->devs[n_copy].addr + choose_data_offset(r10_bio, rdev)); mbio->bi_end_io = raid10_end_write_request; - mbio->bi_opf = op | do_sync | do_fua; + mbio->bi_opf = op | do_sync | do_fua | do_atomic; if (!replacement && test_bit(FailFast, &conf->mirrors[devnum].rdev->flags) && enough(conf, devnum)) @@ -1468,7 +1469,21 @@ static void raid10_write_request(struct mddev *mddev, struct bio *bio, continue; } if (is_bad) { - int good_sectors = first_bad - dev_sector; + int good_sectors; + + /* + * We cannot atomically write this, so just + * error in that case. It could be possible to + * atomically write other mirrors, but the + * complexity of supporting that is not worth + * the benefit. + */ + if (bio->bi_opf & REQ_ATOMIC) { + error = -EIO; + goto err_handle; + } + + good_sectors = first_bad - dev_sector; if (good_sectors < max_sectors) max_sectors = good_sectors; } @@ -4025,6 +4040,7 @@ static int raid10_set_queue_limits(struct mddev *mddev) lim.max_write_zeroes_sectors = 0; lim.io_min = mddev->chunk_sectors << 9; lim.io_opt = lim.io_min * raid10_nr_stripes(conf); + lim.features |= BLK_FEAT_ATOMIC_WRITES_STACKED; err = mddev_stack_rdev_limits(mddev, &lim, MDDEV_STACK_INTEGRITY); if (err) { queue_limits_cancel_update(mddev->gendisk->queue); -- 2.51.0 From cf5a60d971c7b59efb89927919404be655a9e35a Mon Sep 17 00:00:00 2001 From: Zach Wade Date: Tue, 19 Nov 2024 23:34:10 +0800 Subject: [PATCH 15/16] Revert "block, bfq: merge bfq_release_process_ref() into bfq_put_cooperator()" This reverts commit bc3b1e9e7c50e1de0f573eea3871db61dd4787de. The bic is associated with sync_bfqq, and bfq_release_process_ref cannot be put into bfq_put_cooperator. kasan report: [ 400.347277] ================================================================== [ 400.347287] BUG: KASAN: slab-use-after-free in bic_set_bfqq+0x200/0x230 [ 400.347420] Read of size 8 at addr ffff88881cab7d60 by task dockerd/5800 [ 400.347430] [ 400.347436] CPU: 24 UID: 0 PID: 5800 Comm: dockerd Kdump: loaded Tainted: G E 6.12.0 #32 [ 400.347450] Tainted: [E]=UNSIGNED_MODULE [ 400.347454] Hardware name: VMware, Inc. 
VMware20,1/440BX Desktop Reference Platform, BIOS VMW201.00V.20192059.B64.2207280713 07/28/2022 [ 400.347460] Call Trace: [ 400.347464] [ 400.347468] dump_stack_lvl+0x5d/0x80 [ 400.347490] print_report+0x174/0x505 [ 400.347521] kasan_report+0xe0/0x160 [ 400.347541] bic_set_bfqq+0x200/0x230 [ 400.347549] bfq_bic_update_cgroup+0x419/0x740 [ 400.347560] bfq_bio_merge+0x133/0x320 [ 400.347584] blk_mq_submit_bio+0x1761/0x1e20 [ 400.347625] __submit_bio+0x28b/0x7b0 [ 400.347664] submit_bio_noacct_nocheck+0x6b2/0xd30 [ 400.347690] iomap_readahead+0x50c/0x680 [ 400.347731] read_pages+0x17f/0x9c0 [ 400.347785] page_cache_ra_unbounded+0x366/0x4a0 [ 400.347795] filemap_fault+0x83d/0x2340 [ 400.347819] __xfs_filemap_fault+0x11a/0x7d0 [xfs] [ 400.349256] __do_fault+0xf1/0x610 [ 400.349270] do_fault+0x977/0x11a0 [ 400.349281] __handle_mm_fault+0x5d1/0x850 [ 400.349314] handle_mm_fault+0x1f8/0x560 [ 400.349324] do_user_addr_fault+0x324/0x970 [ 400.349337] exc_page_fault+0x76/0xf0 [ 400.349350] asm_exc_page_fault+0x26/0x30 [ 400.349360] RIP: 0033:0x55a480d77375 [ 400.349384] Code: cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc 49 3b 66 10 0f 86 ae 02 00 00 55 48 89 e5 48 83 ec 58 48 8b 10 <83> 7a 10 00 0f 84 27 02 00 00 44 0f b6 42 28 44 0f b6 4a 29 41 80 [ 400.349392] RSP: 002b:00007f18c37fd8b8 EFLAGS: 00010216 [ 400.349401] RAX: 00007f18c37fd9d0 RBX: 0000000000000000 RCX: 0000000000000000 [ 400.349407] RDX: 000055a484407d38 RSI: 000000c000e8b0c0 RDI: 0000000000000000 [ 400.349412] RBP: 00007f18c37fd910 R08: 000055a484017f60 R09: 000055a484066f80 [ 400.349417] R10: 0000000000194000 R11: 0000000000000005 R12: 0000000000000008 [ 400.349422] R13: 0000000000000000 R14: 000000c000476a80 R15: 0000000000000000 [ 400.349430] [ 400.349452] [ 400.349454] Allocated by task 5800: [ 400.349459] kasan_save_stack+0x30/0x50 [ 400.349469] kasan_save_track+0x14/0x30 [ 400.349475] __kasan_slab_alloc+0x89/0x90 [ 400.349482] kmem_cache_alloc_node_noprof+0xdc/0x2a0 [ 400.349492] bfq_get_queue+0x1ef/0x1100 [ 400.349502] __bfq_get_bfqq_handle_split+0x11a/0x510 [ 400.349511] bfq_insert_requests+0xf55/0x9030 [ 400.349519] blk_mq_flush_plug_list+0x446/0x14c0 [ 400.349527] __blk_flush_plug+0x27c/0x4e0 [ 400.349534] blk_finish_plug+0x52/0xa0 [ 400.349540] _xfs_buf_ioapply+0x739/0xc30 [xfs] [ 400.350246] __xfs_buf_submit+0x1b2/0x640 [xfs] [ 400.350967] xfs_buf_read_map+0x306/0xa20 [xfs] [ 400.351672] xfs_trans_read_buf_map+0x285/0x7d0 [xfs] [ 400.352386] xfs_imap_to_bp+0x107/0x270 [xfs] [ 400.353077] xfs_iget+0x70d/0x1eb0 [xfs] [ 400.353786] xfs_lookup+0x2ca/0x3a0 [xfs] [ 400.354506] xfs_vn_lookup+0x14e/0x1a0 [xfs] [ 400.355197] __lookup_slow+0x19c/0x340 [ 400.355204] lookup_one_unlocked+0xfc/0x120 [ 400.355211] ovl_lookup_single+0x1b3/0xcf0 [overlay] [ 400.355255] ovl_lookup_layer+0x316/0x490 [overlay] [ 400.355295] ovl_lookup+0x844/0x1fd0 [overlay] [ 400.355351] lookup_one_qstr_excl+0xef/0x150 [ 400.355357] do_unlinkat+0x22a/0x620 [ 400.355366] __x64_sys_unlinkat+0x109/0x1e0 [ 400.355375] do_syscall_64+0x82/0x160 [ 400.355384] entry_SYSCALL_64_after_hwframe+0x76/0x7e [ 400.355393] [ 400.355395] Freed by task 5800: [ 400.355400] kasan_save_stack+0x30/0x50 [ 400.355407] kasan_save_track+0x14/0x30 [ 400.355413] kasan_save_free_info+0x3b/0x70 [ 400.355422] __kasan_slab_free+0x4f/0x70 [ 400.355429] kmem_cache_free+0x176/0x520 [ 400.355438] bfq_put_queue+0x67e/0x980 [ 400.355447] bfq_bic_update_cgroup+0x407/0x740 [ 400.355454] bfq_bio_merge+0x133/0x320 [ 400.355460] blk_mq_submit_bio+0x1761/0x1e20 [ 400.355467] 
__submit_bio+0x28b/0x7b0 [ 400.355473] submit_bio_noacct_nocheck+0x6b2/0xd30 [ 400.355480] iomap_readahead+0x50c/0x680 [ 400.355490] read_pages+0x17f/0x9c0 [ 400.355498] page_cache_ra_unbounded+0x366/0x4a0 [ 400.355505] filemap_fault+0x83d/0x2340 [ 400.355514] __xfs_filemap_fault+0x11a/0x7d0 [xfs] [ 400.356204] __do_fault+0xf1/0x610 [ 400.356213] do_fault+0x977/0x11a0 [ 400.356221] __handle_mm_fault+0x5d1/0x850 [ 400.356230] handle_mm_fault+0x1f8/0x560 [ 400.356238] do_user_addr_fault+0x324/0x970 [ 400.356248] exc_page_fault+0x76/0xf0 [ 400.356258] asm_exc_page_fault+0x26/0x30 [ 400.356266] [ 400.356269] The buggy address belongs to the object at ffff88881cab7bc0 which belongs to the cache bfq_queue of size 576 [ 400.356276] The buggy address is located 416 bytes inside of freed 576-byte region [ffff88881cab7bc0, ffff88881cab7e00) [ 400.356285] [ 400.356287] The buggy address belongs to the physical page: [ 400.356292] page: refcount:1 mapcount:0 mapping:0000000000000000 index:0xffff88881cab0b00 pfn:0x81cab0 [ 400.356300] head: order:3 mapcount:0 entire_mapcount:0 nr_pages_mapped:0 pincount:0 [ 400.356323] flags: 0x50000000000040(head|node=1|zone=2) [ 400.356331] page_type: f5(slab) [ 400.356340] raw: 0050000000000040 ffff88880a00c280 dead000000000122 0000000000000000 [ 400.356347] raw: ffff88881cab0b00 00000000802e0025 00000001f5000000 0000000000000000 [ 400.356354] head: 0050000000000040 ffff88880a00c280 dead000000000122 0000000000000000 [ 400.356359] head: ffff88881cab0b00 00000000802e0025 00000001f5000000 0000000000000000 [ 400.356365] head: 0050000000000003 ffffea002072ac01 ffffffffffffffff 0000000000000000 [ 400.356370] head: 0000000000000008 0000000000000000 00000000ffffffff 0000000000000000 [ 400.356378] page dumped because: kasan: bad access detected [ 400.356381] [ 400.356383] Memory state around the buggy address: [ 400.356387] ffff88881cab7c00: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb [ 400.356392] ffff88881cab7c80: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb [ 400.356397] >ffff88881cab7d00: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb [ 400.356400] ^ [ 400.356405] ffff88881cab7d80: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb [ 400.356409] ffff88881cab7e00: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc [ 400.356413] ================================================================== Cc: stable@vger.kernel.org Fixes: bc3b1e9e7c50 ("block, bfq: merge bfq_release_process_ref() into bfq_put_cooperator()") Signed-off-by: Zach Wade Cc: Ding Hui Reviewed-by: Yu Kuai Link: https://lore.kernel.org/r/20241119153410.2546-1-zachwade.k@gmail.com Signed-off-by: Jens Axboe --- block/bfq-cgroup.c | 1 + block/bfq-iosched.c | 6 ++++-- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/block/bfq-cgroup.c b/block/bfq-cgroup.c index e831aedb4643..9fb9f3533150 100644 --- a/block/bfq-cgroup.c +++ b/block/bfq-cgroup.c @@ -736,6 +736,7 @@ static void bfq_sync_bfqq_move(struct bfq_data *bfqd, */ bfq_put_cooperator(sync_bfqq); bic_set_bfqq(bic, NULL, true, act_idx); + bfq_release_process_ref(bfqd, sync_bfqq); } } diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index 0747d9d0e48c..28c2bb06e859 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -5434,8 +5434,6 @@ void bfq_put_cooperator(struct bfq_queue *bfqq) bfq_put_queue(__bfqq); __bfqq = next; } - - bfq_release_process_ref(bfqq->bfqd, bfqq); } static void bfq_exit_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq) @@ -5448,6 +5446,8 @@ static void bfq_exit_bfqq(struct bfq_data *bfqd, struct bfq_queue 
*bfqq) bfq_log_bfqq(bfqd, bfqq, "exit_bfqq: %p, %d", bfqq, bfqq->ref); bfq_put_cooperator(bfqq); + + bfq_release_process_ref(bfqd, bfqq); } static void bfq_exit_icq_bfqq(struct bfq_io_cq *bic, bool is_sync, @@ -6734,6 +6734,8 @@ bfq_split_bfqq(struct bfq_io_cq *bic, struct bfq_queue *bfqq) bic_set_bfqq(bic, NULL, true, bfqq->actuator_idx); bfq_put_cooperator(bfqq); + + bfq_release_process_ref(bfqq->bfqd, bfqq); return NULL; } -- 2.51.0 From dcbb598e689e30d4636201d35582e167d1b8dfa4 Mon Sep 17 00:00:00 2001 From: Suraj Sonawane Date: Tue, 19 Nov 2024 22:14:12 +0530 Subject: [PATCH 16/16] block: blk-mq: fix uninit-value in blk_rq_prep_clone and refactor Fix an issue detected by the `smatch` tool: block/blk-mq.c:3314 blk_rq_prep_clone() error: uninitialized symbol 'bio'. This patch refactors `blk_rq_prep_clone()` to improve code readability and ensure safety by addressing potential misuse of the `bio` variable: - Move the bio_put(bio); call to the bio_ctr error handling block, which is the only place where it can be triggered. - Move the bio variable into the __rq_for_each_bio loop scope. This change removes the need to set bio to NULL at the loop's end. discussion on why bio remains uninitialized: https://lore.kernel.org/lkml/20241004141037.43277-1-surajsonawane0215@gmail.com Summary of above discussion: - I pointed out that `bio` can remain uninitialized if the allocation with `bio_alloc_clone` fails. - Keith Busch explained that `bio` is initialized to `NULL` when `bio_alloc_clone()` fails, preventing uninitialized usage. - John Garry questioned whether `rq_src->bio` being `NULL` could leave `bio` uninitialized. Keith clarified that in such cases, `bio` is not referenced, so it does not need initialization. - Christoph Hellwig recommended code improvements: - move the bio_put to the bio_ctr error handling, which is the only case where it can happen - move the bio variable into the __rq_for_each_bio scope, which also removed the need to zero it at the end of the loop These changes enhance code clarity, address static analysis tool warnings, and make the function more maintainable. thread of previous version patch discussion: https://lore.kernel.org/lkml/20241004100842.9052-1-surajsonawane0215@gmail.com Signed-off-by: Suraj Sonawane Reviewed-by: Christoph Hellwig Link: https://lore.kernel.org/r/20241119164412.37609-1-surajsonawane0215@gmail.com Signed-off-by: Jens Axboe --- block/blk-mq.c | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/block/blk-mq.c b/block/blk-mq.c index 270cfd9fc6b0..abdf44736e08 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -3273,19 +3273,21 @@ int blk_rq_prep_clone(struct request *rq, struct request *rq_src, int (*bio_ctr)(struct bio *, struct bio *, void *), void *data) { - struct bio *bio, *bio_src; + struct bio *bio_src; if (!bs) bs = &fs_bio_set; __rq_for_each_bio(bio_src, rq_src) { - bio = bio_alloc_clone(rq->q->disk->part0, bio_src, gfp_mask, - bs); + struct bio *bio = bio_alloc_clone(rq->q->disk->part0, bio_src, + gfp_mask, bs); if (!bio) goto free_and_out; - if (bio_ctr && bio_ctr(bio, bio_src, data)) + if (bio_ctr && bio_ctr(bio, bio_src, data)) { + bio_put(bio); goto free_and_out; + } if (rq->bio) { rq->biotail->bi_next = bio; @@ -3293,7 +3295,6 @@ int blk_rq_prep_clone(struct request *rq, struct request *rq_src, } else { rq->bio = rq->biotail = bio; } - bio = NULL; } /* Copy attributes of the original request to the clone request. 
*/ @@ -3311,8 +3312,6 @@ int blk_rq_prep_clone(struct request *rq, struct request *rq_src, return 0; free_and_out: - if (bio) - bio_put(bio); blk_rq_unprep_clone(rq); return -ENOMEM; -- 2.51.0
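As a closing note on the last patch, here is a minimal user-space C
sketch (helper names are made up; this is not kernel code) of the
scoping pattern it applies: the per-iteration pointer lives inside the
loop, so no path can observe it uninitialized, and the clone is
released at the only failure site that owns one.

  #include <errno.h>
  #include <stdlib.h>

  struct buf { struct buf *next; };

  /* Hypothetical stand-ins for bio_alloc_clone()/bio_ctr/bio_put(). */
  static struct buf *clone_buf(const struct buf *src)
  {
          (void)src;
          return calloc(1, sizeof(struct buf));
  }

  static int fixup_buf(struct buf *dst, const struct buf *src)
  {
          (void)dst; (void)src;
          return 0;               /* pretend the per-clone callback succeeded */
  }

  static void put_buf(struct buf *b) { free(b); }

  static void unprep_list(struct buf *head)   /* mirrors blk_rq_unprep_clone() */
  {
          while (head) {
                  struct buf *next = head->next;
                  free(head);
                  head = next;
          }
  }

  static int clone_list(struct buf **head, const struct buf *src_head)
  {
          const struct buf *src;
          struct buf *tail = NULL;

          *head = NULL;
          for (src = src_head; src; src = src->next) {
                  /* scoped to the iteration: no path sees it uninitialized */
                  struct buf *b = clone_buf(src);

                  if (!b)
                          goto free_and_out;
                  if (fixup_buf(b, src)) {
                          put_buf(b);   /* the only site owning a live clone */
                          goto free_and_out;
                  }
                  if (tail)
                          tail->next = b;
                  else
                          *head = b;
                  tail = b;
          }
          return 0;

  free_and_out:
          unprep_list(*head);
          return -ENOMEM;
  }

  int main(void)
  {
          struct buf src = { .next = NULL }, *head = NULL;
          int ret = clone_list(&head, &src);

          unprep_list(head);
          return ret ? 1 : 0;
  }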