From ea90d270349d51086d0dddc55821a782040d68f5 Mon Sep 17 00:00:00 2001 From: John Garry Date: Tue, 12 Nov 2024 16:10:18 +0000 Subject: [PATCH 01/16] md/raid5: Increase r5conf.cache_name size MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit For compiling with W=1, the following warning can be seen: drivers/md/raid5.c: In function ‘setup_conf’: drivers/md/raid5.c:2423:12: error: ‘%s’ directive output may be truncated writing up to 31 bytes into a region of size between 16 and 26 [-Werror=format-truncation=] "raid%d-%s", conf->level, mdname(conf->mddev)); ^~ drivers/md/raid5.c:2422:3: note: ‘snprintf’ output between 7 and 48 bytes into a destination of size 32 snprintf(conf->cache_name[0], namelen, ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ "raid%d-%s", conf->level, mdname(conf->mddev)); ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ cc1: all warnings being treated as errors Increase the array size to avoid this warning. Signed-off-by: John Garry Link: https://lore.kernel.org/r/20241112161019.4154616-2-john.g.garry@oracle.com Signed-off-by: Song Liu --- drivers/md/raid5.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h index 896ecfc4afa6..d174e586698f 100644 --- a/drivers/md/raid5.h +++ b/drivers/md/raid5.h @@ -633,7 +633,7 @@ struct r5conf { * two caches. */ int active_name; - char cache_name[2][32]; + char cache_name[2][48]; struct kmem_cache *slab_cache; /* for allocating stripes */ struct mutex cache_size_mutex; /* Protect changes to cache size */ -- 2.51.0 From 609e60a3a9f4053b420d3ece565dee273bfdc2e8 Mon Sep 17 00:00:00 2001 From: Guixin Liu Date: Wed, 13 Nov 2024 18:18:39 +0800 Subject: [PATCH 02/16] nvmet: report ns's vwc not present Currently, we report that controller has vwc even though the ns may not have vwc. Report ns's vwc not present when not buffered_io or backdev doesn't have vwc. Signed-off-by: Guixin Liu Reviewed-by: Christoph Hellwig Reviewed-by: Chaitanya Kulkarni Signed-off-by: Keith Busch --- drivers/nvme/target/admin-cmd.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/drivers/nvme/target/admin-cmd.c b/drivers/nvme/target/admin-cmd.c index 0a9fdc533186..934b401fbc2f 100644 --- a/drivers/nvme/target/admin-cmd.c +++ b/drivers/nvme/target/admin-cmd.c @@ -934,6 +934,13 @@ static void nvmet_execute_id_cs_indep(struct nvmet_req *req) id->nsattr |= NVME_NS_ATTR_RO; if (req->ns->bdev && !bdev_nonrot(req->ns->bdev)) id->nsfeat |= NVME_NS_ROTATIONAL; + /* + * We need flush command to flush the file's metadata, + * so report supporting vwc if backend is file, even + * though buffered_io is disable. + */ + if (req->ns->bdev && !bdev_write_cache(req->ns->bdev)) + id->nsfeat |= NVME_NS_VWC_NOT_PRESENT; status = nvmet_copy_to_sgl(req, 0, id, sizeof(*id)); kfree(id); -- 2.51.0 From 8a502b5c1689862b1fce1ef3019dcd284cc98aa8 Mon Sep 17 00:00:00 2001 From: Guixin Liu Date: Mon, 14 Oct 2024 18:14:57 +0800 Subject: [PATCH 03/16] nvme: parse reservation commands's action and rtype to string Parse reservation commands's action(including rrega, racqa and rrela) and rtype to string to make the trace log more human-readable. 
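For example, an acquire that preempts an exclusive-access reservation would now be decoded roughly as follows (values are illustrative, derived from the format strings in the hunks below):

  before: racqa=1, iekey=0, rtype=2
  after:  racqa=1:preempt, iekey=0, rtype=2:exclusive access

The raw numeric value is kept and the decoded name is appended after a colon.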
Signed-off-by: Guixin Liu Reviewed-by: Sagi Grimberg Reviewed-by: Christoph Hellwig Reviewed-by: Chaitanya Kulkarni Signed-off-by: Keith Busch --- drivers/nvme/host/trace.c | 58 +++++++++++++++++++++++++++++++++++---- 1 file changed, 52 insertions(+), 6 deletions(-) diff --git a/drivers/nvme/host/trace.c b/drivers/nvme/host/trace.c index 87c437fc070d..ad25ad1e4041 100644 --- a/drivers/nvme/host/trace.c +++ b/drivers/nvme/host/trace.c @@ -228,27 +228,61 @@ static const char *nvme_trace_zone_mgmt_recv(struct trace_seq *p, u8 *cdw10) static const char *nvme_trace_resv_reg(struct trace_seq *p, u8 *cdw10) { + static const char * const rrega_strs[] = { + [0x00] = "register", + [0x01] = "unregister", + [0x02] = "replace", + }; const char *ret = trace_seq_buffer_ptr(p); u8 rrega = cdw10[0] & 0x7; u8 iekey = (cdw10[0] >> 3) & 0x1; u8 ptpl = (cdw10[3] >> 6) & 0x3; + const char *rrega_str; + + if (rrega < ARRAY_SIZE(rrega_strs) && rrega_strs[rrega]) + rrega_str = rrega_strs[rrega]; + else + rrega_str = "reserved"; - trace_seq_printf(p, "rrega=%u, iekey=%u, ptpl=%u", - rrega, iekey, ptpl); + trace_seq_printf(p, "rrega=%u:%s, iekey=%u, ptpl=%u", + rrega, rrega_str, iekey, ptpl); trace_seq_putc(p, 0); return ret; } +static const char * const rtype_strs[] = { + [0x00] = "reserved", + [0x01] = "write exclusive", + [0x02] = "exclusive access", + [0x03] = "write exclusive registrants only", + [0x04] = "exclusive access registrants only", + [0x05] = "write exclusive all registrants", + [0x06] = "exclusive access all registrants", +}; + static const char *nvme_trace_resv_acq(struct trace_seq *p, u8 *cdw10) { + static const char * const racqa_strs[] = { + [0x00] = "acquire", + [0x01] = "preempt", + [0x02] = "preempt and abort", + }; const char *ret = trace_seq_buffer_ptr(p); u8 racqa = cdw10[0] & 0x7; u8 iekey = (cdw10[0] >> 3) & 0x1; u8 rtype = cdw10[1]; + const char *racqa_str = "reserved"; + const char *rtype_str = "reserved"; - trace_seq_printf(p, "racqa=%u, iekey=%u, rtype=%u", - racqa, iekey, rtype); + if (racqa < ARRAY_SIZE(racqa_strs) && racqa_strs[racqa]) + racqa_str = racqa_strs[racqa]; + + if (rtype < ARRAY_SIZE(rtype_strs) && rtype_strs[rtype]) + rtype_str = rtype_strs[rtype]; + + trace_seq_printf(p, "racqa=%u:%s, iekey=%u, rtype=%u:%s", + racqa, racqa_str, iekey, rtype, rtype_str); trace_seq_putc(p, 0); return ret; @@ -256,13 +290,25 @@ static const char *nvme_trace_resv_acq(struct trace_seq *p, u8 *cdw10) static const char *nvme_trace_resv_rel(struct trace_seq *p, u8 *cdw10) { + static const char * const rrela_strs[] = { + [0x00] = "release", + [0x01] = "clear", + }; const char *ret = trace_seq_buffer_ptr(p); u8 rrela = cdw10[0] & 0x7; u8 iekey = (cdw10[0] >> 3) & 0x1; u8 rtype = cdw10[1]; + const char *rrela_str = "reserved"; + const char *rtype_str = "reserved"; + + if (rrela < ARRAY_SIZE(rrela_strs) && rrela_strs[rrela]) + rrela_str = rrela_strs[rrela]; + + if (rtype < ARRAY_SIZE(rtype_strs) && rtype_strs[rtype]) + rtype_str = rtype_strs[rtype]; - trace_seq_printf(p, "rrela=%u, iekey=%u, rtype=%u", - rrela, iekey, rtype); + trace_seq_printf(p, "rrela=%u:%s, iekey=%u, rtype=%u:%s", + rrela, rrela_str, iekey, rtype, rtype_str); trace_seq_putc(p, 0); return ret; -- 2.51.0 From 50bee3857d081ab1b83f93c64cb5c10a4babe2d9 Mon Sep 17 00:00:00 2001 From: Guixin Liu Date: Mon, 14 Oct 2024 18:14:58 +0800 Subject: [PATCH 04/16] nvmet: add tracing of reservation commands Add tracing of reservation commands, including register, acquire, release and report, and also parse the action and rtype to string to 
make the trace log more human-readable. Signed-off-by: Guixin Liu Reviewed-by: Sagi Grimberg Reviewed-by: Christoph Hellwig Reviewed-by: Chaitanya Kulkarni Signed-off-by: Keith Busch --- drivers/nvme/target/trace.c | 108 ++++++++++++++++++++++++++++++++++++ 1 file changed, 108 insertions(+) diff --git a/drivers/nvme/target/trace.c b/drivers/nvme/target/trace.c index 9a3548179a8e..6dbc7036f2e4 100644 --- a/drivers/nvme/target/trace.c +++ b/drivers/nvme/target/trace.c @@ -180,6 +180,106 @@ static const char *nvmet_trace_zone_mgmt_recv(struct trace_seq *p, u8 *cdw10) return ret; } +static const char *nvmet_trace_resv_reg(struct trace_seq *p, u8 *cdw10) +{ + static const char * const rrega_strs[] = { + [0x00] = "register", + [0x01] = "unregister", + [0x02] = "replace", + }; + const char *ret = trace_seq_buffer_ptr(p); + u8 rrega = cdw10[0] & 0x7; + u8 iekey = (cdw10[0] >> 3) & 0x1; + u8 ptpl = (cdw10[3] >> 6) & 0x3; + const char *rrega_str; + + if (rrega < ARRAY_SIZE(rrega_strs) && rrega_strs[rrega]) + rrega_str = rrega_strs[rrega]; + else + rrega_str = "reserved"; + + trace_seq_printf(p, "rrega=%u:%s, iekey=%u, ptpl=%u", + rrega, rrega_str, iekey, ptpl); + trace_seq_putc(p, 0); + + return ret; +} + +static const char * const rtype_strs[] = { + [0x00] = "reserved", + [0x01] = "write exclusive", + [0x02] = "exclusive access", + [0x03] = "write exclusive registrants only", + [0x04] = "exclusive access registrants only", + [0x05] = "write exclusive all registrants", + [0x06] = "exclusive access all registrants", +}; + +static const char *nvmet_trace_resv_acq(struct trace_seq *p, u8 *cdw10) +{ + static const char * const racqa_strs[] = { + [0x00] = "acquire", + [0x01] = "preempt", + [0x02] = "preempt and abort", + }; + const char *ret = trace_seq_buffer_ptr(p); + u8 racqa = cdw10[0] & 0x7; + u8 iekey = (cdw10[0] >> 3) & 0x1; + u8 rtype = cdw10[1]; + const char *racqa_str = "reserved"; + const char *rtype_str = "reserved"; + + if (racqa < ARRAY_SIZE(racqa_strs) && racqa_strs[racqa]) + racqa_str = racqa_strs[racqa]; + + if (rtype < ARRAY_SIZE(rtype_strs) && rtype_strs[rtype]) + rtype_str = rtype_strs[rtype]; + + trace_seq_printf(p, "racqa=%u:%s, iekey=%u, rtype=%u:%s", + racqa, racqa_str, iekey, rtype, rtype_str); + trace_seq_putc(p, 0); + + return ret; +} + +static const char *nvmet_trace_resv_rel(struct trace_seq *p, u8 *cdw10) +{ + static const char * const rrela_strs[] = { + [0x00] = "release", + [0x01] = "clear", + }; + const char *ret = trace_seq_buffer_ptr(p); + u8 rrela = cdw10[0] & 0x7; + u8 iekey = (cdw10[0] >> 3) & 0x1; + u8 rtype = cdw10[1]; + const char *rrela_str = "reserved"; + const char *rtype_str = "reserved"; + + if (rrela < ARRAY_SIZE(rrela_strs) && rrela_strs[rrela]) + rrela_str = rrela_strs[rrela]; + + if (rtype < ARRAY_SIZE(rtype_strs) && rtype_strs[rtype]) + rtype_str = rtype_strs[rtype]; + + trace_seq_printf(p, "rrela=%u:%s, iekey=%u, rtype=%u:%s", + rrela, rrela_str, iekey, rtype, rtype_str); + trace_seq_putc(p, 0); + + return ret; +} + +static const char *nvmet_trace_resv_report(struct trace_seq *p, u8 *cdw10) +{ + const char *ret = trace_seq_buffer_ptr(p); + u32 numd = get_unaligned_le32(cdw10); + u8 eds = cdw10[4] & 0x1; + + trace_seq_printf(p, "numd=%u, eds=%u", numd, eds); + trace_seq_putc(p, 0); + + return ret; +} + const char *nvmet_trace_parse_nvm_cmd(struct trace_seq *p, u8 opcode, u8 *cdw10) { @@ -195,6 +295,14 @@ const char *nvmet_trace_parse_nvm_cmd(struct trace_seq *p, return nvmet_trace_zone_mgmt_send(p, cdw10); case nvme_cmd_zone_mgmt_recv: return 
nvmet_trace_zone_mgmt_recv(p, cdw10); + case nvme_cmd_resv_register: + return nvmet_trace_resv_reg(p, cdw10); + case nvme_cmd_resv_acquire: + return nvmet_trace_resv_acq(p, cdw10); + case nvme_cmd_resv_release: + return nvmet_trace_resv_rel(p, cdw10); + case nvme_cmd_resv_report: + return nvmet_trace_resv_report(p, cdw10); default: return nvmet_trace_common(p, cdw10); } -- 2.51.0 From 470d2bc3a0bc19a849cc7478c02d3f5ecaa1233e Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 13 Nov 2024 09:45:35 +0100 Subject: [PATCH 05/16] block: export blk_validate_limits While block drivers do the validation as part of committing them to the queue, users that use the limit outside of a block device context have to validate the limits and fill in the calculated values as well. So far btrfs is the only user of queue limits without a block device, and it has gotten away with that more or less by accident. But with commit 559218d43ec9 ("block: pre-calculate max_zone_append_sectors") this became fatal for setups that have small max zone append size, as it won't be limited now. Export blk_validate_limits so that it can be called directly from btrfs. Signed-off-by: Christoph Hellwig Reviewed-by: Johannes Thumshirn Link: https://lore.kernel.org/r/20241113084541.34315-2-hch@lst.de Signed-off-by: Jens Axboe --- block/blk-settings.c | 3 ++- include/linux/blkdev.h | 1 + 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/block/blk-settings.c b/block/blk-settings.c index 7d6b296997c2..f1d4dfdc37a7 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c @@ -222,7 +222,7 @@ unsupported: * Check that the limits in lim are valid, initialize defaults for unset * values, and cap values based on others where needed. */ -static int blk_validate_limits(struct queue_limits *lim) +int blk_validate_limits(struct queue_limits *lim) { unsigned int max_hw_sectors; unsigned int logical_block_sectors; @@ -365,6 +365,7 @@ static int blk_validate_limits(struct queue_limits *lim) return err; return blk_validate_zoned_limits(lim); } +EXPORT_SYMBOL_GPL(blk_validate_limits); /* * Set the default limits for a newly allocated queue. @lim contains the diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 65f37ae70712..cd905afaf51a 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -948,6 +948,7 @@ queue_limits_start_update(struct request_queue *q) int queue_limits_commit_update(struct request_queue *q, struct queue_limits *lim); int queue_limits_set(struct request_queue *q, struct queue_limits *lim); +int blk_validate_limits(struct queue_limits *lim); /** * queue_limits_cancel_update - cancel an atomic update of queue limits -- 2.51.0 From e559ee022658c70bdc07c4846bf279f5a5abc494 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 13 Nov 2024 09:45:36 +0100 Subject: [PATCH 06/16] btrfs: validate queue limits Call blk_validate_limits on the queue limits used for zone append splitting so that calculated values get filled in and any stacking conflicts get cought. Without this there isn't a max_zone_append_sectors limits as of commit 559218d43ec9 ("block: pre-calculate max_zone_append_sectors"). 
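For a limits structure that is not attached to a request_queue the usage is roughly the following sketch (error handling trimmed; "bdev" stands for an underlying block device, and the btrfs hunk below follows the same pattern):

	struct queue_limits lim;
	int ret;

	blk_set_stacking_limits(&lim);
	/* stack in the limits of each underlying device */
	blk_stack_limits(&lim, bdev_limits(bdev), 0);
	/* fill in calculated values such as max_zone_append_sectors */
	ret = blk_validate_limits(&lim);
	if (ret)
		return ret;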
Fixes: 559218d43ec9 ("block: pre-calculate max_zone_append_sectors") Reported-by: Yi Zhang Signed-off-by: Christoph Hellwig Reviewed-by: Johannes Thumshirn Link: https://lore.kernel.org/r/20241113084541.34315-3-hch@lst.de Signed-off-by: Jens Axboe --- fs/btrfs/zoned.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c index 46b9386957e6..64cdae31d348 100644 --- a/fs/btrfs/zoned.c +++ b/fs/btrfs/zoned.c @@ -711,6 +711,12 @@ int btrfs_check_zoned_mode(struct btrfs_fs_info *fs_info) blk_stack_limits(lim, bdev_limits(device->bdev), 0); } + ret = blk_validate_limits(lim); + if (ret) { + btrfs_err(fs_info, "zoned: failed to validate queue limits"); + return ret; + } + /* * stripe_size is always aligned to BTRFS_STRIPE_LEN in * btrfs_create_chunk(). Since we want stripe_len == zone_size, -- 2.51.0 From beadf0088501d9dcf2454b05d90d5d31ea3ba55f Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 13 Nov 2024 16:20:41 +0100 Subject: [PATCH 07/16] nvme-pci: reverse request order in nvme_queue_rqs blk_mq_flush_plug_list submits requests in the reverse order that they were submitted, which leads to a rather suboptimal I/O pattern especially in rotational devices. Fix this by rewriting nvme_queue_rqs so that it always pops the requests from the passed in request list, and then adds them to the head of a local submit list. This actually simplifies the code a bit as it removes the complicated list splicing, at the cost of extra updates of the rq_next pointer. As that should be cache hot anyway it should be an easy price to pay. Fixes: d62cbcf62f2f ("nvme: add support for mq_ops->queue_rqs()") Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20241113152050.157179-2-hch@lst.de Signed-off-by: Jens Axboe --- drivers/nvme/host/pci.c | 39 +++++++++++++++++---------------------- 1 file changed, 17 insertions(+), 22 deletions(-) diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 0aa26a33f231..ec1c44c75d92 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -906,9 +906,10 @@ static blk_status_t nvme_queue_rq(struct blk_mq_hw_ctx *hctx, static void nvme_submit_cmds(struct nvme_queue *nvmeq, struct request **rqlist) { + struct request *req; + spin_lock(&nvmeq->sq_lock); - while (!rq_list_empty(*rqlist)) { - struct request *req = rq_list_pop(rqlist); + while ((req = rq_list_pop(rqlist))) { struct nvme_iod *iod = blk_mq_rq_to_pdu(req); nvme_sq_copy_cmd(nvmeq, &iod->cmd); @@ -933,31 +934,25 @@ static bool nvme_prep_rq_batch(struct nvme_queue *nvmeq, struct request *req) static void nvme_queue_rqs(struct request **rqlist) { - struct request *req, *next, *prev = NULL; + struct request *submit_list = NULL; struct request *requeue_list = NULL; + struct request **requeue_lastp = &requeue_list; + struct nvme_queue *nvmeq = NULL; + struct request *req; - rq_list_for_each_safe(rqlist, req, next) { - struct nvme_queue *nvmeq = req->mq_hctx->driver_data; - - if (!nvme_prep_rq_batch(nvmeq, req)) { - /* detach 'req' and add to remainder list */ - rq_list_move(rqlist, &requeue_list, req, prev); - - req = prev; - if (!req) - continue; - } + while ((req = rq_list_pop(rqlist))) { + if (nvmeq && nvmeq != req->mq_hctx->driver_data) + nvme_submit_cmds(nvmeq, &submit_list); + nvmeq = req->mq_hctx->driver_data; - if (!next || req->mq_hctx != next->mq_hctx) { - /* detach rest of list, and submit */ - req->rq_next = NULL; - nvme_submit_cmds(nvmeq, rqlist); - *rqlist = next; - prev = NULL; - } else - prev = req; + if (nvme_prep_rq_batch(nvmeq, 
req)) + rq_list_add(&submit_list, req); /* reverse order */ + else + rq_list_add_tail(&requeue_lastp, req); } + if (nvmeq) + nvme_submit_cmds(nvmeq, &submit_list); *rqlist = requeue_list; } -- 2.51.0 From 7f212e997edbb7a2cb85cef2ac14265dfaf88717 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 13 Nov 2024 16:20:42 +0100 Subject: [PATCH 08/16] virtio_blk: reverse request order in virtio_queue_rqs blk_mq_flush_plug_list submits requests in the reverse order that they were submitted, which leads to a rather suboptimal I/O pattern especially in rotational devices. Fix this by rewriting virtio_queue_rqs so that it always pops the requests from the passed in request list, and then adds them to the head of a local submit list. This actually simplifies the code a bit as it removes the complicated list splicing, at the cost of extra updates of the rq_next pointer. As that should be cache hot anyway it should be an easy price to pay. Fixes: 0e9911fa768f ("virtio-blk: support mq_ops->queue_rqs()") Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20241113152050.157179-3-hch@lst.de Signed-off-by: Jens Axboe --- drivers/block/virtio_blk.c | 46 +++++++++++++++++--------------------- 1 file changed, 21 insertions(+), 25 deletions(-) diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c index 0e99a4714928..b25f7c06a28e 100644 --- a/drivers/block/virtio_blk.c +++ b/drivers/block/virtio_blk.c @@ -471,18 +471,18 @@ static bool virtblk_prep_rq_batch(struct request *req) return virtblk_prep_rq(req->mq_hctx, vblk, req, vbr) == BLK_STS_OK; } -static bool virtblk_add_req_batch(struct virtio_blk_vq *vq, +static void virtblk_add_req_batch(struct virtio_blk_vq *vq, struct request **rqlist) { + struct request *req; unsigned long flags; - int err; bool kick; spin_lock_irqsave(&vq->lock, flags); - while (!rq_list_empty(*rqlist)) { - struct request *req = rq_list_pop(rqlist); + while ((req = rq_list_pop(rqlist))) { struct virtblk_req *vbr = blk_mq_rq_to_pdu(req); + int err; err = virtblk_add_req(vq->vq, vbr); if (err) { @@ -495,37 +495,33 @@ static bool virtblk_add_req_batch(struct virtio_blk_vq *vq, kick = virtqueue_kick_prepare(vq->vq); spin_unlock_irqrestore(&vq->lock, flags); - return kick; + if (kick) + virtqueue_notify(vq->vq); } static void virtio_queue_rqs(struct request **rqlist) { - struct request *req, *next, *prev = NULL; + struct request *submit_list = NULL; struct request *requeue_list = NULL; + struct request **requeue_lastp = &requeue_list; + struct virtio_blk_vq *vq = NULL; + struct request *req; - rq_list_for_each_safe(rqlist, req, next) { - struct virtio_blk_vq *vq = get_virtio_blk_vq(req->mq_hctx); - bool kick; - - if (!virtblk_prep_rq_batch(req)) { - rq_list_move(rqlist, &requeue_list, req, prev); - req = prev; - if (!req) - continue; - } + while ((req = rq_list_pop(rqlist))) { + struct virtio_blk_vq *this_vq = get_virtio_blk_vq(req->mq_hctx); - if (!next || req->mq_hctx != next->mq_hctx) { - req->rq_next = NULL; - kick = virtblk_add_req_batch(vq, rqlist); - if (kick) - virtqueue_notify(vq->vq); + if (vq && vq != this_vq) + virtblk_add_req_batch(vq, &submit_list); + vq = this_vq; - *rqlist = next; - prev = NULL; - } else - prev = req; + if (virtblk_prep_rq_batch(req)) + rq_list_add(&submit_list, req); /* reverse order */ + else + rq_list_add_tail(&requeue_lastp, req); } + if (vq) + virtblk_add_req_batch(vq, &submit_list); *rqlist = requeue_list; } -- 2.51.0 From e8225ab15006fbcdb14cef426a0a54475292fbbc Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: 
Wed, 13 Nov 2024 16:20:43 +0100 Subject: [PATCH 09/16] block: remove rq_list_move Unused now. Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20241113152050.157179-4-hch@lst.de Signed-off-by: Jens Axboe --- include/linux/blk-mq.h | 17 ----------------- 1 file changed, 17 deletions(-) diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index a28264442948..ad26a41d13f9 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -268,23 +268,6 @@ static inline unsigned short req_get_ioprio(struct request *req) #define rq_list_next(rq) (rq)->rq_next #define rq_list_empty(list) ((list) == (struct request *) NULL) -/** - * rq_list_move() - move a struct request from one list to another - * @src: The source list @rq is currently in - * @dst: The destination list that @rq will be appended to - * @rq: The request to move - * @prev: The request preceding @rq in @src (NULL if @rq is the head) - */ -static inline void rq_list_move(struct request **src, struct request **dst, - struct request *rq, struct request *prev) -{ - if (prev) - prev->rq_next = rq->rq_next; - else - *src = rq->rq_next; - rq_list_add(dst, rq); -} - /** * enum blk_eh_timer_return - How the timeout handler should proceed * @BLK_EH_DONE: The block driver completed the command or will complete it at -- 2.51.0 From a3396b99990d8b4e5797e7b16fdeb64c15ae97bb Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 13 Nov 2024 16:20:44 +0100 Subject: [PATCH 10/16] block: add a rq_list type Replace the semi-open coded request list helpers with a proper rq_list type that mirrors the bio_list and has head and tail pointers. Besides better type safety this actually allows to insert at the tail of the list, which will be useful soon. Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20241113152050.157179-5-hch@lst.de Signed-off-by: Jens Axboe --- block/blk-core.c | 6 +-- block/blk-merge.c | 2 +- block/blk-mq.c | 40 ++++++++-------- block/blk-mq.h | 2 +- drivers/block/null_blk/main.c | 9 ++-- drivers/block/virtio_blk.c | 13 +++--- drivers/nvme/host/apple.c | 2 +- drivers/nvme/host/pci.c | 15 +++--- include/linux/blk-mq.h | 88 +++++++++++++++++++++-------------- include/linux/blkdev.h | 11 +++-- io_uring/rw.c | 4 +- 11 files changed, 104 insertions(+), 88 deletions(-) diff --git a/block/blk-core.c b/block/blk-core.c index 0387172e8259..666efe8fa202 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -1120,8 +1120,8 @@ void blk_start_plug_nr_ios(struct blk_plug *plug, unsigned short nr_ios) return; plug->cur_ktime = 0; - plug->mq_list = NULL; - plug->cached_rq = NULL; + rq_list_init(&plug->mq_list); + rq_list_init(&plug->cached_rqs); plug->nr_ios = min_t(unsigned short, nr_ios, BLK_MAX_REQUEST_COUNT); plug->rq_count = 0; plug->multiple_queues = false; @@ -1217,7 +1217,7 @@ void __blk_flush_plug(struct blk_plug *plug, bool from_schedule) * queue for cached requests, we don't want a blocked task holding * up a queue freeze/quiesce event. 
*/ - if (unlikely(!rq_list_empty(plug->cached_rq))) + if (unlikely(!rq_list_empty(&plug->cached_rqs))) blk_mq_free_plug_rqs(plug); plug->cur_ktime = 0; diff --git a/block/blk-merge.c b/block/blk-merge.c index df36f83f3738..e0b28e9298c9 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -1179,7 +1179,7 @@ bool blk_attempt_plug_merge(struct request_queue *q, struct bio *bio, struct blk_plug *plug = current->plug; struct request *rq; - if (!plug || rq_list_empty(plug->mq_list)) + if (!plug || rq_list_empty(&plug->mq_list)) return false; rq_list_for_each(&plug->mq_list, rq) { diff --git a/block/blk-mq.c b/block/blk-mq.c index 3c6cadba75e3..ff0b819e35fc 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -478,7 +478,7 @@ __blk_mq_alloc_requests_batch(struct blk_mq_alloc_data *data) prefetch(tags->static_rqs[tag]); tag_mask &= ~(1UL << i); rq = blk_mq_rq_ctx_init(data, tags, tag); - rq_list_add(data->cached_rq, rq); + rq_list_add_head(data->cached_rqs, rq); nr++; } if (!(data->rq_flags & RQF_SCHED_TAGS)) @@ -487,7 +487,7 @@ __blk_mq_alloc_requests_batch(struct blk_mq_alloc_data *data) percpu_ref_get_many(&data->q->q_usage_counter, nr - 1); data->nr_tags -= nr; - return rq_list_pop(data->cached_rq); + return rq_list_pop(data->cached_rqs); } static struct request *__blk_mq_alloc_requests(struct blk_mq_alloc_data *data) @@ -584,7 +584,7 @@ static struct request *blk_mq_rq_cache_fill(struct request_queue *q, .flags = flags, .cmd_flags = opf, .nr_tags = plug->nr_ios, - .cached_rq = &plug->cached_rq, + .cached_rqs = &plug->cached_rqs, }; struct request *rq; @@ -609,14 +609,14 @@ static struct request *blk_mq_alloc_cached_request(struct request_queue *q, if (!plug) return NULL; - if (rq_list_empty(plug->cached_rq)) { + if (rq_list_empty(&plug->cached_rqs)) { if (plug->nr_ios == 1) return NULL; rq = blk_mq_rq_cache_fill(q, plug, opf, flags); if (!rq) return NULL; } else { - rq = rq_list_peek(&plug->cached_rq); + rq = rq_list_peek(&plug->cached_rqs); if (!rq || rq->q != q) return NULL; @@ -625,7 +625,7 @@ static struct request *blk_mq_alloc_cached_request(struct request_queue *q, if (op_is_flush(rq->cmd_flags) != op_is_flush(opf)) return NULL; - plug->cached_rq = rq_list_next(rq); + rq_list_pop(&plug->cached_rqs); blk_mq_rq_time_init(rq, blk_time_get_ns()); } @@ -802,7 +802,7 @@ void blk_mq_free_plug_rqs(struct blk_plug *plug) { struct request *rq; - while ((rq = rq_list_pop(&plug->cached_rq)) != NULL) + while ((rq = rq_list_pop(&plug->cached_rqs)) != NULL) blk_mq_free_request(rq); } @@ -1392,8 +1392,7 @@ static void blk_add_rq_to_plug(struct blk_plug *plug, struct request *rq) */ if (!plug->has_elevator && (rq->rq_flags & RQF_SCHED_TAGS)) plug->has_elevator = true; - rq->rq_next = NULL; - rq_list_add(&plug->mq_list, rq); + rq_list_add_head(&plug->mq_list, rq); plug->rq_count++; } @@ -2785,7 +2784,7 @@ static void blk_mq_plug_issue_direct(struct blk_plug *plug) blk_status_t ret = BLK_STS_OK; while ((rq = rq_list_pop(&plug->mq_list))) { - bool last = rq_list_empty(plug->mq_list); + bool last = rq_list_empty(&plug->mq_list); if (hctx != rq->mq_hctx) { if (hctx) { @@ -2828,8 +2827,7 @@ static void blk_mq_dispatch_plug_list(struct blk_plug *plug, bool from_sched) { struct blk_mq_hw_ctx *this_hctx = NULL; struct blk_mq_ctx *this_ctx = NULL; - struct request *requeue_list = NULL; - struct request **requeue_lastp = &requeue_list; + struct rq_list requeue_list = {}; unsigned int depth = 0; bool is_passthrough = false; LIST_HEAD(list); @@ -2843,12 +2841,12 @@ static void blk_mq_dispatch_plug_list(struct 
blk_plug *plug, bool from_sched) is_passthrough = blk_rq_is_passthrough(rq); } else if (this_hctx != rq->mq_hctx || this_ctx != rq->mq_ctx || is_passthrough != blk_rq_is_passthrough(rq)) { - rq_list_add_tail(&requeue_lastp, rq); + rq_list_add_tail(&requeue_list, rq); continue; } list_add(&rq->queuelist, &list); depth++; - } while (!rq_list_empty(plug->mq_list)); + } while (!rq_list_empty(&plug->mq_list)); plug->mq_list = requeue_list; trace_block_unplug(this_hctx->queue, depth, !from_sched); @@ -2903,19 +2901,19 @@ void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule) if (q->mq_ops->queue_rqs) { blk_mq_run_dispatch_ops(q, __blk_mq_flush_plug_list(q, plug)); - if (rq_list_empty(plug->mq_list)) + if (rq_list_empty(&plug->mq_list)) return; } blk_mq_run_dispatch_ops(q, blk_mq_plug_issue_direct(plug)); - if (rq_list_empty(plug->mq_list)) + if (rq_list_empty(&plug->mq_list)) return; } do { blk_mq_dispatch_plug_list(plug, from_schedule); - } while (!rq_list_empty(plug->mq_list)); + } while (!rq_list_empty(&plug->mq_list)); } static void blk_mq_try_issue_list_directly(struct blk_mq_hw_ctx *hctx, @@ -2980,7 +2978,7 @@ static struct request *blk_mq_get_new_requests(struct request_queue *q, if (plug) { data.nr_tags = plug->nr_ios; plug->nr_ios = 1; - data.cached_rq = &plug->cached_rq; + data.cached_rqs = &plug->cached_rqs; } rq = __blk_mq_alloc_requests(&data); @@ -3003,7 +3001,7 @@ static struct request *blk_mq_peek_cached_request(struct blk_plug *plug, if (!plug) return NULL; - rq = rq_list_peek(&plug->cached_rq); + rq = rq_list_peek(&plug->cached_rqs); if (!rq || rq->q != q) return NULL; if (type != rq->mq_hctx->type && @@ -3017,14 +3015,14 @@ static struct request *blk_mq_peek_cached_request(struct blk_plug *plug, static void blk_mq_use_cached_rq(struct request *rq, struct blk_plug *plug, struct bio *bio) { - WARN_ON_ONCE(rq_list_peek(&plug->cached_rq) != rq); + if (rq_list_pop(&plug->cached_rqs) != rq) + WARN_ON_ONCE(1); /* * If any qos ->throttle() end up blocking, we will have flushed the * plug and hence killed the cached_rq list as well. Pop this entry * before we throttle. 
*/ - plug->cached_rq = rq_list_next(rq); rq_qos_throttle(rq->q, bio); blk_mq_rq_time_init(rq, blk_time_get_ns()); diff --git a/block/blk-mq.h b/block/blk-mq.h index f4ac1af77a26..89a20fffa4b1 100644 --- a/block/blk-mq.h +++ b/block/blk-mq.h @@ -155,7 +155,7 @@ struct blk_mq_alloc_data { /* allocate multiple requests/tags in one go */ unsigned int nr_tags; - struct request **cached_rq; + struct rq_list *cached_rqs; /* input & output parameter */ struct blk_mq_ctx *ctx; diff --git a/drivers/block/null_blk/main.c b/drivers/block/null_blk/main.c index 2f0431e42c49..3c3d8d200abb 100644 --- a/drivers/block/null_blk/main.c +++ b/drivers/block/null_blk/main.c @@ -1638,10 +1638,9 @@ static blk_status_t null_queue_rq(struct blk_mq_hw_ctx *hctx, return BLK_STS_OK; } -static void null_queue_rqs(struct request **rqlist) +static void null_queue_rqs(struct rq_list *rqlist) { - struct request *requeue_list = NULL; - struct request **requeue_lastp = &requeue_list; + struct rq_list requeue_list = {}; struct blk_mq_queue_data bd = { }; blk_status_t ret; @@ -1651,8 +1650,8 @@ static void null_queue_rqs(struct request **rqlist) bd.rq = rq; ret = null_queue_rq(rq->mq_hctx, &bd); if (ret != BLK_STS_OK) - rq_list_add_tail(&requeue_lastp, rq); - } while (!rq_list_empty(*rqlist)); + rq_list_add_tail(&requeue_list, rq); + } while (!rq_list_empty(rqlist)); *rqlist = requeue_list; } diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c index b25f7c06a28e..a19f24c19140 100644 --- a/drivers/block/virtio_blk.c +++ b/drivers/block/virtio_blk.c @@ -472,7 +472,7 @@ static bool virtblk_prep_rq_batch(struct request *req) } static void virtblk_add_req_batch(struct virtio_blk_vq *vq, - struct request **rqlist) + struct rq_list *rqlist) { struct request *req; unsigned long flags; @@ -499,11 +499,10 @@ static void virtblk_add_req_batch(struct virtio_blk_vq *vq, virtqueue_notify(vq->vq); } -static void virtio_queue_rqs(struct request **rqlist) +static void virtio_queue_rqs(struct rq_list *rqlist) { - struct request *submit_list = NULL; - struct request *requeue_list = NULL; - struct request **requeue_lastp = &requeue_list; + struct rq_list submit_list = { }; + struct rq_list requeue_list = { }; struct virtio_blk_vq *vq = NULL; struct request *req; @@ -515,9 +514,9 @@ static void virtio_queue_rqs(struct request **rqlist) vq = this_vq; if (virtblk_prep_rq_batch(req)) - rq_list_add(&submit_list, req); /* reverse order */ + rq_list_add_head(&submit_list, req); /* reverse order */ else - rq_list_add_tail(&requeue_lastp, req); + rq_list_add_tail(&requeue_list, req); } if (vq) diff --git a/drivers/nvme/host/apple.c b/drivers/nvme/host/apple.c index b1387dc459a3..7cd1102a8d2c 100644 --- a/drivers/nvme/host/apple.c +++ b/drivers/nvme/host/apple.c @@ -649,7 +649,7 @@ static bool apple_nvme_handle_cq(struct apple_nvme_queue *q, bool force) found = apple_nvme_poll_cq(q, &iob); - if (!rq_list_empty(iob.req_list)) + if (!rq_list_empty(&iob.req_list)) apple_nvme_complete_batch(&iob); return found; diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index ec1c44c75d92..707dbe8be6a2 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -904,7 +904,7 @@ static blk_status_t nvme_queue_rq(struct blk_mq_hw_ctx *hctx, return BLK_STS_OK; } -static void nvme_submit_cmds(struct nvme_queue *nvmeq, struct request **rqlist) +static void nvme_submit_cmds(struct nvme_queue *nvmeq, struct rq_list *rqlist) { struct request *req; @@ -932,11 +932,10 @@ static bool nvme_prep_rq_batch(struct nvme_queue *nvmeq, struct request 
*req) return nvme_prep_rq(nvmeq->dev, req) == BLK_STS_OK; } -static void nvme_queue_rqs(struct request **rqlist) +static void nvme_queue_rqs(struct rq_list *rqlist) { - struct request *submit_list = NULL; - struct request *requeue_list = NULL; - struct request **requeue_lastp = &requeue_list; + struct rq_list submit_list = { }; + struct rq_list requeue_list = { }; struct nvme_queue *nvmeq = NULL; struct request *req; @@ -946,9 +945,9 @@ static void nvme_queue_rqs(struct request **rqlist) nvmeq = req->mq_hctx->driver_data; if (nvme_prep_rq_batch(nvmeq, req)) - rq_list_add(&submit_list, req); /* reverse order */ + rq_list_add_head(&submit_list, req); /* reverse order */ else - rq_list_add_tail(&requeue_lastp, req); + rq_list_add_tail(&requeue_list, req); } if (nvmeq) @@ -1080,7 +1079,7 @@ static irqreturn_t nvme_irq(int irq, void *data) DEFINE_IO_COMP_BATCH(iob); if (nvme_poll_cq(nvmeq, &iob)) { - if (!rq_list_empty(iob.req_list)) + if (!rq_list_empty(&iob.req_list)) nvme_pci_complete_batch(&iob); return IRQ_HANDLED; } diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index ad26a41d13f9..c61e04365677 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -229,44 +229,60 @@ static inline unsigned short req_get_ioprio(struct request *req) #define rq_dma_dir(rq) \ (op_is_write(req_op(rq)) ? DMA_TO_DEVICE : DMA_FROM_DEVICE) -#define rq_list_add(listptr, rq) do { \ - (rq)->rq_next = *(listptr); \ - *(listptr) = rq; \ -} while (0) - -#define rq_list_add_tail(lastpptr, rq) do { \ - (rq)->rq_next = NULL; \ - **(lastpptr) = rq; \ - *(lastpptr) = &rq->rq_next; \ -} while (0) - -#define rq_list_pop(listptr) \ -({ \ - struct request *__req = NULL; \ - if ((listptr) && *(listptr)) { \ - __req = *(listptr); \ - *(listptr) = __req->rq_next; \ - } \ - __req; \ -}) +static inline int rq_list_empty(const struct rq_list *rl) +{ + return rl->head == NULL; +} -#define rq_list_peek(listptr) \ -({ \ - struct request *__req = NULL; \ - if ((listptr) && *(listptr)) \ - __req = *(listptr); \ - __req; \ -}) +static inline void rq_list_init(struct rq_list *rl) +{ + rl->head = NULL; + rl->tail = NULL; +} + +static inline void rq_list_add_tail(struct rq_list *rl, struct request *rq) +{ + rq->rq_next = NULL; + if (rl->tail) + rl->tail->rq_next = rq; + else + rl->head = rq; + rl->tail = rq; +} + +static inline void rq_list_add_head(struct rq_list *rl, struct request *rq) +{ + rq->rq_next = rl->head; + rl->head = rq; + if (!rl->tail) + rl->tail = rq; +} + +static inline struct request *rq_list_pop(struct rq_list *rl) +{ + struct request *rq = rl->head; + + if (rq) { + rl->head = rl->head->rq_next; + if (!rl->head) + rl->tail = NULL; + rq->rq_next = NULL; + } + + return rq; +} -#define rq_list_for_each(listptr, pos) \ - for (pos = rq_list_peek((listptr)); pos; pos = rq_list_next(pos)) +static inline struct request *rq_list_peek(struct rq_list *rl) +{ + return rl->head; +} -#define rq_list_for_each_safe(listptr, pos, nxt) \ - for (pos = rq_list_peek((listptr)), nxt = rq_list_next(pos); \ - pos; pos = nxt, nxt = pos ? rq_list_next(pos) : NULL) +#define rq_list_for_each(rl, pos) \ + for (pos = rq_list_peek((rl)); (pos); pos = pos->rq_next) -#define rq_list_next(rq) (rq)->rq_next -#define rq_list_empty(list) ((list) == (struct request *) NULL) +#define rq_list_for_each_safe(rl, pos, nxt) \ + for (pos = rq_list_peek((rl)), nxt = pos->rq_next; \ + pos; pos = nxt, nxt = pos ? 
pos->rq_next : NULL) /** * enum blk_eh_timer_return - How the timeout handler should proceed @@ -559,7 +575,7 @@ struct blk_mq_ops { * empty the @rqlist completely, then the rest will be queued * individually by the block layer upon return. */ - void (*queue_rqs)(struct request **rqlist); + void (*queue_rqs)(struct rq_list *rqlist); /** * @get_budget: Reserve budget before queue request, once .queue_rq is @@ -868,7 +884,7 @@ static inline bool blk_mq_add_to_batch(struct request *req, else if (iob->complete != complete) return false; iob->need_ts |= blk_mq_need_time_stamp(req); - rq_list_add(&iob->req_list, req); + rq_list_add_head(&iob->req_list, req); return true; } diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index cd905afaf51a..00212e96261a 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1007,6 +1007,11 @@ extern void blk_put_queue(struct request_queue *); void blk_mark_disk_dead(struct gendisk *disk); #ifdef CONFIG_BLOCK +struct rq_list { + struct request *head; + struct request *tail; +}; + /* * blk_plug permits building a queue of related requests by holding the I/O * fragments for a short period. This allows merging of sequential requests @@ -1019,10 +1024,10 @@ void blk_mark_disk_dead(struct gendisk *disk); * blk_flush_plug() is called. */ struct blk_plug { - struct request *mq_list; /* blk-mq requests */ + struct rq_list mq_list; /* blk-mq requests */ /* if ios_left is > 1, we can batch tag/rq allocations */ - struct request *cached_rq; + struct rq_list cached_rqs; u64 cur_ktime; unsigned short nr_ios; @@ -1684,7 +1689,7 @@ int bdev_thaw(struct block_device *bdev); void bdev_fput(struct file *bdev_file); struct io_comp_batch { - struct request *req_list; + struct rq_list req_list; bool need_ts; void (*complete)(struct io_comp_batch *); }; diff --git a/io_uring/rw.c b/io_uring/rw.c index 354c4e175654..9daef985543e 100644 --- a/io_uring/rw.c +++ b/io_uring/rw.c @@ -1160,12 +1160,12 @@ int io_do_iopoll(struct io_ring_ctx *ctx, bool force_nonspin) poll_flags |= BLK_POLL_ONESHOT; /* iopoll may have completed current req */ - if (!rq_list_empty(iob.req_list) || + if (!rq_list_empty(&iob.req_list) || READ_ONCE(req->iopoll_completed)) break; } - if (!rq_list_empty(iob.req_list)) + if (!rq_list_empty(&iob.req_list)) iob.complete(&iob); else if (!pos) return 0; -- 2.51.0 From e70c301faece15b618e54b613b1fd6ece3dd05b4 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 13 Nov 2024 16:20:45 +0100 Subject: [PATCH 11/16] block: don't reorder requests in blk_add_rq_to_plug Add requests to the tail of the list instead of the front so that they are queued up in submission order. Remove the re-reordering in blk_mq_dispatch_plug_list, virtio_queue_rqs and nvme_queue_rqs now that the list is ordered as expected. 
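With the rq_list helpers this is easy to see; a short sketch with requests A, B and C plugged in that order:

	rq_list_add_head(&plug->mq_list, rq);	/* flush pops C, B, A */
	rq_list_add_tail(&plug->mq_list, rq);	/* flush pops A, B, C */

Since blk_mq_flush_plug_list and the queue_rqs handlers pop from the head of the list, tail insertion preserves submission order all the way down.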
Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20241113152050.157179-6-hch@lst.de Signed-off-by: Jens Axboe --- block/blk-mq.c | 4 ++-- drivers/block/virtio_blk.c | 2 +- drivers/nvme/host/pci.c | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/block/blk-mq.c b/block/blk-mq.c index ff0b819e35fc..270cfd9fc6b0 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -1392,7 +1392,7 @@ static void blk_add_rq_to_plug(struct blk_plug *plug, struct request *rq) */ if (!plug->has_elevator && (rq->rq_flags & RQF_SCHED_TAGS)) plug->has_elevator = true; - rq_list_add_head(&plug->mq_list, rq); + rq_list_add_tail(&plug->mq_list, rq); plug->rq_count++; } @@ -2844,7 +2844,7 @@ static void blk_mq_dispatch_plug_list(struct blk_plug *plug, bool from_sched) rq_list_add_tail(&requeue_list, rq); continue; } - list_add(&rq->queuelist, &list); + list_add_tail(&rq->queuelist, &list); depth++; } while (!rq_list_empty(&plug->mq_list)); diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c index a19f24c19140..c0cdba71f436 100644 --- a/drivers/block/virtio_blk.c +++ b/drivers/block/virtio_blk.c @@ -514,7 +514,7 @@ static void virtio_queue_rqs(struct rq_list *rqlist) vq = this_vq; if (virtblk_prep_rq_batch(req)) - rq_list_add_head(&submit_list, req); /* reverse order */ + rq_list_add_tail(&submit_list, req); else rq_list_add_tail(&requeue_list, req); } diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 707dbe8be6a2..5f2e3ad2cc52 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -945,7 +945,7 @@ static void nvme_queue_rqs(struct rq_list *rqlist) nvmeq = req->mq_hctx->driver_data; if (nvme_prep_rq_batch(nvmeq, req)) - rq_list_add_head(&submit_list, req); /* reverse order */ + rq_list_add_tail(&submit_list, req); else rq_list_add_tail(&requeue_list, req); } -- 2.51.0 From 00e8d290b55f2fa5c5a0500b4dccf9e090650447 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 13 Nov 2024 16:20:46 +0100 Subject: [PATCH 12/16] block: don't reorder requests in blk_mq_add_to_batch LIFO ordering for batched completions is a bit unexpected and also defeats some merging optimizations in e.g. the XFS buffered write code. Now that we can easily add the request to the tail of the list do that. Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20241113152050.157179-7-hch@lst.de Signed-off-by: Jens Axboe --- include/linux/blk-mq.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index c61e04365677..c596e0e4cb75 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -884,7 +884,7 @@ static inline bool blk_mq_add_to_batch(struct request *req, else if (iob->complete != complete) return false; iob->need_ts |= blk_mq_need_time_stamp(req); - rq_list_add_head(&iob->req_list, req); + rq_list_add_tail(&iob->req_list, req); return true; } -- 2.51.0 From bda9c7d92f24b693eaf0262c090de4c8c108a28e Mon Sep 17 00:00:00 2001 From: David Wang <00107082@163.com> Date: Fri, 8 Nov 2024 13:45:00 +0800 Subject: [PATCH 13/16] block/genhd: use seq_put_decimal_ull for diskstats decimal values seq_printf is costly. For each block device, 19 decimal values are yielded in /proc/diskstats via seq_printf. 
On a system with 16 logical block devices, profiling for open/read/close sequences shows seq_printf took ~75% samples of diskstats_show: diskstats_show(92.626% 2269372/2450040) seq_printf(76.026% 1725313/2269372) vsnprintf(99.163% 1710866/1725313) format_decode(26.597% 455040/1710866) number(19.554% 334542/1710866) memcpy_orig(4.183% 71570/1710866) ... srso_return_thunk(0.009% 148/1725313) part_stat_read_all(8.030% 182236/2269372) One million rounds of open/read/close /proc/diskstats takes: real 0m37.687s user 0m0.264s sys 0m32.911s On average, each sequence tooks ~0.032ms With this patch, most decimal values are yield via seq_put_decimal_ull, performance is significantly improved: real 0m20.792s user 0m0.316s sys 0m20.463s On average, each sequence tooks ~0.020ms, a ~37.5% improvement. Signed-off-by: David Wang <00107082@163.com> Link: https://lore.kernel.org/r/20241108054500.4251-1-00107082@163.com Signed-off-by: Jens Axboe --- block/genhd.c | 63 ++++++++++++++++++++++++--------------------------- 1 file changed, 29 insertions(+), 34 deletions(-) diff --git a/block/genhd.c b/block/genhd.c index 1971c91d6f72..9130e163e191 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -1291,40 +1291,35 @@ static int diskstats_show(struct seq_file *seqf, void *v) part_stat_unlock(); } part_stat_read_all(hd, &stat); - seq_printf(seqf, "%4d %7d %pg " - "%lu %lu %lu %u " - "%lu %lu %lu %u " - "%u %u %u " - "%lu %lu %lu %u " - "%lu %u" - "\n", - MAJOR(hd->bd_dev), MINOR(hd->bd_dev), hd, - stat.ios[STAT_READ], - stat.merges[STAT_READ], - stat.sectors[STAT_READ], - (unsigned int)div_u64(stat.nsecs[STAT_READ], - NSEC_PER_MSEC), - stat.ios[STAT_WRITE], - stat.merges[STAT_WRITE], - stat.sectors[STAT_WRITE], - (unsigned int)div_u64(stat.nsecs[STAT_WRITE], - NSEC_PER_MSEC), - inflight, - jiffies_to_msecs(stat.io_ticks), - (unsigned int)div_u64(stat.nsecs[STAT_READ] + - stat.nsecs[STAT_WRITE] + - stat.nsecs[STAT_DISCARD] + - stat.nsecs[STAT_FLUSH], - NSEC_PER_MSEC), - stat.ios[STAT_DISCARD], - stat.merges[STAT_DISCARD], - stat.sectors[STAT_DISCARD], - (unsigned int)div_u64(stat.nsecs[STAT_DISCARD], - NSEC_PER_MSEC), - stat.ios[STAT_FLUSH], - (unsigned int)div_u64(stat.nsecs[STAT_FLUSH], - NSEC_PER_MSEC) - ); + seq_put_decimal_ull_width(seqf, "", MAJOR(hd->bd_dev), 4); + seq_put_decimal_ull_width(seqf, " ", MINOR(hd->bd_dev), 7); + seq_printf(seqf, " %pg", hd); + seq_put_decimal_ull(seqf, " ", stat.ios[STAT_READ]); + seq_put_decimal_ull(seqf, " ", stat.merges[STAT_READ]); + seq_put_decimal_ull(seqf, " ", stat.sectors[STAT_READ]); + seq_put_decimal_ull(seqf, " ", (unsigned int)div_u64(stat.nsecs[STAT_READ], + NSEC_PER_MSEC)); + seq_put_decimal_ull(seqf, " ", stat.ios[STAT_WRITE]); + seq_put_decimal_ull(seqf, " ", stat.merges[STAT_WRITE]); + seq_put_decimal_ull(seqf, " ", stat.sectors[STAT_WRITE]); + seq_put_decimal_ull(seqf, " ", (unsigned int)div_u64(stat.nsecs[STAT_WRITE], + NSEC_PER_MSEC)); + seq_put_decimal_ull(seqf, " ", inflight); + seq_put_decimal_ull(seqf, " ", jiffies_to_msecs(stat.io_ticks)); + seq_put_decimal_ull(seqf, " ", (unsigned int)div_u64(stat.nsecs[STAT_READ] + + stat.nsecs[STAT_WRITE] + + stat.nsecs[STAT_DISCARD] + + stat.nsecs[STAT_FLUSH], + NSEC_PER_MSEC)); + seq_put_decimal_ull(seqf, " ", stat.ios[STAT_DISCARD]); + seq_put_decimal_ull(seqf, " ", stat.merges[STAT_DISCARD]); + seq_put_decimal_ull(seqf, " ", stat.sectors[STAT_DISCARD]); + seq_put_decimal_ull(seqf, " ", (unsigned int)div_u64(stat.nsecs[STAT_DISCARD], + NSEC_PER_MSEC)); + seq_put_decimal_ull(seqf, " ", stat.ios[STAT_FLUSH]); + 
seq_put_decimal_ull(seqf, " ", (unsigned int)div_u64(stat.nsecs[STAT_FLUSH], + NSEC_PER_MSEC)); + seq_putc(seqf, '\n'); } rcu_read_unlock(); -- 2.51.0 From 957860cbc1dc89f79f2acc193470224e350dfd03 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 15 Nov 2024 07:14:03 -0700 Subject: [PATCH 14/16] block: make struct rq_list available for !CONFIG_BLOCK A previous commit changed how requests are linked in the plug structure, but unlike the previous method, it uses a new type for it rather than struct request. The latter is available even for !CONFIG_BLOCK, while struct rq_list is now. Move it outside CONFIG_BLOCK. Reported-by: Nathan Chancellor Fixes: a3396b99990d ("block: add a rq_list type") Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 00212e96261a..a1fd0ddce5cf 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1006,12 +1006,12 @@ extern void blk_put_queue(struct request_queue *); void blk_mark_disk_dead(struct gendisk *disk); -#ifdef CONFIG_BLOCK struct rq_list { struct request *head; struct request *tail; }; +#ifdef CONFIG_BLOCK /* * blk_plug permits building a queue of related requests by holding the I/O * fragments for a short period. This allows merging of sequential requests -- 2.51.0 From 886e4757f42e5775b7c2a6e0e5e2dc9a78177b0a Mon Sep 17 00:00:00 2001 From: Song Liu Date: Fri, 15 Nov 2024 10:25:53 -0800 Subject: [PATCH 15/16] MAINTAINERS: Update git tree for mdraid subsystem Moving the official git tree to the MDRAID Group account. Signed-off-by: Song Liu --- MAINTAINERS | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/MAINTAINERS b/MAINTAINERS index eeaa9f59dfe3..d9f35571da0f 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -21307,7 +21307,7 @@ M: Yu Kuai L: linux-raid@vger.kernel.org S: Supported Q: https://patchwork.kernel.org/project/linux-raid/list/ -T: git git://git.kernel.org/pub/scm/linux/kernel/git/song/md.git +T: git git://git.kernel.org/pub/scm/linux/kernel/git/mdraid/linux.git F: drivers/md/Kconfig F: drivers/md/Makefile F: drivers/md/md* -- 2.51.0 From a3f143c461444c0b56360bbf468615fa814a8372 Mon Sep 17 00:00:00 2001 From: Manas Date: Mon, 18 Nov 2024 20:06:58 +0530 Subject: [PATCH 16/16] rust: block: simplify Result<()> in validate_block_size return `Result` is used in place of `Result<()>` because the default type parameters are unit `()` and `Error` types, which are automatically inferred. Thus keep the usage consistent throughout codebase. Suggested-by: Miguel Ojeda Link: https://github.com/Rust-for-Linux/linux/issues/1128 Signed-off-by: Manas Reviewed-by: Miguel Ojeda Link: https://lore.kernel.org/r/20241118-simplify-result-v3-1-6b1566a77eab@iiitd.ac.in Signed-off-by: Jens Axboe --- rust/kernel/block/mq/gen_disk.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rust/kernel/block/mq/gen_disk.rs b/rust/kernel/block/mq/gen_disk.rs index 708125dce96a..798c4ae0bded 100644 --- a/rust/kernel/block/mq/gen_disk.rs +++ b/rust/kernel/block/mq/gen_disk.rs @@ -45,7 +45,7 @@ impl GenDiskBuilder { /// Validate block size by verifying that it is between 512 and `PAGE_SIZE`, /// and that it is a power of two. - fn validate_block_size(size: u32) -> Result<()> { + fn validate_block_size(size: u32) -> Result { if !(512..=bindings::PAGE_SIZE as u32).contains(&size) || !size.is_power_of_two() { Err(error::code::EINVAL) } else { -- 2.51.0