From 08461535a9cd9757dadbae0ee3f3bbdd6e66ba09 Mon Sep 17 00:00:00 2001
From: Damien Le Moal <dlemoal@kernel.org>
Date: Sat, 4 Jan 2025 13:59:45 +0900
Subject: [PATCH 01/16] nvmet: Introduce get/set_feature controller operations

The implementation of some features cannot always be done generically by
the target core code. Arbitration and IRQ coalescing features are
examples of such features: their implementation must be provided (at
least partially) by the target controller driver.

Introduce the set_feature() and get_feature() controller fabrics
operations (in struct nvmet_fabrics_ops) to allow supporting such
features.

Signed-off-by: Damien Le Moal <dlemoal@kernel.org>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Tested-by: Rick Wertenbroek <rick.wertenbroek@gmail.com>
Tested-by: Manivannan Sadhasivam <manivannan.sadhasivam@linaro.org>
Signed-off-by: Keith Busch <kbusch@kernel.org>
---
 drivers/nvme/target/nvmet.h | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/drivers/nvme/target/nvmet.h b/drivers/nvme/target/nvmet.h
index 86bb2852a63b..8325de3382ee 100644
--- a/drivers/nvme/target/nvmet.h
+++ b/drivers/nvme/target/nvmet.h
@@ -416,6 +416,10 @@ struct nvmet_fabrics_ops {
 	u16 (*create_cq)(struct nvmet_ctrl *ctrl, u16 cqid, u16 flags,
 			 u16 qsize, u64 prp1, u16 irq_vector);
 	u16 (*delete_cq)(struct nvmet_ctrl *ctrl, u16 cqid);
+	u16 (*set_feature)(const struct nvmet_ctrl *ctrl, u8 feat,
+			   void *feat_data);
+	u16 (*get_feature)(const struct nvmet_ctrl *ctrl, u8 feat,
+			   void *feat_data);
 };
 
 #define NVMET_MAX_INLINE_BIOVEC	8
-- 
2.50.1


From 2f2b20fad973d00169d24f5338eb1bf0a42e8218 Mon Sep 17 00:00:00 2001
From: Damien Le Moal <dlemoal@kernel.org>
Date: Sat, 4 Jan 2025 13:59:46 +0900
Subject: [PATCH 02/16] nvmet: Implement host identifier set feature support

The NVMe specifications mandate support for the host identifier
set_features for controllers that also support reservations. Satisfy
this requirement by implementing handling of the NVME_FEAT_HOST_ID
feature for the nvme_set_features command. This implementation is for
now effective only for PCI target controllers. For other controller
types, the set features command is failed with a NVME_SC_CMD_SEQ_ERROR
status as before.

As noted in the code, 128 bits host identifiers are supported since the
NVMe base specifications version 2.1 indicate in section 5.1.25.1.28.1
that "The controller may support a 64-bit Host Identifier...".

The RHII (Reservations and Host Identifier Interaction) bit of the
controller attribute (ctratt) field of the identify controller data is
also set to indicate that a host ID of "0" is supported but that the
host ID must be a non-zero value to use reservations.

Signed-off-by: Damien Le Moal <dlemoal@kernel.org>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Tested-by: Rick Wertenbroek <rick.wertenbroek@gmail.com>
Tested-by: Manivannan Sadhasivam <manivannan.sadhasivam@linaro.org>
Signed-off-by: Keith Busch <kbusch@kernel.org>
---
 drivers/nvme/target/admin-cmd.c | 35 +++++++++++++++++++++++++++++----
 include/linux/nvme.h            |  1 +
 2 files changed, 32 insertions(+), 4 deletions(-)

diff --git a/drivers/nvme/target/admin-cmd.c b/drivers/nvme/target/admin-cmd.c
index 0c5127a1d191..efef3acba9fb 100644
--- a/drivers/nvme/target/admin-cmd.c
+++ b/drivers/nvme/target/admin-cmd.c
@@ -659,7 +659,7 @@ static void nvmet_execute_identify_ctrl(struct nvmet_req *req)
 	struct nvmet_ctrl *ctrl = req->sq->ctrl;
 	struct nvmet_subsys *subsys = ctrl->subsys;
 	struct nvme_id_ctrl *id;
-	u32 cmd_capsule_size;
+	u32 cmd_capsule_size, ctratt;
 	u16 status = 0;
 
 	if (!subsys->subsys_discovered) {
@@ -707,8 +707,10 @@ static void nvmet_execute_identify_ctrl(struct nvmet_req *req)
 
 	/* XXX: figure out what to do about RTD3R/RTD3 */
 	id->oaes = cpu_to_le32(NVMET_AEN_CFG_OPTIONAL);
-	id->ctratt = cpu_to_le32(NVME_CTRL_ATTR_HID_128_BIT |
-		NVME_CTRL_ATTR_TBKAS);
+	ctratt = NVME_CTRL_ATTR_HID_128_BIT | NVME_CTRL_ATTR_TBKAS;
+	if (nvmet_is_pci_ctrl(ctrl))
+		ctratt |= NVME_CTRL_ATTR_RHII;
+	id->ctratt = cpu_to_le32(ctratt);
 
 	id->oacs = 0;
 
@@ -1255,6 +1257,31 @@ u16 nvmet_set_feat_async_event(struct nvmet_req *req, u32 mask)
 	return 0;
 }
 
+static u16 nvmet_set_feat_host_id(struct nvmet_req *req)
+{
+	struct nvmet_ctrl *ctrl = req->sq->ctrl;
+
+	if (!nvmet_is_pci_ctrl(ctrl))
+		return NVME_SC_CMD_SEQ_ERROR | NVME_STATUS_DNR;
+
+	/*
+	 * The NVMe base specifications v2.1 recommends supporting 128-bits host
+	 * IDs (section 5.1.25.1.28.1). However, that same section also says
+	 * that "The controller may support a 64-bit Host Identifier and/or an
+	 * extended 128-bit Host Identifier". So simplify this support and do
+	 * not support 64-bits host IDs to avoid needing to check that all
+	 * controllers associated with the same subsystem all use the same host
+	 * ID size.
+	 */
+	if (!(req->cmd->common.cdw11 & cpu_to_le32(1 << 0))) {
+		req->error_loc = offsetof(struct nvme_common_command, cdw11);
+		return NVME_SC_INVALID_FIELD | NVME_STATUS_DNR;
+	}
+
+	return nvmet_copy_from_sgl(req, 0, &req->sq->ctrl->hostid,
+				   sizeof(req->sq->ctrl->hostid));
+}
+
 void nvmet_execute_set_features(struct nvmet_req *req)
 {
 	struct nvmet_subsys *subsys = nvmet_req_subsys(req);
@@ -1285,7 +1312,7 @@ void nvmet_execute_set_features(struct nvmet_req *req)
 		status = nvmet_set_feat_async_event(req, NVMET_AEN_CFG_ALL);
 		break;
 	case NVME_FEAT_HOST_ID:
-		status = NVME_SC_CMD_SEQ_ERROR | NVME_STATUS_DNR;
+		status = nvmet_set_feat_host_id(req);
 		break;
 	case NVME_FEAT_WRITE_PROTECT:
 		status = nvmet_set_feat_write_protect(req);
diff --git a/include/linux/nvme.h b/include/linux/nvme.h
index 42fc00dc494e..fe3b60818fdc 100644
--- a/include/linux/nvme.h
+++ b/include/linux/nvme.h
@@ -276,6 +276,7 @@ enum nvme_ctrl_attr {
 	NVME_CTRL_ATTR_HID_128_BIT	= (1 << 0),
 	NVME_CTRL_ATTR_TBKAS		= (1 << 6),
 	NVME_CTRL_ATTR_ELBAS		= (1 << 15),
+	NVME_CTRL_ATTR_RHII		= (1 << 18),
 };
 
 struct nvme_id_ctrl {
-- 
2.50.1


From 89b94a6cbeff4f184fc1ec3b9563b371ee617511 Mon Sep 17 00:00:00 2001
From: Damien Le Moal <dlemoal@kernel.org>
Date: Sat, 4 Jan 2025 13:59:47 +0900
Subject: [PATCH 03/16] nvmet: Implement interrupt coalescing feature support

The NVMe base specifications v2.1 mandate supporting the interrupt
coalescing feature (NVME_FEAT_IRQ_COALESCE) for PCI controllers.
Introduce the data structure struct nvmet_feat_irq_coalesce to define
the time and threshold (thr) fields of this feature and implement the
functions nvmet_get_feat_irq_coalesce() and
nvmet_set_feat_irq_coalesce() to get and set this feature. These
functions respectively use the controller get_feature() and
set_feature() operations to fill and handle the fields of struct
nvmet_feat_irq_coalesce.

While the Linux kernel nvme driver does not use this feature and thus
will not complain if it is not implemented, other major OSes fail
initializing the NVMe device if this feature support is missing.

Support for this feature is prohibited for fabrics controllers. If a get
feature or set feature command for this feature is received for a
fabrics controller, the command is failed with an invalid field error.

Suggested-by: Rick Wertenbroek <rick.wertenbroek@gmail.com>
Signed-off-by: Damien Le Moal <dlemoal@kernel.org>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Tested-by: Rick Wertenbroek <rick.wertenbroek@gmail.com>
Tested-by: Manivannan Sadhasivam <manivannan.sadhasivam@linaro.org>
Signed-off-by: Keith Busch <kbusch@kernel.org>
---
 drivers/nvme/target/admin-cmd.c | 53 +++++++++++++++++++++++++++++++--
 drivers/nvme/target/nvmet.h     | 10 +++++++
 2 files changed, 61 insertions(+), 2 deletions(-)

diff --git a/drivers/nvme/target/admin-cmd.c b/drivers/nvme/target/admin-cmd.c
index efef3acba9fb..eff9fd2e81ed 100644
--- a/drivers/nvme/target/admin-cmd.c
+++ b/drivers/nvme/target/admin-cmd.c
@@ -1282,6 +1282,27 @@ static u16 nvmet_set_feat_host_id(struct nvmet_req *req)
 				   sizeof(req->sq->ctrl->hostid));
 }
 
+static u16 nvmet_set_feat_irq_coalesce(struct nvmet_req *req)
+{
+	struct nvmet_ctrl *ctrl = req->sq->ctrl;
+	u32 cdw11 = le32_to_cpu(req->cmd->common.cdw11);
+	struct nvmet_feat_irq_coalesce irqc = {
+		.time = (cdw11 >> 8) & 0xff,
+		.thr = cdw11 & 0xff,
+	};
+
+	/*
+	 * This feature is not supported for fabrics controllers and mandatory
+	 * for PCI controllers.
+	 */
+	if (!nvmet_is_pci_ctrl(ctrl)) {
+		req->error_loc = offsetof(struct nvme_common_command, cdw10);
+		return NVME_SC_INVALID_FIELD | NVME_STATUS_DNR;
+	}
+
+	return ctrl->ops->set_feature(ctrl, NVME_FEAT_IRQ_COALESCE, &irqc);
+}
+
 void nvmet_execute_set_features(struct nvmet_req *req)
 {
 	struct nvmet_subsys *subsys = nvmet_req_subsys(req);
@@ -1305,6 +1326,9 @@ void nvmet_execute_set_features(struct nvmet_req *req)
 		nvmet_set_result(req,
 			(subsys->max_qid - 1) | ((subsys->max_qid - 1) << 16));
 		break;
+	case NVME_FEAT_IRQ_COALESCE:
+		status = nvmet_set_feat_irq_coalesce(req);
+		break;
 	case NVME_FEAT_KATO:
 		status = nvmet_set_feat_kato(req);
 		break;
@@ -1349,6 +1373,30 @@ static u16 nvmet_get_feat_write_protect(struct nvmet_req *req)
 	return 0;
 }
 
+static u16 nvmet_get_feat_irq_coalesce(struct nvmet_req *req)
+{
+	struct nvmet_ctrl *ctrl = req->sq->ctrl;
+	struct nvmet_feat_irq_coalesce irqc = { };
+	u16 status;
+
+	/*
+	 * This feature is not supported for fabrics controllers and mandatory
+	 * for PCI controllers.
+	 */
+	if (!nvmet_is_pci_ctrl(ctrl)) {
+		req->error_loc = offsetof(struct nvme_common_command, cdw10);
+		return NVME_SC_INVALID_FIELD | NVME_STATUS_DNR;
+	}
+
+	status = ctrl->ops->get_feature(ctrl, NVME_FEAT_IRQ_COALESCE, &irqc);
+	if (status != NVME_SC_SUCCESS)
+		return status;
+
+	nvmet_set_result(req, ((u32)irqc.time << 8) | (u32)irqc.thr);
+
+	return NVME_SC_SUCCESS;
+}
+
 void nvmet_get_feat_kato(struct nvmet_req *req)
 {
 	nvmet_set_result(req, req->sq->ctrl->kato * 1000);
@@ -1383,13 +1431,14 @@ void nvmet_execute_get_features(struct nvmet_req *req)
 		break;
 	case NVME_FEAT_ERR_RECOVERY:
 		break;
-	case NVME_FEAT_IRQ_COALESCE:
-		break;
 	case NVME_FEAT_IRQ_CONFIG:
 		break;
 	case NVME_FEAT_WRITE_ATOMIC:
 		break;
 #endif
+	case NVME_FEAT_IRQ_COALESCE:
+		status = nvmet_get_feat_irq_coalesce(req);
+		break;
 	case NVME_FEAT_ASYNC_EVENT:
 		nvmet_get_feat_async_event(req);
 		break;
diff --git a/drivers/nvme/target/nvmet.h b/drivers/nvme/target/nvmet.h
index 8325de3382ee..555c09b11dbe 100644
--- a/drivers/nvme/target/nvmet.h
+++ b/drivers/nvme/target/nvmet.h
@@ -906,4 +906,14 @@ static inline void nvmet_pr_put_ns_pc_ref(struct nvmet_pr_per_ctrl_ref *pc_ref)
 {
 	percpu_ref_put(&pc_ref->ref);
 }
+
+/*
+ * Data for the get_feature() and set_feature() operations of PCI target
+ * controllers.
+ */
+struct nvmet_feat_irq_coalesce {
+	u8		thr;
+	u8		time;
+};
+
 #endif /* _NVMET_H */
-- 
2.50.1


From f1ecd491b6e71d598172f29d9c6c8735b81d2566 Mon Sep 17 00:00:00 2001
From: Damien Le Moal <dlemoal@kernel.org>
Date: Sat, 4 Jan 2025 13:59:48 +0900
Subject: [PATCH 04/16] nvmet: Implement interrupt config feature support

The NVMe base specifications v2.1 mandate supporting the interrupt
config feature (NVME_FEAT_IRQ_CONFIG) for PCI controllers. Introduce the
data structure struct nvmet_feat_irq_config to define the coalescing
disabled (cd) and interrupt vector (iv) fields of this feature and
implement the functions nvmet_get_feat_irq_config() and
nvmet_set_feat_irq_config() to get and set these fields. These
functions respectively use the controller get_feature() and
set_feature() operations to fill and handle the fields of struct
nvmet_feat_irq_config.

Support for this feature is prohibited for fabrics controllers. If a get
feature command or a set feature command for this feature is received
for a fabrics controller, the command is failed with an invalid field
error.

Signed-off-by: Damien Le Moal <dlemoal@kernel.org>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Tested-by: Rick Wertenbroek <rick.wertenbroek@gmail.com>
Tested-by: Manivannan Sadhasivam <manivannan.sadhasivam@linaro.org>
Signed-off-by: Keith Busch <kbusch@kernel.org>
---
 drivers/nvme/target/admin-cmd.c | 54 +++++++++++++++++++++++++++++++--
 drivers/nvme/target/nvmet.h     |  5 +++
 2 files changed, 57 insertions(+), 2 deletions(-)

diff --git a/drivers/nvme/target/admin-cmd.c b/drivers/nvme/target/admin-cmd.c
index eff9fd2e81ed..8b8ec33330b2 100644
--- a/drivers/nvme/target/admin-cmd.c
+++ b/drivers/nvme/target/admin-cmd.c
@@ -1303,6 +1303,27 @@ static u16 nvmet_set_feat_irq_coalesce(struct nvmet_req *req)
 	return ctrl->ops->set_feature(ctrl, NVME_FEAT_IRQ_COALESCE, &irqc);
 }
 
+static u16 nvmet_set_feat_irq_config(struct nvmet_req *req)
+{
+	struct nvmet_ctrl *ctrl = req->sq->ctrl;
+	u32 cdw11 = le32_to_cpu(req->cmd->common.cdw11);
+	struct nvmet_feat_irq_config irqcfg = {
+		.iv = cdw11 & 0xffff,
+		.cd = (cdw11 >> 16) & 0x1,
+	};
+
+	/*
+	 * This feature is not supported for fabrics controllers and mandatory
+	 * for PCI controllers.
+	 */
+	if (!nvmet_is_pci_ctrl(ctrl)) {
+		req->error_loc = offsetof(struct nvme_common_command, cdw10);
+		return NVME_SC_INVALID_FIELD | NVME_STATUS_DNR;
+	}
+
+	return ctrl->ops->set_feature(ctrl, NVME_FEAT_IRQ_CONFIG, &irqcfg);
+}
+
 void nvmet_execute_set_features(struct nvmet_req *req)
 {
 	struct nvmet_subsys *subsys = nvmet_req_subsys(req);
@@ -1329,6 +1350,9 @@ void nvmet_execute_set_features(struct nvmet_req *req)
 	case NVME_FEAT_IRQ_COALESCE:
 		status = nvmet_set_feat_irq_coalesce(req);
 		break;
+	case NVME_FEAT_IRQ_CONFIG:
+		status = nvmet_set_feat_irq_config(req);
+		break;
 	case NVME_FEAT_KATO:
 		status = nvmet_set_feat_kato(req);
 		break;
@@ -1397,6 +1421,31 @@ static u16 nvmet_get_feat_irq_coalesce(struct nvmet_req *req)
 	return NVME_SC_SUCCESS;
 }
 
+static u16 nvmet_get_feat_irq_config(struct nvmet_req *req)
+{
+	struct nvmet_ctrl *ctrl = req->sq->ctrl;
+	u32 iv = le32_to_cpu(req->cmd->common.cdw11) & 0xffff;
+	struct nvmet_feat_irq_config irqcfg = { .iv = iv };
+	u16 status;
+
+	/*
+	 * This feature is not supported for fabrics controllers and mandatory
+	 * for PCI controllers.
+	 */
+	if (!nvmet_is_pci_ctrl(ctrl)) {
+		req->error_loc = offsetof(struct nvme_common_command, cdw10);
+		return NVME_SC_INVALID_FIELD | NVME_STATUS_DNR;
+	}
+
+	status = ctrl->ops->get_feature(ctrl, NVME_FEAT_IRQ_CONFIG, &irqcfg);
+	if (status != NVME_SC_SUCCESS)
+		return status;
+
+	nvmet_set_result(req, ((u32)irqcfg.cd << 16) | iv);
+
+	return NVME_SC_SUCCESS;
+}
+
 void nvmet_get_feat_kato(struct nvmet_req *req)
 {
 	nvmet_set_result(req, req->sq->ctrl->kato * 1000);
@@ -1431,14 +1480,15 @@ void nvmet_execute_get_features(struct nvmet_req *req)
 		break;
 	case NVME_FEAT_ERR_RECOVERY:
 		break;
-	case NVME_FEAT_IRQ_CONFIG:
-		break;
 	case NVME_FEAT_WRITE_ATOMIC:
 		break;
 #endif
 	case NVME_FEAT_IRQ_COALESCE:
 		status = nvmet_get_feat_irq_coalesce(req);
 		break;
+	case NVME_FEAT_IRQ_CONFIG:
+		status = nvmet_get_feat_irq_config(req);
+		break;
 	case NVME_FEAT_ASYNC_EVENT:
 		nvmet_get_feat_async_event(req);
 		break;
diff --git a/drivers/nvme/target/nvmet.h b/drivers/nvme/target/nvmet.h
index 555c09b11dbe..999a4ebf597e 100644
--- a/drivers/nvme/target/nvmet.h
+++ b/drivers/nvme/target/nvmet.h
@@ -916,4 +916,9 @@ struct nvmet_feat_irq_coalesce {
 	u8		time;
 };
 
+struct nvmet_feat_irq_config {
+	u16		iv;
+	bool		cd;
+};
+
 #endif /* _NVMET_H */
-- 
2.50.1


From a0ed77d4c9a7745ac5dca35d563d6096787ae942 Mon Sep 17 00:00:00 2001
From: Damien Le Moal <dlemoal@kernel.org>
Date: Sat, 4 Jan 2025 13:59:49 +0900
Subject: [PATCH 05/16] nvmet: Implement arbitration feature support

NVMe base specification v2.1 mandates support for the arbitration
feature (NVME_FEAT_ARBITRATION). Introduce the data structure
struct nvmet_feat_arbitration to define the high, medium and low
priority weight fields and the arbitration burst field of this feature
and implement the functions nvmet_get_feat_arbitration() and
nvmet_set_feat_arbitration() to get and set these fields.

Since there is no generic way to implement support for the arbitration
feature, these functions respectively use the controller get_feature()
and set_feature() operations to process the feature with the help of
the controller driver. If the controller driver does not implement these
operations and a get feature command or a set feature command for this
feature is received, the command is failed with an invalid field error.

Signed-off-by: Damien Le Moal <dlemoal@kernel.org>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Tested-by: Rick Wertenbroek <rick.wertenbroek@gmail.com>
Tested-by: Manivannan Sadhasivam <manivannan.sadhasivam@linaro.org>
Signed-off-by: Keith Busch <kbusch@kernel.org>
---
 drivers/nvme/target/admin-cmd.c | 51 +++++++++++++++++++++++++++++++--
 drivers/nvme/target/nvmet.h     |  7 +++++
 2 files changed, 56 insertions(+), 2 deletions(-)

diff --git a/drivers/nvme/target/admin-cmd.c b/drivers/nvme/target/admin-cmd.c
index 8b8ec33330b2..3ddd8e44e148 100644
--- a/drivers/nvme/target/admin-cmd.c
+++ b/drivers/nvme/target/admin-cmd.c
@@ -1324,6 +1324,25 @@ static u16 nvmet_set_feat_irq_config(struct nvmet_req *req)
 	return ctrl->ops->set_feature(ctrl, NVME_FEAT_IRQ_CONFIG, &irqcfg);
 }
 
+static u16 nvmet_set_feat_arbitration(struct nvmet_req *req)
+{
+	struct nvmet_ctrl *ctrl = req->sq->ctrl;
+	u32 cdw11 = le32_to_cpu(req->cmd->common.cdw11);
+	struct nvmet_feat_arbitration arb = {
+		.hpw = (cdw11 >> 24) & 0xff,
+		.mpw = (cdw11 >> 16) & 0xff,
+		.lpw = (cdw11 >> 8) & 0xff,
+		.ab = cdw11 & 0x7,	/* AB: bits 2:0, 111b = no limit */
+	};
+
+	if (!ctrl->ops->set_feature) {
+		req->error_loc = offsetof(struct nvme_common_command, cdw10);
+		return NVME_SC_INVALID_FIELD | NVME_STATUS_DNR;
+	}
+
+	return ctrl->ops->set_feature(ctrl, NVME_FEAT_ARBITRATION, &arb);
+}
+
 void nvmet_execute_set_features(struct nvmet_req *req)
 {
 	struct nvmet_subsys *subsys = nvmet_req_subsys(req);
@@ -1337,6 +1356,9 @@ void nvmet_execute_set_features(struct nvmet_req *req)
 		return;
 
 	switch (cdw10 & 0xff) {
+	case NVME_FEAT_ARBITRATION:
+		status = nvmet_set_feat_arbitration(req);
+		break;
 	case NVME_FEAT_NUM_QUEUES:
 		ncqr = (cdw11 >> 16) & 0xffff;
 		nsqr = cdw11 & 0xffff;
@@ -1446,6 +1468,30 @@ static u16 nvmet_get_feat_irq_config(struct nvmet_req *req)
 	return NVME_SC_SUCCESS;
 }
 
+static u16 nvmet_get_feat_arbitration(struct nvmet_req *req)
+{
+	struct nvmet_ctrl *ctrl = req->sq->ctrl;
+	struct nvmet_feat_arbitration arb = { };
+	u16 status;
+
+	if (!ctrl->ops->get_feature) {
+		req->error_loc = offsetof(struct nvme_common_command, cdw10);
+		return NVME_SC_INVALID_FIELD | NVME_STATUS_DNR;
+	}
+
+	status = ctrl->ops->get_feature(ctrl, NVME_FEAT_ARBITRATION, &arb);
+	if (status != NVME_SC_SUCCESS)
+		return status;
+
+	nvmet_set_result(req,
+			 ((u32)arb.hpw << 24) |
+			 ((u32)arb.mpw << 16) |
+			 ((u32)arb.lpw << 8) |
+			 (arb.ab & 0x7));	/* AB: bits 2:0 */
+
+	return NVME_SC_SUCCESS;
+}
+
 void nvmet_get_feat_kato(struct nvmet_req *req)
 {
 	nvmet_set_result(req, req->sq->ctrl->kato * 1000);
@@ -1472,8 +1518,6 @@ void nvmet_execute_get_features(struct nvmet_req *req)
 	 * need to come up with some fake values for these.
 	 */
 #if 0
-	case NVME_FEAT_ARBITRATION:
-		break;
 	case NVME_FEAT_POWER_MGMT:
 		break;
 	case NVME_FEAT_TEMP_THRESH:
@@ -1483,6 +1527,9 @@ void nvmet_execute_get_features(struct nvmet_req *req)
 	case NVME_FEAT_WRITE_ATOMIC:
 		break;
 #endif
+	case NVME_FEAT_ARBITRATION:
+		status = nvmet_get_feat_arbitration(req);
+		break;
 	case NVME_FEAT_IRQ_COALESCE:
 		status = nvmet_get_feat_irq_coalesce(req);
 		break;
diff --git a/drivers/nvme/target/nvmet.h b/drivers/nvme/target/nvmet.h
index 999a4ebf597e..f4df458df9db 100644
--- a/drivers/nvme/target/nvmet.h
+++ b/drivers/nvme/target/nvmet.h
@@ -921,4 +921,11 @@ struct nvmet_feat_irq_config {
 	bool		cd;
 };
 
+struct nvmet_feat_arbitration {
+	u8		hpw;
+	u8		mpw;
+	u8		lpw;
+	u8		ab;
+};
+
 #endif /* _NVMET_H */
-- 
2.50.1


From 0faa0fe6f90ea59b10d1b0f15ce0eb0c18eff186 Mon Sep 17 00:00:00 2001
From: Damien Le Moal <dlemoal@kernel.org>
Date: Sat, 4 Jan 2025 13:59:50 +0900
Subject: [PATCH 06/16] nvmet: New NVMe PCI endpoint function target driver
MIME-Version: 1.0
Content-Type: text/plain; charset=utf8
Content-Transfer-Encoding: 8bit

Implement a PCI target driver using the PCI endpoint framework. This
requires hardware with a PCI controller capable of executing in endpoint
mode.

The PCI endpoint framework is used to set up a PCI endpoint function
and its BAR compatible with a NVMe PCI controller. The framework is also
used to map local memory to the PCI address space to execute MMIO
accesses for retrieving NVMe commands from submission queues and posting
completion entries to completion queues. If supported, DMA is used for
command retrieval and command data transfers, based on the PCI address
segments indicated by the command using either PRPs or SGLs.

The NVMe target driver relies on the NVMe target core code to execute
all commands issued by the host. The PCI target driver is mainly
responsible for the following:
 - Initialization and teardown of the endpoint device and its backend
   PCI target controller. The PCI target controller is created using a
   subsystem and a port defined through configfs. The port used must be
   initialized with the "pci" transport type. The target controller is
   allocated and initialized when the PCI endpoint is started by binding
   it to the endpoint PCI device (nvmet_pci_epf_epc_init() function).

 - Manage the endpoint controller state according to the PCI link state
   and the actions of the host (e.g. checking the CC.EN register) and
   propagate these actions to the PCI target controller. Polling of the
   controller enable/disable is done using a delayed work scheduled
   every 5ms (nvmet_pci_epf_poll_cc() function). This work is started
   whenever the PCI link comes up (nvmet_pci_epf_link_up() notifier
   function) and stopped when the PCI link comes down
   (nvmet_pci_epf_link_down() notifier function).
   nvmet_pci_epf_poll_cc() enables and disables the PCI controller using
   the functions nvmet_pci_epf_enable_ctrl() and
   nvmet_pci_epf_disable_ctrl(). The controller admin queue is created
   using nvmet_pci_epf_create_cq(), which calls nvmet_cq_create(), and
   nvmet_pci_epf_create_sq() which uses nvmet_sq_create().
   nvmet_pci_epf_disable_ctrl() always resets the PCI controller to its
   initial state so that nvmet_pci_epf_enable_ctrl() can be called
   again. This ensures correct operation if, for instance, the host
   reboots causing the PCI link to be temporarily down.

 - Manage the controller admin and I/O submission queues using local
   memory. Commands are obtained from submission queues using a work
   item that constantly polls the doorbells of all submission queues
   (nvmet_pci_epf_poll_sqs() function). This work is started whenever
   the controller is enabled (nvmet_pci_epf_enable_ctrl() function) and
   stopped when the controller is disabled (nvmet_pci_epf_disable_ctrl()
   function). When new commands are submitted by the host, DMA transfers
   are used to retrieve the commands.

 - Initiate the execution of all admin and I/O commands using the target
   core code, by calling a request's execute() function. All commands are
   individually handled using a per-command work item
   (nvmet_pci_epf_iod_work() function). A command overall execution
   includes: initializing a struct nvmet_req request for the command,
   using nvmet_req_transfer_len() to get a command data transfer length,
   parsing the command PRPs or SGLs to get the PCI address segments of
   the command data buffer, retrieving data from the host (if the command
   is a write command), calling req->execute() to execute the command and
   transferring data to the host (for read commands).

 - Handle the completions of commands as notified by the
   ->queue_response() operation of the PCI target controller
   (nvmet_pci_epf_queue_response() function). Completed commands are
   added to a list of completed commands for their CQ. Each CQ list of
   completed commands is processed using a work item
   (nvmet_pci_epf_cq_work() function) which posts entries for the
   completed commands in the CQ memory and raises an IRQ to the host to
   signal the completion. IRQ coalescing is supported as mandated by the
   NVMe base specification for PCI controllers. Of note is that
   completion entries are transmitted to the host using MMIO, after
   mapping the completion queue memory to the host PCI address space.
   Unlike for retrieving commands from SQs, DMA is not used as it
   degrades performance due to the transfer serialization needed (which
   delays completion entries transmission).

The configuration of a NVMe PCI endpoint controller is done using
configfs. First the NVMe PCI target controller configuration must be
done to set up a subsystem and a port with the "pci" addr_trtype
attribute. The subsystem can be setup using a file or block device
backed namespace or using a passthrough NVMe device. After this, the
PCI endpoint can be configured and bound to the PCI endpoint controller
to start the NVMe endpoint controller.

In order to not overcomplicate this initial implementation of an
endpoint PCI target controller driver, protection information is not
supported for now. If the PCI controller port and namespace are
configured with protection information support, an error will be
returned when the controller is created and initialized when the
endpoint function is started. Protection information support will be
added in a follow-up patch series.

Using a Rock5B board (Rockchip RK3588 SoC, PCI Gen3x4 endpoint
controller) with a target PCI controller setup with 4 I/O queues and a
null_blk block device as a namespace, the maximum performance using fio
was measured at 131 KIOPS for random 4K reads and up to 2.8 GB/S
throughput. Some data points are:

Rnd read,   4KB,  QD=1, 1 job : IOPS=16.9k, BW=66.2MiB/s (69.4MB/s)
Rnd read,   4KB, QD=32, 1 job : IOPS=78.5k, BW=307MiB/s (322MB/s)
Rnd read,   4KB, QD=32, 4 jobs: IOPS=131k, BW=511MiB/s (536MB/s)
Seq read, 512KB, QD=32, 1 job : IOPS=5381, BW=2691MiB/s (2821MB/s)

The NVMe PCI endpoint target driver is not intended for production use.
It is a tool for learning NVMe, exploring existing features and testing
implementations of new NVMe features.

Co-developed-by: Rick Wertenbroek <rick.wertenbroek@gmail.com>
Signed-off-by: Damien Le Moal <dlemoal@kernel.org>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Manivannan Sadhasivam <manivannan.sadhasivam@linaro.org>
Tested-by: Manivannan Sadhasivam <manivannan.sadhasivam@linaro.org>
Reviewed-by: Krzysztof Wilczyński <kwilczynski@kernel.org>
Signed-off-by: Keith Busch <kbusch@kernel.org>
---
 drivers/nvme/target/Kconfig   |   11 +
 drivers/nvme/target/Makefile  |    2 +
 drivers/nvme/target/pci-epf.c | 2591 +++++++++++++++++++++++++++++++++
 3 files changed, 2604 insertions(+)
 create mode 100644 drivers/nvme/target/pci-epf.c

diff --git a/drivers/nvme/target/Kconfig b/drivers/nvme/target/Kconfig
index 46be031f91b4..fb7446d6d682 100644
--- a/drivers/nvme/target/Kconfig
+++ b/drivers/nvme/target/Kconfig
@@ -115,3 +115,14 @@ config NVME_TARGET_AUTH
 	  target side.
 
 	  If unsure, say N.
+
+config NVME_TARGET_PCI_EPF
+	tristate "NVMe PCI Endpoint Function target support"
+	depends on NVME_TARGET && PCI_ENDPOINT
+	depends on NVME_CORE=y || NVME_CORE=NVME_TARGET
+	help
+	  This enables the NVMe PCI Endpoint Function target driver support,
+	  which allows creating a NVMe PCI controller using an endpoint mode
+	  capable PCI controller.
+
+	  If unsure, say N.
diff --git a/drivers/nvme/target/Makefile b/drivers/nvme/target/Makefile
index f2b025bbe10c..ed8522911d1f 100644
--- a/drivers/nvme/target/Makefile
+++ b/drivers/nvme/target/Makefile
@@ -8,6 +8,7 @@ obj-$(CONFIG_NVME_TARGET_RDMA)		+= nvmet-rdma.o
 obj-$(CONFIG_NVME_TARGET_FC)		+= nvmet-fc.o
 obj-$(CONFIG_NVME_TARGET_FCLOOP)	+= nvme-fcloop.o
 obj-$(CONFIG_NVME_TARGET_TCP)		+= nvmet-tcp.o
+obj-$(CONFIG_NVME_TARGET_PCI_EPF)	+= nvmet-pci-epf.o
 
 nvmet-y		+= core.o configfs.o admin-cmd.o fabrics-cmd.o \
 			discovery.o io-cmd-file.o io-cmd-bdev.o pr.o
@@ -20,4 +21,5 @@ nvmet-rdma-y	+= rdma.o
 nvmet-fc-y	+= fc.o
 nvme-fcloop-y	+= fcloop.o
 nvmet-tcp-y	+= tcp.o
+nvmet-pci-epf-y	+= pci-epf.o
 nvmet-$(CONFIG_TRACING)	+= trace.o
diff --git a/drivers/nvme/target/pci-epf.c b/drivers/nvme/target/pci-epf.c
new file mode 100644
index 000000000000..ac30b42cc622
--- /dev/null
+++ b/drivers/nvme/target/pci-epf.c
@@ -0,0 +1,2591 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * NVMe PCI Endpoint Function target driver.
+ *
+ * Copyright (c) 2024, Western Digital Corporation or its affiliates.
+ * Copyright (c) 2024, Rick Wertenbroek <rick.wertenbroek@gmail.com>
+ *                     REDS Institute, HEIG-VD, HES-SO, Switzerland
+ */
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/delay.h>
+#include <linux/dmaengine.h>
+#include <linux/io.h>
+#include <linux/mempool.h>
+#include <linux/module.h>
+#include <linux/mutex.h>
+#include <linux/nvme.h>
+#include <linux/pci_ids.h>
+#include <linux/pci-epc.h>
+#include <linux/pci-epf.h>
+#include <linux/pci_regs.h>
+#include <linux/slab.h>
+
+#include "nvmet.h"
+
+static LIST_HEAD(nvmet_pci_epf_ports);	/* Filled by nvmet_pci_epf_add_port() */
+static DEFINE_MUTEX(nvmet_pci_epf_ports_mutex);	/* Protects the port list */
+
+/*
+ * Default and maximum allowed data transfer size, in KiB. For the default,
+ * allow up to 128 page-sized segments. For the maximum allowed,
+ * use 4 times the default (which is completely arbitrary).
+ */
+#define NVMET_PCI_EPF_MAX_SEGS		128
+#define NVMET_PCI_EPF_MDTS_KB		\
+	(NVMET_PCI_EPF_MAX_SEGS << (PAGE_SHIFT - 10))
+#define NVMET_PCI_EPF_MAX_MDTS_KB	(NVMET_PCI_EPF_MDTS_KB * 4)
+
+/*
+ * IRQ vector coalescing threshold: by default, post 8 CQEs before raising an
+ * interrupt vector to the host. This default 8 is completely arbitrary and can
+ * be changed by the host with a nvme_set_features command.
+ */
+#define NVMET_PCI_EPF_IV_THRESHOLD	8
+
+/*
+ * BAR CC register and SQ polling intervals, in jiffies.
+ */
+#define NVMET_PCI_EPF_CC_POLL_INTERVAL	msecs_to_jiffies(5)
+#define NVMET_PCI_EPF_SQ_POLL_INTERVAL	msecs_to_jiffies(5)
+#define NVMET_PCI_EPF_SQ_POLL_IDLE	msecs_to_jiffies(5000)
+
+/*
+ * SQ arbitration burst default: fetch at most 8 commands at a time from an SQ.
+ */
+#define NVMET_PCI_EPF_SQ_AB		8
+
+/*
+ * Handling of CQs is normally immediate, unless we fail to map a CQ or the CQ
+ * is full, in which case we retry the CQ processing after this interval.
+ */
+#define NVMET_PCI_EPF_CQ_RETRY_INTERVAL	msecs_to_jiffies(1)
+
+enum nvmet_pci_epf_queue_flags {	/* Bit numbers for queue->flags */
+	NVMET_PCI_EPF_Q_IS_SQ = 0,	/* The queue is a submission queue */
+	NVMET_PCI_EPF_Q_LIVE,		/* The queue was created, not deleted */
+	NVMET_PCI_EPF_Q_IRQ_ENABLED,	/* IRQ is enabled for this queue */
+};
+
+/*
+ * IRQ vector descriptor, reference counted by the CQs using the same vector.
+ */
+struct nvmet_pci_epf_irq_vector {
+	unsigned int	vector;		/* IRQ vector number */
+	unsigned int	ref;		/* Reference count; 0 if unused */
+	bool		cd;		/* If set, IRQs are never coalesced */
+	int		nr_irqs;	/* Coalesced IRQs not yet raised */
+};
+
+struct nvmet_pci_epf_queue {
+	union {
+		struct nvmet_sq		nvme_sq;
+		struct nvmet_cq		nvme_cq;
+	};
+	struct nvmet_pci_epf_ctrl	*ctrl;
+	unsigned long			flags;
+	/* PCI address of the queue; for a CQ, pci_size is qes * depth. */
+	u64				pci_addr;
+	size_t				pci_size;
+	struct pci_epc_map		pci_map;
+	/* depth is the 0's based queue size + 1; db is a BAR offset. */
+	u16				qid;
+	u16				depth;
+	u16				vector;
+	u16				head;
+	u16				tail;
+	u16				phase;
+	u32				db;
+	/* Queue entry size in bytes. */
+	size_t				qes;
+	/* For a CQ: iv is its IRQ vector and list holds completed commands. */
+	struct nvmet_pci_epf_irq_vector	*iv;
+	struct workqueue_struct		*iod_wq;
+	struct delayed_work		work;
+	spinlock_t			lock;
+	struct list_head		list;
+};
+
+/*
+ * PCI Root Complex (RC) address data segment for mapping an admin or
+ * I/O command buffer @buf of @length bytes to the PCI address @pci_addr.
+ */
+struct nvmet_pci_epf_segment {
+	void				*buf;
+	u64				pci_addr;
+	u32				length;
+};
+
+/*
+ * Command descriptor: per-command state wrapping the target core request.
+ */
+struct nvmet_pci_epf_iod {
+	struct list_head		link;
+
+	struct nvmet_req		req;
+	struct nvme_command		cmd;
+	struct nvme_completion		cqe;
+	unsigned int			status;
+
+	struct nvmet_pci_epf_ctrl	*ctrl;
+	/* The SQ of the command and the CQ where its completion is queued. */
+	struct nvmet_pci_epf_queue	*sq;
+	struct nvmet_pci_epf_queue	*cq;
+
+	/* Data transfer size and direction for the command. */
+	size_t				data_len;
+	enum dma_data_direction		dma_dir;
+
+	/*
+	 * PCI Root Complex (RC) address data segments: if nr_data_segs is 1, we
+	 * use only @data_seg. Otherwise, the array of segments @data_segs is
+	 * allocated to manage multiple PCI address data segments. @data_sgl and
+	 * @data_sgt are used to setup the command request for execution by the
+	 * target core.
+	 */
+	unsigned int			nr_data_segs;
+	struct nvmet_pci_epf_segment	data_seg;
+	struct nvmet_pci_epf_segment	*data_segs;
+	struct scatterlist		data_sgl;
+	struct sg_table			data_sgt;
+	/* @work runs nvmet_pci_epf_exec_iod_work(), see the iod allocation. */
+	struct work_struct		work;
+	struct completion		done;
+};
+
+/*
+ * PCI target controller private data.
+ */
+struct nvmet_pci_epf_ctrl {
+	struct nvmet_pci_epf		*nvme_epf;
+	struct nvmet_port		*port;
+	struct nvmet_ctrl		*tctrl;
+	struct device			*dev;
+
+	unsigned int			nr_queues;
+	struct nvmet_pci_epf_queue	*sq;
+	struct nvmet_pci_epf_queue	*cq;
+	unsigned int			sq_ab;
+
+	mempool_t			iod_pool;
+	void				*bar;
+	u64				cap;
+	u32				cc;
+	u32				csts;
+	/* Queue entry sizes in bytes for the I/O SQs and CQs. */
+	size_t				io_sqes;
+	size_t				io_cqes;
+	/* Memory page size used for PRPs: mps_mask extracts a PRP offset. */
+	size_t				mps_shift;
+	size_t				mps;
+	size_t				mps_mask;
+	/* Maximum data transfer size of a command, in bytes. */
+	unsigned int			mdts;
+
+	struct delayed_work		poll_cc;
+	struct delayed_work		poll_sqs;
+	/* Table of nr_queues IRQ vector descriptors, protected by irq_lock. */
+	struct mutex			irq_lock;
+	struct nvmet_pci_epf_irq_vector	*irq_vectors;
+	unsigned int			irq_vector_threshold;
+
+	bool				link_up;
+	bool				enabled;
+};
+
+/*
+ * PCI EPF driver private data.
+ */
+struct nvmet_pci_epf {
+	struct pci_epf			*epf;
+
+	const struct pci_epc_features	*epc_features;
+
+	void				*reg_bar;
+	size_t				msix_table_offset;
+
+	unsigned int			irq_type;
+	unsigned int			nr_vectors;
+
+	struct nvmet_pci_epf_ctrl	ctrl;
+	/* One DMA channel and lock per direction, used only if dma_enabled. */
+	bool				dma_enabled;
+	struct dma_chan			*dma_tx_chan;
+	struct mutex			dma_tx_lock;
+	struct dma_chan			*dma_rx_chan;
+	struct mutex			dma_rx_lock;
+	/* Serializes MMIO transfers to limit the mapping windows in use. */
+	struct mutex			mmio_lock;
+
+	/* PCI endpoint function configfs attributes. */
+	struct config_group		group;
+	__le16				portid;
+	char				subsysnqn[NVMF_NQN_SIZE];
+	unsigned int			mdts_kb;
+};
+
+/* Read the 32-bit register at byte offset @off of the BAR buffer. */
+static inline u32 nvmet_pci_epf_bar_read32(struct nvmet_pci_epf_ctrl *ctrl,
+					   u32 off)
+{
+	/* Registers are kept little-endian: convert to CPU byte order. */
+	return le32_to_cpu(READ_ONCE(*(__le32 *)(ctrl->bar + off)));
+}
+
+/* Write @val to the 32-bit register at byte offset @off of the BAR buffer. */
+static inline void nvmet_pci_epf_bar_write32(struct nvmet_pci_epf_ctrl *ctrl,
+					     u32 off, u32 val)
+{
+	/* Registers are kept little-endian: convert from CPU byte order. */
+	WRITE_ONCE(*(__le32 *)(ctrl->bar + off), cpu_to_le32(val));
+}
+
+static inline u64 nvmet_pci_epf_bar_read64(struct nvmet_pci_epf_ctrl *ctrl,
+					   u32 off)
+{
+	return ((u64)nvmet_pci_epf_bar_read32(ctrl, off + 4) << 32) |
+		(u64)nvmet_pci_epf_bar_read32(ctrl, off);
+}
+
+static inline void nvmet_pci_epf_bar_write64(struct nvmet_pci_epf_ctrl *ctrl,
+					     u32 off, u64 val)
+{
+	nvmet_pci_epf_bar_write32(ctrl, off, (u32)val);
+	nvmet_pci_epf_bar_write32(ctrl, off + 4, (u32)(val >> 32));
+}
+
+/* Map @size bytes at PCI address @pci_addr; the result is described by @map. */
+static inline int nvmet_pci_epf_mem_map(struct nvmet_pci_epf *nvme_epf,
+		u64 pci_addr, size_t size, struct pci_epc_map *map)
+{
+	/* The mapping is requested for this endpoint function. */
+	return pci_epc_mem_map(nvme_epf->epf->epc, nvme_epf->epf->func_no,
+			       nvme_epf->epf->vfunc_no, pci_addr, size, map);
+}
+
+/* Release a mapping obtained with nvmet_pci_epf_mem_map(). */
+static inline void nvmet_pci_epf_mem_unmap(struct nvmet_pci_epf *nvme_epf,
+					   struct pci_epc_map *map)
+{
+	pci_epc_mem_unmap(nvme_epf->epf->epc, nvme_epf->epf->func_no,
+			  nvme_epf->epf->vfunc_no, map);
+}
+
+struct nvmet_pci_epf_dma_filter {
+	struct device *dev;	/* Device the DMA channel must belong to */
+	u32 dma_mask;		/* Wanted directions, as BIT(DMA_*) flags */
+};
+
+/* dma_request_channel() filter: match the wanted device and direction(s). */
+static bool nvmet_pci_epf_dma_filter(struct dma_chan *chan, void *arg)
+{
+	struct nvmet_pci_epf_dma_filter *wanted = arg;
+	struct dma_slave_caps caps = {};
+
+	dma_get_slave_caps(chan, &caps);
+	if (chan->device->dev != wanted->dev)
+		return false;
+	return (wanted->dma_mask & caps.directions) != 0;
+}
+
+/*
+ * Request one DMA channel per direction from the endpoint controller's parent
+ * device. If either is unavailable, leave DMA disabled and rely on MMIO.
+ */
+static void nvmet_pci_epf_init_dma(struct nvmet_pci_epf *nvme_epf)
+{
+	struct pci_epf *epf = nvme_epf->epf;
+	struct device *dev = &epf->dev;
+	struct nvmet_pci_epf_dma_filter filter;
+	struct dma_chan *chan;
+	dma_cap_mask_t mask;
+
+	mutex_init(&nvme_epf->dma_rx_lock);
+	mutex_init(&nvme_epf->dma_tx_lock);
+
+	dma_cap_zero(mask);
+	dma_cap_set(DMA_SLAVE, mask);
+
+	filter.dev = epf->epc->dev.parent;
+	filter.dma_mask = BIT(DMA_DEV_TO_MEM);
+	chan = dma_request_channel(mask, nvmet_pci_epf_dma_filter, &filter);
+	if (!chan)
+		goto out_dma_no_rx;
+	nvme_epf->dma_rx_chan = chan;
+
+	/* Report the RX channel now: chan is reused for the TX request. */
+	dev_dbg(dev, "Using DMA RX channel %s, maximum segment size %u B\n",
+		dma_chan_name(chan),
+		dma_get_max_seg_size(dmaengine_get_dma_device(chan)));
+
+	filter.dma_mask = BIT(DMA_MEM_TO_DEV);
+	chan = dma_request_channel(mask, nvmet_pci_epf_dma_filter, &filter);
+	if (!chan)
+		goto out_dma_no_tx;
+	nvme_epf->dma_tx_chan = chan;
+	nvme_epf->dma_enabled = true;
+
+	dev_dbg(dev, "Using DMA TX channel %s, maximum segment size %u B\n",
+		dma_chan_name(chan),
+		dma_get_max_seg_size(dmaengine_get_dma_device(chan)));
+
+	return;
+
+out_dma_no_tx:
+	dma_release_channel(nvme_epf->dma_rx_chan);
+	nvme_epf->dma_rx_chan = NULL;
+out_dma_no_rx:
+	mutex_destroy(&nvme_epf->dma_rx_lock);
+	mutex_destroy(&nvme_epf->dma_tx_lock);
+	nvme_epf->dma_enabled = false;
+
+	dev_info(&epf->dev, "DMA not supported, falling back to MMIO\n");
+}
+
+/* Undo nvmet_pci_epf_init_dma(); does nothing if DMA is not enabled. */
+static void nvmet_pci_epf_deinit_dma(struct nvmet_pci_epf *nvme_epf)
+{
+	if (nvme_epf->dma_enabled) {
+		dma_release_channel(nvme_epf->dma_tx_chan);
+		nvme_epf->dma_tx_chan = NULL;
+		dma_release_channel(nvme_epf->dma_rx_chan);
+		nvme_epf->dma_rx_chan = NULL;
+		mutex_destroy(&nvme_epf->dma_rx_lock);
+		mutex_destroy(&nvme_epf->dma_tx_lock);
+		nvme_epf->dma_enabled = false;
+	}
+}
+
+/*
+ * Synchronously transfer a segment using the DMA channel for direction @dir.
+ */
+static int nvmet_pci_epf_dma_transfer(struct nvmet_pci_epf *nvme_epf,
+		struct nvmet_pci_epf_segment *seg, enum dma_data_direction dir)
+{
+	struct pci_epf *epf = nvme_epf->epf;
+	struct dma_async_tx_descriptor *desc;
+	struct dma_slave_config sconf = {};
+	struct device *dev = &epf->dev;
+	struct device *dma_dev;
+	struct dma_chan *chan;
+	dma_cookie_t cookie;
+	dma_addr_t dma_addr;
+	struct mutex *lock;
+	int ret;
+
+	switch (dir) {
+	case DMA_FROM_DEVICE:
+		lock = &nvme_epf->dma_rx_lock;
+		chan = nvme_epf->dma_rx_chan;
+		sconf.direction = DMA_DEV_TO_MEM;
+		sconf.src_addr = seg->pci_addr;
+		break;
+	case DMA_TO_DEVICE:
+		lock = &nvme_epf->dma_tx_lock;
+		chan = nvme_epf->dma_tx_chan;
+		sconf.direction = DMA_MEM_TO_DEV;
+		sconf.dst_addr = seg->pci_addr;
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	mutex_lock(lock);
+	/* Map the local buffer of the segment for the DMA engine device. */
+	dma_dev = dmaengine_get_dma_device(chan);
+	dma_addr = dma_map_single(dma_dev, seg->buf, seg->length, dir);
+	ret = dma_mapping_error(dma_dev, dma_addr);
+	if (ret)
+		goto unlock;
+
+	ret = dmaengine_slave_config(chan, &sconf);
+	if (ret) {
+		dev_err(dev, "Failed to configure DMA channel\n");
+		goto unmap;
+	}
+
+	desc = dmaengine_prep_slave_single(chan, dma_addr, seg->length,
+					   sconf.direction, DMA_CTRL_ACK);
+	if (!desc) {
+		dev_err(dev, "Failed to prepare DMA\n");
+		ret = -EIO;
+		goto unmap;
+	}
+
+	cookie = dmaengine_submit(desc);
+	ret = dma_submit_error(cookie);
+	if (ret) {
+		dev_err(dev, "Failed to do DMA submit (err=%d)\n", ret);
+		goto unmap;
+	}
+
+	if (dma_sync_wait(chan, cookie) != DMA_COMPLETE) {
+		dev_err(dev, "DMA transfer failed\n");
+		ret = -EIO;
+	}
+
+	dmaengine_terminate_sync(chan);
+unmap:
+	dma_unmap_single(dma_dev, dma_addr, seg->length, dir);
+unlock:
+	mutex_unlock(lock);
+	return ret;
+}
+
+/*
+ * Copy a segment between @seg->buf and PCI address space with CPU accesses,
+ * advancing by the size actually mapped until the whole segment is copied.
+ */
+static int nvmet_pci_epf_mmio_transfer(struct nvmet_pci_epf *nvme_epf,
+		struct nvmet_pci_epf_segment *seg, enum dma_data_direction dir)
+{
+	u64 pci_addr = seg->pci_addr;
+	u32 length = seg->length;
+	void *buf = seg->buf;
+	struct pci_epc_map map;
+	int ret = -EINVAL;
+
+	/* Reject bad directions up front so that no mapping is ever leaked. */
+	if (dir != DMA_FROM_DEVICE && dir != DMA_TO_DEVICE)
+		return -EINVAL;
+
+	/*
+	 * Note: MMIO transfers do not need serialization but this is a
+	 * simple way to avoid using too many mapping windows.
+	 */
+	mutex_lock(&nvme_epf->mmio_lock);
+
+	while (length) {
+		ret = nvmet_pci_epf_mem_map(nvme_epf, pci_addr, length, &map);
+		if (ret)
+			break;
+
+		if (dir == DMA_FROM_DEVICE)
+			memcpy_fromio(buf, map.virt_addr, map.pci_size);
+		else
+			memcpy_toio(map.virt_addr, buf, map.pci_size);
+
+		pci_addr += map.pci_size;
+		buf += map.pci_size;
+		length -= map.pci_size;
+
+		nvmet_pci_epf_mem_unmap(nvme_epf, &map);
+	}
+
+	mutex_unlock(&nvme_epf->mmio_lock);
+
+	return ret;
+}
+
+/* Transfer one segment, using DMA when it is enabled and MMIO otherwise. */
+static inline int nvmet_pci_epf_transfer_seg(struct nvmet_pci_epf *nvme_epf,
+		struct nvmet_pci_epf_segment *seg, enum dma_data_direction dir)
+{
+	if (!nvme_epf->dma_enabled)
+		return nvmet_pci_epf_mmio_transfer(nvme_epf, seg, dir);
+	return nvmet_pci_epf_dma_transfer(nvme_epf, seg, dir);
+}
+
+/* Transfer @length bytes between @buf and @pci_addr as a single segment. */
+static inline int nvmet_pci_epf_transfer(struct nvmet_pci_epf_ctrl *ctrl,
+					 void *buf, u64 pci_addr, u32 length,
+					 enum dma_data_direction dir)
+{
+	struct nvmet_pci_epf_segment single;
+
+	single.buf = buf;
+	single.pci_addr = pci_addr;
+	single.length = length;
+	return nvmet_pci_epf_transfer_seg(ctrl->nvme_epf, &single, dir);
+}
+
+/* Allocate a zeroed table of ctrl->nr_queues IRQ vector descriptors. */
+static int nvmet_pci_epf_alloc_irq_vectors(struct nvmet_pci_epf_ctrl *ctrl)
+{
+	struct nvmet_pci_epf_irq_vector *ivs;
+
+	ivs = kcalloc(ctrl->nr_queues, sizeof(*ivs), GFP_KERNEL);
+	ctrl->irq_vectors = ivs;
+	if (!ivs)
+		return -ENOMEM;
+	mutex_init(&ctrl->irq_lock);
+	return 0;
+}
+
+static void nvmet_pci_epf_free_irq_vectors(struct nvmet_pci_epf_ctrl *ctrl)
+{
+	/* The lock is only initialized once the table is allocated. */
+	if (ctrl->irq_vectors)
+		mutex_destroy(&ctrl->irq_lock);
+	kfree(ctrl->irq_vectors);
+	ctrl->irq_vectors = NULL;
+}
+
+/* Find the in-use descriptor of @vector, if any. Needs ctrl->irq_lock held. */
+static struct nvmet_pci_epf_irq_vector *
+nvmet_pci_epf_find_irq_vector(struct nvmet_pci_epf_ctrl *ctrl, u16 vector)
+{
+	struct nvmet_pci_epf_irq_vector *cur = ctrl->irq_vectors;
+	struct nvmet_pci_epf_irq_vector *end = cur + ctrl->nr_queues;
+
+	lockdep_assert_held(&ctrl->irq_lock);
+
+	for (; cur < end; cur++) {
+		if (cur->ref && cur->vector == vector)
+			return cur;
+	}
+
+	return NULL;
+}
+
+/*
+ * Reference the descriptor of @vector, claiming a free slot if none exists.
+ */
+static struct nvmet_pci_epf_irq_vector *
+nvmet_pci_epf_add_irq_vector(struct nvmet_pci_epf_ctrl *ctrl, u16 vector)
+{
+	struct nvmet_pci_epf_irq_vector *iv;
+	int i;
+
+	mutex_lock(&ctrl->irq_lock);
+
+	iv = nvmet_pci_epf_find_irq_vector(ctrl, vector);
+	if (iv) {
+		iv->ref++;
+		goto unlock;
+	}
+
+	/* Look for an unused slot: iv stays NULL if every slot is in use. */
+	for (i = 0; i < ctrl->nr_queues && !iv; i++) {
+		if (!ctrl->irq_vectors[i].ref)
+			iv = &ctrl->irq_vectors[i];
+	}
+	if (WARN_ON_ONCE(!iv))
+		goto unlock;
+
+	iv->ref = 1;
+	iv->vector = vector;
+	iv->nr_irqs = 0;
+unlock:
+	mutex_unlock(&ctrl->irq_lock);
+	return iv;
+}
+
+/* Drop a reference on the descriptor of @vector, if there is one in use. */
+static void nvmet_pci_epf_remove_irq_vector(struct nvmet_pci_epf_ctrl *ctrl,
+					    u16 vector)
+{
+	struct nvmet_pci_epf_irq_vector *iv;
+
+	mutex_lock(&ctrl->irq_lock);
+
+	iv = nvmet_pci_epf_find_irq_vector(ctrl, vector);
+	if (!iv || --iv->ref)
+		goto unlock;
+
+	/* Last reference dropped: reset the now unused descriptor. */
+	iv->vector = 0;
+	iv->nr_irqs = 0;
+unlock:
+	mutex_unlock(&ctrl->irq_lock);
+}
+
+/*
+ * Decide if an interrupt must be raised now for @cq, updating the coalescing
+ * count of its vector. With @force, raise only if a count is pending.
+ */
+static bool nvmet_pci_epf_should_raise_irq(struct nvmet_pci_epf_ctrl *ctrl,
+		struct nvmet_pci_epf_queue *cq, bool force)
+{
+	struct nvmet_pci_epf_irq_vector *iv = cq->iv;
+	bool raise;
+
+	if (!test_bit(NVMET_PCI_EPF_Q_IRQ_ENABLED, &cq->flags))
+		return false;
+
+	/* No coalescing for the admin queue nor for vectors with cd set. */
+	if (!cq->qid || iv->cd)
+		return true;
+
+	if (!force)
+		iv->nr_irqs++;
+
+	raise = force ? iv->nr_irqs > 0 :
+		iv->nr_irqs >= ctrl->irq_vector_threshold;
+	if (raise)
+		iv->nr_irqs = 0;
+
+	return raise;
+}
+
+/* Raise the interrupt of a live @cq if nvmet_pci_epf_should_raise_irq(). */
+static void nvmet_pci_epf_raise_irq(struct nvmet_pci_epf_ctrl *ctrl,
+		struct nvmet_pci_epf_queue *cq, bool force)
+{
+	struct nvmet_pci_epf *nvme_epf = ctrl->nvme_epf;
+	struct pci_epf *epf = nvme_epf->epf;
+	int ret = 0;
+
+	if (!test_bit(NVMET_PCI_EPF_Q_LIVE, &cq->flags))
+		return;
+
+	mutex_lock(&ctrl->irq_lock);
+	/* The coalescing state of the vector is updated under irq_lock. */
+	if (!nvmet_pci_epf_should_raise_irq(ctrl, cq, force))
+		goto unlock;
+
+	switch (nvme_epf->irq_type) {
+	case PCI_IRQ_MSIX:
+	case PCI_IRQ_MSI:
+		ret = pci_epc_raise_irq(epf->epc, epf->func_no, epf->vfunc_no,
+					nvme_epf->irq_type, cq->vector + 1);
+		if (!ret)
+			break;
+		/*
+		 * If we got an error, it is likely because the host is using
+		 * legacy IRQs (e.g. BIOS, grub).
+		 */
+		fallthrough;
+	case PCI_IRQ_INTX:
+		ret = pci_epc_raise_irq(epf->epc, epf->func_no, epf->vfunc_no,
+					PCI_IRQ_INTX, 0);
+		break;
+	default:
+		WARN_ON_ONCE(1);
+		ret = -EINVAL;
+		break;
+	}
+
+	if (ret)
+		dev_err(ctrl->dev, "Failed to raise IRQ (err=%d)\n", ret);
+unlock:
+	mutex_unlock(&ctrl->irq_lock);
+}
+
+static inline const char *nvmet_pci_epf_iod_name(struct nvmet_pci_epf_iod *iod)
+{
+	return nvme_opcode_str(iod->sq->qid, iod->cmd.common.opcode);
+}
+
+static void nvmet_pci_epf_exec_iod_work(struct work_struct *work);
+
+/* Get a zeroed command descriptor from the pool, set up for commands of @sq. */
+static struct nvmet_pci_epf_iod *
+nvmet_pci_epf_alloc_iod(struct nvmet_pci_epf_queue *sq)
+{
+	struct nvmet_pci_epf_ctrl *ctrl = sq->ctrl;
+	struct nvmet_pci_epf_iod *iod;
+
+	iod = mempool_alloc(&ctrl->iod_pool, GFP_KERNEL);
+	if (unlikely(!iod))
+		return NULL;
+	memset(iod, 0, sizeof(*iod));
+
+	iod->ctrl = ctrl;
+	iod->sq = sq;
+	iod->cq = &ctrl->cq[sq->qid];
+	iod->dma_dir = DMA_NONE;
+	iod->req.port = ctrl->port;
+	iod->req.cmd = &iod->cmd;
+	iod->req.cqe = &iod->cqe;
+	INIT_LIST_HEAD(&iod->link);
+	INIT_WORK(&iod->work, nvmet_pci_epf_exec_iod_work);
+	init_completion(&iod->done);
+	return iod;
+}
+
+/*
+ * Grow the PCI segment table of @iod by @nsegs entries, allocating it if it
+ * does not exist yet. On failure, the current table is left untouched.
+ */
+static int nvmet_pci_epf_alloc_iod_data_segs(struct nvmet_pci_epf_iod *iod,
+					     int nsegs)
+{
+	int total = iod->nr_data_segs + nsegs;
+	struct nvmet_pci_epf_segment *table;
+
+	table = krealloc(iod->data_segs, total * sizeof(*table),
+			 GFP_KERNEL | __GFP_ZERO);
+	if (!table)
+		return -ENOMEM;
+
+	iod->data_segs = table;
+	iod->nr_data_segs = total;
+
+	return 0;
+}
+
+/* Free the data buffers and tables of @iod and return it to the pool. */
+static void nvmet_pci_epf_free_iod(struct nvmet_pci_epf_iod *iod)
+{
+	struct nvmet_pci_epf_segment *segs = iod->data_segs;
+	unsigned int n;
+
+	for (n = 0; segs && n < iod->nr_data_segs; n++)
+		kfree(segs[n].buf);
+	if (segs != &iod->data_seg)
+		kfree(segs);
+	if (iod->data_sgt.nents > 1)
+		sg_free_table(&iod->data_sgt);
+	mempool_free(iod, &iod->ctrl->iod_pool);
+}
+
+static int nvmet_pci_epf_transfer_iod_data(struct nvmet_pci_epf_iod *iod)
+{
+	struct nvmet_pci_epf *nvme_epf = iod->ctrl->nvme_epf;
+	struct nvmet_pci_epf_segment *seg = &iod->data_segs[0];
+	int i, ret;
+
+	/* Split the data transfer according to the PCI segments. */
+	for (i = 0; i < iod->nr_data_segs; i++, seg++) {
+		ret = nvmet_pci_epf_transfer_seg(nvme_epf, seg, iod->dma_dir);
+		if (ret) {
+			iod->status = NVME_SC_DATA_XFER_ERROR | NVME_STATUS_DNR;
+			return ret;
+		}
+	}
+
+	return 0;
+}
+
+static inline u32 nvmet_pci_epf_prp_ofst(struct nvmet_pci_epf_ctrl *ctrl,
+					 u64 prp)
+{
+	return prp & ctrl->mps_mask;	/* Offset of @prp in its memory page */
+}
+
+static inline size_t nvmet_pci_epf_prp_size(struct nvmet_pci_epf_ctrl *ctrl,
+					    u64 prp)
+{
+	return ctrl->mps - nvmet_pci_epf_prp_ofst(ctrl, prp); /* To page end */
+}
+
+/*
+ * Read into @prps the PRP list located at the PCI address @prp and return
+ * the number of PRPs read, or the error code of a failed transfer.
+ */
+static int nvmet_pci_epf_get_prp_list(struct nvmet_pci_epf_ctrl *ctrl, u64 prp,
+				      size_t xfer_len, __le64 *prps)
+{
+	size_t avail = nvmet_pci_epf_prp_size(ctrl, prp);
+	size_t wanted;
+	u32 length;
+	int ret;
+
+	/*
+	 * One 8-byte PRP is needed per memory page of @xfer_len. If these PRPs
+	 * do not all fit in what remains of the memory page of the PRP list
+	 * pointer, read only up to the end of that page: the last PRP read
+	 * is then a PRP list pointer to the remaining PRPs.
+	 */
+	wanted = ((xfer_len + ctrl->mps_mask) >> ctrl->mps_shift) << 3;
+	length = min(avail, wanted);
+
+	ret = nvmet_pci_epf_transfer(ctrl, prps, prp, length, DMA_FROM_DEVICE);
+	return ret ? ret : (int)(length >> 3);
+}
+
+/*
+ * Build the PCI segment table of @iod from PRP lists: prp1 gives the first
+ * segment and prp2 points to the first list. Contiguous PRPs are merged into
+ * a single segment. On error, iod->status is set and -EINVAL is returned.
+ */
+static int nvmet_pci_epf_iod_parse_prp_list(struct nvmet_pci_epf_ctrl *ctrl,
+					    struct nvmet_pci_epf_iod *iod)
+{
+	struct nvme_command *cmd = &iod->cmd;
+	struct nvmet_pci_epf_segment *seg;
+	size_t size = 0, ofst, prp_size, xfer_len;
+	size_t transfer_len = iod->data_len;
+	int nr_segs, nr_prps = 0;
+	u64 pci_addr, prp;
+	int i = 0, ret;
+	__le64 *prps;
+
+	prps = kzalloc(ctrl->mps, GFP_KERNEL);
+	if (!prps)
+		goto err_internal;
+
+	/*
+	 * Allocate PCI segments for the command: this considers the worst case
+	 * scenario where all prps are discontiguous, so get as many segments
+	 * as we can have prps. In practice, most of the time, we will have
+	 * far less PCI segments than prps.
+	 */
+	prp = le64_to_cpu(cmd->common.dptr.prp1);
+	if (!prp)
+		goto err_invalid_field;
+
+	ofst = nvmet_pci_epf_prp_ofst(ctrl, prp);
+	nr_segs = (transfer_len + ofst + ctrl->mps - 1) >> ctrl->mps_shift;
+
+	ret = nvmet_pci_epf_alloc_iod_data_segs(iod, nr_segs);
+	if (ret)
+		goto err_internal;
+
+	/* Set the first segment using prp1. */
+	seg = &iod->data_segs[0];
+	seg->pci_addr = prp;
+	seg->length = nvmet_pci_epf_prp_size(ctrl, prp);
+
+	size = seg->length;
+	pci_addr = prp + size;
+	nr_segs = 1;
+
+	/*
+	 * Now build the PCI address segments using the PRP lists, starting
+	 * from prp2.
+	 */
+	prp = le64_to_cpu(cmd->common.dptr.prp2);
+	if (!prp)
+		goto err_invalid_field;
+
+	while (size < transfer_len) {
+		xfer_len = transfer_len - size;
+
+		if (!nr_prps) {
+			nr_prps = nvmet_pci_epf_get_prp_list(ctrl, prp,
+							     xfer_len, prps);
+			if (nr_prps < 0)
+				goto err_internal;
+
+			i = 0;
+			ofst = 0;
+		}
+
+		/* Current entry */
+		prp = le64_to_cpu(prps[i]);
+		if (!prp)
+			goto err_invalid_field;
+
+		/* Did we reach the last PRP entry of the list? */
+		if (xfer_len > ctrl->mps && i == nr_prps - 1) {
+			/* We need more PRPs: PRP is a list pointer. */
+			nr_prps = 0;
+			continue;
+		}
+
+		/* Only the first PRP is allowed to have an offset. */
+		if (nvmet_pci_epf_prp_ofst(ctrl, prp))
+			goto err_invalid_offset;
+
+		if (prp != pci_addr) {
+			/* Discontiguous prp: new segment. */
+			nr_segs++;
+			if (WARN_ON_ONCE(nr_segs > iod->nr_data_segs))
+				goto err_internal;
+
+			seg++;
+			seg->pci_addr = prp;
+			seg->length = 0;
+			pci_addr = prp;
+		}
+
+		prp_size = min_t(size_t, ctrl->mps, xfer_len);
+		seg->length += prp_size;
+		pci_addr += prp_size;
+		size += prp_size;
+
+		i++;
+	}
+
+	iod->nr_data_segs = nr_segs;
+	ret = 0;
+
+	if (size != transfer_len) {
+		dev_err(ctrl->dev,
+			"PRPs transfer length mismatch: got %zu B, need %zu B\n",
+			size, transfer_len);
+		goto err_internal;
+	}
+	kfree(prps);
+	return 0;
+
+err_invalid_offset:
+	dev_err(ctrl->dev, "PRPs list invalid offset\n");
+	iod->status = NVME_SC_PRP_INVALID_OFFSET | NVME_STATUS_DNR;
+	goto err;
+err_invalid_field:
+	dev_err(ctrl->dev, "PRPs list invalid field\n");
+	iod->status = NVME_SC_INVALID_FIELD | NVME_STATUS_DNR;
+	goto err;
+err_internal:
+	dev_err(ctrl->dev, "PRPs list internal error\n");
+	iod->status = NVME_SC_INTERNAL | NVME_STATUS_DNR;
+err:
+	kfree(prps);
+	return -EINVAL;
+}
+
+/* Build the one or two PCI segments of a command using only prp1 and prp2. */
+static int nvmet_pci_epf_iod_parse_prp_simple(struct nvmet_pci_epf_ctrl *ctrl,
+					      struct nvmet_pci_epf_iod *iod)
+{
+	struct nvme_command *cmd = &iod->cmd;
+	u64 prp1 = le64_to_cpu(cmd->common.dptr.prp1);
+	size_t prp1_size = nvmet_pci_epf_prp_size(ctrl, prp1);
+	size_t transfer_len = iod->data_len;
+	bool contiguous = true;
+	u64 prp2 = 0;
+	int ret;
+
+	/* For commands crossing a page boundary, we should have prp2. */
+	if (transfer_len > prp1_size) {
+		prp2 = le64_to_cpu(cmd->common.dptr.prp2);
+		if (!prp2) {
+			iod->status = NVME_SC_INVALID_FIELD | NVME_STATUS_DNR;
+			return -EINVAL;
+		}
+		if (nvmet_pci_epf_prp_ofst(ctrl, prp2)) {
+			iod->status =
+				NVME_SC_PRP_INVALID_OFFSET | NVME_STATUS_DNR;
+			return -EINVAL;
+		}
+		contiguous = prp2 == prp1 + prp1_size;
+	}
+
+	if (contiguous) {
+		/* One contiguous range: use the segment embedded in the iod. */
+		iod->nr_data_segs = 1;
+		iod->data_segs = &iod->data_seg;
+		iod->data_seg.pci_addr = prp1;
+		iod->data_seg.length = transfer_len;
+		return 0;
+	}
+
+	ret = nvmet_pci_epf_alloc_iod_data_segs(iod, 2);
+	if (ret) {
+		iod->status = NVME_SC_INTERNAL | NVME_STATUS_DNR;
+		return ret;
+	}
+
+	iod->data_segs[0].pci_addr = prp1;
+	iod->data_segs[0].length = prp1_size;
+	iod->data_segs[1].pci_addr = prp2;
+	iod->data_segs[1].length = transfer_len - prp1_size;
+
+	return 0;
+}
+
+/* Build the PCI segments of @iod from its PRPs, simple or list based. */
+static int nvmet_pci_epf_iod_parse_prps(struct nvmet_pci_epf_iod *iod)
+{
+	struct nvmet_pci_epf_ctrl *ctrl = iod->ctrl;
+	u64 prp1 = le64_to_cpu(iod->cmd.common.dptr.prp1);
+	size_t ofst = nvmet_pci_epf_prp_ofst(ctrl, prp1);
+
+	/* The offset of prp1 in its memory page must be dword aligned. */
+	if (ofst & 0x3) {
+		iod->status = NVME_SC_PRP_INVALID_OFFSET | NVME_STATUS_DNR;
+		return -EINVAL;
+	}
+
+	/* A PRP list is needed when the data extends past two memory pages. */
+	if (iod->data_len + ofst > ctrl->mps * 2)
+		return nvmet_pci_epf_iod_parse_prp_list(ctrl, iod);
+	return nvmet_pci_epf_iod_parse_prp_simple(ctrl, iod);
+}
+
+/*
+ * Transfer an SGL segment from the host and return the number of data
+ * descriptors and the next segment descriptor, if any. Returns NULL on error,
+ * including for a segment too short to hold a single descriptor.
+ */
+static struct nvme_sgl_desc *
+nvmet_pci_epf_get_sgl_segment(struct nvmet_pci_epf_ctrl *ctrl,
+			      struct nvme_sgl_desc *desc, unsigned int *nr_sgls)
+{
+	struct nvme_sgl_desc *sgls, *last;
+	u32 length = le32_to_cpu(desc->length);
+	int nr_descs = length / sizeof(struct nvme_sgl_desc);
+
+	/* The length is set by the host: never index before the buffer. */
+	if (!nr_descs)
+		return NULL;
+
+	sgls = kmalloc(length, GFP_KERNEL);
+	if (!sgls)
+		return NULL;
+
+	if (nvmet_pci_epf_transfer(ctrl, sgls, le64_to_cpu(desc->addr), length,
+				   DMA_FROM_DEVICE)) {
+		kfree(sgls);
+		return NULL;
+	}
+
+	last = &sgls[nr_descs - 1];
+	if (last->type == (NVME_SGL_FMT_SEG_DESC << 4) ||
+	    last->type == (NVME_SGL_FMT_LAST_SEG_DESC << 4)) {
+		/*
+		 * We have another SGL segment following this one: do not count
+		 * it as a regular data SGL descriptor and return it to the
+		 * caller.
+		 */
+		*desc = *last;
+		nr_descs--;
+	} else {
+		/* We do not have another SGL segment after this one. */
+		desc->length = 0;
+	}
+	*nr_sgls = nr_descs;
+	return sgls;
+}
+
+/* Build the PCI segments of @iod by walking its chain of SGL segments. */
+static int nvmet_pci_epf_iod_parse_sgl_segments(struct nvmet_pci_epf_ctrl *ctrl,
+						struct nvmet_pci_epf_iod *iod)
+{
+	struct nvme_command *cmd = &iod->cmd;
+	struct nvme_sgl_desc seg = cmd->common.dptr.sgl;
+	struct nvme_sgl_desc *sgls = NULL;
+	unsigned int n = 0, i, nr_sgls;
+	int ret;
+
+	/*
+	 * We do not support inline data nor keyed SGLs, so we should be seeing
+	 * only segment descriptors.
+	 */
+	if (seg.type != (NVME_SGL_FMT_SEG_DESC << 4) &&
+	    seg.type != (NVME_SGL_FMT_LAST_SEG_DESC << 4)) {
+		iod->status = NVME_SC_SGL_INVALID_TYPE | NVME_STATUS_DNR;
+		return -EIO;
+	}
+
+	while (seg.length) {
+		/* This updates seg to the next segment or zeroes its length. */
+		sgls = nvmet_pci_epf_get_sgl_segment(ctrl, &seg, &nr_sgls);
+		if (!sgls) {
+			iod->status = NVME_SC_INTERNAL | NVME_STATUS_DNR;
+			return -EIO;
+		}
+
+		/* Grow the PCI segment table as needed. */
+		ret = nvmet_pci_epf_alloc_iod_data_segs(iod, nr_sgls);
+		if (ret) {
+			iod->status = NVME_SC_INTERNAL | NVME_STATUS_DNR;
+			goto out;
+		}
+
+		/*
+		 * Parse the SGL descriptors to build the PCI segment table,
+		 * checking the descriptor type as we go.
+		 */
+		for (i = 0; i < nr_sgls; i++) {
+			if (sgls[i].type != (NVME_SGL_FMT_DATA_DESC << 4)) {
+				iod->status = NVME_SC_SGL_INVALID_TYPE |
+					NVME_STATUS_DNR;
+				goto out;
+			}
+			iod->data_segs[n].pci_addr = le64_to_cpu(sgls[i].addr);
+			iod->data_segs[n].length = le32_to_cpu(sgls[i].length);
+			n++;
+		}
+
+		kfree(sgls);
+	}
+ out:
+	if (iod->status != NVME_SC_SUCCESS) {
+		kfree(sgls);
+		return -EIO;
+	}
+	return 0;
+}
+
+/* Build the PCI segments of @iod from the SGL descriptor of its command. */
+static int nvmet_pci_epf_iod_parse_sgls(struct nvmet_pci_epf_iod *iod)
+{
+	struct nvme_sgl_desc *sgl = &iod->cmd.common.dptr.sgl;
+
+	/* Anything but a single data descriptor needs the segments walk. */
+	if (sgl->type != (NVME_SGL_FMT_DATA_DESC << 4))
+		return nvmet_pci_epf_iod_parse_sgl_segments(iod->ctrl, iod);
+
+	iod->data_seg.pci_addr = le64_to_cpu(sgl->addr);
+	iod->data_seg.length = le32_to_cpu(sgl->length);
+	iod->data_segs = &iod->data_seg;
+	iod->nr_data_segs = 1;
+
+	return 0;
+}
+
+/* Set up the local data buffers and scatterlist of @iod for its command. */
+static int nvmet_pci_epf_alloc_iod_data_buf(struct nvmet_pci_epf_iod *iod)
+{
+	struct nvmet_pci_epf_ctrl *ctrl = iod->ctrl;
+	struct nvmet_req *req = &iod->req;
+	struct nvmet_pci_epf_segment *seg;
+	struct scatterlist *sg;
+	int ret, i;
+
+	if (iod->data_len > ctrl->mdts) {
+		iod->status = NVME_SC_INVALID_FIELD | NVME_STATUS_DNR;
+		return -EINVAL;
+	}
+
+	/*
+	 * Get the PCI address segments for the command data buffer using either
+	 * its SGLs or PRPs.
+	 */
+	if (iod->cmd.common.flags & NVME_CMD_SGL_ALL)
+		ret = nvmet_pci_epf_iod_parse_sgls(iod);
+	else
+		ret = nvmet_pci_epf_iod_parse_prps(iod);
+	if (ret)
+		return ret;
+
+	/* Get a command buffer using SGLs matching the PCI segments. */
+	if (iod->nr_data_segs == 1) {
+		sg_init_table(&iod->data_sgl, 1);
+		iod->data_sgt.sgl = &iod->data_sgl;
+		iod->data_sgt.nents = 1;
+		iod->data_sgt.orig_nents = 1;
+	} else {
+		ret = sg_alloc_table(&iod->data_sgt, iod->nr_data_segs,
+				     GFP_KERNEL);
+		if (ret)
+			goto err_nomem;
+	}
+
+	for_each_sgtable_sg(&iod->data_sgt, sg, i) {
+		seg = &iod->data_segs[i];
+		seg->buf = kmalloc(seg->length, GFP_KERNEL);
+		if (!seg->buf)
+			goto err_nomem;
+		sg_set_buf(sg, seg->buf, seg->length);
+	}
+	/* Describe the data buffer to the target core request. */
+	req->transfer_len = iod->data_len;
+	req->sg = iod->data_sgt.sgl;
+	req->sg_cnt = iod->data_sgt.nents;
+
+	return 0;
+err_nomem:
+	iod->status = NVME_SC_INTERNAL | NVME_STATUS_DNR;
+	return -ENOMEM;
+}
+
+/* Queue @iod on the completed commands list of its CQ and kick the CQ work. */
+static void nvmet_pci_epf_complete_iod(struct nvmet_pci_epf_iod *iod)
+{
+	struct nvmet_pci_epf_queue *cq = iod->cq;
+	unsigned long flags;
+
+	/* Print an error message for failed commands, except AENs. */
+	iod->status = le16_to_cpu(iod->cqe.status) >> 1;
+	if (iod->status && iod->cmd.common.opcode != nvme_admin_async_event)
+		dev_err(iod->ctrl->dev,
+			"CQ[%d]: Command %s (0x%x) status 0x%0x\n",
+			iod->sq->qid, nvmet_pci_epf_iod_name(iod),
+			iod->cmd.common.opcode, iod->status);
+	/*
+	 * Add the command to the list of completed commands and schedule the
+	 * CQ work.
+	 */
+	spin_lock_irqsave(&cq->lock, flags);
+	list_add_tail(&iod->link, &cq->list);
+	queue_delayed_work(system_highpri_wq, &cq->work, 0);
+	spin_unlock_irqrestore(&cq->lock, flags);
+}
+
+/* Free all the commands still on the list of @queue. */
+static void nvmet_pci_epf_drain_queue(struct nvmet_pci_epf_queue *queue)
+{
+	struct nvmet_pci_epf_iod *iod, *next;
+	unsigned long flags;
+
+	spin_lock_irqsave(&queue->lock, flags);
+	/* Safe iteration: each command is unlinked and freed as we go. */
+	list_for_each_entry_safe(iod, next, &queue->list, link) {
+		list_del_init(&iod->link);
+		nvmet_pci_epf_free_iod(iod);
+	}
+	spin_unlock_irqrestore(&queue->lock, flags);
+}
+
+static int nvmet_pci_epf_add_port(struct nvmet_port *port)
+{
+	mutex_lock(&nvmet_pci_epf_ports_mutex);
+	list_add_tail(&port->entry, &nvmet_pci_epf_ports);
+	mutex_unlock(&nvmet_pci_epf_ports_mutex);
+	return 0;	/* Adding a port to the list cannot fail */
+}
+
+static void nvmet_pci_epf_remove_port(struct nvmet_port *port)
+{
+	mutex_lock(&nvmet_pci_epf_ports_mutex);
+	list_del_init(&port->entry);	/* Undo nvmet_pci_epf_add_port() */
+	mutex_unlock(&nvmet_pci_epf_ports_mutex);
+}
+
+/* Look up the port with ID @portid in the list of ports; NULL if not found. */
+static struct nvmet_port *
+nvmet_pci_epf_find_port(struct nvmet_pci_epf_ctrl *ctrl, __le16 portid)
+{
+	struct nvmet_port *port;
+
+	mutex_lock(&nvmet_pci_epf_ports_mutex);
+	list_for_each_entry(port, &nvmet_pci_epf_ports, entry) {
+		if (port->disc_addr.portid == portid)
+			goto unlock;
+	}
+	port = NULL;
+unlock:
+	mutex_unlock(&nvmet_pci_epf_ports_mutex);
+	return port;
+}
+
+/* Record the status of the request and either signal or complete its iod. */
+static void nvmet_pci_epf_queue_response(struct nvmet_req *req)
+{
+	struct nvmet_pci_epf_iod *iod;
+
+	iod = container_of(req, struct nvmet_pci_epf_iod, req);
+	iod->status = le16_to_cpu(req->cqe->status) >> 1;
+
+	if (iod->data_len && iod->dma_dir == DMA_TO_DEVICE) {
+		/* Data remains to be transferred: only signal iod->done. */
+		complete(&iod->done);
+		return;
+	}
+	nvmet_pci_epf_complete_iod(iod);
+}
+
+/* Return log2 of ctrl->mdts less the page shift given by CAP.MPSMIN. */
+static u8 nvmet_pci_epf_get_mdts(const struct nvmet_ctrl *tctrl)
+{
+	struct nvmet_pci_epf_ctrl *ctrl = tctrl->drvdata;
+
+	return ilog2(ctrl->mdts) - (NVME_CAP_MPSMIN(tctrl->cap) + 12);
+}
+
+/* Create the completion queue @cqid, undoing all partial setup on failure. */
+static u16 nvmet_pci_epf_create_cq(struct nvmet_ctrl *tctrl,
+		u16 cqid, u16 flags, u16 qsize, u64 pci_addr, u16 vector)
+{
+	struct nvmet_pci_epf_ctrl *ctrl = tctrl->drvdata;
+	struct nvmet_pci_epf_queue *cq = &ctrl->cq[cqid];
+	u16 status;
+
+	if (test_and_set_bit(NVMET_PCI_EPF_Q_LIVE, &cq->flags))
+		return NVME_SC_QID_INVALID | NVME_STATUS_DNR;
+
+	/* The queue was just marked live: do not leave it so when failing. */
+	if (!(flags & NVME_QUEUE_PHYS_CONTIG)) {
+		status = NVME_SC_INVALID_QUEUE | NVME_STATUS_DNR;
+		goto err;
+	}
+
+	if (flags & NVME_CQ_IRQ_ENABLED)
+		set_bit(NVMET_PCI_EPF_Q_IRQ_ENABLED, &cq->flags);
+
+	cq->pci_addr = pci_addr;
+	cq->qid = cqid;
+	cq->depth = qsize + 1;
+	cq->vector = vector;
+	cq->head = 0;
+	cq->tail = 0;
+	cq->phase = 1;
+	cq->db = NVME_REG_DBS + (((cqid * 2) + 1) * sizeof(u32));
+	nvmet_pci_epf_bar_write32(ctrl, cq->db, 0);
+	cq->qes = cqid ? ctrl->io_cqes : sizeof(struct nvme_completion);
+	cq->pci_size = cq->qes * cq->depth;
+
+	cq->iv = nvmet_pci_epf_add_irq_vector(ctrl, vector);
+	if (!cq->iv) {
+		status = NVME_SC_INTERNAL | NVME_STATUS_DNR;
+		goto err;
+	}
+
+	status = nvmet_cq_create(tctrl, &cq->nvme_cq, cqid, cq->depth);
+	if (status != NVME_SC_SUCCESS)
+		goto err_remove_iv;
+
+	dev_dbg(ctrl->dev, "CQ[%u]: %u entries of %zu B, IRQ vector %u\n",
+		cqid, qsize, cq->qes, cq->vector);
+	return NVME_SC_SUCCESS;
+err_remove_iv:
+	nvmet_pci_epf_remove_irq_vector(ctrl, vector);
+err:
+	clear_bit(NVMET_PCI_EPF_Q_IRQ_ENABLED, &cq->flags);
+	clear_bit(NVMET_PCI_EPF_Q_LIVE, &cq->flags);
+	return status;
+}
+
+/*
+ * Fabrics delete_cq operation: tear down the completion queue @cqid.
+ *
+ * Returns NVME_SC_QID_INVALID (with the DNR bit set) if the queue is not
+ * live and NVME_SC_SUCCESS otherwise.
+ */
+static u16 nvmet_pci_epf_delete_cq(struct nvmet_ctrl *tctrl, u16 cqid)
+{
+	struct nvmet_pci_epf_ctrl *ctrl = tctrl->drvdata;
+	struct nvmet_pci_epf_queue *cq = &ctrl->cq[cqid];
+
+	if (!test_and_clear_bit(NVMET_PCI_EPF_Q_LIVE, &cq->flags))
+		return NVME_SC_QID_INVALID | NVME_STATUS_DNR;
+
+	/* Stop the CQ work before freeing the completions still queued. */
+	cancel_delayed_work_sync(&cq->work);
+	nvmet_pci_epf_drain_queue(cq);
+
+	/*
+	 * nvmet_pci_epf_create_cq() only ever sets the IRQ enabled flag: clear
+	 * it here so that creating this queue again without
+	 * NVME_CQ_IRQ_ENABLED does not inherit a stale flag.
+	 */
+	clear_bit(NVMET_PCI_EPF_Q_IRQ_ENABLED, &cq->flags);
+	nvmet_pci_epf_remove_irq_vector(ctrl, cq->vector);
+
+	return NVME_SC_SUCCESS;
+}
+
+/*
+ * Fabrics create_sq operation: set up the submission queue @sqid.
+ *
+ * @flags must have NVME_QUEUE_PHYS_CONTIG set, @qsize is 0's based (the queue
+ * depth is @qsize + 1) and @pci_addr is the PCI address of the queue.
+ *
+ * Returns NVME_SC_SUCCESS on success. On failure, an NVMe status code with
+ * the DNR bit set is returned and the queue is not left marked live.
+ */
+static u16 nvmet_pci_epf_create_sq(struct nvmet_ctrl *tctrl,
+		u16 sqid, u16 flags, u16 qsize, u64 pci_addr)
+{
+	struct nvmet_pci_epf_ctrl *ctrl = tctrl->drvdata;
+	struct nvmet_pci_epf_queue *sq = &ctrl->sq[sqid];
+	u16 status;
+
+	/* The queue must not already exist. */
+	if (test_and_set_bit(NVMET_PCI_EPF_Q_LIVE, &sq->flags))
+		return NVME_SC_QID_INVALID | NVME_STATUS_DNR;
+
+	/*
+	 * Go through the error path so that the live bit set above is cleared
+	 * and the queue can be created again with valid flags.
+	 */
+	if (!(flags & NVME_QUEUE_PHYS_CONTIG)) {
+		status = NVME_SC_INVALID_QUEUE | NVME_STATUS_DNR;
+		goto out_clear_bit;
+	}
+
+	sq->pci_addr = pci_addr;
+	sq->qid = sqid;
+	sq->depth = qsize + 1;
+	sq->head = 0;
+	sq->tail = 0;
+	sq->phase = 0;
+
+	/* Reset the SQ tail doorbell in the register BAR. */
+	sq->db = NVME_REG_DBS + (sqid * 2 * sizeof(u32));
+	nvmet_pci_epf_bar_write32(ctrl, sq->db, 0);
+
+	/* The admin SQ entry size is fixed. I/O SQs use the size set in CC. */
+	if (!sqid)
+		sq->qes = 1UL << NVME_ADM_SQES;
+	else
+		sq->qes = ctrl->io_sqes;
+	sq->pci_size = sq->qes * sq->depth;
+
+	status = nvmet_sq_create(tctrl, &sq->nvme_sq, sqid, sq->depth);
+	if (status != NVME_SC_SUCCESS)
+		goto out_clear_bit;
+
+	/* Workqueue used to execute the commands fetched from this SQ. */
+	sq->iod_wq = alloc_workqueue("sq%d_wq", WQ_UNBOUND,
+				min_t(int, sq->depth, WQ_MAX_ACTIVE), sqid);
+	if (!sq->iod_wq) {
+		dev_err(ctrl->dev, "Failed to create SQ %d work queue\n", sqid);
+		status = NVME_SC_INTERNAL | NVME_STATUS_DNR;
+		goto out_destroy_sq;
+	}
+
+	dev_dbg(ctrl->dev, "SQ[%u]: %u entries of %zu B\n",
+		sqid, qsize, sq->qes);
+
+	return NVME_SC_SUCCESS;
+
+out_destroy_sq:
+	nvmet_sq_destroy(&sq->nvme_sq);
+out_clear_bit:
+	clear_bit(NVMET_PCI_EPF_Q_LIVE, &sq->flags);
+	return status;
+}
+
+/*
+ * Fabrics delete_sq operation: tear down the submission queue @sqid.
+ *
+ * Returns NVME_SC_QID_INVALID (with the DNR bit set) if the queue is not
+ * live and NVME_SC_SUCCESS otherwise.
+ */
+static u16 nvmet_pci_epf_delete_sq(struct nvmet_ctrl *tctrl, u16 sqid)
+{
+	struct nvmet_pci_epf_ctrl *ctrl = tctrl->drvdata;
+	struct nvmet_pci_epf_queue *sq = &ctrl->sq[sqid];
+	struct workqueue_struct *wq;
+
+	if (!test_and_clear_bit(NVMET_PCI_EPF_Q_LIVE, &sq->flags))
+		return NVME_SC_QID_INVALID | NVME_STATUS_DNR;
+
+	/* Wait for the commands being executed, then drop the workqueue. */
+	wq = sq->iod_wq;
+	flush_workqueue(wq);
+	destroy_workqueue(wq);
+	sq->iod_wq = NULL;
+
+	/* Free whatever is still on the queue list. */
+	nvmet_pci_epf_drain_queue(sq);
+
+	/* Only destroy the target SQ if it is attached to a controller. */
+	if (sq->nvme_sq.ctrl)
+		nvmet_sq_destroy(&sq->nvme_sq);
+
+	return NVME_SC_SUCCESS;
+}
+
+/*
+ * Fabrics get_feature operation: fill the feature data structure @data for
+ * the arbitration, IRQ coalescing and IRQ configuration features. Any other
+ * feature is rejected with NVME_SC_INVALID_FIELD.
+ */
+static u16 nvmet_pci_epf_get_feat(const struct nvmet_ctrl *tctrl,
+				  u8 feat, void *data)
+{
+	struct nvmet_pci_epf_ctrl *ctrl = tctrl->drvdata;
+	u16 status = NVME_SC_SUCCESS;
+
+	switch (feat) {
+	case NVME_FEAT_ARBITRATION: {
+		struct nvmet_feat_arbitration *arb = data;
+
+		/* No burst limit (sq_ab == 0) is reported as 0x7. */
+		arb->ab = ctrl->sq_ab ? ilog2(ctrl->sq_ab) : 0x7;
+		break;
+	}
+	case NVME_FEAT_IRQ_COALESCE: {
+		struct nvmet_feat_irq_coalesce *irqc = data;
+
+		/*
+		 * NOTE(review): set_feature stores thr + 1 in
+		 * irq_vector_threshold while the stored value is returned
+		 * unchanged here. Confirm that this is intended.
+		 */
+		irqc->time = 0;
+		irqc->thr = ctrl->irq_vector_threshold;
+		break;
+	}
+	case NVME_FEAT_IRQ_CONFIG: {
+		struct nvmet_feat_irq_config *irqcfg = data;
+		struct nvmet_pci_epf_irq_vector *iv;
+
+		/* The vector to look up is given by the caller in @data. */
+		mutex_lock(&ctrl->irq_lock);
+		iv = nvmet_pci_epf_find_irq_vector(ctrl, irqcfg->iv);
+		if (iv)
+			irqcfg->cd = iv->cd;
+		else
+			status = NVME_SC_INVALID_FIELD | NVME_STATUS_DNR;
+		mutex_unlock(&ctrl->irq_lock);
+		break;
+	}
+	default:
+		status = NVME_SC_INVALID_FIELD | NVME_STATUS_DNR;
+		break;
+	}
+
+	return status;
+}
+
+/*
+ * Fabrics set_feature operation: apply the feature data structure @data for
+ * the arbitration, IRQ coalescing and IRQ configuration features. Any other
+ * feature is rejected with NVME_SC_INVALID_FIELD.
+ */
+static u16 nvmet_pci_epf_set_feat(const struct nvmet_ctrl *tctrl,
+				  u8 feat, void *data)
+{
+	struct nvmet_pci_epf_ctrl *ctrl = tctrl->drvdata;
+	u16 status = NVME_SC_SUCCESS;
+
+	switch (feat) {
+	case NVME_FEAT_ARBITRATION: {
+		struct nvmet_feat_arbitration *arb = data;
+
+		/* 0x7 means no burst limit, stored as sq_ab == 0. */
+		ctrl->sq_ab = arb->ab == 0x7 ? 0 : 1 << arb->ab;
+		break;
+	}
+	case NVME_FEAT_IRQ_COALESCE: {
+		struct nvmet_feat_irq_coalesce *irqc = data;
+
+		/*
+		 * Precise IRQ coalescing timing is not implemented, so the
+		 * time field is ignored and only the threshold is used.
+		 */
+		ctrl->irq_vector_threshold = irqc->thr + 1;
+		break;
+	}
+	case NVME_FEAT_IRQ_CONFIG: {
+		struct nvmet_feat_irq_config *irqcfg = data;
+		struct nvmet_pci_epf_irq_vector *iv;
+
+		mutex_lock(&ctrl->irq_lock);
+		iv = nvmet_pci_epf_find_irq_vector(ctrl, irqcfg->iv);
+		if (iv)
+			iv->cd = irqcfg->cd;
+		else
+			status = NVME_SC_INVALID_FIELD | NVME_STATUS_DNR;
+		mutex_unlock(&ctrl->irq_lock);
+		break;
+	}
+	default:
+		status = NVME_SC_INVALID_FIELD | NVME_STATUS_DNR;
+		break;
+	}
+
+	return status;
+}
+
+/* Fabrics operations of the PCI endpoint transport (NVMF_TRTYPE_PCI). */
+static const struct nvmet_fabrics_ops nvmet_pci_epf_fabrics_ops = {
+	.owner		= THIS_MODULE,
+	.type		= NVMF_TRTYPE_PCI,
+
+	/* Port management. */
+	.add_port	= nvmet_pci_epf_add_port,
+	.remove_port	= nvmet_pci_epf_remove_port,
+
+	/* Queue management. */
+	.create_sq	= nvmet_pci_epf_create_sq,
+	.delete_sq	= nvmet_pci_epf_delete_sq,
+	.create_cq	= nvmet_pci_epf_create_cq,
+	.delete_cq	= nvmet_pci_epf_delete_cq,
+
+	/* Command handling and controller features. */
+	.queue_response = nvmet_pci_epf_queue_response,
+	.get_mdts	= nvmet_pci_epf_get_mdts,
+	.get_feature	= nvmet_pci_epf_get_feat,
+	.set_feature	= nvmet_pci_epf_set_feat,
+};
+
+static void nvmet_pci_epf_cq_work(struct work_struct *work);
+
+/*
+ * Initialize the fields that do not depend on the host for queue @qid of
+ * @ctrl: the submission queue if @sq is true, the completion queue otherwise.
+ */
+static void nvmet_pci_epf_init_queue(struct nvmet_pci_epf_ctrl *ctrl,
+				     unsigned int qid, bool sq)
+{
+	struct nvmet_pci_epf_queue *queue;
+
+	queue = sq ? &ctrl->sq[qid] : &ctrl->cq[qid];
+	queue->ctrl = ctrl;
+	queue->qid = qid;
+	spin_lock_init(&queue->lock);
+	INIT_LIST_HEAD(&queue->list);
+
+	/* SQs are flagged as such, CQs get their completion posting work. */
+	if (sq)
+		set_bit(NVMET_PCI_EPF_Q_IS_SQ, &queue->flags);
+	else
+		INIT_DELAYED_WORK(&queue->work, nvmet_pci_epf_cq_work);
+}
+
+/*
+ * Allocate and initialize the ctrl->nr_queues submission and completion
+ * queues of @ctrl. Returns 0 on success and -ENOMEM on allocation failure,
+ * in which case nothing is left allocated.
+ */
+static int nvmet_pci_epf_alloc_queues(struct nvmet_pci_epf_ctrl *ctrl)
+{
+	unsigned int i;
+
+	ctrl->sq = kcalloc(ctrl->nr_queues, sizeof(*ctrl->sq), GFP_KERNEL);
+	if (!ctrl->sq)
+		return -ENOMEM;
+
+	ctrl->cq = kcalloc(ctrl->nr_queues, sizeof(*ctrl->cq), GFP_KERNEL);
+	if (!ctrl->cq)
+		goto err_free_sq;
+
+	for (i = 0; i < ctrl->nr_queues; i++) {
+		nvmet_pci_epf_init_queue(ctrl, i, true);
+		nvmet_pci_epf_init_queue(ctrl, i, false);
+	}
+
+	return 0;
+
+err_free_sq:
+	kfree(ctrl->sq);
+	ctrl->sq = NULL;
+	return -ENOMEM;
+}
+
+/*
+ * Free the queue arrays of @ctrl and reset the pointers to NULL so that the
+ * arrays are never used nor freed again.
+ */
+static void nvmet_pci_epf_free_queues(struct nvmet_pci_epf_ctrl *ctrl)
+{
+	struct nvmet_pci_epf_queue *sqs = ctrl->sq;
+	struct nvmet_pci_epf_queue *cqs = ctrl->cq;
+
+	kfree(sqs);
+	ctrl->sq = NULL;
+
+	kfree(cqs);
+	ctrl->cq = NULL;
+}
+
+/*
+ * Map the PCI address range of @queue (queue->pci_size bytes at
+ * queue->pci_addr) into queue->pci_map. A mapping smaller than the queue is
+ * refused and undone. Returns 0 on success or a negative error code.
+ */
+static int nvmet_pci_epf_map_queue(struct nvmet_pci_epf_ctrl *ctrl,
+				   struct nvmet_pci_epf_queue *queue)
+{
+	struct nvmet_pci_epf *nvme_epf = ctrl->nvme_epf;
+	int err;
+
+	err = nvmet_pci_epf_mem_map(nvme_epf, queue->pci_addr,
+				    queue->pci_size, &queue->pci_map);
+	if (err) {
+		dev_err(ctrl->dev, "Failed to map queue %u (err=%d)\n",
+			queue->qid, err);
+		return err;
+	}
+
+	/* The mapping covers the entire queue: done. */
+	if (queue->pci_map.pci_size >= queue->pci_size)
+		return 0;
+
+	dev_err(ctrl->dev, "Invalid partial mapping of queue %u\n",
+		queue->qid);
+	nvmet_pci_epf_mem_unmap(nvme_epf, &queue->pci_map);
+
+	return -ENOMEM;
+}
+
+/* Undo the mapping of @queue done with nvmet_pci_epf_map_queue(). */
+static inline void nvmet_pci_epf_unmap_queue(struct nvmet_pci_epf_ctrl *ctrl,
+					     struct nvmet_pci_epf_queue *queue)
+{
+	struct nvmet_pci_epf *nvme_epf = ctrl->nvme_epf;
+
+	nvmet_pci_epf_mem_unmap(nvme_epf, &queue->pci_map);
+}
+
+/*
+ * Work function executing one command (IOD) fetched from a submission queue:
+ * initialize the target request, get the command data from the host if the
+ * command is a write, execute the request and, for commands returning data,
+ * wait for the execution to finish and send the data to the host before
+ * completing the command.
+ */
+static void nvmet_pci_epf_exec_iod_work(struct work_struct *work)
+{
+	struct nvmet_pci_epf_iod *iod =
+		container_of(work, struct nvmet_pci_epf_iod, work);
+	struct nvmet_req *req = &iod->req;
+	bool data_to_host;
+	int ret;
+
+	/* The link is down: nobody to complete the command to. */
+	if (!iod->ctrl->link_up) {
+		nvmet_pci_epf_free_iod(iod);
+		return;
+	}
+
+	if (!test_bit(NVMET_PCI_EPF_Q_LIVE, &iod->sq->flags)) {
+		iod->status = NVME_SC_QID_INVALID | NVME_STATUS_DNR;
+		goto complete;
+	}
+
+	if (!nvmet_req_init(req, &iod->cq->nvme_cq, &iod->sq->nvme_sq,
+			    &nvmet_pci_epf_fabrics_ops))
+		goto complete;
+
+	iod->data_len = nvmet_req_transfer_len(req);
+	if (iod->data_len) {
+		/*
+		 * Get the data DMA transfer direction. Here "device" means the
+		 * PCI root-complex host.
+		 */
+		if (nvme_is_write(&iod->cmd))
+			iod->dma_dir = DMA_FROM_DEVICE;
+		else
+			iod->dma_dir = DMA_TO_DEVICE;
+
+		/*
+		 * Setup the command data buffer and get the command data from
+		 * the host if needed.
+		 */
+		ret = nvmet_pci_epf_alloc_iod_data_buf(iod);
+		if (!ret && iod->dma_dir == DMA_FROM_DEVICE)
+			ret = nvmet_pci_epf_transfer_iod_data(iod);
+		if (ret) {
+			nvmet_req_uninit(req);
+			goto complete;
+		}
+	}
+
+	/*
+	 * If we do not have data to transfer after the command execution
+	 * finishes, nvmet_pci_epf_queue_response() will complete the command
+	 * directly and there is no need to wait for the completion. In that
+	 * case, the IOD may be posted to the host and freed by the CQ work at
+	 * any time once the request is executed, so decide now what to do and
+	 * do not look at the IOD again after calling req->execute().
+	 */
+	data_to_host = iod->data_len && iod->dma_dir == DMA_TO_DEVICE;
+
+	req->execute(req);
+
+	if (!data_to_host)
+		return;
+
+	/* nvmet_pci_epf_queue_response() signals the end of the execution. */
+	wait_for_completion(&iod->done);
+
+	if (iod->status == NVME_SC_SUCCESS) {
+		WARN_ON_ONCE(!iod->data_len || iod->dma_dir != DMA_TO_DEVICE);
+		nvmet_pci_epf_transfer_iod_data(iod);
+	}
+
+complete:
+	nvmet_pci_epf_complete_iod(iod);
+}
+
+/*
+ * Fetch the commands newly submitted to @sq by the host and queue their
+ * execution on the SQ workqueue. If ctrl->sq_ab is not 0, at most
+ * ctrl->sq_ab commands are fetched per call. Returns the number of commands
+ * fetched.
+ */
+static int nvmet_pci_epf_process_sq(struct nvmet_pci_epf_ctrl *ctrl,
+				    struct nvmet_pci_epf_queue *sq)
+{
+	struct nvmet_pci_epf_iod *iod;
+	int err, nr_cmds = 0;
+	u64 cmd_addr;
+
+	/* The host advertises new commands with the SQ tail doorbell. */
+	sq->tail = nvmet_pci_epf_bar_read32(ctrl, sq->db);
+	while (sq->head != sq->tail &&
+	       (!ctrl->sq_ab || nr_cmds < ctrl->sq_ab)) {
+		iod = nvmet_pci_epf_alloc_iod(sq);
+		if (!iod)
+			break;
+
+		/* Get the NVMe command submitted by the host. */
+		cmd_addr = sq->pci_addr + sq->head * sq->qes;
+		err = nvmet_pci_epf_transfer(ctrl, &iod->cmd, cmd_addr,
+					     sq->qes, DMA_FROM_DEVICE);
+		if (err) {
+			/* Not much we can do... */
+			nvmet_pci_epf_free_iod(iod);
+			break;
+		}
+
+		dev_dbg(ctrl->dev, "SQ[%u]: head %u, tail %u, command %s\n",
+			sq->qid, sq->head, sq->tail,
+			nvmet_pci_epf_iod_name(iod));
+
+		/* Consume the SQ entry, wrapping at the end of the queue. */
+		if (++sq->head == sq->depth)
+			sq->head = 0;
+		nr_cmds++;
+
+		queue_work_on(WORK_CPU_UNBOUND, sq->iod_wq, &iod->work);
+
+		/* The host may have submitted more commands in the meantime. */
+		sq->tail = nvmet_pci_epf_bar_read32(ctrl, sq->db);
+	}
+
+	return nr_cmds;
+}
+
+/*
+ * Delayed work polling the live submission queues of the controller for new
+ * commands. The work always reschedules itself after
+ * NVMET_PCI_EPF_SQ_POLL_INTERVAL.
+ */
+static void nvmet_pci_epf_poll_sqs_work(struct work_struct *work)
+{
+	struct nvmet_pci_epf_ctrl *ctrl =
+		container_of(work, struct nvmet_pci_epf_ctrl, poll_sqs.work);
+	struct nvmet_pci_epf_queue *sq;
+	unsigned long last_active = 0;
+	int qid, nr_busy;
+
+	while (ctrl->link_up && ctrl->enabled) {
+		/* Do round-robin arbitration. */
+		nr_busy = 0;
+		for (qid = 0; qid < ctrl->nr_queues; qid++) {
+			sq = &ctrl->sq[qid];
+			if (test_bit(NVMET_PCI_EPF_Q_LIVE, &sq->flags) &&
+			    nvmet_pci_epf_process_sq(ctrl, sq))
+				nr_busy++;
+		}
+
+		/* Commands were received: keep polling without delay. */
+		if (nr_busy) {
+			last_active = jiffies;
+			continue;
+		}
+
+		/*
+		 * No command was received on any queue for more than
+		 * NVMET_PCI_EPF_SQ_POLL_IDLE: consider the controller idle
+		 * and stop here to reschedule, so that a CPU is not "burned"
+		 * polling a controller that is idle for a long time.
+		 */
+		if (time_is_before_jiffies(last_active +
+					   NVMET_PCI_EPF_SQ_POLL_IDLE))
+			break;
+
+		cpu_relax();
+	}
+
+	schedule_delayed_work(&ctrl->poll_sqs, NVMET_PCI_EPF_SQ_POLL_INTERVAL);
+}
+
+/*
+ * Delayed work posting to the host the completion entries of the commands
+ * queued on the list of a completion queue. The queue is mapped for the
+ * duration of the work. If the queue cannot be mapped or if it is full, the
+ * work is scheduled again after NVMET_PCI_EPF_CQ_RETRY_INTERVAL.
+ */
+static void nvmet_pci_epf_cq_work(struct work_struct *work)
+{
+	struct nvmet_pci_epf_queue *cq =
+		container_of(work, struct nvmet_pci_epf_queue, work.work);
+	struct nvmet_pci_epf_ctrl *ctrl = cq->ctrl;
+	struct nvme_completion *cqe;
+	struct nvmet_pci_epf_iod *iod;
+	unsigned long flags;
+	u32 next_tail;
+	int ret, n = 0;
+
+	ret = nvmet_pci_epf_map_queue(ctrl, cq);
+	if (ret)
+		goto again;
+
+	while (test_bit(NVMET_PCI_EPF_Q_LIVE, &cq->flags) && ctrl->link_up) {
+
+		/*
+		 * Check that the CQ is not full, that is, that advancing the
+		 * tail would not make it equal to the head last written by
+		 * the host. The tail wraps to 0 at the end of the queue, which
+		 * must be accounted for: otherwise a full queue goes
+		 * undetected when the tail is on the last entry and the head
+		 * is 0, and entries not yet consumed by the host end up being
+		 * overwritten.
+		 */
+		cq->head = nvmet_pci_epf_bar_read32(ctrl, cq->db);
+		next_tail = cq->tail + 1;
+		if (next_tail >= cq->depth)
+			next_tail = 0;
+		if (cq->head == next_tail) {
+			ret = -EAGAIN;
+			break;
+		}
+
+		/* Get the next completed command, if any. */
+		spin_lock_irqsave(&cq->lock, flags);
+		iod = list_first_entry_or_null(&cq->list,
+					       struct nvmet_pci_epf_iod, link);
+		if (iod)
+			list_del_init(&iod->link);
+		spin_unlock_irqrestore(&cq->lock, flags);
+
+		if (!iod)
+			break;
+
+		/* Post the IOD completion entry. */
+		cqe = &iod->cqe;
+		cqe->status = cpu_to_le16((iod->status << 1) | cq->phase);
+
+		dev_dbg(ctrl->dev,
+			"CQ[%u]: %s status 0x%x, result 0x%llx, head %u, tail %u, phase %u\n",
+			cq->qid, nvmet_pci_epf_iod_name(iod), iod->status,
+			le64_to_cpu(cqe->result.u64), cq->head, cq->tail,
+			cq->phase);
+
+		memcpy_toio(cq->pci_map.virt_addr + cq->tail * cq->qes,
+			    cqe, cq->qes);
+
+		/* Advance the tail, inverting the phase on wrap-around. */
+		cq->tail++;
+		if (cq->tail >= cq->depth) {
+			cq->tail = 0;
+			cq->phase ^= 1;
+		}
+
+		nvmet_pci_epf_free_iod(iod);
+
+		/* Signal the host. */
+		nvmet_pci_epf_raise_irq(ctrl, cq, false);
+		n++;
+	}
+
+	nvmet_pci_epf_unmap_queue(ctrl, cq);
+
+	/*
+	 * We do not support precise IRQ coalescing time (100ns units as per
+	 * NVMe specifications). So if we have posted completion entries without
+	 * reaching the interrupt coalescing threshold, raise an interrupt.
+	 */
+	if (n)
+		nvmet_pci_epf_raise_irq(ctrl, cq, true);
+
+again:
+	if (ret < 0)
+		queue_delayed_work(system_highpri_wq, &cq->work,
+				   NVMET_PCI_EPF_CQ_RETRY_INTERVAL);
+}
+
+/*
+ * Enable the controller: get the memory page size and the I/O queue entry
+ * sizes from the controller configuration (CC), create the admin completion
+ * and submission queues described by the AQA, ACQ and ASQ registers and start
+ * polling the submission queues.
+ *
+ * Returns 0 on success (or if the controller is already enabled) and -EINVAL
+ * if the queue entry sizes are not supported or if the admin queues cannot
+ * be created.
+ */
+static int nvmet_pci_epf_enable_ctrl(struct nvmet_pci_epf_ctrl *ctrl)
+{
+	u64 pci_addr, asq, acq;
+	u32 aqa;
+	u16 status, qsize;
+
+	if (ctrl->enabled)
+		return 0;
+
+	dev_info(ctrl->dev, "Enabling controller\n");
+
+	ctrl->mps_shift = nvmet_cc_mps(ctrl->cc) + 12;
+	ctrl->mps = 1UL << ctrl->mps_shift;
+	ctrl->mps_mask = ctrl->mps - 1;
+
+	ctrl->io_sqes = 1UL << nvmet_cc_iosqes(ctrl->cc);
+	if (ctrl->io_sqes < sizeof(struct nvme_command)) {
+		dev_err(ctrl->dev, "Unsupported I/O SQES %zu (need %zu)\n",
+			ctrl->io_sqes, sizeof(struct nvme_command));
+		return -EINVAL;
+	}
+
+	/* Report the rejected CQ entry size, not the SQ entry size. */
+	ctrl->io_cqes = 1UL << nvmet_cc_iocqes(ctrl->cc);
+	if (ctrl->io_cqes < sizeof(struct nvme_completion)) {
+		dev_err(ctrl->dev, "Unsupported I/O CQES %zu (need %zu)\n",
+			ctrl->io_cqes, sizeof(struct nvme_completion));
+		return -EINVAL;
+	}
+
+	/* Create the admin queue. */
+	aqa = nvmet_pci_epf_bar_read32(ctrl, NVME_REG_AQA);
+	asq = nvmet_pci_epf_bar_read64(ctrl, NVME_REG_ASQ);
+	acq = nvmet_pci_epf_bar_read64(ctrl, NVME_REG_ACQ);
+
+	/* The admin CQ size is in bits 27:16 of AQA. */
+	qsize = (aqa & 0x0fff0000) >> 16;
+	pci_addr = acq & GENMASK_ULL(63, 12);
+	status = nvmet_pci_epf_create_cq(ctrl->tctrl, 0,
+				NVME_CQ_IRQ_ENABLED | NVME_QUEUE_PHYS_CONTIG,
+				qsize, pci_addr, 0);
+	if (status != NVME_SC_SUCCESS) {
+		dev_err(ctrl->dev, "Failed to create admin completion queue\n");
+		return -EINVAL;
+	}
+
+	/* The admin SQ size is in bits 11:0 of AQA. */
+	qsize = aqa & 0x00000fff;
+	pci_addr = asq & GENMASK_ULL(63, 12);
+	status = nvmet_pci_epf_create_sq(ctrl->tctrl, 0, NVME_QUEUE_PHYS_CONTIG,
+					 qsize, pci_addr);
+	if (status != NVME_SC_SUCCESS) {
+		dev_err(ctrl->dev, "Failed to create admin submission queue\n");
+		nvmet_pci_epf_delete_cq(ctrl->tctrl, 0);
+		return -EINVAL;
+	}
+
+	ctrl->sq_ab = NVMET_PCI_EPF_SQ_AB;
+	ctrl->irq_vector_threshold = NVMET_PCI_EPF_IV_THRESHOLD;
+	ctrl->enabled = true;
+
+	/* Start polling the controller SQs. */
+	schedule_delayed_work(&ctrl->poll_sqs, 0);
+
+	return 0;
+}
+
+/*
+ * Disable the controller: stop polling the submission queues and delete all
+ * the queues. Nothing is done if the controller is not enabled.
+ */
+static void nvmet_pci_epf_disable_ctrl(struct nvmet_pci_epf_ctrl *ctrl)
+{
+	struct nvmet_ctrl *tctrl = ctrl->tctrl;
+	int i;
+
+	if (!ctrl->enabled)
+		return;
+
+	dev_info(ctrl->dev, "Disabling controller\n");
+
+	ctrl->enabled = false;
+	cancel_delayed_work_sync(&ctrl->poll_sqs);
+
+	/* Delete all I/O queues first: all the SQs, then all the CQs. */
+	for (i = 1; i < ctrl->nr_queues; i++)
+		nvmet_pci_epf_delete_sq(tctrl, i);
+
+	for (i = 1; i < ctrl->nr_queues; i++)
+		nvmet_pci_epf_delete_cq(tctrl, i);
+
+	/* Delete the admin queue last. */
+	nvmet_pci_epf_delete_sq(tctrl, 0);
+	nvmet_pci_epf_delete_cq(tctrl, 0);
+}
+
+/*
+ * Delayed work polling the controller configuration register (CC) in the
+ * register BAR to detect the host enabling, disabling or shutting down the
+ * controller, and updating the controller status register (CSTS)
+ * accordingly. The work reschedules itself every
+ * NVMET_PCI_EPF_CC_POLL_INTERVAL for as long as the target controller exists.
+ */
+static void nvmet_pci_epf_poll_cc_work(struct work_struct *work)
+{
+	struct nvmet_pci_epf_ctrl *ctrl =
+		container_of(work, struct nvmet_pci_epf_ctrl, poll_cc.work);
+	u32 old_cc, new_cc;
+	int ret;
+
+	if (!ctrl->tctrl)
+		return;
+
+	old_cc = ctrl->cc;
+	new_cc = nvmet_pci_epf_bar_read32(ctrl, NVME_REG_CC);
+	ctrl->cc = new_cc;
+
+	if (nvmet_cc_en(new_cc) && !nvmet_cc_en(old_cc)) {
+		ret = nvmet_pci_epf_enable_ctrl(ctrl);
+		if (ret) {
+			/*
+			 * Keep polling CC, so that the host clearing and
+			 * setting CC.EN again is still seen and the controller
+			 * can be enabled later. CSTS.RDY is left clear.
+			 */
+			goto reschedule;
+		}
+		ctrl->csts |= NVME_CSTS_RDY;
+	}
+
+	if (!nvmet_cc_en(new_cc) && nvmet_cc_en(old_cc)) {
+		nvmet_pci_epf_disable_ctrl(ctrl);
+		ctrl->csts &= ~NVME_CSTS_RDY;
+	}
+
+	if (nvmet_cc_shn(new_cc) && !nvmet_cc_shn(old_cc)) {
+		nvmet_pci_epf_disable_ctrl(ctrl);
+		ctrl->csts |= NVME_CSTS_SHST_CMPLT;
+	}
+
+	if (!nvmet_cc_shn(new_cc) && nvmet_cc_shn(old_cc))
+		ctrl->csts &= ~NVME_CSTS_SHST_CMPLT;
+
+	nvmet_update_cc(ctrl->tctrl, ctrl->cc);
+	nvmet_pci_epf_bar_write32(ctrl, NVME_REG_CSTS, ctrl->csts);
+
+reschedule:
+	schedule_delayed_work(&ctrl->poll_cc, NVMET_PCI_EPF_CC_POLL_INTERVAL);
+}
+
+/*
+ * Initialize the controller registers (CAP, VS, CSTS and CC) in the register
+ * BAR, using the target controller values as a base.
+ */
+static void nvmet_pci_epf_init_bar(struct nvmet_pci_epf_ctrl *ctrl)
+{
+	struct nvmet_ctrl *tctrl = ctrl->tctrl;
+	u64 cap;
+
+	ctrl->bar = ctrl->nvme_epf->reg_bar;
+
+	/* Start from the target controller capabilities. */
+	cap = tctrl->cap;
+
+	/* Set Contiguous Queues Required (CQR). */
+	cap |= 0x1ULL << 16;
+
+	/* Clear the doorbell stride field to get a 4B stride. */
+	cap &= ~GENMASK_ULL(35, 32);
+
+	/*
+	 * Clear NVM Subsystem Reset Supported (NSSRS, bit 36), Boot Partition
+	 * Support (BPS, bit 45), Persistent Memory Region Supported (PMRS,
+	 * bit 56) and Controller Memory Buffer Supported (CMBS, bit 57).
+	 */
+	cap &= ~((0x1ULL << 36) | (0x1ULL << 45) |
+		 (0x1ULL << 56) | (0x1ULL << 57));
+
+	ctrl->cap = cap;
+
+	/* Controller configuration, with the controller not enabled. */
+	ctrl->cc = tctrl->cc & (~NVME_CC_ENABLE);
+
+	/* Controller status. */
+	ctrl->csts = tctrl->csts;
+
+	nvmet_pci_epf_bar_write64(ctrl, NVME_REG_CAP, ctrl->cap);
+	nvmet_pci_epf_bar_write32(ctrl, NVME_REG_VS, tctrl->subsys->ver);
+	nvmet_pci_epf_bar_write32(ctrl, NVME_REG_CSTS, ctrl->csts);
+	nvmet_pci_epf_bar_write32(ctrl, NVME_REG_CC, ctrl->cc);
+}
+
+/*
+ * Initialize the PCI controller of @nvme_epf and create its NVMe target
+ * controller with at most @max_nr_queues queues, admin queue included.
+ *
+ * Returns 0 on success or a negative error code on failure, in which case
+ * everything set up here is released again.
+ */
+static int nvmet_pci_epf_create_ctrl(struct nvmet_pci_epf *nvme_epf,
+				     unsigned int max_nr_queues)
+{
+	struct nvmet_pci_epf_ctrl *ctrl = &nvme_epf->ctrl;
+	struct nvmet_alloc_ctrl_args args = {};
+	char hostnqn[NVMF_NQN_SIZE];
+	uuid_t id;
+	int err;
+
+	memset(ctrl, 0, sizeof(*ctrl));
+	ctrl->nvme_epf = nvme_epf;
+	ctrl->dev = &nvme_epf->epf->dev;
+	ctrl->mdts = nvme_epf->mdts_kb * SZ_1K;
+	mutex_init(&ctrl->irq_lock);
+	INIT_DELAYED_WORK(&ctrl->poll_cc, nvmet_pci_epf_poll_cc_work);
+	INIT_DELAYED_WORK(&ctrl->poll_sqs, nvmet_pci_epf_poll_sqs_work);
+
+	err = mempool_init_kmalloc_pool(&ctrl->iod_pool,
+					max_nr_queues * NVMET_MAX_QUEUE_SIZE,
+					sizeof(struct nvmet_pci_epf_iod));
+	if (err) {
+		dev_err(ctrl->dev, "Failed to initialize IOD mempool\n");
+		return err;
+	}
+
+	ctrl->port = nvmet_pci_epf_find_port(ctrl, nvme_epf->portid);
+	if (!ctrl->port) {
+		dev_err(ctrl->dev, "Port not found\n");
+		err = -EINVAL;
+		goto err_exit_mempool;
+	}
+
+	/*
+	 * Create the target controller, using a host NQN built from a
+	 * generated UUID and an all zero host ID.
+	 */
+	uuid_gen(&id);
+	snprintf(hostnqn, NVMF_NQN_SIZE,
+		 "nqn.2014-08.org.nvmexpress:uuid:%pUb", &id);
+	memset(&id, 0, sizeof(uuid_t));
+
+	args.ops = &nvmet_pci_epf_fabrics_ops;
+	args.port = ctrl->port;
+	args.subsysnqn = nvme_epf->subsysnqn;
+	args.hostnqn = hostnqn;
+	args.hostid = &id;
+
+	ctrl->tctrl = nvmet_alloc_ctrl(&args);
+	if (!ctrl->tctrl) {
+		dev_err(ctrl->dev, "Failed to create target controller\n");
+		err = -ENOMEM;
+		goto err_exit_mempool;
+	}
+	ctrl->tctrl->drvdata = ctrl;
+
+	/* We do not support protection information for now. */
+	if (ctrl->tctrl->pi_support) {
+		dev_err(ctrl->dev,
+			"Protection information (PI) is not supported\n");
+		err = -ENOTSUPP;
+		goto err_put_ctrl;
+	}
+
+	/* Allocate our queues, up to the maximum number. */
+	ctrl->nr_queues = min(ctrl->tctrl->subsys->max_qid + 1, max_nr_queues);
+	err = nvmet_pci_epf_alloc_queues(ctrl);
+	if (err)
+		goto err_put_ctrl;
+
+	/*
+	 * Allocate the IRQ vectors descriptors. We cannot have more than the
+	 * maximum number of queues.
+	 */
+	err = nvmet_pci_epf_alloc_irq_vectors(ctrl);
+	if (err)
+		goto err_free_queues;
+
+	dev_info(ctrl->dev,
+		 "New PCI ctrl \"%s\", %u I/O queues, mdts %u B\n",
+		 ctrl->tctrl->subsys->subsysnqn, ctrl->nr_queues - 1,
+		 ctrl->mdts);
+
+	/* Initialize BAR 0 using the target controller CAP. */
+	nvmet_pci_epf_init_bar(ctrl);
+
+	return 0;
+
+err_free_queues:
+	nvmet_pci_epf_free_queues(ctrl);
+err_put_ctrl:
+	nvmet_ctrl_put(ctrl->tctrl);
+	ctrl->tctrl = NULL;
+err_exit_mempool:
+	mempool_exit(&ctrl->iod_pool);
+	return err;
+}
+
+/* Start the controller, that is, start polling its CC register. */
+static void nvmet_pci_epf_start_ctrl(struct nvmet_pci_epf_ctrl *ctrl)
+{
+	struct delayed_work *poll_cc = &ctrl->poll_cc;
+
+	schedule_delayed_work(poll_cc, NVMET_PCI_EPF_CC_POLL_INTERVAL);
+}
+
+/*
+ * Stop the controller: stop polling the CC register first, then disable the
+ * controller.
+ */
+static void nvmet_pci_epf_stop_ctrl(struct nvmet_pci_epf_ctrl *ctrl)
+{
+	struct delayed_work *poll_cc = &ctrl->poll_cc;
+
+	cancel_delayed_work_sync(poll_cc);
+	nvmet_pci_epf_disable_ctrl(ctrl);
+}
+
+/*
+ * Stop the controller and release everything set up by
+ * nvmet_pci_epf_create_ctrl(). Nothing is done if the target controller does
+ * not exist.
+ */
+static void nvmet_pci_epf_destroy_ctrl(struct nvmet_pci_epf_ctrl *ctrl)
+{
+	struct nvmet_ctrl *tctrl = ctrl->tctrl;
+
+	if (!tctrl)
+		return;
+
+	dev_info(ctrl->dev, "Destroying PCI ctrl \"%s\"\n",
+		 tctrl->subsys->subsysnqn);
+
+	nvmet_pci_epf_stop_ctrl(ctrl);
+
+	nvmet_pci_epf_free_queues(ctrl);
+	nvmet_pci_epf_free_irq_vectors(ctrl);
+
+	/* Drop our reference on the target controller. */
+	nvmet_ctrl_put(tctrl);
+	ctrl->tctrl = NULL;
+
+	mempool_exit(&ctrl->iod_pool);
+}
+
+/*
+ * Compute the size of the register BAR (BAR 0) and allocate its memory.
+ * Returns 0 on success, -ENODEV if BAR 0 is not the first free BAR and
+ * -ENOMEM if BAR 0 is too small or cannot be allocated.
+ */
+static int nvmet_pci_epf_configure_bar(struct nvmet_pci_epf *nvme_epf)
+{
+	const struct pci_epc_features *epc_features = nvme_epf->epc_features;
+	struct pci_epf *epf = nvme_epf->epf;
+	size_t msix_table_size = 0;
+	size_t reg_size, reg_bar_size;
+
+	/*
+	 * The first free BAR will be our register BAR and per NVMe
+	 * specifications, it must be BAR 0.
+	 */
+	if (pci_epc_get_first_free_bar(epc_features) != BAR_0) {
+		dev_err(&epf->dev, "BAR 0 is not free\n");
+		return -ENODEV;
+	}
+
+	if (epc_features->bar[BAR_0].only_64bit)
+		epf->bar[BAR_0].flags |= PCI_BASE_ADDRESS_MEM_TYPE_64;
+
+	/*
+	 * Size of the register BAR: the NVMe registers come first, with
+	 * enough space for the doorbells of all queues.
+	 */
+	reg_size = NVME_REG_DBS + (NVMET_NR_QUEUES * 2 * sizeof(u32));
+	reg_size = ALIGN(reg_size, 8);
+
+	/* The MSI-X table and PBA follow the registers, if supported. */
+	if (epc_features->msix_capable) {
+		size_t pba_size;
+
+		nvme_epf->msix_table_offset = reg_size;
+		msix_table_size = PCI_MSIX_ENTRY_SIZE * epf->msix_interrupts;
+		pba_size = ALIGN(DIV_ROUND_UP(epf->msix_interrupts, 8), 8);
+
+		reg_size += msix_table_size + pba_size;
+	}
+
+	if (epc_features->bar[BAR_0].type != BAR_FIXED) {
+		reg_bar_size = ALIGN(reg_size, max(epc_features->align, 4096));
+	} else if (reg_size <= epc_features->bar[BAR_0].fixed_size) {
+		/* A fixed size BAR is used in full. */
+		reg_bar_size = epc_features->bar[BAR_0].fixed_size;
+	} else {
+		dev_err(&epf->dev,
+			"BAR 0 size %llu B too small, need %zu B\n",
+			epc_features->bar[BAR_0].fixed_size,
+			reg_size);
+		return -ENOMEM;
+	}
+
+	nvme_epf->reg_bar = pci_epf_alloc_space(epf, reg_bar_size, BAR_0,
+						epc_features, PRIMARY_INTERFACE);
+	if (!nvme_epf->reg_bar) {
+		dev_err(&epf->dev, "Failed to allocate BAR 0\n");
+		return -ENOMEM;
+	}
+	memset(nvme_epf->reg_bar, 0, reg_bar_size);
+
+	return 0;
+}
+
+/* Free the register BAR memory, if it was allocated. */
+static void nvmet_pci_epf_free_bar(struct nvmet_pci_epf *nvme_epf)
+{
+	void *reg_bar = nvme_epf->reg_bar;
+
+	if (reg_bar) {
+		pci_epf_free_space(nvme_epf->epf, reg_bar, BAR_0,
+				   PRIMARY_INTERFACE);
+		nvme_epf->reg_bar = NULL;
+	}
+}
+
+/* Clear BAR 0 of the endpoint function in the endpoint controller. */
+static void nvmet_pci_epf_clear_bar(struct nvmet_pci_epf *nvme_epf)
+{
+	struct pci_epf *epf = nvme_epf->epf;
+	struct pci_epf_bar *bar0 = &epf->bar[BAR_0];
+
+	pci_epc_clear_bar(epf->epc, epf->func_no, epf->vfunc_no, bar0);
+}
+
+/*
+ * Select and configure the type of interrupts of the endpoint function:
+ * MSI-X if it can be used, MSI otherwise, and INTx as a last resort. Returns
+ * 0 on success or the error of the MSI-X or MSI configuration.
+ */
+static int nvmet_pci_epf_init_irq(struct nvmet_pci_epf *nvme_epf)
+{
+	const struct pci_epc_features *epc_features = nvme_epf->epc_features;
+	struct pci_epf *epf = nvme_epf->epf;
+	int err;
+
+	if (epc_features->msix_capable && epf->msix_interrupts) {
+		/* Enable MSI-X if supported, otherwise, use MSI. */
+		err = pci_epc_set_msix(epf->epc, epf->func_no, epf->vfunc_no,
+				       epf->msix_interrupts, BAR_0,
+				       nvme_epf->msix_table_offset);
+		if (err) {
+			dev_err(&epf->dev, "Failed to configure MSI-X\n");
+			return err;
+		}
+
+		nvme_epf->nr_vectors = epf->msix_interrupts;
+		nvme_epf->irq_type = PCI_IRQ_MSIX;
+	} else if (epc_features->msi_capable && epf->msi_interrupts) {
+		err = pci_epc_set_msi(epf->epc, epf->func_no, epf->vfunc_no,
+				      epf->msi_interrupts);
+		if (err) {
+			dev_err(&epf->dev, "Failed to configure MSI\n");
+			return err;
+		}
+
+		nvme_epf->nr_vectors = epf->msi_interrupts;
+		nvme_epf->irq_type = PCI_IRQ_MSI;
+	} else {
+		/* MSI and MSI-X are not supported: fall back to INTx. */
+		nvme_epf->nr_vectors = 1;
+		nvme_epf->irq_type = PCI_IRQ_INTX;
+	}
+
+	return 0;
+}
+
+/*
+ * Endpoint controller initialization event: create the target controller,
+ * write the PCI configuration header, set BAR 0 and set up the interrupts.
+ * Without a link up notifier, the controller is also started. Returns 0 on
+ * success or a negative error code.
+ */
+static int nvmet_pci_epf_epc_init(struct pci_epf *epf)
+{
+	struct nvmet_pci_epf *nvme_epf = epf_get_drvdata(epf);
+	const struct pci_epc_features *epc_features = nvme_epf->epc_features;
+	struct nvmet_pci_epf_ctrl *ctrl = &nvme_epf->ctrl;
+	unsigned int max_nr_queues = NVMET_NR_QUEUES;
+	int err;
+
+	/* For now, do not support virtual functions. */
+	if (epf->vfunc_no > 0) {
+		dev_err(&epf->dev, "Virtual functions are not supported\n");
+		return -EINVAL;
+	}
+
+	/*
+	 * The controller cannot have more queues than the number of IRQs that
+	 * can be used.
+	 */
+	if (epc_features->msix_capable && epf->msix_interrupts) {
+		dev_info(&epf->dev,
+			 "PCI endpoint controller supports MSI-X, %u vectors\n",
+			 epf->msix_interrupts);
+		max_nr_queues = min(max_nr_queues, epf->msix_interrupts);
+	} else if (epc_features->msi_capable && epf->msi_interrupts) {
+		dev_info(&epf->dev,
+			 "PCI endpoint controller supports MSI, %u vectors\n",
+			 epf->msi_interrupts);
+		max_nr_queues = min(max_nr_queues, epf->msi_interrupts);
+	}
+
+	/* At least 2 queues are required. */
+	if (max_nr_queues < 2) {
+		dev_err(&epf->dev, "Invalid maximum number of queues %u\n",
+			max_nr_queues);
+		return -EINVAL;
+	}
+
+	/* Create the target controller. */
+	err = nvmet_pci_epf_create_ctrl(nvme_epf, max_nr_queues);
+	if (err) {
+		dev_err(&epf->dev,
+			"Failed to create NVMe PCI target controller (err=%d)\n",
+			err);
+		return err;
+	}
+
+	/* Set device ID, class, etc. */
+	epf->header->vendorid = ctrl->tctrl->subsys->vendor_id;
+	epf->header->subsys_vendor_id = ctrl->tctrl->subsys->subsys_vendor_id;
+	err = pci_epc_write_header(epf->epc, epf->func_no, epf->vfunc_no,
+				   epf->header);
+	if (err) {
+		dev_err(&epf->dev,
+			"Failed to write configuration header (err=%d)\n", err);
+		goto err_destroy_ctrl;
+	}
+
+	err = pci_epc_set_bar(epf->epc, epf->func_no, epf->vfunc_no,
+			      &epf->bar[BAR_0]);
+	if (err) {
+		dev_err(&epf->dev, "Failed to set BAR 0 (err=%d)\n", err);
+		goto err_destroy_ctrl;
+	}
+
+	/* Enable interrupts. */
+	err = nvmet_pci_epf_init_irq(nvme_epf);
+	if (err)
+		goto err_clear_bar;
+
+	/*
+	 * Without a link up notifier, consider the link as up and start
+	 * polling the controller BAR right away.
+	 */
+	if (!epc_features->linkup_notifier) {
+		ctrl->link_up = true;
+		nvmet_pci_epf_start_ctrl(ctrl);
+	}
+
+	return 0;
+
+err_clear_bar:
+	nvmet_pci_epf_clear_bar(nvme_epf);
+err_destroy_ctrl:
+	nvmet_pci_epf_destroy_ctrl(ctrl);
+	return err;
+}
+
+/*
+ * Endpoint controller deinitialization event: mark the link as down, destroy
+ * the controller, then release the DMA resources and clear BAR 0.
+ */
+static void nvmet_pci_epf_epc_deinit(struct pci_epf *epf)
+{
+	struct nvmet_pci_epf *nvme_epf = epf_get_drvdata(epf);
+
+	nvme_epf->ctrl.link_up = false;
+	nvmet_pci_epf_destroy_ctrl(&nvme_epf->ctrl);
+
+	nvmet_pci_epf_deinit_dma(nvme_epf);
+	nvmet_pci_epf_clear_bar(nvme_epf);
+}
+
+/* Link up event: mark the link as up and start the controller. */
+static int nvmet_pci_epf_link_up(struct pci_epf *epf)
+{
+	struct nvmet_pci_epf *nvme_epf = epf_get_drvdata(epf);
+
+	nvme_epf->ctrl.link_up = true;
+	nvmet_pci_epf_start_ctrl(&nvme_epf->ctrl);
+
+	return 0;
+}
+
+/* Link down event: mark the link as down and stop the controller. */
+static int nvmet_pci_epf_link_down(struct pci_epf *epf)
+{
+	struct nvmet_pci_epf *nvme_epf = epf_get_drvdata(epf);
+
+	nvme_epf->ctrl.link_up = false;
+	nvmet_pci_epf_stop_ctrl(&nvme_epf->ctrl);
+
+	return 0;
+}
+
+/* Endpoint controller events handled by the function driver. */
+static const struct pci_epc_event_ops nvmet_pci_epf_event_ops = {
+	.link_up	= nvmet_pci_epf_link_up,
+	.link_down	= nvmet_pci_epf_link_down,
+	.epc_init	= nvmet_pci_epf_epc_init,
+	.epc_deinit	= nvmet_pci_epf_epc_deinit,
+};
+
+/*
+ * Bind operation of the endpoint function: get the features of the endpoint
+ * controller, set up the register BAR and initialize DMA. Returns 0 on
+ * success or a negative error code.
+ */
+static int nvmet_pci_epf_bind(struct pci_epf *epf)
+{
+	struct nvmet_pci_epf *nvme_epf = epf_get_drvdata(epf);
+	struct pci_epc *epc = epf->epc;
+	int err;
+
+	if (WARN_ON_ONCE(!epc))
+		return -EINVAL;
+
+	nvme_epf->epc_features =
+		pci_epc_get_features(epc, epf->func_no, epf->vfunc_no);
+	if (!nvme_epf->epc_features) {
+		dev_err(&epf->dev, "epc_features not implemented\n");
+		return -EOPNOTSUPP;
+	}
+
+	err = nvmet_pci_epf_configure_bar(nvme_epf);
+	if (err)
+		return err;
+
+	nvmet_pci_epf_init_dma(nvme_epf);
+
+	return 0;
+}
+
+/*
+ * Unbind operation of the endpoint function: destroy the controller and
+ * free the register BAR. The DMA resources are released and BAR 0 is cleared
+ * only if the endpoint controller initialization completed.
+ */
+static void nvmet_pci_epf_unbind(struct pci_epf *epf)
+{
+	struct nvmet_pci_epf *nvme_epf = epf_get_drvdata(epf);
+	bool epc_initialized = epf->epc->init_complete;
+
+	nvmet_pci_epf_destroy_ctrl(&nvme_epf->ctrl);
+
+	if (epc_initialized) {
+		nvmet_pci_epf_deinit_dma(nvme_epf);
+		nvmet_pci_epf_clear_bar(nvme_epf);
+	}
+
+	nvmet_pci_epf_free_bar(nvme_epf);
+}
+
+/* PCI configuration header: a mass storage, NVM Express class device. */
+static struct pci_epf_header nvme_epf_pci_header = {
+	.vendorid	= PCI_ANY_ID,
+	.deviceid	= PCI_ANY_ID,
+	.baseclass_code = PCI_BASE_CLASS_STORAGE,
+	.subclass_code	= 0x08, /* Non-Volatile Memory controller */
+	.progif_code	= 0x02, /* NVM Express */
+	.interrupt_pin	= PCI_INTERRUPT_INTA,
+};
+
+/*
+ * Probe operation: allocate the driver data of the endpoint function @epf,
+ * with the default MDTS, and set the event operations and the PCI header of
+ * the function. @id is not used. Returns 0 on success or a negative error
+ * code.
+ */
+static int nvmet_pci_epf_probe(struct pci_epf *epf,
+			       const struct pci_epf_device_id *id)
+{
+	struct device *dev = &epf->dev;
+	struct nvmet_pci_epf *nvme_epf;
+	int err;
+
+	nvme_epf = devm_kzalloc(dev, sizeof(*nvme_epf), GFP_KERNEL);
+	if (!nvme_epf)
+		return -ENOMEM;
+
+	err = devm_mutex_init(dev, &nvme_epf->mmio_lock);
+	if (err)
+		return err;
+
+	nvme_epf->mdts_kb = NVMET_PCI_EPF_MDTS_KB;
+	nvme_epf->epf = epf;
+
+	epf->header = &nvme_epf_pci_header;
+	epf->event_ops = &nvmet_pci_epf_event_ops;
+	epf_set_drvdata(epf, nvme_epf);
+
+	return 0;
+}
+
+#define to_nvme_epf(epf_group)	\
+	container_of(epf_group, struct nvmet_pci_epf, group)
+
+/* Show the port ID attribute, in CPU endianness. */
+static ssize_t nvmet_pci_epf_portid_show(struct config_item *item, char *page)
+{
+	struct nvmet_pci_epf *nvme_epf = to_nvme_epf(to_config_group(item));
+	u16 portid = le16_to_cpu(nvme_epf->portid);
+
+	return sysfs_emit(page, "%u\n", portid);
+}
+
+/*
+ * Set the port ID attribute. Returns @len on success, -EBUSY if the target
+ * controller already exists and -EINVAL if the value is not a valid 16-bit
+ * unsigned integer.
+ */
+static ssize_t nvmet_pci_epf_portid_store(struct config_item *item,
+					  const char *page, size_t len)
+{
+	struct nvmet_pci_epf *nvme_epf = to_nvme_epf(to_config_group(item));
+	u16 val;
+
+	/* Do not allow setting this when the function is already started. */
+	if (nvme_epf->ctrl.tctrl)
+		return -EBUSY;
+
+	if (!len || kstrtou16(page, 0, &val))
+		return -EINVAL;
+
+	/* The port ID is kept in little endian format. */
+	nvme_epf->portid = cpu_to_le16(val);
+
+	return len;
+}
+
+CONFIGFS_ATTR(nvmet_pci_epf_, portid);
+
+/* Show the subsystem NQN attribute. */
+static ssize_t nvmet_pci_epf_subsysnqn_show(struct config_item *item,
+					    char *page)
+{
+	struct nvmet_pci_epf *nvme_epf = to_nvme_epf(to_config_group(item));
+	const char *nqn = nvme_epf->subsysnqn;
+
+	return sysfs_emit(page, "%s\n", nqn);
+}
+
+/*
+ * Set the subsystem NQN attribute from the @len bytes of @page, ignoring a
+ * trailing newline if there is one.
+ *
+ * Returns @len on success, -EBUSY if the target controller already exists
+ * and -EINVAL if the input is empty or if the NQN does not fit in the
+ * subsystem NQN buffer.
+ */
+static ssize_t nvmet_pci_epf_subsysnqn_store(struct config_item *item,
+					     const char *page, size_t len)
+{
+	struct config_group *group = to_config_group(item);
+	struct nvmet_pci_epf *nvme_epf = to_nvme_epf(group);
+	size_t nqn_len = len;
+
+	/* Do not allow setting this when the function is already started. */
+	if (nvme_epf->ctrl.tctrl)
+		return -EBUSY;
+
+	if (!len)
+		return -EINVAL;
+
+	/* The trailing newline, if any, is not part of the NQN. */
+	if (page[nqn_len - 1] == '\n')
+		nqn_len--;
+
+	/*
+	 * The copy must be bounded by the size of the destination buffer, not
+	 * by the size of the input, which is chosen by the user.
+	 */
+	if (nqn_len >= sizeof(nvme_epf->subsysnqn))
+		return -EINVAL;
+
+	/* strscpy() copies at most nqn_len characters and adds the NUL. */
+	strscpy(nvme_epf->subsysnqn, page, nqn_len + 1);
+
+	return len;
+}
+
+CONFIGFS_ATTR(nvmet_pci_epf_, subsysnqn);
+
+static ssize_t nvmet_pci_epf_mdts_kb_show(struct config_item *item, char *page)
+{
+	struct config_group *group = to_config_group(item);
+	struct nvmet_pci_epf *nvme_epf = to_nvme_epf(group);
+
+	return sysfs_emit(page, "%u\n", nvme_epf->mdts_kb);
+}
+
+static ssize_t nvmet_pci_epf_mdts_kb_store(struct config_item *item,
+					   const char *page, size_t len)
+{
+	struct config_group *group = to_config_group(item);
+	struct nvmet_pci_epf *nvme_epf = to_nvme_epf(group);
+	unsigned long mdts_kb;
+	int ret;
+
+	if (nvme_epf->ctrl.tctrl)
+		return -EBUSY;
+
+	ret = kstrtoul(page, 0, &mdts_kb);
+	if (ret)
+		return ret;
+	if (!mdts_kb)
+		mdts_kb = NVMET_PCI_EPF_MDTS_KB;
+	else if (mdts_kb > NVMET_PCI_EPF_MAX_MDTS_KB)
+		mdts_kb = NVMET_PCI_EPF_MAX_MDTS_KB;
+
+	if (!is_power_of_2(mdts_kb))
+		return -EINVAL;
+
+	nvme_epf->mdts_kb = mdts_kb;
+
+	return len;
+}
+
+CONFIGFS_ATTR(nvmet_pci_epf_, mdts_kb);
+
+static struct configfs_attribute *nvmet_pci_epf_attrs[] = {
+	&nvmet_pci_epf_attr_portid,
+	&nvmet_pci_epf_attr_subsysnqn,
+	&nvmet_pci_epf_attr_mdts_kb,
+	NULL,
+};
+
+static const struct config_item_type nvmet_pci_epf_group_type = {
+	.ct_attrs	= nvmet_pci_epf_attrs,
+	.ct_owner	= THIS_MODULE,
+};
+
+static struct config_group *nvmet_pci_epf_add_cfs(struct pci_epf *epf,
+						  struct config_group *group)
+{
+	struct nvmet_pci_epf *nvme_epf = epf_get_drvdata(epf);
+
+	config_group_init_type_name(&nvme_epf->group, "nvme",
+				    &nvmet_pci_epf_group_type);
+
+	return &nvme_epf->group;
+}
+
+static const struct pci_epf_device_id nvmet_pci_epf_ids[] = {
+	{ .name = "nvmet_pci_epf" },
+	{},
+};
+
+static struct pci_epf_ops nvmet_pci_epf_ops = {
+	.bind	= nvmet_pci_epf_bind,
+	.unbind	= nvmet_pci_epf_unbind,
+	.add_cfs = nvmet_pci_epf_add_cfs,
+};
+
+static struct pci_epf_driver nvmet_pci_epf_driver = {
+	.driver.name	= "nvmet_pci_epf",
+	.probe		= nvmet_pci_epf_probe,
+	.id_table	= nvmet_pci_epf_ids,
+	.ops		= &nvmet_pci_epf_ops,
+	.owner		= THIS_MODULE,
+};
+
+static int __init nvmet_pci_epf_init_module(void)
+{
+	int ret;
+
+	ret = pci_epf_register_driver(&nvmet_pci_epf_driver);
+	if (ret)
+		return ret;
+
+	ret = nvmet_register_transport(&nvmet_pci_epf_fabrics_ops);
+	if (ret) {
+		pci_epf_unregister_driver(&nvmet_pci_epf_driver);
+		return ret;
+	}
+
+	return 0;
+}
+
+static void __exit nvmet_pci_epf_cleanup_module(void)
+{
+	nvmet_unregister_transport(&nvmet_pci_epf_fabrics_ops);
+	pci_epf_unregister_driver(&nvmet_pci_epf_driver);
+}
+
+module_init(nvmet_pci_epf_init_module);
+module_exit(nvmet_pci_epf_cleanup_module);
+
+MODULE_DESCRIPTION("NVMe PCI Endpoint Function target driver");
+MODULE_AUTHOR("Damien Le Moal <dlemoal@kernel.org>");
+MODULE_LICENSE("GPL");
-- 
2.50.1


From 002ec8f1c69d3722a033eaf45102ba747ae80e94 Mon Sep 17 00:00:00 2001
From: Damien Le Moal <dlemoal@kernel.org>
Date: Sat, 4 Jan 2025 13:59:51 +0900
Subject: [PATCH 07/16] Documentation: Document the NVMe PCI endpoint target
 driver

Add a documentation file
(Documentation/nvme/nvme-pci-endpoint-target.rst) for the new NVMe PCI
endpoint target driver. This provides an overview of the driver
requirements, capabilities and limitations. A user guide describing how
to setup a NVMe PCI endpoint device using this driver is also provided.

This document is made accessible also from the PCI endpoint
documentation using a link. Furthermore, since the existing nvme
documentation was not accessible from the top documentation index, an
index file is added to Documentation/nvme and this index listed as
"NVMe Subsystem" in the "Storage interfaces" section of the subsystem
API index.

Signed-off-by: Damien Le Moal <dlemoal@kernel.org>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Acked-by: Bjorn Helgaas <bhelgaas@google.com>
Reviewed-by: Manivannan Sadhasivam <manivannan.sadhasivam@linaro.org>
Signed-off-by: Keith Busch <kbusch@kernel.org>
---
 Documentation/PCI/endpoint/index.rst          |   1 +
 .../PCI/endpoint/pci-nvme-function.rst        |  13 +
 Documentation/nvme/index.rst                  |  12 +
 .../nvme/nvme-pci-endpoint-target.rst         | 368 ++++++++++++++++++
 Documentation/subsystem-apis.rst              |   1 +
 5 files changed, 395 insertions(+)
 create mode 100644 Documentation/PCI/endpoint/pci-nvme-function.rst
 create mode 100644 Documentation/nvme/index.rst
 create mode 100644 Documentation/nvme/nvme-pci-endpoint-target.rst

diff --git a/Documentation/PCI/endpoint/index.rst b/Documentation/PCI/endpoint/index.rst
index 4d2333e7ae06..dd1f62e731c9 100644
--- a/Documentation/PCI/endpoint/index.rst
+++ b/Documentation/PCI/endpoint/index.rst
@@ -15,6 +15,7 @@ PCI Endpoint Framework
    pci-ntb-howto
    pci-vntb-function
    pci-vntb-howto
+   pci-nvme-function
 
    function/binding/pci-test
    function/binding/pci-ntb
diff --git a/Documentation/PCI/endpoint/pci-nvme-function.rst b/Documentation/PCI/endpoint/pci-nvme-function.rst
new file mode 100644
index 000000000000..df57b8e7d066
--- /dev/null
+++ b/Documentation/PCI/endpoint/pci-nvme-function.rst
@@ -0,0 +1,13 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+=================
+PCI NVMe Function
+=================
+
+:Author: Damien Le Moal <dlemoal@kernel.org>
+
+The PCI NVMe endpoint function implements a PCI NVMe controller using the NVMe
+subsystem target core code. The driver for this function resides with the NVMe
+subsystem as drivers/nvme/target/nvmet-pciep.c.
+
+See Documentation/nvme/nvme-pci-endpoint-target.rst for more details.
diff --git a/Documentation/nvme/index.rst b/Documentation/nvme/index.rst
new file mode 100644
index 000000000000..13383c760cc7
--- /dev/null
+++ b/Documentation/nvme/index.rst
@@ -0,0 +1,12 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+==============
+NVMe Subsystem
+==============
+
+.. toctree::
+   :maxdepth: 2
+   :numbered:
+
+   feature-and-quirk-policy
+   nvme-pci-endpoint-target
diff --git a/Documentation/nvme/nvme-pci-endpoint-target.rst b/Documentation/nvme/nvme-pci-endpoint-target.rst
new file mode 100644
index 000000000000..66e7b7d869b4
--- /dev/null
+++ b/Documentation/nvme/nvme-pci-endpoint-target.rst
@@ -0,0 +1,368 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+=================================
+NVMe PCI Endpoint Function Target
+=================================
+
+:Author: Damien Le Moal <dlemoal@kernel.org>
+
+The NVMe PCI endpoint function target driver implements a NVMe PCIe controller
+using a NVMe fabrics target controller configured with the PCI transport type.
+
+Overview
+========
+
+The NVMe PCI endpoint function target driver allows exposing a NVMe target
+controller over a PCIe link, thus implementing an NVMe PCIe device similar to a
+regular M.2 SSD. The target controller is created in the same manner as when
+using NVMe over fabrics: the controller represents the interface to an NVMe
+subsystem using a port. The port transport type must be configured to be
+"pci". The subsystem can be configured to have namespaces backed by regular
+files or block devices, or can use NVMe passthrough to expose to the PCI host an
+existing physical NVMe device or a NVMe fabrics host controller (e.g. a NVMe TCP
+host controller).
+
+The NVMe PCI endpoint function target driver relies as much as possible on the
+NVMe target core code to parse and execute NVMe commands submitted by the PCIe
+host. However, using the PCI endpoint framework API and DMA API, the driver is
+also responsible for managing all data transfers over the PCIe link. This
+implies that the NVMe PCI endpoint function target driver implements the
+management of several NVMe data structures and some NVMe command parsing.
+
+1) The driver manages retrieval of NVMe commands in submission queues using DMA
+   if supported, or MMIO otherwise. Each command retrieved is then executed
+   using a work item to maximize performance with the parallel execution of
+   multiple commands on different CPUs. The driver uses a work item to
+   constantly poll the doorbell of all submission queues to detect command
+   submissions from the PCIe host.
+
+2) The driver transfers completion queue entries of completed commands to the
+   PCIe host using MMIO copy of the entries in the host completion queue.
+   After posting completion entries in a completion queue, the driver uses the
+   PCI endpoint framework API to raise an interrupt to the host to signal the
+   commands completion.
+
+3) For any command that has a data buffer, the NVMe PCI endpoint target driver
+   parses the command PRPs or SGLs lists to create a list of PCI address
+   segments representing the mapping of the command data buffer on the host.
+   The command data buffer is transferred over the PCIe link using this list of
+   PCI address segments using DMA, if supported. If DMA is not supported, MMIO
+   is used, which results in poor performance. For write commands, the command
+   data buffer is transferred from the host into a local memory buffer before
+   executing the command using the target core code. For read commands, a local
+   memory buffer is allocated to execute the command and the content of that
+   buffer is transferred to the host once the command completes.
+
+Controller Capabilities
+-----------------------
+
+The NVMe capabilities exposed to the PCIe host through the BAR 0 registers
+are almost identical to the capabilities of the NVMe target controller
+implemented by the target core code. There are some exceptions.
+
+1) The NVMe PCI endpoint target driver always sets the controller capability
+   CQR bit to request "Contiguous Queues Required". This is to facilitate the
+   mapping of a queue PCI address range to the local CPU address space.
+
+2) The doorbell stride (DSTRB) is always set to be 4B.
+
+3) Since the PCI endpoint framework does not provide a way to handle PCI level
+   resets, the controller capability NSSR bit (NVM Subsystem Reset Supported)
+   is always cleared.
+
+4) The boot partition support (BPS), Persistent Memory Region Supported (PMRS)
+   and Controller Memory Buffer Supported (CMBS) capabilities are never
+   reported.
+
+Supported Features
+------------------
+
+The NVMe PCI endpoint target driver implements support for both PRPs and SGLs.
+The driver also implements IRQ vector coalescing and submission queue
+arbitration burst.
+
+The maximum number of queues and the maximum data transfer size (MDTS) are
+configurable through configfs before starting the controller. To avoid issues
+with excessive local memory usage for executing commands, MDTS defaults to 512
+KB and is limited to a maximum of 2 MB (arbitrary limit).
+
+Minimum Number of PCI Address Mapping Windows Required
+------------------------------------------------------
+
+Most PCI endpoint controllers provide a limited number of mapping windows for
+mapping a PCI address range to local CPU memory addresses. The NVMe PCI
+endpoint target controller uses mapping windows for the following.
+
+1) One memory window for raising MSI or MSI-X interrupts
+2) One memory window for MMIO transfers
+3) One memory window for each completion queue
+
+Given the highly asynchronous nature of the NVMe PCI endpoint target driver
+operation, the memory windows as described above will generally not be used
+simultaneously, but that may happen. So a safe maximum number of completion
+queues that can be supported is equal to the total number of memory mapping
+windows of the PCI endpoint controller minus two. E.g. for an endpoint PCI
+controller with 32 outbound memory windows available, up to 30 completion
+queues can be safely operated without any risk of getting PCI address mapping
+errors due to the lack of memory windows.
+
+Maximum Number of Queue Pairs
+-----------------------------
+
+Upon binding of the NVMe PCI endpoint target driver to the PCI endpoint
+controller, BAR 0 is allocated with enough space to accommodate the admin queue
+and multiple I/O queues. The maximum number of I/O queue pairs that can be
+supported is limited by several factors.
+
+1) The NVMe target core code limits the maximum number of I/O queues to the
+   number of online CPUs.
+2) The total number of queue pairs, including the admin queue, cannot exceed
+   the number of MSI-X or MSI vectors available.
+3) The total number of completion queues must not exceed the total number of
+   PCI mapping windows minus 2 (see above).
+
+The NVMe endpoint function driver allows configuring the maximum number of
+queue pairs through configfs.
+
+Limitations and NVMe Specification Non-Compliance
+-------------------------------------------------
+
+Similar to the NVMe target core code, the NVMe PCI endpoint target driver does
+not support multiple submission queues using the same completion queue. All
+submission queues must specify a unique completion queue.
+
+
+User Guide
+==========
+
+This section describes the hardware requirements and how to setup an NVMe PCI
+endpoint target device.
+
+Kernel Requirements
+-------------------
+
+The kernel must be compiled with the configuration options CONFIG_PCI_ENDPOINT,
+CONFIG_PCI_ENDPOINT_CONFIGFS, and CONFIG_NVME_TARGET_PCI_EPF enabled.
+CONFIG_PCI, CONFIG_BLK_DEV_NVME and CONFIG_NVME_TARGET must also be enabled
+(obviously).
+
+In addition to this, at least one PCI endpoint controller driver should be
+available for the endpoint hardware used.
+
+To facilitate testing, enabling the null-blk driver (CONFIG_BLK_DEV_NULL_BLK)
+is also recommended. With this, a simple setup using a null_blk block device
+as a subsystem namespace can be used.
+
+Hardware Requirements
+---------------------
+
+To use the NVMe PCI endpoint target driver, at least one endpoint controller
+device is required.
+
+To find the list of endpoint controller devices in the system::
+
+       # ls /sys/class/pci_epc/
+        a40000000.pcie-ep
+
+If PCI_ENDPOINT_CONFIGFS is enabled::
+
+       # ls /sys/kernel/config/pci_ep/controllers
+        a40000000.pcie-ep
+
+The endpoint board must of course also be connected to a host with a PCI cable
+with RX-TX signal swapped. If the host PCI slot used does not have
+plug-and-play capabilities, the host should be powered off when the NVMe PCI
+endpoint device is configured.
+
+NVMe Endpoint Device
+--------------------
+
+Creating an NVMe endpoint device is a two step process. First, an NVMe target
+subsystem and port must be defined. Second, the NVMe PCI endpoint device must
+be setup and bound to the subsystem and port created.
+
+Creating a NVMe Subsystem and Port
+----------------------------------
+
+Details about how to configure a NVMe target subsystem and port are outside the
+scope of this document. The following only provides a simple example of a port
+and subsystem with a single namespace backed by a null_blk device.
+
+First, make sure that configfs is enabled::
+
+       # mount -t configfs none /sys/kernel/config
+
+Next, create a null_blk device (default settings give a 250 GB device without
+memory backing). The block device created will be /dev/nullb0 by default::
+
+        # modprobe null_blk
+        # ls /dev/nullb0
+        /dev/nullb0
+
+The NVMe PCI endpoint function target driver must be loaded::
+
+        # modprobe nvmet_pci_epf
+        # lsmod | grep nvmet
+        nvmet_pci_epf          32768  0
+        nvmet                 118784  1 nvmet_pci_epf
+        nvme_core             131072  2 nvmet_pci_epf,nvmet
+
+Now, create a subsystem and a port that we will use to create a PCI target
+controller when setting up the NVMe PCI endpoint target device. In this
+example, the port is created with a maximum of 4 I/O queue pairs::
+
+        # cd /sys/kernel/config/nvmet/subsystems
+        # mkdir nvmepf.0.nqn
+        # echo -n "Linux-pci-epf" > nvmepf.0.nqn/attr_model
+        # echo "0x1b96" > nvmepf.0.nqn/attr_vendor_id
+        # echo "0x1b96" > nvmepf.0.nqn/attr_subsys_vendor_id
+        # echo 1 > nvmepf.0.nqn/attr_allow_any_host
+        # echo 4 > nvmepf.0.nqn/attr_qid_max
+
+Next, create and enable the subsystem namespace using the null_blk block
+device::
+
+        # mkdir nvmepf.0.nqn/namespaces/1
+        # echo -n "/dev/nullb0" > nvmepf.0.nqn/namespaces/1/device_path
+        # echo 1 > "nvmepf.0.nqn/namespaces/1/enable"
+
+Finally, create the target port and link it to the subsystem::
+
+        # cd /sys/kernel/config/nvmet/ports
+        # mkdir 1
+        # echo -n "pci" > 1/addr_trtype
+        # ln -s /sys/kernel/config/nvmet/subsystems/nvmepf.0.nqn \
+                /sys/kernel/config/nvmet/ports/1/subsystems/nvmepf.0.nqn
+
+Creating a NVMe PCI Endpoint Device
+-----------------------------------
+
+With the NVMe target subsystem and port ready for use, the NVMe PCI endpoint
+device can now be created and enabled. The NVMe PCI endpoint target driver
+should already be loaded (that is done automatically when the port is created)::
+
+        # ls /sys/kernel/config/pci_ep/functions
+        nvmet_pci_epf
+
+Next, create function 0::
+
+        # cd /sys/kernel/config/pci_ep/functions/nvmet_pci_epf
+        # mkdir nvmepf.0
+        # ls nvmepf.0/
+        baseclass_code    msix_interrupts   secondary
+        cache_line_size   nvme              subclass_code
+        deviceid          primary           subsys_id
+        interrupt_pin     progif_code       subsys_vendor_id
+        msi_interrupts    revid             vendorid
+
+Configure the function using any device ID (the vendor ID for the device will
+be automatically set to the same value as the NVMe target subsystem vendor
+ID)::
+
+        # cd /sys/kernel/config/pci_ep/functions/nvmet_pci_epf
+        # echo 0xBEEF > nvmepf.0/deviceid
+        # echo 32 > nvmepf.0/msix_interrupts
+
+If the PCI endpoint controller used does not support MSI-X, MSI can be
+configured instead::
+
+        # echo 32 > nvmepf.0/msi_interrupts
+
+Next, let's bind our endpoint device with the target subsystem and port that we
+created::
+
+        # echo 1 > nvmepf.0/nvme/portid
+        # echo "nvmepf.0.nqn" > nvmepf.0/nvme/subsysnqn
+
+The endpoint function can then be bound to the endpoint controller and the
+controller started::
+
+        # cd /sys/kernel/config/pci_ep
+        # ln -s functions/nvmet_pci_epf/nvmepf.0 controllers/a40000000.pcie-ep/
+        # echo 1 > controllers/a40000000.pcie-ep/start
+
+On the endpoint machine, kernel messages will show information as the NVMe
+target device and endpoint device are created and connected.
+
+.. code-block:: text
+
+        null_blk: disk nullb0 created
+        null_blk: module loaded
+        nvmet: adding nsid 1 to subsystem nvmepf.0.nqn
+        nvmet_pci_epf nvmet_pci_epf.0: PCI endpoint controller supports MSI-X, 32 vectors
+        nvmet: Created nvm controller 1 for subsystem nvmepf.0.nqn for NQN nqn.2014-08.org.nvmexpress:uuid:2ab90791-2246-4fbb-961d-4c3d5a5a0176.
+        nvmet_pci_epf nvmet_pci_epf.0: New PCI ctrl "nvmepf.0.nqn", 4 I/O queues, mdts 524288 B
+
+PCI Root-Complex Host
+---------------------
+
+Booting the PCI host will result in the initialization of the PCIe link (this
+may be signaled by the PCI endpoint driver with a kernel message). A kernel
+message on the endpoint will also signal when the host NVMe driver enables the
+device controller::
+
+        nvmet_pci_epf nvmet_pci_epf.0: Enabling controller
+
+On the host side, the NVMe PCI endpoint function target device is
+discoverable as a PCI device, with the vendor ID and device ID as configured::
+
+        # lspci -n
+        0000:01:00.0 0108: 1b96:beef
+
+And this device will be recognized as an NVMe device with a single namespace::
+
+        # lsblk
+        NAME        MAJ:MIN RM   SIZE RO TYPE MOUNTPOINTS
+        nvme0n1     259:0    0   250G  0 disk
+
+The NVMe endpoint block device can then be used as any other regular NVMe
+namespace block device. The *nvme* command line utility can be used to get more
+detailed information about the endpoint device::
+
+        # nvme id-ctrl /dev/nvme0
+        NVME Identify Controller:
+        vid       : 0x1b96
+        ssvid     : 0x1b96
+        sn        : 94993c85650ef7bcd625
+        mn        : Linux-pci-epf
+        fr        : 6.13.0-r
+        rab       : 6
+        ieee      : 000000
+        cmic      : 0xb
+        mdts      : 7
+        cntlid    : 0x1
+        ver       : 0x20100
+        ...
+
+
+Endpoint Bindings
+=================
+
+The NVMe PCI endpoint target driver uses the PCI endpoint configfs device
+attributes as follows.
+
+================   ===========================================================
+vendorid           Ignored (the vendor id of the NVMe target subsystem is used)
+deviceid           Anything is OK (e.g. PCI_ANY_ID)
+revid              Do not care
+progif_code        Must be 0x02 (NVM Express)
+baseclass_code     Must be 0x01 (PCI_BASE_CLASS_STORAGE)
+subclass_code      Must be 0x08 (Non-Volatile Memory controller)
+cache_line_size    Do not care
+subsys_vendor_id   Ignored (the subsystem vendor id of the NVMe target subsystem
+		   is used)
+subsys_id          Anything is OK (e.g. PCI_ANY_ID)
+msi_interrupts     At least equal to the number of queue pairs desired
+msix_interrupts    At least equal to the number of queue pairs desired
+interrupt_pin      Interrupt PIN to use if MSI and MSI-X are not supported
+================   ===========================================================
+
+The NVMe PCI endpoint target function also has some specific configurable
+fields defined in the *nvme* subdirectory of the function directory. These
+fields are as follows.
+
+================   ===========================================================
+mdts_kb            Maximum data transfer size in KiB (default: 512)
+portid             The ID of the target port to use
+subsysnqn          The NQN of the target subsystem to use
+================   ===========================================================
diff --git a/Documentation/subsystem-apis.rst b/Documentation/subsystem-apis.rst
index 74af50d2ef7f..b52ad5b969d4 100644
--- a/Documentation/subsystem-apis.rst
+++ b/Documentation/subsystem-apis.rst
@@ -60,6 +60,7 @@ Storage interfaces
    cdrom/index
    scsi/index
    target/index
+   nvme/index
 
 Other subsystems
 ----------------
-- 
2.50.1


From e4a0a3058de85bc623f1ba90eec68f239d0a11b2 Mon Sep 17 00:00:00 2001
From: Baruch Siach <baruch@tkos.co.il>
Date: Sun, 8 Dec 2024 13:34:32 +0200
Subject: [PATCH 08/16] nvme-pci: fix comment typo

envent -> event.

Signed-off-by: Baruch Siach <baruch@tkos.co.il>
Reviewed-by: Sagi Grimberg <sagi@grimberg.me>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Chaitanya Kulkarni <kch@nvidia.com>
Signed-off-by: Keith Busch <kbusch@kernel.org>
---
 drivers/nvme/host/pci.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index 57e8e32c4529..c3bfbe11ee57 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -372,7 +372,7 @@ static bool nvme_dbbuf_update_and_check_event(u16 value, __le32 *dbbuf_db,
 		/*
 		 * Ensure that the doorbell is updated before reading the event
 		 * index from memory.  The controller needs to provide similar
-		 * ordering to ensure the envent index is updated before reading
+		 * ordering to ensure the event index is updated before reading
 		 * the doorbell.
 		 */
 		mb();
-- 
2.50.1


From d4a95adeabc6b5a39405e49c6d5ed14dd83682c4 Mon Sep 17 00:00:00 2001
From: Keisuke Nishimura <keisuke.nishimura@inria.fr>
Date: Mon, 16 Dec 2024 16:27:20 +0100
Subject: [PATCH 09/16] nvme: Add error path for xa_store in nvme_init_effects

The xa_store() may fail due to memory allocation failure because there
is no guarantee that the index NVME_CSI_NVM is already used. This fix
introduces a new function to handle the error path.

Fixes: cc115cbe12d9 ("nvme: always initialize known command effects")
Signed-off-by: Keisuke Nishimura <keisuke.nishimura@inria.fr>
Reviewed-by: Sagi Grimberg <sagi@grimberg.me>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Keith Busch <kbusch@kernel.org>
---
 drivers/nvme/host/core.c | 26 ++++++++++++++++++++++----
 1 file changed, 22 insertions(+), 4 deletions(-)

diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index 4bdd5144af7c..2a0555856795 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -3175,6 +3175,25 @@ free_data:
 	return ret;
 }
 
+static int nvme_init_effects_log(struct nvme_ctrl *ctrl,
+		u8 csi, struct nvme_effects_log **log)
+{
+	struct nvme_effects_log *effects, *old;
+
+	effects = kzalloc(sizeof(*effects), GFP_KERNEL);
+	if (!effects)	/* fail only when the allocation itself failed */
+		return -ENOMEM;
+
+	old = xa_store(&ctrl->cels, csi, effects, GFP_KERNEL);
+	if (xa_is_err(old)) {
+		kfree(effects);	/* not stored in ctrl->cels: free it here */
+		return xa_err(old);
+	}
+
+	*log = effects;	/* return the log now stored at index csi */
+	return 0;
+}
+
 static void nvme_init_known_nvm_effects(struct nvme_ctrl *ctrl)
 {
 	struct nvme_effects_log	*log = ctrl->effects;
@@ -3221,10 +3240,9 @@ static int nvme_init_effects(struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id)
 	}
 
 	if (!ctrl->effects) {
-		ctrl->effects = kzalloc(sizeof(*ctrl->effects), GFP_KERNEL);
-		if (!ctrl->effects)
-			return -ENOMEM;
-		xa_store(&ctrl->cels, NVME_CSI_NVM, ctrl->effects, GFP_KERNEL);
+		ret = nvme_init_effects_log(ctrl, NVME_CSI_NVM, &ctrl->effects);
+		if (ret < 0)
+			return ret;
 	}
 
 	nvme_init_known_nvm_effects(ctrl);
-- 
2.50.1


From 4a324970fabad503260973cd588609f3a26baab9 Mon Sep 17 00:00:00 2001
From: Francis Pravin <francis.p@samsung.com>
Date: Fri, 10 Jan 2025 05:21:37 +0530
Subject: [PATCH 10/16] nvme-pci: use correct size to free the hmb buffer

dev->host_mem_size value is updated only after the successful buffer
allocation of hmb descriptor. Otherwise, it may have some undefined value.
So, use the correct size to free the hmb buffer when the hmb descriptor
buffer allocation failed.

Signed-off-by: Francis Pravin <francis.p@samsung.com>
Reviewed-by: Sagi Grimberg <sagi@grimberg.me>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Keith Busch <kbusch@kernel.org>
---
 drivers/nvme/host/pci.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index c3bfbe11ee57..fe0795e16e25 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -2085,8 +2085,8 @@ static int nvme_alloc_host_mem_single(struct nvme_dev *dev, u64 size)
 			sizeof(*dev->host_mem_descs), &dev->host_mem_descs_dma,
 			GFP_KERNEL);
 	if (!dev->host_mem_descs) {
-		dma_free_noncontiguous(dev->dev, dev->host_mem_size,
-				dev->hmb_sgt, DMA_BIDIRECTIONAL);
+		dma_free_noncontiguous(dev->dev, size, dev->hmb_sgt,
+				DMA_BIDIRECTIONAL);
 		dev->hmb_sgt = NULL;
 		return -ENOMEM;
 	}
-- 
2.50.1


From 7c0be4ead1f8f5f8be0803f347de0de81e3b8e1c Mon Sep 17 00:00:00 2001
From: Ming Lei <tom.leiming@gmail.com>
Date: Mon, 13 Jan 2025 09:58:33 +0800
Subject: [PATCH 11/16] block: mark GFP_NOIO around sysfs ->store()
MIME-Version: 1.0
Content-Type: text/plain; charset=utf8
Content-Transfer-Encoding: 8bit

sysfs ->store is called with queue freezed, meantime we have several
->store() callbacks(update_nr_requests, wbt, scheduler) to allocate
memory with GFP_KERNEL which may run into direct reclaim code path,
then potential deadlock can be caused.

Fix the issue by marking NOIO around sysfs ->store()

Reported-by: Thomas Hellström <thomas.hellstrom@linux.intel.com>
Cc: stable@vger.kernel.org
Signed-off-by: Ming Lei <ming.lei@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: John Garry <john.g.garry@oracle.com>
Link: https://lore.kernel.org/r/20250113015833.698458-1-ming.lei@redhat.com
Link: https://lore.kernel.org/linux-block/Z4RkemI9f6N5zoEF@fedora/T/#mc774c65eeca5c024d29695f9ac6152b87763f305
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-sysfs.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index e828be777206..e09b455874bf 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -681,6 +681,7 @@ queue_attr_store(struct kobject *kobj, struct attribute *attr,
 	struct queue_sysfs_entry *entry = to_queue(attr);
 	struct gendisk *disk = container_of(kobj, struct gendisk, queue_kobj);
 	struct request_queue *q = disk->queue;
+	unsigned int noio_flag;
 	ssize_t res;
 
 	if (!entry->store_limit && !entry->store)
@@ -711,7 +712,9 @@ queue_attr_store(struct kobject *kobj, struct attribute *attr,
 
 	mutex_lock(&q->sysfs_lock);
 	blk_mq_freeze_queue(q);
+	noio_flag = memalloc_noio_save();
 	res = entry->store(disk, page, length);
+	memalloc_noio_restore(noio_flag);
 	blk_mq_unfreeze_queue(q);
 	mutex_unlock(&q->sysfs_lock);
 	return res;
-- 
2.50.1


From 8337b029f788272f5273887ccefb8226404658ce Mon Sep 17 00:00:00 2001
From: Ming Lei <ming.lei@redhat.com>
Date: Tue, 29 Oct 2024 09:19:41 +0800
Subject: [PATCH 12/16] nbd: fix partial sending

nbd driver sends request header and payload with multiple call of
sock_sendmsg, and partial sending can't be avoided. However, nbd driver
returns BLK_STS_RESOURCE to block core in this situation. This way causes
one issue: request->tag may change in the next run of nbd_queue_rq(), but
the original old tag has been sent as part of header cookie, this way
confuses nbd driver reply handling, since the real request can't be
retrieved any more with the obsolete old tag.

Fix it by retrying sending directly in per-socket work function,
meantime return BLK_STS_OK to block layer core.

Cc: vincent.chen@sifive.com
Cc: Leon Schuermann <leon@is.currently.online>
Cc: Bart Van Assche <bvanassche@acm.org>
Reported-by: Kevin Wolf <kwolf@redhat.com>
Signed-off-by: Ming Lei <ming.lei@redhat.com>
Tested-by: Kevin Wolf <kwolf@redhat.com>
Reviewed-by: Kevin Wolf <kwolf@redhat.com>
Link: https://lore.kernel.org/r/20241029011941.153037-1-ming.lei@redhat.com
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/block/nbd.c | 95 ++++++++++++++++++++++++++++++++++++++++-----
 1 file changed, 85 insertions(+), 10 deletions(-)

diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c
index efa05c3c06bf..b63a0f29a54a 100644
--- a/drivers/block/nbd.c
+++ b/drivers/block/nbd.c
@@ -62,6 +62,7 @@ struct nbd_sock {
 	bool dead;
 	int fallback_index;
 	int cookie;
+	struct work_struct work;
 };
 
 struct recv_thread_args {
@@ -141,6 +142,9 @@ struct nbd_device {
  */
 #define NBD_CMD_INFLIGHT	2
 
+/* Just part of request header or data payload is sent successfully */
+#define NBD_CMD_PARTIAL_SEND	3
+
 struct nbd_cmd {
 	struct nbd_device *nbd;
 	struct mutex lock;
@@ -453,6 +457,12 @@ static enum blk_eh_timer_return nbd_xmit_timeout(struct request *req)
 	if (!mutex_trylock(&cmd->lock))
 		return BLK_EH_RESET_TIMER;
 
+	/* partial send is handled in nbd_sock's work function */
+	if (test_bit(NBD_CMD_PARTIAL_SEND, &cmd->flags)) {
+		mutex_unlock(&cmd->lock);
+		return BLK_EH_RESET_TIMER;
+	}
+
 	if (!test_bit(NBD_CMD_INFLIGHT, &cmd->flags)) {
 		mutex_unlock(&cmd->lock);
 		return BLK_EH_DONE;
@@ -601,6 +611,30 @@ static inline int was_interrupted(int result)
 	return result == -ERESTARTSYS || result == -EINTR;
 }
 
+/*
+ * We've already sent header or part of data payload, have no choice but
+ * to set pending and schedule it in work.
+ *
+ * And we have to return BLK_STS_OK to block core, otherwise this same
+ * request may be re-dispatched with different tag, but our header has
+ * been sent out with old tag, and this way does confuse reply handling.
+ */
+static void nbd_sched_pending_work(struct nbd_device *nbd,
+				   struct nbd_sock *nsock,
+				   struct nbd_cmd *cmd, int sent)
+{
+	struct request *req = blk_mq_rq_from_pdu(cmd);
+
+	/* pending work should be scheduled only once */
+	WARN_ON_ONCE(test_bit(NBD_CMD_PARTIAL_SEND, &cmd->flags));
+
+	nsock->pending = req;
+	nsock->sent = sent;
+	set_bit(NBD_CMD_PARTIAL_SEND, &cmd->flags);
+	refcount_inc(&nbd->config_refs);
+	schedule_work(&nsock->work);
+}
+
 /*
  * Returns BLK_STS_RESOURCE if the caller should retry after a delay.
  * Returns BLK_STS_IOERR if sending failed.
@@ -686,8 +720,8 @@ static blk_status_t nbd_send_cmd(struct nbd_device *nbd, struct nbd_cmd *cmd,
 			 * completely done.
 			 */
 			if (sent) {
-				nsock->pending = req;
-				nsock->sent = sent;
+				nbd_sched_pending_work(nbd, nsock, cmd, sent);
+				return BLK_STS_OK;
 			}
 			set_bit(NBD_CMD_REQUEUED, &cmd->flags);
 			return BLK_STS_RESOURCE;
@@ -724,14 +758,8 @@ send_pages:
 			result = sock_xmit(nbd, index, 1, &from, flags, &sent);
 			if (result < 0) {
 				if (was_interrupted(result)) {
-					/* We've already sent the header, we
-					 * have no choice but to set pending and
-					 * return BUSY.
-					 */
-					nsock->pending = req;
-					nsock->sent = sent;
-					set_bit(NBD_CMD_REQUEUED, &cmd->flags);
-					return BLK_STS_RESOURCE;
+					nbd_sched_pending_work(nbd, nsock, cmd, sent);
+					return BLK_STS_OK;
 				}
 				dev_err(disk_to_dev(nbd->disk),
 					"Send data failed (result %d)\n",
@@ -757,6 +785,14 @@ out:
 	return BLK_STS_OK;
 
 requeue:
+	/*
+	 * Can't requeue in case we are dealing with partial send
+	 *
+	 * We must run from pending work function.
+	 * */
+	if (test_bit(NBD_CMD_PARTIAL_SEND, &cmd->flags))
+		return BLK_STS_OK;
+
 	/* retry on a different socket */
 	dev_err_ratelimited(disk_to_dev(nbd->disk),
 			    "Request send failed, requeueing\n");
@@ -765,6 +801,44 @@ requeue:
 	return BLK_STS_OK;
 }
 
+/* Work function for a partially sent request: retry the send with backoff. */
+static void nbd_pending_cmd_work(struct work_struct *work)
+{
+	struct nbd_sock *nsock = container_of(work, struct nbd_sock, work);
+	struct request *req = nsock->pending;
+	struct nbd_cmd *cmd = blk_mq_rq_to_pdu(req);
+	struct nbd_device *nbd = cmd->nbd;
+	unsigned long deadline = READ_ONCE(req->deadline);
+	unsigned int wait_ms = 2;
+
+	mutex_lock(&cmd->lock);
+
+	WARN_ON_ONCE(test_bit(NBD_CMD_REQUEUED, &cmd->flags));
+	if (WARN_ON_ONCE(!test_bit(NBD_CMD_PARTIAL_SEND, &cmd->flags)))
+		goto out;
+
+	mutex_lock(&nsock->tx_lock);
+	while (true) {
+		nbd_send_cmd(nbd, cmd, cmd->index);
+		if (!nsock->pending)
+			break;
+
+		/* fail the request here, ahead of the timeout; wrap-safe compare */
+		if (time_after_eq(READ_ONCE(jiffies) + msecs_to_jiffies(wait_ms), deadline)) {
+			cmd->status = BLK_STS_IOERR;
+			blk_mq_complete_request(req);
+			break;
+		}
+		msleep(wait_ms);
+		wait_ms *= 2;
+	}
+	mutex_unlock(&nsock->tx_lock);
+	clear_bit(NBD_CMD_PARTIAL_SEND, &cmd->flags);
+out:
+	mutex_unlock(&cmd->lock);
+	nbd_config_put(nbd);
+}
+
 static int nbd_read_reply(struct nbd_device *nbd, struct socket *sock,
 			  struct nbd_reply *reply)
 {
@@ -1211,6 +1285,7 @@ static int nbd_add_socket(struct nbd_device *nbd, unsigned long arg,
 	nsock->pending = NULL;
 	nsock->sent = 0;
 	nsock->cookie = 0;
+	INIT_WORK(&nsock->work, nbd_pending_cmd_work);
 	socks[config->num_connections++] = nsock;
 	atomic_inc(&config->live_connections);
 	blk_mq_unfreeze_queue(nbd->disk->queue);
-- 
2.50.1


From 4fa5c37012d71f6a39c4286ffabb9466f1728ba3 Mon Sep 17 00:00:00 2001
From: Randy Dunlap <rdunlap@infradead.org>
Date: Fri, 10 Jan 2025 22:27:36 -0800
Subject: [PATCH 13/16] blk-cgroup: fix kernel-doc warnings in header file

Correct the function parameters and function names to eliminate
kernel-doc warnings:

blk-cgroup.h:238: warning: Function parameter or struct member 'bio' not described in 'bio_issue_as_root_blkg'
blk-cgroup.h:248: warning: bad line:
blk-cgroup.h:279: warning: expecting prototype for blkg_to_pdata(). Prototype was for blkg_to_pd() instead
blk-cgroup.h:296: warning: expecting prototype for pdata_to_blkg(). Prototype was for pd_to_blkg() instead

Signed-off-by: Randy Dunlap <rdunlap@infradead.org>
Cc: Tejun Heo <tj@kernel.org>
Cc: Josef Bacik <josef@toxicpanda.com>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: cgroups@vger.kernel.org
Link: https://lore.kernel.org/r/20250111062736.910383-1-rdunlap@infradead.org
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-cgroup.h | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h
index b9e3265c1eb3..2c4663bd993a 100644
--- a/block/blk-cgroup.h
+++ b/block/blk-cgroup.h
@@ -225,7 +225,9 @@ void blkg_conf_exit(struct blkg_conf_ctx *ctx);
 
 /**
  * bio_issue_as_root_blkg - see if this bio needs to be issued as root blkg
- * @return: true if this bio needs to be submitted with the root blkg context.
+ * @bio: the target &bio
+ *
+ * Return: true if this bio needs to be submitted with the root blkg context.
  *
  * In order to avoid priority inversions we sometimes need to issue a bio as if
  * it were attached to the root blkg, and then backcharge to the actual owning
@@ -245,7 +247,7 @@ static inline bool bio_issue_as_root_blkg(struct bio *bio)
  * @q: request_queue of interest
  *
  * Lookup blkg for the @blkcg - @q pair.
-
+ *
  * Must be called in a RCU critical section.
  */
 static inline struct blkcg_gq *blkg_lookup(struct blkcg *blkcg,
@@ -268,7 +270,7 @@ static inline struct blkcg_gq *blkg_lookup(struct blkcg *blkcg,
 }
 
 /**
- * blkg_to_pdata - get policy private data
+ * blkg_to_pd - get policy private data
  * @blkg: blkg of interest
  * @pol: policy of interest
  *
@@ -287,7 +289,7 @@ static inline struct blkcg_policy_data *blkcg_to_cpd(struct blkcg *blkcg,
 }
 
 /**
- * pdata_to_blkg - get blkg associated with policy private data
+ * pd_to_blkg - get blkg associated with policy private data
  * @pd: policy private data of interest
  *
  * @pd is policy private data.  Determine the blkg it's associated with.
-- 
2.50.1


From f403034e8afd12ed6ea5de64f0adda3d90e67c9d Mon Sep 17 00:00:00 2001
From: Randy Dunlap <rdunlap@infradead.org>
Date: Fri, 10 Jan 2025 22:27:48 -0800
Subject: [PATCH 14/16] blk-cgroup: rwstat: fix kernel-doc warnings in header
 file

Correct the function parameters to eliminate kernel-doc warnings:

blk-cgroup-rwstat.h:63: warning: Function parameter or struct member 'opf' not described in 'blkg_rwstat_add'
blk-cgroup-rwstat.h:63: warning: Excess function parameter 'op' description in 'blkg_rwstat_add'
blk-cgroup-rwstat.h:91: warning: Function parameter or struct member 'result' not described in 'blkg_rwstat_read'

Signed-off-by: Randy Dunlap <rdunlap@infradead.org>
Cc: Tejun Heo <tj@kernel.org>
Cc: Josef Bacik <josef@toxicpanda.com>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: cgroups@vger.kernel.org
Link: https://lore.kernel.org/r/20250111062748.910442-1-rdunlap@infradead.org
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-cgroup-rwstat.h | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/block/blk-cgroup-rwstat.h b/block/blk-cgroup-rwstat.h
index 022527b0b043..703a16fe1404 100644
--- a/block/blk-cgroup-rwstat.h
+++ b/block/blk-cgroup-rwstat.h
@@ -52,7 +52,7 @@ void blkg_rwstat_recursive_sum(struct blkcg_gq *blkg, struct blkcg_policy *pol,
 /**
  * blkg_rwstat_add - add a value to a blkg_rwstat
  * @rwstat: target blkg_rwstat
- * @op: REQ_OP and flags
+ * @opf: REQ_OP and flags
  * @val: value to add
  *
  * Add @val to @rwstat.  The counters are chosen according to @rw.  The
@@ -83,8 +83,9 @@ static inline void blkg_rwstat_add(struct blkg_rwstat *rwstat,
 /**
  * blkg_rwstat_read - read the current values of a blkg_rwstat
  * @rwstat: blkg_rwstat to read
+ * @result: where to put the current values
  *
- * Read the current snapshot of @rwstat and return it in the aux counts.
+ * Read the current snapshot of @rwstat and return it in the @result counts.
  */
 static inline void blkg_rwstat_read(struct blkg_rwstat *rwstat,
 		struct blkg_rwstat_sample *result)
-- 
2.50.1


From e494e451611a3de6ae95f99e8339210c157d70fb Mon Sep 17 00:00:00 2001
From: Randy Dunlap <rdunlap@infradead.org>
Date: Fri, 10 Jan 2025 22:27:58 -0800
Subject: [PATCH 15/16] partitions: ldm: remove the initial kernel-doc notation

Remove the file's first comment describing what the file is.
This comment is not in kernel-doc format so it causes a kernel-doc
warning.

ldm.h:13: warning: expecting prototype for ldm(). Prototype was for _FS_PT_LDM_H_() instead

Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2")
Signed-off-by: Randy Dunlap <rdunlap@infradead.org>
Cc: Richard Russon (FlatCap) <ldm@flatcap.org>
Cc: linux-ntfs-dev@lists.sourceforge.net
Cc: Jens Axboe <axboe@kernel.dk>
Link: https://lore.kernel.org/r/20250111062758.910458-1-rdunlap@infradead.org
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/partitions/ldm.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/block/partitions/ldm.h b/block/partitions/ldm.h
index e259180c8914..aa3bd050d8cd 100644
--- a/block/partitions/ldm.h
+++ b/block/partitions/ldm.h
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0-or-later
-/**
+/*
  * ldm - Part of the Linux-NTFS project.
  *
  * Copyright (C) 2001,2002 Richard Russon <ldm@flatcap.org>
-- 
2.50.1


From 127186cfb184eaccdfe948e6da66940cfa03efc5 Mon Sep 17 00:00:00 2001
From: Yu Kuai <yukuai3@huawei.com>
Date: Thu, 2 Jan 2025 19:28:41 +0800
Subject: [PATCH 16/16] md: reintroduce md-linear

The md-linear personality was removed by commit 849d18e27be9 ("md: Remove
deprecated CONFIG_MD_LINEAR") because it had been marked as deprecated for
a long time.

However, md-linear is widely used on top of underlying disks of different
sizes; sadly, we didn't know this until now. It is truly useful to create
partitions, assemble multiple raid arrays and then append one to the other.

People have to use dm-linear in this case now; however, they would prefer
to minimize the number of modules involved.

Fixes: 849d18e27be9 ("md: Remove deprecated CONFIG_MD_LINEAR")
Cc: stable@vger.kernel.org
Signed-off-by: Yu Kuai <yukuai3@huawei.com>
Acked-by: Coly Li <colyli@kernel.org>
Acked-by: Mike Snitzer <snitzer@kernel.org>
Link: https://lore.kernel.org/r/20250102112841.1227111-1-yukuai1@huaweicloud.com
Signed-off-by: Song Liu <song@kernel.org>
---
 drivers/md/Kconfig             |  13 ++
 drivers/md/Makefile            |   2 +
 drivers/md/md-autodetect.c     |   8 +-
 drivers/md/md-linear.c         | 354 +++++++++++++++++++++++++++++++++
 drivers/md/md.c                |   2 +-
 include/uapi/linux/raid/md_p.h |   2 +-
 include/uapi/linux/raid/md_u.h |   2 +
 7 files changed, 379 insertions(+), 4 deletions(-)
 create mode 100644 drivers/md/md-linear.c

diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig
index 1e9db8e4acdf..0b1870a09e1f 100644
--- a/drivers/md/Kconfig
+++ b/drivers/md/Kconfig
@@ -61,6 +61,19 @@ config MD_BITMAP_FILE
 	  various kernel APIs and can only work with files on a file system not
 	  actually sitting on the MD device.
 
+config MD_LINEAR
+	tristate "Linear (append) mode"
+	depends on BLK_DEV_MD
+	help
+	  If you say Y here, then your multiple devices driver will be able to
+	  use the so-called linear mode, i.e. it will combine the hard disk
+	  partitions by simply appending one to the other.
+
+	  To compile this as a module, choose M here: the module
+	  will be called linear.
+
+	  If unsure, say Y.
+
 config MD_RAID0
 	tristate "RAID-0 (striping) mode"
 	depends on BLK_DEV_MD
diff --git a/drivers/md/Makefile b/drivers/md/Makefile
index 476a214e4bdc..87bdfc9fe14c 100644
--- a/drivers/md/Makefile
+++ b/drivers/md/Makefile
@@ -29,12 +29,14 @@ dm-zoned-y	+= dm-zoned-target.o dm-zoned-metadata.o dm-zoned-reclaim.o
 
 md-mod-y	+= md.o md-bitmap.o
 raid456-y	+= raid5.o raid5-cache.o raid5-ppl.o
+linear-y       += md-linear.o
 
 # Note: link order is important.  All raid personalities
 # and must come before md.o, as they each initialise
 # themselves, and md.o may use the personalities when it
 # auto-initialised.
 
+obj-$(CONFIG_MD_LINEAR)		+= linear.o
 obj-$(CONFIG_MD_RAID0)		+= raid0.o
 obj-$(CONFIG_MD_RAID1)		+= raid1.o
 obj-$(CONFIG_MD_RAID10)		+= raid10.o
diff --git a/drivers/md/md-autodetect.c b/drivers/md/md-autodetect.c
index b2a00f213c2c..4b80165afd23 100644
--- a/drivers/md/md-autodetect.c
+++ b/drivers/md/md-autodetect.c
@@ -49,6 +49,7 @@ static int md_setup_ents __initdata;
  *             instead of just one.  -- KTK
  * 18May2000: Added support for persistent-superblock arrays:
  *             md=n,0,factor,fault,device-list   uses RAID0 for device n
+ *             md=n,-1,factor,fault,device-list  uses LINEAR for device n
  *             md=n,device-list      reads a RAID superblock from the devices
  *             elements in device-list are read by name_to_kdev_t so can be
  *             a hex number or something like /dev/hda1 /dev/sdb
@@ -87,7 +88,7 @@ static int __init md_setup(char *str)
 		md_setup_ents++;
 	switch (get_option(&str, &level)) {	/* RAID level */
 	case 2: /* could be 0 or -1.. */
-		if (level == 0) {
+		if (level == 0 || level == LEVEL_LINEAR) {
 			if (get_option(&str, &factor) != 2 ||	/* Chunk Size */
 					get_option(&str, &fault) != 2) {
 				printk(KERN_WARNING "md: Too few arguments supplied to md=.\n");
@@ -95,7 +96,10 @@ static int __init md_setup(char *str)
 			}
 			md_setup_args[ent].level = level;
 			md_setup_args[ent].chunk = 1 << (factor+12);
-			pername = "raid0";
+			if (level ==  LEVEL_LINEAR)
+				pername = "linear";
+			else
+				pername = "raid0";
 			break;
 		}
 		fallthrough;
diff --git a/drivers/md/md-linear.c b/drivers/md/md-linear.c
new file mode 100644
index 000000000000..53bc3fda9edb
--- /dev/null
+++ b/drivers/md/md-linear.c
@@ -0,0 +1,354 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * linear.c : Multiple Devices driver for Linux Copyright (C) 1994-96 Marc
+ * ZYNGIER <zyngier@ufr-info-p7.ibp.fr> or <maz@gloups.fdn.fr>
+ */
+
+#include <linux/blkdev.h>
+#include <linux/raid/md_u.h>
+#include <linux/seq_file.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <trace/events/block.h>
+#include "md.h"
+
+struct dev_info {
+	struct md_rdev	*rdev;
+	sector_t	end_sector;
+};
+
+struct linear_conf {
+	struct rcu_head         rcu;
+	sector_t                array_sectors;
+	/* a copy of mddev->raid_disks */
+	int                     raid_disks;
+	struct dev_info         disks[] __counted_by(raid_disks);
+};
+
+/*
+ * find which device holds a particular offset
+ */
+static inline struct dev_info *which_dev(struct mddev *mddev, sector_t sector)
+{
+	int lo, mid, hi;
+	struct linear_conf *conf;
+
+	lo = 0;
+	hi = mddev->raid_disks - 1;
+	conf = mddev->private;
+
+	/*
+	 * Binary Search
+	 */
+
+	while (hi > lo) {
+
+		mid = (hi + lo) / 2;
+		if (sector < conf->disks[mid].end_sector)
+			hi = mid;
+		else
+			lo = mid + 1;
+	}
+
+	return conf->disks + lo;
+}
+
+static sector_t linear_size(struct mddev *mddev, sector_t sectors, int raid_disks)
+{
+	struct linear_conf *conf;
+	sector_t array_sectors;
+
+	conf = mddev->private;
+	WARN_ONCE(sectors || raid_disks,
+		  "%s does not support generic reshape\n", __func__);
+	array_sectors = conf->array_sectors;
+
+	return array_sectors;
+}
+
+/* Build the stacked queue limits for a linear array and apply them. */
+static int linear_set_limits(struct mddev *mddev)
+{
+	struct queue_limits lim;
+	int err;
+
+	md_init_stacking_limits(&lim);
+	lim.max_hw_sectors = mddev->chunk_sectors;
+	lim.max_write_zeroes_sectors = mddev->chunk_sectors;
+	lim.io_min = mddev->chunk_sectors << 9;
+	err = mddev_stack_rdev_limits(mddev, &lim, MDDEV_STACK_INTEGRITY);
+	/* No queue_limits_start_update() was issued, so nothing to cancel. */
+	if (err)
+		return err;
+
+	return queue_limits_set(mddev->gendisk->queue, &lim);
+}
+
+static struct linear_conf *linear_conf(struct mddev *mddev, int raid_disks)
+{
+	struct linear_conf *conf;
+	struct md_rdev *rdev;
+	int ret = -EINVAL;
+	int cnt;
+	int i;
+
+	conf = kzalloc(struct_size(conf, disks, raid_disks), GFP_KERNEL);
+	if (!conf)
+		return ERR_PTR(-ENOMEM);
+
+	/*
+	 * conf->raid_disks is a copy of mddev->raid_disks. The copy is
+	 * kept in struct linear_conf because mddev->raid_disks is bumped
+	 * by linear_add() separately from the swap of mddev->private, so
+	 * it may briefly disagree with the number of entries in an older
+	 * conf->disks[] array that may still be in use.
+	 * conf->raid_disks is always consistent with the number of
+	 * entries in conf->disks[], and mddev->private is updated with
+	 * rcu_assign_pointer() in linear_add(), so such a race can be
+	 * avoided.
+	 */
+	conf->raid_disks = raid_disks;
+
+	cnt = 0;
+	conf->array_sectors = 0;
+
+	rdev_for_each(rdev, mddev) {
+		int j = rdev->raid_disk;
+		struct dev_info *disk = conf->disks + j;
+		sector_t sectors;
+
+		if (j < 0 || j >= raid_disks || disk->rdev) {
+			pr_warn("md/linear:%s: disk numbering problem. Aborting!\n",
+				mdname(mddev));
+			goto out;
+		}
+
+		disk->rdev = rdev;
+		if (mddev->chunk_sectors) {
+			sectors = rdev->sectors;
+			sector_div(sectors, mddev->chunk_sectors);
+			rdev->sectors = sectors * mddev->chunk_sectors;
+		}
+
+		conf->array_sectors += rdev->sectors;
+		cnt++;
+	}
+	if (cnt != raid_disks) {
+		pr_warn("md/linear:%s: not enough drives present. Aborting!\n",
+			mdname(mddev));
+		goto out;
+	}
+
+	/*
+	 * Here we calculate the device offsets.
+	 */
+	conf->disks[0].end_sector = conf->disks[0].rdev->sectors;
+
+	for (i = 1; i < raid_disks; i++)
+		conf->disks[i].end_sector =
+			conf->disks[i-1].end_sector +
+			conf->disks[i].rdev->sectors;
+
+	if (!mddev_is_dm(mddev)) {
+		ret = linear_set_limits(mddev);
+		if (ret)
+			goto out;
+	}
+
+	return conf;
+
+out:
+	kfree(conf);
+	return ERR_PTR(ret);
+}
+
+static int linear_run(struct mddev *mddev)
+{
+	struct linear_conf *conf;
+	int ret;
+
+	if (md_check_no_bitmap(mddev))
+		return -EINVAL;
+
+	conf = linear_conf(mddev, mddev->raid_disks);
+	if (IS_ERR(conf))
+		return PTR_ERR(conf);
+
+	mddev->private = conf;
+	md_set_array_sectors(mddev, linear_size(mddev, 0, 0));
+
+	ret =  md_integrity_register(mddev);
+	if (ret) {
+		kfree(conf);
+		mddev->private = NULL;
+	}
+	return ret;
+}
+
+static int linear_add(struct mddev *mddev, struct md_rdev *rdev)
+{
+	/* Adding a drive to a linear array allows the array to grow.
+	 * It is permitted if the new drive has a matching superblock
+	 * already on it, with raid_disk equal to raid_disks.
+	 * It is achieved by building a new struct linear_conf and
+	 * swapping it in in place of the current one; the old one is
+	 * released with kfree_rcu() once the swap is done.
+	 * Returns 0 on success or a negative errno on failure.
+	 */
+	struct linear_conf *newconf, *oldconf;
+
+	if (rdev->saved_raid_disk != mddev->raid_disks)
+		return -EINVAL;
+
+	rdev->raid_disk = rdev->saved_raid_disk;
+	rdev->saved_raid_disk = -1;
+
+	newconf = linear_conf(mddev, mddev->raid_disks + 1);
+	if (IS_ERR(newconf))	/* linear_conf() never returns NULL */
+		return PTR_ERR(newconf);
+
+	/* newconf->raid_disks already holds the increased value of
+	 * mddev->raid_disks, WARN_ONCE() is just used to make sure of
+	 * this. It is possible that oldconf is still referenced by a
+	 * reader, therefore kfree_rcu() is used to defer freeing
+	 * oldconf until no one uses it anymore.
+	 */
+	oldconf = rcu_dereference_protected(mddev->private,
+			lockdep_is_held(&mddev->reconfig_mutex));
+	mddev->raid_disks++;
+	WARN_ONCE(mddev->raid_disks != newconf->raid_disks,
+		"copied raid_disks doesn't match mddev->raid_disks");
+	rcu_assign_pointer(mddev->private, newconf);
+	md_set_array_sectors(mddev, linear_size(mddev, 0, 0));
+	set_capacity_and_notify(mddev->gendisk, mddev->array_sectors);
+	kfree_rcu(oldconf, rcu);
+	return 0;
+}
+
+static void linear_free(struct mddev *mddev, void *priv)
+{
+	struct linear_conf *conf = priv;
+
+	kfree(conf);
+}
+
+static bool linear_make_request(struct mddev *mddev, struct bio *bio)
+{
+	struct dev_info *tmp_dev;
+	sector_t start_sector, end_sector, data_offset;
+	sector_t bio_sector = bio->bi_iter.bi_sector;
+
+	if (unlikely(bio->bi_opf & REQ_PREFLUSH)
+	    && md_flush_request(mddev, bio))
+		return true;
+
+	tmp_dev = which_dev(mddev, bio_sector);
+	start_sector = tmp_dev->end_sector - tmp_dev->rdev->sectors;
+	end_sector = tmp_dev->end_sector;
+	data_offset = tmp_dev->rdev->data_offset;
+
+	if (unlikely(bio_sector >= end_sector ||
+		     bio_sector < start_sector))
+		goto out_of_bounds;
+
+	if (unlikely(is_rdev_broken(tmp_dev->rdev))) {
+		md_error(mddev, tmp_dev->rdev);
+		bio_io_error(bio);
+		return true;
+	}
+
+	if (unlikely(bio_end_sector(bio) > end_sector)) {
+		/* This bio crosses a device boundary, so we have to split it */
+		struct bio *split = bio_split(bio, end_sector - bio_sector,
+					      GFP_NOIO, &mddev->bio_set);
+
+		if (IS_ERR(split)) {
+			bio->bi_status = errno_to_blk_status(PTR_ERR(split));
+			bio_endio(bio);
+			return true;
+		}
+
+		bio_chain(split, bio);
+		submit_bio_noacct(bio);
+		bio = split;
+	}
+
+	md_account_bio(mddev, &bio);
+	bio_set_dev(bio, tmp_dev->rdev->bdev);
+	bio->bi_iter.bi_sector = bio->bi_iter.bi_sector -
+		start_sector + data_offset;
+
+	if (unlikely((bio_op(bio) == REQ_OP_DISCARD) &&
+		     !bdev_max_discard_sectors(bio->bi_bdev))) {
+		/* Just ignore it */
+		bio_endio(bio);
+	} else {
+		if (mddev->gendisk)
+			trace_block_bio_remap(bio, disk_devt(mddev->gendisk),
+					      bio_sector);
+		mddev_check_write_zeroes(mddev, bio);
+		submit_bio_noacct(bio);
+	}
+	return true;
+
+out_of_bounds:
+	pr_err("md/linear:%s: make_request: Sector %llu out of bounds on dev %pg: %llu sectors, offset %llu\n",
+	       mdname(mddev),
+	       (unsigned long long)bio->bi_iter.bi_sector,
+	       tmp_dev->rdev->bdev,
+	       (unsigned long long)tmp_dev->rdev->sectors,
+	       (unsigned long long)start_sector);
+	bio_io_error(bio);
+	return true;
+}
+
+static void linear_status(struct seq_file *seq, struct mddev *mddev)
+{
+	seq_printf(seq, " %dk rounding", mddev->chunk_sectors / 2);
+}
+
+/* Mark the array broken on the first reported failure and log it once. */
+static void linear_error(struct mddev *mddev, struct md_rdev *rdev)
+{
+	/* only the caller that flips MD_BROKEN prints the message */
+	if (!test_and_set_bit(MD_BROKEN, &mddev->flags)) {
+		pr_crit("md/linear:%s: Disk failure on %pg detected, failing array.\n",
+			mdname(mddev), rdev->bdev);
+	}
+}
+
+static void linear_quiesce(struct mddev *mddev, int state)
+{
+}
+
+static struct md_personality linear_personality = {
+	.name		= "linear",
+	.level		= LEVEL_LINEAR,
+	.owner		= THIS_MODULE,
+	.make_request	= linear_make_request,
+	.run		= linear_run,
+	.free		= linear_free,
+	.status		= linear_status,
+	.hot_add_disk	= linear_add,
+	.size		= linear_size,
+	.quiesce	= linear_quiesce,
+	.error_handler	= linear_error,
+};
+
+static int __init linear_init(void)
+{
+	return register_md_personality(&linear_personality);
+}
+
+static void linear_exit(void)
+{
+	unregister_md_personality(&linear_personality);
+}
+
+module_init(linear_init);
+module_exit(linear_exit);
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("Linear device concatenation personality for MD (deprecated)");
+MODULE_ALIAS("md-personality-1"); /* LINEAR - deprecated*/
+MODULE_ALIAS("md-linear");
+MODULE_ALIAS("md-level--1");
diff --git a/drivers/md/md.c b/drivers/md/md.c
index aebe12b0ee27..3dd013f82e26 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -8124,7 +8124,7 @@ void md_error(struct mddev *mddev, struct md_rdev *rdev)
 		return;
 	mddev->pers->error_handler(mddev, rdev);
 
-	if (mddev->pers->level == 0)
+	if (mddev->pers->level == 0 || mddev->pers->level == LEVEL_LINEAR)
 		return;
 
 	if (mddev->degraded && !test_bit(MD_BROKEN, &mddev->flags))
diff --git a/include/uapi/linux/raid/md_p.h b/include/uapi/linux/raid/md_p.h
index 5a43c23f53bf..ff47b6f0ba0f 100644
--- a/include/uapi/linux/raid/md_p.h
+++ b/include/uapi/linux/raid/md_p.h
@@ -233,7 +233,7 @@ struct mdp_superblock_1 {
 	char	set_name[32];	/* set and interpreted by user-space */
 
 	__le64	ctime;		/* lo 40 bits are seconds, top 24 are microseconds or 0*/
-	__le32	level;		/* 0,1,4,5 */
+	__le32	level;		/* 0,1,4,5, -1 (linear) */
 	__le32	layout;		/* only for raid5 and raid10 currently */
 	__le64	size;		/* used size of component devices, in 512byte sectors */
 
diff --git a/include/uapi/linux/raid/md_u.h b/include/uapi/linux/raid/md_u.h
index 7be89a4906e7..a893010735fb 100644
--- a/include/uapi/linux/raid/md_u.h
+++ b/include/uapi/linux/raid/md_u.h
@@ -103,6 +103,8 @@ typedef struct mdu_array_info_s {
 
 } mdu_array_info_t;
 
+#define LEVEL_LINEAR		(-1)
+
 /* we need a value for 'no level specified' and 0
  * means 'raid0', so we need something else.  This is
  * for internal use only
-- 
2.50.1