From ffa35567632c0059e7f380ed155e26a07ec4153f Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Thu, 13 Feb 2025 15:52:29 +0900 Subject: [PATCH 01/16] nvmet: pci-epf: Do not uselessly write the CSTS register The function nvmet_pci_epf_poll_cc_work() will do nothing if there are no changes to the controller configuration (CC) register. However, even for such case, this function still calls nvmet_update_cc() and uselessly writes the CSTS register. Avoid this by simply rescheduling the poll_cc work if the CC register has not changed. Also reschedule the poll_cc work if the function nvmet_pci_epf_enable_ctrl() fails to allow the host the chance to try again enabling the controller. While at it, since there is no point in trying to handle the CC register as quickly as possible, change the poll_cc work scheduling interval to 10 ms (from 5ms), to avoid excessive read accesses to that register. Fixes: 0faa0fe6f90e ("nvmet: New NVMe PCI endpoint function target driver") Signed-off-by: Damien Le Moal Reviewed-by: Christoph Hellwig Reviewed-by: Sagi Grimberg Signed-off-by: Keith Busch --- drivers/nvme/target/pci-epf.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/drivers/nvme/target/pci-epf.c b/drivers/nvme/target/pci-epf.c index efd4623fb002..b646a8f468ea 100644 --- a/drivers/nvme/target/pci-epf.c +++ b/drivers/nvme/target/pci-epf.c @@ -46,7 +46,7 @@ static DEFINE_MUTEX(nvmet_pci_epf_ports_mutex); /* * BAR CC register and SQ polling intervals. */ -#define NVMET_PCI_EPF_CC_POLL_INTERVAL msecs_to_jiffies(5) +#define NVMET_PCI_EPF_CC_POLL_INTERVAL msecs_to_jiffies(10) #define NVMET_PCI_EPF_SQ_POLL_INTERVAL msecs_to_jiffies(5) #define NVMET_PCI_EPF_SQ_POLL_IDLE msecs_to_jiffies(5000) @@ -1910,12 +1910,15 @@ static void nvmet_pci_epf_poll_cc_work(struct work_struct *work) old_cc = ctrl->cc; new_cc = nvmet_pci_epf_bar_read32(ctrl, NVME_REG_CC); + if (new_cc == old_cc) + goto reschedule_work; + ctrl->cc = new_cc; if (nvmet_cc_en(new_cc) && !nvmet_cc_en(old_cc)) { ret = nvmet_pci_epf_enable_ctrl(ctrl); if (ret) - return; + goto reschedule_work; } if (!nvmet_cc_en(new_cc) && nvmet_cc_en(old_cc)) @@ -1932,6 +1935,7 @@ static void nvmet_pci_epf_poll_cc_work(struct work_struct *work) nvmet_update_cc(ctrl->tctrl, ctrl->cc); nvmet_pci_epf_bar_write32(ctrl, NVME_REG_CSTS, ctrl->csts); +reschedule_work: schedule_delayed_work(&ctrl->poll_cc, NVMET_PCI_EPF_CC_POLL_INTERVAL); } -- 2.51.0 From 01ef7ff7dd3cd8cc71af8d1d1496be853281a948 Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Thu, 13 Feb 2025 15:52:30 +0900 Subject: [PATCH 02/16] nvmet: pci-epf: Avoid RCU stalls under heavy workload The delayed work item function nvmet_pci_epf_poll_sqs_work() polls all submission queues and keeps running in a loop as long as commands are being submitted by the host. Depending on the preemption configuration of the kernel, under heavy command workload, this function can thus run for more than RCU_CPU_STALL_TIMEOUT seconds, leading to a RCU stall: rcu: INFO: rcu_sched self-detected stall on CPU rcu: 5-....: (20998 ticks this GP) idle=4244/1/0x4000000000000000 softirq=301/301 fqs=5132 rcu: (t=21000 jiffies g=-443 q=12 ncpus=8) CPU: 5 UID: 0 PID: 82 Comm: kworker/5:1 Not tainted 6.14.0-rc2 #1 Hardware name: Radxa ROCK 5B (DT) Workqueue: events nvmet_pci_epf_poll_sqs_work [nvmet_pci_epf] pstate: 60400009 (nZCv daif +PAN -UAO -TCO -DIT -SSBS BTYPE=--) pc : dw_edma_device_tx_status+0xb8/0x130 lr : dw_edma_device_tx_status+0x9c/0x130 sp : ffff800080b5bbb0 x29: ffff800080b5bbb0 x28: ffff0331c5c78400 x27: ffff0331c1cd1960 x26: ffff0331c0e39010 x25: ffff0331c20e4000 x24: ffff0331c20e4a90 x23: 0000000000000000 x22: 0000000000000001 x21: 00000000005aca33 x20: ffff800080b5bc30 x19: ffff0331c123e370 x18: 000000000ab29e62 x17: ffffb2a878c9c118 x16: ffff0335bde82040 x15: 0000000000000000 x14: 000000000000017b x13: 00000000ee601780 x12: 0000000000000018 x11: 0000000000000000 x10: 0000000000000001 x9 : 0000000000000040 x8 : 00000000ee601780 x7 : 0000000105c785c0 x6 : ffff0331c1027d80 x5 : 0000000001ee7ad6 x4 : ffff0335bdea16c0 x3 : ffff0331c123e438 x2 : 00000000005aca33 x1 : 0000000000000000 x0 : ffff0331c123e410 Call trace: dw_edma_device_tx_status+0xb8/0x130 (P) dma_sync_wait+0x60/0xbc nvmet_pci_epf_dma_transfer+0x128/0x264 [nvmet_pci_epf] nvmet_pci_epf_poll_sqs_work+0x2a0/0x2e0 [nvmet_pci_epf] process_one_work+0x144/0x390 worker_thread+0x27c/0x458 kthread+0xe8/0x19c ret_from_fork+0x10/0x20 The solution for this is simply to explicitly allow rescheduling using cond_resched(). However, since doing so for every loop of nvmet_pci_epf_poll_sqs_work() significantly degrades performance (for 4K random reads using 4 I/O queues, the maximum IOPS goes down from 137 KIOPS to 110 KIOPS), call cond_resched() every second to avoid the RCU stalls. Fixes: 0faa0fe6f90e ("nvmet: New NVMe PCI endpoint function target driver") Signed-off-by: Damien Le Moal Reviewed-by: Christoph Hellwig Reviewed-by: Sagi Grimberg Signed-off-by: Keith Busch --- drivers/nvme/target/pci-epf.c | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/drivers/nvme/target/pci-epf.c b/drivers/nvme/target/pci-epf.c index b646a8f468ea..565d2bd36dcd 100644 --- a/drivers/nvme/target/pci-epf.c +++ b/drivers/nvme/target/pci-epf.c @@ -1694,6 +1694,7 @@ static void nvmet_pci_epf_poll_sqs_work(struct work_struct *work) struct nvmet_pci_epf_ctrl *ctrl = container_of(work, struct nvmet_pci_epf_ctrl, poll_sqs.work); struct nvmet_pci_epf_queue *sq; + unsigned long limit = jiffies; unsigned long last = 0; int i, nr_sqs; @@ -1708,6 +1709,16 @@ static void nvmet_pci_epf_poll_sqs_work(struct work_struct *work) nr_sqs++; } + /* + * If we have been running for a while, reschedule to let other + * tasks run and to avoid RCU stalls. + */ + if (time_is_before_jiffies(limit + secs_to_jiffies(1))) { + cond_resched(); + limit = jiffies; + continue; + } + if (nr_sqs) { last = jiffies; continue; -- 2.51.0 From cd513e0434c3e736c549bc99bf7982658b25114d Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Thu, 13 Feb 2025 15:52:31 +0900 Subject: [PATCH 03/16] nvme: tcp: Fix compilation warning with W=1 When compiling with W=1, a warning result for the function nvme_tcp_set_queue_io_cpu(): host/tcp.c:1578: warning: Function parameter or struct member 'queue' not described in 'nvme_tcp_set_queue_io_cpu' host/tcp.c:1578: warning: expecting prototype for Track the number of queues assigned to each cpu using a global per(). Prototype was for nvme_tcp_set_queue_io_cpu() instead Avoid this warning by using the regular comment format for the function nvme_tcp_set_queue_io_cpu() instead of the kdoc comment format. Fixes: 32193789878c ("nvme-tcp: Fix I/O queue cpu spreading for multiple controllers") Signed-off-by: Damien Le Moal Reviewed-by: Christoph Hellwig Reviewed-by: Sagi Grimberg Signed-off-by: Keith Busch --- drivers/nvme/host/tcp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c index 038b35238c26..a7398e9e5f71 100644 --- a/drivers/nvme/host/tcp.c +++ b/drivers/nvme/host/tcp.c @@ -1608,7 +1608,7 @@ static bool nvme_tcp_poll_queue(struct nvme_tcp_queue *queue) ctrl->io_queues[HCTX_TYPE_POLL]; } -/** +/* * Track the number of queues assigned to each cpu using a global per-cpu * counter and select the least used cpu from the mq_map. Our goal is to spread * different controllers I/O threads across different cpu cores. -- 2.51.0 From 578539e0969028f711c34d9a4565931edfe1d730 Mon Sep 17 00:00:00 2001 From: Caleb Sander Mateos Date: Fri, 24 Jan 2025 11:43:10 -0700 Subject: [PATCH 04/16] nvme-tcp: fix connect failure on receiving partial ICResp PDU nvme_tcp_init_connection() attempts to receive an ICResp PDU but only checks that the return value from recvmsg() is non-negative. If the sender closes the TCP connection or sends fewer than 128 bytes, this check will pass even though the full PDU wasn't received. Ensure the full ICResp PDU is received by checking that recvmsg() returns the expected 128 bytes. Additionally set the MSG_WAITALL flag for recvmsg(), as a sender could split the ICResp over multiple TCP frames. Without MSG_WAITALL, recvmsg() could return prematurely with only part of the PDU. Fixes: 3f2304f8c6d6 ("nvme-tcp: add NVMe over TCP host driver") Signed-off-by: Caleb Sander Mateos Reviewed-by: Sagi Grimberg Reviewed-by: Hannes Reinecke Signed-off-by: Keith Busch --- drivers/nvme/host/tcp.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c index a7398e9e5f71..8a9131c95a3d 100644 --- a/drivers/nvme/host/tcp.c +++ b/drivers/nvme/host/tcp.c @@ -1492,11 +1492,14 @@ static int nvme_tcp_init_connection(struct nvme_tcp_queue *queue) msg.msg_control = cbuf; msg.msg_controllen = sizeof(cbuf); } + msg.msg_flags = MSG_WAITALL; ret = kernel_recvmsg(queue->sock, &msg, &iov, 1, iov.iov_len, msg.msg_flags); - if (ret < 0) { + if (ret < sizeof(*icresp)) { pr_warn("queue %d: failed to receive icresp, error %d\n", nvme_tcp_queue_id(queue), ret); + if (ret >= 0) + ret = -ECONNRESET; goto free_icresp; } ret = -ENOTCONN; -- 2.51.0 From 487a3ea7b1b8ba2ca7d2c2bb3c3594dc360d6261 Mon Sep 17 00:00:00 2001 From: Caleb Sander Mateos Date: Thu, 13 Feb 2025 10:05:14 -0700 Subject: [PATCH 05/16] nvme/ioctl: add missing space in err message nvme_validate_passthru_nsid() logs an err message whose format string is split over 2 lines. There is a missing space between the two pieces, resulting in log lines like "... does not match nsid (1)of namespace". Add the missing space between ")" and "of". Also combine the format string pieces onto a single line to make the err message easier to grep. Fixes: e7d4b5493a2d ("nvme: factor out a nvme_validate_passthru_nsid helper") Signed-off-by: Caleb Sander Mateos Reviewed-by: Christoph Hellwig Signed-off-by: Keith Busch --- drivers/nvme/host/ioctl.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/nvme/host/ioctl.c b/drivers/nvme/host/ioctl.c index e8930146847a..b1b46c2713e1 100644 --- a/drivers/nvme/host/ioctl.c +++ b/drivers/nvme/host/ioctl.c @@ -283,8 +283,7 @@ static bool nvme_validate_passthru_nsid(struct nvme_ctrl *ctrl, { if (ns && nsid != ns->head->ns_id) { dev_err(ctrl->device, - "%s: nsid (%u) in cmd does not match nsid (%u)" - "of namespace\n", + "%s: nsid (%u) in cmd does not match nsid (%u) of namespace\n", current->comm, nsid, ns->head->ns_id); return false; } -- 2.51.0 From d422247d14a53fe825b1778edf104167d8fd8f3f Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Thu, 13 Feb 2025 15:49:59 +0900 Subject: [PATCH 06/16] nvme: Cleanup the definition of the controller config register fields Reorganized the enum used to define the fields of the contrller configuration (CC) register in include/linux/nvme.h to: 1) Group together all the values defined for each field. 2) Add the missing field masks definitions. 3) Add comments to describe the enum and each field. Signed-off-by: Damien Le Moal Reviewed-by: Chaitanya Kulkarni Reviewed-by: Christoph Hellwig Signed-off-by: Keith Busch --- include/linux/nvme.h | 40 +++++++++++++++++++++++++++++++++------- 1 file changed, 33 insertions(+), 7 deletions(-) diff --git a/include/linux/nvme.h b/include/linux/nvme.h index fe3b60818fdc..2dc05b1c3283 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -199,28 +199,54 @@ enum { #define NVME_NVM_IOSQES 6 #define NVME_NVM_IOCQES 4 +/* + * Controller Configuration (CC) register (Offset 14h) + */ enum { + /* Enable (EN): bit 0 */ NVME_CC_ENABLE = 1 << 0, NVME_CC_EN_SHIFT = 0, + + /* Bits 03:01 are reserved (NVMe Base Specification rev 2.1) */ + + /* I/O Command Set Selected (CSS): bits 06:04 */ NVME_CC_CSS_SHIFT = 4, - NVME_CC_MPS_SHIFT = 7, - NVME_CC_AMS_SHIFT = 11, - NVME_CC_SHN_SHIFT = 14, - NVME_CC_IOSQES_SHIFT = 16, - NVME_CC_IOCQES_SHIFT = 20, + NVME_CC_CSS_MASK = 7 << NVME_CC_CSS_SHIFT, NVME_CC_CSS_NVM = 0 << NVME_CC_CSS_SHIFT, NVME_CC_CSS_CSI = 6 << NVME_CC_CSS_SHIFT, - NVME_CC_CSS_MASK = 7 << NVME_CC_CSS_SHIFT, + + /* Memory Page Size (MPS): bits 10:07 */ + NVME_CC_MPS_SHIFT = 7, + NVME_CC_MPS_MASK = 0xf << NVME_CC_MPS_SHIFT, + + /* Arbitration Mechanism Selected (AMS): bits 13:11 */ + NVME_CC_AMS_SHIFT = 11, + NVME_CC_AMS_MASK = 7 << NVME_CC_AMS_SHIFT, NVME_CC_AMS_RR = 0 << NVME_CC_AMS_SHIFT, NVME_CC_AMS_WRRU = 1 << NVME_CC_AMS_SHIFT, NVME_CC_AMS_VS = 7 << NVME_CC_AMS_SHIFT, + + /* Shutdown Notification (SHN): bits 15:14 */ + NVME_CC_SHN_SHIFT = 14, + NVME_CC_SHN_MASK = 3 << NVME_CC_SHN_SHIFT, NVME_CC_SHN_NONE = 0 << NVME_CC_SHN_SHIFT, NVME_CC_SHN_NORMAL = 1 << NVME_CC_SHN_SHIFT, NVME_CC_SHN_ABRUPT = 2 << NVME_CC_SHN_SHIFT, - NVME_CC_SHN_MASK = 3 << NVME_CC_SHN_SHIFT, + + /* I/O Submission Queue Entry Size (IOSQES): bits 19:16 */ + NVME_CC_IOSQES_SHIFT = 16, + NVME_CC_IOSQES_MASK = 0xf << NVME_CC_IOSQES_SHIFT, NVME_CC_IOSQES = NVME_NVM_IOSQES << NVME_CC_IOSQES_SHIFT, + + /* I/O Completion Queue Entry Size (IOCQES): bits 23:20 */ + NVME_CC_IOCQES_SHIFT = 20, + NVME_CC_IOCQES_MASK = 0xf << NVME_CC_IOCQES_SHIFT, NVME_CC_IOCQES = NVME_NVM_IOCQES << NVME_CC_IOCQES_SHIFT, + + /* Controller Ready Independent of Media Enable (CRIME): bit 24 */ NVME_CC_CRIME = 1 << 24, + + /* Bits 25:31 are reserved (NVMe Base Specification rev 2.1) */ }; enum { -- 2.51.0 From 2ba8cf918f0d1873cd5430ae2cc3c41711a144d7 Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Thu, 13 Feb 2025 15:50:00 +0900 Subject: [PATCH 07/16] nvmet: Use enum definitions instead of hardcoded values Change the definition of the inline functions nvmet_cc_en(), nvmet_cc_css(), nvmet_cc_mps(), nvmet_cc_ams(), nvmet_cc_shn(), nvmet_cc_iosqes(), and nvmet_cc_iocqes() to use the enum difinitions in include/linux/nvme.h instead of hardcoded values. Signed-off-by: Damien Le Moal Reviewed-by: Chaitanya Kulkarni Reviewed-by: Christoph Hellwig Signed-off-by: Keith Busch --- drivers/nvme/target/nvmet.h | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/drivers/nvme/target/nvmet.h b/drivers/nvme/target/nvmet.h index 4be8d22d2d8d..d2c1233981e1 100644 --- a/drivers/nvme/target/nvmet.h +++ b/drivers/nvme/target/nvmet.h @@ -784,37 +784,37 @@ u16 nvmet_report_invalid_opcode(struct nvmet_req *req); static inline bool nvmet_cc_en(u32 cc) { - return (cc >> NVME_CC_EN_SHIFT) & 0x1; + return (cc & NVME_CC_ENABLE) >> NVME_CC_EN_SHIFT; } static inline u8 nvmet_cc_css(u32 cc) { - return (cc >> NVME_CC_CSS_SHIFT) & 0x7; + return (cc & NVME_CC_CSS_MASK) >> NVME_CC_CSS_SHIFT; } static inline u8 nvmet_cc_mps(u32 cc) { - return (cc >> NVME_CC_MPS_SHIFT) & 0xf; + return (cc & NVME_CC_MPS_MASK) >> NVME_CC_MPS_SHIFT; } static inline u8 nvmet_cc_ams(u32 cc) { - return (cc >> NVME_CC_AMS_SHIFT) & 0x7; + return (cc & NVME_CC_AMS_MASK) >> NVME_CC_AMS_SHIFT; } static inline u8 nvmet_cc_shn(u32 cc) { - return (cc >> NVME_CC_SHN_SHIFT) & 0x3; + return (cc & NVME_CC_SHN_MASK) >> NVME_CC_SHN_SHIFT; } static inline u8 nvmet_cc_iosqes(u32 cc) { - return (cc >> NVME_CC_IOSQES_SHIFT) & 0xf; + return (cc & NVME_CC_IOSQES_MASK) >> NVME_CC_IOSQES_SHIFT; } static inline u8 nvmet_cc_iocqes(u32 cc) { - return (cc >> NVME_CC_IOCQES_SHIFT) & 0xf; + return (cc & NVME_CC_IOCQES_MASK) >> NVME_CC_IOCQES_SHIFT; } /* Convert a 32-bit number to a 16-bit 0's based number */ -- 2.51.0 From eefa72a15ea03fd009333aaa9f0e360b2578e434 Mon Sep 17 00:00:00 2001 From: Hector Martin Date: Thu, 13 Feb 2025 11:12:59 -0500 Subject: [PATCH 08/16] apple-nvme: Release power domains when probe fails Signed-off-by: Hector Martin Reviewed-by: Neal Gompa Reviewed-by: Sven Peter Signed-off-by: Alyssa Rosenzweig Signed-off-by: Keith Busch --- drivers/nvme/host/apple.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/nvme/host/apple.c b/drivers/nvme/host/apple.c index 1de11b722f04..7995f0776bc0 100644 --- a/drivers/nvme/host/apple.c +++ b/drivers/nvme/host/apple.c @@ -1516,6 +1516,7 @@ static struct apple_nvme *apple_nvme_alloc(struct platform_device *pdev) return anv; put_dev: + apple_nvme_detach_genpd(anv); put_device(anv->dev); return ERR_PTR(ret); } @@ -1549,6 +1550,7 @@ out_uninit_ctrl: nvme_uninit_ctrl(&anv->ctrl); out_put_ctrl: nvme_put_ctrl(&anv->ctrl); + apple_nvme_detach_genpd(anv); return ret; } -- 2.51.0 From 3f22421f6a240b33ab8ffbf662bf0a8f336f405b Mon Sep 17 00:00:00 2001 From: Hector Martin Date: Thu, 13 Feb 2025 11:12:58 -0500 Subject: [PATCH 09/16] apple-nvme: Support coprocessors left idle iBoot on at least some firmwares/machines leaves ANS2 running, requiring a wake command instead of a CPU boot (and if we reset ANS2 in that state, everything breaks). Only stop the CPU if RTKit was running, and only do the reset dance if the CPU is stopped. Normal shutdown handoff: - RTKit not yet running - CPU detected not running - Reset - CPU powerup - RTKit boot wait ANS2 left running/idle: - RTKit not yet running - CPU detected running - RTKit wake message Sleep/resume cycle: - RTKit shutdown - CPU stopped - (sleep here) - CPU detected not running - Reset - CPU powerup - RTKit boot wait Shutdown or device removal: - RTKit shutdown - CPU stopped Therefore, the CPU running bit serves as a consistent flag of whether the coprocessor is fully stopped or just idle. Signed-off-by: Hector Martin Reviewed-by: Neal Gompa Reviewed-by: Sven Peter Signed-off-by: Alyssa Rosenzweig Signed-off-by: Keith Busch --- drivers/nvme/host/apple.c | 53 ++++++++++++++++++++++++++------------- 1 file changed, 36 insertions(+), 17 deletions(-) diff --git a/drivers/nvme/host/apple.c b/drivers/nvme/host/apple.c index 7995f0776bc0..a060f69558e7 100644 --- a/drivers/nvme/host/apple.c +++ b/drivers/nvme/host/apple.c @@ -1011,25 +1011,37 @@ static void apple_nvme_reset_work(struct work_struct *work) ret = apple_rtkit_shutdown(anv->rtk); if (ret) goto out; + + writel(0, anv->mmio_coproc + APPLE_ANS_COPROC_CPU_CONTROL); } - writel(0, anv->mmio_coproc + APPLE_ANS_COPROC_CPU_CONTROL); + /* + * Only do the soft-reset if the CPU is not running, which means either we + * or the previous stage shut it down cleanly. + */ + if (!(readl(anv->mmio_coproc + APPLE_ANS_COPROC_CPU_CONTROL) & + APPLE_ANS_COPROC_CPU_CONTROL_RUN)) { - ret = reset_control_assert(anv->reset); - if (ret) - goto out; + ret = reset_control_assert(anv->reset); + if (ret) + goto out; - ret = apple_rtkit_reinit(anv->rtk); - if (ret) - goto out; + ret = apple_rtkit_reinit(anv->rtk); + if (ret) + goto out; - ret = reset_control_deassert(anv->reset); - if (ret) - goto out; + ret = reset_control_deassert(anv->reset); + if (ret) + goto out; + + writel(APPLE_ANS_COPROC_CPU_CONTROL_RUN, + anv->mmio_coproc + APPLE_ANS_COPROC_CPU_CONTROL); + + ret = apple_rtkit_boot(anv->rtk); + } else { + ret = apple_rtkit_wake(anv->rtk); + } - writel(APPLE_ANS_COPROC_CPU_CONTROL_RUN, - anv->mmio_coproc + APPLE_ANS_COPROC_CPU_CONTROL); - ret = apple_rtkit_boot(anv->rtk); if (ret) { dev_err(anv->dev, "ANS did not boot"); goto out; @@ -1565,9 +1577,12 @@ static void apple_nvme_remove(struct platform_device *pdev) apple_nvme_disable(anv, true); nvme_uninit_ctrl(&anv->ctrl); - if (apple_rtkit_is_running(anv->rtk)) + if (apple_rtkit_is_running(anv->rtk)) { apple_rtkit_shutdown(anv->rtk); + writel(0, anv->mmio_coproc + APPLE_ANS_COPROC_CPU_CONTROL); + } + apple_nvme_detach_genpd(anv); } @@ -1576,8 +1591,11 @@ static void apple_nvme_shutdown(struct platform_device *pdev) struct apple_nvme *anv = platform_get_drvdata(pdev); apple_nvme_disable(anv, true); - if (apple_rtkit_is_running(anv->rtk)) + if (apple_rtkit_is_running(anv->rtk)) { apple_rtkit_shutdown(anv->rtk); + + writel(0, anv->mmio_coproc + APPLE_ANS_COPROC_CPU_CONTROL); + } } static int apple_nvme_resume(struct device *dev) @@ -1594,10 +1612,11 @@ static int apple_nvme_suspend(struct device *dev) apple_nvme_disable(anv, true); - if (apple_rtkit_is_running(anv->rtk)) + if (apple_rtkit_is_running(anv->rtk)) { ret = apple_rtkit_shutdown(anv->rtk); - writel(0, anv->mmio_coproc + APPLE_ANS_COPROC_CPU_CONTROL); + writel(0, anv->mmio_coproc + APPLE_ANS_COPROC_CPU_CONTROL); + } return ret; } -- 2.51.0 From f13409bb3f9140dad7256febcb478f0c9600312c Mon Sep 17 00:00:00 2001 From: Daniel Wagner Date: Fri, 14 Feb 2025 09:02:04 +0100 Subject: [PATCH 10/16] nvme-fc: rely on state transitions to handle connectivity loss It's not possible to call nvme_state_ctrl_state with holding a spin lock, because nvme_state_ctrl_state calls cancel_delayed_work_sync when fastfail is enabled. Instead syncing the ASSOC_FLAG and state transitions using a lock, it's possible to only rely on the state machine transitions. That means nvme_fc_ctrl_connectivity_loss should unconditionally call nvme_reset_ctrl which avoids the read race on the ctrl state variable. Actually, it's not necessary to test in which state the ctrl is, the reset work will only scheduled when the state machine is in LIVE state. In nvme_fc_create_association, the LIVE state can only be entered if it was previously CONNECTING. If this is not possible then the reset handler got triggered. Thus just error out here. Fixes: ee59e3820ca9 ("nvme-fc: do not ignore connectivity loss during connecting") Closes: https://lore.kernel.org/all/denqwui6sl5erqmz2gvrwueyxakl5txzbbiu3fgebryzrfxunm@iwxuthct377m/ Reported-by: Shinichiro Kawasaki Tested-by: Shin'ichiro Kawasaki Reviewed-by: Sagi Grimberg Signed-off-by: Daniel Wagner Signed-off-by: Keith Busch --- drivers/nvme/host/fc.c | 67 ++++-------------------------------------- 1 file changed, 6 insertions(+), 61 deletions(-) diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c index f4f1866fbd5b..b9929a5a7f4e 100644 --- a/drivers/nvme/host/fc.c +++ b/drivers/nvme/host/fc.c @@ -781,61 +781,12 @@ restart: static void nvme_fc_ctrl_connectivity_loss(struct nvme_fc_ctrl *ctrl) { - enum nvme_ctrl_state state; - unsigned long flags; - dev_info(ctrl->ctrl.device, "NVME-FC{%d}: controller connectivity lost. Awaiting " "Reconnect", ctrl->cnum); - spin_lock_irqsave(&ctrl->lock, flags); set_bit(ASSOC_FAILED, &ctrl->flags); - state = nvme_ctrl_state(&ctrl->ctrl); - spin_unlock_irqrestore(&ctrl->lock, flags); - - switch (state) { - case NVME_CTRL_NEW: - case NVME_CTRL_LIVE: - /* - * Schedule a controller reset. The reset will terminate the - * association and schedule the reconnect timer. Reconnects - * will be attempted until either the ctlr_loss_tmo - * (max_retries * connect_delay) expires or the remoteport's - * dev_loss_tmo expires. - */ - if (nvme_reset_ctrl(&ctrl->ctrl)) { - dev_warn(ctrl->ctrl.device, - "NVME-FC{%d}: Couldn't schedule reset.\n", - ctrl->cnum); - nvme_delete_ctrl(&ctrl->ctrl); - } - break; - - case NVME_CTRL_CONNECTING: - /* - * The association has already been terminated and the - * controller is attempting reconnects. No need to do anything - * futher. Reconnects will be attempted until either the - * ctlr_loss_tmo (max_retries * connect_delay) expires or the - * remoteport's dev_loss_tmo expires. - */ - break; - - case NVME_CTRL_RESETTING: - /* - * Controller is already in the process of terminating the - * association. No need to do anything further. The reconnect - * step will kick in naturally after the association is - * terminated. - */ - break; - - case NVME_CTRL_DELETING: - case NVME_CTRL_DELETING_NOIO: - default: - /* no action to take - let it delete */ - break; - } + nvme_reset_ctrl(&ctrl->ctrl); } /** @@ -3071,7 +3022,6 @@ nvme_fc_create_association(struct nvme_fc_ctrl *ctrl) struct nvmefc_ls_rcv_op *disls = NULL; unsigned long flags; int ret; - bool changed; ++ctrl->ctrl.nr_reconnects; @@ -3177,23 +3127,18 @@ nvme_fc_create_association(struct nvme_fc_ctrl *ctrl) else ret = nvme_fc_recreate_io_queues(ctrl); } + if (!ret && test_bit(ASSOC_FAILED, &ctrl->flags)) + ret = -EIO; if (ret) goto out_term_aen_ops; - spin_lock_irqsave(&ctrl->lock, flags); - if (!test_bit(ASSOC_FAILED, &ctrl->flags)) - changed = nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_LIVE); - else + if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_LIVE)) { ret = -EIO; - spin_unlock_irqrestore(&ctrl->lock, flags); - - if (ret) goto out_term_aen_ops; + } ctrl->ctrl.nr_reconnects = 0; - - if (changed) - nvme_start_ctrl(&ctrl->ctrl); + nvme_start_ctrl(&ctrl->ctrl); return 0; /* Success */ -- 2.51.0 From d2fe192348f93fe3a0cb1e33e4aba58e646397f4 Mon Sep 17 00:00:00 2001 From: Daniel Wagner Date: Fri, 14 Feb 2025 09:02:03 +0100 Subject: [PATCH 11/16] nvme: only allow entering LIVE from CONNECTING state The fabric transports and also the PCI transport are not entering the LIVE state from NEW or RESETTING. This makes the state machine more restrictive and allows to catch not supported state transitions, e.g. directly switching from RESETTING to LIVE. Reviewed-by: Sagi Grimberg Signed-off-by: Daniel Wagner Signed-off-by: Keith Busch --- drivers/nvme/host/core.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 818d4e49aab5..f028913e2e62 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -564,8 +564,6 @@ bool nvme_change_ctrl_state(struct nvme_ctrl *ctrl, switch (new_state) { case NVME_CTRL_LIVE: switch (old_state) { - case NVME_CTRL_NEW: - case NVME_CTRL_RESETTING: case NVME_CTRL_CONNECTING: changed = true; fallthrough; -- 2.51.0 From 8c1624b63a7d24142a2bbc3a5ee7e95f004ea36e Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Thu, 20 Feb 2025 13:18:30 +0200 Subject: [PATCH 12/16] nvme-tcp: fix possible UAF in nvme_tcp_poll nvme_tcp_poll() may race with the send path error handler because it may complete the request while it is actively being polled for completion, resulting in a UAF panic [1]: We should make sure to stop polling when we see an error when trying to read from the socket. Hence make sure to propagate the error so that the block layer breaks the polling cycle. [1]: -- [35665.692310] nvme nvme2: failed to send request -13 [35665.702265] nvme nvme2: unsupported pdu type (3) [35665.702272] BUG: kernel NULL pointer dereference, address: 0000000000000000 [35665.702542] nvme nvme2: queue 1 receive failed: -22 [35665.703209] #PF: supervisor write access in kernel mode [35665.703213] #PF: error_code(0x0002) - not-present page [35665.703214] PGD 8000003801cce067 P4D 8000003801cce067 PUD 37e6f79067 PMD 0 [35665.703220] Oops: 0002 [#1] SMP PTI [35665.703658] nvme nvme2: starting error recovery [35665.705809] Hardware name: Inspur aaabbb/YZMB-00882-104, BIOS 4.1.26 09/22/2022 [35665.705812] Workqueue: kblockd blk_mq_requeue_work [35665.709172] RIP: 0010:_raw_spin_lock+0xc/0x30 [35665.715788] Call Trace: [35665.716201] [35665.716613] ? show_trace_log_lvl+0x1c1/0x2d9 [35665.717049] ? show_trace_log_lvl+0x1c1/0x2d9 [35665.717457] ? blk_mq_request_bypass_insert+0x2c/0xb0 [35665.717950] ? __die_body.cold+0x8/0xd [35665.718361] ? page_fault_oops+0xac/0x140 [35665.718749] ? blk_mq_start_request+0x30/0xf0 [35665.719144] ? nvme_tcp_queue_rq+0xc7/0x170 [nvme_tcp] [35665.719547] ? exc_page_fault+0x62/0x130 [35665.719938] ? asm_exc_page_fault+0x22/0x30 [35665.720333] ? _raw_spin_lock+0xc/0x30 [35665.720723] blk_mq_request_bypass_insert+0x2c/0xb0 [35665.721101] blk_mq_requeue_work+0xa5/0x180 [35665.721451] process_one_work+0x1e8/0x390 [35665.721809] worker_thread+0x53/0x3d0 [35665.722159] ? process_one_work+0x390/0x390 [35665.722501] kthread+0x124/0x150 [35665.722849] ? set_kthread_struct+0x50/0x50 [35665.723182] ret_from_fork+0x1f/0x30 Reported-by: Zhang Guanghui Signed-off-by: Sagi Grimberg Reviewed-by: Chaitanya Kulkarni Signed-off-by: Keith Busch --- drivers/nvme/host/tcp.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c index 8a9131c95a3d..8c14018201db 100644 --- a/drivers/nvme/host/tcp.c +++ b/drivers/nvme/host/tcp.c @@ -2699,6 +2699,7 @@ static int nvme_tcp_poll(struct blk_mq_hw_ctx *hctx, struct io_comp_batch *iob) { struct nvme_tcp_queue *queue = hctx->driver_data; struct sock *sk = queue->sock->sk; + int ret; if (!test_bit(NVME_TCP_Q_LIVE, &queue->flags)) return 0; @@ -2706,9 +2707,9 @@ static int nvme_tcp_poll(struct blk_mq_hw_ctx *hctx, struct io_comp_batch *iob) set_bit(NVME_TCP_Q_POLLING, &queue->flags); if (sk_can_busy_loop(sk) && skb_queue_empty_lockless(&sk->sk_receive_queue)) sk_busy_loop(sk, true); - nvme_tcp_try_recv(queue); + ret = nvme_tcp_try_recv(queue); clear_bit(NVME_TCP_Q_POLLING, &queue->flags); - return queue->nr_cqe; + return ret < 0 ? ret : queue->nr_cqe; } static int nvme_tcp_get_address(struct nvme_ctrl *ctrl, char *buf, int size) -- 2.51.0 From 6a3572e10f740acd48e2713ef37e92186a3ce5e8 Mon Sep 17 00:00:00 2001 From: Icenowy Zheng Date: Thu, 13 Feb 2025 01:04:43 +0800 Subject: [PATCH 13/16] nvme-pci: clean up CMBMSC when registering CMB fails CMB decoding should get disabled when the CMB block isn't successfully registered to P2P DMA subsystem. Clean up the CMBMSC register in this error handling codepath to disable CMB decoding (and CMBLOC/CMBSZ registers). Signed-off-by: Icenowy Zheng Reviewed-by: Christoph Hellwig Signed-off-by: Keith Busch --- drivers/nvme/host/pci.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 950289405ef2..218506e3dabe 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -2003,6 +2003,7 @@ static void nvme_map_cmb(struct nvme_dev *dev) if (pci_p2pdma_add_resource(pdev, bar, size, offset)) { dev_warn(dev->ctrl.device, "failed to register the CMB\n"); + hi_lo_writeq(0, dev->bar + NVME_REG_CMBMSC); return; } -- 2.51.0 From 56cf7ef0d490b28fad8f8629fc135c5ab7c9f54e Mon Sep 17 00:00:00 2001 From: Icenowy Zheng Date: Thu, 13 Feb 2025 01:04:44 +0800 Subject: [PATCH 14/16] nvme-pci: skip CMB blocks incompatible with PCI P2P DMA The PCI P2PDMA code will register the CMB block to the memory hot-plugging subsystem, which have an alignment requirement. Memory blocks that do not satisfy this alignment requirement (usually 2MB) will lead to a WARNING from memory hotplugging. Verify the CMB block's address and size against the alignment and only try to send CMB blocks compatible with it to prevent this warning. Tested on Intel DC D4502 SSD, which has a 512K CMB block that is too small for memory hotplugging (thus PCI P2PDMA). Signed-off-by: Icenowy Zheng Signed-off-by: Keith Busch --- drivers/nvme/host/pci.c | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 218506e3dabe..640590b21728 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -1982,6 +1982,18 @@ static void nvme_map_cmb(struct nvme_dev *dev) if (offset > bar_size) return; + /* + * Controllers may support a CMB size larger than their BAR, for + * example, due to being behind a bridge. Reduce the CMB to the + * reported size of the BAR + */ + size = min(size, bar_size - offset); + + if (!IS_ALIGNED(size, memremap_compat_align()) || + !IS_ALIGNED(pci_resource_start(pdev, bar), + memremap_compat_align())) + return; + /* * Tell the controller about the host side address mapping the CMB, * and enable CMB decoding for the NVMe 1.4+ scheme: @@ -1992,14 +2004,6 @@ static void nvme_map_cmb(struct nvme_dev *dev) dev->bar + NVME_REG_CMBMSC); } - /* - * Controllers may support a CMB size larger than their BAR, - * for example, due to being behind a bridge. Reduce the CMB to - * the reported size of the BAR - */ - if (size > bar_size - offset) - size = bar_size - offset; - if (pci_p2pdma_add_resource(pdev, bar, size, offset)) { dev_warn(dev->ctrl.device, "failed to register the CMB\n"); -- 2.51.0 From 00817f0f1c45b007965f5676b9a2013bb39c7228 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Mon, 24 Feb 2025 17:13:30 -0800 Subject: [PATCH 15/16] nvme-ioctl: fix leaked requests on mapping error All the callers assume nvme_map_user_request() frees the request on a failure. This wasn't happening on invalid metadata or io_uring command flags, so we've been leaking those requests. Fixes: 23fd22e55b767b ("nvme: wire up fixed buffer support for nvme passthrough") Fixes: 7c2fd76048e95d ("nvme: fix metadata handling in nvme-passthrough") Reviewed-by: Damien Le Moal Reviewed-by: Kanchan Joshi Reviewed-by: Christoph Hellwig Signed-off-by: Keith Busch --- drivers/nvme/host/ioctl.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/drivers/nvme/host/ioctl.c b/drivers/nvme/host/ioctl.c index b1b46c2713e1..24e2c702da7a 100644 --- a/drivers/nvme/host/ioctl.c +++ b/drivers/nvme/host/ioctl.c @@ -128,8 +128,10 @@ static int nvme_map_user_request(struct request *req, u64 ubuffer, if (!nvme_ctrl_sgl_supported(ctrl)) dev_warn_once(ctrl->device, "using unchecked data buffer\n"); if (has_metadata) { - if (!supports_metadata) - return -EINVAL; + if (!supports_metadata) { + ret = -EINVAL; + goto out; + } if (!nvme_ctrl_meta_sgl_supported(ctrl)) dev_warn_once(ctrl->device, "using unchecked metadata buffer\n"); @@ -139,8 +141,10 @@ static int nvme_map_user_request(struct request *req, u64 ubuffer, struct iov_iter iter; /* fixedbufs is only for non-vectored io */ - if (WARN_ON_ONCE(flags & NVME_IOCTL_VEC)) - return -EINVAL; + if (WARN_ON_ONCE(flags & NVME_IOCTL_VEC)) { + ret = -EINVAL; + goto out; + } ret = io_uring_cmd_import_fixed(ubuffer, bufflen, rq_data_dir(req), &iter, ioucmd); if (ret < 0) -- 2.51.0 From 0979ff3676b1b4e6a20970bc265491d23c2da42b Mon Sep 17 00:00:00 2001 From: Maurizio Lombardi Date: Thu, 27 Feb 2025 20:00:05 +0100 Subject: [PATCH 16/16] nvmet: remove old function prototype nvmet_subsys_nsid_exists() doesn't exist anymore Fixes: 74d16965d7ac ("nvmet-loop: avoid using mutex in IO hotpath") Signed-off-by: Maurizio Lombardi Signed-off-by: Keith Busch --- drivers/nvme/target/nvmet.h | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/nvme/target/nvmet.h b/drivers/nvme/target/nvmet.h index d2c1233981e1..fcf4f460dc9a 100644 --- a/drivers/nvme/target/nvmet.h +++ b/drivers/nvme/target/nvmet.h @@ -647,7 +647,6 @@ void nvmet_subsys_disc_changed(struct nvmet_subsys *subsys, struct nvmet_host *host); void nvmet_add_async_event(struct nvmet_ctrl *ctrl, u8 event_type, u8 event_info, u8 log_page); -bool nvmet_subsys_nsid_exists(struct nvmet_subsys *subsys, u32 nsid); #define NVMET_MIN_QUEUE_SIZE 16 #define NVMET_MAX_QUEUE_SIZE 1024 -- 2.51.0