nvme-pci: fix stuck reset on concurrent DPC and HP

author Keith Busch <kbusch@kernel.org>

Thu, 6 Mar 2025 22:25:57 +0000 (14:25 -0800)

committer Keith Busch <kbusch@kernel.org>

Mon, 10 Mar 2025 16:15:48 +0000 (09:15 -0700)
author Keith Busch <kbusch@kernel.org>
Thu, 6 Mar 2025 22:25:57 +0000 (14:25 -0800)
committer Keith Busch <kbusch@kernel.org>
Mon, 10 Mar 2025 16:15:48 +0000 (09:15 -0700)
diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c

index 640590b2172828cf70c05bd35234b44f286d0d16..e59aad269abf8334178a362e91e0c036679e5dc8 100644 (file)
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -1411,9 +1411,20 @@ static enum blk_eh_timer_return nvme_timeout(struct request *req)
         struct nvme_dev *dev = nvmeq->dev;
         struct request *abort_req;
         struct nvme_command cmd = { };
+       struct pci_dev *pdev = to_pci_dev(dev->dev);
         u32 csts = readl(dev->bar + NVME_REG_CSTS);
         u8 opcode;
  
+       /*
+        * Shut down the device immediately if we see it is disconnected. This
+        * unblocks PCIe error handling if the nvme driver is waiting in
+        * error_resume for a device that has been removed. We can't unbind the
+        * driver while the driver's error callback is waiting to complete, so
+        * we're relying on a timeout to break that deadlock if a removal
+        * occurs while reset work is running.
+        */
+       if (pci_dev_is_disconnected(pdev))
+               nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_DELETING);
         if (nvme_state_terminal(&dev->ctrl))
                 goto disable;
  
@@ -1421,7 +1432,7 @@ static enum blk_eh_timer_return nvme_timeout(struct request *req)
          * the recovery mechanism will surely fail.
          */
         mb();
-       if (pci_channel_offline(to_pci_dev(dev->dev)))
+       if (pci_channel_offline(pdev))
                 return BLK_EH_RESET_TIMER;
  
         /*
author	Keith Busch <kbusch@kernel.org>
	Thu, 6 Mar 2025 22:25:57 +0000 (14:25 -0800)
committer	Keith Busch <kbusch@kernel.org>
	Mon, 10 Mar 2025 16:15:48 +0000 (09:15 -0700)