www.infradead.org Git - users/willy/pagecache.git/commitdiff
nfs/blocklayout: Limit repeat device registration on failure
author: Benjamin Coddington <bcodding@redhat.com>
Fri, 22 Nov 2024 15:11:12 +0000 (10:11 -0500)
committer: Trond Myklebust <trond.myklebust@hammerspace.com>
Thu, 28 Nov 2024 17:55:32 +0000 (12:55 -0500)
Every pNFS SCSI IO wants to do LAYOUTGET, then within the layout find the
device which can drive GETDEVINFO, then finally may need to prep the device
with a reservation.  This slow work makes a mess of IO latencies if one of
the later steps is going to fail for a while.

If we're unable to register a SCSI device, ensure we mark the device as
unavailable so that it will time out and be re-added via GETDEVINFO.  This
avoids repeated doomed attempts to register a device in the IO path.

Add some clarifying comments as well.

Fixes: d869da91cccb ("nfs/blocklayout: Fix premature PR key unregistration")
Signed-off-by: Benjamin Coddington <bcodding@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
fs/nfs/blocklayout/blocklayout.c

index 0becdec129704f233ef9fdddcd6ef2122a7e4d72..47189476b5538bd277fec46254631025c642c7d3 100644 (file)
@@ -571,19 +571,32 @@ retry:
        if (!node)
                return ERR_PTR(-ENODEV);
 
+       /*
+        * Devices that are marked unavailable are left in the cache with a
+        * timeout to avoid sending GETDEVINFO after every LAYOUTGET, or
+        * constantly attempting to register the device.  Once marked as
+        * unavailable they must be deleted and never reused.
+        */
        if (test_bit(NFS_DEVICEID_UNAVAILABLE, &node->flags)) {
                unsigned long end = jiffies;
                unsigned long start = end - PNFS_DEVICE_RETRY_TIMEOUT;
 
                if (!time_in_range(node->timestamp_unavailable, start, end)) {
+                       /* Uncork subsequent GETDEVINFO operations for this device */
                        nfs4_delete_deviceid(node->ld, node->nfs_client, id);
                        goto retry;
                }
                goto out_put;
        }
 
-       if (!bl_register_dev(container_of(node, struct pnfs_block_dev, node)))
+       if (!bl_register_dev(container_of(node, struct pnfs_block_dev, node))) {
+               /*
+                * If we cannot register, treat this device as transient:
+                * Make a negative cache entry for the device
+                */
+               nfs4_mark_deviceid_unavailable(node);
                goto out_put;
+       }
 
        return node;