www.infradead.org Git - users/hch/xfs.git/commitdiff
xfs: flush inodegc before swapon
author Christoph Hellwig <hch@lst.de>
Thu, 6 Feb 2025 06:15:00 +0000 (07:15 +0100)
committer Carlos Maiolino <cem@kernel.org>
Mon, 10 Feb 2025 09:29:27 +0000 (10:29 +0100)
Fix the brand new xfstest that tries to swapon on a recently unshared
file and use the chance to document the other bit of magic in this
function.

The big comment is taken from a mailing list post by Dave Chinner.

Fixes: 5e672cd69f0a53 ("xfs: introduce xfs_inodegc_push()")
Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Darrick J. Wong <djwong@kernel.org>
Reviewed-by: Dave Chinner <dchinner@redhat.com>
Signed-off-by: Carlos Maiolino <cem@kernel.org>
fs/xfs/xfs_aops.c

index a80608e82c9b909b954b257a6c218fd32e4f3f52..6d9965b546cbb50102294d1752fc536cf4e0f35e 100644 (file)
@@ -19,6 +19,7 @@
 #include "xfs_reflink.h"
 #include "xfs_errortag.h"
 #include "xfs_error.h"
+#include "xfs_icache.h"
 
 struct xfs_writepage_ctx {
        struct iomap_writepage_ctx ctx;
@@ -533,7 +534,39 @@ xfs_vm_swap_activate(
        struct file                     *swap_file,
        sector_t                        *span)
 {
-       sis->bdev = xfs_inode_buftarg(XFS_I(file_inode(swap_file)))->bt_bdev;
+       struct xfs_inode                *ip = XFS_I(file_inode(swap_file));
+
+       /*
+        * Swap file activation can race against concurrent shared extent
+        * removal in files that have been cloned.  If this happens,
+        * iomap_swapfile_iter() can fail because it encountered a shared
+        * extent even though an operation is in progress to remove those
+        * shared extents.
+        *
+        * This race becomes problematic when we defer extent removal
+        * operations beyond the end of a syscall (i.e. use async background
+        * processing algorithms).  Users think the extents are no longer
+        * shared, but iomap_swapfile_iter() still sees them as shared
+        * because the refcountbt entries for the extents being removed have
+        * not yet been updated.  Hence the swapon call fails unexpectedly.
+        *
+        * The race condition is currently most obvious from the unlink()
+        * operation as extent removal is deferred until after the last
+        * reference to the inode goes away.  We then process the extent
+        * removal asynchronously, which triggers the "syscall completed but
+        * work not done" condition mentioned above.  To close this race
+        * window, we need to flush any pending inodegc operations to ensure
+        * they have updated the refcountbt records before we try to map the
+        * swapfile.
+        */
+       xfs_inodegc_flush(ip->i_mount);
+
+       /*
+        * Direct the swap code to the correct block device when this file
+        * sits on the RT device.
+        */
+       sis->bdev = xfs_inode_buftarg(ip)->bt_bdev;
+
        return iomap_swapfile_activate(sis, swap_file, span,
                        &xfs_read_iomap_ops);
 }