]> www.infradead.org Git - users/hch/xfs.git/commitdiff
xfs: allow queued realtime intents to drain before scrubbing
authorDarrick J. Wong <djwong@kernel.org>
Tue, 15 Oct 2024 19:39:59 +0000 (12:39 -0700)
committerDarrick J. Wong <djwong@kernel.org>
Tue, 5 Nov 2024 21:36:25 +0000 (13:36 -0800)
When a writer thread executes a chain of log intent items for the
realtime volume, the ILOCKs taken during each step are for each rt
metadata file, not the entire rt volume itself.  Although scrub takes
all rt metadata ILOCKs, this isn't sufficient to guard against scrub
checking the rt volume while that writer thread is in the middle of
finishing a chain because there's no higher level locking primitive
guarding the realtime volume.

When there's a collision, cross-referencing between data structures
(e.g. rtrmapbt and rtrefcountbt) yields false corruption events; if
repair is running, this results in incorrect repairs, which is
catastrophic.

Fix this by adding to the mount structure the same drain that we use to
protect scrub against concurrent AG updates, but this time for the
realtime volume.

[Contains a few cleanups from hch]

Cc: Christoph Hellwig <hch@lst.de>
Signed-off-by: Darrick J. Wong <djwong@kernel.org>
fs/xfs/scrub/bmap.c
fs/xfs/scrub/common.c
fs/xfs/scrub/common.h
fs/xfs/scrub/rgsuper.c
fs/xfs/scrub/rtbitmap.c
fs/xfs/scrub/rtsummary.c
fs/xfs/scrub/scrub.c
fs/xfs/xfs_drain.c
fs/xfs/xfs_drain.h

index 7e00312225ed10d1f7e2cd76905c7db885d244b2..b04e12cddc70052f433f05deb1032ed73552aa0e 100644 (file)
@@ -324,10 +324,15 @@ xchk_bmap_rt_iextent_xref(
                        irec->br_startoff, &error))
                return;
 
-       xchk_rtgroup_lock(&info->sc->sr, XCHK_RTGLOCK_ALL);
+       error = xchk_rtgroup_lock(info->sc, &info->sc->sr, XCHK_RTGLOCK_ALL);
+       if (!xchk_fblock_process_error(info->sc, info->whichfork,
+                       irec->br_startoff, &error))
+               goto out_free;
+
        xchk_xref_is_used_rt_space(info->sc, irec->br_startblock,
                        irec->br_blockcount);
 
+out_free:
        xchk_rtgroup_free(info->sc, &info->sc->sr);
 }
 
index 5cbd94b56582a47cf33668f78532f7b44d106466..613fb54e723edee5bf311a13f823a5ee174cc901 100644 (file)
@@ -719,13 +719,79 @@ xchk_rtgroup_init(
        return 0;
 }
 
-void
+/* Lock all the rt group metadata inode ILOCKs and wait for intents. */
+int
 xchk_rtgroup_lock(
+       struct xfs_scrub        *sc,
        struct xchk_rt          *sr,
        unsigned int            rtglock_flags)
 {
-       xfs_rtgroup_lock(sr->rtg, rtglock_flags);
+       int                     error = 0;
+
+       ASSERT(sr->rtg != NULL);
+
+       /*
+        * If we're /only/ locking the rtbitmap in shared mode, then we're
+        * obviously not trying to compare records in two metadata inodes.
+        * There's no need to drain intents here because the caller (most
+        * likely the rgsuper scanner) doesn't need that level of consistency.
+        */
+       if (rtglock_flags == XFS_RTGLOCK_BITMAP_SHARED) {
+               xfs_rtgroup_lock(sr->rtg, rtglock_flags);
+               sr->rtlock_flags = rtglock_flags;
+               return 0;
+       }
+
+       do {
+               if (xchk_should_terminate(sc, &error))
+                       return error;
+
+               xfs_rtgroup_lock(sr->rtg, rtglock_flags);
+
+               /*
+                * If we've grabbed a non-metadata file for scrubbing, we
+                * assume that holding its ILOCK will suffice to coordinate
+                * with any rt intent chains involving this inode.
+                */
+               if (sc->ip && !xfs_is_internal_inode(sc->ip))
+                       break;
+
+               /*
+                * Decide if the rt group is quiet enough for all metadata to
+                * be consistent with each other.  Regular file IO doesn't get
+                * to lock all the rt inodes at the same time, which means that
+                * there could be other threads in the middle of processing a
+                * chain of deferred ops.
+                *
+                * We just locked all the metadata inodes for this rt group;
+                * now take a look to see if there are any intents in progress.
+                * If there are, drop the rt group inode locks and wait for the
+                * intents to drain.  Since we hold the rt group inode locks
+                * for the duration of the scrub, this is the only time we have
+                * to sample the intents counter; any threads increasing it
+                * after this point can't possibly be in the middle of a chain
+                * of rt metadata updates.
+                *
+                * Obviously, this should be slanted against scrub and in favor
+                * of runtime threads.
+                */
+               if (!xfs_group_intent_busy(rtg_group(sr->rtg)))
+                       break;
+
+               xfs_rtgroup_unlock(sr->rtg, rtglock_flags);
+
+               if (!(sc->flags & XCHK_FSGATES_DRAIN))
+                       return -ECHRNG;
+               error = xfs_group_intent_drain(rtg_group(sr->rtg));
+               if (error) {
+                       if (error == -ERESTARTSYS)
+                               error = -EINTR;
+                       return error;
+               }
+       } while (1);
+
        sr->rtlock_flags = rtglock_flags;
+       return 0;
 }
 
 /*
@@ -1379,7 +1445,7 @@ xchk_fsgates_enable(
        trace_xchk_fsgates_enable(sc, scrub_fsgates);
 
        if (scrub_fsgates & XCHK_FSGATES_DRAIN)
-               xfs_drain_wait_enable();
+               xfs_defer_drain_wait_enable();
 
        if (scrub_fsgates & XCHK_FSGATES_QUOTA)
                xfs_dqtrx_hook_enable();
index 9ff3cafd867962755b97742a766be53109c7aaef..e734572a8dd6ec417143d69f90f25d56ab5ed9f8 100644 (file)
@@ -141,12 +141,13 @@ xchk_rtgroup_init_existing(
        return error == -ENOENT ? -EFSCORRUPTED : error;
 }
 
-void xchk_rtgroup_lock(struct xchk_rt *sr, unsigned int rtglock_flags);
+int xchk_rtgroup_lock(struct xfs_scrub *sc, struct xchk_rt *sr,
+               unsigned int rtglock_flags);
 void xchk_rtgroup_free(struct xfs_scrub *sc, struct xchk_rt *sr);
 #else
 # define xchk_rtgroup_init(sc, rgno, sr)               (-EFSCORRUPTED)
 # define xchk_rtgroup_init_existing(sc, rgno, sr)      (-EFSCORRUPTED)
-# define xchk_rtgroup_lock(sc, lockflags)              do { } while (0)
+# define xchk_rtgroup_lock(sc, sr, lockflags)          (-EFSCORRUPTED)
 # define xchk_rtgroup_free(sc, sr)                     do { } while (0)
 #endif /* CONFIG_XFS_RT */
 
index 463b3573bb761b35a2a89100eb5185226080025a..e062c7d12565cd47638050bcb2f6736183c8d568 100644 (file)
@@ -61,7 +61,9 @@ xchk_rgsuperblock(
        if (!xchk_xref_process_error(sc, 0, 0, &error))
                return error;
 
-       xchk_rtgroup_lock(&sc->sr, XFS_RTGLOCK_BITMAP_SHARED);
+       error = xchk_rtgroup_lock(sc, &sc->sr, XFS_RTGLOCK_BITMAP_SHARED);
+       if (error)
+               return error;
 
        /*
         * Since we already validated the rt superblock at mount time, we don't
index 376a36fd9a9cdd73c4da1b6a1ad3cb66c743882f..023d974f7994b1eb26ce374a8083eaee143bfb20 100644 (file)
@@ -30,6 +30,9 @@ xchk_setup_rtbitmap(
        struct xchk_rtbitmap    *rtb;
        int                     error;
 
+       if (xchk_need_intent_drain(sc))
+               xchk_fsgates_enable(sc, XCHK_FSGATES_DRAIN);
+
        rtb = kzalloc(sizeof(struct xchk_rtbitmap), XCHK_GFP_FLAGS);
        if (!rtb)
                return -ENOMEM;
@@ -58,12 +61,15 @@ xchk_setup_rtbitmap(
        if (error)
                return error;
 
+       error = xchk_rtgroup_lock(sc, &sc->sr, XCHK_RTGLOCK_ALL);
+       if (error)
+               return error;
+
        /*
         * Now that we've locked the rtbitmap, we can't race with growfsrt
         * trying to expand the bitmap or change the size of the rt volume.
         * Hence it is safe to compute and check the geometry values.
         */
-       xchk_rtgroup_lock(&sc->sr, XFS_RTGLOCK_BITMAP);
        if (mp->m_sb.sb_rblocks) {
                rtb->rextents = xfs_blen_to_rtbxlen(mp, mp->m_sb.sb_rblocks);
                rtb->rextslog = xfs_compute_rextslog(rtb->rextents);
index 49fc6250bafcaaaaff1930fd106d70d48d5ee2fb..f6338e5a61bc60c82215d6b158adc5389fb86c90 100644 (file)
@@ -90,6 +90,10 @@ xchk_setup_rtsummary(
        if (error)
                return error;
 
+       error = xchk_rtgroup_lock(sc, &sc->sr, XFS_RTGLOCK_BITMAP);
+       if (error)
+               return error;
+
        /*
         * Now that we've locked the rtbitmap and rtsummary, we can't race with
         * growfsrt trying to expand the summary or change the size of the rt
@@ -100,7 +104,6 @@ xchk_setup_rtsummary(
         * exclusively here.  If we ever start caring about running concurrent
         * fsmap with scrub this could be changed.
         */
-       xchk_rtgroup_lock(&sc->sr, XFS_RTGLOCK_BITMAP);
        if (mp->m_sb.sb_rblocks) {
                rts->rextents = xfs_blen_to_rtbxlen(mp, mp->m_sb.sb_rblocks);
                rts->rbmblocks = xfs_rtbitmap_blockcount(mp);
index 950f5a58dcd967bc94023d736ecf0a6e83bc00b3..652d347cee9929053019b573b684b59d5a01b8cc 100644 (file)
@@ -164,7 +164,7 @@ xchk_fsgates_disable(
        trace_xchk_fsgates_disable(sc, sc->flags & XCHK_FSGATES_ALL);
 
        if (sc->flags & XCHK_FSGATES_DRAIN)
-               xfs_drain_wait_disable();
+               xfs_defer_drain_wait_disable();
 
        if (sc->flags & XCHK_FSGATES_QUOTA)
                xfs_dqtrx_hook_disable();
index 5ede81fadbd8ca6756ae8c163d590ad1657e6c60..80137ff474f00c1b61ad929d260a8ef81203bad0 100644 (file)
@@ -13,9 +13,9 @@
 #include "xfs_trace.h"
 
 /*
- * Use a static key here to reduce the overhead of xfs_drain_rele.  If the
+ * Use a static key here to reduce the overhead of xfs_defer_drain_rele.  If the
  * compiler supports jump labels, the static branch will be replaced by a nop
- * sled when there are no xfs_drain_wait callers.  Online fsck is currently
+ * sled when there are no xfs_defer_drain_wait callers.  Online fsck is currently
  * the only caller, so this is a reasonable tradeoff.
  *
  * Note: Patching the kernel code requires taking the cpu hotplug lock.  Other
  * XFS callers cannot hold any locks that might be used by memory reclaim or
  * writeback when calling the static_branch_{inc,dec} functions.
  */
-static DEFINE_STATIC_KEY_FALSE(xfs_drain_waiter_gate);
+static DEFINE_STATIC_KEY_FALSE(xfs_defer_drain_waiter_gate);
 
 void
-xfs_drain_wait_disable(void)
+xfs_defer_drain_wait_disable(void)
 {
-       static_branch_dec(&xfs_drain_waiter_gate);
+       static_branch_dec(&xfs_defer_drain_waiter_gate);
 }
 
 void
-xfs_drain_wait_enable(void)
+xfs_defer_drain_wait_enable(void)
 {
-       static_branch_inc(&xfs_drain_waiter_gate);
+       static_branch_inc(&xfs_defer_drain_waiter_gate);
 }
 
 void
@@ -71,7 +71,7 @@ static inline bool has_waiters(struct wait_queue_head *wq_head)
 static inline void xfs_defer_drain_rele(struct xfs_defer_drain *dr)
 {
        if (atomic_dec_and_test(&dr->dr_count) &&
-           static_branch_unlikely(&xfs_drain_waiter_gate) &&
+           static_branch_unlikely(&xfs_defer_drain_waiter_gate) &&
            has_waiters(&dr->dr_waiters))
                wake_up(&dr->dr_waiters);
 }
index efcf88df9a5e70e5a493341898032777d0729cea..4d446dbf65e519231b5a2b2fa063741e8e4225df 100644 (file)
@@ -26,8 +26,8 @@ struct xfs_defer_drain {
 void xfs_defer_drain_init(struct xfs_defer_drain *dr);
 void xfs_defer_drain_free(struct xfs_defer_drain *dr);
 
-void xfs_drain_wait_disable(void);
-void xfs_drain_wait_enable(void);
+void xfs_defer_drain_wait_disable(void);
+void xfs_defer_drain_wait_enable(void);
 
 /*
  * Deferred Work Intent Drains
@@ -61,6 +61,9 @@ void xfs_drain_wait_enable(void);
  * All functions that create work items must increment the intent counter as
  * soon as the item is added to the transaction and cannot drop the counter
  * until the item is finished or cancelled.
+ *
+ * The same principles apply to realtime groups because the rt metadata inode
+ * ILOCKs are not held across transaction rolls.
  */
 struct xfs_group *xfs_group_intent_get(struct xfs_mount *mp,
                xfs_fsblock_t fsbno, enum xfs_group_type type);