]> www.infradead.org Git - users/jedix/linux-maple.git/commitdiff
xfs: allow queued realtime intents to drain before scrubbing
authorDarrick J. Wong <djwong@kernel.org>
Thu, 21 Nov 2024 00:20:33 +0000 (16:20 -0800)
committerDarrick J. Wong <djwong@kernel.org>
Mon, 23 Dec 2024 21:06:06 +0000 (13:06 -0800)
When a writer thread executes a chain of log intent items for the
realtime volume, the ILOCKs taken during each step are for each rt
metadata file, not the entire rt volume itself.  Although scrub takes
all rt metadata ILOCKs, this isn't sufficient to guard against scrub
checking the rt volume while that writer thread is in the middle of
finishing a chain because there's no higher level locking primitive
guarding the realtime volume.

When there's a collision, cross-referencing between data structures
(e.g. rtrmapbt and rtrefcountbt) yields false corruption events; if
repair is running, this results in incorrect repairs, which is
catastrophic.

Fix this by adding to the mount structure the same drain that we use to
protect scrub against concurrent AG updates, but this time for the
realtime volume.

Signed-off-by: "Darrick J. Wong" <djwong@kernel.org>
Reviewed-by: Christoph Hellwig <hch@lst.de>
fs/xfs/scrub/bmap.c
fs/xfs/scrub/common.c
fs/xfs/scrub/common.h
fs/xfs/scrub/rgsuper.c
fs/xfs/scrub/rtbitmap.c
fs/xfs/scrub/rtsummary.c
fs/xfs/scrub/scrub.c
fs/xfs/xfs_drain.c
fs/xfs/xfs_drain.h

index 0d7ad692822d48cdbee051bfa802ace74e719de9..dd99366643f832d798ba76fa55122dfd70177800 100644 (file)
@@ -324,10 +324,15 @@ xchk_bmap_rt_iextent_xref(
                        irec->br_startoff, &error))
                return;
 
-       xchk_rtgroup_lock(&info->sc->sr, XCHK_RTGLOCK_ALL);
+       error = xchk_rtgroup_lock(info->sc, &info->sc->sr, XCHK_RTGLOCK_ALL);
+       if (!xchk_fblock_process_error(info->sc, info->whichfork,
+                       irec->br_startoff, &error))
+               goto out_free;
+
        xchk_xref_is_used_rt_space(info->sc, irec->br_startblock,
                        irec->br_blockcount);
 
+out_free:
        xchk_rtgroup_free(info->sc, &info->sc->sr);
 }
 
index 5cbd94b56582a47cf33668f78532f7b44d106466..613fb54e723edee5bf311a13f823a5ee174cc901 100644 (file)
@@ -719,13 +719,79 @@ xchk_rtgroup_init(
        return 0;
 }
 
-void
+/* Lock all the rt group metadata inode ILOCKs and wait for intents. */
+int
 xchk_rtgroup_lock(
+       struct xfs_scrub        *sc,
        struct xchk_rt          *sr,
        unsigned int            rtglock_flags)
 {
-       xfs_rtgroup_lock(sr->rtg, rtglock_flags);
+       int                     error = 0;
+
+       ASSERT(sr->rtg != NULL);
+
+       /*
+        * If we're /only/ locking the rtbitmap in shared mode, then we're
+        * obviously not trying to compare records in two metadata inodes.
+        * There's no need to drain intents here because the caller (most
+        * likely the rgsuper scanner) doesn't need that level of consistency.
+        */
+       if (rtglock_flags == XFS_RTGLOCK_BITMAP_SHARED) {
+               xfs_rtgroup_lock(sr->rtg, rtglock_flags);
+               sr->rtlock_flags = rtglock_flags;
+               return 0;
+       }
+
+       do {
+               if (xchk_should_terminate(sc, &error))
+                       return error;
+
+               xfs_rtgroup_lock(sr->rtg, rtglock_flags);
+
+               /*
+                * If we've grabbed a non-metadata file for scrubbing, we
+                * assume that holding its ILOCK will suffice to coordinate
+                * with any rt intent chains involving this inode.
+                */
+               if (sc->ip && !xfs_is_internal_inode(sc->ip))
+                       break;
+
+               /*
+                * Decide if the rt group is quiet enough for all metadata to
+                * be consistent with each other.  Regular file IO doesn't get
+                * to lock all the rt inodes at the same time, which means that
+                * there could be other threads in the middle of processing a
+                * chain of deferred ops.
+                *
+                * We just locked all the metadata inodes for this rt group;
+                * now take a look to see if there are any intents in progress.
+                * If there are, drop the rt group inode locks and wait for the
+                * intents to drain.  Since we hold the rt group inode locks
+                * for the duration of the scrub, this is the only time we have
+                * to sample the intents counter; any threads increasing it
+                * after this point can't possibly be in the middle of a chain
+                * of rt metadata updates.
+                *
+                * Obviously, this should be slanted against scrub and in favor
+                * of runtime threads.
+                */
+               if (!xfs_group_intent_busy(rtg_group(sr->rtg)))
+                       break;
+
+               xfs_rtgroup_unlock(sr->rtg, rtglock_flags);
+
+               if (!(sc->flags & XCHK_FSGATES_DRAIN))
+                       return -ECHRNG;
+               error = xfs_group_intent_drain(rtg_group(sr->rtg));
+               if (error) {
+                       if (error == -ERESTARTSYS)
+                               error = -EINTR;
+                       return error;
+               }
+       } while (1);
+
        sr->rtlock_flags = rtglock_flags;
+       return 0;
 }
 
 /*
@@ -1379,7 +1445,7 @@ xchk_fsgates_enable(
        trace_xchk_fsgates_enable(sc, scrub_fsgates);
 
        if (scrub_fsgates & XCHK_FSGATES_DRAIN)
-               xfs_drain_wait_enable();
+               xfs_defer_drain_wait_enable();
 
        if (scrub_fsgates & XCHK_FSGATES_QUOTA)
                xfs_dqtrx_hook_enable();
index 9ff3cafd867962755b97742a766be53109c7aaef..e734572a8dd6ec417143d69f90f25d56ab5ed9f8 100644 (file)
@@ -141,12 +141,13 @@ xchk_rtgroup_init_existing(
        return error == -ENOENT ? -EFSCORRUPTED : error;
 }
 
-void xchk_rtgroup_lock(struct xchk_rt *sr, unsigned int rtglock_flags);
+int xchk_rtgroup_lock(struct xfs_scrub *sc, struct xchk_rt *sr,
+               unsigned int rtglock_flags);
 void xchk_rtgroup_free(struct xfs_scrub *sc, struct xchk_rt *sr);
 #else
 # define xchk_rtgroup_init(sc, rgno, sr)               (-EFSCORRUPTED)
 # define xchk_rtgroup_init_existing(sc, rgno, sr)      (-EFSCORRUPTED)
-# define xchk_rtgroup_lock(sc, lockflags)              do { } while (0)
+# define xchk_rtgroup_lock(sc, sr, lockflags)          (-EFSCORRUPTED)
 # define xchk_rtgroup_free(sc, sr)                     do { } while (0)
 #endif /* CONFIG_XFS_RT */
 
index 463b3573bb761b35a2a89100eb5185226080025a..e062c7d12565cd47638050bcb2f6736183c8d568 100644 (file)
@@ -61,7 +61,9 @@ xchk_rgsuperblock(
        if (!xchk_xref_process_error(sc, 0, 0, &error))
                return error;
 
-       xchk_rtgroup_lock(&sc->sr, XFS_RTGLOCK_BITMAP_SHARED);
+       error = xchk_rtgroup_lock(sc, &sc->sr, XFS_RTGLOCK_BITMAP_SHARED);
+       if (error)
+               return error;
 
        /*
         * Since we already validated the rt superblock at mount time, we don't
index fb4970c877abd3bba7de1eee03e0e64544596b7b..819026ea2d741f7f9a79a29664fa79d89fbec175 100644 (file)
@@ -30,6 +30,9 @@ xchk_setup_rtbitmap(
        struct xchk_rtbitmap    *rtb;
        int                     error;
 
+       if (xchk_need_intent_drain(sc))
+               xchk_fsgates_enable(sc, XCHK_FSGATES_DRAIN);
+
        rtb = kzalloc(sizeof(struct xchk_rtbitmap), XCHK_GFP_FLAGS);
        if (!rtb)
                return -ENOMEM;
@@ -57,12 +60,15 @@ xchk_setup_rtbitmap(
        if (error)
                return error;
 
+       error = xchk_rtgroup_lock(sc, &sc->sr, XCHK_RTGLOCK_ALL);
+       if (error)
+               return error;
+
        /*
         * Now that we've locked the rtbitmap, we can't race with growfsrt
         * trying to expand the bitmap or change the size of the rt volume.
         * Hence it is safe to compute and check the geometry values.
         */
-       xchk_rtgroup_lock(&sc->sr, XFS_RTGLOCK_BITMAP);
        if (mp->m_sb.sb_rblocks) {
                rtb->rextents = xfs_blen_to_rtbxlen(mp, mp->m_sb.sb_rblocks);
                rtb->rextslog = xfs_compute_rextslog(rtb->rextents);
index f1af5431b38856f844e126ac6824e8befc593d13..4ac679c1bd29cd81a84f0d27f8fe3d267a6576e4 100644 (file)
@@ -89,6 +89,10 @@ xchk_setup_rtsummary(
        if (error)
                return error;
 
+       error = xchk_rtgroup_lock(sc, &sc->sr, XFS_RTGLOCK_BITMAP);
+       if (error)
+               return error;
+
        /*
         * Now that we've locked the rtbitmap and rtsummary, we can't race with
         * growfsrt trying to expand the summary or change the size of the rt
@@ -99,7 +103,6 @@ xchk_setup_rtsummary(
         * exclusively here.  If we ever start caring about running concurrent
         * fsmap with scrub this could be changed.
         */
-       xchk_rtgroup_lock(&sc->sr, XFS_RTGLOCK_BITMAP);
        if (mp->m_sb.sb_rblocks) {
                rts->rextents = xfs_blen_to_rtbxlen(mp, mp->m_sb.sb_rblocks);
                rts->rbmblocks = xfs_rtbitmap_blockcount(mp);
index 950f5a58dcd967bc94023d736ecf0a6e83bc00b3..652d347cee9929053019b573b684b59d5a01b8cc 100644 (file)
@@ -164,7 +164,7 @@ xchk_fsgates_disable(
        trace_xchk_fsgates_disable(sc, sc->flags & XCHK_FSGATES_ALL);
 
        if (sc->flags & XCHK_FSGATES_DRAIN)
-               xfs_drain_wait_disable();
+               xfs_defer_drain_wait_disable();
 
        if (sc->flags & XCHK_FSGATES_QUOTA)
                xfs_dqtrx_hook_disable();
index 5ede81fadbd8ca6756ae8c163d590ad1657e6c60..fa5f31931efdb573104c7a3f9aa707f49c60c765 100644 (file)
 #include "xfs_trace.h"
 
 /*
- * Use a static key here to reduce the overhead of xfs_drain_rele.  If the
- * compiler supports jump labels, the static branch will be replaced by a nop
- * sled when there are no xfs_drain_wait callers.  Online fsck is currently
- * the only caller, so this is a reasonable tradeoff.
+ * Use a static key here to reduce the overhead of xfs_defer_drain_rele.  If
+ * the compiler supports jump labels, the static branch will be replaced by a
+ * nop sled when there are no xfs_defer_drain_wait callers.  Online fsck is
+ * currently the only caller, so this is a reasonable tradeoff.
  *
  * Note: Patching the kernel code requires taking the cpu hotplug lock.  Other
  * parts of the kernel allocate memory with that lock held, which means that
  * XFS callers cannot hold any locks that might be used by memory reclaim or
  * writeback when calling the static_branch_{inc,dec} functions.
  */
-static DEFINE_STATIC_KEY_FALSE(xfs_drain_waiter_gate);
+static DEFINE_STATIC_KEY_FALSE(xfs_defer_drain_waiter_gate);
 
 void
-xfs_drain_wait_disable(void)
+xfs_defer_drain_wait_disable(void)
 {
-       static_branch_dec(&xfs_drain_waiter_gate);
+       static_branch_dec(&xfs_defer_drain_waiter_gate);
 }
 
 void
-xfs_drain_wait_enable(void)
+xfs_defer_drain_wait_enable(void)
 {
-       static_branch_inc(&xfs_drain_waiter_gate);
+       static_branch_inc(&xfs_defer_drain_waiter_gate);
 }
 
 void
@@ -71,7 +71,7 @@ static inline bool has_waiters(struct wait_queue_head *wq_head)
 static inline void xfs_defer_drain_rele(struct xfs_defer_drain *dr)
 {
        if (atomic_dec_and_test(&dr->dr_count) &&
-           static_branch_unlikely(&xfs_drain_waiter_gate) &&
+           static_branch_unlikely(&xfs_defer_drain_waiter_gate) &&
            has_waiters(&dr->dr_waiters))
                wake_up(&dr->dr_waiters);
 }
index efcf88df9a5e70e5a493341898032777d0729cea..4d446dbf65e519231b5a2b2fa063741e8e4225df 100644 (file)
@@ -26,8 +26,8 @@ struct xfs_defer_drain {
 void xfs_defer_drain_init(struct xfs_defer_drain *dr);
 void xfs_defer_drain_free(struct xfs_defer_drain *dr);
 
-void xfs_drain_wait_disable(void);
-void xfs_drain_wait_enable(void);
+void xfs_defer_drain_wait_disable(void);
+void xfs_defer_drain_wait_enable(void);
 
 /*
  * Deferred Work Intent Drains
@@ -61,6 +61,9 @@ void xfs_drain_wait_enable(void);
  * All functions that create work items must increment the intent counter as
  * soon as the item is added to the transaction and cannot drop the counter
  * until the item is finished or cancelled.
+ *
+ * The same principles apply to realtime groups because the rt metadata inode
+ * ILOCKs are not held across transaction rolls.
  */
 struct xfs_group *xfs_group_intent_get(struct xfs_mount *mp,
                xfs_fsblock_t fsbno, enum xfs_group_type type);