/* Place kernel structure only init below this point. */
spin_lock_init(&rtg->rtg_state_lock);
init_waitqueue_head(&rtg->rtg_active_wq);
+ xfs_defer_drain_init(&rtg->rtg_intents_drain);
#endif /* __KERNEL__ */
/* Active ref owned by mount indicates rtgroup is online. */
atomic_set(&rtg->rtg_active_ref, 1);
XFS_IS_CORRUPT(mp, atomic_read(&rtg->rtg_ref) != 0);
+#ifdef __KERNEL__
+ xfs_defer_drain_free(&rtg->rtg_intents_drain);
+#endif
+
/* drop the mount's active reference */
xfs_rtgroup_rele(rtg);
XFS_IS_CORRUPT(mp, atomic_read(&rtg->rtg_active_ref) != 0);
#ifdef __KERNEL__
/* -- kernel only structures below this line -- */
spinlock_t rtg_state_lock;
+
+ /*
+ * We use xfs_drain to track the number of deferred log intent items
+ * that have been queued (but not yet processed) so that waiters (e.g.
+ * scrub) will not lock resources when other threads are in the middle
+ * of processing a chain of intent items only to find momentary
+ * inconsistencies.
+ */
+ struct xfs_defer_drain rtg_intents_drain;
#endif /* __KERNEL__ */
};
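For context, the new drain field follows the same lifecycle as the per-AG pag_intents_drain: whoever queues a deferred rt intent must hold the drain for the life of the work item. A minimal sketch of the expected pairing, using the xfs_rtgroup_intent_get/put helpers added later in this patch (mp and rtbno here are placeholders for the mount and the rt block being modified, not new API):

/* Sketch only: drain lifecycle for one deferred rt work item. */
struct xfs_rtgroup	*rtg;

/* item creation: passive rtgroup ref + rtg_intents_drain.dr_count++ */
rtg = xfs_rtgroup_intent_get(mp, rtbno);

/* ... log the intent item, roll transactions, finish the chain ... */

/* item completion or cancellation: dr_count--, waking any scrub waiter */
xfs_rtgroup_intent_put(rtg);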
if (error)
return;
- xchk_rtgroup_lock(&info->sc->sr, XCHK_RTGLOCK_ALL);
+ error = xchk_rtgroup_lock(info->sc, &info->sc->sr, XCHK_RTGLOCK_ALL);
+ if (!xchk_fblock_process_error(info->sc, info->whichfork,
+ irec->br_startoff, &error))
+ goto out_free;
+
xchk_xref_is_used_rt_space(info->sc, irec->br_startblock,
irec->br_blockcount);
+out_free:
xchk_rtgroup_free(info->sc, &info->sc->sr);
}
return 0;
}
-void
+/* Lock all the rt group metadata inode ILOCKs and wait for intents. */
+int
xchk_rtgroup_lock(
+ struct xfs_scrub *sc,
struct xchk_rt *sr,
unsigned int rtglock_flags)
{
- xfs_rtgroup_lock(sr->rtg, rtglock_flags);
- sr->rtlock_flags = rtglock_flags;
+ int error = 0;
+
+ ASSERT(sr->rtg != NULL);
+
+ /*
+ * If we're /only/ locking the rtbitmap in shared mode, then we're
+ * obviously not trying to compare records in two metadata inodes.
+ * There's no need to drain intents here because the caller (most
+ * likely the rgsuper scanner) doesn't need that level of consistency.
+ */
+ if (rtglock_flags == XFS_RTGLOCK_BITMAP_SHARED) {
+ xfs_rtgroup_lock(sr->rtg, rtglock_flags);
+ sr->rtlock_flags = rtglock_flags;
+ return 0;
+ }
+
+ do {
+ if (xchk_should_terminate(sc, &error))
+ return error;
+
+ xfs_rtgroup_lock(sr->rtg, rtglock_flags);
+
+ /*
+ * If we've grabbed a non-metadata file for scrubbing, we
+ * assume that holding its ILOCK will suffice to coordinate
+ * with any rt intent chains involving this inode.
+ */
+ if (sc->ip && !xfs_is_metadata_inode(sc->ip)) {
+ sr->rtlock_flags = rtglock_flags;
+ return 0;
+ }
+
+ /*
+ * Decide if the rt group is quiet enough for all metadata to
+ * be consistent with each other. Regular file IO doesn't get
+ * to lock all the rt inodes at the same time, which means that
+ * there could be other threads in the middle of processing a
+ * chain of deferred ops.
+ *
+ * We just locked all the metadata inodes for this rt group;
+ * now take a look to see if there are any intents in progress.
+ * If there are, drop the rt group inode locks and wait for the
+ * intents to drain. Since we hold the rt group inode locks
+ * for the duration of the scrub, this is the only time we have
+ * to sample the intents counter; any threads increasing it
+ * after this point can't possibly be in the middle of a chain
+ * of rt metadata updates.
+ *
+ * Obviously, this should be slanted against scrub and in favor
+ * of runtime threads.
+ */
+ if (!xfs_rtgroup_intent_busy(sr->rtg)) {
+ sr->rtlock_flags = rtglock_flags;
+ return 0;
+ }
+
+ xfs_rtgroup_unlock(sr->rtg, rtglock_flags);
+
+ if (!(sc->flags & XCHK_FSGATES_DRAIN))
+ return -ECHRNG;
+ error = xfs_rtgroup_intent_drain(sr->rtg);
+ if (error == -ERESTARTSYS)
+ error = -EINTR;
+ } while (!error);
+
+ return error;
}
/*
trace_xchk_fsgates_enable(sc, scrub_fsgates);
if (scrub_fsgates & XCHK_FSGATES_DRAIN)
- xfs_drain_wait_enable();
+ xfs_defer_drain_wait_enable();
if (scrub_fsgates & XCHK_FSGATES_QUOTA)
xfs_dqtrx_hook_enable();
int xchk_rtgroup_init(struct xfs_scrub *sc, xfs_rgnumber_t rgno,
struct xchk_rt *sr);
-void xchk_rtgroup_lock(struct xchk_rt *sr, unsigned int rtglock_flags);
+int xchk_rtgroup_lock(struct xfs_scrub *sc, struct xchk_rt *sr,
+ unsigned int rtglock_flags);
void xchk_rtgroup_free(struct xfs_scrub *sc, struct xchk_rt *sr);
#else
# define xchk_rtgroup_init(sc, rgno, sr) (-ENOSYS)
-# define xchk_rtgroup_lock(sc, lockflags) do { } while (0)
+# define xchk_rtgroup_lock(sc, sr, lockflags) (-ENOSYS)
# define xchk_rtgroup_free(sc, sr) do { } while (0)
#endif /* CONFIG_XFS_RT */
error = xchk_rtgroup_init(sc, rgno, &sc->sr);
if (error)
return error;
- xchk_rtgroup_lock(&sc->sr, XFS_RTGLOCK_BITMAP_SHARED);
+ error = xchk_rtgroup_lock(sc, &sc->sr, XFS_RTGLOCK_BITMAP_SHARED);
+ if (error)
+ return error;
/*
* Since we already validated the rt superblock at mount time, we don't
struct xchk_rtbitmap *rtb;
int error;
+ if (xchk_need_intent_drain(sc))
+ xchk_fsgates_enable(sc, XCHK_FSGATES_DRAIN);
+
rtb = kzalloc(sizeof(struct xchk_rtbitmap), XCHK_GFP_FLAGS);
if (!rtb)
return -ENOMEM;
if (error)
return error;
+ error = xchk_rtgroup_lock(sc, &sc->sr, XCHK_RTGLOCK_ALL);
+ if (error)
+ return error;
+
/*
* Now that we've locked the rtbitmap, we can't race with growfsrt
* trying to expand the bitmap or change the size of the rt volume.
* Hence it is safe to compute and check the geometry values.
*/
- xchk_rtgroup_lock(&sc->sr, XFS_RTGLOCK_BITMAP);
if (mp->m_sb.sb_rblocks) {
rtb->rextents = xfs_blen_to_rtbxlen(mp, mp->m_sb.sb_rblocks);
rtb->rextslog = xfs_compute_rextslog(rtb->rextents);
if (error)
return error;
+ error = xchk_rtgroup_lock(sc, &sc->sr, XFS_RTGLOCK_BITMAP);
+ if (error)
+ return error;
+
/*
* Now that we've locked the rtbitmap and rtsummary, we can't race with
* growfsrt trying to expand the summary or change the size of the rt
* exclusively here. If we ever start caring about running concurrent
* fsmap with scrub this could be changed.
*/
- xchk_rtgroup_lock(&sc->sr, XFS_RTGLOCK_BITMAP);
if (mp->m_sb.sb_rblocks) {
rts->rextents = xfs_blen_to_rtbxlen(mp, mp->m_sb.sb_rblocks);
rts->rbmblocks = xfs_rtbitmap_blockcount(mp);
trace_xchk_fsgates_disable(sc, sc->flags & XCHK_FSGATES_ALL);
if (sc->flags & XCHK_FSGATES_DRAIN)
- xfs_drain_wait_disable();
+ xfs_defer_drain_wait_disable();
if (sc->flags & XCHK_FSGATES_QUOTA)
xfs_dqtrx_hook_disable();
{
if (xfs_ifork_is_realtime(bi->bi_owner, bi->bi_whichfork)) {
if (xfs_has_rtgroups(mp)) {
- xfs_rgnumber_t rgno;
-
- rgno = xfs_rtb_to_rgno(mp, bi->bi_bmap.br_startblock);
- bi->bi_rtg = xfs_rtgroup_get(mp, rgno);
+ bi->bi_rtg = xfs_rtgroup_intent_get(mp,
+ bi->bi_bmap.br_startblock);
} else {
bi->bi_rtg = NULL;
}
{
if (xfs_ifork_is_realtime(bi->bi_owner, bi->bi_whichfork)) {
if (xfs_has_rtgroups(bi->bi_owner->i_mount))
- xfs_rtgroup_put(bi->bi_rtg);
+ xfs_rtgroup_intent_put(bi->bi_rtg);
return;
}
#include "xfs_mount.h"
#include "xfs_ag.h"
#include "xfs_trace.h"
+#include "xfs_rtgroup.h"
/*
- * Use a static key here to reduce the overhead of xfs_drain_rele. If the
+ * Use a static key here to reduce the overhead of xfs_defer_drain_rele. If the
* compiler supports jump labels, the static branch will be replaced by a nop
- * sled when there are no xfs_drain_wait callers. Online fsck is currently
+ * sled when there are no xfs_defer_drain_wait callers. Online fsck is currently
* the only caller, so this is a reasonable tradeoff.
*
* Note: Patching the kernel code requires taking the cpu hotplug lock. Other
* XFS callers cannot hold any locks that might be used by memory reclaim or
* writeback when calling the static_branch_{inc,dec} functions.
*/
-static DEFINE_STATIC_KEY_FALSE(xfs_drain_waiter_gate);
+static DEFINE_STATIC_KEY_FALSE(xfs_defer_drain_waiter_gate);
void
-xfs_drain_wait_disable(void)
+xfs_defer_drain_wait_disable(void)
{
- static_branch_dec(&xfs_drain_waiter_gate);
+ static_branch_dec(&xfs_defer_drain_waiter_gate);
}
void
-xfs_drain_wait_enable(void)
+xfs_defer_drain_wait_enable(void)
{
- static_branch_inc(&xfs_drain_waiter_gate);
+ static_branch_inc(&xfs_defer_drain_waiter_gate);
}
void
static inline void xfs_defer_drain_rele(struct xfs_defer_drain *dr)
{
if (atomic_dec_and_test(&dr->dr_count) &&
- static_branch_unlikely(&xfs_drain_waiter_gate) &&
+ static_branch_unlikely(&xfs_defer_drain_waiter_gate) &&
has_waiters(&dr->dr_waiters))
wake_up(&dr->dr_waiters);
}
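The hunk above only shows the release side; the grab/wait/busy primitives that the new rtgroup wrappers call are not changed by this patch and therefore do not appear in the diff. Roughly, they behave like the following sketch (approximated from xfs_drain.c, not part of this series):

/* Approximate sketch of the existing drain primitives (not in this patch). */
static inline void xfs_defer_drain_grab(struct xfs_defer_drain *dr)
{
	atomic_inc(&dr->dr_count);
}

static inline bool xfs_defer_drain_busy(struct xfs_defer_drain *dr)
{
	return atomic_read(&dr->dr_count) > 0;
}

/* Returns -ERESTARTSYS if a fatal signal arrives while waiting. */
static int xfs_defer_drain_wait(struct xfs_defer_drain *dr)
{
	return wait_event_killable(dr->dr_waiters, !xfs_defer_drain_busy(dr));
}

The -ERESTARTSYS return from the killable wait is why xchk_rtgroup_lock above translates it to -EINTR before handing the error back to userspace.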
{
return xfs_defer_drain_busy(&pag->pag_intents_drain);
}
+
+#ifdef CONFIG_XFS_RT
+
+/*
+ * Get a passive reference to an rtgroup and declare an intent to update its
+ * metadata.
+ */
+struct xfs_rtgroup *
+xfs_rtgroup_intent_get(
+ struct xfs_mount *mp,
+ xfs_rtblock_t rtbno)
+{
+ struct xfs_rtgroup *rtg;
+ xfs_rgnumber_t rgno;
+
+ rgno = xfs_rtb_to_rgno(mp, rtbno);
+ rtg = xfs_rtgroup_get(mp, rgno);
+ if (!rtg)
+ return NULL;
+
+ xfs_rtgroup_intent_hold(rtg);
+ return rtg;
+}
+
+/*
+ * Release our intent to update this rtgroup's metadata, and then release our
+ * passive ref to the rtgroup.
+ */
+void
+xfs_rtgroup_intent_put(
+ struct xfs_rtgroup *rtg)
+{
+ xfs_rtgroup_intent_rele(rtg);
+ xfs_rtgroup_put(rtg);
+}
+
+/*
+ * Declare an intent to update rtgroup metadata. Other threads that need
+ * exclusive access can decide to back off if they see declared intentions.
+ */
+void
+xfs_rtgroup_intent_hold(
+ struct xfs_rtgroup *rtg)
+{
+ trace_xfs_rtgroup_intent_hold(rtg, __return_address);
+ xfs_defer_drain_grab(&rtg->rtg_intents_drain);
+}
+
+/* Release our intent to update this rtgroup's metadata. */
+void
+xfs_rtgroup_intent_rele(
+ struct xfs_rtgroup *rtg)
+{
+ trace_xfs_rtgroup_intent_rele(rtg, __return_address);
+ xfs_defer_drain_rele(&rtg->rtg_intents_drain);
+}
+
+/*
+ * Wait for the intent update count for this rtgroup to hit zero.
+ * Callers must not hold any rt metadata inode locks.
+ */
+int
+xfs_rtgroup_intent_drain(
+ struct xfs_rtgroup *rtg)
+{
+ trace_xfs_rtgroup_wait_intents(rtg, __return_address);
+ return xfs_defer_drain_wait(&rtg->rtg_intents_drain);
+}
+
+/* Has anyone declared an intent to update this rtgroup? */
+bool
+xfs_rtgroup_intent_busy(
+ struct xfs_rtgroup *rtg)
+{
+ return xfs_defer_drain_busy(&rtg->rtg_intents_drain);
+}
+#endif /* CONFIG_XFS_RT */
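Taken together, the intended consumer pattern for these helpers is the lock/sample/drain/retry loop that xchk_rtgroup_lock implements earlier in this patch; condensed into a sketch with error handling elided:

/* Sketch only: wait for an rtgroup to quiesce before trusting its metadata. */
do {
	xfs_rtgroup_lock(rtg, rtglock_flags);
	if (!xfs_rtgroup_intent_busy(rtg))
		break;		/* no intents in flight; keep the ILOCKs */
	xfs_rtgroup_unlock(rtg, rtglock_flags);
	error = xfs_rtgroup_intent_drain(rtg);	/* sleeps until dr_count hits zero */
} while (!error);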
#define XFS_DRAIN_H_
struct xfs_perag;
+struct xfs_rtgroup;
#ifdef CONFIG_XFS_DRAIN_INTENTS
/*
void xfs_defer_drain_init(struct xfs_defer_drain *dr);
void xfs_defer_drain_free(struct xfs_defer_drain *dr);
-void xfs_drain_wait_disable(void);
-void xfs_drain_wait_enable(void);
+void xfs_defer_drain_wait_disable(void);
+void xfs_defer_drain_wait_enable(void);
/*
* Deferred Work Intent Drains
* All functions that create work items must increment the intent counter as
* soon as the item is added to the transaction and cannot drop the counter
* until the item is finished or cancelled.
+ *
+ * The same principles apply to realtime groups because the rt metadata inode
+ * ILOCKs are not held across transaction rolls.
*/
struct xfs_perag *xfs_perag_intent_get(struct xfs_mount *mp,
xfs_fsblock_t fsbno);
int xfs_perag_intent_drain(struct xfs_perag *pag);
bool xfs_perag_intent_busy(struct xfs_perag *pag);
+
#else
struct xfs_defer_drain { /* empty */ };
#endif /* CONFIG_XFS_DRAIN_INTENTS */
+#if defined(CONFIG_XFS_DRAIN_INTENTS) && defined(CONFIG_XFS_RT)
+struct xfs_rtgroup *xfs_rtgroup_intent_get(struct xfs_mount *mp,
+ xfs_rtblock_t rtbno);
+void xfs_rtgroup_intent_put(struct xfs_rtgroup *rtg);
+
+void xfs_rtgroup_intent_hold(struct xfs_rtgroup *rtg);
+void xfs_rtgroup_intent_rele(struct xfs_rtgroup *rtg);
+
+int xfs_rtgroup_intent_drain(struct xfs_rtgroup *rtg);
+bool xfs_rtgroup_intent_busy(struct xfs_rtgroup *rtg);
+#else
+#define xfs_rtgroup_intent_get(mp, rtbno) \
+ xfs_rtgroup_get(mp, xfs_rtb_to_rgno((mp), (rtbno)))
+#define xfs_rtgroup_intent_put(rtg) xfs_rtgroup_put(rtg)
+static inline void xfs_rtgroup_intent_hold(struct xfs_rtgroup *rtg) { }
+static inline void xfs_rtgroup_intent_rele(struct xfs_rtgroup *rtg) { }
+#endif /* CONFIG_XFS_DRAIN_INTENTS && CONFIG_XFS_RT */
+
#endif /* XFS_DRAIN_H_ */
trace_xfs_extent_free_defer(mp, xefi);
if (xfs_efi_is_realtime(xefi)) {
- xfs_rgnumber_t rgno;
-
- rgno = xfs_rtb_to_rgno(mp, xefi->xefi_startblock);
- xefi->xefi_rtg = xfs_rtgroup_get(mp, rgno);
-
+ xefi->xefi_rtg = xfs_rtgroup_intent_get(mp,
+ xefi->xefi_startblock);
*dfpp = xfs_defer_add(tp, &xefi->xefi_list,
&xfs_rtextent_free_defer_type);
return;
xefi->xefi_agresv = XFS_AG_RESV_NONE;
xefi->xefi_owner = XFS_RMAP_OWN_UNKNOWN;
if (isrt) {
- xfs_rgnumber_t rgno;
-
xefi->xefi_flags |= XFS_EFI_REALTIME;
- rgno = xfs_rtb_to_rgno(mp, extp->ext_start);
- xefi->xefi_rtg = xfs_rtgroup_get(mp, rgno);
+ xefi->xefi_rtg = xfs_rtgroup_intent_get(mp, extp->ext_start);
} else {
xefi->xefi_pag = xfs_perag_intent_get(mp, extp->ext_start);
}
{
struct xfs_extent_free_item *xefi = xefi_entry(item);
- xfs_rtgroup_put(xefi->xefi_rtg);
+ xfs_rtgroup_intent_put(xefi->xefi_rtg);
kmem_cache_free(xfs_extfree_item_cache, xefi);
}
struct xfs_quotainfo;
struct xfs_da_geometry;
struct xfs_perag;
+struct xfs_rtgroup;
/* dynamic preallocation free space thresholds, 5% down to 1% */
enum {
* section updates.
*/
if (ri->ri_realtime) {
- xfs_rgnumber_t rgno;
-
- rgno = xfs_rtb_to_rgno(mp, ri->ri_bmap.br_startblock);
- ri->ri_rtg = xfs_rtgroup_get(mp, rgno);
+ ri->ri_rtg = xfs_rtgroup_intent_get(mp,
+ ri->ri_bmap.br_startblock);
xfs_defer_add(tp, &ri->ri_list, &xfs_rtrmap_update_defer_type);
} else {
- ri->ri_pag = xfs_perag_intent_get(mp, ri->ri_bmap.br_startblock);
+ ri->ri_pag = xfs_perag_intent_get(mp,
+ ri->ri_bmap.br_startblock);
xfs_defer_add(tp, &ri->ri_list, &xfs_rmap_update_defer_type);
}
}
XFS_EXT_UNWRITTEN : XFS_EXT_NORM;
ri->ri_realtime = isrt;
if (isrt) {
- xfs_rgnumber_t rgno;
-
- rgno = xfs_rtb_to_rgno(mp, map->me_startblock);
- ri->ri_rtg = xfs_rtgroup_get(mp, rgno);
+ ri->ri_rtg = xfs_rtgroup_intent_get(mp, map->me_startblock);
} else {
ri->ri_pag = xfs_perag_intent_get(mp, map->me_startblock);
}
{
struct xfs_rmap_intent *ri = ri_entry(item);
- xfs_rtgroup_put(ri->ri_rtg);
+ xfs_rtgroup_intent_put(ri->ri_rtg);
kmem_cache_free(xfs_rmap_intent_cache, ri);
}
DEFINE_PERAG_INTENTS_EVENT(xfs_perag_intent_rele);
DEFINE_PERAG_INTENTS_EVENT(xfs_perag_wait_intents);
+#ifdef CONFIG_XFS_RT
+DECLARE_EVENT_CLASS(xfs_rtgroup_intents_class,
+ TP_PROTO(struct xfs_rtgroup *rtg, void *caller_ip),
+ TP_ARGS(rtg, caller_ip),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(dev_t, rtdev)
+ __field(long, nr_intents)
+ __field(void *, caller_ip)
+ ),
+ TP_fast_assign(
+ __entry->dev = rtg->rtg_mount->m_super->s_dev;
+ __entry->rtdev = rtg->rtg_mount->m_rtdev_targp->bt_dev;
+ __entry->nr_intents = atomic_read(&rtg->rtg_intents_drain.dr_count);
+ __entry->caller_ip = caller_ip;
+ ),
+ TP_printk("dev %d:%d rtdev %d:%d intents %ld caller %pS",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ MAJOR(__entry->rtdev), MINOR(__entry->rtdev),
+ __entry->nr_intents,
+ __entry->caller_ip)
+);
+
+#define DEFINE_RTGROUP_INTENTS_EVENT(name) \
+DEFINE_EVENT(xfs_rtgroup_intents_class, name, \
+ TP_PROTO(struct xfs_rtgroup *rtg, void *caller_ip), \
+ TP_ARGS(rtg, caller_ip))
+DEFINE_RTGROUP_INTENTS_EVENT(xfs_rtgroup_intent_hold);
+DEFINE_RTGROUP_INTENTS_EVENT(xfs_rtgroup_intent_rele);
+DEFINE_RTGROUP_INTENTS_EVENT(xfs_rtgroup_wait_intents);
+#endif /* CONFIG_XFS_RT */
+
#endif /* CONFIG_XFS_DRAIN_INTENTS */
#ifdef CONFIG_XFS_MEMORY_BUFS