xfs: fix buffer lookup vs release race
Author:     Christoph Hellwig <hch@lst.de>
AuthorDate: Thu, 16 Jan 2025 06:01:42 +0000 (07:01 +0100)
Commit:     Carlos Maiolino <cem@kernel.org>
CommitDate: Thu, 16 Jan 2025 09:19:59 +0000 (10:19 +0100)
Since commit 298f34224506 ("xfs: lockless buffer lookup") the buffer
lookup fastpath is done without a hash-wide lock (then pag_buf_lock, now
bc_lock) and only under RCU protection.  But this means that nothing
serializes lookups against the temporary 0 reference count for buffers
that are added to the LRU after dropping the last regular reference,
and a concurrent lookup would fail to find them.
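
To illustrate the window, here is a minimal userspace model (hypothetical
names; C11 atomics and pthreads standing in for the kernel primitives, not
the actual XFS code): the release side briefly drops the count to 0 before
re-adding the reference that the LRU keeps, and an inc-not-zero style
lookup running in that window reports a miss for a buffer that stays
cached.  Whether the miss is actually observed depends on scheduling.

#include <pthread.h>
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

static atomic_int b_hold = 1;		/* one regular reference held */

/* Lookup fastpath: take a reference only if the count is non-zero. */
static bool inc_not_zero(atomic_int *v)
{
	int old = atomic_load(v);

	while (old != 0) {
		if (atomic_compare_exchange_weak(v, &old, old + 1))
			return true;
		/* 'old' was reloaded by the failed CAS; retry. */
	}
	return false;			/* observed 0: the lookup gives up */
}

static void *lookup_thread(void *arg)
{
	(void)arg;
	if (inc_not_zero(&b_hold))
		puts("lookup: got a reference");
	else
		puts("lookup: spurious miss (would return -ENOENT)");
	return NULL;
}

int main(void)
{
	pthread_t t;

	pthread_create(&t, NULL, lookup_thread, NULL);

	/*
	 * Release side, pre-patch behaviour: drop the last regular
	 * reference (the count hits 0 here) and only afterwards re-add
	 * the reference that the LRU keeps for the buffer.
	 */
	atomic_fetch_sub(&b_hold, 1);
	atomic_fetch_add(&b_hold, 1);

	pthread_join(t, NULL);
	return 0;
}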

Fix this by doing all b_hold modifications under b_lock.  We already do
this for release, so this "only" roughly doubles the b_lock round trips.
We'll later look into the lockref infrastructure to optimize the number
of lock round trips again.
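
For comparison, a compile-and-run sketch of the serialized scheme (again
with hypothetical userspace names, a pthread mutex standing in for
b_lock): because every b_hold transition happens under the lock, the
release path can go 1 -> 0 -> 1 without any lookup observing the
intermediate 0, which is what xfs_buf_try_hold() in the patch below
relies on.

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

struct buf {
	pthread_mutex_t	b_lock;
	unsigned int	b_hold;
};

/* Lookup side: modeled on xfs_buf_try_hold() from the patch. */
static bool buf_try_hold(struct buf *bp)
{
	bool held = false;

	pthread_mutex_lock(&bp->b_lock);
	if (bp->b_hold != 0) {
		bp->b_hold++;
		held = true;
	}
	pthread_mutex_unlock(&bp->b_lock);
	return held;
}

/* Release side: the transient 0 never escapes b_lock. */
static void buf_rele_keep_on_lru(struct buf *bp)
{
	pthread_mutex_lock(&bp->b_lock);
	bp->b_hold--;		/* drop the last regular reference */
	bp->b_hold++;		/* ... and hand it straight to the LRU */
	pthread_mutex_unlock(&bp->b_lock);
}

int main(void)
{
	struct buf b = { .b_lock = PTHREAD_MUTEX_INITIALIZER, .b_hold = 1 };

	buf_rele_keep_on_lru(&b);
	printf("try_hold after release: %d (b_hold=%u)\n",
	       buf_try_hold(&b), b.b_hold);
	return 0;
}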

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Darrick J. Wong <djwong@kernel.org>
Signed-off-by: Carlos Maiolino <cem@kernel.org>
fs/xfs/xfs_buf.c
fs/xfs/xfs_buf.h
fs/xfs/xfs_trace.h

diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
index 183428fbc6072b67d6c383ffe00cd0ffcb334f70..7fbdd4b30676ca163b51cf8a167980de9ece9f99 100644
--- a/fs/xfs/xfs_buf.c
+++ b/fs/xfs/xfs_buf.c
@@ -127,15 +127,6 @@ __xfs_buf_ioacct_dec(
        }
 }
 
-static inline void
-xfs_buf_ioacct_dec(
-       struct xfs_buf  *bp)
-{
-       spin_lock(&bp->b_lock);
-       __xfs_buf_ioacct_dec(bp);
-       spin_unlock(&bp->b_lock);
-}
-
 /*
  * When we mark a buffer stale, we remove the buffer from the LRU and clear the
  * b_lru_ref count so that the buffer is freed immediately when the buffer
@@ -171,9 +162,9 @@ xfs_buf_stale(
        atomic_set(&bp->b_lru_ref, 0);
        if (!(bp->b_state & XFS_BSTATE_DISPOSE) &&
            (list_lru_del_obj(&bp->b_target->bt_lru, &bp->b_lru)))
-               atomic_dec(&bp->b_hold);
+               bp->b_hold--;
 
-       ASSERT(atomic_read(&bp->b_hold) >= 1);
+       ASSERT(bp->b_hold >= 1);
        spin_unlock(&bp->b_lock);
 }
 
@@ -229,14 +220,14 @@ _xfs_buf_alloc(
         */
        flags &= ~(XBF_UNMAPPED | XBF_TRYLOCK | XBF_ASYNC | XBF_READ_AHEAD);
 
-       atomic_set(&bp->b_hold, 1);
+       spin_lock_init(&bp->b_lock);
+       bp->b_hold = 1;
        atomic_set(&bp->b_lru_ref, 1);
        init_completion(&bp->b_iowait);
        INIT_LIST_HEAD(&bp->b_lru);
        INIT_LIST_HEAD(&bp->b_list);
        INIT_LIST_HEAD(&bp->b_li_list);
        sema_init(&bp->b_sema, 0); /* held, no waiters */
-       spin_lock_init(&bp->b_lock);
        bp->b_target = target;
        bp->b_mount = target->bt_mount;
        bp->b_flags = flags;
@@ -580,6 +571,20 @@ xfs_buf_find_lock(
        return 0;
 }
 
+static bool
+xfs_buf_try_hold(
+       struct xfs_buf          *bp)
+{
+       spin_lock(&bp->b_lock);
+       if (bp->b_hold == 0) {
+               spin_unlock(&bp->b_lock);
+               return false;
+       }
+       bp->b_hold++;
+       spin_unlock(&bp->b_lock);
+       return true;
+}
+
 static inline int
 xfs_buf_lookup(
        struct xfs_buf_cache    *bch,
@@ -592,7 +597,7 @@ xfs_buf_lookup(
 
        rcu_read_lock();
        bp = rhashtable_lookup(&bch->bc_hash, map, xfs_buf_hash_params);
-       if (!bp || !atomic_inc_not_zero(&bp->b_hold)) {
+       if (!bp || !xfs_buf_try_hold(bp)) {
                rcu_read_unlock();
                return -ENOENT;
        }
@@ -655,7 +660,7 @@ xfs_buf_find_insert(
                spin_unlock(&bch->bc_lock);
                goto out_free_buf;
        }
-       if (bp && atomic_inc_not_zero(&bp->b_hold)) {
+       if (bp && xfs_buf_try_hold(bp)) {
                /* found an existing buffer */
                spin_unlock(&bch->bc_lock);
                error = xfs_buf_find_lock(bp, flags);
@@ -1037,7 +1042,10 @@ xfs_buf_hold(
        struct xfs_buf          *bp)
 {
        trace_xfs_buf_hold(bp, _RET_IP_);
-       atomic_inc(&bp->b_hold);
+
+       spin_lock(&bp->b_lock);
+       bp->b_hold++;
+       spin_unlock(&bp->b_lock);
 }
 
 static void
@@ -1045,10 +1053,15 @@ xfs_buf_rele_uncached(
        struct xfs_buf          *bp)
 {
        ASSERT(list_empty(&bp->b_lru));
-       if (atomic_dec_and_test(&bp->b_hold)) {
-               xfs_buf_ioacct_dec(bp);
-               xfs_buf_free(bp);
+
+       spin_lock(&bp->b_lock);
+       if (--bp->b_hold) {
+               spin_unlock(&bp->b_lock);
+               return;
        }
+       __xfs_buf_ioacct_dec(bp);
+       spin_unlock(&bp->b_lock);
+       xfs_buf_free(bp);
 }
 
 static void
@@ -1058,51 +1071,40 @@ xfs_buf_rele_cached(
        struct xfs_buftarg      *btp = bp->b_target;
        struct xfs_perag        *pag = bp->b_pag;
        struct xfs_buf_cache    *bch = xfs_buftarg_buf_cache(btp, pag);
-       bool                    release;
        bool                    freebuf = false;
 
        trace_xfs_buf_rele(bp, _RET_IP_);
 
-       ASSERT(atomic_read(&bp->b_hold) > 0);
-
-       /*
-        * We grab the b_lock here first to serialise racing xfs_buf_rele()
-        * calls. The pag_buf_lock being taken on the last reference only
-        * serialises against racing lookups in xfs_buf_find(). IOWs, the second
-        * to last reference we drop here is not serialised against the last
-        * reference until we take bp->b_lock. Hence if we don't grab b_lock
-        * first, the last "release" reference can win the race to the lock and
-        * free the buffer before the second-to-last reference is processed,
-        * leading to a use-after-free scenario.
-        */
        spin_lock(&bp->b_lock);
-       release = atomic_dec_and_lock(&bp->b_hold, &bch->bc_lock);
-       if (!release) {
+       ASSERT(bp->b_hold >= 1);
+       if (bp->b_hold > 1) {
                /*
                 * Drop the in-flight state if the buffer is already on the LRU
                 * and it holds the only reference. This is racy because we
                 * haven't acquired the pag lock, but the use of _XBF_IN_FLIGHT
                 * ensures the decrement occurs only once per-buf.
                 */
-               if ((atomic_read(&bp->b_hold) == 1) && !list_empty(&bp->b_lru))
+               if (--bp->b_hold == 1 && !list_empty(&bp->b_lru))
                        __xfs_buf_ioacct_dec(bp);
                goto out_unlock;
        }
 
-       /* the last reference has been dropped ... */
+       /* we are asked to drop the last reference */
+       spin_lock(&bch->bc_lock);
        __xfs_buf_ioacct_dec(bp);
        if (!(bp->b_flags & XBF_STALE) && atomic_read(&bp->b_lru_ref)) {
                /*
-                * If the buffer is added to the LRU take a new reference to the
+                * If the buffer is added to the LRU, keep the reference to the
                 * buffer for the LRU and clear the (now stale) dispose list
-                * state flag
+                * state flag, else drop the reference.
                 */
-               if (list_lru_add_obj(&btp->bt_lru, &bp->b_lru)) {
+               if (list_lru_add_obj(&btp->bt_lru, &bp->b_lru))
                        bp->b_state &= ~XFS_BSTATE_DISPOSE;
-                       atomic_inc(&bp->b_hold);
-               }
+               else
+                       bp->b_hold--;
                spin_unlock(&bch->bc_lock);
        } else {
+               bp->b_hold--;
                /*
                 * most of the time buffers will already be removed from the
                 * LRU, so optimise that case by checking for the
@@ -1748,13 +1750,14 @@ xfs_buftarg_drain_rele(
        struct xfs_buf          *bp = container_of(item, struct xfs_buf, b_lru);
        struct list_head        *dispose = arg;
 
-       if (atomic_read(&bp->b_hold) > 1) {
+       if (!spin_trylock(&bp->b_lock))
+               return LRU_SKIP;
+       if (bp->b_hold > 1) {
                /* need to wait, so skip it this pass */
+               spin_unlock(&bp->b_lock);
                trace_xfs_buf_drain_buftarg(bp, _RET_IP_);
                return LRU_SKIP;
        }
-       if (!spin_trylock(&bp->b_lock))
-               return LRU_SKIP;
 
        /*
         * clear the LRU reference count so the buffer doesn't get
@@ -2093,7 +2096,7 @@ xfs_buf_delwri_queue(
         */
        bp->b_flags |= _XBF_DELWRI_Q;
        if (list_empty(&bp->b_list)) {
-               atomic_inc(&bp->b_hold);
+               xfs_buf_hold(bp);
                list_add_tail(&bp->b_list, list);
        }
 
diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h
index 10bf66e074a0f40677773373ed05cc82c000d7eb..7e73663c5d4a580b35c75480aec24c88de3cd20d 100644
--- a/fs/xfs/xfs_buf.h
+++ b/fs/xfs/xfs_buf.h
@@ -168,7 +168,7 @@ struct xfs_buf {
 
        xfs_daddr_t             b_rhash_key;    /* buffer cache index */
        int                     b_length;       /* size of buffer in BBs */
-       atomic_t                b_hold;         /* reference count */
+       unsigned int            b_hold;         /* reference count */
        atomic_t                b_lru_ref;      /* lru reclaim ref count */
        xfs_buf_flags_t         b_flags;        /* status flags */
        struct semaphore        b_sema;         /* semaphore for lockables */
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
index 4fe689410eb6aea02402d72f8f9c53ccc6d89155..b29462363b815f71036f45c2d4f2237297bb7855 100644
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -498,7 +498,7 @@ DECLARE_EVENT_CLASS(xfs_buf_class,
                __entry->dev = bp->b_target->bt_dev;
                __entry->bno = xfs_buf_daddr(bp);
                __entry->nblks = bp->b_length;
-               __entry->hold = atomic_read(&bp->b_hold);
+               __entry->hold = bp->b_hold;
                __entry->pincount = atomic_read(&bp->b_pin_count);
                __entry->lockval = bp->b_sema.count;
                __entry->flags = bp->b_flags;
@@ -569,7 +569,7 @@ DECLARE_EVENT_CLASS(xfs_buf_flags_class,
                __entry->bno = xfs_buf_daddr(bp);
                __entry->length = bp->b_length;
                __entry->flags = flags;
-               __entry->hold = atomic_read(&bp->b_hold);
+               __entry->hold = bp->b_hold;
                __entry->pincount = atomic_read(&bp->b_pin_count);
                __entry->lockval = bp->b_sema.count;
                __entry->caller_ip = caller_ip;
@@ -612,7 +612,7 @@ TRACE_EVENT(xfs_buf_ioerror,
                __entry->dev = bp->b_target->bt_dev;
                __entry->bno = xfs_buf_daddr(bp);
                __entry->length = bp->b_length;
-               __entry->hold = atomic_read(&bp->b_hold);
+               __entry->hold = bp->b_hold;
                __entry->pincount = atomic_read(&bp->b_pin_count);
                __entry->lockval = bp->b_sema.count;
                __entry->error = error;
@@ -656,7 +656,7 @@ DECLARE_EVENT_CLASS(xfs_buf_item_class,
                __entry->buf_bno = xfs_buf_daddr(bip->bli_buf);
                __entry->buf_len = bip->bli_buf->b_length;
                __entry->buf_flags = bip->bli_buf->b_flags;
-               __entry->buf_hold = atomic_read(&bip->bli_buf->b_hold);
+               __entry->buf_hold = bip->bli_buf->b_hold;
                __entry->buf_pincount = atomic_read(&bip->bli_buf->b_pin_count);
                __entry->buf_lockval = bip->bli_buf->b_sema.count;
                __entry->li_flags = bip->bli_item.li_flags;
@@ -4978,7 +4978,7 @@ DECLARE_EVENT_CLASS(xfbtree_buf_class,
                __entry->xfino = file_inode(xfbt->target->bt_file)->i_ino;
                __entry->bno = xfs_buf_daddr(bp);
                __entry->nblks = bp->b_length;
-               __entry->hold = atomic_read(&bp->b_hold);
+               __entry->hold = bp->b_hold;
                __entry->pincount = atomic_read(&bp->b_pin_count);
                __entry->lockval = bp->b_sema.count;
                __entry->flags = bp->b_flags;