xfs: ensure we have blocks available before taking the iolock
author     Christoph Hellwig <hch@lst.de>
           Wed, 24 Jul 2024 14:20:34 +0000 (07:20 -0700)
committer  Christoph Hellwig <hch@lst.de>
           Wed, 24 Jul 2024 14:22:12 +0000 (07:22 -0700)
With the last patch we have the infrastructure in place to take space
reservations before acquiring the iolock and thus avoid the GC deadlock
in generic/269.  But right now it will happily take space that has
been freed in a used zone and that would still require GC.  Add a new
RTAVAILABLE counter of blocks that are actually directly available to
be written into, in addition to the classic free counter.  Only allow
a write to go ahead if it has blocks available to write into, and
otherwise wait for GC.  This also requires tweaking the need-GC
condition a bit, as we now always need to GC if someone is waiting for
space.
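
In outline, the reservation done before taking the iolock now has two
stages (a simplified sketch of the logic added to xfs_zoned_space_reserve
below; the waiter-list handling and error unwinding are trimmed):

        /* worst case: a partial block at the start and at the end */
        rtxlen = xfs_rtb_to_rtx(mp, XFS_B_TO_FSB(mp, count) + 1);

        error = xfs_dec_frextents(mp, rtxlen);  /* classic free counter */
        if (error)
                return error;

        /* only go ahead if the blocks are directly writable */
        error = xfs_dec_freecounter(mp, FREE_RTAVAILABLE, rtxlen, false);
        if (error == -ENOSPC) {
                if (nowait)
                        error = -EAGAIN;
                else    /* sleep until GC has made blocks available */
                        error = xfs_zoned_space_wait(ip, rtxlen);
        }
        if (error)
                xfs_add_frextents(mp, rtxlen);  /* undo the first stage */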

Because GC always allocates from the reserved pool, which gets
replenished first, we can also do away with the ratio used to favor it.
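
For reference, GC now carves its blocks directly out of the reserved
pool for both counters under m_sb_lock, roughly as in the
xfs_zone_gc_allocate hunk below (sketch; the warning printed on
depletion is omitted):

        spin_lock(&mp->m_sb_lock);
        if (rtxlen > mp->m_resblks[FREE_RTEXTENTS].avail ||
            rtxlen > mp->m_resblks[FREE_RTAVAILABLE].avail) {
                /* the caller is expected to have ensured there is space */
                spin_unlock(&mp->m_sb_lock);
                return NULLFSBLOCK;
        }
        mp->m_resblks[FREE_RTEXTENTS].avail -= rtxlen;
        mp->m_resblks[FREE_RTAVAILABLE].avail -= rtxlen;
        spin_unlock(&mp->m_sb_lock);

Since the reserved pool is refilled before the regular counters, GC can
keep making progress even while user writes are queued up waiting.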

Signed-off-by: Christoph Hellwig <hch@lst.de>
fs/xfs/libxfs/xfs_bmap.c
fs/xfs/xfs_file.c
fs/xfs/xfs_iomap.c
fs/xfs/xfs_mount.c
fs/xfs/xfs_mount.h
fs/xfs/xfs_super.c
fs/xfs/xfs_zone_alloc.c
fs/xfs/xfs_zone_alloc.h
fs/xfs/xfs_zone_gc.c

index eed18754167c24f6286b8ac0db4ea32233164766..2fa6890e87edbb369fb77559c276a6d3ee1f8c0f 100644 (file)
@@ -4159,13 +4159,30 @@ retry:
                error = xfs_dec_frextents(mp, xfs_rtb_to_rtx(mp, alen));
                if (error)
                        goto out_unreserve_quota;
+
+               /*
+                * For block zeroing we can end up here even on a zoned file
+                * system, as we can't do the pre-iolock reservation for
+                * truncates that get called with it held from the VFS.  So
+                * we try to dip into the available pool here, but never
+                * actually wait for GC to avoid the deadlock.  Because we
+                * only ever zero one block at a time this generally works;
+                * if it does not, the zeroing operation will simply fail
+                * with -ENOSPC.
+                */
+               if (xfs_is_zoned_inode(ip)) {
+                       error = xfs_dec_freecounter(mp, FREE_RTAVAILABLE,
+                                       xfs_rtb_to_rtx(mp, alen), false);
+                       if (error)
+                               goto out_unreserve_frextents;
+               }
        } else {
                fdblocks += alen;
        }
 
        error = xfs_dec_fdblocks(mp, fdblocks, false);
        if (error)
-               goto out_unreserve_frextents;
+               goto out_unreserve_rtavailable;
 
        ip->i_delayed_blks += alen;
        xfs_mod_delalloc(ip, alen, indlen);
@@ -4189,6 +4206,9 @@ retry:
 
        return 0;
 
+out_unreserve_rtavailable:
+       if (xfs_is_zoned_inode(ip) && !ac)
+               xfs_zoned_add_available(mp, xfs_rtb_to_rtx(mp, alen));
 out_unreserve_frextents:
        if (ac)
                ac->reserved_blocks += alen;
@@ -5128,9 +5148,12 @@ xfs_bmap_del_extent_delay(
                ASSERT(!(bflags & XFS_BMAPI_REMAP));
                ac->reserved_blocks += del->br_blockcount;
        } else if (!(bflags & XFS_BMAPI_REMAP)) {
-               if (isrt)
+               if (isrt) {
+                       if (xfs_is_zoned_inode(ip))
+                               xfs_zoned_add_available(mp,
+                                       xfs_rtb_to_rtx(mp, del->br_blockcount));
                        xfs_add_frextents(mp, xfs_rtb_to_rtx(mp, del->br_blockcount));
-               else
+               } else
                        fdblocks += del->br_blockcount;
        }
 
index 95306b882ebe1890619e4750e56259514cf30fac..35266ae022978016795322946ee7f629308aef08 100644 (file)
@@ -615,7 +615,8 @@ xfs_zoned_write_space_reserve(
         * until taking the iolock for O_APPEND writes.
         */
 retry:
-       error = xfs_zoned_space_reserve(ip, count, ac);
+       error = xfs_zoned_space_reserve(ip, count, ac,
+                       iocb->ki_flags & IOCB_NOWAIT);
        if (error == -ENOSPC && !(iocb->ki_flags & IOCB_DIRECT) &&
            count > ip->i_mount->m_sb.sb_blocksize) {
                /*
@@ -1417,7 +1418,7 @@ xfs_write_fault(
                 * than a folio that's just fine.
                 */
                error = xfs_zoned_space_reserve(ip,
-                               folio_size(page_folio(vmf->page)), &ac);
+                               folio_size(page_folio(vmf->page)), &ac, false);
                if (error < 0)
                        return vmf_fs_error(error);
        }
index 2eefb0ddad7699faec88d63cfc15aaefe46eb39e..69c650fbbbdd9c68025d2ef01a1fc5e26bf7d812 100644 (file)
@@ -1077,6 +1077,7 @@ xfs_zoned_buffered_write_iomap_begin(
        ASSERT(!xfs_get_extsz_hint(ip));
        ASSERT(!(flags & IOMAP_UNSHARE));
        ASSERT(ac || (flags & IOMAP_ZERO));
+       ASSERT(!ac || !(flags & IOMAP_ZERO));
 
        if (xfs_is_shutdown(mp))
                return -EIO;
index f749d839e85ce2e3b59302ff2bbdd9f91bc25d9d..deb6d84e20d00ea26c5004c2920bf0c1b733346c 100644 (file)
@@ -469,7 +469,7 @@ xfs_default_resblks(
 {
        uint64_t resblks;
 
-       if (idx == FREE_RTEXTENTS) {
+       if (idx == FREE_RTEXTENTS || idx == FREE_RTAVAILABLE) {
                if (IS_ENABLED(CONFIG_XFS_RT) && xfs_has_zoned(mp))
                        return xfs_zoned_reserved_blocks(mp);
                return 0;
@@ -1248,7 +1248,7 @@ xfs_freecounter_unavailable(
        struct xfs_mount        *mp,
        unsigned int            idx)
 {
-       if (idx == FREE_RTEXTENTS)
+       if (idx == FREE_RTEXTENTS || idx == FREE_RTAVAILABLE)
                return 0;
        return mp->m_alloc_set_aside + atomic64_read(&mp->m_allocbt_blks);
 }
@@ -1347,7 +1347,7 @@ xfs_dec_freecounter(
                return 0;
        }
 
-       if (idx != FREE_RTEXTENTS)
+       if (idx == FREE_BLOCKS)
                xfs_warn_once(mp,
 "Reserve blocks depleted! Consider increasing reserve pool size.");
 
index 07a2a9b782f497f132d64ba6b0393c8118f5e2c1..5cfebd1faa1de2a1ffa35887f2672d7f303a0e0e 100644 (file)
@@ -72,6 +72,13 @@ struct xfs_inodegc {
        unsigned int            cpu;
 };
 
+enum {
+       FREE_BLOCKS,            /* free block counter */
+       FREE_RTEXTENTS,         /* free rt extent counter */
+       FREE_RTAVAILABLE,       /* actually available rt extents */
+       FREE_NR,
+};
+
 /*
  * The struct xfsmount layout is optimised to separate read-mostly variables
  * from variables that are frequently modified. We put the read-mostly variables
@@ -201,10 +208,6 @@ typedef struct xfs_mount {
        spinlock_t ____cacheline_aligned m_sb_lock; /* sb counter lock */
        struct percpu_counter   m_icount;       /* allocated inodes counter */
        struct percpu_counter   m_ifree;        /* free inodes counter */
-
-#define FREE_BLOCKS            0       /* free block counter */
-#define FREE_RTEXTENTS         1       /* free rt extent counter */
-#define FREE_NR                        2
        struct percpu_counter   m_free[FREE_NR];
 
        /*
@@ -247,9 +250,6 @@ typedef struct xfs_mount {
        struct delayed_work     m_reclaim_work; /* background inode reclaim */
        spinlock_t              m_reservation_lock;
        struct list_head        m_reclaim_reservations;
-       atomic64_t              m_zone_reclaim_head;
-       atomic64_t              m_zone_reservation_head;
-       uint64_t                m_reclaim_ratio;
        struct task_struct      *m_zone_gc_thread;
        struct dentry           *m_debugfs;     /* debugfs parent */
        struct xfs_kobj         m_kobj;
@@ -504,8 +504,6 @@ __XFS_HAS_FEAT(nouuid, NOUUID)
 #define XFS_OPSTATE_UNSET_LOG_INCOMPAT 11
 /* Filesystem can use logged extended attributes */
 #define XFS_OPSTATE_USE_LARP           12
-/* Zone space reservation required */
-#define XFS_OPSTATE_ZONE_RESERVATION_REQUIRED  13
 
 #define __XFS_IS_OPSTATE(name, NAME) \
 static inline bool xfs_is_ ## name (struct xfs_mount *mp) \
@@ -535,7 +533,6 @@ __XFS_IS_OPSTATE(quotacheck_running, QUOTACHECK_RUNNING)
 #endif
 __XFS_IS_OPSTATE(done_with_log_incompat, UNSET_LOG_INCOMPAT)
 __XFS_IS_OPSTATE(using_logged_xattrs, USE_LARP)
-__XFS_IS_OPSTATE(zone_reservation_required, ZONE_RESERVATION_REQUIRED)
 
 static inline bool
 xfs_should_warn(struct xfs_mount *mp, long nr)
index 2afbb7d116cdf443200a2eb1b812a688728a8cfa..05ea8f7f8452890d9bf7253a120751ce3c1fc004 100644 (file)
@@ -1056,7 +1056,7 @@ static int
 xfs_init_percpu_counters(
        struct xfs_mount        *mp)
 {
-       int             error;
+       int                     error, i;
 
        error = percpu_counter_init(&mp->m_icount, 0, GFP_KERNEL);
        if (error)
@@ -1066,30 +1066,28 @@ xfs_init_percpu_counters(
        if (error)
                goto free_icount;
 
-       error = percpu_counter_init(&mp->m_free[FREE_BLOCKS], 0, GFP_KERNEL);
-       if (error)
-               goto free_ifree;
-
        error = percpu_counter_init(&mp->m_delalloc_blks, 0, GFP_KERNEL);
        if (error)
-               goto free_fdblocks;
+               goto free_ifree;
 
        error = percpu_counter_init(&mp->m_delalloc_rtextents, 0, GFP_KERNEL);
        if (error)
                goto free_delalloc;
 
-       error = percpu_counter_init(&mp->m_free[FREE_RTEXTENTS], 0, GFP_KERNEL);
-       if (error)
-               goto free_delalloc_rt;
+       for (i = 0; i < FREE_NR; i++) {
+               error = percpu_counter_init(&mp->m_free[i], 0, GFP_KERNEL);
+               if (error)
+                       goto free_freecounters;
+       }
 
        return 0;
 
-free_delalloc_rt:
+free_freecounters:
+       while (--i >= 0)
+               percpu_counter_destroy(&mp->m_free[i]);
        percpu_counter_destroy(&mp->m_delalloc_rtextents);
 free_delalloc:
        percpu_counter_destroy(&mp->m_delalloc_blks);
-free_fdblocks:
-       percpu_counter_destroy(&mp->m_free[FREE_BLOCKS]);
 free_ifree:
        percpu_counter_destroy(&mp->m_ifree);
 free_icount:
@@ -1113,16 +1111,18 @@ static void
 xfs_destroy_percpu_counters(
        struct xfs_mount        *mp)
 {
+       int                     i;
+
+       for (i = 0; i < FREE_NR; i++)
+               percpu_counter_destroy(&mp->m_free[i]);
        percpu_counter_destroy(&mp->m_icount);
        percpu_counter_destroy(&mp->m_ifree);
-       percpu_counter_destroy(&mp->m_free[FREE_BLOCKS]);
        ASSERT(xfs_is_shutdown(mp) ||
               percpu_counter_sum(&mp->m_delalloc_rtextents) == 0);
        percpu_counter_destroy(&mp->m_delalloc_rtextents);
        ASSERT(xfs_is_shutdown(mp) ||
               percpu_counter_sum(&mp->m_delalloc_blks) == 0);
        percpu_counter_destroy(&mp->m_delalloc_blks);
-       percpu_counter_destroy(&mp->m_free[FREE_RTEXTENTS]);
 }
 
 static int
index c46f3644a0d796ceec04f92ef0c9ebead916fb64..0e2cd54fbd882e37f9d4d9534c6bf533cc31a98b 100644 (file)
@@ -507,17 +507,16 @@ xfs_zoned_show_stats(
 
        seq_puts(m, "\n");
 
-       seq_printf(m, "\ttotal free blocks: %lld\n",
+       seq_printf(m, "\tuser free blocks: %lld\n",
                xfs_estimate_freecounter(mp, FREE_RTEXTENTS));
        seq_printf(m, "\treserved free blocks: %lld\n",
-                       mp->m_resblks[FREE_RTEXTENTS].avail);
+               mp->m_resblks[FREE_RTEXTENTS].avail);
+       seq_printf(m, "\tuser available blocks: %lld\n",
+               xfs_estimate_freecounter(mp, FREE_RTAVAILABLE));
+       seq_printf(m, "\treserved available blocks: %lld\n",
+               mp->m_resblks[FREE_RTAVAILABLE].avail);
        seq_printf(m, "\treservations required: %d\n",
-               xfs_is_zone_reservation_required(mp));
-       seq_printf(m, "\treservation head: %lld\n",
-               atomic64_read(&mp->m_zone_reservation_head));
-       seq_printf(m, "\treclaim head: %lld\n",
-               atomic64_read(&mp->m_zone_reservation_head));
-       seq_printf(m, "\treclaim ratio: %lld\n", mp->m_reclaim_ratio);
+               !list_empty_careful(&mp->m_reclaim_reservations));
        seq_printf(m, "\tGC required: %d\n",
                xfs_zoned_need_gc(mp));
 
index 0dac533ce793f8942ae0ed05de74eaddfa82dd96..0241f77afb5f25fc5bbc34620a51c8791a3afa85 100644 (file)
@@ -42,17 +42,10 @@ struct xfs_zone_alloc_ctx {
 };
 
 int xfs_zoned_space_reserve(struct xfs_inode *ip, size_t count,
-               struct xfs_zone_alloc_ctx *ac);
+               struct xfs_zone_alloc_ctx *ac, bool nowait);
 void xfs_zoned_space_unreserve(struct xfs_inode *ip,
                struct xfs_zone_alloc_ctx *ac);
-
-/*
- * We aim to keep enough zones free in stock to fully use the open zone limit
- * for data placement purposes.
- */
-static inline bool xfs_zoned_need_gc(struct xfs_mount *mp)
-{
-       return atomic_read(&mp->m_nr_free_zones) <= mp->m_max_open_zones;
-}
+void xfs_zoned_add_available(struct xfs_mount *mp, xfs_filblks_t count_fsb);
+bool xfs_zoned_need_gc(struct xfs_mount *mp);
 
 #endif /* _XFS_ZONE_ALLOC_H */
index d1332aef77b578a8deec2a4ff483c7aee58febfd..9e9f3ea9d67a206690aea6125ba12d92e8978947 100644 (file)
 #include "xfs_trace.h"
 
 struct xfs_zone_reservation {
+       struct list_head        entry;
        struct task_struct      *task;
-       uint64_t                target;
-       struct  list_head       reservation_entry;
+       xfs_rtxnum_t            rtxlen;
+       bool                    done;
 };
 
 /*
@@ -44,165 +45,141 @@ xfs_zoned_reserved_blocks(
                XFS_B_TO_FSB(mp, mp->m_zoned_op);
 }
 
-static int64_t
-xfs_zoned_user_to_reclaim_count(
-       struct xfs_mount                *mp,
-       size_t                          count)
+/*
+ * We aim to keep enough zones free in stock to fully use the open zone limit
+ * for data placement purposes.
+ */
+bool
+xfs_zoned_need_gc(
+       struct xfs_mount        *mp)
 {
-       /*
-        * We give the gc reclaim a slight 10% advantage to
-        * drift towards a state where we can stop throtteling
-        */
-       return div_u64(READ_ONCE(mp->m_reclaim_ratio) * count, 100 - 10);
+       if (!xa_marked(&mp->m_rtgroups, XFS_RTG_RECLAIMABLE))
+               return false;
+       if (!list_empty_careful(&mp->m_reclaim_reservations))
+               return true;
+       if (xfs_estimate_freecounter(mp, FREE_RTAVAILABLE) <
+           mp->m_rgblocks * (mp->m_max_open_zones - XFS_OPEN_GC_ZONES))
+               return true;
+       return false;
 }
 
 static void
-xfs_zoned_require_reservations(
-       struct xfs_mount        *mp)
+xfs_zoned_wake_all(
+       struct xfs_mount                *mp)
 {
-       uint64_t                start_bytes = 0;
-
-       if (xfs_is_zone_reservation_required(mp))
-               return;
+       struct xfs_zone_reservation     *reservation;
 
        spin_lock(&mp->m_reservation_lock);
-       /*
-        * We're low on space and require reservations.
-        *
-        * Make sure we won't let any user reservations through before we have
-        * free space for them.  E.g. if we're completely out of free zones, we
-        * will have to wait for a whole zone to be reclaimed before starting to
-        * release waiters, as the minimum space we can reclaim is one zone.
-        */
-       if (atomic_read(&mp->m_nr_free_zones) < 1)
-               start_bytes = XFS_FSB_TO_B(mp, mp->m_rgblocks);
-
-       atomic64_set(&mp->m_zone_reservation_head,
-               xfs_zoned_user_to_reclaim_count(mp, start_bytes));
-       atomic64_set(&mp->m_zone_reclaim_head, 0);
-       xfs_set_zone_reservation_required(mp);
+       list_for_each_entry(reservation, &mp->m_reclaim_reservations, entry)
+               wake_up_process(reservation->task);
        spin_unlock(&mp->m_reservation_lock);
 }
 
-static void
-xfs_zoned_release_all_reservations(
-       struct xfs_mount                *mp)
+void
+xfs_zoned_add_available(
+       struct xfs_mount                *mp,
+       xfs_filblks_t                   count_fsb)
 {
+       xfs_rtxnum_t                    rtxlen = xfs_rtb_to_rtx(mp, count_fsb);
        struct xfs_zone_reservation     *reservation;
-       struct xfs_zone_reservation     *tmp;
 
-       if (!xfs_is_zone_reservation_required(mp))
+       if (list_empty_careful(&mp->m_reclaim_reservations)) {
+               xfs_add_freecounter(mp, FREE_RTAVAILABLE, rtxlen);
                return;
+       }
 
        spin_lock(&mp->m_reservation_lock);
-       xfs_clear_zone_reservation_required(mp);
-       list_for_each_entry_safe(reservation, tmp, &mp->m_reclaim_reservations,
-                       reservation_entry) {
+       xfs_add_freecounter(mp, FREE_RTAVAILABLE, rtxlen);
+       rtxlen = xfs_sum_freecounter(mp, FREE_RTAVAILABLE);
+       list_for_each_entry(reservation, &mp->m_reclaim_reservations, entry) {
+               if (reservation->rtxlen > rtxlen)
+                       break;
                wake_up_process(reservation->task);
+               rtxlen -= reservation->rtxlen;
+
        }
        spin_unlock(&mp->m_reservation_lock);
 }
 
-static void
-xfs_zoned_move_reclaim_head(
-       struct xfs_mount                *mp,
-       size_t                          count)
-
+static int
+xfs_zoned_space_wait(
+       struct xfs_inode                *ip,
+       xfs_rtxnum_t                    rtxlen)
 {
-       struct xfs_zone_reservation     *reservation;
-       struct xfs_zone_reservation     *tmp;
-       int64_t                         reclaim_head;
-       int64_t                         reservation_head;
-
-       if (!xfs_is_zone_reservation_required(mp))
-               return;
-
-       reclaim_head = atomic64_add_return(count,
-                       &mp->m_zone_reclaim_head);
-       reservation_head = atomic64_read(&mp->m_zone_reservation_head);
-
-       /*
-        * If the previous reclaim head was ahead of the reservation head, no
-        * user waits should be waiting, so avoid taking the lock.  In the very
-        * unlikely case of a race, we'll wake up the user write on the next
-        * reclaim write.
-        */
-       if (reclaim_head - count > reservation_head)
-               return;
+       struct xfs_mount                *mp = ip->i_mount;
+       struct xfs_zone_reservation     reservation = {
+               .task           = current,
+               .rtxlen         = rtxlen,
+       };
+       int                             error;
 
        spin_lock(&mp->m_reservation_lock);
-       if (xfs_is_zone_reservation_required(mp)) {
-               list_for_each_entry_safe(reservation, tmp,
-                               &mp->m_reclaim_reservations,
-                               reservation_entry) {
-                       if (reservation->target < reclaim_head)
-                               wake_up_process(reservation->task);
+       do {
+               if (xfs_is_shutdown(mp)) {
+                       error = -EIO;
+                       break;
                }
+               list_add_tail(&reservation.entry, &mp->m_reclaim_reservations);
+               set_current_state(TASK_KILLABLE);
+               spin_unlock(&mp->m_reservation_lock);
 
-       }
+               error = xfs_dec_freecounter(mp, FREE_RTAVAILABLE, rtxlen,
+                               false);
+               if (error == -ENOSPC &&
+                   xa_marked(&mp->m_rtgroups, XFS_RTG_RECLAIMABLE)) {
+                       schedule();
+                       if (fatal_signal_pending(current))
+                               error = -EINTR;
+               }
+
+               spin_lock(&mp->m_reservation_lock);
+               list_del(&reservation.entry);
+       } while (error == -ENOSPC &&
+                xa_marked(&mp->m_rtgroups, XFS_RTG_RECLAIMABLE));
        spin_unlock(&mp->m_reservation_lock);
+
+       __set_current_state(TASK_RUNNING);
+       return error;
 }
 
 int
 xfs_zoned_space_reserve(
        struct xfs_inode                *ip,
        size_t                          count,
-       struct xfs_zone_alloc_ctx       *ac)
+       struct xfs_zone_alloc_ctx       *ac,
+       bool                            nowait)
 {
        struct xfs_mount                *mp = ip->i_mount;
-       xfs_filblks_t                   count_fsb;
-       struct xfs_zone_reservation     reservation;
-       int64_t                         needed;
+       xfs_rtxnum_t                    rtxlen;
        int                             error;
 
        /*
         * Round up the block count as a write could hit partial blocks at the
         * start and the end.
         */
-       count_fsb = XFS_B_TO_FSB(mp, count) + 1;
+       ac->reserved_blocks = XFS_B_TO_FSB(mp, count) + 1;
+       ac->cached_rtg = NULL;
 
-       error = xfs_dec_frextents(mp, xfs_rtb_to_rtx(mp, count_fsb));
+       rtxlen = xfs_rtb_to_rtx(mp, ac->reserved_blocks);
+       error = xfs_dec_frextents(mp, rtxlen);
        if (error)
                return error;
 
-       if (!xfs_is_zone_reservation_required(mp))
-               goto done;
-
-       needed = xfs_zoned_user_to_reclaim_count(mp,
-                       XFS_FSB_TO_B(mp, count_fsb));
-       reservation.target = atomic64_add_return(needed,
-                               &mp->m_zone_reservation_head);
-       if (reservation.target < atomic64_read(&mp->m_zone_reclaim_head))
-               goto done;
-
-       reservation.task = current;
-
-       spin_lock(&mp->m_reservation_lock);
-       if (!xfs_is_zone_reservation_required(mp)) {
-               spin_unlock(&mp->m_reservation_lock);
-               goto done;
-       }
-
-       list_add_tail(&reservation.reservation_entry,
-                       &mp->m_reclaim_reservations);
-
-       __set_current_state(TASK_KILLABLE);
-       spin_unlock(&mp->m_reservation_lock);
-
-       schedule();
-
-       spin_lock(&mp->m_reservation_lock);
-       list_del(&reservation.reservation_entry);
-       spin_unlock(&mp->m_reservation_lock);
-
-       if (fatal_signal_pending(current)) {
-               xfs_add_frextents(mp, xfs_rtb_to_rtx(mp, count_fsb));
-               return -EINTR;
+       if (list_empty_careful(&mp->m_reclaim_reservations)) {
+               error = xfs_dec_freecounter(mp, FREE_RTAVAILABLE, rtxlen,
+                               false);
+               if (error != -ENOSPC)
+                       goto out;
+               if (nowait) {
+                       error = -EAGAIN;
+                       goto out;
+               }
        }
-done:
-       ac->reserved_blocks = count_fsb;
-       ac->cached_rtg = NULL;
-       return 0;
+       error = xfs_zoned_space_wait(ip, rtxlen);
+out:
+       if (error)
+               xfs_add_frextents(mp, rtxlen);
+       return error;
 }
 
 void
@@ -210,14 +187,12 @@ xfs_zoned_space_unreserve(
        struct xfs_inode                *ip,
        struct xfs_zone_alloc_ctx       *ac)
 {
-       struct xfs_mount                *mp = ip->i_mount;
-       xfs_filblks_t                   count_fsb = ac->reserved_blocks;
+       if (ac->reserved_blocks > 0) {
+               struct xfs_mount        *mp = ip->i_mount;
 
-       if (count_fsb > 0) {
-               xfs_zoned_move_reclaim_head(mp,
-                       xfs_zoned_user_to_reclaim_count(mp,
-                               XFS_FSB_TO_B(mp, count_fsb)));
-               xfs_add_frextents(mp, xfs_rtb_to_rtx(mp, count_fsb));
+               xfs_zoned_add_available(mp, ac->reserved_blocks);
+               xfs_add_freecounter(mp, FREE_RTEXTENTS,
+                               xfs_rtb_to_rtx(mp, ac->reserved_blocks));
        }
        xfs_zone_finish_alloc(ac->cached_rtg);
 }
@@ -510,6 +485,9 @@ xfs_zone_reclaim_pick(
        unsigned long           index = 0;
        bool                    easy = false;
 
+       if (!xfs_zoned_need_gc(mp))
+               return false;
+
        rcu_read_lock();
        xa_for_each_marked(&mp->m_rtgroups, index, rtg, XFS_RTG_RECLAIMABLE) {
                u64 used = *xfs_zone_used_counter(rtg);
@@ -540,9 +518,6 @@ xfs_zone_reclaim_pick(
 
        xa_clear_mark(&mp->m_rtgroups, victim_rtg->rtg_rgno,
                        XFS_RTG_RECLAIMABLE);
-       WRITE_ONCE(mp->m_reclaim_ratio,
-               div_u64(100 * victim_used,
-                       victim_rtg->rtg_blockcount - victim_used));
 
        xfs_info(mp, "reclaiming zone %d, used = %lld/%u (%s)",
                victim_rtg->rtg_rgno, victim_used,
@@ -578,17 +553,32 @@ xfs_zone_gc_allocate(
 {
        struct xfs_rtgroup      *rtg = data->mp->m_open_gc_zone;
        struct xfs_mount        *mp = rtg->rtg_mount;
+       xfs_rtxnum_t            rtxlen = xfs_rtb_to_rtx(mp, count_fsb);
        xfs_rgblock_t           rgbno = 0;
-       int                     error;
 
        /* the caller must have ensured there is enough space */
        ASSERT(rtg->rtg_blockcount - rtg->rtg_write_pointer >= count_fsb);
-       error = xfs_dec_freecounter(mp, FREE_RTEXTENTS,
-                       xfs_rtb_to_rtx(mp, count_fsb), true);
-       if (error) {
-               ASSERT(!error);
+
+       /*
+        * Directly allocate GC blocks from the reserved pool.
+        * If we took them from the normal pool we could be stealing
+        * blocks from a regular writer, which would then have to wait
+        * for GC and deadlock.
+        */
+       spin_lock(&mp->m_sb_lock);
+       if (rtxlen > mp->m_resblks[FREE_RTEXTENTS].avail ||
+           rtxlen > mp->m_resblks[FREE_RTAVAILABLE].avail) {
+               xfs_crit(mp,
+"out of space for garbage collection (rtxlen = %lld, free = %lld, avail = %lld).",
+                       rtxlen,
+                       mp->m_resblks[FREE_RTEXTENTS].avail,
+                       mp->m_resblks[FREE_RTAVAILABLE].avail);
+               spin_unlock(&mp->m_sb_lock);
                return NULLFSBLOCK;
        }
+       mp->m_resblks[FREE_RTEXTENTS].avail -= rtxlen;
+       mp->m_resblks[FREE_RTAVAILABLE].avail -= rtxlen;
+       spin_unlock(&mp->m_sb_lock);
 
        *is_seq = test_bit(RTG_F_SEQUENTIAL, &rtg->rtg_flags);
        if (!*is_seq)
@@ -619,17 +609,8 @@ xfs_zone_gc_space_available(
        struct xfs_mount        *mp = data->mp;
        struct xfs_rtgroup      *rtg = mp->m_open_gc_zone;
 
-       if (!iter->victim_rtg) {
-               if (!xfs_zoned_need_gc(mp) ||
-                   !xa_marked(&mp->m_rtgroups, XFS_RTG_RECLAIMABLE)) {
-                       xfs_zoned_release_all_reservations(mp);
-                       return 0;
-               }
-
-               xfs_zoned_require_reservations(mp);
-               if (!xfs_zone_reclaim_pick(mp, iter))
-                       return 0;
-       }
+       if (!iter->victim_rtg && !xfs_zone_reclaim_pick(mp, iter))
+               return 0;
 
        if (rtg && rtg->rtg_write_pointer == rtg->rtg_blockcount) {
                /*
@@ -818,8 +799,6 @@ xfs_zone_gc_finish_chunk(
                        XFS_BB_TO_FSB(mp, bio->bi_iter.bi_sector);
        error = xfs_zoned_end_cow(ip, chunk->offset, chunk->len,
                        chunk->new_startblock, chunk->old_startblock);
-       if (!error)
-               xfs_zoned_move_reclaim_head(mp, chunk->len);
 free:
        if (error)
                xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
@@ -843,6 +822,8 @@ xfs_zone_gc_finish_reset(
        atomic_inc(&mp->m_nr_free_zones);
        spin_unlock(&mp->m_zone_list_lock);
 
+       xfs_zoned_add_available(mp, rtg->rtg_blockcount);
+
        wake_up_all(&mp->m_zone_wait);
 out:
        bio_put(bio);
@@ -1007,6 +988,8 @@ xfs_zoned_gcd(
                        continue;
 
                if (!data->inflight) {
+                       xfs_zoned_wake_all(mp);
+
                        if (kthread_should_stop()) {
                                __set_current_state(TASK_RUNNING);
                                break;
@@ -1027,7 +1010,6 @@ xfs_zoned_gcd(
        if (mp->m_open_gc_zone)
                xfs_rtgroup_rele(mp->m_open_gc_zone);
 
-       xfs_zoned_release_all_reservations(mp);
        memalloc_nofs_restore(nofs_flag);
        kfree(iter);
 out_free_data:
@@ -1099,7 +1081,8 @@ xfs_get_zone_info_cb(
 static int
 xfs_init_zone(
        struct xfs_rtgroup      *rtg,
-       uint64_t                *freeblocks)
+       uint64_t                *available,
+       uint64_t                *freedblocks)
 {
        struct xfs_mount        *mp = rtg->rtg_mount;
        uint64_t                used = *xfs_zone_used_counter(rtg);
@@ -1137,18 +1120,18 @@ xfs_init_zone(
                /* zone is free */
                list_add_tail(&rtg->rtg_entry, &mp->m_free_zones);
                atomic_inc(&mp->m_nr_free_zones);
-               *freeblocks += rtg->rtg_blockcount;
+               *available += rtg->rtg_blockcount;
        } else if (rtg->rtg_write_pointer < rtg->rtg_blockcount) {
                /* zone is open */
                list_add(&rtg->rtg_entry, &mp->m_open_zones);
                mp->m_nr_open_zones++;
                set_bit(RTG_F_OPEN, &rtg->rtg_flags);
-               *freeblocks += (rtg->rtg_blockcount - rtg->rtg_write_pointer);
+               *available += (rtg->rtg_blockcount - rtg->rtg_write_pointer);
        } else if (used < rtg->rtg_blockcount) {
                /* zone fully written, but has freed blocks */
                xa_set_mark(&mp->m_rtgroups, rtg->rtg_rgno,
                            XFS_RTG_RECLAIMABLE);
-               *freeblocks += (rtg->rtg_blockcount - used);
+               *freedblocks += (rtg->rtg_blockcount - used);
        }
 
        return 0;
@@ -1185,7 +1168,7 @@ xfs_mount_zones(
 {
        struct xfs_buftarg      *bt = mp->m_rtdev_targp;
        unsigned int            bdev_open_zones;
-       int64_t                 freeblocks = 0;
+       int64_t                 available = 0, freedblocks = 0;
        struct xfs_rtgroup      *rtg;
        xfs_rgnumber_t          rgno;
        int                     error;
@@ -1257,15 +1240,6 @@ xfs_mount_zones(
        spin_lock_init(&mp->m_reservation_lock);
        init_waitqueue_head(&mp->m_zone_wait);
 
-       atomic64_set(&mp->m_zone_reclaim_head, 0);
-       atomic64_set(&mp->m_zone_reservation_head, 0);
-
-       /*
-        * Assume a one-to-one reclaim ratio until we pick
-        * a zone for reclaim and update the estimate.
-        */
-       WRITE_ONCE(mp->m_reclaim_ratio, 100);
-
        xfs_info(mp, "%u zones of %u blocks size (%d max open)",
                 mp->m_sb.sb_rgcount, mp->m_rgblocks, mp->m_max_open_zones);
 
@@ -1295,13 +1269,15 @@ xfs_mount_zones(
        kthread_park(mp->m_zone_gc_thread);
 
        for_each_rtgroup(mp, rgno, rtg) {
-               error = xfs_init_zone(rtg, &freeblocks);
+               error = xfs_init_zone(rtg, &available, &freedblocks);
                if (error)
                        goto out_unlink_zones;
        }
 
+       percpu_counter_set(&mp->m_free[FREE_RTAVAILABLE],
+                       xfs_rtb_to_rtx(mp, available));
        percpu_counter_set(&mp->m_free[FREE_RTEXTENTS],
-                       xfs_rtb_to_rtx(mp, freeblocks));
+                       xfs_rtb_to_rtx(mp, available + freedblocks));
 
        /*
         * If there are no free zones available for GC, pick the open zone with