xfs: ensure we have blocks available before taking the iolock
author     Christoph Hellwig <hch@lst.de>
           Wed, 24 Jul 2024 14:20:34 +0000 (07:20 -0700)
committer  Christoph Hellwig <hch@lst.de>
           Wed, 24 Jul 2024 14:22:12 +0000 (07:22 -0700)
With the last patch we have the infrastructure in place to take space
reservations before acquiring the iolock and thus avoid the GC deadlock
in generic/269.  But right now it will happily take space that has
been freed in a used zone and that would still require GC.  Add a new
RTAVAILABLE counter of blocks that are actually directly available to
be written into, in addition to the classic free counter.  Only allow
a write to go ahead if it has blocks available to write into, and
otherwise wait for GC.  This also requires tweaking the need-GC
condition a bit, as we now always need to GC if someone is waiting for
space.
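
In outline, the reservation done before taking the iolock now has two
stages (a simplified sketch of the logic added to xfs_zoned_space_reserve
below; the waiter-list handling and error unwinding are trimmed):

        /* worst case: a partial block at the start and at the end */
        rtxlen = xfs_rtb_to_rtx(mp, XFS_B_TO_FSB(mp, count) + 1);

        error = xfs_dec_frextents(mp, rtxlen);  /* classic free counter */
        if (error)
                return error;

        /* only go ahead if the blocks are directly writable */
        error = xfs_dec_freecounter(mp, FREE_RTAVAILABLE, rtxlen, false);
        if (error == -ENOSPC) {
                if (nowait)
                        error = -EAGAIN;
                else    /* sleep until GC has made blocks available */
                        error = xfs_zoned_space_wait(ip, rtxlen);
        }
        if (error)
                xfs_add_frextents(mp, rtxlen);  /* undo the first stage */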

Because GC always allocates from the reserved pool, which gets
replenished first, we can also do away with the ratio used to favor it.
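
For reference, GC now carves its blocks directly out of the reserved
pool for both counters under m_sb_lock, roughly as in the
xfs_zone_gc_allocate hunk below (sketch; the warning printed on
depletion is omitted):

        spin_lock(&mp->m_sb_lock);
        if (rtxlen > mp->m_resblks[FREE_RTEXTENTS].avail ||
            rtxlen > mp->m_resblks[FREE_RTAVAILABLE].avail) {
                /* the caller is expected to have ensured there is space */
                spin_unlock(&mp->m_sb_lock);
                return NULLFSBLOCK;
        }
        mp->m_resblks[FREE_RTEXTENTS].avail -= rtxlen;
        mp->m_resblks[FREE_RTAVAILABLE].avail -= rtxlen;
        spin_unlock(&mp->m_sb_lock);

Since the reserved pool is refilled before the regular counters, GC can
keep making progress even while user writes are queued up waiting.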

Signed-off-by: Christoph Hellwig <hch@lst.de>
fs/xfs/libxfs/xfs_bmap.c
fs/xfs/xfs_file.c
fs/xfs/xfs_iomap.c
fs/xfs/xfs_mount.c
fs/xfs/xfs_mount.h
fs/xfs/xfs_super.c
fs/xfs/xfs_zone_alloc.c
fs/xfs/xfs_zone_alloc.h
fs/xfs/xfs_zone_gc.c

index eed18754167c24f6286b8ac0db4ea32233164766..2fa6890e87edbb369fb77559c276a6d3ee1f8c0f 100644 (file)
@@ -4159,13 +4159,30 @@ retry:
                error = xfs_dec_frextents(mp, xfs_rtb_to_rtx(mp, alen));
                if (error)
                        goto out_unreserve_quota;
+
+               /*
+                * For block zeroing we can end up here even on a zoned file
+                * system, as we can't do the pre-iolock reservation for
+                * truncates that get called with it held from the VFS.  So
+                * we try to dip into the available pool here, but never
+                * actually wait for GC to avoid the deadlock.  Because we
+                * only ever zero one block at a time this generally works;
+                * if it does not, the zeroing operation will simply fail
+                * with -ENOSPC.
+                */
+               if (xfs_is_zoned_inode(ip)) {
+                       error = xfs_dec_freecounter(mp, FREE_RTAVAILABLE,
+                                       xfs_rtb_to_rtx(mp, alen), false);
+                       if (error)
+                               goto out_unreserve_frextents;
+               }
        } else {
                fdblocks += alen;
        }
 
        error = xfs_dec_fdblocks(mp, fdblocks, false);
        if (error)
-               goto out_unreserve_frextents;
+               goto out_unreserve_rtavailable;
 
        ip->i_delayed_blks += alen;
        xfs_mod_delalloc(ip, alen, indlen);
@@ -4189,6 +4206,9 @@ retry:
 
        return 0;
 
+out_unreserve_rtavailable:
+       if (xfs_is_zoned_inode(ip) && !ac)
+               xfs_zoned_add_available(mp, xfs_rtb_to_rtx(mp, alen));
 out_unreserve_frextents:
        if (ac)
                ac->reserved_blocks += alen;
@@ -5128,9 +5148,12 @@ xfs_bmap_del_extent_delay(
                ASSERT(!(bflags & XFS_BMAPI_REMAP));
                ac->reserved_blocks += del->br_blockcount;
        } else if (!(bflags & XFS_BMAPI_REMAP)) {
-               if (isrt)
+               if (isrt) {
+                       if (xfs_is_zoned_inode(ip))
+                               xfs_zoned_add_available(mp,
+                                       xfs_rtb_to_rtx(mp, del->br_blockcount));
                        xfs_add_frextents(mp, xfs_rtb_to_rtx(mp, del->br_blockcount));
-               else
+               } else
                        fdblocks += del->br_blockcount;
        }
 
index 95306b882ebe1890619e4750e56259514cf30fac..35266ae022978016795322946ee7f629308aef08 100644 (file)
@@ -615,7 +615,8 @@ xfs_zoned_write_space_reserve(
         * until taking the iolock for O_APPEND writes.
         */
 retry:
-       error = xfs_zoned_space_reserve(ip, count, ac);
+       error = xfs_zoned_space_reserve(ip, count, ac,
+                       iocb->ki_flags & IOCB_NOWAIT);
        if (error == -ENOSPC && !(iocb->ki_flags & IOCB_DIRECT) &&
            count > ip->i_mount->m_sb.sb_blocksize) {
                /*
@@ -1417,7 +1418,7 @@ xfs_write_fault(
                 * than a folio that's just fine.
                 */
                error = xfs_zoned_space_reserve(ip,
-                               folio_size(page_folio(vmf->page)), &ac);
+                               folio_size(page_folio(vmf->page)), &ac, false);
                if (error < 0)
                        return vmf_fs_error(error);
        }
index 2eefb0ddad7699faec88d63cfc15aaefe46eb39e..69c650fbbbdd9c68025d2ef01a1fc5e26bf7d812 100644 (file)
@@ -1077,6 +1077,7 @@ xfs_zoned_buffered_write_iomap_begin(
        ASSERT(!xfs_get_extsz_hint(ip));
        ASSERT(!(flags & IOMAP_UNSHARE));
        ASSERT(ac || (flags & IOMAP_ZERO));
+       ASSERT(!ac || !(flags & IOMAP_ZERO));
 
        if (xfs_is_shutdown(mp))
                return -EIO;
index f749d839e85ce2e3b59302ff2bbdd9f91bc25d9d..deb6d84e20d00ea26c5004c2920bf0c1b733346c 100644 (file)
@@ -469,7 +469,7 @@ xfs_default_resblks(
 {
        uint64_t resblks;
 
-       if (idx == FREE_RTEXTENTS) {
+       if (idx == FREE_RTEXTENTS || idx == FREE_RTAVAILABLE) {
                if (IS_ENABLED(CONFIG_XFS_RT) && xfs_has_zoned(mp))
                        return xfs_zoned_reserved_blocks(mp);
                return 0;
@@ -1248,7 +1248,7 @@ xfs_freecounter_unavailable(
        struct xfs_mount        *mp,
        unsigned int            idx)
 {
-       if (idx == FREE_RTEXTENTS)
+       if (idx == FREE_RTEXTENTS || idx == FREE_RTAVAILABLE)
                return 0;
        return mp->m_alloc_set_aside + atomic64_read(&mp->m_allocbt_blks);
 }
@@ -1347,7 +1347,7 @@ xfs_dec_freecounter(
                return 0;
        }
 
-       if (idx != FREE_RTEXTENTS)
+       if (idx == FREE_BLOCKS)
                xfs_warn_once(mp,
 "Reserve blocks depleted! Consider increasing reserve pool size.");
 
index 07a2a9b782f497f132d64ba6b0393c8118f5e2c1..5cfebd1faa1de2a1ffa35887f2672d7f303a0e0e 100644 (file)
@@ -72,6 +72,13 @@ struct xfs_inodegc {
        unsigned int            cpu;
 };
 
+enum {
+       FREE_BLOCKS,            /* free block counter */
+       FREE_RTEXTENTS,         /* free rt extent counter */
+       FREE_RTAVAILABLE,       /* actually available rt extents */
+       FREE_NR,
+};
+
 /*
  * The struct xfsmount layout is optimised to separate read-mostly variables
  * from variables that are frequently modified. We put the read-mostly variables
@@ -201,10 +208,6 @@ typedef struct xfs_mount {
        spinlock_t ____cacheline_aligned m_sb_lock; /* sb counter lock */
        struct percpu_counter   m_icount;       /* allocated inodes counter */
        struct percpu_counter   m_ifree;        /* free inodes counter */
-
-#define FREE_BLOCKS            0       /* free block counter */
-#define FREE_RTEXTENTS         1       /* free rt extent counter */
-#define FREE_NR                        2
        struct percpu_counter   m_free[FREE_NR];
 
        /*
@@ -247,9 +250,6 @@ typedef struct xfs_mount {
        struct delayed_work     m_reclaim_work; /* background inode reclaim */
        spinlock_t              m_reservation_lock;
        struct list_head        m_reclaim_reservations;
-       atomic64_t              m_zone_reclaim_head;
-       atomic64_t              m_zone_reservation_head;
-       uint64_t                m_reclaim_ratio;
        struct task_struct      *m_zone_gc_thread;
        struct dentry           *m_debugfs;     /* debugfs parent */
        struct xfs_kobj         m_kobj;
@@ -504,8 +504,6 @@ __XFS_HAS_FEAT(nouuid, NOUUID)
 #define XFS_OPSTATE_UNSET_LOG_INCOMPAT 11
 /* Filesystem can use logged extended attributes */
 #define XFS_OPSTATE_USE_LARP           12
-/* Zone space reservation required */
-#define XFS_OPSTATE_ZONE_RESERVATION_REQUIRED  13
 
 #define __XFS_IS_OPSTATE(name, NAME) \
 static inline bool xfs_is_ ## name (struct xfs_mount *mp) \
@@ -535,7 +533,6 @@ __XFS_IS_OPSTATE(quotacheck_running, QUOTACHECK_RUNNING)
 #endif
 __XFS_IS_OPSTATE(done_with_log_incompat, UNSET_LOG_INCOMPAT)
 __XFS_IS_OPSTATE(using_logged_xattrs, USE_LARP)
-__XFS_IS_OPSTATE(zone_reservation_required, ZONE_RESERVATION_REQUIRED)
 
 static inline bool
 xfs_should_warn(struct xfs_mount *mp, long nr)
index 2afbb7d116cdf443200a2eb1b812a688728a8cfa..05ea8f7f8452890d9bf7253a120751ce3c1fc004 100644 (file)
@@ -1056,7 +1056,7 @@ static int
 xfs_init_percpu_counters(
        struct xfs_mount        *mp)
 {
-       int             error;
+       int                     error, i;
 
        error = percpu_counter_init(&mp->m_icount, 0, GFP_KERNEL);
        if (error)
@@ -1066,30 +1066,28 @@ xfs_init_percpu_counters(
        if (error)
                goto free_icount;
 
-       error = percpu_counter_init(&mp->m_free[FREE_BLOCKS], 0, GFP_KERNEL);
-       if (error)
-               goto free_ifree;
-
        error = percpu_counter_init(&mp->m_delalloc_blks, 0, GFP_KERNEL);
        if (error)
-               goto free_fdblocks;
+               goto free_ifree;
 
        error = percpu_counter_init(&mp->m_delalloc_rtextents, 0, GFP_KERNEL);
        if (error)
                goto free_delalloc;
 
-       error = percpu_counter_init(&mp->m_free[FREE_RTEXTENTS], 0, GFP_KERNEL);
-       if (error)
-               goto free_delalloc_rt;
+       for (i = 0; i < FREE_NR; i++) {
+               error = percpu_counter_init(&mp->m_free[i], 0, GFP_KERNEL);
+               if (error)
+                       goto free_freecounters;
+       }
 
        return 0;
 
-free_delalloc_rt:
+free_freecounters:
+       while (--i >= 0)
+               percpu_counter_destroy(&mp->m_free[i]);
        percpu_counter_destroy(&mp->m_delalloc_rtextents);
 free_delalloc:
        percpu_counter_destroy(&mp->m_delalloc_blks);
-free_fdblocks:
-       percpu_counter_destroy(&mp->m_free[FREE_BLOCKS]);
 free_ifree:
        percpu_counter_destroy(&mp->m_ifree);
 free_icount:
@@ -1113,16 +1111,18 @@ static void
 xfs_destroy_percpu_counters(
        struct xfs_mount        *mp)
 {
+       int                     i;
+
+       for (i = 0; i < FREE_NR; i++)
+               percpu_counter_destroy(&mp->m_free[i]);
        percpu_counter_destroy(&mp->m_icount);
        percpu_counter_destroy(&mp->m_ifree);
-       percpu_counter_destroy(&mp->m_free[FREE_BLOCKS]);
        ASSERT(xfs_is_shutdown(mp) ||
               percpu_counter_sum(&mp->m_delalloc_rtextents) == 0);
        percpu_counter_destroy(&mp->m_delalloc_rtextents);
        ASSERT(xfs_is_shutdown(mp) ||
               percpu_counter_sum(&mp->m_delalloc_blks) == 0);
        percpu_counter_destroy(&mp->m_delalloc_blks);
-       percpu_counter_destroy(&mp->m_free[FREE_RTEXTENTS]);
 }
 
 static int
index c46f3644a0d796ceec04f92ef0c9ebead916fb64..0e2cd54fbd882e37f9d4d9534c6bf533cc31a98b 100644 (file)
@@ -507,17 +507,16 @@ xfs_zoned_show_stats(
 
        seq_puts(m, "\n");
 
-       seq_printf(m, "\ttotal free blocks: %lld\n",
+       seq_printf(m, "\tuser free blocks: %lld\n",
                xfs_estimate_freecounter(mp, FREE_RTEXTENTS));
        seq_printf(m, "\treserved free blocks: %lld\n",
-                       mp->m_resblks[FREE_RTEXTENTS].avail);
+               mp->m_resblks[FREE_RTEXTENTS].avail);
+       seq_printf(m, "\tuser available blocks: %lld\n",
+               xfs_estimate_freecounter(mp, FREE_RTAVAILABLE));
+       seq_printf(m, "\treserved available blocks: %lld\n",
+               mp->m_resblks[FREE_RTAVAILABLE].avail);
        seq_printf(m, "\treservations required: %d\n",
-               xfs_is_zone_reservation_required(mp));
-       seq_printf(m, "\treservation head: %lld\n",
-               atomic64_read(&mp->m_zone_reservation_head));
-       seq_printf(m, "\treclaim head: %lld\n",
-               atomic64_read(&mp->m_zone_reservation_head));
-       seq_printf(m, "\treclaim ratio: %lld\n", mp->m_reclaim_ratio);
+               !list_empty_careful(&mp->m_reclaim_reservations));
        seq_printf(m, "\tGC required: %d\n",
                xfs_zoned_need_gc(mp));
 
index 0dac533ce793f8942ae0ed05de74eaddfa82dd96..0241f77afb5f25fc5bbc34620a51c8791a3afa85 100644 (file)
@@ -42,17 +42,10 @@ struct xfs_zone_alloc_ctx {
 };
 
 int xfs_zoned_space_reserve(struct xfs_inode *ip, size_t count,
-               struct xfs_zone_alloc_ctx *ac);
+               struct xfs_zone_alloc_ctx *ac, bool nowait);
 void xfs_zoned_space_unreserve(struct xfs_inode *ip,
                struct xfs_zone_alloc_ctx *ac);
-
-/*
- * We aim to keep enough zones free in stock to fully use the open zone limit
- * for data placement purposes.
- */
-static inline bool xfs_zoned_need_gc(struct xfs_mount *mp)
-{
-       return atomic_read(&mp->m_nr_free_zones) <= mp->m_max_open_zones;
-}
+void xfs_zoned_add_available(struct xfs_mount *mp, xfs_filblks_t count_fsb);
+bool xfs_zoned_need_gc(struct xfs_mount *mp);
 
 #endif /* _XFS_ZONE_ALLOC_H */
index d1332aef77b578a8deec2a4ff483c7aee58febfd..9e9f3ea9d67a206690aea6125ba12d92e8978947 100644 (file)
 #include "xfs_trace.h"
 
 struct xfs_zone_reservation {
+       struct list_head        entry;
        struct task_struct      *task;
-       uint64_t                target;
-       struct  list_head       reservation_entry;
+       xfs_rtxnum_t            rtxlen;
+       bool                    done;
 };
 
 /*
@@ -44,165 +45,141 @@ xfs_zoned_reserved_blocks(
                XFS_B_TO_FSB(mp, mp->m_zoned_op);
 }
 
-static int64_t
-xfs_zoned_user_to_reclaim_count(
-       struct xfs_mount                *mp,
-       size_t                          count)
+/*
+ * We aim to keep enough zones free in stock to fully use the open zone limit
+ * for data placement purposes.
+ */
+bool
+xfs_zoned_need_gc(
+       struct xfs_mount        *mp)
 {
-       /*
-        * We give the gc reclaim a slight 10% advantage to
-        * drift towards a state where we can stop throtteling
-        */
-       return div_u64(READ_ONCE(mp->m_reclaim_ratio) * count, 100 - 10);
+       if (!xa_marked(&mp->m_rtgroups, XFS_RTG_RECLAIMABLE))
+               return false;
+       if (!list_empty_careful(&mp->m_reclaim_reservations))
+               return true;
+       if (xfs_estimate_freecounter(mp, FREE_RTAVAILABLE) <
+           mp->m_rgblocks * (mp->m_max_open_zones - XFS_OPEN_GC_ZONES))
+               return true;
+       return false;
 }
 
 static void
-xfs_zoned_require_reservations(
-       struct xfs_mount        *mp)
+xfs_zoned_wake_all(
+       struct xfs_mount                *mp)
 {
-       uint64_t                start_bytes = 0;
-
-       if (xfs_is_zone_reservation_required(mp))
-               return;
+       struct xfs_zone_reservation     *reservation;
 
        spin_lock(&mp->m_reservation_lock);
-       /*
-        * We're low on space and require reservations.
-        *
-        * Make sure we won't let any user reservations through before we have
-        * free space for them.  E.g. if we're completely out of free zones, we
-        * will have to wait for a whole zone to be reclaimed before starting to
-        * release waiters, as the minimum space we can reclaim is one zone.
-        */
-       if (atomic_read(&mp->m_nr_free_zones) < 1)
-               start_bytes = XFS_FSB_TO_B(mp, mp->m_rgblocks);
-
-       atomic64_set(&mp->m_zone_reservation_head,
-               xfs_zoned_user_to_reclaim_count(mp, start_bytes));
-       atomic64_set(&mp->m_zone_reclaim_head, 0);
-       xfs_set_zone_reservation_required(mp);
+       list_for_each_entry(reservation, &mp->m_reclaim_reservations, entry)
+               wake_up_process(reservation->task);
        spin_unlock(&mp->m_reservation_lock);
 }
 
-static void
-xfs_zoned_release_all_reservations(
-       struct xfs_mount                *mp)
+void
+xfs_zoned_add_available(
+       struct xfs_mount                *mp,
+       xfs_filblks_t                   count_fsb)
 {
+       xfs_rtxnum_t                    rtxlen = xfs_rtb_to_rtx(mp, count_fsb);
        struct xfs_zone_reservation     *reservation;
-       struct xfs_zone_reservation     *tmp;
 
-       if (!xfs_is_zone_reservation_required(mp))
+       if (list_empty_careful(&mp->m_reclaim_reservations)) {
+               xfs_add_freecounter(mp, FREE_RTAVAILABLE, rtxlen);
                return;
+       }
 
        spin_lock(&mp->m_reservation_lock);
-       xfs_clear_zone_reservation_required(mp);
-       list_for_each_entry_safe(reservation, tmp, &mp->m_reclaim_reservations,
-                       reservation_entry) {
+       xfs_add_freecounter(mp, FREE_RTAVAILABLE, rtxlen);
+       rtxlen = xfs_sum_freecounter(mp, FREE_RTAVAILABLE);
+       list_for_each_entry(reservation, &mp->m_reclaim_reservations, entry) {
+               if (reservation->rtxlen > rtxlen)
+                       break;
                wake_up_process(reservation->task);
+               rtxlen -= reservation->rtxlen;
+
        }
        spin_unlock(&mp->m_reservation_lock);
 }
 
-static void
-xfs_zoned_move_reclaim_head(
-       struct xfs_mount                *mp,
-       size_t                          count)
-
+static int
+xfs_zoned_space_wait(
+       struct xfs_inode                *ip,
+       xfs_rtxnum_t                    rtxlen)
 {
-       struct xfs_zone_reservation     *reservation;
-       struct xfs_zone_reservation     *tmp;
-       int64_t                         reclaim_head;
-       int64_t                         reservation_head;
-
-       if (!xfs_is_zone_reservation_required(mp))
-               return;
-
-       reclaim_head = atomic64_add_return(count,
-                       &mp->m_zone_reclaim_head);
-       reservation_head = atomic64_read(&mp->m_zone_reservation_head);
-
-       /*
-        * If the previous reclaim head was ahead of the reservation head, no
-        * user waits should be waiting, so avoid taking the lock.  In the very
-        * unlikely case of a race, we'll wake up the user write on the next
-        * reclaim write.
-        */
-       if (reclaim_head - count > reservation_head)
-               return;
+       struct xfs_mount                *mp = ip->i_mount;
+       struct xfs_zone_reservation     reservation = {
+               .task           = current,
+               .rtxlen         = rtxlen,
+       };
+       int                             error;
 
        spin_lock(&mp->m_reservation_lock);
-       if (xfs_is_zone_reservation_required(mp)) {
-               list_for_each_entry_safe(reservation, tmp,
-                               &mp->m_reclaim_reservations,
-                               reservation_entry) {
-                       if (reservation->target < reclaim_head)
-                               wake_up_process(reservation->task);
+       do {
+               if (xfs_is_shutdown(mp)) {
+                       error = -EIO;
+                       break;
                }
+               list_add_tail(&reservation.entry, &mp->m_reclaim_reservations);
+               set_current_state(TASK_KILLABLE);
+               spin_unlock(&mp->m_reservation_lock);
 
-       }
+               error = xfs_dec_freecounter(mp, FREE_RTAVAILABLE, rtxlen,
+                               false);
+               if (error == -ENOSPC &&
+                   xa_marked(&mp->m_rtgroups, XFS_RTG_RECLAIMABLE)) {
+                       schedule();
+                       if (fatal_signal_pending(current))
+                               error = -EINTR;
+               }
+
+               spin_lock(&mp->m_reservation_lock);
+               list_del(&reservation.entry);
+       } while (error == -ENOSPC &&
+                xa_marked(&mp->m_rtgroups, XFS_RTG_RECLAIMABLE));
        spin_unlock(&mp->m_reservation_lock);
+
+       __set_current_state(TASK_RUNNING);
+       return error;
 }
 
 int
 xfs_zoned_space_reserve(
        struct xfs_inode                *ip,
        size_t                          count,
-       struct xfs_zone_alloc_ctx       *ac)
+       struct xfs_zone_alloc_ctx       *ac,
+       bool                            nowait)
 {
        struct xfs_mount                *mp = ip->i_mount;
-       xfs_filblks_t                   count_fsb;
-       struct xfs_zone_reservation     reservation;
-       int64_t                         needed;
+       xfs_rtxnum_t                    rtxlen;
        int                             error;
 
        /*
         * Round up the block count as a write could hit partial blocks at the
         * start and the end.
         */
-       count_fsb = XFS_B_TO_FSB(mp, count) + 1;
+       ac->reserved_blocks = XFS_B_TO_FSB(mp, count) + 1;
+       ac->cached_rtg = NULL;
 
-       error = xfs_dec_frextents(mp, xfs_rtb_to_rtx(mp, count_fsb));
+       rtxlen = xfs_rtb_to_rtx(mp, ac->reserved_blocks);
+       error = xfs_dec_frextents(mp, rtxlen);
        if (error)
                return error;
 
-       if (!xfs_is_zone_reservation_required(mp))
-               goto done;
-
-       needed = xfs_zoned_user_to_reclaim_count(mp,
-                       XFS_FSB_TO_B(mp, count_fsb));
-       reservation.target = atomic64_add_return(needed,
-                               &mp->m_zone_reservation_head);
-       if (reservation.target < atomic64_read(&mp->m_zone_reclaim_head))
-               goto done;
-
-       reservation.task = current;
-
-       spin_lock(&mp->m_reservation_lock);
-       if (!xfs_is_zone_reservation_required(mp)) {
-               spin_unlock(&mp->m_reservation_lock);
-               goto done;
-       }
-
-       list_add_tail(&reservation.reservation_entry,
-                       &mp->m_reclaim_reservations);
-
-       __set_current_state(TASK_KILLABLE);
-       spin_unlock(&mp->m_reservation_lock);
-
-       schedule();
-
-       spin_lock(&mp->m_reservation_lock);
-       list_del(&reservation.reservation_entry);
-       spin_unlock(&mp->m_reservation_lock);
-
-       if (fatal_signal_pending(current)) {
-               xfs_add_frextents(mp, xfs_rtb_to_rtx(mp, count_fsb));
-               return -EINTR;
+       if (list_empty_careful(&mp->m_reclaim_reservations)) {
+               error = xfs_dec_freecounter(mp, FREE_RTAVAILABLE, rtxlen,
+                               false);
+               if (error != -ENOSPC)
+                       goto out;
+               if (nowait) {
+                       error = -EAGAIN;
+                       goto out;
+               }
        }
-done:
-       ac->reserved_blocks = count_fsb;
-       ac->cached_rtg = NULL;
-       return 0;
+       error = xfs_zoned_space_wait(ip, rtxlen);
+out:
+       if (error)
+               xfs_add_frextents(mp, rtxlen);
+       return error;
 }
 
 void
@@ -210,14 +187,12 @@ xfs_zoned_space_unreserve(
        struct xfs_inode                *ip,
        struct xfs_zone_alloc_ctx       *ac)
 {
-       struct xfs_mount                *mp = ip->i_mount;
-       xfs_filblks_t                   count_fsb = ac->reserved_blocks;
+       if (ac->reserved_blocks > 0) {
+               struct xfs_mount        *mp = ip->i_mount;
 
-       if (count_fsb > 0) {
-               xfs_zoned_move_reclaim_head(mp,
-                       xfs_zoned_user_to_reclaim_count(mp,
-                               XFS_FSB_TO_B(mp, count_fsb)));
-               xfs_add_frextents(mp, xfs_rtb_to_rtx(mp, count_fsb));
+               xfs_zoned_add_available(mp, ac->reserved_blocks);
+               xfs_add_freecounter(mp, FREE_RTEXTENTS,
+                               xfs_rtb_to_rtx(mp, ac->reserved_blocks));
        }
        xfs_zone_finish_alloc(ac->cached_rtg);
 }
@@ -510,6 +485,9 @@ xfs_zone_reclaim_pick(
        unsigned long           index = 0;
        bool                    easy = false;
 
+       if (!xfs_zoned_need_gc(mp))
+               return false;
+
        rcu_read_lock();
        xa_for_each_marked(&mp->m_rtgroups, index, rtg, XFS_RTG_RECLAIMABLE) {
                u64 used = *xfs_zone_used_counter(rtg);
@@ -540,9 +518,6 @@ xfs_zone_reclaim_pick(
 
        xa_clear_mark(&mp->m_rtgroups, victim_rtg->rtg_rgno,
                        XFS_RTG_RECLAIMABLE);
-       WRITE_ONCE(mp->m_reclaim_ratio,
-               div_u64(100 * victim_used,
-                       victim_rtg->rtg_blockcount - victim_used));
 
        xfs_info(mp, "reclaiming zone %d, used = %lld/%u (%s)",
                victim_rtg->rtg_rgno, victim_used,
@@ -578,17 +553,32 @@ xfs_zone_gc_allocate(
 {
        struct xfs_rtgroup      *rtg = data->mp->m_open_gc_zone;
        struct xfs_mount        *mp = rtg->rtg_mount;
+       xfs_rtxnum_t            rtxlen = xfs_rtb_to_rtx(mp, count_fsb);
        xfs_rgblock_t           rgbno = 0;
-       int                     error;
 
        /* the caller must have ensured there is enough space */
        ASSERT(rtg->rtg_blockcount - rtg->rtg_write_pointer >= count_fsb);
-       error = xfs_dec_freecounter(mp, FREE_RTEXTENTS,
-                       xfs_rtb_to_rtx(mp, count_fsb), true);
-       if (error) {
-               ASSERT(!error);
+
+       /*
+        * Directly allocate GC blocks from the reserved pool.
+        * If we took them from the normal pool we could be stealing
+        * blocks from a regular writer, which would then have to wait
+        * for GC and deadlock.
+        */
+       spin_lock(&mp->m_sb_lock);
+       if (rtxlen > mp->m_resblks[FREE_RTEXTENTS].avail ||
+           rtxlen > mp->m_resblks[FREE_RTAVAILABLE].avail) {
+               xfs_crit(mp,
+"out of space for garbage collection (rtxlen = %lld, free = %lld, avail = %lld).",
+                       rtxlen,
+                       mp->m_resblks[FREE_RTEXTENTS].avail,
+                       mp->m_resblks[FREE_RTAVAILABLE].avail);
+               spin_unlock(&mp->m_sb_lock);
                return NULLFSBLOCK;
        }
+       mp->m_resblks[FREE_RTEXTENTS].avail -= rtxlen;
+       mp->m_resblks[FREE_RTAVAILABLE].avail -= rtxlen;
+       spin_unlock(&mp->m_sb_lock);
 
        *is_seq = test_bit(RTG_F_SEQUENTIAL, &rtg->rtg_flags);
        if (!*is_seq)
@@ -619,17 +609,8 @@ xfs_zone_gc_space_available(
        struct xfs_mount        *mp = data->mp;
        struct xfs_rtgroup      *rtg = mp->m_open_gc_zone;
 
-       if (!iter->victim_rtg) {
-               if (!xfs_zoned_need_gc(mp) ||
-                   !xa_marked(&mp->m_rtgroups, XFS_RTG_RECLAIMABLE)) {
-                       xfs_zoned_release_all_reservations(mp);
-                       return 0;
-               }
-
-               xfs_zoned_require_reservations(mp);
-               if (!xfs_zone_reclaim_pick(mp, iter))
-                       return 0;
-       }
+       if (!iter->victim_rtg && !xfs_zone_reclaim_pick(mp, iter))
+               return 0;
 
        if (rtg && rtg->rtg_write_pointer == rtg->rtg_blockcount) {
                /*
@@ -818,8 +799,6 @@ xfs_zone_gc_finish_chunk(
                        XFS_BB_TO_FSB(mp, bio->bi_iter.bi_sector);
        error = xfs_zoned_end_cow(ip, chunk->offset, chunk->len,
                        chunk->new_startblock, chunk->old_startblock);
-       if (!error)
-               xfs_zoned_move_reclaim_head(mp, chunk->len);
 free:
        if (error)
                xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
@@ -843,6 +822,8 @@ xfs_zone_gc_finish_reset(
        atomic_inc(&mp->m_nr_free_zones);
        spin_unlock(&mp->m_zone_list_lock);
 
+       xfs_zoned_add_available(mp, rtg->rtg_blockcount);
+
        wake_up_all(&mp->m_zone_wait);
 out:
        bio_put(bio);
@@ -1007,6 +988,8 @@ xfs_zoned_gcd(
                        continue;
 
                if (!data->inflight) {
+                       xfs_zoned_wake_all(mp);
+
                        if (kthread_should_stop()) {
                                __set_current_state(TASK_RUNNING);
                                break;
@@ -1027,7 +1010,6 @@ xfs_zoned_gcd(
        if (mp->m_open_gc_zone)
                xfs_rtgroup_rele(mp->m_open_gc_zone);
 
-       xfs_zoned_release_all_reservations(mp);
        memalloc_nofs_restore(nofs_flag);
        kfree(iter);
 out_free_data:
@@ -1099,7 +1081,8 @@ xfs_get_zone_info_cb(
 static int
 xfs_init_zone(
        struct xfs_rtgroup      *rtg,
-       uint64_t                *freeblocks)
+       uint64_t                *available,
+       uint64_t                *freedblocks)
 {
        struct xfs_mount        *mp = rtg->rtg_mount;
        uint64_t                used = *xfs_zone_used_counter(rtg);
@@ -1137,18 +1120,18 @@ xfs_init_zone(
                /* zone is free */
                list_add_tail(&rtg->rtg_entry, &mp->m_free_zones);
                atomic_inc(&mp->m_nr_free_zones);
-               *freeblocks += rtg->rtg_blockcount;
+               *available += rtg->rtg_blockcount;
        } else if (rtg->rtg_write_pointer < rtg->rtg_blockcount) {
                /* zone is open */
                list_add(&rtg->rtg_entry, &mp->m_open_zones);
                mp->m_nr_open_zones++;
                set_bit(RTG_F_OPEN, &rtg->rtg_flags);
-               *freeblocks += (rtg->rtg_blockcount - rtg->rtg_write_pointer);
+               *available += (rtg->rtg_blockcount - rtg->rtg_write_pointer);
        } else if (used < rtg->rtg_blockcount) {
                /* zone fully written, but has freed blocks */
                xa_set_mark(&mp->m_rtgroups, rtg->rtg_rgno,
                            XFS_RTG_RECLAIMABLE);
-               *freeblocks += (rtg->rtg_blockcount - used);
+               *freedblocks += (rtg->rtg_blockcount - used);
        }
 
        return 0;
@@ -1185,7 +1168,7 @@ xfs_mount_zones(
 {
        struct xfs_buftarg      *bt = mp->m_rtdev_targp;
        unsigned int            bdev_open_zones;
-       int64_t                 freeblocks = 0;
+       int64_t                 available = 0, freedblocks = 0;
        struct xfs_rtgroup      *rtg;
        xfs_rgnumber_t          rgno;
        int                     error;
@@ -1257,15 +1240,6 @@ xfs_mount_zones(
        spin_lock_init(&mp->m_reservation_lock);
        init_waitqueue_head(&mp->m_zone_wait);
 
-       atomic64_set(&mp->m_zone_reclaim_head, 0);
-       atomic64_set(&mp->m_zone_reservation_head, 0);
-
-       /*
-        * Assume a one-to-one reclaim ratio until we pick
-        * a zone for reclaim and update the estimate.
-        */
-       WRITE_ONCE(mp->m_reclaim_ratio, 100);
-
        xfs_info(mp, "%u zones of %u blocks size (%d max open)",
                 mp->m_sb.sb_rgcount, mp->m_rgblocks, mp->m_max_open_zones);
 
@@ -1295,13 +1269,15 @@ xfs_mount_zones(
        kthread_park(mp->m_zone_gc_thread);
 
        for_each_rtgroup(mp, rgno, rtg) {
-               error = xfs_init_zone(rtg, &freeblocks);
+               error = xfs_init_zone(rtg, &available, &freedblocks);
                if (error)
                        goto out_unlink_zones;
        }
 
+       percpu_counter_set(&mp->m_free[FREE_RTAVAILABLE],
+                       xfs_rtb_to_rtx(mp, available));
        percpu_counter_set(&mp->m_free[FREE_RTEXTENTS],
-                       xfs_rtb_to_rtx(mp, freeblocks));
+                       xfs_rtb_to_rtx(mp, available + freedblocks));
 
        /*
         * If there are no free zones available for GC, pick the open zone with