xfs: support zoned RT devices
author     Christoph Hellwig <hch@lst.de>
           Tue, 5 Nov 2024 08:27:11 +0000 (09:27 +0100)
committer  Christoph Hellwig <hch@lst.de>
           Tue, 5 Nov 2024 08:29:53 +0000 (09:29 +0100)
WARNING: this is early prototype code.

The zoned allocator works by handing out data blocks to the direct or
buffered write code at the place where XFS currently does block
allocations.  It does not actually insert them into the bmap extent tree
at this time, but only after I/O completion, when we know the block number.
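
As a rough sketch of that two-phase flow (the wrapper functions here are
illustrative only and not part of the patch; xfs_zone_alloc_and_submit()
and xfs_zoned_end_io() are the helpers added below):

  /* Submission side: the zone allocator picks the target zone and
   * submits the bio; no bmap tree update happens at this point. */
  static void zoned_submit_sketch(struct iomap_ioend *ioend,
  		struct xfs_rtgroup **cached_rtg)
  {
  	xfs_zone_alloc_and_submit(ioend, cached_rtg);
  }

  /* Completion side, run from the ioend workqueue: only now do we know
   * the physical location and insert the mapping into the bmap tree. */
  static int zoned_complete_sketch(struct iomap_ioend *ioend)
  {
  	struct xfs_inode	*ip = XFS_I(ioend->io_inode);

  	return xfs_zoned_end_io(ip, ioend->io_offset, ioend->io_size,
  			ioend->io_sector, NULLFSBLOCK);
  }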

The zoned allocator works on any kind of device, including conventional
devices or conventional zones, by using a crude write pointer emulation.
For zoned devices, active zone management is fully supported, as is
zone capacity < zone size.
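
A minimal sketch of what that emulation boils down to, using the
rtg_write_pointer and rtg_alloc_lock fields added to struct xfs_rtgroup
below (end-of-zone checks and open zone management are omitted; this is
not the actual allocator code):

  static xfs_rgblock_t zone_wp_emulation_sketch(struct xfs_rtgroup *rtg,
  		xfs_extlen_t count_fsb)
  {
  	xfs_rgblock_t	rgbno;

  	spin_lock(&rtg->rtg_alloc_lock);
  	rgbno = rtg->rtg_write_pointer;		/* next sequential block */
  	rtg->rtg_write_pointer += count_fsb;	/* advance like hardware would */
  	spin_unlock(&rtg->rtg_alloc_lock);
  	return rgbno;
  }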

The two major limitations are:

 - there is no support for unwritten extents and thus persistent
   file preallocations from fallocate().  This is inherent to an
   always out-of-place write scheme, as there is no way to persistently
   preallocate blocks for an indefinite number of overwrites.
 - because the metadata blocks and data blocks are on different
   devices you can run out of space for metadata while having plenty
   of space for data and vice versa.  This is inherent to a scheme
   where we use different devices or pools for each.

For zoned file systems we reserve the free extents before taking the
iolock, so that any garbage collection we might have to force runs before
the iolock is held.  This is needed because GC has to take the iolock
after it has moved data to a new place, and it could otherwise deadlock.
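
Condensed from xfs_file_buffered_write_zoned() in the diff below, the
resulting ordering looks roughly like this (write checks, the iter
truncation and the ENOSPC retry are left out):

  static ssize_t zoned_write_order_sketch(struct kiocb *iocb,
  		struct iov_iter *from)
  {
  	struct xfs_inode	*ip = XFS_I(iocb->ki_filp->f_mapping->host);
  	struct xfs_zone_alloc_ctx ac;
  	unsigned int		iolock = XFS_IOLOCK_EXCL;
  	ssize_t			ret;

  	/* reserve space, and force GC if needed, before taking the iolock */
  	ret = xfs_zoned_write_space_reserve(ip, iocb, from, XFS_ZR_GREEDY, &ac);
  	if (ret < 0)
  		return ret;
  	ret = xfs_ilock_iocb(iocb, iolock);
  	if (!ret) {
  		ret = iomap_file_buffered_write(iocb, from,
  				&xfs_buffered_write_iomap_ops, &ac);
  		xfs_iunlock(ip, iolock);
  	}
  	xfs_zoned_space_unreserve(ip, &ac);	/* hand back the unused part */
  	return ret;
  }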

This unfortunately has to exclude block zeroing, as for truncate we are
called with the iolock (aka i_rwsem) already held.  Because zeroing only
ever touches a single block at a time, or at most two total per syscall
in the free_file_range case, we deal with that by just stealing the
block, but failing the allocation if we'd have to wait for GC.
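
For the fallocate()-based paths the blocks needed for that zeroing are
instead reserved up front, dipping into the reserved pool so that hole
punching keeps working on a full file system (excerpt from the
xfs_file_fallocate() hunk below):

  if (xfs_is_zoned_inode(ip) &&
      (mode & (FALLOC_FL_PUNCH_HOLE | FALLOC_FL_ZERO_RANGE |
  	     FALLOC_FL_COLLAPSE_RANGE))) {
  	error = xfs_zoned_space_reserve(ip, 2, XFS_ZR_RESERVED, &ac);
  	if (error)
  		return error;
  }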

Add a new RTAVAILABLE counter of blocks that are actually directly
available to be written into, in addition to the classic free counter.
Only allow a write to go ahead if it has blocks available to write into,
and otherwise wait for GC.  This also requires tweaking the needs-GC
condition a bit, as we now always need to GC if someone is waiting for
space.
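
The gatekeeping then boils down to something like the following sketch
(both helpers here are hypothetical stand-ins just to show the shape of
the logic, which actually lives in xfs_zone_alloc.c):

  static int zoned_wait_for_available_sketch(struct xfs_mount *mp,
  		xfs_filblks_t count_fsb, bool nowait)
  {
  	/* only proceed once RTAVAILABLE covers this write ... */
  	while (!xfs_zoned_take_available(mp, count_fsb)) {	/* hypothetical */
  		if (nowait)
  			return -EAGAIN;
  		/* ... otherwise kick off GC and wait for it to free zones */
  		xfs_zoned_wait_for_gc(mp);			/* hypothetical */
  	}
  	return 0;
  }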

Thanks to Hans Holmberg <hans.holmberg@wdc.com> for lots of fixes
and improvements.

Co-developed-by: Hans Holmberg <hans.holmberg@wdc.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
35 files changed:
fs/xfs/Makefile
fs/xfs/libxfs/xfs_bmap.c
fs/xfs/libxfs/xfs_bmap.h
fs/xfs/libxfs/xfs_rtbitmap.c
fs/xfs/libxfs/xfs_rtgroup.c
fs/xfs/libxfs/xfs_rtgroup.h
fs/xfs/libxfs/xfs_sb.c
fs/xfs/libxfs/xfs_zones.c [new file with mode: 0644]
fs/xfs/libxfs/xfs_zones.h [new file with mode: 0644]
fs/xfs/scrub/scrub.c
fs/xfs/xfs_aops.c
fs/xfs/xfs_aops.h
fs/xfs/xfs_bmap_util.c
fs/xfs/xfs_bmap_util.h
fs/xfs/xfs_discard.c
fs/xfs/xfs_extfree_item.c
fs/xfs/xfs_file.c
fs/xfs/xfs_icache.c
fs/xfs/xfs_inode.h
fs/xfs/xfs_ioctl.c
fs/xfs/xfs_iomap.c
fs/xfs/xfs_iomap.h
fs/xfs/xfs_iops.c
fs/xfs/xfs_log.c
fs/xfs/xfs_mount.c
fs/xfs/xfs_mount.h
fs/xfs/xfs_reflink.c
fs/xfs/xfs_reflink.h
fs/xfs/xfs_rtalloc.c
fs/xfs/xfs_super.c
fs/xfs/xfs_trace.c
fs/xfs/xfs_trace.h
fs/xfs/xfs_zone_alloc.c [new file with mode: 0644]
fs/xfs/xfs_zone_alloc.h [new file with mode: 0644]
fs/xfs/xfs_zone_gc.c [new file with mode: 0644]

index 7afa51e414278e9cd39cf3888dc5713e0ca64ef1..cc6019342c0bf8a8c4ada5313a74d58dc89e6070 100644 (file)
@@ -64,6 +64,7 @@ xfs-y                         += $(addprefix libxfs/, \
 xfs-$(CONFIG_XFS_RT)           += $(addprefix libxfs/, \
                                   xfs_rtbitmap.o \
                                   xfs_rtgroup.o \
+                                  xfs_zones.o \
                                   )
 
 # highlevel code
@@ -136,7 +137,9 @@ xfs-$(CONFIG_XFS_QUOTA)             += xfs_dquot.o \
                                   xfs_quotaops.o
 
 # xfs_rtbitmap is shared with libxfs
-xfs-$(CONFIG_XFS_RT)           += xfs_rtalloc.o
+xfs-$(CONFIG_XFS_RT)           += xfs_rtalloc.o \
+                                  xfs_zone_alloc.o \
+                                  xfs_zone_gc.o
 
 xfs-$(CONFIG_XFS_POSIX_ACL)    += xfs_acl.o
 xfs-$(CONFIG_SYSCTL)           += xfs_sysctl.o
index dc2f2608c3962d915d4097d96a2cc169daf5d433..4bb13d34a87e2aa6641d68ae39f4d7a530802964 100644 (file)
@@ -41,6 +41,7 @@
 #include "xfs_symlink_remote.h"
 #include "xfs_inode_util.h"
 #include "xfs_rtgroup.h"
+#include "xfs_zone_alloc.h"
 
 struct kmem_cache              *xfs_bmap_intent_cache;
 
@@ -171,18 +172,16 @@ xfs_bmbt_update(
  * Compute the worst-case number of indirect blocks that will be used
  * for ip's delayed extent of length "len".
  */
-STATIC xfs_filblks_t
+xfs_filblks_t
 xfs_bmap_worst_indlen(
-       xfs_inode_t     *ip,            /* incore inode pointer */
-       xfs_filblks_t   len)            /* delayed extent length */
+       struct xfs_inode        *ip,            /* incore inode pointer */
+       xfs_filblks_t           len)            /* delayed extent length */
 {
-       int             level;          /* btree level number */
-       int             maxrecs;        /* maximum record count at this level */
-       xfs_mount_t     *mp;            /* mount structure */
-       xfs_filblks_t   rval;           /* return value */
+       struct xfs_mount        *mp = ip->i_mount;
+       int                     maxrecs = mp->m_bmap_dmxr[0];
+       int                     level;
+       xfs_filblks_t           rval;
 
-       mp = ip->i_mount;
-       maxrecs = mp->m_bmap_dmxr[0];
        for (level = 0, rval = 0;
             level < XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK);
             level++) {
@@ -2576,12 +2575,12 @@ done:
 /*
  * Convert a hole to a delayed allocation.
  */
-STATIC void
+void
 xfs_bmap_add_extent_hole_delay(
-       xfs_inode_t             *ip,    /* incore inode pointer */
+       struct xfs_inode        *ip,    /* incore inode pointer */
        int                     whichfork,
        struct xfs_iext_cursor  *icur,
-       xfs_bmbt_irec_t         *new)   /* new data to add to file extents */
+       struct xfs_bmbt_irec    *new)   /* new data to add to file extents */
 {
        struct xfs_ifork        *ifp;   /* inode fork pointer */
        xfs_bmbt_irec_t         left;   /* left neighbor extent entry */
@@ -4127,6 +4126,7 @@ retry:
 
        fdblocks = indlen;
        if (XFS_IS_REALTIME_INODE(ip)) {
+               ASSERT(!xfs_is_zoned_inode(ip));
                error = xfs_dec_frextents(mp, xfs_blen_to_rtbxlen(mp, alen));
                if (error)
                        goto out_unreserve_quota;
@@ -5072,12 +5072,18 @@ xfs_bmap_del_extent_delay(
        da_diff = da_old - da_new;
        fdblocks = da_diff;
 
-       if (bflags & XFS_BMAPI_REMAP)
+       if (bflags & XFS_BMAPI_REMAP) {
                ;
-       else if (isrt)
-               xfs_add_frextents(mp, xfs_blen_to_rtbxlen(mp, del->br_blockcount));
-       else
+       } else if (isrt) {
+               xfs_rtxlen_t    rtxlen;
+
+               rtxlen = xfs_blen_to_rtbxlen(mp, del->br_blockcount);
+               if (xfs_is_zoned_inode(ip))
+                       xfs_zoned_add_available(mp, rtxlen);
+               xfs_add_frextents(mp, rtxlen);
+       } else {
                fdblocks += del->br_blockcount;
+       }
 
        xfs_add_fdblocks(mp, fdblocks);
        xfs_mod_delalloc(ip, -(int64_t)del->br_blockcount, -da_diff);
@@ -6383,8 +6389,12 @@ xfs_bmap_validate_extent_raw(
                                           irec->br_blockcount))
                        return __this_address;
        }
-       if (irec->br_state != XFS_EXT_NORM && whichfork != XFS_DATA_FORK)
-               return __this_address;
+       if (irec->br_state != XFS_EXT_NORM) {
+               if (whichfork != XFS_DATA_FORK)
+                       return __this_address;
+               if (rtfile && xfs_has_zoned(mp))
+                       return __this_address;
+       }
        return NULL;
 }
 
index 8bfb75444d3b6c5569c1ecfff2d268e6a129b324..e114dd691fc027f1ccdd98471997e337b3276f06 100644 (file)
@@ -229,10 +229,13 @@ int       xfs_bmap_add_extent_unwritten_real(struct xfs_trans *tp,
                struct xfs_inode *ip, int whichfork,
                struct xfs_iext_cursor *icur, struct xfs_btree_cur **curp,
                struct xfs_bmbt_irec *new, int *logflagsp);
+void   xfs_bmap_add_extent_hole_delay(struct xfs_inode *ip, int whichfork,
+               struct xfs_iext_cursor *icur, struct xfs_bmbt_irec *new);
 xfs_extlen_t xfs_bmapi_minleft(struct xfs_trans *tp, struct xfs_inode *ip,
                int fork);
 int    xfs_bmap_btalloc_low_space(struct xfs_bmalloca *ap,
                struct xfs_alloc_arg *args);
+xfs_filblks_t xfs_bmap_worst_indlen(struct xfs_inode *ip, xfs_filblks_t len);
 
 enum xfs_bmap_intent_type {
        XFS_BMAP_MAP = 1,
index 4ddfb7e395b38af2d3ee8f9de109053183c8903a..8ddc6e74240ae9c8c5982aeacabb0dd963ff8dba 100644 (file)
@@ -1123,6 +1123,7 @@ xfs_rtfree_blocks(
        xfs_extlen_t            mod;
        int                     error;
 
+       ASSERT(!xfs_has_zoned(mp));
        ASSERT(rtlen <= XFS_MAX_BMBT_EXTLEN);
 
        mod = xfs_blen_to_rtxoff(mp, rtlen);
@@ -1174,6 +1175,22 @@ xfs_rtalloc_query_range(
 
        end = min(end, rtg->rtg_extents - 1);
 
+       if (xfs_has_zoned(mp)) {
+               xfs_rtxnum_t            wp;
+
+               wp = rtg->rtg_write_pointer * mp->m_sb.sb_rextsize;
+               if (end >= wp) {
+                       struct xfs_rtalloc_rec  rec = {
+                               .ar_startext    = max(start, wp),
+                               .ar_extcount    = end - max(start, wp) + 1,
+                       };
+
+                       return fn(rtg, tp, &rec, priv);
+               }
+
+               return 0;
+       }
+
        /* Iterate the bitmap, looking for discrepancies. */
        while (start <= end) {
                struct xfs_rtalloc_rec  rec;
@@ -1268,6 +1285,8 @@ xfs_rtbitmap_blockcount_len(
        struct xfs_mount        *mp,
        xfs_rtbxlen_t           rtextents)
 {
+       if (xfs_has_zoned(mp))
+               return 0;
        return howmany_64(rtextents, xfs_rtbitmap_rtx_per_rbmblock(mp));
 }
 
@@ -1308,6 +1327,11 @@ xfs_rtsummary_blockcount(
        xfs_rtbxlen_t           rextents = xfs_rtbitmap_bitcount(mp);
        unsigned long long      rsumwords;
 
+       if (xfs_has_zoned(mp)) {
+               *rsumlevels = 0;
+               return 0;
+       }
+
        *rsumlevels = xfs_compute_rextslog(rextents) + 1;
        rsumwords = xfs_rtbitmap_blockcount_len(mp, rextents) * (*rsumlevels);
        return howmany_64(rsumwords, mp->m_blockwsize);
index dbff04daaccada47c76ca109b8fe4418d2779b47..2f9fc1aaacccdc165608fef8dc0c625b4985e503 100644 (file)
@@ -77,6 +77,8 @@ xfs_rtgroup_alloc(
                return -ENOMEM;
 
        xfs_rtgroup_calc_geometry(mp, rtg, rgno, rgcount, rextents);
+       INIT_LIST_HEAD(&rtg->rtg_entry);
+       spin_lock_init(&rtg->rtg_alloc_lock);
 
        error = xfs_group_insert(mp, rtg_group(rtg), rgno, XG_TYPE_RTG);
        if (error)
@@ -184,6 +186,25 @@ xfs_update_last_rtgroup_size(
        return 0;
 }
 
+/*
+ * Zoned file systems don't have bitmap and summary inodes, instead allocations
+ * are only tracked in the rmap.
+ *
+ * This means XFS_RTGLOCK_BITMAP(_SHARED) implies that the rmap needs to be
+ * locked instead.
+ */
+static void
+xfs_rtglock_zoned_adjust(
+       struct xfs_rtgroup      *rtg,
+       unsigned int            *rtglock_flags)
+{
+       if (!xfs_has_zoned(rtg_mount(rtg)))
+               return;
+       if (*rtglock_flags & (XFS_RTGLOCK_BITMAP | XFS_RTGLOCK_BITMAP_SHARED))
+               *rtglock_flags |= XFS_RTGLOCK_RMAP;
+       *rtglock_flags &= ~(XFS_RTGLOCK_BITMAP | XFS_RTGLOCK_BITMAP_SHARED);
+}
+
 /* Lock metadata inodes associated with this rt group. */
 void
 xfs_rtgroup_lock(
@@ -194,6 +215,8 @@ xfs_rtgroup_lock(
        ASSERT(!(rtglock_flags & XFS_RTGLOCK_BITMAP_SHARED) ||
               !(rtglock_flags & XFS_RTGLOCK_BITMAP));
 
+       xfs_rtglock_zoned_adjust(rtg, &rtglock_flags);
+
        if (rtglock_flags & XFS_RTGLOCK_BITMAP) {
                /*
                 * Lock both realtime free space metadata inodes for a freespace
@@ -224,6 +247,8 @@ xfs_rtgroup_unlock(
        ASSERT(!(rtglock_flags & XFS_RTGLOCK_BITMAP_SHARED) ||
               !(rtglock_flags & XFS_RTGLOCK_BITMAP));
 
+       xfs_rtglock_zoned_adjust(rtg, &rtglock_flags);
+
        if ((rtglock_flags & XFS_RTGLOCK_REFCOUNT) &&
                        rtg->rtg_inodes[XFS_RTGI_REFCOUNT])
                xfs_iunlock(rtg->rtg_inodes[XFS_RTGI_REFCOUNT], XFS_ILOCK_EXCL);
@@ -252,6 +277,8 @@ xfs_rtgroup_trans_join(
        ASSERT(!(rtglock_flags & ~XFS_RTGLOCK_ALL_FLAGS));
        ASSERT(!(rtglock_flags & XFS_RTGLOCK_BITMAP_SHARED));
 
+       xfs_rtglock_zoned_adjust(rtg, &rtglock_flags);
+
        if (rtglock_flags & XFS_RTGLOCK_BITMAP) {
                xfs_trans_ijoin(tp, rtg->rtg_inodes[XFS_RTGI_BITMAP],
                                XFS_ILOCK_EXCL);
@@ -372,6 +399,7 @@ static const struct xfs_rtginode_ops xfs_rtginode_ops[XFS_RTGI_MAX] = {
                .sick           = XFS_SICK_RG_BITMAP,
                .fmt_mask       = (1U << XFS_DINODE_FMT_EXTENTS) |
                                  (1U << XFS_DINODE_FMT_BTREE),
+               .enabled        = xfs_has_nonzoned,
                .create         = xfs_rtbitmap_create,
        },
        [XFS_RTGI_SUMMARY] = {
@@ -380,6 +408,7 @@ static const struct xfs_rtginode_ops xfs_rtginode_ops[XFS_RTGI_MAX] = {
                .sick           = XFS_SICK_RG_SUMMARY,
                .fmt_mask       = (1U << XFS_DINODE_FMT_EXTENTS) |
                                  (1U << XFS_DINODE_FMT_BTREE),
+               .enabled        = xfs_has_nonzoned,
                .create         = xfs_rtsummary_create,
        },
        [XFS_RTGI_RMAP] = {
index 06f93b97d396c43b269c4fa520acf0aadf490cab..e8fab91a5ee2e65d9560af19e9256adec91c021a 100644 (file)
@@ -44,6 +44,17 @@ struct xfs_rtgroup {
         * Reads and writes are serialized by the rsumip inode lock.
         */
        uint8_t                 *rtg_rsum_cache;
+
+       unsigned long           rtg_flags;
+#define RTG_F_SEQUENTIAL               0
+#define RTG_F_OPEN                     1
+
+       spinlock_t              rtg_alloc_lock;
+       xfs_rgblock_t           rtg_write_pointer;
+       xfs_rgblock_t           rtg_written;
+
+       /* zone state entry */
+       struct list_head        rtg_entry;
 };
 
 static inline struct xfs_rtgroup *to_rtg(struct xfs_group *xg)
@@ -66,6 +77,8 @@ static inline xfs_rgnumber_t rtg_rgno(const struct xfs_rtgroup *rtg)
        return rtg->rtg_group.xg_gno;
 }
 
+#define XFS_RTG_RECLAIMABLE            XA_MARK_0
+
 /* Passive rtgroup references */
 static inline struct xfs_rtgroup *
 xfs_rtgroup_get(
index 89ebab965d5e4ff561203784729f31c245c4a736..d4d19b6f4389ecb73abd4d7197eeb7ef1e23cb6d 100644 (file)
@@ -30,6 +30,7 @@
 #include "xfs_rtgroup.h"
 #include "xfs_rtrmap_btree.h"
 #include "xfs_rtrefcount_btree.h"
+#include "xfs_rtbitmap.h"
 
 /*
  * Physical superblock buffer manipulations. Shared with libxfs in userspace.
@@ -268,6 +269,9 @@ static uint64_t
 xfs_expected_rbmblocks(
        struct xfs_sb           *sbp)
 {
+       if (xfs_sb_is_v5(sbp) &&
+           (sbp->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_ZONED))
+               return 0;
        return howmany_64(xfs_extents_per_rbm(sbp),
                          NBBY * xfs_rtbmblock_size(sbp));
 }
@@ -1268,7 +1272,7 @@ xfs_log_sb(
         * we handle nearly-lockless reservations, so we must use the _positive
         * variant here to avoid writing out nonsense frextents.
         */
-       if (xfs_has_rtgroups(mp))
+       if (xfs_has_rtgroups(mp) && !xfs_has_zoned(mp))
                mp->m_sb.sb_frextents = xfs_sum_freecounter(mp, FREE_RTEXTENTS);
 
        xfs_sb_to_disk(bp->b_addr, &mp->m_sb);
diff --git a/fs/xfs/libxfs/xfs_zones.c b/fs/xfs/libxfs/xfs_zones.c
new file mode 100644 (file)
index 0000000..e38529c
--- /dev/null
@@ -0,0 +1,168 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2023 Christoph Hellwig.
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_inode.h"
+#include "xfs_rtgroup.h"
+#include "xfs_zone_alloc.h"
+#include "xfs_zones.h"
+
+static int
+xfs_zone_validate_empty(
+       struct blk_zone         *zone,
+       struct xfs_rtgroup      *rtg)
+{
+       struct xfs_mount        *mp = rtg_mount(rtg);
+
+       if (*xfs_zone_used_counter(rtg) > 0) {
+               xfs_warn(mp, "empty zone %d has non-zero used counter (0x%llx).",
+                        rtg_rgno(rtg), *xfs_zone_used_counter(rtg));
+               return -EIO;
+       }
+       return 0;
+}
+
+static int
+xfs_zone_validate_wp(
+       struct blk_zone         *zone,
+       struct xfs_rtgroup      *rtg)
+{
+       struct xfs_mount        *mp = rtg_mount(rtg);
+       xfs_fileoff_t           wp_fsb = xfs_daddr_to_rtb(mp, zone->wp);
+
+       if (*xfs_zone_used_counter(rtg) > (uint64_t)rtg->rtg_extents) {
+               xfs_warn(mp, "zone %d has too large a used counter (0x%llx).",
+                        rtg_rgno(rtg), *xfs_zone_used_counter(rtg));
+               return -EIO;
+       }
+
+       /*
+        * Always use the hardware write pointer.
+        */
+       rtg->rtg_write_pointer = xfs_rtb_to_rgbno(mp, wp_fsb);
+       if (xfs_rtb_to_rgno(mp, wp_fsb) != rtg_rgno(rtg)) {
+               xfs_warn(mp, "zone %d write pointer (0x%x) outside of zone.",
+                        rtg_rgno(rtg), rtg->rtg_write_pointer);
+               return -EFSCORRUPTED;
+       }
+       if (rtg->rtg_write_pointer >= rtg->rtg_extents) {
+               xfs_warn(mp, "zone %d has invalid write pointer (0x%x).",
+                        rtg_rgno(rtg), rtg->rtg_write_pointer);
+               return -EFSCORRUPTED;
+       }
+
+       return 0;
+}
+
+static int
+xfs_zone_validate_full(
+       struct blk_zone         *zone,
+       struct xfs_rtgroup      *rtg)
+{
+       struct xfs_mount        *mp = rtg_mount(rtg);
+
+       rtg->rtg_write_pointer = rtg->rtg_extents;
+       if (*xfs_zone_used_counter(rtg) > rtg->rtg_extents) {
+               xfs_warn(mp, "zone %d has too large a used counter (0x%llx).",
+                        rtg_rgno(rtg), *xfs_zone_used_counter(rtg));
+               return -EIO;
+       }
+
+       return 0;
+}
+
+static int
+xfs_zone_validate_seq(
+       struct blk_zone         *zone,
+       struct xfs_rtgroup      *rtg)
+{
+       struct xfs_mount        *mp = rtg_mount(rtg);
+
+       set_bit(RTG_F_SEQUENTIAL, &rtg->rtg_flags);
+
+       switch (zone->cond) {
+       case BLK_ZONE_COND_EMPTY:
+               return xfs_zone_validate_empty(zone, rtg);
+       case BLK_ZONE_COND_IMP_OPEN:
+       case BLK_ZONE_COND_EXP_OPEN:
+       case BLK_ZONE_COND_CLOSED:
+               return xfs_zone_validate_wp(zone, rtg);
+       case BLK_ZONE_COND_FULL:
+               return xfs_zone_validate_full(zone, rtg);
+       case BLK_ZONE_COND_NOT_WP:
+       case BLK_ZONE_COND_OFFLINE:
+       case BLK_ZONE_COND_READONLY:
+               xfs_warn(mp, "zone %d has unsupported zone condition 0x%x.",
+                       rtg_rgno(rtg), zone->cond);
+               return -EIO;
+       default:
+               xfs_warn(mp, "zone %d has unknown zone condition 0x%x.",
+                       rtg_rgno(rtg), zone->cond);
+               return -EIO;
+       }
+}
+
+static int
+xfs_zone_validate_conv(
+       struct blk_zone         *zone,
+       struct xfs_rtgroup      *rtg)
+{
+       struct xfs_mount        *mp = rtg_mount(rtg);
+
+       switch (zone->cond) {
+       case BLK_ZONE_COND_NOT_WP:
+               return 0;
+       default:
+               xfs_warn(mp,
+"conventional zone %d has unsupported zone condition 0x%x.",
+                        rtg_rgno(rtg), zone->cond);
+               return -EIO;
+       }
+}
+
+int
+xfs_zone_validate(
+       struct blk_zone         *zone,
+       struct xfs_rtgroup      *rtg)
+{
+       struct xfs_mount        *mp = rtg_mount(rtg);
+       struct xfs_groups       *g = &mp->m_groups[XG_TYPE_RTG];
+
+       /*
+        * Check that the zone capacity matches the capacity stored in the
+        * superblock.  Note that all zones including the last one must have a
+        * uniform capacity.
+        */
+       if (XFS_BB_TO_FSB(mp, zone->capacity) != g->blocks) {
+               xfs_warn(mp,
+"zone %d capacity (0x%llx) does not match RT group size (0x%x).",
+                       rtg_rgno(rtg), XFS_BB_TO_FSB(mp, zone->capacity),
+                       g->blocks);
+               return -EIO;
+       }
+
+       if (XFS_BB_TO_FSB(mp, zone->len) != 1 << g->blklog) {
+               xfs_warn(mp,
+"zone %d length (0x%llx) does match geometry (0x%x).",
+                       rtg_rgno(rtg), XFS_BB_TO_FSB(mp, zone->len),
+                       1 << g->blklog);
+       }
+
+       switch (zone->type) {
+       case BLK_ZONE_TYPE_CONVENTIONAL:
+               return xfs_zone_validate_conv(zone, rtg);
+       case BLK_ZONE_TYPE_SEQWRITE_REQ:
+               return xfs_zone_validate_seq(zone, rtg);
+       default:
+       xfs_warn(mp, "zone %d has unsupported type 0x%x.",
+                       rtg_rgno(rtg), zone->type);
+               return -EFSCORRUPTED;
+       }
+}
diff --git a/fs/xfs/libxfs/xfs_zones.h b/fs/xfs/libxfs/xfs_zones.h
new file mode 100644 (file)
index 0000000..7d0e3ce
--- /dev/null
@@ -0,0 +1,32 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _LIBXFS_ZONES_H
+#define _LIBXFS_ZONES_H
+
+/*
+ * In order to guarantee forward progress for GC we need to reserve at least
+ * two zones:  one that will be used for moving data into and one spare zone
+ * making sure that we have enough space to relocate a nearly-full zone.
+ * To allow for slightly sloppy accounting of when we need to reserve the
+ * second zone, we actually reserve three, as that is easier than doing fully
+ * accurate bookkeeping.
+ */
+#define XFS_GC_ZONES           3U
+
+/*
+ * In addition we need two zones for user writes, one open zone for writing
+ * and one to still have available blocks without resetting the open zone
+ * when data in the open zone has been freed.
+ */
+#define XFS_RESERVED_ZONES     (XFS_GC_ZONES + 1)
+#define XFS_MIN_ZONES          (XFS_RESERVED_ZONES + 1)
+
+/*
+ * Always keep one zone out of the general open zone pool to allow for GC to
+ * happen while other writers are waiting for free space.
+ */
+#define XFS_OPEN_GC_ZONES      1U
+#define XFS_MIN_OPEN_ZONES     (XFS_OPEN_GC_ZONES + 1U)
+
+int xfs_zone_validate(struct blk_zone *zone, struct xfs_rtgroup *rtg);
+
+#endif /* _LIBXFS_ZONES_H */
index 7567dd5cad14f4734fbfbcfc0e0278b46fc29525..1a05c27ba47197f2e39df3c3879e8b378beb2f5d 100644 (file)
@@ -387,12 +387,14 @@ static const struct xchk_meta_ops meta_scrub_ops[] = {
        },
        [XFS_SCRUB_TYPE_RTBITMAP] = {   /* realtime bitmap */
                .type   = ST_RTGROUP,
+               .has    = xfs_has_nonzoned,
                .setup  = xchk_setup_rtbitmap,
                .scrub  = xchk_rtbitmap,
                .repair = xrep_rtbitmap,
        },
        [XFS_SCRUB_TYPE_RTSUM] = {      /* realtime summary */
                .type   = ST_RTGROUP,
+               .has    = xfs_has_nonzoned,
                .setup  = xchk_setup_rtsummary,
                .scrub  = xchk_rtsummary,
                .repair = xrep_rtsummary,
index d175853da5aef0177e21ff0a50c2bb0c3b76853c..536776e14a779f890621391b80075b6530922439 100644 (file)
@@ -1,7 +1,7 @@
 // SPDX-License-Identifier: GPL-2.0
 /*
  * Copyright (c) 2000-2005 Silicon Graphics, Inc.
- * Copyright (c) 2016-2018 Christoph Hellwig.
+ * Copyright (c) 2016-2023 Christoph Hellwig.
  * All Rights Reserved.
  */
 #include "xfs.h"
@@ -19,6 +19,8 @@
 #include "xfs_reflink.h"
 #include "xfs_errortag.h"
 #include "xfs_error.h"
+#include "xfs_zone_alloc.h"
+#include "xfs_rtgroup.h"
 
 struct xfs_writepage_ctx {
        struct iomap_writepage_ctx ctx;
@@ -85,6 +87,7 @@ xfs_end_ioend(
 {
        struct xfs_inode        *ip = XFS_I(ioend->io_inode);
        struct xfs_mount        *mp = ip->i_mount;
+       bool                    is_zoned = xfs_is_zoned_inode(ip);
        xfs_off_t               offset = ioend->io_offset;
        size_t                  size = ioend->io_size;
        unsigned int            nofs_flag;
@@ -115,9 +118,10 @@ xfs_end_ioend(
        error = blk_status_to_errno(ioend->io_bio.bi_status);
        if (unlikely(error)) {
                if (ioend->io_flags & IOMAP_F_SHARED) {
+                       ASSERT(!is_zoned);
                        xfs_reflink_cancel_cow_range(ip, offset, size, true);
                        xfs_bmap_punch_delalloc_range(ip, XFS_DATA_FORK, offset,
-                                       offset + size);
+                                       offset + size, NULL);
                }
                goto done;
        }
@@ -125,12 +129,15 @@ xfs_end_ioend(
        /*
         * Success: commit the COW or unwritten blocks if needed.
         */
-       if (ioend->io_flags & IOMAP_F_SHARED)
+       if (is_zoned)
+               error = xfs_zoned_end_io(ip, offset, size, ioend->io_sector,
+                               NULLFSBLOCK);
+       else if (ioend->io_flags & IOMAP_F_SHARED)
                error = xfs_reflink_end_cow(ip, offset, size);
        else if (ioend->io_type == IOMAP_UNWRITTEN)
                error = xfs_iomap_write_unwritten(ip, offset, size, false);
 
-       if (!error && xfs_ioend_is_append(ioend))
+       if (!error && xfs_ioend_is_append(ioend) && !ioend->io_isdirect)
                error = xfs_setfilesize(ip, ioend->io_offset, ioend->io_size);
 done:
        iomap_finish_ioends(ioend, error);
@@ -175,17 +182,30 @@ xfs_end_io(
        }
 }
 
-STATIC void
+void
 xfs_end_bio(
        struct bio              *bio)
 {
        struct iomap_ioend      *ioend = iomap_ioend_from_bio(bio);
        struct xfs_inode        *ip = XFS_I(ioend->io_inode);
+       struct xfs_mount        *mp = ip->i_mount;
        unsigned long           flags;
 
+       if (bio_is_zone_append(bio)) {
+               /*
+                * Record the actually written block number and make sure we
+                * don't merge the first ioend for a zone into the last one
+                * for the previous zone.
+                */
+               ioend->io_sector = bio->bi_iter.bi_sector;
+               if (!(xfs_daddr_to_rtb(mp, ioend->io_sector) %
+                     mp->m_groups[XG_TYPE_RTG].blocks))
+                       ioend->io_flags |= IOMAP_F_BOUNDARY;
+       }
+
        spin_lock_irqsave(&ip->i_ioend_lock, flags);
        if (list_empty(&ip->i_ioend_list))
-               WARN_ON_ONCE(!queue_work(ip->i_mount->m_unwritten_workqueue,
+               WARN_ON_ONCE(!queue_work(mp->m_unwritten_workqueue,
                                         &ip->i_ioend_work));
        list_add_tail(&ioend->io_list, &ip->i_ioend_list);
        spin_unlock_irqrestore(&ip->i_ioend_lock, flags);
@@ -462,7 +482,7 @@ xfs_discard_folio(
         * folio itself and not the start offset that is passed in.
         */
        xfs_bmap_punch_delalloc_range(ip, XFS_DATA_FORK, pos,
-                               folio_pos(folio) + folio_size(folio));
+                               folio_pos(folio) + folio_size(folio), NULL);
 }
 
 static const struct iomap_writeback_ops xfs_writeback_ops = {
@@ -471,14 +491,116 @@ static const struct iomap_writeback_ops xfs_writeback_ops = {
        .discard_folio          = xfs_discard_folio,
 };
 
+struct xfs_zoned_writepage_ctx {
+       struct iomap_writepage_ctx ctx;
+       struct xfs_rtgroup      *rtg;
+};
+
+static inline struct xfs_zoned_writepage_ctx *
+XFS_ZWPC(struct iomap_writepage_ctx *ctx)
+{
+       return container_of(ctx, struct xfs_zoned_writepage_ctx, ctx);
+}
+
+static int
+xfs_zoned_map_blocks(
+       struct iomap_writepage_ctx *wpc,
+       struct inode            *inode,
+       loff_t                  offset,
+       unsigned int            len)
+{
+       struct xfs_inode        *ip = XFS_I(inode);
+       struct xfs_mount        *mp = ip->i_mount;
+       xfs_fileoff_t           offset_fsb = XFS_B_TO_FSBT(mp, offset);
+       xfs_fileoff_t           end_fsb = XFS_B_TO_FSB(mp, offset + len);
+       xfs_filblks_t           count_fsb;
+       struct xfs_bmbt_irec    imap, del;
+       struct xfs_iext_cursor  icur;
+
+       if (xfs_is_shutdown(mp))
+               return -EIO;
+
+       XFS_ERRORTAG_DELAY(mp, XFS_ERRTAG_WB_DELAY_MS);
+
+       /*
+        * All dirty data must be covered by delalloc extents.  But truncate can
+        * remove delalloc extents underneath us or reduce their size.
+        * Returning a hole tells iomap to not write back any data from this
+        * range, which is the right thing to do in that case.
+        *
+        * Otherwise just tell iomap to treat ranges previously covered by a
+        * delalloc extent as mapped.  The actual block allocation will be done
+        * just before submitting the bio.
+        */
+       xfs_ilock(ip, XFS_ILOCK_EXCL);
+       if (!xfs_iext_lookup_extent(ip, ip->i_cowfp, offset_fsb, &icur, &imap))
+               imap.br_startoff = end_fsb;     /* fake a hole past EOF */
+       if (imap.br_startoff > offset_fsb) {
+               imap.br_blockcount = imap.br_startoff - offset_fsb;
+               imap.br_startoff = offset_fsb;
+               imap.br_startblock = HOLESTARTBLOCK;
+               imap.br_state = XFS_EXT_NORM;
+               xfs_iunlock(ip, XFS_ILOCK_EXCL);
+               xfs_bmbt_to_iomap(ip, &wpc->iomap, &imap, 0, 0, 0);
+               return 0;
+       }
+       end_fsb = min(end_fsb, imap.br_startoff + imap.br_blockcount);
+       count_fsb = end_fsb - offset_fsb;
+
+       del = imap;
+       xfs_trim_extent(&del, offset_fsb, count_fsb);
+       xfs_bmap_del_extent_delay(ip, XFS_COW_FORK, &icur, &imap, &del,
+                       XFS_BMAPI_REMAP);
+       xfs_iunlock(ip, XFS_ILOCK_EXCL);
+
+       wpc->iomap.type = IOMAP_MAPPED;
+       wpc->iomap.flags = IOMAP_F_DIRTY;
+       wpc->iomap.bdev = mp->m_rtdev_targp->bt_bdev;
+       wpc->iomap.offset = offset;
+       wpc->iomap.length = XFS_FSB_TO_B(mp, count_fsb);
+       wpc->iomap.flags = IOMAP_F_ZONE_APPEND;
+       wpc->iomap.addr = 0;
+
+       trace_xfs_zoned_map_blocks(ip, offset, wpc->iomap.length);
+       return 0;
+}
+
+static int
+xfs_zoned_submit_ioend(
+       struct iomap_writepage_ctx *wpc,
+       int                     status)
+{
+       wpc->ioend->io_bio.bi_end_io = xfs_end_bio;
+       if (status)
+               return status;
+       xfs_zone_alloc_and_submit(wpc->ioend, &XFS_ZWPC(wpc)->rtg);
+       return 0;
+}
+
+static const struct iomap_writeback_ops xfs_zoned_writeback_ops = {
+       .map_blocks             = xfs_zoned_map_blocks,
+       .submit_ioend           = xfs_zoned_submit_ioend,
+       .discard_folio          = xfs_discard_folio,
+};
+
 STATIC int
 xfs_vm_writepages(
        struct address_space    *mapping,
        struct writeback_control *wbc)
 {
+       struct xfs_inode        *ip = XFS_I(mapping->host);
        struct xfs_writepage_ctx wpc = { };
+       int                     error;
 
-       xfs_iflags_clear(XFS_I(mapping->host), XFS_ITRUNCATED);
+       xfs_iflags_clear(ip, XFS_ITRUNCATED);
+       if (xfs_is_zoned_inode(ip)) {
+               struct xfs_zoned_writepage_ctx xc = { };
+
+               error = iomap_writepages(mapping, wbc, &xc.ctx,
+                                        &xfs_zoned_writeback_ops);
+               xfs_zone_finish_alloc(xc.rtg);
+               return error;
+       }
        return iomap_writepages(mapping, wbc, &wpc.ctx, &xfs_writeback_ops);
 }
 
index c96187f981bdd38be93ecf9d4e66580d2780dc5e..906016c2c57c6d6ccc7ccf5ef32a720fc33595db 100644 (file)
@@ -11,6 +11,7 @@ struct iomap_ioend;
 extern const struct address_space_operations xfs_address_space_operations;
 extern const struct address_space_operations xfs_dax_aops;
 
-int    xfs_setfilesize(struct xfs_inode *ip, xfs_off_t offset, size_t size);
+int xfs_setfilesize(struct xfs_inode *ip, xfs_off_t offset, size_t size);
+void xfs_end_bio(struct bio *bio);
 
 #endif /* __XFS_AOPS_H__ */
index 67a5940285f163da8696dad96fc2edb19f0e7204..05fd768f7dcd76d557aee4976643b535a8c995fb 100644 (file)
@@ -30,6 +30,7 @@
 #include "xfs_reflink.h"
 #include "xfs_rtbitmap.h"
 #include "xfs_rtgroup.h"
+#include "xfs_zone_alloc.h"
 
 /* Kernel only BMAP related definitions and functions */
 
@@ -436,7 +437,8 @@ xfs_bmap_punch_delalloc_range(
        struct xfs_inode        *ip,
        int                     whichfork,
        xfs_off_t               start_byte,
-       xfs_off_t               end_byte)
+       xfs_off_t               end_byte,
+       struct xfs_zone_alloc_ctx *ac)
 {
        struct xfs_mount        *mp = ip->i_mount;
        struct xfs_ifork        *ifp = xfs_ifork_ptr(ip, whichfork);
@@ -467,7 +469,10 @@ xfs_bmap_punch_delalloc_range(
                        continue;
                }
 
-               xfs_bmap_del_extent_delay(ip, whichfork, &icur, &got, &del, 0);
+               xfs_bmap_del_extent_delay(ip, whichfork, &icur, &got, &del,
+                               ac ? XFS_BMAPI_REMAP : 0);
+               if (xfs_is_zoned_inode(ip) && ac)
+                       ac->reserved_blocks += del.br_blockcount;
                if (!xfs_iext_get_extent(ifp, &icur, &got))
                        break;
        }
@@ -582,7 +587,7 @@ xfs_free_eofblocks(
                if (ip->i_delayed_blks) {
                        xfs_bmap_punch_delalloc_range(ip, XFS_DATA_FORK,
                                round_up(XFS_ISIZE(ip), mp->m_sb.sb_blocksize),
-                               LLONG_MAX);
+                               LLONG_MAX, NULL);
                }
                xfs_inode_clear_eofblocks_tag(ip);
                return 0;
@@ -823,14 +828,15 @@ xfs_flush_unmap_range(
 
 int
 xfs_free_file_space(
-       struct xfs_inode        *ip,
-       xfs_off_t               offset,
-       xfs_off_t               len)
+       struct xfs_inode                *ip,
+       xfs_off_t                       offset,
+       xfs_off_t                       len,
+       struct xfs_zone_alloc_ctx       *ac)
 {
-       struct xfs_mount        *mp = ip->i_mount;
-       xfs_fileoff_t           startoffset_fsb;
-       xfs_fileoff_t           endoffset_fsb;
-       int                     done = 0, error;
+       struct xfs_mount                *mp = ip->i_mount;
+       xfs_fileoff_t                   startoffset_fsb;
+       xfs_fileoff_t                   endoffset_fsb;
+       int                             done = 0, error;
 
        trace_xfs_free_file_space(ip);
 
@@ -880,7 +886,7 @@ xfs_free_file_space(
                return 0;
        if (offset + len > XFS_ISIZE(ip))
                len = XFS_ISIZE(ip) - offset;
-       error = xfs_zero_range(ip, offset, len, NULL);
+       error = xfs_zero_range(ip, offset, len, ac, NULL);
        if (error)
                return error;
 
@@ -968,7 +974,8 @@ int
 xfs_collapse_file_space(
        struct xfs_inode        *ip,
        xfs_off_t               offset,
-       xfs_off_t               len)
+       xfs_off_t               len,
+       struct xfs_zone_alloc_ctx *ac)
 {
        struct xfs_mount        *mp = ip->i_mount;
        struct xfs_trans        *tp;
@@ -981,7 +988,7 @@ xfs_collapse_file_space(
 
        trace_xfs_collapse_file_space(ip);
 
-       error = xfs_free_file_space(ip, offset, len);
+       error = xfs_free_file_space(ip, offset, len, ac);
        if (error)
                return error;
 
index b29760d36e1ab1ef5e8392c1457180ce3ac9f59d..41a5b70e19dbefa74e6c2ff8998a967de991b859 100644 (file)
@@ -15,6 +15,7 @@ struct xfs_inode;
 struct xfs_mount;
 struct xfs_trans;
 struct xfs_bmalloca;
+struct xfs_zone_alloc_ctx;
 
 #ifdef CONFIG_XFS_RT
 int    xfs_bmap_rtalloc(struct xfs_bmalloca *ap);
@@ -31,7 +32,8 @@ xfs_bmap_rtalloc(struct xfs_bmalloca *ap)
 #endif /* CONFIG_XFS_RT */
 
 void   xfs_bmap_punch_delalloc_range(struct xfs_inode *ip, int whichfork,
-               xfs_off_t start_byte, xfs_off_t end_byte);
+               xfs_off_t start_byte, xfs_off_t end_byte,
+               struct xfs_zone_alloc_ctx *ac);
 
 struct kgetbmap {
        __s64           bmv_offset;     /* file offset of segment in blocks */
@@ -56,9 +58,9 @@ int   xfs_bmap_last_extent(struct xfs_trans *tp, struct xfs_inode *ip,
 int    xfs_alloc_file_space(struct xfs_inode *ip, xfs_off_t offset,
                             xfs_off_t len);
 int    xfs_free_file_space(struct xfs_inode *ip, xfs_off_t offset,
-                           xfs_off_t len);
+                           xfs_off_t len, struct xfs_zone_alloc_ctx *ac);
 int    xfs_collapse_file_space(struct xfs_inode *, xfs_off_t offset,
-                               xfs_off_t len);
+                               xfs_off_t len, struct xfs_zone_alloc_ctx *ac);
 int    xfs_insert_file_space(struct xfs_inode *, xfs_off_t offset,
                                xfs_off_t len);
 
index c4bd145f5ec1bfb1b0fda2bf5a78878c9a2b70e9..70ff5b92882ddfbbb604c8656cd9154fe90f8484 100644 (file)
@@ -844,15 +844,19 @@ xfs_ioc_trim(
 
        if (!capable(CAP_SYS_ADMIN))
                return -EPERM;
-       if (mp->m_rtdev_targp &&
+
+       if (mp->m_rtdev_targp && !xfs_has_zoned(mp) &&
            bdev_max_discard_sectors(mp->m_rtdev_targp->bt_bdev))
                rt_bdev = mp->m_rtdev_targp->bt_bdev;
        if (!bdev_max_discard_sectors(mp->m_ddev_targp->bt_bdev) && !rt_bdev)
                return -EOPNOTSUPP;
 
-       if (rt_bdev)
+       if (rt_bdev) {
+               if (!bdev_max_discard_sectors(rt_bdev))
+                       return -EOPNOTSUPP;
                granularity = max(granularity,
                                  bdev_discard_granularity(rt_bdev));
+       }
 
        /*
         * We haven't recovered the log, so we cannot use our bnobt-guided
index a25c713ff888c7c773c8fd4f87751feee82e1ffd..a8d1817542b0080e8e6d14475d93ed0002d3ccfa 100644 (file)
@@ -29,6 +29,7 @@
 #include "xfs_inode.h"
 #include "xfs_rtbitmap.h"
 #include "xfs_rtgroup.h"
+#include "xfs_zone_alloc.h"
 
 struct kmem_cache      *xfs_efi_cache;
 struct kmem_cache      *xfs_efd_cache;
@@ -774,14 +775,21 @@ xfs_rtextent_free_finish_item(
                        xfs_rtgroup_trans_join(tp, *rtgp,
                                        XFS_RTGLOCK_BITMAP);
                }
-               error = xfs_rtfree_blocks(tp, *rtgp,
-                               xefi->xefi_startblock, xefi->xefi_blockcount);
+
+               if (xfs_has_zoned(mp)) {
+                       error = xfs_zone_free_blocks(tp, *rtgp,
+                                       xefi->xefi_startblock,
+                                       xefi->xefi_blockcount);
+               } else {
+                       error = xfs_rtfree_blocks(tp, *rtgp,
+                                       xefi->xefi_startblock,
+                                       xefi->xefi_blockcount);
+               }
        }
        if (error == -EAGAIN) {
                xfs_efd_from_efi(efdp);
                return error;
        }
-
        xfs_efd_add_extent(efdp, xefi);
        xfs_extent_free_cancel_item(item);
        return error;
index a952d3faa06611e650f2c41f600c7a8ac9b39681..6cccb7c9560b7a66573226fa608be13ae11da4cf 100644 (file)
@@ -25,6 +25,9 @@
 #include "xfs_iomap.h"
 #include "xfs_reflink.h"
 #include "xfs_file.h"
+#include "xfs_aops.h"
+#include "xfs_zone_alloc.h"
+#include "xfs_rtbitmap.h"
 
 #include <linux/dax.h>
 #include <linux/falloc.h>
@@ -360,7 +363,8 @@ xfs_file_write_zero_eof(
        struct iov_iter         *from,
        unsigned int            *iolock,
        size_t                  count,
-       bool                    *drained_dio)
+       bool                    *drained_dio,
+       struct xfs_zone_alloc_ctx *ac)
 {
        struct xfs_inode        *ip = XFS_I(iocb->ki_filp->f_mapping->host);
        loff_t                  isize;
@@ -414,7 +418,7 @@ xfs_file_write_zero_eof(
        trace_xfs_zero_eof(ip, isize, iocb->ki_pos - isize);
 
        xfs_ilock(ip, XFS_MMAPLOCK_EXCL);
-       error = xfs_zero_range(ip, isize, iocb->ki_pos - isize, NULL);
+       error = xfs_zero_range(ip, isize, iocb->ki_pos - isize, ac, NULL);
        xfs_iunlock(ip, XFS_MMAPLOCK_EXCL);
 
        return error;
@@ -431,7 +435,8 @@ STATIC ssize_t
 xfs_file_write_checks(
        struct kiocb            *iocb,
        struct iov_iter         *from,
-       unsigned int            *iolock)
+       unsigned int            *iolock,
+       struct xfs_zone_alloc_ctx *ac)
 {
        struct inode            *inode = iocb->ki_filp->f_mapping->host;
        size_t                  count = iov_iter_count(from);
@@ -478,10 +483,15 @@ restart:
         * can only extend EOF.  Truncate is locked out at this point, so the
         * EOF can not move backwards, only forwards. Hence we only need to take
         * the slow path when we are at or beyond the current EOF.
+        *
+        * For zoned file systems, we never allocate speculative blocks, so
+        * there is no need to zero anything.  The tail of the block beyond
+        * i_size was already zeroed when writing it, and the beginning of
+        * the block where the write starts will be zeroed by the write itself.
         */
        if (iocb->ki_pos > i_size_read(inode)) {
                error = xfs_file_write_zero_eof(iocb, from, iolock, count,
-                               &drained_dio);
+                               &drained_dio, ac);
                if (error == 1)
                        goto restart;
                if (error)
@@ -503,6 +513,9 @@ xfs_dio_write_end_io(
        loff_t                  offset = iocb->ki_pos;
        unsigned int            nofs_flag;
 
+       ASSERT(!xfs_is_zoned_inode(ip) ||
+              !(flags & (IOMAP_DIO_UNWRITTEN | IOMAP_DIO_COW)));
+
        trace_xfs_end_io_direct_write(ip, offset, size);
 
        if (xfs_is_shutdown(ip->i_mount))
@@ -582,14 +595,94 @@ static const struct iomap_dio_ops xfs_dio_write_ops = {
        .end_io         = xfs_dio_write_end_io,
 };
 
+static void
+xfs_dio_zoned_submit_io(
+       const struct iomap_iter *iter,
+       struct bio              *bio,
+       loff_t                  file_offset)
+{
+       struct xfs_mount        *mp = XFS_I(iter->inode)->i_mount;
+       struct xfs_zone_alloc_ctx *ac = iter->private;
+       xfs_filblks_t           count_fsb;
+       struct iomap_ioend      *ioend;
+
+       count_fsb = XFS_B_TO_FSB(mp, bio->bi_iter.bi_size);
+       if (count_fsb > ac->reserved_blocks) {
+               xfs_err(mp,
+"allocation (%lld) larger than reservation (%lld).",
+                       count_fsb, ac->reserved_blocks);
+               xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
+               bio_io_error(bio);
+               return;
+       }
+       ac->reserved_blocks -= count_fsb;
+
+       bio->bi_end_io = xfs_end_bio;
+       ioend = iomap_init_ioend(iter->inode, bio, file_offset,
+                       IOMAP_MAPPED, 0, true);
+       xfs_zone_alloc_and_submit(ioend, &ac->cached_rtg);
+}
+
+static const struct iomap_dio_ops xfs_dio_zoned_write_ops = {
+       .bio_set        = &iomap_ioend_bioset,
+       .submit_io      = xfs_dio_zoned_submit_io,
+       .end_io         = xfs_dio_write_end_io,
+};
+
+static ssize_t
+xfs_zoned_write_space_reserve(
+       struct xfs_inode                *ip,
+       struct kiocb                    *iocb,
+       struct iov_iter                 *from,
+       unsigned int                    flags,
+       struct xfs_zone_alloc_ctx       *ac)
+{
+       loff_t                          count = iov_iter_count(from);
+       int                             error;
+
+       if (iocb->ki_flags & IOCB_NOWAIT)
+               flags |= XFS_ZR_NOWAIT;
+
+       /*
+        * Check the rlimit and LFS boundary first so that we don't over-reserve
+        * by possibly a lot.
+        *
+        * The generic write path will redo this check later, and it might have
+        * changed by then.  If it got expanded we'll stick to our earlier
+        * smaller limit, and if it is decreased the new smaller limit will be
+        * used and our extra space reservation will be returned after finishing
+        * the write.
+        */
+       error = generic_write_check_limits(iocb->ki_filp, iocb->ki_pos, &count);
+       if (error)
+               return error;
+
+       /*
+        * Sloppily round up count to file system blocks.
+        *
+        * This will often reserve an extra block, but that avoids having to look
+        * at the start offset, which isn't stable for O_APPEND until taking the
+        * iolock.  Also we need to reserve a block each for zeroing the old
+        * EOF block and the new start block if they are unaligned.
+        *
+        * Any remaining block will be returned after the write.
+        */
+       return xfs_zoned_space_reserve(ip,
+                       XFS_B_TO_FSB(ip->i_mount, count) + 1 + 2,
+                       flags, ac);
+}
+
 /*
- * Handle block aligned direct I/O writes
+ * Handle block aligned direct I/O writes.
  */
 static noinline ssize_t
 xfs_file_dio_write_aligned(
        struct xfs_inode        *ip,
        struct kiocb            *iocb,
-       struct iov_iter         *from)
+       struct iov_iter         *from,
+       const struct iomap_ops  *ops,
+       const struct iomap_dio_ops *dops,
+       struct xfs_zone_alloc_ctx *ac)
 {
        unsigned int            iolock = XFS_IOLOCK_SHARED;
        ssize_t                 ret;
@@ -597,7 +690,7 @@ xfs_file_dio_write_aligned(
        ret = xfs_ilock_iocb_for_write(iocb, &iolock);
        if (ret)
                return ret;
-       ret = xfs_file_write_checks(iocb, from, &iolock);
+       ret = xfs_file_write_checks(iocb, from, &iolock, ac);
        if (ret)
                goto out_unlock;
 
@@ -611,11 +704,31 @@ xfs_file_dio_write_aligned(
                iolock = XFS_IOLOCK_SHARED;
        }
        trace_xfs_file_direct_write(iocb, from);
-       ret = iomap_dio_rw(iocb, from, &xfs_direct_write_iomap_ops,
-                          &xfs_dio_write_ops, 0, NULL, 0);
+       ret = iomap_dio_rw(iocb, from, ops, dops, 0, ac, 0);
 out_unlock:
-       if (iolock)
-               xfs_iunlock(ip, iolock);
+       xfs_iunlock(ip, iolock);
+       return ret;
+}
+
+/*
+ * Handle block aligned direct I/O writes to zoned devices.
+ */
+static noinline ssize_t
+xfs_file_dio_write_zoned(
+       struct xfs_inode        *ip,
+       struct kiocb            *iocb,
+       struct iov_iter         *from)
+{
+       struct xfs_zone_alloc_ctx ac;
+       ssize_t                 ret;
+
+       ret = xfs_zoned_write_space_reserve(ip, iocb, from, 0, &ac);
+       if (ret < 0)
+               return ret;
+       ret = xfs_file_dio_write_aligned(ip, iocb, from,
+                       &xfs_zoned_direct_write_iomap_ops,
+                       &xfs_dio_zoned_write_ops, &ac);
+       xfs_zoned_space_unreserve(ip, &ac);
        return ret;
 }
 
@@ -675,7 +788,7 @@ retry_exclusive:
                goto out_unlock;
        }
 
-       ret = xfs_file_write_checks(iocb, from, &iolock);
+       ret = xfs_file_write_checks(iocb, from, &iolock, NULL);
        if (ret)
                goto out_unlock;
 
@@ -732,7 +845,10 @@ xfs_file_dio_write(
            (xfs_is_always_cow_inode(ip) &&
             (iov_iter_alignment(from) & ip->i_mount->m_blockmask)))
                return xfs_file_dio_write_unaligned(ip, iocb, from);
-       return xfs_file_dio_write_aligned(ip, iocb, from);
+       if (xfs_is_zoned_inode(ip))
+               return xfs_file_dio_write_zoned(ip, iocb, from);
+       return xfs_file_dio_write_aligned(ip, iocb, from,
+                       &xfs_direct_write_iomap_ops, &xfs_dio_write_ops, NULL);
 }
 
 static noinline ssize_t
@@ -749,7 +865,7 @@ xfs_file_dax_write(
        ret = xfs_ilock_iocb(iocb, iolock);
        if (ret)
                return ret;
-       ret = xfs_file_write_checks(iocb, from, &iolock);
+       ret = xfs_file_write_checks(iocb, from, &iolock, NULL);
        if (ret)
                goto out;
 
@@ -793,7 +909,7 @@ write_retry:
        if (ret)
                return ret;
 
-       ret = xfs_file_write_checks(iocb, from, &iolock);
+       ret = xfs_file_write_checks(iocb, from, &iolock, NULL);
        if (ret)
                goto out;
 
@@ -840,6 +956,67 @@ out:
        return ret;
 }
 
+STATIC ssize_t
+xfs_file_buffered_write_zoned(
+       struct kiocb            *iocb,
+       struct iov_iter         *from)
+{
+       struct xfs_inode        *ip = XFS_I(iocb->ki_filp->f_mapping->host);
+       struct xfs_mount        *mp = ip->i_mount;
+       unsigned int            iolock = XFS_IOLOCK_EXCL;
+       bool                    cleared_space = false;
+       struct xfs_zone_alloc_ctx ac;
+       ssize_t                 ret;
+
+       ret = xfs_zoned_write_space_reserve(ip, iocb, from, XFS_ZR_GREEDY, &ac);
+       if (ret < 0)
+               return ret;
+
+       ret = xfs_ilock_iocb(iocb, iolock);
+       if (ret)
+               goto out_unreserve;
+
+       ret = xfs_file_write_checks(iocb, from, &iolock, &ac);
+       if (ret)
+               goto out_unlock;
+
+       /*
+        * Truncate the iter to the length that we were actually able to
+        * allocate blocks for.  This needs to happen after
+        * xfs_file_write_checks, because that assigns ki_pos for O_APPEND
+        * writes.
+        */
+       iov_iter_truncate(from,
+                       XFS_FSB_TO_B(mp, ac.reserved_blocks) -
+                       (iocb->ki_pos & mp->m_blockmask));
+       if (!iov_iter_count(from))
+               goto out_unlock;
+
+retry:
+       trace_xfs_file_buffered_write(iocb, from);
+       ret = iomap_file_buffered_write(iocb, from,
+                       &xfs_buffered_write_iomap_ops, &ac);
+       if (ret == -ENOSPC && !cleared_space) {
+               /* 
+                * Kick off writeback to convert delalloc space and release the
+                * usually too pessimistic indirect block reservations.
+                */
+               xfs_flush_inodes(mp);
+               cleared_space = true;
+               goto retry;
+       }
+
+out_unlock:
+       xfs_iunlock(ip, iolock);
+out_unreserve:
+       xfs_zoned_space_unreserve(ip, &ac);
+       if (ret > 0) {
+               XFS_STATS_ADD(mp, xs_write_bytes, ret);
+               ret = generic_write_sync(iocb, ret);
+       }
+       return ret;
+}
+
 STATIC ssize_t
 xfs_file_write_iter(
        struct kiocb            *iocb,
@@ -873,6 +1050,8 @@ xfs_file_write_iter(
                        return ret;
        }
 
+       if (xfs_is_zoned_inode(ip))
+               return xfs_file_buffered_write_zoned(iocb, from);
        return xfs_file_buffered_write(iocb, from);
 }
 
@@ -927,7 +1106,8 @@ static int
 xfs_falloc_collapse_range(
        struct file             *file,
        loff_t                  offset,
-       loff_t                  len)
+       loff_t                  len,
+       struct xfs_zone_alloc_ctx *ac)
 {
        struct inode            *inode = file_inode(file);
        loff_t                  new_size = i_size_read(inode) - len;
@@ -943,7 +1123,7 @@ xfs_falloc_collapse_range(
        if (offset + len >= i_size_read(inode))
                return -EINVAL;
 
-       error = xfs_collapse_file_space(XFS_I(inode), offset, len);
+       error = xfs_collapse_file_space(XFS_I(inode), offset, len, ac);
        if (error)
                return error;
        return xfs_falloc_setsize(file, new_size);
@@ -999,7 +1179,8 @@ xfs_falloc_zero_range(
        struct file             *file,
        int                     mode,
        loff_t                  offset,
-       loff_t                  len)
+       loff_t                  len,
+       struct xfs_zone_alloc_ctx *ac)
 {
        struct inode            *inode = file_inode(file);
        unsigned int            blksize = i_blocksize(inode);
@@ -1012,7 +1193,7 @@ xfs_falloc_zero_range(
        if (error)
                return error;
 
-       error = xfs_free_file_space(XFS_I(inode), offset, len);
+       error = xfs_free_file_space(XFS_I(inode), offset, len, ac);
        if (error)
                return error;
 
@@ -1093,12 +1274,29 @@ xfs_file_fallocate(
        struct xfs_inode        *ip = XFS_I(inode);
        long                    error;
        uint                    iolock = XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL;
+       struct xfs_zone_alloc_ctx ac = { };
 
        if (!S_ISREG(inode->i_mode))
                return -EINVAL;
        if (mode & ~XFS_FALLOC_FL_SUPPORTED)
                return -EOPNOTSUPP;
 
+       /*
+        * For zoned file systems, zeroing the first and last block of a hole
+        * punch requires allocating a new block to rewrite the remaining data
+        * and new zeroes out of place.  Get a reservations for those before
+        * and new zeroes out of place.  Get a reservation for those before
+        * expected to be able to punch a hole even on a completely full
+        * file system.
+        */
+       if (xfs_is_zoned_inode(ip) &&
+           (mode & (FALLOC_FL_PUNCH_HOLE | FALLOC_FL_ZERO_RANGE |
+                    FALLOC_FL_COLLAPSE_RANGE))) {
+               error = xfs_zoned_space_reserve(ip, 2, XFS_ZR_RESERVED, &ac);
+               if (error)
+                       return error;
+       }
+
        xfs_ilock(ip, iolock);
        error = xfs_break_layouts(inode, &iolock, BREAK_UNMAP);
        if (error)
@@ -1119,16 +1317,16 @@ xfs_file_fallocate(
 
        switch (mode & FALLOC_FL_MODE_MASK) {
        case FALLOC_FL_PUNCH_HOLE:
-               error = xfs_free_file_space(ip, offset, len);
+               error = xfs_free_file_space(ip, offset, len, &ac);
                break;
        case FALLOC_FL_COLLAPSE_RANGE:
-               error = xfs_falloc_collapse_range(file, offset, len);
+               error = xfs_falloc_collapse_range(file, offset, len, &ac);
                break;
        case FALLOC_FL_INSERT_RANGE:
                error = xfs_falloc_insert_range(file, offset, len);
                break;
        case FALLOC_FL_ZERO_RANGE:
-               error = xfs_falloc_zero_range(file, mode, offset, len);
+               error = xfs_falloc_zero_range(file, mode, offset, len, &ac);
                break;
        case FALLOC_FL_UNSHARE_RANGE:
                error = xfs_falloc_unshare_range(file, mode, offset, len);
@@ -1146,6 +1344,8 @@ xfs_file_fallocate(
 
 out_unlock:
        xfs_iunlock(ip, iolock);
+       if (xfs_is_zoned_inode(ip))
+               xfs_zoned_space_unreserve(ip, &ac);
        return error;
 }
 
@@ -1449,8 +1649,24 @@ xfs_write_fault(
        struct inode            *inode = file_inode(vmf->vma->vm_file);
        struct xfs_inode        *ip = XFS_I(inode);
        unsigned int            lock_mode = XFS_MMAPLOCK_SHARED;
+       struct xfs_zone_alloc_ctx ac;
+       int                     error;
        vm_fault_t              ret;
 
+       if (xfs_is_zoned_inode(ip)) {
+               /*
+                * This could over-allocate as it doesn't check for truncation.
+                * But as the overallocation is limited to less than a folio and
+                * will be released instantly, that's just fine.
+                */
+               unsigned int    len = folio_size(page_folio(vmf->page));
+
+               error = xfs_zoned_space_reserve(ip,
+                               XFS_B_TO_FSB(ip->i_mount, len), 0, &ac);
+               if (error < 0)
+                       return vmf_fs_error(error);
+       }
+
        sb_start_pagefault(inode->i_sb);
        file_update_time(vmf->vma->vm_file);
 
@@ -1470,10 +1686,12 @@ xfs_write_fault(
                ret = xfs_dax_fault_locked(vmf, order, true);
        else
                ret = iomap_page_mkwrite(vmf, &xfs_page_mkwrite_iomap_ops,
-                               NULL);
+                               xfs_is_zoned_inode(ip) ? &ac : NULL);
        xfs_iunlock(ip, lock_mode);
 
        sb_end_pagefault(inode->i_sb);
+       if (xfs_is_zoned_inode(ip))
+               xfs_zoned_space_unreserve(ip, &ac);
        return ret;
 }
 
index 84d81f29222dbe987d0fce1f765ee514a7e51ce3..342b34e437f829a96a435aaff0a42ad428e25871 100644 (file)
@@ -2073,7 +2073,7 @@ xfs_inodegc_want_queue_rt_file(
 {
        struct xfs_mount        *mp = ip->i_mount;
 
-       if (!XFS_IS_REALTIME_INODE(ip))
+       if (!XFS_IS_REALTIME_INODE(ip) || xfs_has_zoned(mp))
                return false;
 
        if (xfs_compare_freecounter(mp, FREE_RTEXTENTS,
index 0ece40d38f4bd02cbd97d5d5d59e2798339131f9..254c59ae77d096abb95c7062fd9003ebf214ff59 100644 (file)
@@ -605,7 +605,8 @@ int xfs_break_layouts(struct inode *inode, uint *iolock,
 
 static inline void xfs_update_stable_writes(struct xfs_inode *ip)
 {
-       if (bdev_stable_writes(xfs_inode_buftarg(ip)->bt_bdev))
+       if (xfs_is_zoned_inode(ip) ||
+           bdev_stable_writes(xfs_inode_buftarg(ip)->bt_bdev))
                mapping_set_stable_writes(VFS_I(ip)->i_mapping);
        else
                mapping_clear_stable_writes(VFS_I(ip)->i_mapping);
index 4e25748b682279e6f185a1dd1ba892c63dd99f80..65b222281169ad6ec169c03f9d4ffde8a5856c6e 100644 (file)
@@ -41,6 +41,7 @@
 #include "xfs_exchrange.h"
 #include "xfs_handle.h"
 #include "xfs_rtgroup.h"
+#include "xfs_zone_alloc.h"
 
 #include <linux/mount.h>
 #include <linux/fileattr.h>
index e6854001b35487f05353f0165240ebf5b5bac633..8f24a2a357b571285e97cf3a60560d9ef741d588 100644 (file)
@@ -30,6 +30,8 @@
 #include "xfs_reflink.h"
 #include "xfs_health.h"
 #include "xfs_rtbitmap.h"
+#include "xfs_icache.h"
+#include "xfs_zone_alloc.h"
 
 #define XFS_ALLOC_ALIGN(mp, off) \
        (((off) >> mp->m_allocsize_log) << mp->m_allocsize_log)
@@ -170,17 +172,6 @@ xfs_hole_to_iomap(
        iomap->dax_dev = target->bt_daxdev;
 }
 
-static inline xfs_fileoff_t
-xfs_iomap_end_fsb(
-       struct xfs_mount        *mp,
-       loff_t                  offset,
-       loff_t                  count)
-{
-       ASSERT(offset <= mp->m_super->s_maxbytes);
-       return min(XFS_B_TO_FSB(mp, offset + count),
-                  XFS_B_TO_FSB(mp, mp->m_super->s_maxbytes));
-}
-
 static xfs_extlen_t
 xfs_eof_alignment(
        struct xfs_inode        *ip)
@@ -963,6 +954,60 @@ const struct iomap_ops xfs_direct_write_iomap_ops = {
        .iomap_begin            = xfs_direct_write_iomap_begin,
 };
 
+#ifdef CONFIG_XFS_RT
+/*
+ * This is really simple.  The space has already been reserved before taking the
+ * IOLOCK; the actual block allocation is done just before submitting the bio
+ * and only recorded in the extent map on I/O completion.
+ */
+static int
+xfs_zoned_direct_write_iomap_begin(
+       struct inode            *inode,
+       loff_t                  offset,
+       loff_t                  length,
+       unsigned                flags,
+       struct iomap            *iomap,
+       struct iomap            *srcmap)
+{
+       struct xfs_inode        *ip = XFS_I(inode);
+       int                     error;
+
+       ASSERT(!(flags & IOMAP_OVERWRITE_ONLY));
+
+       /*
+        * NOWAIT support needs to be pushed down into the allocator so that
+        * writes confined to a single zone can be supported without blocking.
+        */
+       if (flags & IOMAP_NOWAIT)
+               return -EAGAIN;
+
+       /*
+        * Ensure the extent list is in memory so that we don't have to
+        * read it in from the I/O completion handler.
+        */
+       if (xfs_need_iread_extents(&ip->i_df)) {
+               xfs_ilock(ip, XFS_ILOCK_EXCL);
+               error = xfs_iread_extents(NULL, ip, XFS_DATA_FORK);
+               xfs_iunlock(ip, XFS_ILOCK_EXCL);
+               if (error)
+                       return error;
+       }
+
+       iomap->type = IOMAP_MAPPED;
+       iomap->flags = IOMAP_F_DIRTY;
+       iomap->bdev = ip->i_mount->m_rtdev_targp->bt_bdev;
+       iomap->offset = offset;
+       iomap->length = length;
+       iomap->flags |= IOMAP_F_ZONE_APPEND;
+       iomap->addr = 0;
+       return 0;
+}
+
+const struct iomap_ops xfs_zoned_direct_write_iomap_ops = {
+       .iomap_begin            = xfs_zoned_direct_write_iomap_begin,
+};
+#endif /* CONFIG_XFS_RT */
+
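
The comment above describes the zoned direct write ordering: space is reserved before the IOLOCK is taken, the on-disk location is only chosen when the bio is built, and the extent map is updated at I/O completion.  A simplified userspace sketch of that out-of-place pipeline follows; all toy_* names are invented for illustration and none of this is the kernel API:

#include <stdint.h>
#include <stdio.h>

/* Toy model of the zoned write pipeline described above. */
struct toy_zone { uint64_t write_pointer; };
struct toy_mapping { uint64_t file_block, disk_block, len; };

static int toy_reserve(uint64_t *available, uint64_t len)
{
        if (*available < len)
                return -1;      /* the kernel would wait for GC here */
        *available -= len;
        return 0;
}

/* The physical location is only chosen at submission time. */
static uint64_t toy_submit(struct toy_zone *z, uint64_t len)
{
        uint64_t start = z->write_pointer;

        z->write_pointer += len;
        return start;
}

/* Only at completion is the mapping recorded in the (toy) extent map. */
static void toy_complete(struct toy_mapping *m, uint64_t file_block,
                         uint64_t disk_block, uint64_t len)
{
        m->file_block = file_block;
        m->disk_block = disk_block;
        m->len = len;
}

int main(void)
{
        uint64_t available = 1024;
        struct toy_zone z = { 0 };
        struct toy_mapping m;

        if (toy_reserve(&available, 8))
                return 1;
        toy_complete(&m, 0, toy_submit(&z, 8), 8);
        printf("file block 0 maps to disk block %llu for %llu blocks\n",
               (unsigned long long)m.disk_block, (unsigned long long)m.len);
        return 0;
}
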
 static int
 xfs_dax_write_iomap_end(
        struct inode            *inode,
@@ -990,6 +1035,176 @@ const struct iomap_ops xfs_dax_write_iomap_ops = {
        .iomap_end      = xfs_dax_write_iomap_end,
 };
 
+static int
+xfs_zoned_buffered_write_iomap_begin(
+       struct inode            *inode,
+       loff_t                  offset,
+       loff_t                  count,
+       unsigned                flags,
+       struct iomap            *iomap,
+       struct iomap            *srcmap)
+{
+       struct iomap_iter       *iter =
+               container_of(iomap, struct iomap_iter, iomap);
+       struct xfs_zone_alloc_ctx *ac = iter->private;
+       struct xfs_inode        *ip = XFS_I(inode);
+       struct xfs_mount        *mp = ip->i_mount;
+       xfs_fileoff_t           offset_fsb = XFS_B_TO_FSBT(mp, offset);
+       xfs_fileoff_t           end_fsb = xfs_iomap_end_fsb(mp, offset, count);
+       u16                     iomap_flags = IOMAP_F_SHARED;
+       unsigned int            lockmode = XFS_ILOCK_EXCL;
+       xfs_filblks_t           count_fsb;
+       xfs_extlen_t            indlen;
+       struct xfs_bmbt_irec    got;
+       struct xfs_iext_cursor  icur;
+       int                     error = 0;
+
+       ASSERT(!xfs_get_extsz_hint(ip));
+       ASSERT(!(flags & IOMAP_UNSHARE));
+       ASSERT(ac);
+
+       if (xfs_is_shutdown(mp))
+               return -EIO;
+
+       error = xfs_qm_dqattach(ip);
+       if (error)
+               return error;
+
+       error = xfs_ilock_for_iomap(ip, flags, &lockmode);
+       if (error)
+               return error;
+
+       if (XFS_IS_CORRUPT(mp, !xfs_ifork_has_extents(&ip->i_df)) ||
+           XFS_TEST_ERROR(false, mp, XFS_ERRTAG_BMAPIFORMAT)) {
+               xfs_bmap_mark_sick(ip, XFS_DATA_FORK);
+               error = -EFSCORRUPTED;
+               goto out_unlock;
+       }
+
+       XFS_STATS_INC(mp, xs_blk_mapw);
+
+       error = xfs_iread_extents(NULL, ip, XFS_DATA_FORK);
+       if (error)
+               goto out_unlock;
+
+       /*
+        * For zeroing operations check if there is any data to zero first.
+        *
+        * For regular writes we always need to allocate new blocks, but need to
+        * provide the source mapping when the range is unaligned to support
+        * read-modify-write of the whole block in the page cache.
+        *
+        * In either case we need to limit the reported range to the boundaries
+        * of the source map in the data fork.
+        */
+       if (!IS_ALIGNED(offset, mp->m_sb.sb_blocksize) ||
+           !IS_ALIGNED(offset + count, mp->m_sb.sb_blocksize) ||
+           (flags & IOMAP_ZERO)) {
+               struct xfs_bmbt_irec    smap;
+               struct xfs_iext_cursor  scur;
+
+               if (!xfs_iext_lookup_extent(ip, &ip->i_df, offset_fsb, &scur,
+                               &smap))
+                       smap.br_startoff = end_fsb; /* fake hole until EOF */
+               if (smap.br_startoff > offset_fsb) {
+                       /*
+                        * We never need to allocate blocks for zeroing a hole.
+                        */
+                       if (flags & (IOMAP_UNSHARE | IOMAP_ZERO)) {
+                               xfs_hole_to_iomap(ip, iomap, offset_fsb,
+                                               smap.br_startoff);
+                               goto out_unlock;
+                       }
+                       end_fsb = min(end_fsb, smap.br_startoff);
+               } else {
+                       end_fsb = min(end_fsb,
+                               smap.br_startoff + smap.br_blockcount);
+                       xfs_trim_extent(&smap, offset_fsb,
+                                       end_fsb - offset_fsb);
+                       error = xfs_bmbt_to_iomap(ip, srcmap, &smap, flags, 0,
+                                       xfs_iomap_inode_sequence(ip, 0));
+                       if (error)
+                               goto out_unlock;
+               }
+       }
+
+       if (!ip->i_cowfp)
+               xfs_ifork_init_cow(ip);
+
+       if (!xfs_iext_lookup_extent(ip, ip->i_cowfp, offset_fsb, &icur, &got))
+               got.br_startoff = end_fsb;
+       if (got.br_startoff <= offset_fsb) {
+               trace_xfs_reflink_cow_found(ip, &got);
+               goto done;
+       }
+
+       /*
+        * Cap the maximum length to keep the chunks of work done here somewhat
+        * symmetric with the work writeback does.
+        */
+       end_fsb = min(end_fsb, got.br_startoff);
+       count_fsb = min3(end_fsb - offset_fsb, XFS_MAX_BMBT_EXTLEN,
+                        XFS_B_TO_FSB(mp, 1024 * PAGE_SIZE));
+
+       /*
+        * The block reservation is supposed to cover all blocks that the
+        * operation could possibly write, but there is a nasty corner case
+        * where blocks could be stolen from underneath us:
+        *
+        *  1) while this thread iterates over a larger buffered write,
+        *  2) another thread is causing a write fault that calls into
+        *     ->page_mkwrite in a range this thread writes to, using up the
+        *     delalloc reservation created by a previous call to this function.
+        *  3) another thread does direct I/O on the range that the write fault
+        *     happened on, which causes writeback of the dirty data.
+        *  4) this then sets the stale flag, which cuts the current iomap
+        *     iteration short, causing the new call to ->iomap_begin that gets
+        *     us here again, but now without a sufficient reservation.
+        *
+        * This is a very unusual I/O pattern, and nothing but generic/095 is
+        * known to hit it. There's not really much we can do here, so turn this
+        * into a short write.
+        */
+       if (count_fsb > ac->reserved_blocks) {
+               xfs_warn_ratelimited(mp,
+"Short write on ino 0x%llx comm %.20s due to three-way race with write fault and direct I/O",
+                       ip->i_ino, current->comm);
+               count_fsb = ac->reserved_blocks;
+               if (!count_fsb) {
+                       error = -EIO;
+                       goto out_unlock;
+               }
+       }
+
+       error = xfs_quota_reserve_blkres(ip, count_fsb);
+       if (error)
+               goto out_unlock;
+
+       indlen = xfs_bmap_worst_indlen(ip, count_fsb);
+       error = xfs_dec_fdblocks(mp, indlen, false);
+       if (error)
+               goto out_unlock;
+       ip->i_delayed_blks += count_fsb;
+       xfs_mod_delalloc(ip, count_fsb, indlen);
+
+       got.br_startoff = offset_fsb;
+       got.br_startblock = nullstartblock(indlen);
+       got.br_blockcount = count_fsb;
+       got.br_state = XFS_EXT_NORM;
+       xfs_bmap_add_extent_hole_delay(ip, XFS_COW_FORK, &icur, &got);
+       ac->reserved_blocks -= count_fsb;
+       iomap_flags |= IOMAP_F_NEW;
+
+       trace_xfs_iomap_alloc(ip, offset, XFS_FSB_TO_B(mp, count_fsb),
+                       XFS_COW_FORK, &got);
+done:
+       error = xfs_bmbt_to_iomap(ip, iomap, &got, flags, iomap_flags,
+                       xfs_iomap_inode_sequence(ip, IOMAP_F_SHARED));
+out_unlock:
+       xfs_iunlock(ip, lockmode);
+       return error;
+}
+
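
The corner case described in the comment above is handled by clamping the mapped length to whatever is left of the caller's reservation, turning the operation into a short write (or an error once the reservation is exhausted).  A minimal model of that fallback, with invented names:

#include <stdint.h>
#include <stdio.h>

struct alloc_ctx { uint64_t reserved_blocks; };

/*
 * Clamp a requested delalloc mapping to the remaining reservation.
 * Returns the number of blocks actually mapped, or 0 if the reservation
 * was fully consumed (the kernel code above returns -EIO in that case).
 */
static uint64_t map_from_reservation(struct alloc_ctx *ac, uint64_t want)
{
        uint64_t got = want;

        if (got > ac->reserved_blocks)
                got = ac->reserved_blocks;      /* short write */
        ac->reserved_blocks -= got;
        return got;
}

int main(void)
{
        struct alloc_ctx ac = { .reserved_blocks = 16 };

        printf("%llu\n", (unsigned long long)map_from_reservation(&ac, 10)); /* 10 */
        printf("%llu\n", (unsigned long long)map_from_reservation(&ac, 10)); /* 6, short */
        printf("%llu\n", (unsigned long long)map_from_reservation(&ac, 10)); /* 0, error */
        return 0;
}
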
 static int
 xfs_buffered_write_iomap_begin(
        struct inode            *inode,
@@ -1016,6 +1231,10 @@ xfs_buffered_write_iomap_begin(
        if (xfs_is_shutdown(mp))
                return -EIO;
 
+       if (xfs_is_zoned_inode(ip))
+               return xfs_zoned_buffered_write_iomap_begin(inode, offset,
+                               count, flags, iomap, srcmap);
+
        /* we can't use delayed allocations when using extent size hints */
        if (xfs_get_extsz_hint(ip))
                return xfs_direct_write_iomap_begin(inode, offset, count,
@@ -1248,10 +1467,13 @@ xfs_buffered_write_delalloc_punch(
        loff_t                  length,
        struct iomap            *iomap)
 {
+       struct iomap_iter       *iter =
+               container_of(iomap, struct iomap_iter, iomap);
+
        xfs_bmap_punch_delalloc_range(XFS_I(inode),
                        (iomap->flags & IOMAP_F_SHARED) ?
                                XFS_COW_FORK : XFS_DATA_FORK,
-                       offset, offset + length);
+                       offset, offset + length, iter->private);
 }
 
 static int
@@ -1486,12 +1708,13 @@ const struct iomap_ops xfs_xattr_iomap_ops = {
 
 int
 xfs_zero_range(
-       struct xfs_inode        *ip,
-       loff_t                  pos,
-       loff_t                  len,
-       bool                    *did_zero)
+       struct xfs_inode                *ip,
+       loff_t                          pos,
+       loff_t                          len,
+       struct xfs_zone_alloc_ctx       *ac,
+       bool                            *did_zero)
 {
-       struct inode            *inode = VFS_I(ip);
+       struct inode                    *inode = VFS_I(ip);
 
        xfs_assert_ilocked(ip, XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL);
 
@@ -1499,13 +1722,14 @@ xfs_zero_range(
                return dax_zero_range(inode, pos, len, did_zero,
                                      &xfs_dax_write_iomap_ops);
        return iomap_zero_range(inode, pos, len, did_zero,
-                               &xfs_buffered_write_iomap_ops, NULL);
+                               &xfs_buffered_write_iomap_ops, ac);
 }
 
 int
 xfs_truncate_page(
        struct xfs_inode        *ip,
        loff_t                  pos,
+       struct xfs_zone_alloc_ctx *ac,
        bool                    *did_zero)
 {
        struct inode            *inode = VFS_I(ip);
@@ -1514,5 +1738,5 @@ xfs_truncate_page(
                return dax_truncate_page(inode, pos, did_zero,
                                        &xfs_dax_write_iomap_ops);
        return iomap_truncate_page(inode, pos, did_zero,
-                                  &xfs_buffered_write_iomap_ops, NULL);
+                                  &xfs_buffered_write_iomap_ops, ac);
 }
index 4da13440bae9bd7e67f5044b8bfc3544d57e8be9..e9ddb5a1007e457f562796533f63092818ba3a6f 100644 (file)
@@ -10,6 +10,7 @@
 
 struct xfs_inode;
 struct xfs_bmbt_irec;
+struct xfs_zone_alloc_ctx;
 
 int xfs_iomap_write_direct(struct xfs_inode *ip, xfs_fileoff_t offset_fsb,
                xfs_fileoff_t count_fsb, unsigned int flags,
@@ -24,8 +25,20 @@ int xfs_bmbt_to_iomap(struct xfs_inode *ip, struct iomap *iomap,
                u16 iomap_flags, u64 sequence_cookie);
 
 int xfs_zero_range(struct xfs_inode *ip, loff_t pos, loff_t len,
-               bool *did_zero);
-int xfs_truncate_page(struct xfs_inode *ip, loff_t pos, bool *did_zero);
+               struct xfs_zone_alloc_ctx *ac, bool *did_zero);
+int xfs_truncate_page(struct xfs_inode *ip, loff_t pos,
+               struct xfs_zone_alloc_ctx *ac, bool *did_zero);
+
+static inline xfs_fileoff_t
+xfs_iomap_end_fsb(
+       struct xfs_mount        *mp,
+       loff_t                  offset,
+       loff_t                  count)
+{
+       ASSERT(offset <= mp->m_super->s_maxbytes);
+       return min(XFS_B_TO_FSB(mp, offset + count),
+                  XFS_B_TO_FSB(mp, mp->m_super->s_maxbytes));
+}
 
 static inline xfs_filblks_t
 xfs_aligned_fsb_count(
@@ -50,6 +63,7 @@ xfs_aligned_fsb_count(
 extern const struct iomap_ops xfs_buffered_write_iomap_ops;
 extern const struct iomap_ops xfs_page_mkwrite_iomap_ops;
 extern const struct iomap_ops xfs_direct_write_iomap_ops;
+extern const struct iomap_ops xfs_zoned_direct_write_iomap_ops;
 extern const struct iomap_ops xfs_read_iomap_ops;
 extern const struct iomap_ops xfs_seek_iomap_ops;
 extern const struct iomap_ops xfs_xattr_iomap_ops;
index 66a726a5fbbba2854479e4cf8674a3a8188f47e8..31c5e9d58e535daae20dd355b27a31eae401ffbe 100644 (file)
@@ -29,6 +29,7 @@
 #include "xfs_xattr.h"
 #include "xfs_file.h"
 #include "xfs_bmap.h"
+#include "xfs_zone_alloc.h"
 
 #include <linux/posix_acl.h>
 #include <linux/security.h>
@@ -822,6 +823,7 @@ xfs_setattr_size(
        uint                    lock_flags = 0;
        uint                    resblks = 0;
        bool                    did_zeroing = false;
+       struct xfs_zone_alloc_ctx ac;
 
        xfs_assert_ilocked(ip, XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL);
        ASSERT(S_ISREG(inode->i_mode));
@@ -857,6 +859,28 @@ xfs_setattr_size(
         */
        inode_dio_wait(inode);
 
+       /*
+        * Normally xfs_zoned_space_reserve is supposed to be called outside the
+        * IOLOCK.  But for truncate we can't do that since ->setattr is called
+        * with it already held by the VFS.  So for now chicken out and try to
+        * allocate space under it.
+        *
+        * To avoid deadlocks this means we can't block waiting for space, which
+        * can lead to spurious -ENOSPC if there are no directly available
+        * blocks.  We mitigate this a bit by allowing zeroing to dip into the
+        * reserved pool, but eventually the VFS calling convention needs to
+        * change.
+        */
+       if (xfs_is_zoned_inode(ip)) {
+               error = xfs_zoned_space_reserve(ip, 1,
+                               XFS_ZR_NOWAIT | XFS_ZR_RESERVED, &ac);
+               if (error) {
+                       if (error == -EAGAIN)
+                               return -ENOSPC;
+                       return error;
+               }
+       }
+
        /*
         * File data changes must be complete before we start the transaction to
         * modify the inode.  This needs to be done before joining the inode to
@@ -870,11 +894,14 @@ xfs_setattr_size(
        if (newsize > oldsize) {
                trace_xfs_zero_eof(ip, oldsize, newsize - oldsize);
                error = xfs_zero_range(ip, oldsize, newsize - oldsize,
-                               &did_zeroing);
+                               &ac, &did_zeroing);
        } else {
-               error = xfs_truncate_page(ip, newsize, &did_zeroing);
+               error = xfs_truncate_page(ip, newsize, &ac, &did_zeroing);
        }
 
+       if (xfs_is_zoned_inode(ip))
+               xfs_zoned_space_unreserve(ip, &ac);
+
        if (error)
                return error;
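
As the comment above explains, the truncate path reserves space without blocking because the iolock is already held, and a reservation that would have to wait is reported as ENOSPC instead.  A tiny sketch of that error mapping (illustrative only, not the kernel helper):

#include <errno.h>
#include <stdio.h>

/* Map a non-blocking reservation failure to ENOSPC rather than waiting. */
static int toy_reserve_for_truncate(int reserve_result)
{
        if (reserve_result == -EAGAIN)
                return -ENOSPC;
        return reserve_result;
}

int main(void)
{
        printf("%d\n", toy_reserve_for_truncate(0));            /* 0 */
        printf("%d\n", toy_reserve_for_truncate(-EAGAIN));      /* -ENOSPC */
        return 0;
}
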
 
index 26b2f5887b88193175e8abafa12b17a9bd370f31..d64a95f126f97862db32537c9020919e790dd40a 100644 (file)
@@ -3531,6 +3531,12 @@ xlog_force_shutdown(
        spin_unlock(&log->l_icloglock);
 
        wake_up_var(&log->l_opstate);
+
+       if (xfs_has_zoned(log->l_mp) && IS_ENABLED(CONFIG_XFS_RT)) {
+               spin_lock(&log->l_mp->m_zone_list_lock);
+               wake_up_all(&log->l_mp->m_zone_wait);
+               spin_unlock(&log->l_mp->m_zone_list_lock);
+       }
        return log_error;
 }
 
index 4d8cbf173f60ec4b1ad7eb4fff9fb750e6a58342..03ecdda1d34b827c9411409901146ed0c444fc52 100644 (file)
@@ -40,6 +40,8 @@
 #include "xfs_rtrmap_btree.h"
 #include "xfs_rtrefcount_btree.h"
 #include "scrub/stats.h"
+#include "xfs_rtbitmap.h"
+#include "xfs_zone_alloc.h"
 
 static DEFINE_MUTEX(xfs_uuid_table_mutex);
 static int xfs_uuid_table_size;
@@ -469,22 +471,26 @@ xfs_default_resblks(
        struct xfs_mount        *mp,
        unsigned int            idx)
 {
-       uint64_t resblks;
-
-       if (idx == FREE_RTEXTENTS)
-               return 0;
+       switch (idx) {
+       case FREE_BLOCKS:
+               /*
+                * We default to 5% or 8192 FSBs of space reserved, whichever is
+                * smaller.
+                *
+                * This is intended to cover concurrent allocation transactions
+                * when we initially hit ENOSPC.  These each require a 4 block
+                * reservation. Hence by default we cover roughly 2000
+                * concurrent allocation reservations.
+                */
+               return min(div_u64(mp->m_sb.sb_dblocks, 20), 8192ULL);
+       case FREE_RTEXTENTS:
+       case FREE_RTAVAILABLE:
+               if (!IS_ENABLED(CONFIG_XFS_RT) || !xfs_has_zoned(mp))
+                       break;
+               return xfs_zoned_default_resblks(mp, idx);
+       }
 
-       /*
-        * We default to 5% or 8192 fsbs of space reserved, whichever is
-        * smaller.  This is intended to cover concurrent allocation
-        * transactions when we initially hit enospc. These each require a 4
-        * block reservation. Hence by default we cover roughly 2000 concurrent
-        * allocation reservations.
-        */
-       resblks = mp->m_sb.sb_dblocks;
-       do_div(resblks, 20);
-       resblks = min_t(uint64_t, resblks, 8192);
-       return resblks;
+       return 0;
 }
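
To make the FREE_BLOCKS default concrete (numbers chosen purely for illustration): a 10 GiB data device with 4 KiB blocks has 2,621,440 blocks, 5% of which is 131,072, so the 8192-block cap applies; a 100 MiB device has 25,600 blocks and gets the smaller 1,280-block reservation.  A minimal model of the same min(5%, 8192) rule:

#include <stdint.h>
#include <stdio.h>

/* Same rule as the FREE_BLOCKS case above: 5% of the data device blocks,
 * capped at 8192 file system blocks. */
static uint64_t default_resblks(uint64_t dblocks)
{
        uint64_t five_percent = dblocks / 20;

        return five_percent < 8192 ? five_percent : 8192;
}

int main(void)
{
        printf("%llu\n", (unsigned long long)default_resblks(2621440)); /* 8192 */
        printf("%llu\n", (unsigned long long)default_resblks(25600));   /* 1280 */
        return 0;
}
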
 
 /* Ensure the summary counts are correct. */
@@ -551,7 +557,7 @@ xfs_check_summary_counts(
         * If we're mounting the rt volume after recovering the log, recompute
         * frextents from the rtbitmap file to fix the inconsistency.
         */
-       if (xfs_has_realtime(mp) && !xfs_is_clean(mp)) {
+       if (xfs_has_realtime(mp) && !xfs_has_zoned(mp) && !xfs_is_clean(mp)) {
                error = xfs_rtalloc_reinit_frextents(mp);
                if (error)
                        return error;
@@ -1078,6 +1084,9 @@ xfs_mountfs(
                        goto out_agresv;
        }
 
+       if (!xfs_is_readonly(mp))
+               xfs_zone_gc_start(mp);
+
        return 0;
 
  out_agresv:
@@ -1162,6 +1171,8 @@ xfs_unmountfs(
        xfs_inodegc_flush(mp);
 
        xfs_blockgc_stop(mp);
+       if (!test_bit(XFS_OPSTATE_READONLY, &mp->m_opstate))
+               xfs_zone_gc_stop(mp);
        xfs_fs_unreserve_ag_blocks(mp);
        xfs_qm_unmount_quotas(mp);
        xfs_rtunmount_inodes(mp);
@@ -1247,7 +1258,7 @@ xfs_freecounter_unavailable(
        struct xfs_mount        *mp,
        unsigned int            idx)
 {
-       if (idx == FREE_RTEXTENTS)
+       if (idx == FREE_RTEXTENTS || idx == FREE_RTAVAILABLE)
                return 0;
        return mp->m_alloc_set_aside + atomic64_read(&mp->m_allocbt_blks);
 }
@@ -1345,7 +1356,9 @@ xfs_dec_freecounter(
                spin_unlock(&mp->m_sb_lock);
                return 0;
        }
-       xfs_warn_once(mp,
+
+       if (idx == FREE_BLOCKS)
+               xfs_warn_once(mp,
 "Reserve blocks depleted! Consider increasing reserve pool size.");
 
 fdblocks_enospc:
index c0f870130fbb8278d1fc672a59181573ae9d2a83..c7ae70fde64d511c0ceb7f85cc4233f3d4d78362 100644 (file)
@@ -108,6 +108,7 @@ struct xfs_groups {
 enum {
        FREE_BLOCKS,            /* free block counter */
        FREE_RTEXTENTS,         /* free rt extent counter */
+       FREE_RTAVAILABLE,       /* actually available rt extents */
        FREE_NR,
 };
 
@@ -255,7 +256,20 @@ typedef struct xfs_mount {
                uint64_t        avail;          /* available reserved blocks */
                uint64_t        save;           /* reserved blks @ remount,ro */
        } m_resblks[FREE_NR];
+       struct list_head        m_free_zones;
+       struct list_head        m_open_zones;
+       atomic_t                m_nr_free_zones;
+       unsigned int            m_nr_open_zones;
+       unsigned int            m_max_open_zones;
+       uint64_t                m_zoned_op;
+       struct list_head        m_emptied_zones;
+       spinlock_t              m_zone_list_lock;
+       wait_queue_head_t       m_zone_wait;
+       struct xfs_rtgroup      *m_open_gc_zone;
        struct delayed_work     m_reclaim_work; /* background inode reclaim */
+       spinlock_t              m_reservation_lock;
+       struct list_head        m_reclaim_reservations;
+       struct task_struct      *m_zone_gc_thread;
        struct dentry           *m_debugfs;     /* debugfs parent */
        struct xfs_kobj         m_kobj;
        struct xfs_kobj         m_error_kobj;
@@ -428,6 +442,11 @@ static inline bool xfs_has_rtreflink(const struct xfs_mount *mp)
               xfs_has_reflink(mp);
 }
 
+static inline bool xfs_has_nonzoned(const struct xfs_mount *mp)
+{
+       return !xfs_has_zoned(mp);
+}
+
 /*
  * Some features are always on for v5 file systems, allow the compiler to
  * eliminiate dead code when building without v4 support.
@@ -531,6 +550,8 @@ __XFS_HAS_FEAT(nouuid, NOUUID)
 #define XFS_OPSTATE_WARNED_METADIR     17
 /* Filesystem should use qflags to determine quotaon status */
 #define XFS_OPSTATE_RESUMING_QUOTAON   18
+/* (Zoned) GC is in progress */
+#define XFS_OPSTATE_IN_GC              19
 
 #define __XFS_IS_OPSTATE(name, NAME) \
 static inline bool xfs_is_ ## name (struct xfs_mount *mp) \
@@ -575,6 +596,7 @@ static inline bool xfs_clear_resuming_quotaon(struct xfs_mount *mp)
 #endif /* CONFIG_XFS_QUOTA */
 __XFS_IS_OPSTATE(done_with_log_incompat, UNSET_LOG_INCOMPAT)
 __XFS_IS_OPSTATE(using_logged_xattrs, USE_LARP)
+__XFS_IS_OPSTATE(in_gc, IN_GC)
 
 static inline bool
 xfs_should_warn(struct xfs_mount *mp, long nr)
index 304cf53b0908ece468058f647e61cf0cecac038b..346a8e4d1a48aefd1a4991e7aaac0c82506ff8b6 100644 (file)
@@ -34,6 +34,7 @@
 #include "xfs_rtalloc.h"
 #include "xfs_rtgroup.h"
 #include "xfs_metafile.h"
+#include "xfs_zone_alloc.h"
 
 /*
  * Copy on Write of Shared Blocks
@@ -965,12 +966,137 @@ xfs_reflink_end_cow(
         */
        while (end_fsb > offset_fsb && !error)
                error = xfs_reflink_end_cow_extent(ip, &offset_fsb, end_fsb);
-
        if (error)
                trace_xfs_reflink_end_cow_error(ip, error, _RET_IP_);
        return error;
 }
 
+#ifdef CONFIG_XFS_RT
+static int
+xfs_zoned_end_extent(
+       struct xfs_trans        *tp,
+       struct xfs_inode        *ip,
+       struct xfs_bmbt_irec    *new,
+       xfs_fsblock_t           old_startblock)
+{
+       struct xfs_bmbt_irec    data;
+       int                     nmaps = 1;
+       int                     error;
+
+       /* Grab the corresponding mapping in the data fork. */
+       error = xfs_bmapi_read(ip, new->br_startoff, new->br_blockcount, &data,
+                              &nmaps, 0);
+       if (error)
+               return error;
+
+       /*
+        * Cap the update to the existing extent in the data fork because we can
+        * only overwrite one extent at a time.
+        */
+       ASSERT(new->br_blockcount >= data.br_blockcount);
+       new->br_blockcount = data.br_blockcount;
+
+       /*
+        * If a data write raced with this GC write, keep the existing data in
+        * the data fork, mark our newly written GC extent as reclaimable, then
+        * move on to the next extent.
+        */
+       if (old_startblock != NULLFSBLOCK &&
+           old_startblock != data.br_startblock)
+               goto skip;
+
+       trace_xfs_reflink_cow_remap_from(ip, new);
+       trace_xfs_reflink_cow_remap_to(ip, &data);
+
+       error = xfs_iext_count_extend(tp, ip, XFS_DATA_FORK,
+                       XFS_IEXT_REFLINK_END_COW_CNT);
+       if (error)
+               return error;
+
+       if (data.br_startblock != HOLESTARTBLOCK) {
+               ASSERT(data.br_startblock != DELAYSTARTBLOCK);
+               ASSERT(!isnullstartblock(data.br_startblock));
+
+               xfs_bmap_unmap_extent(tp, ip, XFS_DATA_FORK, &data);
+               if (xfs_is_reflink_inode(ip)) {
+                       xfs_refcount_decrease_extent(tp, true, &data);
+               } else {
+                       error = xfs_free_extent_later(tp, data.br_startblock,
+                                       data.br_blockcount, NULL,
+                                       XFS_AG_RESV_NONE,
+                                       XFS_FREE_EXTENT_REALTIME);
+                       if (error)
+                               return error;
+               }
+       }
+
+       error = xfs_zone_record_blocks(tp, new->br_startblock,
+                       new->br_blockcount, true);
+       if (error)
+               return error;
+
+       /* Map the new blocks into the data fork. */
+       xfs_bmap_map_extent(tp, ip, XFS_DATA_FORK, new);
+       return 0;
+
+skip:
+       trace_xfs_reflink_cow_remap_skip(ip, new);
+       return xfs_zone_record_blocks(tp, new->br_startblock,
+                       new->br_blockcount, false);
+}
+
+int
+xfs_zoned_end_io(
+       struct xfs_inode                *ip,
+       xfs_off_t                       offset,
+       xfs_off_t                       count,
+       xfs_daddr_t                     daddr,
+       xfs_fsblock_t                   old_startblock)
+{
+       struct xfs_mount        *mp = ip->i_mount;
+       xfs_fileoff_t           end_fsb = XFS_B_TO_FSB(mp, offset + count);
+       struct xfs_bmbt_irec    new = {
+               .br_startoff    = XFS_B_TO_FSBT(mp, offset),
+               .br_startblock  = xfs_daddr_to_rtb(mp, daddr),
+               .br_state       = XFS_EXT_NORM,
+       };
+       unsigned int            resblks =
+               XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK);
+       struct xfs_trans        *tp;
+       int                     error;
+
+       if (xfs_is_shutdown(mp))
+               return -EIO;
+
+       while (new.br_startoff < end_fsb) {
+               new.br_blockcount = end_fsb - new.br_startoff;
+
+               error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks, 0,
+                               XFS_TRANS_RESERVE | XFS_TRANS_RES_FDBLKS, &tp);
+               if (error)
+                       return error;
+               xfs_ilock(ip, XFS_ILOCK_EXCL);
+               xfs_trans_ijoin(tp, ip, 0);
+
+               error = xfs_zoned_end_extent(tp, ip, &new, old_startblock);
+               if (error)
+                       xfs_trans_cancel(tp);
+               else
+                       error = xfs_trans_commit(tp);
+               xfs_iunlock(ip, XFS_ILOCK_EXCL);
+               if (error)
+                       return error;
+
+               new.br_startoff += new.br_blockcount;
+               new.br_startblock += new.br_blockcount;
+               if (old_startblock != NULLFSBLOCK)
+                       old_startblock += new.br_blockcount;
+       }
+
+       return 0;
+}
+#endif /* CONFIG_XFS_RT */
+
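
xfs_zoned_end_io above walks the completed byte range one data-fork extent at a time, committing a transaction per step and advancing the new (and, for GC writes, the old) start block by the amount just remapped.  A stripped-down model of that loop, with a made-up extent lookup standing in for the data fork:

#include <stdint.h>
#include <stdio.h>

/* Pretend the data fork answers "how long is the extent at this offset?". */
static uint64_t extent_len_at(uint64_t off)
{
        return off < 8 ? 8 - off : 4;   /* two extents: [0,8) and [8,12) */
}

int main(void)
{
        uint64_t off = 0, end = 12, disk = 100;

        while (off < end) {
                uint64_t len = extent_len_at(off);

                if (len > end - off)
                        len = end - off;
                printf("remap file [%llu,%llu) -> disk %llu\n",
                       (unsigned long long)off,
                       (unsigned long long)(off + len),
                       (unsigned long long)disk);
                off += len;     /* advance file and disk side by the remapped amount */
                disk += len;
        }
        return 0;
}
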
 /*
  * Free all CoW staging blocks that are still referenced by the ondisk refcount
  * metadata.  The ondisk metadata does not track which inode created the
@@ -1538,7 +1664,7 @@ xfs_reflink_zero_posteof(
                return 0;
 
        trace_xfs_zero_eof(ip, isize, pos - isize);
-       return xfs_zero_range(ip, isize, pos - isize, NULL);
+       return xfs_zero_range(ip, isize, pos - isize, NULL, NULL);
 }
 
 /*
index cc4e92278279b6231135512428f4a359689ebb22..7c5c06ce177ab000da01eb3fa08fc0ec4ffd5caa 100644 (file)
@@ -41,8 +41,10 @@ extern int xfs_reflink_cancel_cow_blocks(struct xfs_inode *ip,
                xfs_fileoff_t end_fsb, bool cancel_real);
 extern int xfs_reflink_cancel_cow_range(struct xfs_inode *ip, xfs_off_t offset,
                xfs_off_t count, bool cancel_real);
-extern int xfs_reflink_end_cow(struct xfs_inode *ip, xfs_off_t offset,
+int xfs_reflink_end_cow(struct xfs_inode *ip, xfs_off_t offset,
                xfs_off_t count);
+int xfs_zoned_end_io(struct xfs_inode *ip, xfs_off_t offset, xfs_off_t count,
+               xfs_daddr_t daddr, xfs_fsblock_t old_startblock);
 extern int xfs_reflink_recover_cow(struct xfs_mount *mp);
 extern loff_t xfs_reflink_remap_range(struct file *file_in, loff_t pos_in,
                struct file *file_out, loff_t pos_out, loff_t len,
index b380414940ea38993d24859ab34d18746b81f87d..a3a2423d61d87a01baa5fc0480713004448ea3b1 100644 (file)
@@ -33,6 +33,7 @@
 #include "xfs_trace.h"
 #include "xfs_rtrefcount_btree.h"
 #include "xfs_reflink.h"
+#include "xfs_zone_alloc.h"
 
 /*
  * Return whether there are any free extents in the size range given
@@ -661,6 +662,7 @@ xfs_rtunmount_rtg(
 {
        int                     i;
 
+       list_del_init(&rtg->rtg_entry);
        for (i = 0; i < XFS_RTGI_MAX; i++)
                xfs_rtginode_irele(&rtg->rtg_inodes[i]);
        kvfree(rtg->rtg_rsum_cache);
@@ -1320,6 +1322,8 @@ xfs_growfs_rt(
        if (xfs_has_reflink(mp) &&
            !xfs_reflink_supports_rextsize(mp, in->extsize))
                goto out_unlock;
+       if (xfs_has_zoned(mp))
+               goto out_unlock;
 
        error = xfs_sb_validate_fsb_count(&mp->m_sb, in->newblocks);
        if (error)
@@ -1637,14 +1641,26 @@ xfs_rtmount_inodes(
                error = xfs_rtmount_rtg(mp, tp, rtg);
                if (error) {
                        xfs_rtgroup_rele(rtg);
-                       xfs_rtunmount_inodes(mp);
-                       break;
+                       goto out_rtunmount_rtgs;
                }
        }
 
+       if (xfs_has_zoned(mp)) {
+               error = xfs_mount_zones(mp);
+               if (error)
+                       goto out_rtunmount_rtgs;
+       }
+
 out_cancel:
        xfs_trans_cancel(tp);
        return error;
+
+out_rtunmount_rtgs:
+       rtg = NULL;
+       while ((rtg = xfs_rtgroup_next(mp, rtg)))
+               xfs_rtunmount_rtg(rtg);
+       xfs_rtginode_irele(&mp->m_rtdirip);
+       goto out_cancel;
 }
 
 void
@@ -1653,6 +1669,9 @@ xfs_rtunmount_inodes(
 {
        struct xfs_rtgroup      *rtg = NULL;
 
+       if (xfs_has_zoned(mp))
+               xfs_unmount_zones(mp);
+
        while ((rtg = xfs_rtgroup_next(mp, rtg)))
                xfs_rtunmount_rtg(rtg);
        xfs_rtginode_irele(&mp->m_rtdirip);
@@ -2092,6 +2111,8 @@ xfs_bmap_rtalloc(
                ap->datatype & XFS_ALLOC_INITIAL_USER_DATA;
        int                     error;
 
+       ASSERT(!xfs_has_zoned(ap->tp->t_mountp));
+
 retry:
        error = xfs_rtallocate_align(ap, &ralen, &raminlen, &prod, &noalign);
        if (error)
index 2d1c97b333358244b3143d80add8ddddd4221c18..041c725047e447908a8a563a973a9c53620800fc 100644 (file)
@@ -46,6 +46,7 @@
 #include "xfs_exchmaps_item.h"
 #include "xfs_parent.h"
 #include "xfs_rtalloc.h"
+#include "xfs_zone_alloc.h"
 #include "scrub/stats.h"
 #include "scrub/rcbag_btree.h"
 
@@ -109,7 +110,8 @@ enum {
        Opt_filestreams, Opt_quota, Opt_noquota, Opt_usrquota, Opt_grpquota,
        Opt_prjquota, Opt_uquota, Opt_gquota, Opt_pquota,
        Opt_uqnoenforce, Opt_gqnoenforce, Opt_pqnoenforce, Opt_qnoenforce,
-       Opt_discard, Opt_nodiscard, Opt_dax, Opt_dax_enum,
+       Opt_discard, Opt_nodiscard, Opt_dax, Opt_dax_enum, Opt_max_open_zones,
+       Opt_zoned_op,
 };
 
 static const struct fs_parameter_spec xfs_fs_parameters[] = {
@@ -154,6 +156,8 @@ static const struct fs_parameter_spec xfs_fs_parameters[] = {
        fsparam_flag("nodiscard",       Opt_nodiscard),
        fsparam_flag("dax",             Opt_dax),
        fsparam_enum("dax",             Opt_dax_enum, dax_param_enums),
+       fsparam_u32("max_open_zones",   Opt_max_open_zones),
+       fsparam_u64("zoned_op",         Opt_zoned_op),
        {}
 };
 
@@ -233,6 +237,12 @@ xfs_fs_show_options(
        if (!(mp->m_qflags & XFS_ALL_QUOTA_ACCT))
                seq_puts(m, ",noquota");
 
+       if (mp->m_max_open_zones)
+               seq_printf(m, ",max_open_zones=%d", mp->m_max_open_zones);
+
+       if (mp->m_zoned_op)
+               seq_printf(m, ",zoned_op=%llu", mp->m_zoned_op);
+
        return 0;
 }
 
@@ -882,6 +892,8 @@ xfs_fs_statfs(
                s64     freertx;
 
                statp->f_blocks = sbp->sb_rblocks;
+               if (xfs_has_zoned(mp))
+                       statp->f_blocks -= mp->m_resblks[FREE_RTEXTENTS].total;
                freertx = max_t(int64_t, 0, xfs_sum_freecounter(mp, FREE_RTEXTENTS));
                statp->f_bavail = statp->f_bfree =
                        xfs_rtbxlen_to_blen(mp, freertx);
@@ -1093,7 +1105,9 @@ xfs_reinit_percpu_counters(
        percpu_counter_set(&mp->m_icount, mp->m_sb.sb_icount);
        percpu_counter_set(&mp->m_ifree, mp->m_sb.sb_ifree);
        percpu_counter_set(&mp->m_free[FREE_BLOCKS], mp->m_sb.sb_fdblocks);
-       percpu_counter_set(&mp->m_free[FREE_RTEXTENTS], mp->m_sb.sb_frextents);
+       if (!xfs_has_zoned(mp))
+               percpu_counter_set(&mp->m_free[FREE_RTEXTENTS],
+                               mp->m_sb.sb_frextents);
 }
 
 static void
@@ -1192,6 +1206,18 @@ xfs_fs_shutdown(
        xfs_force_shutdown(XFS_M(sb), SHUTDOWN_DEVICE_REMOVED);
 }
 
+static int
+xfs_fs_show_stats(
+       struct seq_file         *m,
+       struct dentry           *root)
+{
+       struct xfs_mount        *mp = XFS_M(root->d_sb);
+
+       if (xfs_has_zoned(mp) && IS_ENABLED(CONFIG_XFS_RT))
+               xfs_zoned_show_stats(m, mp);
+       return 0;
+}
+
 static const struct super_operations xfs_super_operations = {
        .alloc_inode            = xfs_fs_alloc_inode,
        .destroy_inode          = xfs_fs_destroy_inode,
@@ -1206,6 +1232,7 @@ static const struct super_operations xfs_super_operations = {
        .nr_cached_objects      = xfs_fs_nr_cached_objects,
        .free_cached_objects    = xfs_fs_free_cached_objects,
        .shutdown               = xfs_fs_shutdown,
+       .show_stats             = xfs_fs_show_stats,
 };
 
 static int
@@ -1418,6 +1445,12 @@ xfs_fs_parse_param(
                xfs_fs_warn_deprecated(fc, param, XFS_FEAT_NOATTR2, true);
                parsing_mp->m_features |= XFS_FEAT_NOATTR2;
                return 0;
+       case Opt_max_open_zones:
+               parsing_mp->m_max_open_zones = result.uint_32;
+               return 0;
+       case Opt_zoned_op:
+               parsing_mp->m_zoned_op = result.uint_64;
+               return 0;
        default:
                xfs_warn(parsing_mp, "unknown mount option [%s].", param->key);
                return -EINVAL;
@@ -1758,8 +1791,14 @@ xfs_fs_fill_super(
                mp->m_features &= ~XFS_FEAT_DISCARD;
        }
 
-       if (xfs_has_metadir(mp))
+       if (xfs_has_metadir(mp)) {
                xfs_warn_experimental(mp, XFS_EXPERIMENTAL_METADIR);
+       } else if (xfs_has_zoned(mp)) {
+               xfs_alert(mp,
+       "metadir feature required for zoned realtime devices.");
+               error = -EINVAL;
+               goto out_filestream_unmount;
+       }
 
        if (xfs_has_reflink(mp)) {
                if (xfs_has_realtime(mp) &&
@@ -1771,6 +1810,13 @@ xfs_fs_fill_super(
                        goto out_filestream_unmount;
                }
 
+               if (xfs_has_zoned(mp)) {
+                       xfs_alert(mp,
+       "reflink not compatible with zoned RT device!");
+                       error = -EINVAL;
+                       goto out_filestream_unmount;
+               }
+
                /*
                 * always-cow mode is not supported on filesystems with rt
                 * extent sizes larger than a single block because we'd have
@@ -1902,6 +1948,9 @@ xfs_remount_rw(
        /* Re-enable the background inode inactivation worker. */
        xfs_inodegc_start(mp);
 
+       /* Restart zone reclaim */
+       xfs_zone_gc_start(mp);
+
        return 0;
 }
 
@@ -1946,6 +1995,9 @@ xfs_remount_ro(
         */
        xfs_inodegc_stop(mp);
 
+       /* Stop zone reclaim */
+       xfs_zone_gc_stop(mp);
+
        /* Free the per-AG metadata reservation pool. */
        xfs_fs_unreserve_ag_blocks(mp);
 
@@ -2069,6 +2121,7 @@ xfs_init_fs_context(
        mutex_init(&mp->m_growlock);
        INIT_WORK(&mp->m_flush_inodes_work, xfs_flush_inodes_worker);
        INIT_DELAYED_WORK(&mp->m_reclaim_work, xfs_reclaim_worker);
+
        mp->m_kobj.kobject.kset = xfs_kset;
        /*
         * We don't create the finobt per-ag space reservation until after log
@@ -2097,8 +2150,10 @@ static void
 xfs_kill_sb(
        struct super_block              *sb)
 {
+       struct xfs_mount                *mp = XFS_M(sb);
+
        kill_block_super(sb);
-       xfs_mount_free(XFS_M(sb));
+       xfs_mount_free(mp);
 }
 
 static struct file_system_type xfs_fs_type = {
index 8f530e69c18ae73a4e7ecd2063ff0709d28ffaf7..8e5a3eb31bd28f1af667bfb47f43a9f252607411 100644 (file)
@@ -49,6 +49,7 @@
 #include "xfs_metafile.h"
 #include "xfs_metadir.h"
 #include "xfs_rtgroup.h"
+#include "xfs_zone_alloc.h"
 
 /*
  * We include this last to have the helpers above available for the trace
index 31822f940073201e0d2cb51ef98fb35b4369eee1..cc251aa0fb67221abf7b474f8694a850ab10fba1 100644 (file)
@@ -254,6 +254,7 @@ DECLARE_EVENT_CLASS(xfs_group_class,
                  (char *)__entry->caller_ip)
 );
 
+
 #define DEFINE_GROUP_REF_EVENT(name)   \
 DEFINE_EVENT(xfs_group_class, name,    \
        TP_PROTO(struct xfs_group *xg, unsigned long caller_ip), \
@@ -265,6 +266,86 @@ DEFINE_GROUP_REF_EVENT(xfs_group_grab);
 DEFINE_GROUP_REF_EVENT(xfs_group_grab_next_tag);
 DEFINE_GROUP_REF_EVENT(xfs_group_rele);
 
+#ifdef CONFIG_XFS_RT
+DECLARE_EVENT_CLASS(xfs_zone_class,
+       TP_PROTO(struct xfs_rtgroup *rtg),
+       TP_ARGS(rtg),
+       TP_STRUCT__entry(
+               __field(dev_t, dev)
+               __field(xfs_rgnumber_t, rgno)
+               __field(xfs_rgblock_t, used)
+               __field(xfs_rgblock_t, written)
+               __field(xfs_rgblock_t, write_pointer)
+       ),
+       TP_fast_assign(
+               __entry->dev = rtg_mount(rtg)->m_super->s_dev;
+               __entry->rgno = rtg_rgno(rtg);
+               __entry->used = *xfs_zone_used_counter(rtg);
+               __entry->written = rtg->rtg_written;
+               __entry->write_pointer = rtg->rtg_write_pointer;
+       ),
+       TP_printk("dev %d:%d rgno 0x%x used 0x%x written 0x%x wp 0x%x",
+                 MAJOR(__entry->dev), MINOR(__entry->dev),
+                 __entry->rgno,
+                 __entry->used,
+                 __entry->written,
+                 __entry->write_pointer)
+);
+
+#define DEFINE_ZONE_EVENT(name)                                \
+DEFINE_EVENT(xfs_zone_class, name,                     \
+       TP_PROTO(struct xfs_rtgroup *rtg),              \
+       TP_ARGS(rtg))
+DEFINE_ZONE_EVENT(xfs_zone_emptied);
+DEFINE_ZONE_EVENT(xfs_zone_full);
+DEFINE_ZONE_EVENT(xfs_zone_activate);
+DEFINE_ZONE_EVENT(xfs_zone_reset);
+DEFINE_ZONE_EVENT(xfs_zone_reclaim);
+DEFINE_ZONE_EVENT(xfs_gc_zone_activate);
+
+DECLARE_EVENT_CLASS(xfs_zone_alloc_class,
+       TP_PROTO(struct xfs_rtgroup *rtg, xfs_rgblock_t rgbno,
+                xfs_extlen_t len),
+       TP_ARGS(rtg, rgbno, len),
+       TP_STRUCT__entry(
+               __field(dev_t, dev)
+               __field(xfs_rgnumber_t, rgno)
+               __field(xfs_rgblock_t, used)
+               __field(xfs_rgblock_t, written)
+               __field(xfs_rgblock_t, write_pointer)
+               __field(xfs_rgblock_t, rgbno)
+               __field(xfs_extlen_t, len)
+       ),
+       TP_fast_assign(
+               __entry->dev = rtg_mount(rtg)->m_super->s_dev;
+               __entry->rgno = rtg_rgno(rtg);
+               __entry->used = *xfs_zone_used_counter(rtg);
+               __entry->written = rtg->rtg_written;
+               __entry->write_pointer = rtg->rtg_write_pointer;
+               __entry->rgbno = rgbno;
+               __entry->len = len;
+       ),
+       TP_printk("dev %d:%d rgno 0x%x used 0x%x written 0x%x wp 0x%x rgbno 0x%x len 0x%x",
+                 MAJOR(__entry->dev), MINOR(__entry->dev),
+                 __entry->rgno,
+                 __entry->used,
+                 __entry->written,
+                 __entry->write_pointer,
+                 __entry->rgbno,
+                 __entry->len)
+);
+
+
+#define DEFINE_ZONE_ALLOC_EVENT(name)                          \
+DEFINE_EVENT(xfs_zone_alloc_class, name,                       \
+       TP_PROTO(struct xfs_rtgroup *rtg, xfs_rgblock_t rgbno,  \
+                xfs_extlen_t len),                             \
+       TP_ARGS(rtg, rgbno, len))
+DEFINE_ZONE_ALLOC_EVENT(xfs_zone_record_blocks);
+DEFINE_ZONE_ALLOC_EVENT(xfs_zone_free_blocks);
+DEFINE_ZONE_ALLOC_EVENT(xfs_zone_alloc_blocks);
+#endif /* CONFIG_XFS_RT */
+
 TRACE_EVENT(xfs_inodegc_worker,
        TP_PROTO(struct xfs_mount *mp, unsigned int shrinker_hits),
        TP_ARGS(mp, shrinker_hits),
@@ -1596,6 +1677,7 @@ DEFINE_SIMPLE_IO_EVENT(xfs_end_io_direct_write);
 DEFINE_SIMPLE_IO_EVENT(xfs_end_io_direct_write_unwritten);
 DEFINE_SIMPLE_IO_EVENT(xfs_end_io_direct_write_append);
 DEFINE_SIMPLE_IO_EVENT(xfs_file_splice_read);
+DEFINE_SIMPLE_IO_EVENT(xfs_zoned_map_blocks);
 
 DECLARE_EVENT_CLASS(xfs_itrunc_class,
        TP_PROTO(struct xfs_inode *ip, xfs_fsize_t new_size),
@@ -3984,6 +4066,7 @@ DEFINE_SIMPLE_IO_EVENT(xfs_reflink_cancel_cow_range);
 DEFINE_SIMPLE_IO_EVENT(xfs_reflink_end_cow);
 DEFINE_INODE_IREC_EVENT(xfs_reflink_cow_remap_from);
 DEFINE_INODE_IREC_EVENT(xfs_reflink_cow_remap_to);
+DEFINE_INODE_IREC_EVENT(xfs_reflink_cow_remap_skip);
 
 DEFINE_INODE_ERROR_EVENT(xfs_reflink_cancel_cow_range_error);
 DEFINE_INODE_ERROR_EVENT(xfs_reflink_end_cow_error);
diff --git a/fs/xfs/xfs_zone_alloc.c b/fs/xfs/xfs_zone_alloc.c
new file mode 100644 (file)
index 0000000..9cdce2c
--- /dev/null
@@ -0,0 +1,557 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2023 Christoph Hellwig.
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_fsops.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_inode.h"
+#include "xfs_iomap.h"
+#include "xfs_trans.h"
+#include "xfs_rtgroup.h"
+#include "xfs_rtbitmap.h"
+#include "xfs_zone_alloc.h"
+#include "xfs_zones.h"
+#include "xfs_trace.h"
+
+/*
+ * Keep track of a counter of blocks used in a rtgroup.  This is incremented
+ * after the blocks have been written to and the I/O completion handler sets
+ * up the bmap and rmap records to link them into the file system metadata
+ * and decremented when the blocks are "freed" by unlinking them from the bmap
+ * and rmap trees.  The space will only become available for reuse when the
+ * zone is reset.
+ *
+ * The way this is stored is a bit of a hack and abuses the atime field in the
+ * rmap inode.  There is precedent for this in the rtbitmap inode, but it is
+ * a bit ugly.
+ */
+uint64_t *
+xfs_zone_used_counter(
+       struct xfs_rtgroup      *rtg)
+{
+       return (uint64_t *)&VFS_I(rtg->rtg_inodes[XFS_RTGI_RMAP])->i_atime_sec;
+}
+
+/*
+ * Keep track of the last written block in a zone.
+ *
+ * This is only needed when using the zoned allocator on a device that doesn't
+ * support zones natively and is an approximation for the hardware write
+ * pointer.  Unlike the hardware write pointer it might be past regions that
+ * haven't been written to.  In case of an unclean shutdown this means there
+ * could be blocks that will never be written before the zone is finished.
+ * This is a little bit inefficient, but not a real problem as the used counter
+ * above doesn't account for them, so they will be treated by zone reclaim as
+ * if these blocks were written to but deleted immediately.
+ *
+ * This uses the same kind of hack to store extra information in the rmap inode
+ * as the used counter above.
+ */
+uint64_t *
+xfs_zone_last_written(
+       struct xfs_rtgroup      *rtg)
+{
+       return (uint64_t *)&VFS_I(rtg->rtg_inodes[XFS_RTGI_RMAP])->i_mtime_sec;
+}
+
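
Pulling the two comments above together: the write pointer advances when blocks are handed out, the written count catches up at I/O completion, and the used count only tracks completed blocks that are still referenced, with freed space becoming reusable only at zone reset.  A compact userspace model of that bookkeeping (the struct and field names are invented, not the in-core or on-disk layout):

#include <stdint.h>
#include <assert.h>

struct zone_model {
        uint64_t capacity;      /* usable blocks in the zone */
        uint64_t write_pointer; /* next block to hand out */
        uint64_t written;       /* blocks whose I/O has completed */
        uint64_t used;          /* completed blocks still referenced */
};

static void record_blocks(struct zone_model *z, uint64_t len, int used)
{
        z->written += len;
        if (used)
                z->used += len;
        assert(z->written <= z->write_pointer);
}

static void free_blocks(struct zone_model *z, uint64_t len)
{
        assert(len <= z->used);
        z->used -= len;
        /* the space only becomes reusable once the whole zone is reset */
}

int main(void)
{
        struct zone_model z = { .capacity = 64 };

        z.write_pointer += 16;          /* allocation */
        record_blocks(&z, 16, 1);       /* I/O completion */
        free_blocks(&z, 16);            /* file deleted or overwritten */
        return 0;
}
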
+static void
+xfs_zone_emptied(
+       struct xfs_rtgroup      *rtg)
+{
+       struct xfs_mount        *mp = rtg_mount(rtg);
+
+       trace_xfs_zone_emptied(rtg);
+
+       xfs_group_clear_mark(&rtg->rtg_group, XFS_RTG_RECLAIMABLE);
+
+       spin_lock(&mp->m_zone_list_lock);
+       ASSERT(list_empty(&rtg->rtg_entry));
+       list_add_tail(&rtg->rtg_entry, &mp->m_emptied_zones);
+       spin_unlock(&mp->m_zone_list_lock);
+
+       wake_up_process(mp->m_zone_gc_thread);
+}
+
+static void
+xfs_zone_mark_reclaimable(
+       struct xfs_rtgroup      *rtg)
+{
+       struct xfs_mount        *mp = rtg_mount(rtg);
+
+       xfs_group_set_mark(&rtg->rtg_group, XFS_RTG_RECLAIMABLE);
+       if (xfs_zoned_need_gc(mp))
+               wake_up_process(mp->m_zone_gc_thread);
+}
+
+static void
+xfs_zone_mark_full(
+       struct xfs_rtgroup      *rtg)
+{
+       struct xfs_mount        *mp = rtg_mount(rtg);
+
+       trace_xfs_zone_full(rtg);
+
+       spin_lock(&mp->m_zone_list_lock);
+       clear_bit(RTG_F_OPEN, &rtg->rtg_flags);
+       if (!list_empty(&rtg->rtg_entry)) {
+               /* empty list means this is the open GC zone */
+               mp->m_nr_open_zones--;
+               list_del_init(&rtg->rtg_entry);
+       }
+       spin_unlock(&mp->m_zone_list_lock);
+
+       wake_up_all(&mp->m_zone_wait);
+       if (*xfs_zone_used_counter(rtg) < rtg->rtg_extents)
+               xfs_zone_mark_reclaimable(rtg);
+}
+
+/*
+ * Record data blocks as having been written to.
+ *
+ * This is called from the write completion handler and records blocks as
+ * actually used.  For zoned devices all this is purely an in-memory
+ * exercise to manage the open zones, but if we run on a conventional
+ * device we also have to record the last written block as the write pointer
+ * approximation.
+ */
+int
+xfs_zone_record_blocks(
+       struct xfs_trans        *tp,
+       xfs_fsblock_t           fsbno,
+       xfs_filblks_t           len,
+       bool                    used)
+{
+       struct xfs_mount        *mp = tp->t_mountp;
+       xfs_rgblock_t           rgbno = xfs_rtb_to_rgbno(mp, fsbno);
+       struct xfs_rtgroup      *rtg;
+
+       rtg = xfs_rtgroup_get(mp, xfs_rtb_to_rgno(mp, fsbno));
+       if (!rtg)
+               return -EIO;
+
+       trace_xfs_zone_record_blocks(rtg, rgbno, len);
+
+       xfs_ilock(rtg->rtg_inodes[XFS_RTGI_RMAP], XFS_ILOCK_EXCL);
+       xfs_trans_ijoin(tp, rtg->rtg_inodes[XFS_RTGI_RMAP], XFS_ILOCK_EXCL);
+
+       if (used) {
+               *xfs_zone_used_counter(rtg) += len;
+               ASSERT(*xfs_zone_used_counter(rtg) <= rtg->rtg_extents);
+       } else {
+               xfs_add_frextents(mp, xfs_extlen_to_rtxlen(mp, len));
+       }
+
+       if (rgbno + len > *xfs_zone_last_written(rtg))
+               *xfs_zone_last_written(rtg) = rgbno + len;
+
+       rtg->rtg_written += len;
+       ASSERT(rtg->rtg_written <= rtg->rtg_write_pointer);
+       if (rtg->rtg_written == rtg->rtg_extents)
+               xfs_zone_mark_full(rtg);
+
+       xfs_trans_log_inode(tp, rtg->rtg_inodes[XFS_RTGI_RMAP], XFS_ILOG_CORE);
+
+       xfs_rtgroup_put(rtg);
+       return 0;
+}
+
+/*
+ * "Free" blocks allocated in a zone.
+ *
+ * Just decrement the used blocks counter and report the space as freed.
+ */
+int
+xfs_zone_free_blocks(
+       struct xfs_trans        *tp,
+       struct xfs_rtgroup      *rtg,
+       xfs_fsblock_t           fsbno,
+       xfs_filblks_t           len)
+{
+       struct xfs_mount        *mp = tp->t_mountp;
+       uint64_t                *used = xfs_zone_used_counter(rtg);
+
+       xfs_assert_ilocked(rtg->rtg_inodes[XFS_RTGI_RMAP], XFS_ILOCK_EXCL);
+       if (len > *used) {
+               xfs_err(mp,
+"trying to free more blocks (%lld) than used counter (%lld).",
+                       len, *used);
+               ASSERT(len <= *used);
+               xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
+               return -EFSCORRUPTED;
+       }
+
+       trace_xfs_zone_free_blocks(rtg, xfs_rtb_to_rgbno(mp, fsbno), len);
+
+       *used -= len;
+       if (rtg->rtg_written == rtg->rtg_extents) {
+               /*
+                * Mark the zone as reclaimable, but only if the zone is full,
+                * as we don't reclaim open zones.  As an optimization, kick off
+                * a zone reset if the usage counter hits zero.
+                */
+               if (*used == 0)
+                       xfs_zone_emptied(rtg);
+               else if (*used + len == rtg->rtg_extents)
+                       xfs_zone_mark_reclaimable(rtg);
+       }
+
+       xfs_add_frextents(mp, xfs_extlen_to_rtxlen(mp, len));
+       xfs_trans_log_inode(tp, rtg->rtg_inodes[XFS_RTGI_RMAP], XFS_ILOG_CORE);
+       return 0;
+}
+
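
Combining xfs_zone_free_blocks with the helpers above, a full zone whose used count drops becomes a reclaim candidate, and one whose used count reaches zero can be reset right away.  A tiny sketch of those transitions, simplified to a pure function over illustrative counters:

#include <stdint.h>
#include <stdio.h>

enum zone_state { ZONE_FULL, ZONE_RECLAIMABLE, ZONE_EMPTY };

/* Classify a fully written zone by how many of its blocks are still used. */
static enum zone_state classify_full_zone(uint64_t used, uint64_t capacity)
{
        if (used == 0)
                return ZONE_EMPTY;              /* reset candidate */
        if (used < capacity)
                return ZONE_RECLAIMABLE;        /* GC candidate */
        return ZONE_FULL;                       /* nothing to reclaim yet */
}

int main(void)
{
        printf("%d\n", classify_full_zone(64, 64));     /* ZONE_FULL */
        printf("%d\n", classify_full_zone(40, 64));     /* ZONE_RECLAIMABLE */
        printf("%d\n", classify_full_zone(0, 64));      /* ZONE_EMPTY */
        return 0;
}
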
+/*
+ * Check if the zone containing the data just before the offset we are
+ * writing to is still open and has space.
+ */
+static struct xfs_rtgroup *
+xfs_last_used_zone(
+       struct iomap_ioend      *ioend)
+{
+       struct xfs_inode        *ip = XFS_I(ioend->io_inode);
+       struct xfs_mount        *mp = ip->i_mount;
+       xfs_fileoff_t           offset_fsb = XFS_B_TO_FSB(mp, ioend->io_offset);
+       struct xfs_rtgroup      *rtg = NULL;
+       struct xfs_iext_cursor  icur;
+       struct xfs_bmbt_irec    got;
+
+       xfs_ilock(ip, XFS_ILOCK_SHARED);
+       if (!xfs_iext_lookup_extent_before(ip, &ip->i_df, &offset_fsb,
+                               &icur, &got))
+               goto out_unlock;
+       ASSERT(!isnullstartblock(got.br_startblock));
+       rtg = xfs_rtgroup_grab(mp, xfs_rtb_to_rgno(mp, got.br_startblock));
+       if (rtg && !test_bit(RTG_F_OPEN, &rtg->rtg_flags)) {
+               xfs_rtgroup_rele(rtg);
+               rtg = NULL;
+       }
+out_unlock:
+       xfs_iunlock(ip, XFS_ILOCK_SHARED);
+       return rtg;
+}
+
+struct xfs_rtgroup *
+xfs_find_free_zone(
+       struct xfs_mount        *mp)
+{
+       struct xfs_rtgroup      *rtg;
+
+       lockdep_assert_held(&mp->m_zone_list_lock);
+
+       list_for_each_entry(rtg, &mp->m_free_zones, rtg_entry) {
+               ASSERT(rtg->rtg_write_pointer == 0);
+               if (atomic_inc_not_zero(&rtg->rtg_group.xg_active_ref)) {
+                       list_del_init(&rtg->rtg_entry);
+                       atomic_dec(&mp->m_nr_free_zones);
+                       return rtg;
+               }
+       }
+
+       return NULL;
+}
+
+/*
+ * Activate a free zone.
+ *
+ * This just does the accounting and makes the zone findable on the open
+ * zones list.  Don't bother with an explicit open command; we'll just open it
+ * implicitly with the first write to it.
+ */
+static struct xfs_rtgroup *
+xfs_activate_zone(
+       struct xfs_mount        *mp)
+{
+       struct xfs_rtgroup      *rtg;
+
+       if (atomic_read(&mp->m_nr_free_zones) <
+           XFS_GC_ZONES - XFS_OPEN_GC_ZONES)
+               return NULL;
+
+       rtg = xfs_find_free_zone(mp);
+       if (!rtg)
+               return NULL;
+
+       list_add_tail(&rtg->rtg_entry, &mp->m_open_zones);
+       mp->m_nr_open_zones++;
+       if (xfs_zoned_need_gc(mp))
+               wake_up_process(mp->m_zone_gc_thread);
+
+       /* XXX: this is a little verbose, but let's keep it for now */
+       xfs_info(mp, "using zone %u (%d)",
+                rtg_rgno(rtg), mp->m_nr_open_zones);
+       set_bit(RTG_F_OPEN, &rtg->rtg_flags);
+       trace_xfs_zone_activate(rtg);
+       return rtg;
+}
+
+/*
+ * For SMR hard drives that have no open limit, keep opening a new zone for each
+ * allocation context.  If all zones in the system are open, use this simple LRU
+ * algorithm to pick the one that was least recently used.
+ *
+ * This requires that any reused zone is rotated to the end of the open list so
+ * that the next user doesn't pick it again.
+ */
+static struct xfs_rtgroup *
+xfs_select_open_zone_lru(
+       struct xfs_mount        *mp,
+       unsigned int            minlen)
+{
+       struct xfs_rtgroup      *rtg;
+
+       list_for_each_entry(rtg, &mp->m_open_zones, rtg_entry) {
+               if (rtg->rtg_extents - rtg->rtg_write_pointer < minlen)
+                       continue;
+               if (!atomic_inc_not_zero(&rtg->rtg_group.xg_active_ref))
+                       continue;
+               list_move_tail(&rtg->rtg_entry, &mp->m_open_zones);
+               return rtg;
+       }
+
+       return NULL;
+}
+
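
The LRU pick described above can be modelled with an ordinary linked list: scan from the head for a zone with enough remaining space and rotate the chosen zone to the tail so the next caller prefers a different one.  A self-contained sketch with invented types (not the kernel's list implementation):

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

struct mzone {
        uint64_t remaining;     /* capacity minus write pointer */
        struct mzone *next;
};

/*
 * Pick the least recently used zone with at least minlen blocks left and
 * move it to the tail of the singly linked open list.
 */
static struct mzone *select_lru(struct mzone **head, uint64_t minlen)
{
        struct mzone **pp, *z;

        for (pp = head; (z = *pp) != NULL; pp = &z->next) {
                if (z->remaining < minlen)
                        continue;
                *pp = z->next;                  /* unlink */
                while (*pp)                     /* find the tail */
                        pp = &(*pp)->next;
                z->next = NULL;
                *pp = z;                        /* reinsert at the tail */
                return z;
        }
        return NULL;
}

int main(void)
{
        struct mzone c = { 100, NULL }, b = { 2, &c }, a = { 50, &b };
        struct mzone *head = &a;

        printf("%p picked\n", (void *)select_lru(&head, 10));   /* &a */
        printf("%p picked\n", (void *)select_lru(&head, 10));   /* &c, a was rotated */
        return 0;
}
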
+/*
+ * Pick a new zone for writes.
+ *
+ * If we aren't using up our budget of open zones just open a new one from
+ * the freelist.  Otherwise try to find one that matches the expected allocation
+ * length, or at least the minimum required length.  If we don't find one
+ * that is good enough we pick one anyway and let the caller finish it to
+ * free up open zone resources.
+ */
+static struct xfs_rtgroup *
+xfs_select_zone_nowait(
+       struct xfs_inode        *ip,
+       xfs_filblks_t           count_fsb)
+{
+       struct xfs_mount        *mp = ip->i_mount;
+       struct xfs_rtgroup      *rtg;
+
+       /*
+        * If we are below the open limit try to activate a zone.
+        */
+       if (mp->m_nr_open_zones < mp->m_max_open_zones - XFS_OPEN_GC_ZONES) {
+               rtg = xfs_activate_zone(mp);
+               if (rtg)
+                       return rtg;
+       }
+
+       rtg = xfs_select_open_zone_lru(mp, count_fsb);
+       if (rtg)
+               return rtg;
+       return xfs_select_open_zone_lru(mp, 1);
+}
+
+static struct xfs_rtgroup *
+xfs_select_zone(
+       struct iomap_ioend      *ioend)
+{
+       struct xfs_inode        *ip = XFS_I(ioend->io_inode);
+       struct xfs_mount        *mp = ip->i_mount;
+       xfs_filblks_t           count_fsb = XFS_B_TO_FSB(mp, ioend->io_size);
+       struct xfs_rtgroup      *rtg = NULL;
+       DEFINE_WAIT             (wait);
+
+       spin_lock(&mp->m_zone_list_lock);
+       if (xfs_is_shutdown(mp))
+               goto out_unlock;
+
+       rtg = xfs_select_zone_nowait(ip, count_fsb);
+       if (rtg)
+               goto out_unlock;
+
+       for (;;) {
+               prepare_to_wait(&mp->m_zone_wait, &wait, TASK_UNINTERRUPTIBLE);
+               if (xfs_is_shutdown(mp))
+                       break;
+
+               rtg = xfs_select_zone_nowait(ip, count_fsb);
+               if (rtg)
+                       break;
+
+               spin_unlock(&mp->m_zone_list_lock);
+               schedule();
+               spin_lock(&mp->m_zone_list_lock);
+       }
+       finish_wait(&mp->m_zone_wait, &wait);
+
+out_unlock:
+       spin_unlock(&mp->m_zone_list_lock);
+       return rtg;
+}
+
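+/*
+ * Hand out blocks starting at the zone's write pointer, capped by both the
+ * remaining space in the zone and the maximum bmap extent length.  For
+ * sequential write required zones the bio is started at the zone start and
+ * the device picks the actual location when executing the zone append.
+ */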
+static unsigned int
+xfs_zone_alloc_blocks(
+       struct iomap_ioend      *ioend,
+       struct xfs_rtgroup      *rtg,
+       bool                    *is_seq)
+{
+       struct xfs_mount        *mp = rtg_mount(rtg);
+       xfs_filblks_t           count_fsb = XFS_B_TO_FSB(mp, ioend->io_size);
+       xfs_rgblock_t           rgbno;
+
+       spin_lock(&rtg->rtg_alloc_lock);
+       count_fsb = min3(count_fsb, XFS_MAX_BMBT_EXTLEN,
+               (xfs_filblks_t)rtg->rtg_extents - rtg->rtg_write_pointer);
+       if (!count_fsb || !test_bit(RTG_F_OPEN, &rtg->rtg_flags)) {
+               spin_unlock(&rtg->rtg_alloc_lock);
+               return 0;
+       }
+       rgbno = rtg->rtg_write_pointer;
+       rtg->rtg_write_pointer += count_fsb;
+       spin_unlock(&rtg->rtg_alloc_lock);
+
+       trace_xfs_zone_alloc_blocks(rtg, rgbno, count_fsb);
+
+       *is_seq = test_bit(RTG_F_SEQUENTIAL, &rtg->rtg_flags);
+       if (*is_seq)
+               rgbno = 0;
+       ioend->io_sector = xfs_rtb_to_daddr(mp, xfs_rgbno_to_rtb(rtg, rgbno));
+       return XFS_FSB_TO_B(mp, count_fsb);
+}
+
+static inline void
+xfs_mark_rtg_boundary(
+       struct iomap_ioend      *ioend)
+{
+       struct xfs_mount        *mp = XFS_I(ioend->io_inode)->i_mount;
+       sector_t                sector = ioend->io_bio.bi_iter.bi_sector;
+
+       if (xfs_rtb_to_rgbno(mp, xfs_daddr_to_rtb(mp, sector)) == 0)
+               ioend->io_flags |= IOMAP_F_BOUNDARY;
+}
+
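+/*
+ * Submit a zoned write.  Sequential write required zones use zone append so
+ * that the device picks the actual write location; conventional zones are
+ * written directly at the allocated sector.
+ */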
+static void
+xfs_submit_zoned_bio(
+       struct iomap_ioend      *ioend,
+       bool                    is_seq)
+{
+       if (is_seq) {
+               ioend->io_bio.bi_opf &= ~REQ_OP_WRITE;
+               ioend->io_bio.bi_opf |= REQ_OP_ZONE_APPEND;
+       } else {
+               xfs_mark_rtg_boundary(ioend);
+       }
+
+       ioend->io_bio.bi_iter.bi_sector = ioend->io_sector;
+       submit_bio(&ioend->io_bio);
+}
+
+void
+xfs_zone_alloc_and_submit(
+       struct iomap_ioend      *ioend,
+       struct xfs_rtgroup      **rtg)
+{
+       unsigned int            alloc_len;
+       struct iomap_ioend      *split;
+       bool                    is_seq;
+
+       if (xfs_is_shutdown(XFS_I(ioend->io_inode)->i_mount))
+               goto out_error;
+
+       /*
+        * If we don't have a cached zone in this write context, see if the
+        * last extent before the one we are writing points to an active zone.
+        * If so, just continue writing to it.
+        */
+       if (!*rtg)
+               *rtg = xfs_last_used_zone(ioend);
+
+       if (!*rtg) {
+select_zone:
+               *rtg = xfs_select_zone(ioend);
+               if (!*rtg)
+                       goto out_error;
+       }
+
+       alloc_len = xfs_zone_alloc_blocks(ioend, *rtg, &is_seq);
+       if (!alloc_len) {
+               xfs_zone_finish_alloc(*rtg);
+               goto select_zone;
+       }
+
+       while ((split = iomap_split_ioend(ioend, is_seq, &alloc_len))) {
+               xfs_submit_zoned_bio(split, is_seq);
+               if (!alloc_len) {
+                       xfs_zone_finish_alloc(*rtg);
+                       goto select_zone;
+               }
+       }
+
+       xfs_submit_zoned_bio(ioend, is_seq);
+       return;
+
+out_error:
+       bio_io_error(&ioend->io_bio);
+}
+
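+/*
+ * Drop the reference to a zone handed out by the zone allocator.
+ */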
+void
+xfs_zone_finish_alloc(
+       struct xfs_rtgroup      *rtg)
+{
+       if (rtg)
+               xfs_rtgroup_rele(rtg);
+}
+
+static void
+xfs_show_zone(
+       struct seq_file         *m,
+       struct xfs_rtgroup      *rtg)
+{
+       seq_printf(m, "\t  zone %d, wp %u, written %u, used %llu\n",
+               rtg_rgno(rtg),
+               rtg->rtg_write_pointer, rtg->rtg_written,
+               *xfs_zone_used_counter(rtg));
+}
+
+void
+xfs_zoned_show_stats(
+       struct seq_file         *m,
+       struct xfs_mount        *mp)
+{
+       unsigned long           index = 0;
+       unsigned                count = 0;
+       struct xfs_rtgroup      *rtg;
+
+       seq_puts(m, "\n");
+
+       seq_printf(m, "\tuser free blocks: %lld\n",
+               xfs_sum_freecounter(mp, FREE_RTEXTENTS));
+       seq_printf(m, "\treserved free blocks: %lld\n",
+               mp->m_resblks[FREE_RTEXTENTS].avail);
+       seq_printf(m, "\tuser available blocks: %lld\n",
+               xfs_sum_freecounter(mp, FREE_RTAVAILABLE));
+       seq_printf(m, "\treserved available blocks: %lld\n",
+               mp->m_resblks[FREE_RTAVAILABLE].avail);
+       seq_printf(m, "\treservations required: %d\n",
+               !list_empty_careful(&mp->m_reclaim_reservations));
+       seq_printf(m, "\tGC required: %d\n",
+               xfs_zoned_need_gc(mp));
+
+       spin_lock(&mp->m_zone_list_lock);
+       seq_printf(m, "\tfree zones: %d\n", atomic_read(&mp->m_nr_free_zones));
+       seq_puts(m, "\topen zones:\n");
+       list_for_each_entry(rtg, &mp->m_open_zones, rtg_entry)
+               xfs_show_zone(m, rtg);
+       if (mp->m_open_gc_zone) {
+               seq_puts(m, "\topen gc zone:\n");
+               xfs_show_zone(m, mp->m_open_gc_zone);
+       }
+       seq_puts(m, "\treclaimable zones:\n");
+       xa_for_each_marked(&mp->m_groups[XG_TYPE_RTG].xa, index, rtg,
+                       XFS_RTG_RECLAIMABLE) {
+               if (++count > 20) {
+                       seq_puts(m, "\t  (truncated)\n");
+                       break;
+               }
+               xfs_show_zone(m, rtg);
+       }
+       spin_unlock(&mp->m_zone_list_lock);
+}
diff --git a/fs/xfs/xfs_zone_alloc.h b/fs/xfs/xfs_zone_alloc.h
new file mode 100644 (file)
index 0000000..0e5c612
--- /dev/null
@@ -0,0 +1,52 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _XFS_ZONE_ALLOC_H
+#define _XFS_ZONE_ALLOC_H
+
+void xfs_zone_alloc_and_submit(struct iomap_ioend *ioend,
+               struct xfs_rtgroup **rtg);
+void xfs_zone_finish_alloc(struct xfs_rtgroup *rtg);
+int xfs_zone_record_blocks(struct xfs_trans *tp, xfs_fsblock_t fsbno,
+               xfs_filblks_t len, bool used);
+int xfs_zone_free_blocks(struct xfs_trans *tp, struct xfs_rtgroup *rtg,
+               xfs_fsblock_t fsbno, xfs_filblks_t len);
+
+uint64_t xfs_zoned_default_resblks(struct xfs_mount *mp, unsigned int idx);
+
+int xfs_mount_zones(struct xfs_mount *mp);
+void xfs_unmount_zones(struct xfs_mount *mp);
+
+#ifdef CONFIG_XFS_RT
+void xfs_zone_gc_start(struct xfs_mount *mp);
+void xfs_zone_gc_stop(struct xfs_mount *mp);
+#else
+static inline void xfs_zone_gc_start(struct xfs_mount *mp)
+{
+}
+static inline void xfs_zone_gc_stop(struct xfs_mount *mp)
+{
+}
+#endif /* CONFIG_XFS_RT */
+
+void xfs_zoned_show_stats(struct seq_file *m, struct xfs_mount *mp);
+
+uint64_t *xfs_zone_used_counter(struct xfs_rtgroup *rtg);
+uint64_t *xfs_zone_last_written(struct xfs_rtgroup *rtg);
+
+struct xfs_zone_alloc_ctx {
+       struct xfs_rtgroup      *cached_rtg;
+       xfs_filblks_t           reserved_blocks;
+};
+
+#define XFS_ZR_GREEDY          (1U << 0)
+#define XFS_ZR_NOWAIT          (1U << 1)
+#define XFS_ZR_RESERVED                (1U << 2)
+
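+/*
+ * Example usage of the reservation interface (sketch only, actual callers
+ * may differ):  a writer reserves space before starting the write and
+ * releases it again when the write has finished or failed, e.g.:
+ *
+ *	struct xfs_zone_alloc_ctx ac = { };
+ *	int error;
+ *
+ *	error = xfs_zoned_space_reserve(ip, count_fsb, 0, &ac);
+ *	if (error)
+ *		return error;
+ *	... write, allocating out of the reservation ...
+ *	xfs_zoned_space_unreserve(ip, &ac);
+ */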
+int xfs_zoned_space_reserve(struct xfs_inode *ip, xfs_filblks_t count_fsb,
+               unsigned int flags, struct xfs_zone_alloc_ctx *ac);
+void xfs_zoned_space_unreserve(struct xfs_inode *ip,
+               struct xfs_zone_alloc_ctx *ac);
+void xfs_zoned_add_available(struct xfs_mount *mp, xfs_filblks_t count_fsb);
+bool xfs_zoned_need_gc(struct xfs_mount *mp);
+struct xfs_rtgroup *xfs_find_free_zone(struct xfs_mount *mp);
+
+#endif /* _XFS_ZONE_ALLOC_H */
diff --git a/fs/xfs/xfs_zone_gc.c b/fs/xfs/xfs_zone_gc.c
new file mode 100644 (file)
index 0000000..6fb8d62
--- /dev/null
@@ -0,0 +1,1409 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2023 Christoph Hellwig.
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_fsops.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_inode.h"
+#include "xfs_iomap.h"
+#include "xfs_btree.h"
+#include "xfs_trans.h"
+#include "xfs_icache.h"
+#include "xfs_rmap.h"
+#include "xfs_reflink.h"
+#include "xfs_rtgroup.h"
+#include "xfs_rtbitmap.h"
+#include "xfs_rtrmap_btree.h"
+#include "xfs_zone_alloc.h"
+#include "xfs_zones.h"
+#include "xfs_trace.h"
+
+struct xfs_zone_reservation {
+       struct list_head        entry;
+       struct task_struct      *task;
+       xfs_rtxnum_t            rtxlen;
+};
+
+uint64_t
+xfs_zoned_default_resblks(
+       struct xfs_mount        *mp,
+       unsigned int            idx)
+{
+       /*
+        * For the available blocks dipped into by the allocator, only reserve
+        * the required GC zones.
+        */
+       if (idx == FREE_RTAVAILABLE)
+               return XFS_GC_ZONES * mp->m_groups[XG_TYPE_RTG].blocks;
+
+       /*
+        * For the user reported blocks, include at least the extra sparse zone
+        * and also any extra overprovisioning.
+        */
+       return XFS_RESERVED_ZONES * mp->m_groups[XG_TYPE_RTG].blocks +
+               XFS_B_TO_FSB(mp, mp->m_zoned_op);
+}
+
+/*
+ * We aim to keep enough zones free in stock to fully use the open zone limit
+ * for data placement purposes.
+ */
+bool
+xfs_zoned_need_gc(
+       struct xfs_mount        *mp)
+{
+       if (!xfs_group_marked(mp, XG_TYPE_RTG, XFS_RTG_RECLAIMABLE))
+               return false;
+       if (xfs_estimate_freecounter(mp, FREE_RTAVAILABLE) <
+           mp->m_groups[XG_TYPE_RTG].blocks *
+           (mp->m_max_open_zones - XFS_OPEN_GC_ZONES))
+               return true;
+       return false;
+}
+
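+/*
+ * Wake everybody waiting on a space reservation so that they can recheck the
+ * free counters (and the shutdown state).
+ */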
+static void
+xfs_zoned_wake_all(
+       struct xfs_mount                *mp)
+{
+       struct xfs_zone_reservation     *reservation;
+
+       spin_lock(&mp->m_reservation_lock);
+       list_for_each_entry(reservation, &mp->m_reclaim_reservations, entry)
+               wake_up_process(reservation->task);
+       spin_unlock(&mp->m_reservation_lock);
+}
+
+void
+xfs_zoned_add_available(
+       struct xfs_mount                *mp,
+       xfs_filblks_t                   count_fsb)
+{
+       struct xfs_zone_reservation     *reservation;
+       xfs_rtxnum_t                    rtxlen;
+
+       rtxlen = xfs_extlen_to_rtxlen(mp, count_fsb);
+       if (list_empty_careful(&mp->m_reclaim_reservations)) {
+               xfs_add_freecounter(mp, FREE_RTAVAILABLE, rtxlen);
+               return;
+       }
+
+       spin_lock(&mp->m_reservation_lock);
+       xfs_add_freecounter(mp, FREE_RTAVAILABLE, rtxlen);
+       rtxlen = xfs_sum_freecounter(mp, FREE_RTAVAILABLE);
+       list_for_each_entry(reservation, &mp->m_reclaim_reservations, entry) {
+               if (reservation->rtxlen > rtxlen)
+                       break;
+               wake_up_process(reservation->task);
+               rtxlen -= reservation->rtxlen;
+       }
+       spin_unlock(&mp->m_reservation_lock);
+}
+
+static int
+xfs_zoned_space_wait_error(
+       struct xfs_mount                *mp)
+{
+       if (xfs_is_shutdown(mp))
+               return -EIO;
+       if (fatal_signal_pending(current))
+               return -EINTR;
+       return 0;
+}
+
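+/*
+ * Reserve writable space tracked in FREE_RTAVAILABLE.  If not enough is
+ * available, queue up on the reservation list and wait for GC to free space,
+ * giving up on shutdown, a fatal signal, or when there is nothing left to
+ * reclaim.
+ */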
+static int
+xfs_zoned_reserve_available(
+       struct xfs_inode                *ip,
+       xfs_rtxlen_t                    rtxlen,
+       unsigned int                    flags)
+{
+       struct xfs_mount                *mp = ip->i_mount;
+       struct xfs_zone_reservation     reservation = {
+               .task           = current,
+               .rtxlen         = rtxlen,
+       };
+       int                             error;
+
+       if (likely(list_empty_careful(&mp->m_reclaim_reservations))) {
+               error = xfs_dec_freecounter(mp, FREE_RTAVAILABLE, rtxlen,
+                               flags & XFS_ZR_RESERVED);
+               if (error != -ENOSPC)
+                       return error;
+       }
+
+       if (flags & XFS_ZR_NOWAIT)
+               return -EAGAIN;
+
+       spin_lock(&mp->m_reservation_lock);
+       list_add_tail(&reservation.entry, &mp->m_reclaim_reservations);
+       while ((error = xfs_zoned_space_wait_error(mp)) == 0) {
+               set_current_state(TASK_KILLABLE);
+
+               error = xfs_dec_freecounter(mp, FREE_RTAVAILABLE, rtxlen,
+                               flags & XFS_ZR_RESERVED);
+               if (error != -ENOSPC)
+                       break;
+
+               /*
+                * If there is nothing left to reclaim, give up.
+                */
+               if (!xfs_is_in_gc(mp) &&
+                   !xfs_group_marked(mp, XG_TYPE_RTG, XFS_RTG_RECLAIMABLE))
+                       break;
+
+               spin_unlock(&mp->m_reservation_lock);
+               schedule();
+               spin_lock(&mp->m_reservation_lock);
+       }
+       list_del(&reservation.entry);
+       spin_unlock(&mp->m_reservation_lock);
+
+       __set_current_state(TASK_RUNNING);
+       return error;
+}
+
+/*
+ * Implement greedy space allocation for short writes by trying to grab all
+ * that is left after locking out other threads from trying to do the same.
+ *
+ * This isn't exactly optimal and can hopefully be replaced by a proper
+ * percpu_counter primitive one day.
+ */
+static int
+xfs_zoned_reserve_extents_greedy(
+       struct xfs_inode                *ip,
+       xfs_rtxlen_t                    *rtxlen,
+       unsigned int                    flags)
+{
+       struct xfs_mount                *mp = ip->i_mount;
+       s64                             len = *rtxlen;
+       int                             error = -ENOSPC;
+
+       spin_lock(&mp->m_reservation_lock);
+       len = min(len, xfs_sum_freecounter(mp, FREE_RTEXTENTS));
+       if (len > 0) {
+               *rtxlen = len;
+               error = xfs_dec_freecounter(mp, FREE_RTEXTENTS, *rtxlen,
+                               flags & XFS_ZR_RESERVED);
+       }
+       spin_unlock(&mp->m_reservation_lock);
+       return error;
+}
+
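+/*
+ * Reserve space for a zoned write:  decrement both the user visible free
+ * space (FREE_RTEXTENTS) and the directly writable space (FREE_RTAVAILABLE),
+ * waiting for GC if needed.
+ */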
+int
+xfs_zoned_space_reserve(
+       struct xfs_inode                *ip,
+       xfs_filblks_t                   count_fsb,
+       unsigned int                    flags,
+       struct xfs_zone_alloc_ctx       *ac)
+{
+       struct xfs_mount                *mp = ip->i_mount;
+       xfs_rtxlen_t                    rtxlen;
+       int                             error;
+
+       ac->cached_rtg = NULL;
+
+       rtxlen = xfs_extlen_to_rtxlen(mp, count_fsb);
+       error = xfs_dec_freecounter(mp, FREE_RTEXTENTS, rtxlen,
+                       flags & XFS_ZR_RESERVED);
+       if (error == -ENOSPC && (flags & XFS_ZR_GREEDY) && rtxlen > 1) {
+               error = xfs_zoned_reserve_extents_greedy(ip, &rtxlen, flags);
+               if (error)
+                       return error;
+       }
+       error = xfs_zoned_reserve_available(ip, rtxlen, flags);
+       if (error) {
+               xfs_add_freecounter(mp, FREE_RTEXTENTS, rtxlen);
+               return error;
+       }
+       ac->reserved_blocks = xfs_rtxlen_to_extlen(mp, rtxlen);
+       return 0;
+}
+
+void
+xfs_zoned_space_unreserve(
+       struct xfs_inode                *ip,
+       struct xfs_zone_alloc_ctx       *ac)
+{
+       if (ac->reserved_blocks > 0) {
+               struct xfs_mount        *mp = ip->i_mount;
+
+               xfs_zoned_add_available(mp, ac->reserved_blocks);
+               xfs_add_freecounter(mp, FREE_RTEXTENTS,
+                               xfs_extlen_to_rtxlen(mp, ac->reserved_blocks));
+       }
+       xfs_zone_finish_alloc(ac->cached_rtg);
+}
+
+/*
+ * Split up rewrites into smaller chunks (1MB).
+ */
+#define XFS_GC_CHUNK_SIZE      (1024u * 1024)
+
+#define XFS_ZONE_GC_NR_SCRATCH 2
+struct xfs_zone_scratch {
+       struct folio                    *folio;
+       unsigned int                    offset;
+       unsigned int                    freed;
+};
+
+struct xfs_gc_bio {
+       struct xfs_inode                *ip;
+       loff_t                          offset;
+       unsigned int                    len;
+       bool                            is_seq;
+       xfs_fsblock_t                   old_startblock;
+       xfs_daddr_t                     new_daddr;
+       union {
+               struct xfs_zone_scratch         *scratch;
+               struct xfs_zone_gc_data         *data;
+       };
+
+       struct bio_vec                  bv;
+       struct bio                      bio; /* must be last */
+};
+
+struct xfs_zone_gc_data {
+       /* global GC state */
+       struct xfs_mount                *mp;
+       struct bio_set                  bio_set;
+       struct xfs_zone_scratch         scratch[XFS_ZONE_GC_NR_SCRATCH];
+       unsigned int                    scratch_idx;
+       struct bio_list                 read_done;
+       struct bio_list                 write_done;
+       struct bio_list                 reset_done;
+       spinlock_t                      list_lock;
+       unsigned int                    inflight;
+};
+
+static struct xfs_zone_gc_data *
+xfs_zone_gc_data_alloc(
+       struct xfs_mount        *mp)
+{
+       struct xfs_zone_gc_data *data;
+       int i;
+
+       data = kzalloc(sizeof(*data), GFP_KERNEL);
+       if (!data)
+               return NULL;
+
+       /*
+        * We actually only need a single bio_vec.  It would be nice to have
+        * a flag that only allocates the inline bvecs and not the separate
+        * bvec pool.
+        */
+       if (bioset_init(&data->bio_set, 16, offsetof(struct xfs_gc_bio, bio),
+                       BIOSET_NEED_BVECS))
+               goto out_free_data;
+       for (i = 0; i < XFS_ZONE_GC_NR_SCRATCH; i++) {
+               data->scratch[i].folio =
+                       folio_alloc(GFP_KERNEL, get_order(XFS_GC_CHUNK_SIZE));
+               if (!data->scratch[i].folio)
+                       goto out_free_scratch;
+       }
+       spin_lock_init(&data->list_lock);
+       data->mp = mp;
+       return data;
+
+out_free_scratch:
+       while (--i >= 0)
+               folio_put(data->scratch[i].folio);
+       bioset_exit(&data->bio_set);
+out_free_data:
+       kfree(data);
+       return NULL;
+}
+
+static void
+xfs_zone_gc_data_free(
+       struct xfs_zone_gc_data *data)
+{
+       int                     i;
+
+       for (i = 0; i < XFS_ZONE_GC_NR_SCRATCH; i++)
+               folio_put(data->scratch[i].folio);
+       bioset_exit(&data->bio_set);
+       kfree(data);
+}
+
+#define XFS_ZONE_GC_RECS               32
+
+/* iterator, needs to be reinitialized for each victim zone */
+struct xfs_zone_gc_iter {
+       struct xfs_rtgroup              *victim_rtg;
+       unsigned int                    rec_count;
+       unsigned int                    rec_idx;
+       xfs_agblock_t                   next_startblock;
+       struct xfs_rmap_irec            recs[XFS_ZONE_GC_RECS];
+};
+
+static void
+xfs_zone_gc_iter_init(
+       struct xfs_zone_gc_iter *iter,
+       struct xfs_rtgroup      *victim_rtg)
+
+{
+       iter->next_startblock = 0;
+       iter->rec_count = 0;
+       iter->rec_idx = 0;
+       iter->victim_rtg = victim_rtg;
+}
+
+static int
+xfs_zone_gc_query_cb(
+       struct xfs_btree_cur    *cur,
+       const struct xfs_rmap_irec *irec,
+       void                    *private)
+{
+       struct xfs_zone_gc_iter *iter = private;
+
+       ASSERT(!XFS_RMAP_NON_INODE_OWNER(irec->rm_owner));
+       ASSERT(!xfs_is_sb_inum(cur->bc_mp, irec->rm_owner));
+       ASSERT(!(irec->rm_flags & (XFS_RMAP_ATTR_FORK | XFS_RMAP_BMBT_BLOCK)));
+
+       iter->recs[iter->rec_count] = *irec;
+       if (++iter->rec_count == XFS_ZONE_GC_RECS) {
+               iter->next_startblock =
+                       irec->rm_startblock + irec->rm_blockcount;
+               return 1;
+       }
+       return 0;
+}
+
+static int
+xfs_zone_gc_rmap_rec_cmp(
+       const void                      *a,
+       const void                      *b)
+{
+       const struct xfs_rmap_irec      *reca = a;
+       const struct xfs_rmap_irec      *recb = b;
+
+       if (reca->rm_owner < recb->rm_owner)
+               return -1;
+       if (reca->rm_owner > recb->rm_owner)
+               return 1;
+
+       if (reca->rm_offset < recb->rm_offset)
+               return -1;
+       if (reca->rm_offset > recb->rm_offset)
+               return 1;
+
+       return 0;
+}
+
+static int
+xfs_zone_gc_query(
+       struct xfs_mount        *mp,
+       struct xfs_zone_gc_iter *iter)
+{
+       struct xfs_rtgroup      *rtg = iter->victim_rtg;
+       struct xfs_rmap_irec    ri_low = { };
+       struct xfs_rmap_irec    ri_high;
+       struct xfs_btree_cur    *cur;
+       struct xfs_trans        *tp;
+       int                     error;
+
+       ASSERT(iter->next_startblock <= rtg->rtg_extents);
+       if (iter->next_startblock == rtg->rtg_extents)
+               goto done;
+
+       ASSERT(iter->next_startblock < rtg->rtg_extents);
+       ri_low.rm_startblock = iter->next_startblock;
+       memset(&ri_high, 0xFF, sizeof(ri_high));
+
+       iter->rec_idx = 0;
+       iter->rec_count = 0;
+
+       error = xfs_trans_alloc_empty(mp, &tp);
+       if (error)
+               return error;
+
+       xfs_rtgroup_lock(rtg, XFS_RTGLOCK_RMAP);
+       xfs_rtgroup_trans_join(tp, rtg, XFS_RTGLOCK_RMAP);
+       cur = xfs_rtrmapbt_init_cursor(tp, rtg);
+       error = xfs_rmap_query_range(cur, &ri_low, &ri_high,
+                       xfs_zone_gc_query_cb, iter);
+       xfs_btree_del_cursor(cur, error < 0 ? error : 0);
+       xfs_trans_cancel(tp);
+
+       if (error < 0)
+               return error;
+
+       /*
+        * Sort the rmap records by inode number and increasing offset to
+        * defragment the mappings.
+        *
+        * This could be further enhanced by an even bigger look-ahead window,
+        * but that's better left until we have better detection of changes to
+        * the inode mapping to avoid the potential of GCing already dead data.
+        */
+       sort(iter->recs, iter->rec_count, sizeof(iter->recs[0]),
+               xfs_zone_gc_rmap_rec_cmp, NULL);
+
+       if (error == 0) {
+               /*
+                * We finished iterating through the zone.
+                */
+               iter->next_startblock = rtg->rtg_extents;
+               if (iter->rec_count == 0)
+                       goto done;
+       }
+
+       return 0;
+done:
+       xfs_rtgroup_rele(iter->victim_rtg);
+       iter->victim_rtg = NULL;
+       return 0;
+}
+
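+/*
+ * Return the next rmap record to garbage collect and grab a reference to the
+ * inode that owns it, refilling the record cache from the rmap btree as
+ * needed.
+ */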
+static bool
+xfs_zone_gc_iter_next(
+       struct xfs_mount        *mp,
+       struct xfs_zone_gc_iter *iter,
+       struct xfs_rmap_irec    *chunk_rec,
+       struct xfs_inode        **ipp)
+{
+       struct xfs_rmap_irec    *irec;
+       int                     error;
+
+       if (!iter->victim_rtg)
+               return false;
+
+       if (iter->rec_idx == iter->rec_count) {
+retry:
+               error = xfs_zone_gc_query(mp, iter);
+               if (error)
+                       goto fail;
+               if (!iter->victim_rtg)
+                       return false;
+       }
+
+       irec = &iter->recs[iter->rec_idx];
+       error = xfs_iget(mp, NULL, irec->rm_owner, XFS_IGET_NORETRY |
+                       XFS_IGET_UNTRUSTED | XFS_IGET_DONTCACHE, 0, ipp);
+       if (error) {
+               if (error == -EAGAIN || error == -ENOENT) {
+                       iter->next_startblock = irec->rm_startblock;
+                       goto retry;
+               }
+               goto fail;
+       }
+
+       if (!S_ISREG(VFS_I(*ipp)->i_mode)) {
+               iter->next_startblock = irec->rm_startblock;
+               xfs_irele(*ipp);
+               goto retry;
+       }
+
+       *chunk_rec = *irec;
+       return true;
+
+fail:
+       xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
+       return false;
+}
+
+static void
+xfs_zone_gc_iter_advance(
+       struct xfs_zone_gc_iter *iter,
+       xfs_extlen_t            count_fsb)
+{
+       struct xfs_rmap_irec    *irec = &iter->recs[iter->rec_idx];
+
+       irec->rm_offset += count_fsb;
+       irec->rm_startblock += count_fsb;
+       irec->rm_blockcount -= count_fsb;
+       if (!irec->rm_blockcount)
+               iter->rec_idx++;
+}
+
+/*
+ * Iterate through all zones marked as reclaimable and find a candidate that is
+ * either good enough for instant reclaim, or the one with the least used space.
+ */
+static bool
+xfs_zone_reclaim_pick(
+       struct xfs_mount        *mp,
+       struct xfs_zone_gc_iter *iter)
+{
+       struct xfs_rtgroup      *victim_rtg = NULL, *rtg;
+       u64                     victim_used = U64_MAX;
+       unsigned long           index = 0;
+       bool                    easy = false;
+
+       if (xfs_is_shutdown(mp))
+               return false;
+
+       if (iter->victim_rtg)
+               return true;
+
+       /*
+        * Don't start new work if we are asked to stop or park.
+        */
+       if (kthread_should_stop() || kthread_should_park())
+               return false;
+
+       if (!xfs_zoned_need_gc(mp))
+               return false;
+
+       rcu_read_lock();
+       xa_for_each_marked(&mp->m_groups[XG_TYPE_RTG].xa, index, rtg,
+                       XFS_RTG_RECLAIMABLE) {
+               u64 used = *xfs_zone_used_counter(rtg);
+
+               if (used >= victim_used)
+                       continue;
+               if (!atomic_inc_not_zero(&rtg->rtg_group.xg_active_ref))
+                       continue;
+
+               if (victim_rtg)
+                       xfs_rtgroup_rele(victim_rtg);
+               victim_rtg = rtg;
+               victim_used = used;
+
+               /*
+                * Any zone that is less than 1 percent used is fair game for
+                * instant reclaim.
+                */
+               if (used < div_u64(rtg->rtg_extents, 100)) {
+                       easy = true;
+                       break;
+               }
+       }
+       rcu_read_unlock();
+
+       if (!victim_rtg)
+               return false;
+
+       xfs_info(mp, "reclaiming zone %d, used = %lld/%llu (%s)",
+               rtg_rgno(victim_rtg), victim_used,
+               victim_rtg->rtg_extents,
+               easy ? "easy" : "best");
+       trace_xfs_zone_reclaim(victim_rtg);
+       xfs_zone_gc_iter_init(iter, victim_rtg);
+       return true;
+}
+
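+/*
+ * Return the zone that GC writes into, or NULL if the current GC zone is
+ * fully allocated but still has writes in flight.  A fully written GC zone
+ * is released and replaced by a new one from the free list.
+ */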
+static struct xfs_rtgroup *
+xfs_select_gc_zone(
+       struct xfs_mount        *mp)
+{
+       struct xfs_rtgroup      *rtg = mp->m_open_gc_zone;
+
+       if (rtg && rtg->rtg_write_pointer == rtg->rtg_extents) {
+               /*
+                * We need to wait for pending writes to finish.
+                */
+               if (rtg->rtg_written < rtg->rtg_extents)
+                       return NULL;
+               xfs_rtgroup_rele(rtg);
+               rtg = NULL;
+       }
+
+       if (!rtg) {
+               spin_lock(&mp->m_zone_list_lock);
+               rtg = xfs_find_free_zone(mp);
+               spin_unlock(&mp->m_zone_list_lock);
+
+               if (rtg)
+                       trace_xfs_gc_zone_activate(rtg);
+               mp->m_open_gc_zone = rtg;
+       }
+
+       return rtg;
+}
+
+static unsigned int
+xfs_zone_gc_scratch_available(
+       struct xfs_zone_gc_data *data)
+{
+       return XFS_GC_CHUNK_SIZE - data->scratch[data->scratch_idx].offset;
+}
+
+static bool
+xfs_zone_gc_space_available(
+       struct xfs_zone_gc_data *data)
+{
+       struct xfs_rtgroup      *rtg;
+
+       rtg = xfs_select_gc_zone(data->mp);
+       if (!rtg)
+               return false;
+       return rtg->rtg_write_pointer < rtg->rtg_extents &&
+               xfs_zone_gc_scratch_available(data);
+}
+
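+/*
+ * I/O completion for GC reads and writes:  queue the bio for the GC thread
+ * and wake it up.
+ */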
+static void
+xfs_zone_gc_end_io(
+       struct bio              *bio)
+{
+       struct xfs_zone_gc_data *data = bio->bi_private;
+       unsigned long           flags;
+
+       spin_lock_irqsave(&data->list_lock, flags);
+       if (bio_op(bio) == REQ_OP_READ)
+               bio_list_add(&data->read_done, bio);
+       else
+               bio_list_add(&data->write_done, bio);
+       wake_up_process(data->mp->m_zone_gc_thread);
+       spin_unlock_irqrestore(&data->list_lock, flags);
+}
+
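+/*
+ * Allocate space in the GC zone for relocating up to *count_fsb blocks,
+ * limited by the available scratch space and the reserved block pools, and
+ * advance the GC zone's write pointer accordingly.
+ */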
+static bool
+xfs_zone_gc_allocate(
+       struct xfs_zone_gc_data *data,
+       xfs_extlen_t            *count_fsb,
+       xfs_daddr_t             *daddr,
+       bool                    *is_seq)
+{
+       struct xfs_mount        *mp = data->mp;
+       xfs_rtxnum_t            rtxres, rtxlen;
+       xfs_rgblock_t           rgbno = 0;
+       struct xfs_rtgroup      *rtg;
+
+       rtg = xfs_select_gc_zone(mp);
+       if (!rtg)
+               return false;
+
+       *count_fsb = min(*count_fsb,
+               XFS_B_TO_FSB(mp, xfs_zone_gc_scratch_available(data)));
+
+       /*
+        * Directly allocate GC blocks from the reserved pool.
+        *
+        * If we'd take them from the normal pool we could be stealing blocks
+        * from a regular writer, which would then have to wait for GC and
+        * deadlock.
+        */
+       spin_lock(&mp->m_sb_lock);
+       rtxres = min(mp->m_resblks[FREE_RTEXTENTS].avail,
+                    mp->m_resblks[FREE_RTAVAILABLE].avail);
+       rtxlen = min3(rtxres,
+                     rtg->rtg_extents - rtg->rtg_write_pointer,
+                     xfs_extlen_to_rtxlen(mp, *count_fsb));
+       mp->m_resblks[FREE_RTEXTENTS].avail -= rtxlen;
+       mp->m_resblks[FREE_RTAVAILABLE].avail -= rtxlen;
+       spin_unlock(&mp->m_sb_lock);
+
+       if (!rtxlen)
+               return false;
+       *count_fsb = xfs_rtxlen_to_extlen(mp, rtxlen);
+       *is_seq = test_bit(RTG_F_SEQUENTIAL, &rtg->rtg_flags);
+       if (!*is_seq)
+               rgbno = rtg->rtg_write_pointer;
+       rtg->rtg_write_pointer += *count_fsb;
+       *daddr = xfs_gbno_to_daddr(&rtg->rtg_group, rgbno);
+       return true;
+}
+
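+/*
+ * Start moving the next chunk:  look up the victim mapping, grab the owning
+ * inode, allocate new blocks in the GC zone and read the old data into the
+ * scratch folio.
+ */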
+static bool
+xfs_zone_gc_start_chunk(
+       struct xfs_zone_gc_data *data,
+       struct xfs_zone_gc_iter *iter)
+{
+       struct xfs_mount        *mp = data->mp;
+       struct block_device     *bdev = mp->m_rtdev_targp->bt_bdev;
+       struct xfs_rmap_irec    irec;
+       struct xfs_gc_bio       *chunk;
+       struct xfs_inode        *ip;
+       struct bio              *bio;
+       xfs_daddr_t             daddr;
+       bool                    is_seq;
+
+       if (xfs_is_shutdown(mp))
+               return false;
+
+       if (!xfs_zone_gc_iter_next(mp, iter, &irec, &ip))
+               return false;
+       if (!xfs_zone_gc_allocate(data, &irec.rm_blockcount, &daddr, &is_seq)) {
+               xfs_irele(ip);
+               return false;
+       }
+
+       bio = bio_alloc_bioset(bdev, 1, REQ_OP_READ, GFP_NOFS, &data->bio_set);
+
+       chunk = container_of(bio, struct xfs_gc_bio, bio);
+       chunk->ip = ip;
+       chunk->offset = XFS_FSB_TO_B(mp, irec.rm_offset);
+       chunk->len = XFS_FSB_TO_B(mp, irec.rm_blockcount);
+       chunk->old_startblock =
+               xfs_rgbno_to_rtb(iter->victim_rtg, irec.rm_startblock);
+       chunk->new_daddr = daddr;
+       chunk->is_seq = is_seq;
+       chunk->scratch = &data->scratch[data->scratch_idx];
+
+       bio->bi_iter.bi_sector = xfs_rtb_to_daddr(mp, chunk->old_startblock);
+       bio->bi_end_io = xfs_zone_gc_end_io;
+       bio->bi_private = data;
+       bio_add_folio_nofail(bio, chunk->scratch->folio, chunk->len,
+                       chunk->scratch->offset);
+       chunk->scratch->offset += chunk->len;
+       if (chunk->scratch->offset == XFS_GC_CHUNK_SIZE) {
+               data->scratch_idx =
+                       (data->scratch_idx + 1) % XFS_ZONE_GC_NR_SCRATCH;
+       }
+       data->inflight++;
+       xfs_zone_gc_iter_advance(iter, irec.rm_blockcount);
+
+       submit_bio(bio);
+       return true;
+}
+
+static void
+xfs_zone_gc_free_chunk(
+       struct xfs_zone_gc_data *data,
+       struct xfs_gc_bio       *chunk)
+{
+       data->inflight--;
+       xfs_irele(chunk->ip);
+       bio_put(&chunk->bio);
+}
+
+static void
+xfs_gc_submit_write(
+       struct xfs_zone_gc_data *data,
+       struct xfs_gc_bio       *chunk)
+{
+       if (chunk->is_seq) {
+               chunk->bio.bi_opf &= ~REQ_OP_WRITE;
+               chunk->bio.bi_opf |= REQ_OP_ZONE_APPEND;
+       }
+       chunk->bio.bi_iter.bi_sector = chunk->new_daddr;
+       chunk->bio.bi_end_io = xfs_zone_gc_end_io;
+       chunk->bio.bi_private = data;
+       submit_bio(&chunk->bio);
+}
+
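+/*
+ * Zone append bios are limited by the device's maximum zone append size, so
+ * split the relocation write into appropriately sized chunks.
+ */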
+static struct xfs_gc_bio *
+xfs_gc_split_write(
+       struct xfs_zone_gc_data *data,
+       struct xfs_gc_bio       *chunk)
+{
+       struct queue_limits     *lim =
+               &bdev_get_queue(chunk->bio.bi_bdev)->limits;
+       struct xfs_gc_bio       *split_chunk;
+       int                     split_sectors;
+       unsigned int            split_len;
+       struct bio              *split;
+       unsigned int            nsegs;
+
+       if (!chunk->is_seq)
+               return NULL;
+
+       split_sectors = bio_split_rw_at(&chunk->bio, lim, &nsegs,
+               queue_limits_max_zone_append_sectors(lim) << SECTOR_SHIFT);
+       if (!split_sectors)
+               return NULL;
+       split_len = split_sectors << SECTOR_SHIFT;
+
+       split = bio_split(&chunk->bio, split_sectors, GFP_NOFS, &data->bio_set);
+       split_chunk = container_of(split, struct xfs_gc_bio, bio);
+       ihold(VFS_I(chunk->ip));
+       split_chunk->ip = chunk->ip;
+       split_chunk->is_seq = chunk->is_seq;
+       split_chunk->scratch = chunk->scratch;
+       split_chunk->offset = chunk->offset;
+       split_chunk->len = split_len;
+       split_chunk->old_startblock = chunk->old_startblock;
+       split_chunk->new_daddr = chunk->new_daddr;
+
+       chunk->offset += split_len;
+       chunk->len -= split_len;
+       chunk->old_startblock += XFS_B_TO_FSB(data->mp, split_len);
+
+       data->inflight++;
+       return split_chunk;
+}
+
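+/*
+ * A GC read has completed:  reuse the bio to write the data out to its new
+ * location, splitting it up for the zone append limits as needed.
+ */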
+static void
+xfs_zone_gc_write_chunk(
+       struct xfs_zone_gc_data *data,
+       struct bio              *bio)
+{
+       struct xfs_gc_bio       *chunk =
+               container_of(bio, struct xfs_gc_bio, bio);
+       struct xfs_mount        *mp = chunk->ip->i_mount;
+       unsigned int            folio_offset = bio->bi_io_vec->bv_offset;
+       struct xfs_gc_bio       *split_chunk;
+
+       if (bio->bi_status)
+               xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
+       if (xfs_is_shutdown(mp)) {
+               xfs_zone_gc_free_chunk(data, chunk);
+               return;
+       }
+
+       bio_reset(bio, mp->m_rtdev_targp->bt_bdev, REQ_OP_WRITE);
+       bio_add_folio_nofail(bio, chunk->scratch->folio, chunk->len,
+                       folio_offset);
+
+       while ((split_chunk = xfs_gc_split_write(data, chunk)))
+               xfs_gc_submit_write(data, split_chunk);
+       xfs_gc_submit_write(data, chunk);
+}
+
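+/*
+ * A GC write has completed:  wait out concurrent direct I/O and layouts,
+ * then remap the file extent from the old to the new location.
+ */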
+static void
+xfs_zone_gc_finish_chunk(
+       struct xfs_zone_gc_data *data,
+       struct bio              *bio)
+{
+       struct xfs_gc_bio       *chunk =
+               container_of(bio, struct xfs_gc_bio, bio);
+       uint                    iolock = XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL;
+       struct xfs_inode        *ip = chunk->ip;
+       struct xfs_mount        *mp = ip->i_mount;
+       int                     error;
+
+       if (bio->bi_status)
+               xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
+       if (xfs_is_shutdown(mp)) {
+               xfs_zone_gc_free_chunk(data, chunk);
+               return;
+       }
+
+       chunk->scratch->freed += chunk->len;
+       if (chunk->scratch->freed == chunk->scratch->offset) {
+               chunk->scratch->offset = 0;
+               chunk->scratch->freed = 0;
+       }
+
+       /*
+        * Cycle through the iolock and wait for direct I/O and layouts to
+        * ensure no one is reading from the old mapping before it goes away.
+        */
+       xfs_ilock(ip, iolock);
+       error = xfs_break_layouts(VFS_I(ip), &iolock, BREAK_UNMAP);
+       if (!error)
+               inode_dio_wait(VFS_I(ip));
+       xfs_iunlock(ip, iolock);
+       if (error)
+               goto free;
+
+       if (chunk->is_seq)
+               chunk->new_daddr = bio->bi_iter.bi_sector;
+       error = xfs_zoned_end_io(ip, chunk->offset, chunk->len,
+                       chunk->new_daddr, chunk->old_startblock);
+free:
+       if (error)
+               xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
+       xfs_zone_gc_free_chunk(data, chunk);
+}
+
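+/*
+ * A zone reset has completed:  the zone is free again, so put it back onto
+ * the free list and make its blocks available for new reservations.
+ */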
+static void
+xfs_zone_gc_finish_reset(
+       struct xfs_rtgroup      *rtg,
+       struct bio              *bio)
+{
+       struct xfs_mount        *mp = rtg_mount(rtg);
+
+       if (bio->bi_status) {
+               xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
+               goto out;
+       }
+
+       spin_lock(&mp->m_zone_list_lock);
+       list_add_tail(&rtg->rtg_entry, &mp->m_free_zones);
+       atomic_inc(&mp->m_nr_free_zones);
+       spin_unlock(&mp->m_zone_list_lock);
+
+       xfs_zoned_add_available(mp, rtg->rtg_extents);
+
+       wake_up_all(&mp->m_zone_wait);
+out:
+       bio_put(bio);
+}
+
+static void
+xfs_zone_reset_end_io(
+       struct bio              *bio)
+{
+       struct xfs_zone_gc_data *data =
+               container_of(bio, struct xfs_gc_bio, bio)->data;
+       struct xfs_rtgroup      *rtg = bio->bi_private;
+       unsigned long           flags;
+
+       spin_lock_irqsave(&data->list_lock, flags);
+       bio_list_add(&data->reset_done, bio);
+       data->inflight--;
+       wake_up_process(rtg_mount(rtg)->m_zone_gc_thread);
+       spin_unlock_irqrestore(&data->list_lock, flags);
+}
+
+static struct bio *
+xfs_prepare_zone_reset(
+       struct xfs_rtgroup      *rtg,
+       struct xfs_zone_gc_data *data)
+{
+       struct xfs_mount        *mp = rtg_mount(rtg);
+       struct block_device     *bdev = mp->m_rtdev_targp->bt_bdev;
+       struct bio              *bio;
+
+       spin_lock(&rtg->rtg_alloc_lock);
+       rtg->rtg_write_pointer = 0;
+       spin_unlock(&rtg->rtg_alloc_lock);
+
+       xfs_ilock(rtg->rtg_inodes[XFS_RTGI_RMAP], XFS_ILOCK_EXCL);
+       ASSERT(*xfs_zone_used_counter(rtg) == 0);
+       rtg->rtg_written = 0;
+       *xfs_zone_last_written(rtg) = 0;
+       xfs_iunlock(rtg->rtg_inodes[XFS_RTGI_RMAP], XFS_ILOCK_EXCL);
+
+       trace_xfs_zone_reset(rtg);
+
+       bio = bio_alloc_bioset(bdev, 0, REQ_OP_ZONE_RESET, GFP_NOFS,
+                       data ? &data->bio_set : &fs_bio_set);
+       bio->bi_iter.bi_sector = xfs_gbno_to_daddr(&rtg->rtg_group, 0);
+       if (!test_bit(RTG_F_SEQUENTIAL, &rtg->rtg_flags)) {
+               bio->bi_opf = REQ_OP_DISCARD | REQ_SYNC;
+               bio->bi_iter.bi_size = XFS_FSB_TO_B(mp, rtg->rtg_extents);
+       }
+       return bio;
+}
+
+static void
+xfs_reset_empty_zones(
+       struct xfs_zone_gc_data *data,
+       struct list_head        *empty_zones)
+{
+       struct xfs_rtgroup      *rtg;
+       struct bio              *bio;
+
+       if (blkdev_issue_flush(data->mp->m_rtdev_targp->bt_bdev) < 0) {
+               xfs_force_shutdown(data->mp, SHUTDOWN_META_IO_ERROR);
+               return;
+       }
+
+       while ((rtg = list_first_entry_or_null(empty_zones,
+                       struct xfs_rtgroup, rtg_entry))) {
+               list_del_init(&rtg->rtg_entry);
+
+               xfs_log_force_inode(rtg->rtg_inodes[XFS_RTGI_RMAP]);
+
+               bio = xfs_prepare_zone_reset(rtg, data);
+               bio->bi_private = rtg;
+               bio->bi_end_io = xfs_zone_reset_end_io;
+               data->inflight++;
+               container_of(bio, struct xfs_gc_bio, bio)->data = data;
+               submit_bio(bio);
+       }
+}
+
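+/*
+ * One pass of the GC worker:  process completed resets, reads and writes,
+ * reset zones that have been fully emptied, and start relocating new chunks
+ * while a victim zone and space in the GC zone are available.
+ */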
+static bool
+xfs_zone_gc_handle_work(
+       struct xfs_zone_gc_data *data,
+       struct xfs_zone_gc_iter *iter)
+{
+       struct bio_list         read_done = BIO_EMPTY_LIST;
+       struct bio_list         write_done = BIO_EMPTY_LIST;
+       struct bio_list         reset_done = BIO_EMPTY_LIST;
+       LIST_HEAD               (empty_zones);
+       struct blk_plug         plug;
+       struct bio              *bio;
+
+       spin_lock_irq(&data->list_lock);
+       bio_list_merge_init(&read_done, &data->read_done);
+       bio_list_merge_init(&write_done, &data->write_done);
+       bio_list_merge_init(&reset_done, &data->reset_done);
+       spin_unlock_irq(&data->list_lock);
+
+       spin_lock(&data->mp->m_zone_list_lock);
+       list_splice_init(&data->mp->m_emptied_zones, &empty_zones);
+       spin_unlock(&data->mp->m_zone_list_lock);
+
+       if (!xfs_zone_reclaim_pick(data->mp, iter) ||
+           !xfs_zone_gc_space_available(data)) {
+               if (bio_list_empty(&read_done) &&
+                   bio_list_empty(&write_done) &&
+                   bio_list_empty(&reset_done) &&
+                   list_empty(&empty_zones))
+                       return false;
+       }
+
+       __set_current_state(TASK_RUNNING);
+       try_to_freeze();
+
+       while ((bio = bio_list_pop(&reset_done)))
+               xfs_zone_gc_finish_reset(bio->bi_private, bio);
+
+       if (!list_empty(&empty_zones))
+               xfs_reset_empty_zones(data, &empty_zones);
+
+       blk_start_plug(&plug);
+       while ((bio = bio_list_pop(&read_done)))
+               xfs_zone_gc_write_chunk(data, bio);
+       blk_finish_plug(&plug);
+
+       while ((bio = bio_list_pop(&write_done)))
+               xfs_zone_gc_finish_chunk(data, bio);
+
+       blk_start_plug(&plug);
+       while (xfs_zone_gc_start_chunk(data, iter))
+               ;
+       blk_finish_plug(&plug);
+       return true;
+}
+
+/*
+ * XXX: This breaks reflinks and thus duplicates data that was shared by
+ * multiple owners before.
+ */
+static int
+xfs_zoned_gcd(
+       void                    *private)
+{
+       struct xfs_mount        *mp = private;
+       unsigned int            nofs_flag;
+       struct xfs_zone_gc_data *data;
+       struct xfs_zone_gc_iter *iter;
+
+       data = xfs_zone_gc_data_alloc(mp);
+       if (!data)
+               return -ENOMEM;
+       iter = kzalloc(sizeof(*iter), GFP_KERNEL);
+       if (!iter) {
+               xfs_zone_gc_data_free(data);
+               return -ENOMEM;
+       }
+
+       nofs_flag = memalloc_nofs_save();
+       set_freezable();
+
+       for (;;) {
+               set_current_state(TASK_INTERRUPTIBLE | TASK_FREEZABLE);
+               xfs_set_in_gc(mp);
+               if (xfs_zone_gc_handle_work(data, iter))
+                       continue;
+
+               if (!data->inflight) {
+                       xfs_clear_in_gc(mp);
+                       xfs_zoned_wake_all(mp);
+
+                       if (kthread_should_stop()) {
+                               __set_current_state(TASK_RUNNING);
+                               break;
+                       }
+
+                       if (kthread_should_park()) {
+                               __set_current_state(TASK_RUNNING);
+                               kthread_parkme();
+                               continue;
+                       }
+               }
+
+               schedule();
+       }
+       xfs_clear_in_gc(mp);
+
+       if (iter->victim_rtg)
+               xfs_rtgroup_rele(iter->victim_rtg);
+       if (mp->m_open_gc_zone)
+               xfs_rtgroup_rele(mp->m_open_gc_zone);
+
+       memalloc_nofs_restore(nofs_flag);
+       kfree(iter);
+       xfs_zone_gc_data_free(data);
+       return 0;
+}
+
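+/*
+ * Pick the open zone with the lowest write pointer as the GC zone when no
+ * free zone is left at mount time.
+ */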
+static struct xfs_rtgroup *
+xfs_pick_open_zone_for_gc(
+       struct xfs_mount        *mp)
+{
+       struct xfs_rtgroup      *rtg, *found = NULL;
+
+       list_for_each_entry(rtg, &mp->m_open_zones, rtg_entry) {
+               if (!found)
+                       found = rtg;
+               else if (rtg->rtg_write_pointer < found->rtg_write_pointer)
+                       found = rtg;
+       }
+
+       return found;
+}
+
+void
+xfs_zone_gc_start(
+       struct xfs_mount        *mp)
+{
+       if (xfs_has_zoned(mp))
+               kthread_unpark(mp->m_zone_gc_thread);
+}
+
+void
+xfs_zone_gc_stop(
+       struct xfs_mount        *mp)
+{
+       if (xfs_has_zoned(mp))
+               kthread_park(mp->m_zone_gc_thread);
+}
+
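+/*
+ * Callback for blkdev_report_zones:  match each hardware zone to its
+ * realtime group and validate the in-memory state against it.
+ */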
+static int
+xfs_get_zone_info_cb(
+       struct blk_zone         *zone,
+       unsigned int            idx,
+       void                    *data)
+{
+       struct xfs_mount        *mp = data;
+       xfs_fsblock_t           zsbno = xfs_daddr_to_rtb(mp, zone->start);
+       xfs_rgnumber_t          rgno;
+       struct xfs_rtgroup      *rtg;
+       int                     error;
+
+       if (xfs_rtb_to_rgbno(mp, zsbno) != 0) {
+               xfs_warn(mp, "mismatched zone start 0x%llx.", zsbno);
+               return -EFSCORRUPTED;
+       }
+
+       rgno = xfs_rtb_to_rgno(mp, zsbno);
+       rtg = xfs_rtgroup_get(mp, rgno);
+       if (!rtg) {
+               xfs_warn(mp, "realtime group not found for zone %u.", rgno);
+               return -EFSCORRUPTED;
+       }
+       error = xfs_zone_validate(zone, rtg);
+       xfs_rtgroup_put(rtg);
+       return error;
+}
+
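+/*
+ * Set up the in-core state for a zone at mount time and classify it as
+ * free, open or reclaimable.
+ */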
+static int
+xfs_init_zone(
+       struct xfs_rtgroup      *rtg,
+       uint64_t                *available,
+       uint64_t                *freedblocks)
+{
+       struct xfs_mount        *mp = rtg_mount(rtg);
+       uint64_t                used = *xfs_zone_used_counter(rtg);
+
+       if (rtg->rtg_write_pointer == rtg->rtg_extents && used == 0) {
+               struct bio      *bio;
+               int             error;
+
+               bio = xfs_prepare_zone_reset(rtg, NULL);
+               error = submit_bio_wait(bio);
+               bio_put(bio);
+               if (error)
+                       return error;
+       } else {
+               /*
+                * For sequential write required zones, xfs_get_zone_info_cb
+                * initializes rtg_write_pointer to the hardware write pointer.
+                *
+                * For conventional zones we initialize it to the last recorded
+                * write, as we don't know what actually got written, only what
+                * we were able to record in the I/O completion handler.
+                */
+               if (!test_bit(RTG_F_SEQUENTIAL, &rtg->rtg_flags))
+                       rtg->rtg_write_pointer = *xfs_zone_last_written(rtg);
+
+               /*
+                * There can't be any I/O in flight we need to care about at
+                * mount time, so treat the write pointer as the completed
+                * write counter.
+                */
+               rtg->rtg_written = rtg->rtg_write_pointer;
+       }
+
+       if (rtg->rtg_write_pointer == 0) {
+               /* zone is free */
+               list_add_tail(&rtg->rtg_entry, &mp->m_free_zones);
+               atomic_inc(&mp->m_nr_free_zones);
+               *available += rtg->rtg_extents;
+       } else if (rtg->rtg_write_pointer < rtg->rtg_extents) {
+               /* zone is open */
+               list_add(&rtg->rtg_entry, &mp->m_open_zones);
+               mp->m_nr_open_zones++;
+               set_bit(RTG_F_OPEN, &rtg->rtg_flags);
+               *available += (rtg->rtg_extents - rtg->rtg_write_pointer);
+               *freedblocks += (rtg->rtg_write_pointer) - used;
+       } else if (used < rtg->rtg_extents) {
+               /* zone fully written, but has freed blocks */
+               xfs_group_set_mark(&rtg->rtg_group, XFS_RTG_RECLAIMABLE);
+               *freedblocks += (rtg->rtg_extents - used);
+       }
+
+       return 0;
+}
+
+/*
+ * Calculate the max open zone limit based on the number of backing zones
+ * available.
+ */
+static inline uint32_t
+xfs_max_open_zones(
+       struct xfs_mount        *mp)
+{
+       unsigned int            max_open, max_open_data_zones;
+       /*
+        * We need two zones for every open data zone:  the open zone itself
+        * and one in reserve, as we don't reclaim open zones.  One data zone
+        * and its spare are included in XFS_MIN_ZONES.
+        */
+       max_open_data_zones = (mp->m_sb.sb_rgcount - XFS_MIN_ZONES) / 2 + 1;
+       max_open = max_open_data_zones + XFS_OPEN_GC_ZONES;
+
+       /*
+        * Cap the max open limit to 1/4 of available space
+        */
+       max_open = min(max_open, mp->m_sb.sb_rgcount / 4);
+
+       return max(XFS_MIN_OPEN_ZONES, max_open);
+}
+
+int
+xfs_mount_zones(
+       struct xfs_mount        *mp)
+{
+       struct xfs_buftarg      *bt = mp->m_rtdev_targp;
+       unsigned int            bdev_open_zones;
+       int64_t                 available = 0, freedblocks = 0;
+       struct xfs_rtgroup      *rtg = NULL;
+       int                     error;
+
+       if (!bt) {
+               xfs_notice(mp, "RT device missing.");
+               return -EINVAL;
+       }
+
+       if (!xfs_has_rtgroups(mp) || !xfs_has_rmapbt(mp)) {
+               xfs_notice(mp, "invalid flag combination.");
+               return -EFSCORRUPTED;
+       }
+       if (mp->m_sb.sb_rextsize != 1) {
+               xfs_notice(mp, "zoned file systems do not support rextsize.");
+               return -EFSCORRUPTED;
+       }
+       if (mp->m_sb.sb_rgcount < XFS_MIN_ZONES) {
+               xfs_notice(mp,
+"zoned file systems need to have at least %d zones.", XFS_MIN_ZONES);
+               return -EFSCORRUPTED;
+       }
+
+       /*
+        * Normally we pick the open zone limit that the device reports.  If
+        * there isn't one, let the user pick one from the command line.
+        *
+        * If the device doesn't report an open zone limit and there is no
+        * override, allow holding about half of the zones open.  In theory we
+        * could allow all of them to be open, but at that point we run into GC
+        * deadlocks because we (at least currently) can't reclaim open zones.
+        *
+        * When used on conventional SSDs a lower open limit is advisable as
+        * we'll otherwise overwhelm the FTL just as much as a conventional
+        * block allocator would.
+        *
+        * Note: To debug the open zone management code, force max_open to
+        * 1 here.
+        */
+       bdev_open_zones = bdev_max_open_zones(bt->bt_bdev);
+       if (bdev_open_zones && !mp->m_max_open_zones)
+               mp->m_max_open_zones = bdev_open_zones;
+       if (mp->m_max_open_zones) {
+               if (mp->m_max_open_zones < XFS_MIN_OPEN_ZONES) {
+                       xfs_notice(mp, "need at least %d open zones.",
+                               XFS_MIN_OPEN_ZONES);
+                       return -EIO;
+               }
+               if (bdev_open_zones && bdev_open_zones < mp->m_max_open_zones) {
+                       xfs_warn(mp, "device only supports %u open zones.",
+                               bdev_open_zones);
+                       mp->m_max_open_zones = bdev_open_zones;
+               }
+               if (mp->m_max_open_zones > xfs_max_open_zones(mp)) {
+                       mp->m_max_open_zones = xfs_max_open_zones(mp);
+                       xfs_info(mp,
+"limiting open zones to %u due to total zone count (%u)",
+                               mp->m_max_open_zones, mp->m_sb.sb_rgcount);
+               }
+       } else {
+               mp->m_max_open_zones = xfs_max_open_zones(mp);
+       }
+
+       INIT_LIST_HEAD(&mp->m_free_zones);
+       INIT_LIST_HEAD(&mp->m_open_zones);
+       INIT_LIST_HEAD(&mp->m_emptied_zones);
+       INIT_LIST_HEAD(&mp->m_reclaim_reservations);
+       spin_lock_init(&mp->m_zone_list_lock);
+       spin_lock_init(&mp->m_reservation_lock);
+       init_waitqueue_head(&mp->m_zone_wait);
+
+       xfs_info(mp, "%u zones of %u blocks each (%d max open)",
+                mp->m_sb.sb_rgcount, mp->m_groups[XG_TYPE_RTG].blocks,
+                mp->m_max_open_zones);
+
+       /*
+        * Sync our own information with the hardware zone state.
+        */
+       if (bdev_is_zoned(bt->bt_bdev)) {
+               if (!IS_ENABLED(CONFIG_BLK_DEV_ZONED)) {
+                       xfs_warn(mp,
+"zoned device support requires CONFIG_BLK_DEV_ZONED");
+                       return -EINVAL;
+               }
+               error = blkdev_report_zones(bt->bt_bdev, 0, mp->m_sb.sb_rgcount,
+                                           xfs_get_zone_info_cb, mp);
+               if (error < 0)
+                       return error;
+       }
+
+       mp->m_zone_gc_thread = kthread_create(xfs_zoned_gcd, mp,
+                               "xfs-zone-gc/%s",
+                               mp->m_super->s_id);
+       if (IS_ERR(mp->m_zone_gc_thread)) {
+               xfs_warn(mp, "unable to create zone gc thread");
+               return PTR_ERR(mp->m_zone_gc_thread);
+       }
+       /* xfs_zone_gc_start will unpark for rw mounts */
+       kthread_park(mp->m_zone_gc_thread);
+
+       while ((rtg = xfs_rtgroup_next(mp, rtg))) {
+               error = xfs_init_zone(rtg, &available, &freedblocks);
+               if (error)
+                       goto out_unlink_zones;
+       }
+
+       /*
+        * XXX: convert to rtxlen.  Or just give up on the conversion because
+        * we have a 1:1 mapping.
+        */
+       percpu_counter_set(&mp->m_free[FREE_RTAVAILABLE], available);
+       percpu_counter_set(&mp->m_free[FREE_RTEXTENTS],
+                       available + freedblocks);
+
+       /*
+        * If there are no free zones available for GC, pick the open zone with
+        * the least used space to GC into.
+        */
+       if (list_empty(&mp->m_free_zones)) {
+               rtg = xfs_pick_open_zone_for_gc(mp);
+               if (!rtg) {
+                       error = -EINVAL;
+                       goto out_unlink_zones;
+               }
+               list_del_init(&rtg->rtg_entry);
+               mp->m_nr_open_zones--;
+               clear_bit(RTG_F_OPEN, &rtg->rtg_flags);
+               mp->m_open_gc_zone = rtg;
+       }
+       return 0;
+
+out_unlink_zones:
+       rtg = NULL;
+       while ((rtg = xfs_rtgroup_next(mp, rtg)))
+               list_del_init(&rtg->rtg_entry);
+       return error;
+}
+
+void
+xfs_unmount_zones(
+       struct xfs_mount        *mp)
+{
+       struct xfs_rtgroup      *rtg = NULL;
+
+       kthread_stop(mp->m_zone_gc_thread);
+       while ((rtg = xfs_rtgroup_next(mp, rtg)))
+               list_del_init(&rtg->rtg_entry);
+}