xfs: support zoned RT devices
author     Christoph Hellwig <hch@lst.de>
           Tue, 5 Nov 2024 08:27:11 +0000 (09:27 +0100)
committer  Christoph Hellwig <hch@lst.de>
           Tue, 5 Nov 2024 08:29:53 +0000 (09:29 +0100)
WARNING: this is early prototype code.

The zoned allocator works by handing out data blocks to the direct or
buffered write code at the place where XFS currently does block
allocations.  It does not actually insert them into the bmap extent tree
at this time, but only after I/O completion, when we know the block number.
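
As a rough sketch of that two-phase flow (the wrapper functions here are
illustrative only and not part of the patch; xfs_zone_alloc_and_submit()
and xfs_zoned_end_io() are the helpers added below):

  /* Submission side: the zone allocator picks the target zone and
   * submits the bio; no bmap tree update happens at this point. */
  static void zoned_submit_sketch(struct iomap_ioend *ioend,
  		struct xfs_rtgroup **cached_rtg)
  {
  	xfs_zone_alloc_and_submit(ioend, cached_rtg);
  }

  /* Completion side, run from the ioend workqueue: only now do we know
   * the physical location and insert the mapping into the bmap tree. */
  static int zoned_complete_sketch(struct iomap_ioend *ioend)
  {
  	struct xfs_inode	*ip = XFS_I(ioend->io_inode);

  	return xfs_zoned_end_io(ip, ioend->io_offset, ioend->io_size,
  			ioend->io_sector, NULLFSBLOCK);
  }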

The zoned allocator works on any kind of device, including conventional
devices or conventional zones, by using a crude write pointer emulation.
For zoned devices, active zone management is fully supported, as is
zone capacity < zone size.
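
A minimal sketch of what that emulation boils down to, using the
rtg_write_pointer and rtg_alloc_lock fields added to struct xfs_rtgroup
below (end-of-zone checks and open zone management are omitted; this is
not the actual allocator code):

  static xfs_rgblock_t zone_wp_emulation_sketch(struct xfs_rtgroup *rtg,
  		xfs_extlen_t count_fsb)
  {
  	xfs_rgblock_t	rgbno;

  	spin_lock(&rtg->rtg_alloc_lock);
  	rgbno = rtg->rtg_write_pointer;		/* next sequential block */
  	rtg->rtg_write_pointer += count_fsb;	/* advance like hardware would */
  	spin_unlock(&rtg->rtg_alloc_lock);
  	return rgbno;
  }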

The two major limitations are:

 - there is no support for unwritten extents and thus persistent
   file preallocations from fallocate().  This is inherent to an
   always out-of-place write scheme, as there is no way to persistently
   preallocate blocks for an indefinite number of overwrites.
 - because the metadata blocks and data blocks are on different
   devices you can run out of space for metadata while having plenty
   of space for data and vice versa.  This is inherent to a scheme
   where we use different devices or pools for each.

For zoned file systems we reserve the free extents before taking the
iolock, so that any garbage collection we might have to force runs before
the iolock is held.  This is needed because GC has to take the iolock
after it has moved data to a new place, and it could otherwise deadlock.
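
Condensed from xfs_file_buffered_write_zoned() in the diff below, the
resulting ordering looks roughly like this (write checks, the iter
truncation and the ENOSPC retry are left out):

  static ssize_t zoned_write_order_sketch(struct kiocb *iocb,
  		struct iov_iter *from)
  {
  	struct xfs_inode	*ip = XFS_I(iocb->ki_filp->f_mapping->host);
  	struct xfs_zone_alloc_ctx ac;
  	unsigned int		iolock = XFS_IOLOCK_EXCL;
  	ssize_t			ret;

  	/* reserve space, and force GC if needed, before taking the iolock */
  	ret = xfs_zoned_write_space_reserve(ip, iocb, from, XFS_ZR_GREEDY, &ac);
  	if (ret < 0)
  		return ret;
  	ret = xfs_ilock_iocb(iocb, iolock);
  	if (!ret) {
  		ret = iomap_file_buffered_write(iocb, from,
  				&xfs_buffered_write_iomap_ops, &ac);
  		xfs_iunlock(ip, iolock);
  	}
  	xfs_zoned_space_unreserve(ip, &ac);	/* hand back the unused part */
  	return ret;
  }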

This unfortunately has to exclude block zeroing, as for truncate we are
called with the iolock (aka i_rwsem) already held.  Because zeroing only
ever touches a single block at a time, or at most two total per syscall
in the free_file_range case, we deal with that by just stealing the
block, but failing the allocation if we'd have to wait for GC.
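
For the fallocate()-based paths the blocks needed for that zeroing are
instead reserved up front, dipping into the reserved pool so that hole
punching keeps working on a full file system (excerpt from the
xfs_file_fallocate() hunk below):

  if (xfs_is_zoned_inode(ip) &&
      (mode & (FALLOC_FL_PUNCH_HOLE | FALLOC_FL_ZERO_RANGE |
  	     FALLOC_FL_COLLAPSE_RANGE))) {
  	error = xfs_zoned_space_reserve(ip, 2, XFS_ZR_RESERVED, &ac);
  	if (error)
  		return error;
  }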

Add a new RTAVAILABLE counter of blocks that are actually directly
available to be written into, in addition to the classic free counter.
Only allow a write to go ahead if it has blocks available to write into,
and otherwise wait for GC.  This also requires tweaking the needs-GC
condition a bit, as we now always need to GC if someone is waiting for
space.
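
The gatekeeping then boils down to something like the following sketch
(both helpers here are hypothetical stand-ins just to show the shape of
the logic, which actually lives in xfs_zone_alloc.c):

  static int zoned_wait_for_available_sketch(struct xfs_mount *mp,
  		xfs_filblks_t count_fsb, bool nowait)
  {
  	/* only proceed once RTAVAILABLE covers this write ... */
  	while (!xfs_zoned_take_available(mp, count_fsb)) {	/* hypothetical */
  		if (nowait)
  			return -EAGAIN;
  		/* ... otherwise kick off GC and wait for it to free zones */
  		xfs_zoned_wait_for_gc(mp);			/* hypothetical */
  	}
  	return 0;
  }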

Thanks to Hans Holmberg <hans.holmberg@wdc.com> for lots of fixes
and improvements.

Co-developed-by: Hans Holmberg <hans.holmberg@wdc.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
35 files changed:
fs/xfs/Makefile
fs/xfs/libxfs/xfs_bmap.c
fs/xfs/libxfs/xfs_bmap.h
fs/xfs/libxfs/xfs_rtbitmap.c
fs/xfs/libxfs/xfs_rtgroup.c
fs/xfs/libxfs/xfs_rtgroup.h
fs/xfs/libxfs/xfs_sb.c
fs/xfs/libxfs/xfs_zones.c [new file with mode: 0644]
fs/xfs/libxfs/xfs_zones.h [new file with mode: 0644]
fs/xfs/scrub/scrub.c
fs/xfs/xfs_aops.c
fs/xfs/xfs_aops.h
fs/xfs/xfs_bmap_util.c
fs/xfs/xfs_bmap_util.h
fs/xfs/xfs_discard.c
fs/xfs/xfs_extfree_item.c
fs/xfs/xfs_file.c
fs/xfs/xfs_icache.c
fs/xfs/xfs_inode.h
fs/xfs/xfs_ioctl.c
fs/xfs/xfs_iomap.c
fs/xfs/xfs_iomap.h
fs/xfs/xfs_iops.c
fs/xfs/xfs_log.c
fs/xfs/xfs_mount.c
fs/xfs/xfs_mount.h
fs/xfs/xfs_reflink.c
fs/xfs/xfs_reflink.h
fs/xfs/xfs_rtalloc.c
fs/xfs/xfs_super.c
fs/xfs/xfs_trace.c
fs/xfs/xfs_trace.h
fs/xfs/xfs_zone_alloc.c [new file with mode: 0644]
fs/xfs/xfs_zone_alloc.h [new file with mode: 0644]
fs/xfs/xfs_zone_gc.c [new file with mode: 0644]

index 7afa51e414278e9cd39cf3888dc5713e0ca64ef1..cc6019342c0bf8a8c4ada5313a74d58dc89e6070 100644 (file)
@@ -64,6 +64,7 @@ xfs-y                         += $(addprefix libxfs/, \
 xfs-$(CONFIG_XFS_RT)           += $(addprefix libxfs/, \
                                   xfs_rtbitmap.o \
                                   xfs_rtgroup.o \
+                                  xfs_zones.o \
                                   )
 
 # highlevel code
@@ -136,7 +137,9 @@ xfs-$(CONFIG_XFS_QUOTA)             += xfs_dquot.o \
                                   xfs_quotaops.o
 
 # xfs_rtbitmap is shared with libxfs
-xfs-$(CONFIG_XFS_RT)           += xfs_rtalloc.o
+xfs-$(CONFIG_XFS_RT)           += xfs_rtalloc.o \
+                                  xfs_zone_alloc.o \
+                                  xfs_zone_gc.o
 
 xfs-$(CONFIG_XFS_POSIX_ACL)    += xfs_acl.o
 xfs-$(CONFIG_SYSCTL)           += xfs_sysctl.o
index dc2f2608c3962d915d4097d96a2cc169daf5d433..4bb13d34a87e2aa6641d68ae39f4d7a530802964 100644 (file)
@@ -41,6 +41,7 @@
 #include "xfs_symlink_remote.h"
 #include "xfs_inode_util.h"
 #include "xfs_rtgroup.h"
+#include "xfs_zone_alloc.h"
 
 struct kmem_cache              *xfs_bmap_intent_cache;
 
@@ -171,18 +172,16 @@ xfs_bmbt_update(
  * Compute the worst-case number of indirect blocks that will be used
  * for ip's delayed extent of length "len".
  */
-STATIC xfs_filblks_t
+xfs_filblks_t
 xfs_bmap_worst_indlen(
-       xfs_inode_t     *ip,            /* incore inode pointer */
-       xfs_filblks_t   len)            /* delayed extent length */
+       struct xfs_inode        *ip,            /* incore inode pointer */
+       xfs_filblks_t           len)            /* delayed extent length */
 {
-       int             level;          /* btree level number */
-       int             maxrecs;        /* maximum record count at this level */
-       xfs_mount_t     *mp;            /* mount structure */
-       xfs_filblks_t   rval;           /* return value */
+       struct xfs_mount        *mp = ip->i_mount;
+       int                     maxrecs = mp->m_bmap_dmxr[0];
+       int                     level;
+       xfs_filblks_t           rval;
 
-       mp = ip->i_mount;
-       maxrecs = mp->m_bmap_dmxr[0];
        for (level = 0, rval = 0;
             level < XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK);
             level++) {
@@ -2576,12 +2575,12 @@ done:
 /*
  * Convert a hole to a delayed allocation.
  */
-STATIC void
+void
 xfs_bmap_add_extent_hole_delay(
-       xfs_inode_t             *ip,    /* incore inode pointer */
+       struct xfs_inode        *ip,    /* incore inode pointer */
        int                     whichfork,
        struct xfs_iext_cursor  *icur,
-       xfs_bmbt_irec_t         *new)   /* new data to add to file extents */
+       struct xfs_bmbt_irec    *new)   /* new data to add to file extents */
 {
        struct xfs_ifork        *ifp;   /* inode fork pointer */
        xfs_bmbt_irec_t         left;   /* left neighbor extent entry */
@@ -4127,6 +4126,7 @@ retry:
 
        fdblocks = indlen;
        if (XFS_IS_REALTIME_INODE(ip)) {
+               ASSERT(!xfs_is_zoned_inode(ip));
                error = xfs_dec_frextents(mp, xfs_blen_to_rtbxlen(mp, alen));
                if (error)
                        goto out_unreserve_quota;
@@ -5072,12 +5072,18 @@ xfs_bmap_del_extent_delay(
        da_diff = da_old - da_new;
        fdblocks = da_diff;
 
-       if (bflags & XFS_BMAPI_REMAP)
+       if (bflags & XFS_BMAPI_REMAP) {
                ;
-       else if (isrt)
-               xfs_add_frextents(mp, xfs_blen_to_rtbxlen(mp, del->br_blockcount));
-       else
+       } else if (isrt) {
+               xfs_rtxlen_t    rtxlen;
+
+               rtxlen = xfs_blen_to_rtbxlen(mp, del->br_blockcount);
+               if (xfs_is_zoned_inode(ip))
+                       xfs_zoned_add_available(mp, rtxlen);
+               xfs_add_frextents(mp, rtxlen);
+       } else {
                fdblocks += del->br_blockcount;
+       }
 
        xfs_add_fdblocks(mp, fdblocks);
        xfs_mod_delalloc(ip, -(int64_t)del->br_blockcount, -da_diff);
@@ -6383,8 +6389,12 @@ xfs_bmap_validate_extent_raw(
                                           irec->br_blockcount))
                        return __this_address;
        }
-       if (irec->br_state != XFS_EXT_NORM && whichfork != XFS_DATA_FORK)
-               return __this_address;
+       if (irec->br_state != XFS_EXT_NORM) {
+               if (whichfork != XFS_DATA_FORK)
+                       return __this_address;
+               if (rtfile && xfs_has_zoned(mp))
+                       return __this_address;
+       }
        return NULL;
 }
 
index 8bfb75444d3b6c5569c1ecfff2d268e6a129b324..e114dd691fc027f1ccdd98471997e337b3276f06 100644 (file)
@@ -229,10 +229,13 @@ int       xfs_bmap_add_extent_unwritten_real(struct xfs_trans *tp,
                struct xfs_inode *ip, int whichfork,
                struct xfs_iext_cursor *icur, struct xfs_btree_cur **curp,
                struct xfs_bmbt_irec *new, int *logflagsp);
+void   xfs_bmap_add_extent_hole_delay(struct xfs_inode *ip, int whichfork,
+               struct xfs_iext_cursor *icur, struct xfs_bmbt_irec *new);
 xfs_extlen_t xfs_bmapi_minleft(struct xfs_trans *tp, struct xfs_inode *ip,
                int fork);
 int    xfs_bmap_btalloc_low_space(struct xfs_bmalloca *ap,
                struct xfs_alloc_arg *args);
+xfs_filblks_t xfs_bmap_worst_indlen(struct xfs_inode *ip, xfs_filblks_t len);
 
 enum xfs_bmap_intent_type {
        XFS_BMAP_MAP = 1,
index 4ddfb7e395b38af2d3ee8f9de109053183c8903a..8ddc6e74240ae9c8c5982aeacabb0dd963ff8dba 100644 (file)
@@ -1123,6 +1123,7 @@ xfs_rtfree_blocks(
        xfs_extlen_t            mod;
        int                     error;
 
+       ASSERT(!xfs_has_zoned(mp));
        ASSERT(rtlen <= XFS_MAX_BMBT_EXTLEN);
 
        mod = xfs_blen_to_rtxoff(mp, rtlen);
@@ -1174,6 +1175,22 @@ xfs_rtalloc_query_range(
 
        end = min(end, rtg->rtg_extents - 1);
 
+       if (xfs_has_zoned(mp)) {
+               xfs_rtxnum_t            wp;
+
+               wp = rtg->rtg_write_pointer * mp->m_sb.sb_rextsize;
+               if (end >= wp) {
+                       struct xfs_rtalloc_rec  rec = {
+                               .ar_startext    = max(start, wp),
+                               .ar_extcount    = end - max(start, wp) + 1,
+                       };
+
+                       return fn(rtg, tp, &rec, priv);
+               }
+
+               return 0;
+       }
+
        /* Iterate the bitmap, looking for discrepancies. */
        while (start <= end) {
                struct xfs_rtalloc_rec  rec;
@@ -1268,6 +1285,8 @@ xfs_rtbitmap_blockcount_len(
        struct xfs_mount        *mp,
        xfs_rtbxlen_t           rtextents)
 {
+       if (xfs_has_zoned(mp))
+               return 0;
        return howmany_64(rtextents, xfs_rtbitmap_rtx_per_rbmblock(mp));
 }
 
@@ -1308,6 +1327,11 @@ xfs_rtsummary_blockcount(
        xfs_rtbxlen_t           rextents = xfs_rtbitmap_bitcount(mp);
        unsigned long long      rsumwords;
 
+       if (xfs_has_zoned(mp)) {
+               *rsumlevels = 0;
+               return 0;
+       }
+
        *rsumlevels = xfs_compute_rextslog(rextents) + 1;
        rsumwords = xfs_rtbitmap_blockcount_len(mp, rextents) * (*rsumlevels);
        return howmany_64(rsumwords, mp->m_blockwsize);
index dbff04daaccada47c76ca109b8fe4418d2779b47..2f9fc1aaacccdc165608fef8dc0c625b4985e503 100644 (file)
@@ -77,6 +77,8 @@ xfs_rtgroup_alloc(
                return -ENOMEM;
 
        xfs_rtgroup_calc_geometry(mp, rtg, rgno, rgcount, rextents);
+       INIT_LIST_HEAD(&rtg->rtg_entry);
+       spin_lock_init(&rtg->rtg_alloc_lock);
 
        error = xfs_group_insert(mp, rtg_group(rtg), rgno, XG_TYPE_RTG);
        if (error)
@@ -184,6 +186,25 @@ xfs_update_last_rtgroup_size(
        return 0;
 }
 
+/*
+ * Zoned file systems don't have bitmap and summary inodes, instead allocations
+ * are only tracked in the rmap.
+ *
+ * This means XFS_RTGLOCK_BITMAP(_SHARED) implies that the rmap needs to be
+ * locked instead.
+ */
+static void
+xfs_rtglock_zoned_adjust(
+       struct xfs_rtgroup      *rtg,
+       unsigned int            *rtglock_flags)
+{
+       if (!xfs_has_zoned(rtg_mount(rtg)))
+               return;
+       if (*rtglock_flags & (XFS_RTGLOCK_BITMAP | XFS_RTGLOCK_BITMAP_SHARED))
+               *rtglock_flags |= XFS_RTGLOCK_RMAP;
+       *rtglock_flags &= ~(XFS_RTGLOCK_BITMAP | XFS_RTGLOCK_BITMAP_SHARED);
+}
+
 /* Lock metadata inodes associated with this rt group. */
 void
 xfs_rtgroup_lock(
@@ -194,6 +215,8 @@ xfs_rtgroup_lock(
        ASSERT(!(rtglock_flags & XFS_RTGLOCK_BITMAP_SHARED) ||
               !(rtglock_flags & XFS_RTGLOCK_BITMAP));
 
+       xfs_rtglock_zoned_adjust(rtg, &rtglock_flags);
+
        if (rtglock_flags & XFS_RTGLOCK_BITMAP) {
                /*
                 * Lock both realtime free space metadata inodes for a freespace
@@ -224,6 +247,8 @@ xfs_rtgroup_unlock(
        ASSERT(!(rtglock_flags & XFS_RTGLOCK_BITMAP_SHARED) ||
               !(rtglock_flags & XFS_RTGLOCK_BITMAP));
 
+       xfs_rtglock_zoned_adjust(rtg, &rtglock_flags);
+
        if ((rtglock_flags & XFS_RTGLOCK_REFCOUNT) &&
                        rtg->rtg_inodes[XFS_RTGI_REFCOUNT])
                xfs_iunlock(rtg->rtg_inodes[XFS_RTGI_REFCOUNT], XFS_ILOCK_EXCL);
@@ -252,6 +277,8 @@ xfs_rtgroup_trans_join(
        ASSERT(!(rtglock_flags & ~XFS_RTGLOCK_ALL_FLAGS));
        ASSERT(!(rtglock_flags & XFS_RTGLOCK_BITMAP_SHARED));
 
+       xfs_rtglock_zoned_adjust(rtg, &rtglock_flags);
+
        if (rtglock_flags & XFS_RTGLOCK_BITMAP) {
                xfs_trans_ijoin(tp, rtg->rtg_inodes[XFS_RTGI_BITMAP],
                                XFS_ILOCK_EXCL);
@@ -372,6 +399,7 @@ static const struct xfs_rtginode_ops xfs_rtginode_ops[XFS_RTGI_MAX] = {
                .sick           = XFS_SICK_RG_BITMAP,
                .fmt_mask       = (1U << XFS_DINODE_FMT_EXTENTS) |
                                  (1U << XFS_DINODE_FMT_BTREE),
+               .enabled        = xfs_has_nonzoned,
                .create         = xfs_rtbitmap_create,
        },
        [XFS_RTGI_SUMMARY] = {
@@ -380,6 +408,7 @@ static const struct xfs_rtginode_ops xfs_rtginode_ops[XFS_RTGI_MAX] = {
                .sick           = XFS_SICK_RG_SUMMARY,
                .fmt_mask       = (1U << XFS_DINODE_FMT_EXTENTS) |
                                  (1U << XFS_DINODE_FMT_BTREE),
+               .enabled        = xfs_has_nonzoned,
                .create         = xfs_rtsummary_create,
        },
        [XFS_RTGI_RMAP] = {
index 06f93b97d396c43b269c4fa520acf0aadf490cab..e8fab91a5ee2e65d9560af19e9256adec91c021a 100644 (file)
@@ -44,6 +44,17 @@ struct xfs_rtgroup {
         * Reads and writes are serialized by the rsumip inode lock.
         */
        uint8_t                 *rtg_rsum_cache;
+
+       unsigned long           rtg_flags;
+#define RTG_F_SEQUENTIAL               0
+#define RTG_F_OPEN                     1
+
+       spinlock_t              rtg_alloc_lock;
+       xfs_rgblock_t           rtg_write_pointer;
+       xfs_rgblock_t           rtg_written;
+
+       /* zone state entry */
+       struct list_head        rtg_entry;
 };
 
 static inline struct xfs_rtgroup *to_rtg(struct xfs_group *xg)
@@ -66,6 +77,8 @@ static inline xfs_rgnumber_t rtg_rgno(const struct xfs_rtgroup *rtg)
        return rtg->rtg_group.xg_gno;
 }
 
+#define XFS_RTG_RECLAIMABLE            XA_MARK_0
+
 /* Passive rtgroup references */
 static inline struct xfs_rtgroup *
 xfs_rtgroup_get(
index 89ebab965d5e4ff561203784729f31c245c4a736..d4d19b6f4389ecb73abd4d7197eeb7ef1e23cb6d 100644 (file)
@@ -30,6 +30,7 @@
 #include "xfs_rtgroup.h"
 #include "xfs_rtrmap_btree.h"
 #include "xfs_rtrefcount_btree.h"
+#include "xfs_rtbitmap.h"
 
 /*
  * Physical superblock buffer manipulations. Shared with libxfs in userspace.
@@ -268,6 +269,9 @@ static uint64_t
 xfs_expected_rbmblocks(
        struct xfs_sb           *sbp)
 {
+       if (xfs_sb_is_v5(sbp) &&
+           (sbp->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_ZONED))
+               return 0;
        return howmany_64(xfs_extents_per_rbm(sbp),
                          NBBY * xfs_rtbmblock_size(sbp));
 }
@@ -1268,7 +1272,7 @@ xfs_log_sb(
         * we handle nearly-lockless reservations, so we must use the _positive
         * variant here to avoid writing out nonsense frextents.
         */
-       if (xfs_has_rtgroups(mp))
+       if (xfs_has_rtgroups(mp) && !xfs_has_zoned(mp))
                mp->m_sb.sb_frextents = xfs_sum_freecounter(mp, FREE_RTEXTENTS);
 
        xfs_sb_to_disk(bp->b_addr, &mp->m_sb);
diff --git a/fs/xfs/libxfs/xfs_zones.c b/fs/xfs/libxfs/xfs_zones.c
new file mode 100644 (file)
index 0000000..e38529c
--- /dev/null
@@ -0,0 +1,168 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2023 Christoph Hellwig.
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_inode.h"
+#include "xfs_rtgroup.h"
+#include "xfs_zone_alloc.h"
+#include "xfs_zones.h"
+
+static int
+xfs_zone_validate_empty(
+       struct blk_zone         *zone,
+       struct xfs_rtgroup      *rtg)
+{
+       struct xfs_mount        *mp = rtg_mount(rtg);
+
+       if (*xfs_zone_used_counter(rtg) > 0) {
+               xfs_warn(mp, "empty zone %d has non-zero used counter (0x%llx).",
+                        rtg_rgno(rtg), *xfs_zone_used_counter(rtg));
+               return -EIO;
+       }
+       return 0;
+}
+
+static int
+xfs_zone_validate_wp(
+       struct blk_zone         *zone,
+       struct xfs_rtgroup      *rtg)
+{
+       struct xfs_mount        *mp = rtg_mount(rtg);
+       xfs_fileoff_t           wp_fsb = xfs_daddr_to_rtb(mp, zone->wp);
+
+       if (*xfs_zone_used_counter(rtg) > (uint64_t)rtg->rtg_extents) {
+               xfs_warn(mp, "zone %d has too large a used counter (0x%llx).",
+                        rtg_rgno(rtg), *xfs_zone_used_counter(rtg));
+               return -EIO;
+       }
+
+       /*
+        * Always use the hardware write pointer.
+        */
+       rtg->rtg_write_pointer = xfs_rtb_to_rgbno(mp, wp_fsb);
+       if (xfs_rtb_to_rgno(mp, wp_fsb) != rtg_rgno(rtg)) {
+               xfs_warn(mp, "zone %d write pointer (0x%x) outside of zone.",
+                        rtg_rgno(rtg), rtg->rtg_write_pointer);
+               return -EFSCORRUPTED;
+       }
+       if (rtg->rtg_write_pointer >= rtg->rtg_extents) {
+               xfs_warn(mp, "zone %d has invalid write pointer (0x%x).",
+                        rtg_rgno(rtg), rtg->rtg_write_pointer);
+               return -EFSCORRUPTED;
+       }
+
+       return 0;
+}
+
+static int
+xfs_zone_validate_full(
+       struct blk_zone         *zone,
+       struct xfs_rtgroup      *rtg)
+{
+       struct xfs_mount        *mp = rtg_mount(rtg);
+
+       rtg->rtg_write_pointer = rtg->rtg_extents;
+       if (*xfs_zone_used_counter(rtg) > rtg->rtg_extents) {
+               xfs_warn(mp, "zone %d has too large a used counter (0x%llx).",
+                        rtg_rgno(rtg), *xfs_zone_used_counter(rtg));
+               return -EIO;
+       }
+
+       return 0;
+}
+
+static int
+xfs_zone_validate_seq(
+       struct blk_zone         *zone,
+       struct xfs_rtgroup      *rtg)
+{
+       struct xfs_mount        *mp = rtg_mount(rtg);
+
+       set_bit(RTG_F_SEQUENTIAL, &rtg->rtg_flags);
+
+       switch (zone->cond) {
+       case BLK_ZONE_COND_EMPTY:
+               return xfs_zone_validate_empty(zone, rtg);
+       case BLK_ZONE_COND_IMP_OPEN:
+       case BLK_ZONE_COND_EXP_OPEN:
+       case BLK_ZONE_COND_CLOSED:
+               return xfs_zone_validate_wp(zone, rtg);
+       case BLK_ZONE_COND_FULL:
+               return xfs_zone_validate_full(zone, rtg);
+       case BLK_ZONE_COND_NOT_WP:
+       case BLK_ZONE_COND_OFFLINE:
+       case BLK_ZONE_COND_READONLY:
+               xfs_warn(mp, "zone %d has unsupported zone condition 0x%x.",
+                       rtg_rgno(rtg), zone->cond);
+               return -EIO;
+       default:
+               xfs_warn(mp, "zone %d has unknown zone condition 0x%x.",
+                       rtg_rgno(rtg), zone->cond);
+               return -EIO;
+       }
+}
+
+static int
+xfs_zone_validate_conv(
+       struct blk_zone         *zone,
+       struct xfs_rtgroup      *rtg)
+{
+       struct xfs_mount        *mp = rtg_mount(rtg);
+
+       switch (zone->cond) {
+       case BLK_ZONE_COND_NOT_WP:
+               return 0;
+       default:
+               xfs_warn(mp,
+"conventional zone %d has unsupported zone condition 0x%x.",
+                        rtg_rgno(rtg), zone->cond);
+               return -EIO;
+       }
+}
+
+int
+xfs_zone_validate(
+       struct blk_zone         *zone,
+       struct xfs_rtgroup      *rtg)
+{
+       struct xfs_mount        *mp = rtg_mount(rtg);
+       struct xfs_groups       *g = &mp->m_groups[XG_TYPE_RTG];
+
+       /*
+        * Check that the zone capacity matches the capacity stored in the
+        * superblock.  Note that all zones including the last one must have a
+        * uniform capacity.
+        */
+       if (XFS_BB_TO_FSB(mp, zone->capacity) != g->blocks) {
+               xfs_warn(mp,
+"zone %d capacity (0x%llx) does not match RT group size (0x%x).",
+                       rtg_rgno(rtg), XFS_BB_TO_FSB(mp, zone->capacity),
+                       g->blocks);
+               return -EIO;
+       }
+
+       if (XFS_BB_TO_FSB(mp, zone->len) != 1 << g->blklog) {
+               xfs_warn(mp,
+"zone %d length (0x%llx) does match geometry (0x%x).",
+                       rtg_rgno(rtg), XFS_BB_TO_FSB(mp, zone->len),
+                       1 << g->blklog);
+       }
+
+       switch (zone->type) {
+       case BLK_ZONE_TYPE_CONVENTIONAL:
+               return xfs_zone_validate_conv(zone, rtg);
+       case BLK_ZONE_TYPE_SEQWRITE_REQ:
+               return xfs_zone_validate_seq(zone, rtg);
+       default:
+       xfs_warn(mp, "zone %d has unsupported type 0x%x.",
+                       rtg_rgno(rtg), zone->type);
+               return -EFSCORRUPTED;
+       }
+}
diff --git a/fs/xfs/libxfs/xfs_zones.h b/fs/xfs/libxfs/xfs_zones.h
new file mode 100644 (file)
index 0000000..7d0e3ce
--- /dev/null
@@ -0,0 +1,32 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _LIBXFS_ZONES_H
+#define _LIBXFS_ZONES_H
+
+/*
+ * In order to guarantee forward progress for GC we need to reserve at least
+ * two zones:  one that will be used for moving data into and one spare zone
+ * making sure that we have enough space to relocate a nearly-full zone.
+ * To allow for slightly sloppy accounting of when we need to reserve the
+ * second zone, we actually reserve three, as that is easier than doing fully
+ * accurate bookkeeping.
+ */
+#define XFS_GC_ZONES           3U
+
+/*
+ * In addition we need two zones for user writes, one open zone for writing
+ * and one to still have available blocks without resetting the open zone
+ * when data in the open zone has been freed.
+ */
+#define XFS_RESERVED_ZONES     (XFS_GC_ZONES + 1)
+#define XFS_MIN_ZONES          (XFS_RESERVED_ZONES + 1)
+
+/*
+ * Always keep one zone out of the general open zone pool to allow for GC to
+ * happen while other writers are waiting for free space.
+ */
+#define XFS_OPEN_GC_ZONES      1U
+#define XFS_MIN_OPEN_ZONES     (XFS_OPEN_GC_ZONES + 1U)
+
+int xfs_zone_validate(struct blk_zone *zone, struct xfs_rtgroup *rtg);
+
+#endif /* _LIBXFS_ZONES_H */
index 7567dd5cad14f4734fbfbcfc0e0278b46fc29525..1a05c27ba47197f2e39df3c3879e8b378beb2f5d 100644 (file)
@@ -387,12 +387,14 @@ static const struct xchk_meta_ops meta_scrub_ops[] = {
        },
        [XFS_SCRUB_TYPE_RTBITMAP] = {   /* realtime bitmap */
                .type   = ST_RTGROUP,
+               .has    = xfs_has_nonzoned,
                .setup  = xchk_setup_rtbitmap,
                .scrub  = xchk_rtbitmap,
                .repair = xrep_rtbitmap,
        },
        [XFS_SCRUB_TYPE_RTSUM] = {      /* realtime summary */
                .type   = ST_RTGROUP,
+               .has    = xfs_has_nonzoned,
                .setup  = xchk_setup_rtsummary,
                .scrub  = xchk_rtsummary,
                .repair = xrep_rtsummary,
index d175853da5aef0177e21ff0a50c2bb0c3b76853c..536776e14a779f890621391b80075b6530922439 100644 (file)
@@ -1,7 +1,7 @@
 // SPDX-License-Identifier: GPL-2.0
 /*
  * Copyright (c) 2000-2005 Silicon Graphics, Inc.
- * Copyright (c) 2016-2018 Christoph Hellwig.
+ * Copyright (c) 2016-2023 Christoph Hellwig.
  * All Rights Reserved.
  */
 #include "xfs.h"
@@ -19,6 +19,8 @@
 #include "xfs_reflink.h"
 #include "xfs_errortag.h"
 #include "xfs_error.h"
+#include "xfs_zone_alloc.h"
+#include "xfs_rtgroup.h"
 
 struct xfs_writepage_ctx {
        struct iomap_writepage_ctx ctx;
@@ -85,6 +87,7 @@ xfs_end_ioend(
 {
        struct xfs_inode        *ip = XFS_I(ioend->io_inode);
        struct xfs_mount        *mp = ip->i_mount;
+       bool                    is_zoned = xfs_is_zoned_inode(ip);
        xfs_off_t               offset = ioend->io_offset;
        size_t                  size = ioend->io_size;
        unsigned int            nofs_flag;
@@ -115,9 +118,10 @@ xfs_end_ioend(
        error = blk_status_to_errno(ioend->io_bio.bi_status);
        if (unlikely(error)) {
                if (ioend->io_flags & IOMAP_F_SHARED) {
+                       ASSERT(!is_zoned);
                        xfs_reflink_cancel_cow_range(ip, offset, size, true);
                        xfs_bmap_punch_delalloc_range(ip, XFS_DATA_FORK, offset,
-                                       offset + size);
+                                       offset + size, NULL);
                }
                goto done;
        }
@@ -125,12 +129,15 @@ xfs_end_ioend(
        /*
         * Success: commit the COW or unwritten blocks if needed.
         */
-       if (ioend->io_flags & IOMAP_F_SHARED)
+       if (is_zoned)
+               error = xfs_zoned_end_io(ip, offset, size, ioend->io_sector,
+                               NULLFSBLOCK);
+       else if (ioend->io_flags & IOMAP_F_SHARED)
                error = xfs_reflink_end_cow(ip, offset, size);
        else if (ioend->io_type == IOMAP_UNWRITTEN)
                error = xfs_iomap_write_unwritten(ip, offset, size, false);
 
-       if (!error && xfs_ioend_is_append(ioend))
+       if (!error && xfs_ioend_is_append(ioend) && !ioend->io_isdirect)
                error = xfs_setfilesize(ip, ioend->io_offset, ioend->io_size);
 done:
        iomap_finish_ioends(ioend, error);
@@ -175,17 +182,30 @@ xfs_end_io(
        }
 }
 
-STATIC void
+void
 xfs_end_bio(
        struct bio              *bio)
 {
        struct iomap_ioend      *ioend = iomap_ioend_from_bio(bio);
        struct xfs_inode        *ip = XFS_I(ioend->io_inode);
+       struct xfs_mount        *mp = ip->i_mount;
        unsigned long           flags;
 
+       if (bio_is_zone_append(bio)) {
+               /*
+                * Record the actually written block number and make sure we
+                * don't merge the first ioend for a zone into the last one
+                * for the previous zone.
+                */
+               ioend->io_sector = bio->bi_iter.bi_sector;
+               if (!(xfs_daddr_to_rtb(mp, ioend->io_sector) %
+                     mp->m_groups[XG_TYPE_RTG].blocks))
+                       ioend->io_flags |= IOMAP_F_BOUNDARY;
+       }
+
        spin_lock_irqsave(&ip->i_ioend_lock, flags);
        if (list_empty(&ip->i_ioend_list))
-               WARN_ON_ONCE(!queue_work(ip->i_mount->m_unwritten_workqueue,
+               WARN_ON_ONCE(!queue_work(mp->m_unwritten_workqueue,
                                         &ip->i_ioend_work));
        list_add_tail(&ioend->io_list, &ip->i_ioend_list);
        spin_unlock_irqrestore(&ip->i_ioend_lock, flags);
@@ -462,7 +482,7 @@ xfs_discard_folio(
         * folio itself and not the start offset that is passed in.
         */
        xfs_bmap_punch_delalloc_range(ip, XFS_DATA_FORK, pos,
-                               folio_pos(folio) + folio_size(folio));
+                               folio_pos(folio) + folio_size(folio), NULL);
 }
 
 static const struct iomap_writeback_ops xfs_writeback_ops = {
@@ -471,14 +491,116 @@ static const struct iomap_writeback_ops xfs_writeback_ops = {
        .discard_folio          = xfs_discard_folio,
 };
 
+struct xfs_zoned_writepage_ctx {
+       struct iomap_writepage_ctx ctx;
+       struct xfs_rtgroup      *rtg;
+};
+
+static inline struct xfs_zoned_writepage_ctx *
+XFS_ZWPC(struct iomap_writepage_ctx *ctx)
+{
+       return container_of(ctx, struct xfs_zoned_writepage_ctx, ctx);
+}
+
+static int
+xfs_zoned_map_blocks(
+       struct iomap_writepage_ctx *wpc,
+       struct inode            *inode,
+       loff_t                  offset,
+       unsigned int            len)
+{
+       struct xfs_inode        *ip = XFS_I(inode);
+       struct xfs_mount        *mp = ip->i_mount;
+       xfs_fileoff_t           offset_fsb = XFS_B_TO_FSBT(mp, offset);
+       xfs_fileoff_t           end_fsb = XFS_B_TO_FSB(mp, offset + len);
+       xfs_filblks_t           count_fsb;
+       struct xfs_bmbt_irec    imap, del;
+       struct xfs_iext_cursor  icur;
+
+       if (xfs_is_shutdown(mp))
+               return -EIO;
+
+       XFS_ERRORTAG_DELAY(mp, XFS_ERRTAG_WB_DELAY_MS);
+
+       /*
+        * All dirty data must be covered by delalloc extents.  But truncate can
+        * remove delalloc extents underneath us or reduce their size.
+        * Returning a hole tells iomap to not write back any data from this
+        * range, which is the right thing to do in that case.
+        *
+        * Otherwise just tell iomap to treat ranges previously covered by a
+        * delalloc extent as mapped.  The actual block allocation will be done
+        * just before submitting the bio.
+        */
+       xfs_ilock(ip, XFS_ILOCK_EXCL);
+       if (!xfs_iext_lookup_extent(ip, ip->i_cowfp, offset_fsb, &icur, &imap))
+               imap.br_startoff = end_fsb;     /* fake a hole past EOF */
+       if (imap.br_startoff > offset_fsb) {
+               imap.br_blockcount = imap.br_startoff - offset_fsb;
+               imap.br_startoff = offset_fsb;
+               imap.br_startblock = HOLESTARTBLOCK;
+               imap.br_state = XFS_EXT_NORM;
+               xfs_iunlock(ip, XFS_ILOCK_EXCL);
+               xfs_bmbt_to_iomap(ip, &wpc->iomap, &imap, 0, 0, 0);
+               return 0;
+       }
+       end_fsb = min(end_fsb, imap.br_startoff + imap.br_blockcount);
+       count_fsb = end_fsb - offset_fsb;
+
+       del = imap;
+       xfs_trim_extent(&del, offset_fsb, count_fsb);
+       xfs_bmap_del_extent_delay(ip, XFS_COW_FORK, &icur, &imap, &del,
+                       XFS_BMAPI_REMAP);
+       xfs_iunlock(ip, XFS_ILOCK_EXCL);
+
+       wpc->iomap.type = IOMAP_MAPPED;
+       wpc->iomap.flags = IOMAP_F_DIRTY;
+       wpc->iomap.bdev = mp->m_rtdev_targp->bt_bdev;
+       wpc->iomap.offset = offset;
+       wpc->iomap.length = XFS_FSB_TO_B(mp, count_fsb);
+       wpc->iomap.flags = IOMAP_F_ZONE_APPEND;
+       wpc->iomap.addr = 0;
+
+       trace_xfs_zoned_map_blocks(ip, offset, wpc->iomap.length);
+       return 0;
+}
+
+static int
+xfs_zoned_submit_ioend(
+       struct iomap_writepage_ctx *wpc,
+       int                     status)
+{
+       wpc->ioend->io_bio.bi_end_io = xfs_end_bio;
+       if (status)
+               return status;
+       xfs_zone_alloc_and_submit(wpc->ioend, &XFS_ZWPC(wpc)->rtg);
+       return 0;
+}
+
+static const struct iomap_writeback_ops xfs_zoned_writeback_ops = {
+       .map_blocks             = xfs_zoned_map_blocks,
+       .submit_ioend           = xfs_zoned_submit_ioend,
+       .discard_folio          = xfs_discard_folio,
+};
+
 STATIC int
 xfs_vm_writepages(
        struct address_space    *mapping,
        struct writeback_control *wbc)
 {
+       struct xfs_inode        *ip = XFS_I(mapping->host);
        struct xfs_writepage_ctx wpc = { };
+       int                     error;
 
-       xfs_iflags_clear(XFS_I(mapping->host), XFS_ITRUNCATED);
+       xfs_iflags_clear(ip, XFS_ITRUNCATED);
+       if (xfs_is_zoned_inode(ip)) {
+               struct xfs_zoned_writepage_ctx xc = { };
+
+               error = iomap_writepages(mapping, wbc, &xc.ctx,
+                                        &xfs_zoned_writeback_ops);
+               xfs_zone_finish_alloc(xc.rtg);
+               return error;
+       }
        return iomap_writepages(mapping, wbc, &wpc.ctx, &xfs_writeback_ops);
 }
 
index c96187f981bdd38be93ecf9d4e66580d2780dc5e..906016c2c57c6d6ccc7ccf5ef32a720fc33595db 100644 (file)
@@ -11,6 +11,7 @@ struct iomap_ioend;
 extern const struct address_space_operations xfs_address_space_operations;
 extern const struct address_space_operations xfs_dax_aops;
 
-int    xfs_setfilesize(struct xfs_inode *ip, xfs_off_t offset, size_t size);
+int xfs_setfilesize(struct xfs_inode *ip, xfs_off_t offset, size_t size);
+void xfs_end_bio(struct bio *bio);
 
 #endif /* __XFS_AOPS_H__ */
index 67a5940285f163da8696dad96fc2edb19f0e7204..05fd768f7dcd76d557aee4976643b535a8c995fb 100644 (file)
@@ -30,6 +30,7 @@
 #include "xfs_reflink.h"
 #include "xfs_rtbitmap.h"
 #include "xfs_rtgroup.h"
+#include "xfs_zone_alloc.h"
 
 /* Kernel only BMAP related definitions and functions */
 
@@ -436,7 +437,8 @@ xfs_bmap_punch_delalloc_range(
        struct xfs_inode        *ip,
        int                     whichfork,
        xfs_off_t               start_byte,
-       xfs_off_t               end_byte)
+       xfs_off_t               end_byte,
+       struct xfs_zone_alloc_ctx *ac)
 {
        struct xfs_mount        *mp = ip->i_mount;
        struct xfs_ifork        *ifp = xfs_ifork_ptr(ip, whichfork);
@@ -467,7 +469,10 @@ xfs_bmap_punch_delalloc_range(
                        continue;
                }
 
-               xfs_bmap_del_extent_delay(ip, whichfork, &icur, &got, &del, 0);
+               xfs_bmap_del_extent_delay(ip, whichfork, &icur, &got, &del,
+                               ac ? XFS_BMAPI_REMAP : 0);
+               if (xfs_is_zoned_inode(ip) && ac)
+                       ac->reserved_blocks += del.br_blockcount;
                if (!xfs_iext_get_extent(ifp, &icur, &got))
                        break;
        }
@@ -582,7 +587,7 @@ xfs_free_eofblocks(
                if (ip->i_delayed_blks) {
                        xfs_bmap_punch_delalloc_range(ip, XFS_DATA_FORK,
                                round_up(XFS_ISIZE(ip), mp->m_sb.sb_blocksize),
-                               LLONG_MAX);
+                               LLONG_MAX, NULL);
                }
                xfs_inode_clear_eofblocks_tag(ip);
                return 0;
@@ -823,14 +828,15 @@ xfs_flush_unmap_range(
 
 int
 xfs_free_file_space(
-       struct xfs_inode        *ip,
-       xfs_off_t               offset,
-       xfs_off_t               len)
+       struct xfs_inode                *ip,
+       xfs_off_t                       offset,
+       xfs_off_t                       len,
+       struct xfs_zone_alloc_ctx       *ac)
 {
-       struct xfs_mount        *mp = ip->i_mount;
-       xfs_fileoff_t           startoffset_fsb;
-       xfs_fileoff_t           endoffset_fsb;
-       int                     done = 0, error;
+       struct xfs_mount                *mp = ip->i_mount;
+       xfs_fileoff_t                   startoffset_fsb;
+       xfs_fileoff_t                   endoffset_fsb;
+       int                             done = 0, error;
 
        trace_xfs_free_file_space(ip);
 
@@ -880,7 +886,7 @@ xfs_free_file_space(
                return 0;
        if (offset + len > XFS_ISIZE(ip))
                len = XFS_ISIZE(ip) - offset;
-       error = xfs_zero_range(ip, offset, len, NULL);
+       error = xfs_zero_range(ip, offset, len, ac, NULL);
        if (error)
                return error;
 
@@ -968,7 +974,8 @@ int
 xfs_collapse_file_space(
        struct xfs_inode        *ip,
        xfs_off_t               offset,
-       xfs_off_t               len)
+       xfs_off_t               len,
+       struct xfs_zone_alloc_ctx *ac)
 {
        struct xfs_mount        *mp = ip->i_mount;
        struct xfs_trans        *tp;
@@ -981,7 +988,7 @@ xfs_collapse_file_space(
 
        trace_xfs_collapse_file_space(ip);
 
-       error = xfs_free_file_space(ip, offset, len);
+       error = xfs_free_file_space(ip, offset, len, ac);
        if (error)
                return error;
 
index b29760d36e1ab1ef5e8392c1457180ce3ac9f59d..41a5b70e19dbefa74e6c2ff8998a967de991b859 100644 (file)
@@ -15,6 +15,7 @@ struct xfs_inode;
 struct xfs_mount;
 struct xfs_trans;
 struct xfs_bmalloca;
+struct xfs_zone_alloc_ctx;
 
 #ifdef CONFIG_XFS_RT
 int    xfs_bmap_rtalloc(struct xfs_bmalloca *ap);
@@ -31,7 +32,8 @@ xfs_bmap_rtalloc(struct xfs_bmalloca *ap)
 #endif /* CONFIG_XFS_RT */
 
 void   xfs_bmap_punch_delalloc_range(struct xfs_inode *ip, int whichfork,
-               xfs_off_t start_byte, xfs_off_t end_byte);
+               xfs_off_t start_byte, xfs_off_t end_byte,
+               struct xfs_zone_alloc_ctx *ac);
 
 struct kgetbmap {
        __s64           bmv_offset;     /* file offset of segment in blocks */
@@ -56,9 +58,9 @@ int   xfs_bmap_last_extent(struct xfs_trans *tp, struct xfs_inode *ip,
 int    xfs_alloc_file_space(struct xfs_inode *ip, xfs_off_t offset,
                             xfs_off_t len);
 int    xfs_free_file_space(struct xfs_inode *ip, xfs_off_t offset,
-                           xfs_off_t len);
+                           xfs_off_t len, struct xfs_zone_alloc_ctx *ac);
 int    xfs_collapse_file_space(struct xfs_inode *, xfs_off_t offset,
-                               xfs_off_t len);
+                               xfs_off_t len, struct xfs_zone_alloc_ctx *ac);
 int    xfs_insert_file_space(struct xfs_inode *, xfs_off_t offset,
                                xfs_off_t len);
 
index c4bd145f5ec1bfb1b0fda2bf5a78878c9a2b70e9..70ff5b92882ddfbbb604c8656cd9154fe90f8484 100644 (file)
@@ -844,15 +844,19 @@ xfs_ioc_trim(
 
        if (!capable(CAP_SYS_ADMIN))
                return -EPERM;
-       if (mp->m_rtdev_targp &&
+
+       if (mp->m_rtdev_targp && !xfs_has_zoned(mp) &&
            bdev_max_discard_sectors(mp->m_rtdev_targp->bt_bdev))
                rt_bdev = mp->m_rtdev_targp->bt_bdev;
        if (!bdev_max_discard_sectors(mp->m_ddev_targp->bt_bdev) && !rt_bdev)
                return -EOPNOTSUPP;
 
-       if (rt_bdev)
+       if (rt_bdev) {
+               if (!bdev_max_discard_sectors(rt_bdev))
+                       return -EOPNOTSUPP;
                granularity = max(granularity,
                                  bdev_discard_granularity(rt_bdev));
+       }
 
        /*
         * We haven't recovered the log, so we cannot use our bnobt-guided
index a25c713ff888c7c773c8fd4f87751feee82e1ffd..a8d1817542b0080e8e6d14475d93ed0002d3ccfa 100644 (file)
@@ -29,6 +29,7 @@
 #include "xfs_inode.h"
 #include "xfs_rtbitmap.h"
 #include "xfs_rtgroup.h"
+#include "xfs_zone_alloc.h"
 
 struct kmem_cache      *xfs_efi_cache;
 struct kmem_cache      *xfs_efd_cache;
@@ -774,14 +775,21 @@ xfs_rtextent_free_finish_item(
                        xfs_rtgroup_trans_join(tp, *rtgp,
                                        XFS_RTGLOCK_BITMAP);
                }
-               error = xfs_rtfree_blocks(tp, *rtgp,
-                               xefi->xefi_startblock, xefi->xefi_blockcount);
+
+               if (xfs_has_zoned(mp)) {
+                       error = xfs_zone_free_blocks(tp, *rtgp,
+                                       xefi->xefi_startblock,
+                                       xefi->xefi_blockcount);
+               } else {
+                       error = xfs_rtfree_blocks(tp, *rtgp,
+                                       xefi->xefi_startblock,
+                                       xefi->xefi_blockcount);
+               }
        }
        if (error == -EAGAIN) {
                xfs_efd_from_efi(efdp);
                return error;
        }
-
        xfs_efd_add_extent(efdp, xefi);
        xfs_extent_free_cancel_item(item);
        return error;
index a952d3faa06611e650f2c41f600c7a8ac9b39681..6cccb7c9560b7a66573226fa608be13ae11da4cf 100644 (file)
@@ -25,6 +25,9 @@
 #include "xfs_iomap.h"
 #include "xfs_reflink.h"
 #include "xfs_file.h"
+#include "xfs_aops.h"
+#include "xfs_zone_alloc.h"
+#include "xfs_rtbitmap.h"
 
 #include <linux/dax.h>
 #include <linux/falloc.h>
@@ -360,7 +363,8 @@ xfs_file_write_zero_eof(
        struct iov_iter         *from,
        unsigned int            *iolock,
        size_t                  count,
-       bool                    *drained_dio)
+       bool                    *drained_dio,
+       struct xfs_zone_alloc_ctx *ac)
 {
        struct xfs_inode        *ip = XFS_I(iocb->ki_filp->f_mapping->host);
        loff_t                  isize;
@@ -414,7 +418,7 @@ xfs_file_write_zero_eof(
        trace_xfs_zero_eof(ip, isize, iocb->ki_pos - isize);
 
        xfs_ilock(ip, XFS_MMAPLOCK_EXCL);
-       error = xfs_zero_range(ip, isize, iocb->ki_pos - isize, NULL);
+       error = xfs_zero_range(ip, isize, iocb->ki_pos - isize, ac, NULL);
        xfs_iunlock(ip, XFS_MMAPLOCK_EXCL);
 
        return error;
@@ -431,7 +435,8 @@ STATIC ssize_t
 xfs_file_write_checks(
        struct kiocb            *iocb,
        struct iov_iter         *from,
-       unsigned int            *iolock)
+       unsigned int            *iolock,
+       struct xfs_zone_alloc_ctx *ac)
 {
        struct inode            *inode = iocb->ki_filp->f_mapping->host;
        size_t                  count = iov_iter_count(from);
@@ -478,10 +483,15 @@ restart:
         * can only extend EOF.  Truncate is locked out at this point, so the
         * EOF can not move backwards, only forwards. Hence we only need to take
         * the slow path when we are at or beyond the current EOF.
+        *
+        * For zoned file systems, we never allocate speculative blocks, so
+        * there is no need to zero anything.  The tail of the block beyond
+        * i_size was already zeroed when writing it, and the beginning of
+        * the block where the write starts will be zeroed by the write itself.
         */
        if (iocb->ki_pos > i_size_read(inode)) {
                error = xfs_file_write_zero_eof(iocb, from, iolock, count,
-                               &drained_dio);
+                               &drained_dio, ac);
                if (error == 1)
                        goto restart;
                if (error)
@@ -503,6 +513,9 @@ xfs_dio_write_end_io(
        loff_t                  offset = iocb->ki_pos;
        unsigned int            nofs_flag;
 
+       ASSERT(!xfs_is_zoned_inode(ip) ||
+              !(flags & (IOMAP_DIO_UNWRITTEN | IOMAP_DIO_COW)));
+
        trace_xfs_end_io_direct_write(ip, offset, size);
 
        if (xfs_is_shutdown(ip->i_mount))
@@ -582,14 +595,94 @@ static const struct iomap_dio_ops xfs_dio_write_ops = {
        .end_io         = xfs_dio_write_end_io,
 };
 
+static void
+xfs_dio_zoned_submit_io(
+       const struct iomap_iter *iter,
+       struct bio              *bio,
+       loff_t                  file_offset)
+{
+       struct xfs_mount        *mp = XFS_I(iter->inode)->i_mount;
+       struct xfs_zone_alloc_ctx *ac = iter->private;
+       xfs_filblks_t           count_fsb;
+       struct iomap_ioend      *ioend;
+
+       count_fsb = XFS_B_TO_FSB(mp, bio->bi_iter.bi_size);
+       if (count_fsb > ac->reserved_blocks) {
+               xfs_err(mp,
+"allocation (%lld) larger than reservation (%lld).",
+                       count_fsb, ac->reserved_blocks);
+               xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
+               bio_io_error(bio);
+               return;
+       }
+       ac->reserved_blocks -= count_fsb;
+
+       bio->bi_end_io = xfs_end_bio;
+       ioend = iomap_init_ioend(iter->inode, bio, file_offset,
+                       IOMAP_MAPPED, 0, true);
+       xfs_zone_alloc_and_submit(ioend, &ac->cached_rtg);
+}
+
+static const struct iomap_dio_ops xfs_dio_zoned_write_ops = {
+       .bio_set        = &iomap_ioend_bioset,
+       .submit_io      = xfs_dio_zoned_submit_io,
+       .end_io         = xfs_dio_write_end_io,
+};
+
+static ssize_t
+xfs_zoned_write_space_reserve(
+       struct xfs_inode                *ip,
+       struct kiocb                    *iocb,
+       struct iov_iter                 *from,
+       unsigned int                    flags,
+       struct xfs_zone_alloc_ctx       *ac)
+{
+       loff_t                          count = iov_iter_count(from);
+       int                             error;
+
+       if (iocb->ki_flags & IOCB_NOWAIT)
+               flags |= XFS_ZR_NOWAIT;
+
+       /*
+        * Check the rlimit and LFS boundary first so that we don't over-reserve
+        * by possibly a lot.
+        *
+        * The generic write path will redo this check later, and it might have
+        * changed by then.  If it got expanded we'll stick to our earlier
+        * smaller limit, and if it is decreased the new smaller limit will be
+        * used and our extra space reservation will be returned after finishing
+        * the write.
+        */
+       error = generic_write_check_limits(iocb->ki_filp, iocb->ki_pos, &count);
+       if (error)
+               return error;
+
+       /*
+        * Sloppily round up count to file system blocks.
+        *
+        * This will often reserve an extra block, but that avoids having to look
+        * at the start offset, which isn't stable for O_APPEND until taking the
+        * iolock.  Also we need to reserve a block each for zeroing the old
+        * EOF block and the new start block if they are unaligned.
+        *
+        * Any remaining block will be returned after the write.
+        */
+       return xfs_zoned_space_reserve(ip,
+                       XFS_B_TO_FSB(ip->i_mount, count) + 1 + 2,
+                       flags, ac);
+}
+
 /*
- * Handle block aligned direct I/O writes
+ * Handle block aligned direct I/O writes.
  */
 static noinline ssize_t
 xfs_file_dio_write_aligned(
        struct xfs_inode        *ip,
        struct kiocb            *iocb,
-       struct iov_iter         *from)
+       struct iov_iter         *from,
+       const struct iomap_ops  *ops,
+       const struct iomap_dio_ops *dops,
+       struct xfs_zone_alloc_ctx *ac)
 {
        unsigned int            iolock = XFS_IOLOCK_SHARED;
        ssize_t                 ret;
@@ -597,7 +690,7 @@ xfs_file_dio_write_aligned(
        ret = xfs_ilock_iocb_for_write(iocb, &iolock);
        if (ret)
                return ret;
-       ret = xfs_file_write_checks(iocb, from, &iolock);
+       ret = xfs_file_write_checks(iocb, from, &iolock, ac);
        if (ret)
                goto out_unlock;
 
@@ -611,11 +704,31 @@ xfs_file_dio_write_aligned(
                iolock = XFS_IOLOCK_SHARED;
        }
        trace_xfs_file_direct_write(iocb, from);
-       ret = iomap_dio_rw(iocb, from, &xfs_direct_write_iomap_ops,
-                          &xfs_dio_write_ops, 0, NULL, 0);
+       ret = iomap_dio_rw(iocb, from, ops, dops, 0, ac, 0);
 out_unlock:
-       if (iolock)
-               xfs_iunlock(ip, iolock);
+       xfs_iunlock(ip, iolock);
+       return ret;
+}
+
+/*
+ * Handle block aligned direct I/O writes to zoned devices.
+ */
+static noinline ssize_t
+xfs_file_dio_write_zoned(
+       struct xfs_inode        *ip,
+       struct kiocb            *iocb,
+       struct iov_iter         *from)
+{
+       struct xfs_zone_alloc_ctx ac;
+       ssize_t                 ret;
+
+       ret = xfs_zoned_write_space_reserve(ip, iocb, from, 0, &ac);
+       if (ret < 0)
+               return ret;
+       ret = xfs_file_dio_write_aligned(ip, iocb, from,
+                       &xfs_zoned_direct_write_iomap_ops,
+                       &xfs_dio_zoned_write_ops, &ac);
+       xfs_zoned_space_unreserve(ip, &ac);
        return ret;
 }
 
@@ -675,7 +788,7 @@ retry_exclusive:
                goto out_unlock;
        }
 
-       ret = xfs_file_write_checks(iocb, from, &iolock);
+       ret = xfs_file_write_checks(iocb, from, &iolock, NULL);
        if (ret)
                goto out_unlock;
 
@@ -732,7 +845,10 @@ xfs_file_dio_write(
            (xfs_is_always_cow_inode(ip) &&
             (iov_iter_alignment(from) & ip->i_mount->m_blockmask)))
                return xfs_file_dio_write_unaligned(ip, iocb, from);
-       return xfs_file_dio_write_aligned(ip, iocb, from);
+       if (xfs_is_zoned_inode(ip))
+               return xfs_file_dio_write_zoned(ip, iocb, from);
+       return xfs_file_dio_write_aligned(ip, iocb, from,
+                       &xfs_direct_write_iomap_ops, &xfs_dio_write_ops, NULL);
 }
 
 static noinline ssize_t
@@ -749,7 +865,7 @@ xfs_file_dax_write(
        ret = xfs_ilock_iocb(iocb, iolock);
        if (ret)
                return ret;
-       ret = xfs_file_write_checks(iocb, from, &iolock);
+       ret = xfs_file_write_checks(iocb, from, &iolock, NULL);
        if (ret)
                goto out;
 
@@ -793,7 +909,7 @@ write_retry:
        if (ret)
                return ret;
 
-       ret = xfs_file_write_checks(iocb, from, &iolock);
+       ret = xfs_file_write_checks(iocb, from, &iolock, NULL);
        if (ret)
                goto out;
 
@@ -840,6 +956,67 @@ out:
        return ret;
 }
 
+STATIC ssize_t
+xfs_file_buffered_write_zoned(
+       struct kiocb            *iocb,
+       struct iov_iter         *from)
+{
+       struct xfs_inode        *ip = XFS_I(iocb->ki_filp->f_mapping->host);
+       struct xfs_mount        *mp = ip->i_mount;
+       unsigned int            iolock = XFS_IOLOCK_EXCL;
+       bool                    cleared_space = false;
+       struct xfs_zone_alloc_ctx ac;
+       ssize_t                 ret;
+
+       ret = xfs_zoned_write_space_reserve(ip, iocb, from, XFS_ZR_GREEDY, &ac);
+       if (ret < 0)
+               return ret;
+
+       ret = xfs_ilock_iocb(iocb, iolock);
+       if (ret)
+               goto out_unreserve;
+
+       ret = xfs_file_write_checks(iocb, from, &iolock, &ac);
+       if (ret)
+               goto out_unlock;
+
+       /*
+        * Truncate the iter to the length that we were actually able to
+        * allocate blocks for.  This needs to happen after
+        * xfs_file_write_checks, because that assigns ki_pos for O_APPEND
+        * writes.
+        */
+       iov_iter_truncate(from,
+                       XFS_FSB_TO_B(mp, ac.reserved_blocks) -
+                       (iocb->ki_pos & mp->m_blockmask));
+       if (!iov_iter_count(from))
+               goto out_unlock;
+
+retry:
+       trace_xfs_file_buffered_write(iocb, from);
+       ret = iomap_file_buffered_write(iocb, from,
+                       &xfs_buffered_write_iomap_ops, &ac);
+       if (ret == -ENOSPC && !cleared_space) {
+               /* 
+                * Kick off writeback to convert delalloc space and release the
+                * usually too pessimistic indirect block reservations.
+                */
+               xfs_flush_inodes(mp);
+               cleared_space = true;
+               goto retry;
+       }
+
+out_unlock:
+       xfs_iunlock(ip, iolock);
+out_unreserve:
+       xfs_zoned_space_unreserve(ip, &ac);
+       if (ret > 0) {
+               XFS_STATS_ADD(mp, xs_write_bytes, ret);
+               ret = generic_write_sync(iocb, ret);
+       }
+       return ret;
+}
+
 STATIC ssize_t
 xfs_file_write_iter(
        struct kiocb            *iocb,
@@ -873,6 +1050,8 @@ xfs_file_write_iter(
                        return ret;
        }
 
+       if (xfs_is_zoned_inode(ip))
+               return xfs_file_buffered_write_zoned(iocb, from);
        return xfs_file_buffered_write(iocb, from);
 }
 
@@ -927,7 +1106,8 @@ static int
 xfs_falloc_collapse_range(
        struct file             *file,
        loff_t                  offset,
-       loff_t                  len)
+       loff_t                  len,
+       struct xfs_zone_alloc_ctx *ac)
 {
        struct inode            *inode = file_inode(file);
        loff_t                  new_size = i_size_read(inode) - len;
@@ -943,7 +1123,7 @@ xfs_falloc_collapse_range(
        if (offset + len >= i_size_read(inode))
                return -EINVAL;
 
-       error = xfs_collapse_file_space(XFS_I(inode), offset, len);
+       error = xfs_collapse_file_space(XFS_I(inode), offset, len, ac);
        if (error)
                return error;
        return xfs_falloc_setsize(file, new_size);
@@ -999,7 +1179,8 @@ xfs_falloc_zero_range(
        struct file             *file,
        int                     mode,
        loff_t                  offset,
-       loff_t                  len)
+       loff_t                  len,
+       struct xfs_zone_alloc_ctx *ac)
 {
        struct inode            *inode = file_inode(file);
        unsigned int            blksize = i_blocksize(inode);
@@ -1012,7 +1193,7 @@ xfs_falloc_zero_range(
        if (error)
                return error;
 
-       error = xfs_free_file_space(XFS_I(inode), offset, len);
+       error = xfs_free_file_space(XFS_I(inode), offset, len, ac);
        if (error)
                return error;
 
@@ -1093,12 +1274,29 @@ xfs_file_fallocate(
        struct xfs_inode        *ip = XFS_I(inode);
        long                    error;
        uint                    iolock = XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL;
+       struct xfs_zone_alloc_ctx ac = { };
 
        if (!S_ISREG(inode->i_mode))
                return -EINVAL;
        if (mode & ~XFS_FALLOC_FL_SUPPORTED)
                return -EOPNOTSUPP;
 
+       /*
+        * For zoned file systems, zeroing the first and last block of a hole
+        * punch requires allocating a new block to rewrite the remaining data
+        * and new zeroes out of place.  Get a reservations for those before
+        * and new zeroes out of place.  Get a reservation for those before
+        * expected to be able to punch a hole even on a completely full
+        * file system.
+        */
+       if (xfs_is_zoned_inode(ip) &&
+           (mode & (FALLOC_FL_PUNCH_HOLE | FALLOC_FL_ZERO_RANGE |
+                    FALLOC_FL_COLLAPSE_RANGE))) {
+               error = xfs_zoned_space_reserve(ip, 2, XFS_ZR_RESERVED, &ac);
+               if (error)
+                       return error;
+       }
+
        xfs_ilock(ip, iolock);
        error = xfs_break_layouts(inode, &iolock, BREAK_UNMAP);
        if (error)
@@ -1119,16 +1317,16 @@ xfs_file_fallocate(
 
        switch (mode & FALLOC_FL_MODE_MASK) {
        case FALLOC_FL_PUNCH_HOLE:
-               error = xfs_free_file_space(ip, offset, len);
+               error = xfs_free_file_space(ip, offset, len, &ac);
                break;
        case FALLOC_FL_COLLAPSE_RANGE:
-               error = xfs_falloc_collapse_range(file, offset, len);
+               error = xfs_falloc_collapse_range(file, offset, len, &ac);
                break;
        case FALLOC_FL_INSERT_RANGE:
                error = xfs_falloc_insert_range(file, offset, len);
                break;
        case FALLOC_FL_ZERO_RANGE:
-               error = xfs_falloc_zero_range(file, mode, offset, len);
+               error = xfs_falloc_zero_range(file, mode, offset, len, &ac);
                break;
        case FALLOC_FL_UNSHARE_RANGE:
                error = xfs_falloc_unshare_range(file, mode, offset, len);
@@ -1146,6 +1344,8 @@ xfs_file_fallocate(
 
 out_unlock:
        xfs_iunlock(ip, iolock);
+       if (xfs_is_zoned_inode(ip))
+               xfs_zoned_space_unreserve(ip, &ac);
        return error;
 }
 
@@ -1449,8 +1649,24 @@ xfs_write_fault(
        struct inode            *inode = file_inode(vmf->vma->vm_file);
        struct xfs_inode        *ip = XFS_I(inode);
        unsigned int            lock_mode = XFS_MMAPLOCK_SHARED;
+       struct xfs_zone_alloc_ctx ac;
+       int                     error;
        vm_fault_t              ret;
 
+       if (xfs_is_zoned_inode(ip)) {
+               /*
+                * This could over-allocate as it doesn't check for truncation.
+                * But as the overallocation is limited to less than a folio and
+                * will be released instantly, that's just fine.
+                */
+               unsigned int    len = folio_size(page_folio(vmf->page));
+
+               error = xfs_zoned_space_reserve(ip,
+                               XFS_B_TO_FSB(ip->i_mount, len), 0, &ac);
+               if (error < 0)
+                       return vmf_fs_error(error);
+       }
+
        sb_start_pagefault(inode->i_sb);
        file_update_time(vmf->vma->vm_file);
 
@@ -1470,10 +1686,12 @@ xfs_write_fault(
                ret = xfs_dax_fault_locked(vmf, order, true);
        else
                ret = iomap_page_mkwrite(vmf, &xfs_page_mkwrite_iomap_ops,
-                               NULL);
+                               xfs_is_zoned_inode(ip) ? &ac : NULL);
        xfs_iunlock(ip, lock_mode);
 
        sb_end_pagefault(inode->i_sb);
+       if (xfs_is_zoned_inode(ip))
+               xfs_zoned_space_unreserve(ip, &ac);
        return ret;
 }
 
index 84d81f29222dbe987d0fce1f765ee514a7e51ce3..342b34e437f829a96a435aaff0a42ad428e25871 100644 (file)
@@ -2073,7 +2073,7 @@ xfs_inodegc_want_queue_rt_file(
 {
        struct xfs_mount        *mp = ip->i_mount;
 
-       if (!XFS_IS_REALTIME_INODE(ip))
+       if (!XFS_IS_REALTIME_INODE(ip) || xfs_has_zoned(mp))
                return false;
 
        if (xfs_compare_freecounter(mp, FREE_RTEXTENTS,
index 0ece40d38f4bd02cbd97d5d5d59e2798339131f9..254c59ae77d096abb95c7062fd9003ebf214ff59 100644 (file)
@@ -605,7 +605,8 @@ int xfs_break_layouts(struct inode *inode, uint *iolock,
 
 static inline void xfs_update_stable_writes(struct xfs_inode *ip)
 {
-       if (bdev_stable_writes(xfs_inode_buftarg(ip)->bt_bdev))
+       if (xfs_is_zoned_inode(ip) ||
+           bdev_stable_writes(xfs_inode_buftarg(ip)->bt_bdev))
                mapping_set_stable_writes(VFS_I(ip)->i_mapping);
        else
                mapping_clear_stable_writes(VFS_I(ip)->i_mapping);
index 4e25748b682279e6f185a1dd1ba892c63dd99f80..65b222281169ad6ec169c03f9d4ffde8a5856c6e 100644 (file)
@@ -41,6 +41,7 @@
 #include "xfs_exchrange.h"
 #include "xfs_handle.h"
 #include "xfs_rtgroup.h"
+#include "xfs_zone_alloc.h"
 
 #include <linux/mount.h>
 #include <linux/fileattr.h>
index e6854001b35487f05353f0165240ebf5b5bac633..8f24a2a357b571285e97cf3a60560d9ef741d588 100644 (file)
@@ -30,6 +30,8 @@
 #include "xfs_reflink.h"
 #include "xfs_health.h"
 #include "xfs_rtbitmap.h"
+#include "xfs_icache.h"
+#include "xfs_zone_alloc.h"
 
 #define XFS_ALLOC_ALIGN(mp, off) \
        (((off) >> mp->m_allocsize_log) << mp->m_allocsize_log)
@@ -170,17 +172,6 @@ xfs_hole_to_iomap(
        iomap->dax_dev = target->bt_daxdev;
 }
 
-static inline xfs_fileoff_t
-xfs_iomap_end_fsb(
-       struct xfs_mount        *mp,
-       loff_t                  offset,
-       loff_t                  count)
-{
-       ASSERT(offset <= mp->m_super->s_maxbytes);
-       return min(XFS_B_TO_FSB(mp, offset + count),
-                  XFS_B_TO_FSB(mp, mp->m_super->s_maxbytes));
-}
-
 static xfs_extlen_t
 xfs_eof_alignment(
        struct xfs_inode        *ip)
@@ -963,6 +954,60 @@ const struct iomap_ops xfs_direct_write_iomap_ops = {
        .iomap_begin            = xfs_direct_write_iomap_begin,
 };
 
+#ifdef CONFIG_XFS_RT
+/*
+ * This is really simple.  The space has already been reserved before taking the
+ * IOLOCK; the actual block allocation is done just before submitting the bio
+ * and only recorded in the extent map on I/O completion.
+ */
+static int
+xfs_zoned_direct_write_iomap_begin(
+       struct inode            *inode,
+       loff_t                  offset,
+       loff_t                  length,
+       unsigned                flags,
+       struct iomap            *iomap,
+       struct iomap            *srcmap)
+{
+       struct xfs_inode        *ip = XFS_I(inode);
+       int                     error;
+
+       ASSERT(!(flags & IOMAP_OVERWRITE_ONLY));
+
+       /*
+        * NOWAIT support needs to be pushed down into the allocator so that
+        * writes confined to a single zone can be supported without blocking.
+        */
+       if (flags & IOMAP_NOWAIT)
+               return -EAGAIN;
+
+       /*
+        * Ensure the extent list is in memory so that we don't have to
+        * read it in from the I/O completion handler.
+        */
+       if (xfs_need_iread_extents(&ip->i_df)) {
+               xfs_ilock(ip, XFS_ILOCK_EXCL);
+               error = xfs_iread_extents(NULL, ip, XFS_DATA_FORK);
+               xfs_iunlock(ip, XFS_ILOCK_EXCL);
+               if (error)
+                       return error;
+       }
+
+       iomap->type = IOMAP_MAPPED;
+       iomap->flags = IOMAP_F_DIRTY;
+       iomap->bdev = ip->i_mount->m_rtdev_targp->bt_bdev;
+       iomap->offset = offset;
+       iomap->length = length;
+       iomap->flags |= IOMAP_F_ZONE_APPEND;
+       iomap->addr = 0;
+       return 0;
+}
+
+const struct iomap_ops xfs_zoned_direct_write_iomap_ops = {
+       .iomap_begin            = xfs_zoned_direct_write_iomap_begin,
+};
+#endif /* CONFIG_XFS_RT */
+
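
The comment above describes the zoned direct write ordering: space is reserved before the IOLOCK is taken, the on-disk location is only chosen when the bio is built, and the extent map is updated at I/O completion.  A simplified userspace sketch of that out-of-place pipeline follows; all toy_* names are invented for illustration and none of this is the kernel API:

#include <stdint.h>
#include <stdio.h>

/* Toy model of the zoned write pipeline described above. */
struct toy_zone { uint64_t write_pointer; };
struct toy_mapping { uint64_t file_block, disk_block, len; };

static int toy_reserve(uint64_t *available, uint64_t len)
{
        if (*available < len)
                return -1;      /* the kernel would wait for GC here */
        *available -= len;
        return 0;
}

/* The physical location is only chosen at submission time. */
static uint64_t toy_submit(struct toy_zone *z, uint64_t len)
{
        uint64_t start = z->write_pointer;

        z->write_pointer += len;
        return start;
}

/* Only at completion is the mapping recorded in the (toy) extent map. */
static void toy_complete(struct toy_mapping *m, uint64_t file_block,
                         uint64_t disk_block, uint64_t len)
{
        m->file_block = file_block;
        m->disk_block = disk_block;
        m->len = len;
}

int main(void)
{
        uint64_t available = 1024;
        struct toy_zone z = { 0 };
        struct toy_mapping m;

        if (toy_reserve(&available, 8))
                return 1;
        toy_complete(&m, 0, toy_submit(&z, 8), 8);
        printf("file block 0 maps to disk block %llu for %llu blocks\n",
               (unsigned long long)m.disk_block, (unsigned long long)m.len);
        return 0;
}
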
 static int
 xfs_dax_write_iomap_end(
        struct inode            *inode,
@@ -990,6 +1035,176 @@ const struct iomap_ops xfs_dax_write_iomap_ops = {
        .iomap_end      = xfs_dax_write_iomap_end,
 };
 
+static int
+xfs_zoned_buffered_write_iomap_begin(
+       struct inode            *inode,
+       loff_t                  offset,
+       loff_t                  count,
+       unsigned                flags,
+       struct iomap            *iomap,
+       struct iomap            *srcmap)
+{
+       struct iomap_iter       *iter =
+               container_of(iomap, struct iomap_iter, iomap);
+       struct xfs_zone_alloc_ctx *ac = iter->private;
+       struct xfs_inode        *ip = XFS_I(inode);
+       struct xfs_mount        *mp = ip->i_mount;
+       xfs_fileoff_t           offset_fsb = XFS_B_TO_FSBT(mp, offset);
+       xfs_fileoff_t           end_fsb = xfs_iomap_end_fsb(mp, offset, count);
+       u16                     iomap_flags = IOMAP_F_SHARED;
+       unsigned int            lockmode = XFS_ILOCK_EXCL;
+       xfs_filblks_t           count_fsb;
+       xfs_extlen_t            indlen;
+       struct xfs_bmbt_irec    got;
+       struct xfs_iext_cursor  icur;
+       int                     error = 0;
+
+       ASSERT(!xfs_get_extsz_hint(ip));
+       ASSERT(!(flags & IOMAP_UNSHARE));
+       ASSERT(ac);
+
+       if (xfs_is_shutdown(mp))
+               return -EIO;
+
+       error = xfs_qm_dqattach(ip);
+       if (error)
+               return error;
+
+       error = xfs_ilock_for_iomap(ip, flags, &lockmode);
+       if (error)
+               return error;
+
+       if (XFS_IS_CORRUPT(mp, !xfs_ifork_has_extents(&ip->i_df)) ||
+           XFS_TEST_ERROR(false, mp, XFS_ERRTAG_BMAPIFORMAT)) {
+               xfs_bmap_mark_sick(ip, XFS_DATA_FORK);
+               error = -EFSCORRUPTED;
+               goto out_unlock;
+       }
+
+       XFS_STATS_INC(mp, xs_blk_mapw);
+
+       error = xfs_iread_extents(NULL, ip, XFS_DATA_FORK);
+       if (error)
+               goto out_unlock;
+
+       /*
+        * For zeroing operations check if there is any data to zero first.
+        *
+        * For regular writes we always need to allocate new blocks, but need to
+        * provide the source mapping when the range is unaligned to support
+        * read-modify-write of the whole block in the page cache.
+        *
+        * In either case we need to limit the reported range to the boundaries
+        * of the source map in the data fork.
+        */
+       if (!IS_ALIGNED(offset, mp->m_sb.sb_blocksize) ||
+           !IS_ALIGNED(offset + count, mp->m_sb.sb_blocksize) ||
+           (flags & IOMAP_ZERO)) {
+               struct xfs_bmbt_irec    smap;
+               struct xfs_iext_cursor  scur;
+
+               if (!xfs_iext_lookup_extent(ip, &ip->i_df, offset_fsb, &scur,
+                               &smap))
+                       smap.br_startoff = end_fsb; /* fake hole until EOF */
+               if (smap.br_startoff > offset_fsb) {
+                       /*
+                        * We never need to allocate blocks for zeroing a hole.
+                        */
+                       if (flags & (IOMAP_UNSHARE | IOMAP_ZERO)) {
+                               xfs_hole_to_iomap(ip, iomap, offset_fsb,
+                                               smap.br_startoff);
+                               goto out_unlock;
+                       }
+                       end_fsb = min(end_fsb, smap.br_startoff);
+               } else {
+                       end_fsb = min(end_fsb,
+                               smap.br_startoff + smap.br_blockcount);
+                       xfs_trim_extent(&smap, offset_fsb,
+                                       end_fsb - offset_fsb);
+                       error = xfs_bmbt_to_iomap(ip, srcmap, &smap, flags, 0,
+                                       xfs_iomap_inode_sequence(ip, 0));
+                       if (error)
+                               goto out_unlock;
+               }
+       }
+
+       if (!ip->i_cowfp)
+               xfs_ifork_init_cow(ip);
+
+       if (!xfs_iext_lookup_extent(ip, ip->i_cowfp, offset_fsb, &icur, &got))
+               got.br_startoff = end_fsb;
+       if (got.br_startoff <= offset_fsb) {
+               trace_xfs_reflink_cow_found(ip, &got);
+               goto done;
+       }
+
+       /*
+        * Cap the maximum length to keep the chunks of work done here somewhat
+        * symmetric with the work writeback does.
+        */
+       end_fsb = min(end_fsb, got.br_startoff);
+       count_fsb = min3(end_fsb - offset_fsb, XFS_MAX_BMBT_EXTLEN,
+                        XFS_B_TO_FSB(mp, 1024 * PAGE_SIZE));
+
+       /*
+        * The block reservation is supposed to cover all blocks that the
+        * operation could possibly write, but there is a nasty corner case
+        * where blocks could be stolen from underneath us:
+        *
+        *  1) while this thread iterates over a larger buffered write,
+        *  2) another thread is causing a write fault that calls into
+        *     ->page_mkwrite in a range this thread writes to, using up the
+        *     delalloc reservation created by a previous call to this function.
+        *  3) another thread does direct I/O on the range that the write fault
+        *     happened on, which causes writeback of the dirty data.
+        *  4) this then sets the stale flag, which cuts the current iomap
+        *     iteration short, causing the new call to ->iomap_begin that gets
+        *     us here again, but now without a sufficient reservation.
+        *
+        * This is a very unusual I/O pattern, and nothing but generic/095 is
+        * known to hit it. There's not really much we can do here, so turn this
+        * into a short write.
+        */
+       if (count_fsb > ac->reserved_blocks) {
+               xfs_warn_ratelimited(mp,
+"Short write on ino 0x%llx comm %.20s due to three-way race with write fault and direct I/O",
+                       ip->i_ino, current->comm);
+               count_fsb = ac->reserved_blocks;
+               if (!count_fsb) {
+                       error = -EIO;
+                       goto out_unlock;
+               }
+       }
+
+       error = xfs_quota_reserve_blkres(ip, count_fsb);
+       if (error)
+               goto out_unlock;
+
+       indlen = xfs_bmap_worst_indlen(ip, count_fsb);
+       error = xfs_dec_fdblocks(mp, indlen, false);
+       if (error)
+               goto out_unlock;
+       ip->i_delayed_blks += count_fsb;
+       xfs_mod_delalloc(ip, count_fsb, indlen);
+
+       got.br_startoff = offset_fsb;
+       got.br_startblock = nullstartblock(indlen);
+       got.br_blockcount = count_fsb;
+       got.br_state = XFS_EXT_NORM;
+       xfs_bmap_add_extent_hole_delay(ip, XFS_COW_FORK, &icur, &got);
+       ac->reserved_blocks -= count_fsb;
+       iomap_flags |= IOMAP_F_NEW;
+
+       trace_xfs_iomap_alloc(ip, offset, XFS_FSB_TO_B(mp, count_fsb),
+                       XFS_COW_FORK, &got);
+done:
+       error = xfs_bmbt_to_iomap(ip, iomap, &got, flags, iomap_flags,
+                       xfs_iomap_inode_sequence(ip, IOMAP_F_SHARED));
+out_unlock:
+       xfs_iunlock(ip, lockmode);
+       return error;
+}
+
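
The corner case described in the comment above is handled by clamping the mapped length to whatever is left of the caller's reservation, turning the operation into a short write (or an error once the reservation is exhausted).  A minimal model of that fallback, with invented names:

#include <stdint.h>
#include <stdio.h>

struct alloc_ctx { uint64_t reserved_blocks; };

/*
 * Clamp a requested delalloc mapping to the remaining reservation.
 * Returns the number of blocks actually mapped, or 0 if the reservation
 * was fully consumed (the kernel code above returns -EIO in that case).
 */
static uint64_t map_from_reservation(struct alloc_ctx *ac, uint64_t want)
{
        uint64_t got = want;

        if (got > ac->reserved_blocks)
                got = ac->reserved_blocks;      /* short write */
        ac->reserved_blocks -= got;
        return got;
}

int main(void)
{
        struct alloc_ctx ac = { .reserved_blocks = 16 };

        printf("%llu\n", (unsigned long long)map_from_reservation(&ac, 10)); /* 10 */
        printf("%llu\n", (unsigned long long)map_from_reservation(&ac, 10)); /* 6, short */
        printf("%llu\n", (unsigned long long)map_from_reservation(&ac, 10)); /* 0, error */
        return 0;
}
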
 static int
 xfs_buffered_write_iomap_begin(
        struct inode            *inode,
@@ -1016,6 +1231,10 @@ xfs_buffered_write_iomap_begin(
        if (xfs_is_shutdown(mp))
                return -EIO;
 
+       if (xfs_is_zoned_inode(ip))
+               return xfs_zoned_buffered_write_iomap_begin(inode, offset,
+                               count, flags, iomap, srcmap);
+
        /* we can't use delayed allocations when using extent size hints */
        if (xfs_get_extsz_hint(ip))
                return xfs_direct_write_iomap_begin(inode, offset, count,
@@ -1248,10 +1467,13 @@ xfs_buffered_write_delalloc_punch(
        loff_t                  length,
        struct iomap            *iomap)
 {
+       struct iomap_iter       *iter =
+               container_of(iomap, struct iomap_iter, iomap);
+
        xfs_bmap_punch_delalloc_range(XFS_I(inode),
                        (iomap->flags & IOMAP_F_SHARED) ?
                                XFS_COW_FORK : XFS_DATA_FORK,
-                       offset, offset + length);
+                       offset, offset + length, iter->private);
 }
 
 static int
@@ -1486,12 +1708,13 @@ const struct iomap_ops xfs_xattr_iomap_ops = {
 
 int
 xfs_zero_range(
-       struct xfs_inode        *ip,
-       loff_t                  pos,
-       loff_t                  len,
-       bool                    *did_zero)
+       struct xfs_inode                *ip,
+       loff_t                          pos,
+       loff_t                          len,
+       struct xfs_zone_alloc_ctx       *ac,
+       bool                            *did_zero)
 {
-       struct inode            *inode = VFS_I(ip);
+       struct inode                    *inode = VFS_I(ip);
 
        xfs_assert_ilocked(ip, XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL);
 
@@ -1499,13 +1722,14 @@ xfs_zero_range(
                return dax_zero_range(inode, pos, len, did_zero,
                                      &xfs_dax_write_iomap_ops);
        return iomap_zero_range(inode, pos, len, did_zero,
-                               &xfs_buffered_write_iomap_ops, NULL);
+                               &xfs_buffered_write_iomap_ops, ac);
 }
 
 int
 xfs_truncate_page(
        struct xfs_inode        *ip,
        loff_t                  pos,
+       struct xfs_zone_alloc_ctx *ac,
        bool                    *did_zero)
 {
        struct inode            *inode = VFS_I(ip);
@@ -1514,5 +1738,5 @@ xfs_truncate_page(
                return dax_truncate_page(inode, pos, did_zero,
                                        &xfs_dax_write_iomap_ops);
        return iomap_truncate_page(inode, pos, did_zero,
-                                  &xfs_buffered_write_iomap_ops, NULL);
+                                  &xfs_buffered_write_iomap_ops, ac);
 }
index 4da13440bae9bd7e67f5044b8bfc3544d57e8be9..e9ddb5a1007e457f562796533f63092818ba3a6f 100644 (file)
@@ -10,6 +10,7 @@
 
 struct xfs_inode;
 struct xfs_bmbt_irec;
+struct xfs_zone_alloc_ctx;
 
 int xfs_iomap_write_direct(struct xfs_inode *ip, xfs_fileoff_t offset_fsb,
                xfs_fileoff_t count_fsb, unsigned int flags,
@@ -24,8 +25,20 @@ int xfs_bmbt_to_iomap(struct xfs_inode *ip, struct iomap *iomap,
                u16 iomap_flags, u64 sequence_cookie);
 
 int xfs_zero_range(struct xfs_inode *ip, loff_t pos, loff_t len,
-               bool *did_zero);
-int xfs_truncate_page(struct xfs_inode *ip, loff_t pos, bool *did_zero);
+               struct xfs_zone_alloc_ctx *ac, bool *did_zero);
+int xfs_truncate_page(struct xfs_inode *ip, loff_t pos,
+               struct xfs_zone_alloc_ctx *ac, bool *did_zero);
+
+static inline xfs_fileoff_t
+xfs_iomap_end_fsb(
+       struct xfs_mount        *mp,
+       loff_t                  offset,
+       loff_t                  count)
+{
+       ASSERT(offset <= mp->m_super->s_maxbytes);
+       return min(XFS_B_TO_FSB(mp, offset + count),
+                  XFS_B_TO_FSB(mp, mp->m_super->s_maxbytes));
+}
 
 static inline xfs_filblks_t
 xfs_aligned_fsb_count(
@@ -50,6 +63,7 @@ xfs_aligned_fsb_count(
 extern const struct iomap_ops xfs_buffered_write_iomap_ops;
 extern const struct iomap_ops xfs_page_mkwrite_iomap_ops;
 extern const struct iomap_ops xfs_direct_write_iomap_ops;
+extern const struct iomap_ops xfs_zoned_direct_write_iomap_ops;
 extern const struct iomap_ops xfs_read_iomap_ops;
 extern const struct iomap_ops xfs_seek_iomap_ops;
 extern const struct iomap_ops xfs_xattr_iomap_ops;
index 66a726a5fbbba2854479e4cf8674a3a8188f47e8..31c5e9d58e535daae20dd355b27a31eae401ffbe 100644 (file)
@@ -29,6 +29,7 @@
 #include "xfs_xattr.h"
 #include "xfs_file.h"
 #include "xfs_bmap.h"
+#include "xfs_zone_alloc.h"
 
 #include <linux/posix_acl.h>
 #include <linux/security.h>
@@ -822,6 +823,7 @@ xfs_setattr_size(
        uint                    lock_flags = 0;
        uint                    resblks = 0;
        bool                    did_zeroing = false;
+       struct xfs_zone_alloc_ctx ac;
 
        xfs_assert_ilocked(ip, XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL);
        ASSERT(S_ISREG(inode->i_mode));
@@ -857,6 +859,28 @@ xfs_setattr_size(
         */
        inode_dio_wait(inode);
 
+       /*
+        * Normally xfs_zoned_space_reserve is supposed to be called outside the
+        * IOLOCK.  But for truncate we can't do that since ->setattr is called
+        * with it already held by the VFS.  So for now chicken out and try to
+        * allocate space under it.
+        *
+        * To avoid deadlocks this means we can't block waiting for space, which
+        * can lead to spurious -ENOSPC if there are no directly available
+        * blocks.  We mitigate this a bit by allowing zeroing to dip into the
+        * reserved pool, but eventually the VFS calling convention needs to
+        * change.
+        */
+       if (xfs_is_zoned_inode(ip)) {
+               error = xfs_zoned_space_reserve(ip, 1,
+                               XFS_ZR_NOWAIT | XFS_ZR_RESERVED, &ac);
+               if (error) {
+                       if (error == -EAGAIN)
+                               return -ENOSPC;
+                       return error;
+               }
+       }
+
        /*
         * File data changes must be complete before we start the transaction to
         * modify the inode.  This needs to be done before joining the inode to
@@ -870,11 +894,14 @@ xfs_setattr_size(
        if (newsize > oldsize) {
                trace_xfs_zero_eof(ip, oldsize, newsize - oldsize);
                error = xfs_zero_range(ip, oldsize, newsize - oldsize,
-                               &did_zeroing);
+                               &ac, &did_zeroing);
        } else {
-               error = xfs_truncate_page(ip, newsize, &did_zeroing);
+               error = xfs_truncate_page(ip, newsize, &ac, &did_zeroing);
        }
 
+       if (xfs_is_zoned_inode(ip))
+               xfs_zoned_space_unreserve(ip, &ac);
+
        if (error)
                return error;
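
As the comment above explains, the truncate path reserves space without blocking because the iolock is already held, and a reservation that would have to wait is reported as ENOSPC instead.  A tiny sketch of that error mapping (illustrative only, not the kernel helper):

#include <errno.h>
#include <stdio.h>

/* Map a non-blocking reservation failure to ENOSPC rather than waiting. */
static int toy_reserve_for_truncate(int reserve_result)
{
        if (reserve_result == -EAGAIN)
                return -ENOSPC;
        return reserve_result;
}

int main(void)
{
        printf("%d\n", toy_reserve_for_truncate(0));            /* 0 */
        printf("%d\n", toy_reserve_for_truncate(-EAGAIN));      /* -ENOSPC */
        return 0;
}
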
 
index 26b2f5887b88193175e8abafa12b17a9bd370f31..d64a95f126f97862db32537c9020919e790dd40a 100644 (file)
@@ -3531,6 +3531,12 @@ xlog_force_shutdown(
        spin_unlock(&log->l_icloglock);
 
        wake_up_var(&log->l_opstate);
+
+       if (xfs_has_zoned(log->l_mp) && IS_ENABLED(CONFIG_XFS_RT)) {
+               spin_lock(&log->l_mp->m_zone_list_lock);
+               wake_up_all(&log->l_mp->m_zone_wait);
+               spin_unlock(&log->l_mp->m_zone_list_lock);
+       }
        return log_error;
 }
 
index 4d8cbf173f60ec4b1ad7eb4fff9fb750e6a58342..03ecdda1d34b827c9411409901146ed0c444fc52 100644 (file)
@@ -40,6 +40,8 @@
 #include "xfs_rtrmap_btree.h"
 #include "xfs_rtrefcount_btree.h"
 #include "scrub/stats.h"
+#include "xfs_rtbitmap.h"
+#include "xfs_zone_alloc.h"
 
 static DEFINE_MUTEX(xfs_uuid_table_mutex);
 static int xfs_uuid_table_size;
@@ -469,22 +471,26 @@ xfs_default_resblks(
        struct xfs_mount        *mp,
        unsigned int            idx)
 {
-       uint64_t resblks;
-
-       if (idx == FREE_RTEXTENTS)
-               return 0;
+       switch (idx) {
+       case FREE_BLOCKS:
+               /*
+                * We default to 5% or 8192 FSBs of space reserved, whichever is
+                * smaller.
+                *
+                * This is intended to cover concurrent allocation transactions
+                * when we initially hit ENOSPC.  These each require a 4 block
+                * reservation. Hence by default we cover roughly 2000
+                * concurrent allocation reservations.
+                */
+               return min(div_u64(mp->m_sb.sb_dblocks, 20), 8192ULL);
+       case FREE_RTEXTENTS:
+       case FREE_RTAVAILABLE:
+               if (!IS_ENABLED(CONFIG_XFS_RT) || !xfs_has_zoned(mp))
+                       break;
+               return xfs_zoned_default_resblks(mp, idx);
+       }
 
-       /*
-        * We default to 5% or 8192 fsbs of space reserved, whichever is
-        * smaller.  This is intended to cover concurrent allocation
-        * transactions when we initially hit enospc. These each require a 4
-        * block reservation. Hence by default we cover roughly 2000 concurrent
-        * allocation reservations.
-        */
-       resblks = mp->m_sb.sb_dblocks;
-       do_div(resblks, 20);
-       resblks = min_t(uint64_t, resblks, 8192);
-       return resblks;
+       return 0;
 }
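
To make the FREE_BLOCKS default concrete (numbers chosen purely for illustration): a 10 GiB data device with 4 KiB blocks has 2,621,440 blocks, 5% of which is 131,072, so the 8192-block cap applies; a 100 MiB device has 25,600 blocks and gets the smaller 1,280-block reservation.  A minimal model of the same min(5%, 8192) rule:

#include <stdint.h>
#include <stdio.h>

/* Same rule as the FREE_BLOCKS case above: 5% of the data device blocks,
 * capped at 8192 file system blocks. */
static uint64_t default_resblks(uint64_t dblocks)
{
        uint64_t five_percent = dblocks / 20;

        return five_percent < 8192 ? five_percent : 8192;
}

int main(void)
{
        printf("%llu\n", (unsigned long long)default_resblks(2621440)); /* 8192 */
        printf("%llu\n", (unsigned long long)default_resblks(25600));   /* 1280 */
        return 0;
}
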
 
 /* Ensure the summary counts are correct. */
@@ -551,7 +557,7 @@ xfs_check_summary_counts(
         * If we're mounting the rt volume after recovering the log, recompute
         * frextents from the rtbitmap file to fix the inconsistency.
         */
-       if (xfs_has_realtime(mp) && !xfs_is_clean(mp)) {
+       if (xfs_has_realtime(mp) && !xfs_has_zoned(mp) && !xfs_is_clean(mp)) {
                error = xfs_rtalloc_reinit_frextents(mp);
                if (error)
                        return error;
@@ -1078,6 +1084,9 @@ xfs_mountfs(
                        goto out_agresv;
        }
 
+       if (!xfs_is_readonly(mp))
+               xfs_zone_gc_start(mp);
+
        return 0;
 
  out_agresv:
@@ -1162,6 +1171,8 @@ xfs_unmountfs(
        xfs_inodegc_flush(mp);
 
        xfs_blockgc_stop(mp);
+       if (!test_bit(XFS_OPSTATE_READONLY, &mp->m_opstate))
+               xfs_zone_gc_stop(mp);
        xfs_fs_unreserve_ag_blocks(mp);
        xfs_qm_unmount_quotas(mp);
        xfs_rtunmount_inodes(mp);
@@ -1247,7 +1258,7 @@ xfs_freecounter_unavailable(
        struct xfs_mount        *mp,
        unsigned int            idx)
 {
-       if (idx == FREE_RTEXTENTS)
+       if (idx == FREE_RTEXTENTS || idx == FREE_RTAVAILABLE)
                return 0;
        return mp->m_alloc_set_aside + atomic64_read(&mp->m_allocbt_blks);
 }
@@ -1345,7 +1356,9 @@ xfs_dec_freecounter(
                spin_unlock(&mp->m_sb_lock);
                return 0;
        }
-       xfs_warn_once(mp,
+
+       if (idx == FREE_BLOCKS)
+               xfs_warn_once(mp,
 "Reserve blocks depleted! Consider increasing reserve pool size.");
 
 fdblocks_enospc:
index c0f870130fbb8278d1fc672a59181573ae9d2a83..c7ae70fde64d511c0ceb7f85cc4233f3d4d78362 100644 (file)
@@ -108,6 +108,7 @@ struct xfs_groups {
 enum {
        FREE_BLOCKS,            /* free block counter */
        FREE_RTEXTENTS,         /* free rt extent counter */
+       FREE_RTAVAILABLE,       /* actually available rt extents */
        FREE_NR,
 };
 
@@ -255,7 +256,20 @@ typedef struct xfs_mount {
                uint64_t        avail;          /* available reserved blocks */
                uint64_t        save;           /* reserved blks @ remount,ro */
        } m_resblks[FREE_NR];
+       struct list_head        m_free_zones;
+       struct list_head        m_open_zones;
+       atomic_t                m_nr_free_zones;
+       unsigned int            m_nr_open_zones;
+       unsigned int            m_max_open_zones;
+       uint64_t                m_zoned_op;
+       struct list_head        m_emptied_zones;
+       spinlock_t              m_zone_list_lock;
+       wait_queue_head_t       m_zone_wait;
+       struct xfs_rtgroup      *m_open_gc_zone;
        struct delayed_work     m_reclaim_work; /* background inode reclaim */
+       spinlock_t              m_reservation_lock;
+       struct list_head        m_reclaim_reservations;
+       struct task_struct      *m_zone_gc_thread;
        struct dentry           *m_debugfs;     /* debugfs parent */
        struct xfs_kobj         m_kobj;
        struct xfs_kobj         m_error_kobj;
@@ -428,6 +442,11 @@ static inline bool xfs_has_rtreflink(const struct xfs_mount *mp)
               xfs_has_reflink(mp);
 }
 
+static inline bool xfs_has_nonzoned(const struct xfs_mount *mp)
+{
+       return !xfs_has_zoned(mp);
+}
+
 /*
  * Some features are always on for v5 file systems, allow the compiler to
  * eliminiate dead code when building without v4 support.
@@ -531,6 +550,8 @@ __XFS_HAS_FEAT(nouuid, NOUUID)
 #define XFS_OPSTATE_WARNED_METADIR     17
 /* Filesystem should use qflags to determine quotaon status */
 #define XFS_OPSTATE_RESUMING_QUOTAON   18
+/* (Zoned) GC is in progress */
+#define XFS_OPSTATE_IN_GC              19
 
 #define __XFS_IS_OPSTATE(name, NAME) \
 static inline bool xfs_is_ ## name (struct xfs_mount *mp) \
@@ -575,6 +596,7 @@ static inline bool xfs_clear_resuming_quotaon(struct xfs_mount *mp)
 #endif /* CONFIG_XFS_QUOTA */
 __XFS_IS_OPSTATE(done_with_log_incompat, UNSET_LOG_INCOMPAT)
 __XFS_IS_OPSTATE(using_logged_xattrs, USE_LARP)
+__XFS_IS_OPSTATE(in_gc, IN_GC)
 
 static inline bool
 xfs_should_warn(struct xfs_mount *mp, long nr)
index 304cf53b0908ece468058f647e61cf0cecac038b..346a8e4d1a48aefd1a4991e7aaac0c82506ff8b6 100644 (file)
@@ -34,6 +34,7 @@
 #include "xfs_rtalloc.h"
 #include "xfs_rtgroup.h"
 #include "xfs_metafile.h"
+#include "xfs_zone_alloc.h"
 
 /*
  * Copy on Write of Shared Blocks
@@ -965,12 +966,137 @@ xfs_reflink_end_cow(
         */
        while (end_fsb > offset_fsb && !error)
                error = xfs_reflink_end_cow_extent(ip, &offset_fsb, end_fsb);
-
        if (error)
                trace_xfs_reflink_end_cow_error(ip, error, _RET_IP_);
        return error;
 }
 
+#ifdef CONFIG_XFS_RT
+static int
+xfs_zoned_end_extent(
+       struct xfs_trans        *tp,
+       struct xfs_inode        *ip,
+       struct xfs_bmbt_irec    *new,
+       xfs_fsblock_t           old_startblock)
+{
+       struct xfs_bmbt_irec    data;
+       int                     nmaps = 1;
+       int                     error;
+
+       /* Grab the corresponding mapping in the data fork. */
+       error = xfs_bmapi_read(ip, new->br_startoff, new->br_blockcount, &data,
+                              &nmaps, 0);
+       if (error)
+               return error;
+
+       /*
+        * Cap the update to the existing extent in the data fork because we can
+        * only overwrite one extent at a time.
+        */
+       ASSERT(new->br_blockcount >= data.br_blockcount);
+       new->br_blockcount = data.br_blockcount;
+
+       /*
+        * If a data write raced with this GC write, keep the existing data in
+        * the data fork, mark our newly written GC extent as reclaimable, then
+        * move on to the next extent.
+        */
+       if (old_startblock != NULLFSBLOCK &&
+           old_startblock != data.br_startblock)
+               goto skip;
+
+       trace_xfs_reflink_cow_remap_from(ip, new);
+       trace_xfs_reflink_cow_remap_to(ip, &data);
+
+       error = xfs_iext_count_extend(tp, ip, XFS_DATA_FORK,
+                       XFS_IEXT_REFLINK_END_COW_CNT);
+       if (error)
+               return error;
+
+       if (data.br_startblock != HOLESTARTBLOCK) {
+               ASSERT(data.br_startblock != DELAYSTARTBLOCK);
+               ASSERT(!isnullstartblock(data.br_startblock));
+
+               xfs_bmap_unmap_extent(tp, ip, XFS_DATA_FORK, &data);
+               if (xfs_is_reflink_inode(ip)) {
+                       xfs_refcount_decrease_extent(tp, true, &data);
+               } else {
+                       error = xfs_free_extent_later(tp, data.br_startblock,
+                                       data.br_blockcount, NULL,
+                                       XFS_AG_RESV_NONE,
+                                       XFS_FREE_EXTENT_REALTIME);
+                       if (error)
+                               return error;
+               }
+       }
+
+       error = xfs_zone_record_blocks(tp, new->br_startblock,
+                       new->br_blockcount, true);
+       if (error)
+               return error;
+
+       /* Map the new blocks into the data fork. */
+       xfs_bmap_map_extent(tp, ip, XFS_DATA_FORK, new);
+       return 0;
+
+skip:
+       trace_xfs_reflink_cow_remap_skip(ip, new);
+       return xfs_zone_record_blocks(tp, new->br_startblock,
+                       new->br_blockcount, false);
+}
+
+int
+xfs_zoned_end_io(
+       struct xfs_inode                *ip,
+       xfs_off_t                       offset,
+       xfs_off_t                       count,
+       xfs_daddr_t                     daddr,
+       xfs_fsblock_t                   old_startblock)
+{
+       struct xfs_mount        *mp = ip->i_mount;
+       xfs_fileoff_t           end_fsb = XFS_B_TO_FSB(mp, offset + count);
+       struct xfs_bmbt_irec    new = {
+               .br_startoff    = XFS_B_TO_FSBT(mp, offset),
+               .br_startblock  = xfs_daddr_to_rtb(mp, daddr),
+               .br_state       = XFS_EXT_NORM,
+       };
+       unsigned int            resblks =
+               XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK);
+       struct xfs_trans        *tp;
+       int                     error;
+
+       if (xfs_is_shutdown(mp))
+               return -EIO;
+
+       while (new.br_startoff < end_fsb) {
+               new.br_blockcount = end_fsb - new.br_startoff;
+
+               error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks, 0,
+                               XFS_TRANS_RESERVE | XFS_TRANS_RES_FDBLKS, &tp);
+               if (error)
+                       return error;
+               xfs_ilock(ip, XFS_ILOCK_EXCL);
+               xfs_trans_ijoin(tp, ip, 0);
+
+               error = xfs_zoned_end_extent(tp, ip, &new, old_startblock);
+               if (error)
+                       xfs_trans_cancel(tp);
+               else
+                       error = xfs_trans_commit(tp);
+               xfs_iunlock(ip, XFS_ILOCK_EXCL);
+               if (error)
+                       return error;
+
+               new.br_startoff += new.br_blockcount;
+               new.br_startblock += new.br_blockcount;
+               if (old_startblock != NULLFSBLOCK)
+                       old_startblock += new.br_blockcount;
+       }
+
+       return 0;
+}
+#endif /* CONFIG_XFS_RT */
+
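
xfs_zoned_end_io above walks the completed byte range one data-fork extent at a time, committing a transaction per step and advancing the new (and, for GC writes, the old) start block by the amount just remapped.  A stripped-down model of that loop, with a made-up extent lookup standing in for the data fork:

#include <stdint.h>
#include <stdio.h>

/* Pretend the data fork answers "how long is the extent at this offset?". */
static uint64_t extent_len_at(uint64_t off)
{
        return off < 8 ? 8 - off : 4;   /* two extents: [0,8) and [8,12) */
}

int main(void)
{
        uint64_t off = 0, end = 12, disk = 100;

        while (off < end) {
                uint64_t len = extent_len_at(off);

                if (len > end - off)
                        len = end - off;
                printf("remap file [%llu,%llu) -> disk %llu\n",
                       (unsigned long long)off,
                       (unsigned long long)(off + len),
                       (unsigned long long)disk);
                off += len;     /* advance file and disk side by the remapped amount */
                disk += len;
        }
        return 0;
}
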
 /*
  * Free all CoW staging blocks that are still referenced by the ondisk refcount
  * metadata.  The ondisk metadata does not track which inode created the
@@ -1538,7 +1664,7 @@ xfs_reflink_zero_posteof(
                return 0;
 
        trace_xfs_zero_eof(ip, isize, pos - isize);
-       return xfs_zero_range(ip, isize, pos - isize, NULL);
+       return xfs_zero_range(ip, isize, pos - isize, NULL, NULL);
 }
 
 /*
index cc4e92278279b6231135512428f4a359689ebb22..7c5c06ce177ab000da01eb3fa08fc0ec4ffd5caa 100644 (file)
@@ -41,8 +41,10 @@ extern int xfs_reflink_cancel_cow_blocks(struct xfs_inode *ip,
                xfs_fileoff_t end_fsb, bool cancel_real);
 extern int xfs_reflink_cancel_cow_range(struct xfs_inode *ip, xfs_off_t offset,
                xfs_off_t count, bool cancel_real);
-extern int xfs_reflink_end_cow(struct xfs_inode *ip, xfs_off_t offset,
+int xfs_reflink_end_cow(struct xfs_inode *ip, xfs_off_t offset,
                xfs_off_t count);
+int xfs_zoned_end_io(struct xfs_inode *ip, xfs_off_t offset, xfs_off_t count,
+               xfs_daddr_t daddr, xfs_fsblock_t old_startblock);
 extern int xfs_reflink_recover_cow(struct xfs_mount *mp);
 extern loff_t xfs_reflink_remap_range(struct file *file_in, loff_t pos_in,
                struct file *file_out, loff_t pos_out, loff_t len,
index b380414940ea38993d24859ab34d18746b81f87d..a3a2423d61d87a01baa5fc0480713004448ea3b1 100644 (file)
@@ -33,6 +33,7 @@
 #include "xfs_trace.h"
 #include "xfs_rtrefcount_btree.h"
 #include "xfs_reflink.h"
+#include "xfs_zone_alloc.h"
 
 /*
  * Return whether there are any free extents in the size range given
@@ -661,6 +662,7 @@ xfs_rtunmount_rtg(
 {
        int                     i;
 
+       list_del_init(&rtg->rtg_entry);
        for (i = 0; i < XFS_RTGI_MAX; i++)
                xfs_rtginode_irele(&rtg->rtg_inodes[i]);
        kvfree(rtg->rtg_rsum_cache);
@@ -1320,6 +1322,8 @@ xfs_growfs_rt(
        if (xfs_has_reflink(mp) &&
            !xfs_reflink_supports_rextsize(mp, in->extsize))
                goto out_unlock;
+       if (xfs_has_zoned(mp))
+               goto out_unlock;
 
        error = xfs_sb_validate_fsb_count(&mp->m_sb, in->newblocks);
        if (error)
@@ -1637,14 +1641,26 @@ xfs_rtmount_inodes(
                error = xfs_rtmount_rtg(mp, tp, rtg);
                if (error) {
                        xfs_rtgroup_rele(rtg);
-                       xfs_rtunmount_inodes(mp);
-                       break;
+                       goto out_rtunmount_rtgs;
                }
        }
 
+       if (xfs_has_zoned(mp)) {
+               error = xfs_mount_zones(mp);
+               if (error)
+                       goto out_rtunmount_rtgs;
+       }
+
 out_cancel:
        xfs_trans_cancel(tp);
        return error;
+
+out_rtunmount_rtgs:
+       rtg = NULL;
+       while ((rtg = xfs_rtgroup_next(mp, rtg)))
+               xfs_rtunmount_rtg(rtg);
+       xfs_rtginode_irele(&mp->m_rtdirip);
+       goto out_cancel;
 }
 
 void
@@ -1653,6 +1669,9 @@ xfs_rtunmount_inodes(
 {
        struct xfs_rtgroup      *rtg = NULL;
 
+       if (xfs_has_zoned(mp))
+               xfs_unmount_zones(mp);
+
        while ((rtg = xfs_rtgroup_next(mp, rtg)))
                xfs_rtunmount_rtg(rtg);
        xfs_rtginode_irele(&mp->m_rtdirip);
@@ -2092,6 +2111,8 @@ xfs_bmap_rtalloc(
                ap->datatype & XFS_ALLOC_INITIAL_USER_DATA;
        int                     error;
 
+       ASSERT(!xfs_has_zoned(ap->tp->t_mountp));
+
 retry:
        error = xfs_rtallocate_align(ap, &ralen, &raminlen, &prod, &noalign);
        if (error)
index 2d1c97b333358244b3143d80add8ddddd4221c18..041c725047e447908a8a563a973a9c53620800fc 100644 (file)
@@ -46,6 +46,7 @@
 #include "xfs_exchmaps_item.h"
 #include "xfs_parent.h"
 #include "xfs_rtalloc.h"
+#include "xfs_zone_alloc.h"
 #include "scrub/stats.h"
 #include "scrub/rcbag_btree.h"
 
@@ -109,7 +110,8 @@ enum {
        Opt_filestreams, Opt_quota, Opt_noquota, Opt_usrquota, Opt_grpquota,
        Opt_prjquota, Opt_uquota, Opt_gquota, Opt_pquota,
        Opt_uqnoenforce, Opt_gqnoenforce, Opt_pqnoenforce, Opt_qnoenforce,
-       Opt_discard, Opt_nodiscard, Opt_dax, Opt_dax_enum,
+       Opt_discard, Opt_nodiscard, Opt_dax, Opt_dax_enum, Opt_max_open_zones,
+       Opt_zoned_op,
 };
 
 static const struct fs_parameter_spec xfs_fs_parameters[] = {
@@ -154,6 +156,8 @@ static const struct fs_parameter_spec xfs_fs_parameters[] = {
        fsparam_flag("nodiscard",       Opt_nodiscard),
        fsparam_flag("dax",             Opt_dax),
        fsparam_enum("dax",             Opt_dax_enum, dax_param_enums),
+       fsparam_u32("max_open_zones",   Opt_max_open_zones),
+       fsparam_u64("zoned_op",         Opt_zoned_op),
        {}
 };
 
@@ -233,6 +237,12 @@ xfs_fs_show_options(
        if (!(mp->m_qflags & XFS_ALL_QUOTA_ACCT))
                seq_puts(m, ",noquota");
 
+       if (mp->m_max_open_zones)
+               seq_printf(m, ",max_open_zones=%d", mp->m_max_open_zones);
+
+       if (mp->m_zoned_op)
+               seq_printf(m, ",zoned_op=%llu", mp->m_zoned_op);
+
        return 0;
 }
 
@@ -882,6 +892,8 @@ xfs_fs_statfs(
                s64     freertx;
 
                statp->f_blocks = sbp->sb_rblocks;
+               if (xfs_has_zoned(mp))
+                       statp->f_blocks -= mp->m_resblks[FREE_RTEXTENTS].total;
                freertx = max_t(int64_t, 0, xfs_sum_freecounter(mp, FREE_RTEXTENTS));
                statp->f_bavail = statp->f_bfree =
                        xfs_rtbxlen_to_blen(mp, freertx);
@@ -1093,7 +1105,9 @@ xfs_reinit_percpu_counters(
        percpu_counter_set(&mp->m_icount, mp->m_sb.sb_icount);
        percpu_counter_set(&mp->m_ifree, mp->m_sb.sb_ifree);
        percpu_counter_set(&mp->m_free[FREE_BLOCKS], mp->m_sb.sb_fdblocks);
-       percpu_counter_set(&mp->m_free[FREE_RTEXTENTS], mp->m_sb.sb_frextents);
+       if (!xfs_has_zoned(mp))
+               percpu_counter_set(&mp->m_free[FREE_RTEXTENTS],
+                               mp->m_sb.sb_frextents);
 }
 
 static void
@@ -1192,6 +1206,18 @@ xfs_fs_shutdown(
        xfs_force_shutdown(XFS_M(sb), SHUTDOWN_DEVICE_REMOVED);
 }
 
+static int
+xfs_fs_show_stats(
+       struct seq_file         *m,
+       struct dentry           *root)
+{
+       struct xfs_mount        *mp = XFS_M(root->d_sb);
+
+       if (xfs_has_zoned(mp) && IS_ENABLED(CONFIG_XFS_RT))
+               xfs_zoned_show_stats(m, mp);
+       return 0;
+}
+
 static const struct super_operations xfs_super_operations = {
        .alloc_inode            = xfs_fs_alloc_inode,
        .destroy_inode          = xfs_fs_destroy_inode,
@@ -1206,6 +1232,7 @@ static const struct super_operations xfs_super_operations = {
        .nr_cached_objects      = xfs_fs_nr_cached_objects,
        .free_cached_objects    = xfs_fs_free_cached_objects,
        .shutdown               = xfs_fs_shutdown,
+       .show_stats             = xfs_fs_show_stats,
 };
 
 static int
@@ -1418,6 +1445,12 @@ xfs_fs_parse_param(
                xfs_fs_warn_deprecated(fc, param, XFS_FEAT_NOATTR2, true);
                parsing_mp->m_features |= XFS_FEAT_NOATTR2;
                return 0;
+       case Opt_max_open_zones:
+               parsing_mp->m_max_open_zones = result.uint_32;
+               return 0;
+       case Opt_zoned_op:
+               parsing_mp->m_zoned_op = result.uint_64;
+               return 0;
        default:
                xfs_warn(parsing_mp, "unknown mount option [%s].", param->key);
                return -EINVAL;
@@ -1758,8 +1791,14 @@ xfs_fs_fill_super(
                mp->m_features &= ~XFS_FEAT_DISCARD;
        }
 
-       if (xfs_has_metadir(mp))
+       if (xfs_has_metadir(mp)) {
                xfs_warn_experimental(mp, XFS_EXPERIMENTAL_METADIR);
+       } else if (xfs_has_zoned(mp)) {
+               xfs_alert(mp,
+       "metadir feature required for zoned realtime devices.");
+               error = -EINVAL;
+               goto out_filestream_unmount;
+       }
 
        if (xfs_has_reflink(mp)) {
                if (xfs_has_realtime(mp) &&
@@ -1771,6 +1810,13 @@ xfs_fs_fill_super(
                        goto out_filestream_unmount;
                }
 
+               if (xfs_has_zoned(mp)) {
+                       xfs_alert(mp,
+       "reflink not compatible with zoned RT device!");
+                       error = -EINVAL;
+                       goto out_filestream_unmount;
+               }
+
                /*
                 * always-cow mode is not supported on filesystems with rt
                 * extent sizes larger than a single block because we'd have
@@ -1902,6 +1948,9 @@ xfs_remount_rw(
        /* Re-enable the background inode inactivation worker. */
        xfs_inodegc_start(mp);
 
+       /* Restart zone reclaim */
+       xfs_zone_gc_start(mp);
+
        return 0;
 }
 
@@ -1946,6 +1995,9 @@ xfs_remount_ro(
         */
        xfs_inodegc_stop(mp);
 
+       /* Stop zone reclaim */
+       xfs_zone_gc_stop(mp);
+
        /* Free the per-AG metadata reservation pool. */
        xfs_fs_unreserve_ag_blocks(mp);
 
@@ -2069,6 +2121,7 @@ xfs_init_fs_context(
        mutex_init(&mp->m_growlock);
        INIT_WORK(&mp->m_flush_inodes_work, xfs_flush_inodes_worker);
        INIT_DELAYED_WORK(&mp->m_reclaim_work, xfs_reclaim_worker);
+
        mp->m_kobj.kobject.kset = xfs_kset;
        /*
         * We don't create the finobt per-ag space reservation until after log
@@ -2097,8 +2150,10 @@ static void
 xfs_kill_sb(
        struct super_block              *sb)
 {
+       struct xfs_mount                *mp = XFS_M(sb);
+
        kill_block_super(sb);
-       xfs_mount_free(XFS_M(sb));
+       xfs_mount_free(mp);
 }
 
 static struct file_system_type xfs_fs_type = {
index 8f530e69c18ae73a4e7ecd2063ff0709d28ffaf7..8e5a3eb31bd28f1af667bfb47f43a9f252607411 100644 (file)
@@ -49,6 +49,7 @@
 #include "xfs_metafile.h"
 #include "xfs_metadir.h"
 #include "xfs_rtgroup.h"
+#include "xfs_zone_alloc.h"
 
 /*
  * We include this last to have the helpers above available for the trace
index 31822f940073201e0d2cb51ef98fb35b4369eee1..cc251aa0fb67221abf7b474f8694a850ab10fba1 100644 (file)
@@ -254,6 +254,7 @@ DECLARE_EVENT_CLASS(xfs_group_class,
                  (char *)__entry->caller_ip)
 );
 
+
 #define DEFINE_GROUP_REF_EVENT(name)   \
 DEFINE_EVENT(xfs_group_class, name,    \
        TP_PROTO(struct xfs_group *xg, unsigned long caller_ip), \
@@ -265,6 +266,86 @@ DEFINE_GROUP_REF_EVENT(xfs_group_grab);
 DEFINE_GROUP_REF_EVENT(xfs_group_grab_next_tag);
 DEFINE_GROUP_REF_EVENT(xfs_group_rele);
 
+#ifdef CONFIG_XFS_RT
+DECLARE_EVENT_CLASS(xfs_zone_class,
+       TP_PROTO(struct xfs_rtgroup *rtg),
+       TP_ARGS(rtg),
+       TP_STRUCT__entry(
+               __field(dev_t, dev)
+               __field(xfs_rgnumber_t, rgno)
+               __field(xfs_rgblock_t, used)
+               __field(xfs_rgblock_t, written)
+               __field(xfs_rgblock_t, write_pointer)
+       ),
+       TP_fast_assign(
+               __entry->dev = rtg_mount(rtg)->m_super->s_dev;
+               __entry->rgno = rtg_rgno(rtg);
+               __entry->used = *xfs_zone_used_counter(rtg);
+               __entry->written = rtg->rtg_written;
+               __entry->write_pointer = rtg->rtg_write_pointer;
+       ),
+       TP_printk("dev %d:%d rgno 0x%x used 0x%x written 0x%x wp 0x%x",
+                 MAJOR(__entry->dev), MINOR(__entry->dev),
+                 __entry->rgno,
+                 __entry->used,
+                 __entry->written,
+                 __entry->write_pointer)
+);
+
+#define DEFINE_ZONE_EVENT(name)                                \
+DEFINE_EVENT(xfs_zone_class, name,                     \
+       TP_PROTO(struct xfs_rtgroup *rtg),              \
+       TP_ARGS(rtg))
+DEFINE_ZONE_EVENT(xfs_zone_emptied);
+DEFINE_ZONE_EVENT(xfs_zone_full);
+DEFINE_ZONE_EVENT(xfs_zone_activate);
+DEFINE_ZONE_EVENT(xfs_zone_reset);
+DEFINE_ZONE_EVENT(xfs_zone_reclaim);
+DEFINE_ZONE_EVENT(xfs_gc_zone_activate);
+
+DECLARE_EVENT_CLASS(xfs_zone_alloc_class,
+       TP_PROTO(struct xfs_rtgroup *rtg, xfs_rgblock_t rgbno,
+                xfs_extlen_t len),
+       TP_ARGS(rtg, rgbno, len),
+       TP_STRUCT__entry(
+               __field(dev_t, dev)
+               __field(xfs_rgnumber_t, rgno)
+               __field(xfs_rgblock_t, used)
+               __field(xfs_rgblock_t, written)
+               __field(xfs_rgblock_t, write_pointer)
+               __field(xfs_rgblock_t, rgbno)
+               __field(xfs_extlen_t, len)
+       ),
+       TP_fast_assign(
+               __entry->dev = rtg_mount(rtg)->m_super->s_dev;
+               __entry->rgno = rtg_rgno(rtg);
+               __entry->used = *xfs_zone_used_counter(rtg);
+               __entry->written = rtg->rtg_written;
+               __entry->write_pointer = rtg->rtg_write_pointer;
+               __entry->rgbno = rgbno;
+               __entry->len = len;
+       ),
+       TP_printk("dev %d:%d rgno 0x%x used 0x%x written 0x%x wp 0x%x rgbno 0x%x len 0x%x",
+                 MAJOR(__entry->dev), MINOR(__entry->dev),
+                 __entry->rgno,
+                 __entry->used,
+                 __entry->written,
+                 __entry->write_pointer,
+                 __entry->rgbno,
+                 __entry->len)
+);
+
+
+#define DEFINE_ZONE_ALLOC_EVENT(name)                          \
+DEFINE_EVENT(xfs_zone_alloc_class, name,                       \
+       TP_PROTO(struct xfs_rtgroup *rtg, xfs_rgblock_t rgbno,  \
+                xfs_extlen_t len),                             \
+       TP_ARGS(rtg, rgbno, len))
+DEFINE_ZONE_ALLOC_EVENT(xfs_zone_record_blocks);
+DEFINE_ZONE_ALLOC_EVENT(xfs_zone_free_blocks);
+DEFINE_ZONE_ALLOC_EVENT(xfs_zone_alloc_blocks);
+#endif /* CONFIG_XFS_RT */
+
 TRACE_EVENT(xfs_inodegc_worker,
        TP_PROTO(struct xfs_mount *mp, unsigned int shrinker_hits),
        TP_ARGS(mp, shrinker_hits),
@@ -1596,6 +1677,7 @@ DEFINE_SIMPLE_IO_EVENT(xfs_end_io_direct_write);
 DEFINE_SIMPLE_IO_EVENT(xfs_end_io_direct_write_unwritten);
 DEFINE_SIMPLE_IO_EVENT(xfs_end_io_direct_write_append);
 DEFINE_SIMPLE_IO_EVENT(xfs_file_splice_read);
+DEFINE_SIMPLE_IO_EVENT(xfs_zoned_map_blocks);
 
 DECLARE_EVENT_CLASS(xfs_itrunc_class,
        TP_PROTO(struct xfs_inode *ip, xfs_fsize_t new_size),
@@ -3984,6 +4066,7 @@ DEFINE_SIMPLE_IO_EVENT(xfs_reflink_cancel_cow_range);
 DEFINE_SIMPLE_IO_EVENT(xfs_reflink_end_cow);
 DEFINE_INODE_IREC_EVENT(xfs_reflink_cow_remap_from);
 DEFINE_INODE_IREC_EVENT(xfs_reflink_cow_remap_to);
+DEFINE_INODE_IREC_EVENT(xfs_reflink_cow_remap_skip);
 
 DEFINE_INODE_ERROR_EVENT(xfs_reflink_cancel_cow_range_error);
 DEFINE_INODE_ERROR_EVENT(xfs_reflink_end_cow_error);
diff --git a/fs/xfs/xfs_zone_alloc.c b/fs/xfs/xfs_zone_alloc.c
new file mode 100644 (file)
index 0000000..9cdce2c
--- /dev/null
@@ -0,0 +1,557 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2023 Christoph Hellwig.
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_fsops.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_inode.h"
+#include "xfs_iomap.h"
+#include "xfs_trans.h"
+#include "xfs_rtgroup.h"
+#include "xfs_rtbitmap.h"
+#include "xfs_zone_alloc.h"
+#include "xfs_zones.h"
+#include "xfs_trace.h"
+
+/*
+ * Keep track of a counter of blocks used in a rtgroup.  This is incremented
+ * after the blocks have been written to and the I/O completion handler sets
+ * up the bmap and rmap records to link them into the file system metadata
+ * and decremented when the blocks are "freed" by unlinking them from the bmap
+ * and rmap trees.  The space will only become available for reuse when the
+ * zone is reset.
+ *
+ * The way this is stored is a bit of a hack and abuses the atime field in the
+ * rmap inode.  There is precedent for this in the rtbitmap inode, but it is
+ * a bit ugly.
+ */
+uint64_t *
+xfs_zone_used_counter(
+       struct xfs_rtgroup      *rtg)
+{
+       return (uint64_t *)&VFS_I(rtg->rtg_inodes[XFS_RTGI_RMAP])->i_atime_sec;
+}
+
+/*
+ * Keep track of the last written block in a zone.
+ *
+ * This is only needed when using the zoned allocator on a device that doesn't
+ * support zones natively and is an approximation for the hardware write
+ * pointer.  Unlike the hardware write pointer it might be past regions that
+ * haven't been written to.  In case of an unclean shutdown this means there
+ * could be blocks that will never be written before the zone is finished.
+ * This is a little bit inefficient, but not a real problem as the used counter
+ * above doesn't account for them, so they will be treated by zone reclaim as
+ * if these blocks were written to but deleted immediately.
+ *
+ * This uses the same kind of hack to store extra information in the rmap inode
+ * as the used counter above.
+ */
+uint64_t *
+xfs_zone_last_written(
+       struct xfs_rtgroup      *rtg)
+{
+       return (uint64_t *)&VFS_I(rtg->rtg_inodes[XFS_RTGI_RMAP])->i_mtime_sec;
+}
+
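
Pulling the two comments above together: the write pointer advances when blocks are handed out, the written count catches up at I/O completion, and the used count only tracks completed blocks that are still referenced, with freed space becoming reusable only at zone reset.  A compact userspace model of that bookkeeping (the struct and field names are invented, not the in-core or on-disk layout):

#include <stdint.h>
#include <assert.h>

struct zone_model {
        uint64_t capacity;      /* usable blocks in the zone */
        uint64_t write_pointer; /* next block to hand out */
        uint64_t written;       /* blocks whose I/O has completed */
        uint64_t used;          /* completed blocks still referenced */
};

static void record_blocks(struct zone_model *z, uint64_t len, int used)
{
        z->written += len;
        if (used)
                z->used += len;
        assert(z->written <= z->write_pointer);
}

static void free_blocks(struct zone_model *z, uint64_t len)
{
        assert(len <= z->used);
        z->used -= len;
        /* the space only becomes reusable once the whole zone is reset */
}

int main(void)
{
        struct zone_model z = { .capacity = 64 };

        z.write_pointer += 16;          /* allocation */
        record_blocks(&z, 16, 1);       /* I/O completion */
        free_blocks(&z, 16);            /* file deleted or overwritten */
        return 0;
}
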
+static void
+xfs_zone_emptied(
+       struct xfs_rtgroup      *rtg)
+{
+       struct xfs_mount        *mp = rtg_mount(rtg);
+
+       trace_xfs_zone_emptied(rtg);
+
+       xfs_group_clear_mark(&rtg->rtg_group, XFS_RTG_RECLAIMABLE);
+
+       spin_lock(&mp->m_zone_list_lock);
+       ASSERT(list_empty(&rtg->rtg_entry));
+       list_add_tail(&rtg->rtg_entry, &mp->m_emptied_zones);
+       spin_unlock(&mp->m_zone_list_lock);
+
+       wake_up_process(mp->m_zone_gc_thread);
+}
+
+static void
+xfs_zone_mark_reclaimable(
+       struct xfs_rtgroup      *rtg)
+{
+       struct xfs_mount        *mp = rtg_mount(rtg);
+
+       xfs_group_set_mark(&rtg->rtg_group, XFS_RTG_RECLAIMABLE);
+       if (xfs_zoned_need_gc(mp))
+               wake_up_process(mp->m_zone_gc_thread);
+}
+
+static void
+xfs_zone_mark_full(
+       struct xfs_rtgroup      *rtg)
+{
+       struct xfs_mount        *mp = rtg_mount(rtg);
+
+       trace_xfs_zone_full(rtg);
+
+       spin_lock(&mp->m_zone_list_lock);
+       clear_bit(RTG_F_OPEN, &rtg->rtg_flags);
+       if (!list_empty(&rtg->rtg_entry)) {
+               /* empty list means this is the open GC zone */
+               mp->m_nr_open_zones--;
+               list_del_init(&rtg->rtg_entry);
+       }
+       spin_unlock(&mp->m_zone_list_lock);
+
+       wake_up_all(&mp->m_zone_wait);
+       if (*xfs_zone_used_counter(rtg) < rtg->rtg_extents)
+               xfs_zone_mark_reclaimable(rtg);
+}
+
+/*
+ * Record data blocks as having been written to.
+ *
+ * This is called from the write completion handler and records blocks as
+ * actually used.  For zoned devices all this is purely an in-memory
+ * exercise to manage the open zones, but if we run on a conventional
+ * device we also have to record the last written block as the write pointer
+ * approximation.
+ */
+int
+xfs_zone_record_blocks(
+       struct xfs_trans        *tp,
+       xfs_fsblock_t           fsbno,
+       xfs_filblks_t           len,
+       bool                    used)
+{
+       struct xfs_mount        *mp = tp->t_mountp;
+       xfs_rgblock_t           rgbno = xfs_rtb_to_rgbno(mp, fsbno);
+       struct xfs_rtgroup      *rtg;
+
+       rtg = xfs_rtgroup_get(mp, xfs_rtb_to_rgno(mp, fsbno));
+       if (!rtg)
+               return -EIO;
+
+       trace_xfs_zone_record_blocks(rtg, rgbno, len);
+
+       xfs_ilock(rtg->rtg_inodes[XFS_RTGI_RMAP], XFS_ILOCK_EXCL);
+       xfs_trans_ijoin(tp, rtg->rtg_inodes[XFS_RTGI_RMAP], XFS_ILOCK_EXCL);
+
+       if (used) {
+               *xfs_zone_used_counter(rtg) += len;
+               ASSERT(*xfs_zone_used_counter(rtg) <= rtg->rtg_extents);
+       } else {
+               xfs_add_frextents(mp, xfs_extlen_to_rtxlen(mp, len));
+       }
+
+       if (rgbno + len > *xfs_zone_last_written(rtg))
+               *xfs_zone_last_written(rtg) = rgbno + len;
+
+       rtg->rtg_written += len;
+       ASSERT(rtg->rtg_written <= rtg->rtg_write_pointer);
+       if (rtg->rtg_written == rtg->rtg_extents)
+               xfs_zone_mark_full(rtg);
+
+       xfs_trans_log_inode(tp, rtg->rtg_inodes[XFS_RTGI_RMAP], XFS_ILOG_CORE);
+
+       xfs_rtgroup_put(rtg);
+       return 0;
+}
+
+/*
+ * "Free" blocks allocated in a zone.
+ *
+ * Just decrement the used blocks counter and report the space as freed.
+ */
+int
+xfs_zone_free_blocks(
+       struct xfs_trans        *tp,
+       struct xfs_rtgroup      *rtg,
+       xfs_fsblock_t           fsbno,
+       xfs_filblks_t           len)
+{
+       struct xfs_mount        *mp = tp->t_mountp;
+       uint64_t                *used = xfs_zone_used_counter(rtg);
+
+       xfs_assert_ilocked(rtg->rtg_inodes[XFS_RTGI_RMAP], XFS_ILOCK_EXCL);
+       if (len > *used) {
+               xfs_err(mp,
+"trying to free more blocks (%lld) than used counter (%lld).",
+                       len, *used);
+               ASSERT(len <= *used);
+               xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
+               return -EFSCORRUPTED;
+       }
+
+       trace_xfs_zone_free_blocks(rtg, xfs_rtb_to_rgbno(mp, fsbno), len);
+
+       *used -= len;
+       if (rtg->rtg_written == rtg->rtg_extents) {
+               /*
+                * Mark the zone as reclaimable, but only if the zone is full,
+                * as we don't reclaim open zones.  As an optimization, kick off
+                * a zone reset if the usage counter hits zero.
+                */
+               if (*used == 0)
+                       xfs_zone_emptied(rtg);
+               else if (*used + len == rtg->rtg_extents)
+                       xfs_zone_mark_reclaimable(rtg);
+       }
+
+       xfs_add_frextents(mp, xfs_extlen_to_rtxlen(mp, len));
+       xfs_trans_log_inode(tp, rtg->rtg_inodes[XFS_RTGI_RMAP], XFS_ILOG_CORE);
+       return 0;
+}
+
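
Combining xfs_zone_free_blocks with the helpers above, a full zone whose used count drops becomes a reclaim candidate, and one whose used count reaches zero can be reset right away.  A tiny sketch of those transitions, simplified to a pure function over illustrative counters:

#include <stdint.h>
#include <stdio.h>

enum zone_state { ZONE_FULL, ZONE_RECLAIMABLE, ZONE_EMPTY };

/* Classify a fully written zone by how many of its blocks are still used. */
static enum zone_state classify_full_zone(uint64_t used, uint64_t capacity)
{
        if (used == 0)
                return ZONE_EMPTY;              /* reset candidate */
        if (used < capacity)
                return ZONE_RECLAIMABLE;        /* GC candidate */
        return ZONE_FULL;                       /* nothing to reclaim yet */
}

int main(void)
{
        printf("%d\n", classify_full_zone(64, 64));     /* ZONE_FULL */
        printf("%d\n", classify_full_zone(40, 64));     /* ZONE_RECLAIMABLE */
        printf("%d\n", classify_full_zone(0, 64));      /* ZONE_EMPTY */
        return 0;
}
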
+/*
+ * Check if the zone containing the data just before the offset we are
+ * writing to is still open and has space.
+ */
+static struct xfs_rtgroup *
+xfs_last_used_zone(
+       struct iomap_ioend      *ioend)
+{
+       struct xfs_inode        *ip = XFS_I(ioend->io_inode);
+       struct xfs_mount        *mp = ip->i_mount;
+       xfs_fileoff_t           offset_fsb = XFS_B_TO_FSB(mp, ioend->io_offset);
+       struct xfs_rtgroup      *rtg = NULL;
+       struct xfs_iext_cursor  icur;
+       struct xfs_bmbt_irec    got;
+
+       xfs_ilock(ip, XFS_ILOCK_SHARED);
+       if (!xfs_iext_lookup_extent_before(ip, &ip->i_df, &offset_fsb,
+                               &icur, &got))
+               goto out_unlock;
+       ASSERT(!isnullstartblock(got.br_startblock));
+       rtg = xfs_rtgroup_grab(mp, xfs_rtb_to_rgno(mp, got.br_startblock));
+       if (rtg && !test_bit(RTG_F_OPEN, &rtg->rtg_flags)) {
+               xfs_rtgroup_rele(rtg);
+               rtg = NULL;
+       }
+out_unlock:
+       xfs_iunlock(ip, XFS_ILOCK_SHARED);
+       return rtg;
+}
+
+struct xfs_rtgroup *
+xfs_find_free_zone(
+       struct xfs_mount        *mp)
+{
+       struct xfs_rtgroup      *rtg;
+
+       lockdep_assert_held(&mp->m_zone_list_lock);
+
+       list_for_each_entry(rtg, &mp->m_free_zones, rtg_entry) {
+               ASSERT(rtg->rtg_write_pointer == 0);
+               if (atomic_inc_not_zero(&rtg->rtg_group.xg_active_ref)) {
+                       list_del_init(&rtg->rtg_entry);
+                       atomic_dec(&mp->m_nr_free_zones);
+                       return rtg;
+               }
+       }
+
+       return NULL;
+}
+
+/*
+ * Activate a free zone.
+ *
+ * This just does the accounting and makes the zone findable on the open
+ * zones list.  Don't bother with an explicit open command; we'll just open it
+ * implicitly with the first write to it.
+ */
+static struct xfs_rtgroup *
+xfs_activate_zone(
+       struct xfs_mount        *mp)
+{
+       struct xfs_rtgroup      *rtg;
+
+       if (atomic_read(&mp->m_nr_free_zones) <
+           XFS_GC_ZONES - XFS_OPEN_GC_ZONES)
+               return NULL;
+
+       rtg = xfs_find_free_zone(mp);
+       if (!rtg)
+               return NULL;
+
+       list_add_tail(&rtg->rtg_entry, &mp->m_open_zones);
+       mp->m_nr_open_zones++;
+       if (xfs_zoned_need_gc(mp))
+               wake_up_process(mp->m_zone_gc_thread);
+
+       /* XXX: this is a little verbose, but let's keep it for now */
+       xfs_info(mp, "using zone %u (%d)",
+                rtg_rgno(rtg), mp->m_nr_open_zones);
+       set_bit(RTG_F_OPEN, &rtg->rtg_flags);
+       trace_xfs_zone_activate(rtg);
+       return rtg;
+}
+
+/*
+ * For SMR hard drives that have no open limit, keep opening a new zone for each
+ * allocation context.  If all zones in the system are open, use this simple LRU
+ * algorithm to pick the one that was least recently used.
+ *
+ * This requires that any reused zone is rotated to the end of the open list so
+ * that the next user doesn't pick it again.
+ */
+static struct xfs_rtgroup *
+xfs_select_open_zone_lru(
+       struct xfs_mount        *mp,
+       unsigned int            minlen)
+{
+       struct xfs_rtgroup      *rtg;
+
+       list_for_each_entry(rtg, &mp->m_open_zones, rtg_entry) {
+               if (rtg->rtg_extents - rtg->rtg_write_pointer < minlen)
+                       continue;
+               if (!atomic_inc_not_zero(&rtg->rtg_group.xg_active_ref))
+                       continue;
+               list_move_tail(&rtg->rtg_entry, &mp->m_open_zones);
+               return rtg;
+       }
+
+       return NULL;
+}
+
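
The LRU pick described above can be modelled with an ordinary linked list: scan from the head for a zone with enough remaining space and rotate the chosen zone to the tail so the next caller prefers a different one.  A self-contained sketch with invented types (not the kernel's list implementation):

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

struct mzone {
        uint64_t remaining;     /* capacity minus write pointer */
        struct mzone *next;
};

/*
 * Pick the least recently used zone with at least minlen blocks left and
 * move it to the tail of the singly linked open list.
 */
static struct mzone *select_lru(struct mzone **head, uint64_t minlen)
{
        struct mzone **pp, *z;

        for (pp = head; (z = *pp) != NULL; pp = &z->next) {
                if (z->remaining < minlen)
                        continue;
                *pp = z->next;                  /* unlink */
                while (*pp)                     /* find the tail */
                        pp = &(*pp)->next;
                z->next = NULL;
                *pp = z;                        /* reinsert at the tail */
                return z;
        }
        return NULL;
}

int main(void)
{
        struct mzone c = { 100, NULL }, b = { 2, &c }, a = { 50, &b };
        struct mzone *head = &a;

        printf("%p picked\n", (void *)select_lru(&head, 10));   /* &a */
        printf("%p picked\n", (void *)select_lru(&head, 10));   /* &c, a was rotated */
        return 0;
}
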
+/*
+ * Pick a new zone for writes.
+ *
+ * If we aren't using up our budget of open zones just open a new one from
+ * the freelist.  Otherwise try to find one that matches the expected allocation
+ * length, or at least the minimum required length.  If we don't find one
+ * that is good enough we pick one anyway and let the caller finish it to
+ * free up open zone resources.
+ */
+static struct xfs_rtgroup *
+xfs_select_zone_nowait(
+       struct xfs_inode        *ip,
+       xfs_filblks_t           count_fsb)
+{
+       struct xfs_mount        *mp = ip->i_mount;
+       struct xfs_rtgroup      *rtg;
+
+       /*
+        * If we are below the open limit try to activate a zone.
+        */
+       if (mp->m_nr_open_zones < mp->m_max_open_zones - XFS_OPEN_GC_ZONES) {
+               rtg = xfs_activate_zone(mp);
+               if (rtg)
+                       return rtg;
+       }
+
+       rtg = xfs_select_open_zone_lru(mp, count_fsb);
+       if (rtg)
+               return rtg;
+       return xfs_select_open_zone_lru(mp, 1);
+}
+
+static struct xfs_rtgroup *
+xfs_select_zone(
+       struct iomap_ioend      *ioend)
+{
+       struct xfs_inode        *ip = XFS_I(ioend->io_inode);
+       struct xfs_mount        *mp = ip->i_mount;
+       xfs_filblks_t           count_fsb = XFS_B_TO_FSB(mp, ioend->io_size);
+       struct xfs_rtgroup      *rtg = NULL;
+       DEFINE_WAIT             (wait);
+
+       spin_lock(&mp->m_zone_list_lock);
+       if (xfs_is_shutdown(mp))
+               goto out_unlock;
+
+       rtg = xfs_select_zone_nowait(ip, count_fsb);
+       if (rtg)
+               goto out_unlock;
+
+       for (;;) {
+               prepare_to_wait(&mp->m_zone_wait, &wait, TASK_UNINTERRUPTIBLE);
+               if (xfs_is_shutdown(mp))
+                       break;
+
+               rtg = xfs_select_zone_nowait(ip, count_fsb);
+               if (rtg)
+                       break;
+
+               spin_unlock(&mp->m_zone_list_lock);
+               schedule();
+               spin_lock(&mp->m_zone_list_lock);
+       }
+       finish_wait(&mp->m_zone_wait, &wait);
+
+out_unlock:
+       spin_unlock(&mp->m_zone_list_lock);
+       return rtg;
+}
+
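+/*
+ * Hand out blocks starting at the zone's write pointer, capped by both the
+ * remaining space in the zone and the maximum bmap extent length.  For
+ * sequential write required zones the bio is started at the zone start and
+ * the device picks the actual location when executing the zone append.
+ */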
+static unsigned int
+xfs_zone_alloc_blocks(
+       struct iomap_ioend      *ioend,
+       struct xfs_rtgroup      *rtg,
+       bool                    *is_seq)
+{
+       struct xfs_mount        *mp = rtg_mount(rtg);
+       xfs_filblks_t           count_fsb = XFS_B_TO_FSB(mp, ioend->io_size);
+       xfs_rgblock_t           rgbno;
+
+       spin_lock(&rtg->rtg_alloc_lock);
+       count_fsb = min3(count_fsb, XFS_MAX_BMBT_EXTLEN,
+               (xfs_filblks_t)rtg->rtg_extents - rtg->rtg_write_pointer);
+       if (!count_fsb || !test_bit(RTG_F_OPEN, &rtg->rtg_flags)) {
+               spin_unlock(&rtg->rtg_alloc_lock);
+               return 0;
+       }
+       rgbno = rtg->rtg_write_pointer;
+       rtg->rtg_write_pointer += count_fsb;
+       spin_unlock(&rtg->rtg_alloc_lock);
+
+       trace_xfs_zone_alloc_blocks(rtg, rgbno, count_fsb);
+
+       *is_seq = test_bit(RTG_F_SEQUENTIAL, &rtg->rtg_flags);
+       if (*is_seq)
+               rgbno = 0;
+       ioend->io_sector = xfs_rtb_to_daddr(mp, xfs_rgbno_to_rtb(rtg, rgbno));
+       return XFS_FSB_TO_B(mp, count_fsb);
+}
+
+static inline void
+xfs_mark_rtg_boundary(
+       struct iomap_ioend      *ioend)
+{
+       struct xfs_mount        *mp = XFS_I(ioend->io_inode)->i_mount;
+       sector_t                sector = ioend->io_bio.bi_iter.bi_sector;
+
+       if (xfs_rtb_to_rgbno(mp, xfs_daddr_to_rtb(mp, sector)) == 0)
+               ioend->io_flags |= IOMAP_F_BOUNDARY;
+}
+
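+/*
+ * Submit a zoned write.  Sequential write required zones use zone append so
+ * that the device picks the actual write location; conventional zones are
+ * written directly at the allocated sector.
+ */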
+static void
+xfs_submit_zoned_bio(
+       struct iomap_ioend      *ioend,
+       bool                    is_seq)
+{
+       if (is_seq) {
+               ioend->io_bio.bi_opf &= ~REQ_OP_WRITE;
+               ioend->io_bio.bi_opf |= REQ_OP_ZONE_APPEND;
+       } else {
+               xfs_mark_rtg_boundary(ioend);
+       }
+
+       ioend->io_bio.bi_iter.bi_sector = ioend->io_sector;
+       submit_bio(&ioend->io_bio);
+}
+
+void
+xfs_zone_alloc_and_submit(
+       struct iomap_ioend      *ioend,
+       struct xfs_rtgroup      **rtg)
+{
+       unsigned int            alloc_len;
+       struct iomap_ioend      *split;
+       bool                    is_seq;
+
+       if (xfs_is_shutdown(XFS_I(ioend->io_inode)->i_mount))
+               goto out_error;
+
+       /*
+        * If we don't have a cached zone in this write context, see if the
+        * last extent before the one we are writing points to an active zone.
+        * If so, just continue writing to it.
+        */
+       if (!*rtg)
+               *rtg = xfs_last_used_zone(ioend);
+
+       if (!*rtg) {
+select_zone:
+               *rtg = xfs_select_zone(ioend);
+               if (!*rtg)
+                       goto out_error;
+       }
+
+       alloc_len = xfs_zone_alloc_blocks(ioend, *rtg, &is_seq);
+       if (!alloc_len) {
+               xfs_zone_finish_alloc(*rtg);
+               goto select_zone;
+       }
+
+       while ((split = iomap_split_ioend(ioend, is_seq, &alloc_len))) {
+               xfs_submit_zoned_bio(split, is_seq);
+               if (!alloc_len) {
+                       xfs_zone_finish_alloc(*rtg);
+                       goto select_zone;
+               }
+       }
+
+       xfs_submit_zoned_bio(ioend, is_seq);
+       return;
+
+out_error:
+       bio_io_error(&ioend->io_bio);
+}
+
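+/*
+ * Drop the reference to a zone handed out by the zone allocator.
+ */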
+void
+xfs_zone_finish_alloc(
+       struct xfs_rtgroup      *rtg)
+{
+       if (rtg)
+               xfs_rtgroup_rele(rtg);
+}
+
+static void
+xfs_show_zone(
+       struct seq_file         *m,
+       struct xfs_rtgroup      *rtg)
+{
+       seq_printf(m, "\t  zone %d, wp %u, written %u, used %llu\n",
+               rtg_rgno(rtg),
+               rtg->rtg_write_pointer, rtg->rtg_written,
+               *xfs_zone_used_counter(rtg));
+}
+
+void
+xfs_zoned_show_stats(
+       struct seq_file         *m,
+       struct xfs_mount        *mp)
+{
+       unsigned long           index = 0;
+       unsigned                count = 0;
+       struct xfs_rtgroup      *rtg;
+
+       seq_puts(m, "\n");
+
+       seq_printf(m, "\tuser free blocks: %lld\n",
+               xfs_sum_freecounter(mp, FREE_RTEXTENTS));
+       seq_printf(m, "\treserved free blocks: %lld\n",
+               mp->m_resblks[FREE_RTEXTENTS].avail);
+       seq_printf(m, "\tuser available blocks: %lld\n",
+               xfs_sum_freecounter(mp, FREE_RTAVAILABLE));
+       seq_printf(m, "\treserved available blocks: %lld\n",
+               mp->m_resblks[FREE_RTAVAILABLE].avail);
+       seq_printf(m, "\treservations required: %d\n",
+               !list_empty_careful(&mp->m_reclaim_reservations));
+       seq_printf(m, "\tGC required: %d\n",
+               xfs_zoned_need_gc(mp));
+
+       spin_lock(&mp->m_zone_list_lock);
+       seq_printf(m, "\tfree zones: %d\n", atomic_read(&mp->m_nr_free_zones));
+       seq_puts(m, "\topen zones:\n");
+       list_for_each_entry(rtg, &mp->m_open_zones, rtg_entry)
+               xfs_show_zone(m, rtg);
+       if (mp->m_open_gc_zone) {
+               seq_puts(m, "\topen gc zone:\n");
+               xfs_show_zone(m, mp->m_open_gc_zone);
+       }
+       seq_puts(m, "\treclaimable zones:\n");
+       xa_for_each_marked(&mp->m_groups[XG_TYPE_RTG].xa, index, rtg,
+                       XFS_RTG_RECLAIMABLE) {
+               if (++count > 20) {
+                       seq_puts(m, "\t  (truncated)\n");
+                       break;
+               }
+               xfs_show_zone(m, rtg);
+       }
+       spin_unlock(&mp->m_zone_list_lock);
+}
diff --git a/fs/xfs/xfs_zone_alloc.h b/fs/xfs/xfs_zone_alloc.h
new file mode 100644 (file)
index 0000000..0e5c612
--- /dev/null
@@ -0,0 +1,52 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _XFS_ZONE_ALLOC_H
+#define _XFS_ZONE_ALLOC_H
+
+void xfs_zone_alloc_and_submit(struct iomap_ioend *ioend,
+               struct xfs_rtgroup **rtg);
+void xfs_zone_finish_alloc(struct xfs_rtgroup *rtg);
+int xfs_zone_record_blocks(struct xfs_trans *tp, xfs_fsblock_t fsbno,
+               xfs_filblks_t len, bool used);
+int xfs_zone_free_blocks(struct xfs_trans *tp, struct xfs_rtgroup *rtg,
+               xfs_fsblock_t fsbno, xfs_filblks_t len);
+
+uint64_t xfs_zoned_default_resblks(struct xfs_mount *mp, unsigned int idx);
+
+int xfs_mount_zones(struct xfs_mount *mp);
+void xfs_unmount_zones(struct xfs_mount *mp);
+
+#ifdef CONFIG_XFS_RT
+void xfs_zone_gc_start(struct xfs_mount *mp);
+void xfs_zone_gc_stop(struct xfs_mount *mp);
+#else
+static inline void xfs_zone_gc_start(struct xfs_mount *mp)
+{
+}
+static inline void xfs_zone_gc_stop(struct xfs_mount *mp)
+{
+}
+#endif /* CONFIG_XFS_RT */
+
+void xfs_zoned_show_stats(struct seq_file *m, struct xfs_mount *mp);
+
+uint64_t *xfs_zone_used_counter(struct xfs_rtgroup *rtg);
+uint64_t *xfs_zone_last_written(struct xfs_rtgroup *rtg);
+
+struct xfs_zone_alloc_ctx {
+       struct xfs_rtgroup      *cached_rtg;
+       xfs_filblks_t           reserved_blocks;
+};
+
+#define XFS_ZR_GREEDY          (1U << 0)
+#define XFS_ZR_NOWAIT          (1U << 1)
+#define XFS_ZR_RESERVED                (1U << 2)
+
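+/*
+ * Example usage of the reservation interface (sketch only, actual callers
+ * may differ):  a writer reserves space before starting the write and
+ * releases it again when the write has finished or failed, e.g.:
+ *
+ *	struct xfs_zone_alloc_ctx ac = { };
+ *	int error;
+ *
+ *	error = xfs_zoned_space_reserve(ip, count_fsb, 0, &ac);
+ *	if (error)
+ *		return error;
+ *	... write, allocating out of the reservation ...
+ *	xfs_zoned_space_unreserve(ip, &ac);
+ */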
+int xfs_zoned_space_reserve(struct xfs_inode *ip, xfs_filblks_t count_fsb,
+               unsigned int flags, struct xfs_zone_alloc_ctx *ac);
+void xfs_zoned_space_unreserve(struct xfs_inode *ip,
+               struct xfs_zone_alloc_ctx *ac);
+void xfs_zoned_add_available(struct xfs_mount *mp, xfs_filblks_t count_fsb);
+bool xfs_zoned_need_gc(struct xfs_mount *mp);
+struct xfs_rtgroup *xfs_find_free_zone(struct xfs_mount *mp);
+
+#endif /* _XFS_ZONE_ALLOC_H */
diff --git a/fs/xfs/xfs_zone_gc.c b/fs/xfs/xfs_zone_gc.c
new file mode 100644 (file)
index 0000000..6fb8d62
--- /dev/null
@@ -0,0 +1,1409 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2023 Christoph Hellwig.
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_fsops.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_inode.h"
+#include "xfs_iomap.h"
+#include "xfs_btree.h"
+#include "xfs_trans.h"
+#include "xfs_icache.h"
+#include "xfs_rmap.h"
+#include "xfs_reflink.h"
+#include "xfs_rtgroup.h"
+#include "xfs_rtbitmap.h"
+#include "xfs_rtrmap_btree.h"
+#include "xfs_zone_alloc.h"
+#include "xfs_zones.h"
+#include "xfs_trace.h"
+
+struct xfs_zone_reservation {
+       struct list_head        entry;
+       struct task_struct      *task;
+       xfs_rtxnum_t            rtxlen;
+};
+
+uint64_t
+xfs_zoned_default_resblks(
+       struct xfs_mount        *mp,
+       unsigned int            idx)
+{
+       /*
+        * For the available blocks dipped into by the allocator, only reserve
+        * the required GC zones.
+        */
+       if (idx == FREE_RTAVAILABLE)
+               return XFS_GC_ZONES * mp->m_groups[XG_TYPE_RTG].blocks;
+
+       /*
+        * For the user reported blocks, include at least the extra sparse zone
+        * and also any extra overprovisioning.
+        */
+       return XFS_RESERVED_ZONES * mp->m_groups[XG_TYPE_RTG].blocks +
+               XFS_B_TO_FSB(mp, mp->m_zoned_op);
+}
+
+/*
+ * We aim to keep enough zones free in stock to fully use the open zone limit
+ * for data placement purposes.
+ */
+bool
+xfs_zoned_need_gc(
+       struct xfs_mount        *mp)
+{
+       if (!xfs_group_marked(mp, XG_TYPE_RTG, XFS_RTG_RECLAIMABLE))
+               return false;
+       if (xfs_estimate_freecounter(mp, FREE_RTAVAILABLE) <
+           mp->m_groups[XG_TYPE_RTG].blocks *
+           (mp->m_max_open_zones - XFS_OPEN_GC_ZONES))
+               return true;
+       return false;
+}
+
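+/*
+ * Wake everybody waiting on a space reservation so that they can recheck the
+ * free counters (and the shutdown state).
+ */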
+static void
+xfs_zoned_wake_all(
+       struct xfs_mount                *mp)
+{
+       struct xfs_zone_reservation     *reservation;
+
+       spin_lock(&mp->m_reservation_lock);
+       list_for_each_entry(reservation, &mp->m_reclaim_reservations, entry)
+               wake_up_process(reservation->task);
+       spin_unlock(&mp->m_reservation_lock);
+}
+
+void
+xfs_zoned_add_available(
+       struct xfs_mount                *mp,
+       xfs_filblks_t                   count_fsb)
+{
+       struct xfs_zone_reservation     *reservation;
+       xfs_rtxnum_t                    rtxlen;
+
+       rtxlen = xfs_extlen_to_rtxlen(mp, count_fsb);
+       if (list_empty_careful(&mp->m_reclaim_reservations)) {
+               xfs_add_freecounter(mp, FREE_RTAVAILABLE, rtxlen);
+               return;
+       }
+
+       spin_lock(&mp->m_reservation_lock);
+       xfs_add_freecounter(mp, FREE_RTAVAILABLE, rtxlen);
+       rtxlen = xfs_sum_freecounter(mp, FREE_RTAVAILABLE);
+       list_for_each_entry(reservation, &mp->m_reclaim_reservations, entry) {
+               if (reservation->rtxlen > rtxlen)
+                       break;
+               wake_up_process(reservation->task);
+               rtxlen -= reservation->rtxlen;
+       }
+       spin_unlock(&mp->m_reservation_lock);
+}
+
+static int
+xfs_zoned_space_wait_error(
+       struct xfs_mount                *mp)
+{
+       if (xfs_is_shutdown(mp))
+               return -EIO;
+       if (fatal_signal_pending(current))
+               return -EINTR;
+       return 0;
+}
+
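+/*
+ * Reserve writable space tracked in FREE_RTAVAILABLE.  If not enough is
+ * available, queue up on the reservation list and wait for GC to free space,
+ * giving up on shutdown, a fatal signal, or when there is nothing left to
+ * reclaim.
+ */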
+static int
+xfs_zoned_reserve_available(
+       struct xfs_inode                *ip,
+       xfs_rtxlen_t                    rtxlen,
+       unsigned int                    flags)
+{
+       struct xfs_mount                *mp = ip->i_mount;
+       struct xfs_zone_reservation     reservation = {
+               .task           = current,
+               .rtxlen         = rtxlen,
+       };
+       int                             error;
+
+       if (likely(list_empty_careful(&mp->m_reclaim_reservations))) {
+               error = xfs_dec_freecounter(mp, FREE_RTAVAILABLE, rtxlen,
+                               flags & XFS_ZR_RESERVED);
+               if (error != -ENOSPC)
+                       return error;
+       }
+
+       if (flags & XFS_ZR_NOWAIT)
+               return -EAGAIN;
+
+       spin_lock(&mp->m_reservation_lock);
+       list_add_tail(&reservation.entry, &mp->m_reclaim_reservations);
+       while ((error = xfs_zoned_space_wait_error(mp)) == 0) {
+               set_current_state(TASK_KILLABLE);
+
+               error = xfs_dec_freecounter(mp, FREE_RTAVAILABLE, rtxlen,
+                               flags & XFS_ZR_RESERVED);
+               if (error != -ENOSPC)
+                       break;
+
+               /*
+                * If there is nothing left to reclaim, give up.
+                */
+               if (!xfs_is_in_gc(mp) &&
+                   !xfs_group_marked(mp, XG_TYPE_RTG, XFS_RTG_RECLAIMABLE))
+                       break;
+
+               spin_unlock(&mp->m_reservation_lock);
+               schedule();
+               spin_lock(&mp->m_reservation_lock);
+       }
+       list_del(&reservation.entry);
+       spin_unlock(&mp->m_reservation_lock);
+
+       __set_current_state(TASK_RUNNING);
+       return error;
+}
+
+/*
+ * Implement greedy space allocation for short writes by trying to grab all
+ * that is left after locking out other threads from trying to do the same.
+ *
+ * This isn't exactly optimal and can hopefully be replaced by a proper
+ * percpu_counter primitive one day.
+ */
+static int
+xfs_zoned_reserve_extents_greedy(
+       struct xfs_inode                *ip,
+       xfs_rtxlen_t                    *rtxlen,
+       unsigned int                    flags)
+{
+       struct xfs_mount                *mp = ip->i_mount;
+       s64                             len = *rtxlen;
+       int                             error = -ENOSPC;
+
+       spin_lock(&mp->m_reservation_lock);
+       len = min(len, xfs_sum_freecounter(mp, FREE_RTEXTENTS));
+       if (len > 0) {
+               *rtxlen = len;
+               error = xfs_dec_freecounter(mp, FREE_RTEXTENTS, *rtxlen,
+                               flags & XFS_ZR_RESERVED);
+       }
+       spin_unlock(&mp->m_reservation_lock);
+       return error;
+}
+
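+/*
+ * Reserve space for a zoned write:  decrement both the user visible free
+ * space (FREE_RTEXTENTS) and the directly writable space (FREE_RTAVAILABLE),
+ * waiting for GC if needed.
+ */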
+int
+xfs_zoned_space_reserve(
+       struct xfs_inode                *ip,
+       xfs_filblks_t                   count_fsb,
+       unsigned int                    flags,
+       struct xfs_zone_alloc_ctx       *ac)
+{
+       struct xfs_mount                *mp = ip->i_mount;
+       xfs_rtxlen_t                    rtxlen;
+       int                             error;
+
+       ac->cached_rtg = NULL;
+
+       rtxlen = xfs_extlen_to_rtxlen(mp, count_fsb);
+       error = xfs_dec_freecounter(mp, FREE_RTEXTENTS, rtxlen,
+                       flags & XFS_ZR_RESERVED);
+       if (error == -ENOSPC && (flags & XFS_ZR_GREEDY) && rtxlen > 1) {
+               error = xfs_zoned_reserve_extents_greedy(ip, &rtxlen, flags);
+               if (error)
+                       return error;
+       }
+       error = xfs_zoned_reserve_available(ip, rtxlen, flags);
+       if (error) {
+               xfs_add_freecounter(mp, FREE_RTEXTENTS, rtxlen);
+               return error;
+       }
+       ac->reserved_blocks = xfs_rtxlen_to_extlen(mp, rtxlen);
+       return 0;
+}
+
+void
+xfs_zoned_space_unreserve(
+       struct xfs_inode                *ip,
+       struct xfs_zone_alloc_ctx       *ac)
+{
+       if (ac->reserved_blocks > 0) {
+               struct xfs_mount        *mp = ip->i_mount;
+
+               xfs_zoned_add_available(mp, ac->reserved_blocks);
+               xfs_add_freecounter(mp, FREE_RTEXTENTS,
+                               xfs_extlen_to_rtxlen(mp, ac->reserved_blocks));
+       }
+       xfs_zone_finish_alloc(ac->cached_rtg);
+}
+
+/*
+ * Split up rewrites into smaller chunks (1MB).
+ */
+#define XFS_GC_CHUNK_SIZE      (1024u * 1024)
+
+#define XFS_ZONE_GC_NR_SCRATCH 2
+struct xfs_zone_scratch {
+       struct folio                    *folio;
+       unsigned int                    offset;
+       unsigned int                    freed;
+};
+
+struct xfs_gc_bio {
+       struct xfs_inode                *ip;
+       loff_t                          offset;
+       unsigned int                    len;
+       bool                            is_seq;
+       xfs_fsblock_t                   old_startblock;
+       xfs_daddr_t                     new_daddr;
+       union {
+               struct xfs_zone_scratch         *scratch;
+               struct xfs_zone_gc_data         *data;
+       };
+
+       struct bio_vec                  bv;
+       struct bio                      bio; /* must be last */
+};
+
+struct xfs_zone_gc_data {
+       /* global GC state */
+       struct xfs_mount                *mp;
+       struct bio_set                  bio_set;
+       struct xfs_zone_scratch         scratch[XFS_ZONE_GC_NR_SCRATCH];
+       unsigned int                    scratch_idx;
+       struct bio_list                 read_done;
+       struct bio_list                 write_done;
+       struct bio_list                 reset_done;
+       spinlock_t                      list_lock;
+       unsigned int                    inflight;
+};
+
+static struct xfs_zone_gc_data *
+xfs_zone_gc_data_alloc(
+       struct xfs_mount        *mp)
+{
+       struct xfs_zone_gc_data *data;
+       int i;
+
+       data = kzalloc(sizeof(*data), GFP_KERNEL);
+       if (!data)
+               return NULL;
+
+       /*
+        * We actually only need a single bio_vec.  It would be nice to have
+        * a flag that only allocates the inline bvecs and not the separate
+        * bvec pool.
+        */
+       if (bioset_init(&data->bio_set, 16, offsetof(struct xfs_gc_bio, bio),
+                       BIOSET_NEED_BVECS))
+               goto out_free_data;
+       for (i = 0; i < XFS_ZONE_GC_NR_SCRATCH; i++) {
+               data->scratch[i].folio =
+                       folio_alloc(GFP_KERNEL, get_order(XFS_GC_CHUNK_SIZE));
+               if (!data->scratch[i].folio)
+                       goto out_free_scratch;
+       }
+       spin_lock_init(&data->list_lock);
+       data->mp = mp;
+       return data;
+
+out_free_scratch:
+       while (--i >= 0)
+               folio_put(data->scratch[i].folio);
+       bioset_exit(&data->bio_set);
+out_free_data:
+       kfree(data);
+       return NULL;
+}
+
+static void
+xfs_zone_gc_data_free(
+       struct xfs_zone_gc_data *data)
+{
+       int                     i;
+
+       for (i = 0; i < XFS_ZONE_GC_NR_SCRATCH; i++)
+               folio_put(data->scratch[i].folio);
+       bioset_exit(&data->bio_set);
+       kfree(data);
+}
+
+#define XFS_ZONE_GC_RECS               32
+
+/* iterator, needs to be reinitialized for each victim zone */
+struct xfs_zone_gc_iter {
+       struct xfs_rtgroup              *victim_rtg;
+       unsigned int                    rec_count;
+       unsigned int                    rec_idx;
+       xfs_agblock_t                   next_startblock;
+       struct xfs_rmap_irec            recs[XFS_ZONE_GC_RECS];
+};
+
+static void
+xfs_zone_gc_iter_init(
+       struct xfs_zone_gc_iter *iter,
+       struct xfs_rtgroup      *victim_rtg)
+
+{
+       iter->next_startblock = 0;
+       iter->rec_count = 0;
+       iter->rec_idx = 0;
+       iter->victim_rtg = victim_rtg;
+}
+
+static int
+xfs_zone_gc_query_cb(
+       struct xfs_btree_cur    *cur,
+       const struct xfs_rmap_irec *irec,
+       void                    *private)
+{
+       struct xfs_zone_gc_iter *iter = private;
+
+       ASSERT(!XFS_RMAP_NON_INODE_OWNER(irec->rm_owner));
+       ASSERT(!xfs_is_sb_inum(cur->bc_mp, irec->rm_owner));
+       ASSERT(!(irec->rm_flags & (XFS_RMAP_ATTR_FORK | XFS_RMAP_BMBT_BLOCK)));
+
+       iter->recs[iter->rec_count] = *irec;
+       if (++iter->rec_count == XFS_ZONE_GC_RECS) {
+               iter->next_startblock =
+                       irec->rm_startblock + irec->rm_blockcount;
+               return 1;
+       }
+       return 0;
+}
+
+static int
+xfs_zone_gc_rmap_rec_cmp(
+       const void                      *a,
+       const void                      *b)
+{
+       const struct xfs_rmap_irec      *reca = a;
+       const struct xfs_rmap_irec      *recb = b;
+
+       if (reca->rm_owner < recb->rm_owner)
+               return -1;
+       if (reca->rm_owner > recb->rm_owner)
+               return 1;
+
+       if (reca->rm_offset < recb->rm_offset)
+               return -1;
+       if (reca->rm_offset > recb->rm_offset)
+               return 1;
+
+       return 0;
+}
+
+static int
+xfs_zone_gc_query(
+       struct xfs_mount        *mp,
+       struct xfs_zone_gc_iter *iter)
+{
+       struct xfs_rtgroup      *rtg = iter->victim_rtg;
+       struct xfs_rmap_irec    ri_low = { };
+       struct xfs_rmap_irec    ri_high;
+       struct xfs_btree_cur    *cur;
+       struct xfs_trans        *tp;
+       int                     error;
+
+       ASSERT(iter->next_startblock <= rtg->rtg_extents);
+       if (iter->next_startblock == rtg->rtg_extents)
+               goto done;
+
+       ASSERT(iter->next_startblock < rtg->rtg_extents);
+       ri_low.rm_startblock = iter->next_startblock;
+       memset(&ri_high, 0xFF, sizeof(ri_high));
+
+       iter->rec_idx = 0;
+       iter->rec_count = 0;
+
+       error = xfs_trans_alloc_empty(mp, &tp);
+       if (error)
+               return error;
+
+       xfs_rtgroup_lock(rtg, XFS_RTGLOCK_RMAP);
+       xfs_rtgroup_trans_join(tp, rtg, XFS_RTGLOCK_RMAP);
+       cur = xfs_rtrmapbt_init_cursor(tp, rtg);
+       error = xfs_rmap_query_range(cur, &ri_low, &ri_high,
+                       xfs_zone_gc_query_cb, iter);
+       xfs_btree_del_cursor(cur, error < 0 ? error : 0);
+       xfs_trans_cancel(tp);
+
+       if (error < 0)
+               return error;
+
+       /*
+        * Sort the rmap records by inode number and increasing offset to
+        * defragment the mappings.
+        *
+        * This could be further enhanced by an even bigger look-ahead window,
+        * but that's better left until we have better detection of changes to
+        * the inode mapping to avoid the potential of GCing already dead data.
+        */
+       sort(iter->recs, iter->rec_count, sizeof(iter->recs[0]),
+               xfs_zone_gc_rmap_rec_cmp, NULL);
+
+       if (error == 0) {
+               /*
+                * We finished iterating through the zone.
+                */
+               iter->next_startblock = rtg->rtg_extents;
+               if (iter->rec_count == 0)
+                       goto done;
+       }
+
+       return 0;
+done:
+       xfs_rtgroup_rele(iter->victim_rtg);
+       iter->victim_rtg = NULL;
+       return 0;
+}
+
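+/*
+ * Return the next rmap record to garbage collect and grab a reference to the
+ * inode that owns it, refilling the record cache from the rmap btree as
+ * needed.
+ */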
+static bool
+xfs_zone_gc_iter_next(
+       struct xfs_mount        *mp,
+       struct xfs_zone_gc_iter *iter,
+       struct xfs_rmap_irec    *chunk_rec,
+       struct xfs_inode        **ipp)
+{
+       struct xfs_rmap_irec    *irec;
+       int                     error;
+
+       if (!iter->victim_rtg)
+               return false;
+
+       if (iter->rec_idx == iter->rec_count) {
+retry:
+               error = xfs_zone_gc_query(mp, iter);
+               if (error)
+                       goto fail;
+               if (!iter->victim_rtg)
+                       return false;
+       }
+
+       irec = &iter->recs[iter->rec_idx];
+       error = xfs_iget(mp, NULL, irec->rm_owner, XFS_IGET_NORETRY |
+                       XFS_IGET_UNTRUSTED | XFS_IGET_DONTCACHE, 0, ipp);
+       if (error) {
+               if (error == -EAGAIN || error == -ENOENT) {
+                       iter->next_startblock = irec->rm_startblock;
+                       goto retry;
+               }
+               goto fail;
+       }
+
+       if (!S_ISREG(VFS_I(*ipp)->i_mode)) {
+               iter->next_startblock = irec->rm_startblock;
+               xfs_irele(*ipp);
+               goto retry;
+       }
+
+       *chunk_rec = *irec;
+       return true;
+
+fail:
+       xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
+       return false;
+}
+
+static void
+xfs_zone_gc_iter_advance(
+       struct xfs_zone_gc_iter *iter,
+       xfs_extlen_t            count_fsb)
+{
+       struct xfs_rmap_irec    *irec = &iter->recs[iter->rec_idx];
+
+       irec->rm_offset += count_fsb;
+       irec->rm_startblock += count_fsb;
+       irec->rm_blockcount -= count_fsb;
+       if (!irec->rm_blockcount)
+               iter->rec_idx++;
+}
+
+/*
+ * Iterate through all zones marked as reclaimable and find a candidate that is
+ * either good enough for instant reclaim, or the one with the least used space.
+ */
+static bool
+xfs_zone_reclaim_pick(
+       struct xfs_mount        *mp,
+       struct xfs_zone_gc_iter *iter)
+{
+       struct xfs_rtgroup      *victim_rtg = NULL, *rtg;
+       u64                     victim_used = U64_MAX;
+       unsigned long           index = 0;
+       bool                    easy = false;
+
+       if (xfs_is_shutdown(mp))
+               return false;
+
+       if (iter->victim_rtg)
+               return true;
+
+       /*
+        * Don't start new work if we are asked to stop or park.
+        */
+       if (kthread_should_stop() || kthread_should_park())
+               return false;
+
+       if (!xfs_zoned_need_gc(mp))
+               return false;
+
+       rcu_read_lock();
+       xa_for_each_marked(&mp->m_groups[XG_TYPE_RTG].xa, index, rtg,
+                       XFS_RTG_RECLAIMABLE) {
+               u64 used = *xfs_zone_used_counter(rtg);
+
+               if (used >= victim_used)
+                       continue;
+               if (!atomic_inc_not_zero(&rtg->rtg_group.xg_active_ref))
+                       continue;
+
+               if (victim_rtg)
+                       xfs_rtgroup_rele(victim_rtg);
+               victim_rtg = rtg;
+               victim_used = used;
+
+               /*
+                * Any zone that is less than 1 percent used is fair game for
+                * instant reclaim.
+                */
+               if (used < div_u64(rtg->rtg_extents, 100)) {
+                       easy = true;
+                       break;
+               }
+       }
+       rcu_read_unlock();
+
+       if (!victim_rtg)
+               return false;
+
+       xfs_info(mp, "reclaiming zone %d, used = %lld/%llu (%s)",
+               rtg_rgno(victim_rtg), victim_used,
+               victim_rtg->rtg_extents,
+               easy ? "easy" : "best");
+       trace_xfs_zone_reclaim(victim_rtg);
+       xfs_zone_gc_iter_init(iter, victim_rtg);
+       return true;
+}
+
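+/*
+ * Return the zone that GC writes into, or NULL if the current GC zone is
+ * fully allocated but still has writes in flight.  A fully written GC zone
+ * is released and replaced by a new one from the free list.
+ */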
+static struct xfs_rtgroup *
+xfs_select_gc_zone(
+       struct xfs_mount        *mp)
+{
+       struct xfs_rtgroup      *rtg = mp->m_open_gc_zone;
+
+       if (rtg && rtg->rtg_write_pointer == rtg->rtg_extents) {
+               /*
+                * We need to wait for pending writes to finish.
+                */
+               if (rtg->rtg_written < rtg->rtg_extents)
+                       return NULL;
+               xfs_rtgroup_rele(rtg);
+               rtg = NULL;
+       }
+
+       if (!rtg) {
+               spin_lock(&mp->m_zone_list_lock);
+               rtg = xfs_find_free_zone(mp);
+               spin_unlock(&mp->m_zone_list_lock);
+
+               if (rtg)
+                       trace_xfs_gc_zone_activate(rtg);
+               mp->m_open_gc_zone = rtg;
+       }
+
+       return rtg;
+}
+
+static unsigned int
+xfs_zone_gc_scratch_available(
+       struct xfs_zone_gc_data *data)
+{
+       return XFS_GC_CHUNK_SIZE - data->scratch[data->scratch_idx].offset;
+}
+
+static bool
+xfs_zone_gc_space_available(
+       struct xfs_zone_gc_data *data)
+{
+       struct xfs_rtgroup      *rtg;
+
+       rtg = xfs_select_gc_zone(data->mp);
+       if (!rtg)
+               return false;
+       return rtg->rtg_write_pointer < rtg->rtg_extents &&
+               xfs_zone_gc_scratch_available(data);
+}
+
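+/*
+ * I/O completion for GC reads and writes:  queue the bio for the GC thread
+ * and wake it up.
+ */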
+static void
+xfs_zone_gc_end_io(
+       struct bio              *bio)
+{
+       struct xfs_zone_gc_data *data = bio->bi_private;
+       unsigned long           flags;
+
+       spin_lock_irqsave(&data->list_lock, flags);
+       if (bio_op(bio) == REQ_OP_READ)
+               bio_list_add(&data->read_done, bio);
+       else
+               bio_list_add(&data->write_done, bio);
+       wake_up_process(data->mp->m_zone_gc_thread);
+       spin_unlock_irqrestore(&data->list_lock, flags);
+}
+
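+/*
+ * Allocate space in the GC zone for relocating up to *count_fsb blocks,
+ * limited by the available scratch space and the reserved block pools, and
+ * advance the GC zone's write pointer accordingly.
+ */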
+static bool
+xfs_zone_gc_allocate(
+       struct xfs_zone_gc_data *data,
+       xfs_extlen_t            *count_fsb,
+       xfs_daddr_t             *daddr,
+       bool                    *is_seq)
+{
+       struct xfs_mount        *mp = data->mp;
+       xfs_rtxnum_t            rtxres, rtxlen;
+       xfs_rgblock_t           rgbno = 0;
+       struct xfs_rtgroup      *rtg;
+
+       rtg = xfs_select_gc_zone(mp);
+       if (!rtg)
+               return false;
+
+       *count_fsb = min(*count_fsb,
+               XFS_B_TO_FSB(mp, xfs_zone_gc_scratch_available(data)));
+
+       /*
+        * Directly allocate GC blocks from the reserved pool.
+        *
+        * If we'd take them from the normal pool we could be stealing blocks
+        * from a regular writer, which would then have to wait for GC and
+        * deadlock.
+        */
+       spin_lock(&mp->m_sb_lock);
+       rtxres = min(mp->m_resblks[FREE_RTEXTENTS].avail,
+                    mp->m_resblks[FREE_RTAVAILABLE].avail);
+       rtxlen = min3(rtxres,
+                     rtg->rtg_extents - rtg->rtg_write_pointer,
+                     xfs_extlen_to_rtxlen(mp, *count_fsb));
+       mp->m_resblks[FREE_RTEXTENTS].avail -= rtxlen;
+       mp->m_resblks[FREE_RTAVAILABLE].avail -= rtxlen;
+       spin_unlock(&mp->m_sb_lock);
+
+       if (!rtxlen)
+               return false;
+       *count_fsb = xfs_rtxlen_to_extlen(mp, rtxlen);
+       *is_seq = test_bit(RTG_F_SEQUENTIAL, &rtg->rtg_flags);
+       if (!*is_seq)
+               rgbno = rtg->rtg_write_pointer;
+       rtg->rtg_write_pointer += *count_fsb;
+       *daddr = xfs_gbno_to_daddr(&rtg->rtg_group, rgbno);
+       return true;
+}
+
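+/*
+ * Start moving the next chunk:  look up the victim mapping, grab the owning
+ * inode, allocate new blocks in the GC zone and read the old data into the
+ * scratch folio.
+ */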
+static bool
+xfs_zone_gc_start_chunk(
+       struct xfs_zone_gc_data *data,
+       struct xfs_zone_gc_iter *iter)
+{
+       struct xfs_mount        *mp = data->mp;
+       struct block_device     *bdev = mp->m_rtdev_targp->bt_bdev;
+       struct xfs_rmap_irec    irec;
+       struct xfs_gc_bio       *chunk;
+       struct xfs_inode        *ip;
+       struct bio              *bio;
+       xfs_daddr_t             daddr;
+       bool                    is_seq;
+
+       if (xfs_is_shutdown(mp))
+               return false;
+
+       if (!xfs_zone_gc_iter_next(mp, iter, &irec, &ip))
+               return false;
+       if (!xfs_zone_gc_allocate(data, &irec.rm_blockcount, &daddr, &is_seq)) {
+               xfs_irele(ip);
+               return false;
+       }
+
+       bio = bio_alloc_bioset(bdev, 1, REQ_OP_READ, GFP_NOFS, &data->bio_set);
+
+       chunk = container_of(bio, struct xfs_gc_bio, bio);
+       chunk->ip = ip;
+       chunk->offset = XFS_FSB_TO_B(mp, irec.rm_offset);
+       chunk->len = XFS_FSB_TO_B(mp, irec.rm_blockcount);
+       chunk->old_startblock =
+               xfs_rgbno_to_rtb(iter->victim_rtg, irec.rm_startblock);
+       chunk->new_daddr = daddr;
+       chunk->is_seq = is_seq;
+       chunk->scratch = &data->scratch[data->scratch_idx];
+
+       bio->bi_iter.bi_sector = xfs_rtb_to_daddr(mp, chunk->old_startblock);
+       bio->bi_end_io = xfs_zone_gc_end_io;
+       bio->bi_private = data;
+       bio_add_folio_nofail(bio, chunk->scratch->folio, chunk->len,
+                       chunk->scratch->offset);
+       chunk->scratch->offset += chunk->len;
+       if (chunk->scratch->offset == XFS_GC_CHUNK_SIZE) {
+               data->scratch_idx =
+                       (data->scratch_idx + 1) % XFS_ZONE_GC_NR_SCRATCH;
+       }
+       data->inflight++;
+       xfs_zone_gc_iter_advance(iter, irec.rm_blockcount);
+
+       submit_bio(bio);
+       return true;
+}
+
+static void
+xfs_zone_gc_free_chunk(
+       struct xfs_zone_gc_data *data,
+       struct xfs_gc_bio       *chunk)
+{
+       data->inflight--;
+       xfs_irele(chunk->ip);
+       bio_put(&chunk->bio);
+}
+
+static void
+xfs_gc_submit_write(
+       struct xfs_zone_gc_data *data,
+       struct xfs_gc_bio       *chunk)
+{
+       if (chunk->is_seq) {
+               chunk->bio.bi_opf &= ~REQ_OP_WRITE;
+               chunk->bio.bi_opf |= REQ_OP_ZONE_APPEND;
+       }
+       chunk->bio.bi_iter.bi_sector = chunk->new_daddr;
+       chunk->bio.bi_end_io = xfs_zone_gc_end_io;
+       chunk->bio.bi_private = data;
+       submit_bio(&chunk->bio);
+}
+
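+/*
+ * Zone append bios are limited by the device's maximum zone append size, so
+ * split the relocation write into appropriately sized chunks.
+ */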
+static struct xfs_gc_bio *
+xfs_gc_split_write(
+       struct xfs_zone_gc_data *data,
+       struct xfs_gc_bio       *chunk)
+{
+       struct queue_limits     *lim =
+               &bdev_get_queue(chunk->bio.bi_bdev)->limits;
+       struct xfs_gc_bio       *split_chunk;
+       int                     split_sectors;
+       unsigned int            split_len;
+       struct bio              *split;
+       unsigned int            nsegs;
+
+       if (!chunk->is_seq)
+               return NULL;
+
+       split_sectors = bio_split_rw_at(&chunk->bio, lim, &nsegs,
+               queue_limits_max_zone_append_sectors(lim) << SECTOR_SHIFT);
+       if (!split_sectors)
+               return NULL;
+       split_len = split_sectors << SECTOR_SHIFT;
+
+       split = bio_split(&chunk->bio, split_sectors, GFP_NOFS, &data->bio_set);
+       split_chunk = container_of(split, struct xfs_gc_bio, bio);
+       ihold(VFS_I(chunk->ip));
+       split_chunk->ip = chunk->ip;
+       split_chunk->is_seq = chunk->is_seq;
+       split_chunk->scratch = chunk->scratch;
+       split_chunk->offset = chunk->offset;
+       split_chunk->len = split_len;
+       split_chunk->old_startblock = chunk->old_startblock;
+       split_chunk->new_daddr = chunk->new_daddr;
+
+       chunk->offset += split_len;
+       chunk->len -= split_len;
+       chunk->old_startblock += XFS_B_TO_FSB(data->mp, split_len);
+
+       data->inflight++;
+       return split_chunk;
+}
+
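+/*
+ * A GC read has completed:  reuse the bio to write the data out to its new
+ * location, splitting it up for the zone append limits as needed.
+ */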
+static void
+xfs_zone_gc_write_chunk(
+       struct xfs_zone_gc_data *data,
+       struct bio              *bio)
+{
+       struct xfs_gc_bio       *chunk =
+               container_of(bio, struct xfs_gc_bio, bio);
+       struct xfs_mount        *mp = chunk->ip->i_mount;
+       unsigned int            folio_offset = bio->bi_io_vec->bv_offset;
+       struct xfs_gc_bio       *split_chunk;
+
+       if (bio->bi_status)
+               xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
+       if (xfs_is_shutdown(mp)) {
+               xfs_zone_gc_free_chunk(data, chunk);
+               return;
+       }
+
+       bio_reset(bio, mp->m_rtdev_targp->bt_bdev, REQ_OP_WRITE);
+       bio_add_folio_nofail(bio, chunk->scratch->folio, chunk->len,
+                       folio_offset);
+
+       while ((split_chunk = xfs_gc_split_write(data, chunk)))
+               xfs_gc_submit_write(data, split_chunk);
+       xfs_gc_submit_write(data, chunk);
+}
+
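+/*
+ * A GC write has completed:  wait out concurrent direct I/O and layouts,
+ * then remap the file extent from the old to the new location.
+ */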
+static void
+xfs_zone_gc_finish_chunk(
+       struct xfs_zone_gc_data *data,
+       struct bio              *bio)
+{
+       struct xfs_gc_bio       *chunk =
+               container_of(bio, struct xfs_gc_bio, bio);
+       uint                    iolock = XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL;
+       struct xfs_inode        *ip = chunk->ip;
+       struct xfs_mount        *mp = ip->i_mount;
+       int                     error;
+
+       if (bio->bi_status)
+               xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
+       if (xfs_is_shutdown(mp)) {
+               xfs_zone_gc_free_chunk(data, chunk);
+               return;
+       }
+
+       chunk->scratch->freed += chunk->len;
+       if (chunk->scratch->freed == chunk->scratch->offset) {
+               chunk->scratch->offset = 0;
+               chunk->scratch->freed = 0;
+       }
+
+       /*
+        * Cycle through the iolock and wait for direct I/O and layouts to
+        * ensure no one is reading from the old mapping before it goes away.
+        */
+       xfs_ilock(ip, iolock);
+       error = xfs_break_layouts(VFS_I(ip), &iolock, BREAK_UNMAP);
+       if (!error)
+               inode_dio_wait(VFS_I(ip));
+       xfs_iunlock(ip, iolock);
+       if (error)
+               goto free;
+
+       if (chunk->is_seq)
+               chunk->new_daddr = bio->bi_iter.bi_sector;
+       error = xfs_zoned_end_io(ip, chunk->offset, chunk->len,
+                       chunk->new_daddr, chunk->old_startblock);
+free:
+       if (error)
+               xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
+       xfs_zone_gc_free_chunk(data, chunk);
+}
+
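+/*
+ * A zone reset has completed:  the zone is free again, so put it back onto
+ * the free list and make its blocks available for new reservations.
+ */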
+static void
+xfs_zone_gc_finish_reset(
+       struct xfs_rtgroup      *rtg,
+       struct bio              *bio)
+{
+       struct xfs_mount        *mp = rtg_mount(rtg);
+
+       if (bio->bi_status) {
+               xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
+               goto out;
+       }
+
+       spin_lock(&mp->m_zone_list_lock);
+       list_add_tail(&rtg->rtg_entry, &mp->m_free_zones);
+       atomic_inc(&mp->m_nr_free_zones);
+       spin_unlock(&mp->m_zone_list_lock);
+
+       xfs_zoned_add_available(mp, rtg->rtg_extents);
+
+       wake_up_all(&mp->m_zone_wait);
+out:
+       bio_put(bio);
+}
+
+static void
+xfs_zone_reset_end_io(
+       struct bio              *bio)
+{
+       struct xfs_zone_gc_data *data =
+               container_of(bio, struct xfs_gc_bio, bio)->data;
+       struct xfs_rtgroup      *rtg = bio->bi_private;
+       unsigned long           flags;
+
+       spin_lock_irqsave(&data->list_lock, flags);
+       bio_list_add(&data->reset_done, bio);
+       data->inflight--;
+       wake_up_process(rtg_mount(rtg)->m_zone_gc_thread);
+       spin_unlock_irqrestore(&data->list_lock, flags);
+}
+
+static struct bio *
+xfs_prepare_zone_reset(
+       struct xfs_rtgroup      *rtg,
+       struct xfs_zone_gc_data *data)
+{
+       struct xfs_mount        *mp = rtg_mount(rtg);
+       struct block_device     *bdev = mp->m_rtdev_targp->bt_bdev;
+       struct bio              *bio;
+
+       spin_lock(&rtg->rtg_alloc_lock);
+       rtg->rtg_write_pointer = 0;
+       spin_unlock(&rtg->rtg_alloc_lock);
+
+       xfs_ilock(rtg->rtg_inodes[XFS_RTGI_RMAP], XFS_ILOCK_EXCL);
+       ASSERT(*xfs_zone_used_counter(rtg) == 0);
+       rtg->rtg_written = 0;
+       *xfs_zone_last_written(rtg) = 0;
+       xfs_iunlock(rtg->rtg_inodes[XFS_RTGI_RMAP], XFS_ILOCK_EXCL);
+
+       trace_xfs_zone_reset(rtg);
+
+       bio = bio_alloc_bioset(bdev, 0, REQ_OP_ZONE_RESET, GFP_NOFS,
+                       data ? &data->bio_set : &fs_bio_set);
+       bio->bi_iter.bi_sector = xfs_gbno_to_daddr(&rtg->rtg_group, 0);
+       if (!test_bit(RTG_F_SEQUENTIAL, &rtg->rtg_flags)) {
+               bio->bi_opf = REQ_OP_DISCARD | REQ_SYNC;
+               bio->bi_iter.bi_size = XFS_FSB_TO_B(mp, rtg->rtg_extents);
+       }
+       return bio;
+}
+
+static void
+xfs_reset_empty_zones(
+       struct xfs_zone_gc_data *data,
+       struct list_head        *empty_zones)
+{
+       struct xfs_rtgroup      *rtg;
+       struct bio              *bio;
+
+       if (blkdev_issue_flush(data->mp->m_rtdev_targp->bt_bdev) < 0) {
+               xfs_force_shutdown(data->mp, SHUTDOWN_META_IO_ERROR);
+               return;
+       }
+
+       while ((rtg = list_first_entry_or_null(empty_zones,
+                       struct xfs_rtgroup, rtg_entry))) {
+               list_del_init(&rtg->rtg_entry);
+
+               xfs_log_force_inode(rtg->rtg_inodes[XFS_RTGI_RMAP]);
+
+               bio = xfs_prepare_zone_reset(rtg, data);
+               bio->bi_private = rtg;
+               bio->bi_end_io = xfs_zone_reset_end_io;
+               data->inflight++;
+               container_of(bio, struct xfs_gc_bio, bio)->data = data;
+               submit_bio(bio);
+       }
+}
+
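+/*
+ * One pass of the GC worker:  process completed resets, reads and writes,
+ * reset zones that have been fully emptied, and start relocating new chunks
+ * while a victim zone and space in the GC zone are available.
+ */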
+static bool
+xfs_zone_gc_handle_work(
+       struct xfs_zone_gc_data *data,
+       struct xfs_zone_gc_iter *iter)
+{
+       struct bio_list         read_done = BIO_EMPTY_LIST;
+       struct bio_list         write_done = BIO_EMPTY_LIST;
+       struct bio_list         reset_done = BIO_EMPTY_LIST;
+       LIST_HEAD               (empty_zones);
+       struct blk_plug         plug;
+       struct bio              *bio;
+
+       spin_lock_irq(&data->list_lock);
+       bio_list_merge_init(&read_done, &data->read_done);
+       bio_list_merge_init(&write_done, &data->write_done);
+       bio_list_merge_init(&reset_done, &data->reset_done);
+       spin_unlock_irq(&data->list_lock);
+
+       spin_lock(&data->mp->m_zone_list_lock);
+       list_splice_init(&data->mp->m_emptied_zones, &empty_zones);
+       spin_unlock(&data->mp->m_zone_list_lock);
+
+       if (!xfs_zone_reclaim_pick(data->mp, iter) ||
+           !xfs_zone_gc_space_available(data)) {
+               if (bio_list_empty(&read_done) &&
+                   bio_list_empty(&write_done) &&
+                   bio_list_empty(&reset_done) &&
+                   list_empty(&empty_zones))
+                       return false;
+       }
+
+       __set_current_state(TASK_RUNNING);
+       try_to_freeze();
+
+       while ((bio = bio_list_pop(&reset_done)))
+               xfs_zone_gc_finish_reset(bio->bi_private, bio);
+
+       if (!list_empty(&empty_zones))
+               xfs_reset_empty_zones(data, &empty_zones);
+
+       blk_start_plug(&plug);
+       while ((bio = bio_list_pop(&read_done)))
+               xfs_zone_gc_write_chunk(data, bio);
+       blk_finish_plug(&plug);
+
+       while ((bio = bio_list_pop(&write_done)))
+               xfs_zone_gc_finish_chunk(data, bio);
+
+       blk_start_plug(&plug);
+       while (xfs_zone_gc_start_chunk(data, iter))
+               ;
+       blk_finish_plug(&plug);
+       return true;
+}
+
+/*
+ * XXX: This breaks reflinks and thus duplicates data that was shared by
+ * multiple owners before.
+ */
+static int
+xfs_zoned_gcd(
+       void                    *private)
+{
+       struct xfs_mount        *mp = private;
+       unsigned int            nofs_flag;
+       struct xfs_zone_gc_data *data;
+       struct xfs_zone_gc_iter *iter;
+
+       data = xfs_zone_gc_data_alloc(mp);
+       if (!data)
+               return -ENOMEM;
+       iter = kzalloc(sizeof(*iter), GFP_KERNEL);
+       if (!iter) {
+               xfs_zone_gc_data_free(data);
+               return -ENOMEM;
+       }
+
+       nofs_flag = memalloc_nofs_save();
+       set_freezable();
+
+       for (;;) {
+               set_current_state(TASK_INTERRUPTIBLE | TASK_FREEZABLE);
+               xfs_set_in_gc(mp);
+               if (xfs_zone_gc_handle_work(data, iter))
+                       continue;
+
+               if (!data->inflight) {
+                       xfs_clear_in_gc(mp);
+                       xfs_zoned_wake_all(mp);
+
+                       if (kthread_should_stop()) {
+                               __set_current_state(TASK_RUNNING);
+                               break;
+                       }
+
+                       if (kthread_should_park()) {
+                               __set_current_state(TASK_RUNNING);
+                               kthread_parkme();
+                               continue;
+                       }
+               }
+
+               schedule();
+       }
+       xfs_clear_in_gc(mp);
+
+       if (iter->victim_rtg)
+               xfs_rtgroup_rele(iter->victim_rtg);
+       if (mp->m_open_gc_zone)
+               xfs_rtgroup_rele(mp->m_open_gc_zone);
+
+       memalloc_nofs_restore(nofs_flag);
+       kfree(iter);
+       xfs_zone_gc_data_free(data);
+       return 0;
+}
+
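+/*
+ * Pick the open zone with the lowest write pointer as the GC zone when no
+ * free zone is left at mount time.
+ */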
+static struct xfs_rtgroup *
+xfs_pick_open_zone_for_gc(
+       struct xfs_mount        *mp)
+{
+       struct xfs_rtgroup      *rtg, *found = NULL;
+
+       list_for_each_entry(rtg, &mp->m_open_zones, rtg_entry) {
+               if (!found)
+                       found = rtg;
+               else if (rtg->rtg_write_pointer < found->rtg_write_pointer)
+                       found = rtg;
+       }
+
+       return found;
+}
+
+void
+xfs_zone_gc_start(
+       struct xfs_mount        *mp)
+{
+       if (xfs_has_zoned(mp))
+               kthread_unpark(mp->m_zone_gc_thread);
+}
+
+void
+xfs_zone_gc_stop(
+       struct xfs_mount        *mp)
+{
+       if (xfs_has_zoned(mp))
+               kthread_park(mp->m_zone_gc_thread);
+}
+
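+/*
+ * Callback for blkdev_report_zones:  match each hardware zone to its
+ * realtime group and validate the in-memory state against it.
+ */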
+static int
+xfs_get_zone_info_cb(
+       struct blk_zone         *zone,
+       unsigned int            idx,
+       void                    *data)
+{
+       struct xfs_mount        *mp = data;
+       xfs_fsblock_t           zsbno = xfs_daddr_to_rtb(mp, zone->start);
+       xfs_rgnumber_t          rgno;
+       struct xfs_rtgroup      *rtg;
+       int                     error;
+
+       if (xfs_rtb_to_rgbno(mp, zsbno) != 0) {
+               xfs_warn(mp, "mismatched zone start 0x%llx.", zsbno);
+               return -EFSCORRUPTED;
+       }
+
+       rgno = xfs_rtb_to_rgno(mp, zsbno);
+       rtg = xfs_rtgroup_get(mp, rgno);
+       if (!rtg) {
+               xfs_warn(mp, "realtime group not found for zone %u.", rgno);
+               return -EFSCORRUPTED;
+       }
+       error = xfs_zone_validate(zone, rtg);
+       xfs_rtgroup_put(rtg);
+       return error;
+}
+
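+/*
+ * Set up the in-core state for a zone at mount time and classify it as
+ * free, open or reclaimable.
+ */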
+static int
+xfs_init_zone(
+       struct xfs_rtgroup      *rtg,
+       uint64_t                *available,
+       uint64_t                *freedblocks)
+{
+       struct xfs_mount        *mp = rtg_mount(rtg);
+       uint64_t                used = *xfs_zone_used_counter(rtg);
+
+       if (rtg->rtg_write_pointer == rtg->rtg_extents && used == 0) {
+               struct bio      *bio;
+               int             error;
+
+               bio = xfs_prepare_zone_reset(rtg, NULL);
+               error = submit_bio_wait(bio);
+               bio_put(bio);
+               if (error)
+                       return error;
+       } else {
+               /*
+                * For sequential write required zones, xfs_get_zone_info_cb
+                * initializes rtg_write_pointer to the hardware write pointer.
+                *
+                * For conventional zones we initialize it to the last recorded
+                * write, as we don't know what actually got written, only what
+                * we were able to record in the I/O completion handler.
+                */
+               if (!test_bit(RTG_F_SEQUENTIAL, &rtg->rtg_flags))
+                       rtg->rtg_write_pointer = *xfs_zone_last_written(rtg);
+
+               /*
+                * There can't be any I/O in flight we need to care about at
+                * mount time, so treat the write pointer as the completed
+                * write counter.
+                */
+               rtg->rtg_written = rtg->rtg_write_pointer;
+       }
+
+       if (rtg->rtg_write_pointer == 0) {
+               /* zone is free */
+               list_add_tail(&rtg->rtg_entry, &mp->m_free_zones);
+               atomic_inc(&mp->m_nr_free_zones);
+               *available += rtg->rtg_extents;
+       } else if (rtg->rtg_write_pointer < rtg->rtg_extents) {
+               /* zone is open */
+               list_add(&rtg->rtg_entry, &mp->m_open_zones);
+               mp->m_nr_open_zones++;
+               set_bit(RTG_F_OPEN, &rtg->rtg_flags);
+               *available += (rtg->rtg_extents - rtg->rtg_write_pointer);
+               *freedblocks += (rtg->rtg_write_pointer) - used;
+       } else if (used < rtg->rtg_extents) {
+               /* zone fully written, but has freed blocks */
+               xfs_group_set_mark(&rtg->rtg_group, XFS_RTG_RECLAIMABLE);
+               *freedblocks += (rtg->rtg_extents - used);
+       }
+
+       return 0;
+}
+
+/*
+ * Calculate the max open zone limit based on the number of backing zones
+ * available.
+ */
+static inline uint32_t
+xfs_max_open_zones(
+       struct xfs_mount        *mp)
+{
+       unsigned int            max_open, max_open_data_zones;
+       /*
+        * We need two zones for every open data zone:  the open zone itself
+        * and one in reserve, as we don't reclaim open zones.  One data zone
+        * and its spare are included in XFS_MIN_ZONES.
+        */
+       max_open_data_zones = (mp->m_sb.sb_rgcount - XFS_MIN_ZONES) / 2 + 1;
+       max_open = max_open_data_zones + XFS_OPEN_GC_ZONES;
+
+       /*
+        * Cap the max open limit to 1/4 of available space
+        */
+       max_open = min(max_open, mp->m_sb.sb_rgcount / 4);
+
+       return max(XFS_MIN_OPEN_ZONES, max_open);
+}
+
+int
+xfs_mount_zones(
+       struct xfs_mount        *mp)
+{
+       struct xfs_buftarg      *bt = mp->m_rtdev_targp;
+       unsigned int            bdev_open_zones;
+       int64_t                 available = 0, freedblocks = 0;
+       struct xfs_rtgroup      *rtg = NULL;
+       int                     error;
+
+       if (!bt) {
+               xfs_notice(mp, "RT device missing.");
+               return -EINVAL;
+       }
+
+       if (!xfs_has_rtgroups(mp) || !xfs_has_rmapbt(mp)) {
+               xfs_notice(mp, "invalid flag combination.");
+               return -EFSCORRUPTED;
+       }
+       if (mp->m_sb.sb_rextsize != 1) {
+               xfs_notice(mp, "zoned file systems do not support rextsize.");
+               return -EFSCORRUPTED;
+       }
+       if (mp->m_sb.sb_rgcount < XFS_MIN_ZONES) {
+               xfs_notice(mp,
+"zoned file systems need to have at least %d zones.", XFS_MIN_ZONES);
+               return -EFSCORRUPTED;
+       }
+
+       /*
+        * Normally we pick the open zone limit that the device reports.  If
+        * there isn't one, let the user pick one from the command line.
+        *
+        * If the device doesn't report an open zone limit and there is no
+        * override, allow holding about half of the zones open.  In theory we
+        * could allow all of them to be open, but at that point we run into GC
+        * deadlocks because we (at least currently) can't reclaim open zones.
+        *
+        * When used on conventional SSDs a lower open limit is advisable as
+        * we'll otherwise overwhelm the FTL just as much as a conventional
+        * block allocator would.
+        *
+        * Note: To debug the open zone management code, force max_open to
+        * 1 here.
+        */
+       bdev_open_zones = bdev_max_open_zones(bt->bt_bdev);
+       if (bdev_open_zones && !mp->m_max_open_zones)
+               mp->m_max_open_zones = bdev_open_zones;
+       if (mp->m_max_open_zones) {
+               if (mp->m_max_open_zones < XFS_MIN_OPEN_ZONES) {
+                       xfs_notice(mp, "need at least %d open zones.",
+                               XFS_MIN_OPEN_ZONES);
+                       return -EIO;
+               }
+               if (bdev_open_zones && bdev_open_zones < mp->m_max_open_zones) {
+                       xfs_warn(mp, "device only supports %u open zones.",
+                               bdev_open_zones);
+                       mp->m_max_open_zones = bdev_open_zones;
+               }
+               if (mp->m_max_open_zones > xfs_max_open_zones(mp)) {
+                       mp->m_max_open_zones = xfs_max_open_zones(mp);
+                       xfs_info(mp,
+"limiting open zones to %u due to total zone count (%u)",
+                               mp->m_max_open_zones, mp->m_sb.sb_rgcount);
+               }
+       } else {
+               mp->m_max_open_zones = xfs_max_open_zones(mp);
+       }
+
+       INIT_LIST_HEAD(&mp->m_free_zones);
+       INIT_LIST_HEAD(&mp->m_open_zones);
+       INIT_LIST_HEAD(&mp->m_emptied_zones);
+       INIT_LIST_HEAD(&mp->m_reclaim_reservations);
+       spin_lock_init(&mp->m_zone_list_lock);
+       spin_lock_init(&mp->m_reservation_lock);
+       init_waitqueue_head(&mp->m_zone_wait);
+
+       xfs_info(mp, "%u zones of %u blocks each (%d max open)",
+                mp->m_sb.sb_rgcount, mp->m_groups[XG_TYPE_RTG].blocks,
+                mp->m_max_open_zones);
+
+       /*
+        * Sync our own information with the hardware zone state.
+        */
+       if (bdev_is_zoned(bt->bt_bdev)) {
+               if (!IS_ENABLED(CONFIG_BLK_DEV_ZONED)) {
+                       xfs_warn(mp,
+"zoned device support requires CONFIG_BLK_DEV_ZONED");
+                       return -EINVAL;
+               }
+               error = blkdev_report_zones(bt->bt_bdev, 0, mp->m_sb.sb_rgcount,
+                                           xfs_get_zone_info_cb, mp);
+               if (error < 0)
+                       return error;
+       }
+
+       mp->m_zone_gc_thread = kthread_create(xfs_zoned_gcd, mp,
+                               "xfs-zone-gc/%s",
+                               mp->m_super->s_id);
+       if (IS_ERR(mp->m_zone_gc_thread)) {
+               xfs_warn(mp, "unable to create zone gc thread");
+               return PTR_ERR(mp->m_zone_gc_thread);
+       }
+       /* xfs_zone_gc_start will unpark for rw mounts */
+       kthread_park(mp->m_zone_gc_thread);
+
+       while ((rtg = xfs_rtgroup_next(mp, rtg))) {
+               error = xfs_init_zone(rtg, &available, &freedblocks);
+               if (error)
+                       goto out_unlink_zones;
+       }
+
+       /*
+        * XXX: convert to rtxlen.  Or just give up on the conversion because
+        * we have a 1:1 mapping.
+        */
+       percpu_counter_set(&mp->m_free[FREE_RTAVAILABLE], available);
+       percpu_counter_set(&mp->m_free[FREE_RTEXTENTS],
+                       available + freedblocks);
+
+       /*
+        * If there are no free zones available for GC, pick the open zone with
+        * the least used space to GC into.
+        */
+       if (list_empty(&mp->m_free_zones)) {
+               rtg = xfs_pick_open_zone_for_gc(mp);
+               if (!rtg) {
+                       error = -EINVAL;
+                       goto out_unlink_zones;
+               }
+               list_del_init(&rtg->rtg_entry);
+               mp->m_nr_open_zones--;
+               clear_bit(RTG_F_OPEN, &rtg->rtg_flags);
+               mp->m_open_gc_zone = rtg;
+       }
+       return 0;
+
+out_unlink_zones:
+       rtg = NULL;
+       while ((rtg = xfs_rtgroup_next(mp, rtg)))
+               list_del_init(&rtg->rtg_entry);
+       return error;
+}
+
+void
+xfs_unmount_zones(
+       struct xfs_mount        *mp)
+{
+       struct xfs_rtgroup      *rtg = NULL;
+
+       kthread_stop(mp->m_zone_gc_thread);
+       while ((rtg = xfs_rtgroup_next(mp, rtg)))
+               list_del_init(&rtg->rtg_entry);
+}