WARNING: this is early prototype code.
The zoned allocator works by handing out data blocks to the direct or
buffered write code at the place where XFS currently does block
allocations. It does not actually insert them into the bmap extent tree
at this point, but only after I/O completion, when we know the block number.
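As a rough sketch of that flow (condensed from the aops and reflink hunks
below, with locking and error handling elided), the I/O completion path
records where the zone append landed and only then updates the extent tree:

    /* xfs_end_bio(): remember the physical location of the zone append */
    ioend->io_sector = bio->bi_iter.bi_sector;

    /* xfs_end_ioend(), for zoned inodes */
    error = xfs_zoned_end_io(ip, offset, size, ioend->io_sector,
            NULLFSBLOCK);

xfs_zoned_end_io() then walks the written range in small transactions,
unmapping any old blocks and mapping the newly written ones into the
data fork.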
The zoned allocator works on any kind of device, including conventional
devices or conventional zones, by using a crude write pointer emulation.
For zoned devices active zone management is fully supported, as is
zone capacity < zone size.
The two major limitations are:
- there is no support for unwritten extents and thus persistent
file preallocations from fallocate(). This is inherent to an
always out-of-place write scheme, as there is no way to persistently
preallocate blocks for an indefinite number of overwrites
- because the metadata blocks and data blocks are on different
devices, you can run out of space for metadata while having plenty
of space for data and vice versa. This is inherent to a scheme
that uses different devices or pools for each.
For zoned file systems we reserve the free extents before taking the
iolock, so that if we have to force garbage collection it happens before
the iolock is held. This is done because GC has to take the iolock after
it has moved data to a new place, which could otherwise deadlock.
This unfortunately has to exclude block zeroing, as for truncate we are
called with the iolock (aka i_rwsem) already held. As zeroing only ever
touches a single block at a time, or up to two total per syscall in the
free_file_range case, we deal with that by just stealing the blocks, but
failing the allocation if we'd have to wait for GC.
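The resulting calling convention for the normal write paths roughly looks
like this (condensed from the xfs_file_buffered_write_zoned() hunk below;
"blocks" and "flags" stand in for the per-caller values, and error handling
is trimmed):

    struct xfs_zone_alloc_ctx ac;

    /* reserve space, possibly forcing GC, before taking the iolock */
    error = xfs_zoned_space_reserve(ip, blocks, flags, &ac);
    if (error)
            return error;

    xfs_ilock(ip, XFS_IOLOCK_EXCL);
    /* ... do the write, consuming ac.reserved_blocks ... */
    xfs_iunlock(ip, XFS_IOLOCK_EXCL);

    /* hand back whatever we did not use */
    xfs_zoned_space_unreserve(ip, &ac);

Truncate is the exception described above, where the reservation has to be
done with the iolock already held and thus must not wait for GC.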
Add a new RTAVAILABLE counter of blocks that are actually directly
available to be written into, in addition to the classic free counter.
Only allow a write to go ahead if it has blocks available to write into,
and otherwise wait for GC. This also requires tweaking the need-GC
condition a bit, as we now always need to run GC if someone is waiting
for space.
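The difference between the two counters is that freed blocks in
already-written zones do not become writable again until GC has emptied
and reset their zone, so a write has to gate on RTAVAILABLE rather than
on the classic free space. Conceptually (a sketch only, not the literal
code; "needed" and xfs_zoned_wait_for_gc() are placeholders for the real
logic in xfs_zone_alloc.c):

    if (xfs_sum_freecounter(mp, FREE_RTAVAILABLE) < needed)
            return xfs_zoned_wait_for_gc(mp);       /* placeholder */
    /* enough directly writable space, let the write go ahead */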
Thanks to Hans Holmberg <hans.holmberg@wdc.com> for lots of fixes
and improvements.
Co-developed-by: Hans Holmberg <hans.holmberg@wdc.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
xfs-$(CONFIG_XFS_RT) += $(addprefix libxfs/, \
xfs_rtbitmap.o \
xfs_rtgroup.o \
+ xfs_zones.o \
)
# highlevel code
xfs_quotaops.o
# xfs_rtbitmap is shared with libxfs
-xfs-$(CONFIG_XFS_RT) += xfs_rtalloc.o
+xfs-$(CONFIG_XFS_RT) += xfs_rtalloc.o \
+ xfs_zone_alloc.o \
+ xfs_zone_gc.o
xfs-$(CONFIG_XFS_POSIX_ACL) += xfs_acl.o
xfs-$(CONFIG_SYSCTL) += xfs_sysctl.o
#include "xfs_symlink_remote.h"
#include "xfs_inode_util.h"
#include "xfs_rtgroup.h"
+#include "xfs_zone_alloc.h"
struct kmem_cache *xfs_bmap_intent_cache;
* Compute the worst-case number of indirect blocks that will be used
* for ip's delayed extent of length "len".
*/
-STATIC xfs_filblks_t
+xfs_filblks_t
xfs_bmap_worst_indlen(
- xfs_inode_t *ip, /* incore inode pointer */
- xfs_filblks_t len) /* delayed extent length */
+ struct xfs_inode *ip, /* incore inode pointer */
+ xfs_filblks_t len) /* delayed extent length */
{
- int level; /* btree level number */
- int maxrecs; /* maximum record count at this level */
- xfs_mount_t *mp; /* mount structure */
- xfs_filblks_t rval; /* return value */
+ struct xfs_mount *mp = ip->i_mount;
+ int maxrecs = mp->m_bmap_dmxr[0];
+ int level;
+ xfs_filblks_t rval;
- mp = ip->i_mount;
- maxrecs = mp->m_bmap_dmxr[0];
for (level = 0, rval = 0;
level < XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK);
level++) {
/*
* Convert a hole to a delayed allocation.
*/
-STATIC void
+void
xfs_bmap_add_extent_hole_delay(
- xfs_inode_t *ip, /* incore inode pointer */
+ struct xfs_inode *ip, /* incore inode pointer */
int whichfork,
struct xfs_iext_cursor *icur,
- xfs_bmbt_irec_t *new) /* new data to add to file extents */
+ struct xfs_bmbt_irec *new) /* new data to add to file extents */
{
struct xfs_ifork *ifp; /* inode fork pointer */
xfs_bmbt_irec_t left; /* left neighbor extent entry */
fdblocks = indlen;
if (XFS_IS_REALTIME_INODE(ip)) {
+ ASSERT(!xfs_is_zoned_inode(ip));
error = xfs_dec_frextents(mp, xfs_blen_to_rtbxlen(mp, alen));
if (error)
goto out_unreserve_quota;
da_diff = da_old - da_new;
fdblocks = da_diff;
- if (bflags & XFS_BMAPI_REMAP)
+ if (bflags & XFS_BMAPI_REMAP) {
;
- else if (isrt)
- xfs_add_frextents(mp, xfs_blen_to_rtbxlen(mp, del->br_blockcount));
- else
+ } else if (isrt) {
+ xfs_rtxlen_t rtxlen;
+
+ rtxlen = xfs_blen_to_rtbxlen(mp, del->br_blockcount);
+ if (xfs_is_zoned_inode(ip))
+ xfs_zoned_add_available(mp, rtxlen);
+ xfs_add_frextents(mp, rtxlen);
+ } else {
fdblocks += del->br_blockcount;
+ }
xfs_add_fdblocks(mp, fdblocks);
xfs_mod_delalloc(ip, -(int64_t)del->br_blockcount, -da_diff);
irec->br_blockcount))
return __this_address;
}
- if (irec->br_state != XFS_EXT_NORM && whichfork != XFS_DATA_FORK)
- return __this_address;
+ if (irec->br_state != XFS_EXT_NORM) {
+ if (whichfork != XFS_DATA_FORK)
+ return __this_address;
+ if (rtfile && xfs_has_zoned(mp))
+ return __this_address;
+ }
return NULL;
}
struct xfs_inode *ip, int whichfork,
struct xfs_iext_cursor *icur, struct xfs_btree_cur **curp,
struct xfs_bmbt_irec *new, int *logflagsp);
+void xfs_bmap_add_extent_hole_delay(struct xfs_inode *ip, int whichfork,
+ struct xfs_iext_cursor *icur, struct xfs_bmbt_irec *new);
xfs_extlen_t xfs_bmapi_minleft(struct xfs_trans *tp, struct xfs_inode *ip,
int fork);
int xfs_bmap_btalloc_low_space(struct xfs_bmalloca *ap,
struct xfs_alloc_arg *args);
+xfs_filblks_t xfs_bmap_worst_indlen(struct xfs_inode *ip, xfs_filblks_t len);
enum xfs_bmap_intent_type {
XFS_BMAP_MAP = 1,
xfs_extlen_t mod;
int error;
+ ASSERT(!xfs_has_zoned(mp));
ASSERT(rtlen <= XFS_MAX_BMBT_EXTLEN);
mod = xfs_blen_to_rtxoff(mp, rtlen);
end = min(end, rtg->rtg_extents - 1);
+ if (xfs_has_zoned(mp)) {
+ xfs_rtxnum_t wp;
+
+ wp = rtg->rtg_write_pointer * mp->m_sb.sb_rextsize;
+ if (end >= wp) {
+ struct xfs_rtalloc_rec rec = {
+ .ar_startext = max(start, wp),
+ .ar_extcount = end - start + 1,
+ };
+
+ return fn(rtg, tp, &rec, priv);
+ }
+
+ return 0;
+ }
+
/* Iterate the bitmap, looking for discrepancies. */
while (start <= end) {
struct xfs_rtalloc_rec rec;
struct xfs_mount *mp,
xfs_rtbxlen_t rtextents)
{
+ if (xfs_has_zoned(mp))
+ return 0;
return howmany_64(rtextents, xfs_rtbitmap_rtx_per_rbmblock(mp));
}
xfs_rtbxlen_t rextents = xfs_rtbitmap_bitcount(mp);
unsigned long long rsumwords;
+ if (xfs_has_zoned(mp)) {
+ *rsumlevels = 0;
+ return 0;
+ }
+
*rsumlevels = xfs_compute_rextslog(rextents) + 1;
rsumwords = xfs_rtbitmap_blockcount_len(mp, rextents) * (*rsumlevels);
return howmany_64(rsumwords, mp->m_blockwsize);
return -ENOMEM;
xfs_rtgroup_calc_geometry(mp, rtg, rgno, rgcount, rextents);
+ INIT_LIST_HEAD(&rtg->rtg_entry);
+ spin_lock_init(&rtg->rtg_alloc_lock);
error = xfs_group_insert(mp, rtg_group(rtg), rgno, XG_TYPE_RTG);
if (error)
return 0;
}
+/*
+ * Zoned file systems don't have bitmap and summary inodes; instead,
+ * allocations are only tracked in the rmap.
+ *
+ * This means XFS_RTGLOCK_BITMAP(_SHARED) implies that the rmap needs to be
+ * locked instead.
+ */
+static void
+xfs_rtglock_zoned_adjust(
+ struct xfs_rtgroup *rtg,
+ unsigned int *rtglock_flags)
+{
+ if (!xfs_has_zoned(rtg_mount(rtg)))
+ return;
+ if (*rtglock_flags & (XFS_RTGLOCK_BITMAP | XFS_RTGLOCK_BITMAP_SHARED))
+ *rtglock_flags |= XFS_RTGLOCK_RMAP;
+ *rtglock_flags &= ~(XFS_RTGLOCK_BITMAP | XFS_RTGLOCK_BITMAP_SHARED);
+}
+
/* Lock metadata inodes associated with this rt group. */
void
xfs_rtgroup_lock(
ASSERT(!(rtglock_flags & XFS_RTGLOCK_BITMAP_SHARED) ||
!(rtglock_flags & XFS_RTGLOCK_BITMAP));
+ xfs_rtglock_zoned_adjust(rtg, &rtglock_flags);
+
if (rtglock_flags & XFS_RTGLOCK_BITMAP) {
/*
* Lock both realtime free space metadata inodes for a freespace
ASSERT(!(rtglock_flags & XFS_RTGLOCK_BITMAP_SHARED) ||
!(rtglock_flags & XFS_RTGLOCK_BITMAP));
+ xfs_rtglock_zoned_adjust(rtg, &rtglock_flags);
+
if ((rtglock_flags & XFS_RTGLOCK_REFCOUNT) &&
rtg->rtg_inodes[XFS_RTGI_REFCOUNT])
xfs_iunlock(rtg->rtg_inodes[XFS_RTGI_REFCOUNT], XFS_ILOCK_EXCL);
ASSERT(!(rtglock_flags & ~XFS_RTGLOCK_ALL_FLAGS));
ASSERT(!(rtglock_flags & XFS_RTGLOCK_BITMAP_SHARED));
+ xfs_rtglock_zoned_adjust(rtg, &rtglock_flags);
+
if (rtglock_flags & XFS_RTGLOCK_BITMAP) {
xfs_trans_ijoin(tp, rtg->rtg_inodes[XFS_RTGI_BITMAP],
XFS_ILOCK_EXCL);
.sick = XFS_SICK_RG_BITMAP,
.fmt_mask = (1U << XFS_DINODE_FMT_EXTENTS) |
(1U << XFS_DINODE_FMT_BTREE),
+ .enabled = xfs_has_nonzoned,
.create = xfs_rtbitmap_create,
},
[XFS_RTGI_SUMMARY] = {
.sick = XFS_SICK_RG_SUMMARY,
.fmt_mask = (1U << XFS_DINODE_FMT_EXTENTS) |
(1U << XFS_DINODE_FMT_BTREE),
+ .enabled = xfs_has_nonzoned,
.create = xfs_rtsummary_create,
},
[XFS_RTGI_RMAP] = {
* Reads and writes are serialized by the rsumip inode lock.
*/
uint8_t *rtg_rsum_cache;
+
+ unsigned long rtg_flags;
+#define RTG_F_SEQUENTIAL 0
+#define RTG_F_OPEN 1
+
+ spinlock_t rtg_alloc_lock;
+ xfs_rgblock_t rtg_write_pointer;
+ xfs_rgblock_t rtg_written;
+
+ /* zone state entry */
+ struct list_head rtg_entry;
};
static inline struct xfs_rtgroup *to_rtg(struct xfs_group *xg)
return rtg->rtg_group.xg_gno;
}
+#define XFS_RTG_RECLAIMABLE XA_MARK_0
+
/* Passive rtgroup references */
static inline struct xfs_rtgroup *
xfs_rtgroup_get(
#include "xfs_rtgroup.h"
#include "xfs_rtrmap_btree.h"
#include "xfs_rtrefcount_btree.h"
+#include "xfs_rtbitmap.h"
/*
* Physical superblock buffer manipulations. Shared with libxfs in userspace.
xfs_expected_rbmblocks(
struct xfs_sb *sbp)
{
+ if (xfs_sb_is_v5(sbp) &&
+ (sbp->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_ZONED))
+ return 0;
return howmany_64(xfs_extents_per_rbm(sbp),
NBBY * xfs_rtbmblock_size(sbp));
}
* we handle nearly-lockless reservations, so we must use the _positive
* variant here to avoid writing out nonsense frextents.
*/
- if (xfs_has_rtgroups(mp))
+ if (xfs_has_rtgroups(mp) && !xfs_has_zoned(mp))
mp->m_sb.sb_frextents = xfs_sum_freecounter(mp, FREE_RTEXTENTS);
xfs_sb_to_disk(bp->b_addr, &mp->m_sb);
--- /dev/null
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2023 Christoph Hellwig.
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_inode.h"
+#include "xfs_rtgroup.h"
+#include "xfs_zone_alloc.h"
+#include "xfs_zones.h"
+
+static int
+xfs_zone_validate_empty(
+ struct blk_zone *zone,
+ struct xfs_rtgroup *rtg)
+{
+ struct xfs_mount *mp = rtg_mount(rtg);
+
+ if (*xfs_zone_used_counter(rtg) > 0) {
+ xfs_warn(mp, "empty zone %d has non-zero used counter (0x%llx).",
+ rtg_rgno(rtg), *xfs_zone_used_counter(rtg));
+ return -EIO;
+ }
+ return 0;
+}
+
+static int
+xfs_zone_validate_wp(
+ struct blk_zone *zone,
+ struct xfs_rtgroup *rtg)
+{
+ struct xfs_mount *mp = rtg_mount(rtg);
+ xfs_fileoff_t wp_fsb = xfs_daddr_to_rtb(mp, zone->wp);
+
+ if (*xfs_zone_used_counter(rtg) > (uint64_t)rtg->rtg_extents) {
+ xfs_warn(mp, "zone %d has too larged used counter (0x%llx).",
+ rtg_rgno(rtg), *xfs_zone_used_counter(rtg));
+ return -EIO;
+ }
+
+ /*
+ * Always use the hardware write pointer.
+ */
+ rtg->rtg_write_pointer = xfs_rtb_to_rgbno(mp, wp_fsb);
+ if (xfs_rtb_to_rgno(mp, wp_fsb) != rtg_rgno(rtg)) {
+ xfs_warn(mp, "zone %d write pointer (0x%x) outside of zone.",
+ rtg_rgno(rtg), rtg->rtg_write_pointer);
+ return -EFSCORRUPTED;
+ }
+ if (rtg->rtg_write_pointer >= rtg->rtg_extents) {
+ xfs_warn(mp, "zone %d has invalid write pointer (0x%x).",
+ rtg_rgno(rtg), rtg->rtg_write_pointer);
+ return -EFSCORRUPTED;
+ }
+
+ return 0;
+}
+
+static int
+xfs_zone_validate_full(
+ struct blk_zone *zone,
+ struct xfs_rtgroup *rtg)
+{
+ struct xfs_mount *mp = rtg_mount(rtg);
+
+ rtg->rtg_write_pointer = rtg->rtg_extents;
+ if (*xfs_zone_used_counter(rtg) > rtg->rtg_extents) {
+ xfs_warn(mp, "zone %d has too larged used counter (0x%llx).",
+ rtg_rgno(rtg), *xfs_zone_used_counter(rtg));
+ return -EIO;
+ }
+
+ return 0;
+}
+
+static int
+xfs_zone_validate_seq(
+ struct blk_zone *zone,
+ struct xfs_rtgroup *rtg)
+{
+ struct xfs_mount *mp = rtg_mount(rtg);
+
+ set_bit(RTG_F_SEQUENTIAL, &rtg->rtg_flags);
+
+ switch (zone->cond) {
+ case BLK_ZONE_COND_EMPTY:
+ return xfs_zone_validate_empty(zone, rtg);
+ case BLK_ZONE_COND_IMP_OPEN:
+ case BLK_ZONE_COND_EXP_OPEN:
+ case BLK_ZONE_COND_CLOSED:
+ return xfs_zone_validate_wp(zone, rtg);
+ case BLK_ZONE_COND_FULL:
+ return xfs_zone_validate_full(zone, rtg);
+ case BLK_ZONE_COND_NOT_WP:
+ case BLK_ZONE_COND_OFFLINE:
+ case BLK_ZONE_COND_READONLY:
+ xfs_warn(mp, "zone %d has unsupported zone condition 0x%x.",
+ rtg_rgno(rtg), zone->cond);
+ return -EIO;
+ default:
+ xfs_warn(mp, "zone %d has unknown zone condition 0x%x.",
+ rtg_rgno(rtg), zone->cond);
+ return -EIO;
+ }
+}
+
+static int
+xfs_zone_validate_conv(
+ struct blk_zone *zone,
+ struct xfs_rtgroup *rtg)
+{
+ struct xfs_mount *mp = rtg_mount(rtg);
+
+ switch (zone->cond) {
+ case BLK_ZONE_COND_NOT_WP:
+ return 0;
+ default:
+ xfs_warn(mp,
+"conventional zone %d has unsupported zone condition 0x%x.",
+ rtg_rgno(rtg), zone->cond);
+ return -EIO;
+ }
+}
+
+int
+xfs_zone_validate(
+ struct blk_zone *zone,
+ struct xfs_rtgroup *rtg)
+{
+ struct xfs_mount *mp = rtg_mount(rtg);
+ struct xfs_groups *g = &mp->m_groups[XG_TYPE_RTG];
+
+ /*
+ * Check that the zone capacity matches the capacity stored in the
+ * superblock. Note that all zones including the last one must have a
+ * uniform capacity.
+ */
+ if (XFS_BB_TO_FSB(mp, zone->capacity) != g->blocks) {
+ xfs_warn(mp,
+"zone %d capacity (0x%llx) does not match RT group size (0x%x).",
+ rtg_rgno(rtg), XFS_BB_TO_FSB(mp, zone->capacity),
+ g->blocks);
+ return -EIO;
+ }
+
+ if (XFS_BB_TO_FSB(mp, zone->len) != 1 << g->blklog) {
+ xfs_warn(mp,
+"zone %d length (0x%llx) does match geometry (0x%x).",
+ rtg_rgno(rtg), XFS_BB_TO_FSB(mp, zone->len),
+ 1 << g->blklog);
+ }
+
+ switch (zone->type) {
+ case BLK_ZONE_TYPE_CONVENTIONAL:
+ return xfs_zone_validate_conv(zone, rtg);
+ case BLK_ZONE_TYPE_SEQWRITE_REQ:
+ return xfs_zone_validate_seq(zone, rtg);
+ default:
+ xfs_warn(mp, "zoned %d has unsupported type 0x%x.",
+ rtg_rgno(rtg), zone->type);
+ return -EFSCORRUPTED;
+ }
+}
--- /dev/null
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _LIBXFS_ZONES_H
+#define _LIBXFS_ZONES_H
+
+/*
+ * In order to guarantee forward progress for GC we need to reserve at least
+ * two zones: one to move data into and one spare zone to make sure that we
+ * have enough space to relocate a nearly-full zone.
+ * To allow for slightly sloppy accounting for when we need to reserve the
+ * second zone, we actually reserve three as that is easier than doing fully
+ * accurate bookkeeping.
+ */
+#define XFS_GC_ZONES 3U
+
+/*
+ * In addition we need two zones for user writes: one open zone for writing
+ * and one so that we still have available blocks without resetting the open
+ * zone when data in the open zone has been freed.
+ */
+#define XFS_RESERVED_ZONES (XFS_GC_ZONES + 1)
+#define XFS_MIN_ZONES (XFS_RESERVED_ZONES + 1)
+
+/*
+ * Always keep one zone out of the general open zone pool to allow for GC to
+ * happen while other writers are waiting for free space.
+ */
+#define XFS_OPEN_GC_ZONES 1U
+#define XFS_MIN_OPEN_ZONES (XFS_OPEN_GC_ZONES + 1U)
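+/*
+ * With the values above this works out to XFS_RESERVED_ZONES = 4,
+ * XFS_MIN_ZONES = 5 and XFS_MIN_OPEN_ZONES = 2.
+ */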
+
+int xfs_zone_validate(struct blk_zone *zone, struct xfs_rtgroup *rtg);
+
+#endif /* _LIBXFS_ZONES_H */
},
[XFS_SCRUB_TYPE_RTBITMAP] = { /* realtime bitmap */
.type = ST_RTGROUP,
+ .has = xfs_has_nonzoned,
.setup = xchk_setup_rtbitmap,
.scrub = xchk_rtbitmap,
.repair = xrep_rtbitmap,
},
[XFS_SCRUB_TYPE_RTSUM] = { /* realtime summary */
.type = ST_RTGROUP,
+ .has = xfs_has_nonzoned,
.setup = xchk_setup_rtsummary,
.scrub = xchk_rtsummary,
.repair = xrep_rtsummary,
// SPDX-License-Identifier: GPL-2.0
/*
* Copyright (c) 2000-2005 Silicon Graphics, Inc.
- * Copyright (c) 2016-2018 Christoph Hellwig.
+ * Copyright (c) 2016-2023 Christoph Hellwig.
* All Rights Reserved.
*/
#include "xfs.h"
#include "xfs_reflink.h"
#include "xfs_errortag.h"
#include "xfs_error.h"
+#include "xfs_zone_alloc.h"
+#include "xfs_rtgroup.h"
struct xfs_writepage_ctx {
struct iomap_writepage_ctx ctx;
{
struct xfs_inode *ip = XFS_I(ioend->io_inode);
struct xfs_mount *mp = ip->i_mount;
+ bool is_zoned = xfs_is_zoned_inode(ip);
xfs_off_t offset = ioend->io_offset;
size_t size = ioend->io_size;
unsigned int nofs_flag;
error = blk_status_to_errno(ioend->io_bio.bi_status);
if (unlikely(error)) {
if (ioend->io_flags & IOMAP_F_SHARED) {
+ ASSERT(!is_zoned);
xfs_reflink_cancel_cow_range(ip, offset, size, true);
xfs_bmap_punch_delalloc_range(ip, XFS_DATA_FORK, offset,
- offset + size);
+ offset + size, NULL);
}
goto done;
}
/*
* Success: commit the COW or unwritten blocks if needed.
*/
- if (ioend->io_flags & IOMAP_F_SHARED)
+ if (is_zoned)
+ error = xfs_zoned_end_io(ip, offset, size, ioend->io_sector,
+ NULLFSBLOCK);
+ else if (ioend->io_flags & IOMAP_F_SHARED)
error = xfs_reflink_end_cow(ip, offset, size);
else if (ioend->io_type == IOMAP_UNWRITTEN)
error = xfs_iomap_write_unwritten(ip, offset, size, false);
- if (!error && xfs_ioend_is_append(ioend))
+ if (!error && xfs_ioend_is_append(ioend) && !ioend->io_isdirect)
error = xfs_setfilesize(ip, ioend->io_offset, ioend->io_size);
done:
iomap_finish_ioends(ioend, error);
}
}
-STATIC void
+void
xfs_end_bio(
struct bio *bio)
{
struct iomap_ioend *ioend = iomap_ioend_from_bio(bio);
struct xfs_inode *ip = XFS_I(ioend->io_inode);
+ struct xfs_mount *mp = ip->i_mount;
unsigned long flags;
+ if (bio_is_zone_append(bio)) {
+ /*
+ * Record the actually written block number and make sure we
+ * don't merge the first ioened for a zone into the last one
+ * for the previous zone.
+ */
+ ioend->io_sector = bio->bi_iter.bi_sector;
+ if (!(xfs_daddr_to_rtb(mp, ioend->io_sector) %
+ mp->m_groups[XG_TYPE_RTG].blocks))
+ ioend->io_flags |= IOMAP_F_BOUNDARY;
+ }
+
spin_lock_irqsave(&ip->i_ioend_lock, flags);
if (list_empty(&ip->i_ioend_list))
- WARN_ON_ONCE(!queue_work(ip->i_mount->m_unwritten_workqueue,
+ WARN_ON_ONCE(!queue_work(mp->m_unwritten_workqueue,
&ip->i_ioend_work));
list_add_tail(&ioend->io_list, &ip->i_ioend_list);
spin_unlock_irqrestore(&ip->i_ioend_lock, flags);
* folio itself and not the start offset that is passed in.
*/
xfs_bmap_punch_delalloc_range(ip, XFS_DATA_FORK, pos,
- folio_pos(folio) + folio_size(folio));
+ folio_pos(folio) + folio_size(folio), NULL);
}
static const struct iomap_writeback_ops xfs_writeback_ops = {
.discard_folio = xfs_discard_folio,
};
+struct xfs_zoned_writepage_ctx {
+ struct iomap_writepage_ctx ctx;
+ struct xfs_rtgroup *rtg;
+};
+
+static inline struct xfs_zoned_writepage_ctx *
+XFS_ZWPC(struct iomap_writepage_ctx *ctx)
+{
+ return container_of(ctx, struct xfs_zoned_writepage_ctx, ctx);
+}
+
+static int
+xfs_zoned_map_blocks(
+ struct iomap_writepage_ctx *wpc,
+ struct inode *inode,
+ loff_t offset,
+ unsigned int len)
+{
+ struct xfs_inode *ip = XFS_I(inode);
+ struct xfs_mount *mp = ip->i_mount;
+ xfs_fileoff_t offset_fsb = XFS_B_TO_FSBT(mp, offset);
+ xfs_fileoff_t end_fsb = XFS_B_TO_FSB(mp, offset + len);
+ xfs_filblks_t count_fsb;
+ struct xfs_bmbt_irec imap, del;
+ struct xfs_iext_cursor icur;
+
+ if (xfs_is_shutdown(mp))
+ return -EIO;
+
+ XFS_ERRORTAG_DELAY(mp, XFS_ERRTAG_WB_DELAY_MS);
+
+ /*
+ * All dirty data must be covered by delalloc extents. But truncate can
+ * remove delalloc extents underneath us or reduce their size.
+ * Returning a hole tells iomap to not write back any data from this
+ * range, which is the right thing to do in that case.
+ *
+ * Otherwise just tell iomap to treat ranges previously covered by a
+ * delalloc extent as mapped. The actual block allocation will be done
+ * just before submitting the bio.
+ */
+ xfs_ilock(ip, XFS_ILOCK_EXCL);
+ if (!xfs_iext_lookup_extent(ip, ip->i_cowfp, offset_fsb, &icur, &imap))
+ imap.br_startoff = end_fsb; /* fake a hole past EOF */
+ if (imap.br_startoff > offset_fsb) {
+ imap.br_blockcount = imap.br_startoff - offset_fsb;
+ imap.br_startoff = offset_fsb;
+ imap.br_startblock = HOLESTARTBLOCK;
+ imap.br_state = XFS_EXT_NORM;
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
+ xfs_bmbt_to_iomap(ip, &wpc->iomap, &imap, 0, 0, 0);
+ return 0;
+ }
+ end_fsb = min(end_fsb, imap.br_startoff + imap.br_blockcount);
+ count_fsb = end_fsb - offset_fsb;
+
+ del = imap;
+ xfs_trim_extent(&del, offset_fsb, count_fsb);
+ xfs_bmap_del_extent_delay(ip, XFS_COW_FORK, &icur, &imap, &del,
+ XFS_BMAPI_REMAP);
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
+
+ wpc->iomap.type = IOMAP_MAPPED;
+ wpc->iomap.flags = IOMAP_F_DIRTY;
+ wpc->iomap.bdev = mp->m_rtdev_targp->bt_bdev;
+ wpc->iomap.offset = offset;
+ wpc->iomap.length = XFS_FSB_TO_B(mp, count_fsb);
+ wpc->iomap.flags = IOMAP_F_ZONE_APPEND;
+ wpc->iomap.addr = 0;
+
+ trace_xfs_zoned_map_blocks(ip, offset, wpc->iomap.length);
+ return 0;
+}
+
+static int
+xfs_zoned_submit_ioend(
+ struct iomap_writepage_ctx *wpc,
+ int status)
+{
+ wpc->ioend->io_bio.bi_end_io = xfs_end_bio;
+ if (status)
+ return status;
+ xfs_zone_alloc_and_submit(wpc->ioend, &XFS_ZWPC(wpc)->rtg);
+ return 0;
+}
+
+static const struct iomap_writeback_ops xfs_zoned_writeback_ops = {
+ .map_blocks = xfs_zoned_map_blocks,
+ .submit_ioend = xfs_zoned_submit_ioend,
+ .discard_folio = xfs_discard_folio,
+};
+
STATIC int
xfs_vm_writepages(
struct address_space *mapping,
struct writeback_control *wbc)
{
+ struct xfs_inode *ip = XFS_I(mapping->host);
struct xfs_writepage_ctx wpc = { };
+ int error;
- xfs_iflags_clear(XFS_I(mapping->host), XFS_ITRUNCATED);
+ xfs_iflags_clear(ip, XFS_ITRUNCATED);
+ if (xfs_is_zoned_inode(ip)) {
+ struct xfs_zoned_writepage_ctx xc = { };
+
+ error = iomap_writepages(mapping, wbc, &xc.ctx,
+ &xfs_zoned_writeback_ops);
+ xfs_zone_finish_alloc(xc.rtg);
+ return error;
+ }
return iomap_writepages(mapping, wbc, &wpc.ctx, &xfs_writeback_ops);
}
extern const struct address_space_operations xfs_address_space_operations;
extern const struct address_space_operations xfs_dax_aops;
-int xfs_setfilesize(struct xfs_inode *ip, xfs_off_t offset, size_t size);
+int xfs_setfilesize(struct xfs_inode *ip, xfs_off_t offset, size_t size);
+void xfs_end_bio(struct bio *bio);
#endif /* __XFS_AOPS_H__ */
#include "xfs_reflink.h"
#include "xfs_rtbitmap.h"
#include "xfs_rtgroup.h"
+#include "xfs_zone_alloc.h"
/* Kernel only BMAP related definitions and functions */
struct xfs_inode *ip,
int whichfork,
xfs_off_t start_byte,
- xfs_off_t end_byte)
+ xfs_off_t end_byte,
+ struct xfs_zone_alloc_ctx *ac)
{
struct xfs_mount *mp = ip->i_mount;
struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork);
continue;
}
- xfs_bmap_del_extent_delay(ip, whichfork, &icur, &got, &del, 0);
+ xfs_bmap_del_extent_delay(ip, whichfork, &icur, &got, &del,
+ ac ? XFS_BMAPI_REMAP : 0);
+ if (xfs_is_zoned_inode(ip) && ac)
+ ac->reserved_blocks += del.br_blockcount;
if (!xfs_iext_get_extent(ifp, &icur, &got))
break;
}
if (ip->i_delayed_blks) {
xfs_bmap_punch_delalloc_range(ip, XFS_DATA_FORK,
round_up(XFS_ISIZE(ip), mp->m_sb.sb_blocksize),
- LLONG_MAX);
+ LLONG_MAX, NULL);
}
xfs_inode_clear_eofblocks_tag(ip);
return 0;
int
xfs_free_file_space(
- struct xfs_inode *ip,
- xfs_off_t offset,
- xfs_off_t len)
+ struct xfs_inode *ip,
+ xfs_off_t offset,
+ xfs_off_t len,
+ struct xfs_zone_alloc_ctx *ac)
{
- struct xfs_mount *mp = ip->i_mount;
- xfs_fileoff_t startoffset_fsb;
- xfs_fileoff_t endoffset_fsb;
- int done = 0, error;
+ struct xfs_mount *mp = ip->i_mount;
+ xfs_fileoff_t startoffset_fsb;
+ xfs_fileoff_t endoffset_fsb;
+ int done = 0, error;
trace_xfs_free_file_space(ip);
return 0;
if (offset + len > XFS_ISIZE(ip))
len = XFS_ISIZE(ip) - offset;
- error = xfs_zero_range(ip, offset, len, NULL);
+ error = xfs_zero_range(ip, offset, len, ac, NULL);
if (error)
return error;
xfs_collapse_file_space(
struct xfs_inode *ip,
xfs_off_t offset,
- xfs_off_t len)
+ xfs_off_t len,
+ struct xfs_zone_alloc_ctx *ac)
{
struct xfs_mount *mp = ip->i_mount;
struct xfs_trans *tp;
trace_xfs_collapse_file_space(ip);
- error = xfs_free_file_space(ip, offset, len);
+ error = xfs_free_file_space(ip, offset, len, ac);
if (error)
return error;
struct xfs_mount;
struct xfs_trans;
struct xfs_bmalloca;
+struct xfs_zone_alloc_ctx;
#ifdef CONFIG_XFS_RT
int xfs_bmap_rtalloc(struct xfs_bmalloca *ap);
#endif /* CONFIG_XFS_RT */
void xfs_bmap_punch_delalloc_range(struct xfs_inode *ip, int whichfork,
- xfs_off_t start_byte, xfs_off_t end_byte);
+ xfs_off_t start_byte, xfs_off_t end_byte,
+ struct xfs_zone_alloc_ctx *ac);
struct kgetbmap {
__s64 bmv_offset; /* file offset of segment in blocks */
int xfs_alloc_file_space(struct xfs_inode *ip, xfs_off_t offset,
xfs_off_t len);
int xfs_free_file_space(struct xfs_inode *ip, xfs_off_t offset,
- xfs_off_t len);
+ xfs_off_t len, struct xfs_zone_alloc_ctx *ac);
int xfs_collapse_file_space(struct xfs_inode *, xfs_off_t offset,
- xfs_off_t len);
+ xfs_off_t len, struct xfs_zone_alloc_ctx *ac);
int xfs_insert_file_space(struct xfs_inode *, xfs_off_t offset,
xfs_off_t len);
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
- if (mp->m_rtdev_targp &&
+
+ if (mp->m_rtdev_targp && !xfs_has_zoned(mp) &&
bdev_max_discard_sectors(mp->m_rtdev_targp->bt_bdev))
rt_bdev = mp->m_rtdev_targp->bt_bdev;
if (!bdev_max_discard_sectors(mp->m_ddev_targp->bt_bdev) && !rt_bdev)
return -EOPNOTSUPP;
- if (rt_bdev)
+ if (rt_bdev) {
+ if (!bdev_max_discard_sectors(rt_bdev))
+ return -EOPNOTSUPP;
granularity = max(granularity,
bdev_discard_granularity(rt_bdev));
+ }
/*
* We haven't recovered the log, so we cannot use our bnobt-guided
#include "xfs_inode.h"
#include "xfs_rtbitmap.h"
#include "xfs_rtgroup.h"
+#include "xfs_zone_alloc.h"
struct kmem_cache *xfs_efi_cache;
struct kmem_cache *xfs_efd_cache;
xfs_rtgroup_trans_join(tp, *rtgp,
XFS_RTGLOCK_BITMAP);
}
- error = xfs_rtfree_blocks(tp, *rtgp,
- xefi->xefi_startblock, xefi->xefi_blockcount);
+
+ if (xfs_has_zoned(mp)) {
+ error = xfs_zone_free_blocks(tp, *rtgp,
+ xefi->xefi_startblock,
+ xefi->xefi_blockcount);
+ } else {
+ error = xfs_rtfree_blocks(tp, *rtgp,
+ xefi->xefi_startblock,
+ xefi->xefi_blockcount);
+ }
}
if (error == -EAGAIN) {
xfs_efd_from_efi(efdp);
return error;
}
-
xfs_efd_add_extent(efdp, xefi);
xfs_extent_free_cancel_item(item);
return error;
#include "xfs_iomap.h"
#include "xfs_reflink.h"
#include "xfs_file.h"
+#include "xfs_aops.h"
+#include "xfs_zone_alloc.h"
+#include "xfs_rtbitmap.h"
#include <linux/dax.h>
#include <linux/falloc.h>
struct iov_iter *from,
unsigned int *iolock,
size_t count,
- bool *drained_dio)
+ bool *drained_dio,
+ struct xfs_zone_alloc_ctx *ac)
{
struct xfs_inode *ip = XFS_I(iocb->ki_filp->f_mapping->host);
loff_t isize;
trace_xfs_zero_eof(ip, isize, iocb->ki_pos - isize);
xfs_ilock(ip, XFS_MMAPLOCK_EXCL);
- error = xfs_zero_range(ip, isize, iocb->ki_pos - isize, NULL);
+ error = xfs_zero_range(ip, isize, iocb->ki_pos - isize, ac, NULL);
xfs_iunlock(ip, XFS_MMAPLOCK_EXCL);
return error;
xfs_file_write_checks(
struct kiocb *iocb,
struct iov_iter *from,
- unsigned int *iolock)
+ unsigned int *iolock,
+ struct xfs_zone_alloc_ctx *ac)
{
struct inode *inode = iocb->ki_filp->f_mapping->host;
size_t count = iov_iter_count(from);
* can only extend EOF. Truncate is locked out at this point, so the
* EOF can not move backwards, only forwards. Hence we only need to take
* the slow path when we are at or beyond the current EOF.
+ *
+ * For zoned file systems, we never allocate speculative blocks, so
+ * there is no need to zero anything. The tail of the block beyond
+ * i_size was already zeroed when writing it, and the beginning of
+ * the block where the write starts will be zeroed by the write itself.
*/
if (iocb->ki_pos > i_size_read(inode)) {
error = xfs_file_write_zero_eof(iocb, from, iolock, count,
- &drained_dio);
+ &drained_dio, ac);
if (error == 1)
goto restart;
if (error)
loff_t offset = iocb->ki_pos;
unsigned int nofs_flag;
+ ASSERT(!xfs_is_zoned_inode(ip) ||
+ !(flags & (IOMAP_DIO_UNWRITTEN | IOMAP_DIO_COW)));
+
trace_xfs_end_io_direct_write(ip, offset, size);
if (xfs_is_shutdown(ip->i_mount))
.end_io = xfs_dio_write_end_io,
};
+static void
+xfs_dio_zoned_submit_io(
+ const struct iomap_iter *iter,
+ struct bio *bio,
+ loff_t file_offset)
+{
+ struct xfs_mount *mp = XFS_I(iter->inode)->i_mount;
+ struct xfs_zone_alloc_ctx *ac = iter->private;
+ xfs_filblks_t count_fsb;
+ struct iomap_ioend *ioend;
+
+ count_fsb = XFS_B_TO_FSB(mp, bio->bi_iter.bi_size);
+ if (count_fsb > ac->reserved_blocks) {
+ xfs_err(mp,
+"allocation (%lld) larger than reservation (%lld).",
+ count_fsb, ac->reserved_blocks);
+ xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
+ bio_io_error(bio);
+ return;
+ }
+ ac->reserved_blocks -= count_fsb;
+
+ bio->bi_end_io = xfs_end_bio;
+ ioend = iomap_init_ioend(iter->inode, bio, file_offset,
+ IOMAP_MAPPED, 0, true);
+ xfs_zone_alloc_and_submit(ioend, &ac->cached_rtg);
+}
+
+static const struct iomap_dio_ops xfs_dio_zoned_write_ops = {
+ .bio_set = &iomap_ioend_bioset,
+ .submit_io = xfs_dio_zoned_submit_io,
+ .end_io = xfs_dio_write_end_io,
+};
+
+static ssize_t
+xfs_zoned_write_space_reserve(
+ struct xfs_inode *ip,
+ struct kiocb *iocb,
+ struct iov_iter *from,
+ unsigned int flags,
+ struct xfs_zone_alloc_ctx *ac)
+{
+ loff_t count = iov_iter_count(from);
+ int error;
+
+ if (iocb->ki_flags & IOCB_NOWAIT)
+ flags |= XFS_ZR_NOWAIT;
+
+ /*
+ * Check the rlimit and LFS boundary first so that we don't over-reserve
+ * by possibly a lot.
+ *
+ * The generic write path will redo this check later, and it might have
+ * changed by then. If it got expanded we'll stick to our earlier
+ * smaller limit, and if it is decreased the new smaller limit will be
+ * used and our extra space reservation will be returned after finishing
+ * the write.
+ */
+ error = generic_write_check_limits(iocb->ki_filp, iocb->ki_pos, &count);
+ if (error)
+ return error;
+
+ /*
+ * Sloppily round up count to file system blocks.
+ *
+ * This will often reserve an extra block, but that avoids having to look
+ * at the start offset, which isn't stable for O_APPEND until taking the
+ * iolock. Also we need to reserve a block each for zeroing the old
+ * EOF block and the new start block if they are unaligned.
+ *
+ * Any remaining blocks will be returned after the write.
+ */
+ return xfs_zoned_space_reserve(ip,
+ XFS_B_TO_FSB(ip->i_mount, count) + 1 + 2,
+ flags, ac);
+}
+
/*
- * Handle block aligned direct I/O writes
+ * Handle block aligned direct I/O writes.
*/
static noinline ssize_t
xfs_file_dio_write_aligned(
struct xfs_inode *ip,
struct kiocb *iocb,
- struct iov_iter *from)
+ struct iov_iter *from,
+ const struct iomap_ops *ops,
+ const struct iomap_dio_ops *dops,
+ struct xfs_zone_alloc_ctx *ac)
{
unsigned int iolock = XFS_IOLOCK_SHARED;
ssize_t ret;
ret = xfs_ilock_iocb_for_write(iocb, &iolock);
if (ret)
return ret;
- ret = xfs_file_write_checks(iocb, from, &iolock);
+ ret = xfs_file_write_checks(iocb, from, &iolock, ac);
if (ret)
goto out_unlock;
iolock = XFS_IOLOCK_SHARED;
}
trace_xfs_file_direct_write(iocb, from);
- ret = iomap_dio_rw(iocb, from, &xfs_direct_write_iomap_ops,
- &xfs_dio_write_ops, 0, NULL, 0);
+ ret = iomap_dio_rw(iocb, from, ops, dops, 0, ac, 0);
out_unlock:
- if (iolock)
- xfs_iunlock(ip, iolock);
+ xfs_iunlock(ip, iolock);
+ return ret;
+}
+
+/*
+ * Handle block aligned direct I/O writes to zoned devices.
+ */
+static noinline ssize_t
+xfs_file_dio_write_zoned(
+ struct xfs_inode *ip,
+ struct kiocb *iocb,
+ struct iov_iter *from)
+{
+ struct xfs_zone_alloc_ctx ac;
+ ssize_t ret;
+
+ ret = xfs_zoned_write_space_reserve(ip, iocb, from, 0, &ac);
+ if (ret < 0)
+ return ret;
+ ret = xfs_file_dio_write_aligned(ip, iocb, from,
+ &xfs_zoned_direct_write_iomap_ops,
+ &xfs_dio_zoned_write_ops, &ac);
+ xfs_zoned_space_unreserve(ip, &ac);
return ret;
}
goto out_unlock;
}
- ret = xfs_file_write_checks(iocb, from, &iolock);
+ ret = xfs_file_write_checks(iocb, from, &iolock, NULL);
if (ret)
goto out_unlock;
(xfs_is_always_cow_inode(ip) &&
(iov_iter_alignment(from) & ip->i_mount->m_blockmask)))
return xfs_file_dio_write_unaligned(ip, iocb, from);
- return xfs_file_dio_write_aligned(ip, iocb, from);
+ if (xfs_is_zoned_inode(ip))
+ return xfs_file_dio_write_zoned(ip, iocb, from);
+ return xfs_file_dio_write_aligned(ip, iocb, from,
+ &xfs_direct_write_iomap_ops, &xfs_dio_write_ops, NULL);
}
static noinline ssize_t
ret = xfs_ilock_iocb(iocb, iolock);
if (ret)
return ret;
- ret = xfs_file_write_checks(iocb, from, &iolock);
+ ret = xfs_file_write_checks(iocb, from, &iolock, NULL);
if (ret)
goto out;
if (ret)
return ret;
- ret = xfs_file_write_checks(iocb, from, &iolock);
+ ret = xfs_file_write_checks(iocb, from, &iolock, NULL);
if (ret)
goto out;
return ret;
}
+STATIC ssize_t
+xfs_file_buffered_write_zoned(
+ struct kiocb *iocb,
+ struct iov_iter *from)
+{
+ struct xfs_inode *ip = XFS_I(iocb->ki_filp->f_mapping->host);
+ struct xfs_mount *mp = ip->i_mount;
+ unsigned int iolock = XFS_IOLOCK_EXCL;
+ bool cleared_space = false;
+ struct xfs_zone_alloc_ctx ac;
+ ssize_t ret;
+
+ ret = xfs_zoned_write_space_reserve(ip, iocb, from, XFS_ZR_GREEDY, &ac);
+ if (ret < 0)
+ return ret;
+
+ ret = xfs_ilock_iocb(iocb, iolock);
+ if (ret)
+ goto out_unreserve;
+
+ ret = xfs_file_write_checks(iocb, from, &iolock, &ac);
+ if (ret)
+ goto out_unlock;
+
+ /*
+ * Truncate the iter to the length that we were actually able to
+ * allocate blocks for. This needs to happen after
+ * xfs_file_write_checks, because that assigns ki_pos for O_APPEND
+ * writes.
+ */
+ iov_iter_truncate(from,
+ XFS_FSB_TO_B(mp, ac.reserved_blocks) -
+ (iocb->ki_pos & mp->m_blockmask));
+ if (!iov_iter_count(from))
+ goto out_unlock;
+
+retry:
+ trace_xfs_file_buffered_write(iocb, from);
+ ret = iomap_file_buffered_write(iocb, from,
+ &xfs_buffered_write_iomap_ops, &ac);
+ if (ret == -ENOSPC && !cleared_space) {
+ /*
+ * Kick off writeback to convert delalloc space and release the
+ * usually too pessimistic indirect block reservations.
+ */
+ xfs_flush_inodes(mp);
+ cleared_space = true;
+ goto retry;
+ }
+
+out_unlock:
+ xfs_iunlock(ip, iolock);
+out_unreserve:
+ xfs_zoned_space_unreserve(ip, &ac);
+ if (ret > 0) {
+ XFS_STATS_ADD(mp, xs_write_bytes, ret);
+ ret = generic_write_sync(iocb, ret);
+ }
+ return ret;
+}
+
STATIC ssize_t
xfs_file_write_iter(
struct kiocb *iocb,
return ret;
}
+ if (xfs_is_zoned_inode(ip))
+ return xfs_file_buffered_write_zoned(iocb, from);
return xfs_file_buffered_write(iocb, from);
}
xfs_falloc_collapse_range(
struct file *file,
loff_t offset,
- loff_t len)
+ loff_t len,
+ struct xfs_zone_alloc_ctx *ac)
{
struct inode *inode = file_inode(file);
loff_t new_size = i_size_read(inode) - len;
if (offset + len >= i_size_read(inode))
return -EINVAL;
- error = xfs_collapse_file_space(XFS_I(inode), offset, len);
+ error = xfs_collapse_file_space(XFS_I(inode), offset, len, ac);
if (error)
return error;
return xfs_falloc_setsize(file, new_size);
struct file *file,
int mode,
loff_t offset,
- loff_t len)
+ loff_t len,
+ struct xfs_zone_alloc_ctx *ac)
{
struct inode *inode = file_inode(file);
unsigned int blksize = i_blocksize(inode);
if (error)
return error;
- error = xfs_free_file_space(XFS_I(inode), offset, len);
+ error = xfs_free_file_space(XFS_I(inode), offset, len, ac);
if (error)
return error;
struct xfs_inode *ip = XFS_I(inode);
long error;
uint iolock = XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL;
+ struct xfs_zone_alloc_ctx ac = { };
if (!S_ISREG(inode->i_mode))
return -EINVAL;
if (mode & ~XFS_FALLOC_FL_SUPPORTED)
return -EOPNOTSUPP;
+ /*
+ * For zoned file systems, zeroing the first and last block of a hole
+ * punch requires allocating a new block to rewrite the remaining data
+ * and new zeroes out of place. Get a reservation for those before
+ * taking the iolock. Dip into the reserved pool because we are
+ * expected to be able to punch a hole even on a completely full
+ * file system.
+ */
+ if (xfs_is_zoned_inode(ip) &&
+ (mode & (FALLOC_FL_PUNCH_HOLE | FALLOC_FL_ZERO_RANGE |
+ FALLOC_FL_COLLAPSE_RANGE))) {
+ error = xfs_zoned_space_reserve(ip, 2, XFS_ZR_RESERVED, &ac);
+ if (error)
+ return error;
+ }
+
xfs_ilock(ip, iolock);
error = xfs_break_layouts(inode, &iolock, BREAK_UNMAP);
if (error)
switch (mode & FALLOC_FL_MODE_MASK) {
case FALLOC_FL_PUNCH_HOLE:
- error = xfs_free_file_space(ip, offset, len);
+ error = xfs_free_file_space(ip, offset, len, &ac);
break;
case FALLOC_FL_COLLAPSE_RANGE:
- error = xfs_falloc_collapse_range(file, offset, len);
+ error = xfs_falloc_collapse_range(file, offset, len, &ac);
break;
case FALLOC_FL_INSERT_RANGE:
error = xfs_falloc_insert_range(file, offset, len);
break;
case FALLOC_FL_ZERO_RANGE:
- error = xfs_falloc_zero_range(file, mode, offset, len);
+ error = xfs_falloc_zero_range(file, mode, offset, len, &ac);
break;
case FALLOC_FL_UNSHARE_RANGE:
error = xfs_falloc_unshare_range(file, mode, offset, len);
out_unlock:
xfs_iunlock(ip, iolock);
+ if (xfs_is_zoned_inode(ip))
+ xfs_zoned_space_unreserve(ip, &ac);
return error;
}
struct inode *inode = file_inode(vmf->vma->vm_file);
struct xfs_inode *ip = XFS_I(inode);
unsigned int lock_mode = XFS_MMAPLOCK_SHARED;
+ struct xfs_zone_alloc_ctx ac;
+ int error;
vm_fault_t ret;
+ if (xfs_is_zoned_inode(ip)) {
+ /*
+ * This could over-allocate as it doesn't check for truncation.
+ * But as the overallocation is limited to less than a folio and
+ * will be released right away, that's just fine.
+ */
+ unsigned int len = folio_size(page_folio(vmf->page));
+
+ error = xfs_zoned_space_reserve(ip,
+ XFS_B_TO_FSB(ip->i_mount, len), 0, &ac);
+ if (error < 0)
+ return vmf_fs_error(error);
+ }
+
sb_start_pagefault(inode->i_sb);
file_update_time(vmf->vma->vm_file);
ret = xfs_dax_fault_locked(vmf, order, true);
else
ret = iomap_page_mkwrite(vmf, &xfs_page_mkwrite_iomap_ops,
- NULL);
+ xfs_is_zoned_inode(ip) ? &ac : NULL);
xfs_iunlock(ip, lock_mode);
sb_end_pagefault(inode->i_sb);
+ if (xfs_is_zoned_inode(ip))
+ xfs_zoned_space_unreserve(ip, &ac);
return ret;
}
{
struct xfs_mount *mp = ip->i_mount;
- if (!XFS_IS_REALTIME_INODE(ip))
+ if (!XFS_IS_REALTIME_INODE(ip) || xfs_has_zoned(mp))
return false;
if (xfs_compare_freecounter(mp, FREE_RTEXTENTS,
static inline void xfs_update_stable_writes(struct xfs_inode *ip)
{
- if (bdev_stable_writes(xfs_inode_buftarg(ip)->bt_bdev))
+ if (xfs_is_zoned_inode(ip) ||
+ bdev_stable_writes(xfs_inode_buftarg(ip)->bt_bdev))
mapping_set_stable_writes(VFS_I(ip)->i_mapping);
else
mapping_clear_stable_writes(VFS_I(ip)->i_mapping);
#include "xfs_exchrange.h"
#include "xfs_handle.h"
#include "xfs_rtgroup.h"
+#include "xfs_zone_alloc.h"
#include <linux/mount.h>
#include <linux/fileattr.h>
#include "xfs_reflink.h"
#include "xfs_health.h"
#include "xfs_rtbitmap.h"
+#include "xfs_icache.h"
+#include "xfs_zone_alloc.h"
#define XFS_ALLOC_ALIGN(mp, off) \
(((off) >> mp->m_allocsize_log) << mp->m_allocsize_log)
iomap->dax_dev = target->bt_daxdev;
}
-static inline xfs_fileoff_t
-xfs_iomap_end_fsb(
- struct xfs_mount *mp,
- loff_t offset,
- loff_t count)
-{
- ASSERT(offset <= mp->m_super->s_maxbytes);
- return min(XFS_B_TO_FSB(mp, offset + count),
- XFS_B_TO_FSB(mp, mp->m_super->s_maxbytes));
-}
-
static xfs_extlen_t
xfs_eof_alignment(
struct xfs_inode *ip)
.iomap_begin = xfs_direct_write_iomap_begin,
};
+#ifdef CONFIG_XFS_RT
+/*
+ * This is really simple. The space has already been reserved before taking the
+ * IOLOCK, the actual block allocation is done just before submitting the bio
+ * and only recorded in the extent map on I/O completion.
+ */
+static int
+xfs_zoned_direct_write_iomap_begin(
+ struct inode *inode,
+ loff_t offset,
+ loff_t length,
+ unsigned flags,
+ struct iomap *iomap,
+ struct iomap *srcmap)
+{
+ struct xfs_inode *ip = XFS_I(inode);
+ int error;
+
+ ASSERT(!(flags & IOMAP_OVERWRITE_ONLY));
+
+ /*
+ * Needs to be pushed down into the allocator so that only writes into
+ * a single zone can be supported.
+ */
+ if (flags & IOMAP_NOWAIT)
+ return -EAGAIN;
+
+ /*
+ * Ensure the extent list is in memory so that we don't have to read it
+ * in from the I/O completion handler.
+ */
+ if (xfs_need_iread_extents(&ip->i_df)) {
+ xfs_ilock(ip, XFS_ILOCK_EXCL);
+ error = xfs_iread_extents(NULL, ip, XFS_DATA_FORK);
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
+ if (error)
+ return error;
+ }
+
+ iomap->type = IOMAP_MAPPED;
+ iomap->flags = IOMAP_F_DIRTY;
+ iomap->bdev = ip->i_mount->m_rtdev_targp->bt_bdev;
+ iomap->offset = offset;
+ iomap->length = length;
+ iomap->flags = IOMAP_F_ZONE_APPEND;
+ iomap->addr = 0;
+ return 0;
+}
+
+const struct iomap_ops xfs_zoned_direct_write_iomap_ops = {
+ .iomap_begin = xfs_zoned_direct_write_iomap_begin,
+};
+#endif /* CONFIG_XFS_RT */
+
static int
xfs_dax_write_iomap_end(
struct inode *inode,
.iomap_end = xfs_dax_write_iomap_end,
};
+static int
+xfs_zoned_buffered_write_iomap_begin(
+ struct inode *inode,
+ loff_t offset,
+ loff_t count,
+ unsigned flags,
+ struct iomap *iomap,
+ struct iomap *srcmap)
+{
+ struct iomap_iter *iter =
+ container_of(iomap, struct iomap_iter, iomap);
+ struct xfs_zone_alloc_ctx *ac = iter->private;
+ struct xfs_inode *ip = XFS_I(inode);
+ struct xfs_mount *mp = ip->i_mount;
+ xfs_fileoff_t offset_fsb = XFS_B_TO_FSBT(mp, offset);
+ xfs_fileoff_t end_fsb = xfs_iomap_end_fsb(mp, offset, count);
+ u16 iomap_flags = IOMAP_F_SHARED;
+ unsigned int lockmode = XFS_ILOCK_EXCL;
+ xfs_filblks_t count_fsb;
+ xfs_extlen_t indlen;
+ struct xfs_bmbt_irec got;
+ struct xfs_iext_cursor icur;
+ int error = 0;
+
+ ASSERT(!xfs_get_extsz_hint(ip));
+ ASSERT(!(flags & IOMAP_UNSHARE));
+ ASSERT(ac);
+
+ if (xfs_is_shutdown(mp))
+ return -EIO;
+
+ error = xfs_qm_dqattach(ip);
+ if (error)
+ return error;
+
+ error = xfs_ilock_for_iomap(ip, flags, &lockmode);
+ if (error)
+ return error;
+
+ if (XFS_IS_CORRUPT(mp, !xfs_ifork_has_extents(&ip->i_df)) ||
+ XFS_TEST_ERROR(false, mp, XFS_ERRTAG_BMAPIFORMAT)) {
+ xfs_bmap_mark_sick(ip, XFS_DATA_FORK);
+ error = -EFSCORRUPTED;
+ goto out_unlock;
+ }
+
+ XFS_STATS_INC(mp, xs_blk_mapw);
+
+ error = xfs_iread_extents(NULL, ip, XFS_DATA_FORK);
+ if (error)
+ goto out_unlock;
+
+ /*
+ * For zeroing operations check if there is any data to zero first.
+ *
+ * For regular writes we always need to allocate new blocks, but need to
+ * provide the source mapping when the range is unaligned to support
+ * read-modify-write of the whole block in the page cache.
+ *
+ * In either case we need to limit the reported range to the boundaries
+ * of the source map in the data fork.
+ */
+ if (!IS_ALIGNED(offset, mp->m_sb.sb_blocksize) ||
+ !IS_ALIGNED(offset + count, mp->m_sb.sb_blocksize) ||
+ (flags & IOMAP_ZERO)) {
+ struct xfs_bmbt_irec smap;
+ struct xfs_iext_cursor scur;
+
+ if (!xfs_iext_lookup_extent(ip, &ip->i_df, offset_fsb, &scur,
+ &smap))
+ smap.br_startoff = end_fsb; /* fake hole until EOF */
+ if (smap.br_startoff > offset_fsb) {
+ /*
+ * We never need to allocate blocks for zeroing a hole.
+ */
+ if (flags & (IOMAP_UNSHARE | IOMAP_ZERO)) {
+ xfs_hole_to_iomap(ip, iomap, offset_fsb,
+ smap.br_startoff);
+ goto out_unlock;
+ }
+ end_fsb = min(end_fsb, smap.br_startoff);
+ } else {
+ end_fsb = min(end_fsb,
+ smap.br_startoff + smap.br_blockcount);
+ xfs_trim_extent(&smap, offset_fsb,
+ end_fsb - offset_fsb);
+ error = xfs_bmbt_to_iomap(ip, srcmap, &smap, flags, 0,
+ xfs_iomap_inode_sequence(ip, 0));
+ if (error)
+ goto out_unlock;
+ }
+ }
+
+ if (!ip->i_cowfp)
+ xfs_ifork_init_cow(ip);
+
+ if (!xfs_iext_lookup_extent(ip, ip->i_cowfp, offset_fsb, &icur, &got))
+ got.br_startoff = end_fsb;
+ if (got.br_startoff <= offset_fsb) {
+ trace_xfs_reflink_cow_found(ip, &got);
+ goto done;
+ }
+
+ /*
+ * Cap the maximum length to keep the chunks of work done here somewhat
+ * symmetric with the work writeback does.
+ */
+ end_fsb = min(end_fsb, got.br_startoff);
+ count_fsb = min3(end_fsb - offset_fsb, XFS_MAX_BMBT_EXTLEN,
+ XFS_B_TO_FSB(mp, 1024 * PAGE_SIZE));
+
+ /*
+ * The block reservation is supposed to cover all blocks that the
+ * operation could possibly write, but there is a nasty corner case
+ * where blocks could be stolen from underneath us:
+ *
+ * 1) while this thread iterates over a larger buffered write,
+ * 2) another thread is causing a write fault that calls into
+ * ->page_mkwrite in range this thread writes to, using up the
+ * delalloc reservation created by a previous call to this function.
+ * 3) another thread does direct I/O on the range that the write fault
+ * happened on, which causes writeback of the dirty data.
+ * 4) this then sets the stale flag, which cuts the current iomap
+ * iteration short, causing the new call to ->iomap_begin that gets
+ * us here again, but now without a sufficient reservation.
+ *
+ * This is a very unusual I/O pattern, and nothing but generic/095 is
+ * known to hit it. There's not really much we can do here, so turn this
+ * into a short write.
+ */
+ if (count_fsb > ac->reserved_blocks) {
+ xfs_warn_ratelimited(mp,
+"Short write on ino 0x%llx comm %.20s due to three-way race with write fault and direct I/O",
+ ip->i_ino, current->comm);
+ count_fsb = ac->reserved_blocks;
+ if (!count_fsb) {
+ error = -EIO;
+ goto out_unlock;
+ }
+ }
+
+ error = xfs_quota_reserve_blkres(ip, count_fsb);
+ if (error)
+ goto out_unlock;
+
+ indlen = xfs_bmap_worst_indlen(ip, count_fsb);
+ error = xfs_dec_fdblocks(mp, indlen, false);
+ if (error)
+ goto out_unlock;
+ ip->i_delayed_blks += count_fsb;
+ xfs_mod_delalloc(ip, count_fsb, indlen);
+
+ got.br_startoff = offset_fsb;
+ got.br_startblock = nullstartblock(indlen);
+ got.br_blockcount = count_fsb;
+ got.br_state = XFS_EXT_NORM;
+ xfs_bmap_add_extent_hole_delay(ip, XFS_COW_FORK, &icur, &got);
+ ac->reserved_blocks -= count_fsb;
+ iomap_flags |= IOMAP_F_NEW;
+
+ trace_xfs_iomap_alloc(ip, offset, XFS_FSB_TO_B(mp, count_fsb),
+ XFS_COW_FORK, &got);
+done:
+ error = xfs_bmbt_to_iomap(ip, iomap, &got, flags, iomap_flags,
+ xfs_iomap_inode_sequence(ip, IOMAP_F_SHARED));
+out_unlock:
+ xfs_iunlock(ip, lockmode);
+ return error;
+}
+
static int
xfs_buffered_write_iomap_begin(
struct inode *inode,
if (xfs_is_shutdown(mp))
return -EIO;
+ if (xfs_is_zoned_inode(ip))
+ return xfs_zoned_buffered_write_iomap_begin(inode, offset,
+ count, flags, iomap, srcmap);
+
/* we can't use delayed allocations when using extent size hints */
if (xfs_get_extsz_hint(ip))
return xfs_direct_write_iomap_begin(inode, offset, count,
loff_t length,
struct iomap *iomap)
{
+ struct iomap_iter *iter =
+ container_of(iomap, struct iomap_iter, iomap);
+
xfs_bmap_punch_delalloc_range(XFS_I(inode),
(iomap->flags & IOMAP_F_SHARED) ?
XFS_COW_FORK : XFS_DATA_FORK,
- offset, offset + length);
+ offset, offset + length, iter->private);
}
static int
int
xfs_zero_range(
- struct xfs_inode *ip,
- loff_t pos,
- loff_t len,
- bool *did_zero)
+ struct xfs_inode *ip,
+ loff_t pos,
+ loff_t len,
+ struct xfs_zone_alloc_ctx *ac,
+ bool *did_zero)
{
- struct inode *inode = VFS_I(ip);
+ struct inode *inode = VFS_I(ip);
xfs_assert_ilocked(ip, XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL);
return dax_zero_range(inode, pos, len, did_zero,
&xfs_dax_write_iomap_ops);
return iomap_zero_range(inode, pos, len, did_zero,
- &xfs_buffered_write_iomap_ops, NULL);
+ &xfs_buffered_write_iomap_ops, ac);
}
int
xfs_truncate_page(
struct xfs_inode *ip,
loff_t pos,
+ struct xfs_zone_alloc_ctx *ac,
bool *did_zero)
{
struct inode *inode = VFS_I(ip);
return dax_truncate_page(inode, pos, did_zero,
&xfs_dax_write_iomap_ops);
return iomap_truncate_page(inode, pos, did_zero,
- &xfs_buffered_write_iomap_ops, NULL);
+ &xfs_buffered_write_iomap_ops, ac);
}
struct xfs_inode;
struct xfs_bmbt_irec;
+struct xfs_zone_alloc_ctx;
int xfs_iomap_write_direct(struct xfs_inode *ip, xfs_fileoff_t offset_fsb,
xfs_fileoff_t count_fsb, unsigned int flags,
u16 iomap_flags, u64 sequence_cookie);
int xfs_zero_range(struct xfs_inode *ip, loff_t pos, loff_t len,
- bool *did_zero);
-int xfs_truncate_page(struct xfs_inode *ip, loff_t pos, bool *did_zero);
+ struct xfs_zone_alloc_ctx *ac, bool *did_zero);
+int xfs_truncate_page(struct xfs_inode *ip, loff_t pos,
+ struct xfs_zone_alloc_ctx *ac, bool *did_zero);
+
+static inline xfs_fileoff_t
+xfs_iomap_end_fsb(
+ struct xfs_mount *mp,
+ loff_t offset,
+ loff_t count)
+{
+ ASSERT(offset <= mp->m_super->s_maxbytes);
+ return min(XFS_B_TO_FSB(mp, offset + count),
+ XFS_B_TO_FSB(mp, mp->m_super->s_maxbytes));
+}
static inline xfs_filblks_t
xfs_aligned_fsb_count(
extern const struct iomap_ops xfs_buffered_write_iomap_ops;
extern const struct iomap_ops xfs_page_mkwrite_iomap_ops;
extern const struct iomap_ops xfs_direct_write_iomap_ops;
+extern const struct iomap_ops xfs_zoned_direct_write_iomap_ops;
extern const struct iomap_ops xfs_read_iomap_ops;
extern const struct iomap_ops xfs_seek_iomap_ops;
extern const struct iomap_ops xfs_xattr_iomap_ops;
#include "xfs_xattr.h"
#include "xfs_file.h"
#include "xfs_bmap.h"
+#include "xfs_zone_alloc.h"
#include <linux/posix_acl.h>
#include <linux/security.h>
uint lock_flags = 0;
uint resblks = 0;
bool did_zeroing = false;
+ struct xfs_zone_alloc_ctx ac;
xfs_assert_ilocked(ip, XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL);
ASSERT(S_ISREG(inode->i_mode));
*/
inode_dio_wait(inode);
+ /*
+ * Normally xfs_zoned_space_reserve is supposed to be called outside the
+ * IOLOCK. For truncate we can't do that since ->setattr is called
+ * with it already held by the VFS. So for now chicken out and try to
+ * allocate space under it.
+ *
+ * To avoid deadlocks this means we can't block waiting for space, which
+ * can lead to spurious -ENOSPC if there are no directly available
+ * blocks. We mitigate this a bit by allowing zeroing to dip into the
+ * reserved pool, but eventually the VFS calling convention needs to
+ * change.
+ */
+ if (xfs_is_zoned_inode(ip)) {
+ error = xfs_zoned_space_reserve(ip, 1,
+ XFS_ZR_NOWAIT | XFS_ZR_RESERVED, &ac);
+ if (error) {
+ if (error == -EAGAIN)
+ return -ENOSPC;
+ return error;
+ }
+ }
+
/*
* File data changes must be complete before we start the transaction to
* modify the inode. This needs to be done before joining the inode to
if (newsize > oldsize) {
trace_xfs_zero_eof(ip, oldsize, newsize - oldsize);
error = xfs_zero_range(ip, oldsize, newsize - oldsize,
- &did_zeroing);
+ &ac, &did_zeroing);
} else {
- error = xfs_truncate_page(ip, newsize, &did_zeroing);
+ error = xfs_truncate_page(ip, newsize, &ac, &did_zeroing);
}
+ if (xfs_is_zoned_inode(ip))
+ xfs_zoned_space_unreserve(ip, &ac);
+
if (error)
return error;
spin_unlock(&log->l_icloglock);
wake_up_var(&log->l_opstate);
+
+ if (xfs_has_zoned(log->l_mp) && IS_ENABLED(CONFIG_XFS_RT)) {
+ spin_lock(&log->l_mp->m_zone_list_lock);
+ wake_up_all(&log->l_mp->m_zone_wait);
+ spin_unlock(&log->l_mp->m_zone_list_lock);
+ }
return log_error;
}
#include "xfs_rtrmap_btree.h"
#include "xfs_rtrefcount_btree.h"
#include "scrub/stats.h"
+#include "xfs_rtbitmap.h"
+#include "xfs_zone_alloc.h"
static DEFINE_MUTEX(xfs_uuid_table_mutex);
static int xfs_uuid_table_size;
struct xfs_mount *mp,
unsigned int idx)
{
- uint64_t resblks;
-
- if (idx == FREE_RTEXTENTS)
- return 0;
+ switch (idx) {
+ case FREE_BLOCKS:
+ /*
+ * We default to 5% or 8192 FSBs of space reserved, whichever is
+ * smaller.
+ *
+ * This is intended to cover concurrent allocation transactions
+ * when we initially hit ENOSPC. These each require a 4 block
+ * reservation. Hence by default we cover roughly 2000
+ * concurrent allocation reservations.
+ */
+ return min(div_u64(mp->m_sb.sb_dblocks, 20), 8192ULL);
+ case FREE_RTEXTENTS:
+ case FREE_RTAVAILABLE:
+ if (!IS_ENABLED(CONFIG_XFS_RT) || !xfs_has_zoned(mp))
+ break;
+ return xfs_zoned_default_resblks(mp, idx);
+ }
- /*
- * We default to 5% or 8192 fsbs of space reserved, whichever is
- * smaller. This is intended to cover concurrent allocation
- * transactions when we initially hit enospc. These each require a 4
- * block reservation. Hence by default we cover roughly 2000 concurrent
- * allocation reservations.
- */
- resblks = mp->m_sb.sb_dblocks;
- do_div(resblks, 20);
- resblks = min_t(uint64_t, resblks, 8192);
- return resblks;
+ return 0;
}
/* Ensure the summary counts are correct. */
* If we're mounting the rt volume after recovering the log, recompute
* frextents from the rtbitmap file to fix the inconsistency.
*/
- if (xfs_has_realtime(mp) && !xfs_is_clean(mp)) {
+ if (xfs_has_realtime(mp) && !xfs_has_zoned(mp) && !xfs_is_clean(mp)) {
error = xfs_rtalloc_reinit_frextents(mp);
if (error)
return error;
goto out_agresv;
}
+ if (!xfs_is_readonly(mp))
+ xfs_zone_gc_start(mp);
+
return 0;
out_agresv:
xfs_inodegc_flush(mp);
xfs_blockgc_stop(mp);
+ if (!test_bit(XFS_OPSTATE_READONLY, &mp->m_opstate))
+ xfs_zone_gc_stop(mp);
xfs_fs_unreserve_ag_blocks(mp);
xfs_qm_unmount_quotas(mp);
xfs_rtunmount_inodes(mp);
struct xfs_mount *mp,
unsigned int idx)
{
- if (idx == FREE_RTEXTENTS)
+ if (idx == FREE_RTEXTENTS || idx == FREE_RTAVAILABLE)
return 0;
return mp->m_alloc_set_aside + atomic64_read(&mp->m_allocbt_blks);
}
spin_unlock(&mp->m_sb_lock);
return 0;
}
- xfs_warn_once(mp,
+
+ if (idx == FREE_BLOCKS)
+ xfs_warn_once(mp,
"Reserve blocks depleted! Consider increasing reserve pool size.");
fdblocks_enospc:
enum {
FREE_BLOCKS, /* free block counter */
FREE_RTEXTENTS, /* free rt extent counter */
+ FREE_RTAVAILABLE, /* actually available rt extents */
FREE_NR,
};
uint64_t avail; /* available reserved blocks */
uint64_t save; /* reserved blks @ remount,ro */
} m_resblks[FREE_NR];
+ struct list_head m_free_zones;
+ struct list_head m_open_zones;
+ atomic_t m_nr_free_zones;
+ unsigned int m_nr_open_zones;
+ unsigned int m_max_open_zones;
+ uint64_t m_zoned_op;
+ struct list_head m_emptied_zones;
+ spinlock_t m_zone_list_lock;
+ wait_queue_head_t m_zone_wait;
+ struct xfs_rtgroup *m_open_gc_zone;
struct delayed_work m_reclaim_work; /* background inode reclaim */
+ spinlock_t m_reservation_lock;
+ struct list_head m_reclaim_reservations;
+ struct task_struct *m_zone_gc_thread;
struct dentry *m_debugfs; /* debugfs parent */
struct xfs_kobj m_kobj;
struct xfs_kobj m_error_kobj;
xfs_has_reflink(mp);
}
+static inline bool xfs_has_nonzoned(const struct xfs_mount *mp)
+{
+ return !xfs_has_zoned(mp);
+}
+
/*
* Some features are always on for v5 file systems, allow the compiler to
* eliminiate dead code when building without v4 support.
#define XFS_OPSTATE_WARNED_METADIR 17
/* Filesystem should use qflags to determine quotaon status */
#define XFS_OPSTATE_RESUMING_QUOTAON 18
+/* (Zoned) GC is in progress */
+#define XFS_OPSTATE_IN_GC 19
#define __XFS_IS_OPSTATE(name, NAME) \
static inline bool xfs_is_ ## name (struct xfs_mount *mp) \
#endif /* CONFIG_XFS_QUOTA */
__XFS_IS_OPSTATE(done_with_log_incompat, UNSET_LOG_INCOMPAT)
__XFS_IS_OPSTATE(using_logged_xattrs, USE_LARP)
+__XFS_IS_OPSTATE(in_gc, IN_GC)
static inline bool
xfs_should_warn(struct xfs_mount *mp, long nr)
#include "xfs_rtalloc.h"
#include "xfs_rtgroup.h"
#include "xfs_metafile.h"
+#include "xfs_zone_alloc.h"
/*
* Copy on Write of Shared Blocks
*/
while (end_fsb > offset_fsb && !error)
error = xfs_reflink_end_cow_extent(ip, &offset_fsb, end_fsb);
-
if (error)
trace_xfs_reflink_end_cow_error(ip, error, _RET_IP_);
return error;
}
+#ifdef CONFIG_XFS_RT
+static int
+xfs_zoned_end_extent(
+ struct xfs_trans *tp,
+ struct xfs_inode *ip,
+ struct xfs_bmbt_irec *new,
+ xfs_fsblock_t old_startblock)
+{
+ struct xfs_bmbt_irec data;
+ int nmaps = 1;
+ int error;
+
+ /* Grab the corresponding mapping in the data fork. */
+ error = xfs_bmapi_read(ip, new->br_startoff, new->br_blockcount, &data,
+ &nmaps, 0);
+ if (error)
+ return error;
+
+ /*
+ * Cap the update to the existing extent in the data fork because we can
+ * only overwrite one extent at a time.
+ */
+ ASSERT(new->br_blockcount >= data.br_blockcount);
+ new->br_blockcount = data.br_blockcount;
+
+ /*
+ * If a data write raced with this GC write, keep the existing data in
+ * the data fork, mark our newly written GC extent as reclaimable, then
+ * move on to the next extent.
+ */
+ if (old_startblock != NULLFSBLOCK &&
+ old_startblock != data.br_startblock)
+ goto skip;
+
+ trace_xfs_reflink_cow_remap_from(ip, new);
+ trace_xfs_reflink_cow_remap_to(ip, &data);
+
+ error = xfs_iext_count_extend(tp, ip, XFS_DATA_FORK,
+ XFS_IEXT_REFLINK_END_COW_CNT);
+ if (error)
+ return error;
+
+ if (data.br_startblock != HOLESTARTBLOCK) {
+ ASSERT(data.br_startblock != DELAYSTARTBLOCK);
+ ASSERT(!isnullstartblock(data.br_startblock));
+
+ xfs_bmap_unmap_extent(tp, ip, XFS_DATA_FORK, &data);
+ if (xfs_is_reflink_inode(ip)) {
+ xfs_refcount_decrease_extent(tp, true, &data);
+ } else {
+ error = xfs_free_extent_later(tp, data.br_startblock,
+ data.br_blockcount, NULL,
+ XFS_AG_RESV_NONE,
+ XFS_FREE_EXTENT_REALTIME);
+ if (error)
+ return error;
+ }
+ }
+
+ error = xfs_zone_record_blocks(tp, new->br_startblock,
+ new->br_blockcount, true);
+ if (error)
+ return error;
+
+ /* Map the new blocks into the data fork. */
+ xfs_bmap_map_extent(tp, ip, XFS_DATA_FORK, new);
+ return 0;
+
+skip:
+ trace_xfs_reflink_cow_remap_skip(ip, new);
+ return xfs_zone_record_blocks(tp, new->br_startblock,
+ new->br_blockcount, false);
+}
+
+int
+xfs_zoned_end_io(
+ struct xfs_inode *ip,
+ xfs_off_t offset,
+ xfs_off_t count,
+ xfs_daddr_t daddr,
+ xfs_fsblock_t old_startblock)
+{
+ struct xfs_mount *mp = ip->i_mount;
+ xfs_fileoff_t end_fsb = XFS_B_TO_FSB(mp, offset + count);
+ struct xfs_bmbt_irec new = {
+ .br_startoff = XFS_B_TO_FSBT(mp, offset),
+ .br_startblock = xfs_daddr_to_rtb(mp, daddr),
+ .br_state = XFS_EXT_NORM,
+ };
+ unsigned int resblks =
+ XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK);
+ struct xfs_trans *tp;
+ int error;
+
+ if (xfs_is_shutdown(mp))
+ return -EIO;
+
+ while (new.br_startoff < end_fsb) {
+ new.br_blockcount = end_fsb - new.br_startoff;
+
+ error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks, 0,
+ XFS_TRANS_RESERVE | XFS_TRANS_RES_FDBLKS, &tp);
+ if (error)
+ return error;
+ xfs_ilock(ip, XFS_ILOCK_EXCL);
+ xfs_trans_ijoin(tp, ip, 0);
+
+ error = xfs_zoned_end_extent(tp, ip, &new, old_startblock);
+ if (error)
+ xfs_trans_cancel(tp);
+ else
+ error = xfs_trans_commit(tp);
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
+ if (error)
+ return error;
+
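+ /*
+ * xfs_zoned_end_extent may have trimmed br_blockcount to the
+ * existing data fork extent, so advance all cursors by the amount
+ * actually processed.
+ */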
+ new.br_startoff += new.br_blockcount;
+ new.br_startblock += new.br_blockcount;
+ if (old_startblock != NULLFSBLOCK)
+ old_startblock += new.br_blockcount;
+ }
+
+ return 0;
+}
+#endif /* CONFIG_XFS_RT */
+
/*
* Free all CoW staging blocks that are still referenced by the ondisk refcount
* metadata. The ondisk metadata does not track which inode created the
return 0;
trace_xfs_zero_eof(ip, isize, pos - isize);
- return xfs_zero_range(ip, isize, pos - isize, NULL);
+ return xfs_zero_range(ip, isize, pos - isize, NULL, NULL);
}
/*
xfs_fileoff_t end_fsb, bool cancel_real);
extern int xfs_reflink_cancel_cow_range(struct xfs_inode *ip, xfs_off_t offset,
xfs_off_t count, bool cancel_real);
-extern int xfs_reflink_end_cow(struct xfs_inode *ip, xfs_off_t offset,
+int xfs_reflink_end_cow(struct xfs_inode *ip, xfs_off_t offset,
xfs_off_t count);
+int xfs_zoned_end_io(struct xfs_inode *ip, xfs_off_t offset, xfs_off_t count,
+ xfs_daddr_t daddr, xfs_fsblock_t old_startblock);
extern int xfs_reflink_recover_cow(struct xfs_mount *mp);
extern loff_t xfs_reflink_remap_range(struct file *file_in, loff_t pos_in,
struct file *file_out, loff_t pos_out, loff_t len,
#include "xfs_trace.h"
#include "xfs_rtrefcount_btree.h"
#include "xfs_reflink.h"
+#include "xfs_zone_alloc.h"
/*
* Return whether there are any free extents in the size range given
{
int i;
+ list_del_init(&rtg->rtg_entry);
for (i = 0; i < XFS_RTGI_MAX; i++)
xfs_rtginode_irele(&rtg->rtg_inodes[i]);
kvfree(rtg->rtg_rsum_cache);
if (xfs_has_reflink(mp) &&
!xfs_reflink_supports_rextsize(mp, in->extsize))
goto out_unlock;
+ if (xfs_has_zoned(mp))
+ goto out_unlock;
error = xfs_sb_validate_fsb_count(&mp->m_sb, in->newblocks);
if (error)
error = xfs_rtmount_rtg(mp, tp, rtg);
if (error) {
xfs_rtgroup_rele(rtg);
- xfs_rtunmount_inodes(mp);
- break;
+ goto out_rtunmount_rtgs;
}
}
+ if (xfs_has_zoned(mp)) {
+ error = xfs_mount_zones(mp);
+ if (error)
+ goto out_rtunmount_rtgs;
+ }
+
out_cancel:
xfs_trans_cancel(tp);
return error;
+
+out_rtunmount_rtgs:
+ rtg = NULL;
+ while ((rtg = xfs_rtgroup_next(mp, rtg)))
+ xfs_rtunmount_rtg(rtg);
+ xfs_rtginode_irele(&mp->m_rtdirip);
+ goto out_cancel;
}
void
{
struct xfs_rtgroup *rtg = NULL;
+ if (xfs_has_zoned(mp))
+ xfs_unmount_zones(mp);
+
while ((rtg = xfs_rtgroup_next(mp, rtg)))
xfs_rtunmount_rtg(rtg);
xfs_rtginode_irele(&mp->m_rtdirip);
ap->datatype & XFS_ALLOC_INITIAL_USER_DATA;
int error;
+ ASSERT(!xfs_has_zoned(ap->tp->t_mountp));
+
retry:
error = xfs_rtallocate_align(ap, &ralen, &raminlen, &prod, &noalign);
if (error)
#include "xfs_exchmaps_item.h"
#include "xfs_parent.h"
#include "xfs_rtalloc.h"
+#include "xfs_zone_alloc.h"
#include "scrub/stats.h"
#include "scrub/rcbag_btree.h"
Opt_filestreams, Opt_quota, Opt_noquota, Opt_usrquota, Opt_grpquota,
Opt_prjquota, Opt_uquota, Opt_gquota, Opt_pquota,
Opt_uqnoenforce, Opt_gqnoenforce, Opt_pqnoenforce, Opt_qnoenforce,
- Opt_discard, Opt_nodiscard, Opt_dax, Opt_dax_enum,
+ Opt_discard, Opt_nodiscard, Opt_dax, Opt_dax_enum, Opt_max_open_zones,
+ Opt_zoned_op,
};
static const struct fs_parameter_spec xfs_fs_parameters[] = {
fsparam_flag("nodiscard", Opt_nodiscard),
fsparam_flag("dax", Opt_dax),
fsparam_enum("dax", Opt_dax_enum, dax_param_enums),
+ fsparam_u32("max_open_zones", Opt_max_open_zones),
+ fsparam_u64("zoned_op", Opt_zoned_op),
{}
};
if (!(mp->m_qflags & XFS_ALL_QUOTA_ACCT))
seq_puts(m, ",noquota");
+ if (mp->m_max_open_zones)
+ seq_printf(m, ",max_open_zones=%d", mp->m_max_open_zones);
+
+ if (mp->m_zoned_op)
+ seq_printf(m, ",zoned_op=%llu", mp->m_zoned_op);
+
return 0;
}
s64 freertx;
statp->f_blocks = sbp->sb_rblocks;
+ if (xfs_has_zoned(mp))
+ statp->f_blocks -= mp->m_resblks[FREE_RTEXTENTS].total;
freertx = max_t(int64_t, 0, xfs_sum_freecounter(mp, FREE_RTEXTENTS));
statp->f_bavail = statp->f_bfree =
xfs_rtbxlen_to_blen(mp, freertx);
percpu_counter_set(&mp->m_icount, mp->m_sb.sb_icount);
percpu_counter_set(&mp->m_ifree, mp->m_sb.sb_ifree);
percpu_counter_set(&mp->m_free[FREE_BLOCKS], mp->m_sb.sb_fdblocks);
- percpu_counter_set(&mp->m_free[FREE_RTEXTENTS], mp->m_sb.sb_frextents);
+ if (!xfs_has_zoned(mp))
+ percpu_counter_set(&mp->m_free[FREE_RTEXTENTS],
+ mp->m_sb.sb_frextents);
}
static void
xfs_force_shutdown(XFS_M(sb), SHUTDOWN_DEVICE_REMOVED);
}
+static int
+xfs_fs_show_stats(
+ struct seq_file *m,
+ struct dentry *root)
+{
+ struct xfs_mount *mp = XFS_M(root->d_sb);
+
+ if (xfs_has_zoned(mp) && IS_ENABLED(CONFIG_XFS_RT))
+ xfs_zoned_show_stats(m, mp);
+ return 0;
+}
+
static const struct super_operations xfs_super_operations = {
.alloc_inode = xfs_fs_alloc_inode,
.destroy_inode = xfs_fs_destroy_inode,
.nr_cached_objects = xfs_fs_nr_cached_objects,
.free_cached_objects = xfs_fs_free_cached_objects,
.shutdown = xfs_fs_shutdown,
+ .show_stats = xfs_fs_show_stats,
};
static int
xfs_fs_warn_deprecated(fc, param, XFS_FEAT_NOATTR2, true);
parsing_mp->m_features |= XFS_FEAT_NOATTR2;
return 0;
+ case Opt_max_open_zones:
+ parsing_mp->m_max_open_zones = result.uint_32;
+ return 0;
+ case Opt_zoned_op:
+ parsing_mp->m_zoned_op = result.uint_64;
+ return 0;
default:
xfs_warn(parsing_mp, "unknown mount option [%s].", param->key);
return -EINVAL;
mp->m_features &= ~XFS_FEAT_DISCARD;
}
- if (xfs_has_metadir(mp))
+ if (xfs_has_metadir(mp)) {
xfs_warn_experimental(mp, XFS_EXPERIMENTAL_METADIR);
+ } else if (xfs_has_zoned(mp)) {
+ xfs_alert(mp,
+ "metadir feature required for zoned realtime devices.");
+ error = -EINVAL;
+ goto out_filestream_unmount;
+ }
if (xfs_has_reflink(mp)) {
if (xfs_has_realtime(mp) &&
goto out_filestream_unmount;
}
+ if (xfs_has_zoned(mp)) {
+ xfs_alert(mp,
+ "reflink not compatible with zoned RT device!");
+ error = -EINVAL;
+ goto out_filestream_unmount;
+ }
+
/*
* always-cow mode is not supported on filesystems with rt
* extent sizes larger than a single block because we'd have
/* Re-enable the background inode inactivation worker. */
xfs_inodegc_start(mp);
+ /* Restart zone reclaim */
+ xfs_zone_gc_start(mp);
+
return 0;
}
*/
xfs_inodegc_stop(mp);
+ /* Stop zone reclaim */
+ xfs_zone_gc_stop(mp);
+
/* Free the per-AG metadata reservation pool. */
xfs_fs_unreserve_ag_blocks(mp);
mutex_init(&mp->m_growlock);
INIT_WORK(&mp->m_flush_inodes_work, xfs_flush_inodes_worker);
INIT_DELAYED_WORK(&mp->m_reclaim_work, xfs_reclaim_worker);
+
mp->m_kobj.kobject.kset = xfs_kset;
/*
* We don't create the finobt per-ag space reservation until after log
xfs_kill_sb(
struct super_block *sb)
{
+ struct xfs_mount *mp = XFS_M(sb);
+
kill_block_super(sb);
- xfs_mount_free(XFS_M(sb));
+ xfs_mount_free(mp);
}
static struct file_system_type xfs_fs_type = {
#include "xfs_metafile.h"
#include "xfs_metadir.h"
#include "xfs_rtgroup.h"
+#include "xfs_zone_alloc.h"
/*
* We include this last to have the helpers above available for the trace
(char *)__entry->caller_ip)
);
+
#define DEFINE_GROUP_REF_EVENT(name) \
DEFINE_EVENT(xfs_group_class, name, \
TP_PROTO(struct xfs_group *xg, unsigned long caller_ip), \
DEFINE_GROUP_REF_EVENT(xfs_group_grab_next_tag);
DEFINE_GROUP_REF_EVENT(xfs_group_rele);
+#ifdef CONFIG_XFS_RT
+DECLARE_EVENT_CLASS(xfs_zone_class,
+ TP_PROTO(struct xfs_rtgroup *rtg),
+ TP_ARGS(rtg),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_rgnumber_t, rgno)
+ __field(xfs_rgblock_t, used)
+ __field(xfs_rgblock_t, written)
+ __field(xfs_rgblock_t, write_pointer)
+ ),
+ TP_fast_assign(
+ __entry->dev = rtg_mount(rtg)->m_super->s_dev;
+ __entry->rgno = rtg_rgno(rtg);
+ __entry->used = *xfs_zone_used_counter(rtg);
+ __entry->written = rtg->rtg_written;
+ __entry->write_pointer = rtg->rtg_write_pointer;
+ ),
+ TP_printk("dev %d:%d rgno 0x%x used 0x%x written 0x%x wp 0x%x",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->rgno,
+ __entry->used,
+ __entry->written,
+ __entry->write_pointer)
+);
+
+#define DEFINE_ZONE_EVENT(name) \
+DEFINE_EVENT(xfs_zone_class, name, \
+ TP_PROTO(struct xfs_rtgroup *rtg), \
+ TP_ARGS(rtg))
+DEFINE_ZONE_EVENT(xfs_zone_emptied);
+DEFINE_ZONE_EVENT(xfs_zone_full);
+DEFINE_ZONE_EVENT(xfs_zone_activate);
+DEFINE_ZONE_EVENT(xfs_zone_reset);
+DEFINE_ZONE_EVENT(xfs_zone_reclaim);
+DEFINE_ZONE_EVENT(xfs_gc_zone_activate);
+
+DECLARE_EVENT_CLASS(xfs_zone_alloc_class,
+ TP_PROTO(struct xfs_rtgroup *rtg, xfs_rgblock_t rgbno,
+ xfs_extlen_t len),
+ TP_ARGS(rtg, rgbno, len),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_rgnumber_t, rgno)
+ __field(xfs_rgblock_t, used)
+ __field(xfs_rgblock_t, written)
+ __field(xfs_rgblock_t, write_pointer)
+ __field(xfs_rgblock_t, rgbno)
+ __field(xfs_extlen_t, len)
+ ),
+ TP_fast_assign(
+ __entry->dev = rtg_mount(rtg)->m_super->s_dev;
+ __entry->rgno = rtg_rgno(rtg);
+ __entry->used = *xfs_zone_used_counter(rtg);
+ __entry->written = rtg->rtg_written;
+ __entry->write_pointer = rtg->rtg_write_pointer;
+ __entry->rgbno = rgbno;
+ __entry->len = len;
+ ),
+ TP_printk("dev %d:%d rgno 0x%x used 0x%x written 0x%x wp 0x%x rgbno 0x%x len 0x%x",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->rgno,
+ __entry->used,
+ __entry->written,
+ __entry->write_pointer,
+ __entry->rgbno,
+ __entry->len)
+);
+
+#define DEFINE_ZONE_ALLOC_EVENT(name) \
+DEFINE_EVENT(xfs_zone_alloc_class, name, \
+ TP_PROTO(struct xfs_rtgroup *rtg, xfs_rgblock_t rgbno, \
+ xfs_extlen_t len), \
+ TP_ARGS(rtg, rgbno, len))
+DEFINE_ZONE_ALLOC_EVENT(xfs_zone_record_blocks);
+DEFINE_ZONE_ALLOC_EVENT(xfs_zone_free_blocks);
+DEFINE_ZONE_ALLOC_EVENT(xfs_zone_alloc_blocks);
+#endif /* CONFIG_XFS_RT */
+
TRACE_EVENT(xfs_inodegc_worker,
TP_PROTO(struct xfs_mount *mp, unsigned int shrinker_hits),
TP_ARGS(mp, shrinker_hits),
DEFINE_SIMPLE_IO_EVENT(xfs_end_io_direct_write_unwritten);
DEFINE_SIMPLE_IO_EVENT(xfs_end_io_direct_write_append);
DEFINE_SIMPLE_IO_EVENT(xfs_file_splice_read);
+DEFINE_SIMPLE_IO_EVENT(xfs_zoned_map_blocks);
DECLARE_EVENT_CLASS(xfs_itrunc_class,
TP_PROTO(struct xfs_inode *ip, xfs_fsize_t new_size),
DEFINE_SIMPLE_IO_EVENT(xfs_reflink_end_cow);
DEFINE_INODE_IREC_EVENT(xfs_reflink_cow_remap_from);
DEFINE_INODE_IREC_EVENT(xfs_reflink_cow_remap_to);
+DEFINE_INODE_IREC_EVENT(xfs_reflink_cow_remap_skip);
DEFINE_INODE_ERROR_EVENT(xfs_reflink_cancel_cow_range_error);
DEFINE_INODE_ERROR_EVENT(xfs_reflink_end_cow_error);
--- /dev/null
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2023 Christoph Hellwig.
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_fsops.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_inode.h"
+#include "xfs_iomap.h"
+#include "xfs_trans.h"
+#include "xfs_rtgroup.h"
+#include "xfs_rtbitmap.h"
+#include "xfs_zone_alloc.h"
+#include "xfs_zones.h"
+#include "xfs_trace.h"
+
+/*
+ * Keep track of a counter of blocks used in a rtgroup. This is incremented
+ * after the blocks have been written to and the I/O completion handler sets
+ * up the bmap and rmap records to link them into the file system metadata,
+ * and decremented when the blocks are "freed" by unlinking them from the bmap
+ * and rmap trees. The space will only become available for reuse when the
+ * zone is reset.
+ *
+ * The way this is stored is a bit of a hack and abuses the atime field in the
+ * rmap inode. There is precedent for this in the rtbitmap inode, but it is
+ * a bit ugly.
+ */
+uint64_t *
+xfs_zone_used_counter(
+ struct xfs_rtgroup *rtg)
+{
+ return (uint64_t *)&VFS_I(rtg->rtg_inodes[XFS_RTGI_RMAP])->i_atime_sec;
+}
+
+/*
+ * Keep track of the last written block in a zone.
+ *
+ * This is only needed when using the zoned allocator on a device that doesn't
+ * support zones natively and is an approximation for the hardware write
+ * pointer. Unlike the hardware write pointer it might be past regions that
+ * haven't been written to. In case of an unclean shutdown this means there
+ * could be blocks that will never be written before the zone is finished.
+ * This is a little bit inefficient, but not a real problem as the used counter
+ * above doesn't account for them, so zone reclaim treats them as if they had
+ * been written to and then deleted immediately.
+ *
+ * This uses the same kind of hack to store extra information in the rmap inode
+ * as the used counter above.
+ */
+uint64_t *
+xfs_zone_last_written(
+ struct xfs_rtgroup *rtg)
+{
+ return (uint64_t *)&VFS_I(rtg->rtg_inodes[XFS_RTGI_RMAP])->i_mtime_sec;
+}
+
+static void
+xfs_zone_emptied(
+ struct xfs_rtgroup *rtg)
+{
+ struct xfs_mount *mp = rtg_mount(rtg);
+
+ trace_xfs_zone_emptied(rtg);
+
+ xfs_group_clear_mark(&rtg->rtg_group, XFS_RTG_RECLAIMABLE);
+
+ spin_lock(&mp->m_zone_list_lock);
+ ASSERT(list_empty(&rtg->rtg_entry));
+ list_add_tail(&rtg->rtg_entry, &mp->m_emptied_zones);
+ spin_unlock(&mp->m_zone_list_lock);
+
+ wake_up_process(mp->m_zone_gc_thread);
+}
+
+static void
+xfs_zone_mark_reclaimable(
+ struct xfs_rtgroup *rtg)
+{
+ struct xfs_mount *mp = rtg_mount(rtg);
+
+ xfs_group_set_mark(&rtg->rtg_group, XFS_RTG_RECLAIMABLE);
+ if (xfs_zoned_need_gc(mp))
+ wake_up_process(mp->m_zone_gc_thread);
+}
+
+static void
+xfs_zone_mark_full(
+ struct xfs_rtgroup *rtg)
+{
+ struct xfs_mount *mp = rtg_mount(rtg);
+
+ trace_xfs_zone_full(rtg);
+
+ spin_lock(&mp->m_zone_list_lock);
+ clear_bit(RTG_F_OPEN, &rtg->rtg_flags);
+ /* an empty list entry means this is the open GC zone, which isn't counted */
+ if (!list_empty(&rtg->rtg_entry)) {
+ mp->m_nr_open_zones--;
+ list_del_init(&rtg->rtg_entry);
+ }
+ spin_unlock(&mp->m_zone_list_lock);
+
+ wake_up_all(&mp->m_zone_wait);
+ if (*xfs_zone_used_counter(rtg) < rtg->rtg_extents)
+ xfs_zone_mark_reclaimable(rtg);
+}
+
+/*
+ * Record data blocks as having been written to.
+ *
+ * This is called from the write completion handler and records blocks as
+ * actually used. For zoned devices all this is purely an in-memory
+ * exercise to manage the open zones, but if we run on a conventional
+ * device we also have to record the last written block as the write pointer
+ * approximation.
+ */
+int
+xfs_zone_record_blocks(
+ struct xfs_trans *tp,
+ xfs_fsblock_t fsbno,
+ xfs_filblks_t len,
+ bool used)
+{
+ struct xfs_mount *mp = tp->t_mountp;
+ xfs_rgblock_t rgbno = xfs_rtb_to_rgbno(mp, fsbno);
+ struct xfs_rtgroup *rtg;
+
+ rtg = xfs_rtgroup_get(mp, xfs_rtb_to_rgno(mp, fsbno));
+ if (!rtg)
+ return -EIO;
+
+ trace_xfs_zone_record_blocks(rtg, rgbno, len);
+
+ xfs_ilock(rtg->rtg_inodes[XFS_RTGI_RMAP], XFS_ILOCK_EXCL);
+ xfs_trans_ijoin(tp, rtg->rtg_inodes[XFS_RTGI_RMAP], XFS_ILOCK_EXCL);
+
+ if (used) {
+ *xfs_zone_used_counter(rtg) += len;
+ ASSERT(*xfs_zone_used_counter(rtg) <= rtg->rtg_extents);
+ } else {
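+ /*
+ * The blocks were written but aren't referenced by the data
+ * fork (e.g. a racing data write superseded a GC write), so
+ * return them to the free counter right away.
+ */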
+ xfs_add_frextents(mp, xfs_extlen_to_rtxlen(mp, len));
+ }
+
+ if (rgbno + len > *xfs_zone_last_written(rtg))
+ *xfs_zone_last_written(rtg) = rgbno + len;
+
+ rtg->rtg_written += len;
+ ASSERT(rtg->rtg_written <= rtg->rtg_write_pointer);
+ if (rtg->rtg_written == rtg->rtg_extents)
+ xfs_zone_mark_full(rtg);
+
+ xfs_trans_log_inode(tp, rtg->rtg_inodes[XFS_RTGI_RMAP], XFS_ILOG_CORE);
+
+ xfs_rtgroup_put(rtg);
+ return 0;
+}
+
+/*
+ * "Free" blocks allocated in a zone.
+ *
+ * Just decrement the used blocks counter and report the space as freed.
+ */
+int
+xfs_zone_free_blocks(
+ struct xfs_trans *tp,
+ struct xfs_rtgroup *rtg,
+ xfs_fsblock_t fsbno,
+ xfs_filblks_t len)
+{
+ struct xfs_mount *mp = tp->t_mountp;
+ uint64_t *used = xfs_zone_used_counter(rtg);
+
+ xfs_assert_ilocked(rtg->rtg_inodes[XFS_RTGI_RMAP], XFS_ILOCK_EXCL);
+ if (len > *used) {
+ xfs_err(mp,
+"trying to free more blocks (%lld) than used counter (%lld).",
+ len, *used);
+ ASSERT(len <= *used);
+ xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
+ return -EFSCORRUPTED;
+ }
+
+ trace_xfs_zone_free_blocks(rtg, xfs_rtb_to_rgbno(mp, fsbno), len);
+
+ *used -= len;
+ if (rtg->rtg_written == rtg->rtg_extents) {
+ /*
+ * Mark the zone as reclaimable, but only if the zone is full
+ * as we don't reclaim open zones. As an optimization, kick off a
+ * zone reset if the usage counter hits zero.
+ */
+ if (*used == 0)
+ xfs_zone_emptied(rtg);
+ else if (*used + len == rtg->rtg_extents)
+ xfs_zone_mark_reclaimable(rtg);
+ }
+
+ xfs_add_frextents(mp, xfs_extlen_to_rtxlen(mp, len));
+ xfs_trans_log_inode(tp, rtg->rtg_inodes[XFS_RTGI_RMAP], XFS_ILOG_CORE);
+ return 0;
+}
+
+/*
+ * Check if the zone containing the data just before the offset we are
+ * writing to is still open and has space.
+ */
+static struct xfs_rtgroup *
+xfs_last_used_zone(
+ struct iomap_ioend *ioend)
+{
+ struct xfs_inode *ip = XFS_I(ioend->io_inode);
+ struct xfs_mount *mp = ip->i_mount;
+ xfs_fileoff_t offset_fsb = XFS_B_TO_FSB(mp, ioend->io_offset);
+ struct xfs_rtgroup *rtg = NULL;
+ struct xfs_iext_cursor icur;
+ struct xfs_bmbt_irec got;
+
+ xfs_ilock(ip, XFS_ILOCK_SHARED);
+ if (!xfs_iext_lookup_extent_before(ip, &ip->i_df, &offset_fsb,
+ &icur, &got))
+ goto out_unlock;
+ ASSERT(!isnullstartblock(got.br_startblock));
+ rtg = xfs_rtgroup_grab(mp, xfs_rtb_to_rgno(mp, got.br_startblock));
+ if (rtg && !test_bit(RTG_F_OPEN, &rtg->rtg_flags)) {
+ xfs_rtgroup_rele(rtg);
+ rtg = NULL;
+ }
+out_unlock:
+ xfs_iunlock(ip, XFS_ILOCK_SHARED);
+ return rtg;
+}
+
+struct xfs_rtgroup *
+xfs_find_free_zone(
+ struct xfs_mount *mp)
+{
+ struct xfs_rtgroup *rtg;
+
+ lockdep_assert_held(&mp->m_zone_list_lock);
+
+ list_for_each_entry(rtg, &mp->m_free_zones, rtg_entry) {
+ ASSERT(rtg->rtg_write_pointer == 0);
+ if (atomic_inc_not_zero(&rtg->rtg_group.xg_active_ref)) {
+ list_del_init(&rtg->rtg_entry);
+ atomic_dec(&mp->m_nr_free_zones);
+ return rtg;
+ }
+ }
+
+ return NULL;
+}
+
+/*
+ * Activate a free zone.
+ *
+ * This just does the accounting and makes the zone discoverable on the open
+ * zones list. Don't bother with an explicit open command, we'll just open the
+ * zone implicitly with the first write to it.
+ */
+static struct xfs_rtgroup *
+xfs_activate_zone(
+ struct xfs_mount *mp)
+{
+ struct xfs_rtgroup *rtg;
+
+ if (atomic_read(&mp->m_nr_free_zones) <
+ XFS_GC_ZONES - XFS_OPEN_GC_ZONES)
+ return NULL;
+
+ rtg = xfs_find_free_zone(mp);
+ if (!rtg)
+ return NULL;
+
+ list_add_tail(&rtg->rtg_entry, &mp->m_open_zones);
+ mp->m_nr_open_zones++;
+ if (xfs_zoned_need_gc(mp))
+ wake_up_process(mp->m_zone_gc_thread);
+
+ /* XXX: this is a little verbose, but let's keep it for now */
+ xfs_info(mp, "using zone %u (%d)",
+ rtg_rgno(rtg), mp->m_nr_open_zones);
+ set_bit(RTG_F_OPEN, &rtg->rtg_flags);
+ trace_xfs_zone_activate(rtg);
+ return rtg;
+}
+
+/*
+ * For SMR hard drives that have no open limit, keep opening a new zone for each
+ * allocation context. If all zones in the system are open, use this simple LRU
+ * algorithm to pick then one that was least recently used.
+ *
+ * This requires that any reused zone is rotated to the end of the open list so
+ * that the next users doesn't pick it again.
+ */
+static struct xfs_rtgroup *
+xfs_select_open_zone_lru(
+ struct xfs_mount *mp,
+ unsigned int minlen)
+{
+ struct xfs_rtgroup *rtg;
+
+ list_for_each_entry(rtg, &mp->m_open_zones, rtg_entry) {
+ if (rtg->rtg_extents - rtg->rtg_write_pointer < minlen)
+ continue;
+ if (!atomic_inc_not_zero(&rtg->rtg_group.xg_active_ref))
+ continue;
+ list_move_tail(&rtg->rtg_entry, &mp->m_open_zones);
+ return rtg;
+ }
+
+ return NULL;
+}
+
+/*
+ * Pick a new zone for writes.
+ *
+ * If we aren't using up our budget of open zones just open a new one from
+ * the freelist. Else try to find one that matches the expected allocation
+ * length, or at least the minimum required length. If we don't find one
+ * that is good enough we pick one anyway and let the caller finish it to
+ * free up open zone resources.
+ */
+static struct xfs_rtgroup *
+xfs_select_zone_nowait(
+ struct xfs_inode *ip,
+ xfs_filblks_t count_fsb)
+{
+ struct xfs_mount *mp = ip->i_mount;
+ struct xfs_rtgroup *rtg;
+
+ /*
+ * If we are below the open limit try to activate a zone.
+ */
+ if (mp->m_nr_open_zones < mp->m_max_open_zones - XFS_OPEN_GC_ZONES) {
+ rtg = xfs_activate_zone(mp);
+ if (rtg)
+ return rtg;
+ }
+
+ rtg = xfs_select_open_zone_lru(mp, count_fsb);
+ if (rtg)
+ return rtg;
+ return xfs_select_open_zone_lru(mp, 1);
+}
+
+static struct xfs_rtgroup *
+xfs_select_zone(
+ struct iomap_ioend *ioend)
+{
+ struct xfs_inode *ip = XFS_I(ioend->io_inode);
+ struct xfs_mount *mp = ip->i_mount;
+ xfs_filblks_t count_fsb = XFS_B_TO_FSB(mp, ioend->io_size);
+ struct xfs_rtgroup *rtg = NULL;
+ DEFINE_WAIT (wait);
+
+ spin_lock(&mp->m_zone_list_lock);
+ if (xfs_is_shutdown(mp))
+ goto out_unlock;
+
+ rtg = xfs_select_zone_nowait(ip, count_fsb);
+ if (rtg)
+ goto out_unlock;
+
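+ /*
+ * Nothing is available right now: wait for a zone to be finished or
+ * reset, which wakes m_zone_wait.
+ */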
+ for (;;) {
+ prepare_to_wait(&mp->m_zone_wait, &wait, TASK_UNINTERRUPTIBLE);
+ if (xfs_is_shutdown(mp))
+ break;
+
+ rtg = xfs_select_zone_nowait(ip, count_fsb);
+ if (rtg)
+ break;
+
+ spin_unlock(&mp->m_zone_list_lock);
+ schedule();
+ spin_lock(&mp->m_zone_list_lock);
+ }
+ finish_wait(&mp->m_zone_wait, &wait);
+
+out_unlock:
+ spin_unlock(&mp->m_zone_list_lock);
+ return rtg;
+}
+
+static unsigned int
+xfs_zone_alloc_blocks(
+ struct iomap_ioend *ioend,
+ struct xfs_rtgroup *rtg,
+ bool *is_seq)
+{
+ struct xfs_mount *mp = rtg_mount(rtg);
+ xfs_filblks_t count_fsb = XFS_B_TO_FSB(mp, ioend->io_size);
+ xfs_rgblock_t rgbno;
+
+ spin_lock(&rtg->rtg_alloc_lock);
+ count_fsb = min3(count_fsb, XFS_MAX_BMBT_EXTLEN,
+ (xfs_filblks_t)rtg->rtg_extents - rtg->rtg_write_pointer);
+ if (!count_fsb || !test_bit(RTG_F_OPEN, &rtg->rtg_flags)) {
+ spin_unlock(&rtg->rtg_alloc_lock);
+ return 0;
+ }
+ rgbno = rtg->rtg_write_pointer;
+ rtg->rtg_write_pointer += count_fsb;
+ spin_unlock(&rtg->rtg_alloc_lock);
+
+ trace_xfs_zone_alloc_blocks(rtg, rgbno, count_fsb);
+
+ *is_seq = test_bit(RTG_F_SEQUENTIAL, &rtg->rtg_flags);
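+ /*
+ * Sequential write required zones are written using zone append, so
+ * point the bio at the start of the zone and let the device pick the
+ * actual write location.
+ */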
+ if (*is_seq)
+ rgbno = 0;
+ ioend->io_sector = xfs_rtb_to_daddr(mp, xfs_rgbno_to_rtb(rtg, rgbno));
+ return XFS_FSB_TO_B(mp, count_fsb);
+}
+
+static inline void
+xfs_mark_rtg_boundary(
+ struct iomap_ioend *ioend)
+{
+ struct xfs_mount *mp = XFS_I(ioend->io_inode)->i_mount;
+ sector_t sector = ioend->io_bio.bi_iter.bi_sector;
+
+ if (xfs_rtb_to_rgbno(mp, xfs_daddr_to_rtb(mp, sector)) == 0)
+ ioend->io_flags |= IOMAP_F_BOUNDARY;
+}
+
+static void
+xfs_submit_zoned_bio(
+ struct iomap_ioend *ioend,
+ bool is_seq)
+{
+ if (is_seq) {
+ ioend->io_bio.bi_opf &= ~REQ_OP_WRITE;
+ ioend->io_bio.bi_opf |= REQ_OP_ZONE_APPEND;
+ } else {
+ xfs_mark_rtg_boundary(ioend);
+ }
+
+ ioend->io_bio.bi_iter.bi_sector = ioend->io_sector;
+ submit_bio(&ioend->io_bio);
+}
+
+void
+xfs_zone_alloc_and_submit(
+ struct iomap_ioend *ioend,
+ struct xfs_rtgroup **rtg)
+{
+ unsigned int alloc_len;
+ struct iomap_ioend *split;
+ bool is_seq;
+
+ if (xfs_is_shutdown(XFS_I(ioend->io_inode)->i_mount))
+ goto out_error;
+
+ /*
+ * If we don't have a cached zone in this write context, see if the
+ * last extent before the one we are writing points to an active zone.
+ * If so, just continue writing to it.
+ */
+ if (!*rtg)
+ *rtg = xfs_last_used_zone(ioend);
+
+ if (!*rtg) {
+select_zone:
+ *rtg = xfs_select_zone(ioend);
+ if (!*rtg)
+ goto out_error;
+ }
+
+ alloc_len = xfs_zone_alloc_blocks(ioend, *rtg, &is_seq);
+ if (!alloc_len) {
+ xfs_zone_finish_alloc(*rtg);
+ goto select_zone;
+ }
+
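+ /*
+ * Split the ioend to fit the space allocated above; if the allocation
+ * runs out, finish this zone and pick a new one for the remainder.
+ */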
+ while ((split = iomap_split_ioend(ioend, is_seq, &alloc_len))) {
+ xfs_submit_zoned_bio(split, is_seq);
+ if (!alloc_len) {
+ xfs_zone_finish_alloc(*rtg);
+ goto select_zone;
+ }
+ }
+
+ xfs_submit_zoned_bio(ioend, is_seq);
+ return;
+
+out_error:
+ bio_io_error(&ioend->io_bio);
+}
+
+void
+xfs_zone_finish_alloc(
+ struct xfs_rtgroup *rtg)
+{
+ if (rtg)
+ xfs_rtgroup_rele(rtg);
+}
+
+static void
+xfs_show_zone(
+ struct seq_file *m,
+ struct xfs_rtgroup *rtg)
+{
+ seq_printf(m, "\t zone %d, wp %u, written %u, used %llu\n",
+ rtg_rgno(rtg),
+ rtg->rtg_write_pointer, rtg->rtg_written,
+ *xfs_zone_used_counter(rtg));
+}
+
+void
+xfs_zoned_show_stats(
+ struct seq_file *m,
+ struct xfs_mount *mp)
+{
+ unsigned long index = 0;
+ unsigned count = 0;
+ struct xfs_rtgroup *rtg;
+
+ seq_puts(m, "\n");
+
+ seq_printf(m, "\tuser free blocks: %lld\n",
+ xfs_sum_freecounter(mp, FREE_RTEXTENTS));
+ seq_printf(m, "\treserved free blocks: %lld\n",
+ mp->m_resblks[FREE_RTEXTENTS].avail);
+ seq_printf(m, "\tuser available blocks: %lld\n",
+ xfs_sum_freecounter(mp, FREE_RTAVAILABLE));
+ seq_printf(m, "\treserved available blocks: %lld\n",
+ mp->m_resblks[FREE_RTAVAILABLE].avail);
+ seq_printf(m, "\treservations required: %d\n",
+ !list_empty_careful(&mp->m_reclaim_reservations));
+ seq_printf(m, "\tGC required: %d\n",
+ xfs_zoned_need_gc(mp));
+
+ spin_lock(&mp->m_zone_list_lock);
+ seq_printf(m, "\tfree zones: %d\n", atomic_read(&mp->m_nr_free_zones));
+ seq_puts(m, "\topen zones:\n");
+ list_for_each_entry(rtg, &mp->m_open_zones, rtg_entry)
+ xfs_show_zone(m, rtg);
+ if (mp->m_open_gc_zone) {
+ seq_puts(m, "\topen gc zone:\n");
+ xfs_show_zone(m, mp->m_open_gc_zone);
+ }
+ seq_puts(m, "\treclaimable zones:\n");
+ xa_for_each_marked(&mp->m_groups[XG_TYPE_RTG].xa, index, rtg,
+ XFS_RTG_RECLAIMABLE) {
+ if (++count > 20) {
+ seq_puts(m, "\t (truncated)\n");
+ break;
+ }
+ xfs_show_zone(m, rtg);
+ }
+ spin_unlock(&mp->m_zone_list_lock);
+}
--- /dev/null
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _XFS_ZONE_ALLOC_H
+#define _XFS_ZONE_ALLOC_H
+
+void xfs_zone_alloc_and_submit(struct iomap_ioend *ioend,
+ struct xfs_rtgroup **rtg);
+void xfs_zone_finish_alloc(struct xfs_rtgroup *rtg);
+int xfs_zone_record_blocks(struct xfs_trans *tp, xfs_fsblock_t fsbno,
+ xfs_filblks_t len, bool used);
+int xfs_zone_free_blocks(struct xfs_trans *tp, struct xfs_rtgroup *rtg,
+ xfs_fsblock_t fsbno, xfs_filblks_t len);
+
+uint64_t xfs_zoned_default_resblks(struct xfs_mount *mp, unsigned int idx);
+
+int xfs_mount_zones(struct xfs_mount *mp);
+void xfs_unmount_zones(struct xfs_mount *mp);
+
+#ifdef CONFIG_XFS_RT
+void xfs_zone_gc_start(struct xfs_mount *mp);
+void xfs_zone_gc_stop(struct xfs_mount *mp);
+#else
+static inline void xfs_zone_gc_start(struct xfs_mount *mp)
+{
+}
+static inline void xfs_zone_gc_stop(struct xfs_mount *mp)
+{
+}
+#endif /* CONFIG_XFS_RT */
+
+void xfs_zoned_show_stats(struct seq_file *m, struct xfs_mount *mp);
+
+uint64_t *xfs_zone_used_counter(struct xfs_rtgroup *rtg);
+uint64_t *xfs_zone_last_written(struct xfs_rtgroup *rtg);
+
+struct xfs_zone_alloc_ctx {
+ struct xfs_rtgroup *cached_rtg;
+ xfs_filblks_t reserved_blocks;
+};
+
+#define XFS_ZR_GREEDY (1U << 0)
+#define XFS_ZR_NOWAIT (1U << 1)
+#define XFS_ZR_RESERVED (1U << 2)
+
+int xfs_zoned_space_reserve(struct xfs_inode *ip, xfs_filblks_t count_fsb,
+ unsigned int flags, struct xfs_zone_alloc_ctx *ac);
+void xfs_zoned_space_unreserve(struct xfs_inode *ip,
+ struct xfs_zone_alloc_ctx *ac);
+void xfs_zoned_add_available(struct xfs_mount *mp, xfs_filblks_t count_fsb);
+bool xfs_zoned_need_gc(struct xfs_mount *mp);
+struct xfs_rtgroup *xfs_find_free_zone(struct xfs_mount *mp);
+
+#endif /* _XFS_ZONE_ALLOC_H */
--- /dev/null
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2023 Christoph Hellwig.
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_fsops.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_inode.h"
+#include "xfs_iomap.h"
+#include "xfs_btree.h"
+#include "xfs_trans.h"
+#include "xfs_icache.h"
+#include "xfs_rmap.h"
+#include "xfs_reflink.h"
+#include "xfs_rtgroup.h"
+#include "xfs_rtbitmap.h"
+#include "xfs_rtrmap_btree.h"
+#include "xfs_zone_alloc.h"
+#include "xfs_zones.h"
+#include "xfs_trace.h"
+
+struct xfs_zone_reservation {
+ struct list_head entry;
+ struct task_struct *task;
+ xfs_rtxnum_t rtxlen;
+};
+
+uint64_t
+xfs_zoned_default_resblks(
+ struct xfs_mount *mp,
+ unsigned int idx)
+{
+ /*
+ * For the available blocks dipped into by the allocator, only reserve the
+ * required GC zones.
+ */
+ if (idx == FREE_RTAVAILABLE)
+ return XFS_GC_ZONES * mp->m_groups[XG_TYPE_RTG].blocks;
+
+ /*
+ * For the user reported blocks, include at least the extra spare zone
+ * and also any extra overprovisioning.
+ */
+ return XFS_RESERVED_ZONES * mp->m_groups[XG_TYPE_RTG].blocks +
+ XFS_B_TO_FSB(mp, mp->m_zoned_op);
+}
+
+/*
+ * We aim to keep enough zones free in stock to fully use the open zone limit
+ * for data placement purposes.
+ */
+bool
+xfs_zoned_need_gc(
+ struct xfs_mount *mp)
+{
+ if (!xfs_group_marked(mp, XG_TYPE_RTG, XFS_RTG_RECLAIMABLE))
+ return false;
+ if (xfs_estimate_freecounter(mp, FREE_RTAVAILABLE) <
+ mp->m_groups[XG_TYPE_RTG].blocks *
+ (mp->m_max_open_zones - XFS_OPEN_GC_ZONES))
+ return true;
+ return false;
+}
+
+static void
+xfs_zoned_wake_all(
+ struct xfs_mount *mp)
+{
+ struct xfs_zone_reservation *reservation;
+
+ spin_lock(&mp->m_reservation_lock);
+ list_for_each_entry(reservation, &mp->m_reclaim_reservations, entry)
+ wake_up_process(reservation->task);
+ spin_unlock(&mp->m_reservation_lock);
+}
+
+void
+xfs_zoned_add_available(
+ struct xfs_mount *mp,
+ xfs_filblks_t count_fsb)
+{
+ struct xfs_zone_reservation *reservation;
+ xfs_rtxnum_t rtxlen;
+
+ rtxlen = xfs_extlen_to_rtxlen(mp, count_fsb);
+ if (list_empty_careful(&mp->m_reclaim_reservations)) {
+ xfs_add_freecounter(mp, FREE_RTAVAILABLE, rtxlen);
+ return;
+ }
+
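+ /*
+ * Wake waiters in FIFO order for as long as the now available space
+ * covers their reservations.
+ */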
+ spin_lock(&mp->m_reservation_lock);
+ xfs_add_freecounter(mp, FREE_RTAVAILABLE, rtxlen);
+ rtxlen = xfs_sum_freecounter(mp, FREE_RTAVAILABLE);
+ list_for_each_entry(reservation, &mp->m_reclaim_reservations, entry) {
+ if (reservation->rtxlen > rtxlen)
+ break;
+ wake_up_process(reservation->task);
+ rtxlen -= reservation->rtxlen;
+
+ }
+ spin_unlock(&mp->m_reservation_lock);
+}
+
+static int
+xfs_zoned_space_wait_error(
+ struct xfs_mount *mp)
+{
+ if (xfs_is_shutdown(mp))
+ return -EIO;
+ if (fatal_signal_pending(current))
+ return -EINTR;
+ return 0;
+}
+
+static int
+xfs_zoned_reserve_available(
+ struct xfs_inode *ip,
+ xfs_rtxlen_t rtxlen,
+ unsigned int flags)
+{
+ struct xfs_mount *mp = ip->i_mount;
+ struct xfs_zone_reservation reservation = {
+ .task = current,
+ .rtxlen = rtxlen,
+ };
+ int error;
+
+ if (likely(list_empty_careful(&mp->m_reclaim_reservations))) {
+ error = xfs_dec_freecounter(mp, FREE_RTAVAILABLE, rtxlen,
+ flags & XFS_ZR_RESERVED);
+ if (error != -ENOSPC)
+ return error;
+ }
+
+ if (flags & XFS_ZR_NOWAIT)
+ return -EAGAIN;
+
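+ /*
+ * Queue up behind other waiters and retry every time
+ * xfs_zoned_add_available() or the GC thread wakes us.
+ */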
+ spin_lock(&mp->m_reservation_lock);
+ list_add_tail(&reservation.entry, &mp->m_reclaim_reservations);
+ while ((error = xfs_zoned_space_wait_error(mp)) == 0) {
+ set_current_state(TASK_KILLABLE);
+
+ error = xfs_dec_freecounter(mp, FREE_RTAVAILABLE, rtxlen,
+ flags & XFS_ZR_RESERVED);
+ if (error != -ENOSPC)
+ break;
+
+ /*
+ * If there is nothing left to reclaim, give up.
+ */
+ if (!xfs_is_in_gc(mp) &&
+ !xfs_group_marked(mp, XG_TYPE_RTG, XFS_RTG_RECLAIMABLE))
+ break;
+
+ spin_unlock(&mp->m_reservation_lock);
+ schedule();
+ spin_lock(&mp->m_reservation_lock);
+ }
+ list_del(&reservation.entry);
+ spin_unlock(&mp->m_reservation_lock);
+
+ __set_current_state(TASK_RUNNING);
+ return error;
+}
+
+/*
+ * Implement greedy space allocation for short writes by trying to grab all
+ * that is left after locking out other threads from trying to do the same.
+ *
+ * This isn't exactly optimal and can hopefully be replaced by a proper
+ * percpu_counter primitive one day.
+ */
+static int
+xfs_zoned_reserve_extents_greedy(
+ struct xfs_inode *ip,
+ xfs_rtxlen_t *rtxlen,
+ unsigned int flags)
+{
+ struct xfs_mount *mp = ip->i_mount;
+ s64 len = *rtxlen;
+ int error = -ENOSPC;
+
+ spin_lock(&mp->m_reservation_lock);
+ len = min(len, xfs_sum_freecounter(mp, FREE_RTEXTENTS));
+ if (len > 0) {
+ *rtxlen = len;
+ error = xfs_dec_freecounter(mp, FREE_RTEXTENTS, *rtxlen,
+ flags & XFS_ZR_RESERVED);
+ }
+ spin_unlock(&mp->m_reservation_lock);
+ return error;
+}
+
+int
+xfs_zoned_space_reserve(
+ struct xfs_inode *ip,
+ xfs_filblks_t count_fsb,
+ unsigned int flags,
+ struct xfs_zone_alloc_ctx *ac)
+{
+ struct xfs_mount *mp = ip->i_mount;
+ xfs_rtxlen_t rtxlen;
+ int error;
+
+ ac->cached_rtg = NULL;
+
+ rtxlen = xfs_extlen_to_rtxlen(mp, count_fsb);
+ error = xfs_dec_freecounter(mp, FREE_RTEXTENTS, rtxlen,
+ flags & XFS_ZR_RESERVED);
+ if (error == -ENOSPC && (flags & XFS_ZR_GREEDY) && rtxlen > 1) {
+ error = xfs_zoned_reserve_extents_greedy(ip, &rtxlen, flags);
+ if (error)
+ return error;
+ }
+ error = xfs_zoned_reserve_available(ip, rtxlen, flags);
+ if (error) {
+ xfs_add_freecounter(mp, FREE_RTEXTENTS, rtxlen);
+ return error;
+ }
+ ac->reserved_blocks = xfs_rtxlen_to_extlen(mp, rtxlen);
+ return 0;
+}
+
+void
+xfs_zoned_space_unreserve(
+ struct xfs_inode *ip,
+ struct xfs_zone_alloc_ctx *ac)
+{
+ if (ac->reserved_blocks > 0) {
+ struct xfs_mount *mp = ip->i_mount;
+
+ xfs_zoned_add_available(mp, ac->reserved_blocks);
+ xfs_add_freecounter(mp, FREE_RTEXTENTS,
+ xfs_extlen_to_rtxlen(mp, ac->reserved_blocks));
+ }
+ xfs_zone_finish_alloc(ac->cached_rtg);
+}
+
+/*
+ * Split up rewrites into smaller chunks (1MB).
+ */
+#define XFS_GC_CHUNK_SIZE (1024u * 1024)
+
+#define XFS_ZONE_GC_NR_SCRATCH 2
+struct xfs_zone_scratch {
+ struct folio *folio;
+ unsigned int offset;
+ unsigned int freed;
+};
+
+struct xfs_gc_bio {
+ struct xfs_inode *ip;
+ loff_t offset;
+ unsigned int len;
+ bool is_seq;
+ xfs_fsblock_t old_startblock;
+ xfs_daddr_t new_daddr;
+ union {
+ struct xfs_zone_scratch *scratch;
+ struct xfs_zone_gc_data *data;
+ };
+
+ struct bio_vec bv;
+ struct bio bio; /* must be last */
+};
+
+struct xfs_zone_gc_data {
+ /* global GC state */
+ struct xfs_mount *mp;
+ struct bio_set bio_set;
+ struct xfs_zone_scratch scratch[XFS_ZONE_GC_NR_SCRATCH];
+ unsigned int scratch_idx;
+ struct bio_list read_done;
+ struct bio_list write_done;
+ struct bio_list reset_done;
+ spinlock_t list_lock;
+ unsigned int inflight;
+};
+
+static struct xfs_zone_gc_data *
+xfs_zone_gc_data_alloc(
+ struct xfs_mount *mp)
+{
+ struct xfs_zone_gc_data *data;
+ int i;
+
+ data = kzalloc(sizeof(*data), GFP_KERNEL);
+ if (!data)
+ return NULL;
+
+ /*
+ * We actually only need a single bio_vec. It would be nice to have
+ * a flag that only allocates the inline bvecs and not the separate
+ * bvec pool.
+ */
+ if (bioset_init(&data->bio_set, 16, offsetof(struct xfs_gc_bio, bio),
+ BIOSET_NEED_BVECS))
+ goto out_free_data;
+ for (i = 0; i < XFS_ZONE_GC_NR_SCRATCH; i++) {
+ data->scratch[i].folio =
+ folio_alloc(GFP_KERNEL, get_order(XFS_GC_CHUNK_SIZE));
+ if (!data->scratch[i].folio)
+ goto out_free_scratch;
+ }
+ spin_lock_init(&data->list_lock);
+ data->mp = mp;
+ return data;
+
+out_free_scratch:
+ while (--i >= 0)
+ folio_put(data->scratch[i].folio);
+ bioset_exit(&data->bio_set);
+out_free_data:
+ kfree(data);
+ return NULL;
+}
+
+static void
+xfs_zone_gc_data_free(
+ struct xfs_zone_gc_data *data)
+{
+ int i;
+
+ for (i = 0; i < XFS_ZONE_GC_NR_SCRATCH; i++)
+ folio_put(data->scratch[i].folio);
+ bioset_exit(&data->bio_set);
+ kfree(data);
+}
+
+#define XFS_ZONE_GC_RECS 32
+
+/* iterator, needs to be reinitialized for each victim zone */
+struct xfs_zone_gc_iter {
+ struct xfs_rtgroup *victim_rtg;
+ unsigned int rec_count;
+ unsigned int rec_idx;
+ xfs_agblock_t next_startblock;
+ struct xfs_rmap_irec recs[XFS_ZONE_GC_RECS];
+};
+
+static void
+xfs_zone_gc_iter_init(
+ struct xfs_zone_gc_iter *iter,
+ struct xfs_rtgroup *victim_rtg)
+
+{
+ iter->next_startblock = 0;
+ iter->rec_count = 0;
+ iter->rec_idx = 0;
+ iter->victim_rtg = victim_rtg;
+}
+
+static int
+xfs_zone_gc_query_cb(
+ struct xfs_btree_cur *cur,
+ const struct xfs_rmap_irec *irec,
+ void *private)
+{
+ struct xfs_zone_gc_iter *iter = private;
+
+ ASSERT(!XFS_RMAP_NON_INODE_OWNER(irec->rm_owner));
+ ASSERT(!xfs_is_sb_inum(cur->bc_mp, irec->rm_owner));
+ ASSERT(!(irec->rm_flags & (XFS_RMAP_ATTR_FORK | XFS_RMAP_BMBT_BLOCK)));
+
+ iter->recs[iter->rec_count] = *irec;
+ if (++iter->rec_count == XFS_ZONE_GC_RECS) {
+ iter->next_startblock =
+ irec->rm_startblock + irec->rm_blockcount;
+ return 1;
+ }
+ return 0;
+}
+
+static int
+xfs_zone_gc_rmap_rec_cmp(
+ const void *a,
+ const void *b)
+{
+ const struct xfs_rmap_irec *reca = a;
+ const struct xfs_rmap_irec *recb = b;
+
+ if (reca->rm_owner < recb->rm_owner)
+ return -1;
+ if (reca->rm_owner > recb->rm_owner)
+ return 1;
+
+ if (reca->rm_offset < recb->rm_offset)
+ return -1;
+ if (reca->rm_offset > recb->rm_offset)
+ return 1;
+
+ return 0;
+}
+
+static int
+xfs_zone_gc_query(
+ struct xfs_mount *mp,
+ struct xfs_zone_gc_iter *iter)
+{
+ struct xfs_rtgroup *rtg = iter->victim_rtg;
+ struct xfs_rmap_irec ri_low = { };
+ struct xfs_rmap_irec ri_high;
+ struct xfs_btree_cur *cur;
+ struct xfs_trans *tp;
+ int error;
+
+ ASSERT(iter->next_startblock <= rtg->rtg_extents);
+ if (iter->next_startblock == rtg->rtg_extents)
+ goto done;
+
+ ASSERT(iter->next_startblock < rtg->rtg_extents);
+ ri_low.rm_startblock = iter->next_startblock;
+ memset(&ri_high, 0xFF, sizeof(ri_high));
+
+ iter->rec_idx = 0;
+ iter->rec_count = 0;
+
+ error = xfs_trans_alloc_empty(mp, &tp);
+ if (error)
+ return error;
+
+ xfs_rtgroup_lock(rtg, XFS_RTGLOCK_RMAP);
+ xfs_rtgroup_trans_join(tp, rtg, XFS_RTGLOCK_RMAP);
+ cur = xfs_rtrmapbt_init_cursor(tp, rtg);
+ error = xfs_rmap_query_range(cur, &ri_low, &ri_high,
+ xfs_zone_gc_query_cb, iter);
+ xfs_btree_del_cursor(cur, error < 0 ? error : 0);
+ xfs_trans_cancel(tp);
+
+ if (error < 0)
+ return error;
+
+ /*
+ * Sort the rmap records by inode number and increasing offset to
+ * defragment the mappings.
+ *
+ * This could be further enhanced by an even bigger look ahead window,
+ * but that's better left until we have better detection of changes to an
+ * inode's mappings to avoid the potential of GCing already dead data.
+ */
+ sort(iter->recs, iter->rec_count, sizeof(iter->recs[0]),
+ xfs_zone_gc_rmap_rec_cmp, NULL);
+
+ if (error == 0) {
+ /*
+ * We finished iterating through the zone.
+ */
+ iter->next_startblock = rtg->rtg_extents;
+ if (iter->rec_count == 0)
+ goto done;
+ }
+
+ return 0;
+done:
+ xfs_rtgroup_rele(iter->victim_rtg);
+ iter->victim_rtg = NULL;
+ return 0;
+}
+
+static bool
+xfs_zone_gc_iter_next(
+ struct xfs_mount *mp,
+ struct xfs_zone_gc_iter *iter,
+ struct xfs_rmap_irec *chunk_rec,
+ struct xfs_inode **ipp)
+{
+ struct xfs_rmap_irec *irec;
+ int error;
+
+ if (!iter->victim_rtg)
+ return false;
+
+ if (iter->rec_idx == iter->rec_count) {
+retry:
+ error = xfs_zone_gc_query(mp, iter);
+ if (error)
+ goto fail;
+ if (!iter->victim_rtg)
+ return false;
+ }
+
+ irec = &iter->recs[iter->rec_idx];
+ error = xfs_iget(mp, NULL, irec->rm_owner, XFS_IGET_NORETRY |
+ XFS_IGET_UNTRUSTED | XFS_IGET_DONTCACHE, 0, ipp);
+ if (error) {
+ if (error == -EAGAIN || error == -ENOENT) {
+ iter->next_startblock = irec->rm_startblock;
+ goto retry;
+ }
+ goto fail;
+ }
+
+ if (!S_ISREG(VFS_I(*ipp)->i_mode)) {
+ iter->next_startblock = irec->rm_startblock;
+ xfs_irele(*ipp);
+ goto retry;
+ }
+
+ *chunk_rec = *irec;
+ return true;
+
+fail:
+ xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
+ return false;
+}
+
+static void
+xfs_zone_gc_iter_advance(
+ struct xfs_zone_gc_iter *iter,
+ xfs_extlen_t count_fsb)
+{
+ struct xfs_rmap_irec *irec = &iter->recs[iter->rec_idx];
+
+ irec->rm_offset += count_fsb;
+ irec->rm_startblock += count_fsb;
+ irec->rm_blockcount -= count_fsb;
+ if (!irec->rm_blockcount)
+ iter->rec_idx++;
+}
+
+/*
+ * Iterate through all zones marked as reclaimable and find a candidate that is
+ * either good enough for instant reclaim, or the one with the least used space.
+ */
+static bool
+xfs_zone_reclaim_pick(
+ struct xfs_mount *mp,
+ struct xfs_zone_gc_iter *iter)
+{
+ struct xfs_rtgroup *victim_rtg = NULL, *rtg;
+ u64 victim_used = U64_MAX;
+ unsigned long index = 0;
+ bool easy = false;
+
+ if (xfs_is_shutdown(mp))
+ return false;
+
+ if (iter->victim_rtg)
+ return true;
+
+ /*
+ * Don't start new work if we are asked to stop or park.
+ */
+ if (kthread_should_stop() || kthread_should_park())
+ return false;
+
+ if (!xfs_zoned_need_gc(mp))
+ return false;
+
+ rcu_read_lock();
+ xa_for_each_marked(&mp->m_groups[XG_TYPE_RTG].xa, index, rtg,
+ XFS_RTG_RECLAIMABLE) {
+ u64 used = *xfs_zone_used_counter(rtg);
+
+ if (used >= victim_used)
+ continue;
+ if (!atomic_inc_not_zero(&rtg->rtg_group.xg_active_ref))
+ continue;
+
+ if (victim_rtg)
+ xfs_rtgroup_rele(victim_rtg);
+ victim_rtg = rtg;
+ victim_used = used;
+
+ /*
+ * Any zone that is less than 1 percent used is fair game for
+ * instant reclaim.
+ */
+ if (used < div_u64(rtg->rtg_extents, 100)) {
+ easy = true;
+ break;
+ }
+ }
+ rcu_read_unlock();
+
+ if (!victim_rtg)
+ return false;
+
+ xfs_info(mp, "reclaiming zone %d, used = %lld/%llu (%s)",
+ rtg_rgno(victim_rtg), victim_used,
+ victim_rtg->rtg_extents,
+ easy ? "easy" : "best");
+ trace_xfs_zone_reclaim(victim_rtg);
+ xfs_zone_gc_iter_init(iter, victim_rtg);
+ return true;
+}
+
+static struct xfs_rtgroup *
+xfs_select_gc_zone(
+ struct xfs_mount *mp)
+{
+ struct xfs_rtgroup *rtg = mp->m_open_gc_zone;
+
+ if (rtg && rtg->rtg_write_pointer == rtg->rtg_extents) {
+ /*
+ * We need to wait for pending writes to finish.
+ */
+ if (rtg->rtg_written < rtg->rtg_extents)
+ return NULL;
+ xfs_rtgroup_rele(rtg);
+ rtg = NULL;
+ }
+
+ if (!rtg) {
+ spin_lock(&mp->m_zone_list_lock);
+ rtg = xfs_find_free_zone(mp);
+ spin_unlock(&mp->m_zone_list_lock);
+
+ if (rtg)
+ trace_xfs_gc_zone_activate(rtg);
+ mp->m_open_gc_zone = rtg;
+ }
+
+ return rtg;
+}
+
+static unsigned int
+xfs_zone_gc_scratch_available(
+ struct xfs_zone_gc_data *data)
+{
+ return XFS_GC_CHUNK_SIZE - data->scratch[data->scratch_idx].offset;
+}
+
+static bool
+xfs_zone_gc_space_available(
+ struct xfs_zone_gc_data *data)
+{
+ struct xfs_rtgroup *rtg;
+
+ rtg = xfs_select_gc_zone(data->mp);
+ if (!rtg)
+ return false;
+ return rtg->rtg_write_pointer < rtg->rtg_extents &&
+ xfs_zone_gc_scratch_available(data);
+}
+
+static void
+xfs_zone_gc_end_io(
+ struct bio *bio)
+{
+ struct xfs_zone_gc_data *data = bio->bi_private;
+ unsigned long flags;
+
+ spin_lock_irqsave(&data->list_lock, flags);
+ if (bio_op(bio) == REQ_OP_READ)
+ bio_list_add(&data->read_done, bio);
+ else
+ bio_list_add(&data->write_done, bio);
+ wake_up_process(data->mp->m_zone_gc_thread);
+ spin_unlock_irqrestore(&data->list_lock, flags);
+}
+
+static bool
+xfs_zone_gc_allocate(
+ struct xfs_zone_gc_data *data,
+ xfs_extlen_t *count_fsb,
+ xfs_daddr_t *daddr,
+ bool *is_seq)
+{
+ struct xfs_mount *mp = data->mp;
+ xfs_rtxnum_t rtxres, rtxlen;
+ xfs_rgblock_t rgbno = 0;
+ struct xfs_rtgroup *rtg;
+
+ rtg = xfs_select_gc_zone(mp);
+ if (!rtg)
+ return false;
+
+ *count_fsb = min(*count_fsb,
+ XFS_B_TO_FSB(mp, xfs_zone_gc_scratch_available(data)));
+
+ /*
+ * Directly allocate GC blocks from the reserved pool.
+ *
+ * If we'd take them from the normal pool we could be stealing blocks from a
+ * regular writer, which would then have to wait for GC and deadlock.
+ */
+ spin_lock(&mp->m_sb_lock);
+ rtxres = min(mp->m_resblks[FREE_RTEXTENTS].avail,
+ mp->m_resblks[FREE_RTAVAILABLE].avail);
+ rtxlen = min3(rtxres,
+ rtg->rtg_extents - rtg->rtg_write_pointer,
+ xfs_extlen_to_rtxlen(mp, *count_fsb));
+ mp->m_resblks[FREE_RTEXTENTS].avail -= rtxlen;
+ mp->m_resblks[FREE_RTAVAILABLE].avail -= rtxlen;
+ spin_unlock(&mp->m_sb_lock);
+
+ if (!rtxlen)
+ return false;
+ *count_fsb = xfs_rtxlen_to_extlen(mp, rtxlen);
+ *is_seq = test_bit(RTG_F_SEQUENTIAL, &rtg->rtg_flags);
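+ /* sequential write required zones are written using zone append */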
+ if (!*is_seq)
+ rgbno = rtg->rtg_write_pointer;
+ rtg->rtg_write_pointer += *count_fsb;
+ *daddr = xfs_gbno_to_daddr(&rtg->rtg_group, rgbno);
+ return true;
+}
+
+static bool
+xfs_zone_gc_start_chunk(
+ struct xfs_zone_gc_data *data,
+ struct xfs_zone_gc_iter *iter)
+{
+ struct xfs_mount *mp = data->mp;
+ struct block_device *bdev = mp->m_rtdev_targp->bt_bdev;
+ struct xfs_rmap_irec irec;
+ struct xfs_gc_bio *chunk;
+ struct xfs_inode *ip;
+ struct bio *bio;
+ xfs_daddr_t daddr;
+ bool is_seq;
+
+ if (xfs_is_shutdown(mp))
+ return false;
+
+ if (!xfs_zone_gc_iter_next(mp, iter, &irec, &ip))
+ return false;
+ if (!xfs_zone_gc_allocate(data, &irec.rm_blockcount, &daddr, &is_seq)) {
+ xfs_irele(ip);
+ return false;
+ }
+
+ bio = bio_alloc_bioset(bdev, 1, REQ_OP_READ, GFP_NOFS, &data->bio_set);
+
+ chunk = container_of(bio, struct xfs_gc_bio, bio);
+ chunk->ip = ip;
+ chunk->offset = XFS_FSB_TO_B(mp, irec.rm_offset);
+ chunk->len = XFS_FSB_TO_B(mp, irec.rm_blockcount);
+ chunk->old_startblock =
+ xfs_rgbno_to_rtb(iter->victim_rtg, irec.rm_startblock);
+ chunk->new_daddr = daddr;
+ chunk->is_seq = is_seq;
+ chunk->scratch = &data->scratch[data->scratch_idx];
+
+ bio->bi_iter.bi_sector = xfs_rtb_to_daddr(mp, chunk->old_startblock);
+ bio->bi_end_io = xfs_zone_gc_end_io;
+ bio->bi_private = data;
+ bio_add_folio_nofail(bio, chunk->scratch->folio, chunk->len,
+ chunk->scratch->offset);
+ chunk->scratch->offset += chunk->len;
+ if (chunk->scratch->offset == XFS_GC_CHUNK_SIZE) {
+ data->scratch_idx =
+ (data->scratch_idx + 1) % XFS_ZONE_GC_NR_SCRATCH;
+ }
+ data->inflight++;
+ xfs_zone_gc_iter_advance(iter, irec.rm_blockcount);
+
+ submit_bio(bio);
+ return true;
+}
+
+static void
+xfs_zone_gc_free_chunk(
+ struct xfs_zone_gc_data *data,
+ struct xfs_gc_bio *chunk)
+{
+ data->inflight--;
+ xfs_irele(chunk->ip);
+ bio_put(&chunk->bio);
+}
+
+static void
+xfs_gc_submit_write(
+ struct xfs_zone_gc_data *data,
+ struct xfs_gc_bio *chunk)
+{
+ if (chunk->is_seq) {
+ chunk->bio.bi_opf &= ~REQ_OP_WRITE;
+ chunk->bio.bi_opf |= REQ_OP_ZONE_APPEND;
+ }
+ chunk->bio.bi_iter.bi_sector = chunk->new_daddr;
+ chunk->bio.bi_end_io = xfs_zone_gc_end_io;
+ chunk->bio.bi_private = data;
+ submit_bio(&chunk->bio);
+}
+
+static struct xfs_gc_bio *
+xfs_gc_split_write(
+ struct xfs_zone_gc_data *data,
+ struct xfs_gc_bio *chunk)
+{
+ struct queue_limits *lim =
+ &bdev_get_queue(chunk->bio.bi_bdev)->limits;
+ struct xfs_gc_bio *split_chunk;
+ int split_sectors;
+ unsigned int split_len;
+ struct bio *split;
+ unsigned int nsegs;
+
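+ /* zone append bios must not exceed the queue's zone append limit */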
+ if (!chunk->is_seq)
+ return NULL;
+
+ split_sectors = bio_split_rw_at(&chunk->bio, lim, &nsegs,
+ queue_limits_max_zone_append_sectors(lim) << SECTOR_SHIFT);
+ if (!split_sectors)
+ return NULL;
+ split_len = split_sectors << SECTOR_SHIFT;
+
+ split = bio_split(&chunk->bio, split_sectors, GFP_NOFS, &data->bio_set);
+ split_chunk = container_of(split, struct xfs_gc_bio, bio);
+ ihold(VFS_I(chunk->ip));
+ split_chunk->ip = chunk->ip;
+ split_chunk->is_seq = chunk->is_seq;
+ split_chunk->scratch = chunk->scratch;
+ split_chunk->offset = chunk->offset;
+ split_chunk->len = split_len;
+ split_chunk->old_startblock = chunk->old_startblock;
+ split_chunk->new_daddr = chunk->new_daddr;
+
+ chunk->offset += split_len;
+ chunk->len -= split_len;
+ chunk->old_startblock += XFS_B_TO_FSB(data->mp, split_len);
+
+ data->inflight++;
+ return split_chunk;
+}
+
+static void
+xfs_zone_gc_write_chunk(
+ struct xfs_zone_gc_data *data,
+ struct bio *bio)
+{
+ struct xfs_gc_bio *chunk =
+ container_of(bio, struct xfs_gc_bio, bio);
+ struct xfs_mount *mp = chunk->ip->i_mount;
+ unsigned int folio_offset = bio->bi_io_vec->bv_offset;
+ struct xfs_gc_bio *split_chunk;
+
+ if (bio->bi_status)
+ xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
+ if (xfs_is_shutdown(mp)) {
+ xfs_zone_gc_free_chunk(data, chunk);
+ return;
+ }
+
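+ /* reuse the read bio to write the data out to its new location */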
+ bio_reset(bio, mp->m_rtdev_targp->bt_bdev, REQ_OP_WRITE);
+ bio_add_folio_nofail(bio, chunk->scratch->folio, chunk->len,
+ folio_offset);
+
+ while ((split_chunk = xfs_gc_split_write(data, chunk)))
+ xfs_gc_submit_write(data, split_chunk);
+ xfs_gc_submit_write(data, chunk);
+}
+
+static void
+xfs_zone_gc_finish_chunk(
+ struct xfs_zone_gc_data *data,
+ struct bio *bio)
+{
+ struct xfs_gc_bio *chunk =
+ container_of(bio, struct xfs_gc_bio, bio);
+ uint iolock = XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL;
+ struct xfs_inode *ip = chunk->ip;
+ struct xfs_mount *mp = ip->i_mount;
+ int error;
+
+ if (bio->bi_status)
+ xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
+ if (xfs_is_shutdown(mp)) {
+ xfs_zone_gc_free_chunk(data, chunk);
+ return;
+ }
+
+ chunk->scratch->freed += chunk->len;
+ if (chunk->scratch->freed == chunk->scratch->offset) {
+ chunk->scratch->offset = 0;
+ chunk->scratch->freed = 0;
+ }
+
+ /*
+ * Cycle through the iolock and wait for direct I/O and layouts to
+ * ensure no one is reading from the old mapping before it goes away.
+ */
+ xfs_ilock(ip, iolock);
+ error = xfs_break_layouts(VFS_I(ip), &iolock, BREAK_UNMAP);
+ if (!error)
+ inode_dio_wait(VFS_I(ip));
+ xfs_iunlock(ip, iolock);
+ if (error)
+ goto free;
+
+ if (chunk->is_seq)
+ chunk->new_daddr = bio->bi_iter.bi_sector;
+ error = xfs_zoned_end_io(ip, chunk->offset, chunk->len,
+ chunk->new_daddr, chunk->old_startblock);
+free:
+ if (error)
+ xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
+ xfs_zone_gc_free_chunk(data, chunk);
+}
+
+static void
+xfs_zone_gc_finish_reset(
+ struct xfs_rtgroup *rtg,
+ struct bio *bio)
+{
+ struct xfs_mount *mp = rtg_mount(rtg);
+
+ if (bio->bi_status) {
+ xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
+ goto out;
+ }
+
+ spin_lock(&mp->m_zone_list_lock);
+ list_add_tail(&rtg->rtg_entry, &mp->m_free_zones);
+ atomic_inc(&mp->m_nr_free_zones);
+ spin_unlock(&mp->m_zone_list_lock);
+
+ xfs_zoned_add_available(mp, rtg->rtg_extents);
+
+ wake_up_all(&mp->m_zone_wait);
+out:
+ bio_put(bio);
+}
+
+static void
+xfs_zone_reset_end_io(
+ struct bio *bio)
+{
+ struct xfs_zone_gc_data *data =
+ container_of(bio, struct xfs_gc_bio, bio)->data;
+ struct xfs_rtgroup *rtg = bio->bi_private;
+ unsigned long flags;
+
+ spin_lock_irqsave(&data->list_lock, flags);
+ bio_list_add(&data->reset_done, bio);
+ data->inflight--;
+ wake_up_process(rtg_mount(rtg)->m_zone_gc_thread);
+ spin_unlock_irqrestore(&data->list_lock, flags);
+}
+
+static struct bio *
+xfs_prepare_zone_reset(
+ struct xfs_rtgroup *rtg,
+ struct xfs_zone_gc_data *data)
+{
+ struct xfs_mount *mp = rtg_mount(rtg);
+ struct block_device *bdev = mp->m_rtdev_targp->bt_bdev;
+ struct bio *bio;
+
+ spin_lock(&rtg->rtg_alloc_lock);
+ rtg->rtg_write_pointer = 0;
+ spin_unlock(&rtg->rtg_alloc_lock);
+
+ xfs_ilock(rtg->rtg_inodes[XFS_RTGI_RMAP], XFS_ILOCK_EXCL);
+ ASSERT(*xfs_zone_used_counter(rtg) == 0);
+ rtg->rtg_written = 0;
+ *xfs_zone_last_written(rtg) = 0;
+ xfs_iunlock(rtg->rtg_inodes[XFS_RTGI_RMAP], XFS_ILOCK_EXCL);
+
+ trace_xfs_zone_reset(rtg);
+
+ bio = bio_alloc_bioset(bdev, 0, REQ_OP_ZONE_RESET, GFP_NOFS,
+ data ? &data->bio_set : &fs_bio_set);
+ bio->bi_iter.bi_sector = xfs_gbno_to_daddr(&rtg->rtg_group, 0);
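+ /* conventional devices have no zone reset, use a discard instead */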
+ if (!test_bit(RTG_F_SEQUENTIAL, &rtg->rtg_flags)) {
+ bio->bi_opf = REQ_OP_DISCARD | REQ_SYNC;
+ bio->bi_iter.bi_size = XFS_FSB_TO_B(mp, rtg->rtg_extents);
+ }
+ return bio;
+}
+
+static void
+xfs_reset_empty_zones(
+ struct xfs_zone_gc_data *data,
+ struct list_head *empty_zones)
+{
+ struct xfs_rtgroup *rtg;
+ struct bio *bio;
+
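+ /*
+ * Make sure all previous writes are on stable storage before the zone
+ * contents are destroyed by the reset.
+ */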
+ if (blkdev_issue_flush(data->mp->m_rtdev_targp->bt_bdev) < 0) {
+ xfs_force_shutdown(data->mp, SHUTDOWN_META_IO_ERROR);
+ return;
+ }
+
+ while ((rtg = list_first_entry_or_null(empty_zones,
+ struct xfs_rtgroup, rtg_entry))) {
+ list_del_init(&rtg->rtg_entry);
+
+ xfs_log_force_inode(rtg->rtg_inodes[XFS_RTGI_RMAP]);
+
+ bio = xfs_prepare_zone_reset(rtg, data);
+ bio->bi_private = rtg;
+ bio->bi_end_io = xfs_zone_reset_end_io;
+ data->inflight++;
+ container_of(bio, struct xfs_gc_bio, bio)->data = data;
+ submit_bio(bio);
+ }
+}
+
+static bool
+xfs_zone_gc_handle_work(
+ struct xfs_zone_gc_data *data,
+ struct xfs_zone_gc_iter *iter)
+{
+ struct bio_list read_done = BIO_EMPTY_LIST;
+ struct bio_list write_done = BIO_EMPTY_LIST;
+ struct bio_list reset_done = BIO_EMPTY_LIST;
+ LIST_HEAD (empty_zones);
+ struct blk_plug plug;
+ struct bio *bio;
+
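+ /*
+ * Collect all completed I/O and newly emptied zones, then process
+ * them: finish resets, write out completed reads, finish completed
+ * writes and finally start new GC chunks.
+ */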
+ spin_lock_irq(&data->list_lock);
+ bio_list_merge_init(&read_done, &data->read_done);
+ bio_list_merge_init(&write_done, &data->write_done);
+ bio_list_merge_init(&reset_done, &data->reset_done);
+ spin_unlock_irq(&data->list_lock);
+
+ spin_lock(&data->mp->m_zone_list_lock);
+ list_splice_init(&data->mp->m_emptied_zones, &empty_zones);
+ spin_unlock(&data->mp->m_zone_list_lock);
+
+ if (!xfs_zone_reclaim_pick(data->mp, iter) ||
+ !xfs_zone_gc_space_available(data)) {
+ if (bio_list_empty(&read_done) &&
+ bio_list_empty(&write_done) &&
+ bio_list_empty(&reset_done) &&
+ list_empty(&empty_zones))
+ return false;
+ }
+
+ __set_current_state(TASK_RUNNING);
+ try_to_freeze();
+
+ while ((bio = bio_list_pop(&reset_done)))
+ xfs_zone_gc_finish_reset(bio->bi_private, bio);
+
+ if (!list_empty(&empty_zones))
+ xfs_reset_empty_zones(data, &empty_zones);
+
+ blk_start_plug(&plug);
+ while ((bio = bio_list_pop(&read_done)))
+ xfs_zone_gc_write_chunk(data, bio);
+ blk_finish_plug(&plug);
+
+ while ((bio = bio_list_pop(&write_done)))
+ xfs_zone_gc_finish_chunk(data, bio);
+
+ blk_start_plug(&plug);
+ while (xfs_zone_gc_start_chunk(data, iter))
+ ;
+ blk_finish_plug(&plug);
+ return true;
+}
+
+/*
+ * XXX: This breaks reflinks and thus duplicates data that was shared by
+ * multiple owners before.
+ */
+static int
+xfs_zoned_gcd(
+ void *private)
+{
+ struct xfs_mount *mp = private;
+ unsigned int nofs_flag;
+ struct xfs_zone_gc_data *data;
+ struct xfs_zone_gc_iter *iter;
+
+ data = xfs_zone_gc_data_alloc(mp);
+ if (!data)
+ return -ENOMEM;
+ iter = kzalloc(sizeof(*iter), GFP_KERNEL);
+ if (!iter)
+ goto out_free_data;
+
+ nofs_flag = memalloc_nofs_save();
+ set_freezable();
+
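+ /*
+ * The IN_GC flag tells space reservation waiters that more space may
+ * still show up; it is only cleared once GC has no work in flight.
+ */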
+ for (;;) {
+ set_current_state(TASK_INTERRUPTIBLE | TASK_FREEZABLE);
+ xfs_set_in_gc(mp);
+ if (xfs_zone_gc_handle_work(data, iter))
+ continue;
+
+ if (!data->inflight) {
+ xfs_clear_in_gc(mp);
+ xfs_zoned_wake_all(mp);
+
+ if (kthread_should_stop()) {
+ __set_current_state(TASK_RUNNING);
+ break;
+ }
+
+ if (kthread_should_park()) {
+ __set_current_state(TASK_RUNNING);
+ kthread_parkme();
+ continue;
+ }
+ }
+
+ schedule();
+ }
+ xfs_clear_in_gc(mp);
+
+ if (iter->victim_rtg)
+ xfs_rtgroup_rele(iter->victim_rtg);
+ if (mp->m_open_gc_zone)
+ xfs_rtgroup_rele(mp->m_open_gc_zone);
+
+ memalloc_nofs_restore(nofs_flag);
+ kfree(iter);
+out_free_data:
+ xfs_zone_gc_data_free(data);
+ return 0;
+}
+
+static struct xfs_rtgroup *
+xfs_pick_open_zone_for_gc(
+ struct xfs_mount *mp)
+{
+ struct xfs_rtgroup *rtg, *found = NULL;
+
+ list_for_each_entry(rtg, &mp->m_open_zones, rtg_entry) {
+ if (!found)
+ found = rtg;
+ else if (rtg->rtg_write_pointer < found->rtg_write_pointer)
+ found = rtg;
+ }
+
+ return found;
+}
+
+void
+xfs_zone_gc_start(
+ struct xfs_mount *mp)
+{
+ if (xfs_has_zoned(mp))
+ kthread_unpark(mp->m_zone_gc_thread);
+}
+
+void
+xfs_zone_gc_stop(
+ struct xfs_mount *mp)
+{
+ if (xfs_has_zoned(mp))
+ kthread_park(mp->m_zone_gc_thread);
+}
+
+static int
+xfs_get_zone_info_cb(
+ struct blk_zone *zone,
+ unsigned int idx,
+ void *data)
+{
+ struct xfs_mount *mp = data;
+ xfs_fsblock_t zsbno = xfs_daddr_to_rtb(mp, zone->start);
+ xfs_rgnumber_t rgno;
+ struct xfs_rtgroup *rtg;
+ int error;
+
+ if (xfs_rtb_to_rgbno(mp, zsbno) != 0) {
+ xfs_warn(mp, "mismatched zone start 0x%llx.", zsbno);
+ return -EFSCORRUPTED;
+ }
+
+ rgno = xfs_rtb_to_rgno(mp, zsbno);
+ rtg = xfs_rtgroup_get(mp, rgno);
+ if (!rtg) {
+ xfs_warn(mp, "realtime group not found for zone %u.", rgno);
+ return -EFSCORRUPTED;
+ }
+ error = xfs_zone_validate(zone, rtg);
+ xfs_rtgroup_put(rtg);
+ return error;
+}
+
+static int
+xfs_init_zone(
+ struct xfs_rtgroup *rtg,
+ uint64_t *available,
+ uint64_t *freedblocks)
+{
+ struct xfs_mount *mp = rtg_mount(rtg);
+ uint64_t used = *xfs_zone_used_counter(rtg);
+
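+ /* fully written zones without any used blocks can be reset and reused */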
+ if (rtg->rtg_write_pointer == rtg->rtg_extents && used == 0) {
+ struct bio *bio;
+ int error;
+
+ bio = xfs_prepare_zone_reset(rtg, NULL);
+ error = submit_bio_wait(bio);
+ bio_put(bio);
+ if (error)
+ return error;
+ } else {
+ /*
+ * For sequential write required zones, xfs_get_zone_info_cb
+ * initializes rtg_write_pointer to the hardware write pointer.
+ *
+ * For conventional zones we initialize it to the last recorded write,
+ * as we don't know what actually got written, only what we were able
+ * to record in the I/O completion handler.
+ */
+ if (!test_bit(RTG_F_SEQUENTIAL, &rtg->rtg_flags))
+ rtg->rtg_write_pointer = *xfs_zone_last_written(rtg);
+
+ /*
+ * There can't be any I/O in flight that we need to care about at
+ * mount time, so treat the write pointer as the completed
+ * write counter.
+ */
+ rtg->rtg_written = rtg->rtg_write_pointer;
+ }
+
+ if (rtg->rtg_write_pointer == 0) {
+ /* zone is free */
+ list_add_tail(&rtg->rtg_entry, &mp->m_free_zones);
+ atomic_inc(&mp->m_nr_free_zones);
+ *available += rtg->rtg_extents;
+ } else if (rtg->rtg_write_pointer < rtg->rtg_extents) {
+ /* zone is open */
+ list_add(&rtg->rtg_entry, &mp->m_open_zones);
+ mp->m_nr_open_zones++;
+ set_bit(RTG_F_OPEN, &rtg->rtg_flags);
+ *available += (rtg->rtg_extents - rtg->rtg_write_pointer);
+ *freedblocks += (rtg->rtg_write_pointer - used);
+ } else if (used < rtg->rtg_extents) {
+ /* zone fully written, but has freed blocks */
+ xfs_group_set_mark(&rtg->rtg_group, XFS_RTG_RECLAIMABLE);
+ *freedblocks += (rtg->rtg_extents - used);
+ }
+
+ return 0;
+}
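
The per-zone accounting above feeds the counters that xfs_mount_zones() seeds further down: FREE_RTAVAILABLE gets the summed `available`, FREE_RTEXTENTS gets `available + freedblocks`. The following is only a rough userspace sketch of the same arithmetic, not part of the patch; the structure, field names and numbers are made up for illustration.

    /*
     * Illustrative sketch of the per-zone accounting in xfs_init_zone().
     * All names and values here are placeholders.
     */
    #include <stdint.h>
    #include <stdio.h>

    struct zone_example {
    	uint64_t extents;	/* usable blocks in the zone */
    	uint64_t write_pointer;	/* blocks written so far */
    	uint64_t used;		/* blocks still referenced by files */
    };

    static void account_zone(const struct zone_example *z,
    		uint64_t *available, uint64_t *freedblocks)
    {
    	if (z->write_pointer == 0) {
    		/* free zone: everything is directly writable */
    		*available += z->extents;
    	} else if (z->write_pointer < z->extents) {
    		/* open zone: tail is writable, overwritten blocks are freed */
    		*available += z->extents - z->write_pointer;
    		*freedblocks += z->write_pointer - z->used;
    	} else if (z->used < z->extents) {
    		/* fully written zone: only reclaimable (freed) space is left */
    		*freedblocks += z->extents - z->used;
    	}
    }

    int main(void)
    {
    	/* e.g. a 1000-block zone with 600 blocks written, 450 still live */
    	struct zone_example z = { 1000, 600, 450 };
    	uint64_t available = 0, freedblocks = 0;

    	account_zone(&z, &available, &freedblocks);
    	/* prints "available 400, freed 150" */
    	printf("available %llu, freed %llu\n",
    			(unsigned long long)available,
    			(unsigned long long)freedblocks);
    	return 0;
    }
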
+
+/*
+ * Calculate the max open zone limit based on the number of backing
+ * zones available.
+ */
+static inline uint32_t
+xfs_max_open_zones(
+ struct xfs_mount *mp)
+{
+ unsigned int max_open, max_open_data_zones;
+ /*
+ * We need two zones for every open data zone, one of them held in
+ * reserve as we don't reclaim open zones. One data zone and its
+ * spare are included in XFS_MIN_ZONES.
+ */
+ max_open_data_zones = (mp->m_sb.sb_rgcount - XFS_MIN_ZONES) / 2 + 1;
+ max_open = max_open_data_zones + XFS_OPEN_GC_ZONES;
+
+ /*
+ * Cap the max open limit to 1/4 of the total zone count.
+ */
+ max_open = min(max_open, mp->m_sb.sb_rgcount / 4);
+
+ return max(XFS_MIN_OPEN_ZONES, max_open);
+}
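
To see how the limit above behaves, here is a worked example. The EX_* constants are illustrative placeholders only; the real XFS_MIN_ZONES, XFS_OPEN_GC_ZONES and XFS_MIN_OPEN_ZONES values are defined elsewhere in this patch.

    /* Worked example of the open zone limit calculation. */
    #include <stdio.h>

    #define EX_MIN_ZONES	4	/* placeholder */
    #define EX_OPEN_GC_ZONES	1	/* placeholder */
    #define EX_MIN_OPEN_ZONES	2	/* placeholder */

    #define EX_MIN(a, b)	((a) < (b) ? (a) : (b))
    #define EX_MAX(a, b)	((a) > (b) ? (a) : (b))

    static unsigned int example_max_open_zones(unsigned int rgcount)
    {
    	unsigned int max_open_data_zones, max_open;

    	/* roughly half the zones, minus the reserve built into MIN_ZONES */
    	max_open_data_zones = (rgcount - EX_MIN_ZONES) / 2 + 1;
    	max_open = max_open_data_zones + EX_OPEN_GC_ZONES;

    	/* cap at a quarter of all zones, but never go below the minimum */
    	max_open = EX_MIN(max_open, rgcount / 4);
    	return EX_MAX(EX_MIN_OPEN_ZONES, max_open);
    }

    int main(void)
    {
    	/* 100 zones: (100 - 4) / 2 + 1 + 1 = 50, capped to 100 / 4 = 25 */
    	printf("%u\n", example_max_open_zones(100));	/* prints 25 */
    	return 0;
    }

The final max() keeps very small devices at the minimum open zone count even when the 1/4 cap would push the computed limit lower.
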
+
+int
+xfs_mount_zones(
+ struct xfs_mount *mp)
+{
+ struct xfs_buftarg *bt = mp->m_rtdev_targp;
+ unsigned int bdev_open_zones;
+ int64_t available = 0, freedblocks = 0;
+ struct xfs_rtgroup *rtg = NULL;
+ int error;
+
+ if (!bt) {
+ xfs_notice(mp, "RT device missing.");
+ return -EINVAL;
+ }
+
+ if (!xfs_has_rtgroups(mp) || !xfs_has_rmapbt(mp)) {
+ xfs_notice(mp, "invalid flag combination.");
+ return -EFSCORRUPTED;
+ }
+ if (mp->m_sb.sb_rextsize != 1) {
+ xfs_notice(mp, "zoned file systems do not support rextsize.");
+ return -EFSCORRUPTED;
+ }
+ if (mp->m_sb.sb_rgcount < XFS_MIN_ZONES) {
+ xfs_notice(mp,
+"zoned file systems need to have at least %d zones.", XFS_MIN_ZONES);
+ return -EFSCORRUPTED;
+ }
+
+ /*
+ * Normally we pick the open zone limit that the device reports. If
+ * there isn't one, let the user pick one from the command line.
+ *
+ * If the device doesn't report an open zone limit and there is no
+ * override, allow about half of the zones to be held open. In theory
+ * we could allow more to be open, but at that point we run into GC
+ * deadlocks because we (at least currently) can't reclaim open zones.
+ *
+ * When used on conventional SSDs a lower open limit is advisable as
+ * we'll otherwise overwhelm the FTL just as much as a conventional
+ * block allocator would.
+ *
+ * Note: To debug the open zone management code, force max_open to
+ * 1 here.
+ */
+ bdev_open_zones = bdev_max_open_zones(bt->bt_bdev);
+ if (bdev_open_zones && !mp->m_max_open_zones)
+ mp->m_max_open_zones = bdev_open_zones;
+ if (mp->m_max_open_zones) {
+ if (mp->m_max_open_zones < XFS_MIN_OPEN_ZONES) {
+ xfs_notice(mp, "need at least %d open zones.",
+ XFS_MIN_OPEN_ZONES);
+ return -EIO;
+ }
+ if (bdev_open_zones && bdev_open_zones < mp->m_max_open_zones) {
+ xfs_warn(mp, "device only supports %d open zones.\n",
+ bdev_open_zones);
+ mp->m_max_open_zones = bdev_open_zones;
+ }
+ if (mp->m_max_open_zones > xfs_max_open_zones(mp)) {
+ mp->m_max_open_zones = xfs_max_open_zones(mp);
+ xfs_info(mp,
+"limiting open zones to %u due to total zone count (%u)",
+ mp->m_max_open_zones, mp->m_sb.sb_rgcount);
+ }
+ } else {
+ mp->m_max_open_zones = xfs_max_open_zones(mp);
+ }
+
+ INIT_LIST_HEAD(&mp->m_free_zones);
+ INIT_LIST_HEAD(&mp->m_open_zones);
+ INIT_LIST_HEAD(&mp->m_emptied_zones);
+ INIT_LIST_HEAD(&mp->m_reclaim_reservations);
+ spin_lock_init(&mp->m_zone_list_lock);
+ spin_lock_init(&mp->m_reservation_lock);
+ init_waitqueue_head(&mp->m_zone_wait);
+
+ xfs_info(mp, "%u zones of %u blocks size (%d max open)",
+ mp->m_sb.sb_rgcount, mp->m_groups[XG_TYPE_RTG].blocks,
+ mp->m_max_open_zones);
+
+ /*
+ * Sync our own information with the hardware zone state.
+ */
+ if (bdev_is_zoned(bt->bt_bdev)) {
+ if (!IS_ENABLED(CONFIG_BLK_DEV_ZONED)) {
+ xfs_warn(mp,
+"zoned device support requires CONFIG_BLK_DEV_ZONED");
+ return -EINVAL;
+ }
+ error = blkdev_report_zones(bt->bt_bdev, 0, mp->m_sb.sb_rgcount,
+ xfs_get_zone_info_cb, mp);
+ if (error < 0)
+ return error;
+ }
+
+ mp->m_zone_gc_thread = kthread_create(xfs_zoned_gcd, mp,
+ "xfs-zone-gc/%s",
+ mp->m_super->s_id);
+ if (IS_ERR(mp->m_zone_gc_thread)) {
+ xfs_warn(mp, "unable to create zone gc thread");
+ return PTR_ERR(mp->m_zone_gc_thread);
+ }
+ /* xfs_zone_gc_start will unpark for rw mounts */
+ kthread_park(mp->m_zone_gc_thread);
+
+ while ((rtg = xfs_rtgroup_next(mp, rtg))) {
+ error = xfs_init_zone(rtg, &available, &freedblocks);
+ if (error)
+ goto out_unlink_zones;
+ }
+
+ /*
+ * XXX: convert to rtxlen. Or just give up on the conversion because
+ * we have a 1:1 mapping.
+ */
+ percpu_counter_set(&mp->m_free[FREE_RTAVAILABLE], available);
+ percpu_counter_set(&mp->m_free[FREE_RTEXTENTS],
+ available + freedblocks);
+
+ /*
+ * If there are no free zones available for GC, pick the open zone with
+ * the lowest write pointer (i.e. the most remaining space) to GC into.
+ */
+ if (list_empty(&mp->m_free_zones)) {
+ rtg = xfs_pick_open_zone_for_gc(mp);
+ if (!rtg) {
+ error = -EINVAL;
+ goto out_unlink_zones;
+ }
+ list_del_init(&rtg->rtg_entry);
+ mp->m_nr_open_zones--;
+ clear_bit(RTG_F_OPEN, &rtg->rtg_flags);
+ mp->m_open_gc_zone = rtg;
+ }
+ return 0;
+
+out_unlink_zones:
+ rtg = NULL;
+ while ((rtg = xfs_rtgroup_next(mp, rtg)))
+ list_del_init(&rtg->rtg_entry);
+ return error;
+}
+
+void
+xfs_unmount_zones(
+ struct xfs_mount *mp)
+{
+ struct xfs_rtgroup *rtg = NULL;
+
+ kthread_stop(mp->m_zone_gc_thread);
+ while ((rtg = xfs_rtgroup_next(mp, rtg)))
+ list_del_init(&rtg->rtg_entry);
+}