There is no need to assign a zone until we submit the bio using it.
Assigning a zone earlier and incrementing the write pointer just means
we use up active zone resources for longer than required.  So instead
of allocating blocks in the iomap_begin and map_blocks methods, just
provide stub iomaps there and allocate the blocks right before
submitting the bios, which also fits nicely with the flow that splits
the bios to the hardware limits.
Based on a really old recommendation from Damien.
Signed-off-by: Christoph Hellwig <hch@lst.de>
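
For reference, the submission-time flow introduced by this patch boils
down to the simplified sketch below.  The helper names match the hunks
that follow; error handling, locking and the retry loop that picks a new
zone when the current one runs out of space are elided, so treat this as
an outline rather than the actual implementation:

/* Simplified outline of xfs_zone_alloc_and_submit() as added below. */
static void zoned_alloc_and_submit_sketch(struct iomap_ioend *ioend,
		struct xfs_rtgroup **rtg)
{
	struct iomap_ioend	*split;
	unsigned int		alloc_len;
	bool			is_seq;

	/* Reuse the last used zone if possible, otherwise pick a new one. */
	if (!*rtg)
		*rtg = xfs_last_used_zone(ioend);
	if (!*rtg)
		*rtg = xfs_select_zone(ioend);

	/* Only now advance the write pointer, right before submission. */
	alloc_len = xfs_zone_alloc_blocks(ioend, *rtg, &is_seq);

	/*
	 * Split the ioend to the zone append limit (or to the allocated
	 * length for conventional zones) and submit each piece separately.
	 */
	while ((split = iomap_split_ioend(ioend, is_seq, &alloc_len)))
		xfs_submit_zoned_bio(split, is_seq);
	xfs_submit_zoned_bio(ioend, is_seq);
}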
}
EXPORT_SYMBOL_GPL(iomap_init_ioend);
-struct iomap_ioend *iomap_split_zone_append_ioend(struct iomap_ioend *ioend)
+struct iomap_ioend *iomap_split_ioend(struct iomap_ioend *ioend, bool is_append,
+ unsigned int *alloc_len)
{
struct bio *bio = &ioend->io_bio;
- struct queue_limits *lim = bdev_limits(bio->bi_bdev);
struct iomap_ioend *split_ioend;
struct bio *split;
int sector_offset;
unsigned int nr_segs;
- sector_offset = bio_split_rw_at(bio, lim, &nr_segs,
- queue_limits_max_zone_append_sectors(lim) << SECTOR_SHIFT);
- if (!sector_offset)
- return NULL;
+ if (is_append) {
+ struct queue_limits *lim = bdev_limits(bio->bi_bdev);
+
+ sector_offset = bio_split_rw_at(bio, lim, &nr_segs,
+ min(queue_limits_max_zone_append_sectors(lim) <<
+ SECTOR_SHIFT,
+ *alloc_len));
+ if (!sector_offset)
+ return NULL;
+ } else {
+ if (bio->bi_iter.bi_size <= *alloc_len)
+ return NULL;
+ sector_offset = *alloc_len >> SECTOR_SHIFT;
+ }
/* ensure the split ioend is still block size aligned */
	sector_offset = ALIGN_DOWN(sector_offset << SECTOR_SHIFT,
			i_blocksize(ioend->io_inode)) >> SECTOR_SHIFT;
split = bio_split(bio, sector_offset, GFP_NOFS, &iomap_ioend_bioset);
if (!split)
return NULL;
-
split->bi_private = bio->bi_private;
+ split->bi_end_io = bio->bi_end_io;
+
split_ioend = iomap_init_ioend(ioend->io_inode, split,
ioend->io_offset, ioend->io_type, ioend->io_flags,
ioend->io_isdirect);
ioend->io_offset += split_ioend->io_size;
ioend->io_size -= split_ioend->io_size;
- /* keep pointing to the zone start */
- ioend->io_sector = split_ioend->io_sector;
- bio->bi_iter.bi_sector = split->bi_iter.bi_sector;
+ split_ioend->io_sector = ioend->io_sector;
+ if (!is_append)
+ ioend->io_sector += (split_ioend->io_size >> SECTOR_SHIFT);
+
+ *alloc_len -= split->bi_iter.bi_size;
return split_ioend;
}
-EXPORT_SYMBOL_GPL(iomap_split_zone_append_ioend);
+EXPORT_SYMBOL_GPL(iomap_split_ioend);
struct xfs_mount *mp = ip->i_mount;
xfs_fileoff_t offset_fsb = XFS_B_TO_FSBT(mp, offset);
xfs_fileoff_t end_fsb = XFS_B_TO_FSB(mp, offset + len);
- unsigned alloc_flags = 0;
+ xfs_filblks_t count_fsb;
struct xfs_bmbt_irec imap, del;
struct xfs_iext_cursor icur;
- int error = 0;
if (xfs_is_shutdown(mp))
return -EIO;
XFS_ERRORTAG_DELAY(mp, XFS_ERRTAG_WB_DELAY_MS);
/*
- * Every time we write back a page, we need to increment the sequence
- * counter to ensure that xfs_iomap_valid breaks out of the current
- * iomap_iter() iteration because we might be using the delalloc
- * reservation it was about to use in case it was doing an in-memory
- * overwrite of an already dirty page.
+ * All dirty data must be covered by delalloc extents. But truncate can
+ * remove delalloc extents underneath us or reduce their size.
+ * Returning a hole tells iomap to not write back any data from this
+ * range, which is the right thing to do in that case.
+ *
+ * Otherwise just tell iomap to treat ranges previously covered by a
+ * delalloc extent as mapped. The actual block allocation will be done
+ * just before submitting the bio.
*/
xfs_ilock(ip, XFS_ILOCK_EXCL);
if (!xfs_iext_lookup_extent(ip, ip->i_cowfp, offset_fsb, &icur, &imap))
imap.br_startoff = end_fsb; /* fake a hole past EOF */
-
- /*
- * All dirty data must be covered by delalloc extents. But truncate can
- * remove delalloc extents underneath us. Returning a hole tells iomap
- * to not write back any data from this range, which is the right thing
- * to do in that case.
- */
if (imap.br_startoff > offset_fsb) {
imap.br_blockcount = imap.br_startoff - offset_fsb;
imap.br_startoff = offset_fsb;
xfs_bmbt_to_iomap(ip, &wpc->iomap, &imap, 0, 0, 0);
return 0;
}
-
- /*
- * Similarly a truncate operation might not have entirely removed the
- * delalloc extent under us but reduced the range that it covers.
- * Adjust the writeback range for that.
- */
- if (end_fsb > imap.br_startoff + imap.br_blockcount) {
- end_fsb = imap.br_startoff + imap.br_blockcount;
- len = XFS_FSB_TO_B(mp, end_fsb - offset_fsb);
- }
+ end_fsb = min(end_fsb, imap.br_startoff + imap.br_blockcount);
+ count_fsb = end_fsb - offset_fsb;
del = imap;
- xfs_trim_extent(&del, offset_fsb, end_fsb - offset_fsb);
+ xfs_trim_extent(&del, offset_fsb, count_fsb);
xfs_bmap_del_extent_delay(ip, XFS_COW_FORK, &icur, &imap, &del,
XFS_BMAPI_REMAP);
xfs_iunlock(ip, XFS_ILOCK_EXCL);
- /*
- * A new allocation we might have to block waiting for a currently
- * active zone to finish, which could require blocks already held in the
- * current ioend to be written. Kick off the current ioend to finish
- * up the active zone resource if we fail to allocate blocks without
- * waiting for a zone to finish.
- */
- if (wpc->ioend)
- alloc_flags |= XFS_ZONE_ALLOC_NOWAIT;
- error = xfs_zone_alloc_blocks(ip, offset, len, alloc_flags,
- &XFS_ZWPC(wpc)->rtg, &wpc->iomap);
- if (error == -EAGAIN) {
- alloc_flags &= ~XFS_ZONE_ALLOC_NOWAIT;
- error = iomap_submit_ioend(wpc, 0);
- if (error)
- return error;
- error = xfs_zone_alloc_blocks(ip, offset, len, alloc_flags,
- &XFS_ZWPC(wpc)->rtg, &wpc->iomap);
- }
-// trace_xfs_map_blocks_alloc(ip, offset, wpc->iomap.length, XFS_COW_FORK,
-// &imap);
- return error;
-}
-
-void
-xfs_submit_zoned_bio(
- struct iomap_ioend *ioend)
-{
- struct bio *bio = &ioend->io_bio;
- struct xfs_mount *mp = XFS_I(ioend->io_inode)->i_mount;
- sector_t sector = bio->bi_iter.bi_sector;
-
- if (bdev_zone_is_seq(bio->bi_bdev, sector)) {
- bio->bi_opf &= ~REQ_OP_WRITE;
- bio->bi_opf |= REQ_OP_ZONE_APPEND;
- } else {
- if (xfs_rtb_to_rgbno(mp, xfs_daddr_to_rtb(mp, sector)) == 0)
- ioend->io_flags |= IOMAP_F_BOUNDARY;
- }
+ wpc->iomap.type = IOMAP_MAPPED;
+ wpc->iomap.flags = IOMAP_F_DIRTY;
+ wpc->iomap.bdev = mp->m_rtdev_targp->bt_bdev;
+ wpc->iomap.offset = offset;
+ wpc->iomap.length = XFS_FSB_TO_B(mp, count_fsb);
+	wpc->iomap.flags |= IOMAP_F_ZONE_APPEND;
+ wpc->iomap.addr = 0;
- bio->bi_end_io = xfs_end_bio;
- submit_bio(bio);
+ trace_xfs_zoned_map_blocks(ip, offset, wpc->iomap.length);
+ return 0;
}
static int
struct iomap_writepage_ctx *wpc,
int status)
{
- struct iomap_ioend *ioend = wpc->ioend;
- struct iomap_ioend *split_ioend;
-
- ioend->io_bio.bi_end_io = xfs_end_bio;
-
+ wpc->ioend->io_bio.bi_end_io = xfs_end_bio;
if (status)
return status;
-
- while ((split_ioend = iomap_split_zone_append_ioend(ioend)))
- xfs_submit_zoned_bio(split_ioend);
- xfs_submit_zoned_bio(ioend);
+ xfs_zone_alloc_and_submit(wpc->ioend, &XFS_ZWPC(wpc)->rtg);
return 0;
}
int xfs_setfilesize(struct xfs_inode *ip, xfs_off_t offset, size_t size);
void xfs_end_bio(struct bio *bio);
-void xfs_submit_zoned_bio(struct iomap_ioend *ioend);
#endif /* __XFS_AOPS_H__ */
struct bio *bio,
loff_t file_offset)
{
- struct iomap_ioend *ioend, *split_ioend;
+ struct xfs_mount *mp = XFS_I(iter->inode)->i_mount;
+ struct xfs_zone_alloc_ctx *ac = iter->private;
+ xfs_filblks_t count_fsb;
+ struct iomap_ioend *ioend;
+
+ count_fsb = XFS_B_TO_FSB(mp, bio->bi_iter.bi_size);
+ if (count_fsb > ac->reserved_blocks) {
+ xfs_err(mp,
+"allocation (%lld) larger than reservation (%lld).",
+ count_fsb, ac->reserved_blocks);
+ xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
+ bio_io_error(bio);
+ return;
+ }
+ ac->reserved_blocks -= count_fsb;
+ bio->bi_end_io = xfs_end_bio;
ioend = iomap_init_ioend(iter->inode, bio, file_offset,
IOMAP_MAPPED, 0, true);
- while ((split_ioend = iomap_split_zone_append_ioend(ioend)))
- xfs_submit_zoned_bio(split_ioend);
- xfs_submit_zoned_bio(ioend);
+ xfs_zone_alloc_and_submit(ioend, &ac->cached_rtg);
}
static const struct iomap_dio_ops xfs_dio_zoned_write_ops = {
#ifdef CONFIG_XFS_RT
/*
- * For direct writes to zoned devices, just allocate space from an active zone
- * to cover the entire mapping upfront as we always write out of place. The
- * I/O completion handler than inserts it into the extent map, freeing the
- * existing blocks if there were any. There is no transaction at allocation
- * time, as the first time the block allocation is made permanent is at I/O
- * completion time.
+ * This is really simple. The space has already been reserved before taking the
+ * IOLOCK, the actual block allocation is done just before submitting the bio
+ * and only recorded in the extent map on I/O completion.
*/
static int
xfs_zoned_direct_write_iomap_begin(
struct iomap *srcmap)
{
struct xfs_inode *ip = XFS_I(inode);
- struct xfs_mount *mp = ip->i_mount;
- struct iomap_iter *iter =
- container_of(iomap, struct iomap_iter, iomap);
- struct xfs_zone_alloc_ctx *ac = iter->private;
int error;
ASSERT(!(flags & IOMAP_OVERWRITE_ONLY));
- if (XFS_B_TO_FSB(mp, length) > ac->reserved_blocks) {
- xfs_err(mp,
-"allocation (%lld) larger than reservation (%lld).",
- XFS_B_TO_FSB(mp, length), ac->reserved_blocks);
- dump_stack();
- xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
- return -EIO;
- }
+ /*
+ * Needs to be pushed down into the allocator so that only writes into
+ * a single zone can be supported.
+ */
+ if (flags & IOMAP_NOWAIT)
+ return -EAGAIN;
/*
	 * Ensure the extent list is in memory so that we don't have to read
	 * it from the I/O completion handler.
*/
if (xfs_need_iread_extents(&ip->i_df)) {
- if (flags & IOMAP_NOWAIT)
- return -EAGAIN;
xfs_ilock(ip, XFS_ILOCK_EXCL);
error = xfs_iread_extents(NULL, ip, XFS_DATA_FORK);
xfs_iunlock(ip, XFS_ILOCK_EXCL);
return error;
}
- error = xfs_zone_alloc_blocks(ip, offset, length,
- (flags & IOMAP_NOWAIT) ? XFS_ZONE_ALLOC_NOWAIT : 0,
- &ac->cached_rtg, iomap);
- if (!error) {
- ASSERT(iomap->length <= length);
- ac->reserved_blocks -= XFS_B_TO_FSB(mp, iomap->length);
- }
- return error;
-}
-
-static int
-xfs_zoned_direct_write_iomap_end(
- struct inode *inode,
- loff_t pos,
- loff_t length,
- ssize_t written,
- unsigned flags,
- struct iomap *iomap)
-{
- struct iomap_iter *iter =
- container_of(iomap, struct iomap_iter, iomap);
- struct xfs_zone_alloc_ctx *ac = iter->private;
- struct xfs_mount *mp = XFS_I(inode)->i_mount;
- loff_t alloc_end, written_end;
-
- alloc_end = round_up(pos + length, i_blocksize(inode));
- written_end = iomap_last_written_block(inode, pos, written);
- if (unlikely(written_end < alloc_end)) {
- xfs_filblks_t count_fsb =
- XFS_B_TO_FSB(mp, alloc_end - written_end);
-
- xfs_zone_rewind_blocks(ac->cached_rtg, count_fsb);
- ac->reserved_blocks += count_fsb;
- }
+ iomap->type = IOMAP_MAPPED;
+ iomap->flags = IOMAP_F_DIRTY;
+ iomap->bdev = ip->i_mount->m_rtdev_targp->bt_bdev;
+ iomap->offset = offset;
+ iomap->length = length;
+	iomap->flags |= IOMAP_F_ZONE_APPEND;
+ iomap->addr = 0;
return 0;
}
const struct iomap_ops xfs_zoned_direct_write_iomap_ops = {
.iomap_begin = xfs_zoned_direct_write_iomap_begin,
- .iomap_end = xfs_zoned_direct_write_iomap_end,
};
#endif /* CONFIG_XFS_RT */
DEFINE_ZONE_ALLOC_EVENT(xfs_zone_record_blocks);
DEFINE_ZONE_ALLOC_EVENT(xfs_zone_free_blocks);
DEFINE_ZONE_ALLOC_EVENT(xfs_zone_alloc_blocks);
-DEFINE_ZONE_ALLOC_EVENT(xfs_zone_rewind_blocks);
#endif /* CONFIG_XFS_RT */
TRACE_EVENT(xfs_inodegc_worker,
DEFINE_SIMPLE_IO_EVENT(xfs_end_io_direct_write_unwritten);
DEFINE_SIMPLE_IO_EVENT(xfs_end_io_direct_write_append);
DEFINE_SIMPLE_IO_EVENT(xfs_file_splice_read);
+DEFINE_SIMPLE_IO_EVENT(xfs_zoned_map_blocks);
DECLARE_EVENT_CLASS(xfs_itrunc_class,
TP_PROTO(struct xfs_inode *ip, xfs_fsize_t new_size),
*/
static struct xfs_rtgroup *
xfs_last_used_zone(
- struct xfs_inode *ip,
- xfs_fileoff_t offset_fsb)
+ struct iomap_ioend *ioend)
{
+ struct xfs_inode *ip = XFS_I(ioend->io_inode);
struct xfs_mount *mp = ip->i_mount;
+ xfs_fileoff_t offset_fsb = XFS_B_TO_FSB(mp, ioend->io_offset);
struct xfs_rtgroup *rtg = NULL;
struct xfs_iext_cursor icur;
struct xfs_bmbt_irec got;
static struct xfs_rtgroup *
xfs_select_zone(
- struct xfs_inode *ip,
- xfs_filblks_t count_fsb,
- unsigned alloc_flags)
+ struct iomap_ioend *ioend)
{
+ struct xfs_inode *ip = XFS_I(ioend->io_inode);
struct xfs_mount *mp = ip->i_mount;
+ xfs_filblks_t count_fsb = XFS_B_TO_FSB(mp, ioend->io_size);
struct xfs_rtgroup *rtg = NULL;
DEFINE_WAIT (wait);
goto out_unlock;
rtg = xfs_select_zone_nowait(ip, count_fsb);
- if (rtg || (alloc_flags & XFS_ZONE_ALLOC_NOWAIT))
+ if (rtg)
goto out_unlock;
for (;;) {
return rtg;
}
-static bool
-xfs_zone_alloc_blocks_rtg(
- struct xfs_inode *ip,
- loff_t offset,
- xfs_filblks_t count_fsb,
- unsigned alloc_flags,
+static unsigned int
+xfs_zone_alloc_blocks(
+ struct iomap_ioend *ioend,
struct xfs_rtgroup *rtg,
- struct iomap *iomap)
-
+ bool *is_seq)
{
- struct xfs_mount *mp = ip->i_mount;
+ struct xfs_mount *mp = rtg_mount(rtg);
+ xfs_filblks_t count_fsb = XFS_B_TO_FSB(mp, ioend->io_size);
xfs_rgblock_t rgbno;
spin_lock(&rtg->rtg_alloc_lock);
- count_fsb = min(count_fsb,
+ count_fsb = min3(count_fsb, XFS_MAX_BMBT_EXTLEN,
(xfs_filblks_t)rtg->rtg_extents - rtg->rtg_write_pointer);
if (!count_fsb || !test_bit(RTG_F_OPEN, &rtg->rtg_flags)) {
spin_unlock(&rtg->rtg_alloc_lock);
- return false;
+ return 0;
}
rgbno = rtg->rtg_write_pointer;
rtg->rtg_write_pointer += count_fsb;
trace_xfs_zone_alloc_blocks(rtg, rgbno, count_fsb);
- iomap->type = IOMAP_MAPPED;
- iomap->flags = IOMAP_F_DIRTY;
- iomap->bdev = mp->m_rtdev_targp->bt_bdev;
- iomap->offset = offset;
- iomap->length = XFS_FSB_TO_B(mp, count_fsb);
- if (test_bit(RTG_F_SEQUENTIAL, &rtg->rtg_flags)) {
- iomap->flags |= IOMAP_F_ZONE_APPEND;
+ *is_seq = test_bit(RTG_F_SEQUENTIAL, &rtg->rtg_flags);
+ if (*is_seq)
rgbno = 0;
+ ioend->io_sector = xfs_rtb_to_daddr(mp, xfs_rgbno_to_rtb(rtg, rgbno));
+ return XFS_FSB_TO_B(mp, count_fsb);
+}
+
+static inline void
+xfs_mark_rtg_boundary(
+ struct iomap_ioend *ioend)
+{
+ struct xfs_mount *mp = XFS_I(ioend->io_inode)->i_mount;
+ sector_t sector = ioend->io_bio.bi_iter.bi_sector;
+
+ if (xfs_rtb_to_rgbno(mp, xfs_daddr_to_rtb(mp, sector)) == 0)
+ ioend->io_flags |= IOMAP_F_BOUNDARY;
+}
+
+static void
+xfs_submit_zoned_bio(
+ struct iomap_ioend *ioend,
+ bool is_seq)
+{
+ if (is_seq) {
+ ioend->io_bio.bi_opf &= ~REQ_OP_WRITE;
+ ioend->io_bio.bi_opf |= REQ_OP_ZONE_APPEND;
} else {
- if (rgbno == 0)
- iomap->flags |= IOMAP_F_BOUNDARY;
+ xfs_mark_rtg_boundary(ioend);
}
- iomap->addr = BBTOB(xfs_rtb_to_daddr(mp, xfs_rgbno_to_rtb(rtg, rgbno)));
- iomap->validity_cookie = xfs_iomap_inode_sequence(ip, IOMAP_F_SHARED);
-
-#if 0
- /* XXX: once we need quota accounting it would go here */
- xfs_trans_mod_dquot_byino(tp, ip, xfs_bmap_quota_field(ap),
- count_fsb);
-#endif
- return true;
+
+ ioend->io_bio.bi_iter.bi_sector = ioend->io_sector;
+ submit_bio(&ioend->io_bio);
}
-int
-xfs_zone_alloc_blocks(
- struct xfs_inode *ip,
- loff_t offset,
- loff_t count,
- unsigned alloc_flags,
- struct xfs_rtgroup **rtg,
- struct iomap *iomap)
+void
+xfs_zone_alloc_and_submit(
+ struct iomap_ioend *ioend,
+ struct xfs_rtgroup **rtg)
{
- struct xfs_mount *mp = ip->i_mount;
- xfs_fileoff_t offset_fsb = XFS_B_TO_FSBT(mp, offset);
- xfs_fileoff_t end_fsb = xfs_iomap_end_fsb(mp, offset, count);
- xfs_filblks_t count_fsb = min(end_fsb - offset_fsb,
- (xfs_filblks_t)XFS_MAX_BMBT_EXTLEN);
-
- ASSERT(count_fsb > 0);
+ unsigned int alloc_len;
+ struct iomap_ioend *split;
+ bool is_seq;
- if (xfs_is_shutdown(mp))
- return -EIO;
+ if (xfs_is_shutdown(XFS_I(ioend->io_inode)->i_mount))
+ goto out_error;
/*
* If we don't have a cached zone in this write context, see if the
* If so, just continue writing to it.
*/
if (!*rtg)
- *rtg = xfs_last_used_zone(ip, offset_fsb);
-retry:
- if (!*rtg)
- *rtg = xfs_select_zone(ip, count_fsb, alloc_flags);
+ *rtg = xfs_last_used_zone(ioend);
- if (*rtg && !xfs_zone_alloc_blocks_rtg(ip, offset, count_fsb,
- alloc_flags, *rtg, iomap)) {
- xfs_zone_finish_alloc(*rtg);
- *rtg = NULL;
- goto retry;
+ if (!*rtg) {
+select_zone:
+ *rtg = xfs_select_zone(ioend);
+ if (!*rtg)
+ goto out_error;
}
- if (!*rtg) {
- if (xfs_is_shutdown(mp))
- return -EIO;
- return -EAGAIN;
+ alloc_len = xfs_zone_alloc_blocks(ioend, *rtg, &is_seq);
+ if (!alloc_len) {
+ xfs_zone_finish_alloc(*rtg);
+ goto select_zone;
}
- return 0;
-}
+ while ((split = iomap_split_ioend(ioend, is_seq, &alloc_len))) {
+ xfs_submit_zoned_bio(split, is_seq);
+ if (!alloc_len) {
+ xfs_zone_finish_alloc(*rtg);
+ goto select_zone;
+ }
+ }
-void
-xfs_zone_rewind_blocks(
- struct xfs_rtgroup *rtg,
- xfs_extlen_t len)
-{
- spin_lock(&rtg->rtg_alloc_lock);
- ASSERT(rtg->rtg_write_pointer > 0);
- ASSERT(len <= rtg->rtg_write_pointer);
- trace_xfs_zone_rewind_blocks(rtg, 0, len);
- rtg->rtg_write_pointer -= len;
- spin_unlock(&rtg->rtg_alloc_lock);
+ xfs_submit_zoned_bio(ioend, is_seq);
+ return;
- wake_up_all(&rtg_mount(rtg)->m_zone_wait);
+out_error:
+ bio_io_error(&ioend->io_bio);
}
void
#ifndef _XFS_ZONE_ALLOC_H
#define _XFS_ZONE_ALLOC_H
-#define XFS_ZONE_ALLOC_NOWAIT (1U << 0)
-
-int xfs_zone_alloc_blocks(struct xfs_inode *ip, loff_t offset, loff_t count,
- unsigned alloc_flags, struct xfs_rtgroup **rtg,
- struct iomap *iomap);
-void xfs_zone_rewind_blocks(struct xfs_rtgroup *rtg, xfs_extlen_t len);
+void xfs_zone_alloc_and_submit(struct iomap_ioend *ioend,
+ struct xfs_rtgroup **rtg);
void xfs_zone_finish_alloc(struct xfs_rtgroup *rtg);
int xfs_zone_record_blocks(struct xfs_trans *tp, xfs_fsblock_t fsbno,
xfs_filblks_t len, bool used);
loff_t file_offset, u8 type, u16 flags, bool isdirect);
void iomap_dio_bio_end_io(struct bio *bio);
-struct iomap_ioend *iomap_split_zone_append_ioend(struct iomap_ioend *ioend);
+struct iomap_ioend *iomap_split_ioend(struct iomap_ioend *ioend, bool is_append,
+ unsigned int *alloc_len);
#ifdef CONFIG_SWAP
struct file;