From: Christoph Hellwig Date: Fri, 1 Nov 2024 04:51:08 +0000 (+0100) Subject: xfs: support write stream separation X-Git-Url: https://www.infradead.org/git/?a=commitdiff_plain;h=refs%2Fheads%2Fxfs-zoned-streams;p=users%2Fhch%2Fxfs.git xfs: support write stream separation Allow picking a write stream ID per "active zone" equivalent on conventional devices. The only complicated part is stealing yet another time stamp on the rmap inode to store the write stream ID so we can restart after a remount without de-synchronizing the software write pointer and the hardware equivalent. Due to the lack of a block layer API to query or resync our write pointer this still can happen on power fail or a kernel crash unfortunately. Signed-off-by: Christoph Hellwig --- diff --git a/fs/xfs/libxfs/xfs_rtgroup.h b/fs/xfs/libxfs/xfs_rtgroup.h index 89bfcefb484b..46ab6cc43dc3 100644 --- a/fs/xfs/libxfs/xfs_rtgroup.h +++ b/fs/xfs/libxfs/xfs_rtgroup.h @@ -53,6 +53,7 @@ struct xfs_rtgroup { xfs_rgblock_t rtg_write_pointer; xfs_rgblock_t rtg_written; enum rw_hint rtg_write_hint; + int rtg_write_stream; /* zone state entry */ struct list_head rtg_entry; diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h index 4565f4c9ea1f..d9c4e01db92a 100644 --- a/fs/xfs/xfs_mount.h +++ b/fs/xfs/xfs_mount.h @@ -281,6 +281,7 @@ typedef struct xfs_mount { spinlock_t m_reservation_lock; struct list_head m_reclaim_reservations; struct task_struct *m_zone_gc_thread; + unsigned long *m_write_streams; struct dentry *m_debugfs; /* debugfs parent */ struct xfs_kobj m_kobj; struct xfs_kobj m_error_kobj; diff --git a/fs/xfs/xfs_zone_alloc.c b/fs/xfs/xfs_zone_alloc.c index 29dea65fcd57..d92814f3e1ad 100644 --- a/fs/xfs/xfs_zone_alloc.c +++ b/fs/xfs/xfs_zone_alloc.c @@ -99,6 +99,8 @@ xfs_zone_mark_full( spin_lock(&mp->m_zone_list_lock); clear_bit(RTG_F_OPEN, &rtg->rtg_flags); + if (mp->m_write_streams) + clear_bit(rtg->rtg_write_stream, mp->m_write_streams); if (!list_empty(&rtg->rtg_entry)) { /* empty list means this 
is the open GC zone */ mp->m_nr_open_zones--; @@ -155,6 +157,15 @@ xfs_zone_record_blocks( if (rtg->rtg_written == rtg->rtg_extents) xfs_zone_mark_full(rtg); + /* + * (ab)use the ctime field to log the write stream. This allows us to + * pick up where we left off after an unmount or power fail event. + * + * As the inode core gets logged anyway there is no cost doing this + * every time an allocation is recorded. + */ + VFS_I(rtg->rtg_inodes[XFS_RTGI_RMAP])->i_ctime_sec = + rtg->rtg_write_stream; xfs_trans_log_inode(tp, rtg->rtg_inodes[XFS_RTGI_RMAP], XFS_ILOG_CORE); xfs_rtgroup_put(rtg); @@ -246,14 +257,26 @@ xfs_find_free_zone( list_for_each_entry(rtg, &mp->m_free_zones, rtg_entry) { ASSERT(rtg->rtg_write_pointer == 0); - if (atomic_inc_not_zero(&rtg->rtg_group.xg_active_ref)) { - list_del_init(&rtg->rtg_entry); - atomic_dec(&mp->m_nr_free_zones); - return rtg; - } + if (atomic_inc_not_zero(&rtg->rtg_group.xg_active_ref)) + goto found; } return NULL; +found: + if (mp->m_write_streams) { + /* + * find_first_zero_bit() never returns a negative value; it + * returns the bitmap size when no zero bit exists, so check + * against m_max_open_zones and drop the active reference + * taken above before bailing out. + */ + rtg->rtg_write_stream = find_first_zero_bit(mp->m_write_streams, + mp->m_max_open_zones); + if (rtg->rtg_write_stream >= mp->m_max_open_zones) { + xfs_warn(mp, "no available write streams"); + xfs_rtgroup_rele(rtg); + return NULL; + } + set_bit(rtg->rtg_write_stream, mp->m_write_streams); + } + + list_del_init(&rtg->rtg_entry); + atomic_dec(&mp->m_nr_free_zones); + return rtg; } /* @@ -493,6 +516,7 @@ xfs_zone_alloc_blocks( if (*is_seq) rgbno = 0; ioend->io_sector = xfs_rtb_to_daddr(mp, xfs_rgbno_to_rtb(rtg, rgbno)); + ioend->io_bio.bi_write_hint = rtg->rtg_write_stream; return XFS_FSB_TO_B(mp, count_fsb); } diff --git a/fs/xfs/xfs_zone_gc.c b/fs/xfs/xfs_zone_gc.c index 3eddc792eb1f..0e2ddd781147 100644 --- a/fs/xfs/xfs_zone_gc.c +++ b/fs/xfs/xfs_zone_gc.c @@ -602,6 +602,8 @@ xfs_select_gc_zone( */ if (rtg->rtg_written < rtg->rtg_extents) return NULL; + if (mp->m_write_streams) + clear_bit(rtg->rtg_write_stream, mp->m_write_streams); xfs_rtgroup_rele(rtg); rtg = NULL; } @@ -1170,6 +1172,34 @@ xfs_get_zone_info_cb( return error;
} + +/* + * XXX: this also needs to resync the hardware state with ours and advance the + * write pointer to match the usable capacity (e.g. RUAMW in FDP) because in + * case of a power fail we might have already written data, but not recorded + * it in the rmap yet. That needs a block layer API first, though. + */ +static void +xfs_recover_write_stream( + struct xfs_rtgroup *rtg) +{ + struct xfs_inode *rmapip = rtg->rtg_inodes[XFS_RTGI_RMAP]; + struct xfs_mount *mp = rtg_mount(rtg); + + /* + * The stream id comes from the on-disk ctime field and thus cannot be + * trusted after a crash or with a corrupted image: range check it + * before using it as a bit index into the m_max_open_zones-sized + * bitmap (the cast also rejects negative values). + */ + rtg->rtg_write_stream = VFS_I(rmapip)->i_ctime_sec; + if ((unsigned int)rtg->rtg_write_stream >= mp->m_max_open_zones || test_and_set_bit(rtg->rtg_write_stream, mp->m_write_streams)) { + xfs_warn(mp, "invalid write stream %u for zone %u", + rtg->rtg_write_stream, rtg_rgno(rtg)); + rtg->rtg_write_stream = 0; + } +} + static int xfs_init_zone( struct xfs_rtgroup *rtg, @@ -1220,6 +1244,8 @@ xfs_init_zone( set_bit(RTG_F_OPEN, &rtg->rtg_flags); *available += (rtg->rtg_extents - rtg->rtg_write_pointer); *freedblocks += (rtg->rtg_write_pointer) - used; + if (mp->m_write_streams) + xfs_recover_write_stream(rtg); } else if (used < rtg->rtg_extents) { /* zone fully written, but has freed blocks */ xfs_group_set_mark(&rtg->rtg_group, XFS_RTG_RECLAIMABLE); @@ -1259,8 +1285,9 @@ xfs_mount_zones( struct xfs_mount *mp) { struct xfs_buftarg *bt = mp->m_rtdev_targp; - unsigned int bdev_open_zones; + unsigned int bdev_open_zones = 0; int64_t available = 0, freedblocks = 0; + struct queue_limits *lim = bdev_limits(bt->bt_bdev); struct xfs_rtgroup *rtg = NULL; int error; @@ -1299,7 +1326,11 @@ xfs_mount_zones( * Note: To debug the open zone management code, force max_open to * 1 here.
*/ - bdev_open_zones = bdev_max_open_zones(bt->bt_bdev); + if (bdev_is_zoned(bt->bt_bdev)) + bdev_open_zones = lim->max_open_zones; + else if (lim->features & BLK_FEAT_PLACEMENT_HINTS) + bdev_open_zones = lim->max_write_hints; + if (bdev_open_zones && !mp->m_max_open_zones) mp->m_max_open_zones = bdev_open_zones; if (mp->m_max_open_zones) { @@ -1348,6 +1379,15 @@ xfs_mount_zones( mp->m_sb.sb_rgcount, xfs_get_zone_info_cb, mp); if (error < 0) return error; + } else if (lim->features & BLK_FEAT_PLACEMENT_HINTS) { + /* + * XXX: This won't cope with the per-partition restriction + * bitmap which I have no idea how to use correctly. + */ + mp->m_write_streams = bitmap_zalloc(mp->m_max_open_zones, + GFP_KERNEL); + if (!mp->m_write_streams) + return -ENOMEM; } mp->m_zone_gc_thread = kthread_create(xfs_zoned_gcd, mp, @@ -1395,6 +1435,7 @@ out_unlink_zones: rtg = NULL; while ((rtg = xfs_rtgroup_next(mp, rtg))) list_del_init(&rtg->rtg_entry); + bitmap_free(mp->m_write_streams); return error; } @@ -1407,4 +1448,5 @@ xfs_unmount_zones( kthread_stop(mp->m_zone_gc_thread); while ((rtg = xfs_rtgroup_next(mp, rtg))) list_del_init(&rtg->rtg_entry); + bitmap_free(mp->m_write_streams); }