xfs: support write stream separation xfs-zoned-streams
author Christoph Hellwig <hch@lst.de>
Fri, 1 Nov 2024 04:51:08 +0000 (05:51 +0100)
committer Christoph Hellwig <hch@lst.de>
Tue, 5 Nov 2024 15:35:39 +0000 (16:35 +0100)
Allow picking a write stream ID per "active zone" equivalent on
conventional devices.  The only complicated part is stealing yet
another timestamp field on the rmap inode to store the write stream
ID, so that we can restart after a remount without de-synchronizing
the software write pointer and the hardware equivalent.  Due to the
lack of a block layer API to query or resync our write pointer,
such a de-synchronization can unfortunately still happen on power
failure or a kernel crash.

Signed-off-by: Christoph Hellwig <hch@lst.de>
fs/xfs/libxfs/xfs_rtgroup.h
fs/xfs/xfs_mount.h
fs/xfs/xfs_zone_alloc.c
fs/xfs/xfs_zone_gc.c

index 89bfcefb484bfa9e94dddf2b259ef73d729f2ec7..46ab6cc43dc3150209dc5e34abab616cb66df7d1 100644 (file)
@@ -53,6 +53,7 @@ struct xfs_rtgroup {
        xfs_rgblock_t           rtg_write_pointer;
        xfs_rgblock_t           rtg_written;
        enum rw_hint            rtg_write_hint;
+       int                     rtg_write_stream;
 
        /* zone state entry */
        struct list_head        rtg_entry;
index 4565f4c9ea1f8eb4c2c2c85c7b8cbec8bf20102a..d9c4e01db92a3d93adac2bdc6d4c086c55f16f94 100644 (file)
@@ -281,6 +281,7 @@ typedef struct xfs_mount {
        spinlock_t              m_reservation_lock;
        struct list_head        m_reclaim_reservations;
        struct task_struct      *m_zone_gc_thread;
+       unsigned long           *m_write_streams;
        struct dentry           *m_debugfs;     /* debugfs parent */
        struct xfs_kobj         m_kobj;
        struct xfs_kobj         m_error_kobj;
index 29dea65fcd5735393261f178b164baf868132593..d92814f3e1ad21dc79a5954e9ca3afe45b2604e1 100644 (file)
@@ -99,6 +99,8 @@ xfs_zone_mark_full(
 
        spin_lock(&mp->m_zone_list_lock);
        clear_bit(RTG_F_OPEN, &rtg->rtg_flags);
+       if (mp->m_write_streams)
+               clear_bit(rtg->rtg_write_stream, mp->m_write_streams);
        if (!list_empty(&rtg->rtg_entry)) {
                /* empty list means this is the open GC zone */
                mp->m_nr_open_zones--;
@@ -155,6 +157,15 @@ xfs_zone_record_blocks(
        if (rtg->rtg_written == rtg->rtg_extents)
                xfs_zone_mark_full(rtg);
 
+       /*
+        * (ab)use the ctime field to log the write stream.  This allows us to
+        * pick up where we left off after an unmount or power fail event.
+        *
+        * As the inode core gets logged anyway, there is no cost to doing this
+        * every time an allocation is recorded.
+        */
+       VFS_I(rtg->rtg_inodes[XFS_RTGI_RMAP])->i_ctime_sec =
+                       rtg->rtg_write_stream;
        xfs_trans_log_inode(tp, rtg->rtg_inodes[XFS_RTGI_RMAP], XFS_ILOG_CORE);
 
        xfs_rtgroup_put(rtg);
@@ -246,14 +257,25 @@ xfs_find_free_zone(
 
        list_for_each_entry(rtg, &mp->m_free_zones, rtg_entry) {
                ASSERT(rtg->rtg_write_pointer == 0);
-               if (atomic_inc_not_zero(&rtg->rtg_group.xg_active_ref)) {
-                       list_del_init(&rtg->rtg_entry);
-                       atomic_dec(&mp->m_nr_free_zones);
-                       return rtg;
-               }
+               if (atomic_inc_not_zero(&rtg->rtg_group.xg_active_ref))
+                       goto found;
        }
 
        return NULL;
+found:
+       if (mp->m_write_streams) {
+               rtg->rtg_write_stream = find_first_zero_bit(mp->m_write_streams,
+                                       mp->m_max_open_zones);
+               if (rtg->rtg_write_stream < 0) {
+                       xfs_warn(mp, "no available write streams");
+                       return NULL;
+               }
+               set_bit(rtg->rtg_write_stream, mp->m_write_streams);
+       }
+
+       list_del_init(&rtg->rtg_entry);
+       atomic_dec(&mp->m_nr_free_zones);
+       return rtg;
 }
 
 /*
@@ -493,6 +515,7 @@ xfs_zone_alloc_blocks(
        if (*is_seq)
                rgbno = 0;
        ioend->io_sector = xfs_rtb_to_daddr(mp, xfs_rgbno_to_rtb(rtg, rgbno));
+       ioend->io_bio.bi_write_hint = rtg->rtg_write_stream;
        return XFS_FSB_TO_B(mp, count_fsb);
 }
 
index 3eddc792eb1f033f55f20c2f0c1cf5f91c6a87e9..0e2ddd781147fae6dfc143645facb8aef3cea23b 100644 (file)
@@ -602,6 +602,8 @@ xfs_select_gc_zone(
                 */
                if (rtg->rtg_written < rtg->rtg_extents)
                        return NULL;
+               if (mp->m_write_streams)
+                       clear_bit(rtg->rtg_write_stream, mp->m_write_streams);
                xfs_rtgroup_rele(rtg);
                rtg = NULL;
        }
@@ -1170,6 +1172,28 @@ xfs_get_zone_info_cb(
        return error;
 }
 
+
+/*
+ * Restore the write stream of an open zone from the ID that
+ * xfs_zone_record_blocks stored in the ctime of the zone's rmap inode, and
+ * mark that stream as in use in mp->m_write_streams.  The caller must only
+ * call this when mp->m_write_streams has been allocated.
+ *
+ * The ID is read back from the inode and is thus not trustworthy: it has to
+ * be checked against the size of the bitmap (mp->m_max_open_zones bits)
+ * before it is used as a bit index.  An out of range ID is treated like a
+ * duplicate one: warn and fall back to write stream 0.
+ *
+ * XXX: this also needs to resync the hardware state with ours and advance the
+ * write pointer to match the usable capacity (e.g. RUAMW in FDP) because in
+ * case of a power fail we might have already written data, but not recorded
+ * it in the rmap yet.  That needs a block layer API first, though.
+ */
+static void
+xfs_recover_write_stream(
+       struct xfs_rtgroup      *rtg)
+{
+       struct xfs_inode        *rmapip = rtg->rtg_inodes[XFS_RTGI_RMAP];
+       struct xfs_mount        *mp = rtg_mount(rtg);
+       time64_t                stream = VFS_I(rmapip)->i_ctime_sec;
+
+       /* Never index the bitmap with an unchecked value read from the inode. */
+       if (stream < 0 || stream >= mp->m_max_open_zones) {
+               xfs_warn(mp, "invalid write stream %lld for zone %u",
+                               (long long)stream, rtg_rgno(rtg));
+               rtg->rtg_write_stream = 0;
+               return;
+       }
+
+       rtg->rtg_write_stream = stream;
+       if (test_and_set_bit(rtg->rtg_write_stream, mp->m_write_streams)) {
+               xfs_warn(mp, "duplicate write stream %u for zone %u",
+                               rtg->rtg_write_stream, rtg_rgno(rtg));
+               rtg->rtg_write_stream = 0;
+       }
+}
+
 static int
 xfs_init_zone(
        struct xfs_rtgroup      *rtg,
@@ -1220,6 +1244,8 @@ xfs_init_zone(
                set_bit(RTG_F_OPEN, &rtg->rtg_flags);
                *available += (rtg->rtg_extents - rtg->rtg_write_pointer);
                *freedblocks += (rtg->rtg_write_pointer) - used;
+               if (mp->m_write_streams)
+                       xfs_recover_write_stream(rtg);
        } else if (used < rtg->rtg_extents) {
                /* zone fully written, but has freed blocks */
                xfs_group_set_mark(&rtg->rtg_group, XFS_RTG_RECLAIMABLE);
@@ -1259,8 +1285,9 @@ xfs_mount_zones(
        struct xfs_mount        *mp)
 {
        struct xfs_buftarg      *bt = mp->m_rtdev_targp;
-       unsigned int            bdev_open_zones;
+       unsigned int            bdev_open_zones = 0;
        int64_t                 available = 0, freedblocks = 0;
+       struct queue_limits     *lim = bdev_limits(bt->bt_bdev);
        struct xfs_rtgroup      *rtg = NULL;
        int                     error;
 
@@ -1299,7 +1326,11 @@ xfs_mount_zones(
         * Note: To debug the open zone management code, force max_open to
         * 1 here.
         */
-       bdev_open_zones = bdev_max_open_zones(bt->bt_bdev);
+       if (bdev_is_zoned(bt->bt_bdev))
+               bdev_open_zones = lim->max_open_zones;
+       else if (lim->features & BLK_FEAT_PLACEMENT_HINTS)
+               bdev_open_zones = lim->max_write_hints;
+
        if (bdev_open_zones && !mp->m_max_open_zones)
                mp->m_max_open_zones = bdev_open_zones;
        if (mp->m_max_open_zones) {
@@ -1348,6 +1379,15 @@ xfs_mount_zones(
                                mp->m_sb.sb_rgcount, xfs_get_zone_info_cb, mp);
                if (error < 0)
                        return error;
+       } else if (lim->features & BLK_FEAT_PLACEMENT_HINTS) {
+               /*
+                * XXX: This won't cope with the per-partition restriction
+                * bitmap which I have no idea how to use correctly.
+                */
+               mp->m_write_streams = bitmap_zalloc(mp->m_max_open_zones,
+                               GFP_KERNEL);
+               if (!mp->m_write_streams)
+                       return -ENOMEM;
        }
 
        mp->m_zone_gc_thread = kthread_create(xfs_zoned_gcd, mp,
@@ -1395,6 +1435,7 @@ out_unlink_zones:
        rtg = NULL;
        while ((rtg = xfs_rtgroup_next(mp, rtg)))
                list_del_init(&rtg->rtg_entry);
+       bitmap_free(mp->m_write_streams);
        return error;
 }
 
@@ -1407,4 +1448,5 @@ xfs_unmount_zones(
        kthread_stop(mp->m_zone_gc_thread);
        while ((rtg = xfs_rtgroup_next(mp, rtg)))
                list_del_init(&rtg->rtg_entry);
+       bitmap_free(mp->m_write_streams);
 }