]> www.infradead.org Git - users/hch/xfsprogs.git/commitdiff
add zoned device support xfs-zoned-old-2024-04-10
author Christoph Hellwig <hch@lst.de>
Wed, 14 Feb 2024 14:48:07 +0000 (15:48 +0100)
committer Christoph Hellwig <hch@lst.de>
Wed, 14 Feb 2024 15:10:27 +0000 (16:10 +0100)
This is pretty crude, and only mkfs really works.  In repair, all the
code that checks and repairs realtime-related metadata is simply stubbed
out when using the zone allocator.

Signed-off-by: Christoph Hellwig <hch@lst.de>
include/xfs_mount.h
libxfs/xfs_sb.c
mkfs/proto.c
mkfs/xfs_mkfs.c
repair/phase5.c
repair/phase6.c
repair/rt.c

index 8949efbdc61b066ce97b5be5774b1eb0876f1ee0..6820c046652d6e4aa4a886447f0821a25bc47cf7 100644 (file)
@@ -190,6 +190,7 @@ typedef struct xfs_mount {
 #define XFS_FEAT_NREXT64       (1ULL << 26)    /* large extent counters */
 #define XFS_FEAT_METADIR       (1ULL << 27)    /* metadata directory tree */
 #define XFS_FEAT_RTGROUPS      (1ULL << 28)    /* realtime groups */
+#define XFS_FEAT_ZONED         (1ULL << 29)
 
 #define __XFS_HAS_FEAT(name, NAME) \
 static inline bool xfs_has_ ## name (struct xfs_mount *mp) \
@@ -236,6 +237,7 @@ __XFS_HAS_FEAT(needsrepair, NEEDSREPAIR)
 __XFS_HAS_FEAT(large_extent_counts, NREXT64)
 __XFS_HAS_FEAT(metadir, METADIR)
 __XFS_HAS_FEAT(rtgroups, RTGROUPS)
+__XFS_HAS_FEAT(zoned, ZONED)
 
 static inline bool xfs_has_rtrmapbt(struct xfs_mount *mp)
 {
@@ -251,7 +253,7 @@ static inline bool xfs_has_rtreflink(struct xfs_mount *mp)
 
 static inline bool xfs_has_rtsb(struct xfs_mount *mp)
 {
-       return xfs_has_rtgroups(mp);
+       return xfs_has_rtgroups(mp) && !xfs_has_zoned(mp);
 }
 
 /* Kernel mount features that we don't support */
index 994699139f1741592c0e489845f2b2211b523bdd..0adb532eec2685a78e3bdd962d35f949a66221f7 100644 (file)
@@ -585,6 +585,7 @@ xfs_validate_sb_common(
 
                rexts = div_u64(sbp->sb_rblocks, sbp->sb_rextsize);
 
+               // XXX: zoned?
                if (xfs_sb_is_v5(sbp) &&
                    (sbp->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_RTGROUPS))
                        rbmblock_bytes -= sizeof(struct xfs_rtbuf_blkinfo);
@@ -1113,7 +1114,7 @@ xfs_sb_mount_common(
        mp->m_rtxblkmask = mask64_if_power2(sbp->sb_rextsize);
 
        if (is_power_of_2(sbp->sb_rgblocks)) {
-               mp->m_rgblklog = ilog2(sbp->sb_rgblocks);
+               mp->m_rgblklog = log2_if_power2(sbp->sb_rgblocks);
                mp->m_rgblkmask = sbp->sb_rgblocks - 1;
        } else if (xfs_has_zoned(mp)) {
                /*
index fc230cc975dcd0ab2021c79b43e59ef4c9009a69..7d78e04a54fc48ed5b3460a93c7413b1ac5b4132 100644 (file)
@@ -870,6 +870,14 @@ rtrmapbt_create(
        if (error)
                fail(_("rtrmap inode creation failed"), error);
 
+       if (xfs_has_zoned(mp)) {
+               /* Clear the WP/freed fields */
+               inode_set_atime(VFS_I(rtg->rtg_rmapip), 0, 0);
+               inode_set_mtime(VFS_I(rtg->rtg_rmapip), 0, 0);
+               libxfs_trans_log_inode(upd.tp, rtg->rtg_rmapip, XFS_ILOG_CORE);
+               goto commit;
+       }
+
        /* Adding an rmap for the rtgroup super should fit in the data fork */
        cur = libxfs_rtrmapbt_init_cursor(mp, upd.tp, rtg, rtg->rtg_rmapip);
        error = -libxfs_rmap_map_raw(cur, &rmap);
@@ -877,6 +885,7 @@ rtrmapbt_create(
        if (error)
                fail(_("rtrmapbt initialization failed"), error);
 
+commit:
        error = -libxfs_imeta_commit_update(&upd);
        if (error)
                fail(_("rtrmapbt commit failed"), error);
@@ -1104,7 +1113,7 @@ rtinit(
                        rtrefcountbt_create(rtg);
        }
 
-       if (mp->m_sb.sb_rbmblocks == 0)
+       if (mp->m_sb.sb_rbmblocks == 0 || xfs_has_zoned(mp))
                return;
 
        rtbitmap_init(mp);
index 20950614542c957f6011c38617ec0d3f7199d435..6ee47a83564955284493ccb24b752dd3a03edab0 100644 (file)
@@ -7,6 +7,7 @@
 #include "libfrog/util.h"
 #include "libxfs.h"
 #include <ctype.h>
+#include <linux/blkzoned.h>
 #include "xfs_multidisk.h"
 #include "libxcmd.h"
 #include "libfrog/fsgeom.h"
@@ -134,6 +135,7 @@ enum {
        R_RTGROUPS,
        R_RGCOUNT,
        R_RGSIZE,
+       R_ZONED,
        R_MAX_OPTS,
 };
 
@@ -724,6 +726,7 @@ static struct opt_params ropts = {
                [R_RTGROUPS] = "rtgroups",
                [R_RGCOUNT] = "rgcount",
                [R_RGSIZE] = "rgsize",
+               [R_ZONED] = "zoned",
                [R_MAX_OPTS] = NULL,
        },
        .subopt_params = {
@@ -784,6 +787,13 @@ static struct opt_params ropts = {
                  .maxval = (unsigned long long)XFS_MAX_RGBLOCKS << XFS_MAX_BLOCKSIZE_LOG,
                  .defaultval = SUBOPT_NEEDS_VAL,
                },
+               { .index = R_ZONED,
+                 .conflicts = { { &ropts, R_EXTSIZE },
+                                { NULL, LAST_CONFLICT } },
+                 .minval = 0,
+                 .maxval = 1,
+                 .defaultval = 0,
+               },
        },
 };
 
@@ -937,6 +947,7 @@ struct sb_feat_args {
        bool    nortalign;
        bool    nrext64;
        bool    rtgroups;               /* XFS_SB_FEAT_COMPAT_RTGROUPS */
+       bool    zoned;
 };
 
 struct cli_params {
@@ -1455,6 +1466,28 @@ discard_blocks(int fd, uint64_t nsectors, int quiet)
                printf("Done.\n");
 }
 
+/*
+ * Issue a BLKRESETZONE ioctl on @fd covering @nsectors sectors from the
+ * start of the device.
+ *
+ * Progress is printed to stdout unless @quiet is set.  A failed reset is
+ * always reported on stderr, together with the reason, and terminates mkfs.
+ */
+static void
+reset_zones(int fd, uint64_t nsectors, int quiet)
+{
+       /* .sector is left at zero, so the range starts at sector 0 */
+       struct blk_zone_range range = {
+               .nr_sectors     = nsectors,
+       };
+
+       if (!quiet) {
+               printf("Resetting zones...");
+               fflush(stdout);
+       }
+
+       if (ioctl(fd, BLKRESETZONE, &range) < 0) {
+               /* save errno before printf() gets a chance to clobber it */
+               int     error = errno;
+
+               if (!quiet)
+                       printf(" FAILED\n");
+               fprintf(stderr,
+_("ioctl(BLKRESETZONE) failed: %s\n"), strerror(error));
+               exit(1);
+       }
+
+       if (!quiet)
+               printf("Done.\n");
+}
+
 static __attribute__((noreturn)) void
 illegal_option(
        const char              *value,
@@ -1994,6 +2027,9 @@ rtdev_opts_parser(
        case R_RGSIZE:
                cli->rgsize = getstr(value, opts, subopt);
                break;
+       case R_ZONED:
+               cli->sb_feat.zoned = getnum(value, opts, subopt);
+               break;
        default:
                return -EINVAL;
        }
@@ -2283,7 +2319,158 @@ _("Version 1 logs do not support sector size %d\n"),
 _("log stripe unit specified, using v2 logs\n"));
                cli->sb_feat.log_version = 2;
        }
+}
+
+/* Zone layout of a single device, as filled in by report_zones(). */
+struct zone_info {
+       /* device size divided by the zone size; stays zero if no zones found */
+       unsigned int            nr_zones;
+       /*
+        * Capacity reported for the first zone; report_zones() requires every
+        * other zone to match it.  Presumably in 512-byte sectors, given the
+        * "* 512" conversion in calculate_rtgroup_geometry() - TODO confirm.
+        */
+       unsigned int            zone_capacity;
+};
+
+/*
+ * Zone layout of the data, realtime and log devices, one zone_info each.
+ * validate_zoned() fills these in for every device that is not a regular file.
+ */
+struct zone_topology {
+       struct zone_info        data;
+       struct zone_info        rt;
+       struct zone_info        log;
+};
+
+#define ZONES_PER_IOCTL                        16384
 
+/*
+ * Probe the zone layout of the device @name and record it in @zi.
+ *
+ * Anything that is not a block device, and any block device for which
+ * BLKGETZONESZ fails or reports a zero zone size, is treated as not zoned:
+ * @zi is left untouched and 0 is returned.
+ *
+ * For a zoned device, @zi->nr_zones is set to the device size divided by the
+ * zone size and @zi->zone_capacity to the capacity of the first zone.  All
+ * zones must share the same length and capacity, and only conventional and
+ * sequential-write-required zones are accepted.
+ *
+ * Returns 0 on success or a negative errno value on failure.  Note that
+ * @zi->nr_zones may already have been set when a failure is returned.
+ */
+static int report_zones(const char *name, struct zone_info *zi)
+{
+       struct blk_zone_report *rep;
+       size_t rep_size;
+       struct stat st;
+       unsigned int i, n = 0;
+       unsigned int nr_report;
+       unsigned int zone_size = 0;
+       uint64_t device_size;
+       uint64_t sector = 0;
+       int ret = 0;
+       int fd;
+
+       fd = open(name, O_RDONLY);
+       if (fd < 0)
+               return -EIO;
+
+       if (fstat(fd, &st) < 0) {
+               ret = -EIO;
+               goto out_close;
+       }
+       if (!S_ISBLK(st.st_mode))
+               goto out_close;
+
+       if (ioctl(fd, BLKGETSIZE64, &device_size)) {
+               ret = -EIO;
+               goto out_close;
+       }
+       if (ioctl(fd, BLKGETZONESZ, &zone_size) || !zone_size)
+               goto out_close; /* not zoned */
+
+       device_size /= 512; /* BLKGETSIZE64 reports a byte value */
+       zi->nr_zones = device_size / zone_size;
+
+       /*
+        * The report buffer only has room for nr_report zone descriptors, so
+        * never ask the kernel for more than that in a single call.
+        */
+       nr_report = min(zi->nr_zones, ZONES_PER_IOCTL);
+       rep_size = sizeof(struct blk_zone_report) +
+                  sizeof(struct blk_zone) * nr_report;
+       rep = malloc(rep_size);
+       if (!rep) {
+               ret = -ENOMEM;
+               goto out_close;
+       }
+
+       while (n < zi->nr_zones) {
+               /* the zone descriptors directly follow the report header */
+               struct blk_zone *zones = (struct blk_zone *)(rep + 1);
+
+               memset(rep, 0, rep_size);
+               rep->sector = sector;
+               rep->nr_zones = nr_report;
+
+               if (ioctl(fd, BLKREPORTZONE, rep) < 0) {
+                       ret = -errno;
+                       fprintf(stderr,
+_("ioctl(BLKREPORTZONE) failed: %d!\n"), ret);
+                       goto out_free;
+               }
+               if (!rep->nr_zones)
+                       break;
+
+               for (i = 0; i < rep->nr_zones; i++) {
+                       if (n >= zi->nr_zones)
+                               break;
+
+                       switch (zones[i].type) {
+                       case BLK_ZONE_TYPE_CONVENTIONAL:
+                               break;
+                       case BLK_ZONE_TYPE_SEQWRITE_REQ:
+                               break;
+                       case BLK_ZONE_TYPE_SEQWRITE_PREF:
+                               fprintf(stderr,
+_("Sequential write preferred zones not supported!\n"));
+                               ret = -EIO;
+                               goto out_free;
+                       default:
+                               fprintf(stderr,
+_("Unknown zone type (0x%x) found!\n"), zones[i].type);
+                               ret = -EIO;
+                               goto out_free;
+                       }
+
+                       if (zones[i].len != zone_size) {
+                               fprintf(stderr,
+_("Inconsistent zone size!\n"));
+                               ret = -EIO;
+                               goto out_free;
+                       }
+
+                       if (!n) {
+                               /* the first zone sets the reference capacity */
+                               zi->zone_capacity = zones[i].capacity;
+                               if (zi->zone_capacity > zone_size) {
+                                       fprintf(stderr,
+_("Zone capacity larger than zone size!\n"));
+                                       ret = -EIO;
+                                       goto out_free;
+                               }
+                       } else if (zones[i].capacity != zi->zone_capacity) {
+                               fprintf(stderr,
+_("Inconsistent zone capacity!\n"));
+                               ret = -EIO;
+                               goto out_free;
+                       }
+
+                       n++;
+               }
+
+               /* continue right after the last zone the kernel reported */
+               sector = zones[rep->nr_zones - 1].start +
+                        zones[rep->nr_zones - 1].len;
+       }
+
+out_free:
+       free(rep);
+out_close:
+       close(fd);
+       return ret;
+}
+
+/*
+ * Run report_zones() on @name and terminate mkfs if the zone layout could
+ * not be determined, rather than carrying on with incomplete zone data.
+ */
+static void
+probe_zones(
+       const char              *name,
+       struct zone_info        *zi)
+{
+       int                     error = report_zones(name, zi);
+
+       if (error) {
+               fprintf(stderr,
+_("%s: failed to query zone information (error %d)\n"), name, error);
+               exit(1);
+       }
+}
+
+/*
+ * Probe the data, realtime and log devices for zones and record the result
+ * in @zt.  Devices backed by regular files are not probed.
+ *
+ * Zones on the data or log device are rejected.  Zones on the realtime
+ * device turn on the zoned feature in @cli.  @cfg and @dft are not used here.
+ */
+static void
+validate_zoned(
+       struct mkfs_params      *cfg,
+       struct cli_params       *cli,
+       struct mkfs_default_params *dft,
+       struct zone_topology    *zt)
+{
+       if (!cli->xi->data.isfile) {
+               probe_zones(cli->xi->data.name, &zt->data);
+               if (zt->data.nr_zones) {
+                       fprintf(stderr,
+_("Zoned devices not supported as main device!\n"));
+                       usage();
+               }
+       }
+
+       if (cli->xi->rt.name && !cli->xi->rt.isfile) {
+               probe_zones(cli->xi->rt.name, &zt->rt);
+               /* a zoned realtime device implies the zoned feature */
+               if (zt->rt.nr_zones)
+                       cli->sb_feat.zoned = true;
+       }
+
+       if (cli->xi->log.name && !cli->xi->log.isfile) {
+               probe_zones(cli->xi->log.name, &zt->log);
+               if (zt->log.nr_zones) {
+                       fprintf(stderr,
+_("Zoned devices not supported as log device!\n"));
+                       usage();
+               }
+       }
 }
 
 /*
@@ -2461,6 +2648,22 @@ _("parent pointers not supported on v4 filesystems\n"));
        }
 
        if (cli->xi->rt.name) {
+               if (cli->sb_feat.zoned && !cli->sb_feat.rtgroups) {
+                       if (cli_opt_set(&ropts, R_RTGROUPS)) {
+                               fprintf(stderr,
+_("zoned mode not supported without rtgroups support\n"));
+                               usage();
+                       }
+                       cli->sb_feat.rtgroups = true;
+               }
+               if (cli->sb_feat.zoned && cli->rtextsize) {
+                       if (cli_opt_set(&ropts, R_EXTSIZE)) {
+                               fprintf(stderr,
+_("rt extent size not supported on realtime devices with zoned mode specified\n"));
+                               usage();
+                       }
+                       cli->rtextsize = 0;
+               }
                if (cli->rtextsize && cli->sb_feat.reflink) {
                        if (cli_opt_set(&mopts, M_REFLINK)) {
                                fprintf(stderr,
@@ -3109,6 +3312,7 @@ open_devices(
 static void
 discard_devices(
        struct libxfs_init      *xi,
+       struct zone_topology    *zt,
        int                     quiet)
 {
        /*
@@ -3117,8 +3321,12 @@ discard_devices(
 
        if (!xi->data.isfile)
                discard_blocks(xi->data.fd, xi->data.size, quiet);
-       if (xi->rt.dev && !xi->rt.isfile)
-               discard_blocks(xi->rt.fd, xi->rt.size, quiet);
+       if (xi->rt.dev && !xi->rt.isfile) {
+               if (zt->rt.nr_zones)
+                       reset_zones(xi->rt.fd, xi->rt.size, quiet);
+               else
+                       discard_blocks(xi->rt.fd, xi->rt.size, quiet);
+       }
        if (xi->log.dev && xi->log.dev != xi->data.dev && !xi->log.isfile)
                discard_blocks(xi->log.fd, xi->log.size, quiet);
 }
@@ -3237,7 +3445,8 @@ reported by the device (%u).\n"),
 static void
 validate_rtdev(
        struct mkfs_params      *cfg,
-       struct cli_params       *cli)
+       struct cli_params       *cli,
+       struct zone_topology    *zt)
 {
        struct libxfs_init      *xi = cli->xi;
        unsigned int            rbmblocksize = cfg->blocksize;
@@ -3273,6 +3482,9 @@ _("size %s specified for rt subvolume is too large, maxi->um is %lld blocks\n"),
 reported by the device (%u).\n"),
                                cfg->sectorsize, xi->rt.bsize);
                }
+       } else if (zt->rt.nr_zones) {
+               cfg->rtblocks = DTOBT(zt->rt.nr_zones * zt->rt.zone_capacity,
+                                     cfg->blocklog);
        } else {
                /* grab volume size */
                cfg->rtblocks = DTOBT(xi->rt.size, cfg->blocklog);
@@ -3622,20 +3834,28 @@ _("realtime group size (%llu) not at all congruent with extent size (%llu)\n"),
        return 0;
 }
 
+#define XFS_MIN_ZONES 3                /* XXX: move to header */
+
 static void
 calculate_rtgroup_geometry(
        struct mkfs_params      *cfg,
-       struct cli_params       *cli)
+       struct cli_params       *cli,
+       struct zone_topology    *zt)
 {
-       if (!cli->sb_feat.rtgroups) {
-               cfg->rgcount = 0;
-               cfg->rgsize = 0;
-               return;
-       }
+       if (zt->rt.nr_zones) {
+               cfg->rgsize = zt->rt.zone_capacity * 512;
+       } else {
+               if (!cli->sb_feat.rtgroups) {
+                       cfg->rgcount = 0;
+                       cfg->rgsize = 0;
+                       return;
+               }
 
-       if (cli->rgsize) {      /* User-specified rtgroup size */
-               cfg->rgsize = getnum(cli->rgsize, &ropts, R_RGSIZE);
+               if (cli->rgsize)        /* User-specified rtgroup size */
+                       cfg->rgsize = getnum(cli->rgsize, &ropts, R_RGSIZE);
+       }
 
+       if (cfg->rgsize) {
                /*
                 * Check specified agsize is a multiple of blocksize.
                 */
@@ -3705,6 +3925,13 @@ _("realtime group count (%llu) must be less than the maximum (%u)\n"),
                                XFS_MAX_RGNUMBER);
                usage();
        }
+       if (cfg->sb_feat.zoned && cfg->rgcount < XFS_MIN_ZONES)  {
+               fprintf(stderr,
+_("realtime group count (%llu) must be greater than the minimum (%u)\n"),
+                               (unsigned long long)cfg->rgcount,
+                               XFS_MIN_ZONES);
+               usage();
+       }
 }
 
 static void
@@ -3852,6 +4079,9 @@ sb_set_features(
                sbp->sb_rgcount = cfg->rgcount;
                sbp->sb_rgblocks = cfg->rgsize;
        }
+
+       if (fp->zoned)
+               sbp->sb_features_incompat |= XFS_SB_FEAT_INCOMPAT_ZONED;
 }
 
 /*
@@ -4413,9 +4643,9 @@ prepare_devices(
                         (xfs_extlen_t)XFS_FSB_TO_BB(mp, cfg->logblocks),
                         &sbp->sb_uuid, cfg->sb_feat.log_version,
                         lsunit, XLOG_FMT, XLOG_INIT_CYCLE, false);
-
        /* finally, check we can write the last block in the realtime area */
-       if (mp->m_rtdev_targp->bt_bdev && cfg->rtblocks > 0) {
+       if (mp->m_rtdev_targp->bt_bdev && cfg->rtblocks > 0 &&
+           !xfs_has_zoned(mp)) {
                buf = alloc_write_buf(mp->m_rtdev_targp,
                                XFS_FSB_TO_BB(mp, cfg->rtblocks - 1LL),
                                BTOBB(cfg->blocksize));
@@ -4778,7 +5008,7 @@ main(
                         */
                },
        };
-
+       struct zone_topology zt = {};
        struct list_head        buffer_list;
        int                     error;
 
@@ -4880,6 +5110,7 @@ main(
        sectorsize = cfg.sectorsize;
 
        validate_log_sectorsize(&cfg, &cli, &dft, &ft);
+       validate_zoned(&cfg, &cli, &dft, &zt);
        validate_sb_features(&cfg, &cli);
 
        /*
@@ -4908,7 +5139,7 @@ main(
        validate_overwrite(xi.data.name, force_overwrite);
        validate_datadev(&cfg, &cli);
        validate_logdev(&cfg, &cli);
-       validate_rtdev(&cfg, &cli);
+       validate_rtdev(&cfg, &cli, &zt);
        calc_stripe_factors(&cfg, &cli, &ft);
 
        /*
@@ -4919,7 +5150,7 @@ main(
         */
        calculate_initial_ag_geometry(&cfg, &cli, &xi);
        align_ag_geometry(&cfg);
-       calculate_rtgroup_geometry(&cfg, &cli);
+       calculate_rtgroup_geometry(&cfg, &cli, &zt);
 
        calculate_imaxpct(&cfg, &cli);
 
@@ -4973,7 +5204,7 @@ main(
         * All values have been validated, discard the old device layout.
         */
        if (discard && !dry_run)
-               discard_devices(&xi, quiet);
+               discard_devices(&xi, &zt, quiet);
 
        /*
         * we need the libxfs buffer cache from here on in.
index 1309a43df21167b464480f273f9e7f9ac45f48e5..e49fdfbe484fc438448366e2210f5eda6f55cbe8 100644 (file)
@@ -626,6 +626,19 @@ void
 check_rtmetadata(
        struct xfs_mount        *mp)
 {
+       if (xfs_has_zoned(mp)) {
+               /*
+                * Rough plan:
+                *
+                * for each RTG:
+                *  a) if on an actual zoned device only:
+                *      - compare hw write pointer to last written
+                *      - compare zone state to last written
+                *  b) make sure there is no used space after the write pointer
+                */
+               return;
+       }
+
        rtinit(mp);
        generate_rtinfo(mp, btmcompute, sumcompute);
        check_rtbitmap(mp);
index b9e527e25956aba8bf7b6ac1a1f6098416e46041..f90fc28b7f3eed7d73dde0a83499662cb5248704 100644 (file)
@@ -798,6 +798,9 @@ fill_rbmino(xfs_mount_t *mp)
        xfs_fileoff_t   bno;
        xfs_bmbt_irec_t map;
 
+       if (xfs_has_zoned(mp))
+               return;
+
        bmp = btmcompute;
        bno = 0;
 
@@ -886,6 +889,9 @@ fill_rsumino(xfs_mount_t *mp)
        xfs_fileoff_t   end_bno;
        xfs_bmbt_irec_t map;
 
+       if (xfs_has_zoned(mp))
+               return;
+
        smp = sumcompute;
        bno = 0;
        end_bno = mp->m_rsumsize >> mp->m_sb.sb_blocklog;
@@ -1137,6 +1143,17 @@ zap:
                                (unsigned long long)ip->i_ino,
                                error);
 
+       if (xfs_has_zoned(mp)) {
+               /* Clear the WP/freed fields */
+               inode_set_atime(VFS_I(ip), 0, 0);
+               inode_set_mtime(VFS_I(ip), 0, 0);
+
+               /*
+                * XXX: also fill out the counters based off the regenerated
+                * rmap.
+                */
+       }
+
        /* Copy our incore rmap data to the ondisk rmap inode. */
        error = populate_rtgroup_rmapbt(rtg, ip, est_fdblocks);
        if (error)
index ea183204dfab4b650047b02b2c0874f35955d151..f4ff849c49b5b93f01af065ba4cd22617b934593 100644 (file)
@@ -201,6 +201,8 @@ check_rtfile_contents(
        xfs_fileoff_t           bno = 0;
        int                     error;
 
+       ASSERT(!xfs_has_zoned(mp));
+
        error = -libxfs_iget(mp, NULL, ino, 0, &ip);
        if (error) {
                do_warn(_("unable to open %s file, err %d\n"), filename, error);