From e64bc8439c5d8c23b71a3fc0db235e6d90a44d54 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 14 Feb 2024 15:48:07 +0100 Subject: [PATCH] add zoned device support This is pretty crude, and only mkfs really works. repair has all the code checking and repairing realtime related metadata just stubbed out when using the zone allocator. Signed-off-by: Christoph Hellwig --- include/xfs_mount.h | 4 +- libxfs/xfs_sb.c | 3 +- mkfs/proto.c | 11 +- mkfs/xfs_mkfs.c | 265 +++++++++++++++++++++++++++++++++++++++++--- repair/phase5.c | 13 +++ repair/phase6.c | 17 +++ repair/rt.c | 2 + 7 files changed, 295 insertions(+), 20 deletions(-) diff --git a/include/xfs_mount.h b/include/xfs_mount.h index 8949efbdc..6820c0466 100644 --- a/include/xfs_mount.h +++ b/include/xfs_mount.h @@ -190,6 +190,7 @@ typedef struct xfs_mount { #define XFS_FEAT_NREXT64 (1ULL << 26) /* large extent counters */ #define XFS_FEAT_METADIR (1ULL << 27) /* metadata directory tree */ #define XFS_FEAT_RTGROUPS (1ULL << 28) /* realtime groups */ +#define XFS_FEAT_ZONED (1ULL << 29) #define __XFS_HAS_FEAT(name, NAME) \ static inline bool xfs_has_ ## name (struct xfs_mount *mp) \ @@ -236,6 +237,7 @@ __XFS_HAS_FEAT(needsrepair, NEEDSREPAIR) __XFS_HAS_FEAT(large_extent_counts, NREXT64) __XFS_HAS_FEAT(metadir, METADIR) __XFS_HAS_FEAT(rtgroups, RTGROUPS) +__XFS_HAS_FEAT(zoned, ZONED) static inline bool xfs_has_rtrmapbt(struct xfs_mount *mp) { @@ -251,7 +253,7 @@ static inline bool xfs_has_rtreflink(struct xfs_mount *mp) static inline bool xfs_has_rtsb(struct xfs_mount *mp) { - return xfs_has_rtgroups(mp); + return xfs_has_rtgroups(mp) && !xfs_has_zoned(mp); } /* Kernel mount features that we don't support */ diff --git a/libxfs/xfs_sb.c b/libxfs/xfs_sb.c index 994699139..0adb532ee 100644 --- a/libxfs/xfs_sb.c +++ b/libxfs/xfs_sb.c @@ -585,6 +585,7 @@ xfs_validate_sb_common( rexts = div_u64(sbp->sb_rblocks, sbp->sb_rextsize); + // XXX: zoned? 
if (xfs_sb_is_v5(sbp) && (sbp->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_RTGROUPS)) rbmblock_bytes -= sizeof(struct xfs_rtbuf_blkinfo); @@ -1113,7 +1114,7 @@ xfs_sb_mount_common( mp->m_rtxblkmask = mask64_if_power2(sbp->sb_rextsize); if (is_power_of_2(sbp->sb_rgblocks)) { - mp->m_rgblklog = ilog2(sbp->sb_rgblocks); + mp->m_rgblklog = log2_if_power2(sbp->sb_rgblocks); mp->m_rgblkmask = sbp->sb_rgblocks - 1; } else if (xfs_has_zoned(mp)) { /* diff --git a/mkfs/proto.c b/mkfs/proto.c index fc230cc97..7d78e04a5 100644 --- a/mkfs/proto.c +++ b/mkfs/proto.c @@ -870,6 +870,14 @@ rtrmapbt_create( if (error) fail(_("rtrmap inode creation failed"), error); + if (xfs_has_zoned(mp)) { + /* Clear the WP/freed fields */ + inode_set_atime(VFS_I(rtg->rtg_rmapip), 0, 0); + inode_set_mtime(VFS_I(rtg->rtg_rmapip), 0, 0); + libxfs_trans_log_inode(upd.tp, rtg->rtg_rmapip, XFS_ILOG_CORE); + goto commit; + } + /* Adding an rmap for the rtgroup super should fit in the data fork */ cur = libxfs_rtrmapbt_init_cursor(mp, upd.tp, rtg, rtg->rtg_rmapip); error = -libxfs_rmap_map_raw(cur, &rmap); @@ -877,6 +885,7 @@ rtrmapbt_create( if (error) fail(_("rtrmapbt initialization failed"), error); +commit: error = -libxfs_imeta_commit_update(&upd); if (error) fail(_("rtrmapbt commit failed"), error); @@ -1104,7 +1113,7 @@ rtinit( rtrefcountbt_create(rtg); } - if (mp->m_sb.sb_rbmblocks == 0) + if (mp->m_sb.sb_rbmblocks == 0 || xfs_has_zoned(mp)) return; rtbitmap_init(mp); diff --git a/mkfs/xfs_mkfs.c b/mkfs/xfs_mkfs.c index 209506145..6ee47a835 100644 --- a/mkfs/xfs_mkfs.c +++ b/mkfs/xfs_mkfs.c @@ -7,6 +7,7 @@ #include "libfrog/util.h" #include "libxfs.h" #include <ctype.h> +#include <linux/blkzoned.h> #include "xfs_multidisk.h" #include "libxcmd.h" #include "libfrog/fsgeom.h" @@ -134,6 +135,7 @@ enum { R_RTGROUPS, R_RGCOUNT, R_RGSIZE, + R_ZONED, R_MAX_OPTS, }; @@ -724,6 +726,7 @@ static struct opt_params ropts = { [R_RTGROUPS] = "rtgroups", [R_RGCOUNT] = "rgcount", [R_RGSIZE] = "rgsize", + [R_ZONED] = "zoned", [R_MAX_OPTS] =
NULL, }, .subopt_params = { @@ -784,6 +787,13 @@ static struct opt_params ropts = { .maxval = (unsigned long long)XFS_MAX_RGBLOCKS << XFS_MAX_BLOCKSIZE_LOG, .defaultval = SUBOPT_NEEDS_VAL, }, + { .index = R_ZONED, + .conflicts = { { &ropts, R_EXTSIZE }, + { NULL, LAST_CONFLICT } }, + .minval = 0, + .maxval = 1, + .defaultval = 0, + }, }, }; @@ -937,6 +947,7 @@ struct sb_feat_args { bool nortalign; bool nrext64; bool rtgroups; /* XFS_SB_FEAT_COMPAT_RTGROUPS */ + bool zoned; }; struct cli_params { @@ -1455,6 +1466,28 @@ discard_blocks(int fd, uint64_t nsectors, int quiet) printf("Done.\n"); } +static void +reset_zones(int fd, uint64_t nsectors, int quiet) +{ + struct blk_zone_range range = { + .nr_sectors = nsectors, + }; + + if (!quiet) { + printf("Discarding blocks..."); + fflush(stdout); + } + + if (ioctl(fd, BLKRESETZONE, &range) < 0) { + if (!quiet) + printf(" FAILED\n"); + exit(1); + } + + if (!quiet) + printf("Done.\n"); +} + static __attribute__((noreturn)) void illegal_option( const char *value, @@ -1994,6 +2027,9 @@ rtdev_opts_parser( case R_RGSIZE: cli->rgsize = getstr(value, opts, subopt); break; + case R_ZONED: + cli->sb_feat.zoned = getnum(value, opts, subopt); + break; default: return -EINVAL; } @@ -2283,7 +2319,158 @@ _("Version 1 logs do not support sector size %d\n"), _("log stripe unit specified, using v2 logs\n")); cli->sb_feat.log_version = 2; } +} + +struct zone_info { + unsigned int nr_zones; + unsigned int zone_capacity; +}; + +struct zone_topology { + struct zone_info data; + struct zone_info rt; + struct zone_info log; +}; + +#define ZONES_PER_IOCTL 16384 +static int report_zones(const char *name, struct zone_info *zi) +{ + struct blk_zone_report *rep; + size_t rep_size; + struct stat st; + unsigned int i, n = 0; + unsigned int zone_size = 0; + uint64_t device_size; + uint64_t sector = 0; + int ret = 0; + int fd; + + fd = open(name, O_RDONLY); + if (fd < 0) + return -EIO; + + if (fstat(fd, &st) < 0) { + ret = -EIO; + goto out_close; + } + if 
(!S_ISBLK(st.st_mode)) + goto out_close; + + if (ioctl(fd, BLKGETSIZE64, &device_size)) { + ret = -EIO; + goto out_close; + } + if (ioctl(fd, BLKGETZONESZ, &zone_size) || !zone_size) + goto out_close; /* not zoned */ + + device_size /= 512; /* BLKGETSIZE64 reports a byte value */ + zi->nr_zones = device_size / zone_size; + + rep_size = sizeof(struct blk_zone_report) + + sizeof(struct blk_zone) * min(zi->nr_zones, ZONES_PER_IOCTL); + rep = malloc(rep_size); + if (!rep) { + ret = -ENOMEM; + goto out_close; + } + + while (n < zi->nr_zones) { + struct blk_zone *zones = (struct blk_zone *)(rep + 1); + + memset(rep, 0, rep_size); + rep->sector = sector; + rep->nr_zones = ZONES_PER_IOCTL; + + ret = ioctl(fd, BLKREPORTZONE, rep); + if (ret) { + fprintf(stderr, +_("ioctl(BLKREPORTZONE) failed: %d!\n"), ret); + goto out_free; + } + if (!rep->nr_zones) + break; + + for (i = 0; i < rep->nr_zones; i++) { + if (n >= zi->nr_zones) + break; + + switch (zones[i].type) { + case BLK_ZONE_TYPE_CONVENTIONAL: + break; + case BLK_ZONE_TYPE_SEQWRITE_REQ: + break; + case BLK_ZONE_TYPE_SEQWRITE_PREF: + fprintf(stderr, +_("Sequential write preferred zones not supported!\n")); + ret = -EIO; + goto out_free; + } + + if (zones[i].len != zone_size) { + fprintf(stderr, +_("Inconsistent zone size!\n")); + ret = -EIO; + goto out_free; + } + + if (!n) { + zi->zone_capacity = zones[i].capacity; + if (zi->zone_capacity > zone_size) { + fprintf(stderr, +_("Zone capacity larger than zone size!\n")); + ret = -EIO; + } + } else if (zones[i].capacity != zi->zone_capacity) { + fprintf(stderr, +_("Inconsistent zone capacity!\n")); + ret = -EIO; + goto out_free; + } + + n++; + } + sector = zones[rep->nr_zones - 1].start + + zones[rep->nr_zones - 1].len; + } + +out_free: + free(rep); +out_close: + close(fd); + return ret; +} + +static void +validate_zoned( + struct mkfs_params *cfg, + struct cli_params *cli, + struct mkfs_default_params *dft, + struct zone_topology *zt) +{ + if (!cli->xi->data.isfile) { + 
report_zones(cli->xi->data.name, &zt->data); + if (zt->data.nr_zones) { + fprintf(stderr, +_("Zoned devices not supported as main device!\n")); + usage(); + } + } + + if (cli->xi->rt.name && !cli->xi->rt.isfile) { + report_zones(cli->xi->rt.name, &zt->rt); + if (zt->rt.nr_zones && !cli->sb_feat.zoned) + cli->sb_feat.zoned = true; + } + + if (cli->xi->log.name && !cli->xi->log.isfile) { + report_zones(cli->xi->log.name, &zt->log); + if (zt->log.nr_zones) { + fprintf(stderr, +_("Zoned devices not supported as log device!\n")); + usage(); + } + } } /* @@ -2461,6 +2648,22 @@ _("parent pointers not supported on v4 filesystems\n")); } if (cli->xi->rt.name) { + if (cli->sb_feat.zoned && !cli->sb_feat.rtgroups) { + if (cli_opt_set(&ropts, R_RTGROUPS)) { + fprintf(stderr, +_("zoned mode not supported without rtgroups support\n")); + usage(); + } + cli->sb_feat.rtgroups = true; + } + if (cli->sb_feat.zoned && cli->rtextsize) { + if (cli_opt_set(&ropts, R_EXTSIZE)) { + fprintf(stderr, +_("rt extent size not supported on realtime devices with zoned mode specified\n")); + usage(); + } + cli->rtextsize = 0; + } if (cli->rtextsize && cli->sb_feat.reflink) { if (cli_opt_set(&mopts, M_REFLINK)) { fprintf(stderr, @@ -3109,6 +3312,7 @@ open_devices( static void discard_devices( struct libxfs_init *xi, + struct zone_topology *zt, int quiet) { /* @@ -3117,8 +3321,12 @@ discard_devices( if (!xi->data.isfile) discard_blocks(xi->data.fd, xi->data.size, quiet); - if (xi->rt.dev && !xi->rt.isfile) - discard_blocks(xi->rt.fd, xi->rt.size, quiet); + if (xi->rt.dev && !xi->rt.isfile) { + if (zt->rt.nr_zones) + reset_zones(xi->rt.fd, xi->rt.size, quiet); + else + discard_blocks(xi->rt.fd, xi->rt.size, quiet); + } if (xi->log.dev && xi->log.dev != xi->data.dev && !xi->log.isfile) discard_blocks(xi->log.fd, xi->log.size, quiet); } @@ -3237,7 +3445,8 @@ reported by the device (%u).\n"), static void validate_rtdev( struct mkfs_params *cfg, - struct cli_params *cli) + struct cli_params *cli, + 
struct zone_topology *zt) { struct libxfs_init *xi = cli->xi; unsigned int rbmblocksize = cfg->blocksize; @@ -3273,6 +3482,9 @@ _("size %s specified for rt subvolume is too large, maxi->um is %lld blocks\n"), reported by the device (%u).\n"), cfg->sectorsize, xi->rt.bsize); } + } else if (zt->rt.nr_zones) { + cfg->rtblocks = DTOBT(zt->rt.nr_zones * zt->rt.zone_capacity, + cfg->blocklog); } else { /* grab volume size */ cfg->rtblocks = DTOBT(xi->rt.size, cfg->blocklog); @@ -3622,20 +3834,28 @@ _("realtime group size (%llu) not at all congruent with extent size (%llu)\n"), return 0; } +#define XFS_MIN_ZONES 3 /* XXX: move to header */ + static void calculate_rtgroup_geometry( struct mkfs_params *cfg, - struct cli_params *cli) + struct cli_params *cli, + struct zone_topology *zt) { - if (!cli->sb_feat.rtgroups) { - cfg->rgcount = 0; - cfg->rgsize = 0; - return; - } + if (zt->rt.nr_zones) { + cfg->rgsize = zt->rt.zone_capacity * 512; + } else { + if (!cli->sb_feat.rtgroups) { + cfg->rgcount = 0; + cfg->rgsize = 0; + return; + } - if (cli->rgsize) { /* User-specified rtgroup size */ - cfg->rgsize = getnum(cli->rgsize, &ropts, R_RGSIZE); + if (cli->rgsize) /* User-specified rtgroup size */ + cfg->rgsize = getnum(cli->rgsize, &ropts, R_RGSIZE); + } + if (cfg->rgsize) { /* * Check specified agsize is a multiple of blocksize. 
*/ @@ -3705,6 +3925,13 @@ _("realtime group count (%llu) must be less than the maximum (%u)\n"), XFS_MAX_RGNUMBER); usage(); } + if (cfg->sb_feat.zoned && cfg->rgcount < XFS_MIN_ZONES) { + fprintf(stderr, +_("realtime group count (%llu) must be greater than the minimum (%u)\n"), + (unsigned long long)cfg->rgcount, + XFS_MIN_ZONES); + usage(); + } } static void @@ -3852,6 +4079,9 @@ sb_set_features( sbp->sb_rgcount = cfg->rgcount; sbp->sb_rgblocks = cfg->rgsize; } + + if (fp->zoned) + sbp->sb_features_incompat |= XFS_SB_FEAT_INCOMPAT_ZONED; } /* @@ -4413,9 +4643,9 @@ prepare_devices( (xfs_extlen_t)XFS_FSB_TO_BB(mp, cfg->logblocks), &sbp->sb_uuid, cfg->sb_feat.log_version, lsunit, XLOG_FMT, XLOG_INIT_CYCLE, false); - /* finally, check we can write the last block in the realtime area */ - if (mp->m_rtdev_targp->bt_bdev && cfg->rtblocks > 0) { + if (mp->m_rtdev_targp->bt_bdev && cfg->rtblocks > 0 && + !xfs_has_zoned(mp)) { buf = alloc_write_buf(mp->m_rtdev_targp, XFS_FSB_TO_BB(mp, cfg->rtblocks - 1LL), BTOBB(cfg->blocksize)); @@ -4778,7 +5008,7 @@ main( */ }, }; - + struct zone_topology zt = {}; struct list_head buffer_list; int error; @@ -4880,6 +5110,7 @@ main( sectorsize = cfg.sectorsize; validate_log_sectorsize(&cfg, &cli, &dft, &ft); + validate_zoned(&cfg, &cli, &dft, &zt); validate_sb_features(&cfg, &cli); /* @@ -4908,7 +5139,7 @@ main( validate_overwrite(xi.data.name, force_overwrite); validate_datadev(&cfg, &cli); validate_logdev(&cfg, &cli); - validate_rtdev(&cfg, &cli); + validate_rtdev(&cfg, &cli, &zt); calc_stripe_factors(&cfg, &cli, &ft); /* @@ -4919,7 +5150,7 @@ main( */ calculate_initial_ag_geometry(&cfg, &cli, &xi); align_ag_geometry(&cfg); - calculate_rtgroup_geometry(&cfg, &cli); + calculate_rtgroup_geometry(&cfg, &cli, &zt); calculate_imaxpct(&cfg, &cli); @@ -4973,7 +5204,7 @@ main( * All values have been validated, discard the old device layout. 
*/ if (discard && !dry_run) - discard_devices(&xi, quiet); + discard_devices(&xi, &zt, quiet); /* * we need the libxfs buffer cache from here on in. diff --git a/repair/phase5.c b/repair/phase5.c index 1309a43df..e49fdfbe4 100644 --- a/repair/phase5.c +++ b/repair/phase5.c @@ -626,6 +626,19 @@ void check_rtmetadata( struct xfs_mount *mp) { + if (xfs_has_zoned(mp)) { + /* + * Rough plan: + * + * for each RTG: + * a) if on an actual zoned device only: + * - compare hw write pointer to last written + * - compare zone state to last written + * b) make sure there is no used space after the write pointer + */ + return; + } + rtinit(mp); generate_rtinfo(mp, btmcompute, sumcompute); check_rtbitmap(mp); diff --git a/repair/phase6.c b/repair/phase6.c index b9e527e25..f90fc28b7 100644 --- a/repair/phase6.c +++ b/repair/phase6.c @@ -798,6 +798,9 @@ fill_rbmino(xfs_mount_t *mp) xfs_fileoff_t bno; xfs_bmbt_irec_t map; + if (xfs_has_zoned(mp)) + return; + bmp = btmcompute; bno = 0; @@ -886,6 +889,9 @@ fill_rsumino(xfs_mount_t *mp) xfs_fileoff_t end_bno; xfs_bmbt_irec_t map; + if (xfs_has_zoned(mp)) + return; + smp = sumcompute; bno = 0; end_bno = mp->m_rsumsize >> mp->m_sb.sb_blocklog; @@ -1137,6 +1143,17 @@ zap: (unsigned long long)ip->i_ino, error); + if (xfs_has_zoned(mp)) { + /* Clear the WP/freed fields */ + inode_set_atime(VFS_I(ip), 0, 0); + inode_set_mtime(VFS_I(ip), 0, 0); + + /* + * XXX: also fill out the counters based off the regenerated + * rmap. + */ + } + /* Copy our incore rmap data to the ondisk rmap inode. */ error = populate_rtgroup_rmapbt(rtg, ip, est_fdblocks); if (error) diff --git a/repair/rt.c b/repair/rt.c index ea183204d..f4ff849c4 100644 --- a/repair/rt.c +++ b/repair/rt.c @@ -201,6 +201,8 @@ check_rtfile_contents( xfs_fileoff_t bno = 0; int error; + ASSERT(!xfs_has_zoned(mp)); + error = -libxfs_iget(mp, NULL, ino, 0, &ip); if (error) { do_warn(_("unable to open %s file, err %d\n"), filename, error); -- 2.50.1