From: Dave Chinner Date: Wed, 3 Jun 2015 02:13:34 +0000 (+1000) Subject: patch xfsprogs-rmap-btree-single-owner-2 X-Git-Url: https://www.infradead.org/git/?a=commitdiff_plain;h=refs%2Fheads%2Frmap-btree;p=users%2Fhch%2Fxfsprogs.git patch xfsprogs-rmap-btree-single-owner-2 --- diff --git a/db/agf.c b/db/agf.c index d9a07cafb..f0121abf4 100644 --- a/db/agf.c +++ b/db/agf.c @@ -55,6 +55,9 @@ const field_t agf_flds[] = { { "cntroot", FLDT_AGBLOCK, OI(OFF(roots) + XFS_BTNUM_CNT * SZ(roots[XFS_BTNUM_CNT])), C1, 0, TYP_CNTBT }, + { "rmaproot", FLDT_AGBLOCK, + OI(OFF(roots) + XFS_BTNUM_RMAP * SZ(roots[XFS_BTNUM_RMAP])), C1, 0, + TYP_RMAPBT }, { "levels", FLDT_UINT32D, OI(OFF(levels)), CI(XFS_BTNUM_AGF), FLD_ARRAY|FLD_SKIPALL, TYP_NONE }, { "bnolevel", FLDT_UINT32D, @@ -63,6 +66,9 @@ const field_t agf_flds[] = { { "cntlevel", FLDT_UINT32D, OI(OFF(levels) + XFS_BTNUM_CNT * SZ(levels[XFS_BTNUM_CNT])), C1, 0, TYP_NONE }, + { "rmaplevel", FLDT_UINT32D, + OI(OFF(levels) + XFS_BTNUM_RMAP * SZ(levels[XFS_BTNUM_RMAP])), C1, 0, + TYP_NONE }, { "flfirst", FLDT_UINT32D, OI(OFF(flfirst)), C1, 0, TYP_NONE }, { "fllast", FLDT_UINT32D, OI(OFF(fllast)), C1, 0, TYP_NONE }, { "flcount", FLDT_UINT32D, OI(OFF(flcount)), C1, 0, TYP_NONE }, diff --git a/db/btblock.c b/db/btblock.c index cdb8b1df2..707bac8a5 100644 --- a/db/btblock.c +++ b/db/btblock.c @@ -96,6 +96,12 @@ struct xfs_db_btree { sizeof(xfs_inobt_rec_t), sizeof(__be32), }, + { XFS_RMAP_CRC_MAGIC, + XFS_BTREE_SBLOCK_CRC_LEN, + sizeof(struct xfs_rmap_key), + sizeof(struct xfs_rmap_rec), + sizeof(__be32), + }, { 0, }, }; @@ -571,3 +577,47 @@ const field_t cntbt_rec_flds[] = { { NULL } }; #undef ROFF + +/* RMAP btree blocks */ +const field_t rmapbt_crc_hfld[] = { + { "", FLDT_RMAPBT_CRC, OI(0), C1, 0, TYP_NONE }, + { NULL } +}; + +#define OFF(f) bitize(offsetof(struct xfs_btree_block, bb_ ## f)) +const field_t rmapbt_crc_flds[] = { + { "magic", FLDT_UINT32X, OI(OFF(magic)), C1, 0, TYP_NONE }, + { "level", FLDT_UINT16D, OI(OFF(level)), 
C1, 0, TYP_NONE }, + { "numrecs", FLDT_UINT16D, OI(OFF(numrecs)), C1, 0, TYP_NONE }, + { "leftsib", FLDT_AGBLOCK, OI(OFF(u.s.bb_leftsib)), C1, 0, TYP_RMAPBT }, + { "rightsib", FLDT_AGBLOCK, OI(OFF(u.s.bb_rightsib)), C1, 0, TYP_RMAPBT }, + { "bno", FLDT_DFSBNO, OI(OFF(u.s.bb_blkno)), C1, 0, TYP_RMAPBT }, + { "lsn", FLDT_UINT64X, OI(OFF(u.s.bb_lsn)), C1, 0, TYP_NONE }, + { "uuid", FLDT_UUID, OI(OFF(u.s.bb_uuid)), C1, 0, TYP_NONE }, + { "owner", FLDT_AGNUMBER, OI(OFF(u.s.bb_owner)), C1, 0, TYP_NONE }, + { "crc", FLDT_CRC, OI(OFF(u.s.bb_crc)), C1, 0, TYP_NONE }, + { "recs", FLDT_RMAPBTREC, btblock_rec_offset, btblock_rec_count, + FLD_ARRAY|FLD_ABASE1|FLD_COUNT|FLD_OFFSET, TYP_NONE }, + { "keys", FLDT_RMAPBTKEY, btblock_key_offset, btblock_key_count, + FLD_ARRAY|FLD_ABASE1|FLD_COUNT|FLD_OFFSET, TYP_NONE }, + { "ptrs", FLDT_RMAPBTPTR, btblock_ptr_offset, btblock_key_count, + FLD_ARRAY|FLD_ABASE1|FLD_COUNT|FLD_OFFSET, TYP_RMAPBT }, + { NULL } +}; +#undef OFF + +#define KOFF(f) bitize(offsetof(struct xfs_rmap_key, rm_ ## f)) +const field_t rmapbt_key_flds[] = { + { "startblock", FLDT_AGBLOCK, OI(KOFF(startblock)), C1, 0, TYP_DATA }, + { NULL } +}; +#undef KOFF + +#define ROFF(f) bitize(offsetof(struct xfs_rmap_rec, rm_ ## f)) +const field_t rmapbt_rec_flds[] = { + { "startblock", FLDT_AGBLOCK, OI(ROFF(startblock)), C1, 0, TYP_DATA }, + { "blockcount", FLDT_EXTLEN, OI(ROFF(blockcount)), C1, 0, TYP_NONE }, + { "owner", FLDT_UINT64X, OI(ROFF(owner)), C1, 0, TYP_NONE }, + { NULL } +}; +#undef ROFF diff --git a/db/btblock.h b/db/btblock.h index daee060e3..d8662a1ea 100644 --- a/db/btblock.h +++ b/db/btblock.h @@ -51,4 +51,9 @@ extern const struct field cntbt_crc_hfld[]; extern const struct field cntbt_key_flds[]; extern const struct field cntbt_rec_flds[]; +extern const struct field rmapbt_crc_flds[]; +extern const struct field rmapbt_crc_hfld[]; +extern const struct field rmapbt_key_flds[]; +extern const struct field rmapbt_rec_flds[]; + extern int btblock_size(void *obj, int 
startoff, int idx); diff --git a/db/field.c b/db/field.c index 816065e74..d185b234c 100644 --- a/db/field.c +++ b/db/field.c @@ -164,6 +164,15 @@ const ftattr_t ftattrtab[] = { { FLDT_CNTBTREC, "cntbtrec", fp_sarray, (char *)cntbt_rec_flds, SI(bitsz(xfs_alloc_rec_t)), 0, NULL, cntbt_rec_flds }, + { FLDT_RMAPBT_CRC, "rmapbt", NULL, (char *)rmapbt_crc_flds, btblock_size, + FTARG_SIZE, NULL, rmapbt_crc_flds }, + { FLDT_RMAPBTKEY, "rmapbtkey", fp_sarray, (char *)rmapbt_key_flds, + SI(bitsz(struct xfs_rmap_key)), 0, NULL, rmapbt_key_flds }, + { FLDT_RMAPBTPTR, "rmapbtptr", fp_num, "%u", + SI(bitsz(xfs_rmap_ptr_t)), 0, fa_agblock, NULL }, + { FLDT_RMAPBTREC, "rmapbtrec", fp_sarray, (char *)rmapbt_rec_flds, + SI(bitsz(struct xfs_rmap_rec)), 0, NULL, rmapbt_rec_flds }, + /* CRC field */ { FLDT_CRC, "crc", fp_crc, "%#x (%s)", SI(bitsz(__uint32_t)), 0, NULL, NULL }, diff --git a/db/field.h b/db/field.h index 6343c9ae5..f3fba668d 100644 --- a/db/field.h +++ b/db/field.h @@ -80,6 +80,10 @@ typedef enum fldt { FLDT_CNTBTKEY, FLDT_CNTBTPTR, FLDT_CNTBTREC, + FLDT_RMAPBT_CRC, + FLDT_RMAPBTKEY, + FLDT_RMAPBTPTR, + FLDT_RMAPBTREC, /* CRC field type */ FLDT_CRC, diff --git a/db/type.c b/db/type.c index b29f2a47a..de978507c 100644 --- a/db/type.c +++ b/db/type.c @@ -58,6 +58,7 @@ static const typ_t __typtab[] = { { TYP_BMAPBTD, "bmapbtd", handle_struct, bmapbtd_hfld, NULL }, { TYP_BNOBT, "bnobt", handle_struct, bnobt_hfld, NULL }, { TYP_CNTBT, "cntbt", handle_struct, cntbt_hfld, NULL }, + { TYP_RMAPBT, NULL }, { TYP_DATA, "data", handle_block, NULL, NULL }, { TYP_DIR2, "dir2", handle_struct, dir2_hfld, NULL }, { TYP_DQBLK, "dqblk", handle_struct, dqblk_hfld, NULL }, @@ -87,6 +88,8 @@ static const typ_t __typtab_crc[] = { &xfs_allocbt_buf_ops }, { TYP_CNTBT, "cntbt", handle_struct, cntbt_crc_hfld, &xfs_allocbt_buf_ops }, + { TYP_RMAPBT, "rmapbt", handle_struct, rmapbt_crc_hfld, + &xfs_rmapbt_buf_ops }, { TYP_DATA, "data", handle_block, NULL, NULL }, { TYP_DIR2, "dir3", handle_struct, 
dir3_hfld, &xfs_dir3_db_buf_ops }, diff --git a/db/type.h b/db/type.h index 3bb26f174..9d02d6dad 100644 --- a/db/type.h +++ b/db/type.h @@ -24,7 +24,7 @@ struct field; typedef enum typnm { TYP_AGF, TYP_AGFL, TYP_AGI, TYP_ATTR, TYP_BMAPBTA, - TYP_BMAPBTD, TYP_BNOBT, TYP_CNTBT, TYP_DATA, + TYP_BMAPBTD, TYP_BNOBT, TYP_CNTBT, TYP_RMAPBT, TYP_DATA, TYP_DIR2, TYP_DQBLK, TYP_INOBT, TYP_INODATA, TYP_INODE, TYP_LOG, TYP_RTBITMAP, TYP_RTSUMMARY, TYP_SB, TYP_SYMLINK, TYP_TEXT, TYP_NONE diff --git a/include/Makefile b/include/Makefile index 70e43a05b..b3526ec29 100644 --- a/include/Makefile +++ b/include/Makefile @@ -30,7 +30,7 @@ QAHFILES = libxfs.h libxlog.h \ xfs_trace.h \ xfs_trans.h -HFILES = handle.h jdm.h xqm.h xfs.h +HFILES = handle.h jdm.h xqm.h xfs.h platform_defs.h HFILES += $(PKG_PLATFORM).h PHFILES = darwin.h freebsd.h irix.h linux.h gnukfreebsd.h DKHFILES = volume.h fstyp.h dvh.h diff --git a/include/libxfs.h b/include/libxfs.h index 6a59cc024..c6bd37ddb 100644 --- a/include/libxfs.h +++ b/include/libxfs.h @@ -66,6 +66,7 @@ extern uint32_t crc32c_le(uint32_t crc, unsigned char const *p, size_t len); #include #include #include +#include #include #include #include diff --git a/include/xfs_mount.h b/include/xfs_mount.h index 70bdea080..b614edd37 100644 --- a/include/xfs_mount.h +++ b/include/xfs_mount.h @@ -64,6 +64,8 @@ typedef struct xfs_mount { uint m_bmap_dmnr[2]; /* XFS_BMAP_BLOCK_DMINRECS */ uint m_inobt_mxr[2]; /* XFS_INOBT_BLOCK_MAXRECS */ uint m_inobt_mnr[2]; /* XFS_INOBT_BLOCK_MINRECS */ + uint m_rmap_mxr[2]; /* max rmap btree records */ + uint m_rmap_mnr[2]; /* min rmap btree records */ uint m_ag_maxlevels; /* XFS_AG_MAXLEVELS */ uint m_bm_maxlevels[2]; /* XFS_BM_MAXLEVELS */ uint m_in_maxlevels; /* XFS_IN_MAXLEVELS */ diff --git a/include/xfs_trace.h b/include/xfs_trace.h index ab046a914..31a994be2 100644 --- a/include/xfs_trace.h +++ b/include/xfs_trace.h @@ -169,4 +169,11 @@ #define trace_xfs_perag_get_tag(a,b,c,d) ((c) = (c)) #define 
trace_xfs_perag_put(a,b,c,d) ((c) = (c)) +#define trace_xfs_rmap_alloc_extent(a,b,c,d,e) ((void) 0) +#define trace_xfs_rmap_alloc_extent_done(a,b,c,d,e) ((void) 0) +#define trace_xfs_rmap_alloc_extent_error(a,b,c,d,e) ((void) 0) +#define trace_xfs_rmap_free_extent(a,b,c,d,e) ((void) 0) +#define trace_xfs_rmap_free_extent_done(a,b,c,d,e) ((void) 0) +#define trace_xfs_rmap_free_extent_error(a,b,c,d,e) ((void) 0) + #endif /* __TRACE_H__ */ diff --git a/libxfs/Makefile b/libxfs/Makefile index 981cb0bbd..7dffa6b78 100644 --- a/libxfs/Makefile +++ b/libxfs/Makefile @@ -42,6 +42,7 @@ QAHFILES = xfs_alloc.h \ xfs_inode_fork.h \ xfs_log_format.h \ xfs_quota_defs.h \ + xfs_rmap_btree.h \ xfs_sb.h \ xfs_shared.h \ xfs_trans_resv.h \ @@ -75,6 +76,8 @@ CFILES = cache.c \ xfs_ialloc_btree.c \ xfs_log_rlimit.c \ xfs_rtbitmap.c \ + xfs_rmap.c \ + xfs_rmap_btree.c \ xfs_sb.c \ xfs_symlink_remote.c \ xfs_trans_resv.c diff --git a/libxfs/xfs_alloc.c b/libxfs/xfs_alloc.c index 23e3c5387..d0003c563 100644 --- a/libxfs/xfs_alloc.c +++ b/libxfs/xfs_alloc.c @@ -26,6 +26,7 @@ #include "xfs_mount.h" #include "xfs_inode.h" #include "xfs_btree.h" +#include "xfs_rmap_btree.h" #include "xfs_alloc_btree.h" #include "xfs_alloc.h" #include "xfs_cksum.h" @@ -615,6 +616,12 @@ xfs_alloc_ag_vextent( ASSERT(!args->wasfromfl || !args->isfl); ASSERT(args->agbno % args->alignment == 0); + /* insert new block into the reverse map btree */ + error = xfs_rmap_alloc(args->tp, args->agbp, args->agno, + args->agbno, args->len, args->owner); + if (error) + return error; + if (!args->wasfromfl) { error = xfs_alloc_update_counters(args->tp, args->pag, args->agbp, @@ -1962,6 +1969,7 @@ xfs_alloc_fix_freelist( memset(&targs, 0, sizeof(targs)); targs.tp = tp; targs.mp = mp; + targs.owner = XFS_RMAP_OWN_AG; targs.agbp = agbp; targs.agno = args->agno; targs.alignment = targs.minlen = targs.prod = targs.isfl = 1; @@ -2586,6 +2594,8 @@ error0: * Free an extent. 
* Just break up the extent address and hand off to xfs_free_ag_extent * after fixing up the freelist. + * + * XXX: need owner of extent being freed */ int /* error */ xfs_free_extent( @@ -2627,6 +2637,12 @@ xfs_free_extent( goto error0; } + /* XXX: need owner */ + error = xfs_rmap_free(tp, args.agbp, args.agno, args.agbno, len, 0); + if (error) + goto error0; + + /* XXX: initially no multiple references, so just free it */ error = xfs_free_ag_extent(tp, args.agbp, args.agno, args.agbno, len, 0); if (!error) xfs_extent_busy_insert(tp, args.agno, args.agbno, len, 0); @@ -2634,3 +2650,14 @@ error0: xfs_perag_put(args.pag); return error; } + +xfs_extlen_t +xfs_prealloc_blocks( + struct xfs_mount *mp) +{ + if (xfs_sb_version_hasrmapbt(&mp->m_sb)) + return XFS_RMAP_BLOCK(mp) + 1; + if (xfs_sb_version_hasfinobt(&mp->m_sb)) + return XFS_FIBT_BLOCK(mp) + 1; + return XFS_IBT_BLOCK(mp) + 1; +} diff --git a/libxfs/xfs_alloc.h b/libxfs/xfs_alloc.h index db5da4a9c..4e52b1f8f 100644 --- a/libxfs/xfs_alloc.h +++ b/libxfs/xfs_alloc.h @@ -72,6 +72,8 @@ typedef unsigned int xfs_alloctype_t; * needed freelist blocks is 4 fsbs _per AG_, a potential split of file's bmap * btree requires 1 fsb, so we set the number of set-aside blocks * to 4 + 4*agcount. + * + * XXX: this changes for rmapbt filesystems. */ #define XFS_ALLOC_SET_ASIDE(mp) (4 + ((mp)->m_sb.sb_agcount * 4)) @@ -86,10 +88,13 @@ typedef unsigned int xfs_alloctype_t; * * The AG headers are sector sized, so the amount of space they take up is * dependent on filesystem geometry. The others are all single blocks. + * + * XXX: this changes for rmapbt filesystems. */ #define XFS_ALLOC_AG_MAX_USABLE(mp) \ ((mp)->m_sb.sb_agblocks - XFS_BB_TO_FSB(mp, XFS_FSS_TO_BB(mp, 4)) - 7) +xfs_extlen_t xfs_prealloc_blocks(struct xfs_mount *mp); /* * Argument structure for xfs_alloc routines. 
@@ -120,6 +125,7 @@ typedef struct xfs_alloc_arg { char isfl; /* set if is freelist blocks - !acctg */ char userdata; /* set if this is user data */ xfs_fsblock_t firstblock; /* io first block allocated */ + uint64_t owner; /* owner of blocks being allocated */ } xfs_alloc_arg_t; /* diff --git a/libxfs/xfs_bmap.c b/libxfs/xfs_bmap.c index e6d1e6c0d..56e68726c 100644 --- a/libxfs/xfs_bmap.c +++ b/libxfs/xfs_bmap.c @@ -769,6 +769,7 @@ xfs_bmap_extents_to_btree( memset(&args, 0, sizeof(args)); args.tp = tp; args.mp = mp; + args.owner = ip->i_ino; args.firstblock = *firstblock; if (*firstblock == NULLFSBLOCK) { args.type = XFS_ALLOCTYPE_START_BNO; @@ -915,6 +916,7 @@ xfs_bmap_local_to_extents( memset(&args, 0, sizeof(args)); args.tp = tp; args.mp = ip->i_mount; + args.owner = ip->i_ino; args.firstblock = *firstblock; /* * Allocate a block. We know we need only one, since the @@ -3683,6 +3685,7 @@ xfs_bmap_btalloc( memset(&args, 0, sizeof(args)); args.tp = ap->tp; args.mp = mp; + args.owner = ap->ip->i_ino; args.fsbno = ap->blkno; /* Trim the allocation back to the maximum an AG can fit. 
*/ diff --git a/libxfs/xfs_bmap_btree.c b/libxfs/xfs_bmap_btree.c index 2fd04e0cd..5df400019 100644 --- a/libxfs/xfs_bmap_btree.c +++ b/libxfs/xfs_bmap_btree.c @@ -442,6 +442,7 @@ xfs_bmbt_alloc_block( args.mp = cur->bc_mp; args.fsbno = cur->bc_private.b.firstblock; args.firstblock = args.fsbno; + args.owner = cur->bc_private.b.ip->i_ino; if (args.fsbno == NULLFSBLOCK) { args.fsbno = be64_to_cpu(start->l); diff --git a/libxfs/xfs_btree.h b/libxfs/xfs_btree.h index 8f18bab73..48ab2b105 100644 --- a/libxfs/xfs_btree.h +++ b/libxfs/xfs_btree.h @@ -38,17 +38,19 @@ union xfs_btree_ptr { }; union xfs_btree_key { - xfs_bmbt_key_t bmbt; - xfs_bmdr_key_t bmbr; /* bmbt root block */ - xfs_alloc_key_t alloc; - xfs_inobt_key_t inobt; + struct xfs_bmbt_key bmbt; + xfs_bmdr_key_t bmbr; /* bmbt root block */ + xfs_alloc_key_t alloc; + struct xfs_inobt_key inobt; + struct xfs_rmap_key rmap; }; union xfs_btree_rec { - xfs_bmbt_rec_t bmbt; - xfs_bmdr_rec_t bmbr; /* bmbt root block */ - xfs_alloc_rec_t alloc; - xfs_inobt_rec_t inobt; + struct xfs_bmbt_rec bmbt; + xfs_bmdr_rec_t bmbr; /* bmbt root block */ + struct xfs_alloc_rec alloc; + struct xfs_inobt_rec inobt; + struct xfs_rmap_rec rmap; }; /* @@ -63,6 +65,7 @@ union xfs_btree_rec { #define XFS_BTNUM_BMAP ((xfs_btnum_t)XFS_BTNUM_BMAPi) #define XFS_BTNUM_INO ((xfs_btnum_t)XFS_BTNUM_INOi) #define XFS_BTNUM_FINO ((xfs_btnum_t)XFS_BTNUM_FINOi) +#define XFS_BTNUM_RMAP ((xfs_btnum_t)XFS_BTNUM_RMAPi) /* * For logging record fields. 
@@ -94,6 +97,7 @@ do { \ case XFS_BTNUM_BMAP: __XFS_BTREE_STATS_INC(bmbt, stat); break; \ case XFS_BTNUM_INO: __XFS_BTREE_STATS_INC(ibt, stat); break; \ case XFS_BTNUM_FINO: __XFS_BTREE_STATS_INC(fibt, stat); break; \ + case XFS_BTNUM_RMAP: __XFS_BTREE_STATS_INC(rmap, stat); break; \ case XFS_BTNUM_MAX: ASSERT(0); /* fucking gcc */ ; break; \ } \ } while (0) @@ -108,6 +112,7 @@ do { \ case XFS_BTNUM_BMAP: __XFS_BTREE_STATS_ADD(bmbt, stat, val); break; \ case XFS_BTNUM_INO: __XFS_BTREE_STATS_ADD(ibt, stat, val); break; \ case XFS_BTNUM_FINO: __XFS_BTREE_STATS_ADD(fibt, stat, val); break; \ + case XFS_BTNUM_RMAP: __XFS_BTREE_STATS_ADD(rmap, stat, val); break; \ case XFS_BTNUM_MAX: ASSERT(0); /* fucking gcc */ ; break; \ } \ } while (0) @@ -199,6 +204,7 @@ typedef struct xfs_btree_cur xfs_alloc_rec_incore_t a; xfs_bmbt_irec_t b; xfs_inobt_rec_incore_t i; + struct xfs_rmap_irec r; } bc_rec; /* current insert/search record value */ struct xfs_buf *bc_bufs[XFS_BTREE_MAXLEVELS]; /* buf ptr per level */ int bc_ptrs[XFS_BTREE_MAXLEVELS]; /* key/record # */ diff --git a/libxfs/xfs_format.h b/libxfs/xfs_format.h index 4d313d3d1..0fe326f7f 100644 --- a/libxfs/xfs_format.h +++ b/libxfs/xfs_format.h @@ -445,8 +445,10 @@ xfs_sb_has_compat_feature( } #define XFS_SB_FEAT_RO_COMPAT_FINOBT (1 << 0) /* free inode btree */ +#define XFS_SB_FEAT_RO_COMPAT_RMAPBT (1 << 1) /* reverse map btree */ #define XFS_SB_FEAT_RO_COMPAT_ALL \ - (XFS_SB_FEAT_RO_COMPAT_FINOBT) + (XFS_SB_FEAT_RO_COMPAT_FINOBT | \ + XFS_SB_FEAT_RO_COMPAT_RMAPBT) #define XFS_SB_FEAT_RO_COMPAT_UNKNOWN ~XFS_SB_FEAT_RO_COMPAT_ALL static inline bool xfs_sb_has_ro_compat_feature( @@ -506,6 +508,12 @@ static inline int xfs_sb_version_hasfinobt(xfs_sb_t *sbp) (sbp->sb_features_ro_compat & XFS_SB_FEAT_RO_COMPAT_FINOBT); } +static inline bool xfs_sb_version_hasrmapbt(struct xfs_sb *sbp) +{ + return (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5) && + (sbp->sb_features_ro_compat & XFS_SB_FEAT_RO_COMPAT_RMAPBT); +} + /* * end of 
superblock version macros */ @@ -566,10 +574,10 @@ xfs_is_quota_inode(struct xfs_sb *sbp, xfs_ino_t ino) #define XFS_AGI_GOOD_VERSION(v) ((v) == XFS_AGI_VERSION) /* - * Btree number 0 is bno, 1 is cnt. This value gives the size of the + * Btree number 0 is bno, 1 is cnt, 2 is rmap. This value gives the size of the * arrays below. */ -#define XFS_BTNUM_AGF ((int)XFS_BTNUM_CNTi + 1) +#define XFS_BTNUM_AGF ((int)XFS_BTNUM_RMAPi + 1) /* * The second word of agf_levels in the first a.g. overlaps the EFS @@ -586,12 +594,10 @@ typedef struct xfs_agf { __be32 agf_seqno; /* sequence # starting from 0 */ __be32 agf_length; /* size in blocks of a.g. */ /* - * Freespace information + * Freespace and rmap information */ __be32 agf_roots[XFS_BTNUM_AGF]; /* root blocks */ - __be32 agf_spare0; /* spare field */ __be32 agf_levels[XFS_BTNUM_AGF]; /* btree levels */ - __be32 agf_spare1; /* spare field */ __be32 agf_flfirst; /* first freelist block's index */ __be32 agf_fllast; /* last freelist block's index */ @@ -1254,16 +1260,74 @@ typedef __be32 xfs_inobt_ptr_t; #define XFS_FIBT_BLOCK(mp) ((xfs_agblock_t)(XFS_IBT_BLOCK(mp) + 1)) /* - * The first data block of an AG depends on whether the filesystem was formatted - * with the finobt feature. If so, account for the finobt reserved root btree - * block. + * Reverse mapping btree format definitions + * + * There is a btree for the reverse map per allocation group + */ +#define XFS_RMAP_CRC_MAGIC 0x524d4233 /* 'RMB3' */ + +/* + * Special owner types. + * + * Seeing as we only support up to 8EB, we have the upper bit of the owner field + * to tell us we have a special owner value. We use these for static metadata + * allocated at mkfs/growfs time, as well as for freespace management metadata. 
+ */ +#define XFS_RMAP_OWN_NULL (-1ULL) /* No owner, for growfs */ +#define XFS_RMAP_OWN_UNKNOWN (-2ULL) /* Unknown owner, for EFI recovery */ +#define XFS_RMAP_OWN_FS (-3ULL) /* static fs metadata */ +#define XFS_RMAP_OWN_LOG (-4ULL) /* static fs metadata */ +#define XFS_RMAP_OWN_AG (-5ULL) /* AG freespace btree blocks */ +#define XFS_RMAP_OWN_INOBT (-6ULL) /* Inode btree blocks */ +#define XFS_RMAP_OWN_INODES (-7ULL) /* Inode chunk */ +#define XFS_RMAP_OWN_MIN (-8ULL) /* guard */ + +/* + * Data record structure + */ +struct xfs_rmap_rec { + __be32 rm_startblock; /* extent start block */ + __be32 rm_blockcount; /* extent length */ + __be64 rm_owner; /* extent owner */ +}; + +struct xfs_rmap_irec { + xfs_agblock_t rm_startblock; /* extent start block */ + xfs_extlen_t rm_blockcount; /* extent length */ + __uint64_t rm_owner; /* extent owner */ +}; + +/* + * Key structure + * + * We don't use the length for lookups + */ +struct xfs_rmap_key { + __be32 rm_startblock; /* extent start block */ +}; + +/* btree pointer type */ +typedef __be32 xfs_rmap_ptr_t; + +/* + * block numbers in the AG. */ -#define XFS_PREALLOC_BLOCKS(mp) \ +#define XFS_IBT_BLOCK(mp) ((xfs_agblock_t)(XFS_CNT_BLOCK(mp) + 1)) +#define XFS_FIBT_BLOCK(mp) ((xfs_agblock_t)(XFS_IBT_BLOCK(mp) + 1)) +#define XFS_RMAP_BLOCK(mp) \ (xfs_sb_version_hasfinobt(&((mp)->m_sb)) ? \ XFS_FIBT_BLOCK(mp) + 1 : \ XFS_IBT_BLOCK(mp) + 1) - +/* + * The first data block of an AG depends on whether the filesystem was formatted + * with the optional btree features. These need to be accounted for + * appropriately. + * + * XXX: this should be calculated once at mount time and stored in the struct + * xfs_mount rather than calculated every time it is used. 
+ */ +#define XFS_PREALLOC_BLOCKS(mp) xfs_prealloc_blocks(mp) /* * BMAP Btree format definitions diff --git a/libxfs/xfs_ialloc.c b/libxfs/xfs_ialloc.c index 2b4e4e077..08716f7e9 100644 --- a/libxfs/xfs_ialloc.c +++ b/libxfs/xfs_ialloc.c @@ -364,6 +364,7 @@ xfs_ialloc_ag_alloc( memset(&args, 0, sizeof(args)); args.tp = tp; args.mp = tp->t_mountp; + args.owner = XFS_RMAP_OWN_INODES; /* * Locking will ensure that we don't have two callers in here diff --git a/libxfs/xfs_ialloc_btree.c b/libxfs/xfs_ialloc_btree.c index 9ac143a5e..a40b9e8c3 100644 --- a/libxfs/xfs_ialloc_btree.c +++ b/libxfs/xfs_ialloc_btree.c @@ -95,6 +95,7 @@ xfs_inobt_alloc_block( memset(&args, 0, sizeof(args)); args.tp = cur->bc_tp; args.mp = cur->bc_mp; + args.owner = XFS_RMAP_OWN_INOBT; args.fsbno = XFS_AGB_TO_FSB(args.mp, cur->bc_private.a.agno, sbno); args.minlen = 1; args.maxlen = 1; diff --git a/libxfs/xfs_rmap.c b/libxfs/xfs_rmap.c new file mode 100644 index 000000000..b2a33302b --- /dev/null +++ b/libxfs/xfs_rmap.c @@ -0,0 +1,413 @@ + +/* + * Copyright (c) 2014 Red Hat, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include "libxfs_priv.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" +#include "xfs_bit.h" +#include "xfs_sb.h" +#include "xfs_mount.h" +#include "xfs_da_format.h" +#include "xfs_da_btree.h" +#include "xfs_btree.h" +#include "xfs_trans.h" +#include "xfs_alloc.h" +#include "xfs_rmap_btree.h" +#include "xfs_trans_space.h" +#include "xfs_trace.h" + + +/* + * Lookup the first record less than or equal to [bno, len] + * in the btree given by cur. + */ +STATIC int +xfs_rmap_lookup_le( + struct xfs_btree_cur *cur, + xfs_agblock_t bno, + xfs_extlen_t len, + uint64_t owner, + int *stat) +{ + cur->bc_rec.r.rm_startblock = bno; + cur->bc_rec.r.rm_blockcount = len; + cur->bc_rec.r.rm_owner = owner; + return xfs_btree_lookup(cur, XFS_LOOKUP_LE, stat); +} + +/* + * Update the record referred to by cur to the value given + * by [bno, len, ref]. + * This either works (return 0) or gets an EFSCORRUPTED error. + */ +STATIC int +xfs_rmap_update( + struct xfs_btree_cur *cur, + struct xfs_rmap_irec *irec) +{ + union xfs_btree_rec rec; + + rec.rmap.rm_startblock = cpu_to_be32(irec->rm_startblock); + rec.rmap.rm_blockcount = cpu_to_be32(irec->rm_blockcount); + rec.rmap.rm_owner = cpu_to_be64(irec->rm_owner); + return xfs_btree_update(cur, &rec); +} + +/* + * Get the data from the pointed-to record. 
+ */ +STATIC int +xfs_rmap_get_rec( + struct xfs_btree_cur *cur, + struct xfs_rmap_irec *irec, + int *stat) +{ + union xfs_btree_rec *rec; + int error; + + error = xfs_btree_get_rec(cur, &rec, stat); + if (error || !*stat) + return error; + + irec->rm_startblock = be32_to_cpu(rec->rmap.rm_startblock); + irec->rm_blockcount = be32_to_cpu(rec->rmap.rm_blockcount); + irec->rm_owner = be64_to_cpu(rec->rmap.rm_owner); + return 0; +} + +/* + * Find the extent in the rmap btree and remove it. + * + * The record we find should always span a range greater than or equal to the + * extent being freed. This makes the code simple as, in theory, we do not + * have to handle ranges that are split across multiple records as extents that + * result in bmap btree extent merges should also result in rmap btree extent + * merges. The owner field ensures we don't merge extents from different + * structures into the same record, hence this property should always hold true + * if we ensure that the rmap btree supports at least the same size maximum + * extent as the bmap btree (2^21 blocks at present). + * + * Complexity: when growing the filesystem, we "free" an extent when growing the + * last AG. This extent is new space and so it is not tracked as used space in + * the btree. The growfs code will pass in an owner of XFS_RMAP_OWN_NULL to + * indicate that it expects that there is no owner of this extent. We verify + * that - the extent lookup results in a record that does not overlap. + * + * Complexity #2: EFIs do not record the owner of the extent, so when recovering + * EFIs from the log we pass in XFS_RMAP_OWN_UNKNOWN to tell the rmap btree to + * ignore the owner (i.e. wildcard match) so we don't trigger corruption checks + * during log recovery. 
+ */ +int +xfs_rmap_free( + struct xfs_trans *tp, + struct xfs_buf *agbp, + xfs_agnumber_t agno, + xfs_agblock_t bno, + xfs_extlen_t len, + uint64_t owner) +{ + struct xfs_btree_cur *cur; + struct xfs_mount *mp = tp->t_mountp; + struct xfs_rmap_irec ltrec; + int error; + int i; + + /* + * if rmap btree is not supported, then just return success without + * doing anything. + */ + if (!xfs_sb_version_hasrmapbt(&tp->t_mountp->m_sb)) + return 0; + + trace_xfs_rmap_free_extent(mp, agno, bno, len, owner); + cur = xfs_rmapbt_init_cursor(mp, tp, agbp, agno); + + /* + * We always have a left record because there's a static record + * for the AG headers at rm_startblock == 0. + */ + error = xfs_rmap_lookup_le(cur, bno, len, owner, &i); + if (error) + goto out_error; + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, out_error); + + error = xfs_rmap_get_rec(cur, &ltrec, &i); + if (error) + goto out_error; + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, out_error); + + /* growfs: new space may start exactly where the last record ends */ + if (owner == XFS_RMAP_OWN_NULL) { + XFS_WANT_CORRUPTED_GOTO(mp, bno >= ltrec.rm_startblock + + ltrec.rm_blockcount, out_error); + goto out_done; + } + + /* make sure the extent we found covers the entire freeing range. 
*/ + XFS_WANT_CORRUPTED_GOTO(mp, ltrec.rm_startblock <= bno, out_error); + XFS_WANT_CORRUPTED_GOTO(mp, ltrec.rm_blockcount >= len, out_error); + +/* + if (owner != ltrec.rm_owner || + bno > ltrec.rm_startblock + ltrec.rm_blockcount) + */ + //printk("rmfree ag %d bno 0x%x/0x%x/0x%llx, ltrec 0x%x/0x%x/0x%llx\n", + // agno, bno, len, owner, ltrec.rm_startblock, + // ltrec.rm_blockcount, ltrec.rm_owner); + XFS_WANT_CORRUPTED_GOTO(mp, bno + len <= ltrec.rm_startblock + ltrec.rm_blockcount, + out_error); + XFS_WANT_CORRUPTED_GOTO(mp, owner == ltrec.rm_owner || + (owner < XFS_RMAP_OWN_NULL && + owner >= XFS_RMAP_OWN_MIN), out_error); + + /* exact match is easy */ + if (ltrec.rm_startblock == bno && ltrec.rm_blockcount == len) { + //printk("remove exact\n"); + /* remove extent from rmap tree */ + error = xfs_btree_delete(cur, &i); + if (error) + goto out_error; + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, out_error); + } else if (ltrec.rm_startblock == bno) { + //printk("remove left\n"); + /* + * overlap left hand side of extent + * + * ltbno ltlen + * Orig: |oooooooooooooooooooo| + * Freeing: |fffffffff| + * Result: |rrrrrrrrrr| + * bno len + */ + ltrec.rm_startblock += len; + ltrec.rm_blockcount -= len; + error = xfs_rmap_update(cur, &ltrec); + if (error) + goto out_error; + } else if (ltrec.rm_startblock + ltrec.rm_blockcount == bno + len) { + //printk("remove right\n"); + /* + * overlap right hand side of extent + * + * ltbno ltlen + * Orig: |oooooooooooooooooooo| + * Freeing: |fffffffff| + * Result: |rrrrrrrrrr| + * bno len + */ + ltrec.rm_blockcount -= len; + error = xfs_rmap_update(cur, &ltrec); + if (error) + goto out_error; + } else { + /* + * overlap middle of extent + * + * ltbno ltlen + * Orig: |oooooooooooooooooooo| + * Freeing: |fffffffff| + * Result: |rrrrr| |rrrr| + * bno len + */ + xfs_extlen_t orig_len = ltrec.rm_blockcount; + //printk("remove middle\n"); + + ltrec.rm_blockcount = bno - ltrec.rm_startblock; + error = xfs_rmap_update(cur, &ltrec); + if (error) + goto 
out_error; + + error = xfs_btree_increment(cur, 0, &i); + if (error) + goto out_error; + + cur->bc_rec.r.rm_startblock = bno + len; + cur->bc_rec.r.rm_blockcount = orig_len - len - + ltrec.rm_blockcount; + cur->bc_rec.r.rm_owner = ltrec.rm_owner; + error = xfs_btree_insert(cur, &i); + if (error) + goto out_error; + } + +out_done: + trace_xfs_rmap_free_extent_done(mp, agno, bno, len, owner); + xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR); + return 0; + +out_error: + trace_xfs_rmap_free_extent_error(mp, agno, bno, len, owner); + xfs_btree_del_cursor(cur, XFS_BTREE_ERROR); + return error; +} + +/* + * When we allocate a new block, the first thing we do is add a reference to the + * extent in the rmap btree. This is how we track the owner of the extent and the + * number of references to it. + * + * Initially, we do not have shared extents, and so the extent can only have a + * single reference count and owner. This makes the initial implementation easy, + * but does not allow us to use the rmap tree for tracking reflink shared files. + * Hence the initial implementation is simply a lookup to find the place to + * insert (and checking we don't find a duplicate/overlap) and then inserting the + * appropriate record. + */ +int +xfs_rmap_alloc( + struct xfs_trans *tp, + struct xfs_buf *agbp, + xfs_agnumber_t agno, + xfs_agblock_t bno, + xfs_extlen_t len, + uint64_t owner) +{ + struct xfs_btree_cur *cur; + struct xfs_mount *mp = tp->t_mountp; + struct xfs_rmap_irec ltrec; + struct xfs_rmap_irec gtrec; + int have_gt; + int error; + int i; + + /* + * if rmap btree is not supported, then just return success without + * doing anything. + */ + if (!xfs_sb_version_hasrmapbt(&tp->t_mountp->m_sb)) + return 0; + + trace_xfs_rmap_alloc_extent(mp, agno, bno, len, owner); + cur = xfs_rmapbt_init_cursor(mp, tp, agbp, agno); + + /* + * check to see if we find an existing record for this extent rather + * than just the location for insert. 
+ */ + error = xfs_rmap_lookup_le(cur, bno, len, owner, &i); + if (error) + goto out_error; + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, out_error); + + error = xfs_rmap_get_rec(cur, &ltrec, &i); + if (error) + goto out_error; + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, out_error); + //printk("rmalloc ag %d bno 0x%x/0x%x/0x%llx, ltrec 0x%x/0x%x/0x%llx\n", + // agno, bno, len, owner, ltrec.rm_startblock, + // ltrec.rm_blockcount, ltrec.rm_owner); + + XFS_WANT_CORRUPTED_GOTO(mp, ltrec.rm_startblock + ltrec.rm_blockcount <= bno, + out_error); + + error = xfs_btree_increment(cur, 0, &have_gt); + if (error) + goto out_error; + if (have_gt) { + error = xfs_rmap_get_rec(cur, &gtrec, &i); + if (error) + goto out_error; + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, out_error); + //printk("rmalloc ag %d bno 0x%x/0x%x/0x%llx, gtrec 0x%x/0x%x/0x%llx\n", + // agno, bno, len, owner, gtrec.rm_startblock, + // gtrec.rm_blockcount, gtrec.rm_owner); + XFS_WANT_CORRUPTED_GOTO(mp, bno + len <= gtrec.rm_startblock, + out_error); + } else { + gtrec.rm_owner = XFS_RMAP_OWN_NULL; + } + + /* cursor currently points one record past ltrec */ + if (ltrec.rm_owner == owner && + ltrec.rm_startblock + ltrec.rm_blockcount == bno) { + /* + * left edge contiguous + * + * ltbno ltlen + * orig: |ooooooooo| + * adding: |aaaaaaaaa| + * result: |rrrrrrrrrrrrrrrrrrr| + * bno len + */ + //printk("add left\n"); + ltrec.rm_blockcount += len; + if (gtrec.rm_owner == owner && + bno + len == gtrec.rm_startblock) { + //printk("add middle\n"); + /* + * right edge also contiguous + * + * ltbno ltlen gtbno gtlen + * orig: |ooooooooo| |ooooooooo| + * adding: |aaaaaaaaa| + * result: |rrrrrrrrrrrrrrrrrrrrrrrrrrrrr| + */ + ltrec.rm_blockcount += gtrec.rm_blockcount; + error = xfs_btree_delete(cur, &i); + if (error) + goto out_error; + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, out_error); + } + + error = xfs_btree_decrement(cur, 0, &have_gt); + if (error) + goto out_error; + error = xfs_rmap_update(cur, &ltrec); + if (error) + goto out_error; + } else 
if (gtrec.rm_owner == owner && + bno + len == gtrec.rm_startblock) { + /* + * right edge contiguous + * + * gtbno gtlen + * Orig: |ooooooooo| + * adding: |aaaaaaaaa| + * Result: |rrrrrrrrrrrrrrrrrrr| + * bno len + */ + //printk("add right\n"); + gtrec.rm_startblock = bno; + gtrec.rm_blockcount += len; + error = xfs_rmap_update(cur, &gtrec); + if (error) + goto out_error; + } else { + //printk("add no match\n"); + /* no contiguous edge with identical owner */ + cur->bc_rec.r.rm_startblock = bno; + cur->bc_rec.r.rm_blockcount = len; + cur->bc_rec.r.rm_owner = owner; + error = xfs_btree_insert(cur, &i); + if (error) + goto out_error; + } + + trace_xfs_rmap_alloc_extent_done(mp, agno, bno, len, owner); + xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR); + return 0; + +out_error: + trace_xfs_rmap_alloc_extent_error(mp, agno, bno, len, owner); + xfs_btree_del_cursor(cur, XFS_BTREE_ERROR); + return error; +} diff --git a/libxfs/xfs_rmap_btree.c b/libxfs/xfs_rmap_btree.c new file mode 100644 index 000000000..ed1792db2 --- /dev/null +++ b/libxfs/xfs_rmap_btree.c @@ -0,0 +1,404 @@ +/* + * Copyright (c) 2014 Red Hat, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include "libxfs_priv.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" +#include "xfs_bit.h" +#include "xfs_sb.h" +#include "xfs_mount.h" +#include "xfs_inode.h" +#include "xfs_trans.h" +#include "xfs_alloc.h" +#include "xfs_btree.h" +#include "xfs_rmap_btree.h" +#include "xfs_trace.h" +#include "xfs_cksum.h" + + +/* + * Reverse map btree. + * + * This is a per-ag tree used to track the owner of a given extent. Owner + * records are inserted when an extent is allocated, and removed when an extent + * is freed. For existing filesystems, there can only be one owner of an extent, + * usually an inode or some other metadata structure like a AG btree. + * + * Initial thoughts are that the + * value of the owner field needs external flags to define what it means, and + * hence we need a flags field in the record. This means the record is going to + * be larger than 16 bytes (agbno,len,owner = 16 bytes), so maybe this isn't the + * best idea. Initially just implement the owner field - we can probably steal + * bits from the extent length field for type descriptors given that MAXEXTLEN + * is only 21 bits if we want to store the type as well. Keep in mind that if we + * want to do this there are still restrictions on the length of extents we + * track in the rmap btree (see comments on xfs_rmap_free()). + * + * The rmap btree is part of the free space management, so blocks for the tree + * are sourced from the agfl. Hence we need transaction reservation support for + * this tree so that the freelist is always large enough. This also impacts on + * the minimum space we need to leave free in the AG. 
+ * + * The tree is ordered by block number - there's no need to order/search by + * extent size for online updating/management of the tree, and the reverse + * lookups are going to be "who owns this block" and so are by-block ordering is + * perfect for this. + * + * XXX: open question is how to handle blocks that are owned by the freespace + * tree blocks. Right now they will be classified when they are moved to the + * freelist or removed from the freelist. i.e. the extent allocation/freeing + * will mark the extents allocated as owned by the AG. + */ +STATIC struct xfs_btree_cur * +xfs_rmapbt_dup_cursor( + struct xfs_btree_cur *cur) +{ + return xfs_rmapbt_init_cursor(cur->bc_mp, cur->bc_tp, + cur->bc_private.a.agbp, cur->bc_private.a.agno); +} + +STATIC void +xfs_rmapbt_set_root( + struct xfs_btree_cur *cur, + union xfs_btree_ptr *ptr, + int inc) +{ + struct xfs_buf *agbp = cur->bc_private.a.agbp; + struct xfs_agf *agf = XFS_BUF_TO_AGF(agbp); + xfs_agnumber_t seqno = be32_to_cpu(agf->agf_seqno); + int btnum = cur->bc_btnum; + struct xfs_perag *pag = xfs_perag_get(cur->bc_mp, seqno); + + ASSERT(ptr->s != 0); + + agf->agf_roots[btnum] = ptr->s; + be32_add_cpu(&agf->agf_levels[btnum], inc); + pag->pagf_levels[btnum] += inc; + xfs_perag_put(pag); + + xfs_alloc_log_agf(cur->bc_tp, agbp, XFS_AGF_ROOTS | XFS_AGF_LEVELS); +} + +STATIC int +xfs_rmapbt_alloc_block( + struct xfs_btree_cur *cur, + union xfs_btree_ptr *start, + union xfs_btree_ptr *new, + int *stat) +{ + int error; + xfs_agblock_t bno; + + XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY); + + /* Allocate the new block from the freelist. If we can't, give up. 
*/ + error = xfs_alloc_get_freelist(cur->bc_tp, cur->bc_private.a.agbp, + &bno, 1); + if (error) { + XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR); + return error; + } + + if (bno == NULLAGBLOCK) { + XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); + *stat = 0; + return 0; + } + + xfs_extent_busy_reuse(cur->bc_mp, cur->bc_private.a.agno, bno, 1, false); + + xfs_trans_agbtree_delta(cur->bc_tp, 1); + new->s = cpu_to_be32(bno); + + XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); + *stat = 1; + return 0; +} + +STATIC int +xfs_rmapbt_free_block( + struct xfs_btree_cur *cur, + struct xfs_buf *bp) +{ + struct xfs_buf *agbp = cur->bc_private.a.agbp; + struct xfs_agf *agf = XFS_BUF_TO_AGF(agbp); + xfs_agblock_t bno; + int error; + + bno = xfs_daddr_to_agbno(cur->bc_mp, XFS_BUF_ADDR(bp)); + error = xfs_alloc_put_freelist(cur->bc_tp, agbp, NULL, bno, 1); + if (error) + return error; + + xfs_extent_busy_insert(cur->bc_tp, be32_to_cpu(agf->agf_seqno), bno, 1, + XFS_EXTENT_BUSY_SKIP_DISCARD); + xfs_trans_agbtree_delta(cur->bc_tp, -1); + + xfs_trans_binval(cur->bc_tp, bp); + return 0; +} + +STATIC int +xfs_rmapbt_get_minrecs( + struct xfs_btree_cur *cur, + int level) +{ + return cur->bc_mp->m_rmap_mnr[level != 0]; +} + +STATIC int +xfs_rmapbt_get_maxrecs( + struct xfs_btree_cur *cur, + int level) +{ + return cur->bc_mp->m_rmap_mxr[level != 0]; +} + +STATIC void +xfs_rmapbt_init_key_from_rec( + union xfs_btree_key *key, + union xfs_btree_rec *rec) +{ + key->rmap.rm_startblock = rec->rmap.rm_startblock; +} + +STATIC void +xfs_rmapbt_init_rec_from_key( + union xfs_btree_key *key, + union xfs_btree_rec *rec) +{ + rec->rmap.rm_startblock = key->rmap.rm_startblock; +} + +STATIC void +xfs_rmapbt_init_rec_from_cur( + struct xfs_btree_cur *cur, + union xfs_btree_rec *rec) +{ + rec->rmap.rm_startblock = cpu_to_be32(cur->bc_rec.r.rm_startblock); + rec->rmap.rm_blockcount = cpu_to_be32(cur->bc_rec.r.rm_blockcount); + rec->rmap.rm_owner = cpu_to_be64(cur->bc_rec.r.rm_owner); +} + +STATIC void 
+xfs_rmapbt_init_ptr_from_cur( + struct xfs_btree_cur *cur, + union xfs_btree_ptr *ptr) +{ + struct xfs_agf *agf = XFS_BUF_TO_AGF(cur->bc_private.a.agbp); + + ASSERT(cur->bc_private.a.agno == be32_to_cpu(agf->agf_seqno)); + ASSERT(agf->agf_roots[cur->bc_btnum] != 0); + + ptr->s = agf->agf_roots[cur->bc_btnum]; +} + +STATIC __int64_t +xfs_rmapbt_key_diff( + struct xfs_btree_cur *cur, + union xfs_btree_key *key) +{ + struct xfs_rmap_irec *rec = &cur->bc_rec.r; + struct xfs_rmap_key *kp = &key->rmap; + + return (__int64_t)be32_to_cpu(kp->rm_startblock) - rec->rm_startblock; +} + +static bool +xfs_rmapbt_verify( + struct xfs_buf *bp) +{ + struct xfs_mount *mp = bp->b_target->bt_mount; + struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp); + struct xfs_perag *pag = bp->b_pag; + unsigned int level; + + /* + * magic number and level verification + * + * During growfs operations, we can't verify the exact level or owner as + * the perag is not fully initialised and hence not attached to the + * buffer. In this case, check against the maximum tree depth. + * + * Similarly, during log recovery we will have a perag structure + * attached, but the agf information will not yet have been initialised + * from the on disk AGF. Again, we can only check against maximum limits + * in this case. 
+ */ + if (block->bb_magic!= cpu_to_be32(XFS_RMAP_CRC_MAGIC)) + return false; + + if (!xfs_sb_version_hasrmapbt(&mp->m_sb)) + return false; + if (!uuid_equal(&block->bb_u.s.bb_uuid, &mp->m_sb.sb_uuid)) + return false; + if (block->bb_u.s.bb_blkno != cpu_to_be64(bp->b_bn)) + return false; + if (pag && be32_to_cpu(block->bb_u.s.bb_owner) != pag->pag_agno) + return false; + + level = be16_to_cpu(block->bb_level); + if (pag && pag->pagf_init) { + if (level >= pag->pagf_levels[XFS_BTNUM_RMAPi]) + return false; + } else if (level >= mp->m_ag_maxlevels) + return false; + + /* numrecs verification */ + if (be16_to_cpu(block->bb_numrecs) > mp->m_rmap_mxr[level != 0]) + return false; + + /* sibling pointer verification */ + if (!block->bb_u.s.bb_leftsib || + (be32_to_cpu(block->bb_u.s.bb_leftsib) >= mp->m_sb.sb_agblocks && + block->bb_u.s.bb_leftsib != cpu_to_be32(NULLAGBLOCK))) + return false; + if (!block->bb_u.s.bb_rightsib || + (be32_to_cpu(block->bb_u.s.bb_rightsib) >= mp->m_sb.sb_agblocks && + block->bb_u.s.bb_rightsib != cpu_to_be32(NULLAGBLOCK))) + return false; + + return true; +} + +static void +xfs_rmapbt_read_verify( + struct xfs_buf *bp) +{ + if (!xfs_btree_sblock_verify_crc(bp)) + xfs_buf_ioerror(bp, -EFSBADCRC); + else if (!xfs_rmapbt_verify(bp)) + xfs_buf_ioerror(bp, -EFSCORRUPTED); + + if (bp->b_error) { + trace_xfs_btree_corrupt(bp, _RET_IP_); + xfs_verifier_error(bp); + } +} + +static void +xfs_rmapbt_write_verify( + struct xfs_buf *bp) +{ + if (!xfs_rmapbt_verify(bp)) { + trace_xfs_btree_corrupt(bp, _RET_IP_); + xfs_buf_ioerror(bp, -EFSCORRUPTED); + xfs_verifier_error(bp); + return; + } + xfs_btree_sblock_calc_crc(bp); + +} + +const struct xfs_buf_ops xfs_rmapbt_buf_ops = { + .verify_read = xfs_rmapbt_read_verify, + .verify_write = xfs_rmapbt_write_verify, +}; + + +#if defined(DEBUG) || defined(XFS_WARN) +STATIC int +xfs_rmapbt_keys_inorder( + struct xfs_btree_cur *cur, + union xfs_btree_key *k1, + union xfs_btree_key *k2) +{ + return 
be32_to_cpu(k1->rmap.rm_startblock) < + be32_to_cpu(k2->rmap.rm_startblock); +} + +STATIC int +xfs_rmapbt_recs_inorder( + struct xfs_btree_cur *cur, + union xfs_btree_rec *r1, + union xfs_btree_rec *r2) +{ + return be32_to_cpu(r1->rmap.rm_startblock) + + be32_to_cpu(r1->rmap.rm_blockcount) <= + be32_to_cpu(r2->rmap.rm_startblock); +} +#endif /* DEBUG || XFS_WARN */ + +static const struct xfs_btree_ops xfs_rmapbt_ops = { + .rec_len = sizeof(struct xfs_rmap_rec), + .key_len = sizeof(struct xfs_rmap_key), + + .dup_cursor = xfs_rmapbt_dup_cursor, + .set_root = xfs_rmapbt_set_root, + .alloc_block = xfs_rmapbt_alloc_block, + .free_block = xfs_rmapbt_free_block, + .get_minrecs = xfs_rmapbt_get_minrecs, + .get_maxrecs = xfs_rmapbt_get_maxrecs, + .init_key_from_rec = xfs_rmapbt_init_key_from_rec, + .init_rec_from_key = xfs_rmapbt_init_rec_from_key, + .init_rec_from_cur = xfs_rmapbt_init_rec_from_cur, + .init_ptr_from_cur = xfs_rmapbt_init_ptr_from_cur, + .key_diff = xfs_rmapbt_key_diff, + .buf_ops = &xfs_rmapbt_buf_ops, +#if defined(DEBUG) || defined(XFS_WARN) + .keys_inorder = xfs_rmapbt_keys_inorder, + .recs_inorder = xfs_rmapbt_recs_inorder, +#endif +}; + +/* + * Allocate and initialise a new rmap btree cursor for AG @agno. + */ +struct xfs_btree_cur * +xfs_rmapbt_init_cursor( + struct xfs_mount *mp, + struct xfs_trans *tp, + struct xfs_buf *agbp, + xfs_agnumber_t agno) +{ + struct xfs_agf *agf = XFS_BUF_TO_AGF(agbp); + struct xfs_btree_cur *cur; + + cur = kmem_zone_zalloc(xfs_btree_cur_zone, KM_SLEEP); + cur->bc_tp = tp; + cur->bc_mp = mp; + cur->bc_btnum = XFS_BTNUM_RMAP; + cur->bc_flags = XFS_BTREE_CRC_BLOCKS; + cur->bc_blocklog = mp->m_sb.sb_blocklog; + cur->bc_ops = &xfs_rmapbt_ops; + cur->bc_nlevels = be32_to_cpu(agf->agf_levels[XFS_BTNUM_RMAP]); + + cur->bc_private.a.agbp = agbp; + cur->bc_private.a.agno = agno; + + return cur; +} + +/* + * Calculate number of records in an rmap btree block.
+ */ +int +xfs_rmapbt_maxrecs( + struct xfs_mount *mp, + int blocklen, + int leaf) +{ + blocklen -= XFS_RMAP_BLOCK_LEN; + + if (leaf) + return blocklen / sizeof(struct xfs_rmap_rec); + return blocklen / + (sizeof(struct xfs_rmap_key) + sizeof(xfs_rmap_ptr_t)); +} diff --git a/libxfs/xfs_rmap_btree.h b/libxfs/xfs_rmap_btree.h new file mode 100644 index 000000000..9ad65e50b --- /dev/null +++ b/libxfs/xfs_rmap_btree.h @@ -0,0 +1,65 @@ +/* + * Copyright (c) 2014 Red Hat, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#ifndef __XFS_RMAP_BTREE_H__ +#define __XFS_RMAP_BTREE_H__ + +/* + * Freespace on-disk structures + */ + +struct xfs_buf; +struct xfs_btree_cur; +struct xfs_mount; + +/* rmaps only exist on crc enabled filesystems */ +#define XFS_RMAP_BLOCK_LEN XFS_BTREE_SBLOCK_CRC_LEN + +/* + * Record, key, and pointer address macros for btree blocks. 
+ * + * (note that some of these may appear unused, but they are used in userspace) + */ +#define XFS_RMAP_REC_ADDR(block, index) \ + ((struct xfs_rmap_rec *) \ + ((char *)(block) + XFS_RMAP_BLOCK_LEN + \ + (((index) - 1) * sizeof(struct xfs_rmap_rec)))) + +#define XFS_RMAP_KEY_ADDR(block, index) \ + ((struct xfs_rmap_key *) \ + ((char *)(block) + XFS_RMAP_BLOCK_LEN + \ + ((index) - 1) * sizeof(struct xfs_rmap_key))) + +#define XFS_RMAP_PTR_ADDR(block, index, maxrecs) \ + ((xfs_rmap_ptr_t *) \ + ((char *)(block) + XFS_RMAP_BLOCK_LEN + \ + (maxrecs) * sizeof(struct xfs_rmap_key) + \ + ((index) - 1) * sizeof(xfs_rmap_ptr_t))) + +struct xfs_btree_cur *xfs_rmapbt_init_cursor(struct xfs_mount *mp, + struct xfs_trans *tp, struct xfs_buf *bp, + xfs_agnumber_t agno); +int xfs_rmapbt_maxrecs(struct xfs_mount *mp, int blocklen, int leaf); + +int xfs_rmap_alloc(struct xfs_trans *tp, struct xfs_buf *agbp, + xfs_agnumber_t agno, xfs_agblock_t bno, xfs_extlen_t len, + uint64_t owner); +int xfs_rmap_free(struct xfs_trans *tp, struct xfs_buf *agbp, + xfs_agnumber_t agno, xfs_agblock_t bno, xfs_extlen_t len, + uint64_t owner); + +#endif /* __XFS_RMAP_BTREE_H__ */ diff --git a/libxfs/xfs_sb.c b/libxfs/xfs_sb.c index 6844cd8c3..e5c7a8534 100644 --- a/libxfs/xfs_sb.c +++ b/libxfs/xfs_sb.c @@ -668,6 +668,11 @@ xfs_sb_mount_common( mp->m_bmap_dmnr[0] = mp->m_bmap_dmxr[0] / 2; mp->m_bmap_dmnr[1] = mp->m_bmap_dmxr[1] / 2; + mp->m_rmap_mxr[0] = xfs_rmapbt_maxrecs(mp, sbp->sb_blocksize, 1); + mp->m_rmap_mxr[1] = xfs_rmapbt_maxrecs(mp, sbp->sb_blocksize, 0); + mp->m_rmap_mnr[0] = mp->m_rmap_mxr[0] / 2; + mp->m_rmap_mnr[1] = mp->m_rmap_mxr[1] / 2; + mp->m_bsize = XFS_FSB_TO_BB(mp, 1); mp->m_ialloc_inos = (int)MAX((__uint16_t)XFS_INODES_PER_CHUNK, sbp->sb_inopblock); diff --git a/libxfs/xfs_shared.h b/libxfs/xfs_shared.h index 8dda4b321..e8e88f3e5 100644 --- a/libxfs/xfs_shared.h +++ b/libxfs/xfs_shared.h @@ -38,6 +38,7 @@ extern const struct xfs_buf_ops xfs_agi_buf_ops; extern const struct 
xfs_buf_ops xfs_agf_buf_ops; extern const struct xfs_buf_ops xfs_agfl_buf_ops; extern const struct xfs_buf_ops xfs_allocbt_buf_ops; +extern const struct xfs_buf_ops xfs_rmapbt_buf_ops; extern const struct xfs_buf_ops xfs_attr3_leaf_buf_ops; extern const struct xfs_buf_ops xfs_attr3_rmt_buf_ops; extern const struct xfs_buf_ops xfs_bmbt_buf_ops; diff --git a/libxfs/xfs_types.h b/libxfs/xfs_types.h index b79dc66b2..3d503647f 100644 --- a/libxfs/xfs_types.h +++ b/libxfs/xfs_types.h @@ -108,8 +108,8 @@ typedef enum { } xfs_lookup_t; typedef enum { - XFS_BTNUM_BNOi, XFS_BTNUM_CNTi, XFS_BTNUM_BMAPi, XFS_BTNUM_INOi, - XFS_BTNUM_FINOi, XFS_BTNUM_MAX + XFS_BTNUM_BNOi, XFS_BTNUM_CNTi, XFS_BTNUM_RMAPi, XFS_BTNUM_BMAPi, + XFS_BTNUM_INOi, XFS_BTNUM_FINOi, XFS_BTNUM_MAX } xfs_btnum_t; struct xfs_name { diff --git a/mkfs/xfs_mkfs.c b/mkfs/xfs_mkfs.c index 17706664a..c42581b9c 100644 --- a/mkfs/xfs_mkfs.c +++ b/mkfs/xfs_mkfs.c @@ -185,6 +185,8 @@ char *mopts[] = { "crc", #define M_FINOBT 1 "finobt", +#define M_RMAPBT 2 + "rmapbt", NULL }; @@ -1004,6 +1006,7 @@ main( int lazy_sb_counters; int crcs_enabled; int finobt; + bool rmapbt; progname = basename(argv[0]); setlocale(LC_ALL, ""); @@ -1038,6 +1041,7 @@ main( lazy_sb_counters = 1; crcs_enabled = 0; finobt = 0; + rmapbt = false; memset(&fsx, 0, sizeof(fsx)); memset(&xi, 0, sizeof(xi)); @@ -1539,6 +1543,14 @@ _("cannot specify both crc and ftype\n")); illegal(value, "m finobt"); finobt = c; break; + case M_RMAPBT: + if (!value || *value == '\0') + reqval('m', mopts, M_RMAPBT); + c = atoi(value); + if (c < 0 || c > 1) + illegal(value, "m rmapbt"); + rmapbt = c; + break; default: unknown('m', value); } @@ -1889,6 +1901,11 @@ _("32 bit Project IDs always enabled on CRC enabled filesytems\n")); _("warning: finobt not supported without CRC support, disabled.\n")); finobt = 0; } + if (rmapbt && !crcs_enabled) { + fprintf(stderr, +_("warning: rmapbt not supported without CRC support, disabled.\n")); + rmapbt = 0; + } if (nsflag || nlflag) {
if (dirblocksize < blocksize || @@ -2483,7 +2500,7 @@ _("size %s specified for log subvolume is too large, maximum is %lld blocks\n"), mp->m_sectbb_log = sbp->sb_sectlog - BBSHIFT; /* - * sb_versionnum and finobt flags must be set before we use + * sb_versionnum, finobt and rmapbt flags must be set before we use * XFS_PREALLOC_BLOCKS(). */ sbp->sb_features2 = XFS_SB_VERSION2_MKFS(crcs_enabled, lazy_sb_counters, @@ -2505,6 +2522,8 @@ _("size %s specified for log subvolume is too large, maximum is %lld blocks\n"), if (finobt) sbp->sb_features_ro_compat = XFS_SB_FEAT_RO_COMPAT_FINOBT; + if (rmapbt) + sbp->sb_features_ro_compat |= XFS_SB_FEAT_RO_COMPAT_RMAPBT; if (loginternal) { /* @@ -2568,7 +2587,7 @@ _("size %s specified for log subvolume is too large, maximum is %lld blocks\n"), printf(_( "meta-data=%-22s isize=%-6d agcount=%lld, agsize=%lld blks\n" " =%-22s sectsz=%-5u attr=%u, projid32bit=%u\n" - " =%-22s crc=%-8u finobt=%u\n" + " =%-22s crc=%-8u finobt=%u, rmapbt=%u\n" "data =%-22s bsize=%-6u blocks=%llu, imaxpct=%u\n" " =%-22s sunit=%-6u swidth=%u blks\n" "naming =version %-14u bsize=%-6u ascii-ci=%d ftype=%d\n" @@ -2577,7 +2596,7 @@ _("size %s specified for log subvolume is too large, maximum is %lld blocks\n"), "realtime =%-22s extsz=%-6d blocks=%lld, rtextents=%lld\n"), dfile, isize, (long long)agcount, (long long)agsize, "", sectorsize, attrversion, !projid16bit, - "", crcs_enabled, finobt, + "", crcs_enabled, finobt, rmapbt, "", blocksize, (long long)dblocks, imaxpct, "", dsunit, dswidth, dirversion, dirblocksize, nci, dirftype, @@ -2748,6 +2767,12 @@ _("size %s specified for log subvolume is too large, maximum is %lld blocks\n"), agf->agf_roots[XFS_BTNUM_CNTi] = cpu_to_be32(XFS_CNT_BLOCK(mp)); agf->agf_levels[XFS_BTNUM_BNOi] = cpu_to_be32(1); agf->agf_levels[XFS_BTNUM_CNTi] = cpu_to_be32(1); + if (xfs_sb_version_hasrmapbt(&mp->m_sb)) { + agf->agf_roots[XFS_BTNUM_RMAPi] = + cpu_to_be32(XFS_RMAP_BLOCK(mp)); + agf->agf_levels[XFS_BTNUM_RMAPi] = 
cpu_to_be32(1); + } + agf->agf_flfirst = 0; agf->agf_fllast = cpu_to_be32(XFS_AGFL_SIZE(mp) - 1); agf->agf_flcount = 0; @@ -2935,22 +2960,83 @@ _("size %s specified for log subvolume is too large, maximum is %lld blocks\n"), /* * Free INO btree root block */ - if (!finobt) - continue; + if (finobt) { + buf = libxfs_getbuf(mp->m_ddev_targp, + XFS_AGB_TO_DADDR(mp, agno, XFS_FIBT_BLOCK(mp)), + bsize); + buf->b_ops = &xfs_inobt_buf_ops; + block = XFS_BUF_TO_BLOCK(buf); + memset(block, 0, blocksize); + if (xfs_sb_version_hascrc(&mp->m_sb)) + xfs_btree_init_block(mp, buf, XFS_FIBT_CRC_MAGIC, 0, 0, + agno, XFS_BTREE_CRC_BLOCKS); + else + xfs_btree_init_block(mp, buf, XFS_FIBT_MAGIC, 0, 0, + agno, 0); + libxfs_writebuf(buf, LIBXFS_EXIT_ON_FAILURE); + } - buf = libxfs_getbuf(mp->m_ddev_targp, - XFS_AGB_TO_DADDR(mp, agno, XFS_FIBT_BLOCK(mp)), + /* RMAP btree root block */ + if (rmapbt) { + struct xfs_rmap_rec *rrec; + + buf = libxfs_getbuf(mp->m_ddev_targp, + XFS_AGB_TO_DADDR(mp, agno, XFS_RMAP_BLOCK(mp)), bsize); - buf->b_ops = &xfs_inobt_buf_ops; - block = XFS_BUF_TO_BLOCK(buf); - memset(block, 0, blocksize); - if (xfs_sb_version_hascrc(&mp->m_sb)) - xfs_btree_init_block(mp, buf, XFS_FIBT_CRC_MAGIC, 0, 0, + buf->b_ops = &xfs_rmapbt_buf_ops; + block = XFS_BUF_TO_BLOCK(buf); + memset(block, 0, blocksize); + + xfs_btree_init_block(mp, buf, XFS_RMAP_CRC_MAGIC, 0, 0, agno, XFS_BTREE_CRC_BLOCKS); - else - xfs_btree_init_block(mp, buf, XFS_FIBT_MAGIC, 0, 0, - agno, 0); - libxfs_writebuf(buf, LIBXFS_EXIT_ON_FAILURE); + + /* + * mark the AG header regions as static metadata + * The BNO btree block is the first block after the + * headers, so it's location defines the size of region + * the static metadata consumes. 
+ */ + rrec = XFS_RMAP_REC_ADDR(block, 1); + rrec->rm_startblock = 0; + rrec->rm_blockcount = cpu_to_be32(XFS_BNO_BLOCK(mp)); + rrec->rm_owner = cpu_to_be64(XFS_RMAP_OWN_FS); + be16_add_cpu(&block->bb_numrecs, 1); + + /* account freespace btree root blocks */ + rrec = XFS_RMAP_REC_ADDR(block, 2); + rrec->rm_startblock = cpu_to_be32(XFS_BNO_BLOCK(mp)); + rrec->rm_blockcount = cpu_to_be32(2); + rrec->rm_owner = cpu_to_be64(XFS_RMAP_OWN_AG); + be16_add_cpu(&block->bb_numrecs, 1); + + /* account inode btree root blocks */ + rrec = XFS_RMAP_REC_ADDR(block, 3); + rrec->rm_startblock = cpu_to_be32(XFS_IBT_BLOCK(mp)); + rrec->rm_blockcount = cpu_to_be32(XFS_RMAP_BLOCK(mp) - + XFS_IBT_BLOCK(mp)); + rrec->rm_owner = cpu_to_be64(XFS_RMAP_OWN_INOBT); + be16_add_cpu(&block->bb_numrecs, 1); + + /* account for rmap btree root */ + rrec = XFS_RMAP_REC_ADDR(block, 4); + rrec->rm_startblock = cpu_to_be32(XFS_RMAP_BLOCK(mp)); + rrec->rm_blockcount = cpu_to_be32(1); + rrec->rm_owner = cpu_to_be64(XFS_RMAP_OWN_AG); + be16_add_cpu(&block->bb_numrecs, 1); + + /* account for the log space */ + if (loginternal && agno == logagno) { + rrec = XFS_RMAP_REC_ADDR(block, 5); + rrec->rm_startblock = cpu_to_be32( + XFS_FSB_TO_AGBNO(mp, logstart)); + rrec->rm_blockcount = cpu_to_be32(logblocks); + rrec->rm_owner = cpu_to_be64(XFS_RMAP_OWN_LOG); + be16_add_cpu(&block->bb_numrecs, 1); + } + + libxfs_writebuf(buf, LIBXFS_EXIT_ON_FAILURE); + } + } /* diff --git a/repair/dinode.c b/repair/dinode.c index 179203ee1..fc8bc128a 100644 --- a/repair/dinode.c +++ b/repair/dinode.c @@ -744,6 +744,7 @@ _("%s fork in ino %" PRIu64 " claims dup extent, " _("%s fork in ino %" PRIu64 " claims free block %" PRIu64 "\n"), forkname, ino, (__uint64_t) b); /* fall through ... 
*/ + case XR_E_INUSE1: /* seen by rmap */ case XR_E_UNKNOWN: set_bmap_ext(agno, agbno, blen, XR_E_INUSE); break; @@ -751,6 +752,11 @@ _("%s fork in ino %" PRIu64 " claims free block %" PRIu64 "\n"), case XR_E_BAD_STATE: do_error(_("bad state in block map %" PRIu64 "\n"), b); + case XR_E_FS_MAP1: + case XR_E_INO1: + case XR_E_INUSE_FS1: + do_warn(_("rmap claims metadata use!\n")); + /* fall through */ case XR_E_FS_MAP: case XR_E_INO: case XR_E_INUSE_FS: diff --git a/repair/incore.h b/repair/incore.h index ba819b4ec..b5c00879c 100644 --- a/repair/incore.h +++ b/repair/incore.h @@ -102,17 +102,11 @@ typedef struct rt_extent_tree_node { #define XR_E_MULT 5 /* extent is multiply referenced */ #define XR_E_INO 6 /* extent used by inodes (inode blocks) */ #define XR_E_FS_MAP 7 /* extent used by fs space/inode maps */ -#define XR_E_BAD_STATE 8 - -/* extent states, in 64 bit word chunks */ -#define XR_E_UNKNOWN_LL 0x0000000000000000LL -#define XR_E_FREE1_LL 0x1111111111111111LL -#define XR_E_FREE_LL 0x2222222222222222LL -#define XR_E_INUSE_LL 0x3333333333333333LL -#define XR_E_INUSE_FS_LL 0x4444444444444444LL -#define XR_E_MULT_LL 0x5555555555555555LL -#define XR_E_INO_LL 0x6666666666666666LL -#define XR_E_FS_MAP_LL 0x7777777777777777LL +#define XR_E_INUSE1 8 /* used block (marked by rmap btree) */ +#define XR_E_INUSE_FS1 9 /* used by fs ag header or log (rmap btree) */ +#define XR_E_INO1 10 /* used by inodes (marked by rmap btree) */ +#define XR_E_FS_MAP1 11 /* used by fs space/inode maps (rmap btree) */ +#define XR_E_BAD_STATE 12 /* separate state bit, OR'ed into high (4th) bit of ex_state field */ diff --git a/repair/scan.c b/repair/scan.c index e7e05d188..3c00660da 100644 --- a/repair/scan.c +++ b/repair/scan.c @@ -44,6 +44,7 @@ struct aghdr_cnts { __uint32_t agicount; __uint32_t agifreecount; __uint64_t fdblocks; + __uint64_t usedblocks; __uint64_t icount; __uint64_t ifreecount; __uint32_t fibtfreecount; @@ -292,6 +293,13 @@ _("bad back (left) sibling pointer (saw %llu 
should be NULL (0))\n" pthread_mutex_lock(&ag_locks[agno].lock); state = get_bmap(agno, agbno); switch (state) { + case XR_E_INUSE1: + /* + * block was claimed as in use data by the rmap + * btree, but has not been found in the data extent + * map for the inode. That means this bmbt block hasn't + * yet been claimed as in use, which means -it's ours- + */ case XR_E_UNKNOWN: case XR_E_FREE1: case XR_E_FREE: @@ -737,6 +745,251 @@ _("%s freespace btree block claimed (state %d), agno %d, bno %d, suspect %d\n"), } } +static void +scan_rmapbt( + struct xfs_btree_block *block, + int level, + xfs_agblock_t bno, + xfs_agnumber_t agno, + int suspect, + int isroot, + __uint32_t magic, + void *priv) +{ + struct aghdr_cnts *agcnts = priv; + const char *name = "rmap"; + int i; + xfs_rmap_ptr_t *pp; + struct xfs_rmap_rec *rp; + int hdr_errors = 0; + int numrecs; + int state; + xfs_agblock_t lastblock = 0; + + if (magic != XFS_RMAP_CRC_MAGIC) { + name = "(unknown)"; + assert(0); + } + + if (be32_to_cpu(block->bb_magic) != magic) { + do_warn(_("bad magic # %#x in bt%s block %d/%d\n"), + be32_to_cpu(block->bb_magic), name, agno, bno); + hdr_errors++; + if (suspect) + return; + } + + /* + * All RMAP btree blocks except the roots are freed for a + * fully empty filesystem, thus they are counted towards the + * free data block counter. 
+ */ + if (!isroot) { + agcnts->agfbtreeblks++; + agcnts->fdblocks++; + } + + if (be16_to_cpu(block->bb_level) != level) { + do_warn(_("expected level %d got %d in bt%s block %d/%d\n"), + level, be16_to_cpu(block->bb_level), name, agno, bno); + hdr_errors++; + if (suspect) + return; + } + + /* check for btree blocks multiply claimed */ + state = get_bmap(agno, bno); + if (!(state == XR_E_UNKNOWN || state == XR_E_FS_MAP1)) { + set_bmap(agno, bno, XR_E_MULT); + do_warn( +_("%s rmap btree block claimed (state %d), agno %d, bno %d, suspect %d\n"), + name, state, agno, bno, suspect); + return; + } + set_bmap(agno, bno, XR_E_FS_MAP); + + numrecs = be16_to_cpu(block->bb_numrecs); + if (level == 0) { + if (numrecs > mp->m_rmap_mxr[0]) { + numrecs = mp->m_rmap_mxr[0]; + hdr_errors++; + } + if (isroot == 0 && numrecs < mp->m_rmap_mnr[0]) { + numrecs = mp->m_rmap_mnr[0]; + hdr_errors++; + } + + if (hdr_errors) { + do_warn( + _("bad btree nrecs (%u, min=%u, max=%u) in bt%s block %u/%u\n"), + be16_to_cpu(block->bb_numrecs), + mp->m_rmap_mnr[0], mp->m_rmap_mxr[0], + name, agno, bno); + suspect++; + } + + rp = XFS_RMAP_REC_ADDR(block, 1); + for (i = 0; i < numrecs; i++) { + xfs_agblock_t b, end; + xfs_extlen_t len, blen; + int64_t owner; + + b = be32_to_cpu(rp[i].rm_startblock); + len = be32_to_cpu(rp[i].rm_blockcount); + owner = be64_to_cpu(rp[i].rm_owner); + end = b + len; + + if (!verify_agbno(mp, agno, b)) { + do_warn( + _("invalid start block %u in record %u of %s btree block %u/%u\n"), + b, i, name, agno, bno); + continue; + } + if (len == 0 || !verify_agbno(mp, agno, end - 1)) { + do_warn( + _("invalid length %u in record %u of %s btree block %u/%u\n"), + len, i, name, agno, bno); + continue; + } + + /* XXX: range check owner */ + + if (b && b <= lastblock) { + do_warn(_( + "out-of-order rmap btree record %d (%u %u) block %u/%u\n"), + i, b, len, agno, bno); + } else { + lastblock = b; + } + + for ( ; b < end; b += blen) { + state = get_bmap_ext(agno, b, end, &blen); + 
switch (state) { + case XR_E_UNKNOWN: + switch (owner) { + case XFS_RMAP_OWN_FS: + case XFS_RMAP_OWN_LOG: + set_bmap_ext(agno, b, blen, XR_E_INUSE_FS1); + break; + case XFS_RMAP_OWN_AG: + case XFS_RMAP_OWN_INOBT: + set_bmap_ext(agno, b, blen, XR_E_FS_MAP1); + break; + case XFS_RMAP_OWN_INODES: + set_bmap_ext(agno, b, blen, XR_E_INO1); + break; + case XFS_RMAP_OWN_NULL: + /* still unknown */ + break; + default: + /* file data */ + set_bmap_ext(agno, b, blen, XR_E_INUSE1); + break; + } + break; + case XR_E_INUSE_FS: + if (owner == XFS_RMAP_OWN_FS || + owner == XFS_RMAP_OWN_LOG) + break; + do_warn( +_("Static meta block (%d,%d-%d) mismatch in %s tree, state - %d,%" PRIx64 "\n"), + agno, b, b + blen - 1, + name, state, owner); + break; + case XR_E_FS_MAP: + if (owner == XFS_RMAP_OWN_AG || + owner == XFS_RMAP_OWN_INOBT) + break; + do_warn( +_("AG meta block (%d,%d-%d) mismatch in %s tree, state - %d,%" PRIx64 "\n"), + agno, b, b + blen - 1, + name, state, owner); + break; + case XR_E_INO: + if (owner == XFS_RMAP_OWN_INODES) + break; + do_warn( +_("inode block (%d,%d-%d) mismatch in %s tree, state - %d,%" PRIx64 "\n"), + agno, b, b + blen - 1, + name, state, owner); + break; + case XR_E_INUSE: + if (owner >= 0 && + owner < mp->m_sb.sb_dblocks) + break; + do_warn( +_("in use block (%d,%d-%d) mismatch in %s tree, state - %d,%" PRIx64 "\n"), + agno, b, b + blen - 1, + name, state, owner); + break; + case XR_E_FREE1: + case XR_E_FREE: + /* + * May be on the AGFL. If not, they'll + * be caught later.
+ */ + break; + default: + do_warn( +_("unknown block (%d,%d-%d) mismatch on %s tree, state - %d,%" PRIx64 "\n"), + agno, b, b + blen - 1, + name, state, owner); + break; + } + } + } + return; + } + + /* + * interior record + */ + pp = XFS_RMAP_PTR_ADDR(block, 1, mp->m_rmap_mxr[1]); + + if (numrecs > mp->m_rmap_mxr[1]) { + numrecs = mp->m_rmap_mxr[1]; + hdr_errors++; + } + if (isroot == 0 && numrecs < mp->m_rmap_mnr[1]) { + numrecs = mp->m_rmap_mnr[1]; + hdr_errors++; + } + + /* + * don't pass bogus tree flag down further if this block + * looked ok. bail out if two levels in a row look bad. + */ + if (hdr_errors) { + do_warn( + _("bad btree nrecs (%u, min=%u, max=%u) in bt%s block %u/%u\n"), + be16_to_cpu(block->bb_numrecs), + mp->m_rmap_mnr[1], mp->m_rmap_mxr[1], + name, agno, bno); + if (suspect) + return; + suspect++; + } else if (suspect) { + suspect = 0; + } + + for (i = 0; i < numrecs; i++) { + xfs_agblock_t bno = be32_to_cpu(pp[i]); + + /* + * XXX - put sibling detection right here. + * we know our sibling chain is good. So as we go, + * we check the entry before and after each entry. + * If either of the entries references a different block, + * check the sibling pointer. If there's a sibling + * pointer mismatch, try and extract as much data + * as possible. 
+ */ + if (bno != 0 && verify_agbno(mp, agno, bno)) { + scan_sbtree(bno, level, agno, suspect, scan_rmapbt, 0, + magic, priv, &xfs_rmapbt_buf_ops); + } + } +} static int scan_single_ino_chunk( xfs_agnumber_t agno, @@ -814,20 +1067,27 @@ _("bad ending inode # (%" PRIu64 " (0x%x 0x%zx)) in ino rec, skipping rec\n"), agbno = XFS_AGINO_TO_AGBNO(mp, ino + j); state = get_bmap(agno, agbno); - if (state == XR_E_UNKNOWN) { - set_bmap(agno, agbno, XR_E_INO); - } else if (state == XR_E_INUSE_FS && agno == 0 && - ino + j >= first_prealloc_ino && - ino + j < last_prealloc_ino) { + switch (state) { + case XR_E_INO: + break; + case XR_E_UNKNOWN: + case XR_E_INO1: /* seen by rmap */ set_bmap(agno, agbno, XR_E_INO); - } else { + break; + case XR_E_INUSE_FS: + case XR_E_INUSE_FS1: + if (agno == 0 && + ino + j >= first_prealloc_ino && + ino + j < last_prealloc_ino) { + set_bmap(agno, agbno, XR_E_INO); + break; + } + /* fall through */ + default: + /* XXX - maybe should mark block a duplicate */ do_warn( _("inode chunk claims used block, inobt block - agno %d, bno %d, inopb %d\n"), agno, agbno, mp->m_sb.sb_inopblock); - /* - * XXX - maybe should mark - * block a duplicate - */ return ++suspect; } } @@ -973,19 +1233,35 @@ _("bad ending inode # (%" PRIu64 " (0x%x 0x%zx)) in finobt rec, skipping rec\n") agbno = XFS_AGINO_TO_AGBNO(mp, ino + j); state = get_bmap(agno, agbno); - if (state == XR_E_INO) { - continue; - } else if ((state == XR_E_UNKNOWN) || - (state == XR_E_INUSE_FS && agno == 0 && - ino + j >= first_prealloc_ino && - ino + j < last_prealloc_ino)) { + switch (state) { + case XR_E_INO: + break; + case XR_E_INO1: /* seen by rmap */ + set_bmap(agno, agbno, XR_E_INO); + break; + case XR_E_UNKNOWN: do_warn( _("inode chunk claims untracked block, finobt block - agno %d, bno %d, inopb %d\n"), agno, agbno, mp->m_sb.sb_inopblock); set_bmap(agno, agbno, XR_E_INO); suspect++; - } else { + break; + case XR_E_INUSE_FS: + case XR_E_INUSE_FS1: + if (agno == 0 && + ino + j >= 
first_prealloc_ino && + ino + j < last_prealloc_ino) { + do_warn( +_("inode chunk claims untracked block, finobt block - agno %d, bno %d, inopb %d\n"), + agno, agbno, mp->m_sb.sb_inopblock); + + set_bmap(agno, agbno, XR_E_INO); + suspect++; + break; + } + /* fall through */ + default: do_warn( _("inode chunk claims used block, finobt block - agno %d, bno %d, inopb %d\n"), agno, agbno, mp->m_sb.sb_inopblock); @@ -1163,6 +1439,7 @@ scan_inobt( */ state = get_bmap(agno, bno); switch (state) { + case XR_E_FS_MAP1: /* already been seen by an rmap scan */ case XR_E_UNKNOWN: case XR_E_FREE1: case XR_E_FREE: @@ -1296,7 +1573,7 @@ scan_freelist( if (XFS_SB_BLOCK(mp) != XFS_AGFL_BLOCK(mp) && XFS_AGF_BLOCK(mp) != XFS_AGFL_BLOCK(mp) && XFS_AGI_BLOCK(mp) != XFS_AGFL_BLOCK(mp)) - set_bmap(agno, XFS_AGFL_BLOCK(mp), XR_E_FS_MAP); + set_bmap(agno, XFS_AGFL_BLOCK(mp), XR_E_INUSE_FS); if (be32_to_cpu(agf->agf_flcount) == 0) return; @@ -1381,6 +1658,19 @@ validate_agf( bno, agno); } + if (xfs_sb_version_hasrmapbt(&mp->m_sb)) { + bno = be32_to_cpu(agf->agf_roots[XFS_BTNUM_RMAP]); + if (bno != 0 && verify_agbno(mp, agno, bno)) { + scan_sbtree(bno, + be32_to_cpu(agf->agf_levels[XFS_BTNUM_RMAP]), + agno, 0, scan_rmapbt, 1, XFS_RMAP_CRC_MAGIC, + agcnts, &xfs_rmapbt_buf_ops); + } else { + do_warn(_("bad agbno %u for rmapbt root, agno %d\n"), + bno, agno); + } + } + if (be32_to_cpu(agf->agf_freeblks) != agcnts->agffreeblks) { do_warn(_("agf_freeblks %u, counted %u in ag %u\n"), be32_to_cpu(agf->agf_freeblks), agcnts->agffreeblks, agno); @@ -1396,6 +1686,7 @@ validate_agf( do_warn(_("agf_btreeblks %u, counted %" PRIu64 " in ag %u\n"), be32_to_cpu(agf->agf_btreeblks), agcnts->agfbtreeblks, agno); } + } static void @@ -1635,6 +1926,7 @@ scan_ags( __uint64_t fdblocks = 0; __uint64_t icount = 0; __uint64_t ifreecount = 0; + __uint64_t usedblocks = 0; xfs_agnumber_t i; work_queue_t wq; @@ -1657,6 +1949,7 @@ scan_ags( fdblocks += agcnts[i].fdblocks; icount += agcnts[i].icount; ifreecount += 
agcnts[i].ifreecount; + usedblocks += agcnts[i].usedblocks; } free(agcnts); @@ -1678,5 +1971,11 @@ scan_ags( do_warn(_("sb_fdblocks %" PRIu64 ", counted %" PRIu64 "\n"), mp->m_sb.sb_fdblocks, fdblocks); } + + if (usedblocks && + usedblocks != mp->m_sb.sb_dblocks - fdblocks) { + do_warn(_("used blocks %" PRIu64 ", counted %" PRIu64 "\n"), + mp->m_sb.sb_dblocks - fdblocks, usedblocks); + } } diff --git a/repair/xfs_repair.c b/repair/xfs_repair.c index 11a6069ac..07ddd0045 100644 --- a/repair/xfs_repair.c +++ b/repair/xfs_repair.c @@ -411,6 +411,8 @@ calc_mkfs(xfs_mount_t *mp) fino_bno = inobt_root + XFS_MIN_FREELIST_RAW(1, 1, mp) + 1; if (xfs_sb_version_hasfinobt(&mp->m_sb)) fino_bno++; + if (xfs_sb_version_hasrmapbt(&mp->m_sb)) + fino_bno++; /* * If the log is allocated in the first allocation group we need to