From ea364865a277293b8f0495ad40f1a883456a7f84 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Wed, 3 Jul 2024 14:22:24 -0700 Subject: [PATCH] xfs_repair: rebuild the realtime rmap btree Rebuild the realtime rmap btree file from the reverse mapping records we gathered from walking the inodes. Signed-off-by: Darrick J. Wong --- libxfs/libxfs_api_defs.h | 8 ++ repair/Makefile | 1 + repair/bulkload.c | 41 ++++++ repair/bulkload.h | 2 + repair/phase6.c | 181 ++++++++++++++++++++++++++ repair/rmap.c | 26 ++++ repair/rmap.h | 1 + repair/rtrmap_repair.c | 265 +++++++++++++++++++++++++++++++++++++++ repair/xfs_repair.c | 8 +- 9 files changed, 531 insertions(+), 2 deletions(-) create mode 100644 repair/rtrmap_repair.c diff --git a/libxfs/libxfs_api_defs.h b/libxfs/libxfs_api_defs.h index fb4c81cab..9bc5b7aaf 100644 --- a/libxfs/libxfs_api_defs.h +++ b/libxfs/libxfs_api_defs.h @@ -271,6 +271,7 @@ #define xfs_rmap_irec_offset_unpack libxfs_rmap_irec_offset_unpack #define xfs_rmap_lookup_le libxfs_rmap_lookup_le #define xfs_rmap_lookup_le_range libxfs_rmap_lookup_le_range +#define xfs_rmap_map_extent libxfs_rmap_map_extent #define xfs_rmap_map_raw libxfs_rmap_map_raw #define xfs_rmap_query_all libxfs_rmap_query_all #define xfs_rmap_query_range libxfs_rmap_query_range @@ -279,9 +280,12 @@ #define xfs_rtbitmap_setword libxfs_rtbitmap_setword #define xfs_rtbitmap_wordcount libxfs_rtbitmap_wordcount +#define xfs_rtginode_create libxfs_rtginode_create #define xfs_rtginode_irele libxfs_rtginode_irele #define xfs_rtginode_load libxfs_rtginode_load #define xfs_rtginode_load_parent libxfs_rtginode_load_parent +#define xfs_rtginode_mkdir_parent libxfs_rtginode_mkdir_parent +#define xfs_rtginode_name libxfs_rtginode_name #define xfs_suminfo_add libxfs_suminfo_add #define xfs_suminfo_get libxfs_suminfo_get @@ -293,12 +297,16 @@ #define xfs_rtgroup_get libxfs_rtgroup_get #define xfs_rtgroup_put libxfs_rtgroup_put #define xfs_rtgroup_update_super libxfs_rtgroup_update_super +#define 
xfs_rtrmapbt_calc_size libxfs_rtrmapbt_calc_size +#define xfs_rtrmapbt_commit_staged_btree libxfs_rtrmapbt_commit_staged_btree +#define xfs_rtrmapbt_create libxfs_rtrmapbt_create #define xfs_rtrmapbt_droot_maxrecs libxfs_rtrmapbt_droot_maxrecs #define xfs_rtrmapbt_maxlevels_ondisk libxfs_rtrmapbt_maxlevels_ondisk #define xfs_rtrmapbt_init_cursor libxfs_rtrmapbt_init_cursor #define xfs_rtrmapbt_maxrecs libxfs_rtrmapbt_maxrecs #define xfs_rtrmapbt_mem_init libxfs_rtrmapbt_mem_init #define xfs_rtrmapbt_mem_cursor libxfs_rtrmapbt_mem_cursor +#define xfs_rtrmapbt_stage_cursor libxfs_rtrmapbt_stage_cursor #define xfs_sb_from_disk libxfs_sb_from_disk #define xfs_sb_quota_from_disk libxfs_sb_quota_from_disk diff --git a/repair/Makefile b/repair/Makefile index e7979a817..84ee9a4ac 100644 --- a/repair/Makefile +++ b/repair/Makefile @@ -73,6 +73,7 @@ CFILES = \ rcbag.c \ rmap.c \ rt.c \ + rtrmap_repair.c \ sb.c \ scan.c \ slab.c \ diff --git a/repair/bulkload.c b/repair/bulkload.c index c96e569ef..e2f6dcbaa 100644 --- a/repair/bulkload.c +++ b/repair/bulkload.c @@ -364,3 +364,44 @@ bulkload_estimate_ag_slack( if (bload->node_slack < 0) bload->node_slack = 2; } + +/* + * Estimate proper slack values for a btree that's being reloaded. + * + * Under most circumstances, we'll take whatever default loading value the + * btree bulk loading code calculates for us. However, there are some + * exceptions to this rule: + * + * (1) If someone turned one of the debug knobs. + * (2) The FS has less than ~9% space free. + * + * Note that we actually use 3/32 for the comparison to avoid division. + */ +void +bulkload_estimate_inode_slack( + struct xfs_mount *mp, + struct xfs_btree_bload *bload, + unsigned long long free) +{ + /* + * The global values are set to -1 (i.e. take the bload defaults) + * unless someone has set them otherwise, so we just pull the values + * here. 
+ */ + bload->leaf_slack = bload_leaf_slack; + bload->node_slack = bload_node_slack; + + /* No further changes if there's more than 3/32ths space left. */ + if (free >= ((mp->m_sb.sb_dblocks * 3) >> 5)) + return; + + /* + * We're low on space; load the btrees as tightly as possible. Leave + * a couple of open slots in each btree block so that we don't end up + * splitting the btrees like crazy right after mount. + */ + if (bload->leaf_slack < 0) + bload->leaf_slack = 2; + if (bload->node_slack < 0) + bload->node_slack = 2; +} diff --git a/repair/bulkload.h b/repair/bulkload.h index a88aafaa6..842121b15 100644 --- a/repair/bulkload.h +++ b/repair/bulkload.h @@ -78,5 +78,7 @@ void bulkload_cancel(struct bulkload *bkl); int bulkload_commit(struct bulkload *bkl); void bulkload_estimate_ag_slack(struct repair_ctx *sc, struct xfs_btree_bload *bload, unsigned int free); +void bulkload_estimate_inode_slack(struct xfs_mount *mp, + struct xfs_btree_bload *bload, unsigned long long free); #endif /* __XFS_REPAIR_BULKLOAD_H__ */ diff --git a/repair/phase6.c b/repair/phase6.c index 779ea1ec5..c96b38bf4 100644 --- a/repair/phase6.c +++ b/repair/phase6.c @@ -20,6 +20,8 @@ #include "versions.h" #include "repair/pptr.h" #include "repair/rt.h" +#include "repair/slab.h" +#include "repair/rmap.h" static xfs_ino_t orphanage_ino; @@ -567,6 +569,147 @@ mk_rsumino( libxfs_irele(ip); } +/* Mark a newly allocated inode in use in the incore bitmap. */ +static void +mark_ino_inuse( + struct xfs_mount *mp, + xfs_ino_t ino, + int mode, + xfs_ino_t parent) +{ + struct ino_tree_node *irec; + int ino_offset; + int i; + + irec = find_inode_rec(mp, XFS_INO_TO_AGNO(mp, ino), + XFS_INO_TO_AGINO(mp, ino)); + + if (irec == NULL) { + /* + * This inode is allocated from a newly created inode + * chunk and therefore did not exist when inode chunks + * were processed in phase3. Add this group of inodes to + * the entry avl tree as if they were discovered in phase3. 
+ */ + irec = set_inode_free_alloc(mp, + XFS_INO_TO_AGNO(mp, ino), + XFS_INO_TO_AGINO(mp, ino)); + alloc_ex_data(irec); + + for (i = 0; i < XFS_INODES_PER_CHUNK; i++) + set_inode_free(irec, i); + } + + ino_offset = get_inode_offset(mp, ino, irec); + + /* + * Mark the inode allocated so it is not skipped in phase 7. We'll + * find it with the directory traverser soon, so we don't need to + * mark it reached. + */ + set_inode_used(irec, ino_offset); + set_inode_ftype(irec, ino_offset, libxfs_mode_to_ftype(mode)); + set_inode_parent(irec, ino_offset, parent); + if (S_ISDIR(mode)) + set_inode_isadir(irec, ino_offset); +} + +static bool +ensure_rtgroup_file( + struct xfs_rtgroup *rtg, + enum xfs_rtg_inodes type) +{ + struct xfs_mount *mp = rtg->rtg_mount; + struct xfs_inode *ip = rtg->rtg_inodes[type]; + const char *name = libxfs_rtginode_name(type); + int error; + + if (!xfs_rtginode_enabled(rtg, type)) + return false; + + if (no_modify) { + if (!ip) + do_warn(_("would reset rtgroup %u %s inode\n"), + rtg->rtg_rgno, name); + return false; + } + + if (ip) { + struct xfs_metadir_update upd = { + .dp = mp->m_rtdirip, + .ip = ip, + }; + + upd.path = xfs_rtginode_path(rtg->rtg_rgno, type); + if (!upd.path) + do_error( + _("Couldn't create rtgroup %u %s file path\n"), + rtg->rtg_rgno, name); + + /* + * Since we're reattaching this file to the metadata directory + * tree, try to remove all the parent pointers that might be + * attached. + */ + try_erase_parent_ptrs(ip); + + error = -libxfs_metadir_start_link(&upd); + if (error) + do_error( + _("Couldn't grab resources to reconnect rtgroup %u %s, error %d\n"), + rtg->rtg_rgno, name, error); + + error = -libxfs_metadir_link(&upd); + if (error) + do_error( + _("Failed to link rtgroup %u %s inode 0x%llx, error %d\n"), + rtg->rtg_rgno, name, + (unsigned long long)ip->i_ino, error); + + /* Reset the link count to something sane. 
*/ + set_nlink(VFS_I(ip), 1); + libxfs_trans_log_inode(upd.tp, ip, XFS_ILOG_CORE); + + error = -libxfs_metadir_commit(&upd); + if (error) + do_error( + _("Couldn't commit new rtgroup %u %s inode %llu, error %d\n"), + rtg->rtg_rgno, name, + (unsigned long long)upd.ip->i_ino, + error); + kfree(upd.path); + } else { + /* + * The inode was bad or gone, so just make a new one and give + * our reference to the rtgroup structure. + */ + do_warn(_("resetting rtgroup %u %s inode\n"), + rtg->rtg_rgno, name); + + error = -libxfs_rtginode_create(rtg, type, false); + if (error) + do_error( + _("Couldn't create rtgroup %u %s inode, error %d\n"), + rtg->rtg_rgno, name, error); + + ip = rtg->rtg_inodes[type]; + } + + /* Mark the inode in use. */ + mark_ino_inuse(mp, ip->i_ino, S_IFREG, mp->m_rtdirip->i_ino); + mark_ino_metadata(mp, ip->i_ino); + return true; +} + +static void +ensure_rtgroup_rmapbt( + struct xfs_rtgroup *rtg, + xfs_filblks_t est_fdblocks) +{ + if (ensure_rtgroup_file(rtg, XFS_RTG_RMAP)) + populate_rtgroup_rmapbt(rtg, est_fdblocks); +} + /* Initialize a root directory. 
*/ static int init_fs_root_dir( @@ -631,6 +774,8 @@ mk_metadir( struct xfs_trans *tp; int error; + libxfs_rtginode_irele(&mp->m_rtdirip); + error = init_fs_root_dir(mp, mp->m_sb.sb_metadirino, 0, &mp->m_metadirip); if (error) @@ -3205,6 +3350,39 @@ traverse_ags( do_inode_prefetch(mp, ag_stride, traverse_function, false, true); } +static void +reset_rt_metadata_inodes( + struct xfs_mount *mp) +{ + struct xfs_rtgroup *rtg; + xfs_filblks_t metadata_blocks = 0; + xfs_filblks_t est_fdblocks = 0; + xfs_rgnumber_t rgno; + int error; + + if (!no_modify) { + error = -libxfs_rtginode_mkdir_parent(mp); + if (error) + do_error(_("failed to create realtime metadir (%d)\n"), + error); + } + + mark_ino_inuse(mp, mp->m_rtdirip->i_ino, S_IFDIR, + mp->m_metadirip->i_ino); + mark_ino_metadata(mp, mp->m_rtdirip->i_ino); + + /* Estimate how much free space will be left after building btrees */ + for_each_rtgroup(mp, rgno, rtg) { + metadata_blocks += estimate_rtrmapbt_blocks(rtg); + } + if (mp->m_sb.sb_fdblocks > metadata_blocks) + est_fdblocks = mp->m_sb.sb_fdblocks - metadata_blocks; + + for_each_rtgroup(mp, rgno, rtg) { + ensure_rtgroup_rmapbt(rtg, est_fdblocks); + } +} + void phase6(xfs_mount_t *mp) { @@ -3273,6 +3451,9 @@ phase6(xfs_mount_t *mp) } } + if (xfs_has_rtgroups(mp)) + reset_rt_metadata_inodes(mp); + if (!no_modify) { do_log( _(" - resetting contents of realtime bitmap and summary inodes\n")); diff --git a/repair/rmap.c b/repair/rmap.c index e714f185d..c9f4650b4 100644 --- a/repair/rmap.c +++ b/repair/rmap.c @@ -1934,3 +1934,29 @@ estimate_refcountbt_blocks( return libxfs_refcountbt_calc_size(mp, slab_count(x->ar_refcount_items)); } + +/* Estimate the size of the ondisk rtrmapbt from the incore tree. 
*/ +xfs_filblks_t +estimate_rtrmapbt_blocks( + struct xfs_rtgroup *rtg) +{ + struct xfs_mount *mp = rtg->rtg_mount; + struct xfs_ag_rmap *x; + unsigned long long nr_recs; + + if (!rmap_needs_work(mp) || !xfs_has_rtrmapbt(mp)) + return 0; + + /* + * Overestimate the amount of space needed by pretending that every + * byte in the incore tree is used to store rtrmapbt records. This + * means we can use SEEK_DATA/HOLE on the xfile, which is faster than + * walking the entire btree. + */ + x = &rg_rmaps[rtg->rtg_rgno]; + if (!rmaps_has_observations(x)) + return 0; + + nr_recs = xmbuf_bytes(x->ar_xmbtp) / sizeof(struct xfs_rmap_rec); + return libxfs_rtrmapbt_calc_size(mp, nr_recs); +} diff --git a/repair/rmap.h b/repair/rmap.h index ebda561e5..23859bf6c 100644 --- a/repair/rmap.h +++ b/repair/rmap.h @@ -60,5 +60,6 @@ int rmap_get_mem_rec(struct xfs_btree_cur *rmcur, struct xfs_rmap_irec *irec); void populate_rtgroup_rmapbt(struct xfs_rtgroup *rtg, xfs_filblks_t est_fdblocks); +xfs_filblks_t estimate_rtrmapbt_blocks(struct xfs_rtgroup *rtg); #endif /* RMAP_H_ */ diff --git a/repair/rtrmap_repair.c b/repair/rtrmap_repair.c new file mode 100644 index 000000000..d9f66e7de --- /dev/null +++ b/repair/rtrmap_repair.c @@ -0,0 +1,265 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (c) 2019-2024 Oracle. All Rights Reserved. + * Author: Darrick J. Wong + */ +#include <libxfs.h> +#include "btree.h" +#include "err_protos.h" +#include "libxlog.h" +#include "incore.h" +#include "globals.h" +#include "dinode.h" +#include "slab.h" +#include "rmap.h" +#include "bulkload.h" + +/* Ported routines from fs/xfs/scrub/rtrmap_repair.c */ + +/* + * Realtime Reverse Mapping (RTRMAPBT) Repair + * ========================================== + * + * Gather all the rmap records for the inode and fork we're fixing, reset the + * incore fork, then recreate the btree. + */ +struct xrep_rtrmap { + struct xfs_btree_cur *btree_cursor; + + /* New fork.
*/ + struct bulkload new_fork_info; + struct xfs_btree_bload rtrmap_bload; + + struct repair_ctx *sc; + struct xfs_rtgroup *rtg; + + /* Estimated free space after building all rt btrees */ + xfs_filblks_t est_fdblocks; +}; + +/* Retrieve rtrmapbt data for bulk load. */ +STATIC int +xrep_rtrmap_get_records( + struct xfs_btree_cur *cur, + unsigned int idx, + struct xfs_btree_block *block, + unsigned int nr_wanted, + void *priv) +{ + struct xrep_rtrmap *rr = priv; + union xfs_btree_rec *block_rec; + unsigned int loaded; + int ret; + + for (loaded = 0; loaded < nr_wanted; loaded++, idx++) { + ret = rmap_get_mem_rec(rr->btree_cursor, &cur->bc_rec.r); + if (ret < 0) + return ret; + if (ret == 0) + do_error( + _("ran out of records while rebuilding rt rmap btree\n")); + + block_rec = libxfs_btree_rec_addr(cur, idx, block); + cur->bc_ops->init_rec_from_cur(cur, block_rec); + } + + return loaded; +} + +/* Feed one of the new btree blocks to the bulk loader. */ +STATIC int +xrep_rtrmap_claim_block( + struct xfs_btree_cur *cur, + union xfs_btree_ptr *ptr, + void *priv) +{ + struct xrep_rtrmap *rr = priv; + + return bulkload_claim_block(cur, &rr->new_fork_info, ptr); +} + +/* Figure out how much space we need to create the incore btree root block. */ +STATIC size_t +xrep_rtrmap_iroot_size( + struct xfs_btree_cur *cur, + unsigned int level, + unsigned int nr_this_level, + void *priv) +{ + return xfs_rtrmap_broot_space_calc(cur->bc_mp, level, nr_this_level); +} + +/* Reserve new btree blocks and bulk load all the rtrmap records. */ +STATIC int +xrep_rtrmap_btree_load( + struct xrep_rtrmap *rr, + struct xfs_btree_cur *rtrmap_cur) +{ + struct repair_ctx *sc = rr->sc; + int error; + + rr->rtrmap_bload.get_records = xrep_rtrmap_get_records; + rr->rtrmap_bload.claim_block = xrep_rtrmap_claim_block; + rr->rtrmap_bload.iroot_size = xrep_rtrmap_iroot_size; + bulkload_estimate_inode_slack(sc->mp, &rr->rtrmap_bload, + rr->est_fdblocks); + + /* Compute how many blocks we'll need. 
*/ + error = -libxfs_btree_bload_compute_geometry(rtrmap_cur, + &rr->rtrmap_bload, + rmap_record_count(sc->mp, true, rr->rtg->rtg_rgno)); + if (error) + return error; + + /* + * Guess how many blocks we're going to need to rebuild an entire rtrmap + * from the number of extents we found, and pump up our transaction to + * have sufficient block reservation. + */ + error = -libxfs_trans_reserve_more(sc->tp, rr->rtrmap_bload.nr_blocks, + 0); + if (error) + return error; + + /* + * Reserve the space we'll need for the new btree. Drop the cursor + * while we do this because that can roll the transaction and cursors + * can't handle that. + */ + error = bulkload_alloc_file_blocks(&rr->new_fork_info, + rr->rtrmap_bload.nr_blocks); + if (error) + return error; + + /* Add all observed rtrmap records. */ + error = rmap_init_mem_cursor(rr->sc->mp, sc->tp, true, + rr->rtg->rtg_rgno, &rr->btree_cursor); + if (error) + return error; + error = -libxfs_btree_bload(rtrmap_cur, &rr->rtrmap_bload, rr); + libxfs_btree_del_cursor(rr->btree_cursor, error); + return error; +} + +/* Update the inode counters. */ +STATIC int +xrep_rtrmap_reset_counters( + struct xrep_rtrmap *rr) +{ + struct repair_ctx *sc = rr->sc; + + /* + * Update the inode block counts to reflect the btree we just + * generated. + */ + sc->ip->i_nblocks = rr->new_fork_info.ifake.if_blocks; + libxfs_trans_log_inode(sc->tp, sc->ip, XFS_ILOG_CORE); + + /* Quotas don't exist so we're done. */ + return 0; +} + +/* + * Use the collected rmap information to stage a new rt rmap btree. If this is + * successful we'll return with the new btree root information logged to the + * repair transaction but not yet committed. 
+ */ +static int +xrep_rtrmap_build_new_tree( + struct xrep_rtrmap *rr) +{ + struct xfs_owner_info oinfo; + struct xfs_btree_cur *cur; + struct repair_ctx *sc = rr->sc; + struct xbtree_ifakeroot *ifake = &rr->new_fork_info.ifake; + int error; + + /* + * Prepare to construct the new fork by initializing the new btree + * structure and creating a fake ifork in the ifakeroot structure. + */ + libxfs_rmap_ino_bmbt_owner(&oinfo, sc->ip->i_ino, XFS_DATA_FORK); + bulkload_init_inode(&rr->new_fork_info, sc, XFS_DATA_FORK, &oinfo); + cur = libxfs_rtrmapbt_init_cursor(sc->mp, NULL, rr->rtg, sc->ip); + libxfs_btree_stage_ifakeroot(cur, ifake); + + /* + * Figure out the size and format of the new fork, then fill it with + * all the rtrmap records we've found. Join the inode to the + * transaction so that we can roll the transaction while holding the + * inode locked. + */ + libxfs_trans_ijoin(sc->tp, sc->ip, 0); + ifake->if_fork->if_format = XFS_DINODE_FMT_RMAP; + error = xrep_rtrmap_btree_load(rr, cur); + if (error) + goto err_cur; + + /* + * Install the new fork in the inode. After this point the old mapping + * data are no longer accessible and the new tree is live. We delete + * the cursor immediately after committing the staged root because the + * staged fork might be in extents format. + */ + libxfs_rtrmapbt_commit_staged_btree(cur, sc->tp); + libxfs_btree_del_cursor(cur, 0); + + /* Reset the inode counters now that we've changed the fork. */ + error = xrep_rtrmap_reset_counters(rr); + if (error) + goto err_newbt; + + /* Dispose of any unused blocks and the accounting information. */ + error = bulkload_commit(&rr->new_fork_info); + if (error) + return error; + + return -libxfs_trans_roll_inode(&sc->tp, sc->ip); +err_cur: + if (cur) + libxfs_btree_del_cursor(cur, error); +err_newbt: + bulkload_cancel(&rr->new_fork_info); + return error; +} + +/* Store the realtime reverse-mappings in the rtrmapbt.
*/ +void +populate_rtgroup_rmapbt( + struct xfs_rtgroup *rtg, + xfs_filblks_t est_fdblocks) +{ + struct xfs_mount *mp = rtg->rtg_mount; + struct xfs_inode *ip = rtg->rtg_inodes[XFS_RTG_RMAP]; + struct repair_ctx sc = { + .mp = mp, + .ip = ip, + }; + struct xrep_rtrmap rr = { + .sc = &sc, + .rtg = rtg, + .est_fdblocks = est_fdblocks, + }; + int error; + + if (!xfs_has_rtrmapbt(mp)) + return; + + error = -libxfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate, 0, 0, 0, + &sc.tp); + if (error) + goto out; + + error = xrep_rtrmap_build_new_tree(&rr); + if (error) { + libxfs_trans_cancel(sc.tp); + goto out; + } + + error = -libxfs_trans_commit(sc.tp); +out: + if (error) + do_error( + _("rtgroup %u rmap btree could not be rebuilt, error %d\n"), + rtg->rtg_rgno, error); +} diff --git a/repair/xfs_repair.c b/repair/xfs_repair.c index 8d2ea0664..c3f1fa8f4 100644 --- a/repair/xfs_repair.c +++ b/repair/xfs_repair.c @@ -1466,15 +1466,19 @@ main(int argc, char **argv) rcbagbt_destroy_cur_cache(); /* - * Done with the block usage maps, toss them... + * Done with the block usage maps, toss them. Realtime metadata aren't + * rebuilt until phase 6, so we have to keep them around. */ - rmaps_free(mp); + if (mp->m_sb.sb_rblocks == 0) + rmaps_free(mp); free_bmaps(mp); if (!bad_ino_btree) { phase6(mp); phase_end(mp, 6); + if (mp->m_sb.sb_rblocks != 0) + rmaps_free(mp); free_rtgroup_inodes(mp); phase7(mp, phase2_threads); -- 2.50.1