From 2f577ca8cee9095aa36e643fe4c29dcc2894102f Mon Sep 17 00:00:00 2001 From: Brian Foster Date: Fri, 29 May 2015 09:18:32 +1000 Subject: [PATCH] xfs: allocate sparse inode chunks on full chunk allocation failure xfs_ialloc_ag_alloc() makes several attempts to allocate a full inode chunk. If all else fails, reduce the allocation to the sparse length and alignment and attempt to allocate a sparse inode chunk. If sparse chunk allocation succeeds, check whether an inobt record already exists that can track the chunk. If so, inherit and update the existing record. Otherwise, insert a new record for the sparse chunk. Create helpers to align sparse chunk inode records and insert or update existing records in the inode btrees. The xfs_inobt_insert_sprec() helper implements the merge or update semantics required for sparse inode records with respect to both the inobt and finobt. To update the inobt, either insert a new record or merge with an existing record. To update the finobt, use the updated inobt record to either insert or replace an existing record. Signed-off-by: Brian Foster Reviewed-by: Dave Chinner Signed-off-by: Dave Chinner --- include/xfs_trace.h | 2 + libxfs/libxfs_priv.h | 1 + libxfs/xfs_ialloc.c | 330 ++++++++++++++++++++++++++++++++++++-- libxfs/xfs_ialloc_btree.c | 31 ++++ libxfs/xfs_ialloc_btree.h | 7 + 5 files changed, 357 insertions(+), 14 deletions(-) diff --git a/include/xfs_trace.h b/include/xfs_trace.h index ab046a914..423772f51 100644 --- a/include/xfs_trace.h +++ b/include/xfs_trace.h @@ -58,6 +58,8 @@ #define trace_xfs_alloc_read_agf(a,b) ((void) 0) #define trace_xfs_read_agi(a,b) ((void) 0) #define trace_xfs_ialloc_read_agi(a,b) ((void) 0) +#define trace_xfs_irec_merge_pre(a,b,c,d,e,f) ((void) 0) +#define trace_xfs_irec_merge_post(a,b,c,d) ((void) 0) #define trace_xfs_iext_insert(a,b,c,d,e) ((void) 0) #define trace_xfs_iext_remove(a,b,c,d) ((void) 0) diff --git a/libxfs/libxfs_priv.h b/libxfs/libxfs_priv.h index 1b4ac64c4..5e7810ad0 100644 --- a/libxfs/libxfs_priv.h +++ b/libxfs/libxfs_priv.h @@ -126,6 +126,7 @@ enum ce { CE_DEBUG, CE_CONT, CE_NOTE, CE_WARN, CE_ALERT, CE_PANIC }; #define xfs_warn(mp,fmt,args...) cmn_err(CE_WARN,fmt, ## args) #define xfs_hex_dump(d,n) ((void) 0) +#define xfs_force_shutdown(d,n) ((void) 0) /* stop unused var warnings by assigning mp to itself */ #define XFS_CORRUPTION_ERROR(e,l,mp,m) do { \ diff --git a/libxfs/xfs_ialloc.c b/libxfs/xfs_ialloc.c index 34f029020..ac7fcecf3 100644 --- a/libxfs/xfs_ialloc.c +++ b/libxfs/xfs_ialloc.c @@ -372,6 +372,214 @@ xfs_ialloc_inode_init( return 0; } +/* + * Align startino and allocmask for a recently allocated sparse chunk such that + * they are fit for insertion (or merge) into the on-disk inode btrees. + * + * Background: + * + * When enabled, sparse inode support increases the inode alignment from cluster + * size to inode chunk size. This means that the minimum range between two + * non-adjacent inode records in the inobt is large enough for a full inode + * record. This allows for cluster sized, cluster aligned block allocation + * without need to worry about whether the resulting inode record overlaps with + * another record in the tree. Without this basic rule, we would have to deal + * with the consequences of overlap by potentially undoing recent allocations in + * the inode allocation codepath. + * + * Because of this alignment rule (which is enforced on mount), there are two + * inobt possibilities for newly allocated sparse chunks. One is that the + * aligned inode record for the chunk covers a range of inodes not already + * covered in the inobt (i.e., it is safe to insert a new sparse record). The + * other is that a record already exists at the aligned startino that considers + * the newly allocated range as sparse. In the latter case, record content is + * merged in hope that sparse inode chunks fill to full chunks over time. + */ +STATIC void +xfs_align_sparse_ino( + struct xfs_mount *mp, + xfs_agino_t *startino, + uint16_t *allocmask) +{ + xfs_agblock_t agbno; + xfs_agblock_t mod; + int offset; + + agbno = XFS_AGINO_TO_AGBNO(mp, *startino); + mod = agbno % mp->m_sb.sb_inoalignmt; + if (!mod) + return; + + /* calculate the inode offset and align startino */ + offset = mod << mp->m_sb.sb_inopblog; + *startino -= offset; + + /* + * Since startino has been aligned down, left shift allocmask such that + * it continues to represent the same physical inodes relative to the + * new startino. + */ + *allocmask <<= offset / XFS_INODES_PER_HOLEMASK_BIT; +} + +/* + * Determine whether the source inode record can merge into the target. Both + * records must be sparse, the inode ranges must match and there must be no + * allocation overlap between the records. + */ +STATIC bool +__xfs_inobt_can_merge( + struct xfs_inobt_rec_incore *trec, /* tgt record */ + struct xfs_inobt_rec_incore *srec) /* src record */ +{ + uint64_t talloc; + uint64_t salloc; + + /* records must cover the same inode range */ + if (trec->ir_startino != srec->ir_startino) + return false; + + /* both records must be sparse */ + if (!xfs_inobt_issparse(trec->ir_holemask) || + !xfs_inobt_issparse(srec->ir_holemask)) + return false; + + /* both records must track some inodes */ + if (!trec->ir_count || !srec->ir_count) + return false; + + /* can't exceed capacity of a full record */ + if (trec->ir_count + srec->ir_count > XFS_INODES_PER_CHUNK) + return false; + + /* verify there is no allocation overlap */ + talloc = xfs_inobt_irec_to_allocmask(trec); + salloc = xfs_inobt_irec_to_allocmask(srec); + if (talloc & salloc) + return false; + + return true; +} + +/* + * Merge the source inode record into the target. The caller must call + * __xfs_inobt_can_merge() to ensure the merge is valid. + */ +STATIC void +__xfs_inobt_rec_merge( + struct xfs_inobt_rec_incore *trec, /* target */ + struct xfs_inobt_rec_incore *srec) /* src */ +{ + ASSERT(trec->ir_startino == srec->ir_startino); + + /* combine the counts */ + trec->ir_count += srec->ir_count; + trec->ir_freecount += srec->ir_freecount; + + /* + * Merge the holemask and free mask. For both fields, 0 bits refer to + * allocated inodes. We combine the allocated ranges with bitwise AND. + */ + trec->ir_holemask &= srec->ir_holemask; + trec->ir_free &= srec->ir_free; +} + +/* + * Insert a new sparse inode chunk into the associated inode btree. The inode + * record for the sparse chunk is pre-aligned to a startino that should match + * any pre-existing sparse inode record in the tree. This allows sparse chunks + * to fill over time. + * + * This function supports two modes of handling preexisting records depending on + * the merge flag. If merge is true, the provided record is merged with the + * existing record and updated in place. The merged record is returned in nrec. + * If merge is false, an existing record is replaced with the provided record. + * If no preexisting record exists, the provided record is always inserted. + * + * It is considered corruption if a merge is requested and not possible. Given + * the sparse inode alignment constraints, this should never happen. + */ +STATIC int +xfs_inobt_insert_sprec( + struct xfs_mount *mp, + struct xfs_trans *tp, + struct xfs_buf *agbp, + int btnum, + struct xfs_inobt_rec_incore *nrec, /* in/out: new/merged rec. */ + bool merge) /* merge or replace */ +{ + struct xfs_btree_cur *cur; + struct xfs_agi *agi = XFS_BUF_TO_AGI(agbp); + xfs_agnumber_t agno = be32_to_cpu(agi->agi_seqno); + int error; + int i; + struct xfs_inobt_rec_incore rec; + + cur = xfs_inobt_init_cursor(mp, tp, agbp, agno, btnum); + + /* the new record is pre-aligned so we know where to look */ + error = xfs_inobt_lookup(cur, nrec->ir_startino, XFS_LOOKUP_EQ, &i); + if (error) + goto error; + /* if nothing there, insert a new record and return */ + if (i == 0) { + error = xfs_inobt_insert_rec(cur, nrec->ir_holemask, + nrec->ir_count, nrec->ir_freecount, + nrec->ir_free, &i); + if (error) + goto error; + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error); + + goto out; + } + + /* + * A record exists at this startino. Merge or replace the record + * depending on what we've been asked to do. + */ + if (merge) { + error = xfs_inobt_get_rec(cur, &rec, &i); + if (error) + goto error; + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error); + XFS_WANT_CORRUPTED_GOTO(mp, + rec.ir_startino == nrec->ir_startino, + error); + + /* + * This should never fail. If we have coexisting records that + * cannot merge, something is seriously wrong. + */ + XFS_WANT_CORRUPTED_GOTO(mp, __xfs_inobt_can_merge(nrec, &rec), + error); + + trace_xfs_irec_merge_pre(mp, agno, rec.ir_startino, + rec.ir_holemask, nrec->ir_startino, + nrec->ir_holemask); + + /* merge to nrec to output the updated record */ + __xfs_inobt_rec_merge(nrec, &rec); + + trace_xfs_irec_merge_post(mp, agno, nrec->ir_startino, + nrec->ir_holemask); + + error = xfs_inobt_rec_check_count(mp, nrec); + if (error) + goto error; + } + + error = xfs_inobt_update(cur, nrec); + if (error) + goto error; + +out: + xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR); + return 0; +error: + xfs_btree_del_cursor(cur, XFS_BTREE_ERROR); + return error; +} + /* * Allocate new inodes in the allocation group specified by agbp. * Return 0 for success, else error code. @@ -390,6 +598,8 @@ xfs_ialloc_ag_alloc( xfs_agino_t newlen; /* new number of inodes */ int isaligned = 0; /* inode allocation at stripe unit */ /* boundary */ + uint16_t allocmask = (uint16_t) -1; /* init. to full chunk */ + struct xfs_inobt_rec_incore rec; struct xfs_perag *pag; memset(&args, 0, sizeof(args)); @@ -506,6 +716,45 @@ xfs_ialloc_ag_alloc( return error; } + /* + * Finally, try a sparse allocation if the filesystem supports it and + * the sparse allocation length is smaller than a full chunk. + */ + if (xfs_sb_version_hassparseinodes(&args.mp->m_sb) && + args.mp->m_ialloc_min_blks < args.mp->m_ialloc_blks && + args.fsbno == NULLFSBLOCK) { + args.type = XFS_ALLOCTYPE_NEAR_BNO; + args.agbno = be32_to_cpu(agi->agi_root); + args.fsbno = XFS_AGB_TO_FSB(args.mp, agno, args.agbno); + args.alignment = args.mp->m_sb.sb_spino_align; + args.prod = 1; + + args.minlen = args.mp->m_ialloc_min_blks; + args.maxlen = args.minlen; + + /* + * The inode record will be aligned to full chunk size. We must + * prevent sparse allocation from AG boundaries that result in + * invalid inode records, such as records that start at agbno 0 + * or extend beyond the AG. + * + * Set min agbno to the first aligned, non-zero agbno and max to + * the last aligned agbno that is at least one full chunk from + * the end of the AG. + */ + args.min_agbno = args.mp->m_sb.sb_inoalignmt; + args.max_agbno = round_down(args.mp->m_sb.sb_agblocks, + args.mp->m_sb.sb_inoalignmt) - + args.mp->m_ialloc_blks; + + error = xfs_alloc_vextent(&args); + if (error) + return error; + + newlen = args.len << args.mp->m_sb.sb_inopblog; + allocmask = (1 << (newlen / XFS_INODES_PER_HOLEMASK_BIT)) - 1; + } + if (args.fsbno == NULLFSBLOCK) { *alloc = 0; return 0; @@ -530,6 +779,73 @@ xfs_ialloc_ag_alloc( * Convert the results. */ newino = XFS_OFFBNO_TO_AGINO(args.mp, args.agbno, 0); + + if (xfs_inobt_issparse(~allocmask)) { + /* + * We've allocated a sparse chunk. Align the startino and mask. + */ + xfs_align_sparse_ino(args.mp, &newino, &allocmask); + + rec.ir_startino = newino; + rec.ir_holemask = ~allocmask; + rec.ir_count = newlen; + rec.ir_freecount = newlen; + rec.ir_free = XFS_INOBT_ALL_FREE; + + /* + * Insert the sparse record into the inobt and allow for a merge + * if necessary. If a merge does occur, rec is updated to the + * merged record. + */ + error = xfs_inobt_insert_sprec(args.mp, tp, agbp, XFS_BTNUM_INO, + &rec, true); + if (error == -EFSCORRUPTED) { + xfs_alert(args.mp, + "invalid sparse inode record: ino 0x%llx holemask 0x%x count %u", + XFS_AGINO_TO_INO(args.mp, agno, + rec.ir_startino), + rec.ir_holemask, rec.ir_count); + xfs_force_shutdown(args.mp, SHUTDOWN_CORRUPT_INCORE); + } + if (error) + return error; + + /* + * We can't merge the part we've just allocated as for the inobt + * due to finobt semantics. The original record may or may not + * exist independent of whether physical inodes exist in this + * sparse chunk. + * + * We must update the finobt record based on the inobt record. + * rec contains the fully merged and up to date inobt record + * from the previous call. Set merge false to replace any + * existing record with this one. + */ + if (xfs_sb_version_hasfinobt(&args.mp->m_sb)) { + error = xfs_inobt_insert_sprec(args.mp, tp, agbp, + XFS_BTNUM_FINO, &rec, + false); + if (error) + return error; + } + } else { + /* full chunk - insert new records to both btrees */ + error = xfs_inobt_insert(args.mp, tp, agbp, newino, newlen, + XFS_BTNUM_INO); + if (error) + return error; + + if (xfs_sb_version_hasfinobt(&args.mp->m_sb)) { + error = xfs_inobt_insert(args.mp, tp, agbp, newino, + newlen, XFS_BTNUM_FINO); + if (error) + return error; + } + } + + /* + * Update AGI counts and newino. + */ be32_add_cpu(&agi->agi_count, newlen); be32_add_cpu(&agi->agi_freecount, newlen); pag = xfs_perag_get(args.mp, agno); @@ -537,20 +853,6 @@ xfs_ialloc_ag_alloc( xfs_perag_put(pag); agi->agi_newino = cpu_to_be32(newino); - /* - * Insert records describing the new inode chunk into the btrees. - */ - error = xfs_inobt_insert(args.mp, tp, agbp, newino, newlen, - XFS_BTNUM_INO); - if (error) - return error; - - if (xfs_sb_version_hasfinobt(&args.mp->m_sb)) { - error = xfs_inobt_insert(args.mp, tp, agbp, newino, newlen, - XFS_BTNUM_FINO); - if (error) - return error; - } /* * Log allocation group header fields */ diff --git a/libxfs/xfs_ialloc_btree.c b/libxfs/xfs_ialloc_btree.c index c67600a7e..7055149d0 100644 --- a/libxfs/xfs_ialloc_btree.c +++ b/libxfs/xfs_ialloc_btree.c @@ -477,3 +477,34 @@ xfs_inobt_irec_to_allocmask( return bitmap; } + +#if defined(DEBUG) || defined(XFS_WARN) +/* + * Verify that an in-core inode record has a valid inode count. + */ +int +xfs_inobt_rec_check_count( + struct xfs_mount *mp, + struct xfs_inobt_rec_incore *rec) +{ + int inocount = 0; + int nextbit = 0; + uint64_t allocbmap; + int wordsz; + + wordsz = sizeof(allocbmap) / sizeof(unsigned int); + allocbmap = xfs_inobt_irec_to_allocmask(rec); + + nextbit = xfs_next_bit((uint *) &allocbmap, wordsz, nextbit); + while (nextbit != -1) { + inocount++; + nextbit = xfs_next_bit((uint *) &allocbmap, wordsz, + nextbit + 1); + } + + if (inocount != rec->ir_count) + return -EFSCORRUPTED; + + return 0; +} +#endif /* DEBUG */ diff --git a/libxfs/xfs_ialloc_btree.h b/libxfs/xfs_ialloc_btree.h index 2c581ba69..bd8845321 100644 --- a/libxfs/xfs_ialloc_btree.h +++ b/libxfs/xfs_ialloc_btree.h @@ -65,4 +65,11 @@ extern int xfs_inobt_maxrecs(struct xfs_mount *, int, int); /* ir_holemask to inode allocation bitmap conversion */ uint64_t xfs_inobt_irec_to_allocmask(struct xfs_inobt_rec_incore *); +#if defined(DEBUG) || defined(XFS_WARN) +int xfs_inobt_rec_check_count(struct xfs_mount *, + struct xfs_inobt_rec_incore *); +#else +#define xfs_inobt_rec_check_count(mp, rec) 0 +#endif /* DEBUG */ + #endif /* __XFS_IALLOC_BTREE_H__ */ -- 2.50.1