From 15cf5ec39f1777d410f6d836fc30f3d7e7600e10 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Tue, 1 Oct 2024 16:00:09 -0700 Subject: [PATCH] xfs: allow inode-based btrees to reserve space in the data device Create a new space reservation scheme so that btree metadata for the realtime volume can reserve space in the data device to avoid space underruns. Signed-off-by: Darrick J. Wong --- include/xfs_inode.h | 5 +- include/xfs_mount.h | 1 + include/xfs_trace.h | 7 ++ io/inject.c | 1 + libxfs/init.c | 11 ++- libxfs/libxfs_priv.h | 11 +++ libxfs/xfs_ag_resv.c | 3 + libxfs/xfs_errortag.h | 4 +- libxfs/xfs_metadir.c | 3 + libxfs/xfs_metafile.c | 202 ++++++++++++++++++++++++++++++++++++++++++ libxfs/xfs_metafile.h | 11 +++ libxfs/xfs_types.h | 7 ++ 12 files changed, 263 insertions(+), 3 deletions(-) diff --git a/include/xfs_inode.h b/include/xfs_inode.h index 30e171696..5bb31eb4a 100644 --- a/include/xfs_inode.h +++ b/include/xfs_inode.h @@ -224,7 +224,10 @@ typedef struct xfs_inode { struct xfs_ifork i_df; /* data fork */ struct xfs_ifork i_af; /* attribute fork */ struct xfs_inode_log_item *i_itemp; /* logging information */ - unsigned int i_delayed_blks; /* count of delay alloc blks */ + uint64_t i_delayed_blks; /* count of delay alloc blks */ + /* Space that has been set aside to root a btree in this file. 
*/ + uint64_t i_meta_resv_asked; + xfs_fsize_t i_disk_size; /* number of bytes in file */ xfs_rfsblock_t i_nblocks; /* # of direct & btree blocks */ prid_t i_projid; /* owner's project id */ diff --git a/include/xfs_mount.h b/include/xfs_mount.h index 19d08cf04..532bff851 100644 --- a/include/xfs_mount.h +++ b/include/xfs_mount.h @@ -115,6 +115,7 @@ typedef struct xfs_mount { uint m_rmap_maxlevels; /* max rmap btree levels */ uint m_refc_maxlevels; /* max refc btree levels */ unsigned int m_agbtree_maxlevels; /* max level of all AG btrees */ + unsigned int m_rtbtree_maxlevels; /* max level of all rt btrees */ xfs_extlen_t m_ag_prealloc_blocks; /* reserved ag blocks */ uint m_alloc_set_aside; /* space we can't use */ uint m_ag_max_usable; /* max space per AG */ diff --git a/include/xfs_trace.h b/include/xfs_trace.h index a53ce092c..30166c11d 100644 --- a/include/xfs_trace.h +++ b/include/xfs_trace.h @@ -390,4 +390,11 @@ #define trace_xfs_group_put(...) ((void) 0) #define trace_xfs_group_rele(...) ((void) 0) +#define trace_xfs_metafile_resv_alloc_space(...) ((void) 0) +#define trace_xfs_metafile_resv_critical(...) ((void) 0) +#define trace_xfs_metafile_resv_free(...) ((void) 0) +#define trace_xfs_metafile_resv_free_space(...) ((void) 0) +#define trace_xfs_metafile_resv_init(...) ((void) 0) +#define trace_xfs_metafile_resv_init_error(...) 
((void) 0) + #endif /* __TRACE_H__ */ diff --git a/io/inject.c b/io/inject.c index 4aeb6da32..7b9a76406 100644 --- a/io/inject.c +++ b/io/inject.c @@ -64,6 +64,7 @@ error_tag(char *name) { XFS_ERRTAG_WB_DELAY_MS, "wb_delay_ms" }, { XFS_ERRTAG_WRITE_DELAY_MS, "write_delay_ms" }, { XFS_ERRTAG_EXCHMAPS_FINISH_ONE, "exchmaps_finish_one" }, + { XFS_ERRTAG_METAFILE_RESV_CRITICAL, "metafile_resv_crit" }, { XFS_ERRTAG_MAX, NULL } }; int count; diff --git a/libxfs/init.c b/libxfs/init.c index dd2be84c0..6a8fb480e 100644 --- a/libxfs/init.c +++ b/libxfs/init.c @@ -598,6 +598,15 @@ xfs_agbtree_compute_maxlevels( mp->m_agbtree_maxlevels = max(levels, mp->m_refc_maxlevels); } +/* Compute maximum possible height for realtime btree types for this fs. */ +static inline void +xfs_rtbtree_compute_maxlevels( + struct xfs_mount *mp) +{ + /* This will be filled in later. */ + mp->m_rtbtree_maxlevels = 0; +} + /* Compute maximum possible height of all btrees. */ void libxfs_compute_all_maxlevels( @@ -614,7 +623,7 @@ libxfs_compute_all_maxlevels( xfs_refcountbt_compute_maxlevels(mp); xfs_agbtree_compute_maxlevels(mp); - + xfs_rtbtree_compute_maxlevels(mp); } /* Mount the metadata files under the metadata directory tree. 
*/ diff --git a/libxfs/libxfs_priv.h b/libxfs/libxfs_priv.h index fc43fc8c9..4bab51204 100644 --- a/libxfs/libxfs_priv.h +++ b/libxfs/libxfs_priv.h @@ -218,6 +218,17 @@ uint32_t get_random_u32(void); #define get_random_u32() (0) #endif +/* Three-way compare of *count and rhs (1, -1 or 0); @batch is unused. */ +static inline int +__percpu_counter_compare(uint64_t *count, int64_t rhs, int32_t batch) +{ + /* *count is unsigned, so it is greater than any negative rhs. */ + if (rhs < 0 || *count > (uint64_t)rhs) + return 1; + return *count < (uint64_t)rhs ? -1 : 0; +} + + #define PAGE_SIZE getpagesize() extern unsigned int PAGE_SHIFT; diff --git a/libxfs/xfs_ag_resv.c b/libxfs/xfs_ag_resv.c index f5cbaa946..83cac2033 100644 --- a/libxfs/xfs_ag_resv.c +++ b/libxfs/xfs_ag_resv.c @@ -113,6 +113,7 @@ xfs_ag_resv_needed( case XFS_AG_RESV_RMAPBT: len -= xfs_perag_resv(pag, type)->ar_reserved; break; + case XFS_AG_RESV_METAFILE: case XFS_AG_RESV_NONE: /* empty */ break; @@ -346,6 +347,7 @@ xfs_ag_resv_alloc_extent( switch (type) { case XFS_AG_RESV_AGFL: + case XFS_AG_RESV_METAFILE: return; case XFS_AG_RESV_METADATA: case XFS_AG_RESV_RMAPBT: @@ -388,6 +390,7 @@ xfs_ag_resv_free_extent( switch (type) { case XFS_AG_RESV_AGFL: + case XFS_AG_RESV_METAFILE: return; case XFS_AG_RESV_METADATA: case XFS_AG_RESV_RMAPBT: diff --git a/libxfs/xfs_errortag.h b/libxfs/xfs_errortag.h index 7002d7676..a53c5d40e 100644 --- a/libxfs/xfs_errortag.h +++ b/libxfs/xfs_errortag.h @@ -64,7 +64,8 @@ #define XFS_ERRTAG_WB_DELAY_MS 42 #define XFS_ERRTAG_WRITE_DELAY_MS 43 #define XFS_ERRTAG_EXCHMAPS_FINISH_ONE 44 -#define XFS_ERRTAG_MAX 45 +#define XFS_ERRTAG_METAFILE_RESV_CRITICAL 45 +#define XFS_ERRTAG_MAX 46 /* * Random factors for above tags, 1 means always, 2 means 1/2 time, etc.
@@ -113,5 +114,6 @@ #define XFS_RANDOM_WB_DELAY_MS 3000 #define XFS_RANDOM_WRITE_DELAY_MS 3000 #define XFS_RANDOM_EXCHMAPS_FINISH_ONE 1 +#define XFS_RANDOM_METAFILE_RESV_CRITICAL 4 #endif /* __XFS_ERRORTAG_H_ */ diff --git a/libxfs/xfs_metadir.c b/libxfs/xfs_metadir.c index b5f05925e..253fbf48e 100644 --- a/libxfs/xfs_metadir.c +++ b/libxfs/xfs_metadir.c @@ -28,6 +28,9 @@ #include "xfs_dir2_priv.h" #include "xfs_parent.h" #include "xfs_health.h" +#include "xfs_errortag.h" +#include "xfs_btree.h" +#include "xfs_alloc.h" /* * Metadata Directory Tree diff --git a/libxfs/xfs_metafile.c b/libxfs/xfs_metafile.c index 3bd949337..435b8f8bf 100644 --- a/libxfs/xfs_metafile.c +++ b/libxfs/xfs_metafile.c @@ -17,6 +17,8 @@ #include "xfs_metafile.h" #include "xfs_trace.h" #include "xfs_inode.h" +#include "xfs_errortag.h" +#include "xfs_alloc.h" /* Set up an inode to be recognized as a metadata directory inode. */ void @@ -50,3 +52,203 @@ xfs_metafile_clear_iflag( ip->i_diflags2 &= ~XFS_DIFLAG2_METADATA; xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); } + +/* + * Is the amount of space that could be allocated towards a given metadata + * file at or beneath a certain threshold? + */ +static inline bool +xfs_metafile_resv_can_cover( + struct xfs_inode *ip, + int64_t rhs) +{ + /* + * The amount of space that can be allocated to this metadata file is + * the remaining reservation for the particular metadata file + the + * global free block count. Take care of the first case to avoid + * touching the per-cpu counter. + */ + if (ip->i_delayed_blks >= rhs) + return true; + + /* + * There aren't enough blocks left in the inode's reservation, but it + * isn't critical unless there also isn't enough free space. + */ + return __percpu_counter_compare(&ip->i_mount->m_fdblocks, + rhs - ip->i_delayed_blks, 2048) >= 0; +} + +/* + * Is this metadata file critically low on blocks? 
For now we'll define that + * as the number of blocks we can get our hands on being less than 10% of what + * we reserved or less than some arbitrary number (maximum btree height). + */ +bool +xfs_metafile_resv_critical( + struct xfs_inode *ip) +{ + uint64_t asked_low_water; + + if (!ip) + return false; + + ASSERT(xfs_is_metadir_inode(ip)); + trace_xfs_metafile_resv_critical(ip, 0); + + if (!xfs_metafile_resv_can_cover(ip, ip->i_mount->m_rtbtree_maxlevels)) + return true; + + asked_low_water = div_u64(ip->i_meta_resv_asked, 10); + if (!xfs_metafile_resv_can_cover(ip, asked_low_water)) + return true; + + return XFS_TEST_ERROR(false, ip->i_mount, + XFS_ERRTAG_METAFILE_RESV_CRITICAL); +} + +/* Allocate a block from the metadata file's reservation. */ +void +xfs_metafile_resv_alloc_space( + struct xfs_inode *ip, + struct xfs_alloc_arg *args) +{ + int64_t len = args->len; + + ASSERT(xfs_is_metadir_inode(ip)); + ASSERT(XFS_IS_DQDETACHED(ip)); + ASSERT(args->resv == XFS_AG_RESV_METAFILE); + + trace_xfs_metafile_resv_alloc_space(ip, args->len); + + /* + * Allocate the blocks from the metadata inode's block reservation + * and update the ondisk sb counter. + */ + if (ip->i_delayed_blks > 0) { + int64_t from_resv; + + from_resv = min_t(int64_t, len, ip->i_delayed_blks); + ip->i_delayed_blks -= from_resv; + xfs_mod_delalloc(ip, 0, -from_resv); + xfs_trans_mod_sb(args->tp, XFS_TRANS_SB_RES_FDBLOCKS, + -from_resv); + len -= from_resv; + } + + /* + * Any allocation in excess of the reservation requires in-core and + * on-disk fdblocks updates. If we can grab @len blocks from the + * in-core fdblocks then all we need to do is update the on-disk + * superblock; if not, then try to steal some from the transaction's + * block reservation. Overruns are only expected for rmap btrees. 
+ */ + if (len) { + unsigned int field; + int error; + + error = xfs_dec_fdblocks(ip->i_mount, len, true); + if (error) + field = XFS_TRANS_SB_FDBLOCKS; + else + field = XFS_TRANS_SB_RES_FDBLOCKS; + + xfs_trans_mod_sb(args->tp, field, -len); + } + + ip->i_nblocks += args->len; + xfs_trans_log_inode(args->tp, ip, XFS_ILOG_CORE); +} + +/* Free a block to the metadata file's reservation. */ +void +xfs_metafile_resv_free_space( + struct xfs_inode *ip, + struct xfs_trans *tp, + xfs_filblks_t len) +{ + int64_t to_resv; + + ASSERT(xfs_is_metadir_inode(ip)); + ASSERT(XFS_IS_DQDETACHED(ip)); + trace_xfs_metafile_resv_free_space(ip, len); + + ip->i_nblocks -= len; + xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); + + /* + * Add the freed blocks back into the inode's delalloc reservation + * until it reaches the maximum size. Update the ondisk fdblocks only. + */ + to_resv = ip->i_meta_resv_asked - (ip->i_nblocks + ip->i_delayed_blks); + if (to_resv > 0) { + to_resv = min_t(int64_t, to_resv, len); + ip->i_delayed_blks += to_resv; + xfs_mod_delalloc(ip, 0, to_resv); + xfs_trans_mod_sb(tp, XFS_TRANS_SB_RES_FDBLOCKS, to_resv); + len -= to_resv; + } + + /* + * Everything else goes back to the filesystem, so update the in-core + * and on-disk counters. + */ + if (len) + xfs_trans_mod_sb(tp, XFS_TRANS_SB_FDBLOCKS, len); +} + +/* Release a metadata file's space reservation. */ +void +xfs_metafile_resv_free( + struct xfs_inode *ip) +{ + if (!ip) + return; + + ASSERT(xfs_is_metadir_inode(ip)); + trace_xfs_metafile_resv_free(ip, 0); + + xfs_mod_delalloc(ip, 0, -ip->i_delayed_blks); + xfs_add_fdblocks(ip->i_mount, ip->i_delayed_blks); + ip->i_delayed_blks = 0; + ip->i_meta_resv_asked = 0; +} + +/* Set up a metadata file's space reservation. 
*/ +int +xfs_metafile_resv_init( + struct xfs_inode *ip, + xfs_filblks_t ask) +{ + xfs_filblks_t hidden_space; + xfs_filblks_t used; + int error; + + if (!ip || ip->i_meta_resv_asked > 0) + return 0; + + ASSERT(xfs_is_metadir_inode(ip)); + + /* + * Space taken by all other metadata btrees are accounted on-disk as + * used space. We therefore only hide the space that is reserved but + * not used by the trees. + */ + used = ip->i_nblocks; + if (used > ask) + ask = used; + hidden_space = ask - used; + + error = xfs_dec_fdblocks(ip->i_mount, hidden_space, true); + if (error) { + trace_xfs_metafile_resv_init_error(ip, error, _RET_IP_); + return error; + } + + xfs_mod_delalloc(ip, 0, hidden_space); + ip->i_delayed_blks = hidden_space; + ip->i_meta_resv_asked = ask; + + trace_xfs_metafile_resv_init(ip, ask); + return 0; +} diff --git a/libxfs/xfs_metafile.h b/libxfs/xfs_metafile.h index acec40012..8d8f08a60 100644 --- a/libxfs/xfs_metafile.h +++ b/libxfs/xfs_metafile.h @@ -21,6 +21,17 @@ void xfs_metafile_set_iflag(struct xfs_trans *tp, struct xfs_inode *ip, enum xfs_metafile_type metafile_type); void xfs_metafile_clear_iflag(struct xfs_trans *tp, struct xfs_inode *ip); +/* Space reservations for metadata inodes. */ +struct xfs_alloc_arg; + +bool xfs_metafile_resv_critical(struct xfs_inode *ip); +void xfs_metafile_resv_alloc_space(struct xfs_inode *ip, + struct xfs_alloc_arg *args); +void xfs_metafile_resv_free_space(struct xfs_inode *ip, struct xfs_trans *tp, + xfs_filblks_t len); +void xfs_metafile_resv_free(struct xfs_inode *ip); +int xfs_metafile_resv_init(struct xfs_inode *ip, xfs_filblks_t ask); + /* Code specific to kernel/userspace; must be provided externally. */ int xfs_trans_metafile_iget(struct xfs_trans *tp, xfs_ino_t ino, diff --git a/libxfs/xfs_types.h b/libxfs/xfs_types.h index b7ff777eb..5661590d6 100644 --- a/libxfs/xfs_types.h +++ b/libxfs/xfs_types.h @@ -202,6 +202,13 @@ enum xfs_ag_resv_type { * altering fdblocks. 
If you think you need this you're wrong. */ XFS_AG_RESV_IGNORE, + + /* + * This allocation activity is being done on behalf of a metadata file. + * These files maintain their own permanent space reservations and are + * required to adjust fdblocks using the xfs_metafile_resv_* helpers. + */ + XFS_AG_RESV_METAFILE, }; /* Results of scanning a btree keyspace to check occupancy. */ -- 2.50.1