From e9756dd29f33ede1a595d9fb5e0e2586f7542c1f Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 2 May 2025 12:37:36 -0400 Subject: [PATCH 01/16] bcachefs: bcachefs_metadata_version_snapshot_deletion_v2 We're going to be speeding up snapshot deletion, by only having it process the extents/dirents/xattrs btrees if an inode of a given snapshot ID was present. This raises the possibility of 'bkey_in_missing_snapshot' errors popping up, if we ever accidentally don't do the corresponding inode update, or if the new algorithm has bugs. So instead of deleting snapshot IDs, add a new deleted flag, so that 'key in missing snapshot' errors can more definitively tell what happened and automatically repair. Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs_format.h | 3 +- fs/bcachefs/snapshot.c | 91 ++++++++++++++++++++++++++--------- fs/bcachefs/snapshot.h | 25 ++++++++-- fs/bcachefs/snapshot_format.h | 2 +- fs/bcachefs/snapshot_types.h | 30 ++++++++++++ fs/bcachefs/subvolume_types.h | 27 ----------- 6 files changed, 122 insertions(+), 56 deletions(-) diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h index 7ce475c565b5..0beff6af7ecf 100644 --- a/fs/bcachefs/bcachefs_format.h +++ b/fs/bcachefs/bcachefs_format.h @@ -695,7 +695,8 @@ struct bch_sb_field_ext { x(stripe_backpointers, BCH_VERSION(1, 22)) \ x(stripe_lru, BCH_VERSION(1, 23)) \ x(casefolding, BCH_VERSION(1, 24)) \ - x(extent_flags, BCH_VERSION(1, 25)) + x(extent_flags, BCH_VERSION(1, 25)) \ + x(snapshot_deletion_v2, BCH_VERSION(1, 26)) enum bcachefs_metadata_version { bcachefs_metadata_version_min = 9, diff --git a/fs/bcachefs/snapshot.c b/fs/bcachefs/snapshot.c index 87c8aead8610..a16fa0d8a274 100644 --- a/fs/bcachefs/snapshot.c +++ b/fs/bcachefs/snapshot.c @@ -211,9 +211,14 @@ void bch2_snapshot_to_text(struct printbuf *out, struct bch_fs *c, { struct bkey_s_c_snapshot s = bkey_s_c_to_snapshot(k); - prt_printf(out, "is_subvol %llu deleted %llu parent %10u children %10u %10u subvol %u tree %u", - BCH_SNAPSHOT_SUBVOL(s.v), - BCH_SNAPSHOT_WILL_DELETE(s.v), + if (BCH_SNAPSHOT_SUBVOL(s.v)) + prt_str(out, "subvol "); + if (BCH_SNAPSHOT_WILL_DELETE(s.v)) + prt_str(out, "will_delete "); + if (BCH_SNAPSHOT_DELETED(s.v)) + prt_str(out, "deleted "); + + prt_printf(out, "parent %10u children %10u %10u subvol %u tree %u", le32_to_cpu(s.v->parent), le32_to_cpu(s.v->children[0]), le32_to_cpu(s.v->children[1]), @@ -314,7 +319,9 @@ static int __bch2_mark_snapshot(struct btree_trans *trans, if (new.k->type == KEY_TYPE_snapshot) { struct bkey_s_c_snapshot s = bkey_s_c_to_snapshot(new); - t->live = true; + t->state = !BCH_SNAPSHOT_DELETED(s.v) + ? 
SNAPSHOT_ID_live + : SNAPSHOT_ID_deleted; t->parent = le32_to_cpu(s.v->parent); t->children[0] = le32_to_cpu(s.v->children[0]); t->children[1] = le32_to_cpu(s.v->children[1]); @@ -711,6 +718,9 @@ static int check_snapshot(struct btree_trans *trans, memset(&s, 0, sizeof(s)); memcpy(&s, k.v, min(sizeof(s), bkey_val_bytes(k.k))); + if (BCH_SNAPSHOT_DELETED(&s)) + return 0; + id = le32_to_cpu(s.parent); if (id) { ret = bch2_snapshot_lookup(trans, id, &v); @@ -998,7 +1008,7 @@ int bch2_reconstruct_snapshots(struct bch_fs *c) snapshot_id_list_to_text(&buf, t); darray_for_each(*t, id) { - if (fsck_err_on(!bch2_snapshot_exists(c, *id), + if (fsck_err_on(bch2_snapshot_id_state(c, *id) == SNAPSHOT_ID_empty, trans, snapshot_node_missing, "snapshot node %u from tree %s missing, recreate?", *id, buf.buf)) { if (t->nr > 1) { @@ -1023,22 +1033,38 @@ err: return ret; } -int bch2_check_key_has_snapshot(struct btree_trans *trans, - struct btree_iter *iter, - struct bkey_s_c k) +int __bch2_check_key_has_snapshot(struct btree_trans *trans, + struct btree_iter *iter, + struct bkey_s_c k) { struct bch_fs *c = trans->c; struct printbuf buf = PRINTBUF; int ret = 0; + enum snapshot_id_state state = bch2_snapshot_id_state(c, k.k->p.snapshot); - if (fsck_err_on(!bch2_snapshot_exists(c, k.k->p.snapshot), + /* Snapshot was definitively deleted, this error is marked autofix */ + if (fsck_err_on(state == SNAPSHOT_ID_deleted, + trans, bkey_in_deleted_snapshot, + "key in deleted snapshot %s, delete?", + (bch2_btree_id_to_text(&buf, iter->btree_id), + prt_char(&buf, ' '), + bch2_bkey_val_to_text(&buf, c, k), buf.buf))) + ret = bch2_btree_delete_at(trans, iter, + BTREE_UPDATE_internal_snapshot_node) ?: 1; + + /* + * Snapshot missing: we should have caught this with btree_lost_data and + * kicked off reconstruct_snapshots, so if we end up here we have no + * idea what happened: + */ + if (fsck_err_on(state == SNAPSHOT_ID_empty, trans, bkey_in_missing_snapshot, "key in missing snapshot %s, delete?", (bch2_btree_id_to_text(&buf, iter->btree_id), prt_char(&buf, ' '), bch2_bkey_val_to_text(&buf, c, k), buf.buf))) ret = bch2_btree_delete_at(trans, iter, - BTREE_UPDATE_internal_snapshot_node) ?: 1; + BTREE_UPDATE_internal_snapshot_node) ?: 1; fsck_err: printbuf_exit(&buf); return ret; @@ -1085,24 +1111,25 @@ static int bch2_snapshot_node_delete(struct btree_trans *trans, u32 id) struct btree_iter iter, p_iter = {}; struct btree_iter c_iter = {}; struct btree_iter tree_iter = {}; - struct bkey_s_c_snapshot s; u32 parent_id, child_id; unsigned i; int ret = 0; - s = bch2_bkey_get_iter_typed(trans, &iter, BTREE_ID_snapshots, POS(0, id), - BTREE_ITER_intent, snapshot); - ret = bkey_err(s); + struct bkey_i_snapshot *s = + bch2_bkey_get_mut_typed(trans, &iter, BTREE_ID_snapshots, POS(0, id), + BTREE_ITER_intent, snapshot); + ret = PTR_ERR_OR_ZERO(s); bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), c, "missing snapshot %u", id); if (ret) goto err; - BUG_ON(s.v->children[1]); + BUG_ON(BCH_SNAPSHOT_DELETED(&s->v)); + BUG_ON(s->v.children[1]); - parent_id = le32_to_cpu(s.v->parent); - child_id = le32_to_cpu(s.v->children[0]); + parent_id = le32_to_cpu(s->v.parent); + child_id = le32_to_cpu(s->v.children[0]); if (parent_id) { struct bkey_i_snapshot *parent; @@ -1160,24 +1187,38 @@ static int bch2_snapshot_node_delete(struct btree_trans *trans, u32 id) */ struct bkey_i_snapshot_tree *s_t; - BUG_ON(s.v->children[1]); + BUG_ON(s->v.children[1]); s_t = bch2_bkey_get_mut_typed(trans, &tree_iter, - BTREE_ID_snapshot_trees, POS(0, 
le32_to_cpu(s.v->tree)), + BTREE_ID_snapshot_trees, POS(0, le32_to_cpu(s->v.tree)), 0, snapshot_tree); ret = PTR_ERR_OR_ZERO(s_t); if (ret) goto err; - if (s.v->children[0]) { - s_t->v.root_snapshot = s.v->children[0]; + if (s->v.children[0]) { + s_t->v.root_snapshot = s->v.children[0]; } else { s_t->k.type = KEY_TYPE_deleted; set_bkey_val_u64s(&s_t->k, 0); } } - ret = bch2_btree_delete_at(trans, &iter, 0); + if (!bch2_request_incompat_feature(c, bcachefs_metadata_version_snapshot_deletion_v2)) { + SET_BCH_SNAPSHOT_DELETED(&s->v, true); + s->v.parent = 0; + s->v.children[0] = 0; + s->v.children[1] = 0; + s->v.subvol = 0; + s->v.tree = 0; + s->v.depth = 0; + s->v.skip[0] = 0; + s->v.skip[1] = 0; + s->v.skip[2] = 0; + } else { + s->k.type = KEY_TYPE_deleted; + set_bkey_val_u64s(&s->k, 0); + } err: bch2_trans_iter_exit(trans, &tree_iter); bch2_trans_iter_exit(trans, &p_iter); @@ -1478,6 +1519,9 @@ static int check_should_delete_snapshot(struct btree_trans *trans, struct bkey_s if (BCH_SNAPSHOT_SUBVOL(s.v)) return 0; + if (BCH_SNAPSHOT_DELETED(s.v)) + return 0; + mutex_lock(&d->progress_lock); for (unsigned i = 0; i < 2; i++) { u32 child = le32_to_cpu(s.v->children[i]); @@ -1536,6 +1580,9 @@ static int bch2_fix_child_of_deleted_snapshot(struct btree_trans *trans, struct bkey_i_snapshot *s; int ret; + if (!bch2_snapshot_exists(c, k.k->p.offset)) + return 0; + if (k.k->type != KEY_TYPE_snapshot) return 0; diff --git a/fs/bcachefs/snapshot.h b/fs/bcachefs/snapshot.h index 24a451bb7024..69c484b77729 100644 --- a/fs/bcachefs/snapshot.h +++ b/fs/bcachefs/snapshot.h @@ -120,21 +120,26 @@ static inline u32 bch2_snapshot_root(struct bch_fs *c, u32 id) return id; } -static inline bool __bch2_snapshot_exists(struct bch_fs *c, u32 id) +static inline enum snapshot_id_state __bch2_snapshot_id_state(struct bch_fs *c, u32 id) { const struct snapshot_t *s = snapshot_t(c, id); - return s ? s->live : 0; + return s ? s->state : SNAPSHOT_ID_empty; } -static inline bool bch2_snapshot_exists(struct bch_fs *c, u32 id) +static inline enum snapshot_id_state bch2_snapshot_id_state(struct bch_fs *c, u32 id) { rcu_read_lock(); - bool ret = __bch2_snapshot_exists(c, id); + enum snapshot_id_state ret = __bch2_snapshot_id_state(c, id); rcu_read_unlock(); return ret; } +static inline bool bch2_snapshot_exists(struct bch_fs *c, u32 id) +{ + return bch2_snapshot_id_state(c, id) == SNAPSHOT_ID_live; +} + static inline int bch2_snapshot_is_internal_node(struct bch_fs *c, u32 id) { rcu_read_lock(); @@ -241,7 +246,17 @@ int bch2_snapshot_node_create(struct btree_trans *, u32, int bch2_check_snapshot_trees(struct bch_fs *); int bch2_check_snapshots(struct bch_fs *); int bch2_reconstruct_snapshots(struct bch_fs *); -int bch2_check_key_has_snapshot(struct btree_trans *, struct btree_iter *, struct bkey_s_c); + +int __bch2_check_key_has_snapshot(struct btree_trans *, struct btree_iter *, struct bkey_s_c); + +static inline int bch2_check_key_has_snapshot(struct btree_trans *trans, + struct btree_iter *iter, + struct bkey_s_c k) +{ + return likely(bch2_snapshot_exists(trans->c, k.k->p.snapshot)) + ? 
0 + : __bch2_check_key_has_snapshot(trans, iter, k); +} int bch2_snapshot_node_set_deleted(struct btree_trans *, u32); diff --git a/fs/bcachefs/snapshot_format.h b/fs/bcachefs/snapshot_format.h index 685a9fe209ab..9bccae1f3590 100644 --- a/fs/bcachefs/snapshot_format.h +++ b/fs/bcachefs/snapshot_format.h @@ -16,9 +16,9 @@ struct bch_snapshot { }; LE32_BITMASK(BCH_SNAPSHOT_WILL_DELETE, struct bch_snapshot, flags, 0, 1) - /* True if a subvolume points to this snapshot node: */ LE32_BITMASK(BCH_SNAPSHOT_SUBVOL, struct bch_snapshot, flags, 1, 2) +LE32_BITMASK(BCH_SNAPSHOT_DELETED, struct bch_snapshot, flags, 2, 3) /* * Snapshot trees: diff --git a/fs/bcachefs/snapshot_types.h b/fs/bcachefs/snapshot_types.h index 6a969996e68f..1aa7a58442ae 100644 --- a/fs/bcachefs/snapshot_types.h +++ b/fs/bcachefs/snapshot_types.h @@ -3,8 +3,38 @@ #define _BCACHEFS_SNAPSHOT_TYPES_H #include "bbpos_types.h" +#include "darray.h" #include "subvolume_types.h" +typedef DARRAY(u32) snapshot_id_list; + +#define IS_ANCESTOR_BITMAP 128 + +struct snapshot_t { + enum snapshot_id_state { + SNAPSHOT_ID_empty, + SNAPSHOT_ID_live, + SNAPSHOT_ID_deleted, + } state; + u32 parent; + u32 skip[3]; + u32 depth; + u32 children[2]; + u32 subvol; /* Nonzero only if a subvolume points to this node: */ + u32 tree; + unsigned long is_ancestor[BITS_TO_LONGS(IS_ANCESTOR_BITMAP)]; +}; + +struct snapshot_table { + struct rcu_head rcu; + size_t nr; +#ifndef RUST_BINDGEN + DECLARE_FLEX_ARRAY(struct snapshot_t, s); +#else + struct snapshot_t s[0]; +#endif +}; + struct snapshot_interior_delete { u32 id; u32 live_child; diff --git a/fs/bcachefs/subvolume_types.h b/fs/bcachefs/subvolume_types.h index 1549d6daf7af..9d634b906dcd 100644 --- a/fs/bcachefs/subvolume_types.h +++ b/fs/bcachefs/subvolume_types.h @@ -2,33 +2,6 @@ #ifndef _BCACHEFS_SUBVOLUME_TYPES_H #define _BCACHEFS_SUBVOLUME_TYPES_H -#include "darray.h" - -typedef DARRAY(u32) snapshot_id_list; - -#define IS_ANCESTOR_BITMAP 128 - -struct snapshot_t { - bool live; - u32 parent; - u32 skip[3]; - u32 depth; - u32 children[2]; - u32 subvol; /* Nonzero only if a subvolume points to this node: */ - u32 tree; - unsigned long is_ancestor[BITS_TO_LONGS(IS_ANCESTOR_BITMAP)]; -}; - -struct snapshot_table { - struct rcu_head rcu; - size_t nr; -#ifndef RUST_BINDGEN - DECLARE_FLEX_ARRAY(struct snapshot_t, s); -#else - struct snapshot_t s[0]; -#endif -}; - typedef struct { /* we can't have padding in this struct: */ u64 subvol; -- 2.51.0 From 88f62ed60ceebf387140c8e59df8db827668d09a Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 2 May 2025 13:23:22 -0400 Subject: [PATCH 02/16] bcachefs: delete_dead_snapshot_keys_v2() Since extents, dirents and xattrs require an inode with the corresponding snapshot ID to exists, we can avoid a lot of scanning by only scanning those trees for keys to process if the correspending inode exists. 
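To make the shape of the new walk concrete, here is a minimal userspace sketch of the idea (plain C with toy data structures; struct toy_inode and the dying-ID list are illustrative assumptions, not bcachefs types): rather than walking every snapshot-aware btree end to end as the v1 path does, we walk only the inodes btree, and for each inode whose snapshot ID is being deleted we emit the inum-bounded ranges of the extents/dirents/xattrs btrees that still need processing.

#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

struct toy_inode {
	uint64_t	inum;
	uint32_t	snapshot;
};

/* stand-in for checking the delete_leaves/delete_interior lists */
static bool snapshot_id_dying(const uint32_t *dying, size_t nr, uint32_t id)
{
	for (size_t i = 0; i < nr; i++)
		if (dying[i] == id)
			return true;
	return false;
}

int main(void)
{
	const uint32_t dying[] = { 7, 9 };
	const struct toy_inode inodes[] = {
		{ .inum = 4096, .snapshot = 3 },	/* live snapshot: skipped */
		{ .inum = 4097, .snapshot = 7 },
		{ .inum = 8192, .snapshot = 9 },
	};

	for (size_t i = 0; i < sizeof(inodes) / sizeof(inodes[0]); i++) {
		if (!snapshot_id_dying(dying, sizeof(dying) / sizeof(dying[0]),
				       inodes[i].snapshot))
			continue;
		/*
		 * Extents, dirents and xattrs can't exist without a matching
		 * inode, so only these inum-bounded ranges need scanning:
		 */
		printf("scan extents/dirents/xattrs in [%llu:0, %llu:U64_MAX]\n",
		       (unsigned long long)inodes[i].inum,
		       (unsigned long long)inodes[i].inum);
	}
	return 0;
}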
Signed-off-by: Kent Overstreet --- fs/bcachefs/snapshot.c | 164 ++++++++++++++++++++++++++++++++++------- 1 file changed, 136 insertions(+), 28 deletions(-) diff --git a/fs/bcachefs/snapshot.c b/fs/bcachefs/snapshot.c index a16fa0d8a274..9ec3275c7b0a 100644 --- a/fs/bcachefs/snapshot.c +++ b/fs/bcachefs/snapshot.c @@ -1432,6 +1432,12 @@ static unsigned live_child(struct bch_fs *c, u32 id) return ret; } +static bool snapshot_id_dying(struct snapshot_delete *d, unsigned id) +{ + return snapshot_list_has_id(&d->delete_leaves, id) || + interior_delete_has_id(&d->delete_interior, id) != 0; +} + static int delete_dead_snapshots_process_key(struct btree_trans *trans, struct btree_iter *iter, struct bkey_s_c k) @@ -1500,6 +1506,129 @@ static bool skip_unrelated_snapshot_tree(struct btree_trans *trans, struct btree return ret; } +static int delete_dead_snapshot_keys_v1(struct btree_trans *trans) +{ + struct bch_fs *c = trans->c; + struct snapshot_delete *d = &c->snapshot_delete; + + for (d->pos.btree = 0; d->pos.btree < BTREE_ID_NR; d->pos.btree++) { + struct disk_reservation res = { 0 }; + u64 prev_inum = 0; + + d->pos.pos = POS_MIN; + + if (!btree_type_has_snapshots(d->pos.btree)) + continue; + + int ret = for_each_btree_key_commit(trans, iter, + d->pos.btree, POS_MIN, + BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k, + &res, NULL, BCH_TRANS_COMMIT_no_enospc, ({ + d->pos.pos = iter.pos; + + if (skip_unrelated_snapshot_tree(trans, &iter, &prev_inum)) + continue; + + delete_dead_snapshots_process_key(trans, &iter, k); + })); + + bch2_disk_reservation_put(c, &res); + + if (ret) + return ret; + } + + return 0; +} + +static int delete_dead_snapshot_keys_range(struct btree_trans *trans, enum btree_id btree, + struct bpos start, struct bpos end) +{ + struct bch_fs *c = trans->c; + struct snapshot_delete *d = &c->snapshot_delete; + struct disk_reservation res = { 0 }; + + d->pos.btree = btree; + d->pos.pos = POS_MIN; + + int ret = for_each_btree_key_max_commit(trans, iter, + btree, start, end, + BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k, + &res, NULL, BCH_TRANS_COMMIT_no_enospc, ({ + d->pos.pos = iter.pos; + delete_dead_snapshots_process_key(trans, &iter, k); + })); + + bch2_disk_reservation_put(c, &res); + return ret; +} + +static int delete_dead_snapshot_keys_v2(struct btree_trans *trans) +{ + struct bch_fs *c = trans->c; + struct snapshot_delete *d = &c->snapshot_delete; + struct disk_reservation res = { 0 }; + u64 prev_inum = 0; + int ret = 0; + + struct btree_iter iter; + bch2_trans_iter_init(trans, &iter, BTREE_ID_inodes, POS_MIN, + BTREE_ITER_prefetch|BTREE_ITER_all_snapshots); + + while (1) { + struct bkey_s_c k; + ret = lockrestart_do(trans, + bkey_err(k = bch2_btree_iter_peek(trans, &iter))); + if (ret) + break; + + if (!k.k) + break; + + d->pos.btree = iter.btree_id; + d->pos.pos = iter.pos; + + if (skip_unrelated_snapshot_tree(trans, &iter, &prev_inum)) + continue; + + if (snapshot_id_dying(d, k.k->p.snapshot)) { + struct bpos start = POS(k.k->p.offset, 0); + struct bpos end = POS(k.k->p.offset, U64_MAX); + + ret = delete_dead_snapshot_keys_range(trans, BTREE_ID_extents, start, end) ?: + delete_dead_snapshot_keys_range(trans, BTREE_ID_dirents, start, end) ?: + delete_dead_snapshot_keys_range(trans, BTREE_ID_xattrs, start, end); + if (ret) + break; + + bch2_btree_iter_set_pos(trans, &iter, POS(0, k.k->p.offset + 1)); + } else { + bch2_btree_iter_advance(trans, &iter); + } + } + bch2_trans_iter_exit(trans, &iter); + + if (ret) + goto err; + + prev_inum = 0; + ret = 
for_each_btree_key_commit(trans, iter, + BTREE_ID_inodes, POS_MIN, + BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k, + &res, NULL, BCH_TRANS_COMMIT_no_enospc, ({ + d->pos.btree = iter.btree_id; + d->pos.pos = iter.pos; + + if (skip_unrelated_snapshot_tree(trans, &iter, &prev_inum)) + continue; + + delete_dead_snapshots_process_key(trans, &iter, k); + })); +err: + bch2_disk_reservation_put(c, &res); + return ret; +} + /* * For a given snapshot, if it doesn't have a subvolume that points to it, and * it doesn't have child snapshot nodes - it's now redundant and we can mark it @@ -1683,34 +1812,13 @@ int bch2_delete_dead_snapshots(struct bch_fs *c) goto err; } - for (d->pos.btree = 0; d->pos.btree < BTREE_ID_NR; d->pos.btree++) { - struct disk_reservation res = { 0 }; - u64 prev_inum = 0; - - d->pos.pos = POS_MIN; - - if (!btree_type_has_snapshots(d->pos.btree)) - continue; - - ret = for_each_btree_key_commit(trans, iter, - d->pos.btree, POS_MIN, - BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k, - &res, NULL, BCH_TRANS_COMMIT_no_enospc, ({ - d->pos.pos = iter.pos; - - if (skip_unrelated_snapshot_tree(trans, &iter, &prev_inum)) - continue; - - delete_dead_snapshots_process_key(trans, &iter, k); - })); - - bch2_disk_reservation_put(c, &res); - - if (!bch2_err_matches(ret, EROFS)) - bch_err_msg(c, ret, "deleting keys from dying snapshots"); - if (ret) - goto err; - } + ret = !bch2_request_incompat_feature(c, bcachefs_metadata_version_snapshot_deletion_v2) + ? delete_dead_snapshot_keys_v2(trans) + : delete_dead_snapshot_keys_v1(trans); + if (!bch2_err_matches(ret, EROFS)) + bch_err_msg(c, ret, "deleting keys from dying snapshots"); + if (ret) + goto err; darray_for_each(d->delete_leaves, i) { ret = commit_do(trans, NULL, NULL, 0, -- 2.51.0 From 7d4f2687ef8a625742c5df4e2d42f50ba398f3a2 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 3 May 2025 19:14:54 -0400 Subject: [PATCH 03/16] bcachefs: bch2_journal_write() refactoring Make the locking easier to follow; also take io_refs earlier, in __journal_write_alloc(). 
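The ownership rule being aimed for - a write ref on a device is taken at the moment __journal_write_alloc() picks it, and any path that abandons the write walks the key's pointers and drops exactly those refs - can be modelled in miniature (standalone C with a toy refcount; none of these names are bcachefs APIs):

#include <stdio.h>

struct toy_dev {
	int	ioref;
};

static struct toy_dev devs[3];

/* refs are taken as devices are selected, not when the bio is submitted */
static int pick_devices(int *picked, int want)
{
	int nr = 0;

	for (int i = 0; i < 3 && nr < want; i++) {
		devs[i].ioref++;
		picked[nr++] = i;
	}
	return nr;
}

int main(void)
{
	int picked[3];
	int nr = pick_devices(picked, 2);
	int write_abandoned = 1;	/* pretend allocation or write failed */

	if (write_abandoned)
		for (int i = 0; i < nr; i++)
			devs[picked[i]].ioref--;	/* single teardown path */

	for (int i = 0; i < 3; i++)
		printf("dev %d ioref %d\n", i, devs[i].ioref);
	return 0;
}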
Signed-off-by: Kent Overstreet --- fs/bcachefs/journal_io.c | 110 ++++++++++++++++----------------------- 1 file changed, 46 insertions(+), 64 deletions(-) diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c index be86fd21de2a..c593d77dc8f2 100644 --- a/fs/bcachefs/journal_io.c +++ b/fs/bcachefs/journal_io.c @@ -1467,6 +1467,7 @@ static void journal_advance_devs_to_next_bucket(struct journal *j, { struct bch_fs *c = container_of(j, struct bch_fs, journal); + rcu_read_lock(); darray_for_each(*devs, i) { struct bch_dev *ca = rcu_dereference(c->devs[*i]); if (!ca) @@ -1488,6 +1489,7 @@ static void journal_advance_devs_to_next_bucket(struct journal *j, ja->bucket_seq[ja->cur_idx] = le64_to_cpu(seq); } } + rcu_read_unlock(); } static void __journal_write_alloc(struct journal *j, @@ -1500,7 +1502,8 @@ static void __journal_write_alloc(struct journal *j, struct bch_fs *c = container_of(j, struct bch_fs, journal); darray_for_each(*devs, i) { - struct bch_dev *ca = rcu_dereference(c->devs[*i]); + struct bch_dev *ca = bch2_dev_get_ioref(c, *i, WRITE, + BCH_DEV_WRITE_REF_journal_write); if (!ca) continue; @@ -1514,8 +1517,10 @@ static void __journal_write_alloc(struct journal *j, ca->mi.state != BCH_MEMBER_STATE_rw || !ja->nr || bch2_bkey_has_device_c(bkey_i_to_s_c(&w->key), ca->dev_idx) || - sectors > ja->sectors_free) + sectors > ja->sectors_free) { + enumerated_ref_put(&ca->io_ref[WRITE], BCH_DEV_WRITE_REF_journal_write); continue; + } bch2_dev_stripe_increment(ca, &j->wp.stripe); @@ -1538,15 +1543,8 @@ static void __journal_write_alloc(struct journal *j, } } -/** - * journal_write_alloc - decide where to write next journal entry - * - * @j: journal object - * @w: journal buf (entry to be written) - * - * Returns: 0 on success, or -BCH_ERR_insufficient_devices on failure - */ -static int journal_write_alloc(struct journal *j, struct journal_buf *w) +static int journal_write_alloc(struct journal *j, struct journal_buf *w, + unsigned *replicas) { struct bch_fs *c = container_of(j, struct bch_fs, journal); struct bch_devs_mask devs; @@ -1554,29 +1552,18 @@ static int journal_write_alloc(struct journal *j, struct journal_buf *w) unsigned sectors = vstruct_sectors(w->data, c->block_bits); unsigned target = c->opts.metadata_target ?: c->opts.foreground_target; - unsigned replicas = 0, replicas_want = - READ_ONCE(c->opts.metadata_replicas); + unsigned replicas_want = READ_ONCE(c->opts.metadata_replicas); unsigned replicas_need = min_t(unsigned, replicas_want, READ_ONCE(c->opts.metadata_replicas_required)); bool advance_done = false; - rcu_read_lock(); - - /* We might run more than once if we have to stop and do discards: */ - struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(bkey_i_to_s_c(&w->key)); - bkey_for_each_ptr(ptrs, p) { - struct bch_dev *ca = bch2_dev_rcu_noerror(c, p->dev); - if (ca) - replicas += ca->mi.durability; - } - retry_target: devs = target_rw_devs(c, BCH_DATA_journal, target); devs_sorted = bch2_dev_alloc_list(c, &j->wp.stripe, &devs); retry_alloc: - __journal_write_alloc(j, w, &devs_sorted, sectors, &replicas, replicas_want); + __journal_write_alloc(j, w, &devs_sorted, sectors, replicas, replicas_want); - if (likely(replicas >= replicas_want)) + if (likely(*replicas >= replicas_want)) goto done; if (!advance_done) { @@ -1585,18 +1572,16 @@ retry_alloc: goto retry_alloc; } - if (replicas < replicas_want && target) { + if (*replicas < replicas_want && target) { /* Retry from all devices: */ target = 0; advance_done = false; goto retry_target; } done: - rcu_read_unlock(); - 
BUG_ON(bkey_val_u64s(&w->key.k) > BCH_REPLICAS_MAX); - return replicas >= replicas_need ? 0 : -BCH_ERR_insufficient_journal_devices; + return *replicas >= replicas_need ? 0 : -BCH_ERR_insufficient_journal_devices; } static void journal_buf_realloc(struct journal *j, struct journal_buf *buf) @@ -1782,13 +1767,7 @@ static CLOSURE_CALLBACK(journal_write_submit) unsigned sectors = vstruct_sectors(w->data, c->block_bits); extent_for_each_ptr(bkey_i_to_s_extent(&w->key), ptr) { - struct bch_dev *ca = bch2_dev_get_ioref(c, ptr->dev, WRITE, - BCH_DEV_WRITE_REF_journal_write); - if (!ca) { - /* XXX: fix this */ - bch_err(c, "missing device %u for journal write", ptr->dev); - continue; - } + struct bch_dev *ca = bch2_dev_have_ref(c, ptr->dev); this_cpu_add(ca->io_done->sectors[WRITE][BCH_DATA_journal], sectors); @@ -2074,7 +2053,8 @@ CLOSURE_CALLBACK(bch2_journal_write) ret = bch2_journal_write_pick_flush(j, w); spin_unlock(&j->lock); - if (ret) + + if (unlikely(ret)) goto err; mutex_lock(&j->buf_lock); @@ -2082,43 +2062,30 @@ CLOSURE_CALLBACK(bch2_journal_write) ret = bch2_journal_write_prep(j, w); mutex_unlock(&j->buf_lock); - if (ret) - goto err; - j->entry_bytes_written += vstruct_bytes(w->data); + if (unlikely(ret)) + goto err; + unsigned replicas_allocated = 0; while (1) { - spin_lock(&j->lock); - ret = journal_write_alloc(j, w); + ret = journal_write_alloc(j, w, &replicas_allocated); if (!ret || !j->can_discard) break; - spin_unlock(&j->lock); bch2_journal_do_discards(j); } - if (ret && !bch2_journal_error(j)) { - struct printbuf buf = PRINTBUF; - buf.atomic++; - - __bch2_journal_debug_to_text(&buf, j); - spin_unlock(&j->lock); - prt_printf(&buf, bch2_fmt(c, "Unable to allocate journal write at seq %llu for %zu sectors: %s"), - le64_to_cpu(w->data->seq), - vstruct_sectors(w->data, c->block_bits), - bch2_err_str(ret)); - bch2_print_str(c, KERN_ERR, buf.buf); - printbuf_exit(&buf); - } - if (ret) - goto err; + if (unlikely(ret)) + goto err_allocate_write; + spin_lock(&j->lock); /* * write is allocated, no longer need to account for it in * bch2_journal_space_available(): */ w->sectors = 0; w->write_allocated = true; + j->entry_bytes_written += vstruct_bytes(w->data); /* * journal entry has been compacted and allocated, recalculate space @@ -2130,9 +2097,6 @@ CLOSURE_CALLBACK(bch2_journal_write) w->devs_written = bch2_bkey_devs(bkey_i_to_s_c(&w->key)); - if (c->opts.nochanges) - goto no_io; - /* * Mark journal replicas before we submit the write to guarantee * recovery will find the journal entries after a crash. 
@@ -2143,15 +2107,33 @@ CLOSURE_CALLBACK(bch2_journal_write) if (ret) goto err; + if (c->opts.nochanges) + goto no_io; + if (!JSET_NO_FLUSH(w->data)) continue_at(cl, journal_write_preflush, j->wq); else continue_at(cl, journal_write_submit, j->wq); return; -no_io: - continue_at(cl, journal_write_done, j->wq); - return; +err_allocate_write: + if (!bch2_journal_error(j)) { + struct printbuf buf = PRINTBUF; + + bch2_journal_debug_to_text(&buf, j); + prt_printf(&buf, bch2_fmt(c, "Unable to allocate journal write at seq %llu for %zu sectors: %s"), + le64_to_cpu(w->data->seq), + vstruct_sectors(w->data, c->block_bits), + bch2_err_str(ret)); + bch2_print_str(c, KERN_ERR, buf.buf); + printbuf_exit(&buf); + } err: bch2_fatal_error(c); +no_io: + extent_for_each_ptr(bkey_i_to_s_extent(&w->key), ptr) { + struct bch_dev *ca = bch2_dev_have_ref(c, ptr->dev); + enumerated_ref_put(&ca->io_ref[WRITE], BCH_DEV_WRITE_REF_journal_write); + } + continue_at(cl, journal_write_done, j->wq); } -- 2.51.0 From e02888faab24494f91016d578e3dc9dce81e3d71 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 4 May 2025 14:45:54 -0400 Subject: [PATCH 04/16] bcachefs: bch2_dev_in_target() no longer takes rcu_read_lock() Minor optimization, the caller generally has it already. Signed-off-by: Kent Overstreet --- fs/bcachefs/disk_groups.c | 6 +----- fs/bcachefs/rebalance.c | 10 ++++++++-- 2 files changed, 9 insertions(+), 7 deletions(-) diff --git a/fs/bcachefs/disk_groups.c b/fs/bcachefs/disk_groups.c index c1a2a957c884..c20ecf5e5381 100644 --- a/fs/bcachefs/disk_groups.c +++ b/fs/bcachefs/disk_groups.c @@ -212,17 +212,13 @@ bool bch2_dev_in_target(struct bch_fs *c, unsigned dev, unsigned target) case TARGET_DEV: return dev == t.dev; case TARGET_GROUP: { - rcu_read_lock(); struct bch_disk_groups_cpu *g = rcu_dereference(c->disk_groups); const struct bch_devs_mask *m = g && t.group < g->nr && !g->entries[t.group].deleted ? &g->entries[t.group].devs : NULL; - bool ret = m ? test_bit(dev, m->d) : false; - rcu_read_unlock(); - - return ret; + return m ? test_bit(dev, m->d) : false; } default: BUG(); diff --git a/fs/bcachefs/rebalance.c b/fs/bcachefs/rebalance.c index 26c87ab019e8..7bcebcac2e1a 100644 --- a/fs/bcachefs/rebalance.c +++ b/fs/bcachefs/rebalance.c @@ -80,11 +80,13 @@ static inline unsigned bch2_bkey_ptrs_need_move(struct bch_fs *c, unsigned ptr_bit = 1; unsigned rewrite_ptrs = 0; + rcu_read_lock(); bkey_for_each_ptr(ptrs, ptr) { if (!ptr->cached && !bch2_dev_in_target(c, ptr->dev, opts->background_target)) rewrite_ptrs |= ptr_bit; ptr_bit <<= 1; } + rcu_read_unlock(); return rewrite_ptrs; } @@ -132,10 +134,14 @@ u64 bch2_bkey_sectors_need_rebalance(struct bch_fs *c, struct bkey_s_c k) } } incompressible: - if (opts->background_target) + if (opts->background_target) { + rcu_read_lock(); bkey_for_each_ptr_decode(k.k, ptrs, p, entry) - if (!p.ptr.cached && !bch2_dev_in_target(c, p.ptr.dev, opts->background_target)) + if (!p.ptr.cached && + !bch2_dev_in_target(c, p.ptr.dev, opts->background_target)) sectors += p.crc.compressed_size; + rcu_read_unlock(); + } return sectors; } -- 2.51.0 From 84bd6afee121b9e9bcc26f88cb55e0ee5c7a8f56 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 4 May 2025 16:24:43 -0400 Subject: [PATCH 05/16] bcachefs: inline bch2_ob_ptr() This was an oversight, we want bch2_alloc_sectors_append_ptrs_inlined() fully inlined. 
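The rationale is the usual translation-unit rule: a definition living in alloc_foreground.c is opaque to callers in other files (absent LTO), while a static inline definition in the header is visible at every call site, so the append-ptrs fast path can be folded completely. A generic sketch of the pattern, with made-up names rather than the real declarations:

#include <stdio.h>

/*
 * Previously an out-of-line definition in its own .c file; now it lives
 * in the shared header as a static inline, so every includer sees the body:
 */
static inline int make_value(int gen, int dev)
{
	return (gen << 8) | dev;
}

/* hot-path caller that we want fully inlined, including the helper above */
static inline int append_value(int gen, int dev)
{
	return make_value(gen, dev) + 1;
}

int main(void)
{
	printf("%d\n", append_value(3, 7));
	return 0;
}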
Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_foreground.c | 14 -------------- fs/bcachefs/alloc_foreground.h | 15 ++++++++++++++- 2 files changed, 14 insertions(+), 15 deletions(-) diff --git a/fs/bcachefs/alloc_foreground.c b/fs/bcachefs/alloc_foreground.c index 2d7f32f9499e..b50846da7ae4 100644 --- a/fs/bcachefs/alloc_foreground.c +++ b/fs/bcachefs/alloc_foreground.c @@ -1388,20 +1388,6 @@ err: return ret; } -struct bch_extent_ptr bch2_ob_ptr(struct bch_fs *c, struct open_bucket *ob) -{ - struct bch_dev *ca = ob_dev(c, ob); - - return (struct bch_extent_ptr) { - .type = 1 << BCH_EXTENT_ENTRY_ptr, - .gen = ob->gen, - .dev = ob->dev, - .offset = bucket_to_sector(ca, ob->bucket) + - ca->mi.bucket_size - - ob->sectors_free, - }; -} - void bch2_alloc_sectors_append_ptrs(struct bch_fs *c, struct write_point *wp, struct bkey_i *k, unsigned sectors, bool cached) diff --git a/fs/bcachefs/alloc_foreground.h b/fs/bcachefs/alloc_foreground.h index 192203410d4e..2e01c7b61ed1 100644 --- a/fs/bcachefs/alloc_foreground.h +++ b/fs/bcachefs/alloc_foreground.h @@ -3,6 +3,7 @@ #define _BCACHEFS_ALLOC_FOREGROUND_H #include "bcachefs.h" +#include "buckets.h" #include "alloc_types.h" #include "extents.h" #include "io_write_types.h" @@ -233,7 +234,19 @@ int bch2_alloc_sectors_start_trans(struct btree_trans *, struct closure *, struct write_point **); -struct bch_extent_ptr bch2_ob_ptr(struct bch_fs *, struct open_bucket *); +static inline struct bch_extent_ptr bch2_ob_ptr(struct bch_fs *c, struct open_bucket *ob) +{ + struct bch_dev *ca = ob_dev(c, ob); + + return (struct bch_extent_ptr) { + .type = 1 << BCH_EXTENT_ENTRY_ptr, + .gen = ob->gen, + .dev = ob->dev, + .offset = bucket_to_sector(ca, ob->bucket) + + ca->mi.bucket_size - + ob->sectors_free, + }; +} /* * Append pointers to the space we just allocated to @k, and mark @sectors space -- 2.51.0 From fbe728f9569b683564421a9190be53e60111a864 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 5 May 2025 21:15:34 -0400 Subject: [PATCH 06/16] bcachefs: improve check_inode_hash_info_matches_root() error message Signed-off-by: Kent Overstreet --- fs/bcachefs/str_hash.c | 31 +++++++++++++++++++------------ fs/bcachefs/str_hash.h | 8 +++++--- 2 files changed, 24 insertions(+), 15 deletions(-) diff --git a/fs/bcachefs/str_hash.c b/fs/bcachefs/str_hash.c index a90bf7b8a2b4..55a3a116b5a8 100644 --- a/fs/bcachefs/str_hash.c +++ b/fs/bcachefs/str_hash.c @@ -157,6 +157,8 @@ static noinline int check_inode_hash_info_matches_root(struct btree_trans *trans if (bkey_is_inode(k.k)) goto found; } + + /* This would've been caught by check_key_has_inode() */ bch_err(c, "%s(): inum %llu not found", __func__, inum); ret = -BCH_ERR_fsck_repair_unimplemented; goto err; @@ -166,20 +168,25 @@ found:; if (ret) goto err; - struct bch_hash_info hash2 = bch2_hash_info_init(c, &inode); - if (hash_info->type != hash2.type || - memcmp(&hash_info->siphash_key, &hash2.siphash_key, sizeof(hash2.siphash_key))) { + struct bch_hash_info hash_root = bch2_hash_info_init(c, &inode); + if (hash_info->type != hash_root.type || + memcmp(&hash_info->siphash_key, + &hash_root.siphash_key, + sizeof(hash_root.siphash_key))) { ret = repair_inode_hash_info(trans, &inode); if (!ret) { - bch_err(c, "inode hash info mismatch with root, but mismatch not found\n" - "%u %llx %llx\n" - "%u %llx %llx", - hash_info->type, - hash_info->siphash_key.k0, - hash_info->siphash_key.k1, - hash2.type, - hash2.siphash_key.k0, - hash2.siphash_key.k1); + struct printbuf buf = PRINTBUF; + prt_printf(&buf, "inode %llu 
hash info mismatch with root, but mismatch not found\n", inum); + + prt_printf(&buf, "root snapshot %u ", hash_root.inum_snapshot); + bch2_prt_str_hash_type(&buf, hash_root.type); + prt_printf(&buf, " %llx %llx\n", hash_root.siphash_key.k0, hash_root.siphash_key.k1); + + prt_printf(&buf, "vs snapshot %u ", hash_info->inum_snapshot); + bch2_prt_str_hash_type(&buf, hash_info->type); + prt_printf(&buf, " %llx %llx", hash_info->siphash_key.k0, hash_info->siphash_key.k1); + bch_err(c, "%s", buf.buf); + printbuf_exit(&buf); ret = -BCH_ERR_fsck_repair_unimplemented; } } diff --git a/fs/bcachefs/str_hash.h b/fs/bcachefs/str_hash.h index 0c1a00539bd1..ae3154fb6a94 100644 --- a/fs/bcachefs/str_hash.h +++ b/fs/bcachefs/str_hash.h @@ -32,6 +32,7 @@ bch2_str_hash_opt_to_type(struct bch_fs *c, enum bch_str_hash_opts opt) } struct bch_hash_info { + u32 inum_snapshot; u8 type; struct unicode_map *cf_encoding; /* @@ -45,11 +46,12 @@ static inline struct bch_hash_info bch2_hash_info_init(struct bch_fs *c, const struct bch_inode_unpacked *bi) { struct bch_hash_info info = { - .type = INODE_STR_HASH(bi), + .inum_snapshot = bi->bi_snapshot, + .type = INODE_STR_HASH(bi), #ifdef CONFIG_UNICODE - .cf_encoding = bch2_inode_casefold(c, bi) ? c->cf_encoding : NULL, + .cf_encoding = bch2_inode_casefold(c, bi) ? c->cf_encoding : NULL, #endif - .siphash_key = { .k0 = bi->bi_hash_seed } + .siphash_key = { .k0 = bi->bi_hash_seed } }; if (unlikely(info.type == BCH_STR_HASH_siphash_old)) { -- 2.51.0 From 39430cfd27ed2e1243374ea28479773506e119c3 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 4 May 2025 15:02:53 -0400 Subject: [PATCH 07/16] bcachefs: Improve bch2_extent_ptr_set_cached() Preferentially keep existing cached pointers instead of adding new ones. Signed-off-by: Kent Overstreet --- fs/bcachefs/extents.c | 55 ++++++++++++++++++++++++++++--------------- 1 file changed, 36 insertions(+), 19 deletions(-) diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c index c4fe4ffd41f1..d3af841e48ef 100644 --- a/fs/bcachefs/extents.c +++ b/fs/bcachefs/extents.c @@ -1136,33 +1136,50 @@ void bch2_extent_ptr_set_cached(struct bch_fs *c, struct bkey_s k, struct bch_extent_ptr *ptr) { - struct bkey_ptrs ptrs = bch2_bkey_ptrs(k); + struct bkey_ptrs ptrs; union bch_extent_entry *entry; struct extent_ptr_decoded p; + bool have_cached_ptr; + unsigned drop_dev = ptr->dev; rcu_read_lock(); - if (!want_cached_ptr(c, opts, ptr)) { - bch2_bkey_drop_ptr_noerror(k, ptr); - goto out; - } +restart_drop_ptrs: + ptrs = bch2_bkey_ptrs(k); + have_cached_ptr = false; - /* - * Stripes can't contain cached data, for - reasons. - * - * Possibly something we can fix in the future? - */ - bkey_for_each_ptr_decode(k.k, ptrs, p, entry) - if (&entry->ptr == ptr) { - if (p.has_ec) - bch2_bkey_drop_ptr_noerror(k, ptr); - else - ptr->cached = true; - goto out; + bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { + /* + * Check if it's erasure coded - stripes can't contain cached + * data. Possibly something we can fix in the future? 
+ */ + if (&entry->ptr == ptr && p.has_ec) + goto drop; + + if (p.ptr.cached) { + if (have_cached_ptr || !want_cached_ptr(c, opts, &p.ptr)) { + bch2_bkey_drop_ptr_noerror(k, &entry->ptr); + ptr = NULL; + goto restart_drop_ptrs; + } + + have_cached_ptr = true; } + } + + if (!ptr) + bkey_for_each_ptr(ptrs, ptr2) + if (ptr2->dev == drop_dev) + ptr = ptr2; - BUG(); -out: + if (have_cached_ptr || !want_cached_ptr(c, opts, ptr)) + goto drop; + + ptr->cached = true; + rcu_read_unlock(); + return; +drop: rcu_read_unlock(); + bch2_bkey_drop_ptr_noerror(k, ptr); } /* -- 2.51.0 From 502222041c810b5d5ba5d45512e0a131c7f07d0a Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 5 May 2025 21:19:17 -0400 Subject: [PATCH 08/16] bcachefs: __bch2_fs_free() cleanup Signed-off-by: Kent Overstreet --- fs/bcachefs/super.c | 33 +++++++++++++++++---------------- 1 file changed, 17 insertions(+), 16 deletions(-) diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index 45e2b2bc8c65..7c6ea43b4347 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -579,35 +579,36 @@ static void __bch2_fs_free(struct bch_fs *c) bch2_find_btree_nodes_exit(&c->found_btree_nodes); bch2_free_pending_node_rewrites(c); bch2_free_fsck_errs(c); - bch2_fs_accounting_exit(c); - bch2_fs_async_obj_exit(c); - bch2_fs_sb_errors_exit(c); - bch2_fs_counters_exit(c); + bch2_fs_vfs_exit(c); bch2_fs_snapshots_exit(c); + bch2_fs_sb_errors_exit(c); + bch2_fs_replicas_exit(c); bch2_fs_quota_exit(c); + bch2_fs_nocow_locking_exit(c); + bch2_fs_journal_exit(&c->journal); bch2_fs_fs_io_direct_exit(c); bch2_fs_fs_io_buffered_exit(c); bch2_fs_fsio_exit(c); - bch2_fs_vfs_exit(c); - bch2_fs_ec_exit(c); - bch2_fs_encryption_exit(c); - bch2_fs_nocow_locking_exit(c); bch2_fs_io_write_exit(c); bch2_fs_io_read_exit(c); + bch2_fs_encryption_exit(c); + bch2_fs_ec_exit(c); + bch2_fs_counters_exit(c); + bch2_fs_compress_exit(c); + bch2_io_clock_exit(&c->io_clock[WRITE]); + bch2_io_clock_exit(&c->io_clock[READ]); bch2_fs_buckets_waiting_for_journal_exit(c); - bch2_fs_btree_interior_update_exit(c); + bch2_fs_btree_write_buffer_exit(c); bch2_fs_btree_key_cache_exit(&c->btree_key_cache); - bch2_fs_btree_cache_exit(c); bch2_fs_btree_iter_exit(c); - bch2_fs_replicas_exit(c); - bch2_fs_journal_exit(&c->journal); - bch2_io_clock_exit(&c->io_clock[WRITE]); - bch2_io_clock_exit(&c->io_clock[READ]); - bch2_fs_compress_exit(c); + bch2_fs_btree_interior_update_exit(c); + bch2_fs_btree_cache_exit(c); + bch2_fs_accounting_exit(c); + bch2_fs_async_obj_exit(c); bch2_journal_keys_put_initial(c); bch2_find_btree_nodes_exit(&c->found_btree_nodes); + BUG_ON(atomic_read(&c->journal_keys.ref)); - bch2_fs_btree_write_buffer_exit(c); percpu_free_rwsem(&c->mark_lock); if (c->online_reserved) { u64 v = percpu_u64_get(c->online_reserved); -- 2.51.0 From 96fc7d8adb7881f7ffffcd6ab2be3b43fc5a5978 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 5 May 2025 20:35:36 -0400 Subject: [PATCH 09/16] bcachefs: opts.rebalance_on_ac_only Add an option for setting rebalance to only run when connected to mains power. 
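The new option composes with the existing rebalance_enabled check: rebalance may run only if it is enabled and not currently battery-gated. A minimal standalone model of that predicate (toy option struct, not the bcachefs types):

#include <stdbool.h>
#include <stdio.h>

struct toy_opts {
	bool	rebalance_enabled;
	bool	rebalance_on_ac_only;
};

/* same shape as the gate the rebalance thread waits on */
static bool rebalance_may_run(const struct toy_opts *opts, bool on_battery)
{
	return opts->rebalance_enabled &&
	       !(opts->rebalance_on_ac_only && on_battery);
}

int main(void)
{
	const struct toy_opts opts = {
		.rebalance_enabled	= true,
		.rebalance_on_ac_only	= true,
	};

	printf("on AC:      %d\n", rebalance_may_run(&opts, false));	/* 1 */
	printf("on battery: %d\n", rebalance_may_run(&opts, true));	/* 0 */
	return 0;
}

The on_battery flag is flipped by the power-supply notifier registered in bch2_fs_rebalance_init(), which then wakes the rebalance thread so the new state takes effect immediately.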
Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs_format.h | 1 + fs/bcachefs/opts.h | 5 ++++ fs/bcachefs/rebalance.c | 48 ++++++++++++++++++++++++++++++++--- fs/bcachefs/rebalance.h | 4 ++- fs/bcachefs/rebalance_types.h | 5 ++++ fs/bcachefs/super.c | 3 ++- 6 files changed, 60 insertions(+), 6 deletions(-) diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h index 0beff6af7ecf..061fa2666f35 100644 --- a/fs/bcachefs/bcachefs_format.h +++ b/fs/bcachefs/bcachefs_format.h @@ -870,6 +870,7 @@ LE64_BITMASK(BCH_SB_WRITE_ERROR_TIMEOUT,struct bch_sb, flags[6], 4, 14); LE64_BITMASK(BCH_SB_CSUM_ERR_RETRY_NR, struct bch_sb, flags[6], 14, 20); LE64_BITMASK(BCH_SB_DEGRADED_ACTION, struct bch_sb, flags[6], 20, 22); LE64_BITMASK(BCH_SB_CASEFOLD, struct bch_sb, flags[6], 22, 23); +LE64_BITMASK(BCH_SB_REBALANCE_AC_ONLY, struct bch_sb, flags[6], 23, 24); static inline __u64 BCH_SB_COMPRESSION_TYPE(const struct bch_sb *sb) { diff --git a/fs/bcachefs/opts.h b/fs/bcachefs/opts.h index b8cd0b04e62a..f4c014ad43c1 100644 --- a/fs/bcachefs/opts.h +++ b/fs/bcachefs/opts.h @@ -490,6 +490,11 @@ enum fsck_err_opts { BCH2_NO_SB_OPT, true, \ NULL, "Enable rebalance: disable for debugging, or to\n"\ "quiet the system when doing performance testing\n")\ + x(rebalance_on_ac_only, u8, \ + OPT_FS|OPT_MOUNT|OPT_RUNTIME, \ + OPT_BOOL(), \ + BCH_SB_REBALANCE_AC_ONLY, false, \ + NULL, "Enable rebalance while on mains power only\n") \ x(no_data_io, u8, \ OPT_MOUNT, \ OPT_BOOL(), \ diff --git a/fs/bcachefs/rebalance.c b/fs/bcachefs/rebalance.c index 7bcebcac2e1a..8fefe2b174c2 100644 --- a/fs/bcachefs/rebalance.c +++ b/fs/bcachefs/rebalance.c @@ -518,6 +518,13 @@ static void rebalance_wait(struct bch_fs *c) bch2_kthread_io_clock_wait(clock, r->wait_iotime_end, MAX_SCHEDULE_TIMEOUT); } +static bool bch2_rebalance_enabled(struct bch_fs *c) +{ + return c->opts.rebalance_enabled && + !(c->opts.rebalance_on_ac_only && + c->rebalance.on_battery); +} + static int do_rebalance(struct moving_context *ctxt) { struct btree_trans *trans = ctxt->trans; @@ -537,9 +544,9 @@ static int do_rebalance(struct moving_context *ctxt) BTREE_ITER_all_snapshots); while (!bch2_move_ratelimit(ctxt)) { - if (!c->opts.rebalance_enabled) { + if (!bch2_rebalance_enabled(c)) { bch2_moving_ctxt_flush_all(ctxt); - kthread_wait_freezable(c->opts.rebalance_enabled || + kthread_wait_freezable(bch2_rebalance_enabled(c) || kthread_should_stop()); } @@ -714,9 +721,42 @@ int bch2_rebalance_start(struct bch_fs *c) return 0; } -void bch2_fs_rebalance_init(struct bch_fs *c) +#ifdef CONFIG_POWER_SUPPLY +#include + +static int bch2_rebalance_power_notifier(struct notifier_block *nb, + unsigned long event, void *data) +{ + struct bch_fs *c = container_of(nb, struct bch_fs, rebalance.power_notifier); + + c->rebalance.on_battery = !power_supply_is_system_supplied(); + bch2_rebalance_wakeup(c); + return NOTIFY_OK; +} +#endif + +void bch2_fs_rebalance_exit(struct bch_fs *c) { - bch2_pd_controller_init(&c->rebalance.pd); +#ifdef CONFIG_POWER_SUPPLY + power_supply_unreg_notifier(&c->rebalance.power_notifier); +#endif +} + +int bch2_fs_rebalance_init(struct bch_fs *c) +{ + struct bch_fs_rebalance *r = &c->rebalance; + + bch2_pd_controller_init(&r->pd); + +#ifdef CONFIG_POWER_SUPPLY + r->power_notifier.notifier_call = bch2_rebalance_power_notifier; + int ret = power_supply_reg_notifier(&r->power_notifier); + if (ret) + return ret; + + r->on_battery = !power_supply_is_system_supplied(); +#endif + return 0; } static int check_rebalance_work_one(struct btree_trans 
*trans, diff --git a/fs/bcachefs/rebalance.h b/fs/bcachefs/rebalance.h index b7c8c0652ad6..5d9214fe1a22 100644 --- a/fs/bcachefs/rebalance.h +++ b/fs/bcachefs/rebalance.h @@ -52,7 +52,9 @@ void bch2_rebalance_status_to_text(struct printbuf *, struct bch_fs *); void bch2_rebalance_stop(struct bch_fs *); int bch2_rebalance_start(struct bch_fs *); -void bch2_fs_rebalance_init(struct bch_fs *); + +void bch2_fs_rebalance_exit(struct bch_fs *); +int bch2_fs_rebalance_init(struct bch_fs *); int bch2_check_rebalance_work(struct bch_fs *); diff --git a/fs/bcachefs/rebalance_types.h b/fs/bcachefs/rebalance_types.h index fe5098c17dfc..33d77286f1d5 100644 --- a/fs/bcachefs/rebalance_types.h +++ b/fs/bcachefs/rebalance_types.h @@ -30,6 +30,11 @@ struct bch_fs_rebalance { struct bbpos scan_start; struct bbpos scan_end; struct bch_move_stats scan_stats; + + bool on_battery; +#ifdef CONFIG_POWER_SUPPLY + struct notifier_block power_notifier; +#endif }; #endif /* _BCACHEFS_REBALANCE_TYPES_H */ diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index 7c6ea43b4347..bd0565b7a9ba 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -583,6 +583,7 @@ static void __bch2_fs_free(struct bch_fs *c) bch2_fs_snapshots_exit(c); bch2_fs_sb_errors_exit(c); bch2_fs_replicas_exit(c); + bch2_fs_rebalance_exit(c); bch2_fs_quota_exit(c); bch2_fs_nocow_locking_exit(c); bch2_fs_journal_exit(&c->journal); @@ -867,7 +868,6 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts *opts, bch2_fs_move_init(c); bch2_fs_nocow_locking_init_early(c); bch2_fs_quota_init(c); - bch2_fs_rebalance_init(c); bch2_fs_sb_errors_init_early(c); bch2_fs_snapshots_init_early(c); bch2_fs_subvolumes_init_early(c); @@ -989,6 +989,7 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts *opts, bch2_fs_fsio_init(c) ?: bch2_fs_fs_io_direct_init(c) ?: bch2_fs_io_read_init(c) ?: + bch2_fs_rebalance_init(c) ?: bch2_fs_sb_errors_init(c) ?: bch2_fs_vfs_init(c); if (ret) -- 2.51.0 From 66e9a7f13916fb0630cc48e0c21c474607aa3967 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 6 May 2025 00:51:39 -0400 Subject: [PATCH 10/16] bcachefs: bch2_dev_remove_stripes() respects degraded flags Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_background.c | 3 +- fs/bcachefs/ec.c | 77 ++++++++++++++++++++++++---------- fs/bcachefs/ec.h | 4 +- fs/bcachefs/super.c | 3 +- 4 files changed, 62 insertions(+), 25 deletions(-) diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index 002e3853f8cf..81e2ae4bb400 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -2442,8 +2442,7 @@ int bch2_dev_remove_alloc(struct bch_fs *c, struct bch_dev *ca) * We clear the LRU and need_discard btrees first so that we don't race * with bch2_do_invalidates() and bch2_do_discards() */ - ret = bch2_dev_remove_stripes(c, ca->dev_idx) ?: - bch2_btree_delete_range(c, BTREE_ID_lru, start, end, + ret = bch2_btree_delete_range(c, BTREE_ID_lru, start, end, BTREE_TRIGGER_norun, NULL) ?: bch2_btree_delete_range(c, BTREE_ID_need_discard, start, end, BTREE_TRIGGER_norun, NULL) ?: diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c index dcd4e2266d34..bf5f4f6283a4 100644 --- a/fs/bcachefs/ec.c +++ b/fs/bcachefs/ec.c @@ -2106,23 +2106,17 @@ err: /* device removal */ -static int bch2_invalidate_stripe_to_dev(struct btree_trans *trans, struct bkey_s_c k_a) +int bch2_invalidate_stripe_to_dev(struct btree_trans *trans, + struct btree_iter *iter, + struct bkey_s_c k, + unsigned dev_idx, + unsigned flags) { 
- struct bch_alloc_v4 a_convert; - const struct bch_alloc_v4 *a = bch2_alloc_to_v4(k_a, &a_convert); - - if (!a->stripe) + if (k.k->type != KEY_TYPE_stripe) return 0; - if (a->stripe_sectors) { - bch_err(trans->c, "trying to invalidate device in stripe when bucket has stripe data"); - return -BCH_ERR_invalidate_stripe_to_dev; - } - - struct btree_iter iter; struct bkey_i_stripe *s = - bch2_bkey_get_mut_typed(trans, &iter, BTREE_ID_stripes, POS(0, a->stripe), - BTREE_ITER_slots, stripe); + bch2_bkey_make_mut_typed(trans, iter, &k, 0, stripe); int ret = PTR_ERR_OR_ZERO(s); if (ret) return ret; @@ -2139,35 +2133,76 @@ static int bch2_invalidate_stripe_to_dev(struct btree_trans *trans, struct bkey_ acc.replicas.data_type = BCH_DATA_user; ret = bch2_disk_accounting_mod(trans, &acc, §ors, 1, false); if (ret) - goto err; + return ret; struct bkey_ptrs ptrs = bch2_bkey_ptrs(bkey_i_to_s(&s->k_i)); - bkey_for_each_ptr(ptrs, ptr) - if (ptr->dev == k_a.k->p.inode) + + /* XXX: how much redundancy do we still have? check degraded flags */ + + unsigned nr_good = 0; + + rcu_read_lock(); + bkey_for_each_ptr(ptrs, ptr) { + if (ptr->dev == dev_idx) ptr->dev = BCH_SB_MEMBER_INVALID; + struct bch_dev *ca = bch2_dev_rcu(trans->c, ptr->dev); + nr_good += ca && ca->mi.state != BCH_MEMBER_STATE_failed; + } + rcu_read_unlock(); + + if (nr_good < s->v.nr_blocks && !(flags & BCH_FORCE_IF_DATA_DEGRADED)) + return -BCH_ERR_remove_would_lose_data; + + unsigned nr_data = s->v.nr_blocks - s->v.nr_redundant; + + if (nr_good < nr_data && !(flags & BCH_FORCE_IF_DATA_LOST)) + return -BCH_ERR_remove_would_lose_data; + sectors = -sectors; memset(&acc, 0, sizeof(acc)); acc.type = BCH_DISK_ACCOUNTING_replicas; bch2_bkey_to_replicas(&acc.replicas, bkey_i_to_s_c(&s->k_i)); acc.replicas.data_type = BCH_DATA_user; - ret = bch2_disk_accounting_mod(trans, &acc, §ors, 1, false); + return bch2_disk_accounting_mod(trans, &acc, §ors, 1, false); +} + +static int bch2_invalidate_stripe_to_dev_from_alloc(struct btree_trans *trans, struct bkey_s_c k_a, + unsigned flags) +{ + struct bch_alloc_v4 a_convert; + const struct bch_alloc_v4 *a = bch2_alloc_to_v4(k_a, &a_convert); + + if (!a->stripe) + return 0; + + if (a->stripe_sectors) { + bch_err(trans->c, "trying to invalidate device in stripe when bucket has stripe data"); + return -BCH_ERR_invalidate_stripe_to_dev; + } + + struct btree_iter iter; + struct bkey_s_c_stripe s = + bch2_bkey_get_iter_typed(trans, &iter, BTREE_ID_stripes, POS(0, a->stripe), + BTREE_ITER_slots, stripe); + int ret = bkey_err(s); if (ret) - goto err; -err: + return ret; + + ret = bch2_invalidate_stripe_to_dev(trans, &iter, s.s_c, k_a.k->p.inode, flags); bch2_trans_iter_exit(trans, &iter); return ret; } -int bch2_dev_remove_stripes(struct bch_fs *c, unsigned dev_idx) +int bch2_dev_remove_stripes(struct bch_fs *c, unsigned dev_idx, unsigned flags) { return bch2_trans_run(c, for_each_btree_key_max_commit(trans, iter, BTREE_ID_alloc, POS(dev_idx, 0), POS(dev_idx, U64_MAX), BTREE_ITER_intent, k, NULL, NULL, 0, ({ - bch2_invalidate_stripe_to_dev(trans, k); + bch2_invalidate_stripe_to_dev_from_alloc(trans, k, flags); }))); } diff --git a/fs/bcachefs/ec.h b/fs/bcachefs/ec.h index 83d37bcb548a..548048adf0d5 100644 --- a/fs/bcachefs/ec.h +++ b/fs/bcachefs/ec.h @@ -288,7 +288,9 @@ static inline void ec_stripe_new_put(struct bch_fs *c, struct ec_stripe_new *s, } } -int bch2_dev_remove_stripes(struct bch_fs *, unsigned); +int bch2_invalidate_stripe_to_dev(struct btree_trans *, struct btree_iter *, + struct bkey_s_c, unsigned, 
unsigned); +int bch2_dev_remove_stripes(struct bch_fs *, unsigned, unsigned); void bch2_ec_stop_dev(struct bch_fs *, struct bch_dev *); void bch2_fs_ec_stop(struct bch_fs *); diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index bd0565b7a9ba..faa012107a97 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -1744,7 +1744,8 @@ int bch2_dev_remove(struct bch_fs *c, struct bch_dev *ca, int flags) __bch2_dev_read_only(c, ca); - ret = bch2_dev_data_drop(c, ca->dev_idx, flags); + ret = bch2_dev_data_drop(c, ca->dev_idx, flags) ?: + bch2_dev_remove_stripes(c, ca->dev_idx, flags); bch_err_msg(ca, ret, "bch2_dev_data_drop()"); if (ret) goto err; -- 2.51.0 From b3f80d09236e4915d7deedfaf15d7bdaef6f14d2 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 3 May 2025 21:55:26 -0400 Subject: [PATCH 11/16] bcachefs: BCH_SB_MEMBER_DELETED_UUID Add a sentinal value for devices that have been removed, but don't want to reuse their index until a fsck has completed. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_gc.c | 4 ++++ fs/bcachefs/sb-members.c | 29 ++++++++++++++++++++++++++++- fs/bcachefs/sb-members.h | 4 +++- fs/bcachefs/sb-members_format.h | 4 ++++ 4 files changed, 39 insertions(+), 2 deletions(-) diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c index 92ae31737a24..dd08ec080313 100644 --- a/fs/bcachefs/btree_gc.c +++ b/fs/bcachefs/btree_gc.c @@ -1079,6 +1079,10 @@ out: * allocator thread - issue wakeup in case they blocked on gc_lock: */ closure_wake_up(&c->freelist_wait); + + if (!ret && !test_bit(BCH_FS_errors_not_fixed, &c->flags)) + bch2_sb_members_clean_deleted(c); + bch_err_fn(c, ret); return ret; } diff --git a/fs/bcachefs/sb-members.c b/fs/bcachefs/sb-members.c index f6a0b3de6bca..b9568a68fbf6 100644 --- a/fs/bcachefs/sb-members.c +++ b/fs/bcachefs/sb-members.c @@ -525,6 +525,7 @@ int bch2_sb_member_alloc(struct bch_fs *c) unsigned u64s; int best = -1; u64 best_last_mount = 0; + unsigned nr_deleted = 0; if (dev_idx < BCH_SB_MEMBERS_MAX) goto have_slot; @@ -535,7 +536,10 @@ int bch2_sb_member_alloc(struct bch_fs *c) continue; struct bch_member m = bch2_sb_member_get(c->disk_sb.sb, dev_idx); - if (bch2_member_alive(&m)) + + nr_deleted += uuid_equal(&m.uuid, &BCH_SB_MEMBER_DELETED_UUID); + + if (!bch2_is_zero(&m.uuid, sizeof(m.uuid))) continue; u64 last_mount = le64_to_cpu(m.last_mount); @@ -549,6 +553,10 @@ int bch2_sb_member_alloc(struct bch_fs *c) goto have_slot; } + if (nr_deleted) + bch_err(c, "unable to allocate new member, but have %u deleted: run fsck", + nr_deleted); + return -BCH_ERR_ENOSPC_sb_members; have_slot: nr_devices = max_t(unsigned, dev_idx + 1, c->sb.nr_devices); @@ -564,3 +572,22 @@ have_slot: c->disk_sb.sb->nr_devices = nr_devices; return dev_idx; } + +void bch2_sb_members_clean_deleted(struct bch_fs *c) +{ + mutex_lock(&c->sb_lock); + bool write_sb = false; + + for (unsigned i = 0; i < c->sb.nr_devices; i++) { + struct bch_member *m = bch2_members_v2_get_mut(c->disk_sb.sb, i); + + if (uuid_equal(&m->uuid, &BCH_SB_MEMBER_DELETED_UUID)) { + memset(&m->uuid, 0, sizeof(m->uuid)); + write_sb = true; + } + } + + if (write_sb) + bch2_write_super(c); + mutex_unlock(&c->sb_lock); +} diff --git a/fs/bcachefs/sb-members.h b/fs/bcachefs/sb-members.h index c9cb8f7657b0..6bd9b86aee5b 100644 --- a/fs/bcachefs/sb-members.h +++ b/fs/bcachefs/sb-members.h @@ -320,7 +320,8 @@ extern const struct bch_sb_field_ops bch_sb_field_ops_members_v2; static inline bool bch2_member_alive(struct bch_member *m) { - return !bch2_is_zero(&m->uuid, sizeof(m->uuid)); + 
return !bch2_is_zero(&m->uuid, sizeof(m->uuid)) && + !uuid_equal(&m->uuid, &BCH_SB_MEMBER_DELETED_UUID); } static inline bool bch2_member_exists(struct bch_sb *sb, unsigned dev) @@ -381,5 +382,6 @@ bool bch2_dev_btree_bitmap_marked(struct bch_fs *, struct bkey_s_c); void bch2_dev_btree_bitmap_mark(struct bch_fs *, struct bkey_s_c); int bch2_sb_member_alloc(struct bch_fs *); +void bch2_sb_members_clean_deleted(struct bch_fs *); #endif /* _BCACHEFS_SB_MEMBERS_H */ diff --git a/fs/bcachefs/sb-members_format.h b/fs/bcachefs/sb-members_format.h index 472218a59102..fb72ad730518 100644 --- a/fs/bcachefs/sb-members_format.h +++ b/fs/bcachefs/sb-members_format.h @@ -13,6 +13,10 @@ */ #define BCH_SB_MEMBER_INVALID 255 +#define BCH_SB_MEMBER_DELETED_UUID \ + UUID_INIT(0xffffffff, 0xffff, 0xffff, \ + 0xd9, 0x6a, 0x60, 0xcf, 0x80, 0x3d, 0xf7, 0xef) + #define BCH_MIN_NR_NBUCKETS (1 << 6) #define BCH_IOPS_MEASUREMENTS() \ -- 2.51.0 From 09fa6c3039d8bb22351ad071ea4656dd4f331d18 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 3 May 2025 21:36:23 -0400 Subject: [PATCH 12/16] bcachefs: bch2_dev_data_drop_by_backpointers() Currently, device removal has to scan all metadata for pointers to the device being removed. Add a new method, with the same interface as bch2_dev_data_drop(), that scans by backpointers instead - this will drastically speed up device removal. Signed-off-by: Kent Overstreet --- fs/bcachefs/migrate.c | 117 +++++++++++++++++++++++++++++++++++++----- fs/bcachefs/migrate.h | 3 +- 2 files changed, 105 insertions(+), 15 deletions(-) diff --git a/fs/bcachefs/migrate.c b/fs/bcachefs/migrate.c index 90dcf80bd64a..bb7a92270c09 100644 --- a/fs/bcachefs/migrate.c +++ b/fs/bcachefs/migrate.c @@ -4,10 +4,13 @@ */ #include "bcachefs.h" +#include "backpointers.h" #include "bkey_buf.h" #include "btree_update.h" #include "btree_update_interior.h" +#include "btree_write_buffer.h" #include "buckets.h" +#include "ec.h" #include "errcode.h" #include "extents.h" #include "io_write.h" @@ -20,7 +23,7 @@ #include "super-io.h" static int drop_dev_ptrs(struct bch_fs *c, struct bkey_s k, - unsigned dev_idx, int flags, bool metadata) + unsigned dev_idx, unsigned flags, bool metadata) { unsigned replicas = metadata ? c->opts.metadata_replicas : c->opts.data_replicas; unsigned lost = metadata ? 
BCH_FORCE_IF_METADATA_LOST : BCH_FORCE_IF_DATA_LOST; @@ -37,11 +40,28 @@ static int drop_dev_ptrs(struct bch_fs *c, struct bkey_s k, return 0; } +static int drop_btree_ptrs(struct btree_trans *trans, struct btree_iter *iter, + struct btree *b, unsigned dev_idx, unsigned flags) +{ + struct bch_fs *c = trans->c; + struct bkey_buf k; + + bch2_bkey_buf_init(&k); + bch2_bkey_buf_copy(&k, c, &b->key); + + int ret = drop_dev_ptrs(c, bkey_i_to_s(k.k), dev_idx, flags, true) ?: + bch2_btree_node_update_key(trans, iter, b, k.k, 0, false); + + bch_err_fn(c, ret); + bch2_bkey_buf_exit(&k, c); + return ret; +} + static int bch2_dev_usrdata_drop_key(struct btree_trans *trans, struct btree_iter *iter, struct bkey_s_c k, unsigned dev_idx, - int flags) + unsigned flags) { struct bch_fs *c = trans->c; struct bkey_i *n; @@ -77,9 +97,27 @@ static int bch2_dev_usrdata_drop_key(struct btree_trans *trans, return 0; } +static int bch2_dev_btree_drop_key(struct btree_trans *trans, + struct bkey_s_c_backpointer bp, + unsigned dev_idx, + struct bkey_buf *last_flushed, + unsigned flags) +{ + struct btree_iter iter; + struct btree *b = bch2_backpointer_get_node(trans, bp, &iter, last_flushed); + int ret = PTR_ERR_OR_ZERO(b); + if (ret) + return ret == -BCH_ERR_backpointer_to_overwritten_btree_node ? 0 : ret; + + ret = drop_btree_ptrs(trans, &iter, b, dev_idx, flags); + + bch2_trans_iter_exit(trans, &iter); + return ret; +} + static int bch2_dev_usrdata_drop(struct bch_fs *c, struct progress_indicator_state *progress, - unsigned dev_idx, int flags) + unsigned dev_idx, unsigned flags) { struct btree_trans *trans = bch2_trans_get(c); enum btree_id id; @@ -106,7 +144,7 @@ static int bch2_dev_usrdata_drop(struct bch_fs *c, static int bch2_dev_metadata_drop(struct bch_fs *c, struct progress_indicator_state *progress, - unsigned dev_idx, int flags) + unsigned dev_idx, unsigned flags) { struct btree_trans *trans; struct btree_iter iter; @@ -137,20 +175,12 @@ retry: if (!bch2_bkey_has_device_c(bkey_i_to_s_c(&b->key), dev_idx)) goto next; - bch2_bkey_buf_copy(&k, c, &b->key); - - ret = drop_dev_ptrs(c, bkey_i_to_s(k.k), - dev_idx, flags, true); - if (ret) - break; - - ret = bch2_btree_node_update_key(trans, &iter, b, k.k, 0, false); + ret = drop_btree_ptrs(trans, &iter, b, dev_idx, flags); if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) { ret = 0; continue; } - bch_err_msg(c, ret, "updating btree node key"); if (ret) break; next: @@ -176,7 +206,66 @@ err: return ret; } -int bch2_dev_data_drop(struct bch_fs *c, unsigned dev_idx, int flags) +static int data_drop_bp(struct btree_trans *trans, unsigned dev_idx, + struct bkey_s_c_backpointer bp, struct bkey_buf *last_flushed, + unsigned flags) +{ + struct btree_iter iter; + struct bkey_s_c k = bch2_backpointer_get_key(trans, bp, &iter, BTREE_ITER_intent, + last_flushed); + int ret = bkey_err(k); + if (ret == -BCH_ERR_backpointer_to_overwritten_btree_node) + return 0; + if (ret) + return ret; + + if (!k.k || !bch2_bkey_has_device_c(k, dev_idx)) + goto out; + + /* + * XXX: pass flags arg to invalidate_stripe_to_dev and handle it + * properly + */ + + if (bkey_is_btree_ptr(k.k)) + ret = bch2_dev_btree_drop_key(trans, bp, dev_idx, last_flushed, flags); + else if (k.k->type == KEY_TYPE_stripe) + ret = bch2_invalidate_stripe_to_dev(trans, &iter, k, dev_idx, flags); + else + ret = bch2_dev_usrdata_drop_key(trans, &iter, k, dev_idx, flags); +out: + bch2_trans_iter_exit(trans, &iter); + return ret; +} + +int bch2_dev_data_drop_by_backpointers(struct bch_fs *c, unsigned dev_idx, 
unsigned flags) +{ + struct btree_trans *trans = bch2_trans_get(c); + + struct bkey_buf last_flushed; + bch2_bkey_buf_init(&last_flushed); + bkey_init(&last_flushed.k->k); + + int ret = bch2_btree_write_buffer_flush_sync(trans) ?: + for_each_btree_key_max_commit(trans, iter, BTREE_ID_backpointers, + POS(dev_idx, 0), + POS(dev_idx, U64_MAX), 0, k, + NULL, NULL, BCH_TRANS_COMMIT_no_enospc, ({ + if (k.k->type != KEY_TYPE_backpointer) + continue; + + data_drop_bp(trans, dev_idx, bkey_s_c_to_backpointer(k), + &last_flushed, flags); + + })); + + bch2_bkey_buf_exit(&last_flushed, trans->c); + bch2_trans_put(trans); + bch_err_fn(c, ret); + return ret; +} + +int bch2_dev_data_drop(struct bch_fs *c, unsigned dev_idx, unsigned flags) { struct progress_indicator_state progress; bch2_progress_init(&progress, c, diff --git a/fs/bcachefs/migrate.h b/fs/bcachefs/migrate.h index 027efaa0d575..30018140711b 100644 --- a/fs/bcachefs/migrate.h +++ b/fs/bcachefs/migrate.h @@ -2,6 +2,7 @@ #ifndef _BCACHEFS_MIGRATE_H #define _BCACHEFS_MIGRATE_H -int bch2_dev_data_drop(struct bch_fs *, unsigned, int); +int bch2_dev_data_drop_by_backpointers(struct bch_fs *, unsigned, unsigned); +int bch2_dev_data_drop(struct bch_fs *, unsigned, unsigned); #endif /* _BCACHEFS_MIGRATE_H */ -- 2.51.0 From a8539ad8fa88e39c3f566b7c35029f25ab90b72e Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 3 May 2025 21:26:04 -0400 Subject: [PATCH 13/16] bcachefs: bcachefs_metadata_version_fast_device_removal Fast device removal, that uses backpointers to find pointers to the device being removed instead of a full metadata scan. This requires BCH_SB_MEMBER_DELETED_UUID, which is an incompatible change - hence the version number bump. We don't fully trust backpointers, so we don't want to reuse device indexes until after a fsck has verified that there aren't any pointers to removed devices. 
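
To spell out the "don't reuse the index yet" rule, here is a small standalone model of the sentinel scheme - this is not the in-tree code; member_slot, slot_is_reusable() and fsck_clear_deleted() are made-up names. An all-zero UUID marks a slot that may be handed out again, while the deleted sentinel parks the index until a cleanup pass wipes it, presumably the job of bch2_sb_members_clean_deleted() declared earlier in the series:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Illustrative stand-in for the 16-byte superblock member UUID. */
struct model_uuid { uint8_t b[16]; };

/* Sentinel marking "removed, but index not yet reusable"; byte values echo
 * BCH_SB_MEMBER_DELETED_UUID, but it is treated as an opaque marker here. */
static const struct model_uuid DELETED_UUID = {
	.b = { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
	       0xd9, 0x6a, 0x60, 0xcf, 0x80, 0x3d, 0xf7, 0xef }
};

struct member_slot { struct model_uuid uuid; };

static bool uuid_is_zero(const struct model_uuid *u)
{
	static const struct model_uuid zero;
	return !memcmp(u, &zero, sizeof(*u));
}

static bool slot_is_reusable(const struct member_slot *m)
{
	/* Only an all-zero UUID may be handed out again; the deleted
	 * sentinel keeps the index parked. */
	return uuid_is_zero(&m->uuid);
}

static int alloc_slot(const struct member_slot *slots, int nr)
{
	for (int i = 0; i < nr; i++)
		if (slot_is_reusable(&slots[i]))
			return i;
	return -1;
}

/* What a post-fsck cleanup pass would do once no pointers remain. */
static void fsck_clear_deleted(struct member_slot *slots, int nr)
{
	for (int i = 0; i < nr; i++)
		if (!memcmp(&slots[i].uuid, &DELETED_UUID, sizeof(DELETED_UUID)))
			memset(&slots[i].uuid, 0, sizeof(slots[i].uuid));
}

int main(void)
{
	struct member_slot slots[3] = {
		{ .uuid = { .b = { 1 } } },	/* live device   */
		{ .uuid = DELETED_UUID },	/* fast-removed  */
		{ { { 0 } } },			/* never used    */
	};

	printf("first reusable slot: %d\n", alloc_slot(slots, 3)); /* prints 2 */
	fsck_clear_deleted(slots, 3);
	printf("after fsck cleanup:  %d\n", alloc_slot(slots, 3)); /* prints 1 */
	return 0;
}

In this model, device index 1 only becomes allocatable again after the cleanup pass has run.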
Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs_format.h | 3 ++- fs/bcachefs/ec.c | 4 +++- fs/bcachefs/super.c | 27 +++++++++++++++++++++++---- 3 files changed, 28 insertions(+), 6 deletions(-) diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h index 061fa2666f35..a483d440fa39 100644 --- a/fs/bcachefs/bcachefs_format.h +++ b/fs/bcachefs/bcachefs_format.h @@ -696,7 +696,8 @@ struct bch_sb_field_ext { x(stripe_lru, BCH_VERSION(1, 23)) \ x(casefolding, BCH_VERSION(1, 24)) \ x(extent_flags, BCH_VERSION(1, 25)) \ - x(snapshot_deletion_v2, BCH_VERSION(1, 26)) + x(snapshot_deletion_v2, BCH_VERSION(1, 26)) \ + x(fast_device_removal, BCH_VERSION(1, 27)) enum bcachefs_metadata_version { bcachefs_metadata_version_min = 9, diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c index bf5f4f6283a4..c581426e3894 100644 --- a/fs/bcachefs/ec.c +++ b/fs/bcachefs/ec.c @@ -2197,13 +2197,15 @@ static int bch2_invalidate_stripe_to_dev_from_alloc(struct btree_trans *trans, s int bch2_dev_remove_stripes(struct bch_fs *c, unsigned dev_idx, unsigned flags) { - return bch2_trans_run(c, + int ret = bch2_trans_run(c, for_each_btree_key_max_commit(trans, iter, BTREE_ID_alloc, POS(dev_idx, 0), POS(dev_idx, U64_MAX), BTREE_ITER_intent, k, NULL, NULL, 0, ({ bch2_invalidate_stripe_to_dev_from_alloc(trans, k, flags); }))); + bch_err_fn(c, ret); + return ret; } /* startup/shutdown */ diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index faa012107a97..dc8189f9d2f1 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -1726,6 +1726,8 @@ int bch2_dev_remove(struct bch_fs *c, struct bch_dev *ca, int flags) { struct bch_member *m; unsigned dev_idx = ca->dev_idx, data; + bool fast_device_removal = !bch2_request_incompat_feature(c, + bcachefs_metadata_version_fast_device_removal); int ret; down_write(&c->state_lock); @@ -1744,12 +1746,25 @@ int bch2_dev_remove(struct bch_fs *c, struct bch_dev *ca, int flags) __bch2_dev_read_only(c, ca); - ret = bch2_dev_data_drop(c, ca->dev_idx, flags) ?: - bch2_dev_remove_stripes(c, ca->dev_idx, flags); - bch_err_msg(ca, ret, "bch2_dev_data_drop()"); + ret = fast_device_removal + ? bch2_dev_data_drop_by_backpointers(c, ca->dev_idx, flags) + : (bch2_dev_data_drop(c, ca->dev_idx, flags) ?: + bch2_dev_remove_stripes(c, ca->dev_idx, flags)); if (ret) goto err; + /* Check if device still has data before blowing away alloc info */ + struct bch_dev_usage usage = bch2_dev_usage_read(ca); + for (unsigned i = 0; i < BCH_DATA_NR; i++) + if (!data_type_is_empty(i) && + !data_type_is_hidden(i) && + usage.buckets[i]) { + bch_err(ca, "Remove failed: still has data (%s, %llu buckets)", + __bch2_data_types[i], usage.buckets[i]); + ret = -EBUSY; + goto err; + } + ret = bch2_dev_remove_alloc(c, ca); bch_err_msg(ca, ret, "bch2_dev_remove_alloc()"); if (ret) @@ -1813,7 +1828,11 @@ int bch2_dev_remove(struct bch_fs *c, struct bch_dev *ca, int flags) */ mutex_lock(&c->sb_lock); m = bch2_members_v2_get_mut(c->disk_sb.sb, dev_idx); - memset(&m->uuid, 0, sizeof(m->uuid)); + + if (fast_device_removal) + m->uuid = BCH_SB_MEMBER_DELETED_UUID; + else + memset(&m->uuid, 0, sizeof(m->uuid)); bch2_write_super(c); -- 2.51.0 From 8c69e2b52ea81b102ace48debd114467199ca77a Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 7 May 2025 14:26:18 -0400 Subject: [PATCH 14/16] bcachefs: Knob for manual snapshot deletion Add 'opts.snapshot_deletion_enabled', enabled by default. 
This may be turned off so that the new sysfs knob, 'internal/trigger_delete_dead_snapshots', may be used instead - this will allow snapshot deletion to be profiled more easily. Signed-off-by: Kent Overstreet --- fs/bcachefs/opts.h | 6 ++++++ fs/bcachefs/snapshot.c | 26 ++++++++++++++++++++++---- fs/bcachefs/snapshot.h | 1 + fs/bcachefs/snapshot_types.h | 1 + fs/bcachefs/sysfs.c | 5 +++++ 5 files changed, 35 insertions(+), 4 deletions(-) diff --git a/fs/bcachefs/opts.h b/fs/bcachefs/opts.h index f4c014ad43c1..2a02606254b3 100644 --- a/fs/bcachefs/opts.h +++ b/fs/bcachefs/opts.h @@ -495,6 +495,12 @@ enum fsck_err_opts { OPT_BOOL(), \ BCH_SB_REBALANCE_AC_ONLY, false, \ NULL, "Enable rebalance while on mains power only\n") \ + x(auto_snapshot_deletion, u8, \ + OPT_FS|OPT_MOUNT|OPT_RUNTIME, \ + OPT_BOOL(), \ + BCH2_NO_SB_OPT, true, \ + NULL, "Enable automatic snapshot deletion: disable for debugging, or to\n"\ + "quiet the system when doing performance testing\n")\ x(no_data_io, u8, \ OPT_MOUNT, \ OPT_BOOL(), \ diff --git a/fs/bcachefs/snapshot.c b/fs/bcachefs/snapshot.c index 9ec3275c7b0a..c3dc450cbcec 100644 --- a/fs/bcachefs/snapshot.c +++ b/fs/bcachefs/snapshot.c @@ -1776,14 +1776,18 @@ static void bch2_snapshot_delete_nodes_to_text(struct printbuf *out, struct snap prt_newline(out); } -int bch2_delete_dead_snapshots(struct bch_fs *c) +int __bch2_delete_dead_snapshots(struct bch_fs *c) { - if (!test_and_clear_bit(BCH_FS_need_delete_dead_snapshots, &c->flags)) + struct snapshot_delete *d = &c->snapshot_delete; + int ret = 0; + + if (!mutex_trylock(&d->lock)) return 0; + if (!test_and_clear_bit(BCH_FS_need_delete_dead_snapshots, &c->flags)) + goto out_unlock; + struct btree_trans *trans = bch2_trans_get(c); - struct snapshot_delete *d = &c->snapshot_delete; - int ret = 0; /* * For every snapshot node: If we have no live children and it's not @@ -1857,11 +1861,21 @@ err: d->running = false; mutex_unlock(&d->progress_lock); bch2_trans_put(trans); +out_unlock: + mutex_unlock(&d->lock); if (!bch2_err_matches(ret, EROFS)) bch_err_fn(c, ret); return ret; } +int bch2_delete_dead_snapshots(struct bch_fs *c) +{ + if (!c->opts.auto_snapshot_deletion) + return 0; + + return __bch2_delete_dead_snapshots(c); +} + void bch2_delete_dead_snapshots_work(struct work_struct *work) { struct bch_fs *c = container_of(work, struct bch_fs, snapshot_delete.work); @@ -1874,6 +1888,9 @@ void bch2_delete_dead_snapshots_work(struct work_struct *work) void bch2_delete_dead_snapshots_async(struct bch_fs *c) { + if (!c->opts.auto_snapshot_deletion) + return; + if (!enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_delete_dead_snapshots)) return; @@ -1977,6 +1994,7 @@ void bch2_fs_snapshots_exit(struct bch_fs *c) void bch2_fs_snapshots_init_early(struct bch_fs *c) { INIT_WORK(&c->snapshot_delete.work, bch2_delete_dead_snapshots_work); + mutex_init(&c->snapshot_delete.lock); mutex_init(&c->snapshot_delete.progress_lock); mutex_init(&c->snapshots_unlinked_lock); } diff --git a/fs/bcachefs/snapshot.h b/fs/bcachefs/snapshot.h index 69c484b77729..63b9469eb1eb 100644 --- a/fs/bcachefs/snapshot.h +++ b/fs/bcachefs/snapshot.h @@ -273,6 +273,7 @@ static inline int bch2_key_has_snapshot_overwrites(struct btree_trans *trans, return __bch2_key_has_snapshot_overwrites(trans, id, pos); } +int __bch2_delete_dead_snapshots(struct bch_fs *); int bch2_delete_dead_snapshots(struct bch_fs *); void bch2_delete_dead_snapshots_work(struct work_struct *); void bch2_delete_dead_snapshots_async(struct bch_fs *); diff --git 
a/fs/bcachefs/snapshot_types.h b/fs/bcachefs/snapshot_types.h index 1aa7a58442ae..0ab698f13e5c 100644 --- a/fs/bcachefs/snapshot_types.h +++ b/fs/bcachefs/snapshot_types.h @@ -42,6 +42,7 @@ struct snapshot_interior_delete { typedef DARRAY(struct snapshot_interior_delete) interior_delete_list; struct snapshot_delete { + struct mutex lock; struct work_struct work; struct mutex progress_lock; diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c index adf99a805a62..4c7d609d79fd 100644 --- a/fs/bcachefs/sysfs.c +++ b/fs/bcachefs/sysfs.c @@ -149,6 +149,7 @@ write_attribute(trigger_btree_key_cache_shrink); write_attribute(trigger_btree_updates); write_attribute(trigger_freelist_wakeup); write_attribute(trigger_recalc_capacity); +write_attribute(trigger_delete_dead_snapshots); read_attribute(gc_gens_pos); read_attribute(uuid); @@ -439,6 +440,9 @@ STORE(bch2_fs) up_read(&c->state_lock); } + if (attr == &sysfs_trigger_delete_dead_snapshots) + __bch2_delete_dead_snapshots(c); + #ifdef CONFIG_BCACHEFS_TESTS if (attr == &sysfs_perf_test) { char *tmp = kstrdup(buf, GFP_KERNEL), *p = tmp; @@ -568,6 +572,7 @@ struct attribute *bch2_fs_internal_files[] = { &sysfs_trigger_btree_updates, &sysfs_trigger_freelist_wakeup, &sysfs_trigger_recalc_capacity, + &sysfs_trigger_delete_dead_snapshots, &sysfs_gc_gens_pos, -- 2.51.0 From 970dde8271b607876bfdbb66b941ffda35e74973 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 7 May 2025 16:34:35 -0400 Subject: [PATCH 15/16] bcachefs: Add missing include fix debug build in userspace Signed-off-by: Kent Overstreet --- fs/bcachefs/util.h | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/bcachefs/util.h b/fs/bcachefs/util.h index 14cb2c7dfda4..25cf61ebd40c 100644 --- a/fs/bcachefs/util.h +++ b/fs/bcachefs/util.h @@ -14,6 +14,7 @@ #include #include #include +#include #include #include #include -- 2.51.0 From 1dfa01ef24151547a2a622e90ad73b082b1bc739 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 8 May 2025 14:24:12 -0400 Subject: [PATCH 16/16] bcachefs: bch2_copygc_dev_wait_amount() Factor out the per-device calculations, for better introspection. 
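
As a rough standalone illustration of the refactor - model_dev and dev_wait_amount() are invented names and the numbers are arbitrary - the filesystem-wide wait becomes a min() over a per-device helper, which is what makes the per-device values easy to report individually:

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

/* Simplified stand-in for the per-device numbers the real code reads via
 * bch2_dev_usage_full_read(); units are sectors, values invented. */
struct model_dev {
	const char *name;
	uint64_t avail_buckets;	/* buckets available at the stripe watermark */
	uint64_t bucket_size;	/* bucket size in sectors */
	uint64_t fragmented;	/* sectors of movable, fragmented data */
};

/* Per-device calculation, factored out so it can be reported one device
 * at a time: allow fragmentation up to half the free space. */
static uint64_t dev_wait_amount(const struct model_dev *d)
{
	uint64_t allowed = (d->avail_buckets * d->bucket_size) >> 1;

	return allowed > d->fragmented ? allowed - d->fragmented : 0;
}

/* The filesystem-wide wait is the minimum over devices: copygc should kick
 * in as soon as any one device crosses its threshold. */
static uint64_t fs_wait_amount(const struct model_dev *devs, int nr)
{
	uint64_t wait = UINT64_MAX;

	for (int i = 0; i < nr; i++) {
		uint64_t w = dev_wait_amount(&devs[i]);
		if (w < wait)
			wait = w;
	}
	return wait;
}

int main(void)
{
	struct model_dev devs[] = {
		{ "dev-0", 1000, 512, 100000 },	/* allowed 256000 -> wait 156000 */
		{ "dev-1",  200, 512,  60000 },	/* allowed  51200 -> wait      0 */
	};
	int nr = sizeof(devs) / sizeof(devs[0]);

	for (int i = 0; i < nr; i++)
		printf("%s: wait %" PRIu64 " sectors\n",
		       devs[i].name, dev_wait_amount(&devs[i]));

	printf("fs-wide wait: %" PRIu64 " sectors\n", fs_wait_amount(devs, nr));
	return 0;
}

With the per-device helper split out, the sysfs report in the diff below can print one wait value per device instead of only a single aggregate.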
Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_foreground.c | 2 +- fs/bcachefs/movinggc.c | 56 +++++++++++++++++++++------------- fs/bcachefs/movinggc.h | 2 +- 3 files changed, 36 insertions(+), 24 deletions(-) diff --git a/fs/bcachefs/alloc_foreground.c b/fs/bcachefs/alloc_foreground.c index b50846da7ae4..828cf94217dd 100644 --- a/fs/bcachefs/alloc_foreground.c +++ b/fs/bcachefs/alloc_foreground.c @@ -465,7 +465,7 @@ static noinline void trace_bucket_alloc2(struct bch_fs *c, prt_printf(&buf, "blocking\t%u\n", cl != NULL); prt_printf(&buf, "free\t%llu\n", req->usage.buckets[BCH_DATA_free]); prt_printf(&buf, "avail\t%llu\n", dev_buckets_free(req->ca, req->usage, req->watermark)); - prt_printf(&buf, "copygc_wait\t%lu/%lli\n", + prt_printf(&buf, "copygc_wait\t%llu/%lli\n", bch2_copygc_wait_amount(c), c->copygc_wait - atomic64_read(&c->io_clock[WRITE].now)); prt_printf(&buf, "seen\t%llu\n", req->counters.buckets_seen); diff --git a/fs/bcachefs/movinggc.c b/fs/bcachefs/movinggc.c index e97e87ebe312..6e5680a3a97c 100644 --- a/fs/bcachefs/movinggc.c +++ b/fs/bcachefs/movinggc.c @@ -261,6 +261,25 @@ err: return ret; } +static u64 bch2_copygc_dev_wait_amount(struct bch_dev *ca) +{ + struct bch_dev_usage_full usage_full = bch2_dev_usage_full_read(ca); + struct bch_dev_usage usage; + + for (unsigned i = 0; i < BCH_DATA_NR; i++) + usage.buckets[i] = usage_full.d[i].buckets; + + s64 fragmented_allowed = ((__dev_buckets_available(ca, usage, BCH_WATERMARK_stripe) * + ca->mi.bucket_size) >> 1); + s64 fragmented = 0; + + for (unsigned i = 0; i < BCH_DATA_NR; i++) + if (data_type_movable(i)) + fragmented += usage_full.d[i].fragmented; + + return max(0LL, fragmented_allowed - fragmented); +} + /* * Copygc runs when the amount of fragmented data is above some arbitrary * threshold: @@ -275,28 +294,13 @@ err: * often and continually reduce the amount of fragmented space as the device * fills up. So, we increase the threshold by half the current free space. 
*/ -unsigned long bch2_copygc_wait_amount(struct bch_fs *c) +u64 bch2_copygc_wait_amount(struct bch_fs *c) { - s64 wait = S64_MAX, fragmented_allowed, fragmented; + u64 wait = U64_MAX; rcu_read_lock(); - for_each_rw_member_rcu(c, ca) { - struct bch_dev_usage_full usage_full = bch2_dev_usage_full_read(ca); - struct bch_dev_usage usage; - - for (unsigned i = 0; i < BCH_DATA_NR; i++) - usage.buckets[i] = usage_full.d[i].buckets; - - fragmented_allowed = ((__dev_buckets_available(ca, usage, BCH_WATERMARK_stripe) * - ca->mi.bucket_size) >> 1); - fragmented = 0; - - for (unsigned i = 0; i < BCH_DATA_NR; i++) - if (data_type_movable(i)) - fragmented += usage_full.d[i].fragmented; - - wait = min(wait, max(0LL, fragmented_allowed - fragmented)); - } + for_each_rw_member_rcu(c, ca) + wait = min(wait, bch2_copygc_dev_wait_amount(ca)); rcu_read_unlock(); return wait; @@ -320,14 +324,22 @@ void bch2_copygc_wait_to_text(struct printbuf *out, struct bch_fs *c) c->copygc_wait_at) << 9); prt_newline(out); - prt_printf(out, "Currently calculated wait:\t"); - prt_human_readable_u64(out, bch2_copygc_wait_amount(c)); - prt_newline(out); + bch2_printbuf_make_room(out, 4096); rcu_read_lock(); + out->atomic++; + + prt_printf(out, "Currently calculated wait:\n"); + for_each_rw_member_rcu(c, ca) { + prt_printf(out, " %s:\t", ca->name); + prt_human_readable_u64(out, bch2_copygc_dev_wait_amount(ca)); + prt_newline(out); + } + struct task_struct *t = rcu_dereference(c->copygc_thread); if (t) get_task_struct(t); + --out->atomic; rcu_read_unlock(); if (t) { diff --git a/fs/bcachefs/movinggc.h b/fs/bcachefs/movinggc.h index d1885cf67a45..b9683d22bab0 100644 --- a/fs/bcachefs/movinggc.h +++ b/fs/bcachefs/movinggc.h @@ -2,7 +2,7 @@ #ifndef _BCACHEFS_MOVINGGC_H #define _BCACHEFS_MOVINGGC_H -unsigned long bch2_copygc_wait_amount(struct bch_fs *); +u64 bch2_copygc_wait_amount(struct bch_fs *); void bch2_copygc_wait_to_text(struct printbuf *, struct bch_fs *); static inline void bch2_copygc_wakeup(struct bch_fs *c) -- 2.51.0
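
Referring back to bch2_dev_data_drop_by_backpointers() earlier in the series, the shape of that walk can be sketched as the standalone model below. Everything here (model_backpointer, drop_from_target(), the sample offsets) is invented for illustration; the real code dispatches on the key each backpointer resolves to - btree pointer, stripe, or ordinary user data - and issues the corresponding transactional update.

#include <stdio.h>

enum target_kind {
	TARGET_BTREE_NODE,	/* handled like drop_btree_ptrs()           */
	TARGET_STRIPE,		/* handled like invalidate_stripe_to_dev()  */
	TARGET_USER_DATA,	/* handled like bch2_dev_usrdata_drop_key()  */
};

struct model_backpointer {
	unsigned dev;			/* backpointers are keyed by device index */
	unsigned long long offset;
	enum target_kind kind;
};

static void drop_from_target(const struct model_backpointer *bp, unsigned dev_idx)
{
	switch (bp->kind) {
	case TARGET_BTREE_NODE:
		printf("  btree node at %llu: rewrite node key without dev %u\n",
		       bp->offset, dev_idx);
		break;
	case TARGET_STRIPE:
		printf("  stripe at %llu: invalidate block on dev %u\n",
		       bp->offset, dev_idx);
		break;
	case TARGET_USER_DATA:
		printf("  extent at %llu: drop pointer to dev %u\n",
		       bp->offset, dev_idx);
		break;
	}
}

int main(void)
{
	/* Pretend these were read from the backpointers btree in the range
	 * [POS(dev_idx, 0), POS(dev_idx, U64_MAX)]. */
	const struct model_backpointer bps[] = {
		{ 2,   4096, TARGET_USER_DATA  },
		{ 2,  81920, TARGET_BTREE_NODE },
		{ 2, 163840, TARGET_STRIPE     },
	};
	unsigned dev_idx = 2;

	printf("dropping data on device %u by backpointers:\n", dev_idx);
	for (unsigned i = 0; i < sizeof(bps) / sizeof(bps[0]); i++)
		drop_from_target(&bps[i], dev_idx);
	return 0;
}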