From e882906929c55a9561641944d24c11dc3f338225 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 14 May 2025 17:58:00 -0400 Subject: [PATCH 01/16] bcachefs: Fix opt hooks in sysfs for non sb option We weren't checking if the option changed for non-superblock options - this led to rebalance not waking up when enabling the "rebalance_enabled" option. Signed-off-by: Kent Overstreet --- fs/bcachefs/sysfs.c | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c index 4c7d609d79fd..de7cda282a8c 100644 --- a/fs/bcachefs/sysfs.c +++ b/fs/bcachefs/sysfs.c @@ -642,7 +642,18 @@ static ssize_t sysfs_opt_store(struct bch_fs *c, if (ret < 0) goto err; - bool changed = bch2_opt_set_sb(c, ca, opt, v); + bool is_sb = opt->get_sb || opt->get_member; + bool changed = false; + + if (is_sb) { + changed = bch2_opt_set_sb(c, ca, opt, v); + } else if (!ca) { + changed = bch2_opt_get_by_id(&c->opts, id) != v; + } else { + /* device options that aren't superblock options aren't + * supported */ + BUG(); + } if (!ca) bch2_opt_set_by_id(&c->opts, id, v); -- 2.51.0 From 688321f97e0820024c259a066cd197e9e69cb8c8 Mon Sep 17 00:00:00 2001 From: Alan Huang Date: Thu, 15 May 2025 22:29:50 +0800 Subject: [PATCH 02/16] bcachefs: Kill BTREE_TRIGGER_bucket_invalidate Signed-off-by: Alan Huang Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_background.c | 9 --------- fs/bcachefs/btree_types.h | 6 +----- 2 files changed, 1 insertion(+), 14 deletions(-) diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index 81e2ae4bb400..51dfd2f24c20 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -914,15 +914,6 @@ int bch2_trigger_alloc(struct btree_trans *trans, goto err; } - if ((flags & BTREE_TRIGGER_bucket_invalidate) && - old_a->cached_sectors) { - ret = bch2_mod_dev_cached_sectors(trans, ca->dev_idx, - -((s64) old_a->cached_sectors), - flags & BTREE_TRIGGER_gc); - if (ret) - goto err; - } - ret = bch2_alloc_key_to_dev_counters(trans, ca, old_a, new_a, flags); if (ret) goto err; diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h index 3acccca3b3a3..e5a965db68b4 100644 --- a/fs/bcachefs/btree_types.h +++ b/fs/bcachefs/btree_types.h @@ -258,9 +258,6 @@ struct btree_node_iter { * * BTREE_TRIGGER_insert - @new is entering the btree * BTREE_TRIGGER_overwrite - @old is leaving the btree - * - * BTREE_TRIGGER_bucket_invalidate - signal from bucket invalidate path to alloc - * trigger */ #define BTREE_TRIGGER_FLAGS() \ x(norun) \ @@ -270,8 +267,7 @@ struct btree_node_iter { x(gc) \ x(insert) \ x(overwrite) \ - x(is_root) \ - x(bucket_invalidate) + x(is_root) enum { #define x(n) BTREE_ITER_FLAG_BIT_##n, -- 2.51.0 From 4a67b94bd816b56768fe06d880f02ae0bf6ceade Mon Sep 17 00:00:00 2001 From: Alan Huang Date: Tue, 13 May 2025 02:44:26 +0800 Subject: [PATCH 03/16] bcachefs: Early return to avoid unnecessary lock Signed-off-by: Alan Huang Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_background.c | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index 51dfd2f24c20..daf23d471d4f 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -2576,19 +2576,18 @@ u64 bch2_min_rw_member_capacity(struct bch_fs *c) static bool bch2_dev_has_open_write_point(struct bch_fs *c, struct bch_dev *ca) { struct open_bucket *ob; - bool ret = false; for (ob = c->open_buckets; ob < c->open_buckets + ARRAY_SIZE(c->open_buckets); ob++) { - spin_lock(&ob->lock); - if (ob->valid && !ob->on_partial_list && - ob->dev == ca->dev_idx) - ret = true; - spin_unlock(&ob->lock); + scoped_guard(spinlock, &ob->lock) { + if (ob->valid && !ob->on_partial_list && + ob->dev == ca->dev_idx) + return true; + } } - return ret; + return false; } void bch2_dev_allocator_set_rw(struct bch_fs *c, struct bch_dev *ca, bool rw) -- 2.51.0 From 123d2d09ff599ec11a01687f472c7bdc3b3b6f12 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 15 May 2025 08:31:02 -0400 Subject: [PATCH 04/16] bcachefs: bch2_inode_find_snapshot_root() Factor out a small common helper. Signed-off-by: Kent Overstreet --- fs/bcachefs/fsck.c | 28 +--------------------------- fs/bcachefs/inode.c | 24 ++++++++++++++++++++++++ fs/bcachefs/inode.h | 3 +++ fs/bcachefs/str_hash.c | 30 ++++++------------------------ 4 files changed, 34 insertions(+), 51 deletions(-) diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index 4258c91e6df3..e7cac5a69154 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -1115,32 +1115,6 @@ fsck_err: return ret; } -static int get_snapshot_root_inode(struct btree_trans *trans, - struct bch_inode_unpacked *root, - u64 inum) -{ - struct btree_iter iter; - struct bkey_s_c k; - int ret = 0; - - for_each_btree_key_reverse_norestart(trans, iter, BTREE_ID_inodes, - SPOS(0, inum, U32_MAX), - BTREE_ITER_all_snapshots, k, ret) { - if (k.k->p.offset != inum) - break; - if (bkey_is_inode(k.k)) - goto found_root; - } - if (ret) - goto err; - BUG(); -found_root: - ret = bch2_inode_unpack(k, root); -err: - bch2_trans_iter_exit(trans, &iter); - return ret; -} - static int check_inode(struct btree_trans *trans, struct btree_iter *iter, struct bkey_s_c k, @@ -1171,7 +1145,7 @@ static int check_inode(struct btree_trans *trans, goto err; if (snapshot_root->bi_inum != u.bi_inum) { - ret = get_snapshot_root_inode(trans, snapshot_root, u.bi_inum); + ret = bch2_inode_find_snapshot_root(trans, u.bi_inum, snapshot_root); if (ret) goto err; } diff --git a/fs/bcachefs/inode.c b/fs/bcachefs/inode.c index 96d4ab0148bf..a17a952ea161 100644 --- a/fs/bcachefs/inode.c +++ b/fs/bcachefs/inode.c @@ -1131,6 +1131,30 @@ int bch2_inode_find_by_inum(struct bch_fs *c, subvol_inum inum, return bch2_trans_do(c, bch2_inode_find_by_inum_trans(trans, inum, inode)); } +int bch2_inode_find_snapshot_root(struct btree_trans *trans, u64 inum, + struct bch_inode_unpacked *root) +{ + struct btree_iter iter; + struct bkey_s_c k; + int ret = 0; + + for_each_btree_key_reverse_norestart(trans, iter, BTREE_ID_inodes, + SPOS(0, inum, U32_MAX), + BTREE_ITER_all_snapshots, k, ret) { + if (k.k->p.offset != inum) + break; + if (bkey_is_inode(k.k)) { + ret = bch2_inode_unpack(k, root); + goto out; + } + } + /* We're only called when we know we have an inode for @inum */ + BUG_ON(!ret); +out: + bch2_trans_iter_exit(trans, &iter); + return ret; +} + int bch2_inode_nlink_inc(struct bch_inode_unpacked *bi) { if (bi->bi_flags & BCH_INODE_unlinked) diff --git a/fs/bcachefs/inode.h b/fs/bcachefs/inode.h index 5cfba9e98966..bb81b7c269bb 100644 --- a/fs/bcachefs/inode.h +++ b/fs/bcachefs/inode.h @@ -173,6 +173,9 @@ int bch2_inode_find_by_inum_trans(struct btree_trans *, subvol_inum, int bch2_inode_find_by_inum(struct bch_fs *, subvol_inum, struct bch_inode_unpacked *); +int bch2_inode_find_snapshot_root(struct btree_trans *trans, u64 inum, + struct bch_inode_unpacked *root); + #define inode_opt_get(_c, _inode, _name) \ ((_inode)->bi_##_name ? (_inode)->bi_##_name - 1 : (_c)->opts._name) diff --git a/fs/bcachefs/str_hash.c b/fs/bcachefs/str_hash.c index 55a3a116b5a8..2d6379473ad4 100644 --- a/fs/bcachefs/str_hash.c +++ b/fs/bcachefs/str_hash.c @@ -146,34 +146,17 @@ static noinline int check_inode_hash_info_matches_root(struct btree_trans *trans struct bch_hash_info *hash_info) { struct bch_fs *c = trans->c; - struct btree_iter iter; - struct bkey_s_c k; - int ret = 0; - - for_each_btree_key_reverse_norestart(trans, iter, BTREE_ID_inodes, SPOS(0, inum, U32_MAX), - BTREE_ITER_all_snapshots, k, ret) { - if (k.k->p.offset != inum) - break; - if (bkey_is_inode(k.k)) - goto found; - } - - /* This would've been caught by check_key_has_inode() */ - bch_err(c, "%s(): inum %llu not found", __func__, inum); - ret = -BCH_ERR_fsck_repair_unimplemented; - goto err; -found:; - struct bch_inode_unpacked inode; - ret = bch2_inode_unpack(k, &inode); + struct bch_inode_unpacked snapshot_root; + int ret = bch2_inode_find_snapshot_root(trans, inum, &snapshot_root); if (ret) - goto err; + return ret; - struct bch_hash_info hash_root = bch2_hash_info_init(c, &inode); + struct bch_hash_info hash_root = bch2_hash_info_init(c, &snapshot_root); if (hash_info->type != hash_root.type || memcmp(&hash_info->siphash_key, &hash_root.siphash_key, sizeof(hash_root.siphash_key))) { - ret = repair_inode_hash_info(trans, &inode); + ret = repair_inode_hash_info(trans, &snapshot_root); if (!ret) { struct printbuf buf = PRINTBUF; prt_printf(&buf, "inode %llu hash info mismatch with root, but mismatch not found\n", inum); @@ -190,8 +173,7 @@ found:; ret = -BCH_ERR_fsck_repair_unimplemented; } } -err: - bch2_trans_iter_exit(trans, &iter); + return ret; } -- 2.51.0 From fdd0807f812204c36bfe71710c8a796731ee3777 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 15 May 2025 08:41:26 -0400 Subject: [PATCH 05/16] bcachefs: Improve bch2_repair_inode_hash_info() Improve this so it can be used by fsck.c check_inode(); it provides a much better error message than the check_inode() version. Signed-off-by: Kent Overstreet --- fs/bcachefs/fsck.c | 13 +++-- fs/bcachefs/str_hash.c | 112 ++++++++++++++++++++++++++++------------- fs/bcachefs/str_hash.h | 2 + 3 files changed, 86 insertions(+), 41 deletions(-) diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index e7cac5a69154..2b7bc67dcdf8 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -1150,13 +1150,12 @@ static int check_inode(struct btree_trans *trans, goto err; } - if (fsck_err_on(u.bi_hash_seed != snapshot_root->bi_hash_seed || - INODE_STR_HASH(&u) != INODE_STR_HASH(snapshot_root), - trans, inode_snapshot_mismatch, - "inode hash info in different snapshots don't match")) { - u.bi_hash_seed = snapshot_root->bi_hash_seed; - SET_INODE_STR_HASH(&u, INODE_STR_HASH(snapshot_root)); - do_update = true; + if (u.bi_hash_seed != snapshot_root->bi_hash_seed || + INODE_STR_HASH(&u) != INODE_STR_HASH(snapshot_root)) { + ret = bch2_repair_inode_hash_info(trans, snapshot_root); + BUG_ON(ret == -BCH_ERR_fsck_repair_unimplemented); + if (ret) + goto err; } if (u.bi_dir || u.bi_dir_offset) { diff --git a/fs/bcachefs/str_hash.c b/fs/bcachefs/str_hash.c index 2d6379473ad4..0cbf5508a32c 100644 --- a/fs/bcachefs/str_hash.c +++ b/fs/bcachefs/str_hash.c @@ -101,17 +101,25 @@ static noinline int hash_pick_winner(struct btree_trans *trans, } } -static int repair_inode_hash_info(struct btree_trans *trans, - struct bch_inode_unpacked *snapshot_root) +/* + * str_hash lookups across snapshots break in wild ways if hash_info in + * different snapshot versions doesn't match - so if we find one mismatch, check + * them all + */ +int bch2_repair_inode_hash_info(struct btree_trans *trans, + struct bch_inode_unpacked *snapshot_root) { + struct bch_fs *c = trans->c; struct btree_iter iter; struct bkey_s_c k; + struct printbuf buf = PRINTBUF; + bool need_commit = false; int ret = 0; - for_each_btree_key_reverse_norestart(trans, iter, BTREE_ID_inodes, - SPOS(0, snapshot_root->bi_inum, snapshot_root->bi_snapshot - 1), - BTREE_ITER_all_snapshots, k, ret) { - if (k.k->p.offset != snapshot_root->bi_inum) + for_each_btree_key_norestart(trans, iter, BTREE_ID_inodes, + POS(0, snapshot_root->bi_inum), + BTREE_ITER_all_snapshots, k, ret) { + if (bpos_ge(k.k->p, SPOS(0, snapshot_root->bi_inum, snapshot_root->bi_snapshot))) break; if (!bkey_is_inode(k.k)) continue; @@ -121,19 +129,72 @@ static int repair_inode_hash_info(struct btree_trans *trans, if (ret) break; - if (fsck_err_on(inode.bi_hash_seed != snapshot_root->bi_hash_seed || - INODE_STR_HASH(&inode) != INODE_STR_HASH(snapshot_root), - trans, inode_snapshot_mismatch, - "inode hash info in different snapshots don't match")) { + if (inode.bi_hash_seed == snapshot_root->bi_hash_seed && + INODE_STR_HASH(&inode) == INODE_STR_HASH(snapshot_root)) { +#ifdef CONFIG_BCACHEFS_DEBUG + struct bch_hash_info hash1 = bch2_hash_info_init(c, snapshot_root); + struct bch_hash_info hash2 = bch2_hash_info_init(c, &inode); + + BUG_ON(hash1.type != hash2.type || + memcmp(&hash1.siphash_key, + &hash2.siphash_key, + sizeof(hash1.siphash_key))); +#endif + continue; + } + + printbuf_reset(&buf); + prt_printf(&buf, "inode %llu hash info in snapshots %u %u don't match\n", + snapshot_root->bi_inum, + inode.bi_snapshot, + snapshot_root->bi_snapshot); + + bch2_prt_str_hash_type(&buf, INODE_STR_HASH(&inode)); + prt_printf(&buf, " %llx\n", inode.bi_hash_seed); + + bch2_prt_str_hash_type(&buf, INODE_STR_HASH(snapshot_root)); + prt_printf(&buf, " %llx", snapshot_root->bi_hash_seed); + + if (fsck_err(trans, inode_snapshot_mismatch, "%s", buf.buf)) { inode.bi_hash_seed = snapshot_root->bi_hash_seed; SET_INODE_STR_HASH(&inode, INODE_STR_HASH(snapshot_root)); - ret = __bch2_fsck_write_inode(trans, &inode) ?: - bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc) ?: - -BCH_ERR_transaction_restart_nested; - break; + + ret = __bch2_fsck_write_inode(trans, &inode); + if (ret) + break; + need_commit = true; } } + + if (ret) + goto err; + + if (!need_commit) { + struct printbuf buf = PRINTBUF; + bch2_log_msg_start(c, &buf); + + prt_printf(&buf, "inode %llu hash info mismatch with root, but mismatch not found\n", + snapshot_root->bi_inum); + + prt_printf(&buf, "root snapshot %u ", snapshot_root->bi_snapshot); + bch2_prt_str_hash_type(&buf, INODE_STR_HASH(snapshot_root)); + prt_printf(&buf, " %llx\n", snapshot_root->bi_hash_seed); +#if 0 + prt_printf(&buf, "vs snapshot %u ", hash_info->inum_snapshot); + bch2_prt_str_hash_type(&buf, hash_info->type); + prt_printf(&buf, " %llx %llx", hash_info->siphash_key.k0, hash_info->siphash_key.k1); +#endif + bch2_print_str(c, KERN_ERR, buf.buf); + printbuf_exit(&buf); + ret = -BCH_ERR_fsck_repair_unimplemented; + goto err; + } + + ret = bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc) ?: + -BCH_ERR_transaction_restart_nested; +err: fsck_err: + printbuf_exit(&buf); bch2_trans_iter_exit(trans, &iter); return ret; } @@ -145,34 +206,17 @@ fsck_err: static noinline int check_inode_hash_info_matches_root(struct btree_trans *trans, u64 inum, struct bch_hash_info *hash_info) { - struct bch_fs *c = trans->c; struct bch_inode_unpacked snapshot_root; int ret = bch2_inode_find_snapshot_root(trans, inum, &snapshot_root); if (ret) return ret; - struct bch_hash_info hash_root = bch2_hash_info_init(c, &snapshot_root); + struct bch_hash_info hash_root = bch2_hash_info_init(trans->c, &snapshot_root); if (hash_info->type != hash_root.type || memcmp(&hash_info->siphash_key, &hash_root.siphash_key, - sizeof(hash_root.siphash_key))) { - ret = repair_inode_hash_info(trans, &snapshot_root); - if (!ret) { - struct printbuf buf = PRINTBUF; - prt_printf(&buf, "inode %llu hash info mismatch with root, but mismatch not found\n", inum); - - prt_printf(&buf, "root snapshot %u ", hash_root.inum_snapshot); - bch2_prt_str_hash_type(&buf, hash_root.type); - prt_printf(&buf, " %llx %llx\n", hash_root.siphash_key.k0, hash_root.siphash_key.k1); - - prt_printf(&buf, "vs snapshot %u ", hash_info->inum_snapshot); - bch2_prt_str_hash_type(&buf, hash_info->type); - prt_printf(&buf, " %llx %llx", hash_info->siphash_key.k0, hash_info->siphash_key.k1); - bch_err(c, "%s", buf.buf); - printbuf_exit(&buf); - ret = -BCH_ERR_fsck_repair_unimplemented; - } - } + sizeof(hash_root.siphash_key))) + ret = bch2_repair_inode_hash_info(trans, &snapshot_root); return ret; } diff --git a/fs/bcachefs/str_hash.h b/fs/bcachefs/str_hash.h index ae3154fb6a94..6762b3627e1b 100644 --- a/fs/bcachefs/str_hash.h +++ b/fs/bcachefs/str_hash.h @@ -394,6 +394,8 @@ int bch2_hash_delete(struct btree_trans *trans, return ret; } +int bch2_repair_inode_hash_info(struct btree_trans *, struct bch_inode_unpacked *); + struct snapshots_seen; int __bch2_str_hash_check_key(struct btree_trans *, struct snapshots_seen *, -- 2.51.0 From bde41d9a58f159da8330b499cced04c40104f7f3 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 15 May 2025 09:15:24 -0400 Subject: [PATCH 06/16] bcachefs: better error message for subvol_fs_path_parent_wrong Signed-off-by: Kent Overstreet --- fs/bcachefs/fsck.c | 30 ++++++++++++++++++++---------- 1 file changed, 20 insertions(+), 10 deletions(-) diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index 2b7bc67dcdf8..ab936520e0ae 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -2079,7 +2079,7 @@ static int check_dirent_to_subvol(struct btree_trans *trans, struct btree_iter * 0, subvolume); ret = bkey_err(s.s_c); if (ret && !bch2_err_matches(ret, ENOENT)) - return ret; + goto err; if (ret) { if (fsck_err(trans, dirent_to_missing_subvol, @@ -2090,18 +2090,28 @@ static int check_dirent_to_subvol(struct btree_trans *trans, struct btree_iter * goto out; } - if (fsck_err_on(le32_to_cpu(s.v->fs_path_parent) != parent_subvol, - trans, subvol_fs_path_parent_wrong, - "subvol with wrong fs_path_parent, should be be %u\n%s", - parent_subvol, - (bch2_bkey_val_to_text(&buf, c, s.s_c), buf.buf))) { - struct bkey_i_subvolume *n = - bch2_bkey_make_mut_typed(trans, &subvol_iter, &s.s_c, 0, subvolume); - ret = PTR_ERR_OR_ZERO(n); + if (le32_to_cpu(s.v->fs_path_parent) != parent_subvol) { + printbuf_reset(&buf); + + prt_printf(&buf, "subvol with wrong fs_path_parent, should be be %u\n", + parent_subvol); + + ret = bch2_inum_to_path(trans, (subvol_inum) { s.k->p.offset, + le64_to_cpu(s.v->inode) }, &buf); if (ret) goto err; + prt_newline(&buf); + bch2_bkey_val_to_text(&buf, c, s.s_c); - n->v.fs_path_parent = cpu_to_le32(parent_subvol); + if (fsck_err(trans, subvol_fs_path_parent_wrong, "%s", buf.buf)) { + struct bkey_i_subvolume *n = + bch2_bkey_make_mut_typed(trans, &subvol_iter, &s.s_c, 0, subvolume); + ret = PTR_ERR_OR_ZERO(n); + if (ret) + goto err; + + n->v.fs_path_parent = cpu_to_le32(parent_subvol); + } } u64 target_inum = le64_to_cpu(s.v->inode); -- 2.51.0 From 84b9f17195b2d4914763feee00ca44be42c9f8e2 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 15 May 2025 10:08:06 -0400 Subject: [PATCH 07/16] bcachefs: do_rebalance_scan() now only updates bch_extent_rebalance This ensures that our pending rebalance work accounting is accurate quickly. Signed-off-by: Kent Overstreet --- fs/bcachefs/move.c | 2 +- fs/bcachefs/move.h | 4 ++++ fs/bcachefs/rebalance.c | 42 ++++++++++++++++++++++++++--------------- 3 files changed, 32 insertions(+), 16 deletions(-) diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c index 3a92eced2e67..49898d5743d4 100644 --- a/fs/bcachefs/move.c +++ b/fs/bcachefs/move.c @@ -412,7 +412,7 @@ err: return ret; } -static struct bch_io_opts *bch2_move_get_io_opts(struct btree_trans *trans, +struct bch_io_opts *bch2_move_get_io_opts(struct btree_trans *trans, struct per_snapshot_io_opts *io_opts, struct bpos extent_pos, /* extent_iter, extent_k may be in reflink btree */ struct btree_iter *extent_iter, diff --git a/fs/bcachefs/move.h b/fs/bcachefs/move.h index fb38383ffc7b..86b80499ac55 100644 --- a/fs/bcachefs/move.h +++ b/fs/bcachefs/move.h @@ -122,6 +122,10 @@ int bch2_move_extent(struct moving_context *, struct bch_io_opts, struct data_update_opts); +struct bch_io_opts *bch2_move_get_io_opts(struct btree_trans *, + struct per_snapshot_io_opts *, struct bpos, + struct btree_iter *, struct bkey_s_c); + int bch2_move_data_btree(struct moving_context *, struct bpos, struct bpos, move_pred_fn, void *, enum btree_id, unsigned); int __bch2_move_data(struct moving_context *, diff --git a/fs/bcachefs/rebalance.c b/fs/bcachefs/rebalance.c index 8fefe2b174c2..c223bb092d33 100644 --- a/fs/bcachefs/rebalance.c +++ b/fs/bcachefs/rebalance.c @@ -459,22 +459,11 @@ out: return ret; } -static bool rebalance_pred(struct bch_fs *c, void *arg, - enum btree_id btree, struct bkey_s_c k, - struct bch_io_opts *io_opts, - struct data_update_opts *data_opts) -{ - data_opts->rewrite_ptrs = bch2_bkey_ptrs_need_rebalance(c, io_opts, k); - data_opts->target = io_opts->background_target; - data_opts->write_flags |= BCH_WRITE_only_specified_devs; - return data_opts->rewrite_ptrs != 0; -} - static int do_rebalance_scan(struct moving_context *ctxt, u64 inum, u64 cookie) { struct btree_trans *trans = ctxt->trans; + struct bch_fs *c = trans->c; struct bch_fs_rebalance *r = &trans->c->rebalance; - int ret; bch2_move_stats_init(&r->scan_stats, "rebalance_scan"); ctxt->stats = &r->scan_stats; @@ -489,11 +478,34 @@ static int do_rebalance_scan(struct moving_context *ctxt, u64 inum, u64 cookie) r->state = BCH_REBALANCE_scanning; - ret = __bch2_move_data(ctxt, r->scan_start, r->scan_end, rebalance_pred, NULL) ?: - commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc, - bch2_clear_rebalance_needs_scan(trans, inum, cookie)); + struct per_snapshot_io_opts snapshot_io_opts; + per_snapshot_io_opts_init(&snapshot_io_opts, c); + + int ret = for_each_btree_key_max(trans, iter, BTREE_ID_extents, + r->scan_start.pos, r->scan_end.pos, + BTREE_ITER_all_snapshots| + BTREE_ITER_not_extents| + BTREE_ITER_prefetch, k, ({ + ctxt->stats->pos = BBPOS(iter.btree_id, iter.pos); + struct bch_io_opts *io_opts = bch2_move_get_io_opts(trans, + &snapshot_io_opts, iter.pos, &iter, k); + PTR_ERR_OR_ZERO(io_opts); + })) ?: + commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc, + bch2_clear_rebalance_needs_scan(trans, inum, cookie)); + + per_snapshot_io_opts_exit(&snapshot_io_opts); bch2_move_stats_exit(&r->scan_stats, trans->c); + + /* + * Ensure that the rebalance_work entries we created are seen by the + * next iteration of do_rebalance(), so we don't end up stuck in + * rebalance_wait(): + */ + atomic64_inc(&r->scan_stats.sectors_seen); + bch2_btree_write_buffer_flush_sync(trans); + return ret; } -- 2.51.0 From 8a6fa52e07bc8d2e5535e1e2c64b621d34fef9c7 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 16 May 2025 17:21:00 -0400 Subject: [PATCH 08/16] bcachefs: relock_fail tracepoint now includes btree Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_locking.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/fs/bcachefs/btree_locking.c b/fs/bcachefs/btree_locking.c index a45cfae8f671..6663e186a960 100644 --- a/fs/bcachefs/btree_locking.c +++ b/fs/bcachefs/btree_locking.c @@ -1,6 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 #include "bcachefs.h" +#include "btree_cache.h" #include "btree_locking.h" #include "btree_types.h" @@ -742,7 +743,9 @@ static noinline __cold int bch2_trans_relock_fail(struct btree_trans *trans, str struct printbuf buf = PRINTBUF; bch2_bpos_to_text(&buf, path->pos); - prt_printf(&buf, " l=%u seq=%u node seq=", f->l, path->l[f->l].lock_seq); + prt_printf(&buf, " %s l=%u seq=%u node seq=", + bch2_btree_id_str(path->btree_id), + f->l, path->l[f->l].lock_seq); if (IS_ERR_OR_NULL(f->b)) { prt_str(&buf, bch2_err_str(PTR_ERR(f->b))); } else { -- 2.51.0 From a78a11900ecbb358710f65f78eee45e3b5691180 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 16 May 2025 17:18:27 -0400 Subject: [PATCH 09/16] bcachefs: journal path now uses discard_opt_enabled() Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_background.c | 15 +-------------- fs/bcachefs/bcachefs.h | 18 ++++++++++++++++++ fs/bcachefs/journal_reclaim.c | 3 ++- 3 files changed, 21 insertions(+), 15 deletions(-) diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index daf23d471d4f..5934104af7e6 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -1798,19 +1798,6 @@ struct discard_buckets_state { u64 discarded; }; -/* - * This is needed because discard is both a filesystem option and a device - * option, and mount options are supposed to apply to that mount and not be - * persisted, i.e. if it's set as a mount option we can't propagate it to the - * device. - */ -static inline bool discard_opt_enabled(struct bch_fs *c, struct bch_dev *ca) -{ - return test_bit(BCH_FS_discard_mount_opt_set, &c->flags) - ? c->opts.discard - : ca->mi.discard; -} - static int bch2_discard_one_bucket(struct btree_trans *trans, struct bch_dev *ca, struct btree_iter *need_discard_iter, @@ -1874,7 +1861,7 @@ static int bch2_discard_one_bucket(struct btree_trans *trans, s->discarded++; *discard_pos_done = iter.pos; - if (discard_opt_enabled(c, ca) && !c->opts.nochanges) { + if (bch2_discard_opt_enabled(c, ca) && !c->opts.nochanges) { /* * This works without any other locks because this is the only * thread that removes items from the need_discard tree diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index 27c025c05f8e..252fc1eaa0dc 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -452,6 +452,7 @@ BCH_DEBUG_PARAMS_ALL() x(btree_node_compact) \ x(btree_node_merge) \ x(btree_node_sort) \ + x(btree_node_get) \ x(btree_node_read) \ x(btree_node_read_done) \ x(btree_node_write) \ @@ -459,6 +460,10 @@ BCH_DEBUG_PARAMS_ALL() x(btree_interior_update_total) \ x(btree_gc) \ x(data_write) \ + x(data_write_to_submit) \ + x(data_write_to_queue) \ + x(data_write_to_btree_update) \ + x(data_write_btree_update) \ x(data_read) \ x(data_promote) \ x(journal_flush_write) \ @@ -1272,4 +1277,17 @@ static inline unsigned data_replicas_required(struct bch_fs *c) #define BKEY_PADDED_ONSTACK(key, pad) \ struct { struct bkey_i key; __u64 key ## _pad[pad]; } +/* + * This is needed because discard is both a filesystem option and a device + * option, and mount options are supposed to apply to that mount and not be + * persisted, i.e. if it's set as a mount option we can't propagate it to the + * device. + */ +static inline bool bch2_discard_opt_enabled(struct bch_fs *c, struct bch_dev *ca) +{ + return test_bit(BCH_FS_discard_mount_opt_set, &c->flags) + ? c->opts.discard + : ca->mi.discard; +} + #endif /* _BCACHEFS_H */ diff --git a/fs/bcachefs/journal_reclaim.c b/fs/bcachefs/journal_reclaim.c index ce9e0bd7ec4f..70f36f6bc482 100644 --- a/fs/bcachefs/journal_reclaim.c +++ b/fs/bcachefs/journal_reclaim.c @@ -300,7 +300,7 @@ void bch2_journal_do_discards(struct journal *j) while (should_discard_bucket(j, ja)) { if (!c->opts.nochanges && - ca->mi.discard && + bch2_discard_opt_enabled(c, ca) && bdev_max_discard_sectors(ca->disk_sb.bdev)) blkdev_issue_discard(ca->disk_sb.bdev, bucket_to_sector(ca, @@ -701,6 +701,7 @@ static int __bch2_journal_reclaim(struct journal *j, bool direct, bool kicked) if (ret) break; + /* XXX shove journal discards off to another thread */ bch2_journal_do_discards(j); seq_to_flush = journal_seq_to_flush(j); -- 2.51.0 From 9469556a5fc1457d0a55f391010dfb82f7c5e20a Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 15 May 2025 07:45:52 -0400 Subject: [PATCH 10/16] bcachefs: btree key cache asserts Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_key_cache.c | 25 +++++++++++++++++++------ 1 file changed, 19 insertions(+), 6 deletions(-) diff --git a/fs/bcachefs/btree_key_cache.c b/fs/bcachefs/btree_key_cache.c index 669825f89cdd..b8efe2fddbc4 100644 --- a/fs/bcachefs/btree_key_cache.c +++ b/fs/bcachefs/btree_key_cache.c @@ -101,8 +101,8 @@ static void __bkey_cached_free(struct rcu_pending *pending, struct rcu_head *rcu kmem_cache_free(bch2_key_cache, ck); } -static void bkey_cached_free(struct btree_key_cache *bc, - struct bkey_cached *ck) +static inline void bkey_cached_free_noassert(struct btree_key_cache *bc, + struct bkey_cached *ck) { kfree(ck->k); ck->k = NULL; @@ -116,6 +116,19 @@ static void bkey_cached_free(struct btree_key_cache *bc, this_cpu_inc(*bc->nr_pending); } +static void bkey_cached_free(struct btree_trans *trans, + struct btree_key_cache *bc, + struct bkey_cached *ck) +{ + /* + * we'll hit strange issues in the SRCU code if we aren't holding an + * SRCU read lock... + */ + EBUG_ON(!trans->srcu_held); + + bkey_cached_free_noassert(bc, ck); +} + static struct bkey_cached *__bkey_cached_alloc(unsigned key_u64s, gfp_t gfp) { gfp |= __GFP_ACCOUNT|__GFP_RECLAIMABLE; @@ -281,7 +294,7 @@ static int btree_key_cache_create(struct btree_trans *trans, ck_path->uptodate = BTREE_ITER_UPTODATE; return 0; err: - bkey_cached_free(bc, ck); + bkey_cached_free(trans, bc, ck); mark_btree_node_locked_noreset(ck_path, 0, BTREE_NODE_UNLOCKED); return ret; @@ -511,7 +524,7 @@ evict: mark_btree_node_locked_noreset(path, 0, BTREE_NODE_UNLOCKED); if (bkey_cached_evict(&c->btree_key_cache, ck)) { - bkey_cached_free(&c->btree_key_cache, ck); + bkey_cached_free(trans, &c->btree_key_cache, ck); } else { six_unlock_write(&ck->c.lock); six_unlock_intent(&ck->c.lock); @@ -625,7 +638,7 @@ void bch2_btree_key_cache_drop(struct btree_trans *trans, } bkey_cached_evict(bc, ck); - bkey_cached_free(bc, ck); + bkey_cached_free(trans, bc, ck); mark_btree_node_locked(trans, path, 0, BTREE_NODE_UNLOCKED); @@ -693,7 +706,7 @@ static unsigned long bch2_btree_key_cache_scan(struct shrinker *shrink, } else if (!bkey_cached_lock_for_evict(ck)) { bc->skipped_lock_fail++; } else if (bkey_cached_evict(bc, ck)) { - bkey_cached_free(bc, ck); + bkey_cached_free_noassert(bc, ck); bc->freed++; freed++; } else { -- 2.51.0 From 295dbf50e5f60cbc41416aff1feadd876d17e492 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 16 May 2025 17:29:53 -0400 Subject: [PATCH 11/16] bcachefs: Optimize bch2_trans_start_alloc_update() Avoid doing more updates if we already have one. Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_background.c | 21 ++++++++++++++++++--- 1 file changed, 18 insertions(+), 3 deletions(-) diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index 5934104af7e6..f8d21c12c3d1 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -479,11 +479,26 @@ struct bkey_i_alloc_v4 *bch2_trans_start_alloc_update(struct btree_trans *trans, enum btree_iter_update_trigger_flags flags) { struct btree_iter iter; - struct bkey_i_alloc_v4 *a = bch2_trans_start_alloc_update_noupdate(trans, &iter, pos); - int ret = PTR_ERR_OR_ZERO(a); - if (ret) + struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_alloc, pos, + BTREE_ITER_with_updates| + BTREE_ITER_cached| + BTREE_ITER_intent); + int ret = bkey_err(k); + if (unlikely(ret)) return ERR_PTR(ret); + if ((void *) k.v >= trans->mem && + (void *) k.v < trans->mem + trans->mem_top) { + bch2_trans_iter_exit(trans, &iter); + return container_of(bkey_s_c_to_alloc_v4(k).v, struct bkey_i_alloc_v4, v); + } + + struct bkey_i_alloc_v4 *a = bch2_alloc_to_v4_mut_inlined(trans, k); + if (IS_ERR(a)) { + bch2_trans_iter_exit(trans, &iter); + return a; + } + ret = bch2_trans_update_ip(trans, &iter, &a->k_i, flags, _RET_IP_); bch2_trans_iter_exit(trans, &iter); return unlikely(ret) ? ERR_PTR(ret) : a; -- 2.51.0 From 878713b5f56aa4b3dce5bdb7f34952a10183b106 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 16 May 2025 23:12:09 -0400 Subject: [PATCH 12/16] bcachefs: kill copy in bch2_disk_accounting_mod() Signed-off-by: Kent Overstreet --- fs/bcachefs/disk_accounting.c | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/fs/bcachefs/disk_accounting.c b/fs/bcachefs/disk_accounting.c index 195dc3fcec1d..2786a684b0c8 100644 --- a/fs/bcachefs/disk_accounting.c +++ b/fs/bcachefs/disk_accounting.c @@ -94,19 +94,27 @@ int bch2_disk_accounting_mod(struct btree_trans *trans, BUG_ON(nr > BCH_ACCOUNTING_MAX_COUNTERS); - struct { __BKEY_PADDED(k, BCH_ACCOUNTING_MAX_COUNTERS); } k_i; + if (likely(!gc)) { + unsigned u64s = sizeof(struct bkey_i_accounting) / sizeof(u64) + nr; + struct jset_entry *e = bch2_trans_jset_entry_alloc(trans, jset_u64s(u64s)); + int ret = PTR_ERR_OR_ZERO(e); + if (ret) + return ret; - accounting_key_init(&k_i.k, k, d, nr); + journal_entry_init(e, BCH_JSET_ENTRY_write_buffer_keys, BTREE_ID_accounting, 0, u64s); + accounting_key_init(e->start, k, d, nr); + return 0; + } else { + struct { __BKEY_PADDED(k, BCH_ACCOUNTING_MAX_COUNTERS); } k_i; + + accounting_key_init(&k_i.k, k, d, nr); - if (unlikely(gc)) { int ret = bch2_accounting_mem_add(trans, bkey_i_to_s_c_accounting(&k_i.k), true); if (ret == -BCH_ERR_btree_insert_need_mark_replicas) ret = drop_locks_do(trans, bch2_accounting_update_sb_one(trans->c, disk_accounting_pos_to_bpos(k))) ?: bch2_accounting_mem_add(trans, bkey_i_to_s_c_accounting(&k_i.k), true); return ret; - } else { - return bch2_trans_update_buffered(trans, BTREE_ID_accounting, &k_i.k); } } -- 2.51.0 From 68708efcac711946ffeb1803eb54ebaf44675010 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 10 May 2025 18:21:49 -0400 Subject: [PATCH 13/16] bcachefs: struct bch_fs_recovery bch_fs has gotten obnoxiously big, let's start organizing thins a bit better. Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_background.c | 2 +- fs/bcachefs/alloc_foreground.c | 6 +-- fs/bcachefs/backpointers.c | 4 +- fs/bcachefs/bcachefs.h | 17 +------- fs/bcachefs/btree_cache.c | 2 +- fs/bcachefs/btree_io.c | 6 +-- fs/bcachefs/btree_update_interior.c | 2 +- fs/bcachefs/fsck.c | 8 ++-- fs/bcachefs/movinggc.c | 2 +- fs/bcachefs/rebalance.c | 2 +- fs/bcachefs/recovery.c | 10 ++--- fs/bcachefs/recovery_passes.c | 60 +++++++++++++++-------------- fs/bcachefs/recovery_passes_types.h | 23 +++++++++++ fs/bcachefs/snapshot.c | 4 +- fs/bcachefs/super.c | 2 +- 15 files changed, 81 insertions(+), 69 deletions(-) create mode 100644 fs/bcachefs/recovery_passes_types.h diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index f8d21c12c3d1..4ae2aa6ea758 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -309,7 +309,7 @@ int bch2_alloc_v4_validate(struct bch_fs *c, struct bkey_s_c k, "data type inconsistency"); bkey_fsck_err_on(!a.io_time[READ] && - c->curr_recovery_pass > BCH_RECOVERY_PASS_check_alloc_to_lru_refs, + c->recovery.curr_pass > BCH_RECOVERY_PASS_check_alloc_to_lru_refs, c, alloc_key_cached_but_read_time_zero, "cached bucket with read_time == 0"); break; diff --git a/fs/bcachefs/alloc_foreground.c b/fs/bcachefs/alloc_foreground.c index 6aefa490ec24..76641cc4c27d 100644 --- a/fs/bcachefs/alloc_foreground.c +++ b/fs/bcachefs/alloc_foreground.c @@ -154,7 +154,7 @@ static struct open_bucket *bch2_open_bucket_alloc(struct bch_fs *c) static inline bool is_superblock_bucket(struct bch_fs *c, struct bch_dev *ca, u64 b) { - if (c->curr_recovery_pass > BCH_RECOVERY_PASS_trans_mark_dev_sbs) + if (c->recovery.curr_pass > BCH_RECOVERY_PASS_trans_mark_dev_sbs) return false; return bch2_is_superblock_bucket(ca, b); @@ -524,7 +524,7 @@ again: if (!avail) { if (req->watermark > BCH_WATERMARK_normal && - c->curr_recovery_pass <= BCH_RECOVERY_PASS_check_allocations) + c->recovery.curr_pass <= BCH_RECOVERY_PASS_check_allocations) goto alloc; if (cl && !waiting) { @@ -554,7 +554,7 @@ alloc: goto alloc; } - if (!ob && freespace && c->curr_recovery_pass <= BCH_RECOVERY_PASS_check_alloc_info) { + if (!ob && freespace && c->recovery.curr_pass <= BCH_RECOVERY_PASS_check_alloc_info) { freespace = false; goto alloc; } diff --git a/fs/bcachefs/backpointers.c b/fs/bcachefs/backpointers.c index bdf524b465fa..44da8e2657af 100644 --- a/fs/bcachefs/backpointers.c +++ b/fs/bcachefs/backpointers.c @@ -120,7 +120,7 @@ static noinline int backpointer_mod_err(struct btree_trans *trans, bch2_bkey_val_to_text(&buf, c, orig_k); bch_err(c, "%s", buf.buf); - } else if (c->curr_recovery_pass > BCH_RECOVERY_PASS_check_extents_to_backpointers) { + } else if (c->recovery.curr_pass > BCH_RECOVERY_PASS_check_extents_to_backpointers) { prt_printf(&buf, "backpointer not found when deleting\n"); printbuf_indent_add(&buf, 2); @@ -136,7 +136,7 @@ static noinline int backpointer_mod_err(struct btree_trans *trans, bch2_bkey_val_to_text(&buf, c, orig_k); } - if (c->curr_recovery_pass > BCH_RECOVERY_PASS_check_extents_to_backpointers && + if (c->recovery.curr_pass > BCH_RECOVERY_PASS_check_extents_to_backpointers && __bch2_inconsistent_error(c, &buf)) ret = -BCH_ERR_erofs_unfixed_errors; diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index 252fc1eaa0dc..1458f131af16 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -502,6 +502,7 @@ enum bch_time_stats { #include "keylist_types.h" #include "quota_types.h" #include "rebalance_types.h" +#include "recovery_passes_types.h" #include "replicas_types.h" #include "sb-members_types.h" #include "subvolume_types.h" @@ -1116,21 +1117,7 @@ struct bch_fs { /* RECOVERY */ u64 journal_replay_seq_start; u64 journal_replay_seq_end; - /* - * Two different uses: - * "Has this fsck pass?" - i.e. should this type of error be an - * emergency read-only - * And, in certain situations fsck will rewind to an earlier pass: used - * for signaling to the toplevel code which pass we want to run now. - */ - enum bch_recovery_pass curr_recovery_pass; - enum bch_recovery_pass next_recovery_pass; - /* bitmask of recovery passes that we actually ran */ - u64 recovery_passes_complete; - /* never rewinds version of curr_recovery_pass */ - enum bch_recovery_pass recovery_pass_done; - spinlock_t recovery_pass_lock; - struct semaphore run_recovery_passes_lock; + struct bch_fs_recovery recovery; /* DEBUG JUNK */ struct dentry *fs_debug_dir; diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c index 2fd58b08a54d..b1932b6a514b 100644 --- a/fs/bcachefs/btree_cache.c +++ b/fs/bcachefs/btree_cache.c @@ -1003,7 +1003,7 @@ static noinline void btree_bad_header(struct bch_fs *c, struct btree *b) { struct printbuf buf = PRINTBUF; - if (c->curr_recovery_pass <= BCH_RECOVERY_PASS_check_allocations) + if (c->recovery.curr_pass <= BCH_RECOVERY_PASS_check_allocations) return; prt_printf(&buf, diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c index 97cd25cd492b..e5db374f001b 100644 --- a/fs/bcachefs/btree_io.c +++ b/fs/bcachefs/btree_io.c @@ -556,7 +556,7 @@ static int __btree_err(int ret, struct printbuf *err_msg, const char *fmt, ...) { - if (c->curr_recovery_pass == BCH_RECOVERY_PASS_scan_for_btree_nodes) + if (c->recovery.curr_pass == BCH_RECOVERY_PASS_scan_for_btree_nodes) return -BCH_ERR_fsck_fix; bool have_retry = false; @@ -1428,7 +1428,7 @@ start: if ((failed.nr || btree_node_need_rewrite(b)) && !btree_node_read_error(b) && - c->curr_recovery_pass != BCH_RECOVERY_PASS_scan_for_btree_nodes) { + c->recovery.curr_pass != BCH_RECOVERY_PASS_scan_for_btree_nodes) { prt_printf(&buf, " (rewriting node)"); bch2_btree_node_rewrite_async(c, b); } @@ -1776,7 +1776,7 @@ void bch2_btree_node_read(struct btree_trans *trans, struct btree *b, bch2_btree_lost_data(c, &buf, b->c.btree_id); if (c->opts.recovery_passes & BIT_ULL(BCH_RECOVERY_PASS_check_topology) && - c->curr_recovery_pass > BCH_RECOVERY_PASS_check_topology && + c->recovery.curr_pass > BCH_RECOVERY_PASS_check_topology && bch2_fs_emergency_read_only2(c, &buf)) ratelimit = false; diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index 2d43d51b597d..a658c97439ed 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -2363,7 +2363,7 @@ void bch2_btree_node_rewrite_async(struct bch_fs *c, struct btree *b) bool now = false, pending = false; spin_lock(&c->btree_node_rewrites_lock); - if (c->curr_recovery_pass > BCH_RECOVERY_PASS_journal_replay && + if (c->recovery.curr_pass > BCH_RECOVERY_PASS_journal_replay && enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_node_rewrite)) { list_add(&a->list, &c->btree_node_rewrites); now = true; diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index ab936520e0ae..94a64816cb50 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -3177,7 +3177,7 @@ static int bch2_fsck_online_thread_fn(struct thread_with_stdio *stdio) c->opts.fsck = true; set_bit(BCH_FS_in_fsck, &c->flags); - c->curr_recovery_pass = BCH_RECOVERY_PASS_check_alloc_info; + c->recovery.curr_pass = BCH_RECOVERY_PASS_check_alloc_info; int ret = bch2_run_online_recovery_passes(c); clear_bit(BCH_FS_in_fsck, &c->flags); @@ -3187,7 +3187,7 @@ static int bch2_fsck_online_thread_fn(struct thread_with_stdio *stdio) c->stdio_filter = NULL; c->opts.fix_errors = old_fix_errors; - up(&c->run_recovery_passes_lock); + up(&c->recovery.run_lock); bch2_ro_ref_put(c); return ret; } @@ -3211,7 +3211,7 @@ long bch2_ioctl_fsck_online(struct bch_fs *c, struct bch_ioctl_fsck_online arg) if (!bch2_ro_ref_tryget(c)) return -EROFS; - if (down_trylock(&c->run_recovery_passes_lock)) { + if (down_trylock(&c->recovery.run_lock)) { bch2_ro_ref_put(c); return -EAGAIN; } @@ -3243,7 +3243,7 @@ err: bch_err_fn(c, ret); if (thr) bch2_fsck_thread_exit(&thr->thr); - up(&c->run_recovery_passes_lock); + up(&c->recovery.run_lock); bch2_ro_ref_put(c); } return ret; diff --git a/fs/bcachefs/movinggc.c b/fs/bcachefs/movinggc.c index cc843815f7eb..4bfdb1befb9a 100644 --- a/fs/bcachefs/movinggc.c +++ b/fs/bcachefs/movinggc.c @@ -362,7 +362,7 @@ static int bch2_copygc_thread(void *arg) * Data move operations can't run until after check_snapshots has * completed, and bch2_snapshot_is_ancestor() is available. */ - kthread_wait_freezable(c->recovery_pass_done > BCH_RECOVERY_PASS_check_snapshots || + kthread_wait_freezable(c->recovery.pass_done > BCH_RECOVERY_PASS_check_snapshots || kthread_should_stop()); bch2_move_stats_init(&move_stats, "copygc"); diff --git a/fs/bcachefs/rebalance.c b/fs/bcachefs/rebalance.c index c223bb092d33..de1ec9e0caa0 100644 --- a/fs/bcachefs/rebalance.c +++ b/fs/bcachefs/rebalance.c @@ -616,7 +616,7 @@ static int bch2_rebalance_thread(void *arg) * Data move operations can't run until after check_snapshots has * completed, and bch2_snapshot_is_ancestor() is available. */ - kthread_wait_freezable(c->recovery_pass_done > BCH_RECOVERY_PASS_check_snapshots || + kthread_wait_freezable(c->recovery.pass_done > BCH_RECOVERY_PASS_check_snapshots || kthread_should_stop()); bch2_moving_ctxt_init(&ctxt, c, NULL, &r->work_stats, diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index 1895a6b13001..cd2372221a54 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -434,7 +434,7 @@ int bch2_journal_replay(struct bch_fs *c) trans = NULL; if (!c->opts.retain_recovery_info && - c->recovery_pass_done >= BCH_RECOVERY_PASS_journal_replay) + c->recovery.pass_done >= BCH_RECOVERY_PASS_journal_replay) bch2_journal_keys_put_initial(c); replay_now_at(j, j->replay_journal_seq_end); @@ -1001,7 +1001,7 @@ use_clean: bch_info(c, "Fixed errors, running fsck a second time to verify fs is clean"); clear_bit(BCH_FS_errors_fixed, &c->flags); - c->curr_recovery_pass = BCH_RECOVERY_PASS_check_alloc_info; + c->recovery.curr_pass = BCH_RECOVERY_PASS_check_alloc_info; ret = bch2_run_recovery_passes(c); if (ret) @@ -1047,7 +1047,7 @@ use_clean: if (c->opts.fsck && !test_bit(BCH_FS_error, &c->flags) && - c->recovery_pass_done == BCH_RECOVERY_PASS_NR - 1 && + c->recovery.pass_done == BCH_RECOVERY_PASS_NR - 1 && ext->btrees_lost_data) { ext->btrees_lost_data = 0; write_sb = true; @@ -1234,7 +1234,7 @@ int bch2_fs_initialize(struct bch_fs *c) if (ret) goto err; - c->recovery_pass_done = BCH_RECOVERY_PASS_NR - 1; + c->recovery.pass_done = BCH_RECOVERY_PASS_NR - 1; bch2_copygc_wakeup(c); bch2_rebalance_wakeup(c); @@ -1257,7 +1257,7 @@ int bch2_fs_initialize(struct bch_fs *c) bch2_write_super(c); mutex_unlock(&c->sb_lock); - c->curr_recovery_pass = BCH_RECOVERY_PASS_NR; + c->recovery.curr_pass = BCH_RECOVERY_PASS_NR; return 0; err: bch_err_fn(c, ret); diff --git a/fs/bcachefs/recovery_passes.c b/fs/bcachefs/recovery_passes.c index 22cefffcf1fa..c1eca55a1dde 100644 --- a/fs/bcachefs/recovery_passes.c +++ b/fs/bcachefs/recovery_passes.c @@ -210,16 +210,18 @@ static int __bch2_run_explicit_recovery_pass(struct printbuf *out, struct bch_fs *c, enum bch_recovery_pass pass) { - if (c->curr_recovery_pass == ARRAY_SIZE(recovery_pass_fns)) + struct bch_fs_recovery *r = &c->recovery; + + if (r->curr_pass == ARRAY_SIZE(recovery_pass_fns)) return -BCH_ERR_not_in_recovery; - if (c->recovery_passes_complete & BIT_ULL(pass)) + if (r->passes_complete & BIT_ULL(pass)) return 0; bool print = !(c->opts.recovery_passes & BIT_ULL(pass)); if (pass < BCH_RECOVERY_PASS_set_may_go_rw && - c->curr_recovery_pass >= BCH_RECOVERY_PASS_set_may_go_rw) { + r->curr_pass >= BCH_RECOVERY_PASS_set_may_go_rw) { if (print) prt_printf(out, "need recovery pass %s (%u), but already rw\n", bch2_recovery_passes[pass], pass); @@ -229,14 +231,14 @@ static int __bch2_run_explicit_recovery_pass(struct printbuf *out, if (print) prt_printf(out, "running explicit recovery pass %s (%u), currently at %s (%u)\n", bch2_recovery_passes[pass], pass, - bch2_recovery_passes[c->curr_recovery_pass], c->curr_recovery_pass); + bch2_recovery_passes[r->curr_pass], r->curr_pass); c->opts.recovery_passes |= BIT_ULL(pass); if (test_bit(BCH_FS_in_recovery, &c->flags) && - c->curr_recovery_pass > pass) { - c->next_recovery_pass = pass; - c->recovery_passes_complete &= (1ULL << pass) >> 1; + r->curr_pass > pass) { + r->next_pass = pass; + r->passes_complete &= (1ULL << pass) >> 1; return -BCH_ERR_restart_recovery; } else { return 0; @@ -251,9 +253,9 @@ static int bch2_run_explicit_recovery_pass_printbuf(struct bch_fs *c, out->atomic++; unsigned long flags; - spin_lock_irqsave(&c->recovery_pass_lock, flags); + spin_lock_irqsave(&c->recovery.lock, flags); int ret = __bch2_run_explicit_recovery_pass(out, c, pass); - spin_unlock_irqrestore(&c->recovery_pass_lock, flags); + spin_unlock_irqrestore(&c->recovery.lock, flags); --out->atomic; return ret; @@ -361,7 +363,7 @@ int bch2_run_online_recovery_passes(struct bch_fs *c) int ret = bch2_run_recovery_pass(c, i); if (bch2_err_matches(ret, BCH_ERR_restart_recovery)) { - i = c->curr_recovery_pass; + i = c->recovery.curr_pass; continue; } if (ret) @@ -381,26 +383,26 @@ int bch2_run_recovery_passes(struct bch_fs *c) */ c->opts.recovery_passes_exclude &= ~BCH_RECOVERY_PASS_set_may_go_rw; - down(&c->run_recovery_passes_lock); - spin_lock_irq(&c->recovery_pass_lock); + down(&c->recovery.run_lock); + spin_lock_irq(&c->recovery.lock); - while (c->curr_recovery_pass < ARRAY_SIZE(recovery_pass_fns) && !ret) { - unsigned prev_done = c->recovery_pass_done; - unsigned pass = c->curr_recovery_pass; + while (c->recovery.curr_pass < ARRAY_SIZE(recovery_pass_fns) && !ret) { + unsigned prev_done = c->recovery.pass_done; + unsigned pass = c->recovery.curr_pass; - c->next_recovery_pass = pass + 1; + c->recovery.next_pass = pass + 1; if (c->opts.recovery_pass_last && - c->curr_recovery_pass > c->opts.recovery_pass_last) + c->recovery.curr_pass > c->opts.recovery_pass_last) break; if (should_run_recovery_pass(c, pass)) { - spin_unlock_irq(&c->recovery_pass_lock); + spin_unlock_irq(&c->recovery.lock); ret = bch2_run_recovery_pass(c, pass) ?: bch2_journal_flush(&c->journal); - spin_lock_irq(&c->recovery_pass_lock); + spin_lock_irq(&c->recovery.lock); - if (c->next_recovery_pass < c->curr_recovery_pass) { + if (c->recovery.next_pass < c->recovery.curr_pass) { /* * bch2_run_explicit_recovery_pass() was called: we * can't always catch -BCH_ERR_restart_recovery because @@ -408,30 +410,30 @@ int bch2_run_recovery_passes(struct bch_fs *c) * node read completion) */ ret = 0; - c->recovery_passes_complete &= ~(~0ULL << c->curr_recovery_pass); + c->recovery.passes_complete &= ~(~0ULL << c->recovery.curr_pass); } else { - c->recovery_passes_complete |= BIT_ULL(pass); - c->recovery_pass_done = max(c->recovery_pass_done, pass); + c->recovery.passes_complete |= BIT_ULL(pass); + c->recovery.pass_done = max(c->recovery.pass_done, pass); } } - c->curr_recovery_pass = c->next_recovery_pass; + c->recovery.curr_pass = c->recovery.next_pass; if (prev_done <= BCH_RECOVERY_PASS_check_snapshots && - c->recovery_pass_done > BCH_RECOVERY_PASS_check_snapshots) { + c->recovery.pass_done > BCH_RECOVERY_PASS_check_snapshots) { bch2_copygc_wakeup(c); bch2_rebalance_wakeup(c); } } - spin_unlock_irq(&c->recovery_pass_lock); - up(&c->run_recovery_passes_lock); + spin_unlock_irq(&c->recovery.lock); + up(&c->recovery.run_lock); return ret; } void bch2_fs_recovery_passes_init(struct bch_fs *c) { - spin_lock_init(&c->recovery_pass_lock); - sema_init(&c->run_recovery_passes_lock, 1); + spin_lock_init(&c->recovery.lock); + sema_init(&c->recovery.run_lock, 1); } diff --git a/fs/bcachefs/recovery_passes_types.h b/fs/bcachefs/recovery_passes_types.h new file mode 100644 index 000000000000..69e8e29d58d0 --- /dev/null +++ b/fs/bcachefs/recovery_passes_types.h @@ -0,0 +1,23 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_RECOVERY_PASSES_TYPES_H +#define _BCACHEFS_RECOVERY_PASSES_TYPES_H + +struct bch_fs_recovery { + /* + * Two different uses: + * "Has this fsck pass?" - i.e. should this type of error be an + * emergency read-only + * And, in certain situations fsck will rewind to an earlier pass: used + * for signaling to the toplevel code which pass we want to run now. + */ + enum bch_recovery_pass curr_pass; + enum bch_recovery_pass next_pass; + /* never rewinds version of curr_pass */ + enum bch_recovery_pass pass_done; + /* bitmask of recovery passes that we actually ran */ + u64 passes_complete; + spinlock_t lock; + struct semaphore run_lock; +}; + +#endif /* _BCACHEFS_RECOVERY_PASSES_TYPES_H */ diff --git a/fs/bcachefs/snapshot.c b/fs/bcachefs/snapshot.c index c3dc450cbcec..c401d5285701 100644 --- a/fs/bcachefs/snapshot.c +++ b/fs/bcachefs/snapshot.c @@ -143,7 +143,7 @@ bool __bch2_snapshot_is_ancestor(struct bch_fs *c, u32 id, u32 ancestor) rcu_read_lock(); struct snapshot_table *t = rcu_dereference(c->snapshots); - if (unlikely(c->recovery_pass_done < BCH_RECOVERY_PASS_check_snapshots)) { + if (unlikely(c->recovery.pass_done < BCH_RECOVERY_PASS_check_snapshots)) { ret = __bch2_snapshot_is_ancestor_early(t, id, ancestor); goto out; } @@ -348,7 +348,7 @@ static int __bch2_mark_snapshot(struct btree_trans *trans, if (BCH_SNAPSHOT_WILL_DELETE(s.v)) { set_bit(BCH_FS_need_delete_dead_snapshots, &c->flags); - if (c->curr_recovery_pass > BCH_RECOVERY_PASS_delete_dead_snapshots) + if (c->recovery.curr_pass > BCH_RECOVERY_PASS_delete_dead_snapshots) bch2_delete_dead_snapshots_async(c); } } else { diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index c46b2b2ebab1..170b0f26c018 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -392,7 +392,7 @@ void bch2_fs_read_only(struct bch_fs *c) !test_bit(BCH_FS_emergency_ro, &c->flags) && test_bit(BCH_FS_started, &c->flags) && test_bit(BCH_FS_clean_shutdown, &c->flags) && - c->recovery_pass_done >= BCH_RECOVERY_PASS_journal_replay) { + c->recovery.pass_done >= BCH_RECOVERY_PASS_journal_replay) { BUG_ON(c->journal.last_empty_seq != journal_cur_seq(&c->journal)); BUG_ON(atomic_long_read(&c->btree_cache.nr_dirty)); BUG_ON(atomic_long_read(&c->btree_key_cache.nr_dirty)); -- 2.51.0 From ab355520305ce4ab7331757c35b1042b32cae52e Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 10 May 2025 17:45:45 -0400 Subject: [PATCH 14/16] bcachefs: __bch2_run_recovery_passes() Consolidate bch2_run_recovery_passes() and bch2_run_online_recovery_passes(), prep work for automatically scheduling and running recovery passes in the background. - Now takes a mask of which passes to run, automatic background repair will pass in sb.recovery_passes_required. - Skips passes that are failing: a pass that failed may be reattempted after another pass succeeds (some passes depend on repair done by other passes for successful completion). - bch2_recovery_passes_match() helper to skip alloc passes on a filesystem without alloc info. Signed-off-by: Kent Overstreet --- fs/bcachefs/fsck.c | 2 +- fs/bcachefs/recovery.c | 7 +- fs/bcachefs/recovery_passes.c | 180 +++++++++++++++------------- fs/bcachefs/recovery_passes.h | 4 +- fs/bcachefs/recovery_passes_types.h | 2 + 5 files changed, 104 insertions(+), 91 deletions(-) diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index 94a64816cb50..0e223d4ae2ec 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -3178,7 +3178,7 @@ static int bch2_fsck_online_thread_fn(struct thread_with_stdio *stdio) set_bit(BCH_FS_in_fsck, &c->flags); c->recovery.curr_pass = BCH_RECOVERY_PASS_check_alloc_info; - int ret = bch2_run_online_recovery_passes(c); + int ret = bch2_run_online_recovery_passes(c, ~0ULL); clear_bit(BCH_FS_in_fsck, &c->flags); bch_err_fn(c, ret); diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index cd2372221a54..a7e6b5a6505a 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -966,7 +966,7 @@ use_clean: if (ret) goto err; - ret = bch2_run_recovery_passes(c); + ret = bch2_run_recovery_passes(c, 0); if (ret) goto err; @@ -1001,9 +1001,8 @@ use_clean: bch_info(c, "Fixed errors, running fsck a second time to verify fs is clean"); clear_bit(BCH_FS_errors_fixed, &c->flags); - c->recovery.curr_pass = BCH_RECOVERY_PASS_check_alloc_info; - - ret = bch2_run_recovery_passes(c); + ret = bch2_run_recovery_passes(c, + BCH_RECOVERY_PASS_check_alloc_info); if (ret) goto err; diff --git a/fs/bcachefs/recovery_passes.c b/fs/bcachefs/recovery_passes.c index c1eca55a1dde..e0e261aa752e 100644 --- a/fs/bcachefs/recovery_passes.c +++ b/fs/bcachefs/recovery_passes.c @@ -203,6 +203,21 @@ static struct recovery_pass_fn recovery_pass_fns[] = { #undef x }; +static u64 bch2_recovery_passes_match(unsigned flags) +{ + u64 ret = 0; + + for (unsigned i = 0; i < ARRAY_SIZE(recovery_pass_fns); i++) + if (recovery_pass_fns[i].when & flags) + ret |= BIT_ULL(i); + return ret; +} + +u64 bch2_fsck_recovery_passes(void) +{ + return bch2_recovery_passes_match(PASS_FSCK); +} + /* * For when we need to rewind recovery passes and run a pass we skipped: */ @@ -235,10 +250,12 @@ static int __bch2_run_explicit_recovery_pass(struct printbuf *out, c->opts.recovery_passes |= BIT_ULL(pass); + if (test_bit(BCH_FS_in_recovery, &c->flags)) + r->passes_to_run |= BIT_ULL(pass); + if (test_bit(BCH_FS_in_recovery, &c->flags) && r->curr_pass > pass) { r->next_pass = pass; - r->passes_complete &= (1ULL << pass) >> 1; return -BCH_ERR_restart_recovery; } else { return 0; @@ -302,37 +319,9 @@ int bch2_run_explicit_recovery_pass_persistent(struct bch_fs *c, return ret; } -u64 bch2_fsck_recovery_passes(void) -{ - u64 ret = 0; - - for (unsigned i = 0; i < ARRAY_SIZE(recovery_pass_fns); i++) - if (recovery_pass_fns[i].when & PASS_FSCK) - ret |= BIT_ULL(i); - return ret; -} - -static bool should_run_recovery_pass(struct bch_fs *c, enum bch_recovery_pass pass) -{ - struct recovery_pass_fn *p = recovery_pass_fns + pass; - - if ((p->when & PASS_ALLOC) && (c->sb.features & BIT_ULL(BCH_FEATURE_no_alloc_info))) - return false; - if (c->opts.recovery_passes_exclude & BIT_ULL(pass)) - return false; - if (c->opts.recovery_passes & BIT_ULL(pass)) - return true; - if ((p->when & PASS_FSCK) && c->opts.fsck) - return true; - if ((p->when & PASS_UNCLEAN) && !c->sb.clean) - return true; - if (p->when & PASS_ALWAYS) - return true; - return false; -} - static int bch2_run_recovery_pass(struct bch_fs *c, enum bch_recovery_pass pass) { + struct bch_fs_recovery *r = &c->recovery; struct recovery_pass_fn *p = recovery_pass_fns + pass; if (!(p->when & PASS_SILENT)) @@ -341,8 +330,15 @@ static int bch2_run_recovery_pass(struct bch_fs *c, enum bch_recovery_pass pass) s64 start_time = ktime_get_real_seconds(); int ret = p->fn(c); - if (ret) + + r->passes_to_run &= ~BIT_ULL(pass); + + if (ret) { + r->passes_failing |= BIT_ULL(pass); return ret; + } + + r->passes_failing = 0; if (!test_bit(BCH_FS_error, &c->flags)) bch2_sb_recovery_pass_complete(c, pass, start_time); @@ -353,80 +349,96 @@ static int bch2_run_recovery_pass(struct bch_fs *c, enum bch_recovery_pass pass) return 0; } -int bch2_run_online_recovery_passes(struct bch_fs *c) +static int __bch2_run_recovery_passes(struct bch_fs *c, u64 orig_passes_to_run, + bool online) { - for (unsigned i = 0; i < ARRAY_SIZE(recovery_pass_fns); i++) { - struct recovery_pass_fn *p = recovery_pass_fns + i; - - if (!(p->when & PASS_ONLINE)) - continue; + struct bch_fs_recovery *r = &c->recovery; + int ret = 0; - int ret = bch2_run_recovery_pass(c, i); - if (bch2_err_matches(ret, BCH_ERR_restart_recovery)) { - i = c->recovery.curr_pass; - continue; - } - if (ret) - return ret; - } + spin_lock_irq(&r->lock); - return 0; -} + if (online) + orig_passes_to_run &= bch2_recovery_passes_match(PASS_ONLINE); -int bch2_run_recovery_passes(struct bch_fs *c) -{ - int ret = 0; + if (c->sb.features & BIT_ULL(BCH_FEATURE_no_alloc_info)) + orig_passes_to_run &= ~bch2_recovery_passes_match(PASS_ALLOC); /* - * We can't allow set_may_go_rw to be excluded; that would cause us to - * use the journal replay keys for updates where it's not expected. + * A failed recovery pass will be retried after another pass succeeds - + * but not this iteration. + * + * This is because some passes depend on repair done by other passes: we + * may want to retry, but we don't want to loop on failing passes. */ - c->opts.recovery_passes_exclude &= ~BCH_RECOVERY_PASS_set_may_go_rw; - down(&c->recovery.run_lock); - spin_lock_irq(&c->recovery.lock); + orig_passes_to_run &= ~r->passes_failing; - while (c->recovery.curr_pass < ARRAY_SIZE(recovery_pass_fns) && !ret) { - unsigned prev_done = c->recovery.pass_done; - unsigned pass = c->recovery.curr_pass; + r->passes_to_run = orig_passes_to_run; - c->recovery.next_pass = pass + 1; + while (r->passes_to_run) { + unsigned prev_done = r->pass_done; + unsigned pass = __ffs64(r->passes_to_run); + r->curr_pass = pass; + r->next_pass = r->curr_pass + 1; + r->passes_to_run &= ~BIT_ULL(pass); - if (c->opts.recovery_pass_last && - c->recovery.curr_pass > c->opts.recovery_pass_last) - break; + spin_unlock_irq(&r->lock); + + int ret2 = bch2_run_recovery_pass(c, pass) ?: + bch2_journal_flush(&c->journal); - if (should_run_recovery_pass(c, pass)) { - spin_unlock_irq(&c->recovery.lock); - ret = bch2_run_recovery_pass(c, pass) ?: - bch2_journal_flush(&c->journal); - spin_lock_irq(&c->recovery.lock); - - if (c->recovery.next_pass < c->recovery.curr_pass) { - /* - * bch2_run_explicit_recovery_pass() was called: we - * can't always catch -BCH_ERR_restart_recovery because - * it may have been called from another thread (btree - * node read completion) - */ - ret = 0; - c->recovery.passes_complete &= ~(~0ULL << c->recovery.curr_pass); - } else { - c->recovery.passes_complete |= BIT_ULL(pass); - c->recovery.pass_done = max(c->recovery.pass_done, pass); - } + spin_lock_irq(&r->lock); + + if (r->next_pass < r->curr_pass) { + /* Rewind: */ + r->passes_to_run |= orig_passes_to_run & (~0ULL << r->next_pass); + } else if (!ret2) { + r->pass_done = max(r->pass_done, pass); + r->passes_complete |= BIT_ULL(pass); + } else { + ret = ret2; } - c->recovery.curr_pass = c->recovery.next_pass; + if (ret && !online) + break; if (prev_done <= BCH_RECOVERY_PASS_check_snapshots && - c->recovery.pass_done > BCH_RECOVERY_PASS_check_snapshots) { + r->pass_done > BCH_RECOVERY_PASS_check_snapshots) { bch2_copygc_wakeup(c); bch2_rebalance_wakeup(c); } } - spin_unlock_irq(&c->recovery.lock); + spin_unlock_irq(&r->lock); + + return ret; +} + +int bch2_run_online_recovery_passes(struct bch_fs *c, u64 passes) +{ + return __bch2_run_recovery_passes(c, c->sb.recovery_passes_required|passes, true); +} + +int bch2_run_recovery_passes(struct bch_fs *c, enum bch_recovery_pass from) +{ + u64 passes = + bch2_recovery_passes_match(PASS_ALWAYS) | + (!c->sb.clean ? bch2_recovery_passes_match(PASS_UNCLEAN) : 0) | + (c->opts.fsck ? bch2_recovery_passes_match(PASS_FSCK) : 0) | + c->opts.recovery_passes | + c->sb.recovery_passes_required; + + /* + * We can't allow set_may_go_rw to be excluded; that would cause us to + * use the journal replay keys for updates where it's not expected. + */ + c->opts.recovery_passes_exclude &= ~BCH_RECOVERY_PASS_set_may_go_rw; + passes &= ~c->opts.recovery_passes_exclude; + + passes &= ~(BIT_ULL(from) - 1); + + down(&c->recovery.run_lock); + int ret = __bch2_run_recovery_passes(c, passes, false); up(&c->recovery.run_lock); return ret; diff --git a/fs/bcachefs/recovery_passes.h b/fs/bcachefs/recovery_passes.h index 4c03472be5b9..0e79cc33fd8f 100644 --- a/fs/bcachefs/recovery_passes.h +++ b/fs/bcachefs/recovery_passes.h @@ -17,8 +17,8 @@ int __bch2_run_explicit_recovery_pass_persistent(struct bch_fs *, struct printbu int bch2_run_explicit_recovery_pass_persistent(struct bch_fs *, struct printbuf *, enum bch_recovery_pass); -int bch2_run_online_recovery_passes(struct bch_fs *); -int bch2_run_recovery_passes(struct bch_fs *); +int bch2_run_online_recovery_passes(struct bch_fs *, u64); +int bch2_run_recovery_passes(struct bch_fs *, enum bch_recovery_pass); void bch2_fs_recovery_passes_init(struct bch_fs *); diff --git a/fs/bcachefs/recovery_passes_types.h b/fs/bcachefs/recovery_passes_types.h index 69e8e29d58d0..deb6e0565cb9 100644 --- a/fs/bcachefs/recovery_passes_types.h +++ b/fs/bcachefs/recovery_passes_types.h @@ -14,8 +14,10 @@ struct bch_fs_recovery { enum bch_recovery_pass next_pass; /* never rewinds version of curr_pass */ enum bch_recovery_pass pass_done; + u64 passes_to_run; /* bitmask of recovery passes that we actually ran */ u64 passes_complete; + u64 passes_failing; spinlock_t lock; struct semaphore run_lock; }; -- 2.51.0 From 7ed4c14e20be2b113e111ec9fa1803c778e00280 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 13 May 2025 17:36:55 -0400 Subject: [PATCH 15/16] bcachefs: Reduce usage of recovery.curr_pass We want recovery.curr_pass to be private to the recovery passes code, for better showing recovery pass status; also, it may rewind and is generally not the correct member to use. Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_background.c | 3 ++- fs/bcachefs/alloc_foreground.c | 6 +++--- fs/bcachefs/backpointers.c | 7 ++++--- fs/bcachefs/btree_cache.c | 2 +- fs/bcachefs/btree_io.c | 3 +-- fs/bcachefs/btree_update_interior.c | 2 +- fs/bcachefs/fsck.c | 1 - fs/bcachefs/snapshot.c | 2 +- 8 files changed, 13 insertions(+), 13 deletions(-) diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index 4ae2aa6ea758..88e710ba2685 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -309,7 +309,8 @@ int bch2_alloc_v4_validate(struct bch_fs *c, struct bkey_s_c k, "data type inconsistency"); bkey_fsck_err_on(!a.io_time[READ] && - c->recovery.curr_pass > BCH_RECOVERY_PASS_check_alloc_to_lru_refs, + !(c->recovery.passes_to_run & + BIT_ULL(BCH_RECOVERY_PASS_check_alloc_to_lru_refs)), c, alloc_key_cached_but_read_time_zero, "cached bucket with read_time == 0"); break; diff --git a/fs/bcachefs/alloc_foreground.c b/fs/bcachefs/alloc_foreground.c index 76641cc4c27d..1a52c12c51ae 100644 --- a/fs/bcachefs/alloc_foreground.c +++ b/fs/bcachefs/alloc_foreground.c @@ -154,7 +154,7 @@ static struct open_bucket *bch2_open_bucket_alloc(struct bch_fs *c) static inline bool is_superblock_bucket(struct bch_fs *c, struct bch_dev *ca, u64 b) { - if (c->recovery.curr_pass > BCH_RECOVERY_PASS_trans_mark_dev_sbs) + if (c->recovery.passes_complete & BIT_ULL(BCH_RECOVERY_PASS_trans_mark_dev_sbs)) return false; return bch2_is_superblock_bucket(ca, b); @@ -524,7 +524,7 @@ again: if (!avail) { if (req->watermark > BCH_WATERMARK_normal && - c->recovery.curr_pass <= BCH_RECOVERY_PASS_check_allocations) + c->recovery.pass_done < BCH_RECOVERY_PASS_check_allocations) goto alloc; if (cl && !waiting) { @@ -554,7 +554,7 @@ alloc: goto alloc; } - if (!ob && freespace && c->recovery.curr_pass <= BCH_RECOVERY_PASS_check_alloc_info) { + if (!ob && freespace && c->recovery.pass_done < BCH_RECOVERY_PASS_check_alloc_info) { freespace = false; goto alloc; } diff --git a/fs/bcachefs/backpointers.c b/fs/bcachefs/backpointers.c index 44da8e2657af..d9ddfc4b5dcc 100644 --- a/fs/bcachefs/backpointers.c +++ b/fs/bcachefs/backpointers.c @@ -104,6 +104,8 @@ static noinline int backpointer_mod_err(struct btree_trans *trans, { struct bch_fs *c = trans->c; struct printbuf buf = PRINTBUF; + bool will_check = c->recovery.passes_to_run & + BIT_ULL(BCH_RECOVERY_PASS_check_extents_to_backpointers); int ret = 0; if (insert) { @@ -120,7 +122,7 @@ static noinline int backpointer_mod_err(struct btree_trans *trans, bch2_bkey_val_to_text(&buf, c, orig_k); bch_err(c, "%s", buf.buf); - } else if (c->recovery.curr_pass > BCH_RECOVERY_PASS_check_extents_to_backpointers) { + } else if (!will_check) { prt_printf(&buf, "backpointer not found when deleting\n"); printbuf_indent_add(&buf, 2); @@ -136,8 +138,7 @@ static noinline int backpointer_mod_err(struct btree_trans *trans, bch2_bkey_val_to_text(&buf, c, orig_k); } - if (c->recovery.curr_pass > BCH_RECOVERY_PASS_check_extents_to_backpointers && - __bch2_inconsistent_error(c, &buf)) + if (!will_check && __bch2_inconsistent_error(c, &buf)) ret = -BCH_ERR_erofs_unfixed_errors; bch_err(c, "%s", buf.buf); diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c index b1932b6a514b..8557cbd3d818 100644 --- a/fs/bcachefs/btree_cache.c +++ b/fs/bcachefs/btree_cache.c @@ -1003,7 +1003,7 @@ static noinline void btree_bad_header(struct bch_fs *c, struct btree *b) { struct printbuf buf = PRINTBUF; - if (c->recovery.curr_pass <= BCH_RECOVERY_PASS_check_allocations) + if (c->recovery.pass_done < BCH_RECOVERY_PASS_check_allocations) return; prt_printf(&buf, diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c index e5db374f001b..34018296053a 100644 --- a/fs/bcachefs/btree_io.c +++ b/fs/bcachefs/btree_io.c @@ -1775,8 +1775,7 @@ void bch2_btree_node_read(struct btree_trans *trans, struct btree *b, prt_newline(&buf); bch2_btree_lost_data(c, &buf, b->c.btree_id); - if (c->opts.recovery_passes & BIT_ULL(BCH_RECOVERY_PASS_check_topology) && - c->recovery.curr_pass > BCH_RECOVERY_PASS_check_topology && + if (c->recovery.passes_complete & BIT_ULL(BCH_RECOVERY_PASS_check_topology) && bch2_fs_emergency_read_only2(c, &buf)) ratelimit = false; diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index a658c97439ed..74e65714fecd 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -2363,7 +2363,7 @@ void bch2_btree_node_rewrite_async(struct bch_fs *c, struct btree *b) bool now = false, pending = false; spin_lock(&c->btree_node_rewrites_lock); - if (c->recovery.curr_pass > BCH_RECOVERY_PASS_journal_replay && + if (c->recovery.passes_complete & BIT_ULL(BCH_RECOVERY_PASS_journal_replay) && enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_node_rewrite)) { list_add(&a->list, &c->btree_node_rewrites); now = true; diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index 0e223d4ae2ec..2a7f418f3d87 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -3177,7 +3177,6 @@ static int bch2_fsck_online_thread_fn(struct thread_with_stdio *stdio) c->opts.fsck = true; set_bit(BCH_FS_in_fsck, &c->flags); - c->recovery.curr_pass = BCH_RECOVERY_PASS_check_alloc_info; int ret = bch2_run_online_recovery_passes(c, ~0ULL); clear_bit(BCH_FS_in_fsck, &c->flags); diff --git a/fs/bcachefs/snapshot.c b/fs/bcachefs/snapshot.c index c401d5285701..24903e7de296 100644 --- a/fs/bcachefs/snapshot.c +++ b/fs/bcachefs/snapshot.c @@ -348,7 +348,7 @@ static int __bch2_mark_snapshot(struct btree_trans *trans, if (BCH_SNAPSHOT_WILL_DELETE(s.v)) { set_bit(BCH_FS_need_delete_dead_snapshots, &c->flags); - if (c->recovery.curr_pass > BCH_RECOVERY_PASS_delete_dead_snapshots) + if (c->recovery.pass_done > BCH_RECOVERY_PASS_delete_dead_snapshots) bch2_delete_dead_snapshots_async(c); } } else { -- 2.51.0 From 06266465cc8a23ae037eb48ede9bdcd5eed8621c Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 10 May 2025 18:23:41 -0400 Subject: [PATCH 16/16] bcachefs: bch2_recovery_pass_status_to_text() Show recovery pass status in sysfs - important now that we're running them automatically in the background. Signed-off-by: Kent Overstreet --- fs/bcachefs/recovery_passes.c | 24 ++++++++++++++++++++++++ fs/bcachefs/recovery_passes.h | 2 ++ fs/bcachefs/sysfs.c | 6 ++++++ 3 files changed, 32 insertions(+) diff --git a/fs/bcachefs/recovery_passes.c b/fs/bcachefs/recovery_passes.c index e0e261aa752e..02639b3d86b0 100644 --- a/fs/bcachefs/recovery_passes.c +++ b/fs/bcachefs/recovery_passes.c @@ -444,6 +444,30 @@ int bch2_run_recovery_passes(struct bch_fs *c, enum bch_recovery_pass from) return ret; } +static void prt_passes(struct printbuf *out, const char *msg, u64 passes) +{ + prt_printf(out, "%s:\t", msg); + prt_bitflags(out, bch2_recovery_passes, passes); + prt_newline(out); +} + +void bch2_recovery_pass_status_to_text(struct printbuf *out, struct bch_fs *c) +{ + struct bch_fs_recovery *r = &c->recovery; + + printbuf_tabstop_push(out, 32); + prt_passes(out, "Scheduled passes", c->sb.recovery_passes_required); + prt_passes(out, "Scheduled online passes", c->sb.recovery_passes_required & + bch2_recovery_passes_match(PASS_ONLINE)); + prt_passes(out, "Complete passes", r->passes_complete); + prt_passes(out, "Failing passes", r->passes_failing); + + if (r->curr_pass) { + prt_printf(out, "Current pass:\t%s\n", bch2_recovery_passes[r->curr_pass]); + prt_passes(out, "Current passes", r->passes_to_run); + } +} + void bch2_fs_recovery_passes_init(struct bch_fs *c) { spin_lock_init(&c->recovery.lock); diff --git a/fs/bcachefs/recovery_passes.h b/fs/bcachefs/recovery_passes.h index 0e79cc33fd8f..8c90e29cd6cb 100644 --- a/fs/bcachefs/recovery_passes.h +++ b/fs/bcachefs/recovery_passes.h @@ -20,6 +20,8 @@ int bch2_run_explicit_recovery_pass_persistent(struct bch_fs *, struct printbuf int bch2_run_online_recovery_passes(struct bch_fs *, u64); int bch2_run_recovery_passes(struct bch_fs *, enum bch_recovery_pass); +void bch2_recovery_pass_status_to_text(struct printbuf *, struct bch_fs *); + void bch2_fs_recovery_passes_init(struct bch_fs *); #endif /* _BCACHEFS_RECOVERY_PASSES_H */ diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c index de7cda282a8c..1a55196d69f1 100644 --- a/fs/bcachefs/sysfs.c +++ b/fs/bcachefs/sysfs.c @@ -35,6 +35,7 @@ #include "nocow_locking.h" #include "opts.h" #include "rebalance.h" +#include "recovery_passes.h" #include "replicas.h" #include "super-io.h" #include "tests.h" @@ -200,6 +201,7 @@ read_attribute(copy_gc_wait); sysfs_pd_controller_attribute(rebalance); read_attribute(rebalance_status); read_attribute(snapshot_delete_status); +read_attribute(recovery_status); read_attribute(new_stripes); @@ -325,6 +327,9 @@ SHOW(bch2_fs) if (attr == &sysfs_snapshot_delete_status) bch2_snapshot_delete_status_to_text(out, c); + if (attr == &sysfs_recovery_status) + bch2_recovery_pass_status_to_text(out, c); + /* Debugging: */ if (attr == &sysfs_journal_debug) @@ -475,6 +480,7 @@ struct attribute *bch2_fs_files[] = { &sysfs_rebalance_status, &sysfs_snapshot_delete_status, + &sysfs_recovery_status, &sysfs_compression_stats, -- 2.51.0