From 0eaac0b44fa93a5bf8bd5d79c557ca5e04d2dc16 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 23 Apr 2024 02:18:18 -0400 Subject: [PATCH 01/16] bcachefs: btree_write_buffer_flush_seq() no longer closes journal Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_write_buffer.c | 19 ++++++++++++++----- fs/bcachefs/journal.c | 27 ++++++++++++++++++++------- fs/bcachefs/journal.h | 2 +- 3 files changed, 35 insertions(+), 13 deletions(-) diff --git a/fs/bcachefs/btree_write_buffer.c b/fs/bcachefs/btree_write_buffer.c index 1bd26221f156..49ce2d1e5c02 100644 --- a/fs/bcachefs/btree_write_buffer.c +++ b/fs/bcachefs/btree_write_buffer.c @@ -495,10 +495,6 @@ static int bch2_journal_keys_to_write_buffer(struct bch_fs *c, struct journal_bu entry->type = BCH_JSET_ENTRY_btree_keys; } - - spin_lock(&c->journal.lock); - buf->need_flush_to_write_buffer = false; - spin_unlock(&c->journal.lock); out: ret = bch2_journal_keys_to_write_buffer_end(c, &dst) ?: ret; return ret; @@ -508,11 +504,24 @@ static int fetch_wb_keys_from_journal(struct bch_fs *c, u64 max_seq) { struct journal *j = &c->journal; struct journal_buf *buf; + bool blocked; int ret = 0; - while (!ret && (buf = bch2_next_write_buffer_flush_journal_buf(j, max_seq))) { + while (!ret && (buf = bch2_next_write_buffer_flush_journal_buf(j, max_seq, &blocked))) { ret = bch2_journal_keys_to_write_buffer(c, buf); + + if (!blocked && !ret) { + spin_lock(&j->lock); + buf->need_flush_to_write_buffer = false; + spin_unlock(&j->lock); + } + mutex_unlock(&j->buf_lock); + + if (blocked) { + bch2_journal_unblock(j); + break; + } } return ret; diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c index bfbb1ac60c3d..699db0d0749a 100644 --- a/fs/bcachefs/journal.c +++ b/fs/bcachefs/journal.c @@ -908,6 +908,8 @@ static void __bch2_journal_block(struct journal *j) new.v = old.v; new.cur_entry_offset = JOURNAL_ENTRY_BLOCKED_VAL; } while (!atomic64_try_cmpxchg(&j->reservations.counter, &old.v, new.v)); + + journal_cur_buf(j)->data->u64s = cpu_to_le32(old.cur_entry_offset); } } @@ -920,7 +922,8 @@ void bch2_journal_block(struct journal *j) journal_quiesce(j); } -static struct journal_buf *__bch2_next_write_buffer_flush_journal_buf(struct journal *j, u64 max_seq) +static struct journal_buf *__bch2_next_write_buffer_flush_journal_buf(struct journal *j, + u64 max_seq, bool *blocked) { struct journal_buf *ret = NULL; @@ -937,13 +940,17 @@ static struct journal_buf *__bch2_next_write_buffer_flush_journal_buf(struct jou struct journal_buf *buf = j->buf + idx; if (buf->need_flush_to_write_buffer) { - if (seq == journal_cur_seq(j)) - __journal_entry_close(j, JOURNAL_ENTRY_CLOSED_VAL, true); - union journal_res_state s; s.v = atomic64_read_acquire(&j->reservations.counter); - ret = journal_state_count(s, idx) + unsigned open = seq == journal_cur_seq(j) && __journal_entry_is_open(s); + + if (open && !*blocked) { + __bch2_journal_block(j); + *blocked = true; + } + + ret = journal_state_count(s, idx) > open ? 
ERR_PTR(-EAGAIN) : buf; break; @@ -956,11 +963,17 @@ static struct journal_buf *__bch2_next_write_buffer_flush_journal_buf(struct jou return ret; } -struct journal_buf *bch2_next_write_buffer_flush_journal_buf(struct journal *j, u64 max_seq) +struct journal_buf *bch2_next_write_buffer_flush_journal_buf(struct journal *j, + u64 max_seq, bool *blocked) { struct journal_buf *ret; + *blocked = false; + + wait_event(j->wait, (ret = __bch2_next_write_buffer_flush_journal_buf(j, + max_seq, blocked)) != ERR_PTR(-EAGAIN)); + if (IS_ERR_OR_NULL(ret) && *blocked) + bch2_journal_unblock(j); - wait_event(j->wait, (ret = __bch2_next_write_buffer_flush_journal_buf(j, max_seq)) != ERR_PTR(-EAGAIN)); return ret; } diff --git a/fs/bcachefs/journal.h b/fs/bcachefs/journal.h index 6d3c839bbbef..71a50846967f 100644 --- a/fs/bcachefs/journal.h +++ b/fs/bcachefs/journal.h @@ -425,7 +425,7 @@ static inline void bch2_journal_set_replay_done(struct journal *j) void bch2_journal_unblock(struct journal *); void bch2_journal_block(struct journal *); -struct journal_buf *bch2_next_write_buffer_flush_journal_buf(struct journal *j, u64 max_seq); +struct journal_buf *bch2_next_write_buffer_flush_journal_buf(struct journal *, u64, bool *); void __bch2_journal_debug_to_text(struct printbuf *, struct journal *); void bch2_journal_debug_to_text(struct printbuf *, struct journal *); -- 2.51.0 From 375d21b76d9a20ff4bddf81aadadd6e57f38dee8 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 23 Nov 2024 22:12:58 -0500 Subject: [PATCH 02/16] bcachefs: BCH_ERR_btree_node_read_error_cached Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_cache.c | 6 +++--- fs/bcachefs/errcode.h | 1 + 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c index a0a406b0c7bc..36dfa6a48aa6 100644 --- a/fs/bcachefs/btree_cache.c +++ b/fs/bcachefs/btree_cache.c @@ -1131,7 +1131,7 @@ retry: if (unlikely(btree_node_read_error(b))) { six_unlock_type(&b->c.lock, lock_type); - return ERR_PTR(-BCH_ERR_btree_node_read_error); + return ERR_PTR(-BCH_ERR_btree_node_read_err_cached); } EBUG_ON(b->c.btree_id != path->btree_id); @@ -1221,7 +1221,7 @@ struct btree *bch2_btree_node_get(struct btree_trans *trans, struct btree_path * if (unlikely(btree_node_read_error(b))) { six_unlock_type(&b->c.lock, lock_type); - return ERR_PTR(-BCH_ERR_btree_node_read_error); + return ERR_PTR(-BCH_ERR_btree_node_read_err_cached); } EBUG_ON(b->c.btree_id != path->btree_id); @@ -1303,7 +1303,7 @@ lock_node: if (unlikely(btree_node_read_error(b))) { six_unlock_read(&b->c.lock); - b = ERR_PTR(-BCH_ERR_btree_node_read_error); + b = ERR_PTR(-BCH_ERR_btree_node_read_err_cached); goto out; } diff --git a/fs/bcachefs/errcode.h b/fs/bcachefs/errcode.h index 2dda7f962e5b..131b9bef21a0 100644 --- a/fs/bcachefs/errcode.h +++ b/fs/bcachefs/errcode.h @@ -242,6 +242,7 @@ x(BCH_ERR_invalid, invalid_bkey) \ x(BCH_ERR_operation_blocked, nocow_lock_blocked) \ x(EIO, btree_node_read_err) \ + x(BCH_ERR_btree_node_read_err, btree_node_read_err_cached) \ x(EIO, sb_not_downgraded) \ x(EIO, btree_node_write_all_failed) \ x(EIO, btree_node_read_error) \ -- 2.51.0 From 525be09e63cc9ff4bee0b0f480551a4dc2e7b276 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 24 Nov 2024 20:15:30 -0500 Subject: [PATCH 03/16] bcachefs: Use separate rhltable for bch2_inode_or_descendents_is_open() Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs.h | 1 + fs/bcachefs/fs.c | 39 ++++++++++++++++++++++++++++++--------- fs/bcachefs/fs.h | 1 + 3 files 
changed, 32 insertions(+), 9 deletions(-) diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index 11f9ed42a9da..f1d8c821d27a 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -1020,6 +1020,7 @@ struct bch_fs { struct list_head vfs_inodes_list; struct mutex vfs_inodes_lock; struct rhashtable vfs_inodes_table; + struct rhltable vfs_inodes_by_inum_table; /* VFS IO PATH - fs-io.c */ struct bio_set writepage_bioset; diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c index 50d323fca001..c6e7df7c67fa 100644 --- a/fs/bcachefs/fs.c +++ b/fs/bcachefs/fs.c @@ -39,6 +39,7 @@ #include #include #include +#include #include #include #include @@ -176,8 +177,9 @@ static bool subvol_inum_eq(subvol_inum a, subvol_inum b) static u32 bch2_vfs_inode_hash_fn(const void *data, u32 len, u32 seed) { const subvol_inum *inum = data; + siphash_key_t k = { .key[0] = seed }; - return jhash(&inum->inum, sizeof(inum->inum), seed); + return siphash_2u64(inum->subvol, inum->inum, &k); } static u32 bch2_vfs_inode_obj_hash_fn(const void *data, u32 len, u32 seed) @@ -206,11 +208,18 @@ static const struct rhashtable_params bch2_vfs_inodes_params = { .automatic_shrinking = true, }; +static const struct rhashtable_params bch2_vfs_inodes_by_inum_params = { + .head_offset = offsetof(struct bch_inode_info, by_inum_hash), + .key_offset = offsetof(struct bch_inode_info, ei_inum.inum), + .key_len = sizeof(u64), + .automatic_shrinking = true, +}; + int bch2_inode_or_descendents_is_open(struct btree_trans *trans, struct bpos p) { struct bch_fs *c = trans->c; - struct rhashtable *ht = &c->vfs_inodes_table; - subvol_inum inum = (subvol_inum) { .inum = p.offset }; + struct rhltable *ht = &c->vfs_inodes_by_inum_table; + u64 inum = p.offset; DARRAY(u32) subvols; int ret = 0; @@ -235,15 +244,15 @@ restart_from_top: struct rhash_lock_head __rcu *const *bkt; struct rhash_head *he; unsigned int hash; - struct bucket_table *tbl = rht_dereference_rcu(ht->tbl, ht); + struct bucket_table *tbl = rht_dereference_rcu(ht->ht.tbl, &ht->ht); restart: - hash = rht_key_hashfn(ht, tbl, &inum, bch2_vfs_inodes_params); + hash = rht_key_hashfn(&ht->ht, tbl, &inum, bch2_vfs_inodes_by_inum_params); bkt = rht_bucket(tbl, hash); do { struct bch_inode_info *inode; rht_for_each_entry_rcu_from(inode, he, rht_ptr_rcu(bkt), tbl, hash, hash) { - if (inode->ei_inum.inum == inum.inum) { + if (inode->ei_inum.inum == inum) { ret = darray_push_gfp(&subvols, inode->ei_inum.subvol, GFP_NOWAIT|__GFP_NOWARN); if (ret) { @@ -264,7 +273,7 @@ restart: /* Ensure we see any new tables. 
*/ smp_rmb(); - tbl = rht_dereference_rcu(tbl->future_tbl, ht); + tbl = rht_dereference_rcu(tbl->future_tbl, &ht->ht); if (unlikely(tbl)) goto restart; rcu_read_unlock(); @@ -343,7 +352,11 @@ static void bch2_inode_hash_remove(struct bch_fs *c, struct bch_inode_info *inod spin_unlock(&inode->v.i_lock); if (remove) { - int ret = rhashtable_remove_fast(&c->vfs_inodes_table, + int ret = rhltable_remove(&c->vfs_inodes_by_inum_table, + &inode->by_inum_hash, bch2_vfs_inodes_by_inum_params); + BUG_ON(ret); + + ret = rhashtable_remove_fast(&c->vfs_inodes_table, &inode->hash, bch2_vfs_inodes_params); BUG_ON(ret); inode->v.i_hash.pprev = NULL; @@ -388,6 +401,11 @@ retry: discard_new_inode(&inode->v); return old; } else { + int ret = rhltable_insert(&c->vfs_inodes_by_inum_table, + &inode->by_inum_hash, + bch2_vfs_inodes_by_inum_params); + BUG_ON(ret); + inode_fake_hash(&inode->v); inode_sb_list_add(&inode->v); @@ -2359,13 +2377,16 @@ static int bch2_init_fs_context(struct fs_context *fc) void bch2_fs_vfs_exit(struct bch_fs *c) { + if (c->vfs_inodes_by_inum_table.ht.tbl) + rhltable_destroy(&c->vfs_inodes_by_inum_table); if (c->vfs_inodes_table.tbl) rhashtable_destroy(&c->vfs_inodes_table); } int bch2_fs_vfs_init(struct bch_fs *c) { - return rhashtable_init(&c->vfs_inodes_table, &bch2_vfs_inodes_params); + return rhashtable_init(&c->vfs_inodes_table, &bch2_vfs_inodes_params) ?: + rhltable_init(&c->vfs_inodes_by_inum_table, &bch2_vfs_inodes_by_inum_params); } static struct file_system_type bcache_fs_type = { diff --git a/fs/bcachefs/fs.h b/fs/bcachefs/fs.h index 59f9f7ae728d..dd2198541455 100644 --- a/fs/bcachefs/fs.h +++ b/fs/bcachefs/fs.h @@ -14,6 +14,7 @@ struct bch_inode_info { struct inode v; struct rhash_head hash; + struct rhlist_head by_inum_hash; subvol_inum ei_inum; struct list_head ei_vfs_inode_list; -- 2.51.0 From 6534a404d4924820bd1c06fb4412dd4444234f79 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 24 Nov 2024 21:49:08 -0500 Subject: [PATCH 04/16] bcachefs: errcode cleanup: journal errors Instead of throwing standard error codes, we should be throwing dedicated private error codes; this greatly improves debuggability.
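As a rough sketch of the pattern (illustrative only -- the names, values and helper below are made up, not the actual bcachefs errcode machinery), private error codes live in their own numeric space and each one remembers which standard errno it should decay to at the boundary to userspace:

  /* Illustrative private error-code space layered on top of errno. */
  #include <errno.h>

  #define PRIVATE_ERR_START	2048	/* well above any standard errno */

  enum {
  	ERR_journal_shutdown = PRIVATE_ERR_START,	/* decays to EIO */
  	ERR_journal_flush_err,				/* decays to EIO */
  };

  /* Map a private code back to the errno userspace expects. */
  static inline int err_class(int err)
  {
  	switch (-err) {
  	case ERR_journal_shutdown:
  	case ERR_journal_flush_err:
  		return -EIO;
  	default:
  		return err;
  	}
  }

Internal callers can then test for and log the specific failure, e.g. ret == -ERR_journal_flush_err, while the value handed back to userspace is still a plain -EIO.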
Signed-off-by: Kent Overstreet --- fs/bcachefs/errcode.h | 2 ++ fs/bcachefs/journal.c | 4 ++-- fs/bcachefs/journal.h | 2 +- 3 files changed, 5 insertions(+), 3 deletions(-) diff --git a/fs/bcachefs/errcode.h b/fs/bcachefs/errcode.h index 131b9bef21a0..c989ce4f715f 100644 --- a/fs/bcachefs/errcode.h +++ b/fs/bcachefs/errcode.h @@ -241,6 +241,8 @@ x(BCH_ERR_invalid_sb, invalid_sb_downgrade) \ x(BCH_ERR_invalid, invalid_bkey) \ x(BCH_ERR_operation_blocked, nocow_lock_blocked) \ + x(EIO, journal_shutdown) \ + x(EIO, journal_flush_err) \ x(EIO, btree_node_read_err) \ x(BCH_ERR_btree_node_read_err, btree_node_read_err_cached) \ x(EIO, sb_not_downgraded) \ diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c index 699db0d0749a..bbdd0b17ae69 100644 --- a/fs/bcachefs/journal.c +++ b/fs/bcachefs/journal.c @@ -673,7 +673,7 @@ out: * @seq: seq to flush * @parent: closure object to wait with * Returns: 1 if @seq has already been flushed, 0 if @seq is being flushed, - * -EIO if @seq will never be flushed + * -BCH_ERR_journal_flush_err if @seq will never be flushed * * Like bch2_journal_wait_on_seq, except that it triggers a write immediately if * necessary @@ -696,7 +696,7 @@ int bch2_journal_flush_seq_async(struct journal *j, u64 seq, /* Recheck under lock: */ if (j->err_seq && seq >= j->err_seq) { - ret = -EIO; + ret = -BCH_ERR_journal_flush_err; goto out; } diff --git a/fs/bcachefs/journal.h b/fs/bcachefs/journal.h index 71a50846967f..a6a2e888c59b 100644 --- a/fs/bcachefs/journal.h +++ b/fs/bcachefs/journal.h @@ -412,7 +412,7 @@ void bch2_journal_halt(struct journal *); static inline int bch2_journal_error(struct journal *j) { return j->reservations.cur_entry_offset == JOURNAL_ENTRY_ERROR_VAL - ? -EIO : 0; + ? -BCH_ERR_journal_shutdown : 0; } struct bch_dev; -- 2.51.0 From a7ecd5f2ccabb97174f8264b965c9597ea854e8f Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 24 Nov 2024 22:23:41 -0500 Subject: [PATCH 05/16] bcachefs: disk_accounting: bch2_dev_rcu -> bch2_dev_rcu_noerror Accounting keys that reference invalid devices are corrected by fsck, they shouldn't cause an emergency shutdown. 
Signed-off-by: Kent Overstreet --- fs/bcachefs/disk_accounting.c | 4 ++-- fs/bcachefs/disk_accounting.h | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/fs/bcachefs/disk_accounting.c b/fs/bcachefs/disk_accounting.c index 55a00018dc8b..fa821d278c45 100644 --- a/fs/bcachefs/disk_accounting.c +++ b/fs/bcachefs/disk_accounting.c @@ -805,7 +805,7 @@ int bch2_accounting_read(struct bch_fs *c) break; case BCH_DISK_ACCOUNTING_dev_data_type: rcu_read_lock(); - struct bch_dev *ca = bch2_dev_rcu(c, k.dev_data_type.dev); + struct bch_dev *ca = bch2_dev_rcu_noerror(c, k.dev_data_type.dev); if (ca) { struct bch_dev_usage_type __percpu *d = &ca->usage->d[k.dev_data_type.data_type]; percpu_u64_set(&d->buckets, v[0]); @@ -911,7 +911,7 @@ void bch2_verify_accounting_clean(struct bch_fs *c) break; case BCH_DISK_ACCOUNTING_dev_data_type: { rcu_read_lock(); - struct bch_dev *ca = bch2_dev_rcu(c, acc_k.dev_data_type.dev); + struct bch_dev *ca = bch2_dev_rcu_noerror(c, acc_k.dev_data_type.dev); if (!ca) { rcu_read_unlock(); continue; diff --git a/fs/bcachefs/disk_accounting.h b/fs/bcachefs/disk_accounting.h index 6639535dc91c..8b2b2f83e6a4 100644 --- a/fs/bcachefs/disk_accounting.h +++ b/fs/bcachefs/disk_accounting.h @@ -142,7 +142,7 @@ static inline int bch2_accounting_mem_mod_locked(struct btree_trans *trans, break; case BCH_DISK_ACCOUNTING_dev_data_type: rcu_read_lock(); - struct bch_dev *ca = bch2_dev_rcu(c, acc_k.dev_data_type.dev); + struct bch_dev *ca = bch2_dev_rcu_noerror(c, acc_k.dev_data_type.dev); if (ca) { this_cpu_add(ca->usage->d[acc_k.dev_data_type.data_type].buckets, a.v->d[0]); this_cpu_add(ca->usage->d[acc_k.dev_data_type.data_type].sectors, a.v->d[1]); -- 2.51.0 From b71d89bd7b1fbe0a4569d072a0069110f60f9ec9 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 24 Nov 2024 22:28:41 -0500 Subject: [PATCH 06/16] bcachefs: Fix accounting_read when we rewind If we rewind recovery to run topology repair, that causes accounting_read to run twice. This fixes accounting being double counted. Signed-off-by: Kent Overstreet --- fs/bcachefs/disk_accounting.c | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/fs/bcachefs/disk_accounting.c b/fs/bcachefs/disk_accounting.c index fa821d278c45..bb5dbbf71d04 100644 --- a/fs/bcachefs/disk_accounting.c +++ b/fs/bcachefs/disk_accounting.c @@ -700,6 +700,21 @@ int bch2_accounting_read(struct bch_fs *c) struct btree_trans *trans = bch2_trans_get(c); struct printbuf buf = PRINTBUF; + /* + * We might run more than once if we rewind to start topology repair or + * btree node scan - and those might cause us to get different results, + * so we can't just skip if we've already run. 
+ * + * Instead, zero out any accounting we have: + */ + percpu_down_write(&c->mark_lock); + darray_for_each(acc->k, e) + percpu_memset(e->v[0], 0, sizeof(u64) * e->nr_counters); + for_each_member_device(c, ca) + percpu_memset(ca->usage, 0, sizeof(*ca->usage)); + percpu_memset(c->usage, 0, sizeof(*c->usage)); + percpu_up_write(&c->mark_lock); + int ret = for_each_btree_key(trans, iter, BTREE_ID_accounting, POS_MIN, BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k, ({ -- 2.51.0 From 427db7ffe9a91bcad00a7bac4edcd74d75007a11 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 24 Nov 2024 22:45:25 -0500 Subject: [PATCH 07/16] bcachefs: backpointer_to_missing_ptr is now autofix Signed-off-by: Kent Overstreet --- fs/bcachefs/sb-errors_format.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/bcachefs/sb-errors_format.h b/fs/bcachefs/sb-errors_format.h index 9e3425f533bc..d45d0789f1b1 100644 --- a/fs/bcachefs/sb-errors_format.h +++ b/fs/bcachefs/sb-errors_format.h @@ -141,7 +141,7 @@ enum bch_fsck_flags { x(backpointer_dev_bad, 297, 0) \ x(backpointer_to_missing_device, 126, 0) \ x(backpointer_to_missing_alloc, 127, 0) \ - x(backpointer_to_missing_ptr, 128, 0) \ + x(backpointer_to_missing_ptr, 128, FSCK_AUTOFIX) \ x(lru_entry_at_time_0, 129, FSCK_AUTOFIX) \ x(lru_entry_to_invalid_bucket, 130, FSCK_AUTOFIX) \ x(lru_entry_bad, 131, FSCK_AUTOFIX) \ -- 2.51.0 From abf23afa36eb425fb75d47c26fc665dbab2a9ae1 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 24 Nov 2024 22:57:01 -0500 Subject: [PATCH 08/16] bcachefs: Fix btree node scan when unknown btree IDs are present btree_root entries for unknown btree IDs are created during recovery, before reading those btree roots. But btree_node_scan may find btree nodes with unknown btree IDs when we haven't seen roots for those btrees. 
Reported-by: syzbot+1f202d4da221ec6ebf8e@syzkaller.appspotmail.com Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_cache.c | 11 ++++++++--- fs/bcachefs/btree_cache.h | 9 +++++++-- 2 files changed, 15 insertions(+), 5 deletions(-) diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c index 36dfa6a48aa6..1f06e24e53fc 100644 --- a/fs/bcachefs/btree_cache.c +++ b/fs/bcachefs/btree_cache.c @@ -1406,9 +1406,14 @@ void bch2_btree_id_level_to_text(struct printbuf *out, enum btree_id btree, unsi void bch2_btree_pos_to_text(struct printbuf *out, struct bch_fs *c, const struct btree *b) { bch2_btree_id_to_text(out, b->c.btree_id); - prt_printf(out, " level %u/%u\n ", - b->c.level, - bch2_btree_id_root(c, b->c.btree_id)->level); + prt_printf(out, " level %u/", b->c.level); + struct btree_root *r = bch2_btree_id_root(c, b->c.btree_id); + if (r) + prt_printf(out, "%u", r->level); + else + prt_printf(out, "(unknown)"); + prt_printf(out, "\n "); + bch2_bkey_val_to_text(out, c, bkey_i_to_s_c(&b->key)); } diff --git a/fs/bcachefs/btree_cache.h b/fs/bcachefs/btree_cache.h index 6cfacacb6769..dcc34fe4996d 100644 --- a/fs/bcachefs/btree_cache.h +++ b/fs/bcachefs/btree_cache.h @@ -128,14 +128,19 @@ static inline struct btree_root *bch2_btree_id_root(struct bch_fs *c, unsigned i } else { unsigned idx = id - BTREE_ID_NR; - EBUG_ON(idx >= c->btree_roots_extra.nr); + /* This can happen when we're called from btree_node_scan */ + if (idx >= c->btree_roots_extra.nr) + return NULL; + return &c->btree_roots_extra.data[idx]; } } static inline struct btree *btree_node_root(struct bch_fs *c, struct btree *b) { - return bch2_btree_id_root(c, b->c.btree_id)->b; + struct btree_root *r = bch2_btree_id_root(c, b->c.btree_id); + + return r ? r->b : NULL; } const char *bch2_btree_id_str(enum btree_id); /* avoid */ -- 2.51.0 From 828552ca74a45877dbf139b34c47d0f600a1e852 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 24 Nov 2024 23:28:21 -0500 Subject: [PATCH 09/16] bcachefs: Kill bch2_bucket_alloc_new_fs() The early-early allocation path, bch2_bucket_alloc_new_fs(), is no longer needed - and inconsistencies around new_fs_bucket_idx have been a frequent source of bugs. 
Reported-by: syzbot+592425844580a6598410@syzkaller.appspotmail.com Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_foreground.c | 40 ++++++++++++++-------------------- fs/bcachefs/alloc_foreground.h | 2 -- fs/bcachefs/bcachefs.h | 1 - fs/bcachefs/buckets.c | 25 +++++++++++++++++++++ fs/bcachefs/buckets.h | 21 +----------------- fs/bcachefs/journal.c | 34 +++++++++++++---------------- fs/bcachefs/journal_reclaim.c | 3 +++ fs/bcachefs/recovery.c | 5 +---- fs/bcachefs/super.c | 12 +++++----- 9 files changed, 66 insertions(+), 77 deletions(-) diff --git a/fs/bcachefs/alloc_foreground.c b/fs/bcachefs/alloc_foreground.c index 6d665b720f72..4d1ff7f1f302 100644 --- a/fs/bcachefs/alloc_foreground.c +++ b/fs/bcachefs/alloc_foreground.c @@ -156,6 +156,14 @@ static struct open_bucket *bch2_open_bucket_alloc(struct bch_fs *c) return ob; } +static inline bool is_superblock_bucket(struct bch_fs *c, struct bch_dev *ca, u64 b) +{ + if (c->curr_recovery_pass > BCH_RECOVERY_PASS_trans_mark_dev_sbs) + return false; + + return bch2_is_superblock_bucket(ca, b); +} + static void open_bucket_free_unused(struct bch_fs *c, struct open_bucket *ob) { BUG_ON(c->open_buckets_partial_nr >= @@ -175,20 +183,6 @@ static void open_bucket_free_unused(struct bch_fs *c, struct open_bucket *ob) closure_wake_up(&c->freelist_wait); } -/* _only_ for allocating the journal on a new device: */ -long bch2_bucket_alloc_new_fs(struct bch_dev *ca) -{ - while (ca->new_fs_bucket_idx < ca->mi.nbuckets) { - u64 b = ca->new_fs_bucket_idx++; - - if (!is_superblock_bucket(ca, b) && - (!ca->buckets_nouse || !test_bit(b, ca->buckets_nouse))) - return b; - } - - return -1; -} - static inline unsigned open_buckets_reserved(enum bch_watermark watermark) { switch (watermark) { @@ -214,6 +208,9 @@ static struct open_bucket *__try_alloc_bucket(struct bch_fs *c, struct bch_dev * { struct open_bucket *ob; + if (unlikely(is_superblock_bucket(c, ca, bucket))) + return NULL; + if (unlikely(ca->buckets_nouse && test_bit(bucket, ca->buckets_nouse))) { s->skipped_nouse++; return NULL; @@ -295,9 +292,6 @@ static struct open_bucket *try_alloc_bucket(struct btree_trans *trans, struct bc /* * This path is for before the freespace btree is initialized: - * - * If ca->new_fs_bucket_idx is nonzero, we haven't yet marked superblock & - * journal buckets - journal buckets will be < ca->new_fs_bucket_idx */ static noinline struct open_bucket * bch2_bucket_alloc_early(struct btree_trans *trans, @@ -309,7 +303,7 @@ bch2_bucket_alloc_early(struct btree_trans *trans, struct btree_iter iter, citer; struct bkey_s_c k, ck; struct open_bucket *ob = NULL; - u64 first_bucket = max_t(u64, ca->mi.first_bucket, ca->new_fs_bucket_idx); + u64 first_bucket = ca->mi.first_bucket; u64 *dev_alloc_cursor = &ca->alloc_cursor[s->btree_bitmap]; u64 alloc_start = max(first_bucket, *dev_alloc_cursor); u64 alloc_cursor = alloc_start; @@ -332,10 +326,6 @@ again: if (bkey_ge(k.k->p, POS(ca->dev_idx, ca->mi.nbuckets))) break; - if (ca->new_fs_bucket_idx && - is_superblock_bucket(ca, k.k->p.offset)) - continue; - if (s->btree_bitmap != BTREE_BITMAP_ANY && s->btree_bitmap != bch2_dev_btree_bitmap_marked_sectors(ca, bucket_to_sector(ca, bucket), ca->mi.bucket_size)) { @@ -406,8 +396,6 @@ static struct open_bucket *bch2_bucket_alloc_freelist(struct btree_trans *trans, u64 alloc_start = max_t(u64, ca->mi.first_bucket, READ_ONCE(*dev_alloc_cursor)); u64 alloc_cursor = alloc_start; int ret; - - BUG_ON(ca->new_fs_bucket_idx); again: for_each_btree_key_max_norestart(trans, iter, BTREE_ID_freespace, 
POS(ca->dev_idx, alloc_cursor), @@ -551,6 +539,10 @@ again: bch2_dev_do_invalidates(ca); if (!avail) { + if (watermark > BCH_WATERMARK_normal && + c->curr_recovery_pass <= BCH_RECOVERY_PASS_check_allocations) + goto alloc; + if (cl && !waiting) { closure_wait(&c->freelist_wait, cl); waiting = true; diff --git a/fs/bcachefs/alloc_foreground.h b/fs/bcachefs/alloc_foreground.h index 1a16fd5bd4f8..4f87745df97e 100644 --- a/fs/bcachefs/alloc_foreground.h +++ b/fs/bcachefs/alloc_foreground.h @@ -28,8 +28,6 @@ struct dev_alloc_list bch2_dev_alloc_list(struct bch_fs *, struct bch_devs_mask *); void bch2_dev_stripe_increment(struct bch_dev *, struct dev_stripe_state *); -long bch2_bucket_alloc_new_fs(struct bch_dev *); - static inline struct bch_dev *ob_dev(struct bch_fs *c, struct open_bucket *ob) { return bch2_dev_have_ref(c, ob->dev); diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index f1d8c821d27a..a85b3bcc6383 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -560,7 +560,6 @@ struct bch_dev { struct bch_dev_usage __percpu *usage; /* Allocator: */ - u64 new_fs_bucket_idx; u64 alloc_cursor[3]; unsigned nr_open_buckets; diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index 1547141ba2a0..afd35c93fcfb 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -1161,6 +1161,31 @@ int bch2_trans_mark_dev_sbs(struct bch_fs *c) return bch2_trans_mark_dev_sbs_flags(c, BTREE_TRIGGER_transactional); } +bool bch2_is_superblock_bucket(struct bch_dev *ca, u64 b) +{ + struct bch_sb_layout *layout = &ca->disk_sb.sb->layout; + u64 b_offset = bucket_to_sector(ca, b); + u64 b_end = bucket_to_sector(ca, b + 1); + unsigned i; + + if (!b) + return true; + + for (i = 0; i < layout->nr_superblocks; i++) { + u64 offset = le64_to_cpu(layout->sb_offset[i]); + u64 end = offset + (1 << layout->sb_max_size_bits); + + if (!(offset >= b_end || end <= b_offset)) + return true; + } + + for (i = 0; i < ca->journal.nr; i++) + if (b == ca->journal.buckets[i]) + return true; + + return false; +} + /* Disk reservations: */ #define SECTORS_CACHE 1024 diff --git a/fs/bcachefs/buckets.h b/fs/bcachefs/buckets.h index ccc78bfe2fd4..3bebc4c3044f 100644 --- a/fs/bcachefs/buckets.h +++ b/fs/bcachefs/buckets.h @@ -308,26 +308,7 @@ int bch2_trans_mark_dev_sbs_flags(struct bch_fs *, enum btree_iter_update_trigger_flags); int bch2_trans_mark_dev_sbs(struct bch_fs *); -static inline bool is_superblock_bucket(struct bch_dev *ca, u64 b) -{ - struct bch_sb_layout *layout = &ca->disk_sb.sb->layout; - u64 b_offset = bucket_to_sector(ca, b); - u64 b_end = bucket_to_sector(ca, b + 1); - unsigned i; - - if (!b) - return true; - - for (i = 0; i < layout->nr_superblocks; i++) { - u64 offset = le64_to_cpu(layout->sb_offset[i]); - u64 end = offset + (1 << layout->sb_max_size_bits); - - if (!(offset >= b_end || end <= b_offset)) - return true; - } - - return false; -} +bool bch2_is_superblock_bucket(struct bch_dev *, u64); static inline const char *bch2_data_type_str(enum bch_data_type type) { diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c index bbdd0b17ae69..95cccda3b22c 100644 --- a/fs/bcachefs/journal.c +++ b/fs/bcachefs/journal.c @@ -1002,19 +1002,17 @@ static int __bch2_set_nr_journal_buckets(struct bch_dev *ca, unsigned nr, } for (nr_got = 0; nr_got < nr_want; nr_got++) { - if (new_fs) { - bu[nr_got] = bch2_bucket_alloc_new_fs(ca); - if (bu[nr_got] < 0) { - ret = -BCH_ERR_ENOSPC_bucket_alloc; - break; - } - } else { - ob[nr_got] = bch2_bucket_alloc(c, ca, BCH_WATERMARK_normal, - 
BCH_DATA_journal, cl); - ret = PTR_ERR_OR_ZERO(ob[nr_got]); - if (ret) - break; + enum bch_watermark watermark = new_fs + ? BCH_WATERMARK_btree + : BCH_WATERMARK_normal; + ob[nr_got] = bch2_bucket_alloc(c, ca, watermark, + BCH_DATA_journal, cl); + ret = PTR_ERR_OR_ZERO(ob[nr_got]); + if (ret) + break; + + if (!new_fs) { ret = bch2_trans_run(c, bch2_trans_mark_metadata_bucket(trans, ca, ob[nr_got]->bucket, BCH_DATA_journal, @@ -1024,9 +1022,9 @@ static int __bch2_set_nr_journal_buckets(struct bch_dev *ca, unsigned nr, bch_err_msg(c, ret, "marking new journal buckets"); break; } - - bu[nr_got] = ob[nr_got]->bucket; } + + bu[nr_got] = ob[nr_got]->bucket; } if (!nr_got) @@ -1066,8 +1064,7 @@ static int __bch2_set_nr_journal_buckets(struct bch_dev *ca, unsigned nr, if (ret) goto err_unblock; - if (!new_fs) - bch2_write_super(c); + bch2_write_super(c); /* Commit: */ if (c) @@ -1101,9 +1098,8 @@ err_unblock: bu[i], BCH_DATA_free, 0, BTREE_TRIGGER_transactional)); err_free: - if (!new_fs) - for (i = 0; i < nr_got; i++) - bch2_open_bucket_put(c, ob[i]); + for (i = 0; i < nr_got; i++) + bch2_open_bucket_put(c, ob[i]); kfree(new_bucket_seq); kfree(new_buckets); diff --git a/fs/bcachefs/journal_reclaim.c b/fs/bcachefs/journal_reclaim.c index 3d8fc2642425..1aabbbe328d9 100644 --- a/fs/bcachefs/journal_reclaim.c +++ b/fs/bcachefs/journal_reclaim.c @@ -38,6 +38,9 @@ unsigned bch2_journal_dev_buckets_available(struct journal *j, struct journal_device *ja, enum journal_space_from from) { + if (!ja->nr) + return 0; + unsigned available = (journal_space_from(ja, from) - ja->cur_idx - 1 + ja->nr) % ja->nr; diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index 7086a7226989..547c78a323f7 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -1070,7 +1070,6 @@ int bch2_fs_initialize(struct bch_fs *c) bch2_write_super(c); mutex_unlock(&c->sb_lock); - c->curr_recovery_pass = BCH_RECOVERY_PASS_NR; set_bit(BCH_FS_btree_running, &c->flags); set_bit(BCH_FS_may_go_rw, &c->flags); @@ -1111,9 +1110,6 @@ int bch2_fs_initialize(struct bch_fs *c) if (ret) goto err; - for_each_online_member(c, ca) - ca->new_fs_bucket_idx = 0; - ret = bch2_fs_freespace_init(c); if (ret) goto err; @@ -1172,6 +1168,7 @@ int bch2_fs_initialize(struct bch_fs *c) bch2_write_super(c); mutex_unlock(&c->sb_lock); + c->curr_recovery_pass = BCH_RECOVERY_PASS_NR; return 0; err: bch_err_fn(c, ret); diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index 08170a3d524f..14157820705d 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -1750,11 +1750,6 @@ int bch2_dev_add(struct bch_fs *c, const char *path) if (ret) goto err; - ret = bch2_dev_journal_alloc(ca, true); - bch_err_msg(c, ret, "allocating journal"); - if (ret) - goto err; - down_write(&c->state_lock); mutex_lock(&c->sb_lock); @@ -1805,11 +1800,14 @@ int bch2_dev_add(struct bch_fs *c, const char *path) if (ret) goto err_late; - ca->new_fs_bucket_idx = 0; - if (ca->mi.state == BCH_MEMBER_STATE_rw) __bch2_dev_read_write(c, ca); + ret = bch2_dev_journal_alloc(ca, false); + bch_err_msg(c, ret, "allocating journal"); + if (ret) + goto err_late; + up_write(&c->state_lock); return 0; -- 2.51.0 From 14152654805256d760315ec24e414363bfa19a06 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 25 Nov 2024 00:21:27 -0500 Subject: [PATCH 10/16] bcachefs: Bad btree roots are now autofix Signed-off-by: Kent Overstreet --- fs/bcachefs/sb-errors_format.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/bcachefs/sb-errors_format.h 
b/fs/bcachefs/sb-errors_format.h index d45d0789f1b1..89d9dc2c859b 100644 --- a/fs/bcachefs/sb-errors_format.h +++ b/fs/bcachefs/sb-errors_format.h @@ -68,8 +68,8 @@ enum bch_fsck_flags { x(btree_node_bkey_bad_format, 55, 0) \ x(btree_node_bad_bkey, 56, 0) \ x(btree_node_bkey_out_of_order, 57, 0) \ - x(btree_root_bkey_invalid, 58, 0) \ - x(btree_root_read_error, 59, 0) \ + x(btree_root_bkey_invalid, 58, FSCK_AUTOFIX) \ + x(btree_root_read_error, 59, FSCK_AUTOFIX) \ x(btree_root_bad_min_key, 60, 0) \ x(btree_root_bad_max_key, 61, 0) \ x(btree_node_read_error, 62, 0) \ -- 2.51.0 From 75eabea6988e4ef587c8d90425918fdc58a8f1c9 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 25 Nov 2024 01:26:56 -0500 Subject: [PATCH 11/16] bcachefs: Fix dup/misordered check in btree node read We were checking for out of order keys, but not duplicate keys. Reported-by: syzbot+dedbd67513939979f84f@syzkaller.appspotmail.com Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_io.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c index 89a42ee81e5c..2b5da566fbac 100644 --- a/fs/bcachefs/btree_io.c +++ b/fs/bcachefs/btree_io.c @@ -857,6 +857,14 @@ static bool bkey_packed_valid(struct bch_fs *c, struct btree *b, return !__bch2_bkey_validate(c, u.s_c, btree_node_type(b), BCH_VALIDATE_silent); } +static inline int btree_node_read_bkey_cmp(const struct btree *b, + const struct bkey_packed *l, + const struct bkey_packed *r) +{ + return bch2_bkey_cmp_packed(b, l, r) + ?: (int) bkey_deleted(r) - (int) bkey_deleted(l); +} + static int validate_bset_keys(struct bch_fs *c, struct btree *b, struct bset *i, int write, bool have_retry, bool *saw_error) @@ -917,7 +925,7 @@ static int validate_bset_keys(struct bch_fs *c, struct btree *b, BSET_BIG_ENDIAN(i), write, &b->format, k); - if (prev && bkey_iter_cmp(b, prev, k) > 0) { + if (prev && btree_node_read_bkey_cmp(b, prev, k) >= 0) { struct bkey up = bkey_unpack_key(b, prev); printbuf_reset(&buf); -- 2.51.0 From dba8243f3b466dd39ef05ff63890d5acab30c852 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 25 Nov 2024 02:05:02 -0500 Subject: [PATCH 12/16] bcachefs: Don't try to en/decrypt when encryption not available If a btree node says it's encrypted, but the superblock never had an encryption key - whoops, that needs to be handled.
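Loosely sketched, the defensive shape is that every de/encrypt path first checks that key material was actually loaded from the superblock and fails gracefully instead of dereferencing a missing cipher (hypothetical names below -- this is not the exact bcachefs code):

  /* Illustrative sketch only. */
  #include <errno.h>
  #include <stddef.h>

  struct fs_crypt {
  	void *chacha20;	/* NULL when the superblock carries no encryption key */
  };

  static int decrypt_block(struct fs_crypt *crypt, void *buf, size_t len)
  {
  	if (!crypt->chacha20)
  		return -ENOKEY;	/* treat as corrupt metadata, not a crash */

  	(void) buf; (void) len;	/* unused in this stub */
  	/* ... real cipher work would go here ... */
  	return 0;
  }

With that guard in place, a crafted image that flags data as encrypted on a filesystem without an encryption key produces an error that can be reported, rather than a null pointer dereference.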
Reported-by: syzbot+026f1857b12f5eb3f9e9@syzkaller.appspotmail.com Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_io.c | 117 +++++++++++++++++----------------- fs/bcachefs/btree_node_scan.c | 3 + fs/bcachefs/checksum.c | 10 ++- fs/bcachefs/errcode.h | 1 + fs/bcachefs/io_read.c | 14 +++- 5 files changed, 84 insertions(+), 61 deletions(-) diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c index 2b5da566fbac..3bb6db9bd4a4 100644 --- a/fs/bcachefs/btree_io.c +++ b/fs/bcachefs/btree_io.c @@ -1045,39 +1045,51 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca, while (b->written < (ptr_written ?: btree_sectors(c))) { unsigned sectors; - struct nonce nonce; bool first = !b->written; - bool csum_bad; - if (!b->written) { + if (first) { + bne = NULL; i = &b->data->keys; + } else { + bne = write_block(b); + i = &bne->keys; - btree_err_on(!bch2_checksum_type_valid(c, BSET_CSUM_TYPE(i)), - -BCH_ERR_btree_node_read_err_want_retry, - c, ca, b, i, NULL, - bset_unknown_csum, - "unknown checksum type %llu", BSET_CSUM_TYPE(i)); - - nonce = btree_nonce(i, b->written << 9); + if (i->seq != b->data->keys.seq) + break; + } - struct bch_csum csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, b->data); - csum_bad = bch2_crc_cmp(b->data->csum, csum); - if (csum_bad) - bch2_io_error(ca, BCH_MEMBER_ERROR_checksum); + struct nonce nonce = btree_nonce(i, b->written << 9); + bool good_csum_type = bch2_checksum_type_valid(c, BSET_CSUM_TYPE(i)); - btree_err_on(csum_bad, - -BCH_ERR_btree_node_read_err_want_retry, - c, ca, b, i, NULL, - bset_bad_csum, - "%s", - (printbuf_reset(&buf), - bch2_csum_err_msg(&buf, BSET_CSUM_TYPE(i), b->data->csum, csum), - buf.buf)); - - ret = bset_encrypt(c, i, b->written << 9); - if (bch2_fs_fatal_err_on(ret, c, - "decrypting btree node: %s", bch2_err_str(ret))) - goto fsck_err; + btree_err_on(!good_csum_type, + bch2_csum_type_is_encryption(BSET_CSUM_TYPE(i)) + ? 
-BCH_ERR_btree_node_read_err_must_retry + : -BCH_ERR_btree_node_read_err_want_retry, + c, ca, b, i, NULL, + bset_unknown_csum, + "unknown checksum type %llu", BSET_CSUM_TYPE(i)); + + if (first) { + if (good_csum_type) { + struct bch_csum csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, b->data); + bool csum_bad = bch2_crc_cmp(b->data->csum, csum); + if (csum_bad) + bch2_io_error(ca, BCH_MEMBER_ERROR_checksum); + + btree_err_on(csum_bad, + -BCH_ERR_btree_node_read_err_want_retry, + c, ca, b, i, NULL, + bset_bad_csum, + "%s", + (printbuf_reset(&buf), + bch2_csum_err_msg(&buf, BSET_CSUM_TYPE(i), b->data->csum, csum), + buf.buf)); + + ret = bset_encrypt(c, i, b->written << 9); + if (bch2_fs_fatal_err_on(ret, c, + "decrypting btree node: %s", bch2_err_str(ret))) + goto fsck_err; + } btree_err_on(btree_node_type_is_extents(btree_node_type(b)) && !BTREE_NODE_NEW_EXTENT_OVERWRITE(b->data), @@ -1088,37 +1100,26 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca, sectors = vstruct_sectors(b->data, c->block_bits); } else { - bne = write_block(b); - i = &bne->keys; - - if (i->seq != b->data->keys.seq) - break; - - btree_err_on(!bch2_checksum_type_valid(c, BSET_CSUM_TYPE(i)), - -BCH_ERR_btree_node_read_err_want_retry, - c, ca, b, i, NULL, - bset_unknown_csum, - "unknown checksum type %llu", BSET_CSUM_TYPE(i)); - - nonce = btree_nonce(i, b->written << 9); - struct bch_csum csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, bne); - csum_bad = bch2_crc_cmp(bne->csum, csum); - if (ca && csum_bad) - bch2_io_error(ca, BCH_MEMBER_ERROR_checksum); - - btree_err_on(csum_bad, - -BCH_ERR_btree_node_read_err_want_retry, - c, ca, b, i, NULL, - bset_bad_csum, - "%s", - (printbuf_reset(&buf), - bch2_csum_err_msg(&buf, BSET_CSUM_TYPE(i), bne->csum, csum), - buf.buf)); - - ret = bset_encrypt(c, i, b->written << 9); - if (bch2_fs_fatal_err_on(ret, c, - "decrypting btree node: %s", bch2_err_str(ret))) - goto fsck_err; + if (good_csum_type) { + struct bch_csum csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, bne); + bool csum_bad = bch2_crc_cmp(bne->csum, csum); + if (ca && csum_bad) + bch2_io_error(ca, BCH_MEMBER_ERROR_checksum); + + btree_err_on(csum_bad, + -BCH_ERR_btree_node_read_err_want_retry, + c, ca, b, i, NULL, + bset_bad_csum, + "%s", + (printbuf_reset(&buf), + bch2_csum_err_msg(&buf, BSET_CSUM_TYPE(i), bne->csum, csum), + buf.buf)); + + ret = bset_encrypt(c, i, b->written << 9); + if (bch2_fs_fatal_err_on(ret, c, + "decrypting btree node: %s", bch2_err_str(ret))) + goto fsck_err; + } sectors = vstruct_sectors(bne, c->block_bits); } diff --git a/fs/bcachefs/btree_node_scan.c b/fs/bcachefs/btree_node_scan.c index 4b4df31d4b95..327f1a1859b9 100644 --- a/fs/bcachefs/btree_node_scan.c +++ b/fs/bcachefs/btree_node_scan.c @@ -159,6 +159,9 @@ static void try_read_btree_node(struct find_btree_nodes *f, struct bch_dev *ca, return; if (bch2_csum_type_is_encryption(BSET_CSUM_TYPE(&bn->keys))) { + if (!c->chacha20) + return; + struct nonce nonce = btree_nonce(&bn->keys, 0); unsigned bytes = (void *) &bn->keys - (void *) &bn->flags; diff --git a/fs/bcachefs/checksum.c b/fs/bcachefs/checksum.c index ce8fc677bef9..23a383577d4c 100644 --- a/fs/bcachefs/checksum.c +++ b/fs/bcachefs/checksum.c @@ -2,6 +2,7 @@ #include "bcachefs.h" #include "checksum.h" #include "errcode.h" +#include "error.h" #include "super.h" #include "super-io.h" @@ -252,6 +253,10 @@ int bch2_encrypt(struct bch_fs *c, unsigned type, if (!bch2_csum_type_is_encryption(type)) return 0; + if (bch2_fs_inconsistent_on(!c->chacha20, + c, "attempting 
to encrypt without encryption key")) + return -BCH_ERR_no_encryption_key; + return do_encrypt(c->chacha20, nonce, data, len); } @@ -337,8 +342,9 @@ int __bch2_encrypt_bio(struct bch_fs *c, unsigned type, size_t sgl_len = 0; int ret = 0; - if (!bch2_csum_type_is_encryption(type)) - return 0; + if (bch2_fs_inconsistent_on(!c->chacha20, + c, "attempting to encrypt without encryption key")) + return -BCH_ERR_no_encryption_key; darray_init(&sgl); diff --git a/fs/bcachefs/errcode.h b/fs/bcachefs/errcode.h index c989ce4f715f..a12050e9c191 100644 --- a/fs/bcachefs/errcode.h +++ b/fs/bcachefs/errcode.h @@ -260,6 +260,7 @@ x(EIO, no_device_to_read_from) \ x(EIO, missing_indirect_extent) \ x(EIO, invalidate_stripe_to_dev) \ + x(EIO, no_encryption_key) \ x(BCH_ERR_btree_node_read_err, btree_node_read_err_fixable) \ x(BCH_ERR_btree_node_read_err, btree_node_read_err_want_retry) \ x(BCH_ERR_btree_node_read_err, btree_node_read_err_must_retry) \ diff --git a/fs/bcachefs/io_read.c b/fs/bcachefs/io_read.c index eb8d12fd6398..4b6b6d25725b 100644 --- a/fs/bcachefs/io_read.c +++ b/fs/bcachefs/io_read.c @@ -830,7 +830,7 @@ retry_pick: if (!pick_ret) goto hole; - if (pick_ret < 0) { + if (unlikely(pick_ret < 0)) { struct printbuf buf = PRINTBUF; bch2_bkey_val_to_text(&buf, c, k); @@ -843,6 +843,18 @@ retry_pick: goto err; } + if (unlikely(bch2_csum_type_is_encryption(pick.crc.csum_type)) && !c->chacha20) { + struct printbuf buf = PRINTBUF; + bch2_bkey_val_to_text(&buf, c, k); + + bch_err_inum_offset_ratelimited(c, + read_pos.inode, read_pos.offset << 9, + "attempting to read encrypted data without encryption key\n %s", + buf.buf); + printbuf_exit(&buf); + goto err; + } + struct bch_dev *ca = bch2_dev_get_ioref(c, pick.ptr.dev, READ); /* -- 2.51.0 From 686d2ebec683903fe9ca41fd3cef9b4fb924aae4 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 25 Nov 2024 17:03:13 -0500 Subject: [PATCH 13/16] bcachefs: Change "disk accounting version 0" check to commit only 6.11 had a bug where we'd sometimes create disk accounting keys with version 0, which causes issues for journal replay - but we don't need to delete existing accounting keys with version 0. Signed-off-by: Kent Overstreet --- fs/bcachefs/disk_accounting.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/bcachefs/disk_accounting.c b/fs/bcachefs/disk_accounting.c index bb5dbbf71d04..c5e61265b709 100644 --- a/fs/bcachefs/disk_accounting.c +++ b/fs/bcachefs/disk_accounting.c @@ -134,7 +134,8 @@ int bch2_accounting_validate(struct bch_fs *c, struct bkey_s_c k, void *end = &acc_k + 1; int ret = 0; - bkey_fsck_err_on(bversion_zero(k.k->bversion), + bkey_fsck_err_on((flags & BCH_VALIDATE_commit) && + bversion_zero(k.k->bversion), c, accounting_key_version_0, "accounting key with version=0"); -- 2.51.0 From d94159763649831024255a4ec3b4bd1e1bf3f234 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 26 Nov 2024 15:16:57 -0500 Subject: [PATCH 14/16] bcachefs: Fix bch2_btree_node_update_key_early() Fix an assertion pop from the recent btree cache freelist fixes. 
Fixes: baefd3f849ed ("bcachefs: btree_cache.freeable list fixes") Reported-by: Tyler Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_cache.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c index 1f06e24e53fc..1117be901cf0 100644 --- a/fs/bcachefs/btree_cache.c +++ b/fs/bcachefs/btree_cache.c @@ -326,7 +326,7 @@ void bch2_btree_node_update_key_early(struct btree_trans *trans, if (!IS_ERR_OR_NULL(b)) { mutex_lock(&c->btree_cache.lock); - bch2_btree_node_hash_remove(&c->btree_cache, b); + __bch2_btree_node_hash_remove(&c->btree_cache, b); bkey_copy(&b->key, new); ret = __bch2_btree_node_hash_insert(&c->btree_cache, b); -- 2.51.0 From db0667a4ed82b67779855674682956685fc71f15 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 26 Nov 2024 21:27:16 -0500 Subject: [PATCH 15/16] bcachefs: Go RW earlier, for normal rw mount Previously, when mounting read-write after a clean shutdown, we wouldn't go read-write until after all the recovery passes completed. Now, go RW early in recovery, the same as any other situation we'll need to go read-write. This fixes a bug where we discover unlinked inodes after a clean shutdown: repair fails because we're read only. Signed-off-by: Kent Overstreet --- fs/bcachefs/recovery_passes.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/bcachefs/recovery_passes.c b/fs/bcachefs/recovery_passes.c index 1240c5c19fea..f6d3a99cb63e 100644 --- a/fs/bcachefs/recovery_passes.c +++ b/fs/bcachefs/recovery_passes.c @@ -46,7 +46,7 @@ static int bch2_set_may_go_rw(struct bch_fs *c) set_bit(BCH_FS_may_go_rw, &c->flags); - if (keys->nr || c->opts.fsck || !c->sb.clean || c->opts.recovery_passes) + if (keys->nr || !c->opts.read_only || c->opts.fsck || !c->sb.clean || c->opts.recovery_passes) return bch2_fs_read_write_early(c); return 0; } -- 2.51.0 From 90f3683e8f7c9eba516b65c47865fa3a5c08c6fc Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 26 Nov 2024 22:59:27 -0500 Subject: [PATCH 16/16] bcachefs: Fix null ptr deref in btree_path_lock_root() Historically, we required that all btree node roots point to a valid (possibly fake) node, but we're improving our ability to continue in the presence of errors. Reported-by: syzbot+e22007d6acb9c87c2362@syzkaller.appspotmail.com Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index 89f9665ce70d..80c3b55ce763 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -722,7 +722,7 @@ static inline int btree_path_lock_root(struct btree_trans *trans, unsigned long trace_ip) { struct bch_fs *c = trans->c; - struct btree *b, **rootp = &bch2_btree_id_root(c, path->btree_id)->b; + struct btree_root *r = bch2_btree_id_root(c, path->btree_id); enum six_lock_type lock_type; unsigned i; int ret; @@ -730,7 +730,12 @@ static inline int btree_path_lock_root(struct btree_trans *trans, EBUG_ON(path->nodes_locked); while (1) { - b = READ_ONCE(*rootp); + struct btree *b = READ_ONCE(r->b); + if (unlikely(!b)) { + BUG_ON(!r->error); + return r->error; + } + path->level = READ_ONCE(b->c.level); if (unlikely(path->level < depth_want)) { @@ -755,7 +760,7 @@ static inline int btree_path_lock_root(struct btree_trans *trans, BUG(); } - if (likely(b == READ_ONCE(*rootp) && + if (likely(b == READ_ONCE(r->b) && b->c.level == path->level && !race_fault())) { for (i = 0; i < path->level; i++) -- 2.51.0