From 04e90891be26a240e41d51d1770de56e810fda5e Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 20 Mar 2025 11:41:07 -0400 Subject: [PATCH 01/16] bcachefs: Run bch2_check_dirent_target() at lookup time More on the "full online self healing" project: We now run most of the dirent <-> inode consistency checks, with repair code, at runtime - the exact same check and repair code that fsck runs. This will allow us to repair the "dirent points to inode that does not point back" inconsistencies that have been popping up at runtime. Signed-off-by: Kent Overstreet --- fs/bcachefs/fs.c | 32 +++++++++++++++++--------------- 1 file changed, 17 insertions(+), 15 deletions(-) diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c index 273078ceb4df..fbca200f7636 100644 --- a/fs/bcachefs/fs.c +++ b/fs/bcachefs/fs.c @@ -641,7 +641,9 @@ static struct bch_inode_info *bch2_lookup_trans(struct btree_trans *trans, if (ret) return ERR_PTR(ret); - ret = bch2_dirent_read_target(trans, dir, bkey_s_c_to_dirent(k), &inum); + struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k); + + ret = bch2_dirent_read_target(trans, dir, d, &inum); if (ret > 0) ret = -ENOENT; if (ret) @@ -651,30 +653,30 @@ static struct bch_inode_info *bch2_lookup_trans(struct btree_trans *trans, if (inode) goto out; + /* + * Note: if check/repair needs it, we commit before + * bch2_inode_hash_init_insert(), as after that point we can't take a + * restart - not in the top level loop with a commit_do(), like we + * usually do: + */ + struct bch_subvolume subvol; struct bch_inode_unpacked inode_u; ret = bch2_subvolume_get(trans, inum.subvol, true, &subvol) ?: bch2_inode_find_by_inum_nowarn_trans(trans, inum, &inode_u) ?: + bch2_check_dirent_target(trans, &dirent_iter, d, &inode_u, false) ?: + bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc) ?: PTR_ERR_OR_ZERO(inode = bch2_inode_hash_init_insert(trans, inum, &inode_u, &subvol)); + /* + * don't remove it: check_inodes might find another inode that points + * back to this dirent + */ bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), c, "dirent to missing inode:\n %s", - (bch2_bkey_val_to_text(&buf, c, k), buf.buf)); + (bch2_bkey_val_to_text(&buf, c, d.s_c), buf.buf)); if (ret) goto err; - - /* regular files may have hardlinks: */ - if (bch2_fs_inconsistent_on(bch2_inode_should_have_single_bp(&inode_u) && - !bkey_eq(k.k->p, POS(inode_u.bi_dir, inode_u.bi_dir_offset)), - c, - "dirent points to inode that does not point back:\n %s", - (bch2_bkey_val_to_text(&buf, c, k), - prt_printf(&buf, "\n "), - bch2_inode_unpacked_to_text(&buf, &inode_u), - buf.buf))) { - ret = -ENOENT; - goto err; - } out: bch2_trans_iter_exit(trans, &dirent_iter); printbuf_exit(&buf); -- 2.50.1 From 6a9f681ef623ae3804bc2ca3a2d06d2458142975 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 20 Mar 2025 11:53:50 -0400 Subject: [PATCH 02/16] bcachefs: Count BCH_DATA_parity backpointers correctly These are counted as stripe data in the corresponding alloc keys. Signed-off-by: Kent Overstreet --- fs/bcachefs/backpointers.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/fs/bcachefs/backpointers.c b/fs/bcachefs/backpointers.c index c9dfc3657696..8da1b68821a0 100644 --- a/fs/bcachefs/backpointers.c +++ b/fs/bcachefs/backpointers.c @@ -50,6 +50,8 @@ void bch2_backpointer_to_text(struct printbuf *out, struct bch_fs *c, struct bke } bch2_btree_id_level_to_text(out, bp.v->btree_id, bp.v->level); + prt_str(out, " data_type="); + bch2_prt_data_type(out, bp.v->data_type); prt_printf(out, " suboffset=%u len=%u gen=%u pos=", (u32) bp.k->p.offset & ~(~0U << MAX_EXTENT_COMPRESS_RATIO_SHIFT), bp.v->bucket_len, @@ -791,6 +793,7 @@ static enum alloc_sector_counter data_type_to_alloc_counter(enum bch_data_type t case BCH_DATA_cached: return ALLOC_cached; case BCH_DATA_stripe: + case BCH_DATA_parity: return ALLOC_stripe; default: BUG(); -- 2.50.1 From 962322475bb5cebe0da581f6f18d23b00184aa01 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 19 Mar 2025 17:01:38 -0400 Subject: [PATCH 03/16] bcachefs: Handle backpointers with unknown data types New data types might be added later, so we don't want to disallow unknown data types - that'll be a compatibility hassle later. Instead, ignore them. Reported-by: syzbot+3a290f5ff67ca3023834@syzkaller.appspotmail.com Signed-off-by: Kent Overstreet --- fs/bcachefs/backpointers.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/fs/bcachefs/backpointers.c b/fs/bcachefs/backpointers.c index 8da1b68821a0..20c497f0c2cb 100644 --- a/fs/bcachefs/backpointers.c +++ b/fs/bcachefs/backpointers.c @@ -784,7 +784,7 @@ enum alloc_sector_counter { ALLOC_SECTORS_NR }; -static enum alloc_sector_counter data_type_to_alloc_counter(enum bch_data_type t) +static int data_type_to_alloc_counter(enum bch_data_type t) { switch (t) { case BCH_DATA_btree: @@ -796,7 +796,7 @@ static enum alloc_sector_counter data_type_to_alloc_counter(enum bch_data_type t case BCH_DATA_parity: return ALLOC_stripe; default: - BUG(); + return -1; } } @@ -847,7 +847,11 @@ static int check_bucket_backpointer_mismatch(struct btree_trans *trans, struct b if (bp.v->bucket_gen != a->gen) continue; - sectors[data_type_to_alloc_counter(bp.v->data_type)] += bp.v->bucket_len; + int alloc_counter = data_type_to_alloc_counter(bp.v->data_type); + if (alloc_counter < 0) + continue; + + sectors[alloc_counter] += bp.v->bucket_len; }; bch2_trans_iter_exit(trans, &iter); if (ret) -- 2.50.1 From 9c3a2c9b471aa42b13c26c916f6a0852899a57e0 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 20 Mar 2025 12:38:59 -0400 Subject: [PATCH 04/16] bcachefs: Disable asm memcpys when kmsan enabled kmsan doesn't know about inline assembly, obviously; this will close a ton of syzbot bugs. Signed-off-by: Kent Overstreet --- fs/bcachefs/util.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/bcachefs/util.h b/fs/bcachefs/util.h index d41e133acc4d..7d921fc920a0 100644 --- a/fs/bcachefs/util.h +++ b/fs/bcachefs/util.h @@ -431,7 +431,7 @@ static inline void memcpy_u64s_small(void *dst, const void *src, static inline void __memcpy_u64s(void *dst, const void *src, unsigned u64s) { -#ifdef CONFIG_X86_64 +#if defined(CONFIG_X86_64) && !defined(CONFIG_KMSAN) long d0, d1, d2; asm volatile("rep ; movsq" @@ -508,7 +508,7 @@ static inline void __memmove_u64s_up(void *_dst, const void *_src, u64 *dst = (u64 *) _dst + u64s - 1; u64 *src = (u64 *) _src + u64s - 1; -#ifdef CONFIG_X86_64 +#if defined(CONFIG_X86_64) && !defined(CONFIG_KMSAN) long d0, d1, d2; asm volatile("std ;\n" -- 2.50.1 From 53cf2a3daa4ca5f0a40eeb18c2be8724f123a63c Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 20 Mar 2025 13:24:50 -0400 Subject: [PATCH 05/16] bcachefs: Fix kmsan warnings in bch2_extent_crc_pack() We store to all fields, so the kmsan warnings were spurious - but initializing via stores to bitfields appear to have been giving the compiler/kmsan trouble, and they're not necessary. Signed-off-by: Kent Overstreet --- fs/bcachefs/extents.c | 38 ++++++++++++++++++++++---------------- 1 file changed, 22 insertions(+), 16 deletions(-) diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c index 04946d9911f5..ae1a1d917805 100644 --- a/fs/bcachefs/extents.c +++ b/fs/bcachefs/extents.c @@ -592,29 +592,35 @@ static void bch2_extent_crc_pack(union bch_extent_crc *dst, struct bch_extent_crc_unpacked src, enum bch_extent_entry_type type) { -#define set_common_fields(_dst, _src) \ - _dst.type = 1 << type; \ - _dst.csum_type = _src.csum_type, \ - _dst.compression_type = _src.compression_type, \ - _dst._compressed_size = _src.compressed_size - 1, \ - _dst._uncompressed_size = _src.uncompressed_size - 1, \ - _dst.offset = _src.offset +#define common_fields(_src) \ + .type = BIT(type), \ + .csum_type = _src.csum_type, \ + .compression_type = _src.compression_type, \ + ._compressed_size = _src.compressed_size - 1, \ + ._uncompressed_size = _src.uncompressed_size - 1, \ + .offset = _src.offset switch (type) { case BCH_EXTENT_ENTRY_crc32: - set_common_fields(dst->crc32, src); - dst->crc32.csum = (u32 __force) *((__le32 *) &src.csum.lo); + dst->crc32 = (struct bch_extent_crc32) { + common_fields(src), + .csum = (u32 __force) *((__le32 *) &src.csum.lo), + }; break; case BCH_EXTENT_ENTRY_crc64: - set_common_fields(dst->crc64, src); - dst->crc64.nonce = src.nonce; - dst->crc64.csum_lo = (u64 __force) src.csum.lo; - dst->crc64.csum_hi = (u64 __force) *((__le16 *) &src.csum.hi); + dst->crc64 = (struct bch_extent_crc64) { + common_fields(src), + .nonce = src.nonce, + .csum_lo = (u64 __force) src.csum.lo, + .csum_hi = (u64 __force) *((__le16 *) &src.csum.hi), + }; break; case BCH_EXTENT_ENTRY_crc128: - set_common_fields(dst->crc128, src); - dst->crc128.nonce = src.nonce; - dst->crc128.csum = src.csum; + dst->crc128 = (struct bch_extent_crc128) { + common_fields(src), + .nonce = src.nonce, + .csum = src.csum, + }; break; default: BUG(); -- 2.50.1 From 28aa859b6b422da5c982610d0add9128f813e9f2 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 20 Mar 2025 14:17:53 -0400 Subject: [PATCH 06/16] bcachefs: kmsan asserts Catching these early makes them a lot easier to track down. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_trans_commit.c | 1 + fs/bcachefs/btree_update.c | 2 ++ fs/bcachefs/btree_update.h | 2 ++ 3 files changed, 5 insertions(+) diff --git a/fs/bcachefs/btree_trans_commit.c b/fs/bcachefs/btree_trans_commit.c index d50dc31d0bea..7d7e52ddde02 100644 --- a/fs/bcachefs/btree_trans_commit.c +++ b/fs/bcachefs/btree_trans_commit.c @@ -164,6 +164,7 @@ bool bch2_btree_bset_insert_key(struct btree_trans *trans, EBUG_ON(bpos_gt(insert->k.p, b->data->max_key)); EBUG_ON(insert->k.u64s > bch2_btree_keys_u64s_remaining(b)); EBUG_ON(!b->c.level && !bpos_eq(insert->k.p, path->pos)); + kmsan_check_memory(insert, bkey_bytes(&insert->k)); k = bch2_btree_node_iter_peek_all(node_iter, b); if (k && bkey_cmp_left_packed(b, k, &insert->k.p)) diff --git a/fs/bcachefs/btree_update.c b/fs/bcachefs/btree_update.c index b3e346b5f8d7..bd2eb42edb24 100644 --- a/fs/bcachefs/btree_update.c +++ b/fs/bcachefs/btree_update.c @@ -512,6 +512,8 @@ static noinline int bch2_trans_update_get_key_cache(struct btree_trans *trans, int __must_check bch2_trans_update(struct btree_trans *trans, struct btree_iter *iter, struct bkey_i *k, enum btree_iter_update_trigger_flags flags) { + kmsan_check_memory(k, bkey_bytes(&k->k)); + btree_path_idx_t path_idx = iter->update_path ?: iter->path; int ret; diff --git a/fs/bcachefs/btree_update.h b/fs/bcachefs/btree_update.h index 47d8690f01bf..d2e1c04353f6 100644 --- a/fs/bcachefs/btree_update.h +++ b/fs/bcachefs/btree_update.h @@ -133,6 +133,8 @@ static inline int __must_check bch2_trans_update_buffered(struct btree_trans *tr enum btree_id btree, struct bkey_i *k) { + kmsan_check_memory(k, bkey_bytes(&k->k)); + if (unlikely(!btree_type_uses_write_buffer(btree))) { int ret = bch2_btree_write_buffer_insert_err(trans, btree, k); dump_stack(); -- 2.50.1 From 1f88c35674954fbb0b14d994c5fa02c7c5190356 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 20 Mar 2025 14:54:49 -0400 Subject: [PATCH 07/16] bcachefs: Fix a KMSAN splat in btree_update_nodes_written() We may sometimes read from uninitialized memory; we know, and that's ok. We check if a btree node has been reused before waiting on any outstanding IO. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_update_interior.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index d3e0cf01ba37..67f1e3202835 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -649,6 +649,14 @@ static int btree_update_nodes_written_trans(struct btree_trans *trans, return 0; } +/* If the node has been reused, we might be reading uninitialized memory - that's fine: */ +static noinline __no_kmsan_checks bool btree_node_seq_matches(struct btree *b, __le64 seq) +{ + struct btree_node *b_data = READ_ONCE(b->data); + + return (b_data ? b_data->keys.seq : 0) == seq; +} + static void btree_update_nodes_written(struct btree_update *as) { struct bch_fs *c = as->c; @@ -677,17 +685,9 @@ static void btree_update_nodes_written(struct btree_update *as) * on disk: */ for (i = 0; i < as->nr_old_nodes; i++) { - __le64 seq; - b = as->old_nodes[i]; - bch2_trans_begin(trans); - btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_read); - seq = b->data ? b->data->keys.seq : 0; - six_unlock_read(&b->c.lock); - bch2_trans_unlock_long(trans); - - if (seq == as->old_nodes_seq[i]) + if (btree_node_seq_matches(b, as->old_nodes_seq[i])) wait_on_bit_io(&b->flags, BTREE_NODE_write_in_flight_inner, TASK_UNINTERRUPTIBLE); } -- 2.50.1 From 9ea24b287b3b9118a157509d931e7d27414e98c7 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 20 Mar 2025 14:15:33 -0400 Subject: [PATCH 08/16] bcachefs: Eliminate padding in move_bucket_key We appear to be tripping over a compiler/kmsan bug with padding fields - this is an easy workaround. Signed-off-by: Kent Overstreet --- fs/bcachefs/move_types.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/bcachefs/move_types.h b/fs/bcachefs/move_types.h index 82e473ed48d2..807f779f6f76 100644 --- a/fs/bcachefs/move_types.h +++ b/fs/bcachefs/move_types.h @@ -32,7 +32,7 @@ struct bch_move_stats { struct move_bucket_key { struct bpos bucket; - u8 gen; + unsigned gen; }; struct move_bucket { -- 2.50.1 From 5ae6f33053af6e904e609593d05e4faf3aeb16fb Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 21 Mar 2025 11:30:09 -0400 Subject: [PATCH 09/16] bcachefs: zero init journal bios fix a kmsan splat Signed-off-by: Kent Overstreet --- fs/bcachefs/journal.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c index ce7302695547..bfdaea6569ae 100644 --- a/fs/bcachefs/journal.c +++ b/fs/bcachefs/journal.c @@ -1510,7 +1510,7 @@ int bch2_dev_journal_init(struct bch_dev *ca, struct bch_sb *sb) unsigned nr_bvecs = DIV_ROUND_UP(JOURNAL_ENTRY_SIZE_MAX, PAGE_SIZE); for (unsigned i = 0; i < ARRAY_SIZE(ja->bio); i++) { - ja->bio[i] = kmalloc(struct_size(ja->bio[i], bio.bi_inline_vecs, + ja->bio[i] = kzalloc(struct_size(ja->bio[i], bio.bi_inline_vecs, nr_bvecs), GFP_KERNEL); if (!ja->bio[i]) return -BCH_ERR_ENOMEM_dev_journal_init; -- 2.50.1 From f4a584f4bf64e0db30312088d504d4da29ca556b Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 21 Mar 2025 12:29:56 -0400 Subject: [PATCH 10/16] bcachefs: bch2_disk_accounting_mod2() We're hitting some issues with uninitialized struct padding, flagged by kmsan. They appear to be falso positives, otherwise bch2_accounting_validate() would have flagged them as "junk at end". But for now, we'll need to initialize disk_accounting_pos with memset(). This adds a new helper, bch2_disk_accounting_mod2(), that initializes a disk_accounting_pos and does the accounting mod all at once - so overall things actually get slightly more ergonomic. BCH_DISK_ACCOUNTING_replicas keys are left for now; KMSAN isn't warning about them and they're a bit special. Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_background.c | 10 ++--- fs/bcachefs/buckets.c | 66 ++++++++++------------------ fs/bcachefs/disk_accounting.h | 18 ++++++++ fs/bcachefs/disk_accounting_format.h | 12 ++--- fs/bcachefs/inode.c | 7 ++- fs/bcachefs/super.c | 9 ++-- 6 files changed, 57 insertions(+), 65 deletions(-) diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index 2828baa9b162..5fb396be9127 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -777,14 +777,12 @@ static inline int bch2_dev_data_type_accounting_mod(struct btree_trans *trans, s s64 delta_sectors, s64 delta_fragmented, unsigned flags) { - struct disk_accounting_pos acc = { - .type = BCH_DISK_ACCOUNTING_dev_data_type, - .dev_data_type.dev = ca->dev_idx, - .dev_data_type.data_type = data_type, - }; s64 d[3] = { delta_buckets, delta_sectors, delta_fragmented }; - return bch2_disk_accounting_mod(trans, &acc, d, 3, flags & BTREE_TRIGGER_gc); + return bch2_disk_accounting_mod2(trans, flags & BTREE_TRIGGER_gc, + d, dev_data_type, + .dev = ca->dev_idx, + .data_type = data_type); } int bch2_alloc_key_to_dev_counters(struct btree_trans *trans, struct bch_dev *ca, diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index bb7742cf0014..e56ef623ebc1 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -724,9 +724,7 @@ static int __trigger_extent(struct btree_trans *trans, .replicas.nr_required = 1, }; - struct disk_accounting_pos acct_compression_key = { - .type = BCH_DISK_ACCOUNTING_compression, - }; + unsigned cur_compression_type = 0; u64 compression_acct[3] = { 1, 0, 0 }; bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { @@ -760,13 +758,13 @@ static int __trigger_extent(struct btree_trans *trans, acc_replicas_key.replicas.nr_required = 0; } - if (acct_compression_key.compression.type && - acct_compression_key.compression.type != p.crc.compression_type) { + if (cur_compression_type && + cur_compression_type != p.crc.compression_type) { if (flags & BTREE_TRIGGER_overwrite) bch2_u64s_neg(compression_acct, ARRAY_SIZE(compression_acct)); - ret = bch2_disk_accounting_mod(trans, &acct_compression_key, compression_acct, - ARRAY_SIZE(compression_acct), gc); + ret = bch2_disk_accounting_mod2(trans, gc, compression_acct, + compression, cur_compression_type); if (ret) return ret; @@ -775,7 +773,7 @@ static int __trigger_extent(struct btree_trans *trans, compression_acct[2] = 0; } - acct_compression_key.compression.type = p.crc.compression_type; + cur_compression_type = p.crc.compression_type; if (p.crc.compression_type) { compression_acct[1] += p.crc.uncompressed_size; compression_acct[2] += p.crc.compressed_size; @@ -789,45 +787,34 @@ static int __trigger_extent(struct btree_trans *trans, } if (acc_replicas_key.replicas.nr_devs && !level && k.k->p.snapshot) { - struct disk_accounting_pos acc_snapshot_key = { - .type = BCH_DISK_ACCOUNTING_snapshot, - .snapshot.id = k.k->p.snapshot, - }; - ret = bch2_disk_accounting_mod(trans, &acc_snapshot_key, replicas_sectors, 1, gc); + ret = bch2_disk_accounting_mod2_nr(trans, gc, replicas_sectors, 1, snapshot, k.k->p.snapshot); if (ret) return ret; } - if (acct_compression_key.compression.type) { + if (cur_compression_type) { if (flags & BTREE_TRIGGER_overwrite) bch2_u64s_neg(compression_acct, ARRAY_SIZE(compression_acct)); - ret = bch2_disk_accounting_mod(trans, &acct_compression_key, compression_acct, - ARRAY_SIZE(compression_acct), gc); + ret = bch2_disk_accounting_mod2(trans, gc, compression_acct, + compression, cur_compression_type); if (ret) return ret; } if (level) { - struct disk_accounting_pos acc_btree_key = { - .type = BCH_DISK_ACCOUNTING_btree, - .btree.id = btree_id, - }; - ret = bch2_disk_accounting_mod(trans, &acc_btree_key, replicas_sectors, 1, gc); + ret = bch2_disk_accounting_mod2_nr(trans, gc, replicas_sectors, 1, btree, btree_id); if (ret) return ret; } else { bool insert = !(flags & BTREE_TRIGGER_overwrite); - struct disk_accounting_pos acc_inum_key = { - .type = BCH_DISK_ACCOUNTING_inum, - .inum.inum = k.k->p.inode, - }; + s64 v[3] = { insert ? 1 : -1, insert ? k.k->size : -((s64) k.k->size), *replicas_sectors, }; - ret = bch2_disk_accounting_mod(trans, &acc_inum_key, v, ARRAY_SIZE(v), gc); + ret = bch2_disk_accounting_mod2(trans, gc, v, inum, k.k->p.inode); if (ret) return ret; } @@ -876,15 +863,15 @@ int bch2_trigger_extent(struct btree_trans *trans, } int need_rebalance_delta = 0; - s64 need_rebalance_sectors_delta = 0; + s64 need_rebalance_sectors_delta[1] = { 0 }; s64 s = bch2_bkey_sectors_need_rebalance(c, old); need_rebalance_delta -= s != 0; - need_rebalance_sectors_delta -= s; + need_rebalance_sectors_delta[0] -= s; s = bch2_bkey_sectors_need_rebalance(c, new.s_c); need_rebalance_delta += s != 0; - need_rebalance_sectors_delta += s; + need_rebalance_sectors_delta[0] += s; if ((flags & BTREE_TRIGGER_transactional) && need_rebalance_delta) { int ret = bch2_btree_bit_mod_buffered(trans, BTREE_ID_rebalance_work, @@ -893,12 +880,9 @@ int bch2_trigger_extent(struct btree_trans *trans, return ret; } - if (need_rebalance_sectors_delta) { - struct disk_accounting_pos acc = { - .type = BCH_DISK_ACCOUNTING_rebalance_work, - }; - int ret = bch2_disk_accounting_mod(trans, &acc, &need_rebalance_sectors_delta, 1, - flags & BTREE_TRIGGER_gc); + if (need_rebalance_sectors_delta[0]) { + int ret = bch2_disk_accounting_mod2(trans, flags & BTREE_TRIGGER_gc, + need_rebalance_sectors_delta, rebalance_work); if (ret) return ret; } @@ -914,17 +898,13 @@ static int __trigger_reservation(struct btree_trans *trans, enum btree_iter_update_trigger_flags flags) { if (flags & (BTREE_TRIGGER_transactional|BTREE_TRIGGER_gc)) { - s64 sectors = k.k->size; + s64 sectors[1] = { k.k->size }; if (flags & BTREE_TRIGGER_overwrite) - sectors = -sectors; - - struct disk_accounting_pos acc = { - .type = BCH_DISK_ACCOUNTING_persistent_reserved, - .persistent_reserved.nr_replicas = bkey_s_c_to_reservation(k).v->nr_replicas, - }; + sectors[0] = -sectors[0]; - return bch2_disk_accounting_mod(trans, &acc, §ors, 1, flags & BTREE_TRIGGER_gc); + return bch2_disk_accounting_mod2(trans, flags & BTREE_TRIGGER_gc, sectors, + persistent_reserved, bkey_s_c_to_reservation(k).v->nr_replicas); } return 0; diff --git a/fs/bcachefs/disk_accounting.h b/fs/bcachefs/disk_accounting.h index f4372cafea2e..f9214e2d1346 100644 --- a/fs/bcachefs/disk_accounting.h +++ b/fs/bcachefs/disk_accounting.h @@ -85,6 +85,24 @@ static inline struct bpos disk_accounting_pos_to_bpos(struct disk_accounting_pos int bch2_disk_accounting_mod(struct btree_trans *, struct disk_accounting_pos *, s64 *, unsigned, bool); + +#define disk_accounting_key_init(_k, _type, ...) \ +do { \ + memset(&(_k), 0, sizeof(_k)); \ + (_k).type = BCH_DISK_ACCOUNTING_##_type; \ + (_k)._type = (struct bch_acct_##_type) { __VA_ARGS__ }; \ +} while (0) + +#define bch2_disk_accounting_mod2_nr(_trans, _gc, _v, _nr, ...) \ +({ \ + struct disk_accounting_pos pos; \ + disk_accounting_key_init(pos, __VA_ARGS__); \ + bch2_disk_accounting_mod(trans, &pos, _v, _nr, _gc); \ +}) + +#define bch2_disk_accounting_mod2(_trans, _gc, _v, ...) \ + bch2_disk_accounting_mod2_nr(_trans, _gc, _v, ARRAY_SIZE(_v), __VA_ARGS__) + int bch2_mod_dev_cached_sectors(struct btree_trans *, unsigned, s64, bool); int bch2_accounting_validate(struct bch_fs *, struct bkey_s_c, diff --git a/fs/bcachefs/disk_accounting_format.h b/fs/bcachefs/disk_accounting_format.h index 7b6e6c97e6aa..15190196485f 100644 --- a/fs/bcachefs/disk_accounting_format.h +++ b/fs/bcachefs/disk_accounting_format.h @@ -113,14 +113,14 @@ enum disk_accounting_type { BCH_DISK_ACCOUNTING_TYPE_NR, }; -struct bch_nr_inodes { +struct bch_acct_nr_inodes { }; -struct bch_persistent_reserved { +struct bch_acct_persistent_reserved { __u8 nr_replicas; }; -struct bch_dev_data_type { +struct bch_acct_dev_data_type { __u8 dev; __u8 data_type; }; @@ -149,10 +149,10 @@ struct disk_accounting_pos { struct { __u8 type; union { - struct bch_nr_inodes nr_inodes; - struct bch_persistent_reserved persistent_reserved; + struct bch_acct_nr_inodes nr_inodes; + struct bch_acct_persistent_reserved persistent_reserved; struct bch_replicas_entry_v1 replicas; - struct bch_dev_data_type dev_data_type; + struct bch_acct_dev_data_type dev_data_type; struct bch_acct_compression compression; struct bch_acct_snapshot snapshot; struct bch_acct_btree btree; diff --git a/fs/bcachefs/inode.c b/fs/bcachefs/inode.c index 1383fdcc42a5..80051073f613 100644 --- a/fs/bcachefs/inode.c +++ b/fs/bcachefs/inode.c @@ -731,10 +731,9 @@ int bch2_trigger_inode(struct btree_trans *trans, bkey_s_to_inode_v3(new).v->bi_journal_seq = cpu_to_le64(trans->journal_res.seq); } - s64 nr = bkey_is_inode(new.k) - bkey_is_inode(old.k); - if ((flags & (BTREE_TRIGGER_transactional|BTREE_TRIGGER_gc)) && nr) { - struct disk_accounting_pos acc = { .type = BCH_DISK_ACCOUNTING_nr_inodes }; - int ret = bch2_disk_accounting_mod(trans, &acc, &nr, 1, flags & BTREE_TRIGGER_gc); + s64 nr[1] = { bkey_is_inode(new.k) - bkey_is_inode(old.k) }; + if ((flags & (BTREE_TRIGGER_transactional|BTREE_TRIGGER_gc)) && nr[0]) { + int ret = bch2_disk_accounting_mod2(trans, flags & BTREE_TRIGGER_gc, nr, nr_inodes); if (ret) return ret; } diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index d662adfbdbcc..99f9a0aaa380 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -1990,15 +1990,12 @@ int bch2_dev_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets) mutex_unlock(&c->sb_lock); if (ca->mi.freespace_initialized) { - struct disk_accounting_pos acc = { - .type = BCH_DISK_ACCOUNTING_dev_data_type, - .dev_data_type.dev = ca->dev_idx, - .dev_data_type.data_type = BCH_DATA_free, - }; u64 v[3] = { nbuckets - old_nbuckets, 0, 0 }; ret = bch2_trans_commit_do(ca->fs, NULL, NULL, 0, - bch2_disk_accounting_mod(trans, &acc, v, ARRAY_SIZE(v), false)) ?: + bch2_disk_accounting_mod2(trans, false, v, dev_data_type, + .dev = ca->dev_idx, + .data_type = BCH_DATA_free)) ?: bch2_dev_freespace_init(c, ca, old_nbuckets, nbuckets); if (ret) goto err; -- 2.50.1 From 0b4fd567261bc21ba1fd8636489396f0940b54f8 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 21 Mar 2025 14:22:39 -0400 Subject: [PATCH 11/16] bcachefs: btree_trans_restart_foreign_task() In debug mode, we save the call stack on transaction restart - but there's no locking, so we can't touch it if we're issuing the restart from another thread. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.h | 9 ++++++++- fs/bcachefs/btree_locking.c | 4 +++- 2 files changed, 11 insertions(+), 2 deletions(-) diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h index b96157f3dc9c..8823eec6b284 100644 --- a/fs/bcachefs/btree_iter.h +++ b/fs/bcachefs/btree_iter.h @@ -335,13 +335,20 @@ static inline void bch2_trans_verify_not_unlocked_or_in_restart(struct btree_tra } __always_inline -static int btree_trans_restart_ip(struct btree_trans *trans, int err, unsigned long ip) +static int btree_trans_restart_foreign_task(struct btree_trans *trans, int err, unsigned long ip) { BUG_ON(err <= 0); BUG_ON(!bch2_err_matches(-err, BCH_ERR_transaction_restart)); trans->restarted = err; trans->last_restarted_ip = ip; + return -err; +} + +__always_inline +static int btree_trans_restart_ip(struct btree_trans *trans, int err, unsigned long ip) +{ + btree_trans_restart_foreign_task(trans, err, ip); #ifdef CONFIG_BCACHEFS_DEBUG darray_exit(&trans->last_restarted_trace); bch2_save_backtrace(&trans->last_restarted_trace, current, 0, GFP_NOWAIT); diff --git a/fs/bcachefs/btree_locking.c b/fs/bcachefs/btree_locking.c index caef65adeae4..b18fbf6f6226 100644 --- a/fs/bcachefs/btree_locking.c +++ b/fs/bcachefs/btree_locking.c @@ -172,7 +172,9 @@ static int abort_lock(struct lock_graph *g, struct trans_waiting_for_lock *i) { if (i == g->g) { trace_would_deadlock(g, i->trans); - return btree_trans_restart(i->trans, BCH_ERR_transaction_restart_would_deadlock); + return btree_trans_restart_foreign_task(i->trans, + BCH_ERR_transaction_restart_would_deadlock, + _THIS_IP_); } else { i->trans->lock_must_abort = true; wake_up_process(i->trans->locking_wait.task); -- 2.50.1 From 739200c57384313e688e6945b56782721c29459f Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 21 Mar 2025 21:53:41 -0400 Subject: [PATCH 12/16] bcachefs: Fix race in print_chain() 00636 Unable to handle kernel NULL pointer dereference at virtual address 00000000000000b0 00636 Mem abort info: 00636 ESR = 0x0000000096000005 00636 EC = 0x25: DABT (current EL), IL = 32 bits 00636 SET = 0, FnV = 0 00636 EA = 0, S1PTW = 0 00636 FSC = 0x05: level 1 translation fault 00636 Data abort info: 00636 ISV = 0, ISS = 0x00000005, ISS2 = 0x00000000 00636 CM = 0, WnR = 0, TnD = 0, TagAccess = 0 00636 GCS = 0, Overlay = 0, DirtyBit = 0, Xs = 0 00636 user pgtable: 4k pages, 39-bit VAs, pgdp=0000000101b10000 00636 [00000000000000b0] pgd=0000000000000000, p4d=0000000000000000, pud=0000000000000000 00636 Internal error: Oops: 0000000096000005 [#1] SMP 00636 Modules linked in: 00636 CPU: 12 UID: 0 PID: 79369 Comm: cat Not tainted 6.14.0-rc6-ktest-g3783b8973ab7 #17757 00636 Hardware name: linux,dummy-virt (DT) 00636 pstate: 20001005 (nzCv daif -PAN -UAO -TCO -DIT +SSBS BTYPE=--) 00636 pc : print_chain+0xb8/0x170 00636 lr : print_chain+0xa0/0x170 00636 sp : ffffff80d9c1bbb0 00636 x29: ffffff80d9c1bbb0 x28: 0000000000000002 x27: ffffff80c1be8250 00636 x26: ffffff80dd9b0000 x25: 0000000000000020 x24: 000000000000002d 00636 x23: 000000000000003c x22: ffffffc080a54518 x21: ffffff80da6e00d0 00636 x20: ffffff80da6e0170 x19: ffffff80c1a1d240 x18: 00000000ffffffff 00636 x17: 3535303937202d3c x16: 203139202d3c2035 x15: 00000000ffffffff 00636 x14: 0000000000000000 x13: ffffff80d71b63f1 x12: 0000000000000006 00636 x11: ffffffc080beb1c0 x10: 0000000000000020 x9 : 00000000000134cc 00636 x8 : 0000000000000020 x7 : 0000000000000004 x6 : 0000000000000020 00636 x5 : ffffff80d71b63f7 x4 : ffffffc080a5451b x3 : 0000000000000000 00636 x2 : 0000000000000000 x1 : 0000000000000000 x0 : 0000000000000000 00636 Call trace: 00636 print_chain+0xb8/0x170 (P) 00636 bch2_check_for_deadlock+0x444/0x5a0 00636 bch2_btree_deadlock_read+0xb4/0x1c8 00636 full_proxy_read+0x74/0xd8 00636 vfs_read+0x90/0x300 Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_locking.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/bcachefs/btree_locking.c b/fs/bcachefs/btree_locking.c index b18fbf6f6226..94eb2b73a843 100644 --- a/fs/bcachefs/btree_locking.c +++ b/fs/bcachefs/btree_locking.c @@ -91,10 +91,10 @@ static noinline void print_chain(struct printbuf *out, struct lock_graph *g) struct trans_waiting_for_lock *i; for (i = g->g; i != g->g + g->nr; i++) { - struct task_struct *task = i->trans->locking_wait.task; + struct task_struct *task = READ_ONCE(i->trans->locking_wait.task); if (i != g->g) prt_str(out, "<- "); - prt_printf(out, "%u ", task ?task->pid : 0); + prt_printf(out, "%u ", task ? task->pid : 0); } prt_newline(out); } -- 2.50.1 From 2adfa467347f6e5d8091ecbc45a78cac3d2a2b91 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 22 Mar 2025 16:26:32 -0400 Subject: [PATCH 13/16] bcachefs: btree node write errors now print btree node It turned out a user was wondering why we were going read-only after a write error, and he didn't realize he didn't have replication enabled - this will make that more obvious, and we should be printing it anyways. Link: https://www.reddit.com/r/bcachefs/comments/1jf9akl/large_data_transfers_switched_bcachefs_to_readonly/ Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_io.c | 22 ++++++++++++++++------ 1 file changed, 16 insertions(+), 6 deletions(-) diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c index 6abc9f17ea3c..2ba33ffc9795 100644 --- a/fs/bcachefs/btree_io.c +++ b/fs/bcachefs/btree_io.c @@ -2117,8 +2117,14 @@ out: return; err: set_btree_node_noevict(b); - bch2_fs_fatal_err_on(!bch2_err_matches(ret, EROFS), c, - "writing btree node: %s", bch2_err_str(ret)); + + if (!bch2_err_matches(ret, EROFS)) { + struct printbuf buf = PRINTBUF; + prt_printf(&buf, "writing btree node: %s\n ", bch2_err_str(ret)); + bch2_btree_pos_to_text(&buf, c, b); + bch2_fs_fatal_error(c, "%s", buf.buf); + printbuf_exit(&buf); + } goto out; } @@ -2135,10 +2141,14 @@ static void btree_node_write_endio(struct bio *bio) bch2_account_io_completion(ca, BCH_MEMBER_ERROR_write, wbio->submit_time, !bio->bi_status); - if (ca && bio->bi_status) - bch_err_dev_ratelimited(ca, - "btree write error: %s", - bch2_blk_status_to_str(bio->bi_status)); + if (ca && bio->bi_status) { + struct printbuf buf = PRINTBUF; + prt_printf(&buf, "btree write error: %s\n ", + bch2_blk_status_to_str(bio->bi_status)); + bch2_btree_pos_to_text(&buf, c, b); + bch_err_dev_ratelimited(ca, "%s", buf.buf); + printbuf_exit(&buf); + } if (bio->bi_status) { unsigned long flags; -- 2.50.1 From d8bdc8daac1d1b0a4efb1ecc69bef4eb4fc5e050 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 21 Mar 2025 21:16:50 -0400 Subject: [PATCH 14/16] bcachefs: Kill unnecessary bch2_dev_usage_read() bch2_dev_usage_read() is fairly expensive, we should optimize this more. Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_foreground.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/bcachefs/alloc_foreground.c b/fs/bcachefs/alloc_foreground.c index 95aafc232290..0cac65347a5d 100644 --- a/fs/bcachefs/alloc_foreground.c +++ b/fs/bcachefs/alloc_foreground.c @@ -631,7 +631,7 @@ static inline void bch2_dev_stripe_increment_inlined(struct bch_dev *ca, struct bch_dev_usage *usage) { u64 *v = stripe->next_alloc + ca->dev_idx; - u64 free_space = dev_buckets_available(ca, BCH_WATERMARK_normal); + u64 free_space = __dev_buckets_available(ca, *usage, BCH_WATERMARK_normal); u64 free_space_inv = free_space ? div64_u64(1ULL << 48, free_space) : 1ULL << 48; -- 2.50.1 From 5af61dbd96275e184adcfe615507b0f04ed7b328 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 25 Mar 2025 11:40:35 -0400 Subject: [PATCH 15/16] bcachefs: Fix nonce inconsistency in bch2_write_prep_encoded_data() If we're moving an extent that was partially overwritten, bch2_write_rechecksum() will trim it to the currenty live range. If we then also want to compress it, it'll be decrypted - but the nonce has been advanced for the overwritten start of the extent that we dropped, and we were using the nonce we calculated before rechecksum(). Reported-by: Gabriel de Perthuis Fixes: 127d90d2823e ("bcachefs: bch2_write_prep_encoded_data() now returns errcode") Signed-off-by: Kent Overstreet --- fs/bcachefs/io_write.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/bcachefs/io_write.c b/fs/bcachefs/io_write.c index 29671075e3f1..14c5c5c98d51 100644 --- a/fs/bcachefs/io_write.c +++ b/fs/bcachefs/io_write.c @@ -839,7 +839,6 @@ static noinline int bch2_write_prep_encoded_data(struct bch_write_op *op, struct { struct bch_fs *c = op->c; struct bio *bio = &op->wbio.bio; - struct nonce nonce = extent_nonce(op->version, op->crc); int ret = 0; BUG_ON(bio_sectors(bio) != op->crc.compressed_size); @@ -866,6 +865,7 @@ static noinline int bch2_write_prep_encoded_data(struct bch_write_op *op, struct */ if (crc_is_compressed(op->crc)) { /* Last point we can still verify checksum: */ + struct nonce nonce = extent_nonce(op->version, op->crc); struct bch_csum csum = bch2_checksum_bio(c, op->crc.csum_type, nonce, bio); if (bch2_crc_cmp(op->crc.csum, csum) && !c->opts.no_data_io) goto csum_err; @@ -905,6 +905,7 @@ static noinline int bch2_write_prep_encoded_data(struct bch_write_op *op, struct */ if (bch2_csum_type_is_encryption(op->crc.csum_type) && (op->compression_opt || op->crc.csum_type != op->csum_type)) { + struct nonce nonce = extent_nonce(op->version, op->crc); struct bch_csum csum = bch2_checksum_bio(c, op->crc.csum_type, nonce, bio); if (bch2_crc_cmp(op->crc.csum, csum) && !c->opts.no_data_io) goto csum_err; -- 2.50.1 From 3ba0240a8789f8c059990b81c6f34c29769a5a49 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 24 Mar 2025 11:51:01 -0400 Subject: [PATCH 16/16] bcachefs: Fix silent short reads in data read retry path __bch2_read, before calling __bch2_read_extent(), sets bvec_iter.bi_size to "the size we can read from the current extent" with a swap, and restores it to "the size for the total read" after the read_extent call with another swap. But we neglected to do the restore before the "if (ret) goto err;" - which is a problem if we're retrying those errors. Signed-off-by: Kent Overstreet --- fs/bcachefs/fs-io-buffered.c | 2 +- fs/bcachefs/io_read.c | 3 ++- fs/bcachefs/io_read.h | 6 ++++-- 3 files changed, 7 insertions(+), 4 deletions(-) diff --git a/fs/bcachefs/fs-io-buffered.c b/fs/bcachefs/fs-io-buffered.c index 5ab1c73c8d4c..a03e2c780cba 100644 --- a/fs/bcachefs/fs-io-buffered.c +++ b/fs/bcachefs/fs-io-buffered.c @@ -225,11 +225,11 @@ static void bchfs_read(struct btree_trans *trans, bch2_read_extent(trans, rbio, iter.pos, data_btree, k, offset_into_extent, flags); + swap(rbio->bio.bi_iter.bi_size, bytes); if (flags & BCH_READ_last_fragment) break; - swap(rbio->bio.bi_iter.bi_size, bytes); bio_advance(&rbio->bio, bytes); err: if (ret && diff --git a/fs/bcachefs/io_read.c b/fs/bcachefs/io_read.c index f1503df57dc7..fafd00a3d6c9 100644 --- a/fs/bcachefs/io_read.c +++ b/fs/bcachefs/io_read.c @@ -1322,13 +1322,14 @@ int __bch2_read(struct btree_trans *trans, struct bch_read_bio *rbio, ret = __bch2_read_extent(trans, rbio, bvec_iter, iter.pos, data_btree, k, offset_into_extent, failed, flags, -1); + swap(bvec_iter.bi_size, bytes); + if (ret) goto err; if (flags & BCH_READ_last_fragment) break; - swap(bvec_iter.bi_size, bytes); bio_advance_iter(&rbio->bio, &bvec_iter, bytes); err: if (ret == -BCH_ERR_data_read_retry_csum_err_maybe_userspace) diff --git a/fs/bcachefs/io_read.h b/fs/bcachefs/io_read.h index cd21950417f6..c78025d863e0 100644 --- a/fs/bcachefs/io_read.h +++ b/fs/bcachefs/io_read.h @@ -137,8 +137,10 @@ static inline void bch2_read_extent(struct btree_trans *trans, enum btree_id data_btree, struct bkey_s_c k, unsigned offset_into_extent, unsigned flags) { - __bch2_read_extent(trans, rbio, rbio->bio.bi_iter, read_pos, - data_btree, k, offset_into_extent, NULL, flags, -1); + int ret = __bch2_read_extent(trans, rbio, rbio->bio.bi_iter, read_pos, + data_btree, k, offset_into_extent, NULL, flags, -1); + /* __bch2_read_extent only returns errors if BCH_READ_in_retry is set */ + WARN(ret, "unhandled error from __bch2_read_extent()"); } int __bch2_read(struct btree_trans *, struct bch_read_bio *, struct bvec_iter, -- 2.50.1