From 8a9f3d058279ed0f99114e0449d129fb5abc5eca Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 20 Mar 2025 10:16:48 -0400 Subject: [PATCH 01/16] bcachefs: EIO cleanup Replace these with proper private error codes, so that when we get an error message we're not sifting through the entire codebase to see where it came from. Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_background.c | 4 +-- fs/bcachefs/alloc_foreground.c | 4 +-- fs/bcachefs/alloc_foreground.h | 2 +- fs/bcachefs/checksum.c | 2 +- fs/bcachefs/compress.c | 56 ++++++++++++++++++---------------- fs/bcachefs/data_update.c | 2 +- fs/bcachefs/ec.c | 16 +++++----- fs/bcachefs/ec.h | 2 +- fs/bcachefs/errcode.h | 18 ++++++++++- fs/bcachefs/inode.c | 4 +-- fs/bcachefs/io_write.c | 14 ++++----- fs/bcachefs/journal_io.c | 2 +- fs/bcachefs/journal_reclaim.c | 6 ++-- 13 files changed, 73 insertions(+), 59 deletions(-) diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index 54e0cc373bb1..2828baa9b162 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -837,7 +837,7 @@ int bch2_trigger_alloc(struct btree_trans *trans, struct bch_dev *ca = bch2_dev_bucket_tryget(c, new.k->p); if (!ca) - return -EIO; + return -BCH_ERR_trigger_alloc; struct bch_alloc_v4 old_a_convert; const struct bch_alloc_v4 *old_a = bch2_alloc_to_v4(old, &old_a_convert); @@ -1031,7 +1031,7 @@ fsck_err: invalid_bucket: bch2_fs_inconsistent(c, "reference to invalid bucket\n %s", (bch2_bkey_val_to_text(&buf, c, new.s_c), buf.buf)); - ret = -EIO; + ret = -BCH_ERR_trigger_alloc; goto err; } diff --git a/fs/bcachefs/alloc_foreground.c b/fs/bcachefs/alloc_foreground.c index 1759c15a7745..95aafc232290 100644 --- a/fs/bcachefs/alloc_foreground.c +++ b/fs/bcachefs/alloc_foreground.c @@ -127,14 +127,14 @@ void __bch2_open_bucket_put(struct bch_fs *c, struct open_bucket *ob) void bch2_open_bucket_write_error(struct bch_fs *c, struct open_buckets *obs, - unsigned dev) + unsigned dev, int err) { struct open_bucket *ob; unsigned i; open_bucket_for_each(c, obs, ob, i) if (ob->dev == dev && ob->ec) - bch2_ec_bucket_cancel(c, ob); + bch2_ec_bucket_cancel(c, ob, err); } static struct open_bucket *bch2_open_bucket_alloc(struct bch_fs *c) diff --git a/fs/bcachefs/alloc_foreground.h b/fs/bcachefs/alloc_foreground.h index baf5dc163c8a..69ec6a012898 100644 --- a/fs/bcachefs/alloc_foreground.h +++ b/fs/bcachefs/alloc_foreground.h @@ -82,7 +82,7 @@ static inline struct open_bucket *ec_open_bucket(struct bch_fs *c, } void bch2_open_bucket_write_error(struct bch_fs *, - struct open_buckets *, unsigned); + struct open_buckets *, unsigned, int); void __bch2_open_bucket_put(struct bch_fs *, struct open_bucket *); diff --git a/fs/bcachefs/checksum.c b/fs/bcachefs/checksum.c index 7f9e4c59950c..3726689093e3 100644 --- a/fs/bcachefs/checksum.c +++ b/fs/bcachefs/checksum.c @@ -466,7 +466,7 @@ int bch2_rechecksum_bio(struct bch_fs *c, struct bio *bio, prt_str(&buf, ")"); WARN_RATELIMIT(1, "%s", buf.buf); printbuf_exit(&buf); - return -EIO; + return -BCH_ERR_recompute_checksum; } for (i = splits; i < splits + ARRAY_SIZE(splits); i++) { diff --git a/fs/bcachefs/compress.c b/fs/bcachefs/compress.c index 91483f83eb59..85fc90342492 100644 --- a/fs/bcachefs/compress.c +++ b/fs/bcachefs/compress.c @@ -177,7 +177,7 @@ static int __bio_uncompress(struct bch_fs *c, struct bio *src, size_t src_len = src->bi_iter.bi_size; size_t dst_len = crc.uncompressed_size << 9; void *workspace; - int ret; + int ret = 0, ret2; enum bch_compression_opts opt = bch2_compression_type_to_opt(crc.compression_type); mempool_t *workspace_pool = &c->compress_workspace[opt]; @@ -189,7 +189,7 @@ static int __bio_uncompress(struct bch_fs *c, struct bio *src, else ret = -BCH_ERR_compression_workspace_not_initialized; if (ret) - goto out; + goto err; } src_data = bio_map_or_bounce(c, src, READ); @@ -197,10 +197,10 @@ static int __bio_uncompress(struct bch_fs *c, struct bio *src, switch (crc.compression_type) { case BCH_COMPRESSION_TYPE_lz4_old: case BCH_COMPRESSION_TYPE_lz4: - ret = LZ4_decompress_safe_partial(src_data.b, dst_data, - src_len, dst_len, dst_len); - if (ret != dst_len) - goto err; + ret2 = LZ4_decompress_safe_partial(src_data.b, dst_data, + src_len, dst_len, dst_len); + if (ret2 != dst_len) + ret = -BCH_ERR_decompress_lz4; break; case BCH_COMPRESSION_TYPE_gzip: { z_stream strm = { @@ -214,45 +214,43 @@ static int __bio_uncompress(struct bch_fs *c, struct bio *src, zlib_set_workspace(&strm, workspace); zlib_inflateInit2(&strm, -MAX_WBITS); - ret = zlib_inflate(&strm, Z_FINISH); + ret2 = zlib_inflate(&strm, Z_FINISH); mempool_free(workspace, workspace_pool); - if (ret != Z_STREAM_END) - goto err; + if (ret2 != Z_STREAM_END) + ret = -BCH_ERR_decompress_gzip; break; } case BCH_COMPRESSION_TYPE_zstd: { ZSTD_DCtx *ctx; size_t real_src_len = le32_to_cpup(src_data.b); - if (real_src_len > src_len - 4) + if (real_src_len > src_len - 4) { + ret = -BCH_ERR_decompress_zstd_src_len_bad; goto err; + } workspace = mempool_alloc(workspace_pool, GFP_NOFS); ctx = zstd_init_dctx(workspace, zstd_dctx_workspace_bound()); - ret = zstd_decompress_dctx(ctx, + ret2 = zstd_decompress_dctx(ctx, dst_data, dst_len, src_data.b + 4, real_src_len); mempool_free(workspace, workspace_pool); - if (ret != dst_len) - goto err; + if (ret2 != dst_len) + ret = -BCH_ERR_decompress_zstd; break; } default: BUG(); } - ret = 0; +err: fsck_err: -out: bio_unmap_or_unbounce(c, src_data); return ret; -err: - ret = -EIO; - goto out; } int bch2_bio_uncompress_inplace(struct bch_write_op *op, @@ -268,18 +266,22 @@ int bch2_bio_uncompress_inplace(struct bch_write_op *op, BUG_ON(!bio->bi_vcnt); BUG_ON(DIV_ROUND_UP(crc->live_size, PAGE_SECTORS) > bio->bi_max_vecs); - if (crc->uncompressed_size << 9 > c->opts.encoded_extent_max || - crc->compressed_size << 9 > c->opts.encoded_extent_max) { - bch2_write_op_error(op, op->pos.offset, "extent too big to decompress"); - return -EIO; + if (crc->uncompressed_size << 9 > c->opts.encoded_extent_max) { + bch2_write_op_error(op, op->pos.offset, + "extent too big to decompress (%u > %u)", + crc->uncompressed_size << 9, c->opts.encoded_extent_max); + return -BCH_ERR_decompress_exceeded_max_encoded_extent; } data = __bounce_alloc(c, dst_len, WRITE); - if (__bio_uncompress(c, bio, data.b, *crc)) { - if (!c->opts.no_data_io) - bch2_write_op_error(op, op->pos.offset, "decompression error"); - ret = -EIO; + ret = __bio_uncompress(c, bio, data.b, *crc); + + if (c->opts.no_data_io) + ret = 0; + + if (ret) { + bch2_write_op_error(op, op->pos.offset, "%s", bch2_err_str(ret)); goto err; } @@ -312,7 +314,7 @@ int bch2_bio_uncompress(struct bch_fs *c, struct bio *src, if (crc.uncompressed_size << 9 > c->opts.encoded_extent_max || crc.compressed_size << 9 > c->opts.encoded_extent_max) - return -EIO; + return -BCH_ERR_decompress_exceeded_max_encoded_extent; dst_data = dst_len == dst_iter.bi_size ? __bio_map_or_bounce(c, dst, dst_iter, WRITE) diff --git a/fs/bcachefs/data_update.c b/fs/bcachefs/data_update.c index 08bb7f3019ce..0ec273daccb7 100644 --- a/fs/bcachefs/data_update.c +++ b/fs/bcachefs/data_update.c @@ -354,7 +354,7 @@ restart_drop_extra_replicas: printbuf_exit(&buf); bch2_fatal_error(c); - ret = -EIO; + ret = -BCH_ERR_invalid_bkey; goto out; } diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c index c73ba73f6890..f2b9225fe0bc 100644 --- a/fs/bcachefs/ec.c +++ b/fs/bcachefs/ec.c @@ -1124,7 +1124,7 @@ static int ec_stripe_update_extent(struct btree_trans *trans, bch2_fs_inconsistent(c, "%s", buf.buf); printbuf_exit(&buf); - return -EIO; + return -BCH_ERR_erasure_coding_found_btree_node; } k = bch2_backpointer_get_key(trans, bp, &iter, BTREE_ITER_intent, last_flushed); @@ -1190,7 +1190,7 @@ static int ec_stripe_update_bucket(struct btree_trans *trans, struct ec_stripe_b struct bch_dev *ca = bch2_dev_tryget(c, ptr.dev); if (!ca) - return -EIO; + return -BCH_ERR_ENOENT_dev_not_found; struct bpos bucket_pos = PTR_BUCKET_POS(ca, &ptr); @@ -1227,21 +1227,19 @@ static int ec_stripe_update_extents(struct bch_fs *c, struct ec_stripe_buf *s) { struct btree_trans *trans = bch2_trans_get(c); struct bch_stripe *v = &bkey_i_to_stripe(&s->key)->v; - unsigned i, nr_data = v->nr_blocks - v->nr_redundant; - int ret = 0; + unsigned nr_data = v->nr_blocks - v->nr_redundant; - ret = bch2_btree_write_buffer_flush_sync(trans); + int ret = bch2_btree_write_buffer_flush_sync(trans); if (ret) goto err; - for (i = 0; i < nr_data; i++) { + for (unsigned i = 0; i < nr_data; i++) { ret = ec_stripe_update_bucket(trans, s, i); if (ret) break; } err: bch2_trans_put(trans); - return ret; } @@ -1451,11 +1449,11 @@ static void ec_stripe_new_cancel(struct bch_fs *c, struct ec_stripe_head *h, int ec_stripe_new_set_pending(c, h); } -void bch2_ec_bucket_cancel(struct bch_fs *c, struct open_bucket *ob) +void bch2_ec_bucket_cancel(struct bch_fs *c, struct open_bucket *ob, int err) { struct ec_stripe_new *s = ob->ec; - s->err = -EIO; + s->err = err; } void *bch2_writepoint_ec_buf(struct bch_fs *c, struct write_point *wp) diff --git a/fs/bcachefs/ec.h b/fs/bcachefs/ec.h index 8f2228e59eda..62d27e04d763 100644 --- a/fs/bcachefs/ec.h +++ b/fs/bcachefs/ec.h @@ -249,7 +249,7 @@ int bch2_ec_read_extent(struct btree_trans *, struct bch_read_bio *, struct bkey void *bch2_writepoint_ec_buf(struct bch_fs *, struct write_point *); -void bch2_ec_bucket_cancel(struct bch_fs *, struct open_bucket *); +void bch2_ec_bucket_cancel(struct bch_fs *, struct open_bucket *, int); int bch2_ec_stripe_new_alloc(struct bch_fs *, struct ec_stripe_head *); diff --git a/fs/bcachefs/errcode.h b/fs/bcachefs/errcode.h index c179954aaf33..101806d7ebe1 100644 --- a/fs/bcachefs/errcode.h +++ b/fs/bcachefs/errcode.h @@ -116,6 +116,7 @@ x(ENOENT, ENOENT_snapshot_tree) \ x(ENOENT, ENOENT_dirent_doesnt_match_inode) \ x(ENOENT, ENOENT_dev_not_found) \ + x(ENOENT, ENOENT_dev_bucket_not_found) \ x(ENOENT, ENOENT_dev_idx_not_found) \ x(ENOENT, ENOENT_inode_no_backpointer) \ x(ENOENT, ENOENT_no_snapshot_tree_subvol) \ @@ -207,6 +208,7 @@ x(EINVAL, no_resize_with_buckets_nouse) \ x(EINVAL, inode_unpack_error) \ x(EINVAL, varint_decode_error) \ + x(EINVAL, erasure_coding_found_btree_node) \ x(EOPNOTSUPP, may_not_use_incompat_feature) \ x(EROFS, erofs_trans_commit) \ x(EROFS, erofs_no_writes) \ @@ -267,6 +269,7 @@ x(BCH_ERR_operation_blocked, nocow_lock_blocked) \ x(EIO, journal_shutdown) \ x(EIO, journal_flush_err) \ + x(EIO, journal_write_err) \ x(EIO, btree_node_read_err) \ x(BCH_ERR_btree_node_read_err, btree_node_read_err_cached) \ x(EIO, sb_not_downgraded) \ @@ -275,6 +278,7 @@ x(EIO, btree_node_read_validate_error) \ x(EIO, btree_need_topology_repair) \ x(EIO, bucket_ref_update) \ + x(EIO, trigger_alloc) \ x(EIO, trigger_pointer) \ x(EIO, trigger_stripe_pointer) \ x(EIO, metadata_bucket_inconsistency) \ @@ -290,7 +294,19 @@ x(EIO, EIO_fault_injected) \ x(EIO, ec_block_read) \ x(EIO, ec_block_write) \ - x(EIO, data_read) \ + x(EIO, recompute_checksum) \ + x(EIO, decompress) \ + x(BCH_ERR_decompress, decompress_exceeded_max_encoded_extent) \ + x(BCH_ERR_decompress, decompress_lz4) \ + x(BCH_ERR_decompress, decompress_gzip) \ + x(BCH_ERR_decompress, decompress_zstd_src_len_bad) \ + x(BCH_ERR_decompress, decompress_zstd) \ + x(EIO, data_write) \ + x(BCH_ERR_data_write, data_write_io) \ + x(BCH_ERR_data_write, data_write_csum) \ + x(BCH_ERR_data_write, data_write_invalid_ptr) \ + x(BCH_ERR_data_write, data_write_misaligned) \ + x(BCH_ERR_decompress, data_read) \ x(BCH_ERR_data_read, no_device_to_read_from) \ x(BCH_ERR_data_read, data_read_io_err) \ x(BCH_ERR_data_read, data_read_csum_err) \ diff --git a/fs/bcachefs/inode.c b/fs/bcachefs/inode.c index 7aca010e2e10..1383fdcc42a5 100644 --- a/fs/bcachefs/inode.c +++ b/fs/bcachefs/inode.c @@ -1079,7 +1079,7 @@ retry: bch2_fs_inconsistent(c, "inode %llu:%u not found when deleting", inum.inum, snapshot); - ret = -EIO; + ret = -BCH_ERR_ENOENT_inode; goto err; } @@ -1243,7 +1243,7 @@ retry: bch2_fs_inconsistent(c, "inode %llu:%u not found when deleting", inum, snapshot); - ret = -EIO; + ret = -BCH_ERR_ENOENT_inode; goto err; } diff --git a/fs/bcachefs/io_write.c b/fs/bcachefs/io_write.c index a861f786c3db..29671075e3f1 100644 --- a/fs/bcachefs/io_write.c +++ b/fs/bcachefs/io_write.c @@ -535,7 +535,7 @@ static noinline int bch2_write_drop_io_error_ptrs(struct bch_write_op *op) test_bit(ptr->dev, op->failed.d)); if (!bch2_bkey_nr_ptrs(bkey_i_to_s_c(src))) - return -EIO; + return -BCH_ERR_data_write_io; } if (dst != src) @@ -589,7 +589,7 @@ static void __bch2_write_index(struct bch_write_op *op) out: /* If some a bucket wasn't written, we can't erasure code it: */ for_each_set_bit(dev, op->failed.d, BCH_SB_MEMBERS_MAX) - bch2_open_bucket_write_error(c, &op->open_buckets, dev); + bch2_open_bucket_write_error(c, &op->open_buckets, dev, -BCH_ERR_data_write_io); bch2_open_buckets_put(c, &op->open_buckets); return; @@ -920,7 +920,7 @@ static noinline int bch2_write_prep_encoded_data(struct bch_write_op *op, struct return 0; csum_err: bch2_write_csum_err_msg(op); - return -EIO; + return -BCH_ERR_data_write_csum; } static int bch2_write_extent(struct bch_write_op *op, struct write_point *wp, @@ -1127,7 +1127,7 @@ do_write: return more; csum_err: bch2_write_csum_err_msg(op); - ret = -EIO; + ret = -BCH_ERR_data_write_csum; err: if (to_wbio(dst)->bounce) bch2_bio_free_pages_pool(c, dst); @@ -1233,7 +1233,7 @@ static void bch2_nocow_write_convert_unwritten(struct bch_write_op *op) static void __bch2_nocow_write_done(struct bch_write_op *op) { if (unlikely(op->flags & BCH_WRITE_io_error)) { - op->error = -EIO; + op->error = -BCH_ERR_data_write_io; } else if (unlikely(op->flags & BCH_WRITE_convert_unwritten)) bch2_nocow_write_convert_unwritten(op); } @@ -1424,7 +1424,7 @@ err_bucket_stale: "pointer to invalid bucket in nocow path on device %llu\n %s", stale_at->b.inode, (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { - ret = -EIO; + ret = -BCH_ERR_data_write_invalid_ptr; } else { /* We can retry this: */ ret = -BCH_ERR_transaction_restart; @@ -1632,7 +1632,7 @@ CLOSURE_CALLBACK(bch2_write) if (unlikely(bio->bi_iter.bi_size & (c->opts.block_size - 1))) { bch2_write_op_error(op, op->pos.offset, "misaligned write"); - op->error = -EIO; + op->error = -BCH_ERR_data_write_misaligned; goto err; } diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c index cf2700b06d58..4ed6137f0439 100644 --- a/fs/bcachefs/journal_io.c +++ b/fs/bcachefs/journal_io.c @@ -1624,7 +1624,7 @@ static CLOSURE_CALLBACK(journal_write_done) if (!w->devs_written.nr) { bch_err(c, "unable to write journal to sufficient devices"); - err = -EIO; + err = -BCH_ERR_journal_write_err; } else { bch2_devlist_to_replicas(&replicas.e, BCH_DATA_journal, w->devs_written); diff --git a/fs/bcachefs/journal_reclaim.c b/fs/bcachefs/journal_reclaim.c index 3ed31492e1aa..5d1547aa118a 100644 --- a/fs/bcachefs/journal_reclaim.c +++ b/fs/bcachefs/journal_reclaim.c @@ -645,7 +645,6 @@ static u64 journal_seq_to_flush(struct journal *j) * @j: journal object * @direct: direct or background reclaim? * @kicked: requested to run since we last ran? - * Returns: 0 on success, or -EIO if the journal has been shutdown * * Background journal reclaim writes out btree nodes. It should be run * early enough so that we never completely run out of journal buckets. @@ -685,10 +684,9 @@ static int __bch2_journal_reclaim(struct journal *j, bool direct, bool kicked) if (kthread && kthread_should_stop()) break; - if (bch2_journal_error(j)) { - ret = -EIO; + ret = bch2_journal_error(j); + if (ret) break; - } bch2_journal_do_discards(j); -- 2.51.0 From 4fcd4de0a659a1b9b151d9f88e6ec67a6c05fba5 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 20 Mar 2025 10:53:52 -0400 Subject: [PATCH 02/16] bcachefs: fs-common.c -> namei.c name <-> inode, code for managing the relationships between inodes and dirents. Signed-off-by: Kent Overstreet --- fs/bcachefs/Makefile | 2 +- fs/bcachefs/error.c | 2 +- fs/bcachefs/fs-ioctl.c | 2 +- fs/bcachefs/fs.c | 2 +- fs/bcachefs/fsck.c | 2 +- fs/bcachefs/{fs-common.c => namei.c} | 2 +- fs/bcachefs/{fs-common.h => namei.h} | 6 +++--- fs/bcachefs/recovery.c | 2 +- 8 files changed, 10 insertions(+), 10 deletions(-) rename fs/bcachefs/{fs-common.c => namei.c} (99%) rename fs/bcachefs/{fs-common.h => namei.h} (93%) diff --git a/fs/bcachefs/Makefile b/fs/bcachefs/Makefile index 1cf17a16af9f..9af65079374f 100644 --- a/fs/bcachefs/Makefile +++ b/fs/bcachefs/Makefile @@ -41,7 +41,6 @@ bcachefs-y := \ extent_update.o \ eytzinger.o \ fs.o \ - fs-common.o \ fs-ioctl.o \ fs-io.o \ fs-io-buffered.o \ @@ -64,6 +63,7 @@ bcachefs-y := \ migrate.o \ move.o \ movinggc.o \ + namei.o \ nocow_locking.o \ opts.o \ printbuf.o \ diff --git a/fs/bcachefs/error.c b/fs/bcachefs/error.c index 6d68c89a49b2..207f35d3cce2 100644 --- a/fs/bcachefs/error.c +++ b/fs/bcachefs/error.c @@ -3,8 +3,8 @@ #include "btree_cache.h" #include "btree_iter.h" #include "error.h" -#include "fs-common.h" #include "journal.h" +#include "namei.h" #include "recovery_passes.h" #include "super.h" #include "thread_with_file.h" diff --git a/fs/bcachefs/fs-ioctl.c b/fs/bcachefs/fs-ioctl.c index 5b47b94fe1ea..e3a3230fc652 100644 --- a/fs/bcachefs/fs-ioctl.c +++ b/fs/bcachefs/fs-ioctl.c @@ -5,8 +5,8 @@ #include "chardev.h" #include "dirent.h" #include "fs.h" -#include "fs-common.h" #include "fs-ioctl.h" +#include "namei.h" #include "quota.h" #include diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c index 4453dd2f888e..273078ceb4df 100644 --- a/fs/bcachefs/fs.c +++ b/fs/bcachefs/fs.c @@ -11,7 +11,6 @@ #include "errcode.h" #include "extents.h" #include "fs.h" -#include "fs-common.h" #include "fs-io.h" #include "fs-ioctl.h" #include "fs-io-buffered.h" @@ -22,6 +21,7 @@ #include "io_read.h" #include "journal.h" #include "keylist.h" +#include "namei.h" #include "quota.h" #include "rebalance.h" #include "snapshot.h" diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index 0e85131d0af8..4271ce4a4c8c 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -10,10 +10,10 @@ #include "dirent.h" #include "error.h" #include "fs.h" -#include "fs-common.h" #include "fsck.h" #include "inode.h" #include "keylist.h" +#include "namei.h" #include "recovery_passes.h" #include "snapshot.h" #include "super.h" diff --git a/fs/bcachefs/fs-common.c b/fs/bcachefs/namei.c similarity index 99% rename from fs/bcachefs/fs-common.c rename to fs/bcachefs/namei.c index fbc3da59536c..bc83acbf5414 100644 --- a/fs/bcachefs/fs-common.c +++ b/fs/bcachefs/namei.c @@ -4,8 +4,8 @@ #include "acl.h" #include "btree_update.h" #include "dirent.h" -#include "fs-common.h" #include "inode.h" +#include "namei.h" #include "subvolume.h" #include "xattr.h" diff --git a/fs/bcachefs/fs-common.h b/fs/bcachefs/namei.h similarity index 93% rename from fs/bcachefs/fs-common.h rename to fs/bcachefs/namei.h index 2b59210bb5e8..7383b76270e9 100644 --- a/fs/bcachefs/fs-common.h +++ b/fs/bcachefs/namei.h @@ -1,6 +1,6 @@ /* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_FS_COMMON_H -#define _BCACHEFS_FS_COMMON_H +#ifndef _BCACHEFS_NAMEI_H +#define _BCACHEFS_NAMEI_H #include "dirent.h" @@ -44,4 +44,4 @@ bool bch2_reinherit_attrs(struct bch_inode_unpacked *, int bch2_inum_to_path(struct btree_trans *, subvol_inum, struct printbuf *); -#endif /* _BCACHEFS_FS_COMMON_H */ +#endif /* _BCACHEFS_NAMEI_H */ diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index a6e26733854d..266c5770c824 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -13,12 +13,12 @@ #include "disk_accounting.h" #include "errcode.h" #include "error.h" -#include "fs-common.h" #include "journal_io.h" #include "journal_reclaim.h" #include "journal_seq_blacklist.h" #include "logged_ops.h" #include "move.h" +#include "namei.h" #include "quota.h" #include "rebalance.h" #include "recovery.h" -- 2.51.0 From 758ea4ff812b4dfd4cef6dba0eb4b0025a7b147e Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 20 Mar 2025 11:06:50 -0400 Subject: [PATCH 03/16] bcachefs: Move bch2_check_dirent_target() to namei.c We're gradually running more and more fsck.c checks at runtime, whereever applicable; when we do so they get moved out of fsck.c. Next patch will call bch2_check_dirent_target() from bch2_lookup(). Signed-off-by: Kent Overstreet --- fs/bcachefs/dirent.c | 51 ++++++++++ fs/bcachefs/dirent.h | 2 + fs/bcachefs/fsck.c | 229 +------------------------------------------ fs/bcachefs/namei.c | 173 ++++++++++++++++++++++++++++++++ fs/bcachefs/namei.h | 5 + 5 files changed, 236 insertions(+), 224 deletions(-) diff --git a/fs/bcachefs/dirent.c b/fs/bcachefs/dirent.c index f4c283d1e86a..d7f9f79318a2 100644 --- a/fs/bcachefs/dirent.c +++ b/fs/bcachefs/dirent.c @@ -729,3 +729,54 @@ int bch2_readdir(struct bch_fs *c, subvol_inum inum, struct dir_context *ctx) return ret < 0 ? ret : 0; } + +/* fsck */ + +static int lookup_first_inode(struct btree_trans *trans, u64 inode_nr, + struct bch_inode_unpacked *inode) +{ + struct btree_iter iter; + struct bkey_s_c k; + int ret; + + for_each_btree_key_norestart(trans, iter, BTREE_ID_inodes, POS(0, inode_nr), + BTREE_ITER_all_snapshots, k, ret) { + if (k.k->p.offset != inode_nr) + break; + if (!bkey_is_inode(k.k)) + continue; + ret = bch2_inode_unpack(k, inode); + goto found; + } + ret = -BCH_ERR_ENOENT_inode; +found: + bch_err_msg(trans->c, ret, "fetching inode %llu", inode_nr); + bch2_trans_iter_exit(trans, &iter); + return ret; +} + +int bch2_fsck_remove_dirent(struct btree_trans *trans, struct bpos pos) +{ + struct bch_fs *c = trans->c; + struct btree_iter iter; + struct bch_inode_unpacked dir_inode; + struct bch_hash_info dir_hash_info; + int ret; + + ret = lookup_first_inode(trans, pos.inode, &dir_inode); + if (ret) + goto err; + + dir_hash_info = bch2_hash_info_init(c, &dir_inode); + + bch2_trans_iter_init(trans, &iter, BTREE_ID_dirents, pos, BTREE_ITER_intent); + + ret = bch2_btree_iter_traverse(&iter) ?: + bch2_hash_delete_at(trans, bch2_dirent_hash_desc, + &dir_hash_info, &iter, + BTREE_UPDATE_internal_snapshot_node); + bch2_trans_iter_exit(trans, &iter); +err: + bch_err_fn(c, ret); + return ret; +} diff --git a/fs/bcachefs/dirent.h b/fs/bcachefs/dirent.h index a6e15a012936..0880772b80a9 100644 --- a/fs/bcachefs/dirent.h +++ b/fs/bcachefs/dirent.h @@ -82,4 +82,6 @@ int bch2_empty_dir_snapshot(struct btree_trans *, u64, u32, u32); int bch2_empty_dir_trans(struct btree_trans *, subvol_inum); int bch2_readdir(struct bch_fs *, subvol_inum, struct dir_context *); +int bch2_fsck_remove_dirent(struct btree_trans *, struct bpos); + #endif /* _BCACHEFS_DIRENT_H */ diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index 4271ce4a4c8c..f3853b741937 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -23,13 +23,6 @@ #include #include /* struct qstr */ -static bool inode_points_to_dirent(struct bch_inode_unpacked *inode, - struct bkey_s_c_dirent d) -{ - return inode->bi_dir == d.k->p.inode && - inode->bi_dir_offset == d.k->p.offset; -} - static int dirent_points_to_inode_nowarn(struct bkey_s_c_dirent d, struct bch_inode_unpacked *inode) { @@ -116,29 +109,6 @@ static int subvol_lookup(struct btree_trans *trans, u32 subvol, return ret; } -static int lookup_first_inode(struct btree_trans *trans, u64 inode_nr, - struct bch_inode_unpacked *inode) -{ - struct btree_iter iter; - struct bkey_s_c k; - int ret; - - for_each_btree_key_norestart(trans, iter, BTREE_ID_inodes, POS(0, inode_nr), - BTREE_ITER_all_snapshots, k, ret) { - if (k.k->p.offset != inode_nr) - break; - if (!bkey_is_inode(k.k)) - continue; - ret = bch2_inode_unpack(k, inode); - goto found; - } - ret = -BCH_ERR_ENOENT_inode; -found: - bch_err_msg(trans->c, ret, "fetching inode %llu", inode_nr); - bch2_trans_iter_exit(trans, &iter); - return ret; -} - static int lookup_inode(struct btree_trans *trans, u64 inode_nr, u32 snapshot, struct bch_inode_unpacked *inode) { @@ -179,32 +149,6 @@ static int lookup_dirent_in_snapshot(struct btree_trans *trans, return 0; } -static int __remove_dirent(struct btree_trans *trans, struct bpos pos) -{ - struct bch_fs *c = trans->c; - struct btree_iter iter; - struct bch_inode_unpacked dir_inode; - struct bch_hash_info dir_hash_info; - int ret; - - ret = lookup_first_inode(trans, pos.inode, &dir_inode); - if (ret) - goto err; - - dir_hash_info = bch2_hash_info_init(c, &dir_inode); - - bch2_trans_iter_init(trans, &iter, BTREE_ID_dirents, pos, BTREE_ITER_intent); - - ret = bch2_btree_iter_traverse(&iter) ?: - bch2_hash_delete_at(trans, bch2_dirent_hash_desc, - &dir_hash_info, &iter, - BTREE_UPDATE_internal_snapshot_node); - bch2_trans_iter_exit(trans, &iter); -err: - bch_err_fn(c, ret); - return ret; -} - /* * Find any subvolume associated with a tree of snapshots * We can't rely on master_subvol - it might have been deleted. @@ -548,7 +492,7 @@ static int remove_backpointer(struct btree_trans *trans, SPOS(inode->bi_dir, inode->bi_dir_offset, inode->bi_snapshot)); int ret = bkey_err(d) ?: dirent_points_to_inode(c, d, inode) ?: - __remove_dirent(trans, d.k->p); + bch2_fsck_remove_dirent(trans, d.k->p); bch2_trans_iter_exit(trans, &iter); return ret; } @@ -1985,169 +1929,6 @@ static int check_subdir_dirents_count(struct btree_trans *trans, struct inode_wa trans_was_restarted(trans, restart_count); } -noinline_for_stack -static int check_dirent_inode_dirent(struct btree_trans *trans, - struct btree_iter *iter, - struct bkey_s_c_dirent d, - struct bch_inode_unpacked *target) -{ - struct bch_fs *c = trans->c; - struct printbuf buf = PRINTBUF; - struct btree_iter bp_iter = { NULL }; - int ret = 0; - - if (inode_points_to_dirent(target, d)) - return 0; - - if (!target->bi_dir && - !target->bi_dir_offset) { - fsck_err_on(S_ISDIR(target->bi_mode), - trans, inode_dir_missing_backpointer, - "directory with missing backpointer\n%s", - (printbuf_reset(&buf), - bch2_bkey_val_to_text(&buf, c, d.s_c), - prt_printf(&buf, "\n"), - bch2_inode_unpacked_to_text(&buf, target), - buf.buf)); - - fsck_err_on(target->bi_flags & BCH_INODE_unlinked, - trans, inode_unlinked_but_has_dirent, - "inode unlinked but has dirent\n%s", - (printbuf_reset(&buf), - bch2_bkey_val_to_text(&buf, c, d.s_c), - prt_printf(&buf, "\n"), - bch2_inode_unpacked_to_text(&buf, target), - buf.buf)); - - target->bi_flags &= ~BCH_INODE_unlinked; - target->bi_dir = d.k->p.inode; - target->bi_dir_offset = d.k->p.offset; - return __bch2_fsck_write_inode(trans, target); - } - - if (bch2_inode_should_have_single_bp(target) && - !fsck_err(trans, inode_wrong_backpointer, - "dirent points to inode that does not point back:\n %s", - (bch2_bkey_val_to_text(&buf, c, d.s_c), - prt_printf(&buf, "\n "), - bch2_inode_unpacked_to_text(&buf, target), - buf.buf))) - goto err; - - struct bkey_s_c_dirent bp_dirent = dirent_get_by_pos(trans, &bp_iter, - SPOS(target->bi_dir, target->bi_dir_offset, target->bi_snapshot)); - ret = bkey_err(bp_dirent); - if (ret && !bch2_err_matches(ret, ENOENT)) - goto err; - - bool backpointer_exists = !ret; - ret = 0; - - if (fsck_err_on(!backpointer_exists, - trans, inode_wrong_backpointer, - "inode %llu:%u has wrong backpointer:\n" - "got %llu:%llu\n" - "should be %llu:%llu", - target->bi_inum, target->bi_snapshot, - target->bi_dir, - target->bi_dir_offset, - d.k->p.inode, - d.k->p.offset)) { - target->bi_dir = d.k->p.inode; - target->bi_dir_offset = d.k->p.offset; - ret = __bch2_fsck_write_inode(trans, target); - goto out; - } - - bch2_bkey_val_to_text(&buf, c, d.s_c); - prt_newline(&buf); - if (backpointer_exists) - bch2_bkey_val_to_text(&buf, c, bp_dirent.s_c); - - if (fsck_err_on(backpointer_exists && - (S_ISDIR(target->bi_mode) || - target->bi_subvol), - trans, inode_dir_multiple_links, - "%s %llu:%u with multiple links\n%s", - S_ISDIR(target->bi_mode) ? "directory" : "subvolume", - target->bi_inum, target->bi_snapshot, buf.buf)) { - ret = __remove_dirent(trans, d.k->p); - goto out; - } - - /* - * hardlinked file with nlink 0: - * We're just adjusting nlink here so check_nlinks() will pick - * it up, it ignores inodes with nlink 0 - */ - if (fsck_err_on(backpointer_exists && !target->bi_nlink, - trans, inode_multiple_links_but_nlink_0, - "inode %llu:%u type %s has multiple links but i_nlink 0\n%s", - target->bi_inum, target->bi_snapshot, bch2_d_types[d.v->d_type], buf.buf)) { - target->bi_nlink++; - target->bi_flags &= ~BCH_INODE_unlinked; - ret = __bch2_fsck_write_inode(trans, target); - if (ret) - goto err; - } -out: -err: -fsck_err: - bch2_trans_iter_exit(trans, &bp_iter); - printbuf_exit(&buf); - bch_err_fn(c, ret); - return ret; -} - -noinline_for_stack -static int check_dirent_target(struct btree_trans *trans, - struct btree_iter *iter, - struct bkey_s_c_dirent d, - struct bch_inode_unpacked *target) -{ - struct bch_fs *c = trans->c; - struct bkey_i_dirent *n; - struct printbuf buf = PRINTBUF; - int ret = 0; - - ret = check_dirent_inode_dirent(trans, iter, d, target); - if (ret) - goto err; - - if (fsck_err_on(d.v->d_type != inode_d_type(target), - trans, dirent_d_type_wrong, - "incorrect d_type: got %s, should be %s:\n%s", - bch2_d_type_str(d.v->d_type), - bch2_d_type_str(inode_d_type(target)), - (printbuf_reset(&buf), - bch2_bkey_val_to_text(&buf, c, d.s_c), buf.buf))) { - n = bch2_trans_kmalloc(trans, bkey_bytes(d.k)); - ret = PTR_ERR_OR_ZERO(n); - if (ret) - goto err; - - bkey_reassemble(&n->k_i, d.s_c); - n->v.d_type = inode_d_type(target); - if (n->v.d_type == DT_SUBVOL) { - n->v.d_parent_subvol = cpu_to_le32(target->bi_parent_subvol); - n->v.d_child_subvol = cpu_to_le32(target->bi_subvol); - } else { - n->v.d_inum = cpu_to_le64(target->bi_inum); - } - - ret = bch2_trans_update(trans, iter, &n->k_i, 0); - if (ret) - goto err; - - d = dirent_i_to_s_c(n); - } -err: -fsck_err: - printbuf_exit(&buf); - bch_err_fn(c, ret); - return ret; -} - /* find a subvolume that's a descendent of @snapshot: */ static int find_snapshot_subvol(struct btree_trans *trans, u32 snapshot, u32 *subvolid) { @@ -2247,7 +2028,7 @@ static int check_dirent_to_subvol(struct btree_trans *trans, struct btree_iter * if (fsck_err(trans, dirent_to_missing_subvol, "dirent points to missing subvolume\n%s", (bch2_bkey_val_to_text(&buf, c, d.s_c), buf.buf))) - return __remove_dirent(trans, d.k->p); + return bch2_fsck_remove_dirent(trans, d.k->p); ret = 0; goto out; } @@ -2291,7 +2072,7 @@ static int check_dirent_to_subvol(struct btree_trans *trans, struct btree_iter * goto err; } - ret = check_dirent_target(trans, iter, d, &subvol_root); + ret = bch2_check_dirent_target(trans, iter, d, &subvol_root); if (ret) goto err; out: @@ -2378,13 +2159,13 @@ static int check_dirent(struct btree_trans *trans, struct btree_iter *iter, (printbuf_reset(&buf), bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { - ret = __remove_dirent(trans, d.k->p); + ret = bch2_fsck_remove_dirent(trans, d.k->p); if (ret) goto err; } darray_for_each(target->inodes, i) { - ret = check_dirent_target(trans, iter, d, &i->inode); + ret = bch2_check_dirent_target(trans, iter, d, &i->inode); if (ret) goto err; } diff --git a/fs/bcachefs/namei.c b/fs/bcachefs/namei.c index bc83acbf5414..4d0ee85e5016 100644 --- a/fs/bcachefs/namei.c +++ b/fs/bcachefs/namei.c @@ -564,6 +564,8 @@ err: return ret; } +/* inum_to_path */ + static inline void prt_bytes_reversed(struct printbuf *out, const void *b, unsigned n) { bch2_printbuf_make_room(out, n); @@ -654,3 +656,174 @@ disconnected: prt_str_reversed(path, "(disconnected)"); goto out; } + +/* fsck */ + +static bool inode_points_to_dirent(struct bch_inode_unpacked *inode, + struct bkey_s_c_dirent d) +{ + return inode->bi_dir == d.k->p.inode && + inode->bi_dir_offset == d.k->p.offset; +} + +static int bch2_check_dirent_inode_dirent(struct btree_trans *trans, + struct btree_iter *iter, + struct bkey_s_c_dirent d, + struct bch_inode_unpacked *target) +{ + struct bch_fs *c = trans->c; + struct printbuf buf = PRINTBUF; + struct btree_iter bp_iter = { NULL }; + int ret = 0; + + if (inode_points_to_dirent(target, d)) + return 0; + + if (!target->bi_dir && + !target->bi_dir_offset) { + fsck_err_on(S_ISDIR(target->bi_mode), + trans, inode_dir_missing_backpointer, + "directory with missing backpointer\n%s", + (printbuf_reset(&buf), + bch2_bkey_val_to_text(&buf, c, d.s_c), + prt_printf(&buf, "\n"), + bch2_inode_unpacked_to_text(&buf, target), + buf.buf)); + + fsck_err_on(target->bi_flags & BCH_INODE_unlinked, + trans, inode_unlinked_but_has_dirent, + "inode unlinked but has dirent\n%s", + (printbuf_reset(&buf), + bch2_bkey_val_to_text(&buf, c, d.s_c), + prt_printf(&buf, "\n"), + bch2_inode_unpacked_to_text(&buf, target), + buf.buf)); + + target->bi_flags &= ~BCH_INODE_unlinked; + target->bi_dir = d.k->p.inode; + target->bi_dir_offset = d.k->p.offset; + return __bch2_fsck_write_inode(trans, target); + } + + if (bch2_inode_should_have_single_bp(target) && + !fsck_err(trans, inode_wrong_backpointer, + "dirent points to inode that does not point back:\n %s", + (bch2_bkey_val_to_text(&buf, c, d.s_c), + prt_printf(&buf, "\n "), + bch2_inode_unpacked_to_text(&buf, target), + buf.buf))) + goto err; + + struct bkey_s_c_dirent bp_dirent = + bch2_bkey_get_iter_typed(trans, &bp_iter, BTREE_ID_dirents, + SPOS(target->bi_dir, target->bi_dir_offset, target->bi_snapshot), + 0, dirent); + ret = bkey_err(bp_dirent); + if (ret && !bch2_err_matches(ret, ENOENT)) + goto err; + + bool backpointer_exists = !ret; + ret = 0; + + if (fsck_err_on(!backpointer_exists, + trans, inode_wrong_backpointer, + "inode %llu:%u has wrong backpointer:\n" + "got %llu:%llu\n" + "should be %llu:%llu", + target->bi_inum, target->bi_snapshot, + target->bi_dir, + target->bi_dir_offset, + d.k->p.inode, + d.k->p.offset)) { + target->bi_dir = d.k->p.inode; + target->bi_dir_offset = d.k->p.offset; + ret = __bch2_fsck_write_inode(trans, target); + goto out; + } + + bch2_bkey_val_to_text(&buf, c, d.s_c); + prt_newline(&buf); + if (backpointer_exists) + bch2_bkey_val_to_text(&buf, c, bp_dirent.s_c); + + if (fsck_err_on(backpointer_exists && + (S_ISDIR(target->bi_mode) || + target->bi_subvol), + trans, inode_dir_multiple_links, + "%s %llu:%u with multiple links\n%s", + S_ISDIR(target->bi_mode) ? "directory" : "subvolume", + target->bi_inum, target->bi_snapshot, buf.buf)) { + ret = bch2_fsck_remove_dirent(trans, d.k->p); + goto out; + } + + /* + * hardlinked file with nlink 0: + * We're just adjusting nlink here so check_nlinks() will pick + * it up, it ignores inodes with nlink 0 + */ + if (fsck_err_on(backpointer_exists && !target->bi_nlink, + trans, inode_multiple_links_but_nlink_0, + "inode %llu:%u type %s has multiple links but i_nlink 0\n%s", + target->bi_inum, target->bi_snapshot, bch2_d_types[d.v->d_type], buf.buf)) { + target->bi_nlink++; + target->bi_flags &= ~BCH_INODE_unlinked; + ret = __bch2_fsck_write_inode(trans, target); + if (ret) + goto err; + } +out: +err: +fsck_err: + bch2_trans_iter_exit(trans, &bp_iter); + printbuf_exit(&buf); + bch_err_fn(c, ret); + return ret; +} + +int bch2_check_dirent_target(struct btree_trans *trans, + struct btree_iter *iter, + struct bkey_s_c_dirent d, + struct bch_inode_unpacked *target) +{ + struct bch_fs *c = trans->c; + struct printbuf buf = PRINTBUF; + int ret = 0; + + ret = bch2_check_dirent_inode_dirent(trans, iter, d, target); + if (ret) + goto err; + + if (fsck_err_on(d.v->d_type != inode_d_type(target), + trans, dirent_d_type_wrong, + "incorrect d_type: got %s, should be %s:\n%s", + bch2_d_type_str(d.v->d_type), + bch2_d_type_str(inode_d_type(target)), + (printbuf_reset(&buf), + bch2_bkey_val_to_text(&buf, c, d.s_c), buf.buf))) { + struct bkey_i_dirent *n = bch2_trans_kmalloc(trans, bkey_bytes(d.k)); + ret = PTR_ERR_OR_ZERO(n); + if (ret) + goto err; + + bkey_reassemble(&n->k_i, d.s_c); + n->v.d_type = inode_d_type(target); + if (n->v.d_type == DT_SUBVOL) { + n->v.d_parent_subvol = cpu_to_le32(target->bi_parent_subvol); + n->v.d_child_subvol = cpu_to_le32(target->bi_subvol); + } else { + n->v.d_inum = cpu_to_le64(target->bi_inum); + } + + ret = bch2_trans_update(trans, iter, &n->k_i, 0); + if (ret) + goto err; + + d = dirent_i_to_s_c(n); + } +err: +fsck_err: + printbuf_exit(&buf); + bch_err_fn(c, ret); + return ret; +} diff --git a/fs/bcachefs/namei.h b/fs/bcachefs/namei.h index 7383b76270e9..48a2c8cb5fa9 100644 --- a/fs/bcachefs/namei.h +++ b/fs/bcachefs/namei.h @@ -44,4 +44,9 @@ bool bch2_reinherit_attrs(struct bch_inode_unpacked *, int bch2_inum_to_path(struct btree_trans *, subvol_inum, struct printbuf *); +int bch2_check_dirent_target(struct btree_trans *, + struct btree_iter *, + struct bkey_s_c_dirent, + struct bch_inode_unpacked *); + #endif /* _BCACHEFS_NAMEI_H */ -- 2.51.0 From 9b0d00a3693bbab49dcec00dd981c8661d6011bf Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 20 Mar 2025 11:06:50 -0400 Subject: [PATCH 04/16] bcachefs: Refactor bch2_check_dirent_target() Prep work for calling bch2_check_dirent_target() from bch2_lookup(). - Add an inline wrapper, if the target and backpointer match we can skip the function call. - We don't (yet?) want to remove the dirent we did the lookup from (when we find a directory or subvol with multiple valid dirents pointing to it), we can defer on that until later. For now, add an "are we in fsck?" parameter. Signed-off-by: Kent Overstreet --- fs/bcachefs/fsck.c | 4 +- fs/bcachefs/inode.h | 1 + fs/bcachefs/namei.c | 129 +++++++++++++++++++++++--------------------- fs/bcachefs/namei.h | 28 ++++++++-- 4 files changed, 94 insertions(+), 68 deletions(-) diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index f3853b741937..091057023fc5 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -2072,7 +2072,7 @@ static int check_dirent_to_subvol(struct btree_trans *trans, struct btree_iter * goto err; } - ret = bch2_check_dirent_target(trans, iter, d, &subvol_root); + ret = bch2_check_dirent_target(trans, iter, d, &subvol_root, true); if (ret) goto err; out: @@ -2165,7 +2165,7 @@ static int check_dirent(struct btree_trans *trans, struct btree_iter *iter, } darray_for_each(target->inodes, i) { - ret = bch2_check_dirent_target(trans, iter, d, &i->inode); + ret = bch2_check_dirent_target(trans, iter, d, &i->inode, true); if (ret) goto err; } diff --git a/fs/bcachefs/inode.h b/fs/bcachefs/inode.h index 428b9be6af34..f82cfbf460d0 100644 --- a/fs/bcachefs/inode.h +++ b/fs/bcachefs/inode.h @@ -277,6 +277,7 @@ static inline bool bch2_inode_should_have_single_bp(struct bch_inode_unpacked *i bool inode_has_bp = inode->bi_dir || inode->bi_dir_offset; return S_ISDIR(inode->bi_mode) || + inode->bi_subvol || (!inode->bi_nlink && inode_has_bp); } diff --git a/fs/bcachefs/namei.c b/fs/bcachefs/namei.c index 4d0ee85e5016..93246ad31541 100644 --- a/fs/bcachefs/namei.c +++ b/fs/bcachefs/namei.c @@ -659,17 +659,10 @@ disconnected: /* fsck */ -static bool inode_points_to_dirent(struct bch_inode_unpacked *inode, - struct bkey_s_c_dirent d) -{ - return inode->bi_dir == d.k->p.inode && - inode->bi_dir_offset == d.k->p.offset; -} - static int bch2_check_dirent_inode_dirent(struct btree_trans *trans, - struct btree_iter *iter, - struct bkey_s_c_dirent d, - struct bch_inode_unpacked *target) + struct bkey_s_c_dirent d, + struct bch_inode_unpacked *target, + bool in_fsck) { struct bch_fs *c = trans->c; struct printbuf buf = PRINTBUF; @@ -725,52 +718,65 @@ static int bch2_check_dirent_inode_dirent(struct btree_trans *trans, bool backpointer_exists = !ret; ret = 0; - if (fsck_err_on(!backpointer_exists, - trans, inode_wrong_backpointer, - "inode %llu:%u has wrong backpointer:\n" - "got %llu:%llu\n" - "should be %llu:%llu", - target->bi_inum, target->bi_snapshot, - target->bi_dir, - target->bi_dir_offset, - d.k->p.inode, - d.k->p.offset)) { - target->bi_dir = d.k->p.inode; - target->bi_dir_offset = d.k->p.offset; - ret = __bch2_fsck_write_inode(trans, target); - goto out; - } - - bch2_bkey_val_to_text(&buf, c, d.s_c); - prt_newline(&buf); - if (backpointer_exists) + if (!backpointer_exists) { + if (fsck_err(trans, inode_wrong_backpointer, + "inode %llu:%u has wrong backpointer:\n" + "got %llu:%llu\n" + "should be %llu:%llu", + target->bi_inum, target->bi_snapshot, + target->bi_dir, + target->bi_dir_offset, + d.k->p.inode, + d.k->p.offset)) { + target->bi_dir = d.k->p.inode; + target->bi_dir_offset = d.k->p.offset; + ret = __bch2_fsck_write_inode(trans, target); + } + } else { + bch2_bkey_val_to_text(&buf, c, d.s_c); + prt_newline(&buf); bch2_bkey_val_to_text(&buf, c, bp_dirent.s_c); - if (fsck_err_on(backpointer_exists && - (S_ISDIR(target->bi_mode) || - target->bi_subvol), - trans, inode_dir_multiple_links, - "%s %llu:%u with multiple links\n%s", - S_ISDIR(target->bi_mode) ? "directory" : "subvolume", - target->bi_inum, target->bi_snapshot, buf.buf)) { - ret = bch2_fsck_remove_dirent(trans, d.k->p); - goto out; - } - - /* - * hardlinked file with nlink 0: - * We're just adjusting nlink here so check_nlinks() will pick - * it up, it ignores inodes with nlink 0 - */ - if (fsck_err_on(backpointer_exists && !target->bi_nlink, - trans, inode_multiple_links_but_nlink_0, - "inode %llu:%u type %s has multiple links but i_nlink 0\n%s", - target->bi_inum, target->bi_snapshot, bch2_d_types[d.v->d_type], buf.buf)) { - target->bi_nlink++; - target->bi_flags &= ~BCH_INODE_unlinked; - ret = __bch2_fsck_write_inode(trans, target); - if (ret) - goto err; + if (S_ISDIR(target->bi_mode) || target->bi_subvol) { + /* + * XXX: verify connectivity of the other dirent + * up to the root before removing this one + * + * Additionally, bch2_lookup would need to cope with the + * dirent it found being removed - or should we remove + * the other one, even though the inode points to it? + */ + if (in_fsck) { + if (fsck_err(trans, inode_dir_multiple_links, + "%s %llu:%u with multiple links\n%s", + S_ISDIR(target->bi_mode) ? "directory" : "subvolume", + target->bi_inum, target->bi_snapshot, buf.buf)) + ret = bch2_fsck_remove_dirent(trans, d.k->p); + } else { + bch2_fs_inconsistent(c, + "%s %llu:%u with multiple links\n%s", + S_ISDIR(target->bi_mode) ? "directory" : "subvolume", + target->bi_inum, target->bi_snapshot, buf.buf); + } + + goto out; + } else { + /* + * hardlinked file with nlink 0: + * We're just adjusting nlink here so check_nlinks() will pick + * it up, it ignores inodes with nlink 0 + */ + if (fsck_err_on(!target->bi_nlink, + trans, inode_multiple_links_but_nlink_0, + "inode %llu:%u type %s has multiple links but i_nlink 0\n%s", + target->bi_inum, target->bi_snapshot, bch2_d_types[d.v->d_type], buf.buf)) { + target->bi_nlink++; + target->bi_flags &= ~BCH_INODE_unlinked; + ret = __bch2_fsck_write_inode(trans, target); + if (ret) + goto err; + } + } } out: err: @@ -781,16 +787,17 @@ fsck_err: return ret; } -int bch2_check_dirent_target(struct btree_trans *trans, - struct btree_iter *iter, - struct bkey_s_c_dirent d, - struct bch_inode_unpacked *target) +int __bch2_check_dirent_target(struct btree_trans *trans, + struct btree_iter *dirent_iter, + struct bkey_s_c_dirent d, + struct bch_inode_unpacked *target, + bool in_fsck) { struct bch_fs *c = trans->c; struct printbuf buf = PRINTBUF; int ret = 0; - ret = bch2_check_dirent_inode_dirent(trans, iter, d, target); + ret = bch2_check_dirent_inode_dirent(trans, d, target, in_fsck); if (ret) goto err; @@ -815,11 +822,9 @@ int bch2_check_dirent_target(struct btree_trans *trans, n->v.d_inum = cpu_to_le64(target->bi_inum); } - ret = bch2_trans_update(trans, iter, &n->k_i, 0); + ret = bch2_trans_update(trans, dirent_iter, &n->k_i, 0); if (ret) goto err; - - d = dirent_i_to_s_c(n); } err: fsck_err: diff --git a/fs/bcachefs/namei.h b/fs/bcachefs/namei.h index 48a2c8cb5fa9..2e6f6364767f 100644 --- a/fs/bcachefs/namei.h +++ b/fs/bcachefs/namei.h @@ -44,9 +44,29 @@ bool bch2_reinherit_attrs(struct bch_inode_unpacked *, int bch2_inum_to_path(struct btree_trans *, subvol_inum, struct printbuf *); -int bch2_check_dirent_target(struct btree_trans *, - struct btree_iter *, - struct bkey_s_c_dirent, - struct bch_inode_unpacked *); +int __bch2_check_dirent_target(struct btree_trans *, + struct btree_iter *, + struct bkey_s_c_dirent, + struct bch_inode_unpacked *, bool); + +static inline bool inode_points_to_dirent(struct bch_inode_unpacked *inode, + struct bkey_s_c_dirent d) +{ + return inode->bi_dir == d.k->p.inode && + inode->bi_dir_offset == d.k->p.offset; +} + +static inline int bch2_check_dirent_target(struct btree_trans *trans, + struct btree_iter *dirent_iter, + struct bkey_s_c_dirent d, + struct bch_inode_unpacked *target, + bool in_fsck) +{ + if (likely(inode_points_to_dirent(target, d) && + d.v->d_type == inode_d_type(target))) + return 0; + + return __bch2_check_dirent_target(trans, dirent_iter, d, target, in_fsck); +} #endif /* _BCACHEFS_NAMEI_H */ -- 2.51.0 From 04e90891be26a240e41d51d1770de56e810fda5e Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 20 Mar 2025 11:41:07 -0400 Subject: [PATCH 05/16] bcachefs: Run bch2_check_dirent_target() at lookup time More on the "full online self healing" project: We now run most of the dirent <-> inode consistency checks, with repair code, at runtime - the exact same check and repair code that fsck runs. This will allow us to repair the "dirent points to inode that does not point back" inconsistencies that have been popping up at runtime. Signed-off-by: Kent Overstreet --- fs/bcachefs/fs.c | 32 +++++++++++++++++--------------- 1 file changed, 17 insertions(+), 15 deletions(-) diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c index 273078ceb4df..fbca200f7636 100644 --- a/fs/bcachefs/fs.c +++ b/fs/bcachefs/fs.c @@ -641,7 +641,9 @@ static struct bch_inode_info *bch2_lookup_trans(struct btree_trans *trans, if (ret) return ERR_PTR(ret); - ret = bch2_dirent_read_target(trans, dir, bkey_s_c_to_dirent(k), &inum); + struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k); + + ret = bch2_dirent_read_target(trans, dir, d, &inum); if (ret > 0) ret = -ENOENT; if (ret) @@ -651,30 +653,30 @@ static struct bch_inode_info *bch2_lookup_trans(struct btree_trans *trans, if (inode) goto out; + /* + * Note: if check/repair needs it, we commit before + * bch2_inode_hash_init_insert(), as after that point we can't take a + * restart - not in the top level loop with a commit_do(), like we + * usually do: + */ + struct bch_subvolume subvol; struct bch_inode_unpacked inode_u; ret = bch2_subvolume_get(trans, inum.subvol, true, &subvol) ?: bch2_inode_find_by_inum_nowarn_trans(trans, inum, &inode_u) ?: + bch2_check_dirent_target(trans, &dirent_iter, d, &inode_u, false) ?: + bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc) ?: PTR_ERR_OR_ZERO(inode = bch2_inode_hash_init_insert(trans, inum, &inode_u, &subvol)); + /* + * don't remove it: check_inodes might find another inode that points + * back to this dirent + */ bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), c, "dirent to missing inode:\n %s", - (bch2_bkey_val_to_text(&buf, c, k), buf.buf)); + (bch2_bkey_val_to_text(&buf, c, d.s_c), buf.buf)); if (ret) goto err; - - /* regular files may have hardlinks: */ - if (bch2_fs_inconsistent_on(bch2_inode_should_have_single_bp(&inode_u) && - !bkey_eq(k.k->p, POS(inode_u.bi_dir, inode_u.bi_dir_offset)), - c, - "dirent points to inode that does not point back:\n %s", - (bch2_bkey_val_to_text(&buf, c, k), - prt_printf(&buf, "\n "), - bch2_inode_unpacked_to_text(&buf, &inode_u), - buf.buf))) { - ret = -ENOENT; - goto err; - } out: bch2_trans_iter_exit(trans, &dirent_iter); printbuf_exit(&buf); -- 2.51.0 From 6a9f681ef623ae3804bc2ca3a2d06d2458142975 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 20 Mar 2025 11:53:50 -0400 Subject: [PATCH 06/16] bcachefs: Count BCH_DATA_parity backpointers correctly These are counted as stripe data in the corresponding alloc keys. Signed-off-by: Kent Overstreet --- fs/bcachefs/backpointers.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/fs/bcachefs/backpointers.c b/fs/bcachefs/backpointers.c index c9dfc3657696..8da1b68821a0 100644 --- a/fs/bcachefs/backpointers.c +++ b/fs/bcachefs/backpointers.c @@ -50,6 +50,8 @@ void bch2_backpointer_to_text(struct printbuf *out, struct bch_fs *c, struct bke } bch2_btree_id_level_to_text(out, bp.v->btree_id, bp.v->level); + prt_str(out, " data_type="); + bch2_prt_data_type(out, bp.v->data_type); prt_printf(out, " suboffset=%u len=%u gen=%u pos=", (u32) bp.k->p.offset & ~(~0U << MAX_EXTENT_COMPRESS_RATIO_SHIFT), bp.v->bucket_len, @@ -791,6 +793,7 @@ static enum alloc_sector_counter data_type_to_alloc_counter(enum bch_data_type t case BCH_DATA_cached: return ALLOC_cached; case BCH_DATA_stripe: + case BCH_DATA_parity: return ALLOC_stripe; default: BUG(); -- 2.51.0 From 962322475bb5cebe0da581f6f18d23b00184aa01 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 19 Mar 2025 17:01:38 -0400 Subject: [PATCH 07/16] bcachefs: Handle backpointers with unknown data types New data types might be added later, so we don't want to disallow unknown data types - that'll be a compatibility hassle later. Instead, ignore them. Reported-by: syzbot+3a290f5ff67ca3023834@syzkaller.appspotmail.com Signed-off-by: Kent Overstreet --- fs/bcachefs/backpointers.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/fs/bcachefs/backpointers.c b/fs/bcachefs/backpointers.c index 8da1b68821a0..20c497f0c2cb 100644 --- a/fs/bcachefs/backpointers.c +++ b/fs/bcachefs/backpointers.c @@ -784,7 +784,7 @@ enum alloc_sector_counter { ALLOC_SECTORS_NR }; -static enum alloc_sector_counter data_type_to_alloc_counter(enum bch_data_type t) +static int data_type_to_alloc_counter(enum bch_data_type t) { switch (t) { case BCH_DATA_btree: @@ -796,7 +796,7 @@ static enum alloc_sector_counter data_type_to_alloc_counter(enum bch_data_type t case BCH_DATA_parity: return ALLOC_stripe; default: - BUG(); + return -1; } } @@ -847,7 +847,11 @@ static int check_bucket_backpointer_mismatch(struct btree_trans *trans, struct b if (bp.v->bucket_gen != a->gen) continue; - sectors[data_type_to_alloc_counter(bp.v->data_type)] += bp.v->bucket_len; + int alloc_counter = data_type_to_alloc_counter(bp.v->data_type); + if (alloc_counter < 0) + continue; + + sectors[alloc_counter] += bp.v->bucket_len; }; bch2_trans_iter_exit(trans, &iter); if (ret) -- 2.51.0 From 9c3a2c9b471aa42b13c26c916f6a0852899a57e0 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 20 Mar 2025 12:38:59 -0400 Subject: [PATCH 08/16] bcachefs: Disable asm memcpys when kmsan enabled kmsan doesn't know about inline assembly, obviously; this will close a ton of syzbot bugs. Signed-off-by: Kent Overstreet --- fs/bcachefs/util.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/bcachefs/util.h b/fs/bcachefs/util.h index d41e133acc4d..7d921fc920a0 100644 --- a/fs/bcachefs/util.h +++ b/fs/bcachefs/util.h @@ -431,7 +431,7 @@ static inline void memcpy_u64s_small(void *dst, const void *src, static inline void __memcpy_u64s(void *dst, const void *src, unsigned u64s) { -#ifdef CONFIG_X86_64 +#if defined(CONFIG_X86_64) && !defined(CONFIG_KMSAN) long d0, d1, d2; asm volatile("rep ; movsq" @@ -508,7 +508,7 @@ static inline void __memmove_u64s_up(void *_dst, const void *_src, u64 *dst = (u64 *) _dst + u64s - 1; u64 *src = (u64 *) _src + u64s - 1; -#ifdef CONFIG_X86_64 +#if defined(CONFIG_X86_64) && !defined(CONFIG_KMSAN) long d0, d1, d2; asm volatile("std ;\n" -- 2.51.0 From 53cf2a3daa4ca5f0a40eeb18c2be8724f123a63c Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 20 Mar 2025 13:24:50 -0400 Subject: [PATCH 09/16] bcachefs: Fix kmsan warnings in bch2_extent_crc_pack() We store to all fields, so the kmsan warnings were spurious - but initializing via stores to bitfields appear to have been giving the compiler/kmsan trouble, and they're not necessary. Signed-off-by: Kent Overstreet --- fs/bcachefs/extents.c | 38 ++++++++++++++++++++++---------------- 1 file changed, 22 insertions(+), 16 deletions(-) diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c index 04946d9911f5..ae1a1d917805 100644 --- a/fs/bcachefs/extents.c +++ b/fs/bcachefs/extents.c @@ -592,29 +592,35 @@ static void bch2_extent_crc_pack(union bch_extent_crc *dst, struct bch_extent_crc_unpacked src, enum bch_extent_entry_type type) { -#define set_common_fields(_dst, _src) \ - _dst.type = 1 << type; \ - _dst.csum_type = _src.csum_type, \ - _dst.compression_type = _src.compression_type, \ - _dst._compressed_size = _src.compressed_size - 1, \ - _dst._uncompressed_size = _src.uncompressed_size - 1, \ - _dst.offset = _src.offset +#define common_fields(_src) \ + .type = BIT(type), \ + .csum_type = _src.csum_type, \ + .compression_type = _src.compression_type, \ + ._compressed_size = _src.compressed_size - 1, \ + ._uncompressed_size = _src.uncompressed_size - 1, \ + .offset = _src.offset switch (type) { case BCH_EXTENT_ENTRY_crc32: - set_common_fields(dst->crc32, src); - dst->crc32.csum = (u32 __force) *((__le32 *) &src.csum.lo); + dst->crc32 = (struct bch_extent_crc32) { + common_fields(src), + .csum = (u32 __force) *((__le32 *) &src.csum.lo), + }; break; case BCH_EXTENT_ENTRY_crc64: - set_common_fields(dst->crc64, src); - dst->crc64.nonce = src.nonce; - dst->crc64.csum_lo = (u64 __force) src.csum.lo; - dst->crc64.csum_hi = (u64 __force) *((__le16 *) &src.csum.hi); + dst->crc64 = (struct bch_extent_crc64) { + common_fields(src), + .nonce = src.nonce, + .csum_lo = (u64 __force) src.csum.lo, + .csum_hi = (u64 __force) *((__le16 *) &src.csum.hi), + }; break; case BCH_EXTENT_ENTRY_crc128: - set_common_fields(dst->crc128, src); - dst->crc128.nonce = src.nonce; - dst->crc128.csum = src.csum; + dst->crc128 = (struct bch_extent_crc128) { + common_fields(src), + .nonce = src.nonce, + .csum = src.csum, + }; break; default: BUG(); -- 2.51.0 From 28aa859b6b422da5c982610d0add9128f813e9f2 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 20 Mar 2025 14:17:53 -0400 Subject: [PATCH 10/16] bcachefs: kmsan asserts Catching these early makes them a lot easier to track down. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_trans_commit.c | 1 + fs/bcachefs/btree_update.c | 2 ++ fs/bcachefs/btree_update.h | 2 ++ 3 files changed, 5 insertions(+) diff --git a/fs/bcachefs/btree_trans_commit.c b/fs/bcachefs/btree_trans_commit.c index d50dc31d0bea..7d7e52ddde02 100644 --- a/fs/bcachefs/btree_trans_commit.c +++ b/fs/bcachefs/btree_trans_commit.c @@ -164,6 +164,7 @@ bool bch2_btree_bset_insert_key(struct btree_trans *trans, EBUG_ON(bpos_gt(insert->k.p, b->data->max_key)); EBUG_ON(insert->k.u64s > bch2_btree_keys_u64s_remaining(b)); EBUG_ON(!b->c.level && !bpos_eq(insert->k.p, path->pos)); + kmsan_check_memory(insert, bkey_bytes(&insert->k)); k = bch2_btree_node_iter_peek_all(node_iter, b); if (k && bkey_cmp_left_packed(b, k, &insert->k.p)) diff --git a/fs/bcachefs/btree_update.c b/fs/bcachefs/btree_update.c index b3e346b5f8d7..bd2eb42edb24 100644 --- a/fs/bcachefs/btree_update.c +++ b/fs/bcachefs/btree_update.c @@ -512,6 +512,8 @@ static noinline int bch2_trans_update_get_key_cache(struct btree_trans *trans, int __must_check bch2_trans_update(struct btree_trans *trans, struct btree_iter *iter, struct bkey_i *k, enum btree_iter_update_trigger_flags flags) { + kmsan_check_memory(k, bkey_bytes(&k->k)); + btree_path_idx_t path_idx = iter->update_path ?: iter->path; int ret; diff --git a/fs/bcachefs/btree_update.h b/fs/bcachefs/btree_update.h index 47d8690f01bf..d2e1c04353f6 100644 --- a/fs/bcachefs/btree_update.h +++ b/fs/bcachefs/btree_update.h @@ -133,6 +133,8 @@ static inline int __must_check bch2_trans_update_buffered(struct btree_trans *tr enum btree_id btree, struct bkey_i *k) { + kmsan_check_memory(k, bkey_bytes(&k->k)); + if (unlikely(!btree_type_uses_write_buffer(btree))) { int ret = bch2_btree_write_buffer_insert_err(trans, btree, k); dump_stack(); -- 2.51.0 From 1f88c35674954fbb0b14d994c5fa02c7c5190356 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 20 Mar 2025 14:54:49 -0400 Subject: [PATCH 11/16] bcachefs: Fix a KMSAN splat in btree_update_nodes_written() We may sometimes read from uninitialized memory; we know, and that's ok. We check if a btree node has been reused before waiting on any outstanding IO. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_update_interior.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index d3e0cf01ba37..67f1e3202835 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -649,6 +649,14 @@ static int btree_update_nodes_written_trans(struct btree_trans *trans, return 0; } +/* If the node has been reused, we might be reading uninitialized memory - that's fine: */ +static noinline __no_kmsan_checks bool btree_node_seq_matches(struct btree *b, __le64 seq) +{ + struct btree_node *b_data = READ_ONCE(b->data); + + return (b_data ? b_data->keys.seq : 0) == seq; +} + static void btree_update_nodes_written(struct btree_update *as) { struct bch_fs *c = as->c; @@ -677,17 +685,9 @@ static void btree_update_nodes_written(struct btree_update *as) * on disk: */ for (i = 0; i < as->nr_old_nodes; i++) { - __le64 seq; - b = as->old_nodes[i]; - bch2_trans_begin(trans); - btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_read); - seq = b->data ? b->data->keys.seq : 0; - six_unlock_read(&b->c.lock); - bch2_trans_unlock_long(trans); - - if (seq == as->old_nodes_seq[i]) + if (btree_node_seq_matches(b, as->old_nodes_seq[i])) wait_on_bit_io(&b->flags, BTREE_NODE_write_in_flight_inner, TASK_UNINTERRUPTIBLE); } -- 2.51.0 From 9ea24b287b3b9118a157509d931e7d27414e98c7 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 20 Mar 2025 14:15:33 -0400 Subject: [PATCH 12/16] bcachefs: Eliminate padding in move_bucket_key We appear to be tripping over a compiler/kmsan bug with padding fields - this is an easy workaround. Signed-off-by: Kent Overstreet --- fs/bcachefs/move_types.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/bcachefs/move_types.h b/fs/bcachefs/move_types.h index 82e473ed48d2..807f779f6f76 100644 --- a/fs/bcachefs/move_types.h +++ b/fs/bcachefs/move_types.h @@ -32,7 +32,7 @@ struct bch_move_stats { struct move_bucket_key { struct bpos bucket; - u8 gen; + unsigned gen; }; struct move_bucket { -- 2.51.0 From 5ae6f33053af6e904e609593d05e4faf3aeb16fb Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 21 Mar 2025 11:30:09 -0400 Subject: [PATCH 13/16] bcachefs: zero init journal bios fix a kmsan splat Signed-off-by: Kent Overstreet --- fs/bcachefs/journal.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c index ce7302695547..bfdaea6569ae 100644 --- a/fs/bcachefs/journal.c +++ b/fs/bcachefs/journal.c @@ -1510,7 +1510,7 @@ int bch2_dev_journal_init(struct bch_dev *ca, struct bch_sb *sb) unsigned nr_bvecs = DIV_ROUND_UP(JOURNAL_ENTRY_SIZE_MAX, PAGE_SIZE); for (unsigned i = 0; i < ARRAY_SIZE(ja->bio); i++) { - ja->bio[i] = kmalloc(struct_size(ja->bio[i], bio.bi_inline_vecs, + ja->bio[i] = kzalloc(struct_size(ja->bio[i], bio.bi_inline_vecs, nr_bvecs), GFP_KERNEL); if (!ja->bio[i]) return -BCH_ERR_ENOMEM_dev_journal_init; -- 2.51.0 From f4a584f4bf64e0db30312088d504d4da29ca556b Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 21 Mar 2025 12:29:56 -0400 Subject: [PATCH 14/16] bcachefs: bch2_disk_accounting_mod2() We're hitting some issues with uninitialized struct padding, flagged by kmsan. They appear to be falso positives, otherwise bch2_accounting_validate() would have flagged them as "junk at end". But for now, we'll need to initialize disk_accounting_pos with memset(). This adds a new helper, bch2_disk_accounting_mod2(), that initializes a disk_accounting_pos and does the accounting mod all at once - so overall things actually get slightly more ergonomic. BCH_DISK_ACCOUNTING_replicas keys are left for now; KMSAN isn't warning about them and they're a bit special. Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_background.c | 10 ++--- fs/bcachefs/buckets.c | 66 ++++++++++------------------ fs/bcachefs/disk_accounting.h | 18 ++++++++ fs/bcachefs/disk_accounting_format.h | 12 ++--- fs/bcachefs/inode.c | 7 ++- fs/bcachefs/super.c | 9 ++-- 6 files changed, 57 insertions(+), 65 deletions(-) diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index 2828baa9b162..5fb396be9127 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -777,14 +777,12 @@ static inline int bch2_dev_data_type_accounting_mod(struct btree_trans *trans, s s64 delta_sectors, s64 delta_fragmented, unsigned flags) { - struct disk_accounting_pos acc = { - .type = BCH_DISK_ACCOUNTING_dev_data_type, - .dev_data_type.dev = ca->dev_idx, - .dev_data_type.data_type = data_type, - }; s64 d[3] = { delta_buckets, delta_sectors, delta_fragmented }; - return bch2_disk_accounting_mod(trans, &acc, d, 3, flags & BTREE_TRIGGER_gc); + return bch2_disk_accounting_mod2(trans, flags & BTREE_TRIGGER_gc, + d, dev_data_type, + .dev = ca->dev_idx, + .data_type = data_type); } int bch2_alloc_key_to_dev_counters(struct btree_trans *trans, struct bch_dev *ca, diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index bb7742cf0014..e56ef623ebc1 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -724,9 +724,7 @@ static int __trigger_extent(struct btree_trans *trans, .replicas.nr_required = 1, }; - struct disk_accounting_pos acct_compression_key = { - .type = BCH_DISK_ACCOUNTING_compression, - }; + unsigned cur_compression_type = 0; u64 compression_acct[3] = { 1, 0, 0 }; bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { @@ -760,13 +758,13 @@ static int __trigger_extent(struct btree_trans *trans, acc_replicas_key.replicas.nr_required = 0; } - if (acct_compression_key.compression.type && - acct_compression_key.compression.type != p.crc.compression_type) { + if (cur_compression_type && + cur_compression_type != p.crc.compression_type) { if (flags & BTREE_TRIGGER_overwrite) bch2_u64s_neg(compression_acct, ARRAY_SIZE(compression_acct)); - ret = bch2_disk_accounting_mod(trans, &acct_compression_key, compression_acct, - ARRAY_SIZE(compression_acct), gc); + ret = bch2_disk_accounting_mod2(trans, gc, compression_acct, + compression, cur_compression_type); if (ret) return ret; @@ -775,7 +773,7 @@ static int __trigger_extent(struct btree_trans *trans, compression_acct[2] = 0; } - acct_compression_key.compression.type = p.crc.compression_type; + cur_compression_type = p.crc.compression_type; if (p.crc.compression_type) { compression_acct[1] += p.crc.uncompressed_size; compression_acct[2] += p.crc.compressed_size; @@ -789,45 +787,34 @@ static int __trigger_extent(struct btree_trans *trans, } if (acc_replicas_key.replicas.nr_devs && !level && k.k->p.snapshot) { - struct disk_accounting_pos acc_snapshot_key = { - .type = BCH_DISK_ACCOUNTING_snapshot, - .snapshot.id = k.k->p.snapshot, - }; - ret = bch2_disk_accounting_mod(trans, &acc_snapshot_key, replicas_sectors, 1, gc); + ret = bch2_disk_accounting_mod2_nr(trans, gc, replicas_sectors, 1, snapshot, k.k->p.snapshot); if (ret) return ret; } - if (acct_compression_key.compression.type) { + if (cur_compression_type) { if (flags & BTREE_TRIGGER_overwrite) bch2_u64s_neg(compression_acct, ARRAY_SIZE(compression_acct)); - ret = bch2_disk_accounting_mod(trans, &acct_compression_key, compression_acct, - ARRAY_SIZE(compression_acct), gc); + ret = bch2_disk_accounting_mod2(trans, gc, compression_acct, + compression, cur_compression_type); if (ret) return ret; } if (level) { - struct disk_accounting_pos acc_btree_key = { - .type = BCH_DISK_ACCOUNTING_btree, - .btree.id = btree_id, - }; - ret = bch2_disk_accounting_mod(trans, &acc_btree_key, replicas_sectors, 1, gc); + ret = bch2_disk_accounting_mod2_nr(trans, gc, replicas_sectors, 1, btree, btree_id); if (ret) return ret; } else { bool insert = !(flags & BTREE_TRIGGER_overwrite); - struct disk_accounting_pos acc_inum_key = { - .type = BCH_DISK_ACCOUNTING_inum, - .inum.inum = k.k->p.inode, - }; + s64 v[3] = { insert ? 1 : -1, insert ? k.k->size : -((s64) k.k->size), *replicas_sectors, }; - ret = bch2_disk_accounting_mod(trans, &acc_inum_key, v, ARRAY_SIZE(v), gc); + ret = bch2_disk_accounting_mod2(trans, gc, v, inum, k.k->p.inode); if (ret) return ret; } @@ -876,15 +863,15 @@ int bch2_trigger_extent(struct btree_trans *trans, } int need_rebalance_delta = 0; - s64 need_rebalance_sectors_delta = 0; + s64 need_rebalance_sectors_delta[1] = { 0 }; s64 s = bch2_bkey_sectors_need_rebalance(c, old); need_rebalance_delta -= s != 0; - need_rebalance_sectors_delta -= s; + need_rebalance_sectors_delta[0] -= s; s = bch2_bkey_sectors_need_rebalance(c, new.s_c); need_rebalance_delta += s != 0; - need_rebalance_sectors_delta += s; + need_rebalance_sectors_delta[0] += s; if ((flags & BTREE_TRIGGER_transactional) && need_rebalance_delta) { int ret = bch2_btree_bit_mod_buffered(trans, BTREE_ID_rebalance_work, @@ -893,12 +880,9 @@ int bch2_trigger_extent(struct btree_trans *trans, return ret; } - if (need_rebalance_sectors_delta) { - struct disk_accounting_pos acc = { - .type = BCH_DISK_ACCOUNTING_rebalance_work, - }; - int ret = bch2_disk_accounting_mod(trans, &acc, &need_rebalance_sectors_delta, 1, - flags & BTREE_TRIGGER_gc); + if (need_rebalance_sectors_delta[0]) { + int ret = bch2_disk_accounting_mod2(trans, flags & BTREE_TRIGGER_gc, + need_rebalance_sectors_delta, rebalance_work); if (ret) return ret; } @@ -914,17 +898,13 @@ static int __trigger_reservation(struct btree_trans *trans, enum btree_iter_update_trigger_flags flags) { if (flags & (BTREE_TRIGGER_transactional|BTREE_TRIGGER_gc)) { - s64 sectors = k.k->size; + s64 sectors[1] = { k.k->size }; if (flags & BTREE_TRIGGER_overwrite) - sectors = -sectors; - - struct disk_accounting_pos acc = { - .type = BCH_DISK_ACCOUNTING_persistent_reserved, - .persistent_reserved.nr_replicas = bkey_s_c_to_reservation(k).v->nr_replicas, - }; + sectors[0] = -sectors[0]; - return bch2_disk_accounting_mod(trans, &acc, §ors, 1, flags & BTREE_TRIGGER_gc); + return bch2_disk_accounting_mod2(trans, flags & BTREE_TRIGGER_gc, sectors, + persistent_reserved, bkey_s_c_to_reservation(k).v->nr_replicas); } return 0; diff --git a/fs/bcachefs/disk_accounting.h b/fs/bcachefs/disk_accounting.h index f4372cafea2e..f9214e2d1346 100644 --- a/fs/bcachefs/disk_accounting.h +++ b/fs/bcachefs/disk_accounting.h @@ -85,6 +85,24 @@ static inline struct bpos disk_accounting_pos_to_bpos(struct disk_accounting_pos int bch2_disk_accounting_mod(struct btree_trans *, struct disk_accounting_pos *, s64 *, unsigned, bool); + +#define disk_accounting_key_init(_k, _type, ...) \ +do { \ + memset(&(_k), 0, sizeof(_k)); \ + (_k).type = BCH_DISK_ACCOUNTING_##_type; \ + (_k)._type = (struct bch_acct_##_type) { __VA_ARGS__ }; \ +} while (0) + +#define bch2_disk_accounting_mod2_nr(_trans, _gc, _v, _nr, ...) \ +({ \ + struct disk_accounting_pos pos; \ + disk_accounting_key_init(pos, __VA_ARGS__); \ + bch2_disk_accounting_mod(trans, &pos, _v, _nr, _gc); \ +}) + +#define bch2_disk_accounting_mod2(_trans, _gc, _v, ...) \ + bch2_disk_accounting_mod2_nr(_trans, _gc, _v, ARRAY_SIZE(_v), __VA_ARGS__) + int bch2_mod_dev_cached_sectors(struct btree_trans *, unsigned, s64, bool); int bch2_accounting_validate(struct bch_fs *, struct bkey_s_c, diff --git a/fs/bcachefs/disk_accounting_format.h b/fs/bcachefs/disk_accounting_format.h index 7b6e6c97e6aa..15190196485f 100644 --- a/fs/bcachefs/disk_accounting_format.h +++ b/fs/bcachefs/disk_accounting_format.h @@ -113,14 +113,14 @@ enum disk_accounting_type { BCH_DISK_ACCOUNTING_TYPE_NR, }; -struct bch_nr_inodes { +struct bch_acct_nr_inodes { }; -struct bch_persistent_reserved { +struct bch_acct_persistent_reserved { __u8 nr_replicas; }; -struct bch_dev_data_type { +struct bch_acct_dev_data_type { __u8 dev; __u8 data_type; }; @@ -149,10 +149,10 @@ struct disk_accounting_pos { struct { __u8 type; union { - struct bch_nr_inodes nr_inodes; - struct bch_persistent_reserved persistent_reserved; + struct bch_acct_nr_inodes nr_inodes; + struct bch_acct_persistent_reserved persistent_reserved; struct bch_replicas_entry_v1 replicas; - struct bch_dev_data_type dev_data_type; + struct bch_acct_dev_data_type dev_data_type; struct bch_acct_compression compression; struct bch_acct_snapshot snapshot; struct bch_acct_btree btree; diff --git a/fs/bcachefs/inode.c b/fs/bcachefs/inode.c index 1383fdcc42a5..80051073f613 100644 --- a/fs/bcachefs/inode.c +++ b/fs/bcachefs/inode.c @@ -731,10 +731,9 @@ int bch2_trigger_inode(struct btree_trans *trans, bkey_s_to_inode_v3(new).v->bi_journal_seq = cpu_to_le64(trans->journal_res.seq); } - s64 nr = bkey_is_inode(new.k) - bkey_is_inode(old.k); - if ((flags & (BTREE_TRIGGER_transactional|BTREE_TRIGGER_gc)) && nr) { - struct disk_accounting_pos acc = { .type = BCH_DISK_ACCOUNTING_nr_inodes }; - int ret = bch2_disk_accounting_mod(trans, &acc, &nr, 1, flags & BTREE_TRIGGER_gc); + s64 nr[1] = { bkey_is_inode(new.k) - bkey_is_inode(old.k) }; + if ((flags & (BTREE_TRIGGER_transactional|BTREE_TRIGGER_gc)) && nr[0]) { + int ret = bch2_disk_accounting_mod2(trans, flags & BTREE_TRIGGER_gc, nr, nr_inodes); if (ret) return ret; } diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index d662adfbdbcc..99f9a0aaa380 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -1990,15 +1990,12 @@ int bch2_dev_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets) mutex_unlock(&c->sb_lock); if (ca->mi.freespace_initialized) { - struct disk_accounting_pos acc = { - .type = BCH_DISK_ACCOUNTING_dev_data_type, - .dev_data_type.dev = ca->dev_idx, - .dev_data_type.data_type = BCH_DATA_free, - }; u64 v[3] = { nbuckets - old_nbuckets, 0, 0 }; ret = bch2_trans_commit_do(ca->fs, NULL, NULL, 0, - bch2_disk_accounting_mod(trans, &acc, v, ARRAY_SIZE(v), false)) ?: + bch2_disk_accounting_mod2(trans, false, v, dev_data_type, + .dev = ca->dev_idx, + .data_type = BCH_DATA_free)) ?: bch2_dev_freespace_init(c, ca, old_nbuckets, nbuckets); if (ret) goto err; -- 2.51.0 From 0b4fd567261bc21ba1fd8636489396f0940b54f8 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 21 Mar 2025 14:22:39 -0400 Subject: [PATCH 15/16] bcachefs: btree_trans_restart_foreign_task() In debug mode, we save the call stack on transaction restart - but there's no locking, so we can't touch it if we're issuing the restart from another thread. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.h | 9 ++++++++- fs/bcachefs/btree_locking.c | 4 +++- 2 files changed, 11 insertions(+), 2 deletions(-) diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h index b96157f3dc9c..8823eec6b284 100644 --- a/fs/bcachefs/btree_iter.h +++ b/fs/bcachefs/btree_iter.h @@ -335,13 +335,20 @@ static inline void bch2_trans_verify_not_unlocked_or_in_restart(struct btree_tra } __always_inline -static int btree_trans_restart_ip(struct btree_trans *trans, int err, unsigned long ip) +static int btree_trans_restart_foreign_task(struct btree_trans *trans, int err, unsigned long ip) { BUG_ON(err <= 0); BUG_ON(!bch2_err_matches(-err, BCH_ERR_transaction_restart)); trans->restarted = err; trans->last_restarted_ip = ip; + return -err; +} + +__always_inline +static int btree_trans_restart_ip(struct btree_trans *trans, int err, unsigned long ip) +{ + btree_trans_restart_foreign_task(trans, err, ip); #ifdef CONFIG_BCACHEFS_DEBUG darray_exit(&trans->last_restarted_trace); bch2_save_backtrace(&trans->last_restarted_trace, current, 0, GFP_NOWAIT); diff --git a/fs/bcachefs/btree_locking.c b/fs/bcachefs/btree_locking.c index caef65adeae4..b18fbf6f6226 100644 --- a/fs/bcachefs/btree_locking.c +++ b/fs/bcachefs/btree_locking.c @@ -172,7 +172,9 @@ static int abort_lock(struct lock_graph *g, struct trans_waiting_for_lock *i) { if (i == g->g) { trace_would_deadlock(g, i->trans); - return btree_trans_restart(i->trans, BCH_ERR_transaction_restart_would_deadlock); + return btree_trans_restart_foreign_task(i->trans, + BCH_ERR_transaction_restart_would_deadlock, + _THIS_IP_); } else { i->trans->lock_must_abort = true; wake_up_process(i->trans->locking_wait.task); -- 2.51.0 From 739200c57384313e688e6945b56782721c29459f Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 21 Mar 2025 21:53:41 -0400 Subject: [PATCH 16/16] bcachefs: Fix race in print_chain() 00636 Unable to handle kernel NULL pointer dereference at virtual address 00000000000000b0 00636 Mem abort info: 00636 ESR = 0x0000000096000005 00636 EC = 0x25: DABT (current EL), IL = 32 bits 00636 SET = 0, FnV = 0 00636 EA = 0, S1PTW = 0 00636 FSC = 0x05: level 1 translation fault 00636 Data abort info: 00636 ISV = 0, ISS = 0x00000005, ISS2 = 0x00000000 00636 CM = 0, WnR = 0, TnD = 0, TagAccess = 0 00636 GCS = 0, Overlay = 0, DirtyBit = 0, Xs = 0 00636 user pgtable: 4k pages, 39-bit VAs, pgdp=0000000101b10000 00636 [00000000000000b0] pgd=0000000000000000, p4d=0000000000000000, pud=0000000000000000 00636 Internal error: Oops: 0000000096000005 [#1] SMP 00636 Modules linked in: 00636 CPU: 12 UID: 0 PID: 79369 Comm: cat Not tainted 6.14.0-rc6-ktest-g3783b8973ab7 #17757 00636 Hardware name: linux,dummy-virt (DT) 00636 pstate: 20001005 (nzCv daif -PAN -UAO -TCO -DIT +SSBS BTYPE=--) 00636 pc : print_chain+0xb8/0x170 00636 lr : print_chain+0xa0/0x170 00636 sp : ffffff80d9c1bbb0 00636 x29: ffffff80d9c1bbb0 x28: 0000000000000002 x27: ffffff80c1be8250 00636 x26: ffffff80dd9b0000 x25: 0000000000000020 x24: 000000000000002d 00636 x23: 000000000000003c x22: ffffffc080a54518 x21: ffffff80da6e00d0 00636 x20: ffffff80da6e0170 x19: ffffff80c1a1d240 x18: 00000000ffffffff 00636 x17: 3535303937202d3c x16: 203139202d3c2035 x15: 00000000ffffffff 00636 x14: 0000000000000000 x13: ffffff80d71b63f1 x12: 0000000000000006 00636 x11: ffffffc080beb1c0 x10: 0000000000000020 x9 : 00000000000134cc 00636 x8 : 0000000000000020 x7 : 0000000000000004 x6 : 0000000000000020 00636 x5 : ffffff80d71b63f7 x4 : ffffffc080a5451b x3 : 0000000000000000 00636 x2 : 0000000000000000 x1 : 0000000000000000 x0 : 0000000000000000 00636 Call trace: 00636 print_chain+0xb8/0x170 (P) 00636 bch2_check_for_deadlock+0x444/0x5a0 00636 bch2_btree_deadlock_read+0xb4/0x1c8 00636 full_proxy_read+0x74/0xd8 00636 vfs_read+0x90/0x300 Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_locking.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/bcachefs/btree_locking.c b/fs/bcachefs/btree_locking.c index b18fbf6f6226..94eb2b73a843 100644 --- a/fs/bcachefs/btree_locking.c +++ b/fs/bcachefs/btree_locking.c @@ -91,10 +91,10 @@ static noinline void print_chain(struct printbuf *out, struct lock_graph *g) struct trans_waiting_for_lock *i; for (i = g->g; i != g->g + g->nr; i++) { - struct task_struct *task = i->trans->locking_wait.task; + struct task_struct *task = READ_ONCE(i->trans->locking_wait.task); if (i != g->g) prt_str(out, "<- "); - prt_printf(out, "%u ", task ?task->pid : 0); + prt_printf(out, "%u ", task ? task->pid : 0); } prt_newline(out); } -- 2.51.0