From ff4cb203ccce24630c50a503973ac596c3d5d1be Mon Sep 17 00:00:00 2001 From: Thorsten Blum Date: Tue, 11 Mar 2025 12:13:11 +0100 Subject: [PATCH 01/16] bcachefs: Use max() to improve gen_after() Use max() to simplify gen_after() and improve its readability. Signed-off-by: Thorsten Blum Signed-off-by: Kent Overstreet --- fs/bcachefs/buckets.h | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/fs/bcachefs/buckets.h b/fs/bcachefs/buckets.h index 6aeec1c0973c..c5363256e363 100644 --- a/fs/bcachefs/buckets.h +++ b/fs/bcachefs/buckets.h @@ -140,9 +140,7 @@ static inline int gen_cmp(u8 a, u8 b) static inline int gen_after(u8 a, u8 b) { - int r = gen_cmp(a, b); - - return r > 0 ? r : 0; + return max(0, gen_cmp(a, b)); } static inline int dev_ptr_stale_rcu(struct bch_dev *ca, const struct bch_extent_ptr *ptr) -- 2.51.0 From a2e9e6874612582367be674e4d961de2ec8a9d05 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 11 Mar 2025 09:31:03 -0400 Subject: [PATCH 02/16] bcachefs: Kill a bit of dead code Found with CC=clang W=1 Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.c | 14 -------------- fs/bcachefs/inode.c | 13 ------------- fs/bcachefs/journal_io.c | 5 ----- fs/bcachefs/move.c | 2 -- 4 files changed, 34 deletions(-) diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index e32fce4fd258..7542c6f9c88e 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -562,20 +562,6 @@ static inline struct bkey_s_c btree_path_level_peek_all(struct bch_fs *c, bch2_btree_node_iter_peek_all(&l->iter, l->b)); } -static inline struct bkey_s_c btree_path_level_peek(struct btree_trans *trans, - struct btree_path *path, - struct btree_path_level *l, - struct bkey *u) -{ - struct bkey_s_c k = __btree_iter_unpack(trans->c, l, u, - bch2_btree_node_iter_peek(&l->iter, l->b)); - - path->pos = k.k ? k.k->p : l->b->key.k.p; - trans->paths_sorted = false; - bch2_btree_path_verify_level(trans, path, l - path->l); - return k; -} - static inline struct bkey_s_c btree_path_level_prev(struct btree_trans *trans, struct btree_path *path, struct btree_path_level *l, diff --git a/fs/bcachefs/inode.c b/fs/bcachefs/inode.c index 339b80770f1d..7aca010e2e10 100644 --- a/fs/bcachefs/inode.c +++ b/fs/bcachefs/inode.c @@ -868,19 +868,6 @@ void bch2_inode_init(struct bch_fs *c, struct bch_inode_unpacked *inode_u, uid, gid, mode, rdev, parent); } -static inline u32 bkey_generation(struct bkey_s_c k) -{ - switch (k.k->type) { - case KEY_TYPE_inode: - case KEY_TYPE_inode_v2: - BUG(); - case KEY_TYPE_inode_generation: - return le32_to_cpu(bkey_s_c_to_inode_generation(k).v->bi_generation); - default: - return 0; - } -} - static struct bkey_i_inode_alloc_cursor * bch2_inode_alloc_cursor_get(struct btree_trans *trans, u64 cpu, u64 *min, u64 *max) { diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c index 331c9d762439..cf2700b06d58 100644 --- a/fs/bcachefs/journal_io.c +++ b/fs/bcachefs/journal_io.c @@ -1609,11 +1609,6 @@ static void journal_buf_realloc(struct journal *j, struct journal_buf *buf) kvfree(new_buf); } -static inline struct journal_buf *journal_last_unwritten_buf(struct journal *j) -{ - return j->buf + (journal_last_unwritten_seq(j) & JOURNAL_BUF_MASK); -} - static CLOSURE_CALLBACK(journal_write_done) { closure_type(w, struct journal_buf, io); diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c index ee489d222fba..0787d04a5fc3 100644 --- a/fs/bcachefs/move.c +++ b/fs/bcachefs/move.c @@ -712,7 +712,6 @@ static int __bch2_move_data_phys(struct moving_context *ctxt, struct btree_iter iter = {}, bp_iter = {}; struct bkey_buf sk; struct bkey_s_c k; - unsigned sectors_moved = 0; struct bkey_buf last_flushed; int ret = 0; @@ -834,7 +833,6 @@ static int __bch2_move_data_phys(struct moving_context *ctxt, if (ctxt->stats) atomic64_add(sectors, &ctxt->stats->sectors_seen); - sectors_moved += sectors; next: bch2_btree_iter_advance(&bp_iter); } -- 2.51.0 From 8dc4514d58f684b9bc08d956ab9a9ec65b38f63a Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 13 Mar 2025 11:44:52 -0400 Subject: [PATCH 03/16] bcachefs: Kill bch2_remount() Single caller, so inline it. Signed-off-by: Kent Overstreet --- fs/bcachefs/fs.c | 71 ++++++++++++++++++++++-------------------------- 1 file changed, 32 insertions(+), 39 deletions(-) diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c index 459ca8259fc0..17ac9c55fb96 100644 --- a/fs/bcachefs/fs.c +++ b/fs/bcachefs/fs.c @@ -2026,44 +2026,6 @@ static struct bch_fs *bch2_path_to_fs(const char *path) return c ?: ERR_PTR(-ENOENT); } -static int bch2_remount(struct super_block *sb, int *flags, - struct bch_opts opts) -{ - struct bch_fs *c = sb->s_fs_info; - int ret = 0; - - opt_set(opts, read_only, (*flags & SB_RDONLY) != 0); - - if (opts.read_only != c->opts.read_only) { - down_write(&c->state_lock); - - if (opts.read_only) { - bch2_fs_read_only(c); - - sb->s_flags |= SB_RDONLY; - } else { - ret = bch2_fs_read_write(c); - if (ret) { - bch_err(c, "error going rw: %i", ret); - up_write(&c->state_lock); - ret = -EINVAL; - goto err; - } - - sb->s_flags &= ~SB_RDONLY; - } - - c->opts.read_only = opts.read_only; - - up_write(&c->state_lock); - } - - if (opt_defined(opts, errors)) - c->opts.errors = opts.errors; -err: - return bch2_err_class(ret); -} - static int bch2_show_devname(struct seq_file *seq, struct dentry *root) { struct bch_fs *c = root->d_sb->s_fs_info; @@ -2374,8 +2336,39 @@ static int bch2_fs_reconfigure(struct fs_context *fc) { struct super_block *sb = fc->root->d_sb; struct bch2_opts_parse *opts = fc->fs_private; + struct bch_fs *c = sb->s_fs_info; + int ret = 0; + + opt_set(opts->opts, read_only, (fc->sb_flags & SB_RDONLY) != 0); + + if (opts->opts.read_only != c->opts.read_only) { + down_write(&c->state_lock); + + if (opts->opts.read_only) { + bch2_fs_read_only(c); + + sb->s_flags |= SB_RDONLY; + } else { + ret = bch2_fs_read_write(c); + if (ret) { + bch_err(c, "error going rw: %i", ret); + up_write(&c->state_lock); + ret = -EINVAL; + goto err; + } + + sb->s_flags &= ~SB_RDONLY; + } + + c->opts.read_only = opts->opts.read_only; - return bch2_remount(sb, &fc->sb_flags, opts->opts); + up_write(&c->state_lock); + } + + if (opt_defined(opts->opts, errors)) + c->opts.errors = opts->opts.errors; +err: + return bch2_err_class(ret); } static const struct fs_context_operations bch2_context_ops = { -- 2.51.0 From c991fbee8e6e91e9d0c859627b87fb7a06244a8b Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 13 Mar 2025 15:21:13 -0400 Subject: [PATCH 04/16] bcachefs: rebalance, copygc status also print stacktrace These are commonly needed when debugging, and saves from having to ask users to dig. Also, rebalance_status now includes pending rebalance work. Signed-off-by: Kent Overstreet --- fs/bcachefs/move.c | 14 ++++++++------ fs/bcachefs/movinggc.c | 11 +++++++++++ fs/bcachefs/rebalance.c | 29 ++++++++++++++++++++++++++--- 3 files changed, 45 insertions(+), 9 deletions(-) diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c index 0787d04a5fc3..f86fb8ad636a 100644 --- a/fs/bcachefs/move.c +++ b/fs/bcachefs/move.c @@ -1251,17 +1251,17 @@ void bch2_move_stats_to_text(struct printbuf *out, struct bch_move_stats *stats) prt_newline(out); printbuf_indent_add(out, 2); - prt_printf(out, "keys moved: %llu\n", atomic64_read(&stats->keys_moved)); - prt_printf(out, "keys raced: %llu\n", atomic64_read(&stats->keys_raced)); - prt_printf(out, "bytes seen: "); + prt_printf(out, "keys moved:\t%llu\n", atomic64_read(&stats->keys_moved)); + prt_printf(out, "keys raced:\t%llu\n", atomic64_read(&stats->keys_raced)); + prt_printf(out, "bytes seen:\t"); prt_human_readable_u64(out, atomic64_read(&stats->sectors_seen) << 9); prt_newline(out); - prt_printf(out, "bytes moved: "); + prt_printf(out, "bytes moved:\t"); prt_human_readable_u64(out, atomic64_read(&stats->sectors_moved) << 9); prt_newline(out); - prt_printf(out, "bytes raced: "); + prt_printf(out, "bytes raced:\t"); prt_human_readable_u64(out, atomic64_read(&stats->sectors_raced) << 9); prt_newline(out); @@ -1270,7 +1270,8 @@ void bch2_move_stats_to_text(struct printbuf *out, struct bch_move_stats *stats) static void bch2_moving_ctxt_to_text(struct printbuf *out, struct bch_fs *c, struct moving_context *ctxt) { - struct moving_io *io; + if (!out->nr_tabstops) + printbuf_tabstop_push(out, 32); bch2_move_stats_to_text(out, ctxt->stats); printbuf_indent_add(out, 2); @@ -1290,6 +1291,7 @@ static void bch2_moving_ctxt_to_text(struct printbuf *out, struct bch_fs *c, str printbuf_indent_add(out, 2); mutex_lock(&ctxt->lock); + struct moving_io *io; list_for_each_entry(io, &ctxt->ios, io_list) bch2_data_update_inflight_to_text(out, &io->write); mutex_unlock(&ctxt->lock); diff --git a/fs/bcachefs/movinggc.c b/fs/bcachefs/movinggc.c index fa19fc44622c..5126c870ce5b 100644 --- a/fs/bcachefs/movinggc.c +++ b/fs/bcachefs/movinggc.c @@ -317,6 +317,17 @@ void bch2_copygc_wait_to_text(struct printbuf *out, struct bch_fs *c) prt_printf(out, "Currently calculated wait:\t"); prt_human_readable_u64(out, bch2_copygc_wait_amount(c)); prt_newline(out); + + rcu_read_lock(); + struct task_struct *t = rcu_dereference(c->copygc_thread); + if (t) + get_task_struct(t); + rcu_read_unlock(); + + if (t) { + bch2_prt_task_backtrace(out, t, 0, GFP_KERNEL); + put_task_struct(t); + } } static int bch2_copygc_thread(void *arg) diff --git a/fs/bcachefs/rebalance.c b/fs/bcachefs/rebalance.c index 58f6d97e506c..8b6795ec82f6 100644 --- a/fs/bcachefs/rebalance.c +++ b/fs/bcachefs/rebalance.c @@ -590,8 +590,19 @@ static int bch2_rebalance_thread(void *arg) void bch2_rebalance_status_to_text(struct printbuf *out, struct bch_fs *c) { + printbuf_tabstop_push(out, 32); + struct bch_fs_rebalance *r = &c->rebalance; + /* print pending work */ + struct disk_accounting_pos acc = { .type = BCH_DISK_ACCOUNTING_rebalance_work, }; + u64 v; + bch2_accounting_mem_read(c, disk_accounting_pos_to_bpos(&acc), &v, 1); + + prt_printf(out, "pending work:\t"); + prt_human_readable_u64(out, v); + prt_printf(out, "\n\n"); + prt_str(out, bch2_rebalance_state_strs[r->state]); prt_newline(out); printbuf_indent_add(out, 2); @@ -600,15 +611,15 @@ void bch2_rebalance_status_to_text(struct printbuf *out, struct bch_fs *c) case BCH_REBALANCE_waiting: { u64 now = atomic64_read(&c->io_clock[WRITE].now); - prt_str(out, "io wait duration: "); + prt_printf(out, "io wait duration:\t"); bch2_prt_human_readable_s64(out, (r->wait_iotime_end - r->wait_iotime_start) << 9); prt_newline(out); - prt_str(out, "io wait remaining: "); + prt_printf(out, "io wait remaining:\t"); bch2_prt_human_readable_s64(out, (r->wait_iotime_end - now) << 9); prt_newline(out); - prt_str(out, "duration waited: "); + prt_printf(out, "duration waited:\t"); bch2_pr_time_units(out, ktime_get_real_ns() - r->wait_wallclock_start); prt_newline(out); break; @@ -621,6 +632,18 @@ void bch2_rebalance_status_to_text(struct printbuf *out, struct bch_fs *c) break; } prt_newline(out); + + rcu_read_lock(); + struct task_struct *t = rcu_dereference(c->rebalance.thread); + if (t) + get_task_struct(t); + rcu_read_unlock(); + + if (t) { + bch2_prt_task_backtrace(out, t, 0, GFP_KERNEL); + put_task_struct(t); + } + printbuf_indent_sub(out, 2); } -- 2.51.0 From 7c1e2a254fbc023df8d681946bab69cd68a4bde6 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 14 Mar 2025 18:19:17 -0400 Subject: [PATCH 05/16] bcachefs: Add a cond_resched() to btree cache teardown [12308.606480] watchdog: BUG: soft lockup - CPU#18 stuck for 26s! [umount:48479] [12308.606485] Modules linked in: bcachefs lz4hc_compress lz4_compress lz4_decompress sunrpc overlay nf_conntrack_netlink xt_nat xt_tcpudp veth xt_conntrack xt_MASQUERADE bridge stp llc xfrm_user ip6table_nat ip6table_filter ip6_tables iptable_nat xt_addrtype iptable_filter ip_tables x_tables nfnetlink_cttimeout nfnetlink openvswitch nsh nf_conncount nf_nat nf_conntrack nf_defrag_ipv6 nf_defrag_ipv4 psample ext4 mbcache jbd2 nls_iso8859_1 nls_cp850 vfat fat binfmt_misc skx_edac_common nfit edac_core libnvdimm cbc encrypted_keys intel_rapl_msr intel_rapl_common intel_uncore_frequency intel_uncore_frequency_common ipmi_ssif x86_pkg_temp_thermal intel_powerclamp kvm_intel kvm drivetemp rapl intel_cstate coretemp mgag200 i2c_algo_bit ixgbe drm_shmem_helper drm_kms_helper mdio_devres xfrm_algo mdio drm ptp intel_uncore mei_me efi_pstore evdev uas pl2303 pps_core libphy usb_storage usbserial lpc_ich mei drm_panel_orientation_quirks acpi_power_meter tiny_power_button ipmi_si mfd_core intel_pch_thermal acpi_tad acpi_ipmi ioatdma [12308.606541] ipmi_devintf ipmi_msghandler dca wmi button efivarfs polyval_clmulni polyval_generic ghash_clmulni_intel sha512_ssse3 sha256_ssse3 sha1_ssse3 sha1_generic xhci_pci xhci_hcd aesni_intel ehci_pci ehci_hcd gf128mul crypto_simd cryptd usbcore hpwdt usb_common [12308.606557] CPU: 18 UID: 0 PID: 48479 Comm: umount Tainted: G L 6.14.0-rc6-x86_64-00159-ga09496a03e63 #1 [12308.606560] Tainted: [L]=SOFTLOCKUP [12308.606561] Hardware name: HPE ProLiant DL380 Gen10/ProLiant DL380 Gen10, BIOS U30 07/20/2023 [12308.606563] RIP: 0010:clear_page_erms+0x7/0x10 [12308.606570] Code: 48 89 47 38 48 8d 7f 40 75 d9 90 c3 cc cc cc cc 0f 1f 00 90 90 90 90 90 90 90 90 90 90 90 90 90 90 90 90 b9 00 10 00 00 31 c0 aa c3 cc cc cc cc 66 90 90 90 90 90 90 90 90 90 90 90 90 90 90 [12308.606572] RSP: 0018:ffff9ed5b622fba0 EFLAGS: 00010246 [12308.606574] RAX: 0000000000000000 RBX: ffff90347fffe6c0 RCX: 00000000000004c0 [12308.606575] RDX: ffffe34ea9bec1c0 RSI: 00000000000405f0 RDI: ffff902eafb07b40 [12308.606576] RBP: ffff9ed5b622fbf0 R08: 0000000000000001 R09: 0000000000000006 [12308.606577] R10: 0000000000040001 R11: 0000000000000000 R12: ffffe34ea9bec000 [12308.606578] R13: 0000000000000000 R14: 0000000000000006 R15: ffffe34ea9bed000 [12308.606580] FS: 00007fe704ecfb68(0000) GS:ffff9053fea00000(0000) knlGS:0000000000000000 [12308.606581] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [12308.606582] CR2: 00007f18159068ae CR3: 00000001314d0005 CR4: 00000000007726f0 [12308.606583] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [12308.606584] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 [12308.606584] PKRU: 55555554 [12308.606585] Call Trace: [12308.606587] [12308.606590] ? show_regs.cold+0x19/0x28 [12308.606595] ? watchdog_timer_fn.cold+0x3d/0x9d [12308.606598] ? __pfx_watchdog_timer_fn+0x10/0x10 [12308.606602] ? __hrtimer_run_queues+0x12e/0x250 [12308.606607] ? hrtimer_interrupt+0xfd/0x220 [12308.606609] ? __sysvec_apic_timer_interrupt+0x53/0xe0 [12308.606614] ? sysvec_apic_timer_interrupt+0x76/0xa0 [12308.606619] [12308.606620] [12308.606620] ? asm_sysvec_apic_timer_interrupt+0x1b/0x20 [12308.606626] ? clear_page_erms+0x7/0x10 [12308.606628] ? __free_pages_ok+0x374/0x640 [12308.606633] free_frozen_pages+0x34/0x570 [12308.606636] __folio_put+0x87/0xe0 [12308.606641] free_large_kmalloc+0x70/0x80 [12308.606645] kfree+0x2f6/0x390 [12308.606648] kvfree+0x2d/0x40 [12308.606653] __btree_node_data_free+0xaf/0xf0 [bcachefs] [12308.606726] btree_node_data_free+0x6a/0x80 [bcachefs] [12308.606778] bch2_fs_btree_cache_exit+0x262/0x440 [bcachefs] [12308.606829] bch2_fs_release+0xe8/0x340 [bcachefs] [12308.606905] kobject_put+0x60/0xc0 [12308.606908] bch2_fs_free+0xdd/0x120 [bcachefs] [12308.606981] bch2_kill_sb+0x1e/0x30 [bcachefs] [12308.607051] deactivate_locked_super+0x32/0xb0 [12308.607055] deactivate_super+0x40/0x50 [12308.607057] cleanup_mnt+0xc3/0x160 [12308.607060] __cleanup_mnt+0x12/0x20 [12308.607062] task_work_run+0x5f/0xa0 [12308.607064] syscall_exit_to_user_mode+0x194/0x1a0 [12308.607066] do_syscall_64+0x67/0x170 [12308.607068] entry_SYSCALL_64_after_hwframe+0x76/0x7e [12308.607070] RIP: 0033:0x7fe704e66eed [12308.607073] Code: 08 49 89 ca b8 a5 00 00 00 0f 05 48 89 c7 e8 8a e6 ff ff 48 83 c4 Reported-by: Stijn Tintel Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_cache.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c index 1ec1f90e0eb3..54666027aa85 100644 --- a/fs/bcachefs/btree_cache.c +++ b/fs/bcachefs/btree_cache.c @@ -610,6 +610,7 @@ void bch2_fs_btree_cache_exit(struct bch_fs *c) btree_node_write_in_flight(b)); btree_node_data_free(bc, b); + cond_resched(); } BUG_ON(!bch2_journal_error(&c->journal) && -- 2.51.0 From 9ec00891493d3e4f60678ed12988761538f95bd1 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 13 Mar 2025 00:47:51 -0400 Subject: [PATCH 06/16] bcachefs: bch2_bkey_ptrs_rebalance_opts() Small optimization for bch2_bkey_sectors_need_rebalance() Signed-off-by: Kent Overstreet --- fs/bcachefs/rebalance.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/fs/bcachefs/rebalance.c b/fs/bcachefs/rebalance.c index 8b6795ec82f6..29a569384146 100644 --- a/fs/bcachefs/rebalance.c +++ b/fs/bcachefs/rebalance.c @@ -26,9 +26,8 @@ /* bch_extent_rebalance: */ -static const struct bch_extent_rebalance *bch2_bkey_rebalance_opts(struct bkey_s_c k) +static const struct bch_extent_rebalance *bch2_bkey_ptrs_rebalance_opts(struct bkey_ptrs_c ptrs) { - struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); const union bch_extent_entry *entry; bkey_extent_entry_for_each(ptrs, entry) @@ -38,6 +37,11 @@ static const struct bch_extent_rebalance *bch2_bkey_rebalance_opts(struct bkey_s return NULL; } +static const struct bch_extent_rebalance *bch2_bkey_rebalance_opts(struct bkey_s_c k) +{ + return bch2_bkey_ptrs_rebalance_opts(bch2_bkey_ptrs_c(k)); +} + static inline unsigned bch2_bkey_ptrs_need_compress(struct bch_fs *c, struct bch_io_opts *opts, struct bkey_s_c k, @@ -97,11 +101,12 @@ static unsigned bch2_bkey_ptrs_need_rebalance(struct bch_fs *c, u64 bch2_bkey_sectors_need_rebalance(struct bch_fs *c, struct bkey_s_c k) { - const struct bch_extent_rebalance *opts = bch2_bkey_rebalance_opts(k); + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); + + const struct bch_extent_rebalance *opts = bch2_bkey_ptrs_rebalance_opts(ptrs); if (!opts) return 0; - struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); const union bch_extent_entry *entry; struct extent_ptr_decoded p; u64 sectors = 0; -- 2.51.0 From 6d80fca9efe9255369aa91e85e8f3367c42acdde Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 10 Mar 2025 11:54:13 -0400 Subject: [PATCH 07/16] bcachefs: Don't create bch_io_failures unless it's needed Only needed in retry path, no point in wasting stack space. Signed-off-by: Kent Overstreet --- fs/bcachefs/io_read.h | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/fs/bcachefs/io_read.h b/fs/bcachefs/io_read.h index 73275da5d2c4..6bdb8efb4cd1 100644 --- a/fs/bcachefs/io_read.h +++ b/fs/bcachefs/io_read.h @@ -147,13 +147,11 @@ void __bch2_read(struct bch_fs *, struct bch_read_bio *, struct bvec_iter, static inline void bch2_read(struct bch_fs *c, struct bch_read_bio *rbio, subvol_inum inum) { - struct bch_io_failures failed = { .nr = 0 }; - BUG_ON(rbio->_state); rbio->subvol = inum.subvol; - __bch2_read(c, rbio, rbio->bio.bi_iter, inum, &failed, + __bch2_read(c, rbio, rbio->bio.bi_iter, inum, NULL, BCH_READ_retry_if_stale| BCH_READ_may_promote| BCH_READ_user_mapped); -- 2.51.0 From 5a06cb8000addbbfd1f9ce6891098da5b48d3d1e Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 8 Mar 2025 18:42:56 -0500 Subject: [PATCH 08/16] bcachefs: Debug params for data corruption injection dm-flakey is busted, and this is simpler anyways - this lets us test the checksum error retry ptahs Signed-off-by: Kent Overstreet --- fs/bcachefs/io_read.c | 8 ++++++++ fs/bcachefs/io_write.c | 24 ++++++++++++++++++++++++ fs/bcachefs/util.c | 21 +++++++++++++++++++++ fs/bcachefs/util.h | 12 ++++++++++++ 4 files changed, 65 insertions(+) diff --git a/fs/bcachefs/io_read.c b/fs/bcachefs/io_read.c index 70e5c5a32d01..d39f321b51fc 100644 --- a/fs/bcachefs/io_read.c +++ b/fs/bcachefs/io_read.c @@ -27,6 +27,12 @@ #include +#ifdef CONFIG_BCACHEFS_DEBUG +static unsigned bch2_read_corrupt_ratio; +module_param_named(read_corrupt_ratio, bch2_read_corrupt_ratio, uint, 0644); +MODULE_PARM_DESC(read_corrupt_ratio, ""); +#endif + #ifndef CONFIG_BCACHEFS_NO_LATENCY_ACCT static bool bch2_target_congested(struct bch_fs *c, u16 target) @@ -688,6 +694,8 @@ static void __bch2_read_endio(struct work_struct *work) src->bi_iter = rbio->bvec_iter; } + bch2_maybe_corrupt_bio(src, bch2_read_corrupt_ratio); + csum = bch2_checksum_bio(c, crc.csum_type, nonce, src); bool csum_good = !bch2_crc_cmp(csum, rbio->pick.crc.csum) || c->opts.no_data_io; diff --git a/fs/bcachefs/io_write.c b/fs/bcachefs/io_write.c index dbfcb28f003d..48befbae0226 100644 --- a/fs/bcachefs/io_write.c +++ b/fs/bcachefs/io_write.c @@ -34,6 +34,12 @@ #include #include +#ifdef CONFIG_BCACHEFS_DEBUG +static unsigned bch2_write_corrupt_ratio; +module_param_named(write_corrupt_ratio, bch2_write_corrupt_ratio, uint, 0644); +MODULE_PARM_DESC(write_corrupt_ratio, ""); +#endif + #ifndef CONFIG_BCACHEFS_NO_LATENCY_ACCT static inline void bch2_congested_acct(struct bch_dev *ca, u64 io_latency, @@ -1005,6 +1011,15 @@ static int bch2_write_extent(struct bch_write_op *op, struct write_point *wp, bounce = true; } +#ifdef CONFIG_BCACHEFS_DEBUG + unsigned write_corrupt_ratio = READ_ONCE(bch2_write_corrupt_ratio); + if (!bounce && write_corrupt_ratio) { + dst = bch2_write_bio_alloc(c, wp, src, + &page_alloc_failed, + ec_buf); + bounce = true; + } +#endif saved_iter = dst->bi_iter; do { @@ -1114,6 +1129,14 @@ static int bch2_write_extent(struct bch_write_op *op, struct write_point *wp, init_append_extent(op, wp, version, crc); +#ifdef CONFIG_BCACHEFS_DEBUG + if (write_corrupt_ratio) { + swap(dst->bi_iter.bi_size, dst_len); + bch2_maybe_corrupt_bio(dst, write_corrupt_ratio); + swap(dst->bi_iter.bi_size, dst_len); + } +#endif + if (dst != src) bio_advance(dst, dst_len); bio_advance(src, src_len); @@ -1394,6 +1417,7 @@ retry: bio->bi_private = &op->cl; bio->bi_opf |= REQ_OP_WRITE; closure_get(&op->cl); + bch2_submit_wbio_replicas(to_wbio(bio), c, BCH_DATA_user, op->insert_keys.top, true); diff --git a/fs/bcachefs/util.c b/fs/bcachefs/util.c index a7edbcca1a84..553de8d8e3e5 100644 --- a/fs/bcachefs/util.c +++ b/fs/bcachefs/util.c @@ -704,6 +704,27 @@ void memcpy_from_bio(void *dst, struct bio *src, struct bvec_iter src_iter) } } +#ifdef CONFIG_BCACHEFS_DEBUG +void bch2_corrupt_bio(struct bio *bio) +{ + struct bvec_iter iter; + struct bio_vec bv; + unsigned offset = get_random_u32_below(bio->bi_iter.bi_size / sizeof(u64)); + + bio_for_each_segment(bv, bio, iter) { + unsigned u64s = bv.bv_len / sizeof(u64); + + if (offset < u64s) { + u64 *segment = bvec_kmap_local(&bv); + segment[offset] = get_random_u64(); + kunmap_local(segment); + return; + } + offset -= u64s; + } +} +#endif + #if 0 void eytzinger1_test(void) { diff --git a/fs/bcachefs/util.h b/fs/bcachefs/util.h index f4a4783219d9..d41e133acc4d 100644 --- a/fs/bcachefs/util.h +++ b/fs/bcachefs/util.h @@ -406,6 +406,18 @@ u64 bch2_get_random_u64_below(u64); void memcpy_to_bio(struct bio *, struct bvec_iter, const void *); void memcpy_from_bio(void *, struct bio *, struct bvec_iter); +#ifdef CONFIG_BCACHEFS_DEBUG +void bch2_corrupt_bio(struct bio *); + +static inline void bch2_maybe_corrupt_bio(struct bio *bio, unsigned ratio) +{ + if (ratio && !get_random_u32_below(ratio)) + bch2_corrupt_bio(bio); +} +#else +#define bch2_maybe_corrupt_bio(...) do {} while (0) +#endif + static inline void memcpy_u64s_small(void *dst, const void *src, unsigned u64s) { -- 2.51.0 From 943f0cfb1559ac6c9fc9082998f20dfe2aa01a74 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 7 Mar 2025 17:20:22 -0500 Subject: [PATCH 09/16] bcachefs: Convert read path to standard error codes Kill the READ_ERR/READ_RETRY/READ_RETRY_AVOID enums, and add standard error codes that describe precisely which error occured. This is going to be used for the data move path, to move but poison extents with checksum errors. Signed-off-by: Kent Overstreet --- fs/bcachefs/errcode.h | 13 ++++++ fs/bcachefs/io_read.c | 93 ++++++++++++++++++++++++------------------- fs/bcachefs/io_read.h | 4 +- 3 files changed, 67 insertions(+), 43 deletions(-) diff --git a/fs/bcachefs/errcode.h b/fs/bcachefs/errcode.h index e14e0d1cc93d..5050d978624b 100644 --- a/fs/bcachefs/errcode.h +++ b/fs/bcachefs/errcode.h @@ -282,6 +282,19 @@ x(EIO, EIO_fault_injected) \ x(EIO, ec_block_read) \ x(EIO, ec_block_write) \ + x(EIO, data_read) \ + x(BCH_ERR_data_read, data_read_retry) \ + x(BCH_ERR_data_read_retry, data_read_retry_avoid) \ + x(BCH_ERR_data_read_retry_avoid,data_read_retry_device_offline) \ + x(BCH_ERR_data_read_retry_avoid,data_read_retry_io_err) \ + x(BCH_ERR_data_read_retry_avoid,data_read_retry_ec_reconstruct_err) \ + x(BCH_ERR_data_read_retry_avoid,data_read_retry_csum_err) \ + x(BCH_ERR_data_read_retry, data_read_retry_csum_err_maybe_userspace)\ + x(BCH_ERR_data_read, data_read_decompress_err) \ + x(BCH_ERR_data_read, data_read_decrypt_err) \ + x(BCH_ERR_data_read, data_read_ptr_stale_race) \ + x(BCH_ERR_data_read_retry, data_read_ptr_stale_retry) \ + x(BCH_ERR_data_read, data_read_no_encryption_key) \ x(BCH_ERR_btree_node_read_err, btree_node_read_err_fixable) \ x(BCH_ERR_btree_node_read_err, btree_node_read_err_want_retry) \ x(BCH_ERR_btree_node_read_err, btree_node_read_err_must_retry) \ diff --git a/fs/bcachefs/io_read.c b/fs/bcachefs/io_read.c index d39f321b51fc..797c29bde9b6 100644 --- a/fs/bcachefs/io_read.c +++ b/fs/bcachefs/io_read.c @@ -347,10 +347,6 @@ static void bch2_read_err_msg(struct bch_fs *c, struct printbuf *out, bch2_trans_run(c, bch2_read_err_msg_trans(trans, out, rbio, read_pos)); } -#define READ_RETRY_AVOID 1 -#define READ_RETRY 2 -#define READ_ERR 3 - enum rbio_context { RBIO_CONTEXT_NULL, RBIO_CONTEXT_HIGHPRI, @@ -452,7 +448,7 @@ retry: err: bch2_trans_iter_exit(trans, &iter); - if (ret == READ_RETRY) + if (bch2_err_matches(ret, BCH_ERR_data_read_retry)) goto retry; if (ret) rbio->bio.bi_status = BLK_STS_IOERR; @@ -479,11 +475,13 @@ static void bch2_rbio_retry(struct work_struct *work) this_cpu_add(c->counters[BCH_COUNTER_io_read_retry], bvec_iter_sectors(rbio->bvec_iter)); - if (rbio->retry == READ_RETRY_AVOID) + if (bch2_err_matches(rbio->ret, BCH_ERR_data_read_retry_avoid)) bch2_mark_io_failure(&failed, &rbio->pick); - if (!rbio->split) - rbio->bio.bi_status = 0; + if (!rbio->split) { + rbio->bio.bi_status = 0; + rbio->ret = 0; + } rbio = bch2_rbio_free(rbio); @@ -498,23 +496,29 @@ static void bch2_rbio_retry(struct work_struct *work) __bch2_read(c, rbio, iter, inum, &failed, flags); } -static void bch2_rbio_error(struct bch_read_bio *rbio, int retry, - blk_status_t error) +static void bch2_rbio_error(struct bch_read_bio *rbio, + int ret, blk_status_t blk_error) { - rbio->retry = retry; - rbio->saw_error = true; + BUG_ON(ret >= 0); + + rbio->ret = ret; + rbio->bio.bi_status = blk_error; + + bch2_rbio_parent(rbio)->saw_error = true; if (rbio->flags & BCH_READ_in_retry) return; - if (retry == READ_ERR) { + if (bch2_err_matches(ret, BCH_ERR_data_read_retry)) { + bch2_rbio_punt(rbio, bch2_rbio_retry, + RBIO_CONTEXT_UNBOUND, system_unbound_wq); + } else { rbio = bch2_rbio_free(rbio); - rbio->bio.bi_status = error; + rbio->ret = ret; + rbio->bio.bi_status = blk_error; + bch2_rbio_done(rbio); - } else { - bch2_rbio_punt(rbio, bch2_rbio_retry, - RBIO_CONTEXT_UNBOUND, system_unbound_wq); } } @@ -536,7 +540,7 @@ static void bch2_read_io_err(struct work_struct *work) bch_err_ratelimited(c, "%s", buf.buf); printbuf_exit(&buf); - bch2_rbio_error(rbio, READ_RETRY_AVOID, bio->bi_status); + bch2_rbio_error(rbio, -BCH_ERR_data_read_retry_io_err, bio->bi_status); } static int __bch2_rbio_narrow_crcs(struct btree_trans *trans, @@ -623,7 +627,7 @@ static void bch2_read_csum_err(struct work_struct *work) else bch_err_ratelimited(c, "%s", buf.buf); - bch2_rbio_error(rbio, READ_RETRY_AVOID, BLK_STS_IOERR); + bch2_rbio_error(rbio, -BCH_ERR_data_read_retry_csum_err, BLK_STS_IOERR); printbuf_exit(&buf); } @@ -643,7 +647,7 @@ static void bch2_read_decompress_err(struct work_struct *work) else bch_err_ratelimited(c, "%s", buf.buf); - bch2_rbio_error(rbio, READ_ERR, BLK_STS_IOERR); + bch2_rbio_error(rbio, -BCH_ERR_data_read_decompress_err, BLK_STS_IOERR); printbuf_exit(&buf); } @@ -663,7 +667,7 @@ static void bch2_read_decrypt_err(struct work_struct *work) else bch_err_ratelimited(c, "%s", buf.buf); - bch2_rbio_error(rbio, READ_ERR, BLK_STS_IOERR); + bch2_rbio_error(rbio, -BCH_ERR_data_read_decrypt_err, BLK_STS_IOERR); printbuf_exit(&buf); } @@ -706,7 +710,8 @@ static void __bch2_read_endio(struct work_struct *work) */ if (!csum_good && !rbio->bounce && (rbio->flags & BCH_READ_user_mapped)) { rbio->flags |= BCH_READ_must_bounce; - bch2_rbio_error(rbio, READ_RETRY, BLK_STS_IOERR); + bch2_rbio_error(rbio, -BCH_ERR_data_read_retry_csum_err_maybe_userspace, + BLK_STS_IOERR); goto out; } @@ -820,9 +825,9 @@ static void bch2_read_endio(struct bio *bio) trace_and_count(c, io_read_reuse_race, &rbio->bio); if (rbio->flags & BCH_READ_retry_if_stale) - bch2_rbio_error(rbio, READ_RETRY, BLK_STS_AGAIN); + bch2_rbio_error(rbio, -BCH_ERR_data_read_ptr_stale_retry, BLK_STS_AGAIN); else - bch2_rbio_error(rbio, READ_ERR, BLK_STS_AGAIN); + bch2_rbio_error(rbio, -BCH_ERR_data_read_ptr_stale_race, BLK_STS_AGAIN); return; } @@ -895,7 +900,7 @@ int __bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *orig, struct bch_read_bio *rbio = NULL; bool bounce = false, read_full = false, narrow_crcs = false; struct bpos data_pos = bkey_start_pos(k.k); - int pick_ret; + int ret = 0; if (bkey_extent_is_inline_data(k.k)) { unsigned bytes = min_t(unsigned, iter.bi_size, @@ -911,16 +916,16 @@ int __bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *orig, goto out_read_done; } retry_pick: - pick_ret = bch2_bkey_pick_read_device(c, k, failed, &pick, dev); + ret = bch2_bkey_pick_read_device(c, k, failed, &pick, dev); /* hole or reservation - just zero fill: */ - if (!pick_ret) + if (!ret) goto hole; - if (unlikely(pick_ret < 0)) { + if (unlikely(ret < 0)) { struct printbuf buf = PRINTBUF; bch2_read_err_msg_trans(trans, &buf, orig, read_pos); - prt_printf(&buf, "no device to read from: %s\n ", bch2_err_str(pick_ret)); + prt_printf(&buf, "%s\n ", bch2_err_str(ret)); bch2_bkey_val_to_text(&buf, c, k); bch_err_ratelimited(c, "%s", buf.buf); @@ -936,6 +941,7 @@ retry_pick: bch_err_ratelimited(c, "%s", buf.buf); printbuf_exit(&buf); + ret = -BCH_ERR_data_read_no_encryption_key; goto err; } @@ -1071,7 +1077,7 @@ retry_pick: rbio->have_ioref = ca != NULL; rbio->narrow_crcs = narrow_crcs; rbio->hole = 0; - rbio->retry = 0; + rbio->ret = 0; rbio->context = 0; rbio->pick = pick; rbio->subvol = orig->subvol; @@ -1126,7 +1132,9 @@ retry_pick: bch_err_ratelimited(c, "%s", buf.buf); printbuf_exit(&buf); - bch2_rbio_error(rbio, READ_RETRY_AVOID, BLK_STS_IOERR); + bch2_rbio_error(rbio, + -BCH_ERR_data_read_retry_device_offline, + BLK_STS_IOERR); goto out; } @@ -1152,7 +1160,8 @@ retry_pick: } else { /* Attempting reconstruct read: */ if (bch2_ec_read_extent(trans, rbio, k)) { - bch2_rbio_error(rbio, READ_RETRY_AVOID, BLK_STS_IOERR); + bch2_rbio_error(rbio, -BCH_ERR_data_read_retry_ec_reconstruct_err, + BLK_STS_IOERR); goto out; } @@ -1170,13 +1179,11 @@ out: rbio->context = RBIO_CONTEXT_UNBOUND; bch2_read_endio(&rbio->bio); - ret = rbio->retry; + ret = rbio->ret; rbio = bch2_rbio_free(rbio); - if (ret == READ_RETRY_AVOID) { + if (bch2_err_matches(ret, BCH_ERR_data_read_retry_avoid)) bch2_mark_io_failure(failed, &pick); - ret = READ_RETRY; - } if (!ret) goto out_read_done; @@ -1186,9 +1193,10 @@ out: err: if (flags & BCH_READ_in_retry) - return READ_ERR; + return ret; - orig->bio.bi_status = BLK_STS_IOERR; + orig->bio.bi_status = BLK_STS_IOERR; + orig->ret = ret; goto out_read_done; hole: @@ -1285,8 +1293,7 @@ void __bch2_read(struct bch_fs *c, struct bch_read_bio *rbio, err: if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart) && - ret != READ_RETRY && - ret != READ_RETRY_AVOID) + !bch2_err_matches(ret, BCH_ERR_data_read_retry)) break; } @@ -1297,11 +1304,13 @@ err: lockrestart_do(trans, bch2_inum_offset_err_msg_trans(trans, &buf, inum, bvec_iter.bi_sector << 9)); - prt_printf(&buf, "read error %i from btree lookup", ret); + prt_printf(&buf, "read error %s from btree lookup", bch2_err_str(ret)); bch_err_ratelimited(c, "%s", buf.buf); printbuf_exit(&buf); - rbio->bio.bi_status = BLK_STS_IOERR; + rbio->bio.bi_status = BLK_STS_IOERR; + rbio->ret = ret; + bch2_rbio_done(rbio); } diff --git a/fs/bcachefs/io_read.h b/fs/bcachefs/io_read.h index 6bdb8efb4cd1..1eb01e9847d7 100644 --- a/fs/bcachefs/io_read.h +++ b/fs/bcachefs/io_read.h @@ -42,11 +42,11 @@ struct bch_read_bio { narrow_crcs:1, hole:1, saw_error:1, - retry:2, context:2; }; u16 _state; }; + s16 ret; struct extent_ptr_decoded pick; @@ -164,6 +164,7 @@ static inline struct bch_read_bio *rbio_init_fragment(struct bio *bio, rbio->c = orig->c; rbio->_state = 0; + rbio->ret = 0; rbio->split = true; rbio->parent = orig; rbio->opts = orig->opts; @@ -180,6 +181,7 @@ static inline struct bch_read_bio *rbio_init(struct bio *bio, rbio->start_time = local_clock(); rbio->c = c; rbio->_state = 0; + rbio->ret = 0; rbio->opts = opts; rbio->bio.bi_end_io = end_io; return rbio; -- 2.51.0 From e75993b0bf8baa48b2e96d693852191f63b615fd Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 11 Mar 2025 09:04:09 -0400 Subject: [PATCH 10/16] bcachefs: Fix BCH_ERR_data_read_csum_err_maybe_userspace in retry path When we do a read to a buffer that's mapped into userspace, it's possible to get a spurious checksum error if userspace was modified the buffer at the same time. When we retry those, they have to be bounced before we know definitively whether we're reading corrupt data. But the retry path propagates read flags differently, so needs special handling. Signed-off-by: Kent Overstreet --- fs/bcachefs/io_read.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/fs/bcachefs/io_read.c b/fs/bcachefs/io_read.c index 797c29bde9b6..17bc413c27ba 100644 --- a/fs/bcachefs/io_read.c +++ b/fs/bcachefs/io_read.c @@ -1291,6 +1291,9 @@ void __bch2_read(struct bch_fs *c, struct bch_read_bio *rbio, swap(bvec_iter.bi_size, bytes); bio_advance_iter(&rbio->bio, &bvec_iter, bytes); err: + if (ret == -BCH_ERR_data_read_retry_csum_err_maybe_userspace) + flags |= BCH_READ_must_bounce; + if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart) && !bch2_err_matches(ret, BCH_ERR_data_read_retry)) -- 2.51.0 From f4b84bac20d6999db9e7db2254e63471c6c3fd9c Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 8 Mar 2025 11:24:22 -0500 Subject: [PATCH 11/16] bcachefs: Read error message now indicates if it was for an internal move Signed-off-by: Kent Overstreet --- fs/bcachefs/io_read.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/fs/bcachefs/io_read.c b/fs/bcachefs/io_read.c index 17bc413c27ba..a7865f34ea35 100644 --- a/fs/bcachefs/io_read.c +++ b/fs/bcachefs/io_read.c @@ -335,10 +335,17 @@ nopromote: static int bch2_read_err_msg_trans(struct btree_trans *trans, struct printbuf *out, struct bch_read_bio *rbio, struct bpos read_pos) { - return lockrestart_do(trans, + int ret = lockrestart_do(trans, bch2_inum_offset_err_msg_trans(trans, out, (subvol_inum) { rbio->subvol, read_pos.inode }, read_pos.offset << 9)); + if (ret) + return ret; + + if (rbio->flags & BCH_READ_data_update) + prt_str(out, "(internal move) "); + + return 0; } static void bch2_read_err_msg(struct bch_fs *c, struct printbuf *out, -- 2.51.0 From 881b598ef144a1dee3850be9e6b9ecfcfc5bf4b0 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 8 Mar 2025 11:37:51 -0500 Subject: [PATCH 12/16] bcachefs: BCH_ERR_data_read_buffer_too_small Now that the read path uses proper error codes, we can get rid of the weird rbio->hole signalling to the move path that the read didn't happen. Signed-off-by: Kent Overstreet --- fs/bcachefs/errcode.h | 2 ++ fs/bcachefs/io_read.c | 9 ++++----- fs/bcachefs/io_read.h | 1 - fs/bcachefs/move.c | 4 ++-- 4 files changed, 8 insertions(+), 8 deletions(-) diff --git a/fs/bcachefs/errcode.h b/fs/bcachefs/errcode.h index 5050d978624b..afa16d58041e 100644 --- a/fs/bcachefs/errcode.h +++ b/fs/bcachefs/errcode.h @@ -295,6 +295,8 @@ x(BCH_ERR_data_read, data_read_ptr_stale_race) \ x(BCH_ERR_data_read_retry, data_read_ptr_stale_retry) \ x(BCH_ERR_data_read, data_read_no_encryption_key) \ + x(BCH_ERR_data_read, data_read_buffer_too_small) \ + x(BCH_ERR_data_read, data_read_key_overwritten) \ x(BCH_ERR_btree_node_read_err, btree_node_read_err_fixable) \ x(BCH_ERR_btree_node_read_err, btree_node_read_err_want_retry) \ x(BCH_ERR_btree_node_read_err, btree_node_read_err_must_retry) \ diff --git a/fs/bcachefs/io_read.c b/fs/bcachefs/io_read.c index a7865f34ea35..d1497af58180 100644 --- a/fs/bcachefs/io_read.c +++ b/fs/bcachefs/io_read.c @@ -443,7 +443,7 @@ retry: if (!bkey_and_val_eq(k, bkey_i_to_s_c(u->k.k))) { /* extent we wanted to read no longer exists: */ - rbio->hole = true; + rbio->ret = -BCH_ERR_data_read_key_overwritten; goto err; } @@ -1000,10 +1000,10 @@ retry_pick: */ struct data_update *u = container_of(orig, struct data_update, rbio); if (pick.crc.compressed_size > u->op.wbio.bio.bi_iter.bi_size) { - BUG(); if (ca) percpu_ref_put(&ca->io_ref); - goto hole; + rbio->ret = -BCH_ERR_data_read_buffer_too_small; + goto out_read_done; } iter.bi_size = pick.crc.compressed_size << 9; @@ -1083,7 +1083,6 @@ retry_pick: rbio->flags = flags; rbio->have_ioref = ca != NULL; rbio->narrow_crcs = narrow_crcs; - rbio->hole = 0; rbio->ret = 0; rbio->context = 0; rbio->pick = pick; @@ -1215,7 +1214,7 @@ hole: * to read no longer exists we have to signal that: */ if (flags & BCH_READ_data_update) - orig->hole = true; + orig->ret = -BCH_ERR_data_read_key_overwritten; zero_fill_bio_iter(&orig->bio, iter); out_read_done: diff --git a/fs/bcachefs/io_read.h b/fs/bcachefs/io_read.h index 1eb01e9847d7..924406558f78 100644 --- a/fs/bcachefs/io_read.h +++ b/fs/bcachefs/io_read.h @@ -40,7 +40,6 @@ struct bch_read_bio { split:1, have_ioref:1, narrow_crcs:1, - hole:1, saw_error:1, context:2; }; diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c index f86fb8ad636a..307b918fa2e7 100644 --- a/fs/bcachefs/move.c +++ b/fs/bcachefs/move.c @@ -125,8 +125,8 @@ static void move_write(struct moving_io *io) &ctxt->stats->sectors_error_corrected); } - if (unlikely(io->write.rbio.bio.bi_status || - io->write.rbio.hole || + if (unlikely(io->write.rbio.ret || + io->write.rbio.bio.bi_status || io->write.data_opts.scrub)) { move_free(io); return; -- 2.51.0 From de73677ff8e677bf84a0eefa17b3913f65b57a74 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 8 Mar 2025 19:37:10 -0500 Subject: [PATCH 13/16] bcachefs: Return errors to top level bch2_rbio_retry() Next patch will be adding an additional retry loop for checksum errors, so that we can rule out transient errors before marking an extent as poisoned. Prerequisite to this is returning errors to bch2_rbio_retry(); this will also let us add a "successful retry" message. Signed-off-by: Kent Overstreet --- fs/bcachefs/io_read.c | 41 ++++++++++++++++++++++++++--------------- fs/bcachefs/io_read.h | 4 ++-- 2 files changed, 28 insertions(+), 17 deletions(-) diff --git a/fs/bcachefs/io_read.c b/fs/bcachefs/io_read.c index d1497af58180..e54103f79323 100644 --- a/fs/bcachefs/io_read.c +++ b/fs/bcachefs/io_read.c @@ -422,7 +422,7 @@ static void bch2_rbio_done(struct bch_read_bio *rbio) bio_endio(&rbio->bio); } -static noinline void bch2_read_retry_nodecode(struct bch_fs *c, struct bch_read_bio *rbio, +static noinline int bch2_read_retry_nodecode(struct bch_fs *c, struct bch_read_bio *rbio, struct bvec_iter bvec_iter, struct bch_io_failures *failed, unsigned flags) @@ -457,12 +457,16 @@ err: if (bch2_err_matches(ret, BCH_ERR_data_read_retry)) goto retry; - if (ret) - rbio->bio.bi_status = BLK_STS_IOERR; + + if (ret) { + rbio->bio.bi_status = BLK_STS_IOERR; + rbio->ret = ret; + } BUG_ON(atomic_read(&rbio->bio.__bi_remaining) != 1); - bch2_rbio_done(rbio); bch2_trans_put(trans); + + return ret; } static void bch2_rbio_retry(struct work_struct *work) @@ -497,10 +501,16 @@ static void bch2_rbio_retry(struct work_struct *work) flags &= ~BCH_READ_last_fragment; flags |= BCH_READ_must_clone; - if (flags & BCH_READ_data_update) - bch2_read_retry_nodecode(c, rbio, iter, &failed, flags); - else - __bch2_read(c, rbio, iter, inum, &failed, flags); + int ret = flags & BCH_READ_data_update + ? bch2_read_retry_nodecode(c, rbio, iter, &failed, flags) + : __bch2_read(c, rbio, iter, inum, &failed, flags); + + if (ret) { + rbio->ret = ret; + rbio->bio.bi_status = BLK_STS_IOERR; + } + + bch2_rbio_done(rbio); } static void bch2_rbio_error(struct bch_read_bio *rbio, @@ -1191,9 +1201,6 @@ out: if (bch2_err_matches(ret, BCH_ERR_data_read_retry_avoid)) bch2_mark_io_failure(failed, &pick); - if (!ret) - goto out_read_done; - return ret; } @@ -1218,12 +1225,13 @@ hole: zero_fill_bio_iter(&orig->bio, iter); out_read_done: - if (flags & BCH_READ_last_fragment) + if ((flags & BCH_READ_last_fragment) && + !(flags & BCH_READ_in_retry)) bch2_rbio_done(orig); return 0; } -void __bch2_read(struct bch_fs *c, struct bch_read_bio *rbio, +int __bch2_read(struct bch_fs *c, struct bch_read_bio *rbio, struct bvec_iter bvec_iter, subvol_inum inum, struct bch_io_failures *failed, unsigned flags) { @@ -1313,18 +1321,21 @@ err: lockrestart_do(trans, bch2_inum_offset_err_msg_trans(trans, &buf, inum, bvec_iter.bi_sector << 9)); - prt_printf(&buf, "read error %s from btree lookup", bch2_err_str(ret)); + prt_printf(&buf, "read error: %s", bch2_err_str(ret)); bch_err_ratelimited(c, "%s", buf.buf); printbuf_exit(&buf); rbio->bio.bi_status = BLK_STS_IOERR; rbio->ret = ret; - bch2_rbio_done(rbio); + if (!(flags & BCH_READ_in_retry)) + bch2_rbio_done(rbio); } bch2_trans_put(trans); bch2_bkey_buf_exit(&sk, c); + + return ret; } void bch2_fs_io_read_exit(struct bch_fs *c) diff --git a/fs/bcachefs/io_read.h b/fs/bcachefs/io_read.h index 924406558f78..42a22985d789 100644 --- a/fs/bcachefs/io_read.h +++ b/fs/bcachefs/io_read.h @@ -140,8 +140,8 @@ static inline void bch2_read_extent(struct btree_trans *trans, data_btree, k, offset_into_extent, NULL, flags, -1); } -void __bch2_read(struct bch_fs *, struct bch_read_bio *, struct bvec_iter, - subvol_inum, struct bch_io_failures *, unsigned flags); +int __bch2_read(struct bch_fs *, struct bch_read_bio *, struct bvec_iter, + subvol_inum, struct bch_io_failures *, unsigned flags); static inline void bch2_read(struct bch_fs *c, struct bch_read_bio *rbio, subvol_inum inum) -- 2.51.0 From ccba9029b01cdcc1aa6f3ed6375efdc0d779cc8f Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 8 Mar 2025 18:42:34 -0500 Subject: [PATCH 14/16] bcachefs: Print message on successful read retry Users have been asking for this, and now that errors are returned to the top level read retry path - we can. Signed-off-by: Kent Overstreet --- fs/bcachefs/io_read.c | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/fs/bcachefs/io_read.c b/fs/bcachefs/io_read.c index e54103f79323..887e3c9ac181 100644 --- a/fs/bcachefs/io_read.c +++ b/fs/bcachefs/io_read.c @@ -494,6 +494,9 @@ static void bch2_rbio_retry(struct work_struct *work) rbio->ret = 0; } + unsigned subvol = rbio->subvol; + struct bpos read_pos = rbio->read_pos; + rbio = bch2_rbio_free(rbio); flags |= BCH_READ_in_retry; @@ -508,6 +511,19 @@ static void bch2_rbio_retry(struct work_struct *work) if (ret) { rbio->ret = ret; rbio->bio.bi_status = BLK_STS_IOERR; + } else { + struct printbuf buf = PRINTBUF; + + bch2_trans_do(c, + bch2_inum_offset_err_msg_trans(trans, &buf, + (subvol_inum) { subvol, read_pos.inode }, + read_pos.offset << 9)); + if (rbio->flags & BCH_READ_data_update) + prt_str(&buf, "(internal move) "); + prt_str(&buf, "successful retry"); + + bch_err_ratelimited(c, "%s", buf.buf); + printbuf_exit(&buf); } bch2_rbio_done(rbio); -- 2.51.0 From be31e412ac01f49bf7afa8eaa93dac399914a0a1 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 8 Mar 2025 12:56:43 -0500 Subject: [PATCH 15/16] bcachefs: Checksum errors get additional retries It's possible for checksum errors to be transient - e.g. flakey controller or cable, thus we need additional retries (besides retrying from different replicas) before we can definitely return an error. This is particularly important for the next patch, which will allow the data move path to move extents with checksum errors - we don't want to accidentally introduce bitrot due to a transient error! - bch2_bkey_pick_read_device() is substantially reworked, and bch2_dev_io_failures is expanded to record more information about the type of failure (i.e. number of checksum errors). It now returns an error code that describes more precisely the reason for the failure - checksum error, io error, or offline device, instead of the previous generic "insufficient devices". This is important for the next patches that add poisoning, as we only want to poison extents when we've got real checksum errors (or perhaps IO errors?) - not because a device was offline. - Add a new option and superblock field for the number of checksum retries. Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs_format.h | 2 + fs/bcachefs/btree_io.c | 2 +- fs/bcachefs/errcode.h | 4 +- fs/bcachefs/extents.c | 165 ++++++++++++++++++++-------------- fs/bcachefs/extents.h | 7 +- fs/bcachefs/extents_types.h | 11 +-- fs/bcachefs/io_read.c | 10 ++- fs/bcachefs/opts.h | 5 ++ fs/bcachefs/super-io.c | 4 + 9 files changed, 128 insertions(+), 82 deletions(-) diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h index 7a5b0d211a82..e96d87767020 100644 --- a/fs/bcachefs/bcachefs_format.h +++ b/fs/bcachefs/bcachefs_format.h @@ -842,6 +842,7 @@ LE64_BITMASK(BCH_SB_SHARD_INUMS, struct bch_sb, flags[3], 28, 29); LE64_BITMASK(BCH_SB_INODES_USE_KEY_CACHE,struct bch_sb, flags[3], 29, 30); LE64_BITMASK(BCH_SB_JOURNAL_FLUSH_DELAY,struct bch_sb, flags[3], 30, 62); LE64_BITMASK(BCH_SB_JOURNAL_FLUSH_DISABLED,struct bch_sb, flags[3], 62, 63); +/* one free bit */ LE64_BITMASK(BCH_SB_JOURNAL_RECLAIM_DELAY,struct bch_sb, flags[4], 0, 32); LE64_BITMASK(BCH_SB_JOURNAL_TRANSACTION_NAMES,struct bch_sb, flags[4], 32, 33); LE64_BITMASK(BCH_SB_NOCOW, struct bch_sb, flags[4], 33, 34); @@ -861,6 +862,7 @@ LE64_BITMASK(BCH_SB_VERSION_INCOMPAT_ALLOWED, struct bch_sb, flags[5], 48, 64); LE64_BITMASK(BCH_SB_SHARD_INUMS_NBITS, struct bch_sb, flags[6], 0, 4); LE64_BITMASK(BCH_SB_WRITE_ERROR_TIMEOUT,struct bch_sb, flags[6], 4, 14); +LE64_BITMASK(BCH_SB_CSUM_ERR_RETRY_NR, struct bch_sb, flags[6], 14, 20); static inline __u64 BCH_SB_COMPRESSION_TYPE(const struct bch_sb *sb) { diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c index cd792fee7ee3..6abc9f17ea3c 100644 --- a/fs/bcachefs/btree_io.c +++ b/fs/bcachefs/btree_io.c @@ -1355,7 +1355,7 @@ start: percpu_ref_put(&ca->io_ref); rb->have_ioref = false; - bch2_mark_io_failure(&failed, &rb->pick); + bch2_mark_io_failure(&failed, &rb->pick, false); can_retry = bch2_bkey_pick_read_device(c, bkey_i_to_s_c(&b->key), diff --git a/fs/bcachefs/errcode.h b/fs/bcachefs/errcode.h index afa16d58041e..493cae4efc37 100644 --- a/fs/bcachefs/errcode.h +++ b/fs/bcachefs/errcode.h @@ -273,7 +273,6 @@ x(EIO, stripe_reconstruct) \ x(EIO, key_type_error) \ x(EIO, extent_poisened) \ - x(EIO, no_device_to_read_from) \ x(EIO, missing_indirect_extent) \ x(EIO, invalidate_stripe_to_dev) \ x(EIO, no_encryption_key) \ @@ -283,6 +282,9 @@ x(EIO, ec_block_read) \ x(EIO, ec_block_write) \ x(EIO, data_read) \ + x(BCH_ERR_data_read, no_device_to_read_from) \ + x(BCH_ERR_data_read, data_read_io_err) \ + x(BCH_ERR_data_read, data_read_csum_err) \ x(BCH_ERR_data_read, data_read_retry) \ x(BCH_ERR_data_read_retry, data_read_retry_avoid) \ x(BCH_ERR_data_read_retry_avoid,data_read_retry_device_offline) \ diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c index 032cd0bda017..04946d9911f5 100644 --- a/fs/bcachefs/extents.c +++ b/fs/bcachefs/extents.c @@ -58,7 +58,8 @@ struct bch_dev_io_failures *bch2_dev_io_failures(struct bch_io_failures *f, } void bch2_mark_io_failure(struct bch_io_failures *failed, - struct extent_ptr_decoded *p) + struct extent_ptr_decoded *p, + bool csum_error) { struct bch_dev_io_failures *f = bch2_dev_io_failures(failed, p->ptr.dev); @@ -66,17 +67,16 @@ void bch2_mark_io_failure(struct bch_io_failures *failed, BUG_ON(failed->nr >= ARRAY_SIZE(failed->devs)); f = &failed->devs[failed->nr++]; - f->dev = p->ptr.dev; - f->idx = p->idx; - f->nr_failed = 1; - f->nr_retries = 0; - } else if (p->idx != f->idx) { - f->idx = p->idx; - f->nr_failed = 1; - f->nr_retries = 0; - } else { - f->nr_failed++; + memset(f, 0, sizeof(*f)); + f->dev = p->ptr.dev; } + + if (p->do_ec_reconstruct) + f->failed_ec = true; + else if (!csum_error) + f->failed_io = true; + else + f->failed_csum_nr++; } static inline u64 dev_latency(struct bch_dev *ca) @@ -94,37 +94,30 @@ static inline int dev_failed(struct bch_dev *ca) */ static inline bool ptr_better(struct bch_fs *c, const struct extent_ptr_decoded p1, - const struct extent_ptr_decoded p2) + u64 p1_latency, + struct bch_dev *ca1, + const struct extent_ptr_decoded p2, + u64 p2_latency) { - if (likely(!p1.idx && !p2.idx)) { - struct bch_dev *ca1 = bch2_dev_rcu(c, p1.ptr.dev); - struct bch_dev *ca2 = bch2_dev_rcu(c, p2.ptr.dev); - - int failed_delta = dev_failed(ca1) - dev_failed(ca2); - - if (failed_delta) - return failed_delta < 0; + struct bch_dev *ca2 = bch2_dev_rcu(c, p2.ptr.dev); - u64 l1 = dev_latency(ca1); - u64 l2 = dev_latency(ca2); + int failed_delta = dev_failed(ca1) - dev_failed(ca2); + if (unlikely(failed_delta)) + return failed_delta < 0; - /* - * Square the latencies, to bias more in favor of the faster - * device - we never want to stop issuing reads to the slower - * device altogether, so that we can update our latency numbers: - */ - l1 *= l1; - l2 *= l2; + if (unlikely(bch2_force_reconstruct_read)) + return p1.do_ec_reconstruct > p2.do_ec_reconstruct; - /* Pick at random, biased in favor of the faster device: */ + if (unlikely(p1.do_ec_reconstruct || p2.do_ec_reconstruct)) + return p1.do_ec_reconstruct < p2.do_ec_reconstruct; - return bch2_get_random_u64_below(l1 + l2) > l1; - } + int crc_retry_delta = (int) p1.crc_retry_nr - (int) p2.crc_retry_nr; + if (unlikely(crc_retry_delta)) + return crc_retry_delta < 0; - if (bch2_force_reconstruct_read) - return p1.idx > p2.idx; + /* Pick at random, biased in favor of the faster device: */ - return p1.idx < p2.idx; + return bch2_get_random_u64_below(p1_latency + p2_latency) > p1_latency; } /* @@ -133,73 +126,109 @@ static inline bool ptr_better(struct bch_fs *c, * other devices, it will still pick a pointer from avoid. */ int bch2_bkey_pick_read_device(struct bch_fs *c, struct bkey_s_c k, - struct bch_io_failures *failed, - struct extent_ptr_decoded *pick, - int dev) + struct bch_io_failures *failed, + struct extent_ptr_decoded *pick, + int dev) { - struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); - const union bch_extent_entry *entry; - struct extent_ptr_decoded p; - struct bch_dev_io_failures *f; - int ret = 0; + bool have_csum_errors = false, have_io_errors = false, have_missing_devs = false; + bool have_dirty_ptrs = false, have_pick = false; if (k.k->type == KEY_TYPE_error) return -BCH_ERR_key_type_error; + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); + if (bch2_bkey_extent_ptrs_flags(ptrs) & BIT_ULL(BCH_EXTENT_FLAG_poisoned)) return -BCH_ERR_extent_poisened; rcu_read_lock(); + const union bch_extent_entry *entry; + struct extent_ptr_decoded p; + u64 pick_latency; + bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { + have_dirty_ptrs |= !p.ptr.cached; + /* * Unwritten extent: no need to actually read, treat it as a * hole and return 0s: */ if (p.ptr.unwritten) { - ret = 0; - break; + rcu_read_unlock(); + return 0; } /* Are we being asked to read from a specific device? */ if (dev >= 0 && p.ptr.dev != dev) continue; - /* - * If there are any dirty pointers it's an error if we can't - * read: - */ - if (!ret && !p.ptr.cached) - ret = -BCH_ERR_no_device_to_read_from; - struct bch_dev *ca = bch2_dev_rcu(c, p.ptr.dev); if (p.ptr.cached && (!ca || dev_ptr_stale_rcu(ca, &p.ptr))) continue; - f = failed ? bch2_dev_io_failures(failed, p.ptr.dev) : NULL; - if (f) - p.idx = f->nr_failed < f->nr_retries - ? f->idx - : f->idx + 1; + struct bch_dev_io_failures *f = + unlikely(failed) ? bch2_dev_io_failures(failed, p.ptr.dev) : NULL; + if (unlikely(f)) { + p.crc_retry_nr = f->failed_csum_nr; + p.has_ec &= ~f->failed_ec; - if (!p.idx && (!ca || !bch2_dev_is_online(ca))) - p.idx++; + if (ca && ca->mi.state != BCH_MEMBER_STATE_failed) { + have_io_errors |= f->failed_io; + have_io_errors |= f->failed_ec; + } + have_csum_errors |= !!f->failed_csum_nr; - if (!p.idx && p.has_ec && bch2_force_reconstruct_read) - p.idx++; + if (p.has_ec && (f->failed_io || f->failed_csum_nr)) + p.do_ec_reconstruct = true; + else if (f->failed_io || + f->failed_csum_nr > c->opts.checksum_err_retry_nr) + continue; + } - if (p.idx > (unsigned) p.has_ec) - continue; + have_missing_devs |= ca && !bch2_dev_is_online(ca); - if (ret > 0 && !ptr_better(c, p, *pick)) - continue; + if (!ca || !bch2_dev_is_online(ca)) { + if (!p.has_ec) + continue; + p.do_ec_reconstruct = true; + } - *pick = p; - ret = 1; + if (bch2_force_reconstruct_read && p.has_ec) + p.do_ec_reconstruct = true; + + u64 p_latency = dev_latency(ca); + /* + * Square the latencies, to bias more in favor of the faster + * device - we never want to stop issuing reads to the slower + * device altogether, so that we can update our latency numbers: + */ + p_latency *= p_latency; + + if (!have_pick || + ptr_better(c, + p, p_latency, ca, + *pick, pick_latency)) { + *pick = p; + pick_latency = p_latency; + have_pick = true; + } } rcu_read_unlock(); - return ret; + if (have_pick) + return 1; + if (!have_dirty_ptrs) + return 0; + if (have_missing_devs) + return -BCH_ERR_no_device_to_read_from; + if (have_csum_errors) + return -BCH_ERR_data_read_csum_err; + if (have_io_errors) + return -BCH_ERR_data_read_io_err; + + WARN_ONCE(1, "unhandled error case in %s\n", __func__); + return -EINVAL; } /* KEY_TYPE_btree_ptr: */ diff --git a/fs/bcachefs/extents.h b/fs/bcachefs/extents.h index c50c4f353bab..e78a39e7e18f 100644 --- a/fs/bcachefs/extents.h +++ b/fs/bcachefs/extents.h @@ -320,8 +320,9 @@ static inline struct bkey_ptrs bch2_bkey_ptrs(struct bkey_s k) ({ \ __label__ out; \ \ - (_ptr).idx = 0; \ - (_ptr).has_ec = false; \ + (_ptr).has_ec = false; \ + (_ptr).do_ec_reconstruct = false; \ + (_ptr).crc_retry_nr = 0; \ \ __bkey_extent_entry_for_each_from(_entry, _end, _entry) \ switch (__extent_entry_type(_entry)) { \ @@ -401,7 +402,7 @@ out: \ struct bch_dev_io_failures *bch2_dev_io_failures(struct bch_io_failures *, unsigned); void bch2_mark_io_failure(struct bch_io_failures *, - struct extent_ptr_decoded *); + struct extent_ptr_decoded *, bool); int bch2_bkey_pick_read_device(struct bch_fs *, struct bkey_s_c, struct bch_io_failures *, struct extent_ptr_decoded *, int); diff --git a/fs/bcachefs/extents_types.h b/fs/bcachefs/extents_types.h index 43d6c341ecca..e51529dca4c2 100644 --- a/fs/bcachefs/extents_types.h +++ b/fs/bcachefs/extents_types.h @@ -20,8 +20,9 @@ struct bch_extent_crc_unpacked { }; struct extent_ptr_decoded { - unsigned idx; bool has_ec; + bool do_ec_reconstruct; + u8 crc_retry_nr; struct bch_extent_crc_unpacked crc; struct bch_extent_ptr ptr; struct bch_extent_stripe_ptr ec; @@ -31,10 +32,10 @@ struct bch_io_failures { u8 nr; struct bch_dev_io_failures { u8 dev; - u8 idx; - u8 nr_failed; - u8 nr_retries; - } devs[BCH_REPLICAS_MAX]; + unsigned failed_csum_nr:6, + failed_io:1, + failed_ec:1; + } devs[BCH_REPLICAS_MAX + 1]; }; #endif /* _BCACHEFS_EXTENTS_TYPES_H */ diff --git a/fs/bcachefs/io_read.c b/fs/bcachefs/io_read.c index 887e3c9ac181..aecc645e6516 100644 --- a/fs/bcachefs/io_read.c +++ b/fs/bcachefs/io_read.c @@ -487,7 +487,8 @@ static void bch2_rbio_retry(struct work_struct *work) bvec_iter_sectors(rbio->bvec_iter)); if (bch2_err_matches(rbio->ret, BCH_ERR_data_read_retry_avoid)) - bch2_mark_io_failure(&failed, &rbio->pick); + bch2_mark_io_failure(&failed, &rbio->pick, + rbio->ret == -BCH_ERR_data_read_retry_csum_err); if (!rbio->split) { rbio->bio.bi_status = 0; @@ -991,7 +992,7 @@ retry_pick: ca && unlikely(dev_ptr_stale(ca, &pick.ptr))) { read_from_stale_dirty_pointer(trans, ca, k, pick.ptr); - bch2_mark_io_failure(failed, &pick); + bch2_mark_io_failure(failed, &pick, false); percpu_ref_put(&ca->io_ref); goto retry_pick; } @@ -1154,7 +1155,7 @@ retry_pick: else bch2_trans_unlock_long(trans); - if (!rbio->pick.idx) { + if (likely(!rbio->pick.do_ec_reconstruct)) { if (unlikely(!rbio->have_ioref)) { struct printbuf buf = PRINTBUF; bch2_read_err_msg_trans(trans, &buf, rbio, read_pos); @@ -1215,7 +1216,8 @@ out: rbio = bch2_rbio_free(rbio); if (bch2_err_matches(ret, BCH_ERR_data_read_retry_avoid)) - bch2_mark_io_failure(failed, &pick); + bch2_mark_io_failure(failed, &pick, + ret == -BCH_ERR_data_read_retry_csum_err); return ret; } diff --git a/fs/bcachefs/opts.h b/fs/bcachefs/opts.h index afb89d318d24..baa9c11acb1a 100644 --- a/fs/bcachefs/opts.h +++ b/fs/bcachefs/opts.h @@ -186,6 +186,11 @@ enum fsck_err_opts { OPT_STR(__bch2_csum_opts), \ BCH_SB_DATA_CSUM_TYPE, BCH_CSUM_OPT_crc32c, \ NULL, NULL) \ + x(checksum_err_retry_nr, u8, \ + OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ + OPT_UINT(0, 32), \ + BCH_SB_CSUM_ERR_RETRY_NR, 3, \ + NULL, NULL) \ x(compression, u8, \ OPT_FS|OPT_INODE|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ OPT_FN(bch2_opt_compression), \ diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c index ee32d043414a..f87e3bf33ec0 100644 --- a/fs/bcachefs/super-io.c +++ b/fs/bcachefs/super-io.c @@ -457,6 +457,10 @@ static int bch2_sb_validate(struct bch_sb_handle *disk_sb, if (!BCH_SB_WRITE_ERROR_TIMEOUT(sb)) SET_BCH_SB_WRITE_ERROR_TIMEOUT(sb, 30); + + if (le16_to_cpu(sb->version) <= bcachefs_metadata_version_extent_flags && + !BCH_SB_CSUM_ERR_RETRY_NR(sb)) + SET_BCH_SB_CSUM_ERR_RETRY_NR(sb, 3); } #ifdef __KERNEL__ -- 2.51.0 From 3fb8bacb14b6fb7a0040177bb7766f5c7bd68913 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 12 Mar 2025 16:56:09 -0400 Subject: [PATCH 16/16] bcachefs: BCH_READ_data_update -> bch_read_bio.data_update Read flags are codepath dependent and change as they're passed around, while the fields in rbio._state are mostly fixed properties of that particular object. Losing track of BCH_READ_data_update would be bad, and previously it was not obvious if it was always correctly set in the rbio, so this is a safety cleanup. Signed-off-by: Kent Overstreet --- fs/bcachefs/data_update.c | 1 + fs/bcachefs/io_read.c | 57 ++++++++++++++++++++++----------------- fs/bcachefs/io_read.h | 20 +++++++------- fs/bcachefs/move.c | 1 - 4 files changed, 44 insertions(+), 35 deletions(-) diff --git a/fs/bcachefs/data_update.c b/fs/bcachefs/data_update.c index 522574bc4197..44b8ed3cc5d6 100644 --- a/fs/bcachefs/data_update.c +++ b/fs/bcachefs/data_update.c @@ -700,6 +700,7 @@ int bch2_data_update_bios_init(struct data_update *m, struct bch_fs *c, } rbio_init(&m->rbio.bio, c, *io_opts, NULL); + m->rbio.data_update = true; m->rbio.bio.bi_iter.bi_size = buf_bytes; m->rbio.bio.bi_iter.bi_sector = bkey_start_offset(&m->k.k->k); m->op.wbio.bio.bi_ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0); diff --git a/fs/bcachefs/io_read.c b/fs/bcachefs/io_read.c index aecc645e6516..a4a61bce076a 100644 --- a/fs/bcachefs/io_read.c +++ b/fs/bcachefs/io_read.c @@ -103,14 +103,21 @@ static inline bool have_io_error(struct bch_io_failures *failed) return failed && failed->nr; } -static bool ptr_being_rewritten(struct bch_read_bio *orig, - unsigned dev, - unsigned flags) +static inline struct data_update *rbio_data_update(struct bch_read_bio *rbio) { - if (!(flags & BCH_READ_data_update)) + EBUG_ON(rbio->split); + + return rbio->data_update + ? container_of(rbio, struct data_update, rbio) + : NULL; +} + +static bool ptr_being_rewritten(struct bch_read_bio *orig, unsigned dev) +{ + struct data_update *u = rbio_data_update(orig); + if (!u) return false; - struct data_update *u = container_of(orig, struct data_update, rbio); struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(bkey_i_to_s_c(u->k.k)); unsigned i = 0; bkey_for_each_ptr(ptrs, ptr) { @@ -199,7 +206,6 @@ static struct bch_read_bio *__promote_alloc(struct btree_trans *trans, struct bpos pos, struct extent_ptr_decoded *pick, unsigned sectors, - unsigned flags, struct bch_read_bio *orig, struct bch_io_failures *failed) { @@ -220,7 +226,7 @@ static struct bch_read_bio *__promote_alloc(struct btree_trans *trans, unsigned ptr_bit = 1; bkey_for_each_ptr(ptrs, ptr) { if (bch2_dev_io_failures(failed, ptr->dev) && - !ptr_being_rewritten(orig, ptr->dev, flags)) + !ptr_being_rewritten(orig, ptr->dev)) update_opts.rewrite_ptrs |= ptr_bit; ptr_bit <<= 1; } @@ -314,7 +320,7 @@ static struct bch_read_bio *promote_alloc(struct btree_trans *trans, k.k->type == KEY_TYPE_reflink_v ? BTREE_ID_reflink : BTREE_ID_extents, - k, pos, pick, sectors, flags, orig, failed); + k, pos, pick, sectors, orig, failed); if (!promote) return NULL; @@ -342,7 +348,7 @@ static int bch2_read_err_msg_trans(struct btree_trans *trans, struct printbuf *o if (ret) return ret; - if (rbio->flags & BCH_READ_data_update) + if (rbio->data_update) prt_str(out, "(internal move) "); return 0; @@ -505,7 +511,7 @@ static void bch2_rbio_retry(struct work_struct *work) flags &= ~BCH_READ_last_fragment; flags |= BCH_READ_must_clone; - int ret = flags & BCH_READ_data_update + int ret = rbio->data_update ? bch2_read_retry_nodecode(c, rbio, iter, &failed, flags) : __bch2_read(c, rbio, iter, inum, &failed, flags); @@ -519,7 +525,7 @@ static void bch2_rbio_retry(struct work_struct *work) bch2_inum_offset_err_msg_trans(trans, &buf, (subvol_inum) { subvol, read_pos.inode }, read_pos.offset << 9)); - if (rbio->flags & BCH_READ_data_update) + if (rbio->data_update) prt_str(&buf, "(internal move) "); prt_str(&buf, "successful retry"); @@ -712,9 +718,10 @@ static void __bch2_read_endio(struct work_struct *work) container_of(work, struct bch_read_bio, work); struct bch_fs *c = rbio->c; struct bch_dev *ca = rbio->have_ioref ? bch2_dev_have_ref(c, rbio->pick.ptr.dev) : NULL; - struct bio *src = &rbio->bio; - struct bio *dst = &bch2_rbio_parent(rbio)->bio; - struct bvec_iter dst_iter = rbio->bvec_iter; + struct bch_read_bio *parent = bch2_rbio_parent(rbio); + struct bio *src = &rbio->bio; + struct bio *dst = &parent->bio; + struct bvec_iter dst_iter = rbio->bvec_iter; struct bch_extent_crc_unpacked crc = rbio->pick.crc; struct nonce nonce = extent_nonce(rbio->version, crc); unsigned nofs_flags; @@ -764,7 +771,7 @@ static void __bch2_read_endio(struct work_struct *work) if (unlikely(rbio->narrow_crcs)) bch2_rbio_narrow_crcs(rbio); - if (likely(!(rbio->flags & BCH_READ_data_update))) { + if (likely(!parent->data_update)) { /* Adjust crc to point to subset of data we want: */ crc.offset += rbio->offset_into_extent; crc.live_size = bvec_iter_sectors(rbio->bvec_iter); @@ -934,6 +941,7 @@ int __bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *orig, struct bch_read_bio *rbio = NULL; bool bounce = false, read_full = false, narrow_crcs = false; struct bpos data_pos = bkey_start_pos(k.k); + struct data_update *u = rbio_data_update(orig); int ret = 0; if (bkey_extent_is_inline_data(k.k)) { @@ -997,7 +1005,7 @@ retry_pick: goto retry_pick; } - if (!(flags & BCH_READ_data_update)) { + if (likely(!u)) { if (!(flags & BCH_READ_last_fragment) || bio_flagged(&orig->bio, BIO_CHAIN)) flags |= BCH_READ_must_clone; @@ -1020,12 +1028,10 @@ retry_pick: bounce = true; } } else { - read_full = true; /* * can happen if we retry, and the extent we were going to read * has been merged in the meantime: */ - struct data_update *u = container_of(orig, struct data_update, rbio); if (pick.crc.compressed_size > u->op.wbio.bio.bi_iter.bi_size) { if (ca) percpu_ref_put(&ca->io_ref); @@ -1034,6 +1040,7 @@ retry_pick: } iter.bi_size = pick.crc.compressed_size << 9; + read_full = true; } if (orig->opts.promote_target || have_io_error(failed)) @@ -1127,7 +1134,7 @@ retry_pick: if (rbio->bounce) trace_and_count(c, io_read_bounce, &rbio->bio); - if (!(flags & BCH_READ_data_update)) + if (!u) this_cpu_add(c->counters[BCH_COUNTER_io_read], bio_sectors(&rbio->bio)); else this_cpu_add(c->counters[BCH_COUNTER_io_move_read], bio_sectors(&rbio->bio)); @@ -1137,7 +1144,7 @@ retry_pick: * If it's being moved internally, we don't want to flag it as a cache * hit: */ - if (ca && pick.ptr.cached && !(flags & BCH_READ_data_update)) + if (ca && pick.ptr.cached && !u) bch2_bucket_io_time_reset(trans, pick.ptr.dev, PTR_BUCKET_NR(ca, &pick.ptr), READ); @@ -1234,11 +1241,11 @@ hole: this_cpu_add(c->counters[BCH_COUNTER_io_read_hole], bvec_iter_sectors(iter)); /* - * won't normally happen in the BCH_READ_data_update - * (bch2_move_extent()) path, but if we retry and the extent we wanted - * to read no longer exists we have to signal that: + * won't normally happen in the data update (bch2_move_extent()) path, + * but if we retry and the extent we wanted to read no longer exists we + * have to signal that: */ - if (flags & BCH_READ_data_update) + if (u) orig->ret = -BCH_ERR_data_read_key_overwritten; zero_fill_bio_iter(&orig->bio, iter); @@ -1259,7 +1266,7 @@ int __bch2_read(struct bch_fs *c, struct bch_read_bio *rbio, struct bkey_s_c k; int ret; - BUG_ON(flags & BCH_READ_data_update); + EBUG_ON(rbio->data_update); bch2_bkey_buf_init(&sk); bch2_trans_iter_init(trans, &iter, BTREE_ID_extents, diff --git a/fs/bcachefs/io_read.h b/fs/bcachefs/io_read.h index 42a22985d789..559d8986d84c 100644 --- a/fs/bcachefs/io_read.h +++ b/fs/bcachefs/io_read.h @@ -35,7 +35,8 @@ struct bch_read_bio { u16 flags; union { struct { - u16 promote:1, + u16 data_update:1, + promote:1, bounce:1, split:1, have_ioref:1, @@ -108,7 +109,6 @@ static inline int bch2_read_indirect_extent(struct btree_trans *trans, x(retry_if_stale) \ x(may_promote) \ x(user_mapped) \ - x(data_update) \ x(last_fragment) \ x(must_bounce) \ x(must_clone) \ @@ -161,12 +161,13 @@ static inline struct bch_read_bio *rbio_init_fragment(struct bio *bio, { struct bch_read_bio *rbio = to_rbio(bio); - rbio->c = orig->c; - rbio->_state = 0; - rbio->ret = 0; - rbio->split = true; - rbio->parent = orig; - rbio->opts = orig->opts; + rbio->c = orig->c; + rbio->_state = 0; + rbio->flags = 0; + rbio->ret = 0; + rbio->split = true; + rbio->parent = orig; + rbio->opts = orig->opts; return rbio; } @@ -180,7 +181,8 @@ static inline struct bch_read_bio *rbio_init(struct bio *bio, rbio->start_time = local_clock(); rbio->c = c; rbio->_state = 0; - rbio->ret = 0; + rbio->flags = 0; + rbio->ret = 0; rbio->opts = opts; rbio->bio.bi_end_io = end_io; return rbio; diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c index 307b918fa2e7..10843a2ebb88 100644 --- a/fs/bcachefs/move.c +++ b/fs/bcachefs/move.c @@ -359,7 +359,6 @@ int bch2_move_extent(struct moving_context *ctxt, bkey_start_pos(k.k), iter->btree_id, k, 0, NULL, - BCH_READ_data_update| BCH_READ_last_fragment, data_opts.scrub ? data_opts.read_dev : -1); return 0; -- 2.51.0