From f351d91edd507391518a4f5870185fa5bf38446b Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 23 May 2025 18:30:10 -0400 Subject: [PATCH 01/16] bcachefs: Fix allocate -> self healing path When we go to allocate and find that a bucket in the freespace btree is actually allocated, we're supposed to return nonzero to tell the allocator to skip it. This fixes an emergency read only due to a bucket/ptr gen mismatch - we also don't return the correct bucket gen when this happens. Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_background.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index a38b9c6c891e..173e81c2bbcb 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -1475,6 +1475,8 @@ delete: w->c = c; w->pos = BBPOS(iter->btree_id, iter->pos); queue_work(c->write_ref_wq, &w->work); + + ret = 1; /* don't allocate from this bucket */ goto out; } } -- 2.50.1 From cade003209cfe728de2ef880d5704cc322a7ce1f Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 23 May 2025 18:31:53 -0400 Subject: [PATCH 02/16] bcachefs: Fix opts.recovery_pass_last This was lost in the giant recovery pass rework - but it's used heavily by bcachefs subcommand utilities. Signed-off-by: Kent Overstreet --- fs/bcachefs/recovery_passes.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/fs/bcachefs/recovery_passes.c b/fs/bcachefs/recovery_passes.c index f74f14227137..dabb29b08ad0 100644 --- a/fs/bcachefs/recovery_passes.c +++ b/fs/bcachefs/recovery_passes.c @@ -525,6 +525,9 @@ int bch2_run_recovery_passes(struct bch_fs *c, enum bch_recovery_pass from) c->opts.recovery_passes | c->sb.recovery_passes_required; + if (c->opts.recovery_pass_last) + passes &= BIT_ULL(c->opts.recovery_pass_last + 1) - 1; + /* * We can't allow set_may_go_rw to be excluded; that would cause us to * use the journal replay keys for updates where it's not expected. 
-- 2.50.1 From 9b133c0d74b17db2dc0d2d70b6591b0ebb604463 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 23 May 2025 14:19:25 -0400 Subject: [PATCH 03/16] bcachefs: Small check_fix_ptr fixes We don't want to change the bucket gen, on gen mismatch: it's possible to have multiple btree nodes with different gens in the same bucket that we want to keep, if we have to recover from btree node scan. It's also not necessary to set g->gen_valid; add a comment to that effect. Signed-off-by: Kent Overstreet --- fs/bcachefs/buckets.c | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index 8bb6384190c5..09eb5a543ae4 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -156,10 +156,14 @@ static int bch2_check_fix_ptr(struct btree_trans *trans, g->gen_valid = true; g->gen = p.ptr.gen; } else { + /* this pointer will be dropped */ *do_update = true; + goto out; } } + /* g->gen_valid == true */ + if (fsck_err_on(gen_cmp(p.ptr.gen, g->gen) > 0, trans, ptr_gen_newer_than_bucket_gen, "bucket %u:%zu data type %s ptr gen in the future: %u > %u\n" @@ -172,15 +176,13 @@ static int bch2_check_fix_ptr(struct btree_trans *trans, if (!p.ptr.cached && (g->data_type != BCH_DATA_btree || data_type == BCH_DATA_btree)) { - g->gen_valid = true; - g->gen = p.ptr.gen; - g->data_type = 0; + g->data_type = data_type; g->stripe_sectors = 0; g->dirty_sectors = 0; g->cached_sectors = 0; - } else { - *do_update = true; } + + *do_update = true; } if (fsck_err_on(gen_cmp(g->gen, p.ptr.gen) > BUCKET_GC_GEN_MAX, @@ -217,9 +219,8 @@ static int bch2_check_fix_ptr(struct btree_trans *trans, bch2_data_type_str(data_type), (printbuf_reset(&buf), bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { - if (data_type == BCH_DATA_btree) { - g->gen_valid = true; - g->gen = p.ptr.gen; + if (!p.ptr.cached && + data_type == BCH_DATA_btree) { g->data_type = data_type; g->stripe_sectors = 0; g->dirty_sectors = 0; -- 2.50.1 From 
521f9584c2bd48198ac9d9b99a372b1306f3bb97 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 23 May 2025 14:03:06 -0400 Subject: [PATCH 04/16] bcachefs: Ensure we don't use a blacklisted journal seq Different versions differ on the size of the blacklist range; it is theoretically possible that we could end up with blacklisted journal sequence numbers newer than the newest seq we find in the journal, and pick a new start seq that's blacklisted. Explicitly check for this in bch2_fs_journal_start(). Signed-off-by: Kent Overstreet --- fs/bcachefs/journal.c | 17 ++++++++++++++++- fs/bcachefs/journal_seq_blacklist.c | 10 ++++++++++ fs/bcachefs/journal_seq_blacklist.h | 1 + 3 files changed, 27 insertions(+), 1 deletion(-) diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c index f2963a6cca88..09b70fd140a1 100644 --- a/fs/bcachefs/journal.c +++ b/fs/bcachefs/journal.c @@ -415,7 +415,7 @@ static int journal_entry_open(struct journal *j) if (atomic64_read(&j->seq) - j->seq_write_started == JOURNAL_STATE_BUF_NR) return -BCH_ERR_journal_max_open; - if (journal_cur_seq(j) >= JOURNAL_SEQ_MAX) { + if (unlikely(journal_cur_seq(j) >= JOURNAL_SEQ_MAX)) { bch_err(c, "cannot start: journal seq overflow"); if (bch2_fs_emergency_read_only_locked(c)) bch_err(c, "fatal error - emergency read only"); @@ -459,6 +459,14 @@ static int journal_entry_open(struct journal *j) atomic64_inc(&j->seq); journal_pin_list_init(fifo_push_ref(&j->pin), 1); + if (unlikely(bch2_journal_seq_is_blacklisted(c, journal_cur_seq(j), false))) { + bch_err(c, "attempting to open blacklisted journal seq %llu", + journal_cur_seq(j)); + if (bch2_fs_emergency_read_only_locked(c)) + bch_err(c, "fatal error - emergency read only"); + return -BCH_ERR_journal_shutdown; + } + BUG_ON(j->pin.back - 1 != atomic64_read(&j->seq)); BUG_ON(j->buf + (journal_cur_seq(j) & JOURNAL_BUF_MASK) != buf); @@ -1415,6 +1423,13 @@ int bch2_fs_journal_start(struct journal *j, u64 cur_seq) bool had_entries = false; u64 last_seq = 
cur_seq, nr, seq; + /* + * + * XXX pick most recent non blacklisted sequence number + */ + + cur_seq = max(cur_seq, bch2_journal_last_blacklisted_seq(c)); + if (cur_seq >= JOURNAL_SEQ_MAX) { bch_err(c, "cannot start: journal seq overflow"); return -EINVAL; diff --git a/fs/bcachefs/journal_seq_blacklist.c b/fs/bcachefs/journal_seq_blacklist.c index e463d2d95359..c5a7d800a0f5 100644 --- a/fs/bcachefs/journal_seq_blacklist.c +++ b/fs/bcachefs/journal_seq_blacklist.c @@ -130,6 +130,16 @@ bool bch2_journal_seq_is_blacklisted(struct bch_fs *c, u64 seq, return true; } +u64 bch2_journal_last_blacklisted_seq(struct bch_fs *c) +{ + struct journal_seq_blacklist_table *t = c->journal_seq_blacklist_table; + + if (!t || !t->nr) + return 0; + + return t->entries[eytzinger0_last(t->nr)].end - 1; +} + int bch2_blacklist_table_initialize(struct bch_fs *c) { struct bch_sb_field_journal_seq_blacklist *bl = diff --git a/fs/bcachefs/journal_seq_blacklist.h b/fs/bcachefs/journal_seq_blacklist.h index d47636f96fdc..f06942ccfcdd 100644 --- a/fs/bcachefs/journal_seq_blacklist.h +++ b/fs/bcachefs/journal_seq_blacklist.h @@ -12,6 +12,7 @@ blacklist_nr_entries(struct bch_sb_field_journal_seq_blacklist *bl) } bool bch2_journal_seq_is_blacklisted(struct bch_fs *, u64, bool); +u64 bch2_journal_last_blacklisted_seq(struct bch_fs *); int bch2_journal_seq_blacklist_add(struct bch_fs *c, u64, u64); int bch2_blacklist_table_initialize(struct bch_fs *); -- 2.50.1 From 3f2f028814abf68ce4d74bfd2627cb84d2afa389 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 23 May 2025 20:11:43 -0400 Subject: [PATCH 05/16] bcachefs: Fix btree_iter_next_node() for new locking asserts We can't unlock a should_be_locked path unless we're in a transaction restart. 
Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index 97f3faac8067..b4bf4217a3fa 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -1991,12 +1991,12 @@ struct btree *bch2_btree_iter_next_node(struct btree_trans *trans, struct btree_ bch2_btree_path_downgrade(trans, path); if (!bch2_btree_node_relock(trans, path, path->level + 1)) { + trace_and_count(trans->c, trans_restart_relock_next_node, trans, _THIS_IP_, path); + ret = btree_trans_restart(trans, BCH_ERR_transaction_restart_relock); __bch2_btree_path_unlock(trans, path); path->l[path->level].b = ERR_PTR(-BCH_ERR_no_btree_node_relock); path->l[path->level + 1].b = ERR_PTR(-BCH_ERR_no_btree_node_relock); btree_path_set_dirty(trans, path, BTREE_ITER_NEED_TRAVERSE); - trace_and_count(trans->c, trans_restart_relock_next_node, trans, _THIS_IP_, path); - ret = btree_trans_restart(trans, BCH_ERR_transaction_restart_relock); goto err; } -- 2.50.1 From 9caea9208fc3fbdbd4a41a2de8c6a0c969b030f9 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 23 May 2025 21:59:12 -0400 Subject: [PATCH 06/16] bcachefs: Don't mount bs > ps without TRANSPARENT_HUGEPAGE Large folios aren't supported without TRANSPARENT_HUGEPAGE Signed-off-by: Kent Overstreet --- fs/bcachefs/super.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index 24658bf450ab..11579b74c640 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -950,6 +950,13 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts *opts, bch2_opts_apply(&c->opts, *opts); + if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && + c->opts.block_size > PAGE_SIZE) { + bch_err(c, "cannot mount bs > ps filesystem without CONFIG_TRANSPARENT_HUGEPAGE"); + ret = -EINVAL; + goto err; + } + c->btree_key_cache_btrees |= 1U << BTREE_ID_alloc; if 
(c->opts.inodes_use_key_cache) c->btree_key_cache_btrees |= 1U << BTREE_ID_inodes; -- 2.50.1 From 7098ba57c443868cab250a3a5db72c89ffbd9026 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 24 May 2025 01:56:10 -0400 Subject: [PATCH 07/16] bcachefs: fix REFLINK_P_MAY_UPDATE_OPTIONS If we're doing a reflink copy of existing reflinked data, we may only set REFLINK_P_MAY_UPDATE_OPTIONS if it was set on the reflink pointer we're copying from. Signed-off-by: Kent Overstreet --- fs/bcachefs/reflink.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/bcachefs/reflink.c b/fs/bcachefs/reflink.c index 3a13dbcab6ba..41ca86cba2cd 100644 --- a/fs/bcachefs/reflink.c +++ b/fs/bcachefs/reflink.c @@ -711,7 +711,8 @@ s64 bch2_remap_range(struct bch_fs *c, SET_REFLINK_P_IDX(&dst_p->v, offset); if (reflink_p_may_update_opts_field && - may_change_src_io_path_opts) + may_change_src_io_path_opts && + REFLINK_P_MAY_UPDATE_OPTIONS(src_p.v)) SET_REFLINK_P_MAY_UPDATE_OPTIONS(&dst_p->v, true); } else { BUG(); -- 2.50.1 From 97e69f12edb19a17589ca0b6f3988b2a28af87c8 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 24 May 2025 14:20:58 -0400 Subject: [PATCH 08/16] bcachefs: Fix missing BTREE_UPDATE_internal_snapshot_node Repair code will do updates on older snapshot versions, so needs the correct annotation. 
Reported-by: syzbot+42581416dba62b364750@syzkaller.appspotmail.com Signed-off-by: Kent Overstreet --- fs/bcachefs/namei.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/bcachefs/namei.c b/fs/bcachefs/namei.c index a84b69d6caef..ff7ba297d3ef 100644 --- a/fs/bcachefs/namei.c +++ b/fs/bcachefs/namei.c @@ -857,7 +857,8 @@ int __bch2_check_dirent_target(struct btree_trans *trans, n->v.d_inum = cpu_to_le64(target->bi_inum); } - ret = bch2_trans_update(trans, dirent_iter, &n->k_i, 0); + ret = bch2_trans_update(trans, dirent_iter, &n->k_i, + BTREE_UPDATE_internal_snapshot_node); if (ret) goto err; } -- 2.50.1 From ff875d4b474739662d7fefece7532ff77c8b3b70 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 24 May 2025 14:37:20 -0400 Subject: [PATCH 09/16] bcachefs: Ensure we print output of run_recovery_pass if it errors Also, don't error out in bucket_ref_update_err(): we don't want to return -BCH_ERR_cannot_rewind_recovery if it's not an insert, if it's an overwrite we continue. 
Signed-off-by: Kent Overstreet --- fs/bcachefs/buckets.c | 21 +++++++++++++++------ 1 file changed, 15 insertions(+), 6 deletions(-) diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index 09eb5a543ae4..492a368a9993 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -406,7 +406,15 @@ static int bucket_ref_update_err(struct btree_trans *trans, struct printbuf *buf if (insert) { bch2_trans_updates_to_text(buf, trans); __bch2_inconsistent_error(c, buf); - ret = -BCH_ERR_bucket_ref_update; + /* + * If we're in recovery, run_explicit_recovery_pass might give + * us an error code for rewinding recovery + */ + if (!ret) + ret = -BCH_ERR_bucket_ref_update; + } else { + /* Always ignore overwrite errors, so that deletion works */ + ret = 0; } if (print || insert) @@ -971,15 +979,16 @@ static int __bch2_trans_mark_metadata_bucket(struct btree_trans *trans, bch2_data_type_str(type), bch2_data_type_str(type)); - bool print = bch2_count_fsck_err(c, bucket_metadata_type_mismatch, &buf); + bch2_count_fsck_err(c, bucket_metadata_type_mismatch, &buf); - bch2_run_explicit_recovery_pass(c, &buf, + ret = bch2_run_explicit_recovery_pass(c, &buf, BCH_RECOVERY_PASS_check_allocations, 0); - if (print) - bch2_print_str(c, KERN_ERR, buf.buf); + /* Always print, this is always fatal */ + bch2_print_str(c, KERN_ERR, buf.buf); printbuf_exit(&buf); - ret = -BCH_ERR_metadata_bucket_inconsistency; + if (!ret) + ret = -BCH_ERR_metadata_bucket_inconsistency; goto err; } -- 2.50.1 From dc37dcca8cb71d7ddddfd5035619eeb27c5aab8d Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 24 May 2025 15:24:00 -0400 Subject: [PATCH 10/16] bcachefs: bch2_kthread_io_clock_wait_once() Add a version of bch2_kthread_io_clock_wait() that only schedules once - behaving more like schedule_timeout(). This will be used for fixing rebalance wakeups. 
Signed-off-by: Kent Overstreet --- fs/bcachefs/clock.c | 47 +++++++++++++++++---------------------------- fs/bcachefs/clock.h | 1 + 2 files changed, 19 insertions(+), 29 deletions(-) diff --git a/fs/bcachefs/clock.c b/fs/bcachefs/clock.c index d6dd12d74d4f..8e9264b5a84e 100644 --- a/fs/bcachefs/clock.c +++ b/fs/bcachefs/clock.c @@ -53,7 +53,6 @@ void bch2_io_timer_del(struct io_clock *clock, struct io_timer *timer) struct io_clock_wait { struct io_timer io_timer; - struct timer_list cpu_timer; struct task_struct *task; int expired; }; @@ -67,15 +66,6 @@ static void io_clock_wait_fn(struct io_timer *timer) wake_up_process(wait->task); } -static void io_clock_cpu_timeout(struct timer_list *timer) -{ - struct io_clock_wait *wait = container_of(timer, - struct io_clock_wait, cpu_timer); - - wait->expired = 1; - wake_up_process(wait->task); -} - void bch2_io_clock_schedule_timeout(struct io_clock *clock, u64 until) { struct io_clock_wait wait = { @@ -90,8 +80,8 @@ void bch2_io_clock_schedule_timeout(struct io_clock *clock, u64 until) bch2_io_timer_del(clock, &wait.io_timer); } -void bch2_kthread_io_clock_wait(struct io_clock *clock, - u64 io_until, unsigned long cpu_timeout) +unsigned long bch2_kthread_io_clock_wait_once(struct io_clock *clock, + u64 io_until, unsigned long cpu_timeout) { bool kthread = (current->flags & PF_KTHREAD) != 0; struct io_clock_wait wait = { @@ -103,27 +93,26 @@ void bch2_kthread_io_clock_wait(struct io_clock *clock, bch2_io_timer_add(clock, &wait.io_timer); - timer_setup_on_stack(&wait.cpu_timer, io_clock_cpu_timeout, 0); - - if (cpu_timeout != MAX_SCHEDULE_TIMEOUT) - mod_timer(&wait.cpu_timer, cpu_timeout + jiffies); - - do { - set_current_state(TASK_INTERRUPTIBLE); - if (kthread && kthread_should_stop()) - break; - - if (wait.expired) - break; - - schedule(); + set_current_state(TASK_INTERRUPTIBLE); + if (!(kthread && kthread_should_stop())) { + cpu_timeout = schedule_timeout(cpu_timeout); try_to_freeze(); - } while (0); + } 
__set_current_state(TASK_RUNNING); - timer_delete_sync(&wait.cpu_timer); - destroy_timer_on_stack(&wait.cpu_timer); bch2_io_timer_del(clock, &wait.io_timer); + return cpu_timeout; +} + +void bch2_kthread_io_clock_wait(struct io_clock *clock, + u64 io_until, unsigned long cpu_timeout) +{ + bool kthread = (current->flags & PF_KTHREAD) != 0; + + while (!(kthread && kthread_should_stop()) && + cpu_timeout && + atomic64_read(&clock->now) < io_until) + cpu_timeout = bch2_kthread_io_clock_wait_once(clock, io_until, cpu_timeout); } static struct io_timer *get_expired_timer(struct io_clock *clock, u64 now) diff --git a/fs/bcachefs/clock.h b/fs/bcachefs/clock.h index 82c79c8baf92..8769be2aa21e 100644 --- a/fs/bcachefs/clock.h +++ b/fs/bcachefs/clock.h @@ -4,6 +4,7 @@ void bch2_io_timer_add(struct io_clock *, struct io_timer *); void bch2_io_timer_del(struct io_clock *, struct io_timer *); +unsigned long bch2_kthread_io_clock_wait_once(struct io_clock *, u64, unsigned long); void bch2_kthread_io_clock_wait(struct io_clock *, u64, unsigned long); void __bch2_increment_clock(struct io_clock *, u64); -- 2.50.1 From 9e2c3c2ed4772cb0e2ad5b0def08c2c943483445 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 24 May 2025 15:29:50 -0400 Subject: [PATCH 11/16] bcachefs: Fix lost rebalance wakeups Fix a missing wakeup in 'bcachefs set-file-option' -> xattr option update -> inode_write this was missing because the wakeup needs to happen after transaction commit. Also, add a 'kick' counter, to make sure we don't miss a wakeup that occurred right after we finished checking the rebalance_work btree. 
Signed-off-by: Kent Overstreet --- fs/bcachefs/fs.c | 6 +++++- fs/bcachefs/rebalance.c | 6 ++++-- fs/bcachefs/rebalance.h | 8 +++----- fs/bcachefs/rebalance_types.h | 1 + 4 files changed, 13 insertions(+), 8 deletions(-) diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c index ddfe89d84966..adae43223bce 100644 --- a/fs/bcachefs/fs.c +++ b/fs/bcachefs/fs.c @@ -124,8 +124,9 @@ retry: goto err; struct bch_extent_rebalance new_r = bch2_inode_rebalance_opts_get(c, &inode_u); + bool rebalance_changed = memcmp(&old_r, &new_r, sizeof(new_r)); - if (memcmp(&old_r, &new_r, sizeof(new_r))) { + if (rebalance_changed) { ret = bch2_set_rebalance_needs_scan_trans(trans, inode_u.bi_inum); if (ret) goto err; @@ -146,6 +147,9 @@ err: if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) goto retry; + if (rebalance_changed) + bch2_rebalance_wakeup(c); + bch2_fs_fatal_err_on(bch2_err_matches(ret, ENOENT), c, "%s: inode %llu:%llu not found when updating", bch2_err_str(ret), diff --git a/fs/bcachefs/rebalance.c b/fs/bcachefs/rebalance.c index de1ec9e0caa0..dbaabaad1986 100644 --- a/fs/bcachefs/rebalance.c +++ b/fs/bcachefs/rebalance.c @@ -527,7 +527,7 @@ static void rebalance_wait(struct bch_fs *c) r->state = BCH_REBALANCE_waiting; } - bch2_kthread_io_clock_wait(clock, r->wait_iotime_end, MAX_SCHEDULE_TIMEOUT); + bch2_kthread_io_clock_wait_once(clock, r->wait_iotime_end, MAX_SCHEDULE_TIMEOUT); } static bool bch2_rebalance_enabled(struct bch_fs *c) @@ -544,6 +544,7 @@ static int do_rebalance(struct moving_context *ctxt) struct bch_fs_rebalance *r = &c->rebalance; struct btree_iter rebalance_work_iter, extent_iter = {}; struct bkey_s_c k; + u32 kick = r->kick; int ret = 0; bch2_trans_begin(trans); @@ -593,7 +594,8 @@ static int do_rebalance(struct moving_context *ctxt) if (!ret && !kthread_should_stop() && !atomic64_read(&r->work_stats.sectors_seen) && - !atomic64_read(&r->scan_stats.sectors_seen)) { + !atomic64_read(&r->scan_stats.sectors_seen) && + kick == r->kick) { 
bch2_moving_ctxt_flush_all(ctxt); bch2_trans_unlock_long(trans); rebalance_wait(c); diff --git a/fs/bcachefs/rebalance.h b/fs/bcachefs/rebalance.h index 5d9214fe1a22..7a565ea7dbfc 100644 --- a/fs/bcachefs/rebalance.h +++ b/fs/bcachefs/rebalance.h @@ -39,13 +39,11 @@ int bch2_set_fs_needs_rebalance(struct bch_fs *); static inline void bch2_rebalance_wakeup(struct bch_fs *c) { - struct task_struct *p; - - rcu_read_lock(); - p = rcu_dereference(c->rebalance.thread); + c->rebalance.kick++; + guard(rcu)(); + struct task_struct *p = rcu_dereference(c->rebalance.thread); if (p) wake_up_process(p); - rcu_read_unlock(); } void bch2_rebalance_status_to_text(struct printbuf *, struct bch_fs *); diff --git a/fs/bcachefs/rebalance_types.h b/fs/bcachefs/rebalance_types.h index 33d77286f1d5..c659da149fa3 100644 --- a/fs/bcachefs/rebalance_types.h +++ b/fs/bcachefs/rebalance_types.h @@ -18,6 +18,7 @@ enum bch_rebalance_states { struct bch_fs_rebalance { struct task_struct __rcu *thread; + u32 kick; struct bch_pd_controller pd; enum bch_rebalance_states state; -- 2.50.1 From 1cda5b88e6d13ebf42078253abbb2ed0efe9ab0a Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 24 May 2025 19:53:03 -0400 Subject: [PATCH 12/16] bcachefs: Fix missing commit in check_dirents Other repair code seems to be doing commits themselves, but check_key_has_snapshot() does not. 
Signed-off-by: Kent Overstreet --- fs/bcachefs/fsck.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index 49f46df8340e..8e07a365b24c 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -2312,9 +2312,10 @@ int bch2_check_dirents(struct bch_fs *c) snapshots_seen_init(&s); int ret = bch2_trans_run(c, - for_each_btree_key(trans, iter, BTREE_ID_dirents, + for_each_btree_key_commit(trans, iter, BTREE_ID_dirents, POS(BCACHEFS_ROOT_INO, 0), BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k, + NULL, NULL, BCH_TRANS_COMMIT_no_enospc, check_dirent(trans, &iter, k, &hash_info, &dir, &target, &s)) ?: check_subdir_count_notnested(trans, &dir)); -- 2.50.1 From 686db67a8ebecdc6eb7b9ca8ef8eddb99bdf1083 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 25 May 2025 11:51:33 -0400 Subject: [PATCH 13/16] bcachefs: Move unicode message to after the startup message Signed-off-by: Kent Overstreet --- fs/bcachefs/super.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index 11579b74c640..df42a66b8bc3 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -1038,10 +1038,6 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts *opts, ret = -EINVAL; goto err; } - bch_info(c, "Using encoding defined by superblock: utf8-%u.%u.%u", - unicode_major(BCH_FS_DEFAULT_UTF8_ENCODING), - unicode_minor(BCH_FS_DEFAULT_UTF8_ENCODING), - unicode_rev(BCH_FS_DEFAULT_UTF8_ENCODING)); #else if (c->sb.features & BIT_ULL(BCH_FEATURE_casefolding)) { printk(KERN_ERR "Cannot mount a filesystem with casefolding on a kernel without CONFIG_UNICODE\n"); @@ -1159,6 +1155,12 @@ int bch2_fs_start(struct bch_fs *c) print_mount_opts(c); + if (IS_ENABLED(CONFIG_UNICODE)) + bch_info(c, "Using encoding defined by superblock: utf8-%u.%u.%u", + unicode_major(BCH_FS_DEFAULT_UTF8_ENCODING), + unicode_minor(BCH_FS_DEFAULT_UTF8_ENCODING), + 
unicode_rev(BCH_FS_DEFAULT_UTF8_ENCODING)); + if (!bch2_fs_may_start(c)) return -BCH_ERR_insufficient_devices_to_start; -- 2.50.1 From 72ab5136e86fcbccebb4a423d83332f41a7bd697 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 26 May 2025 11:12:53 -0400 Subject: [PATCH 14/16] bcachefs: Don't rewind to run a recovery pass we already ran Fix a small regression from the "run recovery passes" rewrite, which enabled async recovery passes. This fixes getting stuck in a loop in recovery. Signed-off-by: Kent Overstreet --- fs/bcachefs/recovery_passes.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/fs/bcachefs/recovery_passes.c b/fs/bcachefs/recovery_passes.c index dabb29b08ad0..212658cb97dd 100644 --- a/fs/bcachefs/recovery_passes.c +++ b/fs/bcachefs/recovery_passes.c @@ -315,7 +315,9 @@ int __bch2_run_explicit_recovery_pass(struct bch_fs *c, goto out; bool in_recovery = test_bit(BCH_FS_in_recovery, &c->flags); - bool rewind = in_recovery && r->curr_pass > pass; + bool rewind = in_recovery && + r->curr_pass > pass && + !(r->passes_complete & BIT_ULL(pass)); bool ratelimit = flags & RUN_RECOVERY_PASS_ratelimit; if (!(in_recovery && (flags & RUN_RECOVERY_PASS_nopersistent))) { -- 2.50.1 From cd04497b10e6178a7510329465d05788b906ce5f Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 26 May 2025 12:21:57 -0400 Subject: [PATCH 15/16] bcachefs: Journal read error message improvements - Don't print a checksum error when we first read a journal entry: we print a checksum error later if we'll be using the journal entry. - Continuing with the theme of improving error messages and grouping errors into a single log message per error, print a single 'checksum error' message per journal entry, and use bch2_journal_ptr_to_text() to print out where on the device it was. - Factor out checksum error messages and checking for missing journal entries into helpers, bch2_journal_read() has gotten obnoxiously big. 
Signed-off-by: Kent Overstreet --- fs/bcachefs/journal_io.c | 228 ++++++++++++++++++++++++--------------- fs/bcachefs/journal_io.h | 1 + 2 files changed, 140 insertions(+), 89 deletions(-) diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c index 63bb207208b2..a322c3d7123a 100644 --- a/fs/bcachefs/journal_io.c +++ b/fs/bcachefs/journal_io.c @@ -49,25 +49,27 @@ void bch2_journal_pos_from_member_info_resume(struct bch_fs *c) mutex_unlock(&c->sb_lock); } -void bch2_journal_ptrs_to_text(struct printbuf *out, struct bch_fs *c, - struct journal_replay *j) +static void bch2_journal_ptr_to_text(struct printbuf *out, struct bch_fs *c, struct journal_ptr *p) +{ + struct bch_dev *ca = bch2_dev_tryget_noerror(c, p->dev); + prt_printf(out, "%s %u:%u:%u (sector %llu)", + ca ? ca->name : "(invalid dev)", + p->dev, p->bucket, p->bucket_offset, p->sector); + bch2_dev_put(ca); +} + +void bch2_journal_ptrs_to_text(struct printbuf *out, struct bch_fs *c, struct journal_replay *j) { darray_for_each(j->ptrs, i) { if (i != j->ptrs.data) prt_printf(out, " "); - prt_printf(out, "%u:%u:%u (sector %llu)", - i->dev, i->bucket, i->bucket_offset, i->sector); + bch2_journal_ptr_to_text(out, c, i); } } -static void bch2_journal_replay_to_text(struct printbuf *out, struct bch_fs *c, - struct journal_replay *j) +static void bch2_journal_datetime_to_text(struct printbuf *out, struct jset *j) { - prt_printf(out, "seq %llu ", le64_to_cpu(j->j.seq)); - - bch2_journal_ptrs_to_text(out, c, j); - - for_each_jset_entry_type(entry, &j->j, BCH_JSET_ENTRY_datetime) { + for_each_jset_entry_type(entry, j, BCH_JSET_ENTRY_datetime) { struct jset_entry_datetime *datetime = container_of(entry, struct jset_entry_datetime, entry); bch2_prt_datetime(out, le64_to_cpu(datetime->seconds)); @@ -75,6 +77,15 @@ static void bch2_journal_replay_to_text(struct printbuf *out, struct bch_fs *c, } } +static void bch2_journal_replay_to_text(struct printbuf *out, struct bch_fs *c, + struct journal_replay *j) +{ + 
prt_printf(out, "seq %llu ", le64_to_cpu(j->j.seq)); + bch2_journal_datetime_to_text(out, &j->j); + prt_char(out, ' '); + bch2_journal_ptrs_to_text(out, c, j); +} + static struct nonce journal_nonce(const struct jset *jset) { return (struct nonce) {{ @@ -1037,7 +1048,6 @@ static int journal_read_bucket(struct bch_dev *ca, u64 offset = bucket_to_sector(ca, ja->buckets[bucket]), end = offset + ca->mi.bucket_size; bool saw_bad = false, csum_good; - struct printbuf err = PRINTBUF; int ret = 0; pr_debug("reading %u", bucket); @@ -1078,7 +1088,7 @@ reread: * found on a different device, and missing or * no journal entries will be handled later */ - goto out; + return 0; } j = buf->data; @@ -1095,12 +1105,12 @@ reread: ret = journal_read_buf_realloc(buf, vstruct_bytes(j)); if (ret) - goto err; + return ret; } goto reread; case JOURNAL_ENTRY_NONE: if (!saw_bad) - goto out; + return 0; /* * On checksum error we don't really trust the size * field of the journal entry we read, so try reading @@ -1109,7 +1119,7 @@ reread: sectors = block_sectors(c); goto next_block; default: - goto err; + return ret; } if (le64_to_cpu(j->seq) > ja->highest_seq_found) { @@ -1126,22 +1136,20 @@ reread: * bucket: */ if (le64_to_cpu(j->seq) < ja->bucket_seq[bucket]) - goto out; + return 0; ja->bucket_seq[bucket] = le64_to_cpu(j->seq); - enum bch_csum_type csum_type = JSET_CSUM_TYPE(j); struct bch_csum csum; csum_good = jset_csum_good(c, j, &csum); bch2_account_io_completion(ca, BCH_MEMBER_ERROR_checksum, 0, csum_good); if (!csum_good) { - bch_err_dev_ratelimited(ca, "%s", - (printbuf_reset(&err), - prt_str(&err, "journal "), - bch2_csum_err_msg(&err, csum_type, j->csum, csum), - err.buf)); + /* + * Don't print an error here, we'll print the error + * later if we need this journal entry + */ saw_bad = true; } @@ -1153,6 +1161,7 @@ reread: mutex_lock(&jlist->lock); ret = journal_entry_add(c, ca, (struct journal_ptr) { .csum_good = csum_good, + .csum = csum, .dev = ca->dev_idx, .bucket = bucket, 
.bucket_offset = offset - @@ -1167,7 +1176,7 @@ reread: case JOURNAL_ENTRY_ADD_OUT_OF_RANGE: break; default: - goto err; + return ret; } next_block: pr_debug("next"); @@ -1176,11 +1185,7 @@ next_block: j = ((void *) j) + (sectors << 9); } -out: - ret = 0; -err: - printbuf_exit(&err); - return ret; + return 0; } static CLOSURE_CALLBACK(bch2_journal_read_device) @@ -1229,13 +1234,105 @@ err: goto out; } +noinline_for_stack +static void bch2_journal_print_checksum_error(struct bch_fs *c, struct journal_replay *j) +{ + struct printbuf buf = PRINTBUF; + enum bch_csum_type csum_type = JSET_CSUM_TYPE(&j->j); + bool have_good = false; + + prt_printf(&buf, "invalid journal checksum(s) at seq %llu ", le64_to_cpu(j->j.seq)); + bch2_journal_datetime_to_text(&buf, &j->j); + prt_newline(&buf); + + darray_for_each(j->ptrs, ptr) + if (!ptr->csum_good) { + bch2_journal_ptr_to_text(&buf, c, ptr); + prt_char(&buf, ' '); + bch2_csum_to_text(&buf, csum_type, ptr->csum); + prt_newline(&buf); + } else { + have_good = true; + } + + prt_printf(&buf, "should be "); + bch2_csum_to_text(&buf, csum_type, j->j.csum); + + if (have_good) + prt_printf(&buf, "\n(had good copy on another device)"); + + bch2_print_str(c, KERN_ERR, buf.buf); + printbuf_exit(&buf); +} + +noinline_for_stack +static int bch2_journal_check_for_missing(struct bch_fs *c, u64 start_seq, u64 end_seq) +{ + struct printbuf buf = PRINTBUF; + int ret = 0; + + struct genradix_iter radix_iter; + struct journal_replay *i, **_i, *prev = NULL; + u64 seq = start_seq; + + genradix_for_each(&c->journal_entries, radix_iter, _i) { + i = *_i; + + if (journal_replay_ignore(i)) + continue; + + BUG_ON(seq > le64_to_cpu(i->j.seq)); + + while (seq < le64_to_cpu(i->j.seq)) { + while (seq < le64_to_cpu(i->j.seq) && + bch2_journal_seq_is_blacklisted(c, seq, false)) + seq++; + + if (seq == le64_to_cpu(i->j.seq)) + break; + + u64 missing_start = seq; + + while (seq < le64_to_cpu(i->j.seq) && + !bch2_journal_seq_is_blacklisted(c, seq, false)) + seq++; 
+ + u64 missing_end = seq - 1; + + printbuf_reset(&buf); + prt_printf(&buf, "journal entries %llu-%llu missing! (replaying %llu-%llu)", + missing_start, missing_end, + start_seq, end_seq); + + prt_printf(&buf, "\nprev at "); + if (prev) { + bch2_journal_ptrs_to_text(&buf, c, prev); + prt_printf(&buf, " size %zu", vstruct_sectors(&prev->j, c->block_bits)); + } else + prt_printf(&buf, "(none)"); + + prt_printf(&buf, "\nnext at "); + bch2_journal_ptrs_to_text(&buf, c, i); + prt_printf(&buf, ", continue?"); + + fsck_err(c, journal_entries_missing, "%s", buf.buf); + } + + prev = i; + seq++; + } +fsck_err: + printbuf_exit(&buf); + return ret; +} + int bch2_journal_read(struct bch_fs *c, u64 *last_seq, u64 *blacklist_seq, u64 *start_seq) { struct journal_list jlist; - struct journal_replay *i, **_i, *prev = NULL; + struct journal_replay *i, **_i; struct genradix_iter radix_iter; struct printbuf buf = PRINTBUF; bool degraded = false, last_write_torn = false; @@ -1354,56 +1451,9 @@ int bch2_journal_read(struct bch_fs *c, } } - /* Check for missing entries: */ - seq = *last_seq; - genradix_for_each(&c->journal_entries, radix_iter, _i) { - i = *_i; - - if (journal_replay_ignore(i)) - continue; - - BUG_ON(seq > le64_to_cpu(i->j.seq)); - - while (seq < le64_to_cpu(i->j.seq)) { - u64 missing_start, missing_end; - struct printbuf buf1 = PRINTBUF, buf2 = PRINTBUF; - - while (seq < le64_to_cpu(i->j.seq) && - bch2_journal_seq_is_blacklisted(c, seq, false)) - seq++; - - if (seq == le64_to_cpu(i->j.seq)) - break; - - missing_start = seq; - - while (seq < le64_to_cpu(i->j.seq) && - !bch2_journal_seq_is_blacklisted(c, seq, false)) - seq++; - - if (prev) { - bch2_journal_ptrs_to_text(&buf1, c, prev); - prt_printf(&buf1, " size %zu", vstruct_sectors(&prev->j, c->block_bits)); - } else - prt_printf(&buf1, "(none)"); - bch2_journal_ptrs_to_text(&buf2, c, i); - - missing_end = seq - 1; - fsck_err(c, journal_entries_missing, - "journal entries %llu-%llu missing! 
(replaying %llu-%llu)\n" - "prev at %s\n" - "next at %s, continue?", - missing_start, missing_end, - *last_seq, *blacklist_seq - 1, - buf1.buf, buf2.buf); - - printbuf_exit(&buf1); - printbuf_exit(&buf2); - } - - prev = i; - seq++; - } + ret = bch2_journal_check_for_missing(c, *last_seq, *blacklist_seq - 1); + if (ret) + goto err; genradix_for_each(&c->journal_entries, radix_iter, _i) { union bch_replicas_padded replicas = { @@ -1416,15 +1466,15 @@ int bch2_journal_read(struct bch_fs *c, if (journal_replay_ignore(i)) continue; - darray_for_each(i->ptrs, ptr) { - struct bch_dev *ca = bch2_dev_have_ref(c, ptr->dev); - - if (!ptr->csum_good) - bch_err_dev_offset(ca, ptr->sector, - "invalid journal checksum, seq %llu%s", - le64_to_cpu(i->j.seq), - i->csum_good ? " (had good copy on another device)" : ""); - } + /* + * Don't print checksum errors until we know we're going to use + * a given journal entry: + */ + darray_for_each(i->ptrs, ptr) + if (!ptr->csum_good) { + bch2_journal_print_checksum_error(c, i); + break; + } ret = jset_validate(c, bch2_dev_have_ref(c, i->ptrs.data[0].dev), diff --git a/fs/bcachefs/journal_io.h b/fs/bcachefs/journal_io.h index 12b39fcb4424..6fa82c4050fe 100644 --- a/fs/bcachefs/journal_io.h +++ b/fs/bcachefs/journal_io.h @@ -9,6 +9,7 @@ void bch2_journal_pos_from_member_info_resume(struct bch_fs *); struct journal_ptr { bool csum_good; + struct bch_csum csum; u8 dev; u32 bucket; u32 bucket_offset; -- 2.50.1 From d6efd42a8450c67d283dbaacd127dcccca858f51 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 25 May 2025 17:04:11 -0400 Subject: [PATCH 16/16] bcachefs: Fix infinite loop in journal_entry_btree_keys_to_text() Fix an infinite loop when bkey_i->k.u64s is 0. This only happens in userspace, where 'bcachefs list_journal' can print the entire contents of the journal, and non-dirty entries aren't validated. 
Signed-off-by: Kent Overstreet --- fs/bcachefs/journal_io.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c index a322c3d7123a..30122abf3e2c 100644 --- a/fs/bcachefs/journal_io.c +++ b/fs/bcachefs/journal_io.c @@ -429,6 +429,10 @@ static void journal_entry_btree_keys_to_text(struct printbuf *out, struct bch_fs bool first = true; jset_entry_for_each_key(entry, k) { + /* We may be called on entries that haven't been validated: */ + if (!k->k.u64s) + break; + if (!first) { prt_newline(out); bch2_prt_jset_entry_type(out, entry->type); -- 2.50.1