From 511ddcdb2d5e0bdb73c7968e4215268f4572a984 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 30 Nov 2024 23:27:45 -0500 Subject: [PATCH 01/16] bcachefs: fix bch2_journal_key_insert_take() seq Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_journal_iter.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/bcachefs/btree_journal_iter.c b/fs/bcachefs/btree_journal_iter.c index de3db161d6ab..6d25e3f85ce8 100644 --- a/fs/bcachefs/btree_journal_iter.c +++ b/fs/bcachefs/btree_journal_iter.c @@ -259,7 +259,7 @@ int bch2_journal_key_insert_take(struct bch_fs *c, enum btree_id id, * Ensure these keys are done last by journal replay, to unblock * journal reclaim: */ - .journal_seq = U32_MAX, + .journal_seq = U64_MAX, }; struct journal_keys *keys = &c->journal_keys; size_t idx = bch2_journal_key_search(keys, id, level, k->k.p); -- 2.51.0 From 5cdaec193a85e32235e7dccb95c085acc50b8dbd Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 1 Dec 2024 16:39:54 -0500 Subject: [PATCH 02/16] bcachefs: Improve "unable to allocate journal write" message Signed-off-by: Kent Overstreet --- fs/bcachefs/journal_io.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c index 1627f3e16517..bb69d80886b5 100644 --- a/fs/bcachefs/journal_io.c +++ b/fs/bcachefs/journal_io.c @@ -2036,8 +2036,9 @@ CLOSURE_CALLBACK(bch2_journal_write) struct printbuf buf = PRINTBUF; buf.atomic++; - prt_printf(&buf, bch2_fmt(c, "Unable to allocate journal write at seq %llu: %s"), + prt_printf(&buf, bch2_fmt(c, "Unable to allocate journal write at seq %llu for %zu sectors: %s"), le64_to_cpu(w->data->seq), + vstruct_sectors(w->data, c->block_bits), bch2_err_str(ret)); __bch2_journal_debug_to_text(&buf, j); spin_unlock(&j->lock); -- 2.51.0 From 9c22dd02ae8b80bf662ab409091731cfb9a09348 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 2 Dec 2024 23:36:38 -0500 Subject: [PATCH 03/16] bcachefs: Fix allocating too big journal entry The "journal space available" calculations didn't take into account mismatched bucket sizes; we need to take the minimum space available out of our devices. 
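To illustrate the clamp (a standalone sketch with simplified stand-in
types and names, not the code below; the real calculation lives in
__journal_space_available()):

  #include <limits.h>

  /*
   * A journal write goes into a single bucket on each device it
   * targets, so the next entry can be no larger than the smallest
   * bucket among the devices holding a journal:
   */
  static unsigned clamp_next_entry(unsigned next_entry,
                                   const unsigned *bucket_size,
                                   unsigned nr_devs)
  {
          unsigned min_bucket_size = UINT_MAX;

          for (unsigned i = 0; i < nr_devs; i++)
                  if (bucket_size[i] < min_bucket_size)
                          min_bucket_size = bucket_size[i];

          return next_entry < min_bucket_size ? next_entry : min_bucket_size;
  }

E.g. with journal buckets of 2048 and 512 sectors, sizing the next
entry off the 2048-sector device produces a write that can never be
allocated on the 512-sector one; clamping avoids that.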
Signed-off-by: Kent Overstreet --- fs/bcachefs/journal_reclaim.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/fs/bcachefs/journal_reclaim.c b/fs/bcachefs/journal_reclaim.c index 1aabbbe328d9..b7936ad3ae7f 100644 --- a/fs/bcachefs/journal_reclaim.c +++ b/fs/bcachefs/journal_reclaim.c @@ -140,6 +140,7 @@ static struct journal_space __journal_space_available(struct journal *j, unsigne struct bch_fs *c = container_of(j, struct bch_fs, journal); unsigned pos, nr_devs = 0; struct journal_space space, dev_space[BCH_SB_MEMBERS_MAX]; + unsigned min_bucket_size = U32_MAX; BUG_ON(nr_devs_want > ARRAY_SIZE(dev_space)); @@ -148,6 +149,8 @@ static struct journal_space __journal_space_available(struct journal *j, unsigne if (!ca->journal.nr) continue; + min_bucket_size = min(min_bucket_size, ca->mi.bucket_size); + space = journal_dev_space_available(j, ca, from); if (!space.next_entry) continue; @@ -167,7 +170,9 @@ static struct journal_space __journal_space_available(struct journal *j, unsigne * We sorted largest to smallest, and we want the smallest out of the * @nr_devs_want largest devices: */ - return dev_space[nr_devs_want - 1]; + space = dev_space[nr_devs_want - 1]; + space.next_entry = min(space.next_entry, min_bucket_size); + return space; } void bch2_journal_space_available(struct journal *j) -- 2.51.0 From d36b3e74b65f4ec68a38bdb717d94b32a81a355f Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Tue, 3 Dec 2024 17:40:10 +0100 Subject: [PATCH 04/16] bcachefs: BCACHEFS_PATH_TRACEPOINTS should depend on TRACING When tracing is disabled, there is no point in asking the user about enabling extra btree_path tracepoints in bcachefs. Fixes: 32ed4a620c5405be ("bcachefs: Btree path tracepoints") Signed-off-by: Geert Uytterhoeven Signed-off-by: Kent Overstreet --- fs/bcachefs/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/bcachefs/Kconfig b/fs/bcachefs/Kconfig index ab6c95b895b3..464b927e4fff 100644 --- a/fs/bcachefs/Kconfig +++ b/fs/bcachefs/Kconfig @@ -90,7 +90,7 @@ config BCACHEFS_SIX_OPTIMISTIC_SPIN config BCACHEFS_PATH_TRACEPOINTS bool "Extra btree_path tracepoints" - depends on BCACHEFS_FS + depends on BCACHEFS_FS && TRACING help Enable extra tracepoints for debugging btree_path operations; we don't normally want these enabled because they happen at very high rates. -- 2.51.0 From ad0b2544ec827e03b75143bed83338bda7f6fe21 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 3 Dec 2024 21:22:26 -0500 Subject: [PATCH 05/16] bcachefs: rcu_pending now works in userspace Introduce a typedef to handle the difference between unsigned long/struct urcu_gp_poll_state. 
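With the helper, callers compare grace-period cookies portably, e.g.
(an illustrative wrapper, not part of this patch):

  static bool entry_matches_seq(struct rcu_pending_seq *objs,
                                rcu_gp_poll_state_t seq)
  {
          /* builds both in-kernel and against liburcu: */
          return rcu_gp_poll_cookie_eq(objs->seq, seq);
  }

A plain objs->seq == seq would not compile in userspace, where the
cookie is struct urcu_gp_poll_state rather than an integer.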
Signed-off-by: Kent Overstreet --- fs/bcachefs/rcu_pending.c | 40 ++++++++++++++++++++++++++------------- 1 file changed, 27 insertions(+), 13 deletions(-) diff --git a/fs/bcachefs/rcu_pending.c b/fs/bcachefs/rcu_pending.c index 67522aa344a7..bef2aa1b8bcd 100644 --- a/fs/bcachefs/rcu_pending.c +++ b/fs/bcachefs/rcu_pending.c @@ -25,21 +25,37 @@ enum rcu_pending_special { #define RCU_PENDING_KVFREE_FN ((rcu_pending_process_fn) (ulong) RCU_PENDING_KVFREE) #define RCU_PENDING_CALL_RCU_FN ((rcu_pending_process_fn) (ulong) RCU_PENDING_CALL_RCU) -static inline unsigned long __get_state_synchronize_rcu(struct srcu_struct *ssp) +#ifdef __KERNEL__ +typedef unsigned long rcu_gp_poll_state_t; + +static inline bool rcu_gp_poll_cookie_eq(rcu_gp_poll_state_t l, rcu_gp_poll_state_t r) +{ + return l == r; +} +#else +typedef struct urcu_gp_poll_state rcu_gp_poll_state_t; + +static inline bool rcu_gp_poll_cookie_eq(rcu_gp_poll_state_t l, rcu_gp_poll_state_t r) +{ + return l.grace_period_id == r.grace_period_id; +} +#endif + +static inline rcu_gp_poll_state_t __get_state_synchronize_rcu(struct srcu_struct *ssp) { return ssp ? get_state_synchronize_srcu(ssp) : get_state_synchronize_rcu(); } -static inline unsigned long __start_poll_synchronize_rcu(struct srcu_struct *ssp) +static inline rcu_gp_poll_state_t __start_poll_synchronize_rcu(struct srcu_struct *ssp) { return ssp ? start_poll_synchronize_srcu(ssp) : start_poll_synchronize_rcu(); } -static inline bool __poll_state_synchronize_rcu(struct srcu_struct *ssp, unsigned long cookie) +static inline bool __poll_state_synchronize_rcu(struct srcu_struct *ssp, rcu_gp_poll_state_t cookie) { return ssp ? poll_state_synchronize_srcu(ssp, cookie) @@ -71,13 +87,13 @@ struct rcu_pending_seq { GENRADIX(struct rcu_head *) objs; size_t nr; struct rcu_head **cursor; - unsigned long seq; + rcu_gp_poll_state_t seq; }; struct rcu_pending_list { struct rcu_head *head; struct rcu_head *tail; - unsigned long seq; + rcu_gp_poll_state_t seq; }; struct rcu_pending_pcpu { @@ -316,10 +332,10 @@ static void rcu_pending_rcu_cb(struct rcu_head *rcu) } static __always_inline struct rcu_pending_seq * -get_object_radix(struct rcu_pending_pcpu *p, unsigned long seq) +get_object_radix(struct rcu_pending_pcpu *p, rcu_gp_poll_state_t seq) { darray_for_each_reverse(p->objs, objs) - if (objs->seq == seq) + if (rcu_gp_poll_cookie_eq(objs->seq, seq)) return objs; if (darray_push_gfp(&p->objs, ((struct rcu_pending_seq) { .seq = seq }), GFP_ATOMIC)) @@ -329,7 +345,7 @@ get_object_radix(struct rcu_pending_pcpu *p, unsigned long seq) } static noinline bool -rcu_pending_enqueue_list(struct rcu_pending_pcpu *p, unsigned long seq, +rcu_pending_enqueue_list(struct rcu_pending_pcpu *p, rcu_gp_poll_state_t seq, struct rcu_head *head, void *ptr, unsigned long *flags) { @@ -364,7 +380,7 @@ rcu_pending_enqueue_list(struct rcu_pending_pcpu *p, unsigned long seq, again: for (struct rcu_pending_list *i = p->lists; i < p->lists + NUM_ACTIVE_RCU_POLL_OLDSTATE; i++) { - if (i->seq == seq) { + if (rcu_gp_poll_cookie_eq(i->seq, seq)) { rcu_pending_list_add(i, head); return false; } @@ -408,7 +424,7 @@ __rcu_pending_enqueue(struct rcu_pending *pending, struct rcu_head *head, struct rcu_pending_pcpu *p; struct rcu_pending_seq *objs; struct genradix_node *new_node = NULL; - unsigned long seq, flags; + unsigned long flags; bool start_gp = false; BUG_ON((ptr != NULL) != (pending->process == RCU_PENDING_KVFREE_FN)); @@ -416,7 +432,7 @@ __rcu_pending_enqueue(struct rcu_pending *pending, struct rcu_head *head, 
local_irq_save(flags); p = this_cpu_ptr(pending->p); spin_lock(&p->lock); - seq = __get_state_synchronize_rcu(pending->srcu); + rcu_gp_poll_state_t seq = __get_state_synchronize_rcu(pending->srcu); restart: if (may_sleep && unlikely(process_finished_items(pending, p, flags))) @@ -478,9 +494,7 @@ start_gp: */ if (!p->cb_armed) { p->cb_armed = true; - spin_unlock_irqrestore(&p->lock, flags); __call_rcu(pending->srcu, &p->cb, rcu_pending_rcu_cb); - goto free_node; } else { __start_poll_synchronize_rcu(pending->srcu); } -- 2.51.0 From f78760dede23affb50a6fe62b1230849e1a5d15f Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 1 Dec 2024 21:35:11 -0500 Subject: [PATCH 06/16] bcachefs: logged ops only use inum 0 of logged ops btree we wish to use the logged ops btree for other items that aren't strictly logged ops: cursors for inode allocation There's no reason to create another cached btree for inode allocator cursors - so reserve different parts of the keyspace for different purposes. Older versions will ignore or delete the cursors. Signed-off-by: Kent Overstreet --- fs/bcachefs/logged_ops.c | 10 +++++----- fs/bcachefs/logged_ops_format.h | 2 ++ 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/fs/bcachefs/logged_ops.c b/fs/bcachefs/logged_ops.c index 60e00702d1a4..1ac51af16299 100644 --- a/fs/bcachefs/logged_ops.c +++ b/fs/bcachefs/logged_ops.c @@ -63,8 +63,9 @@ fsck_err: int bch2_resume_logged_ops(struct bch_fs *c) { int ret = bch2_trans_run(c, - for_each_btree_key(trans, iter, - BTREE_ID_logged_ops, POS_MIN, + for_each_btree_key_max(trans, iter, + BTREE_ID_logged_ops, + POS(LOGGED_OPS_INUM, 0), POS(LOGGED_OPS_INUM, U64_MAX), BTREE_ITER_prefetch, k, resume_logged_op(trans, &iter, k))); bch_err_fn(c, ret); @@ -74,9 +75,8 @@ int bch2_resume_logged_ops(struct bch_fs *c) static int __bch2_logged_op_start(struct btree_trans *trans, struct bkey_i *k) { struct btree_iter iter; - int ret; - - ret = bch2_bkey_get_empty_slot(trans, &iter, BTREE_ID_logged_ops, POS_MAX); + int ret = bch2_bkey_get_empty_slot(trans, &iter, + BTREE_ID_logged_ops, POS(LOGGED_OPS_INUM, U64_MAX)); if (ret) return ret; diff --git a/fs/bcachefs/logged_ops_format.h b/fs/bcachefs/logged_ops_format.h index 6a4bf7129dba..0b370a963ac6 100644 --- a/fs/bcachefs/logged_ops_format.h +++ b/fs/bcachefs/logged_ops_format.h @@ -2,6 +2,8 @@ #ifndef _BCACHEFS_LOGGED_OPS_FORMAT_H #define _BCACHEFS_LOGGED_OPS_FORMAT_H +#define LOGGED_OPS_INUM 0 + struct bch_logged_op_truncate { struct bch_val v; __le32 subvol; -- 2.51.0 From 8dabb19ff4b802131ebfc1024de132b601c3c23d Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 3 Dec 2024 22:03:18 -0500 Subject: [PATCH 07/16] bcachefs: Simplify disk accounting validate late The validate late path was iterating over accounting entries in eytzinger order, which is unnecessarily tricky when we may have to remove entries. 
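The replacement pattern, sketched on a plain array (should_remove(),
entries and nr are stand-ins for the zeroed-counters check and the
accounting darray): walk backwards, so that removing the current entry
only shifts elements that were already visited, and restore eytzinger
order with a single sort at the end rather than after every removal:

  for (size_t i = nr; i-- > 0;)
          if (should_remove(&entries[i])) {
                  /* the shifted tail was already visited: */
                  memmove(&entries[i], &entries[i + 1],
                          (nr - i - 1) * sizeof(entries[0]));
                  nr--;
          }

  /* one re-sort instead of one per removal: */
  eytzinger0_sort(entries, nr, sizeof(entries[0]), cmp, NULL);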
Signed-off-by: Kent Overstreet --- fs/bcachefs/darray.h | 2 +- fs/bcachefs/disk_accounting.c | 28 ++++++++++++++-------------- 2 files changed, 15 insertions(+), 15 deletions(-) diff --git a/fs/bcachefs/darray.h b/fs/bcachefs/darray.h index 8f4c3f0665c4..c6151495985f 100644 --- a/fs/bcachefs/darray.h +++ b/fs/bcachefs/darray.h @@ -83,7 +83,7 @@ int __bch2_darray_resize_noprof(darray_char *, size_t, size_t, gfp_t); for (typeof(&(_d).data[0]) _i = (_d).data; _i < (_d).data + (_d).nr; _i++) #define darray_for_each_reverse(_d, _i) \ - for (typeof(&(_d).data[0]) _i = (_d).data + (_d).nr - 1; _i >= (_d).data; --_i) + for (typeof(&(_d).data[0]) _i = (_d).data + (_d).nr - 1; _i >= (_d).data && (_d).nr; --_i) #define darray_init(_d) \ do { \ diff --git a/fs/bcachefs/disk_accounting.c b/fs/bcachefs/disk_accounting.c index 71c49a7ee2fe..a915d9dc8de4 100644 --- a/fs/bcachefs/disk_accounting.c +++ b/fs/bcachefs/disk_accounting.c @@ -765,15 +765,16 @@ int bch2_accounting_read(struct bch_fs *c) keys->gap = keys->nr = dst - keys->data; percpu_down_write(&c->mark_lock); - unsigned i = 0; - while (i < acc->k.nr) { - unsigned idx = inorder_to_eytzinger0(i, acc->k.nr); + darray_for_each_reverse(acc->k, i) { struct disk_accounting_pos acc_k; - bpos_to_disk_accounting_pos(&acc_k, acc->k.data[idx].pos); + bpos_to_disk_accounting_pos(&acc_k, i->pos); u64 v[BCH_ACCOUNTING_MAX_COUNTERS]; - bch2_accounting_mem_read_counters(acc, idx, v, ARRAY_SIZE(v), false); + memset(v, 0, sizeof(v)); + + for (unsigned j = 0; j < i->nr_counters; j++) + v[j] = percpu_u64_get(i->v[0] + j); /* * If the entry counters are zeroed, it should be treated as @@ -782,26 +783,25 @@ int bch2_accounting_read(struct bch_fs *c) * Remove it, so that if it's re-added it gets re-marked in the * superblock: */ - ret = bch2_is_zero(v, sizeof(v[0]) * acc->k.data[idx].nr_counters) + ret = bch2_is_zero(v, sizeof(v[0]) * i->nr_counters) ? -BCH_ERR_remove_disk_accounting_entry - : bch2_disk_accounting_validate_late(trans, acc_k, - v, acc->k.data[idx].nr_counters); + : bch2_disk_accounting_validate_late(trans, acc_k, v, i->nr_counters); if (ret == -BCH_ERR_remove_disk_accounting_entry) { - free_percpu(acc->k.data[idx].v[0]); - free_percpu(acc->k.data[idx].v[1]); - darray_remove_item(&acc->k, &acc->k.data[idx]); - eytzinger0_sort(acc->k.data, acc->k.nr, sizeof(acc->k.data[0]), - accounting_pos_cmp, NULL); + free_percpu(i->v[0]); + free_percpu(i->v[1]); + darray_remove_item(&acc->k, i); ret = 0; continue; } if (ret) goto fsck_err; - i++; } + eytzinger0_sort(acc->k.data, acc->k.nr, sizeof(acc->k.data[0]), + accounting_pos_cmp, NULL); + preempt_disable(); struct bch_fs_usage_base *usage = this_cpu_ptr(c->usage); -- 2.51.0 From e3474394eb1a0e4ebf4a5e0e2531671fa96add16 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 4 Dec 2024 01:19:28 -0500 Subject: [PATCH 08/16] bcachefs: Advance to next bp on BCH_ERR_backpointer_to_overwritten_btree_node Don't spin. 
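The shape of the bug, reduced to a sketch (all names here are
stand-ins): a bare continue is only correct for transient errors, where
retrying the same element can succeed; on a persistent per-element
error the cursor has to advance first, or the loop retries the same
backpointer forever:

  while (bp_iter_valid(&iter)) {
          int ret = evacuate_one_bp(&iter);

          if (is_transient(ret))          /* e.g. transaction restart */
                  continue;               /* same bp may now succeed */
          if (is_persistent(ret)) {       /* e.g. overwritten btree node */
                  bp_iter_advance(&iter);  /* retrying would spin */
                  continue;
          }
          if (ret)
                  break;
          bp_iter_advance(&iter);
  }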
Fixes: de95cc201a97 ("bcachefs: Kill bch2_get_next_backpointer()") Signed-off-by: Kent Overstreet --- fs/bcachefs/move.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c index 460175464762..6f21e36d89f7 100644 --- a/fs/bcachefs/move.c +++ b/fs/bcachefs/move.c @@ -785,7 +785,7 @@ int bch2_evacuate_bucket(struct moving_context *ctxt, b = bch2_backpointer_get_node(trans, bp, &iter); ret = PTR_ERR_OR_ZERO(b); if (ret == -BCH_ERR_backpointer_to_overwritten_btree_node) - continue; + goto next; if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) continue; if (ret) -- 2.51.0 From 400af9a398186851103e27d848ef42be8870072b Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 4 Dec 2024 17:44:25 -0500 Subject: [PATCH 09/16] bcachefs: trace_accounting_mem_insert Add a tracepoint for inserting new accounting entries: we're seeing odd spinning behaviour in accounting read. Signed-off-by: Kent Overstreet --- fs/bcachefs/disk_accounting.c | 8 ++++++++ fs/bcachefs/trace.h | 24 ++++++++++++++++++++++++ 2 files changed, 32 insertions(+) diff --git a/fs/bcachefs/disk_accounting.c b/fs/bcachefs/disk_accounting.c index a915d9dc8de4..a0061bcf9159 100644 --- a/fs/bcachefs/disk_accounting.c +++ b/fs/bcachefs/disk_accounting.c @@ -324,6 +324,14 @@ static int __bch2_accounting_mem_insert(struct bch_fs *c, struct bkey_s_c_accoun eytzinger0_sort(acc->k.data, acc->k.nr, sizeof(acc->k.data[0]), accounting_pos_cmp, NULL); + + if (trace_accounting_mem_insert_enabled()) { + struct printbuf buf = PRINTBUF; + + bch2_accounting_to_text(&buf, c, a.s_c); + trace_accounting_mem_insert(c, buf.buf); + printbuf_exit(&buf); + } return 0; err: free_percpu(n.v[1]); diff --git a/fs/bcachefs/trace.h b/fs/bcachefs/trace.h index 2d5932d2881e..7baf66beee22 100644 --- a/fs/bcachefs/trace.h +++ b/fs/bcachefs/trace.h @@ -199,6 +199,30 @@ DECLARE_EVENT_CLASS(bio, (unsigned long long)__entry->sector, __entry->nr_sector) ); +/* disk_accounting.c */ + +TRACE_EVENT(accounting_mem_insert, + TP_PROTO(struct bch_fs *c, const char *acc), + TP_ARGS(c, acc), + + TP_STRUCT__entry( + __field(dev_t, dev ) + __field(unsigned, new_nr ) + __string(acc, acc ) + ), + + TP_fast_assign( + __entry->dev = c->dev; + __entry->new_nr = c->accounting.k.nr; + __assign_str(acc); + ), + + TP_printk("%d,%d entries %u added %s", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->new_nr, + __get_str(acc)) +); + /* fs.c: */ TRACE_EVENT(bch2_sync_fs, TP_PROTO(struct super_block *sb, int wait), -- 2.51.0 From 3f1cf04ff9877bf043795d05bb6704d0a85bcd80 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 4 Dec 2024 17:48:06 -0500 Subject: [PATCH 10/16] bcachefs: Silence "unable to allocate journal write" if we're already RO Signed-off-by: Kent Overstreet --- fs/bcachefs/journal_io.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c index bb69d80886b5..e7a43400a587 100644 --- a/fs/bcachefs/journal_io.c +++ b/fs/bcachefs/journal_io.c @@ -2032,7 +2032,7 @@ CLOSURE_CALLBACK(bch2_journal_write) bch2_journal_do_discards(j); } - if (ret) { + if (ret && !bch2_journal_error(j)) { struct printbuf buf = PRINTBUF; buf.atomic++; @@ -2044,8 +2044,9 @@ CLOSURE_CALLBACK(bch2_journal_write) spin_unlock(&j->lock); bch2_print_string_as_lines(KERN_ERR, buf.buf); printbuf_exit(&buf); - goto err; } + if (ret) + goto err; /* * write is allocated, no longer need to account for it in -- 2.51.0 From 6728f8f829cf68ae25cc664d3b1ba7034bc81fd4 Mon Sep 17 00:00:00 
2001 From: Kent Overstreet Date: Wed, 4 Dec 2024 17:53:38 -0500 Subject: [PATCH 11/16] bcachefs: BCH_ERR_insufficient_journal_devices kill another standard error code use Signed-off-by: Kent Overstreet --- fs/bcachefs/errcode.h | 1 + fs/bcachefs/journal_io.c | 5 ++--- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/fs/bcachefs/errcode.h b/fs/bcachefs/errcode.h index 47387f7d6202..5e4dd85ac669 100644 --- a/fs/bcachefs/errcode.h +++ b/fs/bcachefs/errcode.h @@ -263,6 +263,7 @@ x(EIO, missing_indirect_extent) \ x(EIO, invalidate_stripe_to_dev) \ x(EIO, no_encryption_key) \ + x(EIO, insufficient_journal_devices) \ x(BCH_ERR_btree_node_read_err, btree_node_read_err_fixable) \ x(BCH_ERR_btree_node_read_err, btree_node_read_err_want_retry) \ x(BCH_ERR_btree_node_read_err, btree_node_read_err_must_retry) \ diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c index e7a43400a587..e5fce5e497f2 100644 --- a/fs/bcachefs/journal_io.c +++ b/fs/bcachefs/journal_io.c @@ -1503,8 +1503,7 @@ retry: devs_sorted = bch2_dev_alloc_list(c, &j->wp.stripe, &devs); - __journal_write_alloc(j, w, &devs_sorted, - sectors, &replicas, replicas_want); + __journal_write_alloc(j, w, &devs_sorted, sectors, &replicas, replicas_want); if (replicas >= replicas_want) goto done; @@ -1544,7 +1543,7 @@ done: BUG_ON(bkey_val_u64s(&w->key.k) > BCH_REPLICAS_MAX); - return replicas >= replicas_need ? 0 : -EROFS; + return replicas >= replicas_need ? 0 : -BCH_ERR_insufficient_journal_devices; } static void journal_buf_realloc(struct journal *j, struct journal_buf *buf) -- 2.51.0 From 49833ce27ed2eed91915a4c25690d82aae5b6a0b Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 4 Dec 2024 18:16:25 -0500 Subject: [PATCH 12/16] bcachefs: Fix failure to allocate journal write on discard retry When allocating a journal write fails, then retries after doing discards, we were failing to count already allocated replicas. Signed-off-by: Kent Overstreet --- fs/bcachefs/journal_io.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c index e5fce5e497f2..d7dfea5f0181 100644 --- a/fs/bcachefs/journal_io.c +++ b/fs/bcachefs/journal_io.c @@ -1498,6 +1498,15 @@ static int journal_write_alloc(struct journal *j, struct journal_buf *w) READ_ONCE(c->opts.metadata_replicas_required)); rcu_read_lock(); + + /* We might run more than once if we have to stop and do discards: */ + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(bkey_i_to_s_c(&w->key)); + bkey_for_each_ptr(ptrs, p) { + struct bch_dev *ca = bch2_dev_rcu_noerror(c, p->dev); + if (ca) + replicas += ca->mi.durability; + } + retry: devs = target_rw_devs(c, BCH_DATA_journal, target); -- 2.51.0 From 47d6ee766f8033563aff333f326378cd4b36a170 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 4 Dec 2024 19:21:22 -0500 Subject: [PATCH 13/16] bcachefs: dev_alloc_list.devs -> dev_alloc_list.data This lets us use darray macros on dev_alloc_list (and it will become a darray eventually, when we increase the maximum number of devices). 
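For reference, darray_for_each() (its definition is visible in the
darray.h hunk of the earlier disk accounting patch) expands purely in
terms of the ->data and ->nr members:

  #define darray_for_each(_d, _i) \
          for (typeof(&(_d).data[0]) _i = (_d).data; \
               _i < (_d).data + (_d).nr; _i++)

so any struct exposing that pair of members can use it, and after the
rename iteration reads as in the hunks below:

  darray_for_each(devs_sorted, i) {
          struct bch_dev *ca = rcu_dereference(c->devs[*i]);
          ...
  }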
Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_foreground.c | 60 ++++++++++++++-------------------- fs/bcachefs/alloc_foreground.h | 2 +- fs/bcachefs/journal_io.c | 21 +++++------- 3 files changed, 34 insertions(+), 49 deletions(-) diff --git a/fs/bcachefs/alloc_foreground.c b/fs/bcachefs/alloc_foreground.c index 095bfe7c53bd..49c9275465f9 100644 --- a/fs/bcachefs/alloc_foreground.c +++ b/fs/bcachefs/alloc_foreground.c @@ -626,9 +626,9 @@ struct dev_alloc_list bch2_dev_alloc_list(struct bch_fs *c, unsigned i; for_each_set_bit(i, devs->d, BCH_SB_MEMBERS_MAX) - ret.devs[ret.nr++] = i; + ret.data[ret.nr++] = i; - bubble_sort(ret.devs, ret.nr, dev_stripe_cmp); + bubble_sort(ret.data, ret.nr, dev_stripe_cmp); return ret; } @@ -700,18 +700,13 @@ int bch2_bucket_alloc_set_trans(struct btree_trans *trans, struct closure *cl) { struct bch_fs *c = trans->c; - struct dev_alloc_list devs_sorted = - bch2_dev_alloc_list(c, stripe, devs_may_alloc); int ret = -BCH_ERR_insufficient_devices; BUG_ON(*nr_effective >= nr_replicas); - for (unsigned i = 0; i < devs_sorted.nr; i++) { - struct bch_dev_usage usage; - struct open_bucket *ob; - - unsigned dev = devs_sorted.devs[i]; - struct bch_dev *ca = bch2_dev_tryget_noerror(c, dev); + struct dev_alloc_list devs_sorted = bch2_dev_alloc_list(c, stripe, devs_may_alloc); + darray_for_each(devs_sorted, i) { + struct bch_dev *ca = bch2_dev_tryget_noerror(c, *i); if (!ca) continue; @@ -720,8 +715,9 @@ int bch2_bucket_alloc_set_trans(struct btree_trans *trans, continue; } - ob = bch2_bucket_alloc_trans(trans, ca, watermark, data_type, - cl, flags & BCH_WRITE_ALLOC_NOWAIT, &usage); + struct bch_dev_usage usage; + struct open_bucket *ob = bch2_bucket_alloc_trans(trans, ca, watermark, data_type, + cl, flags & BCH_WRITE_ALLOC_NOWAIT, &usage); if (!IS_ERR(ob)) bch2_dev_stripe_increment_inlined(ca, stripe, &usage); bch2_dev_put(ca); @@ -765,10 +761,6 @@ static int bucket_alloc_from_stripe(struct btree_trans *trans, struct closure *cl) { struct bch_fs *c = trans->c; - struct dev_alloc_list devs_sorted; - struct ec_stripe_head *h; - struct open_bucket *ob; - unsigned i, ec_idx; int ret = 0; if (nr_replicas < 2) @@ -777,34 +769,32 @@ static int bucket_alloc_from_stripe(struct btree_trans *trans, if (ec_open_bucket(c, ptrs)) return 0; - h = bch2_ec_stripe_head_get(trans, target, 0, nr_replicas - 1, watermark, cl); + struct ec_stripe_head *h = + bch2_ec_stripe_head_get(trans, target, 0, nr_replicas - 1, watermark, cl); if (IS_ERR(h)) return PTR_ERR(h); if (!h) return 0; - devs_sorted = bch2_dev_alloc_list(c, &wp->stripe, devs_may_alloc); - - for (i = 0; i < devs_sorted.nr; i++) - for (ec_idx = 0; ec_idx < h->s->nr_data; ec_idx++) { + struct dev_alloc_list devs_sorted = bch2_dev_alloc_list(c, &wp->stripe, devs_may_alloc); + darray_for_each(devs_sorted, i) + for (unsigned ec_idx = 0; ec_idx < h->s->nr_data; ec_idx++) { if (!h->s->blocks[ec_idx]) continue; - ob = c->open_buckets + h->s->blocks[ec_idx]; - if (ob->dev == devs_sorted.devs[i] && - !test_and_set_bit(ec_idx, h->s->blocks_allocated)) - goto got_bucket; + struct open_bucket *ob = c->open_buckets + h->s->blocks[ec_idx]; + if (ob->dev == *i && !test_and_set_bit(ec_idx, h->s->blocks_allocated)) { + ob->ec_idx = ec_idx; + ob->ec = h->s; + ec_stripe_new_get(h->s, STRIPE_REF_io); + + ret = add_new_bucket(c, ptrs, devs_may_alloc, + nr_replicas, nr_effective, + have_cache, ob); + goto out; + } } - goto out_put_head; -got_bucket: - ob->ec_idx = ec_idx; - ob->ec = h->s; - ec_stripe_new_get(h->s, STRIPE_REF_io); - - ret = 
add_new_bucket(c, ptrs, devs_may_alloc, - nr_replicas, nr_effective, - have_cache, ob); -out_put_head: +out: bch2_ec_stripe_head_put(c, h); return ret; } diff --git a/fs/bcachefs/alloc_foreground.h b/fs/bcachefs/alloc_foreground.h index 4f87745df97e..f25481a0d1a0 100644 --- a/fs/bcachefs/alloc_foreground.h +++ b/fs/bcachefs/alloc_foreground.h @@ -20,7 +20,7 @@ void bch2_reset_alloc_cursors(struct bch_fs *); struct dev_alloc_list { unsigned nr; - u8 devs[BCH_SB_MEMBERS_MAX]; + u8 data[BCH_SB_MEMBERS_MAX]; }; struct dev_alloc_list bch2_dev_alloc_list(struct bch_fs *, diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c index d7dfea5f0181..9a1647297d11 100644 --- a/fs/bcachefs/journal_io.c +++ b/fs/bcachefs/journal_io.c @@ -1422,25 +1422,22 @@ fsck_err: static void __journal_write_alloc(struct journal *j, struct journal_buf *w, - struct dev_alloc_list *devs_sorted, + struct dev_alloc_list *devs, unsigned sectors, unsigned *replicas, unsigned replicas_want) { struct bch_fs *c = container_of(j, struct bch_fs, journal); - struct journal_device *ja; - struct bch_dev *ca; - unsigned i; if (*replicas >= replicas_want) return; - for (i = 0; i < devs_sorted->nr; i++) { - ca = rcu_dereference(c->devs[devs_sorted->devs[i]]); + darray_for_each(*devs, i) { + struct bch_dev *ca = rcu_dereference(c->devs[*i]); if (!ca) continue; - ja = &ca->journal; + struct journal_device *ja = &ca->journal; /* * Check that we can use this device, and aren't already using @@ -1486,13 +1483,11 @@ static int journal_write_alloc(struct journal *j, struct journal_buf *w) { struct bch_fs *c = container_of(j, struct bch_fs, journal); struct bch_devs_mask devs; - struct journal_device *ja; - struct bch_dev *ca; struct dev_alloc_list devs_sorted; unsigned sectors = vstruct_sectors(w->data, c->block_bits); unsigned target = c->opts.metadata_target ?: c->opts.foreground_target; - unsigned i, replicas = 0, replicas_want = + unsigned replicas = 0, replicas_want = READ_ONCE(c->opts.metadata_replicas); unsigned replicas_need = min_t(unsigned, replicas_want, READ_ONCE(c->opts.metadata_replicas_required)); @@ -1517,12 +1512,12 @@ retry: if (replicas >= replicas_want) goto done; - for (i = 0; i < devs_sorted.nr; i++) { - ca = rcu_dereference(c->devs[devs_sorted.devs[i]]); + darray_for_each(devs_sorted, i) { + struct bch_dev *ca = rcu_dereference(c->devs[*i]); if (!ca) continue; - ja = &ca->journal; + struct journal_device *ja = &ca->journal; if (sectors > ja->sectors_free && sectors <= ca->mi.bucket_size && -- 2.51.0 From ff7e7c5367250454ed10a6113695d2e01ccc0cfc Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 4 Dec 2024 18:14:14 -0500 Subject: [PATCH 14/16] bcachefs: Journal write path refactoring, debug improvements Signed-off-by: Kent Overstreet --- fs/bcachefs/journal.c | 6 ++++ fs/bcachefs/journal_io.c | 70 ++++++++++++++++++++++------------------ 2 files changed, 45 insertions(+), 31 deletions(-) diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c index dc66521964b7..04a9ccf76d75 100644 --- a/fs/bcachefs/journal.c +++ b/fs/bcachefs/journal.c @@ -1564,6 +1564,9 @@ void __bch2_journal_debug_to_text(struct printbuf *out, struct journal *j) printbuf_indent_sub(out, 2); for_each_member_device_rcu(c, ca, &c->rw_devs[BCH_DATA_journal]) { + if (!ca->mi.durability) + continue; + struct journal_device *ja = &ca->journal; if (!test_bit(ca->dev_idx, c->rw_devs[BCH_DATA_journal].d)) @@ -1573,6 +1576,7 @@ void __bch2_journal_debug_to_text(struct printbuf *out, struct journal *j) continue; prt_printf(out, "dev %u:\n", 
ca->dev_idx); + prt_printf(out, "durability %u:\n", ca->mi.durability); printbuf_indent_add(out, 2); prt_printf(out, "nr\t%u\n", ja->nr); prt_printf(out, "bucket size\t%u\n", ca->mi.bucket_size); @@ -1584,6 +1588,8 @@ void __bch2_journal_debug_to_text(struct printbuf *out, struct journal *j) printbuf_indent_sub(out, 2); } + prt_printf(out, "replicas want %u need %u\n", c->opts.metadata_replicas, c->opts.metadata_replicas_required); + rcu_read_unlock(); --out->atomic; diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c index 9a1647297d11..2f4daa8bd498 100644 --- a/fs/bcachefs/journal_io.c +++ b/fs/bcachefs/journal_io.c @@ -1420,6 +1420,35 @@ fsck_err: /* journal write: */ +static void journal_advance_devs_to_next_bucket(struct journal *j, + struct dev_alloc_list *devs, + unsigned sectors, u64 seq) +{ + struct bch_fs *c = container_of(j, struct bch_fs, journal); + + darray_for_each(*devs, i) { + struct bch_dev *ca = rcu_dereference(c->devs[*i]); + if (!ca) + continue; + + struct journal_device *ja = &ca->journal; + + if (sectors > ja->sectors_free && + sectors <= ca->mi.bucket_size && + bch2_journal_dev_buckets_available(j, ja, + journal_space_discarded)) { + ja->cur_idx = (ja->cur_idx + 1) % ja->nr; + ja->sectors_free = ca->mi.bucket_size; + + /* + * ja->bucket_seq[ja->cur_idx] must always have + * something sensible: + */ + ja->bucket_seq[ja->cur_idx] = le64_to_cpu(seq); + } + } +} + static void __journal_write_alloc(struct journal *j, struct journal_buf *w, struct dev_alloc_list *devs, @@ -1429,9 +1458,6 @@ static void __journal_write_alloc(struct journal *j, { struct bch_fs *c = container_of(j, struct bch_fs, journal); - if (*replicas >= replicas_want) - return; - darray_for_each(*devs, i) { struct bch_dev *ca = rcu_dereference(c->devs[*i]); if (!ca) @@ -1491,6 +1517,7 @@ static int journal_write_alloc(struct journal *j, struct journal_buf *w) READ_ONCE(c->opts.metadata_replicas); unsigned replicas_need = min_t(unsigned, replicas_want, READ_ONCE(c->opts.metadata_replicas_required)); + bool advance_done = false; rcu_read_lock(); @@ -1502,45 +1529,26 @@ static int journal_write_alloc(struct journal *j, struct journal_buf *w) replicas += ca->mi.durability; } -retry: +retry_target: devs = target_rw_devs(c, BCH_DATA_journal, target); - devs_sorted = bch2_dev_alloc_list(c, &j->wp.stripe, &devs); - +retry_alloc: __journal_write_alloc(j, w, &devs_sorted, sectors, &replicas, replicas_want); - if (replicas >= replicas_want) + if (likely(replicas >= replicas_want)) goto done; - darray_for_each(devs_sorted, i) { - struct bch_dev *ca = rcu_dereference(c->devs[*i]); - if (!ca) - continue; - - struct journal_device *ja = &ca->journal; - - if (sectors > ja->sectors_free && - sectors <= ca->mi.bucket_size && - bch2_journal_dev_buckets_available(j, ja, - journal_space_discarded)) { - ja->cur_idx = (ja->cur_idx + 1) % ja->nr; - ja->sectors_free = ca->mi.bucket_size; - - /* - * ja->bucket_seq[ja->cur_idx] must always have - * something sensible: - */ - ja->bucket_seq[ja->cur_idx] = le64_to_cpu(w->data->seq); - } + if (!advance_done) { + journal_advance_devs_to_next_bucket(j, &devs_sorted, sectors, w->data->seq); + advance_done = true; + goto retry_alloc; } - __journal_write_alloc(j, w, &devs_sorted, - sectors, &replicas, replicas_want); - if (replicas < replicas_want && target) { /* Retry from all devices: */ target = 0; - goto retry; + advance_done = false; + goto retry_target; } done: rcu_read_unlock(); -- 2.51.0 From 90c6daa6ac90a7f83efa566350fbe2404f848ef0 Mon Sep 17 00:00:00 2001 From: 
Kent Overstreet Date: Wed, 4 Dec 2024 19:41:38 -0500 Subject: [PATCH 15/16] bcachefs: Call bch2_btree_lost_data() on btree read error Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_gc.c | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c index e59924cfe2bc..24f2f3bdf704 100644 --- a/fs/bcachefs/btree_gc.c +++ b/fs/bcachefs/btree_gc.c @@ -29,6 +29,7 @@ #include "move.h" #include "recovery_passes.h" #include "reflink.h" +#include "recovery.h" #include "replicas.h" #include "super-io.h" #include "trace.h" @@ -359,11 +360,9 @@ again: if (ret) break; - if (!btree_id_is_alloc(b->c.btree_id)) { - ret = bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_scan_for_btree_nodes); - if (ret) - break; - } + ret = bch2_btree_lost_data(c, b->c.btree_id); + if (ret) + break; continue; } @@ -525,7 +524,7 @@ int bch2_check_topology(struct bch_fs *c) bch2_btree_id_to_text(&buf, i); if (r->error) { - ret = bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_scan_for_btree_nodes); + ret = bch2_btree_lost_data(c, i); if (ret) break; reconstruct_root: @@ -741,7 +740,7 @@ static int bch2_gc_btrees(struct bch_fs *c) (printbuf_reset(&buf), bch2_btree_id_to_text(&buf, btree), buf.buf))) - ret = bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_check_topology); + ret = bch2_btree_lost_data(c, btree); } fsck_err: printbuf_exit(&buf); -- 2.51.0 From c67fab0774cee93b6aac9adc3601bcf0a4ea6ab4 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 4 Dec 2024 19:46:35 -0500 Subject: [PATCH 16/16] bcachefs: Make sure __bch2_run_explicit_recovery_pass() signals to rewind We should always signal to rewind if the requested pass hasn't been run, even if called multiple times. Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs.h | 1 + fs/bcachefs/recovery_passes.c | 52 +++++++++++++++++------------------ 2 files changed, 27 insertions(+), 26 deletions(-) diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index b12c9c78beec..e6cd93e1ed0f 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -1044,6 +1044,7 @@ struct bch_fs { * for signaling to the toplevel code which pass we want to run now. 
*/ enum bch_recovery_pass curr_recovery_pass; + enum bch_recovery_pass next_recovery_pass; /* bitmask of recovery passes that we actually ran */ u64 recovery_passes_complete; /* never rewinds version of curr_recovery_pass */ diff --git a/fs/bcachefs/recovery_passes.c b/fs/bcachefs/recovery_passes.c index f6d3a99cb63e..0b3c951c32da 100644 --- a/fs/bcachefs/recovery_passes.c +++ b/fs/bcachefs/recovery_passes.c @@ -103,27 +103,31 @@ u64 bch2_recovery_passes_from_stable(u64 v) static int __bch2_run_explicit_recovery_pass(struct bch_fs *c, enum bch_recovery_pass pass) { - if (c->opts.recovery_passes & BIT_ULL(pass)) - return 0; - if (c->curr_recovery_pass == ARRAY_SIZE(recovery_pass_fns)) return -BCH_ERR_not_in_recovery; + if (c->recovery_passes_complete & BIT_ULL(pass)) + return 0; + + bool print = !(c->opts.recovery_passes & BIT_ULL(pass)); + if (pass < BCH_RECOVERY_PASS_set_may_go_rw && c->curr_recovery_pass >= BCH_RECOVERY_PASS_set_may_go_rw) { - bch_info(c, "need recovery pass %s (%u), but already rw", - bch2_recovery_passes[pass], pass); + if (print) + bch_info(c, "need recovery pass %s (%u), but already rw", + bch2_recovery_passes[pass], pass); return -BCH_ERR_cannot_rewind_recovery; } - bch_info(c, "running explicit recovery pass %s (%u), currently at %s (%u)", - bch2_recovery_passes[pass], pass, - bch2_recovery_passes[c->curr_recovery_pass], c->curr_recovery_pass); + if (print) + bch_info(c, "running explicit recovery pass %s (%u), currently at %s (%u)", + bch2_recovery_passes[pass], pass, + bch2_recovery_passes[c->curr_recovery_pass], c->curr_recovery_pass); c->opts.recovery_passes |= BIT_ULL(pass); - if (c->curr_recovery_pass >= pass) { - c->curr_recovery_pass = pass; + if (c->curr_recovery_pass > pass) { + c->next_recovery_pass = pass; c->recovery_passes_complete &= (1ULL << pass) >> 1; return -BCH_ERR_restart_recovery; } else { @@ -264,7 +268,9 @@ int bch2_run_recovery_passes(struct bch_fs *c) */ c->opts.recovery_passes_exclude &= ~BCH_RECOVERY_PASS_set_may_go_rw; - while (c->curr_recovery_pass < ARRAY_SIZE(recovery_pass_fns)) { + while (c->curr_recovery_pass < ARRAY_SIZE(recovery_pass_fns) && !ret) { + c->next_recovery_pass = c->curr_recovery_pass + 1; + spin_lock_irq(&c->recovery_pass_lock); unsigned pass = c->curr_recovery_pass; @@ -285,31 +291,25 @@ int bch2_run_recovery_passes(struct bch_fs *c) ret = bch2_run_recovery_pass(c, pass) ?: bch2_journal_flush(&c->journal); + if (!ret && !test_bit(BCH_FS_error, &c->flags)) + bch2_clear_recovery_pass_required(c, pass); + spin_lock_irq(&c->recovery_pass_lock); - if (c->curr_recovery_pass < pass) { + if (c->next_recovery_pass < c->curr_recovery_pass) { /* * bch2_run_explicit_recovery_pass() was called: we * can't always catch -BCH_ERR_restart_recovery because * it may have been called from another thread (btree * node read completion) */ - spin_unlock_irq(&c->recovery_pass_lock); - continue; - } else if (c->curr_recovery_pass == pass) { - c->curr_recovery_pass++; + ret = 0; + c->recovery_passes_complete &= ~(~0ULL << c->curr_recovery_pass); } else { - BUG(); + c->recovery_passes_complete |= BIT_ULL(pass); + c->recovery_pass_done = max(c->recovery_pass_done, pass); } + c->curr_recovery_pass = c->next_recovery_pass; spin_unlock_irq(&c->recovery_pass_lock); - - if (ret) - break; - - c->recovery_passes_complete |= BIT_ULL(pass); - c->recovery_pass_done = max(c->recovery_pass_done, pass); - - if (!test_bit(BCH_FS_error, &c->flags)) - bch2_clear_recovery_pass_required(c, pass); } return ret; -- 2.51.0