From cf164a91066d9af7db3cfa9ee2ac2e36f692dc5e Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 1 Mar 2025 17:34:33 -0500 Subject: [PATCH 01/16] bcachefs: bch2_dev_get_ioref() may now sleep The next patch implementing freezing will change bch2_dev_get_ioref() to sleep if a device is currently frozen. Add an annotation and fix the journal code accordingly. Signed-off-by: Kent Overstreet --- fs/bcachefs/journal_io.c | 5 ++++- fs/bcachefs/sb-members.h | 2 ++ 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c index c12d9f9bd536..a510755a8364 100644 --- a/fs/bcachefs/journal_io.c +++ b/fs/bcachefs/journal_io.c @@ -1664,6 +1664,7 @@ static CLOSURE_CALLBACK(journal_write_done) } bool completed = false; + bool do_discards = false; for (seq = journal_last_unwritten_seq(j); seq <= journal_cur_seq(j); @@ -1676,7 +1677,6 @@ static CLOSURE_CALLBACK(journal_write_done) j->flushed_seq_ondisk = seq; j->last_seq_ondisk = w->last_seq; - bch2_do_discards(c); closure_wake_up(&c->freelist_wait); bch2_reset_alloc_cursors(c); } @@ -1727,6 +1727,9 @@ static CLOSURE_CALLBACK(journal_write_done) */ bch2_journal_do_writes(j); spin_unlock(&j->lock); + + if (do_discards) + bch2_do_discards(c); } static void journal_write_endio(struct bio *bio) diff --git a/fs/bcachefs/sb-members.h b/fs/bcachefs/sb-members.h index b29b6c6c21dd..df91b02ce575 100644 --- a/fs/bcachefs/sb-members.h +++ b/fs/bcachefs/sb-members.h @@ -283,6 +283,8 @@ static inline struct bch_dev *bch2_dev_iterate(struct bch_fs *c, struct bch_dev static inline struct bch_dev *bch2_dev_get_ioref(struct bch_fs *c, unsigned dev, int rw) { + might_sleep(); + rcu_read_lock(); struct bch_dev *ca = bch2_dev_rcu(c, dev); if (ca && !percpu_ref_tryget(&ca->io_ref)) -- 2.51.0 From d71e023376d3e56bf2a787c9b5d2600a2db2aabf Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 7 Mar 2025 10:50:49 -0500 Subject: [PATCH 02/16] bcachefs: Change BCH_MEMBER_STATE_failed semantics 
Previously, we wouldn't try to read at all from a failed device - that doesn't make much sense, the device may be unhealthy (perhaps taking longer than it should to service reads), but if it's our only option we should still try to read from it. Now, bch2_bkey_pick_read_device() will pick failed devices only if there are no non-failed replicas to read from. Signed-off-by: Kent Overstreet --- fs/bcachefs/extents.c | 24 ++++++++++++++++++------ fs/bcachefs/sb-members.h | 2 +- 2 files changed, 19 insertions(+), 7 deletions(-) diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c index d9bdf433c118..032cd0bda017 100644 --- a/fs/bcachefs/extents.c +++ b/fs/bcachefs/extents.c @@ -79,12 +79,16 @@ void bch2_mark_io_failure(struct bch_io_failures *failed, } } -static inline u64 dev_latency(struct bch_fs *c, unsigned dev) +static inline u64 dev_latency(struct bch_dev *ca) { - struct bch_dev *ca = bch2_dev_rcu(c, dev); return ca ? atomic64_read(&ca->cur_latency[READ]) : S64_MAX; } +static inline int dev_failed(struct bch_dev *ca) +{ + return !ca || ca->mi.state == BCH_MEMBER_STATE_failed; +} + /* * returns true if p1 is better than p2: */ @@ -93,8 +97,16 @@ static inline bool ptr_better(struct bch_fs *c, const struct extent_ptr_decoded p2) { if (likely(!p1.idx && !p2.idx)) { - u64 l1 = dev_latency(c, p1.ptr.dev); - u64 l2 = dev_latency(c, p2.ptr.dev); + struct bch_dev *ca1 = bch2_dev_rcu(c, p1.ptr.dev); + struct bch_dev *ca2 = bch2_dev_rcu(c, p2.ptr.dev); + + int failed_delta = dev_failed(ca1) - dev_failed(ca2); + + if (failed_delta) + return failed_delta < 0; + + u64 l1 = dev_latency(ca1); + u64 l2 = dev_latency(ca2); /* * Square the latencies, to bias more in favor of the faster @@ -170,7 +182,7 @@ int bch2_bkey_pick_read_device(struct bch_fs *c, struct bkey_s_c k, ? 
f->idx : f->idx + 1; - if (!p.idx && (!ca || !bch2_dev_is_readable(ca))) + if (!p.idx && (!ca || !bch2_dev_is_online(ca))) p.idx++; if (!p.idx && p.has_ec && bch2_force_reconstruct_read) @@ -1012,7 +1024,7 @@ static bool want_cached_ptr(struct bch_fs *c, struct bch_io_opts *opts, struct bch_dev *ca = bch2_dev_rcu_noerror(c, ptr->dev); - return ca && bch2_dev_is_readable(ca) && !dev_ptr_stale_rcu(ca, ptr); + return ca && bch2_dev_is_healthy(ca) && !dev_ptr_stale_rcu(ca, ptr); } void bch2_extent_ptr_set_cached(struct bch_fs *c, diff --git a/fs/bcachefs/sb-members.h b/fs/bcachefs/sb-members.h index df91b02ce575..38261638a611 100644 --- a/fs/bcachefs/sb-members.h +++ b/fs/bcachefs/sb-members.h @@ -35,7 +35,7 @@ static inline bool bch2_dev_idx_is_online(struct bch_fs *c, unsigned dev) return ret; } -static inline bool bch2_dev_is_readable(struct bch_dev *ca) +static inline bool bch2_dev_is_healthy(struct bch_dev *ca) { return bch2_dev_is_online(ca) && ca->mi.state != BCH_MEMBER_STATE_failed; -- 2.51.0 From 981e3801443f507d74e2dae5710452642c96e8e3 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 26 Feb 2025 18:44:23 -0500 Subject: [PATCH 03/16] bcachefs: Kick devices out after too many write IO errors We're improving our handling of write errors - we shouldn't write degraded data just because a write failed once, we should retry it (on other devices, if possible). But for this to work, we need to kick devices out when they're only returning errors - otherwise those retries will loop infinitely. This adds a configurable timeout - if writes are failing for too long, we'll set that device read-only. In the future we should also implement more tracking and another knob for an "allowed error rate", so that we can kick out drives that are acting "unhealthy". Another thing we'll want is a mechanism (likely in userspace) for bringing a device back in after a transient error - perhaps a cable was jiggled, or there was a controller reset. 
After transient errors we also need a mechanism to walk (from the journal) recent btree updates that weren't flushed to that device and treat them as "degraded", since unflushed data may well not have been written. Out of scope for this patch, but becoming relevant. Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs.h | 1 + fs/bcachefs/bcachefs_format.h | 1 + fs/bcachefs/error.c | 34 +++++++++++++++++++++++++--------- fs/bcachefs/error.h | 7 ++++++- fs/bcachefs/opts.h | 5 +++++ fs/bcachefs/super-io.c | 3 +++ 6 files changed, 41 insertions(+), 10 deletions(-) diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index d2c3f59a668f..8abefc994016 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -536,6 +536,7 @@ struct bch_dev { */ struct bch_member_cpu mi; atomic64_t errors[BCH_MEMBER_ERROR_NR]; + unsigned long write_errors_start; __uuid_t uuid; char name[BDEVNAME_SIZE]; diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h index a6cc817ccd87..7a5b0d211a82 100644 --- a/fs/bcachefs/bcachefs_format.h +++ b/fs/bcachefs/bcachefs_format.h @@ -860,6 +860,7 @@ LE64_BITMASK(BCH_SB_VERSION_INCOMPAT, struct bch_sb, flags[5], 32, 48); LE64_BITMASK(BCH_SB_VERSION_INCOMPAT_ALLOWED, struct bch_sb, flags[5], 48, 64); LE64_BITMASK(BCH_SB_SHARD_INUMS_NBITS, struct bch_sb, flags[6], 0, 4); +LE64_BITMASK(BCH_SB_WRITE_ERROR_TIMEOUT,struct bch_sb, flags[6], 4, 14); static inline __u64 BCH_SB_COMPRESSION_TYPE(const struct bch_sb *sb) { diff --git a/fs/bcachefs/error.c b/fs/bcachefs/error.c index 3f93a5a6bbfa..6d68c89a49b2 100644 --- a/fs/bcachefs/error.c +++ b/fs/bcachefs/error.c @@ -54,25 +54,41 @@ void bch2_io_error_work(struct work_struct *work) { struct bch_dev *ca = container_of(work, struct bch_dev, io_error_work); struct bch_fs *c = ca->fs; - bool dev; + + /* XXX: if it's reads or checksums that are failing, set it to failed */ down_write(&c->state_lock); - dev = bch2_dev_state_allowed(c, ca, BCH_MEMBER_STATE_ro, - 
BCH_FORCE_IF_DEGRADED); - if (dev - ? __bch2_dev_set_state(c, ca, BCH_MEMBER_STATE_ro, - BCH_FORCE_IF_DEGRADED) - : bch2_fs_emergency_read_only(c)) + unsigned long write_errors_start = READ_ONCE(ca->write_errors_start); + + if (write_errors_start && + time_after(jiffies, + write_errors_start + c->opts.write_error_timeout * HZ)) { + if (ca->mi.state >= BCH_MEMBER_STATE_ro) + goto out; + + bool dev = !__bch2_dev_set_state(c, ca, BCH_MEMBER_STATE_ro, + BCH_FORCE_IF_DEGRADED); + bch_err(ca, - "too many IO errors, setting %s RO", + "writes erroring for %u seconds, setting %s ro", + c->opts.write_error_timeout, dev ? "device" : "filesystem"); + if (!dev) + bch2_fs_emergency_read_only(c); + + } +out: up_write(&c->state_lock); } void bch2_io_error(struct bch_dev *ca, enum bch_member_error_type type) { atomic64_inc(&ca->errors[type]); - //queue_work(system_long_wq, &ca->io_error_work); + + if (type == BCH_MEMBER_ERROR_write && !ca->write_errors_start) + ca->write_errors_start = jiffies; + + queue_work(system_long_wq, &ca->io_error_work); } enum ask_yn { diff --git a/fs/bcachefs/error.h b/fs/bcachefs/error.h index a57b9f18d060..7d3f0e2a5fd6 100644 --- a/fs/bcachefs/error.h +++ b/fs/bcachefs/error.h @@ -226,8 +226,13 @@ static inline void bch2_account_io_success_fail(struct bch_dev *ca, enum bch_member_error_type type, bool success) { - if (!success) + if (likely(success)) { + if (type == BCH_MEMBER_ERROR_write && + ca->write_errors_start) + ca->write_errors_start = 0; + } else { bch2_io_error(ca, type); + } } static inline void bch2_account_io_completion(struct bch_dev *ca, diff --git a/fs/bcachefs/opts.h b/fs/bcachefs/opts.h index 071a92ec8a14..afb89d318d24 100644 --- a/fs/bcachefs/opts.h +++ b/fs/bcachefs/opts.h @@ -145,6 +145,11 @@ enum fsck_err_opts { OPT_STR(bch2_error_actions), \ BCH_SB_ERROR_ACTION, BCH_ON_ERROR_fix_safe, \ NULL, "Action to take on filesystem error") \ + x(write_error_timeout, u16, \ + OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ + OPT_UINT(1, 300), \ + 
BCH_SB_WRITE_ERROR_TIMEOUT, 30, \ + NULL, "Number of consecutive write errors allowed before kicking out a device")\ x(metadata_replicas, u8, \ OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ OPT_UINT(1, BCH_REPLICAS_MAX), \ diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c index 918e4e7704dd..ee32d043414a 100644 --- a/fs/bcachefs/super-io.c +++ b/fs/bcachefs/super-io.c @@ -454,6 +454,9 @@ static int bch2_sb_validate(struct bch_sb_handle *disk_sb, if (le16_to_cpu(sb->version) <= bcachefs_metadata_version_disk_accounting_v2) SET_BCH_SB_PROMOTE_WHOLE_EXTENTS(sb, true); + + if (!BCH_SB_WRITE_ERROR_TIMEOUT(sb)) + SET_BCH_SB_WRITE_ERROR_TIMEOUT(sb, 30); } #ifdef __KERNEL__ -- 2.51.0 From 4b0fac4bed0797c33e0852312e1dbe11baa3fb01 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 7 Mar 2025 12:00:56 -0500 Subject: [PATCH 04/16] bcachefs: journal write path comment Signed-off-by: Kent Overstreet --- fs/bcachefs/journal_io.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c index a510755a8364..331c9d762439 100644 --- a/fs/bcachefs/journal_io.c +++ b/fs/bcachefs/journal_io.c @@ -1813,6 +1813,10 @@ static CLOSURE_CALLBACK(journal_write_preflush) struct journal *j = container_of(w, struct journal, buf[w->idx]); struct bch_fs *c = container_of(j, struct bch_fs, journal); + /* + * Wait for previous journal writes to complete; they won't necessarily + * be flushed if they're still in flight + */ if (j->seq_ondisk + 1 != le64_to_cpu(w->data->seq)) { spin_lock(&j->lock); if (j->seq_ondisk + 1 != le64_to_cpu(w->data->seq)) { -- 2.51.0 From 039790cfb5c8255cf9f5523017b9eb0006d1df33 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 10 Feb 2025 20:35:08 -0500 Subject: [PATCH 05/16] bcachefs: ec_stripe_delete() uses new stripe lru Convert to the new persistent stripe LRU. 
Signed-off-by: Kent Overstreet --- fs/bcachefs/ec.c | 64 +++++++++++++++++------------------------------- 1 file changed, 22 insertions(+), 42 deletions(-) diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c index 8c7a9addafae..dba4b599f827 100644 --- a/fs/bcachefs/ec.c +++ b/fs/bcachefs/ec.c @@ -1152,37 +1152,22 @@ void bch2_stripes_heap_update(struct bch_fs *c, static int ec_stripe_delete(struct btree_trans *trans, u64 idx) { - struct bch_fs *c = trans->c; struct btree_iter iter; - struct bkey_s_c k; - struct bkey_s_c_stripe s; - int ret; - - k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_stripes, POS(0, idx), - BTREE_ITER_intent); - ret = bkey_err(k); + struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, + BTREE_ID_stripes, POS(0, idx), + BTREE_ITER_intent); + int ret = bkey_err(k); if (ret) goto err; - if (k.k->type != KEY_TYPE_stripe) { - bch2_fs_inconsistent(c, "attempting to delete nonexistent stripe %llu", idx); - ret = -EINVAL; - goto err; - } - - s = bkey_s_c_to_stripe(k); - for (unsigned i = 0; i < s.v->nr_blocks; i++) - if (stripe_blockcount_get(s.v, i)) { - struct printbuf buf = PRINTBUF; - - bch2_bkey_val_to_text(&buf, c, k); - bch2_fs_inconsistent(c, "attempting to delete nonempty stripe %s", buf.buf); - printbuf_exit(&buf); - ret = -EINVAL; - goto err; - } - - ret = bch2_btree_delete_at(trans, &iter, 0); + /* + * We expect write buffer races here + * Important: check stripe_is_open with stripe key locked: + */ + if (k.k->type == KEY_TYPE_stripe && + !bch2_stripe_is_open(trans->c, idx) && + stripe_lru_pos(bkey_s_c_to_stripe(k).v) == 1) + ret = bch2_btree_delete_at(trans, &iter, 0); err: bch2_trans_iter_exit(trans, &iter); return ret; @@ -1197,21 +1182,16 @@ static void ec_stripe_delete_work(struct work_struct *work) struct bch_fs *c = container_of(work, struct bch_fs, ec_stripe_delete_work); - while (1) { - mutex_lock(&c->ec_stripes_heap_lock); - u64 idx = stripe_idx_to_delete(c); - mutex_unlock(&c->ec_stripes_heap_lock); - - if (!idx) - break; - - int 
ret = bch2_trans_commit_do(c, NULL, NULL, BCH_TRANS_COMMIT_no_enospc, - ec_stripe_delete(trans, idx)); - bch_err_fn(c, ret); - if (ret) - break; - } - + bch2_trans_run(c, + bch2_btree_write_buffer_tryflush(trans) ?: + for_each_btree_key_max_commit(trans, lru_iter, BTREE_ID_lru, + lru_pos(BCH_LRU_STRIPE_FRAGMENTATION, 1, 0), + lru_pos(BCH_LRU_STRIPE_FRAGMENTATION, 1, LRU_TIME_MAX), + 0, lru_k, + NULL, NULL, + BCH_TRANS_COMMIT_no_enospc, ({ + ec_stripe_delete(trans, lru_k.k->p.offset); + }))); bch2_write_ref_put(c, BCH_WRITE_REF_stripe_delete); } -- 2.51.0 From 6c336144b9a1b671fccd4d90f1cfb5e9a5398bfa Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 10 Feb 2025 20:34:47 -0500 Subject: [PATCH 06/16] bcachefs: get_existing_stripe() uses new stripe lru Convert to the new persistent stripe LRU. Signed-off-by: Kent Overstreet --- fs/bcachefs/ec.c | 86 +++++++++++++++++++++++++++--------------------- fs/bcachefs/ec.h | 10 +++--- 2 files changed, 52 insertions(+), 44 deletions(-) diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c index dba4b599f827..84f232f4cbf8 100644 --- a/fs/bcachefs/ec.c +++ b/fs/bcachefs/ec.c @@ -1981,39 +1981,40 @@ static int new_stripe_alloc_buckets(struct btree_trans *trans, return 0; } -static s64 get_existing_stripe(struct bch_fs *c, - struct ec_stripe_head *head) +static int __get_existing_stripe(struct btree_trans *trans, + struct ec_stripe_head *head, + struct ec_stripe_buf *stripe, + u64 idx) { - ec_stripes_heap *h = &c->ec_stripes_heap; - struct stripe *m; - size_t heap_idx; - u64 stripe_idx; - s64 ret = -1; - - if (may_create_new_stripe(c)) - return -1; + struct bch_fs *c = trans->c; - mutex_lock(&c->ec_stripes_heap_lock); - for (heap_idx = 0; heap_idx < h->nr; heap_idx++) { - /* No blocks worth reusing, stripe will just be deleted: */ - if (!h->data[heap_idx].blocks_nonempty) - continue; + struct btree_iter iter; + struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, + BTREE_ID_stripes, POS(0, idx), 0); + int ret = bkey_err(k); + if 
(ret) + goto err; - stripe_idx = h->data[heap_idx].idx; + /* We expect write buffer races here */ + if (k.k->type != KEY_TYPE_stripe) + goto out; - m = genradix_ptr(&c->stripes, stripe_idx); + struct bkey_s_c_stripe s = bkey_s_c_to_stripe(k); + if (stripe_lru_pos(s.v) <= 1) + goto out; - if (m->disk_label == head->disk_label && - m->algorithm == head->algo && - m->nr_redundant == head->redundancy && - m->sectors == head->blocksize && - m->blocks_nonempty < m->nr_blocks - m->nr_redundant && - bch2_try_open_stripe(c, head->s, stripe_idx)) { - ret = stripe_idx; - break; - } + if (s.v->disk_label == head->disk_label && + s.v->algorithm == head->algo && + s.v->nr_redundant == head->redundancy && + le16_to_cpu(s.v->sectors) == head->blocksize && + bch2_try_open_stripe(c, head->s, idx)) { + bkey_reassemble(&stripe->key, k); + ret = 1; } - mutex_unlock(&c->ec_stripes_heap_lock); +out: + bch2_set_btree_iter_dontneed(&iter); +err: + bch2_trans_iter_exit(trans, &iter); return ret; } @@ -2065,24 +2066,33 @@ static int __bch2_ec_stripe_head_reuse(struct btree_trans *trans, struct ec_stri struct ec_stripe_new *s) { struct bch_fs *c = trans->c; - s64 idx; - int ret; /* * If we can't allocate a new stripe, and there's no stripes with empty * blocks for us to reuse, that means we have to wait on copygc: */ - idx = get_existing_stripe(c, h); - if (idx < 0) - return -BCH_ERR_stripe_alloc_blocked; + if (may_create_new_stripe(c)) + return -1; - ret = get_stripe_key_trans(trans, idx, &s->existing_stripe); - bch2_fs_fatal_err_on(ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart), c, - "reading stripe key: %s", bch2_err_str(ret)); - if (ret) { - bch2_stripe_close(c, s); - return ret; + struct btree_iter lru_iter; + struct bkey_s_c lru_k; + int ret = 0; + + for_each_btree_key_max_norestart(trans, lru_iter, BTREE_ID_lru, + lru_pos(BCH_LRU_STRIPE_FRAGMENTATION, 2, 0), + lru_pos(BCH_LRU_STRIPE_FRAGMENTATION, 2, LRU_TIME_MAX), + 0, lru_k, ret) { + ret = __get_existing_stripe(trans, h, 
&s->existing_stripe, lru_k.k->p.offset); + if (ret) + break; } + bch2_trans_iter_exit(trans, &lru_iter); + if (!ret) + ret = -BCH_ERR_stripe_alloc_blocked; + if (ret == 1) + ret = 0; + if (ret) + return ret; return init_new_stripe_from_existing(c, s); } diff --git a/fs/bcachefs/ec.h b/fs/bcachefs/ec.h index cd1c837e4933..3008d41db12d 100644 --- a/fs/bcachefs/ec.h +++ b/fs/bcachefs/ec.h @@ -99,15 +99,13 @@ static inline u64 stripe_lru_pos(const struct bch_stripe *s) if (!s) return 0; - unsigned blocks_empty = 0, blocks_nonempty = 0; + unsigned nr_data = s->nr_blocks - s->nr_redundant, blocks_empty = 0; - for (unsigned i = 0; i < s->nr_blocks; i++) { - blocks_empty += !stripe_blockcount_get(s, i); - blocks_nonempty += !!stripe_blockcount_get(s, i); - } + for (unsigned i = 0; i < nr_data; i++) + blocks_empty += !stripe_blockcount_get(s, i); /* Will be picked up by the stripe_delete worker */ - if (!blocks_nonempty) + if (blocks_empty == nr_data) return STRIPE_LRU_POS_EMPTY; if (!blocks_empty) -- 2.51.0 From 434a3f2ffaa1519a562909a92c62b77cf29f05da Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 7 Mar 2025 14:30:29 -0500 Subject: [PATCH 07/16] bcachefs: trace_stripe_create Add a simple tracepoint for stripe creation, we'll want to expand this later. 
Signed-off-by: Kent Overstreet --- fs/bcachefs/ec.c | 5 +++++ fs/bcachefs/errcode.h | 2 ++ fs/bcachefs/trace.h | 24 ++++++++++++++++++++++++ 3 files changed, 31 insertions(+) diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c index 84f232f4cbf8..37269c0f79b5 100644 --- a/fs/bcachefs/ec.c +++ b/fs/bcachefs/ec.c @@ -1486,6 +1486,7 @@ static void ec_stripe_create(struct ec_stripe_new *s) if (s->err) { if (!bch2_err_matches(s->err, EROFS)) bch_err(c, "error creating stripe: error writing data buckets"); + ret = s->err; goto err; } @@ -1494,6 +1495,7 @@ static void ec_stripe_create(struct ec_stripe_new *s) if (ec_do_recov(c, &s->existing_stripe)) { bch_err(c, "error creating stripe: error reading existing stripe"); + ret = -BCH_ERR_ec_block_read; goto err; } @@ -1519,6 +1521,7 @@ static void ec_stripe_create(struct ec_stripe_new *s) if (ec_nr_failed(&s->new_stripe)) { bch_err(c, "error creating stripe: error writing redundancy buckets"); + ret = -BCH_ERR_ec_block_write; goto err; } @@ -1540,6 +1543,8 @@ static void ec_stripe_create(struct ec_stripe_new *s) if (ret) goto err; err: + trace_stripe_create(c, s->idx, ret); + bch2_disk_reservation_put(c, &s->res); for (i = 0; i < v->nr_blocks; i++) diff --git a/fs/bcachefs/errcode.h b/fs/bcachefs/errcode.h index d45ef03abc91..e14e0d1cc93d 100644 --- a/fs/bcachefs/errcode.h +++ b/fs/bcachefs/errcode.h @@ -280,6 +280,8 @@ x(EIO, insufficient_journal_devices) \ x(EIO, device_offline) \ x(EIO, EIO_fault_injected) \ + x(EIO, ec_block_read) \ + x(EIO, ec_block_write) \ x(BCH_ERR_btree_node_read_err, btree_node_read_err_fixable) \ x(BCH_ERR_btree_node_read_err, btree_node_read_err_want_retry) \ x(BCH_ERR_btree_node_read_err, btree_node_read_err_must_retry) \ diff --git a/fs/bcachefs/trace.h b/fs/bcachefs/trace.h index 5718988dd7d6..c8669a6b9cec 100644 --- a/fs/bcachefs/trace.h +++ b/fs/bcachefs/trace.h @@ -339,6 +339,30 @@ DEFINE_EVENT(bio, io_read_reuse_race, TP_ARGS(bio) ); +/* ec.c */ + +TRACE_EVENT(stripe_create, + TP_PROTO(struct 
bch_fs *c, u64 idx, int ret), + TP_ARGS(c, idx, ret), + + TP_STRUCT__entry( + __field(dev_t, dev ) + __field(u64, idx ) + __field(int, ret ) + ), + + TP_fast_assign( + __entry->dev = c->dev; + __entry->idx = idx; + __entry->ret = ret; + ), + + TP_printk("%d,%d idx %llu ret %i", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->idx, + __entry->ret) +); + /* Journal */ DEFINE_EVENT(bch_fs, journal_full, -- 2.51.0 From 94373026d930b9ed72c8f8f0f3d532e13654fdb1 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 10 Feb 2025 20:15:40 -0500 Subject: [PATCH 08/16] bcachefs: We no longer read stripes into memory at startup And the stripes heap gets deleted. Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs.h | 4 - fs/bcachefs/ec.c | 223 +--------------------------- fs/bcachefs/ec.h | 5 - fs/bcachefs/ec_types.h | 7 - fs/bcachefs/recovery_passes_types.h | 2 +- fs/bcachefs/sysfs.c | 5 - 6 files changed, 2 insertions(+), 244 deletions(-) diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index 8abefc994016..b432bb6e6f6e 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -1003,15 +1003,11 @@ struct bch_fs { wait_queue_head_t copygc_running_wq; /* STRIPES: */ - GENRADIX(struct stripe) stripes; GENRADIX(struct gc_stripe) gc_stripes; struct hlist_head ec_stripes_new[32]; spinlock_t ec_stripes_new_lock; - ec_stripes_heap ec_stripes_heap; - struct mutex ec_stripes_heap_lock; - /* ERASURE CODING */ struct list_head ec_stripe_head_list; struct mutex ec_stripe_head_lock; diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c index 37269c0f79b5..c73ba73f6890 100644 --- a/fs/bcachefs/ec.c +++ b/fs/bcachefs/ec.c @@ -380,19 +380,6 @@ static int mark_stripe_buckets(struct btree_trans *trans, return 0; } -static inline void stripe_to_mem(struct stripe *m, const struct bch_stripe *s) -{ - m->sectors = le16_to_cpu(s->sectors); - m->algorithm = s->algorithm; - m->nr_blocks = s->nr_blocks; - m->nr_redundant = s->nr_redundant; - m->disk_label = s->disk_label; 
- m->blocks_nonempty = 0; - - for (unsigned i = 0; i < s->nr_blocks; i++) - m->blocks_nonempty += !!stripe_blockcount_get(s, i); -} - int bch2_trigger_stripe(struct btree_trans *trans, enum btree_id btree, unsigned level, struct bkey_s_c old, struct bkey_s _new, @@ -495,38 +482,6 @@ int bch2_trigger_stripe(struct btree_trans *trans, return ret; } - if (flags & BTREE_TRIGGER_atomic) { - struct stripe *m = genradix_ptr(&c->stripes, idx); - - if (!m) { - struct printbuf buf1 = PRINTBUF; - struct printbuf buf2 = PRINTBUF; - - bch2_bkey_val_to_text(&buf1, c, old); - bch2_bkey_val_to_text(&buf2, c, new); - bch_err_ratelimited(c, "error marking nonexistent stripe %llu while marking\n" - "old %s\n" - "new %s", idx, buf1.buf, buf2.buf); - printbuf_exit(&buf2); - printbuf_exit(&buf1); - bch2_inconsistent_error(c); - return -1; - } - - if (!new_s) { - bch2_stripes_heap_del(c, m, idx); - - memset(m, 0, sizeof(*m)); - } else { - stripe_to_mem(m, new_s); - - if (!old_s) - bch2_stripes_heap_insert(c, m, idx); - else - bch2_stripes_heap_update(c, m, idx); - } - } - return 0; } @@ -942,26 +897,6 @@ err: static int __ec_stripe_mem_alloc(struct bch_fs *c, size_t idx, gfp_t gfp) { - ec_stripes_heap n, *h = &c->ec_stripes_heap; - - if (idx >= h->size) { - if (!init_heap(&n, max(1024UL, roundup_pow_of_two(idx + 1)), gfp)) - return -BCH_ERR_ENOMEM_ec_stripe_mem_alloc; - - mutex_lock(&c->ec_stripes_heap_lock); - if (n.size > h->size) { - memcpy(n.data, h->data, h->nr * sizeof(h->data[0])); - n.nr = h->nr; - swap(*h, n); - } - mutex_unlock(&c->ec_stripes_heap_lock); - - free_heap(&n); - } - - if (!genradix_ptr_alloc(&c->stripes, idx, gfp)) - return -BCH_ERR_ENOMEM_ec_stripe_mem_alloc; - if (c->gc_pos.phase != GC_PHASE_not_running && !genradix_ptr_alloc(&c->gc_stripes, idx, gfp)) return -BCH_ERR_ENOMEM_ec_stripe_mem_alloc; @@ -1034,120 +969,6 @@ static void bch2_stripe_close(struct bch_fs *c, struct ec_stripe_new *s) s->idx = 0; } -/* Heap of all existing stripes, ordered by blocks_nonempty 
*/ - -static u64 stripe_idx_to_delete(struct bch_fs *c) -{ - ec_stripes_heap *h = &c->ec_stripes_heap; - - lockdep_assert_held(&c->ec_stripes_heap_lock); - - if (h->nr && - h->data[0].blocks_nonempty == 0 && - !bch2_stripe_is_open(c, h->data[0].idx)) - return h->data[0].idx; - - return 0; -} - -static inline void ec_stripes_heap_set_backpointer(ec_stripes_heap *h, - size_t i) -{ - struct bch_fs *c = container_of(h, struct bch_fs, ec_stripes_heap); - - genradix_ptr(&c->stripes, h->data[i].idx)->heap_idx = i; -} - -static inline bool ec_stripes_heap_cmp(const void *l, const void *r, void __always_unused *args) -{ - struct ec_stripe_heap_entry *_l = (struct ec_stripe_heap_entry *)l; - struct ec_stripe_heap_entry *_r = (struct ec_stripe_heap_entry *)r; - - return ((_l->blocks_nonempty > _r->blocks_nonempty) < - (_l->blocks_nonempty < _r->blocks_nonempty)); -} - -static inline void ec_stripes_heap_swap(void *l, void *r, void *h) -{ - struct ec_stripe_heap_entry *_l = (struct ec_stripe_heap_entry *)l; - struct ec_stripe_heap_entry *_r = (struct ec_stripe_heap_entry *)r; - ec_stripes_heap *_h = (ec_stripes_heap *)h; - size_t i = _l - _h->data; - size_t j = _r - _h->data; - - swap(*_l, *_r); - - ec_stripes_heap_set_backpointer(_h, i); - ec_stripes_heap_set_backpointer(_h, j); -} - -static const struct min_heap_callbacks callbacks = { - .less = ec_stripes_heap_cmp, - .swp = ec_stripes_heap_swap, -}; - -static void heap_verify_backpointer(struct bch_fs *c, size_t idx) -{ - ec_stripes_heap *h = &c->ec_stripes_heap; - struct stripe *m = genradix_ptr(&c->stripes, idx); - - BUG_ON(m->heap_idx >= h->nr); - BUG_ON(h->data[m->heap_idx].idx != idx); -} - -void bch2_stripes_heap_del(struct bch_fs *c, - struct stripe *m, size_t idx) -{ - mutex_lock(&c->ec_stripes_heap_lock); - heap_verify_backpointer(c, idx); - - min_heap_del(&c->ec_stripes_heap, m->heap_idx, &callbacks, &c->ec_stripes_heap); - mutex_unlock(&c->ec_stripes_heap_lock); -} - -void bch2_stripes_heap_insert(struct bch_fs 
*c, - struct stripe *m, size_t idx) -{ - mutex_lock(&c->ec_stripes_heap_lock); - BUG_ON(min_heap_full(&c->ec_stripes_heap)); - - genradix_ptr(&c->stripes, idx)->heap_idx = c->ec_stripes_heap.nr; - min_heap_push(&c->ec_stripes_heap, &((struct ec_stripe_heap_entry) { - .idx = idx, - .blocks_nonempty = m->blocks_nonempty, - }), - &callbacks, - &c->ec_stripes_heap); - - heap_verify_backpointer(c, idx); - mutex_unlock(&c->ec_stripes_heap_lock); -} - -void bch2_stripes_heap_update(struct bch_fs *c, - struct stripe *m, size_t idx) -{ - ec_stripes_heap *h = &c->ec_stripes_heap; - bool do_deletes; - size_t i; - - mutex_lock(&c->ec_stripes_heap_lock); - heap_verify_backpointer(c, idx); - - h->data[m->heap_idx].blocks_nonempty = m->blocks_nonempty; - - i = m->heap_idx; - min_heap_sift_up(h, i, &callbacks, &c->ec_stripes_heap); - min_heap_sift_down(h, i, &callbacks, &c->ec_stripes_heap); - - heap_verify_backpointer(c, idx); - - do_deletes = stripe_idx_to_delete(c) != 0; - mutex_unlock(&c->ec_stripes_heap_lock); - - if (do_deletes) - bch2_do_stripe_deletes(c); -} - /* stripe deletion */ static int ec_stripe_delete(struct btree_trans *trans, u64 idx) @@ -2395,46 +2216,7 @@ void bch2_fs_ec_flush(struct bch_fs *c) int bch2_stripes_read(struct bch_fs *c) { - int ret = bch2_trans_run(c, - for_each_btree_key(trans, iter, BTREE_ID_stripes, POS_MIN, - BTREE_ITER_prefetch, k, ({ - if (k.k->type != KEY_TYPE_stripe) - continue; - - ret = __ec_stripe_mem_alloc(c, k.k->p.offset, GFP_KERNEL); - if (ret) - break; - - struct stripe *m = genradix_ptr(&c->stripes, k.k->p.offset); - - stripe_to_mem(m, bkey_s_c_to_stripe(k).v); - - bch2_stripes_heap_insert(c, m, k.k->p.offset); - 0; - }))); - bch_err_fn(c, ret); - return ret; -} - -void bch2_stripes_heap_to_text(struct printbuf *out, struct bch_fs *c) -{ - ec_stripes_heap *h = &c->ec_stripes_heap; - struct stripe *m; - size_t i; - - mutex_lock(&c->ec_stripes_heap_lock); - for (i = 0; i < min_t(size_t, h->nr, 50); i++) { - m = 
genradix_ptr(&c->stripes, h->data[i].idx); - - prt_printf(out, "%zu %u/%u+%u", h->data[i].idx, - h->data[i].blocks_nonempty, - m->nr_blocks - m->nr_redundant, - m->nr_redundant); - if (bch2_stripe_is_open(c, h->data[i].idx)) - prt_str(out, " open"); - prt_newline(out); - } - mutex_unlock(&c->ec_stripes_heap_lock); + return 0; } static void bch2_new_stripe_to_text(struct printbuf *out, struct bch_fs *c, @@ -2505,15 +2287,12 @@ void bch2_fs_ec_exit(struct bch_fs *c) BUG_ON(!list_empty(&c->ec_stripe_new_list)); - free_heap(&c->ec_stripes_heap); - genradix_free(&c->stripes); bioset_exit(&c->ec_bioset); } void bch2_fs_ec_init_early(struct bch_fs *c) { spin_lock_init(&c->ec_stripes_new_lock); - mutex_init(&c->ec_stripes_heap_lock); INIT_LIST_HEAD(&c->ec_stripe_head_list); mutex_init(&c->ec_stripe_head_lock); diff --git a/fs/bcachefs/ec.h b/fs/bcachefs/ec.h index 3008d41db12d..8f2228e59eda 100644 --- a/fs/bcachefs/ec.h +++ b/fs/bcachefs/ec.h @@ -258,10 +258,6 @@ struct ec_stripe_head *bch2_ec_stripe_head_get(struct btree_trans *, unsigned, unsigned, unsigned, enum bch_watermark, struct closure *); -void bch2_stripes_heap_update(struct bch_fs *, struct stripe *, size_t); -void bch2_stripes_heap_del(struct bch_fs *, struct stripe *, size_t); -void bch2_stripes_heap_insert(struct bch_fs *, struct stripe *, size_t); - void bch2_do_stripe_deletes(struct bch_fs *); void bch2_ec_do_stripe_creates(struct bch_fs *); void bch2_ec_stripe_new_free(struct bch_fs *, struct ec_stripe_new *); @@ -298,7 +294,6 @@ void bch2_fs_ec_flush(struct bch_fs *); int bch2_stripes_read(struct bch_fs *); -void bch2_stripes_heap_to_text(struct printbuf *, struct bch_fs *); void bch2_new_stripes_to_text(struct printbuf *, struct bch_fs *); void bch2_fs_ec_exit(struct bch_fs *); diff --git a/fs/bcachefs/ec_types.h b/fs/bcachefs/ec_types.h index 37558cc2d89f..06144bfd9c19 100644 --- a/fs/bcachefs/ec_types.h +++ b/fs/bcachefs/ec_types.h @@ -31,11 +31,4 @@ struct gc_stripe { struct bch_replicas_padded r; }; 
-struct ec_stripe_heap_entry { - size_t idx; - unsigned blocks_nonempty; -}; - -typedef DEFINE_MIN_HEAP(struct ec_stripe_heap_entry, ec_stripes_heap) ec_stripes_heap; - #endif /* _BCACHEFS_EC_TYPES_H */ diff --git a/fs/bcachefs/recovery_passes_types.h b/fs/bcachefs/recovery_passes_types.h index 418557960ed6..e89b9c783285 100644 --- a/fs/bcachefs/recovery_passes_types.h +++ b/fs/bcachefs/recovery_passes_types.h @@ -24,7 +24,7 @@ x(check_topology, 4, 0) \ x(accounting_read, 39, PASS_ALWAYS) \ x(alloc_read, 0, PASS_ALWAYS) \ - x(stripes_read, 1, PASS_ALWAYS) \ + x(stripes_read, 1, 0) \ x(initialize_subvolumes, 2, 0) \ x(snapshots_read, 3, PASS_ALWAYS) \ x(check_allocations, 5, PASS_FSCK) \ diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c index a9953181c29b..2ed3f755eadd 100644 --- a/fs/bcachefs/sysfs.c +++ b/fs/bcachefs/sysfs.c @@ -174,7 +174,6 @@ read_attribute(journal_debug); read_attribute(btree_cache); read_attribute(btree_key_cache); read_attribute(btree_reserve_cache); -read_attribute(stripes_heap); read_attribute(open_buckets); read_attribute(open_buckets_partial); read_attribute(nocow_lock_table); @@ -355,9 +354,6 @@ SHOW(bch2_fs) if (attr == &sysfs_btree_reserve_cache) bch2_btree_reserve_cache_to_text(out, c); - if (attr == &sysfs_stripes_heap) - bch2_stripes_heap_to_text(out, c); - if (attr == &sysfs_open_buckets) bch2_open_buckets_to_text(out, c, NULL); @@ -566,7 +562,6 @@ struct attribute *bch2_fs_internal_files[] = { &sysfs_btree_key_cache, &sysfs_btree_reserve_cache, &sysfs_new_stripes, - &sysfs_stripes_heap, &sysfs_open_buckets, &sysfs_open_buckets_partial, #ifdef BCH_WRITE_REF_DEBUG -- 2.51.0 From c073ec6bec0d05781380ecabca9e8611e4b48502 Mon Sep 17 00:00:00 2001 From: Thorsten Blum Date: Sat, 8 Mar 2025 20:53:53 +0100 Subject: [PATCH 09/16] bcachefs: Remove unnecessary byte allocation The extra byte is not used - remove it. 
Signed-off-by: Thorsten Blum Signed-off-by: Kent Overstreet --- fs/bcachefs/fs-ioctl.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/bcachefs/fs-ioctl.c b/fs/bcachefs/fs-ioctl.c index 17c035f9d629..5b47b94fe1ea 100644 --- a/fs/bcachefs/fs-ioctl.c +++ b/fs/bcachefs/fs-ioctl.c @@ -244,7 +244,7 @@ static int bch2_ioc_reinherit_attrs(struct bch_fs *c, int ret = 0; subvol_inum inum; - kname = kmalloc(BCH_NAME_MAX + 1, GFP_KERNEL); + kname = kmalloc(BCH_NAME_MAX, GFP_KERNEL); if (!kname) return -ENOMEM; -- 2.51.0 From ff4cb203ccce24630c50a503973ac596c3d5d1be Mon Sep 17 00:00:00 2001 From: Thorsten Blum Date: Tue, 11 Mar 2025 12:13:11 +0100 Subject: [PATCH 10/16] bcachefs: Use max() to improve gen_after() Use max() to simplify gen_after() and improve its readability. Signed-off-by: Thorsten Blum Signed-off-by: Kent Overstreet --- fs/bcachefs/buckets.h | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/fs/bcachefs/buckets.h b/fs/bcachefs/buckets.h index 6aeec1c0973c..c5363256e363 100644 --- a/fs/bcachefs/buckets.h +++ b/fs/bcachefs/buckets.h @@ -140,9 +140,7 @@ static inline int gen_cmp(u8 a, u8 b) static inline int gen_after(u8 a, u8 b) { - int r = gen_cmp(a, b); - - return r > 0 ? 
r : 0; + return max(0, gen_cmp(a, b)); } static inline int dev_ptr_stale_rcu(struct bch_dev *ca, const struct bch_extent_ptr *ptr) -- 2.51.0 From a2e9e6874612582367be674e4d961de2ec8a9d05 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 11 Mar 2025 09:31:03 -0400 Subject: [PATCH 11/16] bcachefs: Kill a bit of dead code Found with CC=clang W=1 Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.c | 14 -------------- fs/bcachefs/inode.c | 13 ------------- fs/bcachefs/journal_io.c | 5 ----- fs/bcachefs/move.c | 2 -- 4 files changed, 34 deletions(-) diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index e32fce4fd258..7542c6f9c88e 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -562,20 +562,6 @@ static inline struct bkey_s_c btree_path_level_peek_all(struct bch_fs *c, bch2_btree_node_iter_peek_all(&l->iter, l->b)); } -static inline struct bkey_s_c btree_path_level_peek(struct btree_trans *trans, - struct btree_path *path, - struct btree_path_level *l, - struct bkey *u) -{ - struct bkey_s_c k = __btree_iter_unpack(trans->c, l, u, - bch2_btree_node_iter_peek(&l->iter, l->b)); - - path->pos = k.k ? 
k.k->p : l->b->key.k.p; - trans->paths_sorted = false; - bch2_btree_path_verify_level(trans, path, l - path->l); - return k; -} - static inline struct bkey_s_c btree_path_level_prev(struct btree_trans *trans, struct btree_path *path, struct btree_path_level *l, diff --git a/fs/bcachefs/inode.c b/fs/bcachefs/inode.c index 339b80770f1d..7aca010e2e10 100644 --- a/fs/bcachefs/inode.c +++ b/fs/bcachefs/inode.c @@ -868,19 +868,6 @@ void bch2_inode_init(struct bch_fs *c, struct bch_inode_unpacked *inode_u, uid, gid, mode, rdev, parent); } -static inline u32 bkey_generation(struct bkey_s_c k) -{ - switch (k.k->type) { - case KEY_TYPE_inode: - case KEY_TYPE_inode_v2: - BUG(); - case KEY_TYPE_inode_generation: - return le32_to_cpu(bkey_s_c_to_inode_generation(k).v->bi_generation); - default: - return 0; - } -} - static struct bkey_i_inode_alloc_cursor * bch2_inode_alloc_cursor_get(struct btree_trans *trans, u64 cpu, u64 *min, u64 *max) { diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c index 331c9d762439..cf2700b06d58 100644 --- a/fs/bcachefs/journal_io.c +++ b/fs/bcachefs/journal_io.c @@ -1609,11 +1609,6 @@ static void journal_buf_realloc(struct journal *j, struct journal_buf *buf) kvfree(new_buf); } -static inline struct journal_buf *journal_last_unwritten_buf(struct journal *j) -{ - return j->buf + (journal_last_unwritten_seq(j) & JOURNAL_BUF_MASK); -} - static CLOSURE_CALLBACK(journal_write_done) { closure_type(w, struct journal_buf, io); diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c index ee489d222fba..0787d04a5fc3 100644 --- a/fs/bcachefs/move.c +++ b/fs/bcachefs/move.c @@ -712,7 +712,6 @@ static int __bch2_move_data_phys(struct moving_context *ctxt, struct btree_iter iter = {}, bp_iter = {}; struct bkey_buf sk; struct bkey_s_c k; - unsigned sectors_moved = 0; struct bkey_buf last_flushed; int ret = 0; @@ -834,7 +833,6 @@ static int __bch2_move_data_phys(struct moving_context *ctxt, if (ctxt->stats) atomic64_add(sectors, 
&ctxt->stats->sectors_seen); - sectors_moved += sectors; next: bch2_btree_iter_advance(&bp_iter); } -- 2.51.0 From 8dc4514d58f684b9bc08d956ab9a9ec65b38f63a Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 13 Mar 2025 11:44:52 -0400 Subject: [PATCH 12/16] bcachefs: Kill bch2_remount() Single caller, so inline it. Signed-off-by: Kent Overstreet --- fs/bcachefs/fs.c | 71 ++++++++++++++++++++++-------------------------- 1 file changed, 32 insertions(+), 39 deletions(-) diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c index 459ca8259fc0..17ac9c55fb96 100644 --- a/fs/bcachefs/fs.c +++ b/fs/bcachefs/fs.c @@ -2026,44 +2026,6 @@ static struct bch_fs *bch2_path_to_fs(const char *path) return c ?: ERR_PTR(-ENOENT); } -static int bch2_remount(struct super_block *sb, int *flags, - struct bch_opts opts) -{ - struct bch_fs *c = sb->s_fs_info; - int ret = 0; - - opt_set(opts, read_only, (*flags & SB_RDONLY) != 0); - - if (opts.read_only != c->opts.read_only) { - down_write(&c->state_lock); - - if (opts.read_only) { - bch2_fs_read_only(c); - - sb->s_flags |= SB_RDONLY; - } else { - ret = bch2_fs_read_write(c); - if (ret) { - bch_err(c, "error going rw: %i", ret); - up_write(&c->state_lock); - ret = -EINVAL; - goto err; - } - - sb->s_flags &= ~SB_RDONLY; - } - - c->opts.read_only = opts.read_only; - - up_write(&c->state_lock); - } - - if (opt_defined(opts, errors)) - c->opts.errors = opts.errors; -err: - return bch2_err_class(ret); -} - static int bch2_show_devname(struct seq_file *seq, struct dentry *root) { struct bch_fs *c = root->d_sb->s_fs_info; @@ -2374,8 +2336,39 @@ static int bch2_fs_reconfigure(struct fs_context *fc) { struct super_block *sb = fc->root->d_sb; struct bch2_opts_parse *opts = fc->fs_private; + struct bch_fs *c = sb->s_fs_info; + int ret = 0; + + opt_set(opts->opts, read_only, (fc->sb_flags & SB_RDONLY) != 0); + + if (opts->opts.read_only != c->opts.read_only) { + down_write(&c->state_lock); + + if (opts->opts.read_only) { + bch2_fs_read_only(c); + + 
sb->s_flags |= SB_RDONLY; + } else { + ret = bch2_fs_read_write(c); + if (ret) { + bch_err(c, "error going rw: %i", ret); + up_write(&c->state_lock); + ret = -EINVAL; + goto err; + } + + sb->s_flags &= ~SB_RDONLY; + } + + c->opts.read_only = opts->opts.read_only; - return bch2_remount(sb, &fc->sb_flags, opts->opts); + up_write(&c->state_lock); + } + + if (opt_defined(opts->opts, errors)) + c->opts.errors = opts->opts.errors; +err: + return bch2_err_class(ret); } static const struct fs_context_operations bch2_context_ops = { -- 2.51.0 From c991fbee8e6e91e9d0c859627b87fb7a06244a8b Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 13 Mar 2025 15:21:13 -0400 Subject: [PATCH 13/16] bcachefs: rebalance, copygc status also print stacktrace These are commonly needed when debugging, and printing them saves us from having to ask users to dig. Also, rebalance_status now includes pending rebalance work. Signed-off-by: Kent Overstreet --- fs/bcachefs/move.c | 14 ++++++++------ fs/bcachefs/movinggc.c | 11 +++++++++++ fs/bcachefs/rebalance.c | 29 ++++++++++++++++++++++++++--- 3 files changed, 45 insertions(+), 9 deletions(-) diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c index 0787d04a5fc3..f86fb8ad636a 100644 --- a/fs/bcachefs/move.c +++ b/fs/bcachefs/move.c @@ -1251,17 +1251,17 @@ void bch2_move_stats_to_text(struct printbuf *out, struct bch_move_stats *stats) prt_newline(out); printbuf_indent_add(out, 2); - prt_printf(out, "keys moved: %llu\n", atomic64_read(&stats->keys_moved)); - prt_printf(out, "keys raced: %llu\n", atomic64_read(&stats->keys_raced)); - prt_printf(out, "bytes seen: "); + prt_printf(out, "keys moved:\t%llu\n", atomic64_read(&stats->keys_moved)); + prt_printf(out, "keys raced:\t%llu\n", atomic64_read(&stats->keys_raced)); + prt_printf(out, "bytes seen:\t"); prt_human_readable_u64(out, atomic64_read(&stats->sectors_seen) << 9); prt_newline(out); - prt_printf(out, "bytes moved: "); + prt_printf(out, "bytes moved:\t"); prt_human_readable_u64(out, 
atomic64_read(&stats->sectors_moved) << 9); prt_newline(out); - prt_printf(out, "bytes raced: "); + prt_printf(out, "bytes raced:\t"); prt_human_readable_u64(out, atomic64_read(&stats->sectors_raced) << 9); prt_newline(out); @@ -1270,7 +1270,8 @@ void bch2_move_stats_to_text(struct printbuf *out, struct bch_move_stats *stats) static void bch2_moving_ctxt_to_text(struct printbuf *out, struct bch_fs *c, struct moving_context *ctxt) { - struct moving_io *io; + if (!out->nr_tabstops) + printbuf_tabstop_push(out, 32); bch2_move_stats_to_text(out, ctxt->stats); printbuf_indent_add(out, 2); @@ -1290,6 +1291,7 @@ static void bch2_moving_ctxt_to_text(struct printbuf *out, struct bch_fs *c, str printbuf_indent_add(out, 2); mutex_lock(&ctxt->lock); + struct moving_io *io; list_for_each_entry(io, &ctxt->ios, io_list) bch2_data_update_inflight_to_text(out, &io->write); mutex_unlock(&ctxt->lock); diff --git a/fs/bcachefs/movinggc.c b/fs/bcachefs/movinggc.c index fa19fc44622c..5126c870ce5b 100644 --- a/fs/bcachefs/movinggc.c +++ b/fs/bcachefs/movinggc.c @@ -317,6 +317,17 @@ void bch2_copygc_wait_to_text(struct printbuf *out, struct bch_fs *c) prt_printf(out, "Currently calculated wait:\t"); prt_human_readable_u64(out, bch2_copygc_wait_amount(c)); prt_newline(out); + + rcu_read_lock(); + struct task_struct *t = rcu_dereference(c->copygc_thread); + if (t) + get_task_struct(t); + rcu_read_unlock(); + + if (t) { + bch2_prt_task_backtrace(out, t, 0, GFP_KERNEL); + put_task_struct(t); + } } static int bch2_copygc_thread(void *arg) diff --git a/fs/bcachefs/rebalance.c b/fs/bcachefs/rebalance.c index 58f6d97e506c..8b6795ec82f6 100644 --- a/fs/bcachefs/rebalance.c +++ b/fs/bcachefs/rebalance.c @@ -590,8 +590,19 @@ static int bch2_rebalance_thread(void *arg) void bch2_rebalance_status_to_text(struct printbuf *out, struct bch_fs *c) { + printbuf_tabstop_push(out, 32); + struct bch_fs_rebalance *r = &c->rebalance; + /* print pending work */ + struct disk_accounting_pos acc = { .type = 
BCH_DISK_ACCOUNTING_rebalance_work, }; + u64 v; + bch2_accounting_mem_read(c, disk_accounting_pos_to_bpos(&acc), &v, 1); + + prt_printf(out, "pending work:\t"); + prt_human_readable_u64(out, v); + prt_printf(out, "\n\n"); + prt_str(out, bch2_rebalance_state_strs[r->state]); prt_newline(out); printbuf_indent_add(out, 2); @@ -600,15 +611,15 @@ void bch2_rebalance_status_to_text(struct printbuf *out, struct bch_fs *c) case BCH_REBALANCE_waiting: { u64 now = atomic64_read(&c->io_clock[WRITE].now); - prt_str(out, "io wait duration: "); + prt_printf(out, "io wait duration:\t"); bch2_prt_human_readable_s64(out, (r->wait_iotime_end - r->wait_iotime_start) << 9); prt_newline(out); - prt_str(out, "io wait remaining: "); + prt_printf(out, "io wait remaining:\t"); bch2_prt_human_readable_s64(out, (r->wait_iotime_end - now) << 9); prt_newline(out); - prt_str(out, "duration waited: "); + prt_printf(out, "duration waited:\t"); bch2_pr_time_units(out, ktime_get_real_ns() - r->wait_wallclock_start); prt_newline(out); break; @@ -621,6 +632,18 @@ void bch2_rebalance_status_to_text(struct printbuf *out, struct bch_fs *c) break; } prt_newline(out); + + rcu_read_lock(); + struct task_struct *t = rcu_dereference(c->rebalance.thread); + if (t) + get_task_struct(t); + rcu_read_unlock(); + + if (t) { + bch2_prt_task_backtrace(out, t, 0, GFP_KERNEL); + put_task_struct(t); + } + printbuf_indent_sub(out, 2); } -- 2.51.0 From 7c1e2a254fbc023df8d681946bab69cd68a4bde6 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 14 Mar 2025 18:19:17 -0400 Subject: [PATCH 14/16] bcachefs: Add a cond_resched() to btree cache teardown [12308.606480] watchdog: BUG: soft lockup - CPU#18 stuck for 26s! 
[umount:48479] [12308.606485] Modules linked in: bcachefs lz4hc_compress lz4_compress lz4_decompress sunrpc overlay nf_conntrack_netlink xt_nat xt_tcpudp veth xt_conntrack xt_MASQUERADE bridge stp llc xfrm_user ip6table_nat ip6table_filter ip6_tables iptable_nat xt_addrtype iptable_filter ip_tables x_tables nfnetlink_cttimeout nfnetlink openvswitch nsh nf_conncount nf_nat nf_conntrack nf_defrag_ipv6 nf_defrag_ipv4 psample ext4 mbcache jbd2 nls_iso8859_1 nls_cp850 vfat fat binfmt_misc skx_edac_common nfit edac_core libnvdimm cbc encrypted_keys intel_rapl_msr intel_rapl_common intel_uncore_frequency intel_uncore_frequency_common ipmi_ssif x86_pkg_temp_thermal intel_powerclamp kvm_intel kvm drivetemp rapl intel_cstate coretemp mgag200 i2c_algo_bit ixgbe drm_shmem_helper drm_kms_helper mdio_devres xfrm_algo mdio drm ptp intel_uncore mei_me efi_pstore evdev uas pl2303 pps_core libphy usb_storage usbserial lpc_ich mei drm_panel_orientation_quirks acpi_power_meter tiny_power_button ipmi_si mfd_core intel_pch_thermal acpi_tad acpi_ipmi ioatdma [12308.606541] ipmi_devintf ipmi_msghandler dca wmi button efivarfs polyval_clmulni polyval_generic ghash_clmulni_intel sha512_ssse3 sha256_ssse3 sha1_ssse3 sha1_generic xhci_pci xhci_hcd aesni_intel ehci_pci ehci_hcd gf128mul crypto_simd cryptd usbcore hpwdt usb_common [12308.606557] CPU: 18 UID: 0 PID: 48479 Comm: umount Tainted: G L 6.14.0-rc6-x86_64-00159-ga09496a03e63 #1 [12308.606560] Tainted: [L]=SOFTLOCKUP [12308.606561] Hardware name: HPE ProLiant DL380 Gen10/ProLiant DL380 Gen10, BIOS U30 07/20/2023 [12308.606563] RIP: 0010:clear_page_erms+0x7/0x10 [12308.606570] Code: 48 89 47 38 48 8d 7f 40 75 d9 90 c3 cc cc cc cc 0f 1f 00 90 90 90 90 90 90 90 90 90 90 90 90 90 90 90 90 b9 00 10 00 00 31 c0 aa c3 cc cc cc cc 66 90 90 90 90 90 90 90 90 90 90 90 90 90 90 [12308.606572] RSP: 0018:ffff9ed5b622fba0 EFLAGS: 00010246 [12308.606574] RAX: 0000000000000000 RBX: ffff90347fffe6c0 RCX: 00000000000004c0 [12308.606575] RDX: 
ffffe34ea9bec1c0 RSI: 00000000000405f0 RDI: ffff902eafb07b40 [12308.606576] RBP: ffff9ed5b622fbf0 R08: 0000000000000001 R09: 0000000000000006 [12308.606577] R10: 0000000000040001 R11: 0000000000000000 R12: ffffe34ea9bec000 [12308.606578] R13: 0000000000000000 R14: 0000000000000006 R15: ffffe34ea9bed000 [12308.606580] FS: 00007fe704ecfb68(0000) GS:ffff9053fea00000(0000) knlGS:0000000000000000 [12308.606581] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [12308.606582] CR2: 00007f18159068ae CR3: 00000001314d0005 CR4: 00000000007726f0 [12308.606583] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [12308.606584] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 [12308.606584] PKRU: 55555554 [12308.606585] Call Trace: [12308.606587] [12308.606590] ? show_regs.cold+0x19/0x28 [12308.606595] ? watchdog_timer_fn.cold+0x3d/0x9d [12308.606598] ? __pfx_watchdog_timer_fn+0x10/0x10 [12308.606602] ? __hrtimer_run_queues+0x12e/0x250 [12308.606607] ? hrtimer_interrupt+0xfd/0x220 [12308.606609] ? __sysvec_apic_timer_interrupt+0x53/0xe0 [12308.606614] ? sysvec_apic_timer_interrupt+0x76/0xa0 [12308.606619] [12308.606620] [12308.606620] ? asm_sysvec_apic_timer_interrupt+0x1b/0x20 [12308.606626] ? clear_page_erms+0x7/0x10 [12308.606628] ? 
__free_pages_ok+0x374/0x640 [12308.606633] free_frozen_pages+0x34/0x570 [12308.606636] __folio_put+0x87/0xe0 [12308.606641] free_large_kmalloc+0x70/0x80 [12308.606645] kfree+0x2f6/0x390 [12308.606648] kvfree+0x2d/0x40 [12308.606653] __btree_node_data_free+0xaf/0xf0 [bcachefs] [12308.606726] btree_node_data_free+0x6a/0x80 [bcachefs] [12308.606778] bch2_fs_btree_cache_exit+0x262/0x440 [bcachefs] [12308.606829] bch2_fs_release+0xe8/0x340 [bcachefs] [12308.606905] kobject_put+0x60/0xc0 [12308.606908] bch2_fs_free+0xdd/0x120 [bcachefs] [12308.606981] bch2_kill_sb+0x1e/0x30 [bcachefs] [12308.607051] deactivate_locked_super+0x32/0xb0 [12308.607055] deactivate_super+0x40/0x50 [12308.607057] cleanup_mnt+0xc3/0x160 [12308.607060] __cleanup_mnt+0x12/0x20 [12308.607062] task_work_run+0x5f/0xa0 [12308.607064] syscall_exit_to_user_mode+0x194/0x1a0 [12308.607066] do_syscall_64+0x67/0x170 [12308.607068] entry_SYSCALL_64_after_hwframe+0x76/0x7e [12308.607070] RIP: 0033:0x7fe704e66eed [12308.607073] Code: 08 49 89 ca b8 a5 00 00 00 0f 05 48 89 c7 e8 8a e6 ff ff 48 83 c4 Reported-by: Stijn Tintel Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_cache.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c index 1ec1f90e0eb3..54666027aa85 100644 --- a/fs/bcachefs/btree_cache.c +++ b/fs/bcachefs/btree_cache.c @@ -610,6 +610,7 @@ void bch2_fs_btree_cache_exit(struct bch_fs *c) btree_node_write_in_flight(b)); btree_node_data_free(bc, b); + cond_resched(); } BUG_ON(!bch2_journal_error(&c->journal) && -- 2.51.0 From 9ec00891493d3e4f60678ed12988761538f95bd1 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 13 Mar 2025 00:47:51 -0400 Subject: [PATCH 15/16] bcachefs: bch2_bkey_ptrs_rebalance_opts() Small optimization for bch2_bkey_sectors_need_rebalance() Signed-off-by: Kent Overstreet --- fs/bcachefs/rebalance.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/fs/bcachefs/rebalance.c 
b/fs/bcachefs/rebalance.c index 8b6795ec82f6..29a569384146 100644 --- a/fs/bcachefs/rebalance.c +++ b/fs/bcachefs/rebalance.c @@ -26,9 +26,8 @@ /* bch_extent_rebalance: */ -static const struct bch_extent_rebalance *bch2_bkey_rebalance_opts(struct bkey_s_c k) +static const struct bch_extent_rebalance *bch2_bkey_ptrs_rebalance_opts(struct bkey_ptrs_c ptrs) { - struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); const union bch_extent_entry *entry; bkey_extent_entry_for_each(ptrs, entry) @@ -38,6 +37,11 @@ static const struct bch_extent_rebalance *bch2_bkey_rebalance_opts(struct bkey_s return NULL; } +static const struct bch_extent_rebalance *bch2_bkey_rebalance_opts(struct bkey_s_c k) +{ + return bch2_bkey_ptrs_rebalance_opts(bch2_bkey_ptrs_c(k)); +} + static inline unsigned bch2_bkey_ptrs_need_compress(struct bch_fs *c, struct bch_io_opts *opts, struct bkey_s_c k, @@ -97,11 +101,12 @@ static unsigned bch2_bkey_ptrs_need_rebalance(struct bch_fs *c, u64 bch2_bkey_sectors_need_rebalance(struct bch_fs *c, struct bkey_s_c k) { - const struct bch_extent_rebalance *opts = bch2_bkey_rebalance_opts(k); + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); + + const struct bch_extent_rebalance *opts = bch2_bkey_ptrs_rebalance_opts(ptrs); if (!opts) return 0; - struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); const union bch_extent_entry *entry; struct extent_ptr_decoded p; u64 sectors = 0; -- 2.51.0 From 6d80fca9efe9255369aa91e85e8f3367c42acdde Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 10 Mar 2025 11:54:13 -0400 Subject: [PATCH 16/16] bcachefs: Don't create bch_io_failures unless it's needed Only needed in retry path, no point in wasting stack space. 
Signed-off-by: Kent Overstreet --- fs/bcachefs/io_read.h | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/fs/bcachefs/io_read.h b/fs/bcachefs/io_read.h index 73275da5d2c4..6bdb8efb4cd1 100644 --- a/fs/bcachefs/io_read.h +++ b/fs/bcachefs/io_read.h @@ -147,13 +147,11 @@ void __bch2_read(struct bch_fs *, struct bch_read_bio *, struct bvec_iter, static inline void bch2_read(struct bch_fs *c, struct bch_read_bio *rbio, subvol_inum inum) { - struct bch_io_failures failed = { .nr = 0 }; - BUG_ON(rbio->_state); rbio->subvol = inum.subvol; - __bch2_read(c, rbio, rbio->bio.bi_iter, inum, &failed, + __bch2_read(c, rbio, rbio->bio.bi_iter, inum, NULL, BCH_READ_retry_if_stale| BCH_READ_may_promote| BCH_READ_user_mapped); -- 2.51.0