From e48fda6cdc2799ef4a63692f0f11240613067b38 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 18 Nov 2024 00:32:57 -0500 Subject: [PATCH 01/16] bcachefs: Fix check_backpointers_to_extents range limiting bch2_get_btree_in_memory_pos() will return positions that refer directly to the btree it's checking will fit in memory - i.e. backpointer positions, not buckets. This also means check_bp_exists() no longer has to refer to the device, and we can delete some code. Signed-off-by: Kent Overstreet --- fs/bcachefs/backpointers.c | 63 +++++++++++++++----------------------- 1 file changed, 25 insertions(+), 38 deletions(-) diff --git a/fs/bcachefs/backpointers.c b/fs/bcachefs/backpointers.c index 24804f0bb2fd..5963217cd90c 100644 --- a/fs/bcachefs/backpointers.c +++ b/fs/bcachefs/backpointers.c @@ -352,8 +352,8 @@ int bch2_check_btree_backpointers(struct bch_fs *c) } struct extents_to_bp_state { - struct bpos bucket_start; - struct bpos bucket_end; + struct bpos bp_start; + struct bpos bp_end; struct bkey_buf last_flushed; }; @@ -445,29 +445,16 @@ static int check_bp_exists(struct btree_trans *trans, struct bkey_s_c orig_k) { struct bch_fs *c = trans->c; - struct btree_iter bp_iter = {}; struct btree_iter other_extent_iter = {}; struct printbuf buf = PRINTBUF; - struct bkey_s_c bp_k; - int ret = 0; - struct bch_dev *ca = bch2_dev_tryget_noerror(c, bp->k.p.inode); - if (!ca) { - prt_printf(&buf, "extent for nonexistent device %llu\n", bp->k.p.inode); - bch2_bkey_val_to_text(&buf, c, orig_k); - bch_err(c, "%s", buf.buf); - ret = -BCH_ERR_fsck_repair_unimplemented; - goto err; - } - - struct bpos bucket = bp_pos_to_bucket(ca, bp->k.p); - - if (bpos_lt(bucket, s->bucket_start) || - bpos_gt(bucket, s->bucket_end)) - goto out; + if (bpos_lt(bp->k.p, s->bp_start) || + bpos_gt(bp->k.p, s->bp_end)) + return 0; - bp_k = bch2_bkey_get_iter(trans, &bp_iter, BTREE_ID_backpointers, bp->k.p, 0); - ret = bkey_err(bp_k); + struct btree_iter bp_iter; + struct bkey_s_c bp_k = bch2_bkey_get_iter(trans, &bp_iter, BTREE_ID_backpointers, bp->k.p, 0); + int ret = bkey_err(bp_k); if (ret) goto err; @@ -484,7 +471,6 @@ err: fsck_err: bch2_trans_iter_exit(trans, &other_extent_iter); bch2_trans_iter_exit(trans, &bp_iter); - bch2_dev_put(ca); printbuf_exit(&buf); return ret; check_existing_bp: @@ -514,12 +500,13 @@ check_existing_bp: bch_err(c, "%s", buf.buf); if (other_extent.k->size <= orig_k.k->size) { - ret = drop_dev_and_update(trans, other_bp.v->btree_id, other_extent, bucket.inode); + ret = drop_dev_and_update(trans, other_bp.v->btree_id, + other_extent, bp->k.p.inode); if (ret) goto err; goto out; } else { - ret = drop_dev_and_update(trans, bp->v.btree_id, orig_k, bucket.inode); + ret = drop_dev_and_update(trans, bp->v.btree_id, orig_k, bp->k.p.inode); if (ret) goto err; goto missing; @@ -529,7 +516,7 @@ check_existing_bp: ret = check_extent_checksum(trans, other_bp.v->btree_id, other_extent, bp->v.btree_id, orig_k, - bucket.inode); + bp->k.p.inode); if (ret < 0) goto err; if (ret) { @@ -538,7 +525,7 @@ check_existing_bp: } ret = check_extent_checksum(trans, bp->v.btree_id, orig_k, - other_bp.v->btree_id, other_extent, bucket.inode); + other_bp.v->btree_id, other_extent, bp->k.p.inode); if (ret < 0) goto err; if (ret) { @@ -547,7 +534,7 @@ check_existing_bp: } printbuf_reset(&buf); - prt_printf(&buf, "duplicate extents pointing to same space on dev %llu\n ", bucket.inode); + prt_printf(&buf, "duplicate extents pointing to same space on dev %llu\n ", bp->k.p.inode); bch2_bkey_val_to_text(&buf, c, orig_k); 
prt_str(&buf, "\n "); bch2_bkey_val_to_text(&buf, c, other_extent); @@ -811,7 +798,7 @@ static int bch2_check_extents_to_backpointers_pass(struct btree_trans *trans, int bch2_check_extents_to_backpointers(struct bch_fs *c) { struct btree_trans *trans = bch2_trans_get(c); - struct extents_to_bp_state s = { .bucket_start = POS_MIN }; + struct extents_to_bp_state s = { .bp_start = POS_MIN }; int ret; bch2_bkey_buf_init(&s.last_flushed); @@ -822,35 +809,35 @@ int bch2_check_extents_to_backpointers(struct bch_fs *c) ret = bch2_get_btree_in_memory_pos(trans, BIT_ULL(BTREE_ID_backpointers), BIT_ULL(BTREE_ID_backpointers), - BBPOS(BTREE_ID_backpointers, s.bucket_start), &end); + BBPOS(BTREE_ID_backpointers, s.bp_start), &end); if (ret) break; - s.bucket_end = end.pos; + s.bp_end = end.pos; - if ( bpos_eq(s.bucket_start, POS_MIN) && - !bpos_eq(s.bucket_end, SPOS_MAX)) + if ( bpos_eq(s.bp_start, POS_MIN) && + !bpos_eq(s.bp_end, SPOS_MAX)) bch_verbose(c, "%s(): alloc info does not fit in ram, running in multiple passes with %zu nodes per pass", __func__, btree_nodes_fit_in_ram(c)); - if (!bpos_eq(s.bucket_start, POS_MIN) || - !bpos_eq(s.bucket_end, SPOS_MAX)) { + if (!bpos_eq(s.bp_start, POS_MIN) || + !bpos_eq(s.bp_end, SPOS_MAX)) { struct printbuf buf = PRINTBUF; prt_str(&buf, "check_extents_to_backpointers(): "); - bch2_bpos_to_text(&buf, s.bucket_start); + bch2_bpos_to_text(&buf, s.bp_start); prt_str(&buf, "-"); - bch2_bpos_to_text(&buf, s.bucket_end); + bch2_bpos_to_text(&buf, s.bp_end); bch_verbose(c, "%s", buf.buf); printbuf_exit(&buf); } ret = bch2_check_extents_to_backpointers_pass(trans, &s); - if (ret || bpos_eq(s.bucket_end, SPOS_MAX)) + if (ret || bpos_eq(s.bp_end, SPOS_MAX)) break; - s.bucket_start = bpos_successor(s.bucket_end); + s.bp_start = bpos_successor(s.bp_end); } bch2_trans_put(trans); bch2_bkey_buf_exit(&s.last_flushed, c); -- 2.51.0 From 1ab00b6cddc53d47e6f45002e69034941d40ae03 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 17 Nov 2024 18:26:54 -0500 Subject: [PATCH 02/16] bcachefs: kill bch_backpointer.bucket_offset usage bch_backpointer.bucket_offset is going away - it's no longer needed since we no longer store backpointers in alloc keys, the same information is in the key position itself. And we'll be reclaiming the space in bch_backpointer for the bucket generation number. 
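To make the encoding concrete, here is a minimal userspace sketch of recovering the bucket and the sector offset within it from a backpointer key position alone. This is illustration only, not the kernel helpers: the shift value and bucket_size parameter are assumptions, first_bucket handling is ignored, and the sub-sector "suboffset" bits printed by bch2_backpointer_to_text() are left out.

    /*
     * Illustrative sketch: a backpointer key's .offset carries the bucket
     * sector scaled by MAX_EXTENT_COMPRESS_RATIO_SHIFT, so the bucket and
     * the sector offset within it can be recovered from the key position,
     * without a bucket_offset field in the value.
     */
    #include <stdint.h>
    #include <stdio.h>

    #define MAX_EXTENT_COMPRESS_RATIO_SHIFT 4   /* value assumed for this sketch */

    static void bp_offset_to_bucket_and_offset(uint64_t bp_offset,
                                               uint64_t bucket_size,
                                               uint64_t *bucket,
                                               uint32_t *bucket_offset)
    {
        uint64_t bucket_sector = bp_offset >> MAX_EXTENT_COMPRESS_RATIO_SHIFT;

        *bucket        = bucket_sector / bucket_size;  /* analogous to sector_to_bucket() */
        *bucket_offset = bucket_sector % bucket_size;  /* sector offset within the bucket */
    }

    int main(void)
    {
        uint64_t bucket;
        uint32_t offset;

        bp_offset_to_bucket_and_offset((uint64_t) 123456 << MAX_EXTENT_COMPRESS_RATIO_SHIFT,
                                       512, &bucket, &offset);
        printf("bucket=%llu offset=%u\n", (unsigned long long) bucket, offset);
        return 0;
    }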
Signed-off-by: Kent Overstreet --- fs/bcachefs/backpointers.c | 15 +++++++-------- fs/bcachefs/backpointers.h | 8 ++++++++ 2 files changed, 15 insertions(+), 8 deletions(-) diff --git a/fs/bcachefs/backpointers.c b/fs/bcachefs/backpointers.c index 5963217cd90c..620fa67db7a6 100644 --- a/fs/bcachefs/backpointers.c +++ b/fs/bcachefs/backpointers.c @@ -54,21 +54,20 @@ void bch2_backpointer_to_text(struct printbuf *out, struct bch_fs *c, struct bke struct bkey_s_c_backpointer bp = bkey_s_c_to_backpointer(k); rcu_read_lock(); - struct bch_dev *ca = bch2_dev_rcu_noerror(c, k.k->p.inode); + struct bch_dev *ca = bch2_dev_rcu_noerror(c, bp.k->p.inode); if (ca) { - struct bpos bucket = bp_pos_to_bucket(ca, k.k->p); + u32 bucket_offset; + struct bpos bucket = bp_pos_to_bucket_and_offset(ca, bp.k->p, &bucket_offset); rcu_read_unlock(); - prt_str(out, "bucket="); - bch2_bpos_to_text(out, bucket); - prt_str(out, " "); + prt_printf(out, "bucket=%llu:%llu:%u", bucket.inode, bucket.offset, bucket_offset); } else { rcu_read_unlock(); + prt_printf(out, "sector=%llu:%llu", bp.k->p.inode, bp.k->p.offset >> MAX_EXTENT_COMPRESS_RATIO_SHIFT); } bch2_btree_id_level_to_text(out, bp.v->btree_id, bp.v->level); - prt_printf(out, " offset=%llu:%u len=%u pos=", - (u64) (bp.v->bucket_offset >> MAX_EXTENT_COMPRESS_RATIO_SHIFT), - (u32) bp.v->bucket_offset & ~(~0U << MAX_EXTENT_COMPRESS_RATIO_SHIFT), + prt_printf(out, " suboffset=%u len=%u pos=", + (u32) bp.k->p.offset & ~(~0U << MAX_EXTENT_COMPRESS_RATIO_SHIFT), bp.v->bucket_len); bch2_bpos_to_text(out, bp.v->pos); } diff --git a/fs/bcachefs/backpointers.h b/fs/bcachefs/backpointers.h index 5f34a25b599a..d8a15f5fa767 100644 --- a/fs/bcachefs/backpointers.h +++ b/fs/bcachefs/backpointers.h @@ -42,6 +42,14 @@ static inline struct bpos bp_pos_to_bucket(const struct bch_dev *ca, struct bpos return POS(bp_pos.inode, sector_to_bucket(ca, bucket_sector)); } +static inline struct bpos bp_pos_to_bucket_and_offset(const struct bch_dev *ca, struct bpos bp_pos, + u32 *bucket_offset) +{ + u64 bucket_sector = bp_pos.offset >> MAX_EXTENT_COMPRESS_RATIO_SHIFT; + + return POS(bp_pos.inode, sector_to_bucket_and_offset(ca, bucket_sector, bucket_offset)); +} + static inline bool bp_pos_to_bucket_nodev_noerror(struct bch_fs *c, struct bpos bp_pos, struct bpos *bucket) { rcu_read_lock(); -- 2.51.0 From c80f33b752688392adf2aa71190636b8e1ff204b Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 14 Nov 2024 21:34:43 -0500 Subject: [PATCH 03/16] bcachefs: New backpointers helpers - bch2_backpointer_del() - bch2_backpointer_maybe_flush() Kill a bit of open coding and make sure we're properly handling the btree write buffer. Signed-off-by: Kent Overstreet --- fs/bcachefs/backpointers.c | 58 +++++++++++++++++++++++++++++--------- 1 file changed, 45 insertions(+), 13 deletions(-) diff --git a/fs/bcachefs/backpointers.c b/fs/bcachefs/backpointers.c index 620fa67db7a6..cfd9b9ead473 100644 --- a/fs/bcachefs/backpointers.c +++ b/fs/bcachefs/backpointers.c @@ -199,6 +199,22 @@ err: return ret; } +static int bch2_backpointer_del(struct btree_trans *trans, struct bpos pos) +{ + return likely(!bch2_backpointers_no_use_write_buffer) + ? bch2_btree_delete_at_buffered(trans, BTREE_ID_backpointers, pos) + : bch2_btree_delete(trans, BTREE_ID_backpointers, pos, 0); +} + +static int bch2_backpointers_maybe_flush(struct btree_trans *trans, + struct bkey_s_c visiting_k, + struct bkey_buf *last_flushed) +{ + return likely(!bch2_backpointers_no_use_write_buffer) + ? 
bch2_btree_write_buffer_maybe_flush(trans, visiting_k, last_flushed) + : 0; +} + static void backpointer_target_not_found(struct btree_trans *trans, struct bkey_s_c_backpointer bp, struct bkey_s_c target_k) @@ -300,9 +316,12 @@ err: return b; } -static int bch2_check_btree_backpointer(struct btree_trans *trans, struct btree_iter *bp_iter, - struct bkey_s_c k) +static int bch2_check_backpointer_has_valid_bucket(struct btree_trans *trans, struct bkey_s_c k, + struct bkey_buf *last_flushed) { + if (k.k->type != KEY_TYPE_backpointer) + return 0; + struct bch_fs *c = trans->c; struct btree_iter alloc_iter = { NULL }; struct bkey_s_c alloc_k; @@ -311,10 +330,14 @@ static int bch2_check_btree_backpointer(struct btree_trans *trans, struct btree_ struct bpos bucket; if (!bp_pos_to_bucket_nodev_noerror(c, k.k->p, &bucket)) { + ret = bch2_backpointers_maybe_flush(trans, k, last_flushed); + if (ret) + goto out; + if (fsck_err(trans, backpointer_to_missing_device, "backpointer for missing device:\n%s", (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) - ret = bch2_btree_delete_at(trans, bp_iter, 0); + ret = bch2_backpointer_del(trans, k.k->p); goto out; } @@ -323,13 +346,16 @@ static int bch2_check_btree_backpointer(struct btree_trans *trans, struct btree_ if (ret) goto out; - if (fsck_err_on(alloc_k.k->type != KEY_TYPE_alloc_v4, - trans, backpointer_to_missing_alloc, - "backpointer for nonexistent alloc key: %llu:%llu:0\n%s", - alloc_iter.pos.inode, alloc_iter.pos.offset, - (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { - ret = bch2_btree_delete_at(trans, bp_iter, 0); - goto out; + if (alloc_k.k->type != KEY_TYPE_alloc_v4) { + ret = bch2_backpointers_maybe_flush(trans, k, last_flushed); + if (ret) + goto out; + + if (fsck_err(trans, backpointer_to_missing_alloc, + "backpointer for nonexistent alloc key: %llu:%llu:0\n%s", + alloc_iter.pos.inode, alloc_iter.pos.offset, + (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) + ret = bch2_backpointer_del(trans, k.k->p); } out: fsck_err: @@ -341,11 +367,17 @@ fsck_err: /* verify that every backpointer has a corresponding alloc key */ int bch2_check_btree_backpointers(struct bch_fs *c) { + struct bkey_buf last_flushed; + bch2_bkey_buf_init(&last_flushed); + bkey_init(&last_flushed.k->k); + int ret = bch2_trans_run(c, for_each_btree_key_commit(trans, iter, BTREE_ID_backpointers, POS_MIN, 0, k, NULL, NULL, BCH_TRANS_COMMIT_no_enospc, - bch2_check_btree_backpointer(trans, &iter, k))); + bch2_check_backpointer_has_valid_bucket(trans, k, &last_flushed))); + + bch2_bkey_buf_exit(&last_flushed, c); bch_err_fn(c, ret); return ret; } @@ -874,7 +906,7 @@ static int check_one_backpointer(struct btree_trans *trans, return ret; if (!k.k) { - ret = bch2_btree_write_buffer_maybe_flush(trans, bp.s_c, last_flushed); + ret = bch2_backpointers_maybe_flush(trans, bp.s_c, last_flushed); if (ret) goto out; @@ -882,7 +914,7 @@ static int check_one_backpointer(struct btree_trans *trans, "backpointer for missing %s\n %s", bp.v->level ? 
"btree node" : "extent", (bch2_bkey_val_to_text(&buf, c, bp.s_c), buf.buf))) { - ret = bch2_btree_delete_at_buffered(trans, BTREE_ID_backpointers, bp.k->p); + ret = bch2_backpointer_del(trans, bp.k->p); goto out; } } -- 2.51.0 From c601e5d7daea102481bd7512b013018fdbf57034 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 23 Nov 2024 16:27:47 -0500 Subject: [PATCH 04/16] bcachefs: Can now block journal activity without closing cur entry Signed-off-by: Kent Overstreet --- fs/bcachefs/journal.c | 44 +++++++++++++++++++++++++++++++++++-- fs/bcachefs/journal.h | 3 ++- fs/bcachefs/journal_types.h | 2 ++ 3 files changed, 46 insertions(+), 3 deletions(-) diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c index 2cf8f24d50cc..bfbb1ac60c3d 100644 --- a/fs/bcachefs/journal.c +++ b/fs/bcachefs/journal.c @@ -217,6 +217,12 @@ void bch2_journal_buf_put_final(struct journal *j, u64 seq) if (__bch2_journal_pin_put(j, seq)) bch2_journal_reclaim_fast(j); bch2_journal_do_writes(j); + + /* + * for __bch2_next_write_buffer_flush_journal_buf(), when quiescing an + * open journal entry + */ + wake_up(&j->wait); } /* @@ -251,6 +257,9 @@ static void __journal_entry_close(struct journal *j, unsigned closed_val, bool t if (!__journal_entry_is_open(old)) return; + if (old.cur_entry_offset == JOURNAL_ENTRY_BLOCKED_VAL) + old.cur_entry_offset = j->cur_entry_offset_if_blocked; + /* Close out old buffer: */ buf->data->u64s = cpu_to_le32(old.cur_entry_offset); @@ -868,16 +877,44 @@ int bch2_journal_meta(struct journal *j) void bch2_journal_unblock(struct journal *j) { spin_lock(&j->lock); - j->blocked--; + if (!--j->blocked && + j->cur_entry_offset_if_blocked < JOURNAL_ENTRY_CLOSED_VAL && + j->reservations.cur_entry_offset == JOURNAL_ENTRY_BLOCKED_VAL) { + union journal_res_state old, new; + + old.v = atomic64_read(&j->reservations.counter); + do { + new.v = old.v; + new.cur_entry_offset = j->cur_entry_offset_if_blocked; + } while (!atomic64_try_cmpxchg(&j->reservations.counter, &old.v, new.v)); + } spin_unlock(&j->lock); journal_wake(j); } +static void __bch2_journal_block(struct journal *j) +{ + if (!j->blocked++) { + union journal_res_state old, new; + + old.v = atomic64_read(&j->reservations.counter); + do { + j->cur_entry_offset_if_blocked = old.cur_entry_offset; + + if (j->cur_entry_offset_if_blocked >= JOURNAL_ENTRY_CLOSED_VAL) + break; + + new.v = old.v; + new.cur_entry_offset = JOURNAL_ENTRY_BLOCKED_VAL; + } while (!atomic64_try_cmpxchg(&j->reservations.counter, &old.v, new.v)); + } +} + void bch2_journal_block(struct journal *j) { spin_lock(&j->lock); - j->blocked++; + __bch2_journal_block(j); spin_unlock(&j->lock); journal_quiesce(j); @@ -1481,6 +1518,9 @@ void __bch2_journal_debug_to_text(struct printbuf *out, struct journal *j) case JOURNAL_ENTRY_CLOSED_VAL: prt_printf(out, "closed\n"); break; + case JOURNAL_ENTRY_BLOCKED_VAL: + prt_printf(out, "blocked\n"); + break; default: prt_printf(out, "%u/%u\n", s.cur_entry_offset, j->cur_entry_u64s); break; diff --git a/fs/bcachefs/journal.h b/fs/bcachefs/journal.h index 2762be6f9814..6d3c839bbbef 100644 --- a/fs/bcachefs/journal.h +++ b/fs/bcachefs/journal.h @@ -285,7 +285,8 @@ static inline void bch2_journal_buf_put(struct journal *j, unsigned idx, u64 seq spin_lock(&j->lock); bch2_journal_buf_put_final(j, seq); spin_unlock(&j->lock); - } + } else if (unlikely(s.cur_entry_offset == JOURNAL_ENTRY_BLOCKED_VAL)) + wake_up(&j->wait); } /* diff --git a/fs/bcachefs/journal_types.h b/fs/bcachefs/journal_types.h index 19183fcf7ad7..425d1abb257e 100644 --- 
a/fs/bcachefs/journal_types.h +++ b/fs/bcachefs/journal_types.h @@ -112,6 +112,7 @@ union journal_res_state { */ #define JOURNAL_ENTRY_OFFSET_MAX ((1U << 20) - 1) +#define JOURNAL_ENTRY_BLOCKED_VAL (JOURNAL_ENTRY_OFFSET_MAX - 2) #define JOURNAL_ENTRY_CLOSED_VAL (JOURNAL_ENTRY_OFFSET_MAX - 1) #define JOURNAL_ENTRY_ERROR_VAL (JOURNAL_ENTRY_OFFSET_MAX) @@ -193,6 +194,7 @@ struct journal { * insufficient devices: */ enum journal_errors cur_entry_error; + unsigned cur_entry_offset_if_blocked; unsigned buf_size_want; /* -- 2.51.0 From f4d67f6d5a4c042a28fa1d67f756e3d24ba60888 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 23 Nov 2024 18:21:12 -0500 Subject: [PATCH 05/16] bcachefs: trivial btree write buffer refactoring Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_write_buffer.c | 64 ++++++++++++++++---------------- 1 file changed, 31 insertions(+), 33 deletions(-) diff --git a/fs/bcachefs/btree_write_buffer.c b/fs/bcachefs/btree_write_buffer.c index 1639c60dffa0..1bd26221f156 100644 --- a/fs/bcachefs/btree_write_buffer.c +++ b/fs/bcachefs/btree_write_buffer.c @@ -19,8 +19,6 @@ static int bch2_btree_write_buffer_journal_flush(struct journal *, struct journal_entry_pin *, u64); -static int bch2_journal_keys_to_write_buffer(struct bch_fs *, struct journal_buf *); - static inline bool __wb_key_ref_cmp(const struct wb_key_ref *l, const struct wb_key_ref *r) { return (cmp_int(l->hi, r->hi) ?: @@ -481,13 +479,38 @@ err: return ret; } -static int fetch_wb_keys_from_journal(struct bch_fs *c, u64 seq) +static int bch2_journal_keys_to_write_buffer(struct bch_fs *c, struct journal_buf *buf) +{ + struct journal_keys_to_wb dst; + int ret = 0; + + bch2_journal_keys_to_write_buffer_start(c, &dst, le64_to_cpu(buf->data->seq)); + + for_each_jset_entry_type(entry, buf->data, BCH_JSET_ENTRY_write_buffer_keys) { + jset_entry_for_each_key(entry, k) { + ret = bch2_journal_key_to_wb(c, &dst, entry->btree_id, k); + if (ret) + goto out; + } + + entry->type = BCH_JSET_ENTRY_btree_keys; + } + + spin_lock(&c->journal.lock); + buf->need_flush_to_write_buffer = false; + spin_unlock(&c->journal.lock); +out: + ret = bch2_journal_keys_to_write_buffer_end(c, &dst) ?: ret; + return ret; +} + +static int fetch_wb_keys_from_journal(struct bch_fs *c, u64 max_seq) { struct journal *j = &c->journal; struct journal_buf *buf; int ret = 0; - while (!ret && (buf = bch2_next_write_buffer_flush_journal_buf(j, seq))) { + while (!ret && (buf = bch2_next_write_buffer_flush_journal_buf(j, max_seq))) { ret = bch2_journal_keys_to_write_buffer(c, buf); mutex_unlock(&j->buf_lock); } @@ -495,7 +518,7 @@ static int fetch_wb_keys_from_journal(struct bch_fs *c, u64 seq) return ret; } -static int btree_write_buffer_flush_seq(struct btree_trans *trans, u64 seq, +static int btree_write_buffer_flush_seq(struct btree_trans *trans, u64 max_seq, bool *did_work) { struct bch_fs *c = trans->c; @@ -505,7 +528,7 @@ static int btree_write_buffer_flush_seq(struct btree_trans *trans, u64 seq, do { bch2_trans_unlock(trans); - fetch_from_journal_err = fetch_wb_keys_from_journal(c, seq); + fetch_from_journal_err = fetch_wb_keys_from_journal(c, max_seq); *did_work |= wb->inc.keys.nr || wb->flushing.keys.nr; @@ -518,8 +541,8 @@ static int btree_write_buffer_flush_seq(struct btree_trans *trans, u64 seq, mutex_unlock(&wb->flushing.lock); } while (!ret && (fetch_from_journal_err || - (wb->inc.pin.seq && wb->inc.pin.seq <= seq) || - (wb->flushing.pin.seq && wb->flushing.pin.seq <= seq))); + (wb->inc.pin.seq && wb->inc.pin.seq <= max_seq) || + 
(wb->flushing.pin.seq && wb->flushing.pin.seq <= max_seq))); return ret; } @@ -771,31 +794,6 @@ int bch2_journal_keys_to_write_buffer_end(struct bch_fs *c, struct journal_keys_ return ret; } -static int bch2_journal_keys_to_write_buffer(struct bch_fs *c, struct journal_buf *buf) -{ - struct journal_keys_to_wb dst; - int ret = 0; - - bch2_journal_keys_to_write_buffer_start(c, &dst, le64_to_cpu(buf->data->seq)); - - for_each_jset_entry_type(entry, buf->data, BCH_JSET_ENTRY_write_buffer_keys) { - jset_entry_for_each_key(entry, k) { - ret = bch2_journal_key_to_wb(c, &dst, entry->btree_id, k); - if (ret) - goto out; - } - - entry->type = BCH_JSET_ENTRY_btree_keys; - } - - spin_lock(&c->journal.lock); - buf->need_flush_to_write_buffer = false; - spin_unlock(&c->journal.lock); -out: - ret = bch2_journal_keys_to_write_buffer_end(c, &dst) ?: ret; - return ret; -} - static int wb_keys_resize(struct btree_write_buffer_keys *wb, size_t new_size) { if (wb->keys.size >= new_size) -- 2.51.0 From e1cb4f56dc4cfcf1cb1b90479cb35c5a304ff527 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 23 Nov 2024 16:47:10 -0500 Subject: [PATCH 06/16] bcachefs: Bias reads more in favor of faster device Per reports of performance issues on mixed multi device filesystems where we're issuing too much IO to the spinning rust - tweak this algorithm. Signed-off-by: Kent Overstreet --- fs/bcachefs/extents.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c index 98bb680b3860..83aeceb68847 100644 --- a/fs/bcachefs/extents.c +++ b/fs/bcachefs/extents.c @@ -89,6 +89,14 @@ static inline bool ptr_better(struct bch_fs *c, u64 l1 = dev_latency(c, p1.ptr.dev); u64 l2 = dev_latency(c, p2.ptr.dev); + /* + * Square the latencies, to bias more in favor of the faster + * device - we never want to stop issuing reads to the slower + * device altogether, so that we can update our latency numbers: + */ + l1 *= l1; + l2 *= l2; + /* Pick at random, biased in favor of the faster device: */ return bch2_rand_range(l1 + l2) > l1; -- 2.51.0 From bb61afebca3b5608842e2ac59bbe4eedf137050f Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 21 Nov 2024 20:09:45 -0500 Subject: [PATCH 07/16] bcachefs: discard fastpath now uses bch2_discard_one_bucket() The discard bucket fastpath previously was using its own code for discarding buckets and clearing them in the need_discard btree, which didn't have any of the consistency checks of the main discard path. 
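The shape of the fix, as a standalone sketch (illustrative only; the stub names below are made up, not the bcachefs functions): both callers go through one helper, so the "is this bucket really in need_discard" check runs on the fastpath too, and only the slow path still takes the in-flight marker, since the fastpath caller already holds it.

    #include <stdbool.h>
    #include <stdio.h>

    static bool bucket_needs_discard(unsigned bucket)
    {
        /* stand-in for looking the bucket up in the need_discard btree */
        return bucket % 2 == 0;
    }

    static bool mark_discard_in_flight(unsigned bucket)
    {
        /* stand-in for discard_in_flight_add() */
        printf("marking bucket %u in flight\n", bucket);
        return true;
    }

    static int discard_one_bucket(unsigned bucket, bool fastpath)
    {
        /* consistency check now shared by both paths */
        if (!bucket_needs_discard(bucket)) {
            fprintf(stderr, "bucket %u not in need_discard\n", bucket);
            return -1;
        }

        /* fastpath caller already holds the in-flight marker */
        if (!fastpath && !mark_discard_in_flight(bucket))
            return 0;

        printf("discarding bucket %u\n", bucket);
        return 0;
    }

    int main(void)
    {
        discard_one_bucket(2, false);  /* background discard path */
        discard_one_bucket(4, true);   /* fastpath */
        return 0;
    }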
Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_background.c | 75 +++++++++++++++++++--------------- 1 file changed, 41 insertions(+), 34 deletions(-) diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index ae9fdb5ad758..1e9f53db4bb8 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -1725,7 +1725,8 @@ static int bch2_discard_one_bucket(struct btree_trans *trans, struct bch_dev *ca, struct btree_iter *need_discard_iter, struct bpos *discard_pos_done, - struct discard_buckets_state *s) + struct discard_buckets_state *s, + bool fastpath) { struct bch_fs *c = trans->c; struct bpos pos = need_discard_iter->pos; @@ -1782,10 +1783,12 @@ static int bch2_discard_one_bucket(struct btree_trans *trans, goto out; } - if (discard_in_flight_add(ca, iter.pos.offset, true)) - goto out; + if (!fastpath) { + if (discard_in_flight_add(ca, iter.pos.offset, true)) + goto out; - discard_locked = true; + discard_locked = true; + } if (!bkey_eq(*discard_pos_done, iter.pos) && ca->mi.discard && !c->opts.nochanges) { @@ -1799,6 +1802,7 @@ static int bch2_discard_one_bucket(struct btree_trans *trans, ca->mi.bucket_size, GFP_KERNEL); *discard_pos_done = iter.pos; + s->discarded++; ret = bch2_trans_relock_notrace(trans); if (ret) @@ -1819,12 +1823,12 @@ commit: goto out; count_event(c, bucket_discard); - s->discarded++; out: fsck_err: if (discard_locked) discard_in_flight_remove(ca, iter.pos.offset); - s->seen++; + if (!ret) + s->seen++; bch2_trans_iter_exit(trans, &iter); printbuf_exit(&buf); return ret; @@ -1848,7 +1852,7 @@ static void bch2_do_discards_work(struct work_struct *work) BTREE_ID_need_discard, POS(ca->dev_idx, 0), POS(ca->dev_idx, U64_MAX), 0, k, - bch2_discard_one_bucket(trans, ca, &iter, &discard_pos_done, &s))); + bch2_discard_one_bucket(trans, ca, &iter, &discard_pos_done, &s, false))); trace_discard_buckets(c, s.seen, s.open, s.need_journal_commit, s.discarded, bch2_err_str(ret)); @@ -1881,27 +1885,31 @@ void bch2_do_discards(struct bch_fs *c) bch2_dev_do_discards(ca); } -static int bch2_clear_bucket_needs_discard(struct btree_trans *trans, struct bpos bucket) +static int bch2_do_discards_fast_one(struct btree_trans *trans, + struct bch_dev *ca, + u64 bucket, + struct bpos *discard_pos_done, + struct discard_buckets_state *s) { - struct btree_iter iter; - bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc, bucket, BTREE_ITER_intent); - struct bkey_s_c k = bch2_btree_iter_peek_slot(&iter); - int ret = bkey_err(k); - if (ret) - goto err; - - struct bkey_i_alloc_v4 *a = bch2_alloc_to_v4_mut(trans, k); - ret = PTR_ERR_OR_ZERO(a); + struct btree_iter need_discard_iter; + struct bkey_s_c discard_k = bch2_bkey_get_iter(trans, &need_discard_iter, + BTREE_ID_need_discard, POS(ca->dev_idx, bucket), 0); + int ret = bkey_err(discard_k); if (ret) - goto err; + return ret; - BUG_ON(a->v.dirty_sectors); - SET_BCH_ALLOC_V4_NEED_DISCARD(&a->v, false); - alloc_data_type_set(&a->v, a->v.data_type); + if (log_fsck_err_on(discard_k.k->type != KEY_TYPE_set, + trans, discarding_bucket_not_in_need_discard_btree, + "attempting to discard bucket %u:%llu not in need_discard btree", + ca->dev_idx, bucket)) { + /* log it in the superblock and continue: */ + goto out; + } - ret = bch2_trans_update(trans, &iter, &a->k_i, 0); -err: - bch2_trans_iter_exit(trans, &iter); + ret = bch2_discard_one_bucket(trans, ca, &need_discard_iter, discard_pos_done, s, true); +out: +fsck_err: + bch2_trans_iter_exit(trans, &need_discard_iter); return ret; } @@ -1909,6 +1917,10 @@ 
static void bch2_do_discards_fast_work(struct work_struct *work) { struct bch_dev *ca = container_of(work, struct bch_dev, discard_fast_work); struct bch_fs *c = ca->fs; + struct discard_buckets_state s = {}; + struct bpos discard_pos_done = POS_MAX; + struct btree_trans *trans = bch2_trans_get(c); + int ret = 0; while (1) { bool got_bucket = false; @@ -1929,16 +1941,8 @@ static void bch2_do_discards_fast_work(struct work_struct *work) if (!got_bucket) break; - if (ca->mi.discard && !c->opts.nochanges) - blkdev_issue_discard(ca->disk_sb.bdev, - bucket_to_sector(ca, bucket), - ca->mi.bucket_size, - GFP_KERNEL); - - int ret = bch2_trans_commit_do(c, NULL, NULL, - BCH_WATERMARK_btree| - BCH_TRANS_COMMIT_no_enospc, - bch2_clear_bucket_needs_discard(trans, POS(ca->dev_idx, bucket))); + ret = lockrestart_do(trans, + bch2_do_discards_fast_one(trans, ca, bucket, &discard_pos_done, &s)); bch_err_fn(c, ret); discard_in_flight_remove(ca, bucket); @@ -1947,6 +1951,9 @@ static void bch2_do_discards_fast_work(struct work_struct *work) break; } + trace_discard_buckets(c, s.seen, s.open, s.need_journal_commit, s.discarded, bch2_err_str(ret)); + + bch2_trans_put(trans); percpu_ref_put(&ca->io_ref); bch2_write_ref_put(c, BCH_WRITE_REF_discard_fast); } -- 2.51.0 From 0eaac0b44fa93a5bf8bd5d79c557ca5e04d2dc16 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 23 Apr 2024 02:18:18 -0400 Subject: [PATCH 08/16] bcachefs: btree_write_buffer_flush_seq() no longer closes journal Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_write_buffer.c | 19 ++++++++++++++----- fs/bcachefs/journal.c | 27 ++++++++++++++++++++------- fs/bcachefs/journal.h | 2 +- 3 files changed, 35 insertions(+), 13 deletions(-) diff --git a/fs/bcachefs/btree_write_buffer.c b/fs/bcachefs/btree_write_buffer.c index 1bd26221f156..49ce2d1e5c02 100644 --- a/fs/bcachefs/btree_write_buffer.c +++ b/fs/bcachefs/btree_write_buffer.c @@ -495,10 +495,6 @@ static int bch2_journal_keys_to_write_buffer(struct bch_fs *c, struct journal_bu entry->type = BCH_JSET_ENTRY_btree_keys; } - - spin_lock(&c->journal.lock); - buf->need_flush_to_write_buffer = false; - spin_unlock(&c->journal.lock); out: ret = bch2_journal_keys_to_write_buffer_end(c, &dst) ?: ret; return ret; @@ -508,11 +504,24 @@ static int fetch_wb_keys_from_journal(struct bch_fs *c, u64 max_seq) { struct journal *j = &c->journal; struct journal_buf *buf; + bool blocked; int ret = 0; - while (!ret && (buf = bch2_next_write_buffer_flush_journal_buf(j, max_seq))) { + while (!ret && (buf = bch2_next_write_buffer_flush_journal_buf(j, max_seq, &blocked))) { ret = bch2_journal_keys_to_write_buffer(c, buf); + + if (!blocked && !ret) { + spin_lock(&j->lock); + buf->need_flush_to_write_buffer = false; + spin_unlock(&j->lock); + } + mutex_unlock(&j->buf_lock); + + if (blocked) { + bch2_journal_unblock(j); + break; + } } return ret; diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c index bfbb1ac60c3d..699db0d0749a 100644 --- a/fs/bcachefs/journal.c +++ b/fs/bcachefs/journal.c @@ -908,6 +908,8 @@ static void __bch2_journal_block(struct journal *j) new.v = old.v; new.cur_entry_offset = JOURNAL_ENTRY_BLOCKED_VAL; } while (!atomic64_try_cmpxchg(&j->reservations.counter, &old.v, new.v)); + + journal_cur_buf(j)->data->u64s = cpu_to_le32(old.cur_entry_offset); } } @@ -920,7 +922,8 @@ void bch2_journal_block(struct journal *j) journal_quiesce(j); } -static struct journal_buf *__bch2_next_write_buffer_flush_journal_buf(struct journal *j, u64 max_seq) +static struct journal_buf 
*__bch2_next_write_buffer_flush_journal_buf(struct journal *j, + u64 max_seq, bool *blocked) { struct journal_buf *ret = NULL; @@ -937,13 +940,17 @@ static struct journal_buf *__bch2_next_write_buffer_flush_journal_buf(struct jou struct journal_buf *buf = j->buf + idx; if (buf->need_flush_to_write_buffer) { - if (seq == journal_cur_seq(j)) - __journal_entry_close(j, JOURNAL_ENTRY_CLOSED_VAL, true); - union journal_res_state s; s.v = atomic64_read_acquire(&j->reservations.counter); - ret = journal_state_count(s, idx) + unsigned open = seq == journal_cur_seq(j) && __journal_entry_is_open(s); + + if (open && !*blocked) { + __bch2_journal_block(j); + *blocked = true; + } + + ret = journal_state_count(s, idx) > open ? ERR_PTR(-EAGAIN) : buf; break; @@ -956,11 +963,17 @@ static struct journal_buf *__bch2_next_write_buffer_flush_journal_buf(struct jou return ret; } -struct journal_buf *bch2_next_write_buffer_flush_journal_buf(struct journal *j, u64 max_seq) +struct journal_buf *bch2_next_write_buffer_flush_journal_buf(struct journal *j, + u64 max_seq, bool *blocked) { struct journal_buf *ret; + *blocked = false; + + wait_event(j->wait, (ret = __bch2_next_write_buffer_flush_journal_buf(j, + max_seq, blocked)) != ERR_PTR(-EAGAIN)); + if (IS_ERR_OR_NULL(ret) && *blocked) + bch2_journal_unblock(j); - wait_event(j->wait, (ret = __bch2_next_write_buffer_flush_journal_buf(j, max_seq)) != ERR_PTR(-EAGAIN)); return ret; } diff --git a/fs/bcachefs/journal.h b/fs/bcachefs/journal.h index 6d3c839bbbef..71a50846967f 100644 --- a/fs/bcachefs/journal.h +++ b/fs/bcachefs/journal.h @@ -425,7 +425,7 @@ static inline void bch2_journal_set_replay_done(struct journal *j) void bch2_journal_unblock(struct journal *); void bch2_journal_block(struct journal *); -struct journal_buf *bch2_next_write_buffer_flush_journal_buf(struct journal *j, u64 max_seq); +struct journal_buf *bch2_next_write_buffer_flush_journal_buf(struct journal *, u64, bool *); void __bch2_journal_debug_to_text(struct printbuf *, struct journal *); void bch2_journal_debug_to_text(struct printbuf *, struct journal *); -- 2.51.0 From 375d21b76d9a20ff4bddf81aadadd6e57f38dee8 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 23 Nov 2024 22:12:58 -0500 Subject: [PATCH 09/16] bcachefs: BCH_ERR_btree_node_read_error_cached Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_cache.c | 6 +++--- fs/bcachefs/errcode.h | 1 + 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c index a0a406b0c7bc..36dfa6a48aa6 100644 --- a/fs/bcachefs/btree_cache.c +++ b/fs/bcachefs/btree_cache.c @@ -1131,7 +1131,7 @@ retry: if (unlikely(btree_node_read_error(b))) { six_unlock_type(&b->c.lock, lock_type); - return ERR_PTR(-BCH_ERR_btree_node_read_error); + return ERR_PTR(-BCH_ERR_btree_node_read_err_cached); } EBUG_ON(b->c.btree_id != path->btree_id); @@ -1221,7 +1221,7 @@ struct btree *bch2_btree_node_get(struct btree_trans *trans, struct btree_path * if (unlikely(btree_node_read_error(b))) { six_unlock_type(&b->c.lock, lock_type); - return ERR_PTR(-BCH_ERR_btree_node_read_error); + return ERR_PTR(-BCH_ERR_btree_node_read_err_cached); } EBUG_ON(b->c.btree_id != path->btree_id); @@ -1303,7 +1303,7 @@ lock_node: if (unlikely(btree_node_read_error(b))) { six_unlock_read(&b->c.lock); - b = ERR_PTR(-BCH_ERR_btree_node_read_error); + b = ERR_PTR(-BCH_ERR_btree_node_read_err_cached); goto out; } diff --git a/fs/bcachefs/errcode.h b/fs/bcachefs/errcode.h index 2dda7f962e5b..131b9bef21a0 100644 --- 
a/fs/bcachefs/errcode.h +++ b/fs/bcachefs/errcode.h @@ -242,6 +242,7 @@ x(BCH_ERR_invalid, invalid_bkey) \ x(BCH_ERR_operation_blocked, nocow_lock_blocked) \ x(EIO, btree_node_read_err) \ + x(BCH_ERR_btree_node_read_err, btree_node_read_err_cached) \ x(EIO, sb_not_downgraded) \ x(EIO, btree_node_write_all_failed) \ x(EIO, btree_node_read_error) \ -- 2.51.0 From 525be09e63cc9ff4bee0b0f480551a4dc2e7b276 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 24 Nov 2024 20:15:30 -0500 Subject: [PATCH 10/16] bcachefs: Use separate rhltable for bch2_inode_or_descendents_is_open() Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs.h | 1 + fs/bcachefs/fs.c | 39 ++++++++++++++++++++++++++++++--------- fs/bcachefs/fs.h | 1 + 3 files changed, 32 insertions(+), 9 deletions(-) diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index 11f9ed42a9da..f1d8c821d27a 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -1020,6 +1020,7 @@ struct bch_fs { struct list_head vfs_inodes_list; struct mutex vfs_inodes_lock; struct rhashtable vfs_inodes_table; + struct rhltable vfs_inodes_by_inum_table; /* VFS IO PATH - fs-io.c */ struct bio_set writepage_bioset; diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c index 50d323fca001..c6e7df7c67fa 100644 --- a/fs/bcachefs/fs.c +++ b/fs/bcachefs/fs.c @@ -39,6 +39,7 @@ #include #include #include +#include #include #include #include @@ -176,8 +177,9 @@ static bool subvol_inum_eq(subvol_inum a, subvol_inum b) static u32 bch2_vfs_inode_hash_fn(const void *data, u32 len, u32 seed) { const subvol_inum *inum = data; + siphash_key_t k = { .key[0] = seed }; - return jhash(&inum->inum, sizeof(inum->inum), seed); + return siphash_2u64(inum->subvol, inum->inum, &k); } static u32 bch2_vfs_inode_obj_hash_fn(const void *data, u32 len, u32 seed) @@ -206,11 +208,18 @@ static const struct rhashtable_params bch2_vfs_inodes_params = { .automatic_shrinking = true, }; +static const struct rhashtable_params bch2_vfs_inodes_by_inum_params = { + .head_offset = offsetof(struct bch_inode_info, by_inum_hash), + .key_offset = offsetof(struct bch_inode_info, ei_inum.inum), + .key_len = sizeof(u64), + .automatic_shrinking = true, +}; + int bch2_inode_or_descendents_is_open(struct btree_trans *trans, struct bpos p) { struct bch_fs *c = trans->c; - struct rhashtable *ht = &c->vfs_inodes_table; - subvol_inum inum = (subvol_inum) { .inum = p.offset }; + struct rhltable *ht = &c->vfs_inodes_by_inum_table; + u64 inum = p.offset; DARRAY(u32) subvols; int ret = 0; @@ -235,15 +244,15 @@ restart_from_top: struct rhash_lock_head __rcu *const *bkt; struct rhash_head *he; unsigned int hash; - struct bucket_table *tbl = rht_dereference_rcu(ht->tbl, ht); + struct bucket_table *tbl = rht_dereference_rcu(ht->ht.tbl, &ht->ht); restart: - hash = rht_key_hashfn(ht, tbl, &inum, bch2_vfs_inodes_params); + hash = rht_key_hashfn(&ht->ht, tbl, &inum, bch2_vfs_inodes_by_inum_params); bkt = rht_bucket(tbl, hash); do { struct bch_inode_info *inode; rht_for_each_entry_rcu_from(inode, he, rht_ptr_rcu(bkt), tbl, hash, hash) { - if (inode->ei_inum.inum == inum.inum) { + if (inode->ei_inum.inum == inum) { ret = darray_push_gfp(&subvols, inode->ei_inum.subvol, GFP_NOWAIT|__GFP_NOWARN); if (ret) { @@ -264,7 +273,7 @@ restart: /* Ensure we see any new tables. 
*/ smp_rmb(); - tbl = rht_dereference_rcu(tbl->future_tbl, ht); + tbl = rht_dereference_rcu(tbl->future_tbl, &ht->ht); if (unlikely(tbl)) goto restart; rcu_read_unlock(); @@ -343,7 +352,11 @@ static void bch2_inode_hash_remove(struct bch_fs *c, struct bch_inode_info *inod spin_unlock(&inode->v.i_lock); if (remove) { - int ret = rhashtable_remove_fast(&c->vfs_inodes_table, + int ret = rhltable_remove(&c->vfs_inodes_by_inum_table, + &inode->by_inum_hash, bch2_vfs_inodes_by_inum_params); + BUG_ON(ret); + + ret = rhashtable_remove_fast(&c->vfs_inodes_table, &inode->hash, bch2_vfs_inodes_params); BUG_ON(ret); inode->v.i_hash.pprev = NULL; @@ -388,6 +401,11 @@ retry: discard_new_inode(&inode->v); return old; } else { + int ret = rhltable_insert(&c->vfs_inodes_by_inum_table, + &inode->by_inum_hash, + bch2_vfs_inodes_by_inum_params); + BUG_ON(ret); + inode_fake_hash(&inode->v); inode_sb_list_add(&inode->v); @@ -2359,13 +2377,16 @@ static int bch2_init_fs_context(struct fs_context *fc) void bch2_fs_vfs_exit(struct bch_fs *c) { + if (c->vfs_inodes_by_inum_table.ht.tbl) + rhltable_destroy(&c->vfs_inodes_by_inum_table); if (c->vfs_inodes_table.tbl) rhashtable_destroy(&c->vfs_inodes_table); } int bch2_fs_vfs_init(struct bch_fs *c) { - return rhashtable_init(&c->vfs_inodes_table, &bch2_vfs_inodes_params); + return rhashtable_init(&c->vfs_inodes_table, &bch2_vfs_inodes_params) ?: + rhltable_init(&c->vfs_inodes_by_inum_table, &bch2_vfs_inodes_by_inum_params); } static struct file_system_type bcache_fs_type = { diff --git a/fs/bcachefs/fs.h b/fs/bcachefs/fs.h index 59f9f7ae728d..dd2198541455 100644 --- a/fs/bcachefs/fs.h +++ b/fs/bcachefs/fs.h @@ -14,6 +14,7 @@ struct bch_inode_info { struct inode v; struct rhash_head hash; + struct rhlist_head by_inum_hash; subvol_inum ei_inum; struct list_head ei_vfs_inode_list; -- 2.51.0 From 6534a404d4924820bd1c06fb4412dd4444234f79 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 24 Nov 2024 21:49:08 -0500 Subject: [PATCH 11/16] bcachefs: errcode cleanup: journal errors Instead of throwing standard error codes, we should be throwing dedicated private error codes, this greatly improves debugability. 
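As a standalone illustration of the pattern (a simplified sketch, not fs/bcachefs/errcode.h itself; the real table also records a parent error class per entry): an X-macro table gives every failure site its own code plus a printable name, so a log line can say journal_flush_err instead of a bare EIO.

    #include <stdio.h>

    #define MY_ERRCODES()                \
        x(journal_shutdown)              \
        x(journal_flush_err)             \
        x(btree_node_read_err)

    enum my_errcode {
        MY_ERR_START = 1024,
    #define x(name) MY_ERR_##name,
        MY_ERRCODES()
    #undef x
        MY_ERR_MAX
    };

    static const char * const my_errcode_strs[] = {
    #define x(name) [MY_ERR_##name - MY_ERR_START] = #name,
        MY_ERRCODES()
    #undef x
    };

    /* error codes are returned negated, like kernel errnos */
    static const char *my_err_str(int err)
    {
        err = -err;
        if (err > MY_ERR_START && err < MY_ERR_MAX)
            return my_errcode_strs[err - MY_ERR_START];
        return "(unknown)";
    }

    int main(void)
    {
        printf("%s\n", my_err_str(-MY_ERR_journal_flush_err));
        return 0;
    }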
Signed-off-by: Kent Overstreet --- fs/bcachefs/errcode.h | 2 ++ fs/bcachefs/journal.c | 4 ++-- fs/bcachefs/journal.h | 2 +- 3 files changed, 5 insertions(+), 3 deletions(-) diff --git a/fs/bcachefs/errcode.h b/fs/bcachefs/errcode.h index 131b9bef21a0..c989ce4f715f 100644 --- a/fs/bcachefs/errcode.h +++ b/fs/bcachefs/errcode.h @@ -241,6 +241,8 @@ x(BCH_ERR_invalid_sb, invalid_sb_downgrade) \ x(BCH_ERR_invalid, invalid_bkey) \ x(BCH_ERR_operation_blocked, nocow_lock_blocked) \ + x(EIO, journal_shutdown) \ + x(EIO, journal_flush_err) \ x(EIO, btree_node_read_err) \ x(BCH_ERR_btree_node_read_err, btree_node_read_err_cached) \ x(EIO, sb_not_downgraded) \ diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c index 699db0d0749a..bbdd0b17ae69 100644 --- a/fs/bcachefs/journal.c +++ b/fs/bcachefs/journal.c @@ -673,7 +673,7 @@ out: * @seq: seq to flush * @parent: closure object to wait with * Returns: 1 if @seq has already been flushed, 0 if @seq is being flushed, - * -EIO if @seq will never be flushed + * -BCH_ERR_journal_flush_err if @seq will never be flushed * * Like bch2_journal_wait_on_seq, except that it triggers a write immediately if * necessary @@ -696,7 +696,7 @@ int bch2_journal_flush_seq_async(struct journal *j, u64 seq, /* Recheck under lock: */ if (j->err_seq && seq >= j->err_seq) { - ret = -EIO; + ret = -BCH_ERR_journal_flush_err; goto out; } diff --git a/fs/bcachefs/journal.h b/fs/bcachefs/journal.h index 71a50846967f..a6a2e888c59b 100644 --- a/fs/bcachefs/journal.h +++ b/fs/bcachefs/journal.h @@ -412,7 +412,7 @@ void bch2_journal_halt(struct journal *); static inline int bch2_journal_error(struct journal *j) { return j->reservations.cur_entry_offset == JOURNAL_ENTRY_ERROR_VAL - ? -EIO : 0; + ? -BCH_ERR_journal_shutdown : 0; } struct bch_dev; -- 2.51.0 From a7ecd5f2ccabb97174f8264b965c9597ea854e8f Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 24 Nov 2024 22:23:41 -0500 Subject: [PATCH 12/16] bcachefs: disk_accounting: bch2_dev_rcu -> bch2_dev_rcu_noerror Accounting keys that reference invalid devices are corrected by fsck, they shouldn't cause an emergency shutdown. 
Signed-off-by: Kent Overstreet --- fs/bcachefs/disk_accounting.c | 4 ++-- fs/bcachefs/disk_accounting.h | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/fs/bcachefs/disk_accounting.c b/fs/bcachefs/disk_accounting.c index 55a00018dc8b..fa821d278c45 100644 --- a/fs/bcachefs/disk_accounting.c +++ b/fs/bcachefs/disk_accounting.c @@ -805,7 +805,7 @@ int bch2_accounting_read(struct bch_fs *c) break; case BCH_DISK_ACCOUNTING_dev_data_type: rcu_read_lock(); - struct bch_dev *ca = bch2_dev_rcu(c, k.dev_data_type.dev); + struct bch_dev *ca = bch2_dev_rcu_noerror(c, k.dev_data_type.dev); if (ca) { struct bch_dev_usage_type __percpu *d = &ca->usage->d[k.dev_data_type.data_type]; percpu_u64_set(&d->buckets, v[0]); @@ -911,7 +911,7 @@ void bch2_verify_accounting_clean(struct bch_fs *c) break; case BCH_DISK_ACCOUNTING_dev_data_type: { rcu_read_lock(); - struct bch_dev *ca = bch2_dev_rcu(c, acc_k.dev_data_type.dev); + struct bch_dev *ca = bch2_dev_rcu_noerror(c, acc_k.dev_data_type.dev); if (!ca) { rcu_read_unlock(); continue; diff --git a/fs/bcachefs/disk_accounting.h b/fs/bcachefs/disk_accounting.h index 6639535dc91c..8b2b2f83e6a4 100644 --- a/fs/bcachefs/disk_accounting.h +++ b/fs/bcachefs/disk_accounting.h @@ -142,7 +142,7 @@ static inline int bch2_accounting_mem_mod_locked(struct btree_trans *trans, break; case BCH_DISK_ACCOUNTING_dev_data_type: rcu_read_lock(); - struct bch_dev *ca = bch2_dev_rcu(c, acc_k.dev_data_type.dev); + struct bch_dev *ca = bch2_dev_rcu_noerror(c, acc_k.dev_data_type.dev); if (ca) { this_cpu_add(ca->usage->d[acc_k.dev_data_type.data_type].buckets, a.v->d[0]); this_cpu_add(ca->usage->d[acc_k.dev_data_type.data_type].sectors, a.v->d[1]); -- 2.51.0 From b71d89bd7b1fbe0a4569d072a0069110f60f9ec9 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 24 Nov 2024 22:28:41 -0500 Subject: [PATCH 13/16] bcachefs: Fix accounting_read when we rewind If we rewind recovery to run topology repair, that causes accounting_read to run twice. This fixes accounting being double counted. Signed-off-by: Kent Overstreet --- fs/bcachefs/disk_accounting.c | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/fs/bcachefs/disk_accounting.c b/fs/bcachefs/disk_accounting.c index fa821d278c45..bb5dbbf71d04 100644 --- a/fs/bcachefs/disk_accounting.c +++ b/fs/bcachefs/disk_accounting.c @@ -700,6 +700,21 @@ int bch2_accounting_read(struct bch_fs *c) struct btree_trans *trans = bch2_trans_get(c); struct printbuf buf = PRINTBUF; + /* + * We might run more than once if we rewind to start topology repair or + * btree node scan - and those might cause us to get different results, + * so we can't just skip if we've already run. 
+ * + * Instead, zero out any accounting we have: + */ + percpu_down_write(&c->mark_lock); + darray_for_each(acc->k, e) + percpu_memset(e->v[0], 0, sizeof(u64) * e->nr_counters); + for_each_member_device(c, ca) + percpu_memset(ca->usage, 0, sizeof(*ca->usage)); + percpu_memset(c->usage, 0, sizeof(*c->usage)); + percpu_up_write(&c->mark_lock); + int ret = for_each_btree_key(trans, iter, BTREE_ID_accounting, POS_MIN, BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k, ({ -- 2.51.0 From 427db7ffe9a91bcad00a7bac4edcd74d75007a11 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 24 Nov 2024 22:45:25 -0500 Subject: [PATCH 14/16] bcachefs: backpointer_to_missing_ptr is now autofix Signed-off-by: Kent Overstreet --- fs/bcachefs/sb-errors_format.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/bcachefs/sb-errors_format.h b/fs/bcachefs/sb-errors_format.h index 9e3425f533bc..d45d0789f1b1 100644 --- a/fs/bcachefs/sb-errors_format.h +++ b/fs/bcachefs/sb-errors_format.h @@ -141,7 +141,7 @@ enum bch_fsck_flags { x(backpointer_dev_bad, 297, 0) \ x(backpointer_to_missing_device, 126, 0) \ x(backpointer_to_missing_alloc, 127, 0) \ - x(backpointer_to_missing_ptr, 128, 0) \ + x(backpointer_to_missing_ptr, 128, FSCK_AUTOFIX) \ x(lru_entry_at_time_0, 129, FSCK_AUTOFIX) \ x(lru_entry_to_invalid_bucket, 130, FSCK_AUTOFIX) \ x(lru_entry_bad, 131, FSCK_AUTOFIX) \ -- 2.51.0 From abf23afa36eb425fb75d47c26fc665dbab2a9ae1 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 24 Nov 2024 22:57:01 -0500 Subject: [PATCH 15/16] bcachefs: Fix btree node scan when unknown btree IDs are present btree_root entries for unknown btree IDs are created during recovery, before reading those btree roots. But btree_node_scan may find btree nodes with unknown btree IDs when we haven't seen roots for those btrees. 
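The pattern, reduced to a standalone sketch (illustrative names and values, not the kernel code): the root lookup tolerates IDs it has no slot for by returning NULL, and callers such as the position printer fall back to "(unknown)" instead of asserting.

    #include <stddef.h>
    #include <stdio.h>

    struct root {
        unsigned level;
    };

    #define NR_KNOWN_ROOTS 4

    static struct root known_roots[NR_KNOWN_ROOTS] = {
        { 1 }, { 2 }, { 1 }, { 3 },
    };

    static struct root *root_lookup(unsigned btree_id)
    {
        /* can happen when handed a node found by btree node scan */
        if (btree_id >= NR_KNOWN_ROOTS)
            return NULL;
        return &known_roots[btree_id];
    }

    static void print_node_pos(unsigned btree_id, unsigned level)
    {
        struct root *r = root_lookup(btree_id);

        if (r)
            printf("btree %u level %u/%u\n", btree_id, level, r->level);
        else
            printf("btree %u level %u/(unknown)\n", btree_id, level);
    }

    int main(void)
    {
        print_node_pos(2, 0);  /* known btree */
        print_node_pos(9, 0);  /* btree ID with no root read yet */
        return 0;
    }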
Reported-by: syzbot+1f202d4da221ec6ebf8e@syzkaller.appspotmail.com Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_cache.c | 11 ++++++++--- fs/bcachefs/btree_cache.h | 9 +++++++-- 2 files changed, 15 insertions(+), 5 deletions(-) diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c index 36dfa6a48aa6..1f06e24e53fc 100644 --- a/fs/bcachefs/btree_cache.c +++ b/fs/bcachefs/btree_cache.c @@ -1406,9 +1406,14 @@ void bch2_btree_id_level_to_text(struct printbuf *out, enum btree_id btree, unsi void bch2_btree_pos_to_text(struct printbuf *out, struct bch_fs *c, const struct btree *b) { bch2_btree_id_to_text(out, b->c.btree_id); - prt_printf(out, " level %u/%u\n ", - b->c.level, - bch2_btree_id_root(c, b->c.btree_id)->level); + prt_printf(out, " level %u/", b->c.level); + struct btree_root *r = bch2_btree_id_root(c, b->c.btree_id); + if (r) + prt_printf(out, "%u", r->level); + else + prt_printf(out, "(unknown)"); + prt_printf(out, "\n "); + bch2_bkey_val_to_text(out, c, bkey_i_to_s_c(&b->key)); } diff --git a/fs/bcachefs/btree_cache.h b/fs/bcachefs/btree_cache.h index 6cfacacb6769..dcc34fe4996d 100644 --- a/fs/bcachefs/btree_cache.h +++ b/fs/bcachefs/btree_cache.h @@ -128,14 +128,19 @@ static inline struct btree_root *bch2_btree_id_root(struct bch_fs *c, unsigned i } else { unsigned idx = id - BTREE_ID_NR; - EBUG_ON(idx >= c->btree_roots_extra.nr); + /* This can happen when we're called from btree_node_scan */ + if (idx >= c->btree_roots_extra.nr) + return NULL; + return &c->btree_roots_extra.data[idx]; } } static inline struct btree *btree_node_root(struct bch_fs *c, struct btree *b) { - return bch2_btree_id_root(c, b->c.btree_id)->b; + struct btree_root *r = bch2_btree_id_root(c, b->c.btree_id); + + return r ? r->b : NULL; } const char *bch2_btree_id_str(enum btree_id); /* avoid */ -- 2.51.0 From 828552ca74a45877dbf139b34c47d0f600a1e852 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 24 Nov 2024 23:28:21 -0500 Subject: [PATCH 16/16] bcachefs: Kill bch2_bucket_alloc_new_fs() The early-early allocation path, bch2_bucket_alloc_new_fs(), is no longer needed - and inconsistencies around new_fs_bucket_idx have been a frequent source of bugs. 
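The replacement filter is just an interval-overlap test against the superblock layout, plus bucket 0 and the journal buckets. A minimal standalone sketch of the overlap test follows, with made-up layout values (not the kernel code; sector units and the struct name are assumptions for illustration):

    #include <stdbool.h>
    #include <stdint.h>
    #include <stdio.h>

    struct sb_region {
        uint64_t offset;  /* start sector */
        uint64_t end;     /* one past the last sector */
    };

    static bool bucket_overlaps_sb(uint64_t bucket, uint64_t bucket_size,
                                   const struct sb_region *sb, unsigned nr_sb)
    {
        uint64_t b_start = bucket * bucket_size;
        uint64_t b_end   = (bucket + 1) * bucket_size;

        for (unsigned i = 0; i < nr_sb; i++)
            if (!(sb[i].offset >= b_end || sb[i].end <= b_start))
                return true;  /* the sector ranges intersect */
        return false;
    }

    int main(void)
    {
        struct sb_region layout[] = { { 8, 16 }, { 1024, 1032 } };

        printf("%d %d\n",
               bucket_overlaps_sb(0, 512, layout, 2),   /* 1: [0,512) hits [8,16) */
               bucket_overlaps_sb(3, 512, layout, 2));  /* 0: [1536,2048) is clear */
        return 0;
    }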
Reported-by: syzbot+592425844580a6598410@syzkaller.appspotmail.com Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_foreground.c | 40 ++++++++++++++-------------------- fs/bcachefs/alloc_foreground.h | 2 -- fs/bcachefs/bcachefs.h | 1 - fs/bcachefs/buckets.c | 25 +++++++++++++++++++++ fs/bcachefs/buckets.h | 21 +----------------- fs/bcachefs/journal.c | 34 +++++++++++++---------------- fs/bcachefs/journal_reclaim.c | 3 +++ fs/bcachefs/recovery.c | 5 +---- fs/bcachefs/super.c | 12 +++++----- 9 files changed, 66 insertions(+), 77 deletions(-) diff --git a/fs/bcachefs/alloc_foreground.c b/fs/bcachefs/alloc_foreground.c index 6d665b720f72..4d1ff7f1f302 100644 --- a/fs/bcachefs/alloc_foreground.c +++ b/fs/bcachefs/alloc_foreground.c @@ -156,6 +156,14 @@ static struct open_bucket *bch2_open_bucket_alloc(struct bch_fs *c) return ob; } +static inline bool is_superblock_bucket(struct bch_fs *c, struct bch_dev *ca, u64 b) +{ + if (c->curr_recovery_pass > BCH_RECOVERY_PASS_trans_mark_dev_sbs) + return false; + + return bch2_is_superblock_bucket(ca, b); +} + static void open_bucket_free_unused(struct bch_fs *c, struct open_bucket *ob) { BUG_ON(c->open_buckets_partial_nr >= @@ -175,20 +183,6 @@ static void open_bucket_free_unused(struct bch_fs *c, struct open_bucket *ob) closure_wake_up(&c->freelist_wait); } -/* _only_ for allocating the journal on a new device: */ -long bch2_bucket_alloc_new_fs(struct bch_dev *ca) -{ - while (ca->new_fs_bucket_idx < ca->mi.nbuckets) { - u64 b = ca->new_fs_bucket_idx++; - - if (!is_superblock_bucket(ca, b) && - (!ca->buckets_nouse || !test_bit(b, ca->buckets_nouse))) - return b; - } - - return -1; -} - static inline unsigned open_buckets_reserved(enum bch_watermark watermark) { switch (watermark) { @@ -214,6 +208,9 @@ static struct open_bucket *__try_alloc_bucket(struct bch_fs *c, struct bch_dev * { struct open_bucket *ob; + if (unlikely(is_superblock_bucket(c, ca, bucket))) + return NULL; + if (unlikely(ca->buckets_nouse && test_bit(bucket, ca->buckets_nouse))) { s->skipped_nouse++; return NULL; @@ -295,9 +292,6 @@ static struct open_bucket *try_alloc_bucket(struct btree_trans *trans, struct bc /* * This path is for before the freespace btree is initialized: - * - * If ca->new_fs_bucket_idx is nonzero, we haven't yet marked superblock & - * journal buckets - journal buckets will be < ca->new_fs_bucket_idx */ static noinline struct open_bucket * bch2_bucket_alloc_early(struct btree_trans *trans, @@ -309,7 +303,7 @@ bch2_bucket_alloc_early(struct btree_trans *trans, struct btree_iter iter, citer; struct bkey_s_c k, ck; struct open_bucket *ob = NULL; - u64 first_bucket = max_t(u64, ca->mi.first_bucket, ca->new_fs_bucket_idx); + u64 first_bucket = ca->mi.first_bucket; u64 *dev_alloc_cursor = &ca->alloc_cursor[s->btree_bitmap]; u64 alloc_start = max(first_bucket, *dev_alloc_cursor); u64 alloc_cursor = alloc_start; @@ -332,10 +326,6 @@ again: if (bkey_ge(k.k->p, POS(ca->dev_idx, ca->mi.nbuckets))) break; - if (ca->new_fs_bucket_idx && - is_superblock_bucket(ca, k.k->p.offset)) - continue; - if (s->btree_bitmap != BTREE_BITMAP_ANY && s->btree_bitmap != bch2_dev_btree_bitmap_marked_sectors(ca, bucket_to_sector(ca, bucket), ca->mi.bucket_size)) { @@ -406,8 +396,6 @@ static struct open_bucket *bch2_bucket_alloc_freelist(struct btree_trans *trans, u64 alloc_start = max_t(u64, ca->mi.first_bucket, READ_ONCE(*dev_alloc_cursor)); u64 alloc_cursor = alloc_start; int ret; - - BUG_ON(ca->new_fs_bucket_idx); again: for_each_btree_key_max_norestart(trans, iter, BTREE_ID_freespace, 
POS(ca->dev_idx, alloc_cursor), @@ -551,6 +539,10 @@ again: bch2_dev_do_invalidates(ca); if (!avail) { + if (watermark > BCH_WATERMARK_normal && + c->curr_recovery_pass <= BCH_RECOVERY_PASS_check_allocations) + goto alloc; + if (cl && !waiting) { closure_wait(&c->freelist_wait, cl); waiting = true; diff --git a/fs/bcachefs/alloc_foreground.h b/fs/bcachefs/alloc_foreground.h index 1a16fd5bd4f8..4f87745df97e 100644 --- a/fs/bcachefs/alloc_foreground.h +++ b/fs/bcachefs/alloc_foreground.h @@ -28,8 +28,6 @@ struct dev_alloc_list bch2_dev_alloc_list(struct bch_fs *, struct bch_devs_mask *); void bch2_dev_stripe_increment(struct bch_dev *, struct dev_stripe_state *); -long bch2_bucket_alloc_new_fs(struct bch_dev *); - static inline struct bch_dev *ob_dev(struct bch_fs *c, struct open_bucket *ob) { return bch2_dev_have_ref(c, ob->dev); diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index f1d8c821d27a..a85b3bcc6383 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -560,7 +560,6 @@ struct bch_dev { struct bch_dev_usage __percpu *usage; /* Allocator: */ - u64 new_fs_bucket_idx; u64 alloc_cursor[3]; unsigned nr_open_buckets; diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index 1547141ba2a0..afd35c93fcfb 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -1161,6 +1161,31 @@ int bch2_trans_mark_dev_sbs(struct bch_fs *c) return bch2_trans_mark_dev_sbs_flags(c, BTREE_TRIGGER_transactional); } +bool bch2_is_superblock_bucket(struct bch_dev *ca, u64 b) +{ + struct bch_sb_layout *layout = &ca->disk_sb.sb->layout; + u64 b_offset = bucket_to_sector(ca, b); + u64 b_end = bucket_to_sector(ca, b + 1); + unsigned i; + + if (!b) + return true; + + for (i = 0; i < layout->nr_superblocks; i++) { + u64 offset = le64_to_cpu(layout->sb_offset[i]); + u64 end = offset + (1 << layout->sb_max_size_bits); + + if (!(offset >= b_end || end <= b_offset)) + return true; + } + + for (i = 0; i < ca->journal.nr; i++) + if (b == ca->journal.buckets[i]) + return true; + + return false; +} + /* Disk reservations: */ #define SECTORS_CACHE 1024 diff --git a/fs/bcachefs/buckets.h b/fs/bcachefs/buckets.h index ccc78bfe2fd4..3bebc4c3044f 100644 --- a/fs/bcachefs/buckets.h +++ b/fs/bcachefs/buckets.h @@ -308,26 +308,7 @@ int bch2_trans_mark_dev_sbs_flags(struct bch_fs *, enum btree_iter_update_trigger_flags); int bch2_trans_mark_dev_sbs(struct bch_fs *); -static inline bool is_superblock_bucket(struct bch_dev *ca, u64 b) -{ - struct bch_sb_layout *layout = &ca->disk_sb.sb->layout; - u64 b_offset = bucket_to_sector(ca, b); - u64 b_end = bucket_to_sector(ca, b + 1); - unsigned i; - - if (!b) - return true; - - for (i = 0; i < layout->nr_superblocks; i++) { - u64 offset = le64_to_cpu(layout->sb_offset[i]); - u64 end = offset + (1 << layout->sb_max_size_bits); - - if (!(offset >= b_end || end <= b_offset)) - return true; - } - - return false; -} +bool bch2_is_superblock_bucket(struct bch_dev *, u64); static inline const char *bch2_data_type_str(enum bch_data_type type) { diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c index bbdd0b17ae69..95cccda3b22c 100644 --- a/fs/bcachefs/journal.c +++ b/fs/bcachefs/journal.c @@ -1002,19 +1002,17 @@ static int __bch2_set_nr_journal_buckets(struct bch_dev *ca, unsigned nr, } for (nr_got = 0; nr_got < nr_want; nr_got++) { - if (new_fs) { - bu[nr_got] = bch2_bucket_alloc_new_fs(ca); - if (bu[nr_got] < 0) { - ret = -BCH_ERR_ENOSPC_bucket_alloc; - break; - } - } else { - ob[nr_got] = bch2_bucket_alloc(c, ca, BCH_WATERMARK_normal, - 
BCH_DATA_journal, cl); - ret = PTR_ERR_OR_ZERO(ob[nr_got]); - if (ret) - break; + enum bch_watermark watermark = new_fs + ? BCH_WATERMARK_btree + : BCH_WATERMARK_normal; + ob[nr_got] = bch2_bucket_alloc(c, ca, watermark, + BCH_DATA_journal, cl); + ret = PTR_ERR_OR_ZERO(ob[nr_got]); + if (ret) + break; + + if (!new_fs) { ret = bch2_trans_run(c, bch2_trans_mark_metadata_bucket(trans, ca, ob[nr_got]->bucket, BCH_DATA_journal, @@ -1024,9 +1022,9 @@ static int __bch2_set_nr_journal_buckets(struct bch_dev *ca, unsigned nr, bch_err_msg(c, ret, "marking new journal buckets"); break; } - - bu[nr_got] = ob[nr_got]->bucket; } + + bu[nr_got] = ob[nr_got]->bucket; } if (!nr_got) @@ -1066,8 +1064,7 @@ static int __bch2_set_nr_journal_buckets(struct bch_dev *ca, unsigned nr, if (ret) goto err_unblock; - if (!new_fs) - bch2_write_super(c); + bch2_write_super(c); /* Commit: */ if (c) @@ -1101,9 +1098,8 @@ err_unblock: bu[i], BCH_DATA_free, 0, BTREE_TRIGGER_transactional)); err_free: - if (!new_fs) - for (i = 0; i < nr_got; i++) - bch2_open_bucket_put(c, ob[i]); + for (i = 0; i < nr_got; i++) + bch2_open_bucket_put(c, ob[i]); kfree(new_bucket_seq); kfree(new_buckets); diff --git a/fs/bcachefs/journal_reclaim.c b/fs/bcachefs/journal_reclaim.c index 3d8fc2642425..1aabbbe328d9 100644 --- a/fs/bcachefs/journal_reclaim.c +++ b/fs/bcachefs/journal_reclaim.c @@ -38,6 +38,9 @@ unsigned bch2_journal_dev_buckets_available(struct journal *j, struct journal_device *ja, enum journal_space_from from) { + if (!ja->nr) + return 0; + unsigned available = (journal_space_from(ja, from) - ja->cur_idx - 1 + ja->nr) % ja->nr; diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index 7086a7226989..547c78a323f7 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -1070,7 +1070,6 @@ int bch2_fs_initialize(struct bch_fs *c) bch2_write_super(c); mutex_unlock(&c->sb_lock); - c->curr_recovery_pass = BCH_RECOVERY_PASS_NR; set_bit(BCH_FS_btree_running, &c->flags); set_bit(BCH_FS_may_go_rw, &c->flags); @@ -1111,9 +1110,6 @@ int bch2_fs_initialize(struct bch_fs *c) if (ret) goto err; - for_each_online_member(c, ca) - ca->new_fs_bucket_idx = 0; - ret = bch2_fs_freespace_init(c); if (ret) goto err; @@ -1172,6 +1168,7 @@ int bch2_fs_initialize(struct bch_fs *c) bch2_write_super(c); mutex_unlock(&c->sb_lock); + c->curr_recovery_pass = BCH_RECOVERY_PASS_NR; return 0; err: bch_err_fn(c, ret); diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index 08170a3d524f..14157820705d 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -1750,11 +1750,6 @@ int bch2_dev_add(struct bch_fs *c, const char *path) if (ret) goto err; - ret = bch2_dev_journal_alloc(ca, true); - bch_err_msg(c, ret, "allocating journal"); - if (ret) - goto err; - down_write(&c->state_lock); mutex_lock(&c->sb_lock); @@ -1805,11 +1800,14 @@ int bch2_dev_add(struct bch_fs *c, const char *path) if (ret) goto err_late; - ca->new_fs_bucket_idx = 0; - if (ca->mi.state == BCH_MEMBER_STATE_rw) __bch2_dev_read_write(c, ca); + ret = bch2_dev_journal_alloc(ca, false); + bch_err_msg(c, ret, "allocating journal"); + if (ret) + goto err_late; + up_write(&c->state_lock); return 0; -- 2.51.0