]> www.infradead.org Git - nvme.git/commitdiff
bcachefs: Repair pass for scanning for btree nodes
authorKent Overstreet <kent.overstreet@linux.dev>
Tue, 12 Mar 2024 03:11:46 +0000 (23:11 -0400)
committerKent Overstreet <kent.overstreet@linux.dev>
Wed, 3 Apr 2024 18:44:18 +0000 (14:44 -0400)
If a btree root or interior btree node goes bad, we're going to lose a
lot of data, unless we can recover the nodes that it pointed to by
scanning.

Fortunately btree node headers are fully self describing, and
additionally the magic number is xored with the filesystem UUID, so we
can do so safely.

This implements the scanning - next patch will rework topology repair to
make use of the found nodes.

Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
12 files changed:
fs/bcachefs/Makefile
fs/bcachefs/bcachefs.h
fs/bcachefs/btree_node_scan.c [new file with mode: 0644]
fs/bcachefs/btree_node_scan.h [new file with mode: 0644]
fs/bcachefs/btree_node_scan_types.h [new file with mode: 0644]
fs/bcachefs/extents.c
fs/bcachefs/extents.h
fs/bcachefs/opts.h
fs/bcachefs/recovery.c
fs/bcachefs/recovery_passes.c
fs/bcachefs/recovery_passes_types.h
fs/bcachefs/super.c

index 0933493a322c415a3c47727de4684312f9060bbd..66ca0bbee639492d6593655b7d9d061bc9125a4f 100644 (file)
@@ -17,6 +17,7 @@ bcachefs-y            :=      \
        btree_journal_iter.o    \
        btree_key_cache.o       \
        btree_locking.o         \
+       btree_node_scan.o       \
        btree_trans_commit.o    \
        btree_update.o          \
        btree_update_interior.o \
index 963162a627cd6adfd5ba1ac29f0709ff8ff12dd9..93a61dbaa3d8691ef5fe28af55f43ff6162465eb 100644 (file)
@@ -456,6 +456,7 @@ enum bch_time_stats {
 
 #include "alloc_types.h"
 #include "btree_types.h"
+#include "btree_node_scan_types.h"
 #include "btree_write_buffer_types.h"
 #include "buckets_types.h"
 #include "buckets_waiting_for_journal_types.h"
@@ -1103,6 +1104,8 @@ struct bch_fs {
        struct journal_keys     journal_keys;
        struct list_head        journal_iters;
 
+       struct find_btree_nodes found_btree_nodes;
+
        u64                     last_bucket_seq_cleanup;
 
        u64                     counters_on_mount[BCH_COUNTER_NR];
diff --git a/fs/bcachefs/btree_node_scan.c b/fs/bcachefs/btree_node_scan.c
new file mode 100644 (file)
index 0000000..3f33be7
--- /dev/null
@@ -0,0 +1,495 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include "bcachefs.h"
+#include "btree_cache.h"
+#include "btree_io.h"
+#include "btree_journal_iter.h"
+#include "btree_node_scan.h"
+#include "btree_update_interior.h"
+#include "buckets.h"
+#include "error.h"
+#include "journal_io.h"
+#include "recovery_passes.h"
+
+#include <linux/kthread.h>
+#include <linux/sort.h>
+
+/*
+ * Per-device scan state handed to each read_btree_nodes_worker() kthread:
+ * the worker records results in @f and drops @cl when it exits.
+ */
+struct find_btree_nodes_worker {
+       struct closure          *cl;    /* closure_put() on worker exit */
+       struct find_btree_nodes *f;     /* shared results; f->lock protects f->nodes */
+       struct bch_dev          *ca;    /* device this worker scans */
+};
+
+/*
+ * Print one scanned node: btree id/level/seq/cookie, its key range, range
+ * adjustment/overwrite flags, and each replica pointer found so far.
+ */
+static void found_btree_node_to_text(struct printbuf *out, struct bch_fs *c, const struct found_btree_node *n)
+{
+       prt_printf(out, "%s l=%u seq=%u cookie=%llx ", bch2_btree_id_str(n->btree_id), n->level, n->seq, n->cookie);
+       bch2_bpos_to_text(out, n->min_key);
+       prt_str(out, "-");
+       bch2_bpos_to_text(out, n->max_key);
+
+       if (n->range_updated)
+               prt_str(out, " range updated");
+       if (n->overwritten)
+               prt_str(out, " overwritten");
+
+       for (unsigned i = 0; i < n->nr_ptrs; i++) {
+               prt_char(out, ' ');
+               bch2_extent_ptr_to_text(out, c, n->ptrs + i);
+       }
+}
+
+/* Dump the whole list of found nodes, one per line, indented for log output. */
+static void found_btree_nodes_to_text(struct printbuf *out, struct bch_fs *c, found_btree_nodes nodes)
+{
+       printbuf_indent_add(out, 2);
+       darray_for_each(nodes, i) {
+               found_btree_node_to_text(out, c, i);
+               prt_newline(out);
+       }
+       printbuf_indent_sub(out, 2);
+}
+
+/*
+ * Synthesize a btree_ptr_v2 key from a scanned node so it can be handed to
+ * the normal btree read/insert paths.  sectors_written is left 0 and the
+ * RANGE_UPDATED flag tells the read path to trim keys outside [min,max].
+ */
+static void found_btree_node_to_key(struct bkey_i *k, const struct found_btree_node *f)
+{
+       struct bkey_i_btree_ptr_v2 *bp = bkey_btree_ptr_v2_init(k);
+
+       set_bkey_val_u64s(&bp->k, sizeof(struct bch_btree_ptr_v2) / sizeof(u64) + f->nr_ptrs);
+       bp->k.p                 = f->max_key;
+       bp->v.seq               = cpu_to_le64(f->cookie);
+       bp->v.sectors_written   = 0;
+       bp->v.flags             = 0;
+       bp->v.min_key           = f->min_key;
+       SET_BTREE_PTR_RANGE_UPDATED(&bp->v, f->range_updated);
+       memcpy(bp->v.start, f->ptrs, sizeof(struct bch_extent_ptr) * f->nr_ptrs);
+}
+
+/*
+ * Check that a candidate node can actually be read and validated through the
+ * normal btree node read path.  Returns true on success.
+ *
+ * Fix: only dereference @b when the read succeeded — on failure
+ * bch2_btree_node_get_noiter() returns an ERR_PTR (or NULL), and the
+ * btree_node_root() comparison below would dereference it.
+ */
+static bool found_btree_node_is_readable(struct btree_trans *trans,
+                                        const struct found_btree_node *f)
+{
+       struct { __BKEY_PADDED(k, BKEY_BTREE_PTR_VAL_U64s_MAX); } k;
+
+       found_btree_node_to_key(&k.k, f);
+
+       struct btree *b = bch2_btree_node_get_noiter(trans, &k.k, f->btree_id, f->level, false);
+       bool ret = !IS_ERR_OR_NULL(b);
+       if (ret)
+               six_unlock_read(&b->c.lock);
+
+       /*
+        * We might update this node's range; if that happens, we need the node
+        * to be re-read so the read path can trim keys that are no longer in
+        * this node
+        */
+       if (ret && b != btree_node_root(trans->c, b))
+               bch2_btree_node_evict(trans, &k.k);
+       return ret;
+}
+
+/*
+ * Sort order used to merge replicas: nodes with the same btree/level/cookie
+ * are copies of the same logical node and end up adjacent.
+ */
+static int found_btree_node_cmp_cookie(const void *_l, const void *_r)
+{
+       const struct found_btree_node *l = _l;
+       const struct found_btree_node *r = _r;
+
+       return  cmp_int(l->btree_id,    r->btree_id) ?:
+               cmp_int(l->level,       r->level) ?:
+               cmp_int(l->cookie,      r->cookie);
+}
+
+/*
+ * Order two found nodes by write time: the node with the higher header seq
+ * is presumably the more recently written one, and wins when ranges overlap.
+ */
+static int found_btree_node_cmp_time(const struct found_btree_node *l,
+                                    const struct found_btree_node *r)
+{
+       return cmp_int(l->seq, r->seq);
+}
+
+/*
+ * Position sort: by btree, then level descending (interior nodes first),
+ * then start position, with the newest node first on ties so overwrite
+ * handling sees the winner before the nodes it shadows.
+ */
+static int found_btree_node_cmp_pos(const void *_l, const void *_r)
+{
+       const struct found_btree_node *l = _l;
+       const struct found_btree_node *r = _r;
+
+       return  cmp_int(l->btree_id,    r->btree_id) ?:
+              -cmp_int(l->level,       r->level) ?:
+               bpos_cmp(l->min_key,    r->min_key) ?:
+              -found_btree_node_cmp_time(l, r);
+}
+
+/*
+ * Read one page at @offset (sectors) on @ca and, if it looks like a valid
+ * btree node header, verify it through the read path and record it in
+ * f->nodes.  The magic number is xored with the filesystem UUID, so the
+ * bset_magic() check filters out nodes from other filesystems.
+ */
+static void try_read_btree_node(struct find_btree_nodes *f, struct bch_dev *ca,
+                               struct bio *bio, struct btree_node *bn, u64 offset)
+{
+       struct bch_fs *c = container_of(f, struct bch_fs, found_btree_nodes);
+
+       bio_reset(bio, ca->disk_sb.bdev, REQ_OP_READ);
+       bio->bi_iter.bi_sector  = offset;
+       bch2_bio_map(bio, bn, PAGE_SIZE);
+
+       submit_bio_wait(bio);
+       if (bch2_dev_io_err_on(bio->bi_status, ca, BCH_MEMBER_ERROR_read,
+                              "IO error in try_read_btree_node() at %llu: %s",
+                              offset, bch2_blk_status_to_str(bio->bi_status)))
+               return;
+
+       if (le64_to_cpu(bn->magic) != bset_magic(c))
+               return;
+
+       /* rcu protects the bucket_gen lookup below */
+       rcu_read_lock();
+       struct found_btree_node n = {
+               .btree_id       = BTREE_NODE_ID(bn),
+               .level          = BTREE_NODE_LEVEL(bn),
+               .seq            = BTREE_NODE_SEQ(bn),
+               .cookie         = le64_to_cpu(bn->keys.seq),
+               .min_key        = bn->min_key,
+               .max_key        = bn->max_key,
+               .nr_ptrs        = 1,
+               .ptrs[0].type   = 1 << BCH_EXTENT_ENTRY_ptr,
+               .ptrs[0].offset = offset,
+               .ptrs[0].dev    = ca->dev_idx,
+               .ptrs[0].gen    = *bucket_gen(ca, sector_to_bucket(ca, offset)),
+       };
+       rcu_read_unlock();
+
+       if (bch2_trans_run(c, found_btree_node_is_readable(trans, &n))) {
+               mutex_lock(&f->lock);
+               /* endian conversion of scanned nodes is not implemented yet */
+               if (BSET_BIG_ENDIAN(&bn->keys) != CPU_BIG_ENDIAN) {
+                       bch_err(c, "try_read_btree_node() can't handle endian conversion");
+                       f->ret = -EINVAL;
+                       goto unlock;
+               }
+
+               if (darray_push(&f->nodes, n))
+                       f->ret = -ENOMEM;
+unlock:
+               mutex_unlock(&f->lock);
+       }
+}
+
+/*
+ * Kthread body: scan every btree-node-aligned offset of one device, logging
+ * progress every 30s.  The starter took a ref on ca->io_ref for us and a ref
+ * on w->cl; we own @w and must free it.
+ *
+ * Fix: the exit path must *drop* the io_ref the starter took
+ * (percpu_ref_put) — taking another ref here leaked the device reference.
+ */
+static int read_btree_nodes_worker(void *p)
+{
+       struct find_btree_nodes_worker *w = p;
+       struct bch_fs *c = container_of(w->f, struct bch_fs, found_btree_nodes);
+       struct bch_dev *ca = w->ca;
+       void *buf = (void *) __get_free_page(GFP_KERNEL);
+       struct bio *bio = bio_alloc(NULL, 1, 0, GFP_KERNEL);
+       unsigned long last_print = jiffies;
+
+       if (!buf || !bio) {
+               bch_err(c, "read_btree_nodes_worker: error allocating bio/buf");
+               w->f->ret = -ENOMEM;
+               goto err;
+       }
+
+       for (u64 bucket = ca->mi.first_bucket; bucket < ca->mi.nbuckets; bucket++)
+               for (unsigned bucket_offset = 0;
+                    bucket_offset + btree_sectors(c) <= ca->mi.bucket_size;
+                    bucket_offset += btree_sectors(c)) {
+                       if (time_after(jiffies, last_print + HZ * 30)) {
+                               u64 cur_sector = bucket * ca->mi.bucket_size + bucket_offset;
+                               u64 end_sector = ca->mi.nbuckets * ca->mi.bucket_size;
+
+                               bch_info(ca, "%s: %2u%% done", __func__,
+                                        (unsigned) div64_u64(cur_sector * 100, end_sector));
+                               last_print = jiffies;
+                       }
+
+                       try_read_btree_node(w->f, ca, bio, buf,
+                                           bucket * ca->mi.bucket_size + bucket_offset);
+               }
+err:
+       bio_put(bio);
+       free_page((unsigned long) buf);
+       percpu_ref_put(&ca->io_ref);
+       closure_put(w->cl);
+       kfree(w);
+       return 0;
+}
+
+/*
+ * Spawn one scanning kthread per online device and wait for them all.
+ * Each worker gets its own io_ref and closure ref; it drops both and frees
+ * its @w on exit.  Returns the first recorded error, if any.
+ *
+ * Fixes: kthread_run() returns an ERR_PTR on failure, never NULL — use
+ * PTR_ERR_OR_ZERO() so the real error code is propagated instead of the
+ * boolean 1 that IS_ERR_OR_NULL() yielded; also free @w on that path,
+ * since the worker that would have freed it never ran.
+ */
+static int read_btree_nodes(struct find_btree_nodes *f)
+{
+       struct bch_fs *c = container_of(f, struct bch_fs, found_btree_nodes);
+       struct closure cl;
+       int ret = 0;
+
+       closure_init_stack(&cl);
+
+       for_each_online_member(c, ca) {
+               struct find_btree_nodes_worker *w = kmalloc(sizeof(*w), GFP_KERNEL);
+               struct task_struct *t;
+
+               if (!w) {
+                       percpu_ref_put(&ca->io_ref);
+                       ret = -ENOMEM;
+                       goto err;
+               }
+
+               percpu_ref_get(&ca->io_ref);
+               closure_get(&cl);
+               w->cl           = &cl;
+               w->f            = f;
+               w->ca           = ca;
+
+               t = kthread_run(read_btree_nodes_worker, w, "read_btree_nodes/%s", ca->name);
+               ret = PTR_ERR_OR_ZERO(t);
+               if (ret) {
+                       percpu_ref_put(&ca->io_ref);
+                       closure_put(&cl);
+                       f->ret = ret;
+                       bch_err(c, "error starting kthread: %i", ret);
+                       kfree(w);
+                       break;
+               }
+       }
+err:
+       closure_sync(&cl);
+       return f->ret ?: ret;
+}
+
+/*
+ * Restore found_btree_node_cmp_pos order after @n's min_key was advanced:
+ * bubble it forward until it's no longer greater than its successor.
+ */
+static void bubble_up(struct found_btree_node *n, struct found_btree_node *end)
+{
+       while (n + 1 < end &&
+              found_btree_node_cmp_pos(n, n + 1) > 0) {
+               swap(n[0], n[1]);
+               n++;
+       }
+}
+
+/*
+ * Resolve overlaps between @start and the nodes after it that intersect its
+ * range (same btree/level):
+ *  - @start newer: mark the older node overwritten, or shrink its range and
+ *    re-sort it (goto again restarts since positions moved);
+ *  - @start older: trim @start's max_key back to just before the newer node;
+ *  - equal seq: overlapping nodes with the same seq is unresolvable — error.
+ *
+ * Fix: dropped the duplicated "n->range_updated = true;" statement.
+ */
+static int handle_overwrites(struct bch_fs *c,
+                            struct found_btree_node *start,
+                            struct found_btree_node *end)
+{
+       struct found_btree_node *n;
+again:
+       for (n = start + 1;
+            n < end &&
+            n->btree_id        == start->btree_id &&
+            n->level           == start->level &&
+            bpos_lt(n->min_key, start->max_key);
+            n++)  {
+               int cmp = found_btree_node_cmp_time(start, n);
+
+               if (cmp > 0) {
+                       if (bpos_cmp(start->max_key, n->max_key) >= 0)
+                               n->overwritten = true;
+                       else {
+                               n->min_key = bpos_successor(start->max_key);
+                               n->range_updated = true;
+                               bubble_up(n, end);
+                               goto again;
+                       }
+               } else if (cmp < 0) {
+                       BUG_ON(bpos_cmp(n->min_key, start->min_key) <= 0);
+
+                       start->max_key = bpos_predecessor(n->min_key);
+                       start->range_updated = true;
+               } else {
+                       struct printbuf buf = PRINTBUF;
+
+                       prt_str(&buf, "overlapping btree nodes with same seq! halting\n  ");
+                       found_btree_node_to_text(&buf, c, start);
+                       prt_str(&buf, "\n  ");
+                       found_btree_node_to_text(&buf, c, n);
+                       bch_err(c, "%s", buf.buf);
+                       printbuf_exit(&buf);
+                       return -1;
+               }
+       }
+
+       return 0;
+}
+
+/*
+ * Top level of the scan pass: read candidate nodes from every device, merge
+ * replicas of the same node (matched by cookie), resolve range overlaps, and
+ * leave f->nodes eytzinger-sorted for the range lookups below.  Idempotent:
+ * returns immediately if a previous scan already populated f->nodes.
+ *
+ * Fix: removed the "0 &&" dead-code gates that silently disabled two of the
+ * three verbose dumps, making the verbose option ineffective.
+ */
+int bch2_scan_for_btree_nodes(struct bch_fs *c)
+{
+       struct find_btree_nodes *f = &c->found_btree_nodes;
+       struct printbuf buf = PRINTBUF;
+       size_t dst;
+       int ret = 0;
+
+       if (f->nodes.nr)
+               return 0;
+
+       mutex_init(&f->lock);
+
+       ret = read_btree_nodes(f);
+       if (ret)
+               return ret;
+
+       if (!f->nodes.nr) {
+               bch_err(c, "%s: no btree nodes found", __func__);
+               ret = -EINVAL;
+               goto err;
+       }
+
+       if (c->opts.verbose) {
+               printbuf_reset(&buf);
+               prt_printf(&buf, "%s: nodes found:\n", __func__);
+               found_btree_nodes_to_text(&buf, c, f->nodes);
+               bch2_print_string_as_lines(KERN_INFO, buf.buf);
+       }
+
+       /* group replicas of the same node together */
+       sort(f->nodes.data, f->nodes.nr, sizeof(f->nodes.data[0]), found_btree_node_cmp_cookie, NULL);
+
+       /* merge adjacent entries with the same cookie into one multi-ptr node */
+       dst = 0;
+       darray_for_each(f->nodes, i) {
+               struct found_btree_node *prev = dst ? f->nodes.data + dst - 1 : NULL;
+
+               if (prev &&
+                   prev->cookie == i->cookie) {
+                       if (prev->nr_ptrs == ARRAY_SIZE(prev->ptrs)) {
+                               bch_err(c, "%s: found too many replicas for btree node", __func__);
+                               ret = -EINVAL;
+                               goto err;
+                       }
+                       prev->ptrs[prev->nr_ptrs++] = i->ptrs[0];
+               } else {
+                       f->nodes.data[dst++] = *i;
+               }
+       }
+       f->nodes.nr = dst;
+
+       sort(f->nodes.data, f->nodes.nr, sizeof(f->nodes.data[0]), found_btree_node_cmp_pos, NULL);
+
+       if (c->opts.verbose) {
+               printbuf_reset(&buf);
+               prt_printf(&buf, "%s: nodes after merging replicas:\n", __func__);
+               found_btree_nodes_to_text(&buf, c, f->nodes);
+               bch2_print_string_as_lines(KERN_INFO, buf.buf);
+       }
+
+       /* drop overwritten nodes, trimming ranges as needed */
+       dst = 0;
+       darray_for_each(f->nodes, i) {
+               if (i->overwritten)
+                       continue;
+
+               ret = handle_overwrites(c, i, &darray_top(f->nodes));
+               if (ret)
+                       goto err;
+
+               BUG_ON(i->overwritten);
+               f->nodes.data[dst++] = *i;
+       }
+       f->nodes.nr = dst;
+
+       if (c->opts.verbose) {
+               printbuf_reset(&buf);
+               prt_printf(&buf, "%s: nodes found after overwrites:\n", __func__);
+               found_btree_nodes_to_text(&buf, c, f->nodes);
+               bch2_print_string_as_lines(KERN_INFO, buf.buf);
+       }
+
+       eytzinger0_sort(f->nodes.data, f->nodes.nr, sizeof(f->nodes.data[0]), found_btree_node_cmp_pos, NULL);
+err:
+       printbuf_exit(&buf);
+       return ret;
+}
+
+/*
+ * Comparator for eytzinger0_find_gt(): with @r the search key, an element
+ * compares greater once its max_key exceeds the search min_key — i.e. the
+ * first node that could overlap the search range.
+ */
+static int found_btree_node_range_start_cmp(const void *_l, const void *_r)
+{
+       const struct found_btree_node *l = _l;
+       const struct found_btree_node *r = _r;
+
+       return  cmp_int(l->btree_id,    r->btree_id) ?:
+              -cmp_int(l->level,       r->level) ?:
+               bpos_cmp(l->max_key,    r->min_key);
+}
+
+/*
+ * Iterate over found nodes whose range overlaps _search (same btree/level),
+ * in eytzinger in-order traversal.
+ *
+ * Fix: the body referenced the literal identifier "search" instead of the
+ * macro parameter _search — it only compiled because every call site happens
+ * to pass a variable named search.  Use (_search) consistently.
+ */
+#define for_each_found_btree_node_in_range(_f, _search, _idx)                          \
+       for (size_t _idx = eytzinger0_find_gt((_f)->nodes.data, (_f)->nodes.nr,         \
+                                       sizeof((_f)->nodes.data[0]),                    \
+                                       found_btree_node_range_start_cmp, &(_search));  \
+            _idx < (_f)->nodes.nr &&                                                   \
+            (_f)->nodes.data[_idx].btree_id == (_search).btree_id &&                   \
+            (_f)->nodes.data[_idx].level == (_search).level &&                         \
+            bpos_lt((_f)->nodes.data[_idx].min_key, (_search).max_key);                \
+            _idx = eytzinger0_next(_idx, (_f)->nodes.nr))
+
+/*
+ * Return true if the scan found a newer version (higher header seq) of a
+ * node overlapping @b's key range — i.e. @b was superseded by a later write.
+ */
+bool bch2_btree_node_is_stale(struct bch_fs *c, struct btree *b)
+{
+       struct find_btree_nodes *f = &c->found_btree_nodes;
+
+       struct found_btree_node search = {
+               .btree_id       = b->c.btree_id,
+               .level          = b->c.level,
+               .min_key        = b->data->min_key,
+               .max_key        = b->key.k.p,
+       };
+
+       for_each_found_btree_node_in_range(f, search, idx)
+               if (f->nodes.data[idx].seq > BTREE_NODE_SEQ(b->data))
+                       return true;
+       return false;
+}
+
+/* Return true if the scan found any leaf (level 0) node belonging to @btree. */
+bool bch2_btree_has_scanned_nodes(struct bch_fs *c, enum btree_id btree)
+{
+       struct found_btree_node search = {
+               .btree_id       = btree,
+               .level          = 0,
+               .min_key        = POS_MIN,
+               .max_key        = SPOS_MAX,
+       };
+
+       /* searching the full key range: any match at all answers the question */
+       for_each_found_btree_node_in_range(&c->found_btree_nodes, search, idx)
+               return true;
+       return false;
+}
+
+/*
+ * Reinsert scanned nodes covering [node_min, node_max] at @level of @btree:
+ * runs the scan pass if it hasn't run yet, clamps each found node's range to
+ * the requested window (marking it range-updated so the read path trims
+ * out-of-range keys), and inserts the synthesized pointer keys into the
+ * journal keys at @level + 1 — the level of the parent that points to them.
+ *
+ * Returns 0 on success or an error from the recovery pass / journal insert.
+ */
+int bch2_get_scanned_nodes(struct bch_fs *c, enum btree_id btree,
+                          unsigned level, struct bpos node_min, struct bpos node_max)
+{
+       struct find_btree_nodes *f = &c->found_btree_nodes;
+
+       int ret = bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_scan_for_btree_nodes);
+       if (ret)
+               return ret;
+
+       if (c->opts.verbose) {
+               struct printbuf buf = PRINTBUF;
+
+               prt_printf(&buf, "recovering %s l=%u ", bch2_btree_id_str(btree), level);
+               bch2_bpos_to_text(&buf, node_min);
+               prt_str(&buf, " - ");
+               bch2_bpos_to_text(&buf, node_max);
+
+               bch_info(c, "%s(): %s", __func__, buf.buf);
+               printbuf_exit(&buf);
+       }
+
+       struct found_btree_node search = {
+               .btree_id       = btree,
+               .level          = level,
+               .min_key        = node_min,
+               .max_key        = node_max,
+       };
+
+       for_each_found_btree_node_in_range(f, search, idx) {
+               /* work on a copy: clamping must not modify the shared list */
+               struct found_btree_node n = f->nodes.data[idx];
+
+               n.range_updated |= bpos_lt(n.min_key, node_min);
+               n.min_key = bpos_max(n.min_key, node_min);
+
+               n.range_updated |= bpos_gt(n.max_key, node_max);
+               n.max_key = bpos_min(n.max_key, node_max);
+
+               struct { __BKEY_PADDED(k, BKEY_BTREE_PTR_VAL_U64s_MAX); } tmp;
+
+               found_btree_node_to_key(&tmp.k, &n);
+
+               struct printbuf buf = PRINTBUF;
+               bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&tmp.k));
+               bch_verbose(c, "%s(): recovering %s", __func__, buf.buf);
+               printbuf_exit(&buf);
+
+               BUG_ON(bch2_bkey_invalid(c, bkey_i_to_s_c(&tmp.k), BKEY_TYPE_btree, 0, NULL));
+
+               ret = bch2_journal_key_insert(c, btree, level + 1, &tmp.k);
+               if (ret)
+                       return ret;
+       }
+
+       return 0;
+}
+
+/* Free the scanned-nodes list; safe to call even if no scan ever ran. */
+void bch2_find_btree_nodes_exit(struct find_btree_nodes *f)
+{
+       darray_exit(&f->nodes);
+}
diff --git a/fs/bcachefs/btree_node_scan.h b/fs/bcachefs/btree_node_scan.h
new file mode 100644 (file)
index 0000000..08687b2
--- /dev/null
@@ -0,0 +1,11 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_BTREE_NODE_SCAN_H
+#define _BCACHEFS_BTREE_NODE_SCAN_H
+
+int bch2_scan_for_btree_nodes(struct bch_fs *);
+bool bch2_btree_node_is_stale(struct bch_fs *, struct btree *);
+bool bch2_btree_has_scanned_nodes(struct bch_fs *, enum btree_id);
+int bch2_get_scanned_nodes(struct bch_fs *, enum btree_id, unsigned, struct bpos, struct bpos);
+void bch2_find_btree_nodes_exit(struct find_btree_nodes *);
+
+#endif /* _BCACHEFS_BTREE_NODE_SCAN_H */
diff --git a/fs/bcachefs/btree_node_scan_types.h b/fs/bcachefs/btree_node_scan_types.h
new file mode 100644 (file)
index 0000000..abb7b27
--- /dev/null
@@ -0,0 +1,30 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_BTREE_NODE_SCAN_TYPES_H
+#define _BCACHEFS_BTREE_NODE_SCAN_TYPES_H
+
+#include "darray.h"
+
+/* One btree node discovered by scanning, reconstructed from its header. */
+struct found_btree_node {
+       bool                    range_updated:1;        /* key range was trimmed; read path must trim keys */
+       bool                    overwritten:1;          /* fully shadowed by a newer node; discard */
+       u8                      btree_id;
+       u8                      level;
+       u32                     seq;                    /* write seq from the node header; higher = newer */
+       u64                     cookie;                 /* node identity (keys.seq); same across replicas */
+
+       struct bpos             min_key;
+       struct bpos             max_key;
+
+       unsigned                nr_ptrs;                /* replicas found so far, <= BCH_REPLICAS_MAX */
+       struct bch_extent_ptr   ptrs[BCH_REPLICAS_MAX];
+};
+
+typedef DARRAY(struct found_btree_node)        found_btree_nodes;
+
+/* Scan state embedded in bch_fs; lock protects nodes during the parallel scan. */
+struct find_btree_nodes {
+       int                     ret;    /* first error recorded by any worker */
+       struct mutex            lock;
+       found_btree_nodes       nodes;
+};
+
+#endif /* _BCACHEFS_BTREE_NODE_SCAN_TYPES_H */
index b2432d88cda64c7fea38cc3118e6591e5b4efbcf..0e3ca99fbd2de1522c5e8dea8ac313232f60f7f3 100644 (file)
@@ -978,6 +978,31 @@ bool bch2_extent_normalize(struct bch_fs *c, struct bkey_s k)
        return bkey_deleted(k.k);
 }
 
+/*
+ * Print a single extent pointer: "dev:bucket:offset gen N" plus cached/
+ * unwritten/stale flags when the device is known, or raw "dev:sector" when
+ * it isn't (e.g. @c is NULL or the dev index is out of range).
+ *
+ * Fix: dropped the redundant "ca &&" in the else branch — ca is provably
+ * non-NULL there.
+ */
+void bch2_extent_ptr_to_text(struct printbuf *out, struct bch_fs *c, const struct bch_extent_ptr *ptr)
+{
+       struct bch_dev *ca = c && ptr->dev < c->sb.nr_devices && c->devs[ptr->dev]
+               ? bch_dev_bkey_exists(c, ptr->dev)
+               : NULL;
+
+       if (!ca) {
+               prt_printf(out, "ptr: %u:%llu gen %u%s", ptr->dev,
+                          (u64) ptr->offset, ptr->gen,
+                          ptr->cached ? " cached" : "");
+       } else {
+               u32 offset;
+               u64 b = sector_to_bucket_and_offset(ca, ptr->offset, &offset);
+
+               prt_printf(out, "ptr: %u:%llu:%u gen %u",
+                          ptr->dev, b, offset, ptr->gen);
+               if (ptr->cached)
+                       prt_str(out, " cached");
+               if (ptr->unwritten)
+                       prt_str(out, " unwritten");
+               if (ptr_stale(ca, ptr))
+                       prt_printf(out, " stale");
+       }
+}
+
 void bch2_bkey_ptrs_to_text(struct printbuf *out, struct bch_fs *c,
                            struct bkey_s_c k)
 {
@@ -993,31 +1018,10 @@ void bch2_bkey_ptrs_to_text(struct printbuf *out, struct bch_fs *c,
                        prt_printf(out, " ");
 
                switch (__extent_entry_type(entry)) {
-               case BCH_EXTENT_ENTRY_ptr: {
-                       const struct bch_extent_ptr *ptr = entry_to_ptr(entry);
-                       struct bch_dev *ca = c && ptr->dev < c->sb.nr_devices && c->devs[ptr->dev]
-                               ? bch_dev_bkey_exists(c, ptr->dev)
-                               : NULL;
-
-                       if (!ca) {
-                               prt_printf(out, "ptr: %u:%llu gen %u%s", ptr->dev,
-                                      (u64) ptr->offset, ptr->gen,
-                                      ptr->cached ? " cached" : "");
-                       } else {
-                               u32 offset;
-                               u64 b = sector_to_bucket_and_offset(ca, ptr->offset, &offset);
-
-                               prt_printf(out, "ptr: %u:%llu:%u gen %u",
-                                          ptr->dev, b, offset, ptr->gen);
-                               if (ptr->cached)
-                                       prt_str(out, " cached");
-                               if (ptr->unwritten)
-                                       prt_str(out, " unwritten");
-                               if (ca && ptr_stale(ca, ptr))
-                                       prt_printf(out, " stale");
-                       }
+               case BCH_EXTENT_ENTRY_ptr:
+                       bch2_extent_ptr_to_text(out, c, entry_to_ptr(entry));
                        break;
-               }
+
                case BCH_EXTENT_ENTRY_crc32:
                case BCH_EXTENT_ENTRY_crc64:
                case BCH_EXTENT_ENTRY_crc128: {
index 3fd0169b98c18279759e92ef148b08c4847b4fed..528e817eacbdad3a058eaf20c8a7526fd9dea3d9 100644 (file)
@@ -676,6 +676,7 @@ bch2_extent_has_ptr(struct bkey_s_c, struct extent_ptr_decoded, struct bkey_s);
 void bch2_extent_ptr_set_cached(struct bkey_s, struct bch_extent_ptr *);
 
 bool bch2_extent_normalize(struct bch_fs *, struct bkey_s);
+void bch2_extent_ptr_to_text(struct printbuf *out, struct bch_fs *, const struct bch_extent_ptr *);
 void bch2_bkey_ptrs_to_text(struct printbuf *, struct bch_fs *,
                            struct bkey_s_c);
 int bch2_bkey_ptrs_invalid(struct bch_fs *, struct bkey_s_c,
index 084247c13bd8fd18133708f131132d6e55b53aaa..1ac4135cca1c3dccc71a75a0d062ee30df33111c 100644 (file)
@@ -368,11 +368,11 @@ enum fsck_err_opts {
          OPT_STR_NOLIMIT(bch2_recovery_passes),                        \
          BCH2_NO_SB_OPT,               0,                              \
          NULL,         "Exit recovery after specified pass")           \
-       x(keep_journal,                 u8,                             \
+       x(retain_recovery_info,         u8,                             \
          0,                                                            \
          OPT_BOOL(),                                                   \
          BCH2_NO_SB_OPT,               false,                          \
-         NULL,         "Don't free journal entries/keys after startup")\
+         NULL,         "Don't free journal entries/keys, scanned btree nodes after startup")\
        x(read_entire_journal,          u8,                             \
          0,                                                            \
          OPT_BOOL(),                                                   \
index f234de0ac834d4ba443cd663306bae916eb71903..24671020f22b1a6a77addfe582e0d55b34225795 100644 (file)
@@ -4,6 +4,7 @@
 #include "alloc_background.h"
 #include "bkey_buf.h"
 #include "btree_journal_iter.h"
+#include "btree_node_scan.h"
 #include "btree_update.h"
 #include "btree_update_interior.h"
 #include "btree_io.h"
@@ -271,7 +272,7 @@ int bch2_journal_replay(struct bch_fs *c)
        bch2_trans_put(trans);
        trans = NULL;
 
-       if (!c->opts.keep_journal &&
+       if (!c->opts.retain_recovery_info &&
            c->recovery_pass_done >= BCH_RECOVERY_PASS_journal_replay)
                bch2_journal_keys_put_initial(c);
 
@@ -435,10 +436,9 @@ static int journal_replay_early(struct bch_fs *c,
 
 static int read_btree_roots(struct bch_fs *c)
 {
-       unsigned i;
        int ret = 0;
 
-       for (i = 0; i < btree_id_nr_alive(c); i++) {
+       for (unsigned i = 0; i < btree_id_nr_alive(c); i++) {
                struct btree_root *r = bch2_btree_id_root(c, i);
 
                if (!r->alive)
@@ -447,33 +447,36 @@ static int read_btree_roots(struct bch_fs *c)
                if (btree_id_is_alloc(i) && c->opts.reconstruct_alloc)
                        continue;
 
-               if (r->error) {
-                       __fsck_err(c,
-                                  btree_id_is_alloc(i)
-                                  ? FSCK_CAN_IGNORE : 0,
-                                  btree_root_bkey_invalid,
-                                  "invalid btree root %s",
-                                  bch2_btree_id_str(i));
-                       if (i == BTREE_ID_alloc)
+               if (mustfix_fsck_err_on((ret = r->error),
+                                       c, btree_root_bkey_invalid,
+                                       "invalid btree root %s",
+                                       bch2_btree_id_str(i)) ||
+                   mustfix_fsck_err_on((ret = r->error = bch2_btree_root_read(c, i, &r->key, r->level)),
+                                       c, btree_root_read_error,
+                                       "error reading btree root %s l=%u: %s",
+                                       bch2_btree_id_str(i), r->level, bch2_err_str(ret))) {
+                       if (btree_id_is_alloc(i)) {
+                               c->recovery_passes_explicit |= BIT_ULL(BCH_RECOVERY_PASS_check_allocations);
+                               c->recovery_passes_explicit |= BIT_ULL(BCH_RECOVERY_PASS_check_alloc_info);
+                               c->recovery_passes_explicit |= BIT_ULL(BCH_RECOVERY_PASS_check_lrus);
+                               c->recovery_passes_explicit |= BIT_ULL(BCH_RECOVERY_PASS_check_extents_to_backpointers);
+                               c->recovery_passes_explicit |= BIT_ULL(BCH_RECOVERY_PASS_check_alloc_to_lru_refs);
                                c->sb.compat &= ~(1ULL << BCH_COMPAT_alloc_info);
-               }
+                               r->error = 0;
+                       } else if (!(c->recovery_passes_explicit & BIT_ULL(BCH_RECOVERY_PASS_scan_for_btree_nodes))) {
+                               bch_info(c, "will run btree node scan");
+                               c->recovery_passes_explicit |= BIT_ULL(BCH_RECOVERY_PASS_scan_for_btree_nodes);
+                               c->recovery_passes_explicit |= BIT_ULL(BCH_RECOVERY_PASS_check_topology);
+                       }
 
-               ret = bch2_btree_root_read(c, i, &r->key, r->level);
-               if (ret) {
-                       fsck_err(c,
-                                btree_root_read_error,
-                                "error reading btree root %s",
-                                bch2_btree_id_str(i));
-                       if (btree_id_is_alloc(i))
-                               c->sb.compat &= ~(1ULL << BCH_COMPAT_alloc_info);
                        ret = 0;
                }
        }
 
-       for (i = 0; i < BTREE_ID_NR; i++) {
+       for (unsigned i = 0; i < BTREE_ID_NR; i++) {
                struct btree_root *r = bch2_btree_id_root(c, i);
 
-               if (!r->b) {
+               if (!r->b && !r->error) {
                        r->alive = false;
                        r->level = 0;
                        bch2_btree_root_alloc_fake(c, i, 0);
@@ -653,7 +656,7 @@ int bch2_fs_recovery(struct bch_fs *c)
                goto err;
        }
 
-       if (!c->sb.clean || c->opts.fsck || c->opts.keep_journal) {
+       if (!c->sb.clean || c->opts.fsck || c->opts.retain_recovery_info) {
                struct genradix_iter iter;
                struct journal_replay **i;
 
@@ -883,9 +886,10 @@ use_clean:
 out:
        bch2_flush_fsck_errs(c);
 
-       if (!c->opts.keep_journal &&
-           test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags))
+       if (!c->opts.retain_recovery_info) {
                bch2_journal_keys_put_initial(c);
+               bch2_find_btree_nodes_exit(&c->found_btree_nodes);
+       }
        kfree(clean);
 
        if (!ret &&
index b066c5155b7429ed8269b99f463ff66c214fada1..cb501460d6152b31a4ae57d9dea6db0792c47c0f 100644 (file)
@@ -4,6 +4,7 @@
 #include "alloc_background.h"
 #include "backpointers.h"
 #include "btree_gc.h"
+#include "btree_node_scan.h"
 #include "ec.h"
 #include "fsck.h"
 #include "inode.h"
index f30521285706831f3043c2e8109c68a90e9d37aa..840542cfd65b643d187440f7baf2213260202ae9 100644 (file)
@@ -13,6 +13,7 @@
  * must never change:
  */
 #define BCH_RECOVERY_PASSES()                                                  \
+       x(scan_for_btree_nodes,                 37, 0)                          \
        x(check_topology,                        4, 0)                          \
        x(alloc_read,                            0, PASS_ALWAYS)                \
        x(stripes_read,                          1, PASS_ALWAYS)                \
index bc026a77eb99d4141c74603885b410373c6a5104..ed63018f21bef58b2aa854f9c3f05ad1b3f26202 100644 (file)
@@ -15,6 +15,7 @@
 #include "btree_gc.h"
 #include "btree_journal_iter.h"
 #include "btree_key_cache.h"
+#include "btree_node_scan.h"
 #include "btree_update_interior.h"
 #include "btree_io.h"
 #include "btree_write_buffer.h"
@@ -536,6 +537,7 @@ static void __bch2_fs_free(struct bch_fs *c)
        for (i = 0; i < BCH_TIME_STAT_NR; i++)
                bch2_time_stats_exit(&c->times[i]);
 
+       bch2_find_btree_nodes_exit(&c->found_btree_nodes);
        bch2_free_pending_node_rewrites(c);
        bch2_fs_sb_errors_exit(c);
        bch2_fs_counters_exit(c);
@@ -560,6 +562,7 @@ static void __bch2_fs_free(struct bch_fs *c)
        bch2_io_clock_exit(&c->io_clock[READ]);
        bch2_fs_compress_exit(c);
        bch2_journal_keys_put_initial(c);
+       bch2_find_btree_nodes_exit(&c->found_btree_nodes);
        BUG_ON(atomic_read(&c->journal_keys.ref));
        bch2_fs_btree_write_buffer_exit(c);
        percpu_free_rwsem(&c->mark_lock);