From: Kent Overstreet Date: Sat, 14 Nov 2020 14:59:58 +0000 (-0500) Subject: bcachefs: Don't require flush/fua on every journal write X-Git-Tag: v6.7-rc1~201^2~1880 X-Git-Url: https://www.infradead.org/git/?a=commitdiff_plain;h=adbcada43fa79197224b5a522b1faaf222b43bcd;p=users%2Fgriffoul%2Flinux.git bcachefs: Don't require flush/fua on every journal write This patch adds a flag to journal entries which, if set, indicates that they weren't done as flush/fua writes. - non flush/fua journal writes don't update last_seq (i.e. they don't free up space in the journal), thus the journal free space calculations now check whether nonflush journal writes are currently allowed (i.e. are we low on free space, or would doing a flush write free up a lot of space in the journal) - write_delay_ms, the user configurable option for when open journal entries are automatically written, is now interpreted as the max delay between flush journal writes (default 1 second). - bch2_journal_flush_seq_async is changed to ensure a flush write >= the requested sequence number has happened - journal read/replay must now ignore, and blacklist, any journal entries newer than the most recent flush entry in the journal. Also, the way the read_entire_journal option is handled has been improved; struct journal_replay now has an entry, 'ignore', for entries that were read but should not be used. - assorted refactoring and improvements related to journal read in journal_io.c and recovery.c Previously, we'd have to issue a flush/fua write every time we accumulated a full journal entry - typically the bucket size. Now we need to issue them much less frequently: when an fsync is requested, or it's been more than write_delay_ms since the last flush, or when we need to free up space in the journal. This is a significant performance improvement on many write heavy workloads. 
Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h index f072e865e43f..7df2bc7ecd4f 100644 --- a/fs/bcachefs/bcachefs_format.h +++ b/fs/bcachefs/bcachefs_format.h @@ -1336,14 +1336,16 @@ LE64_BITMASK(BCH_SB_ERASURE_CODE, struct bch_sb, flags[3], 0, 16); x(extents_above_btree_updates, 12) \ x(btree_updates_journalled, 13) \ x(reflink_inline_data, 14) \ - x(new_varint, 15) + x(new_varint, 15) \ + x(journal_no_flush, 16) #define BCH_SB_FEATURES_ALL \ ((1ULL << BCH_FEATURE_new_siphash)| \ (1ULL << BCH_FEATURE_new_extent_overwrite)| \ (1ULL << BCH_FEATURE_btree_ptr_v2)| \ (1ULL << BCH_FEATURE_extents_above_btree_updates)|\ - (1ULL << BCH_FEATURE_new_varint))\ + (1ULL << BCH_FEATURE_new_varint)| \ + (1ULL << BCH_FEATURE_journal_no_flush)) enum bch_sb_feature { #define x(f, n) BCH_FEATURE_##f, @@ -1582,6 +1584,7 @@ struct jset { LE32_BITMASK(JSET_CSUM_TYPE, struct jset, flags, 0, 4); LE32_BITMASK(JSET_BIG_ENDIAN, struct jset, flags, 4, 5); +LE32_BITMASK(JSET_NO_FLUSH, struct jset, flags, 5, 6); #define BCH_JOURNAL_BUCKETS_MIN 8 diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c index 3bbb23d7739a..31168754d6b8 100644 --- a/fs/bcachefs/journal.c +++ b/fs/bcachefs/journal.c @@ -79,6 +79,8 @@ static void bch2_journal_buf_init(struct journal *j) struct journal_buf *buf = journal_cur_buf(j); bkey_extent_init(&buf->key); + buf->noflush = false; + buf->must_flush = false; memset(buf->has_inode, 0, sizeof(buf->has_inode)); @@ -574,7 +576,7 @@ int bch2_journal_flush_seq_async(struct journal *j, u64 seq, struct journal_buf *buf; int ret = 0; - if (seq <= j->seq_ondisk) + if (seq <= j->flushed_seq_ondisk) return 1; spin_lock(&j->lock); @@ -585,16 +587,53 @@ int bch2_journal_flush_seq_async(struct journal *j, u64 seq, goto out; } - if (seq <= j->seq_ondisk) { + if (seq <= j->flushed_seq_ondisk) { ret = 1; goto out; } - if (parent && - (buf = journal_seq_to_buf(j, seq))) - if 
(!closure_wait(&buf->wait, parent)) + /* if seq was written, but not flushed - flush a newer one instead */ + seq = max(seq, last_unwritten_seq(j)); + +recheck_need_open: + if (seq == journal_cur_seq(j) && !journal_entry_is_open(j)) { + struct journal_res res = { 0 }; + + spin_unlock(&j->lock); + + ret = bch2_journal_res_get(j, &res, jset_u64s(0), 0); + if (ret) + return ret; + + seq = res.seq; + buf = j->buf + (seq & JOURNAL_BUF_MASK); + buf->must_flush = true; + set_bit(JOURNAL_NEED_WRITE, &j->flags); + + if (parent && !closure_wait(&buf->wait, parent)) BUG(); + bch2_journal_res_put(j, &res); + + spin_lock(&j->lock); + goto want_write; + } + + /* + * if write was kicked off without a flush, flush the next sequence + * number instead + */ + buf = journal_seq_to_buf(j, seq); + if (buf->noflush) { + seq++; + goto recheck_need_open; + } + + buf->must_flush = true; + + if (parent && !closure_wait(&buf->wait, parent)) + BUG(); +want_write: if (seq == journal_cur_seq(j)) journal_entry_want_write(j); out: @@ -979,6 +1018,7 @@ int bch2_fs_journal_start(struct journal *j, u64 cur_seq, spin_lock(&j->lock); set_bit(JOURNAL_STARTED, &j->flags); + j->last_flush_write = jiffies; journal_pin_new_entry(j, 1); @@ -1116,6 +1156,8 @@ void bch2_journal_debug_to_text(struct printbuf *out, struct journal *j) "last_seq:\t\t%llu\n" "last_seq_ondisk:\t%llu\n" "prereserved:\t\t%u/%u\n" + "nr flush writes:\t%llu\n" + "nr noflush writes:\t%llu\n" "nr direct reclaim:\t%llu\n" "nr background reclaim:\t%llu\n" "current entry sectors:\t%u\n" @@ -1127,6 +1169,8 @@ void bch2_journal_debug_to_text(struct printbuf *out, struct journal *j) j->last_seq_ondisk, j->prereserved.reserved, j->prereserved.remaining, + j->nr_flush_writes, + j->nr_noflush_writes, j->nr_direct_reclaim, j->nr_background_reclaim, j->cur_entry_sectors, diff --git a/fs/bcachefs/journal.h b/fs/bcachefs/journal.h index 1b6175cd6f1b..2c0014c3c02f 100644 --- a/fs/bcachefs/journal.h +++ b/fs/bcachefs/journal.h @@ -136,7 +136,7 @@ 
static inline u64 journal_last_seq(struct journal *j) static inline u64 journal_cur_seq(struct journal *j) { - BUG_ON(j->pin.back - 1 != atomic64_read(&j->seq)); + EBUG_ON(j->pin.back - 1 != atomic64_read(&j->seq)); return j->pin.back - 1; } diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c index 1aeeb58d3c2a..26556bb381b2 100644 --- a/fs/bcachefs/journal_io.c +++ b/fs/bcachefs/journal_io.c @@ -10,9 +10,26 @@ #include "journal.h" #include "journal_io.h" #include "journal_reclaim.h" +#include "journal_seq_blacklist.h" #include "replicas.h" #include "trace.h" +static void __journal_replay_free(struct journal_replay *i) +{ + list_del(&i->list); + kvpfree(i, offsetof(struct journal_replay, j) + + vstruct_bytes(&i->j)); + +} + +static void journal_replay_free(struct bch_fs *c, struct journal_replay *i) +{ + i->ignore = true; + + if (!c->opts.read_entire_journal) + __journal_replay_free(i); +} + struct journal_list { struct closure cl; struct mutex lock; @@ -35,28 +52,29 @@ static int journal_entry_add(struct bch_fs *c, struct bch_dev *ca, struct bch_devs_list devs = { .nr = 0 }; struct list_head *where; size_t bytes = vstruct_bytes(j); - __le64 last_seq; + u64 last_seq = 0; int ret; - last_seq = !list_empty(jlist->head) - ? list_last_entry(jlist->head, struct journal_replay, - list)->j.last_seq - : 0; - - if (!c->opts.read_entire_journal) { - /* Is this entry older than the range we need? */ - if (le64_to_cpu(j->seq) < le64_to_cpu(last_seq)) { - ret = JOURNAL_ENTRY_ADD_OUT_OF_RANGE; - goto out; + list_for_each_entry_reverse(i, jlist->head, list) { + if (!JSET_NO_FLUSH(&i->j)) { + last_seq = le64_to_cpu(i->j.last_seq); + break; } + } - /* Drop entries we don't need anymore */ + /* Is this entry older than the range we need? 
*/ + if (!c->opts.read_entire_journal && + le64_to_cpu(j->seq) < last_seq) { + ret = JOURNAL_ENTRY_ADD_OUT_OF_RANGE; + goto out; + } + + /* Drop entries we don't need anymore */ + if (!JSET_NO_FLUSH(j)) { list_for_each_entry_safe(i, pos, jlist->head, list) { if (le64_to_cpu(i->j.seq) >= le64_to_cpu(j->last_seq)) break; - list_del(&i->list); - kvpfree(i, offsetof(struct journal_replay, j) + - vstruct_bytes(&i->j)); + journal_replay_free(c, i); } } @@ -80,9 +98,7 @@ add: if (i && le64_to_cpu(j->seq) == le64_to_cpu(i->j.seq)) { if (i->bad) { devs = i->devs; - list_del(&i->list); - kvpfree(i, offsetof(struct journal_replay, j) + - vstruct_bytes(&i->j)); + __journal_replay_free(i); } else if (bad) { goto found; } else { @@ -104,6 +120,7 @@ add: list_add(&i->list, where); i->devs = devs; i->bad = bad; + i->ignore = false; unsafe_memcpy(&i->j, j, bytes, "embedded variable length struct"); found: if (!bch2_dev_list_has_dev(i->devs, ca->dev_idx)) @@ -698,14 +715,16 @@ err: goto out; } -int bch2_journal_read(struct bch_fs *c, struct list_head *list) +int bch2_journal_read(struct bch_fs *c, struct list_head *list, + u64 *blacklist_seq, u64 *start_seq) { struct journal_list jlist; - struct journal_replay *i; + struct journal_replay *i, *t; struct bch_dev *ca; unsigned iter; size_t keys = 0, entries = 0; bool degraded = false; + u64 seq, last_seq = 0; int ret = 0; closure_init_stack(&jlist.cl); @@ -734,12 +753,97 @@ int bch2_journal_read(struct bch_fs *c, struct list_head *list) if (jlist.ret) return jlist.ret; + if (list_empty(list)) { + bch_info(c, "journal read done, but no entries found"); + return 0; + } + + i = list_last_entry(list, struct journal_replay, list); + *start_seq = le64_to_cpu(i->j.seq) + 1; + + /* + * Find most recent flush entry, and ignore newer non flush entries - + * those entries will be blacklisted: + */ + list_for_each_entry_safe_reverse(i, t, list, list) { + if (i->ignore) + continue; + + if (!JSET_NO_FLUSH(&i->j)) { + last_seq = 
le64_to_cpu(i->j.last_seq); + *blacklist_seq = le64_to_cpu(i->j.seq) + 1; + break; + } + + journal_replay_free(c, i); + } + + if (!last_seq) { + fsck_err(c, "journal read done, but no entries found after dropping non-flushes"); + return -1; + } + + /* Drop blacklisted entries and entries older than last_seq: */ + list_for_each_entry_safe(i, t, list, list) { + if (i->ignore) + continue; + + seq = le64_to_cpu(i->j.seq); + if (seq < last_seq) { + journal_replay_free(c, i); + continue; + } + + if (bch2_journal_seq_is_blacklisted(c, seq, true)) { + fsck_err_on(!JSET_NO_FLUSH(&i->j), c, + "found blacklisted journal entry %llu", seq); + + journal_replay_free(c, i); + } + } + + /* Check for missing entries: */ + seq = last_seq; + list_for_each_entry(i, list, list) { + if (i->ignore) + continue; + + BUG_ON(seq > le64_to_cpu(i->j.seq)); + + while (seq < le64_to_cpu(i->j.seq)) { + u64 missing_start, missing_end; + + while (seq < le64_to_cpu(i->j.seq) && + bch2_journal_seq_is_blacklisted(c, seq, false)) + seq++; + + if (seq == le64_to_cpu(i->j.seq)) + break; + + missing_start = seq; + + while (seq < le64_to_cpu(i->j.seq) && + !bch2_journal_seq_is_blacklisted(c, seq, false)) + seq++; + + missing_end = seq - 1; + fsck_err(c, "journal entries %llu-%llu missing! 
(replaying %llu-%llu)", + missing_start, missing_end, + last_seq, *blacklist_seq - 1); + } + + seq++; + } + list_for_each_entry(i, list, list) { struct jset_entry *entry; struct bkey_i *k, *_n; struct bch_replicas_padded replicas; char buf[80]; + if (i->ignore) + continue; + ret = jset_validate_entries(c, &i->j, READ); if (ret) goto fsck_err; @@ -767,12 +871,12 @@ int bch2_journal_read(struct bch_fs *c, struct list_head *list) entries++; } - if (!list_empty(list)) { - i = list_last_entry(list, struct journal_replay, list); + bch_info(c, "journal read done, %zu keys in %zu entries, seq %llu", + keys, entries, *start_seq); - bch_info(c, "journal read done, %zu keys in %zu entries, seq %llu", - keys, entries, le64_to_cpu(i->j.seq)); - } + if (*start_seq != *blacklist_seq) + bch_info(c, "dropped unflushed entries %llu-%llu", + *blacklist_seq, *start_seq - 1); fsck_err: return ret; } @@ -990,8 +1094,12 @@ static void journal_write_done(struct closure *cl) j->seq_ondisk = seq; if (err && (!j->err_seq || seq < j->err_seq)) j->err_seq = seq; - j->last_seq_ondisk = last_seq; - bch2_journal_space_available(j); + + if (!w->noflush) { + j->flushed_seq_ondisk = seq; + j->last_seq_ondisk = last_seq; + bch2_journal_space_available(j); + } /* * Updating last_seq_ondisk may let bch2_journal_reclaim_work() discard @@ -1067,6 +1175,22 @@ void bch2_journal_write(struct closure *cl) j->write_start_time = local_clock(); + spin_lock(&j->lock); + if (c->sb.features & (1ULL << BCH_FEATURE_journal_no_flush) && + !w->must_flush && + (jiffies - j->last_flush_write) < msecs_to_jiffies(j->write_delay_ms) && + test_bit(JOURNAL_MAY_SKIP_FLUSH, &j->flags)) { + w->noflush = true; + SET_JSET_NO_FLUSH(jset, true); + jset->last_seq = cpu_to_le64(j->last_seq_ondisk); + + j->nr_noflush_writes++; + } else { + j->last_flush_write = jiffies; + j->nr_flush_writes++; + } + spin_unlock(&j->lock); + /* * New btree roots are set by journalling them; when the journal entry * gets written we have to propagate 
them to c->btree_roots @@ -1183,11 +1307,12 @@ retry_alloc: sectors); bio = ca->journal.bio; - bio_reset(bio, ca->disk_sb.bdev, - REQ_OP_WRITE|REQ_SYNC|REQ_META|REQ_PREFLUSH|REQ_FUA); + bio_reset(bio, ca->disk_sb.bdev, REQ_OP_WRITE|REQ_SYNC|REQ_META); bio->bi_iter.bi_sector = ptr->offset; bio->bi_end_io = journal_write_endio; bio->bi_private = ca; + if (!JSET_NO_FLUSH(jset)) + bio->bi_opf |= REQ_PREFLUSH|REQ_FUA; bch2_bio_map(bio, jset, sectors << 9); trace_journal_write(bio); @@ -1196,18 +1321,19 @@ retry_alloc: ca->journal.bucket_seq[ca->journal.cur_idx] = le64_to_cpu(jset->seq); } - for_each_rw_member(ca, c, i) - if (journal_flushes_device(ca) && - !bch2_bkey_has_device(bkey_i_to_s_c(&w->key), i)) { - percpu_ref_get(&ca->io_ref); - - bio = ca->journal.bio; - bio_reset(bio, ca->disk_sb.bdev, REQ_OP_FLUSH); - bio->bi_end_io = journal_write_endio; - bio->bi_private = ca; - closure_bio_submit(bio, cl); - } - + if (!JSET_NO_FLUSH(jset)) { + for_each_rw_member(ca, c, i) + if (journal_flushes_device(ca) && + !bch2_bkey_has_device(bkey_i_to_s_c(&w->key), i)) { + percpu_ref_get(&ca->io_ref); + + bio = ca->journal.bio; + bio_reset(bio, ca->disk_sb.bdev, REQ_OP_FLUSH); + bio->bi_end_io = journal_write_endio; + bio->bi_private = ca; + closure_bio_submit(bio, cl); + } + } no_io: bch2_bucket_seq_cleanup(c); diff --git a/fs/bcachefs/journal_io.h b/fs/bcachefs/journal_io.h index 6958ee0f8cf2..6b4c80968f52 100644 --- a/fs/bcachefs/journal_io.h +++ b/fs/bcachefs/journal_io.h @@ -11,6 +11,7 @@ struct journal_replay { struct bch_devs_list devs; /* checksum error, but we may want to try using it anyways: */ bool bad; + bool ignore; /* must be last: */ struct jset j; }; @@ -37,7 +38,7 @@ static inline struct jset_entry *__jset_entry_type_next(struct jset *jset, for_each_jset_entry_type(entry, jset, BCH_JSET_ENTRY_btree_keys) \ vstruct_for_each_safe(entry, k, _n) -int bch2_journal_read(struct bch_fs *, struct list_head *); +int bch2_journal_read(struct bch_fs *, struct list_head *, 
u64 *, u64 *); void bch2_journal_write(struct closure *); diff --git a/fs/bcachefs/journal_reclaim.c b/fs/bcachefs/journal_reclaim.c index c6267284a028..a3d5405991b9 100644 --- a/fs/bcachefs/journal_reclaim.c +++ b/fs/bcachefs/journal_reclaim.c @@ -158,7 +158,7 @@ void bch2_journal_space_available(struct journal *j) { struct bch_fs *c = container_of(j, struct bch_fs, journal); struct bch_dev *ca; - unsigned clean; + unsigned clean, clean_ondisk, total; unsigned overhead, u64s_remaining = 0; unsigned max_entry_size = min(j->buf[0].buf_size >> 9, j->buf[1].buf_size >> 9); @@ -204,13 +204,21 @@ void bch2_journal_space_available(struct journal *j) for (i = 0; i < journal_space_nr; i++) j->space[i] = __journal_space_available(j, nr_devs_want, i); + clean_ondisk = j->space[journal_space_clean_ondisk].total; clean = j->space[journal_space_clean].total; + total = j->space[journal_space_total].total; if (!j->space[journal_space_discarded].next_entry) ret = cur_entry_journal_full; else if (!fifo_free(&j->pin)) ret = cur_entry_journal_pin_full; + if ((clean - clean_ondisk <= total / 8) && + (clean_ondisk * 2 > clean )) + set_bit(JOURNAL_MAY_SKIP_FLUSH, &j->flags); + else + clear_bit(JOURNAL_MAY_SKIP_FLUSH, &j->flags); + overhead = DIV_ROUND_UP(clean, max_entry_size) * journal_entry_overhead(j); u64s_remaining = clean << 6; diff --git a/fs/bcachefs/journal_seq_blacklist.c b/fs/bcachefs/journal_seq_blacklist.c index d0f1bbf8f6a7..e1b63f3879f4 100644 --- a/fs/bcachefs/journal_seq_blacklist.c +++ b/fs/bcachefs/journal_seq_blacklist.c @@ -118,7 +118,7 @@ out_write_sb: out: mutex_unlock(&c->sb_lock); - return ret; + return ret ?: bch2_blacklist_table_initialize(c); } static int journal_seq_blacklist_table_cmp(const void *_l, @@ -164,8 +164,6 @@ int bch2_blacklist_table_initialize(struct bch_fs *c) struct journal_seq_blacklist_table *t; unsigned i, nr = blacklist_nr_entries(bl); - BUG_ON(c->journal_seq_blacklist_table); - if (!bl) return 0; @@ -187,6 +185,7 @@ int 
bch2_blacklist_table_initialize(struct bch_fs *c) journal_seq_blacklist_table_cmp, NULL); + kfree(c->journal_seq_blacklist_table); c->journal_seq_blacklist_table = t; return 0; } diff --git a/fs/bcachefs/journal_types.h b/fs/bcachefs/journal_types.h index 6b525dc6ab7c..cf9675310f2b 100644 --- a/fs/bcachefs/journal_types.h +++ b/fs/bcachefs/journal_types.h @@ -29,6 +29,8 @@ struct journal_buf { unsigned disk_sectors; /* maximum size entry could have been, if buf_size was bigger */ unsigned u64s_reserved; + bool noflush; /* write has already been kicked off, and was noflush */ + bool must_flush; /* something wants a flush */ /* bloom filter: */ unsigned long has_inode[1024 / sizeof(unsigned long)]; }; @@ -146,6 +148,7 @@ enum { JOURNAL_RECLAIM_STARTED, JOURNAL_NEED_WRITE, JOURNAL_MAY_GET_UNRESERVED, + JOURNAL_MAY_SKIP_FLUSH, }; /* Embedded in struct bch_fs */ @@ -203,6 +206,7 @@ struct journal { /* seq, last_seq from the most recent journal entry successfully written */ u64 seq_ondisk; + u64 flushed_seq_ondisk; u64 last_seq_ondisk; u64 err_seq; u64 last_empty_seq; @@ -252,11 +256,15 @@ struct journal { unsigned write_delay_ms; unsigned reclaim_delay_ms; + unsigned long last_flush_write; u64 res_get_blocked_start; u64 need_write_time; u64 write_start_time; + u64 nr_flush_writes; + u64 nr_noflush_writes; + struct bch2_time_stats *write_time; struct bch2_time_stats *delay_time; struct bch2_time_stats *blocked_time; diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index 7ad5b8234747..ecd51d45743a 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -313,7 +313,7 @@ void bch2_journal_keys_free(struct journal_keys *keys) static struct journal_keys journal_keys_sort(struct list_head *journal_entries) { - struct journal_replay *p; + struct journal_replay *i; struct jset_entry *entry; struct bkey_i *k, *_n; struct journal_keys keys = { NULL }; @@ -323,35 +323,35 @@ static struct journal_keys journal_keys_sort(struct list_head *journal_entries) if 
(list_empty(journal_entries)) return keys; - keys.journal_seq_base = - le64_to_cpu(list_last_entry(journal_entries, - struct journal_replay, list)->j.last_seq); - - list_for_each_entry(p, journal_entries, list) { - if (le64_to_cpu(p->j.seq) < keys.journal_seq_base) + list_for_each_entry(i, journal_entries, list) { + if (i->ignore) continue; - for_each_jset_key(k, _n, entry, &p->j) + if (!keys.journal_seq_base) + keys.journal_seq_base = le64_to_cpu(i->j.seq); + + for_each_jset_key(k, _n, entry, &i->j) nr_keys++; } - keys.d = kvmalloc(sizeof(keys.d[0]) * nr_keys, GFP_KERNEL); if (!keys.d) goto err; - list_for_each_entry(p, journal_entries, list) { - if (le64_to_cpu(p->j.seq) < keys.journal_seq_base) + list_for_each_entry(i, journal_entries, list) { + if (i->ignore) continue; - for_each_jset_key(k, _n, entry, &p->j) + BUG_ON(le64_to_cpu(i->j.seq) - keys.journal_seq_base > U32_MAX); + + for_each_jset_key(k, _n, entry, &i->j) keys.d[keys.nr++] = (struct journal_key) { .btree_id = entry->btree_id, .level = entry->level, .k = k, - .journal_seq = le64_to_cpu(p->j.seq) - + .journal_seq = le64_to_cpu(i->j.seq) - keys.journal_seq_base, - .journal_offset = k->_data - p->j._data, + .journal_offset = k->_data - i->j._data, }; } @@ -643,46 +643,6 @@ err: return ret; } -static bool journal_empty(struct list_head *journal) -{ - return list_empty(journal) || - journal_entry_empty(&list_last_entry(journal, - struct journal_replay, list)->j); -} - -static int -verify_journal_entries_not_blacklisted_or_missing(struct bch_fs *c, - struct list_head *journal) -{ - struct journal_replay *i = - list_last_entry(journal, struct journal_replay, list); - u64 start_seq = le64_to_cpu(i->j.last_seq); - u64 end_seq = le64_to_cpu(i->j.seq); - u64 seq = start_seq; - int ret = 0; - - list_for_each_entry(i, journal, list) { - if (le64_to_cpu(i->j.seq) < start_seq) - continue; - - fsck_err_on(seq != le64_to_cpu(i->j.seq), c, - "journal entries %llu-%llu missing! 
(replaying %llu-%llu)", - seq, le64_to_cpu(i->j.seq) - 1, - start_seq, end_seq); - - seq = le64_to_cpu(i->j.seq); - - fsck_err_on(bch2_journal_seq_is_blacklisted(c, seq, false), c, - "found blacklisted journal entry %llu", seq); - - do { - seq++; - } while (bch2_journal_seq_is_blacklisted(c, seq, false)); - } -fsck_err: - return ret; -} - /* journal replay early: */ static int journal_replay_entry_early(struct bch_fs *c, @@ -767,6 +727,7 @@ static int journal_replay_early(struct bch_fs *c, struct bch_sb_field_clean *clean, struct list_head *journal) { + struct journal_replay *i; struct jset_entry *entry; int ret; @@ -782,18 +743,19 @@ static int journal_replay_early(struct bch_fs *c, return ret; } } else { - struct journal_replay *i = - list_last_entry(journal, struct journal_replay, list); + list_for_each_entry(i, journal, list) { + if (i->ignore) + continue; - c->bucket_clock[READ].hand = le16_to_cpu(i->j.read_clock); - c->bucket_clock[WRITE].hand = le16_to_cpu(i->j.write_clock); + c->bucket_clock[READ].hand = le16_to_cpu(i->j.read_clock); + c->bucket_clock[WRITE].hand = le16_to_cpu(i->j.write_clock); - list_for_each_entry(i, journal, list) vstruct_for_each(&i->j, entry) { ret = journal_replay_entry_early(c, entry); if (ret) return ret; } + } } bch2_fs_usage_initialize(c); @@ -842,9 +804,6 @@ static int verify_superblock_clean(struct bch_fs *c, struct bch_sb_field_clean *clean = *cleanp; int ret = 0; - if (!c->sb.clean || !j) - return 0; - if (mustfix_fsck_err_on(j->seq != clean->journal_seq, c, "superblock journal seq (%llu) doesn't match journal (%llu) after clean shutdown", le64_to_cpu(clean->journal_seq), @@ -971,7 +930,8 @@ int bch2_fs_recovery(struct bch_fs *c) { const char *err = "cannot allocate memory"; struct bch_sb_field_clean *clean = NULL; - u64 journal_seq; + struct jset *last_journal_entry = NULL; + u64 blacklist_seq, journal_seq; bool write_sb = false, need_write_alloc = false; int ret; @@ -991,24 +951,38 @@ int bch2_fs_recovery(struct bch_fs *c) 
set_bit(BCH_FS_REBUILD_REPLICAS, &c->flags); } + ret = bch2_blacklist_table_initialize(c); + if (ret) { + bch_err(c, "error initializing blacklist table"); + goto err; + } + if (!c->sb.clean || c->opts.fsck || c->opts.keep_journal) { - struct jset *j; + struct journal_replay *i; - ret = bch2_journal_read(c, &c->journal_entries); + ret = bch2_journal_read(c, &c->journal_entries, + &blacklist_seq, &journal_seq); if (ret) goto err; - if (mustfix_fsck_err_on(c->sb.clean && !journal_empty(&c->journal_entries), c, + list_for_each_entry_reverse(i, &c->journal_entries, list) + if (!i->ignore) { + last_journal_entry = &i->j; + break; + } + + if (mustfix_fsck_err_on(c->sb.clean && + last_journal_entry && + !journal_entry_empty(last_journal_entry), c, "filesystem marked clean but journal not empty")) { c->sb.compat &= ~(1ULL << BCH_COMPAT_FEAT_ALLOC_INFO); SET_BCH_SB_CLEAN(c->disk_sb.sb, false); c->sb.clean = false; } - if (!c->sb.clean && list_empty(&c->journal_entries)) { - bch_err(c, "no journal entries found"); - ret = BCH_FSCK_REPAIR_IMPOSSIBLE; - goto err; + if (!last_journal_entry) { + fsck_err_on(!c->sb.clean, c, "no journal entries found"); + goto use_clean; } c->journal_keys = journal_keys_sort(&c->journal_entries); @@ -1017,16 +991,21 @@ int bch2_fs_recovery(struct bch_fs *c) goto err; } - j = &list_last_entry(&c->journal_entries, - struct journal_replay, list)->j; - - ret = verify_superblock_clean(c, &clean, j); - if (ret) + if (c->sb.clean && last_journal_entry) { + ret = verify_superblock_clean(c, &clean, + last_journal_entry); + if (ret) + goto err; + } + } else { +use_clean: + if (!clean) { + bch_err(c, "no superblock clean section found"); + ret = BCH_FSCK_REPAIR_IMPOSSIBLE; goto err; - journal_seq = le64_to_cpu(j->seq) + 1; - } else { - journal_seq = le64_to_cpu(clean->journal_seq) + 1; + } + blacklist_seq = journal_seq = le64_to_cpu(clean->journal_seq) + 1; } if (!c->sb.clean && @@ -1045,30 +1024,23 @@ int bch2_fs_recovery(struct bch_fs *c) if (ret) goto 
err; - if (!c->sb.clean) { + /* + * After an unclean shutdown, skip the next few journal sequence + * numbers as they may have been referenced by btree writes that + * happened before their corresponding journal writes - those btree + * writes need to be ignored, by skipping and blacklisting the next few + * journal sequence numbers: + */ + if (!c->sb.clean) + journal_seq += 8; + + if (blacklist_seq != journal_seq) { ret = bch2_journal_seq_blacklist_add(c, - journal_seq, - journal_seq + 8); + blacklist_seq, journal_seq); if (ret) { bch_err(c, "error creating new journal seq blacklist entry"); goto err; } - - journal_seq += 8; - - /* - * The superblock needs to be written before we do any btree - * node writes: it will be in the read_write() path - */ - } - - ret = bch2_blacklist_table_initialize(c); - - if (!list_empty(&c->journal_entries)) { - ret = verify_journal_entries_not_blacklisted_or_missing(c, - &c->journal_entries); - if (ret) - goto err; } ret = bch2_fs_journal_start(&c->journal, journal_seq,