bcachefs: btree write buffer knows how to accumulate bch_accounting keys
author    Kent Overstreet <kent.overstreet@linux.dev>
          Fri, 17 Nov 2023 05:23:07 +0000 (00:23 -0500)
committer Kent Overstreet <kent.overstreet@linux.dev>
          Sun, 14 Jul 2024 23:00:13 +0000 (19:00 -0400)
Teach the btree write buffer how to accumulate accounting keys - instead
of having the newer key overwrite the older key as we do with other
updates, we need to add them together.

Also, add a flag so that write buffer flush knows when journal replay is
finished flushing accounting, and teach it to hold accounting keys until
that flag is set.

Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
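
To make the first point concrete: accounting keys carry counter deltas, so when two buffered updates land on the same key they have to be summed, not replaced. A minimal sketch of the two behaviours, using simplified hypothetical types rather than the real bcachefs structures:

/* Simplified sketch: hypothetical types, not the real bcachefs structures. */
#include <stdint.h>

#define NR_COUNTERS 3

struct accounting_key {
	uint64_t pos;			/* which counter set this key updates */
	int64_t  d[NR_COUNTERS];	/* counter deltas carried by the key */
};

/* Ordinary write-buffered updates: the newer key simply replaces the older. */
static void wb_overwrite(struct accounting_key *dst, const struct accounting_key *src)
{
	*dst = *src;
}

/*
 * Accounting updates are deltas: two updates to the same counters must be
 * added together, which is the role bch2_accounting_accumulate() plays in
 * this patch.
 */
static void wb_accumulate(struct accounting_key *dst, const struct accounting_key *src)
{
	for (unsigned i = 0; i < NR_COUNTERS; i++)
		dst->d[i] += src->d[i];
}

With overwrite semantics, two buffered deltas of +1 and +2 to the same counter would leave +2; with accumulation they correctly combine to +3.
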
fs/bcachefs/bcachefs.h
fs/bcachefs/btree_write_buffer.c
fs/bcachefs/recovery.c

diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h
index 6eec526c45d498b200b1e93d9fec7d12e731a2ac..b1a74d3ebc121f4900b7f5a442d227bd79ad9a97 100644
--- a/fs/bcachefs/bcachefs.h
+++ b/fs/bcachefs/bcachefs.h
@@ -593,6 +593,7 @@ struct bch_dev {
        x(new_fs)                       \
        x(started)                      \
        x(btree_running)                \
+       x(accounting_replay_done)       \
        x(may_go_rw)                    \
        x(rw)                           \
        x(was_rw)                       \
diff --git a/fs/bcachefs/btree_write_buffer.c b/fs/bcachefs/btree_write_buffer.c
index d0e92d948002d4a3e07aeeaec0cb95d6333884eb..e9e36d8aded97fb1776f0a761e3f8874c78a4b5f 100644
--- a/fs/bcachefs/btree_write_buffer.c
+++ b/fs/bcachefs/btree_write_buffer.c
@@ -6,6 +6,7 @@
 #include "btree_update.h"
 #include "btree_update_interior.h"
 #include "btree_write_buffer.h"
+#include "disk_accounting.h"
 #include "error.h"
 #include "extents.h"
 #include "journal.h"
@@ -134,7 +135,9 @@ static noinline int wb_flush_one_slowpath(struct btree_trans *trans,
 
 static inline int wb_flush_one(struct btree_trans *trans, struct btree_iter *iter,
                               struct btree_write_buffered_key *wb,
-                              bool *write_locked, size_t *fast)
+                              bool *write_locked,
+                              bool *accounting_accumulated,
+                              size_t *fast)
 {
        struct btree_path *path;
        int ret;
@@ -147,6 +150,16 @@ static inline int wb_flush_one(struct btree_trans *trans, struct btree_iter *ite
        if (ret)
                return ret;
 
+       if (!*accounting_accumulated && wb->k.k.type == KEY_TYPE_accounting) {
+               struct bkey u;
+               struct bkey_s_c k = bch2_btree_path_peek_slot_exact(btree_iter_path(trans, iter), &u);
+
+               if (k.k->type == KEY_TYPE_accounting)
+                       bch2_accounting_accumulate(bkey_i_to_accounting(&wb->k),
+                                                  bkey_s_c_to_accounting(k));
+       }
+       *accounting_accumulated = true;
+
        /*
         * We can't clone a path that has write locks: unshare it now, before
         * set_pos and traverse():
@@ -259,8 +272,9 @@ static int bch2_btree_write_buffer_flush_locked(struct btree_trans *trans)
        struct journal *j = &c->journal;
        struct btree_write_buffer *wb = &c->btree_write_buffer;
        struct btree_iter iter = { NULL };
-       size_t skipped = 0, fast = 0, slowpath = 0;
+       size_t overwritten = 0, fast = 0, slowpath = 0, could_not_insert = 0;
        bool write_locked = false;
+       bool accounting_replay_done = test_bit(BCH_FS_accounting_replay_done, &c->flags);
        int ret = 0;
 
        bch2_trans_unlock(trans);
@@ -301,11 +315,22 @@ static int bch2_btree_write_buffer_flush_locked(struct btree_trans *trans)
 
                BUG_ON(!k->journal_seq);
 
+               if (!accounting_replay_done &&
+                   k->k.k.type == KEY_TYPE_accounting) {
+                       slowpath++;
+                       continue;
+               }
+
                if (i + 1 < &darray_top(wb->sorted) &&
                    wb_key_eq(i, i + 1)) {
                        struct btree_write_buffered_key *n = &wb->flushing.keys.data[i[1].idx];
 
-                       skipped++;
+                       if (k->k.k.type == KEY_TYPE_accounting &&
+                           n->k.k.type == KEY_TYPE_accounting)
+                               bch2_accounting_accumulate(bkey_i_to_accounting(&n->k),
+                                                          bkey_i_to_s_c_accounting(&k->k));
+
+                       overwritten++;
                        n->journal_seq = min_t(u64, n->journal_seq, k->journal_seq);
                        k->journal_seq = 0;
                        continue;
@@ -340,13 +365,15 @@ static int bch2_btree_write_buffer_flush_locked(struct btree_trans *trans)
                bch2_btree_iter_set_pos(&iter, k->k.k.p);
                btree_iter_path(trans, &iter)->preserve = false;
 
+               bool accounting_accumulated = false;
                do {
                        if (race_fault()) {
                                ret = -BCH_ERR_journal_reclaim_would_deadlock;
                                break;
                        }
 
-                       ret = wb_flush_one(trans, &iter, k, &write_locked, &fast);
+                       ret = wb_flush_one(trans, &iter, k, &write_locked,
+                                          &accounting_accumulated, &fast);
                        if (!write_locked)
                                bch2_trans_begin(trans);
                } while (bch2_err_matches(ret, BCH_ERR_transaction_restart));
@@ -387,8 +414,15 @@ static int bch2_btree_write_buffer_flush_locked(struct btree_trans *trans)
                        if (!i->journal_seq)
                                continue;
 
-                       bch2_journal_pin_update(j, i->journal_seq, &wb->flushing.pin,
-                                               bch2_btree_write_buffer_journal_flush);
+                       if (!accounting_replay_done &&
+                           i->k.k.type == KEY_TYPE_accounting) {
+                               could_not_insert++;
+                               continue;
+                       }
+
+                       if (!could_not_insert)
+                               bch2_journal_pin_update(j, i->journal_seq, &wb->flushing.pin,
+                                                       bch2_btree_write_buffer_journal_flush);
 
                        bch2_trans_begin(trans);
 
@@ -401,13 +435,45 @@ static int bch2_btree_write_buffer_flush_locked(struct btree_trans *trans)
                                        btree_write_buffered_insert(trans, i));
                        if (ret)
                                goto err;
+
+                       i->journal_seq = 0;
+               }
+
+               /*
+                * If journal replay hasn't finished with accounting keys we
+                * can't flush accounting keys at all - condense them and leave
+                * them for next time.
+                *
+                * Q: Can the write buffer overflow?
+                * A: Shouldn't be any actual risk. It's just new accounting
+                * updates that the write buffer can't flush, and those are only
+                * going to be generated by interior btree node updates as
+                * journal replay has to split/rewrite nodes to make room for
+                * its updates.
+                *
+                * And for those new accounting updates, updates to the same
+                * counters get accumulated as they're flushed from the journal
+                * to the write buffer - see the patch for eytzinger tree
+                * accumulation. So we could only overflow if the number of
+                * distinct counters touched somehow was very large.
+                */
+               if (could_not_insert) {
+                       struct btree_write_buffered_key *dst = wb->flushing.keys.data;
+
+                       darray_for_each(wb->flushing.keys, i)
+                               if (i->journal_seq)
+                                       *dst++ = *i;
+                       wb->flushing.keys.nr = dst - wb->flushing.keys.data;
                }
        }
 err:
+       if (ret || !could_not_insert) {
+               bch2_journal_pin_drop(j, &wb->flushing.pin);
+               wb->flushing.keys.nr = 0;
+       }
+
        bch2_fs_fatal_err_on(ret, c, "%s", bch2_err_str(ret));
-       trace_write_buffer_flush(trans, wb->flushing.keys.nr, skipped, fast, 0);
-       bch2_journal_pin_drop(j, &wb->flushing.pin);
-       wb->flushing.keys.nr = 0;
+       trace_write_buffer_flush(trans, wb->flushing.keys.nr, overwritten, fast, 0);
        return ret;
 }
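
The could_not_insert path above is essentially a compaction over the flush buffer: keys that could not be flushed yet keep a nonzero journal_seq and are packed to the front so the next flush retries them. A simplified sketch of that step, with hypothetical types standing in for the real write buffer structures:

/* Simplified sketch of the condense step: hypothetical types. */
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>

struct wb_key {
	uint64_t journal_seq;	/* 0 means "already flushed / dropped" */
	bool	 is_accounting;
	/* ... key payload ... */
};

struct wb_buf {
	struct wb_key	*data;
	size_t		nr;
};

/*
 * Accounting keys skipped during flush keep their journal_seq; everything
 * that was flushed has journal_seq == 0.  Keep only the survivors, preserving
 * order, so they are retried on the next flush.
 */
static void wb_condense_unflushed(struct wb_buf *wb)
{
	struct wb_key *dst = wb->data;

	for (size_t i = 0; i < wb->nr; i++)
		if (wb->data[i].journal_seq)
			*dst++ = wb->data[i];

	wb->nr = dst - wb->data;
}

Because duplicate updates to the same counters have already been merged earlier in the flush, the number of keys held back is bounded by the number of distinct counters touched - the basis of the "shouldn't overflow" argument in the comment.
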
 
diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c
index 0091af3beeefc5e0f7798468db5f918dddf4a553..5c6bfa9e69d5061bc47ea9b91026c05d80b8b457 100644
--- a/fs/bcachefs/recovery.c
+++ b/fs/bcachefs/recovery.c
@@ -290,6 +290,8 @@ int bch2_journal_replay(struct bch_fs *c)
                k->overwritten = true;
        }
 
+       set_bit(BCH_FS_accounting_replay_done, &c->flags);
+
        /*
         * First, attempt to replay keys in sorted order. This is more
         * efficient - better locality of btree access -  but some might fail if
@@ -1060,6 +1062,7 @@ int bch2_fs_initialize(struct bch_fs *c)
         * set up the journal.pin FIFO and journal.cur pointer:
         */
        bch2_fs_journal_start(&c->journal, 1);
+       set_bit(BCH_FS_accounting_replay_done, &c->flags);
        bch2_journal_set_replay_done(&c->journal);
 
        ret = bch2_fs_read_write_early(c);
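
On the recovery side, bch2_journal_replay() sets BCH_FS_accounting_replay_done once it is finished flushing accounting, and bch2_fs_initialize() sets it immediately, since a fresh filesystem has nothing to replay; until the bit is set, write buffer flush holds accounting keys back. A rough sketch of the gate, heavily simplified and hypothetical rather than the real code:

#include <stdbool.h>

/* Hypothetical stand-in for the BCH_FS_accounting_replay_done bit. */
static bool accounting_replay_done;

/* Journal replay: once accounting has been fully replayed, open the gate. */
static void journal_replay_finish_accounting(void)
{
	/* ... replay accounting updates (elided) ... */
	accounting_replay_done = true;
}

/* Write buffer flush: defer accounting keys until the gate is open. */
static bool may_flush_accounting_key(void)
{
	return accounting_replay_done;
}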