bcachefs: btree write buffer knows how to accumulate bch_accounting keys
author    Kent Overstreet <kent.overstreet@linux.dev>
          Fri, 17 Nov 2023 05:23:07 +0000 (00:23 -0500)
committer Kent Overstreet <kent.overstreet@linux.dev>
          Sun, 14 Jul 2024 23:00:13 +0000 (19:00 -0400)
Teach the btree write buffer how to accumulate accounting keys - instead
of having the newer key overwrite the older key as we do with other
updates, we need to add them together.

Also, add a flag so that write buffer flush knows when journal replay is
finished flushing accounting, and teach it to hold accounting keys until
that flag is set.

Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
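
To make the first point concrete: accounting keys carry counter deltas, so when two buffered updates land on the same key they have to be summed, not replaced. A minimal sketch of the two behaviours, using simplified hypothetical types rather than the real bcachefs structures:

/* Simplified sketch: hypothetical types, not the real bcachefs structures. */
#include <stdint.h>

#define NR_COUNTERS 3

struct accounting_key {
	uint64_t pos;			/* which counter set this key updates */
	int64_t  d[NR_COUNTERS];	/* counter deltas carried by the key */
};

/* Ordinary write-buffered updates: the newer key simply replaces the older. */
static void wb_overwrite(struct accounting_key *dst, const struct accounting_key *src)
{
	*dst = *src;
}

/*
 * Accounting updates are deltas: two updates to the same counters must be
 * added together, which is the role bch2_accounting_accumulate() plays in
 * this patch.
 */
static void wb_accumulate(struct accounting_key *dst, const struct accounting_key *src)
{
	for (unsigned i = 0; i < NR_COUNTERS; i++)
		dst->d[i] += src->d[i];
}

With overwrite semantics, two buffered deltas of +1 and +2 to the same counter would leave +2; with accumulation they correctly combine to +3.
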
fs/bcachefs/bcachefs.h
fs/bcachefs/btree_write_buffer.c
fs/bcachefs/recovery.c

diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h
index 6eec526c45d498b200b1e93d9fec7d12e731a2ac..b1a74d3ebc121f4900b7f5a442d227bd79ad9a97 100644
--- a/fs/bcachefs/bcachefs.h
+++ b/fs/bcachefs/bcachefs.h
@@ -593,6 +593,7 @@ struct bch_dev {
        x(new_fs)                       \
        x(started)                      \
        x(btree_running)                \
+       x(accounting_replay_done)       \
        x(may_go_rw)                    \
        x(rw)                           \
        x(was_rw)                       \
diff --git a/fs/bcachefs/btree_write_buffer.c b/fs/bcachefs/btree_write_buffer.c
index d0e92d948002d4a3e07aeeaec0cb95d6333884eb..e9e36d8aded97fb1776f0a761e3f8874c78a4b5f 100644
--- a/fs/bcachefs/btree_write_buffer.c
+++ b/fs/bcachefs/btree_write_buffer.c
@@ -6,6 +6,7 @@
 #include "btree_update.h"
 #include "btree_update_interior.h"
 #include "btree_write_buffer.h"
+#include "disk_accounting.h"
 #include "error.h"
 #include "extents.h"
 #include "journal.h"
@@ -134,7 +135,9 @@ static noinline int wb_flush_one_slowpath(struct btree_trans *trans,
 
 static inline int wb_flush_one(struct btree_trans *trans, struct btree_iter *iter,
                               struct btree_write_buffered_key *wb,
-                              bool *write_locked, size_t *fast)
+                              bool *write_locked,
+                              bool *accounting_accumulated,
+                              size_t *fast)
 {
        struct btree_path *path;
        int ret;
@@ -147,6 +150,16 @@ static inline int wb_flush_one(struct btree_trans *trans, struct btree_iter *ite
        if (ret)
                return ret;
 
+       if (!*accounting_accumulated && wb->k.k.type == KEY_TYPE_accounting) {
+               struct bkey u;
+               struct bkey_s_c k = bch2_btree_path_peek_slot_exact(btree_iter_path(trans, iter), &u);
+
+               if (k.k->type == KEY_TYPE_accounting)
+                       bch2_accounting_accumulate(bkey_i_to_accounting(&wb->k),
+                                                  bkey_s_c_to_accounting(k));
+       }
+       *accounting_accumulated = true;
+
        /*
         * We can't clone a path that has write locks: unshare it now, before
         * set_pos and traverse():
@@ -259,8 +272,9 @@ static int bch2_btree_write_buffer_flush_locked(struct btree_trans *trans)
        struct journal *j = &c->journal;
        struct btree_write_buffer *wb = &c->btree_write_buffer;
        struct btree_iter iter = { NULL };
-       size_t skipped = 0, fast = 0, slowpath = 0;
+       size_t overwritten = 0, fast = 0, slowpath = 0, could_not_insert = 0;
        bool write_locked = false;
+       bool accounting_replay_done = test_bit(BCH_FS_accounting_replay_done, &c->flags);
        int ret = 0;
 
        bch2_trans_unlock(trans);
@@ -301,11 +315,22 @@ static int bch2_btree_write_buffer_flush_locked(struct btree_trans *trans)
 
                BUG_ON(!k->journal_seq);
 
+               if (!accounting_replay_done &&
+                   k->k.k.type == KEY_TYPE_accounting) {
+                       slowpath++;
+                       continue;
+               }
+
                if (i + 1 < &darray_top(wb->sorted) &&
                    wb_key_eq(i, i + 1)) {
                        struct btree_write_buffered_key *n = &wb->flushing.keys.data[i[1].idx];
 
-                       skipped++;
+                       if (k->k.k.type == KEY_TYPE_accounting &&
+                           n->k.k.type == KEY_TYPE_accounting)
+                               bch2_accounting_accumulate(bkey_i_to_accounting(&n->k),
+                                                          bkey_i_to_s_c_accounting(&k->k));
+
+                       overwritten++;
                        n->journal_seq = min_t(u64, n->journal_seq, k->journal_seq);
                        k->journal_seq = 0;
                        continue;
@@ -340,13 +365,15 @@ static int bch2_btree_write_buffer_flush_locked(struct btree_trans *trans)
                bch2_btree_iter_set_pos(&iter, k->k.k.p);
                btree_iter_path(trans, &iter)->preserve = false;
 
+               bool accounting_accumulated = false;
                do {
                        if (race_fault()) {
                                ret = -BCH_ERR_journal_reclaim_would_deadlock;
                                break;
                        }
 
-                       ret = wb_flush_one(trans, &iter, k, &write_locked, &fast);
+                       ret = wb_flush_one(trans, &iter, k, &write_locked,
+                                          &accounting_accumulated, &fast);
                        if (!write_locked)
                                bch2_trans_begin(trans);
                } while (bch2_err_matches(ret, BCH_ERR_transaction_restart));
@@ -387,8 +414,15 @@ static int bch2_btree_write_buffer_flush_locked(struct btree_trans *trans)
                        if (!i->journal_seq)
                                continue;
 
-                       bch2_journal_pin_update(j, i->journal_seq, &wb->flushing.pin,
-                                               bch2_btree_write_buffer_journal_flush);
+                       if (!accounting_replay_done &&
+                           i->k.k.type == KEY_TYPE_accounting) {
+                               could_not_insert++;
+                               continue;
+                       }
+
+                       if (!could_not_insert)
+                               bch2_journal_pin_update(j, i->journal_seq, &wb->flushing.pin,
+                                                       bch2_btree_write_buffer_journal_flush);
 
                        bch2_trans_begin(trans);
 
@@ -401,13 +435,45 @@ static int bch2_btree_write_buffer_flush_locked(struct btree_trans *trans)
                                        btree_write_buffered_insert(trans, i));
                        if (ret)
                                goto err;
+
+                       i->journal_seq = 0;
+               }
+
+               /*
+                * If journal replay hasn't finished with accounting keys we
+                * can't flush accounting keys at all - condense them and leave
+                * them for next time.
+                *
+                * Q: Can the write buffer overflow?
+                * A: Shouldn't be any actual risk. It's just new accounting
+                * updates that the write buffer can't flush, and those are only
+                * going to be generated by interior btree node updates as
+                * journal replay has to split/rewrite nodes to make room for
+                * its updates.
+                *
+                * And for those new accounting updates, updates to the same
+                * counters get accumulated as they're flushed from the journal
+                * to the write buffer - see the patch for eytzinger tree
+                * accumulation. So we could only overflow if the number of
+                * distinct counters touched somehow was very large.
+                */
+               if (could_not_insert) {
+                       struct btree_write_buffered_key *dst = wb->flushing.keys.data;
+
+                       darray_for_each(wb->flushing.keys, i)
+                               if (i->journal_seq)
+                                       *dst++ = *i;
+                       wb->flushing.keys.nr = dst - wb->flushing.keys.data;
                }
        }
 err:
+       if (ret || !could_not_insert) {
+               bch2_journal_pin_drop(j, &wb->flushing.pin);
+               wb->flushing.keys.nr = 0;
+       }
+
        bch2_fs_fatal_err_on(ret, c, "%s", bch2_err_str(ret));
-       trace_write_buffer_flush(trans, wb->flushing.keys.nr, skipped, fast, 0);
-       bch2_journal_pin_drop(j, &wb->flushing.pin);
-       wb->flushing.keys.nr = 0;
+       trace_write_buffer_flush(trans, wb->flushing.keys.nr, overwritten, fast, 0);
        return ret;
 }
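
The could_not_insert path above is essentially a compaction over the flush buffer: keys that could not be flushed yet keep a nonzero journal_seq and are packed to the front so the next flush retries them. A simplified sketch of that step, with hypothetical types standing in for the real write buffer structures:

/* Simplified sketch of the condense step: hypothetical types. */
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>

struct wb_key {
	uint64_t journal_seq;	/* 0 means "already flushed / dropped" */
	bool	 is_accounting;
	/* ... key payload ... */
};

struct wb_buf {
	struct wb_key	*data;
	size_t		nr;
};

/*
 * Accounting keys skipped during flush keep their journal_seq; everything
 * that was flushed has journal_seq == 0.  Keep only the survivors, preserving
 * order, so they are retried on the next flush.
 */
static void wb_condense_unflushed(struct wb_buf *wb)
{
	struct wb_key *dst = wb->data;

	for (size_t i = 0; i < wb->nr; i++)
		if (wb->data[i].journal_seq)
			*dst++ = wb->data[i];

	wb->nr = dst - wb->data;
}

Because duplicate updates to the same counters have already been merged earlier in the flush, the number of keys held back is bounded by the number of distinct counters touched - the basis of the "shouldn't overflow" argument in the comment.
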
 
diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c
index 0091af3beeefc5e0f7798468db5f918dddf4a553..5c6bfa9e69d5061bc47ea9b91026c05d80b8b457 100644
--- a/fs/bcachefs/recovery.c
+++ b/fs/bcachefs/recovery.c
@@ -290,6 +290,8 @@ int bch2_journal_replay(struct bch_fs *c)
                k->overwritten = true;
        }
 
+       set_bit(BCH_FS_accounting_replay_done, &c->flags);
+
        /*
         * First, attempt to replay keys in sorted order. This is more
         * efficient - better locality of btree access -  but some might fail if
@@ -1060,6 +1062,7 @@ int bch2_fs_initialize(struct bch_fs *c)
         * set up the journal.pin FIFO and journal.cur pointer:
         */
        bch2_fs_journal_start(&c->journal, 1);
+       set_bit(BCH_FS_accounting_replay_done, &c->flags);
        bch2_journal_set_replay_done(&c->journal);
 
        ret = bch2_fs_read_write_early(c);
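
On the recovery side, bch2_journal_replay() sets BCH_FS_accounting_replay_done once it is finished flushing accounting, and bch2_fs_initialize() sets it immediately, since a fresh filesystem has nothing to replay; until the bit is set, write buffer flush holds accounting keys back. A rough sketch of the gate, heavily simplified and hypothetical rather than the real code:

#include <stdbool.h>

/* Hypothetical stand-in for the BCH_FS_accounting_replay_done bit. */
static bool accounting_replay_done;

/* Journal replay: once accounting has been fully replayed, open the gate. */
static void journal_replay_finish_accounting(void)
{
	/* ... replay accounting updates (elided) ... */
	accounting_replay_done = true;
}

/* Write buffer flush: defer accounting keys until the gate is open. */
static bool may_flush_accounting_key(void)
{
	return accounting_replay_done;
}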