From 11223d0e7b091b11e0e533850c1007e8fc797c68 Mon Sep 17 00:00:00 2001 From: Andreas Gruenbacher Date: Mon, 27 Jan 2025 17:52:39 +0100 Subject: [PATCH 01/16] bcachefs: implement eytzinger0_find_ge directly Implement eytzinger0_find_ge() directly instead of implementing it in terms of eytzinger0_find_le() and adjusting the result. This turns eytzinger0_find_ge() into a minimum search, so when there are duplicate elements, the result of eytzinger0_find_ge() will now always point at the first matching element. Signed-off-by: Andreas Gruenbacher Signed-off-by: Kent Overstreet --- fs/bcachefs/eytzinger.h | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/fs/bcachefs/eytzinger.h b/fs/bcachefs/eytzinger.h index 568a04b16d09..e3713b7b4c27 100644 --- a/fs/bcachefs/eytzinger.h +++ b/fs/bcachefs/eytzinger.h @@ -275,15 +275,17 @@ static inline int eytzinger0_find_gt(void *base, size_t nr, size_t size, return n - 1; } +/* return smallest node >= @search, or -1 if not found */ static inline int eytzinger0_find_ge(void *base, size_t nr, size_t size, cmp_func_t cmp, const void *search) { - ssize_t idx = eytzinger0_find_le(base, nr, size, cmp, search); - - if (idx < nr && !cmp(base + idx * size, search)) - return idx; + void *base1 = base - size; + unsigned n = 1; - return eytzinger0_next(idx, nr); + while (n <= nr) + n = eytzinger1_child(n, cmp(base1 + n * size, search) < 0); + n >>= __ffs(n + 1) + 1; + return n - 1; } #define eytzinger0_find(base, nr, size, _cmp, search) \ -- 2.51.0 From 63ce189b00c37a6fd0297d45ddede5442adb0a28 Mon Sep 17 00:00:00 2001 From: Andreas Gruenbacher Date: Mon, 27 Jan 2025 17:15:36 +0100 Subject: [PATCH 02/16] bcachefs: add eytzinger0_find_ge self test Add an eytzinger0_find_ge() self test similar to eytzinger0_find_gt(). Note that this test requires eytzinger0_find_ge() to return the first matching element in the array in case of duplicates. To prevent bisection errors, we only add this test after strengthening the original implementation (see the previous commit).
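As an aside, the strengthened search is easy to exercise outside the kernel. The sketch below re-implements the new loop over a plain int array; the names are illustrative, and POSIX ffs() (1-based) stands in for the kernel's __ffs(), so "__ffs(n + 1) + 1" becomes "ffs(n + 1)". Everything else mirrors the hunk above.

#include <stdio.h>
#include <strings.h>	/* ffs() */

/*
 * Userspace sketch of the new eytzinger0_find_ge(): descend the implicit
 * tree 1-based, stepping right while the element is < search, then strip
 * the trailing "right child" steps (plus one) off the path.  Returns the
 * 0-based index of the smallest element >= search, or -1 if none.
 */
static int find_ge_int(const int *base, unsigned nr, int search)
{
	unsigned n = 1;

	while (n <= nr)
		n = 2 * n + (base[n - 1] < search);	/* eytzinger1_child() */
	n >>= ffs(n + 1);				/* __ffs(n + 1) + 1 */
	return (int)n - 1;
}

int main(void)
{
	/* sorted values 1,2,2,3 laid out in eytzinger (breadth-first) order */
	const int a[] = { 2, 2, 3, 1 };

	printf("%d\n", find_ge_int(a, 4, 2));	/* 1: the first 2 in sorted order */
	printf("%d\n", find_ge_int(a, 4, 4));	/* -1: no element >= 4 */
	return 0;
}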
Signed-off-by: Andreas Gruenbacher Signed-off-by: Kent Overstreet --- fs/bcachefs/util.c | 38 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 38 insertions(+) diff --git a/fs/bcachefs/util.c b/fs/bcachefs/util.c index 9c6e5d7122b4..14686ff32003 100644 --- a/fs/bcachefs/util.c +++ b/fs/bcachefs/util.c @@ -862,10 +862,48 @@ static void eytzinger0_find_test_gt(u16 *test_array, unsigned nr, u16 search) } } +static void eytzinger0_find_test_ge(u16 *test_array, unsigned nr, u16 search) +{ + int r, s; + bool bad; + + r = eytzinger0_find_ge(test_array, nr, + sizeof(test_array[0]), + cmp_u16, &search); + if (r >= 0) { + if (test_array[r] < search) { + bad = true; + } else { + s = eytzinger0_prev(r, nr); + bad = s >= 0 && test_array[s] >= search; + } + } else { + s = eytzinger0_first(nr); + bad = s >= 0 && test_array[s] >= search; + } + + if (bad) { + s = -1; + eytzinger0_for_each(j, nr) { + if (test_array[j] >= search) { + s = j; + break; + } + } + + eytzinger0_for_each(j, nr) + pr_info("[%3u] = %12u\n", j, test_array[j]); + pr_info("find_ge(%12u) = %3i should be %3i\n", + search, r, s); + BUG(); + } +} + static void eytzinger0_find_test_val(u16 *test_array, unsigned nr, u16 search) { eytzinger0_find_test_le(test_array, nr, search); eytzinger0_find_test_gt(test_array, nr, search); + eytzinger0_find_test_ge(test_array, nr, search); } void eytzinger0_find_test(void) -- 2.51.0 From 956032edd25d971da2820754242eaa7a925a8215 Mon Sep 17 00:00:00 2001 From: Andreas Gruenbacher Date: Sat, 1 Feb 2025 13:55:46 +0100 Subject: [PATCH 03/16] bcachefs: Add eytzinger0_find self test Function eytzinger0_find() isn't currently covered, so add a self test. We can rely on eytzinger0_find_le() here because it is being tested independently. Signed-off-by: Andreas Gruenbacher Signed-off-by: Kent Overstreet --- fs/bcachefs/util.c | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) diff --git a/fs/bcachefs/util.c b/fs/bcachefs/util.c index 14686ff32003..525734528f35 100644 --- a/fs/bcachefs/util.c +++ b/fs/bcachefs/util.c @@ -899,11 +899,40 @@ static void eytzinger0_find_test_ge(u16 *test_array, unsigned nr, u16 search) } } +static void eytzinger0_find_test_eq(u16 *test_array, unsigned nr, u16 search) +{ + unsigned r; + int s; + bool bad; + + r = eytzinger0_find(test_array, nr, + sizeof(test_array[0]), + cmp_u16, &search); + + if (r < nr) { + bad = test_array[r] != search; + } else { + s = eytzinger0_find_le(test_array, nr, + sizeof(test_array[0]), + cmp_u16, &search); + bad = s >= 0 && test_array[s] == search; + } + + if (bad) { + eytzinger0_for_each(j, nr) + pr_info("[%3u] = %12u\n", j, test_array[j]); + pr_info("find(%12u) = %3i is incorrect\n", + search, r); + BUG(); + } +} + static void eytzinger0_find_test_val(u16 *test_array, unsigned nr, u16 search) { eytzinger0_find_test_le(test_array, nr, search); eytzinger0_find_test_gt(test_array, nr, search); eytzinger0_find_test_ge(test_array, nr, search); + eytzinger0_find_test_eq(test_array, nr, search); } void eytzinger0_find_test(void) -- 2.51.0 From 3849bcab4d3f0cf1aee56bdfc7bcae7b40b11657 Mon Sep 17 00:00:00 2001 From: Andreas Gruenbacher Date: Tue, 28 Jan 2025 10:56:37 +0100 Subject: [PATCH 04/16] bcachefs: convert eytzinger0_find to be 1-based Several of the algorithms on eytzinger trees are implemented in terms of the eytzinger0 primitives. However, those algorithms can just as easily be expressed in terms of the eytzinger1 primitives, and that leads to better and easier to understand code. Start by converting eytzinger0_find(). 
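The identity behind the conversion: 0-based eytzinger index i is 1-based index i + 1, and the 1-based tree is pure shift arithmetic. A small illustrative re-derivation of the child helpers (the names follow the kernel's, but these are written out here rather than copied from the header):

/* 1-based: children of node i are 2i and 2i+1, its parent is i/2 */
static inline unsigned eytzinger1_child(unsigned i, unsigned right)
{
	return 2 * i + right;
}

/* 0-based: the same tree shifted down by one, so a +1 creeps in */
static inline unsigned eytzinger0_child(unsigned i, unsigned right)
{
	return 2 * i + 1 + right;	/* == eytzinger1_child(i + 1, right) - 1 */
}

With base1 = base - size, the expression base1 + i * size addresses 0-based element i - 1, which is what lets the converted eytzinger0_find() below walk entirely in 1-based indices and subtract one only on return.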
Signed-off-by: Andreas Gruenbacher Signed-off-by: Kent Overstreet --- fs/bcachefs/eytzinger.h | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/fs/bcachefs/eytzinger.h b/fs/bcachefs/eytzinger.h index e3713b7b4c27..90cd5648b177 100644 --- a/fs/bcachefs/eytzinger.h +++ b/fs/bcachefs/eytzinger.h @@ -290,17 +290,17 @@ static inline int eytzinger0_find_ge(void *base, size_t nr, size_t size, #define eytzinger0_find(base, nr, size, _cmp, search) \ ({ \ - void *_base = (base); \ + size_t _size = (size); \ + void *_base1 = (void *)(base) - _size; \ const void *_search = (search); \ size_t _nr = (nr); \ - size_t _size = (size); \ - size_t _i = 0; \ + size_t _i = 1; \ int _res; \ \ - while (_i < _nr && \ - (_res = _cmp(_search, _base + _i * _size))) \ - _i = eytzinger0_child(_i, _res > 0); \ - _i; \ + while (_i <= _nr && \ + (_res = _cmp(_search, _base1 + _i * _size))) \ + _i = eytzinger1_child(_i, _res > 0); \ + _i - 1; \ }) void eytzinger0_sort_r(void *, size_t, size_t, -- 2.51.0 From 3ff0dd28d61e3b3ca378b897f98f0ea6810bf822 Mon Sep 17 00:00:00 2001 From: Andreas Gruenbacher Date: Wed, 27 Nov 2024 13:26:10 +0100 Subject: [PATCH 05/16] bcachefs: convert eytzinger sort to be 1-based (1) In this first step, convert the eytzinger sort functions to use 1-based primitives. Signed-off-by: Andreas Gruenbacher Signed-off-by: Kent Overstreet --- fs/bcachefs/eytzinger.c | 48 +++++++++++++++++++++++++---------------- 1 file changed, 29 insertions(+), 19 deletions(-) diff --git a/fs/bcachefs/eytzinger.c b/fs/bcachefs/eytzinger.c index 2eaffe37b5e7..4fe02c93bfb3 100644 --- a/fs/bcachefs/eytzinger.c +++ b/fs/bcachefs/eytzinger.c @@ -148,28 +148,28 @@ static int do_cmp(const void *a, const void *b, cmp_r_func_t cmp, const void *pr return cmp(a, b, priv); } -static inline int eytzinger0_do_cmp(void *base, size_t n, size_t size, +static inline int eytzinger1_do_cmp(void *base1, size_t n, size_t size, cmp_r_func_t cmp_func, const void *priv, size_t l, size_t r) { - return do_cmp(base + inorder_to_eytzinger0(l, n) * size, - base + inorder_to_eytzinger0(r, n) * size, + return do_cmp(base1 + inorder_to_eytzinger1(l, n) * size, + base1 + inorder_to_eytzinger1(r, n) * size, cmp_func, priv); } -static inline void eytzinger0_do_swap(void *base, size_t n, size_t size, +static inline void eytzinger1_do_swap(void *base1, size_t n, size_t size, swap_r_func_t swap_func, const void *priv, size_t l, size_t r) { - do_swap(base + inorder_to_eytzinger0(l, n) * size, - base + inorder_to_eytzinger0(r, n) * size, + do_swap(base1 + inorder_to_eytzinger1(l, n) * size, + base1 + inorder_to_eytzinger1(r, n) * size, size, swap_func, priv); } -void eytzinger0_sort_r(void *base, size_t n, size_t size, - cmp_r_func_t cmp_func, - swap_r_func_t swap_func, - const void *priv) +static void eytzinger1_sort_r(void *base1, size_t n, size_t size, + cmp_r_func_t cmp_func, + swap_r_func_t swap_func, + const void *priv) { int i, j, k; @@ -178,9 +178,9 @@ void eytzinger0_sort_r(void *base, size_t n, size_t size, swap_func = NULL; if (!swap_func) { - if (is_aligned(base, size, 8)) + if (is_aligned(base1, size, 8)) swap_func = SWAP_WORDS_64; - else if (is_aligned(base, size, 4)) + else if (is_aligned(base1, size, 4)) swap_func = SWAP_WORDS_32; else swap_func = SWAP_BYTES; @@ -190,47 +190,57 @@ void eytzinger0_sort_r(void *base, size_t n, size_t size, for (i = n / 2 - 1; i >= 0; --i) { /* Find the sift-down path all the way to the leaves. 
*/ for (j = i; k = j * 2 + 1, k + 1 < n;) - j = eytzinger0_do_cmp(base, n, size, cmp_func, priv, k, k + 1) > 0 ? k : k + 1; + j = eytzinger1_do_cmp(base1, n, size, cmp_func, priv, k + 1, k + 2) > 0 ? k : k + 1; /* Special case for the last leaf with no sibling. */ if (j * 2 + 2 == n) j = j * 2 + 1; /* Backtrack to the correct location. */ - while (j != i && eytzinger0_do_cmp(base, n, size, cmp_func, priv, i, j) >= 0) + while (j != i && eytzinger1_do_cmp(base1, n, size, cmp_func, priv, i + 1, j + 1) >= 0) j = (j - 1) / 2; /* Shift the element into its correct place. */ for (k = j; j != i;) { j = (j - 1) / 2; - eytzinger0_do_swap(base, n, size, swap_func, priv, j, k); + eytzinger1_do_swap(base1, n, size, swap_func, priv, j + 1, k + 1); } } /* sort */ for (i = n - 1; i > 0; --i) { - eytzinger0_do_swap(base, n, size, swap_func, priv, 0, i); + eytzinger1_do_swap(base1, n, size, swap_func, priv, 1, i + 1); /* Find the sift-down path all the way to the leaves. */ for (j = 0; k = j * 2 + 1, k + 1 < i;) - j = eytzinger0_do_cmp(base, n, size, cmp_func, priv, k, k + 1) > 0 ? k : k + 1; + j = eytzinger1_do_cmp(base1, n, size, cmp_func, priv, k + 1, k + 2) > 0 ? k : k + 1; /* Special case for the last leaf with no sibling. */ if (j * 2 + 2 == i) j = j * 2 + 1; /* Backtrack to the correct location. */ - while (j && eytzinger0_do_cmp(base, n, size, cmp_func, priv, 0, j) >= 0) + while (j && eytzinger1_do_cmp(base1, n, size, cmp_func, priv, 1, j + 1) >= 0) j = (j - 1) / 2; /* Shift the element into its correct place. */ for (k = j; j;) { j = (j - 1) / 2; - eytzinger0_do_swap(base, n, size, swap_func, priv, j, k); + eytzinger1_do_swap(base1, n, size, swap_func, priv, j + 1, k + 1); } } } +void eytzinger0_sort_r(void *base, size_t n, size_t size, + cmp_r_func_t cmp_func, + swap_r_func_t swap_func, + const void *priv) +{ + void *base1 = base - size; + + return eytzinger1_sort_r(base1, n, size, cmp_func, swap_func, priv); +} + void eytzinger0_sort(void *base, size_t n, size_t size, cmp_func_t cmp_func, swap_func_t swap_func) -- 2.51.0 From 68eb4c5fea4146e060a32d6434009dfae353709c Mon Sep 17 00:00:00 2001 From: Andreas Gruenbacher Date: Mon, 27 Jan 2025 20:54:52 +0100 Subject: [PATCH 06/16] bcachefs: convert eytzinger sort to be 1-based (2) In this second step, transform the eytzinger indexes i, j, and k in eytzinger1_sort_r() from 0-based to 1-based. This step looks a bit messy, but the resulting code is slightly better. Signed-off-by: Andreas Gruenbacher Signed-off-by: Kent Overstreet --- fs/bcachefs/eytzinger.c | 42 ++++++++++++++++++++--------------------- 1 file changed, 21 insertions(+), 21 deletions(-) diff --git a/fs/bcachefs/eytzinger.c b/fs/bcachefs/eytzinger.c index 4fe02c93bfb3..0e742555cb0a 100644 --- a/fs/bcachefs/eytzinger.c +++ b/fs/bcachefs/eytzinger.c @@ -171,7 +171,7 @@ static void eytzinger1_sort_r(void *base1, size_t n, size_t size, swap_r_func_t swap_func, const void *priv) { - int i, j, k; + unsigned i, j, k; /* called from 'sort' without swap function, let's pick the default */ if (swap_func == SWAP_WRAPPER && !((struct wrapper *)priv)->swap_func) @@ -187,46 +187,46 @@ static void eytzinger1_sort_r(void *base1, size_t n, size_t size, } /* heapify */ - for (i = n / 2 - 1; i >= 0; --i) { + for (i = n / 2; i >= 1; --i) { /* Find the sift-down path all the way to the leaves. */ - for (j = i; k = j * 2 + 1, k + 1 < n;) - j = eytzinger1_do_cmp(base1, n, size, cmp_func, priv, k + 1, k + 2) > 0 ? 
k : k + 1; + for (j = i; k = j * 2, k < n;) + j = eytzinger1_do_cmp(base1, n, size, cmp_func, priv, k, k + 1) > 0 ? k : k + 1; /* Special case for the last leaf with no sibling. */ - if (j * 2 + 2 == n) - j = j * 2 + 1; + if (j * 2 == n) + j *= 2; /* Backtrack to the correct location. */ - while (j != i && eytzinger1_do_cmp(base1, n, size, cmp_func, priv, i + 1, j + 1) >= 0) - j = (j - 1) / 2; + while (j != i && eytzinger1_do_cmp(base1, n, size, cmp_func, priv, i, j) >= 0) + j /= 2; /* Shift the element into its correct place. */ for (k = j; j != i;) { - j = (j - 1) / 2; - eytzinger1_do_swap(base1, n, size, swap_func, priv, j + 1, k + 1); + j /= 2; + eytzinger1_do_swap(base1, n, size, swap_func, priv, j, k); } } /* sort */ - for (i = n - 1; i > 0; --i) { - eytzinger1_do_swap(base1, n, size, swap_func, priv, 1, i + 1); + for (i = n; i > 1; --i) { + eytzinger1_do_swap(base1, n, size, swap_func, priv, 1, i); /* Find the sift-down path all the way to the leaves. */ - for (j = 0; k = j * 2 + 1, k + 1 < i;) - j = eytzinger1_do_cmp(base1, n, size, cmp_func, priv, k + 1, k + 2) > 0 ? k : k + 1; + for (j = 1; k = j * 2, k + 1 < i;) + j = eytzinger1_do_cmp(base1, n, size, cmp_func, priv, k, k + 1) > 0 ? k : k + 1; /* Special case for the last leaf with no sibling. */ - if (j * 2 + 2 == i) - j = j * 2 + 1; + if (j * 2 + 1 == i) + j *= 2; /* Backtrack to the correct location. */ - while (j && eytzinger1_do_cmp(base1, n, size, cmp_func, priv, 1, j + 1) >= 0) - j = (j - 1) / 2; + while (j >= 1 && eytzinger1_do_cmp(base1, n, size, cmp_func, priv, 1, j) >= 0) + j /= 2; /* Shift the element into its correct place. */ - for (k = j; j;) { - j = (j - 1) / 2; - eytzinger1_do_swap(base1, n, size, swap_func, priv, j + 1, k + 1); + for (k = j; j > 1;) { + j /= 2; + eytzinger1_do_swap(base1, n, size, swap_func, priv, j, k); } } } -- 2.51.0 From f27614652cd33184fb8a8464c1b0b893d350b33d Mon Sep 17 00:00:00 2001 From: Andreas Gruenbacher Date: Tue, 28 Jan 2025 18:24:15 +0100 Subject: [PATCH 07/16] bcachefs: eytzinger1_{next,prev} cleanup The eytzinger code was previously relying on the following wrap-around properties and their "eytzinger0" equivalents: eytzinger1_prev(0, size) == eytzinger1_last(size) eytzinger1_next(0, size) == eytzinger1_first(size) However, these properties are no longer relied upon and no longer necessary, so remove the corresponding asserts and forbid the use of eytzinger1_prev(0, size) and eytzinger1_next(0, size). This allows to further simplify the code in eytzinger1_next() and eytzinger1_prev(): where the left shifting happens, eytzinger1_next() is trying to move i to the lowest child on the left, which is equivalent to doubling i until the next doubling would cause it to be greater than size. This is implemented by shifting i to the left so that the most significant bits align and then shifting i to the right by one if the result is greater than size. Likewise, eytzinger1_prev() is trying to move to the lowest child on the right; the same applies here. The 1-offset in (size - 1) in eytzinger1_next() isn't needed at all, but the equivalent offset in eytzinger1_prev() is surprisingly needed to preserve the 'eytzinger1_prev(0, size) == eytzinger1_last(size)' property. However, since we no longer support that property, we can get rid of these offsets as well. This saves one addition in each function and makes the code less confusing. 
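For intuition, here is the simplified eytzinger1_next() from the hunk below transcribed into a standalone sketch, with GCC/Clang builtins standing in for the kernel's __fls() and ffz() (fls_u() returns the 1-based position of the top set bit):

static inline unsigned fls_u(unsigned x)
{
	return x ? 32 - __builtin_clz(x) : 0;
}

/*
 * In-order successor of 1-based eytzinger node i, for 1 <= i <= size;
 * returns 0 when i is the last node in sorted order.
 */
static unsigned eytzinger1_next_sketch(unsigned i, unsigned size)
{
	if (2 * i + 1 <= size) {		/* right child exists */
		/* leftmost node of the right subtree: align i's top bit
		 * with size's, backing off one level on overshoot */
		i = 2 * i + 1;
		i <<= fls_u(size) - fls_u(i);
		i >>= i > size;
	} else {
		/* climb while i is a right child (trailing 1 bits),
		 * then one step to the parent reached from the left */
		i >>= __builtin_ctz(~i) + 1;	/* ffz(i) + 1 */
	}
	return i;
}

For size 6 the sorted order of nodes is 4, 2, 5, 1, 6, 3: the successor of node 1 is node 6 (descend to 3, shift left to 6), and the successor of node 3 is 0, because the path bits are all ones and the shift runs off the top.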
Signed-off-by: Andreas Gruenbacher Signed-off-by: Kent Overstreet --- fs/bcachefs/eytzinger.h | 18 ++++-------------- fs/bcachefs/util.c | 12 ------------ 2 files changed, 4 insertions(+), 26 deletions(-) diff --git a/fs/bcachefs/eytzinger.h b/fs/bcachefs/eytzinger.h index 90cd5648b177..643c1f716061 100644 --- a/fs/bcachefs/eytzinger.h +++ b/fs/bcachefs/eytzinger.h @@ -57,24 +57,14 @@ static inline unsigned eytzinger1_last(unsigned size) return rounddown_pow_of_two(size + 1) - 1; } -/* - * eytzinger1_next() and eytzinger1_prev() have the nice properties that - * - * eytzinger1_next(0) == eytzinger1_first()) - * eytzinger1_prev(0) == eytzinger1_last()) - * - * eytzinger1_prev(eytzinger1_first()) == 0 - * eytzinger1_next(eytzinger1_last()) == 0 - */ - static inline unsigned eytzinger1_next(unsigned i, unsigned size) { - EYTZINGER_BUG_ON(i > size); + EYTZINGER_BUG_ON(i == 0 || i > size); if (eytzinger1_right_child(i) <= size) { i = eytzinger1_right_child(i); - i <<= __fls(size + 1) - __fls(i); + i <<= __fls(size) - __fls(i); i >>= i > size; } else { i >>= ffz(i) + 1; @@ -85,12 +75,12 @@ static inline unsigned eytzinger1_next(unsigned i, unsigned size) static inline unsigned eytzinger1_prev(unsigned i, unsigned size) { - EYTZINGER_BUG_ON(i > size); + EYTZINGER_BUG_ON(i == 0 || i > size); if (eytzinger1_left_child(i) <= size) { i = eytzinger1_left_child(i) + 1; - i <<= __fls(size + 1) - __fls(i); + i <<= __fls(size) - __fls(i); i -= 1; i >>= i > size; } else { diff --git a/fs/bcachefs/util.c b/fs/bcachefs/util.c index 525734528f35..a7edbcca1a84 100644 --- a/fs/bcachefs/util.c +++ b/fs/bcachefs/util.c @@ -719,12 +719,6 @@ void eytzinger1_test(void) if (!(size % 4096)) pr_info("tree size %u\n", size); - BUG_ON(eytzinger1_prev(0, size) != eytzinger1_last(size)); - BUG_ON(eytzinger1_next(0, size) != eytzinger1_first(size)); - - BUG_ON(eytzinger1_prev(eytzinger1_first(size), size) != 0); - BUG_ON(eytzinger1_next(eytzinger1_last(size), size) != 0); - inorder = 1; eytzinger1_for_each(eytz, size) { BUG_ON(__inorder_to_eytzinger1(inorder, size, extra) != eytz); @@ -753,12 +747,6 @@ void eytzinger0_test(void) if (!(size % 4096)) pr_info("tree size %u\n", size); - BUG_ON(eytzinger0_prev(-1, size) != eytzinger0_last(size)); - BUG_ON(eytzinger0_next(-1, size) != eytzinger0_first(size)); - - BUG_ON(eytzinger0_prev(eytzinger0_first(size), size) != -1); - BUG_ON(eytzinger0_next(eytzinger0_last(size), size) != -1); - inorder = 0; eytzinger0_for_each(eytz, size) { BUG_ON(__inorder_to_eytzinger0(inorder, size, extra) != eytz); -- 2.51.0 From 3faa4647a0c3fd0e27e966a8c72ab9863014d518 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 10 Feb 2025 11:55:33 -0500 Subject: [PATCH 08/16] bcachefs: metadata_target is not an inode option This option only applies filesystem wide. 
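For readers without opts.h in front of them: the option table is an x-macro list, and the OPT_* mask on each row is the single source of truth for where an option may be set, so dropping OPT_INODE from the row below is the whole fix. A stripped-down illustration of the mechanism (names and rows simplified, not the real table):

#define OPT_FS		(1 << 0)	/* settable filesystem-wide */
#define OPT_INODE	(1 << 1)	/* settable per inode */

#define EXAMPLE_OPTS()				\
	x(metadata_target, OPT_FS)		\
	x(data_checksum,   OPT_FS|OPT_INODE)

enum example_opt_id {
#define x(_name, _flags)	Opt_##_name,
	EXAMPLE_OPTS()
#undef x
};

static const unsigned example_opt_flags[] = {
#define x(_name, _flags)	[Opt_##_name] = (_flags),
	EXAMPLE_OPTS()
#undef x
};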
Signed-off-by: Kent Overstreet --- fs/bcachefs/opts.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/bcachefs/opts.h b/fs/bcachefs/opts.h index 9d397fc2a1f0..071a92ec8a14 100644 --- a/fs/bcachefs/opts.h +++ b/fs/bcachefs/opts.h @@ -197,7 +197,7 @@ enum fsck_err_opts { BCH_SB_STR_HASH_TYPE, BCH_STR_HASH_OPT_siphash, \ NULL, "Hash function for directory entries and xattrs")\ x(metadata_target, u16, \ - OPT_FS|OPT_INODE|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ + OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ OPT_FN(bch2_opt_target), \ BCH_SB_METADATA_TARGET, 0, \ "(target)", "Device or label for metadata writes") \ -- 2.51.0 From 1ccbcd320577271c85d9a5bfbdd3394cb9baadb3 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 10 Feb 2025 17:04:08 -0500 Subject: [PATCH 09/16] bcachefs: bch2_write_op_error() now prints info about data update A user has been seeing the "error verifying existing checksum while rewriting existing data (memory corruption?)" error. This generally indicates a hardware issue (and that may be the case here), but it might also indicate a bug, in which case we need more information to look for patterns. Reported-by: Roland Vet Signed-off-by: Kent Overstreet --- fs/bcachefs/compress.c | 8 ++-- fs/bcachefs/error.c | 6 +++ fs/bcachefs/error.h | 1 + fs/bcachefs/io_write.c | 92 ++++++++++++++++++++++++++++-------------- fs/bcachefs/io_write.h | 8 +++- 5 files changed, 80 insertions(+), 35 deletions(-) diff --git a/fs/bcachefs/compress.c b/fs/bcachefs/compress.c index 114bf2f3879f..31467f77930f 100644 --- a/fs/bcachefs/compress.c +++ b/fs/bcachefs/compress.c @@ -271,8 +271,8 @@ int bch2_bio_uncompress_inplace(struct bch_write_op *op, if (crc->uncompressed_size << 9 > c->opts.encoded_extent_max || crc->compressed_size << 9 > c->opts.encoded_extent_max) { struct printbuf buf = PRINTBUF; - bch2_write_op_error(&buf, op); - prt_printf(&buf, "error rewriting existing data: extent too big"); + bch2_write_op_error(&buf, op, op->pos.offset, + "extent too big to decompress"); bch_err_ratelimited(c, "%s", buf.buf); printbuf_exit(&buf); return -EIO; @@ -283,8 +283,8 @@ int bch2_bio_uncompress_inplace(struct bch_write_op *op, if (__bio_uncompress(c, bio, data.b, *crc)) { if (!c->opts.no_data_io) { struct printbuf buf = PRINTBUF; - bch2_write_op_error(&buf, op); - prt_printf(&buf, "error rewriting existing data: decompression error"); + bch2_write_op_error(&buf, op, op->pos.offset, + "decompression error"); bch_err_ratelimited(c, "%s", buf.buf); printbuf_exit(&buf); } diff --git a/fs/bcachefs/error.c b/fs/bcachefs/error.c index c8fc58fab958..3f93a5a6bbfa 100644 --- a/fs/bcachefs/error.c +++ b/fs/bcachefs/error.c @@ -580,3 +580,9 @@ int bch2_inum_snap_offset_err_msg_trans(struct btree_trans *trans, struct printb prt_printf(out, " offset %llu: ", pos.offset << 8); return 0; } + +void bch2_inum_snap_offset_err_msg(struct bch_fs *c, struct printbuf *out, + struct bpos pos) +{ + bch2_trans_do(c, bch2_inum_snap_offset_err_msg_trans(trans, out, pos)); +} diff --git a/fs/bcachefs/error.h b/fs/bcachefs/error.h index 76da0e88cee8..b3cc69f29fd9 100644 --- a/fs/bcachefs/error.h +++ b/fs/bcachefs/error.h @@ -243,5 +243,6 @@ int bch2_inum_offset_err_msg_trans(struct btree_trans *, struct printbuf *, subv void bch2_inum_offset_err_msg(struct bch_fs *, struct printbuf *, subvol_inum, u64); int bch2_inum_snap_offset_err_msg_trans(struct btree_trans *, struct printbuf *, struct bpos); +void bch2_inum_snap_offset_err_msg(struct bch_fs *, struct printbuf *, struct bpos); #endif /* 
_BCACHEFS_ERROR_H */ diff --git a/fs/bcachefs/io_write.c b/fs/bcachefs/io_write.c index 076e39474610..738bdbfbdb14 100644 --- a/fs/bcachefs/io_write.c +++ b/fs/bcachefs/io_write.c @@ -396,29 +396,61 @@ static int bch2_write_index_default(struct bch_write_op *op) /* Writes */ -static void __bch2_write_op_error(struct printbuf *out, struct bch_write_op *op, - u64 offset) +void bch2_write_op_error_trans(struct btree_trans *trans, struct printbuf *out, + struct bch_write_op *op, u64 offset, const char *fmt, ...) { - bch2_inum_offset_err_msg(op->c, out, - (subvol_inum) { op->subvol, op->pos.inode, }, - offset << 9); - prt_printf(out, "write error%s: ", - op->flags & BCH_WRITE_move ? "(internal move)" : ""); -} + if (op->subvol) + lockrestart_do(trans, + bch2_inum_offset_err_msg_trans(trans, out, + (subvol_inum) { op->subvol, op->pos.inode, }, + offset << 9)); + else { + struct bpos pos = op->pos; + pos.offset = offset; + lockrestart_do(trans, bch2_inum_snap_offset_err_msg_trans(trans, out, pos)); + } -void bch2_write_op_error(struct printbuf *out, struct bch_write_op *op) -{ - __bch2_write_op_error(out, op, op->pos.offset); + prt_str(out, "write error: "); + + va_list args; + va_start(args, fmt); + prt_vprintf(out, fmt, args); + va_end(args); + + if (op->flags & BCH_WRITE_move) { + struct data_update *u = container_of(op, struct data_update, op); + + prt_printf(out, "\n from internal move "); + bch2_bkey_val_to_text(out, op->c, bkey_i_to_s_c(u->k.k)); + } } -static void bch2_write_op_error_trans(struct btree_trans *trans, struct printbuf *out, - struct bch_write_op *op, u64 offset) +void bch2_write_op_error(struct printbuf *out, struct bch_write_op *op, u64 offset, + const char *fmt, ...) { - bch2_inum_offset_err_msg_trans(trans, out, - (subvol_inum) { op->subvol, op->pos.inode, }, - offset << 9); - prt_printf(out, "write error%s: ", - op->flags & BCH_WRITE_move ? 
"(internal move)" : ""); + if (op->subvol) + bch2_inum_offset_err_msg(op->c, out, + (subvol_inum) { op->subvol, op->pos.inode, }, + offset << 9); + else { + struct bpos pos = op->pos; + pos.offset = offset; + bch2_inum_snap_offset_err_msg(op->c, out, pos); + } + + prt_str(out, "write error: "); + + va_list args; + va_start(args, fmt); + prt_vprintf(out, fmt, args); + va_end(args); + + if (op->flags & BCH_WRITE_move) { + struct data_update *u = container_of(op, struct data_update, op); + + prt_printf(out, "\n from internal move "); + bch2_bkey_val_to_text(out, op->c, bkey_i_to_s_c(u->k.k)); + } } void bch2_submit_wbio_replicas(struct bch_write_bio *wbio, struct bch_fs *c, @@ -561,8 +593,8 @@ static void __bch2_write_index(struct bch_write_op *op) struct bkey_i *insert = bch2_keylist_front(&op->insert_keys); struct printbuf buf = PRINTBUF; - __bch2_write_op_error(&buf, op, bkey_start_offset(&insert->k)); - prt_printf(&buf, "btree update error: %s", bch2_err_str(ret)); + bch2_write_op_error(&buf, op, bkey_start_offset(&insert->k), + "btree update error: %s", bch2_err_str(ret)); bch_err_ratelimited(c, "%s", buf.buf); printbuf_exit(&buf); } @@ -1114,8 +1146,8 @@ do_write: csum_err: { struct printbuf buf = PRINTBUF; - bch2_write_op_error(&buf, op); - prt_printf(&buf, "error verifying existing checksum while rewriting existing data (memory corruption?)"); + bch2_write_op_error(&buf, op, op->pos.offset, + "error verifying existing checksum while rewriting existing data (memory corruption?)"); bch_err_ratelimited(c, "%s", buf.buf); printbuf_exit(&buf); } @@ -1211,8 +1243,8 @@ static void bch2_nocow_write_convert_unwritten(struct bch_write_op *op) struct bkey_i *insert = bch2_keylist_front(&op->insert_keys); struct printbuf buf = PRINTBUF; - bch2_write_op_error_trans(trans, &buf, op, bkey_start_offset(&insert->k)); - prt_printf(&buf, "btree update error: %s", bch2_err_str(ret)); + bch2_write_op_error_trans(trans, &buf, op, bkey_start_offset(&insert->k), + "btree update error: %s", bch2_err_str(ret)); bch_err_ratelimited(c, "%s", buf.buf); printbuf_exit(&buf); } @@ -1379,8 +1411,8 @@ err: if (ret) { struct printbuf buf = PRINTBUF; - bch2_write_op_error(&buf, op); - prt_printf(&buf, "%s(): btree lookup error: %s", __func__, bch2_err_str(ret)); + bch2_write_op_error(&buf, op, op->pos.offset, + "%s(): btree lookup error: %s", __func__, bch2_err_str(ret)); bch_err_ratelimited(c, "%s", buf.buf); printbuf_exit(&buf); op->error = ret; @@ -1502,8 +1534,8 @@ err: if (unlikely(ret < 0)) { if (!(op->flags & BCH_WRITE_alloc_nowait)) { struct printbuf buf = PRINTBUF; - bch2_write_op_error(&buf, op); - prt_printf(&buf, "%s(): %s", __func__, bch2_err_str(ret)); + bch2_write_op_error(&buf, op, op->pos.offset, + "%s(): %s", __func__, bch2_err_str(ret)); bch_err_ratelimited(c, "%s", buf.buf); printbuf_exit(&buf); } @@ -1634,8 +1666,8 @@ CLOSURE_CALLBACK(bch2_write) if (unlikely(bio->bi_iter.bi_size & (c->opts.block_size - 1))) { struct printbuf buf = PRINTBUF; - bch2_write_op_error(&buf, op); - prt_printf(&buf, "misaligned write"); + bch2_write_op_error(&buf, op, op->pos.offset, + "misaligned write"); printbuf_exit(&buf); op->error = -EIO; goto err; diff --git a/fs/bcachefs/io_write.h b/fs/bcachefs/io_write.h index 02cca52be0bd..bf942566a8eb 100644 --- a/fs/bcachefs/io_write.h +++ b/fs/bcachefs/io_write.h @@ -20,7 +20,13 @@ static inline void bch2_latency_acct(struct bch_dev *ca, u64 submit_time, int rw void bch2_submit_wbio_replicas(struct bch_write_bio *, struct bch_fs *, enum bch_data_type, const struct bkey_i *, 
bool); -void bch2_write_op_error(struct printbuf *out, struct bch_write_op *op); +__printf(5, 6) +void bch2_write_op_error_trans(struct btree_trans *trans, struct printbuf *out, + struct bch_write_op *op, u64, const char *, ...); + +__printf(4, 5) +void bch2_write_op_error(struct printbuf *out, struct bch_write_op *op, u64, + const char *, ...); #define BCH_WRITE_FLAGS() \ x(alloc_nowait) \ -- 2.51.0 From cb87f623c1ef5320431989c5215f9d46f2bc2a6f Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 12 Feb 2025 09:47:39 -0500 Subject: [PATCH 10/16] bcachefs: minor journal errcode cleanup Signed-off-by: Kent Overstreet --- fs/bcachefs/journal.c | 2 +- fs/bcachefs/journal_io.c | 7 +++---- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c index 40d3ad5a1e5c..8d4f3bfaa228 100644 --- a/fs/bcachefs/journal.c +++ b/fs/bcachefs/journal.c @@ -981,7 +981,7 @@ int bch2_journal_meta(struct journal *j) struct bch_fs *c = container_of(j, struct bch_fs, journal); if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_journal)) - return -EROFS; + return -BCH_ERR_erofs_no_writes; int ret = __bch2_journal_meta(j); bch2_write_ref_put(c, BCH_WRITE_REF_journal); diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c index 61f71e7baff2..7d59ccc07315 100644 --- a/fs/bcachefs/journal_io.c +++ b/fs/bcachefs/journal_io.c @@ -1515,7 +1515,7 @@ static void __journal_write_alloc(struct journal *j, * @j: journal object * @w: journal buf (entry to be written) * - * Returns: 0 on success, or -EROFS on failure + * Returns: 0 on success, or -BCH_ERR_insufficient_devices on failure */ static int journal_write_alloc(struct journal *j, struct journal_buf *w) { @@ -1624,8 +1624,7 @@ static CLOSURE_CALLBACK(journal_write_done) } else { bch2_devlist_to_replicas(&replicas.e, BCH_DATA_journal, w->devs_written); - if (bch2_mark_replicas(c, &replicas.e)) - err = -EIO; + err = bch2_mark_replicas(c, &replicas.e); } if (err) @@ -1988,7 +1987,7 @@ static int bch2_journal_write_pick_flush(struct journal *j, struct journal_buf * * write anything at all. */ if (error && test_bit(JOURNAL_need_flush_write, &j->flags)) - return -EIO; + return error; if (error || w->noflush || -- 2.51.0 From e1304967078c17af2520402e3e53a48c1e001072 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 10 Feb 2025 18:37:50 -0500 Subject: [PATCH 11/16] bcachefs: bch2_lru_change() checks for no-op Minor cleanup, no reason for the caller to have to do this.
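The shape of the cleanup, reduced to a minimal sketch with hypothetical names (the real signatures are in the lru.h hunk below): the old/new comparison moves out of every caller and into an inline wrapper, so only genuine changes pay for the out-of-line call.

#include <stdint.h>

struct ctx;

int __update_lru(struct ctx *c, uint64_t old_time, uint64_t new_time);

static inline int update_lru(struct ctx *c, uint64_t old_time, uint64_t new_time)
{
	/* filter no-ops once, here, instead of at every call site */
	return old_time != new_time
		? __update_lru(c, old_time, new_time)
		: 0;
}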
Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_background.c | 32 +++++++++++++------------------- fs/bcachefs/lru.c | 6 +++--- fs/bcachefs/lru.h | 11 ++++++++++- 3 files changed, 26 insertions(+), 23 deletions(-) diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index a35455802280..e1061524bdf5 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -889,26 +889,20 @@ int bch2_trigger_alloc(struct btree_trans *trans, !new_a->io_time[READ]) new_a->io_time[READ] = bch2_current_io_time(c, READ); - u64 old_lru = alloc_lru_idx_read(*old_a); - u64 new_lru = alloc_lru_idx_read(*new_a); - if (old_lru != new_lru) { - ret = bch2_lru_change(trans, new.k->p.inode, - bucket_to_u64(new.k->p), - old_lru, new_lru); - if (ret) - goto err; - } + ret = bch2_lru_change(trans, new.k->p.inode, + bucket_to_u64(new.k->p), + alloc_lru_idx_read(*old_a), + alloc_lru_idx_read(*new_a)); + if (ret) + goto err; - old_lru = alloc_lru_idx_fragmentation(*old_a, ca); - new_lru = alloc_lru_idx_fragmentation(*new_a, ca); - if (old_lru != new_lru) { - ret = bch2_lru_change(trans, - BCH_LRU_FRAGMENTATION_START, - bucket_to_u64(new.k->p), - old_lru, new_lru); - if (ret) - goto err; - } + ret = bch2_lru_change(trans, + BCH_LRU_FRAGMENTATION_START, + bucket_to_u64(new.k->p), + alloc_lru_idx_fragmentation(*old_a, ca), + alloc_lru_idx_fragmentation(*new_a, ca)); + if (ret) + goto err; if (old_a->gen != new_a->gen) { ret = bch2_bucket_gen_update(trans, new.k->p, new_a->gen); diff --git a/fs/bcachefs/lru.c b/fs/bcachefs/lru.c index ce794d55818f..8ec16ae8daa6 100644 --- a/fs/bcachefs/lru.c +++ b/fs/bcachefs/lru.c @@ -59,9 +59,9 @@ int bch2_lru_set(struct btree_trans *trans, u16 lru_id, u64 dev_bucket, u64 time return __bch2_lru_set(trans, lru_id, dev_bucket, time, KEY_TYPE_set); } -int bch2_lru_change(struct btree_trans *trans, - u16 lru_id, u64 dev_bucket, - u64 old_time, u64 new_time) +int __bch2_lru_change(struct btree_trans *trans, + u16 lru_id, u64 dev_bucket, + u64 old_time, u64 new_time) { if (old_time == new_time) return 0; diff --git a/fs/bcachefs/lru.h b/fs/bcachefs/lru.h index f31a6cf1514c..2facc0758cb3 100644 --- a/fs/bcachefs/lru.h +++ b/fs/bcachefs/lru.h @@ -46,7 +46,16 @@ void bch2_lru_pos_to_text(struct printbuf *, struct bpos); int bch2_lru_del(struct btree_trans *, u16, u64, u64); int bch2_lru_set(struct btree_trans *, u16, u64, u64); -int bch2_lru_change(struct btree_trans *, u16, u64, u64, u64); +int __bch2_lru_change(struct btree_trans *, u16, u64, u64, u64); + +static inline int bch2_lru_change(struct btree_trans *trans, + u16 lru_id, u64 dev_bucket, + u64 old_time, u64 new_time) +{ + return old_time != new_time + ? __bch2_lru_change(trans, lru_id, dev_bucket, old_time, new_time) + : 0; +} struct bkey_buf; int bch2_lru_check_set(struct btree_trans *, u16, u64, struct bkey_s_c, struct bkey_buf *); -- 2.51.0 From b8e37c1645e96348adcfe48786f6f46930048914 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 10 Feb 2025 18:39:50 -0500 Subject: [PATCH 12/16] bcachefs: s/BCH_LRU_FRAGMENTATION_START/BCH_LRU_BUCKET_FRAGMENTATION/ FRAGMENTATION_START was incorrect, there's currently only one fragmentation LRU (at the end of the reserved bits for LRU type), and we're getting ready to add a stripe fragmentation lru - so give it a better name. 
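The renamed constant is unchanged in value — still the all-ones 16-bit LRU id. The one packing fact this series leans on is visible in the lru.h hunk below, where lru_type() reads the id out of the top 16 bits of the key's inode field; a minimal sketch of just that extraction (the rest of the LRU key layout is not shown in this series, so it is deliberately omitted here):

#include <stdint.h>

#define BCH_LRU_BUCKET_FRAGMENTATION	((1U << 16) - 1)

static inline uint16_t lru_id(uint64_t key_inode)
{
	return key_inode >> 48;		/* as in lru_type(): p.inode >> 48 */
}

static inline int is_bucket_fragmentation_lru(uint64_t key_inode)
{
	return lru_id(key_inode) == BCH_LRU_BUCKET_FRAGMENTATION;
}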
Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_background.c | 4 ++-- fs/bcachefs/lru.h | 2 +- fs/bcachefs/lru_format.h | 2 +- fs/bcachefs/movinggc.c | 4 ++-- 4 files changed, 6 insertions(+), 6 deletions(-) diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index e1061524bdf5..87ff50a3cd81 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -897,7 +897,7 @@ int bch2_trigger_alloc(struct btree_trans *trans, goto err; ret = bch2_lru_change(trans, - BCH_LRU_FRAGMENTATION_START, + BCH_LRU_BUCKET_FRAGMENTATION, bucket_to_u64(new.k->p), alloc_lru_idx_fragmentation(*old_a, ca), alloc_lru_idx_fragmentation(*new_a, ca)); @@ -1699,7 +1699,7 @@ static int bch2_check_alloc_to_lru_ref(struct btree_trans *trans, u64 lru_idx = alloc_lru_idx_fragmentation(*a, ca); if (lru_idx) { - ret = bch2_lru_check_set(trans, BCH_LRU_FRAGMENTATION_START, + ret = bch2_lru_check_set(trans, BCH_LRU_BUCKET_FRAGMENTATION, lru_idx, alloc_k, last_flushed); if (ret) goto err; diff --git a/fs/bcachefs/lru.h b/fs/bcachefs/lru.h index 2facc0758cb3..398cc25db459 100644 --- a/fs/bcachefs/lru.h +++ b/fs/bcachefs/lru.h @@ -28,7 +28,7 @@ static inline enum bch_lru_type lru_type(struct bkey_s_c l) { u16 lru_id = l.k->p.inode >> 48; - if (lru_id == BCH_LRU_FRAGMENTATION_START) + if (lru_id == BCH_LRU_BUCKET_FRAGMENTATION) return BCH_LRU_fragmentation; return BCH_LRU_read; } diff --git a/fs/bcachefs/lru_format.h b/fs/bcachefs/lru_format.h index f372cb3b8cda..353a352d3fb9 100644 --- a/fs/bcachefs/lru_format.h +++ b/fs/bcachefs/lru_format.h @@ -17,7 +17,7 @@ enum bch_lru_type { #undef x }; -#define BCH_LRU_FRAGMENTATION_START ((1U << 16) - 1) +#define BCH_LRU_BUCKET_FRAGMENTATION ((1U << 16) - 1) #define LRU_TIME_BITS 48 #define LRU_TIME_MAX ((1ULL << LRU_TIME_BITS) - 1) diff --git a/fs/bcachefs/movinggc.c b/fs/bcachefs/movinggc.c index 6718dc37c5a3..fa19fc44622c 100644 --- a/fs/bcachefs/movinggc.c +++ b/fs/bcachefs/movinggc.c @@ -167,8 +167,8 @@ static int bch2_copygc_get_buckets(struct moving_context *ctxt, bch2_trans_begin(trans); ret = for_each_btree_key_max(trans, iter, BTREE_ID_lru, - lru_pos(BCH_LRU_FRAGMENTATION_START, 0, 0), - lru_pos(BCH_LRU_FRAGMENTATION_START, U64_MAX, LRU_TIME_MAX), + lru_pos(BCH_LRU_BUCKET_FRAGMENTATION, 0, 0), + lru_pos(BCH_LRU_BUCKET_FRAGMENTATION, U64_MAX, LRU_TIME_MAX), 0, k, ({ struct move_bucket b = { .k.bucket = u64_to_bucket(k.k->p.offset) }; int ret2 = 0; -- 2.51.0 From 3aff608b86440a7fd1a5486c90124f1963f6d4dc Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 10 Feb 2025 18:42:45 -0500 Subject: [PATCH 13/16] bcachefs: decouple bch2_lru_check_set() from alloc btree Pass in the backpointer explicitly, instead of assuming 'referring_k' is an alloc key and calculating it. 
Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_background.c | 5 ++++- fs/bcachefs/lru.c | 10 +++++----- fs/bcachefs/lru.h | 2 +- 3 files changed, 10 insertions(+), 7 deletions(-) diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index 87ff50a3cd81..58cdb6a0acf9 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -1700,6 +1700,7 @@ static int bch2_check_alloc_to_lru_ref(struct btree_trans *trans, u64 lru_idx = alloc_lru_idx_fragmentation(*a, ca); if (lru_idx) { ret = bch2_lru_check_set(trans, BCH_LRU_BUCKET_FRAGMENTATION, + bucket_to_u64(alloc_k.k->p), lru_idx, alloc_k, last_flushed); if (ret) goto err; @@ -1729,7 +1730,9 @@ static int bch2_check_alloc_to_lru_ref(struct btree_trans *trans, a = &a_mut->v; } - ret = bch2_lru_check_set(trans, alloc_k.k->p.inode, a->io_time[READ], + ret = bch2_lru_check_set(trans, alloc_k.k->p.inode, + bucket_to_u64(alloc_k.k->p), + a->io_time[READ], alloc_k, last_flushed); if (ret) goto err; diff --git a/fs/bcachefs/lru.c b/fs/bcachefs/lru.c index 8ec16ae8daa6..dc6b9a80a8b5 100644 --- a/fs/bcachefs/lru.c +++ b/fs/bcachefs/lru.c @@ -78,7 +78,9 @@ static const char * const bch2_lru_types[] = { }; int bch2_lru_check_set(struct btree_trans *trans, - u16 lru_id, u64 time, + u16 lru_id, + u64 dev_bucket, + u64 time, struct bkey_s_c referring_k, struct bkey_buf *last_flushed) { @@ -87,9 +89,7 @@ int bch2_lru_check_set(struct btree_trans *trans, struct btree_iter lru_iter; struct bkey_s_c lru_k = bch2_bkey_get_iter(trans, &lru_iter, BTREE_ID_lru, - lru_pos(lru_id, - bucket_to_u64(referring_k.k->p), - time), 0); + lru_pos(lru_id, dev_bucket, time), 0); int ret = bkey_err(lru_k); if (ret) return ret; @@ -104,7 +104,7 @@ int bch2_lru_check_set(struct btree_trans *trans, " %s", bch2_lru_types[lru_type(lru_k)], (bch2_bkey_val_to_text(&buf, c, referring_k), buf.buf))) { - ret = bch2_lru_set(trans, lru_id, bucket_to_u64(referring_k.k->p), time); + ret = bch2_lru_set(trans, lru_id, dev_bucket, time); if (ret) goto err; } diff --git a/fs/bcachefs/lru.h b/fs/bcachefs/lru.h index 398cc25db459..dea1d75cc9c1 100644 --- a/fs/bcachefs/lru.h +++ b/fs/bcachefs/lru.h @@ -58,7 +58,7 @@ static inline int bch2_lru_change(struct btree_trans *trans, } struct bkey_buf; -int bch2_lru_check_set(struct btree_trans *, u16, u64, struct bkey_s_c, struct bkey_buf *); +int bch2_lru_check_set(struct btree_trans *, u16, u64, u64, struct bkey_s_c, struct bkey_buf *); int bch2_check_lrus(struct bch_fs *); -- 2.51.0 From bc76ba70d213ea6a84a824c3bd5c4100900b18cc Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 10 Feb 2025 18:48:12 -0500 Subject: [PATCH 14/16] bcachefs: Rework bch2_check_lru_key() It's now easier to add new LRU types. 
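To make the claim concrete: with the dispatch below, a new LRU type costs one case in each switch. As a hypothetical example — BCH_LRU_stripe_fragmentation and its mapping are invented here, not part of this series — a stripe fragmentation LRU keyed by stripe index might slot into lru_pos_to_bp() like this, with bkey_lru_type_idx() growing a matching case:

static struct bbpos lru_pos_to_bp(struct bkey_s_c lru_k)
{
	switch (lru_type(lru_k)) {
	case BCH_LRU_read:
	case BCH_LRU_fragmentation:
		return BBPOS(BTREE_ID_alloc, u64_to_bucket(lru_k.k->p.offset));
	case BCH_LRU_stripe_fragmentation:	/* hypothetical new type */
		return BBPOS(BTREE_ID_stripes, POS(0, lru_k.k->p.offset));
	default:
		BUG();
	}
}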
Signed-off-by: Kent Overstreet --- fs/bcachefs/lru.c | 77 +++++++++++++++++++++++++++++------------------ 1 file changed, 47 insertions(+), 30 deletions(-) diff --git a/fs/bcachefs/lru.c b/fs/bcachefs/lru.c index dc6b9a80a8b5..98ab8496f29d 100644 --- a/fs/bcachefs/lru.c +++ b/fs/bcachefs/lru.c @@ -116,49 +116,67 @@ fsck_err: return ret; } +static struct bbpos lru_pos_to_bp(struct bkey_s_c lru_k) +{ + enum bch_lru_type type = lru_type(lru_k); + + switch (type) { + case BCH_LRU_read: + case BCH_LRU_fragmentation: + return BBPOS(BTREE_ID_alloc, u64_to_bucket(lru_k.k->p.offset)); + default: + BUG(); + } +} + +static u64 bkey_lru_type_idx(struct bch_fs *c, + enum bch_lru_type type, + struct bkey_s_c k) +{ + struct bch_alloc_v4 a_convert; + const struct bch_alloc_v4 *a; + + switch (type) { + case BCH_LRU_read: + a = bch2_alloc_to_v4(k, &a_convert); + return alloc_lru_idx_read(*a); + case BCH_LRU_fragmentation: { + a = bch2_alloc_to_v4(k, &a_convert); + + rcu_read_lock(); + struct bch_dev *ca = bch2_dev_rcu_noerror(c, k.k->p.inode); + u64 idx = ca + ? alloc_lru_idx_fragmentation(*a, ca) + : 0; + rcu_read_unlock(); + return idx; + } + default: + BUG(); + } +} + static int bch2_check_lru_key(struct btree_trans *trans, struct btree_iter *lru_iter, struct bkey_s_c lru_k, struct bkey_buf *last_flushed) { struct bch_fs *c = trans->c; - struct btree_iter iter; - struct bkey_s_c k; - struct bch_alloc_v4 a_convert; - const struct bch_alloc_v4 *a; struct printbuf buf1 = PRINTBUF; struct printbuf buf2 = PRINTBUF; - enum bch_lru_type type = lru_type(lru_k); - struct bpos alloc_pos = u64_to_bucket(lru_k.k->p.offset); - u64 idx; - int ret; - - struct bch_dev *ca = bch2_dev_bucket_tryget_noerror(c, alloc_pos); - if (fsck_err_on(!ca, - trans, lru_entry_to_invalid_bucket, - "lru key points to nonexistent device:bucket %llu:%llu", - alloc_pos.inode, alloc_pos.offset)) - return bch2_btree_bit_mod_buffered(trans, BTREE_ID_lru, lru_iter->pos, false); + struct bbpos bp = lru_pos_to_bp(lru_k); - k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_alloc, alloc_pos, 0); - ret = bkey_err(k); + struct btree_iter iter; + struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, bp.btree, bp.pos, 0); + int ret = bkey_err(k); if (ret) goto err; - a = bch2_alloc_to_v4(k, &a_convert); - - switch (type) { - case BCH_LRU_read: - idx = alloc_lru_idx_read(*a); - break; - case BCH_LRU_fragmentation: - idx = alloc_lru_idx_fragmentation(*a, ca); - break; - } + enum bch_lru_type type = lru_type(lru_k); + u64 idx = bkey_lru_type_idx(c, type, k); - if (lru_k.k->type != KEY_TYPE_set || - lru_pos_time(lru_k.k->p) != idx) { + if (lru_pos_time(lru_k.k->p) != idx) { ret = bch2_btree_write_buffer_maybe_flush(trans, lru_k, last_flushed); if (ret) goto err; @@ -176,7 +194,6 @@ static int bch2_check_lru_key(struct btree_trans *trans, err: fsck_err: bch2_trans_iter_exit(trans, &iter); - bch2_dev_put(ca); printbuf_exit(&buf2); printbuf_exit(&buf1); return ret; -- 2.51.0 From cc297dfb41834f91cf594893dfff7ebe321190eb Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 10 Feb 2025 20:32:37 -0500 Subject: [PATCH 15/16] bcachefs: bch2_trigger_stripe_ptr() no longer uses ec_stripes_heap_lock Introduce per-entry locks, like with struct bucket - the stripes heap is going away. 
Signed-off-by: Kent Overstreet --- fs/bcachefs/buckets.c | 6 +++--- fs/bcachefs/buckets.h | 27 --------------------------- fs/bcachefs/buckets_types.h | 27 +++++++++++++++++++++++++++ fs/bcachefs/ec.h | 14 ++++++++++++++ fs/bcachefs/ec_types.h | 5 ++--- 5 files changed, 46 insertions(+), 33 deletions(-) diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index 345b117a4a4a..88af61bc799d 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -674,10 +674,10 @@ err: return -BCH_ERR_ENOMEM_mark_stripe_ptr; } - mutex_lock(&c->ec_stripes_heap_lock); + gc_stripe_lock(m); if (!m || !m->alive) { - mutex_unlock(&c->ec_stripes_heap_lock); + gc_stripe_unlock(m); struct printbuf buf = PRINTBUF; bch2_bkey_val_to_text(&buf, c, k); bch_err_ratelimited(c, "pointer to nonexistent stripe %llu\n while marking %s", @@ -693,7 +693,7 @@ err: .type = BCH_DISK_ACCOUNTING_replicas, }; memcpy(&acc.replicas, &m->r.e, replicas_entry_bytes(&m->r.e)); - mutex_unlock(&c->ec_stripes_heap_lock); + gc_stripe_unlock(m); acc.replicas.data_type = data_type; int ret = bch2_disk_accounting_mod(trans, &acc, §ors, 1, true); diff --git a/fs/bcachefs/buckets.h b/fs/bcachefs/buckets.h index a9acdd6c0c86..6aeec1c0973c 100644 --- a/fs/bcachefs/buckets.h +++ b/fs/bcachefs/buckets.h @@ -39,33 +39,6 @@ static inline u64 sector_to_bucket_and_offset(const struct bch_dev *ca, sector_t for (_b = (_buckets)->b + (_buckets)->first_bucket; \ _b < (_buckets)->b + (_buckets)->nbuckets; _b++) -/* - * Ugly hack alert: - * - * We need to cram a spinlock in a single byte, because that's what we have left - * in struct bucket, and we care about the size of these - during fsck, we need - * in memory state for every single bucket on every device. - * - * We used to do - * while (xchg(&b->lock, 1) cpu_relax(); - * but, it turns out not all architectures support xchg on a single byte. - * - * So now we use bit_spin_lock(), with fun games since we can't burn a whole - * ulong for this - we just need to make sure the lock bit always ends up in the - * first byte. - */ - -#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ -#define BUCKET_LOCK_BITNR 0 -#else -#define BUCKET_LOCK_BITNR (BITS_PER_LONG - 1) -#endif - -union ulong_byte_assert { - ulong ulong; - u8 byte; -}; - static inline void bucket_unlock(struct bucket *b) { BUILD_BUG_ON(!((union ulong_byte_assert) { .ulong = 1UL << BUCKET_LOCK_BITNR }).byte); diff --git a/fs/bcachefs/buckets_types.h b/fs/bcachefs/buckets_types.h index 7174047b8e92..900b8680c8b5 100644 --- a/fs/bcachefs/buckets_types.h +++ b/fs/bcachefs/buckets_types.h @@ -7,6 +7,33 @@ #define BUCKET_JOURNAL_SEQ_BITS 16 +/* + * Ugly hack alert: + * + * We need to cram a spinlock in a single byte, because that's what we have left + * in struct bucket, and we care about the size of these - during fsck, we need + * in memory state for every single bucket on every device. + * + * We used to do + * while (xchg(&b->lock, 1) cpu_relax(); + * but, it turns out not all architectures support xchg on a single byte. + * + * So now we use bit_spin_lock(), with fun games since we can't burn a whole + * ulong for this - we just need to make sure the lock bit always ends up in the + * first byte. 
+ */ + +#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ +#define BUCKET_LOCK_BITNR 0 +#else +#define BUCKET_LOCK_BITNR (BITS_PER_LONG - 1) +#endif + +union ulong_byte_assert { + ulong ulong; + u8 byte; +}; + struct bucket { u8 lock; u8 gen_valid:1; diff --git a/fs/bcachefs/ec.h b/fs/bcachefs/ec.h index 583ca6a226da..4c9511887655 100644 --- a/fs/bcachefs/ec.h +++ b/fs/bcachefs/ec.h @@ -132,6 +132,20 @@ static inline bool bch2_ptr_matches_stripe_m(const struct gc_stripe *m, m->sectors); } +static inline void gc_stripe_unlock(struct gc_stripe *s) +{ + BUILD_BUG_ON(!((union ulong_byte_assert) { .ulong = 1UL << BUCKET_LOCK_BITNR }).byte); + + clear_bit_unlock(BUCKET_LOCK_BITNR, (void *) &s->lock); + wake_up_bit((void *) &s->lock, BUCKET_LOCK_BITNR); +} + +static inline void gc_stripe_lock(struct gc_stripe *s) +{ + wait_on_bit_lock((void *) &s->lock, BUCKET_LOCK_BITNR, + TASK_UNINTERRUPTIBLE); +} + struct bch_read_bio; struct ec_stripe_buf { diff --git a/fs/bcachefs/ec_types.h b/fs/bcachefs/ec_types.h index 8d1e70e830ac..37558cc2d89f 100644 --- a/fs/bcachefs/ec_types.h +++ b/fs/bcachefs/ec_types.h @@ -20,12 +20,11 @@ struct stripe { }; struct gc_stripe { + u8 lock; + unsigned alive:1; /* does a corresponding key exist in stripes btree? */ u16 sectors; - u8 nr_blocks; u8 nr_redundant; - - unsigned alive:1; /* does a corresponding key exist in stripes btree? */ u16 block_sectors[BCH_BKEY_PTRS_MAX]; struct bch_extent_ptr ptrs[BCH_BKEY_PTRS_MAX]; -- 2.51.0 From c7c07bf250cb0f391656f90bc8b11248df767ed3 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 11 Feb 2025 10:09:31 -0500 Subject: [PATCH 16/16] bcachefs: Better trigger ordering Transactional triggers need to run in a defined ordering, which is not quite the same as btree ID integer comparison. Previously this was handled in a hacky way in bch2_trans_commit_run_triggers(), since it was only the alloc btree that needed special handling, but upcoming stripe btree changes are going to require more ordering changes - so, define that ordering. Next patch will change the transaction commit path to use it. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_types.h | 13 +++++++++++++ fs/bcachefs/btree_update.c | 1 + 2 files changed, 14 insertions(+) diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h index a09cbe9cd94f..77578da2d23f 100644 --- a/fs/bcachefs/btree_types.h +++ b/fs/bcachefs/btree_types.h @@ -423,6 +423,7 @@ static inline struct bpos btree_node_pos(struct btree_bkey_cached_common *b) struct btree_insert_entry { unsigned flags; + u8 sort_order; u8 bkey_type; enum btree_id btree_id:8; u8 level:4; @@ -853,6 +854,18 @@ static inline bool btree_type_uses_write_buffer(enum btree_id btree) return BIT_ULL(btree) & mask; } +static inline u8 btree_trigger_order(enum btree_id btree) +{ + switch (btree) { + case BTREE_ID_alloc: + return U8_MAX; + case BTREE_ID_stripes: + return U8_MAX - 1; + default: + return btree; + } +} + struct btree_root { struct btree *b; diff --git a/fs/bcachefs/btree_update.c b/fs/bcachefs/btree_update.c index 13d794f201a5..47e54eedd0bc 100644 --- a/fs/bcachefs/btree_update.c +++ b/fs/bcachefs/btree_update.c @@ -397,6 +397,7 @@ bch2_trans_update_by_path(struct btree_trans *trans, btree_path_idx_t path_idx, n = (struct btree_insert_entry) { .flags = flags, + .sort_order = btree_trigger_order(path->btree_id), .bkey_type = __btree_node_type(path->level, path->btree_id), .btree_id = path->btree_id, .level = path->level, -- 2.51.0
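This commit only defines the ordering; the follow-up that sorts by it is not included in this series. A sketch of how the sort_order byte would presumably be consumed, with a pared-down stand-in for struct btree_insert_entry (illustrative, not the real commit-path code):

#include <stdint.h>
#include <stdlib.h>

struct entry {
	uint8_t	sort_order;	/* btree_trigger_order(btree_id) */
	uint8_t	btree_id;
};

/*
 * alloc (U8_MAX) sorts last and stripes (U8_MAX - 1) second to last,
 * so their triggers run after those of every other btree.
 */
static int trigger_order_cmp(const void *_l, const void *_r)
{
	const struct entry *l = _l, *r = _r;

	return (l->sort_order > r->sort_order) -
	       (l->sort_order < r->sort_order);
}

/* usage: qsort(updates, nr, sizeof(updates[0]), trigger_order_cmp); */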