From 04d4ce91b0bed4120e0c5fadc5291cebaa9c2a06 Mon Sep 17 00:00:00 2001 From: Hou Tao Date: Fri, 6 Dec 2024 19:06:22 +0800 Subject: [PATCH 01/16] selftests/bpf: Add more test cases for LPM trie Add more test cases for LPM trie in test_maps: 1) test_lpm_trie_update_flags It constructs various use cases for BPF_EXIST and BPF_NOEXIST and checks whether the return value of the update operation is as expected. 2) test_lpm_trie_update_full_map It tests the update operations on a full LPM trie map. Adding a new node will fail and overwriting the value of an existing node will succeed. 3) test_lpm_trie_iterate_strs and test_lpm_trie_iterate_ints These two test cases check whether the iteration through get_next_key returns keys in the expected sorted order. They delete the minimal key after each iteration and check whether the next iteration returns the second minimal key. The only difference between these two test cases is that the former saves strings in the LPM trie and the latter saves integers. Without the fix for get_next_key, these two cases will fail as shown below: test_lpm_trie_iterate_strs(1091):FAIL:iterate #2 got abc exp abS test_lpm_trie_iterate_ints(1142):FAIL:iterate #1 got 0x2 exp 0x1 Signed-off-by: Hou Tao Link: https://lore.kernel.org/r/20241206110622.1161752-10-houtao@huaweicloud.com Signed-off-by: Alexei Starovoitov --- .../bpf/map_tests/lpm_trie_map_basic_ops.c | 395 ++++++++++++++++++ 1 file changed, 395 insertions(+) diff --git a/tools/testing/selftests/bpf/map_tests/lpm_trie_map_basic_ops.c b/tools/testing/selftests/bpf/map_tests/lpm_trie_map_basic_ops.c index f375c89d78a4..d32e4edac930 100644 --- a/tools/testing/selftests/bpf/map_tests/lpm_trie_map_basic_ops.c +++ b/tools/testing/selftests/bpf/map_tests/lpm_trie_map_basic_ops.c @@ -20,10 +20,12 @@ #include #include #include +#include #include #include #include +#include #include "bpf_util.h" @@ -33,6 +35,22 @@ struct tlpm_node { uint8_t key[]; }; +struct lpm_trie_bytes_key { + union { + struct bpf_lpm_trie_key_hdr hdr; + __u32 prefixlen; + }; + unsigned char data[8]; +}; + +struct lpm_trie_int_key { + union { + struct bpf_lpm_trie_key_hdr hdr; + __u32 prefixlen; + }; + unsigned int data; +}; + static struct tlpm_node *tlpm_match(struct tlpm_node *list, const uint8_t *key, size_t n_bits); @@ -770,6 +788,378 @@ static void test_lpm_multi_thread(void) close(map_fd); } +static int lpm_trie_create(unsigned int key_size, unsigned int value_size, unsigned int max_entries) +{ + LIBBPF_OPTS(bpf_map_create_opts, opts); + int fd; + + opts.map_flags = BPF_F_NO_PREALLOC; + fd = bpf_map_create(BPF_MAP_TYPE_LPM_TRIE, "lpm_trie", key_size, value_size, max_entries, + &opts); + CHECK(fd < 0, "bpf_map_create", "error %d\n", errno); + + return fd; +} + +static void test_lpm_trie_update_flags(void) +{ + struct lpm_trie_int_key key; + unsigned int value, got; + int fd, err; + + fd = lpm_trie_create(sizeof(key), sizeof(value), 3); + + /* invalid flags (Error) */ + key.prefixlen = 32; + key.data = 0; + value = 0; + err = bpf_map_update_elem(fd, &key, &value, BPF_F_LOCK); + CHECK(err != -EINVAL, "invalid update flag", "error %d\n", err); + + /* invalid flags (Error) */ + key.prefixlen = 32; + key.data = 0; + value = 0; + err = bpf_map_update_elem(fd, &key, &value, BPF_NOEXIST | BPF_EXIST); + CHECK(err != -EINVAL, "invalid update flag", "error %d\n", err); + + /* overwrite an empty qp-trie (Error) */ + key.prefixlen = 32; + key.data = 0; + value = 2; + err = bpf_map_update_elem(fd, &key, &value, BPF_EXIST); + CHECK(err != -ENOENT, "overwrite empty qp-trie", "error %d\n", 
err); + + /* add a new node */ + key.prefixlen = 16; + key.data = 0; + value = 1; + err = bpf_map_update_elem(fd, &key, &value, BPF_NOEXIST); + CHECK(err, "add new elem", "error %d\n", err); + got = 0; + err = bpf_map_lookup_elem(fd, &key, &got); + CHECK(err, "lookup elem", "error %d\n", err); + CHECK(got != value, "check value", "got %d exp %d\n", got, value); + + /* add the same node as a new node (Error) */ + err = bpf_map_update_elem(fd, &key, &value, BPF_NOEXIST); + CHECK(err != -EEXIST, "add new elem again", "error %d\n", err); + + /* overwrite the existing node */ + value = 4; + err = bpf_map_update_elem(fd, &key, &value, BPF_EXIST); + CHECK(err, "overwrite elem", "error %d\n", err); + got = 0; + err = bpf_map_lookup_elem(fd, &key, &got); + CHECK(err, "lookup elem", "error %d\n", err); + CHECK(got != value, "check value", "got %d exp %d\n", got, value); + + /* overwrite the node */ + value = 1; + err = bpf_map_update_elem(fd, &key, &value, BPF_ANY); + CHECK(err, "update elem", "error %d\n", err); + got = 0; + err = bpf_map_lookup_elem(fd, &key, &got); + CHECK(err, "lookup elem", "error %d\n", err); + CHECK(got != value, "check value", "got %d exp %d\n", got, value); + + /* overwrite a non-existent node which is the prefix of the first + * node (Error). + */ + key.prefixlen = 8; + key.data = 0; + value = 2; + err = bpf_map_update_elem(fd, &key, &value, BPF_EXIST); + CHECK(err != -ENOENT, "overwrite nonexistent elem", "error %d\n", err); + + /* add a new node which is the prefix of the first node */ + err = bpf_map_update_elem(fd, &key, &value, BPF_NOEXIST); + CHECK(err, "add new elem", "error %d\n", err); + got = 0; + err = bpf_map_lookup_elem(fd, &key, &got); + CHECK(err, "lookup key", "error %d\n", err); + CHECK(got != value, "check value", "got %d exp %d\n", got, value); + + /* add another new node which will be the sibling of the first node */ + key.prefixlen = 9; + key.data = htobe32(1 << 23); + value = 5; + err = bpf_map_update_elem(fd, &key, &value, BPF_NOEXIST); + CHECK(err, "add new elem", "error %d\n", err); + got = 0; + err = bpf_map_lookup_elem(fd, &key, &got); + CHECK(err, "lookup key", "error %d\n", err); + CHECK(got != value, "check value", "got %d exp %d\n", got, value); + + /* overwrite the third node */ + value = 3; + err = bpf_map_update_elem(fd, &key, &value, BPF_ANY); + CHECK(err, "overwrite elem", "error %d\n", err); + got = 0; + err = bpf_map_lookup_elem(fd, &key, &got); + CHECK(err, "lookup key", "error %d\n", err); + CHECK(got != value, "check value", "got %d exp %d\n", got, value); + + /* delete the second node to make it an intermediate node */ + key.prefixlen = 8; + key.data = 0; + err = bpf_map_delete_elem(fd, &key); + CHECK(err, "del elem", "error %d\n", err); + + /* overwrite the intermediate node (Error) */ + value = 2; + err = bpf_map_update_elem(fd, &key, &value, BPF_EXIST); + CHECK(err != -ENOENT, "overwrite nonexistent elem", "error %d\n", err); + + close(fd); +} + +static void test_lpm_trie_update_full_map(void) +{ + struct lpm_trie_int_key key; + int value, got; + int fd, err; + + fd = lpm_trie_create(sizeof(key), sizeof(value), 3); + + /* add a new node */ + key.prefixlen = 16; + key.data = 0; + value = 0; + err = bpf_map_update_elem(fd, &key, &value, BPF_NOEXIST); + CHECK(err, "add new elem", "error %d\n", err); + got = 0; + err = bpf_map_lookup_elem(fd, &key, &got); + CHECK(err, "lookup elem", "error %d\n", err); + CHECK(got != value, "check value", "got %d exp %d\n", got, value); + + /* add new node */ + key.prefixlen = 8; + key.data = 0; + 
value = 1; + err = bpf_map_update_elem(fd, &key, &value, BPF_NOEXIST); + CHECK(err, "add new elem", "error %d\n", err); + got = 0; + err = bpf_map_lookup_elem(fd, &key, &got); + CHECK(err, "lookup elem", "error %d\n", err); + CHECK(got != value, "check value", "got %d exp %d\n", got, value); + + /* add new node */ + key.prefixlen = 9; + key.data = htobe32(1 << 23); + value = 2; + err = bpf_map_update_elem(fd, &key, &value, BPF_NOEXIST); + CHECK(err, "add new elem", "error %d\n", err); + got = 0; + err = bpf_map_lookup_elem(fd, &key, &got); + CHECK(err, "lookup elem", "error %d\n", err); + CHECK(got != value, "check value", "got %d exp %d\n", got, value); + + /* try to add one more node (Error) */ + key.prefixlen = 32; + key.data = 0; + value = 3; + err = bpf_map_update_elem(fd, &key, &value, BPF_ANY); + CHECK(err != -ENOSPC, "add to full trie", "error %d\n", err); + + /* update the value of an existing node with BPF_EXIST */ + key.prefixlen = 16; + key.data = 0; + value = 4; + err = bpf_map_update_elem(fd, &key, &value, BPF_EXIST); + CHECK(err, "overwrite elem", "error %d\n", err); + got = 0; + err = bpf_map_lookup_elem(fd, &key, &got); + CHECK(err, "lookup elem", "error %d\n", err); + CHECK(got != value, "check value", "got %d exp %d\n", got, value); + + /* update the value of an existing node with BPF_ANY */ + key.prefixlen = 9; + key.data = htobe32(1 << 23); + value = 5; + err = bpf_map_update_elem(fd, &key, &value, BPF_ANY); + CHECK(err, "overwrite elem", "error %d\n", err); + got = 0; + err = bpf_map_lookup_elem(fd, &key, &got); + CHECK(err, "lookup elem", "error %d\n", err); + CHECK(got != value, "check value", "got %d exp %d\n", got, value); + + close(fd); +} + +static int cmp_str(const void *a, const void *b) +{ + const char *str_a = *(const char **)a, *str_b = *(const char **)b; + + return strcmp(str_a, str_b); +} + +/* Save strings in the LPM trie. The trailing '\0' for each string will be + * accounted for in the prefixlen. The strings returned during the iteration + * should be sorted as expected. + */ +static void test_lpm_trie_iterate_strs(void) +{ + static const char * const keys[] = { + "ab", "abO", "abc", "abo", "abS", "abcd", + }; + const char *sorted_keys[ARRAY_SIZE(keys)]; + struct lpm_trie_bytes_key key, next_key; + unsigned int value, got, i, j, len; + struct lpm_trie_bytes_key *cur; + int fd, err; + + fd = lpm_trie_create(sizeof(key), sizeof(value), ARRAY_SIZE(keys)); + + for (i = 0; i < ARRAY_SIZE(keys); i++) { + unsigned int flags; + + /* add i-th element */ + flags = i % 2 ? BPF_NOEXIST : 0; + len = strlen(keys[i]); + /* include the trailing '\0' */ + key.prefixlen = (len + 1) * 8; + memset(key.data, 0, sizeof(key.data)); + memcpy(key.data, keys[i], len); + value = i + 100; + err = bpf_map_update_elem(fd, &key, &value, flags); + CHECK(err, "add elem", "#%u error %d\n", i, err); + + err = bpf_map_lookup_elem(fd, &key, &got); + CHECK(err, "lookup elem", "#%u error %d\n", i, err); + CHECK(got != value, "lookup elem", "#%u expect %u got %u\n", i, value, got); + + /* re-add i-th element (Error) */ + err = bpf_map_update_elem(fd, &key, &value, BPF_NOEXIST); + CHECK(err != -EEXIST, "re-add elem", "#%u error %d\n", i, err); + + /* Overwrite i-th element */ + flags = i % 2 ? 
0 : BPF_EXIST; + value = i; + err = bpf_map_update_elem(fd, &key, &value, flags); + CHECK(err, "update elem", "error %d\n", err); + + /* Lookup #[0~i] elements */ + for (j = 0; j <= i; j++) { + len = strlen(keys[j]); + key.prefixlen = (len + 1) * 8; + memset(key.data, 0, sizeof(key.data)); + memcpy(key.data, keys[j], len); + err = bpf_map_lookup_elem(fd, &key, &got); + CHECK(err, "lookup elem", "#%u/%u error %d\n", i, j, err); + CHECK(got != j, "lookup elem", "#%u/%u expect %u got %u\n", + i, j, j, got); + } + } + + /* Add element to a full qp-trie (Error) */ + key.prefixlen = sizeof(key.data) * 8; + memset(key.data, 0, sizeof(key.data)); + value = 0; + err = bpf_map_update_elem(fd, &key, &value, 0); + CHECK(err != -ENOSPC, "add to full qp-trie", "error %d\n", err); + + /* Iterate sorted elements: no deletion */ + memcpy(sorted_keys, keys, sizeof(keys)); + qsort(sorted_keys, ARRAY_SIZE(sorted_keys), sizeof(sorted_keys[0]), cmp_str); + cur = NULL; + for (i = 0; i < ARRAY_SIZE(sorted_keys); i++) { + len = strlen(sorted_keys[i]); + err = bpf_map_get_next_key(fd, cur, &next_key); + CHECK(err, "iterate", "#%u error %d\n", i, err); + CHECK(next_key.prefixlen != (len + 1) * 8, "iterate", + "#%u invalid len %u expect %u\n", + i, next_key.prefixlen, (len + 1) * 8); + CHECK(memcmp(sorted_keys[i], next_key.data, len + 1), "iterate", + "#%u got %.*s exp %.*s\n", i, len, next_key.data, len, sorted_keys[i]); + + cur = &next_key; + } + err = bpf_map_get_next_key(fd, cur, &next_key); + CHECK(err != -ENOENT, "more element", "error %d\n", err); + + /* Iterate sorted elements: delete the found key after each iteration */ + cur = NULL; + for (i = 0; i < ARRAY_SIZE(sorted_keys); i++) { + len = strlen(sorted_keys[i]); + err = bpf_map_get_next_key(fd, cur, &next_key); + CHECK(err, "iterate", "#%u error %d\n", i, err); + CHECK(next_key.prefixlen != (len + 1) * 8, "iterate", + "#%u invalid len %u expect %u\n", + i, next_key.prefixlen, (len + 1) * 8); + CHECK(memcmp(sorted_keys[i], next_key.data, len + 1), "iterate", + "#%u got %.*s exp %.*s\n", i, len, next_key.data, len, sorted_keys[i]); + + cur = &next_key; + + err = bpf_map_delete_elem(fd, cur); + CHECK(err, "delete", "#%u error %d\n", i, err); + } + err = bpf_map_get_next_key(fd, cur, &next_key); + CHECK(err != -ENOENT, "non-empty qp-trie", "error %d\n", err); + + close(fd); +} + +/* Use the fixed prefixlen (32) and save integers in the LPM trie. The iteration of + * the LPM trie will return these integers in big-endian order, therefore, convert + * these integers to big-endian before update. After each iteration, delete the + * found key (the smallest integer) and expect the next iteration to return + * the second smallest number. 
+ */ +static void test_lpm_trie_iterate_ints(void) +{ + struct lpm_trie_int_key key, next_key; + unsigned int i, max_entries; + struct lpm_trie_int_key *cur; + unsigned int *data_set; + int fd, err; + bool value; + + max_entries = 4096; + data_set = calloc(max_entries, sizeof(*data_set)); + CHECK(!data_set, "calloc", "no mem\n"); + for (i = 0; i < max_entries; i++) + data_set[i] = i; + + fd = lpm_trie_create(sizeof(key), sizeof(value), max_entries); + value = true; + for (i = 0; i < max_entries; i++) { + key.prefixlen = 32; + key.data = htobe32(data_set[i]); + + err = bpf_map_update_elem(fd, &key, &value, BPF_NOEXIST); + CHECK(err, "add elem", "#%u error %d\n", i, err); + } + + cur = NULL; + for (i = 0; i < max_entries; i++) { + err = bpf_map_get_next_key(fd, cur, &next_key); + CHECK(err, "iterate", "#%u error %d\n", i, err); + CHECK(next_key.prefixlen != 32, "iterate", "#%u invalid len %u\n", + i, next_key.prefixlen); + CHECK(be32toh(next_key.data) != data_set[i], "iterate", "#%u got 0x%x exp 0x%x\n", + i, be32toh(next_key.data), data_set[i]); + cur = &next_key; + + /* + * Delete the minimal key; the next call of bpf_map_get_next_key() + * will return the second minimal key. + */ + err = bpf_map_delete_elem(fd, &next_key); + CHECK(err, "del elem", "#%u elem error %d\n", i, err); + } + err = bpf_map_get_next_key(fd, cur, &next_key); + CHECK(err != -ENOENT, "more element", "error %d\n", err); + + err = bpf_map_get_next_key(fd, NULL, &next_key); + CHECK(err != -ENOENT, "non-empty qp-trie", "error %d\n", err); + + free(data_set); + + close(fd); +} + void test_lpm_trie_map_basic_ops(void) { int i; @@ -789,5 +1179,10 @@ void test_lpm_trie_map_basic_ops(void) test_lpm_get_next_key(); test_lpm_multi_thread(); + test_lpm_trie_update_flags(); + test_lpm_trie_update_full_map(); + test_lpm_trie_iterate_strs(); + test_lpm_trie_iterate_ints(); + printf("%s: PASS\n", __func__); } -- 2.51.0 From 492077668fb453b8b16c842fcf3fafc2ebc190e9 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 6 Dec 2024 08:20:06 -0800 Subject: [PATCH 02/16] x86/CPU/AMD: WARN when setting EFER.AUTOIBRS if and only if the WRMSR fails When ensuring EFER.AUTOIBRS is set, WARN only on a negative return code from msr_set_bit(), as '1' is used to indicate the WRMSR was successful ('0' indicates the MSR bit was already set). Fixes: 8cc68c9c9e92 ("x86/CPU/AMD: Make sure EFER[AIBRSE] is set") Reported-by: Nathan Chancellor Signed-off-by: Sean Christopherson Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/Z1MkNofJjt7Oq0G6@google.com Closes: https://lore.kernel.org/all/20241205220604.GA2054199@thelio-3990X --- arch/x86/kernel/cpu/amd.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index d8408aafeed9..79d2e17f6582 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -1065,7 +1065,7 @@ static void init_amd(struct cpuinfo_x86 *c) */ if (spectre_v2_in_eibrs_mode(spectre_v2_enabled) && cpu_has(c, X86_FEATURE_AUTOIBRS)) - WARN_ON_ONCE(msr_set_bit(MSR_EFER, _EFER_AUTOIBRS)); + WARN_ON_ONCE(msr_set_bit(MSR_EFER, _EFER_AUTOIBRS) < 0); /* AMD CPUs don't need fencing after x2APIC/TSC_DEADLINE MSR writes. 
*/ clear_cpu_cap(c, X86_FEATURE_APIC_MSRS_FENCE); -- 2.51.0 From b4d83c8323b0c4a899a996fed919cfe10720d289 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 6 Dec 2024 10:19:31 +0100 Subject: [PATCH 03/16] headers/cleanup.h: Remove the if_not_guard() facility Linus noticed that the new if_not_guard() definition is fragile: "This macro generates actively wrong code if it happens to be inside an if-statement or a loop without a block. IOW, code like this: for (iterate-over-something) if_not_guard(a) return -BUSY; looks like will build fine, but will generate completely incorrect code." The reason is that the __if_not_guard() macro is multi-statement, so while most kernel developers expect macros to be simple or at least compound statements - but for __if_not_guard() it is not so: #define __if_not_guard(_name, _id, args...) \ BUILD_BUG_ON(!__is_cond_ptr(_name)); \ CLASS(_name, _id)(args); \ if (!__guard_ptr(_name)(&_id)) To add insult to injury, the placement of the BUILD_BUG_ON() line makes the macro appear to compile fine, but it will generate incorrect code as Linus reported, for example if used within iteration or conditional statements that will use the first statement of a macro as a loop body or conditional statement body. [ I'd also like to note that the original submission by David Lechner did not contain the BUILD_BUG_ON() line, so it was safer than what we ended up committing. Mea culpa. ] It doesn't appear to be possible to turn this macro into a robust single or compound statement that could be used in single statements, due to the necessity to define an auto scope variable with an open scope and the necessity of it having to expand to a partial 'if' statement with no body. Instead of trying to work around this fragility, just remove the construct before it gets used. Reported-by: Linus Torvalds Signed-off-by: Ingo Molnar Cc: David Lechner Cc: Peter Zijlstra Link: https://lore.kernel.org/r/Z1LBnX9TpZLR5Dkf@gmail.com --- include/linux/cleanup.h | 14 -------------- 1 file changed, 14 deletions(-) diff --git a/include/linux/cleanup.h b/include/linux/cleanup.h index 966fcc5ff8ef..ec00e3f7af2b 100644 --- a/include/linux/cleanup.h +++ b/include/linux/cleanup.h @@ -273,12 +273,6 @@ static inline class_##_name##_t class_##_name##ext##_constructor(_init_args) \ * an anonymous instance of the (guard) class, not recommended for * conditional locks. * - * if_not_guard(name, args...) { }: - * convenience macro for conditional guards that calls the statement that - * follows only if the lock was not acquired (typically an error return). - * - * Only for conditional locks. - * * scoped_guard (name, args...) { }: * similar to CLASS(name, scope)(args), except the variable (with the * explicit name 'scope') is declard in a for-loop such that its scope is @@ -350,14 +344,6 @@ _label: \ #define scoped_cond_guard(_name, _fail, args...) \ __scoped_cond_guard(_name, _fail, __UNIQUE_ID(label), args) -#define __if_not_guard(_name, _id, args...) \ - BUILD_BUG_ON(!__is_cond_ptr(_name)); \ - CLASS(_name, _id)(args); \ - if (!__guard_ptr(_name)(&_id)) - -#define if_not_guard(_name, args...) \ - __if_not_guard(_name, __UNIQUE_ID(guard), args) - /* * Additional helper macros for generating lock guards with types, either for * locks that don't have a native type (eg. 
RCU, preempt) or those that need a -- 2.51.0 From 7912405643a14b527cd4a4f33c1d4392da900888 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 1 Dec 2024 12:17:30 +0100 Subject: [PATCH 04/16] modpost: Add .irqentry.text to OTHER_SECTIONS The compiler can fully inline the actual handler function of an interrupt entry into the .irqentry.text entry point. If such a function contains an access which has an exception table entry, modpost complains about a section mismatch: WARNING: vmlinux.o(__ex_table+0x447c): Section mismatch in reference ... The relocation at __ex_table+0x447c references section ".irqentry.text" which is not in the list of authorized sections. Add .irqentry.text to OTHER_SECTIONS to cure the issue. Reported-by: Sergey Senozhatsky Signed-off-by: Thomas Gleixner Cc: stable@vger.kernel.org # needed for linux-5.4-y Link: https://lore.kernel.org/all/20241128111844.GE10431@google.com/ Signed-off-by: Masahiro Yamada --- scripts/mod/modpost.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/mod/modpost.c b/scripts/mod/modpost.c index 0584cbcdbd2d..fb787a5715f5 100644 --- a/scripts/mod/modpost.c +++ b/scripts/mod/modpost.c @@ -772,7 +772,7 @@ static void check_section(const char *modname, struct elf_info *elf, ".ltext", ".ltext.*" #define OTHER_TEXT_SECTIONS ".ref.text", ".head.text", ".spinlock.text", \ ".fixup", ".entry.text", ".exception.text", \ - ".coldtext", ".softirqentry.text" + ".coldtext", ".softirqentry.text", ".irqentry.text" #define ALL_TEXT_SECTIONS ".init.text", ".exit.text", \ TEXT_SECTIONS, OTHER_TEXT_SECTIONS -- 2.51.0 From d8d326d64f6702caab01ea0cd48f6c0054f3d1b4 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Sun, 8 Dec 2024 16:56:45 +0900 Subject: [PATCH 05/16] kbuild: deb-pkg: fix build error with O= Since commit 13b25489b6f8 ("kbuild: change working directory to external module directory with M="), the Debian package build fails if a relative path is specified with the O= option. $ make O=build bindeb-pkg [ snip ] dpkg-deb: building package 'linux-image-6.13.0-rc1' in '../linux-image-6.13.0-rc1_6.13.0-rc1-6_amd64.deb'. Rebuilding host programs with x86_64-linux-gnu-gcc... make[6]: Entering directory '/home/masahiro/linux/build' /home/masahiro/linux/Makefile:190: *** specified kernel directory "build" does not exist. Stop. This occurs because the sub_make_done flag is cleared, even though the working directory is already in the output directory. Passing KBUILD_OUTPUT=. resolves the issue. Fixes: 13b25489b6f8 ("kbuild: change working directory to external module directory with M=") Reported-by: Charlie Jenkins Closes: https://lore.kernel.org/all/Z1DnP-GJcfseyrM3@ghost/ Tested-by: Charlie Jenkins Reviewed-by: Charlie Jenkins Signed-off-by: Masahiro Yamada --- scripts/package/install-extmod-build | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/package/install-extmod-build b/scripts/package/install-extmod-build index 64d958ee45f3..d3c5b104c063 100755 --- a/scripts/package/install-extmod-build +++ b/scripts/package/install-extmod-build @@ -69,7 +69,7 @@ if [ "${CC}" != "${HOSTCC}" ]; then # # Use the single-target build to avoid the modpost invocation, which # would overwrite Module.symvers. - "${MAKE}" HOSTCC="${CC}" KBUILD_EXTMOD="${destdir}" scripts/ + "${MAKE}" HOSTCC="${CC}" KBUILD_OUTPUT=. 
KBUILD_EXTMOD="${destdir}" scripts/ cat <<-'EOF' > "${destdir}/scripts/Kbuild" subdir-y := basic @@ -78,7 +78,7 @@ if [ "${CC}" != "${HOSTCC}" ]; then EOF # Run once again to rebuild scripts/basic/ and scripts/mod/modpost. - "${MAKE}" HOSTCC="${CC}" KBUILD_EXTMOD="${destdir}" scripts/ + "${MAKE}" HOSTCC="${CC}" KBUILD_OUTPUT=. KBUILD_EXTMOD="${destdir}" scripts/ rm -f "${destdir}/Kbuild" "${destdir}/scripts/Kbuild" fi -- 2.51.0 From 494b332064c0ce2f7392fa92632bc50191c1b517 Mon Sep 17 00:00:00 2001 From: "Masami Hiramatsu (Google)" Date: Sat, 30 Nov 2024 01:47:47 +0900 Subject: [PATCH 06/16] tracing/eprobe: Fix to release eprobe when failed to add dyn_event Fix eprobe event to unregister event call and release eprobe when it fails to add dynamic event correctly. Link: https://lore.kernel.org/all/173289886698.73724.1959899350183686006.stgit@devnote2/ Fixes: 7491e2c44278 ("tracing: Add a probe that attaches to trace events") Signed-off-by: Masami Hiramatsu (Google) --- kernel/trace/trace_eprobe.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/kernel/trace/trace_eprobe.c b/kernel/trace/trace_eprobe.c index ebda68ee9abf..be8be0c1aaf0 100644 --- a/kernel/trace/trace_eprobe.c +++ b/kernel/trace/trace_eprobe.c @@ -963,6 +963,11 @@ static int __trace_eprobe_create(int argc, const char *argv[]) goto error; } ret = dyn_event_add(&ep->devent, &ep->tp.event->call); + if (ret < 0) { + trace_probe_unregister_event_call(&ep->tp); + mutex_unlock(&event_mutex); + goto error; + } mutex_unlock(&event_mutex); return ret; parse_error: -- 2.51.0 From fac04efc5c793dccbd07e2d59af9f90b7fc0dca4 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sun, 8 Dec 2024 14:03:39 -0800 Subject: [PATCH 07/16] Linux 6.13-rc2 --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 93ab62cef244..64c594bd7ad0 100644 --- a/Makefile +++ b/Makefile @@ -2,7 +2,7 @@ VERSION = 6 PATCHLEVEL = 13 SUBLEVEL = 0 -EXTRAVERSION = -rc1 +EXTRAVERSION = -rc2 NAME = Baby Opossum Posse # *DOCUMENTATION* -- 2.51.0 From 32913f348229c9f72dda45fc2c08c6d9dfcd3d6d Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Mon, 9 Dec 2024 10:00:25 -0800 Subject: [PATCH 08/16] futex: fix user access on powerpc The powerpc user access code is special, and unlike other architectures distinguishes between user access for reading and writing. And commit 43a43faf5376 ("futex: improve user space accesses") messed that up. It went undetected elsewhere, but caused ppc32 to fail early during boot, because the user access had been started with user_read_access_begin(), but then finished off with just a plain "user_access_end()". Note that the address-masking user access helpers don't even have that read-vs-write distinction, so if powerpc ever wants to do address masking tricks, we'll have to do some extra work for it. 
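As an illustration of why the mismatch went unnoticed on other architectures, here is a minimal sketch of the pairing rule; the fallback aliases shown are an assumption about the generic definitions in include/linux/uaccess.h, not part of this patch:

	/*
	 * Sketch: when an architecture does not provide the read/write
	 * variants, the generic header is assumed to alias them to the
	 * plain helpers, so a mismatched end goes unnoticed there.
	 */
	#ifndef user_read_access_begin
	#define user_read_access_begin	user_access_begin
	#define user_read_access_end	user_access_end
	#endif

	/*
	 * powerpc defines the variants separately, so an access window opened
	 * with user_read_access_begin() must be closed with
	 * user_read_access_end(), never with plain user_access_end().
	 */
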
[ Make sure to also do it for the EFAULT case, as pointed out by Christophe Leroy ] Reported-by: Andreas Schwab Cc: Christophe Leroy Link: https://lore.kernel.org/all/87bjxl6b0i.fsf@igel.home/ Signed-off-by: Linus Torvalds --- kernel/futex/futex.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/futex/futex.h b/kernel/futex/futex.h index 618ce1fe870e..99b32e728c4a 100644 --- a/kernel/futex/futex.h +++ b/kernel/futex/futex.h @@ -265,11 +265,11 @@ static __always_inline int futex_read_inatomic(u32 *dest, u32 __user *from) else if (!user_read_access_begin(from, sizeof(*from))) return -EFAULT; unsafe_get_user(val, from, Efault); - user_access_end(); + user_read_access_end(); *dest = val; return 0; Efault: - user_access_end(); + user_read_access_end(); return -EFAULT; } -- 2.51.0 From aeb68937614f4aeceaaa762bd7f0212ce842b797 Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Mon, 9 Dec 2024 08:53:32 +0900 Subject: [PATCH 09/16] x86: Fix build regression with CONFIG_KEXEC_JUMP enabled Building 6.13-rc1 for x86_64 with gcc 14.2.1 fails with the error: ld: vmlinux.o: in function `virtual_mapped': linux/arch/x86/kernel/relocate_kernel_64.S:249:(.text+0x5915b): undefined reference to `saved_context_gdt_desc' when CONFIG_KEXEC_JUMP is enabled. This was introduced by commit 07fa619f2a40 ("x86/kexec: Restore GDT on return from ::preserve_context kexec") which introduced a use of saved_context_gdt_desc without a declaration for it. Fix that by including asm/asm-offsets.h where saved_context_gdt_desc is defined (indirectly in include/generated/asm-offsets.h which asm/asm-offsets.h includes). Fixes: 07fa619f2a40 ("x86/kexec: Restore GDT on return from ::preserve_context kexec") Signed-off-by: Damien Le Moal Acked-by: Borislav Petkov (AMD) Acked-by: David Woodhouse Closes: https://lore.kernel.org/oe-kbuild-all/202411270006.ZyyzpYf8-lkp@intel.com/ Signed-off-by: Linus Torvalds --- arch/x86/kernel/relocate_kernel_64.S | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/x86/kernel/relocate_kernel_64.S b/arch/x86/kernel/relocate_kernel_64.S index 1236f25fc8d1..540443d699e3 100644 --- a/arch/x86/kernel/relocate_kernel_64.S +++ b/arch/x86/kernel/relocate_kernel_64.S @@ -13,6 +13,7 @@ #include #include #include +#include /* * Must be relocatable PIC code callable as a C function, in particular -- 2.51.0 From a440a28ddbdcb861150987b4d6e828631656b92f Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Mon, 2 Dec 2024 10:57:24 -0800 Subject: [PATCH 10/16] xfs: fix off-by-one error in fsmap's end_daddr usage In commit ca6448aed4f10a, we created an "end_daddr" variable to fix fsmap reporting when the end of the range requested falls in the middle of an unknown (aka free on the rmapbt) region. Unfortunately, I didn't notice that the code sets end_daddr to the last sector of the device but then uses that quantity to compute the length of the synthesized mapping. Zizhi Wo later observed that when end_daddr isn't set, we still don't report the last fsblock on a device because in that case (aka when info->last is true), the info->high mapping that we pass to xfs_getfsmap_group_helper has a startblock that points to the last fsblock. This is also wrong because the code uses startblock to compute the length of the synthesized mapping. Fix the second problem by setting end_daddr unconditionally, and fix the first problem by setting start_daddr to one past the end of the range to query. 
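To make the off-by-one concrete, here is a standalone sketch with hypothetical sector numbers (not taken from the report); it assumes, per the description above, that the length of the synthesized record is computed as the synthesized start_daddr minus the next expected daddr:

	/* Hypothetical worked example of the end_daddr off-by-one. */
	#include <assert.h>

	typedef unsigned long long xfs_daddr_t;	/* daddr = 512-byte sector count */

	int main(void)
	{
		xfs_daddr_t next_daddr = 90;	/* first daddr past the last rmapbt record */
		xfs_daddr_t end_daddr = 99;	/* last daddr of the queried range, inclusive */

		/* old code: start_daddr = end_daddr, so the gap is 90..98 and 99 is lost */
		xfs_daddr_t old_len = end_daddr - next_daddr;

		/* fixed code: start_daddr = end_daddr + 1, so the gap covers 90..99 */
		xfs_daddr_t new_len = (end_daddr + 1) - next_daddr;

		assert(old_len == 9 && new_len == 10);
		return 0;
	}
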
Cc: # v6.11 Fixes: ca6448aed4f10a ("xfs: Fix missing interval for missing_owner in xfs fsmap") Signed-off-by: "Darrick J. Wong" Reported-by: Zizhi Wo Reviewed-by: Christoph Hellwig --- fs/xfs/xfs_fsmap.c | 38 ++++++++++++++++++++++---------------- 1 file changed, 22 insertions(+), 16 deletions(-) diff --git a/fs/xfs/xfs_fsmap.c b/fs/xfs/xfs_fsmap.c index 82f2e0dd2249..3290dd8524a6 100644 --- a/fs/xfs/xfs_fsmap.c +++ b/fs/xfs/xfs_fsmap.c @@ -163,7 +163,8 @@ struct xfs_getfsmap_info { xfs_daddr_t next_daddr; /* next daddr we expect */ /* daddr of low fsmap key when we're using the rtbitmap */ xfs_daddr_t low_daddr; - xfs_daddr_t end_daddr; /* daddr of high fsmap key */ + /* daddr of high fsmap key, or the last daddr on the device */ + xfs_daddr_t end_daddr; u64 missing_owner; /* owner of holes */ u32 dev; /* device id */ /* @@ -387,8 +388,8 @@ xfs_getfsmap_group_helper( * we calculated from userspace's high key to synthesize the record. * Note that if the btree query found a mapping, there won't be a gap. */ - if (info->last && info->end_daddr != XFS_BUF_DADDR_NULL) - frec->start_daddr = info->end_daddr; + if (info->last) + frec->start_daddr = info->end_daddr + 1; else frec->start_daddr = xfs_gbno_to_daddr(xg, startblock); @@ -736,11 +737,10 @@ xfs_getfsmap_rtdev_rtbitmap_helper( * we calculated from userspace's high key to synthesize the record. * Note that if the btree query found a mapping, there won't be a gap. */ - if (info->last && info->end_daddr != XFS_BUF_DADDR_NULL) { - frec.start_daddr = info->end_daddr; - } else { + if (info->last) + frec.start_daddr = info->end_daddr + 1; + else frec.start_daddr = xfs_rtb_to_daddr(mp, start_rtb); - } frec.len_daddr = XFS_FSB_TO_BB(mp, rtbcount); return xfs_getfsmap_helper(tp, info, &frec); @@ -933,7 +933,10 @@ xfs_getfsmap( struct xfs_trans *tp = NULL; struct xfs_fsmap dkeys[2]; /* per-dev keys */ struct xfs_getfsmap_dev handlers[XFS_GETFSMAP_DEVS]; - struct xfs_getfsmap_info info = { NULL }; + struct xfs_getfsmap_info info = { + .fsmap_recs = fsmap_recs, + .head = head, + }; bool use_rmap; int i; int error = 0; @@ -998,9 +1001,6 @@ xfs_getfsmap( info.next_daddr = head->fmh_keys[0].fmr_physical + head->fmh_keys[0].fmr_length; - info.end_daddr = XFS_BUF_DADDR_NULL; - info.fsmap_recs = fsmap_recs; - info.head = head; /* For each device we support... */ for (i = 0; i < XFS_GETFSMAP_DEVS; i++) { @@ -1013,17 +1013,23 @@ xfs_getfsmap( break; /* - * If this device number matches the high key, we have - * to pass the high key to the handler to limit the - * query results. If the device number exceeds the - * low key, zero out the low key so that we get - * everything from the beginning. + * If this device number matches the high key, we have to pass + * the high key to the handler to limit the query results, and + * set the end_daddr so that we can synthesize records at the + * end of the query range or device. */ if (handlers[i].dev == head->fmh_keys[1].fmr_device) { dkeys[1] = head->fmh_keys[1]; info.end_daddr = min(handlers[i].nr_sectors - 1, dkeys[1].fmr_physical); + } else { + info.end_daddr = handlers[i].nr_sectors - 1; } + + /* + * If the device number exceeds the low key, zero out the low + * key so that we get everything from the beginning. + */ if (handlers[i].dev > head->fmh_keys[0].fmr_device) memset(&dkeys[0], 0, sizeof(struct xfs_fsmap)); -- 2.51.0 From 9b7280010366dbe32791acd498a37dc522f568db Mon Sep 17 00:00:00 2001 From: "Darrick J. 
Wong" Date: Mon, 2 Dec 2024 10:57:24 -0800 Subject: [PATCH 11/16] xfs: metapath scrubber should use the already loaded inodes Don't waste time in xchk_setup_metapath_dqinode doing a second lookup of the quota inodes, just grab them from the quotainfo structure. The whole point of this scrubber is to make sure that the dirents exist, so it's completely silly to do lookups. Cc: # v6.13-rc1 Fixes: 128a055291ebbc ("xfs: scrub quota file metapaths") Signed-off-by: "Darrick J. Wong" Reviewed-by: Christoph Hellwig --- fs/xfs/scrub/metapath.c | 41 +++++++++++++++++++++++++---------------- 1 file changed, 25 insertions(+), 16 deletions(-) diff --git a/fs/xfs/scrub/metapath.c b/fs/xfs/scrub/metapath.c index b78db6513465..80467d6bc763 100644 --- a/fs/xfs/scrub/metapath.c +++ b/fs/xfs/scrub/metapath.c @@ -196,36 +196,45 @@ xchk_setup_metapath_dqinode( struct xfs_scrub *sc, xfs_dqtype_t type) { + struct xfs_quotainfo *qi = sc->mp->m_quotainfo; struct xfs_trans *tp = NULL; struct xfs_inode *dp = NULL; struct xfs_inode *ip = NULL; - const char *path; int error; + if (!qi) + return -ENOENT; + + switch (type) { + case XFS_DQTYPE_USER: + ip = qi->qi_uquotaip; + break; + case XFS_DQTYPE_GROUP: + ip = qi->qi_gquotaip; + break; + case XFS_DQTYPE_PROJ: + ip = qi->qi_pquotaip; + break; + default: + ASSERT(0); + return -EINVAL; + } + if (!ip) + return -ENOENT; + error = xfs_trans_alloc_empty(sc->mp, &tp); if (error) return error; error = xfs_dqinode_load_parent(tp, &dp); - if (error) - goto out_cancel; - - error = xfs_dqinode_load(tp, dp, type, &ip); - if (error) - goto out_dp; - xfs_trans_cancel(tp); - tp = NULL; + if (error) + return error; - path = kasprintf(GFP_KERNEL, "%s", xfs_dqinode_path(type)); - error = xchk_setup_metapath_scan(sc, dp, path, ip); + error = xchk_setup_metapath_scan(sc, dp, + kstrdup(xfs_dqinode_path(type), GFP_KERNEL), ip); - xfs_irele(ip); -out_dp: xfs_irele(dp); -out_cancel: - if (tp) - xfs_trans_cancel(tp); return error; } #else -- 2.51.0 From e1d8602b6cfb757952827d11c7d26f2a1714fe82 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Mon, 2 Dec 2024 10:57:25 -0800 Subject: [PATCH 12/16] xfs: keep quota directory inode loaded In the same vein as the previous patch, there's no point in the metapath scrub setup function doing a lookup on the quota metadir just so it can validate that lookups work correctly. Instead, retain the quota directory inode in memory for the lifetime of the mount so that we can check this meaningfully. Cc: # v6.13-rc1 Fixes: 128a055291ebbc ("xfs: scrub quota file metapaths") Signed-off-by: "Darrick J. 
Wong" Reviewed-by: Christoph Hellwig --- fs/xfs/scrub/metapath.c | 37 ++++++-------------------------- fs/xfs/xfs_qm.c | 47 ++++++++++++++++++++++------------------- fs/xfs/xfs_qm.h | 1 + 3 files changed, 32 insertions(+), 53 deletions(-) diff --git a/fs/xfs/scrub/metapath.c b/fs/xfs/scrub/metapath.c index 80467d6bc763..c678cba1ffc3 100644 --- a/fs/xfs/scrub/metapath.c +++ b/fs/xfs/scrub/metapath.c @@ -171,23 +171,13 @@ static int xchk_setup_metapath_quotadir( struct xfs_scrub *sc) { - struct xfs_trans *tp; - struct xfs_inode *dp = NULL; - int error; - - error = xfs_trans_alloc_empty(sc->mp, &tp); - if (error) - return error; + struct xfs_quotainfo *qi = sc->mp->m_quotainfo; - error = xfs_dqinode_load_parent(tp, &dp); - xfs_trans_cancel(tp); - if (error) - return error; + if (!qi || !qi->qi_dirip) + return -ENOENT; - error = xchk_setup_metapath_scan(sc, sc->mp->m_metadirip, - kasprintf(GFP_KERNEL, "quota"), dp); - xfs_irele(dp); - return error; + return xchk_setup_metapath_scan(sc, sc->mp->m_metadirip, + kstrdup("quota", GFP_KERNEL), qi->qi_dirip); } /* Scan a quota inode under the /quota directory. */ @@ -197,10 +187,7 @@ xchk_setup_metapath_dqinode( xfs_dqtype_t type) { struct xfs_quotainfo *qi = sc->mp->m_quotainfo; - struct xfs_trans *tp = NULL; - struct xfs_inode *dp = NULL; struct xfs_inode *ip = NULL; - int error; if (!qi) return -ENOENT; @@ -222,20 +209,8 @@ xchk_setup_metapath_dqinode( if (!ip) return -ENOENT; - error = xfs_trans_alloc_empty(sc->mp, &tp); - if (error) - return error; - - error = xfs_dqinode_load_parent(tp, &dp); - xfs_trans_cancel(tp); - if (error) - return error; - - error = xchk_setup_metapath_scan(sc, dp, + return xchk_setup_metapath_scan(sc, qi->qi_dirip, kstrdup(xfs_dqinode_path(type), GFP_KERNEL), ip); - - xfs_irele(dp); - return error; } #else # define xchk_setup_metapath_quotadir(...) 
(-ENOENT) diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c index 61ee110b47d7..3c0189831f14 100644 --- a/fs/xfs/xfs_qm.c +++ b/fs/xfs/xfs_qm.c @@ -241,6 +241,10 @@ xfs_qm_destroy_quotainos( xfs_irele(qi->qi_pquotaip); qi->qi_pquotaip = NULL; } + if (qi->qi_dirip) { + xfs_irele(qi->qi_dirip); + qi->qi_dirip = NULL; + } } /* @@ -646,8 +650,7 @@ xfs_qm_init_timelimits( static int xfs_qm_load_metadir_qinos( struct xfs_mount *mp, - struct xfs_quotainfo *qi, - struct xfs_inode **dpp) + struct xfs_quotainfo *qi) { struct xfs_trans *tp; int error; @@ -656,7 +659,7 @@ xfs_qm_load_metadir_qinos( if (error) return error; - error = xfs_dqinode_load_parent(tp, dpp); + error = xfs_dqinode_load_parent(tp, &qi->qi_dirip); if (error == -ENOENT) { /* no quota dir directory, but we'll create one later */ error = 0; @@ -666,21 +669,21 @@ xfs_qm_load_metadir_qinos( goto out_trans; if (XFS_IS_UQUOTA_ON(mp)) { - error = xfs_dqinode_load(tp, *dpp, XFS_DQTYPE_USER, + error = xfs_dqinode_load(tp, qi->qi_dirip, XFS_DQTYPE_USER, &qi->qi_uquotaip); if (error && error != -ENOENT) goto out_trans; } if (XFS_IS_GQUOTA_ON(mp)) { - error = xfs_dqinode_load(tp, *dpp, XFS_DQTYPE_GROUP, + error = xfs_dqinode_load(tp, qi->qi_dirip, XFS_DQTYPE_GROUP, &qi->qi_gquotaip); if (error && error != -ENOENT) goto out_trans; } if (XFS_IS_PQUOTA_ON(mp)) { - error = xfs_dqinode_load(tp, *dpp, XFS_DQTYPE_PROJ, + error = xfs_dqinode_load(tp, qi->qi_dirip, XFS_DQTYPE_PROJ, &qi->qi_pquotaip); if (error && error != -ENOENT) goto out_trans; @@ -696,34 +699,33 @@ out_trans: STATIC int xfs_qm_create_metadir_qinos( struct xfs_mount *mp, - struct xfs_quotainfo *qi, - struct xfs_inode **dpp) + struct xfs_quotainfo *qi) { int error; - if (!*dpp) { - error = xfs_dqinode_mkdir_parent(mp, dpp); + if (!qi->qi_dirip) { + error = xfs_dqinode_mkdir_parent(mp, &qi->qi_dirip); if (error && error != -EEXIST) return error; } if (XFS_IS_UQUOTA_ON(mp) && !qi->qi_uquotaip) { - error = xfs_dqinode_metadir_create(*dpp, XFS_DQTYPE_USER, - &qi->qi_uquotaip); + error = xfs_dqinode_metadir_create(qi->qi_dirip, + XFS_DQTYPE_USER, &qi->qi_uquotaip); if (error) return error; } if (XFS_IS_GQUOTA_ON(mp) && !qi->qi_gquotaip) { - error = xfs_dqinode_metadir_create(*dpp, XFS_DQTYPE_GROUP, - &qi->qi_gquotaip); + error = xfs_dqinode_metadir_create(qi->qi_dirip, + XFS_DQTYPE_GROUP, &qi->qi_gquotaip); if (error) return error; } if (XFS_IS_PQUOTA_ON(mp) && !qi->qi_pquotaip) { - error = xfs_dqinode_metadir_create(*dpp, XFS_DQTYPE_PROJ, - &qi->qi_pquotaip); + error = xfs_dqinode_metadir_create(qi->qi_dirip, + XFS_DQTYPE_PROJ, &qi->qi_pquotaip); if (error) return error; } @@ -768,7 +770,6 @@ xfs_qm_init_metadir_qinos( struct xfs_mount *mp) { struct xfs_quotainfo *qi = mp->m_quotainfo; - struct xfs_inode *dp = NULL; int error; if (!xfs_has_quota(mp)) { @@ -777,20 +778,22 @@ xfs_qm_init_metadir_qinos( return error; } - error = xfs_qm_load_metadir_qinos(mp, qi, &dp); + error = xfs_qm_load_metadir_qinos(mp, qi); if (error) goto out_err; - error = xfs_qm_create_metadir_qinos(mp, qi, &dp); + error = xfs_qm_create_metadir_qinos(mp, qi); if (error) goto out_err; - xfs_irele(dp); + /* The only user of the quota dir inode is online fsck */ +#if !IS_ENABLED(CONFIG_XFS_ONLINE_SCRUB) + xfs_irele(qi->qi_dirip); + qi->qi_dirip = NULL; +#endif return 0; out_err: xfs_qm_destroy_quotainos(mp->m_quotainfo); - if (dp) - xfs_irele(dp); return error; } diff --git a/fs/xfs/xfs_qm.h b/fs/xfs/xfs_qm.h index e919c7f62f57..35b64bc3a7a8 100644 --- a/fs/xfs/xfs_qm.h +++ b/fs/xfs/xfs_qm.h @@ -55,6 +55,7 @@ struct 
xfs_quotainfo { struct xfs_inode *qi_uquotaip; /* user quota inode */ struct xfs_inode *qi_gquotaip; /* group quota inode */ struct xfs_inode *qi_pquotaip; /* project quota inode */ + struct xfs_inode *qi_dirip; /* quota metadir */ struct list_lru qi_lru; int qi_dquots; struct mutex qi_quotaofflock;/* to serialize quotaoff */ -- 2.51.0 From bd27c7bcdca25ce8067ebb94ded6ac1bd7b47317 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Mon, 2 Dec 2024 10:57:26 -0800 Subject: [PATCH 13/16] xfs: return a 64-bit block count from xfs_btree_count_blocks With the nrext64 feature enabled, it's possible for a data fork to have 2^48 extent mappings. Even with a 64k fsblock size, that maps out to a bmbt containing more than 2^32 blocks. Therefore, this predicate must return a u64 count to avoid an integer wraparound that will cause scrub to do the wrong thing. It's unlikely that any such filesystem currently exists, because the incore bmbt would consume more than 64GB of kernel memory on its own, and so far nobody except me has driven a filesystem that far, judging from the lack of complaints. Cc: # v5.19 Fixes: df9ad5cc7a5240 ("xfs: Introduce macros to represent new maximum extent counts for data/attr forks") Signed-off-by: "Darrick J. Wong" Reviewed-by: Christoph Hellwig --- fs/xfs/libxfs/xfs_btree.c | 4 ++-- fs/xfs/libxfs/xfs_btree.h | 2 +- fs/xfs/libxfs/xfs_ialloc_btree.c | 4 +++- fs/xfs/scrub/agheader.c | 6 +++--- fs/xfs/scrub/agheader_repair.c | 6 +++--- fs/xfs/scrub/fscounters.c | 2 +- fs/xfs/scrub/ialloc.c | 4 ++-- fs/xfs/scrub/refcount.c | 2 +- fs/xfs/xfs_bmap_util.c | 2 +- 9 files changed, 17 insertions(+), 15 deletions(-) diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c index 2b5fc5fd1643..c748866ef923 100644 --- a/fs/xfs/libxfs/xfs_btree.c +++ b/fs/xfs/libxfs/xfs_btree.c @@ -5144,7 +5144,7 @@ xfs_btree_count_blocks_helper( int level, void *data) { - xfs_extlen_t *blocks = data; + xfs_filblks_t *blocks = data; (*blocks)++; return 0; @@ -5154,7 +5154,7 @@ xfs_btree_count_blocks_helper( int xfs_btree_count_blocks( struct xfs_btree_cur *cur, - xfs_extlen_t *blocks) + xfs_filblks_t *blocks) { *blocks = 0; return xfs_btree_visit_blocks(cur, xfs_btree_count_blocks_helper, diff --git a/fs/xfs/libxfs/xfs_btree.h b/fs/xfs/libxfs/xfs_btree.h index 3b739459ebb0..c5bff273cae2 100644 --- a/fs/xfs/libxfs/xfs_btree.h +++ b/fs/xfs/libxfs/xfs_btree.h @@ -484,7 +484,7 @@ typedef int (*xfs_btree_visit_blocks_fn)(struct xfs_btree_cur *cur, int level, int xfs_btree_visit_blocks(struct xfs_btree_cur *cur, xfs_btree_visit_blocks_fn fn, unsigned int flags, void *data); -int xfs_btree_count_blocks(struct xfs_btree_cur *cur, xfs_extlen_t *blocks); +int xfs_btree_count_blocks(struct xfs_btree_cur *cur, xfs_filblks_t *blocks); union xfs_btree_rec *xfs_btree_rec_addr(struct xfs_btree_cur *cur, int n, struct xfs_btree_block *block); diff --git a/fs/xfs/libxfs/xfs_ialloc_btree.c b/fs/xfs/libxfs/xfs_ialloc_btree.c index 9b34896dd1a3..6f270d8f4270 100644 --- a/fs/xfs/libxfs/xfs_ialloc_btree.c +++ b/fs/xfs/libxfs/xfs_ialloc_btree.c @@ -744,6 +744,7 @@ xfs_finobt_count_blocks( { struct xfs_buf *agbp = NULL; struct xfs_btree_cur *cur; + xfs_filblks_t blocks; int error; error = xfs_ialloc_read_agi(pag, tp, 0, &agbp); @@ -751,9 +752,10 @@ xfs_finobt_count_blocks( return error; cur = xfs_finobt_init_cursor(pag, tp, agbp); - error = xfs_btree_count_blocks(cur, tree_blocks); + error = xfs_btree_count_blocks(cur, &blocks); xfs_btree_del_cursor(cur, error); xfs_trans_brelse(tp, agbp); + *tree_blocks = blocks; 
return error; } diff --git a/fs/xfs/scrub/agheader.c b/fs/xfs/scrub/agheader.c index 61f80a6410c7..1d41b85478da 100644 --- a/fs/xfs/scrub/agheader.c +++ b/fs/xfs/scrub/agheader.c @@ -458,7 +458,7 @@ xchk_agf_xref_btreeblks( { struct xfs_agf *agf = sc->sa.agf_bp->b_addr; struct xfs_mount *mp = sc->mp; - xfs_agblock_t blocks; + xfs_filblks_t blocks; xfs_agblock_t btreeblks; int error; @@ -507,7 +507,7 @@ xchk_agf_xref_refcblks( struct xfs_scrub *sc) { struct xfs_agf *agf = sc->sa.agf_bp->b_addr; - xfs_agblock_t blocks; + xfs_filblks_t blocks; int error; if (!sc->sa.refc_cur) @@ -840,7 +840,7 @@ xchk_agi_xref_fiblocks( struct xfs_scrub *sc) { struct xfs_agi *agi = sc->sa.agi_bp->b_addr; - xfs_agblock_t blocks; + xfs_filblks_t blocks; int error = 0; if (!xfs_has_inobtcounts(sc->mp)) diff --git a/fs/xfs/scrub/agheader_repair.c b/fs/xfs/scrub/agheader_repair.c index 0fad0baaba2f..b45d2b32051a 100644 --- a/fs/xfs/scrub/agheader_repair.c +++ b/fs/xfs/scrub/agheader_repair.c @@ -256,7 +256,7 @@ xrep_agf_calc_from_btrees( struct xfs_agf *agf = agf_bp->b_addr; struct xfs_mount *mp = sc->mp; xfs_agblock_t btreeblks; - xfs_agblock_t blocks; + xfs_filblks_t blocks; int error; /* Update the AGF counters from the bnobt. */ @@ -946,7 +946,7 @@ xrep_agi_calc_from_btrees( if (error) goto err; if (xfs_has_inobtcounts(mp)) { - xfs_agblock_t blocks; + xfs_filblks_t blocks; error = xfs_btree_count_blocks(cur, &blocks); if (error) @@ -959,7 +959,7 @@ xrep_agi_calc_from_btrees( agi->agi_freecount = cpu_to_be32(freecount); if (xfs_has_finobt(mp) && xfs_has_inobtcounts(mp)) { - xfs_agblock_t blocks; + xfs_filblks_t blocks; cur = xfs_finobt_init_cursor(sc->sa.pag, sc->tp, agi_bp); error = xfs_btree_count_blocks(cur, &blocks); diff --git a/fs/xfs/scrub/fscounters.c b/fs/xfs/scrub/fscounters.c index 4a50f8e00040..ca23cf4db6c5 100644 --- a/fs/xfs/scrub/fscounters.c +++ b/fs/xfs/scrub/fscounters.c @@ -261,7 +261,7 @@ xchk_fscount_btreeblks( struct xchk_fscounters *fsc, xfs_agnumber_t agno) { - xfs_extlen_t blocks; + xfs_filblks_t blocks; int error; error = xchk_ag_init_existing(sc, agno, &sc->sa); diff --git a/fs/xfs/scrub/ialloc.c b/fs/xfs/scrub/ialloc.c index abad54c3621d..4dc7c83dc08a 100644 --- a/fs/xfs/scrub/ialloc.c +++ b/fs/xfs/scrub/ialloc.c @@ -650,8 +650,8 @@ xchk_iallocbt_xref_rmap_btreeblks( struct xfs_scrub *sc) { xfs_filblks_t blocks; - xfs_extlen_t inobt_blocks = 0; - xfs_extlen_t finobt_blocks = 0; + xfs_filblks_t inobt_blocks = 0; + xfs_filblks_t finobt_blocks = 0; int error; if (!sc->sa.ino_cur || !sc->sa.rmap_cur || diff --git a/fs/xfs/scrub/refcount.c b/fs/xfs/scrub/refcount.c index 2b6be75e9424..1c5e45cc6419 100644 --- a/fs/xfs/scrub/refcount.c +++ b/fs/xfs/scrub/refcount.c @@ -491,7 +491,7 @@ xchk_refcount_xref_rmap( struct xfs_scrub *sc, xfs_filblks_t cow_blocks) { - xfs_extlen_t refcbt_blocks = 0; + xfs_filblks_t refcbt_blocks = 0; xfs_filblks_t blocks; int error; diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c index a59bbe767a7d..0836fea2d6d8 100644 --- a/fs/xfs/xfs_bmap_util.c +++ b/fs/xfs/xfs_bmap_util.c @@ -103,7 +103,7 @@ xfs_bmap_count_blocks( struct xfs_mount *mp = ip->i_mount; struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork); struct xfs_btree_cur *cur; - xfs_extlen_t btblocks = 0; + xfs_filblks_t btblocks = 0; int error; *nextents = 0; -- 2.51.0 From 7ce31f20a0771d71779c3b0ec9cdf474cc3c8e9a Mon Sep 17 00:00:00 2001 From: "Darrick J. 
Wong" Date: Mon, 2 Dec 2024 10:57:27 -0800 Subject: [PATCH 14/16] xfs: don't drop errno values when we fail to ficlone the entire range Way back when we first implemented FICLONE for XFS, life was simple -- either the the entire remapping completed, or something happened and we had to return an errno explaining what happened. Neither of those ioctls support returning partial results, so it's all or nothing. Then things got complicated when copy_file_range came along, because it actually can return the number of bytes copied, so commit 3f68c1f562f1e4 tried to make it so that we could return a partial result if the REMAP_FILE_CAN_SHORTEN flag is set. This is also how FIDEDUPERANGE can indicate that the kernel performed a partial deduplication. Unfortunately, the logic is wrong if an error stops the remapping and CAN_SHORTEN is not set. Because those callers cannot return partial results, it is an error for ->remap_file_range to return a positive quantity that is less than the @len passed in. Implementations really should be returning a negative errno in this case, because that's what btrfs (which introduced FICLONE{,RANGE}) did. Therefore, ->remap_range implementations cannot silently drop an errno that they might have when the number of bytes remapped is less than the number of bytes requested and CAN_SHORTEN is not set. Found by running generic/562 on a 64k fsblock filesystem and wondering why it reported corrupt files. Cc: # v4.20 Fixes: 3fc9f5e409319e ("xfs: remove xfs_reflink_remap_range") Really-Fixes: 3f68c1f562f1e4 ("xfs: support returning partial reflink results") Signed-off-by: "Darrick J. Wong" Reviewed-by: Christoph Hellwig --- fs/xfs/xfs_file.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 4a0b7de4f7ae..9a435b1ff264 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -1242,6 +1242,14 @@ out_unlock: xfs_iunlock2_remapping(src, dest); if (ret) trace_xfs_reflink_remap_range_error(dest, ret, _RET_IP_); + /* + * If the caller did not set CAN_SHORTEN, then it is not prepared to + * handle partial results -- either the whole remap succeeds, or we + * must say why it did not. In this case, any error should be returned + * to the caller. + */ + if (ret && remapped < len && !(remap_flags & REMAP_FILE_CAN_SHORTEN)) + return ret; return remapped > 0 ? remapped : ret; } -- 2.51.0 From aa7bfb537edf62085d7718845f6644b0e4efb9df Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Mon, 2 Dec 2024 10:57:27 -0800 Subject: [PATCH 15/16] xfs: separate healthy clearing mask during repair In commit d9041681dd2f53 we introduced some XFS_SICK_*ZAPPED flags so that the inode record repair code could clean up a damaged inode record enough to iget the inode but still be able to remember that the higher level repair code needs to be called. As part of that, we introduced a xchk_mark_healthy_if_clean helper that is supposed to cause the ZAPPED state to be removed if that higher level metadata actually checks out. This was done by setting additional bits in sick_mask hoping that xchk_update_health will clear all those bits after a healthy scrub. Unfortunately, that's not quite what sick_mask means -- bits in that mask are indeed cleared if the metadata is healthy, but they're set if the metadata is NOT healthy. fsck is only intended to set the ZAPPED bits explicitly. If something else sets the CORRUPT/XCORRUPT state after the xchk_mark_healthy_if_clean call, we end up marking the metadata zapped. 
This can happen if the following sequence happens: 1. Scrub runs, discovers that the metadata is fine but could be optimized and calls xchk_mark_healthy_if_clean on a ZAPPED flag. That causes the ZAPPED flag to be set in sick_mask because the metadata is not CORRUPT or XCORRUPT. 2. Repair runs to optimize the metadata. 3. Some other metadata used for cross-referencing in (1) becomes corrupt. 4. Post-repair scrub runs, but this time it sets CORRUPT or XCORRUPT due to the events in (3). 5. Now the xchk_health_update sets the ZAPPED flag on the metadata we just repaired. This is not the correct state. Fix this by moving the "if healthy" mask to a separate field, and only ever using it to clear the sick state. Cc: # v6.8 Fixes: d9041681dd2f53 ("xfs: set inode sick state flags when we zap either ondisk fork") Signed-off-by: "Darrick J. Wong" Reviewed-by: Christoph Hellwig --- fs/xfs/scrub/health.c | 57 +++++++++++++++++++++++++------------------ fs/xfs/scrub/scrub.h | 6 +++++ 2 files changed, 39 insertions(+), 24 deletions(-) diff --git a/fs/xfs/scrub/health.c b/fs/xfs/scrub/health.c index ce86bdad37fa..ccc6ca5934ca 100644 --- a/fs/xfs/scrub/health.c +++ b/fs/xfs/scrub/health.c @@ -71,7 +71,8 @@ /* Map our scrub type to a sick mask and a set of health update functions. */ enum xchk_health_group { - XHG_FS = 1, + XHG_NONE = 1, + XHG_FS, XHG_AG, XHG_INO, XHG_RTGROUP, @@ -83,6 +84,7 @@ struct xchk_health_map { }; static const struct xchk_health_map type_to_health_flag[XFS_SCRUB_TYPE_NR] = { + [XFS_SCRUB_TYPE_PROBE] = { XHG_NONE, 0 }, [XFS_SCRUB_TYPE_SB] = { XHG_AG, XFS_SICK_AG_SB }, [XFS_SCRUB_TYPE_AGF] = { XHG_AG, XFS_SICK_AG_AGF }, [XFS_SCRUB_TYPE_AGFL] = { XHG_AG, XFS_SICK_AG_AGFL }, @@ -133,7 +135,7 @@ xchk_mark_healthy_if_clean( { if (!(sc->sm->sm_flags & (XFS_SCRUB_OFLAG_CORRUPT | XFS_SCRUB_OFLAG_XCORRUPT))) - sc->sick_mask |= mask; + sc->healthy_mask |= mask; } /* @@ -189,6 +191,7 @@ xchk_update_health( { struct xfs_perag *pag; struct xfs_rtgroup *rtg; + unsigned int mask = sc->sick_mask; bool bad; /* @@ -203,50 +206,56 @@ xchk_update_health( return; } - if (!sc->sick_mask) - return; - bad = (sc->sm->sm_flags & (XFS_SCRUB_OFLAG_CORRUPT | XFS_SCRUB_OFLAG_XCORRUPT)); + if (!bad) + mask |= sc->healthy_mask; switch (type_to_health_flag[sc->sm->sm_type].group) { + case XHG_NONE: + break; case XHG_AG: + if (!mask) + return; pag = xfs_perag_get(sc->mp, sc->sm->sm_agno); if (bad) - xfs_group_mark_corrupt(pag_group(pag), sc->sick_mask); + xfs_group_mark_corrupt(pag_group(pag), mask); else - xfs_group_mark_healthy(pag_group(pag), sc->sick_mask); + xfs_group_mark_healthy(pag_group(pag), mask); xfs_perag_put(pag); break; case XHG_INO: if (!sc->ip) return; - if (bad) { - unsigned int mask = sc->sick_mask; - - /* - * If we're coming in for repairs then we don't want - * sickness flags to propagate to the incore health - * status if the inode gets inactivated before we can - * fix it. - */ - if (sc->sm->sm_flags & XFS_SCRUB_IFLAG_REPAIR) - mask |= XFS_SICK_INO_FORGET; + /* + * If we're coming in for repairs then we don't want sickness + * flags to propagate to the incore health status if the inode + * gets inactivated before we can fix it. 
+ */ + if (sc->sm->sm_flags & XFS_SCRUB_IFLAG_REPAIR) + mask |= XFS_SICK_INO_FORGET; + if (!mask) + return; + if (bad) xfs_inode_mark_corrupt(sc->ip, mask); - } else - xfs_inode_mark_healthy(sc->ip, sc->sick_mask); + else + xfs_inode_mark_healthy(sc->ip, mask); break; case XHG_FS: + if (!mask) + return; if (bad) - xfs_fs_mark_corrupt(sc->mp, sc->sick_mask); + xfs_fs_mark_corrupt(sc->mp, mask); else - xfs_fs_mark_healthy(sc->mp, sc->sick_mask); + xfs_fs_mark_healthy(sc->mp, mask); break; case XHG_RTGROUP: + if (!mask) + return; rtg = xfs_rtgroup_get(sc->mp, sc->sm->sm_agno); if (bad) - xfs_group_mark_corrupt(rtg_group(rtg), sc->sick_mask); + xfs_group_mark_corrupt(rtg_group(rtg), mask); else - xfs_group_mark_healthy(rtg_group(rtg), sc->sick_mask); + xfs_group_mark_healthy(rtg_group(rtg), mask); xfs_rtgroup_put(rtg); break; default: diff --git a/fs/xfs/scrub/scrub.h b/fs/xfs/scrub/scrub.h index a7fda3e2b013..5dbbe93cb49b 100644 --- a/fs/xfs/scrub/scrub.h +++ b/fs/xfs/scrub/scrub.h @@ -184,6 +184,12 @@ struct xfs_scrub { */ unsigned int sick_mask; + /* + * Clear these XFS_SICK_* flags but only if the scan is ok. Useful for + * removing ZAPPED flags after a repair. + */ + unsigned int healthy_mask; + /* next time we want to cond_resched() */ struct xchk_relax relax; -- 2.51.0 From 6f4669708a69fd21f0299c2d5c4780a6ce358ab5 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Mon, 2 Dec 2024 10:57:28 -0800 Subject: [PATCH 16/16] xfs: set XFS_SICK_INO_SYMLINK_ZAPPED explicitly when zapping a symlink If we need to reset a symlink target to the "durr it's busted" string, then we clear the zapped flag as well. However, this should be using the provided helper so that we don't set the zapped state on an otherwise ok symlink. Cc: # v6.10 Fixes: 2651923d8d8db0 ("xfs: online repair of symbolic links") Signed-off-by: "Darrick J. Wong" Reviewed-by: Christoph Hellwig --- fs/xfs/scrub/symlink_repair.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/xfs/scrub/symlink_repair.c b/fs/xfs/scrub/symlink_repair.c index d015a86ef460..953ce7be78dc 100644 --- a/fs/xfs/scrub/symlink_repair.c +++ b/fs/xfs/scrub/symlink_repair.c @@ -36,6 +36,7 @@ #include "scrub/tempfile.h" #include "scrub/tempexch.h" #include "scrub/reap.h" +#include "scrub/health.h" /* * Symbolic Link Repair @@ -233,7 +234,7 @@ xrep_symlink_salvage( * target zapped flag. */ if (buflen == 0) { - sc->sick_mask |= XFS_SICK_INO_SYMLINK_ZAPPED; + xchk_mark_healthy_if_clean(sc, XFS_SICK_INO_SYMLINK_ZAPPED); sprintf(target_buf, DUMMY_TARGET); } -- 2.51.0