From 3e18f5f1e5a152a51dbb6c234e2a0421aec11e31 Mon Sep 17 00:00:00 2001 From: Hou Tao Date: Fri, 6 Dec 2024 19:06:21 +0800 Subject: [PATCH 01/16] selftests/bpf: Move test_lpm_map.c to map_tests Move test_lpm_map.c to map_tests/ to include LPM trie test cases in regular test_maps run. Most code remains unchanged, including the use of assert(). Only reduce n_lookups from 64K to 512, which decreases test_lpm_map runtime from 37s to 0.7s. Signed-off-by: Hou Tao Link: https://lore.kernel.org/r/20241206110622.1161752-9-houtao@huaweicloud.com Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/.gitignore | 1 - tools/testing/selftests/bpf/Makefile | 2 +- .../lpm_trie_map_basic_ops.c} | 10 +++------- 3 files changed, 4 insertions(+), 9 deletions(-) rename tools/testing/selftests/bpf/{test_lpm_map.c => map_tests/lpm_trie_map_basic_ops.c} (99%) diff --git a/tools/testing/selftests/bpf/.gitignore b/tools/testing/selftests/bpf/.gitignore index c2a1842c3d8b..e9c377001f93 100644 --- a/tools/testing/selftests/bpf/.gitignore +++ b/tools/testing/selftests/bpf/.gitignore @@ -5,7 +5,6 @@ bpf-syscall* test_verifier test_maps test_lru_map -test_lpm_map test_tag FEATURE-DUMP.libbpf FEATURE-DUMP.selftests diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile index 6ad3b1ba1920..7eeb3cbe18c7 100644 --- a/tools/testing/selftests/bpf/Makefile +++ b/tools/testing/selftests/bpf/Makefile @@ -83,7 +83,7 @@ CLANG_CPUV4 := 1 endif # Order correspond to 'make run_tests' order -TEST_GEN_PROGS = test_verifier test_tag test_maps test_lru_map test_lpm_map test_progs \ +TEST_GEN_PROGS = test_verifier test_tag test_maps test_lru_map test_progs \ test_sockmap \ test_tcpnotify_user test_sysctl \ test_progs-no_alu32 diff --git a/tools/testing/selftests/bpf/test_lpm_map.c b/tools/testing/selftests/bpf/map_tests/lpm_trie_map_basic_ops.c similarity index 99% rename from tools/testing/selftests/bpf/test_lpm_map.c rename to tools/testing/selftests/bpf/map_tests/lpm_trie_map_basic_ops.c index d98c72dc563e..f375c89d78a4 100644 --- a/tools/testing/selftests/bpf/test_lpm_map.c +++ b/tools/testing/selftests/bpf/map_tests/lpm_trie_map_basic_ops.c @@ -223,7 +223,7 @@ static void test_lpm_map(int keysize) n_matches = 0; n_matches_after_delete = 0; n_nodes = 1 << 8; - n_lookups = 1 << 16; + n_lookups = 1 << 9; data = alloca(keysize); memset(data, 0, keysize); @@ -770,16 +770,13 @@ static void test_lpm_multi_thread(void) close(map_fd); } -int main(void) +void test_lpm_trie_map_basic_ops(void) { int i; /* we want predictable, pseudo random tests */ srand(0xf00ba1); - /* Use libbpf 1.0 API mode */ - libbpf_set_strict_mode(LIBBPF_STRICT_ALL); - test_lpm_basic(); test_lpm_order(); @@ -792,6 +789,5 @@ int main(void) test_lpm_get_next_key(); test_lpm_multi_thread(); - printf("test_lpm: OK\n"); - return 0; + printf("%s: PASS\n", __func__); } -- 2.51.0 From 04d4ce91b0bed4120e0c5fadc5291cebaa9c2a06 Mon Sep 17 00:00:00 2001 From: Hou Tao Date: Fri, 6 Dec 2024 19:06:22 +0800 Subject: [PATCH 02/16] selftests/bpf: Add more test cases for LPM trie Add more test cases for LPM trie in test_maps: 1) test_lpm_trie_update_flags It constructs various use cases for BPF_EXIST and BPF_NOEXIST and check whether the return value of update operation is expected. 2) test_lpm_trie_update_full_maps It tests the update operations on a full LPM trie map. Adding new node will fail and overwriting the value of existed node will succeed. 3) test_lpm_trie_iterate_strs and test_lpm_trie_iterate_ints There two test cases test whether the iteration through get_next_key is sorted and expected. These two test cases delete the minimal key after each iteration and check whether next iteration returns the second minimal key. The only difference between these two test cases is the former one saves strings in the LPM trie and the latter saves integers. Without the fix of get_next_key, these two cases will fail as shown below: test_lpm_trie_iterate_strs(1091):FAIL:iterate #2 got abc exp abS test_lpm_trie_iterate_ints(1142):FAIL:iterate #1 got 0x2 exp 0x1 Signed-off-by: Hou Tao Link: https://lore.kernel.org/r/20241206110622.1161752-10-houtao@huaweicloud.com Signed-off-by: Alexei Starovoitov --- .../bpf/map_tests/lpm_trie_map_basic_ops.c | 395 ++++++++++++++++++ 1 file changed, 395 insertions(+) diff --git a/tools/testing/selftests/bpf/map_tests/lpm_trie_map_basic_ops.c b/tools/testing/selftests/bpf/map_tests/lpm_trie_map_basic_ops.c index f375c89d78a4..d32e4edac930 100644 --- a/tools/testing/selftests/bpf/map_tests/lpm_trie_map_basic_ops.c +++ b/tools/testing/selftests/bpf/map_tests/lpm_trie_map_basic_ops.c @@ -20,10 +20,12 @@ #include #include #include +#include #include #include #include +#include #include "bpf_util.h" @@ -33,6 +35,22 @@ struct tlpm_node { uint8_t key[]; }; +struct lpm_trie_bytes_key { + union { + struct bpf_lpm_trie_key_hdr hdr; + __u32 prefixlen; + }; + unsigned char data[8]; +}; + +struct lpm_trie_int_key { + union { + struct bpf_lpm_trie_key_hdr hdr; + __u32 prefixlen; + }; + unsigned int data; +}; + static struct tlpm_node *tlpm_match(struct tlpm_node *list, const uint8_t *key, size_t n_bits); @@ -770,6 +788,378 @@ static void test_lpm_multi_thread(void) close(map_fd); } +static int lpm_trie_create(unsigned int key_size, unsigned int value_size, unsigned int max_entries) +{ + LIBBPF_OPTS(bpf_map_create_opts, opts); + int fd; + + opts.map_flags = BPF_F_NO_PREALLOC; + fd = bpf_map_create(BPF_MAP_TYPE_LPM_TRIE, "lpm_trie", key_size, value_size, max_entries, + &opts); + CHECK(fd < 0, "bpf_map_create", "error %d\n", errno); + + return fd; +} + +static void test_lpm_trie_update_flags(void) +{ + struct lpm_trie_int_key key; + unsigned int value, got; + int fd, err; + + fd = lpm_trie_create(sizeof(key), sizeof(value), 3); + + /* invalid flags (Error) */ + key.prefixlen = 32; + key.data = 0; + value = 0; + err = bpf_map_update_elem(fd, &key, &value, BPF_F_LOCK); + CHECK(err != -EINVAL, "invalid update flag", "error %d\n", err); + + /* invalid flags (Error) */ + key.prefixlen = 32; + key.data = 0; + value = 0; + err = bpf_map_update_elem(fd, &key, &value, BPF_NOEXIST | BPF_EXIST); + CHECK(err != -EINVAL, "invalid update flag", "error %d\n", err); + + /* overwrite an empty qp-trie (Error) */ + key.prefixlen = 32; + key.data = 0; + value = 2; + err = bpf_map_update_elem(fd, &key, &value, BPF_EXIST); + CHECK(err != -ENOENT, "overwrite empty qp-trie", "error %d\n", err); + + /* add a new node */ + key.prefixlen = 16; + key.data = 0; + value = 1; + err = bpf_map_update_elem(fd, &key, &value, BPF_NOEXIST); + CHECK(err, "add new elem", "error %d\n", err); + got = 0; + err = bpf_map_lookup_elem(fd, &key, &got); + CHECK(err, "lookup elem", "error %d\n", err); + CHECK(got != value, "check value", "got %d exp %d\n", got, value); + + /* add the same node as new node (Error) */ + err = bpf_map_update_elem(fd, &key, &value, BPF_NOEXIST); + CHECK(err != -EEXIST, "add new elem again", "error %d\n", err); + + /* overwrite the existed node */ + value = 4; + err = bpf_map_update_elem(fd, &key, &value, BPF_EXIST); + CHECK(err, "overwrite elem", "error %d\n", err); + got = 0; + err = bpf_map_lookup_elem(fd, &key, &got); + CHECK(err, "lookup elem", "error %d\n", err); + CHECK(got != value, "check value", "got %d exp %d\n", got, value); + + /* overwrite the node */ + value = 1; + err = bpf_map_update_elem(fd, &key, &value, BPF_ANY); + CHECK(err, "update elem", "error %d\n", err); + got = 0; + err = bpf_map_lookup_elem(fd, &key, &got); + CHECK(err, "lookup elem", "error %d\n", err); + CHECK(got != value, "check value", "got %d exp %d\n", got, value); + + /* overwrite a non-existent node which is the prefix of the first + * node (Error). + */ + key.prefixlen = 8; + key.data = 0; + value = 2; + err = bpf_map_update_elem(fd, &key, &value, BPF_EXIST); + CHECK(err != -ENOENT, "overwrite nonexistent elem", "error %d\n", err); + + /* add a new node which is the prefix of the first node */ + err = bpf_map_update_elem(fd, &key, &value, BPF_NOEXIST); + CHECK(err, "add new elem", "error %d\n", err); + got = 0; + err = bpf_map_lookup_elem(fd, &key, &got); + CHECK(err, "lookup key", "error %d\n", err); + CHECK(got != value, "check value", "got %d exp %d\n", got, value); + + /* add another new node which will be the sibling of the first node */ + key.prefixlen = 9; + key.data = htobe32(1 << 23); + value = 5; + err = bpf_map_update_elem(fd, &key, &value, BPF_NOEXIST); + CHECK(err, "add new elem", "error %d\n", err); + got = 0; + err = bpf_map_lookup_elem(fd, &key, &got); + CHECK(err, "lookup key", "error %d\n", err); + CHECK(got != value, "check value", "got %d exp %d\n", got, value); + + /* overwrite the third node */ + value = 3; + err = bpf_map_update_elem(fd, &key, &value, BPF_ANY); + CHECK(err, "overwrite elem", "error %d\n", err); + got = 0; + err = bpf_map_lookup_elem(fd, &key, &got); + CHECK(err, "lookup key", "error %d\n", err); + CHECK(got != value, "check value", "got %d exp %d\n", got, value); + + /* delete the second node to make it an intermediate node */ + key.prefixlen = 8; + key.data = 0; + err = bpf_map_delete_elem(fd, &key); + CHECK(err, "del elem", "error %d\n", err); + + /* overwrite the intermediate node (Error) */ + value = 2; + err = bpf_map_update_elem(fd, &key, &value, BPF_EXIST); + CHECK(err != -ENOENT, "overwrite nonexistent elem", "error %d\n", err); + + close(fd); +} + +static void test_lpm_trie_update_full_map(void) +{ + struct lpm_trie_int_key key; + int value, got; + int fd, err; + + fd = lpm_trie_create(sizeof(key), sizeof(value), 3); + + /* add a new node */ + key.prefixlen = 16; + key.data = 0; + value = 0; + err = bpf_map_update_elem(fd, &key, &value, BPF_NOEXIST); + CHECK(err, "add new elem", "error %d\n", err); + got = 0; + err = bpf_map_lookup_elem(fd, &key, &got); + CHECK(err, "lookup elem", "error %d\n", err); + CHECK(got != value, "check value", "got %d exp %d\n", got, value); + + /* add new node */ + key.prefixlen = 8; + key.data = 0; + value = 1; + err = bpf_map_update_elem(fd, &key, &value, BPF_NOEXIST); + CHECK(err, "add new elem", "error %d\n", err); + got = 0; + err = bpf_map_lookup_elem(fd, &key, &got); + CHECK(err, "lookup elem", "error %d\n", err); + CHECK(got != value, "check value", "got %d exp %d\n", got, value); + + /* add new node */ + key.prefixlen = 9; + key.data = htobe32(1 << 23); + value = 2; + err = bpf_map_update_elem(fd, &key, &value, BPF_NOEXIST); + CHECK(err, "add new elem", "error %d\n", err); + got = 0; + err = bpf_map_lookup_elem(fd, &key, &got); + CHECK(err, "lookup elem", "error %d\n", err); + CHECK(got != value, "check value", "got %d exp %d\n", got, value); + + /* try to add more node (Error) */ + key.prefixlen = 32; + key.data = 0; + value = 3; + err = bpf_map_update_elem(fd, &key, &value, BPF_ANY); + CHECK(err != -ENOSPC, "add to full trie", "error %d\n", err); + + /* update the value of an existed node with BPF_EXIST */ + key.prefixlen = 16; + key.data = 0; + value = 4; + err = bpf_map_update_elem(fd, &key, &value, BPF_EXIST); + CHECK(err, "overwrite elem", "error %d\n", err); + got = 0; + err = bpf_map_lookup_elem(fd, &key, &got); + CHECK(err, "lookup elem", "error %d\n", err); + CHECK(got != value, "check value", "got %d exp %d\n", got, value); + + /* update the value of an existed node with BPF_ANY */ + key.prefixlen = 9; + key.data = htobe32(1 << 23); + value = 5; + err = bpf_map_update_elem(fd, &key, &value, BPF_ANY); + CHECK(err, "overwrite elem", "error %d\n", err); + got = 0; + err = bpf_map_lookup_elem(fd, &key, &got); + CHECK(err, "lookup elem", "error %d\n", err); + CHECK(got != value, "check value", "got %d exp %d\n", got, value); + + close(fd); +} + +static int cmp_str(const void *a, const void *b) +{ + const char *str_a = *(const char **)a, *str_b = *(const char **)b; + + return strcmp(str_a, str_b); +} + +/* Save strings in LPM trie. The trailing '\0' for each string will be + * accounted in the prefixlen. The strings returned during the iteration + * should be sorted as expected. + */ +static void test_lpm_trie_iterate_strs(void) +{ + static const char * const keys[] = { + "ab", "abO", "abc", "abo", "abS", "abcd", + }; + const char *sorted_keys[ARRAY_SIZE(keys)]; + struct lpm_trie_bytes_key key, next_key; + unsigned int value, got, i, j, len; + struct lpm_trie_bytes_key *cur; + int fd, err; + + fd = lpm_trie_create(sizeof(key), sizeof(value), ARRAY_SIZE(keys)); + + for (i = 0; i < ARRAY_SIZE(keys); i++) { + unsigned int flags; + + /* add i-th element */ + flags = i % 2 ? BPF_NOEXIST : 0; + len = strlen(keys[i]); + /* include the trailing '\0' */ + key.prefixlen = (len + 1) * 8; + memset(key.data, 0, sizeof(key.data)); + memcpy(key.data, keys[i], len); + value = i + 100; + err = bpf_map_update_elem(fd, &key, &value, flags); + CHECK(err, "add elem", "#%u error %d\n", i, err); + + err = bpf_map_lookup_elem(fd, &key, &got); + CHECK(err, "lookup elem", "#%u error %d\n", i, err); + CHECK(got != value, "lookup elem", "#%u expect %u got %u\n", i, value, got); + + /* re-add i-th element (Error) */ + err = bpf_map_update_elem(fd, &key, &value, BPF_NOEXIST); + CHECK(err != -EEXIST, "re-add elem", "#%u error %d\n", i, err); + + /* Overwrite i-th element */ + flags = i % 2 ? 0 : BPF_EXIST; + value = i; + err = bpf_map_update_elem(fd, &key, &value, flags); + CHECK(err, "update elem", "error %d\n", err); + + /* Lookup #[0~i] elements */ + for (j = 0; j <= i; j++) { + len = strlen(keys[j]); + key.prefixlen = (len + 1) * 8; + memset(key.data, 0, sizeof(key.data)); + memcpy(key.data, keys[j], len); + err = bpf_map_lookup_elem(fd, &key, &got); + CHECK(err, "lookup elem", "#%u/%u error %d\n", i, j, err); + CHECK(got != j, "lookup elem", "#%u/%u expect %u got %u\n", + i, j, value, got); + } + } + + /* Add element to a full qp-trie (Error) */ + key.prefixlen = sizeof(key.data) * 8; + memset(key.data, 0, sizeof(key.data)); + value = 0; + err = bpf_map_update_elem(fd, &key, &value, 0); + CHECK(err != -ENOSPC, "add to full qp-trie", "error %d\n", err); + + /* Iterate sorted elements: no deletion */ + memcpy(sorted_keys, keys, sizeof(keys)); + qsort(sorted_keys, ARRAY_SIZE(sorted_keys), sizeof(sorted_keys[0]), cmp_str); + cur = NULL; + for (i = 0; i < ARRAY_SIZE(sorted_keys); i++) { + len = strlen(sorted_keys[i]); + err = bpf_map_get_next_key(fd, cur, &next_key); + CHECK(err, "iterate", "#%u error %d\n", i, err); + CHECK(next_key.prefixlen != (len + 1) * 8, "iterate", + "#%u invalid len %u expect %u\n", + i, next_key.prefixlen, (len + 1) * 8); + CHECK(memcmp(sorted_keys[i], next_key.data, len + 1), "iterate", + "#%u got %.*s exp %.*s\n", i, len, next_key.data, len, sorted_keys[i]); + + cur = &next_key; + } + err = bpf_map_get_next_key(fd, cur, &next_key); + CHECK(err != -ENOENT, "more element", "error %d\n", err); + + /* Iterate sorted elements: delete the found key after each iteration */ + cur = NULL; + for (i = 0; i < ARRAY_SIZE(sorted_keys); i++) { + len = strlen(sorted_keys[i]); + err = bpf_map_get_next_key(fd, cur, &next_key); + CHECK(err, "iterate", "#%u error %d\n", i, err); + CHECK(next_key.prefixlen != (len + 1) * 8, "iterate", + "#%u invalid len %u expect %u\n", + i, next_key.prefixlen, (len + 1) * 8); + CHECK(memcmp(sorted_keys[i], next_key.data, len + 1), "iterate", + "#%u got %.*s exp %.*s\n", i, len, next_key.data, len, sorted_keys[i]); + + cur = &next_key; + + err = bpf_map_delete_elem(fd, cur); + CHECK(err, "delete", "#%u error %d\n", i, err); + } + err = bpf_map_get_next_key(fd, cur, &next_key); + CHECK(err != -ENOENT, "non-empty qp-trie", "error %d\n", err); + + close(fd); +} + +/* Use the fixed prefixlen (32) and save integers in LPM trie. The iteration of + * LPM trie will return these integers in big-endian order, therefore, convert + * these integers to big-endian before update. After each iteration, delete the + * found key (the smallest integer) and expect the next iteration will return + * the second smallest number. + */ +static void test_lpm_trie_iterate_ints(void) +{ + struct lpm_trie_int_key key, next_key; + unsigned int i, max_entries; + struct lpm_trie_int_key *cur; + unsigned int *data_set; + int fd, err; + bool value; + + max_entries = 4096; + data_set = calloc(max_entries, sizeof(*data_set)); + CHECK(!data_set, "malloc", "no mem\n"); + for (i = 0; i < max_entries; i++) + data_set[i] = i; + + fd = lpm_trie_create(sizeof(key), sizeof(value), max_entries); + value = true; + for (i = 0; i < max_entries; i++) { + key.prefixlen = 32; + key.data = htobe32(data_set[i]); + + err = bpf_map_update_elem(fd, &key, &value, BPF_NOEXIST); + CHECK(err, "add elem", "#%u error %d\n", i, err); + } + + cur = NULL; + for (i = 0; i < max_entries; i++) { + err = bpf_map_get_next_key(fd, cur, &next_key); + CHECK(err, "iterate", "#%u error %d\n", i, err); + CHECK(next_key.prefixlen != 32, "iterate", "#%u invalid len %u\n", + i, next_key.prefixlen); + CHECK(be32toh(next_key.data) != data_set[i], "iterate", "#%u got 0x%x exp 0x%x\n", + i, be32toh(next_key.data), data_set[i]); + cur = &next_key; + + /* + * Delete the minimal key, the next call of bpf_get_next_key() + * will return the second minimal key. + */ + err = bpf_map_delete_elem(fd, &next_key); + CHECK(err, "del elem", "#%u elem error %d\n", i, err); + } + err = bpf_map_get_next_key(fd, cur, &next_key); + CHECK(err != -ENOENT, "more element", "error %d\n", err); + + err = bpf_map_get_next_key(fd, NULL, &next_key); + CHECK(err != -ENOENT, "no-empty qp-trie", "error %d\n", err); + + free(data_set); + + close(fd); +} + void test_lpm_trie_map_basic_ops(void) { int i; @@ -789,5 +1179,10 @@ void test_lpm_trie_map_basic_ops(void) test_lpm_get_next_key(); test_lpm_multi_thread(); + test_lpm_trie_update_flags(); + test_lpm_trie_update_full_map(); + test_lpm_trie_iterate_strs(); + test_lpm_trie_iterate_ints(); + printf("%s: PASS\n", __func__); } -- 2.51.0 From 492077668fb453b8b16c842fcf3fafc2ebc190e9 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 6 Dec 2024 08:20:06 -0800 Subject: [PATCH 03/16] x86/CPU/AMD: WARN when setting EFER.AUTOIBRS if and only if the WRMSR fails When ensuring EFER.AUTOIBRS is set, WARN only on a negative return code from msr_set_bit(), as '1' is used to indicate the WRMSR was successful ('0' indicates the MSR bit was already set). Fixes: 8cc68c9c9e92 ("x86/CPU/AMD: Make sure EFER[AIBRSE] is set") Reported-by: Nathan Chancellor Signed-off-by: Sean Christopherson Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/Z1MkNofJjt7Oq0G6@google.com Closes: https://lore.kernel.org/all/20241205220604.GA2054199@thelio-3990X --- arch/x86/kernel/cpu/amd.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index d8408aafeed9..79d2e17f6582 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -1065,7 +1065,7 @@ static void init_amd(struct cpuinfo_x86 *c) */ if (spectre_v2_in_eibrs_mode(spectre_v2_enabled) && cpu_has(c, X86_FEATURE_AUTOIBRS)) - WARN_ON_ONCE(msr_set_bit(MSR_EFER, _EFER_AUTOIBRS)); + WARN_ON_ONCE(msr_set_bit(MSR_EFER, _EFER_AUTOIBRS) < 0); /* AMD CPUs don't need fencing after x2APIC/TSC_DEADLINE MSR writes. */ clear_cpu_cap(c, X86_FEATURE_APIC_MSRS_FENCE); -- 2.51.0 From 7912405643a14b527cd4a4f33c1d4392da900888 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 1 Dec 2024 12:17:30 +0100 Subject: [PATCH 04/16] modpost: Add .irqentry.text to OTHER_SECTIONS The compiler can fully inline the actual handler function of an interrupt entry into the .irqentry.text entry point. If such a function contains an access which has an exception table entry, modpost complains about a section mismatch: WARNING: vmlinux.o(__ex_table+0x447c): Section mismatch in reference ... The relocation at __ex_table+0x447c references section ".irqentry.text" which is not in the list of authorized sections. Add .irqentry.text to OTHER_SECTIONS to cure the issue. Reported-by: Sergey Senozhatsky Signed-off-by: Thomas Gleixner Cc: stable@vger.kernel.org # needed for linux-5.4-y Link: https://lore.kernel.org/all/20241128111844.GE10431@google.com/ Signed-off-by: Masahiro Yamada --- scripts/mod/modpost.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/mod/modpost.c b/scripts/mod/modpost.c index 0584cbcdbd2d..fb787a5715f5 100644 --- a/scripts/mod/modpost.c +++ b/scripts/mod/modpost.c @@ -772,7 +772,7 @@ static void check_section(const char *modname, struct elf_info *elf, ".ltext", ".ltext.*" #define OTHER_TEXT_SECTIONS ".ref.text", ".head.text", ".spinlock.text", \ ".fixup", ".entry.text", ".exception.text", \ - ".coldtext", ".softirqentry.text" + ".coldtext", ".softirqentry.text", ".irqentry.text" #define ALL_TEXT_SECTIONS ".init.text", ".exit.text", \ TEXT_SECTIONS, OTHER_TEXT_SECTIONS -- 2.51.0 From d8d326d64f6702caab01ea0cd48f6c0054f3d1b4 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Sun, 8 Dec 2024 16:56:45 +0900 Subject: [PATCH 05/16] kbuild: deb-pkg: fix build error with O= Since commit 13b25489b6f8 ("kbuild: change working directory to external module directory with M="), the Debian package build fails if a relative path is specified with the O= option. $ make O=build bindeb-pkg [ snip ] dpkg-deb: building package 'linux-image-6.13.0-rc1' in '../linux-image-6.13.0-rc1_6.13.0-rc1-6_amd64.deb'. Rebuilding host programs with x86_64-linux-gnu-gcc... make[6]: Entering directory '/home/masahiro/linux/build' /home/masahiro/linux/Makefile:190: *** specified kernel directory "build" does not exist. Stop. This occurs because the sub_make_done flag is cleared, even though the working directory is already in the output directory. Passing KBUILD_OUTPUT=. resolves the issue. Fixes: 13b25489b6f8 ("kbuild: change working directory to external module directory with M=") Reported-by: Charlie Jenkins Closes: https://lore.kernel.org/all/Z1DnP-GJcfseyrM3@ghost/ Tested-by: Charlie Jenkins Reviewed-by: Charlie Jenkins Signed-off-by: Masahiro Yamada --- scripts/package/install-extmod-build | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/package/install-extmod-build b/scripts/package/install-extmod-build index 64d958ee45f3..d3c5b104c063 100755 --- a/scripts/package/install-extmod-build +++ b/scripts/package/install-extmod-build @@ -69,7 +69,7 @@ if [ "${CC}" != "${HOSTCC}" ]; then # # Use the single-target build to avoid the modpost invocation, which # would overwrite Module.symvers. - "${MAKE}" HOSTCC="${CC}" KBUILD_EXTMOD="${destdir}" scripts/ + "${MAKE}" HOSTCC="${CC}" KBUILD_OUTPUT=. KBUILD_EXTMOD="${destdir}" scripts/ cat <<-'EOF' > "${destdir}/scripts/Kbuild" subdir-y := basic @@ -78,7 +78,7 @@ if [ "${CC}" != "${HOSTCC}" ]; then EOF # Run once again to rebuild scripts/basic/ and scripts/mod/modpost. - "${MAKE}" HOSTCC="${CC}" KBUILD_EXTMOD="${destdir}" scripts/ + "${MAKE}" HOSTCC="${CC}" KBUILD_OUTPUT=. KBUILD_EXTMOD="${destdir}" scripts/ rm -f "${destdir}/Kbuild" "${destdir}/scripts/Kbuild" fi -- 2.51.0 From fac04efc5c793dccbd07e2d59af9f90b7fc0dca4 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sun, 8 Dec 2024 14:03:39 -0800 Subject: [PATCH 06/16] Linux 6.13-rc2 --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 93ab62cef244..64c594bd7ad0 100644 --- a/Makefile +++ b/Makefile @@ -2,7 +2,7 @@ VERSION = 6 PATCHLEVEL = 13 SUBLEVEL = 0 -EXTRAVERSION = -rc1 +EXTRAVERSION = -rc2 NAME = Baby Opossum Posse # *DOCUMENTATION* -- 2.51.0 From 42314738906380cbd3b6e9caf3ad34e1b2d66035 Mon Sep 17 00:00:00 2001 From: Richard Acayan Date: Wed, 13 Nov 2024 19:47:16 -0500 Subject: [PATCH 07/16] iommu/arm-smmu-qcom: add sdm670 adreno iommu compatible Add the compatible for the separate IOMMU on SDM670 for the Adreno GPU. This IOMMU has the compatible strings: "qcom,sdm670-smmu-v2", "qcom,adreno-smmu", "qcom,smmu-v2" While the SMMU 500 doesn't need an entry for this specific SoC, the SMMU v2 compatible should have its own entry, as the fallback entry in arm-smmu.c handles "qcom,smmu-v2" without per-process page table support unless there is an entry here. This entry can't be the "qcom,adreno-smmu" compatible because dedicated GPU IOMMUs can also be SMMU 500 with different handling. Signed-off-by: Richard Acayan Reviewed-by: Dmitry Baryshkov Link: https://lore.kernel.org/r/20241114004713.42404-6-mailingradian@gmail.com Signed-off-by: Will Deacon --- drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c b/drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c index 6372f3e25c4b..601fb878d0ef 100644 --- a/drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c +++ b/drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c @@ -567,6 +567,7 @@ static const struct of_device_id __maybe_unused qcom_smmu_impl_of_match[] = { { .compatible = "qcom,sc8180x-smmu-500", .data = &qcom_smmu_500_impl0_data }, { .compatible = "qcom,sc8280xp-smmu-500", .data = &qcom_smmu_500_impl0_data }, { .compatible = "qcom,sdm630-smmu-v2", .data = &qcom_smmu_v2_data }, + { .compatible = "qcom,sdm670-smmu-v2", .data = &qcom_smmu_v2_data }, { .compatible = "qcom,sdm845-smmu-v2", .data = &qcom_smmu_v2_data }, { .compatible = "qcom,sdm845-smmu-500", .data = &sdm845_smmu_500_data }, { .compatible = "qcom,sm6115-smmu-500", .data = &qcom_smmu_500_impl0_data}, -- 2.51.0 From 43ca55f5555b53d0c37039c62d75e882cccbfb69 Mon Sep 17 00:00:00 2001 From: Pranjal Shrivastava Date: Tue, 3 Dec 2024 18:49:05 +0000 Subject: [PATCH 08/16] iommu/arm-smmu-v3: Introduce struct arm_smmu_event Introduce `struct arm_smmu_event` to represent event records. Parse out relevant fields from raw event records for ease and use the new `struct arm_smmu_event` instead. Signed-off-by: Pranjal Shrivastava Link: https://lore.kernel.org/r/20241203184906.2264528-2-praan@google.com Signed-off-by: Will Deacon --- drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c | 55 ++++++++++++++------- drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h | 18 +++++++ 2 files changed, 54 insertions(+), 19 deletions(-) diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c index a5c7002ff75b..9fcba8fab9e3 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c @@ -1759,17 +1759,34 @@ arm_smmu_find_master(struct arm_smmu_device *smmu, u32 sid) } /* IRQ and event handlers */ -static int arm_smmu_handle_evt(struct arm_smmu_device *smmu, u64 *evt) +static void arm_smmu_decode_event(u64 *raw, struct arm_smmu_event *event) +{ + event->id = FIELD_GET(EVTQ_0_ID, raw[0]); + event->sid = FIELD_GET(EVTQ_0_SID, raw[0]); + event->ssv = FIELD_GET(EVTQ_0_SSV, raw[0]); + event->ssid = event->ssv ? FIELD_GET(EVTQ_0_SSID, raw[0]) : IOMMU_NO_PASID; + event->privileged = FIELD_GET(EVTQ_1_PnU, raw[1]); + event->instruction = FIELD_GET(EVTQ_1_InD, raw[1]); + event->s2 = FIELD_GET(EVTQ_1_S2, raw[1]); + event->read = FIELD_GET(EVTQ_1_RnW, raw[1]); + event->stag = FIELD_GET(EVTQ_1_STAG, raw[1]); + event->stall = FIELD_GET(EVTQ_1_STALL, raw[1]); + event->class = FIELD_GET(EVTQ_1_CLASS, raw[1]); + event->iova = FIELD_GET(EVTQ_2_ADDR, raw[2]); + event->ipa = raw[3] & EVTQ_3_IPA; + event->fetch_addr = raw[3] & EVTQ_3_FETCH_ADDR; +} + +static int arm_smmu_handle_evt(struct arm_smmu_device *smmu, + struct arm_smmu_event *event) { int ret = 0; u32 perm = 0; struct arm_smmu_master *master; - bool ssid_valid = evt[0] & EVTQ_0_SSV; - u32 sid = FIELD_GET(EVTQ_0_SID, evt[0]); struct iopf_fault fault_evt = { }; struct iommu_fault *flt = &fault_evt.fault; - switch (FIELD_GET(EVTQ_0_ID, evt[0])) { + switch (event->id) { case EVT_ID_TRANSLATION_FAULT: case EVT_ID_ADDR_SIZE_FAULT: case EVT_ID_ACCESS_FAULT: @@ -1779,35 +1796,35 @@ static int arm_smmu_handle_evt(struct arm_smmu_device *smmu, u64 *evt) return -EOPNOTSUPP; } - if (!(evt[1] & EVTQ_1_STALL)) + if (!event->stall) return -EOPNOTSUPP; - if (evt[1] & EVTQ_1_RnW) + if (event->read) perm |= IOMMU_FAULT_PERM_READ; else perm |= IOMMU_FAULT_PERM_WRITE; - if (evt[1] & EVTQ_1_InD) + if (event->instruction) perm |= IOMMU_FAULT_PERM_EXEC; - if (evt[1] & EVTQ_1_PnU) + if (event->privileged) perm |= IOMMU_FAULT_PERM_PRIV; flt->type = IOMMU_FAULT_PAGE_REQ; flt->prm = (struct iommu_fault_page_request) { .flags = IOMMU_FAULT_PAGE_REQUEST_LAST_PAGE, - .grpid = FIELD_GET(EVTQ_1_STAG, evt[1]), + .grpid = event->stag, .perm = perm, - .addr = FIELD_GET(EVTQ_2_ADDR, evt[2]), + .addr = event->iova, }; - if (ssid_valid) { + if (event->ssv) { flt->prm.flags |= IOMMU_FAULT_PAGE_REQUEST_PASID_VALID; - flt->prm.pasid = FIELD_GET(EVTQ_0_SSID, evt[0]); + flt->prm.pasid = event->ssid; } mutex_lock(&smmu->streams_mutex); - master = arm_smmu_find_master(smmu, sid); + master = arm_smmu_find_master(smmu, event->sid); if (!master) { ret = -EINVAL; goto out_unlock; @@ -1822,23 +1839,23 @@ out_unlock: static irqreturn_t arm_smmu_evtq_thread(int irq, void *dev) { int i, ret; + u64 evt[EVTQ_ENT_DWORDS]; + struct arm_smmu_event event = {0}; struct arm_smmu_device *smmu = dev; struct arm_smmu_queue *q = &smmu->evtq.q; struct arm_smmu_ll_queue *llq = &q->llq; static DEFINE_RATELIMIT_STATE(rs, DEFAULT_RATELIMIT_INTERVAL, DEFAULT_RATELIMIT_BURST); - u64 evt[EVTQ_ENT_DWORDS]; do { while (!queue_remove_raw(q, evt)) { - u8 id = FIELD_GET(EVTQ_0_ID, evt[0]); - - ret = arm_smmu_handle_evt(smmu, evt); + arm_smmu_decode_event(evt, &event); + ret = arm_smmu_handle_evt(smmu, &event); if (!ret || !__ratelimit(&rs)) continue; - dev_info(smmu->dev, "event 0x%02x received:\n", id); - for (i = 0; i < ARRAY_SIZE(evt); ++i) + dev_info(smmu->dev, "event 0x%02x received:\n", event.id); + for (i = 0; i < EVTQ_ENT_DWORDS; ++i) dev_info(smmu->dev, "\t0x%016llx\n", (unsigned long long)evt[i]); diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h index 0107d3f333a1..c37ed3c925ec 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h @@ -470,6 +470,7 @@ static inline unsigned int arm_smmu_cdtab_l2_idx(unsigned int ssid) #define EVTQ_1_TT_READ (1UL << 44) #define EVTQ_2_ADDR GENMASK_ULL(63, 0) #define EVTQ_3_IPA GENMASK_ULL(51, 12) +#define EVTQ_3_FETCH_ADDR GENMASK_ULL(51, 3) /* PRI queue */ #define PRIQ_ENT_SZ_SHIFT 4 @@ -789,6 +790,23 @@ struct arm_smmu_stream { struct rb_node node; }; +struct arm_smmu_event { + u8 stall : 1, + ssv : 1, + privileged : 1, + instruction : 1, + s2 : 1, + read : 1; + u8 id; + u8 class; + u16 stag; + u32 sid; + u32 ssid; + u64 iova; + u64 ipa; + u64 fetch_addr; +}; + /* SMMU private data for each master */ struct arm_smmu_master { struct arm_smmu_device *smmu; -- 2.51.0 From d814b70b9b901c823ddedd12757ca4a19b18c8f7 Mon Sep 17 00:00:00 2001 From: Pranjal Shrivastava Date: Tue, 3 Dec 2024 18:49:06 +0000 Subject: [PATCH 09/16] iommu/arm-smmu-v3: Log better event records Currently, the driver dumps the raw hex for a received event record. Improve this by leveraging `struct arm_smmu_event` for event fields and log human-readable event records with meaningful information. Signed-off-by: Pranjal Shrivastava Link: https://lore.kernel.org/r/20241203184906.2264528-3-praan@google.com Signed-off-by: Will Deacon --- drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c | 114 +++++++++++++++++--- drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h | 14 ++- 2 files changed, 115 insertions(+), 13 deletions(-) diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c index 9fcba8fab9e3..143ff6336a95 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c @@ -83,6 +83,28 @@ static struct arm_smmu_option_prop arm_smmu_options[] = { { 0, NULL}, }; +static const char * const event_str[] = { + [EVT_ID_BAD_STREAMID_CONFIG] = "C_BAD_STREAMID", + [EVT_ID_STE_FETCH_FAULT] = "F_STE_FETCH", + [EVT_ID_BAD_STE_CONFIG] = "C_BAD_STE", + [EVT_ID_STREAM_DISABLED_FAULT] = "F_STREAM_DISABLED", + [EVT_ID_BAD_SUBSTREAMID_CONFIG] = "C_BAD_SUBSTREAMID", + [EVT_ID_CD_FETCH_FAULT] = "F_CD_FETCH", + [EVT_ID_BAD_CD_CONFIG] = "C_BAD_CD", + [EVT_ID_TRANSLATION_FAULT] = "F_TRANSLATION", + [EVT_ID_ADDR_SIZE_FAULT] = "F_ADDR_SIZE", + [EVT_ID_ACCESS_FAULT] = "F_ACCESS", + [EVT_ID_PERMISSION_FAULT] = "F_PERMISSION", + [EVT_ID_VMS_FETCH_FAULT] = "F_VMS_FETCH", +}; + +static const char * const event_class_str[] = { + [0] = "CD fetch", + [1] = "Stage 1 translation table fetch", + [2] = "Input address caused fault", + [3] = "Reserved", +}; + static int arm_smmu_domain_finalise(struct arm_smmu_domain *smmu_domain, struct arm_smmu_device *smmu, u32 flags); static int arm_smmu_alloc_cd_tables(struct arm_smmu_master *master); @@ -1759,8 +1781,11 @@ arm_smmu_find_master(struct arm_smmu_device *smmu, u32 sid) } /* IRQ and event handlers */ -static void arm_smmu_decode_event(u64 *raw, struct arm_smmu_event *event) +static void arm_smmu_decode_event(struct arm_smmu_device *smmu, u64 *raw, + struct arm_smmu_event *event) { + struct arm_smmu_master *master; + event->id = FIELD_GET(EVTQ_0_ID, raw[0]); event->sid = FIELD_GET(EVTQ_0_SID, raw[0]); event->ssv = FIELD_GET(EVTQ_0_SSV, raw[0]); @@ -1775,9 +1800,21 @@ static void arm_smmu_decode_event(u64 *raw, struct arm_smmu_event *event) event->iova = FIELD_GET(EVTQ_2_ADDR, raw[2]); event->ipa = raw[3] & EVTQ_3_IPA; event->fetch_addr = raw[3] & EVTQ_3_FETCH_ADDR; + event->ttrnw = FIELD_GET(EVTQ_1_TT_READ, raw[1]); + event->class_tt = false; + event->dev = NULL; + + if (event->id == EVT_ID_PERMISSION_FAULT) + event->class_tt = (event->class == EVTQ_1_CLASS_TT); + + mutex_lock(&smmu->streams_mutex); + master = arm_smmu_find_master(smmu, event->sid); + if (master) + event->dev = get_device(master->dev); + mutex_unlock(&smmu->streams_mutex); } -static int arm_smmu_handle_evt(struct arm_smmu_device *smmu, +static int arm_smmu_handle_event(struct arm_smmu_device *smmu, struct arm_smmu_event *event) { int ret = 0; @@ -1836,9 +1873,67 @@ out_unlock: return ret; } +static void arm_smmu_dump_raw_event(struct arm_smmu_device *smmu, u64 *raw, + struct arm_smmu_event *event) +{ + int i; + + dev_err(smmu->dev, "event 0x%02x received:\n", event->id); + + for (i = 0; i < EVTQ_ENT_DWORDS; ++i) + dev_err(smmu->dev, "\t0x%016llx\n", raw[i]); +} + +#define ARM_SMMU_EVT_KNOWN(e) ((e)->id < ARRAY_SIZE(event_str) && event_str[(e)->id]) +#define ARM_SMMU_LOG_EVT_STR(e) ARM_SMMU_EVT_KNOWN(e) ? event_str[(e)->id] : "UNKNOWN" +#define ARM_SMMU_LOG_CLIENT(e) (e)->dev ? dev_name((e)->dev) : "(unassigned sid)" + +static void arm_smmu_dump_event(struct arm_smmu_device *smmu, u64 *raw, + struct arm_smmu_event *evt, + struct ratelimit_state *rs) +{ + if (!__ratelimit(rs)) + return; + + arm_smmu_dump_raw_event(smmu, raw, evt); + + switch (evt->id) { + case EVT_ID_TRANSLATION_FAULT: + case EVT_ID_ADDR_SIZE_FAULT: + case EVT_ID_ACCESS_FAULT: + case EVT_ID_PERMISSION_FAULT: + dev_err(smmu->dev, "event: %s client: %s sid: %#x ssid: %#x iova: %#llx ipa: %#llx", + ARM_SMMU_LOG_EVT_STR(evt), ARM_SMMU_LOG_CLIENT(evt), + evt->sid, evt->ssid, evt->iova, evt->ipa); + + dev_err(smmu->dev, "%s %s %s %s \"%s\"%s%s stag: %#x", + evt->privileged ? "priv" : "unpriv", + evt->instruction ? "inst" : "data", + evt->read ? "read" : "write", + evt->s2 ? "s2" : "s1", event_class_str[evt->class], + evt->class_tt ? (evt->ttrnw ? " ttd_read" : " ttd_write") : "", + evt->stall ? " stall" : "", evt->stag); + + break; + + case EVT_ID_STE_FETCH_FAULT: + case EVT_ID_CD_FETCH_FAULT: + case EVT_ID_VMS_FETCH_FAULT: + dev_err(smmu->dev, "event: %s client: %s sid: %#x ssid: %#x fetch_addr: %#llx", + ARM_SMMU_LOG_EVT_STR(evt), ARM_SMMU_LOG_CLIENT(evt), + evt->sid, evt->ssid, evt->fetch_addr); + + break; + + default: + dev_err(smmu->dev, "event: %s client: %s sid: %#x ssid: %#x", + ARM_SMMU_LOG_EVT_STR(evt), ARM_SMMU_LOG_CLIENT(evt), + evt->sid, evt->ssid); + } +} + static irqreturn_t arm_smmu_evtq_thread(int irq, void *dev) { - int i, ret; u64 evt[EVTQ_ENT_DWORDS]; struct arm_smmu_event event = {0}; struct arm_smmu_device *smmu = dev; @@ -1849,16 +1944,11 @@ static irqreturn_t arm_smmu_evtq_thread(int irq, void *dev) do { while (!queue_remove_raw(q, evt)) { - arm_smmu_decode_event(evt, &event); - ret = arm_smmu_handle_evt(smmu, &event); - if (!ret || !__ratelimit(&rs)) - continue; - - dev_info(smmu->dev, "event 0x%02x received:\n", event.id); - for (i = 0; i < EVTQ_ENT_DWORDS; ++i) - dev_info(smmu->dev, "\t0x%016llx\n", - (unsigned long long)evt[i]); + arm_smmu_decode_event(smmu, evt, &event); + if (arm_smmu_handle_event(smmu, &event)) + arm_smmu_dump_event(smmu, evt, &event, &rs); + put_device(event.dev); cond_resched(); } diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h index c37ed3c925ec..c7f37fd47768 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h @@ -452,10 +452,18 @@ static inline unsigned int arm_smmu_cdtab_l2_idx(unsigned int ssid) #define EVTQ_0_ID GENMASK_ULL(7, 0) +#define EVT_ID_BAD_STREAMID_CONFIG 0x02 +#define EVT_ID_STE_FETCH_FAULT 0x03 +#define EVT_ID_BAD_STE_CONFIG 0x04 +#define EVT_ID_STREAM_DISABLED_FAULT 0x06 +#define EVT_ID_BAD_SUBSTREAMID_CONFIG 0x08 +#define EVT_ID_CD_FETCH_FAULT 0x09 +#define EVT_ID_BAD_CD_CONFIG 0x0a #define EVT_ID_TRANSLATION_FAULT 0x10 #define EVT_ID_ADDR_SIZE_FAULT 0x11 #define EVT_ID_ACCESS_FAULT 0x12 #define EVT_ID_PERMISSION_FAULT 0x13 +#define EVT_ID_VMS_FETCH_FAULT 0x25 #define EVTQ_0_SSV (1UL << 11) #define EVTQ_0_SSID GENMASK_ULL(31, 12) @@ -467,6 +475,7 @@ static inline unsigned int arm_smmu_cdtab_l2_idx(unsigned int ssid) #define EVTQ_1_RnW (1UL << 35) #define EVTQ_1_S2 (1UL << 39) #define EVTQ_1_CLASS GENMASK_ULL(41, 40) +#define EVTQ_1_CLASS_TT 0x01 #define EVTQ_1_TT_READ (1UL << 44) #define EVTQ_2_ADDR GENMASK_ULL(63, 0) #define EVTQ_3_IPA GENMASK_ULL(51, 12) @@ -796,7 +805,9 @@ struct arm_smmu_event { privileged : 1, instruction : 1, s2 : 1, - read : 1; + read : 1, + ttrnw : 1, + class_tt : 1; u8 id; u8 class; u16 stag; @@ -805,6 +816,7 @@ struct arm_smmu_event { u64 iova; u64 ipa; u64 fetch_addr; + struct device *dev; }; /* SMMU private data for each master */ -- 2.51.0 From 9b640ae7fbba13d45a8b9712dff2911a0c2b5ff4 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Thu, 5 Dec 2024 11:40:15 -0400 Subject: [PATCH 10/16] iommu/arm-smmuv3: Update comments about ATS and bypass The SMMUv3 spec has a note that BYPASS and ATS don't work together under the STE EATS field definition. However there is another section "13.6.4 Full ATS skipping stage 1" that explains under certain conditions BYPASS and ATS do work together if the STE is using S1DSS to select BYPASS and the CD table has the possibility for a substream. When these comments were written the understanding was that all forms of BYPASS just didn't work and this was to be a future problem to solve. It turns out that ATS and IDENTITY will always work just fine: - If STE.Config = BYPASS then the PCI ATS is disabled - If a PASID domain is attached then S1DSS = BYPASS and ATS will be enabled. This meets the requirements of 13.6.4 to automatically generate 1:1 ATS replies on the RID. Update the comments to reflect this. Fixes: 7497f4211f4f ("iommu/arm-smmu-v3: Make changing domains be hitless for ATS") Signed-off-by: Jason Gunthorpe Link: https://lore.kernel.org/r/0-v1-f27174f44f39+27a33-smmuv3_ats_note_jgg@nvidia.com Signed-off-by: Will Deacon --- drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c index 143ff6336a95..45e87ce4b973 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c @@ -2852,9 +2852,14 @@ int arm_smmu_attach_prepare(struct arm_smmu_attach_state *state, * Translation Requests and Translated transactions are denied * as though ATS is disabled for the stream (STE.EATS == 0b00), * causing F_BAD_ATS_TREQ and F_TRANSL_FORBIDDEN events - * (IHI0070Ea 5.2 Stream Table Entry). Thus ATS can only be - * enabled if we have arm_smmu_domain, those always have page - * tables. + * (IHI0070Ea 5.2 Stream Table Entry). + * + * However, if we have installed a CD table and are using S1DSS + * then ATS will work in S1DSS bypass. See "13.6.4 Full ATS + * skipping stage 1". + * + * Disable ATS if we are going to create a normal 0b100 bypass + * STE. */ state->ats_enabled = !state->disable_ats && arm_smmu_ats_supported(master); @@ -3177,8 +3182,10 @@ static void arm_smmu_attach_dev_ste(struct iommu_domain *domain, if (arm_smmu_ssids_in_use(&master->cd_table)) { /* * If a CD table has to be present then we need to run with ATS - * on even though the RID will fail ATS queries with UR. This is - * because we have no idea what the PASID's need. + * on because we have to assume a PASID is using ATS. For + * IDENTITY this will setup things so that S1DSS=bypass which + * follows the explanation in "13.6.4 Full ATS skipping stage 1" + * and allows for ATS on the RID to work. */ state.cd_needs_ats = true; arm_smmu_attach_prepare(&state, domain); -- 2.51.0 From 7d835134d4e13e9c30509fd24a42f8c2b94135ea Mon Sep 17 00:00:00 2001 From: Robin Murphy Date: Thu, 5 Dec 2024 16:33:55 +0000 Subject: [PATCH 11/16] iommu/arm-smmu: Make instance lookup robust Relying on the driver list was a cute idea for minimising the scope of our SMMU device lookups, however it turns out to have a subtle flaw. The SMMU device only gets added to that list after arm_smmu_device_probe() returns success, so there's actually no way the iommu_device_register() call from there could ever work as intended, even if it wasn't already hampered by the fwspec setup not happening early enough. Switch both arm_smmu_get_by_fwnode() implementations to use a platform bus lookup instead, which *will* reliably work. Also make sure that we don't register SMMUv2 instances until we've fully initialised them, to avoid similar consequences of the lookup now finding a device with no drvdata. Moving the error returns is also a perfect excuse to streamline them with dev_err_probe() in the process. Signed-off-by: Robin Murphy Link: https://lore.kernel.org/r/6d7ce1dc31873abdb75c895fb8bd2097cce098b4.1733406914.git.robin.murphy@arm.com Signed-off-by: Will Deacon --- drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c | 4 +-- drivers/iommu/arm/arm-smmu/arm-smmu.c | 29 +++++++++------------ 2 files changed, 15 insertions(+), 18 deletions(-) diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c index 45e87ce4b973..dbacf8986fa7 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c @@ -3351,8 +3351,8 @@ static struct platform_driver arm_smmu_driver; static struct arm_smmu_device *arm_smmu_get_by_fwnode(struct fwnode_handle *fwnode) { - struct device *dev = driver_find_device_by_fwnode(&arm_smmu_driver.driver, - fwnode); + struct device *dev = bus_find_device_by_fwnode(&platform_bus_type, fwnode); + put_device(dev); return dev ? dev_get_drvdata(dev) : NULL; } diff --git a/drivers/iommu/arm/arm-smmu/arm-smmu.c b/drivers/iommu/arm/arm-smmu/arm-smmu.c index 650664e0f6e3..0949f2734e5d 100644 --- a/drivers/iommu/arm/arm-smmu/arm-smmu.c +++ b/drivers/iommu/arm/arm-smmu/arm-smmu.c @@ -1411,8 +1411,8 @@ static bool arm_smmu_capable(struct device *dev, enum iommu_cap cap) static struct arm_smmu_device *arm_smmu_get_by_fwnode(struct fwnode_handle *fwnode) { - struct device *dev = driver_find_device_by_fwnode(&arm_smmu_driver.driver, - fwnode); + struct device *dev = bus_find_device_by_fwnode(&platform_bus_type, fwnode); + put_device(dev); return dev ? dev_get_drvdata(dev) : NULL; } @@ -2227,29 +2227,26 @@ static int arm_smmu_device_probe(struct platform_device *pdev) i, irq); } + platform_set_drvdata(pdev, smmu); + + /* Check for RMRs and install bypass SMRs if any */ + arm_smmu_rmr_install_bypass_smr(smmu); + + arm_smmu_device_reset(smmu); + arm_smmu_test_smr_masks(smmu); + err = iommu_device_sysfs_add(&smmu->iommu, smmu->dev, NULL, "smmu.%pa", &smmu->ioaddr); - if (err) { - dev_err(dev, "Failed to register iommu in sysfs\n"); - return err; - } + if (err) + return dev_err_probe(dev, err, "Failed to register iommu in sysfs\n"); err = iommu_device_register(&smmu->iommu, &arm_smmu_ops, using_legacy_binding ? NULL : dev); if (err) { - dev_err(dev, "Failed to register iommu\n"); iommu_device_sysfs_remove(&smmu->iommu); - return err; + return dev_err_probe(dev, err, "Failed to register iommu\n"); } - platform_set_drvdata(pdev, smmu); - - /* Check for RMRs and install bypass SMRs if any */ - arm_smmu_rmr_install_bypass_smr(smmu); - - arm_smmu_device_reset(smmu); - arm_smmu_test_smr_masks(smmu); - /* * We want to avoid touching dev->power.lock in fastpaths unless * it's really going to do something useful - pm_runtime_enabled() -- 2.51.0 From 97cb1fa0272646c2a033b05338bb8e0260879968 Mon Sep 17 00:00:00 2001 From: Robin Murphy Date: Thu, 5 Dec 2024 16:33:56 +0000 Subject: [PATCH 12/16] iommu/arm-smmu: Retire probe deferral workaround This reverts commit 229e6ee43d2a160a1592b83aad620d6027084aad. Now that the fundamental ordering issue between arm_smmu_get_by_fwnode() and iommu_device_register() is resolved, the race condition for client probe no longer exists either, so retire the specific workaround. Signed-off-by: Robin Murphy Link: https://lore.kernel.org/r/4167c5dfa052d4c8bb780f0a30af63dcfc4ce6c1.1733406914.git.robin.murphy@arm.com Signed-off-by: Will Deacon --- drivers/iommu/arm/arm-smmu/arm-smmu.c | 11 ----------- 1 file changed, 11 deletions(-) diff --git a/drivers/iommu/arm/arm-smmu/arm-smmu.c b/drivers/iommu/arm/arm-smmu/arm-smmu.c index 0949f2734e5d..79afc92e1d8b 100644 --- a/drivers/iommu/arm/arm-smmu/arm-smmu.c +++ b/drivers/iommu/arm/arm-smmu/arm-smmu.c @@ -1437,17 +1437,6 @@ static struct iommu_device *arm_smmu_probe_device(struct device *dev) goto out_free; } else { smmu = arm_smmu_get_by_fwnode(fwspec->iommu_fwnode); - - /* - * Defer probe if the relevant SMMU instance hasn't finished - * probing yet. This is a fragile hack and we'd ideally - * avoid this race in the core code. Until that's ironed - * out, however, this is the most pragmatic option on the - * table. - */ - if (!smmu) - return ERR_PTR(dev_err_probe(dev, -EPROBE_DEFER, - "smmu dev has not bound yet\n")); } ret = -EINVAL; -- 2.51.0 From fcbd621567420b3a2f21f49bbc056de8b273c625 Mon Sep 17 00:00:00 2001 From: Robin Murphy Date: Thu, 5 Dec 2024 16:33:57 +0000 Subject: [PATCH 13/16] iommu/arm-smmu-v3: Clean up more on probe failure kmemleak noticed that the iopf queue allocated deep down within arm_smmu_init_structures() can be leaked by a subsequent error return from arm_smmu_device_probe(). Furthermore, after arm_smmu_device_reset() we will also leave the SMMU enabled with an empty Stream Table, silently blocking all DMA. This proves rather annoying for debugging said probe failure, so let's handle it a bit better by putting the SMMU back into (more or less) the same state as if it hadn't probed at all. Signed-off-by: Robin Murphy Link: https://lore.kernel.org/r/5137901958471cf67f2fad5c2229f8a8f1ae901a.1733406914.git.robin.murphy@arm.com Signed-off-by: Will Deacon --- drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c index dbacf8986fa7..d8ebe18a5507 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c @@ -4777,7 +4777,7 @@ static int arm_smmu_device_probe(struct platform_device *pdev) /* Initialise in-memory data structures */ ret = arm_smmu_init_structures(smmu); if (ret) - return ret; + goto err_free_iopf; /* Record our private device structure */ platform_set_drvdata(pdev, smmu); @@ -4788,22 +4788,29 @@ static int arm_smmu_device_probe(struct platform_device *pdev) /* Reset the device */ ret = arm_smmu_device_reset(smmu); if (ret) - return ret; + goto err_disable; /* And we're up. Go go go! */ ret = iommu_device_sysfs_add(&smmu->iommu, dev, NULL, "smmu3.%pa", &ioaddr); if (ret) - return ret; + goto err_disable; ret = iommu_device_register(&smmu->iommu, &arm_smmu_ops, dev); if (ret) { dev_err(dev, "Failed to register iommu\n"); - iommu_device_sysfs_remove(&smmu->iommu); - return ret; + goto err_free_sysfs; } return 0; + +err_free_sysfs: + iommu_device_sysfs_remove(&smmu->iommu); +err_disable: + arm_smmu_device_disable(smmu); +err_free_iopf: + iopf_queue_free(smmu->evtq.iopf); + return ret; } static void arm_smmu_device_remove(struct platform_device *pdev) -- 2.51.0 From 46b3df8eb9bd035620bc48bd7a1f028490626621 Mon Sep 17 00:00:00 2001 From: Robin Murphy Date: Thu, 5 Dec 2024 16:33:58 +0000 Subject: [PATCH 14/16] iommu: Manage driver probe deferral better Since iommu_fwspec_init() absorbed the basic driver probe deferral check to wait for an IOMMU to register, we may as well handle the probe deferral timeout there as well. The current inconsistency of callers results in client devices deferring forever on an arm64 ACPI system where an SMMU has failed its own driver probe. Acked-by: Will Deacon Signed-off-by: Robin Murphy Link: https://lore.kernel.org/r/41fa59f156ef8d196d08fa75c4901e6d4b12e6c4.1733406914.git.robin.murphy@arm.com Signed-off-by: Will Deacon --- drivers/iommu/iommu.c | 2 +- drivers/iommu/of_iommu.c | 2 -- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index 599030e1e890..851fd5aeccf5 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -2819,7 +2819,7 @@ int iommu_fwspec_init(struct device *dev, struct fwnode_handle *iommu_fwnode) struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev); if (!ops) - return -EPROBE_DEFER; + return driver_deferred_probe_check_state(dev); if (fwspec) return ops == iommu_fwspec_ops(fwspec) ? 0 : -EINVAL; diff --git a/drivers/iommu/of_iommu.c b/drivers/iommu/of_iommu.c index e7a6a1611d19..97987cd78da9 100644 --- a/drivers/iommu/of_iommu.c +++ b/drivers/iommu/of_iommu.c @@ -29,8 +29,6 @@ static int of_iommu_xlate(struct device *dev, return -ENODEV; ret = iommu_fwspec_init(dev, of_fwnode_handle(iommu_spec->np)); - if (ret == -EPROBE_DEFER) - return driver_deferred_probe_check_state(dev); if (ret) return ret; -- 2.51.0 From 6e192214c6c82c2f52238d5e1865f11594e58a6f Mon Sep 17 00:00:00 2001 From: Robin Murphy Date: Thu, 5 Dec 2024 13:48:09 +0000 Subject: [PATCH 15/16] iommu/arm-smmu-v3: Document SVA interaction with new pagetable features Process pagetables may now be using new permission-indirection-based features which an SMMU may not understand when given such a table for SVA. Although SMMUv3.4 does add its own S1PIE feature, realistically we're still going to have to cope with feature mismatches between CPUs and SMMUs, so let's start simple and essentially just document the expectations for what falls out as-is. Although it seems unlikely for SVA applications to also depend on memory-hardening features, or vice-versa, the relative lifecycles make it tricky to enforce mutual exclusivity. Thankfully our PIE index allocation makes it relatively benign for an SMMU to keep interpreting them as direct permissions, the only real implication is that an SVA application cannot harden itself against its own devices with these features. Thus, inform the user about that just in case they have other expectations. Also we don't (yet) support LPA2, so deny SVA entirely if we're going to misunderstand the pagetable format altogether. Signed-off-by: Robin Murphy Link: https://lore.kernel.org/r/68a37b00a720f0827cac0e4f40e4d3a688924054.1733406275.git.robin.murphy@arm.com Signed-off-by: Will Deacon --- drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c index 1d3e71569775..9ba596430e7c 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c @@ -112,6 +112,15 @@ void arm_smmu_make_sva_cd(struct arm_smmu_cd *target, * from the current CPU register */ target->data[3] = cpu_to_le64(read_sysreg(mair_el1)); + + /* + * Note that we don't bother with S1PIE on the SMMU, we just rely on + * our default encoding scheme matching direct permissions anyway. + * SMMU has no notion of S1POE nor GCS, so make sure that is clear if + * either is enabled for CPUs, just in case anyone imagines otherwise. + */ + if (system_supports_poe() || system_supports_gcs()) + dev_warn_once(master->smmu->dev, "SVA devices ignore permission overlays and GCS\n"); } EXPORT_SYMBOL_IF_KUNIT(arm_smmu_make_sva_cd); @@ -206,8 +215,12 @@ bool arm_smmu_sva_supported(struct arm_smmu_device *smmu) unsigned long asid_bits; u32 feat_mask = ARM_SMMU_FEAT_COHERENCY; - if (vabits_actual == 52) + if (vabits_actual == 52) { + /* We don't support LPA2 */ + if (PAGE_SIZE != SZ_64K) + return false; feat_mask |= ARM_SMMU_FEAT_VAX; + } if ((smmu->features & feat_mask) != feat_mask) return false; -- 2.51.0 From 48e7b8e284e5be9fd1b54b60246bcbe9711d43e4 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Thu, 5 Dec 2024 11:43:27 -0400 Subject: [PATCH 16/16] iommu/arm-smmu-v3: Remove arm_smmu_domain_finalise() during attach Domains are now always finalized during allocation because the core code no longer permits a NULL dev argument to domain_alloc_paging/_flags(). Remove the late finalize during attach that supported domains that were not fully initialized. Signed-off-by: Jason Gunthorpe Link: https://lore.kernel.org/r/1-v1-0bb8d5313a27+27b-smmuv3_paging_flags_jgg@nvidia.com Signed-off-by: Will Deacon --- drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c | 37 +++++---------------- drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h | 1 - 2 files changed, 9 insertions(+), 29 deletions(-) diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c index d8ebe18a5507..dc5fab268f7d 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c @@ -2460,7 +2460,6 @@ struct arm_smmu_domain *arm_smmu_domain_alloc(void) if (!smmu_domain) return ERR_PTR(-ENOMEM); - mutex_init(&smmu_domain->init_mutex); INIT_LIST_HEAD(&smmu_domain->devices); spin_lock_init(&smmu_domain->devices_lock); @@ -2469,7 +2468,9 @@ struct arm_smmu_domain *arm_smmu_domain_alloc(void) static struct iommu_domain *arm_smmu_domain_alloc_paging(struct device *dev) { + struct arm_smmu_master *master = dev_iommu_priv_get(dev); struct arm_smmu_domain *smmu_domain; + int ret; /* * Allocate the domain and initialise some of its data structures. @@ -2480,15 +2481,10 @@ static struct iommu_domain *arm_smmu_domain_alloc_paging(struct device *dev) if (IS_ERR(smmu_domain)) return ERR_CAST(smmu_domain); - if (dev) { - struct arm_smmu_master *master = dev_iommu_priv_get(dev); - int ret; - - ret = arm_smmu_domain_finalise(smmu_domain, master->smmu, 0); - if (ret) { - kfree(smmu_domain); - return ERR_PTR(ret); - } + ret = arm_smmu_domain_finalise(smmu_domain, master->smmu, 0); + if (ret) { + kfree(smmu_domain); + return ERR_PTR(ret); } return &smmu_domain->domain; } @@ -2965,15 +2961,7 @@ static int arm_smmu_attach_dev(struct iommu_domain *domain, struct device *dev) state.master = master = dev_iommu_priv_get(dev); smmu = master->smmu; - mutex_lock(&smmu_domain->init_mutex); - - if (!smmu_domain->smmu) { - ret = arm_smmu_domain_finalise(smmu_domain, smmu, 0); - } else if (smmu_domain->smmu != smmu) - ret = -EINVAL; - - mutex_unlock(&smmu_domain->init_mutex); - if (ret) + if (smmu_domain->smmu != smmu) return ret; if (smmu_domain->stage == ARM_SMMU_DOMAIN_S1) { @@ -3030,16 +3018,9 @@ static int arm_smmu_s1_set_dev_pasid(struct iommu_domain *domain, struct arm_smmu_master *master = dev_iommu_priv_get(dev); struct arm_smmu_device *smmu = master->smmu; struct arm_smmu_cd target_cd; - int ret = 0; - mutex_lock(&smmu_domain->init_mutex); - if (!smmu_domain->smmu) - ret = arm_smmu_domain_finalise(smmu_domain, smmu, 0); - else if (smmu_domain->smmu != smmu) - ret = -EINVAL; - mutex_unlock(&smmu_domain->init_mutex); - if (ret) - return ret; + if (smmu_domain->smmu != smmu) + return -EINVAL; if (smmu_domain->stage != ARM_SMMU_DOMAIN_S1) return -EINVAL; diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h index c7f37fd47768..bd9d7c85576a 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h @@ -843,7 +843,6 @@ enum arm_smmu_domain_stage { struct arm_smmu_domain { struct arm_smmu_device *smmu; - struct mutex init_mutex; /* Protects smmu pointer */ struct io_pgtable_ops *pgtbl_ops; atomic_t nr_ats_masters; -- 2.51.0