From eea6c14c10ce208e5f6ab309fb1c141a39446a1b Mon Sep 17 00:00:00 2001 From: Jordan Rife Date: Tue, 22 Oct 2024 15:29:04 +0000 Subject: [PATCH 01/16] selftests/bpf: Retire test_sock.c Completely remove test_sock.c and associated config. Signed-off-by: Jordan Rife Link: https://lore.kernel.org/r/20241022152913.574836-5-jrife@google.com Signed-off-by: Martin KaFai Lau --- tools/testing/selftests/bpf/.gitignore | 1 - tools/testing/selftests/bpf/Makefile | 3 +- tools/testing/selftests/bpf/test_sock.c | 231 ------------------------ 3 files changed, 1 insertion(+), 234 deletions(-) delete mode 100644 tools/testing/selftests/bpf/test_sock.c diff --git a/tools/testing/selftests/bpf/.gitignore b/tools/testing/selftests/bpf/.gitignore index e6533b3400de..d45c9a9b304d 100644 --- a/tools/testing/selftests/bpf/.gitignore +++ b/tools/testing/selftests/bpf/.gitignore @@ -16,7 +16,6 @@ fixdep /test_progs-cpuv4 test_verifier_log feature -test_sock urandom_read test_sockmap test_lirc_mode2_user diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile index 28a76baa854d..c4fc9a3291a8 100644 --- a/tools/testing/selftests/bpf/Makefile +++ b/tools/testing/selftests/bpf/Makefile @@ -84,7 +84,7 @@ endif # Order correspond to 'make run_tests' order TEST_GEN_PROGS = test_verifier test_tag test_maps test_lru_map test_lpm_map test_progs \ - test_sock test_sockmap \ + test_sockmap \ test_tcpnotify_user test_sysctl \ test_progs-no_alu32 TEST_INST_SUBDIRS := no_alu32 @@ -335,7 +335,6 @@ JSON_WRITER := $(OUTPUT)/json_writer.o CAP_HELPERS := $(OUTPUT)/cap_helpers.o NETWORK_HELPERS := $(OUTPUT)/network_helpers.o -$(OUTPUT)/test_sock: $(CGROUP_HELPERS) $(TESTING_HELPERS) $(OUTPUT)/test_sockmap: $(CGROUP_HELPERS) $(TESTING_HELPERS) $(OUTPUT)/test_tcpnotify_user: $(CGROUP_HELPERS) $(TESTING_HELPERS) $(TRACE_HELPERS) $(OUTPUT)/test_sock_fields: $(CGROUP_HELPERS) $(TESTING_HELPERS) diff --git a/tools/testing/selftests/bpf/test_sock.c 
b/tools/testing/selftests/bpf/test_sock.c deleted file mode 100644 index f97850f1d84a..000000000000 --- a/tools/testing/selftests/bpf/test_sock.c +++ /dev/null @@ -1,231 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -// Copyright (c) 2018 Facebook - -#include -#include - -#include -#include -#include - -#include - -#include - -#include "cgroup_helpers.h" -#include -#include "bpf_util.h" - -#define CG_PATH "/foo" -#define MAX_INSNS 512 - -char bpf_log_buf[BPF_LOG_BUF_SIZE]; -static bool verbose = false; - -struct sock_test { - const char *descr; - /* BPF prog properties */ - struct bpf_insn insns[MAX_INSNS]; - enum bpf_attach_type expected_attach_type; - enum bpf_attach_type attach_type; - /* Socket properties */ - int domain; - int type; - /* Endpoint to bind() to */ - const char *ip; - unsigned short port; - unsigned short port_retry; - /* Expected test result */ - enum { - LOAD_REJECT, - ATTACH_REJECT, - BIND_REJECT, - SUCCESS, - RETRY_SUCCESS, - RETRY_REJECT - } result; -}; - -static struct sock_test tests[] = { -}; - -static size_t probe_prog_length(const struct bpf_insn *fp) -{ - size_t len; - - for (len = MAX_INSNS - 1; len > 0; --len) - if (fp[len].code != 0 || fp[len].imm != 0) - break; - return len + 1; -} - -static int load_sock_prog(const struct bpf_insn *prog, - enum bpf_attach_type attach_type) -{ - LIBBPF_OPTS(bpf_prog_load_opts, opts); - int ret, insn_cnt; - - insn_cnt = probe_prog_length(prog); - - opts.expected_attach_type = attach_type; - opts.log_buf = bpf_log_buf; - opts.log_size = BPF_LOG_BUF_SIZE; - opts.log_level = 2; - - ret = bpf_prog_load(BPF_PROG_TYPE_CGROUP_SOCK, NULL, "GPL", prog, insn_cnt, &opts); - if (verbose && ret < 0) - fprintf(stderr, "%s\n", bpf_log_buf); - - return ret; -} - -static int attach_sock_prog(int cgfd, int progfd, - enum bpf_attach_type attach_type) -{ - return bpf_prog_attach(progfd, cgfd, attach_type, BPF_F_ALLOW_OVERRIDE); -} - -static int bind_sock(int domain, int type, const char *ip, - unsigned short port, 
unsigned short port_retry) -{ - struct sockaddr_storage addr; - struct sockaddr_in6 *addr6; - struct sockaddr_in *addr4; - int sockfd = -1; - socklen_t len; - int res = SUCCESS; - - sockfd = socket(domain, type, 0); - if (sockfd < 0) - goto err; - - memset(&addr, 0, sizeof(addr)); - - if (domain == AF_INET) { - len = sizeof(struct sockaddr_in); - addr4 = (struct sockaddr_in *)&addr; - addr4->sin_family = domain; - addr4->sin_port = htons(port); - if (inet_pton(domain, ip, (void *)&addr4->sin_addr) != 1) - goto err; - } else if (domain == AF_INET6) { - len = sizeof(struct sockaddr_in6); - addr6 = (struct sockaddr_in6 *)&addr; - addr6->sin6_family = domain; - addr6->sin6_port = htons(port); - if (inet_pton(domain, ip, (void *)&addr6->sin6_addr) != 1) - goto err; - } else { - goto err; - } - - if (bind(sockfd, (const struct sockaddr *)&addr, len) == -1) { - /* sys_bind() may fail for different reasons, errno has to be - * checked to confirm that BPF program rejected it. - */ - if (errno != EPERM) - goto err; - if (port_retry) - goto retry; - res = BIND_REJECT; - goto out; - } - - goto out; -retry: - if (domain == AF_INET) - addr4->sin_port = htons(port_retry); - else - addr6->sin6_port = htons(port_retry); - if (bind(sockfd, (const struct sockaddr *)&addr, len) == -1) { - if (errno != EPERM) - goto err; - res = RETRY_REJECT; - } else { - res = RETRY_SUCCESS; - } - goto out; -err: - res = -1; -out: - close(sockfd); - return res; -} - -static int run_test_case(int cgfd, const struct sock_test *test) -{ - int progfd = -1; - int err = 0; - int res; - - printf("Test case: %s .. 
", test->descr); - progfd = load_sock_prog(test->insns, test->expected_attach_type); - if (progfd < 0) { - if (test->result == LOAD_REJECT) - goto out; - else - goto err; - } - - if (attach_sock_prog(cgfd, progfd, test->attach_type) < 0) { - if (test->result == ATTACH_REJECT) - goto out; - else - goto err; - } - - res = bind_sock(test->domain, test->type, test->ip, test->port, - test->port_retry); - if (res > 0 && test->result == res) - goto out; - -err: - err = -1; -out: - /* Detaching w/o checking return code: best effort attempt. */ - if (progfd != -1) - bpf_prog_detach(cgfd, test->attach_type); - close(progfd); - printf("[%s]\n", err ? "FAIL" : "PASS"); - return err; -} - -static int run_tests(int cgfd) -{ - int passes = 0; - int fails = 0; - int i; - - for (i = 0; i < ARRAY_SIZE(tests); ++i) { - if (run_test_case(cgfd, &tests[i])) - ++fails; - else - ++passes; - } - printf("Summary: %d PASSED, %d FAILED\n", passes, fails); - return fails ? -1 : 0; -} - -int main(int argc, char **argv) -{ - int cgfd = -1; - int err = 0; - - cgfd = cgroup_setup_and_join(CG_PATH); - if (cgfd < 0) - goto err; - - /* Use libbpf 1.0 API mode */ - libbpf_set_strict_mode(LIBBPF_STRICT_ALL); - - if (run_tests(cgfd)) - goto err; - - goto out; -err: - err = -1; -out: - close(cgfd); - cleanup_cgroup_environment(); - return err; -} -- 2.51.0 From 1f7c33630724dfe47f99748bd2a9a56ec8bd337f Mon Sep 17 00:00:00 2001 From: Mykyta Yatsenko Date: Wed, 23 Oct 2024 16:53:14 +0100 Subject: [PATCH 02/16] selftests/bpf: Increase verifier log limit in veristat The current default buffer size of 16MB allocated by veristat is no longer sufficient to hold the verifier logs of some production BPF programs. To address this issue, we need to increase the verifier log limit. Commit 7a9f5c65abcc ("bpf: increase verifier log limit") has already increased the supported buffer size by the kernel, but veristat users need to explicitly pass a log size argument to use the bigger log. 
This patch adds a function to detect the maximum verifier log size supported by the kernel and uses that by default in veristat. This ensures that veristat can handle larger verifier logs without requiring users to manually specify the log size. Signed-off-by: Mykyta Yatsenko Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20241023155314.126255-1-mykyta.yatsenko5@gmail.com --- tools/testing/selftests/bpf/veristat.c | 32 +++++++++++++++++++++++++- 1 file changed, 31 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/bpf/veristat.c b/tools/testing/selftests/bpf/veristat.c index c8efd44590d9..e12ef953fba8 100644 --- a/tools/testing/selftests/bpf/veristat.c +++ b/tools/testing/selftests/bpf/veristat.c @@ -16,6 +16,7 @@ #include #include #include +#include #include #include #include @@ -1109,6 +1110,35 @@ skip_freplace_fixup: return; } +static int max_verifier_log_size(void) +{ + const int SMALL_LOG_SIZE = UINT_MAX >> 8; + const int BIG_LOG_SIZE = UINT_MAX >> 2; + struct bpf_insn insns[] = { + { .code = BPF_ALU | BPF_MOV | BPF_X, .dst_reg = BPF_REG_0, }, + { .code = BPF_JMP | BPF_EXIT, }, + }; + LIBBPF_OPTS(bpf_prog_load_opts, opts, + .log_size = BIG_LOG_SIZE, + .log_buf = (void *)-1, + .log_level = 4 + ); + int ret, insn_cnt = ARRAY_SIZE(insns); + static int log_size; + + if (log_size != 0) + return log_size; + + ret = bpf_prog_load(BPF_PROG_TYPE_TRACEPOINT, NULL, "GPL", insns, insn_cnt, &opts); + + if (ret == -EFAULT) + log_size = BIG_LOG_SIZE; + else /* ret == -EINVAL, big log size is not supported by the verifier */ + log_size = SMALL_LOG_SIZE; + + return log_size; +} + static int process_prog(const char *filename, struct bpf_object *obj, struct bpf_program *prog) { const char *base_filename = basename(strdupa(filename)); @@ -1132,7 +1162,7 @@ static int process_prog(const char *filename, struct bpf_object *obj, struct bpf memset(stats, 0, sizeof(*stats)); if (env.verbose || env.top_src_lines > 0) { - buf_sz = env.log_size ? 
env.log_size : 16 * 1024 * 1024; + buf_sz = env.log_size ? env.log_size : max_verifier_log_size(); buf = malloc(buf_sz); if (!buf) return -ENOMEM; -- 2.51.0 From 2c3d022abe6c3165109393b75a127b06c2c70063 Mon Sep 17 00:00:00 2001 From: Eder Zulian Date: Tue, 22 Oct 2024 19:23:27 +0200 Subject: [PATCH 03/16] resolve_btfids: Fix compiler warnings MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit Initialize 'set' and 'set8' pointers to NULL in sets_patch to prevent possible compiler warnings which are issued for various optimization levels, but do not happen when compiling with current default compilation options. For example, when compiling resolve_btfids with $ make "HOSTCFLAGS=-O2 -Wall" -C tools/bpf/resolve_btfids/ clean all Clang version 17.0.6 and GCC 13.3.1 issue following -Wmaybe-uninitialized warnings for variables 'set8' and 'set': In function ‘sets_patch’, inlined from ‘symbols_patch’ at main.c:748:6, inlined from ‘main’ at main.c:823:6: main.c:163:9: warning: ‘set8’ may be used uninitialized [-Wmaybe-uninitialized] 163 | eprintf(1, verbose, pr_fmt(fmt), ##__VA_ARGS__) | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ main.c:729:17: note: in expansion of macro ‘pr_debug’ 729 | pr_debug("sorting addr %5lu: cnt %6d [%s]\n", | ^~~~~~~~ main.c: In function ‘main’: main.c:682:37: note: ‘set8’ was declared here 682 | struct btf_id_set8 *set8; | ^~~~ In function ‘sets_patch’, inlined from ‘symbols_patch’ at main.c:748:6, inlined from ‘main’ at main.c:823:6: main.c:163:9: warning: ‘set’ may be used uninitialized [-Wmaybe-uninitialized] 163 | eprintf(1, verbose, pr_fmt(fmt), ##__VA_ARGS__) | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ main.c:729:17: note: in expansion of macro ‘pr_debug’ 729 | pr_debug("sorting addr %5lu: cnt %6d [%s]\n", | ^~~~~~~~ main.c: In function ‘main’: main.c:683:36: note: ‘set’ was declared here 683 | struct btf_id_set *set; | ^~~ Signed-off-by: Eder Zulian Signed-off-by: Andrii Nakryiko Acked-by: 
Jiri Olsa Link: https://lore.kernel.org/bpf/20241022172329.3871958-2-ezulian@redhat.com --- tools/bpf/resolve_btfids/main.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/bpf/resolve_btfids/main.c b/tools/bpf/resolve_btfids/main.c index d54aaa0619df..bd9f960bce3d 100644 --- a/tools/bpf/resolve_btfids/main.c +++ b/tools/bpf/resolve_btfids/main.c @@ -679,8 +679,8 @@ static int sets_patch(struct object *obj) next = rb_first(&obj->sets); while (next) { - struct btf_id_set8 *set8; - struct btf_id_set *set; + struct btf_id_set8 *set8 = NULL; + struct btf_id_set *set = NULL; unsigned long addr, off; struct btf_id *id; -- 2.51.0 From 7f4ec77f3fee41dd6a41f03a40703889e6e8f7b2 Mon Sep 17 00:00:00 2001 From: Eder Zulian Date: Tue, 22 Oct 2024 19:23:28 +0200 Subject: [PATCH 04/16] libbpf: Prevent compiler warnings/errors MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit Initialize 'new_off' and 'pad_bits' to 0 and 'pad_type' to NULL in btf_dump_emit_bit_padding to prevent compiler warnings/errors which are observed when compiling with 'EXTRA_CFLAGS=-g -Og' options, but do not happen when compiling with current default options. For example, when compiling libbpf with $ make "EXTRA_CFLAGS=-g -Og" -C tools/lib/bpf/ clean all Clang version 17.0.6 and GCC 13.3.1 fail to compile btf_dump.c due to following errors: btf_dump.c: In function ‘btf_dump_emit_bit_padding’: btf_dump.c:903:42: error: ‘new_off’ may be used uninitialized [-Werror=maybe-uninitialized] 903 | if (new_off > cur_off && new_off <= next_off) { | ~~~~~~~~^~~~~~~~~~~ btf_dump.c:870:13: note: ‘new_off’ was declared here 870 | int new_off, pad_bits, bits, i; | ^~~~~~~ btf_dump.c:917:25: error: ‘pad_type’ may be used uninitialized [-Werror=maybe-uninitialized] 917 | btf_dump_printf(d, "\n%s%s: %d;", pfx(lvl), pad_type, | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 918 | in_bitfield ? 
new_off - cur_off : 0); | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ btf_dump.c:871:21: note: ‘pad_type’ was declared here 871 | const char *pad_type; | ^~~~~~~~ btf_dump.c:930:20: error: ‘pad_bits’ may be used uninitialized [-Werror=maybe-uninitialized] 930 | if (bits == pad_bits) { | ^ btf_dump.c:870:22: note: ‘pad_bits’ was declared here 870 | int new_off, pad_bits, bits, i; | ^~~~~~~~ cc1: all warnings being treated as errors Signed-off-by: Eder Zulian Signed-off-by: Andrii Nakryiko Acked-by: Jiri Olsa Link: https://lore.kernel.org/bpf/20241022172329.3871958-3-ezulian@redhat.com --- tools/lib/bpf/btf_dump.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/lib/bpf/btf_dump.c b/tools/lib/bpf/btf_dump.c index 8440c2c5ad3e..468392f9882d 100644 --- a/tools/lib/bpf/btf_dump.c +++ b/tools/lib/bpf/btf_dump.c @@ -867,8 +867,8 @@ static void btf_dump_emit_bit_padding(const struct btf_dump *d, } pads[] = { {"long", d->ptr_sz * 8}, {"int", 32}, {"short", 16}, {"char", 8} }; - int new_off, pad_bits, bits, i; - const char *pad_type; + int new_off = 0, pad_bits = 0, bits, i; + const char *pad_type = NULL; if (cur_off >= next_off) return; /* no gap */ -- 2.51.0 From 7a4ffec9fd54ea27395e24dff726dbf58e2fe06b Mon Sep 17 00:00:00 2001 From: Eder Zulian Date: Tue, 22 Oct 2024 19:23:29 +0200 Subject: [PATCH 05/16] libsubcmd: Silence compiler warning MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit Initialize the pointer 'o' in options__order to NULL to prevent a compiler warning/error which is observed when compiling with the '-Og' option, but is not emitted by the compiler with the current default compilation options. 
For example, when compiling libsubcmd with $ make "EXTRA_CFLAGS=-Og" -C tools/lib/subcmd/ clean all Clang version 17.0.6 and GCC 13.3.1 fail to compile parse-options.c due to following error: parse-options.c: In function ‘options__order’: parse-options.c:832:9: error: ‘o’ may be used uninitialized [-Werror=maybe-uninitialized] 832 | memcpy(&ordered[nr_opts], o, sizeof(*o)); | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ parse-options.c:810:30: note: ‘o’ was declared here 810 | const struct option *o, *p = opts; | ^ cc1: all warnings being treated as errors Signed-off-by: Eder Zulian Signed-off-by: Andrii Nakryiko Acked-by: Arnaldo Carvalho de Melo Acked-by: Jiri Olsa Link: https://lore.kernel.org/bpf/20241022172329.3871958-4-ezulian@redhat.com --- tools/lib/subcmd/parse-options.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/lib/subcmd/parse-options.c b/tools/lib/subcmd/parse-options.c index eb896d30545b..555d617c1f50 100644 --- a/tools/lib/subcmd/parse-options.c +++ b/tools/lib/subcmd/parse-options.c @@ -807,7 +807,7 @@ static int option__cmp(const void *va, const void *vb) static struct option *options__order(const struct option *opts) { int nr_opts = 0, nr_group = 0, nr_parent = 0, len; - const struct option *o, *p = opts; + const struct option *o = NULL, *p = opts; struct option *opt, *ordered = NULL, *group; /* flatten the options that have parents */ -- 2.51.0 From 1b2bfc29695d273492c3dd8512775261f3272686 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Tue, 22 Oct 2024 21:39:06 -0700 Subject: [PATCH 06/16] selftests/bpf: fix test_spin_lock_fail.c's global vars usage Global variables of special types (like `struct bpf_spin_lock`) make underlying ARRAY maps non-mmapable. To make this work with libbpf's mmaping logic, application is expected to declare such special variables as static, so libbpf doesn't even attempt to mmap() such ARRAYs. 
test_spin_lock_fail.c didn't follow this rule, but given it relied on this test to trigger failures, this went unnoticed, as we never got to the step of mmap()'ing these ARRAY maps. It is fragile and relies on a specific sequence of libbpf steps, which are internal implementation details. Fix the test by marking lockA and lockB as static. Fixes: c48748aea4f8 ("selftests/bpf: Add failure test cases for spin lock pairing") Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20241023043908.3834423-2-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/progs/test_spin_lock_fail.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/bpf/progs/test_spin_lock_fail.c b/tools/testing/selftests/bpf/progs/test_spin_lock_fail.c index 43f40c4fe241..1c8b678e2e9a 100644 --- a/tools/testing/selftests/bpf/progs/test_spin_lock_fail.c +++ b/tools/testing/selftests/bpf/progs/test_spin_lock_fail.c @@ -28,8 +28,8 @@ struct { }, }; -SEC(".data.A") struct bpf_spin_lock lockA; -SEC(".data.B") struct bpf_spin_lock lockB; +static struct bpf_spin_lock lockA SEC(".data.A"); +static struct bpf_spin_lock lockB SEC(".data.B"); SEC("?tc") int lock_id_kptr_preserve(void *ctx) -- 2.51.0 From 137978f422516a128326df55c0ba23605f925e21 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Tue, 22 Oct 2024 21:39:07 -0700 Subject: [PATCH 07/16] libbpf: move global data mmap()'ing into bpf_object__load() Since BPF skeleton inception libbpf has been doing mmap()'ing of global data ARRAY maps in bpf_object__load_skeleton() API, which is used by code generated .skel.h files (i.e., by BPF skeletons only). This is wrong because if BPF object is loaded through generic bpf_object__load() API, global data maps won't be re-mmap()'ed after load step, and memory pointers returned from bpf_map__initial_value() would be wrong and won't reflect the actual memory shared between BPF program and user space. 
bpf_map__initial_value() return result is rarely used after load, so this went unnoticed for a really long time, until bpftrace project attempted to load BPF object through generic bpf_object__load() API and then used BPF subskeleton instantiated from such bpf_object. It turned out that .data/.rodata/.bss data updates through such subskeleton were "blackholed", all because libbpf wouldn't re-mmap() those maps during bpf_object__load() phase. Long story short, this step should be done by libbpf regardless of BPF skeleton usage, right after BPF map is created in the kernel. This patch moves this functionality into bpf_object__populate_internal_map() to achieve this. And bpf_object__load_skeleton() is now simple and almost trivial, only propagating these mmap()'ed pointers into user-supplied skeleton structs. We also do trivial adjustments to error reporting inside bpf_object__populate_internal_map() for consistency with the rest of libbpf's map-handling code. Reported-by: Alastair Robertson Reported-by: Jonathan Wiepert Fixes: d66562fba1ce ("libbpf: Add BPF object skeleton support") Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20241023043908.3834423-3-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- tools/lib/bpf/libbpf.c | 83 ++++++++++++++++++++---------------------- 1 file changed, 40 insertions(+), 43 deletions(-) diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index 7c40286c3948..711173acbcef 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -5122,6 +5122,7 @@ bpf_object__populate_internal_map(struct bpf_object *obj, struct bpf_map *map) enum libbpf_map_type map_type = map->libbpf_type; char *cp, errmsg[STRERR_BUFSIZE]; int err, zero = 0; + size_t mmap_sz; if (obj->gen_loader) { bpf_gen__map_update_elem(obj->gen_loader, map - obj->maps, @@ -5135,8 +5136,8 @@ bpf_object__populate_internal_map(struct bpf_object *obj, struct bpf_map *map) if (err) { err = -errno; cp = libbpf_strerror_r(err, errmsg, 
sizeof(errmsg)); - pr_warn("Error setting initial map(%s) contents: %s\n", - map->name, cp); + pr_warn("map '%s': failed to set initial contents: %s\n", + bpf_map__name(map), cp); return err; } @@ -5146,11 +5147,43 @@ bpf_object__populate_internal_map(struct bpf_object *obj, struct bpf_map *map) if (err) { err = -errno; cp = libbpf_strerror_r(err, errmsg, sizeof(errmsg)); - pr_warn("Error freezing map(%s) as read-only: %s\n", - map->name, cp); + pr_warn("map '%s': failed to freeze as read-only: %s\n", + bpf_map__name(map), cp); return err; } } + + /* Remap anonymous mmap()-ed "map initialization image" as + * a BPF map-backed mmap()-ed memory, but preserving the same + * memory address. This will cause kernel to change process' + * page table to point to a different piece of kernel memory, + * but from userspace point of view memory address (and its + * contents, being identical at this point) will stay the + * same. This mapping will be released by bpf_object__close() + * as per normal clean up procedure. + */ + mmap_sz = bpf_map_mmap_sz(map); + if (map->def.map_flags & BPF_F_MMAPABLE) { + void *mmaped; + int prot; + + if (map->def.map_flags & BPF_F_RDONLY_PROG) + prot = PROT_READ; + else + prot = PROT_READ | PROT_WRITE; + mmaped = mmap(map->mmaped, mmap_sz, prot, MAP_SHARED | MAP_FIXED, map->fd, 0); + if (mmaped == MAP_FAILED) { + err = -errno; + pr_warn("map '%s': failed to re-mmap() contents: %d\n", + bpf_map__name(map), err); + return err; + } + map->mmaped = mmaped; + } else if (map->mmaped) { + munmap(map->mmaped, mmap_sz); + map->mmaped = NULL; + } + return 0; } @@ -5467,8 +5500,7 @@ retry: err = bpf_object__populate_internal_map(obj, map); if (err < 0) goto err_out; - } - if (map->def.type == BPF_MAP_TYPE_ARENA) { + } else if (map->def.type == BPF_MAP_TYPE_ARENA) { map->mmaped = mmap((void *)(long)map->map_extra, bpf_map_mmap_sz(map), PROT_READ | PROT_WRITE, map->map_extra ? 
MAP_SHARED | MAP_FIXED : MAP_SHARED, @@ -13916,46 +13948,11 @@ int bpf_object__load_skeleton(struct bpf_object_skeleton *s) for (i = 0; i < s->map_cnt; i++) { struct bpf_map_skeleton *map_skel = (void *)s->maps + i * s->map_skel_sz; struct bpf_map *map = *map_skel->map; - size_t mmap_sz = bpf_map_mmap_sz(map); - int prot, map_fd = map->fd; - void **mmaped = map_skel->mmaped; - - if (!mmaped) - continue; - - if (!(map->def.map_flags & BPF_F_MMAPABLE)) { - *mmaped = NULL; - continue; - } - if (map->def.type == BPF_MAP_TYPE_ARENA) { - *mmaped = map->mmaped; + if (!map_skel->mmaped) continue; - } - - if (map->def.map_flags & BPF_F_RDONLY_PROG) - prot = PROT_READ; - else - prot = PROT_READ | PROT_WRITE; - /* Remap anonymous mmap()-ed "map initialization image" as - * a BPF map-backed mmap()-ed memory, but preserving the same - * memory address. This will cause kernel to change process' - * page table to point to a different piece of kernel memory, - * but from userspace point of view memory address (and its - * contents, being identical at this point) will stay the - * same. This mapping will be released by bpf_object__close() - * as per normal clean up procedure, so we don't need to worry - * about it from skeleton's clean up perspective. - */ - *mmaped = mmap(map->mmaped, mmap_sz, prot, MAP_SHARED | MAP_FIXED, map_fd, 0); - if (*mmaped == MAP_FAILED) { - err = -errno; - *mmaped = NULL; - pr_warn("failed to re-mmap() map '%s': %d\n", - bpf_map__name(map), err); - return libbpf_err(err); - } + *map_skel->mmaped = map->mmaped; } return 0; -- 2.51.0 From 80a54566b7f03351f77445ed3ac8d4eff3b04fcc Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Tue, 22 Oct 2024 21:39:08 -0700 Subject: [PATCH 08/16] selftests/bpf: validate generic bpf_object and subskel APIs work together Add a new subtest validating that bpf_object loaded and initialized through generic APIs is still interoperable with BPF subskeleton, including initialization and reading of global variables. 
Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20241023043908.3834423-4-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- .../selftests/bpf/prog_tests/subskeleton.c | 76 ++++++++++++++++++- 1 file changed, 75 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/bpf/prog_tests/subskeleton.c b/tools/testing/selftests/bpf/prog_tests/subskeleton.c index 9c31b7004f9c..fdf13ed0152a 100644 --- a/tools/testing/selftests/bpf/prog_tests/subskeleton.c +++ b/tools/testing/selftests/bpf/prog_tests/subskeleton.c @@ -46,7 +46,8 @@ static int subskeleton_lib_subresult(struct bpf_object *obj) return result; } -void test_subskeleton(void) +/* initialize and load through skeleton, then instantiate subskeleton out of it */ +static void subtest_skel_subskeleton(void) { int err, result; struct test_subskeleton *skel; @@ -76,3 +77,76 @@ void test_subskeleton(void) cleanup: test_subskeleton__destroy(skel); } + +/* initialize and load through generic bpf_object API, then instantiate subskeleton out of it */ +static void subtest_obj_subskeleton(void) +{ + int err, result; + const void *elf_bytes; + size_t elf_bytes_sz = 0, rodata_sz = 0, bss_sz = 0; + struct bpf_object *obj; + const struct bpf_map *map; + const struct bpf_program *prog; + struct bpf_link *link = NULL; + struct test_subskeleton__rodata *rodata; + struct test_subskeleton__bss *bss; + + elf_bytes = test_subskeleton__elf_bytes(&elf_bytes_sz); + if (!ASSERT_OK_PTR(elf_bytes, "elf_bytes")) + return; + + obj = bpf_object__open_mem(elf_bytes, elf_bytes_sz, NULL); + if (!ASSERT_OK_PTR(obj, "obj_open_mem")) + return; + + map = bpf_object__find_map_by_name(obj, ".rodata"); + if (!ASSERT_OK_PTR(map, "rodata_map_by_name")) + goto cleanup; + + rodata = bpf_map__initial_value(map, &rodata_sz); + if (!ASSERT_OK_PTR(rodata, "rodata_get")) + goto cleanup; + + rodata->rovar1 = 10; + rodata->var1 = 1; + subskeleton_lib_setup(obj); + + err = bpf_object__load(obj); + if (!ASSERT_OK(err, "obj_load")) + goto 
cleanup; + + prog = bpf_object__find_program_by_name(obj, "handler1"); + if (!ASSERT_OK_PTR(prog, "prog_by_name")) + goto cleanup; + + link = bpf_program__attach(prog); + if (!ASSERT_OK_PTR(link, "prog_attach")) + goto cleanup; + + /* trigger tracepoint */ + usleep(1); + + map = bpf_object__find_map_by_name(obj, ".bss"); + if (!ASSERT_OK_PTR(map, "bss_map_by_name")) + goto cleanup; + + bss = bpf_map__initial_value(map, &bss_sz); + if (!ASSERT_OK_PTR(bss, "bss_get")) + goto cleanup; + + result = subskeleton_lib_subresult(obj) * 10; + ASSERT_EQ(bss->out1, result, "out1"); + +cleanup: + bpf_link__destroy(link); + bpf_object__close(obj); +} + + +void test_subskeleton(void) +{ + if (test__start_subtest("skel_subskel")) + subtest_skel_subskeleton(); + if (test__start_subtest("obj_subskel")) + subtest_obj_subskeleton(); +} -- 2.51.0 From 1cb80d9e93f861018fabe81a69ea0ded20f5a2d0 Mon Sep 17 00:00:00 2001 From: Kui-Feng Lee Date: Wed, 23 Oct 2024 16:47:48 -0700 Subject: [PATCH 09/16] bpf: Support __uptr type tag in BTF MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit This patch introduces the "__uptr" type tag to BTF. It is to define a pointer pointing to the user space memory. This patch adds BTF logic to pass the "__uptr" type tag. btf_find_kptr() is reused for the "__uptr" tag. The "__uptr" will only be supported in the map_value of the task storage map. However, btf_parse_struct_meta() also uses btf_find_kptr() but it is not interested in "__uptr". This patch adds a "field_mask" argument to btf_find_kptr() which will return BTF_FIELD_IGNORE if the caller is not interested in a “__uptr” field. btf_parse_kptr() is also reused to parse the uptr. The btf_check_and_fixup_fields() is changed to do extra checks on the uptr to ensure that its struct size is not larger than PAGE_SIZE. It is not clear how a uptr pointing to a CO-RE supported kernel struct will be used, so it is also not allowed now. 
Signed-off-by: Kui-Feng Lee Signed-off-by: Martin KaFai Lau Link: https://lore.kernel.org/r/20241023234759.860539-2-martin.lau@linux.dev Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 5 +++++ kernel/bpf/btf.c | 34 +++++++++++++++++++++++++++++----- kernel/bpf/syscall.c | 2 ++ 3 files changed, 36 insertions(+), 5 deletions(-) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 0c216e71cec7..bb31bc6d0c4d 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -203,6 +203,7 @@ enum btf_field_type { BPF_GRAPH_ROOT = BPF_RB_ROOT | BPF_LIST_HEAD, BPF_REFCOUNT = (1 << 9), BPF_WORKQUEUE = (1 << 10), + BPF_UPTR = (1 << 11), }; typedef void (*btf_dtor_kfunc_t)(void *); @@ -322,6 +323,8 @@ static inline const char *btf_field_type_name(enum btf_field_type type) return "kptr"; case BPF_KPTR_PERCPU: return "percpu_kptr"; + case BPF_UPTR: + return "uptr"; case BPF_LIST_HEAD: return "bpf_list_head"; case BPF_LIST_NODE: @@ -350,6 +353,7 @@ static inline u32 btf_field_type_size(enum btf_field_type type) case BPF_KPTR_UNREF: case BPF_KPTR_REF: case BPF_KPTR_PERCPU: + case BPF_UPTR: return sizeof(u64); case BPF_LIST_HEAD: return sizeof(struct bpf_list_head); @@ -379,6 +383,7 @@ static inline u32 btf_field_type_align(enum btf_field_type type) case BPF_KPTR_UNREF: case BPF_KPTR_REF: case BPF_KPTR_PERCPU: + case BPF_UPTR: return __alignof__(u64); case BPF_LIST_HEAD: return __alignof__(struct bpf_list_head); diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 13dd1fa1d1b9..76cafff2d99c 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -3334,7 +3334,7 @@ static int btf_find_struct(const struct btf *btf, const struct btf_type *t, } static int btf_find_kptr(const struct btf *btf, const struct btf_type *t, - u32 off, int sz, struct btf_field_info *info) + u32 off, int sz, struct btf_field_info *info, u32 field_mask) { enum btf_field_type type; u32 res_id; @@ -3358,9 +3358,14 @@ static int btf_find_kptr(const struct btf *btf, const struct btf_type *t, type 
= BPF_KPTR_REF; else if (!strcmp("percpu_kptr", __btf_name_by_offset(btf, t->name_off))) type = BPF_KPTR_PERCPU; + else if (!strcmp("uptr", __btf_name_by_offset(btf, t->name_off))) + type = BPF_UPTR; else return -EINVAL; + if (!(type & field_mask)) + return BTF_FIELD_IGNORE; + /* Get the base type */ t = btf_type_skip_modifiers(btf, t->type, &res_id); /* Only pointer to struct is allowed */ @@ -3502,7 +3507,7 @@ static int btf_get_field_type(const struct btf *btf, const struct btf_type *var_ field_mask_test_name(BPF_REFCOUNT, "bpf_refcount"); /* Only return BPF_KPTR when all other types with matchable names fail */ - if (field_mask & BPF_KPTR && !__btf_type_is_struct(var_type)) { + if (field_mask & (BPF_KPTR | BPF_UPTR) && !__btf_type_is_struct(var_type)) { type = BPF_KPTR_REF; goto end; } @@ -3535,6 +3540,7 @@ static int btf_repeat_fields(struct btf_field_info *info, case BPF_KPTR_UNREF: case BPF_KPTR_REF: case BPF_KPTR_PERCPU: + case BPF_UPTR: case BPF_LIST_HEAD: case BPF_RB_ROOT: break; @@ -3661,8 +3667,9 @@ static int btf_find_field_one(const struct btf *btf, case BPF_KPTR_UNREF: case BPF_KPTR_REF: case BPF_KPTR_PERCPU: + case BPF_UPTR: ret = btf_find_kptr(btf, var_type, off, sz, - info_cnt ? &info[0] : &tmp); + info_cnt ? &info[0] : &tmp, field_mask); if (ret < 0) return ret; break; @@ -3985,6 +3992,7 @@ struct btf_record *btf_parse_fields(const struct btf *btf, const struct btf_type case BPF_KPTR_UNREF: case BPF_KPTR_REF: case BPF_KPTR_PERCPU: + case BPF_UPTR: ret = btf_parse_kptr(btf, &rec->fields[i], &info_arr[i]); if (ret < 0) goto end; @@ -4044,12 +4052,28 @@ int btf_check_and_fixup_fields(const struct btf *btf, struct btf_record *rec) * Hence we only need to ensure that bpf_{list_head,rb_root} ownership * does not form cycles. 
*/ - if (IS_ERR_OR_NULL(rec) || !(rec->field_mask & BPF_GRAPH_ROOT)) + if (IS_ERR_OR_NULL(rec) || !(rec->field_mask & (BPF_GRAPH_ROOT | BPF_UPTR))) return 0; for (i = 0; i < rec->cnt; i++) { struct btf_struct_meta *meta; + const struct btf_type *t; u32 btf_id; + if (rec->fields[i].type == BPF_UPTR) { + /* The uptr only supports pinning one page and cannot + * point to a kernel struct + */ + if (btf_is_kernel(rec->fields[i].kptr.btf)) + return -EINVAL; + t = btf_type_by_id(rec->fields[i].kptr.btf, + rec->fields[i].kptr.btf_id); + if (!t->size) + return -EINVAL; + if (t->size > PAGE_SIZE) + return -E2BIG; + continue; + } + if (!(rec->fields[i].type & BPF_GRAPH_ROOT)) continue; btf_id = rec->fields[i].graph_root.value_btf_id; @@ -5560,7 +5584,7 @@ btf_parse_struct_metas(struct bpf_verifier_log *log, struct btf *btf) goto free_aof; } - ret = btf_find_kptr(btf, t, 0, 0, &tmp); + ret = btf_find_kptr(btf, t, 0, 0, &tmp, BPF_KPTR); if (ret != BTF_FIELD_FOUND) continue; diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 4d04d4d9c1f3..2d2935d9c096 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -548,6 +548,7 @@ void btf_record_free(struct btf_record *rec) case BPF_KPTR_UNREF: case BPF_KPTR_REF: case BPF_KPTR_PERCPU: + case BPF_UPTR: if (rec->fields[i].kptr.module) module_put(rec->fields[i].kptr.module); if (btf_is_kernel(rec->fields[i].kptr.btf)) @@ -597,6 +598,7 @@ struct btf_record *btf_record_dup(const struct btf_record *rec) case BPF_KPTR_UNREF: case BPF_KPTR_REF: case BPF_KPTR_PERCPU: + case BPF_UPTR: if (btf_is_kernel(fields[i].kptr.btf)) btf_get(fields[i].kptr.btf); if (fields[i].kptr.module && !try_module_get(fields[i].kptr.module)) { -- 2.51.0 From 99dde42e37497b3062516b1db7231f9dec744a00 Mon Sep 17 00:00:00 2001 From: Kui-Feng Lee Date: Wed, 23 Oct 2024 16:47:49 -0700 Subject: [PATCH 10/16] bpf: Handle BPF_UPTR in verifier This patch adds BPF_UPTR support to the verifier. Note that only the map_value will support the "__uptr" type tag.
This patch enforces only BPF_LDX is allowed to the value of an uptr. After BPF_LDX, it will mark the dst_reg as PTR_TO_MEM | PTR_MAYBE_NULL with size deduced from the field.kptr.btf_id. This will make the dst_reg pointed memory to be readable and writable as scalar. There is a redundant "val_reg = reg_state(env, value_regno);" statement in the check_map_kptr_access(). This patch takes this chance to remove it also. Signed-off-by: Kui-Feng Lee Signed-off-by: Martin KaFai Lau Link: https://lore.kernel.org/r/20241023234759.860539-3-martin.lau@linux.dev Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 39 +++++++++++++++++++++++++++++++++------ 1 file changed, 33 insertions(+), 6 deletions(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index f514247ba8ba..1bd0c3f41f2f 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -5485,6 +5485,22 @@ static u32 btf_ld_kptr_type(struct bpf_verifier_env *env, struct btf_field *kptr return ret; } +static int mark_uptr_ld_reg(struct bpf_verifier_env *env, u32 regno, + struct btf_field *field) +{ + struct bpf_reg_state *reg; + const struct btf_type *t; + + t = btf_type_by_id(field->kptr.btf, field->kptr.btf_id); + mark_reg_known_zero(env, cur_regs(env), regno); + reg = reg_state(env, regno); + reg->type = PTR_TO_MEM | PTR_MAYBE_NULL; + reg->mem_size = t->size; + reg->id = ++env->id_gen; + + return 0; +} + static int check_map_kptr_access(struct bpf_verifier_env *env, u32 regno, int value_regno, int insn_idx, struct btf_field *kptr_field) @@ -5513,9 +5529,15 @@ static int check_map_kptr_access(struct bpf_verifier_env *env, u32 regno, verbose(env, "store to referenced kptr disallowed\n"); return -EACCES; } + if (class != BPF_LDX && kptr_field->type == BPF_UPTR) { + verbose(env, "store to uptr disallowed\n"); + return -EACCES; + } if (class == BPF_LDX) { - val_reg = reg_state(env, value_regno); + if (kptr_field->type == BPF_UPTR) + return mark_uptr_ld_reg(env, value_regno, kptr_field); + /* We can 
simply mark the value_regno receiving the pointer * value from map as PTR_TO_BTF_ID, with the correct type. */ @@ -5573,21 +5595,26 @@ static int check_map_access(struct bpf_verifier_env *env, u32 regno, case BPF_KPTR_UNREF: case BPF_KPTR_REF: case BPF_KPTR_PERCPU: + case BPF_UPTR: if (src != ACCESS_DIRECT) { - verbose(env, "kptr cannot be accessed indirectly by helper\n"); + verbose(env, "%s cannot be accessed indirectly by helper\n", + btf_field_type_name(field->type)); return -EACCES; } if (!tnum_is_const(reg->var_off)) { - verbose(env, "kptr access cannot have variable offset\n"); + verbose(env, "%s access cannot have variable offset\n", + btf_field_type_name(field->type)); return -EACCES; } if (p != off + reg->var_off.value) { - verbose(env, "kptr access misaligned expected=%u off=%llu\n", + verbose(env, "%s access misaligned expected=%u off=%llu\n", + btf_field_type_name(field->type), p, off + reg->var_off.value); return -EACCES; } if (size != bpf_size_to_bytes(BPF_DW)) { - verbose(env, "kptr access size must be BPF_DW\n"); + verbose(env, "%s access size must be BPF_DW\n", + btf_field_type_name(field->type)); return -EACCES; } break; @@ -6953,7 +6980,7 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn return err; if (tnum_is_const(reg->var_off)) kptr_field = btf_record_find(reg->map_ptr->record, - off + reg->var_off.value, BPF_KPTR); + off + reg->var_off.value, BPF_KPTR | BPF_UPTR); if (kptr_field) { err = check_map_kptr_access(env, regno, value_regno, insn_idx, kptr_field); } else if (t == BPF_READ && value_regno >= 0) { -- 2.51.0 From b9a5a07aeaa2a903fb1306eb422880b2fa5f937f Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Wed, 23 Oct 2024 16:47:50 -0700 Subject: [PATCH 11/16] bpf: Add "bool swap_uptrs" arg to bpf_local_storage_update() and bpf_selem_alloc() In a later patch, the task local storage will only accept uptr from the syscall update_elem and will not accept uptr from the bpf prog. 
The reason is the bpf prog does not have a way to provide a valid user space address. bpf_local_storage_update() and bpf_selem_alloc() are used by both bpf prog bpf_task_storage_get(BPF_LOCAL_STORAGE_GET_F_CREATE) and bpf syscall update_elem. "bool swap_uptrs" arg is added to bpf_local_storage_update() and bpf_selem_alloc() to tell if it is called by the bpf prog or by the bpf syscall. When swap_uptrs==true, it is called by the syscall. The arg is named (swap_)uptrs because the later patch will swap the uptrs between the newly allocated selem and the user space provided map_value. It will make error handling easier in case map->ops->map_update_elem() fails and the caller can decide if it needs to unpin the uptr in the user space provided map_value or the bpf_local_storage_update() has already taken the uptr ownership and will take care of unpinning it also. Only swap_uptrs==false is passed now. The logic to handle the true case will be added in a later patch. Signed-off-by: Martin KaFai Lau Link: https://lore.kernel.org/r/20241023234759.860539-4-martin.lau@linux.dev Signed-off-by: Alexei Starovoitov --- include/linux/bpf_local_storage.h | 4 ++-- kernel/bpf/bpf_cgrp_storage.c | 4 ++-- kernel/bpf/bpf_inode_storage.c | 4 ++-- kernel/bpf/bpf_local_storage.c | 8 ++++---- kernel/bpf/bpf_task_storage.c | 4 ++-- net/core/bpf_sk_storage.c | 6 +++--- 6 files changed, 15 insertions(+), 15 deletions(-) diff --git a/include/linux/bpf_local_storage.h b/include/linux/bpf_local_storage.h index dcddb0aef7d8..0c7216c065d5 100644 --- a/include/linux/bpf_local_storage.h +++ b/include/linux/bpf_local_storage.h @@ -181,7 +181,7 @@ void bpf_selem_link_map(struct bpf_local_storage_map *smap, struct bpf_local_storage_elem * bpf_selem_alloc(struct bpf_local_storage_map *smap, void *owner, void *value, - bool charge_mem, gfp_t gfp_flags); + bool charge_mem, bool swap_uptrs, gfp_t gfp_flags); void bpf_selem_free(struct bpf_local_storage_elem *selem, struct bpf_local_storage_map *smap, @@ 
-195,7 +195,7 @@ bpf_local_storage_alloc(void *owner, struct bpf_local_storage_data * bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap, - void *value, u64 map_flags, gfp_t gfp_flags); + void *value, u64 map_flags, bool swap_uptrs, gfp_t gfp_flags); u64 bpf_local_storage_map_mem_usage(const struct bpf_map *map); diff --git a/kernel/bpf/bpf_cgrp_storage.c b/kernel/bpf/bpf_cgrp_storage.c index 28efd0a3f220..20f05de92e9c 100644 --- a/kernel/bpf/bpf_cgrp_storage.c +++ b/kernel/bpf/bpf_cgrp_storage.c @@ -107,7 +107,7 @@ static long bpf_cgrp_storage_update_elem(struct bpf_map *map, void *key, bpf_cgrp_storage_lock(); sdata = bpf_local_storage_update(cgroup, (struct bpf_local_storage_map *)map, - value, map_flags, GFP_ATOMIC); + value, map_flags, false, GFP_ATOMIC); bpf_cgrp_storage_unlock(); cgroup_put(cgroup); return PTR_ERR_OR_ZERO(sdata); @@ -181,7 +181,7 @@ BPF_CALL_5(bpf_cgrp_storage_get, struct bpf_map *, map, struct cgroup *, cgroup, if (!percpu_ref_is_dying(&cgroup->self.refcnt) && (flags & BPF_LOCAL_STORAGE_GET_F_CREATE)) sdata = bpf_local_storage_update(cgroup, (struct bpf_local_storage_map *)map, - value, BPF_NOEXIST, gfp_flags); + value, BPF_NOEXIST, false, gfp_flags); unlock: bpf_cgrp_storage_unlock(); diff --git a/kernel/bpf/bpf_inode_storage.c b/kernel/bpf/bpf_inode_storage.c index 29da6d3838f6..44ccebc745e5 100644 --- a/kernel/bpf/bpf_inode_storage.c +++ b/kernel/bpf/bpf_inode_storage.c @@ -100,7 +100,7 @@ static long bpf_fd_inode_storage_update_elem(struct bpf_map *map, void *key, sdata = bpf_local_storage_update(file_inode(fd_file(f)), (struct bpf_local_storage_map *)map, - value, map_flags, GFP_ATOMIC); + value, map_flags, false, GFP_ATOMIC); return PTR_ERR_OR_ZERO(sdata); } @@ -154,7 +154,7 @@ BPF_CALL_5(bpf_inode_storage_get, struct bpf_map *, map, struct inode *, inode, if (flags & BPF_LOCAL_STORAGE_GET_F_CREATE) { sdata = bpf_local_storage_update( inode, (struct bpf_local_storage_map *)map, value, - BPF_NOEXIST, gfp_flags); + 
BPF_NOEXIST, false, gfp_flags); return IS_ERR(sdata) ? (unsigned long)NULL : (unsigned long)sdata->data; } diff --git a/kernel/bpf/bpf_local_storage.c b/kernel/bpf/bpf_local_storage.c index c938dea5ddbf..1cf772cb26eb 100644 --- a/kernel/bpf/bpf_local_storage.c +++ b/kernel/bpf/bpf_local_storage.c @@ -73,7 +73,7 @@ static bool selem_linked_to_map(const struct bpf_local_storage_elem *selem) struct bpf_local_storage_elem * bpf_selem_alloc(struct bpf_local_storage_map *smap, void *owner, - void *value, bool charge_mem, gfp_t gfp_flags) + void *value, bool charge_mem, bool swap_uptrs, gfp_t gfp_flags) { struct bpf_local_storage_elem *selem; @@ -524,7 +524,7 @@ uncharge: */ struct bpf_local_storage_data * bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap, - void *value, u64 map_flags, gfp_t gfp_flags) + void *value, u64 map_flags, bool swap_uptrs, gfp_t gfp_flags) { struct bpf_local_storage_data *old_sdata = NULL; struct bpf_local_storage_elem *alloc_selem, *selem = NULL; @@ -550,7 +550,7 @@ bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap, if (err) return ERR_PTR(err); - selem = bpf_selem_alloc(smap, owner, value, true, gfp_flags); + selem = bpf_selem_alloc(smap, owner, value, true, swap_uptrs, gfp_flags); if (!selem) return ERR_PTR(-ENOMEM); @@ -584,7 +584,7 @@ bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap, /* A lookup has just been done before and concluded a new selem is * needed. The chance of an unnecessary alloc is unlikely. 
*/ - alloc_selem = selem = bpf_selem_alloc(smap, owner, value, true, gfp_flags); + alloc_selem = selem = bpf_selem_alloc(smap, owner, value, true, swap_uptrs, gfp_flags); if (!alloc_selem) return ERR_PTR(-ENOMEM); diff --git a/kernel/bpf/bpf_task_storage.c b/kernel/bpf/bpf_task_storage.c index adf6dfe0ba68..45dc3ca334d3 100644 --- a/kernel/bpf/bpf_task_storage.c +++ b/kernel/bpf/bpf_task_storage.c @@ -147,7 +147,7 @@ static long bpf_pid_task_storage_update_elem(struct bpf_map *map, void *key, bpf_task_storage_lock(); sdata = bpf_local_storage_update( task, (struct bpf_local_storage_map *)map, value, map_flags, - GFP_ATOMIC); + false, GFP_ATOMIC); bpf_task_storage_unlock(); err = PTR_ERR_OR_ZERO(sdata); @@ -219,7 +219,7 @@ static void *__bpf_task_storage_get(struct bpf_map *map, (flags & BPF_LOCAL_STORAGE_GET_F_CREATE) && nobusy) { sdata = bpf_local_storage_update( task, (struct bpf_local_storage_map *)map, value, - BPF_NOEXIST, gfp_flags); + BPF_NOEXIST, false, gfp_flags); return IS_ERR(sdata) ? 
NULL : sdata->data; } diff --git a/net/core/bpf_sk_storage.c b/net/core/bpf_sk_storage.c index bc01b3aa6b0f..2f4ed83a75ae 100644 --- a/net/core/bpf_sk_storage.c +++ b/net/core/bpf_sk_storage.c @@ -106,7 +106,7 @@ static long bpf_fd_sk_storage_update_elem(struct bpf_map *map, void *key, if (sock) { sdata = bpf_local_storage_update( sock->sk, (struct bpf_local_storage_map *)map, value, - map_flags, GFP_ATOMIC); + map_flags, false, GFP_ATOMIC); sockfd_put(sock); return PTR_ERR_OR_ZERO(sdata); } @@ -137,7 +137,7 @@ bpf_sk_storage_clone_elem(struct sock *newsk, { struct bpf_local_storage_elem *copy_selem; - copy_selem = bpf_selem_alloc(smap, newsk, NULL, true, GFP_ATOMIC); + copy_selem = bpf_selem_alloc(smap, newsk, NULL, true, false, GFP_ATOMIC); if (!copy_selem) return NULL; @@ -243,7 +243,7 @@ BPF_CALL_5(bpf_sk_storage_get, struct bpf_map *, map, struct sock *, sk, refcount_inc_not_zero(&sk->sk_refcnt)) { sdata = bpf_local_storage_update( sk, (struct bpf_local_storage_map *)map, value, - BPF_NOEXIST, gfp_flags); + BPF_NOEXIST, false, gfp_flags); /* sk must be a fullsock (guaranteed by verifier), * so sock_gen_put() is unnecessary. */ -- 2.51.0 From 5bd5bab76669b1e1551f03f5fcbc165f3fa8d269 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Wed, 23 Oct 2024 16:47:51 -0700 Subject: [PATCH 12/16] bpf: Postpone bpf_selem_free() in bpf_selem_unlink_storage_nolock() In a later patch, bpf_selem_free() will call unpin_user_page() through bpf_obj_free_fields(). unpin_user_page() may take spin_lock. However, some bpf_selem_free() call paths have held a raw_spin_lock. Like this: raw_spin_lock_irqsave() bpf_selem_unlink_storage_nolock() bpf_selem_free() unpin_user_page() spin_lock() To avoid spinlock nested in raw_spinlock, bpf_selem_free() should be done after releasing the raw_spinlock. The "bool reuse_now" arg is replaced with "struct hlist_head *free_selem_list" in bpf_selem_unlink_storage_nolock(). 
The bpf_selem_unlink_storage_nolock() will append the to-be-free selem at the free_selem_list. The caller of bpf_selem_unlink_storage_nolock() will need to call the new bpf_selem_free_list(free_selem_list, reuse_now) to free the selem after releasing the raw_spinlock. Note that the selem->snode cannot be reused for linking to the free_selem_list because the selem->snode is protected by the raw_spinlock that we want to avoid holding. A new "struct hlist_node free_node;" is union-ized with the rcu_head. Only the first one successfully hlist_del_init_rcu(&selem->snode) will be able to use the free_node. After succeeding hlist_del_init_rcu(&selem->snode), the free_node and rcu_head usage is serialized such that they can share the 16 bytes in a union. Signed-off-by: Martin KaFai Lau Link: https://lore.kernel.org/r/20241023234759.860539-5-martin.lau@linux.dev Signed-off-by: Alexei Starovoitov --- include/linux/bpf_local_storage.h | 8 ++++++- kernel/bpf/bpf_local_storage.c | 35 ++++++++++++++++++++++++++----- 2 files changed, 37 insertions(+), 6 deletions(-) diff --git a/include/linux/bpf_local_storage.h b/include/linux/bpf_local_storage.h index 0c7216c065d5..ab7244d8108f 100644 --- a/include/linux/bpf_local_storage.h +++ b/include/linux/bpf_local_storage.h @@ -77,7 +77,13 @@ struct bpf_local_storage_elem { struct hlist_node map_node; /* Linked to bpf_local_storage_map */ struct hlist_node snode; /* Linked to bpf_local_storage */ struct bpf_local_storage __rcu *local_storage; - struct rcu_head rcu; + union { + struct rcu_head rcu; + struct hlist_node free_node; /* used to postpone + * bpf_selem_free + * after raw_spin_unlock + */ + }; /* 8 bytes hole */ /* The data is stored in another cacheline to minimize * the number of cachelines access during a cache hit. 
diff --git a/kernel/bpf/bpf_local_storage.c b/kernel/bpf/bpf_local_storage.c index 1cf772cb26eb..09a67dff2336 100644 --- a/kernel/bpf/bpf_local_storage.c +++ b/kernel/bpf/bpf_local_storage.c @@ -246,13 +246,30 @@ void bpf_selem_free(struct bpf_local_storage_elem *selem, } } +static void bpf_selem_free_list(struct hlist_head *list, bool reuse_now) +{ + struct bpf_local_storage_elem *selem; + struct bpf_local_storage_map *smap; + struct hlist_node *n; + + /* The "_safe" iteration is needed. + * The loop is not removing the selem from the list + * but bpf_selem_free will use the selem->rcu_head + * which is union-ized with the selem->free_node. + */ + hlist_for_each_entry_safe(selem, n, list, free_node) { + smap = rcu_dereference_check(SDATA(selem)->smap, bpf_rcu_lock_held()); + bpf_selem_free(selem, smap, reuse_now); + } +} + /* local_storage->lock must be held and selem->local_storage == local_storage. * The caller must ensure selem->smap is still valid to be * dereferenced for its smap->elem_size and smap->cache_idx. 
*/ static bool bpf_selem_unlink_storage_nolock(struct bpf_local_storage *local_storage, struct bpf_local_storage_elem *selem, - bool uncharge_mem, bool reuse_now) + bool uncharge_mem, struct hlist_head *free_selem_list) { struct bpf_local_storage_map *smap; bool free_local_storage; @@ -296,7 +313,7 @@ static bool bpf_selem_unlink_storage_nolock(struct bpf_local_storage *local_stor SDATA(selem)) RCU_INIT_POINTER(local_storage->cache[smap->cache_idx], NULL); - bpf_selem_free(selem, smap, reuse_now); + hlist_add_head(&selem->free_node, free_selem_list); if (rcu_access_pointer(local_storage->smap) == smap) RCU_INIT_POINTER(local_storage->smap, NULL); @@ -345,6 +362,7 @@ static void bpf_selem_unlink_storage(struct bpf_local_storage_elem *selem, struct bpf_local_storage_map *storage_smap; struct bpf_local_storage *local_storage; bool bpf_ma, free_local_storage = false; + HLIST_HEAD(selem_free_list); unsigned long flags; if (unlikely(!selem_linked_to_storage_lockless(selem))) @@ -360,9 +378,11 @@ static void bpf_selem_unlink_storage(struct bpf_local_storage_elem *selem, raw_spin_lock_irqsave(&local_storage->lock, flags); if (likely(selem_linked_to_storage(selem))) free_local_storage = bpf_selem_unlink_storage_nolock( - local_storage, selem, true, reuse_now); + local_storage, selem, true, &selem_free_list); raw_spin_unlock_irqrestore(&local_storage->lock, flags); + bpf_selem_free_list(&selem_free_list, reuse_now); + if (free_local_storage) bpf_local_storage_free(local_storage, storage_smap, bpf_ma, reuse_now); } @@ -529,6 +549,7 @@ bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap, struct bpf_local_storage_data *old_sdata = NULL; struct bpf_local_storage_elem *alloc_selem, *selem = NULL; struct bpf_local_storage *local_storage; + HLIST_HEAD(old_selem_free_list); unsigned long flags; int err; @@ -624,11 +645,12 @@ bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap, if (old_sdata) { bpf_selem_unlink_map(SELEM(old_sdata)); 
bpf_selem_unlink_storage_nolock(local_storage, SELEM(old_sdata), - true, false); + true, &old_selem_free_list); } unlock: raw_spin_unlock_irqrestore(&local_storage->lock, flags); + bpf_selem_free_list(&old_selem_free_list, false); if (alloc_selem) { mem_uncharge(smap, owner, smap->elem_size); bpf_selem_free(alloc_selem, smap, true); @@ -706,6 +728,7 @@ void bpf_local_storage_destroy(struct bpf_local_storage *local_storage) struct bpf_local_storage_map *storage_smap; struct bpf_local_storage_elem *selem; bool bpf_ma, free_storage = false; + HLIST_HEAD(free_selem_list); struct hlist_node *n; unsigned long flags; @@ -734,10 +757,12 @@ void bpf_local_storage_destroy(struct bpf_local_storage *local_storage) * of the loop will set the free_cgroup_storage to true. */ free_storage = bpf_selem_unlink_storage_nolock( - local_storage, selem, true, true); + local_storage, selem, true, &free_selem_list); } raw_spin_unlock_irqrestore(&local_storage->lock, flags); + bpf_selem_free_list(&free_selem_list, true); + if (free_storage) bpf_local_storage_free(local_storage, storage_smap, bpf_ma, true); } -- 2.51.0 From 9bac675e6368b96f448289010caba4ee3320ab24 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Wed, 23 Oct 2024 16:47:52 -0700 Subject: [PATCH 13/16] bpf: Postpone bpf_obj_free_fields to the rcu callback A later patch will enable the uptr usage in the task_local_storage map. This will require the unpin_user_page() to be done after the rcu task trace gp for the cases that the uptr may still be used by a bpf prog. The bpf_obj_free_fields() will be the one doing unpin_user_page(), so this patch is to postpone calling bpf_obj_free_fields() to the rcu callback. The bpf_obj_free_fields() is only required to be done in the rcu callback when smap->bpf_ma==true and reuse_now==false. The smap->bpf_ma==true case is because uptr will only be enabled in task storage which has already been moved to bpf_mem_alloc.
The smap->bpf_ma==false case can also be supported in the future if there is a need. reuse_now==false is the case when the selem (aka storage) is deleted by a bpf prog (bpf_task_storage_delete) or by the syscall delete_elem(). In both cases, bpf_obj_free_fields() needs to wait for rcu gp. A few words on reuse_now==true. reuse_now==true is the case when the storage's owner (i.e. the task_struct) is being destroyed or the map itself is doing map_free(). In both cases, no bpf prog should have a hold on the selem and its uptrs, so there is no need to postpone bpf_obj_free_fields(). reuse_now==true should be the common case for local storage usage where the storage exists throughout the lifetime of its owner (task_struct). The bpf_obj_free_fields() needs to use the map->record. Doing bpf_obj_free_fields() in a rcu callback will require the bpf_local_storage_map_free() to wait for rcu_barrier. An optimization could be only waiting for rcu_barrier when the map has uptr in its map_value. This will require either yet another rcu callback function or adding a bool in the selem to flag if the SDATA(selem)->smap is still valid. This patch chooses to keep it simple and wait for rcu_barrier for maps that use bpf_mem_alloc.
Signed-off-by: Martin KaFai Lau Link: https://lore.kernel.org/r/20241023234759.860539-6-martin.lau@linux.dev Signed-off-by: Alexei Starovoitov --- kernel/bpf/bpf_local_storage.c | 29 ++++++++++++++++++++++++----- 1 file changed, 24 insertions(+), 5 deletions(-) diff --git a/kernel/bpf/bpf_local_storage.c b/kernel/bpf/bpf_local_storage.c index 09a67dff2336..ca871be1c42d 100644 --- a/kernel/bpf/bpf_local_storage.c +++ b/kernel/bpf/bpf_local_storage.c @@ -209,8 +209,12 @@ static void __bpf_selem_free(struct bpf_local_storage_elem *selem, static void bpf_selem_free_rcu(struct rcu_head *rcu) { struct bpf_local_storage_elem *selem; + struct bpf_local_storage_map *smap; selem = container_of(rcu, struct bpf_local_storage_elem, rcu); + /* The bpf_local_storage_map_free will wait for rcu_barrier */ + smap = rcu_dereference_check(SDATA(selem)->smap, 1); + bpf_obj_free_fields(smap->map.record, SDATA(selem)->data); bpf_mem_cache_raw_free(selem); } @@ -226,16 +230,25 @@ void bpf_selem_free(struct bpf_local_storage_elem *selem, struct bpf_local_storage_map *smap, bool reuse_now) { - bpf_obj_free_fields(smap->map.record, SDATA(selem)->data); - if (!smap->bpf_ma) { + /* Only task storage has uptrs and task storage + * has moved to bpf_mem_alloc. Meaning smap->bpf_ma == true + * for task storage, so this bpf_obj_free_fields() won't unpin + * any uptr. + */ + bpf_obj_free_fields(smap->map.record, SDATA(selem)->data); __bpf_selem_free(selem, reuse_now); return; } - if (!reuse_now) { - call_rcu_tasks_trace(&selem->rcu, bpf_selem_free_trace_rcu); - } else { + if (reuse_now) { + /* reuse_now == true only happens when the storage owner + * (e.g. task_struct) is being destructed or the map itself + * is being destructed (ie map_free). In both cases, + * no bpf prog can have a hold on the selem. It is + * safe to unpin the uptrs and free the selem now. 
+ */ + bpf_obj_free_fields(smap->map.record, SDATA(selem)->data); /* Instead of using the vanilla call_rcu(), * bpf_mem_cache_free will be able to reuse selem * immediately. @@ -243,7 +256,10 @@ void bpf_selem_free(struct bpf_local_storage_elem *selem, migrate_disable(); bpf_mem_cache_free(&smap->selem_ma, selem); migrate_enable(); + return; } + + call_rcu_tasks_trace(&selem->rcu, bpf_selem_free_trace_rcu); } static void bpf_selem_free_list(struct hlist_head *list, bool reuse_now) @@ -908,6 +924,9 @@ void bpf_local_storage_map_free(struct bpf_map *map, synchronize_rcu(); if (smap->bpf_ma) { + rcu_barrier_tasks_trace(); + if (!rcu_trace_implies_rcu_gp()) + rcu_barrier(); bpf_mem_alloc_destroy(&smap->selem_ma); bpf_mem_alloc_destroy(&smap->storage_ma); } -- 2.51.0 From ba512b00e5efbf7e19cfb7fa9f66ce82669b7077 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Wed, 23 Oct 2024 16:47:53 -0700 Subject: [PATCH 14/16] bpf: Add uptr support in the map_value of the task local storage. This patch adds uptr support in the map_value of the task local storage. struct value_type { struct user_data __uptr *uptr; }; struct { __uint(type, BPF_MAP_TYPE_TASK_STORAGE); __uint(map_flags, BPF_F_NO_PREALLOC); __type(key, int); __type(value, struct value_type); } datamap SEC(".maps"); A new bpf_obj_pin_uptrs() is added to pin the user page and also store the kernel address back to the uptr for the bpf prog to use later. It currently does not support the uptr pointing to a user struct across two pages. It also excludes PageHighMem support to keep it simple. As of now, the 32bit bpf jit is missing other more crucial bpf features. For example, many important bpf features depend on bpf kfunc now but so far only one arch (x86-32) supports it which was added by me as an example when kfunc was first introduced to bpf. The uptr can only be stored to the task local storage by the syscall update_elem.
This means the uptr will not be considered if it is provided by the bpf prog through bpf_task_storage_get(BPF_LOCAL_STORAGE_GET_F_CREATE). This is enforced by only calling bpf_local_storage_update(swap_uptrs==true) in bpf_pid_task_storage_update_elem. Everywhere else will have swap_uptrs==false. This is passed down to bpf_selem_alloc(swap_uptrs==true). It is the only case that bpf_selem_alloc() will take the uptr value when updating the newly allocated selem. bpf_obj_swap_uptrs() is added to swap the uptr between the SDATA(selem)->data and the user provided map_value in "void *value". bpf_obj_swap_uptrs() makes the SDATA(selem)->data take the ownership of the uptr and the user space provided map_value will have NULL in the uptr. The bpf_obj_unpin_uptrs() is called after map->ops->map_update_elem() returns an error. If the map->ops->map_update_elem has reached a state that the local storage has taken the uptr ownership, the bpf_obj_unpin_uptrs() will be a no op because the uptr is NULL. A "__"bpf_obj_unpin_uptrs is added to make this error path unpin easier such that it does not have to check whether the map->record is NULL or not. BPF_F_LOCK is not supported when the map_value has uptr. This can be revisited later if there is a use case. A similar swap_uptrs idea can be considered. The final bit is to do unpin_user_page in the bpf_obj_free_fields(). The earlier patch has ensured that the bpf_obj_free_fields() has gone through the rcu gp when needed.
Cc: linux-mm@kvack.org Cc: Shakeel Butt Signed-off-by: Martin KaFai Lau Acked-by: Shakeel Butt Link: https://lore.kernel.org/r/20241023234759.860539-7-martin.lau@linux.dev Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 20 +++++++ kernel/bpf/bpf_local_storage.c | 7 ++- kernel/bpf/bpf_task_storage.c | 5 +- kernel/bpf/syscall.c | 106 +++++++++++++++++++++++++++++++-- 4 files changed, 131 insertions(+), 7 deletions(-) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index bb31bc6d0c4d..8888689aa917 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -424,6 +424,7 @@ static inline void bpf_obj_init_field(const struct btf_field *field, void *addr) case BPF_KPTR_UNREF: case BPF_KPTR_REF: case BPF_KPTR_PERCPU: + case BPF_UPTR: break; default: WARN_ON_ONCE(1); @@ -512,6 +513,25 @@ static inline void copy_map_value_long(struct bpf_map *map, void *dst, void *src bpf_obj_memcpy(map->record, dst, src, map->value_size, true); } +static inline void bpf_obj_swap_uptrs(const struct btf_record *rec, void *dst, void *src) +{ + unsigned long *src_uptr, *dst_uptr; + const struct btf_field *field; + int i; + + if (!btf_record_has_field(rec, BPF_UPTR)) + return; + + for (i = 0, field = rec->fields; i < rec->cnt; i++, field++) { + if (field->type != BPF_UPTR) + continue; + + src_uptr = src + field->offset; + dst_uptr = dst + field->offset; + swap(*src_uptr, *dst_uptr); + } +} + static inline void bpf_obj_memzero(struct btf_record *rec, void *dst, u32 size) { u32 curr_off = 0; diff --git a/kernel/bpf/bpf_local_storage.c b/kernel/bpf/bpf_local_storage.c index ca871be1c42d..7e6a0af0afc1 100644 --- a/kernel/bpf/bpf_local_storage.c +++ b/kernel/bpf/bpf_local_storage.c @@ -99,9 +99,12 @@ bpf_selem_alloc(struct bpf_local_storage_map *smap, void *owner, } if (selem) { - if (value) + if (value) { + /* No need to call check_and_init_map_value as memory is zero init */ copy_map_value(&smap->map, SDATA(selem)->data, value); - /* No need to call check_and_init_map_value as 
memory is zero init */ + if (swap_uptrs) + bpf_obj_swap_uptrs(smap->map.record, SDATA(selem)->data, value); + } return selem; } diff --git a/kernel/bpf/bpf_task_storage.c b/kernel/bpf/bpf_task_storage.c index 45dc3ca334d3..09705f9988f3 100644 --- a/kernel/bpf/bpf_task_storage.c +++ b/kernel/bpf/bpf_task_storage.c @@ -129,6 +129,9 @@ static long bpf_pid_task_storage_update_elem(struct bpf_map *map, void *key, struct pid *pid; int fd, err; + if ((map_flags & BPF_F_LOCK) && btf_record_has_field(map->record, BPF_UPTR)) + return -EOPNOTSUPP; + fd = *(int *)key; pid = pidfd_get_pid(fd, &f_flags); if (IS_ERR(pid)) @@ -147,7 +150,7 @@ static long bpf_pid_task_storage_update_elem(struct bpf_map *map, void *key, bpf_task_storage_lock(); sdata = bpf_local_storage_update( task, (struct bpf_local_storage_map *)map, value, map_flags, - false, GFP_ATOMIC); + true, GFP_ATOMIC); bpf_task_storage_unlock(); err = PTR_ERR_OR_ZERO(sdata); diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 2d2935d9c096..426a52e5c7da 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -155,6 +155,89 @@ static void maybe_wait_bpf_programs(struct bpf_map *map) synchronize_rcu(); } +static void unpin_uptr_kaddr(void *kaddr) +{ + if (kaddr) + unpin_user_page(virt_to_page(kaddr)); +} + +static void __bpf_obj_unpin_uptrs(struct btf_record *rec, u32 cnt, void *obj) +{ + const struct btf_field *field; + void **uptr_addr; + int i; + + for (i = 0, field = rec->fields; i < cnt; i++, field++) { + if (field->type != BPF_UPTR) + continue; + + uptr_addr = obj + field->offset; + unpin_uptr_kaddr(*uptr_addr); + } +} + +static void bpf_obj_unpin_uptrs(struct btf_record *rec, void *obj) +{ + if (!btf_record_has_field(rec, BPF_UPTR)) + return; + + __bpf_obj_unpin_uptrs(rec, rec->cnt, obj); +} + +static int bpf_obj_pin_uptrs(struct btf_record *rec, void *obj) +{ + const struct btf_field *field; + const struct btf_type *t; + unsigned long start, end; + struct page *page; + void **uptr_addr; + int i, 
err; + + if (!btf_record_has_field(rec, BPF_UPTR)) + return 0; + + for (i = 0, field = rec->fields; i < rec->cnt; i++, field++) { + if (field->type != BPF_UPTR) + continue; + + uptr_addr = obj + field->offset; + start = *(unsigned long *)uptr_addr; + if (!start) + continue; + + t = btf_type_by_id(field->kptr.btf, field->kptr.btf_id); + /* t->size was checked for zero before */ + if (check_add_overflow(start, t->size - 1, &end)) { + err = -EFAULT; + goto unpin_all; + } + + /* The uptr's struct cannot span across two pages */ + if ((start & PAGE_MASK) != (end & PAGE_MASK)) { + err = -EOPNOTSUPP; + goto unpin_all; + } + + err = pin_user_pages_fast(start, 1, FOLL_LONGTERM | FOLL_WRITE, &page); + if (err != 1) + goto unpin_all; + + if (PageHighMem(page)) { + err = -EOPNOTSUPP; + unpin_user_page(page); + goto unpin_all; + } + + *uptr_addr = page_address(page) + offset_in_page(start); + } + + return 0; + +unpin_all: + __bpf_obj_unpin_uptrs(rec, i, obj); + return err; +} + static int bpf_map_update_value(struct bpf_map *map, struct file *map_file, void *key, void *value, __u64 flags) { @@ -199,9 +282,14 @@ static int bpf_map_update_value(struct bpf_map *map, struct file *map_file, map->map_type == BPF_MAP_TYPE_BLOOM_FILTER) { err = map->ops->map_push_elem(map, value, flags); } else { - rcu_read_lock(); - err = map->ops->map_update_elem(map, key, value, flags); - rcu_read_unlock(); + err = bpf_obj_pin_uptrs(map->record, value); + if (!err) { + rcu_read_lock(); + err = map->ops->map_update_elem(map, key, value, flags); + rcu_read_unlock(); + if (err) + bpf_obj_unpin_uptrs(map->record, value); + } } bpf_enable_instrumentation(); @@ -716,6 +804,10 @@ void bpf_obj_free_fields(const struct btf_record *rec, void *obj) field->kptr.dtor(xchgd_field); } break; + case BPF_UPTR: + /* The caller ensured that no one is using the uptr */ + unpin_uptr_kaddr(*(void **)field_ptr); + break; case BPF_LIST_HEAD: if (WARN_ON_ONCE(rec->spin_lock_off < 0)) continue; @@ -1107,7 +1199,7 @@ static 
int map_check_btf(struct bpf_map *map, struct bpf_token *token, map->record = btf_parse_fields(btf, value_type, BPF_SPIN_LOCK | BPF_TIMER | BPF_KPTR | BPF_LIST_HEAD | - BPF_RB_ROOT | BPF_REFCOUNT | BPF_WORKQUEUE, + BPF_RB_ROOT | BPF_REFCOUNT | BPF_WORKQUEUE | BPF_UPTR, map->value_size); if (!IS_ERR_OR_NULL(map->record)) { int i; @@ -1163,6 +1255,12 @@ static int map_check_btf(struct bpf_map *map, struct bpf_token *token, goto free_map_tab; } break; + case BPF_UPTR: + if (map->map_type != BPF_MAP_TYPE_TASK_STORAGE) { + ret = -EOPNOTSUPP; + goto free_map_tab; + } + break; case BPF_LIST_HEAD: case BPF_RB_ROOT: if (map->map_type != BPF_MAP_TYPE_HASH && -- 2.51.0 From 7aa12b8d9f24e9623effa12a3fc330de056d572e Mon Sep 17 00:00:00 2001 From: Kui-Feng Lee Date: Wed, 23 Oct 2024 16:47:54 -0700 Subject: [PATCH 15/16] libbpf: define __uptr. Make __uptr available to BPF programs to enable them to define uptrs. Acked-by: Andrii Nakryiko Signed-off-by: Kui-Feng Lee Signed-off-by: Martin KaFai Lau Link: https://lore.kernel.org/r/20241023234759.860539-8-martin.lau@linux.dev Signed-off-by: Alexei Starovoitov --- tools/lib/bpf/bpf_helpers.h | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/lib/bpf/bpf_helpers.h b/tools/lib/bpf/bpf_helpers.h index 80bc0242e8dc..686824b8b413 100644 --- a/tools/lib/bpf/bpf_helpers.h +++ b/tools/lib/bpf/bpf_helpers.h @@ -185,6 +185,7 @@ enum libbpf_tristate { #define __kptr_untrusted __attribute__((btf_type_tag("kptr_untrusted"))) #define __kptr __attribute__((btf_type_tag("kptr"))) #define __percpu_kptr __attribute__((btf_type_tag("percpu_kptr"))) +#define __uptr __attribute__((btf_type_tag("uptr"))) #if defined (__clang__) #define bpf_ksym_exists(sym) ({ \ -- 2.51.0 From 4579b4a4279ec7df9499943f764da03ae837021c Mon Sep 17 00:00:00 2001 From: Kui-Feng Lee Date: Wed, 23 Oct 2024 16:47:55 -0700 Subject: [PATCH 16/16] selftests/bpf: Some basic __uptr tests Make sure the memory of uptrs has been mapped to the kernel properly.
Also ensure the values of uptrs in the kernel haven't been copied to userspace. It also has the syscall update_elem/delete_elem test to test the pin/unpin code paths. Signed-off-by: Kui-Feng Lee Signed-off-by: Martin KaFai Lau Link: https://lore.kernel.org/r/20241023234759.860539-9-martin.lau@linux.dev Signed-off-by: Alexei Starovoitov --- .../bpf/prog_tests/task_local_storage.c | 142 ++++++++++++++++++ .../selftests/bpf/progs/task_ls_uptr.c | 63 ++++++++ .../testing/selftests/bpf/uptr_test_common.h | 35 +++++ 3 files changed, 240 insertions(+) create mode 100644 tools/testing/selftests/bpf/progs/task_ls_uptr.c create mode 100644 tools/testing/selftests/bpf/uptr_test_common.h diff --git a/tools/testing/selftests/bpf/prog_tests/task_local_storage.c b/tools/testing/selftests/bpf/prog_tests/task_local_storage.c index c33c05161a9e..4c8eadd1f083 100644 --- a/tools/testing/selftests/bpf/prog_tests/task_local_storage.c +++ b/tools/testing/selftests/bpf/prog_tests/task_local_storage.c @@ -7,12 +7,15 @@ #include #include /* For SYS_xxx definitions */ #include +#include #include #include "task_local_storage_helpers.h" #include "task_local_storage.skel.h" #include "task_local_storage_exit_creds.skel.h" #include "task_ls_recursion.skel.h" #include "task_storage_nodeadlock.skel.h" +#include "uptr_test_common.h" +#include "task_ls_uptr.skel.h" static void test_sys_enter_exit(void) { @@ -227,6 +230,143 @@ done: sched_setaffinity(getpid(), sizeof(old), &old); } +static struct user_data udata __attribute__((aligned(16))) = { + .a = 1, + .b = 2, +}; + +static struct user_data udata2 __attribute__((aligned(16))) = { + .a = 3, + .b = 4, +}; + +static void check_udata2(int expected) +{ + udata2.result = udata2.nested_result = 0; + usleep(1); + ASSERT_EQ(udata2.result, expected, "udata2.result"); + ASSERT_EQ(udata2.nested_result, expected, "udata2.nested_result"); +} + +static void test_uptr_basic(void) +{ + int map_fd, parent_task_fd, ev_fd; + struct value_type value = {}; + struct 
task_ls_uptr *skel; + pid_t child_pid, my_tid; + __u64 ev_dummy_data = 1; + int err; + + my_tid = syscall(SYS_gettid); + parent_task_fd = sys_pidfd_open(my_tid, 0); + if (!ASSERT_OK_FD(parent_task_fd, "parent_task_fd")) + return; + + ev_fd = eventfd(0, 0); + if (!ASSERT_OK_FD(ev_fd, "ev_fd")) { + close(parent_task_fd); + return; + } + + skel = task_ls_uptr__open_and_load(); + if (!ASSERT_OK_PTR(skel, "skel_open_and_load")) + goto out; + + map_fd = bpf_map__fd(skel->maps.datamap); + value.udata = &udata; + value.nested.udata = &udata; + err = bpf_map_update_elem(map_fd, &parent_task_fd, &value, BPF_NOEXIST); + if (!ASSERT_OK(err, "update_elem(udata)")) + goto out; + + err = task_ls_uptr__attach(skel); + if (!ASSERT_OK(err, "skel_attach")) + goto out; + + child_pid = fork(); + if (!ASSERT_NEQ(child_pid, -1, "fork")) + goto out; + + /* Call syscall in the child process, but access the map value of + * the parent process in the BPF program to check if the uptr + * is translated/mapped correctly. + */ + if (child_pid == 0) { + /* child */ + + /* Overwrite the user_data in the child process to check if + * the BPF program accesses the user_data of the parent.
+ */ + udata.a = 0; + udata.b = 0; + + /* Wait for the parent to set child_pid */ + read(ev_fd, &ev_dummy_data, sizeof(ev_dummy_data)); + exit(0); + } + + skel->bss->parent_pid = my_tid; + skel->bss->target_pid = child_pid; + + write(ev_fd, &ev_dummy_data, sizeof(ev_dummy_data)); + + err = waitpid(child_pid, NULL, 0); + ASSERT_EQ(err, child_pid, "waitpid"); + ASSERT_EQ(udata.result, MAGIC_VALUE + udata.a + udata.b, "udata.result"); + ASSERT_EQ(udata.nested_result, MAGIC_VALUE + udata.a + udata.b, "udata.nested_result"); + + skel->bss->target_pid = my_tid; + + /* update_elem: uptr changes from udata to udata2 */ + value.udata = &udata2; + value.nested.udata = &udata2; + err = bpf_map_update_elem(map_fd, &parent_task_fd, &value, BPF_EXIST); + if (!ASSERT_OK(err, "update_elem(udata2)")) + goto out; + check_udata2(MAGIC_VALUE + udata2.a + udata2.b); + + /* update_elem: uptr changes from udata2 to NULL */ + memset(&value, 0, sizeof(value)); + err = bpf_map_update_elem(map_fd, &parent_task_fd, &value, BPF_EXIST); + if (!ASSERT_OK(err, "update_elem(udata2)")) + goto out; + check_udata2(0); + + /* update_elem: uptr changes from NULL to udata2 */ + value.udata = &udata2; + value.nested.udata = &udata2; + err = bpf_map_update_elem(map_fd, &parent_task_fd, &value, BPF_EXIST); + if (!ASSERT_OK(err, "update_elem(udata2)")) + goto out; + check_udata2(MAGIC_VALUE + udata2.a + udata2.b); + + /* Check if user programs can access the value of uptrs + * through bpf_map_lookup_elem(). Make sure the kernel value is not + * leaked.
+ */ + err = bpf_map_lookup_elem(map_fd, &parent_task_fd, &value); + if (!ASSERT_OK(err, "bpf_map_lookup_elem")) + goto out; + ASSERT_EQ(value.udata, NULL, "value.udata"); + ASSERT_EQ(value.nested.udata, NULL, "value.nested.udata"); + + /* delete_elem */ + err = bpf_map_delete_elem(map_fd, &parent_task_fd); + ASSERT_OK(err, "delete_elem(udata2)"); + check_udata2(0); + + /* update_elem: add uptr back to test map_free */ + value.udata = &udata2; + value.nested.udata = &udata2; + err = bpf_map_update_elem(map_fd, &parent_task_fd, &value, BPF_NOEXIST); + ASSERT_OK(err, "update_elem(udata2)"); + +out: + task_ls_uptr__destroy(skel); + close(ev_fd); + close(parent_task_fd); +} + void test_task_local_storage(void) { if (test__start_subtest("sys_enter_exit")) @@ -237,4 +377,6 @@ void test_task_local_storage(void) test_recursion(); if (test__start_subtest("nodeadlock")) test_nodeadlock(); + if (test__start_subtest("uptr_basic")) + test_uptr_basic(); } diff --git a/tools/testing/selftests/bpf/progs/task_ls_uptr.c b/tools/testing/selftests/bpf/progs/task_ls_uptr.c new file mode 100644 index 000000000000..ddbe11b46eef --- /dev/null +++ b/tools/testing/selftests/bpf/progs/task_ls_uptr.c @@ -0,0 +1,63 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2024 Meta Platforms, Inc. and affiliates. 
*/ + +#include +#include +#include "uptr_test_common.h" + +struct task_struct *bpf_task_from_pid(s32 pid) __ksym; +void bpf_task_release(struct task_struct *p) __ksym; +void bpf_cgroup_release(struct cgroup *cgrp) __ksym; + +struct { + __uint(type, BPF_MAP_TYPE_TASK_STORAGE); + __uint(map_flags, BPF_F_NO_PREALLOC); + __type(key, int); + __type(value, struct value_type); +} datamap SEC(".maps"); + +pid_t target_pid = 0; +pid_t parent_pid = 0; + +SEC("tp_btf/sys_enter") +int on_enter(__u64 *ctx) +{ + struct task_struct *task, *data_task; + struct value_type *ptr; + struct user_data *udata; + struct cgroup *cgrp; + + task = bpf_get_current_task_btf(); + if (task->pid != target_pid) + return 0; + + data_task = bpf_task_from_pid(parent_pid); + if (!data_task) + return 0; + + ptr = bpf_task_storage_get(&datamap, data_task, 0, 0); + bpf_task_release(data_task); + if (!ptr) + return 0; + + cgrp = bpf_kptr_xchg(&ptr->cgrp, NULL); + if (cgrp) { + int lvl = cgrp->level; + + bpf_cgroup_release(cgrp); + return lvl; + } + + udata = ptr->udata; + if (!udata || udata->result) + return 0; + udata->result = MAGIC_VALUE + udata->a + udata->b; + + udata = ptr->nested.udata; + if (udata && !udata->nested_result) + udata->nested_result = udata->result; + + return 0; +} + +char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/uptr_test_common.h b/tools/testing/selftests/bpf/uptr_test_common.h new file mode 100644 index 000000000000..feb41176888c --- /dev/null +++ b/tools/testing/selftests/bpf/uptr_test_common.h @@ -0,0 +1,35 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Copyright (c) 2024 Meta Platforms, Inc. and affiliates. 
*/ + +#ifndef _UPTR_TEST_COMMON_H +#define _UPTR_TEST_COMMON_H + +#define MAGIC_VALUE 0xabcd1234 + +#ifdef __BPF__ +/* Avoid fwd btf type being generated for the following struct */ +struct user_data *dummy_data; +struct cgroup *dummy_cgrp; +#else +#define __uptr +#define __kptr +#endif + +struct user_data { + int a; + int b; + int result; + int nested_result; +}; + +struct nested_udata { + struct user_data __uptr *udata; +}; + +struct value_type { + struct user_data __uptr *udata; + struct cgroup __kptr *cgrp; + struct nested_udata nested; +}; + +#endif -- 2.51.0