From 3f36ee29e732b68044d531f47d31f22d265954c6 Mon Sep 17 00:00:00 2001 From: Konstantin Shkolnyy Date: Tue, 3 Dec 2024 09:06:55 -0600 Subject: [PATCH 01/16] vsock/test: fix parameter types in SO_VM_SOCKETS_* calls Change parameters of SO_VM_SOCKETS_* to unsigned long long as documented in the vm_sockets.h, because the corresponding kernel code requires them to be at least 64-bit, no matter what architecture. Otherwise they are too small on 32-bit machines. Fixes: 5c338112e48a ("test/vsock: rework message bounds test") Fixes: 685a21c314a8 ("test/vsock: add big message test") Fixes: 542e893fbadc ("vsock/test: two tests to check credit update logic") Fixes: 8abbffd27ced ("test/vsock: vsock_perf utility") Signed-off-by: Konstantin Shkolnyy Reviewed-by: Stefano Garzarella Signed-off-by: Paolo Abeni --- tools/testing/vsock/vsock_perf.c | 4 ++-- tools/testing/vsock/vsock_test.c | 22 +++++++++++++++++----- 2 files changed, 19 insertions(+), 7 deletions(-) diff --git a/tools/testing/vsock/vsock_perf.c b/tools/testing/vsock/vsock_perf.c index 22633c2848cc..8e0a6c0770d3 100644 --- a/tools/testing/vsock/vsock_perf.c +++ b/tools/testing/vsock/vsock_perf.c @@ -33,7 +33,7 @@ static unsigned int port = DEFAULT_PORT; static unsigned long buf_size_bytes = DEFAULT_BUF_SIZE_BYTES; -static unsigned long vsock_buf_bytes = DEFAULT_VSOCK_BUF_BYTES; +static unsigned long long vsock_buf_bytes = DEFAULT_VSOCK_BUF_BYTES; static bool zerocopy; static void error(const char *s) @@ -162,7 +162,7 @@ static void run_receiver(int rcvlowat_bytes) printf("Run as receiver\n"); printf("Listen port %u\n", port); printf("RX buffer %lu bytes\n", buf_size_bytes); - printf("vsock buffer %lu bytes\n", vsock_buf_bytes); + printf("vsock buffer %llu bytes\n", vsock_buf_bytes); printf("SO_RCVLOWAT %d bytes\n", rcvlowat_bytes); fd = socket(AF_VSOCK, SOCK_STREAM, 0); diff --git a/tools/testing/vsock/vsock_test.c b/tools/testing/vsock/vsock_test.c index 7fd25b814b4b..0b7f5bf546da 100644 --- a/tools/testing/vsock/vsock_test.c +++ b/tools/testing/vsock/vsock_test.c @@ -429,7 +429,7 @@ static void test_seqpacket_msg_bounds_client(const struct test_opts *opts) static void test_seqpacket_msg_bounds_server(const struct test_opts *opts) { - unsigned long sock_buf_size; + unsigned long long sock_buf_size; unsigned long remote_hash; unsigned long curr_hash; int fd; @@ -634,7 +634,8 @@ static void test_seqpacket_timeout_server(const struct test_opts *opts) static void test_seqpacket_bigmsg_client(const struct test_opts *opts) { - unsigned long sock_buf_size; + unsigned long long sock_buf_size; + size_t buf_size; socklen_t len; void *data; int fd; @@ -655,13 +656,20 @@ static void test_seqpacket_bigmsg_client(const struct test_opts *opts) sock_buf_size++; - data = malloc(sock_buf_size); + /* size_t can be < unsigned long long */ + buf_size = (size_t)sock_buf_size; + if (buf_size != sock_buf_size) { + fprintf(stderr, "Returned BUFFER_SIZE too large\n"); + exit(EXIT_FAILURE); + } + + data = malloc(buf_size); if (!data) { perror("malloc"); exit(EXIT_FAILURE); } - send_buf(fd, data, sock_buf_size, 0, -EMSGSIZE); + send_buf(fd, data, buf_size, 0, -EMSGSIZE); control_writeln("CLISENT"); @@ -1360,6 +1368,7 @@ static void test_stream_credit_update_test(const struct test_opts *opts, int recv_buf_size; struct pollfd fds; size_t buf_size; + unsigned long long sock_buf_size; void *buf; int fd; @@ -1371,8 +1380,11 @@ static void test_stream_credit_update_test(const struct test_opts *opts, buf_size = RCVLOWAT_CREDIT_UPD_BUF_SIZE; + /* size_t can be < unsigned long long */ + sock_buf_size = buf_size; + if (setsockopt(fd, AF_VSOCK, SO_VM_SOCKETS_BUFFER_SIZE, - &buf_size, sizeof(buf_size))) { + &sock_buf_size, sizeof(sock_buf_size))) { perror("setsockopt(SO_VM_SOCKETS_BUFFER_SIZE)"); exit(EXIT_FAILURE); } -- 2.50.1 From 86814d8ffd55fd4ad19c512eccd721522a370fb2 Mon Sep 17 00:00:00 2001 From: Konstantin Shkolnyy Date: Tue, 3 Dec 2024 09:06:56 -0600 Subject: [PATCH 02/16] vsock/test: verify socket options after setting them Replace setsockopt() calls with calls to functions that follow setsockopt() with getsockopt() and check that the returned value and its size are the same as have been set. (Except in vsock_perf.) Signed-off-by: Konstantin Shkolnyy Reviewed-by: Stefano Garzarella Signed-off-by: Paolo Abeni --- tools/testing/vsock/control.c | 9 +- tools/testing/vsock/msg_zerocopy_common.c | 10 -- tools/testing/vsock/msg_zerocopy_common.h | 1 - tools/testing/vsock/util.c | 142 ++++++++++++++++++++++ tools/testing/vsock/util.h | 7 ++ tools/testing/vsock/vsock_perf.c | 10 ++ tools/testing/vsock/vsock_test.c | 51 +++----- tools/testing/vsock/vsock_test_zerocopy.c | 2 +- tools/testing/vsock/vsock_uring_test.c | 2 +- 9 files changed, 181 insertions(+), 53 deletions(-) diff --git a/tools/testing/vsock/control.c b/tools/testing/vsock/control.c index d2deb4b15b94..0066e0324d35 100644 --- a/tools/testing/vsock/control.c +++ b/tools/testing/vsock/control.c @@ -27,6 +27,7 @@ #include "timeout.h" #include "control.h" +#include "util.h" static int control_fd = -1; @@ -50,7 +51,6 @@ void control_init(const char *control_host, for (ai = result; ai; ai = ai->ai_next) { int fd; - int val = 1; fd = socket(ai->ai_family, ai->ai_socktype, ai->ai_protocol); if (fd < 0) @@ -65,11 +65,8 @@ void control_init(const char *control_host, break; } - if (setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, - &val, sizeof(val)) < 0) { - perror("setsockopt"); - exit(EXIT_FAILURE); - } + setsockopt_int_check(fd, SOL_SOCKET, SO_REUSEADDR, 1, + "setsockopt SO_REUSEADDR"); if (bind(fd, ai->ai_addr, ai->ai_addrlen) < 0) goto next; diff --git a/tools/testing/vsock/msg_zerocopy_common.c b/tools/testing/vsock/msg_zerocopy_common.c index 5a4bdf7b5132..8622e5a0f8b7 100644 --- a/tools/testing/vsock/msg_zerocopy_common.c +++ b/tools/testing/vsock/msg_zerocopy_common.c @@ -14,16 +14,6 @@ #include "msg_zerocopy_common.h" -void enable_so_zerocopy(int fd) -{ - int val = 1; - - if (setsockopt(fd, SOL_SOCKET, SO_ZEROCOPY, &val, sizeof(val))) { - perror("setsockopt"); - exit(EXIT_FAILURE); - } -} - void vsock_recv_completion(int fd, const bool *zerocopied) { struct sock_extended_err *serr; diff --git a/tools/testing/vsock/msg_zerocopy_common.h b/tools/testing/vsock/msg_zerocopy_common.h index 3763c5ccedb9..ad14139e93ca 100644 --- a/tools/testing/vsock/msg_zerocopy_common.h +++ b/tools/testing/vsock/msg_zerocopy_common.h @@ -12,7 +12,6 @@ #define VSOCK_RECVERR 1 #endif -void enable_so_zerocopy(int fd); void vsock_recv_completion(int fd, const bool *zerocopied); #endif /* MSG_ZEROCOPY_COMMON_H */ diff --git a/tools/testing/vsock/util.c b/tools/testing/vsock/util.c index a3d448a075e3..34e9dac0a105 100644 --- a/tools/testing/vsock/util.c +++ b/tools/testing/vsock/util.c @@ -651,3 +651,145 @@ void free_test_iovec(const struct iovec *test_iovec, free(iovec); } + +/* Set "unsigned long long" socket option and check that it's indeed set */ +void setsockopt_ull_check(int fd, int level, int optname, + unsigned long long val, char const *errmsg) +{ + unsigned long long chkval; + socklen_t chklen; + int err; + + err = setsockopt(fd, level, optname, &val, sizeof(val)); + if (err) { + fprintf(stderr, "setsockopt err: %s (%d)\n", + strerror(errno), errno); + goto fail; + } + + chkval = ~val; /* just make storage != val */ + chklen = sizeof(chkval); + + err = getsockopt(fd, level, optname, &chkval, &chklen); + if (err) { + fprintf(stderr, "getsockopt err: %s (%d)\n", + strerror(errno), errno); + goto fail; + } + + if (chklen != sizeof(chkval)) { + fprintf(stderr, "size mismatch: set %zu got %d\n", sizeof(val), + chklen); + goto fail; + } + + if (chkval != val) { + fprintf(stderr, "value mismatch: set %llu got %llu\n", val, + chkval); + goto fail; + } + return; +fail: + fprintf(stderr, "%s val %llu\n", errmsg, val); + exit(EXIT_FAILURE); +; +} + +/* Set "int" socket option and check that it's indeed set */ +void setsockopt_int_check(int fd, int level, int optname, int val, + char const *errmsg) +{ + int chkval; + socklen_t chklen; + int err; + + err = setsockopt(fd, level, optname, &val, sizeof(val)); + if (err) { + fprintf(stderr, "setsockopt err: %s (%d)\n", + strerror(errno), errno); + goto fail; + } + + chkval = ~val; /* just make storage != val */ + chklen = sizeof(chkval); + + err = getsockopt(fd, level, optname, &chkval, &chklen); + if (err) { + fprintf(stderr, "getsockopt err: %s (%d)\n", + strerror(errno), errno); + goto fail; + } + + if (chklen != sizeof(chkval)) { + fprintf(stderr, "size mismatch: set %zu got %d\n", sizeof(val), + chklen); + goto fail; + } + + if (chkval != val) { + fprintf(stderr, "value mismatch: set %d got %d\n", val, chkval); + goto fail; + } + return; +fail: + fprintf(stderr, "%s val %d\n", errmsg, val); + exit(EXIT_FAILURE); +} + +static void mem_invert(unsigned char *mem, size_t size) +{ + size_t i; + + for (i = 0; i < size; i++) + mem[i] = ~mem[i]; +} + +/* Set "timeval" socket option and check that it's indeed set */ +void setsockopt_timeval_check(int fd, int level, int optname, + struct timeval val, char const *errmsg) +{ + struct timeval chkval; + socklen_t chklen; + int err; + + err = setsockopt(fd, level, optname, &val, sizeof(val)); + if (err) { + fprintf(stderr, "setsockopt err: %s (%d)\n", + strerror(errno), errno); + goto fail; + } + + /* just make storage != val */ + chkval = val; + mem_invert((unsigned char *)&chkval, sizeof(chkval)); + chklen = sizeof(chkval); + + err = getsockopt(fd, level, optname, &chkval, &chklen); + if (err) { + fprintf(stderr, "getsockopt err: %s (%d)\n", + strerror(errno), errno); + goto fail; + } + + if (chklen != sizeof(chkval)) { + fprintf(stderr, "size mismatch: set %zu got %d\n", sizeof(val), + chklen); + goto fail; + } + + if (memcmp(&chkval, &val, sizeof(val)) != 0) { + fprintf(stderr, "value mismatch: set %ld:%ld got %ld:%ld\n", + val.tv_sec, val.tv_usec, chkval.tv_sec, chkval.tv_usec); + goto fail; + } + return; +fail: + fprintf(stderr, "%s val %ld:%ld\n", errmsg, val.tv_sec, val.tv_usec); + exit(EXIT_FAILURE); +} + +void enable_so_zerocopy_check(int fd) +{ + setsockopt_int_check(fd, SOL_SOCKET, SO_ZEROCOPY, 1, + "setsockopt SO_ZEROCOPY"); +} diff --git a/tools/testing/vsock/util.h b/tools/testing/vsock/util.h index fff22d4a14c0..ba84d296d8b7 100644 --- a/tools/testing/vsock/util.h +++ b/tools/testing/vsock/util.h @@ -68,4 +68,11 @@ unsigned long iovec_hash_djb2(const struct iovec *iov, size_t iovnum); struct iovec *alloc_test_iovec(const struct iovec *test_iovec, int iovnum); void free_test_iovec(const struct iovec *test_iovec, struct iovec *iovec, int iovnum); +void setsockopt_ull_check(int fd, int level, int optname, + unsigned long long val, char const *errmsg); +void setsockopt_int_check(int fd, int level, int optname, int val, + char const *errmsg); +void setsockopt_timeval_check(int fd, int level, int optname, + struct timeval val, char const *errmsg); +void enable_so_zerocopy_check(int fd); #endif /* UTIL_H */ diff --git a/tools/testing/vsock/vsock_perf.c b/tools/testing/vsock/vsock_perf.c index 8e0a6c0770d3..75971ac708c9 100644 --- a/tools/testing/vsock/vsock_perf.c +++ b/tools/testing/vsock/vsock_perf.c @@ -251,6 +251,16 @@ static void run_receiver(int rcvlowat_bytes) close(fd); } +static void enable_so_zerocopy(int fd) +{ + int val = 1; + + if (setsockopt(fd, SOL_SOCKET, SO_ZEROCOPY, &val, sizeof(val))) { + perror("setsockopt"); + exit(EXIT_FAILURE); + } +} + static void run_sender(int peer_cid, unsigned long to_send_bytes) { time_t tx_begin_ns; diff --git a/tools/testing/vsock/vsock_test.c b/tools/testing/vsock/vsock_test.c index 0b7f5bf546da..48f17641ca50 100644 --- a/tools/testing/vsock/vsock_test.c +++ b/tools/testing/vsock/vsock_test.c @@ -444,17 +444,13 @@ static void test_seqpacket_msg_bounds_server(const struct test_opts *opts) sock_buf_size = SOCK_BUF_SIZE; - if (setsockopt(fd, AF_VSOCK, SO_VM_SOCKETS_BUFFER_MAX_SIZE, - &sock_buf_size, sizeof(sock_buf_size))) { - perror("setsockopt(SO_VM_SOCKETS_BUFFER_MAX_SIZE)"); - exit(EXIT_FAILURE); - } + setsockopt_ull_check(fd, AF_VSOCK, SO_VM_SOCKETS_BUFFER_MAX_SIZE, + sock_buf_size, + "setsockopt(SO_VM_SOCKETS_BUFFER_MAX_SIZE)"); - if (setsockopt(fd, AF_VSOCK, SO_VM_SOCKETS_BUFFER_SIZE, - &sock_buf_size, sizeof(sock_buf_size))) { - perror("setsockopt(SO_VM_SOCKETS_BUFFER_SIZE)"); - exit(EXIT_FAILURE); - } + setsockopt_ull_check(fd, AF_VSOCK, SO_VM_SOCKETS_BUFFER_SIZE, + sock_buf_size, + "setsockopt(SO_VM_SOCKETS_BUFFER_SIZE)"); /* Ready to receive data. */ control_writeln("SRVREADY"); @@ -586,10 +582,8 @@ static void test_seqpacket_timeout_client(const struct test_opts *opts) tv.tv_sec = RCVTIMEO_TIMEOUT_SEC; tv.tv_usec = 0; - if (setsockopt(fd, SOL_SOCKET, SO_RCVTIMEO, (void *)&tv, sizeof(tv)) == -1) { - perror("setsockopt(SO_RCVTIMEO)"); - exit(EXIT_FAILURE); - } + setsockopt_timeval_check(fd, SOL_SOCKET, SO_RCVTIMEO, tv, + "setsockopt(SO_RCVTIMEO)"); read_enter_ns = current_nsec(); @@ -855,11 +849,8 @@ static void test_stream_poll_rcvlowat_client(const struct test_opts *opts) exit(EXIT_FAILURE); } - if (setsockopt(fd, SOL_SOCKET, SO_RCVLOWAT, - &lowat_val, sizeof(lowat_val))) { - perror("setsockopt(SO_RCVLOWAT)"); - exit(EXIT_FAILURE); - } + setsockopt_int_check(fd, SOL_SOCKET, SO_RCVLOWAT, + lowat_val, "setsockopt(SO_RCVLOWAT)"); control_expectln("SRVSENT"); @@ -1383,11 +1374,9 @@ static void test_stream_credit_update_test(const struct test_opts *opts, /* size_t can be < unsigned long long */ sock_buf_size = buf_size; - if (setsockopt(fd, AF_VSOCK, SO_VM_SOCKETS_BUFFER_SIZE, - &sock_buf_size, sizeof(sock_buf_size))) { - perror("setsockopt(SO_VM_SOCKETS_BUFFER_SIZE)"); - exit(EXIT_FAILURE); - } + setsockopt_ull_check(fd, AF_VSOCK, SO_VM_SOCKETS_BUFFER_SIZE, + sock_buf_size, + "setsockopt(SO_VM_SOCKETS_BUFFER_SIZE)"); if (low_rx_bytes_test) { /* Set new SO_RCVLOWAT here. This enables sending credit @@ -1396,11 +1385,8 @@ static void test_stream_credit_update_test(const struct test_opts *opts, */ recv_buf_size = 1 + VIRTIO_VSOCK_MAX_PKT_BUF_SIZE; - if (setsockopt(fd, SOL_SOCKET, SO_RCVLOWAT, - &recv_buf_size, sizeof(recv_buf_size))) { - perror("setsockopt(SO_RCVLOWAT)"); - exit(EXIT_FAILURE); - } + setsockopt_int_check(fd, SOL_SOCKET, SO_RCVLOWAT, + recv_buf_size, "setsockopt(SO_RCVLOWAT)"); } /* Send one dummy byte here, because 'setsockopt()' above also @@ -1442,11 +1428,8 @@ static void test_stream_credit_update_test(const struct test_opts *opts, recv_buf_size++; /* Updating SO_RCVLOWAT will send credit update. */ - if (setsockopt(fd, SOL_SOCKET, SO_RCVLOWAT, - &recv_buf_size, sizeof(recv_buf_size))) { - perror("setsockopt(SO_RCVLOWAT)"); - exit(EXIT_FAILURE); - } + setsockopt_int_check(fd, SOL_SOCKET, SO_RCVLOWAT, + recv_buf_size, "setsockopt(SO_RCVLOWAT)"); } fds.fd = fd; diff --git a/tools/testing/vsock/vsock_test_zerocopy.c b/tools/testing/vsock/vsock_test_zerocopy.c index 04c376b6937f..9d9a6cb9614a 100644 --- a/tools/testing/vsock/vsock_test_zerocopy.c +++ b/tools/testing/vsock/vsock_test_zerocopy.c @@ -162,7 +162,7 @@ static void test_client(const struct test_opts *opts, } if (test_data->so_zerocopy) - enable_so_zerocopy(fd); + enable_so_zerocopy_check(fd); iovec = alloc_test_iovec(test_data->vecs, test_data->vecs_cnt); diff --git a/tools/testing/vsock/vsock_uring_test.c b/tools/testing/vsock/vsock_uring_test.c index 6c3e6f70c457..5c3078969659 100644 --- a/tools/testing/vsock/vsock_uring_test.c +++ b/tools/testing/vsock/vsock_uring_test.c @@ -73,7 +73,7 @@ static void vsock_io_uring_client(const struct test_opts *opts, } if (msg_zerocopy) - enable_so_zerocopy(fd); + enable_so_zerocopy_check(fd); iovec = alloc_test_iovec(test_data->vecs, test_data->vecs_cnt); -- 2.50.1 From 750e51603395e755537da08f745864c93e3ce741 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 3 Dec 2024 17:09:33 +0000 Subject: [PATCH 03/16] net: avoid potential UAF in default_operstate() syzbot reported an UAF in default_operstate() [1] Issue is a race between device and netns dismantles. After calling __rtnl_unlock() from netdev_run_todo(), we can not assume the netns of each device is still alive. Make sure the device is not in NETREG_UNREGISTERED state, and add an ASSERT_RTNL() before the call to __dev_get_by_index(). We might move this ASSERT_RTNL() in __dev_get_by_index() in the future. [1] BUG: KASAN: slab-use-after-free in __dev_get_by_index+0x5d/0x110 net/core/dev.c:852 Read of size 8 at addr ffff888043eba1b0 by task syz.0.0/5339 CPU: 0 UID: 0 PID: 5339 Comm: syz.0.0 Not tainted 6.12.0-syzkaller-10296-gaaf20f870da0 #0 Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.16.3-debian-1.16.3-2~bpo12+1 04/01/2014 Call Trace: __dump_stack lib/dump_stack.c:94 [inline] dump_stack_lvl+0x241/0x360 lib/dump_stack.c:120 print_address_description mm/kasan/report.c:378 [inline] print_report+0x169/0x550 mm/kasan/report.c:489 kasan_report+0x143/0x180 mm/kasan/report.c:602 __dev_get_by_index+0x5d/0x110 net/core/dev.c:852 default_operstate net/core/link_watch.c:51 [inline] rfc2863_policy+0x224/0x300 net/core/link_watch.c:67 linkwatch_do_dev+0x3e/0x170 net/core/link_watch.c:170 netdev_run_todo+0x461/0x1000 net/core/dev.c:10894 rtnl_unlock net/core/rtnetlink.c:152 [inline] rtnl_net_unlock include/linux/rtnetlink.h:133 [inline] rtnl_dellink+0x760/0x8d0 net/core/rtnetlink.c:3520 rtnetlink_rcv_msg+0x791/0xcf0 net/core/rtnetlink.c:6911 netlink_rcv_skb+0x1e3/0x430 net/netlink/af_netlink.c:2541 netlink_unicast_kernel net/netlink/af_netlink.c:1321 [inline] netlink_unicast+0x7f6/0x990 net/netlink/af_netlink.c:1347 netlink_sendmsg+0x8e4/0xcb0 net/netlink/af_netlink.c:1891 sock_sendmsg_nosec net/socket.c:711 [inline] __sock_sendmsg+0x221/0x270 net/socket.c:726 ____sys_sendmsg+0x52a/0x7e0 net/socket.c:2583 ___sys_sendmsg net/socket.c:2637 [inline] __sys_sendmsg+0x269/0x350 net/socket.c:2669 do_syscall_x64 arch/x86/entry/common.c:52 [inline] do_syscall_64+0xf3/0x230 arch/x86/entry/common.c:83 entry_SYSCALL_64_after_hwframe+0x77/0x7f RIP: 0033:0x7f2a3cb80809 Code: ff ff c3 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 40 00 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 c7 c1 a8 ff ff ff f7 d8 64 89 01 48 RSP: 002b:00007f2a3d9cd058 EFLAGS: 00000246 ORIG_RAX: 000000000000002e RAX: ffffffffffffffda RBX: 00007f2a3cd45fa0 RCX: 00007f2a3cb80809 RDX: 0000000000000000 RSI: 0000000020000000 RDI: 0000000000000008 RBP: 00007f2a3cbf393e R08: 0000000000000000 R09: 0000000000000000 R10: 0000000000000000 R11: 0000000000000246 R12: 0000000000000000 R13: 0000000000000000 R14: 00007f2a3cd45fa0 R15: 00007ffd03bc65c8 Allocated by task 5339: kasan_save_stack mm/kasan/common.c:47 [inline] kasan_save_track+0x3f/0x80 mm/kasan/common.c:68 poison_kmalloc_redzone mm/kasan/common.c:377 [inline] __kasan_kmalloc+0x98/0xb0 mm/kasan/common.c:394 kasan_kmalloc include/linux/kasan.h:260 [inline] __kmalloc_cache_noprof+0x243/0x390 mm/slub.c:4314 kmalloc_noprof include/linux/slab.h:901 [inline] kmalloc_array_noprof include/linux/slab.h:945 [inline] netdev_create_hash net/core/dev.c:11870 [inline] netdev_init+0x10c/0x250 net/core/dev.c:11890 ops_init+0x31e/0x590 net/core/net_namespace.c:138 setup_net+0x287/0x9e0 net/core/net_namespace.c:362 copy_net_ns+0x33f/0x570 net/core/net_namespace.c:500 create_new_namespaces+0x425/0x7b0 kernel/nsproxy.c:110 unshare_nsproxy_namespaces+0x124/0x180 kernel/nsproxy.c:228 ksys_unshare+0x57d/0xa70 kernel/fork.c:3314 __do_sys_unshare kernel/fork.c:3385 [inline] __se_sys_unshare kernel/fork.c:3383 [inline] __x64_sys_unshare+0x38/0x40 kernel/fork.c:3383 do_syscall_x64 arch/x86/entry/common.c:52 [inline] do_syscall_64+0xf3/0x230 arch/x86/entry/common.c:83 entry_SYSCALL_64_after_hwframe+0x77/0x7f Freed by task 12: kasan_save_stack mm/kasan/common.c:47 [inline] kasan_save_track+0x3f/0x80 mm/kasan/common.c:68 kasan_save_free_info+0x40/0x50 mm/kasan/generic.c:582 poison_slab_object mm/kasan/common.c:247 [inline] __kasan_slab_free+0x59/0x70 mm/kasan/common.c:264 kasan_slab_free include/linux/kasan.h:233 [inline] slab_free_hook mm/slub.c:2338 [inline] slab_free mm/slub.c:4598 [inline] kfree+0x196/0x420 mm/slub.c:4746 netdev_exit+0x65/0xd0 net/core/dev.c:11992 ops_exit_list net/core/net_namespace.c:172 [inline] cleanup_net+0x802/0xcc0 net/core/net_namespace.c:632 process_one_work kernel/workqueue.c:3229 [inline] process_scheduled_works+0xa63/0x1850 kernel/workqueue.c:3310 worker_thread+0x870/0xd30 kernel/workqueue.c:3391 kthread+0x2f0/0x390 kernel/kthread.c:389 ret_from_fork+0x4b/0x80 arch/x86/kernel/process.c:147 ret_from_fork_asm+0x1a/0x30 arch/x86/entry/entry_64.S:244 The buggy address belongs to the object at ffff888043eba000 which belongs to the cache kmalloc-2k of size 2048 The buggy address is located 432 bytes inside of freed 2048-byte region [ffff888043eba000, ffff888043eba800) The buggy address belongs to the physical page: page: refcount:1 mapcount:0 mapping:0000000000000000 index:0x0 pfn:0x43eb8 head: order:3 mapcount:0 entire_mapcount:0 nr_pages_mapped:0 pincount:0 flags: 0x4fff00000000040(head|node=1|zone=1|lastcpupid=0x7ff) page_type: f5(slab) raw: 04fff00000000040 ffff88801ac42000 dead000000000122 0000000000000000 raw: 0000000000000000 0000000000080008 00000001f5000000 0000000000000000 head: 04fff00000000040 ffff88801ac42000 dead000000000122 0000000000000000 head: 0000000000000000 0000000000080008 00000001f5000000 0000000000000000 head: 04fff00000000003 ffffea00010fae01 ffffffffffffffff 0000000000000000 head: 0000000000000008 0000000000000000 00000000ffffffff 0000000000000000 page dumped because: kasan: bad access detected page_owner tracks the page as allocated page last allocated via order 3, migratetype Unmovable, gfp_mask 0xd20c0(__GFP_IO|__GFP_FS|__GFP_NOWARN|__GFP_NORETRY|__GFP_COMP|__GFP_NOMEMALLOC), pid 5339, tgid 5338 (syz.0.0), ts 69674195892, free_ts 69663220888 set_page_owner include/linux/page_owner.h:32 [inline] post_alloc_hook+0x1f3/0x230 mm/page_alloc.c:1556 prep_new_page mm/page_alloc.c:1564 [inline] get_page_from_freelist+0x3649/0x3790 mm/page_alloc.c:3474 __alloc_pages_noprof+0x292/0x710 mm/page_alloc.c:4751 alloc_pages_mpol_noprof+0x3e8/0x680 mm/mempolicy.c:2265 alloc_slab_page+0x6a/0x140 mm/slub.c:2408 allocate_slab+0x5a/0x2f0 mm/slub.c:2574 new_slab mm/slub.c:2627 [inline] ___slab_alloc+0xcd1/0x14b0 mm/slub.c:3815 __slab_alloc+0x58/0xa0 mm/slub.c:3905 __slab_alloc_node mm/slub.c:3980 [inline] slab_alloc_node mm/slub.c:4141 [inline] __do_kmalloc_node mm/slub.c:4282 [inline] __kmalloc_noprof+0x2e6/0x4c0 mm/slub.c:4295 kmalloc_noprof include/linux/slab.h:905 [inline] sk_prot_alloc+0xe0/0x210 net/core/sock.c:2165 sk_alloc+0x38/0x370 net/core/sock.c:2218 __netlink_create+0x65/0x260 net/netlink/af_netlink.c:629 __netlink_kernel_create+0x174/0x6f0 net/netlink/af_netlink.c:2015 netlink_kernel_create include/linux/netlink.h:62 [inline] uevent_net_init+0xed/0x2d0 lib/kobject_uevent.c:783 ops_init+0x31e/0x590 net/core/net_namespace.c:138 setup_net+0x287/0x9e0 net/core/net_namespace.c:362 page last free pid 1032 tgid 1032 stack trace: reset_page_owner include/linux/page_owner.h:25 [inline] free_pages_prepare mm/page_alloc.c:1127 [inline] free_unref_page+0xdf9/0x1140 mm/page_alloc.c:2657 __slab_free+0x31b/0x3d0 mm/slub.c:4509 qlink_free mm/kasan/quarantine.c:163 [inline] qlist_free_all+0x9a/0x140 mm/kasan/quarantine.c:179 kasan_quarantine_reduce+0x14f/0x170 mm/kasan/quarantine.c:286 __kasan_slab_alloc+0x23/0x80 mm/kasan/common.c:329 kasan_slab_alloc include/linux/kasan.h:250 [inline] slab_post_alloc_hook mm/slub.c:4104 [inline] slab_alloc_node mm/slub.c:4153 [inline] kmem_cache_alloc_node_noprof+0x1d9/0x380 mm/slub.c:4205 __alloc_skb+0x1c3/0x440 net/core/skbuff.c:668 alloc_skb include/linux/skbuff.h:1323 [inline] alloc_skb_with_frags+0xc3/0x820 net/core/skbuff.c:6612 sock_alloc_send_pskb+0x91a/0xa60 net/core/sock.c:2881 sock_alloc_send_skb include/net/sock.h:1797 [inline] mld_newpack+0x1c3/0xaf0 net/ipv6/mcast.c:1747 add_grhead net/ipv6/mcast.c:1850 [inline] add_grec+0x1492/0x19a0 net/ipv6/mcast.c:1988 mld_send_initial_cr+0x228/0x4b0 net/ipv6/mcast.c:2234 ipv6_mc_dad_complete+0x88/0x490 net/ipv6/mcast.c:2245 addrconf_dad_completed+0x712/0xcd0 net/ipv6/addrconf.c:4342 addrconf_dad_work+0xdc2/0x16f0 process_one_work kernel/workqueue.c:3229 [inline] process_scheduled_works+0xa63/0x1850 kernel/workqueue.c:3310 Memory state around the buggy address: ffff888043eba080: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb ffff888043eba100: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb >ffff888043eba180: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb ^ ffff888043eba200: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb ffff888043eba280: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb Fixes: 8c55facecd7a ("net: linkwatch: only report IF_OPER_LOWERLAYERDOWN if iflink is actually down") Reported-by: syzbot+1939f24bdb783e9e43d9@syzkaller.appspotmail.com Closes: https://lore.kernel.org/netdev/674f3a18.050a0220.48a03.0041.GAE@google.com/T/#u Signed-off-by: Eric Dumazet Reviewed-by: Vladimir Oltean Link: https://patch.msgid.link/20241203170933.2449307-1-edumazet@google.com Signed-off-by: Paolo Abeni --- net/core/link_watch.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/net/core/link_watch.c b/net/core/link_watch.c index ab150641142a..1b4d39e38084 100644 --- a/net/core/link_watch.c +++ b/net/core/link_watch.c @@ -45,9 +45,14 @@ static unsigned int default_operstate(const struct net_device *dev) int iflink = dev_get_iflink(dev); struct net_device *peer; - if (iflink == dev->ifindex) + /* If called from netdev_run_todo()/linkwatch_sync_dev(), + * dev_net(dev) can be already freed, and RTNL is not held. + */ + if (dev->reg_state == NETREG_UNREGISTERED || + iflink == dev->ifindex) return IF_OPER_DOWN; + ASSERT_RTNL(); peer = __dev_get_by_index(dev_net(dev), iflink); if (!peer) return IF_OPER_DOWN; -- 2.50.1 From 31f1b55d5d7e531cd827419e5d71c19f24de161c Mon Sep 17 00:00:00 2001 From: Shradha Gupta Date: Tue, 3 Dec 2024 21:48:20 -0800 Subject: [PATCH 04/16] net :mana :Request a V2 response version for MANA_QUERY_GF_STAT The current requested response version(V1) for MANA_QUERY_GF_STAT query results in STATISTICS_FLAGS_TX_ERRORS_GDMA_ERROR value being set to 0 always. In order to get the correct value for this counter we request the response version to be V2. Cc: stable@vger.kernel.org Fixes: e1df5202e879 ("net :mana :Add remaining GDMA stats for MANA to ethtool") Signed-off-by: Shradha Gupta Reviewed-by: Haiyang Zhang Link: https://patch.msgid.link/1733291300-12593-1-git-send-email-shradhagupta@linux.microsoft.com Signed-off-by: Paolo Abeni --- drivers/net/ethernet/microsoft/mana/mana_en.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/ethernet/microsoft/mana/mana_en.c b/drivers/net/ethernet/microsoft/mana/mana_en.c index 57ac732e7707..f73848a4feb3 100644 --- a/drivers/net/ethernet/microsoft/mana/mana_en.c +++ b/drivers/net/ethernet/microsoft/mana/mana_en.c @@ -2536,6 +2536,7 @@ void mana_query_gf_stats(struct mana_port_context *apc) mana_gd_init_req_hdr(&req.hdr, MANA_QUERY_GF_STAT, sizeof(req), sizeof(resp)); + req.hdr.resp.msg_version = GDMA_MESSAGE_V2; req.req_stats = STATISTICS_FLAGS_RX_DISCARDS_NO_WQE | STATISTICS_FLAGS_RX_ERRORS_VPORT_DISABLED | STATISTICS_FLAGS_HC_RX_BYTES | -- 2.50.1 From dc1b157b828dfe412c776ac1dd8db158f6016b39 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 4 Dec 2024 10:04:14 -0500 Subject: [PATCH 05/16] tracing: Fix archs that still call tracepoints without RCU watching Tracepoints require having RCU "watching" as it uses RCU to do updates to the tracepoints. There are some cases that would call a tracepoint when RCU was not "watching". This was usually in the idle path where RCU has "shutdown". For the few locations that had tracepoints without RCU watching, there was an trace_*_rcuidle() variant that could be used. This used SRCU for protection. There are tracepoints that trace when interrupts and preemption are enabled and disabled. In some architectures, these tracepoints are called in a path where RCU is not watching. When x86 and arm64 removed these locations, it was incorrectly assumed that it would be safe to remove the trace_*_rcuidle() variant and also remove the SRCU logic, as it made the code more complex and harder to implement new tracepoint features (like faultable tracepoints and tracepoints in rust). Instead of bringing back the trace_*_rcuidle(), as it will not be trivial to do as new code has already been added depending on its removal, add a workaround to the one file that still requires it (trace_preemptirq.c). If the architecture does not define CONFIG_ARCH_WANTS_NO_INSTR, then check if the code is in the idle path, and if so, call ct_irq_enter/exit() which will enable RCU around the tracepoint. Cc: Masami Hiramatsu Cc: Mathieu Desnoyers Cc: Peter Zijlstra Cc: Mark Rutland Link: https://lore.kernel.org/20241204100414.4d3e06d0@gandalf.local.home Reported-by: Geert Uytterhoeven Fixes: 48bcda684823 ("tracing: Remove definition of trace_*_rcuidle()") Closes: https://lore.kernel.org/all/bddb02de-957a-4df5-8e77-829f55728ea2@roeck-us.net/ Acked-by: Paul E. McKenney Tested-by: Guenter Roeck Tested-by: Geert Uytterhoeven Tested-by: Madhavan Srinivasan Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace_preemptirq.c | 43 ++++++++++++++++++++++++++++----- 1 file changed, 37 insertions(+), 6 deletions(-) diff --git a/kernel/trace/trace_preemptirq.c b/kernel/trace/trace_preemptirq.c index 5c03633316a6..0c42b15c3800 100644 --- a/kernel/trace/trace_preemptirq.c +++ b/kernel/trace/trace_preemptirq.c @@ -10,11 +10,42 @@ #include #include #include +#include #include "trace.h" #define CREATE_TRACE_POINTS #include +/* + * Use regular trace points on architectures that implement noinstr + * tooling: these calls will only happen with RCU enabled, which can + * use a regular tracepoint. + * + * On older architectures, RCU may not be watching in idle. In that + * case, wake up RCU to watch while calling the tracepoint. These + * aren't NMI-safe - so exclude NMI contexts: + */ +#ifdef CONFIG_ARCH_WANTS_NO_INSTR +#define trace(point, args) trace_##point(args) +#else +#define trace(point, args) \ + do { \ + if (trace_##point##_enabled()) { \ + bool exit_rcu = false; \ + if (in_nmi()) \ + break; \ + if (!IS_ENABLED(CONFIG_TINY_RCU) && \ + is_idle_task(current)) { \ + ct_irq_enter(); \ + exit_rcu = true; \ + } \ + trace_##point(args); \ + if (exit_rcu) \ + ct_irq_exit(); \ + } \ + } while (0) +#endif + #ifdef CONFIG_TRACE_IRQFLAGS /* Per-cpu variable to prevent redundant calls when IRQs already off */ static DEFINE_PER_CPU(int, tracing_irq_cpu); @@ -28,7 +59,7 @@ static DEFINE_PER_CPU(int, tracing_irq_cpu); void trace_hardirqs_on_prepare(void) { if (this_cpu_read(tracing_irq_cpu)) { - trace_irq_enable(CALLER_ADDR0, CALLER_ADDR1); + trace(irq_enable, TP_ARGS(CALLER_ADDR0, CALLER_ADDR1)); tracer_hardirqs_on(CALLER_ADDR0, CALLER_ADDR1); this_cpu_write(tracing_irq_cpu, 0); } @@ -39,7 +70,7 @@ NOKPROBE_SYMBOL(trace_hardirqs_on_prepare); void trace_hardirqs_on(void) { if (this_cpu_read(tracing_irq_cpu)) { - trace_irq_enable(CALLER_ADDR0, CALLER_ADDR1); + trace(irq_enable, TP_ARGS(CALLER_ADDR0, CALLER_ADDR1)); tracer_hardirqs_on(CALLER_ADDR0, CALLER_ADDR1); this_cpu_write(tracing_irq_cpu, 0); } @@ -61,7 +92,7 @@ void trace_hardirqs_off_finish(void) if (!this_cpu_read(tracing_irq_cpu)) { this_cpu_write(tracing_irq_cpu, 1); tracer_hardirqs_off(CALLER_ADDR0, CALLER_ADDR1); - trace_irq_disable(CALLER_ADDR0, CALLER_ADDR1); + trace(irq_disable, TP_ARGS(CALLER_ADDR0, CALLER_ADDR1)); } } @@ -75,7 +106,7 @@ void trace_hardirqs_off(void) if (!this_cpu_read(tracing_irq_cpu)) { this_cpu_write(tracing_irq_cpu, 1); tracer_hardirqs_off(CALLER_ADDR0, CALLER_ADDR1); - trace_irq_disable(CALLER_ADDR0, CALLER_ADDR1); + trace(irq_disable, TP_ARGS(CALLER_ADDR0, CALLER_ADDR1)); } } EXPORT_SYMBOL(trace_hardirqs_off); @@ -86,13 +117,13 @@ NOKPROBE_SYMBOL(trace_hardirqs_off); void trace_preempt_on(unsigned long a0, unsigned long a1) { - trace_preempt_enable(a0, a1); + trace(preempt_enable, TP_ARGS(a0, a1)); tracer_preempt_on(a0, a1); } void trace_preempt_off(unsigned long a0, unsigned long a1) { - trace_preempt_disable(a0, a1); + trace(preempt_disable, TP_ARGS(a0, a1)); tracer_preempt_off(a0, a1); } #endif -- 2.50.1 From b04d86fff66b15c07505d226431f808c15b1703c Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 4 Dec 2024 17:05:48 +0000 Subject: [PATCH 06/16] tipc: fix NULL deref in cleanup_bearer() syzbot found [1] that after blamed commit, ub->ubsock->sk was NULL when attempting the atomic_dec() : atomic_dec(&tipc_net(sock_net(ub->ubsock->sk))->wq_count); Fix this by caching the tipc_net pointer. [1] Oops: general protection fault, probably for non-canonical address 0xdffffc0000000006: 0000 [#1] PREEMPT SMP KASAN PTI KASAN: null-ptr-deref in range [0x0000000000000030-0x0000000000000037] CPU: 0 UID: 0 PID: 5896 Comm: kworker/0:3 Not tainted 6.13.0-rc1-next-20241203-syzkaller #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 09/13/2024 Workqueue: events cleanup_bearer RIP: 0010:read_pnet include/net/net_namespace.h:387 [inline] RIP: 0010:sock_net include/net/sock.h:655 [inline] RIP: 0010:cleanup_bearer+0x1f7/0x280 net/tipc/udp_media.c:820 Code: 18 48 89 d8 48 c1 e8 03 42 80 3c 28 00 74 08 48 89 df e8 3c f7 99 f6 48 8b 1b 48 83 c3 30 e8 f0 e4 60 00 48 89 d8 48 c1 e8 03 <42> 80 3c 28 00 74 08 48 89 df e8 1a f7 99 f6 49 83 c7 e8 48 8b 1b RSP: 0018:ffffc9000410fb70 EFLAGS: 00010206 RAX: 0000000000000006 RBX: 0000000000000030 RCX: ffff88802fe45a00 RDX: 0000000000000001 RSI: 0000000000000008 RDI: ffffc9000410f900 RBP: ffff88807e1f0908 R08: ffffc9000410f907 R09: 1ffff92000821f20 R10: dffffc0000000000 R11: fffff52000821f21 R12: ffff888031d19980 R13: dffffc0000000000 R14: dffffc0000000000 R15: ffff88807e1f0918 FS: 0000000000000000(0000) GS:ffff8880b8600000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000556ca050b000 CR3: 0000000031c0c000 CR4: 00000000003526f0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Fixes: 6a2fa13312e5 ("tipc: Fix use-after-free of kernel socket in cleanup_bearer().") Reported-by: syzbot+46aa5474f179dacd1a3b@syzkaller.appspotmail.com Closes: https://lore.kernel.org/netdev/67508b5f.050a0220.17bd51.0070.GAE@google.com/T/#u Signed-off-by: Eric Dumazet Reviewed-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20241204170548.4152658-1-edumazet@google.com Signed-off-by: Jakub Kicinski --- net/tipc/udp_media.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/net/tipc/udp_media.c b/net/tipc/udp_media.c index b7e25e7e9933..108a4cc2e001 100644 --- a/net/tipc/udp_media.c +++ b/net/tipc/udp_media.c @@ -807,6 +807,7 @@ static void cleanup_bearer(struct work_struct *work) { struct udp_bearer *ub = container_of(work, struct udp_bearer, work); struct udp_replicast *rcast, *tmp; + struct tipc_net *tn; list_for_each_entry_safe(rcast, tmp, &ub->rcast.list, list) { dst_cache_destroy(&rcast->dst_cache); @@ -814,10 +815,14 @@ static void cleanup_bearer(struct work_struct *work) kfree_rcu(rcast, rcu); } + tn = tipc_net(sock_net(ub->ubsock->sk)); + dst_cache_destroy(&ub->rcast.dst_cache); udp_tunnel_sock_release(ub->ubsock); + + /* Note: could use a call_rcu() to avoid another synchronize_net() */ synchronize_net(); - atomic_dec(&tipc_net(sock_net(ub->ubsock->sk))->wq_count); + atomic_dec(&tn->wq_count); kfree(ub); } -- 2.50.1 From 11776cff0b563c8b8a4fa76cab620bfb633a8cb8 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Wed, 4 Dec 2024 15:06:41 +0300 Subject: [PATCH 07/16] net/mlx5: DR, prevent potential error pointer dereference The dr_domain_add_vport_cap() function generally returns NULL on error but sometimes we want it to return ERR_PTR(-EBUSY) so the caller can retry. The problem here is that "ret" can be either -EBUSY or -ENOMEM and if it's and -ENOMEM then the error pointer is propogated back and eventually dereferenced in dr_ste_v0_build_src_gvmi_qpn_tag(). Fixes: 11a45def2e19 ("net/mlx5: DR, Add support for SF vports") Signed-off-by: Dan Carpenter Reviewed-by: Tariq Toukan Link: https://patch.msgid.link/07477254-e179-43e2-b1b3-3b9db4674195@stanley.mountain Signed-off-by: Jakub Kicinski --- .../net/ethernet/mellanox/mlx5/core/steering/sws/dr_domain.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/sws/dr_domain.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/sws/dr_domain.c index 3d74109f8230..49f22cad92bf 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/sws/dr_domain.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/sws/dr_domain.c @@ -297,7 +297,9 @@ dr_domain_add_vport_cap(struct mlx5dr_domain *dmn, u16 vport) if (ret) { mlx5dr_dbg(dmn, "Couldn't insert new vport into xarray (%d)\n", ret); kvfree(vport_caps); - return ERR_PTR(ret); + if (ret == -EBUSY) + return ERR_PTR(-EBUSY); + return NULL; } return vport_caps; -- 2.50.1 From cf3515c556907b4da290967a2a6cbbd9ee0ee723 Mon Sep 17 00:00:00 2001 From: Danielle Ratson Date: Thu, 5 Dec 2024 17:35:59 +0100 Subject: [PATCH 08/16] selftests: mlxsw: sharedbuffer: Remove h1 ingress test case The test is sending only one packet generated with mausezahn from $h1 to $h2. However, for some reason, it is testing for non-zero maximum occupancy in both the ingress pool of $h1 and $h2. The former only passes when $h2 happens to send a packet. Avoid intermittent failures by removing unintentional test case regarding the ingress pool of $h1. Fixes: a865ad999603 ("selftests: mlxsw: Add shared buffer traffic test") Signed-off-by: Danielle Ratson Reviewed-by: Ido Schimmel Signed-off-by: Ido Schimmel Signed-off-by: Petr Machata Link: https://patch.msgid.link/5b7344608d5e06f38209e48d8af8c92fa11b6742.1733414773.git.petrm@nvidia.com Signed-off-by: Jakub Kicinski --- tools/testing/selftests/drivers/net/mlxsw/sharedbuffer.sh | 5 ----- 1 file changed, 5 deletions(-) diff --git a/tools/testing/selftests/drivers/net/mlxsw/sharedbuffer.sh b/tools/testing/selftests/drivers/net/mlxsw/sharedbuffer.sh index 0c47faff9274..a7b3d6cf3185 100755 --- a/tools/testing/selftests/drivers/net/mlxsw/sharedbuffer.sh +++ b/tools/testing/selftests/drivers/net/mlxsw/sharedbuffer.sh @@ -108,11 +108,6 @@ port_pool_test() devlink sb occupancy snapshot $DEVLINK_DEV - RET=0 - max_occ=$(sb_occ_pool_check $dl_port1 $SB_POOL_ING $exp_max_occ) - check_err $? "Expected iPool($SB_POOL_ING) max occupancy to be $exp_max_occ, but got $max_occ" - log_test "physical port's($h1) ingress pool" - RET=0 max_occ=$(sb_occ_pool_check $dl_port2 $SB_POOL_ING $exp_max_occ) check_err $? "Expected iPool($SB_POOL_ING) max occupancy to be $exp_max_occ, but got $max_occ" -- 2.50.1 From 6c46ad4d1bb2e8ec2265296e53765190f6e32f33 Mon Sep 17 00:00:00 2001 From: Danielle Ratson Date: Thu, 5 Dec 2024 17:36:00 +0100 Subject: [PATCH 09/16] selftests: mlxsw: sharedbuffer: Remove duplicate test cases On both port_tc_ip_test() and port_tc_arp_test(), the max occupancy is checked on $h2 twice, when only the error message is different and does not match the check itself. Remove the two duplicated test cases from the test. Fixes: a865ad999603 ("selftests: mlxsw: Add shared buffer traffic test") Signed-off-by: Danielle Ratson Reviewed-by: Ido Schimmel Signed-off-by: Ido Schimmel Signed-off-by: Petr Machata Link: https://patch.msgid.link/d9eb26f6fc16a06a30b5c2c16ad80caf502bc561.1733414773.git.petrm@nvidia.com Signed-off-by: Jakub Kicinski --- .../selftests/drivers/net/mlxsw/sharedbuffer.sh | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/tools/testing/selftests/drivers/net/mlxsw/sharedbuffer.sh b/tools/testing/selftests/drivers/net/mlxsw/sharedbuffer.sh index a7b3d6cf3185..21bebc5726f6 100755 --- a/tools/testing/selftests/drivers/net/mlxsw/sharedbuffer.sh +++ b/tools/testing/selftests/drivers/net/mlxsw/sharedbuffer.sh @@ -131,11 +131,6 @@ port_tc_ip_test() devlink sb occupancy snapshot $DEVLINK_DEV - RET=0 - max_occ=$(sb_occ_itc_check $dl_port2 $SB_ITC $exp_max_occ) - check_err $? "Expected ingress TC($SB_ITC) max occupancy to be $exp_max_occ, but got $max_occ" - log_test "physical port's($h1) ingress TC - IP packet" - RET=0 max_occ=$(sb_occ_itc_check $dl_port2 $SB_ITC $exp_max_occ) check_err $? "Expected ingress TC($SB_ITC) max occupancy to be $exp_max_occ, but got $max_occ" @@ -158,11 +153,6 @@ port_tc_arp_test() devlink sb occupancy snapshot $DEVLINK_DEV - RET=0 - max_occ=$(sb_occ_itc_check $dl_port2 $SB_ITC $exp_max_occ) - check_err $? "Expected ingress TC($SB_ITC) max occupancy to be $exp_max_occ, but got $max_occ" - log_test "physical port's($h1) ingress TC - ARP packet" - RET=0 max_occ=$(sb_occ_itc_check $dl_port2 $SB_ITC $exp_max_occ) check_err $? "Expected ingress TC($SB_ITC) max occupancy to be $exp_max_occ, but got $max_occ" -- 2.50.1 From 5f2c7ab15fd806043db1a7d54b5ec36be0bd93b1 Mon Sep 17 00:00:00 2001 From: Danielle Ratson Date: Thu, 5 Dec 2024 17:36:01 +0100 Subject: [PATCH 10/16] selftests: mlxsw: sharedbuffer: Ensure no extra packets are counted The test assumes that the packet it is sending is the only packet being passed to the device. However, it is not the case and so other packets are filling the buffers as well. Therefore, the test sometimes fails because it is reading a maximum occupancy that is larger than expected. Add egress filters on $h1 and $h2 that will guarantee the above. Fixes: a865ad999603 ("selftests: mlxsw: Add shared buffer traffic test") Signed-off-by: Danielle Ratson Reviewed-by: Ido Schimmel Signed-off-by: Ido Schimmel Signed-off-by: Petr Machata Link: https://patch.msgid.link/64c28bc9b1cc1d78c4a73feda7cedbe9526ccf8b.1733414773.git.petrm@nvidia.com Signed-off-by: Jakub Kicinski --- .../drivers/net/mlxsw/sharedbuffer.sh | 40 +++++++++++++++++++ 1 file changed, 40 insertions(+) diff --git a/tools/testing/selftests/drivers/net/mlxsw/sharedbuffer.sh b/tools/testing/selftests/drivers/net/mlxsw/sharedbuffer.sh index 21bebc5726f6..c068e6c2a580 100755 --- a/tools/testing/selftests/drivers/net/mlxsw/sharedbuffer.sh +++ b/tools/testing/selftests/drivers/net/mlxsw/sharedbuffer.sh @@ -22,20 +22,34 @@ SB_ITC=0 h1_create() { simple_if_init $h1 192.0.1.1/24 + tc qdisc add dev $h1 clsact + + # Add egress filter on $h1 that will guarantee that the packet sent, + # will be the only packet being passed to the device. + tc filter add dev $h1 egress pref 2 handle 102 matchall action drop } h1_destroy() { + tc filter del dev $h1 egress pref 2 handle 102 matchall action drop + tc qdisc del dev $h1 clsact simple_if_fini $h1 192.0.1.1/24 } h2_create() { simple_if_init $h2 192.0.1.2/24 + tc qdisc add dev $h2 clsact + + # Add egress filter on $h2 that will guarantee that the packet sent, + # will be the only packet being passed to the device. + tc filter add dev $h2 egress pref 1 handle 101 matchall action drop } h2_destroy() { + tc filter del dev $h2 egress pref 1 handle 101 matchall action drop + tc qdisc del dev $h2 clsact simple_if_fini $h2 192.0.1.2/24 } @@ -101,6 +115,11 @@ port_pool_test() local exp_max_occ=$(devlink_cell_size_get) local max_occ + tc filter add dev $h1 egress protocol ip pref 1 handle 101 flower \ + src_mac $h1mac dst_mac $h2mac \ + src_ip 192.0.1.1 dst_ip 192.0.1.2 \ + action pass + devlink sb occupancy clearmax $DEVLINK_DEV $MZ $h1 -c 1 -p 10 -a $h1mac -b $h2mac -A 192.0.1.1 -B 192.0.1.2 \ @@ -117,6 +136,11 @@ port_pool_test() max_occ=$(sb_occ_pool_check $cpu_dl_port $SB_POOL_EGR_CPU $exp_max_occ) check_err $? "Expected ePool($SB_POOL_EGR_CPU) max occupancy to be $exp_max_occ, but got $max_occ" log_test "CPU port's egress pool" + + tc filter del dev $h1 egress protocol ip pref 1 handle 101 flower \ + src_mac $h1mac dst_mac $h2mac \ + src_ip 192.0.1.1 dst_ip 192.0.1.2 \ + action pass } port_tc_ip_test() @@ -124,6 +148,11 @@ port_tc_ip_test() local exp_max_occ=$(devlink_cell_size_get) local max_occ + tc filter add dev $h1 egress protocol ip pref 1 handle 101 flower \ + src_mac $h1mac dst_mac $h2mac \ + src_ip 192.0.1.1 dst_ip 192.0.1.2 \ + action pass + devlink sb occupancy clearmax $DEVLINK_DEV $MZ $h1 -c 1 -p 10 -a $h1mac -b $h2mac -A 192.0.1.1 -B 192.0.1.2 \ @@ -140,6 +169,11 @@ port_tc_ip_test() max_occ=$(sb_occ_etc_check $cpu_dl_port $SB_ITC_CPU_IP $exp_max_occ) check_err $? "Expected egress TC($SB_ITC_CPU_IP) max occupancy to be $exp_max_occ, but got $max_occ" log_test "CPU port's egress TC - IP packet" + + tc filter del dev $h1 egress protocol ip pref 1 handle 101 flower \ + src_mac $h1mac dst_mac $h2mac \ + src_ip 192.0.1.1 dst_ip 192.0.1.2 \ + action pass } port_tc_arp_test() @@ -147,6 +181,9 @@ port_tc_arp_test() local exp_max_occ=$(devlink_cell_size_get) local max_occ + tc filter add dev $h1 egress protocol arp pref 1 handle 101 flower \ + src_mac $h1mac action pass + devlink sb occupancy clearmax $DEVLINK_DEV $MZ $h1 -c 1 -p 10 -a $h1mac -A 192.0.1.1 -t arp -q @@ -162,6 +199,9 @@ port_tc_arp_test() max_occ=$(sb_occ_etc_check $cpu_dl_port $SB_ITC_CPU_ARP $exp_max_occ) check_err $? "Expected egress TC($SB_ITC_IP2ME) max occupancy to be $exp_max_occ, but got $max_occ" log_test "CPU port's egress TC - ARP packet" + + tc filter del dev $h1 egress protocol arp pref 1 handle 101 flower \ + src_mac $h1mac action pass } setup_prepare() -- 2.50.1 From 5e7aa97c7acf171275ac02a8bb018c31b8918d13 Mon Sep 17 00:00:00 2001 From: =?utf8?q?Thomas=20Wei=C3=9Fschuh?= Date: Tue, 3 Dec 2024 18:09:55 +0100 Subject: [PATCH 11/16] ptp: kvm: x86: Return EOPNOTSUPP instead of ENODEV from kvm_arch_ptp_init() MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit The caller, ptp_kvm_init(), emits a warning if kvm_arch_ptp_init() exits with any error which is not EOPNOTSUPP: "fail to initialize ptp_kvm" Replace ENODEV with EOPNOTSUPP to avoid this spurious warning, aligning with the ARM implementation. Fixes: a86ed2cfa13c ("ptp: Don't print an error if ptp_kvm is not supported") Signed-off-by: Thomas Weißschuh Link: https://patch.msgid.link/20241203-kvm_ptp-eopnotsuppp-v2-1-d1d060f27aa6@weissschuh.net Signed-off-by: Jakub Kicinski --- drivers/ptp/ptp_kvm_x86.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/ptp/ptp_kvm_x86.c b/drivers/ptp/ptp_kvm_x86.c index 617c8d6706d3..6cea4fe39bcf 100644 --- a/drivers/ptp/ptp_kvm_x86.c +++ b/drivers/ptp/ptp_kvm_x86.c @@ -26,7 +26,7 @@ int kvm_arch_ptp_init(void) long ret; if (!kvm_para_available()) - return -ENODEV; + return -EOPNOTSUPP; if (cc_platform_has(CC_ATTR_GUEST_MEM_ENCRYPT)) { p = alloc_page(GFP_KERNEL | __GFP_ZERO); @@ -46,14 +46,14 @@ int kvm_arch_ptp_init(void) clock_pair_gpa = slow_virt_to_phys(clock_pair); if (!pvclock_get_pvti_cpu0_va()) { - ret = -ENODEV; + ret = -EOPNOTSUPP; goto err; } ret = kvm_hypercall2(KVM_HC_CLOCK_PAIRING, clock_pair_gpa, KVM_CLOCK_PAIRING_WALLCLOCK); if (ret == -KVM_ENOSYS) { - ret = -ENODEV; + ret = -EOPNOTSUPP; goto err; } -- 2.50.1 From de37faf41ac55619dd329229a9bd9698faeabc52 Mon Sep 17 00:00:00 2001 From: Michael Chan Date: Wed, 4 Dec 2024 13:59:17 -0800 Subject: [PATCH 12/16] bnxt_en: Fix GSO type for HW GRO packets on 5750X chips The existing code is using RSS profile to determine IPV4/IPV6 GSO type on all chips older than 5760X. This won't work on 5750X chips that may be using modified RSS profiles. This commit from 2018 has updated the driver to not use RSS profile for HW GRO packets on newer chips: 50f011b63d8c ("bnxt_en: Update RSS setup and GRO-HW logic according to the latest spec.") However, a recent commit to add support for the newest 5760X chip broke the logic. If the GRO packet needs to be re-segmented by the stack, the wrong GSO type will cause the packet to be dropped. Fix it to only use RSS profile to determine GSO type on the oldest 5730X/5740X chips which cannot use the new method and is safe to use the RSS profiles. Also fix the L3/L4 hash type for RX packets by not using the RSS profile for the same reason. Use the ITYPE field in the RX completion to determine L3/L4 hash types correctly. Fixes: a7445d69809f ("bnxt_en: Add support for new RX and TPA_START completion types for P7") Reviewed-by: Colin Winegarden Reviewed-by: Somnath Kotur Reviewed-by: Kalesh AP Signed-off-by: Michael Chan Link: https://patch.msgid.link/20241204215918.1692597-2-michael.chan@broadcom.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/broadcom/bnxt/bnxt.c | 14 ++++++-------- drivers/net/ethernet/broadcom/bnxt/bnxt.h | 3 +++ 2 files changed, 9 insertions(+), 8 deletions(-) diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c index 6b963086c1d3..454453bd1a5b 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c @@ -1531,7 +1531,7 @@ static void bnxt_tpa_start(struct bnxt *bp, struct bnxt_rx_ring_info *rxr, if (TPA_START_IS_IPV6(tpa_start1)) tpa_info->gso_type = SKB_GSO_TCPV6; /* RSS profiles 1 and 3 with extract code 0 for inner 4-tuple */ - else if (cmp_type == CMP_TYPE_RX_L2_TPA_START_CMP && + else if (!BNXT_CHIP_P4_PLUS(bp) && TPA_START_HASH_TYPE(tpa_start) == 3) tpa_info->gso_type = SKB_GSO_TCPV6; tpa_info->rss_hash = @@ -2226,15 +2226,13 @@ static int bnxt_rx_pkt(struct bnxt *bp, struct bnxt_cp_ring_info *cpr, if (cmp_type == CMP_TYPE_RX_L2_V3_CMP) { type = bnxt_rss_ext_op(bp, rxcmp); } else { - u32 hash_type = RX_CMP_HASH_TYPE(rxcmp); + u32 itypes = RX_CMP_ITYPES(rxcmp); - /* RSS profiles 1 and 3 with extract code 0 for inner - * 4-tuple - */ - if (hash_type != 1 && hash_type != 3) - type = PKT_HASH_TYPE_L3; - else + if (itypes == RX_CMP_FLAGS_ITYPE_TCP || + itypes == RX_CMP_FLAGS_ITYPE_UDP) type = PKT_HASH_TYPE_L4; + else + type = PKT_HASH_TYPE_L3; } skb_set_hash(skb, le32_to_cpu(rxcmp->rx_cmp_rss_hash), type); } diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.h b/drivers/net/ethernet/broadcom/bnxt/bnxt.h index 23f1aff214b4..c9f1cb7e3740 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.h +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.h @@ -267,6 +267,9 @@ struct rx_cmp { (((le32_to_cpu((rxcmp)->rx_cmp_misc_v1) & RX_CMP_RSS_HASH_TYPE) >>\ RX_CMP_RSS_HASH_TYPE_SHIFT) & RSS_PROFILE_ID_MASK) +#define RX_CMP_ITYPES(rxcmp) \ + (le32_to_cpu((rxcmp)->rx_cmp_len_flags_type) & RX_CMP_FLAGS_ITYPES_MASK) + #define RX_CMP_V3_HASH_TYPE_LEGACY(rxcmp) \ ((le32_to_cpu((rxcmp)->rx_cmp_misc_v1) & RX_CMP_V3_RSS_EXT_OP_LEGACY) >>\ RX_CMP_V3_RSS_EXT_OP_LEGACY_SHIFT) -- 2.50.1 From fab4b4d2c903443d605cdf256af2e80571d2d212 Mon Sep 17 00:00:00 2001 From: Hongguang Gao Date: Wed, 4 Dec 2024 13:59:18 -0800 Subject: [PATCH 13/16] bnxt_en: Fix potential crash when dumping FW log coredump If the FW log context memory is retained after FW reset, the existing code is not handling the condition correctly and zeroes out the data structures. This potentially will cause a division by zero crash when the user runs ethtool -w. The last_type is also not set correctly when the context memory is retained. This will cause errors because the last_type signals to the FW that all context memory types have been configured. Oops: divide error: 0000 1 PREEMPT SMP NOPTI CPU: 53 UID: 0 PID: 7019 Comm: ethtool Kdump: loaded Tainted: G OE 6.12.0-rc7+ #1 Tainted: [O]=OOT_MODULE, [E]=UNSIGNED_MODULE Hardware name: Supermicro SYS-621C-TN12R/X13DDW-A, BIOS 1.4 08/10/2023 RIP: 0010:__bnxt_copy_ctx_mem.constprop.0.isra.0+0x86/0x160 [bnxt_en] Code: 0a 31 d2 4c 89 6c 24 10 45 8b a5 fc df ff ff 4c 8b 74 24 20 31 db 66 89 44 24 06 48 63 c5 c1 e5 09 4c 0f af e0 48 8b 44 24 30 <49> f7 f4 4c 89 64 24 08 48 63 c5 4d 89 ec 31 ed 48 89 44 24 18 49 RSP: 0018:ff480591603d78b8 EFLAGS: 00010206 RAX: 0000000000100000 RBX: 0000000000000000 RCX: 0000000000000000 RDX: 0000000000000000 RSI: ff23959e46740000 RDI: 0000000000000000 RBP: 0000000000000000 R08: 0000000000100000 R09: ff23959e46740000 R10: ff480591603d7a18 R11: 0000000000000010 R12: 0000000000000000 R13: ff23959e46742008 R14: 0000000000000000 R15: 0000000000000000 FS: 00007f04227c1740(0000) GS:ff2395adbf680000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007f04225b33a5 CR3: 000000108b9a4001 CR4: 0000000000773ef0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe07f0 DR7: 0000000000000400 PKRU: 55555554 Call Trace: ? die+0x33/0x90 ? do_trap+0xd9/0x100 ? __bnxt_copy_ctx_mem.constprop.0.isra.0+0x86/0x160 [bnxt_en] ? do_error_trap+0x65/0x80 ? __bnxt_copy_ctx_mem.constprop.0.isra.0+0x86/0x160 [bnxt_en] ? exc_divide_error+0x36/0x50 ? __bnxt_copy_ctx_mem.constprop.0.isra.0+0x86/0x160 [bnxt_en] ? asm_exc_divide_error+0x16/0x20 ? __bnxt_copy_ctx_mem.constprop.0.isra.0+0x86/0x160 [bnxt_en] ? __bnxt_copy_ctx_mem.constprop.0.isra.0+0xda/0x160 [bnxt_en] bnxt_get_ctx_coredump.constprop.0+0x1ed/0x390 [bnxt_en] ? __memcg_slab_post_alloc_hook+0x21c/0x3c0 ? __bnxt_get_coredump+0x473/0x4b0 [bnxt_en] __bnxt_get_coredump+0x473/0x4b0 [bnxt_en] ? security_file_alloc+0x74/0xe0 ? cred_has_capability.isra.0+0x78/0x120 bnxt_get_coredump_length+0x4b/0xf0 [bnxt_en] bnxt_get_dump_flag+0x40/0x60 [bnxt_en] __dev_ethtool+0x17e4/0x1fc0 ? syscall_exit_to_user_mode+0xc/0x1d0 ? do_syscall_64+0x85/0x150 ? unmap_page_range+0x299/0x4b0 ? vma_interval_tree_remove+0x215/0x2c0 ? __kmalloc_cache_noprof+0x10a/0x300 dev_ethtool+0xa8/0x170 dev_ioctl+0x1b5/0x580 ? sk_ioctl+0x4a/0x110 sock_do_ioctl+0xab/0xf0 sock_ioctl+0x1ca/0x2e0 __x64_sys_ioctl+0x87/0xc0 do_syscall_64+0x79/0x150 Fixes: 24d694aec139 ("bnxt_en: Allocate backing store memory for FW trace logs") Signed-off-by: Hongguang Gao Signed-off-by: Michael Chan Link: https://patch.msgid.link/20241204215918.1692597-3-michael.chan@broadcom.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/broadcom/bnxt/bnxt.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c index 454453bd1a5b..b86f980fa7ea 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c @@ -8365,7 +8365,7 @@ static int bnxt_alloc_all_ctx_pg_info(struct bnxt *bp, int ctx_max) struct bnxt_ctx_mem_type *ctxm = &ctx->ctx_arr[type]; int n = 1; - if (!ctxm->max_entries) + if (!ctxm->max_entries || ctxm->pg_info) continue; if (ctxm->instance_bmap) @@ -8969,8 +8969,8 @@ static int bnxt_backing_store_cfg_v2(struct bnxt *bp, u32 ena) continue; } bnxt_bs_trace_init(bp, ctxm); - last_type = type; } + last_type = type; } if (last_type == BNXT_CTX_INV) { -- 2.50.1 From a6d75ecee2bf828ac6a1b52724aba0a977e4eaf4 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 4 Dec 2024 14:10:31 +0000 Subject: [PATCH 14/16] net: lapb: increase LAPB_HEADER_LEN It is unclear if net/lapb code is supposed to be ready for 8021q. We can at least avoid crashes like the following : skbuff: skb_under_panic: text:ffffffff8aabe1f6 len:24 put:20 head:ffff88802824a400 data:ffff88802824a3fe tail:0x16 end:0x140 dev:nr0.2 ------------[ cut here ]------------ kernel BUG at net/core/skbuff.c:206 ! Oops: invalid opcode: 0000 [#1] PREEMPT SMP KASAN PTI CPU: 1 UID: 0 PID: 5508 Comm: dhcpcd Not tainted 6.12.0-rc7-syzkaller-00144-g66418447d27b #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 10/30/2024 RIP: 0010:skb_panic net/core/skbuff.c:206 [inline] RIP: 0010:skb_under_panic+0x14b/0x150 net/core/skbuff.c:216 Code: 0d 8d 48 c7 c6 2e 9e 29 8e 48 8b 54 24 08 8b 0c 24 44 8b 44 24 04 4d 89 e9 50 41 54 41 57 41 56 e8 1a 6f 37 02 48 83 c4 20 90 <0f> 0b 0f 1f 00 90 90 90 90 90 90 90 90 90 90 90 90 90 90 90 90 f3 RSP: 0018:ffffc90002ddf638 EFLAGS: 00010282 RAX: 0000000000000086 RBX: dffffc0000000000 RCX: 7a24750e538ff600 RDX: 0000000000000000 RSI: 0000000000000201 RDI: 0000000000000000 RBP: ffff888034a86650 R08: ffffffff8174b13c R09: 1ffff920005bbe60 R10: dffffc0000000000 R11: fffff520005bbe61 R12: 0000000000000140 R13: ffff88802824a400 R14: ffff88802824a3fe R15: 0000000000000016 FS: 00007f2a5990d740(0000) GS:ffff8880b8700000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 000000110c2631fd CR3: 0000000029504000 CR4: 00000000003526f0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: skb_push+0xe5/0x100 net/core/skbuff.c:2636 nr_header+0x36/0x320 net/netrom/nr_dev.c:69 dev_hard_header include/linux/netdevice.h:3148 [inline] vlan_dev_hard_header+0x359/0x480 net/8021q/vlan_dev.c:83 dev_hard_header include/linux/netdevice.h:3148 [inline] lapbeth_data_transmit+0x1f6/0x2a0 drivers/net/wan/lapbether.c:257 lapb_data_transmit+0x91/0xb0 net/lapb/lapb_iface.c:447 lapb_transmit_buffer+0x168/0x1f0 net/lapb/lapb_out.c:149 lapb_establish_data_link+0x84/0xd0 lapb_device_event+0x4e0/0x670 notifier_call_chain+0x19f/0x3e0 kernel/notifier.c:93 __dev_notify_flags+0x207/0x400 dev_change_flags+0xf0/0x1a0 net/core/dev.c:8922 devinet_ioctl+0xa4e/0x1aa0 net/ipv4/devinet.c:1188 inet_ioctl+0x3d7/0x4f0 net/ipv4/af_inet.c:1003 sock_do_ioctl+0x158/0x460 net/socket.c:1227 sock_ioctl+0x626/0x8e0 net/socket.c:1346 vfs_ioctl fs/ioctl.c:51 [inline] __do_sys_ioctl fs/ioctl.c:907 [inline] __se_sys_ioctl+0xf9/0x170 fs/ioctl.c:893 do_syscall_x64 arch/x86/entry/common.c:52 [inline] do_syscall_64+0xf3/0x230 arch/x86/entry/common.c:83 Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Reported-by: syzbot+fb99d1b0c0f81d94a5e2@syzkaller.appspotmail.com Closes: https://lore.kernel.org/netdev/67506220.050a0220.17bd51.006c.GAE@google.com/T/#u Signed-off-by: Eric Dumazet Reviewed-by: Simon Horman Link: https://patch.msgid.link/20241204141031.4030267-1-edumazet@google.com Signed-off-by: Jakub Kicinski --- include/net/lapb.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/net/lapb.h b/include/net/lapb.h index 124ee122f2c8..6c07420644e4 100644 --- a/include/net/lapb.h +++ b/include/net/lapb.h @@ -4,7 +4,7 @@ #include #include -#define LAPB_HEADER_LEN 20 /* LAPB over Ethernet + a bit more */ +#define LAPB_HEADER_LEN MAX_HEADER /* LAPB over Ethernet + a bit more */ #define LAPB_ACK_PENDING_CONDITION 0x01 #define LAPB_REJECT_CONDITION 0x02 -- 2.50.1 From 0f6ede9fbc747e2553612271bce108f7517e7a45 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 4 Dec 2024 12:54:55 +0000 Subject: [PATCH 15/16] net: defer final 'struct net' free in netns dismantle Ilya reported a slab-use-after-free in dst_destroy [1] Issue is in xfrm6_net_init() and xfrm4_net_init() : They copy xfrm[46]_dst_ops_template into net->xfrm.xfrm[46]_dst_ops. But net structure might be freed before all the dst callbacks are called. So when dst_destroy() calls later : if (dst->ops->destroy) dst->ops->destroy(dst); dst->ops points to the old net->xfrm.xfrm[46]_dst_ops, which has been freed. See a relevant issue fixed in : ac888d58869b ("net: do not delay dst_entries_add() in dst_release()") A fix is to queue the 'struct net' to be freed after one another cleanup_net() round (and existing rcu_barrier()) [1] BUG: KASAN: slab-use-after-free in dst_destroy (net/core/dst.c:112) Read of size 8 at addr ffff8882137ccab0 by task swapper/37/0 Dec 03 05:46:18 kernel: CPU: 37 UID: 0 PID: 0 Comm: swapper/37 Kdump: loaded Not tainted 6.12.0 #67 Hardware name: Red Hat KVM/RHEL, BIOS 1.16.1-1.el9 04/01/2014 Call Trace: dump_stack_lvl (lib/dump_stack.c:124) print_address_description.constprop.0 (mm/kasan/report.c:378) ? dst_destroy (net/core/dst.c:112) print_report (mm/kasan/report.c:489) ? dst_destroy (net/core/dst.c:112) ? kasan_addr_to_slab (mm/kasan/common.c:37) kasan_report (mm/kasan/report.c:603) ? dst_destroy (net/core/dst.c:112) ? rcu_do_batch (kernel/rcu/tree.c:2567) dst_destroy (net/core/dst.c:112) rcu_do_batch (kernel/rcu/tree.c:2567) ? __pfx_rcu_do_batch (kernel/rcu/tree.c:2491) ? lockdep_hardirqs_on_prepare (kernel/locking/lockdep.c:4339 kernel/locking/lockdep.c:4406) rcu_core (kernel/rcu/tree.c:2825) handle_softirqs (kernel/softirq.c:554) __irq_exit_rcu (kernel/softirq.c:589 kernel/softirq.c:428 kernel/softirq.c:637) irq_exit_rcu (kernel/softirq.c:651) sysvec_apic_timer_interrupt (arch/x86/kernel/apic/apic.c:1049 arch/x86/kernel/apic/apic.c:1049) asm_sysvec_apic_timer_interrupt (./arch/x86/include/asm/idtentry.h:702) RIP: 0010:default_idle (./arch/x86/include/asm/irqflags.h:37 ./arch/x86/include/asm/irqflags.h:92 arch/x86/kernel/process.c:743) Code: 00 4d 29 c8 4c 01 c7 4c 29 c2 e9 6e ff ff ff 90 90 90 90 90 90 90 90 90 90 90 90 90 90 90 90 66 90 0f 00 2d c7 c9 27 00 fb f4 c3 cc cc cc cc 66 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 40 00 90 RSP: 0018:ffff888100d2fe00 EFLAGS: 00000246 RAX: 00000000001870ed RBX: 1ffff110201a5fc2 RCX: ffffffffb61a3e46 RDX: 0000000000000000 RSI: 0000000000000000 RDI: ffffffffb3d4d123 RBP: 0000000000000000 R08: 0000000000000001 R09: ffffed11c7e1835d R10: ffff888e3f0c1aeb R11: 0000000000000000 R12: 0000000000000000 R13: ffff888100d20000 R14: dffffc0000000000 R15: 0000000000000000 ? ct_kernel_exit.constprop.0 (kernel/context_tracking.c:148) ? cpuidle_idle_call (kernel/sched/idle.c:186) default_idle_call (./include/linux/cpuidle.h:143 kernel/sched/idle.c:118) cpuidle_idle_call (kernel/sched/idle.c:186) ? __pfx_cpuidle_idle_call (kernel/sched/idle.c:168) ? lock_release (kernel/locking/lockdep.c:467 kernel/locking/lockdep.c:5848) ? lockdep_hardirqs_on_prepare (kernel/locking/lockdep.c:4347 kernel/locking/lockdep.c:4406) ? tsc_verify_tsc_adjust (arch/x86/kernel/tsc_sync.c:59) do_idle (kernel/sched/idle.c:326) cpu_startup_entry (kernel/sched/idle.c:423 (discriminator 1)) start_secondary (arch/x86/kernel/smpboot.c:202 arch/x86/kernel/smpboot.c:282) ? __pfx_start_secondary (arch/x86/kernel/smpboot.c:232) ? soft_restart_cpu (arch/x86/kernel/head_64.S:452) common_startup_64 (arch/x86/kernel/head_64.S:414) Dec 03 05:46:18 kernel: Allocated by task 12184: kasan_save_stack (mm/kasan/common.c:48) kasan_save_track (./arch/x86/include/asm/current.h:49 mm/kasan/common.c:60 mm/kasan/common.c:69) __kasan_slab_alloc (mm/kasan/common.c:319 mm/kasan/common.c:345) kmem_cache_alloc_noprof (mm/slub.c:4085 mm/slub.c:4134 mm/slub.c:4141) copy_net_ns (net/core/net_namespace.c:421 net/core/net_namespace.c:480) create_new_namespaces (kernel/nsproxy.c:110) unshare_nsproxy_namespaces (kernel/nsproxy.c:228 (discriminator 4)) ksys_unshare (kernel/fork.c:3313) __x64_sys_unshare (kernel/fork.c:3382) do_syscall_64 (arch/x86/entry/common.c:52 arch/x86/entry/common.c:83) entry_SYSCALL_64_after_hwframe (arch/x86/entry/entry_64.S:130) Dec 03 05:46:18 kernel: Freed by task 11: kasan_save_stack (mm/kasan/common.c:48) kasan_save_track (./arch/x86/include/asm/current.h:49 mm/kasan/common.c:60 mm/kasan/common.c:69) kasan_save_free_info (mm/kasan/generic.c:582) __kasan_slab_free (mm/kasan/common.c:271) kmem_cache_free (mm/slub.c:4579 mm/slub.c:4681) cleanup_net (net/core/net_namespace.c:456 net/core/net_namespace.c:446 net/core/net_namespace.c:647) process_one_work (kernel/workqueue.c:3229) worker_thread (kernel/workqueue.c:3304 kernel/workqueue.c:3391) kthread (kernel/kthread.c:389) ret_from_fork (arch/x86/kernel/process.c:147) ret_from_fork_asm (arch/x86/entry/entry_64.S:257) Dec 03 05:46:18 kernel: Last potentially related work creation: kasan_save_stack (mm/kasan/common.c:48) __kasan_record_aux_stack (mm/kasan/generic.c:541) insert_work (./include/linux/instrumented.h:68 ./include/asm-generic/bitops/instrumented-non-atomic.h:141 kernel/workqueue.c:788 kernel/workqueue.c:795 kernel/workqueue.c:2186) __queue_work (kernel/workqueue.c:2340) queue_work_on (kernel/workqueue.c:2391) xfrm_policy_insert (net/xfrm/xfrm_policy.c:1610) xfrm_add_policy (net/xfrm/xfrm_user.c:2116) xfrm_user_rcv_msg (net/xfrm/xfrm_user.c:3321) netlink_rcv_skb (net/netlink/af_netlink.c:2536) xfrm_netlink_rcv (net/xfrm/xfrm_user.c:3344) netlink_unicast (net/netlink/af_netlink.c:1316 net/netlink/af_netlink.c:1342) netlink_sendmsg (net/netlink/af_netlink.c:1886) sock_write_iter (net/socket.c:729 net/socket.c:744 net/socket.c:1165) vfs_write (fs/read_write.c:590 fs/read_write.c:683) ksys_write (fs/read_write.c:736) do_syscall_64 (arch/x86/entry/common.c:52 arch/x86/entry/common.c:83) entry_SYSCALL_64_after_hwframe (arch/x86/entry/entry_64.S:130) Dec 03 05:46:18 kernel: Second to last potentially related work creation: kasan_save_stack (mm/kasan/common.c:48) __kasan_record_aux_stack (mm/kasan/generic.c:541) insert_work (./include/linux/instrumented.h:68 ./include/asm-generic/bitops/instrumented-non-atomic.h:141 kernel/workqueue.c:788 kernel/workqueue.c:795 kernel/workqueue.c:2186) __queue_work (kernel/workqueue.c:2340) queue_work_on (kernel/workqueue.c:2391) __xfrm_state_insert (./include/linux/workqueue.h:723 net/xfrm/xfrm_state.c:1150 net/xfrm/xfrm_state.c:1145 net/xfrm/xfrm_state.c:1513) xfrm_state_update (./include/linux/spinlock.h:396 net/xfrm/xfrm_state.c:1940) xfrm_add_sa (net/xfrm/xfrm_user.c:912) xfrm_user_rcv_msg (net/xfrm/xfrm_user.c:3321) netlink_rcv_skb (net/netlink/af_netlink.c:2536) xfrm_netlink_rcv (net/xfrm/xfrm_user.c:3344) netlink_unicast (net/netlink/af_netlink.c:1316 net/netlink/af_netlink.c:1342) netlink_sendmsg (net/netlink/af_netlink.c:1886) sock_write_iter (net/socket.c:729 net/socket.c:744 net/socket.c:1165) vfs_write (fs/read_write.c:590 fs/read_write.c:683) ksys_write (fs/read_write.c:736) do_syscall_64 (arch/x86/entry/common.c:52 arch/x86/entry/common.c:83) entry_SYSCALL_64_after_hwframe (arch/x86/entry/entry_64.S:130) Fixes: a8a572a6b5f2 ("xfrm: dst_entries_init() per-net dst_ops") Reported-by: Ilya Maximets Closes: https://lore.kernel.org/netdev/CANn89iKKYDVpB=MtmfH7nyv2p=rJWSLedO5k7wSZgtY_tO8WQg@mail.gmail.com/T/#m02c98c3009fe66382b73cfb4db9cf1df6fab3fbf Signed-off-by: Eric Dumazet Acked-by: Paolo Abeni Reviewed-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20241204125455.3871859-1-edumazet@google.com Signed-off-by: Jakub Kicinski --- include/net/net_namespace.h | 1 + net/core/net_namespace.c | 20 +++++++++++++++++++- 2 files changed, 20 insertions(+), 1 deletion(-) diff --git a/include/net/net_namespace.h b/include/net/net_namespace.h index f943bd79ef03..5a2a0df8ad91 100644 --- a/include/net/net_namespace.h +++ b/include/net/net_namespace.h @@ -80,6 +80,7 @@ struct net { * or to unregister pernet ops * (pernet_ops_rwsem write locked). */ + struct llist_node defer_free_list; struct llist_node cleanup_list; /* namespaces on death row */ #ifdef CONFIG_KEYS diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c index ae34ac818cda..b5cd3ae4f04c 100644 --- a/net/core/net_namespace.c +++ b/net/core/net_namespace.c @@ -449,6 +449,21 @@ out_free: goto out; } +static LLIST_HEAD(defer_free_list); + +static void net_complete_free(void) +{ + struct llist_node *kill_list; + struct net *net, *next; + + /* Get the list of namespaces to free from last round. */ + kill_list = llist_del_all(&defer_free_list); + + llist_for_each_entry_safe(net, next, kill_list, defer_free_list) + kmem_cache_free(net_cachep, net); + +} + static void net_free(struct net *net) { if (refcount_dec_and_test(&net->passive)) { @@ -457,7 +472,8 @@ static void net_free(struct net *net) /* There should not be any trackers left there. */ ref_tracker_dir_exit(&net->notrefcnt_tracker); - kmem_cache_free(net_cachep, net); + /* Wait for an extra rcu_barrier() before final free. */ + llist_add(&net->defer_free_list, &defer_free_list); } } @@ -642,6 +658,8 @@ static void cleanup_net(struct work_struct *work) */ rcu_barrier(); + net_complete_free(); + /* Finally it is safe to free my network namespace structure */ list_for_each_entry_safe(net, tmp, &net_exit_list, exit_list) { list_del_init(&net->exit_list); -- 2.50.1 From 4c49f38e20a57f8abaebdf95b369295b153d1f8e Mon Sep 17 00:00:00 2001 From: "Russell King (Oracle)" Date: Fri, 6 Dec 2024 12:40:11 +0000 Subject: [PATCH 16/16] net: stmmac: fix TSO DMA API usage causing oops Commit 66600fac7a98 ("net: stmmac: TSO: Fix unbalanced DMA map/unmap for non-paged SKB data") moved the assignment of tx_skbuff_dma[]'s members to be later in stmmac_tso_xmit(). The buf (dma cookie) and len stored in this structure are passed to dma_unmap_single() by stmmac_tx_clean(). The DMA API requires that the dma cookie passed to dma_unmap_single() is the same as the value returned from dma_map_single(). However, by moving the assignment later, this is not the case when priv->dma_cap.addr64 > 32 as "des" is offset by proto_hdr_len. This causes problems such as: dwc-eth-dwmac 2490000.ethernet eth0: Tx DMA map failed and with DMA_API_DEBUG enabled: DMA-API: dwc-eth-dwmac 2490000.ethernet: device driver tries to +free DMA memory it has not allocated [device address=0x000000ffffcf65c0] [size=66 bytes] Fix this by maintaining "des" as the original DMA cookie, and use tso_des to pass the offset DMA cookie to stmmac_tso_allocator(). Full details of the crashes can be found at: https://lore.kernel.org/all/d8112193-0386-4e14-b516-37c2d838171a@nvidia.com/ https://lore.kernel.org/all/klkzp5yn5kq5efgtrow6wbvnc46bcqfxs65nz3qy77ujr5turc@bwwhelz2l4dw/ Reported-by: Jon Hunter Reported-by: Thierry Reding Fixes: 66600fac7a98 ("net: stmmac: TSO: Fix unbalanced DMA map/unmap for non-paged SKB data") Tested-by: Jon Hunter Signed-off-by: Russell King (Oracle) Reviewed-by: Furong Xu <0x1207@gmail.com> Link: https://patch.msgid.link/E1tJXcx-006N4Z-PC@rmk-PC.armlinux.org.uk Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/stmicro/stmmac/stmmac_main.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c index 9b262cdad60b..c81ea8cdfe6e 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c @@ -4192,8 +4192,8 @@ static netdev_tx_t stmmac_tso_xmit(struct sk_buff *skb, struct net_device *dev) struct stmmac_txq_stats *txq_stats; struct stmmac_tx_queue *tx_q; u32 pay_len, mss, queue; + dma_addr_t tso_des, des; u8 proto_hdr_len, hdr; - dma_addr_t des; bool set_ic; int i; @@ -4289,14 +4289,15 @@ static netdev_tx_t stmmac_tso_xmit(struct sk_buff *skb, struct net_device *dev) /* If needed take extra descriptors to fill the remaining payload */ tmp_pay_len = pay_len - TSO_MAX_BUFF_SIZE; + tso_des = des; } else { stmmac_set_desc_addr(priv, first, des); tmp_pay_len = pay_len; - des += proto_hdr_len; + tso_des = des + proto_hdr_len; pay_len = 0; } - stmmac_tso_allocator(priv, des, tmp_pay_len, (nfrags == 0), queue); + stmmac_tso_allocator(priv, tso_des, tmp_pay_len, (nfrags == 0), queue); /* In case two or more DMA transmit descriptors are allocated for this * non-paged SKB data, the DMA buffer address should be saved to -- 2.50.1