From 4695f64e028dd1407d077565fbdb30ddb364180a Mon Sep 17 00:00:00 2001
From: Michal Luczaj
Date: Tue, 28 Jan 2025 14:15:32 +0100
Subject: [PATCH 01/16] vsock/test: Add test for connect() retries

Deliberately fail a connect() attempt; expect error. Then verify that a
subsequent attempt (using the same socket) can still succeed, rather
than fail outright.

Reviewed-by: Stefano Garzarella
Reviewed-by: Luigi Leonardi
Signed-off-by: Michal Luczaj
Link: https://patch.msgid.link/20250128-vsock-transport-vs-autobind-v3-6-1cf57065b770@rbox.co
Signed-off-by: Jakub Kicinski
---
 tools/testing/vsock/vsock_test.c | 47 ++++++++++++++++++++++++++++++++
 1 file changed, 47 insertions(+)

diff --git a/tools/testing/vsock/vsock_test.c b/tools/testing/vsock/vsock_test.c
index 92cfd92bbfdc..dfff8b288265 100644
--- a/tools/testing/vsock/vsock_test.c
+++ b/tools/testing/vsock/vsock_test.c
@@ -1746,6 +1746,48 @@ static void test_stream_transport_uaf_server(const struct test_opts *opts)
 	control_expectln("DONE");
 }
 
+static void test_stream_connect_retry_client(const struct test_opts *opts)
+{
+	int fd;
+
+	fd = socket(AF_VSOCK, SOCK_STREAM, 0);
+	if (fd < 0) {
+		perror("socket");
+		exit(EXIT_FAILURE);
+	}
+
+	if (!vsock_connect_fd(fd, opts->peer_cid, opts->peer_port)) {
+		fprintf(stderr, "Unexpected connect() #1 success\n");
+		exit(EXIT_FAILURE);
+	}
+
+	control_writeln("LISTEN");
+	control_expectln("LISTENING");
+
+	if (vsock_connect_fd(fd, opts->peer_cid, opts->peer_port)) {
+		perror("connect() #2");
+		exit(EXIT_FAILURE);
+	}
+
+	close(fd);
+}
+
+static void test_stream_connect_retry_server(const struct test_opts *opts)
+{
+	int fd;
+
+	control_expectln("LISTEN");
+
+	fd = vsock_stream_accept(VMADDR_CID_ANY, opts->peer_port, NULL);
+	if (fd < 0) {
+		perror("accept");
+		exit(EXIT_FAILURE);
+	}
+
+	vsock_wait_remote_close(fd);
+	close(fd);
+}
+
 static struct test_case test_cases[] = {
 	{
 		.name = "SOCK_STREAM connection reset",
@@ -1896,6 +1938,11 @@ static struct test_case test_cases[] = {
 		.run_client = test_stream_transport_uaf_client,
 		.run_server = test_stream_transport_uaf_server,
 	},
+	{
+		.name = "SOCK_STREAM retry failed connect()",
+		.run_client = test_stream_connect_retry_client,
+		.run_server = test_stream_connect_retry_server,
+	},
 	{},
 };
--
2.51.0

From 752e5fcc2e77358936d36ef8e522d6439372e201 Mon Sep 17 00:00:00 2001
From: =?utf8?q?Rafa=C5=82=20Mi=C5=82ecki?=
Date: Mon, 27 Jan 2025 09:51:59 -0800
Subject: [PATCH 02/16] bgmac: reduce max frame size to support just MTU 1500
MIME-Version: 1.0
Content-Type: text/plain; charset=utf8
Content-Transfer-Encoding: 8bit

bgmac allocates a new replacement buffer before handling each received
frame. Allocating & DMA-preparing 9724 B each time consumes a lot of CPU
time. Ideally bgmac should just respect the currently set MTU, but that
isn't the case right now. For now just revert back to the old limited
frame size.

This change bumps NAT masquerade speed by ~95%.

Since commit 8218f62c9c9b ("mm: page_frag: use initial zero offset for
page_frag_alloc_align()"), the bgmac driver fails to open its network
interface successfully and runs out of memory in the following call
stack:

bgmac_open
 -> bgmac_dma_init
  -> bgmac_dma_rx_skb_for_slot
   -> netdev_alloc_frag

BGMAC_RX_ALLOC_SIZE = 10048 and PAGE_FRAG_CACHE_MAX_SIZE = 32768.

Eventually we land into __page_frag_alloc_align() with the following
parameters across multiple successive calls:

__page_frag_alloc_align: fragsz=10048, align_mask=-1, size=32768, offset=0
__page_frag_alloc_align: fragsz=10048, align_mask=-1, size=32768, offset=10048
__page_frag_alloc_align: fragsz=10048, align_mask=-1, size=32768, offset=20096
__page_frag_alloc_align: fragsz=10048, align_mask=-1, size=32768, offset=30144

So in that case we do indeed have offset + fragsz (40192) > size (32768)
and so we would eventually return NULL.

Reverting to the older 1500-byte MTU allows the network driver to be
usable again.

Fixes: 8c7da63978f1 ("bgmac: configure MTU and add support for frames beyond 8192 byte size")
Signed-off-by: Rafał Miłecki
[florian: expand commit message about recent commits]
Reviewed-by: Simon Horman
Signed-off-by: Florian Fainelli
Link: https://patch.msgid.link/20250127175159.1788246-1-florian.fainelli@broadcom.com
Signed-off-by: Jakub Kicinski
---
 drivers/net/ethernet/broadcom/bgmac.h | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/broadcom/bgmac.h b/drivers/net/ethernet/broadcom/bgmac.h
index d73ef262991d..6fee9a41839c 100644
--- a/drivers/net/ethernet/broadcom/bgmac.h
+++ b/drivers/net/ethernet/broadcom/bgmac.h
@@ -328,8 +328,7 @@
 #define BGMAC_RX_FRAME_OFFSET	30	/* There are 2 unused bytes between header and real data */
 #define BGMAC_RX_BUF_OFFSET	(NET_SKB_PAD + NET_IP_ALIGN - \
				 BGMAC_RX_FRAME_OFFSET)
-/* Jumbo frame size with FCS */
-#define BGMAC_RX_MAX_FRAME_SIZE	9724
+#define BGMAC_RX_MAX_FRAME_SIZE	1536
 #define BGMAC_RX_BUF_SIZE	(BGMAC_RX_FRAME_OFFSET + BGMAC_RX_MAX_FRAME_SIZE)
 #define BGMAC_RX_ALLOC_SIZE	(SKB_DATA_ALIGN(BGMAC_RX_BUF_SIZE + BGMAC_RX_BUF_OFFSET) + \
				 SKB_DATA_ALIGN(sizeof(struct skb_shared_info)))
--
2.51.0

From 8c670bdfa58e48abad1d5b6ca1ee843ca91f7303 Mon Sep 17 00:00:00 2001
From: Jon Maloy
Date: Mon, 27 Jan 2025 18:13:04 -0500
Subject: [PATCH 03/16] tcp: correct handling of extreme memory squeeze
MIME-Version: 1.0
Content-Type: text/plain; charset=utf8
Content-Transfer-Encoding: 8bit

Testing with iperf3 using the "pasta" protocol splicer has revealed
a problem in the way tcp handles window advertising in extreme memory
squeeze situations.

Under memory pressure, a socket endpoint may temporarily advertise
a zero-sized window, but this is not stored as part of the socket data.
The reasoning behind this is that it is considered a temporary setting
which shouldn't influence any further calculations.

However, if we happen to stall at an unfortunate value of the current
window size, the algorithm selecting a new value will consistently fail
to advertise a non-zero window once we have freed up enough memory.
This means that this side's notion of the current window size is
different from the one last advertised to the peer, causing the latter
to not send any data to resolve the situation.

The problem occurs on the iperf3 server side, and the socket in question
is a completely regular socket with the default settings for the
fedora40 kernel. We do not use SO_PEEK or SO_RCVBUF on the socket.
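
To make the failure mode concrete, here is a simplified sketch (not the
exact kernel code) of the receive-side ACK decision in
__tcp_cleanup_rbuf(); win_now is derived from tp->rcv_wup and
tp->rcv_wnd, the two fields that are never updated after the zero-window
advertisement:

	/* Simplified sketch of the window-update decision: */
	u32 win_now = tcp_receive_window(tp);	/* rcv_wup + rcv_wnd - rcv_nxt */
	u32 new_win = __tcp_select_window(sk);

	if (new_win && new_win >= 2 * win_now)
		time_to_ack = true;	/* send the window update */

With the numbers from the log below, win_now stays at
265469200 + 262144 - 265600160 = 131184, so new_win (262144) never
reaches 2 * win_now (262368) and no window update is ever sent.
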
The following excerpt of a logging session, with our own comments added,
shows in more detail what is happening:

//                      tcp_v4_rcv(->)
//                      tcp_rcv_established(->)
[5201<->39222]: ==== Activating log @ net/ipv4/tcp_input.c/tcp_data_queue()/5257 ====
[5201<->39222]: tcp_data_queue(->)
[5201<->39222]: DROPPING skb [265600160..265665640], reason: SKB_DROP_REASON_PROTO_MEM
                [rcv_nxt 265600160, rcv_wnd 262144, snt_ack 265469200, win_now 131184]
                [copied_seq 259909392->260034360 (124968), unread 5565800, qlen 85, ofoq 0]
                [OFO queue: gap: 65480, len: 0]
[5201<->39222]: tcp_data_queue(<-)
[5201<->39222]: __tcp_transmit_skb(->)
                [tp->rcv_wup: 265469200, tp->rcv_wnd: 262144, tp->rcv_nxt 265600160]
[5201<->39222]: tcp_select_window(->)
[5201<->39222]: (inet_csk(sk)->icsk_ack.pending & ICSK_ACK_NOMEM) ? --> TRUE
                [tp->rcv_wup: 265469200, tp->rcv_wnd: 262144, tp->rcv_nxt 265600160]
                returning 0
[5201<->39222]: tcp_select_window(<-)
[5201<->39222]: ADVERTISING WIN 0, ACK_SEQ: 265600160
[5201<->39222]: [__tcp_transmit_skb(<-)
[5201<->39222]: tcp_rcv_established(<-)
[5201<->39222]: tcp_v4_rcv(<-)

// Receive queue is at 85 buffers and we are out of memory.
// We drop the incoming buffer, although it is in sequence, and decide
// to send an advertisement with a window of zero.
// We don't update tp->rcv_wnd and tp->rcv_wup accordingly, which means
// we unconditionally shrink the window.

[5201<->39222]: tcp_recvmsg_locked(->)
[5201<->39222]: __tcp_cleanup_rbuf(->) tp->rcv_wup: 265469200, tp->rcv_wnd: 262144, tp->rcv_nxt 265600160
[5201<->39222]: [new_win = 0, win_now = 131184, 2 * win_now = 262368]
[5201<->39222]: [new_win >= (2 * win_now) ? --> time_to_ack = 0]
[5201<->39222]: NOT calling tcp_send_ack()
                [tp->rcv_wup: 265469200, tp->rcv_wnd: 262144, tp->rcv_nxt 265600160]
[5201<->39222]: __tcp_cleanup_rbuf(<-)
                [rcv_nxt 265600160, rcv_wnd 262144, snt_ack 265469200, win_now 131184]
                [copied_seq 260040464->260040464 (0), unread 5559696, qlen 85, ofoq 0]
                returning 6104 bytes
[5201<->39222]: tcp_recvmsg_locked(<-)

// After each read, the algorithm for calculating the new receive
// window in __tcp_cleanup_rbuf() finds it is too small to advertise
// or to update tp->rcv_wnd.
// Meanwhile, the peer thinks the window is zero, and will not send
// any more data to trigger an update from the interrupt mode side.

[5201<->39222]: tcp_recvmsg_locked(->)
[5201<->39222]: __tcp_cleanup_rbuf(->) tp->rcv_wup: 265469200, tp->rcv_wnd: 262144, tp->rcv_nxt 265600160
[5201<->39222]: [new_win = 262144, win_now = 131184, 2 * win_now = 262368]
[5201<->39222]: [new_win >= (2 * win_now) ? --> time_to_ack = 0]
[5201<->39222]: NOT calling tcp_send_ack()
                [tp->rcv_wup: 265469200, tp->rcv_wnd: 262144, tp->rcv_nxt 265600160]
[5201<->39222]: __tcp_cleanup_rbuf(<-)
                [rcv_nxt 265600160, rcv_wnd 262144, snt_ack 265469200, win_now 131184]
                [copied_seq 260099840->260171536 (71696), unread 5428624, qlen 83, ofoq 0]
                returning 131072 bytes
[5201<->39222]: tcp_recvmsg_locked(<-)

// The above pattern repeats again and again, since nothing changes
// between the reads.

[...]

[5201<->39222]: tcp_recvmsg_locked(->)
[5201<->39222]: __tcp_cleanup_rbuf(->) tp->rcv_wup: 265469200, tp->rcv_wnd: 262144, tp->rcv_nxt 265600160
[5201<->39222]: [new_win = 262144, win_now = 131184, 2 * win_now = 262368]
[5201<->39222]: [new_win >= (2 * win_now) ? --> time_to_ack = 0]
[5201<->39222]: NOT calling tcp_send_ack()
                [tp->rcv_wup: 265469200, tp->rcv_wnd: 262144, tp->rcv_nxt 265600160]
[5201<->39222]: __tcp_cleanup_rbuf(<-)
                [rcv_nxt 265600160, rcv_wnd 262144, snt_ack 265469200, win_now 131184]
                [copied_seq 265600160->265600160 (0), unread 0, qlen 0, ofoq 0]
                returning 54672 bytes
[5201<->39222]: tcp_recvmsg_locked(<-)

// The receive queue is empty, but no new advertisement has been sent.
// The peer still thinks the receive window is zero, and sends nothing.
// We have ended up in a deadlock situation.

Note that well-behaved endpoints will send win0 probes, so the problem
will not occur.

Furthermore, we have observed that in these situations this side may
send out an updated 'th->ack_seq' which is not stored in tp->rcv_wup
as it should be. Backing ack_seq seems to be harmless, but is of course
still wrong from a protocol viewpoint.

We fix this by updating the socket state correctly when a packet has
been dropped because of memory exhaustion and we have to advertise a
zero window.

Further testing shows that the connection recovers neatly from the
squeeze situation, and traffic can continue indefinitely.

Fixes: e2142825c120 ("net: tcp: send zero-window ACK when no memory")
Cc: Menglong Dong
Reviewed-by: Stefano Brivio
Signed-off-by: Jon Maloy
Reviewed-by: Jason Xing
Reviewed-by: Eric Dumazet
Reviewed-by: Neal Cardwell
Link: https://patch.msgid.link/20250127231304.1465565-1-jmaloy@redhat.com
Signed-off-by: Jakub Kicinski
---
 net/ipv4/tcp_output.c | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 0e5b9a654254..bc95d2a5924f 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -265,11 +265,14 @@ static u16 tcp_select_window(struct sock *sk)
 	u32 cur_win, new_win;
 
 	/* Make the window 0 if we failed to queue the data because we
-	 * are out of memory. The window is temporary, so we don't store
-	 * it on the socket.
+	 * are out of memory.
 	 */
-	if (unlikely(inet_csk(sk)->icsk_ack.pending & ICSK_ACK_NOMEM))
+	if (unlikely(inet_csk(sk)->icsk_ack.pending & ICSK_ACK_NOMEM)) {
+		tp->pred_flags = 0;
+		tp->rcv_wnd = 0;
+		tp->rcv_wup = tp->rcv_nxt;
 		return 0;
+	}
 
 	cur_win = tcp_receive_window(tp);
 	new_win = __tcp_select_window(sk);
--
2.51.0

From 3595599fa8360bb3c7afa7ee50c810b4a64106ea Mon Sep 17 00:00:00 2001
From: =?utf8?q?Toke=20H=C3=B8iland-J=C3=B8rgensen?=
Date: Mon, 27 Jan 2025 14:13:42 +0100
Subject: [PATCH 04/16] net: xdp: Disallow attaching device-bound programs in generic mode
MIME-Version: 1.0
Content-Type: text/plain; charset=utf8
Content-Transfer-Encoding: 8bit

Device-bound programs are used to support RX metadata kfuncs. These
kfuncs are driver-specific and rely on the driver context to read the
metadata. This means they can't work in generic XDP mode. However,
there is no check to disallow such programs from being attached in
generic mode, in which case the metadata kfuncs will be called in an
invalid context, leading to crashes.

Fix this by adding a check to disallow attaching device-bound programs
in generic mode.
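
For illustration, a minimal reproducer sketch (the interface name and
object file are placeholders; bpftool's xdpmeta_dev mode creates the
device-bound program, cf. the selftest added in the next patch):

	# load a program bound to eth0 for RX metadata kfuncs
	bpftool prog load xdp_meta.bpf.o /sys/fs/bpf/devbound type xdp \
		xdpmeta_dev eth0
	# attaching it in generic (skb) mode is now rejected
	ip link set dev eth0 xdpgeneric pinned /sys/fs/bpf/devbound
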
Fixes: 2b3486bc2d23 ("bpf: Introduce device-bound XDP programs")
Reported-by: Marcus Wichelmann
Closes: https://lore.kernel.org/r/dae862ec-43b5-41a0-8edf-46c59071cdda@hetzner-cloud.de
Tested-by: Marcus Wichelmann
Acked-by: Stanislav Fomichev
Signed-off-by: Toke Høiland-Jørgensen
Acked-by: Daniel Borkmann
Acked-by: Martin KaFai Lau
Link: https://patch.msgid.link/20250127131344.238147-1-toke@redhat.com
Signed-off-by: Jakub Kicinski
---
 net/core/dev.c | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/net/core/dev.c b/net/core/dev.c
index 07b2bb1ce64f..02cd75b512f9 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -9924,6 +9924,10 @@ static int dev_xdp_attach(struct net_device *dev, struct netlink_ext_ack *extack
 		NL_SET_ERR_MSG(extack, "Program bound to different device");
 		return -EINVAL;
 	}
+	if (bpf_prog_is_dev_bound(new_prog->aux) && mode == XDP_MODE_SKB) {
+		NL_SET_ERR_MSG(extack, "Can't attach device-bound programs in generic mode");
+		return -EINVAL;
+	}
 	if (new_prog->expected_attach_type == BPF_XDP_DEVMAP) {
 		NL_SET_ERR_MSG(extack, "BPF_XDP_DEVMAP programs can not be attached to a device");
 		return -EINVAL;
--
2.51.0

From f7bf624b1fedf232195804ac0f6584cb3e4b86bc Mon Sep 17 00:00:00 2001
From: =?utf8?q?Toke=20H=C3=B8iland-J=C3=B8rgensen?=
Date: Mon, 27 Jan 2025 14:13:43 +0100
Subject: [PATCH 05/16] selftests/net: Add test for loading devbound XDP program in generic mode
MIME-Version: 1.0
Content-Type: text/plain; charset=utf8
Content-Transfer-Encoding: 8bit

Add a test to bpf_offload.py for loading a devbound XDP program in
generic mode, checking that it fails correctly.

Signed-off-by: Toke Høiland-Jørgensen
Acked-by: Stanislav Fomichev
Link: https://patch.msgid.link/20250127131344.238147-2-toke@redhat.com
Signed-off-by: Jakub Kicinski
---
 tools/testing/selftests/net/bpf_offload.py | 14 +++++++++++++-
 1 file changed, 13 insertions(+), 1 deletion(-)

diff --git a/tools/testing/selftests/net/bpf_offload.py b/tools/testing/selftests/net/bpf_offload.py
index d10f420e4ef6..fd0d959914e4 100755
--- a/tools/testing/selftests/net/bpf_offload.py
+++ b/tools/testing/selftests/net/bpf_offload.py
@@ -215,12 +215,14 @@ def bpftool_map_list_wait(expected=0, n_retry=20, ns=""):
     raise Exception("Time out waiting for map counts to stabilize want %d, have %d" % (expected, nmaps))
 
 def bpftool_prog_load(sample, file_name, maps=[], prog_type="xdp", dev=None,
-                      fail=True, include_stderr=False):
+                      fail=True, include_stderr=False, dev_bind=None):
     args = "prog load %s %s" % (os.path.join(bpf_test_dir, sample), file_name)
     if prog_type is not None:
         args += " type " + prog_type
     if dev is not None:
         args += " dev " + dev
+    elif dev_bind is not None:
+        args += " xdpmeta_dev " + dev_bind
     if len(maps):
         args += " map " + " map ".join(maps)
 
@@ -980,6 +982,16 @@ try:
     rm("/sys/fs/bpf/offload")
     sim.wait_for_flush()
 
+    bpftool_prog_load("sample_ret0.bpf.o", "/sys/fs/bpf/devbound",
+                      dev_bind=sim['ifname'])
+    devbound = bpf_pinned("/sys/fs/bpf/devbound")
+    start_test("Test dev-bound program in generic mode...")
+    ret, _, err = sim.set_xdp(devbound, "generic", fail=False, include_stderr=True)
+    fail(ret == 0, "devbound program in generic mode allowed")
+    check_extack(err, "Can't attach device-bound programs in generic mode.", args)
+    rm("/sys/fs/bpf/devbound")
+    sim.wait_for_flush()
+
     start_test("Test XDP load failure...")
     sim.dfs["dev/bpf_bind_verifier_accept"] = 0
     ret, _, err = bpftool_prog_load("sample_ret0.bpf.o", "/sys/fs/bpf/offload",
--
2.51.0

From 2c2ebb2b49573e5f8726112ad06b1dffc3c9ea03 Mon Sep 17 00:00:00 2001
From: Kory Maincent
Date: Wed, 29 Jan 2025 10:50:46 +0100
Subject: [PATCH 06/16] net: ravb: Fix missing rtnl lock in suspend/resume path
MIME-Version: 1.0
Content-Type: text/plain; charset=utf8
Content-Transfer-Encoding: 8bit

Fix the suspend/resume path by ensuring the rtnl lock is held where
required. Calls to ravb_open, ravb_close and wol operations must be
performed under the rtnl lock to prevent conflicts with ongoing ndo
operations.

Without this fix, the following warning is triggered:

[ 39.032969] =============================
[ 39.032983] WARNING: suspicious RCU usage
[ 39.033019] -----------------------------
[ 39.033033] drivers/net/phy/phy_device.c:2004 suspicious rcu_dereference_protected() usage!
...
[ 39.033597] stack backtrace:
[ 39.033613] CPU: 0 UID: 0 PID: 174 Comm: python3 Not tainted 6.13.0-rc7-next-20250116-arm64-renesas-00002-g35245dfdc62c #7
[ 39.033623] Hardware name: Renesas SMARC EVK version 2 based on r9a08g045s33 (DT)
[ 39.033628] Call trace:
[ 39.033633]  show_stack+0x14/0x1c (C)
[ 39.033652]  dump_stack_lvl+0xb4/0xc4
[ 39.033664]  dump_stack+0x14/0x1c
[ 39.033671]  lockdep_rcu_suspicious+0x16c/0x22c
[ 39.033682]  phy_detach+0x160/0x190
[ 39.033694]  phy_disconnect+0x40/0x54
[ 39.033703]  ravb_close+0x6c/0x1cc
[ 39.033714]  ravb_suspend+0x48/0x120
[ 39.033721]  dpm_run_callback+0x4c/0x14c
[ 39.033731]  device_suspend+0x11c/0x4dc
[ 39.033740]  dpm_suspend+0xdc/0x214
[ 39.033748]  dpm_suspend_start+0x48/0x60
[ 39.033758]  suspend_devices_and_enter+0x124/0x574
[ 39.033769]  pm_suspend+0x1ac/0x274
[ 39.033778]  state_store+0x88/0x124
[ 39.033788]  kobj_attr_store+0x14/0x24
[ 39.033798]  sysfs_kf_write+0x48/0x6c
[ 39.033808]  kernfs_fop_write_iter+0x118/0x1a8
[ 39.033817]  vfs_write+0x27c/0x378
[ 39.033825]  ksys_write+0x64/0xf4
[ 39.033833]  __arm64_sys_write+0x18/0x20
[ 39.033841]  invoke_syscall+0x44/0x104
[ 39.033852]  el0_svc_common.constprop.0+0xb4/0xd4
[ 39.033862]  do_el0_svc+0x18/0x20
[ 39.033870]  el0_svc+0x3c/0xf0
[ 39.033880]  el0t_64_sync_handler+0xc0/0xc4
[ 39.033888]  el0t_64_sync+0x154/0x158
[ 39.041274] ravb 11c30000.ethernet eth0: Link is Down

Reported-by: Claudiu Beznea
Closes: https://lore.kernel.org/netdev/4c6419d8-c06b-495c-b987-d66c2e1ff848@tuxon.dev/
Fixes: 0184165b2f42 ("ravb: add sleep PM suspend/resume support")
Signed-off-by: Kory Maincent
Tested-by: Niklas Söderlund
Signed-off-by: Paolo Abeni
---
 drivers/net/ethernet/renesas/ravb_main.c | 22 ++++++++++++++--------
 1 file changed, 14 insertions(+), 8 deletions(-)

diff --git a/drivers/net/ethernet/renesas/ravb_main.c b/drivers/net/ethernet/renesas/ravb_main.c
index bc395294a32d..c9f4976a3527 100644
--- a/drivers/net/ethernet/renesas/ravb_main.c
+++ b/drivers/net/ethernet/renesas/ravb_main.c
@@ -3217,10 +3217,15 @@ static int ravb_suspend(struct device *dev)
 
 	netif_device_detach(ndev);
 
-	if (priv->wol_enabled)
-		return ravb_wol_setup(ndev);
+	rtnl_lock();
+	if (priv->wol_enabled) {
+		ret = ravb_wol_setup(ndev);
+		rtnl_unlock();
+		return ret;
+	}
 
 	ret = ravb_close(ndev);
+	rtnl_unlock();
 	if (ret)
 		return ret;
 
@@ -3245,19 +3250,20 @@ static int ravb_resume(struct device *dev)
 	if (!netif_running(ndev))
 		return 0;
 
+	rtnl_lock();
 	/* If WoL is enabled restore the interface. */
-	if (priv->wol_enabled) {
+	if (priv->wol_enabled)
 		ret = ravb_wol_restore(ndev);
-		if (ret)
-			return ret;
-	} else {
+	else
 		ret = pm_runtime_force_resume(dev);
-		if (ret)
-			return ret;
+	if (ret) {
+		rtnl_unlock();
+		return ret;
 	}
 
 	/* Reopening the interface will restore the device to the working state. */
 	ret = ravb_open(ndev);
+	rtnl_unlock();
 	if (ret < 0)
 		goto out_rpm_put;
--
2.51.0

From b95102215a8d0987789715ce11c0d4ec031cbfbe Mon Sep 17 00:00:00 2001
From: Kory Maincent
Date: Wed, 29 Jan 2025 10:50:47 +0100
Subject: [PATCH 07/16] net: sh_eth: Fix missing rtnl lock in suspend/resume path
MIME-Version: 1.0
Content-Type: text/plain; charset=utf8
Content-Transfer-Encoding: 8bit

Fix the suspend/resume path by ensuring the rtnl lock is held where
required. Calls to sh_eth_close, sh_eth_open and wol operations must
be performed under the rtnl lock to prevent conflicts with ongoing ndo
operations.

Fixes: b71af04676e9 ("sh_eth: add more PM methods")
Tested-by: Niklas Söderlund
Reviewed-by: Sergey Shtylyov
Signed-off-by: Kory Maincent
Signed-off-by: Paolo Abeni
---
 drivers/net/ethernet/renesas/sh_eth.c | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/drivers/net/ethernet/renesas/sh_eth.c b/drivers/net/ethernet/renesas/sh_eth.c
index 8887b8921009..5fc8027c92c7 100644
--- a/drivers/net/ethernet/renesas/sh_eth.c
+++ b/drivers/net/ethernet/renesas/sh_eth.c
@@ -3494,10 +3494,12 @@ static int sh_eth_suspend(struct device *dev)
 
 	netif_device_detach(ndev);
 
+	rtnl_lock();
 	if (mdp->wol_enabled)
 		ret = sh_eth_wol_setup(ndev);
 	else
 		ret = sh_eth_close(ndev);
+	rtnl_unlock();
 
 	return ret;
 }
@@ -3511,10 +3513,12 @@ static int sh_eth_resume(struct device *dev)
 	if (!netif_running(ndev))
 		return 0;
 
+	rtnl_lock();
 	if (mdp->wol_enabled)
 		ret = sh_eth_wol_restore(ndev);
 	else
 		ret = sh_eth_open(ndev);
+	rtnl_unlock();
 
 	if (ret < 0)
 		return ret;
--
2.51.0

From 1b9335a8000fb70742f7db10af314104b6ace220 Mon Sep 17 00:00:00 2001
From: Pablo Neira Ayuso
Date: Tue, 28 Jan 2025 12:26:33 +0100
Subject: [PATCH 08/16] netfilter: nf_tables: reject mismatching sum of field_len with set key length

The field length description provides the length of each separate key
field in the concatenation; each field gets rounded up to 32 bits to
calculate the pipapo rule width from pipapo_init(). The set key length
provides the total size of the key, aligned to 32 bits.

Register-based arithmetic still allows for combining a mismatching set
key length and field length description, e.g. set key length 10 and
field description [ 5, 4 ], leading to a pipapo width of 12.
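
Spelling out the arithmetic of that example (a sketch, all lengths in
bytes):

	/* field_len = { 5, 4 }, klen = 10
	 *
	 * old check, in 32-bit registers:
	 *   num_regs     = DIV_ROUND_UP(5, 4) + DIV_ROUND_UP(4, 4) = 3
	 *   key_num_regs = DIV_ROUND_UP(10, 4)                     = 3
	 *   3 == 3, so the mismatch was accepted
	 *
	 * new check, in rounded-up bytes:
	 *   len = round_up(5, 4) + round_up(4, 4) = 12
	 *   12 != klen (10), so it is now rejected with -EINVAL
	 */
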
Cc: stable@vger.kernel.org
Fixes: 3ce67e3793f4 ("netfilter: nf_tables: do not allow mismatch field size and set key length")
Reported-by: Noam Rathaus
Reviewed-by: Florian Westphal
Signed-off-by: Pablo Neira Ayuso
---
 net/netfilter/nf_tables_api.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
index c4af283356e7..e5662dc087c8 100644
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -5065,7 +5065,7 @@ static int nft_set_desc_concat_parse(const struct nlattr *attr,
 static int nft_set_desc_concat(struct nft_set_desc *desc,
 			       const struct nlattr *nla)
 {
-	u32 num_regs = 0, key_num_regs = 0;
+	u32 len = 0, num_regs;
 	struct nlattr *attr;
 	int rem, err, i;
 
@@ -5079,12 +5079,12 @@ static int nft_set_desc_concat(struct nft_set_desc *desc,
 	}
 
 	for (i = 0; i < desc->field_count; i++)
-		num_regs += DIV_ROUND_UP(desc->field_len[i], sizeof(u32));
+		len += round_up(desc->field_len[i], sizeof(u32));
 
-	key_num_regs = DIV_ROUND_UP(desc->klen, sizeof(u32));
-	if (key_num_regs != num_regs)
+	if (len != desc->klen)
 		return -EINVAL;
 
+	num_regs = DIV_ROUND_UP(desc->klen, sizeof(u32));
 	if (num_regs > NFT_REG32_COUNT)
 		return -E2BIG;
--
2.51.0

From e598d8981fd34470b78a1ae777dbf131b15d5bf2 Mon Sep 17 00:00:00 2001
From: "Matthieu Baerts (NGI0)"
Date: Wed, 29 Jan 2025 13:24:32 +0100
Subject: [PATCH 09/16] mptcp: blackhole only if 1st SYN retrans w/o MPC is accepted

The Fixes commit mentioned this:

> An MPTCP firewall blackhole can be detected if the following SYN
> retransmission after a fallback to "plain" TCP is accepted.

But in fact, this blackhole was detected if any following SYN
retransmission after a fallback to TCP was accepted.

That's because 'mptcp_subflow_early_fallback()' will set 'request_mptcp'
to 0, and 'mpc_drop' will never be reset to 0 afterwards.

This is an issue, because some not-so-unusual situations might cause the
kernel to detect a false-positive blackhole, e.g. a client trying to
connect to a server while the network is not ready yet, causing a few
SYN retransmissions, before reaching the end server.

Fixes: 27069e7cb3d1 ("mptcp: disable active MPTCP in case of blackhole")
Cc: stable@vger.kernel.org
Reviewed-by: Mat Martineau
Signed-off-by: Matthieu Baerts (NGI0)
Signed-off-by: Paolo Abeni
---
 net/mptcp/ctrl.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/net/mptcp/ctrl.c b/net/mptcp/ctrl.c
index 3999e0ba2c35..2dd81e6c26bd 100644
--- a/net/mptcp/ctrl.c
+++ b/net/mptcp/ctrl.c
@@ -418,9 +418,9 @@ void mptcp_active_detect_blackhole(struct sock *ssk, bool expired)
 			MPTCP_INC_STATS(net, MPTCP_MIB_MPCAPABLEACTIVEDROP);
 			subflow->mpc_drop = 1;
 			mptcp_subflow_early_fallback(mptcp_sk(subflow->conn), subflow);
-		} else {
-			subflow->mpc_drop = 0;
 		}
+	} else if (ssk->sk_state == TCP_SYN_SENT) {
+		subflow->mpc_drop = 0;
 	}
 }
--
2.51.0

From 18da4b5d123285dea470b15ff51c7fbe61dc37fd Mon Sep 17 00:00:00 2001
From: "Matthieu Baerts (NGI0)"
Date: Wed, 29 Jan 2025 13:24:33 +0100
Subject: [PATCH 10/16] doc: mptcp: sysctl: blackhole_timeout is per-netns

All other sysctl entries mention it, and it is a per-namespace sysctl.
So mention it as well.
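
For example (a sketch assuming the standard procfs sysctl layout), the
knob can be set independently in each namespace:

	# disable blackhole detection only in netns "client"
	ip netns exec client sysctl -w net.mptcp.blackhole_timeout=0
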
Fixes: 27069e7cb3d1 ("mptcp: disable active MPTCP in case of blackhole")
Reviewed-by: Mat Martineau
Signed-off-by: Matthieu Baerts (NGI0)
Signed-off-by: Paolo Abeni
---
 Documentation/networking/mptcp-sysctl.rst | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Documentation/networking/mptcp-sysctl.rst b/Documentation/networking/mptcp-sysctl.rst
index dc45c0211353..03e1d3610333 100644
--- a/Documentation/networking/mptcp-sysctl.rst
+++ b/Documentation/networking/mptcp-sysctl.rst
@@ -41,7 +41,7 @@ blackhole_timeout - INTEGER (seconds)
 	MPTCP is re-enabled and will reset to the initial value when the
 	blackhole issue goes away.
 
-	0 to disable the blackhole detection.
+	0 to disable the blackhole detection. This is a per-namespace sysctl.
 
 	Default: 3600
--
2.51.0

From 0f5697f1a3f99bc2b674b8aa3c5da822c5673c11 Mon Sep 17 00:00:00 2001
From: Eric Dumazet
Date: Wed, 29 Jan 2025 13:00:07 +0000
Subject: [PATCH 11/16] net: hsr: fix fill_frame_info() regression vs VLAN packets

Stephan Wurm reported that my recent patch broke VLAN support.

Apparently skb->mac_len is not correct for VLAN traffic as shown by
debug traces [1].

Use instead pskb_may_pull() to make sure the expected header is present
in skb->head.

Many thanks to Stephan for his help.

[1]
kernel: skb len=170 headroom=2 headlen=170 tailroom=20
kernel: mac=(2,14) mac_len=14 net=(16,-1) trans=-1
kernel: shinfo(txflags=0 nr_frags=0 gso(size=0 type=0 segs=0))
kernel: csum(0x0 start=0 offset=0 ip_summed=0 complete_sw=0 valid=0 level=0)
kernel: hash(0x0 sw=0 l4=0) proto=0x0000 pkttype=0 iif=0
kernel: priority=0x0 mark=0x0 alloc_cpu=0 vlan_all=0x0
kernel: encapsulation=0 inner(proto=0x0000, mac=0, net=0, trans=0)
kernel: dev name=prp0 feat=0x0000000000007000
kernel: sk family=17 type=3 proto=0
kernel: skb headroom: 00000000: 74 00
kernel: skb linear: 00000000: 01 0c cd 01 00 01 00 d0 93 53 9c cb 81 00 80 00
kernel: skb linear: 00000010: 88 b8 00 01 00 98 00 00 00 00 61 81 8d 80 16 52
kernel: skb linear: 00000020: 45 47 44 4e 43 54 52 4c 2f 4c 4c 4e 30 24 47 4f
kernel: skb linear: 00000030: 24 47 6f 43 62 81 01 14 82 16 52 45 47 44 4e 43
kernel: skb linear: 00000040: 54 52 4c 2f 4c 4c 4e 30 24 44 73 47 6f 6f 73 65
kernel: skb linear: 00000050: 83 07 47 6f 49 64 65 6e 74 84 08 67 8d f5 93 7e
kernel: skb linear: 00000060: 76 c8 00 85 01 01 86 01 00 87 01 00 88 01 01 89
kernel: skb linear: 00000070: 01 00 8a 01 02 ab 33 a2 15 83 01 00 84 03 03 00
kernel: skb linear: 00000080: 00 91 08 67 8d f5 92 77 4b c6 1f 83 01 00 a2 1a
kernel: skb linear: 00000090: a2 06 85 01 00 83 01 00 84 03 03 00 00 91 08 67
kernel: skb linear: 000000a0: 8d f5 92 77 4b c6 1f 83 01 00
kernel: skb tailroom: 00000000: 80 18 02 00 fe 4e 00 00 01 01 08 0a 4f fd 5e d1
kernel: skb tailroom: 00000010: 4f fd 5e cd

Fixes: b9653d19e556 ("net: hsr: avoid potential out-of-bound access in fill_frame_info()")
Reported-by: Stephan Wurm
Tested-by: Stephan Wurm
Closes: https://lore.kernel.org/netdev/Z4o_UC0HweBHJ_cw@PC-LX-SteWu/
Signed-off-by: Eric Dumazet
Reviewed-by: Simon Horman
Link: https://patch.msgid.link/20250129130007.644084-1-edumazet@google.com
Signed-off-by: Jakub Kicinski
---
 net/hsr/hsr_forward.c | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/net/hsr/hsr_forward.c b/net/hsr/hsr_forward.c
index 87bb3a91598e..a4bacf198555 100644
--- a/net/hsr/hsr_forward.c
+++ b/net/hsr/hsr_forward.c
@@ -700,9 +700,12 @@ static int fill_frame_info(struct hsr_frame_info *frame,
 		frame->is_vlan = true;
 
 	if (frame->is_vlan) {
-		if (skb->mac_len < offsetofend(struct hsr_vlan_ethhdr, vlanhdr))
+		/* Note: skb->mac_len might be wrong here. */
+		if (!pskb_may_pull(skb,
+				   skb_mac_offset(skb) +
+				   offsetofend(struct hsr_vlan_ethhdr, vlanhdr)))
 			return -EINVAL;
-		vlan_hdr = (struct hsr_vlan_ethhdr *)ethhdr;
+		vlan_hdr = (struct hsr_vlan_ethhdr *)skb_mac_header(skb);
 		proto = vlan_hdr->vlanhdr.h_vlan_encapsulated_proto;
 	}
--
2.51.0

From e759e1e4a4bd2926d082afe56046a90224433a31 Mon Sep 17 00:00:00 2001
From: Eric Dumazet
Date: Wed, 29 Jan 2025 14:27:26 +0000
Subject: [PATCH 12/16] net: revert RTNL changes in unregister_netdevice_many_notify()

This patch reverts the following changes:

83419b61d187 net: reduce RTNL hold duration in unregister_netdevice_many_notify() (part 2)
ae646f1a0bb9 net: reduce RTNL hold duration in unregister_netdevice_many_notify() (part 1)
cfa579f66656 net: no longer hold RTNL while calling flush_all_backlogs()

This caused issues in layers holding a private mutex:

cleanup_net()
  rtnl_lock();
  mutex_lock(subsystem_mutex);
  unregister_netdevice();
    rtnl_unlock(); // LOCKDEP violation
    rtnl_lock();

I will revisit this in the next cycle, making the new behavior opt-in
from safe contexts only.

Fixes: cfa579f66656 ("net: no longer hold RTNL while calling flush_all_backlogs()")
Fixes: ae646f1a0bb9 ("net: reduce RTNL hold duration in unregister_netdevice_many_notify() (part 1)")
Fixes: 83419b61d187 ("net: reduce RTNL hold duration in unregister_netdevice_many_notify() (part 2)")
Reported-by: syzbot+5b9196ecf74447172a9a@syzkaller.appspotmail.com
Closes: https://lore.kernel.org/netdev/6789d55f.050a0220.20d369.004e.GAE@google.com/
Signed-off-by: Eric Dumazet
Reviewed-by: Kuniyuki Iwashima
Link: https://patch.msgid.link/20250129142726.747726-1-edumazet@google.com
Signed-off-by: Jakub Kicinski
---
 net/core/dev.c | 33 +++------------------------------
 1 file changed, 3 insertions(+), 30 deletions(-)

diff --git a/net/core/dev.c b/net/core/dev.c
index 02cd75b512f9..c0021cbd28fc 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -10264,37 +10264,14 @@ static bool from_cleanup_net(void)
 #endif
 }
 
-static void rtnl_drop_if_cleanup_net(void)
-{
-	if (from_cleanup_net())
-		__rtnl_unlock();
-}
-
-static void rtnl_acquire_if_cleanup_net(void)
-{
-	if (from_cleanup_net())
-		rtnl_lock();
-}
-
 /* Delayed registration/unregisteration */
 LIST_HEAD(net_todo_list);
-static LIST_HEAD(net_todo_list_for_cleanup_net);
-
-/* TODO: net_todo_list/net_todo_list_for_cleanup_net should probably
- * be provided by callers, instead of being static, rtnl protected.
- */
-static struct list_head *todo_list(void)
-{
-	return from_cleanup_net() ? &net_todo_list_for_cleanup_net :
-				    &net_todo_list;
-}
-
 DECLARE_WAIT_QUEUE_HEAD(netdev_unregistering_wq);
 atomic_t dev_unreg_count = ATOMIC_INIT(0);
 
 static void net_set_todo(struct net_device *dev)
 {
-	list_add_tail(&dev->todo_list, todo_list());
+	list_add_tail(&dev->todo_list, &net_todo_list);
 }
 
 static netdev_features_t netdev_sync_upper_features(struct net_device *lower,
@@ -11144,7 +11121,7 @@ void netdev_run_todo(void)
 #endif
 
 	/* Snapshot list, allow later requests */
-	list_replace_init(todo_list(), &list);
+	list_replace_init(&net_todo_list, &list);
 
 	__rtnl_unlock();
 
@@ -11789,11 +11766,9 @@ void unregister_netdevice_many_notify(struct list_head *head,
 			WRITE_ONCE(dev->reg_state, NETREG_UNREGISTERING);
 			netdev_unlock(dev);
 		}
-
-		rtnl_drop_if_cleanup_net();
 		flush_all_backlogs();
+
 		synchronize_net();
-		rtnl_acquire_if_cleanup_net();
 
 	list_for_each_entry(dev, head, unreg_list) {
 		struct sk_buff *skb = NULL;
@@ -11853,9 +11828,7 @@ void unregister_netdevice_many_notify(struct list_head *head,
 #endif
 	}
 
-	rtnl_drop_if_cleanup_net();
 	synchronize_net();
-	rtnl_acquire_if_cleanup_net();
 
 	list_for_each_entry(dev, head, unreg_list) {
 		netdev_put(dev, &dev->dev_registered_tracker);
--
2.51.0

From d7dda216ca49f1ac214cb577cbeeee760a2b425b Mon Sep 17 00:00:00 2001
From: Jakub Kicinski
Date: Wed, 29 Jan 2025 11:13:32 -0800
Subject: [PATCH 13/16] MAINTAINERS: add Neal to TCP maintainers

Neal Cardwell has been indispensable in TCP reviews and investigations,
especially protocol-related. Neal is also the author of packetdrill.

Reviewed-by: Eric Dumazet
Link: https://patch.msgid.link/20250129191332.2526140-1-kuba@kernel.org
Signed-off-by: Jakub Kicinski
---
 MAINTAINERS | 1 +
 1 file changed, 1 insertion(+)

diff --git a/MAINTAINERS b/MAINTAINERS
index d996f393fcf4..720c692b9dfb 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -16525,6 +16525,7 @@ F:	tools/testing/selftests/net/mptcp/
 
 NETWORKING [TCP]
 M:	Eric Dumazet
+M:	Neal Cardwell
 L:	netdev@vger.kernel.org
 S:	Maintained
 F:	Documentation/networking/net_cachelines/tcp_sock.rst
--
2.51.0

From c3a392bdd31adc474f1009ee85c13fdd01fe800d Mon Sep 17 00:00:00 2001
From: Michal Swiatkowski
Date: Tue, 3 Dec 2024 07:58:09 +0100
Subject: [PATCH 14/16] ice: count combined queues using Rx/Tx count

The previous implementation assumed a 1:1 mapping between vectors and
queues. That isn't always true. Take the minimum of the Rx/Tx queue
counts to determine the number of combined queues.

Reviewed-by: Jacob Keller
Tested-by: Pucha Himasekhar Reddy
Signed-off-by: Michal Swiatkowski
Signed-off-by: Tony Nguyen
---
 drivers/net/ethernet/intel/ice/ice_ethtool.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/intel/ice/ice_ethtool.c b/drivers/net/ethernet/intel/ice/ice_ethtool.c
index f241493a6ac8..6bbb304ad9ab 100644
--- a/drivers/net/ethernet/intel/ice/ice_ethtool.c
+++ b/drivers/net/ethernet/intel/ice/ice_ethtool.c
@@ -3817,8 +3817,7 @@ static u32 ice_get_combined_cnt(struct ice_vsi *vsi)
 	ice_for_each_q_vector(vsi, q_idx) {
 		struct ice_q_vector *q_vector = vsi->q_vectors[q_idx];
 
-		if (q_vector->rx.rx_ring && q_vector->tx.tx_ring)
-			combined++;
+		combined += min(q_vector->num_ring_tx, q_vector->num_ring_rx);
 	}
 
 	return combined;
--
2.51.0

From b2657259fce933ae0ba3e07cd0052fb6a8e90689 Mon Sep 17 00:00:00 2001
From: Michal Swiatkowski
Date: Tue, 3 Dec 2024 07:58:10 +0100
Subject: [PATCH 15/16] ice: devlink PF MSI-X max and min parameter

Use the generic devlink PF MSI-X parameters to allow the user to change
the MSI-X range.
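
For example (a sketch: the PCI address is a placeholder, and since these
are driverinit parameters they only take effect after a devlink reload):

	$ devlink dev param set pci/0000:16:00.0 \
		name msix_vec_per_pf_max value 128 cmode driverinit
	$ devlink dev param set pci/0000:16:00.0 \
		name msix_vec_per_pf_min value 2 cmode driverinit
	$ devlink dev reload pci/0000:16:00.0
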
Add notes about these parameters to the ice devlink documentation.

Tested-by: Pucha Himasekhar Reddy
Signed-off-by: Michal Swiatkowski
Signed-off-by: Tony Nguyen
---
 Documentation/networking/devlink/ice.rst      | 11 +++
 .../net/ethernet/intel/ice/devlink/devlink.c  | 81 +++++++++++++++++++
 drivers/net/ethernet/intel/ice/ice.h          |  7 ++
 drivers/net/ethernet/intel/ice/ice_irq.c      |  7 ++
 4 files changed, 106 insertions(+)

diff --git a/Documentation/networking/devlink/ice.rst b/Documentation/networking/devlink/ice.rst
index e3972d03cea0..792e9f8c846a 100644
--- a/Documentation/networking/devlink/ice.rst
+++ b/Documentation/networking/devlink/ice.rst
@@ -69,6 +69,17 @@ Parameters
 
       To verify that value has been set:
       $ devlink dev param show pci/0000:16:00.0 name tx_scheduling_layers
+   * - ``msix_vec_per_pf_max``
+     - driverinit
+     - Set the max MSI-X that can be used by the PF, rest can be utilized for
+       SRIOV. The range is from min value set in msix_vec_per_pf_min to
+       2k/number of ports.
+   * - ``msix_vec_per_pf_min``
+     - driverinit
+     - Set the min MSI-X that will be used by the PF. This value inform how many
+       MSI-X will be allocated statically. The range is from 2 to value set
+       in msix_vec_per_pf_max.
+
 .. list-table:: Driver specific parameters implemented
    :widths: 5 5 90
 
diff --git a/drivers/net/ethernet/intel/ice/devlink/devlink.c b/drivers/net/ethernet/intel/ice/devlink/devlink.c
index d116e2b10bce..81de5fbc130e 100644
--- a/drivers/net/ethernet/intel/ice/devlink/devlink.c
+++ b/drivers/net/ethernet/intel/ice/devlink/devlink.c
@@ -1202,6 +1202,25 @@ static int ice_devlink_set_parent(struct devlink_rate *devlink_rate,
 	return status;
 }
 
+static void ice_set_min_max_msix(struct ice_pf *pf)
+{
+	struct devlink *devlink = priv_to_devlink(pf);
+	union devlink_param_value val;
+	int err;
+
+	err = devl_param_driverinit_value_get(devlink,
+					      DEVLINK_PARAM_GENERIC_ID_MSIX_VEC_PER_PF_MIN,
+					      &val);
+	if (!err)
+		pf->msix.min = val.vu32;
+
+	err = devl_param_driverinit_value_get(devlink,
+					      DEVLINK_PARAM_GENERIC_ID_MSIX_VEC_PER_PF_MAX,
+					      &val);
+	if (!err)
+		pf->msix.max = val.vu32;
+}
+
 /**
  * ice_devlink_reinit_up - do reinit of the given PF
  * @pf: pointer to the PF struct
@@ -1217,6 +1236,9 @@ static int ice_devlink_reinit_up(struct ice_pf *pf)
 		return err;
 	}
 
+	/* load MSI-X values */
+	ice_set_min_max_msix(pf);
+
 	err = ice_init_dev(pf);
 	if (err)
 		goto unroll_hw_init;
@@ -1530,6 +1552,30 @@ static int ice_devlink_local_fwd_validate(struct devlink *devlink, u32 id,
 	return 0;
 }
 
+static int
+ice_devlink_msix_max_pf_validate(struct devlink *devlink, u32 id,
+				 union devlink_param_value val,
+				 struct netlink_ext_ack *extack)
+{
+	struct ice_pf *pf = devlink_priv(devlink);
+
+	if (val.vu32 > pf->hw.func_caps.common_cap.num_msix_vectors)
+		return -EINVAL;
+
+	return 0;
+}
+
+static int
+ice_devlink_msix_min_pf_validate(struct devlink *devlink, u32 id,
+				 union devlink_param_value val,
+				 struct netlink_ext_ack *extack)
+{
+	if (val.vu32 < ICE_MIN_MSIX)
+		return -EINVAL;
+
+	return 0;
+}
+
 enum ice_param_id {
 	ICE_DEVLINK_PARAM_ID_BASE = DEVLINK_PARAM_GENERIC_ID_MAX,
 	ICE_DEVLINK_PARAM_ID_TX_SCHED_LAYERS,
@@ -1547,6 +1593,15 @@ static const struct devlink_param ice_dvl_rdma_params[] = {
 					ice_devlink_enable_iw_validate),
 };
 
+static const struct devlink_param ice_dvl_msix_params[] = {
+	DEVLINK_PARAM_GENERIC(MSIX_VEC_PER_PF_MAX,
+			      BIT(DEVLINK_PARAM_CMODE_DRIVERINIT),
+			      NULL, NULL, ice_devlink_msix_max_pf_validate),
+	DEVLINK_PARAM_GENERIC(MSIX_VEC_PER_PF_MIN,
+			      BIT(DEVLINK_PARAM_CMODE_DRIVERINIT),
+			      NULL, NULL, ice_devlink_msix_min_pf_validate),
+};
+
 static const struct devlink_param ice_dvl_sched_params[] = {
 	DEVLINK_PARAM_DRIVER(ICE_DEVLINK_PARAM_ID_TX_SCHED_LAYERS,
 			     "tx_scheduling_layers",
@@ -1648,6 +1703,7 @@ void ice_devlink_unregister(struct ice_pf *pf)
 int ice_devlink_register_params(struct ice_pf *pf)
 {
 	struct devlink *devlink = priv_to_devlink(pf);
+	union devlink_param_value value;
 	struct ice_hw *hw = &pf->hw;
 	int status;
 
@@ -1656,10 +1712,33 @@ int ice_devlink_register_params(struct ice_pf *pf)
 	if (status)
 		return status;
 
+	status = devl_params_register(devlink, ice_dvl_msix_params,
+				      ARRAY_SIZE(ice_dvl_msix_params));
+	if (status)
+		goto unregister_rdma_params;
+
 	if (hw->func_caps.common_cap.tx_sched_topo_comp_mode_en)
 		status = devl_params_register(devlink, ice_dvl_sched_params,
 					      ARRAY_SIZE(ice_dvl_sched_params));
+	if (status)
+		goto unregister_msix_params;
+
+	value.vu32 = pf->msix.max;
+	devl_param_driverinit_value_set(devlink,
+					DEVLINK_PARAM_GENERIC_ID_MSIX_VEC_PER_PF_MAX,
+					value);
+	value.vu32 = pf->msix.min;
+	devl_param_driverinit_value_set(devlink,
+					DEVLINK_PARAM_GENERIC_ID_MSIX_VEC_PER_PF_MIN,
+					value);
+	return 0;
+unregister_msix_params:
+	devl_params_unregister(devlink, ice_dvl_msix_params,
+			       ARRAY_SIZE(ice_dvl_msix_params));
+unregister_rdma_params:
+	devl_params_unregister(devlink, ice_dvl_rdma_params,
+			       ARRAY_SIZE(ice_dvl_rdma_params));
 	return status;
 }
 
@@ -1670,6 +1749,8 @@ void ice_devlink_unregister_params(struct ice_pf *pf)
 
 	devl_params_unregister(devlink, ice_dvl_rdma_params,
 			       ARRAY_SIZE(ice_dvl_rdma_params));
+	devl_params_unregister(devlink, ice_dvl_msix_params,
+			       ARRAY_SIZE(ice_dvl_msix_params));
 
 	if (hw->func_caps.common_cap.tx_sched_topo_comp_mode_en)
 		devl_params_unregister(devlink, ice_dvl_sched_params,
diff --git a/drivers/net/ethernet/intel/ice/ice.h b/drivers/net/ethernet/intel/ice/ice.h
index 71e05d30f0fd..d041b04ff324 100644
--- a/drivers/net/ethernet/intel/ice/ice.h
+++ b/drivers/net/ethernet/intel/ice/ice.h
@@ -542,6 +542,12 @@ struct ice_agg_node {
 	u8 valid;
 };
 
+struct ice_pf_msix {
+	u32 cur;
+	u32 min;
+	u32 max;
+};
+
 struct ice_pf {
 	struct pci_dev *pdev;
 	struct ice_adapter *adapter;
@@ -612,6 +618,7 @@ struct ice_pf {
 	struct msi_map ll_ts_irq;	/* LL_TS interrupt MSIX vector */
 	u16 max_pf_txqs;	/* Total Tx queues PF wide */
 	u16 max_pf_rxqs;	/* Total Rx queues PF wide */
+	struct ice_pf_msix msix;
 	u16 num_lan_msix;	/* Total MSIX vectors for base driver */
 	u16 num_lan_tx;		/* num LAN Tx queues setup */
 	u16 num_lan_rx;		/* num LAN Rx queues setup */
diff --git a/drivers/net/ethernet/intel/ice/ice_irq.c b/drivers/net/ethernet/intel/ice/ice_irq.c
index ad82ff7d1995..0659b96b9b8c 100644
--- a/drivers/net/ethernet/intel/ice/ice_irq.c
+++ b/drivers/net/ethernet/intel/ice/ice_irq.c
@@ -254,6 +254,13 @@ int ice_init_interrupt_scheme(struct ice_pf *pf)
 	int total_vectors = pf->hw.func_caps.common_cap.num_msix_vectors;
 	int vectors, max_vectors;
 
+	/* load default PF MSI-X range */
+	if (!pf->msix.min)
+		pf->msix.min = ICE_MIN_MSIX;
+
+	if (!pf->msix.max)
+		pf->msix.max = total_vectors / 2;
+
 	vectors = ice_ena_msix_range(pf);
 
 	if (vectors < 0)
--
2.51.0

From 79d97b8cf9a8500e2a18bb37f96268b17b7e7dd5 Mon Sep 17 00:00:00 2001
From: Michal Swiatkowski
Date: Tue, 3 Dec 2024 07:58:11 +0100
Subject: [PATCH 16/16] ice: remove splitting MSI-X between features

With the dynamic approach to allocating MSI-X there is no sense in
statically splitting MSI-X between PF features. The splitting code was
also calculating the needed MSI-X; move this part to a separate
function and use it as the max value.
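
As a worked example of the new default (a sketch assuming
ICE_MIN_LAN_OICR_MSIX is 1), on an 8-CPU system with flow director and
RDMA enabled, ice_get_default_msix_amount() below returns:

	/* 1 (OICR) + 8 (LAN) + 2 (FDIR) + 8 + 4 (RDMA AEQ) = 23,
	 * so pf->msix.max defaults to min(total_vectors, 23).
	 */
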
Remove ICE_ESWITCH_MSIX, as there is no need for additional MSI-X for
switchdev.

Reviewed-by: Jacob Keller
Reviewed-by: Wojciech Drewek
Tested-by: Pucha Himasekhar Reddy
Signed-off-by: Michal Swiatkowski
Signed-off-by: Tony Nguyen
---
 drivers/net/ethernet/intel/ice/ice.h     |   2 -
 drivers/net/ethernet/intel/ice/ice_irq.c | 172 +++--------------------
 2 files changed, 16 insertions(+), 158 deletions(-)

diff --git a/drivers/net/ethernet/intel/ice/ice.h b/drivers/net/ethernet/intel/ice/ice.h
index d041b04ff324..c78bd45cf016 100644
--- a/drivers/net/ethernet/intel/ice/ice.h
+++ b/drivers/net/ethernet/intel/ice/ice.h
@@ -98,8 +98,6 @@
 #define ICE_MIN_MSIX		(ICE_MIN_LAN_TXRX_MSIX + ICE_MIN_LAN_OICR_MSIX)
 #define ICE_FDIR_MSIX		2
 #define ICE_RDMA_NUM_AEQ_MSIX	4
-#define ICE_MIN_RDMA_MSIX	2
-#define ICE_ESWITCH_MSIX	1
 #define ICE_NO_VSI		0xffff
 #define ICE_VSI_MAP_CONTIG	0
 #define ICE_VSI_MAP_SCATTER	1
diff --git a/drivers/net/ethernet/intel/ice/ice_irq.c b/drivers/net/ethernet/intel/ice/ice_irq.c
index 0659b96b9b8c..4a50a6dc817e 100644
--- a/drivers/net/ethernet/intel/ice/ice_irq.c
+++ b/drivers/net/ethernet/intel/ice/ice_irq.c
@@ -84,155 +84,11 @@ static struct ice_irq_entry *ice_get_irq_res(struct ice_pf *pf, bool dyn_only)
 	return entry;
 }
 
-/**
- * ice_reduce_msix_usage - Reduce usage of MSI-X vectors
- * @pf: board private structure
- * @v_remain: number of remaining MSI-X vectors to be distributed
- *
- * Reduce the usage of MSI-X vectors when entire request cannot be fulfilled.
- * pf->num_lan_msix and pf->num_rdma_msix values are set based on number of
- * remaining vectors.
- */
-static void ice_reduce_msix_usage(struct ice_pf *pf, int v_remain)
+static int ice_get_default_msix_amount(struct ice_pf *pf)
 {
-	int v_rdma;
-
-	if (!ice_is_rdma_ena(pf)) {
-		pf->num_lan_msix = v_remain;
-		return;
-	}
-
-	/* RDMA needs at least 1 interrupt in addition to AEQ MSIX */
-	v_rdma = ICE_RDMA_NUM_AEQ_MSIX + 1;
-
-	if (v_remain < ICE_MIN_LAN_TXRX_MSIX + ICE_MIN_RDMA_MSIX) {
-		dev_warn(ice_pf_to_dev(pf), "Not enough MSI-X vectors to support RDMA.\n");
-		clear_bit(ICE_FLAG_RDMA_ENA, pf->flags);
-
-		pf->num_rdma_msix = 0;
-		pf->num_lan_msix = ICE_MIN_LAN_TXRX_MSIX;
-	} else if ((v_remain < ICE_MIN_LAN_TXRX_MSIX + v_rdma) ||
-		   (v_remain - v_rdma < v_rdma)) {
-		/* Support minimum RDMA and give remaining vectors to LAN MSIX
-		 */
-		pf->num_rdma_msix = ICE_MIN_RDMA_MSIX;
-		pf->num_lan_msix = v_remain - ICE_MIN_RDMA_MSIX;
-	} else {
-		/* Split remaining MSIX with RDMA after accounting for AEQ MSIX
-		 */
-		pf->num_rdma_msix = (v_remain - ICE_RDMA_NUM_AEQ_MSIX) / 2 +
-				    ICE_RDMA_NUM_AEQ_MSIX;
-		pf->num_lan_msix = v_remain - pf->num_rdma_msix;
-	}
-}
-
-/**
- * ice_ena_msix_range - Request a range of MSIX vectors from the OS
- * @pf: board private structure
- *
- * Compute the number of MSIX vectors wanted and request from the OS. Adjust
- * device usage if there are not enough vectors. Return the number of vectors
- * reserved or negative on failure.
- */
-static int ice_ena_msix_range(struct ice_pf *pf)
-{
-	int num_cpus, hw_num_msix, v_other, v_wanted, v_actual;
-	struct device *dev = ice_pf_to_dev(pf);
-	int err;
-
-	hw_num_msix = pf->hw.func_caps.common_cap.num_msix_vectors;
-	num_cpus = num_online_cpus();
-
-	/* LAN miscellaneous handler */
-	v_other = ICE_MIN_LAN_OICR_MSIX;
-
-	/* Flow Director */
-	if (test_bit(ICE_FLAG_FD_ENA, pf->flags))
-		v_other += ICE_FDIR_MSIX;
-
-	/* switchdev */
-	v_other += ICE_ESWITCH_MSIX;
-
-	v_wanted = v_other;
-
-	/* LAN traffic */
-	pf->num_lan_msix = num_cpus;
-	v_wanted += pf->num_lan_msix;
-
-	/* RDMA auxiliary driver */
-	if (ice_is_rdma_ena(pf)) {
-		pf->num_rdma_msix = num_cpus + ICE_RDMA_NUM_AEQ_MSIX;
-		v_wanted += pf->num_rdma_msix;
-	}
-
-	if (v_wanted > hw_num_msix) {
-		int v_remain;
-
-		dev_warn(dev, "not enough device MSI-X vectors. wanted = %d, available = %d\n",
-			 v_wanted, hw_num_msix);
-
-		if (hw_num_msix < ICE_MIN_MSIX) {
-			err = -ERANGE;
-			goto exit_err;
-		}
-
-		v_remain = hw_num_msix - v_other;
-		if (v_remain < ICE_MIN_LAN_TXRX_MSIX) {
-			v_other = ICE_MIN_MSIX - ICE_MIN_LAN_TXRX_MSIX;
-			v_remain = ICE_MIN_LAN_TXRX_MSIX;
-		}
-
-		ice_reduce_msix_usage(pf, v_remain);
-		v_wanted = pf->num_lan_msix + pf->num_rdma_msix + v_other;
-
-		dev_notice(dev, "Reducing request to %d MSI-X vectors for LAN traffic.\n",
-			   pf->num_lan_msix);
-		if (ice_is_rdma_ena(pf))
-			dev_notice(dev, "Reducing request to %d MSI-X vectors for RDMA.\n",
-				   pf->num_rdma_msix);
-	}
-
-	/* actually reserve the vectors */
-	v_actual = pci_alloc_irq_vectors(pf->pdev, ICE_MIN_MSIX, v_wanted,
-					 PCI_IRQ_MSIX);
-	if (v_actual < 0) {
-		dev_err(dev, "unable to reserve MSI-X vectors\n");
-		err = v_actual;
-		goto exit_err;
-	}
-
-	if (v_actual < v_wanted) {
-		dev_warn(dev, "not enough OS MSI-X vectors. requested = %d, obtained = %d\n",
-			 v_wanted, v_actual);
-
-		if (v_actual < ICE_MIN_MSIX) {
-			/* error if we can't get minimum vectors */
-			pci_free_irq_vectors(pf->pdev);
-			err = -ERANGE;
-			goto exit_err;
-		} else {
-			int v_remain = v_actual - v_other;
-
-			if (v_remain < ICE_MIN_LAN_TXRX_MSIX)
-				v_remain = ICE_MIN_LAN_TXRX_MSIX;
-
-			ice_reduce_msix_usage(pf, v_remain);
-
-			dev_notice(dev, "Enabled %d MSI-X vectors for LAN traffic.\n",
-				   pf->num_lan_msix);
-
-			if (ice_is_rdma_ena(pf))
-				dev_notice(dev, "Enabled %d MSI-X vectors for RDMA.\n",
-					   pf->num_rdma_msix);
-		}
-	}
-
-	return v_actual;
-
-exit_err:
-	pf->num_rdma_msix = 0;
-	pf->num_lan_msix = 0;
-	return err;
+	return ICE_MIN_LAN_OICR_MSIX + num_online_cpus() +
+	       (test_bit(ICE_FLAG_FD_ENA, pf->flags) ? ICE_FDIR_MSIX : 0) +
+	       (ice_is_rdma_ena(pf) ? num_online_cpus() + ICE_RDMA_NUM_AEQ_MSIX : 0);
 }
 
 /**
@@ -259,17 +115,21 @@ int ice_init_interrupt_scheme(struct ice_pf *pf)
 	pf->msix.min = ICE_MIN_MSIX;
 
 	if (!pf->msix.max)
-		pf->msix.max = total_vectors / 2;
-
-	vectors = ice_ena_msix_range(pf);
+		pf->msix.max = min(total_vectors,
+				   ice_get_default_msix_amount(pf));
 
-	if (vectors < 0)
-		return -ENOMEM;
-
-	if (pci_msix_can_alloc_dyn(pf->pdev))
+	if (pci_msix_can_alloc_dyn(pf->pdev)) {
+		vectors = pf->msix.min;
 		max_vectors = total_vectors;
-	else
+	} else {
+		vectors = pf->msix.max;
 		max_vectors = vectors;
+	}
+
+	vectors = pci_alloc_irq_vectors(pf->pdev, pf->msix.min, vectors,
+					PCI_IRQ_MSIX);
+	if (vectors < pf->msix.min)
+		return -ENOMEM;
 
 	ice_init_irq_tracker(pf, max_vectors, vectors);
--
2.51.0