From df137da9d6d166e87e40980e36eb8e0bc90483ef Mon Sep 17 00:00:00 2001 From: Stefano Garzarella Date: Fri, 10 Jan 2025 09:35:09 +0100 Subject: [PATCH 01/16] vsock/virtio: cancel close work in the destructor During virtio_transport_release() we can schedule a delayed work to perform the closing of the socket before destruction. The destructor is called either when the socket is really destroyed (reference counter to zero), or it can also be called when we are de-assigning the transport. In the former case, we are sure the delayed work has completed, because it holds a reference until it completes, so the destructor will definitely be called after the delayed work is finished. But in the latter case, the destructor is called by AF_VSOCK core, just after the release(), so there may still be delayed work scheduled. Refactor the code, moving the code to delete the close work already in the do_close() to a new function. Invoke it during destruction to make sure we don't leave any pending work. Fixes: c0cfa2d8a788 ("vsock: add multi-transports support") Cc: stable@vger.kernel.org Reported-by: Hyunwoo Kim Closes: https://lore.kernel.org/netdev/Z37Sh+utS+iV3+eb@v4bel-B760M-AORUS-ELITE-AX/ Signed-off-by: Stefano Garzarella Reviewed-by: Luigi Leonardi Tested-by: Hyunwoo Kim Signed-off-by: Paolo Abeni --- net/vmw_vsock/virtio_transport_common.c | 29 ++++++++++++++++++------- 1 file changed, 21 insertions(+), 8 deletions(-) diff --git a/net/vmw_vsock/virtio_transport_common.c b/net/vmw_vsock/virtio_transport_common.c index 51a494b69be8..7f7de6d88096 100644 --- a/net/vmw_vsock/virtio_transport_common.c +++ b/net/vmw_vsock/virtio_transport_common.c @@ -26,6 +26,9 @@ /* Threshold for detecting small packets to copy */ #define GOOD_COPY_LEN 128 +static void virtio_transport_cancel_close_work(struct vsock_sock *vsk, + bool cancel_timeout); + static const struct virtio_transport * virtio_transport_get_ops(struct vsock_sock *vsk) { @@ -1109,6 +1112,8 @@ void virtio_transport_destruct(struct vsock_sock *vsk) { struct virtio_vsock_sock *vvs = vsk->trans; + virtio_transport_cancel_close_work(vsk, true); + kfree(vvs); vsk->trans = NULL; } @@ -1204,17 +1209,11 @@ static void virtio_transport_wait_close(struct sock *sk, long timeout) } } -static void virtio_transport_do_close(struct vsock_sock *vsk, - bool cancel_timeout) +static void virtio_transport_cancel_close_work(struct vsock_sock *vsk, + bool cancel_timeout) { struct sock *sk = sk_vsock(vsk); - sock_set_flag(sk, SOCK_DONE); - vsk->peer_shutdown = SHUTDOWN_MASK; - if (vsock_stream_has_data(vsk) <= 0) - sk->sk_state = TCP_CLOSING; - sk->sk_state_change(sk); - if (vsk->close_work_scheduled && (!cancel_timeout || cancel_delayed_work(&vsk->close_work))) { vsk->close_work_scheduled = false; @@ -1226,6 +1225,20 @@ static void virtio_transport_do_close(struct vsock_sock *vsk, } } +static void virtio_transport_do_close(struct vsock_sock *vsk, + bool cancel_timeout) +{ + struct sock *sk = sk_vsock(vsk); + + sock_set_flag(sk, SOCK_DONE); + vsk->peer_shutdown = SHUTDOWN_MASK; + if (vsock_stream_has_data(vsk) <= 0) + sk->sk_state = TCP_CLOSING; + sk->sk_state_change(sk); + + virtio_transport_cancel_close_work(vsk, cancel_timeout); +} + static void virtio_transport_close_timeout(struct work_struct *work) { struct vsock_sock *vsk = -- 2.51.0 From a24009bc9be60242651a21702609381b5092459e Mon Sep 17 00:00:00 2001 From: Stefano Garzarella Date: Fri, 10 Jan 2025 09:35:10 +0100 Subject: [PATCH 02/16] vsock: reset socket state when de-assigning the transport Transport's release() and destruct() are called when de-assigning the vsock transport. These callbacks can touch some socket state like sock flags, sk_state, and peer_shutdown. Since we are reassigning the socket to a new transport during vsock_connect(), let's reset these fields to have a clean state with the new transport. Fixes: c0cfa2d8a788 ("vsock: add multi-transports support") Cc: stable@vger.kernel.org Signed-off-by: Stefano Garzarella Reviewed-by: Luigi Leonardi Signed-off-by: Paolo Abeni --- net/vmw_vsock/af_vsock.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/net/vmw_vsock/af_vsock.c b/net/vmw_vsock/af_vsock.c index 5cf8109f672a..74d35a871644 100644 --- a/net/vmw_vsock/af_vsock.c +++ b/net/vmw_vsock/af_vsock.c @@ -491,6 +491,15 @@ int vsock_assign_transport(struct vsock_sock *vsk, struct vsock_sock *psk) */ vsk->transport->release(vsk); vsock_deassign_transport(vsk); + + /* transport's release() and destruct() can touch some socket + * state, since we are reassigning the socket to a new transport + * during vsock_connect(), let's reset these fields to have a + * clean state. + */ + sock_reset_flag(sk, SOCK_DONE); + sk->sk_state = TCP_CLOSE; + vsk->peer_shutdown = 0; } /* We increase the module refcnt to prevent the transport unloading -- 2.51.0 From 91751e248256efc111e52e15115840c35d85abaf Mon Sep 17 00:00:00 2001 From: Stefano Garzarella Date: Fri, 10 Jan 2025 09:35:11 +0100 Subject: [PATCH 03/16] vsock: prevent null-ptr-deref in vsock_*[has_data|has_space] Recent reports have shown how we sometimes call vsock_*_has_data() when a vsock socket has been de-assigned from a transport (see attached links), but we shouldn't. Previous commits should have solved the real problems, but we may have more in the future, so to avoid null-ptr-deref, we can return 0 (no space, no data available) but with a warning. This way the code should continue to run in a nearly consistent state and have a warning that allows us to debug future problems. Fixes: c0cfa2d8a788 ("vsock: add multi-transports support") Cc: stable@vger.kernel.org Link: https://lore.kernel.org/netdev/Z2K%2FI4nlHdfMRTZC@v4bel-B760M-AORUS-ELITE-AX/ Link: https://lore.kernel.org/netdev/5ca20d4c-1017-49c2-9516-f6f75fd331e9@rbox.co/ Link: https://lore.kernel.org/netdev/677f84a8.050a0220.25a300.01b3.GAE@google.com/ Co-developed-by: Hyunwoo Kim Signed-off-by: Hyunwoo Kim Co-developed-by: Wongi Lee Signed-off-by: Wongi Lee Signed-off-by: Stefano Garzarella Reviewed-by: Luigi Leonardi Reviewed-by: Hyunwoo Kim Signed-off-by: Paolo Abeni --- net/vmw_vsock/af_vsock.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/net/vmw_vsock/af_vsock.c b/net/vmw_vsock/af_vsock.c index 74d35a871644..fa9d1b49599b 100644 --- a/net/vmw_vsock/af_vsock.c +++ b/net/vmw_vsock/af_vsock.c @@ -879,6 +879,9 @@ EXPORT_SYMBOL_GPL(vsock_create_connected); s64 vsock_stream_has_data(struct vsock_sock *vsk) { + if (WARN_ON(!vsk->transport)) + return 0; + return vsk->transport->stream_has_data(vsk); } EXPORT_SYMBOL_GPL(vsock_stream_has_data); @@ -887,6 +890,9 @@ s64 vsock_connectible_has_data(struct vsock_sock *vsk) { struct sock *sk = sk_vsock(vsk); + if (WARN_ON(!vsk->transport)) + return 0; + if (sk->sk_type == SOCK_SEQPACKET) return vsk->transport->seqpacket_has_data(vsk); else @@ -896,6 +902,9 @@ EXPORT_SYMBOL_GPL(vsock_connectible_has_data); s64 vsock_stream_has_space(struct vsock_sock *vsk) { + if (WARN_ON(!vsk->transport)) + return 0; + return vsk->transport->stream_has_space(vsk); } EXPORT_SYMBOL_GPL(vsock_stream_has_space); -- 2.51.0 From 2ca06a2f65310aeef30bb69b7405437a14766e4d Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Mon, 13 Jan 2025 16:44:56 +0100 Subject: [PATCH 04/16] mptcp: be sure to send ack when mptcp-level window re-opens mptcp_cleanup_rbuf() is responsible to send acks when the user-space reads enough data to update the receive windows significantly. It tries hard to avoid acquiring the subflow sockets locks by checking conditions similar to the ones implemented at the TCP level. To avoid too much code duplication - the MPTCP protocol can't reuse the TCP helpers as part of the relevant status is maintained into the msk socket - and multiple costly window size computation, mptcp_cleanup_rbuf uses a rough estimate for the most recently advertised window size: the MPTCP receive free space, as recorded as at last-ack time. Unfortunately the above does not allow mptcp_cleanup_rbuf() to detect a zero to non-zero win change in some corner cases, skipping the tcp_cleanup_rbuf call and leaving the peer stuck. After commit ea66758c1795 ("tcp: allow MPTCP to update the announced window"), MPTCP has actually cheap access to the announced window value. Use it in mptcp_cleanup_rbuf() for a more accurate ack generation. Fixes: e3859603ba13 ("mptcp: better msk receive window updates") Cc: stable@vger.kernel.org Reported-by: Jakub Kicinski Closes: https://lore.kernel.org/20250107131845.5e5de3c5@kernel.org Signed-off-by: Paolo Abeni Reviewed-by: Matthieu Baerts (NGI0) Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20250113-net-mptcp-connect-st-flakes-v1-1-0d986ee7b1b6@kernel.org Signed-off-by: Jakub Kicinski --- net/mptcp/options.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/net/mptcp/options.c b/net/mptcp/options.c index a62bc874bf1e..123f3f297284 100644 --- a/net/mptcp/options.c +++ b/net/mptcp/options.c @@ -607,7 +607,6 @@ static bool mptcp_established_options_dss(struct sock *sk, struct sk_buff *skb, } opts->ext_copy.use_ack = 1; opts->suboptions = OPTION_MPTCP_DSS; - WRITE_ONCE(msk->old_wspace, __mptcp_space((struct sock *)msk)); /* Add kind/length/subtype/flag overhead if mapping is not populated */ if (dss_size == 0) @@ -1288,7 +1287,7 @@ static void mptcp_set_rwin(struct tcp_sock *tp, struct tcphdr *th) } MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_RCVWNDCONFLICT); } - return; + goto update_wspace; } if (rcv_wnd_new != rcv_wnd_old) { @@ -1313,6 +1312,9 @@ raise_win: th->window = htons(new_win); MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_RCVWNDSHARED); } + +update_wspace: + WRITE_ONCE(msk->old_wspace, tp->rcv_wnd); } __sum16 __mptcp_make_csum(u64 data_seq, u32 subflow_seq, u16 data_len, __wsum sum) -- 2.51.0 From e226d9259dc4f5d2c19e6682ad1356fa97cf38f4 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Mon, 13 Jan 2025 16:44:57 +0100 Subject: [PATCH 05/16] mptcp: fix spurious wake-up on under memory pressure The wake-up condition currently implemented by mptcp_epollin_ready() is wrong, as it could mark the MPTCP socket as readable even when no data are present and the system is under memory pressure. Explicitly check for some data being available in the receive queue. Fixes: 5684ab1a0eff ("mptcp: give rcvlowat some love") Cc: stable@vger.kernel.org Signed-off-by: Paolo Abeni Reviewed-by: Matthieu Baerts (NGI0) Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20250113-net-mptcp-connect-st-flakes-v1-2-0d986ee7b1b6@kernel.org Signed-off-by: Jakub Kicinski --- net/mptcp/protocol.h | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h index a93e661ef5c4..73526f1d768f 100644 --- a/net/mptcp/protocol.h +++ b/net/mptcp/protocol.h @@ -760,10 +760,15 @@ static inline u64 mptcp_data_avail(const struct mptcp_sock *msk) static inline bool mptcp_epollin_ready(const struct sock *sk) { + u64 data_avail = mptcp_data_avail(mptcp_sk(sk)); + + if (!data_avail) + return false; + /* mptcp doesn't have to deal with small skbs in the receive queue, - * at it can always coalesce them + * as it can always coalesce them */ - return (mptcp_data_avail(mptcp_sk(sk)) >= sk->sk_rcvlowat) || + return (data_avail >= sk->sk_rcvlowat) || (mem_cgroup_sockets_enabled && sk->sk_memcg && mem_cgroup_under_socket_pressure(sk->sk_memcg)) || READ_ONCE(tcp_memory_pressure); -- 2.51.0 From 218cc166321fb3cc8786677ffe0d09a78778a910 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Mon, 13 Jan 2025 16:44:58 +0100 Subject: [PATCH 06/16] selftests: mptcp: avoid spurious errors on disconnect MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit The disconnect test-case generates spurious errors: INFO: disconnect INFO: extra options: -I 3 -i /tmp/tmp.r43niviyoI 01 ns1 MPTCP -> ns1 (10.0.1.1:10000 ) MPTCP (duration 140ms) [FAIL] file received by server does not match (in, out): Unexpected revents: POLLERR/POLLNVAL(19) -rw-r--r-- 1 root root 10028676 Jan 10 10:47 /tmp/tmp.r43niviyoI.disconnect Trailing bytes are: ��\����R���!8��u2��5N% -rw------- 1 root root 9992290 Jan 10 10:47 /tmp/tmp.Os4UbnWbI1 Trailing bytes are: ��\����R���!8��u2��5N% 02 ns1 MPTCP -> ns1 (dead:beef:1::1:10001) MPTCP (duration 206ms) [ OK ] 03 ns1 MPTCP -> ns1 (dead:beef:1::1:10002) TCP (duration 31ms) [ OK ] 04 ns1 TCP -> ns1 (dead:beef:1::1:10003) MPTCP (duration 26ms) [ OK ] [FAIL] Tests of the full disconnection have failed Time: 2 seconds The root cause is actually in the user-space bits: the test program currently disconnects as soon as all the pending data has been spooled, generating an FASTCLOSE. If such option reaches the peer before the latter has reached the closed status, the msk socket will report an error to the user-space, as per protocol specification, causing the above failure. Address the issue explicitly waiting for all the relevant sockets to reach a closed status before performing the disconnect. Fixes: 05be5e273c84 ("selftests: mptcp: add disconnect tests") Cc: stable@vger.kernel.org Signed-off-by: Paolo Abeni Reviewed-by: Matthieu Baerts (NGI0) Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20250113-net-mptcp-connect-st-flakes-v1-3-0d986ee7b1b6@kernel.org Signed-off-by: Jakub Kicinski --- .../selftests/net/mptcp/mptcp_connect.c | 43 ++++++++++++++----- 1 file changed, 32 insertions(+), 11 deletions(-) diff --git a/tools/testing/selftests/net/mptcp/mptcp_connect.c b/tools/testing/selftests/net/mptcp/mptcp_connect.c index 4209b9569039..414addef9a45 100644 --- a/tools/testing/selftests/net/mptcp/mptcp_connect.c +++ b/tools/testing/selftests/net/mptcp/mptcp_connect.c @@ -25,6 +25,8 @@ #include #include +#include + #include #include @@ -1211,23 +1213,42 @@ static void parse_setsock_options(const char *name) exit(1); } -void xdisconnect(int fd, int addrlen) +void xdisconnect(int fd) { - struct sockaddr_storage empty; + socklen_t addrlen = sizeof(struct sockaddr_storage); + struct sockaddr_storage addr, empty; int msec_sleep = 10; - int queued = 1; - int i; + void *raw_addr; + int i, cmdlen; + char cmd[128]; + + /* get the local address and convert it to string */ + if (getsockname(fd, (struct sockaddr *)&addr, &addrlen) < 0) + xerror("getsockname"); + + if (addr.ss_family == AF_INET) + raw_addr = &(((struct sockaddr_in *)&addr)->sin_addr); + else if (addr.ss_family == AF_INET6) + raw_addr = &(((struct sockaddr_in6 *)&addr)->sin6_addr); + else + xerror("bad family"); + + strcpy(cmd, "ss -M | grep -q "); + cmdlen = strlen(cmd); + if (!inet_ntop(addr.ss_family, raw_addr, &cmd[cmdlen], + sizeof(cmd) - cmdlen)) + xerror("inet_ntop"); shutdown(fd, SHUT_WR); - /* while until the pending data is completely flushed, the later + /* + * wait until the pending data is completely flushed and all + * the MPTCP sockets reached the closed status. * disconnect will bypass/ignore/drop any pending data. */ for (i = 0; ; i += msec_sleep) { - if (ioctl(fd, SIOCOUTQ, &queued) < 0) - xerror("can't query out socket queue: %d", errno); - - if (!queued) + /* closed socket are not listed by 'ss' */ + if (system(cmd) != 0) break; if (i > poll_timeout) @@ -1281,9 +1302,9 @@ again: return ret; if (cfg_truncate > 0) { - xdisconnect(fd, peer->ai_addrlen); + xdisconnect(fd); } else if (--cfg_repeat > 0) { - xdisconnect(fd, peer->ai_addrlen); + xdisconnect(fd); /* the socket could be unblocking at this point, we need the * connect to be blocking -- 2.51.0 From 16ebb6f5b6295c9688749862a39a4889c56227f8 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Mon, 13 Jan 2025 09:18:39 +0300 Subject: [PATCH 07/16] nfp: bpf: prevent integer overflow in nfp_bpf_event_output() The "sizeof(struct cmsg_bpf_event) + pkt_size + data_size" math could potentially have an integer wrapping bug on 32bit systems. Check for this and return an error. Fixes: 9816dd35ecec ("nfp: bpf: perf event output helpers support") Signed-off-by: Dan Carpenter Link: https://patch.msgid.link/6074805b-e78d-4b8a-bf05-e929b5377c28@stanley.mountain Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/netronome/nfp/bpf/offload.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/netronome/nfp/bpf/offload.c b/drivers/net/ethernet/netronome/nfp/bpf/offload.c index 9d97cd281f18..c03558adda91 100644 --- a/drivers/net/ethernet/netronome/nfp/bpf/offload.c +++ b/drivers/net/ethernet/netronome/nfp/bpf/offload.c @@ -458,7 +458,8 @@ int nfp_bpf_event_output(struct nfp_app_bpf *bpf, const void *data, map_id_full = be64_to_cpu(cbe->map_ptr); map_id = map_id_full; - if (len < sizeof(struct cmsg_bpf_event) + pkt_size + data_size) + if (size_add(pkt_size, data_size) > INT_MAX || + len < sizeof(struct cmsg_bpf_event) + pkt_size + data_size) return -EINVAL; if (cbe->hdr.ver != NFP_CCM_ABI_VERSION) return -EINVAL; -- 2.51.0 From c17ff476f53afb30f90bb3c2af77de069c81a622 Mon Sep 17 00:00:00 2001 From: Sean Anderson Date: Mon, 13 Jan 2025 11:30:00 -0500 Subject: [PATCH 08/16] net: xilinx: axienet: Fix IRQ coalescing packet count overflow If coalesce_count is greater than 255 it will not fit in the register and will overflow. This can be reproduced by running # ethtool -C ethX rx-frames 256 which will result in a timeout of 0us instead. Fix this by checking for invalid values and reporting an error. Fixes: 8a3b7a252dca ("drivers/net/ethernet/xilinx: added Xilinx AXI Ethernet driver") Signed-off-by: Sean Anderson Reviewed-by: Shannon Nelson Reviewed-by: Radhey Shyam Pandey Link: https://patch.msgid.link/20250113163001.2335235-1-sean.anderson@linux.dev Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/xilinx/xilinx_axienet_main.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/drivers/net/ethernet/xilinx/xilinx_axienet_main.c b/drivers/net/ethernet/xilinx/xilinx_axienet_main.c index 0f4b02fe6f85..ae743991117c 100644 --- a/drivers/net/ethernet/xilinx/xilinx_axienet_main.c +++ b/drivers/net/ethernet/xilinx/xilinx_axienet_main.c @@ -2056,6 +2056,12 @@ axienet_ethtools_set_coalesce(struct net_device *ndev, return -EBUSY; } + if (ecoalesce->rx_max_coalesced_frames > 255 || + ecoalesce->tx_max_coalesced_frames > 255) { + NL_SET_ERR_MSG(extack, "frames must be less than 256"); + return -EINVAL; + } + if (ecoalesce->rx_max_coalesced_frames) lp->coalesce_count_rx = ecoalesce->rx_max_coalesced_frames; if (ecoalesce->rx_coalesce_usecs) -- 2.51.0 From f0d0277796db613c124206544b6dbe95b520ab6c Mon Sep 17 00:00:00 2001 From: John Sperbeck Date: Mon, 13 Jan 2025 17:13:54 -0800 Subject: [PATCH 09/16] net: netpoll: ensure skb_pool list is always initialized When __netpoll_setup() is called directly, instead of through netpoll_setup(), the np->skb_pool list head isn't initialized. If skb_pool_flush() is later called, then we hit a NULL pointer in skb_queue_purge_reason(). This can be seen with this repro, when CONFIG_NETCONSOLE is enabled as a module: ip tuntap add mode tap tap0 ip link add name br0 type bridge ip link set dev tap0 master br0 modprobe netconsole netconsole=4444@10.0.0.1/br0,9353@10.0.0.2/ rmmod netconsole The backtrace is: BUG: kernel NULL pointer dereference, address: 0000000000000008 #PF: supervisor write access in kernel mode #PF: error_code(0x0002) - not-present page ... ... ... Call Trace: __netpoll_free+0xa5/0xf0 br_netpoll_cleanup+0x43/0x50 [bridge] do_netpoll_cleanup+0x43/0xc0 netconsole_netdev_event+0x1e3/0x300 [netconsole] unregister_netdevice_notifier+0xd9/0x150 cleanup_module+0x45/0x920 [netconsole] __se_sys_delete_module+0x205/0x290 do_syscall_64+0x70/0x150 entry_SYSCALL_64_after_hwframe+0x76/0x7e Move the skb_pool list setup and initial skb fill into __netpoll_setup(). Fixes: 221a9c1df790 ("net: netpoll: Individualize the skb pool") Signed-off-by: John Sperbeck Reviewed-by: Breno Leitao Link: https://patch.msgid.link/20250114011354.2096812-1-jsperbeck@google.com Signed-off-by: Jakub Kicinski --- net/core/netpoll.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/net/core/netpoll.c b/net/core/netpoll.c index 2e459b9d88eb..96a6ed37d4cc 100644 --- a/net/core/netpoll.c +++ b/net/core/netpoll.c @@ -627,6 +627,8 @@ int __netpoll_setup(struct netpoll *np, struct net_device *ndev) const struct net_device_ops *ops; int err; + skb_queue_head_init(&np->skb_pool); + if (ndev->priv_flags & IFF_DISABLE_NETPOLL) { np_err(np, "%s doesn't support polling, aborting\n", ndev->name); @@ -662,6 +664,9 @@ int __netpoll_setup(struct netpoll *np, struct net_device *ndev) strscpy(np->dev_name, ndev->name, IFNAMSIZ); npinfo->netpoll = np; + /* fill up the skb queue */ + refill_skbs(np); + /* last thing to do is link it to the net device structure */ rcu_assign_pointer(ndev->npinfo, npinfo); @@ -681,8 +686,6 @@ int netpoll_setup(struct netpoll *np) struct in_device *in_dev; int err; - skb_queue_head_init(&np->skb_pool); - rtnl_lock(); if (np->dev_name[0]) { struct net *net = current->nsproxy->net_ns; @@ -782,9 +785,6 @@ put_noaddr: } } - /* fill up the skb queue */ - refill_skbs(np); - err = __netpoll_setup(np, ndev); if (err) goto flush; -- 2.51.0 From 001ba0902046cb6c352494df610718c0763e77a5 Mon Sep 17 00:00:00 2001 From: Kevin Groeneveld Date: Mon, 13 Jan 2025 10:48:45 -0500 Subject: [PATCH 10/16] net: fec: handle page_pool_dev_alloc_pages error The fec_enet_update_cbd function calls page_pool_dev_alloc_pages but did not handle the case when it returned NULL. There was a WARN_ON(!new_page) but it would still proceed to use the NULL pointer and then crash. This case does seem somewhat rare but when the system is under memory pressure it can happen. One case where I can duplicate this with some frequency is when writing over a smbd share to a SATA HDD attached to an imx6q. Setting /proc/sys/vm/min_free_kbytes to higher values also seems to solve the problem for my test case. But it still seems wrong that the fec driver ignores the memory allocation error and can crash. This commit handles the allocation error by dropping the current packet. Fixes: 95698ff6177b5 ("net: fec: using page pool to manage RX buffers") Signed-off-by: Kevin Groeneveld Reviewed-by: Jacob Keller Reviewed-by: Wei Fang Link: https://patch.msgid.link/20250113154846.1765414-1-kgroeneveld@lenbrook.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/freescale/fec_main.c | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/drivers/net/ethernet/freescale/fec_main.c b/drivers/net/ethernet/freescale/fec_main.c index 1b55047c0237..4566848e1d7c 100644 --- a/drivers/net/ethernet/freescale/fec_main.c +++ b/drivers/net/ethernet/freescale/fec_main.c @@ -1591,19 +1591,22 @@ static void fec_enet_tx(struct net_device *ndev, int budget) fec_enet_tx_queue(ndev, i, budget); } -static void fec_enet_update_cbd(struct fec_enet_priv_rx_q *rxq, +static int fec_enet_update_cbd(struct fec_enet_priv_rx_q *rxq, struct bufdesc *bdp, int index) { struct page *new_page; dma_addr_t phys_addr; new_page = page_pool_dev_alloc_pages(rxq->page_pool); - WARN_ON(!new_page); - rxq->rx_skb_info[index].page = new_page; + if (unlikely(!new_page)) + return -ENOMEM; + rxq->rx_skb_info[index].page = new_page; rxq->rx_skb_info[index].offset = FEC_ENET_XDP_HEADROOM; phys_addr = page_pool_get_dma_addr(new_page) + FEC_ENET_XDP_HEADROOM; bdp->cbd_bufaddr = cpu_to_fec32(phys_addr); + + return 0; } static u32 @@ -1698,6 +1701,7 @@ fec_enet_rx_queue(struct net_device *ndev, int budget, u16 queue_id) int cpu = smp_processor_id(); struct xdp_buff xdp; struct page *page; + __fec32 cbd_bufaddr; u32 sub_len = 4; #if !defined(CONFIG_M5272) @@ -1766,12 +1770,17 @@ fec_enet_rx_queue(struct net_device *ndev, int budget, u16 queue_id) index = fec_enet_get_bd_index(bdp, &rxq->bd); page = rxq->rx_skb_info[index].page; + cbd_bufaddr = bdp->cbd_bufaddr; + if (fec_enet_update_cbd(rxq, bdp, index)) { + ndev->stats.rx_dropped++; + goto rx_processing_done; + } + dma_sync_single_for_cpu(&fep->pdev->dev, - fec32_to_cpu(bdp->cbd_bufaddr), + fec32_to_cpu(cbd_bufaddr), pkt_len, DMA_FROM_DEVICE); prefetch(page_address(page)); - fec_enet_update_cbd(rxq, bdp, index); if (xdp_prog) { xdp_buff_clear_frags_flag(&xdp); -- 2.51.0 From 0a5b8fff01bde1b9908f00004c676f2e2459333b Mon Sep 17 00:00:00 2001 From: Victor Nogueira Date: Sat, 11 Jan 2025 18:15:15 -0300 Subject: [PATCH 11/16] selftests: net: Adapt ethtool mq tests to fix in qdisc graft Because of patch[1] the graft behaviour changed So the command: tcq replace parent 100:1 handle 204: Is no longer valid and will not delete 100:4 added by command: tcq replace parent 100:4 handle 204: pfifo_fast So to maintain the original behaviour, this patch manually deletes 100:4 and grafts 100:1 Note: This change will also work fine without [1] [1] https://lore.kernel.org/netdev/20250111151455.75480-1-jhs@mojatatu.com/T/#u Signed-off-by: Victor Nogueira Reviewed-by: Jamal Hadi Salim Signed-off-by: David S. Miller --- .../selftests/drivers/net/netdevsim/tc-mq-visibility.sh | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/tools/testing/selftests/drivers/net/netdevsim/tc-mq-visibility.sh b/tools/testing/selftests/drivers/net/netdevsim/tc-mq-visibility.sh index fd13c8cfb7a8..b411fe66510f 100755 --- a/tools/testing/selftests/drivers/net/netdevsim/tc-mq-visibility.sh +++ b/tools/testing/selftests/drivers/net/netdevsim/tc-mq-visibility.sh @@ -58,9 +58,12 @@ for root in mq mqprio; do ethtool -L $NDEV combined 4 n_child_assert 4 "One real queue, rest default" - # Graft some - tcq replace parent 100:1 handle 204: - n_child_assert 3 "Grafted" + # Remove real one + tcq del parent 100:4 handle 204: + + # Replace default with pfifo + tcq replace parent 100:1 handle 205: pfifo limit 1000 + n_child_assert 3 "Deleting real one, replacing default one with pfifo" ethtool -L $NDEV combined 1 n_child_assert 1 "Grafted, one" -- 2.51.0 From 5c71729ab92c7e710d48ed93043a2d1e35cc8d3c Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Tue, 14 Jan 2025 18:47:20 +0200 Subject: [PATCH 12/16] net: pcs: xpcs: fix DW_VR_MII_DIG_CTRL1_2G5_EN bit being set for 1G SGMII w/o inband On a port with SGMII fixed-link at SPEED_1000, DW_VR_MII_DIG_CTRL1 gets set to 0x2404. This is incorrect, because bit 2 (DW_VR_MII_DIG_CTRL1_2G5_EN) is set. It comes from the previous write to DW_VR_MII_AN_CTRL, because the "val" variable is reused and is dirty. Actually, its value is 0x4, aka FIELD_PREP(DW_VR_MII_PCS_MODE_MASK, DW_VR_MII_PCS_MODE_C37_SGMII). Resolve the issue by clearing "val" to 0 when writing to a new register. After the fix, the register value is 0x2400. Prior to the blamed commit, when the read-modify-write was open-coded, the code saved the content of the DW_VR_MII_DIG_CTRL1 register in the "ret" variable. Fixes: ce8d6081fcf4 ("net: pcs: xpcs: add _modify() accessors") Signed-off-by: Vladimir Oltean Reviewed-by: Maxime Chevallier Reviewed-by: Russell King (Oracle) Link: https://patch.msgid.link/20250114164721.2879380-1-vladimir.oltean@nxp.com Signed-off-by: Jakub Kicinski --- drivers/net/pcs/pcs-xpcs.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/pcs/pcs-xpcs.c b/drivers/net/pcs/pcs-xpcs.c index 7246a910728d..85cbf144ca44 100644 --- a/drivers/net/pcs/pcs-xpcs.c +++ b/drivers/net/pcs/pcs-xpcs.c @@ -684,6 +684,7 @@ static int xpcs_config_aneg_c37_sgmii(struct dw_xpcs *xpcs, if (ret < 0) return ret; + val = 0; mask = DW_VR_MII_DIG_CTRL1_MAC_AUTO_SW; if (neg_mode == PHYLINK_PCS_NEG_INBAND_ENABLED) val = DW_VR_MII_DIG_CTRL1_MAC_AUTO_SW; -- 2.51.0 From d6e3316a1680305da291a5b5deaf424559aaf06c Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Tue, 14 Jan 2025 18:47:21 +0200 Subject: [PATCH 13/16] net: pcs: xpcs: actively unset DW_VR_MII_DIG_CTRL1_2G5_EN for 1G SGMII xpcs_config_2500basex() sets DW_VR_MII_DIG_CTRL1_2G5_EN, but xpcs_config_aneg_c37_sgmii() never unsets it. So, on a protocol change from 2500base-x to sgmii, the DW_VR_MII_DIG_CTRL1_2G5_EN bit will remain set. Fixes: f27abde3042a ("net: pcs: add 2500BASEX support for Intel mGbE controller") Signed-off-by: Vladimir Oltean Reviewed-by: Maxime Chevallier Reviewed-by: Russell King (Oracle) Link: https://patch.msgid.link/20250114164721.2879380-2-vladimir.oltean@nxp.com Signed-off-by: Jakub Kicinski --- drivers/net/pcs/pcs-xpcs.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/net/pcs/pcs-xpcs.c b/drivers/net/pcs/pcs-xpcs.c index 85cbf144ca44..3059435af596 100644 --- a/drivers/net/pcs/pcs-xpcs.c +++ b/drivers/net/pcs/pcs-xpcs.c @@ -685,7 +685,8 @@ static int xpcs_config_aneg_c37_sgmii(struct dw_xpcs *xpcs, return ret; val = 0; - mask = DW_VR_MII_DIG_CTRL1_MAC_AUTO_SW; + mask = DW_VR_MII_DIG_CTRL1_2G5_EN | DW_VR_MII_DIG_CTRL1_MAC_AUTO_SW; + if (neg_mode == PHYLINK_PCS_NEG_INBAND_ENABLED) val = DW_VR_MII_DIG_CTRL1_MAC_AUTO_SW; -- 2.51.0 From 6be7aca91009865d8c2b73589270224a6b6e67ab Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Sun, 12 Jan 2025 22:59:59 +0100 Subject: [PATCH 14/16] net: ethernet: xgbe: re-add aneg to supported features in PHY quirks In 4.19, before the switch to linkmode bitmaps, PHY_GBIT_FEATURES included feature bits for aneg and TP/MII ports. SUPPORTED_TP | \ SUPPORTED_MII) SUPPORTED_10baseT_Full) SUPPORTED_100baseT_Full) SUPPORTED_1000baseT_Full) PHY_100BT_FEATURES | \ PHY_DEFAULT_FEATURES) PHY_1000BT_FEATURES) Referenced commit expanded PHY_GBIT_FEATURES, silently removing PHY_DEFAULT_FEATURES. The removed part can be re-added by using the new PHY_GBIT_FEATURES definition. Not clear to me is why nobody seems to have noticed this issue. I stumbled across this when checking what it takes to make phy_10_100_features_array et al private to phylib. Fixes: d0939c26c53a ("net: ethernet: xgbe: expand PHY_GBIT_FEAUTRES") Cc: stable@vger.kernel.org Signed-off-by: Heiner Kallweit Link: https://patch.msgid.link/46521973-7738-4157-9f5e-0bb6f694acba@gmail.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/amd/xgbe/xgbe-phy-v2.c | 19 ++----------------- 1 file changed, 2 insertions(+), 17 deletions(-) diff --git a/drivers/net/ethernet/amd/xgbe/xgbe-phy-v2.c b/drivers/net/ethernet/amd/xgbe/xgbe-phy-v2.c index 6a716337f48b..268399dfcf22 100644 --- a/drivers/net/ethernet/amd/xgbe/xgbe-phy-v2.c +++ b/drivers/net/ethernet/amd/xgbe/xgbe-phy-v2.c @@ -923,7 +923,6 @@ static void xgbe_phy_free_phy_device(struct xgbe_prv_data *pdata) static bool xgbe_phy_finisar_phy_quirks(struct xgbe_prv_data *pdata) { - __ETHTOOL_DECLARE_LINK_MODE_MASK(supported) = { 0, }; struct xgbe_phy_data *phy_data = pdata->phy_data; unsigned int phy_id = phy_data->phydev->phy_id; @@ -945,14 +944,7 @@ static bool xgbe_phy_finisar_phy_quirks(struct xgbe_prv_data *pdata) phy_write(phy_data->phydev, 0x04, 0x0d01); phy_write(phy_data->phydev, 0x00, 0x9140); - linkmode_set_bit_array(phy_10_100_features_array, - ARRAY_SIZE(phy_10_100_features_array), - supported); - linkmode_set_bit_array(phy_gbit_features_array, - ARRAY_SIZE(phy_gbit_features_array), - supported); - - linkmode_copy(phy_data->phydev->supported, supported); + linkmode_copy(phy_data->phydev->supported, PHY_GBIT_FEATURES); phy_support_asym_pause(phy_data->phydev); @@ -964,7 +956,6 @@ static bool xgbe_phy_finisar_phy_quirks(struct xgbe_prv_data *pdata) static bool xgbe_phy_belfuse_phy_quirks(struct xgbe_prv_data *pdata) { - __ETHTOOL_DECLARE_LINK_MODE_MASK(supported) = { 0, }; struct xgbe_phy_data *phy_data = pdata->phy_data; struct xgbe_sfp_eeprom *sfp_eeprom = &phy_data->sfp_eeprom; unsigned int phy_id = phy_data->phydev->phy_id; @@ -1028,13 +1019,7 @@ static bool xgbe_phy_belfuse_phy_quirks(struct xgbe_prv_data *pdata) reg = phy_read(phy_data->phydev, 0x00); phy_write(phy_data->phydev, 0x00, reg & ~0x00800); - linkmode_set_bit_array(phy_10_100_features_array, - ARRAY_SIZE(phy_10_100_features_array), - supported); - linkmode_set_bit_array(phy_gbit_features_array, - ARRAY_SIZE(phy_gbit_features_array), - supported); - linkmode_copy(phy_data->phydev->supported, supported); + linkmode_copy(phy_data->phydev->supported, PHY_GBIT_FEATURES); phy_support_asym_pause(phy_data->phydev); netif_dbg(pdata, drv, pdata->netdev, -- 2.51.0 From cbc16bceea784210d585a42ac9f8f10ce62b300e Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Wed, 8 Jan 2025 14:06:22 -0800 Subject: [PATCH 15/16] net: make page_pool_ref_netmem work with net iovs page_pool_ref_netmem() should work with either netmem representation, but currently it casts to a page with netmem_to_page(), which will fail with net iovs. Use netmem_get_pp_ref_count_ref() instead. Fixes: 8ab79ed50cf1 ("page_pool: devmem support") Signed-off-by: Pavel Begunkov Signed-off-by: David Wei Link: https://lore.kernel.org/20250108220644.3528845-2-dw@davidwei.uk Signed-off-by: Jakub Kicinski --- include/net/page_pool/helpers.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/net/page_pool/helpers.h b/include/net/page_pool/helpers.h index 793e6fd78bc5..60a5347922be 100644 --- a/include/net/page_pool/helpers.h +++ b/include/net/page_pool/helpers.h @@ -294,7 +294,7 @@ static inline long page_pool_unref_page(struct page *page, long nr) static inline void page_pool_ref_netmem(netmem_ref netmem) { - atomic_long_inc(&netmem_to_page(netmem)->pp_ref_count); + atomic_long_inc(netmem_get_pp_ref_count_ref(netmem)); } static inline void page_pool_ref_page(struct page *page) -- 2.51.0 From c08d3e62b2e73e14da318a1d20b52d0486a28ee0 Mon Sep 17 00:00:00 2001 From: Patrisious Haddad Date: Wed, 15 Jan 2025 13:39:04 +0200 Subject: [PATCH 16/16] net/mlx5: Fix RDMA TX steering prio User added steering rules at RDMA_TX were being added to the first prio, which is the counters prio. Fix that so that they are correctly added to the BYPASS_PRIO instead. Fixes: 24670b1a3166 ("net/mlx5: Add support for RDMA TX steering") Signed-off-by: Patrisious Haddad Reviewed-by: Mark Bloch Reviewed-by: Jacob Keller Signed-off-by: Tariq Toukan Signed-off-by: Paolo Abeni --- drivers/net/ethernet/mellanox/mlx5/core/fs_core.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c index 2eabfcc247c6..0ce999706d41 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c @@ -2709,6 +2709,7 @@ struct mlx5_flow_namespace *mlx5_get_flow_namespace(struct mlx5_core_dev *dev, break; case MLX5_FLOW_NAMESPACE_RDMA_TX: root_ns = steering->rdma_tx_root_ns; + prio = RDMA_TX_BYPASS_PRIO; break; case MLX5_FLOW_NAMESPACE_RDMA_RX_COUNTERS: root_ns = steering->rdma_rx_root_ns; -- 2.51.0