From ccb989e4d1efe0dd81b28c437443532d80d9ecee Mon Sep 17 00:00:00 2001 From: Oleksij Rempel Date: Mon, 25 Nov 2024 09:40:50 +0100 Subject: [PATCH 01/16] net: phy: microchip: Reset LAN88xx PHY to ensure clean link state on LAN7800/7850 Fix outdated MII_LPA data in the LAN88xx PHY, which is used in LAN7800 and LAN7850 USB Ethernet controllers. Due to a hardware limitation, the PHY cannot reliably update link status after parallel detection when the link partner does not support auto-negotiation. To mitigate this, add a PHY reset in `lan88xx_link_change_notify()` when `phydev->state` is `PHY_NOLINK`, ensuring the PHY starts in a clean state and reports accurate fixed link parallel detection results. Fixes: 792aec47d59d9 ("add microchip LAN88xx phy driver") Signed-off-by: Oleksij Rempel Link: https://patch.msgid.link/20241125084050.414352-1-o.rempel@pengutronix.de Signed-off-by: Jakub Kicinski --- drivers/net/phy/microchip.c | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/drivers/net/phy/microchip.c b/drivers/net/phy/microchip.c index d3273bc0da4a..691969a4910f 100644 --- a/drivers/net/phy/microchip.c +++ b/drivers/net/phy/microchip.c @@ -351,6 +351,22 @@ static int lan88xx_config_aneg(struct phy_device *phydev) static void lan88xx_link_change_notify(struct phy_device *phydev) { int temp; + int ret; + + /* Reset PHY to ensure MII_LPA provides up-to-date information. This + * issue is reproducible only after parallel detection, as described + * in IEEE 802.3-2022, Section 28.2.3.1 ("Parallel detection function"), + * where the link partner does not support auto-negotiation. + */ + if (phydev->state == PHY_NOLINK) { + ret = phy_init_hw(phydev); + if (ret < 0) + goto link_change_notify_failed; + + ret = _phy_start_aneg(phydev); + if (ret < 0) + goto link_change_notify_failed; + } /* At forced 100 F/H mode, chip may fail to set mode correctly * when cable is switched between long(~50+m) and short one. @@ -377,6 +393,11 @@ static void lan88xx_link_change_notify(struct phy_device *phydev) temp |= LAN88XX_INT_MASK_MDINTPIN_EN_; phy_write(phydev, LAN88XX_INT_MASK, temp); } + + return; + +link_change_notify_failed: + phydev_err(phydev, "Link change process failed %pe\n", ERR_PTR(ret)); } /** -- 2.51.0 From 3301ab7d5aeb0fe270f73a3d4810c9d1b6a9f045 Mon Sep 17 00:00:00 2001 From: Jiri Wiesner Date: Thu, 28 Nov 2024 09:59:50 +0100 Subject: [PATCH 02/16] net/ipv6: release expired exception dst cached in socket Dst objects get leaked in ip6_negative_advice() when this function is executed for an expired IPv6 route located in the exception table. There are several conditions that must be fulfilled for the leak to occur: * an ICMPv6 packet indicating a change of the MTU for the path is received, resulting in an exception dst being created * a TCP connection that uses the exception dst for routing packets must start timing out so that TCP begins retransmissions * after the exception dst expires, the FIB6 garbage collector must not run before TCP executes ip6_negative_advice() for the expired exception dst When TCP executes ip6_negative_advice() for an exception dst that has expired and if no other socket holds a reference to the exception dst, the refcount of the exception dst is 2, which corresponds to the increment made by dst_init() and the increment made by the TCP socket for which the connection is timing out. The refcount made by the socket is never released. 
The refcount of the dst is decremented in sk_dst_reset() but that decrement is counteracted by a dst_hold() intentionally placed just before the sk_dst_reset() in ip6_negative_advice(). After ip6_negative_advice() has finished, there is no other object tied to the dst. The socket lost its reference stored in sk_dst_cache and the dst is no longer in the exception table. The exception dst becomes a leaked object. As a result of this dst leak, an unbalanced refcount is reported for the loopback device of a net namespace being destroyed under kernels that do not contain e5f80fcf869a ("ipv6: give an IPv6 dev to blackhole_netdev"): unregister_netdevice: waiting for lo to become free. Usage count = 2 Fix the dst leak by removing the dst_hold() in ip6_negative_advice(). The patch that introduced the dst_hold() in ip6_negative_advice() was 92f1655aa2b22 ("net: fix __dst_negative_advice() race"). But 92f1655aa2b22 merely refactored the code with regards to the dst refcount so the issue was present even before 92f1655aa2b22. The bug was introduced in 54c1a859efd9f ("ipv6: Don't drop cache route entry unless timer actually expired.") where the expired cached route is deleted and the sk_dst_cache member of the socket is set to NULL by calling dst_negative_advice() but the refcount belonging to the socket is left unbalanced. The IPv4 version - ipv4_negative_advice() - is not affected by this bug. When the TCP connection times out ipv4_negative_advice() merely resets the sk_dst_cache of the socket while decrementing the refcount of the exception dst. Fixes: 92f1655aa2b22 ("net: fix __dst_negative_advice() race") Fixes: 54c1a859efd9f ("ipv6: Don't drop cache route entry unless timer actually expired.") Link: https://lore.kernel.org/netdev/20241113105611.GA6723@incl/T/#u Signed-off-by: Jiri Wiesner Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20241128085950.GA4505@incl Signed-off-by: Jakub Kicinski --- net/ipv6/route.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 63d7681c929f..67ff16c04718 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -2780,10 +2780,10 @@ static void ip6_negative_advice(struct sock *sk, if (rt->rt6i_flags & RTF_CACHE) { rcu_read_lock(); if (rt6_check_expired(rt)) { - /* counteract the dst_release() in sk_dst_reset() */ - dst_hold(dst); + /* rt/dst can not be destroyed yet, + * because of rcu_read_lock() + */ sk_dst_reset(sk); - rt6_remove_exception_rt(rt); } rcu_read_unlock(); -- 2.51.0 From 22be4727a8f898442066bcac34f8a1ad0bc72e14 Mon Sep 17 00:00:00 2001 From: Ivan Solodovnikov Date: Tue, 26 Nov 2024 17:39:02 +0300 Subject: [PATCH 03/16] dccp: Fix memory leak in dccp_feat_change_recv If dccp_feat_push_confirm() fails after new value for SP feature was accepted without reconciliation ('entry == NULL' branch), memory allocated for that value with dccp_feat_clone_sp_val() is never freed. Here is the kmemleak stack for this: unreferenced object 0xffff88801d4ab488 (size 8): comm "syz-executor310", pid 1127, jiffies 4295085598 (age 41.666s) hex dump (first 8 bytes): 01 b4 4a 1d 80 88 ff ff ..J..... 
backtrace: [<00000000db7cabfe>] kmemdup+0x23/0x50 mm/util.c:128 [<0000000019b38405>] kmemdup include/linux/string.h:465 [inline] [<0000000019b38405>] dccp_feat_clone_sp_val net/dccp/feat.c:371 [inline] [<0000000019b38405>] dccp_feat_clone_sp_val net/dccp/feat.c:367 [inline] [<0000000019b38405>] dccp_feat_change_recv net/dccp/feat.c:1145 [inline] [<0000000019b38405>] dccp_feat_parse_options+0x1196/0x2180 net/dccp/feat.c:1416 [<00000000b1f6d94a>] dccp_parse_options+0xa2a/0x1260 net/dccp/options.c:125 [<0000000030d7b621>] dccp_rcv_state_process+0x197/0x13d0 net/dccp/input.c:650 [<000000001f74c72e>] dccp_v4_do_rcv+0xf9/0x1a0 net/dccp/ipv4.c:688 [<00000000a6c24128>] sk_backlog_rcv include/net/sock.h:1041 [inline] [<00000000a6c24128>] __release_sock+0x139/0x3b0 net/core/sock.c:2570 [<00000000cf1f3a53>] release_sock+0x54/0x1b0 net/core/sock.c:3111 [<000000008422fa23>] inet_wait_for_connect net/ipv4/af_inet.c:603 [inline] [<000000008422fa23>] __inet_stream_connect+0x5d0/0xf70 net/ipv4/af_inet.c:696 [<0000000015b6f64d>] inet_stream_connect+0x53/0xa0 net/ipv4/af_inet.c:735 [<0000000010122488>] __sys_connect_file+0x15c/0x1a0 net/socket.c:1865 [<00000000b4b70023>] __sys_connect+0x165/0x1a0 net/socket.c:1882 [<00000000f4cb3815>] __do_sys_connect net/socket.c:1892 [inline] [<00000000f4cb3815>] __se_sys_connect net/socket.c:1889 [inline] [<00000000f4cb3815>] __x64_sys_connect+0x6e/0xb0 net/socket.c:1889 [<00000000e7b1e839>] do_syscall_64+0x33/0x40 arch/x86/entry/common.c:46 [<0000000055e91434>] entry_SYSCALL_64_after_hwframe+0x67/0xd1 Clean up the allocated memory in case of dccp_feat_push_confirm() failure and bail out with an error reset code. Found by Linux Verification Center (linuxtesting.org) with Syzkaller. Fixes: e77b8363b2ea ("dccp: Process incoming Change feature-negotiation options") Signed-off-by: Ivan Solodovnikov Link: https://patch.msgid.link/20241126143902.190853-1-solodovnikov.ia@phystech.edu Signed-off-by: Paolo Abeni --- net/dccp/feat.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/net/dccp/feat.c b/net/dccp/feat.c index 54086bb05c42..f7554dcdaaba 100644 --- a/net/dccp/feat.c +++ b/net/dccp/feat.c @@ -1166,8 +1166,12 @@ static u8 dccp_feat_change_recv(struct list_head *fn, u8 is_mandatory, u8 opt, goto not_valid_or_not_known; } - return dccp_feat_push_confirm(fn, feat, local, &fval); + if (dccp_feat_push_confirm(fn, feat, local, &fval)) { + kfree(fval.sp.vec); + return DCCP_RESET_CODE_TOO_BUSY; + } + return 0; } else if (entry->state == FEAT_UNSTABLE) { /* 6.6.2 */ return 0; } -- 2.51.0 From 6a2fa13312e51a621f652d522d7e2df7066330b6 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Wed, 27 Nov 2024 14:05:12 +0900 Subject: [PATCH 04/16] tipc: Fix use-after-free of kernel socket in cleanup_bearer(). syzkaller reported a use-after-free of UDP kernel socket in cleanup_bearer() without repro. [0][1] When bearer_disable() calls tipc_udp_disable(), cleanup of the UDP kernel socket is deferred by work calling cleanup_bearer(). tipc_net_stop() waits for such works to finish by checking tipc_net(net)->wq_count. However, the work decrements the count too early before releasing the kernel socket, unblocking cleanup_net() and resulting in use-after-free. Let's move the decrement after releasing the socket in cleanup_bearer(). 
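As a rough sketch of the ordering rule the fix enforces (simplified, made-up names, not the actual TIPC structures): the counter that the netns teardown path waits on must only drop after every resource owned by the work item has been released.

  /* Hypothetical, simplified cleanup work item: release the kernel socket
   * first, and only then decrement the counter that the netns teardown
   * path is waiting on.
   */
  static atomic_t pending_cleanups = ATOMIC_INIT(0);

  struct bearer_ctx {
          struct work_struct work;
          struct socket *sock;
  };

  static void bearer_cleanup_work(struct work_struct *work)
  {
          struct bearer_ctx *ctx = container_of(work, struct bearer_ctx, work);

          sock_release(ctx->sock);        /* kernel socket goes away here */
          synchronize_net();
          atomic_dec(&pending_cleanups);  /* only now unblock the waiter */
          kfree(ctx);
  }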
[0]: ref_tracker: net notrefcnt@000000009b3d1faf has 1/1 users at sk_alloc+0x438/0x608 inet_create+0x4c8/0xcb0 __sock_create+0x350/0x6b8 sock_create_kern+0x58/0x78 udp_sock_create4+0x68/0x398 udp_sock_create+0x88/0xc8 tipc_udp_enable+0x5e8/0x848 __tipc_nl_bearer_enable+0x84c/0xed8 tipc_nl_bearer_enable+0x38/0x60 genl_family_rcv_msg_doit+0x170/0x248 genl_rcv_msg+0x400/0x5b0 netlink_rcv_skb+0x1dc/0x398 genl_rcv+0x44/0x68 netlink_unicast+0x678/0x8b0 netlink_sendmsg+0x5e4/0x898 ____sys_sendmsg+0x500/0x830 [1]: BUG: KMSAN: use-after-free in udp_hashslot include/net/udp.h:85 [inline] BUG: KMSAN: use-after-free in udp_lib_unhash+0x3b8/0x930 net/ipv4/udp.c:1979 udp_hashslot include/net/udp.h:85 [inline] udp_lib_unhash+0x3b8/0x930 net/ipv4/udp.c:1979 sk_common_release+0xaf/0x3f0 net/core/sock.c:3820 inet_release+0x1e0/0x260 net/ipv4/af_inet.c:437 inet6_release+0x6f/0xd0 net/ipv6/af_inet6.c:489 __sock_release net/socket.c:658 [inline] sock_release+0xa0/0x210 net/socket.c:686 cleanup_bearer+0x42d/0x4c0 net/tipc/udp_media.c:819 process_one_work kernel/workqueue.c:3229 [inline] process_scheduled_works+0xcaf/0x1c90 kernel/workqueue.c:3310 worker_thread+0xf6c/0x1510 kernel/workqueue.c:3391 kthread+0x531/0x6b0 kernel/kthread.c:389 ret_from_fork+0x60/0x80 arch/x86/kernel/process.c:147 ret_from_fork_asm+0x11/0x20 arch/x86/entry/entry_64.S:244 Uninit was created at: slab_free_hook mm/slub.c:2269 [inline] slab_free mm/slub.c:4580 [inline] kmem_cache_free+0x207/0xc40 mm/slub.c:4682 net_free net/core/net_namespace.c:454 [inline] cleanup_net+0x16f2/0x19d0 net/core/net_namespace.c:647 process_one_work kernel/workqueue.c:3229 [inline] process_scheduled_works+0xcaf/0x1c90 kernel/workqueue.c:3310 worker_thread+0xf6c/0x1510 kernel/workqueue.c:3391 kthread+0x531/0x6b0 kernel/kthread.c:389 ret_from_fork+0x60/0x80 arch/x86/kernel/process.c:147 ret_from_fork_asm+0x11/0x20 arch/x86/entry/entry_64.S:244 CPU: 0 UID: 0 PID: 54 Comm: kworker/0:2 Not tainted 6.12.0-rc1-00131-gf66ebf37d69c #7 91723d6f74857f70725e1583cba3cf4adc716cfa Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.16.3-0-ga6ed6b701f0a-prebuilt.qemu.org 04/01/2014 Workqueue: events cleanup_bearer Fixes: 26abe14379f8 ("net: Modify sk_alloc to not reference count the netns of kernel sockets.") Reported-by: syzkaller Signed-off-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20241127050512.28438-1-kuniyu@amazon.com Signed-off-by: Paolo Abeni --- net/tipc/udp_media.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/tipc/udp_media.c b/net/tipc/udp_media.c index 439f75539977..b7e25e7e9933 100644 --- a/net/tipc/udp_media.c +++ b/net/tipc/udp_media.c @@ -814,10 +814,10 @@ static void cleanup_bearer(struct work_struct *work) kfree_rcu(rcast, rcu); } - atomic_dec(&tipc_net(sock_net(ub->ubsock->sk))->wq_count); dst_cache_destroy(&ub->rcast.dst_cache); udp_tunnel_sock_release(ub->ubsock); synchronize_net(); + atomic_dec(&tipc_net(sock_net(ub->ubsock->sk))->wq_count); kfree(ub); } -- 2.51.0 From 0541db8ee32c09463a72d0987382b3a3336b0043 Mon Sep 17 00:00:00 2001 From: Wen Gu Date: Wed, 27 Nov 2024 21:30:13 +0800 Subject: [PATCH 05/16] net/smc: initialize close_work early to avoid warning We encountered a warning that close_work was canceled before initialization. WARNING: CPU: 7 PID: 111103 at kernel/workqueue.c:3047 __flush_work+0x19e/0x1b0 Workqueue: events smc_lgr_terminate_work [smc] RIP: 0010:__flush_work+0x19e/0x1b0 Call Trace: ? __wake_up_common+0x7a/0x190 ? 
work_busy+0x80/0x80 __cancel_work_timer+0xe3/0x160 smc_close_cancel_work+0x1a/0x70 [smc] smc_close_active_abort+0x207/0x360 [smc] __smc_lgr_terminate.part.38+0xc8/0x180 [smc] process_one_work+0x19e/0x340 worker_thread+0x30/0x370 ? process_one_work+0x340/0x340 kthread+0x117/0x130 ? __kthread_cancel_work+0x50/0x50 ret_from_fork+0x22/0x30 This is because when smc_close_cancel_work is triggered, e.g. the RDMA driver is rmmod and the LGR is terminated, the conn->close_work is flushed before initialization, resulting in WARN_ON(!work->func). __smc_lgr_terminate | smc_connect_{rdma|ism} ------------------------------------------------------------- | smc_conn_create | \- smc_lgr_register_conn for conn in lgr->conns_all | \- smc_conn_kill | \- smc_close_active_abort | \- smc_close_cancel_work | \- cancel_work_sync | \- __flush_work | (close_work) | | smc_close_init | \- INIT_WORK(&close_work) So fix this by initializing close_work before establishing the connection. Fixes: 46c28dbd4c23 ("net/smc: no socket state changes in tasklet context") Fixes: 413498440e30 ("net/smc: add SMC-D support in af_smc") Signed-off-by: Wen Gu Reviewed-by: Wenjia Zhang Reviewed-by: Alexandra Winter Signed-off-by: Paolo Abeni --- net/smc/af_smc.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index 9d76e902fd77..ed6d4d520bc7 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -383,6 +383,7 @@ void smc_sk_init(struct net *net, struct sock *sk, int protocol) smc->limit_smc_hs = net->smc.limit_smc_hs; smc->use_fallback = false; /* assume rdma capability first */ smc->fallback_rsn = 0; + smc_close_init(smc); } static struct sock *smc_sock_alloc(struct net *net, struct socket *sock, @@ -1299,7 +1300,6 @@ static int smc_connect_rdma(struct smc_sock *smc, goto connect_abort; } - smc_close_init(smc); smc_rx_init(smc); if (ini->first_contact_local) { @@ -1435,7 +1435,6 @@ static int smc_connect_ism(struct smc_sock *smc, goto connect_abort; } } - smc_close_init(smc); smc_rx_init(smc); smc_tx_init(smc); @@ -2479,7 +2478,6 @@ static void smc_listen_work(struct work_struct *work) goto out_decl; mutex_lock(&smc_server_lgr_pending); - smc_close_init(new_smc); smc_rx_init(new_smc); smc_tx_init(new_smc); -- 2.51.0 From 2c7f14ed9c19ec0f149479d1c2842ec1f9bf76d7 Mon Sep 17 00:00:00 2001 From: Wen Gu Date: Wed, 27 Nov 2024 21:30:14 +0800 Subject: [PATCH 06/16] net/smc: fix LGR and link use-after-free issue We encountered a LGR/link use-after-free issue, which manifested as the LGR/link refcnt reaching 0 early and entering the clear process, making resource access unsafe. refcount_t: addition on 0; use-after-free. WARNING: CPU: 14 PID: 107447 at lib/refcount.c:25 refcount_warn_saturate+0x9c/0x140 Workqueue: events smc_lgr_terminate_work [smc] Call trace: refcount_warn_saturate+0x9c/0x140 __smc_lgr_terminate.part.45+0x2a8/0x370 [smc] smc_lgr_terminate_work+0x28/0x30 [smc] process_one_work+0x1b8/0x420 worker_thread+0x158/0x510 kthread+0x114/0x118 or refcount_t: underflow; use-after-free. WARNING: CPU: 6 PID: 93140 at lib/refcount.c:28 refcount_warn_saturate+0xf0/0x140 Workqueue: smc_hs_wq smc_listen_work [smc] Call trace: refcount_warn_saturate+0xf0/0x140 smcr_link_put+0x1cc/0x1d8 [smc] smc_conn_free+0x110/0x1b0 [smc] smc_conn_abort+0x50/0x60 [smc] smc_listen_find_device+0x75c/0x790 [smc] smc_listen_work+0x368/0x8a0 [smc] process_one_work+0x1b8/0x420 worker_thread+0x158/0x510 kthread+0x114/0x118 It is caused by repeated release of LGR/link refcnt. 
One suspect is that smc_conn_free() is called repeatedly because some smc_conn_free() calls from the server listening path are not protected by the sock lock. e.g. Calls under socklock | smc_listen_work ------------------------------------------------------- lock_sock(sk) | smc_conn_abort smc_conn_free | \- smc_conn_free \- smcr_link_put | \- smcr_link_put (duplicated) release_sock(sk) So add sock lock protection in the smc_listen_work() path, making it exclusive with other connection operations.
Fixes: 3b2dec2603d5 ("net/smc: restructure client and server code in af_smc") Co-developed-by: Guangguan Wang Signed-off-by: Guangguan Wang Co-developed-by: Kai Signed-off-by: Kai Signed-off-by: Wen Gu Reviewed-by: Wenjia Zhang Signed-off-by: Paolo Abeni --- net/smc/af_smc.c | 2 ++ 1 file changed, 2 insertions(+)
diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index ed6d4d520bc7..9e6c69d18581 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -1900,6 +1900,7 @@ static void smc_listen_out(struct smc_sock *new_smc) if (tcp_sk(new_smc->clcsock->sk)->syn_smc) atomic_dec(&lsmc->queued_smc_hs); + release_sock(newsmcsk); /* lock in smc_listen_work() */ if (lsmc->sk.sk_state == SMC_LISTEN) { lock_sock_nested(&lsmc->sk, SINGLE_DEPTH_NESTING); smc_accept_enqueue(&lsmc->sk, newsmcsk); @@ -2421,6 +2422,7 @@ static void smc_listen_work(struct work_struct *work) u8 accept_version; int rc = 0; + lock_sock(&new_smc->sk); /* release in smc_listen_out() */ if (new_smc->listen_smc->sk.sk_state != SMC_LISTEN) return smc_listen_out_err(new_smc); -- 2.51.0
From 7a0ea70da56ee8c2716d0b79e9959d3c47efab62 Mon Sep 17 00:00:00 2001 From: Louis Leseur Date: Thu, 28 Nov 2024 09:33:58 +0100 Subject: [PATCH 07/16] net/qed: allow old cards not supporting "num_images" to work
Commit 43645ce03e00 ("qed: Populate nvm image attribute shadow.") added support for populating flash image attributes, notably "num_images". However, some cards were not able to return this information. In such cases, the driver would return EINVAL, causing the driver to exit. Add a check to return EOPNOTSUPP instead of EINVAL when the card is not able to return this information. The caller function already handles EOPNOTSUPP without error.
Fixes: 43645ce03e00 ("qed: Populate nvm image attribute shadow.") Co-developed-by: Florian Forestier Signed-off-by: Florian Forestier Signed-off-by: Louis Leseur Link: https://patch.msgid.link/20241128083633.26431-1-louis.leseur@gmail.com Signed-off-by: Paolo Abeni --- drivers/net/ethernet/qlogic/qed/qed_mcp.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-)
diff --git a/drivers/net/ethernet/qlogic/qed/qed_mcp.c b/drivers/net/ethernet/qlogic/qed/qed_mcp.c index 26a714bfad4e..b45efc272fdb 100644 --- a/drivers/net/ethernet/qlogic/qed/qed_mcp.c +++ b/drivers/net/ethernet/qlogic/qed/qed_mcp.c @@ -3301,7 +3301,9 @@ int qed_mcp_bist_nvm_get_num_images(struct qed_hwfn *p_hwfn, if (rc) return rc; - if (((rsp & FW_MSG_CODE_MASK) != FW_MSG_CODE_OK)) + if (((rsp & FW_MSG_CODE_MASK) == FW_MSG_CODE_UNSUPPORTED)) + rc = -EOPNOTSUPP; + else if (((rsp & FW_MSG_CODE_MASK) != FW_MSG_CODE_OK)) rc = -EINVAL; return rc; -- 2.51.0
From 48327566769a6ff2e873b6bf075392bd756625ca Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Fri, 29 Nov 2024 13:25:19 -0800 Subject: [PATCH 08/16] rtnetlink: fix double call of rtnl_link_get_net_ifla()
Currently rtnl_link_get_net_ifla() gets called twice when we create peer devices, once in rtnl_add_peer_net() and once in each ->newlink() implementation.
This looks safer; however, it leads to a classic Time-of-Check to Time-of-Use (TOCTOU) bug since IFLA_NET_NS_PID is very dynamic. And because the error pointer returned by the second call is not checked, it also leads to a kernel crash, as reported by syzbot. Fix this by getting rid of the second call, which already becomes redundant after Kuniyuki's work. We have to propagate the result of the first rtnl_link_get_net_ifla() down to each ->newlink().
Reported-by: syzbot+21ba4d5adff0b6a7cfc6@syzkaller.appspotmail.com Closes: https://syzkaller.appspot.com/bug?extid=21ba4d5adff0b6a7cfc6 Fixes: 0eb87b02a705 ("veth: Set VETH_INFO_PEER to veth_link_ops.peer_type.") Fixes: 6b84e558e95d ("vxcan: Set VXCAN_INFO_PEER to vxcan_link_ops.peer_type.") Fixes: fefd5d082172 ("netkit: Set IFLA_NETKIT_PEER_INFO to netkit_link_ops.peer_type.") Cc: Kuniyuki Iwashima Signed-off-by: Cong Wang Reviewed-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20241129212519.825567-1-xiyou.wangcong@gmail.com Signed-off-by: Paolo Abeni --- drivers/net/can/vxcan.c | 10 ++-------- drivers/net/netkit.c | 11 +++-------- drivers/net/veth.c | 12 +++-------- net/core/rtnetlink.c | 44 +++++++++++++++++++++-------------------- 4 files changed, 31 insertions(+), 46 deletions(-)
diff --git a/drivers/net/can/vxcan.c b/drivers/net/can/vxcan.c index da7c72105fb6..ca8811941085 100644 --- a/drivers/net/can/vxcan.c +++ b/drivers/net/can/vxcan.c @@ -172,13 +172,12 @@ static void vxcan_setup(struct net_device *dev) /* forward declaration for rtnl_create_link() */ static struct rtnl_link_ops vxcan_link_ops; -static int vxcan_newlink(struct net *net, struct net_device *dev, +static int vxcan_newlink(struct net *peer_net, struct net_device *dev, struct nlattr *tb[], struct nlattr *data[], struct netlink_ext_ack *extack) { struct vxcan_priv *priv; struct net_device *peer; - struct net *peer_net; struct nlattr *peer_tb[IFLA_MAX + 1], **tbp = tb; char ifname[IFNAMSIZ]; @@ -203,20 +202,15 @@ static int vxcan_newlink(struct net *net, struct net_device *dev, name_assign_type = NET_NAME_ENUM; } - peer_net = rtnl_link_get_net(net, tbp); peer = rtnl_create_link(peer_net, ifname, name_assign_type, &vxcan_link_ops, tbp, extack); - if (IS_ERR(peer)) { - put_net(peer_net); + if (IS_ERR(peer)) return PTR_ERR(peer); - } if (ifmp && dev->ifindex) peer->ifindex = ifmp->ifi_index; err = register_netdevice(peer); - put_net(peer_net); - peer_net = NULL; if (err < 0) { free_netdev(peer); return err;
diff --git a/drivers/net/netkit.c b/drivers/net/netkit.c index bb07725d1c72..c1d881dc6409 100644 --- a/drivers/net/netkit.c +++ b/drivers/net/netkit.c @@ -327,7 +327,7 @@ static int netkit_validate(struct nlattr *tb[], struct nlattr *data[], static struct rtnl_link_ops netkit_link_ops; -static int netkit_new_link(struct net *src_net, struct net_device *dev, +static int netkit_new_link(struct net *peer_net, struct net_device *dev, struct nlattr *tb[], struct nlattr *data[], struct netlink_ext_ack *extack) { @@ -342,7 +342,6 @@ static int netkit_new_link(struct net *src_net, struct net_device *dev, struct net_device *peer; char ifname[IFNAMSIZ]; struct netkit *nk; - struct net *net; int err; if (data) { @@ -385,13 +384,10 @@ static int netkit_new_link(struct net *src_net, struct net_device *dev, (tb[IFLA_ADDRESS] || tbp[IFLA_ADDRESS])) return -EOPNOTSUPP; - net = rtnl_link_get_net(src_net, tbp); - peer = rtnl_create_link(net, ifname, ifname_assign_type, + peer = rtnl_create_link(peer_net, ifname, ifname_assign_type, &netkit_link_ops, tbp, extack); - if (IS_ERR(peer)) { -
put_net(net); + if (IS_ERR(peer)) return PTR_ERR(peer); - } netif_inherit_tso_max(peer, dev); @@ -408,7 +404,6 @@ static int netkit_new_link(struct net *src_net, struct net_device *dev, bpf_mprog_bundle_init(&nk->bundle); err = register_netdevice(peer); - put_net(net); if (err < 0) goto err_register_peer; netif_carrier_off(peer); diff --git a/drivers/net/veth.c b/drivers/net/veth.c index 0d6d0d749d44..07ebb800edf1 100644 --- a/drivers/net/veth.c +++ b/drivers/net/veth.c @@ -1765,7 +1765,7 @@ static int veth_init_queues(struct net_device *dev, struct nlattr *tb[]) return 0; } -static int veth_newlink(struct net *src_net, struct net_device *dev, +static int veth_newlink(struct net *peer_net, struct net_device *dev, struct nlattr *tb[], struct nlattr *data[], struct netlink_ext_ack *extack) { @@ -1776,7 +1776,6 @@ static int veth_newlink(struct net *src_net, struct net_device *dev, struct nlattr *peer_tb[IFLA_MAX + 1], **tbp; unsigned char name_assign_type; struct ifinfomsg *ifmp; - struct net *net; /* * create and register peer first @@ -1800,13 +1799,10 @@ static int veth_newlink(struct net *src_net, struct net_device *dev, name_assign_type = NET_NAME_ENUM; } - net = rtnl_link_get_net(src_net, tbp); - peer = rtnl_create_link(net, ifname, name_assign_type, + peer = rtnl_create_link(peer_net, ifname, name_assign_type, &veth_link_ops, tbp, extack); - if (IS_ERR(peer)) { - put_net(net); + if (IS_ERR(peer)) return PTR_ERR(peer); - } if (!ifmp || !tbp[IFLA_ADDRESS]) eth_hw_addr_random(peer); @@ -1817,8 +1813,6 @@ static int veth_newlink(struct net *src_net, struct net_device *dev, netif_inherit_tso_max(peer, dev); err = register_netdevice(peer); - put_net(net); - net = NULL; if (err < 0) goto err_register_peer; diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 58df76fe408a..ab5f201bf0ab 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -3746,6 +3746,7 @@ static int rtnl_group_changelink(const struct sk_buff *skb, static int rtnl_newlink_create(struct sk_buff *skb, struct ifinfomsg *ifm, const struct rtnl_link_ops *ops, struct net *tgt_net, struct net *link_net, + struct net *peer_net, const struct nlmsghdr *nlh, struct nlattr **tb, struct nlattr **data, struct netlink_ext_ack *extack) @@ -3776,8 +3777,13 @@ static int rtnl_newlink_create(struct sk_buff *skb, struct ifinfomsg *ifm, dev->ifindex = ifm->ifi_index; + if (link_net) + net = link_net; + if (peer_net) + net = peer_net; + if (ops->newlink) - err = ops->newlink(link_net ? 
: net, dev, tb, data, extack); + err = ops->newlink(net, dev, tb, data, extack); else err = register_netdevice(dev); if (err < 0) { @@ -3812,40 +3818,33 @@ out_unregister: goto out; } -static int rtnl_add_peer_net(struct rtnl_nets *rtnl_nets, - const struct rtnl_link_ops *ops, - struct nlattr *data[], - struct netlink_ext_ack *extack) +static struct net *rtnl_get_peer_net(const struct rtnl_link_ops *ops, + struct nlattr *data[], + struct netlink_ext_ack *extack) { struct nlattr *tb[IFLA_MAX + 1]; - struct net *net; int err; if (!data || !data[ops->peer_type]) - return 0; + return NULL; err = rtnl_nla_parse_ifinfomsg(tb, data[ops->peer_type], extack); if (err < 0) - return err; + return ERR_PTR(err); if (ops->validate) { err = ops->validate(tb, NULL, extack); if (err < 0) - return err; + return ERR_PTR(err); } - net = rtnl_link_get_net_ifla(tb); - if (IS_ERR(net)) - return PTR_ERR(net); - if (net) - rtnl_nets_add(rtnl_nets, net); - - return 0; + return rtnl_link_get_net_ifla(tb); } static int __rtnl_newlink(struct sk_buff *skb, struct nlmsghdr *nlh, const struct rtnl_link_ops *ops, struct net *tgt_net, struct net *link_net, + struct net *peer_net, struct rtnl_newlink_tbs *tbs, struct nlattr **data, struct netlink_ext_ack *extack) @@ -3894,14 +3893,15 @@ static int __rtnl_newlink(struct sk_buff *skb, struct nlmsghdr *nlh, return -EOPNOTSUPP; } - return rtnl_newlink_create(skb, ifm, ops, tgt_net, link_net, nlh, tb, data, extack); + return rtnl_newlink_create(skb, ifm, ops, tgt_net, link_net, peer_net, nlh, + tb, data, extack); } static int rtnl_newlink(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { + struct net *tgt_net, *link_net = NULL, *peer_net = NULL; struct nlattr **tb, **linkinfo, **data = NULL; - struct net *tgt_net, *link_net = NULL; struct rtnl_link_ops *ops = NULL; struct rtnl_newlink_tbs *tbs; struct rtnl_nets rtnl_nets; @@ -3971,9 +3971,11 @@ static int rtnl_newlink(struct sk_buff *skb, struct nlmsghdr *nlh, } if (ops->peer_type) { - ret = rtnl_add_peer_net(&rtnl_nets, ops, data, extack); - if (ret < 0) + peer_net = rtnl_get_peer_net(ops, data, extack); + if (IS_ERR(peer_net)) goto put_ops; + if (peer_net) + rtnl_nets_add(&rtnl_nets, peer_net); } } @@ -4004,7 +4006,7 @@ static int rtnl_newlink(struct sk_buff *skb, struct nlmsghdr *nlh, } rtnl_nets_lock(&rtnl_nets); - ret = __rtnl_newlink(skb, nlh, ops, tgt_net, link_net, tbs, data, extack); + ret = __rtnl_newlink(skb, nlh, ops, tgt_net, link_net, peer_net, tbs, data, extack); rtnl_nets_unlock(&rtnl_nets); put_net: -- 2.51.0 From af8edaeddbc52e53207d859c912b017fd9a77629 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 2 Dec 2024 10:05:58 +0000 Subject: [PATCH 09/16] net: hsr: must allocate more bytes for RedBox support Blamed commit forgot to change hsr_init_skb() to allocate larger skb for RedBox case. Indeed, send_hsr_supervision_frame() will add two additional components (struct hsr_sup_tlv and struct hsr_sup_payload) syzbot reported the following crash: skbuff: skb_over_panic: text:ffffffff8afd4b0a len:34 put:6 head:ffff88802ad29e00 data:ffff88802ad29f22 tail:0x144 end:0x140 dev:gretap0 ------------[ cut here ]------------ kernel BUG at net/core/skbuff.c:206 ! 
Oops: invalid opcode: 0000 [#1] PREEMPT SMP KASAN NOPTI CPU: 2 UID: 0 PID: 7611 Comm: syz-executor Not tainted 6.12.0-syzkaller #0 Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.16.3-debian-1.16.3-2~bpo12+1 04/01/2014 RIP: 0010:skb_panic+0x157/0x1d0 net/core/skbuff.c:206 Code: b6 04 01 84 c0 74 04 3c 03 7e 21 8b 4b 70 41 56 45 89 e8 48 c7 c7 a0 7d 9b 8c 41 57 56 48 89 ee 52 4c 89 e2 e8 9a 76 79 f8 90 <0f> 0b 4c 89 4c 24 10 48 89 54 24 08 48 89 34 24 e8 94 76 fb f8 4c RSP: 0018:ffffc90000858ab8 EFLAGS: 00010282 RAX: 0000000000000087 RBX: ffff8880598c08c0 RCX: ffffffff816d3e69 RDX: 0000000000000000 RSI: ffffffff816de786 RDI: 0000000000000005 RBP: ffffffff8c9b91c0 R08: 0000000000000005 R09: 0000000000000000 R10: 0000000000000302 R11: ffffffff961cc1d0 R12: ffffffff8afd4b0a R13: 0000000000000006 R14: ffff88804b938130 R15: 0000000000000140 FS: 000055558a3d6500(0000) GS:ffff88806a800000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007f1295974ff8 CR3: 000000002ab6e000 CR4: 0000000000352ef0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: skb_over_panic net/core/skbuff.c:211 [inline] skb_put+0x174/0x1b0 net/core/skbuff.c:2617 send_hsr_supervision_frame+0x6fa/0x9e0 net/hsr/hsr_device.c:342 hsr_proxy_announce+0x1a3/0x4a0 net/hsr/hsr_device.c:436 call_timer_fn+0x1a0/0x610 kernel/time/timer.c:1794 expire_timers kernel/time/timer.c:1845 [inline] __run_timers+0x6e8/0x930 kernel/time/timer.c:2419 __run_timer_base kernel/time/timer.c:2430 [inline] __run_timer_base kernel/time/timer.c:2423 [inline] run_timer_base+0x111/0x190 kernel/time/timer.c:2439 run_timer_softirq+0x1a/0x40 kernel/time/timer.c:2449 handle_softirqs+0x213/0x8f0 kernel/softirq.c:554 __do_softirq kernel/softirq.c:588 [inline] invoke_softirq kernel/softirq.c:428 [inline] __irq_exit_rcu kernel/softirq.c:637 [inline] irq_exit_rcu+0xbb/0x120 kernel/softirq.c:649 instr_sysvec_apic_timer_interrupt arch/x86/kernel/apic/apic.c:1049 [inline] sysvec_apic_timer_interrupt+0xa4/0xc0 arch/x86/kernel/apic/apic.c:1049 Fixes: 5055cccfc2d1 ("net: hsr: Provide RedBox support (HSR-SAN)") Reported-by: syzbot+7f4643b267cc680bfa1c@syzkaller.appspotmail.com Signed-off-by: Eric Dumazet Cc: Lukasz Majewski Link: https://patch.msgid.link/20241202100558.507765-1-edumazet@google.com Signed-off-by: Paolo Abeni --- net/hsr/hsr_device.c | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/net/hsr/hsr_device.c b/net/hsr/hsr_device.c index 31a416ee21ad..03eadd6c51fd 100644 --- a/net/hsr/hsr_device.c +++ b/net/hsr/hsr_device.c @@ -246,20 +246,22 @@ static const struct header_ops hsr_header_ops = { .parse = eth_header_parse, }; -static struct sk_buff *hsr_init_skb(struct hsr_port *master) +static struct sk_buff *hsr_init_skb(struct hsr_port *master, int extra) { struct hsr_priv *hsr = master->hsr; struct sk_buff *skb; int hlen, tlen; + int len; hlen = LL_RESERVED_SPACE(master->dev); tlen = master->dev->needed_tailroom; + len = sizeof(struct hsr_sup_tag) + sizeof(struct hsr_sup_payload); /* skb size is same for PRP/HSR frames, only difference * being, for PRP it is a trailer and for HSR it is a - * header + * header. + * RedBox might use @extra more bytes. 
*/ - skb = dev_alloc_skb(sizeof(struct hsr_sup_tag) + - sizeof(struct hsr_sup_payload) + hlen + tlen); + skb = dev_alloc_skb(len + extra + hlen + tlen); if (!skb) return skb; @@ -295,6 +297,7 @@ static void send_hsr_supervision_frame(struct hsr_port *port, struct hsr_sup_tlv *hsr_stlv; struct hsr_sup_tag *hsr_stag; struct sk_buff *skb; + int extra = 0; *interval = msecs_to_jiffies(HSR_LIFE_CHECK_INTERVAL); if (hsr->announce_count < 3 && hsr->prot_version == 0) { @@ -303,7 +306,11 @@ static void send_hsr_supervision_frame(struct hsr_port *port, hsr->announce_count++; } - skb = hsr_init_skb(port); + if (hsr->redbox) + extra = sizeof(struct hsr_sup_tlv) + + sizeof(struct hsr_sup_payload); + + skb = hsr_init_skb(port, extra); if (!skb) { netdev_warn_once(port->dev, "HSR: Could not send supervision frame\n"); return; @@ -362,7 +369,7 @@ static void send_prp_supervision_frame(struct hsr_port *master, struct hsr_sup_tag *hsr_stag; struct sk_buff *skb; - skb = hsr_init_skb(master); + skb = hsr_init_skb(master, 0); if (!skb) { netdev_warn_once(master->dev, "PRP: Could not send supervision frame\n"); return; -- 2.51.0 From cecc1555a8c2acd65f9d36182c28ae463db0ad7e Mon Sep 17 00:00:00 2001 From: Joe Damato Date: Mon, 2 Dec 2024 18:21:02 +0000 Subject: [PATCH 10/16] net: Make napi_hash_lock irq safe Make napi_hash_lock IRQ safe. It is used during the control path, and is taken and released in napi_hash_add and napi_hash_del, which will typically be called by calls to napi_enable and napi_disable. This change avoids a deadlock in pcnet32 (and other any other drivers which follow the same pattern): CPU 0: pcnet32_open spin_lock_irqsave(&lp->lock, ...) napi_enable napi_hash_add <- before this executes, CPU 1 proceeds spin_lock(napi_hash_lock) [...] spin_unlock_irqrestore(&lp->lock, flags); CPU 1: pcnet32_close napi_disable napi_hash_del spin_lock(napi_hash_lock) < INTERRUPT > pcnet32_interrupt spin_lock(lp->lock) <- DEADLOCK Changing the napi_hash_lock to be IRQ safe prevents the IRQ from firing on CPU 1 until napi_hash_lock is released, preventing the deadlock. 
Cc: stable@vger.kernel.org Fixes: 86e25f40aa1e ("net: napi: Add napi_config") Reported-by: Guenter Roeck Closes: https://lore.kernel.org/netdev/85dd4590-ea6b-427d-876a-1d8559c7ad82@roeck-us.net/ Suggested-by: Jakub Kicinski Signed-off-by: Joe Damato Tested-by: Guenter Roeck Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20241202182103.363038-1-jdamato@fastly.com Signed-off-by: Jakub Kicinski --- net/core/dev.c | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/net/core/dev.c b/net/core/dev.c index 13d00fc10f55..45a8c3dd4a64 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -6557,18 +6557,22 @@ static void __napi_hash_add_with_id(struct napi_struct *napi, static void napi_hash_add_with_id(struct napi_struct *napi, unsigned int napi_id) { - spin_lock(&napi_hash_lock); + unsigned long flags; + + spin_lock_irqsave(&napi_hash_lock, flags); WARN_ON_ONCE(napi_by_id(napi_id)); __napi_hash_add_with_id(napi, napi_id); - spin_unlock(&napi_hash_lock); + spin_unlock_irqrestore(&napi_hash_lock, flags); } static void napi_hash_add(struct napi_struct *napi) { + unsigned long flags; + if (test_bit(NAPI_STATE_NO_BUSY_POLL, &napi->state)) return; - spin_lock(&napi_hash_lock); + spin_lock_irqsave(&napi_hash_lock, flags); /* 0..NR_CPUS range is reserved for sender_cpu use */ do { @@ -6578,7 +6582,7 @@ static void napi_hash_add(struct napi_struct *napi) __napi_hash_add_with_id(napi, napi_gen_id); - spin_unlock(&napi_hash_lock); + spin_unlock_irqrestore(&napi_hash_lock, flags); } /* Warning : caller is responsible to make sure rcu grace period @@ -6586,11 +6590,13 @@ static void napi_hash_add(struct napi_struct *napi) */ static void napi_hash_del(struct napi_struct *napi) { - spin_lock(&napi_hash_lock); + unsigned long flags; + + spin_lock_irqsave(&napi_hash_lock, flags); hlist_del_init_rcu(&napi->napi_hash_node); - spin_unlock(&napi_hash_lock); + spin_unlock_irqrestore(&napi_hash_lock, flags); } static enum hrtimer_restart napi_watchdog(struct hrtimer *timer) -- 2.51.0 From 3d501f562f63b290351169e3e9931ffe3d57b2ae Mon Sep 17 00:00:00 2001 From: Fernando Fernandez Mancera Date: Mon, 2 Dec 2024 15:56:08 +0000 Subject: [PATCH 11/16] Revert "udp: avoid calling sock_def_readable() if possible" This reverts commit 612b1c0dec5bc7367f90fc508448b8d0d7c05414. On a scenario with multiple threads blocking on a recvfrom(), we need to call sock_def_readable() on every __udp_enqueue_schedule_skb() otherwise the threads won't be woken up as __skb_wait_for_more_packets() is using prepare_to_wait_exclusive(). Link: https://bugzilla.redhat.com/2308477 Fixes: 612b1c0dec5b ("udp: avoid calling sock_def_readable() if possible") Signed-off-by: Fernando Fernandez Mancera Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20241202155620.1719-1-ffmancera@riseup.net Signed-off-by: Jakub Kicinski --- net/ipv4/udp.c | 14 +++----------- 1 file changed, 3 insertions(+), 11 deletions(-) diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 6a01905d379f..e8953e88efef 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -1674,7 +1674,6 @@ int __udp_enqueue_schedule_skb(struct sock *sk, struct sk_buff *skb) struct sk_buff_head *list = &sk->sk_receive_queue; int rmem, err = -ENOMEM; spinlock_t *busy = NULL; - bool becomes_readable; int size, rcvbuf; /* Immediately drop when the receive queue is full. 
@@ -1715,19 +1714,12 @@ int __udp_enqueue_schedule_skb(struct sock *sk, struct sk_buff *skb) */ sock_skb_set_dropcount(sk, skb); - becomes_readable = skb_queue_empty(list); __skb_queue_tail(list, skb); spin_unlock(&list->lock); - if (!sock_flag(sk, SOCK_DEAD)) { - if (becomes_readable || - sk->sk_data_ready != sock_def_readable || - READ_ONCE(sk->sk_peek_off) >= 0) - INDIRECT_CALL_1(sk->sk_data_ready, - sock_def_readable, sk); - else - sk_wake_async_rcu(sk, SOCK_WAKE_WAITD, POLL_IN); - } + if (!sock_flag(sk, SOCK_DEAD)) + INDIRECT_CALL_1(sk->sk_data_ready, sock_def_readable, sk); + busylock_release(busy); return 0; -- 2.51.0 From 94071909477677fc2a1abf3fb281f203f66cf3ca Mon Sep 17 00:00:00 2001 From: Gal Pressman Date: Mon, 2 Dec 2024 18:48:05 +0200 Subject: [PATCH 12/16] ethtool: Fix access to uninitialized fields in set RXNFC command The check for non-zero ring with RSS is only relevant for ETHTOOL_SRXCLSRLINS command, in other cases the check tries to access memory which was not initialized by the userspace tool. Only perform the check in case of ETHTOOL_SRXCLSRLINS. Without this patch, filter deletion (for example) could statistically result in a false error: # ethtool --config-ntuple eth3 delete 484 rmgr: Cannot delete RX class rule: Invalid argument Cannot delete classification rule Fixes: 9e43ad7a1ede ("net: ethtool: only allow set_rxnfc with rss + ring_cookie if driver opts in") Link: https://lore.kernel.org/netdev/871a9ecf-1e14-40dd-bbd7-e90c92f89d47@nvidia.com/ Reviewed-by: Dragos Tatulea Reviewed-by: Tariq Toukan Signed-off-by: Gal Pressman Reviewed-by: Edward Cree Link: https://patch.msgid.link/20241202164805.1637093-1-gal@nvidia.com Signed-off-by: Jakub Kicinski --- net/ethtool/ioctl.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/net/ethtool/ioctl.c b/net/ethtool/ioctl.c index 61df8ce44379..7bb94875a7ec 100644 --- a/net/ethtool/ioctl.c +++ b/net/ethtool/ioctl.c @@ -993,7 +993,8 @@ static noinline_for_stack int ethtool_set_rxnfc(struct net_device *dev, return rc; /* Nonzero ring with RSS only makes sense if NIC adds them together */ - if (info.flow_type & FLOW_RSS && !ops->cap_rss_rxnfc_adds && + if (cmd == ETHTOOL_SRXCLSRLINS && info.flow_type & FLOW_RSS && + !ops->cap_rss_rxnfc_adds && ethtool_get_flow_spec_ring(info.fs.ring_cookie)) return -EINVAL; -- 2.51.0 From 292207809486d99c78068d3f459cbbbffde88415 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Mon, 2 Dec 2024 10:21:38 -0500 Subject: [PATCH 13/16] net: sched: fix erspan_opt settings in cls_flower When matching erspan_opt in cls_flower, only the (version, dir, hwid) fields are relevant. However, in fl_set_erspan_opt() it initializes all bits of erspan_opt and its mask to 1. This inadvertently requires packets to match not only the (version, dir, hwid) fields but also the other fields that are unexpectedly set to 1. This patch resolves the issue by ensuring that only the (version, dir, hwid) fields are configured in fl_set_erspan_opt(), leaving the other fields to 0 in erspan_opt. Fixes: 79b1011cb33d ("net: sched: allow flower to match erspan options") Reported-by: Shuang Li Signed-off-by: Xin Long Reviewed-by: Cong Wang Signed-off-by: David S. 
Miller --- net/sched/cls_flower.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c index e280c27cb9f9..1008ec8a464c 100644 --- a/net/sched/cls_flower.c +++ b/net/sched/cls_flower.c @@ -1369,7 +1369,6 @@ static int fl_set_erspan_opt(const struct nlattr *nla, struct fl_flow_key *key, int err; md = (struct erspan_metadata *)&key->enc_opts.data[key->enc_opts.len]; - memset(md, 0xff, sizeof(*md)); md->version = 1; if (!depth) @@ -1398,9 +1397,9 @@ static int fl_set_erspan_opt(const struct nlattr *nla, struct fl_flow_key *key, NL_SET_ERR_MSG(extack, "Missing tunnel key erspan option index"); return -EINVAL; } + memset(&md->u.index, 0xff, sizeof(md->u.index)); if (tb[TCA_FLOWER_KEY_ENC_OPT_ERSPAN_INDEX]) { nla = tb[TCA_FLOWER_KEY_ENC_OPT_ERSPAN_INDEX]; - memset(&md->u, 0x00, sizeof(md->u)); md->u.index = nla_get_be32(nla); } } else if (md->version == 2) { @@ -1409,10 +1408,12 @@ static int fl_set_erspan_opt(const struct nlattr *nla, struct fl_flow_key *key, NL_SET_ERR_MSG(extack, "Missing tunnel key erspan option dir or hwid"); return -EINVAL; } + md->u.md2.dir = 1; if (tb[TCA_FLOWER_KEY_ENC_OPT_ERSPAN_DIR]) { nla = tb[TCA_FLOWER_KEY_ENC_OPT_ERSPAN_DIR]; md->u.md2.dir = nla_get_u8(nla); } + set_hwid(&md->u.md2, 0xff); if (tb[TCA_FLOWER_KEY_ENC_OPT_ERSPAN_HWID]) { nla = tb[TCA_FLOWER_KEY_ENC_OPT_ERSPAN_HWID]; set_hwid(&md->u.md2, nla_get_u8(nla)); -- 2.51.0 From 5eb7de8cd58e73851cd37ff8d0666517d9926948 Mon Sep 17 00:00:00 2001 From: Lion Ackermann Date: Mon, 2 Dec 2024 17:22:57 +0100 Subject: [PATCH 14/16] net: sched: fix ordering of qlen adjustment MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit Changes to sch->q.qlen around qdisc_tree_reduce_backlog() need to happen _before_ a call to said function because otherwise it may fail to notify parent qdiscs when the child is about to become empty. Signed-off-by: Lion Ackermann Acked-by: Toke Høiland-Jørgensen Signed-off-by: David S. Miller --- net/sched/sch_cake.c | 2 +- net/sched/sch_choke.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/net/sched/sch_cake.c b/net/sched/sch_cake.c index f2f9b75008bb..8d8b2db4653c 100644 --- a/net/sched/sch_cake.c +++ b/net/sched/sch_cake.c @@ -1525,7 +1525,6 @@ static unsigned int cake_drop(struct Qdisc *sch, struct sk_buff **to_free) b->backlogs[idx] -= len; b->tin_backlog -= len; sch->qstats.backlog -= len; - qdisc_tree_reduce_backlog(sch, 1, len); flow->dropped++; b->tin_dropped++; @@ -1536,6 +1535,7 @@ static unsigned int cake_drop(struct Qdisc *sch, struct sk_buff **to_free) __qdisc_drop(skb, to_free); sch->q.qlen--; + qdisc_tree_reduce_backlog(sch, 1, len); cake_heapify(q, 0); diff --git a/net/sched/sch_choke.c b/net/sched/sch_choke.c index 1e940ad0d2fa..59e7bdf5063e 100644 --- a/net/sched/sch_choke.c +++ b/net/sched/sch_choke.c @@ -123,10 +123,10 @@ static void choke_drop_by_idx(struct Qdisc *sch, unsigned int idx, if (idx == q->tail) choke_zap_tail_holes(q); + --sch->q.qlen; qdisc_qstats_backlog_dec(sch, skb); qdisc_tree_reduce_backlog(sch, 1, qdisc_pkt_len(skb)); qdisc_drop(skb, sch, to_free); - --sch->q.qlen; } struct choke_skb_cb { -- 2.51.0 From 50b94204446e1215af081fd713d7d566d9258e35 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Tue, 3 Dec 2024 10:48:15 +0100 Subject: [PATCH 15/16] ipmr: tune the ipmr_can_free_table() checks. 
Eric reported a syzkaller-triggered splat caused by recent ipmr changes: WARNING: CPU: 2 PID: 6041 at net/ipv6/ip6mr.c:419 ip6mr_free_table+0xbd/0x120 net/ipv6/ip6mr.c:419 Modules linked in: CPU: 2 UID: 0 PID: 6041 Comm: syz-executor183 Not tainted 6.12.0-syzkaller-10681-g65ae975e97d5 #0 Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.16.3-debian-1.16.3-2~bpo12+1 04/01/2014 RIP: 0010:ip6mr_free_table+0xbd/0x120 net/ipv6/ip6mr.c:419 Code: 00 00 48 b8 00 00 00 00 00 fc ff df 48 89 fa 48 c1 ea 03 80 3c 02 00 75 58 49 83 bc 24 c0 0e 00 00 00 74 09 e8 44 ef a9 f7 90 <0f> 0b 90 e8 3b ef a9 f7 48 8d 7b 38 e8 12 a3 96 f7 48 89 df be 0f RSP: 0018:ffffc90004267bd8 EFLAGS: 00010293 RAX: 0000000000000000 RBX: ffff88803c710000 RCX: ffffffff89e4d844 RDX: ffff88803c52c880 RSI: ffffffff89e4d87c RDI: ffff88803c578ec0 RBP: 0000000000000001 R08: 0000000000000005 R09: 0000000000000000 R10: 0000000000000001 R11: 0000000000000001 R12: ffff88803c578000 R13: ffff88803c710000 R14: ffff88803c710008 R15: dead000000000100 FS: 00007f7a855ee6c0(0000) GS:ffff88806a800000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007f7a85689938 CR3: 000000003c492000 CR4: 0000000000352ef0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: ip6mr_rules_exit+0x176/0x2d0 net/ipv6/ip6mr.c:283 ip6mr_net_exit_batch+0x53/0xa0 net/ipv6/ip6mr.c:1388 ops_exit_list+0x128/0x180 net/core/net_namespace.c:177 setup_net+0x4fe/0x860 net/core/net_namespace.c:394 copy_net_ns+0x2b4/0x6b0 net/core/net_namespace.c:500 create_new_namespaces+0x3ea/0xad0 kernel/nsproxy.c:110 unshare_nsproxy_namespaces+0xc0/0x1f0 kernel/nsproxy.c:228 ksys_unshare+0x45d/0xa40 kernel/fork.c:3334 __do_sys_unshare kernel/fork.c:3405 [inline] __se_sys_unshare kernel/fork.c:3403 [inline] __x64_sys_unshare+0x31/0x40 kernel/fork.c:3403 do_syscall_x64 arch/x86/entry/common.c:52 [inline] do_syscall_64+0xcd/0x250 arch/x86/entry/common.c:83 entry_SYSCALL_64_after_hwframe+0x77/0x7f RIP: 0033:0x7f7a856332d9 Code: 28 00 00 00 75 05 48 83 c4 28 c3 e8 51 18 00 00 90 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 c7 c1 b0 ff ff ff f7 d8 64 89 01 48 RSP: 002b:00007f7a855ee238 EFLAGS: 00000246 ORIG_RAX: 0000000000000110 RAX: ffffffffffffffda RBX: 00007f7a856bd308 RCX: 00007f7a856332d9 RDX: 00007f7a8560f8c6 RSI: 0000000000000000 RDI: 0000000062040200 RBP: 00007f7a856bd300 R08: 00007fff932160a7 R09: 00007f7a855ee6c0 R10: 0000000000000000 R11: 0000000000000246 R12: 00007f7a856bd30c R13: 0000000000000000 R14: 00007fff93215fc0 R15: 00007fff932160a8 The root cause is a network namespace creation failing after successful initialization of the ipmr subsystem. Such a case is not currently matched by the ipmr_can_free_table() helper. New namespaces are zeroed on allocation and inserted into net ns list only after successful creation; when deleting an ipmr table, the list next pointer can be NULL only on netns initialization failure. Update the ipmr_can_free_table() checks leveraging such condition. 
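A short sketch restating the new check, with the reasoning spelled out in comments (the helper mirrors the one added in the diff below; the _example suffix only marks it as an illustration):

  /* A netns is zeroed on allocation and linked into the netns list only
   * after every pernet init op has succeeded, so a NULL list.next
   * identifies a namespace whose creation failed part-way.
   */
  static inline bool net_initialized_example(const struct net *net)
  {
          return READ_ONCE(net->list.next);
  }

  static bool ipmr_can_free_table_example(struct net *net)
  {
          /* safe to free: the netns is dead or was never fully created */
          return !check_net(net) || !net_initialized_example(net);
  }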
Reported-by: Eric Dumazet Reported-by: syzbot+6e8cb445d4b43d006e0c@syzkaller.appspotmail.com Closes: https://syzkaller.appspot.com/bug?extid=6e8cb445d4b43d006e0c Fixes: 11b6e701bce9 ("ipmr: add debug check for mr table cleanup") Signed-off-by: Paolo Abeni Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/8bde975e21bbca9d9c27e36209b2dd4f1d7a3f00.1733212078.git.pabeni@redhat.com Signed-off-by: Jakub Kicinski --- include/net/net_namespace.h | 5 +++++ net/ipv4/ipmr.c | 2 +- net/ipv6/ip6mr.c | 2 +- 3 files changed, 7 insertions(+), 2 deletions(-)
diff --git a/include/net/net_namespace.h b/include/net/net_namespace.h index 873c0f9fdac6..f943bd79ef03 100644 --- a/include/net/net_namespace.h +++ b/include/net/net_namespace.h @@ -325,6 +325,11 @@ static inline int check_net(const struct net *net) #define net_drop_ns NULL #endif +/* Returns true if the netns initialization is completed successfully */ +static inline bool net_initialized(const struct net *net) +{ + return READ_ONCE(net->list.next); +} static inline void __netns_tracker_alloc(struct net *net, netns_tracker *tracker,
diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index c5b8ec5c0a8c..99d8faa508e5 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c @@ -122,7 +122,7 @@ static void ipmr_expire_process(struct timer_list *t); static bool ipmr_can_free_table(struct net *net) { - return !check_net(net) || !net->ipv4.mr_rules_ops; + return !check_net(net) || !net_initialized(net); } static struct mr_table *ipmr_mr_table_iter(struct net *net,
diff --git a/net/ipv6/ip6mr.c b/net/ipv6/ip6mr.c index 7f1902ac3586..578ff1336afe 100644 --- a/net/ipv6/ip6mr.c +++ b/net/ipv6/ip6mr.c @@ -110,7 +110,7 @@ static void ipmr_expire_process(struct timer_list *t); static bool ip6mr_can_free_table(struct net *net) { - return !check_net(net) || !net->ipv6.mr6_rules_ops; + return !check_net(net) || !net_initialized(net); } static struct mr_table *ip6mr_mr_table_iter(struct net *net, -- 2.51.0
From 910c4788d6155b2202ec88273376cd7ecdc24f0a Mon Sep 17 00:00:00 2001 From: Kory Maincent Date: Mon, 2 Dec 2024 16:33:57 +0100 Subject: [PATCH 16/16] ethtool: Fix wrong mod state in case of verbose and no_mask bitset
A bitset without mask in a _SET request means we want exactly the bits in the bitset to be set. This works correctly for compact format but when verbose format is parsed, ethnl_update_bitset32_verbose() only sets the bits present in the request bitset but does not clear the rest. Commit 6699170376ab ("ethtool: fix application of verbose no_mask bitset") fixes this issue by clearing the whole target bitmap before we start iterating. The proposed solution introduced an issue with the behavior of the mod variable: as the bitset is always cleared, the old value will always differ from the new value. Fix it by adding a new function to compare bitmaps and a temporary variable which saves the state of the old bitmap.
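A compact sketch of the resulting no_mask flow (simplified; the real ethnl_update_bitset32_verbose() in the diff below uses kcalloc()/memcpy() and handles per-bit parsing errors, kmemdup() here is just shorthand):

  static int example_apply_no_mask(u32 *bitmap, unsigned int nbits, bool *mod)
  {
          unsigned int nwords = DIV_ROUND_UP(nbits, 32);
          bool dummy;
          u32 *saved;

          /* remember the old state, then rebuild the map from scratch */
          saved = kmemdup(bitmap, nwords * sizeof(u32), GFP_KERNEL);
          if (!saved)
                  return -ENOMEM;
          ethnl_bitmap32_clear(bitmap, 0, nbits, &dummy);

          /* ... set each bit named in the request ... */

          /* mod reflects whether the map as a whole changed, not whether
           * any individual requested bit had to be flipped
           */
          if (!ethnl_bitmap32_equal(saved, bitmap, nbits))
                  *mod = true;

          kfree(saved);
          return 0;
  }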
Fixes: 6699170376ab ("ethtool: fix application of verbose no_mask bitset") Signed-off-by: Kory Maincent Link: https://patch.msgid.link/20241202153358.1142095-1-kory.maincent@bootlin.com Signed-off-by: Jakub Kicinski --- net/ethtool/bitset.c | 48 ++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 44 insertions(+), 4 deletions(-) diff --git a/net/ethtool/bitset.c b/net/ethtool/bitset.c index 0515d6604b3b..f0883357d12e 100644 --- a/net/ethtool/bitset.c +++ b/net/ethtool/bitset.c @@ -425,12 +425,32 @@ static int ethnl_parse_bit(unsigned int *index, bool *val, unsigned int nbits, return 0; } +/** + * ethnl_bitmap32_equal() - Compare two bitmaps + * @map1: first bitmap + * @map2: second bitmap + * @nbits: bit size to compare + * + * Return: true if first @nbits are equal, false if not + */ +static bool ethnl_bitmap32_equal(const u32 *map1, const u32 *map2, + unsigned int nbits) +{ + if (memcmp(map1, map2, nbits / 32 * sizeof(u32))) + return false; + if (nbits % 32 == 0) + return true; + return !((map1[nbits / 32] ^ map2[nbits / 32]) & + ethnl_lower_bits(nbits % 32)); +} + static int ethnl_update_bitset32_verbose(u32 *bitmap, unsigned int nbits, const struct nlattr *attr, struct nlattr **tb, ethnl_string_array_t names, struct netlink_ext_ack *extack, bool *mod) { + u32 *saved_bitmap = NULL; struct nlattr *bit_attr; bool no_mask; int rem; @@ -448,8 +468,20 @@ ethnl_update_bitset32_verbose(u32 *bitmap, unsigned int nbits, } no_mask = tb[ETHTOOL_A_BITSET_NOMASK]; - if (no_mask) - ethnl_bitmap32_clear(bitmap, 0, nbits, mod); + if (no_mask) { + unsigned int nwords = DIV_ROUND_UP(nbits, 32); + unsigned int nbytes = nwords * sizeof(u32); + bool dummy; + + /* The bitmap size is only the size of the map part without + * its mask part. + */ + saved_bitmap = kcalloc(nwords, sizeof(u32), GFP_KERNEL); + if (!saved_bitmap) + return -ENOMEM; + memcpy(saved_bitmap, bitmap, nbytes); + ethnl_bitmap32_clear(bitmap, 0, nbits, &dummy); + } nla_for_each_nested(bit_attr, tb[ETHTOOL_A_BITSET_BITS], rem) { bool old_val, new_val; @@ -458,22 +490,30 @@ ethnl_update_bitset32_verbose(u32 *bitmap, unsigned int nbits, if (nla_type(bit_attr) != ETHTOOL_A_BITSET_BITS_BIT) { NL_SET_ERR_MSG_ATTR(extack, bit_attr, "only ETHTOOL_A_BITSET_BITS_BIT allowed in ETHTOOL_A_BITSET_BITS"); + kfree(saved_bitmap); return -EINVAL; } ret = ethnl_parse_bit(&idx, &new_val, nbits, bit_attr, no_mask, names, extack); - if (ret < 0) + if (ret < 0) { + kfree(saved_bitmap); return ret; + } old_val = bitmap[idx / 32] & ((u32)1 << (idx % 32)); if (new_val != old_val) { if (new_val) bitmap[idx / 32] |= ((u32)1 << (idx % 32)); else bitmap[idx / 32] &= ~((u32)1 << (idx % 32)); - *mod = true; + if (!no_mask) + *mod = true; } } + if (no_mask && !ethnl_bitmap32_equal(saved_bitmap, bitmap, nbits)) + *mod = true; + + kfree(saved_bitmap); return 0; } -- 2.51.0