From 087a9eb9e5978e3ba362e1163691e41097e8ca20 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Wed, 23 Apr 2025 17:51:31 +0300 Subject: [PATCH 01/16] vxlan: vnifilter: Fix unlocked deletion of default FDB entry When a VNI is deleted from a VXLAN device in 'vnifilter' mode, the FDB entry associated with the default remote (assuming one was configured) is deleted without holding the hash lock. This is wrong and will result in a warning [1] being generated by the lockdep annotation that was added by commit ebe642067455 ("vxlan: Create wrappers for FDB lookup"). Reproducer: # ip link add vx0 up type vxlan dstport 4789 external vnifilter local 192.0.2.1 # bridge vni add vni 10010 remote 198.51.100.1 dev vx0 # bridge vni del vni 10010 dev vx0 Fix by acquiring the hash lock before the deletion and releasing it afterwards. Blame the original commit that introduced the issue rather than the one that exposed it. [1] WARNING: CPU: 3 PID: 392 at drivers/net/vxlan/vxlan_core.c:417 vxlan_find_mac+0x17f/0x1a0 [...] RIP: 0010:vxlan_find_mac+0x17f/0x1a0 [...] 
Call Trace: __vxlan_fdb_delete+0xbe/0x560 vxlan_vni_delete_group+0x2ba/0x940 vxlan_vni_del.isra.0+0x15f/0x580 vxlan_process_vni_filter+0x38b/0x7b0 vxlan_vnifilter_process+0x3bb/0x510 rtnetlink_rcv_msg+0x2f7/0xb70 netlink_rcv_skb+0x131/0x360 netlink_unicast+0x426/0x710 netlink_sendmsg+0x75a/0xc20 __sock_sendmsg+0xc1/0x150 ____sys_sendmsg+0x5aa/0x7b0 ___sys_sendmsg+0xfc/0x180 __sys_sendmsg+0x121/0x1b0 do_syscall_64+0xbb/0x1d0 entry_SYSCALL_64_after_hwframe+0x4b/0x53 Fixes: f9c4bb0b245c ("vxlan: vni filtering support on collect metadata device") Signed-off-by: Ido Schimmel Reviewed-by: Nikolay Aleksandrov Link: https://patch.msgid.link/20250423145131.513029-1-idosch@nvidia.com Signed-off-by: Jakub Kicinski --- drivers/net/vxlan/vxlan_vnifilter.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/drivers/net/vxlan/vxlan_vnifilter.c b/drivers/net/vxlan/vxlan_vnifilter.c index 6e6e9f05509a..06d19e90eadb 100644 --- a/drivers/net/vxlan/vxlan_vnifilter.c +++ b/drivers/net/vxlan/vxlan_vnifilter.c @@ -627,7 +627,11 @@ static void vxlan_vni_delete_group(struct vxlan_dev *vxlan, * default dst remote_ip previously added for this vni */ if (!vxlan_addr_any(&vninode->remote_ip) || - !vxlan_addr_any(&dst->remote_ip)) + !vxlan_addr_any(&dst->remote_ip)) { + u32 hash_index = fdb_head_index(vxlan, all_zeros_mac, + vninode->vni); + + spin_lock_bh(&vxlan->hash_lock[hash_index]); __vxlan_fdb_delete(vxlan, all_zeros_mac, (vxlan_addr_any(&vninode->remote_ip) ? 
dst->remote_ip : vninode->remote_ip), @@ -635,6 +639,8 @@ static void vxlan_vni_delete_group(struct vxlan_dev *vxlan, vninode->vni, vninode->vni, dst->remote_ifindex, true); + spin_unlock_bh(&vxlan->hash_lock[hash_index]); + } if (vxlan->dev->flags & IFF_UP) { if (vxlan_addr_multicast(&vninode->remote_ip) && -- 2.51.0 From a1356ac7749cafc4e27aa62c0c4604b5dca4983e Mon Sep 17 00:00:00 2001 From: "e.kubanski" Date: Wed, 16 Apr 2025 12:19:08 +0200 Subject: [PATCH 02/16] xsk: Fix race condition in AF_XDP generic RX path Move rx_lock from xsk_socket to xsk_buff_pool. Fix synchronization for shared umem mode in generic RX path where multiple sockets share single xsk_buff_pool. RX queue is exclusive to xsk_socket, while FILL queue can be shared between multiple sockets. This could result in race condition where two CPU cores access RX path of two different sockets sharing the same umem. Protect both queues by acquiring spinlock in shared xsk_buff_pool. Lock contention may be minimized in the future by some per-thread FQ buffering. It's safe and necessary to move spin_lock_bh(rx_lock) after xsk_rcv_check(): * xs->pool and spinlock_init is synchronized by xsk_bind() -> xsk_is_bound() memory barriers. * xsk_rcv_check() may return true at the moment of xsk_release() or xsk_unbind_dev(), however this will not cause any data races or race conditions. xsk_unbind_dev() removes xdp socket from all maps and waits for completion of all outstanding rx operations. Packets in RX path will either complete safely or drop. 
Signed-off-by: Eryk Kubanski Fixes: bf0bdd1343efb ("xdp: fix race on generic receive path") Acked-by: Magnus Karlsson Link: https://patch.msgid.link/20250416101908.10919-1-e.kubanski@partner.samsung.com Signed-off-by: Jakub Kicinski --- include/net/xdp_sock.h | 3 --- include/net/xsk_buff_pool.h | 2 ++ net/xdp/xsk.c | 6 +++--- net/xdp/xsk_buff_pool.c | 1 + 4 files changed, 6 insertions(+), 6 deletions(-) diff --git a/include/net/xdp_sock.h b/include/net/xdp_sock.h index a58ae7589d12..e8bd6ddb7b12 100644 --- a/include/net/xdp_sock.h +++ b/include/net/xdp_sock.h @@ -71,9 +71,6 @@ struct xdp_sock { */ u32 tx_budget_spent; - /* Protects generic receive. */ - spinlock_t rx_lock; - /* Statistics */ u64 rx_dropped; u64 rx_queue_full; diff --git a/include/net/xsk_buff_pool.h b/include/net/xsk_buff_pool.h index 1dcd4d71468a..3b243ea70e38 100644 --- a/include/net/xsk_buff_pool.h +++ b/include/net/xsk_buff_pool.h @@ -53,6 +53,8 @@ struct xsk_buff_pool { refcount_t users; struct xdp_umem *umem; struct work_struct work; + /* Protects generic receive in shared and non-shared umem mode. 
*/ + spinlock_t rx_lock; struct list_head free_list; struct list_head xskb_list; u32 heads_cnt; diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c index 5696af45bcf7..4abc81f33d3e 100644 --- a/net/xdp/xsk.c +++ b/net/xdp/xsk.c @@ -338,13 +338,14 @@ int xsk_generic_rcv(struct xdp_sock *xs, struct xdp_buff *xdp) u32 len = xdp_get_buff_len(xdp); int err; - spin_lock_bh(&xs->rx_lock); err = xsk_rcv_check(xs, xdp, len); if (!err) { + spin_lock_bh(&xs->pool->rx_lock); err = __xsk_rcv(xs, xdp, len); xsk_flush(xs); + spin_unlock_bh(&xs->pool->rx_lock); } - spin_unlock_bh(&xs->rx_lock); + return err; } @@ -1734,7 +1735,6 @@ static int xsk_create(struct net *net, struct socket *sock, int protocol, xs = xdp_sk(sk); xs->state = XSK_READY; mutex_init(&xs->mutex); - spin_lock_init(&xs->rx_lock); INIT_LIST_HEAD(&xs->map_list); spin_lock_init(&xs->map_list_lock); diff --git a/net/xdp/xsk_buff_pool.c b/net/xdp/xsk_buff_pool.c index 25a76c5ce0f1..c5181a9044ad 100644 --- a/net/xdp/xsk_buff_pool.c +++ b/net/xdp/xsk_buff_pool.c @@ -89,6 +89,7 @@ struct xsk_buff_pool *xp_create_and_assign_umem(struct xdp_sock *xs, pool->addrs = umem->addrs; pool->tx_metadata_len = umem->tx_metadata_len; pool->tx_sw_csum = umem->flags & XDP_UMEM_TX_SW_CSUM; + spin_lock_init(&pool->rx_lock); INIT_LIST_HEAD(&pool->free_list); INIT_LIST_HEAD(&pool->xskb_list); INIT_LIST_HEAD(&pool->xsk_tx_list); -- 2.51.0 From bf20af07909925ec0ae6cd4f3b7be0279dfa8768 Mon Sep 17 00:00:00 2001 From: "e.kubanski" Date: Wed, 16 Apr 2025 13:29:25 +0200 Subject: [PATCH 03/16] xsk: Fix offset calculation in unaligned mode Bring back previous offset calculation behaviour in AF_XDP unaligned umem mode. In unaligned mode, upper 16 bits should contain data offset, lower 48 bits should contain only specific chunk location without offset. Remove pool->headroom duplication into 48bit address. 
Signed-off-by: Eryk Kubanski Fixes: bea14124bacb ("xsk: Get rid of xdp_buff_xsk::orig_addr") Acked-by: Magnus Karlsson Link: https://patch.msgid.link/20250416112925.7501-1-e.kubanski@partner.samsung.com Signed-off-by: Jakub Kicinski --- include/net/xsk_buff_pool.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/net/xsk_buff_pool.h b/include/net/xsk_buff_pool.h index 3b243ea70e38..cac56e6b0869 100644 --- a/include/net/xsk_buff_pool.h +++ b/include/net/xsk_buff_pool.h @@ -240,8 +240,8 @@ static inline u64 xp_get_handle(struct xdp_buff_xsk *xskb, return orig_addr; offset = xskb->xdp.data - xskb->xdp.data_hard_start; - orig_addr -= offset; offset += pool->headroom; + orig_addr -= offset; return orig_addr + (offset << XSK_UNALIGNED_BUF_OFFSET_SHIFT); } -- 2.51.0 From eacc77a73275895eca0e3655dc6c671853500e2e Mon Sep 17 00:00:00 2001 From: Vlad Dogaru Date: Wed, 23 Apr 2025 11:36:07 +0300 Subject: [PATCH 04/16] net/mlx5e: Use custom tunnel header for vxlan gbp Symbolic (e.g. "vxlan") and custom (e.g. "tunnel_header_0") tunnels cannot be combined, but the match params interface does not have fields for matching on vxlan gbp. To match vxlan gbp, the tc_tun layer uses tunnel_header_0. Allow matching on both VNI and GBP by matching the VNI with a custom tunnel header instead of the symbolic field name. Matching solely on the VNI continues to use the symbolic field name. 
Fixes: 74a778b4a63f ("net/mlx5: HWS, added definers handling") Signed-off-by: Vlad Dogaru Reviewed-by: Yevgeny Kliteynik Signed-off-by: Mark Bloch Reviewed-by: Michal Swiatkowski Link: https://patch.msgid.link/20250423083611.324567-2-mbloch@nvidia.com Signed-off-by: Jakub Kicinski --- .../mellanox/mlx5/core/en/tc_tun_vxlan.c | 32 +++++++++++++++++-- 1 file changed, 29 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun_vxlan.c b/drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun_vxlan.c index 5c762a71818d..7a18a469961d 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun_vxlan.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun_vxlan.c @@ -165,9 +165,6 @@ static int mlx5e_tc_tun_parse_vxlan(struct mlx5e_priv *priv, struct flow_match_enc_keyid enc_keyid; void *misc_c, *misc_v; - misc_c = MLX5_ADDR_OF(fte_match_param, spec->match_criteria, misc_parameters); - misc_v = MLX5_ADDR_OF(fte_match_param, spec->match_value, misc_parameters); - if (!flow_rule_match_key(rule, FLOW_DISSECTOR_KEY_ENC_KEYID)) return 0; @@ -182,6 +179,30 @@ static int mlx5e_tc_tun_parse_vxlan(struct mlx5e_priv *priv, err = mlx5e_tc_tun_parse_vxlan_gbp_option(priv, spec, f); if (err) return err; + + /* We can't mix custom tunnel headers with symbolic ones and we + * don't have a symbolic field name for GBP, so we use custom + * tunnel headers in this case. We need hardware support to + * match on custom tunnel headers, but we already know it's + * supported because the previous call successfully checked for + * that. + */ + misc_c = MLX5_ADDR_OF(fte_match_param, spec->match_criteria, + misc_parameters_5); + misc_v = MLX5_ADDR_OF(fte_match_param, spec->match_value, + misc_parameters_5); + + /* Shift by 8 to account for the reserved bits in the vxlan + * header after the VNI. 
+ */ + MLX5_SET(fte_match_set_misc5, misc_c, tunnel_header_1, + be32_to_cpu(enc_keyid.mask->keyid) << 8); + MLX5_SET(fte_match_set_misc5, misc_v, tunnel_header_1, + be32_to_cpu(enc_keyid.key->keyid) << 8); + + spec->match_criteria_enable |= MLX5_MATCH_MISC_PARAMETERS_5; + + return 0; } /* match on VNI is required */ @@ -195,6 +216,11 @@ static int mlx5e_tc_tun_parse_vxlan(struct mlx5e_priv *priv, return -EOPNOTSUPP; } + misc_c = MLX5_ADDR_OF(fte_match_param, spec->match_criteria, + misc_parameters); + misc_v = MLX5_ADDR_OF(fte_match_param, spec->match_value, + misc_parameters); + MLX5_SET(fte_match_set_misc, misc_c, vxlan_vni, be32_to_cpu(enc_keyid.mask->keyid)); MLX5_SET(fte_match_set_misc, misc_v, vxlan_vni, -- 2.51.0 From 5d1a04f347e6cbf5ffe74da409a5d71fbe8c5f19 Mon Sep 17 00:00:00 2001 From: Maor Gottlieb Date: Wed, 23 Apr 2025 11:36:08 +0300 Subject: [PATCH 05/16] net/mlx5: E-Switch, Initialize MAC Address for Default GID Initialize the source MAC address when creating the default GID entry. Since this entry is used only for loopback traffic, it only needs to be a unicast address. A zeroed-out MAC address is sufficient for this purpose. Without this fix, random bits would be assigned as the source address. If these bits formed a multicast address, the firmware would return an error, preventing the user from switching to switchdev mode: Error: mlx5_core: Failed setting eswitch to offloads. 
kernel answers: Invalid argument Fixes: 80f09dfc237f ("net/mlx5: Eswitch, enable RoCE loopback traffic") Signed-off-by: Maor Gottlieb Signed-off-by: Mark Bloch Reviewed-by: Michal Swiatkowski Link: https://patch.msgid.link/20250423083611.324567-3-mbloch@nvidia.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/mellanox/mlx5/core/rdma.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/rdma.c b/drivers/net/ethernet/mellanox/mlx5/core/rdma.c index a42f6cd99b74..f585ef5a3424 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/rdma.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/rdma.c @@ -118,8 +118,8 @@ static void mlx5_rdma_make_default_gid(struct mlx5_core_dev *dev, union ib_gid * static int mlx5_rdma_add_roce_addr(struct mlx5_core_dev *dev) { + u8 mac[ETH_ALEN] = {}; union ib_gid gid; - u8 mac[ETH_ALEN]; mlx5_rdma_make_default_gid(dev, &gid); return mlx5_core_roce_gid_set(dev, 0, -- 2.51.0 From 172c034264c894518c012387f2de2f9d6443505d Mon Sep 17 00:00:00 2001 From: Jianbo Liu Date: Wed, 23 Apr 2025 11:36:09 +0300 Subject: [PATCH 06/16] net/mlx5e: TC, Continue the attr process even if encap entry is invalid Previously the offload of the rule with header rewrite and mirror to both internal and external destinations is skipped if the encap entry is not valid. But it shouldn't because driver will try to offload it again if neighbor is updated and encap entry is valid, to replace the old FTE added for slow path. But the extra split attr doesn't exist at that time as the process is skipped, driver then fails to offload it. To fix this issue, remove the checking and continue the attr process if encap entry is invalid. 
Fixes: b11bde56246e ("net/mlx5e: TC, Offload rewrite and mirror to both internal and external dests") Signed-off-by: Jianbo Liu Reviewed-by: Cosmin Ratiu Signed-off-by: Mark Bloch Link: https://patch.msgid.link/20250423083611.324567-4-mbloch@nvidia.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/mellanox/mlx5/core/en_tc.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c index 9ba99609999f..f1d908f61134 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c @@ -1750,9 +1750,6 @@ extra_split_attr_dests_needed(struct mlx5e_tc_flow *flow, struct mlx5_flow_attr !list_is_first(&attr->list, &flow->attrs)) return 0; - if (flow_flag_test(flow, SLOW)) - return 0; - esw_attr = attr->esw_attr; if (!esw_attr->split_count || esw_attr->split_count == esw_attr->out_count - 1) @@ -1766,7 +1763,7 @@ extra_split_attr_dests_needed(struct mlx5e_tc_flow *flow, struct mlx5_flow_attr for (i = esw_attr->split_count; i < esw_attr->out_count; i++) { /* external dest with encap is considered as internal by firmware */ if (esw_attr->dests[i].vport == MLX5_VPORT_UPLINK && - !(esw_attr->dests[i].flags & MLX5_ESW_DEST_ENCAP_VALID)) + !(esw_attr->dests[i].flags & MLX5_ESW_DEST_ENCAP)) ext_dest = true; else int_dest = true; -- 2.51.0 From 1c2940ec0ddf51c689ee9ab85ead85c11b77809d Mon Sep 17 00:00:00 2001 From: Cosmin Ratiu Date: Wed, 23 Apr 2025 11:36:10 +0300 Subject: [PATCH 07/16] net/mlx5e: Fix lock order in mlx5e_tx_reporter_ptpsq_unhealthy_recover RTNL needs to be acquired before state_lock. 
Fixes: fdce06bda7e5 ("net/mlx5e: Acquire RTNL lock before RQs/SQs activation/deactivation") Signed-off-by: Cosmin Ratiu Reviewed-by: Dragos Tatulea Signed-off-by: Mark Bloch Link: https://patch.msgid.link/20250423083611.324567-5-mbloch@nvidia.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/mellanox/mlx5/core/en/reporter_tx.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/reporter_tx.c b/drivers/net/ethernet/mellanox/mlx5/core/en/reporter_tx.c index 532c7fa94d17..dbd9482359e1 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/reporter_tx.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/reporter_tx.c @@ -176,6 +176,7 @@ static int mlx5e_tx_reporter_ptpsq_unhealthy_recover(void *ctx) priv = ptpsq->txqsq.priv; + rtnl_lock(); mutex_lock(&priv->state_lock); chs = &priv->channels; netdev = priv->netdev; @@ -183,22 +184,19 @@ static int mlx5e_tx_reporter_ptpsq_unhealthy_recover(void *ctx) carrier_ok = netif_carrier_ok(netdev); netif_carrier_off(netdev); - rtnl_lock(); mlx5e_deactivate_priv_channels(priv); - rtnl_unlock(); mlx5e_ptp_close(chs->ptp); err = mlx5e_ptp_open(priv, &chs->params, chs->c[0]->lag_port, &chs->ptp); - rtnl_lock(); mlx5e_activate_priv_channels(priv); - rtnl_unlock(); /* return carrier back if needed */ if (carrier_ok) netif_carrier_on(netdev); mutex_unlock(&priv->state_lock); + rtnl_unlock(); return err; } -- 2.51.0 From 90538d23278a981e344d364e923162fce752afeb Mon Sep 17 00:00:00 2001 From: Chris Mi Date: Wed, 23 Apr 2025 11:36:11 +0300 Subject: [PATCH 08/16] net/mlx5: E-switch, Fix error handling for enabling roce The cited commit assumes enabling roce always succeeds. But it is not true. Add error handling for it. 
Fixes: 80f09dfc237f ("net/mlx5: Eswitch, enable RoCE loopback traffic") Signed-off-by: Chris Mi Reviewed-by: Roi Dayan Reviewed-by: Maor Gottlieb Signed-off-by: Mark Bloch Reviewed-by: Michal Swiatkowski Link: https://patch.msgid.link/20250423083611.324567-6-mbloch@nvidia.com Signed-off-by: Jakub Kicinski --- .../net/ethernet/mellanox/mlx5/core/eswitch_offloads.c | 5 ++++- drivers/net/ethernet/mellanox/mlx5/core/rdma.c | 9 +++++---- drivers/net/ethernet/mellanox/mlx5/core/rdma.h | 4 ++-- 3 files changed, 11 insertions(+), 7 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c index a6a8eea5980c..0e3a977d5332 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c @@ -3533,7 +3533,9 @@ int esw_offloads_enable(struct mlx5_eswitch *esw) int err; mutex_init(&esw->offloads.termtbl_mutex); - mlx5_rdma_enable_roce(esw->dev); + err = mlx5_rdma_enable_roce(esw->dev); + if (err) + goto err_roce; err = mlx5_esw_host_number_init(esw); if (err) @@ -3594,6 +3596,7 @@ err_vport_metadata: esw_offloads_metadata_uninit(esw); err_metadata: mlx5_rdma_disable_roce(esw->dev); +err_roce: mutex_destroy(&esw->offloads.termtbl_mutex); return err; } diff --git a/drivers/net/ethernet/mellanox/mlx5/core/rdma.c b/drivers/net/ethernet/mellanox/mlx5/core/rdma.c index f585ef5a3424..5c552b71e371 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/rdma.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/rdma.c @@ -140,17 +140,17 @@ void mlx5_rdma_disable_roce(struct mlx5_core_dev *dev) mlx5_nic_vport_disable_roce(dev); } -void mlx5_rdma_enable_roce(struct mlx5_core_dev *dev) +int mlx5_rdma_enable_roce(struct mlx5_core_dev *dev) { int err; if (!MLX5_CAP_GEN(dev, roce)) - return; + return 0; err = mlx5_nic_vport_enable_roce(dev); if (err) { mlx5_core_err(dev, "Failed to enable RoCE: %d\n", err); - return; + return err; } err = 
mlx5_rdma_add_roce_addr(dev); @@ -165,10 +165,11 @@ void mlx5_rdma_enable_roce(struct mlx5_core_dev *dev) goto del_roce_addr; } - return; + return err; del_roce_addr: mlx5_rdma_del_roce_addr(dev); disable_roce: mlx5_nic_vport_disable_roce(dev); + return err; } diff --git a/drivers/net/ethernet/mellanox/mlx5/core/rdma.h b/drivers/net/ethernet/mellanox/mlx5/core/rdma.h index 750cff2a71a4..3d9e76c3d42f 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/rdma.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/rdma.h @@ -8,12 +8,12 @@ #ifdef CONFIG_MLX5_ESWITCH -void mlx5_rdma_enable_roce(struct mlx5_core_dev *dev); +int mlx5_rdma_enable_roce(struct mlx5_core_dev *dev); void mlx5_rdma_disable_roce(struct mlx5_core_dev *dev); #else /* CONFIG_MLX5_ESWITCH */ -static inline void mlx5_rdma_enable_roce(struct mlx5_core_dev *dev) {} +static inline int mlx5_rdma_enable_roce(struct mlx5_core_dev *dev) { return 0; } static inline void mlx5_rdma_disable_roce(struct mlx5_core_dev *dev) {} #endif /* CONFIG_MLX5_ESWITCH */ -- 2.51.0 From 4c2227656d9003f4d77afc76f34dd81b95e4c2c4 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Wed, 23 Apr 2025 15:36:00 +0200 Subject: [PATCH 09/16] vmxnet3: Fix malformed packet sizing in vmxnet3_process_xdp vmxnet3 driver's XDP handling is buggy for packet sizes using ring0 (that is, packet sizes between 128 - 3k bytes). We noticed MTU-related connectivity issues with Cilium's service load-balancing in case of vmxnet3 as NIC underneath. A simple curl to a HTTP backend service where the XDP LB was doing IPIP encap led to overly large packet sizes but only for *some* of the packets (e.g. HTTP GET request) while others (e.g. the prior TCP 3WHS) looked completely fine on the wire. 
In fact, the pcap recording on the backend node actually revealed that the node with the XDP LB was leaking uninitialized kernel data onto the wire for the affected packets, for example, while the packets should have been 152 bytes, their actual size was 1482 bytes, so the remainder after 152 bytes was padded with whatever other data was in that page at the time (e.g. we saw user/payload data from prior processed packets). We only noticed this through an MTU issue, e.g. when the XDP LB node and the backend node both had the same MTU (e.g. 1500) then the curl request got dropped on the backend node's NIC given the packet was too large even though the IPIP-encapped packet normally would never even come close to the MTU limit. Lowering the MTU on the XDP LB (e.g. 1480) allowed the curl request to succeed (which also indicates that the kernel ignored the padding, and thus the issue wasn't very user-visible). Commit e127ce7699c1 ("vmxnet3: Fix missing reserved tailroom") was too eager to also switch xdp_prepare_buff() from rcd->len to rbi->len. It really needs to stick to rcd->len which is the actual packet length from the descriptor. The latter we also feed into vmxnet3_process_xdp_small(), by the way, and it indicates the correct length needed to initialize the xdp->{data,data_end} parts. For e127ce7699c1 ("vmxnet3: Fix missing reserved tailroom") the relevant part was adapting xdp_init_buff() to address the warning given the xdp_data_hard_end() depends on xdp->frame_sz. With that fixed, traffic on the wire looks good again. 
Fixes: e127ce7699c1 ("vmxnet3: Fix missing reserved tailroom") Signed-off-by: Daniel Borkmann Tested-by: Andrew Sauber Cc: Anton Protopopov Cc: William Tu Cc: Martin Zaharinov Cc: Ronak Doshi Reviewed-by: Simon Horman Link: https://patch.msgid.link/20250423133600.176689-1-daniel@iogearbox.net Signed-off-by: Jakub Kicinski --- drivers/net/vmxnet3/vmxnet3_xdp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/vmxnet3/vmxnet3_xdp.c b/drivers/net/vmxnet3/vmxnet3_xdp.c index 616ecc38d172..5f470499e600 100644 --- a/drivers/net/vmxnet3/vmxnet3_xdp.c +++ b/drivers/net/vmxnet3/vmxnet3_xdp.c @@ -397,7 +397,7 @@ vmxnet3_process_xdp(struct vmxnet3_adapter *adapter, xdp_init_buff(&xdp, PAGE_SIZE, &rq->xdp_rxq); xdp_prepare_buff(&xdp, page_address(page), rq->page_pool->p.offset, - rbi->len, false); + rcd->len, false); xdp_buff_clear_frags_flag(&xdp); xdp_prog = rcu_dereference(rq->adapter->xdp_bpf_prog); -- 2.51.0 From 5ec6d7d737a491256cd37e33910f7ac1978db591 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Fri, 25 Apr 2025 01:37:33 +0300 Subject: [PATCH 10/16] net: mscc: ocelot: delete PVID VLAN when readding it as non-PVID The following set of commands: ip link add br0 type bridge vlan_filtering 1 # vlan_default_pvid 1 is implicit ip link set swp0 master br0 bridge vlan add dev swp0 vid 1 should result in the dropping of untagged and 802.1p-tagged traffic, but we see that it continues to be accepted. Whereas, had we deleted VID 1 instead, the aforementioned dropping would have worked. This is because the ANA_PORT_DROP_CFG update logic doesn't run, because ocelot_vlan_add() only calls ocelot_port_set_pvid() if the new VLAN has the BRIDGE_VLAN_INFO_PVID flag. Similar to other drivers like mt7530_port_vlan_add() which handle this case correctly, we need to test whether the VLAN we're changing used to have the BRIDGE_VLAN_INFO_PVID flag, but lost it now. That amounts to a PVID deletion and should be treated as such. 
Regarding blame attribution: this never worked properly since the introduction of bridge VLAN filtering in commit 7142529f1688 ("net: mscc: ocelot: add VLAN filtering"). However, there was a significant paradigm shift which aligned the ANA_PORT_DROP_CFG register with the PVID concept rather than with the native VLAN concept, and that change wasn't targeted for 'stable'. Realistically, that is as far as this fix needs to be propagated to. Fixes: be0576fed6d3 ("net: mscc: ocelot: move the logic to drop 802.1p traffic to the pvid deletion") Signed-off-by: Vladimir Oltean Link: https://patch.msgid.link/20250424223734.3096202-1-vladimir.oltean@nxp.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/mscc/ocelot.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/drivers/net/ethernet/mscc/ocelot.c b/drivers/net/ethernet/mscc/ocelot.c index ef93df520887..08bee56aea35 100644 --- a/drivers/net/ethernet/mscc/ocelot.c +++ b/drivers/net/ethernet/mscc/ocelot.c @@ -830,6 +830,7 @@ EXPORT_SYMBOL(ocelot_vlan_prepare); int ocelot_vlan_add(struct ocelot *ocelot, int port, u16 vid, bool pvid, bool untagged) { + struct ocelot_port *ocelot_port = ocelot->ports[port]; int err; /* Ignore VID 0 added to our RX filter by the 8021q module, since @@ -849,6 +850,11 @@ int ocelot_vlan_add(struct ocelot *ocelot, int port, u16 vid, bool pvid, ocelot_bridge_vlan_find(ocelot, vid)); if (err) return err; + } else if (ocelot_port->pvid_vlan && + ocelot_bridge_vlan_find(ocelot, vid) == ocelot_port->pvid_vlan) { + err = ocelot_port_set_pvid(ocelot, port, NULL); + if (err) + return err; } /* Untagged egress vlan clasification */ -- 2.51.0 From bf9de1dcd0eecd16020a677c900a70ea9b0a9714 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Fri, 25 Apr 2025 01:37:34 +0300 Subject: [PATCH 11/16] selftests: net: bridge_vlan_aware: test untagged/8021p-tagged with and without PVID Recent discussions around commit ad1afb003939 ("vlan_dev: VLAN 0 should be treated as "no vlan tag" (802.1p packet)") 
have sparked the question what happens with the DSA (and possibly other switchdev) data path when the bridge says that ports should have no PVID VLAN, but the 8021q module, as the result of a NETDEV_UP event, decides it should add VID 0 to the RX filter of those bridge ports. Do those bridge ports receive packets tagged with VID 0 or not, now? We don't know, there is no test. In the veth realm, this passes trivially, because veth is not VLAN filtering and thus, the 8021q module lacks the instinct to add VID 0 in the first place. In the realm of VLAN filtering NICs with no switchdev offload, this should also pass, because the VLAN groups of the software bridge are consulted, where it can clearly be seen that a PVID is missing, even though the packet was initially accepted by the NIC. The test only poses a challenge for switchdev drivers, which usually have to program to hardware both VLANs from RX filtering, as well as from switchdev. Especially when a switchdev port joins a VLAN-aware bridge, it is unavoidable that it gains the NETIF_F_HW_VLAN_CTAG_FILTER feature, i.e. any 8021q uppers that the bridge port may have must also be committed to the RX filtering table of the interface. When a VLAN-tagged packet is physically received by the port, it is initially indistinguishable whether it will reach the bridge data path or the 8021q upper data path. That is rather the final step of the new tests that we introduce. We need to build context up to that stage, which means the following: - we need to test that 802.1p (VID 0) tagged traffic is received in the first place (on bridge ports with a valid PVID). This is the "8021p" test. - we need to test that the usual paths of reaching a configuration with no PVID on a bridge port are all covered and they all reach the same state. 
Signed-off-by: Vladimir Oltean Reviewed-by: Ido Schimmel Tested-by: Ido Schimmel Link: https://patch.msgid.link/20250424223734.3096202-2-vladimir.oltean@nxp.com Signed-off-by: Jakub Kicinski --- .../net/forwarding/bridge_vlan_aware.sh | 96 ++++++++++++++++++- 1 file changed, 95 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/net/forwarding/bridge_vlan_aware.sh b/tools/testing/selftests/net/forwarding/bridge_vlan_aware.sh index 90f8a244ea90..e59fba366a0a 100755 --- a/tools/testing/selftests/net/forwarding/bridge_vlan_aware.sh +++ b/tools/testing/selftests/net/forwarding/bridge_vlan_aware.sh @@ -1,7 +1,7 @@ #!/bin/bash # SPDX-License-Identifier: GPL-2.0 -ALL_TESTS="ping_ipv4 ping_ipv6 learning flooding vlan_deletion extern_learn other_tpid" +ALL_TESTS="ping_ipv4 ping_ipv6 learning flooding vlan_deletion extern_learn other_tpid 8021p drop_untagged" NUM_NETIFS=4 CHECK_TC="yes" source lib.sh @@ -194,6 +194,100 @@ other_tpid() tc qdisc del dev $h2 clsact } +8021p_do() +{ + local should_fail=$1; shift + local mac=de:ad:be:ef:13:37 + + tc filter add dev $h2 ingress protocol all pref 1 handle 101 \ + flower dst_mac $mac action drop + + $MZ -q $h1 -c 1 -b $mac -a own "81:00 00:00 08:00 aa-aa-aa-aa-aa-aa-aa-aa-aa" + sleep 1 + + tc -j -s filter show dev $h2 ingress \ + | jq -e ".[] | select(.options.handle == 101) \ + | select(.options.actions[0].stats.packets == 1)" &> /dev/null + check_err_fail $should_fail $? "802.1p-tagged reception" + + tc filter del dev $h2 ingress pref 1 +} + +8021p() +{ + RET=0 + + tc qdisc add dev $h2 clsact + ip link set $h2 promisc on + + # Test that with the default_pvid, 1, packets tagged with VID 0 are + # accepted. + 8021p_do 0 + + # Test that packets tagged with VID 0 are still accepted after changing + # the default_pvid. 
+ ip link set br0 type bridge vlan_default_pvid 10 + 8021p_do 0 + + log_test "Reception of 802.1p-tagged traffic" + + ip link set $h2 promisc off + tc qdisc del dev $h2 clsact +} + +send_untagged_and_8021p() +{ + ping_do $h1 192.0.2.2 + check_fail $? + + 8021p_do 1 +} + +drop_untagged() +{ + RET=0 + + tc qdisc add dev $h2 clsact + ip link set $h2 promisc on + + # Test that with no PVID, untagged and 802.1p-tagged traffic is + # dropped. + ip link set br0 type bridge vlan_default_pvid 1 + + # First we reconfigure the default_pvid, 1, as a non-PVID VLAN. + bridge vlan add dev $swp1 vid 1 untagged + send_untagged_and_8021p + bridge vlan add dev $swp1 vid 1 pvid untagged + + # Next we try to delete VID 1 altogether + bridge vlan del dev $swp1 vid 1 + send_untagged_and_8021p + bridge vlan add dev $swp1 vid 1 pvid untagged + + # Set up the bridge without a default_pvid, then check that the 8021q + # module, when the bridge port goes down and then up again, does not + # accidentally re-enable untagged packet reception. + ip link set br0 type bridge vlan_default_pvid 0 + ip link set $swp1 down + ip link set $swp1 up + setup_wait + send_untagged_and_8021p + + # Remove swp1 as a bridge port and let it rejoin the bridge while it + # has no default_pvid. + ip link set $swp1 nomaster + ip link set $swp1 master br0 + send_untagged_and_8021p + + # Restore settings + ip link set br0 type bridge vlan_default_pvid 1 + + log_test "Dropping of untagged and 802.1p-tagged traffic with no PVID" + + ip link set $h2 promisc off + tc qdisc del dev $h2 clsact +} + trap cleanup EXIT setup_prepare -- 2.51.0 From 765f253e28909f161b0211f85cf0431cfee7d6df Mon Sep 17 00:00:00 2001 From: Christian Heusel Date: Thu, 24 Apr 2025 16:00:28 +0200 Subject: [PATCH 12/16] Revert "rndis_host: Flag RNDIS modems as WWAN devices" This reverts commit 67d1a8956d2d62fe6b4c13ebabb57806098511d8. 
Since this commit has been proven to be problematic for the setup of USB-tethered ethernet connections and the related breakage is very noticeable for users it should be reverted until a fixed version of the change can be rolled out. Closes: https://lore.kernel.org/all/e0df2d85-1296-4317-b717-bd757e3ab928@heusel.eu/ Link: https://chaos.social/@gromit/114377862699921553 Link: https://bugzilla.kernel.org/show_bug.cgi?id=220002 Link: https://bugs.gentoo.org/953555 Link: https://bbs.archlinux.org/viewtopic.php?id=304892 Cc: stable@vger.kernel.org Acked-by: Lubomir Rintel Signed-off-by: Christian Heusel Link: https://patch.msgid.link/20250424-usb-tethering-fix-v1-1-b65cf97c740e@heusel.eu Signed-off-by: Jakub Kicinski --- drivers/net/usb/rndis_host.c | 16 ++-------------- 1 file changed, 2 insertions(+), 14 deletions(-) diff --git a/drivers/net/usb/rndis_host.c b/drivers/net/usb/rndis_host.c index bb0bf1415872..7b3739b29c8f 100644 --- a/drivers/net/usb/rndis_host.c +++ b/drivers/net/usb/rndis_host.c @@ -630,16 +630,6 @@ static const struct driver_info zte_rndis_info = { .tx_fixup = rndis_tx_fixup, }; -static const struct driver_info wwan_rndis_info = { - .description = "Mobile Broadband RNDIS device", - .flags = FLAG_WWAN | FLAG_POINTTOPOINT | FLAG_FRAMING_RN | FLAG_NO_SETINT, - .bind = rndis_bind, - .unbind = rndis_unbind, - .status = rndis_status, - .rx_fixup = rndis_rx_fixup, - .tx_fixup = rndis_tx_fixup, -}; - /*-------------------------------------------------------------------------*/ static const struct usb_device_id products [] = { @@ -676,11 +666,9 @@ static const struct usb_device_id products [] = { USB_INTERFACE_INFO(USB_CLASS_WIRELESS_CONTROLLER, 1, 3), .driver_info = (unsigned long) &rndis_info, }, { - /* Mobile Broadband Modem, seen in Novatel Verizon USB730L and - * Telit FN990A (RNDIS) - */ + /* Novatel Verizon USB730L */ USB_INTERFACE_INFO(USB_CLASS_MISC, 4, 1), - .driver_info = (unsigned long)&wwan_rndis_info, + .driver_info = (unsigned long) 
&rndis_info, }, { }, // END }; -- 2.51.0 From 8548c84c004be3da4ffbe35ed0589041a4050c03 Mon Sep 17 00:00:00 2001 From: Sathesh B Edara Date: Thu, 24 Apr 2025 06:39:44 -0700 Subject: [PATCH 13/16] octeon_ep_vf: Resolve netdevice usage count issue The netdevice usage count increases during transmit queue timeouts because netdev_hold is called in ndo_tx_timeout, scheduling a task to reinitialize the card. Although netdev_put is called at the end of the scheduled work, rtnl_unlock checks the reference count during cleanup. This could cause issues if transmit timeout is called on multiple queues. Fixes: cb7dd712189f ("octeon_ep_vf: Add driver framework and device initialization") Signed-off-by: Sathesh B Edara Link: https://patch.msgid.link/20250424133944.28128-1-sedara@marvell.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/marvell/octeon_ep_vf/octep_vf_main.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/marvell/octeon_ep_vf/octep_vf_main.c b/drivers/net/ethernet/marvell/octeon_ep_vf/octep_vf_main.c index 18c922dd5fc6..ccb69bc5c952 100644 --- a/drivers/net/ethernet/marvell/octeon_ep_vf/octep_vf_main.c +++ b/drivers/net/ethernet/marvell/octeon_ep_vf/octep_vf_main.c @@ -835,7 +835,9 @@ static void octep_vf_tx_timeout(struct net_device *netdev, unsigned int txqueue) struct octep_vf_device *oct = netdev_priv(netdev); netdev_hold(netdev, NULL, GFP_ATOMIC); - schedule_work(&oct->tx_timeout_task); + if (!schedule_work(&oct->tx_timeout_task)) + netdev_put(netdev, NULL); + } static int octep_vf_set_mac(struct net_device *netdev, void *p) -- 2.51.0 From 8f7ae5a85137b913cb97e2d24409d36548d0bab1 Mon Sep 17 00:00:00 2001 From: Vadim Fedorenko Date: Thu, 24 Apr 2025 05:55:47 -0700 Subject: [PATCH 14/16] bnxt_en: improve TX timestamping FIFO configuration Reconfiguration of netdev may trigger close/open procedure which can break FIFO status by adjusting the amount of empty slots for TX timestamps. 
But it is not really needed because timestamps for the packets sent over the wire still can be retrieved. On the other side, during netdev close procedure any skbs waiting for TX timestamps can be leaked because there is no cleaning procedure called. Free skbs waiting for TX timestamps when closing netdev. Fixes: 8aa2a79e9b95 ("bnxt_en: Increase the max total outstanding PTP TX packets to 4") Reviewed-by: Michael Chan Reviewed-by: Pavan Chebbi Signed-off-by: Vadim Fedorenko Link: https://patch.msgid.link/20250424125547.460632-1-vadfed@meta.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/broadcom/bnxt/bnxt.c | 5 ++-- drivers/net/ethernet/broadcom/bnxt/bnxt_ptp.c | 29 ++++++++++++++----- drivers/net/ethernet/broadcom/bnxt/bnxt_ptp.h | 1 + 3 files changed, 25 insertions(+), 10 deletions(-) diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c index c8e3468eee61..2c8e2c19d854 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c @@ -3414,6 +3414,9 @@ static void bnxt_free_tx_skbs(struct bnxt *bp) bnxt_free_one_tx_ring_skbs(bp, txr, i); } + + if (bp->ptp_cfg && !(bp->fw_cap & BNXT_FW_CAP_TX_TS_CMP)) + bnxt_ptp_free_txts_skbs(bp->ptp_cfg); } static void bnxt_free_one_rx_ring(struct bnxt *bp, struct bnxt_rx_ring_info *rxr) @@ -12797,8 +12800,6 @@ static int __bnxt_open_nic(struct bnxt *bp, bool irq_re_init, bool link_re_init) /* VF-reps may need to be re-opened after the PF is re-opened */ if (BNXT_PF(bp)) bnxt_vf_reps_open(bp); - if (bp->ptp_cfg && !(bp->fw_cap & BNXT_FW_CAP_TX_TS_CMP)) - WRITE_ONCE(bp->ptp_cfg->tx_avail, BNXT_MAX_TX_TS); bnxt_ptp_init_rtc(bp, true); bnxt_ptp_cfg_tstamp_filters(bp); if (BNXT_SUPPORTS_MULTI_RSS_CTX(bp)) diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_ptp.c b/drivers/net/ethernet/broadcom/bnxt/bnxt_ptp.c index 2d4e19b96ee7..0669d43472f5 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt_ptp.c +++ 
b/drivers/net/ethernet/broadcom/bnxt/bnxt_ptp.c @@ -794,6 +794,27 @@ next_slot: return HZ; } +void bnxt_ptp_free_txts_skbs(struct bnxt_ptp_cfg *ptp) +{ + struct bnxt_ptp_tx_req *txts_req; + u16 cons = ptp->txts_cons; + + /* make sure ptp aux worker finished with + * possible BNXT_STATE_OPEN set + */ + ptp_cancel_worker_sync(ptp->ptp_clock); + + ptp->tx_avail = BNXT_MAX_TX_TS; + while (cons != ptp->txts_prod) { + txts_req = &ptp->txts_req[cons]; + if (!IS_ERR_OR_NULL(txts_req->tx_skb)) + dev_kfree_skb_any(txts_req->tx_skb); + cons = NEXT_TXTS(cons); + } + ptp->txts_cons = cons; + ptp_schedule_worker(ptp->ptp_clock, 0); +} + int bnxt_ptp_get_txts_prod(struct bnxt_ptp_cfg *ptp, u16 *prod) { spin_lock_bh(&ptp->ptp_tx_lock); @@ -1105,7 +1126,6 @@ out: void bnxt_ptp_clear(struct bnxt *bp) { struct bnxt_ptp_cfg *ptp = bp->ptp_cfg; - int i; if (!ptp) return; @@ -1117,12 +1137,5 @@ void bnxt_ptp_clear(struct bnxt *bp) kfree(ptp->ptp_info.pin_config); ptp->ptp_info.pin_config = NULL; - for (i = 0; i < BNXT_MAX_TX_TS; i++) { - if (ptp->txts_req[i].tx_skb) { - dev_kfree_skb_any(ptp->txts_req[i].tx_skb); - ptp->txts_req[i].tx_skb = NULL; - } - } - bnxt_unmap_ptp_regs(bp); } diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_ptp.h b/drivers/net/ethernet/broadcom/bnxt/bnxt_ptp.h index a95f05e9c579..0481161d26ef 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt_ptp.h +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_ptp.h @@ -162,6 +162,7 @@ int bnxt_ptp_cfg_tstamp_filters(struct bnxt *bp); void bnxt_ptp_reapply_pps(struct bnxt *bp); int bnxt_hwtstamp_set(struct net_device *dev, struct ifreq *ifr); int bnxt_hwtstamp_get(struct net_device *dev, struct ifreq *ifr); +void bnxt_ptp_free_txts_skbs(struct bnxt_ptp_cfg *ptp); int bnxt_ptp_get_txts_prod(struct bnxt_ptp_cfg *ptp, u16 *prod); void bnxt_get_tx_ts_p5(struct bnxt *bp, struct sk_buff *skb, u16 prod); int bnxt_get_rx_ts_p5(struct bnxt *bp, u64 *ts, u32 pkt_ts); -- 2.51.0 From 68f9d8974b545668e1be2422240b25a92e304b14 Mon Sep 17 
00:00:00 2001 From: Justin Lai Date: Thu, 24 Apr 2025 12:04:44 +0800 Subject: [PATCH 15/16] rtase: Modify the condition used to detect overflow in rtase_calc_time_mitigation Fix the following compile error reported by the kernel test robot by modifying the condition used to detect overflow in rtase_calc_time_mitigation. In file included from include/linux/mdio.h:10:0, from drivers/net/ethernet/realtek/rtase/rtase_main.c:58: In function 'u16_encode_bits', inlined from 'rtase_calc_time_mitigation.constprop' at drivers/net/ ethernet/realtek/rtase/rtase_main.c:1915:13, inlined from 'rtase_init_software_variable.isra.41' at drivers/net/ ethernet/realtek/rtase/rtase_main.c:1961:13, inlined from 'rtase_init_one' at drivers/net/ethernet/realtek/ rtase/rtase_main.c:2111:2: >> include/linux/bitfield.h:178:3: error: call to '__field_overflow' declared with attribute error: value doesn't fit into mask __field_overflow(); \ ^~~~~~~~~~~~~~~~~~ include/linux/bitfield.h:198:2: note: in expansion of macro '____MAKE_OP' ____MAKE_OP(u##size,u##size,,) ^~~~~~~~~~~ include/linux/bitfield.h:200:1: note: in expansion of macro '__MAKE_OP' __MAKE_OP(16) ^~~~~~~~~ Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202503182158.nkAlbJWX-lkp@intel.com/ Fixes: a36e9f5cfe9e ("rtase: Add support for a pci table in this module") Signed-off-by: Justin Lai Reviewed-by: Simon Horman Link: https://patch.msgid.link/20250424040444.5530-1-justinlai0215@realtek.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/realtek/rtase/rtase_main.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/realtek/rtase/rtase_main.c b/drivers/net/ethernet/realtek/rtase/rtase_main.c index 2aacc1996796..55b8d3666153 100644 --- a/drivers/net/ethernet/realtek/rtase/rtase_main.c +++ b/drivers/net/ethernet/realtek/rtase/rtase_main.c @@ -1925,8 +1925,8 @@ static u16 rtase_calc_time_mitigation(u32 time_us) time_us = min_t(int, time_us, RTASE_MITI_MAX_TIME); 
- msb = fls(time_us); - if (msb >= RTASE_MITI_COUNT_BIT_NUM) { + if (time_us > RTASE_MITI_TIME_COUNT_MASK) { + msb = fls(time_us); time_unit = msb - RTASE_MITI_COUNT_BIT_NUM; time_count = time_us >> (msb - RTASE_MITI_COUNT_BIT_NUM); } else { -- 2.51.0 From 6fe0866014486736cc3ba1c6fd4606d3dbe55c9c Mon Sep 17 00:00:00 2001 From: Louis-Alexis Eyraud Date: Thu, 24 Apr 2025 10:38:48 +0200 Subject: [PATCH 16/16] net: ethernet: mtk-star-emac: fix spinlock recursion issues on rx/tx poll Use spin_lock_irqsave and spin_unlock_irqrestore instead of spin_lock and spin_unlock in mtk_star_emac driver to avoid spinlock recursion occurrence that can happen when enabling the DMA interrupts again in rx/tx poll. ``` BUG: spinlock recursion on CPU#0, swapper/0/0 lock: 0xffff00000db9cf20, .magic: dead4ead, .owner: swapper/0/0, .owner_cpu: 0 CPU: 0 UID: 0 PID: 0 Comm: swapper/0 Not tainted 6.15.0-rc2-next-20250417-00001-gf6a27738686c-dirty #28 PREEMPT Hardware name: MediaTek MT8365 Open Platform EVK (DT) Call trace: show_stack+0x18/0x24 (C) dump_stack_lvl+0x60/0x80 dump_stack+0x18/0x24 spin_dump+0x78/0x88 do_raw_spin_lock+0x11c/0x120 _raw_spin_lock+0x20/0x2c mtk_star_handle_irq+0xc0/0x22c [mtk_star_emac] __handle_irq_event_percpu+0x48/0x140 handle_irq_event+0x4c/0xb0 handle_fasteoi_irq+0xa0/0x1bc handle_irq_desc+0x34/0x58 generic_handle_domain_irq+0x1c/0x28 gic_handle_irq+0x4c/0x120 do_interrupt_handler+0x50/0x84 el1_interrupt+0x34/0x68 el1h_64_irq_handler+0x18/0x24 el1h_64_irq+0x6c/0x70 regmap_mmio_read32le+0xc/0x20 (P) _regmap_bus_reg_read+0x6c/0xac _regmap_read+0x60/0xdc regmap_read+0x4c/0x80 mtk_star_rx_poll+0x2f4/0x39c [mtk_star_emac] __napi_poll+0x38/0x188 net_rx_action+0x164/0x2c0 handle_softirqs+0x100/0x244 __do_softirq+0x14/0x20 ____do_softirq+0x10/0x20 call_on_irq_stack+0x24/0x64 do_softirq_own_stack+0x1c/0x40 __irq_exit_rcu+0xd4/0x10c irq_exit_rcu+0x10/0x1c el1_interrupt+0x38/0x68 el1h_64_irq_handler+0x18/0x24 el1h_64_irq+0x6c/0x70 cpuidle_enter_state+0xac/0x320 (P) 
cpuidle_enter+0x38/0x50 do_idle+0x1e4/0x260 cpu_startup_entry+0x34/0x3c rest_init+0xdc/0xe0 console_on_rootfs+0x0/0x6c __primary_switched+0x88/0x90 ``` Fixes: 0a8bd81fd6aa ("net: ethernet: mtk-star-emac: separate tx/rx handling with two NAPIs") Signed-off-by: Louis-Alexis Eyraud Reviewed-by: Maxime Chevallier Acked-by: Bartosz Golaszewski Link: https://patch.msgid.link/20250424-mtk_star_emac-fix-spinlock-recursion-issue-v2-1-f3fde2e529d8@collabora.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/mediatek/mtk_star_emac.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/drivers/net/ethernet/mediatek/mtk_star_emac.c b/drivers/net/ethernet/mediatek/mtk_star_emac.c index 76f202d7f055..23115881d8e8 100644 --- a/drivers/net/ethernet/mediatek/mtk_star_emac.c +++ b/drivers/net/ethernet/mediatek/mtk_star_emac.c @@ -1163,6 +1163,7 @@ static int mtk_star_tx_poll(struct napi_struct *napi, int budget) struct net_device *ndev = priv->ndev; unsigned int head = ring->head; unsigned int entry = ring->tail; + unsigned long flags; while (entry != head && count < (MTK_STAR_RING_NUM_DESCS - 1)) { ret = mtk_star_tx_complete_one(priv); @@ -1182,9 +1183,9 @@ static int mtk_star_tx_poll(struct napi_struct *napi, int budget) netif_wake_queue(ndev); if (napi_complete(napi)) { - spin_lock(&priv->lock); + spin_lock_irqsave(&priv->lock, flags); mtk_star_enable_dma_irq(priv, false, true); - spin_unlock(&priv->lock); + spin_unlock_irqrestore(&priv->lock, flags); } return 0; @@ -1341,6 +1342,7 @@ push_new_skb: static int mtk_star_rx_poll(struct napi_struct *napi, int budget) { struct mtk_star_priv *priv; + unsigned long flags; int work_done = 0; priv = container_of(napi, struct mtk_star_priv, rx_napi); @@ -1348,9 +1350,9 @@ static int mtk_star_rx_poll(struct napi_struct *napi, int budget) work_done = mtk_star_rx(priv, budget); if (work_done < budget) { napi_complete_done(napi, work_done); - spin_lock(&priv->lock); + spin_lock_irqsave(&priv->lock, flags); 
mtk_star_enable_dma_irq(priv, true, false); - spin_unlock(&priv->lock); + spin_unlock_irqrestore(&priv->lock, flags); } return work_done; -- 2.51.0