From ed16b8a4d1ca901fc13ced042b76dde54738249a Mon Sep 17 00:00:00 2001 From: Alexander Lobakin Date: Tue, 25 Feb 2025 18:17:48 +0100 Subject: [PATCH 01/16] bpf: cpumap: switch to napi_skb_cache_get_bulk() MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit Now that cpumap uses GRO, which drops unused skb heads to the NAPI cache, use napi_skb_cache_get_bulk() to try to reuse cached entries and lower MM layer pressure. Always disable the BH before checking and running the cpumap-pinned XDP prog and don't re-enable it in between that and allocating an skb bulk, as we can access the NAPI caches only from the BH context. The better GRO aggregates packets, the less new skbs will be allocated. If an aggregated skb contains 16 frags, this means 15 skbs were returned to the cache, so next 15 skbs will be built without allocating anything. The same trafficgen UDP GRO test now shows: GRO off GRO on threaded GRO 2.3 4 Mpps thr bulk GRO 2.4 4.7 Mpps diff +4 +17 % Comparing to the baseline cpumap: baseline 2.7 N/A Mpps thr bulk GRO 2.4 4.7 Mpps diff -11 +74 % Tested-by: Daniel Xu Reviewed-by: Toke Høiland-Jørgensen Signed-off-by: Alexander Lobakin Signed-off-by: Paolo Abeni --- kernel/bpf/cpumap.c | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c index 85936f09d8d7..67e8a2fc1a99 100644 --- a/kernel/bpf/cpumap.c +++ b/kernel/bpf/cpumap.c @@ -253,7 +253,7 @@ static void cpu_map_bpf_prog_run(struct bpf_cpu_map_entry *rcpu, void **frames, if (!rcpu->prog) goto out; - rcu_read_lock_bh(); + rcu_read_lock(); bpf_net_ctx = bpf_net_ctx_set(&__bpf_net_ctx); ret->xdp_n = cpu_map_bpf_prog_run_xdp(rcpu, frames, ret->xdp_n, stats); @@ -265,7 +265,7 @@ static void cpu_map_bpf_prog_run(struct bpf_cpu_map_entry *rcpu, void **frames, xdp_do_flush(); bpf_net_ctx_clear(bpf_net_ctx); - rcu_read_unlock_bh(); /* resched point, may call do_softirq() */ + rcu_read_unlock(); out: if (unlikely(ret->skb_n) && ret->xdp_n) @@ -303,7 +303,6 @@ static int cpu_map_kthread_run(void *data) while (!kthread_should_stop() || !__ptr_ring_empty(rcpu->queue)) { struct xdp_cpumap_stats stats = {}; /* zero stats */ unsigned int kmem_alloc_drops = 0, sched = 0; - gfp_t gfp = __GFP_ZERO | GFP_ATOMIC; struct cpu_map_ret ret = { }; void *frames[CPUMAP_BATCH]; void *skbs[CPUMAP_BATCH]; @@ -355,15 +354,14 @@ static int cpu_map_kthread_run(void *data) prefetchw(page); } + local_bh_disable(); + /* Support running another XDP prog on this CPU */ cpu_map_bpf_prog_run(rcpu, frames, skbs, &ret, &stats); - if (!ret.xdp_n) { - local_bh_disable(); + if (!ret.xdp_n) goto stats; - } - m = kmem_cache_alloc_bulk(net_hotdata.skbuff_cache, gfp, - ret.xdp_n, skbs); + m = napi_skb_cache_get_bulk(skbs, ret.xdp_n); if (unlikely(m < ret.xdp_n)) { for (i = m; i < ret.xdp_n; i++) xdp_return_frame(frames[i]); @@ -376,7 +374,6 @@ static int cpu_map_kthread_run(void *data) ret.xdp_n = m; } - local_bh_disable(); for (i = 0; i < ret.xdp_n; i++) { struct xdp_frame *xdpf = frames[i]; -- 2.51.0 From 1c5bf4de975dd4d493cea3567703404819c81425 Mon Sep 17 00:00:00 2001 From: Alexander Lobakin Date: Tue, 25 Feb 2025 18:17:49 +0100 Subject: [PATCH 02/16] veth: use napi_skb_cache_get_bulk() instead of xdp_alloc_skb_bulk() MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit Now that we can bulk-allocate skbs from the NAPI cache, use that function to do that in veth as well instead of direct allocation from the kmem caches. 
veth uses NAPI and GRO, so this is both context-safe and beneficial. Reviewed-by: Toke Høiland-Jørgensen Signed-off-by: Alexander Lobakin Signed-off-by: Paolo Abeni --- drivers/net/veth.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/net/veth.c b/drivers/net/veth.c index ba3ae2d8092f..05f5eeef539f 100644 --- a/drivers/net/veth.c +++ b/drivers/net/veth.c @@ -684,8 +684,7 @@ static void veth_xdp_rcv_bulk_skb(struct veth_rq *rq, void **frames, void *skbs[VETH_XDP_BATCH]; int i; - if (xdp_alloc_skb_bulk(skbs, n_xdpf, - GFP_ATOMIC | __GFP_ZERO) < 0) { + if (unlikely(!napi_skb_cache_get_bulk(skbs, n_xdpf))) { for (i = 0; i < n_xdpf; i++) xdp_return_frame(frames[i]); stats->rx_drops += n_xdpf; -- 2.51.0 From b696d289c07d8480a7d4752e448f4ee2bee9e443 Mon Sep 17 00:00:00 2001 From: Alexander Lobakin Date: Tue, 25 Feb 2025 18:17:50 +0100 Subject: [PATCH 03/16] xdp: remove xdp_alloc_skb_bulk() MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit The only user was veth, which now uses napi_skb_cache_get_bulk(). It's now preferred over a direct allocation and is exported as well, so remove this one. Reviewed-by: Toke Høiland-Jørgensen Signed-off-by: Alexander Lobakin Signed-off-by: Paolo Abeni --- include/net/xdp.h | 1 - net/core/xdp.c | 10 ---------- 2 files changed, 11 deletions(-) diff --git a/include/net/xdp.h b/include/net/xdp.h index 4dafc5e021f1..48efacbaa35d 100644 --- a/include/net/xdp.h +++ b/include/net/xdp.h @@ -343,7 +343,6 @@ struct sk_buff *__xdp_build_skb_from_frame(struct xdp_frame *xdpf, struct net_device *dev); struct sk_buff *xdp_build_skb_from_frame(struct xdp_frame *xdpf, struct net_device *dev); -int xdp_alloc_skb_bulk(void **skbs, int n_skb, gfp_t gfp); struct xdp_frame *xdpf_clone(struct xdp_frame *xdpf); static inline diff --git a/net/core/xdp.c b/net/core/xdp.c index 2c6ab6fb452f..f86eedad586a 100644 --- a/net/core/xdp.c +++ b/net/core/xdp.c @@ -618,16 +618,6 @@ void xdp_warn(const char *msg, const char *func, const int line) }; EXPORT_SYMBOL_GPL(xdp_warn); -int xdp_alloc_skb_bulk(void **skbs, int n_skb, gfp_t gfp) -{ - n_skb = kmem_cache_alloc_bulk(net_hotdata.skbuff_cache, gfp, n_skb, skbs); - if (unlikely(!n_skb)) - return -ENOMEM; - - return 0; -} -EXPORT_SYMBOL_GPL(xdp_alloc_skb_bulk); - /** * xdp_build_skb_from_buff - create an skb from &xdp_buff * @xdp: &xdp_buff to convert to an skb -- 2.51.0 From c64a0727f9b1cbc63a5538c8c0014e9a175ad864 Mon Sep 17 00:00:00 2001 From: Justin Iurman Date: Tue, 25 Feb 2025 18:51:38 +0100 Subject: [PATCH 04/16] net: ipv6: fix dst ref loop on input in seg6 lwt Prevent a dst ref loop on input in seg6_iptunnel. Fixes: af4a2209b134 ("ipv6: sr: use dst_cache in seg6_input") Cc: David Lebrun Cc: Ido Schimmel Reviewed-by: Ido Schimmel Signed-off-by: Justin Iurman Signed-off-by: Paolo Abeni --- net/ipv6/seg6_iptunnel.c | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/net/ipv6/seg6_iptunnel.c b/net/ipv6/seg6_iptunnel.c index 33833b2064c0..51583461ae29 100644 --- a/net/ipv6/seg6_iptunnel.c +++ b/net/ipv6/seg6_iptunnel.c @@ -472,10 +472,18 @@ static int seg6_input_core(struct net *net, struct sock *sk, { struct dst_entry *orig_dst = skb_dst(skb); struct dst_entry *dst = NULL; + struct lwtunnel_state *lwtst; struct seg6_lwt *slwt; int err; - slwt = seg6_lwt_lwtunnel(orig_dst->lwtstate); + /* We cannot dereference "orig_dst" once ip6_route_input() or + * skb_dst_drop() is called. 
However, in order to detect a dst loop, we + * need the address of its lwtstate. So, save the address of lwtstate + * now and use it later as a comparison. + */ + lwtst = orig_dst->lwtstate; + + slwt = seg6_lwt_lwtunnel(lwtst); local_bh_disable(); dst = dst_cache_get(&slwt->cache); @@ -490,7 +498,9 @@ static int seg6_input_core(struct net *net, struct sock *sk, if (!dst) { ip6_route_input(skb); dst = skb_dst(skb); - if (!dst->error) { + + /* cache only if we don't create a dst reference loop */ + if (!dst->error && lwtst != dst->lwtstate) { local_bh_disable(); dst_cache_set_ip6(&slwt->cache, dst, &ipv6_hdr(skb)->saddr); -- 2.51.0 From 13e55fbaec176119cff68a7e1693b251c8883c5f Mon Sep 17 00:00:00 2001 From: Justin Iurman Date: Tue, 25 Feb 2025 18:51:39 +0100 Subject: [PATCH 05/16] net: ipv6: fix dst ref loop on input in rpl lwt Prevent a dst ref loop on input in rpl_iptunnel. Fixes: a7a29f9c361f ("net: ipv6: add rpl sr tunnel") Cc: Alexander Aring Cc: Ido Schimmel Reviewed-by: Ido Schimmel Signed-off-by: Justin Iurman Signed-off-by: Paolo Abeni --- net/ipv6/rpl_iptunnel.c | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/net/ipv6/rpl_iptunnel.c b/net/ipv6/rpl_iptunnel.c index 0ac4283acdf2..7c05ac846646 100644 --- a/net/ipv6/rpl_iptunnel.c +++ b/net/ipv6/rpl_iptunnel.c @@ -262,10 +262,18 @@ static int rpl_input(struct sk_buff *skb) { struct dst_entry *orig_dst = skb_dst(skb); struct dst_entry *dst = NULL; + struct lwtunnel_state *lwtst; struct rpl_lwt *rlwt; int err; - rlwt = rpl_lwt_lwtunnel(orig_dst->lwtstate); + /* We cannot dereference "orig_dst" once ip6_route_input() or + * skb_dst_drop() is called. However, in order to detect a dst loop, we + * need the address of its lwtstate. So, save the address of lwtstate + * now and use it later as a comparison. + */ + lwtst = orig_dst->lwtstate; + + rlwt = rpl_lwt_lwtunnel(lwtst); local_bh_disable(); dst = dst_cache_get(&rlwt->cache); @@ -280,7 +288,9 @@ static int rpl_input(struct sk_buff *skb) if (!dst) { ip6_route_input(skb); dst = skb_dst(skb); - if (!dst->error) { + + /* cache only if we don't create a dst reference loop */ + if (!dst->error && lwtst != dst->lwtstate) { local_bh_disable(); dst_cache_set_ip6(&rlwt->cache, dst, &ipv6_hdr(skb)->saddr); -- 2.51.0 From 7fe0353606d77a32c4c7f2814833dd1c043ebdd2 Mon Sep 17 00:00:00 2001 From: Eric Woudstra Date: Tue, 25 Feb 2025 21:15:09 +0100 Subject: [PATCH 06/16] net: ethernet: mtk_ppe_offload: Allow QinQ, double ETH_P_8021Q only mtk_foe_entry_set_vlan() in mtk_ppe.c already supports double vlan tagging, but mtk_flow_offload_replace() in mtk_ppe_offload.c only allows for 1 vlan tag, optionally in combination with pppoe and dsa tags. However, mtk_foe_entry_set_vlan() only allows for setting the vlan id. The protocol cannot be set, it is always ETH_P_8021Q, for inner and outer tag. This patch adds QinQ support to mtk_flow_offload_replace(), only in the case that both inner and outer tags are ETH_P_8021Q. Only PPPoE-in-Q (as before) and Q-in-Q are allowed. A combination of PPPoE and Q-in-Q is not allowed. 
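As an illustration of the tag-combination rule the new checks enforce, here is a minimal user-space model (hypothetical helper names; the real checks live in the FLOW_ACTION_VLAN_PUSH and FLOW_ACTION_PPPOE_PUSH cases of mtk_flow_offload_replace(), and byte order is ignored in this sketch):

  #include <stdbool.h>
  #include <stdio.h>

  #define ETH_P_8021Q 0x8100

  struct tag_state {
          unsigned int vlan_num;   /* pushed VLAN tags, 0..2 */
          unsigned int pppoe_num;  /* pushed PPPoE headers, 0..1 */
  };

  /* At most two tags in total, and only 802.1Q may be pushed. */
  static bool can_push_vlan(const struct tag_state *s, unsigned int proto)
  {
          if (s->vlan_num + s->pppoe_num == 2)
                  return false;
          return proto == ETH_P_8021Q;
  }

  /* PPPoE is rejected once a second VLAN tag (Q-in-Q) is present. */
  static bool can_push_pppoe(const struct tag_state *s)
  {
          return s->pppoe_num == 0 && s->vlan_num < 2;
  }

  int main(void)
  {
          struct tag_state qinq = { .vlan_num = 2 };
          struct tag_state pppoe_in_q = { .vlan_num = 1, .pppoe_num = 1 };

          /* Both print 0: PPPoE on top of Q-in-Q and a second 802.1Q tag
           * on top of PPPoE-in-Q are the disallowed combinations. */
          printf("%d\n", can_push_pppoe(&qinq));
          printf("%d\n", can_push_vlan(&pppoe_in_q, ETH_P_8021Q));
          return 0;
  }
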
Signed-off-by: Eric Woudstra Link: https://patch.msgid.link/20250225201509.20843-1-ericwouds@gmail.com Signed-off-by: Paolo Abeni --- .../net/ethernet/mediatek/mtk_ppe_offload.c | 22 +++++++++---------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/drivers/net/ethernet/mediatek/mtk_ppe_offload.c b/drivers/net/ethernet/mediatek/mtk_ppe_offload.c index f20bb390df3a..c855fb799ce1 100644 --- a/drivers/net/ethernet/mediatek/mtk_ppe_offload.c +++ b/drivers/net/ethernet/mediatek/mtk_ppe_offload.c @@ -34,8 +34,10 @@ struct mtk_flow_data { u16 vlan_in; struct { - u16 id; - __be16 proto; + struct { + u16 id; + __be16 proto; + } vlans[2]; u8 num; } vlan; struct { @@ -349,18 +351,19 @@ mtk_flow_offload_replace(struct mtk_eth *eth, struct flow_cls_offload *f, case FLOW_ACTION_CSUM: break; case FLOW_ACTION_VLAN_PUSH: - if (data.vlan.num == 1 || + if (data.vlan.num + data.pppoe.num == 2 || act->vlan.proto != htons(ETH_P_8021Q)) return -EOPNOTSUPP; - data.vlan.id = act->vlan.vid; - data.vlan.proto = act->vlan.proto; + data.vlan.vlans[data.vlan.num].id = act->vlan.vid; + data.vlan.vlans[data.vlan.num].proto = act->vlan.proto; data.vlan.num++; break; case FLOW_ACTION_VLAN_POP: break; case FLOW_ACTION_PPPOE_PUSH: - if (data.pppoe.num == 1) + if (data.pppoe.num == 1 || + data.vlan.num == 2) return -EOPNOTSUPP; data.pppoe.sid = act->pppoe.sid; @@ -450,12 +453,9 @@ mtk_flow_offload_replace(struct mtk_eth *eth, struct flow_cls_offload *f, if (offload_type == MTK_PPE_PKT_TYPE_BRIDGE) foe.bridge.vlan = data.vlan_in; - if (data.vlan.num == 1) { - if (data.vlan.proto != htons(ETH_P_8021Q)) - return -EOPNOTSUPP; + for (i = 0; i < data.vlan.num; i++) + mtk_foe_entry_set_vlan(eth, &foe, data.vlan.vlans[i].id); - mtk_foe_entry_set_vlan(eth, &foe, data.vlan.id); - } if (data.pppoe.num == 1) mtk_foe_entry_set_pppoe(eth, &foe, data.pppoe.sid); -- 2.51.0 From 1cbddbddee68d17feb6467fc556c144777af91ef Mon Sep 17 00:00:00 2001 From: Joe Damato Date: Wed, 26 Feb 2025 18:19:57 +0000 Subject: [PATCH 07/16] selftests: drv-net: Check if combined-count exists Some drivers, like tg3, do not set combined-count: $ ethtool -l enp4s0f1 Channel parameters for enp4s0f1: Pre-set maximums: RX: 4 TX: 4 Other: n/a Combined: n/a Current hardware settings: RX: 4 TX: 1 Other: n/a Combined: n/a In the case where combined-count is not set, the ethtool netlink code in the kernel elides the value and the code in the test: netnl.channels_get(...) With a tg3 device, the returned dictionary looks like: {'header': {'dev-index': 3, 'dev-name': 'enp4s0f1'}, 'rx-max': 4, 'rx-count': 4, 'tx-max': 4, 'tx-count': 1} Note that the key 'combined-count' is missing. As a result of this missing key the test raises an exception: # Exception| if channels['combined-count'] == 0: # Exception| ~~~~~~~~^^^^^^^^^^^^^^^^^^ # Exception| KeyError: 'combined-count' Change the test to check if 'combined-count' is a key in the dictionary first and if not assume that this means the driver has separate RX and TX queues. With this change, the test now passes successfully on tg3 and mlx5 (which does have a 'combined-count'). 
Fixes: 1cf270424218 ("net: selftest: add test for netdev netlink queue-get API") Signed-off-by: Joe Damato Reviewed-by: David Wei Link: https://patch.msgid.link/20250226181957.212189-1-jdamato@fastly.com Signed-off-by: Jakub Kicinski --- tools/testing/selftests/drivers/net/queues.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/tools/testing/selftests/drivers/net/queues.py b/tools/testing/selftests/drivers/net/queues.py index 38303da957ee..8a518905a9f9 100755 --- a/tools/testing/selftests/drivers/net/queues.py +++ b/tools/testing/selftests/drivers/net/queues.py @@ -45,10 +45,9 @@ def addremove_queues(cfg, nl) -> None: netnl = EthtoolFamily() channels = netnl.channels_get({'header': {'dev-index': cfg.ifindex}}) - if channels['combined-count'] == 0: - rx_type = 'rx' - else: - rx_type = 'combined' + rx_type = 'rx' + if channels.get('combined-count', 0) > 0: + rx_type = 'combined' expected = curr_queues - 1 cmd(f"ethtool -L {cfg.dev['ifname']} {rx_type} {expected}", timeout=10) -- 2.51.0 From 674fcb4f4a7e3e277417a01788cc6daae47c3804 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 26 Feb 2025 22:12:52 +0000 Subject: [PATCH 08/16] idpf: fix checksums set in idpf_rx_rsc() idpf_rx_rsc() uses skb_transport_offset(skb) while the transport header is not set yet. This triggers the following warning for CONFIG_DEBUG_NET=y builds. DEBUG_NET_WARN_ON_ONCE(!skb_transport_header_was_set(skb)) [ 69.261620] WARNING: CPU: 7 PID: 0 at ./include/linux/skbuff.h:3020 idpf_vport_splitq_napi_poll (include/linux/skbuff.h:3020) idpf [ 69.261629] Modules linked in: vfat fat dummy bridge intel_uncore_frequency_tpmi intel_uncore_frequency_common intel_vsec_tpmi idpf intel_vsec cdc_ncm cdc_eem cdc_ether usbnet mii xhci_pci xhci_hcd ehci_pci ehci_hcd libeth [ 69.261644] CPU: 7 UID: 0 PID: 0 Comm: swapper/7 Tainted: G S W 6.14.0-smp-DEV #1697 [ 69.261648] Tainted: [S]=CPU_OUT_OF_SPEC, [W]=WARN [ 69.261650] RIP: 0010:idpf_vport_splitq_napi_poll (include/linux/skbuff.h:3020) idpf [ 69.261677] ? __warn (kernel/panic.c:242 kernel/panic.c:748) [ 69.261682] ? idpf_vport_splitq_napi_poll (include/linux/skbuff.h:3020) idpf [ 69.261687] ? report_bug (lib/bug.c:?) [ 69.261690] ? handle_bug (arch/x86/kernel/traps.c:285) [ 69.261694] ? exc_invalid_op (arch/x86/kernel/traps.c:309) [ 69.261697] ? asm_exc_invalid_op (arch/x86/include/asm/idtentry.h:621) [ 69.261700] ? __pfx_idpf_vport_splitq_napi_poll (drivers/net/ethernet/intel/idpf/idpf_txrx.c:4011) idpf [ 69.261704] ? idpf_vport_splitq_napi_poll (include/linux/skbuff.h:3020) idpf [ 69.261708] ? idpf_vport_splitq_napi_poll (drivers/net/ethernet/intel/idpf/idpf_txrx.c:3072) idpf [ 69.261712] __napi_poll (net/core/dev.c:7194) [ 69.261716] net_rx_action (net/core/dev.c:7265) [ 69.261718] ? __qdisc_run (net/sched/sch_generic.c:293) [ 69.261721] ? 
sched_clock (arch/x86/include/asm/preempt.h:84 arch/x86/kernel/tsc.c:288) [ 69.261726] handle_softirqs (kernel/softirq.c:561) Fixes: 3a8845af66edb ("idpf: add RX splitq napi poll support") Signed-off-by: Eric Dumazet Cc: Alan Brady Cc: Joshua Hay Cc: Willem de Bruijn Acked-by: Przemek Kitszel Link: https://patch.msgid.link/20250226221253.1927782-1-edumazet@google.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/intel/idpf/idpf_txrx.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/intel/idpf/idpf_txrx.c b/drivers/net/ethernet/intel/idpf/idpf_txrx.c index 9be6a6b59c4e..977741c41498 100644 --- a/drivers/net/ethernet/intel/idpf/idpf_txrx.c +++ b/drivers/net/ethernet/intel/idpf/idpf_txrx.c @@ -3013,7 +3013,6 @@ static int idpf_rx_rsc(struct idpf_rx_queue *rxq, struct sk_buff *skb, skb_shinfo(skb)->gso_size = rsc_seg_len; skb_reset_network_header(skb); - len = skb->len - skb_transport_offset(skb); if (ipv4) { struct iphdr *ipv4h = ip_hdr(skb); @@ -3022,6 +3021,7 @@ static int idpf_rx_rsc(struct idpf_rx_queue *rxq, struct sk_buff *skb, /* Reset and set transport header offset in skb */ skb_set_transport_header(skb, sizeof(struct iphdr)); + len = skb->len - skb_transport_offset(skb); /* Compute the TCP pseudo header checksum*/ tcp_hdr(skb)->check = @@ -3031,6 +3031,7 @@ static int idpf_rx_rsc(struct idpf_rx_queue *rxq, struct sk_buff *skb, skb_shinfo(skb)->gso_type = SKB_GSO_TCPV6; skb_set_transport_header(skb, sizeof(struct ipv6hdr)); + len = skb->len - skb_transport_offset(skb); tcp_hdr(skb)->check = ~tcp_v6_check(len, &ipv6h->saddr, &ipv6h->daddr, 0); } -- 2.51.0 From 54e1b4becf5e220be03db4e1be773c1310e8cbbd Mon Sep 17 00:00:00 2001 From: Meghana Malladi Date: Thu, 27 Feb 2025 14:54:41 +0530 Subject: [PATCH 09/16] net: ti: icss-iep: Reject perout generation request IEP driver supports both perout and pps signal generation but perout feature is faulty with half-cooked support due to some missing configuration. Remove perout support from the driver and reject perout requests with "not supported" error code. 
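From user space the new behaviour can be observed with a plain PTP_PEROUT_REQUEST ioctl, which is now expected to fail with "Operation not supported" on IEP-backed clocks (sketch only; /dev/ptp0 is an assumed path, substitute the clock exposed by the IEP):

  #include <errno.h>
  #include <fcntl.h>
  #include <stdio.h>
  #include <string.h>
  #include <sys/ioctl.h>
  #include <unistd.h>
  #include <linux/ptp_clock.h>

  int main(void)
  {
          struct ptp_perout_request req;
          int fd = open("/dev/ptp0", O_RDWR);

          if (fd < 0) {
                  perror("open");
                  return 1;
          }

          memset(&req, 0, sizeof(req));
          req.index = 0;
          req.period.sec = 1;     /* request a 1 Hz periodic output */

          /* The IEP driver now rejects this outright. */
          if (ioctl(fd, PTP_PEROUT_REQUEST, &req) < 0)
                  printf("perout rejected: %s\n", strerror(errno));

          close(fd);
          return 0;
  }
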
Fixes: c1e0230eeaab2 ("net: ti: icss-iep: Add IEP driver") Signed-off-by: Meghana Malladi Reviewed-by: Vadim Fedorenko Link: https://patch.msgid.link/20250227092441.1848419-1-m-malladi@ti.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/ti/icssg/icss_iep.c | 21 +-------------------- 1 file changed, 1 insertion(+), 20 deletions(-) diff --git a/drivers/net/ethernet/ti/icssg/icss_iep.c b/drivers/net/ethernet/ti/icssg/icss_iep.c index 768578c0d958..d59c1744840a 100644 --- a/drivers/net/ethernet/ti/icssg/icss_iep.c +++ b/drivers/net/ethernet/ti/icssg/icss_iep.c @@ -474,26 +474,7 @@ static int icss_iep_perout_enable_hw(struct icss_iep *iep, static int icss_iep_perout_enable(struct icss_iep *iep, struct ptp_perout_request *req, int on) { - int ret = 0; - - mutex_lock(&iep->ptp_clk_mutex); - - if (iep->pps_enabled) { - ret = -EBUSY; - goto exit; - } - - if (iep->perout_enabled == !!on) - goto exit; - - ret = icss_iep_perout_enable_hw(iep, req, on); - if (!ret) - iep->perout_enabled = !!on; - -exit: - mutex_unlock(&iep->ptp_clk_mutex); - - return ret; + return -EOPNOTSUPP; } static void icss_iep_cap_cmp_work(struct work_struct *work) -- 2.51.0 From f8131f4cc5bda6154d81ee5bafe3db7e2e72a89c Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Tue, 25 Feb 2025 21:09:23 +0100 Subject: [PATCH 10/16] net: qed: make 'qed_ll2_ops_pass' as __maybe_unused gcc warns about unused const variables even in header files when building with W=1: In file included from include/linux/qed/qed_rdma_if.h:14, from drivers/net/ethernet/qlogic/qed/qed_rdma.h:16, from drivers/net/ethernet/qlogic/qed/qed_cxt.c:23: include/linux/qed/qed_ll2_if.h:270:33: error: 'qed_ll2_ops_pass' defined but not used [-Werror=unused-const-variable=] 270 | static const struct qed_ll2_ops qed_ll2_ops_pass = { This one is intentional, so mark it as __maybe_unused to it can be included from a file that doesn't use this variable. Signed-off-by: Arnd Bergmann Reviewed-by: Simon Horman Tested-by: Simon Horman # build-tested Link: https://patch.msgid.link/20250225200926.4057723-1-arnd@kernel.org Signed-off-by: Jakub Kicinski --- include/linux/qed/qed_ll2_if.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/qed/qed_ll2_if.h b/include/linux/qed/qed_ll2_if.h index 5b67cd03276e..aa29ac53b833 100644 --- a/include/linux/qed/qed_ll2_if.h +++ b/include/linux/qed/qed_ll2_if.h @@ -267,7 +267,7 @@ struct qed_ll2_ops { int qed_ll2_alloc_if(struct qed_dev *); void qed_ll2_dealloc_if(struct qed_dev *); #else -static const struct qed_ll2_ops qed_ll2_ops_pass = { +static __maybe_unused const struct qed_ll2_ops qed_ll2_ops_pass = { .start = NULL, .stop = NULL, .start_xmit = NULL, -- 2.51.0 From 047e059cf21210f633bb538799107af10650a9c7 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Tue, 25 Feb 2025 22:29:27 +0100 Subject: [PATCH 11/16] netkit: Remove double invocation to clear ipvs property flag With ipvs_reset() now done unconditionally in skb_scrub_packet() we would then call the former twice netkit_prep_forward(). Thus remove the now unnecessary explicit call. 
Signed-off-by: Daniel Borkmann Acked-by: Nikolay Aleksandrov Link: https://patch.msgid.link/20250225212927.69271-1-daniel@iogearbox.net Signed-off-by: Jakub Kicinski --- drivers/net/netkit.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/net/netkit.c b/drivers/net/netkit.c index 751347392570..d072a7968f56 100644 --- a/drivers/net/netkit.c +++ b/drivers/net/netkit.c @@ -65,7 +65,6 @@ static void netkit_prep_forward(struct sk_buff *skb, skb_reset_mac_header(skb); if (!xnet) return; - ipvs_reset(skb); skb_clear_tstamp(skb); if (xnet_scrub) netkit_xnet(skb); -- 2.51.0 From bf08fd32cc55961c720f6f48a0fe317f0c710f09 Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Wed, 26 Feb 2025 13:47:32 +1030 Subject: [PATCH 12/16] net/mlx5e: Avoid a hundred -Wflex-array-member-not-at-end warnings -Wflex-array-member-not-at-end was introduced in GCC-14, and we are getting ready to enable it, globally. So, in this particular case, we create a new `struct mlx5e_umr_wqe_hdr` to enclose the header part of flexible structure `struct mlx5e_umr_wqe`. This is, all the members except the flexible arrays `inline_mtts`, `inline_klms` and `inline_ksms` in the anonymous union. We then replace the header part with `struct mlx5e_umr_wqe_hdr hdr;` in `struct mlx5e_umr_wqe`, and change the type of the object currently causing trouble `umr_wqe` from `struct mlx5e_umr_wqe` to `struct mlx5e_umr_wqe_hdr` --this last bit gets rid of the flex-array-in-the-middle part and avoid the warnings. Also, no new members should be added to `struct mlx5e_umr_wqe`, instead any new members must be included in the header structure `struct mlx5e_umr_wqe_hdr`. To enforce this, we use `static_assert()`, ensuring that the memory layout of both the flexible structure and the newly created header struct remain consistent. The next step is to refactor the rest of the related code accordingly, which means adding a bunch of `hdr.` wherever needed. Lastly, we use `container_of()` whenever we need to retrieve a pointer to the flexible structure `struct mlx5e_umr_wqe`. So, with these changes, fix 125 of the following warnings: drivers/net/ethernet/mellanox/mlx5/core/en.h:664:48: warning: structure containing a flexible array member is not at the end of another structure [-Wflex-array-member-not-at-end] Signed-off-by: Gustavo A. R. 
Silva Reviewed-by: Saeed Mahameed Link: https://patch.msgid.link/Z76HzPW1dFTLOSSy@kspp Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/mellanox/mlx5/core/en.h | 10 +++++++-- .../ethernet/mellanox/mlx5/core/en/xsk/rx.c | 6 ++--- .../net/ethernet/mellanox/mlx5/core/en_main.c | 8 ++++--- .../net/ethernet/mellanox/mlx5/core/en_rx.c | 22 +++++++++---------- 4 files changed, 27 insertions(+), 19 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en.h b/drivers/net/ethernet/mellanox/mlx5/core/en.h index 769e683f2488..8df185e2ef7f 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h @@ -230,16 +230,22 @@ struct mlx5e_rx_wqe_cyc { DECLARE_FLEX_ARRAY(struct mlx5_wqe_data_seg, data); }; -struct mlx5e_umr_wqe { +struct mlx5e_umr_wqe_hdr { struct mlx5_wqe_ctrl_seg ctrl; struct mlx5_wqe_umr_ctrl_seg uctrl; struct mlx5_mkey_seg mkc; +}; + +struct mlx5e_umr_wqe { + struct mlx5e_umr_wqe_hdr hdr; union { DECLARE_FLEX_ARRAY(struct mlx5_mtt, inline_mtts); DECLARE_FLEX_ARRAY(struct mlx5_klm, inline_klms); DECLARE_FLEX_ARRAY(struct mlx5_ksm, inline_ksms); }; }; +static_assert(offsetof(struct mlx5e_umr_wqe, inline_mtts) == sizeof(struct mlx5e_umr_wqe_hdr), + "struct members should be included in struct mlx5e_umr_wqe_hdr, not in struct mlx5e_umr_wqe"); enum mlx5e_priv_flag { MLX5E_PFLAG_RX_CQE_BASED_MODER, @@ -657,7 +663,7 @@ struct mlx5e_rq { } wqe; struct { struct mlx5_wq_ll wq; - struct mlx5e_umr_wqe umr_wqe; + struct mlx5e_umr_wqe_hdr umr_wqe; struct mlx5e_mpw_info *info; mlx5e_fp_skb_from_cqe_mpwrq skb_from_cqe_mpwrq; __be32 umr_mkey_be; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/rx.c b/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/rx.c index 1b7132fa70de..2b05536d564a 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/rx.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/rx.c @@ -123,7 +123,7 @@ int mlx5e_xsk_alloc_rx_mpwqe(struct mlx5e_rq *rq, u16 ix) bitmap_zero(wi->skip_release_bitmap, rq->mpwqe.pages_per_wqe); wi->consumed_strides = 0; - umr_wqe->ctrl.opmod_idx_opcode = + umr_wqe->hdr.ctrl.opmod_idx_opcode = cpu_to_be32((icosq->pc << MLX5_WQE_CTRL_WQE_INDEX_SHIFT) | MLX5_OPCODE_UMR); /* Optimized for speed: keep in sync with mlx5e_mpwrq_umr_entry_size. 
*/ @@ -134,7 +134,7 @@ int mlx5e_xsk_alloc_rx_mpwqe(struct mlx5e_rq *rq, u16 ix) offset = offset * sizeof(struct mlx5_klm) * 2 / MLX5_OCTWORD; else if (unlikely(rq->mpwqe.umr_mode == MLX5E_MPWRQ_UMR_MODE_TRIPLE)) offset = offset * sizeof(struct mlx5_ksm) * 4 / MLX5_OCTWORD; - umr_wqe->uctrl.xlt_offset = cpu_to_be16(offset); + umr_wqe->hdr.uctrl.xlt_offset = cpu_to_be16(offset); icosq->db.wqe_info[pi] = (struct mlx5e_icosq_wqe_info) { .wqe_type = MLX5E_ICOSQ_WQE_UMR_RX, @@ -144,7 +144,7 @@ int mlx5e_xsk_alloc_rx_mpwqe(struct mlx5e_rq *rq, u16 ix) icosq->pc += rq->mpwqe.umr_wqebbs; - icosq->doorbell_cseg = &umr_wqe->ctrl; + icosq->doorbell_cseg = &umr_wqe->hdr.ctrl; return 0; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c index 5d5e7b19c396..7e3aad015111 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c @@ -311,8 +311,8 @@ static inline void mlx5e_build_umr_wqe(struct mlx5e_rq *rq, struct mlx5e_icosq *sq, struct mlx5e_umr_wqe *wqe) { - struct mlx5_wqe_ctrl_seg *cseg = &wqe->ctrl; - struct mlx5_wqe_umr_ctrl_seg *ucseg = &wqe->uctrl; + struct mlx5_wqe_ctrl_seg *cseg = &wqe->hdr.ctrl; + struct mlx5_wqe_umr_ctrl_seg *ucseg = &wqe->hdr.uctrl; u16 octowords; u8 ds_cnt; @@ -393,7 +393,9 @@ static int mlx5e_rq_alloc_mpwqe_info(struct mlx5e_rq *rq, int node) bitmap_fill(wi->skip_release_bitmap, rq->mpwqe.pages_per_wqe); } - mlx5e_build_umr_wqe(rq, rq->icosq, &rq->mpwqe.umr_wqe); + mlx5e_build_umr_wqe(rq, rq->icosq, + container_of(&rq->mpwqe.umr_wqe, + struct mlx5e_umr_wqe, hdr)); return 0; } diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c index 1963bc5adb18..5fd70b4d55be 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c @@ -631,16 +631,16 @@ static void build_ksm_umr(struct mlx5e_icosq *sq, struct mlx5e_umr_wqe *umr_wqe, __be32 key, u16 offset, u16 ksm_len) { memset(umr_wqe, 0, offsetof(struct mlx5e_umr_wqe, inline_ksms)); - umr_wqe->ctrl.opmod_idx_opcode = + umr_wqe->hdr.ctrl.opmod_idx_opcode = cpu_to_be32((sq->pc << MLX5_WQE_CTRL_WQE_INDEX_SHIFT) | MLX5_OPCODE_UMR); - umr_wqe->ctrl.umr_mkey = key; - umr_wqe->ctrl.qpn_ds = cpu_to_be32((sq->sqn << MLX5_WQE_CTRL_QPN_SHIFT) + umr_wqe->hdr.ctrl.umr_mkey = key; + umr_wqe->hdr.ctrl.qpn_ds = cpu_to_be32((sq->sqn << MLX5_WQE_CTRL_QPN_SHIFT) | MLX5E_KSM_UMR_DS_CNT(ksm_len)); - umr_wqe->uctrl.flags = MLX5_UMR_TRANSLATION_OFFSET_EN | MLX5_UMR_INLINE; - umr_wqe->uctrl.xlt_offset = cpu_to_be16(offset); - umr_wqe->uctrl.xlt_octowords = cpu_to_be16(ksm_len); - umr_wqe->uctrl.mkey_mask = cpu_to_be64(MLX5_MKEY_MASK_FREE); + umr_wqe->hdr.uctrl.flags = MLX5_UMR_TRANSLATION_OFFSET_EN | MLX5_UMR_INLINE; + umr_wqe->hdr.uctrl.xlt_offset = cpu_to_be16(offset); + umr_wqe->hdr.uctrl.xlt_octowords = cpu_to_be16(ksm_len); + umr_wqe->hdr.uctrl.mkey_mask = cpu_to_be64(MLX5_MKEY_MASK_FREE); } static struct mlx5e_frag_page *mlx5e_shampo_hd_to_frag_page(struct mlx5e_rq *rq, int header_index) @@ -704,7 +704,7 @@ static int mlx5e_build_shampo_hd_umr(struct mlx5e_rq *rq, shampo->pi = (shampo->pi + ksm_entries) & (shampo->hd_per_wq - 1); sq->pc += wqe_bbs; - sq->doorbell_cseg = &umr_wqe->ctrl; + sq->doorbell_cseg = &umr_wqe->hdr.ctrl; return 0; @@ -814,12 +814,12 @@ static int mlx5e_alloc_rx_mpwqe(struct mlx5e_rq *rq, u16 ix) bitmap_zero(wi->skip_release_bitmap, rq->mpwqe.pages_per_wqe); wi->consumed_strides = 0; - 
umr_wqe->ctrl.opmod_idx_opcode = + umr_wqe->hdr.ctrl.opmod_idx_opcode = cpu_to_be32((sq->pc << MLX5_WQE_CTRL_WQE_INDEX_SHIFT) | MLX5_OPCODE_UMR); offset = (ix * rq->mpwqe.mtts_per_wqe) * sizeof(struct mlx5_mtt) / MLX5_OCTWORD; - umr_wqe->uctrl.xlt_offset = cpu_to_be16(offset); + umr_wqe->hdr.uctrl.xlt_offset = cpu_to_be16(offset); sq->db.wqe_info[pi] = (struct mlx5e_icosq_wqe_info) { .wqe_type = MLX5E_ICOSQ_WQE_UMR_RX, @@ -829,7 +829,7 @@ static int mlx5e_alloc_rx_mpwqe(struct mlx5e_rq *rq, u16 ix) sq->pc += rq->mpwqe.umr_wqebbs; - sq->doorbell_cseg = &umr_wqe->ctrl; + sq->doorbell_cseg = &umr_wqe->hdr.ctrl; return 0; -- 2.51.0 From e1f95b1992b8cc31e37505787806918b0447909b Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Wed, 26 Feb 2025 19:20:29 +0100 Subject: [PATCH 13/16] geneve: Allow users to specify source port range Recently, in case of Cilium, we run into users on Azure who require to use tunneling for east/west traffic due to hitting IPAM API limits for Kubernetes Pods if they would have gone with publicly routable IPs for Pods. In case of tunneling, Cilium supports the option of vxlan or geneve. In order to RSS spread flows among remote CPUs both derive a source port hash via udp_flow_src_port() which takes the inner packet's skb->hash into account. For clusters with many nodes, this can then hit a new limitation [0]: Today, the Azure networking stack supports 1M total flows (500k inbound and 500k outbound) for a VM. [...] Once this limit is hit, other connections are dropped. [...] Each flow is distinguished by a 5-tuple (protocol, local IP address, remote IP address, local port, and remote port) information. [...] For vxlan and geneve, this can create a massive amount of UDP flows which then run into the limits if stale flows are not evicted fast enough. One option to mitigate this for vxlan is to narrow the source port range via IFLA_VXLAN_PORT_RANGE while still being able to benefit from RSS. However, geneve currently does not have this option and it spreads traffic across the full source port range of [1, USHRT_MAX]. To overcome this limitation also for geneve, add an equivalent IFLA_GENEVE_PORT_RANGE setting for users. Note that struct geneve_config before/after still remains at 2 cachelines on x86-64. The low/high members of struct ifla_geneve_port_range (which is uapi exposed) are of type __be16. While they would be perfectly fine to be of __u16 type, the consensus was that it would be good to be consistent with the existing struct ifla_vxlan_port_range from a uapi consumer PoV. 
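To make the effect of the new range concrete, below is a simplified user-space model of the source port selection (the in-kernel udp_flow_src_port() additionally obfuscates the hash, scales it instead of using a modulo, and falls back to the local port range when no range is given; the helper name here is made up):

  #include <stdint.h>
  #include <stdio.h>

  /* Map a flow hash into the configured [low, high] source port range. */
  static uint16_t pick_src_port(uint32_t flow_hash, uint16_t low, uint16_t high)
  {
          uint32_t span = (uint32_t)(high - low) + 1;

          return (uint16_t)(low + flow_hash % span);
  }

  int main(void)
  {
          /* e.g. a configured range of 4000..5000 as in the rt_link
           * example of the next patch; different flow hashes still land
           * on different ports, so remote RSS keeps spreading flows. */
          printf("%u\n", pick_src_port(0xdeadbeef, 4000, 5000));
          printf("%u\n", pick_src_port(0x12345678, 4000, 5000));
          return 0;
  }

The point of the knob is that the number of distinct 5-tuples a node can generate is bounded by the size of the configured range instead of the full [1, USHRT_MAX] span, which keeps the flow count under control on hosts with per-VM flow limits while preserving hash-based spreading.
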
Signed-off-by: Daniel Borkmann Link: https://learn.microsoft.com/en-us/azure/virtual-network/virtual-machine-network-throughput [0] Link: https://patch.msgid.link/20250226182030.89440-1-daniel@iogearbox.net Signed-off-by: Jakub Kicinski --- drivers/net/geneve.c | 52 +++++++++++++++++++++++++++++++++--- include/uapi/linux/if_link.h | 6 +++++ 2 files changed, 54 insertions(+), 4 deletions(-) diff --git a/drivers/net/geneve.c b/drivers/net/geneve.c index fc62b25e0362..2c65f867fd31 100644 --- a/drivers/net/geneve.c +++ b/drivers/net/geneve.c @@ -57,6 +57,8 @@ struct geneve_config { bool ttl_inherit; enum ifla_geneve_df df; bool inner_proto_inherit; + u16 port_min; + u16 port_max; }; /* Pseudo network device */ @@ -835,7 +837,9 @@ static int geneve_xmit_skb(struct sk_buff *skb, struct net_device *dev, use_cache = ip_tunnel_dst_cache_usable(skb, info); tos = geneve_get_dsfield(skb, dev, info, &use_cache); - sport = udp_flow_src_port(geneve->net, skb, 1, USHRT_MAX, true); + sport = udp_flow_src_port(geneve->net, skb, + geneve->cfg.port_min, + geneve->cfg.port_max, true); rt = udp_tunnel_dst_lookup(skb, dev, geneve->net, 0, &saddr, &info->key, @@ -945,7 +949,9 @@ static int geneve6_xmit_skb(struct sk_buff *skb, struct net_device *dev, use_cache = ip_tunnel_dst_cache_usable(skb, info); prio = geneve_get_dsfield(skb, dev, info, &use_cache); - sport = udp_flow_src_port(geneve->net, skb, 1, USHRT_MAX, true); + sport = udp_flow_src_port(geneve->net, skb, + geneve->cfg.port_min, + geneve->cfg.port_max, true); dst = udp_tunnel6_dst_lookup(skb, dev, geneve->net, gs6->sock, 0, &saddr, key, sport, @@ -1084,7 +1090,8 @@ static int geneve_fill_metadata_dst(struct net_device *dev, struct sk_buff *skb) use_cache = ip_tunnel_dst_cache_usable(skb, info); tos = geneve_get_dsfield(skb, dev, info, &use_cache); sport = udp_flow_src_port(geneve->net, skb, - 1, USHRT_MAX, true); + geneve->cfg.port_min, + geneve->cfg.port_max, true); rt = udp_tunnel_dst_lookup(skb, dev, geneve->net, 0, &saddr, &info->key, @@ -1110,7 +1117,8 @@ static int geneve_fill_metadata_dst(struct net_device *dev, struct sk_buff *skb) use_cache = ip_tunnel_dst_cache_usable(skb, info); prio = geneve_get_dsfield(skb, dev, info, &use_cache); sport = udp_flow_src_port(geneve->net, skb, - 1, USHRT_MAX, true); + geneve->cfg.port_min, + geneve->cfg.port_max, true); dst = udp_tunnel6_dst_lookup(skb, dev, geneve->net, gs6->sock, 0, &saddr, &info->key, sport, @@ -1234,6 +1242,7 @@ static const struct nla_policy geneve_policy[IFLA_GENEVE_MAX + 1] = { [IFLA_GENEVE_TTL_INHERIT] = { .type = NLA_U8 }, [IFLA_GENEVE_DF] = { .type = NLA_U8 }, [IFLA_GENEVE_INNER_PROTO_INHERIT] = { .type = NLA_FLAG }, + [IFLA_GENEVE_PORT_RANGE] = NLA_POLICY_EXACT_LEN(sizeof(struct ifla_geneve_port_range)), }; static int geneve_validate(struct nlattr *tb[], struct nlattr *data[], @@ -1279,6 +1288,17 @@ static int geneve_validate(struct nlattr *tb[], struct nlattr *data[], } } + if (data[IFLA_GENEVE_PORT_RANGE]) { + const struct ifla_geneve_port_range *p; + + p = nla_data(data[IFLA_GENEVE_PORT_RANGE]); + if (ntohs(p->high) < ntohs(p->low)) { + NL_SET_ERR_MSG_ATTR(extack, data[IFLA_GENEVE_PORT_RANGE], + "Invalid source port range"); + return -EINVAL; + } + } + return 0; } @@ -1506,6 +1526,18 @@ static int geneve_nl2info(struct nlattr *tb[], struct nlattr *data[], info->key.tp_dst = nla_get_be16(data[IFLA_GENEVE_PORT]); } + if (data[IFLA_GENEVE_PORT_RANGE]) { + const struct ifla_geneve_port_range *p; + + if (changelink) { + attrtype = IFLA_GENEVE_PORT_RANGE; + goto change_notsup; + } + p 
= nla_data(data[IFLA_GENEVE_PORT_RANGE]); + cfg->port_min = ntohs(p->low); + cfg->port_max = ntohs(p->high); + } + if (data[IFLA_GENEVE_COLLECT_METADATA]) { if (changelink) { attrtype = IFLA_GENEVE_COLLECT_METADATA; @@ -1626,6 +1658,8 @@ static int geneve_newlink(struct net_device *dev, .use_udp6_rx_checksums = false, .ttl_inherit = false, .collect_md = false, + .port_min = 1, + .port_max = USHRT_MAX, }; int err; @@ -1744,6 +1778,7 @@ static size_t geneve_get_size(const struct net_device *dev) nla_total_size(sizeof(__u8)) + /* IFLA_GENEVE_UDP_ZERO_CSUM6_RX */ nla_total_size(sizeof(__u8)) + /* IFLA_GENEVE_TTL_INHERIT */ nla_total_size(0) + /* IFLA_GENEVE_INNER_PROTO_INHERIT */ + nla_total_size(sizeof(struct ifla_geneve_port_range)) + /* IFLA_GENEVE_PORT_RANGE */ 0; } @@ -1753,6 +1788,10 @@ static int geneve_fill_info(struct sk_buff *skb, const struct net_device *dev) struct ip_tunnel_info *info = &geneve->cfg.info; bool ttl_inherit = geneve->cfg.ttl_inherit; bool metadata = geneve->cfg.collect_md; + struct ifla_geneve_port_range ports = { + .low = htons(geneve->cfg.port_min), + .high = htons(geneve->cfg.port_max), + }; __u8 tmp_vni[3]; __u32 vni; @@ -1809,6 +1848,9 @@ static int geneve_fill_info(struct sk_buff *skb, const struct net_device *dev) nla_put_flag(skb, IFLA_GENEVE_INNER_PROTO_INHERIT)) goto nla_put_failure; + if (nla_put(skb, IFLA_GENEVE_PORT_RANGE, sizeof(ports), &ports)) + goto nla_put_failure; + return 0; nla_put_failure: @@ -1841,6 +1883,8 @@ struct net_device *geneve_dev_create_fb(struct net *net, const char *name, .use_udp6_rx_checksums = true, .ttl_inherit = false, .collect_md = true, + .port_min = 1, + .port_max = USHRT_MAX, }; memset(tb, 0, sizeof(tb)); diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h index bfe880fbbb24..3b586fb0bc4c 100644 --- a/include/uapi/linux/if_link.h +++ b/include/uapi/linux/if_link.h @@ -1438,6 +1438,7 @@ enum { IFLA_GENEVE_TTL_INHERIT, IFLA_GENEVE_DF, IFLA_GENEVE_INNER_PROTO_INHERIT, + IFLA_GENEVE_PORT_RANGE, __IFLA_GENEVE_MAX }; #define IFLA_GENEVE_MAX (__IFLA_GENEVE_MAX - 1) @@ -1450,6 +1451,11 @@ enum ifla_geneve_df { GENEVE_DF_MAX = __GENEVE_DF_END - 1, }; +struct ifla_geneve_port_range { + __be16 low; + __be16 high; +}; + /* Bareudp section */ enum { IFLA_BAREUDP_UNSPEC, -- 2.51.0 From 5a41a00cd5d5bac4c7f081f2333df2256eb74d90 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Wed, 26 Feb 2025 19:20:30 +0100 Subject: [PATCH 14/16] geneve, specs: Add port range to rt_link specification Add the port range to rt_link, example: # tools/net/ynl/pyynl/cli.py --spec Documentation/netlink/specs/rt_link.yaml \ --do getlink --json '{"ifname": "geneve1"}' --output-json | jq { "ifname": "geneve1", [...] "linkinfo": { "kind": "geneve", "data": { "id": 1000, "remote": "147.28.227.100", "udp-csum": 0, "ttl": 0, "tos": 0, "label": 0, "df": 0, "port": 49431, "udp-zero-csum6-rx": 1, "ttl-inherit": 0, "port-range": { "low": 4000, "high": 5000 } } }, [...] 
} Signed-off-by: Daniel Borkmann Link: https://patch.msgid.link/20250226182030.89440-2-daniel@iogearbox.net Signed-off-by: Jakub Kicinski --- Documentation/netlink/specs/rt_link.yaml | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/Documentation/netlink/specs/rt_link.yaml b/Documentation/netlink/specs/rt_link.yaml index 0d492500c7e5..8b5c0f067328 100644 --- a/Documentation/netlink/specs/rt_link.yaml +++ b/Documentation/netlink/specs/rt_link.yaml @@ -770,6 +770,18 @@ definitions: - name: to type: u32 + - + name: ifla-geneve-port-range + type: struct + members: + - + name: low + type: u16 + byte-order: big-endian + - + name: high + type: u16 + byte-order: big-endian - name: ifla-vf-mac type: struct @@ -1915,6 +1927,10 @@ attribute-sets: - name: inner-proto-inherit type: flag + - + name: port-range + type: binary + struct: ifla-geneve-port-range - name: linkinfo-iptun-attrs name-prefix: ifla-iptun- -- 2.51.0 From 38d41cf575f755fb8db3c4879006aa383ca4341e Mon Sep 17 00:00:00 2001 From: Antoine Tenart Date: Wed, 26 Feb 2025 18:46:43 +0100 Subject: [PATCH 15/16] net-sysfs: remove unused initial ret values In some net-sysfs functions the ret value is initialized but never used as it is always overridden. Remove those. Signed-off-by: Antoine Tenart Reviewed-by: Mateusz Polchlopek Link: https://patch.msgid.link/20250226174644.311136-1-atenart@kernel.org Signed-off-by: Jakub Kicinski --- net/core/net-sysfs.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c index f61c1d829811..8d9dc048a548 100644 --- a/net/core/net-sysfs.c +++ b/net/core/net-sysfs.c @@ -568,7 +568,7 @@ static ssize_t ifalias_store(struct device *dev, struct device_attribute *attr, struct net_device *netdev = to_net_dev(dev); struct net *net = dev_net(netdev); size_t count = len; - ssize_t ret = 0; + ssize_t ret; if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) return -EPERM; @@ -597,7 +597,7 @@ static ssize_t ifalias_show(struct device *dev, { const struct net_device *netdev = to_net_dev(dev); char tmp[IFALIASZ]; - ssize_t ret = 0; + ssize_t ret; ret = dev_get_alias(netdev, tmp, sizeof(tmp)); if (ret > 0) @@ -638,7 +638,7 @@ static ssize_t phys_port_id_show(struct device *dev, { struct net_device *netdev = to_net_dev(dev); struct netdev_phys_item_id ppid; - ssize_t ret = -EINVAL; + ssize_t ret; /* The check is also done in dev_get_phys_port_id; this helps returning * early without hitting the locking section below. @@ -664,8 +664,8 @@ static ssize_t phys_port_name_show(struct device *dev, struct device_attribute *attr, char *buf) { struct net_device *netdev = to_net_dev(dev); - ssize_t ret = -EINVAL; char name[IFNAMSIZ]; + ssize_t ret; /* The checks are also done in dev_get_phys_port_name; this helps * returning early without hitting the locking section below. @@ -693,7 +693,7 @@ static ssize_t phys_switch_id_show(struct device *dev, { struct net_device *netdev = to_net_dev(dev); struct netdev_phys_item_id ppid = { }; - ssize_t ret = -EINVAL; + ssize_t ret; /* The checks are also done in dev_get_phys_port_name; this helps * returning early without hitting the locking section below. This works -- 2.51.0 From 5ace19bd8395e8a98ff0bca0fd20ae3fac3e1d6f Mon Sep 17 00:00:00 2001 From: Gal Pressman Date: Wed, 26 Feb 2025 11:39:00 +0200 Subject: [PATCH 16/16] coccinelle: Add missing (GE)NL_SET_ERR_MSG_* to strings ending with newline test Add missing (GE)NL_SET_ERR_MSG_*() variants to the list of macros checked for strings ending with a newline. 
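For reference, the pattern the rule flags is an extack message ending in a newline, e.g. NL_SET_ERR_MSG_ATTR(extack, attr, "Invalid value\n") rather than "Invalid value". A tiny user-space stand-in of the condition (the real check is static, done by coccinelle; nl_set_err_msg() below is made up):

  #include <stdio.h>
  #include <string.h>

  /* Warn, like the coccinelle rule, when a message ends in '\n'. */
  static void nl_set_err_msg(const char *msg)
  {
          size_t len = strlen(msg);

          if (len && msg[len - 1] == '\n')
                  printf("flagged: message ends in a newline\n");
          else
                  printf("ok: \"%s\"\n", msg);
  }

  int main(void)
  {
          nl_set_err_msg("Invalid source port range");    /* fine */
          nl_set_err_msg("Invalid source port range\n");  /* would be caught */
          return 0;
  }
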
Reviewed-by: Tariq Toukan Signed-off-by: Gal Pressman Reviewed-by: Michal Swiatkowski Link: https://patch.msgid.link/20250226093904.6632-2-gal@nvidia.com Signed-off-by: Jakub Kicinski --- scripts/coccinelle/misc/newline_in_nl_msg.cocci | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/scripts/coccinelle/misc/newline_in_nl_msg.cocci b/scripts/coccinelle/misc/newline_in_nl_msg.cocci index 9baffe55d917..2814f6b205b9 100644 --- a/scripts/coccinelle/misc/newline_in_nl_msg.cocci +++ b/scripts/coccinelle/misc/newline_in_nl_msg.cocci @@ -1,7 +1,6 @@ // SPDX-License-Identifier: GPL-2.0-only /// -/// Catch strings ending in newline with GENL_SET_ERR_MSG, NL_SET_ERR_MSG, -/// NL_SET_ERR_MSG_MOD. +/// Catch strings ending in newline with (GE)NL_SET_ERR_MSG*. /// // Confidence: Very High // Copyright: (C) 2020 Intel Corporation @@ -17,7 +16,11 @@ expression e; constant m; position p; @@ - \(GENL_SET_ERR_MSG\|NL_SET_ERR_MSG\|NL_SET_ERR_MSG_MOD\)(e,m@p) + \(GENL_SET_ERR_MSG\|GENL_SET_ERR_MSG_FMT\|NL_SET_ERR_MSG\|NL_SET_ERR_MSG_MOD\| + NL_SET_ERR_MSG_FMT\|NL_SET_ERR_MSG_FMT_MOD\|NL_SET_ERR_MSG_WEAK\| + NL_SET_ERR_MSG_WEAK_MOD\|NL_SET_ERR_MSG_ATTR_POL\| + NL_SET_ERR_MSG_ATTR_POL_FMT\|NL_SET_ERR_MSG_ATTR\| + NL_SET_ERR_MSG_ATTR_FMT\)(e,m@p,...) @script:python@ m << r.m; @@ -32,7 +35,7 @@ expression r.e; constant r.m; position r.p; @@ - fname(e,m@p) + fname(e,m@p,...) //---------------------------------------------------------- // For context mode @@ -43,7 +46,7 @@ identifier r1.fname; expression r.e; constant r.m; @@ -* fname(e,m) +* fname(e,m,...) //---------------------------------------------------------- // For org mode -- 2.51.0