From 8f02c48f8f623eedc3c0a26a64c7ef155c35bfb9 Mon Sep 17 00:00:00 2001 From: Breno Leitao Date: Mon, 17 Feb 2025 07:48:13 -0800 Subject: [PATCH 01/16] net: Remove redundant variable declaration in __dev_change_flags() The old_flags variable is declared twice in __dev_change_flags(), causing a shadow variable warning. This patch fixes the issue by removing the redundant declaration, reusing the existing old_flags variable instead. net/core/dev.c:9225:16: warning: declaration shadows a local variable [-Wshadow] 9225 | unsigned int old_flags = dev->flags; | ^ net/core/dev.c:9185:15: note: previous declaration is here 9185 | unsigned int old_flags = dev->flags; | ^ 1 warning generated. Remove the redundant inner declaration and reuse the existing old_flags variable since its value is not needed outside the if block, and it is safe to reuse the variable. This eliminates the warning while maintaining the same functionality. Signed-off-by: Breno Leitao Reviewed-by: Mateusz Polchlopek Reviewed-by: Kalesh AP Reviewed-by: Nicolas Dichtel Reviewed-by: Andrew Lunn Link: https://patch.msgid.link/20250217-old_flags-v2-1-4cda3b43a35f@debian.org Signed-off-by: Jakub Kicinski --- net/core/dev.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/core/dev.c b/net/core/dev.c index bcb266ab2912..ebc000b56828 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -9182,7 +9182,7 @@ int __dev_change_flags(struct net_device *dev, unsigned int flags, if ((flags ^ dev->gflags) & IFF_PROMISC) { int inc = (flags & IFF_PROMISC) ? 
1 : -1; - unsigned int old_flags = dev->flags; + old_flags = dev->flags; dev->gflags ^= IFF_PROMISC; -- 2.51.0 From 3a03f9ec5d333b9998fbc63fd3e075b9d1991b89 Mon Sep 17 00:00:00 2001 From: Yu-Chun Lin Date: Mon, 17 Feb 2025 23:58:33 +0800 Subject: [PATCH 02/16] net: stmmac: Use str_enabled_disabled() helper As kernel test robot reported, the following warning occurs: cocci warnings: (new ones prefixed by >>) >> drivers/net/ethernet/stmicro/stmmac/dwmac1000_core.c:582:6-8: opportunity for str_enabled_disabled(on) Replace ternary (condition ? "enabled" : "disabled") with str_enabled_disabled() from string_choices.h to improve readability, maintain uniform string usage, and reduce binary size through linker deduplication. Reviewed-by: Huacai Chen Reviewed-by: Russell King (Oracle) Signed-off-by: Yu-Chun Lin Link: https://patch.msgid.link/20250217155833.3105775-1-eleanor15x@gmail.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/stmicro/stmmac/dwmac1000_core.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac1000_core.c b/drivers/net/ethernet/stmicro/stmmac/dwmac1000_core.c index 7900bf3effa7..a8b901cdf5cb 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac1000_core.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac1000_core.c @@ -16,6 +16,7 @@ #include #include #include +#include #include "stmmac.h" #include "stmmac_pcs.h" #include "stmmac_ptp.h" @@ -625,7 +626,7 @@ int dwmac1000_ptp_enable(struct ptp_clock_info *ptp, } netdev_dbg(priv->dev, "Auxiliary Snapshot %s.\n", - on ? "enabled" : "disabled"); + str_enabled_disabled(on)); writel(tcr_val, ptpaddr + PTP_TCR); /* wait for auxts fifo clear to finish */ -- 2.51.0 From aaf6532d119d8ad4c75420b021d2649864133583 Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Fri, 14 Feb 2025 17:26:58 -0500 Subject: [PATCH 03/16] tcp: only initialize sockcm tsflags field TCP only reads the tsflags field. Don't bother initializing others. 
Signed-off-by: Willem de Bruijn Reviewed-by: David Ahern Link: https://patch.msgid.link/20250214222720.3205500-2-willemdebruijn.kernel@gmail.com Signed-off-by: Jakub Kicinski --- net/ipv4/tcp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 5d78ab3b416e..6a8f19a10911 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -1127,7 +1127,7 @@ int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size) /* 'common' sending to sendq */ } - sockcm_init(&sockc, sk); + sockc = (struct sockcm_cookie) { .tsflags = READ_ONCE(sk->sk_tsflags)}; if (msg->msg_controllen) { err = sock_cmsg_send(sk, msg, &sockc); if (unlikely(err)) { -- 2.51.0 From 6ad861519a69ecf3cf032c579e18569f62b81263 Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Fri, 14 Feb 2025 17:26:59 -0500 Subject: [PATCH 04/16] net: initialize mark in sockcm_init Avoid open coding initialization of sockcm fields. Avoid reading the sk_priority field twice. This ensures all callers, existing and future, will correctly try a cmsg passed mark before sk_mark. This patch extends support for cmsg mark to: packet_spkt and packet_tpacket and net/can/raw.c. This patch extends support for cmsg priority to: packet_spkt and packet_tpacket. 
Signed-off-by: Willem de Bruijn Reviewed-by: David Ahern Link: https://patch.msgid.link/20250214222720.3205500-3-willemdebruijn.kernel@gmail.com Signed-off-by: Jakub Kicinski --- include/net/sock.h | 1 + net/can/raw.c | 2 +- net/packet/af_packet.c | 9 ++++----- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/include/net/sock.h b/include/net/sock.h index 60ebf3c7b229..fac65ed30983 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -1829,6 +1829,7 @@ static inline void sockcm_init(struct sockcm_cookie *sockc, const struct sock *sk) { *sockc = (struct sockcm_cookie) { + .mark = READ_ONCE(sk->sk_mark), .tsflags = READ_ONCE(sk->sk_tsflags), .priority = READ_ONCE(sk->sk_priority), }; diff --git a/net/can/raw.c b/net/can/raw.c index 46e8ed9d64da..9b1d5f036f57 100644 --- a/net/can/raw.c +++ b/net/can/raw.c @@ -963,7 +963,7 @@ static int raw_sendmsg(struct socket *sock, struct msghdr *msg, size_t size) skb->dev = dev; skb->priority = sockc.priority; - skb->mark = READ_ONCE(sk->sk_mark); + skb->mark = sockc.mark; skb->tstamp = sockc.transmit_time; skb_setup_tx_timestamp(skb, &sockc); diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index c131e5ceea37..3e9ddf72cd03 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -2102,8 +2102,8 @@ retry: skb->protocol = proto; skb->dev = dev; - skb->priority = READ_ONCE(sk->sk_priority); - skb->mark = READ_ONCE(sk->sk_mark); + skb->priority = sockc.priority; + skb->mark = sockc.mark; skb_set_delivery_type_by_clockid(skb, sockc.transmit_time, sk->sk_clockid); skb_setup_tx_timestamp(skb, &sockc); @@ -2634,8 +2634,8 @@ static int tpacket_fill_skb(struct packet_sock *po, struct sk_buff *skb, skb->protocol = proto; skb->dev = dev; - skb->priority = READ_ONCE(po->sk.sk_priority); - skb->mark = READ_ONCE(po->sk.sk_mark); + skb->priority = sockc->priority; + skb->mark = sockc->mark; skb_set_delivery_type_by_clockid(skb, sockc->transmit_time, po->sk.sk_clockid); skb_setup_tx_timestamp(skb, sockc); 
skb_zcopy_set_nouarg(skb, ph.raw); @@ -3039,7 +3039,6 @@ static int packet_snd(struct socket *sock, struct msghdr *msg, size_t len) goto out_unlock; sockcm_init(&sockc, sk); - sockc.mark = READ_ONCE(sk->sk_mark); if (msg->msg_controllen) { err = sock_cmsg_send(sk, msg, &sockc); if (unlikely(err)) -- 2.51.0 From 94788792f37902f1f4d417f6f9663831cf7e91fc Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Fri, 14 Feb 2025 17:27:00 -0500 Subject: [PATCH 05/16] ipv4: initialize inet socket cookies with sockcm_init Avoid open coding the same logic. Signed-off-by: Willem de Bruijn Reviewed-by: David Ahern Link: https://patch.msgid.link/20250214222720.3205500-4-willemdebruijn.kernel@gmail.com Signed-off-by: Jakub Kicinski --- include/net/ip.h | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/include/net/ip.h b/include/net/ip.h index 305eccdf4ff7..3c4ef5ddad83 100644 --- a/include/net/ip.h +++ b/include/net/ip.h @@ -94,9 +94,8 @@ static inline void ipcm_init_sk(struct ipcm_cookie *ipcm, { ipcm_init(ipcm); - ipcm->sockc.mark = READ_ONCE(inet->sk.sk_mark); - ipcm->sockc.priority = READ_ONCE(inet->sk.sk_priority); - ipcm->sockc.tsflags = READ_ONCE(inet->sk.sk_tsflags); + sockcm_init(&ipcm->sockc, &inet->sk); + ipcm->oif = READ_ONCE(inet->sk.sk_bound_dev_if); ipcm->addr = inet->inet_saddr; ipcm->protocol = inet->inet_num; -- 2.51.0 From 9329b58395e51bba9c847419cc4ba176df3dd2b7 Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Fri, 14 Feb 2025 17:27:01 -0500 Subject: [PATCH 06/16] ipv4: remove get_rttos Initialize the ip cookie tos field when initializing the cookie, in ipcm_init_sk. The existing code inverts the standard pattern for initializing cookie fields. Default is to initialize the field from the sk, then possibly overwrite that when parsing cmsgs (the unlikely case). This field inverts that, setting the field to an illegal value and after cmsg parsing checking whether the value is still illegal and thus should be overridden. 
Be careful to always apply mask INET_DSCP_MASK, as before. Signed-off-by: Willem de Bruijn Reviewed-by: David Ahern Link: https://patch.msgid.link/20250214222720.3205500-5-willemdebruijn.kernel@gmail.com Signed-off-by: Jakub Kicinski --- include/net/ip.h | 11 +++-------- net/ipv4/ping.c | 6 +++--- net/ipv4/raw.c | 6 +++--- net/ipv4/udp.c | 6 +++--- 4 files changed, 12 insertions(+), 17 deletions(-) diff --git a/include/net/ip.h b/include/net/ip.h index 3c4ef5ddad83..ce5e59957dd5 100644 --- a/include/net/ip.h +++ b/include/net/ip.h @@ -92,7 +92,9 @@ static inline void ipcm_init(struct ipcm_cookie *ipcm) static inline void ipcm_init_sk(struct ipcm_cookie *ipcm, const struct inet_sock *inet) { - ipcm_init(ipcm); + *ipcm = (struct ipcm_cookie) { + .tos = READ_ONCE(inet->tos), + }; sockcm_init(&ipcm->sockc, &inet->sk); @@ -256,13 +258,6 @@ static inline u8 ip_sendmsg_scope(const struct inet_sock *inet, return RT_SCOPE_UNIVERSE; } -static inline __u8 get_rttos(struct ipcm_cookie* ipc, struct inet_sock *inet) -{ - u8 dsfield = ipc->tos != -1 ? 
ipc->tos : READ_ONCE(inet->tos); - - return dsfield & INET_DSCP_MASK; -} - /* datagram.c */ int __ip4_datagram_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len); int ip4_datagram_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len); diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c index 619ddc087957..85d09f2ecadc 100644 --- a/net/ipv4/ping.c +++ b/net/ipv4/ping.c @@ -705,7 +705,7 @@ static int ping_v4_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) struct ip_options_data opt_copy; int free = 0; __be32 saddr, daddr, faddr; - u8 tos, scope; + u8 scope; int err; pr_debug("ping_v4_sendmsg(sk=%p,sk->num=%u)\n", inet, inet->inet_num); @@ -768,7 +768,6 @@ static int ping_v4_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) } faddr = ipc.opt->opt.faddr; } - tos = get_rttos(&ipc, inet); scope = ip_sendmsg_scope(inet, &ipc, msg); if (ipv4_is_multicast(daddr)) { @@ -779,7 +778,8 @@ static int ping_v4_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) } else if (!ipc.oif) ipc.oif = READ_ONCE(inet->uc_index); - flowi4_init_output(&fl4, ipc.oif, ipc.sockc.mark, tos, scope, + flowi4_init_output(&fl4, ipc.oif, ipc.sockc.mark, + ipc.tos & INET_DSCP_MASK, scope, sk->sk_protocol, inet_sk_flowi_flags(sk), faddr, saddr, 0, 0, sk->sk_uid); diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c index 4304a68d1db0..6aace4d55733 100644 --- a/net/ipv4/raw.c +++ b/net/ipv4/raw.c @@ -486,7 +486,7 @@ static int raw_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) struct ipcm_cookie ipc; struct rtable *rt = NULL; struct flowi4 fl4; - u8 tos, scope; + u8 scope; int free = 0; __be32 daddr; __be32 saddr; @@ -581,7 +581,6 @@ static int raw_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) daddr = ipc.opt->opt.faddr; } } - tos = get_rttos(&ipc, inet); scope = ip_sendmsg_scope(inet, &ipc, msg); uc_index = READ_ONCE(inet->uc_index); @@ -606,7 +605,8 @@ static int raw_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) } } - flowi4_init_output(&fl4, 
ipc.oif, ipc.sockc.mark, tos, scope, + flowi4_init_output(&fl4, ipc.oif, ipc.sockc.mark, + ipc.tos & INET_DSCP_MASK, scope, hdrincl ? ipc.protocol : sk->sk_protocol, inet_sk_flowi_flags(sk) | (hdrincl ? FLOWI_FLAG_KNOWN_NH : 0), diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 3485989cd4bd..17c7736d8349 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -1280,7 +1280,7 @@ int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) int free = 0; int connected = 0; __be32 daddr, faddr, saddr; - u8 tos, scope; + u8 scope; __be16 dport; int err, is_udplite = IS_UDPLITE(sk); int corkreq = udp_test_bit(CORK, sk) || msg->msg_flags & MSG_MORE; @@ -1404,7 +1404,6 @@ int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) faddr = ipc.opt->opt.faddr; connected = 0; } - tos = get_rttos(&ipc, inet); scope = ip_sendmsg_scope(inet, &ipc, msg); if (scope == RT_SCOPE_LINK) connected = 0; @@ -1441,7 +1440,8 @@ int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) fl4 = &fl4_stack; - flowi4_init_output(fl4, ipc.oif, ipc.sockc.mark, tos, scope, + flowi4_init_output(fl4, ipc.oif, ipc.sockc.mark, + ipc.tos & INET_DSCP_MASK, scope, sk->sk_protocol, flow_flags, faddr, saddr, dport, inet->inet_sport, sk->sk_uid); -- 2.51.0 From e8485911050a60091d1bf51a162f0a2654729fad Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Fri, 14 Feb 2025 17:27:02 -0500 Subject: [PATCH 07/16] icmp: reflect tos through ip cookie rather than updating inet_sk Do not modify socket fields if it can be avoided. The current code predates the introduction of ip cookies in commit aa6615814533 ("ipv4: processing ancillary IP_TOS or IP_TTL"). Now that cookies exist and support tos, update that field directly. 
Signed-off-by: Willem de Bruijn Reviewed-by: David Ahern Link: https://patch.msgid.link/20250214222720.3205500-6-willemdebruijn.kernel@gmail.com Signed-off-by: Jakub Kicinski --- net/ipv4/icmp.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c index 5482edb5aade..799775ba97d4 100644 --- a/net/ipv4/icmp.c +++ b/net/ipv4/icmp.c @@ -405,7 +405,6 @@ static void icmp_reply(struct icmp_bxm *icmp_param, struct sk_buff *skb) struct ipcm_cookie ipc; struct flowi4 fl4; struct sock *sk; - struct inet_sock *inet; __be32 daddr, saddr; u32 mark = IP4_REPLY_MARK(net, skb->mark); int type = icmp_param->data.icmph.type; @@ -424,12 +423,11 @@ static void icmp_reply(struct icmp_bxm *icmp_param, struct sk_buff *skb) sk = icmp_xmit_lock(net); if (!sk) goto out_bh_enable; - inet = inet_sk(sk); icmp_param->data.icmph.checksum = 0; ipcm_init(&ipc); - inet->tos = ip_hdr(skb)->tos; + ipc.tos = ip_hdr(skb)->tos; ipc.sockc.mark = mark; daddr = ipc.addr = ip_hdr(skb)->saddr; saddr = fib_compute_spec_dst(skb); @@ -737,8 +735,8 @@ void __icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info, icmp_param.data.icmph.checksum = 0; icmp_param.skb = skb_in; icmp_param.offset = skb_network_offset(skb_in); - inet_sk(sk)->tos = tos; ipcm_init(&ipc); + ipc.tos = tos; ipc.addr = iph->saddr; ipc.opt = &icmp_param.replyopts.opt; ipc.sockc.mark = mark; -- 2.51.0 From 096208592b09c2f5fc0c1a174694efa41c04209d Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Fri, 14 Feb 2025 17:27:03 -0500 Subject: [PATCH 08/16] ipv6: replace ipcm6_init calls with ipcm6_init_sk This initializes tclass and dontfrag before cmsg parsing, removing the need for explicit checks against -1 in each caller. Leave hlimit set to -1, because its full initialization (in ip6_sk_dst_hoplimit) requires more state (dst, flowi6, ..). This also prepares for calling sockcm_init in a follow-on patch. 
Signed-off-by: Willem de Bruijn Reviewed-by: David Ahern Link: https://patch.msgid.link/20250214222720.3205500-7-willemdebruijn.kernel@gmail.com Signed-off-by: Jakub Kicinski --- include/net/ipv6.h | 9 --------- net/ipv6/raw.c | 8 +------- net/ipv6/udp.c | 7 +------ net/l2tp/l2tp_ip6.c | 8 +------- 4 files changed, 3 insertions(+), 29 deletions(-) diff --git a/include/net/ipv6.h b/include/net/ipv6.h index f5c43ad1565e..46a679d9b334 100644 --- a/include/net/ipv6.h +++ b/include/net/ipv6.h @@ -363,15 +363,6 @@ struct ipcm6_cookie { struct ipv6_txoptions *opt; }; -static inline void ipcm6_init(struct ipcm6_cookie *ipc6) -{ - *ipc6 = (struct ipcm6_cookie) { - .hlimit = -1, - .tclass = -1, - .dontfrag = -1, - }; -} - static inline void ipcm6_init_sk(struct ipcm6_cookie *ipc6, const struct sock *sk) { diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c index a45aba090aa4..ae68d3f7dd32 100644 --- a/net/ipv6/raw.c +++ b/net/ipv6/raw.c @@ -777,7 +777,7 @@ static int rawv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) fl6.flowi6_mark = READ_ONCE(sk->sk_mark); fl6.flowi6_uid = sk->sk_uid; - ipcm6_init(&ipc6); + ipcm6_init_sk(&ipc6, sk); ipc6.sockc.tsflags = READ_ONCE(sk->sk_tsflags); ipc6.sockc.mark = fl6.flowi6_mark; ipc6.sockc.priority = READ_ONCE(sk->sk_priority); @@ -891,9 +891,6 @@ static int rawv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) if (hdrincl) fl6.flowi6_flags |= FLOWI_FLAG_KNOWN_NH; - if (ipc6.tclass < 0) - ipc6.tclass = np->tclass; - fl6.flowlabel = ip6_make_flowinfo(ipc6.tclass, fl6.flowlabel); dst = ip6_dst_lookup_flow(sock_net(sk), sk, &fl6, final_p); @@ -904,9 +901,6 @@ static int rawv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) if (ipc6.hlimit < 0) ipc6.hlimit = ip6_sk_dst_hoplimit(np, &fl6, dst); - if (ipc6.dontfrag < 0) - ipc6.dontfrag = inet6_test_bit(DONTFRAG, sk); - if (msg->msg_flags&MSG_CONFIRM) goto do_confirm; diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index c6ea438b5c75..7096b7e84c10 100644 --- a/net/ipv6/udp.c 
+++ b/net/ipv6/udp.c @@ -1494,7 +1494,7 @@ int udpv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) int is_udplite = IS_UDPLITE(sk); int (*getfrag)(void *, char *, int, int, int, struct sk_buff *); - ipcm6_init(&ipc6); + ipcm6_init_sk(&ipc6, sk); ipc6.gso_size = READ_ONCE(up->gso_size); ipc6.sockc.tsflags = READ_ONCE(sk->sk_tsflags); ipc6.sockc.mark = READ_ONCE(sk->sk_mark); @@ -1704,9 +1704,6 @@ do_udp_sendmsg: security_sk_classify_flow(sk, flowi6_to_flowi_common(fl6)); - if (ipc6.tclass < 0) - ipc6.tclass = np->tclass; - fl6->flowlabel = ip6_make_flowinfo(ipc6.tclass, fl6->flowlabel); dst = ip6_sk_dst_lookup_flow(sk, fl6, final_p, connected); @@ -1752,8 +1749,6 @@ back_from_confirm: WRITE_ONCE(up->pending, AF_INET6); do_append_data: - if (ipc6.dontfrag < 0) - ipc6.dontfrag = inet6_test_bit(DONTFRAG, sk); up->len += ulen; err = ip6_append_data(sk, getfrag, msg, ulen, sizeof(struct udphdr), &ipc6, fl6, dst_rt6_info(dst), diff --git a/net/l2tp/l2tp_ip6.c b/net/l2tp/l2tp_ip6.c index f4c1da070826..b98d13584c81 100644 --- a/net/l2tp/l2tp_ip6.c +++ b/net/l2tp/l2tp_ip6.c @@ -547,7 +547,7 @@ static int l2tp_ip6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) fl6.flowi6_mark = READ_ONCE(sk->sk_mark); fl6.flowi6_uid = sk->sk_uid; - ipcm6_init(&ipc6); + ipcm6_init_sk(&ipc6, sk); if (lsa) { if (addr_len < SIN6_LEN_RFC2133) @@ -634,9 +634,6 @@ static int l2tp_ip6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) security_sk_classify_flow(sk, flowi6_to_flowi_common(&fl6)); - if (ipc6.tclass < 0) - ipc6.tclass = np->tclass; - fl6.flowlabel = ip6_make_flowinfo(ipc6.tclass, fl6.flowlabel); dst = ip6_dst_lookup_flow(sock_net(sk), sk, &fl6, final_p); @@ -648,9 +645,6 @@ static int l2tp_ip6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) if (ipc6.hlimit < 0) ipc6.hlimit = ip6_sk_dst_hoplimit(np, &fl6, dst); - if (ipc6.dontfrag < 0) - ipc6.dontfrag = inet6_test_bit(DONTFRAG, sk); - if (msg->msg_flags & MSG_CONFIRM) goto do_confirm; -- 2.51.0 From 
5cd2f78886dd86de1b13d6502808a149f1b77959 Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Fri, 14 Feb 2025 17:27:04 -0500 Subject: [PATCH 09/16] ipv6: initialize inet socket cookies with sockcm_init Avoid open coding the same logic. Signed-off-by: Willem de Bruijn Reviewed-by: David Ahern Link: https://patch.msgid.link/20250214222720.3205500-8-willemdebruijn.kernel@gmail.com Signed-off-by: Jakub Kicinski --- include/net/ipv6.h | 2 ++ net/ipv6/ping.c | 3 --- net/ipv6/raw.c | 9 +++------ net/ipv6/udp.c | 3 --- 4 files changed, 5 insertions(+), 12 deletions(-) diff --git a/include/net/ipv6.h b/include/net/ipv6.h index 46a679d9b334..9614006f483c 100644 --- a/include/net/ipv6.h +++ b/include/net/ipv6.h @@ -371,6 +371,8 @@ static inline void ipcm6_init_sk(struct ipcm6_cookie *ipc6, .tclass = inet6_sk(sk)->tclass, .dontfrag = inet6_test_bit(DONTFRAG, sk), }; + + sockcm_init(&ipc6->sockc, sk); } static inline struct ipv6_txoptions *txopt_get(const struct ipv6_pinfo *np) diff --git a/net/ipv6/ping.c b/net/ipv6/ping.c index 46b8adf6e7f8..84d90dd8b3f0 100644 --- a/net/ipv6/ping.c +++ b/net/ipv6/ping.c @@ -119,9 +119,6 @@ static int ping_v6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) return -EINVAL; ipcm6_init_sk(&ipc6, sk); - ipc6.sockc.priority = READ_ONCE(sk->sk_priority); - ipc6.sockc.tsflags = READ_ONCE(sk->sk_tsflags); - ipc6.sockc.mark = READ_ONCE(sk->sk_mark); fl6.flowi6_oif = oif; diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c index ae68d3f7dd32..fda640ebd53f 100644 --- a/net/ipv6/raw.c +++ b/net/ipv6/raw.c @@ -769,19 +769,16 @@ static int rawv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) hdrincl = inet_test_bit(HDRINCL, sk); + ipcm6_init_sk(&ipc6, sk); + /* * Get and verify the address. 
*/ memset(&fl6, 0, sizeof(fl6)); - fl6.flowi6_mark = READ_ONCE(sk->sk_mark); + fl6.flowi6_mark = ipc6.sockc.mark; fl6.flowi6_uid = sk->sk_uid; - ipcm6_init_sk(&ipc6, sk); - ipc6.sockc.tsflags = READ_ONCE(sk->sk_tsflags); - ipc6.sockc.mark = fl6.flowi6_mark; - ipc6.sockc.priority = READ_ONCE(sk->sk_priority); - if (sin6) { if (addr_len < SIN6_LEN_RFC2133) return -EINVAL; diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index 7096b7e84c10..3a0d6c5a8286 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -1496,9 +1496,6 @@ int udpv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) ipcm6_init_sk(&ipc6, sk); ipc6.gso_size = READ_ONCE(up->gso_size); - ipc6.sockc.tsflags = READ_ONCE(sk->sk_tsflags); - ipc6.sockc.mark = READ_ONCE(sk->sk_mark); - ipc6.sockc.priority = READ_ONCE(sk->sk_priority); /* destination address check */ if (sin6) { -- 2.51.0 From 27315836f4bcc8e4879d50dfc1fa6eb41e7952ef Mon Sep 17 00:00:00 2001 From: Shradha Gupta Date: Sun, 16 Feb 2025 19:42:26 -0800 Subject: [PATCH 10/16] net: mana: Allow tso_max_size to go up-to GSO_MAX_SIZE Allow the max aggregated pkt size to go up-to GSO_MAX_SIZE for MANA NIC. This patch only increases the max allowable gso/gro pkt size for MANA devices and does not change the defaults. 
Following are the perf benefits by increasing the pkt aggregate size from legacy gso_max_size value(64K) to newer one(up-to 511K). IPv4 tests for i in {1..10}; do netperf -t TCP_RR -H 10.0.0.5 -p50000 -- -r80000,80000 -O MIN_LATENCY,P90_LATENCY,P99_LATENCY,THROUGHPUT|tail -1; done min p90 p99 Throughput gso_max_size 93 171 194 6594.25 97 154 180 7183.74 95 165 189 6927.86 96 165 188 6976.04 93 154 185 7338.05 64K 93 168 189 6938.03 94 169 189 6784.93 92 166 189 7117.56 94 179 191 6678.44 95 157 183 7277.81 min p90 p99 Throughput 93 134 146 8448.75 95 134 140 8396.54 94 137 148 8204.12 94 137 148 8244.41 94 128 139 8666.52 80K 94 141 153 8116.86 94 138 149 8163.92 92 135 142 8362.72 92 134 142 8497.57 93 136 148 8393.23 IPv6 Tests for i in {1..10}; do netperf -t TCP_RR -H fd00:9013:cadd::4 -p50000 -- -r80000,80000 -O MIN_LATENCY,P90_LATENCY,P99_LATENCY,THROUGHPUT|tail -1; done min p90 p99 Throughput gso_max_size 108 165 170 6673.2 101 169 189 6451.69 101 165 169 6737.65 102 167 175 6614.64 101 178 189 6247.13 64K 107 163 169 6678.63 106 176 187 6350.86 100 164 169 6617.36 102 163 170 6849.21 102 168 175 6605.7 min p90 p99 Throughput 108 155 166 7183 110 154 163 7268.87 109 152 159 7434.35 107 145 157 7569.15 107 149 164 7496.17 80K 110 154 159 7245.85 108 156 162 7266.24 109 145 158 7526.66 106 145 151 7785.75 111 148 157 7246.65 Tested on azure env with Accelerated Networking enabled and disabled. Signed-off-by: Shradha Gupta Reviewed-by: Haiyang Zhang Signed-off-by: David S. 
Miller --- drivers/net/ethernet/microsoft/mana/mana_en.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/drivers/net/ethernet/microsoft/mana/mana_en.c b/drivers/net/ethernet/microsoft/mana/mana_en.c index aa1e47233fe5..3b0fb4d95cf7 100644 --- a/drivers/net/ethernet/microsoft/mana/mana_en.c +++ b/drivers/net/ethernet/microsoft/mana/mana_en.c @@ -256,6 +256,9 @@ netdev_tx_t mana_start_xmit(struct sk_buff *skb, struct net_device *ndev) if (skb_cow_head(skb, MANA_HEADROOM)) goto tx_drop_count; + if (unlikely(ipv6_hopopt_jumbo_remove(skb))) + goto tx_drop_count; + txq = &apc->tx_qp[txq_idx].txq; gdma_sq = txq->gdma_sq; cq = &apc->tx_qp[txq_idx].tx_cq; @@ -2873,6 +2876,8 @@ static int mana_probe_port(struct mana_context *ac, int port_idx, ndev->dev_port = port_idx; SET_NETDEV_DEV(ndev, gc->dev); + netif_set_tso_max_size(ndev, GSO_MAX_SIZE); + netif_carrier_off(ndev); netdev_rss_key_fill(apc->hashkey, MANA_HASH_KEY_SIZE); -- 2.51.0 From 685920920e3d5f68a8c50107b97747b0f8ce050f Mon Sep 17 00:00:00 2001 From: Shradha Gupta Date: Sun, 16 Feb 2025 19:42:42 -0800 Subject: [PATCH 11/16] hv_netvsc: Use VF's tso_max_size value when data path is VF On Azure, increasing VF's gso/gro packet size to up-to GSO_MAX_SIZE is not possible without allowing the same for netvsc NIC (as the NICs are bonded together). For bonded NICs, the min of the max aggregated pkt size of the members is propagated in the stack. Therefore, we use netif_set_tso_max_size() to set max aggregated pkt size to VF's packet size for netvsc too, when the data path is switched over to the VF Tested on azure env with Accelerated Networking enabled and disabled. Signed-off-by: Shradha Gupta Reviewed-by: Haiyang Zhang Signed-off-by: David S. 
Miller --- drivers/net/hyperv/hyperv_net.h | 2 ++ drivers/net/hyperv/netvsc_drv.c | 15 +++++++++++++++ drivers/net/hyperv/rndis_filter.c | 13 +++++++------ 3 files changed, 24 insertions(+), 6 deletions(-) diff --git a/drivers/net/hyperv/hyperv_net.h b/drivers/net/hyperv/hyperv_net.h index 234db693cefa..70f7cb383228 100644 --- a/drivers/net/hyperv/hyperv_net.h +++ b/drivers/net/hyperv/hyperv_net.h @@ -1166,6 +1166,8 @@ struct netvsc_device { u32 max_chn; u32 num_chn; + u32 netvsc_gso_max_size; + atomic_t open_chn; struct work_struct subchan_work; wait_queue_head_t subchan_open; diff --git a/drivers/net/hyperv/netvsc_drv.c b/drivers/net/hyperv/netvsc_drv.c index d6c4abfc3a28..9c6501bf27bd 100644 --- a/drivers/net/hyperv/netvsc_drv.c +++ b/drivers/net/hyperv/netvsc_drv.c @@ -2461,6 +2461,21 @@ static int netvsc_vf_changed(struct net_device *vf_netdev, unsigned long event) } else { netdev_info(ndev, "Data path switched %s VF: %s\n", vf_is_up ? "to" : "from", vf_netdev->name); + + /* In Azure, when accelerated networking in enabled, other NICs + * like MANA, MLX, are configured as a bonded nic with + * Netvsc(failover) NIC. For bonded NICs, the min of the max + * pkt aggregate size of the members is propagated in the stack. + * In order to allow these NICs (MANA/MLX) to use up to + * GSO_MAX_SIZE gso packet size, we need to allow Netvsc NIC to + * also support this in the guest. 
+ * This value is only increased for netvsc NIC when datapath is + * switched over to the VF + */ + if (vf_is_up) + netif_set_tso_max_size(ndev, vf_netdev->tso_max_size); + else + netif_set_tso_max_size(ndev, netvsc_dev->netvsc_gso_max_size); } return NOTIFY_OK; diff --git a/drivers/net/hyperv/rndis_filter.c b/drivers/net/hyperv/rndis_filter.c index c0ceeef4fcd8..82747dfacd70 100644 --- a/drivers/net/hyperv/rndis_filter.c +++ b/drivers/net/hyperv/rndis_filter.c @@ -1356,9 +1356,10 @@ static int rndis_netdev_set_hwcaps(struct rndis_device *rndis_device, struct net_device_context *net_device_ctx = netdev_priv(net); struct ndis_offload hwcaps; struct ndis_offload_params offloads; - unsigned int gso_max_size = GSO_LEGACY_MAX_SIZE; int ret; + nvdev->netvsc_gso_max_size = GSO_LEGACY_MAX_SIZE; + /* Find HW offload capabilities */ ret = rndis_query_hwcaps(rndis_device, nvdev, &hwcaps); if (ret != 0) @@ -1390,8 +1391,8 @@ static int rndis_netdev_set_hwcaps(struct rndis_device *rndis_device, offloads.lso_v2_ipv4 = NDIS_OFFLOAD_PARAMETERS_LSOV2_ENABLED; net->hw_features |= NETIF_F_TSO; - if (hwcaps.lsov2.ip4_maxsz < gso_max_size) - gso_max_size = hwcaps.lsov2.ip4_maxsz; + if (hwcaps.lsov2.ip4_maxsz < nvdev->netvsc_gso_max_size) + nvdev->netvsc_gso_max_size = hwcaps.lsov2.ip4_maxsz; } if (hwcaps.csum.ip4_txcsum & NDIS_TXCSUM_CAP_UDP4) { @@ -1411,8 +1412,8 @@ static int rndis_netdev_set_hwcaps(struct rndis_device *rndis_device, offloads.lso_v2_ipv6 = NDIS_OFFLOAD_PARAMETERS_LSOV2_ENABLED; net->hw_features |= NETIF_F_TSO6; - if (hwcaps.lsov2.ip6_maxsz < gso_max_size) - gso_max_size = hwcaps.lsov2.ip6_maxsz; + if (hwcaps.lsov2.ip6_maxsz < nvdev->netvsc_gso_max_size) + nvdev->netvsc_gso_max_size = hwcaps.lsov2.ip6_maxsz; } if (hwcaps.csum.ip6_txcsum & NDIS_TXCSUM_CAP_UDP6) { @@ -1438,7 +1439,7 @@ static int rndis_netdev_set_hwcaps(struct rndis_device *rndis_device, */ net->features &= ~NETVSC_SUPPORTED_HW_FEATURES | net->hw_features; - netif_set_tso_max_size(net, gso_max_size); + 
netif_set_tso_max_size(net, nvdev->netvsc_gso_max_size); ret = rndis_filter_set_offload_params(net, nvdev, &offloads); -- 2.51.0 From 9a369ae3d1431a83589dde57323a04692dd7fc12 Mon Sep 17 00:00:00 2001 From: Roger Quadros Date: Mon, 17 Feb 2025 09:31:46 +0200 Subject: [PATCH 12/16] net: ethernet: ti: am65-cpsw: remove am65_cpsw_nuss_tx_compl_packets_2g() The only difference between am65_cpsw_nuss_tx_compl_packets_2g() and am65_cpsw_nuss_tx_compl_packets() is the usage of spin_lock() and netdev_tx_completed_queue() + am65_cpsw_nuss_tx_wake at every packet in the latter. Instead of having 2 separate functions for TX completion, merge them into one. This will reduce code duplication and make maintenance easier. Signed-off-by: Roger Quadros Reviewed-by: Simon Horman Signed-off-by: David S. Miller --- drivers/net/ethernet/ti/am65-cpsw-nuss.c | 98 ++++++------------------ 1 file changed, 25 insertions(+), 73 deletions(-) diff --git a/drivers/net/ethernet/ti/am65-cpsw-nuss.c b/drivers/net/ethernet/ti/am65-cpsw-nuss.c index 2806238629f8..0ccb8dbcbba4 100644 --- a/drivers/net/ethernet/ti/am65-cpsw-nuss.c +++ b/drivers/net/ethernet/ti/am65-cpsw-nuss.c @@ -1504,6 +1504,7 @@ static void am65_cpsw_nuss_tx_wake(struct am65_cpsw_tx_chn *tx_chn, struct net_d static int am65_cpsw_nuss_tx_compl_packets(struct am65_cpsw_common *common, int chn, unsigned int budget, bool *tdown) { + bool single_port = AM65_CPSW_IS_CPSW2G(common); enum am65_cpsw_tx_buf_type buf_type; struct device *dev = common->dev; struct am65_cpsw_tx_chn *tx_chn; @@ -1511,6 +1512,7 @@ static int am65_cpsw_nuss_tx_compl_packets(struct am65_cpsw_common *common, unsigned int total_bytes = 0; struct net_device *ndev; struct xdp_frame *xdpf; + unsigned int pkt_len; struct sk_buff *skb; dma_addr_t desc_dma; int res, num_tx = 0; @@ -1518,9 +1520,12 @@ static int am65_cpsw_nuss_tx_compl_packets(struct am65_cpsw_common *common, tx_chn = &common->tx_chns[chn]; while (true) { - spin_lock(&tx_chn->lock); + if (!single_port) + 
spin_lock(&tx_chn->lock); res = k3_udma_glue_pop_tx_chn(tx_chn->tx_chn, &desc_dma); - spin_unlock(&tx_chn->lock); + if (!single_port) + spin_unlock(&tx_chn->lock); + if (res == -ENODATA) break; @@ -1535,23 +1540,35 @@ static int am65_cpsw_nuss_tx_compl_packets(struct am65_cpsw_common *common, if (buf_type == AM65_CPSW_TX_BUF_TYPE_SKB) { skb = am65_cpsw_nuss_tx_compl_packet_skb(tx_chn, desc_dma); ndev = skb->dev; - total_bytes = skb->len; + pkt_len = skb->len; napi_consume_skb(skb, budget); } else { xdpf = am65_cpsw_nuss_tx_compl_packet_xdp(common, tx_chn, desc_dma, &ndev); - total_bytes = xdpf->len; + pkt_len = xdpf->len; if (buf_type == AM65_CPSW_TX_BUF_TYPE_XDP_TX) xdp_return_frame_rx_napi(xdpf); else xdp_return_frame(xdpf); } + + total_bytes += pkt_len; num_tx++; - netif_txq = netdev_get_tx_queue(ndev, chn); + if (!single_port) { + /* as packets from multi ports can be interleaved + * on the same channel, we have to figure out the + * port/queue at every packet and report it/wake queue. 
+ */ + netif_txq = netdev_get_tx_queue(ndev, chn); + netdev_tx_completed_queue(netif_txq, 1, pkt_len); + am65_cpsw_nuss_tx_wake(tx_chn, ndev, netif_txq); + } + } + if (single_port) { + netif_txq = netdev_get_tx_queue(ndev, chn); netdev_tx_completed_queue(netif_txq, num_tx, total_bytes); - am65_cpsw_nuss_tx_wake(tx_chn, ndev, netif_txq); } @@ -1560,66 +1577,6 @@ static int am65_cpsw_nuss_tx_compl_packets(struct am65_cpsw_common *common, return num_tx; } -static int am65_cpsw_nuss_tx_compl_packets_2g(struct am65_cpsw_common *common, - int chn, unsigned int budget, bool *tdown) -{ - enum am65_cpsw_tx_buf_type buf_type; - struct device *dev = common->dev; - struct am65_cpsw_tx_chn *tx_chn; - struct netdev_queue *netif_txq; - unsigned int total_bytes = 0; - struct net_device *ndev; - struct xdp_frame *xdpf; - struct sk_buff *skb; - dma_addr_t desc_dma; - int res, num_tx = 0; - - tx_chn = &common->tx_chns[chn]; - - while (true) { - res = k3_udma_glue_pop_tx_chn(tx_chn->tx_chn, &desc_dma); - if (res == -ENODATA) - break; - - if (cppi5_desc_is_tdcm(desc_dma)) { - if (atomic_dec_and_test(&common->tdown_cnt)) - complete(&common->tdown_complete); - *tdown = true; - break; - } - - buf_type = am65_cpsw_nuss_buf_type(tx_chn, desc_dma); - if (buf_type == AM65_CPSW_TX_BUF_TYPE_SKB) { - skb = am65_cpsw_nuss_tx_compl_packet_skb(tx_chn, desc_dma); - ndev = skb->dev; - total_bytes += skb->len; - napi_consume_skb(skb, budget); - } else { - xdpf = am65_cpsw_nuss_tx_compl_packet_xdp(common, tx_chn, - desc_dma, &ndev); - total_bytes += xdpf->len; - if (buf_type == AM65_CPSW_TX_BUF_TYPE_XDP_TX) - xdp_return_frame_rx_napi(xdpf); - else - xdp_return_frame(xdpf); - } - num_tx++; - } - - if (!num_tx) - return 0; - - netif_txq = netdev_get_tx_queue(ndev, chn); - - netdev_tx_completed_queue(netif_txq, num_tx, total_bytes); - - am65_cpsw_nuss_tx_wake(tx_chn, ndev, netif_txq); - - dev_dbg(dev, "%s:%u pkt:%d\n", __func__, chn, num_tx); - - return num_tx; -} - static enum hrtimer_restart 
am65_cpsw_nuss_tx_timer_callback(struct hrtimer *timer) { struct am65_cpsw_tx_chn *tx_chns = @@ -1635,13 +1592,8 @@ static int am65_cpsw_nuss_tx_poll(struct napi_struct *napi_tx, int budget) bool tdown = false; int num_tx; - if (AM65_CPSW_IS_CPSW2G(tx_chn->common)) - num_tx = am65_cpsw_nuss_tx_compl_packets_2g(tx_chn->common, tx_chn->id, - budget, &tdown); - else - num_tx = am65_cpsw_nuss_tx_compl_packets(tx_chn->common, - tx_chn->id, budget, &tdown); - + num_tx = am65_cpsw_nuss_tx_compl_packets(tx_chn->common, + tx_chn->id, budget, &tdown); if (num_tx >= budget) return budget; -- 2.51.0 From 1ae26bf6151706477fe2b4567be516f0173162fd Mon Sep 17 00:00:00 2001 From: Roger Quadros Date: Mon, 17 Feb 2025 09:31:47 +0200 Subject: [PATCH 13/16] net: ethernet: ti: am65_cpsw: remove cpu argument am65_cpsw_run_xdp am65_cpsw_run_xdp() can figure out the cpu id itself. No need to pass it around 2 functions so drop it. Signed-off-by: Roger Quadros Reviewed-by: Simon Horman Signed-off-by: David S. Miller --- drivers/net/ethernet/ti/am65-cpsw-nuss.c | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/drivers/net/ethernet/ti/am65-cpsw-nuss.c b/drivers/net/ethernet/ti/am65-cpsw-nuss.c index 0ccb8dbcbba4..134802007c93 100644 --- a/drivers/net/ethernet/ti/am65-cpsw-nuss.c +++ b/drivers/net/ethernet/ti/am65-cpsw-nuss.c @@ -1167,14 +1167,14 @@ pool_free: static int am65_cpsw_run_xdp(struct am65_cpsw_rx_flow *flow, struct am65_cpsw_port *port, - struct xdp_buff *xdp, - int cpu, int *len) + struct xdp_buff *xdp, int *len) { struct am65_cpsw_common *common = flow->common; struct net_device *ndev = port->ndev; int ret = AM65_CPSW_XDP_CONSUMED; struct am65_cpsw_tx_chn *tx_chn; struct netdev_queue *netif_txq; + int cpu = smp_processor_id(); struct xdp_frame *xdpf; struct bpf_prog *prog; struct page *page; @@ -1274,7 +1274,7 @@ static void am65_cpsw_nuss_rx_csum(struct sk_buff *skb, u32 csum_info) } static int am65_cpsw_nuss_rx_packets(struct am65_cpsw_rx_flow *flow, 
- int cpu, int *xdp_state) + int *xdp_state) { struct am65_cpsw_rx_chn *rx_chn = &flow->common->rx_chns; u32 buf_dma_len, pkt_len, port_id = 0, csum_info; @@ -1334,8 +1334,7 @@ static int am65_cpsw_nuss_rx_packets(struct am65_cpsw_rx_flow *flow, xdp_init_buff(&xdp, PAGE_SIZE, &port->xdp_rxq[flow->id]); xdp_prepare_buff(&xdp, page_addr, AM65_CPSW_HEADROOM, pkt_len, false); - *xdp_state = am65_cpsw_run_xdp(flow, port, &xdp, - cpu, &pkt_len); + *xdp_state = am65_cpsw_run_xdp(flow, port, &xdp, &pkt_len); if (*xdp_state != AM65_CPSW_XDP_PASS) goto allocate; @@ -1401,7 +1400,6 @@ static int am65_cpsw_nuss_rx_poll(struct napi_struct *napi_rx, int budget) { struct am65_cpsw_rx_flow *flow = am65_cpsw_napi_to_rx_flow(napi_rx); struct am65_cpsw_common *common = flow->common; - int cpu = smp_processor_id(); int xdp_state_or = 0; int cur_budget, ret; int xdp_state; @@ -1410,7 +1408,7 @@ static int am65_cpsw_nuss_rx_poll(struct napi_struct *napi_rx, int budget) /* process only this flow */ cur_budget = budget; while (cur_budget--) { - ret = am65_cpsw_nuss_rx_packets(flow, cpu, &xdp_state); + ret = am65_cpsw_nuss_rx_packets(flow, &xdp_state); xdp_state_or |= xdp_state; if (ret) break; -- 2.51.0 From 09057ce3774ec42d8463548cb5125a5ac61b89ff Mon Sep 17 00:00:00 2001 From: Roger Quadros Date: Mon, 17 Feb 2025 09:31:48 +0200 Subject: [PATCH 14/16] net: ethernet: ti: am65-cpsw: use return instead of goto in am65_cpsw_run_xdp() In am65_cpsw_run_xdp() instead of goto followed by return, simply return. Signed-off-by: Roger Quadros Reviewed-by: Simon Horman Signed-off-by: David S. 
Miller --- drivers/net/ethernet/ti/am65-cpsw-nuss.c | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/drivers/net/ethernet/ti/am65-cpsw-nuss.c b/drivers/net/ethernet/ti/am65-cpsw-nuss.c index 134802007c93..e1d7c3bf16e2 100644 --- a/drivers/net/ethernet/ti/am65-cpsw-nuss.c +++ b/drivers/net/ethernet/ti/am65-cpsw-nuss.c @@ -1193,8 +1193,7 @@ static int am65_cpsw_run_xdp(struct am65_cpsw_rx_flow *flow, switch (act) { case XDP_PASS: - ret = AM65_CPSW_XDP_PASS; - goto out; + return AM65_CPSW_XDP_PASS; case XDP_TX: tx_chn = &common->tx_chns[cpu % AM65_CPSW_MAX_QUEUES]; netif_txq = netdev_get_tx_queue(ndev, tx_chn->id); @@ -1213,15 +1212,13 @@ static int am65_cpsw_run_xdp(struct am65_cpsw_rx_flow *flow, goto drop; dev_sw_netstats_rx_add(ndev, pkt_len); - ret = AM65_CPSW_XDP_CONSUMED; - goto out; + return AM65_CPSW_XDP_CONSUMED; case XDP_REDIRECT: if (unlikely(xdp_do_redirect(ndev, xdp, prog))) goto drop; dev_sw_netstats_rx_add(ndev, pkt_len); - ret = AM65_CPSW_XDP_REDIRECT; - goto out; + return AM65_CPSW_XDP_REDIRECT; default: bpf_warn_invalid_xdp_action(ndev, prog, act); fallthrough; @@ -1236,7 +1233,6 @@ drop: page = virt_to_head_page(xdp->data); am65_cpsw_put_page(flow, page, true); -out: return ret; } -- 2.51.0 From 6d6c7933cea6e51a07be52cf5aba97cd656e0e54 Mon Sep 17 00:00:00 2001 From: Roger Quadros Date: Mon, 17 Feb 2025 09:31:49 +0200 Subject: [PATCH 15/16] net: ethernet: ti: am65_cpsw: move am65_cpsw_put_page() out of am65_cpsw_run_xdp() This allows us to re-use am65_cpsw_run_xdp() for zero copy case. Add AM65_CPSW_XDP_TX case for successful XDP_TX so we don't free the page while in flight. Signed-off-by: Roger Quadros Reviewed-by: Simon Horman Signed-off-by: David S. 
Miller --- drivers/net/ethernet/ti/am65-cpsw-nuss.c | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/drivers/net/ethernet/ti/am65-cpsw-nuss.c b/drivers/net/ethernet/ti/am65-cpsw-nuss.c index e1d7c3bf16e2..20a4fc3e579f 100644 --- a/drivers/net/ethernet/ti/am65-cpsw-nuss.c +++ b/drivers/net/ethernet/ti/am65-cpsw-nuss.c @@ -164,6 +164,7 @@ #define AM65_CPSW_CPPI_TX_PKT_TYPE 0x7 /* XDP */ +#define AM65_CPSW_XDP_TX BIT(2) #define AM65_CPSW_XDP_CONSUMED BIT(1) #define AM65_CPSW_XDP_REDIRECT BIT(0) #define AM65_CPSW_XDP_PASS 0 @@ -1177,7 +1178,6 @@ static int am65_cpsw_run_xdp(struct am65_cpsw_rx_flow *flow, int cpu = smp_processor_id(); struct xdp_frame *xdpf; struct bpf_prog *prog; - struct page *page; int pkt_len; u32 act; int err; @@ -1212,7 +1212,7 @@ static int am65_cpsw_run_xdp(struct am65_cpsw_rx_flow *flow, goto drop; dev_sw_netstats_rx_add(ndev, pkt_len); - return AM65_CPSW_XDP_CONSUMED; + return AM65_CPSW_XDP_TX; case XDP_REDIRECT: if (unlikely(xdp_do_redirect(ndev, xdp, prog))) goto drop; @@ -1230,9 +1230,6 @@ drop: ndev->stats.rx_dropped++; } - page = virt_to_head_page(xdp->data); - am65_cpsw_put_page(flow, page, true); - return ret; } @@ -1331,6 +1328,12 @@ static int am65_cpsw_nuss_rx_packets(struct am65_cpsw_rx_flow *flow, xdp_prepare_buff(&xdp, page_addr, AM65_CPSW_HEADROOM, pkt_len, false); *xdp_state = am65_cpsw_run_xdp(flow, port, &xdp, &pkt_len); + if (*xdp_state == AM65_CPSW_XDP_CONSUMED) { + page = virt_to_head_page(xdp.data); + am65_cpsw_put_page(flow, page, true); + goto allocate; + } + if (*xdp_state != AM65_CPSW_XDP_PASS) goto allocate; -- 2.51.0 From ce643fa62a70f0bb1c33d9fc98ed4d0300b00ff4 Mon Sep 17 00:00:00 2001 From: Roger Quadros Date: Mon, 17 Feb 2025 09:31:50 +0200 Subject: [PATCH 16/16] net: ethernet: ti am65_cpsw: Drop separate TX completion functions Drop separate TX completion functions for SKB and XDP. To do that use the SW_DATA mechanism to store ndev and skb/xdpf for TX packets. 
Use BUILD_BUG_ON_MSG() to fail the build if the SW_DATA size exceeds what's available, i.e. AM65_CPSW_NAV_SW_DATA_SIZE. Signed-off-by: Roger Quadros Reviewed-by: Simon Horman Signed-off-by: David S. Miller --- drivers/net/ethernet/ti/am65-cpsw-nuss.c | 82 +++++++----------------- drivers/net/ethernet/ti/am65-cpsw-nuss.h | 8 +++ 2 files changed, 32 insertions(+), 58 deletions(-) diff --git a/drivers/net/ethernet/ti/am65-cpsw-nuss.c b/drivers/net/ethernet/ti/am65-cpsw-nuss.c index 20a4fc3e579f..3e671be95d6f 100644 --- a/drivers/net/ethernet/ti/am65-cpsw-nuss.c +++ b/drivers/net/ethernet/ti/am65-cpsw-nuss.c @@ -830,19 +830,19 @@ static void am65_cpsw_nuss_tx_cleanup(void *data, dma_addr_t desc_dma) { struct am65_cpsw_tx_chn *tx_chn = data; enum am65_cpsw_tx_buf_type buf_type; + struct am65_cpsw_tx_swdata *swdata; struct cppi5_host_desc_t *desc_tx; struct xdp_frame *xdpf; struct sk_buff *skb; - void **swdata; desc_tx = k3_cppi_desc_pool_dma2virt(tx_chn->desc_pool, desc_dma); swdata = cppi5_hdesc_get_swdata(desc_tx); buf_type = am65_cpsw_nuss_buf_type(tx_chn, desc_dma); if (buf_type == AM65_CPSW_TX_BUF_TYPE_SKB) { - skb = *(swdata); + skb = swdata->skb; dev_kfree_skb_any(skb); } else { - xdpf = *(swdata); + xdpf = swdata->xdpf; xdp_return_frame(xdpf); } @@ -1099,10 +1099,10 @@ static int am65_cpsw_xdp_tx_frame(struct net_device *ndev, struct am65_cpsw_common *common = am65_ndev_to_common(ndev); struct am65_cpsw_port *port = am65_ndev_to_port(ndev); struct cppi5_host_desc_t *host_desc; + struct am65_cpsw_tx_swdata *swdata; struct netdev_queue *netif_txq; dma_addr_t dma_desc, dma_buf; u32 pkt_len = xdpf->len; - void **swdata; int ret; host_desc = k3_cppi_desc_pool_alloc(tx_chn->desc_pool); @@ -1132,7 +1132,8 @@ static int am65_cpsw_xdp_tx_frame(struct net_device *ndev, cppi5_hdesc_attach_buf(host_desc, dma_buf, pkt_len, dma_buf, pkt_len); swdata = cppi5_hdesc_get_swdata(host_desc); - *(swdata) = xdpf; + swdata->ndev = ndev; + swdata->xdpf = xdpf; /* Report BQL before sending the 
packet */ netif_txq = netdev_get_tx_queue(ndev, tx_chn->id); @@ -1435,52 +1436,6 @@ static int am65_cpsw_nuss_rx_poll(struct napi_struct *napi_rx, int budget) return num_rx; } -static struct sk_buff * -am65_cpsw_nuss_tx_compl_packet_skb(struct am65_cpsw_tx_chn *tx_chn, - dma_addr_t desc_dma) -{ - struct cppi5_host_desc_t *desc_tx; - struct sk_buff *skb; - void **swdata; - - desc_tx = k3_cppi_desc_pool_dma2virt(tx_chn->desc_pool, - desc_dma); - swdata = cppi5_hdesc_get_swdata(desc_tx); - skb = *(swdata); - am65_cpsw_nuss_xmit_free(tx_chn, desc_tx); - - am65_cpts_tx_timestamp(tx_chn->common->cpts, skb); - - dev_sw_netstats_tx_add(skb->dev, 1, skb->len); - - return skb; -} - -static struct xdp_frame * -am65_cpsw_nuss_tx_compl_packet_xdp(struct am65_cpsw_common *common, - struct am65_cpsw_tx_chn *tx_chn, - dma_addr_t desc_dma, - struct net_device **ndev) -{ - struct cppi5_host_desc_t *desc_tx; - struct am65_cpsw_port *port; - struct xdp_frame *xdpf; - u32 port_id = 0; - void **swdata; - - desc_tx = k3_cppi_desc_pool_dma2virt(tx_chn->desc_pool, desc_dma); - cppi5_desc_get_tags_ids(&desc_tx->hdr, NULL, &port_id); - swdata = cppi5_hdesc_get_swdata(desc_tx); - xdpf = *(swdata); - am65_cpsw_nuss_xmit_free(tx_chn, desc_tx); - - port = am65_common_get_port(common, port_id); - dev_sw_netstats_tx_add(port->ndev, 1, xdpf->len); - *ndev = port->ndev; - - return xdpf; -} - static void am65_cpsw_nuss_tx_wake(struct am65_cpsw_tx_chn *tx_chn, struct net_device *ndev, struct netdev_queue *netif_txq) { @@ -1503,6 +1458,8 @@ static int am65_cpsw_nuss_tx_compl_packets(struct am65_cpsw_common *common, { bool single_port = AM65_CPSW_IS_CPSW2G(common); enum am65_cpsw_tx_buf_type buf_type; + struct am65_cpsw_tx_swdata *swdata; + struct cppi5_host_desc_t *desc_tx; struct device *dev = common->dev; struct am65_cpsw_tx_chn *tx_chn; struct netdev_queue *netif_txq; @@ -1533,15 +1490,18 @@ static int am65_cpsw_nuss_tx_compl_packets(struct am65_cpsw_common *common, break; } + desc_tx = 
k3_cppi_desc_pool_dma2virt(tx_chn->desc_pool, + desc_dma); + swdata = cppi5_hdesc_get_swdata(desc_tx); + ndev = swdata->ndev; buf_type = am65_cpsw_nuss_buf_type(tx_chn, desc_dma); if (buf_type == AM65_CPSW_TX_BUF_TYPE_SKB) { - skb = am65_cpsw_nuss_tx_compl_packet_skb(tx_chn, desc_dma); - ndev = skb->dev; + skb = swdata->skb; + am65_cpts_tx_timestamp(tx_chn->common->cpts, skb); pkt_len = skb->len; napi_consume_skb(skb, budget); } else { - xdpf = am65_cpsw_nuss_tx_compl_packet_xdp(common, tx_chn, - desc_dma, &ndev); + xdpf = swdata->xdpf; pkt_len = xdpf->len; if (buf_type == AM65_CPSW_TX_BUF_TYPE_XDP_TX) xdp_return_frame_rx_napi(xdpf); @@ -1551,7 +1511,8 @@ static int am65_cpsw_nuss_tx_compl_packets(struct am65_cpsw_common *common, total_bytes += pkt_len; num_tx++; - + am65_cpsw_nuss_xmit_free(tx_chn, desc_tx); + dev_sw_netstats_tx_add(ndev, 1, pkt_len); if (!single_port) { /* as packets from multi ports can be interleaved * on the same channel, we have to figure out the @@ -1634,12 +1595,12 @@ static netdev_tx_t am65_cpsw_nuss_ndo_slave_xmit(struct sk_buff *skb, struct am65_cpsw_common *common = am65_ndev_to_common(ndev); struct cppi5_host_desc_t *first_desc, *next_desc, *cur_desc; struct am65_cpsw_port *port = am65_ndev_to_port(ndev); + struct am65_cpsw_tx_swdata *swdata; struct device *dev = common->dev; struct am65_cpsw_tx_chn *tx_chn; struct netdev_queue *netif_txq; dma_addr_t desc_dma, buf_dma; int ret, q_idx, i; - void **swdata; u32 *psdata; u32 pkt_len; @@ -1685,7 +1646,8 @@ static netdev_tx_t am65_cpsw_nuss_ndo_slave_xmit(struct sk_buff *skb, k3_udma_glue_tx_dma_to_cppi5_addr(tx_chn->tx_chn, &buf_dma); cppi5_hdesc_attach_buf(first_desc, buf_dma, pkt_len, buf_dma, pkt_len); swdata = cppi5_hdesc_get_swdata(first_desc); - *(swdata) = skb; + swdata->ndev = ndev; + swdata->skb = skb; psdata = cppi5_hdesc_get_psdata(first_desc); /* HW csum offload if enabled */ @@ -3527,6 +3489,10 @@ static int am65_cpsw_nuss_probe(struct platform_device *pdev) __be64 id_temp; int 
ret, i; + BUILD_BUG_ON_MSG(sizeof(struct am65_cpsw_tx_swdata) > AM65_CPSW_NAV_SW_DATA_SIZE, + "TX SW_DATA size exceeds AM65_CPSW_NAV_SW_DATA_SIZE"); + BUILD_BUG_ON_MSG(sizeof(struct am65_cpsw_swdata) > AM65_CPSW_NAV_SW_DATA_SIZE, + "SW_DATA size exceeds AM65_CPSW_NAV_SW_DATA_SIZE"); common = devm_kzalloc(dev, sizeof(struct am65_cpsw_common), GFP_KERNEL); if (!common) return -ENOMEM; diff --git a/drivers/net/ethernet/ti/am65-cpsw-nuss.h b/drivers/net/ethernet/ti/am65-cpsw-nuss.h index e7832a5cf3cc..917c37e4e89b 100644 --- a/drivers/net/ethernet/ti/am65-cpsw-nuss.h +++ b/drivers/net/ethernet/ti/am65-cpsw-nuss.h @@ -104,6 +104,14 @@ struct am65_cpsw_rx_flow { char name[32]; }; +struct am65_cpsw_tx_swdata { + struct net_device *ndev; + union { + struct sk_buff *skb; + struct xdp_frame *xdpf; + }; +}; + struct am65_cpsw_swdata { u32 flow_id; struct page *page; -- 2.51.0