From 9faaaef27c5df617223ad725f3fac9a21d333e81 Mon Sep 17 00:00:00 2001 From: Pei Xiao Date: Mon, 17 Feb 2025 09:29:30 +0800 Subject: [PATCH 01/16] net: freescale: ucc_geth: make ugeth_mac_ops be static const sparse warning: sparse: symbol 'ugeth_mac_ops' was not declared. Should it be static. Add static to fix sparse warnings and add const. phylink_create() will accept a const struct. Reported-by: kernel test robot Closes: https://lore.kernel.org/202502141128.9HfxcdIE-lkp@intel.com Signed-off-by: Pei Xiao Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/freescale/ucc_geth.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/freescale/ucc_geth.c b/drivers/net/ethernet/freescale/ucc_geth.c index 88510f822759..affd5a6c44e7 100644 --- a/drivers/net/ethernet/freescale/ucc_geth.c +++ b/drivers/net/ethernet/freescale/ucc_geth.c @@ -3408,7 +3408,7 @@ static int ucc_geth_parse_clock(struct device_node *np, const char *which, return 0; } -struct phylink_mac_ops ugeth_mac_ops = { +static const struct phylink_mac_ops ugeth_mac_ops = { .mac_link_up = ugeth_mac_link_up, .mac_link_down = ugeth_mac_link_down, .mac_config = ugeth_mac_config, -- 2.51.0 From 952d7325362ffbefa6ce5619fb4e53c2159ec7a7 Mon Sep 17 00:00:00 2001 From: Qingfang Deng Date: Mon, 17 Feb 2025 17:40:21 +0800 Subject: [PATCH 02/16] net: ethernet: mediatek: add EEE support Add EEE support to MediaTek SoC Ethernet. The register fields are similar to the ones in MT7531, except that the LPI threshold is in milliseconds. Signed-off-by: Qingfang Deng Reviewed-by: Russell King (Oracle) Link: https://patch.msgid.link/20250217094022.1065436-1-dqfext@gmail.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/mediatek/mtk_eth_soc.c | 67 +++++++++++++++++++++ drivers/net/ethernet/mediatek/mtk_eth_soc.h | 11 ++++ 2 files changed, 78 insertions(+) diff --git a/drivers/net/ethernet/mediatek/mtk_eth_soc.c b/drivers/net/ethernet/mediatek/mtk_eth_soc.c index 0ad965ced5ef..922330b3f4d7 100644 --- a/drivers/net/ethernet/mediatek/mtk_eth_soc.c +++ b/drivers/net/ethernet/mediatek/mtk_eth_soc.c @@ -815,12 +815,60 @@ static void mtk_mac_link_up(struct phylink_config *config, mtk_w32(mac->hw, mcr, MTK_MAC_MCR(mac->id)); } +static void mtk_mac_disable_tx_lpi(struct phylink_config *config) +{ + struct mtk_mac *mac = container_of(config, struct mtk_mac, + phylink_config); + struct mtk_eth *eth = mac->hw; + + mtk_m32(eth, MAC_MCR_EEE100M | MAC_MCR_EEE1G, 0, MTK_MAC_MCR(mac->id)); +} + +static int mtk_mac_enable_tx_lpi(struct phylink_config *config, u32 timer, + bool tx_clk_stop) +{ + struct mtk_mac *mac = container_of(config, struct mtk_mac, + phylink_config); + struct mtk_eth *eth = mac->hw; + u32 val; + + /* Tx idle timer in ms */ + timer = DIV_ROUND_UP(timer, 1000); + + /* If the timer is zero, then set LPI_MODE, which allows the + * system to enter LPI mode immediately rather than waiting for + * the LPI threshold. + */ + if (!timer) + val = MAC_EEE_LPI_MODE; + else if (FIELD_FIT(MAC_EEE_LPI_TXIDLE_THD, timer)) + val = FIELD_PREP(MAC_EEE_LPI_TXIDLE_THD, timer); + else + val = MAC_EEE_LPI_TXIDLE_THD; + + if (tx_clk_stop) + val |= MAC_EEE_CKG_TXIDLE; + + /* PHY Wake-up time, this field does not have a reset value, so use the + * reset value from MT7531 (36us for 100M and 17us for 1000M). + */ + val |= FIELD_PREP(MAC_EEE_WAKEUP_TIME_1000, 17) | + FIELD_PREP(MAC_EEE_WAKEUP_TIME_100, 36); + + mtk_w32(eth, val, MTK_MAC_EEECR(mac->id)); + mtk_m32(eth, 0, MAC_MCR_EEE100M | MAC_MCR_EEE1G, MTK_MAC_MCR(mac->id)); + + return 0; +} + static const struct phylink_mac_ops mtk_phylink_ops = { .mac_select_pcs = mtk_mac_select_pcs, .mac_config = mtk_mac_config, .mac_finish = mtk_mac_finish, .mac_link_down = mtk_mac_link_down, .mac_link_up = mtk_mac_link_up, + .mac_disable_tx_lpi = mtk_mac_disable_tx_lpi, + .mac_enable_tx_lpi = mtk_mac_enable_tx_lpi, }; static int mtk_mdio_init(struct mtk_eth *eth) @@ -4469,6 +4517,20 @@ static int mtk_set_pauseparam(struct net_device *dev, struct ethtool_pauseparam return phylink_ethtool_set_pauseparam(mac->phylink, pause); } +static int mtk_get_eee(struct net_device *dev, struct ethtool_keee *eee) +{ + struct mtk_mac *mac = netdev_priv(dev); + + return phylink_ethtool_get_eee(mac->phylink, eee); +} + +static int mtk_set_eee(struct net_device *dev, struct ethtool_keee *eee) +{ + struct mtk_mac *mac = netdev_priv(dev); + + return phylink_ethtool_set_eee(mac->phylink, eee); +} + static u16 mtk_select_queue(struct net_device *dev, struct sk_buff *skb, struct net_device *sb_dev) { @@ -4501,6 +4563,8 @@ static const struct ethtool_ops mtk_ethtool_ops = { .set_pauseparam = mtk_set_pauseparam, .get_rxnfc = mtk_get_rxnfc, .set_rxnfc = mtk_set_rxnfc, + .get_eee = mtk_get_eee, + .set_eee = mtk_set_eee, }; static const struct net_device_ops mtk_netdev_ops = { @@ -4610,6 +4674,9 @@ static int mtk_add_mac(struct mtk_eth *eth, struct device_node *np) mac->phylink_config.type = PHYLINK_NETDEV; mac->phylink_config.mac_capabilities = MAC_ASYM_PAUSE | MAC_SYM_PAUSE | MAC_10 | MAC_100 | MAC_1000 | MAC_2500FD; + mac->phylink_config.lpi_capabilities = MAC_100FD | MAC_1000FD | + MAC_2500FD; + mac->phylink_config.lpi_timer_default = 1000; /* MT7623 gmac0 is now missing its speed-specific PLL configuration * in its .mac_config method (since state->speed is not valid there. diff --git a/drivers/net/ethernet/mediatek/mtk_eth_soc.h b/drivers/net/ethernet/mediatek/mtk_eth_soc.h index 0d5225f1d3ee..90a377ab4359 100644 --- a/drivers/net/ethernet/mediatek/mtk_eth_soc.h +++ b/drivers/net/ethernet/mediatek/mtk_eth_soc.h @@ -453,6 +453,8 @@ #define MAC_MCR_RX_FIFO_CLR_DIS BIT(12) #define MAC_MCR_BACKOFF_EN BIT(9) #define MAC_MCR_BACKPR_EN BIT(8) +#define MAC_MCR_EEE1G BIT(7) +#define MAC_MCR_EEE100M BIT(6) #define MAC_MCR_FORCE_RX_FC BIT(5) #define MAC_MCR_FORCE_TX_FC BIT(4) #define MAC_MCR_SPEED_1000 BIT(3) @@ -461,6 +463,15 @@ #define MAC_MCR_FORCE_LINK BIT(0) #define MAC_MCR_FORCE_LINK_DOWN (MAC_MCR_FORCE_MODE) +/* Mac EEE control registers */ +#define MTK_MAC_EEECR(x) (0x10104 + (x * 0x100)) +#define MAC_EEE_WAKEUP_TIME_1000 GENMASK(31, 24) +#define MAC_EEE_WAKEUP_TIME_100 GENMASK(23, 16) +#define MAC_EEE_LPI_TXIDLE_THD GENMASK(15, 8) +#define MAC_EEE_CKG_TXIDLE BIT(3) +#define MAC_EEE_CKG_RXLPI BIT(2) +#define MAC_EEE_LPI_MODE BIT(0) + /* Mac status registers */ #define MTK_MAC_MSR(x) (0x10108 + (x * 0x100)) #define MAC_MSR_EEE1G BIT(7) -- 2.51.0 From f29e41454b94e6e4f3cdf340947b61fd22950c96 Mon Sep 17 00:00:00 2001 From: Chandra Mohan Sundar Date: Mon, 17 Feb 2025 19:45:16 +0530 Subject: [PATCH 03/16] selftests: net: Fix few spelling mistakes Fix few spelling mistakes in net selftests Signed-off-by: Chandra Mohan Sundar Reviewed-by: Petr Machata Link: https://patch.msgid.link/20250217141520.81033-1-chandru.dav@gmail.com Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/fcnal-test.sh | 4 ++-- tools/testing/selftests/net/fdb_flush.sh | 2 +- tools/testing/selftests/net/fib_nexthops.sh | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/tools/testing/selftests/net/fcnal-test.sh b/tools/testing/selftests/net/fcnal-test.sh index 899dbad0104b..4fcc38907e48 100755 --- a/tools/testing/selftests/net/fcnal-test.sh +++ b/tools/testing/selftests/net/fcnal-test.sh @@ -3667,7 +3667,7 @@ ipv6_addr_bind_novrf() # when it really should not a=${NSA_LO_IP6} log_start - show_hint "Tecnically should fail since address is not on device but kernel allows" + show_hint "Technically should fail since address is not on device but kernel allows" run_cmd nettest -6 -s -l ${a} -I ${NSA_DEV} -t1 -b log_test_addr ${a} $? 0 "TCP socket bind to out of scope local address" } @@ -3724,7 +3724,7 @@ ipv6_addr_bind_vrf() # passes when it really should not a=${VRF_IP6} log_start - show_hint "Tecnically should fail since address is not on device but kernel allows" + show_hint "Technically should fail since address is not on device but kernel allows" run_cmd nettest -6 -s -l ${a} -I ${NSA_DEV} -t1 -b log_test_addr ${a} $? 0 "TCP socket bind to VRF address with device bind" diff --git a/tools/testing/selftests/net/fdb_flush.sh b/tools/testing/selftests/net/fdb_flush.sh index d5e3abb8658c..9931a1e36e3d 100755 --- a/tools/testing/selftests/net/fdb_flush.sh +++ b/tools/testing/selftests/net/fdb_flush.sh @@ -583,7 +583,7 @@ vxlan_test_flush_by_remote_attributes() $IP link del dev vx10 $IP link add name vx10 type vxlan dstport "$VXPORT" external - # For multicat FDB entries, the VXLAN driver stores a linked list of + # For multicast FDB entries, the VXLAN driver stores a linked list of # remotes for a given key. Verify that only the expected remotes are # flushed. multicast_fdb_entries_add diff --git a/tools/testing/selftests/net/fib_nexthops.sh b/tools/testing/selftests/net/fib_nexthops.sh index 77c83d9508d3..bea1282e0281 100755 --- a/tools/testing/selftests/net/fib_nexthops.sh +++ b/tools/testing/selftests/net/fib_nexthops.sh @@ -741,7 +741,7 @@ ipv6_fcnal() run_cmd "$IP nexthop add id 52 via 2001:db8:92::3" log_test $? 2 "Create nexthop - gw only" - # gw is not reachable throught given dev + # gw is not reachable through given dev run_cmd "$IP nexthop add id 53 via 2001:db8:3::3 dev veth1" log_test $? 2 "Create nexthop - invalid gw+dev combination" -- 2.51.0 From 8f02c48f8f623eedc3c0a26a64c7ef155c35bfb9 Mon Sep 17 00:00:00 2001 From: Breno Leitao Date: Mon, 17 Feb 2025 07:48:13 -0800 Subject: [PATCH 04/16] net: Remove redundant variable declaration in __dev_change_flags() The old_flags variable is declared twice in __dev_change_flags(), causing a shadow variable warning. This patch fixes the issue by removing the redundant declaration, reusing the existing old_flags variable instead. net/core/dev.c:9225:16: warning: declaration shadows a local variable [-Wshadow] 9225 | unsigned int old_flags = dev->flags; | ^ net/core/dev.c:9185:15: note: previous declaration is here 9185 | unsigned int old_flags = dev->flags; | ^ 1 warning generated. Remove the redundant inner declaration and reuse the existing old_flags variable since its value is not needed outside the if block, and it is safe to reuse the variable. This eliminates the warning while maintaining the same functionality. Signed-off-by: Breno Leitao Reviewed-by: Mateusz Polchlopek Reviewed-by: Kalesh AP Reviewed-by: Nicolas Dichtel Reviewed-by: Andrew Lunn Link: https://patch.msgid.link/20250217-old_flags-v2-1-4cda3b43a35f@debian.org Signed-off-by: Jakub Kicinski --- net/core/dev.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/core/dev.c b/net/core/dev.c index bcb266ab2912..ebc000b56828 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -9182,7 +9182,7 @@ int __dev_change_flags(struct net_device *dev, unsigned int flags, if ((flags ^ dev->gflags) & IFF_PROMISC) { int inc = (flags & IFF_PROMISC) ? 1 : -1; - unsigned int old_flags = dev->flags; + old_flags = dev->flags; dev->gflags ^= IFF_PROMISC; -- 2.51.0 From 3a03f9ec5d333b9998fbc63fd3e075b9d1991b89 Mon Sep 17 00:00:00 2001 From: Yu-Chun Lin Date: Mon, 17 Feb 2025 23:58:33 +0800 Subject: [PATCH 05/16] net: stmmac: Use str_enabled_disabled() helper As kernel test robot reported, the following warning occurs: cocci warnings: (new ones prefixed by >>) >> drivers/net/ethernet/stmicro/stmmac/dwmac1000_core.c:582:6-8: opportunity for str_enabled_disabled(on) Replace ternary (condition ? "enabled" : "disabled") with str_enabled_disabled() from string_choices.h to improve readability, maintain uniform string usage, and reduce binary size through linker deduplication. Reviewed-by: Huacai Chen Reviewed-by: Russell King (Oracle) Signed-off-by: Yu-Chun Lin Link: https://patch.msgid.link/20250217155833.3105775-1-eleanor15x@gmail.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/stmicro/stmmac/dwmac1000_core.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac1000_core.c b/drivers/net/ethernet/stmicro/stmmac/dwmac1000_core.c index 7900bf3effa7..a8b901cdf5cb 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac1000_core.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac1000_core.c @@ -16,6 +16,7 @@ #include #include #include +#include #include "stmmac.h" #include "stmmac_pcs.h" #include "stmmac_ptp.h" @@ -625,7 +626,7 @@ int dwmac1000_ptp_enable(struct ptp_clock_info *ptp, } netdev_dbg(priv->dev, "Auxiliary Snapshot %s.\n", - on ? "enabled" : "disabled"); + str_enabled_disabled(on)); writel(tcr_val, ptpaddr + PTP_TCR); /* wait for auxts fifo clear to finish */ -- 2.51.0 From aaf6532d119d8ad4c75420b021d2649864133583 Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Fri, 14 Feb 2025 17:26:58 -0500 Subject: [PATCH 06/16] tcp: only initialize sockcm tsflags field TCP only reads the tsflags field. Don't bother initializing others. Signed-off-by: Willem de Bruijn Reviewed-by: David Ahern Link: https://patch.msgid.link/20250214222720.3205500-2-willemdebruijn.kernel@gmail.com Signed-off-by: Jakub Kicinski --- net/ipv4/tcp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 5d78ab3b416e..6a8f19a10911 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -1127,7 +1127,7 @@ int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size) /* 'common' sending to sendq */ } - sockcm_init(&sockc, sk); + sockc = (struct sockcm_cookie) { .tsflags = READ_ONCE(sk->sk_tsflags)}; if (msg->msg_controllen) { err = sock_cmsg_send(sk, msg, &sockc); if (unlikely(err)) { -- 2.51.0 From 6ad861519a69ecf3cf032c579e18569f62b81263 Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Fri, 14 Feb 2025 17:26:59 -0500 Subject: [PATCH 07/16] net: initialize mark in sockcm_init Avoid open coding initialization of sockcm fields. Avoid reading the sk_priority field twice. This ensures all callers, existing and future, will correctly try a cmsg passed mark before sk_mark. This patch extends support for cmsg mark to: packet_spkt and packet_tpacket and net/can/raw.c. This patch extends support for cmsg priority to: packet_spkt and packet_tpacket. Signed-off-by: Willem de Bruijn Reviewed-by: David Ahern Link: https://patch.msgid.link/20250214222720.3205500-3-willemdebruijn.kernel@gmail.com Signed-off-by: Jakub Kicinski --- include/net/sock.h | 1 + net/can/raw.c | 2 +- net/packet/af_packet.c | 9 ++++----- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/include/net/sock.h b/include/net/sock.h index 60ebf3c7b229..fac65ed30983 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -1829,6 +1829,7 @@ static inline void sockcm_init(struct sockcm_cookie *sockc, const struct sock *sk) { *sockc = (struct sockcm_cookie) { + .mark = READ_ONCE(sk->sk_mark), .tsflags = READ_ONCE(sk->sk_tsflags), .priority = READ_ONCE(sk->sk_priority), }; diff --git a/net/can/raw.c b/net/can/raw.c index 46e8ed9d64da..9b1d5f036f57 100644 --- a/net/can/raw.c +++ b/net/can/raw.c @@ -963,7 +963,7 @@ static int raw_sendmsg(struct socket *sock, struct msghdr *msg, size_t size) skb->dev = dev; skb->priority = sockc.priority; - skb->mark = READ_ONCE(sk->sk_mark); + skb->mark = sockc.mark; skb->tstamp = sockc.transmit_time; skb_setup_tx_timestamp(skb, &sockc); diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index c131e5ceea37..3e9ddf72cd03 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -2102,8 +2102,8 @@ retry: skb->protocol = proto; skb->dev = dev; - skb->priority = READ_ONCE(sk->sk_priority); - skb->mark = READ_ONCE(sk->sk_mark); + skb->priority = sockc.priority; + skb->mark = sockc.mark; skb_set_delivery_type_by_clockid(skb, sockc.transmit_time, sk->sk_clockid); skb_setup_tx_timestamp(skb, &sockc); @@ -2634,8 +2634,8 @@ static int tpacket_fill_skb(struct packet_sock *po, struct sk_buff *skb, skb->protocol = proto; skb->dev = dev; - skb->priority = READ_ONCE(po->sk.sk_priority); - skb->mark = READ_ONCE(po->sk.sk_mark); + skb->priority = sockc->priority; + skb->mark = sockc->mark; skb_set_delivery_type_by_clockid(skb, sockc->transmit_time, po->sk.sk_clockid); skb_setup_tx_timestamp(skb, sockc); skb_zcopy_set_nouarg(skb, ph.raw); @@ -3039,7 +3039,6 @@ static int packet_snd(struct socket *sock, struct msghdr *msg, size_t len) goto out_unlock; sockcm_init(&sockc, sk); - sockc.mark = READ_ONCE(sk->sk_mark); if (msg->msg_controllen) { err = sock_cmsg_send(sk, msg, &sockc); if (unlikely(err)) -- 2.51.0 From 94788792f37902f1f4d417f6f9663831cf7e91fc Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Fri, 14 Feb 2025 17:27:00 -0500 Subject: [PATCH 08/16] ipv4: initialize inet socket cookies with sockcm_init Avoid open coding the same logic. Signed-off-by: Willem de Bruijn Reviewed-by: David Ahern Link: https://patch.msgid.link/20250214222720.3205500-4-willemdebruijn.kernel@gmail.com Signed-off-by: Jakub Kicinski --- include/net/ip.h | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/include/net/ip.h b/include/net/ip.h index 305eccdf4ff7..3c4ef5ddad83 100644 --- a/include/net/ip.h +++ b/include/net/ip.h @@ -94,9 +94,8 @@ static inline void ipcm_init_sk(struct ipcm_cookie *ipcm, { ipcm_init(ipcm); - ipcm->sockc.mark = READ_ONCE(inet->sk.sk_mark); - ipcm->sockc.priority = READ_ONCE(inet->sk.sk_priority); - ipcm->sockc.tsflags = READ_ONCE(inet->sk.sk_tsflags); + sockcm_init(&ipcm->sockc, &inet->sk); + ipcm->oif = READ_ONCE(inet->sk.sk_bound_dev_if); ipcm->addr = inet->inet_saddr; ipcm->protocol = inet->inet_num; -- 2.51.0 From 9329b58395e51bba9c847419cc4ba176df3dd2b7 Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Fri, 14 Feb 2025 17:27:01 -0500 Subject: [PATCH 09/16] ipv4: remove get_rttos Initialize the ip cookie tos field when initializing the cookie, in ipcm_init_sk. The existing code inverts the standard pattern for initializing cookie fields. Default is to initialize the field from the sk, then possibly overwrite that when parsing cmsgs (the unlikely case). This field inverts that, setting the field to an illegal value and after cmsg parsing checking whether the value is still illegal and thus should be overridden. Be careful to always apply mask INET_DSCP_MASK, as before. Signed-off-by: Willem de Bruijn Reviewed-by: David Ahern Link: https://patch.msgid.link/20250214222720.3205500-5-willemdebruijn.kernel@gmail.com Signed-off-by: Jakub Kicinski --- include/net/ip.h | 11 +++-------- net/ipv4/ping.c | 6 +++--- net/ipv4/raw.c | 6 +++--- net/ipv4/udp.c | 6 +++--- 4 files changed, 12 insertions(+), 17 deletions(-) diff --git a/include/net/ip.h b/include/net/ip.h index 3c4ef5ddad83..ce5e59957dd5 100644 --- a/include/net/ip.h +++ b/include/net/ip.h @@ -92,7 +92,9 @@ static inline void ipcm_init(struct ipcm_cookie *ipcm) static inline void ipcm_init_sk(struct ipcm_cookie *ipcm, const struct inet_sock *inet) { - ipcm_init(ipcm); + *ipcm = (struct ipcm_cookie) { + .tos = READ_ONCE(inet->tos), + }; sockcm_init(&ipcm->sockc, &inet->sk); @@ -256,13 +258,6 @@ static inline u8 ip_sendmsg_scope(const struct inet_sock *inet, return RT_SCOPE_UNIVERSE; } -static inline __u8 get_rttos(struct ipcm_cookie* ipc, struct inet_sock *inet) -{ - u8 dsfield = ipc->tos != -1 ? ipc->tos : READ_ONCE(inet->tos); - - return dsfield & INET_DSCP_MASK; -} - /* datagram.c */ int __ip4_datagram_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len); int ip4_datagram_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len); diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c index 619ddc087957..85d09f2ecadc 100644 --- a/net/ipv4/ping.c +++ b/net/ipv4/ping.c @@ -705,7 +705,7 @@ static int ping_v4_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) struct ip_options_data opt_copy; int free = 0; __be32 saddr, daddr, faddr; - u8 tos, scope; + u8 scope; int err; pr_debug("ping_v4_sendmsg(sk=%p,sk->num=%u)\n", inet, inet->inet_num); @@ -768,7 +768,6 @@ static int ping_v4_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) } faddr = ipc.opt->opt.faddr; } - tos = get_rttos(&ipc, inet); scope = ip_sendmsg_scope(inet, &ipc, msg); if (ipv4_is_multicast(daddr)) { @@ -779,7 +778,8 @@ static int ping_v4_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) } else if (!ipc.oif) ipc.oif = READ_ONCE(inet->uc_index); - flowi4_init_output(&fl4, ipc.oif, ipc.sockc.mark, tos, scope, + flowi4_init_output(&fl4, ipc.oif, ipc.sockc.mark, + ipc.tos & INET_DSCP_MASK, scope, sk->sk_protocol, inet_sk_flowi_flags(sk), faddr, saddr, 0, 0, sk->sk_uid); diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c index 4304a68d1db0..6aace4d55733 100644 --- a/net/ipv4/raw.c +++ b/net/ipv4/raw.c @@ -486,7 +486,7 @@ static int raw_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) struct ipcm_cookie ipc; struct rtable *rt = NULL; struct flowi4 fl4; - u8 tos, scope; + u8 scope; int free = 0; __be32 daddr; __be32 saddr; @@ -581,7 +581,6 @@ static int raw_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) daddr = ipc.opt->opt.faddr; } } - tos = get_rttos(&ipc, inet); scope = ip_sendmsg_scope(inet, &ipc, msg); uc_index = READ_ONCE(inet->uc_index); @@ -606,7 +605,8 @@ static int raw_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) } } - flowi4_init_output(&fl4, ipc.oif, ipc.sockc.mark, tos, scope, + flowi4_init_output(&fl4, ipc.oif, ipc.sockc.mark, + ipc.tos & INET_DSCP_MASK, scope, hdrincl ? ipc.protocol : sk->sk_protocol, inet_sk_flowi_flags(sk) | (hdrincl ? FLOWI_FLAG_KNOWN_NH : 0), diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 3485989cd4bd..17c7736d8349 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -1280,7 +1280,7 @@ int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) int free = 0; int connected = 0; __be32 daddr, faddr, saddr; - u8 tos, scope; + u8 scope; __be16 dport; int err, is_udplite = IS_UDPLITE(sk); int corkreq = udp_test_bit(CORK, sk) || msg->msg_flags & MSG_MORE; @@ -1404,7 +1404,6 @@ int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) faddr = ipc.opt->opt.faddr; connected = 0; } - tos = get_rttos(&ipc, inet); scope = ip_sendmsg_scope(inet, &ipc, msg); if (scope == RT_SCOPE_LINK) connected = 0; @@ -1441,7 +1440,8 @@ int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) fl4 = &fl4_stack; - flowi4_init_output(fl4, ipc.oif, ipc.sockc.mark, tos, scope, + flowi4_init_output(fl4, ipc.oif, ipc.sockc.mark, + ipc.tos & INET_DSCP_MASK, scope, sk->sk_protocol, flow_flags, faddr, saddr, dport, inet->inet_sport, sk->sk_uid); -- 2.51.0 From e8485911050a60091d1bf51a162f0a2654729fad Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Fri, 14 Feb 2025 17:27:02 -0500 Subject: [PATCH 10/16] icmp: reflect tos through ip cookie rather than updating inet_sk Do not modify socket fields if it can be avoided. The current code predates the introduction of ip cookies in commit aa6615814533 ("ipv4: processing ancillary IP_TOS or IP_TTL"). Now that cookies exist and support tos, update that field directly. Signed-off-by: Willem de Bruijn Reviewed-by: David Ahern Link: https://patch.msgid.link/20250214222720.3205500-6-willemdebruijn.kernel@gmail.com Signed-off-by: Jakub Kicinski --- net/ipv4/icmp.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c index 5482edb5aade..799775ba97d4 100644 --- a/net/ipv4/icmp.c +++ b/net/ipv4/icmp.c @@ -405,7 +405,6 @@ static void icmp_reply(struct icmp_bxm *icmp_param, struct sk_buff *skb) struct ipcm_cookie ipc; struct flowi4 fl4; struct sock *sk; - struct inet_sock *inet; __be32 daddr, saddr; u32 mark = IP4_REPLY_MARK(net, skb->mark); int type = icmp_param->data.icmph.type; @@ -424,12 +423,11 @@ static void icmp_reply(struct icmp_bxm *icmp_param, struct sk_buff *skb) sk = icmp_xmit_lock(net); if (!sk) goto out_bh_enable; - inet = inet_sk(sk); icmp_param->data.icmph.checksum = 0; ipcm_init(&ipc); - inet->tos = ip_hdr(skb)->tos; + ipc.tos = ip_hdr(skb)->tos; ipc.sockc.mark = mark; daddr = ipc.addr = ip_hdr(skb)->saddr; saddr = fib_compute_spec_dst(skb); @@ -737,8 +735,8 @@ void __icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info, icmp_param.data.icmph.checksum = 0; icmp_param.skb = skb_in; icmp_param.offset = skb_network_offset(skb_in); - inet_sk(sk)->tos = tos; ipcm_init(&ipc); + ipc.tos = tos; ipc.addr = iph->saddr; ipc.opt = &icmp_param.replyopts.opt; ipc.sockc.mark = mark; -- 2.51.0 From 096208592b09c2f5fc0c1a174694efa41c04209d Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Fri, 14 Feb 2025 17:27:03 -0500 Subject: [PATCH 11/16] ipv6: replace ipcm6_init calls with ipcm6_init_sk This initializes tclass and dontfrag before cmsg parsing, removing the need for explicit checks against -1 in each caller. Leave hlimit set to -1, because its full initialization (in ip6_sk_dst_hoplimit) requires more state (dst, flowi6, ..). This also prepares for calling sockcm_init in a follow-on patch. Signed-off-by: Willem de Bruijn Reviewed-by: David Ahern Link: https://patch.msgid.link/20250214222720.3205500-7-willemdebruijn.kernel@gmail.com Signed-off-by: Jakub Kicinski --- include/net/ipv6.h | 9 --------- net/ipv6/raw.c | 8 +------- net/ipv6/udp.c | 7 +------ net/l2tp/l2tp_ip6.c | 8 +------- 4 files changed, 3 insertions(+), 29 deletions(-) diff --git a/include/net/ipv6.h b/include/net/ipv6.h index f5c43ad1565e..46a679d9b334 100644 --- a/include/net/ipv6.h +++ b/include/net/ipv6.h @@ -363,15 +363,6 @@ struct ipcm6_cookie { struct ipv6_txoptions *opt; }; -static inline void ipcm6_init(struct ipcm6_cookie *ipc6) -{ - *ipc6 = (struct ipcm6_cookie) { - .hlimit = -1, - .tclass = -1, - .dontfrag = -1, - }; -} - static inline void ipcm6_init_sk(struct ipcm6_cookie *ipc6, const struct sock *sk) { diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c index a45aba090aa4..ae68d3f7dd32 100644 --- a/net/ipv6/raw.c +++ b/net/ipv6/raw.c @@ -777,7 +777,7 @@ static int rawv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) fl6.flowi6_mark = READ_ONCE(sk->sk_mark); fl6.flowi6_uid = sk->sk_uid; - ipcm6_init(&ipc6); + ipcm6_init_sk(&ipc6, sk); ipc6.sockc.tsflags = READ_ONCE(sk->sk_tsflags); ipc6.sockc.mark = fl6.flowi6_mark; ipc6.sockc.priority = READ_ONCE(sk->sk_priority); @@ -891,9 +891,6 @@ static int rawv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) if (hdrincl) fl6.flowi6_flags |= FLOWI_FLAG_KNOWN_NH; - if (ipc6.tclass < 0) - ipc6.tclass = np->tclass; - fl6.flowlabel = ip6_make_flowinfo(ipc6.tclass, fl6.flowlabel); dst = ip6_dst_lookup_flow(sock_net(sk), sk, &fl6, final_p); @@ -904,9 +901,6 @@ static int rawv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) if (ipc6.hlimit < 0) ipc6.hlimit = ip6_sk_dst_hoplimit(np, &fl6, dst); - if (ipc6.dontfrag < 0) - ipc6.dontfrag = inet6_test_bit(DONTFRAG, sk); - if (msg->msg_flags&MSG_CONFIRM) goto do_confirm; diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index c6ea438b5c75..7096b7e84c10 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -1494,7 +1494,7 @@ int udpv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) int is_udplite = IS_UDPLITE(sk); int (*getfrag)(void *, char *, int, int, int, struct sk_buff *); - ipcm6_init(&ipc6); + ipcm6_init_sk(&ipc6, sk); ipc6.gso_size = READ_ONCE(up->gso_size); ipc6.sockc.tsflags = READ_ONCE(sk->sk_tsflags); ipc6.sockc.mark = READ_ONCE(sk->sk_mark); @@ -1704,9 +1704,6 @@ do_udp_sendmsg: security_sk_classify_flow(sk, flowi6_to_flowi_common(fl6)); - if (ipc6.tclass < 0) - ipc6.tclass = np->tclass; - fl6->flowlabel = ip6_make_flowinfo(ipc6.tclass, fl6->flowlabel); dst = ip6_sk_dst_lookup_flow(sk, fl6, final_p, connected); @@ -1752,8 +1749,6 @@ back_from_confirm: WRITE_ONCE(up->pending, AF_INET6); do_append_data: - if (ipc6.dontfrag < 0) - ipc6.dontfrag = inet6_test_bit(DONTFRAG, sk); up->len += ulen; err = ip6_append_data(sk, getfrag, msg, ulen, sizeof(struct udphdr), &ipc6, fl6, dst_rt6_info(dst), diff --git a/net/l2tp/l2tp_ip6.c b/net/l2tp/l2tp_ip6.c index f4c1da070826..b98d13584c81 100644 --- a/net/l2tp/l2tp_ip6.c +++ b/net/l2tp/l2tp_ip6.c @@ -547,7 +547,7 @@ static int l2tp_ip6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) fl6.flowi6_mark = READ_ONCE(sk->sk_mark); fl6.flowi6_uid = sk->sk_uid; - ipcm6_init(&ipc6); + ipcm6_init_sk(&ipc6, sk); if (lsa) { if (addr_len < SIN6_LEN_RFC2133) @@ -634,9 +634,6 @@ static int l2tp_ip6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) security_sk_classify_flow(sk, flowi6_to_flowi_common(&fl6)); - if (ipc6.tclass < 0) - ipc6.tclass = np->tclass; - fl6.flowlabel = ip6_make_flowinfo(ipc6.tclass, fl6.flowlabel); dst = ip6_dst_lookup_flow(sock_net(sk), sk, &fl6, final_p); @@ -648,9 +645,6 @@ static int l2tp_ip6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) if (ipc6.hlimit < 0) ipc6.hlimit = ip6_sk_dst_hoplimit(np, &fl6, dst); - if (ipc6.dontfrag < 0) - ipc6.dontfrag = inet6_test_bit(DONTFRAG, sk); - if (msg->msg_flags & MSG_CONFIRM) goto do_confirm; -- 2.51.0 From 5cd2f78886dd86de1b13d6502808a149f1b77959 Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Fri, 14 Feb 2025 17:27:04 -0500 Subject: [PATCH 12/16] ipv6: initialize inet socket cookies with sockcm_init Avoid open coding the same logic. Signed-off-by: Willem de Bruijn Reviewed-by: David Ahern Link: https://patch.msgid.link/20250214222720.3205500-8-willemdebruijn.kernel@gmail.com Signed-off-by: Jakub Kicinski --- include/net/ipv6.h | 2 ++ net/ipv6/ping.c | 3 --- net/ipv6/raw.c | 9 +++------ net/ipv6/udp.c | 3 --- 4 files changed, 5 insertions(+), 12 deletions(-) diff --git a/include/net/ipv6.h b/include/net/ipv6.h index 46a679d9b334..9614006f483c 100644 --- a/include/net/ipv6.h +++ b/include/net/ipv6.h @@ -371,6 +371,8 @@ static inline void ipcm6_init_sk(struct ipcm6_cookie *ipc6, .tclass = inet6_sk(sk)->tclass, .dontfrag = inet6_test_bit(DONTFRAG, sk), }; + + sockcm_init(&ipc6->sockc, sk); } static inline struct ipv6_txoptions *txopt_get(const struct ipv6_pinfo *np) diff --git a/net/ipv6/ping.c b/net/ipv6/ping.c index 46b8adf6e7f8..84d90dd8b3f0 100644 --- a/net/ipv6/ping.c +++ b/net/ipv6/ping.c @@ -119,9 +119,6 @@ static int ping_v6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) return -EINVAL; ipcm6_init_sk(&ipc6, sk); - ipc6.sockc.priority = READ_ONCE(sk->sk_priority); - ipc6.sockc.tsflags = READ_ONCE(sk->sk_tsflags); - ipc6.sockc.mark = READ_ONCE(sk->sk_mark); fl6.flowi6_oif = oif; diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c index ae68d3f7dd32..fda640ebd53f 100644 --- a/net/ipv6/raw.c +++ b/net/ipv6/raw.c @@ -769,19 +769,16 @@ static int rawv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) hdrincl = inet_test_bit(HDRINCL, sk); + ipcm6_init_sk(&ipc6, sk); + /* * Get and verify the address. */ memset(&fl6, 0, sizeof(fl6)); - fl6.flowi6_mark = READ_ONCE(sk->sk_mark); + fl6.flowi6_mark = ipc6.sockc.mark; fl6.flowi6_uid = sk->sk_uid; - ipcm6_init_sk(&ipc6, sk); - ipc6.sockc.tsflags = READ_ONCE(sk->sk_tsflags); - ipc6.sockc.mark = fl6.flowi6_mark; - ipc6.sockc.priority = READ_ONCE(sk->sk_priority); - if (sin6) { if (addr_len < SIN6_LEN_RFC2133) return -EINVAL; diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index 7096b7e84c10..3a0d6c5a8286 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -1496,9 +1496,6 @@ int udpv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) ipcm6_init_sk(&ipc6, sk); ipc6.gso_size = READ_ONCE(up->gso_size); - ipc6.sockc.tsflags = READ_ONCE(sk->sk_tsflags); - ipc6.sockc.mark = READ_ONCE(sk->sk_mark); - ipc6.sockc.priority = READ_ONCE(sk->sk_priority); /* destination address check */ if (sin6) { -- 2.51.0 From 27315836f4bcc8e4879d50dfc1fa6eb41e7952ef Mon Sep 17 00:00:00 2001 From: Shradha Gupta Date: Sun, 16 Feb 2025 19:42:26 -0800 Subject: [PATCH 13/16] net: mana: Allow tso_max_size to go up-to GSO_MAX_SIZE Allow the max aggregated pkt size to go up-to GSO_MAX_SIZE for MANA NIC. This patch only increases the max allowable gso/gro pkt size for MANA devices and does not change the defaults. Following are the perf benefits by increasing the pkt aggregate size from legacy gso_max_size value(64K) to newer one(up-to 511K IPv4 tests for i in {1..10}; do netperf -t TCP_RR -H 10.0.0.5 -p50000 -- -r80000,80000 -O MIN_LATENCY,P90_LATENCY,P99_LATENCY,THROUGHPUT|tail -1; done min p90 p99 Throughput gso_max_size 93 171 194 6594.25 97 154 180 7183.74 95 165 189 6927.86 96 165 188 6976.04 93 154 185 7338.05 64K 93 168 189 6938.03 94 169 189 6784.93 92 166 189 7117.56 94 179 191 6678.44 95 157 183 7277.81 min p90 p99 Throughput 93 134 146 8448.75 95 134 140 8396.54 94 137 148 8204.12 94 137 148 8244.41 94 128 139 8666.52 80K 94 141 153 8116.86 94 138 149 8163.92 92 135 142 8362.72 92 134 142 8497.57 93 136 148 8393.23 IPv6 Tests for i in {1..10}; do netperf -t TCP_RR -H fd00:9013:cadd::4 -p50000 -- -r80000,80000 -O MIN_LATENCY,P90_LATENCY,P99_LATENCY,THROUGHPUT|tail -1; done min p90 p99 Throughput gso_max_size 108 165 170 6673.2 101 169 189 6451.69 101 165 169 6737.65 102 167 175 6614.64 101 178 189 6247.13 64K 107 163 169 6678.63 106 176 187 6350.86 100 164 169 6617.36 102 163 170 6849.21 102 168 175 6605.7 min p90 p99 Throughput 108 155 166 7183 110 154 163 7268.87 109 152 159 7434.35 107 145 157 7569.15 107 149 164 7496.17 80K 110 154 159 7245.85 108 156 162 7266.24 109 145 158 7526.66 106 145 151 7785.75 111 148 157 7246.65 Tested on azure env with Accelerated Networking enabled and disabled. Signed-off-by: Shradha Gupta Reviewed-by: Haiyang Zhang Signed-off-by: David S. Miller --- drivers/net/ethernet/microsoft/mana/mana_en.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/drivers/net/ethernet/microsoft/mana/mana_en.c b/drivers/net/ethernet/microsoft/mana/mana_en.c index aa1e47233fe5..3b0fb4d95cf7 100644 --- a/drivers/net/ethernet/microsoft/mana/mana_en.c +++ b/drivers/net/ethernet/microsoft/mana/mana_en.c @@ -256,6 +256,9 @@ netdev_tx_t mana_start_xmit(struct sk_buff *skb, struct net_device *ndev) if (skb_cow_head(skb, MANA_HEADROOM)) goto tx_drop_count; + if (unlikely(ipv6_hopopt_jumbo_remove(skb))) + goto tx_drop_count; + txq = &apc->tx_qp[txq_idx].txq; gdma_sq = txq->gdma_sq; cq = &apc->tx_qp[txq_idx].tx_cq; @@ -2873,6 +2876,8 @@ static int mana_probe_port(struct mana_context *ac, int port_idx, ndev->dev_port = port_idx; SET_NETDEV_DEV(ndev, gc->dev); + netif_set_tso_max_size(ndev, GSO_MAX_SIZE); + netif_carrier_off(ndev); netdev_rss_key_fill(apc->hashkey, MANA_HASH_KEY_SIZE); -- 2.51.0 From 685920920e3d5f68a8c50107b97747b0f8ce050f Mon Sep 17 00:00:00 2001 From: Shradha Gupta Date: Sun, 16 Feb 2025 19:42:42 -0800 Subject: [PATCH 14/16] hv_netvsc: Use VF's tso_max_size value when data path is VF On Azure, increasing VF's gso/gro packet size to up-to GSO_MAX_SIZE is not possible without allowing the same for netvsc NIC (as the NICs are bonded together). For bonded NICs, the min of the max aggregated pkt size of the members is propagated in the stack. Therefore, we use netif_set_tso_max_size() to set max aggregated pkt size to VF's packet size for netvsc too, when the data path is switched over to the VF Tested on azure env with Accelerated Networking enabled and disabled. Signed-off-by: Shradha Gupta Reviewed-by: Haiyang Zhang Signed-off-by: David S. Miller --- drivers/net/hyperv/hyperv_net.h | 2 ++ drivers/net/hyperv/netvsc_drv.c | 15 +++++++++++++++ drivers/net/hyperv/rndis_filter.c | 13 +++++++------ 3 files changed, 24 insertions(+), 6 deletions(-) diff --git a/drivers/net/hyperv/hyperv_net.h b/drivers/net/hyperv/hyperv_net.h index 234db693cefa..70f7cb383228 100644 --- a/drivers/net/hyperv/hyperv_net.h +++ b/drivers/net/hyperv/hyperv_net.h @@ -1166,6 +1166,8 @@ struct netvsc_device { u32 max_chn; u32 num_chn; + u32 netvsc_gso_max_size; + atomic_t open_chn; struct work_struct subchan_work; wait_queue_head_t subchan_open; diff --git a/drivers/net/hyperv/netvsc_drv.c b/drivers/net/hyperv/netvsc_drv.c index d6c4abfc3a28..9c6501bf27bd 100644 --- a/drivers/net/hyperv/netvsc_drv.c +++ b/drivers/net/hyperv/netvsc_drv.c @@ -2461,6 +2461,21 @@ static int netvsc_vf_changed(struct net_device *vf_netdev, unsigned long event) } else { netdev_info(ndev, "Data path switched %s VF: %s\n", vf_is_up ? "to" : "from", vf_netdev->name); + + /* In Azure, when accelerated networking in enabled, other NICs + * like MANA, MLX, are configured as a bonded nic with + * Netvsc(failover) NIC. For bonded NICs, the min of the max + * pkt aggregate size of the members is propagated in the stack. + * In order to allow these NICs (MANA/MLX) to use up to + * GSO_MAX_SIZE gso packet size, we need to allow Netvsc NIC to + * also support this in the guest. + * This value is only increased for netvsc NIC when datapath is + * switched over to the VF + */ + if (vf_is_up) + netif_set_tso_max_size(ndev, vf_netdev->tso_max_size); + else + netif_set_tso_max_size(ndev, netvsc_dev->netvsc_gso_max_size); } return NOTIFY_OK; diff --git a/drivers/net/hyperv/rndis_filter.c b/drivers/net/hyperv/rndis_filter.c index c0ceeef4fcd8..82747dfacd70 100644 --- a/drivers/net/hyperv/rndis_filter.c +++ b/drivers/net/hyperv/rndis_filter.c @@ -1356,9 +1356,10 @@ static int rndis_netdev_set_hwcaps(struct rndis_device *rndis_device, struct net_device_context *net_device_ctx = netdev_priv(net); struct ndis_offload hwcaps; struct ndis_offload_params offloads; - unsigned int gso_max_size = GSO_LEGACY_MAX_SIZE; int ret; + nvdev->netvsc_gso_max_size = GSO_LEGACY_MAX_SIZE; + /* Find HW offload capabilities */ ret = rndis_query_hwcaps(rndis_device, nvdev, &hwcaps); if (ret != 0) @@ -1390,8 +1391,8 @@ static int rndis_netdev_set_hwcaps(struct rndis_device *rndis_device, offloads.lso_v2_ipv4 = NDIS_OFFLOAD_PARAMETERS_LSOV2_ENABLED; net->hw_features |= NETIF_F_TSO; - if (hwcaps.lsov2.ip4_maxsz < gso_max_size) - gso_max_size = hwcaps.lsov2.ip4_maxsz; + if (hwcaps.lsov2.ip4_maxsz < nvdev->netvsc_gso_max_size) + nvdev->netvsc_gso_max_size = hwcaps.lsov2.ip4_maxsz; } if (hwcaps.csum.ip4_txcsum & NDIS_TXCSUM_CAP_UDP4) { @@ -1411,8 +1412,8 @@ static int rndis_netdev_set_hwcaps(struct rndis_device *rndis_device, offloads.lso_v2_ipv6 = NDIS_OFFLOAD_PARAMETERS_LSOV2_ENABLED; net->hw_features |= NETIF_F_TSO6; - if (hwcaps.lsov2.ip6_maxsz < gso_max_size) - gso_max_size = hwcaps.lsov2.ip6_maxsz; + if (hwcaps.lsov2.ip6_maxsz < nvdev->netvsc_gso_max_size) + nvdev->netvsc_gso_max_size = hwcaps.lsov2.ip6_maxsz; } if (hwcaps.csum.ip6_txcsum & NDIS_TXCSUM_CAP_UDP6) { @@ -1438,7 +1439,7 @@ static int rndis_netdev_set_hwcaps(struct rndis_device *rndis_device, */ net->features &= ~NETVSC_SUPPORTED_HW_FEATURES | net->hw_features; - netif_set_tso_max_size(net, gso_max_size); + netif_set_tso_max_size(net, nvdev->netvsc_gso_max_size); ret = rndis_filter_set_offload_params(net, nvdev, &offloads); -- 2.51.0 From 9a369ae3d1431a83589dde57323a04692dd7fc12 Mon Sep 17 00:00:00 2001 From: Roger Quadros Date: Mon, 17 Feb 2025 09:31:46 +0200 Subject: [PATCH 15/16] net: ethernet: ti: am65-cpsw: remove am65_cpsw_nuss_tx_compl_packets_2g() The only difference between am65_cpsw_nuss_tx_compl_packets_2g() and am65_cpsw_nuss_tx_compl_packets() is the usage of spin_lock() and netdev_tx_completed_queue() + am65_cpsw_nuss_tx_wake at every packet in the latter. Insted of having 2 separate functions for TX completion, merge them into one. This will reduce code duplication and make maintenance easier. Signed-off-by: Roger Quadros Reviewed-by: Simon Horman Signed-off-by: David S. Miller --- drivers/net/ethernet/ti/am65-cpsw-nuss.c | 98 ++++++------------------ 1 file changed, 25 insertions(+), 73 deletions(-) diff --git a/drivers/net/ethernet/ti/am65-cpsw-nuss.c b/drivers/net/ethernet/ti/am65-cpsw-nuss.c index 2806238629f8..0ccb8dbcbba4 100644 --- a/drivers/net/ethernet/ti/am65-cpsw-nuss.c +++ b/drivers/net/ethernet/ti/am65-cpsw-nuss.c @@ -1504,6 +1504,7 @@ static void am65_cpsw_nuss_tx_wake(struct am65_cpsw_tx_chn *tx_chn, struct net_d static int am65_cpsw_nuss_tx_compl_packets(struct am65_cpsw_common *common, int chn, unsigned int budget, bool *tdown) { + bool single_port = AM65_CPSW_IS_CPSW2G(common); enum am65_cpsw_tx_buf_type buf_type; struct device *dev = common->dev; struct am65_cpsw_tx_chn *tx_chn; @@ -1511,6 +1512,7 @@ static int am65_cpsw_nuss_tx_compl_packets(struct am65_cpsw_common *common, unsigned int total_bytes = 0; struct net_device *ndev; struct xdp_frame *xdpf; + unsigned int pkt_len; struct sk_buff *skb; dma_addr_t desc_dma; int res, num_tx = 0; @@ -1518,9 +1520,12 @@ static int am65_cpsw_nuss_tx_compl_packets(struct am65_cpsw_common *common, tx_chn = &common->tx_chns[chn]; while (true) { - spin_lock(&tx_chn->lock); + if (!single_port) + spin_lock(&tx_chn->lock); res = k3_udma_glue_pop_tx_chn(tx_chn->tx_chn, &desc_dma); - spin_unlock(&tx_chn->lock); + if (!single_port) + spin_unlock(&tx_chn->lock); + if (res == -ENODATA) break; @@ -1535,23 +1540,35 @@ static int am65_cpsw_nuss_tx_compl_packets(struct am65_cpsw_common *common, if (buf_type == AM65_CPSW_TX_BUF_TYPE_SKB) { skb = am65_cpsw_nuss_tx_compl_packet_skb(tx_chn, desc_dma); ndev = skb->dev; - total_bytes = skb->len; + pkt_len = skb->len; napi_consume_skb(skb, budget); } else { xdpf = am65_cpsw_nuss_tx_compl_packet_xdp(common, tx_chn, desc_dma, &ndev); - total_bytes = xdpf->len; + pkt_len = xdpf->len; if (buf_type == AM65_CPSW_TX_BUF_TYPE_XDP_TX) xdp_return_frame_rx_napi(xdpf); else xdp_return_frame(xdpf); } + + total_bytes += pkt_len; num_tx++; - netif_txq = netdev_get_tx_queue(ndev, chn); + if (!single_port) { + /* as packets from multi ports can be interleaved + * on the same channel, we have to figure out the + * port/queue at every packet and report it/wake queue. + */ + netif_txq = netdev_get_tx_queue(ndev, chn); + netdev_tx_completed_queue(netif_txq, 1, pkt_len); + am65_cpsw_nuss_tx_wake(tx_chn, ndev, netif_txq); + } + } + if (single_port) { + netif_txq = netdev_get_tx_queue(ndev, chn); netdev_tx_completed_queue(netif_txq, num_tx, total_bytes); - am65_cpsw_nuss_tx_wake(tx_chn, ndev, netif_txq); } @@ -1560,66 +1577,6 @@ static int am65_cpsw_nuss_tx_compl_packets(struct am65_cpsw_common *common, return num_tx; } -static int am65_cpsw_nuss_tx_compl_packets_2g(struct am65_cpsw_common *common, - int chn, unsigned int budget, bool *tdown) -{ - enum am65_cpsw_tx_buf_type buf_type; - struct device *dev = common->dev; - struct am65_cpsw_tx_chn *tx_chn; - struct netdev_queue *netif_txq; - unsigned int total_bytes = 0; - struct net_device *ndev; - struct xdp_frame *xdpf; - struct sk_buff *skb; - dma_addr_t desc_dma; - int res, num_tx = 0; - - tx_chn = &common->tx_chns[chn]; - - while (true) { - res = k3_udma_glue_pop_tx_chn(tx_chn->tx_chn, &desc_dma); - if (res == -ENODATA) - break; - - if (cppi5_desc_is_tdcm(desc_dma)) { - if (atomic_dec_and_test(&common->tdown_cnt)) - complete(&common->tdown_complete); - *tdown = true; - break; - } - - buf_type = am65_cpsw_nuss_buf_type(tx_chn, desc_dma); - if (buf_type == AM65_CPSW_TX_BUF_TYPE_SKB) { - skb = am65_cpsw_nuss_tx_compl_packet_skb(tx_chn, desc_dma); - ndev = skb->dev; - total_bytes += skb->len; - napi_consume_skb(skb, budget); - } else { - xdpf = am65_cpsw_nuss_tx_compl_packet_xdp(common, tx_chn, - desc_dma, &ndev); - total_bytes += xdpf->len; - if (buf_type == AM65_CPSW_TX_BUF_TYPE_XDP_TX) - xdp_return_frame_rx_napi(xdpf); - else - xdp_return_frame(xdpf); - } - num_tx++; - } - - if (!num_tx) - return 0; - - netif_txq = netdev_get_tx_queue(ndev, chn); - - netdev_tx_completed_queue(netif_txq, num_tx, total_bytes); - - am65_cpsw_nuss_tx_wake(tx_chn, ndev, netif_txq); - - dev_dbg(dev, "%s:%u pkt:%d\n", __func__, chn, num_tx); - - return num_tx; -} - static enum hrtimer_restart am65_cpsw_nuss_tx_timer_callback(struct hrtimer *timer) { struct am65_cpsw_tx_chn *tx_chns = @@ -1635,13 +1592,8 @@ static int am65_cpsw_nuss_tx_poll(struct napi_struct *napi_tx, int budget) bool tdown = false; int num_tx; - if (AM65_CPSW_IS_CPSW2G(tx_chn->common)) - num_tx = am65_cpsw_nuss_tx_compl_packets_2g(tx_chn->common, tx_chn->id, - budget, &tdown); - else - num_tx = am65_cpsw_nuss_tx_compl_packets(tx_chn->common, - tx_chn->id, budget, &tdown); - + num_tx = am65_cpsw_nuss_tx_compl_packets(tx_chn->common, + tx_chn->id, budget, &tdown); if (num_tx >= budget) return budget; -- 2.51.0 From 1ae26bf6151706477fe2b4567be516f0173162fd Mon Sep 17 00:00:00 2001 From: Roger Quadros Date: Mon, 17 Feb 2025 09:31:47 +0200 Subject: [PATCH 16/16] net: ethernet: ti: am65_cpsw: remove cpu argument am65_cpsw_run_xdp am65_cpsw_run_xdp() can figure out the cpu id itself. No need to pass it around 2 functions so drop it. Signed-off-by: Roger Quadros Reviewed-by: Simon Horman Signed-off-by: David S. Miller --- drivers/net/ethernet/ti/am65-cpsw-nuss.c | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/drivers/net/ethernet/ti/am65-cpsw-nuss.c b/drivers/net/ethernet/ti/am65-cpsw-nuss.c index 0ccb8dbcbba4..134802007c93 100644 --- a/drivers/net/ethernet/ti/am65-cpsw-nuss.c +++ b/drivers/net/ethernet/ti/am65-cpsw-nuss.c @@ -1167,14 +1167,14 @@ pool_free: static int am65_cpsw_run_xdp(struct am65_cpsw_rx_flow *flow, struct am65_cpsw_port *port, - struct xdp_buff *xdp, - int cpu, int *len) + struct xdp_buff *xdp, int *len) { struct am65_cpsw_common *common = flow->common; struct net_device *ndev = port->ndev; int ret = AM65_CPSW_XDP_CONSUMED; struct am65_cpsw_tx_chn *tx_chn; struct netdev_queue *netif_txq; + int cpu = smp_processor_id(); struct xdp_frame *xdpf; struct bpf_prog *prog; struct page *page; @@ -1274,7 +1274,7 @@ static void am65_cpsw_nuss_rx_csum(struct sk_buff *skb, u32 csum_info) } static int am65_cpsw_nuss_rx_packets(struct am65_cpsw_rx_flow *flow, - int cpu, int *xdp_state) + int *xdp_state) { struct am65_cpsw_rx_chn *rx_chn = &flow->common->rx_chns; u32 buf_dma_len, pkt_len, port_id = 0, csum_info; @@ -1334,8 +1334,7 @@ static int am65_cpsw_nuss_rx_packets(struct am65_cpsw_rx_flow *flow, xdp_init_buff(&xdp, PAGE_SIZE, &port->xdp_rxq[flow->id]); xdp_prepare_buff(&xdp, page_addr, AM65_CPSW_HEADROOM, pkt_len, false); - *xdp_state = am65_cpsw_run_xdp(flow, port, &xdp, - cpu, &pkt_len); + *xdp_state = am65_cpsw_run_xdp(flow, port, &xdp, &pkt_len); if (*xdp_state != AM65_CPSW_XDP_PASS) goto allocate; @@ -1401,7 +1400,6 @@ static int am65_cpsw_nuss_rx_poll(struct napi_struct *napi_rx, int budget) { struct am65_cpsw_rx_flow *flow = am65_cpsw_napi_to_rx_flow(napi_rx); struct am65_cpsw_common *common = flow->common; - int cpu = smp_processor_id(); int xdp_state_or = 0; int cur_budget, ret; int xdp_state; @@ -1410,7 +1408,7 @@ static int am65_cpsw_nuss_rx_poll(struct napi_struct *napi_rx, int budget) /* process only this flow */ cur_budget = budget; while (cur_budget--) { - ret = am65_cpsw_nuss_rx_packets(flow, cpu, &xdp_state); + ret = am65_cpsw_nuss_rx_packets(flow, &xdp_state); xdp_state_or |= xdp_state; if (ret) break; -- 2.51.0