From a13fc7ebd9946974d4898187938c4f92b28dccbe Mon Sep 17 00:00:00 2001 From: Amit Cohen Date: Mon, 17 Mar 2025 18:37:27 +0100 Subject: [PATCH 01/16] mlxsw: spectrum: Call mlxsw_sp_bridge_vxlan_{join, leave}() for VLAN-aware bridge mlxsw_sp_bridge_vxlan_{join,leave}() are not called when a VXLAN device joins or leaves a VLAN-aware bridge. As mentioned in the comment - when the bridge is VLAN-aware, the VNI of the VXLAN device needs to be mapped to a VLAN, but at this point no VLANs are configured on the VxLAN device. This means that we can call the APIs, but there is no point to do that, as they do not configure anything in such cases. Next patch will extend mlxsw_sp_bridge_vxlan_{join,leave}() to set hardware domain for VXLAN, this should be done also when a VXLAN device joins or leaves a VLAN-aware bridge. Call the APIs, which for now do not do anything in these flows. Align the call to mlxsw_sp_bridge_vxlan_leave() to be called like mlxsw_sp_bridge_vxlan_join(), only in case that the VXLAN device is up, so move the check to be done before calling mlxsw_sp_bridge_vxlan_{join,leave}(). This does not change the existing behavior, as there is a similar check inside mlxsw_sp_bridge_vxlan_leave(). Signed-off-by: Amit Cohen Reviewed-by: Petr Machata Reviewed-by: Ido Schimmel Signed-off-by: Petr Machata Reviewed-by: Simon Horman Link: https://patch.msgid.link/994c1ea93520f9ea55d1011cd47dc2180d526484.1742224300.git.petrm@nvidia.com Signed-off-by: Jakub Kicinski --- .../net/ethernet/mellanox/mlxsw/spectrum.c | 20 ++++--------------- 1 file changed, 4 insertions(+), 16 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum.c index 2bc8a3dbc836..3080ea032e7f 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum.c +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum.c @@ -5230,25 +5230,13 @@ static int mlxsw_sp_netdevice_vxlan_event(struct mlxsw_sp *mlxsw_sp, return 0; if (!mlxsw_sp_bridge_vxlan_is_valid(upper_dev, extack)) return -EOPNOTSUPP; - if (cu_info->linking) { - if (!netif_running(dev)) - return 0; - /* When the bridge is VLAN-aware, the VNI of the VxLAN - * device needs to be mapped to a VLAN, but at this - * point no VLANs are configured on the VxLAN device - */ - if (br_vlan_enabled(upper_dev)) - return 0; + if (!netif_running(dev)) + return 0; + if (cu_info->linking) return mlxsw_sp_bridge_vxlan_join(mlxsw_sp, upper_dev, dev, 0, extack); - } else { - /* VLANs were already flushed, which triggered the - * necessary cleanup - */ - if (br_vlan_enabled(upper_dev)) - return 0; + else mlxsw_sp_bridge_vxlan_leave(mlxsw_sp, dev); - } break; case NETDEV_PRE_UP: upper_dev = netdev_master_upper_dev_get(dev); -- 2.50.1 From 413e2c069969bd311a4043f062a686eff9e7b0a0 Mon Sep 17 00:00:00 2001 From: Amit Cohen Date: Mon, 17 Mar 2025 18:37:28 +0100 Subject: [PATCH 02/16] mlxsw: spectrum_switchdev: Add an internal API for VXLAN leave There is asymmetry in how the VXLAN join and leave functions are used. The join function (mlxsw_sp_bridge_vxlan_join()) is only called in response to netdev events (e.g., VXLAN device joining a bridge), but the leave function is also called in response to switchdev events (e.g., VLAN configuration on top of the VXLAN device) in order to invalidate VNI to FID mappings. This asymmetry will cause problems when the functions will be later extended to mark VXLAN bridge ports as offloaded or not. Therefore, create an internal function (__mlxsw_sp_bridge_vxlan_leave()) that is used to invalidate VNI to FID mappings and call it from mlxsw_sp_bridge_vxlan_leave() which will only be invoked in response to netdev events, like mlxsw_sp_bridge_vxlan_join(). No functional changes intended. Signed-off-by: Amit Cohen Reviewed-by: Petr Machata Reviewed-by: Ido Schimmel Signed-off-by: Petr Machata Reviewed-by: Simon Horman Link: https://patch.msgid.link/f3a32bd2d87a0b7ac4d2bb98a427dc6d95a01cd0.1742224300.git.petrm@nvidia.com Signed-off-by: Jakub Kicinski --- .../ethernet/mellanox/mlxsw/spectrum_switchdev.c | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c index 6397ff0dc951..c95ef79eaf3d 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c @@ -2944,8 +2944,8 @@ int mlxsw_sp_bridge_vxlan_join(struct mlxsw_sp *mlxsw_sp, extack); } -void mlxsw_sp_bridge_vxlan_leave(struct mlxsw_sp *mlxsw_sp, - const struct net_device *vxlan_dev) +static void __mlxsw_sp_bridge_vxlan_leave(struct mlxsw_sp *mlxsw_sp, + const struct net_device *vxlan_dev) { struct vxlan_dev *vxlan = netdev_priv(vxlan_dev); struct mlxsw_sp_fid *fid; @@ -2963,6 +2963,12 @@ void mlxsw_sp_bridge_vxlan_leave(struct mlxsw_sp *mlxsw_sp, mlxsw_sp_fid_put(fid); } +void mlxsw_sp_bridge_vxlan_leave(struct mlxsw_sp *mlxsw_sp, + const struct net_device *vxlan_dev) +{ + __mlxsw_sp_bridge_vxlan_leave(mlxsw_sp, vxlan_dev); +} + static void mlxsw_sp_switchdev_vxlan_addr_convert(const union vxlan_addr *vxlan_addr, enum mlxsw_sp_l3proto *proto, @@ -3867,7 +3873,7 @@ mlxsw_sp_switchdev_vxlan_vlan_add(struct mlxsw_sp *mlxsw_sp, mlxsw_sp_fid_put(fid); return -EINVAL; } - mlxsw_sp_bridge_vxlan_leave(mlxsw_sp, vxlan_dev); + __mlxsw_sp_bridge_vxlan_leave(mlxsw_sp, vxlan_dev); mlxsw_sp_fid_put(fid); return 0; } @@ -3883,7 +3889,7 @@ mlxsw_sp_switchdev_vxlan_vlan_add(struct mlxsw_sp *mlxsw_sp, /* Fourth case: Thew new VLAN is PVID, which means the VLAN currently * mapped to the VNI should be unmapped */ - mlxsw_sp_bridge_vxlan_leave(mlxsw_sp, vxlan_dev); + __mlxsw_sp_bridge_vxlan_leave(mlxsw_sp, vxlan_dev); mlxsw_sp_fid_put(fid); /* Fifth case: The new VLAN is also egress untagged, which means the @@ -3923,7 +3929,7 @@ mlxsw_sp_switchdev_vxlan_vlan_del(struct mlxsw_sp *mlxsw_sp, if (mlxsw_sp_fid_8021q_vid(fid) != vid) goto out; - mlxsw_sp_bridge_vxlan_leave(mlxsw_sp, vxlan_dev); + __mlxsw_sp_bridge_vxlan_leave(mlxsw_sp, vxlan_dev); out: mlxsw_sp_fid_put(fid); -- 2.50.1 From 630e7e20d35f54a3b8c7c2965b4d9220f414fc2d Mon Sep 17 00:00:00 2001 From: Amit Cohen Date: Mon, 17 Mar 2025 18:37:29 +0100 Subject: [PATCH 03/16] mlxsw: spectrum_switchdev: Move mlxsw_sp_bridge_vxlan_join() Next patch will call __mlxsw_sp_bridge_vxlan_leave() from mlxsw_sp_bridge_vxlan_join() as part of error flow, move the function to be able to call the second one. Signed-off-by: Amit Cohen Reviewed-by: Petr Machata Reviewed-by: Ido Schimmel Signed-off-by: Petr Machata Reviewed-by: Simon Horman Link: https://patch.msgid.link/64750a0965536530482318578bada30fac372b8a.1742224300.git.petrm@nvidia.com Signed-off-by: Jakub Kicinski --- .../mellanox/mlxsw/spectrum_switchdev.c | 30 +++++++++---------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c index c95ef79eaf3d..13ad4e31d701 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c @@ -2929,21 +2929,6 @@ void mlxsw_sp_port_bridge_leave(struct mlxsw_sp_port *mlxsw_sp_port, mlxsw_sp_bridge_port_put(mlxsw_sp->bridge, bridge_port); } -int mlxsw_sp_bridge_vxlan_join(struct mlxsw_sp *mlxsw_sp, - const struct net_device *br_dev, - const struct net_device *vxlan_dev, u16 vid, - struct netlink_ext_ack *extack) -{ - struct mlxsw_sp_bridge_device *bridge_device; - - bridge_device = mlxsw_sp_bridge_device_find(mlxsw_sp->bridge, br_dev); - if (WARN_ON(!bridge_device)) - return -EINVAL; - - return bridge_device->ops->vxlan_join(bridge_device, vxlan_dev, vid, - extack); -} - static void __mlxsw_sp_bridge_vxlan_leave(struct mlxsw_sp *mlxsw_sp, const struct net_device *vxlan_dev) { @@ -2963,6 +2948,21 @@ static void __mlxsw_sp_bridge_vxlan_leave(struct mlxsw_sp *mlxsw_sp, mlxsw_sp_fid_put(fid); } +int mlxsw_sp_bridge_vxlan_join(struct mlxsw_sp *mlxsw_sp, + const struct net_device *br_dev, + const struct net_device *vxlan_dev, u16 vid, + struct netlink_ext_ack *extack) +{ + struct mlxsw_sp_bridge_device *bridge_device; + + bridge_device = mlxsw_sp_bridge_device_find(mlxsw_sp->bridge, br_dev); + if (WARN_ON(!bridge_device)) + return -EINVAL; + + return bridge_device->ops->vxlan_join(bridge_device, vxlan_dev, vid, + extack); +} + void mlxsw_sp_bridge_vxlan_leave(struct mlxsw_sp *mlxsw_sp, const struct net_device *vxlan_dev) { -- 2.50.1 From 139ae87714ebef5ba130175a19fd53fa0df69247 Mon Sep 17 00:00:00 2001 From: Amit Cohen Date: Mon, 17 Mar 2025 18:37:30 +0100 Subject: [PATCH 04/16] mlxsw: Add VXLAN bridge ports to same hardware domain as physical bridge ports When hardware floods packets to bridge ports, but flooding to VXLAN bridge port fails during encapsulation to one of the remote VTEPs, the packets are trapped to CPU. In such case, the packets are marked with skb->offload_fwd_mark, which means that packet was L2-forwarded in hardware. Software data path repeats flooding, but packets which are marked with skb->offload_fwd_mark will not be flooded by the bridge to bridge ports which are in the same hardware domain as the ingress port. Currently, mlxsw does not add VXLAN bridge ports to the same hardware domain as physical bridge ports despite the fact that the device is able to forward packets to and from VXLAN tunnels in hardware. In some scenarios (as mentioned above) this can result in remote VTEPs receiving duplicate packets. The packets are first flooded by hardware and after an encapsulation failure, they are flooded again to all remote VTEPs by software. Solve this by adding VXLAN bridge ports to the same hardware domain as physical bridge ports, so then nbp_switchdev_allowed_egress() will return false also for VXLAN, and packets will not be sent twice from VXLAN device. switchdev_bridge_port_offload() should get vxlan_dev not as const, so some changes are required. Call switchdev API from mlxsw_sp_bridge_vxlan_{join,leave}() which handle offload configurations. Reported-by: Vladimir Oltean Closes: https://lore.kernel.org/all/20250210152246.4ajumdchwhvbarik@skbuf/ Reported-by: Vladyslav Mykhaliuk Signed-off-by: Amit Cohen Reviewed-by: Petr Machata Reviewed-by: Ido Schimmel Signed-off-by: Petr Machata Reviewed-by: Simon Horman Link: https://patch.msgid.link/7279056843140fae3a72c2d204c7886b79d03899.1742224300.git.petrm@nvidia.com Signed-off-by: Jakub Kicinski --- .../net/ethernet/mellanox/mlxsw/spectrum.h | 4 +-- .../mellanox/mlxsw/spectrum_switchdev.c | 28 ++++++++++++++++--- 2 files changed, 26 insertions(+), 6 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum.h b/drivers/net/ethernet/mellanox/mlxsw/spectrum.h index fa7082ee5183..37cd1d002b3b 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum.h +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum.h @@ -661,10 +661,10 @@ bool mlxsw_sp_bridge_device_is_offloaded(const struct mlxsw_sp *mlxsw_sp, const struct net_device *br_dev); int mlxsw_sp_bridge_vxlan_join(struct mlxsw_sp *mlxsw_sp, const struct net_device *br_dev, - const struct net_device *vxlan_dev, u16 vid, + struct net_device *vxlan_dev, u16 vid, struct netlink_ext_ack *extack); void mlxsw_sp_bridge_vxlan_leave(struct mlxsw_sp *mlxsw_sp, - const struct net_device *vxlan_dev); + struct net_device *vxlan_dev); extern struct notifier_block mlxsw_sp_switchdev_notifier; /* spectrum.c */ diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c index 13ad4e31d701..a48bf342084d 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c @@ -2950,22 +2950,42 @@ static void __mlxsw_sp_bridge_vxlan_leave(struct mlxsw_sp *mlxsw_sp, int mlxsw_sp_bridge_vxlan_join(struct mlxsw_sp *mlxsw_sp, const struct net_device *br_dev, - const struct net_device *vxlan_dev, u16 vid, + struct net_device *vxlan_dev, u16 vid, struct netlink_ext_ack *extack) { struct mlxsw_sp_bridge_device *bridge_device; + struct mlxsw_sp_port *mlxsw_sp_port; + int err; bridge_device = mlxsw_sp_bridge_device_find(mlxsw_sp->bridge, br_dev); if (WARN_ON(!bridge_device)) return -EINVAL; - return bridge_device->ops->vxlan_join(bridge_device, vxlan_dev, vid, - extack); + mlxsw_sp_port = mlxsw_sp_port_dev_lower_find(bridge_device->dev); + if (!mlxsw_sp_port) + return -EINVAL; + + err = bridge_device->ops->vxlan_join(bridge_device, vxlan_dev, vid, + extack); + if (err) + return err; + + err = switchdev_bridge_port_offload(vxlan_dev, mlxsw_sp_port->dev, + NULL, NULL, NULL, false, extack); + if (err) + goto err_bridge_port_offload; + + return 0; + +err_bridge_port_offload: + __mlxsw_sp_bridge_vxlan_leave(mlxsw_sp, vxlan_dev); + return err; } void mlxsw_sp_bridge_vxlan_leave(struct mlxsw_sp *mlxsw_sp, - const struct net_device *vxlan_dev) + struct net_device *vxlan_dev) { + switchdev_bridge_port_unoffload(vxlan_dev, NULL, NULL, NULL); __mlxsw_sp_bridge_vxlan_leave(mlxsw_sp, vxlan_dev); } -- 2.50.1 From 36ed81bcade9b96e0c7be7cb6bdb5b6773a3ace0 Mon Sep 17 00:00:00 2001 From: Amit Cohen Date: Mon, 17 Mar 2025 18:37:31 +0100 Subject: [PATCH 05/16] selftests: vxlan_bridge: Test flood with unresolved FDB entry Extend flood test to configure FDB entry with unresolved destination IP, check that packets are not sent twice. Without the previous patch which handles such scenario in mlxsw, the tests fail: $ TESTS='test_flood' ./vxlan_bridge_1d.sh Running tests with UDP port 4789 TEST: VXLAN: flood [ OK ] TEST: VXLAN: flood, unresolved FDB entry [FAIL] vx2 ns2: Expected to capture 10 packets, got 20. $ TESTS='test_flood' ./vxlan_bridge_1q.sh INFO: Running tests with UDP port 4789 TEST: VXLAN: flood vlan 10 [ OK ] TEST: VXLAN: flood vlan 20 [ OK ] TEST: VXLAN: flood vlan 10, unresolved FDB entry [FAIL] vx10 ns2: Expected to capture 10 packets, got 20. TEST: VXLAN: flood vlan 20, unresolved FDB entry [FAIL] vx20 ns2: Expected to capture 10 packets, got 20. With the previous patch, the tests pass. Signed-off-by: Amit Cohen Reviewed-by: Petr Machata Reviewed-by: Ido Schimmel Signed-off-by: Petr Machata Reviewed-by: Simon Horman Link: https://patch.msgid.link/7bc96e317531f3bf06319fb2ea447bd8666f29fa.1742224300.git.petrm@nvidia.com Signed-off-by: Jakub Kicinski --- .../selftests/net/forwarding/vxlan_bridge_1d.sh | 8 ++++++++ .../selftests/net/forwarding/vxlan_bridge_1q.sh | 15 +++++++++++++++ 2 files changed, 23 insertions(+) diff --git a/tools/testing/selftests/net/forwarding/vxlan_bridge_1d.sh b/tools/testing/selftests/net/forwarding/vxlan_bridge_1d.sh index 180c5eca556f..b43816dd998c 100755 --- a/tools/testing/selftests/net/forwarding/vxlan_bridge_1d.sh +++ b/tools/testing/selftests/net/forwarding/vxlan_bridge_1d.sh @@ -428,6 +428,14 @@ __test_flood() test_flood() { __test_flood de:ad:be:ef:13:37 192.0.2.100 "flood" + + # Add an entry with arbitrary destination IP. Verify that packets are + # not duplicated (this can happen if hardware floods the packets, and + # then traps them due to misconfiguration, so software data path repeats + # flooding and resends packets). + bridge fdb append dev vx1 00:00:00:00:00:00 dst 198.51.100.1 self + __test_flood de:ad:be:ef:13:37 192.0.2.100 "flood, unresolved FDB entry" + bridge fdb del dev vx1 00:00:00:00:00:00 dst 198.51.100.1 self } vxlan_fdb_add_del() diff --git a/tools/testing/selftests/net/forwarding/vxlan_bridge_1q.sh b/tools/testing/selftests/net/forwarding/vxlan_bridge_1q.sh index fb9a34cb50c6..afc65647f673 100755 --- a/tools/testing/selftests/net/forwarding/vxlan_bridge_1q.sh +++ b/tools/testing/selftests/net/forwarding/vxlan_bridge_1q.sh @@ -539,6 +539,21 @@ test_flood() 10 10 0 10 0 __test_flood ca:fe:be:ef:13:37 198.51.100.100 20 "flood vlan 20" \ 10 0 10 0 10 + + # Add entries with arbitrary destination IP. Verify that packets are + # not duplicated (this can happen if hardware floods the packets, and + # then traps them due to misconfiguration, so software data path repeats + # flooding and resends packets). + bridge fdb append dev vx10 00:00:00:00:00:00 dst 203.0.113.1 self + bridge fdb append dev vx20 00:00:00:00:00:00 dst 203.0.113.2 self + + __test_flood de:ad:be:ef:13:37 192.0.2.100 10 \ + "flood vlan 10, unresolved FDB entry" 10 10 0 10 0 + __test_flood ca:fe:be:ef:13:37 198.51.100.100 20 \ + "flood vlan 20, unresolved FDB entry" 10 0 10 0 10 + + bridge fdb del dev vx20 00:00:00:00:00:00 dst 203.0.113.2 self + bridge fdb del dev vx10 00:00:00:00:00:00 dst 203.0.113.1 self } vxlan_fdb_add_del() -- 2.50.1 From f38805c5d26fe4af97837c10d58074a7496638bf Mon Sep 17 00:00:00 2001 From: Jason Xing Date: Mon, 17 Mar 2025 20:03:13 +0800 Subject: [PATCH 06/16] tcp: support TCP_RTO_MIN_US for set/getsockopt use Support adjusting/reading RTO MIN for socket level by using set/getsockopt(). This new option has the same effect as TCP_BPF_RTO_MIN, which means it doesn't affect RTAX_RTO_MIN usage (by using ip route...). Considering that bpf option was implemented before this patch, so we need to use a standalone new option for pure tcp set/getsockopt() use. When the socket is created, its icsk_rto_min is set to the default value that is controlled by sysctl_tcp_rto_min_us. Then if application calls setsockopt() with TCP_RTO_MIN_US flag to pass a valid value, then icsk_rto_min will be overridden in jiffies unit. This patch adds WRITE_ONCE/READ_ONCE to avoid data-race around icsk_rto_min. Signed-off-by: Jason Xing Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20250317120314.41404-2-kerneljasonxing@gmail.com Signed-off-by: Jakub Kicinski --- Documentation/networking/ip-sysctl.rst | 4 ++-- include/net/tcp.h | 2 +- include/uapi/linux/tcp.h | 1 + net/ipv4/tcp.c | 13 ++++++++++++- 4 files changed, 16 insertions(+), 4 deletions(-) diff --git a/Documentation/networking/ip-sysctl.rst b/Documentation/networking/ip-sysctl.rst index 054561f8dcae..5c63ab928b97 100644 --- a/Documentation/networking/ip-sysctl.rst +++ b/Documentation/networking/ip-sysctl.rst @@ -1229,8 +1229,8 @@ tcp_pingpong_thresh - INTEGER tcp_rto_min_us - INTEGER Minimal TCP retransmission timeout (in microseconds). Note that the rto_min route option has the highest precedence for configuring this - setting, followed by the TCP_BPF_RTO_MIN socket option, followed by - this tcp_rto_min_us sysctl. + setting, followed by the TCP_BPF_RTO_MIN and TCP_RTO_MIN_US socket + options, followed by this tcp_rto_min_us sysctl. The recommended practice is to use a value less or equal to 200000 microseconds. diff --git a/include/net/tcp.h b/include/net/tcp.h index f8efe56bbccb..4450c384ef17 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -844,7 +844,7 @@ u32 tcp_delack_max(const struct sock *sk); static inline u32 tcp_rto_min(const struct sock *sk) { const struct dst_entry *dst = __sk_dst_get(sk); - u32 rto_min = inet_csk(sk)->icsk_rto_min; + u32 rto_min = READ_ONCE(inet_csk(sk)->icsk_rto_min); if (dst && dst_metric_locked(dst, RTAX_RTO_MIN)) rto_min = dst_metric_rtt(dst, RTAX_RTO_MIN); diff --git a/include/uapi/linux/tcp.h b/include/uapi/linux/tcp.h index 92a2e79222ea..a126b3860e63 100644 --- a/include/uapi/linux/tcp.h +++ b/include/uapi/linux/tcp.h @@ -140,6 +140,7 @@ enum { #define TCP_IS_MPTCP 43 /* Is MPTCP being used? */ #define TCP_RTO_MAX_MS 44 /* max rto time in ms */ +#define TCP_RTO_MIN_US 45 /* min rto time in us */ #define TCP_REPAIR_ON 1 #define TCP_REPAIR_OFF 0 diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index fde56d28f586..a8a1370d642e 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -3352,7 +3352,7 @@ int tcp_disconnect(struct sock *sk, int flags) icsk->icsk_probes_out = 0; icsk->icsk_probes_tstamp = 0; icsk->icsk_rto = TCP_TIMEOUT_INIT; - icsk->icsk_rto_min = TCP_RTO_MIN; + WRITE_ONCE(icsk->icsk_rto_min, TCP_RTO_MIN); icsk->icsk_delack_max = TCP_DELACK_MAX; tp->snd_ssthresh = TCP_INFINITE_SSTHRESH; tcp_snd_cwnd_set(tp, TCP_INIT_CWND); @@ -3833,6 +3833,14 @@ int do_tcp_setsockopt(struct sock *sk, int level, int optname, return -EINVAL; WRITE_ONCE(inet_csk(sk)->icsk_rto_max, msecs_to_jiffies(val)); return 0; + case TCP_RTO_MIN_US: { + int rto_min = usecs_to_jiffies(val); + + if (rto_min > TCP_RTO_MIN || rto_min < TCP_TIMEOUT_MIN) + return -EINVAL; + WRITE_ONCE(inet_csk(sk)->icsk_rto_min, rto_min); + return 0; + } } sockopt_lock_sock(sk); @@ -4672,6 +4680,9 @@ zerocopy_rcv_out: case TCP_RTO_MAX_MS: val = jiffies_to_msecs(tcp_rto_max(sk)); break; + case TCP_RTO_MIN_US: + val = jiffies_to_usecs(READ_ONCE(inet_csk(sk)->icsk_rto_min)); + break; default: return -ENOPROTOOPT; } -- 2.50.1 From 9552f90835ef3552d0af327e48dc360717777d62 Mon Sep 17 00:00:00 2001 From: Jason Xing Date: Mon, 17 Mar 2025 20:03:14 +0800 Subject: [PATCH 07/16] tcp: support TCP_DELACK_MAX_US for set/getsockopt use Support adjusting/reading delayed ack max for socket level by using set/getsockopt(). This option aligns with TCP_BPF_DELACK_MAX usage. Considering that bpf option was implemented before this patch, so we need to use a standalone new option for pure tcp set/getsockopt() use. Add WRITE_ONCE/READ_ONCE() to prevent data-race if setsockopt() happens to write one value to icsk_delack_max while icsk_delack_max is being read. Signed-off-by: Jason Xing Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20250317120314.41404-3-kerneljasonxing@gmail.com Signed-off-by: Jakub Kicinski --- include/uapi/linux/tcp.h | 1 + net/ipv4/tcp.c | 13 ++++++++++++- net/ipv4/tcp_output.c | 2 +- 3 files changed, 14 insertions(+), 2 deletions(-) diff --git a/include/uapi/linux/tcp.h b/include/uapi/linux/tcp.h index a126b3860e63..dc8fdc80e16b 100644 --- a/include/uapi/linux/tcp.h +++ b/include/uapi/linux/tcp.h @@ -141,6 +141,7 @@ enum { #define TCP_IS_MPTCP 43 /* Is MPTCP being used? */ #define TCP_RTO_MAX_MS 44 /* max rto time in ms */ #define TCP_RTO_MIN_US 45 /* min rto time in us */ +#define TCP_DELACK_MAX_US 46 /* max delayed ack time in us */ #define TCP_REPAIR_ON 1 #define TCP_REPAIR_OFF 0 diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index a8a1370d642e..6edc441b3702 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -3353,7 +3353,7 @@ int tcp_disconnect(struct sock *sk, int flags) icsk->icsk_probes_tstamp = 0; icsk->icsk_rto = TCP_TIMEOUT_INIT; WRITE_ONCE(icsk->icsk_rto_min, TCP_RTO_MIN); - icsk->icsk_delack_max = TCP_DELACK_MAX; + WRITE_ONCE(icsk->icsk_delack_max, TCP_DELACK_MAX); tp->snd_ssthresh = TCP_INFINITE_SSTHRESH; tcp_snd_cwnd_set(tp, TCP_INIT_CWND); tp->snd_cwnd_cnt = 0; @@ -3841,6 +3841,14 @@ int do_tcp_setsockopt(struct sock *sk, int level, int optname, WRITE_ONCE(inet_csk(sk)->icsk_rto_min, rto_min); return 0; } + case TCP_DELACK_MAX_US: { + int delack_max = usecs_to_jiffies(val); + + if (delack_max > TCP_DELACK_MAX || delack_max < TCP_TIMEOUT_MIN) + return -EINVAL; + WRITE_ONCE(inet_csk(sk)->icsk_delack_max, delack_max); + return 0; + } } sockopt_lock_sock(sk); @@ -4683,6 +4691,9 @@ zerocopy_rcv_out: case TCP_RTO_MIN_US: val = jiffies_to_usecs(READ_ONCE(inet_csk(sk)->icsk_rto_min)); break; + case TCP_DELACK_MAX_US: + val = jiffies_to_usecs(READ_ONCE(inet_csk(sk)->icsk_delack_max)); + break; default: return -ENOPROTOOPT; } diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index e0a4e5432399..9a6061017114 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -4179,7 +4179,7 @@ u32 tcp_delack_max(const struct sock *sk) { u32 delack_from_rto_min = max(tcp_rto_min(sk), 2) - 1; - return min(inet_csk(sk)->icsk_delack_max, delack_from_rto_min); + return min(READ_ONCE(inet_csk(sk)->icsk_delack_max), delack_from_rto_min); } /* Send out a delayed ack, the caller does the policy checking -- 2.50.1 From f9af583a2c769962de599f64627829d5b7a8e5f2 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Mon, 17 Mar 2025 20:48:48 -0700 Subject: [PATCH 08/16] af_unix: Sort headers. This is a prep patch to make the following changes cleaner. No functional change intended. Signed-off-by: Kuniyuki Iwashima Reviewed-by: Joe Damato Link: https://patch.msgid.link/20250318034934.86708-2-kuniyu@amazon.com Signed-off-by: Jakub Kicinski --- include/net/af_unix.h | 4 +-- net/unix/af_unix.c | 62 +++++++++++++++++++------------------- net/unix/diag.c | 15 ++++----- net/unix/garbage.c | 17 +++++------ net/unix/sysctl_net_unix.c | 1 - net/unix/unix_bpf.c | 4 +-- 6 files changed, 51 insertions(+), 52 deletions(-) diff --git a/include/net/af_unix.h b/include/net/af_unix.h index 63129c79b8cb..c8dfdb41916d 100644 --- a/include/net/af_unix.h +++ b/include/net/af_unix.h @@ -2,10 +2,10 @@ #ifndef __LINUX_NET_AFUNIX_H #define __LINUX_NET_AFUNIX_H -#include -#include #include #include +#include +#include #include #if IS_ENABLED(CONFIG_UNIX) diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 7f8f3859cdb3..1ff0ac99f3f3 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -77,46 +77,46 @@ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt -#include -#include -#include -#include -#include -#include -#include +#include +#include #include -#include -#include -#include +#include #include +#include #include -#include -#include -#include -#include +#include #include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include +#include #include +#include +#include +#include +#include +#include +#include #include +#include #include -#include -#include +#include #include +#include +#include +#include +#include +#include +#include #include -#include -#include -#include -#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include static atomic_long_t unix_nr_socks; static struct hlist_head bsd_socket_buckets[UNIX_HASH_SIZE / 2]; diff --git a/net/unix/diag.c b/net/unix/diag.c index 9138af8b465e..ba715507556a 100644 --- a/net/unix/diag.c +++ b/net/unix/diag.c @@ -1,15 +1,16 @@ // SPDX-License-Identifier: GPL-2.0-only -#include -#include -#include -#include -#include + #include +#include +#include +#include +#include #include -#include +#include #include -#include +#include #include +#include static int sk_diag_dump_name(struct sock *sk, struct sk_buff *nlskb) { diff --git a/net/unix/garbage.c b/net/unix/garbage.c index 9848b7b78701..6a641d4b5542 100644 --- a/net/unix/garbage.c +++ b/net/unix/garbage.c @@ -63,22 +63,21 @@ * wrt receive and holding up unrelated socket operations. */ +#include +#include #include -#include -#include -#include +#include #include -#include -#include #include -#include #include -#include +#include +#include +#include +#include #include - -#include #include #include +#include #include struct unix_sock *unix_get_socket(struct file *filp) diff --git a/net/unix/sysctl_net_unix.c b/net/unix/sysctl_net_unix.c index 357b3e5f3847..55118ae897d6 100644 --- a/net/unix/sysctl_net_unix.c +++ b/net/unix/sysctl_net_unix.c @@ -8,7 +8,6 @@ #include #include #include - #include static struct ctl_table unix_table[] = { diff --git a/net/unix/unix_bpf.c b/net/unix/unix_bpf.c index bca2d86ba97d..86598a959eaa 100644 --- a/net/unix/unix_bpf.c +++ b/net/unix/unix_bpf.c @@ -1,10 +1,10 @@ // SPDX-License-Identifier: GPL-2.0 /* Copyright (c) 2021 Cong Wang */ -#include #include -#include +#include #include +#include #define unix_sk_has_data(__sk, __psock) \ ({ !skb_queue_empty(&__sk->sk_receive_queue) || \ -- 2.50.1 From 84960bf2403123a5a15711d050455e195a8d5ba3 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Mon, 17 Mar 2025 20:48:49 -0700 Subject: [PATCH 09/16] af_unix: Move internal definitions to net/unix/. net/af_unix.h is included by core and some LSMs, but most definitions need not be. Let's move struct unix_{vertex,edge} to net/unix/garbage.c and other definitions to net/unix/af_unix.h. Signed-off-by: Kuniyuki Iwashima Reviewed-by: Joe Damato Link: https://patch.msgid.link/20250318034934.86708-3-kuniyu@amazon.com Signed-off-by: Jakub Kicinski --- include/net/af_unix.h | 76 +------------------------------------- net/unix/af_unix.c | 2 + net/unix/af_unix.h | 75 +++++++++++++++++++++++++++++++++++++ net/unix/diag.c | 2 + net/unix/garbage.c | 18 +++++++++ net/unix/sysctl_net_unix.c | 2 + net/unix/unix_bpf.c | 2 + 7 files changed, 102 insertions(+), 75 deletions(-) create mode 100644 net/unix/af_unix.h diff --git a/include/net/af_unix.h b/include/net/af_unix.h index c8dfdb41916d..b5d70baba52b 100644 --- a/include/net/af_unix.h +++ b/include/net/af_unix.h @@ -17,61 +17,17 @@ static inline struct unix_sock *unix_get_socket(struct file *filp) } #endif -extern unsigned int unix_tot_inflight; -void unix_add_edges(struct scm_fp_list *fpl, struct unix_sock *receiver); -void unix_del_edges(struct scm_fp_list *fpl); -void unix_update_edges(struct unix_sock *receiver); -int unix_prepare_fpl(struct scm_fp_list *fpl); -void unix_destroy_fpl(struct scm_fp_list *fpl); -void unix_gc(void); -void wait_for_unix_gc(struct scm_fp_list *fpl); - -struct unix_vertex { - struct list_head edges; - struct list_head entry; - struct list_head scc_entry; - unsigned long out_degree; - unsigned long index; - unsigned long scc_index; -}; - -struct unix_edge { - struct unix_sock *predecessor; - struct unix_sock *successor; - struct list_head vertex_entry; - struct list_head stack_entry; -}; - -struct sock *unix_peer_get(struct sock *sk); - -#define UNIX_HASH_MOD (256 - 1) -#define UNIX_HASH_SIZE (256 * 2) -#define UNIX_HASH_BITS 8 - struct unix_address { refcount_t refcnt; int len; struct sockaddr_un name[]; }; -struct unix_skb_parms { - struct pid *pid; /* Skb credentials */ - kuid_t uid; - kgid_t gid; - struct scm_fp_list *fp; /* Passed files */ -#ifdef CONFIG_SECURITY_NETWORK - u32 secid; /* Security ID */ -#endif - u32 consumed; -} __randomize_layout; - struct scm_stat { atomic_t nr_fds; unsigned long nr_unix_fds; }; -#define UNIXCB(skb) (*(struct unix_skb_parms *)&((skb)->cb)) - /* The AF_UNIX socket */ struct unix_sock { /* WARNING: sk has to be the first member */ @@ -84,6 +40,7 @@ struct unix_sock { struct unix_vertex *vertex; spinlock_t lock; struct socket_wq peer_wq; +#define peer_wait peer_wq.wait wait_queue_entry_t peer_wake; struct scm_stat scm_stat; #if IS_ENABLED(CONFIG_AF_UNIX_OOB) @@ -94,35 +51,4 @@ struct unix_sock { #define unix_sk(ptr) container_of_const(ptr, struct unix_sock, sk) #define unix_peer(sk) (unix_sk(sk)->peer) -#define unix_state_lock(s) spin_lock(&unix_sk(s)->lock) -#define unix_state_unlock(s) spin_unlock(&unix_sk(s)->lock) - -#define peer_wait peer_wq.wait - -long unix_inq_len(struct sock *sk); -long unix_outq_len(struct sock *sk); - -int __unix_dgram_recvmsg(struct sock *sk, struct msghdr *msg, size_t size, - int flags); -int __unix_stream_recvmsg(struct sock *sk, struct msghdr *msg, size_t size, - int flags); -#ifdef CONFIG_SYSCTL -int unix_sysctl_register(struct net *net); -void unix_sysctl_unregister(struct net *net); -#else -static inline int unix_sysctl_register(struct net *net) { return 0; } -static inline void unix_sysctl_unregister(struct net *net) {} -#endif - -#ifdef CONFIG_BPF_SYSCALL -extern struct proto unix_dgram_proto; -extern struct proto unix_stream_proto; - -int unix_dgram_bpf_update_proto(struct sock *sk, struct sk_psock *psock, bool restore); -int unix_stream_bpf_update_proto(struct sock *sk, struct sk_psock *psock, bool restore); -void __init unix_bpf_build_proto(void); -#else -static inline void __init unix_bpf_build_proto(void) -{} -#endif #endif diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 1ff0ac99f3f3..6390e04fe916 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -118,6 +118,8 @@ #include #include +#include "af_unix.h" + static atomic_long_t unix_nr_socks; static struct hlist_head bsd_socket_buckets[UNIX_HASH_SIZE / 2]; static spinlock_t bsd_socket_locks[UNIX_HASH_SIZE / 2]; diff --git a/net/unix/af_unix.h b/net/unix/af_unix.h new file mode 100644 index 000000000000..ed4aedc42813 --- /dev/null +++ b/net/unix/af_unix.h @@ -0,0 +1,75 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __AF_UNIX_H +#define __AF_UNIX_H + +#include + +#define UNIX_HASH_MOD (256 - 1) +#define UNIX_HASH_SIZE (256 * 2) +#define UNIX_HASH_BITS 8 + +#define unix_state_lock(s) spin_lock(&unix_sk(s)->lock) +#define unix_state_unlock(s) spin_unlock(&unix_sk(s)->lock) + +struct sock *unix_peer_get(struct sock *sk); + +struct unix_skb_parms { + struct pid *pid; /* skb credentials */ + kuid_t uid; + kgid_t gid; + struct scm_fp_list *fp; /* Passed files */ +#ifdef CONFIG_SECURITY_NETWORK + u32 secid; /* Security ID */ +#endif + u32 consumed; +} __randomize_layout; + +#define UNIXCB(skb) (*(struct unix_skb_parms *)&((skb)->cb)) + +/* GC for SCM_RIGHTS */ +extern unsigned int unix_tot_inflight; +void unix_add_edges(struct scm_fp_list *fpl, struct unix_sock *receiver); +void unix_del_edges(struct scm_fp_list *fpl); +void unix_update_edges(struct unix_sock *receiver); +int unix_prepare_fpl(struct scm_fp_list *fpl); +void unix_destroy_fpl(struct scm_fp_list *fpl); +void unix_gc(void); +void wait_for_unix_gc(struct scm_fp_list *fpl); + +/* SOCK_DIAG */ +long unix_inq_len(struct sock *sk); +long unix_outq_len(struct sock *sk); + +/* sysctl */ +#ifdef CONFIG_SYSCTL +int unix_sysctl_register(struct net *net); +void unix_sysctl_unregister(struct net *net); +#else +static inline int unix_sysctl_register(struct net *net) +{ + return 0; +} + +static inline void unix_sysctl_unregister(struct net *net) +{ +} +#endif + +/* BPF SOCKMAP */ +int __unix_dgram_recvmsg(struct sock *sk, struct msghdr *msg, size_t size, int flags); +int __unix_stream_recvmsg(struct sock *sk, struct msghdr *msg, size_t size, int flags); + +#ifdef CONFIG_BPF_SYSCALL +extern struct proto unix_dgram_proto; +extern struct proto unix_stream_proto; + +int unix_dgram_bpf_update_proto(struct sock *sk, struct sk_psock *psock, bool restore); +int unix_stream_bpf_update_proto(struct sock *sk, struct sk_psock *psock, bool restore); +void __init unix_bpf_build_proto(void); +#else +static inline void __init unix_bpf_build_proto(void) +{ +} +#endif + +#endif diff --git a/net/unix/diag.c b/net/unix/diag.c index ba715507556a..c7e8c7d008f6 100644 --- a/net/unix/diag.c +++ b/net/unix/diag.c @@ -12,6 +12,8 @@ #include #include +#include "af_unix.h" + static int sk_diag_dump_name(struct sock *sk, struct sk_buff *nlskb) { /* might or might not have a hash table lock */ diff --git a/net/unix/garbage.c b/net/unix/garbage.c index 6a641d4b5542..8c8c7360349d 100644 --- a/net/unix/garbage.c +++ b/net/unix/garbage.c @@ -80,6 +80,24 @@ #include #include +#include "af_unix.h" + +struct unix_vertex { + struct list_head edges; + struct list_head entry; + struct list_head scc_entry; + unsigned long out_degree; + unsigned long index; + unsigned long scc_index; +}; + +struct unix_edge { + struct unix_sock *predecessor; + struct unix_sock *successor; + struct list_head vertex_entry; + struct list_head stack_entry; +}; + struct unix_sock *unix_get_socket(struct file *filp) { struct inode *inode = file_inode(filp); diff --git a/net/unix/sysctl_net_unix.c b/net/unix/sysctl_net_unix.c index 55118ae897d6..236b7faa9254 100644 --- a/net/unix/sysctl_net_unix.c +++ b/net/unix/sysctl_net_unix.c @@ -10,6 +10,8 @@ #include #include +#include "af_unix.h" + static struct ctl_table unix_table[] = { { .procname = "max_dgram_qlen", diff --git a/net/unix/unix_bpf.c b/net/unix/unix_bpf.c index 86598a959eaa..979dd4c4261a 100644 --- a/net/unix/unix_bpf.c +++ b/net/unix/unix_bpf.c @@ -6,6 +6,8 @@ #include #include +#include "af_unix.h" + #define unix_sk_has_data(__sk, __psock) \ ({ !skb_queue_empty(&__sk->sk_receive_queue) || \ !skb_queue_empty(&__psock->ingress_skb) || \ -- 2.50.1 From 3056172a261c88fc834147f6427b09dfe4d4290b Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Mon, 17 Mar 2025 20:48:50 -0700 Subject: [PATCH 10/16] af_unix: Explicitly include headers for non-pointer struct fields. include/net/af_unix.h indirectly includes some definitions for structs. Let's include such headers explicitly. linux/atomic.h : scm_stat.nr_fds linux/net.h : unix_sock.peer_wq linux/path.h : unix_sock.path linux/spinlock.h : unix_sock.lock linux/wait.h : unix_sock.peer_wake uapi/linux/un.h : unix_address.name[] linux/socket.h is removed as the structs there are not used directly, and linux/un.h is clarified with uapi as un.h only exists under include/uapi. While at it, duplicate headers are removed from .c files. Signed-off-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20250318034934.86708-4-kuniyu@amazon.com Signed-off-by: Jakub Kicinski --- include/net/af_unix.h | 8 ++++++-- net/unix/af_unix.c | 3 --- net/unix/diag.c | 3 --- net/unix/garbage.c | 5 ----- net/unix/unix_bpf.c | 1 - 5 files changed, 6 insertions(+), 14 deletions(-) diff --git a/include/net/af_unix.h b/include/net/af_unix.h index b5d70baba52b..b588069ece7e 100644 --- a/include/net/af_unix.h +++ b/include/net/af_unix.h @@ -2,11 +2,15 @@ #ifndef __LINUX_NET_AFUNIX_H #define __LINUX_NET_AFUNIX_H +#include #include +#include +#include #include -#include -#include +#include +#include #include +#include #if IS_ENABLED(CONFIG_UNIX) struct unix_sock *unix_get_socket(struct file *filp); diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 6390e04fe916..c081440cf576 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -92,7 +92,6 @@ #include #include #include -#include #include #include #include @@ -110,12 +109,10 @@ #include #include #include -#include #include #include #include #include -#include #include #include "af_unix.h" diff --git a/net/unix/diag.c b/net/unix/diag.c index c7e8c7d008f6..8b2247e05596 100644 --- a/net/unix/diag.c +++ b/net/unix/diag.c @@ -3,13 +3,10 @@ #include #include #include -#include #include -#include #include #include #include -#include #include #include "af_unix.h" diff --git a/net/unix/garbage.c b/net/unix/garbage.c index 8c8c7360349d..cd75502c47f1 100644 --- a/net/unix/garbage.c +++ b/net/unix/garbage.c @@ -66,18 +66,13 @@ #include #include #include -#include -#include #include #include #include #include #include -#include -#include #include #include -#include #include #include "af_unix.h" diff --git a/net/unix/unix_bpf.c b/net/unix/unix_bpf.c index 979dd4c4261a..e0d30d6d22ac 100644 --- a/net/unix/unix_bpf.c +++ b/net/unix/unix_bpf.c @@ -4,7 +4,6 @@ #include #include #include -#include #include "af_unix.h" -- 2.50.1 From 0083e3e37e07961465bfbc44e19035381eaeb574 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Mon, 17 Mar 2025 20:48:51 -0700 Subject: [PATCH 11/16] af_unix: Clean up #include under net/unix/. net/unix/*.c include many unnecessary header files (rtnetlink.h, netdevice.h, etc). Let's clean them up. af_unix.c: +uapi/linux/sockios.h : Only exist under include/uapi +uapi/linux/termios.h : Only exist under include/uapi -linux/freezer.h : No longer use freezable_schedule_timeout() -linux/in.h : No ipv4_is_XXX() etc -linux/module.h : No longer support CONFIG_UNIX=m -linux/netdevice.h : No dev used -linux/rtnetlink.h : Not part of rtnetlink API -linux/signal.h : signal_pending() is defined in sched/signal.h -linux/stat.h : No struct stat used -net/checksum.h : CHECKSUM_UNNECESSARY is defined in skbuff.h diag.c: +linux/dcache.h : struct dentry in sk_diag_dump_vfs() +linux/user_namespace.h : struct user_namespace in sk_diag_dump_uid() +uapi/linux/unix_diag.h : Only exist under include/uapi/ garbage.c: +linux/list.h : struct unix_{vertex,edge}, etc +linux/workqueue.h : DECLARE_WORK(unix_gc_work, ...) -linux/file.h : No fget() etc -linux/kernel.h : No cond_resched() etc -linux/netdevice.h : No dev used -linux/proc_fs.h : No procfs provided -linux/string.h : No memcpy(), kmemdup(), etc sysctl_net_unix.c: +linux/string.h : kmemdup() +net/net_namespace.h : struct net, net_eq() -linux/mm.h : slab.h is enough Signed-off-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20250318034934.86708-5-kuniyu@amazon.com Signed-off-by: Jakub Kicinski --- net/unix/af_unix.c | 12 ++---------- net/unix/diag.c | 4 +++- net/unix/garbage.c | 7 ++----- net/unix/sysctl_net_unix.c | 3 ++- 4 files changed, 9 insertions(+), 17 deletions(-) diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index c081440cf576..f78a2492826f 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -84,36 +84,28 @@ #include #include #include -#include #include -#include #include #include -#include #include #include -#include #include #include -#include #include #include #include -#include #include #include #include -#include #include -#include #include -#include #include #include -#include #include #include #include +#include +#include #include "af_unix.h" diff --git a/net/unix/diag.c b/net/unix/diag.c index 8b2247e05596..79b182d0e62a 100644 --- a/net/unix/diag.c +++ b/net/unix/diag.c @@ -1,13 +1,15 @@ // SPDX-License-Identifier: GPL-2.0-only +#include #include #include #include #include -#include +#include #include #include #include +#include #include "af_unix.h" diff --git a/net/unix/garbage.c b/net/unix/garbage.c index cd75502c47f1..01e2b9452c75 100644 --- a/net/unix/garbage.c +++ b/net/unix/garbage.c @@ -63,14 +63,11 @@ * wrt receive and holding up unrelated socket operations. */ -#include #include -#include -#include -#include +#include #include #include -#include +#include #include #include #include diff --git a/net/unix/sysctl_net_unix.c b/net/unix/sysctl_net_unix.c index 236b7faa9254..e02ed6e3955c 100644 --- a/net/unix/sysctl_net_unix.c +++ b/net/unix/sysctl_net_unix.c @@ -5,10 +5,11 @@ * Authors: Mike Shaver. */ -#include #include +#include #include #include +#include #include "af_unix.h" -- 2.50.1 From 6165feda3d8cd42663e50aa090e341e673990950 Mon Sep 17 00:00:00 2001 From: Simon Horman Date: Tue, 18 Mar 2025 13:53:34 +0000 Subject: [PATCH 12/16] net: tulip: avoid unused variable warning MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit There is an effort to achieve W=1 kernel builds without warnings. As part of that effort Helge Deller highlighted the following warnings in the tulip driver when compiling with W=1 and CONFIG_TULIP_MWI=n: .../tulip_core.c: In function ‘tulip_init_one’: .../tulip_core.c:1309:22: warning: variable ‘force_csr0’ set but not used This patch addresses that problem using IS_ENABLED(). This approach has the added benefit of reducing conditionally compiled code. And thus increasing compile coverage. E.g. for allmodconfig builds which enable CONFIG_TULIP_MWI. Compile tested only. No run-time effect intended. Acked-by: Helge Deller Signed-off-by: Simon Horman Link: https://patch.msgid.link/20250318-tulip-w1-v3-1-a813fadd164d@kernel.org Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/dec/tulip/tulip_core.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/drivers/net/ethernet/dec/tulip/tulip_core.c b/drivers/net/ethernet/dec/tulip/tulip_core.c index 27e01d780cd0..75eac18ff246 100644 --- a/drivers/net/ethernet/dec/tulip/tulip_core.c +++ b/drivers/net/ethernet/dec/tulip/tulip_core.c @@ -1177,7 +1177,6 @@ static void set_rx_mode(struct net_device *dev) iowrite32(csr6, ioaddr + CSR6); } -#ifdef CONFIG_TULIP_MWI static void tulip_mwi_config(struct pci_dev *pdev, struct net_device *dev) { struct tulip_private *tp = netdev_priv(dev); @@ -1251,7 +1250,6 @@ out: netdev_dbg(dev, "MWI config cacheline=%d, csr0=%08x\n", cache, csr0); } -#endif /* * Chips that have the MRM/reserved bit quirk and the burst quirk. That @@ -1463,10 +1461,9 @@ static int tulip_init_one(struct pci_dev *pdev, const struct pci_device_id *ent) INIT_WORK(&tp->media_work, tulip_tbl[tp->chip_id].media_task); -#ifdef CONFIG_TULIP_MWI - if (!force_csr0 && (tp->flags & HAS_PCI_MWI)) + if (IS_ENABLED(CONFIG_TULIP_MWI) && !force_csr0 && + (tp->flags & HAS_PCI_MWI)) tulip_mwi_config (pdev, dev); -#endif /* Stop the chip's Tx and Rx processes. */ tulip_stop_rxtx(tp); -- 2.50.1 From 07b2fbffaaea9474825fd5e08f81cd62b0378cf3 Mon Sep 17 00:00:00 2001 From: Lorenzo Bianconi Date: Tue, 18 Mar 2025 12:46:05 +0100 Subject: [PATCH 13/16] net: mvneta: Add metadata support for xdp mode Set metadata size building the skb from xdp_buff in mvneta driver mvneta sets xdp headroom to: MVNETA_MH_SIZE + MVNETA_SKB_HEADROOM where MVNETA_MH_SIZE 2 MVNETA_SKB_HEADROOM max(NET_SKB_PAD, XDP_PACKET_HEADROOM) so the headroom is large enough to contain xdp_frame and xdp metadata. Reviewed-by: Michal Kubiak Signed-off-by: Lorenzo Bianconi Link: https://patch.msgid.link/20250318-mvneta-xdp-meta-v2-1-b6075778f61f@kernel.org Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/marvell/mvneta.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/marvell/mvneta.c b/drivers/net/ethernet/marvell/mvneta.c index 44b18c573909..147571fdada3 100644 --- a/drivers/net/ethernet/marvell/mvneta.c +++ b/drivers/net/ethernet/marvell/mvneta.c @@ -2342,7 +2342,7 @@ mvneta_swbm_rx_frame(struct mvneta_port *pp, prefetch(data); xdp_buff_clear_frags_flag(xdp); xdp_prepare_buff(xdp, data, pp->rx_offset_correction + MVNETA_MH_SIZE, - data_len, false); + data_len, true); } static void @@ -2396,6 +2396,7 @@ mvneta_swbm_build_skb(struct mvneta_port *pp, struct page_pool *pool, struct xdp_buff *xdp, u32 desc_status) { struct skb_shared_info *sinfo = xdp_get_shared_info_from_buff(xdp); + u32 metasize = xdp->data - xdp->data_meta; struct sk_buff *skb; u8 num_frags; @@ -2410,6 +2411,8 @@ mvneta_swbm_build_skb(struct mvneta_port *pp, struct page_pool *pool, skb_reserve(skb, xdp->data - xdp->data_hard_start); skb_put(skb, xdp->data_end - xdp->data); + if (metasize) + skb_metadata_set(skb, metasize); skb->ip_summed = mvneta_rx_csum(pp, desc_status); if (unlikely(xdp_buff_has_frags(xdp))) -- 2.50.1 From 9a45e193c88a55a536d7fd0ebfa29823d588c2cf Mon Sep 17 00:00:00 2001 From: Lorenzo Bianconi Date: Tue, 18 Mar 2025 12:46:06 +0100 Subject: [PATCH 14/16] net: mvpp2: Add metadata support for xdp mode Set metadata size building the skb from xdp_buff in mvpp2 driver mvpp2 driver sets xdp headroom to: MVPP2_MH_SIZE + MVPP2_SKB_HEADROOM where MVPP2_MH_SIZE 2 MVPP2_SKB_HEADROOM min(max(XDP_PACKET_HEADROOM, NET_SKB_PAD), 224) so the headroom is large enough to contain xdp_frame and xdp metadata. Please note this patch is just compiled tested. Reviewed-by: Michal Kubiak Signed-off-by: Lorenzo Bianconi Link: https://patch.msgid.link/20250318-mvneta-xdp-meta-v2-2-b6075778f61f@kernel.org Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c b/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c index f166dc4e6503..54a235366a01 100644 --- a/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c +++ b/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c @@ -3915,13 +3915,13 @@ static int mvpp2_rx(struct mvpp2_port *port, struct napi_struct *napi, while (rx_done < rx_todo) { struct mvpp2_rx_desc *rx_desc = mvpp2_rxq_next_desc_get(rxq); + u32 rx_status, timestamp, metasize = 0; struct mvpp2_bm_pool *bm_pool; struct page_pool *pp = NULL; struct sk_buff *skb; unsigned int frag_size; dma_addr_t dma_addr; phys_addr_t phys_addr; - u32 rx_status, timestamp; int pool, rx_bytes, err, ret; struct page *page; void *data; @@ -3983,7 +3983,7 @@ static int mvpp2_rx(struct mvpp2_port *port, struct napi_struct *napi, xdp_init_buff(&xdp, PAGE_SIZE, xdp_rxq); xdp_prepare_buff(&xdp, data, MVPP2_MH_SIZE + MVPP2_SKB_HEADROOM, - rx_bytes, false); + rx_bytes, true); ret = mvpp2_run_xdp(port, xdp_prog, &xdp, pp, &ps); @@ -3999,6 +3999,8 @@ static int mvpp2_rx(struct mvpp2_port *port, struct napi_struct *napi, ps.rx_bytes += rx_bytes; continue; } + + metasize = xdp.data - xdp.data_meta; } if (frag_size) @@ -4038,6 +4040,8 @@ static int mvpp2_rx(struct mvpp2_port *port, struct napi_struct *napi, skb_reserve(skb, MVPP2_MH_SIZE + MVPP2_SKB_HEADROOM); skb_put(skb, rx_bytes); + if (metasize) + skb_metadata_set(skb, metasize); skb->ip_summed = mvpp2_rx_csum(port, rx_status); skb->protocol = eth_type_trans(skb, dev); -- 2.50.1 From a5fec3c881420249350be31b7ec99d4aac069b40 Mon Sep 17 00:00:00 2001 From: Lorenzo Bianconi Date: Tue, 18 Mar 2025 12:46:07 +0100 Subject: [PATCH 15/16] net: netsec: Add metadata support for xdp mode Set metadata size building the skb from xdp_buff in netsec driver. netsec driver sets xdp headroom to NETSEC_RXBUF_HEADROOM: NETSEC_RXBUF_HEADROOM max(XDP_PACKET_HEADROOM, NET_SKB_PAD) + NET_IP_ALIGN so the headroom is large enough to contain xdp_frame and xdp metadata. Please note this patch is just compiled tested. Acked-by: Ilias Apalodimas Reviewed-by: Michal Kubiak Signed-off-by: Lorenzo Bianconi Link: https://patch.msgid.link/20250318-mvneta-xdp-meta-v2-3-b6075778f61f@kernel.org Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/socionext/netsec.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/socionext/netsec.c b/drivers/net/ethernet/socionext/netsec.c index dc99821c6226..ee890de69ffe 100644 --- a/drivers/net/ethernet/socionext/netsec.c +++ b/drivers/net/ethernet/socionext/netsec.c @@ -970,7 +970,7 @@ static int netsec_process_rx(struct netsec_priv *priv, int budget) struct netsec_de *de = dring->vaddr + (DESC_SZ * idx); struct netsec_desc *desc = &dring->desc[idx]; struct page *page = virt_to_page(desc->addr); - u32 xdp_result = NETSEC_XDP_PASS; + u32 metasize, xdp_result = NETSEC_XDP_PASS; struct sk_buff *skb = NULL; u16 pkt_len, desc_len; dma_addr_t dma_handle; @@ -1019,7 +1019,7 @@ static int netsec_process_rx(struct netsec_priv *priv, int budget) prefetch(desc->addr); xdp_prepare_buff(&xdp, desc->addr, NETSEC_RXBUF_HEADROOM, - pkt_len, false); + pkt_len, true); if (xdp_prog) { xdp_result = netsec_run_xdp(priv, xdp_prog, &xdp); @@ -1048,6 +1048,9 @@ static int netsec_process_rx(struct netsec_priv *priv, int budget) skb_reserve(skb, xdp.data - xdp.data_hard_start); skb_put(skb, xdp.data_end - xdp.data); + metasize = xdp.data - xdp.data_meta; + if (metasize) + skb_metadata_set(skb, metasize); skb->protocol = eth_type_trans(skb, priv->ndev); if (priv->rx_cksum_offload_flag && -- 2.50.1 From 33bfff8fc8ba3833440228b93e36b9d0781e1b51 Mon Sep 17 00:00:00 2001 From: Lorenzo Bianconi Date: Tue, 18 Mar 2025 12:46:08 +0100 Subject: [PATCH 16/16] net: octeontx2: Add metadata support for xdp mode Set metadata size building the skb from xdp_buff in octeontx2 driver. octeontx2 driver sets xdp headroom to OTX2_HEAD_ROOM OTX2_HEAD_ROOM OTX2_ALIGN OTX2_ALIGN 128 so the headroom is large enough to contain xdp_frame and xdp metadata. Please note this patch is just compiled tested. Reviewed-by: Michal Kubiak Signed-off-by: Lorenzo Bianconi Link: https://patch.msgid.link/20250318-mvneta-xdp-meta-v2-4-b6075778f61f@kernel.org Signed-off-by: Jakub Kicinski --- .../net/ethernet/marvell/octeontx2/nic/otx2_txrx.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_txrx.c b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_txrx.c index 6bc5ce5a9f61..af8cabe828d0 100644 --- a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_txrx.c +++ b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_txrx.c @@ -41,7 +41,7 @@ static bool otx2_xdp_rcv_pkt_handler(struct otx2_nic *pfvf, struct bpf_prog *prog, struct nix_cqe_rx_s *cqe, struct otx2_cq_queue *cq, - bool *need_xdp_flush); + u32 *metasize, bool *need_xdp_flush); static void otx2_sq_set_sqe_base(struct otx2_snd_queue *sq, struct sk_buff *skb) @@ -336,6 +336,7 @@ static void otx2_rcv_pkt_handler(struct otx2_nic *pfvf, struct nix_rx_sg_s *sg = &cqe->sg; struct sk_buff *skb = NULL; void *end, *start; + u32 metasize = 0; u64 *seg_addr; u16 *seg_size; int seg; @@ -346,7 +347,8 @@ static void otx2_rcv_pkt_handler(struct otx2_nic *pfvf, } if (pfvf->xdp_prog) - if (otx2_xdp_rcv_pkt_handler(pfvf, pfvf->xdp_prog, cqe, cq, need_xdp_flush)) + if (otx2_xdp_rcv_pkt_handler(pfvf, pfvf->xdp_prog, cqe, cq, + &metasize, need_xdp_flush)) return; skb = napi_get_frags(napi); @@ -378,6 +380,8 @@ static void otx2_rcv_pkt_handler(struct otx2_nic *pfvf, skb->mark = parse->match_id; skb_mark_for_recycle(skb); + if (metasize) + skb_metadata_set(skb, metasize); napi_gro_frags(napi); } @@ -1482,7 +1486,7 @@ static bool otx2_xdp_rcv_pkt_handler(struct otx2_nic *pfvf, struct bpf_prog *prog, struct nix_cqe_rx_s *cqe, struct otx2_cq_queue *cq, - bool *need_xdp_flush) + u32 *metasize, bool *need_xdp_flush) { struct xdp_buff xdp, *xsk_buff = NULL; unsigned char *hard_start; @@ -1514,13 +1518,14 @@ static bool otx2_xdp_rcv_pkt_handler(struct otx2_nic *pfvf, hard_start = (unsigned char *)phys_to_virt(pa); xdp_prepare_buff(&xdp, hard_start, OTX2_HEAD_ROOM, - cqe->sg.seg_size, false); + cqe->sg.seg_size, true); act = bpf_prog_run_xdp(prog, &xdp); handle_xdp_verdict: switch (act) { case XDP_PASS: + *metasize = xdp.data - xdp.data_meta; break; case XDP_TX: qidx += pfvf->hw.tx_queues; -- 2.50.1