From ee512922ddd7d64afe2b28830a88f19063217649 Mon Sep 17 00:00:00 2001 From: Stefan Wahren Date: Wed, 30 Apr 2025 15:30:43 +0200 Subject: [PATCH 01/16] net: vertexcom: mse102x: Fix RX error handling In case the CMD_RTS got corrupted by interferences, the MSE102x doesn't allow a retransmission of the command. Instead the Ethernet frame must be shifted out of the SPI FIFO. Since the actual length is unknown, assume the maximum possible value. Fixes: 2f207cbf0dd4 ("net: vertexcom: Add MSE102x SPI support") Signed-off-by: Stefan Wahren Reviewed-by: Andrew Lunn Link: https://patch.msgid.link/20250430133043.7722-5-wahrenst@gmx.net Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/vertexcom/mse102x.c | 20 ++++++++++++++++---- 1 file changed, 16 insertions(+), 4 deletions(-) diff --git a/drivers/net/ethernet/vertexcom/mse102x.c b/drivers/net/ethernet/vertexcom/mse102x.c index 2c06d1d05164..e4d993f31374 100644 --- a/drivers/net/ethernet/vertexcom/mse102x.c +++ b/drivers/net/ethernet/vertexcom/mse102x.c @@ -263,7 +263,7 @@ static int mse102x_tx_frame_spi(struct mse102x_net *mse, struct sk_buff *txp, } static int mse102x_rx_frame_spi(struct mse102x_net *mse, u8 *buff, - unsigned int frame_len) + unsigned int frame_len, bool drop) { struct mse102x_net_spi *mses = to_mse102x_spi(mse); struct spi_transfer *xfer = &mses->spi_xfer; @@ -281,6 +281,9 @@ static int mse102x_rx_frame_spi(struct mse102x_net *mse, u8 *buff, netdev_err(mse->ndev, "%s: spi_sync() failed: %d\n", __func__, ret); mse->stats.xfer_err++; + } else if (drop) { + netdev_dbg(mse->ndev, "%s: Drop frame\n", __func__); + ret = -EINVAL; } else if (*sof != cpu_to_be16(DET_SOF)) { netdev_dbg(mse->ndev, "%s: SPI start of frame is invalid (0x%04x)\n", __func__, *sof); @@ -308,6 +311,7 @@ static void mse102x_rx_pkt_spi(struct mse102x_net *mse) struct sk_buff *skb; unsigned int rxalign; unsigned int rxlen; + bool drop = false; __be16 rx = 0; u16 cmd_resp; u8 *rxpkt; @@ -330,7 +334,8 @@ static void mse102x_rx_pkt_spi(struct mse102x_net *mse) net_dbg_ratelimited("%s: Unexpected response (0x%04x)\n", __func__, cmd_resp); mse->stats.invalid_rts++; - return; + drop = true; + goto drop; } net_dbg_ratelimited("%s: Unexpected response to first CMD\n", @@ -342,9 +347,16 @@ static void mse102x_rx_pkt_spi(struct mse102x_net *mse) net_dbg_ratelimited("%s: Invalid frame length: %d\n", __func__, rxlen); mse->stats.invalid_len++; - return; + drop = true; } + /* In case of a invalid CMD_RTS, the frame must be consumed anyway. + * So assume the maximum possible frame length. + */ +drop: + if (drop) + rxlen = VLAN_ETH_FRAME_LEN; + rxalign = ALIGN(rxlen + DET_SOF_LEN + DET_DFT_LEN, 4); skb = netdev_alloc_skb_ip_align(mse->ndev, rxalign); if (!skb) @@ -355,7 +367,7 @@ static void mse102x_rx_pkt_spi(struct mse102x_net *mse) * They are copied, but ignored. */ rxpkt = skb_put(skb, rxlen) - DET_SOF_LEN; - if (mse102x_rx_frame_spi(mse, rxpkt, rxlen)) { + if (mse102x_rx_frame_spi(mse, rxpkt, rxlen, drop)) { mse->ndev->stats.rx_errors++; dev_kfree_skb(skb); return; -- 2.50.1 From 3769478610135e82b262640252d90f6efb05be71 Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Mon, 28 Apr 2025 16:29:54 -0700 Subject: [PATCH 02/16] sch_htb: make htb_deactivate() idempotent Alan reported a NULL pointer dereference in htb_next_rb_node() after we made htb_qlen_notify() idempotent. It turns out in the following case it introduced some regression: htb_dequeue_tree(): |-> fq_codel_dequeue() |-> qdisc_tree_reduce_backlog() |-> htb_qlen_notify() |-> htb_deactivate() |-> htb_next_rb_node() |-> htb_deactivate() For htb_next_rb_node(), after calling the 1st htb_deactivate(), the clprio[prio]->ptr could be already set to NULL, which means htb_next_rb_node() is vulnerable here. For htb_deactivate(), although we checked qlen before calling it, in case of qlen==0 after qdisc_tree_reduce_backlog(), we may call it again which triggers the warning inside. To fix the issues here, we need to: 1) Make htb_deactivate() idempotent, that is, simply return if we already call it before. 2) Make htb_next_rb_node() safe against ptr==NULL. Many thanks to Alan for testing and for the reproducer. Fixes: 5ba8b837b522 ("sch_htb: make htb_qlen_notify() idempotent") Reported-by: Alan J. Wylie Signed-off-by: Cong Wang Link: https://patch.msgid.link/20250428232955.1740419-2-xiyou.wangcong@gmail.com Signed-off-by: Jakub Kicinski --- net/sched/sch_htb.c | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/net/sched/sch_htb.c b/net/sched/sch_htb.c index 4b9a639b642e..14bf71f57057 100644 --- a/net/sched/sch_htb.c +++ b/net/sched/sch_htb.c @@ -348,7 +348,8 @@ static void htb_add_to_wait_tree(struct htb_sched *q, */ static inline void htb_next_rb_node(struct rb_node **n) { - *n = rb_next(*n); + if (*n) + *n = rb_next(*n); } /** @@ -609,8 +610,8 @@ static inline void htb_activate(struct htb_sched *q, struct htb_class *cl) */ static inline void htb_deactivate(struct htb_sched *q, struct htb_class *cl) { - WARN_ON(!cl->prio_activity); - + if (!cl->prio_activity) + return; htb_deactivate_prios(q, cl); cl->prio_activity = 0; } @@ -1485,8 +1486,6 @@ static void htb_qlen_notify(struct Qdisc *sch, unsigned long arg) { struct htb_class *cl = (struct htb_class *)arg; - if (!cl->prio_activity) - return; htb_deactivate(qdisc_priv(sch), cl); } @@ -1740,8 +1739,7 @@ static int htb_delete(struct Qdisc *sch, unsigned long arg, if (cl->parent) cl->parent->children--; - if (cl->prio_activity) - htb_deactivate(q, cl); + htb_deactivate(q, cl); if (cl->cmode != HTB_CAN_SEND) htb_safe_rb_erase(&cl->pq_node, @@ -1949,8 +1947,7 @@ static int htb_change_class(struct Qdisc *sch, u32 classid, /* turn parent into inner node */ qdisc_purge_queue(parent->leaf.q); parent_qdisc = parent->leaf.q; - if (parent->prio_activity) - htb_deactivate(q, parent); + htb_deactivate(q, parent); /* remove from evt list because of level change */ if (parent->cmode != HTB_CAN_SEND) { -- 2.50.1 From 63890286f557aa5c4eed7e90a5a31658de8fdb4d Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Mon, 28 Apr 2025 16:29:55 -0700 Subject: [PATCH 03/16] selftests/tc-testing: Add a test case to cover basic HTB+FQ_CODEL case Integrate the reproducer from Alan into TC selftests and use scapy to generate TCP traffic instead of relying on ping command. Cc: Alan J. Wylie Signed-off-by: Cong Wang Link: https://patch.msgid.link/20250428232955.1740419-3-xiyou.wangcong@gmail.com Signed-off-by: Jakub Kicinski --- .../tc-testing/tc-tests/infra/qdiscs.json | 35 +++++++++++++++++++ 1 file changed, 35 insertions(+) diff --git a/tools/testing/selftests/tc-testing/tc-tests/infra/qdiscs.json b/tools/testing/selftests/tc-testing/tc-tests/infra/qdiscs.json index 0843f6d37e9c..a951c0d33cd2 100644 --- a/tools/testing/selftests/tc-testing/tc-tests/infra/qdiscs.json +++ b/tools/testing/selftests/tc-testing/tc-tests/infra/qdiscs.json @@ -538,5 +538,40 @@ "$TC qdisc del dev $DUMMY handle 1:0 root", "$IP addr del 10.10.10.10/24 dev $DUMMY || true" ] + }, + { + "id": "62c4", + "name": "Test HTB with FQ_CODEL - basic functionality", + "category": [ + "qdisc", + "htb", + "fq_codel" + ], + "plugins": { + "requires": [ + "nsPlugin", + "scapyPlugin" + ] + }, + "setup": [ + "$TC qdisc add dev $DEV1 root handle 1: htb default 11", + "$TC class add dev $DEV1 parent 1: classid 1:1 htb rate 10kbit", + "$TC class add dev $DEV1 parent 1:1 classid 1:11 htb rate 10kbit prio 0 quantum 1486", + "$TC qdisc add dev $DEV1 parent 1:11 fq_codel quantum 300 noecn", + "sleep 0.5" + ], + "scapy": { + "iface": "$DEV0", + "count": 5, + "packet": "Ether()/IP(dst='10.10.10.1', src='10.10.10.10')/TCP(sport=12345, dport=80)" + }, + "cmdUnderTest": "$TC -s qdisc show dev $DEV1", + "expExitCode": "0", + "verifyCmd": "$TC -s qdisc show dev $DEV1 | grep -A 5 'qdisc fq_codel'", + "matchPattern": "Sent [0-9]+ bytes [0-9]+ pkt", + "matchCount": "1", + "teardown": [ + "$TC qdisc del dev $DEV1 handle 1: root" + ] } ] -- 2.50.1 From 1e20324b23f0afba27997434fb978f1e4a1dbcb6 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 30 Apr 2025 09:37:58 -0700 Subject: [PATCH 04/16] virtio-net: don't re-enable refill work too early when NAPI is disabled Commit 4bc12818b363 ("virtio-net: disable delayed refill when pausing rx") fixed a deadlock between reconfig paths and refill work trying to disable the same NAPI instance. The refill work can't run in parallel with reconfig because trying to double-disable a NAPI instance causes a stall under the instance lock, which the reconfig path needs to re-enable the NAPI and therefore unblock the stalled thread. There are two cases where we re-enable refill too early. One is in the virtnet_set_queues() handler. We call it when installing XDP: virtnet_rx_pause_all(vi); ... virtnet_napi_tx_disable(..); ... virtnet_set_queues(..); ... virtnet_rx_resume_all(..); We want the work to be disabled until we call virtnet_rx_resume_all(), but virtnet_set_queues() kicks it before NAPIs were re-enabled. The other case is a more trivial case of mis-ordering in __virtnet_rx_resume() found by code inspection. Taking the spin lock in virtnet_set_queues() (requested during review) may be unnecessary as we are under rtnl_lock and so are all paths writing to ->refill_enabled. Acked-by: Michael S. Tsirkin Reviewed-by: Bui Quang Minh Fixes: 4bc12818b363 ("virtio-net: disable delayed refill when pausing rx") Fixes: 413f0271f396 ("net: protect NAPI enablement with netdev_lock()") Link: https://patch.msgid.link/20250430163758.3029367-1-kuba@kernel.org Signed-off-by: Jakub Kicinski --- drivers/net/virtio_net.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c index 848fab51dfa1..b5549d542c02 100644 --- a/drivers/net/virtio_net.c +++ b/drivers/net/virtio_net.c @@ -3383,12 +3383,15 @@ static void __virtnet_rx_resume(struct virtnet_info *vi, bool refill) { bool running = netif_running(vi->dev); + bool schedule_refill = false; if (refill && !try_fill_recv(vi, rq, GFP_KERNEL)) - schedule_delayed_work(&vi->refill, 0); - + schedule_refill = true; if (running) virtnet_napi_enable(rq); + + if (schedule_refill) + schedule_delayed_work(&vi->refill, 0); } static void virtnet_rx_resume_all(struct virtnet_info *vi) @@ -3728,8 +3731,10 @@ static int virtnet_set_queues(struct virtnet_info *vi, u16 queue_pairs) succ: vi->curr_queue_pairs = queue_pairs; /* virtnet_open() will refill when device is going to up. */ - if (dev->flags & IFF_UP) + spin_lock_bh(&vi->refill_lock); + if (dev->flags & IFF_UP && vi->refill_enabled) schedule_delayed_work(&vi->refill, 0); + spin_unlock_bh(&vi->refill_lock); return 0; } -- 2.50.1 From 4397684a292a71fbc1e815c3e283f7490ddce5ae Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 30 Apr 2025 09:38:36 -0700 Subject: [PATCH 05/16] virtio-net: free xsk_buffs on error in virtnet_xsk_pool_enable() The selftests added to our CI by Bui Quang Minh recently reveals that there is a mem leak on the error path of virtnet_xsk_pool_enable(): unreferenced object 0xffff88800a68a000 (size 2048): comm "xdp_helper", pid 318, jiffies 4294692778 hex dump (first 32 bytes): 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ backtrace (crc 0): __kvmalloc_node_noprof+0x402/0x570 virtnet_xsk_pool_enable+0x293/0x6a0 (drivers/net/virtio_net.c:5882) xp_assign_dev+0x369/0x670 (net/xdp/xsk_buff_pool.c:226) xsk_bind+0x6a5/0x1ae0 __sys_bind+0x15e/0x230 __x64_sys_bind+0x72/0xb0 do_syscall_64+0xc1/0x1d0 entry_SYSCALL_64_after_hwframe+0x77/0x7f Acked-by: Jason Wang Fixes: e9f3962441c0 ("virtio_net: xsk: rx: support fill with xsk buffer") Link: https://patch.msgid.link/20250430163836.3029761-1-kuba@kernel.org Signed-off-by: Jakub Kicinski --- drivers/net/virtio_net.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c index b5549d542c02..f9e3e628ec4d 100644 --- a/drivers/net/virtio_net.c +++ b/drivers/net/virtio_net.c @@ -5890,8 +5890,10 @@ static int virtnet_xsk_pool_enable(struct net_device *dev, hdr_dma = virtqueue_dma_map_single_attrs(sq->vq, &xsk_hdr, vi->hdr_len, DMA_TO_DEVICE, 0); - if (virtqueue_dma_mapping_error(sq->vq, hdr_dma)) - return -ENOMEM; + if (virtqueue_dma_mapping_error(sq->vq, hdr_dma)) { + err = -ENOMEM; + goto err_free_buffs; + } err = xsk_pool_dma_map(pool, dma_dev, 0); if (err) @@ -5919,6 +5921,8 @@ err_rq: err_xsk_map: virtqueue_dma_unmap_single_attrs(rq->vq, hdr_dma, vi->hdr_len, DMA_TO_DEVICE, 0); +err_free_buffs: + kvfree(rq->xsk_buffs); return err; } -- 2.50.1 From c360eb0c3ccb95306704fd221442283ee82f1f58 Mon Sep 17 00:00:00 2001 From: Andrew Lunn Date: Wed, 30 Apr 2025 11:21:35 -0500 Subject: [PATCH 06/16] dt-bindings: net: ethernet-controller: Add informative text about RGMII delays Device Tree and Ethernet MAC driver writers often misunderstand RGMII delays. Rewrite the Normative section in terms of the PCB, is the PCB adding the 2ns delay. This meaning was previous implied by the definition, but often wrongly interpreted due to the ambiguous wording and looking at the definition from the wrong perspective. The new definition concentrates clearly on the hardware, and should be less ambiguous. Add an Informative section to the end of the binding describing in detail what the four RGMII delays mean. This expands on just the PCB meaning, adding in the implications for the MAC and PHY. Additionally, when the MAC or PHY needs to add a delay, which is software configuration, describe how Linux does this, in the hope of reducing errors. Make it clear other users of device tree binding may implement the software configuration in other ways while still conforming to the binding. Fixes: 9d3de3c58347 ("dt-bindings: net: Add YAML schemas for the generic Ethernet options") Signed-off-by: Andrew Lunn Acked-by: Conor Dooley Link: https://patch.msgid.link/20250430-v6-15-rc3-net-rgmii-delays-v2-1-099ae651d5e5@lunn.ch Signed-off-by: Jakub Kicinski --- .../bindings/net/ethernet-controller.yaml | 97 +++++++++++++++++-- 1 file changed, 90 insertions(+), 7 deletions(-) diff --git a/Documentation/devicetree/bindings/net/ethernet-controller.yaml b/Documentation/devicetree/bindings/net/ethernet-controller.yaml index 45819b235800..a2d4c626f659 100644 --- a/Documentation/devicetree/bindings/net/ethernet-controller.yaml +++ b/Documentation/devicetree/bindings/net/ethernet-controller.yaml @@ -74,19 +74,17 @@ properties: - rev-rmii - moca - # RX and TX delays are added by the MAC when required + # RX and TX delays are provided by the PCB. See below - rgmii - # RGMII with internal RX and TX delays provided by the PHY, - # the MAC should not add the RX or TX delays in this case + # RX and TX delays are not provided by the PCB. This is the most + # frequent case. See below - rgmii-id - # RGMII with internal RX delay provided by the PHY, the MAC - # should not add an RX delay in this case + # TX delay is provided by the PCB. See below - rgmii-rxid - # RGMII with internal TX delay provided by the PHY, the MAC - # should not add an TX delay in this case + # RX delay is provided by the PCB. See below - rgmii-txid - rtbi - smii @@ -286,4 +284,89 @@ allOf: additionalProperties: true +# Informative +# =========== +# +# 'phy-modes' & 'phy-connection-type' properties 'rgmii', 'rgmii-id', +# 'rgmii-rxid', and 'rgmii-txid' are frequently used wrongly by +# developers. This informative section clarifies their usage. +# +# The RGMII specification requires a 2ns delay between the data and +# clock signals on the RGMII bus. How this delay is implemented is not +# specified. +# +# One option is to make the clock traces on the PCB longer than the +# data traces. A sufficiently difference in length can provide the 2ns +# delay. If both the RX and TX delays are implemented in this manner, +# 'rgmii' should be used, so indicating the PCB adds the delays. +# +# If the PCB does not add these delays via extra long traces, +# 'rgmii-id' should be used. Here, 'id' refers to 'internal delay', +# where either the MAC or PHY adds the delay. +# +# If only one of the two delays are implemented via extra long clock +# lines, either 'rgmii-rxid' or 'rgmii-txid' should be used, +# indicating the MAC or PHY should implement one of the delays +# internally, while the PCB implements the other delay. +# +# Device Tree describes hardware, and in this case, it describes the +# PCB between the MAC and the PHY, if the PCB implements delays or +# not. +# +# In practice, very few PCBs make use of extra long clock lines. Hence +# any RGMII phy mode other than 'rgmii-id' is probably wrong, and is +# unlikely to be accepted during review without details provided in +# the commit description and comments in the .dts file. +# +# When the PCB does not implement the delays, the MAC or PHY must. As +# such, this is software configuration, and so not described in Device +# Tree. +# +# The following describes how Linux implements the configuration of +# the MAC and PHY to add these delays when the PCB does not. As stated +# above, developers often get this wrong, and the aim of this section +# is reduce the frequency of these errors by Linux developers. Other +# users of the Device Tree may implement it differently, and still be +# consistent with both the normative and informative description +# above. +# +# By default in Linux, when using phylib/phylink, the MAC is expected +# to read the 'phy-mode' from Device Tree, not implement any delays, +# and pass the value to the PHY. The PHY will then implement delays as +# specified by the 'phy-mode'. The PHY should always be reconfigured +# to implement the needed delays, replacing any setting performed by +# strapping or the bootloader, etc. +# +# Experience to date is that all PHYs which implement RGMII also +# implement the ability to add or not add the needed delays. Hence +# this default is expected to work in all cases. Ignoring this default +# is likely to be questioned by Reviews, and require a strong argument +# to be accepted. +# +# There are a small number of cases where the MAC has hard coded +# delays which cannot be disabled. The 'phy-mode' only describes the +# PCB. The inability to disable the delays in the MAC does not change +# the meaning of 'phy-mode'. It does however mean that a 'phy-mode' of +# 'rgmii' is now invalid, it cannot be supported, since both the PCB +# and the MAC and PHY adding delays cannot result in a functional +# link. Thus the MAC should report a fatal error for any modes which +# cannot be supported. When the MAC implements the delay, it must +# ensure that the PHY does not also implement the same delay. So it +# must modify the phy-mode it passes to the PHY, removing the delay it +# has added. Failure to remove the delay will result in a +# non-functioning link. +# +# Sometimes there is a need to fine tune the delays. Often the MAC or +# PHY can perform this fine tuning. In the MAC node, the Device Tree +# properties 'rx-internal-delay-ps' and 'tx-internal-delay-ps' should +# be used to indicate fine tuning performed by the MAC. The values +# expected here are small. A value of 2000ps, i.e 2ns, and a phy-mode +# of 'rgmii' will not be accepted by Reviewers. +# +# If the PHY is to perform fine tuning, the properties +# 'rx-internal-delay-ps' and 'tx-internal-delay-ps' in the PHY node +# should be used. When the PHY is implementing delays, e.g. 'rgmii-id' +# these properties should have a value near to 2000ps. If the PCB is +# implementing delays, e.g. 'rgmii', a small value can be used to fine +# tune the delay added by the PCB. ... -- 2.50.1 From 3e6a0243ff002ddbd7ee18a8974ae61d2e6ed00d Mon Sep 17 00:00:00 2001 From: Guillaume Nault Date: Sat, 3 May 2025 00:57:52 +0200 Subject: [PATCH 07/16] gre: Fix again IPv6 link-local address generation. Use addrconf_addr_gen() to generate IPv6 link-local addresses on GRE devices in most cases and fall back to using add_v4_addrs() only in case the GRE configuration is incompatible with addrconf_addr_gen(). GRE used to use addrconf_addr_gen() until commit e5dd729460ca ("ip/ip6_gre: use the same logic as SIT interfaces when computing v6LL address") restricted this use to gretap and ip6gretap devices, and created add_v4_addrs() (borrowed from SIT) for non-Ethernet GRE ones. The original problem came when commit 9af28511be10 ("addrconf: refuse isatap eui64 for INADDR_ANY") made __ipv6_isatap_ifid() fail when its addr parameter was 0. The commit says that this would create an invalid address, however, I couldn't find any RFC saying that the generated interface identifier would be wrong. Anyway, since gre over IPv4 devices pass their local tunnel address to __ipv6_isatap_ifid(), that commit broke their IPv6 link-local address generation when the local address was unspecified. Then commit e5dd729460ca ("ip/ip6_gre: use the same logic as SIT interfaces when computing v6LL address") tried to fix that case by defining add_v4_addrs() and calling it to generate the IPv6 link-local address instead of using addrconf_addr_gen() (apart for gretap and ip6gretap devices, which would still use the regular addrconf_addr_gen(), since they have a MAC address). That broke several use cases because add_v4_addrs() isn't properly integrated into the rest of IPv6 Neighbor Discovery code. Several of these shortcomings have been fixed over time, but add_v4_addrs() remains broken on several aspects. In particular, it doesn't send any Router Sollicitations, so the SLAAC process doesn't start until the interface receives a Router Advertisement. Also, add_v4_addrs() mostly ignores the address generation mode of the interface (/proc/sys/net/ipv6/conf/*/addr_gen_mode), thus breaking the IN6_ADDR_GEN_MODE_RANDOM and IN6_ADDR_GEN_MODE_STABLE_PRIVACY cases. Fix the situation by using add_v4_addrs() only in the specific scenario where the normal method would fail. That is, for interfaces that have all of the following characteristics: * run over IPv4, * transport IP packets directly, not Ethernet (that is, not gretap interfaces), * tunnel endpoint is INADDR_ANY (that is, 0), * device address generation mode is EUI64. In all other cases, revert back to the regular addrconf_addr_gen(). Also, remove the special case for ip6gre interfaces in add_v4_addrs(), since ip6gre devices now always use addrconf_addr_gen() instead. Note: This patch was originally applied as commit 183185a18ff9 ("gre: Fix IPv6 link-local address generation."). However, it was then reverted by commit fc486c2d060f ("Revert "gre: Fix IPv6 link-local address generation."") because it uncovered another bug that ended up breaking net/forwarding/ip6gre_custom_multipath_hash.sh. That other bug has now been fixed by commit 4d0ab3a6885e ("ipv6: Start path selection from the first nexthop"). Therefore we can now revive this GRE patch (no changes since original commit 183185a18ff9 ("gre: Fix IPv6 link-local address generation."). Fixes: e5dd729460ca ("ip/ip6_gre: use the same logic as SIT interfaces when computing v6LL address") Signed-off-by: Guillaume Nault Reviewed-by: Ido Schimmel Link: https://patch.msgid.link/a88cc5c4811af36007645d610c95102dccb360a6.1746225214.git.gnault@redhat.com Signed-off-by: Jakub Kicinski --- net/ipv6/addrconf.c | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 9ba83f0c9928..c6b22170dc49 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -3214,16 +3214,13 @@ static void add_v4_addrs(struct inet6_dev *idev) struct in6_addr addr; struct net_device *dev; struct net *net = dev_net(idev->dev); - int scope, plen, offset = 0; + int scope, plen; u32 pflags = 0; ASSERT_RTNL(); memset(&addr, 0, sizeof(struct in6_addr)); - /* in case of IP6GRE the dev_addr is an IPv6 and therefore we use only the last 4 bytes */ - if (idev->dev->addr_len == sizeof(struct in6_addr)) - offset = sizeof(struct in6_addr) - 4; - memcpy(&addr.s6_addr32[3], idev->dev->dev_addr + offset, 4); + memcpy(&addr.s6_addr32[3], idev->dev->dev_addr, 4); if (!(idev->dev->flags & IFF_POINTOPOINT) && idev->dev->type == ARPHRD_SIT) { scope = IPV6_ADDR_COMPATv4; @@ -3534,7 +3531,13 @@ static void addrconf_gre_config(struct net_device *dev) return; } - if (dev->type == ARPHRD_ETHER) { + /* Generate the IPv6 link-local address using addrconf_addr_gen(), + * unless we have an IPv4 GRE device not bound to an IP address and + * which is in EUI64 mode (as __ipv6_isatap_ifid() would fail in this + * case). Such devices fall back to add_v4_addrs() instead. + */ + if (!(dev->type == ARPHRD_IPGRE && *(__be32 *)dev->dev_addr == 0 && + idev->cnf.addr_gen_mode == IN6_ADDR_GEN_MODE_EUI64)) { addrconf_addr_gen(idev, true); return; } -- 2.50.1 From b6a6006b0e3d10e62c465613f07ff49268baedb1 Mon Sep 17 00:00:00 2001 From: Guillaume Nault Date: Sat, 3 May 2025 00:57:59 +0200 Subject: [PATCH 08/16] selftests: Add IPv6 link-local address generation tests for GRE devices. GRE devices have their special code for IPv6 link-local address generation that has been the source of several regressions in the past. Add selftest to check that all gre, ip6gre, gretap and ip6gretap get an IPv6 link-link local address in accordance with the net.ipv6.conf..addr_gen_mode sysctl. Note: This patch was originally applied as commit 6f50175ccad4 ("selftests: Add IPv6 link-local address generation tests for GRE devices."). However, it was then reverted by commit 355d940f4d5a ("Revert "selftests: Add IPv6 link-local address generation tests for GRE devices."") because the commit it depended on was going to be reverted. Now that the situation is resolved, we can add this selftest again (no changes since original patch, appart from context update in tools/testing/selftests/net/Makefile). Signed-off-by: Guillaume Nault Reviewed-by: Ido Schimmel Tested-by: Ido Schimmel Link: https://patch.msgid.link/2c3a5733cb3a6e3119504361a9b9f89fda570a2d.1746225214.git.gnault@redhat.com Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/Makefile | 1 + .../testing/selftests/net/gre_ipv6_lladdr.sh | 177 ++++++++++++++++++ 2 files changed, 178 insertions(+) create mode 100755 tools/testing/selftests/net/gre_ipv6_lladdr.sh diff --git a/tools/testing/selftests/net/Makefile b/tools/testing/selftests/net/Makefile index 124078b56fa4..70a38f485d4d 100644 --- a/tools/testing/selftests/net/Makefile +++ b/tools/testing/selftests/net/Makefile @@ -31,6 +31,7 @@ TEST_PROGS += veth.sh TEST_PROGS += ioam6.sh TEST_PROGS += gro.sh TEST_PROGS += gre_gso.sh +TEST_PROGS += gre_ipv6_lladdr.sh TEST_PROGS += cmsg_so_mark.sh TEST_PROGS += cmsg_so_priority.sh TEST_PROGS += test_so_rcv.sh diff --git a/tools/testing/selftests/net/gre_ipv6_lladdr.sh b/tools/testing/selftests/net/gre_ipv6_lladdr.sh new file mode 100755 index 000000000000..5b34f6e1f831 --- /dev/null +++ b/tools/testing/selftests/net/gre_ipv6_lladdr.sh @@ -0,0 +1,177 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 + +source ./lib.sh + +PAUSE_ON_FAIL="no" + +# The trap function handler +# +exit_cleanup_all() +{ + cleanup_all_ns + + exit "${EXIT_STATUS}" +} + +# Add fake IPv4 and IPv6 networks on the loopback device, to be used as +# underlay by future GRE devices. +# +setup_basenet() +{ + ip -netns "${NS0}" link set dev lo up + ip -netns "${NS0}" address add dev lo 192.0.2.10/24 + ip -netns "${NS0}" address add dev lo 2001:db8::10/64 nodad +} + +# Check if network device has an IPv6 link-local address assigned. +# +# Parameters: +# +# * $1: The network device to test +# * $2: An extra regular expression that should be matched (to verify the +# presence of extra attributes) +# * $3: The expected return code from grep (to allow checking the absence of +# a link-local address) +# * $4: The user visible name for the scenario being tested +# +check_ipv6_ll_addr() +{ + local DEV="$1" + local EXTRA_MATCH="$2" + local XRET="$3" + local MSG="$4" + + RET=0 + set +e + ip -netns "${NS0}" -6 address show dev "${DEV}" scope link | grep "fe80::" | grep -q "${EXTRA_MATCH}" + check_err_fail "${XRET}" $? "" + log_test "${MSG}" + set -e +} + +# Create a GRE device and verify that it gets an IPv6 link-local address as +# expected. +# +# Parameters: +# +# * $1: The device type (gre, ip6gre, gretap or ip6gretap) +# * $2: The local underlay IP address (can be an IPv4, an IPv6 or "any") +# * $3: The remote underlay IP address (can be an IPv4, an IPv6 or "any") +# * $4: The IPv6 interface identifier generation mode to use for the GRE +# device (eui64, none, stable-privacy or random). +# +test_gre_device() +{ + local GRE_TYPE="$1" + local LOCAL_IP="$2" + local REMOTE_IP="$3" + local MODE="$4" + local ADDR_GEN_MODE + local MATCH_REGEXP + local MSG + + ip link add netns "${NS0}" name gretest type "${GRE_TYPE}" local "${LOCAL_IP}" remote "${REMOTE_IP}" + + case "${MODE}" in + "eui64") + ADDR_GEN_MODE=0 + MATCH_REGEXP="" + MSG="${GRE_TYPE}, mode: 0 (EUI64), ${LOCAL_IP} -> ${REMOTE_IP}" + XRET=0 + ;; + "none") + ADDR_GEN_MODE=1 + MATCH_REGEXP="" + MSG="${GRE_TYPE}, mode: 1 (none), ${LOCAL_IP} -> ${REMOTE_IP}" + XRET=1 # No link-local address should be generated + ;; + "stable-privacy") + ADDR_GEN_MODE=2 + MATCH_REGEXP="stable-privacy" + MSG="${GRE_TYPE}, mode: 2 (stable privacy), ${LOCAL_IP} -> ${REMOTE_IP}" + XRET=0 + # Initialise stable_secret (required for stable-privacy mode) + ip netns exec "${NS0}" sysctl -qw net.ipv6.conf.gretest.stable_secret="2001:db8::abcd" + ;; + "random") + ADDR_GEN_MODE=3 + MATCH_REGEXP="stable-privacy" + MSG="${GRE_TYPE}, mode: 3 (random), ${LOCAL_IP} -> ${REMOTE_IP}" + XRET=0 + ;; + esac + + # Check that IPv6 link-local address is generated when device goes up + ip netns exec "${NS0}" sysctl -qw net.ipv6.conf.gretest.addr_gen_mode="${ADDR_GEN_MODE}" + ip -netns "${NS0}" link set dev gretest up + check_ipv6_ll_addr gretest "${MATCH_REGEXP}" "${XRET}" "config: ${MSG}" + + # Now disable link-local address generation + ip -netns "${NS0}" link set dev gretest down + ip netns exec "${NS0}" sysctl -qw net.ipv6.conf.gretest.addr_gen_mode=1 + ip -netns "${NS0}" link set dev gretest up + + # Check that link-local address generation works when re-enabled while + # the device is already up + ip netns exec "${NS0}" sysctl -qw net.ipv6.conf.gretest.addr_gen_mode="${ADDR_GEN_MODE}" + check_ipv6_ll_addr gretest "${MATCH_REGEXP}" "${XRET}" "update: ${MSG}" + + ip -netns "${NS0}" link del dev gretest +} + +test_gre4() +{ + local GRE_TYPE + local MODE + + for GRE_TYPE in "gre" "gretap"; do + printf "\n####\nTesting IPv6 link-local address generation on ${GRE_TYPE} devices\n####\n\n" + + for MODE in "eui64" "none" "stable-privacy" "random"; do + test_gre_device "${GRE_TYPE}" 192.0.2.10 192.0.2.11 "${MODE}" + test_gre_device "${GRE_TYPE}" any 192.0.2.11 "${MODE}" + test_gre_device "${GRE_TYPE}" 192.0.2.10 any "${MODE}" + done + done +} + +test_gre6() +{ + local GRE_TYPE + local MODE + + for GRE_TYPE in "ip6gre" "ip6gretap"; do + printf "\n####\nTesting IPv6 link-local address generation on ${GRE_TYPE} devices\n####\n\n" + + for MODE in "eui64" "none" "stable-privacy" "random"; do + test_gre_device "${GRE_TYPE}" 2001:db8::10 2001:db8::11 "${MODE}" + test_gre_device "${GRE_TYPE}" any 2001:db8::11 "${MODE}" + test_gre_device "${GRE_TYPE}" 2001:db8::10 any "${MODE}" + done + done +} + +usage() +{ + echo "Usage: $0 [-p]" + exit 1 +} + +while getopts :p o +do + case $o in + p) PAUSE_ON_FAIL="yes";; + *) usage;; + esac +done + +setup_ns NS0 + +set -e +trap exit_cleanup_all EXIT + +setup_basenet + +test_gre4 +test_gre6 -- 2.50.1 From b344a48cbe5fb24697addc6afbb7358b938a7d31 Mon Sep 17 00:00:00 2001 From: Mohsin Bashir Date: Fri, 2 May 2025 18:35:16 -0700 Subject: [PATCH 09/16] selftests: drv: net: fix test failure on ipv6 sys The `get_interface_info` call has ip version hard-coded which leads to failures on an IPV6 system. The NetDrvEnv class already gathers information about remote interface, so instead of fixing the local implementation switch to using cfg.remote_ifname. Before: ./drivers/net/ping.py Traceback (most recent call last): File "/new_tests/./drivers/net/ping.py", line 217, in main() File "/new_tests/./drivers/net/ping.py", line 204, in main get_interface_info(cfg) File "/new_tests/./drivers/net/ping.py", line 128, in get_interface_info raise KsftFailEx('Can not get remote interface') net.lib.py.ksft.KsftFailEx: Can not get remote interface After: ./drivers/net/ping.py TAP version 13 1..6 ok 1 ping.test_default # SKIP Test requires IPv4 connectivity ok 2 ping.test_xdp_generic_sb # SKIP Test requires IPv4 connectivity ok 3 ping.test_xdp_generic_mb # SKIP Test requires IPv4 connectivity ok 4 ping.test_xdp_native_sb # SKIP Test requires IPv4 connectivity ok 5 ping.test_xdp_native_mb # SKIP Test requires IPv4 connectivity ok 6 ping.test_xdp_offload # SKIP device does not support offloaded XDP Totals: pass:0 fail:0 xfail:0 xpass:0 skip:6 error:0 Fixes: 75cc19c8ff89 ("selftests: drv-net: add xdp cases for ping.py") Signed-off-by: Mohsin Bashir Reviewed-by: David Wei Link: https://patch.msgid.link/20250503013518.1722913-2-mohsin.bashr@gmail.com Signed-off-by: Jakub Kicinski --- tools/testing/selftests/drivers/net/ping.py | 22 +++++++++------------ 1 file changed, 9 insertions(+), 13 deletions(-) diff --git a/tools/testing/selftests/drivers/net/ping.py b/tools/testing/selftests/drivers/net/ping.py index 4b6822866066..5272e8b3536d 100755 --- a/tools/testing/selftests/drivers/net/ping.py +++ b/tools/testing/selftests/drivers/net/ping.py @@ -9,7 +9,6 @@ from lib.py import EthtoolFamily, NetDrvEpEnv from lib.py import bkg, cmd, wait_port_listen, rand_port from lib.py import defer, ethtool, ip -remote_ifname="" no_sleep=False def _test_v4(cfg) -> None: @@ -57,7 +56,7 @@ def _set_offload_checksum(cfg, netnl, on) -> None: def _set_xdp_generic_sb_on(cfg) -> None: prog = cfg.net_lib_dir / "xdp_dummy.bpf.o" - cmd(f"ip link set dev {remote_ifname} mtu 1500", shell=True, host=cfg.remote) + cmd(f"ip link set dev {cfg.remote_ifname} mtu 1500", shell=True, host=cfg.remote) cmd(f"ip link set dev {cfg.ifname} mtu 1500 xdpgeneric obj {prog} sec xdp", shell=True) defer(cmd, f"ip link set dev {cfg.ifname} xdpgeneric off") @@ -66,8 +65,8 @@ def _set_xdp_generic_sb_on(cfg) -> None: def _set_xdp_generic_mb_on(cfg) -> None: prog = cfg.net_lib_dir / "xdp_dummy.bpf.o" - cmd(f"ip link set dev {remote_ifname} mtu 9000", shell=True, host=cfg.remote) - defer(ip, f"link set dev {remote_ifname} mtu 1500", host=cfg.remote) + cmd(f"ip link set dev {cfg.remote_ifname} mtu 9000", shell=True, host=cfg.remote) + defer(ip, f"link set dev {cfg.remote_ifname} mtu 1500", host=cfg.remote) ip("link set dev %s mtu 9000 xdpgeneric obj %s sec xdp.frags" % (cfg.ifname, prog)) defer(ip, f"link set dev {cfg.ifname} mtu 1500 xdpgeneric off") @@ -76,7 +75,7 @@ def _set_xdp_generic_mb_on(cfg) -> None: def _set_xdp_native_sb_on(cfg) -> None: prog = cfg.net_lib_dir / "xdp_dummy.bpf.o" - cmd(f"ip link set dev {remote_ifname} mtu 1500", shell=True, host=cfg.remote) + cmd(f"ip link set dev {cfg.remote_ifname} mtu 1500", shell=True, host=cfg.remote) cmd(f"ip -j link set dev {cfg.ifname} mtu 1500 xdp obj {prog} sec xdp", shell=True) defer(ip, f"link set dev {cfg.ifname} mtu 1500 xdp off") xdp_info = ip("-d link show %s" % (cfg.ifname), json=True)[0] @@ -93,8 +92,8 @@ def _set_xdp_native_sb_on(cfg) -> None: def _set_xdp_native_mb_on(cfg) -> None: prog = cfg.net_lib_dir / "xdp_dummy.bpf.o" - cmd(f"ip link set dev {remote_ifname} mtu 9000", shell=True, host=cfg.remote) - defer(ip, f"link set dev {remote_ifname} mtu 1500", host=cfg.remote) + cmd(f"ip link set dev {cfg.remote_ifname} mtu 9000", shell=True, host=cfg.remote) + defer(ip, f"link set dev {cfg.remote_ifname} mtu 1500", host=cfg.remote) try: cmd(f"ip link set dev {cfg.ifname} mtu 9000 xdp obj {prog} sec xdp.frags", shell=True) defer(ip, f"link set dev {cfg.ifname} mtu 1500 xdp off") @@ -112,18 +111,15 @@ def _set_xdp_offload_on(cfg) -> None: except Exception as e: raise KsftSkipEx('device does not support offloaded XDP') defer(ip, f"link set dev {cfg.ifname} xdpoffload off") - cmd(f"ip link set dev {remote_ifname} mtu 1500", shell=True, host=cfg.remote) + cmd(f"ip link set dev {cfg.remote_ifname} mtu 1500", shell=True, host=cfg.remote) if no_sleep != True: time.sleep(10) def get_interface_info(cfg) -> None: - global remote_ifname global no_sleep - remote_info = cmd(f"ip -4 -o addr show to {cfg.remote_addr_v['4']} | awk '{{print $2}}'", shell=True, host=cfg.remote).stdout - remote_ifname = remote_info.rstrip('\n') - if remote_ifname == "": + if cfg.remote_ifname == "": raise KsftFailEx('Can not get remote interface') local_info = ip("-d link show %s" % (cfg.ifname), json=True)[0] if 'parentbus' in local_info and local_info['parentbus'] == "netdevsim": @@ -136,7 +132,7 @@ def set_interface_init(cfg) -> None: cmd(f"ip link set dev {cfg.ifname} xdp off ", shell=True) cmd(f"ip link set dev {cfg.ifname} xdpgeneric off ", shell=True) cmd(f"ip link set dev {cfg.ifname} xdpoffload off", shell=True) - cmd(f"ip link set dev {remote_ifname} mtu 1500", shell=True, host=cfg.remote) + cmd(f"ip link set dev {cfg.remote_ifname} mtu 1500", shell=True, host=cfg.remote) def test_default(cfg, netnl) -> None: _set_offload_checksum(cfg, netnl, "off") -- 2.50.1 From 8bb7d8e5cf7f44ea89a445975cbd78888324f234 Mon Sep 17 00:00:00 2001 From: Mohsin Bashir Date: Fri, 2 May 2025 18:35:17 -0700 Subject: [PATCH 10/16] selftests: drv: net: avoid skipping tests On a system with either of the ipv4 or ipv6 information missing, tests are currently skipped. Ideally, the test should run as long as at least one address family is present. This patch make test run whenever possible. Before: ./drivers/net/ping.py TAP version 13 1..6 ok 1 ping.test_default # SKIP Test requires IPv4 connectivity ok 2 ping.test_xdp_generic_sb # SKIP Test requires IPv4 connectivity ok 3 ping.test_xdp_generic_mb # SKIP Test requires IPv4 connectivity ok 4 ping.test_xdp_native_sb # SKIP Test requires IPv4 connectivity ok 5 ping.test_xdp_native_mb # SKIP Test requires IPv4 connectivity ok 6 ping.test_xdp_offload # SKIP device does not support offloaded XDP Totals: pass:0 fail:0 xfail:0 xpass:0 skip:6 error:0 After: ./drivers/net/ping.py TAP version 13 1..6 ok 1 ping.test_default ok 2 ping.test_xdp_generic_sb ok 3 ping.test_xdp_generic_mb ok 4 ping.test_xdp_native_sb ok 5 ping.test_xdp_native_mb ok 6 ping.test_xdp_offload # SKIP device does not support offloaded XDP Totals: pass:5 fail:0 xfail:0 xpass:0 skip:1 error:0 Fixes: 75cc19c8ff89 ("selftests: drv-net: add xdp cases for ping.py") Signed-off-by: Mohsin Bashir Link: https://patch.msgid.link/20250503013518.1722913-3-mohsin.bashr@gmail.com Signed-off-by: Jakub Kicinski --- tools/testing/selftests/drivers/net/ping.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/drivers/net/ping.py b/tools/testing/selftests/drivers/net/ping.py index 5272e8b3536d..16b7d3ab0fc8 100755 --- a/tools/testing/selftests/drivers/net/ping.py +++ b/tools/testing/selftests/drivers/net/ping.py @@ -12,7 +12,8 @@ from lib.py import defer, ethtool, ip no_sleep=False def _test_v4(cfg) -> None: - cfg.require_ipver("4") + if not cfg.addr_v["4"]: + return cmd("ping -c 1 -W0.5 " + cfg.remote_addr_v["4"]) cmd("ping -c 1 -W0.5 " + cfg.addr_v["4"], host=cfg.remote) @@ -20,7 +21,8 @@ def _test_v4(cfg) -> None: cmd("ping -s 65000 -c 1 -W0.5 " + cfg.addr_v["4"], host=cfg.remote) def _test_v6(cfg) -> None: - cfg.require_ipver("6") + if not cfg.addr_v["6"]: + return cmd("ping -c 1 -W5 " + cfg.remote_addr_v["6"]) cmd("ping -c 1 -W5 " + cfg.addr_v["6"], host=cfg.remote) -- 2.50.1 From 4a9d494ca24b7fd509b3953fcb43d580cb05097b Mon Sep 17 00:00:00 2001 From: Mohsin Bashir Date: Fri, 2 May 2025 18:35:18 -0700 Subject: [PATCH 11/16] selftests: drv: net: add version indicator Currently, the test result does not differentiate between the cases when either one of the address families are configured or if both the address families are configured. Ideally, the result should report if a particular case was skipped. ./drivers/net/ping.py TAP version 13 1..7 ok 1 ping.test_default_v4 # SKIP Test requires IPv4 connectivity ok 2 ping.test_default_v6 ok 3 ping.test_xdp_generic_sb ok 4 ping.test_xdp_generic_mb ok 5 ping.test_xdp_native_sb ok 6 ping.test_xdp_native_mb ok 7 ping.test_xdp_offload # SKIP device does not support offloaded XDP Totals: pass:5 fail:0 xfail:0 xpass:0 skip:2 error:0 Fixes: 75cc19c8ff89 ("selftests: drv-net: add xdp cases for ping.py") Signed-off-by: Mohsin Bashir Reviewed-by: David Wei Link: https://patch.msgid.link/20250503013518.1722913-4-mohsin.bashr@gmail.com Signed-off-by: Jakub Kicinski --- tools/testing/selftests/drivers/net/ping.py | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/tools/testing/selftests/drivers/net/ping.py b/tools/testing/selftests/drivers/net/ping.py index 16b7d3ab0fc8..af8df2313a3b 100755 --- a/tools/testing/selftests/drivers/net/ping.py +++ b/tools/testing/selftests/drivers/net/ping.py @@ -136,13 +136,23 @@ def set_interface_init(cfg) -> None: cmd(f"ip link set dev {cfg.ifname} xdpoffload off", shell=True) cmd(f"ip link set dev {cfg.remote_ifname} mtu 1500", shell=True, host=cfg.remote) -def test_default(cfg, netnl) -> None: +def test_default_v4(cfg, netnl) -> None: + cfg.require_ipver("4") + _set_offload_checksum(cfg, netnl, "off") _test_v4(cfg) - _test_v6(cfg) _test_tcp(cfg) _set_offload_checksum(cfg, netnl, "on") _test_v4(cfg) + _test_tcp(cfg) + +def test_default_v6(cfg, netnl) -> None: + cfg.require_ipver("6") + + _set_offload_checksum(cfg, netnl, "off") + _test_v6(cfg) + _test_tcp(cfg) + _set_offload_checksum(cfg, netnl, "on") _test_v6(cfg) _test_tcp(cfg) @@ -200,7 +210,8 @@ def main() -> None: with NetDrvEpEnv(__file__) as cfg: get_interface_info(cfg) set_interface_init(cfg) - ksft_run([test_default, + ksft_run([test_default_v4, + test_default_v6, test_xdp_generic_sb, test_xdp_generic_mb, test_xdp_native_sb, -- 2.50.1 From 4720f9707c783f642332dee3d56dccaefa850e42 Mon Sep 17 00:00:00 2001 From: David Wei Date: Fri, 2 May 2025 21:30:50 -0700 Subject: [PATCH 12/16] tools: ynl-gen: validate 0 len strings from kernel Strings from the kernel are guaranteed to be null terminated and ynl_attr_validate() checks for this. But it doesn't check if the string has a len of 0, which would cause problems when trying to access data[len - 1]. Fix this by checking that len is positive. Signed-off-by: David Wei Link: https://patch.msgid.link/20250503043050.861238-1-dw@davidwei.uk Signed-off-by: Jakub Kicinski --- tools/net/ynl/lib/ynl.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/net/ynl/lib/ynl.c b/tools/net/ynl/lib/ynl.c index ce32cb35007d..c4da34048ef8 100644 --- a/tools/net/ynl/lib/ynl.c +++ b/tools/net/ynl/lib/ynl.c @@ -364,7 +364,7 @@ int ynl_attr_validate(struct ynl_parse_arg *yarg, const struct nlattr *attr) "Invalid attribute (binary %s)", policy->name); return -1; case YNL_PT_NUL_STR: - if ((!policy->len || len <= policy->len) && !data[len - 1]) + if (len && (!policy->len || len <= policy->len) && !data[len - 1]) break; yerr(yarg->ys, YNL_ERROR_ATTR_INVALID, "Invalid attribute (string %s)", policy->name); -- 2.50.1 From 4db6c75124d871fbabf8243f947d34cc7e0697fc Mon Sep 17 00:00:00 2001 From: Daniel Golle Date: Mon, 5 May 2025 02:07:32 +0100 Subject: [PATCH 13/16] net: ethernet: mtk_eth_soc: reset all TX queues on DMA free The purpose of resetting the TX queue is to reset the byte and packet count as well as to clear the software flow control XOFF bit. MediaTek developers pointed out that netdev_reset_queue would only resets queue 0 of the network device. Queues that are not reset may cause unexpected issues. Packets may stop being sent after reset and "transmit timeout" log may be displayed. Import fix from MediaTek's SDK to resolve this issue. Link: https://git01.mediatek.com/plugins/gitiles/openwrt/feeds/mtk-openwrt-feeds/+/319c0d9905579a46dc448579f892f364f1f84818 Fixes: f63959c7eec31 ("net: ethernet: mtk_eth_soc: implement multi-queue support for per-port queues") Signed-off-by: Daniel Golle Link: https://patch.msgid.link/c9ff9adceac4f152239a0f65c397f13547639175.1746406763.git.daniel@makrotopia.org Signed-off-by: Paolo Abeni --- drivers/net/ethernet/mediatek/mtk_eth_soc.c | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/drivers/net/ethernet/mediatek/mtk_eth_soc.c b/drivers/net/ethernet/mediatek/mtk_eth_soc.c index 8fda4ce80d81..53c39561b6d9 100644 --- a/drivers/net/ethernet/mediatek/mtk_eth_soc.c +++ b/drivers/net/ethernet/mediatek/mtk_eth_soc.c @@ -3186,11 +3186,19 @@ static int mtk_dma_init(struct mtk_eth *eth) static void mtk_dma_free(struct mtk_eth *eth) { const struct mtk_soc_data *soc = eth->soc; - int i; + int i, j, txqs = 1; + + if (MTK_HAS_CAPS(eth->soc->caps, MTK_QDMA)) + txqs = MTK_QDMA_NUM_QUEUES; + + for (i = 0; i < MTK_MAX_DEVS; i++) { + if (!eth->netdev[i]) + continue; + + for (j = 0; j < txqs; j++) + netdev_tx_reset_subqueue(eth->netdev[i], j); + } - for (i = 0; i < MTK_MAX_DEVS; i++) - if (eth->netdev[i]) - netdev_reset_queue(eth->netdev[i]); if (!MTK_HAS_CAPS(soc->caps, MTK_SRAM) && eth->scratch_ring) { dma_free_coherent(eth->dma_dev, MTK_QDMA_RING_SIZE * soc->tx.desc_size, -- 2.50.1 From e8716b5b0dff1b3d523b4a83fd5e94d57b887c5c Mon Sep 17 00:00:00 2001 From: Frank Wunderlich Date: Mon, 5 May 2025 02:07:58 +0100 Subject: [PATCH 14/16] net: ethernet: mtk_eth_soc: do not reset PSE when setting FE Remove redundant PSE reset. When setting FE register there is no need to reset PSE, doing so may cause FE to work abnormal. Link: https://git01.mediatek.com/plugins/gitiles/openwrt/feeds/mtk-openwrt-feeds/+/3a5223473e086a4b54a2b9a44df7d9ddcc2bc75a Fixes: dee4dd10c79aa ("net: ethernet: mtk_eth_soc: ppe: add support for multiple PPEs") Signed-off-by: Frank Wunderlich Link: https://patch.msgid.link/18f0ac7d83f82defa3342c11ef0d1362f6b81e88.1746406763.git.daniel@makrotopia.org Signed-off-by: Paolo Abeni --- drivers/net/ethernet/mediatek/mtk_eth_soc.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/drivers/net/ethernet/mediatek/mtk_eth_soc.c b/drivers/net/ethernet/mediatek/mtk_eth_soc.c index 53c39561b6d9..22a532695fb0 100644 --- a/drivers/net/ethernet/mediatek/mtk_eth_soc.c +++ b/drivers/net/ethernet/mediatek/mtk_eth_soc.c @@ -3473,9 +3473,6 @@ static int mtk_open(struct net_device *dev) } mtk_gdm_config(eth, target_mac->id, gdm_config); } - /* Reset and enable PSE */ - mtk_w32(eth, RST_GL_PSE, MTK_RST_GL); - mtk_w32(eth, 0, MTK_RST_GL); napi_enable(ð->tx_napi); napi_enable(ð->rx_napi); -- 2.50.1 From 0093cb194a7511d1e68865fa35b763c72e44c2f0 Mon Sep 17 00:00:00 2001 From: Przemek Kitszel Date: Mon, 5 May 2025 09:19:38 -0700 Subject: [PATCH 15/16] ice: use DSN instead of PCI BDF for ice_adapter index Use Device Serial Number instead of PCI bus/device/function for the index of struct ice_adapter. Functions on the same physical device should point to the very same ice_adapter instance, but with two PFs, when at least one of them is PCI-e passed-through to a VM, it is no longer the case - PFs will get seemingly random PCI BDF values, and thus indices, what finally leds to each of them being on their own instance of ice_adapter. That causes them to don't attempt any synchronization of the PTP HW clock usage, or any other future resources. DSN works nicely in place of the index, as it is "immutable" in terms of virtualization. Fixes: 0e2bddf9e5f9 ("ice: add ice_adapter for shared data across PFs on the same NIC") Suggested-by: Jacob Keller Suggested-by: Jakub Kicinski Suggested-by: Jiri Pirko Reviewed-by: Aleksandr Loktionov Signed-off-by: Przemek Kitszel Reviewed-by: Simon Horman Tested-by: Rinitha S (A Contingent worker at Intel) Signed-off-by: Tony Nguyen Reviewed-by: Jiri Pirko Link: https://patch.msgid.link/20250505161939.2083581-1-anthony.l.nguyen@intel.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/intel/ice/ice_adapter.c | 47 ++++++++------------ drivers/net/ethernet/intel/ice/ice_adapter.h | 6 ++- 2 files changed, 22 insertions(+), 31 deletions(-) diff --git a/drivers/net/ethernet/intel/ice/ice_adapter.c b/drivers/net/ethernet/intel/ice/ice_adapter.c index 01a08cfd0090..66e070095d1b 100644 --- a/drivers/net/ethernet/intel/ice/ice_adapter.c +++ b/drivers/net/ethernet/intel/ice/ice_adapter.c @@ -1,7 +1,6 @@ // SPDX-License-Identifier: GPL-2.0-only // SPDX-FileCopyrightText: Copyright Red Hat -#include #include #include #include @@ -14,32 +13,16 @@ static DEFINE_XARRAY(ice_adapters); static DEFINE_MUTEX(ice_adapters_mutex); -/* PCI bus number is 8 bits. Slot is 5 bits. Domain can have the rest. */ -#define INDEX_FIELD_DOMAIN GENMASK(BITS_PER_LONG - 1, 13) -#define INDEX_FIELD_DEV GENMASK(31, 16) -#define INDEX_FIELD_BUS GENMASK(12, 5) -#define INDEX_FIELD_SLOT GENMASK(4, 0) - -static unsigned long ice_adapter_index(const struct pci_dev *pdev) +static unsigned long ice_adapter_index(u64 dsn) { - unsigned int domain = pci_domain_nr(pdev->bus); - - WARN_ON(domain > FIELD_MAX(INDEX_FIELD_DOMAIN)); - - switch (pdev->device) { - case ICE_DEV_ID_E825C_BACKPLANE: - case ICE_DEV_ID_E825C_QSFP: - case ICE_DEV_ID_E825C_SFP: - case ICE_DEV_ID_E825C_SGMII: - return FIELD_PREP(INDEX_FIELD_DEV, pdev->device); - default: - return FIELD_PREP(INDEX_FIELD_DOMAIN, domain) | - FIELD_PREP(INDEX_FIELD_BUS, pdev->bus->number) | - FIELD_PREP(INDEX_FIELD_SLOT, PCI_SLOT(pdev->devfn)); - } +#if BITS_PER_LONG == 64 + return dsn; +#else + return (u32)dsn ^ (u32)(dsn >> 32); +#endif } -static struct ice_adapter *ice_adapter_new(void) +static struct ice_adapter *ice_adapter_new(u64 dsn) { struct ice_adapter *adapter; @@ -47,6 +30,7 @@ static struct ice_adapter *ice_adapter_new(void) if (!adapter) return NULL; + adapter->device_serial_number = dsn; spin_lock_init(&adapter->ptp_gltsyn_time_lock); refcount_set(&adapter->refcount, 1); @@ -77,23 +61,26 @@ static void ice_adapter_free(struct ice_adapter *adapter) * Return: Pointer to ice_adapter on success. * ERR_PTR() on error. -ENOMEM is the only possible error. */ -struct ice_adapter *ice_adapter_get(const struct pci_dev *pdev) +struct ice_adapter *ice_adapter_get(struct pci_dev *pdev) { - unsigned long index = ice_adapter_index(pdev); + u64 dsn = pci_get_dsn(pdev); struct ice_adapter *adapter; + unsigned long index; int err; + index = ice_adapter_index(dsn); scoped_guard(mutex, &ice_adapters_mutex) { err = xa_insert(&ice_adapters, index, NULL, GFP_KERNEL); if (err == -EBUSY) { adapter = xa_load(&ice_adapters, index); refcount_inc(&adapter->refcount); + WARN_ON_ONCE(adapter->device_serial_number != dsn); return adapter; } if (err) return ERR_PTR(err); - adapter = ice_adapter_new(); + adapter = ice_adapter_new(dsn); if (!adapter) return ERR_PTR(-ENOMEM); xa_store(&ice_adapters, index, adapter, GFP_KERNEL); @@ -110,11 +97,13 @@ struct ice_adapter *ice_adapter_get(const struct pci_dev *pdev) * * Context: Process, may sleep. */ -void ice_adapter_put(const struct pci_dev *pdev) +void ice_adapter_put(struct pci_dev *pdev) { - unsigned long index = ice_adapter_index(pdev); + u64 dsn = pci_get_dsn(pdev); struct ice_adapter *adapter; + unsigned long index; + index = ice_adapter_index(dsn); scoped_guard(mutex, &ice_adapters_mutex) { adapter = xa_load(&ice_adapters, index); if (WARN_ON(!adapter)) diff --git a/drivers/net/ethernet/intel/ice/ice_adapter.h b/drivers/net/ethernet/intel/ice/ice_adapter.h index e233225848b3..ac15c0d2bc1a 100644 --- a/drivers/net/ethernet/intel/ice/ice_adapter.h +++ b/drivers/net/ethernet/intel/ice/ice_adapter.h @@ -32,6 +32,7 @@ struct ice_port_list { * @refcount: Reference count. struct ice_pf objects hold the references. * @ctrl_pf: Control PF of the adapter * @ports: Ports list + * @device_serial_number: DSN cached for collision detection on 32bit systems */ struct ice_adapter { refcount_t refcount; @@ -40,9 +41,10 @@ struct ice_adapter { struct ice_pf *ctrl_pf; struct ice_port_list ports; + u64 device_serial_number; }; -struct ice_adapter *ice_adapter_get(const struct pci_dev *pdev); -void ice_adapter_put(const struct pci_dev *pdev); +struct ice_adapter *ice_adapter_get(struct pci_dev *pdev); +void ice_adapter_put(struct pci_dev *pdev); #endif /* _ICE_ADAPTER_H */ -- 2.50.1 From 08e9f2d584c4732180edee4cb2dbfa7586d7d5a3 Mon Sep 17 00:00:00 2001 From: Cosmin Ratiu Date: Mon, 5 May 2025 22:47:13 +0300 Subject: [PATCH 16/16] net: Lock netdevices during dev_shutdown __qdisc_destroy() calls into various qdiscs .destroy() op, which in turn can call .ndo_setup_tc(), which requires the netdev instance lock. This commit extends the critical section in unregister_netdevice_many_notify() to cover dev_shutdown() (and dev_tcx_uninstall() as a side-effect) and acquires the netdev instance lock in __dev_change_net_namespace() for the other dev_shutdown() call. This should now guarantee that for all qdisc ops, the netdev instance lock is held during .ndo_setup_tc(). Fixes: a0527ee2df3f ("net: hold netdev instance lock during qdisc ndo_setup_tc") Signed-off-by: Cosmin Ratiu Acked-by: Stanislav Fomichev Link: https://patch.msgid.link/20250505194713.1723399-1-cratiu@nvidia.com Signed-off-by: Jakub Kicinski --- net/core/dev.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/net/core/dev.c b/net/core/dev.c index 1be7cb73a602..92e004c354ea 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -11966,9 +11966,9 @@ void unregister_netdevice_many_notify(struct list_head *head, struct sk_buff *skb = NULL; /* Shutdown queueing discipline. */ + netdev_lock_ops(dev); dev_shutdown(dev); dev_tcx_uninstall(dev); - netdev_lock_ops(dev); dev_xdp_uninstall(dev); dev_memory_provider_uninstall(dev); netdev_unlock_ops(dev); @@ -12161,7 +12161,9 @@ int __dev_change_net_namespace(struct net_device *dev, struct net *net, synchronize_net(); /* Shutdown queueing discipline. */ + netdev_lock_ops(dev); dev_shutdown(dev); + netdev_unlock_ops(dev); /* Notify protocols, that we are about to destroy * this device. They should clean all the things. -- 2.50.1