From f40a673d6b4a128fe95dd9b8c3ed02da50a6a862 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Mon, 7 Apr 2025 12:38:59 +0300 Subject: [PATCH 01/16] net: phy: move phy_link_change() prior to mdio_bus_phy_may_suspend() In an upcoming change, mdio_bus_phy_may_suspend() will need to distinguish a phylib-based PHY client from a phylink PHY client. For that, it will need to compare the phydev->phy_link_change() function pointer with the eponymous phy_link_change() provided by phylib. To avoid forward function declarations, the default PHY link state change method should be moved upwards. There is no functional change associated with this patch, it is only to reduce the noise from a real bug fix. Signed-off-by: Vladimir Oltean Reviewed-by: Russell King (Oracle) Link: https://patch.msgid.link/20250407093900.2155112-1-vladimir.oltean@nxp.com Signed-off-by: Jakub Kicinski --- drivers/net/phy/phy_device.c | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/drivers/net/phy/phy_device.c b/drivers/net/phy/phy_device.c index 675fbd225378..1367296a3389 100644 --- a/drivers/net/phy/phy_device.c +++ b/drivers/net/phy/phy_device.c @@ -244,6 +244,19 @@ static bool phy_drv_wol_enabled(struct phy_device *phydev) return wol.wolopts != 0; } +static void phy_link_change(struct phy_device *phydev, bool up) +{ + struct net_device *netdev = phydev->attached_dev; + + if (up) + netif_carrier_on(netdev); + else + netif_carrier_off(netdev); + phydev->adjust_link(netdev); + if (phydev->mii_ts && phydev->mii_ts->link_state) + phydev->mii_ts->link_state(phydev->mii_ts, phydev); +} + static bool mdio_bus_phy_may_suspend(struct phy_device *phydev) { struct device_driver *drv = phydev->mdio.dev.driver; @@ -1055,19 +1068,6 @@ struct phy_device *phy_find_first(struct mii_bus *bus) } EXPORT_SYMBOL(phy_find_first); -static void phy_link_change(struct phy_device *phydev, bool up) -{ - struct net_device *netdev = phydev->attached_dev; - - if (up) - 
netif_carrier_on(netdev); - else - netif_carrier_off(netdev); - phydev->adjust_link(netdev); - if (phydev->mii_ts && phydev->mii_ts->link_state) - phydev->mii_ts->link_state(phydev->mii_ts, phydev); -} - /** * phy_prepare_link - prepares the PHY layer to monitor link status * @phydev: target phy_device struct -- 2.50.1 From fc75ea20ffb452652f0d4033f38fe88d7cfdae35 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Mon, 7 Apr 2025 12:40:42 +0300 Subject: [PATCH 02/16] net: phy: allow MDIO bus PM ops to start/stop state machine for phylink-controlled PHY DSA has 2 kinds of drivers: 1. Those who call dsa_switch_suspend() and dsa_switch_resume() from their device PM ops: qca8k-8xxx, bcm_sf2, microchip ksz 2. Those who don't: all others. The above methods should be optional. For type 1, dsa_switch_suspend() calls dsa_user_suspend() -> phylink_stop(), and dsa_switch_resume() calls dsa_user_resume() -> phylink_start(). These seem good candidates for setting mac_managed_pm = true because that is essentially its definition [1], but that does not seem to be the biggest problem for now, and is not what this change focuses on. Talking strictly about the 2nd category of DSA drivers here (which do not have MAC managed PM, meaning that for their attached PHYs, mdio_bus_phy_suspend() and mdio_bus_phy_resume() should run in full), I have noticed that the following warning from mdio_bus_phy_resume() is triggered: WARN_ON(phydev->state != PHY_HALTED && phydev->state != PHY_READY && phydev->state != PHY_UP); because the PHY state machine is running. It's running as a result of a previous dsa_user_open() -> ... -> phylink_start() -> phy_start() having been initiated by the user. The previous mdio_bus_phy_suspend() was supposed to have called phy_stop_machine(), but it didn't. So this is why the PHY is in state PHY_NOLINK by the time mdio_bus_phy_resume() runs. mdio_bus_phy_suspend() did not call phy_stop_machine() because for phylink, the phydev->adjust_link function pointer is NULL. 
This seems a technicality introduced by commit fddd91016d16 ("phylib: fix PAL state machine restart on resume"). That commit was written before phylink existed, and was intended to avoid crashing with consumer drivers which don't use the PHY state machine - phylink always does, when using a PHY. But phylink itself has historically not been developed with suspend/resume in mind, and apparently not tested too much in that scenario, allowing this bug to exist unnoticed for so long. Plus, prior to the WARN_ON(), it would have likely been invisible. This issue is not in fact restricted to type 2 DSA drivers (according to the above ad-hoc classification), but can be extrapolated to any MAC driver with phylink and MDIO-bus-managed PHY PM ops. DSA is just where the issue was reported. Assuming mac_managed_pm is set correctly, a quick search indicates the following other drivers might be affected: $ grep -Zlr PHYLINK_NETDEV drivers/ | xargs -0 grep -L mac_managed_pm drivers/net/ethernet/atheros/ag71xx.c drivers/net/ethernet/microchip/sparx5/sparx5_main.c drivers/net/ethernet/microchip/lan966x/lan966x_main.c drivers/net/ethernet/freescale/dpaa2/dpaa2-mac.c drivers/net/ethernet/freescale/fs_enet/fs_enet-main.c drivers/net/ethernet/freescale/dpaa/dpaa_eth.c drivers/net/ethernet/freescale/ucc_geth.c drivers/net/ethernet/freescale/enetc/enetc_pf_common.c drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c drivers/net/ethernet/marvell/mvneta.c drivers/net/ethernet/marvell/prestera/prestera_main.c drivers/net/ethernet/mediatek/mtk_eth_soc.c drivers/net/ethernet/altera/altera_tse_main.c drivers/net/ethernet/wangxun/txgbe/txgbe_phy.c drivers/net/ethernet/meta/fbnic/fbnic_phylink.c drivers/net/ethernet/tehuti/tn40_phy.c drivers/net/ethernet/mscc/ocelot_net.c Make the existing conditions dependent on the PHY device having a phydev->phy_link_change() implementation equal to the default phy_link_change() provided by phylib. 
Otherwise, we implicitly know that the phydev has the phylink-provided phylink_phy_change() callback, and when phylink is used, the PHY state machine always needs to be stopped/ started on the suspend/resume path. The code is structured as such that if phydev->phy_link_change() is absent, it is a matter of time until the kernel will crash - no need to further complicate the test. Thus, for the situation where the PM is not managed by the MAC, we will make the MDIO bus PM ops treat identically the phylink-controlled PHYs with the phylib-controlled PHYs where an adjust_link() callback is supplied. In both cases, the MDIO bus PM ops should stop and restart the PHY state machine. [1] https://lore.kernel.org/netdev/Z-1tiW9zjcoFkhwc@shell.armlinux.org.uk/ Fixes: 744d23c71af3 ("net: phy: Warn about incorrect mdio_bus_phy_resume() state") Reported-by: Wei Fang Tested-by: Wei Fang Signed-off-by: Vladimir Oltean Link: https://patch.msgid.link/20250407094042.2155633-1-vladimir.oltean@nxp.com Signed-off-by: Jakub Kicinski --- drivers/net/phy/phy_device.c | 31 +++++++++++++++++++++++++++++-- 1 file changed, 29 insertions(+), 2 deletions(-) diff --git a/drivers/net/phy/phy_device.c b/drivers/net/phy/phy_device.c index 1367296a3389..cc1bfd22fb81 100644 --- a/drivers/net/phy/phy_device.c +++ b/drivers/net/phy/phy_device.c @@ -257,6 +257,33 @@ static void phy_link_change(struct phy_device *phydev, bool up) phydev->mii_ts->link_state(phydev->mii_ts, phydev); } +/** + * phy_uses_state_machine - test whether consumer driver uses PAL state machine + * @phydev: the target PHY device structure + * + * Ultimately, this aims to indirectly determine whether the PHY is attached + * to a consumer which uses the state machine by calling phy_start() and + * phy_stop(). + * + * When the PHY driver consumer uses phylib, it must have previously called + * phy_connect_direct() or one of its derivatives, so that phy_prepare_link() + * has set up a hook for monitoring state changes. 
+ * + * When the PHY driver is used by the MAC driver consumer through phylink (the + * only other provider of a phy_link_change() method), using the PHY state + * machine is not optional. + * + * Return: true if consumer calls phy_start() and phy_stop(), false otherwise. + */ +static bool phy_uses_state_machine(struct phy_device *phydev) +{ + if (phydev->phy_link_change == phy_link_change) + return phydev->attached_dev && phydev->adjust_link; + + /* phydev->phy_link_change is implicitly phylink_phy_change() */ + return true; +} + static bool mdio_bus_phy_may_suspend(struct phy_device *phydev) { struct device_driver *drv = phydev->mdio.dev.driver; @@ -323,7 +350,7 @@ static __maybe_unused int mdio_bus_phy_suspend(struct device *dev) * may call phy routines that try to grab the same lock, and that may * lead to a deadlock. */ - if (phydev->attached_dev && phydev->adjust_link) + if (phy_uses_state_machine(phydev)) phy_stop_machine(phydev); if (!mdio_bus_phy_may_suspend(phydev)) @@ -377,7 +404,7 @@ no_resume: } } - if (phydev->attached_dev && phydev->adjust_link) + if (phy_uses_state_machine(phydev)) phy_start_machine(phydev); return 0; -- 2.50.1 From 6933cd4714861eea6848f18396a119d741f25fc3 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Tue, 8 Apr 2025 11:43:16 +0300 Subject: [PATCH 03/16] ipv6: Align behavior across nexthops during path selection A nexthop is only chosen when the calculated multipath hash falls in the nexthop's hash region (i.e., the hash is smaller than the nexthop's hash threshold) and when the nexthop is assigned a non-negative score by rt6_score_route(). Commit 4d0ab3a6885e ("ipv6: Start path selection from the first nexthop") introduced an unintentional difference between the first nexthop and the rest when the score is negative. When the first nexthop matches, but has a negative score, the code will currently evaluate subsequent nexthops until one is found with a non-negative score. 
On the other hand, when a different nexthop matches, but has a negative score, the code will fallback to the nexthop with which the selection started ('match'). Align the behavior across all nexthops and fallback to 'match' when the first nexthop matches, but has a negative score. Fixes: 3d709f69a3e7 ("ipv6: Use hash-threshold instead of modulo-N") Fixes: 4d0ab3a6885e ("ipv6: Start path selection from the first nexthop") Reported-by: Willem de Bruijn Closes: https://lore.kernel.org/netdev/67efef607bc41_1ddca82948c@willemb.c.googlers.com.notmuch/ Signed-off-by: Ido Schimmel Reviewed-by: Willem de Bruijn Reviewed-by: David Ahern Link: https://patch.msgid.link/20250408084316.243559-1-idosch@nvidia.com Signed-off-by: Jakub Kicinski --- net/ipv6/route.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/net/ipv6/route.c b/net/ipv6/route.c index ab12b816ab94..210b84cecc24 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -470,10 +470,10 @@ void fib6_select_path(const struct net *net, struct fib6_result *res, goto out; hash = fl6->mp_hash; - if (hash <= atomic_read(&first->fib6_nh->fib_nh_upper_bound) && - rt6_score_route(first->fib6_nh, first->fib6_flags, oif, - strict) >= 0) { - match = first; + if (hash <= atomic_read(&first->fib6_nh->fib_nh_upper_bound)) { + if (rt6_score_route(first->fib6_nh, first->fib6_flags, oif, + strict) >= 0) + match = first; goto out; } -- 2.50.1 From 0bb2f7a1ad1f11d861f58e5ee5051c8974ff9569 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Mon, 7 Apr 2025 09:33:11 -0700 Subject: [PATCH 04/16] net: Fix null-ptr-deref by sock_lock_init_class_and_name() and rmmod. When I ran the repro [0] and waited a few seconds, I observed two LOCKDEP splats: a warning immediately followed by a null-ptr-deref. 
[1] Reproduction Steps: 1) Mount CIFS 2) Add an iptables rule to drop incoming FIN packets for CIFS 3) Unmount CIFS 4) Unload the CIFS module 5) Remove the iptables rule At step 3), the CIFS module calls sock_release() for the underlying TCP socket, and it returns quickly. However, the socket remains in FIN_WAIT_1 because incoming FIN packets are dropped. At this point, the module's refcnt is 0 while the socket is still alive, so the following rmmod command succeeds. # ss -tan State Recv-Q Send-Q Local Address:Port Peer Address:Port FIN-WAIT-1 0 477 10.0.2.15:51062 10.0.0.137:445 # lsmod | grep cifs cifs 1159168 0 This highlights a discrepancy between the lifetime of the CIFS module and the underlying TCP socket. Even after CIFS calls sock_release() and it returns, the TCP socket does not die immediately in order to close the connection gracefully. While this is generally fine, it causes an issue with LOCKDEP because CIFS assigns a different lock class to the TCP socket's sk->sk_lock using sock_lock_init_class_and_name(). Once an incoming packet is processed for the socket or a timer fires, sk->sk_lock is acquired. Then, LOCKDEP checks the lock context in check_wait_context(), where hlock_class() is called to retrieve the lock class. However, since the module has already been unloaded, hlock_class() logs a warning and returns NULL, triggering the null-ptr-deref. If LOCKDEP is enabled, we must ensure that a module calling sock_lock_init_class_and_name() (CIFS, NFS, etc) cannot be unloaded while such a socket is still alive to prevent this issue. Let's hold the module reference in sock_lock_init_class_and_name() and release it when the socket is freed in sk_prot_free(). Note that sock_lock_init() clears sk->sk_owner for svc_create_socket() that calls sock_lock_init_class_and_name() for a listening socket, which clones a socket by sk_clone_lock() without GFP_ZERO. 
[0]: CIFS_SERVER="10.0.0.137" CIFS_PATH="//${CIFS_SERVER}/Users/Administrator/Desktop/CIFS_TEST" DEV="enp0s3" CRED="/root/WindowsCredential.txt" MNT=$(mktemp -d /tmp/XXXXXX) mount -t cifs ${CIFS_PATH} ${MNT} -o vers=3.0,credentials=${CRED},cache=none,echo_interval=1 iptables -A INPUT -s ${CIFS_SERVER} -j DROP for i in $(seq 10); do umount ${MNT} rmmod cifs sleep 1 done rm -r ${MNT} iptables -D INPUT -s ${CIFS_SERVER} -j DROP [1]: DEBUG_LOCKS_WARN_ON(1) WARNING: CPU: 10 PID: 0 at kernel/locking/lockdep.c:234 hlock_class (kernel/locking/lockdep.c:234 kernel/locking/lockdep.c:223) Modules linked in: cifs_arc4 nls_ucs2_utils cifs_md4 [last unloaded: cifs] CPU: 10 UID: 0 PID: 0 Comm: swapper/10 Not tainted 6.14.0 #36 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.16.0-0-gd239552ce722-prebuilt.qemu.org 04/01/2014 RIP: 0010:hlock_class (kernel/locking/lockdep.c:234 kernel/locking/lockdep.c:223) ... Call Trace: __lock_acquire (kernel/locking/lockdep.c:4853 kernel/locking/lockdep.c:5178) lock_acquire (kernel/locking/lockdep.c:469 kernel/locking/lockdep.c:5853 kernel/locking/lockdep.c:5816) _raw_spin_lock_nested (kernel/locking/spinlock.c:379) tcp_v4_rcv (./include/linux/skbuff.h:1678 ./include/net/tcp.h:2547 net/ipv4/tcp_ipv4.c:2350) ... 
BUG: kernel NULL pointer dereference, address: 00000000000000c4 PF: supervisor read access in kernel mode PF: error_code(0x0000) - not-present page PGD 0 Oops: Oops: 0000 [#1] PREEMPT SMP NOPTI CPU: 10 UID: 0 PID: 0 Comm: swapper/10 Tainted: G W 6.14.0 #36 Tainted: [W]=WARN Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.16.0-0-gd239552ce722-prebuilt.qemu.org 04/01/2014 RIP: 0010:__lock_acquire (kernel/locking/lockdep.c:4852 kernel/locking/lockdep.c:5178) Code: 15 41 09 c7 41 8b 44 24 20 25 ff 1f 00 00 41 09 c7 8b 84 24 a0 00 00 00 45 89 7c 24 20 41 89 44 24 24 e8 e1 bc ff ff 4c 89 e7 <44> 0f b6 b8 c4 00 00 00 e8 d1 bc ff ff 0f b6 80 c5 00 00 00 88 44 RSP: 0018:ffa0000000468a10 EFLAGS: 00010046 RAX: 0000000000000000 RBX: ff1100010091cc38 RCX: 0000000000000027 RDX: ff1100081f09ca48 RSI: 0000000000000001 RDI: ff1100010091cc88 RBP: ff1100010091c200 R08: ff1100083fe6e228 R09: 00000000ffffbfff R10: ff1100081eca0000 R11: ff1100083fe10dc0 R12: ff1100010091cc88 R13: 0000000000000001 R14: 0000000000000000 R15: 00000000000424b1 FS: 0000000000000000(0000) GS:ff1100081f080000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00000000000000c4 CR3: 0000000002c4a003 CR4: 0000000000771ef0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe07f0 DR7: 0000000000000400 PKRU: 55555554 Call Trace: lock_acquire (kernel/locking/lockdep.c:469 kernel/locking/lockdep.c:5853 kernel/locking/lockdep.c:5816) _raw_spin_lock_nested (kernel/locking/spinlock.c:379) tcp_v4_rcv (./include/linux/skbuff.h:1678 ./include/net/tcp.h:2547 net/ipv4/tcp_ipv4.c:2350) ip_protocol_deliver_rcu (net/ipv4/ip_input.c:205 (discriminator 1)) ip_local_deliver_finish (./include/linux/rcupdate.h:878 net/ipv4/ip_input.c:234) ip_sublist_rcv_finish (net/ipv4/ip_input.c:576) ip_list_rcv_finish (net/ipv4/ip_input.c:628) ip_list_rcv (net/ipv4/ip_input.c:670) __netif_receive_skb_list_core (net/core/dev.c:5939 
net/core/dev.c:5986) netif_receive_skb_list_internal (net/core/dev.c:6040 net/core/dev.c:6129) napi_complete_done (./include/linux/list.h:37 ./include/net/gro.h:519 ./include/net/gro.h:514 net/core/dev.c:6496) e1000_clean (drivers/net/ethernet/intel/e1000/e1000_main.c:3815) __napi_poll.constprop.0 (net/core/dev.c:7191) net_rx_action (net/core/dev.c:7262 net/core/dev.c:7382) handle_softirqs (kernel/softirq.c:561) __irq_exit_rcu (kernel/softirq.c:596 kernel/softirq.c:435 kernel/softirq.c:662) irq_exit_rcu (kernel/softirq.c:680) common_interrupt (arch/x86/kernel/irq.c:280 (discriminator 14)) asm_common_interrupt (./arch/x86/include/asm/idtentry.h:693) RIP: 0010:default_idle (./arch/x86/include/asm/irqflags.h:37 ./arch/x86/include/asm/irqflags.h:92 arch/x86/kernel/process.c:744) Code: 4c 01 c7 4c 29 c2 e9 72 ff ff ff 90 90 90 90 90 90 90 90 90 90 90 90 90 90 90 90 f3 0f 1e fa eb 07 0f 00 2d c3 2b 15 00 fb f4 c3 cc cc cc cc 66 66 2e 0f 1f 84 00 00 00 00 00 90 90 90 90 90 RSP: 0018:ffa00000000ffee8 EFLAGS: 00000202 RAX: 000000000000640b RBX: ff1100010091c200 RCX: 0000000000061aa4 RDX: 0000000000000000 RSI: 0000000000000000 RDI: ffffffff812f30c5 RBP: 000000000000000a R08: 0000000000000001 R09: 0000000000000000 R10: 0000000000000001 R11: 0000000000000002 R12: 0000000000000000 R13: 0000000000000000 R14: 0000000000000000 R15: 0000000000000000 ? 
do_idle (kernel/sched/idle.c:186 kernel/sched/idle.c:325) default_idle_call (./include/linux/cpuidle.h:143 kernel/sched/idle.c:118) do_idle (kernel/sched/idle.c:186 kernel/sched/idle.c:325) cpu_startup_entry (kernel/sched/idle.c:422 (discriminator 1)) start_secondary (arch/x86/kernel/smpboot.c:315) common_startup_64 (arch/x86/kernel/head_64.S:421) Modules linked in: cifs_arc4 nls_ucs2_utils cifs_md4 [last unloaded: cifs] CR2: 00000000000000c4 Fixes: ed07536ed673 ("[PATCH] lockdep: annotate nfs/nfsd in-kernel sockets") Signed-off-by: Kuniyuki Iwashima Cc: stable@vger.kernel.org Link: https://patch.msgid.link/20250407163313.22682-1-kuniyu@amazon.com Signed-off-by: Jakub Kicinski --- include/net/sock.h | 40 ++++++++++++++++++++++++++++++++++++++-- net/core/sock.c | 5 +++++ 2 files changed, 43 insertions(+), 2 deletions(-) diff --git a/include/net/sock.h b/include/net/sock.h index 8daf1b3b12c6..694f954258d4 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -339,6 +339,8 @@ struct sk_filter; * @sk_txtime_unused: unused txtime flags * @ns_tracker: tracker for netns reference * @sk_user_frags: xarray of pages the user is holding a reference on. + * @sk_owner: reference to the real owner of the socket that calls + * sock_lock_init_class_and_name(). 
*/ struct sock { /* @@ -547,6 +549,10 @@ struct sock { struct rcu_head sk_rcu; netns_tracker ns_tracker; struct xarray sk_user_frags; + +#if IS_ENABLED(CONFIG_PROVE_LOCKING) && IS_ENABLED(CONFIG_MODULES) + struct module *sk_owner; +#endif }; struct sock_bh_locked { @@ -1583,6 +1589,35 @@ static inline void sk_mem_uncharge(struct sock *sk, int size) sk_mem_reclaim(sk); } +#if IS_ENABLED(CONFIG_PROVE_LOCKING) && IS_ENABLED(CONFIG_MODULES) +static inline void sk_owner_set(struct sock *sk, struct module *owner) +{ + __module_get(owner); + sk->sk_owner = owner; +} + +static inline void sk_owner_clear(struct sock *sk) +{ + sk->sk_owner = NULL; +} + +static inline void sk_owner_put(struct sock *sk) +{ + module_put(sk->sk_owner); +} +#else +static inline void sk_owner_set(struct sock *sk, struct module *owner) +{ +} + +static inline void sk_owner_clear(struct sock *sk) +{ +} + +static inline void sk_owner_put(struct sock *sk) +{ +} +#endif /* * Macro so as to not evaluate some arguments when * lockdep is not enabled. 
@@ -1592,13 +1627,14 @@ static inline void sk_mem_uncharge(struct sock *sk, int size) */ #define sock_lock_init_class_and_name(sk, sname, skey, name, key) \ do { \ + sk_owner_set(sk, THIS_MODULE); \ sk->sk_lock.owned = 0; \ init_waitqueue_head(&sk->sk_lock.wq); \ spin_lock_init(&(sk)->sk_lock.slock); \ debug_check_no_locks_freed((void *)&(sk)->sk_lock, \ - sizeof((sk)->sk_lock)); \ + sizeof((sk)->sk_lock)); \ lockdep_set_class_and_name(&(sk)->sk_lock.slock, \ - (skey), (sname)); \ + (skey), (sname)); \ lockdep_init_map(&(sk)->sk_lock.dep_map, (name), (key), 0); \ } while (0) diff --git a/net/core/sock.c b/net/core/sock.c index 323892066def..739a79859828 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -2130,6 +2130,8 @@ lenout: */ static inline void sock_lock_init(struct sock *sk) { + sk_owner_clear(sk); + if (sk->sk_kern_sock) sock_lock_init_class_and_name( sk, @@ -2226,6 +2228,9 @@ static void sk_prot_free(struct proto *prot, struct sock *sk) cgroup_sk_free(&sk->sk_cgrp_data); mem_cgroup_sk_free(sk); security_sk_free(sk); + + sk_owner_put(sk); + if (slab != NULL) kmem_cache_free(slab, sk); else -- 2.50.1 From aabc6596ffb377c4c9c8f335124b92ea282c9821 Mon Sep 17 00:00:00 2001 From: Arnaud Lecomte Date: Tue, 8 Apr 2025 17:55:08 +0200 Subject: [PATCH 05/16] net: ppp: Add bound checking for skb data on ppp_sync_txmung MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit Ensure we have enough data in linear buffer from skb before accessing initial bytes. This prevents potential out-of-bounds accesses when processing short packets. 
When ppp_sync_txmunge receives an incoming packet with an empty payload: (remote) gef➤ p *(struct pppoe_hdr *) (skb->head + skb->network_header) $18 = { type = 0x1, ver = 0x1, code = 0x0, sid = 0x2, length = 0x0, tag = 0xffff8880371cdb96 } from the skb struct (trimmed) tail = 0x16, end = 0x140, head = 0xffff88803346f400 "4", data = 0xffff88803346f416 ":\377", truesize = 0x380, len = 0x0, data_len = 0x0, mac_len = 0xe, hdr_len = 0x0, it is not safe to access data[2]. Reported-by: syzbot+29fc8991b0ecb186cf40@syzkaller.appspotmail.com Closes: https://syzkaller.appspot.com/bug?extid=29fc8991b0ecb186cf40 Tested-by: syzbot+29fc8991b0ecb186cf40@syzkaller.appspotmail.com Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Signed-off-by: Arnaud Lecomte Link: https://patch.msgid.link/20250408-bound-checking-ppp_txmung-v2-1-94bb6e1b92d0@arnaud-lcm.com [pabeni@redhat.com: fixed subj typo] Signed-off-by: Paolo Abeni --- drivers/net/ppp/ppp_synctty.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/drivers/net/ppp/ppp_synctty.c b/drivers/net/ppp/ppp_synctty.c index 644e99fc3623..9c4932198931 100644 --- a/drivers/net/ppp/ppp_synctty.c +++ b/drivers/net/ppp/ppp_synctty.c @@ -506,6 +506,11 @@ ppp_sync_txmunge(struct syncppp *ap, struct sk_buff *skb) unsigned char *data; int islcp; + /* Ensure we can safely access protocol field and LCP code */ + if (!pskb_may_pull(skb, 3)) { + kfree_skb(skb); + return NULL; + } data = skb->data; proto = get_unaligned_be16(data); -- 2.50.1 From e042ed950d4e176379ba4c0722146cd96fb38aa2 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Mon, 7 Apr 2025 19:40:18 +0200 Subject: [PATCH 06/16] nft_set_pipapo: fix incorrect avx2 match of 5th field octet Given a set element like: icmpv6 . dead:beef:00ff::1 The value of 'ff' is irrelevant, any address will be matched as long as the other octets are the same. 
This is because of too-early register clobbering: ymm7 is reloaded with new packet data (pkt[9]) but it still holds data of an earlier load that wasn't processed yet. The existing tests in nft_concat_range.sh selftests do exercise this code path, but do not trigger incorrect matching due to the network prefix limitation. Fixes: 7400b063969b ("nft_set_pipapo: Introduce AVX2-based lookup implementation") Reported-by: sontu mazumdar Closes: https://lore.kernel.org/netfilter/CANgxkqwnMH7fXra+VUfODT-8+qFLgskq3set1cAzqqJaV4iEZg@mail.gmail.com/T/#t Reviewed-by: Stefano Brivio Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nft_set_pipapo_avx2.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/net/netfilter/nft_set_pipapo_avx2.c b/net/netfilter/nft_set_pipapo_avx2.c index b8d3c3213efe..c15db28c5ebc 100644 --- a/net/netfilter/nft_set_pipapo_avx2.c +++ b/net/netfilter/nft_set_pipapo_avx2.c @@ -994,8 +994,9 @@ static int nft_pipapo_avx2_lookup_8b_16(unsigned long *map, unsigned long *fill, NFT_PIPAPO_AVX2_BUCKET_LOAD8(5, lt, 8, pkt[8], bsize); NFT_PIPAPO_AVX2_AND(6, 2, 3); + NFT_PIPAPO_AVX2_AND(3, 4, 7); NFT_PIPAPO_AVX2_BUCKET_LOAD8(7, lt, 9, pkt[9], bsize); - NFT_PIPAPO_AVX2_AND(0, 4, 5); + NFT_PIPAPO_AVX2_AND(0, 3, 5); NFT_PIPAPO_AVX2_BUCKET_LOAD8(1, lt, 10, pkt[10], bsize); NFT_PIPAPO_AVX2_AND(2, 6, 7); NFT_PIPAPO_AVX2_BUCKET_LOAD8(3, lt, 11, pkt[11], bsize); -- 2.50.1 From 27eb86e22f1067a39f05e8878fd83f00e3311dc3 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Mon, 7 Apr 2025 19:40:19 +0200 Subject: [PATCH 07/16] selftests: netfilter: add test case for recent mismatch bug Without "nft_set_pipapo: fix incorrect avx2 match of 5th field octet" this fails: TEST: reported issues Add two elements, flush, re-add 1s [ OK ] net,mac with reload 0s [ OK ] net,port,proto 3s [ OK ] avx2 false match 0s [FAIL] False match for fe80:dead:01fe:0a02:0b03:6007:8009:a001 Other tests do not detect the kernel bug as they only alter 
parts in the /64 netmask. Reviewed-by: Stefano Brivio Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- .../net/netfilter/nft_concat_range.sh | 39 ++++++++++++++++++- 1 file changed, 38 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/net/netfilter/nft_concat_range.sh b/tools/testing/selftests/net/netfilter/nft_concat_range.sh index 47088b005390..1f5979c1510c 100755 --- a/tools/testing/selftests/net/netfilter/nft_concat_range.sh +++ b/tools/testing/selftests/net/netfilter/nft_concat_range.sh @@ -27,7 +27,7 @@ TYPES="net_port port_net net6_port port_proto net6_port_mac net6_port_mac_proto net6_port_net6_port net_port_mac_proto_net" # Reported bugs, also described by TYPE_ variables below -BUGS="flush_remove_add reload net_port_proto_match" +BUGS="flush_remove_add reload net_port_proto_match avx2_mismatch" # List of possible paths to pktgen script from kernel tree for performance tests PKTGEN_SCRIPT_PATHS=" @@ -387,6 +387,25 @@ race_repeat 0 perf_duration 0 " + +TYPE_avx2_mismatch=" +display avx2 false match +type_spec inet_proto . ipv6_addr +chain_spec meta l4proto . ip6 daddr +dst proto addr6 +src +start 1 +count 1 +src_delta 1 +tools ping +proto icmp6 + +race_repeat 0 + +perf_duration 0 +" + + # Set template for all tests, types and rules are filled in depending on test set_template=' flush ruleset @@ -1629,6 +1648,24 @@ test_bug_net_port_proto_match() { nft flush ruleset } +test_bug_avx2_mismatch() +{ + setup veth send_"${proto}" set || return ${ksft_skip} + + local a1="fe80:dead:01ff:0a02:0b03:6007:8009:a001" + local a2="fe80:dead:01fe:0a02:0b03:6007:8009:a001" + + nft "add element inet filter test { icmpv6 . 
$a1 }" + + dst_addr6="$a2" + send_icmp6 + + if [ "$(count_packets)" -gt "0" ]; then + err "False match for $a2" + return 1 + fi +} + test_reported_issues() { eval test_bug_"${subtest}" } -- 2.50.1 From eaa517b77e63442260640d875f824d1111ca6569 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Wed, 9 Apr 2025 14:24:40 +0300 Subject: [PATCH 08/16] ethtool: cmis_cdb: Fix incorrect read / write length extension The 'read_write_len_ext' field in 'struct ethtool_cmis_cdb_cmd_args' stores the maximum number of bytes that can be read from or written to the Local Payload (LPL) page in a single multi-byte access. Cited commit started overwriting this field with the maximum number of bytes that can be read from or written to the Extended Payload (EPL) pages in a single multi-byte access. Transceiver modules that support auto paging can advertise a number larger than 255 which is problematic as 'read_write_len_ext' is a 'u8', resulting in the number getting truncated and firmware flashing failing [1]. Fix by ignoring the maximum EPL access size as the kernel does not currently support auto paging (even if the transceiver module does) and will not try to read / write more than 128 bytes at once. [1] Transceiver module firmware flashing started for device enp177s0np0 Transceiver module firmware flashing in progress for device enp177s0np0 Progress: 0% Transceiver module firmware flashing encountered an error for device enp177s0np0 Status message: Write FW block EPL command failed, LPL length is longer than CDB read write length extension allows. 
Fixes: 9a3b0d078bd8 ("net: ethtool: Add support for writing firmware blocks using EPL payload") Reported-by: Damodharam Ammepalli Closes: https://lore.kernel.org/netdev/20250402183123.321036-3-michael.chan@broadcom.com/ Tested-by: Damodharam Ammepalli Signed-off-by: Ido Schimmel Reviewed-by: Damodharam Ammepalli Reviewed-by: Petr Machata Link: https://patch.msgid.link/20250409112440.365672-1-idosch@nvidia.com Signed-off-by: Paolo Abeni --- net/ethtool/cmis.h | 1 - net/ethtool/cmis_cdb.c | 18 +++--------------- 2 files changed, 3 insertions(+), 16 deletions(-) diff --git a/net/ethtool/cmis.h b/net/ethtool/cmis.h index 1e790413db0e..4a9a946cabf0 100644 --- a/net/ethtool/cmis.h +++ b/net/ethtool/cmis.h @@ -101,7 +101,6 @@ struct ethtool_cmis_cdb_rpl { }; u32 ethtool_cmis_get_max_lpl_size(u8 num_of_byte_octs); -u32 ethtool_cmis_get_max_epl_size(u8 num_of_byte_octs); void ethtool_cmis_cdb_compose_args(struct ethtool_cmis_cdb_cmd_args *args, enum ethtool_cmis_cdb_cmd_id cmd, u8 *lpl, diff --git a/net/ethtool/cmis_cdb.c b/net/ethtool/cmis_cdb.c index d159dc121bde..0e2691ccb0df 100644 --- a/net/ethtool/cmis_cdb.c +++ b/net/ethtool/cmis_cdb.c @@ -16,15 +16,6 @@ u32 ethtool_cmis_get_max_lpl_size(u8 num_of_byte_octs) return 8 * (1 + min_t(u8, num_of_byte_octs, 15)); } -/* For accessing the EPL field on page 9Fh, the allowable length extension is - * min(i, 255) byte octets where i specifies the allowable additional number of - * byte octets in a READ or a WRITE. 
- */ -u32 ethtool_cmis_get_max_epl_size(u8 num_of_byte_octs) -{ - return 8 * (1 + min_t(u8, num_of_byte_octs, 255)); -} - void ethtool_cmis_cdb_compose_args(struct ethtool_cmis_cdb_cmd_args *args, enum ethtool_cmis_cdb_cmd_id cmd, u8 *lpl, u8 lpl_len, u8 *epl, u16 epl_len, @@ -33,19 +24,16 @@ void ethtool_cmis_cdb_compose_args(struct ethtool_cmis_cdb_cmd_args *args, { args->req.id = cpu_to_be16(cmd); args->req.lpl_len = lpl_len; - if (lpl) { + if (lpl) memcpy(args->req.payload, lpl, args->req.lpl_len); - args->read_write_len_ext = - ethtool_cmis_get_max_lpl_size(read_write_len_ext); - } if (epl) { args->req.epl_len = cpu_to_be16(epl_len); args->req.epl = epl; - args->read_write_len_ext = - ethtool_cmis_get_max_epl_size(read_write_len_ext); } args->max_duration = max_duration; + args->read_write_len_ext = + ethtool_cmis_get_max_lpl_size(read_write_len_ext); args->msleep_pre_rpl = msleep_pre_rpl; args->rpl_exp_len = rpl_exp_len; args->flags = flags; -- 2.50.1 From 6afd0a3c7ecb5049d75801a3efda0ada70483bd0 Mon Sep 17 00:00:00 2001 From: David Wei Date: Wed, 9 Apr 2025 09:31:53 -0700 Subject: [PATCH 09/16] io_uring/zcrx: enable tcp-data-split in selftest For bnxt when the agg ring is used then tcp-data-split is automatically reported to be enabled, but __net_mp_open_rxq() requires tcp-data-split to be explicitly enabled by the user. Enable tcp-data-split explicitly in io_uring zc rx selftest. 
Signed-off-by: David Wei Link: https://patch.msgid.link/20250409163153.2747918-1-dw@davidwei.uk Signed-off-by: Jakub Kicinski --- tools/testing/selftests/drivers/net/hw/iou-zcrx.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tools/testing/selftests/drivers/net/hw/iou-zcrx.py b/tools/testing/selftests/drivers/net/hw/iou-zcrx.py index 9f271ab6ec04..6a0378e06cab 100755 --- a/tools/testing/selftests/drivers/net/hw/iou-zcrx.py +++ b/tools/testing/selftests/drivers/net/hw/iou-zcrx.py @@ -35,6 +35,7 @@ def test_zcrx(cfg) -> None: rx_ring = _get_rx_ring_entries(cfg) try: + ethtool(f"-G {cfg.ifname} tcp-data-split on", host=cfg.remote) ethtool(f"-G {cfg.ifname} rx 64", host=cfg.remote) ethtool(f"-X {cfg.ifname} equal {combined_chans - 1}", host=cfg.remote) flow_rule_id = _set_flow_rule(cfg, combined_chans - 1) @@ -48,6 +49,7 @@ def test_zcrx(cfg) -> None: ethtool(f"-N {cfg.ifname} delete {flow_rule_id}", host=cfg.remote) ethtool(f"-X {cfg.ifname} default", host=cfg.remote) ethtool(f"-G {cfg.ifname} rx {rx_ring}", host=cfg.remote) + ethtool(f"-G {cfg.ifname} tcp-data-split auto", host=cfg.remote) def test_zcrx_oneshot(cfg) -> None: @@ -59,6 +61,7 @@ def test_zcrx_oneshot(cfg) -> None: rx_ring = _get_rx_ring_entries(cfg) try: + ethtool(f"-G {cfg.ifname} tcp-data-split on", host=cfg.remote) ethtool(f"-G {cfg.ifname} rx 64", host=cfg.remote) ethtool(f"-X {cfg.ifname} equal {combined_chans - 1}", host=cfg.remote) flow_rule_id = _set_flow_rule(cfg, combined_chans - 1) @@ -72,6 +75,7 @@ def test_zcrx_oneshot(cfg) -> None: ethtool(f"-N {cfg.ifname} delete {flow_rule_id}", host=cfg.remote) ethtool(f"-X {cfg.ifname} default", host=cfg.remote) ethtool(f"-G {cfg.ifname} rx {rx_ring}", host=cfg.remote) + ethtool(f"-G {cfg.ifname} tcp-data-split auto", host=cfg.remote) def main() -> None: -- 2.50.1 From cfe82469a00f0c0983bf4652de3a2972637dfc56 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Tue, 8 Apr 2025 13:46:17 -0400 Subject: [PATCH 10/16] ipv6: add exception routes to GC 
list in rt6_insert_exception Commit 5eb902b8e719 ("net/ipv6: Remove expired routes with a separated list of routes.") introduced a separated list for managing route expiration via the GC timer. However, it missed adding exception routes (created by ip6_rt_update_pmtu() and rt6_do_redirect()) to this GC list. As a result, these exceptions were never considered for expiration and removal, leading to stale entries persisting in the routing table. This patch fixes the issue by calling fib6_add_gc_list() in rt6_insert_exception(), ensuring that exception routes are properly tracked and garbage collected when expired. Fixes: 5eb902b8e719 ("net/ipv6: Remove expired routes with a separated list of routes.") Reported-by: Jianlin Shi Signed-off-by: Xin Long Reviewed-by: David Ahern Link: https://patch.msgid.link/837e7506ffb63f47faa2b05d9b85481aad28e1a4.1744134377.git.lucien.xin@gmail.com Signed-off-by: Jakub Kicinski --- net/ipv6/route.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 210b84cecc24..96f1621e2381 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -1771,6 +1771,7 @@ out: if (!err) { spin_lock_bh(&f6i->fib6_table->tb6_lock); fib6_update_sernum(net, f6i); + fib6_add_gc_list(f6i); spin_unlock_bh(&f6i->fib6_table->tb6_lock); fib6_force_start_gc(net); } -- 2.50.1 From 8e404ad95d2c10c261e2ef6992c7c12dde03df0e Mon Sep 17 00:00:00 2001 From: Christopher S M Hall Date: Tue, 1 Apr 2025 16:35:29 -0700 Subject: [PATCH 11/16] igc: fix PTM cycle trigger logic Writing to clear the PTM status 'valid' bit while the PTM cycle is triggered results in unreliable PTM operation. To fix this, clear the PTM 'trigger' and status after each PTM transaction. The issue can be reproduced with the following: $ sudo phc2sys -R 1000 -O 0 -i tsn0 -m Note: 1000 Hz (-R 1000) is unrealistically large, but provides a way to quickly reproduce the issue. 
PHC2SYS exits with: "ioctl PTP_OFFSET_PRECISE: Connection timed out" when the PTM transaction fails This patch also fixes a hang in igc_probe() when loading the igc driver in the kdump kernel on systems supporting PTM. The igc driver running in the base kernel enables PTM trigger in igc_probe(). Therefore the driver is always in PTM trigger mode, except in brief periods when manually triggering a PTM cycle. When a crash occurs, the NIC is reset while PTM trigger is enabled. Due to a hardware problem, the NIC is subsequently in a bad busmaster state and doesn't handle register reads/writes. When running igc_probe() in the kdump kernel, the first register access to a NIC register hangs driver probing and ultimately breaks kdump. With this patch, igc has PTM trigger disabled most of the time, and the trigger is only enabled for very brief (10 - 100 us) periods when manually triggering a PTM cycle. Chances that a crash occurs during a PTM trigger are not 0, but extremely reduced. Fixes: a90ec8483732 ("igc: Add support for PTP getcrosststamp()") Reviewed-by: Michal Swiatkowski Tested-by: Mor Bar-Gabay Tested-by: Avigail Dahan Signed-off-by: Christopher S M Hall Reviewed-by: Corinna Vinschen Signed-off-by: Jacob Keller Tested-by: Corinna Vinschen Acked-by: Vinicius Costa Gomes Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/igc/igc_defines.h | 1 + drivers/net/ethernet/intel/igc/igc_ptp.c | 70 ++++++++++++-------- 2 files changed, 42 insertions(+), 29 deletions(-) diff --git a/drivers/net/ethernet/intel/igc/igc_defines.h b/drivers/net/ethernet/intel/igc/igc_defines.h index 8e449904aa7d..2ff292f5f63b 100644 --- a/drivers/net/ethernet/intel/igc/igc_defines.h +++ b/drivers/net/ethernet/intel/igc/igc_defines.h @@ -593,6 +593,7 @@ #define IGC_PTM_STAT_T4M1_OVFL BIT(3) /* T4 minus T1 overflow */ #define IGC_PTM_STAT_ADJUST_1ST BIT(4) /* 1588 timer adjusted during 1st PTM cycle */ #define IGC_PTM_STAT_ADJUST_CYC BIT(5) /* 1588 timer adjusted during non-1st PTM cycle */ 
+#define IGC_PTM_STAT_ALL GENMASK(5, 0) /* Used to clear all status */ /* PCIe PTM Cycle Control */ #define IGC_PTM_CYCLE_CTRL_CYC_TIME(msec) ((msec) & 0x3ff) /* PTM Cycle Time (msec) */ diff --git a/drivers/net/ethernet/intel/igc/igc_ptp.c b/drivers/net/ethernet/intel/igc/igc_ptp.c index 946edbad4302..c640e346342b 100644 --- a/drivers/net/ethernet/intel/igc/igc_ptp.c +++ b/drivers/net/ethernet/intel/igc/igc_ptp.c @@ -974,13 +974,40 @@ static void igc_ptm_log_error(struct igc_adapter *adapter, u32 ptm_stat) } } +static void igc_ptm_trigger(struct igc_hw *hw) +{ + u32 ctrl; + + /* To "manually" start the PTM cycle we need to set the + * trigger (TRIG) bit + */ + ctrl = rd32(IGC_PTM_CTRL); + ctrl |= IGC_PTM_CTRL_TRIG; + wr32(IGC_PTM_CTRL, ctrl); + /* Perform flush after write to CTRL register otherwise + * transaction may not start + */ + wrfl(); +} + +static void igc_ptm_reset(struct igc_hw *hw) +{ + u32 ctrl; + + ctrl = rd32(IGC_PTM_CTRL); + ctrl &= ~IGC_PTM_CTRL_TRIG; + wr32(IGC_PTM_CTRL, ctrl); + /* Write to clear all status */ + wr32(IGC_PTM_STAT, IGC_PTM_STAT_ALL); +} + static int igc_phc_get_syncdevicetime(ktime_t *device, struct system_counterval_t *system, void *ctx) { - u32 stat, t2_curr_h, t2_curr_l, ctrl; struct igc_adapter *adapter = ctx; struct igc_hw *hw = &adapter->hw; + u32 stat, t2_curr_h, t2_curr_l; int err, count = 100; ktime_t t1, t2_curr; @@ -994,25 +1021,13 @@ static int igc_phc_get_syncdevicetime(ktime_t *device, * are transitory. Repeating the process returns valid * data eventually. */ - - /* To "manually" start the PTM cycle we need to clear and - * then set again the TRIG bit. - */ - ctrl = rd32(IGC_PTM_CTRL); - ctrl &= ~IGC_PTM_CTRL_TRIG; - wr32(IGC_PTM_CTRL, ctrl); - ctrl |= IGC_PTM_CTRL_TRIG; - wr32(IGC_PTM_CTRL, ctrl); - - /* The cycle only starts "for real" when software notifies - * that it has read the registers, this is done by setting - * VALID bit. 
- */ - wr32(IGC_PTM_STAT, IGC_PTM_STAT_VALID); + igc_ptm_trigger(hw); err = readx_poll_timeout(rd32, IGC_PTM_STAT, stat, stat, IGC_PTM_STAT_SLEEP, IGC_PTM_STAT_TIMEOUT); + igc_ptm_reset(hw); + if (err < 0) { netdev_err(adapter->netdev, "Timeout reading IGC_PTM_STAT register\n"); return err; @@ -1021,15 +1036,7 @@ static int igc_phc_get_syncdevicetime(ktime_t *device, if ((stat & IGC_PTM_STAT_VALID) == IGC_PTM_STAT_VALID) break; - if (stat & ~IGC_PTM_STAT_VALID) { - /* An error occurred, log it. */ - igc_ptm_log_error(adapter, stat); - /* The STAT register is write-1-to-clear (W1C), - * so write the previous error status to clear it. - */ - wr32(IGC_PTM_STAT, stat); - continue; - } + igc_ptm_log_error(adapter, stat); } while (--count); if (!count) { @@ -1255,7 +1262,7 @@ void igc_ptp_stop(struct igc_adapter *adapter) void igc_ptp_reset(struct igc_adapter *adapter) { struct igc_hw *hw = &adapter->hw; - u32 cycle_ctrl, ctrl; + u32 cycle_ctrl, ctrl, stat; unsigned long flags; u32 timadj; @@ -1290,14 +1297,19 @@ void igc_ptp_reset(struct igc_adapter *adapter) ctrl = IGC_PTM_CTRL_EN | IGC_PTM_CTRL_START_NOW | IGC_PTM_CTRL_SHRT_CYC(IGC_PTM_SHORT_CYC_DEFAULT) | - IGC_PTM_CTRL_PTM_TO(IGC_PTM_TIMEOUT_DEFAULT) | - IGC_PTM_CTRL_TRIG; + IGC_PTM_CTRL_PTM_TO(IGC_PTM_TIMEOUT_DEFAULT); wr32(IGC_PTM_CTRL, ctrl); /* Force the first cycle to run. */ - wr32(IGC_PTM_STAT, IGC_PTM_STAT_VALID); + igc_ptm_trigger(hw); + + if (readx_poll_timeout_atomic(rd32, IGC_PTM_STAT, stat, + stat, IGC_PTM_STAT_SLEEP, + IGC_PTM_STAT_TIMEOUT)) + netdev_err(adapter->netdev, "Timeout reading IGC_PTM_STAT register\n"); + igc_ptm_reset(hw); break; default: /* No work to do. */ -- 2.50.1 From 714cd033da6fea4cf54a11b3cfd070afde3f31df Mon Sep 17 00:00:00 2001 From: Christopher S M Hall Date: Tue, 1 Apr 2025 16:35:30 -0700 Subject: [PATCH 12/16] igc: increase wait time before retrying PTM The i225/i226 hardware retries if it receives an inappropriate response from the upstream device. 
If the device retries too quickly, the root port does not respond. The wait between attempts was reduced from 10us to 1us in commit 6b8aa753a9f9 ("igc: Decrease PTM short interval from 10 us to 1 us"), which said: With the 10us interval, we were seeing PTM transactions take around 12us. Hardware team suggested this interval could be lowered to 1us which was confirmed with PCIe sniffer. With the 1us interval, PTM dialogs took around 2us. While a 1us short cycle time was thought to be theoretically sufficient, it turns out in practice it is not quite long enough. It is unclear if the problem is in the root port or an issue in i225/i226. Increase the wait from 1us to 4us. Increasing to 2us appeared to work in practice on the setups we have available. A value of 4us was chosen due to the limited hardware available for testing, with a goal of ensuring we wait long enough without overly penalizing the response time when unnecessary. The issue can be reproduced with the following: $ sudo phc2sys -R 1000 -O 0 -i tsn0 -m Note: 1000 Hz (-R 1000) is unrealistically large, but provides a way to quickly reproduce the issue. 
PHC2SYS exits with: "ioctl PTP_OFFSET_PRECISE: Connection timed out" when the PTM transaction fails Fixes: 6b8aa753a9f9 ("igc: Decrease PTM short interval from 10 us to 1 us") Reviewed-by: Michal Swiatkowski Tested-by: Mor Bar-Gabay Tested-by: Avigail Dahan Signed-off-by: Christopher S M Hall Reviewed-by: Corinna Vinschen Signed-off-by: Jacob Keller Acked-by: Vinicius Costa Gomes Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/igc/igc_defines.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/intel/igc/igc_defines.h b/drivers/net/ethernet/intel/igc/igc_defines.h index 2ff292f5f63b..d19325b0e6e0 100644 --- a/drivers/net/ethernet/intel/igc/igc_defines.h +++ b/drivers/net/ethernet/intel/igc/igc_defines.h @@ -574,7 +574,10 @@ #define IGC_PTM_CTRL_SHRT_CYC(usec) (((usec) & 0x3f) << 2) #define IGC_PTM_CTRL_PTM_TO(usec) (((usec) & 0xff) << 8) -#define IGC_PTM_SHORT_CYC_DEFAULT 1 /* Default short cycle interval */ +/* A short cycle time of 1us theoretically should work, but appears to be too + * short in practice. + */ +#define IGC_PTM_SHORT_CYC_DEFAULT 4 /* Default short cycle interval */ #define IGC_PTM_CYC_TIME_DEFAULT 5 /* Default PTM cycle time */ #define IGC_PTM_TIMEOUT_DEFAULT 255 /* Default timeout for PTM errors */ -- 2.50.1 From cd7f7328d691937102732f39f97ead35b15bf803 Mon Sep 17 00:00:00 2001 From: Christopher S M Hall Date: Tue, 1 Apr 2025 16:35:31 -0700 Subject: [PATCH 13/16] igc: move ktime snapshot into PTM retry loop Move ktime_get_snapshot() into the loop. If a retry does occur, a more recent snapshot will result in a more accurate cross-timestamp. 
Fixes: a90ec8483732 ("igc: Add support for PTP getcrosststamp()") Reviewed-by: Michal Swiatkowski Tested-by: Mor Bar-Gabay Tested-by: Avigail Dahan Signed-off-by: Christopher S M Hall Reviewed-by: Corinna Vinschen Signed-off-by: Jacob Keller Acked-by: Vinicius Costa Gomes Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/igc/igc_ptp.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/drivers/net/ethernet/intel/igc/igc_ptp.c b/drivers/net/ethernet/intel/igc/igc_ptp.c index c640e346342b..516abe7405de 100644 --- a/drivers/net/ethernet/intel/igc/igc_ptp.c +++ b/drivers/net/ethernet/intel/igc/igc_ptp.c @@ -1011,16 +1011,16 @@ static int igc_phc_get_syncdevicetime(ktime_t *device, int err, count = 100; ktime_t t1, t2_curr; - /* Get a snapshot of system clocks to use as historic value. */ - ktime_get_snapshot(&adapter->snapshot); - + /* Doing this in a loop because in the event of a + * badly timed (ha!) system clock adjustment, we may + * get PTM errors from the PCI root, but these errors + * are transitory. Repeating the process returns valid + * data eventually. + */ do { - /* Doing this in a loop because in the event of a - * badly timed (ha!) system clock adjustment, we may - * get PTM errors from the PCI root, but these errors - * are transitory. Repeating the process returns valid - * data eventually. - */ + /* Get a snapshot of system clocks to use as historic value. */ + ktime_get_snapshot(&adapter->snapshot); + igc_ptm_trigger(hw); err = readx_poll_timeout(rd32, IGC_PTM_STAT, stat, -- 2.50.1 From 26a3910afd111f7c1a96dace6dc02f3225063896 Mon Sep 17 00:00:00 2001 From: Christopher S M Hall Date: Tue, 1 Apr 2025 16:35:32 -0700 Subject: [PATCH 14/16] igc: handle the IGC_PTP_ENABLED flag correctly All functions in igc_ptp.c called from igc_main.c should check the IGC_PTP_ENABLED flag. Add a check for this flag to the stop and reset functions.
Fixes: 5f2958052c58 ("igc: Add basic skeleton for PTP") Signed-off-by: Christopher S M Hall Reviewed-by: Corinna Vinschen Signed-off-by: Jacob Keller Tested-by: Mor Bar-Gabay Acked-by: Vinicius Costa Gomes Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/igc/igc_ptp.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/drivers/net/ethernet/intel/igc/igc_ptp.c b/drivers/net/ethernet/intel/igc/igc_ptp.c index 516abe7405de..343205bffc35 100644 --- a/drivers/net/ethernet/intel/igc/igc_ptp.c +++ b/drivers/net/ethernet/intel/igc/igc_ptp.c @@ -1244,8 +1244,12 @@ void igc_ptp_suspend(struct igc_adapter *adapter) **/ void igc_ptp_stop(struct igc_adapter *adapter) { + if (!(adapter->ptp_flags & IGC_PTP_ENABLED)) + return; + igc_ptp_suspend(adapter); + adapter->ptp_flags &= ~IGC_PTP_ENABLED; if (adapter->ptp_clock) { ptp_clock_unregister(adapter->ptp_clock); netdev_info(adapter->netdev, "PHC removed\n"); @@ -1266,6 +1270,9 @@ void igc_ptp_reset(struct igc_adapter *adapter) unsigned long flags; u32 timadj; + if (!(adapter->ptp_flags & IGC_PTP_ENABLED)) + return; + /* reset the tstamp_config */ igc_ptp_set_timestamp_mode(adapter, &adapter->tstamp_config); -- 2.50.1 From 1f025759ba394dd53e434d2668cb0597886d9b69 Mon Sep 17 00:00:00 2001 From: Christopher S M Hall Date: Tue, 1 Apr 2025 16:35:33 -0700 Subject: [PATCH 15/16] igc: cleanup PTP module if probe fails Make sure that the PTP module is cleaned up if the igc_probe() fails by calling igc_ptp_stop() on exit. 
Fixes: d89f88419f99 ("igc: Add skeletal frame for Intel(R) 2.5G Ethernet Controller support") Signed-off-by: Christopher S M Hall Reviewed-by: Corinna Vinschen Signed-off-by: Jacob Keller Tested-by: Mor Bar-Gabay Acked-by: Vinicius Costa Gomes Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/igc/igc_main.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/ethernet/intel/igc/igc_main.c b/drivers/net/ethernet/intel/igc/igc_main.c index f1330379e6bb..b1669d7cf435 100644 --- a/drivers/net/ethernet/intel/igc/igc_main.c +++ b/drivers/net/ethernet/intel/igc/igc_main.c @@ -7231,6 +7231,7 @@ static int igc_probe(struct pci_dev *pdev, err_register: igc_release_hw_control(adapter); + igc_ptp_stop(adapter); err_eeprom: if (!igc_check_reset_block(hw)) igc_reset_phy(hw); -- 2.50.1 From 1a931c4f5e6862e61a4b130cb76b422e1415f644 Mon Sep 17 00:00:00 2001 From: Christopher S M Hall Date: Tue, 1 Apr 2025 16:35:34 -0700 Subject: [PATCH 16/16] igc: add lock preventing multiple simultaneous PTM transactions Add a mutex around the PTM transaction to prevent multiple simultaneous transactors. When multiple processes try to initiate a PTM transaction at the same time, one or all may fail. This can be reproduced by running two instances of the following: $ sudo phc2sys -O 0 -i tsn0 -m PHC2SYS exits with: "ioctl PTP_OFFSET_PRECISE: Connection timed out" when the PTM transaction fails. Note: Normally two instances of PHC2SYS will not run, but one process should not break another.
Fixes: a90ec8483732 ("igc: Add support for PTP getcrosststamp()") Signed-off-by: Christopher S M Hall Reviewed-by: Corinna Vinschen Signed-off-by: Jacob Keller Tested-by: Mor Bar-Gabay Acked-by: Vinicius Costa Gomes Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/igc/igc.h | 1 + drivers/net/ethernet/intel/igc/igc_ptp.c | 20 ++++++++++++++++++-- 2 files changed, 19 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/intel/igc/igc.h b/drivers/net/ethernet/intel/igc/igc.h index c35cc5cb1185..2f265c0959c7 100644 --- a/drivers/net/ethernet/intel/igc/igc.h +++ b/drivers/net/ethernet/intel/igc/igc.h @@ -319,6 +319,7 @@ struct igc_adapter { struct timespec64 prev_ptp_time; /* Pre-reset PTP clock */ ktime_t ptp_reset_start; /* Reset time in clock mono */ struct system_time_snapshot snapshot; + struct mutex ptm_lock; /* Only allow one PTM transaction at a time */ char fw_version[32]; diff --git a/drivers/net/ethernet/intel/igc/igc_ptp.c b/drivers/net/ethernet/intel/igc/igc_ptp.c index 343205bffc35..612ed26a29c5 100644 --- a/drivers/net/ethernet/intel/igc/igc_ptp.c +++ b/drivers/net/ethernet/intel/igc/igc_ptp.c @@ -974,6 +974,7 @@ static void igc_ptm_log_error(struct igc_adapter *adapter, u32 ptm_stat) } } +/* The PTM lock: adapter->ptm_lock must be held when calling igc_ptm_trigger() */ static void igc_ptm_trigger(struct igc_hw *hw) { u32 ctrl; @@ -990,6 +991,7 @@ static void igc_ptm_trigger(struct igc_hw *hw) wrfl(); } +/* The PTM lock: adapter->ptm_lock must be held when calling igc_ptm_reset() */ static void igc_ptm_reset(struct igc_hw *hw) { u32 ctrl; @@ -1068,9 +1070,16 @@ static int igc_ptp_getcrosststamp(struct ptp_clock_info *ptp, { struct igc_adapter *adapter = container_of(ptp, struct igc_adapter, ptp_caps); + int ret; - return get_device_system_crosststamp(igc_phc_get_syncdevicetime, - adapter, &adapter->snapshot, cts); + /* This blocks until any in progress PTM transactions complete */ + mutex_lock(&adapter->ptm_lock); + + ret = 
get_device_system_crosststamp(igc_phc_get_syncdevicetime, + adapter, &adapter->snapshot, cts); + mutex_unlock(&adapter->ptm_lock); + + return ret; } static int igc_ptp_getcyclesx64(struct ptp_clock_info *ptp, @@ -1169,6 +1178,7 @@ void igc_ptp_init(struct igc_adapter *adapter) spin_lock_init(&adapter->ptp_tx_lock); spin_lock_init(&adapter->free_timer_lock); spin_lock_init(&adapter->tmreg_lock); + mutex_init(&adapter->ptm_lock); adapter->tstamp_config.rx_filter = HWTSTAMP_FILTER_NONE; adapter->tstamp_config.tx_type = HWTSTAMP_TX_OFF; @@ -1181,6 +1191,7 @@ void igc_ptp_init(struct igc_adapter *adapter) if (IS_ERR(adapter->ptp_clock)) { adapter->ptp_clock = NULL; netdev_err(netdev, "ptp_clock_register failed\n"); + mutex_destroy(&adapter->ptm_lock); } else if (adapter->ptp_clock) { netdev_info(netdev, "PHC added\n"); adapter->ptp_flags |= IGC_PTP_ENABLED; @@ -1210,10 +1221,12 @@ static void igc_ptm_stop(struct igc_adapter *adapter) struct igc_hw *hw = &adapter->hw; u32 ctrl; + mutex_lock(&adapter->ptm_lock); ctrl = rd32(IGC_PTM_CTRL); ctrl &= ~IGC_PTM_CTRL_EN; wr32(IGC_PTM_CTRL, ctrl); + mutex_unlock(&adapter->ptm_lock); } /** @@ -1255,6 +1268,7 @@ void igc_ptp_stop(struct igc_adapter *adapter) netdev_info(adapter->netdev, "PHC removed\n"); adapter->ptp_flags &= ~IGC_PTP_ENABLED; } + mutex_destroy(&adapter->ptm_lock); } /** @@ -1294,6 +1308,7 @@ void igc_ptp_reset(struct igc_adapter *adapter) if (!igc_is_crosststamp_supported(adapter)) break; + mutex_lock(&adapter->ptm_lock); wr32(IGC_PCIE_DIG_DELAY, IGC_PCIE_DIG_DELAY_DEFAULT); wr32(IGC_PCIE_PHY_DELAY, IGC_PCIE_PHY_DELAY_DEFAULT); @@ -1317,6 +1332,7 @@ void igc_ptp_reset(struct igc_adapter *adapter) netdev_err(adapter->netdev, "Timeout reading IGC_PTM_STAT register\n"); igc_ptm_reset(hw); + mutex_unlock(&adapter->ptm_lock); break; default: /* No work to do. */ -- 2.50.1