From 009970506c92816c857e1705cf6db68c9147d39f Mon Sep 17 00:00:00 2001 From: Jijie Shao Date: Sat, 17 May 2025 17:58:27 +0800 Subject: [PATCH 01/16] net: hibmcge: fix incorrect statistics update issue When the user dumps statistics, the hibmcge driver automatically updates all statistics. If the driver is performing the reset operation, the error data of 0xFFFFFFFF is updated. Therefore, if the driver is resetting, the hbg_update_stats_by_info() needs to return directly. Fixes: c0bf9bf31e79 ("net: hibmcge: Add support for dump statistics") Signed-off-by: Jijie Shao Reviewed-by: Simon Horman Link: https://patch.msgid.link/20250517095828.1763126-2-shaojijie@huawei.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/hisilicon/hibmcge/hbg_ethtool.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/net/ethernet/hisilicon/hibmcge/hbg_ethtool.c b/drivers/net/ethernet/hisilicon/hibmcge/hbg_ethtool.c index 8f1107b85fbb..55520053270a 100644 --- a/drivers/net/ethernet/hisilicon/hibmcge/hbg_ethtool.c +++ b/drivers/net/ethernet/hisilicon/hibmcge/hbg_ethtool.c @@ -317,6 +317,9 @@ static void hbg_update_stats_by_info(struct hbg_priv *priv, const struct hbg_ethtool_stats *stats; u32 i; + if (test_bit(HBG_NIC_STATE_RESETTING, &priv->state)) + return; + for (i = 0; i < info_len; i++) { stats = &info[i]; if (!stats->reg) -- 2.51.0 From 1b45443b84872c010b36dfc2681413bb1a10011a Mon Sep 17 00:00:00 2001 From: Jijie Shao Date: Sat, 17 May 2025 17:58:28 +0800 Subject: [PATCH 02/16] net: hibmcge: fix wrong ndo.open() after reset fail issue. If the driver reset fails, it may not work properly. Therefore, the ndo.open() operation should be rejected. In this patch, the driver calls netif_device_detach() before the reset and calls netif_device_attach() after the reset succeeds. If the reset fails, netif_device_attach() is not called. Therefore, netdev does not present and cannot be opened. If reset fails, only the PCI reset (via sysfs) can be used to attempt recovery. 
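To illustrate the idea, a minimal sketch of the detach/attach pattern (not the hibmcge code; example_reset() and hw_reset() are made-up names):

#include <linux/netdevice.h>

/*
 * Minimal sketch of "detach before reset, attach only on success".
 * While the netdev is detached, __dev_open() fails with -ENODEV, so
 * ndo_open() cannot run against a device whose reset failed.
 */
static int example_reset(struct net_device *netdev,
			 int (*hw_reset)(struct net_device *netdev))
{
	int ret;

	netif_device_detach(netdev);	/* mark the device as not present */

	ret = hw_reset(netdev);		/* device-specific reset sequence */
	if (ret)
		return ret;		/* keep the netdev detached on failure */

	netif_device_attach(netdev);	/* present again only after success */
	return 0;
}

While detached, __dev_open() bails out with -ENODEV, so userspace cannot bring the interface up until a successful reset (or a PCI-level reset) re-attaches it.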
Fixes: 3f5a61f6d504 ("net: hibmcge: Add reset supported in this module") Signed-off-by: Jijie Shao Reviewed-by: Simon Horman Link: https://patch.msgid.link/20250517095828.1763126-3-shaojijie@huawei.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/hisilicon/hibmcge/hbg_err.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/drivers/net/ethernet/hisilicon/hibmcge/hbg_err.c b/drivers/net/ethernet/hisilicon/hibmcge/hbg_err.c index a0bcfb5a713d..ff3295b60a69 100644 --- a/drivers/net/ethernet/hisilicon/hibmcge/hbg_err.c +++ b/drivers/net/ethernet/hisilicon/hibmcge/hbg_err.c @@ -61,6 +61,8 @@ static int hbg_reset_prepare(struct hbg_priv *priv, enum hbg_reset_type type) return -EBUSY; } + netif_device_detach(priv->netdev); + priv->reset_type = type; set_bit(HBG_NIC_STATE_RESETTING, &priv->state); clear_bit(HBG_NIC_STATE_RESET_FAIL, &priv->state); @@ -91,6 +93,8 @@ static int hbg_reset_done(struct hbg_priv *priv, enum hbg_reset_type type) return ret; } + netif_device_attach(priv->netdev); + dev_info(&priv->pdev->dev, "reset done\n"); return ret; } @@ -117,16 +121,13 @@ void hbg_err_reset(struct hbg_priv *priv) if (running) dev_close(priv->netdev); - hbg_reset(priv); - - /* in hbg_pci_err_detected(), we will detach first, - * so we need to attach before open - */ - if (!netif_device_present(priv->netdev)) - netif_device_attach(priv->netdev); + if (hbg_reset(priv)) + goto err_unlock; if (running) dev_open(priv->netdev, NULL); + +err_unlock: rtnl_unlock(); } @@ -160,7 +161,6 @@ static pci_ers_result_t hbg_pci_err_slot_reset(struct pci_dev *pdev) pci_save_state(pdev); hbg_err_reset(priv); - netif_device_attach(netdev); return PCI_ERS_RESULT_RECOVERED; } -- 2.51.0 From 407e0efdf8baf1672876d5948b75049860a93e59 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 20 May 2025 12:40:30 +0000 Subject: [PATCH 03/16] idpf: fix idpf_vport_splitq_napi_poll() idpf_vport_splitq_napi_poll() can incorrectly return @budget after napi_complete_done() has been called. This violates NAPI rules, because after napi_complete_done(), current thread lost napi ownership. Move the test against POLL_MODE before the napi_complete_done(). Fixes: c2d548cad150 ("idpf: add TX splitq napi poll support") Reported-by: Peter Newman Closes: https://lore.kernel.org/netdev/20250520121908.1805732-1-edumazet@google.com/T/#u Signed-off-by: Eric Dumazet Cc: Joshua Hay Cc: Alan Brady Cc: Madhu Chittim Cc: Phani Burra Cc: Pavan Kumar Linga Link: https://patch.msgid.link/20250520124030.1983936-1-edumazet@google.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/intel/idpf/idpf_txrx.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/drivers/net/ethernet/intel/idpf/idpf_txrx.c b/drivers/net/ethernet/intel/idpf/idpf_txrx.c index bdf52cef3891..2d5f5c9f91ce 100644 --- a/drivers/net/ethernet/intel/idpf/idpf_txrx.c +++ b/drivers/net/ethernet/intel/idpf/idpf_txrx.c @@ -4025,6 +4025,14 @@ static int idpf_vport_splitq_napi_poll(struct napi_struct *napi, int budget) return budget; } + /* Switch to poll mode in the tear-down path after sending disable + * queues virtchnl message, as the interrupts will be disabled after + * that. 
+ */ + if (unlikely(q_vector->num_txq && idpf_queue_has(POLL_MODE, + q_vector->tx[0]))) + return budget; + work_done = min_t(int, work_done, budget - 1); /* Exit the polling mode, but don't re-enable interrupts if stack might @@ -4035,15 +4043,7 @@ static int idpf_vport_splitq_napi_poll(struct napi_struct *napi, int budget) else idpf_vport_intr_set_wb_on_itr(q_vector); - /* Switch to poll mode in the tear-down path after sending disable - * queues virtchnl message, as the interrupts will be disabled after - * that - */ - if (unlikely(q_vector->num_txq && idpf_queue_has(POLL_MODE, - q_vector->tx[0]))) - return budget; - else - return work_done; + return work_done; } /** -- 2.51.0 From 3f981138109f63232a5fb7165938d4c945cc1b9d Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Sun, 18 May 2025 15:20:37 -0700 Subject: [PATCH 04/16] sch_hfsc: Fix qlen accounting bug when using peek in hfsc_enqueue() When enqueuing the first packet to an HFSC class, hfsc_enqueue() calls the child qdisc's peek() operation before incrementing sch->q.qlen and sch->qstats.backlog. If the child qdisc uses qdisc_peek_dequeued(), this may trigger an immediate dequeue and potential packet drop. In such cases, qdisc_tree_reduce_backlog() is called, but the HFSC qdisc's qlen and backlog have not yet been updated, leading to inconsistent queue accounting. This can leave an empty HFSC class in the active list, causing further consequences like use-after-free. This patch fixes the bug by moving the increment of sch->q.qlen and sch->qstats.backlog before the call to the child qdisc's peek() operation. This ensures that queue length and backlog are always accurate when packet drops or dequeues are triggered during the peek. Fixes: 12d0ad3be9c3 ("net/sched/sch_hfsc.c: handle corner cases where head may change invalidating calculated deadline") Reported-by: Mingi Cho Signed-off-by: Cong Wang Reviewed-by: Simon Horman Link: https://patch.msgid.link/20250518222038.58538-2-xiyou.wangcong@gmail.com Reviewed-by: Jamal Hadi Salim Signed-off-by: Paolo Abeni --- net/sched/sch_hfsc.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/net/sched/sch_hfsc.c b/net/sched/sch_hfsc.c index cb8c525ea20e..7986145a527c 100644 --- a/net/sched/sch_hfsc.c +++ b/net/sched/sch_hfsc.c @@ -1569,6 +1569,9 @@ hfsc_enqueue(struct sk_buff *skb, struct Qdisc *sch, struct sk_buff **to_free) return err; } + sch->qstats.backlog += len; + sch->q.qlen++; + if (first && !cl->cl_nactive) { if (cl->cl_flags & HFSC_RSC) init_ed(cl, len); @@ -1584,9 +1587,6 @@ hfsc_enqueue(struct sk_buff *skb, struct Qdisc *sch, struct sk_buff **to_free) } - sch->qstats.backlog += len; - sch->q.qlen++; - return NET_XMIT_SUCCESS; } -- 2.51.0 From c3572acffb751af65505511b847f628c872ffae0 Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Sun, 18 May 2025 15:20:38 -0700 Subject: [PATCH 05/16] selftests/tc-testing: Add an HFSC qlen accounting test This test reproduces a scenario where HFSC queue length and backlog accounting can become inconsistent when a peek operation triggers a dequeue and possible drop before the parent qdisc updates its counters. The test sets up a DRR root qdisc with an HFSC class, netem, and blackhole children, and uses Scapy to inject a packet. It helps to verify that HFSC correctly tracks qlen and backlog even when packets are dropped during peek-induced dequeue. 
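For reference, the accounting rule that the HFSC fix (and this test) relies on can be sketched with a toy classful qdisc; this is a simplified illustration rather than the sch_hfsc code, and toy_enqueue()/toy_sched_data are invented names:

#include <net/sch_generic.h>

struct toy_sched_data {
	struct Qdisc *child;		/* per-class child qdisc */
};

static int toy_enqueue(struct sk_buff *skb, struct Qdisc *sch,
		       struct sk_buff **to_free)
{
	struct toy_sched_data *q = qdisc_priv(sch);
	unsigned int len = qdisc_pkt_len(skb);
	int err;

	err = qdisc_enqueue(skb, q->child, to_free);
	if (err != NET_XMIT_SUCCESS) {
		if (net_xmit_drop_count(err))
			qdisc_qstats_drop(sch);
		return err;
	}

	/* Account on the parent first ... */
	sch->qstats.backlog += len;
	sch->q.qlen++;

	/*
	 * ... and only then poke the child.  If that peek triggers a dequeue
	 * and a drop, qdisc_tree_reduce_backlog() now subtracts from counters
	 * that already include this packet, so qlen/backlog stay consistent.
	 */
	q->child->ops->peek(q->child);

	return NET_XMIT_SUCCESS;
}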
Cc: Mingi Cho Signed-off-by: Cong Wang Reviewed-by: Simon Horman Link: https://patch.msgid.link/20250518222038.58538-3-xiyou.wangcong@gmail.com Reviewed-by: Jamal Hadi Salim Signed-off-by: Paolo Abeni --- .../tc-testing/tc-tests/infra/qdiscs.json | 27 +++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/tools/testing/selftests/tc-testing/tc-tests/infra/qdiscs.json b/tools/testing/selftests/tc-testing/tc-tests/infra/qdiscs.json index a951c0d33cd2..ddc97ecd8b39 100644 --- a/tools/testing/selftests/tc-testing/tc-tests/infra/qdiscs.json +++ b/tools/testing/selftests/tc-testing/tc-tests/infra/qdiscs.json @@ -573,5 +573,32 @@ "teardown": [ "$TC qdisc del dev $DEV1 handle 1: root" ] + }, + { + "id": "831d", + "name": "Test HFSC qlen accounting with DRR/NETEM/BLACKHOLE chain", + "category": ["qdisc", "hfsc", "drr", "netem", "blackhole"], + "plugins": { "requires": ["nsPlugin", "scapyPlugin"] }, + "setup": [ + "$IP link set dev $DEV1 up || true", + "$TC qdisc add dev $DEV1 root handle 1: drr", + "$TC filter add dev $DEV1 parent 1: basic classid 1:1", + "$TC class add dev $DEV1 parent 1: classid 1:1 drr", + "$TC qdisc add dev $DEV1 parent 1:1 handle 2: hfsc def 1", + "$TC class add dev $DEV1 parent 2: classid 2:1 hfsc rt m1 8 d 1 m2 0", + "$TC qdisc add dev $DEV1 parent 2:1 handle 3: netem", + "$TC qdisc add dev $DEV1 parent 3:1 handle 4: blackhole" + ], + "scapy": { + "iface": "$DEV0", + "count": 5, + "packet": "Ether()/IP(dst='10.10.10.1', src='10.10.10.10')/ICMP()" + }, + "cmdUnderTest": "$TC -s qdisc show dev $DEV1", + "expExitCode": "0", + "verifyCmd": "$TC -s qdisc show dev $DEV1", + "matchPattern": "qdisc hfsc", + "matchCount": "1", + "teardown": ["$TC qdisc del dev $DEV1 root handle 1: drr"] } ] -- 2.51.0 From 184fb40f731bd3353b0887731f7caba66609e9cd Mon Sep 17 00:00:00 2001 From: Suman Ghosh Date: Mon, 19 May 2025 12:56:58 +0530 Subject: [PATCH 06/16] octeontx2-pf: Avoid adding dcbnl_ops for LBK and SDP vf Priority flow control is not supported for LBK and SDP vf. This patch adds support to not add dcbnl_ops for LBK and SDP vf. 
Fixes: 8e67558177f8 ("octeontx2-pf: PFC config support with DCBx") Signed-off-by: Suman Ghosh Reviewed-by: Simon Horman Link: https://patch.msgid.link/20250519072658.2960851-1-sumang@marvell.com Signed-off-by: Paolo Abeni --- drivers/net/ethernet/marvell/octeontx2/nic/otx2_vf.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_vf.c b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_vf.c index 7ef3ba477d49..9b28be4c4a5d 100644 --- a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_vf.c +++ b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_vf.c @@ -729,9 +729,12 @@ static int otx2vf_probe(struct pci_dev *pdev, const struct pci_device_id *id) } #ifdef CONFIG_DCB - err = otx2_dcbnl_set_ops(netdev); - if (err) - goto err_free_zc_bmap; + /* Priority flow control is not supported for LBK and SDP vf(s) */ + if (!(is_otx2_lbkvf(vf->pdev) || is_otx2_sdp_rep(vf->pdev))) { + err = otx2_dcbnl_set_ops(netdev); + if (err) + goto err_free_zc_bmap; + } #endif otx2_qos_init(vf, qos_txqs); -- 2.51.0 From e279024617134c94fd3e37470156534d5f2b3472 Mon Sep 17 00:00:00 2001 From: Wang Liang Date: Tue, 20 May 2025 18:14:04 +0800 Subject: [PATCH 07/16] net/tipc: fix slab-use-after-free Read in tipc_aead_encrypt_done Syzbot reported a slab-use-after-free with the following call trace: ================================================================== BUG: KASAN: slab-use-after-free in tipc_aead_encrypt_done+0x4bd/0x510 net/tipc/crypto.c:840 Read of size 8 at addr ffff88807a733000 by task kworker/1:0/25 Call Trace: kasan_report+0xd9/0x110 mm/kasan/report.c:601 tipc_aead_encrypt_done+0x4bd/0x510 net/tipc/crypto.c:840 crypto_request_complete include/crypto/algapi.h:266 aead_request_complete include/crypto/internal/aead.h:85 cryptd_aead_crypt+0x3b8/0x750 crypto/cryptd.c:772 crypto_request_complete include/crypto/algapi.h:266 cryptd_queue_worker+0x131/0x200 crypto/cryptd.c:181 process_one_work+0x9fb/0x1b60 kernel/workqueue.c:3231 Allocated by task 8355: kzalloc_noprof include/linux/slab.h:778 tipc_crypto_start+0xcc/0x9e0 net/tipc/crypto.c:1466 tipc_init_net+0x2dd/0x430 net/tipc/core.c:72 ops_init+0xb9/0x650 net/core/net_namespace.c:139 setup_net+0x435/0xb40 net/core/net_namespace.c:343 copy_net_ns+0x2f0/0x670 net/core/net_namespace.c:508 create_new_namespaces+0x3ea/0xb10 kernel/nsproxy.c:110 unshare_nsproxy_namespaces+0xc0/0x1f0 kernel/nsproxy.c:228 ksys_unshare+0x419/0x970 kernel/fork.c:3323 __do_sys_unshare kernel/fork.c:3394 Freed by task 63: kfree+0x12a/0x3b0 mm/slub.c:4557 tipc_crypto_stop+0x23c/0x500 net/tipc/crypto.c:1539 tipc_exit_net+0x8c/0x110 net/tipc/core.c:119 ops_exit_list+0xb0/0x180 net/core/net_namespace.c:173 cleanup_net+0x5b7/0xbf0 net/core/net_namespace.c:640 process_one_work+0x9fb/0x1b60 kernel/workqueue.c:3231 After freed the tipc_crypto tx by delete namespace, tipc_aead_encrypt_done may still visit it in cryptd_queue_worker workqueue. I reproduce this issue by: ip netns add ns1 ip link add veth1 type veth peer name veth2 ip link set veth1 netns ns1 ip netns exec ns1 tipc bearer enable media eth dev veth1 ip netns exec ns1 tipc node set key this_is_a_master_key master ip netns exec ns1 tipc bearer disable media eth dev veth1 ip netns del ns1 The key of reproduction is that, simd_aead_encrypt is interrupted, leading to crypto_simd_usable() return false. Thus, the cryptd_queue_worker is triggered, and the tipc_crypto tx will be visited. 
tipc_disc_timeout tipc_bearer_xmit_skb tipc_crypto_xmit tipc_aead_encrypt crypto_aead_encrypt // encrypt() simd_aead_encrypt // crypto_simd_usable() is false child = &ctx->cryptd_tfm->base; simd_aead_encrypt crypto_aead_encrypt // encrypt() cryptd_aead_encrypt_enqueue cryptd_aead_enqueue cryptd_enqueue_request // trigger cryptd_queue_worker queue_work_on(smp_processor_id(), cryptd_wq, &cpu_queue->work) Fix this by holding net reference count before encrypt. Reported-by: syzbot+55c12726619ff85ce1f6@syzkaller.appspotmail.com Closes: https://syzkaller.appspot.com/bug?extid=55c12726619ff85ce1f6 Fixes: fc1b6d6de220 ("tipc: introduce TIPC encryption & authentication") Signed-off-by: Wang Liang Link: https://patch.msgid.link/20250520101404.1341730-1-wangliang74@huawei.com Signed-off-by: Paolo Abeni --- net/tipc/crypto.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/net/tipc/crypto.c b/net/tipc/crypto.c index c524421ec652..8584893b4785 100644 --- a/net/tipc/crypto.c +++ b/net/tipc/crypto.c @@ -817,12 +817,16 @@ static int tipc_aead_encrypt(struct tipc_aead *aead, struct sk_buff *skb, goto exit; } + /* Get net to avoid freed tipc_crypto when delete namespace */ + get_net(aead->crypto->net); + /* Now, do encrypt */ rc = crypto_aead_encrypt(req); if (rc == -EINPROGRESS || rc == -EBUSY) return rc; tipc_bearer_put(b); + put_net(aead->crypto->net); exit: kfree(ctx); @@ -860,6 +864,7 @@ static void tipc_aead_encrypt_done(void *data, int err) kfree(tx_ctx); tipc_bearer_put(b); tipc_aead_put(aead); + put_net(net); } /** -- 2.51.0 From 0eefa27b493306928d88af6368193b134c98fd64 Mon Sep 17 00:00:00 2001 From: Subbaraya Sundeep Date: Wed, 21 May 2025 11:38:33 +0530 Subject: [PATCH 08/16] octeontx2-af: Set LMT_ENA bit for APR table entries This patch enables the LMT line for a PF/VF by setting the LMT_ENA bit in the APR_LMT_MAP_ENTRY_S structure. Additionally, it simplifies the logic for calculating the LMTST table index by consistently using the maximum number of hw supported VFs (i.e., 256). Fixes: 873a1e3d207a ("octeontx2-af: cn10k: Setting up lmtst map table"). 
Signed-off-by: Subbaraya Sundeep Signed-off-by: Geetha sowjanya Reviewed-by: Michal Swiatkowski Link: https://patch.msgid.link/20250521060834.19780-2-gakula@marvell.com Signed-off-by: Paolo Abeni --- .../net/ethernet/marvell/octeontx2/af/rvu_cn10k.c | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/marvell/octeontx2/af/rvu_cn10k.c b/drivers/net/ethernet/marvell/octeontx2/af/rvu_cn10k.c index 7fa98aeb3663..3838c04b78c2 100644 --- a/drivers/net/ethernet/marvell/octeontx2/af/rvu_cn10k.c +++ b/drivers/net/ethernet/marvell/octeontx2/af/rvu_cn10k.c @@ -15,13 +15,17 @@ #define LMT_TBL_OP_WRITE 1 #define LMT_MAP_TABLE_SIZE (128 * 1024) #define LMT_MAPTBL_ENTRY_SIZE 16 +#define LMT_MAX_VFS 256 + +#define LMT_MAP_ENTRY_ENA BIT_ULL(20) +#define LMT_MAP_ENTRY_LINES GENMASK_ULL(18, 16) /* Function to perform operations (read/write) on lmtst map table */ static int lmtst_map_table_ops(struct rvu *rvu, u32 index, u64 *val, int lmt_tbl_op) { void __iomem *lmt_map_base; - u64 tbl_base; + u64 tbl_base, cfg; tbl_base = rvu_read64(rvu, BLKADDR_APR, APR_AF_LMT_MAP_BASE); @@ -35,6 +39,13 @@ static int lmtst_map_table_ops(struct rvu *rvu, u32 index, u64 *val, *val = readq(lmt_map_base + index); } else { writeq((*val), (lmt_map_base + index)); + + cfg = FIELD_PREP(LMT_MAP_ENTRY_ENA, 0x1); + /* 2048 LMTLINES */ + cfg |= FIELD_PREP(LMT_MAP_ENTRY_LINES, 0x6); + + writeq(cfg, (lmt_map_base + (index + 8))); + /* Flushing the AP interceptor cache to make APR_LMT_MAP_ENTRY_S * changes effective. Write 1 for flush and read is being used as a * barrier and sets up a data dependency. Write to 0 after a write @@ -52,7 +63,7 @@ static int lmtst_map_table_ops(struct rvu *rvu, u32 index, u64 *val, #define LMT_MAP_TBL_W1_OFF 8 static u32 rvu_get_lmtst_tbl_index(struct rvu *rvu, u16 pcifunc) { - return ((rvu_get_pf(pcifunc) * rvu->hw->total_vfs) + + return ((rvu_get_pf(pcifunc) * LMT_MAX_VFS) + (pcifunc & RVU_PFVF_FUNC_MASK)) * LMT_MAPTBL_ENTRY_SIZE; } -- 2.51.0 From a6ae7129819ad20788e610261246e71736543b8b Mon Sep 17 00:00:00 2001 From: Geetha sowjanya Date: Wed, 21 May 2025 11:38:34 +0530 Subject: [PATCH 09/16] octeontx2-af: Fix APR entry mapping based on APR_LMT_CFG The current implementation maps the APR table using a fixed size, which can lead to incorrect mapping when the number of PFs and VFs varies. This patch corrects the mapping by calculating the APR table size dynamically based on the values configured in the APR_LMT_CFG register, ensuring accurate representation of APR entries in debugfs. Fixes: 0daa55d033b0 ("octeontx2-af: cn10k: debugfs for dumping LMTST map table"). 
Signed-off-by: Geetha sowjanya Link: https://patch.msgid.link/20250521060834.19780-3-gakula@marvell.com Signed-off-by: Paolo Abeni --- drivers/net/ethernet/marvell/octeontx2/af/rvu_cn10k.c | 9 ++++++--- .../net/ethernet/marvell/octeontx2/af/rvu_debugfs.c | 11 ++++++++--- 2 files changed, 14 insertions(+), 6 deletions(-) diff --git a/drivers/net/ethernet/marvell/octeontx2/af/rvu_cn10k.c b/drivers/net/ethernet/marvell/octeontx2/af/rvu_cn10k.c index 3838c04b78c2..4a3370a40dd8 100644 --- a/drivers/net/ethernet/marvell/octeontx2/af/rvu_cn10k.c +++ b/drivers/net/ethernet/marvell/octeontx2/af/rvu_cn10k.c @@ -13,7 +13,6 @@ /* RVU LMTST */ #define LMT_TBL_OP_READ 0 #define LMT_TBL_OP_WRITE 1 -#define LMT_MAP_TABLE_SIZE (128 * 1024) #define LMT_MAPTBL_ENTRY_SIZE 16 #define LMT_MAX_VFS 256 @@ -26,10 +25,14 @@ static int lmtst_map_table_ops(struct rvu *rvu, u32 index, u64 *val, { void __iomem *lmt_map_base; u64 tbl_base, cfg; + int pfs, vfs; tbl_base = rvu_read64(rvu, BLKADDR_APR, APR_AF_LMT_MAP_BASE); + cfg = rvu_read64(rvu, BLKADDR_APR, APR_AF_LMT_CFG); + vfs = 1 << (cfg & 0xF); + pfs = 1 << ((cfg >> 4) & 0x7); - lmt_map_base = ioremap_wc(tbl_base, LMT_MAP_TABLE_SIZE); + lmt_map_base = ioremap_wc(tbl_base, pfs * vfs * LMT_MAPTBL_ENTRY_SIZE); if (!lmt_map_base) { dev_err(rvu->dev, "Failed to setup lmt map table mapping!!\n"); return -ENOMEM; @@ -80,7 +83,7 @@ static int rvu_get_lmtaddr(struct rvu *rvu, u16 pcifunc, mutex_lock(&rvu->rsrc_lock); rvu_write64(rvu, BLKADDR_RVUM, RVU_AF_SMMU_ADDR_REQ, iova); - pf = rvu_get_pf(pcifunc) & 0x1F; + pf = rvu_get_pf(pcifunc) & RVU_PFVF_PF_MASK; val = BIT_ULL(63) | BIT_ULL(14) | BIT_ULL(13) | pf << 8 | ((pcifunc & RVU_PFVF_FUNC_MASK) & 0xFF); rvu_write64(rvu, BLKADDR_RVUM, RVU_AF_SMMU_TXN_REQ, val); diff --git a/drivers/net/ethernet/marvell/octeontx2/af/rvu_debugfs.c b/drivers/net/ethernet/marvell/octeontx2/af/rvu_debugfs.c index a1f9ec03c2ce..c827da626471 100644 --- a/drivers/net/ethernet/marvell/octeontx2/af/rvu_debugfs.c +++ b/drivers/net/ethernet/marvell/octeontx2/af/rvu_debugfs.c @@ -553,6 +553,7 @@ static ssize_t rvu_dbg_lmtst_map_table_display(struct file *filp, u64 lmt_addr, val, tbl_base; int pf, vf, num_vfs, hw_vfs; void __iomem *lmt_map_base; + int apr_pfs, apr_vfs; int buf_size = 10240; size_t off = 0; int index = 0; @@ -568,8 +569,12 @@ static ssize_t rvu_dbg_lmtst_map_table_display(struct file *filp, return -ENOMEM; tbl_base = rvu_read64(rvu, BLKADDR_APR, APR_AF_LMT_MAP_BASE); + val = rvu_read64(rvu, BLKADDR_APR, APR_AF_LMT_CFG); + apr_vfs = 1 << (val & 0xF); + apr_pfs = 1 << ((val >> 4) & 0x7); - lmt_map_base = ioremap_wc(tbl_base, 128 * 1024); + lmt_map_base = ioremap_wc(tbl_base, apr_pfs * apr_vfs * + LMT_MAPTBL_ENTRY_SIZE); if (!lmt_map_base) { dev_err(rvu->dev, "Failed to setup lmt map table mapping!!\n"); kfree(buf); @@ -591,7 +596,7 @@ static ssize_t rvu_dbg_lmtst_map_table_display(struct file *filp, off += scnprintf(&buf[off], buf_size - 1 - off, "PF%d \t\t\t", pf); - index = pf * rvu->hw->total_vfs * LMT_MAPTBL_ENTRY_SIZE; + index = pf * apr_vfs * LMT_MAPTBL_ENTRY_SIZE; off += scnprintf(&buf[off], buf_size - 1 - off, " 0x%llx\t\t", (tbl_base + index)); lmt_addr = readq(lmt_map_base + index); @@ -604,7 +609,7 @@ static ssize_t rvu_dbg_lmtst_map_table_display(struct file *filp, /* Reading num of VFs per PF */ rvu_get_pf_numvfs(rvu, pf, &num_vfs, &hw_vfs); for (vf = 0; vf < num_vfs; vf++) { - index = (pf * rvu->hw->total_vfs * 16) + + index = (pf * apr_vfs * LMT_MAPTBL_ENTRY_SIZE) + ((vf + 1) * LMT_MAPTBL_ENTRY_SIZE); off += 
scnprintf(&buf[off], buf_size - 1 - off, "PF%d:VF%d \t\t", pf, vf); -- 2.51.0 From ba5cb47b56e5d89f0a5eb5163ffbfda1e3e7cef0 Mon Sep 17 00:00:00 2001 From: Subbaraya Sundeep Date: Wed, 21 May 2025 16:00:43 +0530 Subject: [PATCH 10/16] octeontx2-af: Send Link events one by one Send link events one after another otherwise new message is overwriting the message which is being processed by PF. Fixes: a88e0f936ba9 ("octeontx2: Detect the mbox up or down message via register") Signed-off-by: Subbaraya Sundeep Reviewed-by: Michal Swiatkowski Reviewed-by: Simon Horman Link: https://patch.msgid.link/1747823443-404-1-git-send-email-sbhatta@marvell.com Signed-off-by: Paolo Abeni --- drivers/net/ethernet/marvell/octeontx2/af/mcs_rvu_if.c | 2 ++ drivers/net/ethernet/marvell/octeontx2/af/rvu_cgx.c | 2 ++ drivers/net/ethernet/marvell/octeontx2/af/rvu_rep.c | 2 ++ 3 files changed, 6 insertions(+) diff --git a/drivers/net/ethernet/marvell/octeontx2/af/mcs_rvu_if.c b/drivers/net/ethernet/marvell/octeontx2/af/mcs_rvu_if.c index 655dd4726d36..0277d226293e 100644 --- a/drivers/net/ethernet/marvell/octeontx2/af/mcs_rvu_if.c +++ b/drivers/net/ethernet/marvell/octeontx2/af/mcs_rvu_if.c @@ -143,6 +143,8 @@ static int mcs_notify_pfvf(struct mcs_intr_event *event, struct rvu *rvu) otx2_mbox_msg_send_up(&rvu->afpf_wq_info.mbox_up, pf); + otx2_mbox_wait_for_rsp(&rvu->afpf_wq_info.mbox_up, pf); + mutex_unlock(&rvu->mbox_lock); return 0; diff --git a/drivers/net/ethernet/marvell/octeontx2/af/rvu_cgx.c b/drivers/net/ethernet/marvell/octeontx2/af/rvu_cgx.c index 992fa0b82e8d..ebb56eb0d18c 100644 --- a/drivers/net/ethernet/marvell/octeontx2/af/rvu_cgx.c +++ b/drivers/net/ethernet/marvell/octeontx2/af/rvu_cgx.c @@ -272,6 +272,8 @@ static void cgx_notify_pfs(struct cgx_link_event *event, struct rvu *rvu) otx2_mbox_msg_send_up(&rvu->afpf_wq_info.mbox_up, pfid); + otx2_mbox_wait_for_rsp(&rvu->afpf_wq_info.mbox_up, pfid); + mutex_unlock(&rvu->mbox_lock); } while (pfmap); } diff --git a/drivers/net/ethernet/marvell/octeontx2/af/rvu_rep.c b/drivers/net/ethernet/marvell/octeontx2/af/rvu_rep.c index 052ae5923e3a..32953cca108c 100644 --- a/drivers/net/ethernet/marvell/octeontx2/af/rvu_rep.c +++ b/drivers/net/ethernet/marvell/octeontx2/af/rvu_rep.c @@ -60,6 +60,8 @@ static int rvu_rep_up_notify(struct rvu *rvu, struct rep_event *event) otx2_mbox_msg_send_up(&rvu->afpf_wq_info.mbox_up, pf); + otx2_mbox_wait_for_rsp(&rvu->afpf_wq_info.mbox_up, pf); + mutex_unlock(&rvu->mbox_lock); return 0; } -- 2.51.0 From 45ca7e9f0730ae36fc610e675b990e9cc9ca0714 Mon Sep 17 00:00:00 2001 From: Stefano Garzarella Date: Wed, 21 May 2025 14:17:05 +0200 Subject: [PATCH 11/16] vsock/virtio: fix `rx_bytes` accounting for stream sockets MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit In `struct virtio_vsock_sock`, we maintain two counters: - `rx_bytes`: used internally to track how many bytes have been read. This supports mechanisms like .stream_has_data() and sock_rcvlowat(). - `fwd_cnt`: used for the credit mechanism to inform available receive buffer space to the remote peer. These counters are updated via virtio_transport_inc_rx_pkt() and virtio_transport_dec_rx_pkt(). Since the beginning with commit 06a8fc78367d ("VSOCK: Introduce virtio_vsock_common.ko"), we call virtio_transport_dec_rx_pkt() in virtio_transport_stream_do_dequeue() only when we consume the entire packet, so partial reads, do not update `rx_bytes` and `fwd_cnt`. 
This is fine for `fwd_cnt`, because we still have space used for the entire packet, and we don't want to update the credit for the other peer until we free the space of the entire packet. However, this causes `rx_bytes` to be stale on partial reads. Previously, this didn’t cause issues because `rx_bytes` was used only by .stream_has_data(), and any unread portion of a packet implied data was still available. However, since commit 93b808876682 ("virtio/vsock: fix logic which reduces credit update messages"), we now rely on `rx_bytes` to determine if a credit update should be sent when the data in the RX queue drops below SO_RCVLOWAT value. This patch fixes the accounting by updating `rx_bytes` with the number of bytes actually read, even on partial reads, while leaving `fwd_cnt` untouched until the packet is fully consumed. Also introduce a new `buf_used` counter to check that the remote peer is honoring the given credit; this was previously done via `rx_bytes`. Fixes: 93b808876682 ("virtio/vsock: fix logic which reduces credit update messages") Signed-off-by: Stefano Garzarella Link: https://patch.msgid.link/20250521121705.196379-1-sgarzare@redhat.com Signed-off-by: Paolo Abeni --- include/linux/virtio_vsock.h | 1 + net/vmw_vsock/virtio_transport_common.c | 26 +++++++++++++++---------- 2 files changed, 17 insertions(+), 10 deletions(-) diff --git a/include/linux/virtio_vsock.h b/include/linux/virtio_vsock.h index 0387d64e2c66..36fb3edfa403 100644 --- a/include/linux/virtio_vsock.h +++ b/include/linux/virtio_vsock.h @@ -140,6 +140,7 @@ struct virtio_vsock_sock { u32 last_fwd_cnt; u32 rx_bytes; u32 buf_alloc; + u32 buf_used; struct sk_buff_head rx_queue; u32 msg_count; }; diff --git a/net/vmw_vsock/virtio_transport_common.c b/net/vmw_vsock/virtio_transport_common.c index 7f7de6d88096..2c9b1011cdcc 100644 --- a/net/vmw_vsock/virtio_transport_common.c +++ b/net/vmw_vsock/virtio_transport_common.c @@ -441,18 +441,20 @@ static int virtio_transport_send_pkt_info(struct vsock_sock *vsk, static bool virtio_transport_inc_rx_pkt(struct virtio_vsock_sock *vvs, u32 len) { - if (vvs->rx_bytes + len > vvs->buf_alloc) + if (vvs->buf_used + len > vvs->buf_alloc) return false; vvs->rx_bytes += len; + vvs->buf_used += len; return true; } static void virtio_transport_dec_rx_pkt(struct virtio_vsock_sock *vvs, - u32 len) + u32 bytes_read, u32 bytes_dequeued) { - vvs->rx_bytes -= len; - vvs->fwd_cnt += len; + vvs->rx_bytes -= bytes_read; + vvs->buf_used -= bytes_dequeued; + vvs->fwd_cnt += bytes_dequeued; } void virtio_transport_inc_tx_pkt(struct virtio_vsock_sock *vvs, struct sk_buff *skb) @@ -581,11 +583,11 @@ virtio_transport_stream_do_dequeue(struct vsock_sock *vsk, size_t len) { struct virtio_vsock_sock *vvs = vsk->trans; - size_t bytes, total = 0; struct sk_buff *skb; u32 fwd_cnt_delta; bool low_rx_bytes; int err = -EFAULT; + size_t total = 0; u32 free_space; spin_lock_bh(&vvs->rx_lock); @@ -597,6 +599,8 @@ virtio_transport_stream_do_dequeue(struct vsock_sock *vsk, } while (total < len && !skb_queue_empty(&vvs->rx_queue)) { + size_t bytes, dequeued = 0; + skb = skb_peek(&vvs->rx_queue); bytes = min_t(size_t, len - total, @@ -620,12 +624,12 @@ virtio_transport_stream_do_dequeue(struct vsock_sock *vsk, VIRTIO_VSOCK_SKB_CB(skb)->offset += bytes; if (skb->len == VIRTIO_VSOCK_SKB_CB(skb)->offset) { - u32 pkt_len = le32_to_cpu(virtio_vsock_hdr(skb)->len); - - virtio_transport_dec_rx_pkt(vvs, pkt_len); + dequeued = le32_to_cpu(virtio_vsock_hdr(skb)->len); __skb_unlink(skb, &vvs->rx_queue); consume_skb(skb); } + + 
virtio_transport_dec_rx_pkt(vvs, bytes, dequeued); } fwd_cnt_delta = vvs->fwd_cnt - vvs->last_fwd_cnt; @@ -781,7 +785,7 @@ static int virtio_transport_seqpacket_do_dequeue(struct vsock_sock *vsk, msg->msg_flags |= MSG_EOR; } - virtio_transport_dec_rx_pkt(vvs, pkt_len); + virtio_transport_dec_rx_pkt(vvs, pkt_len, pkt_len); kfree_skb(skb); } @@ -1735,6 +1739,7 @@ int virtio_transport_read_skb(struct vsock_sock *vsk, skb_read_actor_t recv_acto struct sock *sk = sk_vsock(vsk); struct virtio_vsock_hdr *hdr; struct sk_buff *skb; + u32 pkt_len; int off = 0; int err; @@ -1752,7 +1757,8 @@ int virtio_transport_read_skb(struct vsock_sock *vsk, skb_read_actor_t recv_acto if (le32_to_cpu(hdr->flags) & VIRTIO_VSOCK_SEQ_EOM) vvs->msg_count--; - virtio_transport_dec_rx_pkt(vvs, le32_to_cpu(hdr->len)); + pkt_len = le32_to_cpu(hdr->len); + virtio_transport_dec_rx_pkt(vvs, pkt_len, pkt_len); spin_unlock_bh(&vvs->rx_lock); virtio_transport_send_credit_update(vsk); -- 2.51.0 From 57ee9584fd8606deef66d7b65fa4dcf94f6843aa Mon Sep 17 00:00:00 2001 From: Horatiu Vultur Date: Wed, 21 May 2025 14:41:59 +0200 Subject: [PATCH 12/16] net: lan966x: Fix 1-step timestamping over ipv4 or ipv6 When enabling 1-step timestamping for ptp frames that are over udpv4 or udpv6 then the inserted timestamp is added at the wrong offset in the frame, meaning that will modify the frame at the wrong place, so the frame will be malformed. To fix this, the HW needs to know which kind of frame it is to know where to insert the timestamp. For that there is a field in the IFH that says the PDU_TYPE, which can be NONE which is the default value, IPV4 or IPV6. Therefore make sure to set the PDU_TYPE so the HW knows where to insert the timestamp. Like I mention before the issue is not seen with L2 frames because by default the PDU_TYPE has a value of 0, which represents the L2 frames. 
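The classification itself reduces to a small mapping from the PTP classifier result to the injection-header PDU type. The sketch below is illustrative only; the numeric values mirror the IFH_PDU_TYPE_* defines added to lan966x_main.h, and example_ptp_pdu_type() is a made-up name:

#include <linux/ptp_classify.h>
#include <linux/skbuff.h>

/* Values mirror the IFH_PDU_TYPE_* defines added by this patch. */
#define EXAMPLE_PDU_TYPE_NONE	0
#define EXAMPLE_PDU_TYPE_IPV4	7
#define EXAMPLE_PDU_TYPE_IPV6	8

static u8 example_ptp_pdu_type(struct sk_buff *skb)
{
	unsigned int type = ptp_classify_raw(skb);

	if (type == PTP_CLASS_NONE)
		return EXAMPLE_PDU_TYPE_NONE;	/* not a PTP frame */
	if (type & PTP_CLASS_IPV4)
		return EXAMPLE_PDU_TYPE_IPV4;	/* PTP over UDP/IPv4 */
	if (type & PTP_CLASS_IPV6)
		return EXAMPLE_PDU_TYPE_IPV6;	/* PTP over UDP/IPv6 */

	return EXAMPLE_PDU_TYPE_NONE;		/* PTP over L2, default offset */
}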
Fixes: 77eecf25bd9d2f ("net: lan966x: Update extraction/injection for timestamping") Signed-off-by: Horatiu Vultur Link: https://patch.msgid.link/20250521124159.2713525-1-horatiu.vultur@microchip.com Signed-off-by: Paolo Abeni --- .../ethernet/microchip/lan966x/lan966x_main.c | 6 +++ .../ethernet/microchip/lan966x/lan966x_main.h | 5 ++ .../ethernet/microchip/lan966x/lan966x_ptp.c | 49 ++++++++++++++----- 3 files changed, 47 insertions(+), 13 deletions(-) diff --git a/drivers/net/ethernet/microchip/lan966x/lan966x_main.c b/drivers/net/ethernet/microchip/lan966x/lan966x_main.c index 0af143ec0f86..427bdc0e4908 100644 --- a/drivers/net/ethernet/microchip/lan966x/lan966x_main.c +++ b/drivers/net/ethernet/microchip/lan966x/lan966x_main.c @@ -353,6 +353,11 @@ static void lan966x_ifh_set_rew_op(void *ifh, u64 rew_op) lan966x_ifh_set(ifh, rew_op, IFH_POS_REW_CMD, IFH_WID_REW_CMD); } +static void lan966x_ifh_set_oam_type(void *ifh, u64 oam_type) +{ + lan966x_ifh_set(ifh, oam_type, IFH_POS_PDU_TYPE, IFH_WID_PDU_TYPE); +} + static void lan966x_ifh_set_timestamp(void *ifh, u64 timestamp) { lan966x_ifh_set(ifh, timestamp, IFH_POS_TIMESTAMP, IFH_WID_TIMESTAMP); @@ -380,6 +385,7 @@ static netdev_tx_t lan966x_port_xmit(struct sk_buff *skb, return err; lan966x_ifh_set_rew_op(ifh, LAN966X_SKB_CB(skb)->rew_op); + lan966x_ifh_set_oam_type(ifh, LAN966X_SKB_CB(skb)->pdu_type); lan966x_ifh_set_timestamp(ifh, LAN966X_SKB_CB(skb)->ts_id); } diff --git a/drivers/net/ethernet/microchip/lan966x/lan966x_main.h b/drivers/net/ethernet/microchip/lan966x/lan966x_main.h index 1efa584e7107..1f9df67f0504 100644 --- a/drivers/net/ethernet/microchip/lan966x/lan966x_main.h +++ b/drivers/net/ethernet/microchip/lan966x/lan966x_main.h @@ -75,6 +75,10 @@ #define IFH_REW_OP_ONE_STEP_PTP 0x3 #define IFH_REW_OP_TWO_STEP_PTP 0x4 +#define IFH_PDU_TYPE_NONE 0 +#define IFH_PDU_TYPE_IPV4 7 +#define IFH_PDU_TYPE_IPV6 8 + #define FDMA_RX_DCB_MAX_DBS 1 #define FDMA_TX_DCB_MAX_DBS 1 @@ -254,6 +258,7 @@ struct lan966x_phc { struct lan966x_skb_cb { u8 rew_op; + u8 pdu_type; u16 ts_id; unsigned long jiffies; }; diff --git a/drivers/net/ethernet/microchip/lan966x/lan966x_ptp.c b/drivers/net/ethernet/microchip/lan966x/lan966x_ptp.c index 63905bb5a63a..87e5e81d40dc 100644 --- a/drivers/net/ethernet/microchip/lan966x/lan966x_ptp.c +++ b/drivers/net/ethernet/microchip/lan966x/lan966x_ptp.c @@ -322,34 +322,55 @@ void lan966x_ptp_hwtstamp_get(struct lan966x_port *port, *cfg = phc->hwtstamp_config; } -static int lan966x_ptp_classify(struct lan966x_port *port, struct sk_buff *skb) +static void lan966x_ptp_classify(struct lan966x_port *port, struct sk_buff *skb, + u8 *rew_op, u8 *pdu_type) { struct ptp_header *header; u8 msgtype; int type; - if (port->ptp_tx_cmd == IFH_REW_OP_NOOP) - return IFH_REW_OP_NOOP; + if (port->ptp_tx_cmd == IFH_REW_OP_NOOP) { + *rew_op = IFH_REW_OP_NOOP; + *pdu_type = IFH_PDU_TYPE_NONE; + return; + } type = ptp_classify_raw(skb); - if (type == PTP_CLASS_NONE) - return IFH_REW_OP_NOOP; + if (type == PTP_CLASS_NONE) { + *rew_op = IFH_REW_OP_NOOP; + *pdu_type = IFH_PDU_TYPE_NONE; + return; + } header = ptp_parse_header(skb, type); - if (!header) - return IFH_REW_OP_NOOP; + if (!header) { + *rew_op = IFH_REW_OP_NOOP; + *pdu_type = IFH_PDU_TYPE_NONE; + return; + } - if (port->ptp_tx_cmd == IFH_REW_OP_TWO_STEP_PTP) - return IFH_REW_OP_TWO_STEP_PTP; + if (type & PTP_CLASS_L2) + *pdu_type = IFH_PDU_TYPE_NONE; + if (type & PTP_CLASS_IPV4) + *pdu_type = IFH_PDU_TYPE_IPV4; + if (type & PTP_CLASS_IPV6) + *pdu_type = IFH_PDU_TYPE_IPV6; + + if 
(port->ptp_tx_cmd == IFH_REW_OP_TWO_STEP_PTP) { + *rew_op = IFH_REW_OP_TWO_STEP_PTP; + return; + } /* If it is sync and run 1 step then set the correct operation, * otherwise run as 2 step */ msgtype = ptp_get_msgtype(header, type); - if ((msgtype & 0xf) == 0) - return IFH_REW_OP_ONE_STEP_PTP; + if ((msgtype & 0xf) == 0) { + *rew_op = IFH_REW_OP_ONE_STEP_PTP; + return; + } - return IFH_REW_OP_TWO_STEP_PTP; + *rew_op = IFH_REW_OP_TWO_STEP_PTP; } static void lan966x_ptp_txtstamp_old_release(struct lan966x_port *port) @@ -374,10 +395,12 @@ int lan966x_ptp_txtstamp_request(struct lan966x_port *port, { struct lan966x *lan966x = port->lan966x; unsigned long flags; + u8 pdu_type; u8 rew_op; - rew_op = lan966x_ptp_classify(port, skb); + lan966x_ptp_classify(port, skb, &rew_op, &pdu_type); LAN966X_SKB_CB(skb)->rew_op = rew_op; + LAN966X_SKB_CB(skb)->pdu_type = pdu_type; if (rew_op != IFH_REW_OP_TWO_STEP_PTP) return 0; -- 2.51.0 From f0b50730bdd8f2734e548de541e845c0d40dceb6 Mon Sep 17 00:00:00 2001 From: Wentao Liang Date: Wed, 21 May 2025 21:36:20 +0800 Subject: [PATCH 13/16] net/mlx5_core: Add error handling inmlx5_query_nic_vport_qkey_viol_cntr() The function mlx5_query_nic_vport_qkey_viol_cntr() calls the function mlx5_query_nic_vport_context() but does not check its return value. This could lead to undefined behavior if the query fails. A proper implementation can be found in mlx5_nic_vport_query_local_lb(). Add error handling for mlx5_query_nic_vport_context(). If it fails, free the out buffer via kvfree() and return error code. Fixes: 9efa75254593 ("net/mlx5_core: Introduce access functions to query vport RoCE fields") Cc: stable@vger.kernel.org # v4.5 Signed-off-by: Wentao Liang Reviewed-by: Tariq Toukan Link: https://patch.msgid.link/20250521133620.912-1-vulab@iscas.ac.cn Signed-off-by: Paolo Abeni --- drivers/net/ethernet/mellanox/mlx5/core/vport.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/vport.c b/drivers/net/ethernet/mellanox/mlx5/core/vport.c index d10d4c396040..a3c57bb8b521 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/vport.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/vport.c @@ -519,19 +519,22 @@ int mlx5_query_nic_vport_qkey_viol_cntr(struct mlx5_core_dev *mdev, { u32 *out; int outlen = MLX5_ST_SZ_BYTES(query_nic_vport_context_out); + int err; out = kvzalloc(outlen, GFP_KERNEL); if (!out) return -ENOMEM; - mlx5_query_nic_vport_context(mdev, 0, out); + err = mlx5_query_nic_vport_context(mdev, 0, out); + if (err) + goto out; *qkey_viol_cntr = MLX5_GET(query_nic_vport_context_out, out, nic_vport_context.qkey_violation_counter); - +out: kvfree(out); - return 0; + return err; } EXPORT_SYMBOL_GPL(mlx5_query_nic_vport_qkey_viol_cntr); -- 2.51.0 From 32374234ab0101881e7d0c6a8ef7ebce566c46c9 Mon Sep 17 00:00:00 2001 From: Suraj Gupta Date: Wed, 21 May 2025 23:46:08 +0530 Subject: [PATCH 14/16] net: xilinx: axienet: Fix Tx skb circular buffer occupancy check in dmaengine xmit In Dmaengine flow, driver maintains struct skbuf_dma_descriptor rings each element of which corresponds to a skb. In Tx datapath, compare available space in skb ring with number of skbs instead of skb fragments. Replace x * (MAX_SKB_FRAGS) in netif_txq_completed_wake() and netif_txq_maybe_stop() with x * (1 skb) to fix the comparison. 
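Put differently, once every ring slot holds a complete skb, the free-space test is in units of skbs; a minimal sketch of such a check follows (the toy_* names are illustrative, the driver uses CIRC_SPACE() with TX_BD_NUM_MAX and its tx_ring_head/tx_ring_tail indices):

#include <linux/circ_buf.h>
#include <linux/types.h>

/*
 * Illustrative only: each slot in the dmaengine Tx ring holds one whole
 * skb (a struct skbuf_dma_descriptor), so the free-space check counts
 * skbs, not fragments.  The ring size must be a power of two for
 * CIRC_SPACE().
 */
struct toy_skb_ring {
	unsigned int head;	/* producer index */
	unsigned int tail;	/* consumer index */
};

#define TOY_RING_SIZE	128

static bool toy_ring_has_room(const struct toy_skb_ring *r)
{
	/*
	 * One slot per skb, however many fragments it carries; require more
	 * than one free slot, mirroring the driver's "<= 1" stop condition.
	 */
	return CIRC_SPACE(r->head, r->tail, TOY_RING_SIZE) > 1;
}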
Fixes: 6a91b846af85 ("net: axienet: Introduce dmaengine support") Signed-off-by: Suraj Gupta Reviewed-by: Sean Anderson Link: https://patch.msgid.link/20250521181608.669554-1-suraj.gupta2@amd.com Signed-off-by: Paolo Abeni --- drivers/net/ethernet/xilinx/xilinx_axienet_main.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/xilinx/xilinx_axienet_main.c b/drivers/net/ethernet/xilinx/xilinx_axienet_main.c index 054abf283ab3..5f912b27bfd7 100644 --- a/drivers/net/ethernet/xilinx/xilinx_axienet_main.c +++ b/drivers/net/ethernet/xilinx/xilinx_axienet_main.c @@ -880,7 +880,7 @@ static void axienet_dma_tx_cb(void *data, const struct dmaengine_result *result) dev_consume_skb_any(skbuf_dma->skb); netif_txq_completed_wake(txq, 1, len, CIRC_SPACE(lp->tx_ring_head, lp->tx_ring_tail, TX_BD_NUM_MAX), - 2 * MAX_SKB_FRAGS); + 2); } /** @@ -914,7 +914,7 @@ axienet_start_xmit_dmaengine(struct sk_buff *skb, struct net_device *ndev) dma_dev = lp->tx_chan->device; sg_len = skb_shinfo(skb)->nr_frags + 1; - if (CIRC_SPACE(lp->tx_ring_head, lp->tx_ring_tail, TX_BD_NUM_MAX) <= sg_len) { + if (CIRC_SPACE(lp->tx_ring_head, lp->tx_ring_tail, TX_BD_NUM_MAX) <= 1) { netif_stop_queue(ndev); if (net_ratelimit()) netdev_warn(ndev, "TX ring unexpectedly full\n"); @@ -964,7 +964,7 @@ axienet_start_xmit_dmaengine(struct sk_buff *skb, struct net_device *ndev) txq = skb_get_tx_queue(lp->ndev, skb); netdev_tx_sent_queue(txq, skb->len); netif_txq_maybe_stop(txq, CIRC_SPACE(lp->tx_ring_head, lp->tx_ring_tail, TX_BD_NUM_MAX), - MAX_SKB_FRAGS + 1, 2 * MAX_SKB_FRAGS); + 1, 2); dmaengine_submit(dma_tx_desc); dma_async_issue_pending(lp->tx_chan); -- 2.51.0 From d8d85ef0a631df9127f202e6371bb33a0b589952 Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Wed, 21 May 2025 20:11:28 -0700 Subject: [PATCH 15/16] af_packet: move notifier's packet_dev_mc out of rcu critical section Syzkaller reports the following issue: BUG: sleeping function called from invalid context at kernel/locking/mutex.c:578 __mutex_lock+0x106/0xe80 kernel/locking/mutex.c:746 team_change_rx_flags+0x38/0x220 drivers/net/team/team_core.c:1781 dev_change_rx_flags net/core/dev.c:9145 [inline] __dev_set_promiscuity+0x3f8/0x590 net/core/dev.c:9189 netif_set_promiscuity+0x50/0xe0 net/core/dev.c:9201 dev_set_promiscuity+0x126/0x260 net/core/dev_api.c:286 packet_dev_mc net/packet/af_packet.c:3698 [inline] packet_dev_mclist_delete net/packet/af_packet.c:3722 [inline] packet_notifier+0x292/0xa60 net/packet/af_packet.c:4247 notifier_call_chain+0x1b3/0x3e0 kernel/notifier.c:85 call_netdevice_notifiers_extack net/core/dev.c:2214 [inline] call_netdevice_notifiers net/core/dev.c:2228 [inline] unregister_netdevice_many_notify+0x15d8/0x2330 net/core/dev.c:11972 rtnl_delete_link net/core/rtnetlink.c:3522 [inline] rtnl_dellink+0x488/0x710 net/core/rtnetlink.c:3564 rtnetlink_rcv_msg+0x7cf/0xb70 net/core/rtnetlink.c:6955 netlink_rcv_skb+0x219/0x490 net/netlink/af_netlink.c:2534 Calling `PACKET_ADD_MEMBERSHIP` on an ops-locked device can trigger the `NETDEV_UNREGISTER` notifier, which may require disabling promiscuous and/or allmulti mode. Both of these operations require acquiring the netdev instance lock. Move the call to `packet_dev_mc` outside of the RCU critical section. The `mclist` modifications (add, del, flush, unregister) are protected by the RTNL, not the RCU. The RCU only protects the `sklist` and its associated `sks`. The delayed operation on the `mclist` entry remains within the RTNL. 
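This is the usual "collect under RCU, act after unlocking" idiom; a generic sketch follows (deferred_item and collect_then_process() are invented names — the real code uses struct packet_mclist and its new remove_list member):

#include <linux/list.h>
#include <linux/rcupdate.h>
#include <linux/slab.h>

/*
 * Entries that need work which may sleep are only *collected* while under
 * rcu_read_lock(); the sleeping work (here: dev_set_promiscuity() and
 * friends, which may take the netdev instance lock) runs only after
 * rcu_read_unlock().
 */
struct deferred_item {
	struct list_head remove_list;
	/* ... payload describing what to undo ... */
};

static void collect_then_process(void)
{
	struct deferred_item *it, *tmp;
	LIST_HEAD(to_remove);

	rcu_read_lock();
	/*
	 * Walk the RCU-protected structures here; for every entry that has
	 * to go, only do: list_add(&entry->remove_list, &to_remove);
	 */
	rcu_read_unlock();

	/* Sleeping and lock-taking work is allowed again. */
	list_for_each_entry_safe(it, tmp, &to_remove, remove_list) {
		/* e.g. packet_dev_mc(dev, it, -1); */
		kfree(it);
	}
}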
Reported-by: syzbot+b191b5ccad8d7a986286@syzkaller.appspotmail.com Closes: https://syzkaller.appspot.com/bug?extid=b191b5ccad8d7a986286 Fixes: ad7c7b2172c3 ("net: hold netdev instance lock during sysfs operations") Signed-off-by: Stanislav Fomichev Reviewed-by: Willem de Bruijn Link: https://patch.msgid.link/20250522031129.3247266-1-stfomichev@gmail.com Signed-off-by: Paolo Abeni --- net/packet/af_packet.c | 21 ++++++++++++++++----- net/packet/internal.h | 1 + 2 files changed, 17 insertions(+), 5 deletions(-) diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index d4dba06297c3..20be2c47cf41 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -3713,15 +3713,15 @@ static int packet_dev_mc(struct net_device *dev, struct packet_mclist *i, } static void packet_dev_mclist_delete(struct net_device *dev, - struct packet_mclist **mlp) + struct packet_mclist **mlp, + struct list_head *list) { struct packet_mclist *ml; while ((ml = *mlp) != NULL) { if (ml->ifindex == dev->ifindex) { - packet_dev_mc(dev, ml, -1); + list_add(&ml->remove_list, list); *mlp = ml->next; - kfree(ml); } else mlp = &ml->next; } @@ -3769,6 +3769,7 @@ static int packet_mc_add(struct sock *sk, struct packet_mreq_max *mreq) memcpy(i->addr, mreq->mr_address, i->alen); memset(i->addr + i->alen, 0, sizeof(i->addr) - i->alen); i->count = 1; + INIT_LIST_HEAD(&i->remove_list); i->next = po->mclist; po->mclist = i; err = packet_dev_mc(dev, i, 1); @@ -4233,9 +4234,11 @@ static int packet_getsockopt(struct socket *sock, int level, int optname, static int packet_notifier(struct notifier_block *this, unsigned long msg, void *ptr) { - struct sock *sk; struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct net *net = dev_net(dev); + struct packet_mclist *ml, *tmp; + LIST_HEAD(mclist); + struct sock *sk; rcu_read_lock(); sk_for_each_rcu(sk, &net->packet.sklist) { @@ -4244,7 +4247,8 @@ static int packet_notifier(struct notifier_block *this, switch (msg) { case NETDEV_UNREGISTER: if (po->mclist) - packet_dev_mclist_delete(dev, &po->mclist); + packet_dev_mclist_delete(dev, &po->mclist, + &mclist); fallthrough; case NETDEV_DOWN: @@ -4277,6 +4281,13 @@ static int packet_notifier(struct notifier_block *this, } } rcu_read_unlock(); + + /* packet_dev_mc might grab instance locks so can't run under rcu */ + list_for_each_entry_safe(ml, tmp, &mclist, remove_list) { + packet_dev_mc(dev, ml, -1); + kfree(ml); + } + return NOTIFY_DONE; } diff --git a/net/packet/internal.h b/net/packet/internal.h index d5d70712007a..1e743d0316fd 100644 --- a/net/packet/internal.h +++ b/net/packet/internal.h @@ -11,6 +11,7 @@ struct packet_mclist { unsigned short type; unsigned short alen; unsigned char addr[MAX_ADDR_LEN]; + struct list_head remove_list; }; /* kbdq - kernel block descriptor queue */ -- 2.51.0 From 0795b05a59b1371b18ffbf09d385296b12e9f5d5 Mon Sep 17 00:00:00 2001 From: Wei Fang Date: Fri, 23 May 2025 16:37:59 +0800 Subject: [PATCH 16/16] net: phy: clear phydev->devlink when the link is deleted There is a potential crash issue when disabling and re-enabling the network port. When disabling the network port, phy_detach() calls device_link_del() to remove the device link, but it does not clear phydev->devlink, so phydev->devlink is not a NULL pointer. Then the network port is re-enabled, but if phy_attach_direct() fails before calling device_link_add(), the code jumps to the "error" label and calls phy_detach(). 
Since phydev->devlink retains the old value from the previous attach/detach cycle, device_link_del() uses the old value, which accesses a NULL pointer and causes a crash. The simplified crash log is as follows. [ 24.702421] Call trace: [ 24.704856] device_link_put_kref+0x20/0x120 [ 24.709124] device_link_del+0x30/0x48 [ 24.712864] phy_detach+0x24/0x168 [ 24.716261] phy_attach_direct+0x168/0x3a4 [ 24.720352] phylink_fwnode_phy_connect+0xc8/0x14c [ 24.725140] phylink_of_phy_connect+0x1c/0x34 Therefore, phydev->devlink needs to be cleared when the device link is deleted. Fixes: bc66fa87d4fd ("net: phy: Add link between phy dev and mac dev") Signed-off-by: Wei Fang Reviewed-by: Andrew Lunn Reviewed-by: Florian Fainelli Link: https://patch.msgid.link/20250523083759.3741168-1-wei.fang@nxp.com Signed-off-by: Jakub Kicinski --- drivers/net/phy/phy_device.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/net/phy/phy_device.c b/drivers/net/phy/phy_device.c index cc1bfd22fb81..7d5e76a3db0e 100644 --- a/drivers/net/phy/phy_device.c +++ b/drivers/net/phy/phy_device.c @@ -1727,8 +1727,10 @@ void phy_detach(struct phy_device *phydev) struct module *ndev_owner = NULL; struct mii_bus *bus; - if (phydev->devlink) + if (phydev->devlink) { device_link_del(phydev->devlink); + phydev->devlink = NULL; + } if (phydev->sysfs_links) { if (dev) -- 2.51.0