From 076c700988938e02d51c018095d33b339cdb7ffd Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Sat, 29 Mar 2025 15:25:36 -0700 Subject: [PATCH 01/16] selftests: tc-testing: Add TBF with SKBPRIO queue length corner case test Add a test case to validate the interaction between TBF and SKBPRIO queueing disciplines, specifically targeting queue length accounting corner cases. This test complements the fix for the queue length accounting issue in the SKBPRIO qdisc. This is still best-effort, as timing and manipulating enqueue and dequeue from user-space is very hard. Cc: Pedro Tammela Signed-off-by: Cong Wang Acked-by: Paolo Abeni Link: https://patch.msgid.link/20250329222536.696204-3-xiyou.wangcong@gmail.com Signed-off-by: Jakub Kicinski --- .../tc-testing/tc-tests/infra/qdiscs.json | 34 ++++++++++++++++++- 1 file changed, 33 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/tc-testing/tc-tests/infra/qdiscs.json b/tools/testing/selftests/tc-testing/tc-tests/infra/qdiscs.json index 9044ac054167..25454fd95537 100644 --- a/tools/testing/selftests/tc-testing/tc-tests/infra/qdiscs.json +++ b/tools/testing/selftests/tc-testing/tc-tests/infra/qdiscs.json @@ -126,5 +126,37 @@ "$TC qdisc del dev $DUMMY root handle 1: drr", "$IP addr del 10.10.10.10/24 dev $DUMMY" ] - } + }, + { + "id": "c024", + "name": "Test TBF with SKBPRIO - catch qlen corner cases", + "category": [ + "qdisc", + "tbf", + "skbprio" + ], + "plugins": { + "requires": "nsPlugin" + }, + "setup": [ + "$IP link set dev $DUMMY up || true", + "$IP addr add 10.10.10.10/24 dev $DUMMY || true", + "$TC qdisc add dev $DUMMY handle 1: root tbf rate 100bit burst 2000 limit 1000", + "$TC qdisc add dev $DUMMY parent 1: handle 10: skbprio limit 1", + "ping -c 1 -W 0.1 -Q 0x00 -s 1400 -I $DUMMY 10.10.10.1 > /dev/null || true", + "ping -c 1 -W 0.1 -Q 0x1c -s 1400 -I $DUMMY 10.10.10.1 > /dev/null || true", + "ping -c 1 -W 0.1 -Q 0x00 -s 1400 -I $DUMMY 10.10.10.1 > /dev/null || true", + "ping -c 1 -W 0.1 -Q 0x1c -s 1400 -I $DUMMY 10.10.10.1 > /dev/null || true", + "sleep 0.5" + ], + "cmdUnderTest": "$TC -s qdisc show dev $DUMMY", + "expExitCode": "0", + "verifyCmd": "$TC -s qdisc show dev $DUMMY | grep -A 5 'qdisc skbprio'", + "matchPattern": "dropped [1-9][0-9]*", + "matchCount": "1", + "teardown": [ + "$TC qdisc del dev $DUMMY handle 1: root", + "$IP addr del 10.10.10.10/24 dev $DUMMY || true" + ] + } ] -- 2.51.0 From 10206302af856791fbcc27a33ed3c3eb09b2793d Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 31 Mar 2025 09:15:32 +0000 Subject: [PATCH 02/16] sctp: add mutual exclusion in proc_sctp_do_udp_port() We must serialize calls to sctp_udp_sock_stop() and sctp_udp_sock_start() or risk a crash as syzbot reported: Oops: general protection fault, probably for non-canonical address 0xdffffc000000000d: 0000 [#1] SMP KASAN PTI KASAN: null-ptr-deref in range [0x0000000000000068-0x000000000000006f] CPU: 1 UID: 0 PID: 6551 Comm: syz.1.44 Not tainted 6.14.0-syzkaller-g7f2ff7b62617 #0 PREEMPT(full) Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 02/12/2025 RIP: 0010:kernel_sock_shutdown+0x47/0x70 net/socket.c:3653 Call Trace: udp_tunnel_sock_release+0x68/0x80 net/ipv4/udp_tunnel_core.c:181 sctp_udp_sock_stop+0x71/0x160 net/sctp/protocol.c:930 proc_sctp_do_udp_port+0x264/0x450 net/sctp/sysctl.c:553 proc_sys_call_handler+0x3d0/0x5b0 fs/proc/proc_sysctl.c:601 iter_file_splice_write+0x91c/0x1150 fs/splice.c:738 do_splice_from fs/splice.c:935 [inline] direct_splice_actor+0x18f/0x6c0 fs/splice.c:1158 
splice_direct_to_actor+0x342/0xa30 fs/splice.c:1102
do_splice_direct_actor fs/splice.c:1201 [inline]
do_splice_direct+0x174/0x240 fs/splice.c:1227
do_sendfile+0xafd/0xe50 fs/read_write.c:1368
__do_sys_sendfile64 fs/read_write.c:1429 [inline]
__se_sys_sendfile64 fs/read_write.c:1415 [inline]
__x64_sys_sendfile64+0x1d8/0x220 fs/read_write.c:1415
do_syscall_x64 arch/x86/entry/syscall_64.c:63 [inline]

Fixes: 046c052b475e ("sctp: enable udp tunneling socks")
Reported-by: syzbot+fae49d997eb56fa7c74d@syzkaller.appspotmail.com
Closes: https://lore.kernel.org/netdev/67ea5c01.050a0220.1547ec.012b.GAE@google.com/T/#u
Signed-off-by: Eric Dumazet
Cc: Marcelo Ricardo Leitner
Acked-by: Xin Long
Link: https://patch.msgid.link/20250331091532.224982-1-edumazet@google.com
Signed-off-by: Jakub Kicinski
---
 net/sctp/sysctl.c | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/net/sctp/sysctl.c b/net/sctp/sysctl.c
index 8e1e97be4df7..ee3eac338a9d 100644
--- a/net/sctp/sysctl.c
+++ b/net/sctp/sysctl.c
@@ -525,6 +525,8 @@ static int proc_sctp_do_auth(const struct ctl_table *ctl, int write,
 	return ret;
 }
 
+static DEFINE_MUTEX(sctp_sysctl_mutex);
+
 static int proc_sctp_do_udp_port(const struct ctl_table *ctl, int write,
 				 void *buffer, size_t *lenp, loff_t *ppos)
 {
@@ -549,6 +551,7 @@ static int proc_sctp_do_udp_port(const struct ctl_table *ctl, int write,
 		if (new_value > max || new_value < min)
 			return -EINVAL;
 
+		mutex_lock(&sctp_sysctl_mutex);
 		net->sctp.udp_port = new_value;
 		sctp_udp_sock_stop(net);
 		if (new_value) {
@@ -561,6 +564,7 @@ static int proc_sctp_do_udp_port(const struct ctl_table *ctl, int write,
 		lock_sock(sk);
 		sctp_sk(sk)->udp_port = htons(net->sctp.udp_port);
 		release_sock(sk);
+		mutex_unlock(&sctp_sysctl_mutex);
 	}
 
 	return ret;
-- 
2.51.0

From 57b290d97c6150774bf929117ca737a26d8fc33d Mon Sep 17 00:00:00 2001
From: Lorenzo Bianconi
Date: Mon, 31 Mar 2025 08:52:53 +0200
Subject: [PATCH 03/16] net: airoha: Fix qid report in
 airoha_tc_get_htb_get_leaf_queue()

Fix the following kernel warning, triggered when deleting HTB offloaded
leaves and/or the root HTB qdisc in the airoha_eth driver, by properly
reporting the qid in the airoha_tc_get_htb_get_leaf_queue routine.
$tc qdisc replace dev eth1 root handle 10: htb offload
$tc class add dev eth1 parent 10: classid 10:4 htb rate 100mbit ceil 100mbit
$tc qdisc replace dev eth1 parent 10:4 handle 4: ets bands 8 \
	quanta 1514 3028 4542 6056 7570 9084 10598 12112
$tc qdisc del dev eth1 root

[ 55.827864] ------------[ cut here ]------------
[ 55.832493] WARNING: CPU: 3 PID: 2678 at 0xffffffc0798695a4
[ 55.956510] CPU: 3 PID: 2678 Comm: tc Tainted: G O 6.6.71 #0
[ 55.963557] Hardware name: Airoha AN7581 Evaluation Board (DT)
[ 55.969383] pstate: 20400005 (nzCv daif +PAN -UAO -TCO -DIT -SSBS BTYPE=--)
[ 55.976344] pc : 0xffffffc0798695a4
[ 55.979851] lr : 0xffffffc079869a20
[ 55.983358] sp : ffffffc0850536a0
[ 55.986665] x29: ffffffc0850536a0 x28: 0000000000000024 x27: 0000000000000001
[ 55.993800] x26: 0000000000000000 x25: ffffff8008b19000 x24: ffffff800222e800
[ 56.000935] x23: 0000000000000001 x22: 0000000000000000 x21: ffffff8008b19000
[ 56.008071] x20: ffffff8002225800 x19: ffffff800379d000 x18: 0000000000000000
[ 56.015206] x17: ffffffbf9ea59000 x16: ffffffc080018000 x15: 0000000000000000
[ 56.022342] x14: 0000000000000000 x13: 0000000000000000 x12: 0000000000000001
[ 56.029478] x11: ffffffc081471008 x10: ffffffc081575a98 x9 : 0000000000000000
[ 56.036614] x8 : ffffffc08167fd40 x7 : ffffffc08069e104 x6 : ffffff8007f86000
[ 56.043748] x5 : 0000000000000000 x4 : 0000000000000000 x3 : 0000000000000001
[ 56.050884] x2 : 0000000000000000 x1 : 0000000000000250 x0 : ffffff800222c000
[ 56.058020] Call trace:
[ 56.060459] 0xffffffc0798695a4
[ 56.063618] 0xffffffc079869a20
[ 56.066777] __qdisc_destroy+0x40/0xa0
[ 56.070528] qdisc_put+0x54/0x6c
[ 56.073748] qdisc_graft+0x41c/0x648
[ 56.077324] tc_get_qdisc+0x168/0x2f8
[ 56.080978] rtnetlink_rcv_msg+0x230/0x330
[ 56.085076] netlink_rcv_skb+0x5c/0x128
[ 56.088913] rtnetlink_rcv+0x14/0x1c
[ 56.092490] netlink_unicast+0x1e0/0x2c8
[ 56.096413] netlink_sendmsg+0x198/0x3c8
[ 56.100337] ____sys_sendmsg+0x1c4/0x274
[ 56.104261] ___sys_sendmsg+0x7c/0xc0
[ 56.107924] __sys_sendmsg+0x44/0x98
[ 56.111492] __arm64_sys_sendmsg+0x20/0x28
[ 56.115580] invoke_syscall.constprop.0+0x58/0xfc
[ 56.120285] do_el0_svc+0x3c/0xbc
[ 56.123592] el0_svc+0x18/0x4c
[ 56.126647] el0t_64_sync_handler+0x118/0x124
[ 56.131005] el0t_64_sync+0x150/0x154
[ 56.134660] ---[ end trace 0000000000000000 ]---

Fixes: ef1ca9271313b ("net: airoha: Add sched HTB offload support")
Signed-off-by: Lorenzo Bianconi
Acked-by: Paolo Abeni
Link: https://patch.msgid.link/20250331-airoha-htb-qdisc-offload-del-fix-v1-1-4ea429c2c968@kernel.org
Signed-off-by: Jakub Kicinski
---
 drivers/net/ethernet/airoha/airoha_eth.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/airoha/airoha_eth.c b/drivers/net/ethernet/airoha/airoha_eth.c
index c0a642568ac1..20a96cafc748 100644
--- a/drivers/net/ethernet/airoha/airoha_eth.c
+++ b/drivers/net/ethernet/airoha/airoha_eth.c
@@ -2358,7 +2358,7 @@ static int airoha_tc_get_htb_get_leaf_queue(struct airoha_gdm_port *port,
 		return -EINVAL;
 	}
 
-	opt->qid = channel;
+	opt->qid = AIROHA_NUM_TX_RING + channel;
 
 	return 0;
 }
-- 
2.51.0

From 367579274f60cb23c570ae5348966ab51e1509a4 Mon Sep 17 00:00:00 2001
From: Lorenzo Bianconi
Date: Mon, 31 Mar 2025 18:17:31 +0200
Subject: [PATCH 04/16] net: airoha: Fix ETS priomap validation

ETS Qdisc schedules SP bands in a priority order assigning band-0 the
highest priority (band-0 > band-1 > .. > band-n) while EN7581 arranges
SP bands in a priority order assigning band-7 the highest priority
(band-7 > band-6 > .. > band-0).
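As a worked illustration (not part of the patch) of how the two
orderings line up, the validation introduced below accepts exactly one
layout: WRR bands first, then the strict bands in reversed order, e.g.
for bands = 8 and nstrict = 2:

#include <stdbool.h>

static bool ets_priomap_ok(const unsigned char *priomap, int bands,
			   int nstrict)
{
	int i;

	/* SP bands sit at the tail of the priomap, reversed, so the ETS
	 * highest-priority band-0 lands on the EN7581 highest-priority
	 * SP slot.
	 */
	for (i = 0; i < nstrict; i++)
		if (priomap[bands - i - 1] != i)
			return false;

	/* WRR bands occupy the head, in order. */
	for (i = 0; i < bands - nstrict; i++)
		if (priomap[i] != nstrict + i)
			return false;

	return true;
}

/* ets_priomap_ok((unsigned char []){ 2, 3, 4, 5, 6, 7, 1, 0 }, 8, 2)
 * -> true; any other arrangement is rejected with -EINVAL.
 */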
Fix priomap check in airoha_qdma_set_tx_ets_sched routine in order to align ETS Qdisc and airoha_eth driver SP priority ordering. Fixes: b56e4d660a96 ("net: airoha: Enforce ETS Qdisc priomap") Signed-off-by: Lorenzo Bianconi Reviewed-by: Simon Horman Reviewed-by: Davide Caratti Link: https://patch.msgid.link/20250331-airoha-ets-validate-priomap-v1-1-60a524488672@kernel.org Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/airoha/airoha_eth.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/drivers/net/ethernet/airoha/airoha_eth.c b/drivers/net/ethernet/airoha/airoha_eth.c index 20a96cafc748..69e523dd4186 100644 --- a/drivers/net/ethernet/airoha/airoha_eth.c +++ b/drivers/net/ethernet/airoha/airoha_eth.c @@ -2028,7 +2028,7 @@ static int airoha_qdma_set_tx_ets_sched(struct airoha_gdm_port *port, struct tc_ets_qopt_offload_replace_params *p = &opt->replace_params; enum tx_sched_mode mode = TC_SCH_SP; u16 w[AIROHA_NUM_QOS_QUEUES] = {}; - int i, nstrict = 0, nwrr, qidx; + int i, nstrict = 0; if (p->bands > AIROHA_NUM_QOS_QUEUES) return -EINVAL; @@ -2046,17 +2046,17 @@ static int airoha_qdma_set_tx_ets_sched(struct airoha_gdm_port *port, * lowest priorities with respect to SP ones. * e.g: WRR0, WRR1, .., WRRm, SP0, SP1, .., SPn */ - nwrr = p->bands - nstrict; - qidx = nstrict && nwrr ? nstrict : 0; - for (i = 1; i <= p->bands; i++) { - if (p->priomap[i % AIROHA_NUM_QOS_QUEUES] != qidx) + for (i = 0; i < nstrict; i++) { + if (p->priomap[p->bands - i - 1] != i) return -EINVAL; - - qidx = i == nwrr ? 0 : qidx + 1; } - for (i = 0; i < nwrr; i++) + for (i = 0; i < p->bands - nstrict; i++) { + if (p->priomap[i] != nstrict + i) + return -EINVAL; + w[i] = p->weights[nstrict + i]; + } if (!nstrict) mode = TC_SCH_WRR8; -- 2.51.0 From d996e412b2dfc079bd44bff5b3bc743fdb6d7c90 Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Mon, 31 Mar 2025 07:28:14 -0700 Subject: [PATCH 05/16] bpf: add missing ops lock around dev_xdp_attach_link Syzkaller points out that create_link path doesn't grab ops lock, add it. Reported-by: syzbot+08936936fe8132f91f1a@syzkaller.appspotmail.com Closes: https://lore.kernel.org/bpf/67e6b3e8.050a0220.2f068f.0079.GAE@google.com/ Fixes: 97246d6d21c2 ("net: hold netdev instance lock during ndo_bpf") Signed-off-by: Stanislav Fomichev Link: https://patch.msgid.link/20250331142814.1887506-1-sdf@fomichev.me Signed-off-by: Jakub Kicinski --- net/core/dev.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/net/core/dev.c b/net/core/dev.c index be17e0660144..5d20ff226d5e 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -10284,7 +10284,9 @@ int bpf_xdp_link_attach(const union bpf_attr *attr, struct bpf_prog *prog) goto unlock; } + netdev_lock_ops(dev); err = dev_xdp_attach_link(dev, &extack, link); + netdev_unlock_ops(dev); rtnl_unlock(); if (err) { -- 2.51.0 From 5bbcb5902e1c7193b5ffee13251ec92878aae0e5 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Mon, 31 Mar 2025 17:15:20 -0700 Subject: [PATCH 06/16] MAINTAINERS: update Open vSwitch maintainers Pravin has not been active for a while, missingmaints reports: Subsystem OPENVSWITCH Changes 138 / 253 (54%) (No activity) Top reviewers: [41]: aconole@redhat.com [31]: horms@kernel.org [23]: echaudro@redhat.com [8]: fw@strlen.de [6]: i.maximets@ovn.org INACTIVE MAINTAINER Pravin B Shelar Let's elevate Aaron, Eelco and Ilya to the status of maintainers. 
Acked-by: Aaron Conole
Acked-by: Ilya Maximets
Acked-by: Eelco Chaudron
Acked-by: Simon Horman
Link: https://patch.msgid.link/20250401001520.2080231-1-kuba@kernel.org
Signed-off-by: Jakub Kicinski
---
 CREDITS     | 4 ++++
 MAINTAINERS | 4 +++-
 2 files changed, 7 insertions(+), 1 deletion(-)

diff --git a/CREDITS b/CREDITS
index 660d5e67af7d..0dabce0f03f0 100644
--- a/CREDITS
+++ b/CREDITS
@@ -3666,6 +3666,10 @@ S: 149 Union St.
 S: Kingston, Ontario
 S: Canada K7L 2P4
 
+N: Pravin B Shelar
+E: pshelar@ovn.org
+D: Open vSwitch maintenance and contributions
+
 N: John Shifflett
 E: john@geolog.com
 E: jshiffle@netcom.com
diff --git a/MAINTAINERS b/MAINTAINERS
index e0045ac4327b..ff882416c445 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -18097,7 +18097,9 @@ F: drivers/irqchip/irq-ompic.c
 F: drivers/irqchip/irq-or1k-*
 
 OPENVSWITCH
-M: Pravin B Shelar
+M: Aaron Conole
+M: Eelco Chaudron
+M: Ilya Maximets
 L: netdev@vger.kernel.org
 L: dev@openvswitch.org
 S: Maintained
-- 
2.51.0

From d3210dabda8dd0477f3a6301dcaf9ed44aeccd3c Mon Sep 17 00:00:00 2001
From: Greg Thelen
Date: Mon, 31 Mar 2025 18:53:15 -0700
Subject: [PATCH 07/16] eth: mlx4: select PAGE_POOL

With commit 8533b14b3d65 ("eth: mlx4: create a page pool for Rx") mlx4
started using functions guarded by PAGE_POOL. This change introduced
build errors when CONFIG_MLX4_EN is set but CONFIG_PAGE_POOL is not:

ld: vmlinux.o: in function `mlx4_en_alloc_frags':
en_rx.c:(.text+0xa5eaf9): undefined reference to `page_pool_alloc_pages'
ld: vmlinux.o: in function `mlx4_en_create_rx_ring':
(.text+0xa5ee91): undefined reference to `page_pool_create'

Make MLX4_EN select PAGE_POOL to fix the mlx4 build errors.

Fixes: 8533b14b3d65 ("eth: mlx4: create a page pool for Rx")
Signed-off-by: Greg Thelen
Reviewed-by: Joe Damato
Link: https://patch.msgid.link/20250401015315.2306092-1-gthelen@google.com
Signed-off-by: Jakub Kicinski
---
 drivers/net/ethernet/mellanox/mlx4/Kconfig | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/net/ethernet/mellanox/mlx4/Kconfig b/drivers/net/ethernet/mellanox/mlx4/Kconfig
index 825e05fb8607..0b1cb340206f 100644
--- a/drivers/net/ethernet/mellanox/mlx4/Kconfig
+++ b/drivers/net/ethernet/mellanox/mlx4/Kconfig
@@ -7,6 +7,7 @@ config MLX4_EN
 	tristate "Mellanox Technologies 1/10/40Gbit Ethernet support"
 	depends on PCI && NETDEVICES && ETHERNET && INET
 	depends on PTP_1588_CLOCK_OPTIONAL
+	select PAGE_POOL
 	select MLX4_CORE
 	help
 	  This driver supports Mellanox Technologies ConnectX Ethernet
-- 
2.51.0

From 96844075226b49af25a69a1d084b648ec2d9b08d Mon Sep 17 00:00:00 2001
From: Tobias Waldekranz
Date: Tue, 1 Apr 2025 08:58:04 +0200
Subject: [PATCH 08/16] net: mvpp2: Prevent parser TCAM memory corruption

Protect the parser TCAM/SRAM memory, and the cached (shadow) SRAM
information, from concurrent modifications.

Both the TCAM and SRAM tables are indirectly accessed by configuring
an index register that selects the row to read or write to. This means
that operations must be atomic in order to, e.g., avoid spreading
writes across multiple rows.

Since the shadow SRAM array is used to find free rows in the hardware
table, it must also be protected in order to avoid TOCTOU errors where
multiple cores allocate the same row.

This issue was detected in a situation where `mvpp2_set_rx_mode()` ran
concurrently on two CPUs. In this particular case the
MVPP2_PE_MAC_UC_PROMISCUOUS entry was corrupted, causing the
classifier unit to drop all incoming unicast - indicated by the
`rx_classifier_drops` counter.
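To illustrate the hazard, a minimal sketch (not the driver code;
register names as in mvpp2.h, error handling omitted) of the indirect
access pattern that the new prs_spinlock serializes:

static void prs_tcam_row_write(struct mvpp2 *priv, int row,
			       const u32 *words, int nwords)
{
	int i;

	spin_lock_bh(&priv->prs_spinlock);
	/* Select the row; the data registers now alias that row.
	 * Without the lock, another CPU could move the index register
	 * here, and the words below would land in a different row.
	 */
	mvpp2_write(priv, MVPP2_PRS_TCAM_IDX_REG, row);
	for (i = 0; i < nwords; i++)
		mvpp2_write(priv, MVPP2_PRS_TCAM_DATA_REG(i), words[i]);
	spin_unlock_bh(&priv->prs_spinlock);
}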
Fixes: 3f518509dedc ("ethernet: Add new driver for Marvell Armada 375 network unit") Signed-off-by: Tobias Waldekranz Reviewed-by: Maxime Chevallier Tested-by: Maxime Chevallier Link: https://patch.msgid.link/20250401065855.3113635-1-tobias@waldekranz.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/marvell/mvpp2/mvpp2.h | 3 + .../net/ethernet/marvell/mvpp2/mvpp2_main.c | 3 +- .../net/ethernet/marvell/mvpp2/mvpp2_prs.c | 201 ++++++++++++------ 3 files changed, 140 insertions(+), 67 deletions(-) diff --git a/drivers/net/ethernet/marvell/mvpp2/mvpp2.h b/drivers/net/ethernet/marvell/mvpp2/mvpp2.h index 44fe9b68d1c2..061fcd444d50 100644 --- a/drivers/net/ethernet/marvell/mvpp2/mvpp2.h +++ b/drivers/net/ethernet/marvell/mvpp2/mvpp2.h @@ -1113,6 +1113,9 @@ struct mvpp2 { /* Spinlocks for CM3 shared memory configuration */ spinlock_t mss_spinlock; + + /* Spinlock for shared PRS parser memory and shadow table */ + spinlock_t prs_spinlock; }; struct mvpp2_pcpu_stats { diff --git a/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c b/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c index 566c12c89520..416a926a8281 100644 --- a/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c +++ b/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c @@ -7723,8 +7723,9 @@ static int mvpp2_probe(struct platform_device *pdev) if (mvpp2_read(priv, MVPP2_VER_ID_REG) == MVPP2_VER_PP23) priv->hw_version = MVPP23; - /* Init mss lock */ + /* Init locks for shared packet processor resources */ spin_lock_init(&priv->mss_spinlock); + spin_lock_init(&priv->prs_spinlock); /* Initialize network controller */ err = mvpp2_init(pdev, priv); diff --git a/drivers/net/ethernet/marvell/mvpp2/mvpp2_prs.c b/drivers/net/ethernet/marvell/mvpp2/mvpp2_prs.c index 9af22f497a40..93e978bdf303 100644 --- a/drivers/net/ethernet/marvell/mvpp2/mvpp2_prs.c +++ b/drivers/net/ethernet/marvell/mvpp2/mvpp2_prs.c @@ -23,6 +23,8 @@ static int mvpp2_prs_hw_write(struct mvpp2 *priv, struct mvpp2_prs_entry *pe) { int i; + lockdep_assert_held(&priv->prs_spinlock); + if (pe->index > MVPP2_PRS_TCAM_SRAM_SIZE - 1) return -EINVAL; @@ -43,11 +45,13 @@ static int mvpp2_prs_hw_write(struct mvpp2 *priv, struct mvpp2_prs_entry *pe) } /* Initialize tcam entry from hw */ -int mvpp2_prs_init_from_hw(struct mvpp2 *priv, struct mvpp2_prs_entry *pe, - int tid) +static int __mvpp2_prs_init_from_hw(struct mvpp2 *priv, + struct mvpp2_prs_entry *pe, int tid) { int i; + lockdep_assert_held(&priv->prs_spinlock); + if (tid > MVPP2_PRS_TCAM_SRAM_SIZE - 1) return -EINVAL; @@ -73,6 +77,18 @@ int mvpp2_prs_init_from_hw(struct mvpp2 *priv, struct mvpp2_prs_entry *pe, return 0; } +int mvpp2_prs_init_from_hw(struct mvpp2 *priv, struct mvpp2_prs_entry *pe, + int tid) +{ + int err; + + spin_lock_bh(&priv->prs_spinlock); + err = __mvpp2_prs_init_from_hw(priv, pe, tid); + spin_unlock_bh(&priv->prs_spinlock); + + return err; +} + /* Invalidate tcam hw entry */ static void mvpp2_prs_hw_inv(struct mvpp2 *priv, int index) { @@ -374,7 +390,7 @@ static int mvpp2_prs_flow_find(struct mvpp2 *priv, int flow) priv->prs_shadow[tid].lu != MVPP2_PRS_LU_FLOWS) continue; - mvpp2_prs_init_from_hw(priv, &pe, tid); + __mvpp2_prs_init_from_hw(priv, &pe, tid); bits = mvpp2_prs_sram_ai_get(&pe); /* Sram store classification lookup ID in AI bits [5:0] */ @@ -441,7 +457,7 @@ static void mvpp2_prs_mac_drop_all_set(struct mvpp2 *priv, int port, bool add) if (priv->prs_shadow[MVPP2_PE_DROP_ALL].valid) { /* Entry exist - update port only */ - mvpp2_prs_init_from_hw(priv, &pe, MVPP2_PE_DROP_ALL); + 
__mvpp2_prs_init_from_hw(priv, &pe, MVPP2_PE_DROP_ALL); } else { /* Entry doesn't exist - create new */ memset(&pe, 0, sizeof(pe)); @@ -469,14 +485,17 @@ static void mvpp2_prs_mac_drop_all_set(struct mvpp2 *priv, int port, bool add) } /* Set port to unicast or multicast promiscuous mode */ -void mvpp2_prs_mac_promisc_set(struct mvpp2 *priv, int port, - enum mvpp2_prs_l2_cast l2_cast, bool add) +static void __mvpp2_prs_mac_promisc_set(struct mvpp2 *priv, int port, + enum mvpp2_prs_l2_cast l2_cast, + bool add) { struct mvpp2_prs_entry pe; unsigned char cast_match; unsigned int ri; int tid; + lockdep_assert_held(&priv->prs_spinlock); + if (l2_cast == MVPP2_PRS_L2_UNI_CAST) { cast_match = MVPP2_PRS_UCAST_VAL; tid = MVPP2_PE_MAC_UC_PROMISCUOUS; @@ -489,7 +508,7 @@ void mvpp2_prs_mac_promisc_set(struct mvpp2 *priv, int port, /* promiscuous mode - Accept unknown unicast or multicast packets */ if (priv->prs_shadow[tid].valid) { - mvpp2_prs_init_from_hw(priv, &pe, tid); + __mvpp2_prs_init_from_hw(priv, &pe, tid); } else { memset(&pe, 0, sizeof(pe)); mvpp2_prs_tcam_lu_set(&pe, MVPP2_PRS_LU_MAC); @@ -522,6 +541,14 @@ void mvpp2_prs_mac_promisc_set(struct mvpp2 *priv, int port, mvpp2_prs_hw_write(priv, &pe); } +void mvpp2_prs_mac_promisc_set(struct mvpp2 *priv, int port, + enum mvpp2_prs_l2_cast l2_cast, bool add) +{ + spin_lock_bh(&priv->prs_spinlock); + __mvpp2_prs_mac_promisc_set(priv, port, l2_cast, add); + spin_unlock_bh(&priv->prs_spinlock); +} + /* Set entry for dsa packets */ static void mvpp2_prs_dsa_tag_set(struct mvpp2 *priv, int port, bool add, bool tagged, bool extend) @@ -539,7 +566,7 @@ static void mvpp2_prs_dsa_tag_set(struct mvpp2 *priv, int port, bool add, if (priv->prs_shadow[tid].valid) { /* Entry exist - update port only */ - mvpp2_prs_init_from_hw(priv, &pe, tid); + __mvpp2_prs_init_from_hw(priv, &pe, tid); } else { /* Entry doesn't exist - create new */ memset(&pe, 0, sizeof(pe)); @@ -610,7 +637,7 @@ static void mvpp2_prs_dsa_tag_ethertype_set(struct mvpp2 *priv, int port, if (priv->prs_shadow[tid].valid) { /* Entry exist - update port only */ - mvpp2_prs_init_from_hw(priv, &pe, tid); + __mvpp2_prs_init_from_hw(priv, &pe, tid); } else { /* Entry doesn't exist - create new */ memset(&pe, 0, sizeof(pe)); @@ -673,7 +700,7 @@ static int mvpp2_prs_vlan_find(struct mvpp2 *priv, unsigned short tpid, int ai) priv->prs_shadow[tid].lu != MVPP2_PRS_LU_VLAN) continue; - mvpp2_prs_init_from_hw(priv, &pe, tid); + __mvpp2_prs_init_from_hw(priv, &pe, tid); match = mvpp2_prs_tcam_data_cmp(&pe, 0, tpid); if (!match) continue; @@ -726,7 +753,7 @@ static int mvpp2_prs_vlan_add(struct mvpp2 *priv, unsigned short tpid, int ai, priv->prs_shadow[tid_aux].lu != MVPP2_PRS_LU_VLAN) continue; - mvpp2_prs_init_from_hw(priv, &pe, tid_aux); + __mvpp2_prs_init_from_hw(priv, &pe, tid_aux); ri_bits = mvpp2_prs_sram_ri_get(&pe); if ((ri_bits & MVPP2_PRS_RI_VLAN_MASK) == MVPP2_PRS_RI_VLAN_DOUBLE) @@ -760,7 +787,7 @@ static int mvpp2_prs_vlan_add(struct mvpp2 *priv, unsigned short tpid, int ai, mvpp2_prs_shadow_set(priv, pe.index, MVPP2_PRS_LU_VLAN); } else { - mvpp2_prs_init_from_hw(priv, &pe, tid); + __mvpp2_prs_init_from_hw(priv, &pe, tid); } /* Update ports' mask */ mvpp2_prs_tcam_port_map_set(&pe, port_map); @@ -800,7 +827,7 @@ static int mvpp2_prs_double_vlan_find(struct mvpp2 *priv, unsigned short tpid1, priv->prs_shadow[tid].lu != MVPP2_PRS_LU_VLAN) continue; - mvpp2_prs_init_from_hw(priv, &pe, tid); + __mvpp2_prs_init_from_hw(priv, &pe, tid); match = mvpp2_prs_tcam_data_cmp(&pe, 0, tpid1) && 
mvpp2_prs_tcam_data_cmp(&pe, 4, tpid2); @@ -849,7 +876,7 @@ static int mvpp2_prs_double_vlan_add(struct mvpp2 *priv, unsigned short tpid1, priv->prs_shadow[tid_aux].lu != MVPP2_PRS_LU_VLAN) continue; - mvpp2_prs_init_from_hw(priv, &pe, tid_aux); + __mvpp2_prs_init_from_hw(priv, &pe, tid_aux); ri_bits = mvpp2_prs_sram_ri_get(&pe); ri_bits &= MVPP2_PRS_RI_VLAN_MASK; if (ri_bits == MVPP2_PRS_RI_VLAN_SINGLE || @@ -880,7 +907,7 @@ static int mvpp2_prs_double_vlan_add(struct mvpp2 *priv, unsigned short tpid1, mvpp2_prs_shadow_set(priv, pe.index, MVPP2_PRS_LU_VLAN); } else { - mvpp2_prs_init_from_hw(priv, &pe, tid); + __mvpp2_prs_init_from_hw(priv, &pe, tid); } /* Update ports' mask */ @@ -1213,8 +1240,8 @@ static void mvpp2_prs_mac_init(struct mvpp2 *priv) /* Create dummy entries for drop all and promiscuous modes */ mvpp2_prs_drop_fc(priv); mvpp2_prs_mac_drop_all_set(priv, 0, false); - mvpp2_prs_mac_promisc_set(priv, 0, MVPP2_PRS_L2_UNI_CAST, false); - mvpp2_prs_mac_promisc_set(priv, 0, MVPP2_PRS_L2_MULTI_CAST, false); + __mvpp2_prs_mac_promisc_set(priv, 0, MVPP2_PRS_L2_UNI_CAST, false); + __mvpp2_prs_mac_promisc_set(priv, 0, MVPP2_PRS_L2_MULTI_CAST, false); } /* Set default entries for various types of dsa packets */ @@ -1533,12 +1560,6 @@ static int mvpp2_prs_vlan_init(struct platform_device *pdev, struct mvpp2 *priv) struct mvpp2_prs_entry pe; int err; - priv->prs_double_vlans = devm_kcalloc(&pdev->dev, sizeof(bool), - MVPP2_PRS_DBL_VLANS_MAX, - GFP_KERNEL); - if (!priv->prs_double_vlans) - return -ENOMEM; - /* Double VLAN: 0x88A8, 0x8100 */ err = mvpp2_prs_double_vlan_add(priv, ETH_P_8021AD, ETH_P_8021Q, MVPP2_PRS_PORT_MASK); @@ -1941,7 +1962,7 @@ static int mvpp2_prs_vid_range_find(struct mvpp2_port *port, u16 vid, u16 mask) port->priv->prs_shadow[tid].lu != MVPP2_PRS_LU_VID) continue; - mvpp2_prs_init_from_hw(port->priv, &pe, tid); + __mvpp2_prs_init_from_hw(port->priv, &pe, tid); mvpp2_prs_tcam_data_byte_get(&pe, 2, &byte[0], &enable[0]); mvpp2_prs_tcam_data_byte_get(&pe, 3, &byte[1], &enable[1]); @@ -1970,6 +1991,8 @@ int mvpp2_prs_vid_entry_add(struct mvpp2_port *port, u16 vid) memset(&pe, 0, sizeof(pe)); + spin_lock_bh(&priv->prs_spinlock); + /* Scan TCAM and see if entry with this already exist */ tid = mvpp2_prs_vid_range_find(port, vid, mask); @@ -1988,8 +2011,10 @@ int mvpp2_prs_vid_entry_add(struct mvpp2_port *port, u16 vid) MVPP2_PRS_VLAN_FILT_MAX_ENTRY); /* There isn't room for a new VID filter */ - if (tid < 0) + if (tid < 0) { + spin_unlock_bh(&priv->prs_spinlock); return tid; + } mvpp2_prs_tcam_lu_set(&pe, MVPP2_PRS_LU_VID); pe.index = tid; @@ -1997,7 +2022,7 @@ int mvpp2_prs_vid_entry_add(struct mvpp2_port *port, u16 vid) /* Mask all ports */ mvpp2_prs_tcam_port_map_set(&pe, 0); } else { - mvpp2_prs_init_from_hw(priv, &pe, tid); + __mvpp2_prs_init_from_hw(priv, &pe, tid); } /* Enable the current port */ @@ -2019,6 +2044,7 @@ int mvpp2_prs_vid_entry_add(struct mvpp2_port *port, u16 vid) mvpp2_prs_shadow_set(priv, pe.index, MVPP2_PRS_LU_VID); mvpp2_prs_hw_write(priv, &pe); + spin_unlock_bh(&priv->prs_spinlock); return 0; } @@ -2028,15 +2054,16 @@ void mvpp2_prs_vid_entry_remove(struct mvpp2_port *port, u16 vid) struct mvpp2 *priv = port->priv; int tid; - /* Scan TCAM and see if entry with this already exist */ - tid = mvpp2_prs_vid_range_find(port, vid, 0xfff); + spin_lock_bh(&priv->prs_spinlock); - /* No such entry */ - if (tid < 0) - return; + /* Invalidate TCAM entry with this , if it exists */ + tid = mvpp2_prs_vid_range_find(port, vid, 0xfff); + if (tid >= 0) { + 
mvpp2_prs_hw_inv(priv, tid); + priv->prs_shadow[tid].valid = false; + } - mvpp2_prs_hw_inv(priv, tid); - priv->prs_shadow[tid].valid = false; + spin_unlock_bh(&priv->prs_spinlock); } /* Remove all existing VID filters on this port */ @@ -2045,6 +2072,8 @@ void mvpp2_prs_vid_remove_all(struct mvpp2_port *port) struct mvpp2 *priv = port->priv; int tid; + spin_lock_bh(&priv->prs_spinlock); + for (tid = MVPP2_PRS_VID_PORT_FIRST(port->id); tid <= MVPP2_PRS_VID_PORT_LAST(port->id); tid++) { if (priv->prs_shadow[tid].valid) { @@ -2052,6 +2081,8 @@ void mvpp2_prs_vid_remove_all(struct mvpp2_port *port) priv->prs_shadow[tid].valid = false; } } + + spin_unlock_bh(&priv->prs_spinlock); } /* Remove VID filering entry for this port */ @@ -2060,10 +2091,14 @@ void mvpp2_prs_vid_disable_filtering(struct mvpp2_port *port) unsigned int tid = MVPP2_PRS_VID_PORT_DFLT(port->id); struct mvpp2 *priv = port->priv; + spin_lock_bh(&priv->prs_spinlock); + /* Invalidate the guard entry */ mvpp2_prs_hw_inv(priv, tid); priv->prs_shadow[tid].valid = false; + + spin_unlock_bh(&priv->prs_spinlock); } /* Add guard entry that drops packets when no VID is matched on this port */ @@ -2079,6 +2114,8 @@ void mvpp2_prs_vid_enable_filtering(struct mvpp2_port *port) memset(&pe, 0, sizeof(pe)); + spin_lock_bh(&priv->prs_spinlock); + pe.index = tid; reg_val = mvpp2_read(priv, MVPP2_MH_REG(port->id)); @@ -2111,6 +2148,8 @@ void mvpp2_prs_vid_enable_filtering(struct mvpp2_port *port) /* Update shadow table */ mvpp2_prs_shadow_set(priv, pe.index, MVPP2_PRS_LU_VID); mvpp2_prs_hw_write(priv, &pe); + + spin_unlock_bh(&priv->prs_spinlock); } /* Parser default initialization */ @@ -2118,6 +2157,20 @@ int mvpp2_prs_default_init(struct platform_device *pdev, struct mvpp2 *priv) { int err, index, i; + priv->prs_shadow = devm_kcalloc(&pdev->dev, MVPP2_PRS_TCAM_SRAM_SIZE, + sizeof(*priv->prs_shadow), + GFP_KERNEL); + if (!priv->prs_shadow) + return -ENOMEM; + + priv->prs_double_vlans = devm_kcalloc(&pdev->dev, sizeof(bool), + MVPP2_PRS_DBL_VLANS_MAX, + GFP_KERNEL); + if (!priv->prs_double_vlans) + return -ENOMEM; + + spin_lock_bh(&priv->prs_spinlock); + /* Enable tcam table */ mvpp2_write(priv, MVPP2_PRS_TCAM_CTRL_REG, MVPP2_PRS_TCAM_EN_MASK); @@ -2136,12 +2189,6 @@ int mvpp2_prs_default_init(struct platform_device *pdev, struct mvpp2 *priv) for (index = 0; index < MVPP2_PRS_TCAM_SRAM_SIZE; index++) mvpp2_prs_hw_inv(priv, index); - priv->prs_shadow = devm_kcalloc(&pdev->dev, MVPP2_PRS_TCAM_SRAM_SIZE, - sizeof(*priv->prs_shadow), - GFP_KERNEL); - if (!priv->prs_shadow) - return -ENOMEM; - /* Always start from lookup = 0 */ for (index = 0; index < MVPP2_MAX_PORTS; index++) mvpp2_prs_hw_port_init(priv, index, MVPP2_PRS_LU_MH, @@ -2158,26 +2205,13 @@ int mvpp2_prs_default_init(struct platform_device *pdev, struct mvpp2 *priv) mvpp2_prs_vid_init(priv); err = mvpp2_prs_etype_init(priv); - if (err) - return err; - - err = mvpp2_prs_vlan_init(pdev, priv); - if (err) - return err; - - err = mvpp2_prs_pppoe_init(priv); - if (err) - return err; - - err = mvpp2_prs_ip6_init(priv); - if (err) - return err; - - err = mvpp2_prs_ip4_init(priv); - if (err) - return err; + err = err ? : mvpp2_prs_vlan_init(pdev, priv); + err = err ? : mvpp2_prs_pppoe_init(priv); + err = err ? : mvpp2_prs_ip6_init(priv); + err = err ? 
: mvpp2_prs_ip4_init(priv); - return 0; + spin_unlock_bh(&priv->prs_spinlock); + return err; } /* Compare MAC DA with tcam entry data */ @@ -2217,7 +2251,7 @@ mvpp2_prs_mac_da_range_find(struct mvpp2 *priv, int pmap, const u8 *da, (priv->prs_shadow[tid].udf != udf_type)) continue; - mvpp2_prs_init_from_hw(priv, &pe, tid); + __mvpp2_prs_init_from_hw(priv, &pe, tid); entry_pmap = mvpp2_prs_tcam_port_map_get(&pe); if (mvpp2_prs_mac_range_equals(&pe, da, mask) && @@ -2229,7 +2263,8 @@ mvpp2_prs_mac_da_range_find(struct mvpp2 *priv, int pmap, const u8 *da, } /* Update parser's mac da entry */ -int mvpp2_prs_mac_da_accept(struct mvpp2_port *port, const u8 *da, bool add) +static int __mvpp2_prs_mac_da_accept(struct mvpp2_port *port, + const u8 *da, bool add) { unsigned char mask[ETH_ALEN] = { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff }; struct mvpp2 *priv = port->priv; @@ -2261,7 +2296,7 @@ int mvpp2_prs_mac_da_accept(struct mvpp2_port *port, const u8 *da, bool add) /* Mask all ports */ mvpp2_prs_tcam_port_map_set(&pe, 0); } else { - mvpp2_prs_init_from_hw(priv, &pe, tid); + __mvpp2_prs_init_from_hw(priv, &pe, tid); } mvpp2_prs_tcam_lu_set(&pe, MVPP2_PRS_LU_MAC); @@ -2317,6 +2352,17 @@ int mvpp2_prs_mac_da_accept(struct mvpp2_port *port, const u8 *da, bool add) return 0; } +int mvpp2_prs_mac_da_accept(struct mvpp2_port *port, const u8 *da, bool add) +{ + int err; + + spin_lock_bh(&port->priv->prs_spinlock); + err = __mvpp2_prs_mac_da_accept(port, da, add); + spin_unlock_bh(&port->priv->prs_spinlock); + + return err; +} + int mvpp2_prs_update_mac_da(struct net_device *dev, const u8 *da) { struct mvpp2_port *port = netdev_priv(dev); @@ -2345,6 +2391,8 @@ void mvpp2_prs_mac_del_all(struct mvpp2_port *port) unsigned long pmap; int index, tid; + spin_lock_bh(&priv->prs_spinlock); + for (tid = MVPP2_PE_MAC_RANGE_START; tid <= MVPP2_PE_MAC_RANGE_END; tid++) { unsigned char da[ETH_ALEN], da_mask[ETH_ALEN]; @@ -2354,7 +2402,7 @@ void mvpp2_prs_mac_del_all(struct mvpp2_port *port) (priv->prs_shadow[tid].udf != MVPP2_PRS_UDF_MAC_DEF)) continue; - mvpp2_prs_init_from_hw(priv, &pe, tid); + __mvpp2_prs_init_from_hw(priv, &pe, tid); pmap = mvpp2_prs_tcam_port_map_get(&pe); @@ -2375,14 +2423,17 @@ void mvpp2_prs_mac_del_all(struct mvpp2_port *port) continue; /* Remove entry from TCAM */ - mvpp2_prs_mac_da_accept(port, da, false); + __mvpp2_prs_mac_da_accept(port, da, false); } + + spin_unlock_bh(&priv->prs_spinlock); } int mvpp2_prs_tag_mode_set(struct mvpp2 *priv, int port, int type) { switch (type) { case MVPP2_TAG_TYPE_EDSA: + spin_lock_bh(&priv->prs_spinlock); /* Add port to EDSA entries */ mvpp2_prs_dsa_tag_set(priv, port, true, MVPP2_PRS_TAGGED, MVPP2_PRS_EDSA); @@ -2393,9 +2444,11 @@ int mvpp2_prs_tag_mode_set(struct mvpp2 *priv, int port, int type) MVPP2_PRS_TAGGED, MVPP2_PRS_DSA); mvpp2_prs_dsa_tag_set(priv, port, false, MVPP2_PRS_UNTAGGED, MVPP2_PRS_DSA); + spin_unlock_bh(&priv->prs_spinlock); break; case MVPP2_TAG_TYPE_DSA: + spin_lock_bh(&priv->prs_spinlock); /* Add port to DSA entries */ mvpp2_prs_dsa_tag_set(priv, port, true, MVPP2_PRS_TAGGED, MVPP2_PRS_DSA); @@ -2406,10 +2459,12 @@ int mvpp2_prs_tag_mode_set(struct mvpp2 *priv, int port, int type) MVPP2_PRS_TAGGED, MVPP2_PRS_EDSA); mvpp2_prs_dsa_tag_set(priv, port, false, MVPP2_PRS_UNTAGGED, MVPP2_PRS_EDSA); + spin_unlock_bh(&priv->prs_spinlock); break; case MVPP2_TAG_TYPE_MH: case MVPP2_TAG_TYPE_NONE: + spin_lock_bh(&priv->prs_spinlock); /* Remove port form EDSA and DSA entries */ mvpp2_prs_dsa_tag_set(priv, port, false, MVPP2_PRS_TAGGED, 
MVPP2_PRS_DSA); @@ -2419,6 +2474,7 @@ int mvpp2_prs_tag_mode_set(struct mvpp2 *priv, int port, int type) MVPP2_PRS_TAGGED, MVPP2_PRS_EDSA); mvpp2_prs_dsa_tag_set(priv, port, false, MVPP2_PRS_UNTAGGED, MVPP2_PRS_EDSA); + spin_unlock_bh(&priv->prs_spinlock); break; default: @@ -2437,11 +2493,15 @@ int mvpp2_prs_add_flow(struct mvpp2 *priv, int flow, u32 ri, u32 ri_mask) memset(&pe, 0, sizeof(pe)); + spin_lock_bh(&priv->prs_spinlock); + tid = mvpp2_prs_tcam_first_free(priv, MVPP2_PE_LAST_FREE_TID, MVPP2_PE_FIRST_FREE_TID); - if (tid < 0) + if (tid < 0) { + spin_unlock_bh(&priv->prs_spinlock); return tid; + } pe.index = tid; @@ -2461,6 +2521,7 @@ int mvpp2_prs_add_flow(struct mvpp2 *priv, int flow, u32 ri, u32 ri_mask) mvpp2_prs_tcam_port_map_set(&pe, MVPP2_PRS_PORT_MASK); mvpp2_prs_hw_write(priv, &pe); + spin_unlock_bh(&priv->prs_spinlock); return 0; } @@ -2472,6 +2533,8 @@ int mvpp2_prs_def_flow(struct mvpp2_port *port) memset(&pe, 0, sizeof(pe)); + spin_lock_bh(&port->priv->prs_spinlock); + tid = mvpp2_prs_flow_find(port->priv, port->id); /* Such entry not exist */ @@ -2480,8 +2543,10 @@ int mvpp2_prs_def_flow(struct mvpp2_port *port) tid = mvpp2_prs_tcam_first_free(port->priv, MVPP2_PE_LAST_FREE_TID, MVPP2_PE_FIRST_FREE_TID); - if (tid < 0) + if (tid < 0) { + spin_unlock_bh(&port->priv->prs_spinlock); return tid; + } pe.index = tid; @@ -2492,13 +2557,14 @@ int mvpp2_prs_def_flow(struct mvpp2_port *port) /* Update shadow table */ mvpp2_prs_shadow_set(port->priv, pe.index, MVPP2_PRS_LU_FLOWS); } else { - mvpp2_prs_init_from_hw(port->priv, &pe, tid); + __mvpp2_prs_init_from_hw(port->priv, &pe, tid); } mvpp2_prs_tcam_lu_set(&pe, MVPP2_PRS_LU_FLOWS); mvpp2_prs_tcam_port_map_set(&pe, (1 << port->id)); mvpp2_prs_hw_write(port->priv, &pe); + spin_unlock_bh(&port->priv->prs_spinlock); return 0; } @@ -2509,11 +2575,14 @@ int mvpp2_prs_hits(struct mvpp2 *priv, int index) if (index > MVPP2_PRS_TCAM_SRAM_SIZE) return -EINVAL; + spin_lock_bh(&priv->prs_spinlock); + mvpp2_write(priv, MVPP2_PRS_TCAM_HIT_IDX_REG, index); val = mvpp2_read(priv, MVPP2_PRS_TCAM_HIT_CNT_REG); val &= MVPP2_PRS_TCAM_HIT_CNT_MASK; + spin_unlock_bh(&priv->prs_spinlock); return val; } -- 2.51.0 From ca9e5d3d9a4d7d30355aa68bb30dc6f850f067ab Mon Sep 17 00:00:00 2001 From: Pedro Tammela Date: Tue, 1 Apr 2025 11:49:08 -0300 Subject: [PATCH 09/16] selftests: tc-testing: fix nat regex matching In iproute 6.14, the nat ip mask logic was fixed to remove an undefined behaviour[1]. So now instead of reporting '0.0.0.0/32' on x86 and potentially '0.0.0.0/0' in other platforms, it reports '0.0.0.0/0' in all platforms. 
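A standalone sketch of the pitfall (an assumed form of the old iproute2
mask computation, for illustration only, not the exact code):

#include <stdint.h>

/* Old: shifting a 32-bit value by 32 (prefixlen == 0, i.e. "default",
 * "any" or "all") is undefined behaviour in C; x86 left the mask
 * all-ones (printed as /32), other platforms could produce 0 (/0).
 */
uint32_t old_mask(int prefixlen)
{
	return ~0u << (32 - prefixlen);
}

/* Fixed: handle the zero-length prefix explicitly, so every platform
 * reports /0.
 */
uint32_t new_mask(int prefixlen)
{
	return prefixlen ? ~0u << (32 - prefixlen) : 0;
}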
[1] https://lore.kernel.org/netdev/20250306112520.188728-1-torben.nielsen@prevas.dk/ Reviewed-by: Simon Horman Signed-off-by: Pedro Tammela Link: https://patch.msgid.link/20250401144908.568140-1-pctammela@mojatatu.com Signed-off-by: Jakub Kicinski --- .../selftests/tc-testing/tc-tests/actions/nat.json | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/tools/testing/selftests/tc-testing/tc-tests/actions/nat.json b/tools/testing/selftests/tc-testing/tc-tests/actions/nat.json index ee2792998c89..4f21aeb8a3fb 100644 --- a/tools/testing/selftests/tc-testing/tc-tests/actions/nat.json +++ b/tools/testing/selftests/tc-testing/tc-tests/actions/nat.json @@ -305,7 +305,7 @@ "cmdUnderTest": "$TC actions add action nat ingress default 10.10.10.1 index 12", "expExitCode": "0", "verifyCmd": "$TC actions get action nat index 12", - "matchPattern": "action order [0-9]+: nat ingress 0.0.0.0/32 10.10.10.1 pass.*index 12 ref", + "matchPattern": "action order [0-9]+: nat ingress 0.0.0.0/0 10.10.10.1 pass.*index 12 ref", "matchCount": "1", "teardown": [ "$TC actions flush action nat" @@ -332,7 +332,7 @@ "cmdUnderTest": "$TC actions add action nat ingress any 10.10.10.1 index 12", "expExitCode": "0", "verifyCmd": "$TC actions get action nat index 12", - "matchPattern": "action order [0-9]+: nat ingress 0.0.0.0/32 10.10.10.1 pass.*index 12 ref", + "matchPattern": "action order [0-9]+: nat ingress 0.0.0.0/0 10.10.10.1 pass.*index 12 ref", "matchCount": "1", "teardown": [ "$TC actions flush action nat" @@ -359,7 +359,7 @@ "cmdUnderTest": "$TC actions add action nat ingress all 10.10.10.1 index 12", "expExitCode": "0", "verifyCmd": "$TC actions get action nat index 12", - "matchPattern": "action order [0-9]+: nat ingress 0.0.0.0/32 10.10.10.1 pass.*index 12 ref", + "matchPattern": "action order [0-9]+: nat ingress 0.0.0.0/0 10.10.10.1 pass.*index 12 ref", "matchCount": "1", "teardown": [ "$TC actions flush action nat" @@ -548,7 +548,7 @@ "cmdUnderTest": "$TC actions add action nat egress default 20.20.20.1 pipe index 10", "expExitCode": "0", "verifyCmd": "$TC actions get action nat index 10", - "matchPattern": "action order [0-9]+: nat egress 0.0.0.0/32 20.20.20.1 pipe.*index 10 ref", + "matchPattern": "action order [0-9]+: nat egress 0.0.0.0/0 20.20.20.1 pipe.*index 10 ref", "matchCount": "1", "teardown": [ "$TC actions flush action nat" @@ -575,7 +575,7 @@ "cmdUnderTest": "$TC actions add action nat egress any 20.20.20.1 pipe index 10", "expExitCode": "0", "verifyCmd": "$TC actions get action nat index 10", - "matchPattern": "action order [0-9]+: nat egress 0.0.0.0/32 20.20.20.1 pipe.*index 10 ref", + "matchPattern": "action order [0-9]+: nat egress 0.0.0.0/0 20.20.20.1 pipe.*index 10 ref", "matchCount": "1", "teardown": [ "$TC actions flush action nat" @@ -602,7 +602,7 @@ "cmdUnderTest": "$TC actions add action nat egress all 20.20.20.1 pipe index 10", "expExitCode": "0", "verifyCmd": "$TC actions get action nat index 10", - "matchPattern": "action order [0-9]+: nat egress 0.0.0.0/32 20.20.20.1 pipe.*index 10 ref", + "matchPattern": "action order [0-9]+: nat egress 0.0.0.0/0 20.20.20.1 pipe.*index 10 ref", "matchCount": "1", "teardown": [ "$TC actions flush action nat" @@ -629,7 +629,7 @@ "cmdUnderTest": "$TC actions add action nat egress all 20.20.20.1 pipe index 10 cookie aa1bc2d3eeff112233445566778800a1", "expExitCode": "0", "verifyCmd": "$TC actions get action nat index 10", - "matchPattern": "action order [0-9]+: nat egress 0.0.0.0/32 20.20.20.1 pipe.*index 10 ref.*cookie 
aa1bc2d3eeff112233445566778800a1", + "matchPattern": "action order [0-9]+: nat egress 0.0.0.0/0 20.20.20.1 pipe.*index 10 ref.*cookie aa1bc2d3eeff112233445566778800a1", "matchCount": "1", "teardown": [ "$TC actions flush action nat" -- 2.51.0 From 1b7fdc702c031134c619b74c4f311c590e50ca3d Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Tue, 1 Apr 2025 12:07:08 -0700 Subject: [PATCH 10/16] rtnetlink: Use register_pernet_subsys() in rtnl_net_debug_init(). rtnl_net_debug_init() registers rtnl_net_debug_net_ops by register_pernet_device() but calls unregister_pernet_subsys() in case register_netdevice_notifier() fails. It corrupts pernet_list because first_device is updated in register_pernet_device() but not unregister_pernet_subsys(). Let's fix it by calling register_pernet_subsys() instead. The _subsys() one fits better for the use case because it keeps the notifier alive until default_device_exit_net(), giving it more chance to test NETDEV_UNREGISTER. Fixes: 03fa53485659 ("rtnetlink: Add ASSERT_RTNL_NET() placeholder for netdev notifier.") Signed-off-by: Kuniyuki Iwashima Reviewed-by: Simon Horman Link: https://patch.msgid.link/20250401190716.70437-1-kuniyu@amazon.com Signed-off-by: Jakub Kicinski --- net/core/rtnl_net_debug.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/core/rtnl_net_debug.c b/net/core/rtnl_net_debug.c index 7ecd28cc1c22..f3272b09c255 100644 --- a/net/core/rtnl_net_debug.c +++ b/net/core/rtnl_net_debug.c @@ -102,7 +102,7 @@ static int __init rtnl_net_debug_init(void) { int ret; - ret = register_pernet_device(&rtnl_net_debug_net_ops); + ret = register_pernet_subsys(&rtnl_net_debug_net_ops); if (ret) return ret; -- 2.51.0 From 5a465a0da13ee9fbd7d3cd0b2893309b0fe4b7e3 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Tue, 1 Apr 2025 11:44:42 -0700 Subject: [PATCH 11/16] udp: Fix multiple wraparounds of sk->sk_rmem_alloc. __udp_enqueue_schedule_skb() has the following condition: if (atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf) goto drop; sk->sk_rcvbuf is initialised by net.core.rmem_default and later can be configured by SO_RCVBUF, which is limited by net.core.rmem_max, or SO_RCVBUFFORCE. If we set INT_MAX to sk->sk_rcvbuf, the condition is always false as sk->sk_rmem_alloc is also signed int. Then, the size of the incoming skb is added to sk->sk_rmem_alloc unconditionally. This results in integer overflow (possibly multiple times) on sk->sk_rmem_alloc and allows a single socket to have skb up to net.core.udp_mem[1]. For example, if we set a large value to udp_mem[1] and INT_MAX to sk->sk_rcvbuf and flood packets to the socket, we can see multiple overflows: # cat /proc/net/sockstat | grep UDP: UDP: inuse 3 mem 7956736 <-- (7956736 << 12) bytes > INT_MAX * 15 ^- PAGE_SHIFT # ss -uam State Recv-Q ... UNCONN -1757018048 ... <-- flipping the sign repeatedly skmem:(r2537949248,rb2147483646,t0,tb212992,f1984,w0,o0,bl0,d0) Previously, we had a boundary check for INT_MAX, which was removed by commit 6a1f12dd85a8 ("udp: relax atomic operation on sk->sk_rmem_alloc"). A complete fix would be to revert it and cap the right operand by INT_MAX: rmem = atomic_add_return(size, &sk->sk_rmem_alloc); if (rmem > min(size + (unsigned int)sk->sk_rcvbuf, INT_MAX)) goto uncharge_drop; but we do not want to add the expensive atomic_add_return() back just for the corner case. Casting rmem to unsigned int prevents multiple wraparounds, but we still allow a single wraparound. 
# cat /proc/net/sockstat | grep UDP: UDP: inuse 3 mem 524288 <-- (INT_MAX + 1) >> 12 # ss -uam State Recv-Q ... UNCONN -2147482816 ... <-- INT_MAX + 831 bytes skmem:(r2147484480,rb2147483646,t0,tb212992,f3264,w0,o0,bl0,d14468947) So, let's define rmem and rcvbuf as unsigned int and check skb->truesize only when rcvbuf is large enough to lower the overflow possibility. Note that we still have a small chance to see overflow if multiple skbs to the same socket are processed on different core at the same time and each size does not exceed the limit but the total size does. Note also that we must ignore skb->truesize for a small buffer as explained in commit 363dc73acacb ("udp: be less conservative with sock rmem accounting"). Fixes: 6a1f12dd85a8 ("udp: relax atomic operation on sk->sk_rmem_alloc") Signed-off-by: Kuniyuki Iwashima Reviewed-by: Willem de Bruijn Link: https://patch.msgid.link/20250401184501.67377-2-kuniyu@amazon.com Signed-off-by: Jakub Kicinski --- net/ipv4/udp.c | 26 +++++++++++++++++--------- 1 file changed, 17 insertions(+), 9 deletions(-) diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index d0bffcfa56d8..354779b22faf 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -1725,17 +1725,25 @@ static int udp_rmem_schedule(struct sock *sk, int size) int __udp_enqueue_schedule_skb(struct sock *sk, struct sk_buff *skb) { struct sk_buff_head *list = &sk->sk_receive_queue; - int rmem, err = -ENOMEM; + unsigned int rmem, rcvbuf; spinlock_t *busy = NULL; - int size, rcvbuf; + int size, err = -ENOMEM; - /* Immediately drop when the receive queue is full. - * Always allow at least one packet. - */ rmem = atomic_read(&sk->sk_rmem_alloc); rcvbuf = READ_ONCE(sk->sk_rcvbuf); - if (rmem > rcvbuf) - goto drop; + size = skb->truesize; + + /* Immediately drop when the receive queue is full. + * Cast to unsigned int performs the boundary check for INT_MAX. + */ + if (rmem + size > rcvbuf) { + if (rcvbuf > INT_MAX >> 1) + goto drop; + + /* Always allow at least one packet for small buffer. */ + if (rmem > rcvbuf) + goto drop; + } /* Under mem pressure, it might be helpful to help udp_recvmsg() * having linear skbs : @@ -1745,10 +1753,10 @@ int __udp_enqueue_schedule_skb(struct sock *sk, struct sk_buff *skb) */ if (rmem > (rcvbuf >> 1)) { skb_condense(skb); - + size = skb->truesize; busy = busylock_acquire(sk); } - size = skb->truesize; + udp_set_dev_scratch(skb); atomic_add(size, &sk->sk_rmem_alloc); -- 2.51.0 From df207de9d9e7a4d92f8567e2c539d9c8c12fd99d Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Tue, 1 Apr 2025 11:44:43 -0700 Subject: [PATCH 12/16] udp: Fix memory accounting leak. Matt Dowling reported a weird UDP memory usage issue. Under normal operation, the UDP memory usage reported in /proc/net/sockstat remains close to zero. However, it occasionally spiked to 524,288 pages and never dropped. Moreover, the value doubled when the application was terminated. Finally, it caused intermittent packet drops. We can reproduce the issue with the script below [0]: 1. /proc/net/sockstat reports 0 pages # cat /proc/net/sockstat | grep UDP: UDP: inuse 1 mem 0 2. Run the script till the report reaches 524,288 # python3 test.py & sleep 5 # cat /proc/net/sockstat | grep UDP: UDP: inuse 3 mem 524288 <-- (INT_MAX + 1) >> PAGE_SHIFT 3. Kill the socket and confirm the number never drops # pkill python3 && sleep 5 # cat /proc/net/sockstat | grep UDP: UDP: inuse 1 mem 524288 4. (necessary since v6.0) Trigger proto_memory_pcpu_drain() # python3 test.py & sleep 1 && pkill python3 5. 
The number doubles # cat /proc/net/sockstat | grep UDP: UDP: inuse 1 mem 1048577 The application set INT_MAX to SO_RCVBUF, which triggered an integer overflow in udp_rmem_release(). When a socket is close()d, udp_destruct_common() purges its receive queue and sums up skb->truesize in the queue. This total is calculated and stored in a local unsigned integer variable. The total size is then passed to udp_rmem_release() to adjust memory accounting. However, because the function takes a signed integer argument, the total size can wrap around, causing an overflow. Then, the released amount is calculated as follows: 1) Add size to sk->sk_forward_alloc. 2) Round down sk->sk_forward_alloc to the nearest lower multiple of PAGE_SIZE and assign it to amount. 3) Subtract amount from sk->sk_forward_alloc. 4) Pass amount >> PAGE_SHIFT to __sk_mem_reduce_allocated(). When the issue occurred, the total in udp_destruct_common() was 2147484480 (INT_MAX + 833), which was cast to -2147482816 in udp_rmem_release(). At 1) sk->sk_forward_alloc is changed from 3264 to -2147479552, and 2) sets -2147479552 to amount. 3) reverts the wraparound, so we don't see a warning in inet_sock_destruct(). However, udp_memory_allocated ends up doubling at 4). Since commit 3cd3399dd7a8 ("net: implement per-cpu reserves for memory_allocated"), memory usage no longer doubles immediately after a socket is close()d because __sk_mem_reduce_allocated() caches the amount in udp_memory_per_cpu_fw_alloc. However, the next time a UDP socket receives a packet, the subtraction takes effect, causing UDP memory usage to double. This issue makes further memory allocation fail once the socket's sk->sk_rmem_alloc exceeds net.ipv4.udp_rmem_min, resulting in packet drops. To prevent this issue, let's use unsigned int for the calculation and call sk_forward_alloc_add() only once for the small delta. Note that first_packet_length() also potentially has the same problem. 
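The wraparound itself is plain C integer conversion; a standalone
illustration using the exact values from the report above:

#include <stdio.h>

int main(void)
{
	unsigned int total = 2147484480u;	/* INT_MAX + 833 */
	int size = (int)total;		/* what udp_rmem_release() got */

	printf("unsigned total: %u\n", total);	/* 2147484480 */
	printf("signed size:    %d\n", size);	/* -2147482816 */
	return 0;
}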
[0]: from socket import * SO_RCVBUFFORCE = 33 INT_MAX = (2 ** 31) - 1 s = socket(AF_INET, SOCK_DGRAM) s.bind(('', 0)) s.setsockopt(SOL_SOCKET, SO_RCVBUFFORCE, INT_MAX) c = socket(AF_INET, SOCK_DGRAM) c.connect(s.getsockname()) data = b'a' * 100 while True: c.send(data) Fixes: f970bd9e3a06 ("udp: implement memory accounting helpers") Reported-by: Matt Dowling Signed-off-by: Kuniyuki Iwashima Reviewed-by: Willem de Bruijn Link: https://patch.msgid.link/20250401184501.67377-3-kuniyu@amazon.com Signed-off-by: Jakub Kicinski --- net/ipv4/udp.c | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 354779b22faf..2742cc7602bb 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -1625,12 +1625,12 @@ static bool udp_skb_has_head_state(struct sk_buff *skb) } /* fully reclaim rmem/fwd memory allocated for skb */ -static void udp_rmem_release(struct sock *sk, int size, int partial, - bool rx_queue_lock_held) +static void udp_rmem_release(struct sock *sk, unsigned int size, + int partial, bool rx_queue_lock_held) { struct udp_sock *up = udp_sk(sk); struct sk_buff_head *sk_queue; - int amt; + unsigned int amt; if (likely(partial)) { up->forward_deficit += size; @@ -1650,10 +1650,8 @@ static void udp_rmem_release(struct sock *sk, int size, int partial, if (!rx_queue_lock_held) spin_lock(&sk_queue->lock); - - sk_forward_alloc_add(sk, size); - amt = (sk->sk_forward_alloc - partial) & ~(PAGE_SIZE - 1); - sk_forward_alloc_add(sk, -amt); + amt = (size + sk->sk_forward_alloc - partial) & ~(PAGE_SIZE - 1); + sk_forward_alloc_add(sk, size - amt); if (amt) __sk_mem_reduce_allocated(sk, amt >> PAGE_SHIFT); @@ -1843,7 +1841,7 @@ EXPORT_IPV6_MOD_GPL(skb_consume_udp); static struct sk_buff *__first_packet_length(struct sock *sk, struct sk_buff_head *rcvq, - int *total) + unsigned int *total) { struct sk_buff *skb; @@ -1876,8 +1874,8 @@ static int first_packet_length(struct sock *sk) { struct sk_buff_head *rcvq = &udp_sk(sk)->reader_queue; struct sk_buff_head *sk_queue = &sk->sk_receive_queue; + unsigned int total = 0; struct sk_buff *skb; - int total = 0; int res; spin_lock_bh(&rcvq->lock); -- 2.51.0 From fccd2b711d9628c7ce0111d5e4938652101ee30a Mon Sep 17 00:00:00 2001 From: Stefano Garzarella Date: Fri, 28 Mar 2025 15:15:28 +0100 Subject: [PATCH 13/16] vsock: avoid timeout during connect() if the socket is closing When a peer attempts to establish a connection, vsock_connect() contains a loop that waits for the state to be TCP_ESTABLISHED. However, the other peer can be fast enough to accept the connection and close it immediately, thus moving the state to TCP_CLOSING. When this happens, the peer in the vsock_connect() is properly woken up, but since the state is not TCP_ESTABLISHED, it goes back to sleep until the timeout expires, returning -ETIMEDOUT. If the socket state is TCP_CLOSING, waiting for the timeout is pointless. vsock_connect() can return immediately without errors or delay since the connection actually happened. The socket will be in a closing state, but this is not an issue, and subsequent calls will fail as expected. We discovered this issue while developing a test that accepts and immediately closes connections to stress the transport switch between two connect() calls, where the first one was interrupted by a signal (see Closes link). 
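For reference, a hypothetical userspace sketch of that pattern
(loopback CID, arbitrary port; names and values are illustrative): the
peer accept()s and close()s immediately, and with this fix connect()
returns promptly instead of sleeping until -ETIMEDOUT:

#include <stdio.h>
#include <unistd.h>
#include <sys/socket.h>
#include <linux/vm_sockets.h>

int main(void)
{
	struct sockaddr_vm addr = {
		.svm_family = AF_VSOCK,
		.svm_cid = VMADDR_CID_LOCAL,
		.svm_port = 1234,
	};
	int srv = socket(AF_VSOCK, SOCK_STREAM, 0);
	int cli = socket(AF_VSOCK, SOCK_STREAM, 0);

	bind(srv, (struct sockaddr *)&addr, sizeof(addr));
	listen(srv, 1);

	if (fork() == 0) {	/* peer: accept and close right away */
		close(accept(srv, NULL, NULL));
		_exit(0);
	}

	/* Must not hang: the connection was established, then closed. */
	if (connect(cli, (struct sockaddr *)&addr, sizeof(addr)))
		perror("connect");
	return 0;
}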
Reported-by: Luigi Leonardi Closes: https://lore.kernel.org/virtualization/bq6hxrolno2vmtqwcvb5bljfpb7mvwb3kohrvaed6auz5vxrfv@ijmd2f3grobn/ Fixes: d021c344051a ("VSOCK: Introduce VM Sockets") Signed-off-by: Stefano Garzarella Acked-by: Paolo Abeni Tested-by: Luigi Leonardi Reviewed-by: Luigi Leonardi Link: https://patch.msgid.link/20250328141528.420719-1-sgarzare@redhat.com Signed-off-by: Jakub Kicinski --- net/vmw_vsock/af_vsock.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/net/vmw_vsock/af_vsock.c b/net/vmw_vsock/af_vsock.c index 7e3db87ae433..fc6afbc8d680 100644 --- a/net/vmw_vsock/af_vsock.c +++ b/net/vmw_vsock/af_vsock.c @@ -1551,7 +1551,11 @@ static int vsock_connect(struct socket *sock, struct sockaddr *addr, timeout = vsk->connect_timeout; prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE); - while (sk->sk_state != TCP_ESTABLISHED && sk->sk_err == 0) { + /* If the socket is already closing or it is in an error state, there + * is no point in waiting. + */ + while (sk->sk_state != TCP_ESTABLISHED && + sk->sk_state != TCP_CLOSING && sk->sk_err == 0) { if (flags & O_NONBLOCK) { /* If we're not going to block, we schedule a timeout * function to generate a timeout on the connection -- 2.51.0 From 8930424777e43257f5bf6f0f0f53defd0d30415c Mon Sep 17 00:00:00 2001 From: Guillaume Nault Date: Sat, 29 Mar 2025 01:33:44 +0100 Subject: [PATCH 14/16] tunnels: Accept PACKET_HOST in skb_tunnel_check_pmtu(). Because skb_tunnel_check_pmtu() doesn't handle PACKET_HOST packets, commit 30a92c9e3d6b ("openvswitch: Set the skbuff pkt_type for proper pmtud support.") forced skb->pkt_type to PACKET_OUTGOING for openvswitch packets that are sent using the OVS_ACTION_ATTR_OUTPUT action. This allowed such packets to invoke the iptunnel_pmtud_check_icmp() or iptunnel_pmtud_check_icmpv6() helpers and thus trigger PMTU update on the input device. However, this also broke other parts of PMTU discovery. Since these packets don't have the PACKET_HOST type anymore, they won't trigger the sending of ICMP Fragmentation Needed or Packet Too Big messages to remote hosts when oversized (see the skb_in->pkt_type condition in __icmp_send() for example). These two skb->pkt_type checks are therefore incompatible as one requires skb->pkt_type to be PACKET_HOST, while the other requires it to be anything but PACKET_HOST. It makes sense to not trigger ICMP messages for non-PACKET_HOST packets as these messages should be generated only for incoming l2-unicast packets. However there doesn't seem to be any reason for skb_tunnel_check_pmtu() to ignore PACKET_HOST packets. Allow both cases to work by allowing skb_tunnel_check_pmtu() to work on PACKET_HOST packets and not overriding skb->pkt_type in openvswitch anymore. 
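Condensed into code, the incompatibility described above looks like
this (illustrative predicates, not the exact kernel expressions):

#include <stdbool.h>
#include <linux/if_packet.h>

/* Before this patch: */
static bool tunnel_pmtu_may_reply(int pkt_type) /* skb_tunnel_check_pmtu() */
{
	return pkt_type != PACKET_HOST;
}

static bool icmp_may_reply(int pkt_type)	/* __icmp_send() */
{
	return pkt_type == PACKET_HOST;
}

/* No single pkt_type satisfies both, which is why the OVS
 * PACKET_OUTGOING workaround fixed tunnel PMTU updates but broke ICMP
 * generation. After this patch the first predicate is simply true, so
 * PACKET_HOST packets work in both paths.
 */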
Fixes: 30a92c9e3d6b ("openvswitch: Set the skbuff pkt_type for proper pmtud support.") Fixes: 4cb47a8644cc ("tunnels: PMTU discovery support for directly bridged IP packets") Signed-off-by: Guillaume Nault Reviewed-by: Stefano Brivio Reviewed-by: Aaron Conole Tested-by: Aaron Conole Link: https://patch.msgid.link/eac941652b86fddf8909df9b3bf0d97bc9444793.1743208264.git.gnault@redhat.com Signed-off-by: Jakub Kicinski --- net/ipv4/ip_tunnel_core.c | 2 +- net/openvswitch/actions.c | 6 ------ 2 files changed, 1 insertion(+), 7 deletions(-) diff --git a/net/ipv4/ip_tunnel_core.c b/net/ipv4/ip_tunnel_core.c index a3676155be78..364ea798511e 100644 --- a/net/ipv4/ip_tunnel_core.c +++ b/net/ipv4/ip_tunnel_core.c @@ -416,7 +416,7 @@ int skb_tunnel_check_pmtu(struct sk_buff *skb, struct dst_entry *encap_dst, skb_dst_update_pmtu_no_confirm(skb, mtu); - if (!reply || skb->pkt_type == PACKET_HOST) + if (!reply) return 0; if (skb->protocol == htons(ETH_P_IP)) diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c index 704c858cf209..61fea7baae5d 100644 --- a/net/openvswitch/actions.c +++ b/net/openvswitch/actions.c @@ -947,12 +947,6 @@ static void do_output(struct datapath *dp, struct sk_buff *skb, int out_port, pskb_trim(skb, ovs_mac_header_len(key)); } - /* Need to set the pkt_type to involve the routing layer. The - * packet movement through the OVS datapath doesn't generally - * use routing, but this is needed for tunnel cases. - */ - skb->pkt_type = PACKET_OUTGOING; - if (likely(!mru || (skb->len <= mru + vport->dev->hard_header_len))) { ovs_vport_send(vport, skb, ovs_key_mac_proto(key)); -- 2.51.0 From 3a0a3ff6593d670af2451ec363ccb7b18aec0c0a Mon Sep 17 00:00:00 2001 From: Antoine Tenart Date: Wed, 26 Mar 2025 18:36:32 +0100 Subject: [PATCH 15/16] net: decrease cached dst counters in dst_release Upstream fix ac888d58869b ("net: do not delay dst_entries_add() in dst_release()") moved decrementing the dst count from dst_destroy to dst_release to avoid accessing already freed data in case of netns dismantle. However in case CONFIG_DST_CACHE is enabled and OvS+tunnels are used, this fix is incomplete as the same issue will be seen for cached dsts: Unable to handle kernel paging request at virtual address ffff5aabf6b5c000 Call trace: percpu_counter_add_batch+0x3c/0x160 (P) dst_release+0xec/0x108 dst_cache_destroy+0x68/0xd8 dst_destroy+0x13c/0x168 dst_destroy_rcu+0x1c/0xb0 rcu_do_batch+0x18c/0x7d0 rcu_core+0x174/0x378 rcu_core_si+0x18/0x30 Fix this by invalidating the cache, and thus decrementing cached dst counters, in dst_release too. 
Fixes: d71785ffc7e7 ("net: add dst_cache to ovs vxlan lwtunnel") Signed-off-by: Antoine Tenart Link: https://patch.msgid.link/20250326173634.31096-1-atenart@kernel.org Signed-off-by: Paolo Abeni --- net/core/dst.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/net/core/dst.c b/net/core/dst.c index c99b95cf9cbb..795ca07e28a4 100644 --- a/net/core/dst.c +++ b/net/core/dst.c @@ -165,6 +165,14 @@ static void dst_count_dec(struct dst_entry *dst) void dst_release(struct dst_entry *dst) { if (dst && rcuref_put(&dst->__rcuref)) { +#ifdef CONFIG_DST_CACHE + if (dst->flags & DST_METADATA) { + struct metadata_dst *md_dst = (struct metadata_dst *)dst; + + if (md_dst->type == METADATA_IP_TUNNEL) + dst_cache_reset_now(&md_dst->u.tun_info.dst_cache); + } +#endif dst_count_dec(dst); call_rcu_hurry(&dst->rcu_head, dst_destroy_rcu); } -- 2.51.0 From 15970e1b23f5c25db88c613fddf9131de086f28e Mon Sep 17 00:00:00 2001 From: Joshua Washington Date: Wed, 2 Apr 2025 00:10:37 +0000 Subject: [PATCH 16/16] gve: handle overflow when reporting TX consumed descriptors When the tx tail is less than the head (in cases of wraparound), the TX consumed descriptor statistic in DQ will be reported as UINT32_MAX - head + tail, which is incorrect. Mask the difference of head and tail according to the ring size when reporting the statistic. Cc: stable@vger.kernel.org Fixes: 2c9198356d56 ("gve: Add consumed counts to ethtool stats") Signed-off-by: Joshua Washington Signed-off-by: Harshitha Ramamurthy Reviewed-by: Michal Swiatkowski Reviewed-by: Simon Horman Link: https://patch.msgid.link/20250402001037.2717315-1-hramamurthy@google.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/google/gve/gve_ethtool.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/google/gve/gve_ethtool.c b/drivers/net/ethernet/google/gve/gve_ethtool.c index 31a21ccf4863..4dea1fdce748 100644 --- a/drivers/net/ethernet/google/gve/gve_ethtool.c +++ b/drivers/net/ethernet/google/gve/gve_ethtool.c @@ -392,7 +392,9 @@ gve_get_ethtool_stats(struct net_device *netdev, */ data[i++] = 0; data[i++] = 0; - data[i++] = tx->dqo_tx.tail - tx->dqo_tx.head; + data[i++] = + (tx->dqo_tx.tail - tx->dqo_tx.head) & + tx->mask; } do { start = -- 2.51.0
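For reference, the effect of the mask on a wrapped ring, as a
standalone illustration (1024-entry ring assumed):

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint32_t head = 5, tail = 2;	/* tail wrapped past head */
	uint32_t mask = 1024 - 1;	/* ring size - 1 */

	printf("unmasked: %u\n", tail - head);		/* 4294967293 */
	printf("masked:   %u\n", (tail - head) & mask);	/* 1021 */
	return 0;
}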