From b04c4d9eb4f25b950b33218e33b04c94e7445e51 Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Sun, 29 Sep 2024 02:18:20 -0400 Subject: [PATCH 01/16] vrf: revert "vrf: Remove unnecessary RCU-bh critical section" This reverts commit 504fc6f4f7f681d2a03aa5f68aad549d90eab853. dev_queue_xmit_nit is expected to be called with BH disabled. __dev_queue_xmit has the following: /* Disable soft irqs for various locks below. Also * stops preemption for RCU. */ rcu_read_lock_bh(); VRF must follow this invariant. The referenced commit removed this protection. Which triggered a lockdep warning: ================================ WARNING: inconsistent lock state 6.11.0 #1 Tainted: G W -------------------------------- inconsistent {IN-SOFTIRQ-W} -> {SOFTIRQ-ON-W} usage. btserver/134819 [HC0[0]:SC0[0]:HE1:SE1] takes: ffff8882da30c118 (rlock-AF_PACKET){+.?.}-{2:2}, at: tpacket_rcv+0x863/0x3b30 {IN-SOFTIRQ-W} state was registered at: lock_acquire+0x19a/0x4f0 _raw_spin_lock+0x27/0x40 packet_rcv+0xa33/0x1320 __netif_receive_skb_core.constprop.0+0xcb0/0x3a90 __netif_receive_skb_list_core+0x2c9/0x890 netif_receive_skb_list_internal+0x610/0xcc0 [...] other info that might help us debug this: Possible unsafe locking scenario: CPU0 ---- lock(rlock-AF_PACKET); lock(rlock-AF_PACKET); *** DEADLOCK *** Call Trace: dump_stack_lvl+0x73/0xa0 mark_lock+0x102e/0x16b0 __lock_acquire+0x9ae/0x6170 lock_acquire+0x19a/0x4f0 _raw_spin_lock+0x27/0x40 tpacket_rcv+0x863/0x3b30 dev_queue_xmit_nit+0x709/0xa40 vrf_finish_direct+0x26e/0x340 [vrf] vrf_l3_out+0x5f4/0xe80 [vrf] __ip_local_out+0x51e/0x7a0 [...] Fixes: 504fc6f4f7f6 ("vrf: Remove unnecessary RCU-bh critical section") Link: https://lore.kernel.org/netdev/20240925185216.1990381-1-greearb@candelatech.com/ Reported-by: Ben Greear Signed-off-by: Willem de Bruijn Cc: stable@vger.kernel.org Reviewed-by: Ido Schimmel Tested-by: Ido Schimmel Reviewed-by: David Ahern Link: https://patch.msgid.link/20240929061839.1175300-1-willemdebruijn.kernel@gmail.com Signed-off-by: Jakub Kicinski --- drivers/net/vrf.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/net/vrf.c b/drivers/net/vrf.c index 4d8ccaf9a2b4..4087f72f0d2b 100644 --- a/drivers/net/vrf.c +++ b/drivers/net/vrf.c @@ -608,7 +608,9 @@ static void vrf_finish_direct(struct sk_buff *skb) eth_zero_addr(eth->h_dest); eth->h_proto = skb->protocol; + rcu_read_lock_bh(); dev_queue_xmit_nit(skb, vrf_dev); + rcu_read_unlock_bh(); skb_pull(skb, ETH_HLEN); } -- 2.51.0 From 555f45d24ba7cd5527716553031641cdebbe76c7 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Sun, 29 Sep 2024 15:36:40 +0300 Subject: [PATCH 02/16] bridge: mcast: Fail MDB get request on empty entry When user space deletes a port from an MDB entry, the port is removed synchronously. If this was the last port in the entry and the entry is not joined by the host itself, then the entry is scheduled for deletion via a timer. The above means that it is possible for the MDB get netlink request to retrieve an empty entry which is scheduled for deletion. This is problematic as after deleting the last port in an entry, user space cannot rely on a non-zero return code from the MDB get request as an indication that the port was successfully removed. Fix by returning an error when the entry's port list is empty and the entry is not joined by the host. 
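For illustration, the new condition can be read as an "entry is
effectively gone" predicate. A minimal sketch of that reading (the
helper and its name are hypothetical; only the two fields come from
the bridge code touched by the diff below):

	/* An entry with no remaining ports that is also not joined by
	 * the host is only waiting for its deletion timer; the MDB get
	 * path must report it as nonexistent (-ENOENT).
	 */
	static bool br_mdb_entry_effectively_gone(const struct net_bridge_mdb_entry *mp)
	{
		return !mp->ports && !mp->host_joined;
	}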
Fixes: 68b380a395a7 ("bridge: mcast: Add MDB get support") Reported-by: Jamie Bainbridge Closes: https://lore.kernel.org/netdev/c92569919307749f879b9482b0f3e125b7d9d2e3.1726480066.git.jamie.bainbridge@gmail.com/ Tested-by: Jamie Bainbridge Signed-off-by: Ido Schimmel Acked-by: Nikolay Aleksandrov Link: https://patch.msgid.link/20240929123640.558525-1-idosch@nvidia.com Signed-off-by: Jakub Kicinski --- net/bridge/br_mdb.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/bridge/br_mdb.c b/net/bridge/br_mdb.c index bc37e47ad829..1a52a0bca086 100644 --- a/net/bridge/br_mdb.c +++ b/net/bridge/br_mdb.c @@ -1674,7 +1674,7 @@ int br_mdb_get(struct net_device *dev, struct nlattr *tb[], u32 portid, u32 seq, spin_lock_bh(&br->multicast_lock); mp = br_mdb_ip_get(br, &group); - if (!mp) { + if (!mp || (!mp->ports && !mp->host_joined)) { NL_SET_ERR_MSG_MOD(extack, "MDB entry not found"); err = -ENOENT; goto unlock; -- 2.51.0 From a1e40ac5b5e9077fe1f7ae0eb88034db0f9ae1ab Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Tue, 1 Oct 2024 13:17:46 -0400 Subject: [PATCH 03/16] gso: fix udp gso fraglist segmentation after pull from frag_list Detect gso fraglist skbs with corrupted geometry (see below) and pass these to skb_segment instead of skb_segment_list, as the first can segment them correctly. Valid SKB_GSO_FRAGLIST skbs - consist of two or more segments - the head_skb holds the protocol headers plus first gso_size - one or more frag_list skbs hold exactly one segment - all but the last must be gso_size Optional datapath hooks such as NAT and BPF (bpf_skb_pull_data) can modify these skbs, breaking these invariants. In extreme cases they pull all data into skb linear. For UDP, this causes a NULL ptr deref in __udpv4_gso_segment_list_csum at udp_hdr(seg->next)->dest. Detect invalid geometry due to pull, by checking head_skb size. Don't just drop, as this may blackhole a destination. Convert to be able to pass to regular skb_segment. Link: https://lore.kernel.org/netdev/20240428142913.18666-1-shiming.cheng@mediatek.com/ Fixes: 9fd1ff5d2ac7 ("udp: Support UDP fraglist GRO/GSO.") Signed-off-by: Willem de Bruijn Cc: stable@vger.kernel.org Link: https://patch.msgid.link/20241001171752.107580-1-willemdebruijn.kernel@gmail.com Signed-off-by: Jakub Kicinski --- net/ipv4/udp_offload.c | 22 ++++++++++++++++++++-- 1 file changed, 20 insertions(+), 2 deletions(-) diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c index d842303587af..a5be6e4ed326 100644 --- a/net/ipv4/udp_offload.c +++ b/net/ipv4/udp_offload.c @@ -296,8 +296,26 @@ struct sk_buff *__udp_gso_segment(struct sk_buff *gso_skb, return NULL; } - if (skb_shinfo(gso_skb)->gso_type & SKB_GSO_FRAGLIST) - return __udp_gso_segment_list(gso_skb, features, is_ipv6); + if (skb_shinfo(gso_skb)->gso_type & SKB_GSO_FRAGLIST) { + /* Detect modified geometry and pass those to skb_segment. */ + if (skb_pagelen(gso_skb) - sizeof(*uh) == skb_shinfo(gso_skb)->gso_size) + return __udp_gso_segment_list(gso_skb, features, is_ipv6); + + /* Setup csum, as fraglist skips this in udp4_gro_receive. 
*/ + gso_skb->csum_start = skb_transport_header(gso_skb) - gso_skb->head; + gso_skb->csum_offset = offsetof(struct udphdr, check); + gso_skb->ip_summed = CHECKSUM_PARTIAL; + + uh = udp_hdr(gso_skb); + if (is_ipv6) + uh->check = ~udp_v6_check(gso_skb->len, + &ipv6_hdr(gso_skb)->saddr, + &ipv6_hdr(gso_skb)->daddr, 0); + else + uh->check = ~udp_v4_check(gso_skb->len, + ip_hdr(gso_skb)->saddr, + ip_hdr(gso_skb)->daddr, 0); + } skb_pull(gso_skb, sizeof(*uh)); -- 2.51.0 From fa7dfeae041c91e425db9fbb95fb3f57b821c386 Mon Sep 17 00:00:00 2001 From: FUJITA Tomonori Date: Thu, 26 Sep 2024 12:14:03 +0000 Subject: [PATCH 04/16] net: phy: qt2025: Fix warning: unused import DeviceId Fix the following warning when the driver is compiled as built-in: warning: unused import: `DeviceId` --> drivers/net/phy/qt2025.rs:18:5 | 18 | DeviceId, Driver, | ^^^^^^^^ | = note: `#[warn(unused_imports)]` on by default device_table in module_phy_driver macro is defined only when the driver is built as a module. Use phy::DeviceId in the macro instead of importing `DeviceId` since `phy` is always used. Fixes: fd3eaad826da ("net: phy: add Applied Micro QT2025 PHY driver") Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202409190717.i135rfVo-lkp@intel.com/ Reviewed-by: Alice Ryhl Reviewed-by: Trevor Gross Signed-off-by: FUJITA Tomonori Reviewed-by: Fiona Behrens Acked-by: Miguel Ojeda Link: https://patch.msgid.link/20240926121404.242092-1-fujita.tomonori@gmail.com Signed-off-by: Jakub Kicinski --- drivers/net/phy/qt2025.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/phy/qt2025.rs b/drivers/net/phy/qt2025.rs index 28d8981f410b..1ab065798175 100644 --- a/drivers/net/phy/qt2025.rs +++ b/drivers/net/phy/qt2025.rs @@ -15,7 +15,7 @@ use kernel::firmware::Firmware; use kernel::net::phy::{ self, reg::{Mmd, C45}, - DeviceId, Driver, + Driver, }; use kernel::prelude::*; use kernel::sizes::{SZ_16K, SZ_8K}; @@ -23,7 +23,7 @@ use kernel::sizes::{SZ_16K, SZ_8K}; kernel::module_phy_driver! { drivers: [PhyQT2025], device_table: [ - DeviceId::new_with_driver::(), + phy::DeviceId::new_with_driver::(), ], name: "qt2025_phy", author: "FUJITA Tomonori ", -- 2.51.0 From a0ffa68c70b367358b2672cdab6fa5bc4c40de2c Mon Sep 17 00:00:00 2001 From: Eddie James Date: Wed, 25 Sep 2024 10:55:23 -0500 Subject: [PATCH 05/16] net/ncsi: Disable the ncsi work before freeing the associated structure The work function can run after the ncsi device is freed, resulting in use-after-free bugs or kernel panic. Fixes: 2d283bdd079c ("net/ncsi: Resource management") Signed-off-by: Eddie James Link: https://patch.msgid.link/20240925155523.1017097-1-eajames@linux.ibm.com Signed-off-by: Paolo Abeni --- net/ncsi/ncsi-manage.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/net/ncsi/ncsi-manage.c b/net/ncsi/ncsi-manage.c index 5ecf611c8820..5cf55bde366d 100644 --- a/net/ncsi/ncsi-manage.c +++ b/net/ncsi/ncsi-manage.c @@ -1954,6 +1954,8 @@ void ncsi_unregister_dev(struct ncsi_dev *nd) list_del_rcu(&ndp->node); spin_unlock_irqrestore(&ncsi_dev_lock, flags); + disable_work_sync(&ndp->work); + kfree(ndp); } EXPORT_SYMBOL_GPL(ncsi_unregister_dev); -- 2.51.0 From f7a4874d977bf4202ad575031222e78809a36292 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Wed, 2 Oct 2024 08:00:40 -0700 Subject: [PATCH 06/16] iomap: don't bother unsharing delalloc extents If unshare encounters a delalloc reservation in the srcmap, that means that the file range isn't shared because delalloc reservations cannot be reflinked. 
Therefore, don't try to unshare them. Signed-off-by: Darrick J. Wong Link: https://lore.kernel.org/r/20241002150040.GB21853@frogsfrogsfrogs Reviewed-by: Christoph Hellwig Reviewed-by: Brian Foster Signed-off-by: Christian Brauner --- fs/iomap/buffered-io.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c index 11ea747228ae..c1c559e0cc07 100644 --- a/fs/iomap/buffered-io.c +++ b/fs/iomap/buffered-io.c @@ -1321,7 +1321,7 @@ static loff_t iomap_unshare_iter(struct iomap_iter *iter) return length; /* - * Don't bother with holes or unwritten extents. + * Don't bother with delalloc reservations, holes or unwritten extents. * * Note that we use srcmap directly instead of iomap_iter_srcmap as * unsharing requires providing a separate source map, and the presence @@ -1330,6 +1330,7 @@ static loff_t iomap_unshare_iter(struct iomap_iter *iter) * fork for XFS. */ if (iter->srcmap.type == IOMAP_HOLE || + iter->srcmap.type == IOMAP_DELALLOC || iter->srcmap.type == IOMAP_UNWRITTEN) return length; -- 2.51.0 From a311a08a4237241fb5b9d219d3e33346de6e83e0 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Wed, 2 Oct 2024 08:02:13 -0700 Subject: [PATCH 07/16] iomap: constrain the file range passed to iomap_file_unshare File contents can only be shared (i.e. reflinked) below EOF, so it makes no sense to try to unshare ranges beyond EOF. Constrain the file range parameters here so that we don't have to do that in the callers. Fixes: 5f4e5752a8a3 ("fs: add iomap_file_dirty") Signed-off-by: Darrick J. Wong Link: https://lore.kernel.org/r/20241002150213.GC21853@frogsfrogsfrogs Reviewed-by: Christoph Hellwig Reviewed-by: Brian Foster Signed-off-by: Christian Brauner --- fs/dax.c | 6 +++++- fs/iomap/buffered-io.c | 6 +++++- 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/fs/dax.c b/fs/dax.c index becb4a6920c6..c62acd2812f8 100644 --- a/fs/dax.c +++ b/fs/dax.c @@ -1305,11 +1305,15 @@ int dax_file_unshare(struct inode *inode, loff_t pos, loff_t len, struct iomap_iter iter = { .inode = inode, .pos = pos, - .len = len, .flags = IOMAP_WRITE | IOMAP_UNSHARE | IOMAP_DAX, }; + loff_t size = i_size_read(inode); int ret; + if (pos < 0 || pos >= size) + return 0; + + iter.len = min(len, size - pos); while ((ret = iomap_iter(&iter, ops)) > 0) iter.processed = dax_unshare_iter(&iter); return ret; diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c index c1c559e0cc07..78ebd265f425 100644 --- a/fs/iomap/buffered-io.c +++ b/fs/iomap/buffered-io.c @@ -1375,11 +1375,15 @@ iomap_file_unshare(struct inode *inode, loff_t pos, loff_t len, struct iomap_iter iter = { .inode = inode, .pos = pos, - .len = len, .flags = IOMAP_WRITE | IOMAP_UNSHARE, }; + loff_t size = i_size_read(inode); int ret; + if (pos < 0 || pos >= size) + return 0; + + iter.len = min(len, size - pos); while ((ret = iomap_iter(&iter, ops)) > 0) iter.processed = iomap_unshare_iter(&iter); return ret; -- 2.51.0 From b63ad06ddddfe792f93df0c24adb66622bd7b8c9 Mon Sep 17 00:00:00 2001 From: Sean Anderson Date: Mon, 30 Sep 2024 11:39:54 -0400 Subject: [PATCH 08/16] doc: net: napi: Update documentation for napi_schedule_irqoff Since commit 8380c81d5c4f ("net: Treat __napi_schedule_irqoff() as __napi_schedule() on PREEMPT_RT"), napi_schedule_irqoff will do the right thing if IRQs are threaded. Therefore, there is no need to use IRQF_NO_THREAD. 
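For illustration, the pattern the updated text describes is the usual
hard-IRQ handler below. This is a minimal sketch: the foo_* names are
invented, only the NAPI and IRQ APIs are real:

	/* Hard-IRQ handler: interrupts are already off here, so the
	 * cheaper variant can be used. With threaded IRQs (e.g. on
	 * PREEMPT_RT), napi_schedule_irqoff() behaves like
	 * napi_schedule(), so IRQF_NO_THREAD is no longer needed.
	 */
	static irqreturn_t foo_rx_interrupt(int irq, void *dev_id)
	{
		struct foo_priv *priv = dev_id;

		foo_mask_rx_irq(priv);	/* re-enabled from NAPI poll */
		napi_schedule_irqoff(&priv->napi);

		return IRQ_HANDLED;
	}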
Signed-off-by: Sean Anderson Reviewed-by: Bagas Sanjaya Reviewed-by: Sebastian Andrzej Siewior Link: https://patch.msgid.link/20240930153955.971657-1-sean.anderson@linux.dev Signed-off-by: Paolo Abeni --- Documentation/networking/napi.rst | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/Documentation/networking/napi.rst b/Documentation/networking/napi.rst index 7bf7b95c4f7a..dfa5d549be9c 100644 --- a/Documentation/networking/napi.rst +++ b/Documentation/networking/napi.rst @@ -144,9 +144,8 @@ IRQ should only be unmasked after a successful call to napi_complete_done(): napi_schedule_irqoff() is a variant of napi_schedule() which takes advantage of guarantees given by being invoked in IRQ context (no need to -mask interrupts). Note that PREEMPT_RT forces all interrupts -to be threaded so the interrupt may need to be marked ``IRQF_NO_THREAD`` -to avoid issues on real-time kernel configurations. +mask interrupts). napi_schedule_irqoff() will fall back to napi_schedule() if +IRQs are threaded (such as if ``PREEMPT_RT`` is enabled). Instance to queue mapping ------------------------- -- 2.51.0 From c6929644c1e0d6108e57061d427eb966e1746351 Mon Sep 17 00:00:00 2001 From: Ravikanth Tuniki Date: Tue, 1 Oct 2024 00:43:35 +0530 Subject: [PATCH 09/16] dt-bindings: net: xlnx,axi-ethernet: Add missing reg minItems Add missing reg minItems as based on current binding document only ethernet MAC IO space is a supported configuration. There is a bug in schema, current examples contain 64-bit addressing as well as 32-bit addressing. The schema validation does pass incidentally considering one 64-bit reg address as two 32-bit reg address entries. If we change axi_ethernet_eth1 example node reg addressing to 32-bit schema validation reports: Documentation/devicetree/bindings/net/xlnx,axi-ethernet.example.dtb: ethernet@40000000: reg: [[1073741824, 262144]] is too short To fix it add missing reg minItems constraints and to make things clearer stick to 32-bit addressing in examples. Fixes: cbb1ca6d5f9a ("dt-bindings: net: xlnx,axi-ethernet: convert bindings document to yaml") Signed-off-by: Ravikanth Tuniki Signed-off-by: Radhey Shyam Pandey Acked-by: Conor Dooley Link: https://patch.msgid.link/1727723615-2109795-1-git-send-email-radhey.shyam.pandey@amd.com Signed-off-by: Paolo Abeni --- Documentation/devicetree/bindings/net/xlnx,axi-ethernet.yaml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/Documentation/devicetree/bindings/net/xlnx,axi-ethernet.yaml b/Documentation/devicetree/bindings/net/xlnx,axi-ethernet.yaml index bbe89ea9590c..e95c21628281 100644 --- a/Documentation/devicetree/bindings/net/xlnx,axi-ethernet.yaml +++ b/Documentation/devicetree/bindings/net/xlnx,axi-ethernet.yaml @@ -34,6 +34,7 @@ properties: and length of the AXI DMA controller IO space, unless axistream-connected is specified, in which case the reg attribute of the node referenced by it is used. 
+ minItems: 1 maxItems: 2 interrupts: @@ -181,7 +182,7 @@ examples: clock-names = "s_axi_lite_clk", "axis_clk", "ref_clk", "mgt_clk"; clocks = <&axi_clk>, <&axi_clk>, <&pl_enet_ref_clk>, <&mgt_clk>; phy-mode = "mii"; - reg = <0x00 0x40000000 0x00 0x40000>; + reg = <0x40000000 0x40000>; xlnx,rxcsum = <0x2>; xlnx,rxmem = <0x800>; xlnx,txcsum = <0x2>; -- 2.51.0 From 8beee4d8dee76b67c75dc91fd8185d91e845c160 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Mon, 30 Sep 2024 16:49:51 -0400 Subject: [PATCH 10/16] sctp: set sk_state back to CLOSED if autobind fails in sctp_listen_start In sctp_listen_start() invoked by sctp_inet_listen(), it should set the sk_state back to CLOSED if sctp_autobind() fails due to whatever reason. Otherwise, next time when calling sctp_inet_listen(), if sctp_sk(sk)->reuse is already set via setsockopt(SCTP_REUSE_PORT), sctp_sk(sk)->bind_hash will be dereferenced as sk_state is LISTENING, which causes a crash as bind_hash is NULL. KASAN: null-ptr-deref in range [0x0000000000000000-0x0000000000000007] RIP: 0010:sctp_inet_listen+0x7f0/0xa20 net/sctp/socket.c:8617 Call Trace: __sys_listen_socket net/socket.c:1883 [inline] __sys_listen+0x1b7/0x230 net/socket.c:1894 __do_sys_listen net/socket.c:1902 [inline] Fixes: 5e8f3f703ae4 ("sctp: simplify sctp listening code") Reported-by: syzbot+f4e0f821e3a3b7cee51d@syzkaller.appspotmail.com Signed-off-by: Xin Long Acked-by: Marcelo Ricardo Leitner Link: https://patch.msgid.link/a93e655b3c153dc8945d7a812e6d8ab0d52b7aa0.1727729391.git.lucien.xin@gmail.com Signed-off-by: Paolo Abeni --- net/sctp/socket.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/net/sctp/socket.c b/net/sctp/socket.c index 32f76f1298da..078bcb3858c7 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -8557,8 +8557,10 @@ static int sctp_listen_start(struct sock *sk, int backlog) */ inet_sk_set_state(sk, SCTP_SS_LISTENING); if (!ep->base.bind_addr.port) { - if (sctp_autobind(sk)) + if (sctp_autobind(sk)) { + inet_sk_set_state(sk, SCTP_SS_CLOSED); return -EAGAIN; + } } else { if (sctp_get_port(sk, inet_sk(sk)->inet_num)) { inet_sk_set_state(sk, SCTP_SS_CLOSED); -- 2.51.0 From 3840cbe24cf060ea05a585ca497814609f5d47d1 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Thu, 3 Oct 2024 07:29:05 -0400 Subject: [PATCH 11/16] sched: psi: fix bogus pressure spikes from aggregation race Brandon reports sporadic, non-sensical spikes in cumulative pressure time (total=) when reading cpu.pressure at a high rate. This is due to a race condition between reader aggregation and tasks changing states. While it affects all states and all resources captured by PSI, in practice it most likely triggers with CPU pressure, since scheduling events are so frequent compared to other resource events. The race context is the live snooping of ongoing stalls during a pressure read. The read aggregates per-cpu records for stalls that have concluded, but will also incorporate ad-hoc the duration of any active state that hasn't been recorded yet. This is important to get timely measurements of ongoing stalls. Those ad-hoc samples are calculated on-the-fly up to the current time on that CPU; since the stall hasn't concluded, it's expected that this is the minimum amount of stall time that will enter the per-cpu records once it does. The problem is that the path that concludes the state uses a CPU clock read that is not synchronized against aggregators; the clock is read outside of the seqlock protection. 
This allows aggregators to race and snoop a stall with a longer duration than will actually be recorded. With the recorded stall time being less than the last snapshot remembered by the aggregator, a subsequent sample will underflow and observe a bogus delta value, resulting in an erratic jump in pressure. Fix this by moving the clock read of the state change into the seqlock protection. This ensures no aggregation can snoop live stalls past the time that's recorded when the state concludes. Reported-by: Brandon Duffany Link: https://bugzilla.kernel.org/show_bug.cgi?id=219194 Link: https://lore.kernel.org/lkml/20240827121851.GB438928@cmpxchg.org/ Fixes: df77430639c9 ("psi: Reduce calls to sched_clock() in psi") Cc: stable@vger.kernel.org Signed-off-by: Johannes Weiner Reviewed-by: Chengming Zhou Signed-off-by: Linus Torvalds --- kernel/sched/psi.c | 26 ++++++++++++-------------- 1 file changed, 12 insertions(+), 14 deletions(-) diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c index 020d58967d4e..84dad1511d1e 100644 --- a/kernel/sched/psi.c +++ b/kernel/sched/psi.c @@ -769,12 +769,13 @@ static void record_times(struct psi_group_cpu *groupc, u64 now) } static void psi_group_change(struct psi_group *group, int cpu, - unsigned int clear, unsigned int set, u64 now, + unsigned int clear, unsigned int set, bool wake_clock) { struct psi_group_cpu *groupc; unsigned int t, m; u32 state_mask; + u64 now; lockdep_assert_rq_held(cpu_rq(cpu)); groupc = per_cpu_ptr(group->pcpu, cpu); @@ -789,6 +790,7 @@ static void psi_group_change(struct psi_group *group, int cpu, * SOME and FULL time these may have resulted in. */ write_seqcount_begin(&groupc->seq); + now = cpu_clock(cpu); /* * Start with TSK_ONCPU, which doesn't have a corresponding @@ -899,18 +901,15 @@ void psi_task_change(struct task_struct *task, int clear, int set) { int cpu = task_cpu(task); struct psi_group *group; - u64 now; if (!task->pid) return; psi_flags_change(task, clear, set); - now = cpu_clock(cpu); - group = task_psi_group(task); do { - psi_group_change(group, cpu, clear, set, now, true); + psi_group_change(group, cpu, clear, set, true); } while ((group = group->parent)); } @@ -919,7 +918,6 @@ void psi_task_switch(struct task_struct *prev, struct task_struct *next, { struct psi_group *group, *common = NULL; int cpu = task_cpu(prev); - u64 now = cpu_clock(cpu); if (next->pid) { psi_flags_change(next, 0, TSK_ONCPU); @@ -936,7 +934,7 @@ void psi_task_switch(struct task_struct *prev, struct task_struct *next, break; } - psi_group_change(group, cpu, 0, TSK_ONCPU, now, true); + psi_group_change(group, cpu, 0, TSK_ONCPU, true); } while ((group = group->parent)); } @@ -974,7 +972,7 @@ void psi_task_switch(struct task_struct *prev, struct task_struct *next, do { if (group == common) break; - psi_group_change(group, cpu, clear, set, now, wake_clock); + psi_group_change(group, cpu, clear, set, wake_clock); } while ((group = group->parent)); /* @@ -986,7 +984,7 @@ void psi_task_switch(struct task_struct *prev, struct task_struct *next, if ((prev->psi_flags ^ next->psi_flags) & ~TSK_ONCPU) { clear &= ~TSK_ONCPU; for (; group; group = group->parent) - psi_group_change(group, cpu, clear, set, now, wake_clock); + psi_group_change(group, cpu, clear, set, wake_clock); } } } @@ -997,8 +995,8 @@ void psi_account_irqtime(struct rq *rq, struct task_struct *curr, struct task_st int cpu = task_cpu(curr); struct psi_group *group; struct psi_group_cpu *groupc; - u64 now, irq; s64 delta; + u64 irq; if (static_branch_likely(&psi_disabled)) return; @@ -1011,7 
+1009,6 @@ void psi_account_irqtime(struct rq *rq, struct task_struct *curr, struct task_st if (prev && task_psi_group(prev) == group) return; - now = cpu_clock(cpu); irq = irq_time_read(cpu); delta = (s64)(irq - rq->psi_irq_time); if (delta < 0) @@ -1019,12 +1016,15 @@ void psi_account_irqtime(struct rq *rq, struct task_struct *curr, struct task_st rq->psi_irq_time = irq; do { + u64 now; + if (!group->enabled) continue; groupc = per_cpu_ptr(group->pcpu, cpu); write_seqcount_begin(&groupc->seq); + now = cpu_clock(cpu); record_times(groupc, now); groupc->times[PSI_IRQ_FULL] += delta; @@ -1223,11 +1223,9 @@ void psi_cgroup_restart(struct psi_group *group) for_each_possible_cpu(cpu) { struct rq *rq = cpu_rq(cpu); struct rq_flags rf; - u64 now; rq_lock_irq(rq, &rf); - now = cpu_clock(cpu); - psi_group_change(group, cpu, 0, 0, now, true); + psi_group_change(group, cpu, 0, 0, true); rq_unlock_irq(rq, &rf); } } -- 2.51.0 From d002b922c4d5d695d617ec262f3e07cd62ee866e Mon Sep 17 00:00:00 2001 From: Ihor Solodrai Date: Mon, 16 Sep 2024 19:59:22 +0000 Subject: [PATCH 12/16] selftests/bpf: Remove test_skb_cgroup_id.sh from TEST_PROGS MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit test_skb_cgroup_id.sh was deleted in https://git.kernel.org/bpf/bpf-next/c/f957c230e173 It has to be removed from TEST_PROGS variable in tools/testing/selftests/bpf/Makefile, otherwise install target fails. Signed-off-by: Ihor Solodrai Signed-off-by: Andrii Nakryiko Tested-by: Björn Töpel Link: https://lore.kernel.org/bpf/20240916195919.1872371-1-ihor.solodrai@pm.me Link: https://lore.kernel.org/bpf/Q3BN2kW9Kgy6LkrDOwnyY4Pv7_YF8fInLCd2_QA3LimKYM3wD64kRdnwp7blwG2dI_s7UGnfUae-4_dOmuTrxpYCi32G_KTzB3PfmxIerH8=@pm.me/ Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/Makefile | 1 - 1 file changed, 1 deletion(-) diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile index f04af11df8eb..df75f1beb731 100644 --- a/tools/testing/selftests/bpf/Makefile +++ b/tools/testing/selftests/bpf/Makefile @@ -132,7 +132,6 @@ TEST_PROGS := test_kmod.sh \ test_tunnel.sh \ test_lwt_seg6local.sh \ test_lirc_mode2.sh \ - test_skb_cgroup_id.sh \ test_flow_dissector.sh \ test_xdp_vlan_mode_generic.sh \ test_xdp_vlan_mode_native.sh \ -- 2.51.0 From fd4a0e67838c1e0fc4927fae113d785aa893997d Mon Sep 17 00:00:00 2001 From: Ihor Solodrai Date: Mon, 16 Sep 2024 19:59:27 +0000 Subject: [PATCH 13/16] selftests/bpf: Set vpath in Makefile to search for skels MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit Auto-dependencies generated for %.test.o files refer to skels using filenames as opposed to full paths. This requires make to be able to link this name to an actual path, because not all generated skels are put in the working directory. In the original patch [1], this was mitigated by this target: $(notdir %.skel.h): $(TRUNNER_OUTPUT)/%.skel.h @true This turned out to be insufficient. First, %.lskel.h and %.subskel.h were missed, because a typical selftests/bpf build could find these files in the working directory. This error was detected by an out-of-tree build [2]. Second, even with missing rules added, this target causes unnecessary rebuilds in the out-of-tree case, as X.skel.h is searched for in the working directory, and not in the $(OUTPUT). Using vpath directive [3] is a better solution. 
Instead of introducing a separate target (X.skel.h in addition to
$(TRUNNER_OUTPUT)/X.skel.h), make is instructed to search for skels in
the output, which allows make to correctly detect that a skel has
already been generated.

[1]: https://lore.kernel.org/bpf/VJihUTnvtwEgv_mOnpfy7EgD9D2MPNoHO-MlANeLIzLJPGhDeyOuGKIYyKgk0O6KPjfM-MuhtvPwZcngN8WFqbTnTRyCSMc2aMZ1ODm1T_g=@pm.me/
[2]: https://lore.kernel.org/bpf/CIjrhJwoIqMc2IhuppVqh4ZtJGbx8kC8rc9PHhAIU6RccnWT4I04F_EIr4GxQwxZe89McuGJlCnUk9UbkdvWtSJjAsd7mHmnTy9F8K2TLZM=@pm.me/
[3]: https://www.gnu.org/software/make/manual/html_node/Selective-Search.html

Reported-by: Björn Töpel
Signed-off-by: Ihor Solodrai
Signed-off-by: Andrii Nakryiko
Tested-by: Björn Töpel
Link: https://lore.kernel.org/bpf/20240916195919.1872371-2-ihor.solodrai@pm.me
Signed-off-by: Alexei Starovoitov
---
 tools/testing/selftests/bpf/Makefile | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile
index df75f1beb731..365740f24d2e 100644
--- a/tools/testing/selftests/bpf/Makefile
+++ b/tools/testing/selftests/bpf/Makefile
@@ -622,10 +622,11 @@ $(TRUNNER_BPF_SKELS_LINKED): $(TRUNNER_OUTPUT)/%: $$$$(%-deps) $(BPFTOOL) | $(TR

 # When the compiler generates a %.d file, only skel basenames (not
 # full paths) are specified as prerequisites for corresponding %.o
-# file. This target makes %.skel.h basename dependent on full paths,
-# linking generated %.d dependency with actual %.skel.h files.
-$(notdir %.skel.h): $(TRUNNER_OUTPUT)/%.skel.h
-	@true
+# file. vpath directives below instruct make to search for skel files
+# in TRUNNER_OUTPUT, if they are not present in the working directory.
+vpath %.skel.h $(TRUNNER_OUTPUT)
+vpath %.lskel.h $(TRUNNER_OUTPUT)
+vpath %.subskel.h $(TRUNNER_OUTPUT)

 endif
-- 2.51.0

From 4b7c05598a644782b8451e415bb56f31e5c9d3ee Mon Sep 17 00:00:00 2001
From: Jiri Olsa
Date: Tue, 24 Sep 2024 13:07:30 +0200
Subject: [PATCH 14/16] selftests/bpf: Fix uprobe consumer test

With the newly merged code, the uprobe behaviour is slightly different,
which affects the uprobe consumer test.

We no longer need to check whether the uprobe object is still preserved
after removing the last uretprobe, because it stays around as long as
there is a pending/installed uretprobe instance.

This allows uretprobe consumers registered 'after' the uprobe was hit
to run even if a previous uretprobe was unregistered before being hit.

The uprobe object is now removed only after the last uprobe ref is
released; in that case the ref is held by ri->uprobe (the return
instance), which is released after the uretprobe is hit.
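For illustration, a worked example of the new expectation, following
the test's bitmask convention (bits 0/1 are entry consumers, bits 2/3
return consumers; the concrete values below are made up):

	/* before = 0b0100: uretprobe 2 was registered when the function
	 *                  was entered, so a return instance was planted.
	 * after  = 0b1000: by return time, uretprobe 2 is gone and
	 *                  uretprobe 3 is registered.
	 *
	 * Old check: probe_preserved = before & after = 0, so consumer 3
	 * was not expected to fire (uprobe object assumed gone).
	 *
	 * New check: had_uretprobes = before & 0b1100 is non-zero and
	 * test_bit(3, after) is true, so consumer 3 is expected to fire:
	 * the pending return instance (ri->uprobe) keeps the uprobe
	 * object alive.
	 */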
Reported-by: Ihor Solodrai
Signed-off-by: Jiri Olsa
Signed-off-by: Daniel Borkmann
Tested-by: Ihor Solodrai
Closes: https://lore.kernel.org/bpf/w6U8Z9fdhjnkSp2UaFaV1fGqJXvfLEtDKEUyGDkwmoruDJ_AgF_c0FFhrkeKW18OqiP-05s9yDKiT6X-Ns-avN_ABf0dcUkXqbSJN1TQSXo=@pm.me/
Signed-off-by: Alexei Starovoitov
---
 .../testing/selftests/bpf/prog_tests/uprobe_multi_test.c | 9 +--------
 1 file changed, 1 insertion(+), 8 deletions(-)

diff --git a/tools/testing/selftests/bpf/prog_tests/uprobe_multi_test.c b/tools/testing/selftests/bpf/prog_tests/uprobe_multi_test.c
index 844f6fc8487b..c1ac813ff9ba 100644
--- a/tools/testing/selftests/bpf/prog_tests/uprobe_multi_test.c
+++ b/tools/testing/selftests/bpf/prog_tests/uprobe_multi_test.c
@@ -869,21 +869,14 @@ static void consumer_test(struct uprobe_multi_consumers *skel,
 		fmt = "prog 0/1: uprobe";
 	} else {
 		/*
-		 * uprobe return is tricky ;-)
-		 *
 		 * to trigger uretprobe consumer, the uretprobe needs to be installed,
 		 * which means one of the 'return' uprobes was alive when probe was hit:
 		 *
 		 *   idxs: 2/3 uprobe return in 'installed' mask
-		 *
-		 * in addition if 'after' state removes everything that was installed in
-		 * 'before' state, then uprobe kernel object goes away and return uprobe
-		 * is not installed and we won't hit it even if it's in 'after' state.
 		 */
 		unsigned long had_uretprobes = before & 0b1100; /* is uretprobe installed */
-		unsigned long probe_preserved = before & after; /* did uprobe go away */

-		if (had_uretprobes && probe_preserved && test_bit(idx, after))
+		if (had_uretprobes && test_bit(idx, after))
 			val++;
 		fmt = "idx 2/3: uretprobe";
 	}
-- 2.51.0

From 58dbb36930183aea41024d9c0b0ed97629473e20 Mon Sep 17 00:00:00 2001
From: Jiri Olsa
Date: Tue, 24 Sep 2024 13:07:31 +0200
Subject: [PATCH 15/16] selftests/bpf: Bail out quickly from failing consumer test

Let's bail out from the consumer test after we hit the first failure,
so we don't pollute the log with many instances of possibly the same
error.
Signed-off-by: Jiri Olsa Signed-off-by: Daniel Borkmann Signed-off-by: Alexei Starovoitov --- .../selftests/bpf/prog_tests/uprobe_multi_test.c | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/tools/testing/selftests/bpf/prog_tests/uprobe_multi_test.c b/tools/testing/selftests/bpf/prog_tests/uprobe_multi_test.c index c1ac813ff9ba..2c39902b8a09 100644 --- a/tools/testing/selftests/bpf/prog_tests/uprobe_multi_test.c +++ b/tools/testing/selftests/bpf/prog_tests/uprobe_multi_test.c @@ -836,10 +836,10 @@ uprobe_consumer_test(struct uprobe_multi_consumers *skel, return 0; } -static void consumer_test(struct uprobe_multi_consumers *skel, - unsigned long before, unsigned long after) +static int consumer_test(struct uprobe_multi_consumers *skel, + unsigned long before, unsigned long after) { - int err, idx; + int err, idx, ret = -1; printf("consumer_test before %lu after %lu\n", before, after); @@ -881,13 +881,17 @@ static void consumer_test(struct uprobe_multi_consumers *skel, fmt = "idx 2/3: uretprobe"; } - ASSERT_EQ(skel->bss->uprobe_result[idx], val, fmt); + if (!ASSERT_EQ(skel->bss->uprobe_result[idx], val, fmt)) + goto cleanup; skel->bss->uprobe_result[idx] = 0; } + ret = 0; + cleanup: for (idx = 0; idx < 4; idx++) uprobe_detach(skel, idx); + return ret; } static void test_consumers(void) @@ -939,9 +943,11 @@ static void test_consumers(void) for (before = 0; before < 16; before++) { for (after = 0; after < 16; after++) - consumer_test(skel, before, after); + if (consumer_test(skel, before, after)) + goto out; } +out: uprobe_multi_consumers__destroy(skel); } -- 2.51.0 From 7bae563c0dbe0039d80a103601f64dcdb48b1481 Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Sun, 15 Sep 2024 18:21:54 +0200 Subject: [PATCH 16/16] bpf: Constify struct btf_kind_operations struct btf_kind_operations are not modified in BTF. Constifying this structures moves some data to a read-only section, so increase overall security, especially when the structure holds some function pointers. On a x86_64, with allmodconfig: Before: ====== text data bss dec hex filename 184320 7091 548 191959 2edd7 kernel/bpf/btf.o After: ===== text data bss dec hex filename 184896 6515 548 191959 2edd7 kernel/bpf/btf.o Signed-off-by: Christophe JAILLET Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/r/9192ab72b2e9c66aefd6520f359a20297186327f.1726417289.git.christophe.jaillet@wanadoo.fr Signed-off-by: Alexei Starovoitov --- kernel/bpf/btf.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 75e4fe83c509..13dd1fa1d1b9 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -2808,7 +2808,7 @@ static void btf_ref_type_log(struct btf_verifier_env *env, btf_verifier_log(env, "type_id=%u", t->type); } -static struct btf_kind_operations modifier_ops = { +static const struct btf_kind_operations modifier_ops = { .check_meta = btf_ref_type_check_meta, .resolve = btf_modifier_resolve, .check_member = btf_modifier_check_member, @@ -2817,7 +2817,7 @@ static struct btf_kind_operations modifier_ops = { .show = btf_modifier_show, }; -static struct btf_kind_operations ptr_ops = { +static const struct btf_kind_operations ptr_ops = { .check_meta = btf_ref_type_check_meta, .resolve = btf_ptr_resolve, .check_member = btf_ptr_check_member, @@ -2858,7 +2858,7 @@ static void btf_fwd_type_log(struct btf_verifier_env *env, btf_verifier_log(env, "%s", btf_type_kflag(t) ? 
"union" : "struct"); } -static struct btf_kind_operations fwd_ops = { +static const struct btf_kind_operations fwd_ops = { .check_meta = btf_fwd_check_meta, .resolve = btf_df_resolve, .check_member = btf_df_check_member, @@ -3109,7 +3109,7 @@ static void btf_array_show(const struct btf *btf, const struct btf_type *t, __btf_array_show(btf, t, type_id, data, bits_offset, show); } -static struct btf_kind_operations array_ops = { +static const struct btf_kind_operations array_ops = { .check_meta = btf_array_check_meta, .resolve = btf_array_resolve, .check_member = btf_array_check_member, @@ -4185,7 +4185,7 @@ static void btf_struct_show(const struct btf *btf, const struct btf_type *t, __btf_struct_show(btf, t, type_id, data, bits_offset, show); } -static struct btf_kind_operations struct_ops = { +static const struct btf_kind_operations struct_ops = { .check_meta = btf_struct_check_meta, .resolve = btf_struct_resolve, .check_member = btf_struct_check_member, @@ -4353,7 +4353,7 @@ static void btf_enum_show(const struct btf *btf, const struct btf_type *t, btf_show_end_type(show); } -static struct btf_kind_operations enum_ops = { +static const struct btf_kind_operations enum_ops = { .check_meta = btf_enum_check_meta, .resolve = btf_df_resolve, .check_member = btf_enum_check_member, @@ -4456,7 +4456,7 @@ static void btf_enum64_show(const struct btf *btf, const struct btf_type *t, btf_show_end_type(show); } -static struct btf_kind_operations enum64_ops = { +static const struct btf_kind_operations enum64_ops = { .check_meta = btf_enum64_check_meta, .resolve = btf_df_resolve, .check_member = btf_enum_check_member, @@ -4534,7 +4534,7 @@ done: btf_verifier_log(env, ")"); } -static struct btf_kind_operations func_proto_ops = { +static const struct btf_kind_operations func_proto_ops = { .check_meta = btf_func_proto_check_meta, .resolve = btf_df_resolve, /* @@ -4592,7 +4592,7 @@ static int btf_func_resolve(struct btf_verifier_env *env, return 0; } -static struct btf_kind_operations func_ops = { +static const struct btf_kind_operations func_ops = { .check_meta = btf_func_check_meta, .resolve = btf_func_resolve, .check_member = btf_df_check_member, -- 2.51.0