From cf56aa8dd26328a9af4ffe7fb0bd8fcfa9407112 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Fri, 7 Feb 2025 13:25:57 +0100 Subject: [PATCH 01/16] Revert "netfilter: flowtable: teardown flow if cached mtu is stale" This reverts commit b8baac3b9c5cc4b261454ff87d75ae8306016ffd. IPv4 packets with no DF flag set on result in frequent flow entry teardown cycles, this is visible in the network topology that is used in the nft_flowtable.sh test. nft_flowtable.sh test ocassionally fails reporting that the dscp_fwd test sees no packets going through the flowtable path. Fixes: b8baac3b9c5c ("netfilter: flowtable: teardown flow if cached mtu is stale") Reported-by: Jakub Kicinski Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_flow_table_ip.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/net/netfilter/nf_flow_table_ip.c b/net/netfilter/nf_flow_table_ip.c index 97c6eb8847a0..8cd4cf7ae211 100644 --- a/net/netfilter/nf_flow_table_ip.c +++ b/net/netfilter/nf_flow_table_ip.c @@ -381,10 +381,8 @@ static int nf_flow_offload_forward(struct nf_flowtable_ctx *ctx, flow = container_of(tuplehash, struct flow_offload, tuplehash[dir]); mtu = flow->tuplehash[dir].tuple.mtu + ctx->offset; - if (unlikely(nf_flow_exceeds_mtu(skb, mtu))) { - flow_offload_teardown(flow); + if (unlikely(nf_flow_exceeds_mtu(skb, mtu))) return 0; - } iph = (struct iphdr *)(skb_network_header(skb) + ctx->offset); thoff = (iph->ihl * 4) + ctx->offset; @@ -662,10 +660,8 @@ static int nf_flow_offload_ipv6_forward(struct nf_flowtable_ctx *ctx, flow = container_of(tuplehash, struct flow_offload, tuplehash[dir]); mtu = flow->tuplehash[dir].tuple.mtu + ctx->offset; - if (unlikely(nf_flow_exceeds_mtu(skb, mtu))) { - flow_offload_teardown(flow); + if (unlikely(nf_flow_exceeds_mtu(skb, mtu))) return 0; - } ip6h = (struct ipv6hdr *)(skb_network_header(skb) + ctx->offset); thoff = sizeof(*ip6h) + ctx->offset; -- 2.51.0 From 15d6f74f03f84c5b8d032bb1be6b90af82e5b679 Mon Sep 17 00:00:00 2001 From: Marcelo Ricardo Leitner Date: Mon, 10 Feb 2025 10:24:55 -0300 Subject: [PATCH 02/16] MAINTAINERS: Add sctp headers to the general netdev entry All SCTP patches are picked up by netdev maintainers. Two headers were missing to be listed there. Reported-by: Thorsten Blum Signed-off-by: Marcelo Ricardo Leitner Reviewed-by: Simon Horman Link: https://patch.msgid.link/b3c2dc3a102eb89bd155abca2503ebd015f50ee0.1739193671.git.marcelo.leitner@gmail.com Signed-off-by: Jakub Kicinski --- MAINTAINERS | 2 ++ 1 file changed, 2 insertions(+) diff --git a/MAINTAINERS b/MAINTAINERS index 5f9535f4f196..b182021f4906 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -16508,6 +16508,7 @@ F: include/linux/netdev* F: include/linux/netlink.h F: include/linux/netpoll.h F: include/linux/rtnetlink.h +F: include/linux/sctp.h F: include/linux/seq_file_net.h F: include/linux/skbuff* F: include/net/ @@ -16524,6 +16525,7 @@ F: include/uapi/linux/netdev* F: include/uapi/linux/netlink.h F: include/uapi/linux/netlink_diag.h F: include/uapi/linux/rtnetlink.h +F: include/uapi/linux/sctp.h F: lib/net_utils.c F: lib/random32.c F: net/ -- 2.51.0 From 78dafe1cf3afa02ed71084b350713b07e72a18fb Mon Sep 17 00:00:00 2001 From: Michal Luczaj Date: Mon, 10 Feb 2025 13:15:00 +0100 Subject: [PATCH 03/16] vsock: Orphan socket after transport release During socket release, sock_orphan() is called without considering that it sets sk->sk_wq to NULL. Later, if SO_LINGER is enabled, this leads to a null pointer dereferenced in virtio_transport_wait_close(). Orphan the socket only after transport release. Partially reverts the 'Fixes:' commit. KASAN: null-ptr-deref in range [0x0000000000000018-0x000000000000001f] lock_acquire+0x19e/0x500 _raw_spin_lock_irqsave+0x47/0x70 add_wait_queue+0x46/0x230 virtio_transport_release+0x4e7/0x7f0 __vsock_release+0xfd/0x490 vsock_release+0x90/0x120 __sock_release+0xa3/0x250 sock_close+0x14/0x20 __fput+0x35e/0xa90 __x64_sys_close+0x78/0xd0 do_syscall_64+0x93/0x1b0 entry_SYSCALL_64_after_hwframe+0x76/0x7e Reported-by: syzbot+9d55b199192a4be7d02c@syzkaller.appspotmail.com Closes: https://syzkaller.appspot.com/bug?extid=9d55b199192a4be7d02c Fixes: fcdd2242c023 ("vsock: Keep the binding until socket destruction") Tested-by: Luigi Leonardi Reviewed-by: Luigi Leonardi Signed-off-by: Michal Luczaj Link: https://patch.msgid.link/20250210-vsock-linger-nullderef-v3-1-ef6244d02b54@rbox.co Signed-off-by: Jakub Kicinski --- net/vmw_vsock/af_vsock.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/net/vmw_vsock/af_vsock.c b/net/vmw_vsock/af_vsock.c index 075695173648..53a081d49d28 100644 --- a/net/vmw_vsock/af_vsock.c +++ b/net/vmw_vsock/af_vsock.c @@ -824,13 +824,19 @@ static void __vsock_release(struct sock *sk, int level) */ lock_sock_nested(sk, level); - sock_orphan(sk); + /* Indicate to vsock_remove_sock() that the socket is being released and + * can be removed from the bound_table. Unlike transport reassignment + * case, where the socket must remain bound despite vsock_remove_sock() + * being called from the transport release() callback. + */ + sock_set_flag(sk, SOCK_DEAD); if (vsk->transport) vsk->transport->release(vsk); else if (sock_type_connectible(sk->sk_type)) vsock_remove_sock(vsk); + sock_orphan(sk); sk->sk_shutdown = SHUTDOWN_MASK; skb_queue_purge(&sk->sk_receive_queue); -- 2.51.0 From 440c9d488705366b00372ea7213af69827a6c7af Mon Sep 17 00:00:00 2001 From: Michal Luczaj Date: Mon, 10 Feb 2025 13:15:01 +0100 Subject: [PATCH 04/16] vsock/test: Add test for SO_LINGER null ptr deref Explicitly close() a TCP_ESTABLISHED (connectible) socket with SO_LINGER enabled. As for now, test does not verify if close() actually lingers. On an unpatched machine, may trigger a null pointer dereference. Tested-by: Luigi Leonardi Reviewed-by: Luigi Leonardi Reviewed-by: Stefano Garzarella Signed-off-by: Michal Luczaj Link: https://patch.msgid.link/20250210-vsock-linger-nullderef-v3-2-ef6244d02b54@rbox.co Signed-off-by: Jakub Kicinski --- tools/testing/vsock/vsock_test.c | 41 ++++++++++++++++++++++++++++++++ 1 file changed, 41 insertions(+) diff --git a/tools/testing/vsock/vsock_test.c b/tools/testing/vsock/vsock_test.c index dfff8b288265..d0f6d253ac72 100644 --- a/tools/testing/vsock/vsock_test.c +++ b/tools/testing/vsock/vsock_test.c @@ -1788,6 +1788,42 @@ static void test_stream_connect_retry_server(const struct test_opts *opts) close(fd); } +static void test_stream_linger_client(const struct test_opts *opts) +{ + struct linger optval = { + .l_onoff = 1, + .l_linger = 1 + }; + int fd; + + fd = vsock_stream_connect(opts->peer_cid, opts->peer_port); + if (fd < 0) { + perror("connect"); + exit(EXIT_FAILURE); + } + + if (setsockopt(fd, SOL_SOCKET, SO_LINGER, &optval, sizeof(optval))) { + perror("setsockopt(SO_LINGER)"); + exit(EXIT_FAILURE); + } + + close(fd); +} + +static void test_stream_linger_server(const struct test_opts *opts) +{ + int fd; + + fd = vsock_stream_accept(VMADDR_CID_ANY, opts->peer_port, NULL); + if (fd < 0) { + perror("accept"); + exit(EXIT_FAILURE); + } + + vsock_wait_remote_close(fd); + close(fd); +} + static struct test_case test_cases[] = { { .name = "SOCK_STREAM connection reset", @@ -1943,6 +1979,11 @@ static struct test_case test_cases[] = { .run_client = test_stream_connect_retry_client, .run_server = test_stream_connect_retry_server, }, + { + .name = "SOCK_STREAM SO_LINGER null-ptr-deref", + .run_client = test_stream_linger_client, + .run_server = test_stream_linger_server, + }, {}, }; -- 2.51.0 From 5db843258de1e4e6b1ef1cbd1797923c9e3de548 Mon Sep 17 00:00:00 2001 From: Roger Quadros Date: Mon, 10 Feb 2025 16:52:15 +0200 Subject: [PATCH 05/16] net: ethernet: ti: am65-cpsw: fix memleak in certain XDP cases If the XDP program doesn't result in XDP_PASS then we leak the memory allocated by am65_cpsw_build_skb(). It is pointless to allocate SKB memory before running the XDP program as we would be wasting CPU cycles for cases other than XDP_PASS. Move the SKB allocation after evaluating the XDP program result. This fixes the memleak. A performance boost is seen for XDP_DROP test. XDP_DROP test: Before: 460256 rx/s 0 err/s After: 784130 rx/s 0 err/s Fixes: 8acacc40f733 ("net: ethernet: ti: am65-cpsw: Add minimal XDP support") Signed-off-by: Roger Quadros Link: https://patch.msgid.link/20250210-am65-cpsw-xdp-fixes-v1-1-ec6b1f7f1aca@kernel.org Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/ti/am65-cpsw-nuss.c | 26 ++++++++++++------------ 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/drivers/net/ethernet/ti/am65-cpsw-nuss.c b/drivers/net/ethernet/ti/am65-cpsw-nuss.c index b663271e79f7..021c5a4df8fd 100644 --- a/drivers/net/ethernet/ti/am65-cpsw-nuss.c +++ b/drivers/net/ethernet/ti/am65-cpsw-nuss.c @@ -842,7 +842,8 @@ static void am65_cpsw_nuss_tx_cleanup(void *data, dma_addr_t desc_dma) static struct sk_buff *am65_cpsw_build_skb(void *page_addr, struct net_device *ndev, - unsigned int len) + unsigned int len, + unsigned int headroom) { struct sk_buff *skb; @@ -852,7 +853,7 @@ static struct sk_buff *am65_cpsw_build_skb(void *page_addr, if (unlikely(!skb)) return NULL; - skb_reserve(skb, AM65_CPSW_HEADROOM); + skb_reserve(skb, headroom); skb->dev = ndev; return skb; @@ -1315,16 +1316,8 @@ static int am65_cpsw_nuss_rx_packets(struct am65_cpsw_rx_flow *flow, dev_dbg(dev, "%s rx csum_info:%#x\n", __func__, csum_info); dma_unmap_single(rx_chn->dma_dev, buf_dma, buf_dma_len, DMA_FROM_DEVICE); - k3_cppi_desc_pool_free(rx_chn->desc_pool, desc_rx); - skb = am65_cpsw_build_skb(page_addr, ndev, - AM65_CPSW_MAX_PACKET_SIZE); - if (unlikely(!skb)) { - new_page = page; - goto requeue; - } - if (port->xdp_prog) { xdp_init_buff(&xdp, PAGE_SIZE, &port->xdp_rxq[flow->id]); xdp_prepare_buff(&xdp, page_addr, AM65_CPSW_HEADROOM, @@ -1334,9 +1327,16 @@ static int am65_cpsw_nuss_rx_packets(struct am65_cpsw_rx_flow *flow, if (*xdp_state != AM65_CPSW_XDP_PASS) goto allocate; - /* Compute additional headroom to be reserved */ - headroom = (xdp.data - xdp.data_hard_start) - skb_headroom(skb); - skb_reserve(skb, headroom); + headroom = xdp.data - xdp.data_hard_start; + } else { + headroom = AM65_CPSW_HEADROOM; + } + + skb = am65_cpsw_build_skb(page_addr, ndev, + AM65_CPSW_MAX_PACKET_SIZE, headroom); + if (unlikely(!skb)) { + new_page = page; + goto requeue; } ndev_priv = netdev_priv(ndev); -- 2.51.0 From 8a9f82ff15da03a6804cdd6557fb36ff71c0924f Mon Sep 17 00:00:00 2001 From: Roger Quadros Date: Mon, 10 Feb 2025 16:52:16 +0200 Subject: [PATCH 06/16] net: ethernet: ti: am65-cpsw: fix RX & TX statistics for XDP_TX case For successful XDP_TX and XDP_REDIRECT cases, the packet was received successfully so update RX statistics. Use original received packet length for that. TX packets statistics are incremented on TX completion so don't update it while TX queueing. If xdp_convert_buff_to_frame() fails, increment tx_dropped. Signed-off-by: Roger Quadros Fixes: 8acacc40f733 ("net: ethernet: ti: am65-cpsw: Add minimal XDP support") Link: https://patch.msgid.link/20250210-am65-cpsw-xdp-fixes-v1-2-ec6b1f7f1aca@kernel.org Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/ti/am65-cpsw-nuss.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/ti/am65-cpsw-nuss.c b/drivers/net/ethernet/ti/am65-cpsw-nuss.c index 021c5a4df8fd..38f3be310928 100644 --- a/drivers/net/ethernet/ti/am65-cpsw-nuss.c +++ b/drivers/net/ethernet/ti/am65-cpsw-nuss.c @@ -1170,9 +1170,11 @@ static int am65_cpsw_run_xdp(struct am65_cpsw_rx_flow *flow, struct xdp_frame *xdpf; struct bpf_prog *prog; struct page *page; + int pkt_len; u32 act; int err; + pkt_len = *len; prog = READ_ONCE(port->xdp_prog); if (!prog) return AM65_CPSW_XDP_PASS; @@ -1190,8 +1192,10 @@ static int am65_cpsw_run_xdp(struct am65_cpsw_rx_flow *flow, netif_txq = netdev_get_tx_queue(ndev, tx_chn->id); xdpf = xdp_convert_buff_to_frame(xdp); - if (unlikely(!xdpf)) + if (unlikely(!xdpf)) { + ndev->stats.tx_dropped++; goto drop; + } __netif_tx_lock(netif_txq, cpu); err = am65_cpsw_xdp_tx_frame(ndev, tx_chn, xdpf, @@ -1200,14 +1204,14 @@ static int am65_cpsw_run_xdp(struct am65_cpsw_rx_flow *flow, if (err) goto drop; - dev_sw_netstats_tx_add(ndev, 1, *len); + dev_sw_netstats_rx_add(ndev, pkt_len); ret = AM65_CPSW_XDP_CONSUMED; goto out; case XDP_REDIRECT: if (unlikely(xdp_do_redirect(ndev, xdp, prog))) goto drop; - dev_sw_netstats_rx_add(ndev, *len); + dev_sw_netstats_rx_add(ndev, pkt_len); ret = AM65_CPSW_XDP_REDIRECT; goto out; default: -- 2.51.0 From 4542536f664f752db5feba2c5998b165933c34f2 Mon Sep 17 00:00:00 2001 From: Roger Quadros Date: Mon, 10 Feb 2025 16:52:17 +0200 Subject: [PATCH 07/16] net: ethernet: ti: am65_cpsw: fix tx_cleanup for XDP case For XDP transmit case, swdata doesn't contain SKB but the XDP Frame. Infer the correct swdata based on buffer type and return the XDP Frame for XDP transmit case. Signed-off-by: Roger Quadros Fixes: 8acacc40f733 ("net: ethernet: ti: am65-cpsw: Add minimal XDP support") Link: https://patch.msgid.link/20250210-am65-cpsw-xdp-fixes-v1-3-ec6b1f7f1aca@kernel.org Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/ti/am65-cpsw-nuss.c | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/ti/am65-cpsw-nuss.c b/drivers/net/ethernet/ti/am65-cpsw-nuss.c index 38f3be310928..2806238629f8 100644 --- a/drivers/net/ethernet/ti/am65-cpsw-nuss.c +++ b/drivers/net/ethernet/ti/am65-cpsw-nuss.c @@ -828,16 +828,24 @@ static void am65_cpsw_nuss_xmit_free(struct am65_cpsw_tx_chn *tx_chn, static void am65_cpsw_nuss_tx_cleanup(void *data, dma_addr_t desc_dma) { struct am65_cpsw_tx_chn *tx_chn = data; + enum am65_cpsw_tx_buf_type buf_type; struct cppi5_host_desc_t *desc_tx; + struct xdp_frame *xdpf; struct sk_buff *skb; void **swdata; desc_tx = k3_cppi_desc_pool_dma2virt(tx_chn->desc_pool, desc_dma); swdata = cppi5_hdesc_get_swdata(desc_tx); - skb = *(swdata); - am65_cpsw_nuss_xmit_free(tx_chn, desc_tx); + buf_type = am65_cpsw_nuss_buf_type(tx_chn, desc_dma); + if (buf_type == AM65_CPSW_TX_BUF_TYPE_SKB) { + skb = *(swdata); + dev_kfree_skb_any(skb); + } else { + xdpf = *(swdata); + xdp_return_frame(xdpf); + } - dev_kfree_skb_any(skb); + am65_cpsw_nuss_xmit_free(tx_chn, desc_tx); } static struct sk_buff *am65_cpsw_build_skb(void *page_addr, -- 2.51.0 From b4f82f9ed43aefa79bec2504ae8c29be0c0f5d1d Mon Sep 17 00:00:00 2001 From: Luiz Augusto von Dentz Date: Thu, 16 Jan 2025 10:35:03 -0500 Subject: [PATCH 08/16] Bluetooth: L2CAP: Fix slab-use-after-free Read in l2cap_send_cmd After the hci sync command releases l2cap_conn, the hci receive data work queue references the released l2cap_conn when sending to the upper layer. Add hci dev lock to the hci receive data work queue to synchronize the two. [1] BUG: KASAN: slab-use-after-free in l2cap_send_cmd+0x187/0x8d0 net/bluetooth/l2cap_core.c:954 Read of size 8 at addr ffff8880271a4000 by task kworker/u9:2/5837 CPU: 0 UID: 0 PID: 5837 Comm: kworker/u9:2 Not tainted 6.13.0-rc5-syzkaller-00163-gab75170520d4 #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 09/13/2024 Workqueue: hci1 hci_rx_work Call Trace: __dump_stack lib/dump_stack.c:94 [inline] dump_stack_lvl+0x241/0x360 lib/dump_stack.c:120 print_address_description mm/kasan/report.c:378 [inline] print_report+0x169/0x550 mm/kasan/report.c:489 kasan_report+0x143/0x180 mm/kasan/report.c:602 l2cap_build_cmd net/bluetooth/l2cap_core.c:2964 [inline] l2cap_send_cmd+0x187/0x8d0 net/bluetooth/l2cap_core.c:954 l2cap_sig_send_rej net/bluetooth/l2cap_core.c:5502 [inline] l2cap_sig_channel net/bluetooth/l2cap_core.c:5538 [inline] l2cap_recv_frame+0x221f/0x10db0 net/bluetooth/l2cap_core.c:6817 hci_acldata_packet net/bluetooth/hci_core.c:3797 [inline] hci_rx_work+0x508/0xdb0 net/bluetooth/hci_core.c:4040 process_one_work kernel/workqueue.c:3229 [inline] process_scheduled_works+0xa66/0x1840 kernel/workqueue.c:3310 worker_thread+0x870/0xd30 kernel/workqueue.c:3391 kthread+0x2f0/0x390 kernel/kthread.c:389 ret_from_fork+0x4b/0x80 arch/x86/kernel/process.c:147 ret_from_fork_asm+0x1a/0x30 arch/x86/entry/entry_64.S:244 Allocated by task 5837: kasan_save_stack mm/kasan/common.c:47 [inline] kasan_save_track+0x3f/0x80 mm/kasan/common.c:68 poison_kmalloc_redzone mm/kasan/common.c:377 [inline] __kasan_kmalloc+0x98/0xb0 mm/kasan/common.c:394 kasan_kmalloc include/linux/kasan.h:260 [inline] __kmalloc_cache_noprof+0x243/0x390 mm/slub.c:4329 kmalloc_noprof include/linux/slab.h:901 [inline] kzalloc_noprof include/linux/slab.h:1037 [inline] l2cap_conn_add+0xa9/0x8e0 net/bluetooth/l2cap_core.c:6860 l2cap_connect_cfm+0x115/0x1090 net/bluetooth/l2cap_core.c:7239 hci_connect_cfm include/net/bluetooth/hci_core.h:2057 [inline] hci_remote_features_evt+0x68e/0xac0 net/bluetooth/hci_event.c:3726 hci_event_func net/bluetooth/hci_event.c:7473 [inline] hci_event_packet+0xac2/0x1540 net/bluetooth/hci_event.c:7525 hci_rx_work+0x3f3/0xdb0 net/bluetooth/hci_core.c:4035 process_one_work kernel/workqueue.c:3229 [inline] process_scheduled_works+0xa66/0x1840 kernel/workqueue.c:3310 worker_thread+0x870/0xd30 kernel/workqueue.c:3391 kthread+0x2f0/0x390 kernel/kthread.c:389 ret_from_fork+0x4b/0x80 arch/x86/kernel/process.c:147 ret_from_fork_asm+0x1a/0x30 arch/x86/entry/entry_64.S:244 Freed by task 54: kasan_save_stack mm/kasan/common.c:47 [inline] kasan_save_track+0x3f/0x80 mm/kasan/common.c:68 kasan_save_free_info+0x40/0x50 mm/kasan/generic.c:582 poison_slab_object mm/kasan/common.c:247 [inline] __kasan_slab_free+0x59/0x70 mm/kasan/common.c:264 kasan_slab_free include/linux/kasan.h:233 [inline] slab_free_hook mm/slub.c:2353 [inline] slab_free mm/slub.c:4613 [inline] kfree+0x196/0x430 mm/slub.c:4761 l2cap_connect_cfm+0xcc/0x1090 net/bluetooth/l2cap_core.c:7235 hci_connect_cfm include/net/bluetooth/hci_core.h:2057 [inline] hci_conn_failed+0x287/0x400 net/bluetooth/hci_conn.c:1266 hci_abort_conn_sync+0x56c/0x11f0 net/bluetooth/hci_sync.c:5603 hci_cmd_sync_work+0x22b/0x400 net/bluetooth/hci_sync.c:332 process_one_work kernel/workqueue.c:3229 [inline] process_scheduled_works+0xa66/0x1840 kernel/workqueue.c:3310 worker_thread+0x870/0xd30 kernel/workqueue.c:3391 kthread+0x2f0/0x390 kernel/kthread.c:389 ret_from_fork+0x4b/0x80 arch/x86/kernel/process.c:147 ret_from_fork_asm+0x1a/0x30 arch/x86/entry/entry_64.S:244 Reported-by: syzbot+31c2f641b850a348a734@syzkaller.appspotmail.com Closes: https://syzkaller.appspot.com/bug?extid=31c2f641b850a348a734 Tested-by: syzbot+31c2f641b850a348a734@syzkaller.appspotmail.com Signed-off-by: Edward Adam Davis Signed-off-by: Luiz Augusto von Dentz --- net/bluetooth/l2cap_core.c | 39 +++++++++++++++++++++++++++++++++----- 1 file changed, 34 insertions(+), 5 deletions(-) diff --git a/net/bluetooth/l2cap_core.c b/net/bluetooth/l2cap_core.c index 27b4c4a2ba1f..adb8c33ac595 100644 --- a/net/bluetooth/l2cap_core.c +++ b/net/bluetooth/l2cap_core.c @@ -948,6 +948,16 @@ static u8 l2cap_get_ident(struct l2cap_conn *conn) return id; } +static void l2cap_send_acl(struct l2cap_conn *conn, struct sk_buff *skb, + u8 flags) +{ + /* Check if the hcon still valid before attempting to send */ + if (hci_conn_valid(conn->hcon->hdev, conn->hcon)) + hci_send_acl(conn->hchan, skb, flags); + else + kfree_skb(skb); +} + static void l2cap_send_cmd(struct l2cap_conn *conn, u8 ident, u8 code, u16 len, void *data) { @@ -970,7 +980,7 @@ static void l2cap_send_cmd(struct l2cap_conn *conn, u8 ident, u8 code, u16 len, bt_cb(skb)->force_active = BT_POWER_FORCE_ACTIVE_ON; skb->priority = HCI_PRIO_MAX; - hci_send_acl(conn->hchan, skb, flags); + l2cap_send_acl(conn, skb, flags); } static void l2cap_do_send(struct l2cap_chan *chan, struct sk_buff *skb) @@ -1792,13 +1802,10 @@ static void l2cap_conn_del(struct hci_conn *hcon, int err) mutex_unlock(&conn->chan_lock); - hci_chan_del(conn->hchan); - if (conn->info_state & L2CAP_INFO_FEAT_MASK_REQ_SENT) cancel_delayed_work_sync(&conn->info_timer); hcon->l2cap_data = NULL; - conn->hchan = NULL; l2cap_conn_put(conn); } @@ -1806,6 +1813,7 @@ static void l2cap_conn_free(struct kref *ref) { struct l2cap_conn *conn = container_of(ref, struct l2cap_conn, ref); + hci_chan_del(conn->hchan); hci_conn_put(conn->hcon); kfree(conn); } @@ -7466,14 +7474,33 @@ static void l2cap_recv_reset(struct l2cap_conn *conn) conn->rx_len = 0; } +static struct l2cap_conn *l2cap_conn_hold_unless_zero(struct l2cap_conn *c) +{ + BT_DBG("conn %p orig refcnt %u", c, kref_read(&c->ref)); + + if (!kref_get_unless_zero(&c->ref)) + return NULL; + + return c; +} + void l2cap_recv_acldata(struct hci_conn *hcon, struct sk_buff *skb, u16 flags) { - struct l2cap_conn *conn = hcon->l2cap_data; + struct l2cap_conn *conn; int len; + /* Lock hdev to access l2cap_data to avoid race with l2cap_conn_del */ + hci_dev_lock(hcon->hdev); + + conn = hcon->l2cap_data; + if (!conn) conn = l2cap_conn_add(hcon); + conn = l2cap_conn_hold_unless_zero(conn); + + hci_dev_unlock(hcon->hdev); + if (!conn) goto drop; @@ -7565,6 +7592,8 @@ void l2cap_recv_acldata(struct hci_conn *hcon, struct sk_buff *skb, u16 flags) break; } + l2cap_conn_put(conn); + drop: kfree_skb(skb); } -- 2.51.0 From 872274b992839ff64fe560767fe7ee5f942ccdb1 Mon Sep 17 00:00:00 2001 From: Kiran K Date: Fri, 31 Jan 2025 18:30:19 +0530 Subject: [PATCH 09/16] Bluetooth: btintel_pcie: Fix a potential race condition On HCI_OP_RESET command, firmware raises alive interrupt. Driver needs to wait for this before sending other command. This patch fixes the potential miss of alive interrupt due to which HCI_OP_RESET can timeout. Expected flow: If tx command is HCI_OP_RESET, 1. set data->gp0_received = false 2. send HCI_OP_RESET 3. wait for alive interrupt Actual flow having potential race: If tx command is HCI_OP_RESET, 1. send HCI_OP_RESET 1a. Firmware raises alive interrupt here and in ISR data->gp0_received is set to true 2. set data->gp0_received = false 3. wait for alive interrupt Signed-off-by: Kiran K Fixes: 05c200c8f029 ("Bluetooth: btintel_pcie: Add handshake between driver and firmware") Reported-by: Bjorn Helgaas Closes: https://patchwork.kernel.org/project/bluetooth/patch/20241001104451.626964-1-kiran.k@intel.com/ Signed-off-by: Luiz Augusto von Dentz --- drivers/bluetooth/btintel_pcie.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/bluetooth/btintel_pcie.c b/drivers/bluetooth/btintel_pcie.c index 2b79952f3628..091ffe3e1495 100644 --- a/drivers/bluetooth/btintel_pcie.c +++ b/drivers/bluetooth/btintel_pcie.c @@ -1320,6 +1320,10 @@ static int btintel_pcie_send_frame(struct hci_dev *hdev, if (opcode == 0xfc01) btintel_pcie_inject_cmd_complete(hdev, opcode); } + /* Firmware raises alive interrupt on HCI_OP_RESET */ + if (opcode == HCI_OP_RESET) + data->gp0_received = false; + hdev->stat.cmd_tx++; break; case HCI_ACLDATA_PKT: @@ -1357,7 +1361,6 @@ static int btintel_pcie_send_frame(struct hci_dev *hdev, opcode, btintel_pcie_alivectxt_state2str(old_ctxt), btintel_pcie_alivectxt_state2str(data->alive_intr_ctxt)); if (opcode == HCI_OP_RESET) { - data->gp0_received = false; ret = wait_event_timeout(data->gp0_wait_q, data->gp0_received, msecs_to_jiffies(BTINTEL_DEFAULT_INTR_TIMEOUT_MS)); -- 2.51.0 From ab4eedb790cae44313759b50fe47da285e2519d5 Mon Sep 17 00:00:00 2001 From: Luiz Augusto von Dentz Date: Thu, 6 Feb 2025 15:54:45 -0500 Subject: [PATCH 10/16] Bluetooth: L2CAP: Fix corrupted list in hci_chan_del This fixes the following trace by reworking the locking of l2cap_conn so instead of only locking when changing the chan_l list this promotes chan_lock to a general lock of l2cap_conn so whenever it is being held it would prevents the likes of l2cap_conn_del to run: list_del corruption, ffff888021297e00->prev is LIST_POISON2 (dead000000000122) ------------[ cut here ]------------ kernel BUG at lib/list_debug.c:61! Oops: invalid opcode: 0000 [#1] PREEMPT SMP KASAN PTI CPU: 1 UID: 0 PID: 5896 Comm: syz-executor213 Not tainted 6.14.0-rc1-next-20250204-syzkaller #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 12/27/2024 RIP: 0010:__list_del_entry_valid_or_report+0x12c/0x190 lib/list_debug.c:59 Code: 8c 4c 89 fe 48 89 da e8 32 8c 37 fc 90 0f 0b 48 89 df e8 27 9f 14 fd 48 c7 c7 a0 c0 60 8c 4c 89 fe 48 89 da e8 15 8c 37 fc 90 <0f> 0b 4c 89 e7 e8 0a 9f 14 fd 42 80 3c 2b 00 74 08 4c 89 e7 e8 cb RSP: 0018:ffffc90003f6f998 EFLAGS: 00010246 RAX: 000000000000004e RBX: dead000000000122 RCX: 01454d423f7fbf00 RDX: 0000000000000000 RSI: 0000000080000000 RDI: 0000000000000000 RBP: dffffc0000000000 R08: ffffffff819f077c R09: 1ffff920007eded0 R10: dffffc0000000000 R11: fffff520007eded1 R12: dead000000000122 R13: dffffc0000000000 R14: ffff8880352248d8 R15: ffff888021297e00 FS: 00007f7ace6686c0(0000) GS:ffff8880b8700000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007f7aceeeb1d0 CR3: 000000003527c000 CR4: 00000000003526f0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: __list_del_entry_valid include/linux/list.h:124 [inline] __list_del_entry include/linux/list.h:215 [inline] list_del_rcu include/linux/rculist.h:168 [inline] hci_chan_del+0x70/0x1b0 net/bluetooth/hci_conn.c:2858 l2cap_conn_free net/bluetooth/l2cap_core.c:1816 [inline] kref_put include/linux/kref.h:65 [inline] l2cap_conn_put+0x70/0xe0 net/bluetooth/l2cap_core.c:1830 l2cap_sock_shutdown+0xa8a/0x1020 net/bluetooth/l2cap_sock.c:1377 l2cap_sock_release+0x79/0x1d0 net/bluetooth/l2cap_sock.c:1416 __sock_release net/socket.c:642 [inline] sock_close+0xbc/0x240 net/socket.c:1393 __fput+0x3e9/0x9f0 fs/file_table.c:448 task_work_run+0x24f/0x310 kernel/task_work.c:227 ptrace_notify+0x2d2/0x380 kernel/signal.c:2522 ptrace_report_syscall include/linux/ptrace.h:415 [inline] ptrace_report_syscall_exit include/linux/ptrace.h:477 [inline] syscall_exit_work+0xc7/0x1d0 kernel/entry/common.c:173 syscall_exit_to_user_mode_prepare kernel/entry/common.c:200 [inline] __syscall_exit_to_user_mode_work kernel/entry/common.c:205 [inline] syscall_exit_to_user_mode+0x24a/0x340 kernel/entry/common.c:218 do_syscall_64+0x100/0x230 arch/x86/entry/common.c:89 entry_SYSCALL_64_after_hwframe+0x77/0x7f RIP: 0033:0x7f7aceeaf449 Code: 28 00 00 00 75 05 48 83 c4 28 c3 e8 41 19 00 00 90 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 c7 c1 b0 ff ff ff f7 d8 64 89 01 48 RSP: 002b:00007f7ace668218 EFLAGS: 00000246 ORIG_RAX: 000000000000002a RAX: fffffffffffffffc RBX: 00007f7acef39328 RCX: 00007f7aceeaf449 RDX: 000000000000000e RSI: 0000000020000100 RDI: 0000000000000004 RBP: 00007f7acef39320 R08: 0000000000000000 R09: 0000000000000000 R10: 0000000000000000 R11: 0000000000000246 R12: 0000000000000003 R13: 0000000000000004 R14: 00007f7ace668670 R15: 000000000000000b Modules linked in: ---[ end trace 0000000000000000 ]--- RIP: 0010:__list_del_entry_valid_or_report+0x12c/0x190 lib/list_debug.c:59 Code: 8c 4c 89 fe 48 89 da e8 32 8c 37 fc 90 0f 0b 48 89 df e8 27 9f 14 fd 48 c7 c7 a0 c0 60 8c 4c 89 fe 48 89 da e8 15 8c 37 fc 90 <0f> 0b 4c 89 e7 e8 0a 9f 14 fd 42 80 3c 2b 00 74 08 4c 89 e7 e8 cb RSP: 0018:ffffc90003f6f998 EFLAGS: 00010246 RAX: 000000000000004e RBX: dead000000000122 RCX: 01454d423f7fbf00 RDX: 0000000000000000 RSI: 0000000080000000 RDI: 0000000000000000 RBP: dffffc0000000000 R08: ffffffff819f077c R09: 1ffff920007eded0 R10: dffffc0000000000 R11: fffff520007eded1 R12: dead000000000122 R13: dffffc0000000000 R14: ffff8880352248d8 R15: ffff888021297e00 FS: 00007f7ace6686c0(0000) GS:ffff8880b8600000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007f7acef05b08 CR3: 000000003527c000 CR4: 00000000003526f0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Reported-by: syzbot+10bd8fe6741eedd2be2e@syzkaller.appspotmail.com Tested-by: syzbot+10bd8fe6741eedd2be2e@syzkaller.appspotmail.com Fixes: b4f82f9ed43a ("Bluetooth: L2CAP: Fix slab-use-after-free Read in l2cap_send_cmd") Signed-off-by: Luiz Augusto von Dentz Signed-off-by: Dan Carpenter --- include/net/bluetooth/l2cap.h | 3 +- net/bluetooth/l2cap_core.c | 138 ++++++++++++---------------------- net/bluetooth/l2cap_sock.c | 15 ++-- 3 files changed, 58 insertions(+), 98 deletions(-) diff --git a/include/net/bluetooth/l2cap.h b/include/net/bluetooth/l2cap.h index d9c767cf773d..9189354c568f 100644 --- a/include/net/bluetooth/l2cap.h +++ b/include/net/bluetooth/l2cap.h @@ -668,7 +668,7 @@ struct l2cap_conn { struct l2cap_chan *smp; struct list_head chan_l; - struct mutex chan_lock; + struct mutex lock; struct kref ref; struct list_head users; }; @@ -970,6 +970,7 @@ void l2cap_chan_del(struct l2cap_chan *chan, int err); void l2cap_send_conn_req(struct l2cap_chan *chan); struct l2cap_conn *l2cap_conn_get(struct l2cap_conn *conn); +struct l2cap_conn *l2cap_conn_hold_unless_zero(struct l2cap_conn *conn); void l2cap_conn_put(struct l2cap_conn *conn); int l2cap_register_user(struct l2cap_conn *conn, struct l2cap_user *user); diff --git a/net/bluetooth/l2cap_core.c b/net/bluetooth/l2cap_core.c index adb8c33ac595..fec11e576f31 100644 --- a/net/bluetooth/l2cap_core.c +++ b/net/bluetooth/l2cap_core.c @@ -119,7 +119,6 @@ static struct l2cap_chan *l2cap_get_chan_by_scid(struct l2cap_conn *conn, { struct l2cap_chan *c; - mutex_lock(&conn->chan_lock); c = __l2cap_get_chan_by_scid(conn, cid); if (c) { /* Only lock if chan reference is not 0 */ @@ -127,7 +126,6 @@ static struct l2cap_chan *l2cap_get_chan_by_scid(struct l2cap_conn *conn, if (c) l2cap_chan_lock(c); } - mutex_unlock(&conn->chan_lock); return c; } @@ -140,7 +138,6 @@ static struct l2cap_chan *l2cap_get_chan_by_dcid(struct l2cap_conn *conn, { struct l2cap_chan *c; - mutex_lock(&conn->chan_lock); c = __l2cap_get_chan_by_dcid(conn, cid); if (c) { /* Only lock if chan reference is not 0 */ @@ -148,7 +145,6 @@ static struct l2cap_chan *l2cap_get_chan_by_dcid(struct l2cap_conn *conn, if (c) l2cap_chan_lock(c); } - mutex_unlock(&conn->chan_lock); return c; } @@ -418,7 +414,7 @@ static void l2cap_chan_timeout(struct work_struct *work) if (!conn) return; - mutex_lock(&conn->chan_lock); + mutex_lock(&conn->lock); /* __set_chan_timer() calls l2cap_chan_hold(chan) while scheduling * this work. No need to call l2cap_chan_hold(chan) here again. */ @@ -439,7 +435,7 @@ static void l2cap_chan_timeout(struct work_struct *work) l2cap_chan_unlock(chan); l2cap_chan_put(chan); - mutex_unlock(&conn->chan_lock); + mutex_unlock(&conn->lock); } struct l2cap_chan *l2cap_chan_create(void) @@ -641,9 +637,9 @@ void __l2cap_chan_add(struct l2cap_conn *conn, struct l2cap_chan *chan) void l2cap_chan_add(struct l2cap_conn *conn, struct l2cap_chan *chan) { - mutex_lock(&conn->chan_lock); + mutex_lock(&conn->lock); __l2cap_chan_add(conn, chan); - mutex_unlock(&conn->chan_lock); + mutex_unlock(&conn->lock); } void l2cap_chan_del(struct l2cap_chan *chan, int err) @@ -731,9 +727,9 @@ void l2cap_chan_list(struct l2cap_conn *conn, l2cap_chan_func_t func, if (!conn) return; - mutex_lock(&conn->chan_lock); + mutex_lock(&conn->lock); __l2cap_chan_list(conn, func, data); - mutex_unlock(&conn->chan_lock); + mutex_unlock(&conn->lock); } EXPORT_SYMBOL_GPL(l2cap_chan_list); @@ -745,7 +741,7 @@ static void l2cap_conn_update_id_addr(struct work_struct *work) struct hci_conn *hcon = conn->hcon; struct l2cap_chan *chan; - mutex_lock(&conn->chan_lock); + mutex_lock(&conn->lock); list_for_each_entry(chan, &conn->chan_l, list) { l2cap_chan_lock(chan); @@ -754,7 +750,7 @@ static void l2cap_conn_update_id_addr(struct work_struct *work) l2cap_chan_unlock(chan); } - mutex_unlock(&conn->chan_lock); + mutex_unlock(&conn->lock); } static void l2cap_chan_le_connect_reject(struct l2cap_chan *chan) @@ -1507,8 +1503,6 @@ static void l2cap_conn_start(struct l2cap_conn *conn) BT_DBG("conn %p", conn); - mutex_lock(&conn->chan_lock); - list_for_each_entry_safe(chan, tmp, &conn->chan_l, list) { l2cap_chan_lock(chan); @@ -1577,8 +1571,6 @@ static void l2cap_conn_start(struct l2cap_conn *conn) l2cap_chan_unlock(chan); } - - mutex_unlock(&conn->chan_lock); } static void l2cap_le_conn_ready(struct l2cap_conn *conn) @@ -1624,7 +1616,7 @@ static void l2cap_conn_ready(struct l2cap_conn *conn) if (hcon->type == ACL_LINK) l2cap_request_info(conn); - mutex_lock(&conn->chan_lock); + mutex_lock(&conn->lock); list_for_each_entry(chan, &conn->chan_l, list) { @@ -1642,7 +1634,7 @@ static void l2cap_conn_ready(struct l2cap_conn *conn) l2cap_chan_unlock(chan); } - mutex_unlock(&conn->chan_lock); + mutex_unlock(&conn->lock); if (hcon->type == LE_LINK) l2cap_le_conn_ready(conn); @@ -1657,14 +1649,10 @@ static void l2cap_conn_unreliable(struct l2cap_conn *conn, int err) BT_DBG("conn %p", conn); - mutex_lock(&conn->chan_lock); - list_for_each_entry(chan, &conn->chan_l, list) { if (test_bit(FLAG_FORCE_RELIABLE, &chan->flags)) l2cap_chan_set_err(chan, err); } - - mutex_unlock(&conn->chan_lock); } static void l2cap_info_timeout(struct work_struct *work) @@ -1675,7 +1663,9 @@ static void l2cap_info_timeout(struct work_struct *work) conn->info_state |= L2CAP_INFO_FEAT_MASK_REQ_DONE; conn->info_ident = 0; + mutex_lock(&conn->lock); l2cap_conn_start(conn); + mutex_unlock(&conn->lock); } /* @@ -1767,6 +1757,8 @@ static void l2cap_conn_del(struct hci_conn *hcon, int err) BT_DBG("hcon %p conn %p, err %d", hcon, conn, err); + mutex_lock(&conn->lock); + kfree_skb(conn->rx_skb); skb_queue_purge(&conn->pending_rx); @@ -1785,8 +1777,6 @@ static void l2cap_conn_del(struct hci_conn *hcon, int err) /* Force the connection to be immediately dropped */ hcon->disc_timeout = 0; - mutex_lock(&conn->chan_lock); - /* Kill channels */ list_for_each_entry_safe(chan, l, &conn->chan_l, list) { l2cap_chan_hold(chan); @@ -1800,12 +1790,14 @@ static void l2cap_conn_del(struct hci_conn *hcon, int err) l2cap_chan_put(chan); } - mutex_unlock(&conn->chan_lock); - if (conn->info_state & L2CAP_INFO_FEAT_MASK_REQ_SENT) cancel_delayed_work_sync(&conn->info_timer); + hci_chan_del(conn->hchan); + conn->hchan = NULL; + hcon->l2cap_data = NULL; + mutex_unlock(&conn->lock); l2cap_conn_put(conn); } @@ -1813,7 +1805,6 @@ static void l2cap_conn_free(struct kref *ref) { struct l2cap_conn *conn = container_of(ref, struct l2cap_conn, ref); - hci_chan_del(conn->hchan); hci_conn_put(conn->hcon); kfree(conn); } @@ -2924,8 +2915,6 @@ static void l2cap_raw_recv(struct l2cap_conn *conn, struct sk_buff *skb) BT_DBG("conn %p", conn); - mutex_lock(&conn->chan_lock); - list_for_each_entry(chan, &conn->chan_l, list) { if (chan->chan_type != L2CAP_CHAN_RAW) continue; @@ -2940,8 +2929,6 @@ static void l2cap_raw_recv(struct l2cap_conn *conn, struct sk_buff *skb) if (chan->ops->recv(chan, nskb)) kfree_skb(nskb); } - - mutex_unlock(&conn->chan_lock); } /* ---- L2CAP signalling commands ---- */ @@ -3960,7 +3947,6 @@ static void l2cap_connect(struct l2cap_conn *conn, struct l2cap_cmd_hdr *cmd, goto response; } - mutex_lock(&conn->chan_lock); l2cap_chan_lock(pchan); /* Check if the ACL is secure enough (if not SDP) */ @@ -4067,7 +4053,6 @@ response: } l2cap_chan_unlock(pchan); - mutex_unlock(&conn->chan_lock); l2cap_chan_put(pchan); } @@ -4106,27 +4091,19 @@ static int l2cap_connect_create_rsp(struct l2cap_conn *conn, BT_DBG("dcid 0x%4.4x scid 0x%4.4x result 0x%2.2x status 0x%2.2x", dcid, scid, result, status); - mutex_lock(&conn->chan_lock); - if (scid) { chan = __l2cap_get_chan_by_scid(conn, scid); - if (!chan) { - err = -EBADSLT; - goto unlock; - } + if (!chan) + return -EBADSLT; } else { chan = __l2cap_get_chan_by_ident(conn, cmd->ident); - if (!chan) { - err = -EBADSLT; - goto unlock; - } + if (!chan) + return -EBADSLT; } chan = l2cap_chan_hold_unless_zero(chan); - if (!chan) { - err = -EBADSLT; - goto unlock; - } + if (!chan) + return -EBADSLT; err = 0; @@ -4164,9 +4141,6 @@ static int l2cap_connect_create_rsp(struct l2cap_conn *conn, l2cap_chan_unlock(chan); l2cap_chan_put(chan); -unlock: - mutex_unlock(&conn->chan_lock); - return err; } @@ -4454,11 +4428,7 @@ static inline int l2cap_disconnect_req(struct l2cap_conn *conn, chan->ops->set_shutdown(chan); - l2cap_chan_unlock(chan); - mutex_lock(&conn->chan_lock); - l2cap_chan_lock(chan); l2cap_chan_del(chan, ECONNRESET); - mutex_unlock(&conn->chan_lock); chan->ops->close(chan); @@ -4495,11 +4465,7 @@ static inline int l2cap_disconnect_rsp(struct l2cap_conn *conn, return 0; } - l2cap_chan_unlock(chan); - mutex_lock(&conn->chan_lock); - l2cap_chan_lock(chan); l2cap_chan_del(chan, 0); - mutex_unlock(&conn->chan_lock); chan->ops->close(chan); @@ -4697,13 +4663,9 @@ static int l2cap_le_connect_rsp(struct l2cap_conn *conn, BT_DBG("dcid 0x%4.4x mtu %u mps %u credits %u result 0x%2.2x", dcid, mtu, mps, credits, result); - mutex_lock(&conn->chan_lock); - chan = __l2cap_get_chan_by_ident(conn, cmd->ident); - if (!chan) { - err = -EBADSLT; - goto unlock; - } + if (!chan) + return -EBADSLT; err = 0; @@ -4751,9 +4713,6 @@ static int l2cap_le_connect_rsp(struct l2cap_conn *conn, l2cap_chan_unlock(chan); -unlock: - mutex_unlock(&conn->chan_lock); - return err; } @@ -4865,7 +4824,6 @@ static int l2cap_le_connect_req(struct l2cap_conn *conn, goto response; } - mutex_lock(&conn->chan_lock); l2cap_chan_lock(pchan); if (!smp_sufficient_security(conn->hcon, pchan->sec_level, @@ -4931,7 +4889,6 @@ static int l2cap_le_connect_req(struct l2cap_conn *conn, response_unlock: l2cap_chan_unlock(pchan); - mutex_unlock(&conn->chan_lock); l2cap_chan_put(pchan); if (result == L2CAP_CR_PEND) @@ -5065,7 +5022,6 @@ static inline int l2cap_ecred_conn_req(struct l2cap_conn *conn, goto response; } - mutex_lock(&conn->chan_lock); l2cap_chan_lock(pchan); if (!smp_sufficient_security(conn->hcon, pchan->sec_level, @@ -5140,7 +5096,6 @@ static inline int l2cap_ecred_conn_req(struct l2cap_conn *conn, unlock: l2cap_chan_unlock(pchan); - mutex_unlock(&conn->chan_lock); l2cap_chan_put(pchan); response: @@ -5177,8 +5132,6 @@ static inline int l2cap_ecred_conn_rsp(struct l2cap_conn *conn, BT_DBG("mtu %u mps %u credits %u result 0x%4.4x", mtu, mps, credits, result); - mutex_lock(&conn->chan_lock); - cmd_len -= sizeof(*rsp); list_for_each_entry_safe(chan, tmp, &conn->chan_l, list) { @@ -5264,8 +5217,6 @@ static inline int l2cap_ecred_conn_rsp(struct l2cap_conn *conn, l2cap_chan_unlock(chan); } - mutex_unlock(&conn->chan_lock); - return err; } @@ -5378,8 +5329,6 @@ static inline int l2cap_le_command_rej(struct l2cap_conn *conn, if (cmd_len < sizeof(*rej)) return -EPROTO; - mutex_lock(&conn->chan_lock); - chan = __l2cap_get_chan_by_ident(conn, cmd->ident); if (!chan) goto done; @@ -5394,7 +5343,6 @@ static inline int l2cap_le_command_rej(struct l2cap_conn *conn, l2cap_chan_put(chan); done: - mutex_unlock(&conn->chan_lock); return 0; } @@ -6849,8 +6797,12 @@ static void process_pending_rx(struct work_struct *work) BT_DBG(""); + mutex_lock(&conn->lock); + while ((skb = skb_dequeue(&conn->pending_rx))) l2cap_recv_frame(conn, skb); + + mutex_unlock(&conn->lock); } static struct l2cap_conn *l2cap_conn_add(struct hci_conn *hcon) @@ -6889,7 +6841,7 @@ static struct l2cap_conn *l2cap_conn_add(struct hci_conn *hcon) conn->local_fixed_chan |= L2CAP_FC_SMP_BREDR; mutex_init(&conn->ident_lock); - mutex_init(&conn->chan_lock); + mutex_init(&conn->lock); INIT_LIST_HEAD(&conn->chan_l); INIT_LIST_HEAD(&conn->users); @@ -7080,7 +7032,7 @@ int l2cap_chan_connect(struct l2cap_chan *chan, __le16 psm, u16 cid, } } - mutex_lock(&conn->chan_lock); + mutex_lock(&conn->lock); l2cap_chan_lock(chan); if (cid && __l2cap_get_chan_by_dcid(conn, cid)) { @@ -7121,7 +7073,7 @@ int l2cap_chan_connect(struct l2cap_chan *chan, __le16 psm, u16 cid, chan_unlock: l2cap_chan_unlock(chan); - mutex_unlock(&conn->chan_lock); + mutex_unlock(&conn->lock); done: hci_dev_unlock(hdev); hci_dev_put(hdev); @@ -7333,7 +7285,7 @@ static void l2cap_security_cfm(struct hci_conn *hcon, u8 status, u8 encrypt) BT_DBG("conn %p status 0x%2.2x encrypt %u", conn, status, encrypt); - mutex_lock(&conn->chan_lock); + mutex_lock(&conn->lock); list_for_each_entry(chan, &conn->chan_l, list) { l2cap_chan_lock(chan); @@ -7407,7 +7359,7 @@ static void l2cap_security_cfm(struct hci_conn *hcon, u8 status, u8 encrypt) l2cap_chan_unlock(chan); } - mutex_unlock(&conn->chan_lock); + mutex_unlock(&conn->lock); } /* Append fragment into frame respecting the maximum len of rx_skb */ @@ -7474,8 +7426,11 @@ static void l2cap_recv_reset(struct l2cap_conn *conn) conn->rx_len = 0; } -static struct l2cap_conn *l2cap_conn_hold_unless_zero(struct l2cap_conn *c) +struct l2cap_conn *l2cap_conn_hold_unless_zero(struct l2cap_conn *c) { + if (!c) + return NULL; + BT_DBG("conn %p orig refcnt %u", c, kref_read(&c->ref)); if (!kref_get_unless_zero(&c->ref)) @@ -7501,11 +7456,15 @@ void l2cap_recv_acldata(struct hci_conn *hcon, struct sk_buff *skb, u16 flags) hci_dev_unlock(hcon->hdev); - if (!conn) - goto drop; + if (!conn) { + kfree_skb(skb); + return; + } BT_DBG("conn %p len %u flags 0x%x", conn, skb->len, flags); + mutex_lock(&conn->lock); + switch (flags) { case ACL_START: case ACL_START_NO_FLUSH: @@ -7530,7 +7489,7 @@ void l2cap_recv_acldata(struct hci_conn *hcon, struct sk_buff *skb, u16 flags) if (len == skb->len) { /* Complete frame received */ l2cap_recv_frame(conn, skb); - return; + goto unlock; } BT_DBG("Start: total len %d, frag len %u", len, skb->len); @@ -7592,10 +7551,11 @@ void l2cap_recv_acldata(struct hci_conn *hcon, struct sk_buff *skb, u16 flags) break; } - l2cap_conn_put(conn); - drop: kfree_skb(skb); +unlock: + mutex_unlock(&conn->lock); + l2cap_conn_put(conn); } static struct hci_cb l2cap_cb = { diff --git a/net/bluetooth/l2cap_sock.c b/net/bluetooth/l2cap_sock.c index 46ea0bee2259..acd11b268b98 100644 --- a/net/bluetooth/l2cap_sock.c +++ b/net/bluetooth/l2cap_sock.c @@ -1326,9 +1326,10 @@ static int l2cap_sock_shutdown(struct socket *sock, int how) /* prevent sk structure from being freed whilst unlocked */ sock_hold(sk); - chan = l2cap_pi(sk)->chan; /* prevent chan structure from being freed whilst unlocked */ - l2cap_chan_hold(chan); + chan = l2cap_chan_hold_unless_zero(l2cap_pi(sk)->chan); + if (!chan) + goto shutdown_already; BT_DBG("chan %p state %s", chan, state_to_string(chan->state)); @@ -1358,22 +1359,20 @@ static int l2cap_sock_shutdown(struct socket *sock, int how) release_sock(sk); l2cap_chan_lock(chan); - conn = chan->conn; - if (conn) - /* prevent conn structure from being freed */ - l2cap_conn_get(conn); + /* prevent conn structure from being freed */ + conn = l2cap_conn_hold_unless_zero(chan->conn); l2cap_chan_unlock(chan); if (conn) /* mutex lock must be taken before l2cap_chan_lock() */ - mutex_lock(&conn->chan_lock); + mutex_lock(&conn->lock); l2cap_chan_lock(chan); l2cap_chan_close(chan, 0); l2cap_chan_unlock(chan); if (conn) { - mutex_unlock(&conn->chan_lock); + mutex_unlock(&conn->lock); l2cap_conn_put(conn); } -- 2.51.0 From 5bef3ac184b5626ea62385d6b82a1992b89d7940 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 12 Feb 2025 13:49:28 +0000 Subject: [PATCH 11/16] team: better TEAM_OPTION_TYPE_STRING validation syzbot reported following splat [1] Make sure user-provided data contains one nul byte. [1] BUG: KMSAN: uninit-value in string_nocheck lib/vsprintf.c:633 [inline] BUG: KMSAN: uninit-value in string+0x3ec/0x5f0 lib/vsprintf.c:714 string_nocheck lib/vsprintf.c:633 [inline] string+0x3ec/0x5f0 lib/vsprintf.c:714 vsnprintf+0xa5d/0x1960 lib/vsprintf.c:2843 __request_module+0x252/0x9f0 kernel/module/kmod.c:149 team_mode_get drivers/net/team/team_core.c:480 [inline] team_change_mode drivers/net/team/team_core.c:607 [inline] team_mode_option_set+0x437/0x970 drivers/net/team/team_core.c:1401 team_option_set drivers/net/team/team_core.c:375 [inline] team_nl_options_set_doit+0x1339/0x1f90 drivers/net/team/team_core.c:2662 genl_family_rcv_msg_doit net/netlink/genetlink.c:1115 [inline] genl_family_rcv_msg net/netlink/genetlink.c:1195 [inline] genl_rcv_msg+0x1214/0x12c0 net/netlink/genetlink.c:1210 netlink_rcv_skb+0x375/0x650 net/netlink/af_netlink.c:2543 genl_rcv+0x40/0x60 net/netlink/genetlink.c:1219 netlink_unicast_kernel net/netlink/af_netlink.c:1322 [inline] netlink_unicast+0xf52/0x1260 net/netlink/af_netlink.c:1348 netlink_sendmsg+0x10da/0x11e0 net/netlink/af_netlink.c:1892 sock_sendmsg_nosec net/socket.c:718 [inline] __sock_sendmsg+0x30f/0x380 net/socket.c:733 ____sys_sendmsg+0x877/0xb60 net/socket.c:2573 ___sys_sendmsg+0x28d/0x3c0 net/socket.c:2627 __sys_sendmsg net/socket.c:2659 [inline] __do_sys_sendmsg net/socket.c:2664 [inline] __se_sys_sendmsg net/socket.c:2662 [inline] __x64_sys_sendmsg+0x212/0x3c0 net/socket.c:2662 x64_sys_call+0x2ed6/0x3c30 arch/x86/include/generated/asm/syscalls_64.h:47 do_syscall_x64 arch/x86/entry/common.c:52 [inline] do_syscall_64+0xcd/0x1e0 arch/x86/entry/common.c:83 entry_SYSCALL_64_after_hwframe+0x77/0x7f Fixes: 3d249d4ca7d0 ("net: introduce ethernet teaming device") Reported-by: syzbot+1fcd957a82e3a1baa94d@syzkaller.appspotmail.com Closes: https://syzkaller.appspot.com/bug?extid=1fcd957a82e3a1baa94d Signed-off-by: Eric Dumazet Reviewed-by: Jiri Pirko Link: https://patch.msgid.link/20250212134928.1541609-1-edumazet@google.com Signed-off-by: Jakub Kicinski --- drivers/net/team/team_core.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/net/team/team_core.c b/drivers/net/team/team_core.c index dc7cbd6a9798..f4019815f473 100644 --- a/drivers/net/team/team_core.c +++ b/drivers/net/team/team_core.c @@ -2639,7 +2639,9 @@ int team_nl_options_set_doit(struct sk_buff *skb, struct genl_info *info) ctx.data.u32_val = nla_get_u32(attr_data); break; case TEAM_OPTION_TYPE_STRING: - if (nla_len(attr_data) > TEAM_STRING_MAX_LEN) { + if (nla_len(attr_data) > TEAM_STRING_MAX_LEN || + !memchr(nla_data(attr_data), '\0', + nla_len(attr_data))) { err = -EINVAL; goto team_put; } -- 2.51.0 From a527750d877fd334de87eef81f1cb5f0f0ca3373 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 12 Feb 2025 14:10:21 +0000 Subject: [PATCH 12/16] ipv6: mcast: add RCU protection to mld_newpack() mld_newpack() can be called without RTNL or RCU being held. Note that we no longer can use sock_alloc_send_skb() because ipv6.igmp_sk uses GFP_KERNEL allocations which can sleep. Instead use alloc_skb() and charge the net->ipv6.igmp_sk socket under RCU protection. Fixes: b8ad0cbc58f7 ("[NETNS][IPV6] mcast - handle several network namespace") Signed-off-by: Eric Dumazet Reviewed-by: David Ahern Link: https://patch.msgid.link/20250212141021.1663666-1-edumazet@google.com Signed-off-by: Jakub Kicinski --- net/ipv6/mcast.c | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/net/ipv6/mcast.c b/net/ipv6/mcast.c index 81a739ebf709..65831b4fee1f 100644 --- a/net/ipv6/mcast.c +++ b/net/ipv6/mcast.c @@ -1773,21 +1773,19 @@ static struct sk_buff *mld_newpack(struct inet6_dev *idev, unsigned int mtu) struct net_device *dev = idev->dev; int hlen = LL_RESERVED_SPACE(dev); int tlen = dev->needed_tailroom; - struct net *net = dev_net(dev); const struct in6_addr *saddr; struct in6_addr addr_buf; struct mld2_report *pmr; struct sk_buff *skb; unsigned int size; struct sock *sk; - int err; + struct net *net; - sk = net->ipv6.igmp_sk; /* we assume size > sizeof(ra) here * Also try to not allocate high-order pages for big MTU */ size = min_t(int, mtu, PAGE_SIZE / 2) + hlen + tlen; - skb = sock_alloc_send_skb(sk, size, 1, &err); + skb = alloc_skb(size, GFP_KERNEL); if (!skb) return NULL; @@ -1795,6 +1793,12 @@ static struct sk_buff *mld_newpack(struct inet6_dev *idev, unsigned int mtu) skb_reserve(skb, hlen); skb_tailroom_reserve(skb, mtu, tlen); + rcu_read_lock(); + + net = dev_net_rcu(dev); + sk = net->ipv6.igmp_sk; + skb_set_owner_w(skb, sk); + if (ipv6_get_lladdr(dev, &addr_buf, IFA_F_TENTATIVE)) { /* : * use unspecified address as the source address @@ -1806,6 +1810,8 @@ static struct sk_buff *mld_newpack(struct inet6_dev *idev, unsigned int mtu) ip6_mc_hdr(sk, skb, dev, saddr, &mld2_all_mcr, NEXTHDR_HOP, 0); + rcu_read_unlock(); + skb_put_data(skb, ra, sizeof(ra)); skb_set_transport_header(skb, skb_tail_pointer(skb) - skb->data); -- 2.51.0 From fee5d688940690cc845937459e340e4e02598e90 Mon Sep 17 00:00:00 2001 From: Wentao Liang Date: Wed, 12 Feb 2025 23:23:11 +0800 Subject: [PATCH 13/16] mlxsw: Add return value check for mlxsw_sp_port_get_stats_raw() Add a check for the return value of mlxsw_sp_port_get_stats_raw() in __mlxsw_sp_port_get_stats(). If mlxsw_sp_port_get_stats_raw() returns an error, exit the function to prevent further processing with potentially invalid data. Fixes: 614d509aa1e7 ("mlxsw: Move ethtool_ops to spectrum_ethtool.c") Cc: stable@vger.kernel.org # 5.9+ Signed-off-by: Wentao Liang Reviewed-by: Petr Machata Link: https://patch.msgid.link/20250212152311.1332-1-vulab@iscas.ac.cn Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/mellanox/mlxsw/spectrum_ethtool.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_ethtool.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_ethtool.c index 2bed8c86b7cf..3f64cdbabfa3 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_ethtool.c +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_ethtool.c @@ -768,7 +768,9 @@ static void __mlxsw_sp_port_get_stats(struct net_device *dev, err = mlxsw_sp_get_hw_stats_by_group(&hw_stats, &len, grp); if (err) return; - mlxsw_sp_port_get_stats_raw(dev, grp, prio, ppcnt_pl); + err = mlxsw_sp_port_get_stats_raw(dev, grp, prio, ppcnt_pl); + if (err) + return; for (i = 0; i < len; i++) { data[data_index + i] = hw_stats[i].getter(ppcnt_pl); if (!hw_stats[i].cells_bytes) -- 2.51.0 From 0d0b752f2497471ddd2b32143d167d42e18a8f3c Mon Sep 17 00:00:00 2001 From: Alexandra Winter Date: Wed, 12 Feb 2025 17:36:59 +0100 Subject: [PATCH 14/16] s390/qeth: move netif_napi_add_tx() and napi_enable() from under BH Like other drivers qeth is calling local_bh_enable() after napi_schedule() to kick-start softirqs [0]. Since netif_napi_add_tx() and napi_enable() now take the netdev_lock() mutex [1], move them out from under the BH protection. Same solution as in commit a60558644e20 ("wifi: mt76: move napi_enable() from under BH") Fixes: 1b23cdbd2bbc ("net: protect netdev->napi_list with netdev_lock()") Link: https://lore.kernel.org/netdev/20240612181900.4d9d18d0@kernel.org/ [0] Link: https://lore.kernel.org/netdev/20250115035319.559603-1-kuba@kernel.org/ [1] Signed-off-by: Alexandra Winter Acked-by: Joe Damato Link: https://patch.msgid.link/20250212163659.2287292-1-wintera@linux.ibm.com Signed-off-by: Jakub Kicinski --- drivers/s390/net/qeth_core_main.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/drivers/s390/net/qeth_core_main.c b/drivers/s390/net/qeth_core_main.c index a3adaec5504e..20328d695ef9 100644 --- a/drivers/s390/net/qeth_core_main.c +++ b/drivers/s390/net/qeth_core_main.c @@ -7050,14 +7050,16 @@ int qeth_open(struct net_device *dev) card->data.state = CH_STATE_UP; netif_tx_start_all_queues(dev); - local_bh_disable(); qeth_for_each_output_queue(card, queue, i) { netif_napi_add_tx(dev, &queue->napi, qeth_tx_poll); napi_enable(&queue->napi); - napi_schedule(&queue->napi); } - napi_enable(&card->napi); + + local_bh_disable(); + qeth_for_each_output_queue(card, queue, i) { + napi_schedule(&queue->napi); + } napi_schedule(&card->napi); /* kick-start the NAPI softirq: */ local_bh_enable(); -- 2.51.0 From 0892b840318daa6ae739b7cdec5ecdfca4006689 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Thu, 13 Feb 2025 08:49:44 -0800 Subject: [PATCH 15/16] Reapply "net: skb: introduce and use a single page frag cache" This reverts commit 011b0335903832facca86cd8ed05d7d8d94c9c76. Sabrina reports that the revert may trigger warnings due to intervening changes, especially the ability to rise MAX_SKB_FRAGS. Let's drop it and revisit once that part is also ironed out. Fixes: 011b03359038 ("Revert "net: skb: introduce and use a single page frag cache"") Reported-by: Sabrina Dubroca Link: https://lore.kernel.org/6bf54579233038bc0e76056c5ea459872ce362ab.1739375933.git.pabeni@redhat.com Signed-off-by: Jakub Kicinski --- include/linux/netdevice.h | 1 + net/core/dev.c | 17 ------- net/core/skbuff.c | 103 ++++++++++++++++++++++++++++++++++++-- 3 files changed, 99 insertions(+), 22 deletions(-) diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 365f0e2098d1..c0a86afb85da 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -4115,6 +4115,7 @@ void netif_receive_skb_list(struct list_head *head); gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb); void napi_gro_flush(struct napi_struct *napi, bool flush_old); struct sk_buff *napi_get_frags(struct napi_struct *napi); +void napi_get_frags_check(struct napi_struct *napi); gro_result_t napi_gro_frags(struct napi_struct *napi); static inline void napi_free_frags(struct napi_struct *napi) diff --git a/net/core/dev.c b/net/core/dev.c index 55e356a68db6..b91658e8aedb 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -6920,23 +6920,6 @@ netif_napi_dev_list_add(struct net_device *dev, struct napi_struct *napi) list_add_rcu(&napi->dev_list, higher); /* adds after higher */ } -/* Double check that napi_get_frags() allocates skbs with - * skb->head being backed by slab, not a page fragment. - * This is to make sure bug fixed in 3226b158e67c - * ("net: avoid 32 x truesize under-estimation for tiny skbs") - * does not accidentally come back. - */ -static void napi_get_frags_check(struct napi_struct *napi) -{ - struct sk_buff *skb; - - local_bh_disable(); - skb = napi_get_frags(napi); - WARN_ON_ONCE(skb && skb->head_frag); - napi_free_frags(napi); - local_bh_enable(); -} - void netif_napi_add_weight_locked(struct net_device *dev, struct napi_struct *napi, int (*poll)(struct napi_struct *, int), diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 6a99c453397f..a441613a1e6c 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -220,9 +220,67 @@ static void skb_under_panic(struct sk_buff *skb, unsigned int sz, void *addr) #define NAPI_SKB_CACHE_BULK 16 #define NAPI_SKB_CACHE_HALF (NAPI_SKB_CACHE_SIZE / 2) +#if PAGE_SIZE == SZ_4K + +#define NAPI_HAS_SMALL_PAGE_FRAG 1 +#define NAPI_SMALL_PAGE_PFMEMALLOC(nc) ((nc).pfmemalloc) + +/* specialized page frag allocator using a single order 0 page + * and slicing it into 1K sized fragment. Constrained to systems + * with a very limited amount of 1K fragments fitting a single + * page - to avoid excessive truesize underestimation + */ + +struct page_frag_1k { + void *va; + u16 offset; + bool pfmemalloc; +}; + +static void *page_frag_alloc_1k(struct page_frag_1k *nc, gfp_t gfp) +{ + struct page *page; + int offset; + + offset = nc->offset - SZ_1K; + if (likely(offset >= 0)) + goto use_frag; + + page = alloc_pages_node(NUMA_NO_NODE, gfp, 0); + if (!page) + return NULL; + + nc->va = page_address(page); + nc->pfmemalloc = page_is_pfmemalloc(page); + offset = PAGE_SIZE - SZ_1K; + page_ref_add(page, offset / SZ_1K); + +use_frag: + nc->offset = offset; + return nc->va + offset; +} +#else + +/* the small page is actually unused in this build; add dummy helpers + * to please the compiler and avoid later preprocessor's conditionals + */ +#define NAPI_HAS_SMALL_PAGE_FRAG 0 +#define NAPI_SMALL_PAGE_PFMEMALLOC(nc) false + +struct page_frag_1k { +}; + +static void *page_frag_alloc_1k(struct page_frag_1k *nc, gfp_t gfp_mask) +{ + return NULL; +} + +#endif + struct napi_alloc_cache { local_lock_t bh_lock; struct page_frag_cache page; + struct page_frag_1k page_small; unsigned int skb_count; void *skb_cache[NAPI_SKB_CACHE_SIZE]; }; @@ -232,6 +290,23 @@ static DEFINE_PER_CPU(struct napi_alloc_cache, napi_alloc_cache) = { .bh_lock = INIT_LOCAL_LOCK(bh_lock), }; +/* Double check that napi_get_frags() allocates skbs with + * skb->head being backed by slab, not a page fragment. + * This is to make sure bug fixed in 3226b158e67c + * ("net: avoid 32 x truesize under-estimation for tiny skbs") + * does not accidentally come back. + */ +void napi_get_frags_check(struct napi_struct *napi) +{ + struct sk_buff *skb; + + local_bh_disable(); + skb = napi_get_frags(napi); + WARN_ON_ONCE(!NAPI_HAS_SMALL_PAGE_FRAG && skb && skb->head_frag); + napi_free_frags(napi); + local_bh_enable(); +} + void *__napi_alloc_frag_align(unsigned int fragsz, unsigned int align_mask) { struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache); @@ -738,8 +813,10 @@ struct sk_buff *napi_alloc_skb(struct napi_struct *napi, unsigned int len) /* If requested length is either too small or too big, * we use kmalloc() for skb->head allocation. + * When the small frag allocator is available, prefer it over kmalloc + * for small fragments */ - if (len <= SKB_WITH_OVERHEAD(1024) || + if ((!NAPI_HAS_SMALL_PAGE_FRAG && len <= SKB_WITH_OVERHEAD(1024)) || len > SKB_WITH_OVERHEAD(PAGE_SIZE) || (gfp_mask & (__GFP_DIRECT_RECLAIM | GFP_DMA))) { skb = __alloc_skb(len, gfp_mask, SKB_ALLOC_RX | SKB_ALLOC_NAPI, @@ -749,16 +826,32 @@ struct sk_buff *napi_alloc_skb(struct napi_struct *napi, unsigned int len) goto skb_success; } - len = SKB_HEAD_ALIGN(len); - if (sk_memalloc_socks()) gfp_mask |= __GFP_MEMALLOC; local_lock_nested_bh(&napi_alloc_cache.bh_lock); nc = this_cpu_ptr(&napi_alloc_cache); + if (NAPI_HAS_SMALL_PAGE_FRAG && len <= SKB_WITH_OVERHEAD(1024)) { + /* we are artificially inflating the allocation size, but + * that is not as bad as it may look like, as: + * - 'len' less than GRO_MAX_HEAD makes little sense + * - On most systems, larger 'len' values lead to fragment + * size above 512 bytes + * - kmalloc would use the kmalloc-1k slab for such values + * - Builds with smaller GRO_MAX_HEAD will very likely do + * little networking, as that implies no WiFi and no + * tunnels support, and 32 bits arches. + */ + len = SZ_1K; - data = page_frag_alloc(&nc->page, len, gfp_mask); - pfmemalloc = page_frag_cache_is_pfmemalloc(&nc->page); + data = page_frag_alloc_1k(&nc->page_small, gfp_mask); + pfmemalloc = NAPI_SMALL_PAGE_PFMEMALLOC(nc->page_small); + } else { + len = SKB_HEAD_ALIGN(len); + + data = page_frag_alloc(&nc->page, len, gfp_mask); + pfmemalloc = page_frag_cache_is_pfmemalloc(&nc->page); + } local_unlock_nested_bh(&napi_alloc_cache.bh_lock); if (unlikely(!data)) -- 2.51.0 From 540cda75884a6ba4c289980c84392261b1f61a9c Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 12 Feb 2025 11:21:24 +0000 Subject: [PATCH 16/16] rxrpc: Fix ipv6 path MTU discovery rxrpc path MTU discovery currently only makes use of ICMPv4, but not ICMPv6, which means that pmtud for IPv6 doesn't work correctly. Fix it to check for ICMPv6 messages also. Fixes: eeaedc5449d9 ("rxrpc: Implement path-MTU probing using padded PING ACKs (RFC8899)") Signed-off-by: David Howells cc: Marc Dionne cc: Simon Horman cc: linux-afs@lists.infradead.org Link: https://patch.msgid.link/3517283.1739359284@warthog.procyon.org.uk Signed-off-by: Jakub Kicinski --- net/rxrpc/peer_event.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/net/rxrpc/peer_event.c b/net/rxrpc/peer_event.c index e874c31fa901..bc283da9ee40 100644 --- a/net/rxrpc/peer_event.c +++ b/net/rxrpc/peer_event.c @@ -169,6 +169,13 @@ void rxrpc_input_error(struct rxrpc_local *local, struct sk_buff *skb) goto out; } + if ((serr->ee.ee_origin == SO_EE_ORIGIN_ICMP6 && + serr->ee.ee_type == ICMPV6_PKT_TOOBIG && + serr->ee.ee_code == 0)) { + rxrpc_adjust_mtu(peer, serr->ee.ee_info); + goto out; + } + rxrpc_store_error(peer, skb); out: rxrpc_put_peer(peer, rxrpc_peer_put_input_error); -- 2.51.0