From 8d6712c892019b9b9dc5c7039edd3c9d770b510b Mon Sep 17 00:00:00 2001 From: Koichiro Den Date: Fri, 6 Dec 2024 10:10:44 +0900 Subject: [PATCH 01/16] virtio_ring: add a func argument 'recycle_done' to virtqueue_resize() When virtqueue_resize() has actually recycled all unused buffers, additional work may be required in some cases. Relying solely on its return status is fragile, so introduce a new function argument 'recycle_done', which is invoked when the recycle really occurs. Cc: # v6.11+ Signed-off-by: Koichiro Den Acked-by: Jason Wang Reviewed-by: Xuan Zhuo Signed-off-by: Paolo Abeni --- drivers/net/virtio_net.c | 4 ++-- drivers/virtio/virtio_ring.c | 6 +++++- include/linux/virtio.h | 3 ++- 3 files changed, 9 insertions(+), 4 deletions(-) diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c index fc89c5e1a207..e10bc9e6b072 100644 --- a/drivers/net/virtio_net.c +++ b/drivers/net/virtio_net.c @@ -3331,7 +3331,7 @@ static int virtnet_rx_resize(struct virtnet_info *vi, virtnet_rx_pause(vi, rq); - err = virtqueue_resize(rq->vq, ring_num, virtnet_rq_unmap_free_buf); + err = virtqueue_resize(rq->vq, ring_num, virtnet_rq_unmap_free_buf, NULL); if (err) netdev_err(vi->dev, "resize rx fail: rx queue index: %d err: %d\n", qindex, err); @@ -3394,7 +3394,7 @@ static int virtnet_tx_resize(struct virtnet_info *vi, struct send_queue *sq, virtnet_tx_pause(vi, sq); - err = virtqueue_resize(sq->vq, ring_num, virtnet_sq_free_unused_buf); + err = virtqueue_resize(sq->vq, ring_num, virtnet_sq_free_unused_buf, NULL); if (err) netdev_err(vi->dev, "resize tx fail: tx queue index: %d err: %d\n", qindex, err); diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c index 82a7d2cbc704..6af8cf6a619e 100644 --- a/drivers/virtio/virtio_ring.c +++ b/drivers/virtio/virtio_ring.c @@ -2772,6 +2772,7 @@ EXPORT_SYMBOL_GPL(vring_create_virtqueue_dma); * @_vq: the struct virtqueue we're talking about. * @num: new ring num * @recycle: callback to recycle unused buffers + * @recycle_done: callback to be invoked when recycle for all unused buffers done * * When it is really necessary to create a new vring, it will set the current vq * into the reset state. Then call the passed callback to recycle the buffer @@ -2792,7 +2793,8 @@ EXPORT_SYMBOL_GPL(vring_create_virtqueue_dma); * */ int virtqueue_resize(struct virtqueue *_vq, u32 num, - void (*recycle)(struct virtqueue *vq, void *buf)) + void (*recycle)(struct virtqueue *vq, void *buf), + void (*recycle_done)(struct virtqueue *vq)) { struct vring_virtqueue *vq = to_vvq(_vq); int err; @@ -2809,6 +2811,8 @@ int virtqueue_resize(struct virtqueue *_vq, u32 num, err = virtqueue_disable_and_recycle(_vq, recycle); if (err) return err; + if (recycle_done) + recycle_done(_vq); if (vq->packed_ring) err = virtqueue_resize_packed(_vq, num); diff --git a/include/linux/virtio.h b/include/linux/virtio.h index 57cc4b07fd17..0aa7df4ed5ca 100644 --- a/include/linux/virtio.h +++ b/include/linux/virtio.h @@ -109,7 +109,8 @@ dma_addr_t virtqueue_get_avail_addr(const struct virtqueue *vq); dma_addr_t virtqueue_get_used_addr(const struct virtqueue *vq); int virtqueue_resize(struct virtqueue *vq, u32 num, - void (*recycle)(struct virtqueue *vq, void *buf)); + void (*recycle)(struct virtqueue *vq, void *buf), + void (*recycle_done)(struct virtqueue *vq)); int virtqueue_reset(struct virtqueue *vq, void (*recycle)(struct virtqueue *vq, void *buf)); -- 2.51.0 From 1480f0f61b675567ca5d0943d6ef2e39172dcafd Mon Sep 17 00:00:00 2001 From: Koichiro Den Date: Fri, 6 Dec 2024 10:10:45 +0900 Subject: [PATCH 02/16] virtio_net: ensure netdev_tx_reset_queue is called on tx ring resize virtnet_tx_resize() flushes remaining tx skbs, requiring DQL counters to be reset when flushing has actually occurred. Add virtnet_sq_free_unused_buf_done() as a callback for virtqueue_reset() to handle this. Fixes: c8bd1f7f3e61 ("virtio_net: add support for Byte Queue Limits") Cc: # v6.11+ Signed-off-by: Koichiro Den Acked-by: Jason Wang Reviewed-by: Xuan Zhuo Signed-off-by: Paolo Abeni --- drivers/net/virtio_net.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c index e10bc9e6b072..3a0341cc6085 100644 --- a/drivers/net/virtio_net.c +++ b/drivers/net/virtio_net.c @@ -503,6 +503,7 @@ struct virtio_net_common_hdr { static struct virtio_net_common_hdr xsk_hdr; static void virtnet_sq_free_unused_buf(struct virtqueue *vq, void *buf); +static void virtnet_sq_free_unused_buf_done(struct virtqueue *vq); static int virtnet_xdp_handler(struct bpf_prog *xdp_prog, struct xdp_buff *xdp, struct net_device *dev, unsigned int *xdp_xmit, @@ -3394,7 +3395,8 @@ static int virtnet_tx_resize(struct virtnet_info *vi, struct send_queue *sq, virtnet_tx_pause(vi, sq); - err = virtqueue_resize(sq->vq, ring_num, virtnet_sq_free_unused_buf, NULL); + err = virtqueue_resize(sq->vq, ring_num, virtnet_sq_free_unused_buf, + virtnet_sq_free_unused_buf_done); if (err) netdev_err(vi->dev, "resize tx fail: tx queue index: %d err: %d\n", qindex, err); @@ -6233,6 +6235,14 @@ static void virtnet_sq_free_unused_buf(struct virtqueue *vq, void *buf) } } +static void virtnet_sq_free_unused_buf_done(struct virtqueue *vq) +{ + struct virtnet_info *vi = vq->vdev->priv; + int i = vq2txq(vq); + + netdev_tx_reset_queue(netdev_get_tx_queue(vi->dev, i)); +} + static void free_unused_bufs(struct virtnet_info *vi) { void *buf; -- 2.51.0 From 8d2da07c813ad333c20eb803e15f8c4541f25350 Mon Sep 17 00:00:00 2001 From: Koichiro Den Date: Fri, 6 Dec 2024 10:10:46 +0900 Subject: [PATCH 03/16] virtio_ring: add a func argument 'recycle_done' to virtqueue_reset() When virtqueue_reset() has actually recycled all unused buffers, additional work may be required in some cases. Relying solely on its return status is fragile, so introduce a new function argument 'recycle_done', which is invoked when it really occurs. Signed-off-by: Koichiro Den Acked-by: Jason Wang Reviewed-by: Xuan Zhuo Signed-off-by: Paolo Abeni --- drivers/net/virtio_net.c | 4 ++-- drivers/virtio/virtio_ring.c | 6 +++++- include/linux/virtio.h | 3 ++- 3 files changed, 9 insertions(+), 4 deletions(-) diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c index 3a0341cc6085..5cf4b2b20431 100644 --- a/drivers/net/virtio_net.c +++ b/drivers/net/virtio_net.c @@ -5711,7 +5711,7 @@ static int virtnet_rq_bind_xsk_pool(struct virtnet_info *vi, struct receive_queu virtnet_rx_pause(vi, rq); - err = virtqueue_reset(rq->vq, virtnet_rq_unmap_free_buf); + err = virtqueue_reset(rq->vq, virtnet_rq_unmap_free_buf, NULL); if (err) { netdev_err(vi->dev, "reset rx fail: rx queue index: %d err: %d\n", qindex, err); @@ -5740,7 +5740,7 @@ static int virtnet_sq_bind_xsk_pool(struct virtnet_info *vi, virtnet_tx_pause(vi, sq); - err = virtqueue_reset(sq->vq, virtnet_sq_free_unused_buf); + err = virtqueue_reset(sq->vq, virtnet_sq_free_unused_buf, NULL); if (err) { netdev_err(vi->dev, "reset tx fail: tx queue index: %d err: %d\n", qindex, err); pool = NULL; diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c index 6af8cf6a619e..fdd2d2b07b5a 100644 --- a/drivers/virtio/virtio_ring.c +++ b/drivers/virtio/virtio_ring.c @@ -2827,6 +2827,7 @@ EXPORT_SYMBOL_GPL(virtqueue_resize); * virtqueue_reset - detach and recycle all unused buffers * @_vq: the struct virtqueue we're talking about. * @recycle: callback to recycle unused buffers + * @recycle_done: callback to be invoked when recycle for all unused buffers done * * Caller must ensure we don't call this with other virtqueue operations * at the same time (except where noted). @@ -2838,7 +2839,8 @@ EXPORT_SYMBOL_GPL(virtqueue_resize); * -EPERM: Operation not permitted */ int virtqueue_reset(struct virtqueue *_vq, - void (*recycle)(struct virtqueue *vq, void *buf)) + void (*recycle)(struct virtqueue *vq, void *buf), + void (*recycle_done)(struct virtqueue *vq)) { struct vring_virtqueue *vq = to_vvq(_vq); int err; @@ -2846,6 +2848,8 @@ int virtqueue_reset(struct virtqueue *_vq, err = virtqueue_disable_and_recycle(_vq, recycle); if (err) return err; + if (recycle_done) + recycle_done(_vq); if (vq->packed_ring) virtqueue_reinit_packed(vq); diff --git a/include/linux/virtio.h b/include/linux/virtio.h index 0aa7df4ed5ca..dd88682e27e3 100644 --- a/include/linux/virtio.h +++ b/include/linux/virtio.h @@ -112,7 +112,8 @@ int virtqueue_resize(struct virtqueue *vq, u32 num, void (*recycle)(struct virtqueue *vq, void *buf), void (*recycle_done)(struct virtqueue *vq)); int virtqueue_reset(struct virtqueue *vq, - void (*recycle)(struct virtqueue *vq, void *buf)); + void (*recycle)(struct virtqueue *vq, void *buf), + void (*recycle_done)(struct virtqueue *vq)); struct virtio_admin_cmd { __le16 opcode; -- 2.51.0 From 76a771ec4c9adfd75fe53c8505cf656a075d7101 Mon Sep 17 00:00:00 2001 From: Koichiro Den Date: Fri, 6 Dec 2024 10:10:47 +0900 Subject: [PATCH 04/16] virtio_net: ensure netdev_tx_reset_queue is called on bind xsk for tx virtnet_sq_bind_xsk_pool() flushes tx skbs and then resets tx queue, so DQL counters need to be reset when flushing has actually occurred, Add virtnet_sq_free_unused_buf_done() as a callback for virtqueue_resize() to handle this. Fixes: 21a4e3ce6dc7 ("virtio_net: xsk: bind/unbind xsk for tx") Signed-off-by: Koichiro Den Acked-by: Jason Wang Reviewed-by: Xuan Zhuo Signed-off-by: Paolo Abeni --- drivers/net/virtio_net.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c index 5cf4b2b20431..7646ddd9bef7 100644 --- a/drivers/net/virtio_net.c +++ b/drivers/net/virtio_net.c @@ -5740,7 +5740,8 @@ static int virtnet_sq_bind_xsk_pool(struct virtnet_info *vi, virtnet_tx_pause(vi, sq); - err = virtqueue_reset(sq->vq, virtnet_sq_free_unused_buf, NULL); + err = virtqueue_reset(sq->vq, virtnet_sq_free_unused_buf, + virtnet_sq_free_unused_buf_done); if (err) { netdev_err(vi->dev, "reset tx fail: tx queue index: %d err: %d\n", qindex, err); pool = NULL; -- 2.51.0 From 51a00be6a0994da2ba6b4ace3b7a0d9373b4b25e Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Fri, 6 Dec 2024 16:49:14 +0100 Subject: [PATCH 05/16] udp: fix l4 hash after reconnect After the blamed commit below, udp_rehash() is supposed to be called with both local and remote addresses set. Currently that is already the case for IPv6 sockets, but for IPv4 the destination address is updated after rehashing. Address the issue moving the destination address and port initialization before rehashing. Fixes: 1b29a730ef8b ("ipv6/udp: Add 4-tuple hash for connected socket") Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/4761e466ab9f7542c68cdc95f248987d127044d2.1733499715.git.pabeni@redhat.com Signed-off-by: Paolo Abeni --- net/ipv4/datagram.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/net/ipv4/datagram.c b/net/ipv4/datagram.c index cc6d0bd7b0a9..4aca1f05edd3 100644 --- a/net/ipv4/datagram.c +++ b/net/ipv4/datagram.c @@ -61,15 +61,17 @@ int __ip4_datagram_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len err = -EACCES; goto out; } + + /* Update addresses before rehashing */ + inet->inet_daddr = fl4->daddr; + inet->inet_dport = usin->sin_port; if (!inet->inet_saddr) - inet->inet_saddr = fl4->saddr; /* Update source address */ + inet->inet_saddr = fl4->saddr; if (!inet->inet_rcv_saddr) { inet->inet_rcv_saddr = fl4->saddr; if (sk->sk_prot->rehash) sk->sk_prot->rehash(sk); } - inet->inet_daddr = fl4->daddr; - inet->inet_dport = usin->sin_port; reuseport_has_conns_set(sk); sk->sk_state = TCP_ESTABLISHED; sk_set_txhash(sk); -- 2.51.0 From 24c6843b7393ebc80962b59d7ae71af91bf0dcc1 Mon Sep 17 00:00:00 2001 From: Michael Chan Date: Sun, 8 Dec 2024 17:54:48 -0800 Subject: [PATCH 06/16] bnxt_en: Fix aggregation ID mask to prevent oops on 5760X chips The 5760X (P7) chip's HW GRO/LRO interface is very similar to that of the previous generation (5750X or P5). However, the aggregation ID fields in the completion structures on P7 have been redefined from 16 bits to 12 bits. The freed up 4 bits are redefined for part of the metadata such as the VLAN ID. The aggregation ID mask was not modified when adding support for P7 chips. Including the extra 4 bits for the aggregation ID can potentially cause the driver to store or fetch the packet header of GRO/LRO packets in the wrong TPA buffer. It may hit the BUG() condition in __skb_pull() because the SKB contains no valid packet header: kernel BUG at include/linux/skbuff.h:2766! Oops: invalid opcode: 0000 1 PREEMPT SMP NOPTI CPU: 4 UID: 0 PID: 0 Comm: swapper/4 Kdump: loaded Tainted: G OE 6.12.0-rc2+ #7 Tainted: [O]=OOT_MODULE, [E]=UNSIGNED_MODULE Hardware name: Dell Inc. PowerEdge R760/0VRV9X, BIOS 1.0.1 12/27/2022 RIP: 0010:eth_type_trans+0xda/0x140 Code: 80 00 00 00 eb c1 8b 47 70 2b 47 74 48 8b 97 d0 00 00 00 83 f8 01 7e 1b 48 85 d2 74 06 66 83 3a ff 74 09 b8 00 04 00 00 eb a5 <0f> 0b b8 00 01 00 00 eb 9c 48 85 ff 74 eb 31 f6 b9 02 00 00 00 48 RSP: 0018:ff615003803fcc28 EFLAGS: 00010283 RAX: 00000000000022d2 RBX: 0000000000000003 RCX: ff2e8c25da334040 RDX: 0000000000000040 RSI: ff2e8c25c1ce8000 RDI: ff2e8c25869f9000 RBP: ff2e8c258c31c000 R08: ff2e8c25da334000 R09: 0000000000000001 R10: ff2e8c25da3342c0 R11: ff2e8c25c1ce89c0 R12: ff2e8c258e0990b0 R13: ff2e8c25bb120000 R14: ff2e8c25c1ce89c0 R15: ff2e8c25869f9000 FS: 0000000000000000(0000) GS:ff2e8c34be300000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 000055f05317e4c8 CR3: 000000108bac6006 CR4: 0000000000773ef0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe07f0 DR7: 0000000000000400 PKRU: 55555554 Call Trace: ? die+0x33/0x90 ? do_trap+0xd9/0x100 ? eth_type_trans+0xda/0x140 ? do_error_trap+0x65/0x80 ? eth_type_trans+0xda/0x140 ? exc_invalid_op+0x4e/0x70 ? eth_type_trans+0xda/0x140 ? asm_exc_invalid_op+0x16/0x20 ? eth_type_trans+0xda/0x140 bnxt_tpa_end+0x10b/0x6b0 [bnxt_en] ? bnxt_tpa_start+0x195/0x320 [bnxt_en] bnxt_rx_pkt+0x902/0xd90 [bnxt_en] ? __bnxt_tx_int.constprop.0+0x89/0x300 [bnxt_en] ? kmem_cache_free+0x343/0x440 ? __bnxt_tx_int.constprop.0+0x24f/0x300 [bnxt_en] __bnxt_poll_work+0x193/0x370 [bnxt_en] bnxt_poll_p5+0x9a/0x300 [bnxt_en] ? try_to_wake_up+0x209/0x670 __napi_poll+0x29/0x1b0 Fix it by redefining the aggregation ID mask for P5_PLUS chips to be 12 bits. This will work because the maximum aggregation ID is less than 4096 on all P5_PLUS chips. Fixes: 13d2d3d381ee ("bnxt_en: Add new P7 hardware interface definitions") Reviewed-by: Damodharam Ammepalli Reviewed-by: Kalesh AP Reviewed-by: Andy Gospodarek Signed-off-by: Michael Chan Reviewed-by: Simon Horman Link: https://patch.msgid.link/20241209015448.1937766-1-michael.chan@broadcom.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/broadcom/bnxt/bnxt.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.h b/drivers/net/ethernet/broadcom/bnxt/bnxt.h index c9f1cb7e3740..7df7a2233307 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.h +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.h @@ -381,7 +381,7 @@ struct rx_agg_cmp { u32 rx_agg_cmp_opaque; __le32 rx_agg_cmp_v; #define RX_AGG_CMP_V (1 << 0) - #define RX_AGG_CMP_AGG_ID (0xffff << 16) + #define RX_AGG_CMP_AGG_ID (0x0fff << 16) #define RX_AGG_CMP_AGG_ID_SHIFT 16 __le32 rx_agg_cmp_unused; }; @@ -419,7 +419,7 @@ struct rx_tpa_start_cmp { #define RX_TPA_START_CMP_V3_RSS_HASH_TYPE_SHIFT 7 #define RX_TPA_START_CMP_AGG_ID (0x7f << 25) #define RX_TPA_START_CMP_AGG_ID_SHIFT 25 - #define RX_TPA_START_CMP_AGG_ID_P5 (0xffff << 16) + #define RX_TPA_START_CMP_AGG_ID_P5 (0x0fff << 16) #define RX_TPA_START_CMP_AGG_ID_SHIFT_P5 16 #define RX_TPA_START_CMP_METADATA1 (0xf << 28) #define RX_TPA_START_CMP_METADATA1_SHIFT 28 @@ -543,7 +543,7 @@ struct rx_tpa_end_cmp { #define RX_TPA_END_CMP_PAYLOAD_OFFSET_SHIFT 16 #define RX_TPA_END_CMP_AGG_ID (0x7f << 25) #define RX_TPA_END_CMP_AGG_ID_SHIFT 25 - #define RX_TPA_END_CMP_AGG_ID_P5 (0xffff << 16) + #define RX_TPA_END_CMP_AGG_ID_P5 (0x0fff << 16) #define RX_TPA_END_CMP_AGG_ID_SHIFT_P5 16 __le32 rx_tpa_end_cmp_tsdelta; -- 2.51.0 From bbe4b41259a3e255a16d795486d331c1670b4e75 Mon Sep 17 00:00:00 2001 From: Petr Machata Date: Mon, 9 Dec 2024 12:05:31 +0100 Subject: [PATCH 07/16] Documentation: networking: Add a caveat to nexthop_compat_mode sysctl net.ipv4.nexthop_compat_mode was added when nexthop objects were added to provide the view of nexthop objects through the usual lens of the route UAPI. As nexthop objects evolved, the information provided through this lens became incomplete. For example, details of resilient nexthop groups are obviously omitted. Now that 16-bit nexthop group weights are a thing, the 8-bit UAPI cannot convey the >8-bit weight accurately. Instead of inventing workarounds for an obsolete interface, just document the expectations of inaccuracy. Fixes: b72a6a7ab957 ("net: nexthop: Increase weight to u16") Signed-off-by: Petr Machata Reviewed-by: Ido Schimmel Reviewed-by: David Ahern Link: https://patch.msgid.link/b575e32399ccacd09079b2a218255164535123bd.1733740749.git.petrm@nvidia.com Signed-off-by: Jakub Kicinski --- Documentation/networking/ip-sysctl.rst | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/Documentation/networking/ip-sysctl.rst b/Documentation/networking/ip-sysctl.rst index eacf8983e230..dcbb6f6caf6d 100644 --- a/Documentation/networking/ip-sysctl.rst +++ b/Documentation/networking/ip-sysctl.rst @@ -2170,6 +2170,12 @@ nexthop_compat_mode - BOOLEAN understands the new API, this sysctl can be disabled to achieve full performance benefits of the new API by disabling the nexthop expansion and extraneous notifications. + + Note that as a backward-compatible mode, dumping of modern features + might be incomplete or wrong. For example, resilient groups will not be + shown as such, but rather as just a list of next hops. Also weights that + do not fit into 8 bits will show incorrectly. + Default: true (backward compat mode) fib_notify_on_flag_change - INTEGER -- 2.51.0 From 06d64ab46f19ac12f59a1d2aa8cd196b2e4edb5b Mon Sep 17 00:00:00 2001 From: MoYuanhao Date: Mon, 9 Dec 2024 13:28:14 +0100 Subject: [PATCH 08/16] tcp: check space before adding MPTCP SYN options Ensure there is enough space before adding MPTCP options in tcp_syn_options(). Without this check, 'remaining' could underflow, and causes issues. If there is not enough space, MPTCP should not be used. Signed-off-by: MoYuanhao Fixes: cec37a6e41aa ("mptcp: Handle MP_CAPABLE options for outgoing connections") Cc: stable@vger.kernel.org Acked-by: Matthieu Baerts (NGI0) [ Matt: Add Fixes, cc Stable, update Description ] Signed-off-by: Matthieu Baerts (NGI0) Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20241209-net-mptcp-check-space-syn-v1-1-2da992bb6f74@kernel.org Signed-off-by: Jakub Kicinski --- net/ipv4/tcp_output.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 5485a70b5fe5..0e5b9a654254 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -883,8 +883,10 @@ static unsigned int tcp_syn_options(struct sock *sk, struct sk_buff *skb, unsigned int size; if (mptcp_syn_options(sk, skb, &size, &opts->mptcp)) { - opts->options |= OPTION_MPTCP; - remaining -= size; + if (remaining >= size) { + opts->options |= OPTION_MPTCP; + remaining -= size; + } } } -- 2.51.0 From 5cb099902b6b6292b3a85ffa1bb844e0ba195945 Mon Sep 17 00:00:00 2001 From: Nikita Yushchenko Date: Sun, 8 Dec 2024 14:50:01 +0500 Subject: [PATCH 09/16] net: renesas: rswitch: fix possible early skb release When sending frame split into multiple descriptors, hardware processes descriptors one by one, including writing back DT values. The first descriptor could be already marked as completed when processing of next descriptors for the same frame is still in progress. Although only the last descriptor is configured to generate interrupt, completion of the first descriptor could be noticed by the driver when handling interrupt for the previous frame. Currently, driver stores skb in the entry that corresponds to the first descriptor. This results into skb could be unmapped and freed when hardware did not complete the send yet. This opens a window for corrupting the data being sent. Fix this by saving skb in the entry that corresponds to the last descriptor used to send the frame. Fixes: d2c96b9d5f83 ("net: rswitch: Add jumbo frames handling for TX") Signed-off-by: Nikita Yushchenko Reviewed-by: Yoshihiro Shimoda Link: https://patch.msgid.link/20241208095004.69468-2-nikita.yoush@cogentembedded.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/renesas/rswitch.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/renesas/rswitch.c b/drivers/net/ethernet/renesas/rswitch.c index 8d18dae4d8fb..f4fc2e7212c3 100644 --- a/drivers/net/ethernet/renesas/rswitch.c +++ b/drivers/net/ethernet/renesas/rswitch.c @@ -1681,8 +1681,9 @@ static netdev_tx_t rswitch_start_xmit(struct sk_buff *skb, struct net_device *nd if (dma_mapping_error(ndev->dev.parent, dma_addr_orig)) goto err_kfree; - gq->skbs[gq->cur] = skb; - gq->unmap_addrs[gq->cur] = dma_addr_orig; + /* Stored the skb at the last descriptor to avoid skb free before hardware completes send */ + gq->skbs[(gq->cur + nr_desc - 1) % gq->ring_size] = skb; + gq->unmap_addrs[(gq->cur + nr_desc - 1) % gq->ring_size] = dma_addr_orig; /* DT_FSTART should be set at last. So, this is reverse order. */ for (i = nr_desc; i-- > 0; ) { -- 2.51.0 From 0c9547e6ccf40455b0574cf589be3b152a3edf5b Mon Sep 17 00:00:00 2001 From: Nikita Yushchenko Date: Sun, 8 Dec 2024 14:50:02 +0500 Subject: [PATCH 10/16] net: renesas: rswitch: fix race window between tx start and complete If hardware is already transmitting, it can start handling the descriptor being written to immediately after it observes updated DT field, before the queue is kicked by a write to GWTRC. If the start_xmit() execution is preempted at unfortunate moment, this transmission can complete, and interrupt handled, before gq->cur gets updated. With the current implementation of completion, this will cause the last entry not completed. Fix that by changing completion loop to check DT values directly, instead of depending on gq->cur. Fixes: 3590918b5d07 ("net: ethernet: renesas: Add support for "Ethernet Switch"") Signed-off-by: Nikita Yushchenko Reviewed-by: Yoshihiro Shimoda Link: https://patch.msgid.link/20241208095004.69468-3-nikita.yoush@cogentembedded.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/renesas/rswitch.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/drivers/net/ethernet/renesas/rswitch.c b/drivers/net/ethernet/renesas/rswitch.c index f4fc2e7212c3..cdf4d6ccccb0 100644 --- a/drivers/net/ethernet/renesas/rswitch.c +++ b/drivers/net/ethernet/renesas/rswitch.c @@ -862,13 +862,10 @@ static void rswitch_tx_free(struct net_device *ndev) struct rswitch_ext_desc *desc; struct sk_buff *skb; - for (; rswitch_get_num_cur_queues(gq) > 0; - gq->dirty = rswitch_next_queue_index(gq, false, 1)) { - desc = &gq->tx_ring[gq->dirty]; - if ((desc->desc.die_dt & DT_MASK) != DT_FEMPTY) - break; - + desc = &gq->tx_ring[gq->dirty]; + while ((desc->desc.die_dt & DT_MASK) == DT_FEMPTY) { dma_rmb(); + skb = gq->skbs[gq->dirty]; if (skb) { rdev->ndev->stats.tx_packets++; @@ -879,7 +876,10 @@ static void rswitch_tx_free(struct net_device *ndev) dev_kfree_skb_any(gq->skbs[gq->dirty]); gq->skbs[gq->dirty] = NULL; } + desc->desc.die_dt = DT_EEMPTY; + gq->dirty = rswitch_next_queue_index(gq, false, 1); + desc = &gq->tx_ring[gq->dirty]; } } @@ -1685,6 +1685,8 @@ static netdev_tx_t rswitch_start_xmit(struct sk_buff *skb, struct net_device *nd gq->skbs[(gq->cur + nr_desc - 1) % gq->ring_size] = skb; gq->unmap_addrs[(gq->cur + nr_desc - 1) % gq->ring_size] = dma_addr_orig; + dma_wmb(); + /* DT_FSTART should be set at last. So, this is reverse order. */ for (i = nr_desc; i-- > 0; ) { desc = &gq->tx_ring[rswitch_next_queue_index(gq, true, i)]; @@ -1695,8 +1697,6 @@ static netdev_tx_t rswitch_start_xmit(struct sk_buff *skb, struct net_device *nd goto err_unmap; } - wmb(); /* gq->cur must be incremented after die_dt was set */ - gq->cur = rswitch_next_queue_index(gq, true, nr_desc); rswitch_modify(rdev->addr, GWTRC(gq->index), 0, BIT(gq->index % 32)); -- 2.51.0 From bb617328bafa1023d8e9c25a25345a564c66c14f Mon Sep 17 00:00:00 2001 From: Nikita Yushchenko Date: Sun, 8 Dec 2024 14:50:03 +0500 Subject: [PATCH 11/16] net: renesas: rswitch: fix leaked pointer on error path If error path is taken while filling descriptor for a frame, skb pointer is left in the entry. Later, on the ring entry reuse, the same entry could be used as a part of a multi-descriptor frame, and skb for that new frame could be stored in a different entry. Then, the stale pointer will reach the completion routine, and passed to the release operation. Fix that by clearing the saved skb pointer at the error path. Fixes: d2c96b9d5f83 ("net: rswitch: Add jumbo frames handling for TX") Signed-off-by: Nikita Yushchenko Reviewed-by: Yoshihiro Shimoda Link: https://patch.msgid.link/20241208095004.69468-4-nikita.yoush@cogentembedded.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/renesas/rswitch.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/ethernet/renesas/rswitch.c b/drivers/net/ethernet/renesas/rswitch.c index cdf4d6ccccb0..6d6912b9c16e 100644 --- a/drivers/net/ethernet/renesas/rswitch.c +++ b/drivers/net/ethernet/renesas/rswitch.c @@ -1703,6 +1703,7 @@ static netdev_tx_t rswitch_start_xmit(struct sk_buff *skb, struct net_device *nd return ret; err_unmap: + gq->skbs[(gq->cur + nr_desc - 1) % gq->ring_size] = NULL; dma_unmap_single(ndev->dev.parent, dma_addr_orig, skb->len, DMA_TO_DEVICE); err_kfree: -- 2.51.0 From 66b7e9f85b8459c823b11e9af69dbf4be5eb6be8 Mon Sep 17 00:00:00 2001 From: Nikita Yushchenko Date: Sun, 8 Dec 2024 14:50:04 +0500 Subject: [PATCH 12/16] net: renesas: rswitch: avoid use-after-put for a device tree node The device tree node saved in the rswitch_device structure is used at several driver locations. So passing this node to of_node_put() after the first use is wrong. Move of_node_put() for this node to exit paths. Fixes: b46f1e579329 ("net: renesas: rswitch: Simplify struct phy * handling") Signed-off-by: Nikita Yushchenko Reviewed-by: Yoshihiro Shimoda Link: https://patch.msgid.link/20241208095004.69468-5-nikita.yoush@cogentembedded.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/renesas/rswitch.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/renesas/rswitch.c b/drivers/net/ethernet/renesas/rswitch.c index 6d6912b9c16e..ebcdf2917b83 100644 --- a/drivers/net/ethernet/renesas/rswitch.c +++ b/drivers/net/ethernet/renesas/rswitch.c @@ -1891,7 +1891,6 @@ static int rswitch_device_alloc(struct rswitch_private *priv, unsigned int index rdev->np_port = rswitch_get_port_node(rdev); rdev->disabled = !rdev->np_port; err = of_get_ethdev_address(rdev->np_port, ndev); - of_node_put(rdev->np_port); if (err) { if (is_valid_ether_addr(rdev->etha->mac_addr)) eth_hw_addr_set(ndev, rdev->etha->mac_addr); @@ -1921,6 +1920,7 @@ out_txdmac: out_rxdmac: out_get_params: + of_node_put(rdev->np_port); netif_napi_del(&rdev->napi); free_netdev(ndev); @@ -1934,6 +1934,7 @@ static void rswitch_device_free(struct rswitch_private *priv, unsigned int index rswitch_txdmac_free(ndev); rswitch_rxdmac_free(ndev); + of_node_put(rdev->np_port); netif_napi_del(&rdev->napi); free_netdev(ndev); } -- 2.51.0 From 3dd002f20098b9569f8fd7f8703f364571e2e975 Mon Sep 17 00:00:00 2001 From: Nikita Yushchenko Date: Mon, 9 Dec 2024 16:32:04 +0500 Subject: [PATCH 13/16] net: renesas: rswitch: handle stop vs interrupt race Currently the stop routine of rswitch driver does not immediately prevent hardware from continuing to update descriptors and requesting interrupts. It can happen that when rswitch_stop() executes the masking of interrupts from the queues of the port being closed, napi poll for that port is already scheduled or running on a different CPU. When execution of this napi poll completes, it will unmask the interrupts. And unmasked interrupt can fire after rswitch_stop() returns from napi_disable() call. Then, the handler won't mask it, because napi_schedule_prep() will return false, and interrupt storm will happen. This can't be fixed by making rswitch_stop() call napi_disable() before masking interrupts. In this case, the interrupt storm will happen if interrupt fires between napi_disable() and masking. Fix this by checking for priv->opened_ports bit when unmasking interrupts after napi poll. For that to be consistent, move priv->opened_ports changes into spinlock-protected areas, and reorder other operations in rswitch_open() and rswitch_stop() accordingly. Signed-off-by: Nikita Yushchenko Reviewed-by: Yoshihiro Shimoda Fixes: 3590918b5d07 ("net: ethernet: renesas: Add support for "Ethernet Switch"") Link: https://patch.msgid.link/20241209113204.175015-1-nikita.yoush@cogentembedded.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/renesas/rswitch.c | 33 ++++++++++++++------------ 1 file changed, 18 insertions(+), 15 deletions(-) diff --git a/drivers/net/ethernet/renesas/rswitch.c b/drivers/net/ethernet/renesas/rswitch.c index ebcdf2917b83..6ec714789573 100644 --- a/drivers/net/ethernet/renesas/rswitch.c +++ b/drivers/net/ethernet/renesas/rswitch.c @@ -908,8 +908,10 @@ retry: if (napi_complete_done(napi, budget - quota)) { spin_lock_irqsave(&priv->lock, flags); - rswitch_enadis_data_irq(priv, rdev->tx_queue->index, true); - rswitch_enadis_data_irq(priv, rdev->rx_queue->index, true); + if (test_bit(rdev->port, priv->opened_ports)) { + rswitch_enadis_data_irq(priv, rdev->tx_queue->index, true); + rswitch_enadis_data_irq(priv, rdev->rx_queue->index, true); + } spin_unlock_irqrestore(&priv->lock, flags); } @@ -1538,20 +1540,20 @@ static int rswitch_open(struct net_device *ndev) struct rswitch_device *rdev = netdev_priv(ndev); unsigned long flags; - phy_start(ndev->phydev); + if (bitmap_empty(rdev->priv->opened_ports, RSWITCH_NUM_PORTS)) + iowrite32(GWCA_TS_IRQ_BIT, rdev->priv->addr + GWTSDIE); napi_enable(&rdev->napi); - netif_start_queue(ndev); spin_lock_irqsave(&rdev->priv->lock, flags); + bitmap_set(rdev->priv->opened_ports, rdev->port, 1); rswitch_enadis_data_irq(rdev->priv, rdev->tx_queue->index, true); rswitch_enadis_data_irq(rdev->priv, rdev->rx_queue->index, true); spin_unlock_irqrestore(&rdev->priv->lock, flags); - if (bitmap_empty(rdev->priv->opened_ports, RSWITCH_NUM_PORTS)) - iowrite32(GWCA_TS_IRQ_BIT, rdev->priv->addr + GWTSDIE); + phy_start(ndev->phydev); - bitmap_set(rdev->priv->opened_ports, rdev->port, 1); + netif_start_queue(ndev); return 0; }; @@ -1563,7 +1565,16 @@ static int rswitch_stop(struct net_device *ndev) unsigned long flags; netif_tx_stop_all_queues(ndev); + + phy_stop(ndev->phydev); + + spin_lock_irqsave(&rdev->priv->lock, flags); + rswitch_enadis_data_irq(rdev->priv, rdev->tx_queue->index, false); + rswitch_enadis_data_irq(rdev->priv, rdev->rx_queue->index, false); bitmap_clear(rdev->priv->opened_ports, rdev->port, 1); + spin_unlock_irqrestore(&rdev->priv->lock, flags); + + napi_disable(&rdev->napi); if (bitmap_empty(rdev->priv->opened_ports, RSWITCH_NUM_PORTS)) iowrite32(GWCA_TS_IRQ_BIT, rdev->priv->addr + GWTSDID); @@ -1576,14 +1587,6 @@ static int rswitch_stop(struct net_device *ndev) kfree(ts_info); } - spin_lock_irqsave(&rdev->priv->lock, flags); - rswitch_enadis_data_irq(rdev->priv, rdev->tx_queue->index, false); - rswitch_enadis_data_irq(rdev->priv, rdev->rx_queue->index, false); - spin_unlock_irqrestore(&rdev->priv->lock, flags); - - phy_stop(ndev->phydev); - napi_disable(&rdev->napi); - return 0; }; -- 2.51.0 From 3e643e4efa1e87432204b62f9cfdea3b2508c830 Mon Sep 17 00:00:00 2001 From: Michal Luczaj Date: Tue, 19 Nov 2024 14:31:40 +0100 Subject: [PATCH 14/16] Bluetooth: Improve setsockopt() handling of malformed user input The bt_copy_from_sockptr() return value is being misinterpreted by most users: a non-zero result is mistakenly assumed to represent an error code, but actually indicates the number of bytes that could not be copied. Remove bt_copy_from_sockptr() and adapt callers to use copy_safe_from_sockptr(). For sco_sock_setsockopt() (case BT_CODEC) use copy_struct_from_sockptr() to scrub parts of uninitialized buffer. Opportunistically, rename `len` to `optlen` in hci_sock_setsockopt_old() and hci_sock_setsockopt(). Fixes: 51eda36d33e4 ("Bluetooth: SCO: Fix not validating setsockopt user input") Fixes: a97de7bff13b ("Bluetooth: RFCOMM: Fix not validating setsockopt user input") Fixes: 4f3951242ace ("Bluetooth: L2CAP: Fix not validating setsockopt user input") Fixes: 9e8742cdfc4b ("Bluetooth: ISO: Fix not validating setsockopt user input") Fixes: b2186061d604 ("Bluetooth: hci_sock: Fix not validating setsockopt user input") Reviewed-by: Luiz Augusto von Dentz Reviewed-by: David Wei Signed-off-by: Michal Luczaj Signed-off-by: Luiz Augusto von Dentz --- include/net/bluetooth/bluetooth.h | 9 --------- net/bluetooth/hci_sock.c | 14 +++++++------- net/bluetooth/iso.c | 10 +++++----- net/bluetooth/l2cap_sock.c | 20 +++++++++++--------- net/bluetooth/rfcomm/sock.c | 9 ++++----- net/bluetooth/sco.c | 11 ++++++----- 6 files changed, 33 insertions(+), 40 deletions(-) diff --git a/include/net/bluetooth/bluetooth.h b/include/net/bluetooth/bluetooth.h index f66bc85c6411..e6760c11f007 100644 --- a/include/net/bluetooth/bluetooth.h +++ b/include/net/bluetooth/bluetooth.h @@ -590,15 +590,6 @@ static inline struct sk_buff *bt_skb_sendmmsg(struct sock *sk, return skb; } -static inline int bt_copy_from_sockptr(void *dst, size_t dst_size, - sockptr_t src, size_t src_size) -{ - if (dst_size > src_size) - return -EINVAL; - - return copy_from_sockptr(dst, src, dst_size); -} - int bt_to_errno(u16 code); __u8 bt_status(int err); diff --git a/net/bluetooth/hci_sock.c b/net/bluetooth/hci_sock.c index 2272e1849ebd..022b86797acd 100644 --- a/net/bluetooth/hci_sock.c +++ b/net/bluetooth/hci_sock.c @@ -1926,7 +1926,7 @@ drop: } static int hci_sock_setsockopt_old(struct socket *sock, int level, int optname, - sockptr_t optval, unsigned int len) + sockptr_t optval, unsigned int optlen) { struct hci_ufilter uf = { .opcode = 0 }; struct sock *sk = sock->sk; @@ -1943,7 +1943,7 @@ static int hci_sock_setsockopt_old(struct socket *sock, int level, int optname, switch (optname) { case HCI_DATA_DIR: - err = bt_copy_from_sockptr(&opt, sizeof(opt), optval, len); + err = copy_safe_from_sockptr(&opt, sizeof(opt), optval, optlen); if (err) break; @@ -1954,7 +1954,7 @@ static int hci_sock_setsockopt_old(struct socket *sock, int level, int optname, break; case HCI_TIME_STAMP: - err = bt_copy_from_sockptr(&opt, sizeof(opt), optval, len); + err = copy_safe_from_sockptr(&opt, sizeof(opt), optval, optlen); if (err) break; @@ -1974,7 +1974,7 @@ static int hci_sock_setsockopt_old(struct socket *sock, int level, int optname, uf.event_mask[1] = *((u32 *) f->event_mask + 1); } - err = bt_copy_from_sockptr(&uf, sizeof(uf), optval, len); + err = copy_safe_from_sockptr(&uf, sizeof(uf), optval, optlen); if (err) break; @@ -2005,7 +2005,7 @@ done: } static int hci_sock_setsockopt(struct socket *sock, int level, int optname, - sockptr_t optval, unsigned int len) + sockptr_t optval, unsigned int optlen) { struct sock *sk = sock->sk; int err = 0; @@ -2015,7 +2015,7 @@ static int hci_sock_setsockopt(struct socket *sock, int level, int optname, if (level == SOL_HCI) return hci_sock_setsockopt_old(sock, level, optname, optval, - len); + optlen); if (level != SOL_BLUETOOTH) return -ENOPROTOOPT; @@ -2035,7 +2035,7 @@ static int hci_sock_setsockopt(struct socket *sock, int level, int optname, goto done; } - err = bt_copy_from_sockptr(&opt, sizeof(opt), optval, len); + err = copy_safe_from_sockptr(&opt, sizeof(opt), optval, optlen); if (err) break; diff --git a/net/bluetooth/iso.c b/net/bluetooth/iso.c index 1b40fd2b2f02..695b56091e18 100644 --- a/net/bluetooth/iso.c +++ b/net/bluetooth/iso.c @@ -1566,7 +1566,7 @@ static int iso_sock_setsockopt(struct socket *sock, int level, int optname, break; } - err = bt_copy_from_sockptr(&opt, sizeof(opt), optval, optlen); + err = copy_safe_from_sockptr(&opt, sizeof(opt), optval, optlen); if (err) break; @@ -1577,7 +1577,7 @@ static int iso_sock_setsockopt(struct socket *sock, int level, int optname, break; case BT_PKT_STATUS: - err = bt_copy_from_sockptr(&opt, sizeof(opt), optval, optlen); + err = copy_safe_from_sockptr(&opt, sizeof(opt), optval, optlen); if (err) break; @@ -1596,7 +1596,7 @@ static int iso_sock_setsockopt(struct socket *sock, int level, int optname, break; } - err = bt_copy_from_sockptr(&qos, sizeof(qos), optval, optlen); + err = copy_safe_from_sockptr(&qos, sizeof(qos), optval, optlen); if (err) break; @@ -1617,8 +1617,8 @@ static int iso_sock_setsockopt(struct socket *sock, int level, int optname, break; } - err = bt_copy_from_sockptr(iso_pi(sk)->base, optlen, optval, - optlen); + err = copy_safe_from_sockptr(iso_pi(sk)->base, optlen, optval, + optlen); if (err) break; diff --git a/net/bluetooth/l2cap_sock.c b/net/bluetooth/l2cap_sock.c index 18e89e764f3b..3d2553dcdb1b 100644 --- a/net/bluetooth/l2cap_sock.c +++ b/net/bluetooth/l2cap_sock.c @@ -755,7 +755,8 @@ static int l2cap_sock_setsockopt_old(struct socket *sock, int optname, opts.max_tx = chan->max_tx; opts.txwin_size = chan->tx_win; - err = bt_copy_from_sockptr(&opts, sizeof(opts), optval, optlen); + err = copy_safe_from_sockptr(&opts, sizeof(opts), optval, + optlen); if (err) break; @@ -800,7 +801,7 @@ static int l2cap_sock_setsockopt_old(struct socket *sock, int optname, break; case L2CAP_LM: - err = bt_copy_from_sockptr(&opt, sizeof(opt), optval, optlen); + err = copy_safe_from_sockptr(&opt, sizeof(opt), optval, optlen); if (err) break; @@ -909,7 +910,7 @@ static int l2cap_sock_setsockopt(struct socket *sock, int level, int optname, sec.level = BT_SECURITY_LOW; - err = bt_copy_from_sockptr(&sec, sizeof(sec), optval, optlen); + err = copy_safe_from_sockptr(&sec, sizeof(sec), optval, optlen); if (err) break; @@ -956,7 +957,7 @@ static int l2cap_sock_setsockopt(struct socket *sock, int level, int optname, break; } - err = bt_copy_from_sockptr(&opt, sizeof(opt), optval, optlen); + err = copy_safe_from_sockptr(&opt, sizeof(opt), optval, optlen); if (err) break; @@ -970,7 +971,7 @@ static int l2cap_sock_setsockopt(struct socket *sock, int level, int optname, break; case BT_FLUSHABLE: - err = bt_copy_from_sockptr(&opt, sizeof(opt), optval, optlen); + err = copy_safe_from_sockptr(&opt, sizeof(opt), optval, optlen); if (err) break; @@ -1004,7 +1005,7 @@ static int l2cap_sock_setsockopt(struct socket *sock, int level, int optname, pwr.force_active = BT_POWER_FORCE_ACTIVE_ON; - err = bt_copy_from_sockptr(&pwr, sizeof(pwr), optval, optlen); + err = copy_safe_from_sockptr(&pwr, sizeof(pwr), optval, optlen); if (err) break; @@ -1015,7 +1016,7 @@ static int l2cap_sock_setsockopt(struct socket *sock, int level, int optname, break; case BT_CHANNEL_POLICY: - err = bt_copy_from_sockptr(&opt, sizeof(opt), optval, optlen); + err = copy_safe_from_sockptr(&opt, sizeof(opt), optval, optlen); if (err) break; @@ -1046,7 +1047,7 @@ static int l2cap_sock_setsockopt(struct socket *sock, int level, int optname, break; } - err = bt_copy_from_sockptr(&mtu, sizeof(mtu), optval, optlen); + err = copy_safe_from_sockptr(&mtu, sizeof(mtu), optval, optlen); if (err) break; @@ -1076,7 +1077,8 @@ static int l2cap_sock_setsockopt(struct socket *sock, int level, int optname, break; } - err = bt_copy_from_sockptr(&mode, sizeof(mode), optval, optlen); + err = copy_safe_from_sockptr(&mode, sizeof(mode), optval, + optlen); if (err) break; diff --git a/net/bluetooth/rfcomm/sock.c b/net/bluetooth/rfcomm/sock.c index 40766f8119ed..913402806fa0 100644 --- a/net/bluetooth/rfcomm/sock.c +++ b/net/bluetooth/rfcomm/sock.c @@ -629,10 +629,9 @@ static int rfcomm_sock_setsockopt_old(struct socket *sock, int optname, switch (optname) { case RFCOMM_LM: - if (bt_copy_from_sockptr(&opt, sizeof(opt), optval, optlen)) { - err = -EFAULT; + err = copy_safe_from_sockptr(&opt, sizeof(opt), optval, optlen); + if (err) break; - } if (opt & RFCOMM_LM_FIPS) { err = -EINVAL; @@ -685,7 +684,7 @@ static int rfcomm_sock_setsockopt(struct socket *sock, int level, int optname, sec.level = BT_SECURITY_LOW; - err = bt_copy_from_sockptr(&sec, sizeof(sec), optval, optlen); + err = copy_safe_from_sockptr(&sec, sizeof(sec), optval, optlen); if (err) break; @@ -703,7 +702,7 @@ static int rfcomm_sock_setsockopt(struct socket *sock, int level, int optname, break; } - err = bt_copy_from_sockptr(&opt, sizeof(opt), optval, optlen); + err = copy_safe_from_sockptr(&opt, sizeof(opt), optval, optlen); if (err) break; diff --git a/net/bluetooth/sco.c b/net/bluetooth/sco.c index 78f7bca24487..7eb8d3e04ec4 100644 --- a/net/bluetooth/sco.c +++ b/net/bluetooth/sco.c @@ -896,7 +896,7 @@ static int sco_sock_setsockopt(struct socket *sock, int level, int optname, break; } - err = bt_copy_from_sockptr(&opt, sizeof(opt), optval, optlen); + err = copy_safe_from_sockptr(&opt, sizeof(opt), optval, optlen); if (err) break; @@ -915,8 +915,8 @@ static int sco_sock_setsockopt(struct socket *sock, int level, int optname, voice.setting = sco_pi(sk)->setting; - err = bt_copy_from_sockptr(&voice, sizeof(voice), optval, - optlen); + err = copy_safe_from_sockptr(&voice, sizeof(voice), optval, + optlen); if (err) break; @@ -941,7 +941,7 @@ static int sco_sock_setsockopt(struct socket *sock, int level, int optname, break; case BT_PKT_STATUS: - err = bt_copy_from_sockptr(&opt, sizeof(opt), optval, optlen); + err = copy_safe_from_sockptr(&opt, sizeof(opt), optval, optlen); if (err) break; @@ -984,7 +984,8 @@ static int sco_sock_setsockopt(struct socket *sock, int level, int optname, break; } - err = bt_copy_from_sockptr(buffer, optlen, optval, optlen); + err = copy_struct_from_sockptr(buffer, sizeof(buffer), optval, + optlen); if (err) { hci_dev_put(hdev); break; -- 2.51.0 From 4d94f05558271654670d18c26c912da0c1c15549 Mon Sep 17 00:00:00 2001 From: Luiz Augusto von Dentz Date: Tue, 3 Dec 2024 16:07:32 -0500 Subject: [PATCH 15/16] Bluetooth: hci_core: Fix sleeping function called from invalid context This reworks hci_cb_list to not use mutex hci_cb_list_lock to avoid bugs like the bellow: BUG: sleeping function called from invalid context at kernel/locking/mutex.c:585 in_atomic(): 0, irqs_disabled(): 0, non_block: 0, pid: 5070, name: kworker/u9:2 preempt_count: 0, expected: 0 RCU nest depth: 1, expected: 0 4 locks held by kworker/u9:2/5070: #0: ffff888015be3948 ((wq_completion)hci0#2){+.+.}-{0:0}, at: process_one_work kernel/workqueue.c:3229 [inline] #0: ffff888015be3948 ((wq_completion)hci0#2){+.+.}-{0:0}, at: process_scheduled_works+0x8e0/0x1770 kernel/workqueue.c:3335 #1: ffffc90003b6fd00 ((work_completion)(&hdev->rx_work)){+.+.}-{0:0}, at: process_one_work kernel/workqueue.c:3230 [inline] #1: ffffc90003b6fd00 ((work_completion)(&hdev->rx_work)){+.+.}-{0:0}, at: process_scheduled_works+0x91b/0x1770 kernel/workqueue.c:3335 #2: ffff8880665d0078 (&hdev->lock){+.+.}-{3:3}, at: hci_le_create_big_complete_evt+0xcf/0xae0 net/bluetooth/hci_event.c:6914 #3: ffffffff8e132020 (rcu_read_lock){....}-{1:2}, at: rcu_lock_acquire include/linux/rcupdate.h:298 [inline] #3: ffffffff8e132020 (rcu_read_lock){....}-{1:2}, at: rcu_read_lock include/linux/rcupdate.h:750 [inline] #3: ffffffff8e132020 (rcu_read_lock){....}-{1:2}, at: hci_le_create_big_complete_evt+0xdb/0xae0 net/bluetooth/hci_event.c:6915 CPU: 0 PID: 5070 Comm: kworker/u9:2 Not tainted 6.8.0-syzkaller-08073-g480e035fc4c7 #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 03/27/2024 Workqueue: hci0 hci_rx_work Call Trace: __dump_stack lib/dump_stack.c:88 [inline] dump_stack_lvl+0x241/0x360 lib/dump_stack.c:114 __might_resched+0x5d4/0x780 kernel/sched/core.c:10187 __mutex_lock_common kernel/locking/mutex.c:585 [inline] __mutex_lock+0xc1/0xd70 kernel/locking/mutex.c:752 hci_connect_cfm include/net/bluetooth/hci_core.h:2004 [inline] hci_le_create_big_complete_evt+0x3d9/0xae0 net/bluetooth/hci_event.c:6939 hci_event_func net/bluetooth/hci_event.c:7514 [inline] hci_event_packet+0xa53/0x1540 net/bluetooth/hci_event.c:7569 hci_rx_work+0x3e8/0xca0 net/bluetooth/hci_core.c:4171 process_one_work kernel/workqueue.c:3254 [inline] process_scheduled_works+0xa00/0x1770 kernel/workqueue.c:3335 worker_thread+0x86d/0xd70 kernel/workqueue.c:3416 kthread+0x2f0/0x390 kernel/kthread.c:388 ret_from_fork+0x4b/0x80 arch/x86/kernel/process.c:147 ret_from_fork_asm+0x1a/0x30 arch/x86/entry/entry_64.S:243 Reported-by: syzbot+2fb0835e0c9cefc34614@syzkaller.appspotmail.com Tested-by: syzbot+2fb0835e0c9cefc34614@syzkaller.appspotmail.com Closes: https://syzkaller.appspot.com/bug?extid=2fb0835e0c9cefc34614 Signed-off-by: Luiz Augusto von Dentz --- include/net/bluetooth/hci_core.h | 108 ++++++++++++++++++++----------- net/bluetooth/hci_core.c | 10 +-- net/bluetooth/iso.c | 6 ++ net/bluetooth/l2cap_core.c | 12 ++-- net/bluetooth/rfcomm/core.c | 6 ++ net/bluetooth/sco.c | 12 ++-- 6 files changed, 97 insertions(+), 57 deletions(-) diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h index ea798f07c5a2..ca22ead85dbe 100644 --- a/include/net/bluetooth/hci_core.h +++ b/include/net/bluetooth/hci_core.h @@ -804,7 +804,6 @@ struct hci_conn_params { extern struct list_head hci_dev_list; extern struct list_head hci_cb_list; extern rwlock_t hci_dev_list_lock; -extern struct mutex hci_cb_list_lock; #define hci_dev_set_flag(hdev, nr) set_bit((nr), (hdev)->dev_flags) #define hci_dev_clear_flag(hdev, nr) clear_bit((nr), (hdev)->dev_flags) @@ -2017,24 +2016,47 @@ struct hci_cb { char *name; + bool (*match) (struct hci_conn *conn); void (*connect_cfm) (struct hci_conn *conn, __u8 status); void (*disconn_cfm) (struct hci_conn *conn, __u8 status); void (*security_cfm) (struct hci_conn *conn, __u8 status, - __u8 encrypt); + __u8 encrypt); void (*key_change_cfm) (struct hci_conn *conn, __u8 status); void (*role_switch_cfm) (struct hci_conn *conn, __u8 status, __u8 role); }; +static inline void hci_cb_lookup(struct hci_conn *conn, struct list_head *list) +{ + struct hci_cb *cb, *cpy; + + rcu_read_lock(); + list_for_each_entry_rcu(cb, &hci_cb_list, list) { + if (cb->match && cb->match(conn)) { + cpy = kmalloc(sizeof(*cpy), GFP_ATOMIC); + if (!cpy) + break; + + *cpy = *cb; + INIT_LIST_HEAD(&cpy->list); + list_add_rcu(&cpy->list, list); + } + } + rcu_read_unlock(); +} + static inline void hci_connect_cfm(struct hci_conn *conn, __u8 status) { - struct hci_cb *cb; + struct list_head list; + struct hci_cb *cb, *tmp; + + INIT_LIST_HEAD(&list); + hci_cb_lookup(conn, &list); - mutex_lock(&hci_cb_list_lock); - list_for_each_entry(cb, &hci_cb_list, list) { + list_for_each_entry_safe(cb, tmp, &list, list) { if (cb->connect_cfm) cb->connect_cfm(conn, status); + kfree(cb); } - mutex_unlock(&hci_cb_list_lock); if (conn->connect_cfm_cb) conn->connect_cfm_cb(conn, status); @@ -2042,43 +2064,55 @@ static inline void hci_connect_cfm(struct hci_conn *conn, __u8 status) static inline void hci_disconn_cfm(struct hci_conn *conn, __u8 reason) { - struct hci_cb *cb; + struct list_head list; + struct hci_cb *cb, *tmp; + + INIT_LIST_HEAD(&list); + hci_cb_lookup(conn, &list); - mutex_lock(&hci_cb_list_lock); - list_for_each_entry(cb, &hci_cb_list, list) { + list_for_each_entry_safe(cb, tmp, &list, list) { if (cb->disconn_cfm) cb->disconn_cfm(conn, reason); + kfree(cb); } - mutex_unlock(&hci_cb_list_lock); if (conn->disconn_cfm_cb) conn->disconn_cfm_cb(conn, reason); } -static inline void hci_auth_cfm(struct hci_conn *conn, __u8 status) +static inline void hci_security_cfm(struct hci_conn *conn, __u8 status, + __u8 encrypt) { - struct hci_cb *cb; - __u8 encrypt; - - if (test_bit(HCI_CONN_ENCRYPT_PEND, &conn->flags)) - return; + struct list_head list; + struct hci_cb *cb, *tmp; - encrypt = test_bit(HCI_CONN_ENCRYPT, &conn->flags) ? 0x01 : 0x00; + INIT_LIST_HEAD(&list); + hci_cb_lookup(conn, &list); - mutex_lock(&hci_cb_list_lock); - list_for_each_entry(cb, &hci_cb_list, list) { + list_for_each_entry_safe(cb, tmp, &list, list) { if (cb->security_cfm) cb->security_cfm(conn, status, encrypt); + kfree(cb); } - mutex_unlock(&hci_cb_list_lock); if (conn->security_cfm_cb) conn->security_cfm_cb(conn, status); } +static inline void hci_auth_cfm(struct hci_conn *conn, __u8 status) +{ + __u8 encrypt; + + if (test_bit(HCI_CONN_ENCRYPT_PEND, &conn->flags)) + return; + + encrypt = test_bit(HCI_CONN_ENCRYPT, &conn->flags) ? 0x01 : 0x00; + + hci_security_cfm(conn, status, encrypt); +} + static inline void hci_encrypt_cfm(struct hci_conn *conn, __u8 status) { - struct hci_cb *cb; __u8 encrypt; if (conn->state == BT_CONFIG) { @@ -2105,40 +2139,38 @@ static inline void hci_encrypt_cfm(struct hci_conn *conn, __u8 status) conn->sec_level = conn->pending_sec_level; } - mutex_lock(&hci_cb_list_lock); - list_for_each_entry(cb, &hci_cb_list, list) { - if (cb->security_cfm) - cb->security_cfm(conn, status, encrypt); - } - mutex_unlock(&hci_cb_list_lock); - - if (conn->security_cfm_cb) - conn->security_cfm_cb(conn, status); + hci_security_cfm(conn, status, encrypt); } static inline void hci_key_change_cfm(struct hci_conn *conn, __u8 status) { - struct hci_cb *cb; + struct list_head list; + struct hci_cb *cb, *tmp; + + INIT_LIST_HEAD(&list); + hci_cb_lookup(conn, &list); - mutex_lock(&hci_cb_list_lock); - list_for_each_entry(cb, &hci_cb_list, list) { + list_for_each_entry_safe(cb, tmp, &list, list) { if (cb->key_change_cfm) cb->key_change_cfm(conn, status); + kfree(cb); } - mutex_unlock(&hci_cb_list_lock); } static inline void hci_role_switch_cfm(struct hci_conn *conn, __u8 status, __u8 role) { - struct hci_cb *cb; + struct list_head list; + struct hci_cb *cb, *tmp; + + INIT_LIST_HEAD(&list); + hci_cb_lookup(conn, &list); - mutex_lock(&hci_cb_list_lock); - list_for_each_entry(cb, &hci_cb_list, list) { + list_for_each_entry_safe(cb, tmp, &list, list) { if (cb->role_switch_cfm) cb->role_switch_cfm(conn, status, role); + kfree(cb); } - mutex_unlock(&hci_cb_list_lock); } static inline bool hci_bdaddr_is_rpa(bdaddr_t *bdaddr, u8 addr_type) diff --git a/net/bluetooth/hci_core.c b/net/bluetooth/hci_core.c index f9e19f9cb5a3..18ab5628f85a 100644 --- a/net/bluetooth/hci_core.c +++ b/net/bluetooth/hci_core.c @@ -57,7 +57,6 @@ DEFINE_RWLOCK(hci_dev_list_lock); /* HCI callback list */ LIST_HEAD(hci_cb_list); -DEFINE_MUTEX(hci_cb_list_lock); /* HCI ID Numbering */ static DEFINE_IDA(hci_index_ida); @@ -2993,9 +2992,7 @@ int hci_register_cb(struct hci_cb *cb) { BT_DBG("%p name %s", cb, cb->name); - mutex_lock(&hci_cb_list_lock); - list_add_tail(&cb->list, &hci_cb_list); - mutex_unlock(&hci_cb_list_lock); + list_add_tail_rcu(&cb->list, &hci_cb_list); return 0; } @@ -3005,9 +3002,8 @@ int hci_unregister_cb(struct hci_cb *cb) { BT_DBG("%p name %s", cb, cb->name); - mutex_lock(&hci_cb_list_lock); - list_del(&cb->list); - mutex_unlock(&hci_cb_list_lock); + list_del_rcu(&cb->list); + synchronize_rcu(); return 0; } diff --git a/net/bluetooth/iso.c b/net/bluetooth/iso.c index 695b56091e18..a7e0008f0ad8 100644 --- a/net/bluetooth/iso.c +++ b/net/bluetooth/iso.c @@ -2118,6 +2118,11 @@ done: return HCI_LM_ACCEPT; } +static bool iso_match(struct hci_conn *hcon) +{ + return hcon->type == ISO_LINK || hcon->type == LE_LINK; +} + static void iso_connect_cfm(struct hci_conn *hcon, __u8 status) { if (hcon->type != ISO_LINK) { @@ -2299,6 +2304,7 @@ drop: static struct hci_cb iso_cb = { .name = "ISO", + .match = iso_match, .connect_cfm = iso_connect_cfm, .disconn_cfm = iso_disconn_cfm, }; diff --git a/net/bluetooth/l2cap_core.c b/net/bluetooth/l2cap_core.c index 6544c1ed7143..27b4c4a2ba1f 100644 --- a/net/bluetooth/l2cap_core.c +++ b/net/bluetooth/l2cap_core.c @@ -7217,6 +7217,11 @@ static struct l2cap_chan *l2cap_global_fixed_chan(struct l2cap_chan *c, return NULL; } +static bool l2cap_match(struct hci_conn *hcon) +{ + return hcon->type == ACL_LINK || hcon->type == LE_LINK; +} + static void l2cap_connect_cfm(struct hci_conn *hcon, u8 status) { struct hci_dev *hdev = hcon->hdev; @@ -7224,9 +7229,6 @@ static void l2cap_connect_cfm(struct hci_conn *hcon, u8 status) struct l2cap_chan *pchan; u8 dst_type; - if (hcon->type != ACL_LINK && hcon->type != LE_LINK) - return; - BT_DBG("hcon %p bdaddr %pMR status %d", hcon, &hcon->dst, status); if (status) { @@ -7291,9 +7293,6 @@ int l2cap_disconn_ind(struct hci_conn *hcon) static void l2cap_disconn_cfm(struct hci_conn *hcon, u8 reason) { - if (hcon->type != ACL_LINK && hcon->type != LE_LINK) - return; - BT_DBG("hcon %p reason %d", hcon, reason); l2cap_conn_del(hcon, bt_to_errno(reason)); @@ -7572,6 +7571,7 @@ drop: static struct hci_cb l2cap_cb = { .name = "L2CAP", + .match = l2cap_match, .connect_cfm = l2cap_connect_cfm, .disconn_cfm = l2cap_disconn_cfm, .security_cfm = l2cap_security_cfm, diff --git a/net/bluetooth/rfcomm/core.c b/net/bluetooth/rfcomm/core.c index ad5177e3a69b..4c56ca5a216c 100644 --- a/net/bluetooth/rfcomm/core.c +++ b/net/bluetooth/rfcomm/core.c @@ -2134,6 +2134,11 @@ static int rfcomm_run(void *unused) return 0; } +static bool rfcomm_match(struct hci_conn *hcon) +{ + return hcon->type == ACL_LINK; +} + static void rfcomm_security_cfm(struct hci_conn *conn, u8 status, u8 encrypt) { struct rfcomm_session *s; @@ -2180,6 +2185,7 @@ static void rfcomm_security_cfm(struct hci_conn *conn, u8 status, u8 encrypt) static struct hci_cb rfcomm_cb = { .name = "RFCOMM", + .match = rfcomm_match, .security_cfm = rfcomm_security_cfm }; diff --git a/net/bluetooth/sco.c b/net/bluetooth/sco.c index 7eb8d3e04ec4..40c4957cfc0b 100644 --- a/net/bluetooth/sco.c +++ b/net/bluetooth/sco.c @@ -1397,11 +1397,13 @@ int sco_connect_ind(struct hci_dev *hdev, bdaddr_t *bdaddr, __u8 *flags) return lm; } -static void sco_connect_cfm(struct hci_conn *hcon, __u8 status) +static bool sco_match(struct hci_conn *hcon) { - if (hcon->type != SCO_LINK && hcon->type != ESCO_LINK) - return; + return hcon->type == SCO_LINK || hcon->type == ESCO_LINK; +} +static void sco_connect_cfm(struct hci_conn *hcon, __u8 status) +{ BT_DBG("hcon %p bdaddr %pMR status %u", hcon, &hcon->dst, status); if (!status) { @@ -1416,9 +1418,6 @@ static void sco_connect_cfm(struct hci_conn *hcon, __u8 status) static void sco_disconn_cfm(struct hci_conn *hcon, __u8 reason) { - if (hcon->type != SCO_LINK && hcon->type != ESCO_LINK) - return; - BT_DBG("hcon %p reason %d", hcon, reason); sco_conn_del(hcon, bt_to_errno(reason)); @@ -1444,6 +1443,7 @@ drop: static struct hci_cb sco_cb = { .name = "SCO", + .match = sco_match, .connect_cfm = sco_connect_cfm, .disconn_cfm = sco_disconn_cfm, }; -- 2.51.0 From 581dd2dc168fe0ed2a7a5534a724f0d3751c93ae Mon Sep 17 00:00:00 2001 From: Luiz Augusto von Dentz Date: Wed, 4 Dec 2024 11:40:59 -0500 Subject: [PATCH 16/16] Bluetooth: hci_event: Fix using rcu_read_(un)lock while iterating The usage of rcu_read_(un)lock while inside list_for_each_entry_rcu is not safe since for the most part entries fetched this way shall be treated as rcu_dereference: Note that the value returned by rcu_dereference() is valid only within the enclosing RCU read-side critical section [1]_. For example, the following is **not** legal:: rcu_read_lock(); p = rcu_dereference(head.next); rcu_read_unlock(); x = p->address; /* BUG!!! */ rcu_read_lock(); y = p->data; /* BUG!!! */ rcu_read_unlock(); Fixes: a0bfde167b50 ("Bluetooth: ISO: Add support for connecting multiple BISes") Signed-off-by: Luiz Augusto von Dentz --- net/bluetooth/hci_event.c | 33 +++++++++++---------------------- 1 file changed, 11 insertions(+), 22 deletions(-) diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c index 1427d6e2f3c9..2cc7a9306350 100644 --- a/net/bluetooth/hci_event.c +++ b/net/bluetooth/hci_event.c @@ -6870,38 +6870,27 @@ static void hci_le_create_big_complete_evt(struct hci_dev *hdev, void *data, return; hci_dev_lock(hdev); - rcu_read_lock(); /* Connect all BISes that are bound to the BIG */ - list_for_each_entry_rcu(conn, &hdev->conn_hash.list, list) { - if (bacmp(&conn->dst, BDADDR_ANY) || - conn->type != ISO_LINK || - conn->iso_qos.bcast.big != ev->handle) + while ((conn = hci_conn_hash_lookup_big_state(hdev, ev->handle, + BT_BOUND))) { + if (ev->status) { + hci_connect_cfm(conn, ev->status); + hci_conn_del(conn); continue; + } if (hci_conn_set_handle(conn, __le16_to_cpu(ev->bis_handle[i++]))) continue; - if (!ev->status) { - conn->state = BT_CONNECTED; - set_bit(HCI_CONN_BIG_CREATED, &conn->flags); - rcu_read_unlock(); - hci_debugfs_create_conn(conn); - hci_conn_add_sysfs(conn); - hci_iso_setup_path(conn); - rcu_read_lock(); - continue; - } - - hci_connect_cfm(conn, ev->status); - rcu_read_unlock(); - hci_conn_del(conn); - rcu_read_lock(); + conn->state = BT_CONNECTED; + set_bit(HCI_CONN_BIG_CREATED, &conn->flags); + hci_debugfs_create_conn(conn); + hci_conn_add_sysfs(conn); + hci_iso_setup_path(conn); } - rcu_read_unlock(); - if (!ev->status && !i) /* If no BISes have been connected for the BIG, * terminate. This is in case all bound connections -- 2.51.0