From b3af60928ab9129befa65e6df0310d27300942bf Mon Sep 17 00:00:00 2001 From: Michal Luczaj Date: Fri, 10 Jan 2025 14:21:55 +0100 Subject: [PATCH 01/16] bpf: Fix bpf_sk_select_reuseport() memory leak As pointed out in the original comment, lookup in sockmap can return a TCP ESTABLISHED socket. Such TCP socket may have had SO_ATTACH_REUSEPORT_EBPF set before it was ESTABLISHED. In other words, a non-NULL sk_reuseport_cb does not imply a non-refcounted socket. Drop sk's reference in both error paths. unreferenced object 0xffff888101911800 (size 2048): comm "test_progs", pid 44109, jiffies 4297131437 hex dump (first 32 bytes): 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ 80 00 01 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ backtrace (crc 9336483b): __kmalloc_noprof+0x3bf/0x560 __reuseport_alloc+0x1d/0x40 reuseport_alloc+0xca/0x150 reuseport_attach_prog+0x87/0x140 sk_reuseport_attach_bpf+0xc8/0x100 sk_setsockopt+0x1181/0x1990 do_sock_setsockopt+0x12b/0x160 __sys_setsockopt+0x7b/0xc0 __x64_sys_setsockopt+0x1b/0x30 do_syscall_64+0x93/0x180 entry_SYSCALL_64_after_hwframe+0x76/0x7e Fixes: 64d85290d79c ("bpf: Allow bpf_map_lookup_elem for SOCKMAP and SOCKHASH") Signed-off-by: Michal Luczaj Reviewed-by: Martin KaFai Lau Link: https://patch.msgid.link/20250110-reuseport-memleak-v1-1-fa1ddab0adfe@rbox.co Signed-off-by: Jakub Kicinski --- net/core/filter.c | 30 ++++++++++++++++++------------ 1 file changed, 18 insertions(+), 12 deletions(-) diff --git a/net/core/filter.c b/net/core/filter.c index 834614071727..2fb45a86f3dd 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -11251,6 +11251,7 @@ BPF_CALL_4(sk_select_reuseport, struct sk_reuseport_kern *, reuse_kern, bool is_sockarray = map->map_type == BPF_MAP_TYPE_REUSEPORT_SOCKARRAY; struct sock_reuseport *reuse; struct sock *selected_sk; + int err; selected_sk = map->ops->map_lookup_elem(map, key); if (!selected_sk) @@ -11258,10 +11259,6 @@ BPF_CALL_4(sk_select_reuseport, struct 
sk_reuseport_kern *, reuse_kern, reuse = rcu_dereference(selected_sk->sk_reuseport_cb); if (!reuse) { - /* Lookup in sock_map can return TCP ESTABLISHED sockets. */ - if (sk_is_refcounted(selected_sk)) - sock_put(selected_sk); - /* reuseport_array has only sk with non NULL sk_reuseport_cb. * The only (!reuse) case here is - the sk has already been * unhashed (e.g. by close()), so treat it as -ENOENT. @@ -11269,24 +11266,33 @@ BPF_CALL_4(sk_select_reuseport, struct sk_reuseport_kern *, reuse_kern, * Other maps (e.g. sock_map) do not provide this guarantee and * the sk may never be in the reuseport group to begin with. */ - return is_sockarray ? -ENOENT : -EINVAL; + err = is_sockarray ? -ENOENT : -EINVAL; + goto error; } if (unlikely(reuse->reuseport_id != reuse_kern->reuseport_id)) { struct sock *sk = reuse_kern->sk; - if (sk->sk_protocol != selected_sk->sk_protocol) - return -EPROTOTYPE; - else if (sk->sk_family != selected_sk->sk_family) - return -EAFNOSUPPORT; - - /* Catch all. Likely bound to a different sockaddr. */ - return -EBADFD; + if (sk->sk_protocol != selected_sk->sk_protocol) { + err = -EPROTOTYPE; + } else if (sk->sk_family != selected_sk->sk_family) { + err = -EAFNOSUPPORT; + } else { + /* Catch all. Likely bound to a different sockaddr. */ + err = -EBADFD; + } + goto error; } reuse_kern->selected_sk = selected_sk; return 0; +error: + /* Lookup in sock_map can return TCP ESTABLISHED sockets. */ + if (sk_is_refcounted(selected_sk)) + sock_put(selected_sk); + + return err; } static const struct bpf_func_proto sk_select_reuseport_proto = { -- 2.51.0 From f0aa6a37a3dbb40b272df5fc6db93c114688adcd Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 8 Jan 2025 20:30:57 -0800 Subject: [PATCH 02/16] eth: bnxt: always recalculate features after XDP clearing, fix null-deref Recalculate features when XDP is detached. 
Before: # ip li set dev eth0 xdp obj xdp_dummy.bpf.o sec xdp # ip li set dev eth0 xdp off # ethtool -k eth0 | grep gro rx-gro-hw: off [requested on] After: # ip li set dev eth0 xdp obj xdp_dummy.bpf.o sec xdp # ip li set dev eth0 xdp off # ethtool -k eth0 | grep gro rx-gro-hw: on The fact that HW-GRO doesn't get re-enabled automatically is just a minor annoyance. The real issue is that the features will randomly come back during another reconfiguration which just happens to invoke netdev_update_features(). The driver doesn't handle reconfiguring two things at a time very robustly. Starting with commit 98ba1d931f61 ("bnxt_en: Fix RSS logic in __bnxt_reserve_rings()") we only reconfigure the RSS hash table if the "effective" number of Rx rings has changed. If HW-GRO is enabled "effective" number of rings is 2x what user sees. So if we are in the bad state, with HW-GRO re-enablement "pending" after XDP off, and we lower the rings by / 2 - the HW-GRO rings doing 2x and the ethtool -L doing / 2 may cancel each other out, and the: if (old_rx_rings != bp->hw_resc.resv_rx_rings && condition in __bnxt_reserve_rings() will be false. The RSS map won't get updated, and we'll crash with: BUG: kernel NULL pointer dereference, address: 0000000000000168 RIP: 0010:__bnxt_hwrm_vnic_set_rss+0x13a/0x1a0 bnxt_hwrm_vnic_rss_cfg_p5+0x47/0x180 __bnxt_setup_vnic_p5+0x58/0x110 bnxt_init_nic+0xb72/0xf50 __bnxt_open_nic+0x40d/0xab0 bnxt_open_nic+0x2b/0x60 ethtool_set_channels+0x18c/0x1d0 As we try to access a freed ring. The issue is present since XDP support was added, really, but prior to commit 98ba1d931f61 ("bnxt_en: Fix RSS logic in __bnxt_reserve_rings()") it wasn't causing major issues. 
Fixes: 1054aee82321 ("bnxt_en: Use NETIF_F_GRO_HW.") Fixes: 98ba1d931f61 ("bnxt_en: Fix RSS logic in __bnxt_reserve_rings()") Reviewed-by: Michael Chan Reviewed-by: Somnath Kotur Link: https://patch.msgid.link/20250109043057.2888953-1-kuba@kernel.org Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/broadcom/bnxt/bnxt.c | 25 +++++++++++++++---- drivers/net/ethernet/broadcom/bnxt/bnxt.h | 2 +- drivers/net/ethernet/broadcom/bnxt/bnxt_xdp.c | 7 ------ 3 files changed, 21 insertions(+), 13 deletions(-) diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c index aeaa74f03046..b6f844cac80e 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c @@ -4708,7 +4708,7 @@ void bnxt_set_ring_params(struct bnxt *bp) /* Changing allocation mode of RX rings. * TODO: Update when extending xdp_rxq_info to support allocation modes. */ -int bnxt_set_rx_skb_mode(struct bnxt *bp, bool page_mode) +static void __bnxt_set_rx_skb_mode(struct bnxt *bp, bool page_mode) { struct net_device *dev = bp->dev; @@ -4729,15 +4729,30 @@ int bnxt_set_rx_skb_mode(struct bnxt *bp, bool page_mode) bp->rx_skb_func = bnxt_rx_page_skb; } bp->rx_dir = DMA_BIDIRECTIONAL; - /* Disable LRO or GRO_HW */ - netdev_update_features(dev); } else { dev->max_mtu = bp->max_mtu; bp->flags &= ~BNXT_FLAG_RX_PAGE_MODE; bp->rx_dir = DMA_FROM_DEVICE; bp->rx_skb_func = bnxt_rx_skb; } - return 0; +} + +void bnxt_set_rx_skb_mode(struct bnxt *bp, bool page_mode) +{ + __bnxt_set_rx_skb_mode(bp, page_mode); + + if (!page_mode) { + int rx, tx; + + bnxt_get_max_rings(bp, &rx, &tx, true); + if (rx > 1) { + bp->flags &= ~BNXT_FLAG_NO_AGG_RINGS; + bp->dev->hw_features |= NETIF_F_LRO; + } + } + + /* Update LRO and GRO_HW availability */ + netdev_update_features(bp->dev); } static void bnxt_free_vnic_attributes(struct bnxt *bp) @@ -16214,7 +16229,7 @@ static int bnxt_init_one(struct pci_dev *pdev, const struct pci_device_id *ent) if 
(bp->max_fltr < BNXT_MAX_FLTR) bp->max_fltr = BNXT_MAX_FLTR; bnxt_init_l2_fltr_tbl(bp); - bnxt_set_rx_skb_mode(bp, false); + __bnxt_set_rx_skb_mode(bp, false); bnxt_set_tpa_flags(bp); bnxt_set_ring_params(bp); bnxt_rdma_aux_device_init(bp); diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.h b/drivers/net/ethernet/broadcom/bnxt/bnxt.h index 7df7a2233307..f11ed59203d9 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.h +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.h @@ -2846,7 +2846,7 @@ u32 bnxt_fw_health_readl(struct bnxt *bp, int reg_idx); bool bnxt_bs_trace_avail(struct bnxt *bp, u16 type); void bnxt_set_tpa_flags(struct bnxt *bp); void bnxt_set_ring_params(struct bnxt *); -int bnxt_set_rx_skb_mode(struct bnxt *bp, bool page_mode); +void bnxt_set_rx_skb_mode(struct bnxt *bp, bool page_mode); void bnxt_insert_usr_fltr(struct bnxt *bp, struct bnxt_filter_base *fltr); void bnxt_del_one_usr_fltr(struct bnxt *bp, struct bnxt_filter_base *fltr); int bnxt_hwrm_func_drv_rgtr(struct bnxt *bp, unsigned long *bmap, diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_xdp.c b/drivers/net/ethernet/broadcom/bnxt/bnxt_xdp.c index f88b641533fc..dc51dce209d5 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt_xdp.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_xdp.c @@ -422,15 +422,8 @@ static int bnxt_xdp_set(struct bnxt *bp, struct bpf_prog *prog) bnxt_set_rx_skb_mode(bp, true); xdp_features_set_redirect_target(dev, true); } else { - int rx, tx; - xdp_features_clear_redirect_target(dev); bnxt_set_rx_skb_mode(bp, false); - bnxt_get_max_rings(bp, &rx, &tx, true); - if (rx > 1) { - bp->flags &= ~BNXT_FLAG_NO_AGG_RINGS; - bp->dev->hw_features |= NETIF_F_LRO; - } } bp->tx_nr_rings_xdp = tx_xdp; bp->tx_nr_rings = bp->tx_nr_rings_per_tc * tc + tx_xdp; -- 2.51.0 From 5ef44b3cb43bda4009ba7fe3d54e0d258ae4aee7 Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Wed, 8 Jan 2025 16:34:36 -0800 Subject: [PATCH 03/16] xsk: Bring back busy polling support Commit 86e25f40aa1e ("net: 
napi: Add napi_config") moved napi->napi_id assignment to a later point in time (napi_hash_add_with_id). This breaks __xdp_rxq_info_reg which copies napi_id at an earlier time and now stores 0 napi_id. It also makes sk_mark_napi_id_once_xdp and __sk_mark_napi_id_once useless because they now work against 0 napi_id. Since sk_busy_loop requires valid napi_id to busy-poll on, there is no way to busy-poll AF_XDP sockets anymore. Bring back the ability to busy-poll on XSK by resolving socket's napi_id at bind time. This relies on relatively recent netif_queue_set_napi, but (assume) at this point most popular drivers should have been converted. This also removes per-tx/rx cycles which used to check and/or set the napi_id value. Confirmed by running a busy-polling AF_XDP socket (github.com/fomichev/xskrtt) on mlx5 and looking at BusyPollRxPackets from /proc/net/netstat. Fixes: 86e25f40aa1e ("net: napi: Add napi_config") Signed-off-by: Stanislav Fomichev Acked-by: Magnus Karlsson Reviewed-by: Jakub Kicinski Link: https://patch.msgid.link/20250109003436.2829560-1-sdf@fomichev.me Signed-off-by: Jakub Kicinski --- include/net/busy_poll.h | 8 -------- include/net/xdp.h | 1 - include/net/xdp_sock_drv.h | 14 -------------- net/core/xdp.c | 1 - net/xdp/xsk.c | 14 +++++++++----- 5 files changed, 9 insertions(+), 29 deletions(-) diff --git a/include/net/busy_poll.h b/include/net/busy_poll.h index c858270141bc..c39a426ebf52 100644 --- a/include/net/busy_poll.h +++ b/include/net/busy_poll.h @@ -174,12 +174,4 @@ static inline void sk_mark_napi_id_once(struct sock *sk, #endif } -static inline void sk_mark_napi_id_once_xdp(struct sock *sk, - const struct xdp_buff *xdp) -{ -#ifdef CONFIG_NET_RX_BUSY_POLL - __sk_mark_napi_id_once(sk, xdp->rxq->napi_id); -#endif -} - #endif /* _LINUX_NET_BUSY_POLL_H */ diff --git a/include/net/xdp.h b/include/net/xdp.h index e6770dd40c91..b5b10f2b88e5 100644 --- a/include/net/xdp.h +++ b/include/net/xdp.h @@ -62,7 +62,6 @@ struct xdp_rxq_info { u32 
queue_index; u32 reg_state; struct xdp_mem_info mem; - unsigned int napi_id; u32 frag_size; } ____cacheline_aligned; /* perf critical, avoid false-sharing */ diff --git a/include/net/xdp_sock_drv.h b/include/net/xdp_sock_drv.h index 40085afd9160..7a7316d9c0da 100644 --- a/include/net/xdp_sock_drv.h +++ b/include/net/xdp_sock_drv.h @@ -59,15 +59,6 @@ static inline void xsk_pool_fill_cb(struct xsk_buff_pool *pool, xp_fill_cb(pool, desc); } -static inline unsigned int xsk_pool_get_napi_id(struct xsk_buff_pool *pool) -{ -#ifdef CONFIG_NET_RX_BUSY_POLL - return pool->heads[0].xdp.rxq->napi_id; -#else - return 0; -#endif -} - static inline void xsk_pool_dma_unmap(struct xsk_buff_pool *pool, unsigned long attrs) { @@ -306,11 +297,6 @@ static inline void xsk_pool_fill_cb(struct xsk_buff_pool *pool, { } -static inline unsigned int xsk_pool_get_napi_id(struct xsk_buff_pool *pool) -{ - return 0; -} - static inline void xsk_pool_dma_unmap(struct xsk_buff_pool *pool, unsigned long attrs) { diff --git a/net/core/xdp.c b/net/core/xdp.c index bcc5551c6424..2315feed94ef 100644 --- a/net/core/xdp.c +++ b/net/core/xdp.c @@ -186,7 +186,6 @@ int __xdp_rxq_info_reg(struct xdp_rxq_info *xdp_rxq, xdp_rxq_info_init(xdp_rxq); xdp_rxq->dev = dev; xdp_rxq->queue_index = queue_index; - xdp_rxq->napi_id = napi_id; xdp_rxq->frag_size = frag_size; xdp_rxq->reg_state = REG_STATE_REGISTERED; diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c index 3fa70286c846..89d2bef96469 100644 --- a/net/xdp/xsk.c +++ b/net/xdp/xsk.c @@ -322,7 +322,6 @@ static int xsk_rcv_check(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len) return -ENOSPC; } - sk_mark_napi_id_once_xdp(&xs->sk, xdp); return 0; } @@ -908,11 +907,8 @@ static int __xsk_sendmsg(struct socket *sock, struct msghdr *m, size_t total_len if (unlikely(!xs->tx)) return -ENOBUFS; - if (sk_can_busy_loop(sk)) { - if (xs->zc) - __sk_mark_napi_id_once(sk, xsk_pool_get_napi_id(xs->pool)); + if (sk_can_busy_loop(sk)) sk_busy_loop(sk, 1); /* only support non-blocking 
sockets */ - } if (xs->zc && xsk_no_wakeup(sk)) return 0; @@ -1298,6 +1294,14 @@ static int xsk_bind(struct socket *sock, struct sockaddr *addr, int addr_len) xs->queue_id = qid; xp_add_xsk(xs->pool, xs); + if (xs->zc && qid < dev->real_num_rx_queues) { + struct netdev_rx_queue *rxq; + + rxq = __netif_get_rx_queue(dev, qid); + if (rxq->napi) + __sk_mark_napi_id_once(sk, rxq->napi->napi_id); + } + out_unlock: if (err) { dev_put(dev); -- 2.51.0 From eaeea5028fa82412392d9325c44624ef8fcd1869 Mon Sep 17 00:00:00 2001 From: Shradha Gupta Date: Wed, 8 Jan 2025 21:03:11 -0800 Subject: [PATCH 04/16] net: mana: Cleanup "mana" debugfs dir after cleanup of all children In mana_driver_exit(), mana_debugfs_root gets cleaned up before any of its children (which happens later in the pci_unregister_driver()). Due to this, when the mana driver is configured as a module and rmmod is invoked, the following stack gets printed along with a failure in the rmmod command. [ 2399.317651] BUG: kernel NULL pointer dereference, address: 0000000000000098 [ 2399.318657] #PF: supervisor write access in kernel mode [ 2399.319057] #PF: error_code(0x0002) - not-present page [ 2399.319528] PGD 10eb68067 P4D 0 [ 2399.319914] Oops: Oops: 0002 [#1] SMP NOPTI [ 2399.320308] CPU: 72 UID: 0 PID: 5815 Comm: rmmod Not tainted 6.13.0-rc5+ #89 [ 2399.320986] Hardware name: Microsoft Corporation Virtual Machine/Virtual Machine, BIOS Hyper-V UEFI Release v4.1 09/28/2024 [ 2399.321892] RIP: 0010:down_write+0x1a/0x50 [ 2399.322303] Code: 90 90 90 90 90 90 90 90 90 90 90 90 90 90 90 90 0f 1f 44 00 00 55 48 89 e5 41 54 49 89 fc e8 9d cd ff ff 31 c0 ba 01 00 00 00 <f0> 49 0f b1 14 24 75 17 65 48 8b 05 f6 84 dd 5f 49 89 44 24 08 4c [ 2399.323669] RSP: 0018:ff53859d6c663a70 EFLAGS: 00010246 [ 2399.324061] RAX: 0000000000000000 RBX: ff1d4eb505060180 RCX: ffffff8100000000 [ 2399.324620] RDX: 0000000000000001 RSI: 0000000000000064 RDI: 0000000000000098 [ 2399.325167] RBP: ff53859d6c663a78 R08: 00000000000009c4 R09: ff1d4eb4fac90000 [
2399.325681] R10: 0000000000000000 R11: 0000000000000001 R12: 0000000000000098 [ 2399.326185] R13: ff1d4e42e1a4a0c8 R14: ff1d4eb538ce0000 R15: 0000000000000098 [ 2399.326755] FS: 00007fe729570000(0000) GS:ff1d4eb2b7200000(0000) knlGS:0000000000000000 [ 2399.327269] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 2399.327690] CR2: 0000000000000098 CR3: 00000001c0584005 CR4: 0000000000373ef0 [ 2399.328166] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [ 2399.328623] DR3: 0000000000000000 DR6: 00000000fffe07f0 DR7: 0000000000000400 [ 2399.329055] Call Trace: [ 2399.329243] [ 2399.329379] ? show_regs+0x69/0x80 [ 2399.329602] ? __die+0x25/0x70 [ 2399.329856] ? page_fault_oops+0x271/0x550 [ 2399.330088] ? psi_group_change+0x217/0x470 [ 2399.330341] ? do_user_addr_fault+0x455/0x7b0 [ 2399.330667] ? finish_task_switch.isra.0+0x91/0x2f0 [ 2399.331004] ? exc_page_fault+0x73/0x160 [ 2399.331275] ? asm_exc_page_fault+0x27/0x30 [ 2399.343324] ? down_write+0x1a/0x50 [ 2399.343631] simple_recursive_removal+0x4d/0x2c0 [ 2399.343977] ? __pfx_remove_one+0x10/0x10 [ 2399.344251] debugfs_remove+0x45/0x70 [ 2399.344511] mana_destroy_rxq+0x44/0x400 [mana] [ 2399.344845] mana_destroy_vport+0x54/0x1c0 [mana] [ 2399.345229] mana_detach+0x2f1/0x4e0 [mana] [ 2399.345466] ? ida_free+0x150/0x160 [ 2399.345718] ? __cond_resched+0x1a/0x50 [ 2399.345987] mana_remove+0xf4/0x1a0 [mana] [ 2399.346243] mana_gd_remove+0x25/0x80 [mana] [ 2399.346605] pci_device_remove+0x41/0xb0 [ 2399.346878] device_remove+0x46/0x70 [ 2399.347150] device_release_driver_internal+0x1e3/0x250 [ 2399.347831] ? klist_remove+0x81/0xe0 [ 2399.348377] driver_detach+0x4b/0xa0 [ 2399.348906] bus_remove_driver+0x83/0x100 [ 2399.349435] driver_unregister+0x31/0x60 [ 2399.349919] pci_unregister_driver+0x40/0x90 [ 2399.350492] mana_driver_exit+0x1c/0xb50 [mana] [ 2399.351102] __do_sys_delete_module.constprop.0+0x184/0x320 [ 2399.351664] ? 
__fput+0x1a9/0x2d0 [ 2399.352200] __x64_sys_delete_module+0x12/0x20 [ 2399.352760] x64_sys_call+0x1e66/0x2140 [ 2399.353316] do_syscall_64+0x79/0x150 [ 2399.353813] ? syscall_exit_to_user_mode+0x49/0x230 [ 2399.354346] ? do_syscall_64+0x85/0x150 [ 2399.354816] ? irqentry_exit+0x1d/0x30 [ 2399.355287] ? exc_page_fault+0x7f/0x160 [ 2399.355756] entry_SYSCALL_64_after_hwframe+0x76/0x7e [ 2399.356302] RIP: 0033:0x7fe728d26aeb [ 2399.356776] Code: 73 01 c3 48 8b 0d 45 33 0f 00 f7 d8 64 89 01 48 83 c8 ff c3 66 2e 0f 1f 84 00 00 00 00 00 90 f3 0f 1e fa b8 b0 00 00 00 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 8b 0d 15 33 0f 00 f7 d8 64 89 01 48 [ 2399.358372] RSP: 002b:00007ffff954d6f8 EFLAGS: 00000206 ORIG_RAX: 00000000000000b0 [ 2399.359066] RAX: ffffffffffffffda RBX: 00005609156cc760 RCX: 00007fe728d26aeb [ 2399.359779] RDX: 000000000000000a RSI: 0000000000000800 RDI: 00005609156cc7c8 [ 2399.360535] RBP: 0000000000000000 R08: 0000000000000000 R09: 0000000000000000 [ 2399.361261] R10: 00007fe728dbeac0 R11: 0000000000000206 R12: 00007ffff954d950 [ 2399.361952] R13: 00005609156cc2a0 R14: 00007ffff954ee5f R15: 00005609156cc760 [ 2399.362688] Fixes: 6607c17c6c5e ("net: mana: Enable debugfs files for MANA device") Cc: stable@vger.kernel.org Signed-off-by: Shradha Gupta Reviewed-by: Michal Swiatkowski Link: https://patch.msgid.link/1736398991-764-1-git-send-email-shradhagupta@linux.microsoft.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/microsoft/mana/gdma_main.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/microsoft/mana/gdma_main.c b/drivers/net/ethernet/microsoft/mana/gdma_main.c index 2dc0c6ad54be..be95336ce089 100644 --- a/drivers/net/ethernet/microsoft/mana/gdma_main.c +++ b/drivers/net/ethernet/microsoft/mana/gdma_main.c @@ -1656,9 +1656,9 @@ static int __init mana_driver_init(void) static void __exit mana_driver_exit(void) { - debugfs_remove(mana_debugfs_root); - pci_unregister_driver(&mana_driver); + + 
debugfs_remove(mana_debugfs_root); } module_init(mana_driver_init); -- 2.51.0 From e7e441a4100e4bc90b52f80494a28a9667993975 Mon Sep 17 00:00:00 2001 From: Paul Barker Date: Thu, 9 Jan 2025 11:37:06 +0000 Subject: [PATCH 05/16] net: ravb: Fix max TX frame size for RZ/V2M MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit When tx_max_frame_size was added to struct ravb_hw_info, no value was set in ravb_rzv2m_hw_info so the default value of zero was used. The maximum MTU is set by subtracting from tx_max_frame_size to allow space for headers and frame checksums. As ndev->max_mtu is unsigned, this subtraction wraps around leading to a ridiculously large positive value that is obviously incorrect. Before tx_max_frame_size was introduced, the maximum MTU was based on rx_max_frame_size. So, we can restore the correct maximum MTU by copying the rx_max_frame_size value into tx_max_frame_size for RZ/V2M. Fixes: 1d63864299ca ("net: ravb: Fix maximum TX frame size for GbEth devices") Signed-off-by: Paul Barker Reviewed-by: Niklas Söderlund Reviewed-by: Simon Horman Reviewed-by: Sergey Shtylyov Link: https://patch.msgid.link/20250109113706.1409149-1-paul.barker.ct@bp.renesas.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/renesas/ravb_main.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/ethernet/renesas/ravb_main.c b/drivers/net/ethernet/renesas/ravb_main.c index ac0f093f647a..bc395294a32d 100644 --- a/drivers/net/ethernet/renesas/ravb_main.c +++ b/drivers/net/ethernet/renesas/ravb_main.c @@ -2763,6 +2763,7 @@ static const struct ravb_hw_info ravb_rzv2m_hw_info = { .net_features = NETIF_F_RXCSUM, .stats_len = ARRAY_SIZE(ravb_gstrings_stats), .tccr_mask = TCCR_TSRQ0 | TCCR_TSRQ1 | TCCR_TSRQ2 | TCCR_TSRQ3, + .tx_max_frame_size = SZ_2K, .rx_max_frame_size = SZ_2K, .rx_buffer_size = SZ_2K + SKB_DATA_ALIGN(sizeof(struct skb_shared_info)), -- 2.51.0 From 47e55e4b410f7d552e43011baa5be1aab4093990 Mon Sep 17 00:00:00 2001 
From: Ilya Maximets Date: Thu, 9 Jan 2025 13:21:24 +0100 Subject: [PATCH 06/16] openvswitch: fix lockup on tx to unregistering netdev with carrier The commit in the Fixes tag attempted to fix the issue in the following sequence of calls: do_output -> ovs_vport_send -> dev_queue_xmit -> __dev_queue_xmit -> netdev_core_pick_tx -> skb_tx_hash When a device is unregistering, the 'dev->real_num_tx_queues' goes to zero and the 'while (unlikely(hash >= qcount))' loop inside the 'skb_tx_hash' becomes infinite, locking up the core forever. But unfortunately, checking just the carrier status is not enough to fix the issue, because some devices may still be in unregistering state while reporting carrier status OK. One example of such a device is net/dummy. It sets carrier ON on start, but it doesn't implement .ndo_stop to set the carrier off. And it makes sense, because dummy doesn't really have a carrier. Therefore, while this device is unregistering, it's still easy to hit the infinite loop in the skb_tx_hash() from the OVS datapath. There might be other drivers that do the same, but dummy by itself is important for the OVS ecosystem, because it is frequently used as a packet sink for tcpdump while debugging OVS deployments. And when the issue is hit, the only way to recover is to reboot. Fix that by also checking if the device is running. The running state is handled by the net core during unregistering, so it covers the unregistering case better, and we don't really need to send packets to devices that are not running anyway. While only checking the running state might be enough, the carrier check is preserved. The running and the carrier states seem disjoint throughout the code and different drivers. And other core functions like __dev_direct_xmit() check both before attempting to transmit a packet. So, it seems safer to check both flags in OVS as well.
Fixes: 066b86787fa3 ("net: openvswitch: fix race on port output") Reported-by: Friedrich Weber Closes: https://mail.openvswitch.org/pipermail/ovs-discuss/2025-January/053423.html Signed-off-by: Ilya Maximets Tested-by: Friedrich Weber Reviewed-by: Aaron Conole Link: https://patch.msgid.link/20250109122225.4034688-1-i.maximets@ovn.org Signed-off-by: Jakub Kicinski --- net/openvswitch/actions.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c index 16e260014684..704c858cf209 100644 --- a/net/openvswitch/actions.c +++ b/net/openvswitch/actions.c @@ -934,7 +934,9 @@ static void do_output(struct datapath *dp, struct sk_buff *skb, int out_port, { struct vport *vport = ovs_vport_rcu(dp, out_port); - if (likely(vport && netif_carrier_ok(vport->dev))) { + if (likely(vport && + netif_running(vport->dev) && + netif_carrier_ok(vport->dev))) { u16 mru = OVS_CB(skb)->mru; u32 cutlen = OVS_CB(skb)->cutlen; -- 2.51.0 From 76201b5979768500bca362871db66d77cb4c225e Mon Sep 17 00:00:00 2001 From: Artem Chernyshev Date: Thu, 9 Jan 2025 11:30:39 +0300 Subject: [PATCH 07/16] pktgen: Avoid out-of-bounds access in get_imix_entries Passing a sufficient amount of imix entries leads to invalid access to the pkt_dev->imix_entries array because of the incorrect boundary check. 
UBSAN: array-index-out-of-bounds in net/core/pktgen.c:874:24 index 20 is out of range for type 'imix_pkt [20]' CPU: 2 PID: 1210 Comm: bash Not tainted 6.10.0-rc1 #121 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996) Call Trace: dump_stack_lvl lib/dump_stack.c:117 __ubsan_handle_out_of_bounds lib/ubsan.c:429 get_imix_entries net/core/pktgen.c:874 pktgen_if_write net/core/pktgen.c:1063 pde_write fs/proc/inode.c:334 proc_reg_write fs/proc/inode.c:346 vfs_write fs/read_write.c:593 ksys_write fs/read_write.c:644 do_syscall_64 arch/x86/entry/common.c:83 entry_SYSCALL_64_after_hwframe arch/x86/entry/entry_64.S:130 Found by Linux Verification Center (linuxtesting.org) with SVACE. Fixes: 52a62f8603f9 ("pktgen: Parse internet mix (imix) input") Signed-off-by: Artem Chernyshev [ fp: allow to fill the array completely; minor changelog cleanup ] Signed-off-by: Fedor Pchelkin Signed-off-by: David S. Miller --- net/core/pktgen.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/net/core/pktgen.c b/net/core/pktgen.c index 7e23cacbe66e..4cb547fae91f 100644 --- a/net/core/pktgen.c +++ b/net/core/pktgen.c @@ -851,6 +851,9 @@ static ssize_t get_imix_entries(const char __user *buffer, unsigned long weight; unsigned long size; + if (pkt_dev->n_imix_entries >= MAX_IMIX_ENTRIES) + return -E2BIG; + len = num_arg(&buffer[i], max_digits, &size); if (len < 0) return len; @@ -880,9 +883,6 @@ static ssize_t get_imix_entries(const char __user *buffer, i++; pkt_dev->n_imix_entries++; - - if (pkt_dev->n_imix_entries > MAX_IMIX_ENTRIES) - return -E2BIG; } while (c == ' '); return i; -- 2.51.0 From 9e2bbab94b88295dcc57c7580393c9ee08d7314d Mon Sep 17 00:00:00 2001 From: Paul Fertser Date: Thu, 9 Jan 2025 17:50:54 +0300 Subject: [PATCH 08/16] net/ncsi: fix locking in Get MAC Address handling Obtaining RTNL lock in a response handler is not allowed since it runs in an atomic softirq context. Postpone setting the MAC address by adding a dedicated step to the configuration FSM. 
Fixes: 790071347a0a ("net/ncsi: change from ndo_set_mac_address to dev_set_mac_address") Cc: stable@vger.kernel.org Link: https://lore.kernel.org/20241129-potin-revert-ncsi-set-mac-addr-v1-1-94ea2cb596af@gmail.com Signed-off-by: Paul Fertser Tested-by: Potin Lai Link: https://patch.msgid.link/20250109145054.30925-1-fercerpav@gmail.com Signed-off-by: Jakub Kicinski --- net/ncsi/internal.h | 2 ++ net/ncsi/ncsi-manage.c | 16 ++++++++++++++-- net/ncsi/ncsi-rsp.c | 19 ++++++------------- 3 files changed, 22 insertions(+), 15 deletions(-) diff --git a/net/ncsi/internal.h b/net/ncsi/internal.h index ef0f8f73826f..4e0842df5234 100644 --- a/net/ncsi/internal.h +++ b/net/ncsi/internal.h @@ -289,6 +289,7 @@ enum { ncsi_dev_state_config_sp = 0x0301, ncsi_dev_state_config_cis, ncsi_dev_state_config_oem_gma, + ncsi_dev_state_config_apply_mac, ncsi_dev_state_config_clear_vids, ncsi_dev_state_config_svf, ncsi_dev_state_config_ev, @@ -322,6 +323,7 @@ struct ncsi_dev_priv { #define NCSI_DEV_RESHUFFLE 4 #define NCSI_DEV_RESET 8 /* Reset state of NC */ unsigned int gma_flag; /* OEM GMA flag */ + struct sockaddr pending_mac; /* MAC address received from GMA */ spinlock_t lock; /* Protect the NCSI device */ unsigned int package_probe_id;/* Current ID during probe */ unsigned int package_num; /* Number of packages */ diff --git a/net/ncsi/ncsi-manage.c b/net/ncsi/ncsi-manage.c index 5cf55bde366d..bf276eaf9330 100644 --- a/net/ncsi/ncsi-manage.c +++ b/net/ncsi/ncsi-manage.c @@ -1038,7 +1038,7 @@ static void ncsi_configure_channel(struct ncsi_dev_priv *ndp) : ncsi_dev_state_config_clear_vids; break; case ncsi_dev_state_config_oem_gma: - nd->state = ncsi_dev_state_config_clear_vids; + nd->state = ncsi_dev_state_config_apply_mac; nca.package = np->id; nca.channel = nc->id; @@ -1050,10 +1050,22 @@ static void ncsi_configure_channel(struct ncsi_dev_priv *ndp) nca.type = NCSI_PKT_CMD_OEM; ret = ncsi_gma_handler(&nca, nc->version.mf_id); } - if (ret < 0) + if (ret < 0) { + nd->state = 
ncsi_dev_state_config_clear_vids; schedule_work(&ndp->work); + } break; + case ncsi_dev_state_config_apply_mac: + rtnl_lock(); + ret = dev_set_mac_address(dev, &ndp->pending_mac, NULL); + rtnl_unlock(); + if (ret < 0) + netdev_warn(dev, "NCSI: 'Writing MAC address to device failed\n"); + + nd->state = ncsi_dev_state_config_clear_vids; + + fallthrough; case ncsi_dev_state_config_clear_vids: case ncsi_dev_state_config_svf: case ncsi_dev_state_config_ev: diff --git a/net/ncsi/ncsi-rsp.c b/net/ncsi/ncsi-rsp.c index e28be33bdf2c..14bd66909ca4 100644 --- a/net/ncsi/ncsi-rsp.c +++ b/net/ncsi/ncsi-rsp.c @@ -628,16 +628,14 @@ static int ncsi_rsp_handler_snfc(struct ncsi_request *nr) static int ncsi_rsp_handler_oem_gma(struct ncsi_request *nr, int mfr_id) { struct ncsi_dev_priv *ndp = nr->ndp; + struct sockaddr *saddr = &ndp->pending_mac; struct net_device *ndev = ndp->ndev.dev; struct ncsi_rsp_oem_pkt *rsp; - struct sockaddr saddr; u32 mac_addr_off = 0; - int ret = 0; /* Get the response header */ rsp = (struct ncsi_rsp_oem_pkt *)skb_network_header(nr->rsp); - saddr.sa_family = ndev->type; ndev->priv_flags |= IFF_LIVE_ADDR_CHANGE; if (mfr_id == NCSI_OEM_MFR_BCM_ID) mac_addr_off = BCM_MAC_ADDR_OFFSET; @@ -646,22 +644,17 @@ static int ncsi_rsp_handler_oem_gma(struct ncsi_request *nr, int mfr_id) else if (mfr_id == NCSI_OEM_MFR_INTEL_ID) mac_addr_off = INTEL_MAC_ADDR_OFFSET; - memcpy(saddr.sa_data, &rsp->data[mac_addr_off], ETH_ALEN); + saddr->sa_family = ndev->type; + memcpy(saddr->sa_data, &rsp->data[mac_addr_off], ETH_ALEN); if (mfr_id == NCSI_OEM_MFR_BCM_ID || mfr_id == NCSI_OEM_MFR_INTEL_ID) - eth_addr_inc((u8 *)saddr.sa_data); - if (!is_valid_ether_addr((const u8 *)saddr.sa_data)) + eth_addr_inc((u8 *)saddr->sa_data); + if (!is_valid_ether_addr((const u8 *)saddr->sa_data)) return -ENXIO; /* Set the flag for GMA command which should only be called once */ ndp->gma_flag = 1; - rtnl_lock(); - ret = dev_set_mac_address(ndev, &saddr, NULL); - rtnl_unlock(); - if (ret < 0) - 
netdev_warn(ndev, "NCSI: 'Writing mac address to device failed\n"); - - return ret; + return 0; } /* Response handler for Mellanox card */ -- 2.51.0 From 1f691a1fc4bef1c5cf5f503e14e1a22fc37c97e3 Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Thu, 9 Jan 2025 23:43:12 +0100 Subject: [PATCH 09/16] r8169: remove redundant hwmon support The temperature sensor is actually part of the integrated PHY and available also on the standalone versions of the PHY. Therefore hwmon support will be added to the Realtek PHY driver and can be removed here. Fixes: 1ffcc8d41306 ("r8169: add support for the temperature sensor being available from RTL8125B") Signed-off-by: Heiner Kallweit Reviewed-by: Jacob Keller Link: https://patch.msgid.link/afba85f5-987b-4449-83cc-350438af7fe7@gmail.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/realtek/r8169_main.c | 44 ----------------------- 1 file changed, 44 deletions(-) diff --git a/drivers/net/ethernet/realtek/r8169_main.c b/drivers/net/ethernet/realtek/r8169_main.c index 739707a7b40f..8a3959bb2360 100644 --- a/drivers/net/ethernet/realtek/r8169_main.c +++ b/drivers/net/ethernet/realtek/r8169_main.c @@ -16,7 +16,6 @@ #include #include #include -#include #include #include #include @@ -5347,43 +5346,6 @@ static bool rtl_aspm_is_safe(struct rtl8169_private *tp) return false; } -static umode_t r8169_hwmon_is_visible(const void *drvdata, - enum hwmon_sensor_types type, - u32 attr, int channel) -{ - return 0444; -} - -static int r8169_hwmon_read(struct device *dev, enum hwmon_sensor_types type, - u32 attr, int channel, long *val) -{ - struct rtl8169_private *tp = dev_get_drvdata(dev); - int val_raw; - - val_raw = phy_read_paged(tp->phydev, 0xbd8, 0x12) & 0x3ff; - if (val_raw >= 512) - val_raw -= 1024; - - *val = 1000 * val_raw / 2; - - return 0; -} - -static const struct hwmon_ops r8169_hwmon_ops = { - .is_visible = r8169_hwmon_is_visible, - .read = r8169_hwmon_read, -}; - -static const struct hwmon_channel_info * const 
r8169_hwmon_info[] = { - HWMON_CHANNEL_INFO(temp, HWMON_T_INPUT), - NULL -}; - -static const struct hwmon_chip_info r8169_hwmon_chip_info = { - .ops = &r8169_hwmon_ops, - .info = r8169_hwmon_info, -}; - static int rtl_init_one(struct pci_dev *pdev, const struct pci_device_id *ent) { struct rtl8169_private *tp; @@ -5563,12 +5525,6 @@ static int rtl_init_one(struct pci_dev *pdev, const struct pci_device_id *ent) if (rc) return rc; - /* The temperature sensor is available from RTl8125B */ - if (IS_REACHABLE(CONFIG_HWMON) && tp->mac_version >= RTL_GIGA_MAC_VER_63) - /* ignore errors */ - devm_hwmon_device_register_with_info(&pdev->dev, "nic_temp", tp, - &r8169_hwmon_chip_info, - NULL); rc = register_netdev(dev); if (rc) return rc; -- 2.51.0 From 644f9108f3a505022ef43510e5143cb985e0cf8b Mon Sep 17 00:00:00 2001 From: Philo Lu Date: Fri, 10 Jan 2025 09:08:10 +0800 Subject: [PATCH 10/16] udp: Make rehash4 independent in udp_lib_rehash() As discussed in [0], rehash4 could be missed in udp_lib_rehash() when udp hash4 changes while hash2 doesn't change. This patch fixes this by moving rehash4 codes out of rehash2 checking, and then rehash2 and rehash4 are done separately. By doing this, we no longer need to call rehash4 explicitly in udp_lib_hash4(), as the rehash callback in __ip4_datagram_connect takes it. Thus, now udp_lib_hash4() returns directly if the sk is already hashed. Note that uhash4 may fail to work under consecutive connect(<dst address>) calls because rehash() is not called with every connect(). To overcome this, connect(<AF_UNSPEC>) needs to be called after the next connect to a new destination. 
[0] https://lore.kernel.org/all/4761e466ab9f7542c68cdc95f248987d127044d2.1733499715.git.pabeni@redhat.com/ Fixes: 78c91ae2c6de ("ipv4/udp: Add 4-tuple hash for connected socket") Suggested-by: Paolo Abeni Signed-off-by: Philo Lu Link: https://patch.msgid.link/20250110010810.107145-1-lulie@linux.alibaba.com Signed-off-by: Paolo Abeni --- net/ipv4/udp.c | 46 +++++++++++++++++++++++++++------------------- 1 file changed, 27 insertions(+), 19 deletions(-) diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index e8953e88efef..86d282618515 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -533,7 +533,7 @@ begin: return NULL; } -/* In hash4, rehash can happen in connect(), where hash4_cnt keeps unchanged. */ +/* udp_rehash4() only checks hslot4, and hash4_cnt is not processed. */ static void udp_rehash4(struct udp_table *udptable, struct sock *sk, u16 newhash4) { @@ -582,15 +582,13 @@ void udp_lib_hash4(struct sock *sk, u16 hash) struct net *net = sock_net(sk); struct udp_table *udptable; - /* Connected udp socket can re-connect to another remote address, - * so rehash4 is needed. + /* Connected udp socket can re-connect to another remote address, which + * will be handled by rehash. Thus no need to redo hash4 here. 
*/ - udptable = net->ipv4.udp_table; - if (udp_hashed4(sk)) { - udp_rehash4(udptable, sk, hash); + if (udp_hashed4(sk)) return; - } + udptable = net->ipv4.udp_table; hslot = udp_hashslot(udptable, net, udp_sk(sk)->udp_port_hash); hslot2 = udp_hashslot2(udptable, udp_sk(sk)->udp_portaddr_hash); hslot4 = udp_hashslot4(udptable, hash); @@ -2173,14 +2171,14 @@ void udp_lib_rehash(struct sock *sk, u16 newhash, u16 newhash4) struct udp_table *udptable = udp_get_table_prot(sk); struct udp_hslot *hslot, *hslot2, *nhslot2; + hslot = udp_hashslot(udptable, sock_net(sk), + udp_sk(sk)->udp_port_hash); hslot2 = udp_hashslot2(udptable, udp_sk(sk)->udp_portaddr_hash); nhslot2 = udp_hashslot2(udptable, newhash); udp_sk(sk)->udp_portaddr_hash = newhash; if (hslot2 != nhslot2 || rcu_access_pointer(sk->sk_reuseport_cb)) { - hslot = udp_hashslot(udptable, sock_net(sk), - udp_sk(sk)->udp_port_hash); /* we must lock primary chain too */ spin_lock_bh(&hslot->lock); if (rcu_access_pointer(sk->sk_reuseport_cb)) @@ -2199,19 +2197,29 @@ void udp_lib_rehash(struct sock *sk, u16 newhash, u16 newhash4) spin_unlock(&nhslot2->lock); } - if (udp_hashed4(sk)) { - udp_rehash4(udptable, sk, newhash4); + spin_unlock_bh(&hslot->lock); + } + + /* Now process hash4 if necessary: + * (1) update hslot4; + * (2) update hslot2->hash4_cnt. + * Note that hslot2/hslot4 should be checked separately, as + * either of them may change with the other unchanged. 
+ */ + if (udp_hashed4(sk)) { + spin_lock_bh(&hslot->lock); - if (hslot2 != nhslot2) { - spin_lock(&hslot2->lock); - udp_hash4_dec(hslot2); - spin_unlock(&hslot2->lock); + udp_rehash4(udptable, sk, newhash4); + if (hslot2 != nhslot2) { + spin_lock(&hslot2->lock); + udp_hash4_dec(hslot2); + spin_unlock(&hslot2->lock); - spin_lock(&nhslot2->lock); - udp_hash4_inc(nhslot2); - spin_unlock(&nhslot2->lock); - } + spin_lock(&nhslot2->lock); + udp_hash4_inc(nhslot2); + spin_unlock(&nhslot2->lock); } + spin_unlock_bh(&hslot->lock); } } -- 2.51.0 From 46841c7053e6d25fb33e0534ef023833bf03e382 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Fri, 10 Jan 2025 10:47:52 +0900 Subject: [PATCH 11/16] gtp: Use for_each_netdev_rcu() in gtp_genl_dump_pdp(). gtp_newlink() links the gtp device to a list in dev_net(dev). However, even after the gtp device is moved to another netns, it stays on the list but should be invisible. Let's use for_each_netdev_rcu() for netdev traversal in gtp_genl_dump_pdp(). Note that gtp_dev_list is no longer used under RCU, so list helpers are converted to the non-RCU variant. 
Fixes: 459aa660eb1d ("gtp: add initial driver for datapath of GPRS Tunneling Protocol (GTP-U)") Reported-by: Xiao Liang Closes: https://lore.kernel.org/netdev/CABAhCOQdBL6h9M2C+kd+bGivRJ9Q72JUxW+-gur0nub_=PmFPA@mail.gmail.com/ Signed-off-by: Kuniyuki Iwashima Signed-off-by: Paolo Abeni --- drivers/net/gtp.c | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/drivers/net/gtp.c b/drivers/net/gtp.c index 89a996ad8cd0..0f9cb0c378af 100644 --- a/drivers/net/gtp.c +++ b/drivers/net/gtp.c @@ -1525,7 +1525,7 @@ static int gtp_newlink(struct net *src_net, struct net_device *dev, } gn = net_generic(dev_net(dev), gtp_net_id); - list_add_rcu(&gtp->list, &gn->gtp_dev_list); + list_add(&gtp->list, &gn->gtp_dev_list); dev->priv_destructor = gtp_destructor; netdev_dbg(dev, "registered new GTP interface\n"); @@ -1551,7 +1551,7 @@ static void gtp_dellink(struct net_device *dev, struct list_head *head) hlist_for_each_entry_safe(pctx, next, &gtp->tid_hash[i], hlist_tid) pdp_context_delete(pctx); - list_del_rcu(&gtp->list); + list_del(&gtp->list); unregister_netdevice_queue(dev, head); } @@ -2271,16 +2271,19 @@ static int gtp_genl_dump_pdp(struct sk_buff *skb, struct gtp_dev *last_gtp = (struct gtp_dev *)cb->args[2], *gtp; int i, j, bucket = cb->args[0], skip = cb->args[1]; struct net *net = sock_net(skb->sk); + struct net_device *dev; struct pdp_ctx *pctx; - struct gtp_net *gn; - - gn = net_generic(net, gtp_net_id); if (cb->args[4]) return 0; rcu_read_lock(); - list_for_each_entry_rcu(gtp, &gn->gtp_dev_list, list) { + for_each_netdev_rcu(net, dev) { + if (dev->rtnl_link_ops != &gtp_link_ops) + continue; + + gtp = netdev_priv(dev); + if (last_gtp && last_gtp != gtp) continue; else @@ -2475,9 +2478,9 @@ static void __net_exit gtp_net_exit_batch_rtnl(struct list_head *net_list, list_for_each_entry(net, net_list, exit_list) { struct gtp_net *gn = net_generic(net, gtp_net_id); - struct gtp_dev *gtp; + struct gtp_dev *gtp, *gtp_next; - list_for_each_entry(gtp, 
&gn->gtp_dev_list, list) + list_for_each_entry_safe(gtp, gtp_next, &gn->gtp_dev_list, list) gtp_dellink(gtp->dev, dev_to_kill); } } -- 2.51.0 From eb28fd76c0a08a47b470677c6cef9dd1c60e92d1 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Fri, 10 Jan 2025 10:47:53 +0900 Subject: [PATCH 12/16] gtp: Destroy device along with udp socket's netns dismantle. gtp_newlink() links the device to a list in dev_net(dev) instead of src_net, where a udp tunnel socket is created. Even when src_net is removed, the device stays alive on dev_net(dev). Then, removing src_net triggers the splat below. [0] In this example, gtp0 is created in ns2, and the udp socket is created in ns1. ip netns add ns1 ip netns add ns2 ip -n ns1 link add netns ns2 name gtp0 type gtp role sgsn ip netns del ns1 Let's link the device to the socket's netns instead. Now, gtp_net_exit_batch_rtnl() needs another netdev iteration to remove all gtp devices in the netns. [0]: ref_tracker: net notrefcnt@000000003d6e7d05 has 1/2 users at sk_alloc (./include/net/net_namespace.h:345 net/core/sock.c:2236) inet_create (net/ipv4/af_inet.c:326 net/ipv4/af_inet.c:252) __sock_create (net/socket.c:1558) udp_sock_create4 (net/ipv4/udp_tunnel_core.c:18) gtp_create_sock (./include/net/udp_tunnel.h:59 drivers/net/gtp.c:1423) gtp_create_sockets (drivers/net/gtp.c:1447) gtp_newlink (drivers/net/gtp.c:1507) rtnl_newlink (net/core/rtnetlink.c:3786 net/core/rtnetlink.c:3897 net/core/rtnetlink.c:4012) rtnetlink_rcv_msg (net/core/rtnetlink.c:6922) netlink_rcv_skb (net/netlink/af_netlink.c:2542) netlink_unicast (net/netlink/af_netlink.c:1321 net/netlink/af_netlink.c:1347) netlink_sendmsg (net/netlink/af_netlink.c:1891) ____sys_sendmsg (net/socket.c:711 net/socket.c:726 net/socket.c:2583) ___sys_sendmsg (net/socket.c:2639) __sys_sendmsg (net/socket.c:2669) do_syscall_64 (arch/x86/entry/common.c:52 arch/x86/entry/common.c:83) WARNING: CPU: 1 PID: 60 at lib/ref_tracker.c:179 ref_tracker_dir_exit (lib/ref_tracker.c:179) Modules linked 
in: CPU: 1 UID: 0 PID: 60 Comm: kworker/u16:2 Not tainted 6.13.0-rc5-00147-g4c1224501e9d #5 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.16.0-0-gd239552ce722-prebuilt.qemu.org 04/01/2014 Workqueue: netns cleanup_net RIP: 0010:ref_tracker_dir_exit (lib/ref_tracker.c:179) Code: 00 00 00 fc ff df 4d 8b 26 49 bd 00 01 00 00 00 00 ad de 4c 39 f5 0f 85 df 00 00 00 48 8b 74 24 08 48 89 df e8 a5 cc 12 02 90 <0f> 0b 90 48 8d 6b 44 be 04 00 00 00 48 89 ef e8 80 de 67 ff 48 89 RSP: 0018:ff11000009a07b60 EFLAGS: 00010286 RAX: 0000000000002bd3 RBX: ff1100000f4e1aa0 RCX: 1ffffffff0e40ac6 RDX: 0000000000000000 RSI: 0000000000000000 RDI: ffffffff8423ee3c RBP: ff1100000f4e1af0 R08: 0000000000000001 R09: fffffbfff0e395ae R10: 0000000000000001 R11: 0000000000036001 R12: ff1100000f4e1af0 R13: dead000000000100 R14: ff1100000f4e1af0 R15: dffffc0000000000 FS: 0000000000000000(0000) GS:ff1100006ce80000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007f9b2464bd98 CR3: 0000000005286005 CR4: 0000000000771ef0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe07f0 DR7: 0000000000000400 PKRU: 55555554 Call Trace: ? __warn (kernel/panic.c:748) ? ref_tracker_dir_exit (lib/ref_tracker.c:179) ? report_bug (lib/bug.c:201 lib/bug.c:219) ? handle_bug (arch/x86/kernel/traps.c:285) ? exc_invalid_op (arch/x86/kernel/traps.c:309 (discriminator 1)) ? asm_exc_invalid_op (./arch/x86/include/asm/idtentry.h:621) ? _raw_spin_unlock_irqrestore (./arch/x86/include/asm/irqflags.h:42 ./arch/x86/include/asm/irqflags.h:97 ./arch/x86/include/asm/irqflags.h:155 ./include/linux/spinlock_api_smp.h:151 kernel/locking/spinlock.c:194) ? ref_tracker_dir_exit (lib/ref_tracker.c:179) ? __pfx_ref_tracker_dir_exit (lib/ref_tracker.c:158) ? 
kfree (mm/slub.c:4613 mm/slub.c:4761) net_free (net/core/net_namespace.c:476 net/core/net_namespace.c:467) cleanup_net (net/core/net_namespace.c:664 (discriminator 3)) process_one_work (kernel/workqueue.c:3229) worker_thread (kernel/workqueue.c:3304 kernel/workqueue.c:3391) kthread (kernel/kthread.c:389) ret_from_fork (arch/x86/kernel/process.c:147) ret_from_fork_asm (arch/x86/entry/entry_64.S:257) Fixes: 459aa660eb1d ("gtp: add initial driver for datapath of GPRS Tunneling Protocol (GTP-U)") Reported-by: Xiao Liang Closes: https://lore.kernel.org/netdev/20250104125732.17335-1-shaw.leon@gmail.com/ Signed-off-by: Kuniyuki Iwashima Signed-off-by: Paolo Abeni --- drivers/net/gtp.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/drivers/net/gtp.c b/drivers/net/gtp.c index 0f9cb0c378af..fbabada7d3ba 100644 --- a/drivers/net/gtp.c +++ b/drivers/net/gtp.c @@ -1524,7 +1524,7 @@ static int gtp_newlink(struct net *src_net, struct net_device *dev, goto out_encap; } - gn = net_generic(dev_net(dev), gtp_net_id); + gn = net_generic(src_net, gtp_net_id); list_add(&gtp->list, &gn->gtp_dev_list); dev->priv_destructor = gtp_destructor; @@ -2479,6 +2479,11 @@ static void __net_exit gtp_net_exit_batch_rtnl(struct list_head *net_list, list_for_each_entry(net, net_list, exit_list) { struct gtp_net *gn = net_generic(net, gtp_net_id); struct gtp_dev *gtp, *gtp_next; + struct net_device *dev; + + for_each_netdev(net, dev) + if (dev->rtnl_link_ops == &gtp_link_ops) + gtp_dellink(dev, dev_to_kill); list_for_each_entry_safe(gtp, gtp_next, &gn->gtp_dev_list, list) gtp_dellink(gtp->dev, dev_to_kill); -- 2.51.0 From ffc90e9ca61b0f619326a1417ff32efd6cc71ed2 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Fri, 10 Jan 2025 10:47:54 +0900 Subject: [PATCH 13/16] pfcp: Destroy device along with udp socket's netns dismantle. pfcp_newlink() links the device to a list in dev_net(dev) instead of net, where a udp tunnel socket is created. 
Even when net is removed, the device stays alive on dev_net(dev). Then, removing net triggers the splat below. [0] In this example, pfcp0 is created in ns2, but the udp socket is created in ns1. ip netns add ns1 ip netns add ns2 ip -n ns1 link add netns ns2 name pfcp0 type pfcp ip netns del ns1 Let's link the device to the socket's netns instead. Now, pfcp_net_exit() needs another netdev iteration to remove all pfcp devices in the netns. pfcp_dev_list is not used under RCU, so the list API is converted to the non-RCU variant. pfcp_net_exit() can be converted to .exit_batch_rtnl() in net-next. [0]: ref_tracker: net notrefcnt@00000000128b34dc has 1/1 users at sk_alloc (./include/net/net_namespace.h:345 net/core/sock.c:2236) inet_create (net/ipv4/af_inet.c:326 net/ipv4/af_inet.c:252) __sock_create (net/socket.c:1558) udp_sock_create4 (net/ipv4/udp_tunnel_core.c:18) pfcp_create_sock (drivers/net/pfcp.c:168) pfcp_newlink (drivers/net/pfcp.c:182 drivers/net/pfcp.c:197) rtnl_newlink (net/core/rtnetlink.c:3786 net/core/rtnetlink.c:3897 net/core/rtnetlink.c:4012) rtnetlink_rcv_msg (net/core/rtnetlink.c:6922) netlink_rcv_skb (net/netlink/af_netlink.c:2542) netlink_unicast (net/netlink/af_netlink.c:1321 net/netlink/af_netlink.c:1347) netlink_sendmsg (net/netlink/af_netlink.c:1891) ____sys_sendmsg (net/socket.c:711 net/socket.c:726 net/socket.c:2583) ___sys_sendmsg (net/socket.c:2639) __sys_sendmsg (net/socket.c:2669) do_syscall_64 (arch/x86/entry/common.c:52 arch/x86/entry/common.c:83) entry_SYSCALL_64_after_hwframe (arch/x86/entry/entry_64.S:130) WARNING: CPU: 1 PID: 11 at lib/ref_tracker.c:179 ref_tracker_dir_exit (lib/ref_tracker.c:179) Modules linked in: CPU: 1 UID: 0 PID: 11 Comm: kworker/u16:0 Not tainted 6.13.0-rc5-00147-g4c1224501e9d #5 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.16.0-0-gd239552ce722-prebuilt.qemu.org 04/01/2014 Workqueue: netns cleanup_net RIP: 0010:ref_tracker_dir_exit (lib/ref_tracker.c:179) Code: 00 00 00 fc ff df 4d 8b 26 49 
bd 00 01 00 00 00 00 ad de 4c 39 f5 0f 85 df 00 00 00 48 8b 74 24 08 48 89 df e8 a5 cc 12 02 90 <0f> 0b 90 48 8d 6b 44 be 04 00 00 00 48 89 ef e8 80 de 67 ff 48 89 RSP: 0018:ff11000007f3fb60 EFLAGS: 00010286 RAX: 00000000000020ef RBX: ff1100000d6481e0 RCX: 1ffffffff0e40d82 RDX: 0000000000000000 RSI: 0000000000000000 RDI: ffffffff8423ee3c RBP: ff1100000d648230 R08: 0000000000000001 R09: fffffbfff0e395af R10: 0000000000000001 R11: 0000000000000000 R12: ff1100000d648230 R13: dead000000000100 R14: ff1100000d648230 R15: dffffc0000000000 FS: 0000000000000000(0000) GS:ff1100006ce80000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00005620e1363990 CR3: 000000000eeb2002 CR4: 0000000000771ef0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe07f0 DR7: 0000000000000400 PKRU: 55555554 Call Trace: ? __warn (kernel/panic.c:748) ? ref_tracker_dir_exit (lib/ref_tracker.c:179) ? report_bug (lib/bug.c:201 lib/bug.c:219) ? handle_bug (arch/x86/kernel/traps.c:285) ? exc_invalid_op (arch/x86/kernel/traps.c:309 (discriminator 1)) ? asm_exc_invalid_op (./arch/x86/include/asm/idtentry.h:621) ? _raw_spin_unlock_irqrestore (./arch/x86/include/asm/irqflags.h:42 ./arch/x86/include/asm/irqflags.h:97 ./arch/x86/include/asm/irqflags.h:155 ./include/linux/spinlock_api_smp.h:151 kernel/locking/spinlock.c:194) ? ref_tracker_dir_exit (lib/ref_tracker.c:179) ? __pfx_ref_tracker_dir_exit (lib/ref_tracker.c:158) ? 
kfree (mm/slub.c:4613 mm/slub.c:4761) net_free (net/core/net_namespace.c:476 net/core/net_namespace.c:467) cleanup_net (net/core/net_namespace.c:664 (discriminator 3)) process_one_work (kernel/workqueue.c:3229) worker_thread (kernel/workqueue.c:3304 kernel/workqueue.c:3391) kthread (kernel/kthread.c:389) ret_from_fork (arch/x86/kernel/process.c:147) ret_from_fork_asm (arch/x86/entry/entry_64.S:257) Fixes: 76c8764ef36a ("pfcp: add PFCP module") Reported-by: Xiao Liang Closes: https://lore.kernel.org/netdev/20250104125732.17335-1-shaw.leon@gmail.com/ Signed-off-by: Kuniyuki Iwashima Signed-off-by: Paolo Abeni --- drivers/net/pfcp.c | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/drivers/net/pfcp.c b/drivers/net/pfcp.c index 69434fd13f96..68d0d9e92a22 100644 --- a/drivers/net/pfcp.c +++ b/drivers/net/pfcp.c @@ -206,8 +206,8 @@ static int pfcp_newlink(struct net *net, struct net_device *dev, goto exit_del_pfcp_sock; } - pn = net_generic(dev_net(dev), pfcp_net_id); - list_add_rcu(&pfcp->list, &pn->pfcp_dev_list); + pn = net_generic(net, pfcp_net_id); + list_add(&pfcp->list, &pn->pfcp_dev_list); netdev_dbg(dev, "registered new PFCP interface\n"); @@ -224,7 +224,7 @@ static void pfcp_dellink(struct net_device *dev, struct list_head *head) { struct pfcp_dev *pfcp = netdev_priv(dev); - list_del_rcu(&pfcp->list); + list_del(&pfcp->list); unregister_netdevice_queue(dev, head); } @@ -247,11 +247,16 @@ static int __net_init pfcp_net_init(struct net *net) static void __net_exit pfcp_net_exit(struct net *net) { struct pfcp_net *pn = net_generic(net, pfcp_net_id); - struct pfcp_dev *pfcp; + struct pfcp_dev *pfcp, *pfcp_next; + struct net_device *dev; LIST_HEAD(list); rtnl_lock(); - list_for_each_entry(pfcp, &pn->pfcp_dev_list, list) + for_each_netdev(net, dev) + if (dev->rtnl_link_ops == &pfcp_link_ops) + pfcp_dellink(dev, &list); + + list_for_each_entry_safe(pfcp, pfcp_next, &pn->pfcp_dev_list, list) pfcp_dellink(pfcp->dev, &list); 
unregister_netdevice_many(&list); -- 2.51.0 From 2cb7c756f605ec02ffe562fb26828e4bcc5fdfc1 Mon Sep 17 00:00:00 2001 From: Stefano Garzarella Date: Fri, 10 Jan 2025 09:35:07 +0100 Subject: [PATCH 14/16] vsock/virtio: discard packets if the transport changes If the socket has been de-assigned or assigned to another transport, we must discard any packets received because they are not expected and would cause issues when we access vsk->transport. A possible scenario is described by Hyunwoo Kim in the attached link, where after a first connect() interrupted by a signal, and a second connect() failed, we can find `vsk->transport` at NULL, leading to a NULL pointer dereference. Fixes: c0cfa2d8a788 ("vsock: add multi-transports support") Cc: stable@vger.kernel.org Reported-by: Hyunwoo Kim Reported-by: Wongi Lee Closes: https://lore.kernel.org/netdev/Z2LvdTTQR7dBmPb5@v4bel-B760M-AORUS-ELITE-AX/ Signed-off-by: Stefano Garzarella Reviewed-by: Hyunwoo Kim Signed-off-by: Paolo Abeni --- net/vmw_vsock/virtio_transport_common.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/net/vmw_vsock/virtio_transport_common.c b/net/vmw_vsock/virtio_transport_common.c index 9acc13ab3f82..51a494b69be8 100644 --- a/net/vmw_vsock/virtio_transport_common.c +++ b/net/vmw_vsock/virtio_transport_common.c @@ -1628,8 +1628,11 @@ void virtio_transport_recv_pkt(struct virtio_transport *t, lock_sock(sk); - /* Check if sk has been closed before lock_sock */ - if (sock_flag(sk, SOCK_DONE)) { + /* Check if sk has been closed or assigned to another transport before + * lock_sock (note: listener sockets are not assigned to any transport) + */ + if (sock_flag(sk, SOCK_DONE) || + (sk->sk_state != TCP_LISTEN && vsk->transport != &t->transport)) { (void)virtio_transport_reset_no_sock(t, skb); release_sock(sk); sock_put(sk); -- 2.51.0 From f6abafcd32f9cfc4b1a2f820ecea70773e26d423 Mon Sep 17 00:00:00 2001 From: Stefano Garzarella Date: Fri, 10 Jan 2025 09:35:08 +0100 Subject: [PATCH 15/16] 
vsock/bpf: return early if transport is not assigned Some of the core functions can only be called if the transport has been assigned. As Michal reported, a socket might have the transport at NULL, for example after a failed connect(), causing the following trace: BUG: kernel NULL pointer dereference, address: 00000000000000a0 #PF: supervisor read access in kernel mode #PF: error_code(0x0000) - not-present page PGD 12faf8067 P4D 12faf8067 PUD 113670067 PMD 0 Oops: Oops: 0000 [#1] PREEMPT SMP NOPTI CPU: 15 UID: 0 PID: 1198 Comm: a.out Not tainted 6.13.0-rc2+ RIP: 0010:vsock_connectible_has_data+0x1f/0x40 Call Trace: vsock_bpf_recvmsg+0xca/0x5e0 sock_recvmsg+0xb9/0xc0 __sys_recvfrom+0xb3/0x130 __x64_sys_recvfrom+0x20/0x30 do_syscall_64+0x93/0x180 entry_SYSCALL_64_after_hwframe+0x76/0x7e So we need to check the `vsk->transport` in vsock_bpf_recvmsg(), especially for connected sockets (stream/seqpacket) as we already do in __vsock_connectible_recvmsg(). Fixes: 634f1a7110b4 ("vsock: support sockmap") Cc: stable@vger.kernel.org Reported-by: Michal Luczaj Closes: https://lore.kernel.org/netdev/5ca20d4c-1017-49c2-9516-f6f75fd331e9@rbox.co/ Tested-by: Michal Luczaj Reported-by: syzbot+3affdbfc986ecd9200fd@syzkaller.appspotmail.com Closes: https://lore.kernel.org/netdev/677f84a8.050a0220.25a300.01b3.GAE@google.com/ Tested-by: syzbot+3affdbfc986ecd9200fd@syzkaller.appspotmail.com Reviewed-by: Hyunwoo Kim Acked-by: Michael S. 
Tsirkin Reviewed-by: Luigi Leonardi Signed-off-by: Stefano Garzarella Signed-off-by: Paolo Abeni --- net/vmw_vsock/vsock_bpf.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/net/vmw_vsock/vsock_bpf.c b/net/vmw_vsock/vsock_bpf.c index 4aa6e74ec295..f201d9eca1df 100644 --- a/net/vmw_vsock/vsock_bpf.c +++ b/net/vmw_vsock/vsock_bpf.c @@ -77,6 +77,7 @@ static int vsock_bpf_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int flags, int *addr_len) { struct sk_psock *psock; + struct vsock_sock *vsk; int copied; psock = sk_psock_get(sk); @@ -84,6 +85,13 @@ static int vsock_bpf_recvmsg(struct sock *sk, struct msghdr *msg, return __vsock_recvmsg(sk, msg, len, flags); lock_sock(sk); + vsk = vsock_sk(sk); + + if (!vsk->transport) { + copied = -ENODEV; + goto out; + } + if (vsock_has_data(sk, psock) && sk_psock_queue_empty(psock)) { release_sock(sk); sk_psock_put(sk, psock); @@ -108,6 +116,7 @@ static int vsock_bpf_recvmsg(struct sock *sk, struct msghdr *msg, copied = sk_msg_recvmsg(sk, psock, msg, len, flags); } +out: release_sock(sk); sk_psock_put(sk, psock); -- 2.51.0 From df137da9d6d166e87e40980e36eb8e0bc90483ef Mon Sep 17 00:00:00 2001 From: Stefano Garzarella Date: Fri, 10 Jan 2025 09:35:09 +0100 Subject: [PATCH 16/16] vsock/virtio: cancel close work in the destructor During virtio_transport_release() we can schedule a delayed work to perform the closing of the socket before destruction. The destructor is called either when the socket is really destroyed (reference counter to zero), or it can also be called when we are de-assigning the transport. In the former case, we are sure the delayed work has completed, because it holds a reference until it completes, so the destructor will definitely be called after the delayed work is finished. But in the latter case, the destructor is called by AF_VSOCK core, just after the release(), so there may still be delayed work scheduled. 
Refactor the code, moving the code to delete the close work already in the do_close() to a new function. Invoke it during destruction to make sure we don't leave any pending work. Fixes: c0cfa2d8a788 ("vsock: add multi-transports support") Cc: stable@vger.kernel.org Reported-by: Hyunwoo Kim Closes: https://lore.kernel.org/netdev/Z37Sh+utS+iV3+eb@v4bel-B760M-AORUS-ELITE-AX/ Signed-off-by: Stefano Garzarella Reviewed-by: Luigi Leonardi Tested-by: Hyunwoo Kim Signed-off-by: Paolo Abeni --- net/vmw_vsock/virtio_transport_common.c | 29 ++++++++++++++++++------- 1 file changed, 21 insertions(+), 8 deletions(-) diff --git a/net/vmw_vsock/virtio_transport_common.c b/net/vmw_vsock/virtio_transport_common.c index 51a494b69be8..7f7de6d88096 100644 --- a/net/vmw_vsock/virtio_transport_common.c +++ b/net/vmw_vsock/virtio_transport_common.c @@ -26,6 +26,9 @@ /* Threshold for detecting small packets to copy */ #define GOOD_COPY_LEN 128 +static void virtio_transport_cancel_close_work(struct vsock_sock *vsk, + bool cancel_timeout); + static const struct virtio_transport * virtio_transport_get_ops(struct vsock_sock *vsk) { @@ -1109,6 +1112,8 @@ void virtio_transport_destruct(struct vsock_sock *vsk) { struct virtio_vsock_sock *vvs = vsk->trans; + virtio_transport_cancel_close_work(vsk, true); + kfree(vvs); vsk->trans = NULL; } @@ -1204,17 +1209,11 @@ static void virtio_transport_wait_close(struct sock *sk, long timeout) } } -static void virtio_transport_do_close(struct vsock_sock *vsk, - bool cancel_timeout) +static void virtio_transport_cancel_close_work(struct vsock_sock *vsk, + bool cancel_timeout) { struct sock *sk = sk_vsock(vsk); - sock_set_flag(sk, SOCK_DONE); - vsk->peer_shutdown = SHUTDOWN_MASK; - if (vsock_stream_has_data(vsk) <= 0) - sk->sk_state = TCP_CLOSING; - sk->sk_state_change(sk); - if (vsk->close_work_scheduled && (!cancel_timeout || cancel_delayed_work(&vsk->close_work))) { vsk->close_work_scheduled = false; @@ -1226,6 +1225,20 @@ static void 
virtio_transport_do_close(struct vsock_sock *vsk, } } +static void virtio_transport_do_close(struct vsock_sock *vsk, + bool cancel_timeout) +{ + struct sock *sk = sk_vsock(vsk); + + sock_set_flag(sk, SOCK_DONE); + vsk->peer_shutdown = SHUTDOWN_MASK; + if (vsock_stream_has_data(vsk) <= 0) + sk->sk_state = TCP_CLOSING; + sk->sk_state_change(sk); + + virtio_transport_cancel_close_work(vsk, cancel_timeout); +} + static void virtio_transport_close_timeout(struct work_struct *work) { struct vsock_sock *vsk = -- 2.51.0