From 4d0ab3a6885e3e9040310a8d8f54503366083626 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Wed, 2 Apr 2025 14:42:23 +0300 Subject: [PATCH 01/16] ipv6: Start path selection from the first nexthop Cited commit transitioned IPv6 path selection to use hash-threshold instead of modulo-N. With hash-threshold, each nexthop is assigned a region boundary in the multipath hash function's output space and a nexthop is chosen if the calculated hash is smaller than the nexthop's region boundary. Hash-threshold does not work correctly if path selection does not start with the first nexthop. For example, if fib6_select_path() is always passed the last nexthop in the group, then it will always be chosen because its region boundary covers the entire hash function's output space. Fix this by starting the selection process from the first nexthop and do not consider nexthops for which rt6_score_route() provided a negative score. Fixes: 3d709f69a3e7 ("ipv6: Use hash-threshold instead of modulo-N") Reported-by: Stanislav Fomichev Closes: https://lore.kernel.org/netdev/Z9RIyKZDNoka53EO@mini-arch/ Signed-off-by: Ido Schimmel Link: https://patch.msgid.link/20250402114224.293392-2-idosch@nvidia.com Signed-off-by: Jakub Kicinski --- net/ipv6/route.c | 38 +++++++++++++++++++++++++++++++++++--- 1 file changed, 35 insertions(+), 3 deletions(-) diff --git a/net/ipv6/route.c b/net/ipv6/route.c index c3406a0d45bd..864f0002034b 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -412,11 +412,35 @@ static bool rt6_check_expired(const struct rt6_info *rt) return false; } +static struct fib6_info * +rt6_multipath_first_sibling_rcu(const struct fib6_info *rt) +{ + struct fib6_info *iter; + struct fib6_node *fn; + + fn = rcu_dereference(rt->fib6_node); + if (!fn) + goto out; + iter = rcu_dereference(fn->leaf); + if (!iter) + goto out; + + while (iter) { + if (iter->fib6_metric == rt->fib6_metric && + rt6_qualify_for_ecmp(iter)) + return iter; + iter = rcu_dereference(iter->fib6_next); + } + 
+out: + return NULL; +} + void fib6_select_path(const struct net *net, struct fib6_result *res, struct flowi6 *fl6, int oif, bool have_oif_match, const struct sk_buff *skb, int strict) { - struct fib6_info *match = res->f6i; + struct fib6_info *first, *match = res->f6i; struct fib6_info *sibling; if (!match->nh && (!match->fib6_nsiblings || have_oif_match)) @@ -440,10 +464,18 @@ void fib6_select_path(const struct net *net, struct fib6_result *res, return; } - if (fl6->mp_hash <= atomic_read(&match->fib6_nh->fib_nh_upper_bound)) + first = rt6_multipath_first_sibling_rcu(match); + if (!first) goto out; - list_for_each_entry_rcu(sibling, &match->fib6_siblings, + if (fl6->mp_hash <= atomic_read(&first->fib6_nh->fib_nh_upper_bound) && + rt6_score_route(first->fib6_nh, first->fib6_flags, oif, + strict) >= 0) { + match = first; + goto out; + } + + list_for_each_entry_rcu(sibling, &first->fib6_siblings, fib6_siblings) { const struct fib6_nh *nh = sibling->fib6_nh; int nh_upper_bound; -- 2.51.0 From 8b8e0dd357165e0258d9f9cdab5366720ed2f619 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Wed, 2 Apr 2025 14:42:24 +0300 Subject: [PATCH 02/16] ipv6: Do not consider link down nexthops in path selection Nexthops whose link is down are not supposed to be considered during path selection when the "ignore_routes_with_linkdown" sysctl is set. This is done by assigning them a negative region boundary. However, when comparing the computed hash (unsigned) with the region boundary (signed), the negative region boundary is treated as unsigned, resulting in incorrect nexthop selection. Fix by treating the computed hash as signed. Note that the computed hash is always in range of [0, 2^31 - 1]. 
Fixes: 3d709f69a3e7 ("ipv6: Use hash-threshold instead of modulo-N") Signed-off-by: Ido Schimmel Reviewed-by: Willem de Bruijn Link: https://patch.msgid.link/20250402114224.293392-3-idosch@nvidia.com Signed-off-by: Jakub Kicinski --- net/ipv6/route.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 864f0002034b..ab12b816ab94 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -442,6 +442,7 @@ void fib6_select_path(const struct net *net, struct fib6_result *res, { struct fib6_info *first, *match = res->f6i; struct fib6_info *sibling; + int hash; if (!match->nh && (!match->fib6_nsiblings || have_oif_match)) goto out; @@ -468,7 +469,8 @@ void fib6_select_path(const struct net *net, struct fib6_result *res, if (!first) goto out; - if (fl6->mp_hash <= atomic_read(&first->fib6_nh->fib_nh_upper_bound) && + hash = fl6->mp_hash; + if (hash <= atomic_read(&first->fib6_nh->fib_nh_upper_bound) && rt6_score_route(first->fib6_nh, first->fib6_flags, oif, strict) >= 0) { match = first; @@ -481,7 +483,7 @@ void fib6_select_path(const struct net *net, struct fib6_result *res, int nh_upper_bound; nh_upper_bound = atomic_read(&nh->fib_nh_upper_bound); - if (fl6->mp_hash > nh_upper_bound) + if (hash > nh_upper_bound) continue; if (rt6_score_route(nh, sibling->fib6_flags, oif, strict) < 0) break; -- 2.51.0 From fda8c491db2a90ff3e6fbbae58e495b4ddddeca3 Mon Sep 17 00:00:00 2001 From: Henry Martin Date: Wed, 2 Apr 2025 21:50:36 +0800 Subject: [PATCH 03/16] arcnet: Add NULL check in com20020pci_probe() devm_kasprintf() returns NULL when memory allocation fails. Currently, com20020pci_probe() does not check for this case, which results in a NULL pointer dereference. Add NULL check after devm_kasprintf() to prevent this issue and ensure no resources are left allocated. 
Fixes: 6b17a597fc2f ("arcnet: restoring support for multiple Sohard Arcnet cards") Signed-off-by: Henry Martin Link: https://patch.msgid.link/20250402135036.44697-1-bsdhenrymartin@gmail.com Signed-off-by: Jakub Kicinski --- drivers/net/arcnet/com20020-pci.c | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/drivers/net/arcnet/com20020-pci.c b/drivers/net/arcnet/com20020-pci.c index c5e571ec94c9..0472bcdff130 100644 --- a/drivers/net/arcnet/com20020-pci.c +++ b/drivers/net/arcnet/com20020-pci.c @@ -251,18 +251,33 @@ static int com20020pci_probe(struct pci_dev *pdev, card->tx_led.default_trigger = devm_kasprintf(&pdev->dev, GFP_KERNEL, "arc%d-%d-tx", dev->dev_id, i); + if (!card->tx_led.default_trigger) { + ret = -ENOMEM; + goto err_free_arcdev; + } card->tx_led.name = devm_kasprintf(&pdev->dev, GFP_KERNEL, "pci:green:tx:%d-%d", dev->dev_id, i); - + if (!card->tx_led.name) { + ret = -ENOMEM; + goto err_free_arcdev; + } card->tx_led.dev = &dev->dev; card->recon_led.brightness_set = led_recon_set; card->recon_led.default_trigger = devm_kasprintf(&pdev->dev, GFP_KERNEL, "arc%d-%d-recon", dev->dev_id, i); + if (!card->recon_led.default_trigger) { + ret = -ENOMEM; + goto err_free_arcdev; + } card->recon_led.name = devm_kasprintf(&pdev->dev, GFP_KERNEL, "pci:red:recon:%d-%d", dev->dev_id, i); + if (!card->recon_led.name) { + ret = -ENOMEM; + goto err_free_arcdev; + } card->recon_led.dev = &dev->dev; ret = devm_led_classdev_register(&pdev->dev, &card->tx_led); -- 2.51.0 From 053f3ff67d7feefc75797863f3d84b47ad47086f Mon Sep 17 00:00:00 2001 From: Dave Marquardt Date: Wed, 2 Apr 2025 10:44:03 -0500 Subject: [PATCH 04/16] net: ibmveth: make veth_pool_store stop hanging v2: - Created a single error handling unlock and exit in veth_pool_store - Greatly expanded commit message with previous explanatory-only text Summary: Use rtnl_mutex to synchronize veth_pool_store with itself, ibmveth_close and ibmveth_open, preventing multiple calls in a row to 
napi_disable. Background: Two (or more) threads could call veth_pool_store through writing to /sys/devices/vio/30000002/pool*/*. You can do this easily with a little shell script. This causes a hang. I configured LOCKDEP, compiled ibmveth.c with DEBUG, and built a new kernel. I ran this test again and saw: Setting pool0/active to 0 Setting pool1/active to 1 [ 73.911067][ T4365] ibmveth 30000002 eth0: close starting Setting pool1/active to 1 Setting pool1/active to 0 [ 73.911367][ T4366] ibmveth 30000002 eth0: close starting [ 73.916056][ T4365] ibmveth 30000002 eth0: close complete [ 73.916064][ T4365] ibmveth 30000002 eth0: open starting [ 110.808564][ T712] systemd-journald[712]: Sent WATCHDOG=1 notification. [ 230.808495][ T712] systemd-journald[712]: Sent WATCHDOG=1 notification. [ 243.683786][ T123] INFO: task stress.sh:4365 blocked for more than 122 seconds. [ 243.683827][ T123] Not tainted 6.14.0-01103-g2df0c02dab82-dirty #8 [ 243.683833][ T123] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. 
[ 243.683838][ T123] task:stress.sh state:D stack:28096 pid:4365 tgid:4365 ppid:4364 task_flags:0x400040 flags:0x00042000 [ 243.683852][ T123] Call Trace: [ 243.683857][ T123] [c00000000c38f690] [0000000000000001] 0x1 (unreliable) [ 243.683868][ T123] [c00000000c38f840] [c00000000001f908] __switch_to+0x318/0x4e0 [ 243.683878][ T123] [c00000000c38f8a0] [c000000001549a70] __schedule+0x500/0x12a0 [ 243.683888][ T123] [c00000000c38f9a0] [c00000000154a878] schedule+0x68/0x210 [ 243.683896][ T123] [c00000000c38f9d0] [c00000000154ac80] schedule_preempt_disabled+0x30/0x50 [ 243.683904][ T123] [c00000000c38fa00] [c00000000154dbb0] __mutex_lock+0x730/0x10f0 [ 243.683913][ T123] [c00000000c38fb10] [c000000001154d40] napi_enable+0x30/0x60 [ 243.683921][ T123] [c00000000c38fb40] [c000000000f4ae94] ibmveth_open+0x68/0x5dc [ 243.683928][ T123] [c00000000c38fbe0] [c000000000f4aa20] veth_pool_store+0x220/0x270 [ 243.683936][ T123] [c00000000c38fc70] [c000000000826278] sysfs_kf_write+0x68/0xb0 [ 243.683944][ T123] [c00000000c38fcb0] [c0000000008240b8] kernfs_fop_write_iter+0x198/0x2d0 [ 243.683951][ T123] [c00000000c38fd00] [c00000000071b9ac] vfs_write+0x34c/0x650 [ 243.683958][ T123] [c00000000c38fdc0] [c00000000071bea8] ksys_write+0x88/0x150 [ 243.683966][ T123] [c00000000c38fe10] [c0000000000317f4] system_call_exception+0x124/0x340 [ 243.683973][ T123] [c00000000c38fe50] [c00000000000d05c] system_call_vectored_common+0x15c/0x2ec ... 
[ 243.684087][ T123] Showing all locks held in the system: [ 243.684095][ T123] 1 lock held by khungtaskd/123: [ 243.684099][ T123] #0: c00000000278e370 (rcu_read_lock){....}-{1:2}, at: debug_show_all_locks+0x50/0x248 [ 243.684114][ T123] 4 locks held by stress.sh/4365: [ 243.684119][ T123] #0: c00000003a4cd3f8 (sb_writers#3){.+.+}-{0:0}, at: ksys_write+0x88/0x150 [ 243.684132][ T123] #1: c000000041aea888 (&of->mutex#2){+.+.}-{3:3}, at: kernfs_fop_write_iter+0x154/0x2d0 [ 243.684143][ T123] #2: c0000000366fb9a8 (kn->active#64){.+.+}-{0:0}, at: kernfs_fop_write_iter+0x160/0x2d0 [ 243.684155][ T123] #3: c000000035ff4cb8 (&dev->lock){+.+.}-{3:3}, at: napi_enable+0x30/0x60 [ 243.684166][ T123] 5 locks held by stress.sh/4366: [ 243.684170][ T123] #0: c00000003a4cd3f8 (sb_writers#3){.+.+}-{0:0}, at: ksys_write+0x88/0x150 [ 243.684183][ T123] #1: c00000000aee2288 (&of->mutex#2){+.+.}-{3:3}, at: kernfs_fop_write_iter+0x154/0x2d0 [ 243.684194][ T123] #2: c0000000366f4ba8 (kn->active#64){.+.+}-{0:0}, at: kernfs_fop_write_iter+0x160/0x2d0 [ 243.684205][ T123] #3: c000000035ff4cb8 (&dev->lock){+.+.}-{3:3}, at: napi_disable+0x30/0x60 [ 243.684216][ T123] #4: c0000003ff9bbf18 (&rq->__lock){-.-.}-{2:2}, at: __schedule+0x138/0x12a0 From the ibmveth debug, two threads are calling veth_pool_store, which calls ibmveth_close and ibmveth_open. Here's the sequence: T4365 T4366 ----------------- ----------------- --------- veth_pool_store veth_pool_store ibmveth_close ibmveth_close napi_disable napi_disable ibmveth_open napi_enable <- HANG ibmveth_close calls napi_disable at the top and ibmveth_open calls napi_enable at the top. https://docs.kernel.org/networking/napi.html says The control APIs are not idempotent. Control API calls are safe against concurrent use of datapath APIs but an incorrect sequence of control API calls may result in crashes, deadlocks, or race conditions. For example, calling napi_disable() multiple times in a row will deadlock. 
In the normal open and close paths, rtnl_mutex is acquired to prevent other callers. This is missing from veth_pool_store. Using rtnl_mutex in veth_pool_store fixes these hangs. Signed-off-by: Dave Marquardt Fixes: 860f242eb534 ("[PATCH] ibmveth change buffer pools dynamically") Reviewed-by: Nick Child Reviewed-by: Simon Horman Link: https://patch.msgid.link/20250402154403.386744-1-davemarq@linux.ibm.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/ibm/ibmveth.c | 39 +++++++++++++++++++++--------- 1 file changed, 27 insertions(+), 12 deletions(-) diff --git a/drivers/net/ethernet/ibm/ibmveth.c b/drivers/net/ethernet/ibm/ibmveth.c index b619a3ec245b..04192190beba 100644 --- a/drivers/net/ethernet/ibm/ibmveth.c +++ b/drivers/net/ethernet/ibm/ibmveth.c @@ -1802,18 +1802,22 @@ static ssize_t veth_pool_store(struct kobject *kobj, struct attribute *attr, long value = simple_strtol(buf, NULL, 10); long rc; + rtnl_lock(); + if (attr == &veth_active_attr) { if (value && !pool->active) { if (netif_running(netdev)) { if (ibmveth_alloc_buffer_pool(pool)) { netdev_err(netdev, "unable to alloc pool\n"); - return -ENOMEM; + rc = -ENOMEM; + goto unlock_err; } pool->active = 1; ibmveth_close(netdev); - if ((rc = ibmveth_open(netdev))) - return rc; + rc = ibmveth_open(netdev); + if (rc) + goto unlock_err; } else { pool->active = 1; } @@ -1833,48 +1837,59 @@ static ssize_t veth_pool_store(struct kobject *kobj, struct attribute *attr, if (i == IBMVETH_NUM_BUFF_POOLS) { netdev_err(netdev, "no active pool >= MTU\n"); - return -EPERM; + rc = -EPERM; + goto unlock_err; } if (netif_running(netdev)) { ibmveth_close(netdev); pool->active = 0; - if ((rc = ibmveth_open(netdev))) - return rc; + rc = ibmveth_open(netdev); + if (rc) + goto unlock_err; } pool->active = 0; } } else if (attr == &veth_num_attr) { if (value <= 0 || value > IBMVETH_MAX_POOL_COUNT) { - return -EINVAL; + rc = -EINVAL; + goto unlock_err; } else { if (netif_running(netdev)) { ibmveth_close(netdev); pool->size = 
value; - if ((rc = ibmveth_open(netdev))) - return rc; + rc = ibmveth_open(netdev); + if (rc) + goto unlock_err; } else { pool->size = value; } } } else if (attr == &veth_size_attr) { if (value <= IBMVETH_BUFF_OH || value > IBMVETH_MAX_BUF_SIZE) { - return -EINVAL; + rc = -EINVAL; + goto unlock_err; } else { if (netif_running(netdev)) { ibmveth_close(netdev); pool->buff_size = value; - if ((rc = ibmveth_open(netdev))) - return rc; + rc = ibmveth_open(netdev); + if (rc) + goto unlock_err; } else { pool->buff_size = value; } } } + rtnl_unlock(); /* kick the interrupt handler to allocate/deallocate pools */ ibmveth_interrupt(netdev->irq, netdev); return count; + +unlock_err: + rtnl_unlock(); + return rc; } -- 2.51.0 From ec304b70d46bd2ed66541c5b57b63276529e05b1 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 2 Apr 2025 18:34:04 -0700 Subject: [PATCH 05/16] net: move mp dev config validation to __net_mp_open_rxq() devmem code performs a number of safety checks to avoid having to reimplement all of them in the drivers. Move those to __net_mp_open_rxq() and reuse that function for binding to make sure that io_uring ZC also benefits from them. While at it rename the queue ID variable to rxq_idx in __net_mp_open_rxq(), we touch most of the relevant lines. The XArray insertion is reordered after the netdev_rx_queue_restart() call, otherwise we'd need to duplicate the queue index check or risk inserting an invalid pointer. The XArray allocation failures should be extremely rare. 
Reviewed-by: Mina Almasry Acked-by: Stanislav Fomichev Fixes: 6e18ed929d3b ("net: add helpers for setting a memory provider on an rx queue") Link: https://patch.msgid.link/20250403013405.2827250-2-kuba@kernel.org Signed-off-by: Jakub Kicinski --- include/net/page_pool/memory_provider.h | 6 +++ net/core/devmem.c | 50 +++++-------------------- net/core/netdev-genl.c | 6 --- net/core/netdev_rx_queue.c | 49 ++++++++++++++++++------ 4 files changed, 54 insertions(+), 57 deletions(-) diff --git a/include/net/page_pool/memory_provider.h b/include/net/page_pool/memory_provider.h index b3e665897767..ada4f968960a 100644 --- a/include/net/page_pool/memory_provider.h +++ b/include/net/page_pool/memory_provider.h @@ -6,6 +6,7 @@ #include struct netdev_rx_queue; +struct netlink_ext_ack; struct sk_buff; struct memory_provider_ops { @@ -24,8 +25,13 @@ void net_mp_niov_clear_page_pool(struct net_iov *niov); int net_mp_open_rxq(struct net_device *dev, unsigned ifq_idx, struct pp_memory_provider_params *p); +int __net_mp_open_rxq(struct net_device *dev, unsigned int rxq_idx, + const struct pp_memory_provider_params *p, + struct netlink_ext_ack *extack); void net_mp_close_rxq(struct net_device *dev, unsigned ifq_idx, struct pp_memory_provider_params *old_p); +void __net_mp_close_rxq(struct net_device *dev, unsigned int rxq_idx, + const struct pp_memory_provider_params *old_p); /** * net_mp_netmem_place_in_cache() - give a netmem to a page pool diff --git a/net/core/devmem.c b/net/core/devmem.c index ee145a2aa41c..f2ce3c2ebc97 100644 --- a/net/core/devmem.c +++ b/net/core/devmem.c @@ -8,7 +8,6 @@ */ #include -#include #include #include #include @@ -143,57 +142,28 @@ int net_devmem_bind_dmabuf_to_queue(struct net_device *dev, u32 rxq_idx, struct net_devmem_dmabuf_binding *binding, struct netlink_ext_ack *extack) { + struct pp_memory_provider_params mp_params = { + .mp_priv = binding, + .mp_ops = &dmabuf_devmem_ops, + }; struct netdev_rx_queue *rxq; u32 xa_idx; int err; - if (rxq_idx >= 
dev->real_num_rx_queues) { - NL_SET_ERR_MSG(extack, "rx queue index out of range"); - return -ERANGE; - } - - if (dev->cfg->hds_config != ETHTOOL_TCP_DATA_SPLIT_ENABLED) { - NL_SET_ERR_MSG(extack, "tcp-data-split is disabled"); - return -EINVAL; - } - - if (dev->cfg->hds_thresh) { - NL_SET_ERR_MSG(extack, "hds-thresh is not zero"); - return -EINVAL; - } + err = __net_mp_open_rxq(dev, rxq_idx, &mp_params, extack); + if (err) + return err; rxq = __netif_get_rx_queue(dev, rxq_idx); - if (rxq->mp_params.mp_ops) { - NL_SET_ERR_MSG(extack, "designated queue already memory provider bound"); - return -EEXIST; - } - -#ifdef CONFIG_XDP_SOCKETS - if (rxq->pool) { - NL_SET_ERR_MSG(extack, "designated queue already in use by AF_XDP"); - return -EBUSY; - } -#endif - err = xa_alloc(&binding->bound_rxqs, &xa_idx, rxq, xa_limit_32b, GFP_KERNEL); if (err) - return err; - - rxq->mp_params.mp_priv = binding; - rxq->mp_params.mp_ops = &dmabuf_devmem_ops; - - err = netdev_rx_queue_restart(dev, rxq_idx); - if (err) - goto err_xa_erase; + goto err_close_rxq; return 0; -err_xa_erase: - rxq->mp_params.mp_priv = NULL; - rxq->mp_params.mp_ops = NULL; - xa_erase(&binding->bound_rxqs, xa_idx); - +err_close_rxq: + __net_mp_close_rxq(dev, rxq_idx, &mp_params); return err; } diff --git a/net/core/netdev-genl.c b/net/core/netdev-genl.c index 3afeaa8c5dc5..5d7af50fe702 100644 --- a/net/core/netdev-genl.c +++ b/net/core/netdev-genl.c @@ -874,12 +874,6 @@ int netdev_nl_bind_rx_doit(struct sk_buff *skb, struct genl_info *info) goto err_unlock; } - if (dev_xdp_prog_count(netdev)) { - NL_SET_ERR_MSG(info->extack, "unable to bind dmabuf to device with XDP program attached"); - err = -EEXIST; - goto err_unlock; - } - binding = net_devmem_bind_dmabuf(netdev, dmabuf_fd, info->extack); if (IS_ERR(binding)) { err = PTR_ERR(binding); diff --git a/net/core/netdev_rx_queue.c b/net/core/netdev_rx_queue.c index 3af716f77a13..556b5393ec9f 100644 --- a/net/core/netdev_rx_queue.c +++ b/net/core/netdev_rx_queue.c @@ 
-1,5 +1,6 @@ // SPDX-License-Identifier: GPL-2.0-or-later +#include #include #include #include @@ -86,8 +87,9 @@ err_free_new_mem: } EXPORT_SYMBOL_NS_GPL(netdev_rx_queue_restart, "NETDEV_INTERNAL"); -static int __net_mp_open_rxq(struct net_device *dev, unsigned ifq_idx, - struct pp_memory_provider_params *p) +int __net_mp_open_rxq(struct net_device *dev, unsigned int rxq_idx, + const struct pp_memory_provider_params *p, + struct netlink_ext_ack *extack) { struct netdev_rx_queue *rxq; int ret; @@ -95,16 +97,41 @@ static int __net_mp_open_rxq(struct net_device *dev, unsigned ifq_idx, if (!netdev_need_ops_lock(dev)) return -EOPNOTSUPP; - if (ifq_idx >= dev->real_num_rx_queues) + if (rxq_idx >= dev->real_num_rx_queues) return -EINVAL; - ifq_idx = array_index_nospec(ifq_idx, dev->real_num_rx_queues); + rxq_idx = array_index_nospec(rxq_idx, dev->real_num_rx_queues); - rxq = __netif_get_rx_queue(dev, ifq_idx); - if (rxq->mp_params.mp_ops) + if (rxq_idx >= dev->real_num_rx_queues) { + NL_SET_ERR_MSG(extack, "rx queue index out of range"); + return -ERANGE; + } + if (dev->cfg->hds_config != ETHTOOL_TCP_DATA_SPLIT_ENABLED) { + NL_SET_ERR_MSG(extack, "tcp-data-split is disabled"); + return -EINVAL; + } + if (dev->cfg->hds_thresh) { + NL_SET_ERR_MSG(extack, "hds-thresh is not zero"); + return -EINVAL; + } + if (dev_xdp_prog_count(dev)) { + NL_SET_ERR_MSG(extack, "unable to custom memory provider to device with XDP program attached"); return -EEXIST; + } + + rxq = __netif_get_rx_queue(dev, rxq_idx); + if (rxq->mp_params.mp_ops) { + NL_SET_ERR_MSG(extack, "designated queue already memory provider bound"); + return -EEXIST; + } +#ifdef CONFIG_XDP_SOCKETS + if (rxq->pool) { + NL_SET_ERR_MSG(extack, "designated queue already in use by AF_XDP"); + return -EBUSY; + } +#endif rxq->mp_params = *p; - ret = netdev_rx_queue_restart(dev, ifq_idx); + ret = netdev_rx_queue_restart(dev, rxq_idx); if (ret) { rxq->mp_params.mp_ops = NULL; rxq->mp_params.mp_priv = NULL; @@ -112,19 +139,19 @@ 
static int __net_mp_open_rxq(struct net_device *dev, unsigned ifq_idx, return ret; } -int net_mp_open_rxq(struct net_device *dev, unsigned ifq_idx, +int net_mp_open_rxq(struct net_device *dev, unsigned int rxq_idx, struct pp_memory_provider_params *p) { int ret; netdev_lock(dev); - ret = __net_mp_open_rxq(dev, ifq_idx, p); + ret = __net_mp_open_rxq(dev, rxq_idx, p, NULL); netdev_unlock(dev); return ret; } -static void __net_mp_close_rxq(struct net_device *dev, unsigned ifq_idx, - struct pp_memory_provider_params *old_p) +void __net_mp_close_rxq(struct net_device *dev, unsigned int ifq_idx, + const struct pp_memory_provider_params *old_p) { struct netdev_rx_queue *rxq; -- 2.51.0 From 34f71de3f548eba0604c9cbabc1eb68b2f81fa0f Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 2 Apr 2025 18:34:05 -0700 Subject: [PATCH 06/16] net: avoid false positive warnings in __net_mp_close_rxq() Commit under Fixes solved the problem of spurious warnings when we uninstall an MP from a device while it's down. The __net_mp_close_rxq() which is used by io_uring was not fixed. Move the fix over and reuse __net_mp_close_rxq() in the devmem path. 
Acked-by: Stanislav Fomichev Fixes: a70f891e0fa0 ("net: devmem: do not WARN conditionally after netdev_rx_queue_restart()") Reviewed-by: Mina Almasry Link: https://patch.msgid.link/20250403013405.2827250-3-kuba@kernel.org Signed-off-by: Jakub Kicinski --- net/core/devmem.c | 12 +++++------- net/core/netdev_rx_queue.c | 4 +++- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/net/core/devmem.c b/net/core/devmem.c index f2ce3c2ebc97..6e27a47d0493 100644 --- a/net/core/devmem.c +++ b/net/core/devmem.c @@ -116,21 +116,19 @@ void net_devmem_unbind_dmabuf(struct net_devmem_dmabuf_binding *binding) struct netdev_rx_queue *rxq; unsigned long xa_idx; unsigned int rxq_idx; - int err; if (binding->list.next) list_del(&binding->list); xa_for_each(&binding->bound_rxqs, xa_idx, rxq) { - WARN_ON(rxq->mp_params.mp_priv != binding); - - rxq->mp_params.mp_priv = NULL; - rxq->mp_params.mp_ops = NULL; + const struct pp_memory_provider_params mp_params = { + .mp_priv = binding, + .mp_ops = &dmabuf_devmem_ops, + }; rxq_idx = get_netdev_rx_queue_index(rxq); - err = netdev_rx_queue_restart(binding->dev, rxq_idx); - WARN_ON(err && err != -ENETDOWN); + __net_mp_close_rxq(binding->dev, rxq_idx, &mp_params); } xa_erase(&net_devmem_dmabuf_bindings, binding->id); diff --git a/net/core/netdev_rx_queue.c b/net/core/netdev_rx_queue.c index 556b5393ec9f..d126f10197bf 100644 --- a/net/core/netdev_rx_queue.c +++ b/net/core/netdev_rx_queue.c @@ -154,6 +154,7 @@ void __net_mp_close_rxq(struct net_device *dev, unsigned int ifq_idx, const struct pp_memory_provider_params *old_p) { struct netdev_rx_queue *rxq; + int err; if (WARN_ON_ONCE(ifq_idx >= dev->real_num_rx_queues)) return; @@ -173,7 +174,8 @@ void __net_mp_close_rxq(struct net_device *dev, unsigned int ifq_idx, rxq->mp_params.mp_ops = NULL; rxq->mp_params.mp_priv = NULL; - WARN_ON(netdev_rx_queue_restart(dev, ifq_idx)); + err = netdev_rx_queue_restart(dev, ifq_idx); + WARN_ON(err && err != -ENETDOWN); } void net_mp_close_rxq(struct 
net_device *dev, unsigned ifq_idx, -- 2.51.0 From 0802c32d4b03a26604c2db2c8a63b34a80361305 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 2 Apr 2025 18:37:03 -0700 Subject: [PATCH 07/16] netlink: specs: rt_addr: fix the spec format / schema failures The spec is mis-formatted, schema validation says: Failed validating 'type' in schema['properties']['operations']['properties']['list']['items']['properties']['dump']['properties']['request']['properties']['value']: {'minimum': 0, 'type': 'integer'} On instance['operations']['list'][3]['dump']['request']['value']: '58 - ifa-family' The ifa-family clearly wants to be part of an attribute list. Reviewed-by: Jacob Keller Reviewed-by: Donald Hunter Reviewed-by: Yuyang Huang Fixes: 4f280376e531 ("selftests/net: Add selftest for IPv4 RTM_GETMULTICAST support") Link: https://patch.msgid.link/20250403013706.2828322-2-kuba@kernel.org Signed-off-by: Jakub Kicinski --- Documentation/netlink/specs/rt_addr.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/Documentation/netlink/specs/rt_addr.yaml b/Documentation/netlink/specs/rt_addr.yaml index 5dd5469044c7..3bc9b6f9087e 100644 --- a/Documentation/netlink/specs/rt_addr.yaml +++ b/Documentation/netlink/specs/rt_addr.yaml @@ -187,6 +187,7 @@ operations: dump: request: value: 58 + attributes: - ifa-family reply: value: 58 -- 2.51.0 From 524c03585fda36584cc7ada49a1827666d37eb4e Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 2 Apr 2025 18:37:04 -0700 Subject: [PATCH 08/16] netlink: specs: rt_addr: fix get multi command name Command names should match C defines, codegens may depend on it. 
Reviewed-by: Jacob Keller Fixes: 4f280376e531 ("selftests/net: Add selftest for IPv4 RTM_GETMULTICAST support") Reviewed-by: Donald Hunter Link: https://patch.msgid.link/20250403013706.2828322-3-kuba@kernel.org Signed-off-by: Jakub Kicinski --- Documentation/netlink/specs/rt_addr.yaml | 2 +- tools/testing/selftests/net/rtnetlink.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/Documentation/netlink/specs/rt_addr.yaml b/Documentation/netlink/specs/rt_addr.yaml index 3bc9b6f9087e..1650dc3f091a 100644 --- a/Documentation/netlink/specs/rt_addr.yaml +++ b/Documentation/netlink/specs/rt_addr.yaml @@ -169,7 +169,7 @@ operations: value: 20 attributes: *ifaddr-all - - name: getmaddrs + name: getmulticast doc: Get / dump IPv4/IPv6 multicast addresses. attribute-set: addr-attrs fixed-header: ifaddrmsg diff --git a/tools/testing/selftests/net/rtnetlink.py b/tools/testing/selftests/net/rtnetlink.py index 80950888800b..69436415d56e 100755 --- a/tools/testing/selftests/net/rtnetlink.py +++ b/tools/testing/selftests/net/rtnetlink.py @@ -12,7 +12,7 @@ def dump_mcaddr_check(rtnl: RtnlAddrFamily) -> None: At least the loopback interface should have this address. """ - addresses = rtnl.getmaddrs({"ifa-family": socket.AF_INET}, dump=True) + addresses = rtnl.getmulticast({"ifa-family": socket.AF_INET}, dump=True) all_host_multicasts = [ addr for addr in addresses if addr['ifa-multicast'] == IPV4_ALL_HOSTS_MULTICAST -- 2.51.0 From 0c8e30252d9fe8127f90b7a0a293872b368ebf3c Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 2 Apr 2025 18:37:05 -0700 Subject: [PATCH 09/16] netlink: specs: rt_addr: pull the ifa- prefix out of the names YAML specs don't normally include the C prefix name in the name of the YAML attr. Remove the ifa- prefix from all attributes in addr-attrs and specify name-prefix instead. This is a bit risky, hopefully there aren't many users out there. 
Fixes: dfb0f7d9d979 ("doc/netlink: Add spec for rt addr messages") Reviewed-by: Donald Hunter Link: https://patch.msgid.link/20250403013706.2828322-4-kuba@kernel.org Signed-off-by: Jakub Kicinski --- Documentation/netlink/specs/rt_addr.yaml | 39 ++++++++++++------------ tools/testing/selftests/net/rtnetlink.py | 2 +- 2 files changed, 21 insertions(+), 20 deletions(-) diff --git a/Documentation/netlink/specs/rt_addr.yaml b/Documentation/netlink/specs/rt_addr.yaml index 1650dc3f091a..df6b23f06a22 100644 --- a/Documentation/netlink/specs/rt_addr.yaml +++ b/Documentation/netlink/specs/rt_addr.yaml @@ -78,45 +78,46 @@ definitions: attribute-sets: - name: addr-attrs + name-prefix: ifa- attributes: - - name: ifa-address + name: address type: binary display-hint: ipv4 - - name: ifa-local + name: local type: binary display-hint: ipv4 - - name: ifa-label + name: label type: string - - name: ifa-broadcast + name: broadcast type: binary display-hint: ipv4 - - name: ifa-anycast + name: anycast type: binary - - name: ifa-cacheinfo + name: cacheinfo type: binary struct: ifa-cacheinfo - - name: ifa-multicast + name: multicast type: binary - - name: ifa-flags + name: flags type: u32 enum: ifa-flags enum-as-flags: true - - name: ifa-rt-priority + name: rt-priority type: u32 - - name: ifa-target-netnsid + name: target-netnsid type: binary - - name: ifa-proto + name: proto type: u8 @@ -137,10 +138,10 @@ operations: - ifa-prefixlen - ifa-scope - ifa-index - - ifa-address - - ifa-label - - ifa-local - - ifa-cacheinfo + - address + - label + - local + - cacheinfo - name: deladdr doc: Remove address @@ -154,8 +155,8 @@ operations: - ifa-prefixlen - ifa-scope - ifa-index - - ifa-address - - ifa-local + - address + - local - name: getaddr doc: Dump address information. 
@@ -182,8 +183,8 @@ operations: reply: value: 58 attributes: &mcaddr-attrs - - ifa-multicast - - ifa-cacheinfo + - multicast + - cacheinfo dump: request: value: 58 diff --git a/tools/testing/selftests/net/rtnetlink.py b/tools/testing/selftests/net/rtnetlink.py index 69436415d56e..e9ad5e88da97 100755 --- a/tools/testing/selftests/net/rtnetlink.py +++ b/tools/testing/selftests/net/rtnetlink.py @@ -15,7 +15,7 @@ def dump_mcaddr_check(rtnl: RtnlAddrFamily) -> None: addresses = rtnl.getmulticast({"ifa-family": socket.AF_INET}, dump=True) all_host_multicasts = [ - addr for addr in addresses if addr['ifa-multicast'] == IPV4_ALL_HOSTS_MULTICAST + addr for addr in addresses if addr['multicast'] == IPV4_ALL_HOSTS_MULTICAST ] ksft_ge(len(all_host_multicasts), 1, -- 2.51.0 From 1a1eba0e9899c286914032c78708c614b016704b Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 2 Apr 2025 18:37:06 -0700 Subject: [PATCH 10/16] netlink: specs: rt_route: pull the rta- prefix out of the names YAML specs don't normally include the C prefix name in the name of the YAML attr. Remove the rta- and rtax- prefixes from all attributes in route-attrs and metrics and specify name-prefix instead. This is a bit risky, hopefully there aren't many users out there. 
Fixes: 023289b4f582 ("doc/netlink: Add spec for rt route messages") Reviewed-by: Donald Hunter Link: https://patch.msgid.link/20250403013706.2828322-5-kuba@kernel.org Signed-off-by: Jakub Kicinski --- Documentation/netlink/specs/rt_route.yaml | 180 +++++++++++----------- 1 file changed, 91 insertions(+), 89 deletions(-) diff --git a/Documentation/netlink/specs/rt_route.yaml b/Documentation/netlink/specs/rt_route.yaml index a674103e5bc4..292469c7d4b9 100644 --- a/Documentation/netlink/specs/rt_route.yaml +++ b/Documentation/netlink/specs/rt_route.yaml @@ -80,165 +80,167 @@ definitions: attribute-sets: - name: route-attrs + name-prefix: rta- attributes: - - name: rta-dst + name: dst type: binary display-hint: ipv4 - - name: rta-src + name: src type: binary display-hint: ipv4 - - name: rta-iif + name: iif type: u32 - - name: rta-oif + name: oif type: u32 - - name: rta-gateway + name: gateway type: binary display-hint: ipv4 - - name: rta-priority + name: priority type: u32 - - name: rta-prefsrc + name: prefsrc type: binary display-hint: ipv4 - - name: rta-metrics + name: metrics type: nest - nested-attributes: rta-metrics + nested-attributes: metrics - - name: rta-multipath + name: multipath type: binary - - name: rta-protoinfo # not used + name: protoinfo # not used type: binary - - name: rta-flow + name: flow type: u32 - - name: rta-cacheinfo + name: cacheinfo type: binary struct: rta-cacheinfo - - name: rta-session # not used + name: session # not used type: binary - - name: rta-mp-algo # not used + name: mp-algo # not used type: binary - - name: rta-table + name: table type: u32 - - name: rta-mark + name: mark type: u32 - - name: rta-mfc-stats + name: mfc-stats type: binary - - name: rta-via + name: via type: binary - - name: rta-newdst + name: newdst type: binary - - name: rta-pref + name: pref type: u8 - - name: rta-encap-type + name: encap-type type: u16 - - name: rta-encap + name: encap type: binary # tunnel specific nest - - name: rta-expires + name: expires 
type: u32 - - name: rta-pad + name: pad type: binary - - name: rta-uid + name: uid type: u32 - - name: rta-ttl-propagate + name: ttl-propagate type: u8 - - name: rta-ip-proto + name: ip-proto type: u8 - - name: rta-sport + name: sport type: u16 - - name: rta-dport + name: dport type: u16 - - name: rta-nh-id + name: nh-id type: u32 - - name: rta-flowlabel + name: flowlabel type: u32 byte-order: big-endian display-hint: hex - - name: rta-metrics + name: metrics + name-prefix: rtax- attributes: - - name: rtax-unspec + name: unspec type: unused value: 0 - - name: rtax-lock + name: lock type: u32 - - name: rtax-mtu + name: mtu type: u32 - - name: rtax-window + name: window type: u32 - - name: rtax-rtt + name: rtt type: u32 - - name: rtax-rttvar + name: rttvar type: u32 - - name: rtax-ssthresh + name: ssthresh type: u32 - - name: rtax-cwnd + name: cwnd type: u32 - - name: rtax-advmss + name: advmss type: u32 - - name: rtax-reordering + name: reordering type: u32 - - name: rtax-hoplimit + name: hoplimit type: u32 - - name: rtax-initcwnd + name: initcwnd type: u32 - - name: rtax-features + name: features type: u32 - - name: rtax-rto-min + name: rto-min type: u32 - - name: rtax-initrwnd + name: initrwnd type: u32 - - name: rtax-quickack + name: quickack type: u32 - - name: rtax-cc-algo + name: cc-algo type: string - - name: rtax-fastopen-no-cookie + name: fastopen-no-cookie type: u32 operations: @@ -254,18 +256,18 @@ operations: value: 26 attributes: - rtm-family - - rta-src + - src - rtm-src-len - - rta-dst + - dst - rtm-dst-len - - rta-iif - - rta-oif - - rta-ip-proto - - rta-sport - - rta-dport - - rta-mark - - rta-uid - - rta-flowlabel + - iif + - oif + - ip-proto + - sport + - dport + - mark + - uid + - flowlabel reply: value: 24 attributes: &all-route-attrs @@ -278,34 +280,34 @@ operations: - rtm-scope - rtm-type - rtm-flags - - rta-dst - - rta-src - - rta-iif - - rta-oif - - rta-gateway - - rta-priority - - rta-prefsrc - - rta-metrics - - rta-multipath - - rta-flow - 
- rta-cacheinfo - - rta-table - - rta-mark - - rta-mfc-stats - - rta-via - - rta-newdst - - rta-pref - - rta-encap-type - - rta-encap - - rta-expires - - rta-pad - - rta-uid - - rta-ttl-propagate - - rta-ip-proto - - rta-sport - - rta-dport - - rta-nh-id - - rta-flowlabel + - dst + - src + - iif + - oif + - gateway + - priority + - prefsrc + - metrics + - multipath + - flow + - cacheinfo + - table + - mark + - mfc-stats + - via + - newdst + - pref + - encap-type + - encap + - expires + - pad + - uid + - ttl-propagate + - ip-proto + - sport + - dport + - nh-id + - flowlabel dump: request: value: 26 -- 2.51.0 From 94f68c0f99a548d33a102672690100bf76a7c460 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Thu, 3 Apr 2025 07:56:36 -0700 Subject: [PATCH 11/16] selftests: net: amt: indicate progress in the stress test Our CI expects output from the test at least once every 10 minutes. The AMT test when running on debug kernel is just on the edge of that time for the stress test. Improve the output: - print the name of the test first, before starting it, - output a dot every 10% of the way. Output after: TEST: amt discovery [ OK ] TEST: IPv4 amt multicast forwarding [ OK ] TEST: IPv6 amt multicast forwarding [ OK ] TEST: IPv4 amt traffic forwarding torture .......... [ OK ] TEST: IPv6 amt traffic forwarding torture .......... 
[ OK ] Reviewed-by: Taehee Yoo Link: https://patch.msgid.link/20250403145636.2891166-1-kuba@kernel.org Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/amt.sh | 20 ++++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) diff --git a/tools/testing/selftests/net/amt.sh b/tools/testing/selftests/net/amt.sh index d458b45c775b..3ef209cacb8e 100755 --- a/tools/testing/selftests/net/amt.sh +++ b/tools/testing/selftests/net/amt.sh @@ -194,15 +194,21 @@ test_remote_ip() send_mcast_torture4() { - ip netns exec "${SOURCE}" bash -c \ - 'cat /dev/urandom | head -c 1G | nc -w 1 -u 239.0.0.1 4001' + for i in `seq 10`; do + ip netns exec "${SOURCE}" bash -c \ + 'cat /dev/urandom | head -c 100M | nc -w 1 -u 239.0.0.1 4001' + echo -n "." + done } send_mcast_torture6() { - ip netns exec "${SOURCE}" bash -c \ - 'cat /dev/urandom | head -c 1G | nc -w 1 -u ff0e::5:6 6001' + for i in `seq 10`; do + ip netns exec "${SOURCE}" bash -c \ + 'cat /dev/urandom | head -c 100M | nc -w 1 -u ff0e::5:6 6001' + echo -n "." + done } check_features() @@ -278,10 +284,12 @@ wait $pid || err=$? if [ $err -eq 1 ]; then ERR=1 fi +printf "TEST: %-50s" "IPv4 amt traffic forwarding torture" send_mcast_torture4 -printf "TEST: %-60s [ OK ]\n" "IPv4 amt traffic forwarding torture" +printf " [ OK ]\n" +printf "TEST: %-50s" "IPv6 amt traffic forwarding torture" send_mcast_torture6 -printf "TEST: %-60s [ OK ]\n" "IPv6 amt traffic forwarding torture" +printf " [ OK ]\n" sleep 5 if [ "${ERR}" -eq 1 ]; then echo "Some tests failed." 
>&2 -- 2.51.0 From 9bae8f4f21689b96a4b4fc505740dd97b9142c41 Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Thu, 3 Apr 2025 15:08:41 -0700 Subject: [PATCH 12/16] selftests/bpf: Make res_spin_lock test less verbose Currently, the res_spin_lock test is too chatty as it constantly prints the test_run results for each iteration in each thread, so in case verbose output is requested or things go wrong, it will flood the logs of CI and other systems with repeated messages that offer no valuable insight. Reduce this by doing assertions when the condition actually flips, and proceed to break out and exit the threads. We still assert to mark the test as failed and print the expected and reported values. Suggested-by: Alexei Starovoitov Signed-off-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/r/20250403220841.66654-1-memxor@gmail.com Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/prog_tests/res_spin_lock.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/bpf/prog_tests/res_spin_lock.c b/tools/testing/selftests/bpf/prog_tests/res_spin_lock.c index 115287ba441b..0703e987df89 100644 --- a/tools/testing/selftests/bpf/prog_tests/res_spin_lock.c +++ b/tools/testing/selftests/bpf/prog_tests/res_spin_lock.c @@ -25,8 +25,11 @@ static void *spin_lock_thread(void *arg) while (!READ_ONCE(skip)) { err = bpf_prog_test_run_opts(prog_fd, &topts); - ASSERT_OK(err, "test_run"); - ASSERT_OK(topts.retval, "test_run retval"); + if (err || topts.retval) { + ASSERT_OK(err, "test_run"); + ASSERT_OK(topts.retval, "test_run retval"); + break; + } } pthread_exit(arg); } -- 2.51.0 From d4bac0288a2b444e468e6df9cb4ed69479ddf14a Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Tue, 8 Apr 2025 09:27:48 -0400 Subject: [PATCH 13/16] bpf: support SKF_NET_OFF and SKF_LL_OFF on skb frags MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit Classic BPF socket filters with 
SKF_NET_OFF and SKF_LL_OFF fail to read when these offsets extend into frags. This has been observed with iwlwifi and reproduced with tun with IFF_NAPI_FRAGS. The below straightforward socket filter on UDP port, applied to a RAW socket, will silently miss matching packets. const int offset_proto = offsetof(struct ip6_hdr, ip6_nxt); const int offset_dport = sizeof(struct ip6_hdr) + offsetof(struct udphdr, dest); struct sock_filter filter_code[] = { BPF_STMT(BPF_LD + BPF_B + BPF_ABS, SKF_AD_OFF + SKF_AD_PKTTYPE), BPF_JUMP(BPF_JMP + BPF_JEQ + BPF_K, PACKET_HOST, 0, 4), BPF_STMT(BPF_LD + BPF_B + BPF_ABS, SKF_NET_OFF + offset_proto), BPF_JUMP(BPF_JMP + BPF_JEQ + BPF_K, IPPROTO_UDP, 0, 2), BPF_STMT(BPF_LD + BPF_H + BPF_ABS, SKF_NET_OFF + offset_dport), This is unexpected behavior. Socket filter programs should be consistent regardless of environment. Silent misses are particularly concerning as they are hard to detect. Use skb_copy_bits for offsets outside linear, same as done for non-SKF_(LL|NET) offsets. Offset is always positive after subtracting the reference threshold SKF_(LL|NET)_OFF, so is always >= skb_(mac|network)_offset. The sum of the two is an offset against skb->data, and may be negative, but it cannot point before skb->head, as skb_(mac|network)_offset would too. This appears to go back to when frag support was introduced to sk_run_filter in linux-2.4.4, before the introduction of git. The amount of code change and 8/16/32 bit duplication are unfortunate. But any attempt I made to be smarter saved very few LoC while complicating the code. 
Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Link: https://lore.kernel.org/netdev/20250122200402.3461154-1-maze@google.com/ Link: https://elixir.bootlin.com/linux/2.4.4/source/net/core/filter.c#L244 Reported-by: Matt Moeller Co-developed-by: Maciej Żenczykowski Signed-off-by: Maciej Żenczykowski Signed-off-by: Willem de Bruijn Acked-by: Stanislav Fomichev Link: https://lore.kernel.org/r/20250408132833.195491-2-willemdebruijn.kernel@gmail.com Signed-off-by: Alexei Starovoitov --- net/core/filter.c | 80 ++++++++++++++++++++++++++--------------------- 1 file changed, 44 insertions(+), 36 deletions(-) diff --git a/net/core/filter.c b/net/core/filter.c index bc6828761a47..79cab4d78dc3 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -218,24 +218,36 @@ BPF_CALL_3(bpf_skb_get_nlattr_nest, struct sk_buff *, skb, u32, a, u32, x) return 0; } +static int bpf_skb_load_helper_convert_offset(const struct sk_buff *skb, int offset) +{ + if (likely(offset >= 0)) + return offset; + + if (offset >= SKF_NET_OFF) + return offset - SKF_NET_OFF + skb_network_offset(skb); + + if (offset >= SKF_LL_OFF && skb_mac_header_was_set(skb)) + return offset - SKF_LL_OFF + skb_mac_offset(skb); + + return INT_MIN; +} + BPF_CALL_4(bpf_skb_load_helper_8, const struct sk_buff *, skb, const void *, data, int, headlen, int, offset) { - u8 tmp, *ptr; + u8 tmp; const int len = sizeof(tmp); - if (offset >= 0) { - if (headlen - offset >= len) - return *(u8 *)(data + offset); - if (!skb_copy_bits(skb, offset, &tmp, sizeof(tmp))) - return tmp; - } else { - ptr = bpf_internal_load_pointer_neg_helper(skb, offset, len); - if (likely(ptr)) - return *(u8 *)ptr; - } + offset = bpf_skb_load_helper_convert_offset(skb, offset); + if (offset == INT_MIN) + return -EFAULT; - return -EFAULT; + if (headlen - offset >= len) + return *(u8 *)(data + offset); + if (!skb_copy_bits(skb, offset, &tmp, sizeof(tmp))) + return tmp; + else + return -EFAULT; } BPF_CALL_2(bpf_skb_load_helper_8_no_cache, const struct sk_buff *, 
skb, @@ -248,21 +260,19 @@ BPF_CALL_2(bpf_skb_load_helper_8_no_cache, const struct sk_buff *, skb, BPF_CALL_4(bpf_skb_load_helper_16, const struct sk_buff *, skb, const void *, data, int, headlen, int, offset) { - __be16 tmp, *ptr; + __be16 tmp; const int len = sizeof(tmp); - if (offset >= 0) { - if (headlen - offset >= len) - return get_unaligned_be16(data + offset); - if (!skb_copy_bits(skb, offset, &tmp, sizeof(tmp))) - return be16_to_cpu(tmp); - } else { - ptr = bpf_internal_load_pointer_neg_helper(skb, offset, len); - if (likely(ptr)) - return get_unaligned_be16(ptr); - } + offset = bpf_skb_load_helper_convert_offset(skb, offset); + if (offset == INT_MIN) + return -EFAULT; - return -EFAULT; + if (headlen - offset >= len) + return get_unaligned_be16(data + offset); + if (!skb_copy_bits(skb, offset, &tmp, sizeof(tmp))) + return be16_to_cpu(tmp); + else + return -EFAULT; } BPF_CALL_2(bpf_skb_load_helper_16_no_cache, const struct sk_buff *, skb, @@ -275,21 +285,19 @@ BPF_CALL_2(bpf_skb_load_helper_16_no_cache, const struct sk_buff *, skb, BPF_CALL_4(bpf_skb_load_helper_32, const struct sk_buff *, skb, const void *, data, int, headlen, int, offset) { - __be32 tmp, *ptr; + __be32 tmp; const int len = sizeof(tmp); - if (likely(offset >= 0)) { - if (headlen - offset >= len) - return get_unaligned_be32(data + offset); - if (!skb_copy_bits(skb, offset, &tmp, sizeof(tmp))) - return be32_to_cpu(tmp); - } else { - ptr = bpf_internal_load_pointer_neg_helper(skb, offset, len); - if (likely(ptr)) - return get_unaligned_be32(ptr); - } + offset = bpf_skb_load_helper_convert_offset(skb, offset); + if (offset == INT_MIN) + return -EFAULT; - return -EFAULT; + if (headlen - offset >= len) + return get_unaligned_be32(data + offset); + if (!skb_copy_bits(skb, offset, &tmp, sizeof(tmp))) + return be32_to_cpu(tmp); + else + return -EFAULT; } BPF_CALL_2(bpf_skb_load_helper_32_no_cache, const struct sk_buff *, skb, -- 2.51.0 From fcd7132cb1f93e4d4594ecb19b8dcecdf0497d9e Mon Sep 17 
00:00:00 2001 From: Willem de Bruijn Date: Tue, 8 Apr 2025 09:27:49 -0400 Subject: [PATCH 14/16] selftests/net: test sk_filter support for SKF_NET_OFF on frags Verify that a classic BPF linux socket filter correctly matches packet contents. Including when accessing contents in an skb_frag. 1. Open a SOCK_RAW socket with a classic BPF filter on UDP dport 8000. 2. Open a tap device with IFF_NAPI_FRAGS to inject skbs with frags. 3. Send a packet for which the UDP header is in frag[0]. 4. Receive this packet to demonstrate that the socket accepted it. Acked-by: Stanislav Fomichev Signed-off-by: Willem de Bruijn Link: https://lore.kernel.org/r/20250408132833.195491-3-willemdebruijn.kernel@gmail.com Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/net/.gitignore | 1 + tools/testing/selftests/net/Makefile | 2 + tools/testing/selftests/net/skf_net_off.c | 244 +++++++++++++++++++++ tools/testing/selftests/net/skf_net_off.sh | 30 +++ 4 files changed, 277 insertions(+) create mode 100644 tools/testing/selftests/net/skf_net_off.c create mode 100755 tools/testing/selftests/net/skf_net_off.sh diff --git a/tools/testing/selftests/net/.gitignore b/tools/testing/selftests/net/.gitignore index 679542f565a4..532bb732bc6d 100644 --- a/tools/testing/selftests/net/.gitignore +++ b/tools/testing/selftests/net/.gitignore @@ -39,6 +39,7 @@ scm_rights sk_bind_sendto_listen sk_connect_zero_addr sk_so_peek_off +skf_net_off socket so_incoming_cpu so_netns_cookie diff --git a/tools/testing/selftests/net/Makefile b/tools/testing/selftests/net/Makefile index 6d718b478ed8..124078b56fa4 100644 --- a/tools/testing/selftests/net/Makefile +++ b/tools/testing/selftests/net/Makefile @@ -106,6 +106,8 @@ TEST_PROGS += ipv6_route_update_soft_lockup.sh TEST_PROGS += busy_poll_test.sh TEST_GEN_PROGS += proc_net_pktgen TEST_PROGS += lwt_dst_cache_ref_loop.sh +TEST_PROGS += skf_net_off.sh +TEST_GEN_FILES += skf_net_off # YNL files, must be before "include ..lib.mk" YNL_GEN_FILES := busy_poller 
netlink-dumps diff --git a/tools/testing/selftests/net/skf_net_off.c b/tools/testing/selftests/net/skf_net_off.c new file mode 100644 index 000000000000..1fdf61d6cd7f --- /dev/null +++ b/tools/testing/selftests/net/skf_net_off.c @@ -0,0 +1,244 @@ +// SPDX-License-Identifier: GPL-2.0 + +/* Open a tun device. + * + * [modifications: use IFF_NAPI_FRAGS, add sk filter] + * + * Expects the device to have been configured previously, e.g.: + * sudo ip tuntap add name tap1 mode tap + * sudo ip link set tap1 up + * sudo ip link set dev tap1 addr 02:00:00:00:00:01 + * sudo ip -6 addr add fdab::1 peer fdab::2 dev tap1 nodad + * + * And to avoid premature pskb_may_pull: + * + * sudo ethtool -K tap1 gro off + * sudo bash -c 'echo 0 > /proc/sys/net/ipv4/ip_early_demux' + */ + +#define _GNU_SOURCE + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static bool cfg_do_filter; +static bool cfg_do_frags; +static int cfg_dst_port = 8000; +static char *cfg_ifname; + +static int tun_open(const char *tun_name) +{ + struct ifreq ifr = {0}; + int fd, ret; + + fd = open("/dev/net/tun", O_RDWR); + if (fd == -1) + error(1, errno, "open /dev/net/tun"); + + ifr.ifr_flags = IFF_TAP; + if (cfg_do_frags) + ifr.ifr_flags |= IFF_NAPI | IFF_NAPI_FRAGS; + + strncpy(ifr.ifr_name, tun_name, IFNAMSIZ - 1); + + ret = ioctl(fd, TUNSETIFF, &ifr); + if (ret) + error(1, ret, "ioctl TUNSETIFF"); + + return fd; +} + +static void sk_set_filter(int fd) +{ + const int offset_proto = offsetof(struct ip6_hdr, ip6_nxt); + const int offset_dport = sizeof(struct ip6_hdr) + offsetof(struct udphdr, dest); + + /* Filter UDP packets with destination port cfg_dst_port */ + struct sock_filter filter_code[] = { + BPF_STMT(BPF_LD + BPF_B + BPF_ABS, SKF_AD_OFF + SKF_AD_PKTTYPE), + BPF_JUMP(BPF_JMP 
+ BPF_JEQ + BPF_K, PACKET_HOST, 0, 4), + BPF_STMT(BPF_LD + BPF_B + BPF_ABS, SKF_NET_OFF + offset_proto), + BPF_JUMP(BPF_JMP + BPF_JEQ + BPF_K, IPPROTO_UDP, 0, 2), + BPF_STMT(BPF_LD + BPF_H + BPF_ABS, SKF_NET_OFF + offset_dport), + BPF_JUMP(BPF_JMP + BPF_JEQ + BPF_K, cfg_dst_port, 1, 0), + BPF_STMT(BPF_RET + BPF_K, 0), + BPF_STMT(BPF_RET + BPF_K, 0xFFFF), + }; + + struct sock_fprog filter = { + sizeof(filter_code) / sizeof(filter_code[0]), + filter_code, + }; + + if (setsockopt(fd, SOL_SOCKET, SO_ATTACH_FILTER, &filter, sizeof(filter))) + error(1, errno, "setsockopt attach filter"); +} + +static int raw_open(void) +{ + int fd; + + fd = socket(PF_INET6, SOCK_RAW, IPPROTO_UDP); + if (fd == -1) + error(1, errno, "socket raw (udp)"); + + if (cfg_do_filter) + sk_set_filter(fd); + + return fd; +} + +static void tun_write(int fd) +{ + const char eth_src[] = { 0x02, 0x00, 0x00, 0x00, 0x00, 0x02 }; + const char eth_dst[] = { 0x02, 0x00, 0x00, 0x00, 0x00, 0x01 }; + struct tun_pi pi = {0}; + struct ipv6hdr ip6h = {0}; + struct udphdr uh = {0}; + struct ethhdr eth = {0}; + uint32_t payload; + struct iovec iov[5]; + int ret; + + pi.proto = htons(ETH_P_IPV6); + + memcpy(eth.h_source, eth_src, sizeof(eth_src)); + memcpy(eth.h_dest, eth_dst, sizeof(eth_dst)); + eth.h_proto = htons(ETH_P_IPV6); + + ip6h.version = 6; + ip6h.payload_len = htons(sizeof(uh) + sizeof(uint32_t)); + ip6h.nexthdr = IPPROTO_UDP; + ip6h.hop_limit = 8; + if (inet_pton(AF_INET6, "fdab::2", &ip6h.saddr) != 1) + error(1, errno, "inet_pton src"); + if (inet_pton(AF_INET6, "fdab::1", &ip6h.daddr) != 1) + error(1, errno, "inet_pton src"); + + uh.source = htons(8000); + uh.dest = htons(cfg_dst_port); + uh.len = ip6h.payload_len; + uh.check = 0; + + payload = htonl(0xABABABAB); /* Covered in IPv6 length */ + + iov[0].iov_base = &pi; + iov[0].iov_len = sizeof(pi); + iov[1].iov_base = &eth; + iov[1].iov_len = sizeof(eth); + iov[2].iov_base = &ip6h; + iov[2].iov_len = sizeof(ip6h); + iov[3].iov_base = &uh; + iov[3].iov_len = 
sizeof(uh); + iov[4].iov_base = &payload; + iov[4].iov_len = sizeof(payload); + + ret = writev(fd, iov, sizeof(iov) / sizeof(iov[0])); + if (ret <= 0) + error(1, errno, "writev"); +} + +static void raw_read(int fd) +{ + struct timeval tv = { .tv_usec = 100 * 1000 }; + struct msghdr msg = {0}; + struct iovec iov[2]; + struct udphdr uh; + uint32_t payload[2]; + int ret; + + if (setsockopt(fd, SOL_SOCKET, SO_RCVTIMEO, &tv, sizeof(tv))) + error(1, errno, "setsockopt rcvtimeo udp"); + + iov[0].iov_base = &uh; + iov[0].iov_len = sizeof(uh); + + iov[1].iov_base = payload; + iov[1].iov_len = sizeof(payload); + + msg.msg_iov = iov; + msg.msg_iovlen = sizeof(iov) / sizeof(iov[0]); + + ret = recvmsg(fd, &msg, 0); + if (ret <= 0) + error(1, errno, "read raw"); + if (ret != sizeof(uh) + sizeof(payload[0])) + error(1, errno, "read raw: len=%d\n", ret); + + fprintf(stderr, "raw recv: 0x%x\n", payload[0]); +} + +static void parse_opts(int argc, char **argv) +{ + int c; + + while ((c = getopt(argc, argv, "fFi:")) != -1) { + switch (c) { + case 'f': + cfg_do_filter = true; + printf("bpf filter enabled\n"); + break; + case 'F': + cfg_do_frags = true; + printf("napi frags mode enabled\n"); + break; + case 'i': + cfg_ifname = optarg; + break; + default: + error(1, 0, "unknown option %c", optopt); + break; + } + } + + if (!cfg_ifname) + error(1, 0, "must specify tap interface name (-i)"); +} + +int main(int argc, char **argv) +{ + int fdt, fdr; + + parse_opts(argc, argv); + + fdr = raw_open(); + fdt = tun_open(cfg_ifname); + + tun_write(fdt); + raw_read(fdr); + + if (close(fdt)) + error(1, errno, "close tun"); + if (close(fdr)) + error(1, errno, "close udp"); + + fprintf(stderr, "OK\n"); + return 0; +} + diff --git a/tools/testing/selftests/net/skf_net_off.sh b/tools/testing/selftests/net/skf_net_off.sh new file mode 100755 index 000000000000..5da5066fb465 --- /dev/null +++ b/tools/testing/selftests/net/skf_net_off.sh @@ -0,0 +1,30 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 + 
+readonly NS="ns-$(mktemp -u XXXXXX)" + +cleanup() { + ip netns del $NS +} + +ip netns add $NS +trap cleanup EXIT + +ip -netns $NS link set lo up +ip -netns $NS tuntap add name tap1 mode tap +ip -netns $NS link set tap1 up +ip -netns $NS link set dev tap1 addr 02:00:00:00:00:01 +ip -netns $NS -6 addr add fdab::1 peer fdab::2 dev tap1 nodad +ip netns exec $NS ethtool -K tap1 gro off + +# disable early demux, else udp_v6_early_demux pulls udp header into linear +ip netns exec $NS sysctl -w net.ipv4.ip_early_demux=0 + +echo "no filter" +ip netns exec $NS ./skf_net_off -i tap1 + +echo "filter, linear skb (-f)" +ip netns exec $NS ./skf_net_off -i tap1 -f + +echo "filter, fragmented skb (-f) (-F)" +ip netns exec $NS ./skf_net_off -i tap1 -f -F -- 2.51.0 From 1ddb9ad2ac6e527f220d5821ad54d37d3f9d122a Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Thu, 10 Apr 2025 10:00:23 -0700 Subject: [PATCH 15/16] selftests/bpf: Make res_spin_lock AA test condition stronger Let's make sure that we see a EDEADLK and ETIMEDOUT whenever checking for the AA tests (in case of simple AA and AA after exhausting 31 entries). 
Signed-off-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/r/20250410170023.2670683-1-memxor@gmail.com Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/progs/res_spin_lock.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/tools/testing/selftests/bpf/progs/res_spin_lock.c b/tools/testing/selftests/bpf/progs/res_spin_lock.c index b33385dfbd35..22c4fb8b9266 100644 --- a/tools/testing/selftests/bpf/progs/res_spin_lock.c +++ b/tools/testing/selftests/bpf/progs/res_spin_lock.c @@ -38,13 +38,14 @@ int res_spin_lock_test(struct __sk_buff *ctx) r = bpf_res_spin_lock(&elem1->lock); if (r) return r; - if (!bpf_res_spin_lock(&elem2->lock)) { + r = bpf_res_spin_lock(&elem2->lock); + if (!r) { bpf_res_spin_unlock(&elem2->lock); bpf_res_spin_unlock(&elem1->lock); return -1; } bpf_res_spin_unlock(&elem1->lock); - return 0; + return r != -EDEADLK; } SEC("tc") @@ -124,12 +125,15 @@ int res_spin_lock_test_held_lock_max(struct __sk_buff *ctx) /* Trigger AA, after exhausting entries in the held lock table. This * time, only the timeout can save us, as AA detection won't succeed. */ - if (!bpf_res_spin_lock(locks[34])) { + ret = bpf_res_spin_lock(locks[34]); + if (!ret) { bpf_res_spin_unlock(locks[34]); ret = 1; goto end; } + ret = ret != -ETIMEDOUT ? 2 : 0; + end: for (i = i - 1; i >= 0; i--) bpf_res_spin_unlock(locks[i]); -- 2.51.0 From 92b90f780d056a28f3c751c2dfbcd9540c7ae28a Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Thu, 10 Apr 2025 07:55:12 -0700 Subject: [PATCH 16/16] bpf: Use architecture provided res_smp_cond_load_acquire In v2 of rqspinlock [0], we fixed potential problems with WFE usage in arm64 to fallback to a version copied from Ankur's series [1]. This logic was moved into arch-specific headers in v3 [2]. 
However, we missed using the arch-provided res_smp_cond_load_acquire in commit ebababcd0372 ("rqspinlock: Hardcode cond_acquire loops for arm64") due to a rebasing mistake between v2 and v3 of the rqspinlock series. Fix the typo to fallback to the arm64 definition as we did in v2. [0]: https://lore.kernel.org/bpf/20250206105435.2159977-18-memxor@gmail.com [1]: https://lore.kernel.org/lkml/20250203214911.898276-1-ankur.a.arora@oracle.com [2]: https://lore.kernel.org/bpf/20250303152305.3195648-9-memxor@gmail.com Fixes: ebababcd0372 ("rqspinlock: Hardcode cond_acquire loops for arm64") Signed-off-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/r/20250410145512.1876745-1-memxor@gmail.com Signed-off-by: Alexei Starovoitov --- arch/arm64/include/asm/rqspinlock.h | 2 +- kernel/bpf/rqspinlock.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/arm64/include/asm/rqspinlock.h b/arch/arm64/include/asm/rqspinlock.h index 5b80785324b6..9ea0a74e5892 100644 --- a/arch/arm64/include/asm/rqspinlock.h +++ b/arch/arm64/include/asm/rqspinlock.h @@ -86,7 +86,7 @@ #endif -#define res_smp_cond_load_acquire_timewait(v, c) smp_cond_load_acquire_timewait(v, c, 0, 1) +#define res_smp_cond_load_acquire(v, c) smp_cond_load_acquire_timewait(v, c, 0, 1) #include diff --git a/kernel/bpf/rqspinlock.c b/kernel/bpf/rqspinlock.c index b896c4a75a5c..338305c8852c 100644 --- a/kernel/bpf/rqspinlock.c +++ b/kernel/bpf/rqspinlock.c @@ -253,7 +253,7 @@ static noinline int check_timeout(rqspinlock_t *lock, u32 mask, }) #else #define RES_CHECK_TIMEOUT(ts, ret, mask) \ - ({ (ret) = check_timeout(&(ts)); }) + ({ (ret) = check_timeout((lock), (mask), &(ts)); }) #endif /* -- 2.51.0