From 57afb483015768903029c8336ee287f4b03c1235 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Tue, 4 Feb 2025 13:56:15 -0800 Subject: [PATCH 01/16] net: page_pool: create hooks for custom memory providers A spin-off from the original page pool memory providers patch by Jakub, which allows extending page pools with custom allocators. One such provider is devmem TCP; the other is the io_uring zerocopy support added in the following patches. Link: https://lore.kernel.org/netdev/20230707183935.997267-7-kuba@kernel.org/ Co-developed-by: Jakub Kicinski # initial mp proposal Signed-off-by: Pavel Begunkov Signed-off-by: David Wei Link: https://patch.msgid.link/20250204215622.695511-5-dw@davidwei.uk Signed-off-by: Jakub Kicinski --- include/net/page_pool/memory_provider.h | 15 +++++++++++++++ include/net/page_pool/types.h | 4 ++++ net/core/devmem.c | 15 ++++++++++++++- net/core/page_pool.c | 23 +++++++++++++++-------- 4 files changed, 48 insertions(+), 9 deletions(-) create mode 100644 include/net/page_pool/memory_provider.h diff --git a/include/net/page_pool/memory_provider.h b/include/net/page_pool/memory_provider.h new file mode 100644 index 000000000000..e49d0a52629d --- /dev/null +++ b/include/net/page_pool/memory_provider.h @@ -0,0 +1,15 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _NET_PAGE_POOL_MEMORY_PROVIDER_H +#define _NET_PAGE_POOL_MEMORY_PROVIDER_H + +#include +#include + +struct memory_provider_ops { + netmem_ref (*alloc_netmems)(struct page_pool *pool, gfp_t gfp); + bool (*release_netmem)(struct page_pool *pool, netmem_ref netmem); + int (*init)(struct page_pool *pool); + void (*destroy)(struct page_pool *pool); +}; + +#endif diff --git a/include/net/page_pool/types.h b/include/net/page_pool/types.h index 7f405672b089..36eb57d73abc 100644 --- a/include/net/page_pool/types.h +++ b/include/net/page_pool/types.h @@ -152,8 +152,11 @@ struct page_pool_stats { */ #define PAGE_POOL_FRAG_GROUP_ALIGN (4 * sizeof(long)) +struct memory_provider_ops; + struct pp_memory_provider_params { void *mp_priv; + const struct memory_provider_ops *mp_ops; }; struct page_pool { @@ -216,6 +219,7 @@ struct page_pool { struct ptr_ring ring; void *mp_priv; + const struct memory_provider_ops *mp_ops; #ifdef CONFIG_PAGE_POOL_STATS /* recycle stats are per-cpu to avoid locking */ diff --git a/net/core/devmem.c b/net/core/devmem.c index fb0dddcb4e60..c81625ca57c6 100644 --- a/net/core/devmem.c +++ b/net/core/devmem.c @@ -16,6 +16,7 @@ #include #include #include +#include #include #include "devmem.h" @@ -27,6 +28,8 @@ /* Protected by rtnl_lock() */ static DEFINE_XARRAY_FLAGS(net_devmem_dmabuf_bindings, XA_FLAGS_ALLOC1); +static const struct memory_provider_ops dmabuf_devmem_ops; + static void net_devmem_dmabuf_free_chunk_owner(struct gen_pool *genpool, struct gen_pool_chunk *chunk, void *not_used) @@ -118,6 +121,7 @@ void net_devmem_unbind_dmabuf(struct net_devmem_dmabuf_binding *binding) WARN_ON(rxq->mp_params.mp_priv != binding); rxq->mp_params.mp_priv = NULL; + rxq->mp_params.mp_ops = NULL; rxq_idx = get_netdev_rx_queue_index(rxq); @@ -153,7 +157,7 @@ int net_devmem_bind_dmabuf_to_queue(struct net_device *dev, u32 rxq_idx, } rxq = __netif_get_rx_queue(dev, rxq_idx); - if (rxq->mp_params.mp_priv) { + if (rxq->mp_params.mp_ops) { NL_SET_ERR_MSG(extack, "designated queue already memory provider bound"); return -EEXIST; } @@ -171,6 +175,7 @@ int net_devmem_bind_dmabuf_to_queue(struct net_device *dev, u32 rxq_idx, return err; rxq->mp_params.mp_priv = binding; + rxq->mp_params.mp_ops = &dmabuf_devmem_ops; err = 
netdev_rx_queue_restart(dev, rxq_idx); if (err) @@ -180,6 +185,7 @@ int net_devmem_bind_dmabuf_to_queue(struct net_device *dev, u32 rxq_idx, err_xa_erase: rxq->mp_params.mp_priv = NULL; + rxq->mp_params.mp_ops = NULL; xa_erase(&binding->bound_rxqs, xa_idx); return err; @@ -399,3 +405,10 @@ bool mp_dmabuf_devmem_release_page(struct page_pool *pool, netmem_ref netmem) /* We don't want the page pool put_page()ing our net_iovs. */ return false; } + +static const struct memory_provider_ops dmabuf_devmem_ops = { + .init = mp_dmabuf_devmem_init, + .destroy = mp_dmabuf_devmem_destroy, + .alloc_netmems = mp_dmabuf_devmem_alloc_netmems, + .release_netmem = mp_dmabuf_devmem_release_page, +}; diff --git a/net/core/page_pool.c b/net/core/page_pool.c index f5e908c9e7ad..d632cf2c91c3 100644 --- a/net/core/page_pool.c +++ b/net/core/page_pool.c @@ -13,6 +13,7 @@ #include #include +#include #include #include @@ -285,13 +286,19 @@ static int page_pool_init(struct page_pool *pool, rxq = __netif_get_rx_queue(pool->slow.netdev, pool->slow.queue_idx); pool->mp_priv = rxq->mp_params.mp_priv; + pool->mp_ops = rxq->mp_params.mp_ops; } - if (pool->mp_priv) { + if (pool->mp_ops) { if (!pool->dma_map || !pool->dma_sync) return -EOPNOTSUPP; - err = mp_dmabuf_devmem_init(pool); + if (WARN_ON(!is_kernel_rodata((unsigned long)pool->mp_ops))) { + err = -EFAULT; + goto free_ptr_ring; + } + + err = pool->mp_ops->init(pool); if (err) { pr_warn("%s() mem-provider init failed %d\n", __func__, err); @@ -587,8 +594,8 @@ netmem_ref page_pool_alloc_netmems(struct page_pool *pool, gfp_t gfp) return netmem; /* Slow-path: cache empty, do real allocation */ - if (static_branch_unlikely(&page_pool_mem_providers) && pool->mp_priv) - netmem = mp_dmabuf_devmem_alloc_netmems(pool, gfp); + if (static_branch_unlikely(&page_pool_mem_providers) && pool->mp_ops) + netmem = pool->mp_ops->alloc_netmems(pool, gfp); else netmem = __page_pool_alloc_pages_slow(pool, gfp); return netmem; @@ -679,8 +686,8 @@ void page_pool_return_page(struct page_pool *pool, netmem_ref netmem) bool put; put = true; - if (static_branch_unlikely(&page_pool_mem_providers) && pool->mp_priv) - put = mp_dmabuf_devmem_release_page(pool, netmem); + if (static_branch_unlikely(&page_pool_mem_providers) && pool->mp_ops) + put = pool->mp_ops->release_netmem(pool, netmem); else __page_pool_release_page_dma(pool, netmem); @@ -1048,8 +1055,8 @@ static void __page_pool_destroy(struct page_pool *pool) page_pool_unlist(pool); page_pool_uninit(pool); - if (pool->mp_priv) { - mp_dmabuf_devmem_destroy(pool); + if (pool->mp_ops) { + pool->mp_ops->destroy(pool); static_branch_dec(&page_pool_mem_providers); } -- 2.51.0 From dcc0113acd3b77cca3c7e805fffd8ea4c5a675b3 Mon Sep 17 00:00:00 2001 From: David Wei Date: Tue, 4 Feb 2025 13:56:16 -0800 Subject: [PATCH 02/16] netdev: add io_uring memory provider info Add a nested attribute for io_uring memory provider info. For now it is empty and its presence indicates that a particular page pool or queue has an io_uring memory provider attached. $ ./cli.py --spec netlink/specs/netdev.yaml --dump page-pool-get [{'id': 80, 'ifindex': 2, 'inflight': 64, 'inflight-mem': 262144, 'napi-id': 525}, {'id': 79, 'ifindex': 2, 'inflight': 320, 'inflight-mem': 1310720, 'io_uring': {}, 'napi-id': 525}, ... $ ./cli.py --spec netlink/specs/netdev.yaml --dump queue-get [{'id': 0, 'ifindex': 1, 'type': 'rx'}, {'id': 0, 'ifindex': 1, 'type': 'tx'}, {'id': 0, 'ifindex': 2, 'napi-id': 513, 'type': 'rx'}, {'id': 1, 'ifindex': 2, 'napi-id': 514, 'type': 'rx'}, ... 
{'id': 12, 'ifindex': 2, 'io_uring': {}, 'napi-id': 525, 'type': 'rx'}, ... Reviewed-by: Jakub Kicinski Signed-off-by: Pavel Begunkov Signed-off-by: David Wei Link: https://patch.msgid.link/20250204215622.695511-6-dw@davidwei.uk Signed-off-by: Jakub Kicinski --- Documentation/netlink/specs/netdev.yaml | 15 +++++++++++++++ include/uapi/linux/netdev.h | 7 +++++++ tools/include/uapi/linux/netdev.h | 7 +++++++ 3 files changed, 29 insertions(+) diff --git a/Documentation/netlink/specs/netdev.yaml b/Documentation/netlink/specs/netdev.yaml index cbb544bd6c84..288923e965ae 100644 --- a/Documentation/netlink/specs/netdev.yaml +++ b/Documentation/netlink/specs/netdev.yaml @@ -114,6 +114,9 @@ attribute-sets: doc: Bitmask of enabled AF_XDP features. type: u64 enum: xsk-flags + - + name: io-uring-provider-info + attributes: [] - name: page-pool attributes: @@ -171,6 +174,11 @@ attribute-sets: name: dmabuf doc: ID of the dmabuf this page-pool is attached to. type: u32 + - + name: io-uring + doc: io-uring memory provider information. + type: nest + nested-attributes: io-uring-provider-info - name: page-pool-info subset-of: page-pool @@ -296,6 +304,11 @@ attribute-sets: name: dmabuf doc: ID of the dmabuf attached to this queue, if any. type: u32 + - + name: io-uring + doc: io_uring memory provider information. + type: nest + nested-attributes: io-uring-provider-info - name: qstats @@ -572,6 +585,7 @@ operations: - inflight-mem - detach-time - dmabuf + - io-uring dump: reply: *pp-reply config-cond: page-pool @@ -637,6 +651,7 @@ operations: - napi-id - ifindex - dmabuf + - io-uring dump: request: attributes: diff --git a/include/uapi/linux/netdev.h b/include/uapi/linux/netdev.h index e4be227d3ad6..6c6ee183802d 100644 --- a/include/uapi/linux/netdev.h +++ b/include/uapi/linux/netdev.h @@ -86,6 +86,11 @@ enum { NETDEV_A_DEV_MAX = (__NETDEV_A_DEV_MAX - 1) }; +enum { + __NETDEV_A_IO_URING_PROVIDER_INFO_MAX, + NETDEV_A_IO_URING_PROVIDER_INFO_MAX = (__NETDEV_A_IO_URING_PROVIDER_INFO_MAX - 1) +}; + enum { NETDEV_A_PAGE_POOL_ID = 1, NETDEV_A_PAGE_POOL_IFINDEX, @@ -94,6 +99,7 @@ enum { NETDEV_A_PAGE_POOL_INFLIGHT_MEM, NETDEV_A_PAGE_POOL_DETACH_TIME, NETDEV_A_PAGE_POOL_DMABUF, + NETDEV_A_PAGE_POOL_IO_URING, __NETDEV_A_PAGE_POOL_MAX, NETDEV_A_PAGE_POOL_MAX = (__NETDEV_A_PAGE_POOL_MAX - 1) @@ -136,6 +142,7 @@ enum { NETDEV_A_QUEUE_TYPE, NETDEV_A_QUEUE_NAPI_ID, NETDEV_A_QUEUE_DMABUF, + NETDEV_A_QUEUE_IO_URING, __NETDEV_A_QUEUE_MAX, NETDEV_A_QUEUE_MAX = (__NETDEV_A_QUEUE_MAX - 1) diff --git a/tools/include/uapi/linux/netdev.h b/tools/include/uapi/linux/netdev.h index e4be227d3ad6..6c6ee183802d 100644 --- a/tools/include/uapi/linux/netdev.h +++ b/tools/include/uapi/linux/netdev.h @@ -86,6 +86,11 @@ enum { NETDEV_A_DEV_MAX = (__NETDEV_A_DEV_MAX - 1) }; +enum { + __NETDEV_A_IO_URING_PROVIDER_INFO_MAX, + NETDEV_A_IO_URING_PROVIDER_INFO_MAX = (__NETDEV_A_IO_URING_PROVIDER_INFO_MAX - 1) +}; + enum { NETDEV_A_PAGE_POOL_ID = 1, NETDEV_A_PAGE_POOL_IFINDEX, @@ -94,6 +99,7 @@ enum { NETDEV_A_PAGE_POOL_INFLIGHT_MEM, NETDEV_A_PAGE_POOL_DETACH_TIME, NETDEV_A_PAGE_POOL_DMABUF, + NETDEV_A_PAGE_POOL_IO_URING, __NETDEV_A_PAGE_POOL_MAX, NETDEV_A_PAGE_POOL_MAX = (__NETDEV_A_PAGE_POOL_MAX - 1) @@ -136,6 +142,7 @@ enum { NETDEV_A_QUEUE_TYPE, NETDEV_A_QUEUE_NAPI_ID, NETDEV_A_QUEUE_DMABUF, + NETDEV_A_QUEUE_IO_URING, __NETDEV_A_QUEUE_MAX, NETDEV_A_QUEUE_MAX = (__NETDEV_A_QUEUE_MAX - 1) -- 2.51.0 From 2508a46f920a3130e35ab2183a70fc93f0aaaee4 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Tue, 4 Feb 2025 13:56:17 -0800 Subject: [PATCH 03/16] 
net: page_pool: add callback for mp info printing Add a mandatory callback that prints information about the memory provider to netlink. Reviewed-by: Jakub Kicinski Signed-off-by: Pavel Begunkov Signed-off-by: David Wei Link: https://patch.msgid.link/20250204215622.695511-7-dw@davidwei.uk Signed-off-by: Jakub Kicinski --- include/net/page_pool/memory_provider.h | 5 +++++ net/core/devmem.c | 10 ++++++++++ net/core/netdev-genl.c | 11 ++++++----- net/core/page_pool_user.c | 5 ++--- 4 files changed, 23 insertions(+), 8 deletions(-) diff --git a/include/net/page_pool/memory_provider.h b/include/net/page_pool/memory_provider.h index e49d0a52629d..6d10a0959d00 100644 --- a/include/net/page_pool/memory_provider.h +++ b/include/net/page_pool/memory_provider.h @@ -5,11 +5,16 @@ #include #include +struct netdev_rx_queue; +struct sk_buff; + struct memory_provider_ops { netmem_ref (*alloc_netmems)(struct page_pool *pool, gfp_t gfp); bool (*release_netmem)(struct page_pool *pool, netmem_ref netmem); int (*init)(struct page_pool *pool); void (*destroy)(struct page_pool *pool); + int (*nl_fill)(void *mp_priv, struct sk_buff *rsp, + struct netdev_rx_queue *rxq); }; #endif diff --git a/net/core/devmem.c b/net/core/devmem.c index c81625ca57c6..63b790326c7d 100644 --- a/net/core/devmem.c +++ b/net/core/devmem.c @@ -406,9 +406,19 @@ bool mp_dmabuf_devmem_release_page(struct page_pool *pool, netmem_ref netmem) return false; } +static int mp_dmabuf_devmem_nl_fill(void *mp_priv, struct sk_buff *rsp, + struct netdev_rx_queue *rxq) +{ + const struct net_devmem_dmabuf_binding *binding = mp_priv; + int type = rxq ? NETDEV_A_QUEUE_DMABUF : NETDEV_A_PAGE_POOL_DMABUF; + + return nla_put_u32(rsp, type, binding->id); +} + static const struct memory_provider_ops dmabuf_devmem_ops = { .init = mp_dmabuf_devmem_init, .destroy = mp_dmabuf_devmem_destroy, .alloc_netmems = mp_dmabuf_devmem_alloc_netmems, .release_netmem = mp_dmabuf_devmem_release_page, + .nl_fill = mp_dmabuf_devmem_nl_fill, }; diff --git a/net/core/netdev-genl.c b/net/core/netdev-genl.c index 715f85c6b62e..5b459b4fef46 100644 --- a/net/core/netdev-genl.c +++ b/net/core/netdev-genl.c @@ -10,6 +10,7 @@ #include #include #include +#include #include "dev.h" #include "devmem.h" @@ -368,7 +369,7 @@ static int netdev_nl_queue_fill_one(struct sk_buff *rsp, struct net_device *netdev, u32 q_idx, u32 q_type, const struct genl_info *info) { - struct net_devmem_dmabuf_binding *binding; + struct pp_memory_provider_params *params; struct netdev_rx_queue *rxq; struct netdev_queue *txq; void *hdr; @@ -385,15 +386,15 @@ netdev_nl_queue_fill_one(struct sk_buff *rsp, struct net_device *netdev, switch (q_type) { case NETDEV_QUEUE_TYPE_RX: rxq = __netif_get_rx_queue(netdev, q_idx); + if (rxq->napi && nla_put_u32(rsp, NETDEV_A_QUEUE_NAPI_ID, rxq->napi->napi_id)) goto nla_put_failure; - binding = rxq->mp_params.mp_priv; - if (binding && - nla_put_u32(rsp, NETDEV_A_QUEUE_DMABUF, binding->id)) + params = &rxq->mp_params; + if (params->mp_ops && + params->mp_ops->nl_fill(params->mp_priv, rsp, rxq)) goto nla_put_failure; - break; case NETDEV_QUEUE_TYPE_TX: txq = netdev_get_tx_queue(netdev, q_idx); diff --git a/net/core/page_pool_user.c b/net/core/page_pool_user.c index d5e214c30c31..9d8a3d8597fa 100644 --- a/net/core/page_pool_user.c +++ b/net/core/page_pool_user.c @@ -8,9 +8,9 @@ #include #include #include +#include #include -#include "devmem.h" #include "page_pool_priv.h" #include "netdev-genl-gen.h" @@ -216,7 +216,6 @@ static int page_pool_nl_fill(struct sk_buff *rsp, const struct 
page_pool *pool, const struct genl_info *info) { - struct net_devmem_dmabuf_binding *binding = pool->mp_priv; size_t inflight, refsz; unsigned int napi_id; void *hdr; @@ -249,7 +248,7 @@ page_pool_nl_fill(struct sk_buff *rsp, const struct page_pool *pool, pool->user.detach_time)) goto err_cancel; - if (binding && nla_put_u32(rsp, NETDEV_A_PAGE_POOL_DMABUF, binding->id)) + if (pool->mp_ops && pool->mp_ops->nl_fill(pool->mp_priv, rsp, NULL)) goto err_cancel; genlmsg_end(rsp, hdr); -- 2.51.0 From f8350a4358fc9c4801f2f4229cf9b6e678055d9a Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Tue, 4 Feb 2025 13:56:18 -0800 Subject: [PATCH 04/16] net: page_pool: add a mp hook to unregister_netdevice* Devmem TCP needs a hook in unregister_netdevice_many_notify() to keep the set tracking the queues it's bound to, i.e. ->bound_rxqs, up to date. Instead of devmem sticking directly out of the generic path, add a mp function. Reviewed-by: Jakub Kicinski Reviewed-by: Mina Almasry Signed-off-by: Pavel Begunkov Signed-off-by: David Wei Link: https://patch.msgid.link/20250204215622.695511-8-dw@davidwei.uk Signed-off-by: Jakub Kicinski --- include/net/page_pool/memory_provider.h | 1 + net/core/dev.c | 16 ++++++++++- net/core/devmem.c | 36 +++++++++++-------------- net/core/devmem.h | 5 ---- 4 files changed, 32 insertions(+), 26 deletions(-) diff --git a/include/net/page_pool/memory_provider.h b/include/net/page_pool/memory_provider.h index 6d10a0959d00..36469a7e649f 100644 --- a/include/net/page_pool/memory_provider.h +++ b/include/net/page_pool/memory_provider.h @@ -15,6 +15,7 @@ struct memory_provider_ops { void (*destroy)(struct page_pool *pool); int (*nl_fill)(void *mp_priv, struct sk_buff *rsp, struct netdev_rx_queue *rxq); + void (*uninstall)(void *mp_priv, struct netdev_rx_queue *rxq); }; #endif diff --git a/net/core/dev.c b/net/core/dev.c index c0021cbd28fc..384b9109932a 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -159,6 +159,7 @@ #include #include #include +#include #include #include @@ -11724,6 +11725,19 @@ void unregister_netdevice_queue(struct net_device *dev, struct list_head *head) } EXPORT_SYMBOL(unregister_netdevice_queue); +static void dev_memory_provider_uninstall(struct net_device *dev) +{ + unsigned int i; + + for (i = 0; i < dev->real_num_rx_queues; i++) { + struct netdev_rx_queue *rxq = &dev->_rx[i]; + struct pp_memory_provider_params *p = &rxq->mp_params; + + if (p->mp_ops && p->mp_ops->uninstall) + p->mp_ops->uninstall(rxq->mp_params.mp_priv, rxq); + } +} + void unregister_netdevice_many_notify(struct list_head *head, u32 portid, const struct nlmsghdr *nlh) { @@ -11778,7 +11792,7 @@ void unregister_netdevice_many_notify(struct list_head *head, dev_tcx_uninstall(dev); dev_xdp_uninstall(dev); bpf_dev_bound_netdev_unregister(dev); - dev_dmabuf_uninstall(dev); + dev_memory_provider_uninstall(dev); netdev_offload_xstats_disable_all(dev); diff --git a/net/core/devmem.c b/net/core/devmem.c index 63b790326c7d..cbac6419fcc4 100644 --- a/net/core/devmem.c +++ b/net/core/devmem.c @@ -320,26 +320,6 @@ err_put_dmabuf: return ERR_PTR(err); } -void dev_dmabuf_uninstall(struct net_device *dev) -{ - struct net_devmem_dmabuf_binding *binding; - struct netdev_rx_queue *rxq; - unsigned long xa_idx; - unsigned int i; - - for (i = 0; i < dev->real_num_rx_queues; i++) { - binding = dev->_rx[i].mp_params.mp_priv; - if (!binding) - continue; - - xa_for_each(&binding->bound_rxqs, xa_idx, rxq) - if (rxq == &dev->_rx[i]) { - xa_erase(&binding->bound_rxqs, xa_idx); - break; - } - } -} - /*** "Dmabuf devmem memory 
provider" ***/ int mp_dmabuf_devmem_init(struct page_pool *pool) @@ -415,10 +395,26 @@ static int mp_dmabuf_devmem_nl_fill(void *mp_priv, struct sk_buff *rsp, return nla_put_u32(rsp, type, binding->id); } +static void mp_dmabuf_devmem_uninstall(void *mp_priv, + struct netdev_rx_queue *rxq) +{ + struct net_devmem_dmabuf_binding *binding = mp_priv; + struct netdev_rx_queue *bound_rxq; + unsigned long xa_idx; + + xa_for_each(&binding->bound_rxqs, xa_idx, bound_rxq) { + if (bound_rxq == rxq) { + xa_erase(&binding->bound_rxqs, xa_idx); + break; + } + } +} + static const struct memory_provider_ops dmabuf_devmem_ops = { .init = mp_dmabuf_devmem_init, .destroy = mp_dmabuf_devmem_destroy, .alloc_netmems = mp_dmabuf_devmem_alloc_netmems, .release_netmem = mp_dmabuf_devmem_release_page, .nl_fill = mp_dmabuf_devmem_nl_fill, + .uninstall = mp_dmabuf_devmem_uninstall, }; diff --git a/net/core/devmem.h b/net/core/devmem.h index a2b9913e9a17..8e999fe2ae67 100644 --- a/net/core/devmem.h +++ b/net/core/devmem.h @@ -68,7 +68,6 @@ void net_devmem_unbind_dmabuf(struct net_devmem_dmabuf_binding *binding); int net_devmem_bind_dmabuf_to_queue(struct net_device *dev, u32 rxq_idx, struct net_devmem_dmabuf_binding *binding, struct netlink_ext_ack *extack); -void dev_dmabuf_uninstall(struct net_device *dev); static inline struct dmabuf_genpool_chunk_owner * net_devmem_iov_to_chunk_owner(const struct net_iov *niov) @@ -145,10 +144,6 @@ net_devmem_bind_dmabuf_to_queue(struct net_device *dev, u32 rxq_idx, return -EOPNOTSUPP; } -static inline void dev_dmabuf_uninstall(struct net_device *dev) -{ -} - static inline struct net_iov * net_devmem_alloc_dmabuf(struct net_devmem_dmabuf_binding *binding) { -- 2.51.0 From 69e39537b66232103633f685b208f293bf9a15b5 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Tue, 4 Feb 2025 13:56:19 -0800 Subject: [PATCH 05/16] net: prepare for non devmem TCP memory providers There are a number of places in generic paths that assume the only page pool memory provider is devmem TCP. As we want to reuse the net_iov and provider infrastructure, patch them up to explicitly check the provider type before branching into devmem TCP code.
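For illustration, a generic path that needs devmem-specific state can gate on the provider type roughly as follows (a sketch only, mirroring the tcp_recvmsg_dmabuf() hunk below; net_devmem_iov_binding_id() is one of the devmem-only helpers declared in devmem.h):

	/* Reject frags that are not backed by the devmem TCP provider
	 * before touching devmem-specific state; a net_iov coming from
	 * another provider (e.g. io_uring zerocopy) must not be treated
	 * as a dmabuf binding.
	 */
	niov = skb_frag_net_iov(frag);
	if (!net_is_devmem_iov(niov)) {
		err = -ENODEV;
		goto out;
	}
	/* only past this point may devmem helpers such as
	 * net_devmem_iov_binding_id(niov) be used safely
	 */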
Reviewed-by: Mina Almasry Reviewed-by: Jakub Kicinski Signed-off-by: Pavel Begunkov Signed-off-by: David Wei Link: https://patch.msgid.link/20250204215622.695511-9-dw@davidwei.uk Signed-off-by: Jakub Kicinski --- net/core/devmem.c | 5 +++++ net/core/devmem.h | 7 +++++++ net/ipv4/tcp.c | 5 +++++ 3 files changed, 17 insertions(+) diff --git a/net/core/devmem.c b/net/core/devmem.c index cbac6419fcc4..7c6e0b5b6acb 100644 --- a/net/core/devmem.c +++ b/net/core/devmem.c @@ -30,6 +30,11 @@ static DEFINE_XARRAY_FLAGS(net_devmem_dmabuf_bindings, XA_FLAGS_ALLOC1); static const struct memory_provider_ops dmabuf_devmem_ops; +bool net_is_devmem_iov(struct net_iov *niov) +{ + return niov->pp->mp_ops == &dmabuf_devmem_ops; +} + static void net_devmem_dmabuf_free_chunk_owner(struct gen_pool *genpool, struct gen_pool_chunk *chunk, void *not_used) diff --git a/net/core/devmem.h b/net/core/devmem.h index 8e999fe2ae67..7fc158d52729 100644 --- a/net/core/devmem.h +++ b/net/core/devmem.h @@ -115,6 +115,8 @@ struct net_iov * net_devmem_alloc_dmabuf(struct net_devmem_dmabuf_binding *binding); void net_devmem_free_dmabuf(struct net_iov *ppiov); +bool net_is_devmem_iov(struct net_iov *niov); + #else struct net_devmem_dmabuf_binding; @@ -163,6 +165,11 @@ static inline u32 net_devmem_iov_binding_id(const struct net_iov *niov) { return 0; } + +static inline bool net_is_devmem_iov(struct net_iov *niov) +{ + return false; +} #endif #endif /* _NET_DEVMEM_H */ diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index b872de9a8271..7f43d31c9400 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -2476,6 +2476,11 @@ static int tcp_recvmsg_dmabuf(struct sock *sk, const struct sk_buff *skb, } niov = skb_frag_net_iov(frag); + if (!net_is_devmem_iov(niov)) { + err = -ENODEV; + goto out; + } + end = start + skb_frag_size(frag); copy = end - offset; -- 2.51.0 From 56102c013fa7b8dbba8c5d5f7e042ad5f18cf4ec Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Tue, 4 Feb 2025 13:56:20 -0800 Subject: [PATCH 06/16] net: page_pool: add memory provider helpers Add helpers for memory providers to interact with page pools. net_mp_niov_{set,clear}_page_pool() serve to [dis]associate a net_iov with a page pool. If used, the memory provider is responsible for matching "set" calls with "clear" once a net_iov is no longer going to be used by a page pool, when changing page pools, etc. Acked-by: Jakub Kicinski Signed-off-by: Pavel Begunkov Signed-off-by: David Wei Link: https://patch.msgid.link/20250204215622.695511-10-dw@davidwei.uk Signed-off-by: Jakub Kicinski --- include/net/page_pool/memory_provider.h | 19 +++++++++++++++ net/core/page_pool.c | 28 +++++++++++++++++++++++++ 2 files changed, 47 insertions(+) diff --git a/include/net/page_pool/memory_provider.h b/include/net/page_pool/memory_provider.h index 36469a7e649f..4f0ffb8f6a0a 100644 --- a/include/net/page_pool/memory_provider.h +++ b/include/net/page_pool/memory_provider.h @@ -18,4 +18,23 @@ struct memory_provider_ops { void (*uninstall)(void *mp_priv, struct netdev_rx_queue *rxq); }; +bool net_mp_niov_set_dma_addr(struct net_iov *niov, dma_addr_t addr); +void net_mp_niov_set_page_pool(struct page_pool *pool, struct net_iov *niov); +void net_mp_niov_clear_page_pool(struct net_iov *niov); + +/** + * net_mp_netmem_place_in_cache() - give a netmem to a page pool + * @pool: the page pool to place the netmem into + * @netmem: netmem to give + * + * Push an accounted netmem into the page pool's allocation cache. The caller + * must ensure that there is space in the cache. 
It should only be called off + * the mp_ops->alloc_netmems() path. + */ +static inline void net_mp_netmem_place_in_cache(struct page_pool *pool, + netmem_ref netmem) +{ + pool->alloc.cache[pool->alloc.count++] = netmem; +} + #endif diff --git a/net/core/page_pool.c b/net/core/page_pool.c index d632cf2c91c3..686bd4a117d9 100644 --- a/net/core/page_pool.c +++ b/net/core/page_pool.c @@ -1197,3 +1197,31 @@ void page_pool_update_nid(struct page_pool *pool, int new_nid) } } EXPORT_SYMBOL(page_pool_update_nid); + +bool net_mp_niov_set_dma_addr(struct net_iov *niov, dma_addr_t addr) +{ + return page_pool_set_dma_addr_netmem(net_iov_to_netmem(niov), addr); +} + +/* Associate a niov with a page pool. Should follow with a matching + * net_mp_niov_clear_page_pool() + */ +void net_mp_niov_set_page_pool(struct page_pool *pool, struct net_iov *niov) +{ + netmem_ref netmem = net_iov_to_netmem(niov); + + page_pool_set_pp_info(pool, netmem); + + pool->pages_state_hold_cnt++; + trace_page_pool_state_hold(pool, netmem, pool->pages_state_hold_cnt); +} + +/* Disassociate a niov from a page pool. Should only be used in the + * ->release_netmem() path. + */ +void net_mp_niov_clear_page_pool(struct net_iov *niov) +{ + netmem_ref netmem = net_iov_to_netmem(niov); + + page_pool_clear_pp_info(netmem); +} -- 2.51.0 From 6e18ed929d3ba9b3b92ba5894f9233686b3e3ec1 Mon Sep 17 00:00:00 2001 From: David Wei Date: Tue, 4 Feb 2025 13:56:21 -0800 Subject: [PATCH 07/16] net: add helpers for setting a memory provider on an rx queue Add helpers that properly prep or remove a memory provider for an rx queue then restart the queue. Reviewed-by: Jakub Kicinski Signed-off-by: Pavel Begunkov Signed-off-by: David Wei Link: https://patch.msgid.link/20250204215622.695511-11-dw@davidwei.uk Signed-off-by: Jakub Kicinski --- include/net/page_pool/memory_provider.h | 5 ++ net/core/netdev_rx_queue.c | 69 +++++++++++++++++++++++++ 2 files changed, 74 insertions(+) diff --git a/include/net/page_pool/memory_provider.h b/include/net/page_pool/memory_provider.h index 4f0ffb8f6a0a..b3e665897767 100644 --- a/include/net/page_pool/memory_provider.h +++ b/include/net/page_pool/memory_provider.h @@ -22,6 +22,11 @@ bool net_mp_niov_set_dma_addr(struct net_iov *niov, dma_addr_t addr); void net_mp_niov_set_page_pool(struct page_pool *pool, struct net_iov *niov); void net_mp_niov_clear_page_pool(struct net_iov *niov); +int net_mp_open_rxq(struct net_device *dev, unsigned ifq_idx, + struct pp_memory_provider_params *p); +void net_mp_close_rxq(struct net_device *dev, unsigned ifq_idx, + struct pp_memory_provider_params *old_p); + /** * net_mp_netmem_place_in_cache() - give a netmem to a page pool * @pool: the page pool to place the netmem into diff --git a/net/core/netdev_rx_queue.c b/net/core/netdev_rx_queue.c index db82786fa0c4..db46880f37cc 100644 --- a/net/core/netdev_rx_queue.c +++ b/net/core/netdev_rx_queue.c @@ -3,6 +3,7 @@ #include #include #include +#include #include "page_pool_priv.h" @@ -80,3 +81,71 @@ err_free_new_mem: return err; } EXPORT_SYMBOL_NS_GPL(netdev_rx_queue_restart, "NETDEV_INTERNAL"); + +static int __net_mp_open_rxq(struct net_device *dev, unsigned ifq_idx, + struct pp_memory_provider_params *p) +{ + struct netdev_rx_queue *rxq; + int ret; + + if (ifq_idx >= dev->real_num_rx_queues) + return -EINVAL; + ifq_idx = array_index_nospec(ifq_idx, dev->real_num_rx_queues); + + rxq = __netif_get_rx_queue(dev, ifq_idx); + if (rxq->mp_params.mp_ops) + return -EEXIST; + + rxq->mp_params = *p; + ret = netdev_rx_queue_restart(dev, ifq_idx); + if 
(ret) { + rxq->mp_params.mp_ops = NULL; + rxq->mp_params.mp_priv = NULL; + } + return ret; +} + +int net_mp_open_rxq(struct net_device *dev, unsigned ifq_idx, + struct pp_memory_provider_params *p) +{ + int ret; + + rtnl_lock(); + ret = __net_mp_open_rxq(dev, ifq_idx, p); + rtnl_unlock(); + return ret; +} + +static void __net_mp_close_rxq(struct net_device *dev, unsigned ifq_idx, + struct pp_memory_provider_params *old_p) +{ + struct netdev_rx_queue *rxq; + + if (WARN_ON_ONCE(ifq_idx >= dev->real_num_rx_queues)) + return; + + rxq = __netif_get_rx_queue(dev, ifq_idx); + + /* Callers holding a netdev ref may get here after we already + * went thru shutdown via dev_memory_provider_uninstall(). + */ + if (dev->reg_state > NETREG_REGISTERED && + !rxq->mp_params.mp_ops) + return; + + if (WARN_ON_ONCE(rxq->mp_params.mp_ops != old_p->mp_ops || + rxq->mp_params.mp_priv != old_p->mp_priv)) + return; + + rxq->mp_params.mp_ops = NULL; + rxq->mp_params.mp_priv = NULL; + WARN_ON(netdev_rx_queue_restart(dev, ifq_idx)); +} + +void net_mp_close_rxq(struct net_device *dev, unsigned ifq_idx, + struct pp_memory_provider_params *old_p) +{ + rtnl_lock(); + __net_mp_close_rxq(dev, ifq_idx, old_p); + rtnl_unlock(); +} -- 2.51.0 From 6597e8d35851e1e0cb381f76ce1d960518fd8c94 Mon Sep 17 00:00:00 2001 From: Joe Damato Date: Wed, 5 Feb 2025 19:37:47 +0000 Subject: [PATCH 08/16] netdev-genl: Elide napi_id when not present There are at least two cases where a napi_id may not be present and should be elided: 1. Queues could be created, but napi_enable may not have been called yet. In this case, there may be a NAPI but it may not have an ID yet, and the napi_id output should be elided. 2. TX-only NAPIs currently do not have NAPI IDs. If a TX queue happens to be linked with a TX-only NAPI, elide the NAPI ID from the netlink output, as a NAPI ID of 0 is not useful for users. 
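The check boils down to a small helper shared by the RX and TX branches of netdev_nl_queue_fill_one(), as added below; IDs below MIN_NAPI_ID are never handed out to NAPI instances, so anything smaller reads as "no ID assigned yet":

	static int nla_put_napi_id(struct sk_buff *skb, const struct napi_struct *napi)
	{
		/* No NAPI instance, or an id below MIN_NAPI_ID: elide the
		 * attribute entirely rather than emitting a meaningless 0.
		 */
		if (napi && napi_id_valid(napi->napi_id))
			return nla_put_u32(skb, NETDEV_A_QUEUE_NAPI_ID,
					   napi->napi_id);
		return 0;
	}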
Signed-off-by: Joe Damato Reviewed-by: Sridhar Samudrala Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20250205193751.297211-1-jdamato@fastly.com Signed-off-by: Jakub Kicinski --- include/net/busy_poll.h | 5 +++++ net/core/netdev-genl.c | 14 +++++++++----- 2 files changed, 14 insertions(+), 5 deletions(-) diff --git a/include/net/busy_poll.h b/include/net/busy_poll.h index c39a426ebf52..741fa7754700 100644 --- a/include/net/busy_poll.h +++ b/include/net/busy_poll.h @@ -24,6 +24,11 @@ */ #define MIN_NAPI_ID ((unsigned int)(NR_CPUS + 1)) +static inline bool napi_id_valid(unsigned int napi_id) +{ + return napi_id >= MIN_NAPI_ID; +} + #define BUSY_POLL_BUDGET 8 #ifdef CONFIG_NET_RX_BUSY_POLL diff --git a/net/core/netdev-genl.c b/net/core/netdev-genl.c index 5b459b4fef46..0dcd4faefd8d 100644 --- a/net/core/netdev-genl.c +++ b/net/core/netdev-genl.c @@ -365,6 +365,13 @@ int netdev_nl_napi_set_doit(struct sk_buff *skb, struct genl_info *info) return err; } +static int nla_put_napi_id(struct sk_buff *skb, const struct napi_struct *napi) +{ + if (napi && napi_id_valid(napi->napi_id)) + return nla_put_u32(skb, NETDEV_A_QUEUE_NAPI_ID, napi->napi_id); + return 0; +} + static int netdev_nl_queue_fill_one(struct sk_buff *rsp, struct net_device *netdev, u32 q_idx, u32 q_type, const struct genl_info *info) @@ -386,9 +393,7 @@ netdev_nl_queue_fill_one(struct sk_buff *rsp, struct net_device *netdev, switch (q_type) { case NETDEV_QUEUE_TYPE_RX: rxq = __netif_get_rx_queue(netdev, q_idx); - - if (rxq->napi && nla_put_u32(rsp, NETDEV_A_QUEUE_NAPI_ID, - rxq->napi->napi_id)) + if (nla_put_napi_id(rsp, rxq->napi)) goto nla_put_failure; params = &rxq->mp_params; @@ -398,8 +403,7 @@ netdev_nl_queue_fill_one(struct sk_buff *rsp, struct net_device *netdev, break; case NETDEV_QUEUE_TYPE_TX: txq = netdev_get_tx_queue(netdev, q_idx); - if (txq->napi && nla_put_u32(rsp, NETDEV_A_QUEUE_NAPI_ID, - txq->napi->napi_id)) + if (nla_put_napi_id(rsp, txq->napi)) goto nla_put_failure; } -- 2.51.0 From fe57762c6490581ae446956ab54d221262a4951b Mon Sep 17 00:00:00 2001 From: John Daley Date: Wed, 5 Feb 2025 15:54:13 -0800 Subject: [PATCH 09/16] enic: Move RX functions to their own file Move RX handler code into its own file in preparation for further changes. Some formatting changes were necessary in order to satisfy checkpatch but there were no functional changes. 
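The interface between enic_main.c and the new file stays small; the new enic_rq.h (added below) exposes only these four entry points:

	int enic_rq_service(struct vnic_dev *vdev, struct cq_desc *cq_desc, u8 type,
			    u16 q_number, u16 completed_index, void *opaque);
	void enic_rq_indicate_buf(struct vnic_rq *rq, struct cq_desc *cq_desc,
				  struct vnic_rq_buf *buf, int skipped, void *opaque);
	int enic_rq_alloc_buf(struct vnic_rq *rq);
	void enic_free_rq_buf(struct vnic_rq *rq, struct vnic_rq_buf *buf);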
Co-developed-by: Nelson Escobar Signed-off-by: Nelson Escobar Co-developed-by: Satish Kharat Signed-off-by: Satish Kharat Signed-off-by: John Daley Link: https://patch.msgid.link/20250205235416.25410-2-johndale@cisco.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/cisco/enic/Makefile | 2 +- drivers/net/ethernet/cisco/enic/enic_main.c | 238 +------------------ drivers/net/ethernet/cisco/enic/enic_rq.c | 242 ++++++++++++++++++++ drivers/net/ethernet/cisco/enic/enic_rq.h | 10 + 4 files changed, 254 insertions(+), 238 deletions(-) create mode 100644 drivers/net/ethernet/cisco/enic/enic_rq.c create mode 100644 drivers/net/ethernet/cisco/enic/enic_rq.h diff --git a/drivers/net/ethernet/cisco/enic/Makefile b/drivers/net/ethernet/cisco/enic/Makefile index c3b6febfdbe4..b3b5196b2dfc 100644 --- a/drivers/net/ethernet/cisco/enic/Makefile +++ b/drivers/net/ethernet/cisco/enic/Makefile @@ -3,5 +3,5 @@ obj-$(CONFIG_ENIC) := enic.o enic-y := enic_main.o vnic_cq.o vnic_intr.o vnic_wq.o \ enic_res.o enic_dev.o enic_pp.o vnic_dev.o vnic_rq.o vnic_vic.o \ - enic_ethtool.o enic_api.o enic_clsf.o + enic_ethtool.o enic_api.o enic_clsf.o enic_rq.o diff --git a/drivers/net/ethernet/cisco/enic/enic_main.c b/drivers/net/ethernet/cisco/enic/enic_main.c index 49f6cab01ed5..1d9f109346b8 100644 --- a/drivers/net/ethernet/cisco/enic/enic_main.c +++ b/drivers/net/ethernet/cisco/enic/enic_main.c @@ -58,6 +58,7 @@ #include "enic_dev.h" #include "enic_pp.h" #include "enic_clsf.h" +#include "enic_rq.h" #define ENIC_NOTIFY_TIMER_PERIOD (2 * HZ) #define WQ_ENET_MAX_DESC_LEN (1 << WQ_ENET_LEN_BITS) @@ -1313,243 +1314,6 @@ nla_put_failure: return -EMSGSIZE; } -static void enic_free_rq_buf(struct vnic_rq *rq, struct vnic_rq_buf *buf) -{ - struct enic *enic = vnic_dev_priv(rq->vdev); - - if (!buf->os_buf) - return; - - dma_unmap_single(&enic->pdev->dev, buf->dma_addr, buf->len, - DMA_FROM_DEVICE); - dev_kfree_skb_any(buf->os_buf); - buf->os_buf = NULL; -} - -static int enic_rq_alloc_buf(struct vnic_rq *rq) -{ - struct enic *enic = vnic_dev_priv(rq->vdev); - struct net_device *netdev = enic->netdev; - struct sk_buff *skb; - unsigned int len = netdev->mtu + VLAN_ETH_HLEN; - unsigned int os_buf_index = 0; - dma_addr_t dma_addr; - struct vnic_rq_buf *buf = rq->to_use; - - if (buf->os_buf) { - enic_queue_rq_desc(rq, buf->os_buf, os_buf_index, buf->dma_addr, - buf->len); - - return 0; - } - skb = netdev_alloc_skb_ip_align(netdev, len); - if (!skb) { - enic->rq[rq->index].stats.no_skb++; - return -ENOMEM; - } - - dma_addr = dma_map_single(&enic->pdev->dev, skb->data, len, - DMA_FROM_DEVICE); - if (unlikely(enic_dma_map_check(enic, dma_addr))) { - dev_kfree_skb(skb); - return -ENOMEM; - } - - enic_queue_rq_desc(rq, skb, os_buf_index, - dma_addr, len); - - return 0; -} - -static void enic_intr_update_pkt_size(struct vnic_rx_bytes_counter *pkt_size, - u32 pkt_len) -{ - if (ENIC_LARGE_PKT_THRESHOLD <= pkt_len) - pkt_size->large_pkt_bytes_cnt += pkt_len; - else - pkt_size->small_pkt_bytes_cnt += pkt_len; -} - -static bool enic_rxcopybreak(struct net_device *netdev, struct sk_buff **skb, - struct vnic_rq_buf *buf, u16 len) -{ - struct enic *enic = netdev_priv(netdev); - struct sk_buff *new_skb; - - if (len > enic->rx_copybreak) - return false; - new_skb = netdev_alloc_skb_ip_align(netdev, len); - if (!new_skb) - return false; - dma_sync_single_for_cpu(&enic->pdev->dev, buf->dma_addr, len, - DMA_FROM_DEVICE); - memcpy(new_skb->data, (*skb)->data, len); - *skb = new_skb; - - return true; -} - -static void enic_rq_indicate_buf(struct 
vnic_rq *rq, - struct cq_desc *cq_desc, struct vnic_rq_buf *buf, - int skipped, void *opaque) -{ - struct enic *enic = vnic_dev_priv(rq->vdev); - struct net_device *netdev = enic->netdev; - struct sk_buff *skb; - struct vnic_cq *cq = &enic->cq[enic_cq_rq(enic, rq->index)]; - struct enic_rq_stats *rqstats = &enic->rq[rq->index].stats; - - u8 type, color, eop, sop, ingress_port, vlan_stripped; - u8 fcoe, fcoe_sof, fcoe_fc_crc_ok, fcoe_enc_error, fcoe_eof; - u8 tcp_udp_csum_ok, udp, tcp, ipv4_csum_ok; - u8 ipv6, ipv4, ipv4_fragment, fcs_ok, rss_type, csum_not_calc; - u8 packet_error; - u16 q_number, completed_index, bytes_written, vlan_tci, checksum; - u32 rss_hash; - bool outer_csum_ok = true, encap = false; - - rqstats->packets++; - if (skipped) { - rqstats->desc_skip++; - return; - } - - skb = buf->os_buf; - - cq_enet_rq_desc_dec((struct cq_enet_rq_desc *)cq_desc, - &type, &color, &q_number, &completed_index, - &ingress_port, &fcoe, &eop, &sop, &rss_type, - &csum_not_calc, &rss_hash, &bytes_written, - &packet_error, &vlan_stripped, &vlan_tci, &checksum, - &fcoe_sof, &fcoe_fc_crc_ok, &fcoe_enc_error, - &fcoe_eof, &tcp_udp_csum_ok, &udp, &tcp, - &ipv4_csum_ok, &ipv6, &ipv4, &ipv4_fragment, - &fcs_ok); - - if (packet_error) { - - if (!fcs_ok) { - if (bytes_written > 0) - rqstats->bad_fcs++; - else if (bytes_written == 0) - rqstats->pkt_truncated++; - } - - dma_unmap_single(&enic->pdev->dev, buf->dma_addr, buf->len, - DMA_FROM_DEVICE); - dev_kfree_skb_any(skb); - buf->os_buf = NULL; - - return; - } - - if (eop && bytes_written > 0) { - - /* Good receive - */ - rqstats->bytes += bytes_written; - if (!enic_rxcopybreak(netdev, &skb, buf, bytes_written)) { - buf->os_buf = NULL; - dma_unmap_single(&enic->pdev->dev, buf->dma_addr, - buf->len, DMA_FROM_DEVICE); - } - prefetch(skb->data - NET_IP_ALIGN); - - skb_put(skb, bytes_written); - skb->protocol = eth_type_trans(skb, netdev); - skb_record_rx_queue(skb, q_number); - if ((netdev->features & NETIF_F_RXHASH) && rss_hash && - (type == 3)) { - switch (rss_type) { - case CQ_ENET_RQ_DESC_RSS_TYPE_TCP_IPv4: - case CQ_ENET_RQ_DESC_RSS_TYPE_TCP_IPv6: - case CQ_ENET_RQ_DESC_RSS_TYPE_TCP_IPv6_EX: - skb_set_hash(skb, rss_hash, PKT_HASH_TYPE_L4); - rqstats->l4_rss_hash++; - break; - case CQ_ENET_RQ_DESC_RSS_TYPE_IPv4: - case CQ_ENET_RQ_DESC_RSS_TYPE_IPv6: - case CQ_ENET_RQ_DESC_RSS_TYPE_IPv6_EX: - skb_set_hash(skb, rss_hash, PKT_HASH_TYPE_L3); - rqstats->l3_rss_hash++; - break; - } - } - if (enic->vxlan.vxlan_udp_port_number) { - switch (enic->vxlan.patch_level) { - case 0: - if (fcoe) { - encap = true; - outer_csum_ok = fcoe_fc_crc_ok; - } - break; - case 2: - if ((type == 7) && - (rss_hash & BIT(0))) { - encap = true; - outer_csum_ok = (rss_hash & BIT(1)) && - (rss_hash & BIT(2)); - } - break; - } - } - - /* Hardware does not provide whole packet checksum. It only - * provides pseudo checksum. Since hw validates the packet - * checksum but not provide us the checksum value. use - * CHECSUM_UNNECESSARY. - * - * In case of encap pkt tcp_udp_csum_ok/tcp_udp_csum_ok is - * inner csum_ok. outer_csum_ok is set by hw when outer udp - * csum is correct or is zero. 
- */ - if ((netdev->features & NETIF_F_RXCSUM) && !csum_not_calc && - tcp_udp_csum_ok && outer_csum_ok && - (ipv4_csum_ok || ipv6)) { - skb->ip_summed = CHECKSUM_UNNECESSARY; - skb->csum_level = encap; - if (encap) - rqstats->csum_unnecessary_encap++; - else - rqstats->csum_unnecessary++; - } - - if (vlan_stripped) { - __vlan_hwaccel_put_tag(skb, htons(ETH_P_8021Q), vlan_tci); - rqstats->vlan_stripped++; - } - skb_mark_napi_id(skb, &enic->napi[rq->index]); - if (!(netdev->features & NETIF_F_GRO)) - netif_receive_skb(skb); - else - napi_gro_receive(&enic->napi[q_number], skb); - if (enic->rx_coalesce_setting.use_adaptive_rx_coalesce) - enic_intr_update_pkt_size(&cq->pkt_size_counter, - bytes_written); - } else { - - /* Buffer overflow - */ - rqstats->pkt_truncated++; - dma_unmap_single(&enic->pdev->dev, buf->dma_addr, buf->len, - DMA_FROM_DEVICE); - dev_kfree_skb_any(skb); - buf->os_buf = NULL; - } -} - -static int enic_rq_service(struct vnic_dev *vdev, struct cq_desc *cq_desc, - u8 type, u16 q_number, u16 completed_index, void *opaque) -{ - struct enic *enic = vnic_dev_priv(vdev); - - vnic_rq_service(&enic->rq[q_number].vrq, cq_desc, - completed_index, VNIC_RQ_RETURN_DESC, - enic_rq_indicate_buf, opaque); - - return 0; -} - static void enic_set_int_moderation(struct enic *enic, struct vnic_rq *rq) { unsigned int intr = enic_msix_rq_intr(enic, rq->index); diff --git a/drivers/net/ethernet/cisco/enic/enic_rq.c b/drivers/net/ethernet/cisco/enic/enic_rq.c new file mode 100644 index 000000000000..e5b2f581c055 --- /dev/null +++ b/drivers/net/ethernet/cisco/enic/enic_rq.c @@ -0,0 +1,242 @@ +// SPDX-License-Identifier: GPL-2.0-only +// Copyright 2024 Cisco Systems, Inc. All rights reserved. + +#include +#include +#include +#include "enic.h" +#include "enic_res.h" +#include "enic_rq.h" +#include "vnic_rq.h" +#include "cq_enet_desc.h" + +#define ENIC_LARGE_PKT_THRESHOLD 1000 + +static void enic_intr_update_pkt_size(struct vnic_rx_bytes_counter *pkt_size, + u32 pkt_len) +{ + if (pkt_len > ENIC_LARGE_PKT_THRESHOLD) + pkt_size->large_pkt_bytes_cnt += pkt_len; + else + pkt_size->small_pkt_bytes_cnt += pkt_len; +} + +static bool enic_rxcopybreak(struct net_device *netdev, struct sk_buff **skb, + struct vnic_rq_buf *buf, u16 len) +{ + struct enic *enic = netdev_priv(netdev); + struct sk_buff *new_skb; + + if (len > enic->rx_copybreak) + return false; + new_skb = netdev_alloc_skb_ip_align(netdev, len); + if (!new_skb) + return false; + dma_sync_single_for_cpu(&enic->pdev->dev, buf->dma_addr, len, + DMA_FROM_DEVICE); + memcpy(new_skb->data, (*skb)->data, len); + *skb = new_skb; + + return true; +} + +int enic_rq_service(struct vnic_dev *vdev, struct cq_desc *cq_desc, u8 type, + u16 q_number, u16 completed_index, void *opaque) +{ + struct enic *enic = vnic_dev_priv(vdev); + + vnic_rq_service(&enic->rq[q_number].vrq, cq_desc, completed_index, + VNIC_RQ_RETURN_DESC, enic_rq_indicate_buf, opaque); + return 0; +} + +int enic_rq_alloc_buf(struct vnic_rq *rq) +{ + struct enic *enic = vnic_dev_priv(rq->vdev); + struct net_device *netdev = enic->netdev; + struct sk_buff *skb; + unsigned int len = netdev->mtu + VLAN_ETH_HLEN; + unsigned int os_buf_index = 0; + dma_addr_t dma_addr; + struct vnic_rq_buf *buf = rq->to_use; + + if (buf->os_buf) { + enic_queue_rq_desc(rq, buf->os_buf, os_buf_index, buf->dma_addr, + buf->len); + + return 0; + } + skb = netdev_alloc_skb_ip_align(netdev, len); + if (!skb) { + enic->rq[rq->index].stats.no_skb++; + return -ENOMEM; + } + + dma_addr = dma_map_single(&enic->pdev->dev, skb->data, 
len, + DMA_FROM_DEVICE); + if (unlikely(enic_dma_map_check(enic, dma_addr))) { + dev_kfree_skb(skb); + return -ENOMEM; + } + + enic_queue_rq_desc(rq, skb, os_buf_index, dma_addr, len); + + return 0; +} + +void enic_free_rq_buf(struct vnic_rq *rq, struct vnic_rq_buf *buf) +{ + struct enic *enic = vnic_dev_priv(rq->vdev); + + if (!buf->os_buf) + return; + + dma_unmap_single(&enic->pdev->dev, buf->dma_addr, buf->len, + DMA_FROM_DEVICE); + dev_kfree_skb_any(buf->os_buf); + buf->os_buf = NULL; +} + +void enic_rq_indicate_buf(struct vnic_rq *rq, struct cq_desc *cq_desc, + struct vnic_rq_buf *buf, int skipped, void *opaque) +{ + struct enic *enic = vnic_dev_priv(rq->vdev); + struct net_device *netdev = enic->netdev; + struct sk_buff *skb; + struct vnic_cq *cq = &enic->cq[enic_cq_rq(enic, rq->index)]; + struct enic_rq_stats *rqstats = &enic->rq[rq->index].stats; + + u8 type, color, eop, sop, ingress_port, vlan_stripped; + u8 fcoe, fcoe_sof, fcoe_fc_crc_ok, fcoe_enc_error, fcoe_eof; + u8 tcp_udp_csum_ok, udp, tcp, ipv4_csum_ok; + u8 ipv6, ipv4, ipv4_fragment, fcs_ok, rss_type, csum_not_calc; + u8 packet_error; + u16 q_number, completed_index, bytes_written, vlan_tci, checksum; + u32 rss_hash; + bool outer_csum_ok = true, encap = false; + + rqstats->packets++; + if (skipped) { + rqstats->desc_skip++; + return; + } + + skb = buf->os_buf; + + cq_enet_rq_desc_dec((struct cq_enet_rq_desc *)cq_desc, &type, &color, + &q_number, &completed_index, &ingress_port, &fcoe, + &eop, &sop, &rss_type, &csum_not_calc, &rss_hash, + &bytes_written, &packet_error, &vlan_stripped, + &vlan_tci, &checksum, &fcoe_sof, &fcoe_fc_crc_ok, + &fcoe_enc_error, &fcoe_eof, &tcp_udp_csum_ok, &udp, + &tcp, &ipv4_csum_ok, &ipv6, &ipv4, &ipv4_fragment, + &fcs_ok); + + if (packet_error) { + if (!fcs_ok) { + if (bytes_written > 0) + rqstats->bad_fcs++; + else if (bytes_written == 0) + rqstats->pkt_truncated++; + } + + dma_unmap_single(&enic->pdev->dev, buf->dma_addr, buf->len, + DMA_FROM_DEVICE); + dev_kfree_skb_any(skb); + buf->os_buf = NULL; + + return; + } + + if (eop && bytes_written > 0) { + /* Good receive + */ + rqstats->bytes += bytes_written; + if (!enic_rxcopybreak(netdev, &skb, buf, bytes_written)) { + buf->os_buf = NULL; + dma_unmap_single(&enic->pdev->dev, buf->dma_addr, + buf->len, DMA_FROM_DEVICE); + } + prefetch(skb->data - NET_IP_ALIGN); + + skb_put(skb, bytes_written); + skb->protocol = eth_type_trans(skb, netdev); + skb_record_rx_queue(skb, q_number); + if ((netdev->features & NETIF_F_RXHASH) && rss_hash && + type == 3) { + switch (rss_type) { + case CQ_ENET_RQ_DESC_RSS_TYPE_TCP_IPv4: + case CQ_ENET_RQ_DESC_RSS_TYPE_TCP_IPv6: + case CQ_ENET_RQ_DESC_RSS_TYPE_TCP_IPv6_EX: + skb_set_hash(skb, rss_hash, PKT_HASH_TYPE_L4); + rqstats->l4_rss_hash++; + break; + case CQ_ENET_RQ_DESC_RSS_TYPE_IPv4: + case CQ_ENET_RQ_DESC_RSS_TYPE_IPv6: + case CQ_ENET_RQ_DESC_RSS_TYPE_IPv6_EX: + skb_set_hash(skb, rss_hash, PKT_HASH_TYPE_L3); + rqstats->l3_rss_hash++; + break; + } + } + if (enic->vxlan.vxlan_udp_port_number) { + switch (enic->vxlan.patch_level) { + case 0: + if (fcoe) { + encap = true; + outer_csum_ok = fcoe_fc_crc_ok; + } + break; + case 2: + if (type == 7 && + (rss_hash & BIT(0))) { + encap = true; + outer_csum_ok = (rss_hash & BIT(1)) && + (rss_hash & BIT(2)); + } + break; + } + } + + /* Hardware does not provide whole packet checksum. It only + * provides pseudo checksum. Since hw validates the packet + * checksum but not provide us the checksum value. use + * CHECSUM_UNNECESSARY. 
+ * + * In case of encap pkt tcp_udp_csum_ok/tcp_udp_csum_ok is + * inner csum_ok. outer_csum_ok is set by hw when outer udp + * csum is correct or is zero. + */ + if ((netdev->features & NETIF_F_RXCSUM) && !csum_not_calc && + tcp_udp_csum_ok && outer_csum_ok && + (ipv4_csum_ok || ipv6)) { + skb->ip_summed = CHECKSUM_UNNECESSARY; + skb->csum_level = encap; + if (encap) + rqstats->csum_unnecessary_encap++; + else + rqstats->csum_unnecessary++; + } + + if (vlan_stripped) { + __vlan_hwaccel_put_tag(skb, htons(ETH_P_8021Q), vlan_tci); + rqstats->vlan_stripped++; + } + skb_mark_napi_id(skb, &enic->napi[rq->index]); + if (!(netdev->features & NETIF_F_GRO)) + netif_receive_skb(skb); + else + napi_gro_receive(&enic->napi[q_number], skb); + if (enic->rx_coalesce_setting.use_adaptive_rx_coalesce) + enic_intr_update_pkt_size(&cq->pkt_size_counter, + bytes_written); + } else { + /* Buffer overflow + */ + rqstats->pkt_truncated++; + dma_unmap_single(&enic->pdev->dev, buf->dma_addr, buf->len, + DMA_FROM_DEVICE); + dev_kfree_skb_any(skb); + buf->os_buf = NULL; + } +} diff --git a/drivers/net/ethernet/cisco/enic/enic_rq.h b/drivers/net/ethernet/cisco/enic/enic_rq.h new file mode 100644 index 000000000000..a75d07562686 --- /dev/null +++ b/drivers/net/ethernet/cisco/enic/enic_rq.h @@ -0,0 +1,10 @@ +/* SPDX-License-Identifier: GPL-2.0-only + * Copyright 2024 Cisco Systems, Inc. All rights reserved. + */ + +int enic_rq_service(struct vnic_dev *vdev, struct cq_desc *cq_desc, u8 type, + u16 q_number, u16 completed_index, void *opaque); +void enic_rq_indicate_buf(struct vnic_rq *rq, struct cq_desc *cq_desc, + struct vnic_rq_buf *buf, int skipped, void *opaque); +int enic_rq_alloc_buf(struct vnic_rq *rq); +void enic_free_rq_buf(struct vnic_rq *rq, struct vnic_rq_buf *buf); -- 2.51.0 From eab3726347f8e38162ee2a0bef0f24fad11f4b61 Mon Sep 17 00:00:00 2001 From: John Daley Date: Wed, 5 Feb 2025 15:54:14 -0800 Subject: [PATCH 10/16] enic: Simplify RX handler function Split up RX handler functions in preparation for moving to a page pool based implementation. No functional changes. 
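After the split, the receive path of enic_rq_indicate_buf() reduces to roughly the following (a sketch of the resulting control flow; the full hunks are below):

	/* error accounting moves into enic_rq_pkt_error() ... */
	if (enic_rq_pkt_error(rq, packet_error, fcs_ok, bytes_written)) {
		dma_unmap_single(&enic->pdev->dev, buf->dma_addr, buf->len,
				 DMA_FROM_DEVICE);
		dev_kfree_skb_any(skb);
		buf->os_buf = NULL;
		return;
	}
	/* ... and all skb flag fixups (RSS hash, checksum, VLAN) into
	 * enic_rq_set_skb_flags()
	 */
	enic_rq_set_skb_flags(rq, type, rss_hash, rss_type, fcoe,
			      fcoe_fc_crc_ok, vlan_stripped, csum_not_calc,
			      tcp_udp_csum_ok, ipv6, ipv4_csum_ok, vlan_tci,
			      skb);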
Co-developed-by: Nelson Escobar Signed-off-by: Nelson Escobar Co-developed-by: Satish Kharat Signed-off-by: Satish Kharat Signed-off-by: John Daley Link: https://patch.msgid.link/20250205235416.25410-3-johndale@cisco.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/cisco/enic/enic_rq.c | 162 +++++++++++++--------- 1 file changed, 93 insertions(+), 69 deletions(-) diff --git a/drivers/net/ethernet/cisco/enic/enic_rq.c b/drivers/net/ethernet/cisco/enic/enic_rq.c index e5b2f581c055..48aa385aa831 100644 --- a/drivers/net/ethernet/cisco/enic/enic_rq.c +++ b/drivers/net/ethernet/cisco/enic/enic_rq.c @@ -50,6 +50,94 @@ int enic_rq_service(struct vnic_dev *vdev, struct cq_desc *cq_desc, u8 type, return 0; } +static void enic_rq_set_skb_flags(struct vnic_rq *vrq, u8 type, u32 rss_hash, + u8 rss_type, u8 fcoe, u8 fcoe_fc_crc_ok, + u8 vlan_stripped, u8 csum_not_calc, + u8 tcp_udp_csum_ok, u8 ipv6, u8 ipv4_csum_ok, + u16 vlan_tci, struct sk_buff *skb) +{ + struct enic *enic = vnic_dev_priv(vrq->vdev); + struct net_device *netdev = enic->netdev; + struct enic_rq_stats *rqstats = &enic->rq[vrq->index].stats; + bool outer_csum_ok = true, encap = false; + + if ((netdev->features & NETIF_F_RXHASH) && rss_hash && type == 3) { + switch (rss_type) { + case CQ_ENET_RQ_DESC_RSS_TYPE_TCP_IPv4: + case CQ_ENET_RQ_DESC_RSS_TYPE_TCP_IPv6: + case CQ_ENET_RQ_DESC_RSS_TYPE_TCP_IPv6_EX: + skb_set_hash(skb, rss_hash, PKT_HASH_TYPE_L4); + rqstats->l4_rss_hash++; + break; + case CQ_ENET_RQ_DESC_RSS_TYPE_IPv4: + case CQ_ENET_RQ_DESC_RSS_TYPE_IPv6: + case CQ_ENET_RQ_DESC_RSS_TYPE_IPv6_EX: + skb_set_hash(skb, rss_hash, PKT_HASH_TYPE_L3); + rqstats->l3_rss_hash++; + break; + } + } + if (enic->vxlan.vxlan_udp_port_number) { + switch (enic->vxlan.patch_level) { + case 0: + if (fcoe) { + encap = true; + outer_csum_ok = fcoe_fc_crc_ok; + } + break; + case 2: + if (type == 7 && (rss_hash & BIT(0))) { + encap = true; + outer_csum_ok = (rss_hash & BIT(1)) && + (rss_hash & BIT(2)); + } + break; + } + } + + /* Hardware does not provide whole packet checksum. It only + * provides pseudo checksum. Since hw validates the packet + * checksum but not provide us the checksum value. use + * CHECSUM_UNNECESSARY. + * + * In case of encap pkt tcp_udp_csum_ok/tcp_udp_csum_ok is + * inner csum_ok. outer_csum_ok is set by hw when outer udp + * csum is correct or is zero. 
+ */ + if ((netdev->features & NETIF_F_RXCSUM) && !csum_not_calc && + tcp_udp_csum_ok && outer_csum_ok && (ipv4_csum_ok || ipv6)) { + skb->ip_summed = CHECKSUM_UNNECESSARY; + skb->csum_level = encap; + if (encap) + rqstats->csum_unnecessary_encap++; + else + rqstats->csum_unnecessary++; + } + + if (vlan_stripped) { + __vlan_hwaccel_put_tag(skb, htons(ETH_P_8021Q), vlan_tci); + rqstats->vlan_stripped++; + } +} + +static bool enic_rq_pkt_error(struct vnic_rq *vrq, u8 packet_error, u8 fcs_ok, + u16 bytes_written) +{ + struct enic *enic = vnic_dev_priv(vrq->vdev); + struct enic_rq_stats *rqstats = &enic->rq[vrq->index].stats; + + if (packet_error) { + if (!fcs_ok) { + if (bytes_written > 0) + rqstats->bad_fcs++; + else if (bytes_written == 0) + rqstats->pkt_truncated++; + } + return true; + } + return false; +} + int enic_rq_alloc_buf(struct vnic_rq *rq) { struct enic *enic = vnic_dev_priv(rq->vdev); @@ -113,7 +201,6 @@ void enic_rq_indicate_buf(struct vnic_rq *rq, struct cq_desc *cq_desc, u8 packet_error; u16 q_number, completed_index, bytes_written, vlan_tci, checksum; u32 rss_hash; - bool outer_csum_ok = true, encap = false; rqstats->packets++; if (skipped) { @@ -132,14 +219,7 @@ void enic_rq_indicate_buf(struct vnic_rq *rq, struct cq_desc *cq_desc, &tcp, &ipv4_csum_ok, &ipv6, &ipv4, &ipv4_fragment, &fcs_ok); - if (packet_error) { - if (!fcs_ok) { - if (bytes_written > 0) - rqstats->bad_fcs++; - else if (bytes_written == 0) - rqstats->pkt_truncated++; - } - + if (enic_rq_pkt_error(rq, packet_error, fcs_ok, bytes_written)) { dma_unmap_single(&enic->pdev->dev, buf->dma_addr, buf->len, DMA_FROM_DEVICE); dev_kfree_skb_any(skb); @@ -162,66 +242,10 @@ void enic_rq_indicate_buf(struct vnic_rq *rq, struct cq_desc *cq_desc, skb_put(skb, bytes_written); skb->protocol = eth_type_trans(skb, netdev); skb_record_rx_queue(skb, q_number); - if ((netdev->features & NETIF_F_RXHASH) && rss_hash && - type == 3) { - switch (rss_type) { - case CQ_ENET_RQ_DESC_RSS_TYPE_TCP_IPv4: - case CQ_ENET_RQ_DESC_RSS_TYPE_TCP_IPv6: - case CQ_ENET_RQ_DESC_RSS_TYPE_TCP_IPv6_EX: - skb_set_hash(skb, rss_hash, PKT_HASH_TYPE_L4); - rqstats->l4_rss_hash++; - break; - case CQ_ENET_RQ_DESC_RSS_TYPE_IPv4: - case CQ_ENET_RQ_DESC_RSS_TYPE_IPv6: - case CQ_ENET_RQ_DESC_RSS_TYPE_IPv6_EX: - skb_set_hash(skb, rss_hash, PKT_HASH_TYPE_L3); - rqstats->l3_rss_hash++; - break; - } - } - if (enic->vxlan.vxlan_udp_port_number) { - switch (enic->vxlan.patch_level) { - case 0: - if (fcoe) { - encap = true; - outer_csum_ok = fcoe_fc_crc_ok; - } - break; - case 2: - if (type == 7 && - (rss_hash & BIT(0))) { - encap = true; - outer_csum_ok = (rss_hash & BIT(1)) && - (rss_hash & BIT(2)); - } - break; - } - } - - /* Hardware does not provide whole packet checksum. It only - * provides pseudo checksum. Since hw validates the packet - * checksum but not provide us the checksum value. use - * CHECSUM_UNNECESSARY. - * - * In case of encap pkt tcp_udp_csum_ok/tcp_udp_csum_ok is - * inner csum_ok. outer_csum_ok is set by hw when outer udp - * csum is correct or is zero. 
- */ - if ((netdev->features & NETIF_F_RXCSUM) && !csum_not_calc && - tcp_udp_csum_ok && outer_csum_ok && - (ipv4_csum_ok || ipv6)) { - skb->ip_summed = CHECKSUM_UNNECESSARY; - skb->csum_level = encap; - if (encap) - rqstats->csum_unnecessary_encap++; - else - rqstats->csum_unnecessary++; - } - - if (vlan_stripped) { - __vlan_hwaccel_put_tag(skb, htons(ETH_P_8021Q), vlan_tci); - rqstats->vlan_stripped++; - } + enic_rq_set_skb_flags(rq, type, rss_hash, rss_type, fcoe, + fcoe_fc_crc_ok, vlan_stripped, + csum_not_calc, tcp_udp_csum_ok, ipv6, + ipv4_csum_ok, vlan_tci, skb); skb_mark_napi_id(skb, &enic->napi[rq->index]); if (!(netdev->features & NETIF_F_GRO)) netif_receive_skb(skb); else napi_gro_receive(&enic->napi[q_number], skb); -- 2.51.0 From d24cb52b2d8abc89739694ac0be7f0bc2da58e37 Mon Sep 17 00:00:00 2001 From: John Daley Date: Wed, 5 Feb 2025 15:54:15 -0800 Subject: [PATCH 11/16] enic: Use the Page Pool API for RX The Page Pool API improves bandwidth and reduces CPU overhead by recycling pages instead of allocating new buffers in the driver. Make use of page pool fragment allocation for smaller MTUs so that multiple packets can share a page. For MTUs larger than PAGE_SIZE, adjust the 'order' page parameter so that contiguous pages can be used to receive the larger packets. The RQ descriptor field 'os_buf' is repurposed to hold page pointers allocated from page_pool instead of SKBs. When packets arrive, SKBs are allocated and the pages are attached to them as frags, rather than SKBs being preallocated. The 'alloc_fail' netdev statistic is incremented when page_pool_dev_alloc() fails. Co-developed-by: Nelson Escobar Signed-off-by: Nelson Escobar Co-developed-by: Satish Kharat Signed-off-by: Satish Kharat Signed-off-by: John Daley Link: https://patch.msgid.link/20250205235416.25410-4-johndale@cisco.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/cisco/enic/enic.h | 3 + drivers/net/ethernet/cisco/enic/enic_main.c | 33 +++++++- drivers/net/ethernet/cisco/enic/enic_rq.c | 94 ++++++++------------- drivers/net/ethernet/cisco/enic/vnic_rq.h | 2 + 4 files changed, 71 insertions(+), 61 deletions(-) diff --git a/drivers/net/ethernet/cisco/enic/enic.h b/drivers/net/ethernet/cisco/enic/enic.h index 10b7e02ba4d0..2ccf2d2a77db 100644 --- a/drivers/net/ethernet/cisco/enic/enic.h +++ b/drivers/net/ethernet/cisco/enic/enic.h @@ -17,6 +17,7 @@ #include "vnic_nic.h" #include "vnic_rss.h" #include +#include #define DRV_NAME "enic" #define DRV_DESCRIPTION "Cisco VIC Ethernet NIC Driver" @@ -158,6 +159,7 @@ struct enic_rq_stats { u64 pkt_truncated; /* truncated pkts */ u64 no_skb; /* out of skbs */ u64 desc_skip; /* Rx pkt went into later buffer */ + u64 pp_alloc_fail; /* page pool alloc failure */ }; struct enic_wq { @@ -169,6 +171,7 @@ struct enic_wq { struct enic_rq { struct vnic_rq vrq; struct enic_rq_stats stats; + struct page_pool *pool; } ____cacheline_aligned; /* Per-instance private data structure */ diff --git a/drivers/net/ethernet/cisco/enic/enic_main.c b/drivers/net/ethernet/cisco/enic/enic_main.c index 1d9f109346b8..447c54dcd89b 100644 --- a/drivers/net/ethernet/cisco/enic/enic_main.c +++ b/drivers/net/ethernet/cisco/enic/enic_main.c @@ -1736,6 +1736,17 @@ static int enic_open(struct net_device *netdev) struct enic *enic = netdev_priv(netdev); unsigned int i; int err, ret; + unsigned int max_pkt_len = netdev->mtu + VLAN_ETH_HLEN; + struct page_pool_params pp_params = { + .order = get_order(max_pkt_len), + .pool_size = enic->config.rq_desc_count, + .nid = dev_to_node(&enic->pdev->dev), + .dev = &enic->pdev->dev, + .dma_dir = DMA_FROM_DEVICE, + .max_len = 
(max_pkt_len > PAGE_SIZE) ? max_pkt_len : PAGE_SIZE, + .netdev = netdev, + .flags = PP_FLAG_DMA_MAP | PP_FLAG_DMA_SYNC_DEV, + }; err = enic_request_intr(enic); if (err) { @@ -1753,6 +1764,16 @@ static int enic_open(struct net_device *netdev) } for (i = 0; i < enic->rq_count; i++) { + /* create a page pool for each RQ */ + pp_params.napi = &enic->napi[i]; + pp_params.queue_idx = i; + enic->rq[i].pool = page_pool_create(&pp_params); + if (IS_ERR(enic->rq[i].pool)) { + err = PTR_ERR(enic->rq[i].pool); + enic->rq[i].pool = NULL; + goto err_out_free_rq; + } + /* enable rq before updating rq desc */ vnic_rq_enable(&enic->rq[i].vrq); vnic_rq_fill(&enic->rq[i].vrq, enic_rq_alloc_buf); @@ -1793,8 +1814,11 @@ static int enic_open(struct net_device *netdev) err_out_free_rq: for (i = 0; i < enic->rq_count; i++) { ret = vnic_rq_disable(&enic->rq[i].vrq); - if (!ret) + if (!ret) { vnic_rq_clean(&enic->rq[i].vrq, enic_free_rq_buf); + page_pool_destroy(enic->rq[i].pool); + enic->rq[i].pool = NULL; + } } enic_dev_notify_unset(enic); err_out_free_intr: @@ -1852,8 +1876,11 @@ static int enic_stop(struct net_device *netdev) for (i = 0; i < enic->wq_count; i++) vnic_wq_clean(&enic->wq[i].vwq, enic_free_wq_buf); - for (i = 0; i < enic->rq_count; i++) + for (i = 0; i < enic->rq_count; i++) { vnic_rq_clean(&enic->rq[i].vrq, enic_free_rq_buf); + page_pool_destroy(enic->rq[i].pool); + enic->rq[i].pool = NULL; + } for (i = 0; i < enic->cq_count; i++) vnic_cq_clean(&enic->cq[i]); for (i = 0; i < enic->intr_count; i++) @@ -2363,6 +2390,7 @@ static void enic_get_queue_stats_rx(struct net_device *dev, int idx, rxs->hw_drop_overruns = rqstats->pkt_truncated; rxs->csum_unnecessary = rqstats->csum_unnecessary + rqstats->csum_unnecessary_encap; + rxs->alloc_fail = rqstats->pp_alloc_fail; } static void enic_get_queue_stats_tx(struct net_device *dev, int idx, @@ -2390,6 +2418,7 @@ static void enic_get_base_stats(struct net_device *dev, rxs->hw_drops = 0; rxs->hw_drop_overruns = 0; rxs->csum_unnecessary = 0; + rxs->alloc_fail = 0; txs->bytes = 0; txs->packets = 0; txs->csum_none = 0; diff --git a/drivers/net/ethernet/cisco/enic/enic_rq.c b/drivers/net/ethernet/cisco/enic/enic_rq.c index 48aa385aa831..e3228ef7988a 100644 --- a/drivers/net/ethernet/cisco/enic/enic_rq.c +++ b/drivers/net/ethernet/cisco/enic/enic_rq.c @@ -21,25 +21,6 @@ static void enic_intr_update_pkt_size(struct vnic_rx_bytes_counter *pkt_size, pkt_size->small_pkt_bytes_cnt += pkt_len; } -static bool enic_rxcopybreak(struct net_device *netdev, struct sk_buff **skb, - struct vnic_rq_buf *buf, u16 len) -{ - struct enic *enic = netdev_priv(netdev); - struct sk_buff *new_skb; - - if (len > enic->rx_copybreak) - return false; - new_skb = netdev_alloc_skb_ip_align(netdev, len); - if (!new_skb) - return false; - dma_sync_single_for_cpu(&enic->pdev->dev, buf->dma_addr, len, - DMA_FROM_DEVICE); - memcpy(new_skb->data, (*skb)->data, len); - *skb = new_skb; - - return true; -} - int enic_rq_service(struct vnic_dev *vdev, struct cq_desc *cq_desc, u8 type, u16 q_number, u16 completed_index, void *opaque) { @@ -142,11 +123,15 @@ int enic_rq_alloc_buf(struct vnic_rq *rq) { struct enic *enic = vnic_dev_priv(rq->vdev); struct net_device *netdev = enic->netdev; - struct sk_buff *skb; + struct enic_rq *erq = &enic->rq[rq->index]; + struct enic_rq_stats *rqstats = &erq->stats; + unsigned int offset = 0; unsigned int len = netdev->mtu + VLAN_ETH_HLEN; unsigned int os_buf_index = 0; dma_addr_t dma_addr; struct vnic_rq_buf *buf = rq->to_use; + struct page *page; + unsigned int 
Co-developed-by: Nelson Escobar
Signed-off-by: Nelson Escobar
Co-developed-by: Satish Kharat
Signed-off-by: Satish Kharat
Signed-off-by: John Daley
Link: https://patch.msgid.link/20250205235416.25410-5-johndale@cisco.com
Signed-off-by: Jakub Kicinski
---
 drivers/net/ethernet/cisco/enic/enic.h      |  1 -
 .../net/ethernet/cisco/enic/enic_ethtool.c  | 39 -------------------
 drivers/net/ethernet/cisco/enic/enic_main.c |  3 --
 3 files changed, 43 deletions(-)

diff --git a/drivers/net/ethernet/cisco/enic/enic.h b/drivers/net/ethernet/cisco/enic/enic.h
index 2ccf2d2a77db..305ed12aa031 100644
--- a/drivers/net/ethernet/cisco/enic/enic.h
+++ b/drivers/net/ethernet/cisco/enic/enic.h
@@ -226,7 +226,6 @@ struct enic {
 	unsigned int cq_avail;
 	unsigned int cq_count;
 	struct enic_rfs_flw_tbl rfs_h;
-	u32 rx_copybreak;
 	u8 rss_key[ENIC_RSS_LEN];
 	struct vnic_gen_stats gen_stats;
 };
diff --git a/drivers/net/ethernet/cisco/enic/enic_ethtool.c b/drivers/net/ethernet/cisco/enic/enic_ethtool.c
index d607b4f0542c..18b929fc2879 100644
--- a/drivers/net/ethernet/cisco/enic/enic_ethtool.c
+++ b/drivers/net/ethernet/cisco/enic/enic_ethtool.c
@@ -608,43 +608,6 @@ static int enic_get_rxnfc(struct net_device *dev, struct ethtool_rxnfc *cmd,
 	return ret;
 }

-static int enic_get_tunable(struct net_device *dev,
-			    const struct ethtool_tunable *tuna, void *data)
-{
-	struct enic *enic = netdev_priv(dev);
-	int ret = 0;
-
-	switch (tuna->id) {
-	case ETHTOOL_RX_COPYBREAK:
-		*(u32 *)data = enic->rx_copybreak;
-		break;
-	default:
-		ret = -EINVAL;
-		break;
-	}
-
-	return ret;
-}
-
-static int enic_set_tunable(struct net_device *dev,
-			    const struct ethtool_tunable *tuna,
-			    const void *data)
-{
-	struct enic *enic = netdev_priv(dev);
-	int ret = 0;
-
-	switch (tuna->id) {
-	case ETHTOOL_RX_COPYBREAK:
-		enic->rx_copybreak = *(u32 *)data;
-		break;
-	default:
-		ret = -EINVAL;
-		break;
-	}
-
-	return ret;
-}
-
 static u32 enic_get_rxfh_key_size(struct net_device *netdev)
 {
 	return ENIC_RSS_LEN;
@@ -727,8 +690,6 @@ static const struct ethtool_ops enic_ethtool_ops = {
 	.get_coalesce = enic_get_coalesce,
 	.set_coalesce = enic_set_coalesce,
 	.get_rxnfc = enic_get_rxnfc,
-	.get_tunable = enic_get_tunable,
-	.set_tunable = enic_set_tunable,
 	.get_rxfh_key_size = enic_get_rxfh_key_size,
 	.get_rxfh = enic_get_rxfh,
 	.set_rxfh = enic_set_rxfh,
diff --git a/drivers/net/ethernet/cisco/enic/enic_main.c b/drivers/net/ethernet/cisco/enic/enic_main.c
index 447c54dcd89b..f24fd29ea207 100644
--- a/drivers/net/ethernet/cisco/enic/enic_main.c
+++ b/drivers/net/ethernet/cisco/enic/enic_main.c
@@ -69,8 +69,6 @@
 #define PCI_DEVICE_ID_CISCO_VIC_ENET_DYN	0x0044	/* enet dynamic vnic */
 #define PCI_DEVICE_ID_CISCO_VIC_ENET_VF		0x0071	/* enet SRIOV VF */

-#define RX_COPYBREAK_DEFAULT		256
-
 /* Supported devices */
 static const struct pci_device_id enic_id_table[] = {
 	{ PCI_VDEVICE(CISCO, PCI_DEVICE_ID_CISCO_VIC_ENET) },
@@ -2972,7 +2970,6 @@ static int enic_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
 		dev_err(dev, "Cannot register net device, aborting\n");
 		goto err_out_dev_deinit;
 	}
-	enic->rx_copybreak = RX_COPYBREAK_DEFAULT;

 	return 0;
--
2.51.0

From 508df2de7b3eabc72d773c5f29207745100986d0 Mon Sep 17 00:00:00 2001
From: "Russell King (Oracle)"
Date: Wed, 5 Feb 2025 15:43:32 +0000
Subject: [PATCH 13/16] net: pcs: rzn1-miic: fill in PCS supported_interfaces

Populate the PCS supported_interfaces bitmap with the interfaces that
this PCS supports. This makes the manual checking in miic_validate()
redundant, so remove that.
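
In effect the patch trades an imperative callback for a declarative
capability list; schematically (illustrative only, mirroring the hunks
below):

	/* Removed: ->pcs_validate() answering the question in code. */
	if (phy_interface_mode_is_rgmii(state->interface) ||
	    state->interface == PHY_INTERFACE_MODE_RMII ||
	    state->interface == PHY_INTERFACE_MODE_MII)
		return 1;
	return -EINVAL;

	/* Added: the same set of modes, published once at creation time,
	 * leaving the checking to the phylink core.
	 */
	phy_interface_set_rgmii(pcs->supported_interfaces);
	__set_bit(PHY_INTERFACE_MODE_RMII, pcs->supported_interfaces);
	__set_bit(PHY_INTERFACE_MODE_MII, pcs->supported_interfaces);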
Signed-off-by: Russell King (Oracle)
Reviewed-by: Andrew Lunn
Link: https://patch.msgid.link/E1tfhYq-003aTm-Nx@rmk-PC.armlinux.org.uk
Signed-off-by: Jakub Kicinski
---
 drivers/net/pcs/pcs-rzn1-miic.c | 16 ++++------------
 1 file changed, 4 insertions(+), 12 deletions(-)

diff --git a/drivers/net/pcs/pcs-rzn1-miic.c b/drivers/net/pcs/pcs-rzn1-miic.c
index 61944574d087..11a96459a425 100644
--- a/drivers/net/pcs/pcs-rzn1-miic.c
+++ b/drivers/net/pcs/pcs-rzn1-miic.c
@@ -268,17 +268,6 @@ static void miic_link_up(struct phylink_pcs *pcs, unsigned int neg_mode,
 			 (MIIC_CONVCTRL_CONV_SPEED | MIIC_CONVCTRL_FULLD), val);
 }

-static int miic_validate(struct phylink_pcs *pcs, unsigned long *supported,
-			 const struct phylink_link_state *state)
-{
-	if (phy_interface_mode_is_rgmii(state->interface) ||
-	    state->interface == PHY_INTERFACE_MODE_RMII ||
-	    state->interface == PHY_INTERFACE_MODE_MII)
-		return 1;
-
-	return -EINVAL;
-}
-
 static int miic_pre_init(struct phylink_pcs *pcs)
 {
 	struct miic_port *miic_port = phylink_pcs_to_miic_port(pcs);
@@ -307,7 +296,6 @@ static int miic_pre_init(struct phylink_pcs *pcs)
 }

 static const struct phylink_pcs_ops miic_phylink_ops = {
-	.pcs_validate = miic_validate,
 	.pcs_config = miic_config,
 	.pcs_link_up = miic_link_up,
 	.pcs_pre_init = miic_pre_init,
@@ -363,6 +351,10 @@ struct phylink_pcs *miic_create(struct device *dev, struct device_node *np)
 	miic_port->pcs.ops = &miic_phylink_ops;
 	miic_port->pcs.neg_mode = true;

+	phy_interface_set_rgmii(miic_port->pcs.supported_interfaces);
+	__set_bit(PHY_INTERFACE_MODE_RMII, miic_port->pcs.supported_interfaces);
+	__set_bit(PHY_INTERFACE_MODE_MII, miic_port->pcs.supported_interfaces);
+
 	return &miic_port->pcs;
 }
 EXPORT_SYMBOL(miic_create);
--
2.51.0

From ec7309525a37b3f34c4efa85fc2ecaa65f41830e Mon Sep 17 00:00:00 2001
From: Geert Uytterhoeven
Date: Wed, 5 Feb 2025 17:09:47 +0100
Subject: [PATCH 14/16] net: pcs: rzn1-miic: Convert to for_each_available_child_of_node() helper

Simplify miic_parse_dt() by using the for_each_available_child_of_node()
helper instead of manually skipping unavailable child nodes.
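
The generic shape of such a conversion (a hypothetical example, not the
miic code itself):

	struct device_node *child;

	/* Open-coded: visit every child, skip unavailable ones by hand. */
	for_each_child_of_node(np, child) {
		if (!of_device_is_available(child))
			continue;
		/* ... use child ... */
	}

	/* With the helper: disabled nodes are never visited at all. */
	for_each_available_child_of_node(np, child) {
		/* ... use child ... */
	}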
Signed-off-by: Geert Uytterhoeven
Reviewed-by: Simon Horman
Link: https://patch.msgid.link/3e394d4cf8204bcf17b184bfda474085aa8ed0dd.1738771631.git.geert+renesas@glider.be
Signed-off-by: Jakub Kicinski
---
 drivers/net/pcs/pcs-rzn1-miic.c | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/drivers/net/pcs/pcs-rzn1-miic.c b/drivers/net/pcs/pcs-rzn1-miic.c
index 11a96459a425..a808ac7375f5 100644
--- a/drivers/net/pcs/pcs-rzn1-miic.c
+++ b/drivers/net/pcs/pcs-rzn1-miic.c
@@ -464,13 +464,10 @@ static int miic_parse_dt(struct device *dev, u32 *mode_cfg)
 	if (of_property_read_u32(np, "renesas,miic-switch-portin", &conf) == 0)
 		dt_val[0] = conf;

-	for_each_child_of_node(np, conv) {
+	for_each_available_child_of_node(np, conv) {
 		if (of_property_read_u32(conv, "reg", &port))
 			continue;

-		if (!of_device_is_available(conv))
-			continue;
-
 		if (of_property_read_u32(conv, "renesas,miic-input", &conf) == 0)
 			dt_val[port] = conf;
 	}
--
2.51.0

From 8d3bbe4355aded32961b9009b31de6d41b7352e9 Mon Sep 17 00:00:00 2001
From: Biju Das
Date: Wed, 5 Feb 2025 12:42:21 +0000
Subject: [PATCH 15/16] of: base: Add of_get_available_child_by_name()

There are a lot of drivers using of_get_child_by_name() followed by
of_device_is_available() to find the available child node by name for
a given parent. Provide a helper for these users to simplify the code.
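
A typical caller before and after (hypothetical snippet; the next patch
converts a real caller in the rzn1_a5psw driver):

	struct device_node *np;

	/* Before: two calls plus manual refcount cleanup. */
	np = of_get_child_by_name(dev->of_node, "mdio");
	if (np && !of_device_is_available(np)) {
		of_node_put(np);
		np = NULL;
	}

	/* After: one call; NULL covers both missing and disabled. */
	np = of_get_available_child_by_name(dev->of_node, "mdio");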
Suggested-by: Geert Uytterhoeven
Reviewed-by: Rob Herring
Signed-off-by: Biju Das
Reviewed-by: Simon Horman
Signed-off-by: David S. Miller
---
 drivers/of/base.c  | 27 +++++++++++++++++++++++++++
 include/linux/of.h |  9 +++++++++
 2 files changed, 36 insertions(+)

diff --git a/drivers/of/base.c b/drivers/of/base.c
index af6c68bbb427..e37b088f1fad 100644
--- a/drivers/of/base.c
+++ b/drivers/of/base.c
@@ -824,6 +824,33 @@ struct device_node *of_get_child_by_name(const struct device_node *node,
 }
 EXPORT_SYMBOL(of_get_child_by_name);

+/**
+ * of_get_available_child_by_name - Find the available child node by name for a given parent
+ * @node:	parent node
+ * @name:	child name to look for.
+ *
+ * This function looks for a child node with the given name and checks that
+ * the device is available for use.
+ *
+ * Return: A node pointer if found, with refcount incremented, use
+ * of_node_put() on it when done.
+ * Returns NULL if node is not found.
+ */
+struct device_node *of_get_available_child_by_name(const struct device_node *node,
+						   const char *name)
+{
+	struct device_node *child;
+
+	child = of_get_child_by_name(node, name);
+	if (child && !of_device_is_available(child)) {
+		of_node_put(child);
+		return NULL;
+	}
+
+	return child;
+}
+EXPORT_SYMBOL(of_get_available_child_by_name);
+
 struct device_node *__of_find_node_by_path(const struct device_node *parent,
 					   const char *path)
 {
diff --git a/include/linux/of.h b/include/linux/of.h
index eaf0e2a2b75c..9d6b8a61607f 100644
--- a/include/linux/of.h
+++ b/include/linux/of.h
@@ -301,6 +301,8 @@ extern struct device_node *of_get_compatible_child(const struct device_node *par
 					const char *compatible);
 extern struct device_node *of_get_child_by_name(const struct device_node *node,
 					const char *name);
+extern struct device_node *of_get_available_child_by_name(const struct device_node *node,
+					const char *name);

 /* cache lookup */
 extern struct device_node *of_find_next_cache_node(const struct device_node *);
@@ -578,6 +580,13 @@ static inline struct device_node *of_get_child_by_name(
 	return NULL;
 }

+static inline struct device_node *of_get_available_child_by_name(
+					const struct device_node *node,
+					const char *name)
+{
+	return NULL;
+}
+
 static inline int of_device_is_compatible(const struct device_node *device,
 					  const char *name)
 {
--
2.51.0

From 46df19a8dfdf48c9ef6fe1fc66e912eabb0213ed Mon Sep 17 00:00:00 2001
From: Biju Das
Date: Wed, 5 Feb 2025 12:42:22 +0000
Subject: [PATCH 16/16] net: dsa: rzn1_a5psw: Use of_get_available_child_by_name()

Simplify a5psw_probe() by using of_get_available_child_by_name(). While
at it, move of_node_put(mdio) inside the if block to avoid code
duplication.

Signed-off-by: Biju Das
Reviewed-by: Simon Horman
Signed-off-by: David S. Miller
---
 drivers/net/dsa/rzn1_a5psw.c | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

diff --git a/drivers/net/dsa/rzn1_a5psw.c b/drivers/net/dsa/rzn1_a5psw.c
index 66974379334a..31ea8130a495 100644
--- a/drivers/net/dsa/rzn1_a5psw.c
+++ b/drivers/net/dsa/rzn1_a5psw.c
@@ -1248,18 +1248,16 @@ static int a5psw_probe(struct platform_device *pdev)
 	if (ret)
 		goto clk_disable;

-	mdio = of_get_child_by_name(dev->of_node, "mdio");
-	if (of_device_is_available(mdio)) {
+	mdio = of_get_available_child_by_name(dev->of_node, "mdio");
+	if (mdio) {
 		ret = a5psw_probe_mdio(a5psw, mdio);
+		of_node_put(mdio);
 		if (ret) {
-			of_node_put(mdio);
 			dev_err(dev, "Failed to register MDIO: %d\n", ret);
 			goto hclk_disable;
 		}
 	}

-	of_node_put(mdio);
-
 	ds = &a5psw->ds;
 	ds->dev = dev;
 	ds->num_ports = A5PSW_PORTS_NUM;
--
2.51.0