From 945ca432bfd0788b960f8f721594dae4fc3c02c1 Mon Sep 17 00:00:00 2001 From: Dragos Tatulea Date: Thu, 7 Nov 2024 21:43:56 +0200 Subject: [PATCH 01/16] net/mlx5e: SHAMPO, Drop info array The info array is used to store a pointer to the dma address of the header and to the frag page. However, this array is not really required: - The frag page can be calculated from the header index frag page index = header index / headers per page. - The dma address can be calculated through a formula: dma page address + header offset. This series gets rid of the info array and uses the above formulas instead. The current_page_index was used in conjunction with the info array to store page fragment indices. This variable is dropped as well. There was no performance regression observed. Signed-off-by: Dragos Tatulea Signed-off-by: Tariq Toukan Link: https://patch.msgid.link/20241107194357.683732-12-tariqt@nvidia.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/mellanox/mlx5/core/en.h | 3 +- .../net/ethernet/mellanox/mlx5/core/en_main.c | 7 +- .../net/ethernet/mellanox/mlx5/core/en_rx.c | 76 ++++++++++--------- 3 files changed, 42 insertions(+), 44 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en.h b/drivers/net/ethernet/mellanox/mlx5/core/en.h index b4abb094f01a..979fc56205e1 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h @@ -83,6 +83,7 @@ struct page_pool; #define MLX5E_SHAMPO_LOG_HEADER_ENTRY_SIZE (8) #define MLX5E_SHAMPO_LOG_MAX_HEADER_ENTRY_SIZE (9) #define MLX5E_SHAMPO_WQ_HEADER_PER_PAGE (PAGE_SIZE >> MLX5E_SHAMPO_LOG_MAX_HEADER_ENTRY_SIZE) +#define MLX5E_SHAMPO_LOG_WQ_HEADER_PER_PAGE (PAGE_SHIFT - MLX5E_SHAMPO_LOG_MAX_HEADER_ENTRY_SIZE) #define MLX5E_SHAMPO_WQ_BASE_HEAD_ENTRY_SIZE (64) #define MLX5E_SHAMPO_WQ_RESRV_SIZE (64 * 1024) #define MLX5E_SHAMPO_WQ_BASE_RESRV_SIZE (4096) @@ -624,9 +625,7 @@ struct mlx5e_dma_info { struct mlx5e_shampo_hd { u32 mkey; - struct mlx5e_dma_info *info; struct mlx5e_frag_page *pages; - u16 curr_page_index; u32 hd_per_wq; u16 hd_per_wqe; u16 pages_per_wq; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c index 3ca1ef1f39a5..2e27e9d6b820 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c @@ -350,19 +350,15 @@ static int mlx5e_rq_shampo_hd_info_alloc(struct mlx5e_rq *rq, int node) shampo->bitmap = bitmap_zalloc_node(shampo->hd_per_wq, GFP_KERNEL, node); - shampo->info = kvzalloc_node(array_size(shampo->hd_per_wq, - sizeof(*shampo->info)), - GFP_KERNEL, node); shampo->pages = kvzalloc_node(array_size(shampo->hd_per_wq, sizeof(*shampo->pages)), GFP_KERNEL, node); - if (!shampo->bitmap || !shampo->info || !shampo->pages) + if (!shampo->bitmap || !shampo->pages) goto err_nomem; return 0; err_nomem: - kvfree(shampo->info); kvfree(shampo->bitmap); kvfree(shampo->pages); @@ -372,7 +368,6 @@ err_nomem: static void mlx5e_rq_shampo_hd_info_free(struct mlx5e_rq *rq) { kvfree(rq->mpwqe.shampo->bitmap); - kvfree(rq->mpwqe.shampo->info); kvfree(rq->mpwqe.shampo->pages); } diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c index 637069c1b988..3de575875586 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c @@ -643,6 +643,21 @@ static void build_ksm_umr(struct mlx5e_icosq *sq, struct mlx5e_umr_wqe *umr_wqe, umr_wqe->uctrl.mkey_mask = cpu_to_be64(MLX5_MKEY_MASK_FREE); } +static struct mlx5e_frag_page *mlx5e_shampo_hd_to_frag_page(struct mlx5e_rq *rq, int header_index) +{ + BUILD_BUG_ON(MLX5E_SHAMPO_LOG_MAX_HEADER_ENTRY_SIZE > PAGE_SHIFT); + + return &rq->mpwqe.shampo->pages[header_index >> MLX5E_SHAMPO_LOG_WQ_HEADER_PER_PAGE]; +} + +static u64 mlx5e_shampo_hd_offset(int header_index) +{ + return (header_index & (MLX5E_SHAMPO_WQ_HEADER_PER_PAGE - 1)) << + MLX5E_SHAMPO_LOG_MAX_HEADER_ENTRY_SIZE; +} + +static void mlx5e_free_rx_shampo_hd_entry(struct mlx5e_rq *rq, u16 header_index); + static int mlx5e_build_shampo_hd_umr(struct mlx5e_rq *rq, struct mlx5e_icosq *sq, u16 ksm_entries, u16 index) @@ -650,9 +665,6 @@ static int mlx5e_build_shampo_hd_umr(struct mlx5e_rq *rq, struct mlx5e_shampo_hd *shampo = rq->mpwqe.shampo; u16 pi, header_offset, err, wqe_bbs; u32 lkey = rq->mdev->mlx5e_res.hw_objs.mkey; - u16 page_index = shampo->curr_page_index; - struct mlx5e_frag_page *frag_page = NULL; - struct mlx5e_dma_info *dma_info; struct mlx5e_umr_wqe *umr_wqe; int headroom, i; u64 addr = 0; @@ -665,29 +677,20 @@ static int mlx5e_build_shampo_hd_umr(struct mlx5e_rq *rq, WARN_ON_ONCE(ksm_entries & (MLX5E_SHAMPO_WQ_HEADER_PER_PAGE - 1)); for (i = 0; i < ksm_entries; i++, index++) { - dma_info = &shampo->info[index]; - header_offset = (index & (MLX5E_SHAMPO_WQ_HEADER_PER_PAGE - 1)) << - MLX5E_SHAMPO_LOG_MAX_HEADER_ENTRY_SIZE; - if (!(header_offset & (PAGE_SIZE - 1))) { - frag_page = &shampo->pages[page_index]; - page_index = (page_index + 1) & (shampo->pages_per_wq - 1); + header_offset = mlx5e_shampo_hd_offset(index); + if (!header_offset) { + struct mlx5e_frag_page *frag_page = mlx5e_shampo_hd_to_frag_page(rq, index); err = mlx5e_page_alloc_fragmented(rq, frag_page); if (unlikely(err)) goto err_unmap; addr = page_pool_get_dma_addr(frag_page->page); - - dma_info->addr = addr; - dma_info->frag_page = frag_page; - } else { - dma_info->addr = addr + header_offset; - dma_info->frag_page = frag_page; } umr_wqe->inline_ksms[i] = (struct mlx5_ksm) { .key = cpu_to_be32(lkey), - .va = cpu_to_be64(dma_info->addr + headroom), + .va = cpu_to_be64(addr + header_offset + headroom), }; } @@ -698,20 +701,22 @@ static int mlx5e_build_shampo_hd_umr(struct mlx5e_rq *rq, }; shampo->pi = (shampo->pi + ksm_entries) & (shampo->hd_per_wq - 1); - shampo->curr_page_index = page_index; sq->pc += wqe_bbs; sq->doorbell_cseg = &umr_wqe->ctrl; return 0; err_unmap: - while (--i >= 0) { - dma_info = &shampo->info[--index]; - if (!(i & (MLX5E_SHAMPO_WQ_HEADER_PER_PAGE - 1))) { - dma_info->addr = ALIGN_DOWN(dma_info->addr, PAGE_SIZE); - mlx5e_page_release_fragmented(rq, dma_info->frag_page); + while (--i) { + --index; + header_offset = mlx5e_shampo_hd_offset(index); + if (!header_offset) { + struct mlx5e_frag_page *frag_page = mlx5e_shampo_hd_to_frag_page(rq, index); + + mlx5e_page_release_fragmented(rq, frag_page); } } + rq->stats->buff_alloc_err++; return err; } @@ -844,13 +849,11 @@ static void mlx5e_free_rx_shampo_hd_entry(struct mlx5e_rq *rq, u16 header_index) { struct mlx5e_shampo_hd *shampo = rq->mpwqe.shampo; - u64 addr = shampo->info[header_index].addr; if (((header_index + 1) & (MLX5E_SHAMPO_WQ_HEADER_PER_PAGE - 1)) == 0) { - struct mlx5e_dma_info *dma_info = &shampo->info[header_index]; + struct mlx5e_frag_page *frag_page = mlx5e_shampo_hd_to_frag_page(rq, header_index); - dma_info->addr = ALIGN_DOWN(addr, PAGE_SIZE); - mlx5e_page_release_fragmented(rq, dma_info->frag_page); + mlx5e_page_release_fragmented(rq, frag_page); } clear_bit(header_index, shampo->bitmap); } @@ -1204,10 +1207,10 @@ static void mlx5e_lro_update_hdr(struct sk_buff *skb, struct mlx5_cqe64 *cqe, static void *mlx5e_shampo_get_packet_hd(struct mlx5e_rq *rq, u16 header_index) { - struct mlx5e_dma_info *last_head = &rq->mpwqe.shampo->info[header_index]; - u16 head_offset = (last_head->addr & (PAGE_SIZE - 1)) + rq->buff.headroom; + struct mlx5e_frag_page *frag_page = mlx5e_shampo_hd_to_frag_page(rq, header_index); + u16 head_offset = mlx5e_shampo_hd_offset(header_index) + rq->buff.headroom; - return page_address(last_head->frag_page->page) + head_offset; + return page_address(frag_page->page) + head_offset; } static void mlx5e_shampo_update_ipv4_udp_hdr(struct mlx5e_rq *rq, struct iphdr *ipv4) @@ -2178,29 +2181,30 @@ static struct sk_buff * mlx5e_skb_from_cqe_shampo(struct mlx5e_rq *rq, struct mlx5e_mpw_info *wi, struct mlx5_cqe64 *cqe, u16 header_index) { - struct mlx5e_dma_info *head = &rq->mpwqe.shampo->info[header_index]; - u16 head_offset = head->addr & (PAGE_SIZE - 1); + struct mlx5e_frag_page *frag_page = mlx5e_shampo_hd_to_frag_page(rq, header_index); + dma_addr_t page_dma_addr = page_pool_get_dma_addr(frag_page->page); + u16 head_offset = mlx5e_shampo_hd_offset(header_index); + dma_addr_t dma_addr = page_dma_addr + head_offset; u16 head_size = cqe->shampo.header_size; u16 rx_headroom = rq->buff.headroom; struct sk_buff *skb = NULL; void *hdr, *data; u32 frag_size; - hdr = page_address(head->frag_page->page) + head_offset; + hdr = page_address(frag_page->page) + head_offset; data = hdr + rx_headroom; frag_size = MLX5_SKB_FRAG_SZ(rx_headroom + head_size); if (likely(frag_size <= BIT(MLX5E_SHAMPO_LOG_MAX_HEADER_ENTRY_SIZE))) { /* build SKB around header */ - dma_sync_single_range_for_cpu(rq->pdev, head->addr, 0, frag_size, rq->buff.map_dir); + dma_sync_single_range_for_cpu(rq->pdev, dma_addr, 0, frag_size, rq->buff.map_dir); net_prefetchw(hdr); net_prefetch(data); skb = mlx5e_build_linear_skb(rq, hdr, frag_size, rx_headroom, head_size, 0); - if (unlikely(!skb)) return NULL; - head->frag_page->frags++; + frag_page->frags++; } else { /* allocate SKB and copy header for large header */ rq->stats->gro_large_hds++; @@ -2212,7 +2216,7 @@ mlx5e_skb_from_cqe_shampo(struct mlx5e_rq *rq, struct mlx5e_mpw_info *wi, } net_prefetchw(skb->data); - mlx5e_copy_skb_header(rq, skb, head->frag_page->page, head->addr, + mlx5e_copy_skb_header(rq, skb, frag_page->page, dma_addr, head_offset + rx_headroom, rx_headroom, head_size); /* skb linear part was allocated with headlen and aligned to long */ -- 2.50.1 From ab4219db89da1d019ef45675e4cd56a6841bbc1e Mon Sep 17 00:00:00 2001 From: Dragos Tatulea Date: Thu, 7 Nov 2024 21:43:57 +0200 Subject: [PATCH 02/16] net/mlx5e: SHAMPO, Rework header allocation loop The current loop code was based on the assumption that there can be page leftovers from previous function calls. This patch changes the allocation loop to make it clearer how pages get allocated every MLX5E_SHAMPO_WQ_HEADER_PER_PAGE headers. This change has no functional implications. Signed-off-by: Dragos Tatulea Signed-off-by: Tariq Toukan Link: https://patch.msgid.link/20241107194357.683732-13-tariqt@nvidia.com Signed-off-by: Jakub Kicinski --- .../net/ethernet/mellanox/mlx5/core/en_rx.c | 32 ++++++++++--------- 1 file changed, 17 insertions(+), 15 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c index 3de575875586..1963bc5adb18 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c @@ -666,8 +666,7 @@ static int mlx5e_build_shampo_hd_umr(struct mlx5e_rq *rq, u16 pi, header_offset, err, wqe_bbs; u32 lkey = rq->mdev->mlx5e_res.hw_objs.mkey; struct mlx5e_umr_wqe *umr_wqe; - int headroom, i; - u64 addr = 0; + int headroom, i = 0; headroom = rq->buff.headroom; wqe_bbs = MLX5E_KSM_UMR_WQEBBS(ksm_entries); @@ -676,22 +675,25 @@ static int mlx5e_build_shampo_hd_umr(struct mlx5e_rq *rq, build_ksm_umr(sq, umr_wqe, shampo->key, index, ksm_entries); WARN_ON_ONCE(ksm_entries & (MLX5E_SHAMPO_WQ_HEADER_PER_PAGE - 1)); - for (i = 0; i < ksm_entries; i++, index++) { - header_offset = mlx5e_shampo_hd_offset(index); - if (!header_offset) { - struct mlx5e_frag_page *frag_page = mlx5e_shampo_hd_to_frag_page(rq, index); + while (i < ksm_entries) { + struct mlx5e_frag_page *frag_page = mlx5e_shampo_hd_to_frag_page(rq, index); + u64 addr; + + err = mlx5e_page_alloc_fragmented(rq, frag_page); + if (unlikely(err)) + goto err_unmap; - err = mlx5e_page_alloc_fragmented(rq, frag_page); - if (unlikely(err)) - goto err_unmap; - addr = page_pool_get_dma_addr(frag_page->page); - } + addr = page_pool_get_dma_addr(frag_page->page); - umr_wqe->inline_ksms[i] = (struct mlx5_ksm) { - .key = cpu_to_be32(lkey), - .va = cpu_to_be64(addr + header_offset + headroom), - }; + for (int j = 0; j < MLX5E_SHAMPO_WQ_HEADER_PER_PAGE; j++) { + header_offset = mlx5e_shampo_hd_offset(index++); + + umr_wqe->inline_ksms[i++] = (struct mlx5_ksm) { + .key = cpu_to_be32(lkey), + .va = cpu_to_be64(addr + header_offset + headroom), + }; + } } sq->db.wqe_info[pi] = (struct mlx5e_icosq_wqe_info) { -- 2.50.1 From 37653a0b8a6f5d6ab23daa8e585c5ed24a0fc500 Mon Sep 17 00:00:00 2001 From: Menglong Dong Date: Thu, 7 Nov 2024 20:55:53 +0800 Subject: [PATCH 03/16] net: ip: make fib_validate_source() support drop reasons In this commit, we make fib_validate_source() and __fib_validate_source() return -reason instead of errno on error. The return value of fib_validate_source can be -errno, 0, and 1. It's hard to make fib_validate_source() return drop reasons directly. The fib_validate_source() will return 1 if the scope of the source(revert) route is HOST. And the __mkroute_input() will mark the skb with IPSKB_DOREDIRECT in this case (combine with some other conditions). And then, a REDIRECT ICMP will be sent in ip_forward() if this flag exists. We can't pass this information to __mkroute_input if we make fib_validate_source() return drop reasons. Therefore, we introduce the wrapper fib_validate_source_reason() for fib_validate_source(), which will return the drop reasons on error. In the origin logic, LINUX_MIB_IPRPFILTER will be counted if fib_validate_source() return -EXDEV. And now, we need to adjust it by checking "reason == SKB_DROP_REASON_IP_RPFILTER". However, this will take effect only after the patch "net: ip: make ip_route_input_noref() return drop reasons", as we can't pass the drop reasons from fib_validate_source() to ip_rcv_finish_core() in this patch. Following new drop reasons are added in this patch: SKB_DROP_REASON_IP_LOCAL_SOURCE SKB_DROP_REASON_IP_INVALID_SOURCE Signed-off-by: Menglong Dong Signed-off-by: Paolo Abeni --- include/net/dropreason-core.h | 10 ++++++++++ include/net/ip_fib.h | 12 ++++++++++++ net/ipv4/fib_frontend.c | 17 ++++++++++++----- net/ipv4/ip_input.c | 4 +--- net/ipv4/route.c | 33 +++++++++++++++++++-------------- 5 files changed, 54 insertions(+), 22 deletions(-) diff --git a/include/net/dropreason-core.h b/include/net/dropreason-core.h index d59bb96c5a02..62a60be1db84 100644 --- a/include/net/dropreason-core.h +++ b/include/net/dropreason-core.h @@ -76,6 +76,8 @@ FN(INVALID_PROTO) \ FN(IP_INADDRERRORS) \ FN(IP_INNOROUTES) \ + FN(IP_LOCAL_SOURCE) \ + FN(IP_INVALID_SOURCE) \ FN(PKT_TOO_BIG) \ FN(DUP_FRAG) \ FN(FRAG_REASM_TIMEOUT) \ @@ -373,6 +375,14 @@ enum skb_drop_reason { * IPSTATS_MIB_INADDRERRORS */ SKB_DROP_REASON_IP_INNOROUTES, + /** @SKB_DROP_REASON_IP_LOCAL_SOURCE: the source ip is local */ + SKB_DROP_REASON_IP_LOCAL_SOURCE, + /** + * @SKB_DROP_REASON_IP_INVALID_SOURCE: the source ip is invalid: + * 1) source ip is multicast or limited broadcast + * 2) source ip is zero and not IGMP + */ + SKB_DROP_REASON_IP_INVALID_SOURCE, /** * @SKB_DROP_REASON_PKT_TOO_BIG: packet size is too big (maybe exceed the * MTU) diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h index b6e44f4eaa4c..a113c11ab56b 100644 --- a/include/net/ip_fib.h +++ b/include/net/ip_fib.h @@ -452,6 +452,18 @@ int fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst, dscp_t dscp, int oif, struct net_device *dev, struct in_device *idev, u32 *itag); +static inline enum skb_drop_reason +fib_validate_source_reason(struct sk_buff *skb, __be32 src, __be32 dst, + dscp_t dscp, int oif, struct net_device *dev, + struct in_device *idev, u32 *itag) +{ + int err = fib_validate_source(skb, src, dst, dscp, oif, dev, idev, + itag); + if (err < 0) + return -err; + return SKB_NOT_DROPPED_YET; +} + #ifdef CONFIG_IP_ROUTE_CLASSID static inline int fib_num_tclassid_users(struct net *net) { diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index 0c9ce934b490..87bb36a5bdec 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c @@ -346,6 +346,7 @@ static int __fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst, int rpf, struct in_device *idev, u32 *itag) { struct net *net = dev_net(dev); + enum skb_drop_reason reason; struct flow_keys flkeys; int ret, no_addr; struct fib_result res; @@ -377,9 +378,15 @@ static int __fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst, if (fib_lookup(net, &fl4, &res, 0)) goto last_resort; - if (res.type != RTN_UNICAST && - (res.type != RTN_LOCAL || !IN_DEV_ACCEPT_LOCAL(idev))) - goto e_inval; + if (res.type != RTN_UNICAST) { + if (res.type != RTN_LOCAL) { + reason = SKB_DROP_REASON_IP_INVALID_SOURCE; + goto e_inval; + } else if (!IN_DEV_ACCEPT_LOCAL(idev)) { + reason = SKB_DROP_REASON_IP_LOCAL_SOURCE; + goto e_inval; + } + } fib_combine_itag(itag, &res); dev_match = fib_info_nh_uses_dev(res.fi, dev); @@ -412,9 +419,9 @@ last_resort: return 0; e_inval: - return -EINVAL; + return -reason; e_rpf: - return -EXDEV; + return -SKB_DROP_REASON_IP_RPFILTER; } /* Ignore rp_filter for packets protected by IPsec. */ diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c index 89bb63da6852..c40a26972884 100644 --- a/net/ipv4/ip_input.c +++ b/net/ipv4/ip_input.c @@ -425,10 +425,8 @@ drop: return NET_RX_DROP; drop_error: - if (err == -EXDEV) { - drop_reason = SKB_DROP_REASON_IP_RPFILTER; + if (drop_reason == SKB_DROP_REASON_IP_RPFILTER) __NET_INC_STATS(net, LINUX_MIB_IPRPFILTER); - } goto drop; } diff --git a/net/ipv4/route.c b/net/ipv4/route.c index ccdbe9c70132..c38e95d9da9e 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -1682,7 +1682,7 @@ int ip_mc_validate_source(struct sk_buff *skb, __be32 daddr, __be32 saddr, dscp_t dscp, struct net_device *dev, struct in_device *in_dev, u32 *itag) { - int err; + enum skb_drop_reason reason; /* Primary sanity checks. */ if (!in_dev) @@ -1700,10 +1700,10 @@ int ip_mc_validate_source(struct sk_buff *skb, __be32 daddr, __be32 saddr, ip_hdr(skb)->protocol != IPPROTO_IGMP) return -EINVAL; } else { - err = fib_validate_source(skb, saddr, 0, dscp, 0, dev, in_dev, - itag); - if (err < 0) - return err; + reason = fib_validate_source_reason(skb, saddr, 0, dscp, 0, + dev, in_dev, itag); + if (reason) + return -EINVAL; } return 0; } @@ -1801,6 +1801,7 @@ static int __mkroute_input(struct sk_buff *skb, const struct fib_result *res, err = fib_validate_source(skb, saddr, daddr, dscp, FIB_RES_OIF(*res), in_dev->dev, in_dev, &itag); if (err < 0) { + err = -EINVAL; ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr, saddr); @@ -2153,6 +2154,7 @@ int ip_route_use_hint(struct sk_buff *skb, __be32 daddr, __be32 saddr, struct in_device *in_dev = __in_dev_get_rcu(dev); struct rtable *rt = skb_rtable(hint); struct net *net = dev_net(dev); + enum skb_drop_reason reason; int err = -EINVAL; u32 tag = 0; @@ -2171,9 +2173,9 @@ int ip_route_use_hint(struct sk_buff *skb, __be32 daddr, __be32 saddr, if (rt->rt_type != RTN_LOCAL) goto skip_validate_source; - err = fib_validate_source(skb, saddr, daddr, dscp, 0, dev, in_dev, - &tag); - if (err < 0) + reason = fib_validate_source_reason(skb, saddr, daddr, dscp, 0, dev, + in_dev, &tag); + if (reason) goto martian_source; skip_validate_source: @@ -2215,6 +2217,7 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr, dscp_t dscp, struct net_device *dev, struct fib_result *res) { + enum skb_drop_reason reason = SKB_DROP_REASON_NOT_SPECIFIED; struct in_device *in_dev = __in_dev_get_rcu(dev); struct flow_keys *flkeys = NULL, _flkeys; struct net *net = dev_net(dev); @@ -2309,10 +2312,11 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr, goto brd_input; } + err = -EINVAL; if (res->type == RTN_LOCAL) { - err = fib_validate_source(skb, saddr, daddr, dscp, 0, dev, - in_dev, &itag); - if (err < 0) + reason = fib_validate_source_reason(skb, saddr, daddr, dscp, + 0, dev, in_dev, &itag); + if (reason) goto martian_source; goto local_input; } @@ -2333,9 +2337,10 @@ brd_input: goto e_inval; if (!ipv4_is_zeronet(saddr)) { - err = fib_validate_source(skb, saddr, 0, dscp, 0, dev, in_dev, - &itag); - if (err < 0) + err = -EINVAL; + reason = fib_validate_source_reason(skb, saddr, 0, dscp, 0, + dev, in_dev, &itag); + if (reason) goto martian_source; } flags |= RTCF_BROADCAST; -- 2.50.1 From c6c670784b862878deba7e16210ca4b2a2966ca0 Mon Sep 17 00:00:00 2001 From: Menglong Dong Date: Thu, 7 Nov 2024 20:55:54 +0800 Subject: [PATCH 04/16] net: ip: make ip_route_input_mc() return drop reason Make ip_route_input_mc() return drop reason, and adjust the call of it in ip_route_input_rcu(). Signed-off-by: Menglong Dong Signed-off-by: Paolo Abeni --- net/ipv4/route.c | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/net/ipv4/route.c b/net/ipv4/route.c index c38e95d9da9e..b20316789baa 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -1709,8 +1709,9 @@ int ip_mc_validate_source(struct sk_buff *skb, __be32 daddr, __be32 saddr, } /* called in rcu_read_lock() section */ -static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr, - dscp_t dscp, struct net_device *dev, int our) +static enum skb_drop_reason +ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr, + dscp_t dscp, struct net_device *dev, int our) { struct in_device *in_dev = __in_dev_get_rcu(dev); unsigned int flags = RTCF_MULTICAST; @@ -1721,7 +1722,7 @@ static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr, err = ip_mc_validate_source(skb, daddr, saddr, dscp, dev, in_dev, &itag); if (err) - return err; + return SKB_DROP_REASON_NOT_SPECIFIED; if (our) flags |= RTCF_LOCAL; @@ -1732,7 +1733,7 @@ static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr, rth = rt_dst_alloc(dev_net(dev)->loopback_dev, flags, RTN_MULTICAST, false); if (!rth) - return -ENOBUFS; + return SKB_DROP_REASON_NOMEM; #ifdef CONFIG_IP_ROUTE_CLASSID rth->dst.tclassid = itag; @@ -1748,7 +1749,7 @@ static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr, skb_dst_drop(skb); skb_dst_set(skb, &rth->dst); - return 0; + return SKB_NOT_DROPPED_YET; } @@ -2446,12 +2447,12 @@ static int ip_route_input_rcu(struct sk_buff *skb, __be32 daddr, __be32 saddr, * route cache entry is created eventually. */ if (ipv4_is_multicast(daddr)) { + enum skb_drop_reason reason = SKB_DROP_REASON_NOT_SPECIFIED; struct in_device *in_dev = __in_dev_get_rcu(dev); int our = 0; - int err = -EINVAL; if (!in_dev) - return err; + return -EINVAL; our = ip_check_mc_rcu(in_dev, daddr, saddr, ip_hdr(skb)->protocol); @@ -2472,10 +2473,10 @@ static int ip_route_input_rcu(struct sk_buff *skb, __be32 daddr, __be32 saddr, IN_DEV_MFORWARD(in_dev)) #endif ) { - err = ip_route_input_mc(skb, daddr, saddr, dscp, dev, - our); + reason = ip_route_input_mc(skb, daddr, saddr, dscp, + dev, our); } - return err; + return reason ? -EINVAL : 0; } return ip_route_input_slow(skb, daddr, saddr, dscp, dev, res); -- 2.50.1 From d46f827016d891dbc234cb05c406180f77fb3b2d Mon Sep 17 00:00:00 2001 From: Menglong Dong Date: Thu, 7 Nov 2024 20:55:55 +0800 Subject: [PATCH 05/16] net: ip: make ip_mc_validate_source() return drop reason Make ip_mc_validate_source() return drop reason, and adjust the call of it in ip_route_input_mc(). Another caller of it is ip_rcv_finish_core->udp_v4_early_demux, and the errno is not checked in detail, so we don't do more adjustment for it. The drop reason "SKB_DROP_REASON_IP_LOCALNET" is added in this commit. Signed-off-by: Menglong Dong Signed-off-by: Paolo Abeni --- include/net/dropreason-core.h | 3 +++ include/net/route.h | 7 ++++--- net/ipv4/route.c | 35 +++++++++++++++++++---------------- 3 files changed, 26 insertions(+), 19 deletions(-) diff --git a/include/net/dropreason-core.h b/include/net/dropreason-core.h index 62a60be1db84..a2a1fb90e0e5 100644 --- a/include/net/dropreason-core.h +++ b/include/net/dropreason-core.h @@ -78,6 +78,7 @@ FN(IP_INNOROUTES) \ FN(IP_LOCAL_SOURCE) \ FN(IP_INVALID_SOURCE) \ + FN(IP_LOCALNET) \ FN(PKT_TOO_BIG) \ FN(DUP_FRAG) \ FN(FRAG_REASM_TIMEOUT) \ @@ -383,6 +384,8 @@ enum skb_drop_reason { * 2) source ip is zero and not IGMP */ SKB_DROP_REASON_IP_INVALID_SOURCE, + /** @SKB_DROP_REASON_IP_LOCALNET: source or dest ip is local net */ + SKB_DROP_REASON_IP_LOCALNET, /** * @SKB_DROP_REASON_PKT_TOO_BIG: packet size is too big (maybe exceed the * MTU) diff --git a/include/net/route.h b/include/net/route.h index 0a690adfdff5..e2e1922c58fb 100644 --- a/include/net/route.h +++ b/include/net/route.h @@ -199,9 +199,10 @@ static inline struct rtable *ip_route_output_gre(struct net *net, struct flowi4 return ip_route_output_key(net, fl4); } -int ip_mc_validate_source(struct sk_buff *skb, __be32 daddr, __be32 saddr, - dscp_t dscp, struct net_device *dev, - struct in_device *in_dev, u32 *itag); +enum skb_drop_reason +ip_mc_validate_source(struct sk_buff *skb, __be32 daddr, __be32 saddr, + dscp_t dscp, struct net_device *dev, + struct in_device *in_dev, u32 *itag); int ip_route_input_noref(struct sk_buff *skb, __be32 daddr, __be32 saddr, dscp_t dscp, struct net_device *dev); int ip_route_use_hint(struct sk_buff *skb, __be32 daddr, __be32 saddr, diff --git a/net/ipv4/route.c b/net/ipv4/route.c index b20316789baa..ef0b5ffda843 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -1678,34 +1678,37 @@ struct rtable *rt_dst_clone(struct net_device *dev, struct rtable *rt) EXPORT_SYMBOL(rt_dst_clone); /* called in rcu_read_lock() section */ -int ip_mc_validate_source(struct sk_buff *skb, __be32 daddr, __be32 saddr, - dscp_t dscp, struct net_device *dev, - struct in_device *in_dev, u32 *itag) +enum skb_drop_reason +ip_mc_validate_source(struct sk_buff *skb, __be32 daddr, __be32 saddr, + dscp_t dscp, struct net_device *dev, + struct in_device *in_dev, u32 *itag) { enum skb_drop_reason reason; /* Primary sanity checks. */ if (!in_dev) - return -EINVAL; + return SKB_DROP_REASON_NOT_SPECIFIED; - if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) || - skb->protocol != htons(ETH_P_IP)) - return -EINVAL; + if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr)) + return SKB_DROP_REASON_IP_INVALID_SOURCE; + + if (skb->protocol != htons(ETH_P_IP)) + return SKB_DROP_REASON_INVALID_PROTO; if (ipv4_is_loopback(saddr) && !IN_DEV_ROUTE_LOCALNET(in_dev)) - return -EINVAL; + return SKB_DROP_REASON_IP_LOCALNET; if (ipv4_is_zeronet(saddr)) { if (!ipv4_is_local_multicast(daddr) && ip_hdr(skb)->protocol != IPPROTO_IGMP) - return -EINVAL; + return SKB_DROP_REASON_IP_INVALID_SOURCE; } else { reason = fib_validate_source_reason(skb, saddr, 0, dscp, 0, dev, in_dev, itag); if (reason) - return -EINVAL; + return reason; } - return 0; + return SKB_NOT_DROPPED_YET; } /* called in rcu_read_lock() section */ @@ -1715,14 +1718,14 @@ ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr, { struct in_device *in_dev = __in_dev_get_rcu(dev); unsigned int flags = RTCF_MULTICAST; + enum skb_drop_reason reason; struct rtable *rth; u32 itag = 0; - int err; - err = ip_mc_validate_source(skb, daddr, saddr, dscp, dev, in_dev, - &itag); - if (err) - return SKB_DROP_REASON_NOT_SPECIFIED; + reason = ip_mc_validate_source(skb, daddr, saddr, dscp, dev, in_dev, + &itag); + if (reason) + return reason; if (our) flags |= RTCF_LOCAL; -- 2.50.1 From 5b92112acd8e2ed84a4df653fc20575f4da6fa49 Mon Sep 17 00:00:00 2001 From: Menglong Dong Date: Thu, 7 Nov 2024 20:55:56 +0800 Subject: [PATCH 06/16] net: ip: make ip_route_input_slow() return drop reasons In this commit, we make ip_route_input_slow() return skb drop reasons, and following new skb drop reasons are added: SKB_DROP_REASON_IP_INVALID_DEST The only caller of ip_route_input_slow() is ip_route_input_rcu(), and we adjust it by making it return -EINVAL on error. Signed-off-by: Menglong Dong Signed-off-by: Paolo Abeni --- include/net/dropreason-core.h | 6 ++++ net/ipv4/route.c | 56 ++++++++++++++++++++++------------- 2 files changed, 41 insertions(+), 21 deletions(-) diff --git a/include/net/dropreason-core.h b/include/net/dropreason-core.h index a2a1fb90e0e5..74624d369d48 100644 --- a/include/net/dropreason-core.h +++ b/include/net/dropreason-core.h @@ -79,6 +79,7 @@ FN(IP_LOCAL_SOURCE) \ FN(IP_INVALID_SOURCE) \ FN(IP_LOCALNET) \ + FN(IP_INVALID_DEST) \ FN(PKT_TOO_BIG) \ FN(DUP_FRAG) \ FN(FRAG_REASM_TIMEOUT) \ @@ -386,6 +387,11 @@ enum skb_drop_reason { SKB_DROP_REASON_IP_INVALID_SOURCE, /** @SKB_DROP_REASON_IP_LOCALNET: source or dest ip is local net */ SKB_DROP_REASON_IP_LOCALNET, + /** + * @SKB_DROP_REASON_IP_INVALID_DEST: the dest ip is invalid: + * 1) dest ip is 0 + */ + SKB_DROP_REASON_IP_INVALID_DEST, /** * @SKB_DROP_REASON_PKT_TOO_BIG: packet size is too big (maybe exceed the * MTU) diff --git a/net/ipv4/route.c b/net/ipv4/route.c index ef0b5ffda843..b73f0355cc38 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -2217,9 +2217,10 @@ static struct net_device *ip_rt_get_dev(struct net *net, * called with rcu_read_lock() */ -static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr, - dscp_t dscp, struct net_device *dev, - struct fib_result *res) +static enum skb_drop_reason +ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr, + dscp_t dscp, struct net_device *dev, + struct fib_result *res) { enum skb_drop_reason reason = SKB_DROP_REASON_NOT_SPECIFIED; struct in_device *in_dev = __in_dev_get_rcu(dev); @@ -2249,8 +2250,10 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr, fl4.flowi4_tun_key.tun_id = 0; skb_dst_drop(skb); - if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr)) + if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr)) { + reason = SKB_DROP_REASON_IP_INVALID_SOURCE; goto martian_source; + } res->fi = NULL; res->table = NULL; @@ -2260,21 +2263,29 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr, /* Accept zero addresses only to limited broadcast; * I even do not know to fix it or not. Waiting for complains :-) */ - if (ipv4_is_zeronet(saddr)) + if (ipv4_is_zeronet(saddr)) { + reason = SKB_DROP_REASON_IP_INVALID_SOURCE; goto martian_source; + } - if (ipv4_is_zeronet(daddr)) + if (ipv4_is_zeronet(daddr)) { + reason = SKB_DROP_REASON_IP_INVALID_DEST; goto martian_destination; + } /* Following code try to avoid calling IN_DEV_NET_ROUTE_LOCALNET(), * and call it once if daddr or/and saddr are loopback addresses */ if (ipv4_is_loopback(daddr)) { - if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net)) + if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net)) { + reason = SKB_DROP_REASON_IP_LOCALNET; goto martian_destination; + } } else if (ipv4_is_loopback(saddr)) { - if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net)) + if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net)) { + reason = SKB_DROP_REASON_IP_LOCALNET; goto martian_source; + } } /* @@ -2329,19 +2340,26 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr, err = -EHOSTUNREACH; goto no_route; } - if (res->type != RTN_UNICAST) + if (res->type != RTN_UNICAST) { + reason = SKB_DROP_REASON_IP_INVALID_DEST; goto martian_destination; + } make_route: err = ip_mkroute_input(skb, res, in_dev, daddr, saddr, dscp, flkeys); -out: return err; + if (!err) + reason = SKB_NOT_DROPPED_YET; + +out: + return reason; brd_input: - if (skb->protocol != htons(ETH_P_IP)) - goto e_inval; + if (skb->protocol != htons(ETH_P_IP)) { + reason = SKB_DROP_REASON_INVALID_PROTO; + goto out; + } if (!ipv4_is_zeronet(saddr)) { - err = -EINVAL; reason = fib_validate_source_reason(skb, saddr, 0, dscp, 0, dev, in_dev, &itag); if (reason) @@ -2362,7 +2380,7 @@ local_input: rth = rcu_dereference(nhc->nhc_rth_input); if (rt_cache_valid(rth)) { skb_dst_set_noref(skb, &rth->dst); - err = 0; + reason = SKB_NOT_DROPPED_YET; goto out; } } @@ -2399,7 +2417,7 @@ local_input: rt_add_uncached_list(rth); } skb_dst_set(skb, &rth->dst); - err = 0; + reason = SKB_NOT_DROPPED_YET; goto out; no_route: @@ -2420,12 +2438,8 @@ martian_destination: &daddr, &saddr, dev->name); #endif -e_inval: - err = -EINVAL; - goto out; - e_nobufs: - err = -ENOBUFS; + reason = SKB_DROP_REASON_NOMEM; goto out; martian_source: @@ -2482,7 +2496,7 @@ static int ip_route_input_rcu(struct sk_buff *skb, __be32 daddr, __be32 saddr, return reason ? -EINVAL : 0; } - return ip_route_input_slow(skb, daddr, saddr, dscp, dev, res); + return ip_route_input_slow(skb, daddr, saddr, dscp, dev, res) ? -EINVAL : 0; } int ip_route_input_noref(struct sk_buff *skb, __be32 daddr, __be32 saddr, -- 2.50.1 From 61b95c70f3449c1c0bd1415c8ef37e2959cf1c41 Mon Sep 17 00:00:00 2001 From: Menglong Dong Date: Thu, 7 Nov 2024 20:55:57 +0800 Subject: [PATCH 07/16] net: ip: make ip_route_input_rcu() return drop reasons In this commit, we make ip_route_input_rcu() return drop reasons, which come from ip_route_input_mc() and ip_route_input_slow(). The only caller of ip_route_input_rcu() is ip_route_input_noref(). We adjust it by making it return -EINVAL on error and ignore the reasons that ip_route_input_rcu() returns. In the following patch, we will make ip_route_input_noref() returns the drop reasons. Signed-off-by: Menglong Dong Signed-off-by: Paolo Abeni --- net/ipv4/route.c | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/net/ipv4/route.c b/net/ipv4/route.c index b73f0355cc38..270bc8c96619 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -2448,9 +2448,10 @@ martian_source: } /* called with rcu_read_lock held */ -static int ip_route_input_rcu(struct sk_buff *skb, __be32 daddr, __be32 saddr, - dscp_t dscp, struct net_device *dev, - struct fib_result *res) +static enum skb_drop_reason +ip_route_input_rcu(struct sk_buff *skb, __be32 daddr, __be32 saddr, + dscp_t dscp, struct net_device *dev, + struct fib_result *res) { /* Multicast recognition logic is moved from route cache to here. * The problem was that too many Ethernet cards have broken/missing @@ -2493,23 +2494,23 @@ static int ip_route_input_rcu(struct sk_buff *skb, __be32 daddr, __be32 saddr, reason = ip_route_input_mc(skb, daddr, saddr, dscp, dev, our); } - return reason ? -EINVAL : 0; + return reason; } - return ip_route_input_slow(skb, daddr, saddr, dscp, dev, res) ? -EINVAL : 0; + return ip_route_input_slow(skb, daddr, saddr, dscp, dev, res); } int ip_route_input_noref(struct sk_buff *skb, __be32 daddr, __be32 saddr, dscp_t dscp, struct net_device *dev) { + enum skb_drop_reason reason; struct fib_result res; - int err; rcu_read_lock(); - err = ip_route_input_rcu(skb, daddr, saddr, dscp, dev, &res); + reason = ip_route_input_rcu(skb, daddr, saddr, dscp, dev, &res); rcu_read_unlock(); - return err; + return reason ? -EINVAL : 0; } EXPORT_SYMBOL(ip_route_input_noref); @@ -3321,7 +3322,7 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, skb->mark = mark; err = ip_route_input_rcu(skb, dst, src, inet_dsfield_to_dscp(rtm->rtm_tos), - dev, &res); + dev, &res) ? -EINVAL : 0; rt = skb_rtable(skb); if (err == 0 && rt->dst.error) -- 2.50.1 From 82d9983ebeb871cb5abd27c12a950c14c68772e1 Mon Sep 17 00:00:00 2001 From: Menglong Dong Date: Thu, 7 Nov 2024 20:55:58 +0800 Subject: [PATCH 08/16] net: ip: make ip_route_input_noref() return drop reasons In this commit, we make ip_route_input_noref() return drop reasons, which come from ip_route_input_rcu(). We need adjust the callers of ip_route_input_noref() to make sure the return value of ip_route_input_noref() is used properly. The errno that ip_route_input_noref() returns comes from ip_route_input and bpf_lwt_input_reroute in the origin logic, and we make them return -EINVAL on error instead. In the following patch, we will make ip_route_input() returns drop reasons too. Signed-off-by: Menglong Dong Signed-off-by: Paolo Abeni --- include/net/route.h | 15 ++++++++------- net/core/lwt_bpf.c | 6 ++++-- net/ipv4/ip_fragment.c | 11 ++++++----- net/ipv4/ip_input.c | 7 ++++--- net/ipv4/route.c | 7 ++++--- 5 files changed, 26 insertions(+), 20 deletions(-) diff --git a/include/net/route.h b/include/net/route.h index e2e1922c58fb..b85ffa3e042b 100644 --- a/include/net/route.h +++ b/include/net/route.h @@ -203,8 +203,9 @@ enum skb_drop_reason ip_mc_validate_source(struct sk_buff *skb, __be32 daddr, __be32 saddr, dscp_t dscp, struct net_device *dev, struct in_device *in_dev, u32 *itag); -int ip_route_input_noref(struct sk_buff *skb, __be32 daddr, __be32 saddr, - dscp_t dscp, struct net_device *dev); +enum skb_drop_reason +ip_route_input_noref(struct sk_buff *skb, __be32 daddr, __be32 saddr, + dscp_t dscp, struct net_device *dev); int ip_route_use_hint(struct sk_buff *skb, __be32 daddr, __be32 saddr, dscp_t dscp, struct net_device *dev, const struct sk_buff *hint); @@ -212,18 +213,18 @@ int ip_route_use_hint(struct sk_buff *skb, __be32 daddr, __be32 saddr, static inline int ip_route_input(struct sk_buff *skb, __be32 dst, __be32 src, dscp_t dscp, struct net_device *devin) { - int err; + enum skb_drop_reason reason; rcu_read_lock(); - err = ip_route_input_noref(skb, dst, src, dscp, devin); - if (!err) { + reason = ip_route_input_noref(skb, dst, src, dscp, devin); + if (!reason) { skb_dst_force(skb); if (!skb_dst(skb)) - err = -EINVAL; + reason = SKB_DROP_REASON_NOT_SPECIFIED; } rcu_read_unlock(); - return err; + return reason ? -EINVAL : 0; } void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu, int oif, diff --git a/net/core/lwt_bpf.c b/net/core/lwt_bpf.c index e0ca24a58810..8a78bff53b2c 100644 --- a/net/core/lwt_bpf.c +++ b/net/core/lwt_bpf.c @@ -88,6 +88,7 @@ static int run_lwt_bpf(struct sk_buff *skb, struct bpf_lwt_prog *lwt, static int bpf_lwt_input_reroute(struct sk_buff *skb) { + enum skb_drop_reason reason; int err = -EINVAL; if (skb->protocol == htons(ETH_P_IP)) { @@ -96,8 +97,9 @@ static int bpf_lwt_input_reroute(struct sk_buff *skb) dev_hold(dev); skb_dst_drop(skb); - err = ip_route_input_noref(skb, iph->daddr, iph->saddr, - ip4h_dscp(iph), dev); + reason = ip_route_input_noref(skb, iph->daddr, iph->saddr, + ip4h_dscp(iph), dev); + err = reason ? -EINVAL : 0; dev_put(dev); } else if (skb->protocol == htons(ETH_P_IPV6)) { skb_dst_drop(skb); diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c index 48e2810f1f27..07036a2943c1 100644 --- a/net/ipv4/ip_fragment.c +++ b/net/ipv4/ip_fragment.c @@ -132,12 +132,12 @@ static bool frag_expire_skip_icmp(u32 user) */ static void ip_expire(struct timer_list *t) { + enum skb_drop_reason reason = SKB_DROP_REASON_FRAG_REASM_TIMEOUT; struct inet_frag_queue *frag = from_timer(frag, t, timer); const struct iphdr *iph; struct sk_buff *head = NULL; struct net *net; struct ipq *qp; - int err; qp = container_of(frag, struct ipq, q); net = qp->q.fqdir->net; @@ -175,14 +175,15 @@ static void ip_expire(struct timer_list *t) /* skb has no dst, perform route lookup again */ iph = ip_hdr(head); - err = ip_route_input_noref(head, iph->daddr, iph->saddr, ip4h_dscp(iph), - head->dev); - if (err) + reason = ip_route_input_noref(head, iph->daddr, iph->saddr, + ip4h_dscp(iph), head->dev); + if (reason) goto out; /* Only an end host needs to send an ICMP * "Fragment Reassembly Timeout" message, per RFC792. */ + reason = SKB_DROP_REASON_FRAG_REASM_TIMEOUT; if (frag_expire_skip_icmp(qp->q.key.v4.user) && (skb_rtable(head)->rt_type != RTN_LOCAL)) goto out; @@ -195,7 +196,7 @@ out: spin_unlock(&qp->q.lock); out_rcu_unlock: rcu_read_unlock(); - kfree_skb_reason(head, SKB_DROP_REASON_FRAG_REASM_TIMEOUT); + kfree_skb_reason(head, reason); ipq_put(qp); } diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c index c40a26972884..513eb0c6435a 100644 --- a/net/ipv4/ip_input.c +++ b/net/ipv4/ip_input.c @@ -362,10 +362,11 @@ static int ip_rcv_finish_core(struct net *net, struct sock *sk, * how the packet travels inside Linux networking. */ if (!skb_valid_dst(skb)) { - err = ip_route_input_noref(skb, iph->daddr, iph->saddr, - ip4h_dscp(iph), dev); - if (unlikely(err)) + drop_reason = ip_route_input_noref(skb, iph->daddr, iph->saddr, + ip4h_dscp(iph), dev); + if (unlikely(drop_reason)) goto drop_error; + drop_reason = SKB_DROP_REASON_NOT_SPECIFIED; } else { struct in_device *in_dev = __in_dev_get_rcu(dev); diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 270bc8c96619..5a7edb66174a 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -2500,8 +2500,9 @@ ip_route_input_rcu(struct sk_buff *skb, __be32 daddr, __be32 saddr, return ip_route_input_slow(skb, daddr, saddr, dscp, dev, res); } -int ip_route_input_noref(struct sk_buff *skb, __be32 daddr, __be32 saddr, - dscp_t dscp, struct net_device *dev) +enum skb_drop_reason ip_route_input_noref(struct sk_buff *skb, __be32 daddr, + __be32 saddr, dscp_t dscp, + struct net_device *dev) { enum skb_drop_reason reason; struct fib_result res; @@ -2510,7 +2511,7 @@ int ip_route_input_noref(struct sk_buff *skb, __be32 daddr, __be32 saddr, reason = ip_route_input_rcu(skb, daddr, saddr, dscp, dev, &res); rcu_read_unlock(); - return reason ? -EINVAL : 0; + return reason; } EXPORT_SYMBOL(ip_route_input_noref); -- 2.50.1 From 50038bf38e6577a15d52b890d82c197cf3b163a0 Mon Sep 17 00:00:00 2001 From: Menglong Dong Date: Thu, 7 Nov 2024 20:55:59 +0800 Subject: [PATCH 09/16] net: ip: make ip_route_input() return drop reasons In this commit, we make ip_route_input() return skb drop reasons that come from ip_route_input_noref(). Meanwhile, adjust all the call to it. Signed-off-by: Menglong Dong Signed-off-by: Paolo Abeni --- include/net/route.h | 7 ++++--- net/bridge/br_netfilter_hooks.c | 11 ++++++----- net/ipv4/icmp.c | 2 +- net/ipv4/ip_options.c | 2 +- net/ipv6/seg6_local.c | 14 +++++++------- 5 files changed, 19 insertions(+), 17 deletions(-) diff --git a/include/net/route.h b/include/net/route.h index b85ffa3e042b..fb3433dc9c72 100644 --- a/include/net/route.h +++ b/include/net/route.h @@ -210,8 +210,9 @@ int ip_route_use_hint(struct sk_buff *skb, __be32 daddr, __be32 saddr, dscp_t dscp, struct net_device *dev, const struct sk_buff *hint); -static inline int ip_route_input(struct sk_buff *skb, __be32 dst, __be32 src, - dscp_t dscp, struct net_device *devin) +static inline enum skb_drop_reason +ip_route_input(struct sk_buff *skb, __be32 dst, __be32 src, dscp_t dscp, + struct net_device *devin) { enum skb_drop_reason reason; @@ -224,7 +225,7 @@ static inline int ip_route_input(struct sk_buff *skb, __be32 dst, __be32 src, } rcu_read_unlock(); - return reason ? -EINVAL : 0; + return reason; } void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu, int oif, diff --git a/net/bridge/br_netfilter_hooks.c b/net/bridge/br_netfilter_hooks.c index 7f2f40cef5fe..451e45b9a6a5 100644 --- a/net/bridge/br_netfilter_hooks.c +++ b/net/bridge/br_netfilter_hooks.c @@ -373,8 +373,8 @@ static int br_nf_pre_routing_finish(struct net *net, struct sock *sk, struct sk_ struct nf_bridge_info *nf_bridge = nf_bridge_info_get(skb); struct net_device *dev = skb->dev, *br_indev; const struct iphdr *iph = ip_hdr(skb); + enum skb_drop_reason reason; struct rtable *rt; - int err; br_indev = nf_bridge_get_physindev(skb, net); if (!br_indev) { @@ -390,9 +390,9 @@ static int br_nf_pre_routing_finish(struct net *net, struct sock *sk, struct sk_ } nf_bridge->in_prerouting = 0; if (br_nf_ipv4_daddr_was_changed(skb, nf_bridge)) { - err = ip_route_input(skb, iph->daddr, iph->saddr, - ip4h_dscp(iph), dev); - if (err) { + reason = ip_route_input(skb, iph->daddr, iph->saddr, + ip4h_dscp(iph), dev); + if (reason) { struct in_device *in_dev = __in_dev_get_rcu(dev); /* If err equals -EHOSTUNREACH the error is due to a @@ -402,7 +402,8 @@ static int br_nf_pre_routing_finish(struct net *net, struct sock *sk, struct sk_ * martian destinations: loopback destinations and destination * 0.0.0.0. In both cases the packet will be dropped because the * destination is the loopback device and not the bridge. */ - if (err != -EHOSTUNREACH || !in_dev || IN_DEV_FORWARD(in_dev)) + if (reason != SKB_DROP_REASON_IP_INADDRERRORS || !in_dev || + IN_DEV_FORWARD(in_dev)) goto free_skb; rt = ip_route_output(net, iph->daddr, 0, diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c index 33eec844a5a0..4f088fa1c2f2 100644 --- a/net/ipv4/icmp.c +++ b/net/ipv4/icmp.c @@ -545,7 +545,7 @@ static struct rtable *icmp_route_lookup(struct net *net, struct flowi4 *fl4, orefdst = skb_in->_skb_refdst; /* save old refdst */ skb_dst_set(skb_in, NULL); err = ip_route_input(skb_in, fl4_dec.daddr, fl4_dec.saddr, - dscp, rt2->dst.dev); + dscp, rt2->dst.dev) ? -EINVAL : 0; dst_release(&rt2->dst); rt2 = skb_rtable(skb_in); diff --git a/net/ipv4/ip_options.c b/net/ipv4/ip_options.c index 81e86e5defee..e3321932bec0 100644 --- a/net/ipv4/ip_options.c +++ b/net/ipv4/ip_options.c @@ -618,7 +618,7 @@ int ip_options_rcv_srr(struct sk_buff *skb, struct net_device *dev) orefdst = skb->_skb_refdst; skb_dst_set(skb, NULL); err = ip_route_input(skb, nexthop, iph->saddr, ip4h_dscp(iph), - dev); + dev) ? -EINVAL : 0; rt2 = skb_rtable(skb); if (err || (rt2->rt_type != RTN_UNICAST && rt2->rt_type != RTN_LOCAL)) { skb_dst_drop(skb); diff --git a/net/ipv6/seg6_local.c b/net/ipv6/seg6_local.c index c74705ead984..ac1dbd492c22 100644 --- a/net/ipv6/seg6_local.c +++ b/net/ipv6/seg6_local.c @@ -954,10 +954,10 @@ static int input_action_end_dx4_finish(struct net *net, struct sock *sk, struct sk_buff *skb) { struct dst_entry *orig_dst = skb_dst(skb); + enum skb_drop_reason reason; struct seg6_local_lwt *slwt; struct iphdr *iph; __be32 nhaddr; - int err; slwt = seg6_local_lwtunnel(orig_dst->lwtstate); @@ -967,9 +967,9 @@ static int input_action_end_dx4_finish(struct net *net, struct sock *sk, skb_dst_drop(skb); - err = ip_route_input(skb, nhaddr, iph->saddr, 0, skb->dev); - if (err) { - kfree_skb(skb); + reason = ip_route_input(skb, nhaddr, iph->saddr, 0, skb->dev); + if (reason) { + kfree_skb_reason(skb, reason); return -EINVAL; } @@ -1174,8 +1174,8 @@ drop: static int input_action_end_dt4(struct sk_buff *skb, struct seg6_local_lwt *slwt) { + enum skb_drop_reason reason; struct iphdr *iph; - int err; if (!decap_and_validate(skb, IPPROTO_IPIP)) goto drop; @@ -1193,8 +1193,8 @@ static int input_action_end_dt4(struct sk_buff *skb, iph = ip_hdr(skb); - err = ip_route_input(skb, iph->daddr, iph->saddr, 0, skb->dev); - if (unlikely(err)) + reason = ip_route_input(skb, iph->daddr, iph->saddr, 0, skb->dev); + if (unlikely(reason)) goto drop; return dst_input(skb); -- 2.50.1 From d9340d1e02779dbf83d53b0deb4068c7768b8261 Mon Sep 17 00:00:00 2001 From: Menglong Dong Date: Thu, 7 Nov 2024 20:56:00 +0800 Subject: [PATCH 10/16] net: ip: make ip_mkroute_input/__mkroute_input return drop reasons In this commit, we make ip_mkroute_input() and __mkroute_input() return drop reasons. The drop reason "SKB_DROP_REASON_ARP_PVLAN_DISABLE" is introduced for the case: the packet which is not IP is forwarded to the in_dev, and the proxy_arp_pvlan is not enabled. Signed-off-by: Menglong Dong Signed-off-by: Paolo Abeni --- include/net/dropreason-core.h | 7 +++++++ net/ipv4/route.c | 34 ++++++++++++++++++---------------- 2 files changed, 25 insertions(+), 16 deletions(-) diff --git a/include/net/dropreason-core.h b/include/net/dropreason-core.h index 74624d369d48..6c5a1ea209a2 100644 --- a/include/net/dropreason-core.h +++ b/include/net/dropreason-core.h @@ -104,6 +104,7 @@ FN(IP_TUNNEL_ECN) \ FN(TUNNEL_TXINFO) \ FN(LOCAL_MAC) \ + FN(ARP_PVLAN_DISABLE) \ FNe(MAX) /** @@ -477,6 +478,12 @@ enum skb_drop_reason { * the MAC address of the local netdev. */ SKB_DROP_REASON_LOCAL_MAC, + /** + * @SKB_DROP_REASON_ARP_PVLAN_DISABLE: packet which is not IP is + * forwarded to the in_dev, and the proxy_arp_pvlan is not + * enabled. + */ + SKB_DROP_REASON_ARP_PVLAN_DISABLE, /** * @SKB_DROP_REASON_MAX: the maximum of core drop reasons, which * shouldn't be used as a real 'reason' - only for tracing code gen diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 5a7edb66174a..2697a6c88416 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -1782,10 +1782,12 @@ static void ip_handle_martian_source(struct net_device *dev, } /* called in rcu_read_lock() section */ -static int __mkroute_input(struct sk_buff *skb, const struct fib_result *res, - struct in_device *in_dev, __be32 daddr, - __be32 saddr, dscp_t dscp) +static enum skb_drop_reason +__mkroute_input(struct sk_buff *skb, const struct fib_result *res, + struct in_device *in_dev, __be32 daddr, + __be32 saddr, dscp_t dscp) { + enum skb_drop_reason reason = SKB_DROP_REASON_NOT_SPECIFIED; struct fib_nh_common *nhc = FIB_RES_NHC(*res); struct net_device *dev = nhc->nhc_dev; struct fib_nh_exception *fnhe; @@ -1799,13 +1801,13 @@ static int __mkroute_input(struct sk_buff *skb, const struct fib_result *res, out_dev = __in_dev_get_rcu(dev); if (!out_dev) { net_crit_ratelimited("Bug in ip_route_input_slow(). Please report.\n"); - return -EINVAL; + return reason; } err = fib_validate_source(skb, saddr, daddr, dscp, FIB_RES_OIF(*res), in_dev->dev, in_dev, &itag); if (err < 0) { - err = -EINVAL; + reason = -err; ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr, saddr); @@ -1833,7 +1835,7 @@ static int __mkroute_input(struct sk_buff *skb, const struct fib_result *res, */ if (out_dev == in_dev && IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) { - err = -EINVAL; + reason = SKB_DROP_REASON_ARP_PVLAN_DISABLE; goto cleanup; } } @@ -1856,7 +1858,7 @@ static int __mkroute_input(struct sk_buff *skb, const struct fib_result *res, rth = rt_dst_alloc(out_dev->dev, 0, res->type, IN_DEV_ORCONF(out_dev, NOXFRM)); if (!rth) { - err = -ENOBUFS; + reason = SKB_DROP_REASON_NOMEM; goto cleanup; } @@ -1870,9 +1872,9 @@ static int __mkroute_input(struct sk_buff *skb, const struct fib_result *res, lwtunnel_set_redirect(&rth->dst); skb_dst_set(skb, &rth->dst); out: - err = 0; - cleanup: - return err; + reason = SKB_NOT_DROPPED_YET; +cleanup: + return reason; } #ifdef CONFIG_IP_ROUTE_MULTIPATH @@ -2130,9 +2132,10 @@ int fib_multipath_hash(const struct net *net, const struct flowi4 *fl4, } #endif /* CONFIG_IP_ROUTE_MULTIPATH */ -static int ip_mkroute_input(struct sk_buff *skb, struct fib_result *res, - struct in_device *in_dev, __be32 daddr, - __be32 saddr, dscp_t dscp, struct flow_keys *hkeys) +static enum skb_drop_reason +ip_mkroute_input(struct sk_buff *skb, struct fib_result *res, + struct in_device *in_dev, __be32 daddr, + __be32 saddr, dscp_t dscp, struct flow_keys *hkeys) { #ifdef CONFIG_IP_ROUTE_MULTIPATH if (res->fi && fib_info_num_path(res->fi) > 1) { @@ -2346,9 +2349,8 @@ ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr, } make_route: - err = ip_mkroute_input(skb, res, in_dev, daddr, saddr, dscp, flkeys); - if (!err) - reason = SKB_NOT_DROPPED_YET; + reason = ip_mkroute_input(skb, res, in_dev, daddr, saddr, dscp, + flkeys); out: return reason; -- 2.50.1 From 479aed04e84a5d66caa3b25bfc651292c153ef70 Mon Sep 17 00:00:00 2001 From: Menglong Dong Date: Thu, 7 Nov 2024 20:56:01 +0800 Subject: [PATCH 11/16] net: ip: make ip_route_use_hint() return drop reasons In this commit, we make ip_route_use_hint() return drop reasons. The drop reasons that we return are similar to what we do in ip_route_input_slow(), and no drop reasons are added in this commit. Signed-off-by: Menglong Dong Signed-off-by: Paolo Abeni --- include/net/route.h | 7 ++++--- net/ipv4/ip_input.c | 9 ++++----- net/ipv4/route.c | 28 +++++++++++++++++----------- 3 files changed, 25 insertions(+), 19 deletions(-) diff --git a/include/net/route.h b/include/net/route.h index fb3433dc9c72..84cb1e04f5cd 100644 --- a/include/net/route.h +++ b/include/net/route.h @@ -206,9 +206,10 @@ ip_mc_validate_source(struct sk_buff *skb, __be32 daddr, __be32 saddr, enum skb_drop_reason ip_route_input_noref(struct sk_buff *skb, __be32 daddr, __be32 saddr, dscp_t dscp, struct net_device *dev); -int ip_route_use_hint(struct sk_buff *skb, __be32 daddr, __be32 saddr, - dscp_t dscp, struct net_device *dev, - const struct sk_buff *hint); +enum skb_drop_reason +ip_route_use_hint(struct sk_buff *skb, __be32 daddr, __be32 saddr, + dscp_t dscp, struct net_device *dev, + const struct sk_buff *hint); static inline enum skb_drop_reason ip_route_input(struct sk_buff *skb, __be32 dst, __be32 src, dscp_t dscp, diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c index 513eb0c6435a..f0a4dda246ab 100644 --- a/net/ipv4/ip_input.c +++ b/net/ipv4/ip_input.c @@ -322,15 +322,14 @@ static int ip_rcv_finish_core(struct net *net, struct sock *sk, int err, drop_reason; struct rtable *rt; - drop_reason = SKB_DROP_REASON_NOT_SPECIFIED; - if (ip_can_use_hint(skb, iph, hint)) { - err = ip_route_use_hint(skb, iph->daddr, iph->saddr, - ip4h_dscp(iph), dev, hint); - if (unlikely(err)) + drop_reason = ip_route_use_hint(skb, iph->daddr, iph->saddr, + ip4h_dscp(iph), dev, hint); + if (unlikely(drop_reason)) goto drop_error; } + drop_reason = SKB_DROP_REASON_NOT_SPECIFIED; if (READ_ONCE(net->ipv4.sysctl_ip_early_demux) && !skb_dst(skb) && !skb->sk && diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 2697a6c88416..e5603e84b20d 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -2154,28 +2154,34 @@ ip_mkroute_input(struct sk_buff *skb, struct fib_result *res, * assuming daddr is valid and the destination is not a local broadcast one. * Uses the provided hint instead of performing a route lookup. */ -int ip_route_use_hint(struct sk_buff *skb, __be32 daddr, __be32 saddr, - dscp_t dscp, struct net_device *dev, - const struct sk_buff *hint) +enum skb_drop_reason +ip_route_use_hint(struct sk_buff *skb, __be32 daddr, __be32 saddr, + dscp_t dscp, struct net_device *dev, + const struct sk_buff *hint) { + enum skb_drop_reason reason = SKB_DROP_REASON_NOT_SPECIFIED; struct in_device *in_dev = __in_dev_get_rcu(dev); struct rtable *rt = skb_rtable(hint); struct net *net = dev_net(dev); - enum skb_drop_reason reason; - int err = -EINVAL; u32 tag = 0; if (!in_dev) - return -EINVAL; + return reason; - if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr)) + if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr)) { + reason = SKB_DROP_REASON_IP_INVALID_SOURCE; goto martian_source; + } - if (ipv4_is_zeronet(saddr)) + if (ipv4_is_zeronet(saddr)) { + reason = SKB_DROP_REASON_IP_INVALID_SOURCE; goto martian_source; + } - if (ipv4_is_loopback(saddr) && !IN_DEV_NET_ROUTE_LOCALNET(in_dev, net)) + if (ipv4_is_loopback(saddr) && !IN_DEV_NET_ROUTE_LOCALNET(in_dev, net)) { + reason = SKB_DROP_REASON_IP_LOCALNET; goto martian_source; + } if (rt->rt_type != RTN_LOCAL) goto skip_validate_source; @@ -2187,11 +2193,11 @@ int ip_route_use_hint(struct sk_buff *skb, __be32 daddr, __be32 saddr, skip_validate_source: skb_dst_copy(skb, hint); - return 0; + return SKB_NOT_DROPPED_YET; martian_source: ip_handle_martian_source(dev, in_dev, skb, daddr, saddr); - return err; + return reason; } /* get device for dst_alloc with local routes */ -- 2.50.1 From 12079a59ce52e72a342c49cfacf0281213fd6f32 Mon Sep 17 00:00:00 2001 From: Breno Leitao Date: Thu, 7 Nov 2024 08:11:44 -0800 Subject: [PATCH 12/16] net: Implement fault injection forcing skb reallocation Introduce a fault injection mechanism to force skb reallocation. The primary goal is to catch bugs related to pointer invalidation after potential skb reallocation. The fault injection mechanism aims to identify scenarios where callers retain pointers to various headers in the skb but fail to reload these pointers after calling a function that may reallocate the data. This type of bug can lead to memory corruption or crashes if the old, now-invalid pointers are used. By forcing reallocation through fault injection, we can stress-test code paths and ensure proper pointer management after potential skb reallocations. Add a hook for fault injection in the following functions: * pskb_trim_rcsum() * pskb_may_pull_reason() * pskb_trim() As the other fault injection mechanism, protect it under a debug Kconfig called CONFIG_FAIL_SKB_REALLOC. This patch was *heavily* inspired by Jakub's proposal from: https://lore.kernel.org/all/20240719174140.47a868e6@kernel.org/ CC: Akinobu Mita Suggested-by: Jakub Kicinski Signed-off-by: Breno Leitao Reviewed-by: Akinobu Mita Acked-by: Paolo Abeni Acked-by: Guillaume Nault Link: https://patch.msgid.link/20241107-fault_v6-v6-1-1b82cb6ecacd@debian.org Signed-off-by: Paolo Abeni --- .../admin-guide/kernel-parameters.txt | 1 + .../fault-injection/fault-injection.rst | 40 +++++++ include/linux/skbuff.h | 9 ++ lib/Kconfig.debug | 10 ++ net/core/Makefile | 1 + net/core/skb_fault_injection.c | 106 ++++++++++++++++++ 6 files changed, 167 insertions(+) create mode 100644 net/core/skb_fault_injection.c diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 1518343bbe22..2fb830453dcc 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -1546,6 +1546,7 @@ failslab= fail_usercopy= fail_page_alloc= + fail_skb_realloc= fail_make_request=[KNL] General fault injection mechanism. Format: ,,, diff --git a/Documentation/fault-injection/fault-injection.rst b/Documentation/fault-injection/fault-injection.rst index 8b8aeea71c68..1c14ba08fbfc 100644 --- a/Documentation/fault-injection/fault-injection.rst +++ b/Documentation/fault-injection/fault-injection.rst @@ -45,6 +45,32 @@ Available fault injection capabilities ALLOW_ERROR_INJECTION() macro, by setting debugfs entries under /sys/kernel/debug/fail_function. No boot option supported. +- fail_skb_realloc + + inject skb (socket buffer) reallocation events into the network path. The + primary goal is to identify and prevent issues related to pointer + mismanagement in the network subsystem. By forcing skb reallocation at + strategic points, this feature creates scenarios where existing pointers to + skb headers become invalid. + + When the fault is injected and the reallocation is triggered, cached pointers + to skb headers and data no longer reference valid memory locations. This + deliberate invalidation helps expose code paths where proper pointer updating + is neglected after a reallocation event. + + By creating these controlled fault scenarios, the system can catch instances + where stale pointers are used, potentially leading to memory corruption or + system instability. + + To select the interface to act on, write the network name to + /sys/kernel/debug/fail_skb_realloc/devname. + If this field is left empty (which is the default value), skb reallocation + will be forced on all network interfaces. + + The effectiveness of this fault detection is enhanced when KASAN is + enabled, as it helps identify invalid memory references and use-after-free + (UAF) issues. + - NVMe fault injection inject NVMe status code and retry flag on devices permitted by setting @@ -216,6 +242,19 @@ configuration of fault-injection capabilities. use a negative errno, you better use 'printf' instead of 'echo', e.g.: $ printf %#x -12 > retval +- /sys/kernel/debug/fail_skb_realloc/devname: + + Specifies the network interface on which to force SKB reallocation. If + left empty, SKB reallocation will be applied to all network interfaces. + + Example usage:: + + # Force skb reallocation on eth0 + echo "eth0" > /sys/kernel/debug/fail_skb_realloc/devname + + # Clear the selection and force skb reallocation on all interfaces + echo "" > /sys/kernel/debug/fail_skb_realloc/devname + Boot option ^^^^^^^^^^^ @@ -227,6 +266,7 @@ use the boot option:: fail_usercopy= fail_make_request= fail_futex= + fail_skb_realloc= mmc_core.fail_request=,,, proc entries diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 60535c706851..58009fa66102 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -2682,6 +2682,12 @@ static inline void skb_assert_len(struct sk_buff *skb) #endif /* CONFIG_DEBUG_NET */ } +#if defined(CONFIG_FAIL_SKB_REALLOC) +void skb_might_realloc(struct sk_buff *skb); +#else +static inline void skb_might_realloc(struct sk_buff *skb) {} +#endif + /* * Add data to an sk_buff */ @@ -2782,6 +2788,7 @@ static inline enum skb_drop_reason pskb_may_pull_reason(struct sk_buff *skb, unsigned int len) { DEBUG_NET_WARN_ON_ONCE(len > INT_MAX); + skb_might_realloc(skb); if (likely(len <= skb_headlen(skb))) return SKB_NOT_DROPPED_YET; @@ -3240,6 +3247,7 @@ static inline int __pskb_trim(struct sk_buff *skb, unsigned int len) static inline int pskb_trim(struct sk_buff *skb, unsigned int len) { + skb_might_realloc(skb); return (len < skb->len) ? __pskb_trim(skb, len) : 0; } @@ -3994,6 +4002,7 @@ int pskb_trim_rcsum_slow(struct sk_buff *skb, unsigned int len); static inline int pskb_trim_rcsum(struct sk_buff *skb, unsigned int len) { + skb_might_realloc(skb); if (likely(len >= skb->len)) return 0; return pskb_trim_rcsum_slow(skb, len); diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index 7312ae7c3cc5..67b669d2e70e 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -2115,6 +2115,16 @@ config FAIL_SUNRPC Provide fault-injection capability for SunRPC and its consumers. +config FAIL_SKB_REALLOC + bool "Fault-injection capability forcing skb to reallocate" + depends on FAULT_INJECTION_DEBUG_FS + help + Provide fault-injection capability that forces the skb to be + reallocated, catching possible invalid pointers to the skb. + + For more information, check + Documentation/dev-tools/fault-injection/fault-injection.rst + config FAULT_INJECTION_CONFIGFS bool "Configfs interface for fault-injection capabilities" depends on FAULT_INJECTION diff --git a/net/core/Makefile b/net/core/Makefile index 5a72a87ee0f1..d9326600e289 100644 --- a/net/core/Makefile +++ b/net/core/Makefile @@ -46,3 +46,4 @@ obj-$(CONFIG_OF) += of_net.o obj-$(CONFIG_NET_TEST) += net_test.o obj-$(CONFIG_NET_DEVMEM) += devmem.o obj-$(CONFIG_DEBUG_NET_SMALL_RTNL) += rtnl_net_debug.o +obj-$(CONFIG_FAIL_SKB_REALLOC) += skb_fault_injection.o diff --git a/net/core/skb_fault_injection.c b/net/core/skb_fault_injection.c new file mode 100644 index 000000000000..4235db6bdfad --- /dev/null +++ b/net/core/skb_fault_injection.c @@ -0,0 +1,106 @@ +// SPDX-License-Identifier: GPL-2.0-only + +#include +#include +#include +#include + +static struct { + struct fault_attr attr; + char devname[IFNAMSIZ]; + bool filtered; +} skb_realloc = { + .attr = FAULT_ATTR_INITIALIZER, + .filtered = false, +}; + +static bool should_fail_net_realloc_skb(struct sk_buff *skb) +{ + struct net_device *net = skb->dev; + + if (skb_realloc.filtered && + strncmp(net->name, skb_realloc.devname, IFNAMSIZ)) + /* device name filter set, but names do not match */ + return false; + + if (!should_fail(&skb_realloc.attr, 1)) + return false; + + return true; +} +ALLOW_ERROR_INJECTION(should_fail_net_realloc_skb, TRUE); + +void skb_might_realloc(struct sk_buff *skb) +{ + if (!should_fail_net_realloc_skb(skb)) + return; + + pskb_expand_head(skb, 0, 0, GFP_ATOMIC); +} +EXPORT_SYMBOL(skb_might_realloc); + +static int __init fail_skb_realloc_setup(char *str) +{ + return setup_fault_attr(&skb_realloc.attr, str); +} +__setup("fail_skb_realloc=", fail_skb_realloc_setup); + +static void reset_settings(void) +{ + skb_realloc.filtered = false; + memset(&skb_realloc.devname, 0, IFNAMSIZ); +} + +static ssize_t devname_write(struct file *file, const char __user *buffer, + size_t count, loff_t *ppos) +{ + ssize_t ret; + + reset_settings(); + ret = simple_write_to_buffer(&skb_realloc.devname, IFNAMSIZ, + ppos, buffer, count); + if (ret < 0) + return ret; + + skb_realloc.devname[IFNAMSIZ - 1] = '\0'; + /* Remove a possible \n at the end of devname */ + strim(skb_realloc.devname); + + if (strnlen(skb_realloc.devname, IFNAMSIZ)) + skb_realloc.filtered = true; + + return count; +} + +static ssize_t devname_read(struct file *file, + char __user *buffer, + size_t size, loff_t *ppos) +{ + if (!skb_realloc.filtered) + return 0; + + return simple_read_from_buffer(buffer, size, ppos, &skb_realloc.devname, + strlen(skb_realloc.devname)); +} + +static const struct file_operations devname_ops = { + .write = devname_write, + .read = devname_read, +}; + +static int __init fail_skb_realloc_debugfs(void) +{ + umode_t mode = S_IFREG | 0600; + struct dentry *dir; + + dir = fault_create_debugfs_attr("fail_skb_realloc", NULL, + &skb_realloc.attr); + if (IS_ERR(dir)) + return PTR_ERR(dir); + + debugfs_create_file("devname", mode, dir, NULL, &devname_ops); + + return 0; +} + +late_initcall(fail_skb_realloc_debugfs); -- 2.50.1 From a58f00ed24b849d449f7134fd5d86f07090fe2f5 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Thu, 7 Nov 2024 17:02:54 -0800 Subject: [PATCH 13/16] net: sched: cls_api: improve the error message for ID allocation failure We run into an exhaustion problem with the kernel-allocated filter IDs. Our allocation problem can be fixed on the user space side, but the error message in this case was quite misleading: "Filter with specified priority/protocol not found" (EINVAL) Specifically when we can't allocate a _new_ ID because filter with lowest ID already _exists_, saying "filter not found", is confusing. Kernel allocates IDs in range of 0xc0000 -> 0x8000, giving out ID one lower than lowest existing in that range. The error message makes sense when tcf_chain_tp_find() gets called for GET and DEL but for NEW we need to provide more specific error messages for all three cases: - user wants the ID to be auto-allocated but filter with ID 0x8000 already exists - filter already exists and can be replaced, but user asked for a protocol change - filter doesn't exist Caller of tcf_chain_tp_insert_unique() doesn't set extack today, so don't bother plumbing it in. Reviewed-by: Simon Horman Signed-off-by: Jakub Kicinski Acked-by: Jamal Hadi Salim Link: https://patch.msgid.link/20241108010254.2995438-1-kuba@kernel.org Signed-off-by: Paolo Abeni --- net/sched/cls_api.c | 39 +++++++++++++++++++++++++-------------- 1 file changed, 25 insertions(+), 14 deletions(-) diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c index 04942f8c62e0..7578e27260c9 100644 --- a/net/sched/cls_api.c +++ b/net/sched/cls_api.c @@ -1933,7 +1933,8 @@ static void tcf_chain_tp_remove(struct tcf_chain *chain, static struct tcf_proto *tcf_chain_tp_find(struct tcf_chain *chain, struct tcf_chain_info *chain_info, u32 protocol, u32 prio, - bool prio_allocate); + bool prio_allocate, + struct netlink_ext_ack *extack); /* Try to insert new proto. * If proto with specified priority already exists, free new proto @@ -1957,8 +1958,7 @@ static struct tcf_proto *tcf_chain_tp_insert_unique(struct tcf_chain *chain, return ERR_PTR(-EAGAIN); } - tp = tcf_chain_tp_find(chain, &chain_info, - protocol, prio, false); + tp = tcf_chain_tp_find(chain, &chain_info, protocol, prio, false, NULL); if (!tp) err = tcf_chain_tp_insert(chain, &chain_info, tp_new); mutex_unlock(&chain->filter_chain_lock); @@ -2018,7 +2018,8 @@ static void tcf_chain_tp_delete_empty(struct tcf_chain *chain, static struct tcf_proto *tcf_chain_tp_find(struct tcf_chain *chain, struct tcf_chain_info *chain_info, u32 protocol, u32 prio, - bool prio_allocate) + bool prio_allocate, + struct netlink_ext_ack *extack) { struct tcf_proto **pprev; struct tcf_proto *tp; @@ -2029,9 +2030,14 @@ static struct tcf_proto *tcf_chain_tp_find(struct tcf_chain *chain, pprev = &tp->next) { if (tp->prio >= prio) { if (tp->prio == prio) { - if (prio_allocate || - (tp->protocol != protocol && protocol)) + if (prio_allocate) { + NL_SET_ERR_MSG(extack, "Lowest ID from auto-alloc range already in use"); + return ERR_PTR(-ENOSPC); + } + if (tp->protocol != protocol && protocol) { + NL_SET_ERR_MSG(extack, "Protocol mismatch for filter with specified priority"); return ERR_PTR(-EINVAL); + } } else { tp = NULL; } @@ -2312,9 +2318,8 @@ replay: mutex_lock(&chain->filter_chain_lock); tp = tcf_chain_tp_find(chain, &chain_info, protocol, - prio, prio_allocate); + prio, prio_allocate, extack); if (IS_ERR(tp)) { - NL_SET_ERR_MSG(extack, "Filter with specified priority/protocol not found"); err = PTR_ERR(tp); goto errout_locked; } @@ -2539,10 +2544,13 @@ static int tc_del_tfilter(struct sk_buff *skb, struct nlmsghdr *n, mutex_lock(&chain->filter_chain_lock); tp = tcf_chain_tp_find(chain, &chain_info, protocol, - prio, false); - if (!tp || IS_ERR(tp)) { + prio, false, extack); + if (!tp) { + err = -ENOENT; NL_SET_ERR_MSG(extack, "Filter with specified priority/protocol not found"); - err = tp ? PTR_ERR(tp) : -ENOENT; + goto errout_locked; + } else if (IS_ERR(tp)) { + err = PTR_ERR(tp); goto errout_locked; } else if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], tp->ops->kind)) { NL_SET_ERR_MSG(extack, "Specified filter kind does not match existing one"); @@ -2679,11 +2687,14 @@ static int tc_get_tfilter(struct sk_buff *skb, struct nlmsghdr *n, mutex_lock(&chain->filter_chain_lock); tp = tcf_chain_tp_find(chain, &chain_info, protocol, - prio, false); + prio, false, extack); mutex_unlock(&chain->filter_chain_lock); - if (!tp || IS_ERR(tp)) { + if (!tp) { + err = -ENOENT; NL_SET_ERR_MSG(extack, "Filter with specified priority/protocol not found"); - err = tp ? PTR_ERR(tp) : -ENOENT; + goto errout; + } else if (IS_ERR(tp)) { + err = PTR_ERR(tp); goto errout; } else if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], tp->ops->kind)) { NL_SET_ERR_MSG(extack, "Specified filter kind does not match existing one"); -- 2.50.1 From b169e76ebad22cbd055101ee5aa1a7bed0e66606 Mon Sep 17 00:00:00 2001 From: Dmitry Kandybka Date: Thu, 7 Nov 2024 13:36:57 +0300 Subject: [PATCH 14/16] mptcp: fix possible integer overflow in mptcp_reset_tout_timer In 'mptcp_reset_tout_timer', promote 'probe_timestamp' to unsigned long to avoid possible integer overflow. Compile tested only. Found by Linux Verification Center (linuxtesting.org) with SVACE. Signed-off-by: Dmitry Kandybka Link: https://patch.msgid.link/20241107103657.1560536-1-d.kandybka@gmail.com Signed-off-by: Jakub Kicinski --- net/mptcp/protocol.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index b0e9a745ea62..a6f2a25edb11 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -2722,8 +2722,8 @@ void mptcp_reset_tout_timer(struct mptcp_sock *msk, unsigned long fail_tout) if (!fail_tout && !inet_csk(sk)->icsk_mtup.probe_timestamp) return; - close_timeout = inet_csk(sk)->icsk_mtup.probe_timestamp - tcp_jiffies32 + jiffies + - mptcp_close_timeout(sk); + close_timeout = (unsigned long)inet_csk(sk)->icsk_mtup.probe_timestamp - + tcp_jiffies32 + jiffies + mptcp_close_timeout(sk); /* the close timeout takes precedence on the fail one, and here at least one of * them is active -- 2.50.1 From 078e0d596f7b5952dad8662ace8f20ed2165e2ce Mon Sep 17 00:00:00 2001 From: Andrew Lunn Date: Sun, 10 Nov 2024 18:59:55 +0100 Subject: [PATCH 15/16] dsa: qca8k: Use nested lock to avoid splat qca8k_phy_eth_command() is used to probe the child MDIO bus while the parent MDIO is locked. This causes lockdep splat, reporting a possible deadlock. It is not an actually deadlock, because different locks are used. By making use of mutex_lock_nested() we can avoid this false positive. Signed-off-by: Andrew Lunn Link: https://patch.msgid.link/20241110175955.3053664-1-andrew@lunn.ch Signed-off-by: Jakub Kicinski --- drivers/net/dsa/qca/qca8k-8xxx.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/dsa/qca/qca8k-8xxx.c b/drivers/net/dsa/qca/qca8k-8xxx.c index f8d8c70642c4..59b4a7240b58 100644 --- a/drivers/net/dsa/qca/qca8k-8xxx.c +++ b/drivers/net/dsa/qca/qca8k-8xxx.c @@ -673,7 +673,7 @@ qca8k_phy_eth_command(struct qca8k_priv *priv, bool read, int phy, * We therefore need to lock the MDIO bus onto which the switch is * connected. */ - mutex_lock(&priv->bus->mdio_lock); + mutex_lock_nested(&priv->bus->mdio_lock, MDIO_MUTEX_NESTED); /* Actually start the request: * 1. Send mdio master packet -- 2.50.1 From 7ed816be35abc3d5bed39d3edc5f2efed2ca5216 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Fri, 8 Nov 2024 19:51:19 -0800 Subject: [PATCH 16/16] eth: bnxt: use page pool for head frags Testing small size RPCs (300B-400B) on a large AMD system suggests that page pool recycling is very useful even for just the head frags. With this patch (and copy break disabled) I see a 30% performance improvement (82Gbps -> 106Gbps). Convert bnxt from normal page frags to page pool frags for head buffers. On systems with small page size we can use the same pool as for TPA pages. On systems with large pages the frag allocation logic of the page pool is already used to split a large page into TPA chunks. TPA chunks are much larger than heads (8k or 64k, AFAICT vs 1kB) and we always allocate the same sized chunks. Mixing allocation of TPA and head pages would lead to sub-optimal memory use. Plus Taehee's work on zero-copy / devmem will need to differentiate between TPA and non-TPA page pool, anyway. Conditionally allocate a new page pool for heads. Link: https://patch.msgid.link/20241109035119.3391864-1-kuba@kernel.org Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/broadcom/bnxt/bnxt.c | 98 ++++++++++++----------- drivers/net/ethernet/broadcom/bnxt/bnxt.h | 1 + 2 files changed, 51 insertions(+), 48 deletions(-) diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c index e9aab2e2840e..4c1302a8f72d 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c @@ -864,6 +864,11 @@ static void bnxt_tx_int(struct bnxt *bp, struct bnxt_napi *bnapi, int budget) bnapi->events &= ~BNXT_TX_CMP_EVENT; } +static bool bnxt_separate_head_pool(void) +{ + return PAGE_SIZE > BNXT_RX_PAGE_SIZE; +} + static struct page *__bnxt_alloc_rx_page(struct bnxt *bp, dma_addr_t *mapping, struct bnxt_rx_ring_info *rxr, unsigned int *offset, @@ -886,27 +891,19 @@ static struct page *__bnxt_alloc_rx_page(struct bnxt *bp, dma_addr_t *mapping, } static inline u8 *__bnxt_alloc_rx_frag(struct bnxt *bp, dma_addr_t *mapping, + struct bnxt_rx_ring_info *rxr, gfp_t gfp) { - u8 *data; - struct pci_dev *pdev = bp->pdev; + unsigned int offset; + struct page *page; - if (gfp == GFP_ATOMIC) - data = napi_alloc_frag(bp->rx_buf_size); - else - data = netdev_alloc_frag(bp->rx_buf_size); - if (!data) + page = page_pool_alloc_frag(rxr->head_pool, &offset, + bp->rx_buf_size, gfp); + if (!page) return NULL; - *mapping = dma_map_single_attrs(&pdev->dev, data + bp->rx_dma_offset, - bp->rx_buf_use_size, bp->rx_dir, - DMA_ATTR_WEAK_ORDERING); - - if (dma_mapping_error(&pdev->dev, *mapping)) { - skb_free_frag(data); - data = NULL; - } - return data; + *mapping = page_pool_get_dma_addr(page) + bp->rx_dma_offset + offset; + return page_address(page) + offset; } int bnxt_alloc_rx_data(struct bnxt *bp, struct bnxt_rx_ring_info *rxr, @@ -928,7 +925,7 @@ int bnxt_alloc_rx_data(struct bnxt *bp, struct bnxt_rx_ring_info *rxr, rx_buf->data = page; rx_buf->data_ptr = page_address(page) + offset + bp->rx_offset; } else { - u8 *data = __bnxt_alloc_rx_frag(bp, &mapping, gfp); + u8 *data = __bnxt_alloc_rx_frag(bp, &mapping, rxr, gfp); if (!data) return -ENOMEM; @@ -1179,13 +1176,14 @@ static struct sk_buff *bnxt_rx_skb(struct bnxt *bp, } skb = napi_build_skb(data, bp->rx_buf_size); - dma_unmap_single_attrs(&bp->pdev->dev, dma_addr, bp->rx_buf_use_size, - bp->rx_dir, DMA_ATTR_WEAK_ORDERING); + dma_sync_single_for_cpu(&bp->pdev->dev, dma_addr, bp->rx_buf_use_size, + bp->rx_dir); if (!skb) { - skb_free_frag(data); + page_pool_free_va(rxr->head_pool, data, true); return NULL; } + skb_mark_for_recycle(skb); skb_reserve(skb, bp->rx_offset); skb_put(skb, offset_and_len & 0xffff); return skb; @@ -1840,7 +1838,8 @@ static inline struct sk_buff *bnxt_tpa_end(struct bnxt *bp, u8 *new_data; dma_addr_t new_mapping; - new_data = __bnxt_alloc_rx_frag(bp, &new_mapping, GFP_ATOMIC); + new_data = __bnxt_alloc_rx_frag(bp, &new_mapping, rxr, + GFP_ATOMIC); if (!new_data) { bnxt_abort_tpa(cpr, idx, agg_bufs); cpr->sw_stats->rx.rx_oom_discards += 1; @@ -1852,16 +1851,16 @@ static inline struct sk_buff *bnxt_tpa_end(struct bnxt *bp, tpa_info->mapping = new_mapping; skb = napi_build_skb(data, bp->rx_buf_size); - dma_unmap_single_attrs(&bp->pdev->dev, mapping, - bp->rx_buf_use_size, bp->rx_dir, - DMA_ATTR_WEAK_ORDERING); + dma_sync_single_for_cpu(&bp->pdev->dev, mapping, + bp->rx_buf_use_size, bp->rx_dir); if (!skb) { - skb_free_frag(data); + page_pool_free_va(rxr->head_pool, data, true); bnxt_abort_tpa(cpr, idx, agg_bufs); cpr->sw_stats->rx.rx_oom_discards += 1; return NULL; } + skb_mark_for_recycle(skb); skb_reserve(skb, bp->rx_offset); skb_put(skb, len); } @@ -3308,28 +3307,22 @@ static void bnxt_free_tx_skbs(struct bnxt *bp) static void bnxt_free_one_rx_ring(struct bnxt *bp, struct bnxt_rx_ring_info *rxr) { - struct pci_dev *pdev = bp->pdev; int i, max_idx; max_idx = bp->rx_nr_pages * RX_DESC_CNT; for (i = 0; i < max_idx; i++) { struct bnxt_sw_rx_bd *rx_buf = &rxr->rx_buf_ring[i]; - dma_addr_t mapping = rx_buf->mapping; void *data = rx_buf->data; if (!data) continue; rx_buf->data = NULL; - if (BNXT_RX_PAGE_MODE(bp)) { + if (BNXT_RX_PAGE_MODE(bp)) page_pool_recycle_direct(rxr->page_pool, data); - } else { - dma_unmap_single_attrs(&pdev->dev, mapping, - bp->rx_buf_use_size, bp->rx_dir, - DMA_ATTR_WEAK_ORDERING); - skb_free_frag(data); - } + else + page_pool_free_va(rxr->head_pool, data, true); } } @@ -3356,7 +3349,6 @@ static void bnxt_free_one_rx_agg_ring(struct bnxt *bp, struct bnxt_rx_ring_info static void bnxt_free_one_rx_ring_skbs(struct bnxt *bp, int ring_nr) { struct bnxt_rx_ring_info *rxr = &bp->rx_ring[ring_nr]; - struct pci_dev *pdev = bp->pdev; struct bnxt_tpa_idx_map *map; int i; @@ -3370,13 +3362,8 @@ static void bnxt_free_one_rx_ring_skbs(struct bnxt *bp, int ring_nr) if (!data) continue; - dma_unmap_single_attrs(&pdev->dev, tpa_info->mapping, - bp->rx_buf_use_size, bp->rx_dir, - DMA_ATTR_WEAK_ORDERING); - tpa_info->data = NULL; - - skb_free_frag(data); + page_pool_free_va(rxr->head_pool, data, false); } skip_rx_tpa_free: @@ -3592,7 +3579,9 @@ static void bnxt_free_rx_rings(struct bnxt *bp) xdp_rxq_info_unreg(&rxr->xdp_rxq); page_pool_destroy(rxr->page_pool); - rxr->page_pool = NULL; + if (rxr->page_pool != rxr->head_pool) + page_pool_destroy(rxr->head_pool); + rxr->page_pool = rxr->head_pool = NULL; kfree(rxr->rx_agg_bmap); rxr->rx_agg_bmap = NULL; @@ -3610,6 +3599,7 @@ static int bnxt_alloc_rx_page_pool(struct bnxt *bp, int numa_node) { struct page_pool_params pp = { 0 }; + struct page_pool *pool; pp.pool_size = bp->rx_agg_ring_size; if (BNXT_RX_PAGE_MODE(bp)) @@ -3622,14 +3612,25 @@ static int bnxt_alloc_rx_page_pool(struct bnxt *bp, pp.max_len = PAGE_SIZE; pp.flags = PP_FLAG_DMA_MAP | PP_FLAG_DMA_SYNC_DEV; - rxr->page_pool = page_pool_create(&pp); - if (IS_ERR(rxr->page_pool)) { - int err = PTR_ERR(rxr->page_pool); + pool = page_pool_create(&pp); + if (IS_ERR(pool)) + return PTR_ERR(pool); + rxr->page_pool = pool; - rxr->page_pool = NULL; - return err; + if (bnxt_separate_head_pool()) { + pp.pool_size = max(bp->rx_ring_size, 1024); + pool = page_pool_create(&pp); + if (IS_ERR(pool)) + goto err_destroy_pp; } + rxr->head_pool = pool; + return 0; + +err_destroy_pp: + page_pool_destroy(rxr->page_pool); + rxr->page_pool = NULL; + return PTR_ERR(pool); } static int bnxt_alloc_rx_rings(struct bnxt *bp) @@ -4180,7 +4181,8 @@ static int bnxt_alloc_one_rx_ring(struct bnxt *bp, int ring_nr) u8 *data; for (i = 0; i < bp->max_tpa; i++) { - data = __bnxt_alloc_rx_frag(bp, &mapping, GFP_KERNEL); + data = __bnxt_alloc_rx_frag(bp, &mapping, rxr, + GFP_KERNEL); if (!data) return -ENOMEM; diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.h b/drivers/net/ethernet/broadcom/bnxt/bnxt.h index 69231e85140b..649955fa3e37 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.h +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.h @@ -1105,6 +1105,7 @@ struct bnxt_rx_ring_info { struct bnxt_ring_struct rx_agg_ring_struct; struct xdp_rxq_info xdp_rxq; struct page_pool *page_pool; + struct page_pool *head_pool; }; struct bnxt_rx_sw_stats { -- 2.50.1