From a6c81e32aeacbfd530d576fa401edd506ec966ef Mon Sep 17 00:00:00 2001 From: shantiprasad shettar Date: Mon, 10 Mar 2025 11:31:26 -0700 Subject: [PATCH 01/16] bnxt_en: Query FW parameters when the CAPS_CHANGE bit is set Newer FW can set the CAPS_CHANGE flag during ifup if some capabilities or configurations have changed. For example, the CoS queue configurations may have changed. Support this new flag by treating it almost like FW reset. The driver will essentially rediscover all features and capabilities, reconfigure all backing store context memory, reset everything to default, and reserve all resources. Reviewed-by: Somnath Kotur Reviewed-by: Pavan Chebbi Signed-off-by: shantiprasad shettar Signed-off-by: Michael Chan Link: https://patch.msgid.link/20250310183129.3154117-5-michael.chan@broadcom.com Reviewed-by: Jacob Keller Signed-off-by: Paolo Abeni --- drivers/net/ethernet/broadcom/bnxt/bnxt.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c index 869bdde09c73..08a41e9c25ff 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c @@ -12309,6 +12309,7 @@ static int bnxt_hwrm_if_change(struct bnxt *bp, bool up) struct hwrm_func_drv_if_change_input *req; bool fw_reset = !bp->irq_tbl; bool resc_reinit = false; + bool caps_change = false; int rc, retry = 0; u32 flags = 0; @@ -12364,8 +12365,11 @@ static int bnxt_hwrm_if_change(struct bnxt *bp, bool up) set_bit(BNXT_STATE_ABORT_ERR, &bp->state); return -ENODEV; } - if (resc_reinit || fw_reset) { - if (fw_reset) { + if (flags & FUNC_DRV_IF_CHANGE_RESP_FLAGS_CAPS_CHANGE) + caps_change = true; + + if (resc_reinit || fw_reset || caps_change) { + if (fw_reset || caps_change) { set_bit(BNXT_STATE_FW_RESET_DET, &bp->state); if (!test_bit(BNXT_STATE_IN_FW_RESET, &bp->state)) bnxt_ulp_irq_stop(bp); -- 2.51.0 From 17596d239f341b3ae4f32781c5d56bf28494709d Mon Sep 17 00:00:00 2001 From: Michael Chan Date: Mon, 10 Mar 2025 11:31:27 -0700 Subject: [PATCH 02/16] bnxt_en: Update firmware interface to 1.10.3.97 The main changes are adding i2c write for module eeprom and a new v2 PCIe statistics structure. Signed-off-by: Michael Chan Link: https://patch.msgid.link/20250310183129.3154117-6-michael.chan@broadcom.com Reviewed-by: Jacob Keller Signed-off-by: Paolo Abeni --- drivers/net/ethernet/broadcom/bnxt/bnxt_hsi.h | 143 +++++++++++++++--- 1 file changed, 119 insertions(+), 24 deletions(-) diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_hsi.h b/drivers/net/ethernet/broadcom/bnxt/bnxt_hsi.h index 5f8de1634378..549231703bce 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt_hsi.h +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_hsi.h @@ -2,7 +2,7 @@ * * Copyright (c) 2014-2016 Broadcom Corporation * Copyright (c) 2014-2018 Broadcom Limited - * Copyright (c) 2018-2024 Broadcom Inc. + * Copyright (c) 2018-2025 Broadcom Inc. 
* * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -438,6 +438,7 @@ struct cmd_nums { #define HWRM_MFG_PRVSN_EXPORT_CERT 0x219UL #define HWRM_STAT_DB_ERROR_QSTATS 0x21aUL #define HWRM_MFG_TESTS 0x21bUL + #define HWRM_MFG_WRITE_CERT_NVM 0x21cUL #define HWRM_PORT_POE_CFG 0x230UL #define HWRM_PORT_POE_QCFG 0x231UL #define HWRM_UDCC_QCAPS 0x258UL @@ -514,6 +515,8 @@ struct cmd_nums { #define HWRM_TFC_TBL_SCOPE_CONFIG_GET 0x39aUL #define HWRM_TFC_RESC_USAGE_QUERY 0x39bUL #define HWRM_TFC_GLOBAL_ID_FREE 0x39cUL + #define HWRM_TFC_TCAM_PRI_UPDATE 0x39dUL + #define HWRM_TFC_HOT_UPGRADE_PROCESS 0x3a0UL #define HWRM_SV 0x400UL #define HWRM_DBG_SERDES_TEST 0xff0eUL #define HWRM_DBG_LOG_BUFFER_FLUSH 0xff0fUL @@ -629,8 +632,8 @@ struct hwrm_err_output { #define HWRM_VERSION_MAJOR 1 #define HWRM_VERSION_MINOR 10 #define HWRM_VERSION_UPDATE 3 -#define HWRM_VERSION_RSVD 85 -#define HWRM_VERSION_STR "1.10.3.85" +#define HWRM_VERSION_RSVD 97 +#define HWRM_VERSION_STR "1.10.3.97" /* hwrm_ver_get_input (size:192b/24B) */ struct hwrm_ver_get_input { @@ -1905,11 +1908,15 @@ struct hwrm_func_qcaps_output { __le32 roce_vf_max_srq; __le32 roce_vf_max_gid; __le32 flags_ext3; - #define FUNC_QCAPS_RESP_FLAGS_EXT3_RM_RSV_WHILE_ALLOC_CAP 0x1UL - #define FUNC_QCAPS_RESP_FLAGS_EXT3_REQUIRE_L2_FILTER 0x2UL - #define FUNC_QCAPS_RESP_FLAGS_EXT3_MAX_ROCE_VFS_SUPPORTED 0x4UL + #define FUNC_QCAPS_RESP_FLAGS_EXT3_RM_RSV_WHILE_ALLOC_CAP 0x1UL + #define FUNC_QCAPS_RESP_FLAGS_EXT3_REQUIRE_L2_FILTER 0x2UL + #define FUNC_QCAPS_RESP_FLAGS_EXT3_MAX_ROCE_VFS_SUPPORTED 0x4UL + #define FUNC_QCAPS_RESP_FLAGS_EXT3_RX_RATE_PROFILE_SEL_SUPPORTED 0x8UL + #define FUNC_QCAPS_RESP_FLAGS_EXT3_BIDI_OPT_SUPPORTED 0x10UL + #define FUNC_QCAPS_RESP_FLAGS_EXT3_MIRROR_ON_ROCE_SUPPORTED 0x20UL __le16 max_roce_vfs; - u8 unused_3[5]; + __le16 max_crypto_rx_flow_filters; + u8 unused_3[3]; u8 valid; }; @@ -1924,7 +1931,7 @@ struct hwrm_func_qcfg_input { u8 unused_0[6]; }; -/* hwrm_func_qcfg_output (size:1280b/160B) */ +/* hwrm_func_qcfg_output (size:1344b/168B) */ struct hwrm_func_qcfg_output { __le16 error_code; __le16 req_type; @@ -2087,14 +2094,18 @@ struct hwrm_func_qcfg_output { __le16 host_mtu; __le16 flags2; #define FUNC_QCFG_RESP_FLAGS2_SRIOV_DSCP_INSERT_ENABLED 0x1UL - u8 unused_4[2]; + __le16 stag_vid; u8 port_kdnet_mode; #define FUNC_QCFG_RESP_PORT_KDNET_MODE_DISABLED 0x0UL #define FUNC_QCFG_RESP_PORT_KDNET_MODE_ENABLED 0x1UL #define FUNC_QCFG_RESP_PORT_KDNET_MODE_LAST FUNC_QCFG_RESP_PORT_KDNET_MODE_ENABLED u8 kdnet_pcie_function; __le16 port_kdnet_fid; - u8 unused_5[2]; + u8 unused_5; + u8 roce_bidi_opt_mode; + #define FUNC_QCFG_RESP_ROCE_BIDI_OPT_MODE_DISABLED 0x1UL + #define FUNC_QCFG_RESP_ROCE_BIDI_OPT_MODE_DEDICATED 0x2UL + #define FUNC_QCFG_RESP_ROCE_BIDI_OPT_MODE_SHARED 0x4UL __le32 num_ktls_tx_key_ctxs; __le32 num_ktls_rx_key_ctxs; u8 lag_id; @@ -2112,7 +2123,8 @@ struct hwrm_func_qcfg_output { __le16 xid_partition_cfg; #define FUNC_QCFG_RESP_XID_PARTITION_CFG_TX_CK 0x1UL #define FUNC_QCFG_RESP_XID_PARTITION_CFG_RX_CK 0x2UL - u8 unused_7; + __le16 mirror_vnic_id; + u8 unused_7[7]; u8 valid; }; @@ -3965,7 +3977,7 @@ struct ts_split_entries { __le32 region_num_entries; u8 tsid; u8 lkup_static_bkt_cnt_exp[2]; - u8 rsvd; + u8 locked; __le32 rsvd2[2]; }; @@ -5483,6 +5495,37 @@ struct hwrm_port_phy_qcaps_output { u8 valid; }; +/* hwrm_port_phy_i2c_write_input (size:832b/104B) */ +struct hwrm_port_phy_i2c_write_input { + __le16 req_type; + __le16 
cmpl_ring; + __le16 seq_id; + __le16 target_id; + __le64 resp_addr; + __le32 flags; + __le32 enables; + #define PORT_PHY_I2C_WRITE_REQ_ENABLES_PAGE_OFFSET 0x1UL + #define PORT_PHY_I2C_WRITE_REQ_ENABLES_BANK_NUMBER 0x2UL + __le16 port_id; + u8 i2c_slave_addr; + u8 bank_number; + __le16 page_number; + __le16 page_offset; + u8 data_length; + u8 unused_1[7]; + __le32 data[16]; +}; + +/* hwrm_port_phy_i2c_write_output (size:128b/16B) */ +struct hwrm_port_phy_i2c_write_output { + __le16 error_code; + __le16 req_type; + __le16 seq_id; + __le16 resp_len; + u8 unused_0[7]; + u8 valid; +}; + /* hwrm_port_phy_i2c_read_input (size:320b/40B) */ struct hwrm_port_phy_i2c_read_input { __le16 req_type; @@ -6610,8 +6653,9 @@ struct hwrm_vnic_alloc_input { __le32 flags; #define VNIC_ALLOC_REQ_FLAGS_DEFAULT 0x1UL #define VNIC_ALLOC_REQ_FLAGS_VIRTIO_NET_FID_VALID 0x2UL + #define VNIC_ALLOC_REQ_FLAGS_VNIC_ID_VALID 0x4UL __le16 virtio_net_fid; - u8 unused_0[2]; + __le16 vnic_id; }; /* hwrm_vnic_alloc_output (size:128b/16B) */ @@ -6710,6 +6754,7 @@ struct hwrm_vnic_cfg_input { #define VNIC_CFG_REQ_ENABLES_QUEUE_ID 0x80UL #define VNIC_CFG_REQ_ENABLES_RX_CSUM_V2_MODE 0x100UL #define VNIC_CFG_REQ_ENABLES_L2_CQE_MODE 0x200UL + #define VNIC_CFG_REQ_ENABLES_RAW_QP_ID 0x400UL __le16 vnic_id; __le16 dflt_ring_grp; __le16 rss_rule; @@ -6729,7 +6774,7 @@ struct hwrm_vnic_cfg_input { #define VNIC_CFG_REQ_L2_CQE_MODE_COMPRESSED 0x1UL #define VNIC_CFG_REQ_L2_CQE_MODE_MIXED 0x2UL #define VNIC_CFG_REQ_L2_CQE_MODE_LAST VNIC_CFG_REQ_L2_CQE_MODE_MIXED - u8 unused0[4]; + __le32 raw_qp_id; }; /* hwrm_vnic_cfg_output (size:128b/16B) */ @@ -7082,6 +7127,15 @@ struct hwrm_vnic_plcmodes_cfg_output { u8 valid; }; +/* hwrm_vnic_plcmodes_cfg_cmd_err (size:64b/8B) */ +struct hwrm_vnic_plcmodes_cfg_cmd_err { + u8 code; + #define VNIC_PLCMODES_CFG_CMD_ERR_CODE_UNKNOWN 0x0UL + #define VNIC_PLCMODES_CFG_CMD_ERR_CODE_INVALID_HDS_THRESHOLD 0x1UL + #define VNIC_PLCMODES_CFG_CMD_ERR_CODE_LAST VNIC_PLCMODES_CFG_CMD_ERR_CODE_INVALID_HDS_THRESHOLD + u8 unused_0[7]; +}; + /* hwrm_vnic_rss_cos_lb_ctx_alloc_input (size:128b/16B) */ struct hwrm_vnic_rss_cos_lb_ctx_alloc_input { __le16 req_type; @@ -7131,15 +7185,16 @@ struct hwrm_ring_alloc_input { __le16 target_id; __le64 resp_addr; __le32 enables; - #define RING_ALLOC_REQ_ENABLES_RING_ARB_CFG 0x2UL - #define RING_ALLOC_REQ_ENABLES_STAT_CTX_ID_VALID 0x8UL - #define RING_ALLOC_REQ_ENABLES_MAX_BW_VALID 0x20UL - #define RING_ALLOC_REQ_ENABLES_RX_RING_ID_VALID 0x40UL - #define RING_ALLOC_REQ_ENABLES_NQ_RING_ID_VALID 0x80UL - #define RING_ALLOC_REQ_ENABLES_RX_BUF_SIZE_VALID 0x100UL - #define RING_ALLOC_REQ_ENABLES_SCHQ_ID 0x200UL - #define RING_ALLOC_REQ_ENABLES_MPC_CHNLS_TYPE 0x400UL - #define RING_ALLOC_REQ_ENABLES_STEERING_TAG_VALID 0x800UL + #define RING_ALLOC_REQ_ENABLES_RING_ARB_CFG 0x2UL + #define RING_ALLOC_REQ_ENABLES_STAT_CTX_ID_VALID 0x8UL + #define RING_ALLOC_REQ_ENABLES_MAX_BW_VALID 0x20UL + #define RING_ALLOC_REQ_ENABLES_RX_RING_ID_VALID 0x40UL + #define RING_ALLOC_REQ_ENABLES_NQ_RING_ID_VALID 0x80UL + #define RING_ALLOC_REQ_ENABLES_RX_BUF_SIZE_VALID 0x100UL + #define RING_ALLOC_REQ_ENABLES_SCHQ_ID 0x200UL + #define RING_ALLOC_REQ_ENABLES_MPC_CHNLS_TYPE 0x400UL + #define RING_ALLOC_REQ_ENABLES_STEERING_TAG_VALID 0x800UL + #define RING_ALLOC_REQ_ENABLES_RX_RATE_PROFILE_VALID 0x1000UL u8 ring_type; #define RING_ALLOC_REQ_RING_TYPE_L2_CMPL 0x0UL #define RING_ALLOC_REQ_RING_TYPE_TX 0x1UL @@ -7226,7 +7281,11 @@ struct hwrm_ring_alloc_input { #define RING_ALLOC_REQ_MPC_CHNLS_TYPE_RE_CFA 0x3UL #define 
RING_ALLOC_REQ_MPC_CHNLS_TYPE_PRIMATE 0x4UL #define RING_ALLOC_REQ_MPC_CHNLS_TYPE_LAST RING_ALLOC_REQ_MPC_CHNLS_TYPE_PRIMATE - u8 unused_4[2]; + u8 rx_rate_profile_sel; + #define RING_ALLOC_REQ_RX_RATE_PROFILE_SEL_DEFAULT 0x0UL + #define RING_ALLOC_REQ_RX_RATE_PROFILE_SEL_POLL_MODE 0x1UL + #define RING_ALLOC_REQ_RX_RATE_PROFILE_SEL_LAST RING_ALLOC_REQ_RX_RATE_PROFILE_SEL_POLL_MODE + u8 unused_4; __le64 cq_handle; }; @@ -9122,6 +9181,39 @@ struct pcie_ctx_hw_stats { __le64 pcie_recovery_histogram; }; +/* pcie_ctx_hw_stats_v2 (size:4096b/512B) */ +struct pcie_ctx_hw_stats_v2 { + __le64 pcie_pl_signal_integrity; + __le64 pcie_dl_signal_integrity; + __le64 pcie_tl_signal_integrity; + __le64 pcie_link_integrity; + __le64 pcie_tx_traffic_rate; + __le64 pcie_rx_traffic_rate; + __le64 pcie_tx_dllp_statistics; + __le64 pcie_rx_dllp_statistics; + __le64 pcie_equalization_time; + __le32 pcie_ltssm_histogram[4]; + __le64 pcie_recovery_histogram; + __le32 pcie_tl_credit_nph_histogram[8]; + __le32 pcie_tl_credit_ph_histogram[8]; + __le32 pcie_tl_credit_pd_histogram[8]; + __le32 pcie_cmpl_latest_times[4]; + __le32 pcie_cmpl_longest_time; + __le32 pcie_cmpl_shortest_time; + __le32 unused_0[2]; + __le32 pcie_cmpl_latest_headers[4][4]; + __le32 pcie_cmpl_longest_headers[4][4]; + __le32 pcie_cmpl_shortest_headers[4][4]; + __le32 pcie_wr_latency_histogram[12]; + __le32 pcie_wr_latency_all_normal_count; + __le32 unused_1; + __le64 pcie_posted_packet_count; + __le64 pcie_non_posted_packet_count; + __le64 pcie_other_packet_count; + __le64 pcie_blocked_packet_count; + __le64 pcie_cmpl_packet_count; +}; + /* hwrm_stat_generic_qstats_input (size:256b/32B) */ struct hwrm_stat_generic_qstats_input { __le16 req_type; @@ -9317,6 +9409,9 @@ struct hwrm_struct_hdr { #define STRUCT_HDR_STRUCT_ID_LAST STRUCT_HDR_STRUCT_ID_UDCC_RTT_BUCKET_BOUND __le16 len; u8 version; + #define STRUCT_HDR_VERSION_0 0x0UL + #define STRUCT_HDR_VERSION_1 0x1UL + #define STRUCT_HDR_VERSION_LAST STRUCT_HDR_VERSION_1 u8 count; __le16 subtype; __le16 next_offset; -- 2.51.0 From 1b64544d634c33605dd112fb7349188b4c305f61 Mon Sep 17 00:00:00 2001 From: Michael Chan Date: Mon, 10 Mar 2025 11:31:28 -0700 Subject: [PATCH 03/16] bnxt_en: Refactor bnxt_get_module_eeprom_by_page() In preparation for adding .set_module_eeprom_by_page(), extract the common error checking done in bnxt_get_module_eeprom_by_page() into a new common function that can be re-used for .set_module_eeprom_by_page(). 
Signed-off-by: Michael Chan Link: https://patch.msgid.link/20250310183129.3154117-7-michael.chan@broadcom.com Reviewed-by: Jacob Keller Signed-off-by: Paolo Abeni --- .../net/ethernet/broadcom/bnxt/bnxt_ethtool.c | 21 +++++++++++++++---- 1 file changed, 17 insertions(+), 4 deletions(-) diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c b/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c index e031340bdce2..c0de8f0e722d 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c @@ -4541,11 +4541,11 @@ static int bnxt_get_module_status(struct bnxt *bp, struct netlink_ext_ack *extac return -EINVAL; } -static int bnxt_get_module_eeprom_by_page(struct net_device *dev, - const struct ethtool_module_eeprom *page_data, - struct netlink_ext_ack *extack) +static int +bnxt_mod_eeprom_by_page_precheck(struct bnxt *bp, + const struct ethtool_module_eeprom *page_data, + struct netlink_ext_ack *extack) { - struct bnxt *bp = netdev_priv(dev); int rc; if (BNXT_VF(bp) && !BNXT_VF_IS_TRUSTED(bp)) { @@ -4567,6 +4567,19 @@ static int bnxt_get_module_eeprom_by_page(struct net_device *dev, NL_SET_ERR_MSG_MOD(extack, "Firmware not capable for bank selection"); return -EINVAL; } + return 0; +} + +static int bnxt_get_module_eeprom_by_page(struct net_device *dev, + const struct ethtool_module_eeprom *page_data, + struct netlink_ext_ack *extack) +{ + struct bnxt *bp = netdev_priv(dev); + int rc; + + rc = bnxt_mod_eeprom_by_page_precheck(bp, page_data, extack); + if (rc) + return rc; rc = bnxt_read_sfp_module_eeprom_info(bp, page_data->i2c_address << 1, page_data->page, page_data->bank, -- 2.51.0 From c3be245dfc8a5506e0936e9a1224813edc506a0c Mon Sep 17 00:00:00 2001 From: Damodharam Ammepalli Date: Mon, 10 Mar 2025 11:31:29 -0700 Subject: [PATCH 04/16] bnxt_en: add .set_module_eeprom_by_page() support Add support for .set_module_eeprom_by_page() callback which implements generic solution for modules eeprom access. This implementation also supports CMIS 5.0.3 compliant eeprom FW download. 
Sample Usage: ethtool --flash-module-firmware enp177s0np0 file dummy.bin Signed-off-by: Damodharam Ammepalli Signed-off-by: Michael Chan Link: https://patch.msgid.link/20250310183129.3154117-8-michael.chan@broadcom.com Reviewed-by: Jacob Keller Signed-off-by: Paolo Abeni --- .../net/ethernet/broadcom/bnxt/bnxt_ethtool.c | 59 ++++++++++++++++++- 1 file changed, 58 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c b/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c index c0de8f0e722d..48dd5922e4dd 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c @@ -4550,7 +4550,7 @@ bnxt_mod_eeprom_by_page_precheck(struct bnxt *bp, if (BNXT_VF(bp) && !BNXT_VF_IS_TRUSTED(bp)) { NL_SET_ERR_MSG_MOD(extack, - "Module read not permitted on untrusted VF"); + "Module read/write not permitted on untrusted VF"); return -EPERM; } @@ -4593,6 +4593,62 @@ static int bnxt_get_module_eeprom_by_page(struct net_device *dev, return page_data->length; } +static int bnxt_write_sfp_module_eeprom_info(struct bnxt *bp, + const struct ethtool_module_eeprom *page) +{ + struct hwrm_port_phy_i2c_write_input *req; + int bytes_written = 0; + int rc; + + rc = hwrm_req_init(bp, req, HWRM_PORT_PHY_I2C_WRITE); + if (rc) + return rc; + + hwrm_req_hold(bp, req); + req->i2c_slave_addr = page->i2c_address << 1; + req->page_number = cpu_to_le16(page->page); + req->bank_number = page->bank; + req->port_id = cpu_to_le16(bp->pf.port_id); + req->enables = cpu_to_le32(PORT_PHY_I2C_WRITE_REQ_ENABLES_PAGE_OFFSET | + PORT_PHY_I2C_WRITE_REQ_ENABLES_BANK_NUMBER); + + while (bytes_written < page->length) { + u16 xfer_size; + + xfer_size = min_t(u16, page->length - bytes_written, + BNXT_MAX_PHY_I2C_RESP_SIZE); + req->page_offset = cpu_to_le16(page->offset + bytes_written); + req->data_length = xfer_size; + memcpy(req->data, page->data + bytes_written, xfer_size); + rc = hwrm_req_send(bp, req); + if (rc) + break; + bytes_written += xfer_size; + } + + hwrm_req_drop(bp, req); + return rc; +} + +static int bnxt_set_module_eeprom_by_page(struct net_device *dev, + const struct ethtool_module_eeprom *page_data, + struct netlink_ext_ack *extack) +{ + struct bnxt *bp = netdev_priv(dev); + int rc; + + rc = bnxt_mod_eeprom_by_page_precheck(bp, page_data, extack); + if (rc) + return rc; + + rc = bnxt_write_sfp_module_eeprom_info(bp, page_data); + if (rc) { + NL_SET_ERR_MSG_MOD(extack, "Module`s eeprom write failed"); + return rc; + } + return page_data->length; +} + static int bnxt_nway_reset(struct net_device *dev) { int rc = 0; @@ -5455,6 +5511,7 @@ const struct ethtool_ops bnxt_ethtool_ops = { .get_module_info = bnxt_get_module_info, .get_module_eeprom = bnxt_get_module_eeprom, .get_module_eeprom_by_page = bnxt_get_module_eeprom_by_page, + .set_module_eeprom_by_page = bnxt_set_module_eeprom_by_page, .nway_reset = bnxt_nway_reset, .set_phys_id = bnxt_set_phys_id, .self_test = bnxt_self_test, -- 2.51.0 From b407b4b804cd6193f45599aff82307c9e2a81225 Mon Sep 17 00:00:00 2001 From: Carolina Jubran Date: Mon, 10 Mar 2025 23:26:53 +0200 Subject: [PATCH 05/16] net/mlx5: Rename devlink rate parent set function for leaf nodes Rename `mlx5_esw_devlink_rate_parent_set()` to `mlx5_esw_devlink_rate_leaf_parent_set()` to distinguish setting a parent for leafs from nodes, which is not yet supported. 
Signed-off-by: Carolina Jubran Reviewed-by: Cosmin Ratiu Reviewed-by: Dragos Tatulea Signed-off-by: Tariq Toukan Link: https://patch.msgid.link/1741642016-44918-2-git-send-email-tariqt@nvidia.com Reviewed-by: Jacob Keller Signed-off-by: Paolo Abeni --- drivers/net/ethernet/mellanox/mlx5/core/devlink.c | 2 +- drivers/net/ethernet/mellanox/mlx5/core/esw/qos.c | 8 ++++---- drivers/net/ethernet/mellanox/mlx5/core/esw/qos.h | 8 ++++---- 3 files changed, 9 insertions(+), 9 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/devlink.c b/drivers/net/ethernet/mellanox/mlx5/core/devlink.c index a2cf3e79693d..2a9e3a43c1c8 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/devlink.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/devlink.c @@ -327,7 +327,7 @@ static const struct devlink_ops mlx5_devlink_ops = { .rate_node_tx_max_set = mlx5_esw_devlink_rate_node_tx_max_set, .rate_node_new = mlx5_esw_devlink_rate_node_new, .rate_node_del = mlx5_esw_devlink_rate_node_del, - .rate_leaf_parent_set = mlx5_esw_devlink_rate_parent_set, + .rate_leaf_parent_set = mlx5_esw_devlink_rate_leaf_parent_set, #endif #ifdef CONFIG_MLX5_SF_MANAGER .port_new = mlx5_devlink_sf_port_new, diff --git a/drivers/net/ethernet/mellanox/mlx5/core/esw/qos.c b/drivers/net/ethernet/mellanox/mlx5/core/esw/qos.c index 823c1ba456cd..c56027838a57 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/esw/qos.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/esw/qos.c @@ -986,10 +986,10 @@ int mlx5_esw_qos_vport_update_parent(struct mlx5_vport *vport, struct mlx5_esw_s return err; } -int mlx5_esw_devlink_rate_parent_set(struct devlink_rate *devlink_rate, - struct devlink_rate *parent, - void *priv, void *parent_priv, - struct netlink_ext_ack *extack) +int mlx5_esw_devlink_rate_leaf_parent_set(struct devlink_rate *devlink_rate, + struct devlink_rate *parent, + void *priv, void *parent_priv, + struct netlink_ext_ack *extack) { struct mlx5_esw_sched_node *node; struct mlx5_vport *vport = priv; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/esw/qos.h b/drivers/net/ethernet/mellanox/mlx5/core/esw/qos.h index 6eb8f6a648c8..43a40bda7d19 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/esw/qos.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/esw/qos.h @@ -29,10 +29,10 @@ int mlx5_esw_devlink_rate_node_new(struct devlink_rate *rate_node, void **priv, struct netlink_ext_ack *extack); int mlx5_esw_devlink_rate_node_del(struct devlink_rate *rate_node, void *priv, struct netlink_ext_ack *extack); -int mlx5_esw_devlink_rate_parent_set(struct devlink_rate *devlink_rate, - struct devlink_rate *parent, - void *priv, void *parent_priv, - struct netlink_ext_ack *extack); +int mlx5_esw_devlink_rate_leaf_parent_set(struct devlink_rate *devlink_rate, + struct devlink_rate *parent, + void *priv, void *parent_priv, + struct netlink_ext_ack *extack); #endif #endif -- 2.51.0 From 498bd79cb92be572258a587a4abee67d464ba1de Mon Sep 17 00:00:00 2001 From: Carolina Jubran Date: Mon, 10 Mar 2025 23:26:54 +0200 Subject: [PATCH 06/16] net/mlx5: Introduce hierarchy level tracking on scheduling nodes MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit Add a `level` field to `mlx5_esw_sched_node` to track the hierarchy depth of each scheduling node. This allows enforcement of the scheduling depth constraints based on `log_esw_max_sched_depth`. Modify `esw_qos_node_set_parent()` and `__esw_qos_alloc_node()` to correctly assign hierarchy levels. Ensure that nodes inherit their parent’s level incrementally. 
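To make the depth rule concrete, here is a minimal sketch (not the driver
code; the field and assignment mirror this patch, while the max-depth check
itself only lands later in this series): a node's level is its parent's
level plus one, root children start at 2, and a hierarchy stays valid while
the level fits within 1 << log_esw_max_sched_depth.

  /* Hedged illustration of the level bookkeeping, not mlx5 code. */
  struct sched_node {
          struct sched_node *parent;
          u8 level;                       /* the root TSAR is level 1 */
  };

  static void attach_to_parent(struct sched_node *node,
                               struct sched_node *parent)
  {
          /* Root children get level 2; everything else inherits +1. */
          node->level = parent ? parent->level + 1 : 2;
  }

  static bool level_within_cap(const struct sched_node *parent,
                               u8 log_max_sched_depth)
  {
          u8 new_level = parent ? parent->level + 1 : 2;

          return new_level <= (1u << log_max_sched_depth);
  }

For example, with log_esw_max_sched_depth = 2 the deepest allowed level is
4: root (1) -> node (2) -> node (3) -> vport (4).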
Signed-off-by: Carolina Jubran Reviewed-by: Cosmin Ratiu Reviewed-by: Dragos Tatulea Signed-off-by: Tariq Toukan Link: https://patch.msgid.link/1741642016-44918-3-git-send-email-tariqt@nvidia.com Reviewed-by: Jacob Keller Signed-off-by: Paolo Abeni --- .../net/ethernet/mellanox/mlx5/core/esw/qos.c | 19 ++++++++++++++++--- 1 file changed, 16 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/esw/qos.c b/drivers/net/ethernet/mellanox/mlx5/core/esw/qos.c index c56027838a57..959e4446327d 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/esw/qos.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/esw/qos.c @@ -90,8 +90,22 @@ struct mlx5_esw_sched_node { struct list_head children; /* Valid only if this node is associated with a vport. */ struct mlx5_vport *vport; + /* Level in the hierarchy. The root node level is 1. */ + u8 level; }; +static void esw_qos_node_attach_to_parent(struct mlx5_esw_sched_node *node) +{ + if (!node->parent) { + /* Root children are assigned a depth level of 2. */ + node->level = 2; + list_add_tail(&node->entry, &node->esw->qos.domain->nodes); + } else { + node->level = node->parent->level + 1; + list_add_tail(&node->entry, &node->parent->children); + } +} + static void esw_qos_node_set_parent(struct mlx5_esw_sched_node *node, struct mlx5_esw_sched_node *parent) { @@ -99,6 +113,7 @@ esw_qos_node_set_parent(struct mlx5_esw_sched_node *node, struct mlx5_esw_sched_ node->parent = parent; list_add_tail(&node->entry, &parent->children); node->esw = parent->esw; + node->level = parent->level + 1; } void mlx5_esw_qos_vport_qos_free(struct mlx5_vport *vport) @@ -358,7 +373,6 @@ static struct mlx5_esw_sched_node * __esw_qos_alloc_node(struct mlx5_eswitch *esw, u32 tsar_ix, enum sched_node_type type, struct mlx5_esw_sched_node *parent) { - struct list_head *parent_children; struct mlx5_esw_sched_node *node; node = kzalloc(sizeof(*node), GFP_KERNEL); @@ -370,8 +384,7 @@ __esw_qos_alloc_node(struct mlx5_eswitch *esw, u32 tsar_ix, enum sched_node_type node->type = type; node->parent = parent; INIT_LIST_HEAD(&node->children); - parent_children = parent ? &parent->children : &esw->qos.domain->nodes; - list_add_tail(&node->entry, parent_children); + esw_qos_node_attach_to_parent(node); return node; } -- 2.51.0 From f88c349c75e3784a3f5463f5b403ff28dd823782 Mon Sep 17 00:00:00 2001 From: Carolina Jubran Date: Mon, 10 Mar 2025 23:26:55 +0200 Subject: [PATCH 07/16] net/mlx5: Preserve rate settings when creating a rate node Modify `esw_qos_create_node_sched_elem()` to receive max_rate and bw_share values while maintaining the previous configuration. This change is essential for the upcoming patch that will modify rate nodes and requires the existing settings to be preserved unless explicitly changed. 
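In short, existing call sites keep passing zeroes (leaving the TSAR rates
unconfigured), while the re-create path added later in the series can carry
a node's previous settings across. A condensed sketch of the two call
forms, taken from this patch and the next one:

  /* Fresh node under the root TSAR: no max rate, no bw share yet. */
  err = esw_qos_create_node_sched_elem(esw->dev, esw->qos.root_tsar_ix,
                                       0, 0, &tsar_ix);

  /* Re-create an existing node, preserving its configured rates. */
  err = esw_qos_create_node_sched_elem(esw->dev, parent_ix,
                                       node->max_rate, node->bw_share,
                                       &node->ix);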
Signed-off-by: Carolina Jubran Reviewed-by: Dragos Tatulea Signed-off-by: Tariq Toukan Link: https://patch.msgid.link/1741642016-44918-4-git-send-email-tariqt@nvidia.com Reviewed-by: Jacob Keller Signed-off-by: Paolo Abeni --- drivers/net/ethernet/mellanox/mlx5/core/esw/qos.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/esw/qos.c b/drivers/net/ethernet/mellanox/mlx5/core/esw/qos.c index 959e4446327d..3c850efb4ca3 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/esw/qos.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/esw/qos.c @@ -320,8 +320,9 @@ static int esw_qos_set_node_min_rate(struct mlx5_esw_sched_node *node, return 0; } -static int esw_qos_create_node_sched_elem(struct mlx5_core_dev *dev, u32 parent_element_id, - u32 *tsar_ix) +static int +esw_qos_create_node_sched_elem(struct mlx5_core_dev *dev, u32 parent_element_id, + u32 max_rate, u32 bw_share, u32 *tsar_ix) { u32 tsar_ctx[MLX5_ST_SZ_DW(scheduling_context)] = {}; void *attr; @@ -338,6 +339,8 @@ static int esw_qos_create_node_sched_elem(struct mlx5_core_dev *dev, u32 parent_ SCHEDULING_CONTEXT_ELEMENT_TYPE_TSAR); MLX5_SET(scheduling_context, tsar_ctx, parent_element_id, parent_element_id); + MLX5_SET(scheduling_context, tsar_ctx, max_average_bw, max_rate); + MLX5_SET(scheduling_context, tsar_ctx, bw_share, bw_share); attr = MLX5_ADDR_OF(scheduling_context, tsar_ctx, element_attributes); MLX5_SET(tsar_element, attr, tsar_type, TSAR_ELEMENT_TSAR_TYPE_DWRR); @@ -409,7 +412,8 @@ __esw_qos_create_vports_sched_node(struct mlx5_eswitch *esw, struct mlx5_esw_sch u32 tsar_ix; int err; - err = esw_qos_create_node_sched_elem(esw->dev, esw->qos.root_tsar_ix, &tsar_ix); + err = esw_qos_create_node_sched_elem(esw->dev, esw->qos.root_tsar_ix, 0, + 0, &tsar_ix); if (err) { NL_SET_ERR_MSG_MOD(extack, "E-Switch create TSAR for node failed"); return ERR_PTR(err); @@ -476,7 +480,8 @@ static int esw_qos_create(struct mlx5_eswitch *esw, struct netlink_ext_ack *exta if (!MLX5_CAP_GEN(dev, qos) || !MLX5_CAP_QOS(dev, esw_scheduling)) return -EOPNOTSUPP; - err = esw_qos_create_node_sched_elem(esw->dev, 0, &esw->qos.root_tsar_ix); + err = esw_qos_create_node_sched_elem(esw->dev, 0, 0, 0, + &esw->qos.root_tsar_ix); if (err) { esw_warn(dev, "E-Switch create root TSAR failed (%d)\n", err); return err; -- 2.51.0 From 9c7bbf4c3304f139faad4b753241963f875b4195 Mon Sep 17 00:00:00 2001 From: Carolina Jubran Date: Mon, 10 Mar 2025 23:26:56 +0200 Subject: [PATCH 08/16] net/mlx5: Add support for setting parent of nodes Introduce `mlx5_esw_devlink_rate_node_parent_set()` to allow assigning a parent to scheduling nodes. Implement `mlx5_esw_qos_node_update_parent()` and `mlx5_esw_qos_node_validate_set_parent()` to enforce constraints on node reassignment. Don't allow reassignment of nodes with active rate objects. Update `esw_qos_node_set_parent()` to handle cases where the parent is NULL. A NULL parent indicates that the scheduling element is attached to the root scheduling element, and since only rate nodes can be connected to the root, this update is now necessary. 
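Sample usage from userspace (device and node names are illustrative;
assumes the iproute2 devlink-rate syntax):

  devlink port function rate add pci/0000:03:00.0/group_a
  devlink port function rate add pci/0000:03:00.0/group_b
  devlink port function rate set pci/0000:03:00.0/group_b parent group_a

  # detach the node again, reattaching it to the root
  devlink port function rate set pci/0000:03:00.0/group_b noparent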
Signed-off-by: Carolina Jubran Reviewed-by: Cosmin Ratiu Reviewed-by: Dragos Tatulea Signed-off-by: Tariq Toukan Link: https://patch.msgid.link/1741642016-44918-5-git-send-email-tariqt@nvidia.com Reviewed-by: Jacob Keller Signed-off-by: Paolo Abeni --- .../net/ethernet/mellanox/mlx5/core/devlink.c | 1 + .../net/ethernet/mellanox/mlx5/core/esw/qos.c | 108 +++++++++++++++++- .../net/ethernet/mellanox/mlx5/core/esw/qos.h | 4 + 3 files changed, 110 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/devlink.c b/drivers/net/ethernet/mellanox/mlx5/core/devlink.c index 2a9e3a43c1c8..73cd74644378 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/devlink.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/devlink.c @@ -328,6 +328,7 @@ static const struct devlink_ops mlx5_devlink_ops = { .rate_node_new = mlx5_esw_devlink_rate_node_new, .rate_node_del = mlx5_esw_devlink_rate_node_del, .rate_leaf_parent_set = mlx5_esw_devlink_rate_leaf_parent_set, + .rate_node_parent_set = mlx5_esw_devlink_rate_node_parent_set, #endif #ifdef CONFIG_MLX5_SF_MANAGER .port_new = mlx5_devlink_sf_port_new, diff --git a/drivers/net/ethernet/mellanox/mlx5/core/esw/qos.c b/drivers/net/ethernet/mellanox/mlx5/core/esw/qos.c index 3c850efb4ca3..b6ae384396b3 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/esw/qos.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/esw/qos.c @@ -111,9 +111,9 @@ esw_qos_node_set_parent(struct mlx5_esw_sched_node *node, struct mlx5_esw_sched_ { list_del_init(&node->entry); node->parent = parent; - list_add_tail(&node->entry, &parent->children); - node->esw = parent->esw; - node->level = parent->level + 1; + if (parent) + node->esw = parent->esw; + esw_qos_node_attach_to_parent(node); } void mlx5_esw_qos_vport_qos_free(struct mlx5_vport *vport) @@ -1018,3 +1018,105 @@ int mlx5_esw_devlink_rate_leaf_parent_set(struct devlink_rate *devlink_rate, node = parent_priv; return mlx5_esw_qos_vport_update_parent(vport, node, extack); } + +static int +mlx5_esw_qos_node_validate_set_parent(struct mlx5_esw_sched_node *node, + struct mlx5_esw_sched_node *parent, + struct netlink_ext_ack *extack) +{ + u8 new_level, max_level; + + if (parent && parent->esw != node->esw) { + NL_SET_ERR_MSG_MOD(extack, + "Cannot assign node to another E-Switch"); + return -EOPNOTSUPP; + } + + if (!list_empty(&node->children)) { + NL_SET_ERR_MSG_MOD(extack, + "Cannot reassign a node that contains rate objects"); + return -EOPNOTSUPP; + } + + new_level = parent ? parent->level + 1 : 2; + max_level = 1 << MLX5_CAP_QOS(node->esw->dev, log_esw_max_sched_depth); + if (new_level > max_level) { + NL_SET_ERR_MSG_MOD(extack, + "Node hierarchy depth exceeds the maximum supported level"); + return -EOPNOTSUPP; + } + + return 0; +} + +static int esw_qos_vports_node_update_parent(struct mlx5_esw_sched_node *node, + struct mlx5_esw_sched_node *parent, + struct netlink_ext_ack *extack) +{ + struct mlx5_esw_sched_node *curr_parent = node->parent; + struct mlx5_eswitch *esw = node->esw; + u32 parent_ix; + int err; + + parent_ix = parent ? 
parent->ix : node->esw->qos.root_tsar_ix; + mlx5_destroy_scheduling_element_cmd(esw->dev, + SCHEDULING_HIERARCHY_E_SWITCH, + node->ix); + err = esw_qos_create_node_sched_elem(esw->dev, parent_ix, + node->max_rate, 0, &node->ix); + if (err) { + NL_SET_ERR_MSG_MOD(extack, + "Failed to create a node under the new hierarchy."); + if (esw_qos_create_node_sched_elem(esw->dev, curr_parent->ix, + node->max_rate, + node->bw_share, + &node->ix)) + esw_warn(esw->dev, "Node restore QoS failed\n"); + + return err; + } + esw_qos_node_set_parent(node, parent); + + return 0; +} + +static int mlx5_esw_qos_node_update_parent(struct mlx5_esw_sched_node *node, + struct mlx5_esw_sched_node *parent, + struct netlink_ext_ack *extack) +{ + struct mlx5_esw_sched_node *curr_parent; + struct mlx5_eswitch *esw = node->esw; + int err; + + err = mlx5_esw_qos_node_validate_set_parent(node, parent, extack); + if (err) + return err; + + esw_qos_lock(esw); + curr_parent = node->parent; + err = esw_qos_vports_node_update_parent(node, parent, extack); + if (err) + goto out; + + esw_qos_normalize_min_rate(esw, curr_parent, extack); + esw_qos_normalize_min_rate(esw, parent, extack); + +out: + esw_qos_unlock(esw); + + return err; +} + +int mlx5_esw_devlink_rate_node_parent_set(struct devlink_rate *devlink_rate, + struct devlink_rate *parent, + void *priv, void *parent_priv, + struct netlink_ext_ack *extack) +{ + struct mlx5_esw_sched_node *node = priv, *parent_node; + + if (!parent) + return mlx5_esw_qos_node_update_parent(node, NULL, extack); + + parent_node = parent_priv; + return mlx5_esw_qos_node_update_parent(node, parent_node, extack); +} diff --git a/drivers/net/ethernet/mellanox/mlx5/core/esw/qos.h b/drivers/net/ethernet/mellanox/mlx5/core/esw/qos.h index 43a40bda7d19..ed40ec8f027e 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/esw/qos.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/esw/qos.h @@ -33,6 +33,10 @@ int mlx5_esw_devlink_rate_leaf_parent_set(struct devlink_rate *devlink_rate, struct devlink_rate *parent, void *priv, void *parent_priv, struct netlink_ext_ack *extack); +int mlx5_esw_devlink_rate_node_parent_set(struct devlink_rate *devlink_rate, + struct devlink_rate *parent, + void *priv, void *parent_priv, + struct netlink_ext_ack *extack); #endif #endif -- 2.51.0 From f5825e79b2b7b1b0912c219d24cd7aa3eb3e300d Mon Sep 17 00:00:00 2001 From: Chen Ni Date: Tue, 11 Mar 2025 15:06:24 +0800 Subject: [PATCH 09/16] qed: remove cast to pointers passed to kfree Remove unnecessary casts to pointer types passed to kfree. Issue detected by coccinelle: @@ type t1; expression *e; @@ -kfree((t1 *)e); +kfree(e); Signed-off-by: Chen Ni Reviewed-by: Michal Kubiak Link: https://patch.msgid.link/20250311070624.1037787-1-nichen@iscas.ac.cn Signed-off-by: Paolo Abeni --- drivers/net/ethernet/qlogic/qed/qed_main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/qlogic/qed/qed_main.c b/drivers/net/ethernet/qlogic/qed/qed_main.c index f915c423fe70..886061d7351a 100644 --- a/drivers/net/ethernet/qlogic/qed/qed_main.c +++ b/drivers/net/ethernet/qlogic/qed/qed_main.c @@ -454,7 +454,7 @@ int qed_fill_dev_info(struct qed_dev *cdev, static void qed_free_cdev(struct qed_dev *cdev) { - kfree((void *)cdev); + kfree(cdev); } static struct qed_dev *qed_alloc_cdev(struct pci_dev *pdev) -- 2.51.0 From 8d4880db378350f8ed8969feea13bdc164564fc1 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Tue, 11 Mar 2025 21:42:28 +0100 Subject: [PATCH 10/16] udp_tunnel: create a fastpath GRO lookup. 
Most UDP tunnels bind a socket to a local port, with ANY address, no peer and no interface index specified. Additionally it's quite common to have a single tunnel device per namespace. Track in each namespace the UDP tunnel socket respecting the above. When only a single one is present, store a reference in the netns. When such reference is not NULL, UDP tunnel GRO lookup just need to match the incoming packet destination port vs the socket local port. The tunnel socket never sets the reuse[port] flag[s]. When bound to no address and interface, no other socket can exist in the same netns matching the specified local port. Matching packets with non-local destination addresses will be aggregated, and eventually segmented as needed - no behavior changes intended. Note that the UDP tunnel socket reference is stored into struct netns_ipv4 for both IPv4 and IPv6 tunnels. That is intentional to keep all the fastpath-related netns fields in the same struct and allow cacheline-based optimization. Currently both the IPv4 and IPv6 socket pointer share the same cacheline as the `udp_table` field. Reviewed-by: Willem de Bruijn Link: https://patch.msgid.link/4d5c319c4471161829f50cb8436841de81a5edae.1741718157.git.pabeni@redhat.com Signed-off-by: Paolo Abeni --- include/linux/udp.h | 16 ++++++++++++++++ include/net/netns/ipv4.h | 11 +++++++++++ include/net/udp.h | 1 + include/net/udp_tunnel.h | 18 ++++++++++++++++++ net/ipv4/udp.c | 13 ++++++++++++- net/ipv4/udp_offload.c | 37 +++++++++++++++++++++++++++++++++++++ net/ipv4/udp_tunnel_core.c | 12 ++++++++++++ net/ipv6/udp.c | 2 ++ net/ipv6/udp_offload.c | 5 +++++ 9 files changed, 114 insertions(+), 1 deletion(-) diff --git a/include/linux/udp.h b/include/linux/udp.h index 0807e21cfec9..895240177f4f 100644 --- a/include/linux/udp.h +++ b/include/linux/udp.h @@ -101,6 +101,13 @@ struct udp_sock { /* Cache friendly copy of sk->sk_peek_off >= 0 */ bool peeking_with_offset; + + /* + * Accounting for the tunnel GRO fastpath. + * Unprotected by compilers guard, as it uses space available in + * the last UDP socket cacheline. + */ + struct hlist_node tunnel_list; }; #define udp_test_bit(nr, sk) \ @@ -219,4 +226,13 @@ static inline void udp_allow_gso(struct sock *sk) #define IS_UDPLITE(__sk) (__sk->sk_protocol == IPPROTO_UDPLITE) +static inline struct sock *udp_tunnel_sk(const struct net *net, bool is_ipv6) +{ +#if IS_ENABLED(CONFIG_NET_UDP_TUNNEL) + return rcu_dereference(net->ipv4.udp_tunnel_gro[is_ipv6].sk); +#else + return NULL; +#endif +} + #endif /* _LINUX_UDP_H */ diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index 650b2dc9199f..6373e3f17da8 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -47,6 +47,11 @@ struct sysctl_fib_multipath_hash_seed { }; #endif +struct udp_tunnel_gro { + struct sock __rcu *sk; + struct hlist_head list; +}; + struct netns_ipv4 { /* Cacheline organization can be found documented in * Documentation/networking/net_cachelines/netns_ipv4_sysctl.rst. 
@@ -85,6 +90,11 @@ struct netns_ipv4 { struct inet_timewait_death_row tcp_death_row; struct udp_table *udp_table; +#if IS_ENABLED(CONFIG_NET_UDP_TUNNEL) + /* Not in a pernet subsys because need to be available at GRO stage */ + struct udp_tunnel_gro udp_tunnel_gro[2]; +#endif + #ifdef CONFIG_SYSCTL struct ctl_table_header *forw_hdr; struct ctl_table_header *frags_hdr; @@ -277,4 +287,5 @@ struct netns_ipv4 { struct hlist_head *inet_addr_lst; struct delayed_work addr_chk_work; }; + #endif diff --git a/include/net/udp.h b/include/net/udp.h index 6e89520e100d..a772510b2aa5 100644 --- a/include/net/udp.h +++ b/include/net/udp.h @@ -290,6 +290,7 @@ static inline void udp_lib_init_sock(struct sock *sk) struct udp_sock *up = udp_sk(sk); skb_queue_head_init(&up->reader_queue); + INIT_HLIST_NODE(&up->tunnel_list); up->forward_threshold = sk->sk_rcvbuf >> 2; set_bit(SOCK_CUSTOM_SOCKOPT, &sk->sk_socket->flags); } diff --git a/include/net/udp_tunnel.h b/include/net/udp_tunnel.h index a93dc51f6323..eda0f3e2f65f 100644 --- a/include/net/udp_tunnel.h +++ b/include/net/udp_tunnel.h @@ -203,6 +203,24 @@ static inline void udp_tunnel_encap_enable(struct sock *sk) udp_encap_enable(); } +#if IS_ENABLED(CONFIG_NET_UDP_TUNNEL) +void udp_tunnel_update_gro_lookup(struct net *net, struct sock *sk, bool add); +#else +static inline void udp_tunnel_update_gro_lookup(struct net *net, + struct sock *sk, bool add) {} +#endif + +static inline void udp_tunnel_cleanup_gro(struct sock *sk) +{ + struct udp_sock *up = udp_sk(sk); + struct net *net = sock_net(sk); + + if (!up->tunnel_list.pprev) + return; + + udp_tunnel_update_gro_lookup(net, sk, false); +} + #define UDP_TUNNEL_NIC_MAX_TABLES 4 enum udp_tunnel_nic_info_flags { diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index d0bffcfa56d8..db606f7e4163 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -2891,8 +2891,10 @@ void udp_destroy_sock(struct sock *sk) if (encap_destroy) encap_destroy(sk); } - if (udp_test_bit(ENCAP_ENABLED, sk)) + if (udp_test_bit(ENCAP_ENABLED, sk)) { static_branch_dec(&udp_encap_needed_key); + udp_tunnel_cleanup_gro(sk); + } } } @@ -3804,6 +3806,15 @@ fallback: static int __net_init udp_pernet_init(struct net *net) { +#if IS_ENABLED(CONFIG_NET_UDP_TUNNEL) + int i; + + /* No tunnel is configured */ + for (i = 0; i < ARRAY_SIZE(net->ipv4.udp_tunnel_gro); ++i) { + INIT_HLIST_HEAD(&net->ipv4.udp_tunnel_gro[i].list); + RCU_INIT_POINTER(net->ipv4.udp_tunnel_gro[i].sk, NULL); + } +#endif udp_sysctl_init(net); udp_set_table(net); diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c index 2c0725583be3..e36d8a234848 100644 --- a/net/ipv4/udp_offload.c +++ b/net/ipv4/udp_offload.c @@ -12,6 +12,38 @@ #include #include #include +#include + +#if IS_ENABLED(CONFIG_NET_UDP_TUNNEL) +static DEFINE_SPINLOCK(udp_tunnel_gro_lock); + +void udp_tunnel_update_gro_lookup(struct net *net, struct sock *sk, bool add) +{ + bool is_ipv6 = sk->sk_family == AF_INET6; + struct udp_sock *tup, *up = udp_sk(sk); + struct udp_tunnel_gro *udp_tunnel_gro; + + spin_lock(&udp_tunnel_gro_lock); + udp_tunnel_gro = &net->ipv4.udp_tunnel_gro[is_ipv6]; + if (add) + hlist_add_head(&up->tunnel_list, &udp_tunnel_gro->list); + else + hlist_del_init(&up->tunnel_list); + + if (udp_tunnel_gro->list.first && + !udp_tunnel_gro->list.first->next) { + tup = hlist_entry(udp_tunnel_gro->list.first, struct udp_sock, + tunnel_list); + + rcu_assign_pointer(udp_tunnel_gro->sk, (struct sock *)tup); + } else { + RCU_INIT_POINTER(udp_tunnel_gro->sk, NULL); + } + + spin_unlock(&udp_tunnel_gro_lock); +} 
+EXPORT_SYMBOL_GPL(udp_tunnel_update_gro_lookup); +#endif static struct sk_buff *__skb_udp_tunnel_segment(struct sk_buff *skb, netdev_features_t features, @@ -635,8 +667,13 @@ static struct sock *udp4_gro_lookup_skb(struct sk_buff *skb, __be16 sport, { const struct iphdr *iph = skb_gro_network_header(skb); struct net *net = dev_net_rcu(skb->dev); + struct sock *sk; int iif, sdif; + sk = udp_tunnel_sk(net, false); + if (sk && dport == htons(sk->sk_num)) + return sk; + inet_get_iif_sdif(skb, &iif, &sdif); return __udp4_lib_lookup(net, iph->saddr, sport, diff --git a/net/ipv4/udp_tunnel_core.c b/net/ipv4/udp_tunnel_core.c index 619a53eb672d..b5695826e57a 100644 --- a/net/ipv4/udp_tunnel_core.c +++ b/net/ipv4/udp_tunnel_core.c @@ -58,6 +58,15 @@ error: } EXPORT_SYMBOL(udp_sock_create4); +static bool sk_saddr_any(struct sock *sk) +{ +#if IS_ENABLED(CONFIG_IPV6) + return ipv6_addr_any(&sk->sk_v6_rcv_saddr); +#else + return !sk->sk_rcv_saddr; +#endif +} + void setup_udp_tunnel_sock(struct net *net, struct socket *sock, struct udp_tunnel_sock_cfg *cfg) { @@ -80,6 +89,9 @@ void setup_udp_tunnel_sock(struct net *net, struct socket *sock, udp_sk(sk)->gro_complete = cfg->gro_complete; udp_tunnel_encap_enable(sk); + + if (!sk->sk_dport && !sk->sk_bound_dev_if && sk_saddr_any(sock->sk)) + udp_tunnel_update_gro_lookup(net, sock->sk, true); } EXPORT_SYMBOL_GPL(setup_udp_tunnel_sock); diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index 024458ef163c..7317f8e053f1 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -46,6 +46,7 @@ #include #include #include +#include #include #include #include @@ -1825,6 +1826,7 @@ void udpv6_destroy_sock(struct sock *sk) if (udp_test_bit(ENCAP_ENABLED, sk)) { static_branch_dec(&udpv6_encap_needed_key); udp_encap_disable(); + udp_tunnel_cleanup_gro(sk); } } } diff --git a/net/ipv6/udp_offload.c b/net/ipv6/udp_offload.c index 404212dfc99a..d8445ac1b2e4 100644 --- a/net/ipv6/udp_offload.c +++ b/net/ipv6/udp_offload.c @@ -118,8 +118,13 @@ static struct sock *udp6_gro_lookup_skb(struct sk_buff *skb, __be16 sport, { const struct ipv6hdr *iph = skb_gro_network_header(skb); struct net *net = dev_net_rcu(skb->dev); + struct sock *sk; int iif, sdif; + sk = udp_tunnel_sk(net, true); + if (sk && dport == htons(sk->sk_num)) + return sk; + inet6_get_iif_sdif(skb, &iif, &sdif); return __udp6_lib_lookup(net, &iph->saddr, sport, -- 2.51.0 From 311b36574ceaccfa3f91b74054a09cd4bb877702 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Tue, 11 Mar 2025 21:42:29 +0100 Subject: [PATCH 11/16] udp_tunnel: use static call for GRO hooks when possible It's quite common to have a single UDP tunnel type active in the whole system. In such a case we can replace the indirect call for the UDP tunnel GRO callback with a static call. Add the related accounting in the control path and switch to static call when possible. To keep the code simple use a static array for the registered tunnel types, and size such array based on the kernel config. 
Reviewed-by: Willem de Bruijn Link: https://patch.msgid.link/6fd1f9c7651151493ecab174e7b8386a1534170d.1741718157.git.pabeni@redhat.com Signed-off-by: Paolo Abeni --- include/net/udp_tunnel.h | 4 ++ net/ipv4/udp_offload.c | 130 ++++++++++++++++++++++++++++++++++++- net/ipv4/udp_tunnel_core.c | 2 + 3 files changed, 135 insertions(+), 1 deletion(-) diff --git a/include/net/udp_tunnel.h b/include/net/udp_tunnel.h index eda0f3e2f65f..a7b230867eb1 100644 --- a/include/net/udp_tunnel.h +++ b/include/net/udp_tunnel.h @@ -205,9 +205,11 @@ static inline void udp_tunnel_encap_enable(struct sock *sk) #if IS_ENABLED(CONFIG_NET_UDP_TUNNEL) void udp_tunnel_update_gro_lookup(struct net *net, struct sock *sk, bool add); +void udp_tunnel_update_gro_rcv(struct sock *sk, bool add); #else static inline void udp_tunnel_update_gro_lookup(struct net *net, struct sock *sk, bool add) {} +static inline void udp_tunnel_update_gro_rcv(struct sock *sk, bool add) {} #endif static inline void udp_tunnel_cleanup_gro(struct sock *sk) @@ -215,6 +217,8 @@ static inline void udp_tunnel_cleanup_gro(struct sock *sk) struct udp_sock *up = udp_sk(sk); struct net *net = sock_net(sk); + udp_tunnel_update_gro_rcv(sk, false); + if (!up->tunnel_list.pprev) return; diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c index e36d8a234848..088aa8cb8ac0 100644 --- a/net/ipv4/udp_offload.c +++ b/net/ipv4/udp_offload.c @@ -15,6 +15,37 @@ #include #if IS_ENABLED(CONFIG_NET_UDP_TUNNEL) + +/* + * Dummy GRO tunnel callback, exists mainly to avoid dangling/NULL + * values for the udp tunnel static call. + */ +static struct sk_buff *dummy_gro_rcv(struct sock *sk, + struct list_head *head, + struct sk_buff *skb) +{ + NAPI_GRO_CB(skb)->flush = 1; + return NULL; +} + +typedef struct sk_buff *(*udp_tunnel_gro_rcv_t)(struct sock *sk, + struct list_head *head, + struct sk_buff *skb); + +struct udp_tunnel_type_entry { + udp_tunnel_gro_rcv_t gro_receive; + refcount_t count; +}; + +#define UDP_MAX_TUNNEL_TYPES (IS_ENABLED(CONFIG_GENEVE) + \ + IS_ENABLED(CONFIG_VXLAN) * 2 + \ + IS_ENABLED(CONFIG_NET_FOU) * 2) + +DEFINE_STATIC_CALL(udp_tunnel_gro_rcv, dummy_gro_rcv); +static DEFINE_STATIC_KEY_FALSE(udp_tunnel_static_call); +static struct mutex udp_tunnel_gro_type_lock; +static struct udp_tunnel_type_entry udp_tunnel_gro_types[UDP_MAX_TUNNEL_TYPES]; +static unsigned int udp_tunnel_gro_type_nr; static DEFINE_SPINLOCK(udp_tunnel_gro_lock); void udp_tunnel_update_gro_lookup(struct net *net, struct sock *sk, bool add) @@ -43,6 +74,101 @@ void udp_tunnel_update_gro_lookup(struct net *net, struct sock *sk, bool add) spin_unlock(&udp_tunnel_gro_lock); } EXPORT_SYMBOL_GPL(udp_tunnel_update_gro_lookup); + +void udp_tunnel_update_gro_rcv(struct sock *sk, bool add) +{ + struct udp_tunnel_type_entry *cur = NULL; + struct udp_sock *up = udp_sk(sk); + int i, old_gro_type_nr; + + if (!up->gro_receive) + return; + + mutex_lock(&udp_tunnel_gro_type_lock); + for (i = 0; i < udp_tunnel_gro_type_nr; i++) + if (udp_tunnel_gro_types[i].gro_receive == up->gro_receive) + cur = &udp_tunnel_gro_types[i]; + + old_gro_type_nr = udp_tunnel_gro_type_nr; + if (add) { + /* + * Update the matching entry, if found, or add a new one + * if needed + */ + if (cur) { + refcount_inc(&cur->count); + goto out; + } + + if (unlikely(udp_tunnel_gro_type_nr == UDP_MAX_TUNNEL_TYPES)) { + pr_err_once("Too many UDP tunnel types, please increase UDP_MAX_TUNNEL_TYPES\n"); + /* Ensure static call will never be enabled */ + udp_tunnel_gro_type_nr = UDP_MAX_TUNNEL_TYPES + 2; + goto out; + } + + cur = 
&udp_tunnel_gro_types[udp_tunnel_gro_type_nr++]; + refcount_set(&cur->count, 1); + cur->gro_receive = up->gro_receive; + } else { + /* + * The stack cleanups only successfully added tunnel, the + * lookup on removal should never fail. + */ + if (WARN_ON_ONCE(!cur)) + goto out; + + if (!refcount_dec_and_test(&cur->count)) + goto out; + + /* avoid gaps, so that the enable tunnel has always id 0 */ + *cur = udp_tunnel_gro_types[--udp_tunnel_gro_type_nr]; + } + + if (udp_tunnel_gro_type_nr == 1) { + static_call_update(udp_tunnel_gro_rcv, + udp_tunnel_gro_types[0].gro_receive); + static_branch_enable(&udp_tunnel_static_call); + } else if (old_gro_type_nr == 1) { + static_branch_disable(&udp_tunnel_static_call); + static_call_update(udp_tunnel_gro_rcv, dummy_gro_rcv); + } + +out: + mutex_unlock(&udp_tunnel_gro_type_lock); +} +EXPORT_SYMBOL_GPL(udp_tunnel_update_gro_rcv); + +static void udp_tunnel_gro_init(void) +{ + mutex_init(&udp_tunnel_gro_type_lock); +} + +static struct sk_buff *udp_tunnel_gro_rcv(struct sock *sk, + struct list_head *head, + struct sk_buff *skb) +{ + if (static_branch_likely(&udp_tunnel_static_call)) { + if (unlikely(gro_recursion_inc_test(skb))) { + NAPI_GRO_CB(skb)->flush |= 1; + return NULL; + } + return static_call(udp_tunnel_gro_rcv)(sk, head, skb); + } + return call_gro_receive_sk(udp_sk(sk)->gro_receive, sk, head, skb); +} + +#else + +static void udp_tunnel_gro_init(void) {} + +static struct sk_buff *udp_tunnel_gro_rcv(struct sock *sk, + struct list_head *head, + struct sk_buff *skb) +{ + return call_gro_receive_sk(udp_sk(sk)->gro_receive, sk, head, skb); +} + #endif static struct sk_buff *__skb_udp_tunnel_segment(struct sk_buff *skb, @@ -654,7 +780,7 @@ struct sk_buff *udp_gro_receive(struct list_head *head, struct sk_buff *skb, skb_gro_pull(skb, sizeof(struct udphdr)); /* pull encapsulating udp header */ skb_gro_postpull_rcsum(skb, uh, sizeof(struct udphdr)); - pp = call_gro_receive_sk(udp_sk(sk)->gro_receive, sk, head, skb); + pp = udp_tunnel_gro_rcv(sk, head, skb); out: skb_gro_flush_final(skb, pp, flush); @@ -804,5 +930,7 @@ int __init udpv4_offload_init(void) .gro_complete = udp4_gro_complete, }, }; + + udp_tunnel_gro_init(); return inet_add_offload(&net_hotdata.udpv4_offload, IPPROTO_UDP); } diff --git a/net/ipv4/udp_tunnel_core.c b/net/ipv4/udp_tunnel_core.c index b5695826e57a..c49fceea8313 100644 --- a/net/ipv4/udp_tunnel_core.c +++ b/net/ipv4/udp_tunnel_core.c @@ -90,6 +90,8 @@ void setup_udp_tunnel_sock(struct net *net, struct socket *sock, udp_tunnel_encap_enable(sk); + udp_tunnel_update_gro_rcv(sock->sk, true); + if (!sk->sk_dport && !sk->sk_bound_dev_if && sk_saddr_any(sock->sk)) udp_tunnel_update_gro_lookup(net, sock->sk, true); } -- 2.51.0 From 24faa63bcea88b6f24b0a3a710708505a876f9ba Mon Sep 17 00:00:00 2001 From: Yue Haibing Date: Wed, 12 Mar 2025 14:34:50 +0800 Subject: [PATCH 12/16] net: skbuff: Remove unused skb_add_data() Since commit a4ea4c477619 ("rxrpc: Don't use a ring buffer for call Tx queue") this function is not used anymore. 
Signed-off-by: Yue Haibing Reviewed-by: Simon Horman Link: https://patch.msgid.link/20250312063450.183652-1-yuehaibing@huawei.com Signed-off-by: Paolo Abeni --- include/linux/skbuff.h | 19 ------------------- 1 file changed, 19 deletions(-) diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index b8a1343d6785..cd8294cdc249 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -3867,25 +3867,6 @@ static inline int __must_check skb_put_padto(struct sk_buff *skb, unsigned int l bool csum_and_copy_from_iter_full(void *addr, size_t bytes, __wsum *csum, struct iov_iter *i) __must_check; -static inline int skb_add_data(struct sk_buff *skb, - struct iov_iter *from, int copy) -{ - const int off = skb->len; - - if (skb->ip_summed == CHECKSUM_NONE) { - __wsum csum = 0; - if (csum_and_copy_from_iter_full(skb_put(skb, copy), copy, - &csum, from)) { - skb->csum = csum_block_add(skb->csum, csum, off); - return 0; - } - } else if (copy_from_iter_full(skb_put(skb, copy), copy, from)) - return 0; - - __skb_trim(skb, off); - return -EFAULT; -} - static inline bool skb_can_coalesce(struct sk_buff *skb, int i, const struct page *page, int off) { -- 2.51.0 From ae2d90355aa5592b0e99c8bbb4c3fa1d8e205f1b Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 12 Mar 2025 08:22:47 +0000 Subject: [PATCH 13/16] inet: frags: add inet_frag_putn() helper inet_frag_putn() can release multiple references in one step. Use it in inet_frags_free_cb(). Replace inet_frag_put(X) with inet_frag_putn(X, 1) Signed-off-by: Eric Dumazet Link: https://patch.msgid.link/20250312082250.1803501-2-edumazet@google.com Signed-off-by: Paolo Abeni --- include/net/inet_frag.h | 4 ++-- include/net/ipv6_frag.h | 3 ++- net/ieee802154/6lowpan/reassembly.c | 7 ++++--- net/ipv4/inet_fragment.c | 3 +-- net/ipv4/ip_fragment.c | 2 +- net/ipv6/netfilter/nf_conntrack_reasm.c | 3 ++- net/ipv6/reassembly.c | 4 ++-- 7 files changed, 14 insertions(+), 12 deletions(-) diff --git a/include/net/inet_frag.h b/include/net/inet_frag.h index 5af6eb14c5db..26687ad0b141 100644 --- a/include/net/inet_frag.h +++ b/include/net/inet_frag.h @@ -145,9 +145,9 @@ struct inet_frag_queue *inet_frag_find(struct fqdir *fqdir, void *key); unsigned int inet_frag_rbtree_purge(struct rb_root *root, enum skb_drop_reason reason); -static inline void inet_frag_put(struct inet_frag_queue *q) +static inline void inet_frag_putn(struct inet_frag_queue *q, int refs) { - if (refcount_dec_and_test(&q->refcnt)) + if (refs && refcount_sub_and_test(refs, &q->refcnt)) inet_frag_destroy(q); } diff --git a/include/net/ipv6_frag.h b/include/net/ipv6_frag.h index 7321ffe3a108..9d968d7d9fa4 100644 --- a/include/net/ipv6_frag.h +++ b/include/net/ipv6_frag.h @@ -66,6 +66,7 @@ ip6frag_expire_frag_queue(struct net *net, struct frag_queue *fq) { struct net_device *dev = NULL; struct sk_buff *head; + int refs = 1; rcu_read_lock(); /* Paired with the WRITE_ONCE() in fqdir_pre_exit(). */ @@ -109,7 +110,7 @@ out: spin_unlock(&fq->q.lock); out_rcu_unlock: rcu_read_unlock(); - inet_frag_put(&fq->q); + inet_frag_putn(&fq->q, refs); } /* Check if the upper layer header is truncated in the first fragment. 
*/ diff --git a/net/ieee802154/6lowpan/reassembly.c b/net/ieee802154/6lowpan/reassembly.c index 867d637d86f0..2df1a027e68a 100644 --- a/net/ieee802154/6lowpan/reassembly.c +++ b/net/ieee802154/6lowpan/reassembly.c @@ -45,6 +45,7 @@ static void lowpan_frag_expire(struct timer_list *t) { struct inet_frag_queue *frag = from_timer(frag, t, timer); struct frag_queue *fq; + int refs = 1; fq = container_of(frag, struct frag_queue, q); @@ -56,7 +57,7 @@ static void lowpan_frag_expire(struct timer_list *t) inet_frag_kill(&fq->q); out: spin_unlock(&fq->q.lock); - inet_frag_put(&fq->q); + inet_frag_putn(&fq->q, refs); } static inline struct lowpan_frag_queue * @@ -302,13 +303,13 @@ int lowpan_frag_rcv(struct sk_buff *skb, u8 frag_type) fq = fq_find(net, cb, &hdr.source, &hdr.dest); if (fq != NULL) { - int ret; + int ret, refs = 1; spin_lock(&fq->q.lock); ret = lowpan_frag_queue(fq, skb, frag_type); spin_unlock(&fq->q.lock); - inet_frag_put(&fq->q); + inet_frag_putn(&fq->q, refs); return ret; } diff --git a/net/ipv4/inet_fragment.c b/net/ipv4/inet_fragment.c index d179a2c84222..efc4cbee04c2 100644 --- a/net/ipv4/inet_fragment.c +++ b/net/ipv4/inet_fragment.c @@ -145,8 +145,7 @@ static void inet_frags_free_cb(void *ptr, void *arg) } spin_unlock_bh(&fq->lock); - if (refcount_sub_and_test(count, &fq->refcnt)) - inet_frag_destroy(fq); + inet_frag_putn(fq, count); } static LLIST_HEAD(fqdir_free_list); diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c index 7a435746a22d..474a26191c89 100644 --- a/net/ipv4/ip_fragment.c +++ b/net/ipv4/ip_fragment.c @@ -112,7 +112,7 @@ static void ip4_frag_free(struct inet_frag_queue *q) static void ipq_put(struct ipq *ipq) { - inet_frag_put(&ipq->q); + inet_frag_putn(&ipq->q, 1); } /* Kill ipq entry. It is not destroyed immediately, diff --git a/net/ipv6/netfilter/nf_conntrack_reasm.c b/net/ipv6/netfilter/nf_conntrack_reasm.c index 4120e67a8ce6..fcf969f9fe2a 100644 --- a/net/ipv6/netfilter/nf_conntrack_reasm.c +++ b/net/ipv6/netfilter/nf_conntrack_reasm.c @@ -447,6 +447,7 @@ int nf_ct_frag6_gather(struct net *net, struct sk_buff *skb, u32 user) struct frag_hdr *fhdr; struct frag_queue *fq; struct ipv6hdr *hdr; + int refs = 1; u8 prevhdr; /* Jumbo payload inhibits frag. 
header */ @@ -489,7 +490,7 @@ int nf_ct_frag6_gather(struct net *net, struct sk_buff *skb, u32 user) } spin_unlock_bh(&fq->q.lock); - inet_frag_put(&fq->q); + inet_frag_putn(&fq->q, refs); return ret; } EXPORT_SYMBOL_GPL(nf_ct_frag6_gather); diff --git a/net/ipv6/reassembly.c b/net/ipv6/reassembly.c index a48be617a8ab..5d56a8e5166b 100644 --- a/net/ipv6/reassembly.c +++ b/net/ipv6/reassembly.c @@ -380,7 +380,7 @@ static int ipv6_frag_rcv(struct sk_buff *skb) fq = fq_find(net, fhdr->identification, hdr, iif); if (fq) { u32 prob_offset = 0; - int ret; + int ret, refs = 1; spin_lock(&fq->q.lock); @@ -389,7 +389,7 @@ static int ipv6_frag_rcv(struct sk_buff *skb) &prob_offset); spin_unlock(&fq->q.lock); - inet_frag_put(&fq->q); + inet_frag_putn(&fq->q, refs); if (prob_offset) { __IP6_INC_STATS(net, __in6_dev_get_safely(skb->dev), IPSTATS_MIB_INHDRERRORS); -- 2.51.0 From a2fb987c0ecf0498cc17056339cb11d128c46ab7 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 12 Mar 2025 08:22:48 +0000 Subject: [PATCH 14/16] ipv4: frags: remove ipq_put() Replace ipq_put() with inet_frag_putn() Signed-off-by: Eric Dumazet Link: https://patch.msgid.link/20250312082250.1803501-3-edumazet@google.com Signed-off-by: Paolo Abeni --- net/ipv4/ip_fragment.c | 15 ++++----------- 1 file changed, 4 insertions(+), 11 deletions(-) diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c index 474a26191c89..ee953be49b34 100644 --- a/net/ipv4/ip_fragment.c +++ b/net/ipv4/ip_fragment.c @@ -107,14 +107,6 @@ static void ip4_frag_free(struct inet_frag_queue *q) inet_putpeer(qp->peer); } - -/* Destruction primitives. */ - -static void ipq_put(struct ipq *ipq) -{ - inet_frag_putn(&ipq->q, 1); -} - /* Kill ipq entry. It is not destroyed immediately, * because caller (and someone more) holds reference count. */ @@ -143,6 +135,7 @@ static void ip_expire(struct timer_list *t) struct sk_buff *head = NULL; struct net *net; struct ipq *qp; + int refs = 1; qp = container_of(frag, struct ipq, q); net = qp->q.fqdir->net; @@ -202,7 +195,7 @@ out: out_rcu_unlock: rcu_read_unlock(); kfree_skb_reason(head, reason); - ipq_put(qp); + inet_frag_putn(&qp->q, refs); } /* Find the correct entry in the "incomplete datagrams" queue for @@ -498,14 +491,14 @@ int ip_defrag(struct net *net, struct sk_buff *skb, u32 user) /* Lookup (or create) queue header */ qp = ip_find(net, ip_hdr(skb), user, vif); if (qp) { - int ret; + int ret, refs = 1; spin_lock(&qp->q.lock); ret = ip_frag_queue(qp, skb); spin_unlock(&qp->q.lock); - ipq_put(qp); + inet_frag_putn(&qp->q, refs); return ret; } -- 2.51.0 From eb0dfc0ef195a04e519b15d73cf25d8c25ee8df7 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 12 Mar 2025 08:22:49 +0000 Subject: [PATCH 15/16] inet: frags: change inet_frag_kill() to defer refcount updates In the following patch, we no longer assume inet_frag_kill() callers own a reference. Consuming two refcounts from inet_frag_kill() would lead in UAF. Propagate the pointer to the refs that will be consumed later by the final inet_frag_putn() call. 
Signed-off-by: Eric Dumazet Link: https://patch.msgid.link/20250312082250.1803501-4-edumazet@google.com Signed-off-by: Paolo Abeni --- include/net/inet_frag.h | 2 +- include/net/ipv6_frag.h | 2 +- net/ieee802154/6lowpan/reassembly.c | 17 ++++++++------ net/ipv4/inet_fragment.c | 12 +++++----- net/ipv4/ip_fragment.c | 30 ++++++++++--------------- net/ipv6/netfilter/nf_conntrack_reasm.c | 21 +++++++++-------- net/ipv6/reassembly.c | 18 ++++++++------- 7 files changed, 53 insertions(+), 49 deletions(-) diff --git a/include/net/inet_frag.h b/include/net/inet_frag.h index 26687ad0b141..0eccd9c3a883 100644 --- a/include/net/inet_frag.h +++ b/include/net/inet_frag.h @@ -137,7 +137,7 @@ static inline void fqdir_pre_exit(struct fqdir *fqdir) } void fqdir_exit(struct fqdir *fqdir); -void inet_frag_kill(struct inet_frag_queue *q); +void inet_frag_kill(struct inet_frag_queue *q, int *refs); void inet_frag_destroy(struct inet_frag_queue *q); struct inet_frag_queue *inet_frag_find(struct fqdir *fqdir, void *key); diff --git a/include/net/ipv6_frag.h b/include/net/ipv6_frag.h index 9d968d7d9fa4..38ef66826939 100644 --- a/include/net/ipv6_frag.h +++ b/include/net/ipv6_frag.h @@ -78,7 +78,7 @@ ip6frag_expire_frag_queue(struct net *net, struct frag_queue *fq) goto out; fq->q.flags |= INET_FRAG_DROP; - inet_frag_kill(&fq->q); + inet_frag_kill(&fq->q, &refs); dev = dev_get_by_index_rcu(net, fq->iif); if (!dev) diff --git a/net/ieee802154/6lowpan/reassembly.c b/net/ieee802154/6lowpan/reassembly.c index 2df1a027e68a..c5f86cff06ee 100644 --- a/net/ieee802154/6lowpan/reassembly.c +++ b/net/ieee802154/6lowpan/reassembly.c @@ -31,7 +31,8 @@ static const char lowpan_frags_cache_name[] = "lowpan-frags"; static struct inet_frags lowpan_frags; static int lowpan_frag_reasm(struct lowpan_frag_queue *fq, struct sk_buff *skb, - struct sk_buff *prev, struct net_device *ldev); + struct sk_buff *prev, struct net_device *ldev, + int *refs); static void lowpan_frag_init(struct inet_frag_queue *q, const void *a) { @@ -54,7 +55,7 @@ static void lowpan_frag_expire(struct timer_list *t) if (fq->q.flags & INET_FRAG_COMPLETE) goto out; - inet_frag_kill(&fq->q); + inet_frag_kill(&fq->q, &refs); out: spin_unlock(&fq->q.lock); inet_frag_putn(&fq->q, refs); @@ -83,7 +84,8 @@ fq_find(struct net *net, const struct lowpan_802154_cb *cb, } static int lowpan_frag_queue(struct lowpan_frag_queue *fq, - struct sk_buff *skb, u8 frag_type) + struct sk_buff *skb, u8 frag_type, + int *refs) { struct sk_buff *prev_tail; struct net_device *ldev; @@ -144,7 +146,7 @@ static int lowpan_frag_queue(struct lowpan_frag_queue *fq, unsigned long orefdst = skb->_skb_refdst; skb->_skb_refdst = 0UL; - res = lowpan_frag_reasm(fq, skb, prev_tail, ldev); + res = lowpan_frag_reasm(fq, skb, prev_tail, ldev, refs); skb->_skb_refdst = orefdst; return res; } @@ -163,11 +165,12 @@ err: * the last and the first frames arrived and all the bits are here. 
*/ static int lowpan_frag_reasm(struct lowpan_frag_queue *fq, struct sk_buff *skb, - struct sk_buff *prev_tail, struct net_device *ldev) + struct sk_buff *prev_tail, struct net_device *ldev, + int *refs) { void *reasm_data; - inet_frag_kill(&fq->q); + inet_frag_kill(&fq->q, refs); reasm_data = inet_frag_reasm_prepare(&fq->q, skb, prev_tail); if (!reasm_data) @@ -306,7 +309,7 @@ int lowpan_frag_rcv(struct sk_buff *skb, u8 frag_type) int ret, refs = 1; spin_lock(&fq->q.lock); - ret = lowpan_frag_queue(fq, skb, frag_type); + ret = lowpan_frag_queue(fq, skb, frag_type, &refs); spin_unlock(&fq->q.lock); inet_frag_putn(&fq->q, refs); diff --git a/net/ipv4/inet_fragment.c b/net/ipv4/inet_fragment.c index efc4cbee04c2..5eb186050013 100644 --- a/net/ipv4/inet_fragment.c +++ b/net/ipv4/inet_fragment.c @@ -225,10 +225,10 @@ void fqdir_exit(struct fqdir *fqdir) } EXPORT_SYMBOL(fqdir_exit); -void inet_frag_kill(struct inet_frag_queue *fq) +void inet_frag_kill(struct inet_frag_queue *fq, int *refs) { if (del_timer(&fq->timer)) - refcount_dec(&fq->refcnt); + (*refs)++; if (!(fq->flags & INET_FRAG_COMPLETE)) { struct fqdir *fqdir = fq->fqdir; @@ -243,7 +243,7 @@ void inet_frag_kill(struct inet_frag_queue *fq) if (!READ_ONCE(fqdir->dead)) { rhashtable_remove_fast(&fqdir->rhashtable, &fq->node, fqdir->f->rhash_params); - refcount_dec(&fq->refcnt); + (*refs)++; } else { fq->flags |= INET_FRAG_HASH_DEAD; } @@ -349,9 +349,11 @@ static struct inet_frag_queue *inet_frag_create(struct fqdir *fqdir, *prev = rhashtable_lookup_get_insert_key(&fqdir->rhashtable, &q->key, &q->node, f->rhash_params); if (*prev) { + int refs = 2; + q->flags |= INET_FRAG_COMPLETE; - inet_frag_kill(q); - inet_frag_destroy(q); + inet_frag_kill(q, &refs); + inet_frag_putn(q, refs); return NULL; } return q; diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c index ee953be49b34..c5f3c810706f 100644 --- a/net/ipv4/ip_fragment.c +++ b/net/ipv4/ip_fragment.c @@ -76,7 +76,8 @@ static u8 ip4_frag_ecn(u8 tos) static struct inet_frags ip4_frags; static int ip_frag_reasm(struct ipq *qp, struct sk_buff *skb, - struct sk_buff *prev_tail, struct net_device *dev); + struct sk_buff *prev_tail, struct net_device *dev, + int *refs); static void ip4_frag_init(struct inet_frag_queue *q, const void *a) @@ -107,14 +108,6 @@ static void ip4_frag_free(struct inet_frag_queue *q) inet_putpeer(qp->peer); } -/* Kill ipq entry. It is not destroyed immediately, - * because caller (and someone more) holds reference count. - */ -static void ipq_kill(struct ipq *ipq) -{ - inet_frag_kill(&ipq->q); -} - static bool frag_expire_skip_icmp(u32 user) { return user == IP_DEFRAG_AF_PACKET || @@ -152,7 +145,7 @@ static void ip_expire(struct timer_list *t) goto out; qp->q.flags |= INET_FRAG_DROP; - ipq_kill(qp); + inet_frag_kill(&qp->q, &refs); __IP_INC_STATS(net, IPSTATS_MIB_REASMFAILS); __IP_INC_STATS(net, IPSTATS_MIB_REASMTIMEOUT); @@ -271,7 +264,7 @@ static int ip_frag_reinit(struct ipq *qp) } /* Add new segment to existing queue. 
*/ -static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb) +static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb, int *refs) { struct net *net = qp->q.fqdir->net; int ihl, end, flags, offset; @@ -291,7 +284,7 @@ static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb) if (!(IPCB(skb)->flags & IPSKB_FRAG_COMPLETE) && unlikely(ip_frag_too_far(qp)) && unlikely(err = ip_frag_reinit(qp))) { - ipq_kill(qp); + inet_frag_kill(&qp->q, refs); goto err; } @@ -375,10 +368,10 @@ static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb) unsigned long orefdst = skb->_skb_refdst; skb->_skb_refdst = 0UL; - err = ip_frag_reasm(qp, skb, prev_tail, dev); + err = ip_frag_reasm(qp, skb, prev_tail, dev, refs); skb->_skb_refdst = orefdst; if (err) - inet_frag_kill(&qp->q); + inet_frag_kill(&qp->q, refs); return err; } @@ -395,7 +388,7 @@ insert_error: err = -EINVAL; __IP_INC_STATS(net, IPSTATS_MIB_REASM_OVERLAPS); discard_qp: - inet_frag_kill(&qp->q); + inet_frag_kill(&qp->q, refs); __IP_INC_STATS(net, IPSTATS_MIB_REASMFAILS); err: kfree_skb_reason(skb, reason); @@ -409,7 +402,8 @@ static bool ip_frag_coalesce_ok(const struct ipq *qp) /* Build a new IP datagram from all its fragments. */ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *skb, - struct sk_buff *prev_tail, struct net_device *dev) + struct sk_buff *prev_tail, struct net_device *dev, + int *refs) { struct net *net = qp->q.fqdir->net; struct iphdr *iph; @@ -417,7 +411,7 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *skb, int len, err; u8 ecn; - ipq_kill(qp); + inet_frag_kill(&qp->q, refs); ecn = ip_frag_ecn_table[qp->ecn]; if (unlikely(ecn == 0xff)) { @@ -495,7 +489,7 @@ int ip_defrag(struct net *net, struct sk_buff *skb, u32 user) spin_lock(&qp->q.lock); - ret = ip_frag_queue(qp, skb); + ret = ip_frag_queue(qp, skb, &refs); spin_unlock(&qp->q.lock); inet_frag_putn(&qp->q, refs); diff --git a/net/ipv6/netfilter/nf_conntrack_reasm.c b/net/ipv6/netfilter/nf_conntrack_reasm.c index fcf969f9fe2a..f33acb730dc5 100644 --- a/net/ipv6/netfilter/nf_conntrack_reasm.c +++ b/net/ipv6/netfilter/nf_conntrack_reasm.c @@ -123,7 +123,8 @@ static void __net_exit nf_ct_frags6_sysctl_unregister(struct net *net) #endif static int nf_ct_frag6_reasm(struct frag_queue *fq, struct sk_buff *skb, - struct sk_buff *prev_tail, struct net_device *dev); + struct sk_buff *prev_tail, struct net_device *dev, + int *refs); static inline u8 ip6_frag_ecn(const struct ipv6hdr *ipv6h) { @@ -167,7 +168,8 @@ static struct frag_queue *fq_find(struct net *net, __be32 id, u32 user, static int nf_ct_frag6_queue(struct frag_queue *fq, struct sk_buff *skb, - const struct frag_hdr *fhdr, int nhoff) + const struct frag_hdr *fhdr, int nhoff, + int *refs) { unsigned int payload_len; struct net_device *dev; @@ -221,7 +223,7 @@ static int nf_ct_frag6_queue(struct frag_queue *fq, struct sk_buff *skb, * this case. 
-DaveM */ pr_debug("end of fragment not rounded to 8 bytes.\n"); - inet_frag_kill(&fq->q); + inet_frag_kill(&fq->q, refs); return -EPROTO; } if (end > fq->q.len) { @@ -287,7 +289,7 @@ static int nf_ct_frag6_queue(struct frag_queue *fq, struct sk_buff *skb, unsigned long orefdst = skb->_skb_refdst; skb->_skb_refdst = 0UL; - err = nf_ct_frag6_reasm(fq, skb, prev, dev); + err = nf_ct_frag6_reasm(fq, skb, prev, dev, refs); skb->_skb_refdst = orefdst; /* After queue has assumed skb ownership, only 0 or @@ -301,7 +303,7 @@ static int nf_ct_frag6_queue(struct frag_queue *fq, struct sk_buff *skb, return -EINPROGRESS; insert_error: - inet_frag_kill(&fq->q); + inet_frag_kill(&fq->q, refs); err: skb_dst_drop(skb); return -EINVAL; @@ -315,13 +317,14 @@ err: * the last and the first frames arrived and all the bits are here. */ static int nf_ct_frag6_reasm(struct frag_queue *fq, struct sk_buff *skb, - struct sk_buff *prev_tail, struct net_device *dev) + struct sk_buff *prev_tail, struct net_device *dev, + int *refs) { void *reasm_data; int payload_len; u8 ecn; - inet_frag_kill(&fq->q); + inet_frag_kill(&fq->q, refs); ecn = ip_frag_ecn_table[fq->ecn]; if (unlikely(ecn == 0xff)) @@ -372,7 +375,7 @@ static int nf_ct_frag6_reasm(struct frag_queue *fq, struct sk_buff *skb, return 0; err: - inet_frag_kill(&fq->q); + inet_frag_kill(&fq->q, refs); return -EINVAL; } @@ -483,7 +486,7 @@ int nf_ct_frag6_gather(struct net *net, struct sk_buff *skb, u32 user) spin_lock_bh(&fq->q.lock); - ret = nf_ct_frag6_queue(fq, skb, fhdr, nhoff); + ret = nf_ct_frag6_queue(fq, skb, fhdr, nhoff, &refs); if (ret == -EPROTO) { skb->transport_header = savethdr; ret = 0; diff --git a/net/ipv6/reassembly.c b/net/ipv6/reassembly.c index 5d56a8e5166b..7560380bd587 100644 --- a/net/ipv6/reassembly.c +++ b/net/ipv6/reassembly.c @@ -68,7 +68,8 @@ static u8 ip6_frag_ecn(const struct ipv6hdr *ipv6h) static struct inet_frags ip6_frags; static int ip6_frag_reasm(struct frag_queue *fq, struct sk_buff *skb, - struct sk_buff *prev_tail, struct net_device *dev); + struct sk_buff *prev_tail, struct net_device *dev, + int *refs); static void ip6_frag_expire(struct timer_list *t) { @@ -105,7 +106,7 @@ fq_find(struct net *net, __be32 id, const struct ipv6hdr *hdr, int iif) static int ip6_frag_queue(struct frag_queue *fq, struct sk_buff *skb, struct frag_hdr *fhdr, int nhoff, - u32 *prob_offset) + u32 *prob_offset, int *refs) { struct net *net = dev_net(skb_dst(skb)->dev); int offset, end, fragsize; @@ -220,7 +221,7 @@ static int ip6_frag_queue(struct frag_queue *fq, struct sk_buff *skb, unsigned long orefdst = skb->_skb_refdst; skb->_skb_refdst = 0UL; - err = ip6_frag_reasm(fq, skb, prev_tail, dev); + err = ip6_frag_reasm(fq, skb, prev_tail, dev, refs); skb->_skb_refdst = orefdst; return err; } @@ -238,7 +239,7 @@ insert_error: __IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_REASM_OVERLAPS); discard_fq: - inet_frag_kill(&fq->q); + inet_frag_kill(&fq->q, refs); __IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_REASMFAILS); err: @@ -254,7 +255,8 @@ err: * the last and the first frames arrived and all the bits are here. 
*/ static int ip6_frag_reasm(struct frag_queue *fq, struct sk_buff *skb, - struct sk_buff *prev_tail, struct net_device *dev) + struct sk_buff *prev_tail, struct net_device *dev, + int *refs) { struct net *net = fq->q.fqdir->net; unsigned int nhoff; @@ -262,7 +264,7 @@ static int ip6_frag_reasm(struct frag_queue *fq, struct sk_buff *skb, int payload_len; u8 ecn; - inet_frag_kill(&fq->q); + inet_frag_kill(&fq->q, refs); ecn = ip_frag_ecn_table[fq->ecn]; if (unlikely(ecn == 0xff)) @@ -320,7 +322,7 @@ out_fail: rcu_read_lock(); __IP6_INC_STATS(net, __in6_dev_stats_get(dev, skb), IPSTATS_MIB_REASMFAILS); rcu_read_unlock(); - inet_frag_kill(&fq->q); + inet_frag_kill(&fq->q, refs); return -1; } @@ -386,7 +388,7 @@ static int ipv6_frag_rcv(struct sk_buff *skb) fq->iif = iif; ret = ip6_frag_queue(fq, skb, fhdr, IP6CB(skb)->nhoff, - &prob_offset); + &prob_offset, &refs); spin_unlock(&fq->q.lock); inet_frag_putn(&fq->q, refs); -- 2.51.0 From ca0359df45a55a9eb4d6dc09a481064abf78320f Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 12 Mar 2025 08:22:50 +0000 Subject: [PATCH 16/16] inet: frags: save a pair of atomic operations in reassembly As mentioned in commit 648700f76b03 ("inet: frags: use rhashtables for reassembly units"): A followup patch will even remove the refcount hold/release left from prior implementation and save a couple of atomic operations. This patch implements this idea, seven years later. Signed-off-by: Eric Dumazet Reviewed-by: Jacob Keller Link: https://patch.msgid.link/20250312082250.1803501-5-edumazet@google.com Signed-off-by: Paolo Abeni --- net/ieee802154/6lowpan/reassembly.c | 5 ++++- net/ipv4/inet_fragment.c | 18 ++++++++---------- net/ipv4/ip_fragment.c | 5 ++++- net/ipv6/netfilter/nf_conntrack_reasm.c | 5 ++++- net/ipv6/reassembly.c | 9 ++++----- 5 files changed, 24 insertions(+), 18 deletions(-) diff --git a/net/ieee802154/6lowpan/reassembly.c b/net/ieee802154/6lowpan/reassembly.c index c5f86cff06ee..d4b983d17038 100644 --- a/net/ieee802154/6lowpan/reassembly.c +++ b/net/ieee802154/6lowpan/reassembly.c @@ -304,17 +304,20 @@ int lowpan_frag_rcv(struct sk_buff *skb, u8 frag_type) goto err; } + rcu_read_lock(); fq = fq_find(net, cb, &hdr.source, &hdr.dest); if (fq != NULL) { - int ret, refs = 1; + int ret, refs = 0; spin_lock(&fq->q.lock); ret = lowpan_frag_queue(fq, skb, frag_type, &refs); spin_unlock(&fq->q.lock); + rcu_read_unlock(); inet_frag_putn(&fq->q, refs); return ret; } + rcu_read_unlock(); err: kfree_skb(skb); diff --git a/net/ipv4/inet_fragment.c b/net/ipv4/inet_fragment.c index 5eb186050013..19fae4811ab2 100644 --- a/net/ipv4/inet_fragment.c +++ b/net/ipv4/inet_fragment.c @@ -327,7 +327,8 @@ static struct inet_frag_queue *inet_frag_alloc(struct fqdir *fqdir, timer_setup(&q->timer, f->frag_expire, 0); spin_lock_init(&q->lock); - refcount_set(&q->refcnt, 3); + /* One reference for the timer, one for the hash table. */ + refcount_set(&q->refcnt, 2); return q; } @@ -349,7 +350,11 @@ static struct inet_frag_queue *inet_frag_create(struct fqdir *fqdir, *prev = rhashtable_lookup_get_insert_key(&fqdir->rhashtable, &q->key, &q->node, f->rhash_params); if (*prev) { - int refs = 2; + /* We could not insert in the hash table, + * we need to cancel what inet_frag_alloc() + * anticipated. 
+ */ + int refs = 1; q->flags |= INET_FRAG_COMPLETE; inet_frag_kill(q, &refs); @@ -359,7 +364,6 @@ static struct inet_frag_queue *inet_frag_create(struct fqdir *fqdir, return q; } -/* TODO : call from rcu_read_lock() and no longer use refcount_inc_not_zero() */ struct inet_frag_queue *inet_frag_find(struct fqdir *fqdir, void *key) { /* This pairs with WRITE_ONCE() in fqdir_pre_exit(). */ @@ -369,17 +373,11 @@ struct inet_frag_queue *inet_frag_find(struct fqdir *fqdir, void *key) if (!high_thresh || frag_mem_limit(fqdir) > high_thresh) return NULL; - rcu_read_lock(); - prev = rhashtable_lookup(&fqdir->rhashtable, key, fqdir->f->rhash_params); if (!prev) fq = inet_frag_create(fqdir, key, &prev); - if (!IS_ERR_OR_NULL(prev)) { + if (!IS_ERR_OR_NULL(prev)) fq = prev; - if (!refcount_inc_not_zero(&fq->refcnt)) - fq = NULL; - } - rcu_read_unlock(); return fq; } EXPORT_SYMBOL(inet_frag_find); diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c index c5f3c810706f..77f395b28ec7 100644 --- a/net/ipv4/ip_fragment.c +++ b/net/ipv4/ip_fragment.c @@ -483,18 +483,21 @@ int ip_defrag(struct net *net, struct sk_buff *skb, u32 user) __IP_INC_STATS(net, IPSTATS_MIB_REASMREQDS); /* Lookup (or create) queue header */ + rcu_read_lock(); qp = ip_find(net, ip_hdr(skb), user, vif); if (qp) { - int ret, refs = 1; + int ret, refs = 0; spin_lock(&qp->q.lock); ret = ip_frag_queue(qp, skb, &refs); spin_unlock(&qp->q.lock); + rcu_read_unlock(); inet_frag_putn(&qp->q, refs); return ret; } + rcu_read_unlock(); __IP_INC_STATS(net, IPSTATS_MIB_REASMFAILS); kfree_skb(skb); diff --git a/net/ipv6/netfilter/nf_conntrack_reasm.c b/net/ipv6/netfilter/nf_conntrack_reasm.c index f33acb730dc5..d6bd8f7079bb 100644 --- a/net/ipv6/netfilter/nf_conntrack_reasm.c +++ b/net/ipv6/netfilter/nf_conntrack_reasm.c @@ -450,7 +450,7 @@ int nf_ct_frag6_gather(struct net *net, struct sk_buff *skb, u32 user) struct frag_hdr *fhdr; struct frag_queue *fq; struct ipv6hdr *hdr; - int refs = 1; + int refs = 0; u8 prevhdr; /* Jumbo payload inhibits frag. header */ @@ -477,9 +477,11 @@ int nf_ct_frag6_gather(struct net *net, struct sk_buff *skb, u32 user) hdr = ipv6_hdr(skb); fhdr = (struct frag_hdr *)skb_transport_header(skb); + rcu_read_lock(); fq = fq_find(net, fhdr->identification, user, hdr, skb->dev ? skb->dev->ifindex : 0); if (fq == NULL) { + rcu_read_unlock(); pr_debug("Can't find and can't create new queue\n"); return -ENOMEM; } @@ -493,6 +495,7 @@ int nf_ct_frag6_gather(struct net *net, struct sk_buff *skb, u32 user) } spin_unlock_bh(&fq->q.lock); + rcu_read_unlock(); inet_frag_putn(&fq->q, refs); return ret; } diff --git a/net/ipv6/reassembly.c b/net/ipv6/reassembly.c index 7560380bd587..49740898bc13 100644 --- a/net/ipv6/reassembly.c +++ b/net/ipv6/reassembly.c @@ -305,9 +305,7 @@ static int ip6_frag_reasm(struct frag_queue *fq, struct sk_buff *skb, skb_postpush_rcsum(skb, skb_network_header(skb), skb_network_header_len(skb)); - rcu_read_lock(); __IP6_INC_STATS(net, __in6_dev_stats_get(dev, skb), IPSTATS_MIB_REASMOKS); - rcu_read_unlock(); fq->q.rb_fragments = RB_ROOT; fq->q.fragments_tail = NULL; fq->q.last_run_head = NULL; @@ -319,9 +317,7 @@ out_oversize: out_oom: net_dbg_ratelimited("ip6_frag_reasm: no memory for reassembly\n"); out_fail: - rcu_read_lock(); __IP6_INC_STATS(net, __in6_dev_stats_get(dev, skb), IPSTATS_MIB_REASMFAILS); - rcu_read_unlock(); inet_frag_kill(&fq->q, refs); return -1; } @@ -379,10 +375,11 @@ static int ipv6_frag_rcv(struct sk_buff *skb) } iif = skb->dev ? 
skb->dev->ifindex : 0; + rcu_read_lock(); fq = fq_find(net, fhdr->identification, hdr, iif); if (fq) { u32 prob_offset = 0; - int ret, refs = 1; + int ret, refs = 0; spin_lock(&fq->q.lock); @@ -391,6 +388,7 @@ static int ipv6_frag_rcv(struct sk_buff *skb) &prob_offset, &refs); spin_unlock(&fq->q.lock); + rcu_read_unlock(); inet_frag_putn(&fq->q, refs); if (prob_offset) { __IP6_INC_STATS(net, __in6_dev_get_safely(skb->dev), @@ -400,6 +398,7 @@ static int ipv6_frag_rcv(struct sk_buff *skb) } return ret; } + rcu_read_unlock(); __IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_REASMFAILS); kfree_skb(skb); -- 2.51.0
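Rounding out the series, a hedged sketch of the accounting after the final patch above (illustrative names only, C11 atomics in place of refcount_t; the RCU critical section is marked with comments since plain user space cannot model it):

	#include <stdatomic.h>
	#include <stdio.h>
	#include <stdlib.h>

	struct frag_stub {
		atomic_int refcnt;
		int timer_armed;
		int hashed;
	};

	/* One atomic subtraction releases every deferred reference at once. */
	static void frag_putn(struct frag_stub *q, int refs)
	{
		if (refs && atomic_fetch_sub(&q->refcnt, refs) == refs) {
			puts("freed with a single atomic operation");
			free(q);
		}
	}

	int main(void)
	{
		struct frag_stub *q = malloc(sizeof(*q));
		int refs = 0;			/* was 1: the lookup no longer takes a ref */

		if (!q)
			return 1;
		atomic_init(&q->refcnt, 2);	/* was 3: only timer + hash table now */
		q->timer_armed = 1;
		q->hashed = 1;

		/* rcu_read_lock() would be held from lookup to the final put,
		 * so the refcount_inc_not_zero() at lookup time is gone. */
		q->timer_armed = 0;
		refs++;				/* inet_frag_kill(): timer reference */
		q->hashed = 0;
		refs++;				/* inet_frag_kill(): hash table reference */
		/* rcu_read_unlock() */

		frag_putn(q, refs);		/* refs == 2 drops both references */
		return 0;
	}

On the common path where the queue is neither completed nor killed, refs stays at 0 and inet_frag_putn() performs no atomic operation at all; together with dropping refcount_inc_not_zero() from the lookup, that is the saved pair of atomic operations the changelog refers to.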