From 196b07ba910403c75d96a82000efa3942da243b8 Mon Sep 17 00:00:00 2001 From: "Russell King (Oracle)" Date: Mon, 24 Feb 2025 16:53:15 +0000 Subject: [PATCH 01/16] net: stmmac: dwc-qos: clean up clock initialisation Clean up the clock initialisation by providing a helper to find a named clock in the bulk clocks, and provide the name of the stmmac clock in match data so we can locate the stmmac clock in generic code. Signed-off-by: Russell King (Oracle) Reviewed-by: Andrew Lunn Tested-by: Thierry Reding Link: https://patch.msgid.link/E1tmbhj-004vSz-Pt@rmk-PC.armlinux.org.uk Signed-off-by: Jakub Kicinski --- .../stmicro/stmmac/dwmac-dwc-qos-eth.c | 33 +++++++++++-------- 1 file changed, 19 insertions(+), 14 deletions(-) diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-dwc-qos-eth.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-dwc-qos-eth.c index acb0a2e1664f..6cadf24a575c 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac-dwc-qos-eth.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-dwc-qos-eth.c @@ -35,6 +35,16 @@ struct tegra_eqos { struct gpio_desc *reset; }; +static struct clk *dwc_eth_find_clk(struct plat_stmmacenet_data *plat_dat, + const char *name) +{ + for (int i = 0; i < plat_dat->num_clks; i++) + if (strcmp(plat_dat->clks[i].id, name) == 0) + return plat_dat->clks[i].clk; + + return NULL; +} + static int dwc_eth_dwmac_config_dt(struct platform_device *pdev, struct plat_stmmacenet_data *plat_dat) { @@ -121,12 +131,7 @@ static int dwc_qos_probe(struct platform_device *pdev, struct plat_stmmacenet_data *plat_dat, struct stmmac_resources *stmmac_res) { - for (int i = 0; i < plat_dat->num_clks; i++) { - if (strcmp(plat_dat->clks[i].id, "apb_pclk") == 0) - plat_dat->stmmac_clk = plat_dat->clks[i].clk; - else if (strcmp(plat_dat->clks[i].id, "phy_ref_clk") == 0) - plat_dat->pclk = plat_dat->clks[i].clk; - } + plat_dat->pclk = dwc_eth_find_clk(plat_dat, "phy_ref_clk"); return 0; } @@ -237,18 +242,12 @@ static int tegra_eqos_probe(struct platform_device *pdev, eqos->dev = &pdev->dev; eqos->regs = res->addr; + eqos->clk_slave = plat_dat->stmmac_clk; if (!is_of_node(dev->fwnode)) goto bypass_clk_reset_gpio; - for (int i = 0; i < plat_dat->num_clks; i++) { - if (strcmp(plat_dat->clks[i].id, "slave_bus") == 0) { - eqos->clk_slave = plat_dat->clks[i].clk; - plat_dat->stmmac_clk = eqos->clk_slave; - } else if (strcmp(plat_dat->clks[i].id, "tx") == 0) { - eqos->clk_tx = plat_dat->clks[i].clk; - } - } + eqos->clk_tx = dwc_eth_find_clk(plat_dat, "tx"); eqos->reset = devm_gpiod_get(&pdev->dev, "phy-reset", GPIOD_OUT_HIGH); if (IS_ERR(eqos->reset)) { @@ -312,15 +311,18 @@ struct dwc_eth_dwmac_data { struct plat_stmmacenet_data *plat_dat, struct stmmac_resources *res); void (*remove)(struct platform_device *pdev); + const char *stmmac_clk_name; }; static const struct dwc_eth_dwmac_data dwc_qos_data = { .probe = dwc_qos_probe, + .stmmac_clk_name = "apb_pclk", }; static const struct dwc_eth_dwmac_data tegra_eqos_data = { .probe = tegra_eqos_probe, .remove = tegra_eqos_remove, + .stmmac_clk_name = "slave_bus", }; static int dwc_eth_dwmac_probe(struct platform_device *pdev) @@ -360,6 +362,9 @@ static int dwc_eth_dwmac_probe(struct platform_device *pdev) if (ret) return dev_err_probe(&pdev->dev, ret, "Failed to enable clocks\n"); + plat_dat->stmmac_clk = dwc_eth_find_clk(plat_dat, + data->stmmac_clk_name); + ret = data->probe(pdev, plat_dat, &stmmac_res); if (ret < 0) { dev_err_probe(&pdev->dev, ret, "failed to probe subdriver\n"); -- 2.51.0 From ad530283d3c8bb926a08bb1a14c8aa053de4bc2c Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Mon, 24 Feb 2025 14:00:37 +0200 Subject: [PATCH 02/16] drivers: net: xgene: Don't use "proxy" headers Update header inclusions to follow IWYU (Include What You Use) principle. In this case replace *gpio.h, which are subject to remove by the GPIOLIB subsystem, with the respective headers that are being used by the driver. Signed-off-by: Andy Shevchenko Link: https://patch.msgid.link/20250224120037.3801609-1-andriy.shevchenko@linux.intel.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/apm/xgene/xgene_enet_xgmac.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/apm/xgene/xgene_enet_xgmac.c b/drivers/net/ethernet/apm/xgene/xgene_enet_xgmac.c index 86607b79c09f..cc3b1631c905 100644 --- a/drivers/net/ethernet/apm/xgene/xgene_enet_xgmac.c +++ b/drivers/net/ethernet/apm/xgene/xgene_enet_xgmac.c @@ -6,8 +6,14 @@ * Keyur Chudgar */ -#include -#include +#include +#include +#include +#include +#include +#include +#include + #include "xgene_enet_main.h" #include "xgene_enet_hw.h" #include "xgene_enet_xgmac.h" -- 2.51.0 From ecdff893384cf169a55a66ca68c9d8917f8417a9 Mon Sep 17 00:00:00 2001 From: Gal Pressman Date: Mon, 24 Feb 2025 19:44:13 +0200 Subject: [PATCH 03/16] ethtool: Symmetric OR-XOR RSS hash Add an additional type of symmetric RSS hash type: OR-XOR. The "Symmetric-OR-XOR" algorithm transforms the input as follows: (SRC_IP | DST_IP, SRC_IP ^ DST_IP, SRC_PORT | DST_PORT, SRC_PORT ^ DST_PORT) Change 'cap_rss_sym_xor_supported' to 'supported_input_xfrm', a bitmap of supported RXH_XFRM_* types. Reviewed-by: Cosmin Ratiu Reviewed-by: Tariq Toukan Signed-off-by: Gal Pressman Reviewed-by: Edward Cree Link: https://patch.msgid.link/20250224174416.499070-2-gal@nvidia.com Signed-off-by: Jakub Kicinski --- Documentation/networking/ethtool-netlink.rst | 2 +- Documentation/networking/scaling.rst | 15 +++++++++++---- drivers/net/ethernet/intel/iavf/iavf_ethtool.c | 2 +- drivers/net/ethernet/intel/ice/ice_ethtool.c | 2 +- include/linux/ethtool.h | 5 ++--- include/uapi/linux/ethtool.h | 4 ++++ net/ethtool/ioctl.c | 8 ++++---- 7 files changed, 24 insertions(+), 14 deletions(-) diff --git a/Documentation/networking/ethtool-netlink.rst b/Documentation/networking/ethtool-netlink.rst index 3770a2294509..b6e9af4d0f1b 100644 --- a/Documentation/networking/ethtool-netlink.rst +++ b/Documentation/networking/ethtool-netlink.rst @@ -1934,7 +1934,7 @@ ETHTOOL_A_RSS_INDIR attribute returns RSS indirection table where each byte indicates queue number. ETHTOOL_A_RSS_INPUT_XFRM attribute is a bitmap indicating the type of transformation applied to the input protocol fields before given to the RSS -hfunc. Current supported option is symmetric-xor. +hfunc. Current supported options are symmetric-xor and symmetric-or-xor. PLCA_GET_CFG ============ diff --git a/Documentation/networking/scaling.rst b/Documentation/networking/scaling.rst index 4eb50bcb9d42..6984700cec6a 100644 --- a/Documentation/networking/scaling.rst +++ b/Documentation/networking/scaling.rst @@ -49,14 +49,21 @@ destination address) and TCP/UDP (source port, destination port) tuples are swapped, the computed hash is the same. This is beneficial in some applications that monitor TCP/IP flows (IDS, firewalls, ...etc) and need both directions of the flow to land on the same Rx queue (and CPU). The -"Symmetric-XOR" is a type of RSS algorithms that achieves this hash -symmetry by XORing the input source and destination fields of the IP -and/or L4 protocols. This, however, results in reduced input entropy and -could potentially be exploited. Specifically, the algorithm XORs the input +"Symmetric-XOR" and "Symmetric-OR-XOR" are types of RSS algorithms that +achieve this hash symmetry by XOR/ORing the input source and destination +fields of the IP and/or L4 protocols. This, however, results in reduced +input entropy and could potentially be exploited. + +Specifically, the "Symmetric-XOR" algorithm XORs the input as follows:: # (SRC_IP ^ DST_IP, SRC_IP ^ DST_IP, SRC_PORT ^ DST_PORT, SRC_PORT ^ DST_PORT) +The "Symmetric-OR-XOR" algorithm, on the other hand, transforms the input as +follows:: + + # (SRC_IP | DST_IP, SRC_IP ^ DST_IP, SRC_PORT | DST_PORT, SRC_PORT ^ DST_PORT) + The result is then fed to the underlying RSS algorithm. Some advanced NICs allow steering packets to queues based on diff --git a/drivers/net/ethernet/intel/iavf/iavf_ethtool.c b/drivers/net/ethernet/intel/iavf/iavf_ethtool.c index 74a1e9fe1821..288bb5b2e72e 100644 --- a/drivers/net/ethernet/intel/iavf/iavf_ethtool.c +++ b/drivers/net/ethernet/intel/iavf/iavf_ethtool.c @@ -1808,7 +1808,7 @@ static int iavf_set_rxfh(struct net_device *netdev, static const struct ethtool_ops iavf_ethtool_ops = { .supported_coalesce_params = ETHTOOL_COALESCE_USECS | ETHTOOL_COALESCE_USE_ADAPTIVE, - .cap_rss_sym_xor_supported = true, + .supported_input_xfrm = RXH_XFRM_SYM_XOR, .get_drvinfo = iavf_get_drvinfo, .get_link = ethtool_op_get_link, .get_ringparam = iavf_get_ringparam, diff --git a/drivers/net/ethernet/intel/ice/ice_ethtool.c b/drivers/net/ethernet/intel/ice/ice_ethtool.c index b0805704834d..7c2dc347e4e5 100644 --- a/drivers/net/ethernet/intel/ice/ice_ethtool.c +++ b/drivers/net/ethernet/intel/ice/ice_ethtool.c @@ -4770,7 +4770,7 @@ static const struct ethtool_ops ice_ethtool_ops = { .supported_coalesce_params = ETHTOOL_COALESCE_USECS | ETHTOOL_COALESCE_USE_ADAPTIVE | ETHTOOL_COALESCE_RX_USECS_HIGH, - .cap_rss_sym_xor_supported = true, + .supported_input_xfrm = RXH_XFRM_SYM_XOR, .rxfh_per_ctx_key = true, .get_link_ksettings = ice_get_link_ksettings, .set_link_ksettings = ice_set_link_ksettings, diff --git a/include/linux/ethtool.h b/include/linux/ethtool.h index 870994cc3ef7..7f222dccc7d1 100644 --- a/include/linux/ethtool.h +++ b/include/linux/ethtool.h @@ -763,13 +763,12 @@ struct kernel_ethtool_ts_info { /** * struct ethtool_ops - optional netdev operations + * @supported_input_xfrm: supported types of input xfrm from %RXH_XFRM_*. * @cap_link_lanes_supported: indicates if the driver supports lanes * parameter. * @cap_rss_ctx_supported: indicates if the driver supports RSS * contexts via legacy API, drivers implementing @create_rxfh_context * do not have to set this bit. - * @cap_rss_sym_xor_supported: indicates if the driver supports symmetric-xor - * RSS. * @rxfh_per_ctx_key: device supports setting different RSS key for each * additional context. Netlink API should report hfunc, key, and input_xfrm * for every context, not just context 0. @@ -995,9 +994,9 @@ struct kernel_ethtool_ts_info { * of the generic netdev features interface. */ struct ethtool_ops { + u32 supported_input_xfrm:8; u32 cap_link_lanes_supported:1; u32 cap_rss_ctx_supported:1; - u32 cap_rss_sym_xor_supported:1; u32 rxfh_per_ctx_key:1; u32 cap_rss_rxnfc_adds:1; u32 rxfh_indir_space; diff --git a/include/uapi/linux/ethtool.h b/include/uapi/linux/ethtool.h index 2feba0929a8a..84833cca29fe 100644 --- a/include/uapi/linux/ethtool.h +++ b/include/uapi/linux/ethtool.h @@ -2289,6 +2289,10 @@ static inline int ethtool_validate_duplex(__u8 duplex) * be exploited to reduce the RSS queue spread. */ #define RXH_XFRM_SYM_XOR (1 << 0) +/* Similar to SYM_XOR, except that one copy of the XOR'ed fields is replaced by + * an OR of the same fields + */ +#define RXH_XFRM_SYM_OR_XOR (1 << 1) #define RXH_XFRM_NO_CHANGE 0xff /* L2-L4 network traffic flow types */ diff --git a/net/ethtool/ioctl.c b/net/ethtool/ioctl.c index 271c7cef9ef3..77d714874eca 100644 --- a/net/ethtool/ioctl.c +++ b/net/ethtool/ioctl.c @@ -1011,11 +1011,11 @@ static noinline_for_stack int ethtool_set_rxnfc(struct net_device *dev, if (rc) return rc; - /* Sanity check: if symmetric-xor is set, then: + /* Sanity check: if symmetric-xor/symmetric-or-xor is set, then: * 1 - no other fields besides IP src/dst and/or L4 src/dst * 2 - If src is set, dst must also be set */ - if ((rxfh.input_xfrm & RXH_XFRM_SYM_XOR) && + if ((rxfh.input_xfrm & (RXH_XFRM_SYM_XOR | RXH_XFRM_SYM_OR_XOR)) && ((info.data & ~(RXH_IP_SRC | RXH_IP_DST | RXH_L4_B_0_1 | RXH_L4_B_2_3)) || (!!(info.data & RXH_IP_SRC) ^ !!(info.data & RXH_IP_DST)) || @@ -1388,11 +1388,11 @@ static noinline_for_stack int ethtool_set_rxfh(struct net_device *dev, return -EOPNOTSUPP; /* Check input data transformation capabilities */ if (rxfh.input_xfrm && rxfh.input_xfrm != RXH_XFRM_SYM_XOR && + rxfh.input_xfrm != RXH_XFRM_SYM_OR_XOR && rxfh.input_xfrm != RXH_XFRM_NO_CHANGE) return -EINVAL; if (rxfh.input_xfrm != RXH_XFRM_NO_CHANGE && - (rxfh.input_xfrm & RXH_XFRM_SYM_XOR) && - !ops->cap_rss_sym_xor_supported) + rxfh.input_xfrm & ~ops->supported_input_xfrm) return -EOPNOTSUPP; create = rxfh.rss_context == ETH_RXFH_CONTEXT_ALLOC; -- 2.51.0 From 4d20c9f2db83a066537e54f3cffc40e85f17ad37 Mon Sep 17 00:00:00 2001 From: Gal Pressman Date: Mon, 24 Feb 2025 19:44:14 +0200 Subject: [PATCH 04/16] net/mlx5e: Symmetric OR-XOR RSS hash control Allow control over the symmetric RSS hash, which was previously set to enabled by default by the driver. Symmetric OR-XOR RSS can now be queried and controlled using the 'ethtool -x/X' command. Reviewed-by: Cosmin Ratiu Reviewed-by: Tariq Toukan Signed-off-by: Gal Pressman Link: https://patch.msgid.link/20250224174416.499070-3-gal@nvidia.com Signed-off-by: Jakub Kicinski --- .../net/ethernet/mellanox/mlx5/core/en/rss.c | 13 +++++++++++-- .../net/ethernet/mellanox/mlx5/core/en/rss.h | 4 ++-- .../net/ethernet/mellanox/mlx5/core/en/rx_res.c | 11 ++++++----- .../net/ethernet/mellanox/mlx5/core/en/rx_res.h | 5 +++-- .../net/ethernet/mellanox/mlx5/core/en/tir.c | 2 +- .../net/ethernet/mellanox/mlx5/core/en/tir.h | 1 + .../ethernet/mellanox/mlx5/core/en_ethtool.c | 17 ++++++++++++++--- 7 files changed, 38 insertions(+), 15 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/rss.c b/drivers/net/ethernet/mellanox/mlx5/core/en/rss.c index 0d8ccc7b6c11..74cd111ee320 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/rss.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/rss.c @@ -161,6 +161,7 @@ static void mlx5e_rss_params_init(struct mlx5e_rss *rss) { enum mlx5_traffic_types tt; + rss->hash.symmetric = true; rss->hash.hfunc = ETH_RSS_HASH_TOP; netdev_rss_key_fill(rss->hash.toeplitz_hash_key, sizeof(rss->hash.toeplitz_hash_key)); @@ -566,7 +567,7 @@ inner_tir: return final_err; } -int mlx5e_rss_get_rxfh(struct mlx5e_rss *rss, u32 *indir, u8 *key, u8 *hfunc) +int mlx5e_rss_get_rxfh(struct mlx5e_rss *rss, u32 *indir, u8 *key, u8 *hfunc, bool *symmetric) { if (indir) memcpy(indir, rss->indir.table, @@ -579,11 +580,14 @@ int mlx5e_rss_get_rxfh(struct mlx5e_rss *rss, u32 *indir, u8 *key, u8 *hfunc) if (hfunc) *hfunc = rss->hash.hfunc; + if (symmetric) + *symmetric = rss->hash.symmetric; + return 0; } int mlx5e_rss_set_rxfh(struct mlx5e_rss *rss, const u32 *indir, - const u8 *key, const u8 *hfunc, + const u8 *key, const u8 *hfunc, const bool *symmetric, u32 *rqns, u32 *vhca_ids, unsigned int num_rqns) { bool changed_indir = false; @@ -623,6 +627,11 @@ int mlx5e_rss_set_rxfh(struct mlx5e_rss *rss, const u32 *indir, rss->indir.actual_table_size * sizeof(*rss->indir.table)); } + if (symmetric) { + rss->hash.symmetric = *symmetric; + changed_hash = true; + } + if (changed_indir && rss->enabled) { err = mlx5e_rss_apply(rss, rqns, vhca_ids, num_rqns); if (err) { diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/rss.h b/drivers/net/ethernet/mellanox/mlx5/core/en/rss.h index 72089f5f473c..8ac902190010 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/rss.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/rss.h @@ -47,9 +47,9 @@ void mlx5e_rss_disable(struct mlx5e_rss *rss); int mlx5e_rss_packet_merge_set_param(struct mlx5e_rss *rss, struct mlx5e_packet_merge_param *pkt_merge_param); -int mlx5e_rss_get_rxfh(struct mlx5e_rss *rss, u32 *indir, u8 *key, u8 *hfunc); +int mlx5e_rss_get_rxfh(struct mlx5e_rss *rss, u32 *indir, u8 *key, u8 *hfunc, bool *symmetric); int mlx5e_rss_set_rxfh(struct mlx5e_rss *rss, const u32 *indir, - const u8 *key, const u8 *hfunc, + const u8 *key, const u8 *hfunc, const bool *symmetric, u32 *rqns, u32 *vhca_ids, unsigned int num_rqns); struct mlx5e_rss_params_hash mlx5e_rss_get_hash(struct mlx5e_rss *rss); u8 mlx5e_rss_get_hash_fields(struct mlx5e_rss *rss, enum mlx5_traffic_types tt); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/rx_res.c b/drivers/net/ethernet/mellanox/mlx5/core/en/rx_res.c index 9d8b2f5f6c96..5fcbe47337b0 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/rx_res.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/rx_res.c @@ -194,7 +194,7 @@ void mlx5e_rx_res_rss_set_indir_uniform(struct mlx5e_rx_res *res, unsigned int n } int mlx5e_rx_res_rss_get_rxfh(struct mlx5e_rx_res *res, u32 rss_idx, - u32 *indir, u8 *key, u8 *hfunc) + u32 *indir, u8 *key, u8 *hfunc, bool *symmetric) { struct mlx5e_rss *rss; @@ -205,11 +205,12 @@ int mlx5e_rx_res_rss_get_rxfh(struct mlx5e_rx_res *res, u32 rss_idx, if (!rss) return -ENOENT; - return mlx5e_rss_get_rxfh(rss, indir, key, hfunc); + return mlx5e_rss_get_rxfh(rss, indir, key, hfunc, symmetric); } int mlx5e_rx_res_rss_set_rxfh(struct mlx5e_rx_res *res, u32 rss_idx, - const u32 *indir, const u8 *key, const u8 *hfunc) + const u32 *indir, const u8 *key, const u8 *hfunc, + const bool *symmetric) { u32 *vhca_ids = get_vhca_ids(res, 0); struct mlx5e_rss *rss; @@ -221,8 +222,8 @@ int mlx5e_rx_res_rss_set_rxfh(struct mlx5e_rx_res *res, u32 rss_idx, if (!rss) return -ENOENT; - return mlx5e_rss_set_rxfh(rss, indir, key, hfunc, res->rss_rqns, vhca_ids, - res->rss_nch); + return mlx5e_rss_set_rxfh(rss, indir, key, hfunc, symmetric, + res->rss_rqns, vhca_ids, res->rss_nch); } int mlx5e_rx_res_rss_get_hash_fields(struct mlx5e_rx_res *res, u32 rss_idx, diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/rx_res.h b/drivers/net/ethernet/mellanox/mlx5/core/en/rx_res.h index 05b438043bcb..3e09d91281af 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/rx_res.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/rx_res.h @@ -49,9 +49,10 @@ void mlx5e_rx_res_xsk_update(struct mlx5e_rx_res *res, struct mlx5e_channels *ch /* Configuration API */ void mlx5e_rx_res_rss_set_indir_uniform(struct mlx5e_rx_res *res, unsigned int nch); int mlx5e_rx_res_rss_get_rxfh(struct mlx5e_rx_res *res, u32 rss_idx, - u32 *indir, u8 *key, u8 *hfunc); + u32 *indir, u8 *key, u8 *hfunc, bool *symmetric); int mlx5e_rx_res_rss_set_rxfh(struct mlx5e_rx_res *res, u32 rss_idx, - const u32 *indir, const u8 *key, const u8 *hfunc); + const u32 *indir, const u8 *key, const u8 *hfunc, + const bool *symmetric); int mlx5e_rx_res_rss_get_hash_fields(struct mlx5e_rx_res *res, u32 rss_idx, enum mlx5_traffic_types tt); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/tir.c b/drivers/net/ethernet/mellanox/mlx5/core/en/tir.c index 11f724ad90db..19499072f67f 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/tir.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/tir.c @@ -124,7 +124,7 @@ void mlx5e_tir_builder_build_rss(struct mlx5e_tir_builder *builder, const size_t len = MLX5_FLD_SZ_BYTES(tirc, rx_hash_toeplitz_key); void *rss_key = MLX5_ADDR_OF(tirc, tirc, rx_hash_toeplitz_key); - MLX5_SET(tirc, tirc, rx_hash_symmetric, 1); + MLX5_SET(tirc, tirc, rx_hash_symmetric, rss_hash->symmetric); memcpy(rss_key, rss_hash->toeplitz_hash_key, len); } diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/tir.h b/drivers/net/ethernet/mellanox/mlx5/core/en/tir.h index 857a84bcd53a..e8df3aaf6562 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/tir.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/tir.h @@ -9,6 +9,7 @@ struct mlx5e_rss_params_hash { u8 hfunc; u8 toeplitz_hash_key[40]; + bool symmetric; }; struct mlx5e_rss_params_traffic_type { diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c b/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c index 1bf2212a0bb6..0cb515fa179f 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c @@ -1456,18 +1456,27 @@ static int mlx5e_get_rxfh(struct net_device *netdev, struct ethtool_rxfh_param * { struct mlx5e_priv *priv = netdev_priv(netdev); u32 rss_context = rxfh->rss_context; + bool symmetric; int err; mutex_lock(&priv->state_lock); err = mlx5e_rx_res_rss_get_rxfh(priv->rx_res, rss_context, - rxfh->indir, rxfh->key, &rxfh->hfunc); + rxfh->indir, rxfh->key, &rxfh->hfunc, &symmetric); mutex_unlock(&priv->state_lock); - return err; + + if (err) + return err; + + if (symmetric) + rxfh->input_xfrm = RXH_XFRM_SYM_OR_XOR; + + return 0; } static int mlx5e_set_rxfh(struct net_device *dev, struct ethtool_rxfh_param *rxfh, struct netlink_ext_ack *extack) { + bool symmetric = rxfh->input_xfrm == RXH_XFRM_SYM_OR_XOR; struct mlx5e_priv *priv = netdev_priv(dev); u32 *rss_context = &rxfh->rss_context; u8 hfunc = rxfh->hfunc; @@ -1502,7 +1511,8 @@ static int mlx5e_set_rxfh(struct net_device *dev, struct ethtool_rxfh_param *rxf err = mlx5e_rx_res_rss_set_rxfh(priv->rx_res, *rss_context, rxfh->indir, rxfh->key, - hfunc == ETH_RSS_HASH_NO_CHANGE ? NULL : &hfunc); + hfunc == ETH_RSS_HASH_NO_CHANGE ? NULL : &hfunc, + rxfh->input_xfrm == RXH_XFRM_NO_CHANGE ? NULL : &symmetric); unlock: mutex_unlock(&priv->state_lock); @@ -2611,6 +2621,7 @@ const struct ethtool_ops mlx5e_ethtool_ops = { ETHTOOL_COALESCE_MAX_FRAMES | ETHTOOL_COALESCE_USE_ADAPTIVE | ETHTOOL_COALESCE_USE_CQE, + .supported_input_xfrm = RXH_XFRM_SYM_OR_XOR, .get_drvinfo = mlx5e_get_drvinfo, .get_link = ethtool_op_get_link, .get_link_ext_state = mlx5e_get_link_ext_state, -- 2.51.0 From 0163250039c3a861f922a293f6108880b2d4516f Mon Sep 17 00:00:00 2001 From: Gal Pressman Date: Mon, 24 Feb 2025 19:44:15 +0200 Subject: [PATCH 05/16] selftests: drv-net: Make rand_port() get a port more reliably Instead of guessing a port and checking whether it's available, get an available port from the OS. Reviewed-by: Nimrod Oren Signed-off-by: Gal Pressman Link: https://patch.msgid.link/20250224174416.499070-4-gal@nvidia.com Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/lib/py/utils.py | 15 ++++----------- 1 file changed, 4 insertions(+), 11 deletions(-) diff --git a/tools/testing/selftests/net/lib/py/utils.py b/tools/testing/selftests/net/lib/py/utils.py index d879700ef2b9..a6af97c7e283 100644 --- a/tools/testing/selftests/net/lib/py/utils.py +++ b/tools/testing/selftests/net/lib/py/utils.py @@ -187,18 +187,11 @@ def ethtool(args, json=None, ns=None, host=None): def rand_port(): """ - Get a random unprivileged port, try to make sure it's not already used. + Get a random unprivileged port. """ - for _ in range(1000): - port = random.randint(10000, 65535) - try: - with socket.socket(socket.AF_INET6, socket.SOCK_STREAM) as s: - s.bind(("", port)) - return port - except OSError as e: - if e.errno != errno.EADDRINUSE: - raise - raise Exception("Can't find any free unprivileged port") + with socket.socket(socket.AF_INET6, socket.SOCK_STREAM) as s: + s.bind(("", 0)) + return s.getsockname()[1] def wait_port_listen(port, proto="tcp", ns=None, host=None, sleep=0.005, deadline=5): -- 2.51.0 From da87cabaf87702d43a016d255f11be5379892b6a Mon Sep 17 00:00:00 2001 From: Gal Pressman Date: Mon, 24 Feb 2025 19:44:16 +0200 Subject: [PATCH 06/16] selftests: drv-net-hw: Add a test for symmetric RSS hash Add a selftest that verifies symmetric RSS hash is working as intended. The test runs iterations of traffic, swapping the src/dst UDP ports, and verifies that the same RX queue is receiving the traffic in both cases. Reviewed-by: Nimrod Oren Signed-off-by: Gal Pressman Link: https://patch.msgid.link/20250224174416.499070-5-gal@nvidia.com Signed-off-by: Jakub Kicinski --- .../testing/selftests/drivers/net/hw/Makefile | 1 + .../drivers/net/hw/rss_input_xfrm.py | 87 +++++++++++++++++++ tools/testing/selftests/net/lib/py/utils.py | 4 +- 3 files changed, 90 insertions(+), 2 deletions(-) create mode 100755 tools/testing/selftests/drivers/net/hw/rss_input_xfrm.py diff --git a/tools/testing/selftests/drivers/net/hw/Makefile b/tools/testing/selftests/drivers/net/hw/Makefile index ae783e18be83..b2be14d0cc56 100644 --- a/tools/testing/selftests/drivers/net/hw/Makefile +++ b/tools/testing/selftests/drivers/net/hw/Makefile @@ -15,6 +15,7 @@ TEST_PROGS = \ nic_performance.py \ pp_alloc_fail.py \ rss_ctx.py \ + rss_input_xfrm.py \ tso.py \ # diff --git a/tools/testing/selftests/drivers/net/hw/rss_input_xfrm.py b/tools/testing/selftests/drivers/net/hw/rss_input_xfrm.py new file mode 100755 index 000000000000..53bb08cc29ec --- /dev/null +++ b/tools/testing/selftests/drivers/net/hw/rss_input_xfrm.py @@ -0,0 +1,87 @@ +#!/usr/bin/env python3 +# SPDX-License-Identifier: GPL-2.0 + +import multiprocessing +import socket +from lib.py import ksft_run, ksft_exit, ksft_eq, ksft_ge, cmd, fd_read_timeout +from lib.py import NetDrvEpEnv +from lib.py import EthtoolFamily, NetdevFamily +from lib.py import KsftSkipEx, KsftFailEx +from lib.py import rand_port + + +def traffic(cfg, local_port, remote_port, ipver): + af_inet = socket.AF_INET if ipver == "4" else socket.AF_INET6 + sock = socket.socket(af_inet, socket.SOCK_DGRAM) + sock.bind(("", local_port)) + sock.connect((cfg.remote_addr_v[ipver], remote_port)) + tgt = f"{ipver}:[{cfg.addr_v[ipver]}]:{local_port},sourceport={remote_port}" + cmd("echo a | socat - UDP" + tgt, host=cfg.remote) + fd_read_timeout(sock.fileno(), 5) + return sock.getsockopt(socket.SOL_SOCKET, socket.SO_INCOMING_CPU) + + +def test_rss_input_xfrm(cfg, ipver): + """ + Test symmetric input_xfrm. + If symmetric RSS hash is configured, send traffic twice, swapping the + src/dst UDP ports, and verify that the same queue is receiving the traffic + in both cases (IPs are constant). + """ + + if multiprocessing.cpu_count() < 2: + raise KsftSkipEx("Need at least two CPUs to test symmetric RSS hash") + + input_xfrm = cfg.ethnl.rss_get( + {'header': {'dev-name': cfg.ifname}}).get('input_xfrm') + + # Check for symmetric xor/or-xor + if not input_xfrm or (input_xfrm != 1 and input_xfrm != 2): + raise KsftSkipEx("Symmetric RSS hash not requested") + + cpus = set() + successful = 0 + for _ in range(100): + try: + port1 = rand_port(socket.SOCK_DGRAM) + port2 = rand_port(socket.SOCK_DGRAM) + cpu1 = traffic(cfg, port1, port2, ipver) + cpu2 = traffic(cfg, port2, port1, ipver) + cpus.update([cpu1, cpu2]) + ksft_eq( + cpu1, cpu2, comment=f"Received traffic on different cpus with ports ({port1 = }, {port2 = }) while symmetric hash is configured") + + successful += 1 + if successful == 10: + break + except: + continue + else: + raise KsftFailEx("Failed to run traffic") + + ksft_ge(len(cpus), 2, + comment=f"Received traffic on less than two cpus {cpus = }") + + +def test_rss_input_xfrm_ipv4(cfg): + cfg.require_ipver("4") + test_rss_input_xfrm(cfg, "4") + + +def test_rss_input_xfrm_ipv6(cfg): + cfg.require_ipver("6") + test_rss_input_xfrm(cfg, "6") + + +def main() -> None: + with NetDrvEpEnv(__file__, nsim_test=False) as cfg: + cfg.ethnl = EthtoolFamily() + cfg.netdevnl = NetdevFamily() + + ksft_run([test_rss_input_xfrm_ipv4, test_rss_input_xfrm_ipv6], + args=(cfg, )) + ksft_exit() + + +if __name__ == "__main__": + main() diff --git a/tools/testing/selftests/net/lib/py/utils.py b/tools/testing/selftests/net/lib/py/utils.py index a6af97c7e283..34470d65d871 100644 --- a/tools/testing/selftests/net/lib/py/utils.py +++ b/tools/testing/selftests/net/lib/py/utils.py @@ -185,11 +185,11 @@ def ethtool(args, json=None, ns=None, host=None): return tool('ethtool', args, json=json, ns=ns, host=host) -def rand_port(): +def rand_port(type=socket.SOCK_STREAM): """ Get a random unprivileged port. """ - with socket.socket(socket.AF_INET6, socket.SOCK_STREAM) as s: + with socket.socket(socket.AF_INET6, type) as s: s.bind(("", 0)) return s.getsockname()[1] -- 2.51.0 From 8fa19c2c69fbf1f615e84d24d75dcf001ea91ccb Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Mon, 24 Feb 2025 15:05:06 +0100 Subject: [PATCH 07/16] net: wangxun: fix LIBWX dependencies Selecting LIBWX requires that its dependencies are met first: WARNING: unmet direct dependencies detected for LIBWX Depends on [m]: NETDEVICES [=y] && ETHERNET [=y] && NET_VENDOR_WANGXUN [=y] && PTP_1588_CLOCK_OPTIONAL [=m] Selected by [y]: - TXGBE [=y] && NETDEVICES [=y] && ETHERNET [=y] && NET_VENDOR_WANGXUN [=y] && PCI [=y] && COMMON_CLK [=y] && I2C_DESIGNWARE_PLATFORM [=y] ld.lld-21: error: undefined symbol: ptp_schedule_worker >>> referenced by wx_ptp.c:747 (/home/arnd/arm-soc/drivers/net/ethernet/wangxun/libwx/wx_ptp.c:747) >>> drivers/net/ethernet/wangxun/libwx/wx_ptp.o:(wx_ptp_reset) in archive vmlinux.a Add the smae dependency on PTP_1588_CLOCK_OPTIONAL to the two driver using this library module. Fixes: 06e75161b9d4 ("net: wangxun: Add support for PTP clock") Signed-off-by: Arnd Bergmann Reviewed-by: Vadim Fedorenko Link: https://patch.msgid.link/20250224140516.1168214-1-arnd@kernel.org Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/wangxun/Kconfig | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/net/ethernet/wangxun/Kconfig b/drivers/net/ethernet/wangxun/Kconfig index 6b60173fe1f5..47e3e8434b9e 100644 --- a/drivers/net/ethernet/wangxun/Kconfig +++ b/drivers/net/ethernet/wangxun/Kconfig @@ -26,6 +26,7 @@ config LIBWX config NGBE tristate "Wangxun(R) GbE PCI Express adapters support" depends on PCI + depends on PTP_1588_CLOCK_OPTIONAL select LIBWX select PHYLINK help @@ -43,6 +44,7 @@ config TXGBE depends on PCI depends on COMMON_CLK depends on I2C_DESIGNWARE_PLATFORM + depends on PTP_1588_CLOCK_OPTIONAL select MARVELL_10G_PHY select REGMAP select PHYLINK -- 2.51.0 From 91c8d8e4b7a38dc099b26e14b22f814ca4e75089 Mon Sep 17 00:00:00 2001 From: John Daley Date: Mon, 24 Feb 2025 15:43:50 -0800 Subject: [PATCH 08/16] enic: add dependency on Page Pool Driver was not configured to select page_pool, causing a compile error if page pool module was not already selected. Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202502211253.3XRosM9I-lkp@intel.com/ Fixes: d24cb52b2d8a ("enic: Use the Page Pool API for RX") Reviewed-by: Nelson Escobar Signed-off-by: John Daley Reviewed-by: Joe Damato Link: https://patch.msgid.link/20250224234350.23157-2-johndale@cisco.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/cisco/enic/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/ethernet/cisco/enic/Kconfig b/drivers/net/ethernet/cisco/enic/Kconfig index ad80c0fa96a6..96709875fe4f 100644 --- a/drivers/net/ethernet/cisco/enic/Kconfig +++ b/drivers/net/ethernet/cisco/enic/Kconfig @@ -6,5 +6,6 @@ config ENIC tristate "Cisco VIC Ethernet NIC Support" depends on PCI + select PAGE_POOL help This enables the support for the Cisco VIC Ethernet card. -- 2.51.0 From 3ba075278c11cdb19e2dbb80362042f1b0c08f74 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 25 Feb 2025 17:10:48 +0000 Subject: [PATCH 09/16] tcp: be less liberal in TSEcr received while in SYN_RECV state Yong-Hao Zou mentioned that linux was not strict as other OS in 3WHS, for flows using TCP TS option (RFC 7323) As hinted by an old comment in tcp_check_req(), we can check the TSEcr value in the incoming packet corresponds to one of the SYNACK TSval values we have sent. In this patch, I record the oldest and most recent values that SYNACK packets have used. Send a challenge ACK if we receive a TSEcr outside of this range, and increase a new SNMP counter. nstat -az | grep TSEcrRejected TcpExtTSEcrRejected 0 0.0 Due to TCP fastopen implementation, do not apply yet these checks for fastopen flows. v2: No longer use req->num_timeout, but treq->snt_tsval_first to detect when first SYNACK is prepared. This means we make sure to not send an initial zero TSval. Make sure MPTCP and TCP selftests are passing. Change MIB name to TcpExtTSEcrRejected v1: https://lore.kernel.org/netdev/CADVnQykD8i4ArpSZaPKaoNxLJ2if2ts9m4As+=Jvdkrgx1qMHw@mail.gmail.com/T/ Reported-by: Yong-Hao Zou Signed-off-by: Eric Dumazet Reviewed-by: Matthieu Baerts (NGI0) Reviewed-by: Neal Cardwell Reviewed-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20250225171048.3105061-1-edumazet@google.com Signed-off-by: Jakub Kicinski --- .../networking/net_cachelines/snmp.rst | 1 + include/linux/tcp.h | 2 ++ include/uapi/linux/snmp.h | 1 + net/ipv4/proc.c | 1 + net/ipv4/syncookies.c | 1 + net/ipv4/tcp_input.c | 1 + net/ipv4/tcp_minisocks.c | 26 +++++++++++-------- net/ipv4/tcp_output.c | 6 +++++ 8 files changed, 28 insertions(+), 11 deletions(-) diff --git a/Documentation/networking/net_cachelines/snmp.rst b/Documentation/networking/net_cachelines/snmp.rst index 90ca2d92547d..bc96efc92cf5 100644 --- a/Documentation/networking/net_cachelines/snmp.rst +++ b/Documentation/networking/net_cachelines/snmp.rst @@ -36,6 +36,7 @@ unsigned_long LINUX_MIB_TIMEWAITRECYCLED unsigned_long LINUX_MIB_TIMEWAITKILLED unsigned_long LINUX_MIB_PAWSACTIVEREJECTED unsigned_long LINUX_MIB_PAWSESTABREJECTED +unsigned_long LINUX_MIB_TSECR_REJECTED unsigned_long LINUX_MIB_DELAYEDACKLOST unsigned_long LINUX_MIB_LISTENOVERFLOWS unsigned_long LINUX_MIB_LISTENDROPS diff --git a/include/linux/tcp.h b/include/linux/tcp.h index f88daaa76d83..159b2c59eb62 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -160,6 +160,8 @@ struct tcp_request_sock { u32 rcv_isn; u32 snt_isn; u32 ts_off; + u32 snt_tsval_first; + u32 snt_tsval_last; u32 last_oow_ack_time; /* last SYNACK */ u32 rcv_nxt; /* the ack # by SYNACK. For * FastOpen it's the seq# diff --git a/include/uapi/linux/snmp.h b/include/uapi/linux/snmp.h index 848c7784e684..eb9fb776fdc3 100644 --- a/include/uapi/linux/snmp.h +++ b/include/uapi/linux/snmp.h @@ -186,6 +186,7 @@ enum LINUX_MIB_TIMEWAITKILLED, /* TimeWaitKilled */ LINUX_MIB_PAWSACTIVEREJECTED, /* PAWSActiveRejected */ LINUX_MIB_PAWSESTABREJECTED, /* PAWSEstabRejected */ + LINUX_MIB_TSECRREJECTED, /* TSEcrRejected */ LINUX_MIB_PAWS_OLD_ACK, /* PAWSOldAck */ LINUX_MIB_DELAYEDACKS, /* DelayedACKs */ LINUX_MIB_DELAYEDACKLOCKED, /* DelayedACKLocked */ diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c index affd21a0f572..10cbeb76c274 100644 --- a/net/ipv4/proc.c +++ b/net/ipv4/proc.c @@ -189,6 +189,7 @@ static const struct snmp_mib snmp4_net_list[] = { SNMP_MIB_ITEM("TWKilled", LINUX_MIB_TIMEWAITKILLED), SNMP_MIB_ITEM("PAWSActive", LINUX_MIB_PAWSACTIVEREJECTED), SNMP_MIB_ITEM("PAWSEstab", LINUX_MIB_PAWSESTABREJECTED), + SNMP_MIB_ITEM("TSEcrRejected", LINUX_MIB_TSECRREJECTED), SNMP_MIB_ITEM("PAWSOldAck", LINUX_MIB_PAWS_OLD_ACK), SNMP_MIB_ITEM("DelayedACKs", LINUX_MIB_DELAYEDACKS), SNMP_MIB_ITEM("DelayedACKLocked", LINUX_MIB_DELAYEDACKLOCKED), diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c index 26816b876dd8..5459a78b9809 100644 --- a/net/ipv4/syncookies.c +++ b/net/ipv4/syncookies.c @@ -279,6 +279,7 @@ static int cookie_tcp_reqsk_init(struct sock *sk, struct sk_buff *skb, ireq->smc_ok = 0; treq->snt_synack = 0; + treq->snt_tsval_first = 0; treq->tfo_listener = false; treq->txhash = net_tx_rndhash(); treq->rcv_isn = ntohl(th->seq) - 1; diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 217a8747a79b..d22ad553b45b 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -7081,6 +7081,7 @@ static void tcp_openreq_init(struct request_sock *req, tcp_rsk(req)->rcv_isn = TCP_SKB_CB(skb)->seq; tcp_rsk(req)->rcv_nxt = TCP_SKB_CB(skb)->seq + 1; tcp_rsk(req)->snt_synack = 0; + tcp_rsk(req)->snt_tsval_first = 0; tcp_rsk(req)->last_oow_ack_time = 0; req->mss = rx_opt->mss_clamp; req->ts_recent = rx_opt->saw_tstamp ? rx_opt->rcv_tsval : 0; diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index 1eccc518b957..4f87406ddbcd 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -663,6 +663,7 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb, struct sock *child; const struct tcphdr *th = tcp_hdr(skb); __be32 flg = tcp_flag_word(th) & (TCP_FLAG_RST|TCP_FLAG_SYN|TCP_FLAG_ACK); + bool tsecr_reject = false; bool paws_reject = false; bool own_req; @@ -672,8 +673,13 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb, if (tmp_opt.saw_tstamp) { tmp_opt.ts_recent = READ_ONCE(req->ts_recent); - if (tmp_opt.rcv_tsecr) + if (tmp_opt.rcv_tsecr) { + if (inet_rsk(req)->tstamp_ok && !fastopen) + tsecr_reject = !between(tmp_opt.rcv_tsecr, + tcp_rsk(req)->snt_tsval_first, + READ_ONCE(tcp_rsk(req)->snt_tsval_last)); tmp_opt.rcv_tsecr -= tcp_rsk(req)->ts_off; + } /* We do not store true stamp, but it is not required, * it can be estimated (approximately) * from another data. @@ -788,18 +794,14 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb, tcp_rsk(req)->snt_isn + 1)) return sk; - /* Also, it would be not so bad idea to check rcv_tsecr, which - * is essentially ACK extension and too early or too late values - * should cause reset in unsynchronized states. - */ - /* RFC793: "first check sequence number". */ - if (paws_reject || !tcp_in_window(TCP_SKB_CB(skb)->seq, - TCP_SKB_CB(skb)->end_seq, - tcp_rsk(req)->rcv_nxt, - tcp_rsk(req)->rcv_nxt + - tcp_synack_window(req))) { + if (paws_reject || tsecr_reject || + !tcp_in_window(TCP_SKB_CB(skb)->seq, + TCP_SKB_CB(skb)->end_seq, + tcp_rsk(req)->rcv_nxt, + tcp_rsk(req)->rcv_nxt + + tcp_synack_window(req))) { /* Out of window: send ACK and drop. */ if (!(flg & TCP_FLAG_RST) && !tcp_oow_rate_limited(sock_net(sk), skb, @@ -808,6 +810,8 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb, req->rsk_ops->send_ack(sk, skb, req); if (paws_reject) NET_INC_STATS(sock_net(sk), LINUX_MIB_PAWSESTABREJECTED); + else if (tsecr_reject) + NET_INC_STATS(sock_net(sk), LINUX_MIB_TSECRREJECTED); return NULL; } diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 9a3cf51eab78..0a660075add5 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -943,6 +943,12 @@ static unsigned int tcp_synack_options(const struct sock *sk, opts->options |= OPTION_TS; opts->tsval = tcp_skb_timestamp_ts(tcp_rsk(req)->req_usec_ts, skb) + tcp_rsk(req)->ts_off; + if (!tcp_rsk(req)->snt_tsval_first) { + if (!opts->tsval) + opts->tsval = ~0U; + tcp_rsk(req)->snt_tsval_first = opts->tsval; + } + WRITE_ONCE(tcp_rsk(req)->snt_tsval_last, opts->tsval); opts->tsecr = READ_ONCE(req->ts_recent); remaining -= TCPOLEN_TSTAMP_ALIGNED; } -- 2.51.0 From c1d6d629ab0b3dd991c7fb04587dd2adf2c4b426 Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Mon, 24 Feb 2025 21:23:58 -0500 Subject: [PATCH 10/16] selftests/net: prepare cmsg_ipv6.sh for ipv4 Move IPV6_TCLASS and IPV6_HOPLIMIT into loops, to be able to use them for IP_TOS and IP_TTL in a follow-on patch. Indentation in this file is a mix of four spaces and tabs for double indents. To minimize code churn, maintain that pattern. Very small diff if viewing with -w. Signed-off-by: Willem de Bruijn Link: https://patch.msgid.link/20250225022431.2083926-2-willemdebruijn.kernel@gmail.com Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/cmsg_ipv6.sh | 109 ++++++++++++----------- 1 file changed, 59 insertions(+), 50 deletions(-) diff --git a/tools/testing/selftests/net/cmsg_ipv6.sh b/tools/testing/selftests/net/cmsg_ipv6.sh index 8bc23fb4c82b..51132c26e9b5 100755 --- a/tools/testing/selftests/net/cmsg_ipv6.sh +++ b/tools/testing/selftests/net/cmsg_ipv6.sh @@ -64,76 +64,85 @@ for ovr in setsock cmsg both diff; do done # IPV6_TCLASS -TOS=0x10 -TOS2=0x20 -ip -6 -netns $NS rule add tos $TOS lookup 300 -ip -6 -netns $NS route add table 300 prohibit any +test_dscp() { + local -r TOS=0x10 + local -r TOS2=0x20 -for ovr in setsock cmsg both diff; do - for p in u i r; do - [ $p == "u" ] && prot=UDP - [ $p == "i" ] && prot=ICMP - [ $p == "r" ] && prot=RAW + ip -6 -netns $NS rule add tos $TOS lookup 300 + ip -6 -netns $NS route add table 300 prohibit any - [ $ovr == "setsock" ] && m="-C" - [ $ovr == "cmsg" ] && m="-c" - [ $ovr == "both" ] && m="-C $((TOS2)) -c" - [ $ovr == "diff" ] && m="-C $((TOS )) -c" + for ovr in setsock cmsg both diff; do + for p in u i r; do + [ $p == "u" ] && prot=UDP + [ $p == "i" ] && prot=ICMP + [ $p == "r" ] && prot=RAW - $NSEXE nohup tcpdump --immediate-mode -p -ni dummy0 -w $TMPF -c 4 2> /dev/null & - BG=$! - sleep 0.05 + [ $ovr == "setsock" ] && m="-C" + [ $ovr == "cmsg" ] && m="-c" + [ $ovr == "both" ] && m="-C $((TOS2)) -c" + [ $ovr == "diff" ] && m="-C $((TOS )) -c" - $NSEXE ./cmsg_sender -6 -p $p $m $((TOS2)) $TGT6 1234 - check_result $? 0 "TCLASS $prot $ovr - pass" + $NSEXE nohup tcpdump --immediate-mode -p -ni dummy0 -w $TMPF -c 4 2> /dev/null & + BG=$! + sleep 0.05 - while [ -d /proc/$BG ]; do $NSEXE ./cmsg_sender -6 -p $p $m $((TOS2)) $TGT6 1234 - done + check_result $? 0 "TCLASS $prot $ovr - pass" - tcpdump -r $TMPF -v 2>&1 | grep "class $TOS2" >> /dev/null - check_result $? 0 "TCLASS $prot $ovr - packet data" - rm $TMPF + while [ -d /proc/$BG ]; do + $NSEXE ./cmsg_sender -6 -p $p $m $((TOS2)) $TGT6 1234 + done - [ $ovr == "both" ] && m="-C $((TOS )) -c" - [ $ovr == "diff" ] && m="-C $((TOS2)) -c" + tcpdump -r $TMPF -v 2>&1 | grep "class $TOS2" >> /dev/null + check_result $? 0 "TCLASS $prot $ovr - packet data" + rm $TMPF - $NSEXE ./cmsg_sender -6 -p $p $m $((TOS)) -s $TGT6 1234 - check_result $? 1 "TCLASS $prot $ovr - rejection" + [ $ovr == "both" ] && m="-C $((TOS )) -c" + [ $ovr == "diff" ] && m="-C $((TOS2)) -c" + + $NSEXE ./cmsg_sender -6 -p $p $m $((TOS)) -s $TGT6 1234 + check_result $? 1 "TCLASS $prot $ovr - rejection" + done done -done +} -# IPV6_HOPLIMIT -LIM=4 +test_dscp -for ovr in setsock cmsg both diff; do - for p in u i r; do - [ $p == "u" ] && prot=UDP - [ $p == "i" ] && prot=ICMP - [ $p == "r" ] && prot=RAW +# IPV6_HOPLIMIT +test_hoplimit() { + local -r LIM=4 - [ $ovr == "setsock" ] && m="-L" - [ $ovr == "cmsg" ] && m="-l" - [ $ovr == "both" ] && m="-L $LIM -l" - [ $ovr == "diff" ] && m="-L $((LIM + 1)) -l" + for ovr in setsock cmsg both diff; do + for p in u i r; do + [ $p == "u" ] && prot=UDP + [ $p == "i" ] && prot=ICMP + [ $p == "r" ] && prot=RAW - $NSEXE nohup tcpdump --immediate-mode -p -ni dummy0 -w $TMPF -c 4 2> /dev/null & - BG=$! - sleep 0.05 + [ $ovr == "setsock" ] && m="-L" + [ $ovr == "cmsg" ] && m="-l" + [ $ovr == "both" ] && m="-L $LIM -l" + [ $ovr == "diff" ] && m="-L $((LIM + 1)) -l" - $NSEXE ./cmsg_sender -6 -p $p $m $LIM $TGT6 1234 - check_result $? 0 "HOPLIMIT $prot $ovr - pass" + $NSEXE nohup tcpdump --immediate-mode -p -ni dummy0 -w $TMPF -c 4 2> /dev/null & + BG=$! + sleep 0.05 - while [ -d /proc/$BG ]; do $NSEXE ./cmsg_sender -6 -p $p $m $LIM $TGT6 1234 - done + check_result $? 0 "HOPLIMIT $prot $ovr - pass" - tcpdump -r $TMPF -v 2>&1 | grep "hlim $LIM[^0-9]" >> /dev/null - check_result $? 0 "HOPLIMIT $prot $ovr - packet data" - rm $TMPF + while [ -d /proc/$BG ]; do + $NSEXE ./cmsg_sender -6 -p $p $m $LIM $TGT6 1234 + done + + tcpdump -r $TMPF -v 2>&1 | grep "hlim $LIM[^0-9]" >> /dev/null + check_result $? 0 "HOPLIMIT $prot $ovr - packet data" + rm $TMPF + done done -done +} + +test_hoplimit # IPV6 exthdr for p in u i r; do -- 2.51.0 From 2e5584e0f913b53630238223443ae2a67eea714c Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Mon, 24 Feb 2025 21:23:59 -0500 Subject: [PATCH 11/16] selftests/net: expand cmsg_ipv6.sh with ipv4 Expand IPV6_TCLASS to also cover IP_TOS. Expand IPV6_HOPLIMIT to also cover IP_TTL. Expand csmg_sender.c to allow setting IPv4 setsockopts. Also rename struct v6 to cmsg to match its expanded scope. Don't bother updating all occurrences of tclass and hoplimit. Rename cmsg_ipv6.sh to cmsg_ip.sh to match the expanded scope. Be careful around the subtle API difference between TCLASS and TOS. IP_TOS includes ECN bits. Add a test to verify that these are masked when making routing decisions. Diff is more concise with --word-diff Signed-off-by: Willem de Bruijn Link: https://patch.msgid.link/20250225022431.2083926-3-willemdebruijn.kernel@gmail.com Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/Makefile | 2 +- .../net/{cmsg_ipv6.sh => cmsg_ip.sh} | 59 ++++++++---- tools/testing/selftests/net/cmsg_sender.c | 90 +++++++++++-------- 3 files changed, 96 insertions(+), 55 deletions(-) rename tools/testing/selftests/net/{cmsg_ipv6.sh => cmsg_ip.sh} (65%) diff --git a/tools/testing/selftests/net/Makefile b/tools/testing/selftests/net/Makefile index aeb96c085614..2e16ce6d28e4 100644 --- a/tools/testing/selftests/net/Makefile +++ b/tools/testing/selftests/net/Makefile @@ -34,7 +34,7 @@ TEST_PROGS += gre_gso.sh TEST_PROGS += cmsg_so_mark.sh TEST_PROGS += cmsg_so_priority.sh TEST_PROGS += test_so_rcv.sh -TEST_PROGS += cmsg_time.sh cmsg_ipv6.sh +TEST_PROGS += cmsg_time.sh cmsg_ip.sh TEST_PROGS += netns-name.sh TEST_PROGS += link_netns.py TEST_PROGS += nl_netdev.py diff --git a/tools/testing/selftests/net/cmsg_ipv6.sh b/tools/testing/selftests/net/cmsg_ip.sh similarity index 65% rename from tools/testing/selftests/net/cmsg_ipv6.sh rename to tools/testing/selftests/net/cmsg_ip.sh index 51132c26e9b5..2a52520aca32 100755 --- a/tools/testing/selftests/net/cmsg_ipv6.sh +++ b/tools/testing/selftests/net/cmsg_ip.sh @@ -3,6 +3,8 @@ source lib.sh +IP4=172.16.0.1/24 +TGT4=172.16.0.2 IP6=2001:db8:1::1/64 TGT6=2001:db8:1::2 TMPF=$(mktemp --suffix ".pcap") @@ -30,6 +32,7 @@ $NSEXE sysctl -w net.ipv4.ping_group_range='0 2147483647' > /dev/null # Connectivity ip -netns $NS link add type dummy ip -netns $NS link set dev dummy0 up +ip -netns $NS addr add $IP4 dev dummy0 ip -netns $NS addr add $IP6 dev dummy0 # Test @@ -63,14 +66,19 @@ for ovr in setsock cmsg both diff; do done done -# IPV6_TCLASS +# IP_TOS + IPV6_TCLASS test_dscp() { + local -r IPVER=$1 + local -r TGT=$2 + local -r MATCH=$3 + local -r TOS=0x10 local -r TOS2=0x20 + local -r ECN=0x3 - ip -6 -netns $NS rule add tos $TOS lookup 300 - ip -6 -netns $NS route add table 300 prohibit any + ip $IPVER -netns $NS rule add tos $TOS lookup 300 + ip $IPVER -netns $NS route add table 300 prohibit any for ovr in setsock cmsg both diff; do for p in u i r; do @@ -87,30 +95,42 @@ test_dscp() { BG=$! sleep 0.05 - $NSEXE ./cmsg_sender -6 -p $p $m $((TOS2)) $TGT6 1234 - check_result $? 0 "TCLASS $prot $ovr - pass" + $NSEXE ./cmsg_sender $IPVER -p $p $m $((TOS2)) $TGT 1234 + check_result $? 0 "$MATCH $prot $ovr - pass" while [ -d /proc/$BG ]; do - $NSEXE ./cmsg_sender -6 -p $p $m $((TOS2)) $TGT6 1234 + $NSEXE ./cmsg_sender $IPVER -p $p $m $((TOS2)) $TGT 1234 done - tcpdump -r $TMPF -v 2>&1 | grep "class $TOS2" >> /dev/null - check_result $? 0 "TCLASS $prot $ovr - packet data" + tcpdump -r $TMPF -v 2>&1 | grep "$MATCH $TOS2" >> /dev/null + check_result $? 0 "$MATCH $prot $ovr - packet data" rm $TMPF [ $ovr == "both" ] && m="-C $((TOS )) -c" [ $ovr == "diff" ] && m="-C $((TOS2)) -c" - $NSEXE ./cmsg_sender -6 -p $p $m $((TOS)) -s $TGT6 1234 - check_result $? 1 "TCLASS $prot $ovr - rejection" + # Match prohibit rule: expect failure + $NSEXE ./cmsg_sender $IPVER -p $p $m $((TOS)) -s $TGT 1234 + check_result $? 1 "$MATCH $prot $ovr - rejection" + + # Match prohibit rule: IPv4 masks ECN: expect failure + if [[ "$IPVER" == "-4" ]]; then + $NSEXE ./cmsg_sender $IPVER -p $p $m "$((TOS | ECN))" -s $TGT 1234 + check_result $? 1 "$MATCH $prot $ovr - rejection (ECN)" + fi done done } -test_dscp +test_dscp -4 $TGT4 tos +test_dscp -6 $TGT6 class + +# IP_TTL + IPV6_HOPLIMIT +test_ttl_hoplimit() { + local -r IPVER=$1 + local -r TGT=$2 + local -r MATCH=$3 -# IPV6_HOPLIMIT -test_hoplimit() { local -r LIM=4 for ovr in setsock cmsg both diff; do @@ -128,21 +148,22 @@ test_hoplimit() { BG=$! sleep 0.05 - $NSEXE ./cmsg_sender -6 -p $p $m $LIM $TGT6 1234 - check_result $? 0 "HOPLIMIT $prot $ovr - pass" + $NSEXE ./cmsg_sender $IPVER -p $p $m $LIM $TGT 1234 + check_result $? 0 "$MATCH $prot $ovr - pass" while [ -d /proc/$BG ]; do - $NSEXE ./cmsg_sender -6 -p $p $m $LIM $TGT6 1234 + $NSEXE ./cmsg_sender $IPVER -p $p $m $LIM $TGT 1234 done - tcpdump -r $TMPF -v 2>&1 | grep "hlim $LIM[^0-9]" >> /dev/null - check_result $? 0 "HOPLIMIT $prot $ovr - packet data" + tcpdump -r $TMPF -v 2>&1 | grep "$MATCH $LIM[^0-9]" >> /dev/null + check_result $? 0 "$MATCH $prot $ovr - packet data" rm $TMPF done done } -test_hoplimit +test_ttl_hoplimit -4 $TGT4 ttl +test_ttl_hoplimit -6 $TGT6 hlim # IPV6 exthdr for p in u i r; do diff --git a/tools/testing/selftests/net/cmsg_sender.c b/tools/testing/selftests/net/cmsg_sender.c index bc314382e4e1..19bd8499031b 100644 --- a/tools/testing/selftests/net/cmsg_sender.c +++ b/tools/testing/selftests/net/cmsg_sender.c @@ -72,7 +72,7 @@ struct options { struct option_cmsg_u32 tclass; struct option_cmsg_u32 hlimit; struct option_cmsg_u32 exthdr; - } v6; + } cmsg; } opt = { .size = 13, .num_pkt = 1, @@ -104,10 +104,10 @@ static void __attribute__((noreturn)) cs_usage(const char *bin) "\t\t-t Enable time stamp reporting\n" "\t\t-f val Set don't fragment via cmsg\n" "\t\t-F val Set don't fragment via setsockopt\n" - "\t\t-c val Set TCLASS via cmsg\n" - "\t\t-C val Set TCLASS via setsockopt\n" - "\t\t-l val Set HOPLIMIT via cmsg\n" - "\t\t-L val Set HOPLIMIT via setsockopt\n" + "\t\t-c val Set TOS/TCLASS via cmsg\n" + "\t\t-C val Set TOS/TCLASS via setsockopt\n" + "\t\t-l val Set TTL/HOPLIMIT via cmsg\n" + "\t\t-L val Set TTL/HOPLIMIT via setsockopt\n" "\t\t-H type Add an IPv6 header option\n" "\t\t (h = HOP; d = DST; r = RTDST)" ""); @@ -169,37 +169,37 @@ static void cs_parse_args(int argc, char *argv[]) opt.ts.ena = true; break; case 'f': - opt.v6.dontfrag.ena = true; - opt.v6.dontfrag.val = atoi(optarg); + opt.cmsg.dontfrag.ena = true; + opt.cmsg.dontfrag.val = atoi(optarg); break; case 'F': opt.sockopt.dontfrag = atoi(optarg); break; case 'c': - opt.v6.tclass.ena = true; - opt.v6.tclass.val = atoi(optarg); + opt.cmsg.tclass.ena = true; + opt.cmsg.tclass.val = atoi(optarg); break; case 'C': opt.sockopt.tclass = atoi(optarg); break; case 'l': - opt.v6.hlimit.ena = true; - opt.v6.hlimit.val = atoi(optarg); + opt.cmsg.hlimit.ena = true; + opt.cmsg.hlimit.val = atoi(optarg); break; case 'L': opt.sockopt.hlimit = atoi(optarg); break; case 'H': - opt.v6.exthdr.ena = true; + opt.cmsg.exthdr.ena = true; switch (optarg[0]) { case 'h': - opt.v6.exthdr.val = IPV6_HOPOPTS; + opt.cmsg.exthdr.val = IPV6_HOPOPTS; break; case 'd': - opt.v6.exthdr.val = IPV6_DSTOPTS; + opt.cmsg.exthdr.val = IPV6_DSTOPTS; break; case 'r': - opt.v6.exthdr.val = IPV6_RTHDRDSTOPTS; + opt.cmsg.exthdr.val = IPV6_RTHDRDSTOPTS; break; default: printf("Error: hdr type: %s\n", optarg); @@ -261,12 +261,20 @@ cs_write_cmsg(int fd, struct msghdr *msg, char *cbuf, size_t cbuf_sz) SOL_SOCKET, SO_MARK, &opt.mark); ca_write_cmsg_u32(cbuf, cbuf_sz, &cmsg_len, SOL_SOCKET, SO_PRIORITY, &opt.priority); - ca_write_cmsg_u32(cbuf, cbuf_sz, &cmsg_len, - SOL_IPV6, IPV6_DONTFRAG, &opt.v6.dontfrag); - ca_write_cmsg_u32(cbuf, cbuf_sz, &cmsg_len, - SOL_IPV6, IPV6_TCLASS, &opt.v6.tclass); - ca_write_cmsg_u32(cbuf, cbuf_sz, &cmsg_len, - SOL_IPV6, IPV6_HOPLIMIT, &opt.v6.hlimit); + + if (opt.sock.family == AF_INET) { + ca_write_cmsg_u32(cbuf, cbuf_sz, &cmsg_len, + SOL_IP, IP_TOS, &opt.cmsg.tclass); + ca_write_cmsg_u32(cbuf, cbuf_sz, &cmsg_len, + SOL_IP, IP_TTL, &opt.cmsg.hlimit); + } else { + ca_write_cmsg_u32(cbuf, cbuf_sz, &cmsg_len, + SOL_IPV6, IPV6_DONTFRAG, &opt.cmsg.dontfrag); + ca_write_cmsg_u32(cbuf, cbuf_sz, &cmsg_len, + SOL_IPV6, IPV6_TCLASS, &opt.cmsg.tclass); + ca_write_cmsg_u32(cbuf, cbuf_sz, &cmsg_len, + SOL_IPV6, IPV6_HOPLIMIT, &opt.cmsg.hlimit); + } if (opt.txtime.ena) { __u64 txtime; @@ -297,14 +305,14 @@ cs_write_cmsg(int fd, struct msghdr *msg, char *cbuf, size_t cbuf_sz) *(__u32 *)CMSG_DATA(cmsg) = SOF_TIMESTAMPING_TX_SCHED | SOF_TIMESTAMPING_TX_SOFTWARE; } - if (opt.v6.exthdr.ena) { + if (opt.cmsg.exthdr.ena) { cmsg = (struct cmsghdr *)(cbuf + cmsg_len); cmsg_len += CMSG_SPACE(8); if (cbuf_sz < cmsg_len) error(ERN_CMSG_WR, EFAULT, "cmsg buffer too small"); cmsg->cmsg_level = SOL_IPV6; - cmsg->cmsg_type = opt.v6.exthdr.val; + cmsg->cmsg_type = opt.cmsg.exthdr.val; cmsg->cmsg_len = CMSG_LEN(8); *(__u64 *)CMSG_DATA(cmsg) = 0; } @@ -405,23 +413,35 @@ static void ca_set_sockopts(int fd) setsockopt(fd, SOL_SOCKET, SO_MARK, &opt.sockopt.mark, sizeof(opt.sockopt.mark))) error(ERN_SOCKOPT, errno, "setsockopt SO_MARK"); - if (opt.sockopt.dontfrag && - setsockopt(fd, SOL_IPV6, IPV6_DONTFRAG, - &opt.sockopt.dontfrag, sizeof(opt.sockopt.dontfrag))) - error(ERN_SOCKOPT, errno, "setsockopt IPV6_DONTFRAG"); - if (opt.sockopt.tclass && - setsockopt(fd, SOL_IPV6, IPV6_TCLASS, - &opt.sockopt.tclass, sizeof(opt.sockopt.tclass))) - error(ERN_SOCKOPT, errno, "setsockopt IPV6_TCLASS"); - if (opt.sockopt.hlimit && - setsockopt(fd, SOL_IPV6, IPV6_UNICAST_HOPS, - &opt.sockopt.hlimit, sizeof(opt.sockopt.hlimit))) - error(ERN_SOCKOPT, errno, "setsockopt IPV6_HOPLIMIT"); if (opt.sockopt.priority && setsockopt(fd, SOL_SOCKET, SO_PRIORITY, &opt.sockopt.priority, sizeof(opt.sockopt.priority))) error(ERN_SOCKOPT, errno, "setsockopt SO_PRIORITY"); + if (opt.sock.family == AF_INET) { + if (opt.sockopt.tclass && + setsockopt(fd, SOL_IP, IP_TOS, + &opt.sockopt.tclass, sizeof(opt.sockopt.tclass))) + error(ERN_SOCKOPT, errno, "setsockopt IP_TOS"); + if (opt.sockopt.hlimit && + setsockopt(fd, SOL_IP, IP_TTL, + &opt.sockopt.hlimit, sizeof(opt.sockopt.hlimit))) + error(ERN_SOCKOPT, errno, "setsockopt IP_TTL"); + } else { + if (opt.sockopt.dontfrag && + setsockopt(fd, SOL_IPV6, IPV6_DONTFRAG, + &opt.sockopt.dontfrag, sizeof(opt.sockopt.dontfrag))) + error(ERN_SOCKOPT, errno, "setsockopt IPV6_DONTFRAG"); + if (opt.sockopt.tclass && + setsockopt(fd, SOL_IPV6, IPV6_TCLASS, + &opt.sockopt.tclass, sizeof(opt.sockopt.tclass))) + error(ERN_SOCKOPT, errno, "setsockopt IPV6_TCLASS"); + if (opt.sockopt.hlimit && + setsockopt(fd, SOL_IPV6, IPV6_UNICAST_HOPS, + &opt.sockopt.hlimit, sizeof(opt.sockopt.hlimit))) + error(ERN_SOCKOPT, errno, "setsockopt IPV6_HOPLIMIT"); + } + if (opt.txtime.ena) { struct sock_txtime so_txtime = { .clockid = CLOCK_MONOTONIC, -- 2.51.0 From e6116fc605574bb58c2016938ff24a7fbafe6e2a Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Mon, 24 Feb 2025 21:33:55 -0500 Subject: [PATCH 12/16] net: skb: free up one bit in tx_flags The linked series wants to add skb tx completion timestamps. That needs a bit in skb_shared_info.tx_flags, but all are in use. A per-skb bit is only needed for features that are configured on a per packet basis. Per socket features can be read from sk->sk_tsflags. Per packet tsflags can be set in sendmsg using cmsg, but only those in SOF_TIMESTAMPING_TX_RECORD_MASK. Per packet tsflags can also be set without cmsg by sandwiching a send inbetween two setsockopts: val |= SOF_TIMESTAMPING_$FEATURE; setsockopt(fd, SOL_SOCKET, SO_TIMESTAMPING, &val, sizeof(val)); write(fd, buf, sz); val &= ~SOF_TIMESTAMPING_$FEATURE; setsockopt(fd, SOL_SOCKET, SO_TIMESTAMPING, &val, sizeof(val)); Changing a datapath test from skb_shinfo(skb)->tx_flags to skb->sk->sk_tsflags can change behavior in that case, as the tx_flags is written before the second setsockopt updates sk_tsflags. Therefore, only bits can be reclaimed that cannot be set by cmsg and are also highly unlikely to be used to target individual packets otherwise. Free up the bit currently used for SKBTX_HW_TSTAMP_USE_CYCLES. This selects between clock and free running counter source for HW TX timestamps. It is probable that all packets of the same socket will always use the same source. Link: https://lore.kernel.org/netdev/cover.1739988644.git.pav@iki.fi/ Signed-off-by: Willem de Bruijn Reviewed-by: Jason Xing Reviewed-by: Gerhard Engleder Link: https://patch.msgid.link/20250225023416.2088705-1-willemdebruijn.kernel@gmail.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/engleder/tsnep_main.c | 4 ++-- drivers/net/ethernet/intel/igc/igc_main.c | 3 ++- include/linux/skbuff.h | 5 ++--- net/socket.c | 11 +---------- 4 files changed, 7 insertions(+), 16 deletions(-) diff --git a/drivers/net/ethernet/engleder/tsnep_main.c b/drivers/net/ethernet/engleder/tsnep_main.c index 0d030cb0b21c..3de4cb06e266 100644 --- a/drivers/net/ethernet/engleder/tsnep_main.c +++ b/drivers/net/ethernet/engleder/tsnep_main.c @@ -852,8 +852,8 @@ static bool tsnep_tx_poll(struct tsnep_tx *tx, int napi_budget) struct skb_shared_hwtstamps hwtstamps; u64 timestamp; - if (skb_shinfo(entry->skb)->tx_flags & - SKBTX_HW_TSTAMP_USE_CYCLES) + if (entry->skb->sk && + READ_ONCE(entry->skb->sk->sk_tsflags) & SOF_TIMESTAMPING_BIND_PHC) timestamp = __le64_to_cpu(entry->desc_wb->counter); else diff --git a/drivers/net/ethernet/intel/igc/igc_main.c b/drivers/net/ethernet/intel/igc/igc_main.c index 3044392e8ded..472f009630c9 100644 --- a/drivers/net/ethernet/intel/igc/igc_main.c +++ b/drivers/net/ethernet/intel/igc/igc_main.c @@ -1668,7 +1668,8 @@ done: if (igc_request_tx_tstamp(adapter, skb, &tstamp_flags)) { skb_shinfo(skb)->tx_flags |= SKBTX_IN_PROGRESS; tx_flags |= IGC_TX_FLAGS_TSTAMP | tstamp_flags; - if (skb_shinfo(skb)->tx_flags & SKBTX_HW_TSTAMP_USE_CYCLES) + if (skb->sk && + READ_ONCE(skb->sk->sk_tsflags) & SOF_TIMESTAMPING_BIND_PHC) tx_flags |= IGC_TX_FLAGS_TSTAMP_TIMER_1; } else { adapter->tx_hwtstamp_skipped++; diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index f2bb8473d99a..171aa15f6541 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -478,8 +478,8 @@ enum { /* device driver is going to provide hardware time stamp */ SKBTX_IN_PROGRESS = 1 << 2, - /* generate hardware time stamp based on cycles if supported */ - SKBTX_HW_TSTAMP_USE_CYCLES = 1 << 3, + /* reserved */ + SKBTX_RESERVED = 1 << 3, /* generate wifi status information (where possible) */ SKBTX_WIFI_STATUS = 1 << 4, @@ -500,7 +500,6 @@ enum { SKBTX_SCHED_TSTAMP | \ SKBTX_BPF) #define SKBTX_ANY_TSTAMP (SKBTX_HW_TSTAMP | \ - SKBTX_HW_TSTAMP_USE_CYCLES | \ SKBTX_ANY_SW_TSTAMP) /* Definitions for flags in struct skb_shared_info */ diff --git a/net/socket.c b/net/socket.c index 0545e9ea7058..b64ecf2722e7 100644 --- a/net/socket.c +++ b/net/socket.c @@ -680,18 +680,9 @@ void __sock_tx_timestamp(__u32 tsflags, __u8 *tx_flags) { u8 flags = *tx_flags; - if (tsflags & SOF_TIMESTAMPING_TX_HARDWARE) { + if (tsflags & SOF_TIMESTAMPING_TX_HARDWARE) flags |= SKBTX_HW_TSTAMP_NOBPF; - /* PTP hardware clocks can provide a free running cycle counter - * as a time base for virtual clocks. Tell driver to use the - * free running cycle counter for timestamp if socket is bound - * to virtual clock. - */ - if (tsflags & SOF_TIMESTAMPING_BIND_PHC) - flags |= SKBTX_HW_TSTAMP_USE_CYCLES; - } - if (tsflags & SOF_TIMESTAMPING_TX_SOFTWARE) flags |= SKBTX_SW_TSTAMP; -- 2.51.0 From 28d68d396a1cd21591e8c6d74afbde33a7ea107e Mon Sep 17 00:00:00 2001 From: Hangbin Liu Date: Tue, 25 Feb 2025 03:39:14 +0000 Subject: [PATCH 13/16] bonding: report duplicate MAC address in all situations MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit Normally, a bond uses the MAC address of the first added slave as the bond’s MAC address. And the bond will set active slave’s MAC address to bond’s address if fail_over_mac is set to none (0) or follow (2). When the first slave is removed, the bond will still use the removed slave’s MAC address, which can lead to a duplicate MAC address and potentially cause issues with the switch. To avoid confusion, let's warn the user in all situations, including when fail_over_mac is set to 2 or not in active-backup mode. Signed-off-by: Hangbin Liu Reviewed-by: Nikolay Aleksandrov Link: https://patch.msgid.link/20250225033914.18617-1-liuhangbin@gmail.com Signed-off-by: Jakub Kicinski --- drivers/net/bonding/bond_main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c index 7d716e90a84c..7d98fee5a27f 100644 --- a/drivers/net/bonding/bond_main.c +++ b/drivers/net/bonding/bond_main.c @@ -2548,7 +2548,7 @@ static int __bond_release_one(struct net_device *bond_dev, RCU_INIT_POINTER(bond->current_arp_slave, NULL); - if (!all && (!bond->params.fail_over_mac || + if (!all && (bond->params.fail_over_mac != BOND_FOM_ACTIVE || BOND_MODE(bond) != BOND_MODE_ACTIVEBACKUP)) { if (ether_addr_equal_64bits(bond_dev->dev_addr, slave->perm_hwaddr) && bond_has_slaves(bond)) -- 2.51.0 From bd7c00605ee0cf0bf27764d5da0b948d8004229e Mon Sep 17 00:00:00 2001 From: Ahmed Zaki Date: Mon, 24 Feb 2025 16:22:22 -0700 Subject: [PATCH 14/16] net: move aRFS rmap management and CPU affinity to core A common task for most drivers is to remember the user-set CPU affinity to its IRQs. On each netdev reset, the driver should re-assign the user's settings to the IRQs. Unify this task across all drivers by moving the CPU affinity to napi->config. However, to move the CPU affinity to core, we also need to move aRFS rmap management since aRFS uses its own IRQ notifiers. For the aRFS, add a new netdev flag "rx_cpu_rmap_auto". Drivers supporting aRFS should set the flag via netif_enable_cpu_rmap() and core will allocate and manage the aRFS rmaps. Freeing the rmap is also done by core when the netdev is freed. For better IRQ affinity management, move the IRQ rmap notifier inside the napi_struct and add new notify.notify and notify.release functions: netif_irq_cpu_rmap_notify() and netif_napi_affinity_release(). Now we have the aRFS rmap management in core, add CPU affinity mask to napi_config. To delegate the CPU affinity management to the core, drivers must: 1 - set the new netdev flag "irq_affinity_auto": netif_enable_irq_affinity(netdev) 2 - create the napi with persistent config: netif_napi_add_config() 3 - bind an IRQ to the napi instance: netif_napi_set_irq() the core will then make sure to use re-assign affinity to the napi's IRQ. The default IRQ mask is set to one cpu starting from the closest NUMA. Signed-off-by: Ahmed Zaki Link: https://patch.msgid.link/20250224232228.990783-2-ahmed.zaki@intel.com Signed-off-by: Jakub Kicinski --- Documentation/networking/scaling.rst | 6 +- include/linux/cpu_rmap.h | 1 + include/linux/netdevice.h | 24 +++- lib/cpu_rmap.c | 2 +- net/core/dev.c | 169 +++++++++++++++++++++++++++ 5 files changed, 195 insertions(+), 7 deletions(-) diff --git a/Documentation/networking/scaling.rst b/Documentation/networking/scaling.rst index 6984700cec6a..99b6a61e5e31 100644 --- a/Documentation/networking/scaling.rst +++ b/Documentation/networking/scaling.rst @@ -434,8 +434,10 @@ rps_dev_flow_table. The stack consults a CPU to hardware queue map which is maintained by the NIC driver. This is an auto-generated reverse map of the IRQ affinity table shown by /proc/interrupts. Drivers can use functions in the cpu_rmap (“CPU affinity reverse map”) kernel library -to populate the map. For each CPU, the corresponding queue in the map is -set to be one whose processing CPU is closest in cache locality. +to populate the map. Alternatively, drivers can delegate the cpu_rmap +management to the Kernel by calling netif_enable_cpu_rmap(). For each CPU, +the corresponding queue in the map is set to be one whose processing CPU is +closest in cache locality. Accelerated RFS Configuration diff --git a/include/linux/cpu_rmap.h b/include/linux/cpu_rmap.h index 20b5729903d7..2fd7ba75362a 100644 --- a/include/linux/cpu_rmap.h +++ b/include/linux/cpu_rmap.h @@ -32,6 +32,7 @@ struct cpu_rmap { #define CPU_RMAP_DIST_INF 0xffff extern struct cpu_rmap *alloc_cpu_rmap(unsigned int size, gfp_t flags); +extern void cpu_rmap_get(struct cpu_rmap *rmap); extern int cpu_rmap_put(struct cpu_rmap *rmap); extern int cpu_rmap_add(struct cpu_rmap *rmap, void *obj); diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 9a387d456592..2094d3edda73 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -352,6 +352,7 @@ struct napi_config { u64 gro_flush_timeout; u64 irq_suspend_timeout; u32 defer_hard_irqs; + cpumask_t affinity_mask; unsigned int napi_id; }; @@ -394,6 +395,8 @@ struct napi_struct { struct list_head dev_list; struct hlist_node napi_hash_node; int irq; + struct irq_affinity_notify notify; + int napi_rmap_idx; int index; struct napi_config *config; }; @@ -409,6 +412,7 @@ enum { NAPI_STATE_PREFER_BUSY_POLL, /* prefer busy-polling over softirq processing*/ NAPI_STATE_THREADED, /* The poll is performed inside its own thread*/ NAPI_STATE_SCHED_THREADED, /* Napi is currently scheduled in threaded mode */ + NAPI_STATE_HAS_NOTIFIER, /* Napi has an IRQ notifier */ }; enum { @@ -422,6 +426,7 @@ enum { NAPIF_STATE_PREFER_BUSY_POLL = BIT(NAPI_STATE_PREFER_BUSY_POLL), NAPIF_STATE_THREADED = BIT(NAPI_STATE_THREADED), NAPIF_STATE_SCHED_THREADED = BIT(NAPI_STATE_SCHED_THREADED), + NAPIF_STATE_HAS_NOTIFIER = BIT(NAPI_STATE_HAS_NOTIFIER), }; enum gro_result { @@ -1989,6 +1994,15 @@ enum netdev_reg_state { * * @threaded: napi threaded mode is enabled * + * @irq_affinity_auto: driver wants the core to store and re-assign the IRQ + * affinity. Set by netif_enable_irq_affinity(), then + * the driver must create a persistent napi by + * netif_napi_add_config() and finally bind the napi to + * IRQ (via netif_napi_set_irq()). + * + * @rx_cpu_rmap_auto: driver wants the core to manage the ARFS rmap. + * Set by calling netif_enable_cpu_rmap(). + * * @see_all_hwtstamp_requests: device wants to see calls to * ndo_hwtstamp_set() for all timestamp requests * regardless of source, even if those aren't @@ -2396,6 +2410,8 @@ struct net_device { struct lock_class_key *qdisc_tx_busylock; bool proto_down; bool threaded; + bool irq_affinity_auto; + bool rx_cpu_rmap_auto; /* priv_flags_slow, ungrouped to save space */ unsigned long see_all_hwtstamp_requests:1; @@ -2724,10 +2740,7 @@ static inline void netdev_assert_locked_or_invisible(struct net_device *dev) netdev_assert_locked(dev); } -static inline void netif_napi_set_irq_locked(struct napi_struct *napi, int irq) -{ - napi->irq = irq; -} +void netif_napi_set_irq_locked(struct napi_struct *napi, int irq); static inline void netif_napi_set_irq(struct napi_struct *napi, int irq) { @@ -2865,6 +2878,9 @@ static inline void netif_napi_del(struct napi_struct *napi) synchronize_net(); } +int netif_enable_cpu_rmap(struct net_device *dev, unsigned int num_irqs); +void netif_set_affinity_auto(struct net_device *dev); + struct packet_type { __be16 type; /* This is really htons(ether_type). */ bool ignore_outgoing; diff --git a/lib/cpu_rmap.c b/lib/cpu_rmap.c index 4c348670da31..f03d9be3f06b 100644 --- a/lib/cpu_rmap.c +++ b/lib/cpu_rmap.c @@ -73,7 +73,7 @@ static void cpu_rmap_release(struct kref *ref) * cpu_rmap_get - internal helper to get new ref on a cpu_rmap * @rmap: reverse-map allocated with alloc_cpu_rmap() */ -static inline void cpu_rmap_get(struct cpu_rmap *rmap) +void cpu_rmap_get(struct cpu_rmap *rmap) { kref_get(&rmap->refcount); } diff --git a/net/core/dev.c b/net/core/dev.c index 3f525278a871..b0ab0169f507 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -6943,11 +6943,175 @@ void netif_queue_set_napi(struct net_device *dev, unsigned int queue_index, } EXPORT_SYMBOL(netif_queue_set_napi); +static void +netif_napi_irq_notify(struct irq_affinity_notify *notify, + const cpumask_t *mask) +{ + struct napi_struct *napi = + container_of(notify, struct napi_struct, notify); +#ifdef CONFIG_RFS_ACCEL + struct cpu_rmap *rmap = napi->dev->rx_cpu_rmap; + int err; +#endif + + if (napi->config && napi->dev->irq_affinity_auto) + cpumask_copy(&napi->config->affinity_mask, mask); + +#ifdef CONFIG_RFS_ACCEL + if (napi->dev->rx_cpu_rmap_auto) { + err = cpu_rmap_update(rmap, napi->napi_rmap_idx, mask); + if (err) + netdev_warn(napi->dev, "RMAP update failed (%d)\n", + err); + } +#endif +} + +#ifdef CONFIG_RFS_ACCEL +static void netif_napi_affinity_release(struct kref *ref) +{ + struct napi_struct *napi = + container_of(ref, struct napi_struct, notify.kref); + struct cpu_rmap *rmap = napi->dev->rx_cpu_rmap; + + netdev_assert_locked(napi->dev); + WARN_ON(test_and_clear_bit(NAPI_STATE_HAS_NOTIFIER, + &napi->state)); + + if (!napi->dev->rx_cpu_rmap_auto) + return; + rmap->obj[napi->napi_rmap_idx] = NULL; + napi->napi_rmap_idx = -1; + cpu_rmap_put(rmap); +} + +int netif_enable_cpu_rmap(struct net_device *dev, unsigned int num_irqs) +{ + if (dev->rx_cpu_rmap_auto) + return 0; + + dev->rx_cpu_rmap = alloc_irq_cpu_rmap(num_irqs); + if (!dev->rx_cpu_rmap) + return -ENOMEM; + + dev->rx_cpu_rmap_auto = true; + return 0; +} +EXPORT_SYMBOL(netif_enable_cpu_rmap); + +static void netif_del_cpu_rmap(struct net_device *dev) +{ + struct cpu_rmap *rmap = dev->rx_cpu_rmap; + + if (!dev->rx_cpu_rmap_auto) + return; + + /* Free the rmap */ + cpu_rmap_put(rmap); + dev->rx_cpu_rmap = NULL; + dev->rx_cpu_rmap_auto = false; +} + +#else +static void netif_napi_affinity_release(struct kref *ref) +{ +} + +int netif_enable_cpu_rmap(struct net_device *dev, unsigned int num_irqs) +{ + return 0; +} +EXPORT_SYMBOL(netif_enable_cpu_rmap); + +static void netif_del_cpu_rmap(struct net_device *dev) +{ +} +#endif + +void netif_set_affinity_auto(struct net_device *dev) +{ + unsigned int i, maxqs, numa; + + maxqs = max(dev->num_tx_queues, dev->num_rx_queues); + numa = dev_to_node(&dev->dev); + + for (i = 0; i < maxqs; i++) + cpumask_set_cpu(cpumask_local_spread(i, numa), + &dev->napi_config[i].affinity_mask); + + dev->irq_affinity_auto = true; +} +EXPORT_SYMBOL(netif_set_affinity_auto); + +void netif_napi_set_irq_locked(struct napi_struct *napi, int irq) +{ + int rc; + + netdev_assert_locked_or_invisible(napi->dev); + + if (napi->irq == irq) + return; + + /* Remove existing resources */ + if (test_and_clear_bit(NAPI_STATE_HAS_NOTIFIER, &napi->state)) + irq_set_affinity_notifier(napi->irq, NULL); + + napi->irq = irq; + if (irq < 0 || + (!napi->dev->rx_cpu_rmap_auto && !napi->dev->irq_affinity_auto)) + return; + + /* Abort for buggy drivers */ + if (napi->dev->irq_affinity_auto && WARN_ON_ONCE(!napi->config)) + return; + +#ifdef CONFIG_RFS_ACCEL + if (napi->dev->rx_cpu_rmap_auto) { + rc = cpu_rmap_add(napi->dev->rx_cpu_rmap, napi); + if (rc < 0) + return; + + cpu_rmap_get(napi->dev->rx_cpu_rmap); + napi->napi_rmap_idx = rc; + } +#endif + + /* Use core IRQ notifier */ + napi->notify.notify = netif_napi_irq_notify; + napi->notify.release = netif_napi_affinity_release; + rc = irq_set_affinity_notifier(irq, &napi->notify); + if (rc) { + netdev_warn(napi->dev, "Unable to set IRQ notifier (%d)\n", + rc); + goto put_rmap; + } + + set_bit(NAPI_STATE_HAS_NOTIFIER, &napi->state); + return; + +put_rmap: +#ifdef CONFIG_RFS_ACCEL + if (napi->dev->rx_cpu_rmap_auto) { + cpu_rmap_put(napi->dev->rx_cpu_rmap); + napi->dev->rx_cpu_rmap->obj[napi->napi_rmap_idx] = NULL; + napi->napi_rmap_idx = -1; + } +#endif + napi->notify.notify = NULL; + napi->notify.release = NULL; +} +EXPORT_SYMBOL(netif_napi_set_irq_locked); + static void napi_restore_config(struct napi_struct *n) { n->defer_hard_irqs = n->config->defer_hard_irqs; n->gro_flush_timeout = n->config->gro_flush_timeout; n->irq_suspend_timeout = n->config->irq_suspend_timeout; + + if (n->dev->irq_affinity_auto && + test_bit(NAPI_STATE_HAS_NOTIFIER, &n->state)) + irq_set_affinity(n->irq, &n->config->affinity_mask); + /* a NAPI ID might be stored in the config, if so use it. if not, use * napi_hash_add to generate one for us. */ @@ -7168,6 +7332,9 @@ void __netif_napi_del_locked(struct napi_struct *napi) /* Make sure NAPI is disabled (or was never enabled). */ WARN_ON(!test_bit(NAPI_STATE_SCHED, &napi->state)); + if (test_and_clear_bit(NAPI_STATE_HAS_NOTIFIER, &napi->state)) + irq_set_affinity_notifier(napi->irq, NULL); + if (napi->config) { napi->index = -1; napi->config = NULL; @@ -11720,6 +11887,8 @@ void free_netdev(struct net_device *dev) netdev_napi_exit(dev); + netif_del_cpu_rmap(dev); + ref_tracker_dir_exit(&dev->refcnt_tracker); #ifdef CONFIG_PCPU_DEV_REFCNT free_percpu(dev->pcpu_refcnt); -- 2.51.0 From de340d8206bf1897fcc05436adffb4d63a0a13e0 Mon Sep 17 00:00:00 2001 From: Ahmed Zaki Date: Mon, 24 Feb 2025 16:22:23 -0700 Subject: [PATCH 15/16] net: ena: use napi's aRFS rmap notifers Use the core's rmap notifiers and delete our own. Acked-by: David Arinzon Signed-off-by: Ahmed Zaki Link: https://patch.msgid.link/20250224232228.990783-3-ahmed.zaki@intel.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/amazon/ena/ena_netdev.c | 43 +------------------- 1 file changed, 1 insertion(+), 42 deletions(-) diff --git a/drivers/net/ethernet/amazon/ena/ena_netdev.c b/drivers/net/ethernet/amazon/ena/ena_netdev.c index c1295dfad0d0..6aab85a7c60a 100644 --- a/drivers/net/ethernet/amazon/ena/ena_netdev.c +++ b/drivers/net/ethernet/amazon/ena/ena_netdev.c @@ -5,9 +5,6 @@ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt -#ifdef CONFIG_RFS_ACCEL -#include -#endif /* CONFIG_RFS_ACCEL */ #include #include #include @@ -162,30 +159,6 @@ int ena_xmit_common(struct ena_adapter *adapter, return 0; } -static int ena_init_rx_cpu_rmap(struct ena_adapter *adapter) -{ -#ifdef CONFIG_RFS_ACCEL - u32 i; - int rc; - - adapter->netdev->rx_cpu_rmap = alloc_irq_cpu_rmap(adapter->num_io_queues); - if (!adapter->netdev->rx_cpu_rmap) - return -ENOMEM; - for (i = 0; i < adapter->num_io_queues; i++) { - int irq_idx = ENA_IO_IRQ_IDX(i); - - rc = irq_cpu_rmap_add(adapter->netdev->rx_cpu_rmap, - pci_irq_vector(adapter->pdev, irq_idx)); - if (rc) { - free_irq_cpu_rmap(adapter->netdev->rx_cpu_rmap); - adapter->netdev->rx_cpu_rmap = NULL; - return rc; - } - } -#endif /* CONFIG_RFS_ACCEL */ - return 0; -} - static void ena_init_io_rings_common(struct ena_adapter *adapter, struct ena_ring *ring, u16 qid) { @@ -1596,7 +1569,7 @@ static int ena_enable_msix(struct ena_adapter *adapter) adapter->num_io_queues = irq_cnt - ENA_ADMIN_MSIX_VEC; } - if (ena_init_rx_cpu_rmap(adapter)) + if (netif_enable_cpu_rmap(adapter->netdev, adapter->num_io_queues)) netif_warn(adapter, probe, adapter->netdev, "Failed to map IRQs to CPUs\n"); @@ -1742,13 +1715,6 @@ static void ena_free_io_irq(struct ena_adapter *adapter) struct ena_irq *irq; int i; -#ifdef CONFIG_RFS_ACCEL - if (adapter->msix_vecs >= 1) { - free_irq_cpu_rmap(adapter->netdev->rx_cpu_rmap); - adapter->netdev->rx_cpu_rmap = NULL; - } -#endif /* CONFIG_RFS_ACCEL */ - for (i = ENA_IO_IRQ_FIRST_IDX; i < ENA_MAX_MSIX_VEC(io_queue_count); i++) { irq = &adapter->irq_tbl[i]; irq_set_affinity_hint(irq->vector, NULL); @@ -4131,13 +4097,6 @@ static void __ena_shutoff(struct pci_dev *pdev, bool shutdown) ena_dev = adapter->ena_dev; netdev = adapter->netdev; -#ifdef CONFIG_RFS_ACCEL - if ((adapter->msix_vecs >= 1) && (netdev->rx_cpu_rmap)) { - free_irq_cpu_rmap(netdev->rx_cpu_rmap); - netdev->rx_cpu_rmap = NULL; - } - -#endif /* CONFIG_RFS_ACCEL */ /* Make sure timer and reset routine won't be called after * freeing device resources. */ -- 2.51.0 From 30b78ba3d4fe7a4dcf86f7ed21e719a62b71a392 Mon Sep 17 00:00:00 2001 From: Ahmed Zaki Date: Mon, 24 Feb 2025 16:22:24 -0700 Subject: [PATCH 16/16] ice: clear NAPI's IRQ numbers in ice_vsi_clear_napi_queues() We set the NAPI's IRQ number in ice_vsi_set_napi_queues(). Clear the NAPI's IRQ in ice_vsi_clear_napi_queues(). Signed-off-by: Ahmed Zaki Link: https://patch.msgid.link/20250224232228.990783-4-ahmed.zaki@intel.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/intel/ice/ice_lib.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/intel/ice/ice_lib.c b/drivers/net/ethernet/intel/ice/ice_lib.c index 7f5b229cab05..ce30674abf8f 100644 --- a/drivers/net/ethernet/intel/ice/ice_lib.c +++ b/drivers/net/ethernet/intel/ice/ice_lib.c @@ -2765,11 +2765,18 @@ void ice_vsi_set_napi_queues(struct ice_vsi *vsi) void ice_vsi_clear_napi_queues(struct ice_vsi *vsi) { struct net_device *netdev = vsi->netdev; - int q_idx; + int q_idx, v_idx; if (!netdev) return; + /* Clear the NAPI's interrupt number */ + ice_for_each_q_vector(vsi, v_idx) { + struct ice_q_vector *q_vector = vsi->q_vectors[v_idx]; + + netif_napi_set_irq(&q_vector->napi, -1); + } + ice_for_each_txq(vsi, q_idx) netif_queue_set_napi(netdev, q_idx, NETDEV_QUEUE_TYPE_TX, NULL); -- 2.51.0