From eec8359f0797ef87c6ef6cbed6de08b02073b833 Mon Sep 17 00:00:00 2001 From: Taehee Yoo Date: Tue, 14 Jan 2025 14:28:44 +0000 Subject: [PATCH 01/16] net: ethtool: add support for configuring hds-thresh The hds-thresh option configures the threshold value of the header-data-split. If a received packet size is larger than this threshold value, a packet will be split into header and payload. The header indicates TCP and UDP header, but it depends on driver spec. The bnxt_en driver supports HDS(Header-Data-Split) configuration at FW level, affecting TCP and UDP too. So, If hds-thresh is set, it affects UDP and TCP packets. Example: # ethtool -G hds-thresh # ethtool -G enp14s0f0np0 tcp-data-split on hds-thresh 256 # ethtool -g enp14s0f0np0 Ring parameters for enp14s0f0np0: Pre-set maximums: ... HDS thresh: 1023 Current hardware settings: ... TCP data split: on HDS thresh: 256 The default/min/max values are not defined in the ethtool so the drivers should define themself. The 0 value means that all TCP/UDP packets' header and payload will be split. Tested-by: Stanislav Fomichev Signed-off-by: Taehee Yoo Link: https://patch.msgid.link/20250114142852.3364986-3-ap420073@gmail.com Signed-off-by: Jakub Kicinski --- Documentation/netlink/specs/ethtool.yaml | 8 ++++++ Documentation/networking/ethtool-netlink.rst | 10 +++++++ include/linux/ethtool.h | 9 ++++++ .../uapi/linux/ethtool_netlink_generated.h | 2 ++ net/ethtool/netlink.h | 2 +- net/ethtool/rings.c | 28 +++++++++++++++++-- 6 files changed, 55 insertions(+), 4 deletions(-) diff --git a/Documentation/netlink/specs/ethtool.yaml b/Documentation/netlink/specs/ethtool.yaml index 60f85fbf4156..66be04013048 100644 --- a/Documentation/netlink/specs/ethtool.yaml +++ b/Documentation/netlink/specs/ethtool.yaml @@ -332,6 +332,12 @@ attribute-sets: - name: tx-push-buf-len-max type: u32 + - + name: hds-thresh + type: u32 + - + name: hds-thresh-max + type: u32 - name: mm-stat @@ -1777,6 +1783,8 @@ operations: - rx-push - tx-push-buf-len - tx-push-buf-len-max + - hds-thresh + - hds-thresh-max dump: *ring-get-op - name: rings-set diff --git a/Documentation/networking/ethtool-netlink.rst b/Documentation/networking/ethtool-netlink.rst index da846f1d998e..f70c0249860c 100644 --- a/Documentation/networking/ethtool-netlink.rst +++ b/Documentation/networking/ethtool-netlink.rst @@ -899,6 +899,10 @@ Kernel response contents: ``ETHTOOL_A_RINGS_RX_PUSH`` u8 flag of RX Push mode ``ETHTOOL_A_RINGS_TX_PUSH_BUF_LEN`` u32 size of TX push buffer ``ETHTOOL_A_RINGS_TX_PUSH_BUF_LEN_MAX`` u32 max size of TX push buffer + ``ETHTOOL_A_RINGS_HDS_THRESH`` u32 threshold of + header / data split + ``ETHTOOL_A_RINGS_HDS_THRESH_MAX`` u32 max threshold of + header / data split ======================================= ====== =========================== ``ETHTOOL_A_RINGS_TCP_DATA_SPLIT`` indicates whether the device is usable with @@ -941,10 +945,12 @@ Request contents: ``ETHTOOL_A_RINGS_RX_JUMBO`` u32 size of RX jumbo ring ``ETHTOOL_A_RINGS_TX`` u32 size of TX ring ``ETHTOOL_A_RINGS_RX_BUF_LEN`` u32 size of buffers on the ring + ``ETHTOOL_A_RINGS_TCP_DATA_SPLIT`` u8 TCP header / data split ``ETHTOOL_A_RINGS_CQE_SIZE`` u32 Size of TX/RX CQE ``ETHTOOL_A_RINGS_TX_PUSH`` u8 flag of TX Push mode ``ETHTOOL_A_RINGS_RX_PUSH`` u8 flag of RX Push mode ``ETHTOOL_A_RINGS_TX_PUSH_BUF_LEN`` u32 size of TX push buffer + ``ETHTOOL_A_RINGS_HDS_THRESH`` u32 threshold of header / data split ==================================== ====== =========================== Kernel checks that requested ring sizes do not exceed limits reported by @@ -961,6 +967,10 @@ A bigger CQE can have more receive buffer pointers, and in turn the NIC can transfer a bigger frame from wire. Based on the NIC hardware, the overall completion queue size can be adjusted in the driver if CQE size is modified. +``ETHTOOL_A_RINGS_HDS_THRESH`` specifies the threshold value of +header / data split feature. If a received packet size is larger than this +threshold value, header and data will be split. + CHANNELS_GET ============ diff --git a/include/linux/ethtool.h b/include/linux/ethtool.h index d79bd201c1c8..e4136b0df892 100644 --- a/include/linux/ethtool.h +++ b/include/linux/ethtool.h @@ -78,6 +78,9 @@ enum { * @cqe_size: Size of TX/RX completion queue event * @tx_push_buf_len: Size of TX push buffer * @tx_push_buf_max_len: Maximum allowed size of TX push buffer + * @hds_thresh: Packet size threshold for header data split (HDS) + * @hds_thresh_max: Maximum supported setting for @hds_threshold + * */ struct kernel_ethtool_ringparam { u32 rx_buf_len; @@ -87,6 +90,8 @@ struct kernel_ethtool_ringparam { u32 cqe_size; u32 tx_push_buf_len; u32 tx_push_buf_max_len; + u32 hds_thresh; + u32 hds_thresh_max; }; /** @@ -97,6 +102,7 @@ struct kernel_ethtool_ringparam { * @ETHTOOL_RING_USE_RX_PUSH: capture for setting rx_push * @ETHTOOL_RING_USE_TX_PUSH_BUF_LEN: capture for setting tx_push_buf_len * @ETHTOOL_RING_USE_TCP_DATA_SPLIT: capture for setting tcp_data_split + * @ETHTOOL_RING_USE_HDS_THRS: capture for setting header-data-split-thresh */ enum ethtool_supported_ring_param { ETHTOOL_RING_USE_RX_BUF_LEN = BIT(0), @@ -105,6 +111,7 @@ enum ethtool_supported_ring_param { ETHTOOL_RING_USE_RX_PUSH = BIT(3), ETHTOOL_RING_USE_TX_PUSH_BUF_LEN = BIT(4), ETHTOOL_RING_USE_TCP_DATA_SPLIT = BIT(5), + ETHTOOL_RING_USE_HDS_THRS = BIT(6), }; #define __ETH_RSS_HASH_BIT(bit) ((u32)1 << (bit)) @@ -1157,6 +1164,7 @@ int ethtool_virtdev_set_link_ksettings(struct net_device *dev, * @rss_ctx: XArray of custom RSS contexts * @rss_lock: Protects entries in @rss_ctx. May be taken from * within RTNL. + * @hds_thresh: HDS Threshold value. * @hds_config: HDS value from userspace. * @wol_enabled: Wake-on-LAN is enabled * @module_fw_flash_in_progress: Module firmware flashing is in progress. @@ -1164,6 +1172,7 @@ int ethtool_virtdev_set_link_ksettings(struct net_device *dev, struct ethtool_netdev_state { struct xarray rss_ctx; struct mutex rss_lock; + u32 hds_thresh; u8 hds_config; unsigned wol_enabled:1; unsigned module_fw_flash_in_progress:1; diff --git a/include/uapi/linux/ethtool_netlink_generated.h b/include/uapi/linux/ethtool_netlink_generated.h index 43993a2d68e5..2e17ff348f89 100644 --- a/include/uapi/linux/ethtool_netlink_generated.h +++ b/include/uapi/linux/ethtool_netlink_generated.h @@ -155,6 +155,8 @@ enum { ETHTOOL_A_RINGS_RX_PUSH, ETHTOOL_A_RINGS_TX_PUSH_BUF_LEN, ETHTOOL_A_RINGS_TX_PUSH_BUF_LEN_MAX, + ETHTOOL_A_RINGS_HDS_THRESH, + ETHTOOL_A_RINGS_HDS_THRESH_MAX, __ETHTOOL_A_RINGS_CNT, ETHTOOL_A_RINGS_MAX = (__ETHTOOL_A_RINGS_CNT - 1) diff --git a/net/ethtool/netlink.h b/net/ethtool/netlink.h index 1ce0a3de1430..ff69ca0715de 100644 --- a/net/ethtool/netlink.h +++ b/net/ethtool/netlink.h @@ -456,7 +456,7 @@ extern const struct nla_policy ethnl_features_set_policy[ETHTOOL_A_FEATURES_WANT extern const struct nla_policy ethnl_privflags_get_policy[ETHTOOL_A_PRIVFLAGS_HEADER + 1]; extern const struct nla_policy ethnl_privflags_set_policy[ETHTOOL_A_PRIVFLAGS_FLAGS + 1]; extern const struct nla_policy ethnl_rings_get_policy[ETHTOOL_A_RINGS_HEADER + 1]; -extern const struct nla_policy ethnl_rings_set_policy[ETHTOOL_A_RINGS_TX_PUSH_BUF_LEN_MAX + 1]; +extern const struct nla_policy ethnl_rings_set_policy[ETHTOOL_A_RINGS_HDS_THRESH_MAX + 1]; extern const struct nla_policy ethnl_channels_get_policy[ETHTOOL_A_CHANNELS_HEADER + 1]; extern const struct nla_policy ethnl_channels_set_policy[ETHTOOL_A_CHANNELS_COMBINED_COUNT + 1]; extern const struct nla_policy ethnl_coalesce_get_policy[ETHTOOL_A_COALESCE_HEADER + 1]; diff --git a/net/ethtool/rings.c b/net/ethtool/rings.c index b2a2586b241f..a381913a19f0 100644 --- a/net/ethtool/rings.c +++ b/net/ethtool/rings.c @@ -61,7 +61,9 @@ static int rings_reply_size(const struct ethnl_req_info *req_base, nla_total_size(sizeof(u8)) + /* _RINGS_TX_PUSH */ nla_total_size(sizeof(u8))) + /* _RINGS_RX_PUSH */ nla_total_size(sizeof(u32)) + /* _RINGS_TX_PUSH_BUF_LEN */ - nla_total_size(sizeof(u32)); /* _RINGS_TX_PUSH_BUF_LEN_MAX */ + nla_total_size(sizeof(u32)) + /* _RINGS_TX_PUSH_BUF_LEN_MAX */ + nla_total_size(sizeof(u32)) + /* _RINGS_HDS_THRESH */ + nla_total_size(sizeof(u32)); /* _RINGS_HDS_THRESH_MAX*/ } static int rings_fill_reply(struct sk_buff *skb, @@ -108,7 +110,12 @@ static int rings_fill_reply(struct sk_buff *skb, (nla_put_u32(skb, ETHTOOL_A_RINGS_TX_PUSH_BUF_LEN_MAX, kr->tx_push_buf_max_len) || nla_put_u32(skb, ETHTOOL_A_RINGS_TX_PUSH_BUF_LEN, - kr->tx_push_buf_len)))) + kr->tx_push_buf_len))) || + ((supported_ring_params & ETHTOOL_RING_USE_HDS_THRS) && + (nla_put_u32(skb, ETHTOOL_A_RINGS_HDS_THRESH, + kr->hds_thresh) || + nla_put_u32(skb, ETHTOOL_A_RINGS_HDS_THRESH_MAX, + kr->hds_thresh_max)))) return -EMSGSIZE; return 0; @@ -130,6 +137,7 @@ const struct nla_policy ethnl_rings_set_policy[] = { [ETHTOOL_A_RINGS_TX_PUSH] = NLA_POLICY_MAX(NLA_U8, 1), [ETHTOOL_A_RINGS_RX_PUSH] = NLA_POLICY_MAX(NLA_U8, 1), [ETHTOOL_A_RINGS_TX_PUSH_BUF_LEN] = { .type = NLA_U32 }, + [ETHTOOL_A_RINGS_HDS_THRESH] = { .type = NLA_U32 }, }; static int @@ -155,6 +163,14 @@ ethnl_set_rings_validate(struct ethnl_req_info *req_info, return -EOPNOTSUPP; } + if (tb[ETHTOOL_A_RINGS_HDS_THRESH] && + !(ops->supported_ring_params & ETHTOOL_RING_USE_HDS_THRS)) { + NL_SET_ERR_MSG_ATTR(info->extack, + tb[ETHTOOL_A_RINGS_HDS_THRESH], + "setting hds-thresh is not supported"); + return -EOPNOTSUPP; + } + if (tb[ETHTOOL_A_RINGS_CQE_SIZE] && !(ops->supported_ring_params & ETHTOOL_RING_USE_CQE_SIZE)) { NL_SET_ERR_MSG_ATTR(info->extack, @@ -223,6 +239,8 @@ ethnl_set_rings(struct ethnl_req_info *req_info, struct genl_info *info) tb[ETHTOOL_A_RINGS_RX_PUSH], &mod); ethnl_update_u32(&kernel_ringparam.tx_push_buf_len, tb[ETHTOOL_A_RINGS_TX_PUSH_BUF_LEN], &mod); + ethnl_update_u32(&kernel_ringparam.hds_thresh, + tb[ETHTOOL_A_RINGS_HDS_THRESH], &mod); if (!mod) return 0; @@ -243,6 +261,8 @@ ethnl_set_rings(struct ethnl_req_info *req_info, struct genl_info *info) err_attr = tb[ETHTOOL_A_RINGS_RX_JUMBO]; else if (ringparam.tx_pending > ringparam.tx_max_pending) err_attr = tb[ETHTOOL_A_RINGS_TX]; + else if (kernel_ringparam.hds_thresh > kernel_ringparam.hds_thresh_max) + err_attr = tb[ETHTOOL_A_RINGS_HDS_THRESH]; else err_attr = NULL; if (err_attr) { @@ -261,8 +281,10 @@ ethnl_set_rings(struct ethnl_req_info *req_info, struct genl_info *info) ret = dev->ethtool_ops->set_ringparam(dev, &ringparam, &kernel_ringparam, info->extack); - if (!ret) + if (!ret) { dev->ethtool->hds_config = kernel_ringparam.tcp_data_split; + dev->ethtool->hds_thresh = kernel_ringparam.hds_thresh; + } return ret < 0 ? ret : 1; } -- 2.50.1 From a08a5c9484015a88c937aa0f9eaf3efb2123c3b8 Mon Sep 17 00:00:00 2001 From: Taehee Yoo Date: Tue, 14 Jan 2025 14:28:45 +0000 Subject: [PATCH 02/16] net: devmem: add ring parameter filtering If driver doesn't support ring parameter or tcp-data-split configuration is not sufficient, the devmem should not be set up. Before setup the devmem, tcp-data-split should be ON and hds-thresh value should be 0. Tested-by: Stanislav Fomichev Reviewed-by: Mina Almasry Signed-off-by: Taehee Yoo Link: https://patch.msgid.link/20250114142852.3364986-4-ap420073@gmail.com Signed-off-by: Jakub Kicinski --- net/core/devmem.c | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/net/core/devmem.c b/net/core/devmem.c index 0b6ed7525b22..c971b8aceac8 100644 --- a/net/core/devmem.c +++ b/net/core/devmem.c @@ -8,6 +8,7 @@ */ #include +#include #include #include #include @@ -140,6 +141,16 @@ int net_devmem_bind_dmabuf_to_queue(struct net_device *dev, u32 rxq_idx, return -ERANGE; } + if (dev->ethtool->hds_config != ETHTOOL_TCP_DATA_SPLIT_ENABLED) { + NL_SET_ERR_MSG(extack, "tcp-data-split is disabled"); + return -EINVAL; + } + + if (dev->ethtool->hds_thresh) { + NL_SET_ERR_MSG(extack, "hds-thresh is not zero"); + return -EINVAL; + } + rxq = __netif_get_rx_queue(dev, rxq_idx); if (rxq->mp_params.mp_priv) { NL_SET_ERR_MSG(extack, "designated queue already memory provider bound"); -- 2.50.1 From e61779015c4a4655b31e2e1fc47a7210be9f53f3 Mon Sep 17 00:00:00 2001 From: Taehee Yoo Date: Tue, 14 Jan 2025 14:28:46 +0000 Subject: [PATCH 03/16] net: ethtool: add ring parameter filtering While the devmem is running, the tcp-data-split and hds-thresh configuration should not be changed. If user tries to change tcp-data-split and threshold value while the devmem is running, it fails and shows extack message. Reviewed-by: Jakub Kicinski Tested-by: Stanislav Fomichev Reviewed-by: Mina Almasry Signed-off-by: Taehee Yoo Link: https://patch.msgid.link/20250114142852.3364986-5-ap420073@gmail.com Signed-off-by: Jakub Kicinski --- net/ethtool/rings.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/net/ethtool/rings.c b/net/ethtool/rings.c index a381913a19f0..d8cd4e4d7762 100644 --- a/net/ethtool/rings.c +++ b/net/ethtool/rings.c @@ -252,6 +252,19 @@ ethnl_set_rings(struct ethnl_req_info *req_info, struct genl_info *info) return -EINVAL; } + if (dev_get_min_mp_channel_count(dev)) { + if (kernel_ringparam.tcp_data_split != + ETHTOOL_TCP_DATA_SPLIT_ENABLED) { + NL_SET_ERR_MSG(info->extack, + "can't disable tcp-data-split while device has memory provider enabled"); + return -EINVAL; + } else if (kernel_ringparam.hds_thresh) { + NL_SET_ERR_MSG(info->extack, + "can't set non-zero hds_thresh while device is memory provider enabled"); + return -EINVAL; + } + } + /* ensure new ring parameters are within limits */ if (ringparam.rx_pending > ringparam.rx_max_pending) err_attr = tb[ETHTOOL_A_RINGS_RX]; -- 2.50.1 From 2d46e481a9afc8e6b214f5c78b05374f05b8f62a Mon Sep 17 00:00:00 2001 From: Taehee Yoo Date: Tue, 14 Jan 2025 14:28:47 +0000 Subject: [PATCH 04/16] net: disallow setup single buffer XDP when tcp-data-split is enabled. When a single buffer XDP is attached, NIC should guarantee only single page packets will be received. tcp-data-split feature splits packets into header and payload. single buffer XDP can't handle it properly. So attaching single buffer XDP should be disallowed when tcp-data-split is enabled. Acked-by: Jakub Kicinski Signed-off-by: Taehee Yoo Link: https://patch.msgid.link/20250114142852.3364986-6-ap420073@gmail.com Signed-off-by: Jakub Kicinski --- net/core/dev.c | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/net/core/dev.c b/net/core/dev.c index 5ef817d656ef..47e6b0f73cfc 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -92,6 +92,7 @@ #include #include #include +#include #include #include #include @@ -9567,6 +9568,14 @@ int dev_xdp_propagate(struct net_device *dev, struct netdev_bpf *bpf) if (!dev->netdev_ops->ndo_bpf) return -EOPNOTSUPP; + if (dev->ethtool->hds_config == ETHTOOL_TCP_DATA_SPLIT_ENABLED && + bpf->command == XDP_SETUP_PROG && + bpf->prog && !bpf->prog->aux->xdp_has_frags) { + NL_SET_ERR_MSG(bpf->extack, + "unable to propagate XDP to device using tcp-data-split"); + return -EBUSY; + } + if (dev_get_min_mp_channel_count(dev)) { NL_SET_ERR_MSG(bpf->extack, "unable to propagate XDP to device using memory provider"); return -EBUSY; @@ -9604,6 +9613,12 @@ static int dev_xdp_install(struct net_device *dev, enum bpf_xdp_mode mode, struct netdev_bpf xdp; int err; + if (dev->ethtool->hds_config == ETHTOOL_TCP_DATA_SPLIT_ENABLED && + prog && !prog->aux->xdp_has_frags) { + NL_SET_ERR_MSG(extack, "unable to install XDP to device using tcp-data-split"); + return -EBUSY; + } + if (dev_get_min_mp_channel_count(dev)) { NL_SET_ERR_MSG(extack, "unable to install XDP to device using memory provider"); return -EBUSY; -- 2.50.1 From 152f4da05aeee62cf04d61daf9789575f1df8f4e Mon Sep 17 00:00:00 2001 From: Taehee Yoo Date: Tue, 14 Jan 2025 14:28:48 +0000 Subject: [PATCH 05/16] bnxt_en: add support for rx-copybreak ethtool command The bnxt_en driver supports rx-copybreak, but it couldn't be set by userspace. Only the default value(256) has worked. This patch makes the bnxt_en driver support following command. `ethtool --set-tunable rx-copybreak ` and `ethtool --get-tunable rx-copybreak`. By this patch, hds_threshol is set to the rx-copybreak value. But it will be set by `ethtool -G eth0 hds-thresh N` in the next patch. Reviewed-by: Jakub Kicinski Reviewed-by: Brett Creeley Tested-by: Stanislav Fomichev Tested-by: Andy Gospodarek Signed-off-by: Taehee Yoo Reviewed-by: Michael Chan Link: https://patch.msgid.link/20250114142852.3364986-7-ap420073@gmail.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/broadcom/bnxt/bnxt.c | 28 +++++++----- drivers/net/ethernet/broadcom/bnxt/bnxt.h | 5 ++- .../net/ethernet/broadcom/bnxt/bnxt_ethtool.c | 44 ++++++++++++++++++- 3 files changed, 63 insertions(+), 14 deletions(-) diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c index 884d42db5554..d19c4fb588e5 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c @@ -81,7 +81,6 @@ MODULE_DESCRIPTION("Broadcom NetXtreme network driver"); #define BNXT_RX_OFFSET (NET_SKB_PAD + NET_IP_ALIGN) #define BNXT_RX_DMA_OFFSET NET_SKB_PAD -#define BNXT_RX_COPY_THRESH 256 #define BNXT_TX_PUSH_THRESH 164 @@ -1343,13 +1342,13 @@ static struct sk_buff *bnxt_copy_data(struct bnxt_napi *bnapi, u8 *data, if (!skb) return NULL; - dma_sync_single_for_cpu(&pdev->dev, mapping, bp->rx_copy_thresh, + dma_sync_single_for_cpu(&pdev->dev, mapping, bp->rx_copybreak, bp->rx_dir); memcpy(skb->data - NET_IP_ALIGN, data - NET_IP_ALIGN, len + NET_IP_ALIGN); - dma_sync_single_for_device(&pdev->dev, mapping, bp->rx_copy_thresh, + dma_sync_single_for_device(&pdev->dev, mapping, bp->rx_copybreak, bp->rx_dir); skb_put(skb, len); @@ -1842,7 +1841,7 @@ static inline struct sk_buff *bnxt_tpa_end(struct bnxt *bp, return NULL; } - if (len <= bp->rx_copy_thresh) { + if (len <= bp->rx_copybreak) { skb = bnxt_copy_skb(bnapi, data_ptr, len, mapping); if (!skb) { bnxt_abort_tpa(cpr, idx, agg_bufs); @@ -2176,7 +2175,7 @@ static int bnxt_rx_pkt(struct bnxt *bp, struct bnxt_cp_ring_info *cpr, } } - if (len <= bp->rx_copy_thresh) { + if (len <= bp->rx_copybreak) { if (!xdp_active) skb = bnxt_copy_skb(bnapi, data_ptr, len, dma_addr); else @@ -4608,6 +4607,11 @@ void bnxt_set_tpa_flags(struct bnxt *bp) bp->flags |= BNXT_FLAG_GRO; } +static void bnxt_init_ring_params(struct bnxt *bp) +{ + bp->rx_copybreak = BNXT_DEFAULT_RX_COPYBREAK; +} + /* bp->rx_ring_size, bp->tx_ring_size, dev->mtu, BNXT_FLAG_{G|L}RO flags must * be set on entry. */ @@ -4622,7 +4626,6 @@ void bnxt_set_ring_params(struct bnxt *bp) rx_space = rx_size + ALIGN(max(NET_SKB_PAD, XDP_PACKET_HEADROOM), 8) + SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); - bp->rx_copy_thresh = BNXT_RX_COPY_THRESH; ring_size = bp->rx_ring_size; bp->rx_agg_ring_size = 0; bp->rx_agg_nr_pages = 0; @@ -4667,7 +4670,9 @@ void bnxt_set_ring_params(struct bnxt *bp) ALIGN(max(NET_SKB_PAD, XDP_PACKET_HEADROOM), 8) - SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); } else { - rx_size = SKB_DATA_ALIGN(BNXT_RX_COPY_THRESH + NET_IP_ALIGN); + rx_size = SKB_DATA_ALIGN(max(BNXT_DEFAULT_RX_COPYBREAK, + bp->rx_copybreak) + + NET_IP_ALIGN); rx_space = rx_size + NET_SKB_PAD + SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); } @@ -6573,16 +6578,14 @@ static int bnxt_hwrm_vnic_set_hds(struct bnxt *bp, struct bnxt_vnic_info *vnic) req->flags = cpu_to_le32(VNIC_PLCMODES_CFG_REQ_FLAGS_JUMBO_PLACEMENT); req->enables = cpu_to_le32(VNIC_PLCMODES_CFG_REQ_ENABLES_JUMBO_THRESH_VALID); + req->jumbo_thresh = cpu_to_le16(bp->rx_buf_use_size); - if (BNXT_RX_PAGE_MODE(bp)) { - req->jumbo_thresh = cpu_to_le16(bp->rx_buf_use_size); - } else { + if (!BNXT_RX_PAGE_MODE(bp) && (bp->flags & BNXT_FLAG_AGG_RINGS)) { req->flags |= cpu_to_le32(VNIC_PLCMODES_CFG_REQ_FLAGS_HDS_IPV4 | VNIC_PLCMODES_CFG_REQ_FLAGS_HDS_IPV6); req->enables |= cpu_to_le32(VNIC_PLCMODES_CFG_REQ_ENABLES_HDS_THRESHOLD_VALID); - req->jumbo_thresh = cpu_to_le16(bp->rx_copy_thresh); - req->hds_threshold = cpu_to_le16(bp->rx_copy_thresh); + req->hds_threshold = cpu_to_le16(bp->rx_copybreak); } req->vnic_id = cpu_to_le32(vnic->fw_vnic_id); return hwrm_req_send(bp, req); @@ -16261,6 +16264,7 @@ static int bnxt_init_one(struct pci_dev *pdev, const struct pci_device_id *ent) bnxt_init_l2_fltr_tbl(bp); bnxt_set_rx_skb_mode(bp, false); bnxt_set_tpa_flags(bp); + bnxt_init_ring_params(bp); bnxt_set_ring_params(bp); bnxt_rdma_aux_device_init(bp); rc = bnxt_set_dflt_rings(bp, true); diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.h b/drivers/net/ethernet/broadcom/bnxt/bnxt.h index 094c9e95b463..7edb92ce5976 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.h +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.h @@ -34,6 +34,9 @@ #include #endif +#define BNXT_DEFAULT_RX_COPYBREAK 256 +#define BNXT_MAX_RX_COPYBREAK 1024 + extern struct list_head bnxt_block_cb_list; struct page_pool; @@ -2347,7 +2350,7 @@ struct bnxt { enum dma_data_direction rx_dir; u32 rx_ring_size; u32 rx_agg_ring_size; - u32 rx_copy_thresh; + u32 rx_copybreak; u32 rx_ring_mask; u32 rx_agg_ring_mask; int rx_nr_pages; diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c b/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c index 75a59dd72bce..e9e63d95df17 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c @@ -4328,6 +4328,45 @@ static int bnxt_get_eee(struct net_device *dev, struct ethtool_keee *edata) return 0; } +static int bnxt_set_tunable(struct net_device *dev, + const struct ethtool_tunable *tuna, + const void *data) +{ + struct bnxt *bp = netdev_priv(dev); + u32 rx_copybreak; + + switch (tuna->id) { + case ETHTOOL_RX_COPYBREAK: + rx_copybreak = *(u32 *)data; + if (rx_copybreak > BNXT_MAX_RX_COPYBREAK) + return -ERANGE; + if (rx_copybreak != bp->rx_copybreak) { + if (netif_running(dev)) + return -EBUSY; + bp->rx_copybreak = rx_copybreak; + } + return 0; + default: + return -EOPNOTSUPP; + } +} + +static int bnxt_get_tunable(struct net_device *dev, + const struct ethtool_tunable *tuna, void *data) +{ + struct bnxt *bp = netdev_priv(dev); + + switch (tuna->id) { + case ETHTOOL_RX_COPYBREAK: + *(u32 *)data = bp->rx_copybreak; + break; + default: + return -EOPNOTSUPP; + } + + return 0; +} + static int bnxt_read_sfp_module_eeprom_info(struct bnxt *bp, u16 i2c_addr, u16 page_number, u8 bank, u16 start_addr, u16 data_length, @@ -4790,7 +4829,8 @@ static int bnxt_run_loopback(struct bnxt *bp) cpr = &rxr->bnapi->cp_ring; if (bp->flags & BNXT_FLAG_CHIP_P5_PLUS) cpr = rxr->rx_cpr; - pkt_size = min(bp->dev->mtu + ETH_HLEN, bp->rx_copy_thresh); + pkt_size = min(bp->dev->mtu + ETH_HLEN, max(BNXT_DEFAULT_RX_COPYBREAK, + bp->rx_copybreak)); skb = netdev_alloc_skb(bp->dev, pkt_size); if (!skb) return -ENOMEM; @@ -5372,6 +5412,8 @@ const struct ethtool_ops bnxt_ethtool_ops = { .get_link_ext_stats = bnxt_get_link_ext_stats, .get_eee = bnxt_get_eee, .set_eee = bnxt_set_eee, + .get_tunable = bnxt_get_tunable, + .set_tunable = bnxt_set_tunable, .get_module_info = bnxt_get_module_info, .get_module_eeprom = bnxt_get_module_eeprom, .get_module_eeprom_by_page = bnxt_get_module_eeprom_by_page, -- 2.50.1 From 87c8f8496a05de71dc42f5f2ed2b1ea64ea8b77d Mon Sep 17 00:00:00 2001 From: Taehee Yoo Date: Tue, 14 Jan 2025 14:28:49 +0000 Subject: [PATCH 06/16] bnxt_en: add support for tcp-data-split ethtool command NICs that uses bnxt_en driver supports tcp-data-split feature by the name of HDS(header-data-split). But there is no implementation for the HDS to enable by ethtool. Only getting the current HDS status is implemented and The HDS is just automatically enabled only when either LRO, HW-GRO, or JUMBO is enabled. The hds_threshold follows rx-copybreak value. and it was unchangeable. This implements `ethtool -G tcp-data-split ` command option. The value can be and . The value is and one of LRO/GRO/JUMBO is enabled, HDS is automatically enabled and all LRO/GRO/JUMBO are disabled, HDS is automatically disabled. HDS feature relies on the aggregation ring. So, if HDS is enabled, the bnxt_en driver initializes the aggregation ring. This is the reason why BNXT_FLAG_AGG_RINGS contains HDS condition. Acked-by: Jakub Kicinski Tested-by: Stanislav Fomichev Tested-by: Andy Gospodarek Signed-off-by: Taehee Yoo Reviewed-by: Michael Chan Link: https://patch.msgid.link/20250114142852.3364986-8-ap420073@gmail.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/broadcom/bnxt/bnxt.c | 2 +- drivers/net/ethernet/broadcom/bnxt/bnxt.h | 5 +++-- .../net/ethernet/broadcom/bnxt/bnxt_ethtool.c | 20 +++++++++++++++++++ drivers/net/ethernet/broadcom/bnxt/bnxt_xdp.c | 4 ++++ 4 files changed, 28 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c index d19c4fb588e5..f029559a581e 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c @@ -4630,7 +4630,7 @@ void bnxt_set_ring_params(struct bnxt *bp) bp->rx_agg_ring_size = 0; bp->rx_agg_nr_pages = 0; - if (bp->flags & BNXT_FLAG_TPA) + if (bp->flags & BNXT_FLAG_TPA || bp->flags & BNXT_FLAG_HDS) agg_factor = min_t(u32, 4, 65536 / BNXT_RX_PAGE_SIZE); bp->flags &= ~BNXT_FLAG_JUMBO; diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.h b/drivers/net/ethernet/broadcom/bnxt/bnxt.h index 7edb92ce5976..7dc06e07bae2 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.h +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.h @@ -2244,8 +2244,6 @@ struct bnxt { #define BNXT_FLAG_TPA (BNXT_FLAG_LRO | BNXT_FLAG_GRO) #define BNXT_FLAG_JUMBO 0x10 #define BNXT_FLAG_STRIP_VLAN 0x20 - #define BNXT_FLAG_AGG_RINGS (BNXT_FLAG_JUMBO | BNXT_FLAG_GRO | \ - BNXT_FLAG_LRO) #define BNXT_FLAG_RFS 0x100 #define BNXT_FLAG_SHARED_RINGS 0x200 #define BNXT_FLAG_PORT_STATS 0x400 @@ -2266,6 +2264,9 @@ struct bnxt { #define BNXT_FLAG_ROCE_MIRROR_CAP 0x4000000 #define BNXT_FLAG_TX_COAL_CMPL 0x8000000 #define BNXT_FLAG_PORT_STATS_EXT 0x10000000 + #define BNXT_FLAG_HDS 0x20000000 + #define BNXT_FLAG_AGG_RINGS (BNXT_FLAG_JUMBO | BNXT_FLAG_GRO | \ + BNXT_FLAG_LRO | BNXT_FLAG_HDS) #define BNXT_FLAG_ALL_CONFIG_FEATS (BNXT_FLAG_TPA | \ BNXT_FLAG_RFS | \ diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c b/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c index e9e63d95df17..413007190f50 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c @@ -840,16 +840,35 @@ static int bnxt_set_ringparam(struct net_device *dev, struct kernel_ethtool_ringparam *kernel_ering, struct netlink_ext_ack *extack) { + u8 tcp_data_split = kernel_ering->tcp_data_split; struct bnxt *bp = netdev_priv(dev); + u8 hds_config_mod; if ((ering->rx_pending > BNXT_MAX_RX_DESC_CNT) || (ering->tx_pending > BNXT_MAX_TX_DESC_CNT) || (ering->tx_pending < BNXT_MIN_TX_DESC_CNT)) return -EINVAL; + hds_config_mod = tcp_data_split != dev->ethtool->hds_config; + if (tcp_data_split == ETHTOOL_TCP_DATA_SPLIT_DISABLED && hds_config_mod) + return -EINVAL; + + if (tcp_data_split == ETHTOOL_TCP_DATA_SPLIT_ENABLED && + hds_config_mod && BNXT_RX_PAGE_MODE(bp)) { + NL_SET_ERR_MSG_MOD(extack, "tcp-data-split is disallowed when XDP is attached"); + return -EINVAL; + } + if (netif_running(dev)) bnxt_close_nic(bp, false, false); + if (hds_config_mod) { + if (tcp_data_split == ETHTOOL_TCP_DATA_SPLIT_ENABLED) + bp->flags |= BNXT_FLAG_HDS; + else if (tcp_data_split == ETHTOOL_TCP_DATA_SPLIT_UNKNOWN) + bp->flags &= ~BNXT_FLAG_HDS; + } + bp->rx_ring_size = ering->rx_pending; bp->tx_ring_size = ering->tx_pending; bnxt_set_ring_params(bp); @@ -5371,6 +5390,7 @@ const struct ethtool_ops bnxt_ethtool_ops = { ETHTOOL_COALESCE_STATS_BLOCK_USECS | ETHTOOL_COALESCE_USE_ADAPTIVE_RX | ETHTOOL_COALESCE_USE_CQE, + .supported_ring_params = ETHTOOL_RING_USE_TCP_DATA_SPLIT, .get_link_ksettings = bnxt_get_link_ksettings, .set_link_ksettings = bnxt_set_link_ksettings, .get_fec_stats = bnxt_get_fec_stats, diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_xdp.c b/drivers/net/ethernet/broadcom/bnxt/bnxt_xdp.c index f88b641533fc..1bfff7f29310 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt_xdp.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_xdp.c @@ -395,6 +395,10 @@ static int bnxt_xdp_set(struct bnxt *bp, struct bpf_prog *prog) bp->dev->mtu, BNXT_MAX_PAGE_MODE_MTU); return -EOPNOTSUPP; } + if (prog && bp->flags & BNXT_FLAG_HDS) { + netdev_warn(dev, "XDP is disallowed when HDS is enabled.\n"); + return -EOPNOTSUPP; + } if (!(bp->flags & BNXT_FLAG_SHARED_RINGS)) { netdev_warn(dev, "ethtool rx/tx channels must be combined to support XDP.\n"); return -EOPNOTSUPP; -- 2.50.1 From 6b43673a25c3666d42f5524e59aed8a3914924cc Mon Sep 17 00:00:00 2001 From: Taehee Yoo Date: Tue, 14 Jan 2025 14:28:50 +0000 Subject: [PATCH 07/16] bnxt_en: add support for hds-thresh ethtool command The bnxt_en driver has configured the hds_threshold value automatically when TPA is enabled based on the rx-copybreak default value. Now the hds-thresh ethtool command is added, so it adds an implementation of hds-thresh option. Configuration of the hds-thresh is applied only when the tcp-data-split is enabled. The default value of hds-thresh is 256, which is the default value of rx-copybreak, which used to be the hds_thresh value. The maximum hds-thresh is 1023. # Example: # ethtool -G enp14s0f0np0 tcp-data-split on hds-thresh 256 # ethtool -g enp14s0f0np0 Ring parameters for enp14s0f0np0: Pre-set maximums: ... HDS thresh: 1023 Current hardware settings: ... TCP data split: on HDS thresh: 256 Tested-by: Stanislav Fomichev Tested-by: Andy Gospodarek Signed-off-by: Taehee Yoo Reviewed-by: Michael Chan Link: https://patch.msgid.link/20250114142852.3364986-9-ap420073@gmail.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/broadcom/bnxt/bnxt.c | 4 +++- drivers/net/ethernet/broadcom/bnxt/bnxt.h | 2 ++ drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c | 6 +++++- 3 files changed, 10 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c index f029559a581e..caddb5cbc024 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c @@ -4610,6 +4610,7 @@ void bnxt_set_tpa_flags(struct bnxt *bp) static void bnxt_init_ring_params(struct bnxt *bp) { bp->rx_copybreak = BNXT_DEFAULT_RX_COPYBREAK; + bp->dev->ethtool->hds_thresh = BNXT_DEFAULT_RX_COPYBREAK; } /* bp->rx_ring_size, bp->tx_ring_size, dev->mtu, BNXT_FLAG_{G|L}RO flags must @@ -6569,6 +6570,7 @@ static void bnxt_hwrm_update_rss_hash_cfg(struct bnxt *bp) static int bnxt_hwrm_vnic_set_hds(struct bnxt *bp, struct bnxt_vnic_info *vnic) { + u16 hds_thresh = (u16)bp->dev->ethtool->hds_thresh; struct hwrm_vnic_plcmodes_cfg_input *req; int rc; @@ -6585,7 +6587,7 @@ static int bnxt_hwrm_vnic_set_hds(struct bnxt *bp, struct bnxt_vnic_info *vnic) VNIC_PLCMODES_CFG_REQ_FLAGS_HDS_IPV6); req->enables |= cpu_to_le32(VNIC_PLCMODES_CFG_REQ_ENABLES_HDS_THRESHOLD_VALID); - req->hds_threshold = cpu_to_le16(bp->rx_copybreak); + req->hds_threshold = cpu_to_le16(hds_thresh); } req->vnic_id = cpu_to_le32(vnic->fw_vnic_id); return hwrm_req_send(bp, req); diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.h b/drivers/net/ethernet/broadcom/bnxt/bnxt.h index 7dc06e07bae2..8f481dd9c224 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.h +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.h @@ -2779,6 +2779,8 @@ struct bnxt { #define SFF_MODULE_ID_QSFP28 0x11 #define BNXT_MAX_PHY_I2C_RESP_SIZE 64 +#define BNXT_HDS_THRESHOLD_MAX 1023 + static inline u32 bnxt_tx_avail(struct bnxt *bp, const struct bnxt_tx_ring_info *txr) { diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c b/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c index 413007190f50..540c140d52dc 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c @@ -833,6 +833,9 @@ static void bnxt_get_ringparam(struct net_device *dev, ering->rx_pending = bp->rx_ring_size; ering->rx_jumbo_pending = bp->rx_agg_ring_size; ering->tx_pending = bp->tx_ring_size; + + kernel_ering->hds_thresh = dev->ethtool->hds_thresh; + kernel_ering->hds_thresh_max = BNXT_HDS_THRESHOLD_MAX; } static int bnxt_set_ringparam(struct net_device *dev, @@ -5390,7 +5393,8 @@ const struct ethtool_ops bnxt_ethtool_ops = { ETHTOOL_COALESCE_STATS_BLOCK_USECS | ETHTOOL_COALESCE_USE_ADAPTIVE_RX | ETHTOOL_COALESCE_USE_CQE, - .supported_ring_params = ETHTOOL_RING_USE_TCP_DATA_SPLIT, + .supported_ring_params = ETHTOOL_RING_USE_TCP_DATA_SPLIT | + ETHTOOL_RING_USE_HDS_THRS, .get_link_ksettings = bnxt_get_link_ksettings, .set_link_ksettings = bnxt_set_link_ksettings, .get_fec_stats = bnxt_get_fec_stats, -- 2.50.1 From f394d07b192b67a895dbed76253ce95dcbb5d17c Mon Sep 17 00:00:00 2001 From: Taehee Yoo Date: Tue, 14 Jan 2025 14:28:51 +0000 Subject: [PATCH 08/16] netdevsim: add HDS feature HDS options(tcp-data-split, hds-thresh) have dependencies between other features like XDP. Basic dependencies are checked in the core API. netdevsim is very useful to check basic dependencies. The default tcp-data-split mode is UNKNOWN but netdevsim driver returns ENABLED when ethtool dumps tcp-data-split mode. The default value of HDS threshold is 0 and the maximum value is 1024. ethtool shows like this. ethtool -g eni1np1 Ring parameters for eni1np1: Pre-set maximums: ... HDS thresh: 1024 Current hardware settings: ... TCP data split: on HDS thresh: 0 ethtool -G eni1np1 tcp-data-split on hds-thresh 1024 ethtool -g eni1np1 Ring parameters for eni1np1: Pre-set maximums: ... HDS thresh: 1024 Current hardware settings: ... TCP data split: on HDS thresh: 1024 Signed-off-by: Taehee Yoo Link: https://patch.msgid.link/20250114142852.3364986-10-ap420073@gmail.com Signed-off-by: Jakub Kicinski --- drivers/net/netdevsim/ethtool.c | 12 +++++++++++- drivers/net/netdevsim/netdev.c | 9 +++++++++ drivers/net/netdevsim/netdevsim.h | 3 +++ 3 files changed, 23 insertions(+), 1 deletion(-) diff --git a/drivers/net/netdevsim/ethtool.c b/drivers/net/netdevsim/ethtool.c index 5fe1eaef99b5..9e0df40c71e1 100644 --- a/drivers/net/netdevsim/ethtool.c +++ b/drivers/net/netdevsim/ethtool.c @@ -2,7 +2,6 @@ // Copyright (c) 2020 Facebook #include -#include #include #include "netdevsim.h" @@ -72,6 +71,12 @@ static void nsim_get_ringparam(struct net_device *dev, struct netdevsim *ns = netdev_priv(dev); memcpy(ring, &ns->ethtool.ring, sizeof(ns->ethtool.ring)); + kernel_ring->tcp_data_split = dev->ethtool->hds_config; + kernel_ring->hds_thresh = dev->ethtool->hds_thresh; + kernel_ring->hds_thresh_max = NSIM_HDS_THRESHOLD_MAX; + + if (kernel_ring->tcp_data_split == ETHTOOL_TCP_DATA_SPLIT_UNKNOWN) + kernel_ring->tcp_data_split = ETHTOOL_TCP_DATA_SPLIT_ENABLED; } static int nsim_set_ringparam(struct net_device *dev, @@ -161,6 +166,8 @@ static int nsim_get_ts_info(struct net_device *dev, static const struct ethtool_ops nsim_ethtool_ops = { .supported_coalesce_params = ETHTOOL_COALESCE_ALL_PARAMS, + .supported_ring_params = ETHTOOL_RING_USE_TCP_DATA_SPLIT | + ETHTOOL_RING_USE_HDS_THRS, .get_pause_stats = nsim_get_pause_stats, .get_pauseparam = nsim_get_pauseparam, .set_pauseparam = nsim_set_pauseparam, @@ -182,6 +189,9 @@ static void nsim_ethtool_ring_init(struct netdevsim *ns) ns->ethtool.ring.rx_jumbo_max_pending = 4096; ns->ethtool.ring.rx_mini_max_pending = 4096; ns->ethtool.ring.tx_max_pending = 4096; + + ns->netdev->ethtool->hds_config = ETHTOOL_TCP_DATA_SPLIT_UNKNOWN; + ns->netdev->ethtool->hds_thresh = 0; } void nsim_ethtool_init(struct netdevsim *ns) diff --git a/drivers/net/netdevsim/netdev.c b/drivers/net/netdevsim/netdev.c index d013b6498539..f92b05ccdca9 100644 --- a/drivers/net/netdevsim/netdev.c +++ b/drivers/net/netdevsim/netdev.c @@ -15,6 +15,7 @@ #include #include +#include #include #include #include @@ -54,6 +55,7 @@ static int nsim_forward_skb(struct net_device *dev, struct sk_buff *skb, static netdev_tx_t nsim_start_xmit(struct sk_buff *skb, struct net_device *dev) { struct netdevsim *ns = netdev_priv(dev); + struct ethtool_netdev_state *ethtool; struct net_device *peer_dev; unsigned int len = skb->len; struct netdevsim *peer_ns; @@ -74,6 +76,13 @@ static netdev_tx_t nsim_start_xmit(struct sk_buff *skb, struct net_device *dev) rxq = rxq % peer_dev->num_rx_queues; rq = peer_ns->rq[rxq]; + ethtool = peer_dev->ethtool; + if (skb_is_nonlinear(skb) && + (ethtool->hds_config != ETHTOOL_TCP_DATA_SPLIT_ENABLED || + (ethtool->hds_config == ETHTOOL_TCP_DATA_SPLIT_ENABLED && + ethtool->hds_thresh > len))) + skb_linearize(skb); + skb_tx_timestamp(skb); if (unlikely(nsim_forward_skb(peer_dev, skb, rq) == NET_RX_DROP)) goto out_drop_cnt; diff --git a/drivers/net/netdevsim/netdevsim.h b/drivers/net/netdevsim/netdevsim.h index a70f62af4c88..dcf073bc4802 100644 --- a/drivers/net/netdevsim/netdevsim.h +++ b/drivers/net/netdevsim/netdevsim.h @@ -16,6 +16,7 @@ #include #include #include +#include #include #include #include @@ -36,6 +37,8 @@ #define NSIM_IPSEC_VALID BIT(31) #define NSIM_UDP_TUNNEL_N_PORTS 4 +#define NSIM_HDS_THRESHOLD_MAX 1024 + struct nsim_sa { struct xfrm_state *xs; __be32 ipaddr[4]; -- 2.50.1 From cfd70e3eba2b68aa230d431e3c6ca0a1566e8d2e Mon Sep 17 00:00:00 2001 From: Taehee Yoo Date: Tue, 14 Jan 2025 14:28:52 +0000 Subject: [PATCH 09/16] selftest: net-drv: hds: add test for HDS feature HDS/HDS-thresh features were updated/implemented. so add some tests for these features. HDS tests are the same with `ethtool -G eth0 tcp-data-split ` but `auto` depends on driver specification. So, it doesn't include `auto` case. HDS-thresh tests are same with `ethtool -G eth0 hds-thresh <0 - MAX>` It includes both 0 and MAX cases. It also includes exceed case, MAX + 1. Signed-off-by: Taehee Yoo Link: https://patch.msgid.link/20250114142852.3364986-11-ap420073@gmail.com Signed-off-by: Jakub Kicinski --- tools/testing/selftests/drivers/net/Makefile | 1 + tools/testing/selftests/drivers/net/hds.py | 120 +++++++++++++++++++ 2 files changed, 121 insertions(+) create mode 100755 tools/testing/selftests/drivers/net/hds.py diff --git a/tools/testing/selftests/drivers/net/Makefile b/tools/testing/selftests/drivers/net/Makefile index 469179c18935..137470bdee0c 100644 --- a/tools/testing/selftests/drivers/net/Makefile +++ b/tools/testing/selftests/drivers/net/Makefile @@ -12,6 +12,7 @@ TEST_PROGS := \ queues.py \ stats.py \ shaper.py \ + hds.py \ # end of TEST_PROGS include ../../lib.mk diff --git a/tools/testing/selftests/drivers/net/hds.py b/tools/testing/selftests/drivers/net/hds.py new file mode 100755 index 000000000000..394971b25c0b --- /dev/null +++ b/tools/testing/selftests/drivers/net/hds.py @@ -0,0 +1,120 @@ +#!/usr/bin/env python3 +# SPDX-License-Identifier: GPL-2.0 + +import errno +from lib.py import ksft_run, ksft_exit, ksft_eq, ksft_raises, KsftSkipEx +from lib.py import EthtoolFamily, NlError +from lib.py import NetDrvEnv + +def get_hds(cfg, netnl) -> None: + try: + rings = netnl.rings_get({'header': {'dev-index': cfg.ifindex}}) + except NlError as e: + raise KsftSkipEx('ring-get not supported by device') + if 'tcp-data-split' not in rings: + raise KsftSkipEx('tcp-data-split not supported by device') + +def get_hds_thresh(cfg, netnl) -> None: + try: + rings = netnl.rings_get({'header': {'dev-index': cfg.ifindex}}) + except NlError as e: + raise KsftSkipEx('ring-get not supported by device') + if 'hds-thresh' not in rings: + raise KsftSkipEx('hds-thresh not supported by device') + +def set_hds_enable(cfg, netnl) -> None: + try: + netnl.rings_set({'header': {'dev-index': cfg.ifindex}, 'tcp-data-split': 'enabled'}) + except NlError as e: + if e.error == errno.EINVAL: + raise KsftSkipEx("disabling of HDS not supported by the device") + elif e.error == errno.EOPNOTSUPP: + raise KsftSkipEx("ring-set not supported by the device") + try: + rings = netnl.rings_get({'header': {'dev-index': cfg.ifindex}}) + except NlError as e: + raise KsftSkipEx('ring-get not supported by device') + if 'tcp-data-split' not in rings: + raise KsftSkipEx('tcp-data-split not supported by device') + + ksft_eq('enabled', rings['tcp-data-split']) + +def set_hds_disable(cfg, netnl) -> None: + try: + netnl.rings_set({'header': {'dev-index': cfg.ifindex}, 'tcp-data-split': 'disabled'}) + except NlError as e: + if e.error == errno.EINVAL: + raise KsftSkipEx("disabling of HDS not supported by the device") + elif e.error == errno.EOPNOTSUPP: + raise KsftSkipEx("ring-set not supported by the device") + try: + rings = netnl.rings_get({'header': {'dev-index': cfg.ifindex}}) + except NlError as e: + raise KsftSkipEx('ring-get not supported by device') + if 'tcp-data-split' not in rings: + raise KsftSkipEx('tcp-data-split not supported by device') + + ksft_eq('disabled', rings['tcp-data-split']) + +def set_hds_thresh_zero(cfg, netnl) -> None: + try: + netnl.rings_set({'header': {'dev-index': cfg.ifindex}, 'hds-thresh': 0}) + except NlError as e: + if e.error == errno.EINVAL: + raise KsftSkipEx("hds-thresh-set not supported by the device") + elif e.error == errno.EOPNOTSUPP: + raise KsftSkipEx("ring-set not supported by the device") + try: + rings = netnl.rings_get({'header': {'dev-index': cfg.ifindex}}) + except NlError as e: + raise KsftSkipEx('ring-get not supported by device') + if 'hds-thresh' not in rings: + raise KsftSkipEx('hds-thresh not supported by device') + + ksft_eq(0, rings['hds-thresh']) + +def set_hds_thresh_max(cfg, netnl) -> None: + try: + rings = netnl.rings_get({'header': {'dev-index': cfg.ifindex}}) + except NlError as e: + raise KsftSkipEx('ring-get not supported by device') + if 'hds-thresh' not in rings: + raise KsftSkipEx('hds-thresh not supported by device') + try: + netnl.rings_set({'header': {'dev-index': cfg.ifindex}, 'hds-thresh': rings['hds-thresh-max']}) + except NlError as e: + if e.error == errno.EINVAL: + raise KsftSkipEx("hds-thresh-set not supported by the device") + elif e.error == errno.EOPNOTSUPP: + raise KsftSkipEx("ring-set not supported by the device") + rings = netnl.rings_get({'header': {'dev-index': cfg.ifindex}}) + ksft_eq(rings['hds-thresh'], rings['hds-thresh-max']) + +def set_hds_thresh_gt(cfg, netnl) -> None: + try: + rings = netnl.rings_get({'header': {'dev-index': cfg.ifindex}}) + except NlError as e: + raise KsftSkipEx('ring-get not supported by device') + if 'hds-thresh' not in rings: + raise KsftSkipEx('hds-thresh not supported by device') + if 'hds-thresh-max' not in rings: + raise KsftSkipEx('hds-thresh-max not defined by device') + hds_gt = rings['hds-thresh-max'] + 1 + with ksft_raises(NlError) as e: + netnl.rings_set({'header': {'dev-index': cfg.ifindex}, 'hds-thresh': hds_gt}) + ksft_eq(e.exception.nl_msg.error, -errno.EINVAL) + +def main() -> None: + with NetDrvEnv(__file__, queue_count=3) as cfg: + ksft_run([get_hds, + get_hds_thresh, + set_hds_disable, + set_hds_enable, + set_hds_thresh_zero, + set_hds_thresh_max, + set_hds_thresh_gt], + args=(cfg, EthtoolFamily())) + ksft_exit() + +if __name__ == "__main__": + main() -- 2.50.1 From 3440fa34ad99d471f1085bc2f4dedeaebc310261 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 14 Jan 2025 22:10:49 +0000 Subject: [PATCH 10/16] inet: ipmr: fix data-races Following fields of 'struct mr_mfc' can be updated concurrently (no lock protection) from ip_mr_forward() and ip6_mr_forward() - bytes - pkt - wrong_if - lastuse They also can be read from other functions. Convert bytes, pkt and wrong_if to atomic_long_t, and use READ_ONCE()/WRITE_ONCE() for lastuse. Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Signed-off-by: Eric Dumazet Reviewed-by: David Ahern Link: https://patch.msgid.link/20250114221049.1190631-1-edumazet@google.com Signed-off-by: Jakub Kicinski --- .../net/ethernet/mellanox/mlxsw/spectrum_mr.c | 8 +++--- include/linux/mroute_base.h | 6 ++-- net/ipv4/ipmr.c | 28 +++++++++---------- net/ipv4/ipmr_base.c | 6 ++-- net/ipv6/ip6mr.c | 28 +++++++++---------- 5 files changed, 38 insertions(+), 38 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_mr.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_mr.c index 69cd689dbc83..5afe6b155ef0 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_mr.c +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_mr.c @@ -1003,10 +1003,10 @@ static void mlxsw_sp_mr_route_stats_update(struct mlxsw_sp *mlxsw_sp, mr->mr_ops->route_stats(mlxsw_sp, mr_route->route_priv, &packets, &bytes); - if (mr_route->mfc->mfc_un.res.pkt != packets) - mr_route->mfc->mfc_un.res.lastuse = jiffies; - mr_route->mfc->mfc_un.res.pkt = packets; - mr_route->mfc->mfc_un.res.bytes = bytes; + if (atomic_long_read(&mr_route->mfc->mfc_un.res.pkt) != packets) + WRITE_ONCE(mr_route->mfc->mfc_un.res.lastuse, jiffies); + atomic_long_set(&mr_route->mfc->mfc_un.res.pkt, packets); + atomic_long_set(&mr_route->mfc->mfc_un.res.bytes, bytes); } static void mlxsw_sp_mr_stats_update(struct work_struct *work) diff --git a/include/linux/mroute_base.h b/include/linux/mroute_base.h index 9dd4bf157255..58a2401e4b55 100644 --- a/include/linux/mroute_base.h +++ b/include/linux/mroute_base.h @@ -146,9 +146,9 @@ struct mr_mfc { unsigned long last_assert; int minvif; int maxvif; - unsigned long bytes; - unsigned long pkt; - unsigned long wrong_if; + atomic_long_t bytes; + atomic_long_t pkt; + atomic_long_t wrong_if; unsigned long lastuse; unsigned char ttls[MAXVIFS]; refcount_t refcount; diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index 99d8faa508e5..21ae7594a852 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c @@ -831,7 +831,7 @@ static void ipmr_update_thresholds(struct mr_table *mrt, struct mr_mfc *cache, cache->mfc_un.res.maxvif = vifi + 1; } } - cache->mfc_un.res.lastuse = jiffies; + WRITE_ONCE(cache->mfc_un.res.lastuse, jiffies); } static int vif_add(struct net *net, struct mr_table *mrt, @@ -1681,9 +1681,9 @@ int ipmr_ioctl(struct sock *sk, int cmd, void *arg) rcu_read_lock(); c = ipmr_cache_find(mrt, sr->src.s_addr, sr->grp.s_addr); if (c) { - sr->pktcnt = c->_c.mfc_un.res.pkt; - sr->bytecnt = c->_c.mfc_un.res.bytes; - sr->wrong_if = c->_c.mfc_un.res.wrong_if; + sr->pktcnt = atomic_long_read(&c->_c.mfc_un.res.pkt); + sr->bytecnt = atomic_long_read(&c->_c.mfc_un.res.bytes); + sr->wrong_if = atomic_long_read(&c->_c.mfc_un.res.wrong_if); rcu_read_unlock(); return 0; } @@ -1753,9 +1753,9 @@ int ipmr_compat_ioctl(struct sock *sk, unsigned int cmd, void __user *arg) rcu_read_lock(); c = ipmr_cache_find(mrt, sr.src.s_addr, sr.grp.s_addr); if (c) { - sr.pktcnt = c->_c.mfc_un.res.pkt; - sr.bytecnt = c->_c.mfc_un.res.bytes; - sr.wrong_if = c->_c.mfc_un.res.wrong_if; + sr.pktcnt = atomic_long_read(&c->_c.mfc_un.res.pkt); + sr.bytecnt = atomic_long_read(&c->_c.mfc_un.res.bytes); + sr.wrong_if = atomic_long_read(&c->_c.mfc_un.res.wrong_if); rcu_read_unlock(); if (copy_to_user(arg, &sr, sizeof(sr))) @@ -1988,9 +1988,9 @@ static void ip_mr_forward(struct net *net, struct mr_table *mrt, int vif, ct; vif = c->_c.mfc_parent; - c->_c.mfc_un.res.pkt++; - c->_c.mfc_un.res.bytes += skb->len; - c->_c.mfc_un.res.lastuse = jiffies; + atomic_long_inc(&c->_c.mfc_un.res.pkt); + atomic_long_add(skb->len, &c->_c.mfc_un.res.bytes); + WRITE_ONCE(c->_c.mfc_un.res.lastuse, jiffies); if (c->mfc_origin == htonl(INADDR_ANY) && true_vifi >= 0) { struct mfc_cache *cache_proxy; @@ -2021,7 +2021,7 @@ static void ip_mr_forward(struct net *net, struct mr_table *mrt, goto dont_forward; } - c->_c.mfc_un.res.wrong_if++; + atomic_long_inc(&c->_c.mfc_un.res.wrong_if); if (true_vifi >= 0 && mrt->mroute_do_assert && /* pimsm uses asserts, when switching from RPT to SPT, @@ -3029,9 +3029,9 @@ static int ipmr_mfc_seq_show(struct seq_file *seq, void *v) if (it->cache != &mrt->mfc_unres_queue) { seq_printf(seq, " %8lu %8lu %8lu", - mfc->_c.mfc_un.res.pkt, - mfc->_c.mfc_un.res.bytes, - mfc->_c.mfc_un.res.wrong_if); + atomic_long_read(&mfc->_c.mfc_un.res.pkt), + atomic_long_read(&mfc->_c.mfc_un.res.bytes), + atomic_long_read(&mfc->_c.mfc_un.res.wrong_if)); for (n = mfc->_c.mfc_un.res.minvif; n < mfc->_c.mfc_un.res.maxvif; n++) { if (VIF_EXISTS(mrt, n) && diff --git a/net/ipv4/ipmr_base.c b/net/ipv4/ipmr_base.c index f0af12a2f70b..03b6eee407a2 100644 --- a/net/ipv4/ipmr_base.c +++ b/net/ipv4/ipmr_base.c @@ -263,9 +263,9 @@ int mr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb, lastuse = READ_ONCE(c->mfc_un.res.lastuse); lastuse = time_after_eq(jiffies, lastuse) ? jiffies - lastuse : 0; - mfcs.mfcs_packets = c->mfc_un.res.pkt; - mfcs.mfcs_bytes = c->mfc_un.res.bytes; - mfcs.mfcs_wrong_if = c->mfc_un.res.wrong_if; + mfcs.mfcs_packets = atomic_long_read(&c->mfc_un.res.pkt); + mfcs.mfcs_bytes = atomic_long_read(&c->mfc_un.res.bytes); + mfcs.mfcs_wrong_if = atomic_long_read(&c->mfc_un.res.wrong_if); if (nla_put_64bit(skb, RTA_MFC_STATS, sizeof(mfcs), &mfcs, RTA_PAD) || nla_put_u64_64bit(skb, RTA_EXPIRES, jiffies_to_clock_t(lastuse), RTA_PAD)) diff --git a/net/ipv6/ip6mr.c b/net/ipv6/ip6mr.c index 578ff1336afe..535e9f72514c 100644 --- a/net/ipv6/ip6mr.c +++ b/net/ipv6/ip6mr.c @@ -520,9 +520,9 @@ static int ipmr_mfc_seq_show(struct seq_file *seq, void *v) if (it->cache != &mrt->mfc_unres_queue) { seq_printf(seq, " %8lu %8lu %8lu", - mfc->_c.mfc_un.res.pkt, - mfc->_c.mfc_un.res.bytes, - mfc->_c.mfc_un.res.wrong_if); + atomic_long_read(&mfc->_c.mfc_un.res.pkt), + atomic_long_read(&mfc->_c.mfc_un.res.bytes), + atomic_long_read(&mfc->_c.mfc_un.res.wrong_if)); for (n = mfc->_c.mfc_un.res.minvif; n < mfc->_c.mfc_un.res.maxvif; n++) { if (VIF_EXISTS(mrt, n) && @@ -884,7 +884,7 @@ static void ip6mr_update_thresholds(struct mr_table *mrt, cache->mfc_un.res.maxvif = vifi + 1; } } - cache->mfc_un.res.lastuse = jiffies; + WRITE_ONCE(cache->mfc_un.res.lastuse, jiffies); } static int mif6_add(struct net *net, struct mr_table *mrt, @@ -1945,9 +1945,9 @@ int ip6mr_ioctl(struct sock *sk, int cmd, void *arg) c = ip6mr_cache_find(mrt, &sr->src.sin6_addr, &sr->grp.sin6_addr); if (c) { - sr->pktcnt = c->_c.mfc_un.res.pkt; - sr->bytecnt = c->_c.mfc_un.res.bytes; - sr->wrong_if = c->_c.mfc_un.res.wrong_if; + sr->pktcnt = atomic_long_read(&c->_c.mfc_un.res.pkt); + sr->bytecnt = atomic_long_read(&c->_c.mfc_un.res.bytes); + sr->wrong_if = atomic_long_read(&c->_c.mfc_un.res.wrong_if); rcu_read_unlock(); return 0; } @@ -2017,9 +2017,9 @@ int ip6mr_compat_ioctl(struct sock *sk, unsigned int cmd, void __user *arg) rcu_read_lock(); c = ip6mr_cache_find(mrt, &sr.src.sin6_addr, &sr.grp.sin6_addr); if (c) { - sr.pktcnt = c->_c.mfc_un.res.pkt; - sr.bytecnt = c->_c.mfc_un.res.bytes; - sr.wrong_if = c->_c.mfc_un.res.wrong_if; + sr.pktcnt = atomic_long_read(&c->_c.mfc_un.res.pkt); + sr.bytecnt = atomic_long_read(&c->_c.mfc_un.res.bytes); + sr.wrong_if = atomic_long_read(&c->_c.mfc_un.res.wrong_if); rcu_read_unlock(); if (copy_to_user(arg, &sr, sizeof(sr))) @@ -2142,9 +2142,9 @@ static void ip6_mr_forward(struct net *net, struct mr_table *mrt, int true_vifi = ip6mr_find_vif(mrt, dev); vif = c->_c.mfc_parent; - c->_c.mfc_un.res.pkt++; - c->_c.mfc_un.res.bytes += skb->len; - c->_c.mfc_un.res.lastuse = jiffies; + atomic_long_inc(&c->_c.mfc_un.res.pkt); + atomic_long_add(skb->len, &c->_c.mfc_un.res.bytes); + WRITE_ONCE(c->_c.mfc_un.res.lastuse, jiffies); if (ipv6_addr_any(&c->mf6c_origin) && true_vifi >= 0) { struct mfc6_cache *cache_proxy; @@ -2162,7 +2162,7 @@ static void ip6_mr_forward(struct net *net, struct mr_table *mrt, * Wrong interface: drop packet and (maybe) send PIM assert. */ if (rcu_access_pointer(mrt->vif_table[vif].dev) != dev) { - c->_c.mfc_un.res.wrong_if++; + atomic_long_inc(&c->_c.mfc_un.res.wrong_if); if (true_vifi >= 0 && mrt->mroute_do_assert && /* pimsm uses asserts, when switching from RPT to SPT, -- 2.50.1 From 0b6f6593aa8c3a05f155c12fd0e7ad33a5149c31 Mon Sep 17 00:00:00 2001 From: "Maciej S. Szmigiero" Date: Thu, 9 Jan 2025 00:33:50 +0100 Subject: [PATCH 11/16] net: wwan: iosm: Fix hibernation by re-binding the driver around it Currently, the driver is seriously broken with respect to the hibernation (S4): after image restore the device is back into IPC_MEM_EXEC_STAGE_BOOT (which AFAIK means bootloader stage) and needs full re-launch of the rest of its firmware, but the driver restore handler treats the device as merely sleeping and just sends it a wake-up command. This wake-up command times out but device nodes (/dev/wwan*) remain accessible. However attempting to use them causes the bootloader to crash and enter IPC_MEM_EXEC_STAGE_CD_READY stage (which apparently means "a crash dump is ready"). It seems that the device cannot be re-initialized from this crashed stage without toggling some reset pin (on my test platform that's apparently what the device _RST ACPI method does). While it would theoretically be possible to rewrite the driver to tear down the whole MUX / IPC layers on hibernation (so the bootloader does not crash from improper access) and then re-launch the device on restore this would require significant refactoring of the driver (believe me, I've tried), since there are quite a few assumptions hard-coded in the driver about the device never being partially de-initialized (like channels other than devlink cannot be closed, for example). Probably this would also need some programming guide for this hardware. Considering that the driver seems orphaned [1] and other people are hitting this issue too [2] fix it by simply unbinding the PCI driver before hibernation and re-binding it after restore, much like USB_QUIRK_RESET_RESUME does for USB devices that exhibit a similar problem. Tested on XMM7360 in HP EliteBook 855 G7 both with s2idle (which uses the existing suspend / resume handlers) and S4 (which uses the new code). [1]: https://lore.kernel.org/all/c248f0b4-2114-4c61-905f-466a786bdebb@leemhuis.info/ [2]: https://github.com/xmm7360/xmm7360-pci/issues/211#issuecomment-1804139413 Reviewed-by: Sergey Ryazanov Signed-off-by: Maciej S. Szmigiero Link: https://patch.msgid.link/e60287ebdb0ab54c4075071b72568a40a75d0205.1736372610.git.mail@maciej.szmigiero.name Signed-off-by: Jakub Kicinski --- drivers/net/wwan/iosm/iosm_ipc_pcie.c | 56 ++++++++++++++++++++++++++- 1 file changed, 55 insertions(+), 1 deletion(-) diff --git a/drivers/net/wwan/iosm/iosm_ipc_pcie.c b/drivers/net/wwan/iosm/iosm_ipc_pcie.c index 04517bd3325a..a066977af0be 100644 --- a/drivers/net/wwan/iosm/iosm_ipc_pcie.c +++ b/drivers/net/wwan/iosm/iosm_ipc_pcie.c @@ -6,6 +6,7 @@ #include #include #include +#include #include #include "iosm_ipc_imem.h" @@ -18,6 +19,7 @@ MODULE_LICENSE("GPL v2"); /* WWAN GUID */ static guid_t wwan_acpi_guid = GUID_INIT(0xbad01b75, 0x22a8, 0x4f48, 0x87, 0x92, 0xbd, 0xde, 0x94, 0x67, 0x74, 0x7d); +static bool pci_registered; static void ipc_pcie_resources_release(struct iosm_pcie *ipc_pcie) { @@ -448,7 +450,6 @@ static struct pci_driver iosm_ipc_driver = { }, .id_table = iosm_ipc_ids, }; -module_pci_driver(iosm_ipc_driver); int ipc_pcie_addr_map(struct iosm_pcie *ipc_pcie, unsigned char *data, size_t size, dma_addr_t *mapping, int direction) @@ -530,3 +531,56 @@ void ipc_pcie_kfree_skb(struct iosm_pcie *ipc_pcie, struct sk_buff *skb) IPC_CB(skb)->mapping = 0; dev_kfree_skb(skb); } + +static int pm_notify(struct notifier_block *nb, unsigned long mode, void *_unused) +{ + if (mode == PM_HIBERNATION_PREPARE || mode == PM_RESTORE_PREPARE) { + if (pci_registered) { + pci_unregister_driver(&iosm_ipc_driver); + pci_registered = false; + } + } else if (mode == PM_POST_HIBERNATION || mode == PM_POST_RESTORE) { + if (!pci_registered) { + int ret; + + ret = pci_register_driver(&iosm_ipc_driver); + if (ret) { + pr_err(KBUILD_MODNAME ": unable to re-register PCI driver: %d\n", + ret); + } else { + pci_registered = true; + } + } + } + + return 0; +} + +static struct notifier_block pm_notifier = { + .notifier_call = pm_notify, +}; + +static int __init iosm_ipc_driver_init(void) +{ + int ret; + + ret = pci_register_driver(&iosm_ipc_driver); + if (ret) + return ret; + + pci_registered = true; + + register_pm_notifier(&pm_notifier); + + return 0; +} +module_init(iosm_ipc_driver_init); + +static void __exit iosm_ipc_driver_exit(void) +{ + unregister_pm_notifier(&pm_notifier); + + if (pci_registered) + pci_unregister_driver(&iosm_ipc_driver); +} +module_exit(iosm_ipc_driver_exit); -- 2.50.1 From 0734d7c3d93cdcb8a56ce914d3c661300f24434d Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 14 Jan 2025 20:55:27 +0000 Subject: [PATCH 12/16] net: expedite synchronize_net() for cleanup_net() cleanup_net() is the single thread responsible for netns dismantles, and a serious bottleneck. Before we can get per-netns RTNL, make sure all synchronize_net() called from this thread are using rcu_synchronize_expedited(). v3: deal with CONFIG_NET_NS=n Signed-off-by: Eric Dumazet Reviewed-by: Jesse Brandeburg Link: https://patch.msgid.link/20250114205531.967841-2-edumazet@google.com Signed-off-by: Jakub Kicinski --- include/net/net_namespace.h | 2 ++ net/core/dev.c | 11 ++++++++++- net/core/net_namespace.c | 5 +++++ 3 files changed, 17 insertions(+), 1 deletion(-) diff --git a/include/net/net_namespace.h b/include/net/net_namespace.h index 5a2a0df8ad91..0f5eb9db0c62 100644 --- a/include/net/net_namespace.h +++ b/include/net/net_namespace.h @@ -210,6 +210,8 @@ void net_ns_barrier(void); struct ns_common *get_net_ns(struct ns_common *ns); struct net *get_net_ns_by_fd(int fd); +extern struct task_struct *cleanup_net_task; + #else /* CONFIG_NET_NS */ #include #include diff --git a/net/core/dev.c b/net/core/dev.c index 47e6b0f73cfc..115a7a0a1104 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -10099,6 +10099,15 @@ static void dev_index_release(struct net *net, int ifindex) WARN_ON(xa_erase(&net->dev_by_index, ifindex)); } +static bool from_cleanup_net(void) +{ +#ifdef CONFIG_NET_NS + return current == cleanup_net_task; +#else + return false; +#endif +} + /* Delayed registration/unregisteration */ LIST_HEAD(net_todo_list); DECLARE_WAIT_QUEUE_HEAD(netdev_unregistering_wq); @@ -11474,7 +11483,7 @@ EXPORT_SYMBOL_GPL(alloc_netdev_dummy); void synchronize_net(void) { might_sleep(); - if (rtnl_is_locked()) + if (from_cleanup_net() || rtnl_is_locked()) synchronize_rcu_expedited(); else synchronize_rcu(); diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c index b5cd3ae4f04c..cb39a12b2f82 100644 --- a/net/core/net_namespace.c +++ b/net/core/net_namespace.c @@ -588,6 +588,8 @@ static void unhash_nsid(struct net *net, struct net *last) static LLIST_HEAD(cleanup_list); +struct task_struct *cleanup_net_task; + static void cleanup_net(struct work_struct *work) { const struct pernet_operations *ops; @@ -596,6 +598,8 @@ static void cleanup_net(struct work_struct *work) LIST_HEAD(net_exit_list); LIST_HEAD(dev_kill_list); + cleanup_net_task = current; + /* Atomically snapshot the list of namespaces to cleanup */ net_kill_list = llist_del_all(&cleanup_list); @@ -670,6 +674,7 @@ static void cleanup_net(struct work_struct *work) put_user_ns(net->user_ns); net_free(net); } + cleanup_net_task = NULL; } /** -- 2.50.1 From 8a2b61e9e87936649073e287242ccdcbfb636906 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 14 Jan 2025 20:55:28 +0000 Subject: [PATCH 13/16] net: no longer assume RTNL is held in flush_all_backlogs() flush_all_backlogs() uses per-cpu and static data to hold its temporary data, on the assumption it is called under RTNL protection. Following patch in the series will break this assumption. Use instead a dynamically allocated piece of memory. In the unlikely case the allocation fails, use a boot-time allocated memory. Signed-off-by: Eric Dumazet Reviewed-by: Jesse Brandeburg Link: https://patch.msgid.link/20250114205531.967841-3-edumazet@google.com Signed-off-by: Jakub Kicinski --- net/core/dev.c | 53 +++++++++++++++++++++++++++++++++----------------- 1 file changed, 35 insertions(+), 18 deletions(-) diff --git a/net/core/dev.c b/net/core/dev.c index 115a7a0a1104..41da51f95486 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -6013,8 +6013,6 @@ void netif_receive_skb_list(struct list_head *head) } EXPORT_SYMBOL(netif_receive_skb_list); -static DEFINE_PER_CPU(struct work_struct, flush_works); - /* Network device is going away, flush any packets still pending */ static void flush_backlog(struct work_struct *work) { @@ -6071,36 +6069,54 @@ static bool flush_required(int cpu) return true; } +struct flush_backlogs { + cpumask_t flush_cpus; + struct work_struct w[]; +}; + +static struct flush_backlogs *flush_backlogs_alloc(void) +{ + return kmalloc(struct_size_t(struct flush_backlogs, w, nr_cpu_ids), + GFP_KERNEL); +} + +static struct flush_backlogs *flush_backlogs_fallback; +static DEFINE_MUTEX(flush_backlogs_mutex); + static void flush_all_backlogs(void) { - static cpumask_t flush_cpus; + struct flush_backlogs *ptr = flush_backlogs_alloc(); unsigned int cpu; - /* since we are under rtnl lock protection we can use static data - * for the cpumask and avoid allocating on stack the possibly - * large mask - */ - ASSERT_RTNL(); + if (!ptr) { + mutex_lock(&flush_backlogs_mutex); + ptr = flush_backlogs_fallback; + } + cpumask_clear(&ptr->flush_cpus); cpus_read_lock(); - cpumask_clear(&flush_cpus); for_each_online_cpu(cpu) { if (flush_required(cpu)) { - queue_work_on(cpu, system_highpri_wq, - per_cpu_ptr(&flush_works, cpu)); - cpumask_set_cpu(cpu, &flush_cpus); + INIT_WORK(&ptr->w[cpu], flush_backlog); + queue_work_on(cpu, system_highpri_wq, &ptr->w[cpu]); + __cpumask_set_cpu(cpu, &ptr->flush_cpus); } } /* we can have in flight packet[s] on the cpus we are not flushing, * synchronize_net() in unregister_netdevice_many() will take care of - * them + * them. */ - for_each_cpu(cpu, &flush_cpus) - flush_work(per_cpu_ptr(&flush_works, cpu)); + for_each_cpu(cpu, &ptr->flush_cpus) + flush_work(&ptr->w[cpu]); cpus_read_unlock(); + + if (ptr != flush_backlogs_fallback) + kfree(ptr); + else + mutex_unlock(&flush_backlogs_mutex); } static void net_rps_send_ipi(struct softnet_data *remsd) @@ -12313,12 +12329,13 @@ static int __init net_dev_init(void) * Initialise the packet receive queues. */ + flush_backlogs_fallback = flush_backlogs_alloc(); + if (!flush_backlogs_fallback) + goto out; + for_each_possible_cpu(i) { - struct work_struct *flush = per_cpu_ptr(&flush_works, i); struct softnet_data *sd = &per_cpu(softnet_data, i); - INIT_WORK(flush, flush_backlog); - skb_queue_head_init(&sd->input_pkt_queue); skb_queue_head_init(&sd->process_queue); #ifdef CONFIG_XFRM_OFFLOAD -- 2.50.1 From cfa579f6665635b72d4a075fc91eb144c2b0f74e Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 14 Jan 2025 20:55:29 +0000 Subject: [PATCH 14/16] net: no longer hold RTNL while calling flush_all_backlogs() flush_all_backlogs() is called from unregister_netdevice_many_notify() as part of netdevice dismantles. This is currently called under RTNL, and can last up to 50 ms on busy hosts. There is no reason to hold RTNL at this stage, if our caller is cleanup_net() : netns are no more visible, devices are in NETREG_UNREGISTERING state and no other thread could mess our state while RTNL is temporarily released. In order to provide isolation, this patch provides a separate 'net_todo_list' for cleanup_net(). Signed-off-by: Eric Dumazet Reviewed-by: Jesse Brandeburg Link: https://patch.msgid.link/20250114205531.967841-4-edumazet@google.com Signed-off-by: Jakub Kicinski --- net/core/dev.c | 31 ++++++++++++++++++++++++++++--- 1 file changed, 28 insertions(+), 3 deletions(-) diff --git a/net/core/dev.c b/net/core/dev.c index 41da51f95486..d652ea66e575 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -10124,14 +10124,37 @@ static bool from_cleanup_net(void) #endif } +static void rtnl_drop_if_cleanup_net(void) +{ + if (from_cleanup_net()) + __rtnl_unlock(); +} + +static void rtnl_acquire_if_cleanup_net(void) +{ + if (from_cleanup_net()) + rtnl_lock(); +} + /* Delayed registration/unregisteration */ LIST_HEAD(net_todo_list); +static LIST_HEAD(net_todo_list_for_cleanup_net); + +/* TODO: net_todo_list/net_todo_list_for_cleanup_net should probably + * be provided by callers, instead of being static, rtnl protected. + */ +static struct list_head *todo_list(void) +{ + return from_cleanup_net() ? &net_todo_list_for_cleanup_net : + &net_todo_list; +} + DECLARE_WAIT_QUEUE_HEAD(netdev_unregistering_wq); atomic_t dev_unreg_count = ATOMIC_INIT(0); static void net_set_todo(struct net_device *dev) { - list_add_tail(&dev->todo_list, &net_todo_list); + list_add_tail(&dev->todo_list, todo_list()); } static netdev_features_t netdev_sync_upper_features(struct net_device *lower, @@ -10979,7 +11002,7 @@ void netdev_run_todo(void) #endif /* Snapshot list, allow later requests */ - list_replace_init(&net_todo_list, &list); + list_replace_init(todo_list(), &list); __rtnl_unlock(); @@ -11602,8 +11625,10 @@ void unregister_netdevice_many_notify(struct list_head *head, unlist_netdevice(dev); WRITE_ONCE(dev->reg_state, NETREG_UNREGISTERING); } - flush_all_backlogs(); + rtnl_drop_if_cleanup_net(); + flush_all_backlogs(); + rtnl_acquire_if_cleanup_net(); synchronize_net(); list_for_each_entry(dev, head, unreg_list) { -- 2.50.1 From ae646f1a0bb97401bac0044bbe2a179a1e38b408 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 14 Jan 2025 20:55:30 +0000 Subject: [PATCH 15/16] net: reduce RTNL hold duration in unregister_netdevice_many_notify() (part 1) Two synchronize_net() calls are currently done while holding RTNL. This is source of RTNL contention in workloads adding and deleting many network namespaces per second, because synchronize_rcu() and synchronize_rcu_expedited() can use 60+ ms in some cases. For cleanup_net() use, temporarily release RTNL while calling the last synchronize_net(). This should be safe, because devices are no longer visible to other threads at this point. In any case, the new netdev_lock() / netdev_unlock() infrastructure that we are adding should allow to fix potential issues, with a combination of a per-device mutex and dev->reg_state awareness. Signed-off-by: Eric Dumazet Reviewed-by: Jesse Brandeburg Link: https://patch.msgid.link/20250114205531.967841-5-edumazet@google.com Signed-off-by: Jakub Kicinski --- net/core/dev.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/net/core/dev.c b/net/core/dev.c index d652ea66e575..3924a4af68b8 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -11629,6 +11629,7 @@ void unregister_netdevice_many_notify(struct list_head *head, rtnl_drop_if_cleanup_net(); flush_all_backlogs(); rtnl_acquire_if_cleanup_net(); + /* TODO: move this before the prior rtnl_acquire_if_cleanup_net() */ synchronize_net(); list_for_each_entry(dev, head, unreg_list) { @@ -11689,7 +11690,9 @@ void unregister_netdevice_many_notify(struct list_head *head, #endif } + rtnl_drop_if_cleanup_net(); synchronize_net(); + rtnl_acquire_if_cleanup_net(); list_for_each_entry(dev, head, unreg_list) { netdev_put(dev, &dev->dev_registered_tracker); -- 2.50.1 From 83419b61d187ce22aa3da5ffdda850fca3a12600 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 14 Jan 2025 20:55:31 +0000 Subject: [PATCH 16/16] net: reduce RTNL hold duration in unregister_netdevice_many_notify() (part 2) One synchronize_net() call is currently done while holding RTNL. This is source of RTNL contention in workloads adding and deleting many network namespaces per second, because synchronize_rcu() and synchronize_rcu_expedited() can use 60+ ms in some cases. For cleanup_net() use, temporarily release RTNL while calling the last synchronize_net(). This should be safe, because devices are no longer visible to other threads after unlist_netdevice() call and setting dev->reg_state to NETREG_UNREGISTERING. In any case, the new netdev_lock() / netdev_unlock() infrastructure that we are adding should allow to fix potential issues, with a combination of a per-device mutex and dev->reg_state awareness. Signed-off-by: Eric Dumazet Reviewed-by: Jesse Brandeburg Link: https://patch.msgid.link/20250114205531.967841-6-edumazet@google.com Signed-off-by: Jakub Kicinski --- net/core/dev.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/net/core/dev.c b/net/core/dev.c index 3924a4af68b8..3c87cb1cb877 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -11628,9 +11628,8 @@ void unregister_netdevice_many_notify(struct list_head *head, rtnl_drop_if_cleanup_net(); flush_all_backlogs(); - rtnl_acquire_if_cleanup_net(); - /* TODO: move this before the prior rtnl_acquire_if_cleanup_net() */ synchronize_net(); + rtnl_acquire_if_cleanup_net(); list_for_each_entry(dev, head, unreg_list) { struct sk_buff *skb = NULL; -- 2.50.1