From d14fe43e0007c61df3a35f3d201bba1ba755117b Mon Sep 17 00:00:00 2001
From: Rosen Penev <rosenp@gmail.com>
Date: Mon, 30 Sep 2024 11:18:23 -0700
Subject: [PATCH 01/16] net: ag71xx: move assignment into main loop

Effectively what's going on here is there's a main loop and an identical
one below with a single assignment. Simpler to move it up.

Signed-off-by: Rosen Penev <rosenp@gmail.com>
Link: https://patch.msgid.link/20240930181823.288892-6-rosenp@gmail.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 drivers/net/ethernet/atheros/ag71xx.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/atheros/ag71xx.c b/drivers/net/ethernet/atheros/ag71xx.c
index 067a012a5799..3d4c3d8698e2 100644
--- a/drivers/net/ethernet/atheros/ag71xx.c
+++ b/drivers/net/ethernet/atheros/ag71xx.c
@@ -1646,6 +1646,7 @@ static int ag71xx_rx_packets(struct ag71xx *ag, int limit)
 
 		skb->dev = ndev;
 		skb->ip_summed = CHECKSUM_NONE;
+		skb->protocol = eth_type_trans(skb, ndev);
 		list_add_tail(&skb->list, &rx_list);
 
 next:
@@ -1657,8 +1658,6 @@ next:
 
 	ag71xx_ring_rx_refill(ag);
 
-	list_for_each_entry(skb, &rx_list, list)
-		skb->protocol = eth_type_trans(skb, ndev);
 	netif_receive_skb_list(&rx_list);
 
 	netif_dbg(ag, rx_status, ndev, "rx finish, curr=%u, dirty=%u, done=%d\n",
-- 
2.51.0


From 4d77e88ab42f76c7c09eb1c693453801a21b2c88 Mon Sep 17 00:00:00 2001
From: Rosen Penev <rosenp@gmail.com>
Date: Mon, 30 Sep 2024 13:29:50 -0700
Subject: [PATCH 02/16] net: mv643xx: use devm_platform_ioremap_resource

This combines multiple steps in one function.

Signed-off-by: Rosen Penev <rosenp@gmail.com>
Reviewed-by: Andrew Lunn <andrew@lunn.ch>
Link: https://patch.msgid.link/20240930202951.297737-2-rosenp@gmail.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 drivers/net/ethernet/marvell/mv643xx_eth.c | 11 +++--------
 1 file changed, 3 insertions(+), 8 deletions(-)

diff --git a/drivers/net/ethernet/marvell/mv643xx_eth.c b/drivers/net/ethernet/marvell/mv643xx_eth.c
index 9e80899546d9..3036ac9f042a 100644
--- a/drivers/net/ethernet/marvell/mv643xx_eth.c
+++ b/drivers/net/ethernet/marvell/mv643xx_eth.c
@@ -2843,25 +2843,20 @@ static int mv643xx_eth_shared_probe(struct platform_device *pdev)
 	struct mv643xx_eth_shared_platform_data *pd;
 	struct mv643xx_eth_shared_private *msp;
 	const struct mbus_dram_target_info *dram;
-	struct resource *res;
 	int ret;
 
 	if (!mv643xx_eth_version_printed++)
 		pr_notice("MV-643xx 10/100/1000 ethernet driver version %s\n",
 			  mv643xx_eth_driver_version);
 
-	res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
-	if (res == NULL)
-		return -EINVAL;
-
 	msp = devm_kzalloc(&pdev->dev, sizeof(*msp), GFP_KERNEL);
 	if (msp == NULL)
 		return -ENOMEM;
 	platform_set_drvdata(pdev, msp);
 
-	msp->base = devm_ioremap(&pdev->dev, res->start, resource_size(res));
-	if (msp->base == NULL)
-		return -ENOMEM;
+	msp->base = devm_platform_ioremap_resource(pdev, 0);
+	if (IS_ERR(msp->base))
+		return PTR_ERR(msp->base);
 
 	msp->clk = devm_clk_get(&pdev->dev, NULL);
 	if (!IS_ERR(msp->clk))
-- 
2.51.0


From 50c3a7fbaa10a1973cfcf910601a5120b2595022 Mon Sep 17 00:00:00 2001
From: Rosen Penev <rosenp@gmail.com>
Date: Mon, 30 Sep 2024 13:29:51 -0700
Subject: [PATCH 03/16] net: mv643xx: fix wrong devm_clk_get usage

This clock should be optional. In addition, PTR_ERR can be -EPROBE_DEFER
in which case it should return.

devm_clk_get_optional_enabled also allows removing explicit clock enable
and disable calls.

Signed-off-by: Rosen Penev <rosenp@gmail.com>
Reviewed-by: Andrew Lunn <andrew@lunn.ch>
Link: https://patch.msgid.link/20240930202951.297737-3-rosenp@gmail.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 drivers/net/ethernet/marvell/mv643xx_eth.c | 17 ++++-------------
 1 file changed, 4 insertions(+), 13 deletions(-)

diff --git a/drivers/net/ethernet/marvell/mv643xx_eth.c b/drivers/net/ethernet/marvell/mv643xx_eth.c
index 3036ac9f042a..34880017d9dc 100644
--- a/drivers/net/ethernet/marvell/mv643xx_eth.c
+++ b/drivers/net/ethernet/marvell/mv643xx_eth.c
@@ -2858,9 +2858,9 @@ static int mv643xx_eth_shared_probe(struct platform_device *pdev)
 	if (IS_ERR(msp->base))
 		return PTR_ERR(msp->base);
 
-	msp->clk = devm_clk_get(&pdev->dev, NULL);
-	if (!IS_ERR(msp->clk))
-		clk_prepare_enable(msp->clk);
+	msp->clk = devm_clk_get_optional_enabled(&pdev->dev, NULL);
+	if (IS_ERR(msp->clk))
+		return PTR_ERR(msp->clk);
 
 	/*
 	 * (Re-)program MBUS remapping windows if we are asked to.
@@ -2871,7 +2871,7 @@ static int mv643xx_eth_shared_probe(struct platform_device *pdev)
 
 	ret = mv643xx_eth_shared_of_probe(pdev);
 	if (ret)
-		goto err_put_clk;
+		return ret;
 	pd = dev_get_platdata(&pdev->dev);
 
 	msp->tx_csum_limit = (pd != NULL && pd->tx_csum_limit) ?
@@ -2879,20 +2879,11 @@ static int mv643xx_eth_shared_probe(struct platform_device *pdev)
 	infer_hw_params(msp);
 
 	return 0;
-
-err_put_clk:
-	if (!IS_ERR(msp->clk))
-		clk_disable_unprepare(msp->clk);
-	return ret;
 }
 
 static void mv643xx_eth_shared_remove(struct platform_device *pdev)
 {
-	struct mv643xx_eth_shared_private *msp = platform_get_drvdata(pdev);
-
 	mv643xx_eth_shared_of_remove();
-	if (!IS_ERR(msp->clk))
-		clk_disable_unprepare(msp->clk);
 }
 
 static struct platform_driver mv643xx_eth_shared_driver = {
-- 
2.51.0


From b8db67d4df0022e3595263b542953c251f4ccf06 Mon Sep 17 00:00:00 2001
From: Michal Schmidt <mschmidt@redhat.com>
Date: Mon, 30 Sep 2024 22:13:04 +0200
Subject: [PATCH 04/16] qed: make 'ethtool -d' 10 times faster
MIME-Version: 1.0
Content-Type: text/plain; charset=utf8
Content-Transfer-Encoding: 8bit

As a side effect of commit 5401c3e09928 ("qed: allow sleep in
qed_mcp_trace_dump()"), 'ethtool -d' became much slower.
Almost all the time is spent collecting the "mcp_trace".
It is caused by sleeping too long in _qed_mcp_cmd_and_union.
When called with sleeping not allowed, the function delays for 10 Âµs
between firmware polls. But if sleeping is allowed, it sleeps for 10 ms
instead.

The sleeps in _qed_mcp_cmd_and_union are unnecessarily long.
Replace msleep with usleep_range, which allows to achieve a similar
polling interval like in the no-sleeping mode (10 - 20 Âµs).

The only caller, qed_mcp_cmd_and_union, can stop doing the
multiplication/division of the usecs/max_retries. The polling interval
and the number of retries do not need to be parameters at all.

On my test system, 'ethtool -d' now takes 4 seconds instead of 44.

Signed-off-by: Michal Schmidt <mschmidt@redhat.com>
Link: https://patch.msgid.link/20240930201307.330692-2-mschmidt@redhat.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 drivers/net/ethernet/qlogic/qed/qed_mcp.c | 36 ++++++++++-------------
 1 file changed, 15 insertions(+), 21 deletions(-)

diff --git a/drivers/net/ethernet/qlogic/qed/qed_mcp.c b/drivers/net/ethernet/qlogic/qed/qed_mcp.c
index 16e6bd466143..00f0abc1c2d2 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_mcp.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_mcp.c
@@ -459,12 +459,11 @@ static void qed_mcp_print_cpu_info(struct qed_hwfn *p_hwfn,
 static int
 _qed_mcp_cmd_and_union(struct qed_hwfn *p_hwfn,
 		       struct qed_ptt *p_ptt,
-		       struct qed_mcp_mb_params *p_mb_params,
-		       u32 max_retries, u32 usecs)
+		       struct qed_mcp_mb_params *p_mb_params)
 {
-	u32 cnt = 0, msecs = DIV_ROUND_UP(usecs, 1000);
 	struct qed_mcp_cmd_elem *p_cmd_elem;
 	u16 seq_num;
+	u32 cnt = 0;
 	int rc = 0;
 
 	/* Wait until the mailbox is non-occupied */
@@ -488,12 +487,13 @@ _qed_mcp_cmd_and_union(struct qed_hwfn *p_hwfn,
 		spin_unlock_bh(&p_hwfn->mcp_info->cmd_lock);
 
 		if (QED_MB_FLAGS_IS_SET(p_mb_params, CAN_SLEEP))
-			msleep(msecs);
+			usleep_range(QED_MCP_RESP_ITER_US,
+				     QED_MCP_RESP_ITER_US * 2);
 		else
-			udelay(usecs);
-	} while (++cnt < max_retries);
+			udelay(QED_MCP_RESP_ITER_US);
+	} while (++cnt < QED_DRV_MB_MAX_RETRIES);
 
-	if (cnt >= max_retries) {
+	if (cnt >= QED_DRV_MB_MAX_RETRIES) {
 		DP_NOTICE(p_hwfn,
 			  "The MFW mailbox is occupied by an uncompleted command. Failed to send command 0x%08x [param 0x%08x].\n",
 			  p_mb_params->cmd, p_mb_params->param);
@@ -520,9 +520,10 @@ _qed_mcp_cmd_and_union(struct qed_hwfn *p_hwfn,
 		 */
 
 		if (QED_MB_FLAGS_IS_SET(p_mb_params, CAN_SLEEP))
-			msleep(msecs);
+			usleep_range(QED_MCP_RESP_ITER_US,
+				     QED_MCP_RESP_ITER_US * 2);
 		else
-			udelay(usecs);
+			udelay(QED_MCP_RESP_ITER_US);
 
 		spin_lock_bh(&p_hwfn->mcp_info->cmd_lock);
 
@@ -536,9 +537,9 @@ _qed_mcp_cmd_and_union(struct qed_hwfn *p_hwfn,
 			goto err;
 
 		spin_unlock_bh(&p_hwfn->mcp_info->cmd_lock);
-	} while (++cnt < max_retries);
+	} while (++cnt < QED_DRV_MB_MAX_RETRIES);
 
-	if (cnt >= max_retries) {
+	if (cnt >= QED_DRV_MB_MAX_RETRIES) {
 		DP_NOTICE(p_hwfn,
 			  "The MFW failed to respond to command 0x%08x [param 0x%08x].\n",
 			  p_mb_params->cmd, p_mb_params->param);
@@ -564,7 +565,8 @@ _qed_mcp_cmd_and_union(struct qed_hwfn *p_hwfn,
 		   "MFW mailbox: response 0x%08x param 0x%08x [after %d.%03d ms]\n",
 		   p_mb_params->mcp_resp,
 		   p_mb_params->mcp_param,
-		   (cnt * usecs) / 1000, (cnt * usecs) % 1000);
+		   (cnt * QED_MCP_RESP_ITER_US) / 1000,
+		   (cnt * QED_MCP_RESP_ITER_US) % 1000);
 
 	/* Clear the sequence number from the MFW response */
 	p_mb_params->mcp_resp &= FW_MSG_CODE_MASK;
@@ -581,8 +583,6 @@ static int qed_mcp_cmd_and_union(struct qed_hwfn *p_hwfn,
 				 struct qed_mcp_mb_params *p_mb_params)
 {
 	size_t union_data_size = sizeof(union drv_union_data);
-	u32 max_retries = QED_DRV_MB_MAX_RETRIES;
-	u32 usecs = QED_MCP_RESP_ITER_US;
 
 	/* MCP not initialized */
 	if (!qed_mcp_is_init(p_hwfn)) {
@@ -606,13 +606,7 @@ static int qed_mcp_cmd_and_union(struct qed_hwfn *p_hwfn,
 		return -EINVAL;
 	}
 
-	if (QED_MB_FLAGS_IS_SET(p_mb_params, CAN_SLEEP)) {
-		max_retries = DIV_ROUND_UP(max_retries, 1000);
-		usecs *= 1000;
-	}
-
-	return _qed_mcp_cmd_and_union(p_hwfn, p_ptt, p_mb_params, max_retries,
-				      usecs);
+	return _qed_mcp_cmd_and_union(p_hwfn, p_ptt, p_mb_params);
 }
 
 static int _qed_mcp_cmd(struct qed_hwfn *p_hwfn,
-- 
2.51.0


From 6cd695706f8bb3d06a4dda1f7d673dbfcca45784 Mon Sep 17 00:00:00 2001
From: Michal Schmidt <mschmidt@redhat.com>
Date: Mon, 30 Sep 2024 22:13:05 +0200
Subject: [PATCH 05/16] qed: put cond_resched() in qed_grc_dump_ctx_data()

On a kernel with preemption none or voluntary, 'ethtool -d'
on a qede network device can cause a big latency spike.
The biggest part of it is the loop in qed_grc_dump_ctx_data.

The function is called only from the .get_size and .perform_dump
callbacks for the "grc" feature defined in qed_features_lookup[].
As far as I can see, they are used in:
 - qed's devlink healh reporter .dump op
 - qede's ethtool get_regs/get_regs_len/get_dump_data ops
 - qedf's qedf_get_grc_dump, called from:
   - qedf_sysfs_write_grcdump - "grcdump" sysfs attribute write
   - qedf_wq_grcdump - a workqueue

It is safe to sleep in all of them.
Let's insert a cond_resched() in the outer loop to let other tasks run.

Measured using this script:

  #!/bin/bash
  DEV=ens3f1
  echo wakeup_rt > /sys/kernel/tracing/current_tracer
  echo 0 > /sys/kernel/tracing/tracing_max_latency
  echo 1 > /sys/kernel/tracing/tracing_on
  echo "Setting the task CPU affinity"
  taskset -p 1 $$ > /dev/null
  echo "Starting the real-time task"
  chrt -f 50 bash -c 'while sleep 0.01; do :; done' &
  sleep 1
  echo "Running: ethtool -d $DEV"
  time ethtool -d $DEV > /dev/null
  kill %1
  echo 0 > /sys/kernel/tracing/tracing_on
  echo "Measured latency: $(</sys/kernel/tracing/tracing_max_latency) us"
  echo "To see the latency trace: less /sys/kernel/tracing/trace"

The patch lowers the latency from 180 ms to 53 ms on my test system with
voluntary preemption.

Signed-off-by: Michal Schmidt <mschmidt@redhat.com>
Link: https://patch.msgid.link/20240930201307.330692-3-mschmidt@redhat.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 drivers/net/ethernet/qlogic/qed/qed_debug.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/net/ethernet/qlogic/qed/qed_debug.c b/drivers/net/ethernet/qlogic/qed/qed_debug.c
index f67be4b8ad43..464a72afb758 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_debug.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_debug.c
@@ -2873,6 +2873,7 @@ static u32 qed_grc_dump_ctx_data(struct qed_hwfn *p_hwfn,
 							  false,
 							  SPLIT_TYPE_NONE, 0);
 		}
+		cond_resched();
 	}
 
 	return offset;
-- 
2.51.0


From cf54ae6b59203bea4f4c749043fa57a58d279e38 Mon Sep 17 00:00:00 2001
From: Michal Schmidt <mschmidt@redhat.com>
Date: Mon, 30 Sep 2024 22:13:06 +0200
Subject: [PATCH 06/16] qed: allow the callee of qed_mcp_nvm_read() to sleep

qed_mcp_nvm_read has a loop where it calls qed_mcp_nvm_rd_cmd with the
argument b_can_sleep=false. And it sleeps once every 0x1000 bytes
read.

Simplify this by letting qed_mcp_nvm_rd_cmd itself sleep
(b_can_sleep=true). It will have slept at least once when successful
(in the "Wait for the MFW response" loop). So the extra sleep once every
0x1000 bytes becomes superfluous. Delete it.

On my test system with voluntary preemption, this lowers the latency
caused by 'ethtool -d' from 53 ms to 10 ms.

Signed-off-by: Michal Schmidt <mschmidt@redhat.com>
Link: https://patch.msgid.link/20240930201307.330692-4-mschmidt@redhat.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 drivers/net/ethernet/qlogic/qed/qed_mcp.c | 9 +--------
 1 file changed, 1 insertion(+), 8 deletions(-)

diff --git a/drivers/net/ethernet/qlogic/qed/qed_mcp.c b/drivers/net/ethernet/qlogic/qed/qed_mcp.c
index 00f0abc1c2d2..26a714bfad4e 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_mcp.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_mcp.c
@@ -3079,20 +3079,13 @@ int qed_mcp_nvm_read(struct qed_dev *cdev, u32 addr, u8 *p_buf, u32 len)
 					 DRV_MB_PARAM_NVM_LEN_OFFSET),
 					&resp, &resp_param,
 					&read_len,
-					(u32 *)(p_buf + offset), false);
+					(u32 *)(p_buf + offset), true);
 
 		if (rc || (resp != FW_MSG_CODE_NVM_OK)) {
 			DP_NOTICE(cdev, "MCP command rc = %d\n", rc);
 			break;
 		}
 
-		/* This can be a lengthy process, and it's possible scheduler
-		 * isn't preemptible. Sleep a bit to prevent CPU hogging.
-		 */
-		if (bytes_left % 0x1000 <
-		    (bytes_left - read_len) % 0x1000)
-			usleep_range(1000, 2000);
-
 		offset += read_len;
 		bytes_left -= read_len;
 	}
-- 
2.51.0


From 2efeaf1d2a13f4b7419d60cd145ac84a3c151214 Mon Sep 17 00:00:00 2001
From: Michal Schmidt <mschmidt@redhat.com>
Date: Mon, 30 Sep 2024 22:13:07 +0200
Subject: [PATCH 07/16] qed: put cond_resched() in qed_dmae_operation_wait()

It is OK to sleep in qed_dmae_operation_wait, because it is called only
in process context, while holding p_hwfn->dmae_info.mutex from one of
the qed_dmae_{host,grc}2{host,grc} functions.
The udelay(DMAE_MIN_WAIT_TIME=2) in the function is too short to replace
with usleep_range, but at least it's a suitable point for checking if we
should give up the CPU with cond_resched().

This lowers the latency caused by 'ethtool -d' from 10 ms to less than
2 ms on my test system with voluntary preemption.

Signed-off-by: Michal Schmidt <mschmidt@redhat.com>
Link: https://patch.msgid.link/20240930201307.330692-5-mschmidt@redhat.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 drivers/net/ethernet/qlogic/qed/qed_hw.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/net/ethernet/qlogic/qed/qed_hw.c b/drivers/net/ethernet/qlogic/qed/qed_hw.c
index 6263f847b6b9..9e5f0dbc8a07 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_hw.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_hw.c
@@ -596,6 +596,7 @@ static int qed_dmae_operation_wait(struct qed_hwfn *p_hwfn)
 	barrier();
 	while (*p_hwfn->dmae_info.p_completion_word != DMAE_COMPLETION_VAL) {
 		udelay(DMAE_MIN_WAIT_TIME);
+		cond_resched();
 		if (++wait_cnt > wait_cnt_limit) {
 			DP_NOTICE(p_hwfn->cdev,
 				  "Timed-out waiting for operation to complete. Completion word is 0x%08x expected 0x%08x.\n",
-- 
2.51.0


From 1d39d02a1535658962f9370312be7b2d634946a5 Mon Sep 17 00:00:00 2001
From: Javier Carrasco <javier.carrasco.cruz@gmail.com>
Date: Mon, 30 Sep 2024 22:38:25 +0200
Subject: [PATCH 08/16] net: mdio: thunder: switch to scoped
 device_for_each_child_node()

There has already been an issue with the handling of early exits from
device_for_each_child() in this driver, and it was solved with commit
b1de5c78ebe9 ("net: mdio: thunder: Add missing fwnode_handle_put()") by
adding a call to fwnode_handle_put() right after the loop.

That solution is valid indeed, but if a new error path with a 'return'
is added to the loop, this solution will fail. A more secure approach
is using the scoped variant of the macro, which automatically
decrements the refcount of the child node when it goes out of scope,
removing the need for explicit calls to fwnode_handle_put().

Signed-off-by: Javier Carrasco <javier.carrasco.cruz@gmail.com>
Link: https://patch.msgid.link/20240930-net-device_for_each_child_node_scoped-v2-1-35f09333c1d7@gmail.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 drivers/net/mdio/mdio-thunder.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/drivers/net/mdio/mdio-thunder.c b/drivers/net/mdio/mdio-thunder.c
index 6067d96b2b7b..1e1aa72b1eff 100644
--- a/drivers/net/mdio/mdio-thunder.c
+++ b/drivers/net/mdio/mdio-thunder.c
@@ -23,7 +23,6 @@ static int thunder_mdiobus_pci_probe(struct pci_dev *pdev,
 				     const struct pci_device_id *ent)
 {
 	struct device_node *node;
-	struct fwnode_handle *fwn;
 	struct thunder_mdiobus_nexus *nexus;
 	int err;
 	int i;
@@ -54,7 +53,7 @@ static int thunder_mdiobus_pci_probe(struct pci_dev *pdev,
 	}
 
 	i = 0;
-	device_for_each_child_node(&pdev->dev, fwn) {
+	device_for_each_child_node_scoped(&pdev->dev, fwn) {
 		struct resource r;
 		struct mii_bus *mii_bus;
 		struct cavium_mdiobus *bus;
@@ -106,7 +105,6 @@ static int thunder_mdiobus_pci_probe(struct pci_dev *pdev,
 		if (i >= ARRAY_SIZE(nexus->buses))
 			break;
 	}
-	fwnode_handle_put(fwn);
 	return 0;
 
 err_release_regions:
-- 
2.51.0


From e97dccd3e976331cd2c9fa17fcb716af19bd5c68 Mon Sep 17 00:00:00 2001
From: Javier Carrasco <javier.carrasco.cruz@gmail.com>
Date: Mon, 30 Sep 2024 22:38:26 +0200
Subject: [PATCH 09/16] net: hns: hisilicon: hns_dsaf_mac: switch to scoped
 device_for_each_child_node()

Use device_for_each_child_node_scoped() to simplify the code by removing
the need for explicit calls to fwnode_handle_put() in every error path.
This approach also accounts for any error path that could be added.

Signed-off-by: Javier Carrasco <javier.carrasco.cruz@gmail.com>
Link: https://patch.msgid.link/20240930-net-device_for_each_child_node_scoped-v2-2-35f09333c1d7@gmail.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 drivers/net/ethernet/hisilicon/hns/hns_dsaf_mac.c | 10 +++-------
 1 file changed, 3 insertions(+), 7 deletions(-)

diff --git a/drivers/net/ethernet/hisilicon/hns/hns_dsaf_mac.c b/drivers/net/ethernet/hisilicon/hns/hns_dsaf_mac.c
index 58baac7103b3..5fa9b2eeb929 100644
--- a/drivers/net/ethernet/hisilicon/hns/hns_dsaf_mac.c
+++ b/drivers/net/ethernet/hisilicon/hns/hns_dsaf_mac.c
@@ -1090,28 +1090,24 @@ int hns_mac_init(struct dsaf_device *dsaf_dev)
 	u32 port_id;
 	int max_port_num = hns_mac_get_max_port_num(dsaf_dev);
 	struct hns_mac_cb *mac_cb;
-	struct fwnode_handle *child;
 
-	device_for_each_child_node(dsaf_dev->dev, child) {
+	device_for_each_child_node_scoped(dsaf_dev->dev, child) {
 		ret = fwnode_property_read_u32(child, "reg", &port_id);
 		if (ret) {
-			fwnode_handle_put(child);
 			dev_err(dsaf_dev->dev,
 				"get reg fail, ret=%d!\n", ret);
 			return ret;
 		}
 		if (port_id >= max_port_num) {
-			fwnode_handle_put(child);
 			dev_err(dsaf_dev->dev,
 				"reg(%u) out of range!\n", port_id);
 			return -EINVAL;
 		}
 		mac_cb = devm_kzalloc(dsaf_dev->dev, sizeof(*mac_cb),
 				      GFP_KERNEL);
-		if (!mac_cb) {
-			fwnode_handle_put(child);
+		if (!mac_cb)
 			return -ENOMEM;
-		}
+
 		mac_cb->fw_port = child;
 		mac_cb->mac_id = (u8)port_id;
 		dsaf_dev->mac_cb[port_id] = mac_cb;
-- 
2.51.0


From 1f3e7ff4f296af1f4350f457d5bd82bc825e645a Mon Sep 17 00:00:00 2001
From: Lorenzo Bianconi <lorenzo@kernel.org>
Date: Tue, 1 Oct 2024 12:10:24 +0200
Subject: [PATCH 10/16] net: airoha: read default PSE reserved pages value
 before updating

Store the default value for the number of PSE reserved pages in orig_val
at the beginning of airoha_fe_set_pse_oq_rsv routine, before updating it
with airoha_fe_set_pse_queue_rsv_pages().
Introduce airoha_fe_get_pse_all_rsv utility routine.

Introduced by commit 23020f049327 ("net: airoha: Introduce ethernet support
for EN7581 SoC")

Signed-off-by: Lorenzo Bianconi <lorenzo@kernel.org>
Reviewed-by: Simon Horman <horms@kernel.org>
Link: https://patch.msgid.link/20241001-airoha-eth-pse-fix-v2-1-9a56cdffd074@kernel.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 drivers/net/ethernet/mediatek/airoha_eth.c | 14 ++++++++++----
 1 file changed, 10 insertions(+), 4 deletions(-)

diff --git a/drivers/net/ethernet/mediatek/airoha_eth.c b/drivers/net/ethernet/mediatek/airoha_eth.c
index 930f180688e5..480540526bdb 100644
--- a/drivers/net/ethernet/mediatek/airoha_eth.c
+++ b/drivers/net/ethernet/mediatek/airoha_eth.c
@@ -1116,17 +1116,23 @@ static void airoha_fe_set_pse_queue_rsv_pages(struct airoha_eth *eth,
 		      PSE_CFG_WR_EN_MASK | PSE_CFG_OQRSV_SEL_MASK);
 }
 
+static u32 airoha_fe_get_pse_all_rsv(struct airoha_eth *eth)
+{
+	u32 val = airoha_fe_rr(eth, REG_FE_PSE_BUF_SET);
+
+	return FIELD_GET(PSE_ALLRSV_MASK, val);
+}
+
 static int airoha_fe_set_pse_oq_rsv(struct airoha_eth *eth,
 				    u32 port, u32 queue, u32 val)
 {
-	u32 orig_val, tmp, all_rsv, fq_limit;
+	u32 orig_val = airoha_fe_get_pse_queue_rsv_pages(eth, port, queue);
+	u32 tmp, all_rsv, fq_limit;
 
 	airoha_fe_set_pse_queue_rsv_pages(eth, port, queue, val);
 
 	/* modify all rsv */
-	orig_val = airoha_fe_get_pse_queue_rsv_pages(eth, port, queue);
-	tmp = airoha_fe_rr(eth, REG_FE_PSE_BUF_SET);
-	all_rsv = FIELD_GET(PSE_ALLRSV_MASK, tmp);
+	all_rsv = airoha_fe_get_pse_all_rsv(eth);
 	all_rsv += (val - orig_val);
 	airoha_fe_rmw(eth, REG_FE_PSE_BUF_SET, PSE_ALLRSV_MASK,
 		      FIELD_PREP(PSE_ALLRSV_MASK, all_rsv));
-- 
2.51.0


From 8e38e08f2c560328a873c35aff1a0dbea6a7d084 Mon Sep 17 00:00:00 2001
From: Lorenzo Bianconi <lorenzo@kernel.org>
Date: Tue, 1 Oct 2024 12:10:25 +0200
Subject: [PATCH 11/16] net: airoha: fix PSE memory configuration in
 airoha_fe_pse_ports_init()

Align PSE memory configuration to vendor SDK. In particular, increase
initial value of PSE reserved memory in airoha_fe_pse_ports_init()
routine by the value used for the second Packet Processor Engine (PPE2)
and do not overwrite the default value.

Introduced by commit 23020f049327 ("net: airoha: Introduce ethernet support
for EN7581 SoC")

Signed-off-by: Lorenzo Bianconi <lorenzo@kernel.org>
Reviewed-by: Simon Horman <horms@kernel.org>
Link: https://patch.msgid.link/20241001-airoha-eth-pse-fix-v2-2-9a56cdffd074@kernel.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 drivers/net/ethernet/mediatek/airoha_eth.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/mediatek/airoha_eth.c b/drivers/net/ethernet/mediatek/airoha_eth.c
index 480540526bdb..2e01abc70c17 100644
--- a/drivers/net/ethernet/mediatek/airoha_eth.c
+++ b/drivers/net/ethernet/mediatek/airoha_eth.c
@@ -1172,11 +1172,13 @@ static void airoha_fe_pse_ports_init(struct airoha_eth *eth)
 		[FE_PSE_PORT_GDM4] = 2,
 		[FE_PSE_PORT_CDM5] = 2,
 	};
+	u32 all_rsv;
 	int q;
 
+	all_rsv = airoha_fe_get_pse_all_rsv(eth);
 	/* hw misses PPE2 oq rsv */
-	airoha_fe_set(eth, REG_FE_PSE_BUF_SET,
-		      PSE_RSV_PAGES * pse_port_num_queues[FE_PSE_PORT_PPE2]);
+	all_rsv += PSE_RSV_PAGES * pse_port_num_queues[FE_PSE_PORT_PPE2];
+	airoha_fe_set(eth, REG_FE_PSE_BUF_SET, all_rsv);
 
 	/* CMD1 */
 	for (q = 0; q < pse_port_num_queues[FE_PSE_PORT_CDM1]; q++)
-- 
2.51.0


From 8389cdb5c192bad690caf64a0746216c27f0c9b8 Mon Sep 17 00:00:00 2001
From: Aleksander Jan Bajkowski <olek2@wp.pl>
Date: Thu, 3 Oct 2024 19:19:41 +0200
Subject: [PATCH 12/16] net: macb: Adding support for Jumbo Frames up to 10240
 Bytes in SAMA5D2

As per the SAMA5D2 device specification it supports Jumbo frames.
But the suggested flag and length of bytes it supports was not updated
in this driver config_structure.
The maximum jumbo frames the device supports:
10240 bytes as per the device spec.

While changing the MTU value greater than 1500, it threw error:
sudo ifconfig eth1 mtu 9000
SIOCSIFMTU: Invalid argument

Add this support to driver so that it works as expected and designed.

Signed-off-by: Aleksander Jan Bajkowski <olek2@wp.pl>
Reviewed-by: Simon Horman <horms@kernel.org>
Acked-by: Nicolas Ferre <nicolas.ferre@microchip.com>
Link: https://patch.msgid.link/20241003171941.8814-1-olek2@wp.pl
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 drivers/net/ethernet/cadence/macb_main.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/cadence/macb_main.c b/drivers/net/ethernet/cadence/macb_main.c
index f06babec04a0..9fda16557509 100644
--- a/drivers/net/ethernet/cadence/macb_main.c
+++ b/drivers/net/ethernet/cadence/macb_main.c
@@ -4841,10 +4841,11 @@ static const struct macb_config pc302gem_config = {
 };
 
 static const struct macb_config sama5d2_config = {
-	.caps = MACB_CAPS_USRIO_DEFAULT_IS_MII_GMII,
+	.caps = MACB_CAPS_USRIO_DEFAULT_IS_MII_GMII | MACB_CAPS_JUMBO,
 	.dma_burst_length = 16,
 	.clk_init = macb_clk_init,
 	.init = macb_init,
+	.jumbo_max_len = 10240,
 	.usrio = &macb_default_usrio,
 };
 
-- 
2.51.0


From c55ff46aeebed1704a9a6861777b799f15ce594d Mon Sep 17 00:00:00 2001
From: Riyan Dhiman <riyandhiman14@gmail.com>
Date: Tue, 1 Oct 2024 16:35:43 +0530
Subject: [PATCH 13/16] octeontx2-af: Change block parameter to const pointer
 in get_lf_str_list

Convert struct rvu_block block to const struct rvu_block *block in
get_lf_str_list() function parameter. This improves efficiency by
avoiding structure copying and reflects the function's read-only
access to block.

Signed-off-by: Riyan Dhiman <riyandhiman14@gmail.com>
Reviewed-by: Simon Horman <horms@kernel.org>
Link: https://patch.msgid.link/20241001110542.5404-2-riyandhiman14@gmail.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 .../ethernet/marvell/octeontx2/af/rvu_debugfs.c    | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/drivers/net/ethernet/marvell/octeontx2/af/rvu_debugfs.c b/drivers/net/ethernet/marvell/octeontx2/af/rvu_debugfs.c
index 87ba77e5026a..8c700ee4a82b 100644
--- a/drivers/net/ethernet/marvell/octeontx2/af/rvu_debugfs.c
+++ b/drivers/net/ethernet/marvell/octeontx2/af/rvu_debugfs.c
@@ -663,16 +663,16 @@ static ssize_t rvu_dbg_lmtst_map_table_display(struct file *filp,
 
 RVU_DEBUG_FOPS(lmtst_map_table, lmtst_map_table_display, NULL);
 
-static void get_lf_str_list(struct rvu_block block, int pcifunc,
+static void get_lf_str_list(const struct rvu_block *block, int pcifunc,
 			    char *lfs)
 {
-	int lf = 0, seq = 0, len = 0, prev_lf = block.lf.max;
+	int lf = 0, seq = 0, len = 0, prev_lf = block->lf.max;
 
-	for_each_set_bit(lf, block.lf.bmap, block.lf.max) {
-		if (lf >= block.lf.max)
+	for_each_set_bit(lf, block->lf.bmap, block->lf.max) {
+		if (lf >= block->lf.max)
 			break;
 
-		if (block.fn_map[lf] != pcifunc)
+		if (block->fn_map[lf] != pcifunc)
 			continue;
 
 		if (lf == prev_lf + 1) {
@@ -719,7 +719,7 @@ static int get_max_column_width(struct rvu *rvu)
 				if (!strlen(block.name))
 					continue;
 
-				get_lf_str_list(block, pcifunc, buf);
+				get_lf_str_list(&block, pcifunc, buf);
 				if (lf_str_size <= strlen(buf))
 					lf_str_size = strlen(buf) + 1;
 			}
@@ -803,7 +803,7 @@ static ssize_t rvu_dbg_rsrc_attach_status(struct file *filp,
 					continue;
 				len = 0;
 				lfs[len] = '\0';
-				get_lf_str_list(block, pcifunc, lfs);
+				get_lf_str_list(&block, pcifunc, lfs);
 				if (strlen(lfs))
 					flag = 1;
 
-- 
2.51.0


From 5acd957a986c167d357e617a887630203be29ea4 Mon Sep 17 00:00:00 2001
From: Cosmin Ratiu <cratiu@nvidia.com>
Date: Tue, 1 Oct 2024 13:37:04 +0300
Subject: [PATCH 14/16] net/mlx5: hw counters: Make fc_stats & fc_pool private

The mlx5_fc_stats and mlx5_fc_pool structs are only used from
fs_counters.c. As such, make them private there.

mlx5_fc_pool is not used or referenced at all outside fs_counters.

mlx5_fc_stats is referenced from mlx5_core_dev, so instead of having it
as a direct member (which requires exporting it from fs_counters), store
a pointer to it, allocate it on init and clear it on destroy.
One caveat is that a simple container_of to get from a 'work' struct to
the outermost mlx5_core_dev struct directly no longer works, so an extra
pointer had to be added to mlx5_fc_stats back to the parent dev.

Signed-off-by: Cosmin Ratiu <cratiu@nvidia.com>
Signed-off-by: Tariq Toukan <tariqt@nvidia.com>
Link: https://patch.msgid.link/20241001103709.58127-2-tariqt@nvidia.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 .../ethernet/mellanox/mlx5/core/fs_counters.c | 79 ++++++++++++++-----
 include/linux/mlx5/driver.h                   | 33 +-------
 2 files changed, 60 insertions(+), 52 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_counters.c b/drivers/net/ethernet/mellanox/mlx5/core/fs_counters.c
index 0c26d707eed2..7d6174d0f260 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/fs_counters.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_counters.c
@@ -69,6 +69,36 @@ struct mlx5_fc {
 	struct mlx5_fc_cache cache ____cacheline_aligned_in_smp;
 };
 
+struct mlx5_fc_pool {
+	struct mlx5_core_dev *dev;
+	struct mutex pool_lock; /* protects pool lists */
+	struct list_head fully_used;
+	struct list_head partially_used;
+	struct list_head unused;
+	int available_fcs;
+	int used_fcs;
+	int threshold;
+};
+
+struct mlx5_fc_stats {
+	spinlock_t counters_idr_lock; /* protects counters_idr */
+	struct idr counters_idr;
+	struct list_head counters;
+	struct llist_head addlist;
+	struct llist_head dellist;
+
+	struct workqueue_struct *wq;
+	struct delayed_work work;
+	unsigned long next_query;
+	unsigned long sampling_interval; /* jiffies */
+	u32 *bulk_query_out;
+	int bulk_query_len;
+	size_t num_counters;
+	bool bulk_query_alloc_failed;
+	unsigned long next_bulk_query_alloc;
+	struct mlx5_fc_pool fc_pool;
+};
+
 static void mlx5_fc_pool_init(struct mlx5_fc_pool *fc_pool, struct mlx5_core_dev *dev);
 static void mlx5_fc_pool_cleanup(struct mlx5_fc_pool *fc_pool);
 static struct mlx5_fc *mlx5_fc_pool_acquire_counter(struct mlx5_fc_pool *fc_pool);
@@ -109,7 +139,7 @@ static void mlx5_fc_pool_release_counter(struct mlx5_fc_pool *fc_pool, struct ml
 static struct list_head *mlx5_fc_counters_lookup_next(struct mlx5_core_dev *dev,
 						      u32 id)
 {
-	struct mlx5_fc_stats *fc_stats = &dev->priv.fc_stats;
+	struct mlx5_fc_stats *fc_stats = dev->priv.fc_stats;
 	unsigned long next_id = (unsigned long)id + 1;
 	struct mlx5_fc *counter;
 	unsigned long tmp;
@@ -137,7 +167,7 @@ static void mlx5_fc_stats_insert(struct mlx5_core_dev *dev,
 static void mlx5_fc_stats_remove(struct mlx5_core_dev *dev,
 				 struct mlx5_fc *counter)
 {
-	struct mlx5_fc_stats *fc_stats = &dev->priv.fc_stats;
+	struct mlx5_fc_stats *fc_stats = dev->priv.fc_stats;
 
 	list_del(&counter->list);
 
@@ -178,7 +208,7 @@ static void mlx5_fc_stats_query_counter_range(struct mlx5_core_dev *dev,
 					      struct mlx5_fc *first,
 					      u32 last_id)
 {
-	struct mlx5_fc_stats *fc_stats = &dev->priv.fc_stats;
+	struct mlx5_fc_stats *fc_stats = dev->priv.fc_stats;
 	bool query_more_counters = (first->id <= last_id);
 	int cur_bulk_len = fc_stats->bulk_query_len;
 	u32 *data = fc_stats->bulk_query_out;
@@ -225,7 +255,7 @@ static void mlx5_fc_free(struct mlx5_core_dev *dev, struct mlx5_fc *counter)
 
 static void mlx5_fc_release(struct mlx5_core_dev *dev, struct mlx5_fc *counter)
 {
-	struct mlx5_fc_stats *fc_stats = &dev->priv.fc_stats;
+	struct mlx5_fc_stats *fc_stats = dev->priv.fc_stats;
 
 	if (counter->bulk)
 		mlx5_fc_pool_release_counter(&fc_stats->fc_pool, counter);
@@ -235,7 +265,7 @@ static void mlx5_fc_release(struct mlx5_core_dev *dev, struct mlx5_fc *counter)
 
 static void mlx5_fc_stats_bulk_query_size_increase(struct mlx5_core_dev *dev)
 {
-	struct mlx5_fc_stats *fc_stats = &dev->priv.fc_stats;
+	struct mlx5_fc_stats *fc_stats = dev->priv.fc_stats;
 	int max_bulk_len = get_max_bulk_query_len(dev);
 	unsigned long now = jiffies;
 	u32 *bulk_query_out_tmp;
@@ -270,9 +300,9 @@ static void mlx5_fc_stats_bulk_query_size_increase(struct mlx5_core_dev *dev)
 
 static void mlx5_fc_stats_work(struct work_struct *work)
 {
-	struct mlx5_core_dev *dev = container_of(work, struct mlx5_core_dev,
-						 priv.fc_stats.work.work);
-	struct mlx5_fc_stats *fc_stats = &dev->priv.fc_stats;
+	struct mlx5_fc_stats *fc_stats = container_of(work, struct mlx5_fc_stats,
+						      work.work);
+	struct mlx5_core_dev *dev = fc_stats->fc_pool.dev;
 	/* Take dellist first to ensure that counters cannot be deleted before
 	 * they are inserted.
 	 */
@@ -334,7 +364,7 @@ static struct mlx5_fc *mlx5_fc_single_alloc(struct mlx5_core_dev *dev)
 
 static struct mlx5_fc *mlx5_fc_acquire(struct mlx5_core_dev *dev, bool aging)
 {
-	struct mlx5_fc_stats *fc_stats = &dev->priv.fc_stats;
+	struct mlx5_fc_stats *fc_stats = dev->priv.fc_stats;
 	struct mlx5_fc *counter;
 
 	if (aging && MLX5_CAP_GEN(dev, flow_counter_bulk_alloc) != 0) {
@@ -349,7 +379,7 @@ static struct mlx5_fc *mlx5_fc_acquire(struct mlx5_core_dev *dev, bool aging)
 struct mlx5_fc *mlx5_fc_create_ex(struct mlx5_core_dev *dev, bool aging)
 {
 	struct mlx5_fc *counter = mlx5_fc_acquire(dev, aging);
-	struct mlx5_fc_stats *fc_stats = &dev->priv.fc_stats;
+	struct mlx5_fc_stats *fc_stats = dev->priv.fc_stats;
 	int err;
 
 	if (IS_ERR(counter))
@@ -389,7 +419,7 @@ err_out_alloc:
 struct mlx5_fc *mlx5_fc_create(struct mlx5_core_dev *dev, bool aging)
 {
 	struct mlx5_fc *counter = mlx5_fc_create_ex(dev, aging);
-	struct mlx5_fc_stats *fc_stats = &dev->priv.fc_stats;
+	struct mlx5_fc_stats *fc_stats = dev->priv.fc_stats;
 
 	if (aging)
 		mod_delayed_work(fc_stats->wq, &fc_stats->work, 0);
@@ -405,7 +435,7 @@ EXPORT_SYMBOL(mlx5_fc_id);
 
 void mlx5_fc_destroy(struct mlx5_core_dev *dev, struct mlx5_fc *counter)
 {
-	struct mlx5_fc_stats *fc_stats = &dev->priv.fc_stats;
+	struct mlx5_fc_stats *fc_stats = dev->priv.fc_stats;
 
 	if (!counter)
 		return;
@@ -422,10 +452,14 @@ EXPORT_SYMBOL(mlx5_fc_destroy);
 
 int mlx5_init_fc_stats(struct mlx5_core_dev *dev)
 {
-	struct mlx5_fc_stats *fc_stats = &dev->priv.fc_stats;
+	struct mlx5_fc_stats *fc_stats;
 	int init_bulk_len;
 	int init_out_len;
 
+	fc_stats = kzalloc(sizeof(*fc_stats), GFP_KERNEL);
+	if (!fc_stats)
+		return -ENOMEM;
+
 	spin_lock_init(&fc_stats->counters_idr_lock);
 	idr_init(&fc_stats->counters_idr);
 	INIT_LIST_HEAD(&fc_stats->counters);
@@ -436,7 +470,7 @@ int mlx5_init_fc_stats(struct mlx5_core_dev *dev)
 	init_out_len = mlx5_cmd_fc_get_bulk_query_out_len(init_bulk_len);
 	fc_stats->bulk_query_out = kzalloc(init_out_len, GFP_KERNEL);
 	if (!fc_stats->bulk_query_out)
-		return -ENOMEM;
+		goto err_bulk;
 	fc_stats->bulk_query_len = init_bulk_len;
 
 	fc_stats->wq = create_singlethread_workqueue("mlx5_fc");
@@ -447,23 +481,27 @@ int mlx5_init_fc_stats(struct mlx5_core_dev *dev)
 	INIT_DELAYED_WORK(&fc_stats->work, mlx5_fc_stats_work);
 
 	mlx5_fc_pool_init(&fc_stats->fc_pool, dev);
+	dev->priv.fc_stats = fc_stats;
+
 	return 0;
 
 err_wq_create:
 	kfree(fc_stats->bulk_query_out);
+err_bulk:
+	kfree(fc_stats);
 	return -ENOMEM;
 }
 
 void mlx5_cleanup_fc_stats(struct mlx5_core_dev *dev)
 {
-	struct mlx5_fc_stats *fc_stats = &dev->priv.fc_stats;
+	struct mlx5_fc_stats *fc_stats = dev->priv.fc_stats;
 	struct llist_node *tmplist;
 	struct mlx5_fc *counter;
 	struct mlx5_fc *tmp;
 
-	cancel_delayed_work_sync(&dev->priv.fc_stats.work);
-	destroy_workqueue(dev->priv.fc_stats.wq);
-	dev->priv.fc_stats.wq = NULL;
+	cancel_delayed_work_sync(&fc_stats->work);
+	destroy_workqueue(fc_stats->wq);
+	fc_stats->wq = NULL;
 
 	tmplist = llist_del_all(&fc_stats->addlist);
 	llist_for_each_entry_safe(counter, tmp, tmplist, addlist)
@@ -475,6 +513,7 @@ void mlx5_cleanup_fc_stats(struct mlx5_core_dev *dev)
 	mlx5_fc_pool_cleanup(&fc_stats->fc_pool);
 	idr_destroy(&fc_stats->counters_idr);
 	kfree(fc_stats->bulk_query_out);
+	kfree(fc_stats);
 }
 
 int mlx5_fc_query(struct mlx5_core_dev *dev, struct mlx5_fc *counter,
@@ -518,7 +557,7 @@ void mlx5_fc_queue_stats_work(struct mlx5_core_dev *dev,
 			      struct delayed_work *dwork,
 			      unsigned long delay)
 {
-	struct mlx5_fc_stats *fc_stats = &dev->priv.fc_stats;
+	struct mlx5_fc_stats *fc_stats = dev->priv.fc_stats;
 
 	queue_delayed_work(fc_stats->wq, dwork, delay);
 }
@@ -526,7 +565,7 @@ void mlx5_fc_queue_stats_work(struct mlx5_core_dev *dev,
 void mlx5_fc_update_sampling_interval(struct mlx5_core_dev *dev,
 				      unsigned long interval)
 {
-	struct mlx5_fc_stats *fc_stats = &dev->priv.fc_stats;
+	struct mlx5_fc_stats *fc_stats = dev->priv.fc_stats;
 
 	fc_stats->sampling_interval = min_t(unsigned long, interval,
 					    fc_stats->sampling_interval);
diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h
index e23c692a34c7..fc7e6153b73d 100644
--- a/include/linux/mlx5/driver.h
+++ b/include/linux/mlx5/driver.h
@@ -45,7 +45,6 @@
 #include <linux/workqueue.h>
 #include <linux/mempool.h>
 #include <linux/interrupt.h>
-#include <linux/idr.h>
 #include <linux/notifier.h>
 #include <linux/refcount.h>
 #include <linux/auxiliary_bus.h>
@@ -474,36 +473,6 @@ struct mlx5_core_sriov {
 	u16			max_ec_vfs;
 };
 
-struct mlx5_fc_pool {
-	struct mlx5_core_dev *dev;
-	struct mutex pool_lock; /* protects pool lists */
-	struct list_head fully_used;
-	struct list_head partially_used;
-	struct list_head unused;
-	int available_fcs;
-	int used_fcs;
-	int threshold;
-};
-
-struct mlx5_fc_stats {
-	spinlock_t counters_idr_lock; /* protects counters_idr */
-	struct idr counters_idr;
-	struct list_head counters;
-	struct llist_head addlist;
-	struct llist_head dellist;
-
-	struct workqueue_struct *wq;
-	struct delayed_work work;
-	unsigned long next_query;
-	unsigned long sampling_interval; /* jiffies */
-	u32 *bulk_query_out;
-	int bulk_query_len;
-	size_t num_counters;
-	bool bulk_query_alloc_failed;
-	unsigned long next_bulk_query_alloc;
-	struct mlx5_fc_pool fc_pool;
-};
-
 struct mlx5_events;
 struct mlx5_mpfs;
 struct mlx5_eswitch;
@@ -630,7 +599,7 @@ struct mlx5_priv {
 	struct mlx5_devcom_comp_dev *hca_devcom_comp;
 	struct mlx5_fw_reset	*fw_reset;
 	struct mlx5_core_roce	roce;
-	struct mlx5_fc_stats		fc_stats;
+	struct mlx5_fc_stats		*fc_stats;
 	struct mlx5_rl_table            rl_table;
 	struct mlx5_ft_pool		*ft_pool;
 
-- 
2.51.0


From 10cd92df833c3f6c35dc5e923651146d41332538 Mon Sep 17 00:00:00 2001
From: Cosmin Ratiu <cratiu@nvidia.com>
Date: Tue, 1 Oct 2024 13:37:05 +0300
Subject: [PATCH 15/16] net/mlx5: hw counters: Use kvmalloc for bulk query
 buffer

The bulk query buffer starts out small (see [1]) and as soon as the
number of counters goes past the initial threshold grows to max
size (32K entries, 512KB) with a retry scheme.

This commit switches to using kvmalloc for the buffer, which has a near
zero likelihood of failing, and thus the explicit retry scheme becomes
superfluous and is taken out. On the low chance the allocation fails, it
will still be retried every sampling_interval, when the wq task runs.

[1] commit b247f32aecad ("net/mlx5: Dynamically resize flow counters
query buffer")

Signed-off-by: Cosmin Ratiu <cratiu@nvidia.com>
Signed-off-by: Tariq Toukan <tariqt@nvidia.com>
Link: https://patch.msgid.link/20241001103709.58127-3-tariqt@nvidia.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 .../ethernet/mellanox/mlx5/core/fs_counters.c | 59 +++++++------------
 1 file changed, 22 insertions(+), 37 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_counters.c b/drivers/net/ethernet/mellanox/mlx5/core/fs_counters.c
index 7d6174d0f260..9892895da9ee 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/fs_counters.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_counters.c
@@ -38,7 +38,6 @@
 #include "fs_cmd.h"
 
 #define MLX5_FC_STATS_PERIOD msecs_to_jiffies(1000)
-#define MLX5_FC_BULK_QUERY_ALLOC_PERIOD msecs_to_jiffies(180 * 1000)
 /* Max number of counters to query in bulk read is 32K */
 #define MLX5_SW_MAX_COUNTERS_BULK BIT(15)
 #define MLX5_INIT_COUNTERS_BULK 8
@@ -263,39 +262,28 @@ static void mlx5_fc_release(struct mlx5_core_dev *dev, struct mlx5_fc *counter)
 		mlx5_fc_free(dev, counter);
 }
 
-static void mlx5_fc_stats_bulk_query_size_increase(struct mlx5_core_dev *dev)
+static void mlx5_fc_stats_bulk_query_buf_realloc(struct mlx5_core_dev *dev,
+						 int bulk_query_len)
 {
 	struct mlx5_fc_stats *fc_stats = dev->priv.fc_stats;
-	int max_bulk_len = get_max_bulk_query_len(dev);
-	unsigned long now = jiffies;
 	u32 *bulk_query_out_tmp;
-	int max_out_len;
-
-	if (fc_stats->bulk_query_alloc_failed &&
-	    time_before(now, fc_stats->next_bulk_query_alloc))
-		return;
+	int out_len;
 
-	max_out_len = mlx5_cmd_fc_get_bulk_query_out_len(max_bulk_len);
-	bulk_query_out_tmp = kzalloc(max_out_len, GFP_KERNEL);
+	out_len = mlx5_cmd_fc_get_bulk_query_out_len(bulk_query_len);
+	bulk_query_out_tmp = kvzalloc(out_len, GFP_KERNEL);
 	if (!bulk_query_out_tmp) {
 		mlx5_core_warn_once(dev,
-				    "Can't increase flow counters bulk query buffer size, insufficient memory, bulk_size(%d)\n",
-				    max_bulk_len);
-		fc_stats->bulk_query_alloc_failed = true;
-		fc_stats->next_bulk_query_alloc =
-			now + MLX5_FC_BULK_QUERY_ALLOC_PERIOD;
+				    "Can't increase flow counters bulk query buffer size, alloc failed, bulk_query_len(%d)\n",
+				    bulk_query_len);
 		return;
 	}
 
-	kfree(fc_stats->bulk_query_out);
+	kvfree(fc_stats->bulk_query_out);
 	fc_stats->bulk_query_out = bulk_query_out_tmp;
-	fc_stats->bulk_query_len = max_bulk_len;
-	if (fc_stats->bulk_query_alloc_failed) {
-		mlx5_core_info(dev,
-			       "Flow counters bulk query buffer size increased, bulk_size(%d)\n",
-			       max_bulk_len);
-		fc_stats->bulk_query_alloc_failed = false;
-	}
+	fc_stats->bulk_query_len = bulk_query_len;
+	mlx5_core_info(dev,
+		       "Flow counters bulk query buffer size increased, bulk_query_len(%d)\n",
+		       bulk_query_len);
 }
 
 static void mlx5_fc_stats_work(struct work_struct *work)
@@ -327,13 +315,14 @@ static void mlx5_fc_stats_work(struct work_struct *work)
 		fc_stats->num_counters--;
 	}
 
-	if (fc_stats->bulk_query_len < get_max_bulk_query_len(dev) &&
-	    fc_stats->num_counters > get_init_bulk_query_len(dev))
-		mlx5_fc_stats_bulk_query_size_increase(dev);
-
 	if (time_before(now, fc_stats->next_query) ||
 	    list_empty(&fc_stats->counters))
 		return;
+
+	if (fc_stats->bulk_query_len < get_max_bulk_query_len(dev) &&
+	    fc_stats->num_counters > get_init_bulk_query_len(dev))
+		mlx5_fc_stats_bulk_query_buf_realloc(dev, get_max_bulk_query_len(dev));
+
 	last = list_last_entry(&fc_stats->counters, struct mlx5_fc, list);
 
 	counter = list_first_entry(&fc_stats->counters, struct mlx5_fc,
@@ -453,12 +442,11 @@ EXPORT_SYMBOL(mlx5_fc_destroy);
 int mlx5_init_fc_stats(struct mlx5_core_dev *dev)
 {
 	struct mlx5_fc_stats *fc_stats;
-	int init_bulk_len;
-	int init_out_len;
 
 	fc_stats = kzalloc(sizeof(*fc_stats), GFP_KERNEL);
 	if (!fc_stats)
 		return -ENOMEM;
+	dev->priv.fc_stats = fc_stats;
 
 	spin_lock_init(&fc_stats->counters_idr_lock);
 	idr_init(&fc_stats->counters_idr);
@@ -466,12 +454,10 @@ int mlx5_init_fc_stats(struct mlx5_core_dev *dev)
 	init_llist_head(&fc_stats->addlist);
 	init_llist_head(&fc_stats->dellist);
 
-	init_bulk_len = get_init_bulk_query_len(dev);
-	init_out_len = mlx5_cmd_fc_get_bulk_query_out_len(init_bulk_len);
-	fc_stats->bulk_query_out = kzalloc(init_out_len, GFP_KERNEL);
+	/* Allocate initial (small) bulk query buffer. */
+	mlx5_fc_stats_bulk_query_buf_realloc(dev, get_init_bulk_query_len(dev));
 	if (!fc_stats->bulk_query_out)
 		goto err_bulk;
-	fc_stats->bulk_query_len = init_bulk_len;
 
 	fc_stats->wq = create_singlethread_workqueue("mlx5_fc");
 	if (!fc_stats->wq)
@@ -481,12 +467,11 @@ int mlx5_init_fc_stats(struct mlx5_core_dev *dev)
 	INIT_DELAYED_WORK(&fc_stats->work, mlx5_fc_stats_work);
 
 	mlx5_fc_pool_init(&fc_stats->fc_pool, dev);
-	dev->priv.fc_stats = fc_stats;
 
 	return 0;
 
 err_wq_create:
-	kfree(fc_stats->bulk_query_out);
+	kvfree(fc_stats->bulk_query_out);
 err_bulk:
 	kfree(fc_stats);
 	return -ENOMEM;
@@ -512,7 +497,7 @@ void mlx5_cleanup_fc_stats(struct mlx5_core_dev *dev)
 
 	mlx5_fc_pool_cleanup(&fc_stats->fc_pool);
 	idr_destroy(&fc_stats->counters_idr);
-	kfree(fc_stats->bulk_query_out);
+	kvfree(fc_stats->bulk_query_out);
 	kfree(fc_stats);
 }
 
-- 
2.51.0


From 918af0219a4d6a89cf02839005ede24e91f13bf6 Mon Sep 17 00:00:00 2001
From: Cosmin Ratiu <cratiu@nvidia.com>
Date: Tue, 1 Oct 2024 13:37:06 +0300
Subject: [PATCH 16/16] net/mlx5: hw counters: Replace IDR+lists with xarray

Previously, managing counters was a complicated affair involving an IDR,
a sorted double linked list, two single linked lists and a complex dance
between a non-periodic wq task and users adding/deleting counters.

Adding was done by inserting new counters into the IDR and into a single
linked list, leaving the wq to process the list and actually add the
counters into the double linked list, maintained sorted with the IDR.

Deleting involved adding the counter into another single linked list,
leaving the wq to actually unlink the counter from the other structures
and release it.

Dumping the counters is done with the bulk query API, which relies on
the counter list being sorted and unmutable during querying to
efficiently retrieve cached counter values.

Finally, the IDR data struct is deprecated.

This commit replaces all of that with an xarray.

Adding is now done directly, by using xa_lock.
Deleting is also done directly, under the xa_lock.

Querying is done from a periodic task running every sampling_interval
(default 1s) and uses the bulk query API for efficiency.
It works by iterating over the xarray:
- when a new bulk needs to be started, the bulk information is computed
  under the xa_lock.
- the xa iteration state is saved and the xa_lock dropped.
- the HW is queried for bulk counter values.
- the xa_lock is reacquired.
- counter caches with ids covered by the bulk response are updated.

Querying always requests the max bulk length, for simplicity.

Counters could be added/deleted while the HW is queried. This is safe,
as the HW API simply returns unknown values for counters not in HW, but
those values won't be accessed. Only counters present in xarray before
bulk query will actually read queried cache values.

This cuts down the size of mlx5_fc by 4 pointers (88->56 bytes), which
amounts to ~3MB / 100K counters.
But more importantly, this solves the wq spinlock congestion issue seen
happening on high-rate counter insertion+deletion.

Signed-off-by: Cosmin Ratiu <cratiu@nvidia.com>
Signed-off-by: Tariq Toukan <tariqt@nvidia.com>
Link: https://patch.msgid.link/20241001103709.58127-4-tariqt@nvidia.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 .../ethernet/mellanox/mlx5/core/fs_counters.c | 276 ++++++------------
 1 file changed, 82 insertions(+), 194 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_counters.c b/drivers/net/ethernet/mellanox/mlx5/core/fs_counters.c
index 9892895da9ee..05d9351ff577 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/fs_counters.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_counters.c
@@ -32,7 +32,6 @@
 
 #include <linux/mlx5/driver.h>
 #include <linux/mlx5/fs.h>
-#include <linux/rbtree.h>
 #include "mlx5_core.h"
 #include "fs_core.h"
 #include "fs_cmd.h"
@@ -51,21 +50,13 @@ struct mlx5_fc_cache {
 };
 
 struct mlx5_fc {
-	struct list_head list;
-	struct llist_node addlist;
-	struct llist_node dellist;
-
-	/* last{packets,bytes} members are used when calculating the delta since
-	 * last reading
-	 */
-	u64 lastpackets;
-	u64 lastbytes;
-
-	struct mlx5_fc_bulk *bulk;
 	u32 id;
 	bool aging;
-
+	struct mlx5_fc_bulk *bulk;
 	struct mlx5_fc_cache cache ____cacheline_aligned_in_smp;
+	/* last{packets,bytes} are used for calculating deltas since last reading. */
+	u64 lastpackets;
+	u64 lastbytes;
 };
 
 struct mlx5_fc_pool {
@@ -80,19 +71,14 @@ struct mlx5_fc_pool {
 };
 
 struct mlx5_fc_stats {
-	spinlock_t counters_idr_lock; /* protects counters_idr */
-	struct idr counters_idr;
-	struct list_head counters;
-	struct llist_head addlist;
-	struct llist_head dellist;
+	struct xarray counters;
 
 	struct workqueue_struct *wq;
 	struct delayed_work work;
-	unsigned long next_query;
 	unsigned long sampling_interval; /* jiffies */
 	u32 *bulk_query_out;
 	int bulk_query_len;
-	size_t num_counters;
+	size_t num_counters;  /* Also protected by xarray->xa_lock. */
 	bool bulk_query_alloc_failed;
 	unsigned long next_bulk_query_alloc;
 	struct mlx5_fc_pool fc_pool;
@@ -103,78 +89,6 @@ static void mlx5_fc_pool_cleanup(struct mlx5_fc_pool *fc_pool);
 static struct mlx5_fc *mlx5_fc_pool_acquire_counter(struct mlx5_fc_pool *fc_pool);
 static void mlx5_fc_pool_release_counter(struct mlx5_fc_pool *fc_pool, struct mlx5_fc *fc);
 
-/* locking scheme:
- *
- * It is the responsibility of the user to prevent concurrent calls or bad
- * ordering to mlx5_fc_create(), mlx5_fc_destroy() and accessing a reference
- * to struct mlx5_fc.
- * e.g en_tc.c is protected by RTNL lock of its caller, and will never call a
- * dump (access to struct mlx5_fc) after a counter is destroyed.
- *
- * access to counter list:
- * - create (user context)
- *   - mlx5_fc_create() only adds to an addlist to be used by
- *     mlx5_fc_stats_work(). addlist is a lockless single linked list
- *     that doesn't require any additional synchronization when adding single
- *     node.
- *   - spawn thread to do the actual destroy
- *
- * - destroy (user context)
- *   - add a counter to lockless dellist
- *   - spawn thread to do the actual del
- *
- * - dump (user context)
- *   user should not call dump after destroy
- *
- * - query (single thread workqueue context)
- *   destroy/dump - no conflict (see destroy)
- *   query/dump - packets and bytes might be inconsistent (since update is not
- *                atomic)
- *   query/create - no conflict (see create)
- *   since every create/destroy spawn the work, only after necessary time has
- *   elapsed, the thread will actually query the hardware.
- */
-
-static struct list_head *mlx5_fc_counters_lookup_next(struct mlx5_core_dev *dev,
-						      u32 id)
-{
-	struct mlx5_fc_stats *fc_stats = dev->priv.fc_stats;
-	unsigned long next_id = (unsigned long)id + 1;
-	struct mlx5_fc *counter;
-	unsigned long tmp;
-
-	rcu_read_lock();
-	/* skip counters that are in idr, but not yet in counters list */
-	idr_for_each_entry_continue_ul(&fc_stats->counters_idr,
-				       counter, tmp, next_id) {
-		if (!list_empty(&counter->list))
-			break;
-	}
-	rcu_read_unlock();
-
-	return counter ? &counter->list : &fc_stats->counters;
-}
-
-static void mlx5_fc_stats_insert(struct mlx5_core_dev *dev,
-				 struct mlx5_fc *counter)
-{
-	struct list_head *next = mlx5_fc_counters_lookup_next(dev, counter->id);
-
-	list_add_tail(&counter->list, next);
-}
-
-static void mlx5_fc_stats_remove(struct mlx5_core_dev *dev,
-				 struct mlx5_fc *counter)
-{
-	struct mlx5_fc_stats *fc_stats = dev->priv.fc_stats;
-
-	list_del(&counter->list);
-
-	spin_lock(&fc_stats->counters_idr_lock);
-	WARN_ON(!idr_remove(&fc_stats->counters_idr, counter->id));
-	spin_unlock(&fc_stats->counters_idr_lock);
-}
-
 static int get_init_bulk_query_len(struct mlx5_core_dev *dev)
 {
 	return min_t(int, MLX5_INIT_COUNTERS_BULK,
@@ -203,47 +117,64 @@ static void update_counter_cache(int index, u32 *bulk_raw_data,
 	cache->lastuse = jiffies;
 }
 
-static void mlx5_fc_stats_query_counter_range(struct mlx5_core_dev *dev,
-					      struct mlx5_fc *first,
-					      u32 last_id)
+/* Synchronization notes
+ *
+ * Access to counter array:
+ * - create - mlx5_fc_create() (user context)
+ *   - inserts the counter into the xarray.
+ *
+ * - destroy - mlx5_fc_destroy() (user context)
+ *   - erases the counter from the xarray and releases it.
+ *
+ * - query mlx5_fc_query(), mlx5_fc_query_cached{,_raw}() (user context)
+ *   - user should not access a counter after destroy.
+ *
+ * - bulk query (single thread workqueue context)
+ *   - create: query relies on 'lastuse' to avoid updating counters added
+ *             around the same time as the current bulk cmd.
+ *   - destroy: destroyed counters will not be accessed, even if they are
+ *              destroyed during a bulk query command.
+ */
+static void mlx5_fc_stats_query_all_counters(struct mlx5_core_dev *dev)
 {
 	struct mlx5_fc_stats *fc_stats = dev->priv.fc_stats;
-	bool query_more_counters = (first->id <= last_id);
-	int cur_bulk_len = fc_stats->bulk_query_len;
+	u32 bulk_len = fc_stats->bulk_query_len;
+	XA_STATE(xas, &fc_stats->counters, 0);
 	u32 *data = fc_stats->bulk_query_out;
-	struct mlx5_fc *counter = first;
+	struct mlx5_fc *counter;
+	u32 last_bulk_id = 0;
+	u64 bulk_query_time;
 	u32 bulk_base_id;
-	int bulk_len;
 	int err;
 
-	while (query_more_counters) {
-		/* first id must be aligned to 4 when using bulk query */
-		bulk_base_id = counter->id & ~0x3;
-
-		/* number of counters to query inc. the last counter */
-		bulk_len = min_t(int, cur_bulk_len,
-				 ALIGN(last_id - bulk_base_id + 1, 4));
-
-		err = mlx5_cmd_fc_bulk_query(dev, bulk_base_id, bulk_len,
-					     data);
-		if (err) {
-			mlx5_core_err(dev, "Error doing bulk query: %d\n", err);
-			return;
-		}
-		query_more_counters = false;
-
-		list_for_each_entry_from(counter, &fc_stats->counters, list) {
-			int counter_index = counter->id - bulk_base_id;
-			struct mlx5_fc_cache *cache = &counter->cache;
-
-			if (counter->id >= bulk_base_id + bulk_len) {
-				query_more_counters = true;
-				break;
+	xas_lock(&xas);
+	xas_for_each(&xas, counter, U32_MAX) {
+		if (xas_retry(&xas, counter))
+			continue;
+		if (unlikely(counter->id >= last_bulk_id)) {
+			/* Start new bulk query. */
+			/* First id must be aligned to 4 when using bulk query. */
+			bulk_base_id = counter->id & ~0x3;
+			last_bulk_id = bulk_base_id + bulk_len;
+			/* The lock is released while querying the hw and reacquired after. */
+			xas_unlock(&xas);
+			/* The same id needs to be processed again in the next loop iteration. */
+			xas_reset(&xas);
+			bulk_query_time = jiffies;
+			err = mlx5_cmd_fc_bulk_query(dev, bulk_base_id, bulk_len, data);
+			if (err) {
+				mlx5_core_err(dev, "Error doing bulk query: %d\n", err);
+				return;
 			}
-
-			update_counter_cache(counter_index, data, cache);
+			xas_lock(&xas);
+			continue;
 		}
+		/* Do not update counters added after bulk query was started. */
+		if (time_after64(bulk_query_time, counter->cache.lastuse))
+			update_counter_cache(counter->id - bulk_base_id, data,
+					     &counter->cache);
 	}
+	xas_unlock(&xas);
 }
 
 static void mlx5_fc_free(struct mlx5_core_dev *dev, struct mlx5_fc *counter)
@@ -291,46 +222,19 @@ static void mlx5_fc_stats_work(struct work_struct *work)
 	struct mlx5_fc_stats *fc_stats = container_of(work, struct mlx5_fc_stats,
 						      work.work);
 	struct mlx5_core_dev *dev = fc_stats->fc_pool.dev;
-	/* Take dellist first to ensure that counters cannot be deleted before
-	 * they are inserted.
-	 */
-	struct llist_node *dellist = llist_del_all(&fc_stats->dellist);
-	struct llist_node *addlist = llist_del_all(&fc_stats->addlist);
-	struct mlx5_fc *counter = NULL, *last = NULL, *tmp;
-	unsigned long now = jiffies;
-
-	if (addlist || !list_empty(&fc_stats->counters))
-		queue_delayed_work(fc_stats->wq, &fc_stats->work,
-				   fc_stats->sampling_interval);
-
-	llist_for_each_entry(counter, addlist, addlist) {
-		mlx5_fc_stats_insert(dev, counter);
-		fc_stats->num_counters++;
-	}
+	int num_counters;
 
-	llist_for_each_entry_safe(counter, tmp, dellist, dellist) {
-		mlx5_fc_stats_remove(dev, counter);
-
-		mlx5_fc_release(dev, counter);
-		fc_stats->num_counters--;
-	}
-
-	if (time_before(now, fc_stats->next_query) ||
-	    list_empty(&fc_stats->counters))
-		return;
+	queue_delayed_work(fc_stats->wq, &fc_stats->work, fc_stats->sampling_interval);
 
+	/* num_counters is only needed for determining whether to increase the buffer. */
+	xa_lock(&fc_stats->counters);
+	num_counters = fc_stats->num_counters;
+	xa_unlock(&fc_stats->counters);
 	if (fc_stats->bulk_query_len < get_max_bulk_query_len(dev) &&
-	    fc_stats->num_counters > get_init_bulk_query_len(dev))
+	    num_counters > get_init_bulk_query_len(dev))
 		mlx5_fc_stats_bulk_query_buf_realloc(dev, get_max_bulk_query_len(dev));
 
-	last = list_last_entry(&fc_stats->counters, struct mlx5_fc, list);
-
-	counter = list_first_entry(&fc_stats->counters, struct mlx5_fc,
-				   list);
-	if (counter)
-		mlx5_fc_stats_query_counter_range(dev, counter, last->id);
-
-	fc_stats->next_query = now + fc_stats->sampling_interval;
+	mlx5_fc_stats_query_all_counters(dev);
 }
 
 static struct mlx5_fc *mlx5_fc_single_alloc(struct mlx5_core_dev *dev)
@@ -374,7 +278,6 @@ struct mlx5_fc *mlx5_fc_create_ex(struct mlx5_core_dev *dev, bool aging)
 	if (IS_ERR(counter))
 		return counter;
 
-	INIT_LIST_HEAD(&counter->list);
 	counter->aging = aging;
 
 	if (aging) {
@@ -384,18 +287,15 @@ struct mlx5_fc *mlx5_fc_create_ex(struct mlx5_core_dev *dev, bool aging)
 		counter->lastbytes = counter->cache.bytes;
 		counter->lastpackets = counter->cache.packets;
 
-		idr_preload(GFP_KERNEL);
-		spin_lock(&fc_stats->counters_idr_lock);
+		xa_lock(&fc_stats->counters);
 
-		err = idr_alloc_u32(&fc_stats->counters_idr, counter, &id, id,
-				    GFP_NOWAIT);
-
-		spin_unlock(&fc_stats->counters_idr_lock);
-		idr_preload_end();
-		if (err)
+		err = xa_err(__xa_store(&fc_stats->counters, id, counter, GFP_KERNEL));
+		if (err != 0) {
+			xa_unlock(&fc_stats->counters);
 			goto err_out_alloc;
-
-		llist_add(&counter->addlist, &fc_stats->addlist);
+		}
+		fc_stats->num_counters++;
+		xa_unlock(&fc_stats->counters);
 	}
 
 	return counter;
@@ -407,12 +307,7 @@ err_out_alloc:
 
 struct mlx5_fc *mlx5_fc_create(struct mlx5_core_dev *dev, bool aging)
 {
-	struct mlx5_fc *counter = mlx5_fc_create_ex(dev, aging);
-	struct mlx5_fc_stats *fc_stats = dev->priv.fc_stats;
-
-	if (aging)
-		mod_delayed_work(fc_stats->wq, &fc_stats->work, 0);
-	return counter;
+	return mlx5_fc_create_ex(dev, aging);
 }
 EXPORT_SYMBOL(mlx5_fc_create);
 
@@ -430,11 +325,11 @@ void mlx5_fc_destroy(struct mlx5_core_dev *dev, struct mlx5_fc *counter)
 		return;
 
 	if (counter->aging) {
-		llist_add(&counter->dellist, &fc_stats->dellist);
-		mod_delayed_work(fc_stats->wq, &fc_stats->work, 0);
-		return;
+		xa_lock(&fc_stats->counters);
+		fc_stats->num_counters--;
+		__xa_erase(&fc_stats->counters, counter->id);
+		xa_unlock(&fc_stats->counters);
 	}
-
 	mlx5_fc_release(dev, counter);
 }
 EXPORT_SYMBOL(mlx5_fc_destroy);
@@ -448,11 +343,7 @@ int mlx5_init_fc_stats(struct mlx5_core_dev *dev)
 		return -ENOMEM;
 	dev->priv.fc_stats = fc_stats;
 
-	spin_lock_init(&fc_stats->counters_idr_lock);
-	idr_init(&fc_stats->counters_idr);
-	INIT_LIST_HEAD(&fc_stats->counters);
-	init_llist_head(&fc_stats->addlist);
-	init_llist_head(&fc_stats->dellist);
+	xa_init(&fc_stats->counters);
 
 	/* Allocate initial (small) bulk query buffer. */
 	mlx5_fc_stats_bulk_query_buf_realloc(dev, get_init_bulk_query_len(dev));
@@ -467,7 +358,7 @@ int mlx5_init_fc_stats(struct mlx5_core_dev *dev)
 	INIT_DELAYED_WORK(&fc_stats->work, mlx5_fc_stats_work);
 
 	mlx5_fc_pool_init(&fc_stats->fc_pool, dev);
-
+	queue_delayed_work(fc_stats->wq, &fc_stats->work, MLX5_FC_STATS_PERIOD);
 	return 0;
 
 err_wq_create:
@@ -480,23 +371,20 @@ err_bulk:
 void mlx5_cleanup_fc_stats(struct mlx5_core_dev *dev)
 {
 	struct mlx5_fc_stats *fc_stats = dev->priv.fc_stats;
-	struct llist_node *tmplist;
 	struct mlx5_fc *counter;
-	struct mlx5_fc *tmp;
+	unsigned long id;
 
 	cancel_delayed_work_sync(&fc_stats->work);
 	destroy_workqueue(fc_stats->wq);
 	fc_stats->wq = NULL;
 
-	tmplist = llist_del_all(&fc_stats->addlist);
-	llist_for_each_entry_safe(counter, tmp, tmplist, addlist)
-		mlx5_fc_release(dev, counter);
-
-	list_for_each_entry_safe(counter, tmp, &fc_stats->counters, list)
+	xa_for_each(&fc_stats->counters, id, counter) {
+		xa_erase(&fc_stats->counters, id);
 		mlx5_fc_release(dev, counter);
+	}
+	xa_destroy(&fc_stats->counters);
 
 	mlx5_fc_pool_cleanup(&fc_stats->fc_pool);
-	idr_destroy(&fc_stats->counters_idr);
 	kvfree(fc_stats->bulk_query_out);
 	kfree(fc_stats);
 }
-- 
2.51.0