From 347fcdc414f98998df1c5969e4612e4da67d6852 Mon Sep 17 00:00:00 2001
From: Joe Damato <jdamato@fastly.com>
Date: Sat, 9 Nov 2024 05:02:35 +0000
Subject: [PATCH 01/16] selftests: net: Add busy_poll_test

Add an epoll busy poll test using netdevsim.

This test is comprised of:
  - busy_poller (via busy_poller.c)
  - busy_poll_test.sh which loads netdevsim, sets up network namespaces,
    and runs busy_poller to receive data and socat to send data.

The selftest tests two different scenarios:
  - busy poll (the pre-existing version in the kernel)
  - busy poll with suspend enabled (what this series adds)

The data transmit is a 1MiB temporary file generated from /dev/urandom
and the test is considered passing if the md5sum of the input file to
socat matches the md5sum of the output file from busy_poller.

netdevsim was chosen instead of veth due to netdevsim's support for
netdev-genl.

For now, this test uses the functionality that netdevsim provides. In the
future, perhaps netdevsim can be extended to emulate device IRQs to more
thoroughly test all pre-existing kernel options (like defer_hard_irqs)
and suspend.

Signed-off-by: Joe Damato <jdamato@fastly.com>
Co-developed-by: Martin Karsten <mkarsten@uwaterloo.ca>
Signed-off-by: Martin Karsten <mkarsten@uwaterloo.ca>
Acked-by: Stanislav Fomichev <sdf@fomichev.me>
Reviewed-by: Willem de Bruijn <willemb@google.com>
Link: https://patch.msgid.link/20241109050245.191288-6-jdamato@fastly.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 tools/testing/selftests/net/.gitignore        |   1 +
 tools/testing/selftests/net/Makefile          |   9 +
 tools/testing/selftests/net/busy_poll_test.sh | 165 +++++++++
 tools/testing/selftests/net/busy_poller.c     | 346 ++++++++++++++++++
 4 files changed, 521 insertions(+)
 create mode 100755 tools/testing/selftests/net/busy_poll_test.sh
 create mode 100644 tools/testing/selftests/net/busy_poller.c

diff --git a/tools/testing/selftests/net/.gitignore b/tools/testing/selftests/net/.gitignore
index a78debbd1fe7..48973e78d46b 100644
--- a/tools/testing/selftests/net/.gitignore
+++ b/tools/testing/selftests/net/.gitignore
@@ -2,6 +2,7 @@
 bind_bhash
 bind_timewait
 bind_wildcard
+busy_poller
 cmsg_sender
 diag_uid
 epoll_busy_poll
diff --git a/tools/testing/selftests/net/Makefile b/tools/testing/selftests/net/Makefile
index 9322b904ad00..2b2a5ec7fa6a 100644
--- a/tools/testing/selftests/net/Makefile
+++ b/tools/testing/selftests/net/Makefile
@@ -97,6 +97,11 @@ TEST_PROGS += fq_band_pktlimit.sh
 TEST_PROGS += vlan_hw_filter.sh
 TEST_PROGS += bpf_offload.py
 TEST_PROGS += ipv6_route_update_soft_lockup.sh
+TEST_PROGS += busy_poll_test.sh
+
+# YNL files, must be before "include ..lib.mk"
+YNL_GEN_FILES := busy_poller
+TEST_GEN_FILES += $(YNL_GEN_FILES)
 
 TEST_FILES := settings
 TEST_FILES += in_netns.sh lib.sh net_helper.sh setup_loopback.sh setup_veth.sh
@@ -107,6 +112,10 @@ TEST_INCLUDES := forwarding/lib.sh
 
 include ../lib.mk
 
+# YNL build
+YNL_GENS := netdev
+include ynl.mk
+
 $(OUTPUT)/epoll_busy_poll: LDLIBS += -lcap
 $(OUTPUT)/reuseport_bpf_numa: LDLIBS += -lnuma
 $(OUTPUT)/tcp_mmap: LDLIBS += -lpthread -lcrypto
diff --git a/tools/testing/selftests/net/busy_poll_test.sh b/tools/testing/selftests/net/busy_poll_test.sh
new file mode 100755
index 000000000000..7db292ec4884
--- /dev/null
+++ b/tools/testing/selftests/net/busy_poll_test.sh
@@ -0,0 +1,165 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+source net_helper.sh
+
+NSIM_SV_ID=$((256 + RANDOM % 256))
+NSIM_SV_SYS=/sys/bus/netdevsim/devices/netdevsim$NSIM_SV_ID
+NSIM_CL_ID=$((512 + RANDOM % 256))
+NSIM_CL_SYS=/sys/bus/netdevsim/devices/netdevsim$NSIM_CL_ID
+
+NSIM_DEV_SYS_NEW=/sys/bus/netdevsim/new_device
+NSIM_DEV_SYS_DEL=/sys/bus/netdevsim/del_device
+NSIM_DEV_SYS_LINK=/sys/bus/netdevsim/link_device
+NSIM_DEV_SYS_UNLINK=/sys/bus/netdevsim/unlink_device
+
+SERVER_IP=192.168.1.1
+CLIENT_IP=192.168.1.2
+SERVER_PORT=48675
+
+# busy poll config
+MAX_EVENTS=8
+BUSY_POLL_USECS=0
+BUSY_POLL_BUDGET=16
+PREFER_BUSY_POLL=1
+
+# IRQ deferral config
+NAPI_DEFER_HARD_IRQS=100
+GRO_FLUSH_TIMEOUT=50000
+SUSPEND_TIMEOUT=20000000
+
+setup_ns()
+{
+	set -e
+	ip netns add nssv
+	ip netns add nscl
+
+	NSIM_SV_NAME=$(find $NSIM_SV_SYS/net -maxdepth 1 -type d ! \
+		-path $NSIM_SV_SYS/net -exec basename {} \;)
+	NSIM_CL_NAME=$(find $NSIM_CL_SYS/net -maxdepth 1 -type d ! \
+		-path $NSIM_CL_SYS/net -exec basename {} \;)
+
+	# ensure the server has 1 queue
+	ethtool -L $NSIM_SV_NAME combined 1 2>/dev/null
+
+	ip link set $NSIM_SV_NAME netns nssv
+	ip link set $NSIM_CL_NAME netns nscl
+
+	ip netns exec nssv ip addr add "${SERVER_IP}/24" dev $NSIM_SV_NAME
+	ip netns exec nscl ip addr add "${CLIENT_IP}/24" dev $NSIM_CL_NAME
+
+	ip netns exec nssv ip link set dev $NSIM_SV_NAME up
+	ip netns exec nscl ip link set dev $NSIM_CL_NAME up
+
+	set +e
+}
+
+cleanup_ns()
+{
+	ip netns del nscl
+	ip netns del nssv
+}
+
+test_busypoll()
+{
+	suspend_value=${1:-0}
+	tmp_file=$(mktemp)
+	out_file=$(mktemp)
+
+	# fill a test file with random data
+	dd if=/dev/urandom of=${tmp_file} bs=1M count=1 2> /dev/null
+
+	timeout -k 1s 30s ip netns exec nssv ./busy_poller         \
+					     -p${SERVER_PORT}      \
+					     -b${SERVER_IP}        \
+					     -m${MAX_EVENTS}       \
+					     -u${BUSY_POLL_USECS}  \
+					     -P${PREFER_BUSY_POLL} \
+					     -g${BUSY_POLL_BUDGET} \
+					     -i${NSIM_SV_IFIDX}    \
+					     -s${suspend_value}    \
+					     -o${out_file}&
+
+	wait_local_port_listen nssv ${SERVER_PORT} tcp
+
+	ip netns exec nscl socat -u $tmp_file TCP:${SERVER_IP}:${SERVER_PORT}
+
+	wait
+
+	tmp_file_md5sum=$(md5sum $tmp_file | cut -f1 -d' ')
+	out_file_md5sum=$(md5sum $out_file | cut -f1 -d' ')
+
+	if [ "$tmp_file_md5sum" = "$out_file_md5sum" ]; then
+		res=0
+	else
+		echo "md5sum mismatch"
+		echo "input file md5sum: ${tmp_file_md5sum}";
+		echo "output file md5sum: ${out_file_md5sum}";
+		res=1
+	fi
+
+	rm $out_file $tmp_file
+
+	return $res
+}
+
+test_busypoll_with_suspend()
+{
+	test_busypoll ${SUSPEND_TIMEOUT}
+
+	return $?
+}
+
+###
+### Code start
+###
+
+modprobe netdevsim
+
+# linking
+
+echo $NSIM_SV_ID > $NSIM_DEV_SYS_NEW
+echo $NSIM_CL_ID > $NSIM_DEV_SYS_NEW
+udevadm settle
+
+setup_ns
+
+NSIM_SV_FD=$((256 + RANDOM % 256))
+exec {NSIM_SV_FD}</var/run/netns/nssv
+NSIM_SV_IFIDX=$(ip netns exec nssv cat /sys/class/net/$NSIM_SV_NAME/ifindex)
+
+NSIM_CL_FD=$((256 + RANDOM % 256))
+exec {NSIM_CL_FD}</var/run/netns/nscl
+NSIM_CL_IFIDX=$(ip netns exec nscl cat /sys/class/net/$NSIM_CL_NAME/ifindex)
+
+echo "$NSIM_SV_FD:$NSIM_SV_IFIDX $NSIM_CL_FD:$NSIM_CL_IFIDX" > \
+     $NSIM_DEV_SYS_LINK
+
+if [ $? -ne 0 ]; then
+	echo "linking netdevsim1 with netdevsim2 should succeed"
+	cleanup_ns
+	exit 1
+fi
+
+test_busypoll
+if [ $? -ne 0 ]; then
+	echo "test_busypoll failed"
+	cleanup_ns
+	exit 1
+fi
+
+test_busypoll_with_suspend
+if [ $? -ne 0 ]; then
+	echo "test_busypoll_with_suspend failed"
+	cleanup_ns
+	exit 1
+fi
+
+echo "$NSIM_SV_FD:$NSIM_SV_IFIDX" > $NSIM_DEV_SYS_UNLINK
+
+echo $NSIM_CL_ID > $NSIM_DEV_SYS_DEL
+
+cleanup_ns
+
+modprobe -r netdevsim
+
+exit 0
diff --git a/tools/testing/selftests/net/busy_poller.c b/tools/testing/selftests/net/busy_poller.c
new file mode 100644
index 000000000000..99b0e8c17fca
--- /dev/null
+++ b/tools/testing/selftests/net/busy_poller.c
@@ -0,0 +1,346 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <assert.h>
+#include <errno.h>
+#include <error.h>
+#include <fcntl.h>
+#include <inttypes.h>
+#include <limits.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <unistd.h>
+#include <ynl.h>
+
+#include <arpa/inet.h>
+#include <netinet/in.h>
+
+#include <sys/epoll.h>
+#include <sys/ioctl.h>
+#include <sys/socket.h>
+#include <sys/types.h>
+
+#include <linux/genetlink.h>
+#include <linux/netlink.h>
+
+#include "netdev-user.h"
+
+/* The below ifdef blob is required because:
+ *
+ * - sys/epoll.h does not (yet) have the ioctl definitions included. So,
+ *   systems with older glibcs will not have them available. However,
+ *   sys/epoll.h does include the type definition for epoll_data, which is
+ *   needed by the user program (e.g. epoll_event.data.fd)
+ *
+ * - linux/eventpoll.h does not define the epoll_data type, it is simply an
+ *   opaque __u64. It does, however, include the ioctl definition.
+ *
+ * Including both headers is impossible (types would be redefined), so I've
+ * opted instead to take sys/epoll.h, and include the blob below.
+ *
+ * Someday, when glibc is globally up to date, the blob below can be removed.
+ */
+#if !defined(EPOLL_IOC_TYPE)
+struct epoll_params {
+	uint32_t busy_poll_usecs;
+	uint16_t busy_poll_budget;
+	uint8_t prefer_busy_poll;
+
+	/* pad the struct to a multiple of 64bits */
+	uint8_t __pad;
+};
+
+#define EPOLL_IOC_TYPE 0x8A
+#define EPIOCSPARAMS _IOW(EPOLL_IOC_TYPE, 0x01, struct epoll_params)
+#define EPIOCGPARAMS _IOR(EPOLL_IOC_TYPE, 0x02, struct epoll_params)
+#endif
+
+static uint32_t cfg_port = 8000;
+static struct in_addr cfg_bind_addr = { .s_addr = INADDR_ANY };
+static char *cfg_outfile;
+static int cfg_max_events = 8;
+static int cfg_ifindex;
+
+/* busy poll params */
+static uint32_t cfg_busy_poll_usecs;
+static uint32_t cfg_busy_poll_budget;
+static uint32_t cfg_prefer_busy_poll;
+
+/* IRQ params */
+static uint32_t cfg_defer_hard_irqs;
+static uint64_t cfg_gro_flush_timeout;
+static uint64_t cfg_irq_suspend_timeout;
+
+static void usage(const char *filepath)
+{
+	error(1, 0,
+	      "Usage: %s -p<port> -b<addr> -m<max_events> -u<busy_poll_usecs> -P<prefer_busy_poll> -g<busy_poll_budget> -o<outfile> -d<defer_hard_irqs> -r<gro_flush_timeout> -s<irq_suspend_timeout> -i<ifindex>",
+	      filepath);
+}
+
+static void parse_opts(int argc, char **argv)
+{
+	int ret;
+	int c;
+
+	if (argc <= 1)
+		usage(argv[0]);
+
+	while ((c = getopt(argc, argv, "p:m:b:u:P:g:o:d:r:s:i:")) != -1) {
+		switch (c) {
+		case 'u':
+			cfg_busy_poll_usecs = strtoul(optarg, NULL, 0);
+			if (cfg_busy_poll_usecs == ULONG_MAX ||
+			    cfg_busy_poll_usecs > UINT32_MAX)
+				error(1, ERANGE, "busy_poll_usecs too large");
+			break;
+		case 'P':
+			cfg_prefer_busy_poll = strtoul(optarg, NULL, 0);
+			if (cfg_prefer_busy_poll == ULONG_MAX ||
+			    cfg_prefer_busy_poll > 1)
+				error(1, ERANGE,
+				      "prefer busy poll should be 0 or 1");
+			break;
+		case 'g':
+			cfg_busy_poll_budget = strtoul(optarg, NULL, 0);
+			if (cfg_busy_poll_budget == ULONG_MAX ||
+			    cfg_busy_poll_budget > UINT16_MAX)
+				error(1, ERANGE,
+				      "busy poll budget must be [0, UINT16_MAX]");
+			break;
+		case 'p':
+			cfg_port = strtoul(optarg, NULL, 0);
+			if (cfg_port > UINT16_MAX)
+				error(1, ERANGE, "port must be <= 65535");
+			break;
+		case 'b':
+			ret = inet_aton(optarg, &cfg_bind_addr);
+			if (ret == 0)
+				error(1, errno,
+				      "bind address %s invalid", optarg);
+			break;
+		case 'o':
+			cfg_outfile = strdup(optarg);
+			if (!cfg_outfile)
+				error(1, 0, "outfile invalid");
+			break;
+		case 'm':
+			cfg_max_events = strtol(optarg, NULL, 0);
+
+			if (cfg_max_events == LONG_MIN ||
+			    cfg_max_events == LONG_MAX ||
+			    cfg_max_events <= 0)
+				error(1, ERANGE,
+				      "max events must be > 0 and < LONG_MAX");
+			break;
+		case 'd':
+			cfg_defer_hard_irqs = strtoul(optarg, NULL, 0);
+
+			if (cfg_defer_hard_irqs == ULONG_MAX ||
+			    cfg_defer_hard_irqs > INT32_MAX)
+				error(1, ERANGE,
+				      "defer_hard_irqs must be <= INT32_MAX");
+			break;
+		case 'r':
+			cfg_gro_flush_timeout = strtoull(optarg, NULL, 0);
+
+			if (cfg_gro_flush_timeout == ULLONG_MAX)
+				error(1, ERANGE,
+				      "gro_flush_timeout must be < ULLONG_MAX");
+			break;
+		case 's':
+			cfg_irq_suspend_timeout = strtoull(optarg, NULL, 0);
+
+			if (cfg_irq_suspend_timeout == ULLONG_MAX)
+				error(1, ERANGE,
+				      "irq_suspend_timeout must be < ULLONG_MAX");
+			break;
+		case 'i':
+			cfg_ifindex = strtoul(optarg, NULL, 0);
+			if (cfg_ifindex == ULONG_MAX)
+				error(1, ERANGE,
+				      "ifindex must be < ULONG_MAX");
+			break;
+		}
+	}
+
+	if (!cfg_ifindex)
+		usage(argv[0]);
+
+	if (optind != argc)
+		usage(argv[0]);
+}
+
+static void epoll_ctl_add(int epfd, int fd, uint32_t events)
+{
+	struct epoll_event ev;
+
+	ev.events = events;
+	ev.data.fd = fd;
+	if (epoll_ctl(epfd, EPOLL_CTL_ADD, fd, &ev) == -1)
+		error(1, errno, "epoll_ctl add fd: %d", fd);
+}
+
+static void setnonblock(int sockfd)
+{
+	int flags;
+
+	flags = fcntl(sockfd, F_GETFL, 0);
+
+	if (fcntl(sockfd, F_SETFL, flags | O_NONBLOCK) == -1)
+		error(1, errno, "unable to set socket to nonblocking mode");
+}
+
+static void write_chunk(int fd, char *buf, ssize_t buflen)
+{
+	ssize_t remaining = buflen;
+	char *buf_offset = buf;
+	ssize_t writelen = 0;
+	ssize_t write_result;
+
+	while (writelen < buflen) {
+		write_result = write(fd, buf_offset, remaining);
+		if (write_result == -1)
+			error(1, errno, "unable to write data to outfile");
+
+		writelen += write_result;
+		remaining -= write_result;
+		buf_offset += write_result;
+	}
+}
+
+static void setup_queue(void)
+{
+	struct netdev_napi_get_list *napi_list = NULL;
+	struct netdev_napi_get_req_dump *req = NULL;
+	struct netdev_napi_set_req *set_req = NULL;
+	struct ynl_sock *ys;
+	struct ynl_error yerr;
+	uint32_t napi_id;
+
+	ys = ynl_sock_create(&ynl_netdev_family, &yerr);
+	if (!ys)
+		error(1, 0, "YNL: %s", yerr.msg);
+
+	req = netdev_napi_get_req_dump_alloc();
+	netdev_napi_get_req_dump_set_ifindex(req, cfg_ifindex);
+	napi_list = netdev_napi_get_dump(ys, req);
+
+	/* assume there is 1 NAPI configured and take the first */
+	if (napi_list->obj._present.id)
+		napi_id = napi_list->obj.id;
+	else
+		error(1, 0, "napi ID not present?");
+
+	set_req = netdev_napi_set_req_alloc();
+	netdev_napi_set_req_set_id(set_req, napi_id);
+	netdev_napi_set_req_set_defer_hard_irqs(set_req, cfg_defer_hard_irqs);
+	netdev_napi_set_req_set_gro_flush_timeout(set_req,
+						  cfg_gro_flush_timeout);
+	netdev_napi_set_req_set_irq_suspend_timeout(set_req,
+						    cfg_irq_suspend_timeout);
+
+	if (netdev_napi_set(ys, set_req))
+		error(1, 0, "can't set NAPI params: %s\n", yerr.msg);
+
+	netdev_napi_get_list_free(napi_list);
+	netdev_napi_get_req_dump_free(req);
+	netdev_napi_set_req_free(set_req);
+	ynl_sock_destroy(ys);
+}
+
+static void run_poller(void)
+{
+	struct epoll_event events[cfg_max_events];
+	struct epoll_params epoll_params = {0};
+	struct sockaddr_in server_addr;
+	int i, epfd, nfds;
+	ssize_t readlen;
+	int outfile_fd;
+	char buf[1024];
+	int sockfd;
+	int conn;
+	int val;
+
+	outfile_fd = open(cfg_outfile, O_WRONLY | O_CREAT, 0644);
+	if (outfile_fd == -1)
+		error(1, errno, "unable to open outfile: %s", cfg_outfile);
+
+	sockfd = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP);
+	if (sockfd == -1)
+		error(1, errno, "unable to create listen socket");
+
+	server_addr.sin_family = AF_INET;
+	server_addr.sin_port = htons(cfg_port);
+	server_addr.sin_addr = cfg_bind_addr;
+
+	/* these values are range checked during parse_opts, so casting is safe
+	 * here
+	 */
+	epoll_params.busy_poll_usecs = cfg_busy_poll_usecs;
+	epoll_params.busy_poll_budget = (uint16_t)cfg_busy_poll_budget;
+	epoll_params.prefer_busy_poll = (uint8_t)cfg_prefer_busy_poll;
+	epoll_params.__pad = 0;
+
+	val = 1;
+	if (setsockopt(sockfd, SOL_SOCKET, SO_REUSEADDR, &val, sizeof(val)))
+		error(1, errno, "poller setsockopt reuseaddr");
+
+	setnonblock(sockfd);
+
+	if (bind(sockfd, (struct sockaddr *)&server_addr,
+		 sizeof(struct sockaddr_in)))
+		error(0, errno, "poller bind to port: %d\n", cfg_port);
+
+	if (listen(sockfd, 1))
+		error(1, errno, "poller listen");
+
+	epfd = epoll_create1(0);
+	if (ioctl(epfd, EPIOCSPARAMS, &epoll_params) == -1)
+		error(1, errno, "unable to set busy poll params");
+
+	epoll_ctl_add(epfd, sockfd, EPOLLIN | EPOLLOUT | EPOLLET);
+
+	for (;;) {
+		nfds = epoll_wait(epfd, events, cfg_max_events, -1);
+		for (i = 0; i < nfds; i++) {
+			if (events[i].data.fd == sockfd) {
+				conn = accept(sockfd, NULL, NULL);
+				if (conn == -1)
+					error(1, errno,
+					      "accepting incoming connection failed");
+
+				setnonblock(conn);
+				epoll_ctl_add(epfd, conn,
+					      EPOLLIN | EPOLLET | EPOLLRDHUP |
+					      EPOLLHUP);
+			} else if (events[i].events & EPOLLIN) {
+				for (;;) {
+					readlen = read(events[i].data.fd, buf,
+						       sizeof(buf));
+					if (readlen > 0)
+						write_chunk(outfile_fd, buf,
+							    readlen);
+					else
+						break;
+				}
+			} else {
+				/* spurious event ? */
+			}
+			if (events[i].events & (EPOLLRDHUP | EPOLLHUP)) {
+				epoll_ctl(epfd, EPOLL_CTL_DEL,
+					  events[i].data.fd, NULL);
+				close(events[i].data.fd);
+				close(outfile_fd);
+				return;
+			}
+		}
+	}
+}
+
+int main(int argc, char *argv[])
+{
+	parse_opts(argc, argv);
+	setup_queue();
+	run_poller();
+	return 0;
+}
-- 
2.51.0


From a90a91e24b4851367e26e0990bd8c2e9213672ff Mon Sep 17 00:00:00 2001
From: Joe Damato <jdamato@fastly.com>
Date: Sat, 9 Nov 2024 05:02:36 +0000
Subject: [PATCH 02/16] docs: networking: Describe irq suspension

Describe irq suspension, the epoll ioctls, and the tradeoffs of using
different gro_flush_timeout values.

Signed-off-by: Joe Damato <jdamato@fastly.com>
Co-developed-by: Martin Karsten <mkarsten@uwaterloo.ca>
Signed-off-by: Martin Karsten <mkarsten@uwaterloo.ca>
Reviewed-by: Sridhar Samudrala <sridhar.samudrala@intel.com>
Reviewed-by: Bagas Sanjaya <bagasdotme@gmail.com>
Link: https://patch.msgid.link/20241109050245.191288-7-jdamato@fastly.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 Documentation/networking/napi.rst | 170 +++++++++++++++++++++++++++++-
 1 file changed, 168 insertions(+), 2 deletions(-)

diff --git a/Documentation/networking/napi.rst b/Documentation/networking/napi.rst
index dfa5d549be9c..02720dd71a76 100644
--- a/Documentation/networking/napi.rst
+++ b/Documentation/networking/napi.rst
@@ -192,6 +192,33 @@ is reused to control the delay of the timer, while
 ``napi_defer_hard_irqs`` controls the number of consecutive empty polls
 before NAPI gives up and goes back to using hardware IRQs.
 
+The above parameters can also be set on a per-NAPI basis using netlink via
+netdev-genl. When used with netlink and configured on a per-NAPI basis, the
+parameters mentioned above use hyphens instead of underscores:
+``gro-flush-timeout`` and ``napi-defer-hard-irqs``.
+
+Per-NAPI configuration can be done programmatically in a user application
+or by using a script included in the kernel source tree:
+``tools/net/ynl/cli.py``.
+
+For example, using the script:
+
+.. code-block:: bash
+
+  $ kernel-source/tools/net/ynl/cli.py \
+            --spec Documentation/netlink/specs/netdev.yaml \
+            --do napi-set \
+            --json='{"id": 345,
+                     "defer-hard-irqs": 111,
+                     "gro-flush-timeout": 11111}'
+
+Similarly, the parameter ``irq-suspend-timeout`` can be set using netlink
+via netdev-genl. There is no global sysfs parameter for this value.
+
+``irq-suspend-timeout`` is used to determine how long an application can
+completely suspend IRQs. It is used in combination with SO_PREFER_BUSY_POLL,
+which can be set on a per-epoll context basis with ``EPIOCSPARAMS`` ioctl.
+
 .. _poll:
 
 Busy polling
@@ -207,6 +234,46 @@ selected sockets or using the global ``net.core.busy_poll`` and
 ``net.core.busy_read`` sysctls. An io_uring API for NAPI busy polling
 also exists.
 
+epoll-based busy polling
+------------------------
+
+It is possible to trigger packet processing directly from calls to
+``epoll_wait``. In order to use this feature, a user application must ensure
+all file descriptors which are added to an epoll context have the same NAPI ID.
+
+If the application uses a dedicated acceptor thread, the application can obtain
+the NAPI ID of the incoming connection using SO_INCOMING_NAPI_ID and then
+distribute that file descriptor to a worker thread. The worker thread would add
+the file descriptor to its epoll context. This would ensure each worker thread
+has an epoll context with FDs that have the same NAPI ID.
+
+Alternatively, if the application uses SO_REUSEPORT, a bpf or ebpf program can
+be inserted to distribute incoming connections to threads such that each thread
+is only given incoming connections with the same NAPI ID. Care must be taken to
+carefully handle cases where a system may have multiple NICs.
+
+In order to enable busy polling, there are two choices:
+
+1. ``/proc/sys/net/core/busy_poll`` can be set with a time in useconds to busy
+   loop waiting for events. This is a system-wide setting and will cause all
+   epoll-based applications to busy poll when they call epoll_wait. This may
+   not be desirable as many applications may not have the need to busy poll.
+
+2. Applications using recent kernels can issue an ioctl on the epoll context
+   file descriptor to set (``EPIOCSPARAMS``) or get (``EPIOCGPARAMS``) ``struct
+   epoll_params``:, which user programs can define as follows:
+
+.. code-block:: c
+
+  struct epoll_params {
+      uint32_t busy_poll_usecs;
+      uint16_t busy_poll_budget;
+      uint8_t prefer_busy_poll;
+
+      /* pad the struct to a multiple of 64bits */
+      uint8_t __pad;
+  };
+
 IRQ mitigation
 ---------------
 
@@ -222,12 +289,111 @@ Such applications can pledge to the kernel that they will perform a busy
 polling operation periodically, and the driver should keep the device IRQs
 permanently masked. This mode is enabled by using the ``SO_PREFER_BUSY_POLL``
 socket option. To avoid system misbehavior the pledge is revoked
-if ``gro_flush_timeout`` passes without any busy poll call.
+if ``gro_flush_timeout`` passes without any busy poll call. For epoll-based
+busy polling applications, the ``prefer_busy_poll`` field of ``struct
+epoll_params`` can be set to 1 and the ``EPIOCSPARAMS`` ioctl can be issued to
+enable this mode. See the above section for more details.
 
 The NAPI budget for busy polling is lower than the default (which makes
 sense given the low latency intention of normal busy polling). This is
 not the case with IRQ mitigation, however, so the budget can be adjusted
-with the ``SO_BUSY_POLL_BUDGET`` socket option.
+with the ``SO_BUSY_POLL_BUDGET`` socket option. For epoll-based busy polling
+applications, the ``busy_poll_budget`` field can be adjusted to the desired value
+in ``struct epoll_params`` and set on a specific epoll context using the ``EPIOCSPARAMS``
+ioctl. See the above section for more details.
+
+It is important to note that choosing a large value for ``gro_flush_timeout``
+will defer IRQs to allow for better batch processing, but will induce latency
+when the system is not fully loaded. Choosing a small value for
+``gro_flush_timeout`` can cause interference of the user application which is
+attempting to busy poll by device IRQs and softirq processing. This value
+should be chosen carefully with these tradeoffs in mind. epoll-based busy
+polling applications may be able to mitigate how much user processing happens
+by choosing an appropriate value for ``maxevents``.
+
+Users may want to consider an alternate approach, IRQ suspension, to help deal
+with these tradeoffs.
+
+IRQ suspension
+--------------
+
+IRQ suspension is a mechanism wherein device IRQs are masked while epoll
+triggers NAPI packet processing.
+
+While application calls to epoll_wait successfully retrieve events, the kernel will
+defer the IRQ suspension timer. If the kernel does not retrieve any events
+while busy polling (for example, because network traffic levels subsided), IRQ
+suspension is disabled and the IRQ mitigation strategies described above are
+engaged.
+
+This allows users to balance CPU consumption with network processing
+efficiency.
+
+To use this mechanism:
+
+  1. The per-NAPI config parameter ``irq-suspend-timeout`` should be set to the
+     maximum time (in nanoseconds) the application can have its IRQs
+     suspended. This is done using netlink, as described above. This timeout
+     serves as a safety mechanism to restart IRQ driver interrupt processing if
+     the application has stalled. This value should be chosen so that it covers
+     the amount of time the user application needs to process data from its
+     call to epoll_wait, noting that applications can control how much data
+     they retrieve by setting ``max_events`` when calling epoll_wait.
+
+  2. The sysfs parameter or per-NAPI config parameters ``gro_flush_timeout``
+     and ``napi_defer_hard_irqs`` can be set to low values. They will be used
+     to defer IRQs after busy poll has found no data.
+
+  3. The ``prefer_busy_poll`` flag must be set to true. This can be done using
+     the ``EPIOCSPARAMS`` ioctl as described above.
+
+  4. The application uses epoll as described above to trigger NAPI packet
+     processing.
+
+As mentioned above, as long as subsequent calls to epoll_wait return events to
+userland, the ``irq-suspend-timeout`` is deferred and IRQs are disabled. This
+allows the application to process data without interference.
+
+Once a call to epoll_wait results in no events being found, IRQ suspension is
+automatically disabled and the ``gro_flush_timeout`` and
+``napi_defer_hard_irqs`` mitigation mechanisms take over.
+
+It is expected that ``irq-suspend-timeout`` will be set to a value much larger
+than ``gro_flush_timeout`` as ``irq-suspend-timeout`` should suspend IRQs for
+the duration of one userland processing cycle.
+
+While it is not stricly necessary to use ``napi_defer_hard_irqs`` and
+``gro_flush_timeout`` to use IRQ suspension, their use is strongly
+recommended.
+
+IRQ suspension causes the system to alternate between polling mode and
+irq-driven packet delivery. During busy periods, ``irq-suspend-timeout``
+overrides ``gro_flush_timeout`` and keeps the system busy polling, but when
+epoll finds no events, the setting of ``gro_flush_timeout`` and
+``napi_defer_hard_irqs`` determine the next step.
+
+There are essentially three possible loops for network processing and
+packet delivery:
+
+1) hardirq -> softirq -> napi poll; basic interrupt delivery
+2) timer -> softirq -> napi poll; deferred irq processing
+3) epoll -> busy-poll -> napi poll; busy looping
+
+Loop 2 can take control from Loop 1, if ``gro_flush_timeout`` and
+``napi_defer_hard_irqs`` are set.
+
+If ``gro_flush_timeout`` and ``napi_defer_hard_irqs`` are set, Loops 2
+and 3 "wrestle" with each other for control.
+
+During busy periods, ``irq-suspend-timeout`` is used as timer in Loop 2,
+which essentially tilts network processing in favour of Loop 3.
+
+If ``gro_flush_timeout`` and ``napi_defer_hard_irqs`` are not set, Loop 3
+cannot take control from Loop 1.
+
+Therefore, setting ``gro_flush_timeout`` and ``napi_defer_hard_irqs`` is
+the recommended usage, because otherwise setting ``irq-suspend-timeout``
+might not have any discernible effect.
 
 .. _threaded:
 
-- 
2.51.0


From 8cc5f4cb94c0b1c7c1ba8013c14fd02ffb1a25f3 Mon Sep 17 00:00:00 2001
From: "Russell King (Oracle)" <rmk+kernel@armlinux.org.uk>
Date: Fri, 8 Nov 2024 16:01:44 +0000
Subject: [PATCH 03/16] net: phylink: move manual flow control setting

Move the handling of manual flow control configuration to a common
location during resolve. We currently evaluate this for all but
fixed links.

Signed-off-by: Russell King (Oracle) <rmk+kernel@armlinux.org.uk>
Link: https://patch.msgid.link/E1t9RQe-002Feh-T1@rmk-PC.armlinux.org.uk
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 drivers/net/phy/phylink.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/drivers/net/phy/phylink.c b/drivers/net/phy/phylink.c
index 6ca7ea970f51..65e81ef2225d 100644
--- a/drivers/net/phy/phylink.c
+++ b/drivers/net/phy/phylink.c
@@ -1467,7 +1467,6 @@ static void phylink_resolve(struct work_struct *w)
 		switch (pl->cur_link_an_mode) {
 		case MLO_AN_PHY:
 			link_state = pl->phy_state;
-			phylink_apply_manual_flow(pl, &link_state);
 			mac_config = link_state.link;
 			break;
 
@@ -1528,11 +1527,13 @@ static void phylink_resolve(struct work_struct *w)
 				link_state.pause = pl->phy_state.pause;
 				mac_config = true;
 			}
-			phylink_apply_manual_flow(pl, &link_state);
 			break;
 		}
 	}
 
+	if (pl->cur_link_an_mode != MLO_AN_FIXED)
+		phylink_apply_manual_flow(pl, &link_state);
+
 	if (mac_config) {
 		if (link_state.interface != pl->link_config.interface) {
 			/* The interface has changed, force the link down and
-- 
2.51.0


From 92abfcb4ced482afbe65d18980e6734fe1e62a34 Mon Sep 17 00:00:00 2001
From: "Russell King (Oracle)" <rmk+kernel@armlinux.org.uk>
Date: Fri, 8 Nov 2024 16:01:50 +0000
Subject: [PATCH 04/16] net: phylink: move MLO_AN_FIXED resolve handling to
 if() statement

The switch() statement doesn't sit very well with the preceeding if()
statements, and results in excessive indentation that spoils code
readability. Begin cleaning this up by converting the MLO_AN_FIXED case
to an if() statement.

Signed-off-by: Russell King (Oracle) <rmk+kernel@armlinux.org.uk>
Link: https://patch.msgid.link/E1t9RQk-002Fen-1A@rmk-PC.armlinux.org.uk
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 drivers/net/phy/phylink.c | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

diff --git a/drivers/net/phy/phylink.c b/drivers/net/phy/phylink.c
index 65e81ef2225d..bb20ae5674e5 100644
--- a/drivers/net/phy/phylink.c
+++ b/drivers/net/phy/phylink.c
@@ -1463,6 +1463,9 @@ static void phylink_resolve(struct work_struct *w)
 	} else if (pl->mac_link_dropped) {
 		link_state.link = false;
 		retrigger = true;
+	} else if (pl->cur_link_an_mode == MLO_AN_FIXED) {
+		phylink_get_fixed_state(pl, &link_state);
+		mac_config = link_state.link;
 	} else {
 		switch (pl->cur_link_an_mode) {
 		case MLO_AN_PHY:
@@ -1470,11 +1473,6 @@ static void phylink_resolve(struct work_struct *w)
 			mac_config = link_state.link;
 			break;
 
-		case MLO_AN_FIXED:
-			phylink_get_fixed_state(pl, &link_state);
-			mac_config = link_state.link;
-			break;
-
 		case MLO_AN_INBAND:
 			phylink_mac_pcs_get_state(pl, &link_state);
 
-- 
2.51.0


From f0f46c2a3d8ea9d1427298c8103a777d9e616c29 Mon Sep 17 00:00:00 2001
From: "Russell King (Oracle)" <rmk+kernel@armlinux.org.uk>
Date: Fri, 8 Nov 2024 16:01:55 +0000
Subject: [PATCH 05/16] net: phylink: move MLO_AN_PHY resolve handling to if()
 statement

The switch() statement doesn't sit very well with the preceeding if()
statements, and results in excessive indentation that spoils code
readability. Continue cleaning this up by converting the MLO_AN_PHY
case to use an if() statmeent.

Signed-off-by: Russell King (Oracle) <rmk+kernel@armlinux.org.uk>
Link: https://patch.msgid.link/E1t9RQp-002Fet-5W@rmk-PC.armlinux.org.uk
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 drivers/net/phy/phylink.c | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

diff --git a/drivers/net/phy/phylink.c b/drivers/net/phy/phylink.c
index bb20ae5674e5..3af6368a9fbf 100644
--- a/drivers/net/phy/phylink.c
+++ b/drivers/net/phy/phylink.c
@@ -1466,13 +1466,11 @@ static void phylink_resolve(struct work_struct *w)
 	} else if (pl->cur_link_an_mode == MLO_AN_FIXED) {
 		phylink_get_fixed_state(pl, &link_state);
 		mac_config = link_state.link;
+	} else if (pl->cur_link_an_mode == MLO_AN_PHY) {
+		link_state = pl->phy_state;
+		mac_config = link_state.link;
 	} else {
 		switch (pl->cur_link_an_mode) {
-		case MLO_AN_PHY:
-			link_state = pl->phy_state;
-			mac_config = link_state.link;
-			break;
-
 		case MLO_AN_INBAND:
 			phylink_mac_pcs_get_state(pl, &link_state);
 
-- 
2.51.0


From d1a16dbbd84e02d2a6dcfcb8d5c4b8b2c0289f00 Mon Sep 17 00:00:00 2001
From: "Russell King (Oracle)" <rmk+kernel@armlinux.org.uk>
Date: Fri, 8 Nov 2024 16:02:00 +0000
Subject: [PATCH 06/16] net: phylink: remove switch() statement in resolve
 handling

The switch() statement doesn't sit very well with the preceeding if()
statements, so let's just convert everything to if()s. As a result of
the two preceding commits, there is now only one case in the switch()
statement. Remove the switch statement and reduce the code indentation.
Code reformatting will be in the following commit.

Signed-off-by: Russell King (Oracle) <rmk+kernel@armlinux.org.uk>
Link: https://patch.msgid.link/E1t9RQu-002Fez-AA@rmk-PC.armlinux.org.uk
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 drivers/net/phy/phylink.c | 94 +++++++++++++++++++--------------------
 1 file changed, 45 insertions(+), 49 deletions(-)

diff --git a/drivers/net/phy/phylink.c b/drivers/net/phy/phylink.c
index 3af6368a9fbf..aaeb8b11e758 100644
--- a/drivers/net/phy/phylink.c
+++ b/drivers/net/phy/phylink.c
@@ -1470,60 +1470,56 @@ static void phylink_resolve(struct work_struct *w)
 		link_state = pl->phy_state;
 		mac_config = link_state.link;
 	} else {
-		switch (pl->cur_link_an_mode) {
-		case MLO_AN_INBAND:
-			phylink_mac_pcs_get_state(pl, &link_state);
-
-			/* The PCS may have a latching link-fail indicator.
-			 * If the link was up, bring the link down and
-			 * re-trigger the resolve. Otherwise, re-read the
-			 * PCS state to get the current status of the link.
+		phylink_mac_pcs_get_state(pl, &link_state);
+
+		/* The PCS may have a latching link-fail indicator.
+		 * If the link was up, bring the link down and
+		 * re-trigger the resolve. Otherwise, re-read the
+		 * PCS state to get the current status of the link.
+		 */
+		if (!link_state.link) {
+			if (cur_link_state)
+				retrigger = true;
+			else
+				phylink_mac_pcs_get_state(pl,
+							  &link_state);
+		}
+
+		/* If we have a phy, the "up" state is the union of
+		 * both the PHY and the MAC
+		 */
+		if (pl->phydev)
+			link_state.link &= pl->phy_state.link;
+
+		/* Only update if the PHY link is up */
+		if (pl->phydev && pl->phy_state.link) {
+			/* If the interface has changed, force a
+			 * link down event if the link isn't already
+			 * down, and re-resolve.
 			 */
-			if (!link_state.link) {
-				if (cur_link_state)
-					retrigger = true;
-				else
-					phylink_mac_pcs_get_state(pl,
-								  &link_state);
+			if (link_state.interface !=
+			    pl->phy_state.interface) {
+				retrigger = true;
+				link_state.link = false;
 			}
+			link_state.interface = pl->phy_state.interface;
 
-			/* If we have a phy, the "up" state is the union of
-			 * both the PHY and the MAC
+			/* If we are doing rate matching, then the
+			 * link speed/duplex comes from the PHY
 			 */
-			if (pl->phydev)
-				link_state.link &= pl->phy_state.link;
-
-			/* Only update if the PHY link is up */
-			if (pl->phydev && pl->phy_state.link) {
-				/* If the interface has changed, force a
-				 * link down event if the link isn't already
-				 * down, and re-resolve.
-				 */
-				if (link_state.interface !=
-				    pl->phy_state.interface) {
-					retrigger = true;
-					link_state.link = false;
-				}
-				link_state.interface = pl->phy_state.interface;
-
-				/* If we are doing rate matching, then the
-				 * link speed/duplex comes from the PHY
-				 */
-				if (pl->phy_state.rate_matching) {
-					link_state.rate_matching =
-						pl->phy_state.rate_matching;
-					link_state.speed = pl->phy_state.speed;
-					link_state.duplex =
-						pl->phy_state.duplex;
-				}
-
-				/* If we have a PHY, we need to update with
-				 * the PHY flow control bits.
-				 */
-				link_state.pause = pl->phy_state.pause;
-				mac_config = true;
+			if (pl->phy_state.rate_matching) {
+				link_state.rate_matching =
+					pl->phy_state.rate_matching;
+				link_state.speed = pl->phy_state.speed;
+				link_state.duplex =
+					pl->phy_state.duplex;
 			}
-			break;
+
+			/* If we have a PHY, we need to update with
+			 * the PHY flow control bits.
+			 */
+			link_state.pause = pl->phy_state.pause;
+			mac_config = true;
 		}
 	}
 
-- 
2.51.0


From bc08ce37d99a3992e975a0f397503cb23404f25a Mon Sep 17 00:00:00 2001
From: "Russell King (Oracle)" <rmk+kernel@armlinux.org.uk>
Date: Fri, 8 Nov 2024 16:02:05 +0000
Subject: [PATCH 07/16] net: phylink: clean up phylink_resolve()

Now that we have reduced the indentation level, clean up the code
formatting.

Signed-off-by: Russell King (Oracle) <rmk+kernel@armlinux.org.uk>
Link: https://patch.msgid.link/E1t9RQz-002Ff5-EA@rmk-PC.armlinux.org.uk
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 drivers/net/phy/phylink.c | 35 ++++++++++++++++-------------------
 1 file changed, 16 insertions(+), 19 deletions(-)

diff --git a/drivers/net/phy/phylink.c b/drivers/net/phy/phylink.c
index aaeb8b11e758..b1e828a4286d 100644
--- a/drivers/net/phy/phylink.c
+++ b/drivers/net/phy/phylink.c
@@ -1472,51 +1472,48 @@ static void phylink_resolve(struct work_struct *w)
 	} else {
 		phylink_mac_pcs_get_state(pl, &link_state);
 
-		/* The PCS may have a latching link-fail indicator.
-		 * If the link was up, bring the link down and
-		 * re-trigger the resolve. Otherwise, re-read the
-		 * PCS state to get the current status of the link.
+		/* The PCS may have a latching link-fail indicator. If the link
+		 * was up, bring the link down and re-trigger the resolve.
+		 * Otherwise, re-read the PCS state to get the current status
+		 * of the link.
 		 */
 		if (!link_state.link) {
 			if (cur_link_state)
 				retrigger = true;
 			else
-				phylink_mac_pcs_get_state(pl,
-							  &link_state);
+				phylink_mac_pcs_get_state(pl, &link_state);
 		}
 
-		/* If we have a phy, the "up" state is the union of
-		 * both the PHY and the MAC
+		/* If we have a phy, the "up" state is the union of both the
+		 * PHY and the MAC
 		 */
 		if (pl->phydev)
 			link_state.link &= pl->phy_state.link;
 
 		/* Only update if the PHY link is up */
 		if (pl->phydev && pl->phy_state.link) {
-			/* If the interface has changed, force a
-			 * link down event if the link isn't already
-			 * down, and re-resolve.
+			/* If the interface has changed, force a link down
+			 * event if the link isn't already down, and re-resolve.
 			 */
-			if (link_state.interface !=
-			    pl->phy_state.interface) {
+			if (link_state.interface != pl->phy_state.interface) {
 				retrigger = true;
 				link_state.link = false;
 			}
+
 			link_state.interface = pl->phy_state.interface;
 
-			/* If we are doing rate matching, then the
-			 * link speed/duplex comes from the PHY
+			/* If we are doing rate matching, then the link
+			 * speed/duplex comes from the PHY
 			 */
 			if (pl->phy_state.rate_matching) {
 				link_state.rate_matching =
 					pl->phy_state.rate_matching;
 				link_state.speed = pl->phy_state.speed;
-				link_state.duplex =
-					pl->phy_state.duplex;
+				link_state.duplex = pl->phy_state.duplex;
 			}
 
-			/* If we have a PHY, we need to update with
-			 * the PHY flow control bits.
+			/* If we have a PHY, we need to update with the PHY
+			 * flow control bits.
 			 */
 			link_state.pause = pl->phy_state.pause;
 			mac_config = true;
-- 
2.51.0


From 43271bb5bf67e78def9c2898040505e7cb5935f3 Mon Sep 17 00:00:00 2001
From: Breno Leitao <leitao@debian.org>
Date: Fri, 8 Nov 2024 06:59:25 -0800
Subject: [PATCH 08/16] net: netconsole: selftests: Check if netdevsim is
 available

The netconsole selftest relies on the availability of the netdevsim module.
To ensure the test can run correctly, we need to check if the netdevsim
module is either loaded or built-in before proceeding.

Update the netconsole selftest to check for the existence of
the /sys/bus/netdevsim/new_device file before running the test. If the
file is not found, the test is skipped with an explanation that the
CONFIG_NETDEVSIM kernel config option may not be enabled.

Signed-off-by: Breno Leitao <leitao@debian.org>
Reviewed-by: Simon Horman <horms@kernel.org>
Link: https://patch.msgid.link/20241108-netcon_selftest_deps-v1-1-1789cbf3adcd@debian.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 tools/testing/selftests/drivers/net/netcons_basic.sh | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/tools/testing/selftests/drivers/net/netcons_basic.sh b/tools/testing/selftests/drivers/net/netcons_basic.sh
index 182eb1a97e59..b175f4d966e5 100755
--- a/tools/testing/selftests/drivers/net/netcons_basic.sh
+++ b/tools/testing/selftests/drivers/net/netcons_basic.sh
@@ -39,6 +39,7 @@ NAMESPACE=""
 # IDs for netdevsim
 NSIM_DEV_1_ID=$((256 + RANDOM % 256))
 NSIM_DEV_2_ID=$((512 + RANDOM % 256))
+NSIM_DEV_SYS_NEW="/sys/bus/netdevsim/new_device"
 
 # Used to create and delete namespaces
 source "${SCRIPTDIR}"/../../net/lib.sh
@@ -46,7 +47,6 @@ source "${SCRIPTDIR}"/../../net/net_helper.sh
 
 # Create netdevsim interfaces
 create_ifaces() {
-	local NSIM_DEV_SYS_NEW=/sys/bus/netdevsim/new_device
 
 	echo "$NSIM_DEV_2_ID" > "$NSIM_DEV_SYS_NEW"
 	echo "$NSIM_DEV_1_ID" > "$NSIM_DEV_SYS_NEW"
@@ -212,6 +212,11 @@ function check_for_dependencies() {
 		exit "${ksft_skip}"
 	fi
 
+	if [ ! -f "${NSIM_DEV_SYS_NEW}" ]; then
+		echo "SKIP: file ${NSIM_DEV_SYS_NEW} does not exist. Check if CONFIG_NETDEVSIM is enabled" >&2
+		exit "${ksft_skip}"
+	fi
+
 	if [ ! -d "${NETCONS_CONFIGFS}" ]; then
 		echo "SKIP: directory ${NETCONS_CONFIGFS} does not exist. Check if NETCONSOLE_DYNAMIC is enabled" >&2
 		exit "${ksft_skip}"
-- 
2.51.0


From 7d3f3b4367f315a61fc615e3138f3d320da8c466 Mon Sep 17 00:00:00 2001
From: Vladimir Vdovin <deliran@verdict.gg>
Date: Fri, 8 Nov 2024 09:34:24 +0000
Subject: [PATCH 09/16] net: ipv4: Cache pmtu for all packet paths if multipath
 enabled

Check number of paths by fib_info_num_path(),
and update_or_create_fnhe() for every path.
Problem is that pmtu is cached only for the oif
that has received icmp message "need to frag",
other oifs will still try to use "default" iface mtu.

An example topology showing the problem:

                    |  host1
                +---------+
                |  dummy0 | 10.179.20.18/32  mtu9000
                +---------+
        +-----------+----------------+
    +---------+                     +---------+
    | ens17f0 |  10.179.2.141/31    | ens17f1 |  10.179.2.13/31
    +---------+                     +---------+
        |    (all here have mtu 9000)    |
    +------+                         +------+
    | ro1  |  10.179.2.140/31        | ro2  |  10.179.2.12/31
    +------+                         +------+
        |                                |
---------+------------+-------------------+------
                        |
                    +-----+
                    | ro3 | 10.10.10.10  mtu1500
                    +-----+
                        |
    ========================================
                some networks
    ========================================
                        |
                    +-----+
                    | eth0| 10.10.30.30  mtu9000
                    +-----+
                        |  host2

host1 have enabled multipath and
sysctl net.ipv4.fib_multipath_hash_policy = 1:

default proto static src 10.179.20.18
        nexthop via 10.179.2.12 dev ens17f1 weight 1
        nexthop via 10.179.2.140 dev ens17f0 weight 1

When host1 tries to do pmtud from 10.179.20.18/32 to host2,
host1 receives at ens17f1 iface an icmp packet from ro3 that ro3 mtu=1500.
And host1 caches it in nexthop exceptions cache.

Problem is that it is cached only for the iface that has received icmp,
and there is no way that ro3 will send icmp msg to host1 via another path.

Host1 now have this routes to host2:

ip r g 10.10.30.30 sport 30000 dport 443
10.10.30.30 via 10.179.2.12 dev ens17f1 src 10.179.20.18 uid 0
    cache expires 521sec mtu 1500

ip r g 10.10.30.30 sport 30033 dport 443
10.10.30.30 via 10.179.2.140 dev ens17f0 src 10.179.20.18 uid 0
    cache

So when host1 tries again to reach host2 with mtu>1500,
if packet flow is lucky enough to be hashed with oif=ens17f1 its ok,
if oif=ens17f0 it blackholes and still gets icmp msgs from ro3 to ens17f1,
until lucky day when ro3 will send it through another flow to ens17f0.

Signed-off-by: Vladimir Vdovin <deliran@verdict.gg>
Reviewed-by: Ido Schimmel <idosch@nvidia.com>
Link: https://patch.msgid.link/20241108093427.317942-1-deliran@verdict.gg
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 net/ipv4/route.c                    |  13 ++++
 tools/testing/selftests/net/pmtu.sh | 112 +++++++++++++++++++++++-----
 2 files changed, 108 insertions(+), 17 deletions(-)

diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 4c5e773002fe..ccdbe9c70132 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -1027,6 +1027,19 @@ static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu)
 		struct fib_nh_common *nhc;
 
 		fib_select_path(net, &res, fl4, NULL);
+#ifdef CONFIG_IP_ROUTE_MULTIPATH
+		if (fib_info_num_path(res.fi) > 1) {
+			int nhsel;
+
+			for (nhsel = 0; nhsel < fib_info_num_path(res.fi); nhsel++) {
+				nhc = fib_info_nhc(res.fi, nhsel);
+				update_or_create_fnhe(nhc, fl4->daddr, 0, mtu, lock,
+						      jiffies + net->ipv4.ip_rt_mtu_expires);
+			}
+			rcu_read_unlock();
+			return;
+		}
+#endif /* CONFIG_IP_ROUTE_MULTIPATH */
 		nhc = FIB_RES_NHC(res);
 		update_or_create_fnhe(nhc, fl4->daddr, 0, mtu, lock,
 				      jiffies + net->ipv4.ip_rt_mtu_expires);
diff --git a/tools/testing/selftests/net/pmtu.sh b/tools/testing/selftests/net/pmtu.sh
index 6c651c880fe8..66be7699c72c 100755
--- a/tools/testing/selftests/net/pmtu.sh
+++ b/tools/testing/selftests/net/pmtu.sh
@@ -197,6 +197,12 @@
 #
 # - pmtu_ipv6_route_change
 #	Same as above but with IPv6
+#
+# - pmtu_ipv4_mp_exceptions
+#	Use the same topology as in pmtu_ipv4, but add routeable addresses
+#	on host A and B on lo reachable via both routers. Host A and B
+#	addresses have multipath routes to each other, b_r1 mtu = 1500.
+#	Check that PMTU exceptions are created for both paths.
 
 source lib.sh
 source net_helper.sh
@@ -266,7 +272,8 @@ tests="
 	list_flush_ipv4_exception	ipv4: list and flush cached exceptions	1
 	list_flush_ipv6_exception	ipv6: list and flush cached exceptions	1
 	pmtu_ipv4_route_change		ipv4: PMTU exception w/route replace	1
-	pmtu_ipv6_route_change		ipv6: PMTU exception w/route replace	1"
+	pmtu_ipv6_route_change		ipv6: PMTU exception w/route replace	1
+	pmtu_ipv4_mp_exceptions		ipv4: PMTU multipath nh exceptions	1"
 
 # Addressing and routing for tests with routers: four network segments, with
 # index SEGMENT between 1 and 4, a common prefix (PREFIX4 or PREFIX6) and an
@@ -343,6 +350,9 @@ tunnel6_a_addr="fd00:2::a"
 tunnel6_b_addr="fd00:2::b"
 tunnel6_mask="64"
 
+host4_a_addr="192.168.99.99"
+host4_b_addr="192.168.88.88"
+
 dummy6_0_prefix="fc00:1000::"
 dummy6_1_prefix="fc00:1001::"
 dummy6_mask="64"
@@ -984,6 +994,52 @@ setup_ovs_bridge() {
 	run_cmd ip route add ${prefix6}:${b_r1}::1 via ${prefix6}:${a_r1}::2
 }
 
+setup_multipath_new() {
+	# Set up host A with multipath routes to host B host4_b_addr
+	run_cmd ${ns_a} ip addr add ${host4_a_addr} dev lo
+	run_cmd ${ns_a} ip nexthop add id 401 via ${prefix4}.${a_r1}.2 dev veth_A-R1
+	run_cmd ${ns_a} ip nexthop add id 402 via ${prefix4}.${a_r2}.2 dev veth_A-R2
+	run_cmd ${ns_a} ip nexthop add id 403 group 401/402
+	run_cmd ${ns_a} ip route add ${host4_b_addr} src ${host4_a_addr} nhid 403
+
+	# Set up host B with multipath routes to host A host4_a_addr
+	run_cmd ${ns_b} ip addr add ${host4_b_addr} dev lo
+	run_cmd ${ns_b} ip nexthop add id 401 via ${prefix4}.${b_r1}.2 dev veth_B-R1
+	run_cmd ${ns_b} ip nexthop add id 402 via ${prefix4}.${b_r2}.2 dev veth_B-R2
+	run_cmd ${ns_b} ip nexthop add id 403 group 401/402
+	run_cmd ${ns_b} ip route add ${host4_a_addr} src ${host4_b_addr} nhid 403
+}
+
+setup_multipath_old() {
+	# Set up host A with multipath routes to host B host4_b_addr
+	run_cmd ${ns_a} ip addr add ${host4_a_addr} dev lo
+	run_cmd ${ns_a} ip route add ${host4_b_addr} \
+			src ${host4_a_addr} \
+			nexthop via ${prefix4}.${a_r1}.2 weight 1 \
+			nexthop via ${prefix4}.${a_r2}.2 weight 1
+
+	# Set up host B with multipath routes to host A host4_a_addr
+	run_cmd ${ns_b} ip addr add ${host4_b_addr} dev lo
+	run_cmd ${ns_b} ip route add ${host4_a_addr} \
+			src ${host4_b_addr} \
+			nexthop via ${prefix4}.${b_r1}.2 weight 1 \
+			nexthop via ${prefix4}.${b_r2}.2 weight 1
+}
+
+setup_multipath() {
+	if [ "$USE_NH" = "yes" ]; then
+		setup_multipath_new
+	else
+		setup_multipath_old
+	fi
+
+	# Set up routers with routes to dummies
+	run_cmd ${ns_r1} ip route add ${host4_a_addr} via ${prefix4}.${a_r1}.1
+	run_cmd ${ns_r2} ip route add ${host4_a_addr} via ${prefix4}.${a_r2}.1
+	run_cmd ${ns_r1} ip route add ${host4_b_addr} via ${prefix4}.${b_r1}.1
+	run_cmd ${ns_r2} ip route add ${host4_b_addr} via ${prefix4}.${b_r2}.1
+}
+
 setup() {
 	[ "$(id -u)" -ne 0 ] && echo "  need to run as root" && return $ksft_skip
 
@@ -1076,23 +1132,15 @@ link_get_mtu() {
 }
 
 route_get_dst_exception() {
-	ns_cmd="${1}"
-	dst="${2}"
-	dsfield="${3}"
+	ns_cmd="${1}"; shift
 
-	if [ -z "${dsfield}" ]; then
-		dsfield=0
-	fi
-
-	${ns_cmd} ip route get "${dst}" dsfield "${dsfield}"
+	${ns_cmd} ip route get "$@"
 }
 
 route_get_dst_pmtu_from_exception() {
-	ns_cmd="${1}"
-	dst="${2}"
-	dsfield="${3}"
+	ns_cmd="${1}"; shift
 
-	mtu_parse "$(route_get_dst_exception "${ns_cmd}" "${dst}" "${dsfield}")"
+	mtu_parse "$(route_get_dst_exception "${ns_cmd}" "$@")"
 }
 
 check_pmtu_value() {
@@ -1235,10 +1283,10 @@ test_pmtu_ipv4_dscp_icmp_exception() {
 	run_cmd "${ns_a}" ping -q -M want -Q "${dsfield}" -c 1 -w 1 -s "${len}" "${dst2}"
 
 	# Check that exceptions have been created with the correct PMTU
-	pmtu_1="$(route_get_dst_pmtu_from_exception "${ns_a}" "${dst1}" "${policy_mark}")"
+	pmtu_1="$(route_get_dst_pmtu_from_exception "${ns_a}" "${dst1}" dsfield "${policy_mark}")"
 	check_pmtu_value "1400" "${pmtu_1}" "exceeding MTU" || return 1
 
-	pmtu_2="$(route_get_dst_pmtu_from_exception "${ns_a}" "${dst2}" "${policy_mark}")"
+	pmtu_2="$(route_get_dst_pmtu_from_exception "${ns_a}" "${dst2}" dsfield "${policy_mark}")"
 	check_pmtu_value "1500" "${pmtu_2}" "exceeding MTU" || return 1
 }
 
@@ -1285,9 +1333,9 @@ test_pmtu_ipv4_dscp_udp_exception() {
 		UDP:"${dst2}":50000,tos="${dsfield}"
 
 	# Check that exceptions have been created with the correct PMTU
-	pmtu_1="$(route_get_dst_pmtu_from_exception "${ns_a}" "${dst1}" "${policy_mark}")"
+	pmtu_1="$(route_get_dst_pmtu_from_exception "${ns_a}" "${dst1}" dsfield "${policy_mark}")"
 	check_pmtu_value "1400" "${pmtu_1}" "exceeding MTU" || return 1
-	pmtu_2="$(route_get_dst_pmtu_from_exception "${ns_a}" "${dst2}" "${policy_mark}")"
+	pmtu_2="$(route_get_dst_pmtu_from_exception "${ns_a}" "${dst2}" dsfield "${policy_mark}")"
 	check_pmtu_value "1500" "${pmtu_2}" "exceeding MTU" || return 1
 }
 
@@ -2329,6 +2377,36 @@ test_pmtu_ipv6_route_change() {
 	test_pmtu_ipvX_route_change 6
 }
 
+test_pmtu_ipv4_mp_exceptions() {
+	setup namespaces routing multipath || return $ksft_skip
+
+	trace "${ns_a}"  veth_A-R1    "${ns_r1}" veth_R1-A \
+	      "${ns_r1}" veth_R1-B    "${ns_b}"  veth_B-R1 \
+	      "${ns_a}"  veth_A-R2    "${ns_r2}" veth_R2-A \
+	      "${ns_r2}" veth_R2-B    "${ns_b}"  veth_B-R2
+
+	# Set up initial MTU values
+	mtu "${ns_a}"  veth_A-R1 2000
+	mtu "${ns_r1}" veth_R1-A 2000
+	mtu "${ns_r1}" veth_R1-B 1500
+	mtu "${ns_b}"  veth_B-R1 1500
+
+	mtu "${ns_a}"  veth_A-R2 2000
+	mtu "${ns_r2}" veth_R2-A 2000
+	mtu "${ns_r2}" veth_R2-B 1500
+	mtu "${ns_b}"  veth_B-R2 1500
+
+	# Ping and expect two nexthop exceptions for two routes
+	run_cmd ${ns_a} ping -q -M want -i 0.1 -c 1 -s 1800 "${host4_b_addr}"
+
+	# Check that exceptions have been created with the correct PMTU
+	pmtu_a_R1="$(route_get_dst_pmtu_from_exception "${ns_a}" "${host4_b_addr}" oif veth_A-R1)"
+	pmtu_a_R2="$(route_get_dst_pmtu_from_exception "${ns_a}" "${host4_b_addr}" oif veth_A-R2)"
+
+	check_pmtu_value "1500" "${pmtu_a_R1}" "exceeding MTU (veth_A-R1)" || return 1
+	check_pmtu_value "1500" "${pmtu_a_R2}" "exceeding MTU (veth_A-R2)" || return 1
+}
+
 usage() {
 	echo
 	echo "$0 [OPTIONS] [TEST]..."
-- 
2.51.0


From ab85ebf437231ceaf359c2a4679bebd7e8d6bdb2 Mon Sep 17 00:00:00 2001
From: Patrisious Haddad <phaddad@nvidia.com>
Date: Thu, 7 Nov 2024 21:43:46 +0200
Subject: [PATCH 10/16] net/mlx5: E-switch, refactor eswitch mode change

The E-switch mode was previously updated before removing and re-adding the
IB device, which could cause a temporary mismatch between the E-switch mode
and the IB device configuration.

To prevent this discrepancy, the IB device is now removed first, then
the E-switch mode is updated, and finally, the IB device is re-added.
This sequence ensures consistent alignment between the E-switch mode and
the IB device whenever the mode changes, regardless of the new mode value.

Signed-off-by: Patrisious Haddad <phaddad@nvidia.com>
Reviewed-by: Mark Bloch <mbloch@nvidia.com>
Signed-off-by: Tariq Toukan <tariqt@nvidia.com>
Link: https://patch.msgid.link/20241107194357.683732-2-tariqt@nvidia.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 .../net/ethernet/mellanox/mlx5/core/eswitch.c |  1 -
 .../mellanox/mlx5/core/eswitch_offloads.c     | 26 +++++++++++++++----
 2 files changed, 21 insertions(+), 6 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c
index cead41ddbc38..d0dab8f4e1a3 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c
@@ -1490,7 +1490,6 @@ int mlx5_eswitch_enable_locked(struct mlx5_eswitch *esw, int num_vfs)
 	if (esw->mode == MLX5_ESWITCH_LEGACY) {
 		err = esw_legacy_enable(esw);
 	} else {
-		mlx5_rescan_drivers(esw->dev);
 		err = esw_offloads_enable(esw);
 	}
 
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
index fd34f43d18d5..5f1adebd9669 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
@@ -2332,18 +2332,35 @@ out_free:
 	return err;
 }
 
+static void esw_mode_change(struct mlx5_eswitch *esw, u16 mode)
+{
+	mlx5_devcom_comp_lock(esw->dev->priv.hca_devcom_comp);
+
+	if (esw->dev->priv.flags & MLX5_PRIV_FLAGS_DISABLE_IB_ADEV) {
+		esw->mode = mode;
+		mlx5_devcom_comp_unlock(esw->dev->priv.hca_devcom_comp);
+		return;
+	}
+
+	esw->dev->priv.flags |= MLX5_PRIV_FLAGS_DISABLE_IB_ADEV;
+	mlx5_rescan_drivers_locked(esw->dev);
+	esw->mode = mode;
+	esw->dev->priv.flags &= ~MLX5_PRIV_FLAGS_DISABLE_IB_ADEV;
+	mlx5_rescan_drivers_locked(esw->dev);
+	mlx5_devcom_comp_unlock(esw->dev->priv.hca_devcom_comp);
+}
+
 static int esw_offloads_start(struct mlx5_eswitch *esw,
 			      struct netlink_ext_ack *extack)
 {
 	int err;
 
-	esw->mode = MLX5_ESWITCH_OFFLOADS;
+	esw_mode_change(esw, MLX5_ESWITCH_OFFLOADS);
 	err = mlx5_eswitch_enable_locked(esw, esw->dev->priv.sriov.num_vfs);
 	if (err) {
 		NL_SET_ERR_MSG_MOD(extack,
 				   "Failed setting eswitch to offloads");
-		esw->mode = MLX5_ESWITCH_LEGACY;
-		mlx5_rescan_drivers(esw->dev);
+		esw_mode_change(esw, MLX5_ESWITCH_LEGACY);
 		return err;
 	}
 	if (esw->offloads.inline_mode == MLX5_INLINE_MODE_NONE) {
@@ -3584,7 +3601,7 @@ static int esw_offloads_stop(struct mlx5_eswitch *esw,
 {
 	int err;
 
-	esw->mode = MLX5_ESWITCH_LEGACY;
+	esw_mode_change(esw, MLX5_ESWITCH_LEGACY);
 
 	/* If changing from switchdev to legacy mode without sriov enabled,
 	 * no need to create legacy fdb.
@@ -3770,7 +3787,6 @@ int mlx5_devlink_eswitch_mode_set(struct devlink *devlink, u16 mode,
 		err = esw_offloads_start(esw, extack);
 	} else if (mode == DEVLINK_ESWITCH_MODE_LEGACY) {
 		err = esw_offloads_stop(esw, extack);
-		mlx5_rescan_drivers(esw->dev);
 	} else {
 		err = -EINVAL;
 	}
-- 
2.51.0


From 5a731857656e3988935108f48800cd764a550005 Mon Sep 17 00:00:00 2001
From: Carolina Jubran <cjubran@nvidia.com>
Date: Thu, 7 Nov 2024 21:43:47 +0200
Subject: [PATCH 11/16] net/mlx5: Simplify QoS normalization by removing error
 handling

This change updates esw_qos_normalize_min_rate to not return errors,
significantly simplifying the code.

Normalization failures are software bugs, and it's unnecessary to
handle them with rollback mechanisms. Instead,
`esw_qos_update_sched_node_bw_share` and `esw_qos_normalize_min_rate`
now return void, with any errors logged as warnings to indicate
potential software issues.

This approach avoids compensating for hidden bugs and removes error
handling from all places that perform normalization, streamlining
future patches.

Signed-off-by: Carolina Jubran <cjubran@nvidia.com>
Reviewed-by: Cosmin Ratiu <cratiu@nvidia.com>
Signed-off-by: Tariq Toukan <tariqt@nvidia.com>
Link: https://patch.msgid.link/20241107194357.683732-3-tariqt@nvidia.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 .../net/ethernet/mellanox/mlx5/core/esw/qos.c | 72 +++++--------------
 1 file changed, 17 insertions(+), 55 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/esw/qos.c b/drivers/net/ethernet/mellanox/mlx5/core/esw/qos.c
index 940e1c2d1e39..0c371f27c693 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/esw/qos.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/esw/qos.c
@@ -208,64 +208,49 @@ static u32 esw_qos_calc_bw_share(u32 min_rate, u32 divider, u32 fw_max)
 	return min_t(u32, max_t(u32, DIV_ROUND_UP(min_rate, divider), MLX5_MIN_BW_SHARE), fw_max);
 }
 
-static int esw_qos_update_sched_node_bw_share(struct mlx5_esw_sched_node *node,
-					      u32 divider,
-					      struct netlink_ext_ack *extack)
+static void esw_qos_update_sched_node_bw_share(struct mlx5_esw_sched_node *node,
+					       u32 divider,
+					       struct netlink_ext_ack *extack)
 {
 	u32 fw_max_bw_share = MLX5_CAP_QOS(node->esw->dev, max_tsar_bw_share);
 	u32 bw_share;
-	int err;
 
 	bw_share = esw_qos_calc_bw_share(node->min_rate, divider, fw_max_bw_share);
 
 	if (bw_share == node->bw_share)
-		return 0;
-
-	err = esw_qos_sched_elem_config(node, node->max_rate, bw_share, extack);
-	if (err)
-		return err;
+		return;
 
+	esw_qos_sched_elem_config(node, node->max_rate, bw_share, extack);
 	node->bw_share = bw_share;
-
-	return err;
 }
 
-static int esw_qos_normalize_min_rate(struct mlx5_eswitch *esw,
-				      struct mlx5_esw_sched_node *parent,
-				      struct netlink_ext_ack *extack)
+static void esw_qos_normalize_min_rate(struct mlx5_eswitch *esw,
+				       struct mlx5_esw_sched_node *parent,
+				       struct netlink_ext_ack *extack)
 {
 	struct list_head *nodes = parent ? &parent->children : &esw->qos.domain->nodes;
 	u32 divider = esw_qos_calculate_min_rate_divider(esw, parent);
 	struct mlx5_esw_sched_node *node;
 
 	list_for_each_entry(node, nodes, entry) {
-		int err;
-
 		if (node->esw != esw || node->ix == esw->qos.root_tsar_ix)
 			continue;
 
-		err = esw_qos_update_sched_node_bw_share(node, divider, extack);
-		if (err)
-			return err;
+		esw_qos_update_sched_node_bw_share(node, divider, extack);
 
 		if (list_empty(&node->children))
 			continue;
 
-		err = esw_qos_normalize_min_rate(node->esw, node, extack);
-		if (err)
-			return err;
+		esw_qos_normalize_min_rate(node->esw, node, extack);
 	}
-
-	return 0;
 }
 
 static int esw_qos_set_vport_min_rate(struct mlx5_vport *vport,
 				      u32 min_rate, struct netlink_ext_ack *extack)
 {
 	struct mlx5_esw_sched_node *vport_node = vport->qos.sched_node;
-	u32 fw_max_bw_share, previous_min_rate;
 	bool min_rate_supported;
-	int err;
+	u32 fw_max_bw_share;
 
 	esw_assert_qos_lock_held(vport_node->esw);
 	fw_max_bw_share = MLX5_CAP_QOS(vport->dev, max_tsar_bw_share);
@@ -276,13 +261,10 @@ static int esw_qos_set_vport_min_rate(struct mlx5_vport *vport,
 	if (min_rate == vport_node->min_rate)
 		return 0;
 
-	previous_min_rate = vport_node->min_rate;
 	vport_node->min_rate = min_rate;
-	err = esw_qos_normalize_min_rate(vport_node->parent->esw, vport_node->parent, extack);
-	if (err)
-		vport_node->min_rate = previous_min_rate;
+	esw_qos_normalize_min_rate(vport_node->parent->esw, vport_node->parent, extack);
 
-	return err;
+	return 0;
 }
 
 static int esw_qos_set_vport_max_rate(struct mlx5_vport *vport,
@@ -316,8 +298,6 @@ static int esw_qos_set_node_min_rate(struct mlx5_esw_sched_node *node,
 				     u32 min_rate, struct netlink_ext_ack *extack)
 {
 	struct mlx5_eswitch *esw = node->esw;
-	u32 previous_min_rate;
-	int err;
 
 	if (!MLX5_CAP_QOS(esw->dev, esw_bw_share) ||
 	    MLX5_CAP_QOS(esw->dev, max_tsar_bw_share) < MLX5_MIN_BW_SHARE)
@@ -326,19 +306,10 @@ static int esw_qos_set_node_min_rate(struct mlx5_esw_sched_node *node,
 	if (min_rate == node->min_rate)
 		return 0;
 
-	previous_min_rate = node->min_rate;
 	node->min_rate = min_rate;
-	err = esw_qos_normalize_min_rate(esw, NULL, extack);
-	if (err) {
-		NL_SET_ERR_MSG_MOD(extack, "E-Switch node min rate setting failed");
-
-		/* Attempt restoring previous configuration */
-		node->min_rate = previous_min_rate;
-		if (esw_qos_normalize_min_rate(esw, NULL, extack))
-			NL_SET_ERR_MSG_MOD(extack, "E-Switch BW share restore failed");
-	}
+	esw_qos_normalize_min_rate(esw, NULL, extack);
 
-	return err;
+	return 0;
 }
 
 static int esw_qos_set_node_max_rate(struct mlx5_esw_sched_node *node,
@@ -552,17 +523,11 @@ __esw_qos_create_vports_sched_node(struct mlx5_eswitch *esw, struct mlx5_esw_sch
 		goto err_alloc_node;
 	}
 
-	err = esw_qos_normalize_min_rate(esw, NULL, extack);
-	if (err) {
-		NL_SET_ERR_MSG_MOD(extack, "E-Switch nodes normalization failed");
-		goto err_min_rate;
-	}
+	esw_qos_normalize_min_rate(esw, NULL, extack);
 	trace_mlx5_esw_node_qos_create(esw->dev, node, node->ix);
 
 	return node;
 
-err_min_rate:
-	__esw_qos_free_node(node);
 err_alloc_node:
 	if (mlx5_destroy_scheduling_element_cmd(esw->dev,
 						SCHEDULING_HIERARCHY_E_SWITCH,
@@ -609,10 +574,7 @@ static int __esw_qos_destroy_node(struct mlx5_esw_sched_node *node, struct netli
 		NL_SET_ERR_MSG_MOD(extack, "E-Switch destroy TSAR_ID failed");
 	__esw_qos_free_node(node);
 
-	err = esw_qos_normalize_min_rate(esw, NULL, extack);
-	if (err)
-		NL_SET_ERR_MSG_MOD(extack, "E-Switch nodes normalization failed");
-
+	esw_qos_normalize_min_rate(esw, NULL, extack);
 
 	return err;
 }
-- 
2.51.0


From ac778fefed340e019bc9c022842b4a2cc5713559 Mon Sep 17 00:00:00 2001
From: Carolina Jubran <cjubran@nvidia.com>
Date: Thu, 7 Nov 2024 21:43:48 +0200
Subject: [PATCH 12/16] net/mlx5: Generalize max_rate and min_rate setting for
 nodes

Refactor max_rate and min_rate setting functions to operate on
mlx5_esw_sched_node, allowing for generalized handling of both vports
and nodes.

Signed-off-by: Carolina Jubran <cjubran@nvidia.com>
Reviewed-by: Cosmin Ratiu <cratiu@nvidia.com>
Signed-off-by: Tariq Toukan <tariqt@nvidia.com>
Link: https://patch.msgid.link/20241107194357.683732-4-tariqt@nvidia.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 .../net/ethernet/mellanox/mlx5/core/esw/qos.c | 69 ++++---------------
 1 file changed, 13 insertions(+), 56 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/esw/qos.c b/drivers/net/ethernet/mellanox/mlx5/core/esw/qos.c
index 0c371f27c693..82805bb20c76 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/esw/qos.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/esw/qos.c
@@ -245,69 +245,20 @@ static void esw_qos_normalize_min_rate(struct mlx5_eswitch *esw,
 	}
 }
 
-static int esw_qos_set_vport_min_rate(struct mlx5_vport *vport,
-				      u32 min_rate, struct netlink_ext_ack *extack)
-{
-	struct mlx5_esw_sched_node *vport_node = vport->qos.sched_node;
-	bool min_rate_supported;
-	u32 fw_max_bw_share;
-
-	esw_assert_qos_lock_held(vport_node->esw);
-	fw_max_bw_share = MLX5_CAP_QOS(vport->dev, max_tsar_bw_share);
-	min_rate_supported = MLX5_CAP_QOS(vport->dev, esw_bw_share) &&
-				fw_max_bw_share >= MLX5_MIN_BW_SHARE;
-	if (min_rate && !min_rate_supported)
-		return -EOPNOTSUPP;
-	if (min_rate == vport_node->min_rate)
-		return 0;
-
-	vport_node->min_rate = min_rate;
-	esw_qos_normalize_min_rate(vport_node->parent->esw, vport_node->parent, extack);
-
-	return 0;
-}
-
-static int esw_qos_set_vport_max_rate(struct mlx5_vport *vport,
-				      u32 max_rate, struct netlink_ext_ack *extack)
-{
-	struct mlx5_esw_sched_node *vport_node = vport->qos.sched_node;
-	u32 act_max_rate = max_rate;
-	bool max_rate_supported;
-	int err;
-
-	esw_assert_qos_lock_held(vport_node->esw);
-	max_rate_supported = MLX5_CAP_QOS(vport->dev, esw_rate_limit);
-
-	if (max_rate && !max_rate_supported)
-		return -EOPNOTSUPP;
-	if (max_rate == vport_node->max_rate)
-		return 0;
-
-	/* Use parent node limit if new max rate is 0. */
-	if (!max_rate)
-		act_max_rate = vport_node->parent->max_rate;
-
-	err = esw_qos_sched_elem_config(vport_node, act_max_rate, vport_node->bw_share, extack);
-	if (!err)
-		vport_node->max_rate = max_rate;
-
-	return err;
-}
-
 static int esw_qos_set_node_min_rate(struct mlx5_esw_sched_node *node,
 				     u32 min_rate, struct netlink_ext_ack *extack)
 {
 	struct mlx5_eswitch *esw = node->esw;
 
-	if (!MLX5_CAP_QOS(esw->dev, esw_bw_share) ||
-	    MLX5_CAP_QOS(esw->dev, max_tsar_bw_share) < MLX5_MIN_BW_SHARE)
+	if (min_rate && (!MLX5_CAP_QOS(esw->dev, esw_bw_share) ||
+			 MLX5_CAP_QOS(esw->dev, max_tsar_bw_share) < MLX5_MIN_BW_SHARE))
 		return -EOPNOTSUPP;
 
 	if (min_rate == node->min_rate)
 		return 0;
 
 	node->min_rate = min_rate;
-	esw_qos_normalize_min_rate(esw, NULL, extack);
+	esw_qos_normalize_min_rate(esw, node->parent, extack);
 
 	return 0;
 }
@@ -321,11 +272,17 @@ static int esw_qos_set_node_max_rate(struct mlx5_esw_sched_node *node,
 	if (node->max_rate == max_rate)
 		return 0;
 
+	/* Use parent node limit if new max rate is 0. */
+	if (!max_rate && node->parent)
+		max_rate = node->parent->max_rate;
+
 	err = esw_qos_sched_elem_config(node, max_rate, node->bw_share, extack);
 	if (err)
 		return err;
 
 	node->max_rate = max_rate;
+	if (node->type != SCHED_NODE_TYPE_VPORTS_TSAR)
+		return 0;
 
 	/* Any unlimited vports in the node should be set with the value of the node. */
 	list_for_each_entry(vport_node, &node->children, entry) {
@@ -748,9 +705,9 @@ int mlx5_esw_qos_set_vport_rate(struct mlx5_vport *vport, u32 max_rate, u32 min_
 	if (err)
 		goto unlock;
 
-	err = esw_qos_set_vport_min_rate(vport, min_rate, NULL);
+	err = esw_qos_set_node_min_rate(vport->qos.sched_node, min_rate, NULL);
 	if (!err)
-		err = esw_qos_set_vport_max_rate(vport, max_rate, NULL);
+		err = esw_qos_set_node_max_rate(vport->qos.sched_node, max_rate, NULL);
 unlock:
 	esw_qos_unlock(esw);
 	return err;
@@ -947,7 +904,7 @@ int mlx5_esw_devlink_rate_leaf_tx_share_set(struct devlink_rate *rate_leaf, void
 	if (err)
 		goto unlock;
 
-	err = esw_qos_set_vport_min_rate(vport, tx_share, extack);
+	err = esw_qos_set_node_min_rate(vport->qos.sched_node, tx_share, extack);
 unlock:
 	esw_qos_unlock(esw);
 	return err;
@@ -973,7 +930,7 @@ int mlx5_esw_devlink_rate_leaf_tx_max_set(struct devlink_rate *rate_leaf, void *
 	if (err)
 		goto unlock;
 
-	err = esw_qos_set_vport_max_rate(vport, tx_max, extack);
+	err = esw_qos_set_node_max_rate(vport->qos.sched_node, tx_max, extack);
 unlock:
 	esw_qos_unlock(esw);
 	return err;
-- 
2.51.0


From cc4bb15ffa8412bfe1e189d37edb6ca7d9918cb4 Mon Sep 17 00:00:00 2001
From: Carolina Jubran <cjubran@nvidia.com>
Date: Thu, 7 Nov 2024 21:43:49 +0200
Subject: [PATCH 13/16] net/mlx5: Refactor scheduling element configuration
 bitmasks

Refactor esw_qos_sched_elem_config to set bitmasks only when max_rate
or bw_share values change, allowing the function to configure nodes
with only one of these parameters.

This enables more flexible usage for nodes where only one parameter
requires configuration.

Remove scattered assignments and checks to centralize them within this
function, removing the now redundant esw_qos_set_node_max_rate
entirely.

With this refactor, also remove the assignment of the vport scheduling
node max rate to the parent max rate for unlimited vports
(where max rate is set to zero), as firmware already handles this
behavior.

Signed-off-by: Carolina Jubran <cjubran@nvidia.com>
Reviewed-by: Cosmin Ratiu <cratiu@nvidia.com>
Signed-off-by: Tariq Toukan <tariqt@nvidia.com>
Link: https://patch.msgid.link/20241107194357.683732-5-tariqt@nvidia.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 .../net/ethernet/mellanox/mlx5/core/esw/qos.c | 80 ++++++-------------
 1 file changed, 24 insertions(+), 56 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/esw/qos.c b/drivers/net/ethernet/mellanox/mlx5/core/esw/qos.c
index 82805bb20c76..c1e7b2425ebe 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/esw/qos.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/esw/qos.c
@@ -143,10 +143,21 @@ static int esw_qos_sched_elem_config(struct mlx5_esw_sched_node *node, u32 max_r
 	if (!MLX5_CAP_GEN(dev, qos) || !MLX5_CAP_QOS(dev, esw_scheduling))
 		return -EOPNOTSUPP;
 
-	MLX5_SET(scheduling_context, sched_ctx, max_average_bw, max_rate);
-	MLX5_SET(scheduling_context, sched_ctx, bw_share, bw_share);
-	bitmask |= MODIFY_SCHEDULING_ELEMENT_IN_MODIFY_BITMASK_MAX_AVERAGE_BW;
-	bitmask |= MODIFY_SCHEDULING_ELEMENT_IN_MODIFY_BITMASK_BW_SHARE;
+	if (bw_share && (!MLX5_CAP_QOS(dev, esw_bw_share) ||
+			 MLX5_CAP_QOS(dev, max_tsar_bw_share) < MLX5_MIN_BW_SHARE))
+		return -EOPNOTSUPP;
+
+	if (node->max_rate == max_rate && node->bw_share == bw_share)
+		return 0;
+
+	if (node->max_rate != max_rate) {
+		MLX5_SET(scheduling_context, sched_ctx, max_average_bw, max_rate);
+		bitmask |= MODIFY_SCHEDULING_ELEMENT_IN_MODIFY_BITMASK_MAX_AVERAGE_BW;
+	}
+	if (node->bw_share != bw_share) {
+		MLX5_SET(scheduling_context, sched_ctx, bw_share, bw_share);
+		bitmask |= MODIFY_SCHEDULING_ELEMENT_IN_MODIFY_BITMASK_BW_SHARE;
+	}
 
 	err = mlx5_modify_scheduling_element_cmd(dev,
 						 SCHEDULING_HIERARCHY_E_SWITCH,
@@ -160,6 +171,8 @@ static int esw_qos_sched_elem_config(struct mlx5_esw_sched_node *node, u32 max_r
 		return err;
 	}
 
+	node->max_rate = max_rate;
+	node->bw_share = bw_share;
 	if (node->type == SCHED_NODE_TYPE_VPORTS_TSAR)
 		trace_mlx5_esw_node_qos_config(dev, node, node->ix, bw_share, max_rate);
 	else if (node->type == SCHED_NODE_TYPE_VPORT)
@@ -217,11 +230,7 @@ static void esw_qos_update_sched_node_bw_share(struct mlx5_esw_sched_node *node,
 
 	bw_share = esw_qos_calc_bw_share(node->min_rate, divider, fw_max_bw_share);
 
-	if (bw_share == node->bw_share)
-		return;
-
 	esw_qos_sched_elem_config(node, node->max_rate, bw_share, extack);
-	node->bw_share = bw_share;
 }
 
 static void esw_qos_normalize_min_rate(struct mlx5_eswitch *esw,
@@ -250,10 +259,6 @@ static int esw_qos_set_node_min_rate(struct mlx5_esw_sched_node *node,
 {
 	struct mlx5_eswitch *esw = node->esw;
 
-	if (min_rate && (!MLX5_CAP_QOS(esw->dev, esw_bw_share) ||
-			 MLX5_CAP_QOS(esw->dev, max_tsar_bw_share) < MLX5_MIN_BW_SHARE))
-		return -EOPNOTSUPP;
-
 	if (min_rate == node->min_rate)
 		return 0;
 
@@ -263,41 +268,6 @@ static int esw_qos_set_node_min_rate(struct mlx5_esw_sched_node *node,
 	return 0;
 }
 
-static int esw_qos_set_node_max_rate(struct mlx5_esw_sched_node *node,
-				     u32 max_rate, struct netlink_ext_ack *extack)
-{
-	struct mlx5_esw_sched_node *vport_node;
-	int err;
-
-	if (node->max_rate == max_rate)
-		return 0;
-
-	/* Use parent node limit if new max rate is 0. */
-	if (!max_rate && node->parent)
-		max_rate = node->parent->max_rate;
-
-	err = esw_qos_sched_elem_config(node, max_rate, node->bw_share, extack);
-	if (err)
-		return err;
-
-	node->max_rate = max_rate;
-	if (node->type != SCHED_NODE_TYPE_VPORTS_TSAR)
-		return 0;
-
-	/* Any unlimited vports in the node should be set with the value of the node. */
-	list_for_each_entry(vport_node, &node->children, entry) {
-		if (vport_node->max_rate)
-			continue;
-
-		err = esw_qos_sched_elem_config(vport_node, max_rate, vport_node->bw_share, extack);
-		if (err)
-			NL_SET_ERR_MSG_MOD(extack,
-					   "E-Switch vport implicit rate limit setting failed");
-	}
-
-	return err;
-}
-
 static int esw_qos_create_node_sched_elem(struct mlx5_core_dev *dev, u32 parent_element_id,
 					  u32 *tsar_ix)
 {
@@ -367,7 +337,6 @@ static int esw_qos_update_node_scheduling_element(struct mlx5_vport *vport,
 						  struct netlink_ext_ack *extack)
 {
 	struct mlx5_esw_sched_node *vport_node = vport->qos.sched_node;
-	u32 max_rate;
 	int err;
 
 	err = mlx5_destroy_scheduling_element_cmd(curr_node->esw->dev,
@@ -378,9 +347,7 @@ static int esw_qos_update_node_scheduling_element(struct mlx5_vport *vport,
 		return err;
 	}
 
-	/* Use new node max rate if vport max rate is unlimited. */
-	max_rate = vport_node->max_rate ? vport_node->max_rate : new_node->max_rate;
-	err = esw_qos_vport_create_sched_element(vport, new_node, max_rate,
+	err = esw_qos_vport_create_sched_element(vport, new_node, vport_node->max_rate,
 						 vport_node->bw_share,
 						 &vport_node->ix);
 	if (err) {
@@ -393,8 +360,7 @@ static int esw_qos_update_node_scheduling_element(struct mlx5_vport *vport,
 	return 0;
 
 err_sched:
-	max_rate = vport_node->max_rate ? vport_node->max_rate : curr_node->max_rate;
-	if (esw_qos_vport_create_sched_element(vport, curr_node, max_rate,
+	if (esw_qos_vport_create_sched_element(vport, curr_node, vport_node->max_rate,
 					       vport_node->bw_share,
 					       &vport_node->ix))
 		esw_warn(curr_node->esw->dev, "E-Switch vport node restore failed (vport=%d)\n",
@@ -707,7 +673,8 @@ int mlx5_esw_qos_set_vport_rate(struct mlx5_vport *vport, u32 max_rate, u32 min_
 
 	err = esw_qos_set_node_min_rate(vport->qos.sched_node, min_rate, NULL);
 	if (!err)
-		err = esw_qos_set_node_max_rate(vport->qos.sched_node, max_rate, NULL);
+		err = esw_qos_sched_elem_config(vport->qos.sched_node, max_rate,
+						vport->qos.sched_node->bw_share, NULL);
 unlock:
 	esw_qos_unlock(esw);
 	return err;
@@ -930,7 +897,8 @@ int mlx5_esw_devlink_rate_leaf_tx_max_set(struct devlink_rate *rate_leaf, void *
 	if (err)
 		goto unlock;
 
-	err = esw_qos_set_node_max_rate(vport->qos.sched_node, tx_max, extack);
+	err = esw_qos_sched_elem_config(vport->qos.sched_node, tx_max,
+					vport->qos.sched_node->bw_share, extack);
 unlock:
 	esw_qos_unlock(esw);
 	return err;
@@ -965,7 +933,7 @@ int mlx5_esw_devlink_rate_node_tx_max_set(struct devlink_rate *rate_node, void *
 		return err;
 
 	esw_qos_lock(esw);
-	err = esw_qos_set_node_max_rate(node, tx_max, extack);
+	err = esw_qos_sched_elem_config(node, tx_max, node->bw_share, extack);
 	esw_qos_unlock(esw);
 	return err;
 }
-- 
2.51.0


From 663bc605d0db8782ff9c2704db5ce6cf2ac7fa93 Mon Sep 17 00:00:00 2001
From: Carolina Jubran <cjubran@nvidia.com>
Date: Thu, 7 Nov 2024 21:43:50 +0200
Subject: [PATCH 14/16] net/mlx5: Generalize scheduling element operations

Introduce helper functions to create and destroy scheduling elements,
allowing flexible configuration for different scheduling element types.

The new helper functions streamline the process by centralizing error
handling and logging through esw_qos_sched_elem_op_warn, which now
accepts the operation type (create, destroy, or modify).

The changes also adjust the esw_qos_vport_enable and
mlx5_esw_qos_vport_disable functions to leverage the new generalized
create/destroy helpers.

The destroy functions now log errors with esw_warn without returning
them. This prevents unnecessary error handling since the node was
already destroyed and no further action is required from callers.

Signed-off-by: Carolina Jubran <cjubran@nvidia.com>
Reviewed-by: Cosmin Ratiu <cratiu@nvidia.com>
Signed-off-by: Tariq Toukan <tariqt@nvidia.com>
Link: https://patch.msgid.link/20241107194357.683732-6-tariqt@nvidia.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 .../net/ethernet/mellanox/mlx5/core/esw/qos.c | 157 +++++++++---------
 1 file changed, 76 insertions(+), 81 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/esw/qos.c b/drivers/net/ethernet/mellanox/mlx5/core/esw/qos.c
index c1e7b2425ebe..155400d36a1e 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/esw/qos.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/esw/qos.c
@@ -118,18 +118,49 @@ mlx5_esw_qos_vport_get_parent(const struct mlx5_vport *vport)
 	return vport->qos.sched_node->parent;
 }
 
-static void esw_qos_sched_elem_config_warn(struct mlx5_esw_sched_node *node, int err)
+static void esw_qos_sched_elem_warn(struct mlx5_esw_sched_node *node, int err, const char *op)
 {
 	if (node->vport) {
 		esw_warn(node->esw->dev,
-			 "E-Switch modify %s scheduling element failed (vport=%d,err=%d)\n",
-			 sched_node_type_str[node->type], node->vport->vport, err);
+			 "E-Switch %s %s scheduling element failed (vport=%d,err=%d)\n",
+			 op, sched_node_type_str[node->type], node->vport->vport, err);
 		return;
 	}
 
 	esw_warn(node->esw->dev,
-		 "E-Switch modify %s scheduling element failed (err=%d)\n",
-		 sched_node_type_str[node->type], err);
+		 "E-Switch %s %s scheduling element failed (err=%d)\n",
+		 op, sched_node_type_str[node->type], err);
+}
+
+static int esw_qos_node_create_sched_element(struct mlx5_esw_sched_node *node, void *ctx,
+					     struct netlink_ext_ack *extack)
+{
+	int err;
+
+	err = mlx5_create_scheduling_element_cmd(node->esw->dev, SCHEDULING_HIERARCHY_E_SWITCH, ctx,
+						 &node->ix);
+	if (err) {
+		esw_qos_sched_elem_warn(node, err, "create");
+		NL_SET_ERR_MSG_MOD(extack, "E-Switch create scheduling element failed");
+	}
+
+	return err;
+}
+
+static int esw_qos_node_destroy_sched_element(struct mlx5_esw_sched_node *node,
+					      struct netlink_ext_ack *extack)
+{
+	int err;
+
+	err = mlx5_destroy_scheduling_element_cmd(node->esw->dev,
+						  SCHEDULING_HIERARCHY_E_SWITCH,
+						  node->ix);
+	if (err) {
+		esw_qos_sched_elem_warn(node, err, "destroy");
+		NL_SET_ERR_MSG_MOD(extack, "E-Switch destroying scheduling element failed.");
+	}
+
+	return err;
 }
 
 static int esw_qos_sched_elem_config(struct mlx5_esw_sched_node *node, u32 max_rate, u32 bw_share,
@@ -165,7 +196,7 @@ static int esw_qos_sched_elem_config(struct mlx5_esw_sched_node *node, u32 max_r
 						 node->ix,
 						 bitmask);
 	if (err) {
-		esw_qos_sched_elem_config_warn(node, err);
+		esw_qos_sched_elem_warn(node, err, "modify");
 		NL_SET_ERR_MSG_MOD(extack, "E-Switch modify scheduling element failed");
 
 		return err;
@@ -295,14 +326,12 @@ static int esw_qos_create_node_sched_elem(struct mlx5_core_dev *dev, u32 parent_
 						  tsar_ix);
 }
 
-static int
-esw_qos_vport_create_sched_element(struct mlx5_vport *vport, struct mlx5_esw_sched_node *parent,
-				   u32 max_rate, u32 bw_share, u32 *sched_elem_ix)
+static int esw_qos_vport_create_sched_element(struct mlx5_esw_sched_node *vport_node, u32 bw_share,
+					      struct netlink_ext_ack *extack)
 {
 	u32 sched_ctx[MLX5_ST_SZ_DW(scheduling_context)] = {};
-	struct mlx5_core_dev *dev = parent->esw->dev;
+	struct mlx5_core_dev *dev = vport_node->esw->dev;
 	void *attr;
-	int err;
 
 	if (!mlx5_qos_element_type_supported(dev,
 					     SCHEDULING_CONTEXT_ELEMENT_TYPE_VPORT,
@@ -312,23 +341,12 @@ esw_qos_vport_create_sched_element(struct mlx5_vport *vport, struct mlx5_esw_sch
 	MLX5_SET(scheduling_context, sched_ctx, element_type,
 		 SCHEDULING_CONTEXT_ELEMENT_TYPE_VPORT);
 	attr = MLX5_ADDR_OF(scheduling_context, sched_ctx, element_attributes);
-	MLX5_SET(vport_element, attr, vport_number, vport->vport);
-	MLX5_SET(scheduling_context, sched_ctx, parent_element_id, parent->ix);
-	MLX5_SET(scheduling_context, sched_ctx, max_average_bw, max_rate);
+	MLX5_SET(vport_element, attr, vport_number, vport_node->vport->vport);
+	MLX5_SET(scheduling_context, sched_ctx, parent_element_id, vport_node->parent->ix);
+	MLX5_SET(scheduling_context, sched_ctx, max_average_bw, vport_node->max_rate);
 	MLX5_SET(scheduling_context, sched_ctx, bw_share, bw_share);
 
-	err = mlx5_create_scheduling_element_cmd(dev,
-						 SCHEDULING_HIERARCHY_E_SWITCH,
-						 sched_ctx,
-						 sched_elem_ix);
-	if (err) {
-		esw_warn(dev,
-			 "E-Switch create vport scheduling element failed (vport=%d,err=%d)\n",
-			 vport->vport, err);
-		return err;
-	}
-
-	return 0;
+	return esw_qos_node_create_sched_element(vport_node, sched_ctx, extack);
 }
 
 static int esw_qos_update_node_scheduling_element(struct mlx5_vport *vport,
@@ -339,30 +357,22 @@ static int esw_qos_update_node_scheduling_element(struct mlx5_vport *vport,
 	struct mlx5_esw_sched_node *vport_node = vport->qos.sched_node;
 	int err;
 
-	err = mlx5_destroy_scheduling_element_cmd(curr_node->esw->dev,
-						  SCHEDULING_HIERARCHY_E_SWITCH,
-						  vport_node->ix);
-	if (err) {
-		NL_SET_ERR_MSG_MOD(extack, "E-Switch destroy vport scheduling element failed");
+	err = esw_qos_node_destroy_sched_element(vport_node, extack);
+	if (err)
 		return err;
-	}
 
-	err = esw_qos_vport_create_sched_element(vport, new_node, vport_node->max_rate,
-						 vport_node->bw_share,
-						 &vport_node->ix);
+	esw_qos_node_set_parent(vport_node, new_node);
+	err = esw_qos_vport_create_sched_element(vport_node, vport_node->bw_share, extack);
 	if (err) {
 		NL_SET_ERR_MSG_MOD(extack, "E-Switch vport node set failed.");
 		goto err_sched;
 	}
 
-	esw_qos_node_set_parent(vport->qos.sched_node, new_node);
-
 	return 0;
 
 err_sched:
-	if (esw_qos_vport_create_sched_element(vport, curr_node, vport_node->max_rate,
-					       vport_node->bw_share,
-					       &vport_node->ix))
+	esw_qos_node_set_parent(vport_node, curr_node);
+	if (esw_qos_vport_create_sched_element(vport_node, vport_node->bw_share, NULL))
 		esw_warn(curr_node->esw->dev, "E-Switch vport node restore failed (vport=%d)\n",
 			 vport->vport);
 
@@ -425,6 +435,12 @@ static void __esw_qos_free_node(struct mlx5_esw_sched_node *node)
 	kfree(node);
 }
 
+static void esw_qos_destroy_node(struct mlx5_esw_sched_node *node, struct netlink_ext_ack *extack)
+{
+	esw_qos_node_destroy_sched_element(node, extack);
+	__esw_qos_free_node(node);
+}
+
 static struct mlx5_esw_sched_node *
 __esw_qos_create_vports_sched_node(struct mlx5_eswitch *esw, struct mlx5_esw_sched_node *parent,
 				   struct netlink_ext_ack *extack)
@@ -483,23 +499,13 @@ esw_qos_create_vports_sched_node(struct mlx5_eswitch *esw, struct netlink_ext_ac
 	return node;
 }
 
-static int __esw_qos_destroy_node(struct mlx5_esw_sched_node *node, struct netlink_ext_ack *extack)
+static void __esw_qos_destroy_node(struct mlx5_esw_sched_node *node, struct netlink_ext_ack *extack)
 {
 	struct mlx5_eswitch *esw = node->esw;
-	int err;
 
 	trace_mlx5_esw_node_qos_destroy(esw->dev, node, node->ix);
-
-	err = mlx5_destroy_scheduling_element_cmd(esw->dev,
-						  SCHEDULING_HIERARCHY_E_SWITCH,
-						  node->ix);
-	if (err)
-		NL_SET_ERR_MSG_MOD(extack, "E-Switch destroy TSAR_ID failed");
-	__esw_qos_free_node(node);
-
+	esw_qos_destroy_node(node, extack);
 	esw_qos_normalize_min_rate(esw, NULL, extack);
-
-	return err;
 }
 
 static int esw_qos_create(struct mlx5_eswitch *esw, struct netlink_ext_ack *extack)
@@ -584,11 +590,11 @@ static void esw_qos_put(struct mlx5_eswitch *esw)
 		esw_qos_destroy(esw);
 }
 
-static int esw_qos_vport_enable(struct mlx5_vport *vport,
-				u32 max_rate, u32 bw_share, struct netlink_ext_ack *extack)
+static int esw_qos_vport_enable(struct mlx5_vport *vport, u32 max_rate, u32 bw_share,
+				struct netlink_ext_ack *extack)
 {
 	struct mlx5_eswitch *esw = vport->dev->priv.eswitch;
-	u32 sched_elem_ix;
+	struct mlx5_esw_sched_node *sched_node;
 	int err;
 
 	esw_assert_qos_lock_held(esw);
@@ -599,29 +605,28 @@ static int esw_qos_vport_enable(struct mlx5_vport *vport,
 	if (err)
 		return err;
 
-	err = esw_qos_vport_create_sched_element(vport, esw->qos.node0, max_rate, bw_share,
-						 &sched_elem_ix);
-	if (err)
-		goto err_out;
-
-	vport->qos.sched_node = __esw_qos_alloc_node(esw, sched_elem_ix, SCHED_NODE_TYPE_VPORT,
-						     esw->qos.node0);
-	if (!vport->qos.sched_node) {
+	sched_node = __esw_qos_alloc_node(esw, 0, SCHED_NODE_TYPE_VPORT, esw->qos.node0);
+	if (!sched_node) {
 		err = -ENOMEM;
 		goto err_alloc;
 	}
 
-	vport->qos.sched_node->vport = vport;
+	sched_node->max_rate = max_rate;
+	sched_node->min_rate = 0;
+	sched_node->bw_share = bw_share;
+	sched_node->vport = vport;
+	err = esw_qos_vport_create_sched_element(sched_node, 0, extack);
+	if (err)
+		goto err_vport_create;
 
 	trace_mlx5_esw_vport_qos_create(vport->dev, vport, bw_share, max_rate);
+	vport->qos.sched_node = sched_node;
 
 	return 0;
 
+err_vport_create:
+	__esw_qos_free_node(sched_node);
 err_alloc:
-	if (mlx5_destroy_scheduling_element_cmd(esw->dev,
-						SCHEDULING_HIERARCHY_E_SWITCH, sched_elem_ix))
-		esw_warn(esw->dev, "E-Switch destroy vport scheduling element failed.\n");
-err_out:
 	esw_qos_put(esw);
 
 	return err;
@@ -632,7 +637,6 @@ void mlx5_esw_qos_vport_disable(struct mlx5_vport *vport)
 	struct mlx5_eswitch *esw = vport->dev->priv.eswitch;
 	struct mlx5_esw_sched_node *vport_node;
 	struct mlx5_core_dev *dev;
-	int err;
 
 	lockdep_assert_held(&esw->state_lock);
 	esw_qos_lock(esw);
@@ -645,15 +649,7 @@ void mlx5_esw_qos_vport_disable(struct mlx5_vport *vport)
 	dev = vport_node->esw->dev;
 	trace_mlx5_esw_vport_qos_destroy(dev, vport);
 
-	err = mlx5_destroy_scheduling_element_cmd(dev,
-						  SCHEDULING_HIERARCHY_E_SWITCH,
-						  vport_node->ix);
-	if (err)
-		esw_warn(dev,
-			 "E-Switch destroy vport scheduling element failed (vport=%d,err=%d)\n",
-			 vport->vport, err);
-
-	__esw_qos_free_node(vport_node);
+	esw_qos_destroy_node(vport_node, NULL);
 	memset(&vport->qos, 0, sizeof(vport->qos));
 
 	esw_qos_put(esw);
@@ -974,13 +970,12 @@ int mlx5_esw_devlink_rate_node_del(struct devlink_rate *rate_node, void *priv,
 {
 	struct mlx5_esw_sched_node *node = priv;
 	struct mlx5_eswitch *esw = node->esw;
-	int err;
 
 	esw_qos_lock(esw);
-	err = __esw_qos_destroy_node(node, extack);
+	__esw_qos_destroy_node(node, extack);
 	esw_qos_put(esw);
 	esw_qos_unlock(esw);
-	return err;
+	return 0;
 }
 
 int mlx5_esw_qos_vport_update_node(struct mlx5_vport *vport,
-- 
2.51.0


From d67bfd10e668bfca717e0d94112f04f61c58dad7 Mon Sep 17 00:00:00 2001
From: Carolina Jubran <cjubran@nvidia.com>
Date: Thu, 7 Nov 2024 21:43:51 +0200
Subject: [PATCH 15/16] net/mlx5: Integrate esw_qos_vport_enable logic into
 rate operations

Fold the esw_qos_vport_enable function into operations for configuring
maximum and minimum rates, simplifying QoS logic. This change
consolidates enabling and updating the scheduling element
configuration, streamlining how vport QoS is initialized and adjusted.

Signed-off-by: Carolina Jubran <cjubran@nvidia.com>
Reviewed-by: Cosmin Ratiu <cratiu@nvidia.com>
Signed-off-by: Tariq Toukan <tariqt@nvidia.com>
Link: https://patch.msgid.link/20241107194357.683732-7-tariqt@nvidia.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 .../net/ethernet/mellanox/mlx5/core/esw/qos.c | 87 +++++++++----------
 1 file changed, 39 insertions(+), 48 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/esw/qos.c b/drivers/net/ethernet/mellanox/mlx5/core/esw/qos.c
index 155400d36a1e..35e493924c09 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/esw/qos.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/esw/qos.c
@@ -590,22 +590,21 @@ static void esw_qos_put(struct mlx5_eswitch *esw)
 		esw_qos_destroy(esw);
 }
 
-static int esw_qos_vport_enable(struct mlx5_vport *vport, u32 max_rate, u32 bw_share,
-				struct netlink_ext_ack *extack)
+static int esw_qos_vport_enable(struct mlx5_vport *vport, struct mlx5_esw_sched_node *parent,
+				u32 max_rate, u32 bw_share, struct netlink_ext_ack *extack)
 {
 	struct mlx5_eswitch *esw = vport->dev->priv.eswitch;
 	struct mlx5_esw_sched_node *sched_node;
 	int err;
 
 	esw_assert_qos_lock_held(esw);
-	if (vport->qos.sched_node)
-		return 0;
 
 	err = esw_qos_get(esw, extack);
 	if (err)
 		return err;
 
-	sched_node = __esw_qos_alloc_node(esw, 0, SCHED_NODE_TYPE_VPORT, esw->qos.node0);
+	parent = parent ?: esw->qos.node0;
+	sched_node = __esw_qos_alloc_node(parent->esw, 0, SCHED_NODE_TYPE_VPORT, parent);
 	if (!sched_node) {
 		err = -ENOMEM;
 		goto err_alloc;
@@ -657,21 +656,42 @@ unlock:
 	esw_qos_unlock(esw);
 }
 
+static int mlx5_esw_qos_set_vport_max_rate(struct mlx5_vport *vport, u32 max_rate,
+					   struct netlink_ext_ack *extack)
+{
+	struct mlx5_esw_sched_node *vport_node = vport->qos.sched_node;
+
+	esw_assert_qos_lock_held(vport->dev->priv.eswitch);
+
+	if (!vport_node)
+		return esw_qos_vport_enable(vport, NULL, max_rate, 0, extack);
+	else
+		return esw_qos_sched_elem_config(vport_node, max_rate, vport_node->bw_share,
+						 extack);
+}
+
+static int mlx5_esw_qos_set_vport_min_rate(struct mlx5_vport *vport, u32 min_rate,
+					   struct netlink_ext_ack *extack)
+{
+	struct mlx5_esw_sched_node *vport_node = vport->qos.sched_node;
+
+	esw_assert_qos_lock_held(vport->dev->priv.eswitch);
+
+	if (!vport_node)
+		return esw_qos_vport_enable(vport, NULL, 0, min_rate, extack);
+	else
+		return esw_qos_set_node_min_rate(vport_node, min_rate, extack);
+}
+
 int mlx5_esw_qos_set_vport_rate(struct mlx5_vport *vport, u32 max_rate, u32 min_rate)
 {
 	struct mlx5_eswitch *esw = vport->dev->priv.eswitch;
 	int err;
 
 	esw_qos_lock(esw);
-	err = esw_qos_vport_enable(vport, 0, 0, NULL);
-	if (err)
-		goto unlock;
-
-	err = esw_qos_set_node_min_rate(vport->qos.sched_node, min_rate, NULL);
+	err = mlx5_esw_qos_set_vport_min_rate(vport, min_rate, NULL);
 	if (!err)
-		err = esw_qos_sched_elem_config(vport->qos.sched_node, max_rate,
-						vport->qos.sched_node->bw_share, NULL);
-unlock:
+		err = mlx5_esw_qos_set_vport_max_rate(vport, max_rate, NULL);
 	esw_qos_unlock(esw);
 	return err;
 }
@@ -757,10 +777,8 @@ static int mlx5_esw_qos_link_speed_verify(struct mlx5_core_dev *mdev,
 
 int mlx5_esw_qos_modify_vport_rate(struct mlx5_eswitch *esw, u16 vport_num, u32 rate_mbps)
 {
-	u32 ctx[MLX5_ST_SZ_DW(scheduling_context)] = {};
 	struct mlx5_vport *vport;
 	u32 link_speed_max;
-	u32 bitmask;
 	int err;
 
 	vport = mlx5_eswitch_get_vport(esw, vport_num);
@@ -779,20 +797,7 @@ int mlx5_esw_qos_modify_vport_rate(struct mlx5_eswitch *esw, u16 vport_num, u32
 	}
 
 	esw_qos_lock(esw);
-	if (!vport->qos.sched_node) {
-		/* Eswitch QoS wasn't enabled yet. Enable it and vport QoS. */
-		err = esw_qos_vport_enable(vport, rate_mbps, 0, NULL);
-	} else {
-		struct mlx5_core_dev *dev = vport->qos.sched_node->parent->esw->dev;
-
-		MLX5_SET(scheduling_context, ctx, max_average_bw, rate_mbps);
-		bitmask = MODIFY_SCHEDULING_ELEMENT_IN_MODIFY_BITMASK_MAX_AVERAGE_BW;
-		err = mlx5_modify_scheduling_element_cmd(dev,
-							 SCHEDULING_HIERARCHY_E_SWITCH,
-							 ctx,
-							 vport->qos.sched_node->ix,
-							 bitmask);
-	}
+	err = mlx5_esw_qos_set_vport_max_rate(vport, rate_mbps, NULL);
 	esw_qos_unlock(esw);
 
 	return err;
@@ -863,12 +868,7 @@ int mlx5_esw_devlink_rate_leaf_tx_share_set(struct devlink_rate *rate_leaf, void
 		return err;
 
 	esw_qos_lock(esw);
-	err = esw_qos_vport_enable(vport, 0, 0, extack);
-	if (err)
-		goto unlock;
-
-	err = esw_qos_set_node_min_rate(vport->qos.sched_node, tx_share, extack);
-unlock:
+	err = mlx5_esw_qos_set_vport_min_rate(vport, tx_share, extack);
 	esw_qos_unlock(esw);
 	return err;
 }
@@ -889,13 +889,7 @@ int mlx5_esw_devlink_rate_leaf_tx_max_set(struct devlink_rate *rate_leaf, void *
 		return err;
 
 	esw_qos_lock(esw);
-	err = esw_qos_vport_enable(vport, 0, 0, extack);
-	if (err)
-		goto unlock;
-
-	err = esw_qos_sched_elem_config(vport->qos.sched_node, tx_max,
-					vport->qos.sched_node->bw_share, extack);
-unlock:
+	err = mlx5_esw_qos_set_vport_max_rate(vport, tx_max, extack);
 	esw_qos_unlock(esw);
 	return err;
 }
@@ -991,13 +985,10 @@ int mlx5_esw_qos_vport_update_node(struct mlx5_vport *vport,
 	}
 
 	esw_qos_lock(esw);
-	if (!vport->qos.sched_node && !node)
-		goto unlock;
-
-	err = esw_qos_vport_enable(vport, 0, 0, extack);
-	if (!err)
+	if (!vport->qos.sched_node && node)
+		err = esw_qos_vport_enable(vport, node, 0, 0, extack);
+	else if (vport->qos.sched_node)
 		err = esw_qos_vport_update_node(vport, node, extack);
-unlock:
 	esw_qos_unlock(esw);
 	return err;
 }
-- 
2.51.0


From be034baba83e2a80a0b2c0f24c08547b6eedc79a Mon Sep 17 00:00:00 2001
From: Carolina Jubran <cjubran@nvidia.com>
Date: Thu, 7 Nov 2024 21:43:52 +0200
Subject: [PATCH 16/16] net/mlx5: Make vport QoS enablement more flexible for
 future extensions

Refactor esw_qos_vport_enable to support more generic configurations,
allowing it to be reused for new vport node types in future patches.

This refactor includes a new way to change the vport parent node by
disabling the current setup and re-enabling it with the new parent.
This change sets the foundation for adapting configuration based on the
parent type in future patches.

Signed-off-by: Carolina Jubran <cjubran@nvidia.com>
Reviewed-by: Cosmin Ratiu <cratiu@nvidia.com>
Signed-off-by: Tariq Toukan <tariqt@nvidia.com>
Link: https://patch.msgid.link/20241107194357.683732-8-tariqt@nvidia.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 .../mellanox/mlx5/core/esw/devlink_port.c     |   2 +-
 .../net/ethernet/mellanox/mlx5/core/esw/qos.c | 193 ++++++++----------
 .../net/ethernet/mellanox/mlx5/core/esw/qos.h |   1 +
 .../net/ethernet/mellanox/mlx5/core/eswitch.c |   6 +-
 .../net/ethernet/mellanox/mlx5/core/eswitch.h |   5 +-
 5 files changed, 96 insertions(+), 111 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/esw/devlink_port.c b/drivers/net/ethernet/mellanox/mlx5/core/esw/devlink_port.c
index d0f38818363f..982fe3714683 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/esw/devlink_port.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/esw/devlink_port.c
@@ -195,7 +195,7 @@ void mlx5_esw_offloads_devlink_port_unregister(struct mlx5_vport *vport)
 		return;
 	dl_port = vport->dl_port;
 
-	mlx5_esw_qos_vport_update_node(vport, NULL, NULL);
+	mlx5_esw_qos_vport_update_parent(vport, NULL, NULL);
 	devl_rate_leaf_destroy(&dl_port->dl_port);
 
 	devl_port_unregister(&dl_port->dl_port);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/esw/qos.c b/drivers/net/ethernet/mellanox/mlx5/core/esw/qos.c
index 35e493924c09..8b7c843446e1 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/esw/qos.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/esw/qos.c
@@ -101,6 +101,12 @@ esw_qos_node_set_parent(struct mlx5_esw_sched_node *node, struct mlx5_esw_sched_
 	node->esw = parent->esw;
 }
 
+void mlx5_esw_qos_vport_qos_free(struct mlx5_vport *vport)
+{
+	kfree(vport->qos.sched_node);
+	memset(&vport->qos, 0, sizeof(vport->qos));
+}
+
 u32 mlx5_esw_qos_vport_get_sched_elem_ix(const struct mlx5_vport *vport)
 {
 	if (!vport->qos.sched_node)
@@ -326,7 +332,7 @@ static int esw_qos_create_node_sched_elem(struct mlx5_core_dev *dev, u32 parent_
 						  tsar_ix);
 }
 
-static int esw_qos_vport_create_sched_element(struct mlx5_esw_sched_node *vport_node, u32 bw_share,
+static int esw_qos_vport_create_sched_element(struct mlx5_esw_sched_node *vport_node,
 					      struct netlink_ext_ack *extack)
 {
 	u32 sched_ctx[MLX5_ST_SZ_DW(scheduling_context)] = {};
@@ -344,69 +350,10 @@ static int esw_qos_vport_create_sched_element(struct mlx5_esw_sched_node *vport_
 	MLX5_SET(vport_element, attr, vport_number, vport_node->vport->vport);
 	MLX5_SET(scheduling_context, sched_ctx, parent_element_id, vport_node->parent->ix);
 	MLX5_SET(scheduling_context, sched_ctx, max_average_bw, vport_node->max_rate);
-	MLX5_SET(scheduling_context, sched_ctx, bw_share, bw_share);
 
 	return esw_qos_node_create_sched_element(vport_node, sched_ctx, extack);
 }
 
-static int esw_qos_update_node_scheduling_element(struct mlx5_vport *vport,
-						  struct mlx5_esw_sched_node *curr_node,
-						  struct mlx5_esw_sched_node *new_node,
-						  struct netlink_ext_ack *extack)
-{
-	struct mlx5_esw_sched_node *vport_node = vport->qos.sched_node;
-	int err;
-
-	err = esw_qos_node_destroy_sched_element(vport_node, extack);
-	if (err)
-		return err;
-
-	esw_qos_node_set_parent(vport_node, new_node);
-	err = esw_qos_vport_create_sched_element(vport_node, vport_node->bw_share, extack);
-	if (err) {
-		NL_SET_ERR_MSG_MOD(extack, "E-Switch vport node set failed.");
-		goto err_sched;
-	}
-
-	return 0;
-
-err_sched:
-	esw_qos_node_set_parent(vport_node, curr_node);
-	if (esw_qos_vport_create_sched_element(vport_node, vport_node->bw_share, NULL))
-		esw_warn(curr_node->esw->dev, "E-Switch vport node restore failed (vport=%d)\n",
-			 vport->vport);
-
-	return err;
-}
-
-static int esw_qos_vport_update_node(struct mlx5_vport *vport,
-				     struct mlx5_esw_sched_node *node,
-				     struct netlink_ext_ack *extack)
-{
-	struct mlx5_esw_sched_node *vport_node = vport->qos.sched_node;
-	struct mlx5_eswitch *esw = vport->dev->priv.eswitch;
-	struct mlx5_esw_sched_node *new_node, *curr_node;
-	int err;
-
-	esw_assert_qos_lock_held(esw);
-	curr_node = vport_node->parent;
-	new_node = node ?: esw->qos.node0;
-	if (curr_node == new_node)
-		return 0;
-
-	err = esw_qos_update_node_scheduling_element(vport, curr_node, new_node, extack);
-	if (err)
-		return err;
-
-	/* Recalculate bw share weights of old and new nodes */
-	if (vport_node->bw_share || new_node->bw_share) {
-		esw_qos_normalize_min_rate(curr_node->esw, curr_node, extack);
-		esw_qos_normalize_min_rate(new_node->esw, new_node, extack);
-	}
-
-	return 0;
-}
-
 static struct mlx5_esw_sched_node *
 __esw_qos_alloc_node(struct mlx5_eswitch *esw, u32 tsar_ix, enum sched_node_type type,
 		     struct mlx5_esw_sched_node *parent)
@@ -590,43 +537,62 @@ static void esw_qos_put(struct mlx5_eswitch *esw)
 		esw_qos_destroy(esw);
 }
 
+static void esw_qos_vport_disable(struct mlx5_vport *vport, struct netlink_ext_ack *extack)
+{
+	struct mlx5_esw_sched_node *vport_node = vport->qos.sched_node;
+	struct mlx5_esw_sched_node *parent = vport_node->parent;
+
+	esw_qos_node_destroy_sched_element(vport_node, extack);
+
+	vport_node->bw_share = 0;
+	list_del_init(&vport_node->entry);
+	esw_qos_normalize_min_rate(parent->esw, parent, extack);
+
+	trace_mlx5_esw_vport_qos_destroy(vport_node->esw->dev, vport);
+}
+
 static int esw_qos_vport_enable(struct mlx5_vport *vport, struct mlx5_esw_sched_node *parent,
-				u32 max_rate, u32 bw_share, struct netlink_ext_ack *extack)
+				struct netlink_ext_ack *extack)
+{
+	int err;
+
+	esw_assert_qos_lock_held(vport->dev->priv.eswitch);
+
+	esw_qos_node_set_parent(vport->qos.sched_node, parent);
+	err = esw_qos_vport_create_sched_element(vport->qos.sched_node, extack);
+	if (err)
+		return err;
+
+	esw_qos_normalize_min_rate(parent->esw, parent, extack);
+
+	return 0;
+}
+
+static int mlx5_esw_qos_vport_enable(struct mlx5_vport *vport, enum sched_node_type type,
+				     struct mlx5_esw_sched_node *parent, u32 max_rate,
+				     u32 min_rate, struct netlink_ext_ack *extack)
 {
 	struct mlx5_eswitch *esw = vport->dev->priv.eswitch;
 	struct mlx5_esw_sched_node *sched_node;
 	int err;
 
 	esw_assert_qos_lock_held(esw);
-
 	err = esw_qos_get(esw, extack);
 	if (err)
 		return err;
 
 	parent = parent ?: esw->qos.node0;
-	sched_node = __esw_qos_alloc_node(parent->esw, 0, SCHED_NODE_TYPE_VPORT, parent);
-	if (!sched_node) {
-		err = -ENOMEM;
-		goto err_alloc;
-	}
+	sched_node = __esw_qos_alloc_node(parent->esw, 0, type, parent);
+	if (!sched_node)
+		return -ENOMEM;
 
 	sched_node->max_rate = max_rate;
-	sched_node->min_rate = 0;
-	sched_node->bw_share = bw_share;
+	sched_node->min_rate = min_rate;
 	sched_node->vport = vport;
-	err = esw_qos_vport_create_sched_element(sched_node, 0, extack);
-	if (err)
-		goto err_vport_create;
-
-	trace_mlx5_esw_vport_qos_create(vport->dev, vport, bw_share, max_rate);
 	vport->qos.sched_node = sched_node;
-
-	return 0;
-
-err_vport_create:
-	__esw_qos_free_node(sched_node);
-err_alloc:
-	esw_qos_put(esw);
+	err = esw_qos_vport_enable(vport, parent, extack);
+	if (err)
+		esw_qos_put(esw);
 
 	return err;
 }
@@ -634,23 +600,18 @@ err_alloc:
 void mlx5_esw_qos_vport_disable(struct mlx5_vport *vport)
 {
 	struct mlx5_eswitch *esw = vport->dev->priv.eswitch;
-	struct mlx5_esw_sched_node *vport_node;
-	struct mlx5_core_dev *dev;
+	struct mlx5_esw_sched_node *parent;
 
 	lockdep_assert_held(&esw->state_lock);
 	esw_qos_lock(esw);
-	vport_node = vport->qos.sched_node;
-	if (!vport_node)
+	if (!vport->qos.sched_node)
 		goto unlock;
-	WARN(vport_node->parent != esw->qos.node0,
-	     "Disabling QoS on port before detaching it from node");
-
-	dev = vport_node->esw->dev;
-	trace_mlx5_esw_vport_qos_destroy(dev, vport);
 
-	esw_qos_destroy_node(vport_node, NULL);
-	memset(&vport->qos, 0, sizeof(vport->qos));
+	parent = vport->qos.sched_node->parent;
+	WARN(parent != esw->qos.node0, "Disabling QoS on port before detaching it from node");
 
+	esw_qos_vport_disable(vport, NULL);
+	mlx5_esw_qos_vport_qos_free(vport);
 	esw_qos_put(esw);
 unlock:
 	esw_qos_unlock(esw);
@@ -664,7 +625,8 @@ static int mlx5_esw_qos_set_vport_max_rate(struct mlx5_vport *vport, u32 max_rat
 	esw_assert_qos_lock_held(vport->dev->priv.eswitch);
 
 	if (!vport_node)
-		return esw_qos_vport_enable(vport, NULL, max_rate, 0, extack);
+		return mlx5_esw_qos_vport_enable(vport, SCHED_NODE_TYPE_VPORT, NULL, max_rate, 0,
+						 extack);
 	else
 		return esw_qos_sched_elem_config(vport_node, max_rate, vport_node->bw_share,
 						 extack);
@@ -678,7 +640,8 @@ static int mlx5_esw_qos_set_vport_min_rate(struct mlx5_vport *vport, u32 min_rat
 	esw_assert_qos_lock_held(vport->dev->priv.eswitch);
 
 	if (!vport_node)
-		return esw_qos_vport_enable(vport, NULL, 0, min_rate, extack);
+		return mlx5_esw_qos_vport_enable(vport, SCHED_NODE_TYPE_VPORT, NULL, 0, min_rate,
+						 extack);
 	else
 		return esw_qos_set_node_min_rate(vport_node, min_rate, extack);
 }
@@ -711,6 +674,31 @@ bool mlx5_esw_qos_get_vport_rate(struct mlx5_vport *vport, u32 *max_rate, u32 *m
 	return enabled;
 }
 
+static int esw_qos_vport_update_parent(struct mlx5_vport *vport, struct mlx5_esw_sched_node *parent,
+				       struct netlink_ext_ack *extack)
+{
+	struct mlx5_eswitch *esw = vport->dev->priv.eswitch;
+	struct mlx5_esw_sched_node *curr_parent;
+	int err;
+
+	esw_assert_qos_lock_held(esw);
+	curr_parent = vport->qos.sched_node->parent;
+	parent = parent ?: esw->qos.node0;
+	if (curr_parent == parent)
+		return 0;
+
+	esw_qos_vport_disable(vport, extack);
+
+	err = esw_qos_vport_enable(vport, parent, extack);
+	if (err) {
+		if (esw_qos_vport_enable(vport, curr_parent, NULL))
+			esw_warn(parent->esw->dev, "vport restore QoS failed (vport=%d)\n",
+				 vport->vport);
+	}
+
+	return err;
+}
+
 static u32 mlx5_esw_qos_lag_link_speed_get_locked(struct mlx5_core_dev *mdev)
 {
 	struct ethtool_link_ksettings lksettings;
@@ -972,23 +960,22 @@ int mlx5_esw_devlink_rate_node_del(struct devlink_rate *rate_node, void *priv,
 	return 0;
 }
 
-int mlx5_esw_qos_vport_update_node(struct mlx5_vport *vport,
-				   struct mlx5_esw_sched_node *node,
-				   struct netlink_ext_ack *extack)
+int mlx5_esw_qos_vport_update_parent(struct mlx5_vport *vport, struct mlx5_esw_sched_node *parent,
+				     struct netlink_ext_ack *extack)
 {
 	struct mlx5_eswitch *esw = vport->dev->priv.eswitch;
 	int err = 0;
 
-	if (node && node->esw != esw) {
+	if (parent && parent->esw != esw) {
 		NL_SET_ERR_MSG_MOD(extack, "Cross E-Switch scheduling is not supported");
 		return -EOPNOTSUPP;
 	}
 
 	esw_qos_lock(esw);
-	if (!vport->qos.sched_node && node)
-		err = esw_qos_vport_enable(vport, node, 0, 0, extack);
+	if (!vport->qos.sched_node && parent)
+		err = mlx5_esw_qos_vport_enable(vport, SCHED_NODE_TYPE_VPORT, parent, 0, 0, extack);
 	else if (vport->qos.sched_node)
-		err = esw_qos_vport_update_node(vport, node, extack);
+		err = esw_qos_vport_update_parent(vport, parent, extack);
 	esw_qos_unlock(esw);
 	return err;
 }
@@ -1002,8 +989,8 @@ int mlx5_esw_devlink_rate_parent_set(struct devlink_rate *devlink_rate,
 	struct mlx5_vport *vport = priv;
 
 	if (!parent)
-		return mlx5_esw_qos_vport_update_node(vport, NULL, extack);
+		return mlx5_esw_qos_vport_update_parent(vport, NULL, extack);
 
 	node = parent_priv;
-	return mlx5_esw_qos_vport_update_node(vport, node, extack);
+	return mlx5_esw_qos_vport_update_parent(vport, node, extack);
 }
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/esw/qos.h b/drivers/net/ethernet/mellanox/mlx5/core/esw/qos.h
index 61a6fdd5c267..6eb8f6a648c8 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/esw/qos.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/esw/qos.h
@@ -13,6 +13,7 @@ int mlx5_esw_qos_set_vport_rate(struct mlx5_vport *evport, u32 max_rate, u32 min
 bool mlx5_esw_qos_get_vport_rate(struct mlx5_vport *vport, u32 *max_rate, u32 *min_rate);
 void mlx5_esw_qos_vport_disable(struct mlx5_vport *vport);
 
+void mlx5_esw_qos_vport_qos_free(struct mlx5_vport *vport);
 u32 mlx5_esw_qos_vport_get_sched_elem_ix(const struct mlx5_vport *vport);
 struct mlx5_esw_sched_node *mlx5_esw_qos_vport_get_parent(const struct mlx5_vport *vport);
 
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c
index d0dab8f4e1a3..7fb8a3381f84 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c
@@ -1061,8 +1061,7 @@ static void mlx5_eswitch_clear_vf_vports_info(struct mlx5_eswitch *esw)
 	unsigned long i;
 
 	mlx5_esw_for_each_vf_vport(esw, i, vport, esw->esw_funcs.num_vfs) {
-		kfree(vport->qos.sched_node);
-		memset(&vport->qos, 0, sizeof(vport->qos));
+		mlx5_esw_qos_vport_qos_free(vport);
 		memset(&vport->info, 0, sizeof(vport->info));
 		vport->info.link_state = MLX5_VPORT_ADMIN_STATE_AUTO;
 	}
@@ -1074,8 +1073,7 @@ static void mlx5_eswitch_clear_ec_vf_vports_info(struct mlx5_eswitch *esw)
 	unsigned long i;
 
 	mlx5_esw_for_each_ec_vf_vport(esw, i, vport, esw->esw_funcs.num_ec_vfs) {
-		kfree(vport->qos.sched_node);
-		memset(&vport->qos, 0, sizeof(vport->qos));
+		mlx5_esw_qos_vport_qos_free(vport);
 		memset(&vport->info, 0, sizeof(vport->info));
 		vport->info.link_state = MLX5_VPORT_ADMIN_STATE_AUTO;
 	}
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h
index 14dd42d44e6f..a83d41121db6 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h
@@ -427,9 +427,8 @@ int mlx5_eswitch_set_vport_trust(struct mlx5_eswitch *esw,
 				 u16 vport_num, bool setting);
 int mlx5_eswitch_set_vport_rate(struct mlx5_eswitch *esw, u16 vport,
 				u32 max_rate, u32 min_rate);
-int mlx5_esw_qos_vport_update_node(struct mlx5_vport *vport,
-				   struct mlx5_esw_sched_node *node,
-				   struct netlink_ext_ack *extack);
+int mlx5_esw_qos_vport_update_parent(struct mlx5_vport *vport, struct mlx5_esw_sched_node *node,
+				     struct netlink_ext_ack *extack);
 int mlx5_eswitch_set_vepa(struct mlx5_eswitch *esw, u8 setting);
 int mlx5_eswitch_get_vepa(struct mlx5_eswitch *esw, u8 *setting);
 int mlx5_eswitch_get_vport_config(struct mlx5_eswitch *esw,
-- 
2.51.0