From 32607a332cfea5a4b2a185f3e3d605a9bf4f8df0 Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Thu, 24 Apr 2025 10:35:18 -0400 Subject: [PATCH 01/16] ipv4: prefer multipath nexthop that matches source address With multipath routes, try to ensure that packets leave on the device that is associated with the source address. Avoid the following tcpdump example: veth0 Out IP 10.1.0.2.38640 > 10.2.0.3.8000: Flags [S] veth1 Out IP 10.1.0.2.38648 > 10.2.0.3.8000: Flags [S] Which can happen easily with the most straightforward setup: ip addr add 10.0.0.1/24 dev veth0 ip addr add 10.1.0.1/24 dev veth1 ip route add 10.2.0.3 nexthop via 10.0.0.2 dev veth0 \ nexthop via 10.1.0.2 dev veth1 This is apparently considered WAI, based on the comment in ip_route_output_key_hash_rcu: * 2. Moreover, we are allowed to send packets with saddr * of another iface. --ANK It may be ok for some uses of multipath, but not all. For instance, when using two ISPs, a router may drop packets with unknown source. The behavior occurs because tcp_v4_connect makes three route lookups when establishing a connection: 1. ip_route_connect calls to select a source address, with saddr zero. 2. ip_route_connect calls again now that saddr and daddr are known. 3. ip_route_newports calls again after a source port is also chosen. With a route with multiple nexthops, each lookup may make a different choice depending on available entropy to fib_select_multipath. So it is possible for 1 to select the saddr from the first entry, but 3 to select the second entry. Leading to the above situation. Address this by preferring a match that matches the flowi4 saddr. This will make 2 and 3 make the same choice as 1. Continue to update the backup choice until a choice that matches saddr is found. Do this in fib_select_multipath itself, rather than passing an fl4_oif constraint, to avoid changing non-multipath route selection. Commit e6b45241c57a ("ipv4: reset flowi parameters on route connect") shows how that may cause regressions. Also read ipv4.sysctl_fib_multipath_use_neigh only once. No need to refresh in the loop. This does not happen in IPv6, which performs only one lookup. Signed-off-by: Willem de Bruijn Reviewed-by: David Ahern Reviewed-by: Eric Dumazet Reviewed-by: Ido Schimmel Link: https://patch.msgid.link/20250424143549.669426-2-willemdebruijn.kernel@gmail.com Signed-off-by: Paolo Abeni --- include/net/ip_fib.h | 3 ++- net/ipv4/fib_semantics.c | 39 +++++++++++++++++++++++++-------------- net/ipv4/route.c | 2 +- 3 files changed, 28 insertions(+), 16 deletions(-) diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h index e3864b74e92a..48bb3cf41469 100644 --- a/include/net/ip_fib.h +++ b/include/net/ip_fib.h @@ -574,7 +574,8 @@ static inline u32 fib_multipath_hash_from_keys(const struct net *net, int fib_check_nh(struct net *net, struct fib_nh *nh, u32 table, u8 scope, struct netlink_ext_ack *extack); -void fib_select_multipath(struct fib_result *res, int hash); +void fib_select_multipath(struct fib_result *res, int hash, + const struct flowi4 *fl4); void fib_select_path(struct net *net, struct fib_result *res, struct flowi4 *fl4, const struct sk_buff *skb); diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index 5326f1501af0..2371f311a1e1 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -2170,34 +2170,45 @@ static bool fib_good_nh(const struct fib_nh *nh) return !!(state & NUD_VALID); } -void fib_select_multipath(struct fib_result *res, int hash) +void fib_select_multipath(struct fib_result *res, int hash, + const struct flowi4 *fl4) { struct fib_info *fi = res->fi; struct net *net = fi->fib_net; - bool first = false; + bool found = false; + bool use_neigh; + __be32 saddr; if (unlikely(res->fi->nh)) { nexthop_path_fib_result(res, hash); return; } + use_neigh = READ_ONCE(net->ipv4.sysctl_fib_multipath_use_neigh); + saddr = fl4 ? fl4->saddr : 0; + change_nexthops(fi) { - if (READ_ONCE(net->ipv4.sysctl_fib_multipath_use_neigh)) { - if (!fib_good_nh(nexthop_nh)) - continue; - if (!first) { - res->nh_sel = nhsel; - res->nhc = &nexthop_nh->nh_common; - first = true; - } + if (use_neigh && !fib_good_nh(nexthop_nh)) + continue; + + if (!found) { + res->nh_sel = nhsel; + res->nhc = &nexthop_nh->nh_common; + found = !saddr || nexthop_nh->nh_saddr == saddr; } if (hash > atomic_read(&nexthop_nh->fib_nh_upper_bound)) continue; - res->nh_sel = nhsel; - res->nhc = &nexthop_nh->nh_common; - return; + if (!saddr || nexthop_nh->nh_saddr == saddr) { + res->nh_sel = nhsel; + res->nhc = &nexthop_nh->nh_common; + return; + } + + if (found) + return; + } endfor_nexthops(fi); } #endif @@ -2212,7 +2223,7 @@ void fib_select_path(struct net *net, struct fib_result *res, if (fib_info_num_path(res->fi) > 1) { int h = fib_multipath_hash(net, fl4, skb, NULL); - fib_select_multipath(res, h); + fib_select_multipath(res, h, fl4); } else #endif diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 49cffbe83802..e5e4c71be3af 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -2154,7 +2154,7 @@ ip_mkroute_input(struct sk_buff *skb, struct fib_result *res, if (res->fi && fib_info_num_path(res->fi) > 1) { int h = fib_multipath_hash(res->fi->fib_net, NULL, skb, hkeys); - fib_select_multipath(res, h); + fib_select_multipath(res, h, NULL); IPCB(skb)->flags |= IPSKB_MULTIPATH; } #endif -- 2.50.1 From 65e9024643c7512ade3aedbb341e11d77ed7abc2 Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Thu, 24 Apr 2025 10:35:19 -0400 Subject: [PATCH 02/16] ip: load balance tcp connections to single dst addr and port Load balance new TCP connections across nexthops also when they connect to the same service at a single remote address and port. This affects only port-based multipath hashing: fib_multipath_hash_policy 1 or 3. Local connections must choose both a source address and port when connecting to a remote service, in ip_route_connect. This "chicken-and-egg problem" (commit 2d7192d6cbab ("ipv4: Sanitize and simplify ip_route_{connect,newports}()")) is resolved by first selecting a source address, by looking up a route using the zero wildcard source port and address. As a result multiple connections to the same destination address and port have no entropy in fib_multipath_hash. This is not a problem when forwarding, as skb-based hashing has a 4-tuple. Nor when establishing UDP connections, as autobind there selects a port before reaching ip_route_connect. Load balance also TCP, by using a random port in fib_multipath_hash. Port assignment in inet_hash_connect is not atomic with ip_route_connect. Thus ports are unpredictable, effectively random. Implementation details: Do not actually pass a random fl4_sport, as that affects not only hashing, but routing more broadly, and can match a source port based policy route, which existing wildcard port 0 will not. Instead, define a new wildcard flowi flag that is used only for hashing. Selecting a random source is equivalent to just selecting a random hash entirely. But for code clarity, follow the normal 4-tuple hash process and only update this field. fib_multipath_hash can be reached with zero sport from other code paths, so explicitly pass this flowi flag, rather than trying to infer this case in the function itself. Signed-off-by: Willem de Bruijn Reviewed-by: David Ahern Reviewed-by: Eric Dumazet Reviewed-by: Ido Schimmel Link: https://patch.msgid.link/20250424143549.669426-3-willemdebruijn.kernel@gmail.com Signed-off-by: Paolo Abeni --- include/net/flow.h | 1 + include/net/route.h | 3 +++ net/ipv4/route.c | 13 ++++++++++--- net/ipv6/route.c | 13 ++++++++++--- net/ipv6/tcp_ipv6.c | 2 ++ 5 files changed, 26 insertions(+), 6 deletions(-) diff --git a/include/net/flow.h b/include/net/flow.h index 2a3f0c42f092..a1839c278d87 100644 --- a/include/net/flow.h +++ b/include/net/flow.h @@ -39,6 +39,7 @@ struct flowi_common { #define FLOWI_FLAG_ANYSRC 0x01 #define FLOWI_FLAG_KNOWN_NH 0x02 #define FLOWI_FLAG_L3MDEV_OIF 0x04 +#define FLOWI_FLAG_ANY_SPORT 0x08 __u32 flowic_secid; kuid_t flowic_uid; __u32 flowic_multipath_hash; diff --git a/include/net/route.h b/include/net/route.h index c605fd5ec0c0..8e39aa822cf9 100644 --- a/include/net/route.h +++ b/include/net/route.h @@ -326,6 +326,9 @@ static inline void ip_route_connect_init(struct flowi4 *fl4, __be32 dst, if (inet_test_bit(TRANSPARENT, sk)) flow_flags |= FLOWI_FLAG_ANYSRC; + if (IS_ENABLED(CONFIG_IP_ROUTE_MULTIPATH) && !sport) + flow_flags |= FLOWI_FLAG_ANY_SPORT; + flowi4_init_output(fl4, oif, READ_ONCE(sk->sk_mark), ip_sock_rt_tos(sk), ip_sock_rt_scope(sk), protocol, flow_flags, dst, src, dport, sport, sk->sk_uid); diff --git a/net/ipv4/route.c b/net/ipv4/route.c index e5e4c71be3af..507b2e5dec50 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -2037,8 +2037,12 @@ static u32 fib_multipath_custom_hash_fl4(const struct net *net, hash_keys.addrs.v4addrs.dst = fl4->daddr; if (hash_fields & FIB_MULTIPATH_HASH_FIELD_IP_PROTO) hash_keys.basic.ip_proto = fl4->flowi4_proto; - if (hash_fields & FIB_MULTIPATH_HASH_FIELD_SRC_PORT) - hash_keys.ports.src = fl4->fl4_sport; + if (hash_fields & FIB_MULTIPATH_HASH_FIELD_SRC_PORT) { + if (fl4->flowi4_flags & FLOWI_FLAG_ANY_SPORT) + hash_keys.ports.src = (__force __be16)get_random_u16(); + else + hash_keys.ports.src = fl4->fl4_sport; + } if (hash_fields & FIB_MULTIPATH_HASH_FIELD_DST_PORT) hash_keys.ports.dst = fl4->fl4_dport; @@ -2093,7 +2097,10 @@ int fib_multipath_hash(const struct net *net, const struct flowi4 *fl4, hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS; hash_keys.addrs.v4addrs.src = fl4->saddr; hash_keys.addrs.v4addrs.dst = fl4->daddr; - hash_keys.ports.src = fl4->fl4_sport; + if (fl4->flowi4_flags & FLOWI_FLAG_ANY_SPORT) + hash_keys.ports.src = (__force __be16)get_random_u16(); + else + hash_keys.ports.src = fl4->fl4_sport; hash_keys.ports.dst = fl4->fl4_dport; hash_keys.basic.ip_proto = fl4->flowi4_proto; } diff --git a/net/ipv6/route.c b/net/ipv6/route.c index d0351e95d916..aa6b45bd3515 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -2492,8 +2492,12 @@ static u32 rt6_multipath_custom_hash_fl6(const struct net *net, hash_keys.basic.ip_proto = fl6->flowi6_proto; if (hash_fields & FIB_MULTIPATH_HASH_FIELD_FLOWLABEL) hash_keys.tags.flow_label = (__force u32)flowi6_get_flowlabel(fl6); - if (hash_fields & FIB_MULTIPATH_HASH_FIELD_SRC_PORT) - hash_keys.ports.src = fl6->fl6_sport; + if (hash_fields & FIB_MULTIPATH_HASH_FIELD_SRC_PORT) { + if (fl6->flowi6_flags & FLOWI_FLAG_ANY_SPORT) + hash_keys.ports.src = (__force __be16)get_random_u16(); + else + hash_keys.ports.src = fl6->fl6_sport; + } if (hash_fields & FIB_MULTIPATH_HASH_FIELD_DST_PORT) hash_keys.ports.dst = fl6->fl6_dport; @@ -2547,7 +2551,10 @@ u32 rt6_multipath_hash(const struct net *net, const struct flowi6 *fl6, hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS; hash_keys.addrs.v6addrs.src = fl6->saddr; hash_keys.addrs.v6addrs.dst = fl6->daddr; - hash_keys.ports.src = fl6->fl6_sport; + if (fl6->flowi6_flags & FLOWI_FLAG_ANY_SPORT) + hash_keys.ports.src = (__force __be16)get_random_u16(); + else + hash_keys.ports.src = fl6->fl6_sport; hash_keys.ports.dst = fl6->fl6_dport; hash_keys.basic.ip_proto = fl6->flowi6_proto; } diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 7dcb33f879ee..e8e68a142649 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -267,6 +267,8 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr, fl6.flowi6_mark = sk->sk_mark; fl6.fl6_dport = usin->sin6_port; fl6.fl6_sport = inet->inet_sport; + if (IS_ENABLED(CONFIG_IP_ROUTE_MULTIPATH) && !fl6.fl6_sport) + fl6.flowi6_flags = FLOWI_FLAG_ANY_SPORT; fl6.flowi6_uid = sk->sk_uid; opt = rcu_dereference_protected(np->opt, lockdep_sock_is_held(sk)); -- 2.50.1 From 4d0dac499bf384fe3f42acc30906d304c3499dd8 Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Thu, 24 Apr 2025 10:35:20 -0400 Subject: [PATCH 03/16] selftests/net: test tcp connection load balancing Verify that TCP connections use both routes when connecting multiple times to a remote service over a two nexthop multipath route. Use socat to create the connections. Use tc prio + tc filter to count routes taken, counting SYN packets across the two egress devices. Also verify that the saddr matches that of the device. To avoid flaky tests when testing inherently randomized behavior, set a low bar and pass if even a single SYN is observed on each device. Signed-off-by: Willem de Bruijn Reviewed-by: Ido Schimmel Tested-by: Ido Schimmel Link: https://patch.msgid.link/20250424143549.669426-4-willemdebruijn.kernel@gmail.com Signed-off-by: Paolo Abeni --- tools/testing/selftests/net/fib_tests.sh | 120 ++++++++++++++++++++++- tools/testing/selftests/net/lib.sh | 24 +++++ 2 files changed, 143 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/net/fib_tests.sh b/tools/testing/selftests/net/fib_tests.sh index 3ea6f886a210..c58dc4ac2810 100755 --- a/tools/testing/selftests/net/fib_tests.sh +++ b/tools/testing/selftests/net/fib_tests.sh @@ -11,7 +11,7 @@ TESTS="unregister down carrier nexthop suppress ipv6_notify ipv4_notify \ ipv6_rt ipv4_rt ipv6_addr_metric ipv4_addr_metric ipv6_route_metrics \ ipv4_route_metrics ipv4_route_v6_gw rp_filter ipv4_del_addr \ ipv6_del_addr ipv4_mangle ipv6_mangle ipv4_bcast_neigh fib6_gc_test \ - ipv4_mpath_list ipv6_mpath_list" + ipv4_mpath_list ipv6_mpath_list ipv4_mpath_balance ipv6_mpath_balance" VERBOSE=0 PAUSE_ON_FAIL=no @@ -1085,6 +1085,35 @@ route_setup() set +e } +forwarding_cleanup() +{ + cleanup_ns $ns3 + + route_cleanup +} + +# extend route_setup with an ns3 reachable through ns2 over both devices +forwarding_setup() +{ + forwarding_cleanup + + route_setup + + setup_ns ns3 + + ip link add veth5 netns $ns3 type veth peer name veth6 netns $ns2 + ip -netns $ns3 link set veth5 up + ip -netns $ns2 link set veth6 up + + ip -netns $ns3 -4 addr add dev veth5 172.16.105.1/24 + ip -netns $ns2 -4 addr add dev veth6 172.16.105.2/24 + ip -netns $ns3 -4 route add 172.16.100.0/22 via 172.16.105.2 + + ip -netns $ns3 -6 addr add dev veth5 2001:db8:105::1/64 nodad + ip -netns $ns2 -6 addr add dev veth6 2001:db8:105::2/64 nodad + ip -netns $ns3 -6 route add 2001:db8:101::/33 via 2001:db8:105::2 +} + # assumption is that basic add of a single path route works # otherwise just adding an address on an interface is broken ipv6_rt_add() @@ -2600,6 +2629,93 @@ ipv6_mpath_list_test() route_cleanup } +tc_set_flower_counter__saddr_syn() { + tc_set_flower_counter $1 $2 $3 "src_ip $4 ip_proto tcp tcp_flags 0x2" +} + +ip_mpath_balance_dep_check() +{ + if [ ! -x "$(command -v socat)" ]; then + echo "socat command not found. Skipping test" + return 1 + fi + + if [ ! -x "$(command -v jq)" ]; then + echo "jq command not found. Skipping test" + return 1 + fi +} + +ip_mpath_balance() { + local -r ipver=$1 + local -r daddr=$2 + local -r num_conn=20 + + for i in $(seq 1 $num_conn); do + ip netns exec $ns3 socat $ipver TCP-LISTEN:8000 STDIO >/dev/null & + sleep 0.02 + echo -n a | ip netns exec $ns1 socat $ipver STDIO TCP:$daddr:8000 + done + + local -r syn0="$(tc_get_flower_counter $ns1 veth1)" + local -r syn1="$(tc_get_flower_counter $ns1 veth3)" + local -r syns=$((syn0+syn1)) + + [ "$VERBOSE" = "1" ] && echo "multipath: syns seen: ($syn0,$syn1)" + + [[ $syns -ge $num_conn ]] && [[ $syn0 -gt 0 ]] && [[ $syn1 -gt 0 ]] +} + +ipv4_mpath_balance_test() +{ + echo + echo "IPv4 multipath load balance test" + + ip_mpath_balance_dep_check || return 1 + forwarding_setup + + $IP route add 172.16.105.1 \ + nexthop via 172.16.101.2 \ + nexthop via 172.16.103.2 + + ip netns exec $ns1 \ + sysctl -q -w net.ipv4.fib_multipath_hash_policy=1 + + tc_set_flower_counter__saddr_syn $ns1 4 veth1 172.16.101.1 + tc_set_flower_counter__saddr_syn $ns1 4 veth3 172.16.103.1 + + ip_mpath_balance -4 172.16.105.1 + + log_test $? 0 "IPv4 multipath loadbalance" + + forwarding_cleanup +} + +ipv6_mpath_balance_test() +{ + echo + echo "IPv6 multipath load balance test" + + ip_mpath_balance_dep_check || return 1 + forwarding_setup + + $IP route add 2001:db8:105::1\ + nexthop via 2001:db8:101::2 \ + nexthop via 2001:db8:103::2 + + ip netns exec $ns1 \ + sysctl -q -w net.ipv6.fib_multipath_hash_policy=1 + + tc_set_flower_counter__saddr_syn $ns1 6 veth1 2001:db8:101::1 + tc_set_flower_counter__saddr_syn $ns1 6 veth3 2001:db8:103::1 + + ip_mpath_balance -6 "[2001:db8:105::1]" + + log_test $? 0 "IPv6 multipath loadbalance" + + forwarding_cleanup +} + ################################################################################ # usage @@ -2683,6 +2799,8 @@ do fib6_gc_test|ipv6_gc) fib6_gc_test;; ipv4_mpath_list) ipv4_mpath_list_test;; ipv6_mpath_list) ipv6_mpath_list_test;; + ipv4_mpath_balance) ipv4_mpath_balance_test;; + ipv6_mpath_balance) ipv6_mpath_balance_test;; help) echo "Test names: $TESTS"; exit 0;; esac diff --git a/tools/testing/selftests/net/lib.sh b/tools/testing/selftests/net/lib.sh index 701905eeff66..7e1e56318625 100644 --- a/tools/testing/selftests/net/lib.sh +++ b/tools/testing/selftests/net/lib.sh @@ -270,6 +270,30 @@ tc_rule_handle_stats_get() .options.actions[0].stats$selector" } +# attach a qdisc with two children match/no-match and a flower filter to match +tc_set_flower_counter() { + local -r ns=$1 + local -r ipver=$2 + local -r dev=$3 + local -r flower_expr=$4 + + tc -n $ns qdisc add dev $dev root handle 1: prio bands 2 \ + priomap 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 + + tc -n $ns qdisc add dev $dev parent 1:1 handle 11: pfifo + tc -n $ns qdisc add dev $dev parent 1:2 handle 12: pfifo + + tc -n $ns filter add dev $dev parent 1: protocol ipv$ipver \ + flower $flower_expr classid 1:2 +} + +tc_get_flower_counter() { + local -r ns=$1 + local -r dev=$2 + + tc -n $ns -j -s qdisc show dev $dev handle 12: | jq .[0].packets +} + ret_set_ksft_status() { local ksft_status=$1; shift -- 2.50.1 From 6fc54c408dc9103bf98470b3877b741c44642baa Mon Sep 17 00:00:00 2001 From: Kurt Kanzenbach Date: Wed, 19 Mar 2025 11:26:39 +0100 Subject: [PATCH 04/16] igb: Link IRQs to NAPI instances Link IRQs to NAPI instances via netdev-genl API. This allows users to query that information via netlink: |$ ./tools/net/ynl/pyynl/cli.py --spec Documentation/netlink/specs/netdev.yaml \ | --dump napi-get --json='{"ifindex": 2}' |[{'defer-hard-irqs': 0, | 'gro-flush-timeout': 0, | 'id': 8204, | 'ifindex': 2, | 'irq': 127, | 'irq-suspend-timeout': 0}, | {'defer-hard-irqs': 0, | 'gro-flush-timeout': 0, | 'id': 8203, | 'ifindex': 2, | 'irq': 126, | 'irq-suspend-timeout': 0}, | {'defer-hard-irqs': 0, | 'gro-flush-timeout': 0, | 'id': 8202, | 'ifindex': 2, | 'irq': 125, | 'irq-suspend-timeout': 0}, | {'defer-hard-irqs': 0, | 'gro-flush-timeout': 0, | 'id': 8201, | 'ifindex': 2, | 'irq': 124, | 'irq-suspend-timeout': 0}] |$ cat /proc/interrupts | grep enp2s0 |123: 0 1 IR-PCI-MSIX-0000:02:00.0 0-edge enp2s0 |124: 0 7 IR-PCI-MSIX-0000:02:00.0 1-edge enp2s0-TxRx-0 |125: 0 0 IR-PCI-MSIX-0000:02:00.0 2-edge enp2s0-TxRx-1 |126: 0 5 IR-PCI-MSIX-0000:02:00.0 3-edge enp2s0-TxRx-2 |127: 0 0 IR-PCI-MSIX-0000:02:00.0 4-edge enp2s0-TxRx-3 Reviewed-by: Joe Damato Tested-by: Rinitha S (A Contingent worker at Intel) Signed-off-by: Kurt Kanzenbach Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/igb/igb_main.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/net/ethernet/intel/igb/igb_main.c b/drivers/net/ethernet/intel/igb/igb_main.c index c646c71915f0..401af6dce2a8 100644 --- a/drivers/net/ethernet/intel/igb/igb_main.c +++ b/drivers/net/ethernet/intel/igb/igb_main.c @@ -947,6 +947,9 @@ static int igb_request_msix(struct igb_adapter *adapter) q_vector); if (err) goto err_free; + + netif_napi_set_irq(&q_vector->napi, + adapter->msix_entries[vector].vector); } igb_configure_msix(adapter); -- 2.50.1 From b75a1dea500f4397a46e971cd41e613b7f18e317 Mon Sep 17 00:00:00 2001 From: Kurt Kanzenbach Date: Wed, 19 Mar 2025 11:26:40 +0100 Subject: [PATCH 05/16] igb: Link queues to NAPI instances Link queues to NAPI instances via netdev-genl API. This is required to use XDP/ZC busy polling. See commit 5ef44b3cb43b ("xsk: Bring back busy polling support") for details. This also allows users to query the info with netlink: |$ ./tools/net/ynl/pyynl/cli.py --spec Documentation/netlink/specs/netdev.yaml \ | --dump queue-get --json='{"ifindex": 2}' |[{'id': 0, 'ifindex': 2, 'napi-id': 8201, 'type': 'rx'}, | {'id': 1, 'ifindex': 2, 'napi-id': 8202, 'type': 'rx'}, | {'id': 2, 'ifindex': 2, 'napi-id': 8203, 'type': 'rx'}, | {'id': 3, 'ifindex': 2, 'napi-id': 8204, 'type': 'rx'}, | {'id': 0, 'ifindex': 2, 'napi-id': 8201, 'type': 'tx'}, | {'id': 1, 'ifindex': 2, 'napi-id': 8202, 'type': 'tx'}, | {'id': 2, 'ifindex': 2, 'napi-id': 8203, 'type': 'tx'}, | {'id': 3, 'ifindex': 2, 'napi-id': 8204, 'type': 'tx'}] Add rtnl locking to PCI error handlers, because netif_queue_set_napi() requires the lock held. While at __igb_open() use RCT coding style. Signed-off-by: Kurt Kanzenbach Reviewed-by: Joe Damato Tested-by: Sweta Kumari Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/igb/igb.h | 2 ++ drivers/net/ethernet/intel/igb/igb_main.c | 43 ++++++++++++++++++++--- 2 files changed, 40 insertions(+), 5 deletions(-) diff --git a/drivers/net/ethernet/intel/igb/igb.h b/drivers/net/ethernet/intel/igb/igb.h index 02f340280d20..79eca385a751 100644 --- a/drivers/net/ethernet/intel/igb/igb.h +++ b/drivers/net/ethernet/intel/igb/igb.h @@ -722,6 +722,8 @@ enum igb_boards { extern char igb_driver_name[]; +void igb_set_queue_napi(struct igb_adapter *adapter, int q_idx, + struct napi_struct *napi); int igb_xmit_xdp_ring(struct igb_adapter *adapter, struct igb_ring *ring, struct xdp_frame *xdpf); diff --git a/drivers/net/ethernet/intel/igb/igb_main.c b/drivers/net/ethernet/intel/igb/igb_main.c index 401af6dce2a8..0535cc72b11b 100644 --- a/drivers/net/ethernet/intel/igb/igb_main.c +++ b/drivers/net/ethernet/intel/igb/igb_main.c @@ -2099,6 +2099,22 @@ static void igb_check_swap_media(struct igb_adapter *adapter) wr32(E1000_CTRL_EXT, ctrl_ext); } +void igb_set_queue_napi(struct igb_adapter *adapter, int vector, + struct napi_struct *napi) +{ + struct igb_q_vector *q_vector = adapter->q_vector[vector]; + + if (q_vector->rx.ring) + netif_queue_set_napi(adapter->netdev, + q_vector->rx.ring->queue_index, + NETDEV_QUEUE_TYPE_RX, napi); + + if (q_vector->tx.ring) + netif_queue_set_napi(adapter->netdev, + q_vector->tx.ring->queue_index, + NETDEV_QUEUE_TYPE_TX, napi); +} + /** * igb_up - Open the interface and prepare it to handle traffic * @adapter: board private structure @@ -2106,6 +2122,7 @@ static void igb_check_swap_media(struct igb_adapter *adapter) int igb_up(struct igb_adapter *adapter) { struct e1000_hw *hw = &adapter->hw; + struct napi_struct *napi; int i; /* hardware has been reset, we need to reload some things */ @@ -2113,8 +2130,11 @@ int igb_up(struct igb_adapter *adapter) clear_bit(__IGB_DOWN, &adapter->state); - for (i = 0; i < adapter->num_q_vectors; i++) - napi_enable(&(adapter->q_vector[i]->napi)); + for (i = 0; i < adapter->num_q_vectors; i++) { + napi = &adapter->q_vector[i]->napi; + napi_enable(napi); + igb_set_queue_napi(adapter, i, napi); + } if (adapter->flags & IGB_FLAG_HAS_MSIX) igb_configure_msix(adapter); @@ -2184,6 +2204,7 @@ void igb_down(struct igb_adapter *adapter) for (i = 0; i < adapter->num_q_vectors; i++) { if (adapter->q_vector[i]) { napi_synchronize(&adapter->q_vector[i]->napi); + igb_set_queue_napi(adapter, i, NULL); napi_disable(&adapter->q_vector[i]->napi); } } @@ -4116,8 +4137,9 @@ static int igb_sw_init(struct igb_adapter *adapter) static int __igb_open(struct net_device *netdev, bool resuming) { struct igb_adapter *adapter = netdev_priv(netdev); - struct e1000_hw *hw = &adapter->hw; struct pci_dev *pdev = adapter->pdev; + struct e1000_hw *hw = &adapter->hw; + struct napi_struct *napi; int err; int i; @@ -4169,8 +4191,11 @@ static int __igb_open(struct net_device *netdev, bool resuming) /* From here on the code is the same as igb_up() */ clear_bit(__IGB_DOWN, &adapter->state); - for (i = 0; i < adapter->num_q_vectors; i++) - napi_enable(&(adapter->q_vector[i]->napi)); + for (i = 0; i < adapter->num_q_vectors; i++) { + napi = &adapter->q_vector[i]->napi; + napi_enable(napi); + igb_set_queue_napi(adapter, i, napi); + } /* Clear any pending interrupts. */ rd32(E1000_TSICR); @@ -9677,8 +9702,11 @@ static pci_ers_result_t igb_io_error_detected(struct pci_dev *pdev, if (state == pci_channel_io_perm_failure) return PCI_ERS_RESULT_DISCONNECT; + rtnl_lock(); if (netif_running(netdev)) igb_down(adapter); + rtnl_unlock(); + pci_disable_device(pdev); /* Request a slot reset. */ @@ -9737,16 +9765,21 @@ static void igb_io_resume(struct pci_dev *pdev) struct net_device *netdev = pci_get_drvdata(pdev); struct igb_adapter *adapter = netdev_priv(netdev); + rtnl_lock(); if (netif_running(netdev)) { if (!test_bit(__IGB_DOWN, &adapter->state)) { dev_dbg(&pdev->dev, "Resuming from non-fatal error, do nothing.\n"); + rtnl_unlock(); return; } + if (igb_up(adapter)) { dev_err(&pdev->dev, "igb_up failed after reset\n"); + rtnl_unlock(); return; } } + rtnl_unlock(); netif_device_attach(netdev); -- 2.50.1 From fc0fb1f116e97012d90bb77a6032972564ed8fd6 Mon Sep 17 00:00:00 2001 From: Kurt Kanzenbach Date: Wed, 19 Mar 2025 11:26:41 +0100 Subject: [PATCH 06/16] igb: Add support for persistent NAPI config Use netif_napi_add_config() to assign persistent per-NAPI config. This is useful for preserving NAPI settings when changing queue counts or for user space programs using SO_INCOMING_NAPI_ID. Reviewed-by: Joe Damato Reviewed-by: Aleksandr Loktionov Tested-by: Rinitha S (A Contingent worker at Intel) Signed-off-by: Kurt Kanzenbach Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/igb/igb_main.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/intel/igb/igb_main.c b/drivers/net/ethernet/intel/igb/igb_main.c index 0535cc72b11b..d9205573886e 100644 --- a/drivers/net/ethernet/intel/igb/igb_main.c +++ b/drivers/net/ethernet/intel/igb/igb_main.c @@ -1197,7 +1197,8 @@ static int igb_alloc_q_vector(struct igb_adapter *adapter, return -ENOMEM; /* initialize NAPI */ - netif_napi_add(adapter->netdev, &q_vector->napi, igb_poll); + netif_napi_add_config(adapter->netdev, &q_vector->napi, igb_poll, + v_idx); /* tie q_vector and adapter together */ adapter->q_vector[v_idx] = q_vector; -- 2.50.1 From a22ed15c99a052c6ea015b516acfd7e02bcdb995 Mon Sep 17 00:00:00 2001 From: Kurt Kanzenbach Date: Wed, 19 Mar 2025 11:26:42 +0100 Subject: [PATCH 07/16] igb: Get rid of spurious interrupts When running the igc with XDP/ZC in busy polling mode with deferral of hard interrupts, interrupts still happen from time to time. That is caused by the igb task watchdog which triggers Rx interrupts periodically. That mechanism has been introduced to overcome skb/memory allocation failures [1]. So the Rx clean functions stop processing the Rx ring in case of such failure. The task watchdog triggers Rx interrupts periodically in the hope that memory became available in the mean time. The current behavior is undesirable for real time applications, because the driver induced Rx interrupts trigger also the softirq processing. However, all real time packets should be processed by the application which uses the busy polling method. Therefore, only trigger the Rx interrupts in case of real allocation failures. Introduce a new flag for signaling that condition. Follow the same logic as in commit 8dcf2c212078 ("igc: Get rid of spurious interrupts"). [1] - https://git.kernel.org/pub/scm/linux/kernel/git/tglx/history.git/commit/?id=3be507547e6177e5c808544bd6a2efa2c7f1d436 Reviewed-by: Joe Damato Reviewed-by: Aleksandr Loktionov Tested-by: Sweta Kumari Signed-off-by: Kurt Kanzenbach Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/igb/igb.h | 3 ++- drivers/net/ethernet/intel/igb/igb_main.c | 29 +++++++++++++++++++---- drivers/net/ethernet/intel/igb/igb_xsk.c | 1 + 3 files changed, 28 insertions(+), 5 deletions(-) diff --git a/drivers/net/ethernet/intel/igb/igb.h b/drivers/net/ethernet/intel/igb/igb.h index 79eca385a751..f34ead8243e9 100644 --- a/drivers/net/ethernet/intel/igb/igb.h +++ b/drivers/net/ethernet/intel/igb/igb.h @@ -391,7 +391,8 @@ enum e1000_ring_flags_t { IGB_RING_FLAG_RX_LB_VLAN_BSWAP, IGB_RING_FLAG_TX_CTX_IDX, IGB_RING_FLAG_TX_DETECT_HANG, - IGB_RING_FLAG_TX_DISABLED + IGB_RING_FLAG_TX_DISABLED, + IGB_RING_FLAG_RX_ALLOC_FAILED, }; #define ring_uses_large_buffer(ring) \ diff --git a/drivers/net/ethernet/intel/igb/igb_main.c b/drivers/net/ethernet/intel/igb/igb_main.c index d9205573886e..9e9a5900e6e5 100644 --- a/drivers/net/ethernet/intel/igb/igb_main.c +++ b/drivers/net/ethernet/intel/igb/igb_main.c @@ -5755,11 +5755,29 @@ no_wait: if (adapter->flags & IGB_FLAG_HAS_MSIX) { u32 eics = 0; - for (i = 0; i < adapter->num_q_vectors; i++) - eics |= adapter->q_vector[i]->eims_value; - wr32(E1000_EICS, eics); + for (i = 0; i < adapter->num_q_vectors; i++) { + struct igb_q_vector *q_vector = adapter->q_vector[i]; + struct igb_ring *rx_ring; + + if (!q_vector->rx.ring) + continue; + + rx_ring = adapter->rx_ring[q_vector->rx.ring->queue_index]; + + if (test_bit(IGB_RING_FLAG_RX_ALLOC_FAILED, &rx_ring->flags)) { + eics |= q_vector->eims_value; + clear_bit(IGB_RING_FLAG_RX_ALLOC_FAILED, &rx_ring->flags); + } + } + if (eics) + wr32(E1000_EICS, eics); } else { - wr32(E1000_ICS, E1000_ICS_RXDMT0); + struct igb_ring *rx_ring = adapter->rx_ring[0]; + + if (test_bit(IGB_RING_FLAG_RX_ALLOC_FAILED, &rx_ring->flags)) { + clear_bit(IGB_RING_FLAG_RX_ALLOC_FAILED, &rx_ring->flags); + wr32(E1000_ICS, E1000_ICS_RXDMT0); + } } igb_spoof_check(adapter); @@ -9090,6 +9108,7 @@ static int igb_clean_rx_irq(struct igb_q_vector *q_vector, const int budget) if (!xdp_res && !skb) { rx_ring->rx_stats.alloc_failed++; rx_buffer->pagecnt_bias++; + set_bit(IGB_RING_FLAG_RX_ALLOC_FAILED, &rx_ring->flags); break; } @@ -9149,6 +9168,7 @@ static bool igb_alloc_mapped_page(struct igb_ring *rx_ring, page = dev_alloc_pages(igb_rx_pg_order(rx_ring)); if (unlikely(!page)) { rx_ring->rx_stats.alloc_failed++; + set_bit(IGB_RING_FLAG_RX_ALLOC_FAILED, &rx_ring->flags); return false; } @@ -9165,6 +9185,7 @@ static bool igb_alloc_mapped_page(struct igb_ring *rx_ring, __free_pages(page, igb_rx_pg_order(rx_ring)); rx_ring->rx_stats.alloc_failed++; + set_bit(IGB_RING_FLAG_RX_ALLOC_FAILED, &rx_ring->flags); return false; } diff --git a/drivers/net/ethernet/intel/igb/igb_xsk.c b/drivers/net/ethernet/intel/igb/igb_xsk.c index 157d43787fa0..5cf67ba29269 100644 --- a/drivers/net/ethernet/intel/igb/igb_xsk.c +++ b/drivers/net/ethernet/intel/igb/igb_xsk.c @@ -415,6 +415,7 @@ int igb_clean_rx_irq_zc(struct igb_q_vector *q_vector, /* exit if we failed to retrieve a buffer */ if (!skb) { rx_ring->rx_stats.alloc_failed++; + set_bit(IGB_RING_FLAG_RX_ALLOC_FAILED, &rx_ring->flags); break; } -- 2.50.1 From 68f37f26b0ff28c93d94084178b946136da6d140 Mon Sep 17 00:00:00 2001 From: Kurt Kanzenbach Date: Fri, 21 Mar 2025 14:52:38 +0100 Subject: [PATCH 08/16] igc: Limit netdev_tc calls to MQPRIO Limit netdev_tc calls to MQPRIO. Currently these calls are made in igc_tsn_enable_offload() and igc_tsn_disable_offload() which are used by TAPRIO and ETF as well. However, these are only required for MQPRIO. Signed-off-by: Kurt Kanzenbach Reviewed-by: Simon Horman Tested-by: Mor Bar-Gabay Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/igc/igc_main.c | 18 +++++++++++++++++- drivers/net/ethernet/intel/igc/igc_tsn.c | 20 -------------------- 2 files changed, 17 insertions(+), 21 deletions(-) diff --git a/drivers/net/ethernet/intel/igc/igc_main.c b/drivers/net/ethernet/intel/igc/igc_main.c index 5b06765a35e9..27575a1e1777 100644 --- a/drivers/net/ethernet/intel/igc/igc_main.c +++ b/drivers/net/ethernet/intel/igc/igc_main.c @@ -6730,13 +6730,14 @@ static int igc_tsn_enable_mqprio(struct igc_adapter *adapter, struct tc_mqprio_qopt_offload *mqprio) { struct igc_hw *hw = &adapter->hw; - int i; + int err, i; if (hw->mac.type != igc_i225) return -EOPNOTSUPP; if (!mqprio->qopt.num_tc) { adapter->strict_priority_enable = false; + netdev_reset_tc(adapter->netdev); goto apply; } @@ -6767,6 +6768,21 @@ static int igc_tsn_enable_mqprio(struct igc_adapter *adapter, igc_save_mqprio_params(adapter, mqprio->qopt.num_tc, mqprio->qopt.offset); + err = netdev_set_num_tc(adapter->netdev, adapter->num_tc); + if (err) + return err; + + for (i = 0; i < adapter->num_tc; i++) { + err = netdev_set_tc_queue(adapter->netdev, i, 1, + adapter->queue_per_tc[i]); + if (err) + return err; + } + + /* In case the card is configured with less than four queues. */ + for (; i < IGC_MAX_TX_QUEUES; i++) + adapter->queue_per_tc[i] = i; + mqprio->qopt.hw = TC_MQPRIO_HW_OFFLOAD_TCS; apply: diff --git a/drivers/net/ethernet/intel/igc/igc_tsn.c b/drivers/net/ethernet/intel/igc/igc_tsn.c index b23bf0be007d..cbc49cdcaf6b 100644 --- a/drivers/net/ethernet/intel/igc/igc_tsn.c +++ b/drivers/net/ethernet/intel/igc/igc_tsn.c @@ -320,9 +320,6 @@ static int igc_tsn_disable_offload(struct igc_adapter *adapter) wr32(IGC_QBVCYCLET_S, 0); wr32(IGC_QBVCYCLET, NSEC_PER_SEC); - /* Reset mqprio TC configuration. */ - netdev_reset_tc(adapter->netdev); - /* Restore the default Tx arbitration: Priority 0 has the highest * priority and is assigned to queue 0 and so on and so forth. */ @@ -394,23 +391,6 @@ static int igc_tsn_enable_offload(struct igc_adapter *adapter) igc_tsn_set_retx_qbvfullthreshold(adapter); if (adapter->strict_priority_enable) { - int err; - - err = netdev_set_num_tc(adapter->netdev, adapter->num_tc); - if (err) - return err; - - for (i = 0; i < adapter->num_tc; i++) { - err = netdev_set_tc_queue(adapter->netdev, i, 1, - adapter->queue_per_tc[i]); - if (err) - return err; - } - - /* In case the card is configured with less than four queues. */ - for (; i < IGC_MAX_TX_QUEUES; i++) - adapter->queue_per_tc[i] = i; - /* Configure queue priorities according to the user provided * mapping. */ -- 2.50.1 From 876863c3fc755730c942b4c5e1297255d81787e6 Mon Sep 17 00:00:00 2001 From: Kurt Kanzenbach Date: Fri, 21 Mar 2025 14:52:39 +0100 Subject: [PATCH 09/16] igc: Change Tx mode for MQPRIO offloading The current MQPRIO offload implementation uses the legacy TSN Tx mode. In this mode the hardware uses four packet buffers and considers queue priorities. In order to harmonize the TAPRIO implementation with MQPRIO, switch to the regular TSN Tx mode. This mode also uses four packet buffers and considers queue priorities. In addition to the legacy mode, transmission is always coupled to Qbv. The driver already has mechanisms to use a dummy schedule of 1 second with all gates open for ETF. Simply use this for MQPRIO too. This reduces code and makes it easier to add support for frame preemption later. Tested on i225 with real time application using high priority queue, iperf3 using low priority queue and network TAP device. Acked-by: Faizal Rahim Signed-off-by: Kurt Kanzenbach Reviewed-by: Simon Horman Tested-by: Mor Bar-Gabay Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/igc/igc.h | 5 ++--- drivers/net/ethernet/intel/igc/igc_tsn.c | 19 ++----------------- 2 files changed, 4 insertions(+), 20 deletions(-) diff --git a/drivers/net/ethernet/intel/igc/igc.h b/drivers/net/ethernet/intel/igc/igc.h index 9bd243f23b46..859a15e4ccba 100644 --- a/drivers/net/ethernet/intel/igc/igc.h +++ b/drivers/net/ethernet/intel/igc/igc.h @@ -394,12 +394,11 @@ extern char igc_driver_name[]; #define IGC_FLAG_RX_LEGACY BIT(16) #define IGC_FLAG_TSN_QBV_ENABLED BIT(17) #define IGC_FLAG_TSN_QAV_ENABLED BIT(18) -#define IGC_FLAG_TSN_LEGACY_ENABLED BIT(19) -#define IGC_FLAG_TSN_PREEMPT_ENABLED BIT(20) +#define IGC_FLAG_TSN_PREEMPT_ENABLED BIT(19) #define IGC_FLAG_TSN_ANY_ENABLED \ (IGC_FLAG_TSN_QBV_ENABLED | IGC_FLAG_TSN_QAV_ENABLED | \ - IGC_FLAG_TSN_LEGACY_ENABLED | IGC_FLAG_TSN_PREEMPT_ENABLED) + IGC_FLAG_TSN_PREEMPT_ENABLED) #define IGC_FLAG_RSS_FIELD_IPV4_UDP BIT(6) #define IGC_FLAG_RSS_FIELD_IPV6_UDP BIT(7) diff --git a/drivers/net/ethernet/intel/igc/igc_tsn.c b/drivers/net/ethernet/intel/igc/igc_tsn.c index cbc49cdcaf6b..f22cc4d4f459 100644 --- a/drivers/net/ethernet/intel/igc/igc_tsn.c +++ b/drivers/net/ethernet/intel/igc/igc_tsn.c @@ -171,18 +171,14 @@ static unsigned int igc_tsn_new_flags(struct igc_adapter *adapter) { unsigned int new_flags = adapter->flags & ~IGC_FLAG_TSN_ANY_ENABLED; - if (adapter->taprio_offload_enable) - new_flags |= IGC_FLAG_TSN_QBV_ENABLED; - if (is_any_launchtime(adapter)) + if (adapter->taprio_offload_enable || is_any_launchtime(adapter) || + adapter->strict_priority_enable) new_flags |= IGC_FLAG_TSN_QBV_ENABLED; if (is_cbs_enabled(adapter)) new_flags |= IGC_FLAG_TSN_QAV_ENABLED; - if (adapter->strict_priority_enable) - new_flags |= IGC_FLAG_TSN_LEGACY_ENABLED; - if (adapter->fpe.mmsv.pmac_enabled) new_flags |= IGC_FLAG_TSN_PREEMPT_ENABLED; @@ -326,7 +322,6 @@ static int igc_tsn_disable_offload(struct igc_adapter *adapter) igc_tsn_tx_arb(adapter, queue_per_tc); adapter->flags &= ~IGC_FLAG_TSN_QBV_ENABLED; - adapter->flags &= ~IGC_FLAG_TSN_LEGACY_ENABLED; return 0; } @@ -395,16 +390,6 @@ static int igc_tsn_enable_offload(struct igc_adapter *adapter) * mapping. */ igc_tsn_tx_arb(adapter, adapter->queue_per_tc); - - /* Enable legacy TSN mode which will do strict priority without - * any other TSN features. - */ - tqavctrl = rd32(IGC_TQAVCTRL); - tqavctrl |= IGC_TQAVCTRL_TRANSMIT_MODE_TSN; - tqavctrl &= ~IGC_TQAVCTRL_ENHANCED_QAV; - wr32(IGC_TQAVCTRL, tqavctrl); - - return 0; } for (i = 0; i < adapter->num_tx_queues; i++) { -- 2.50.1 From 462cc09ac37ddde5a01477613220f0271fe49291 Mon Sep 17 00:00:00 2001 From: Jedrzej Jagielski Date: Mon, 3 Mar 2025 13:06:27 +0100 Subject: [PATCH 10/16] ixgbe: create E610 specific ethtool_ops structure E610's implementation of various ethtool ops is different than the ones corresponding to ixgbe legacy products. Therefore create separate E610 ethtool_ops struct which will be filled out in the forthcoming patches. Add adequate ops struct basing on MAC type. This step requires changing a bit the flow of probing by placing ixgbe_set_ethtool_ops after hw.mac.type is assigned. So move the whole netdev assignment block after hw.mac.type is known. This step doesn't have any additional impact on probing sequence. Suggested-by: Aleksandr Loktionov Reviewed-by: Aleksandr Loktionov Signed-off-by: Jedrzej Jagielski Reviewed-by: Simon Horman Tested-by: Bharath R Signed-off-by: Tony Nguyen --- .../net/ethernet/intel/ixgbe/ixgbe_ethtool.c | 52 ++++++++++++++++++- drivers/net/ethernet/intel/ixgbe/ixgbe_main.c | 10 ++-- 2 files changed, 56 insertions(+), 6 deletions(-) diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_ethtool.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_ethtool.c index c86103eccc8a..83d9ee3941e5 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_ethtool.c +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_ethtool.c @@ -3650,7 +3650,57 @@ static const struct ethtool_ops ixgbe_ethtool_ops = { .set_link_ksettings = ixgbe_set_link_ksettings, }; +static const struct ethtool_ops ixgbe_ethtool_ops_e610 = { + .supported_coalesce_params = ETHTOOL_COALESCE_USECS, + .get_drvinfo = ixgbe_get_drvinfo, + .get_regs_len = ixgbe_get_regs_len, + .get_regs = ixgbe_get_regs, + .get_wol = ixgbe_get_wol, + .set_wol = ixgbe_set_wol, + .nway_reset = ixgbe_nway_reset, + .get_link = ethtool_op_get_link, + .get_eeprom_len = ixgbe_get_eeprom_len, + .get_eeprom = ixgbe_get_eeprom, + .set_eeprom = ixgbe_set_eeprom, + .get_ringparam = ixgbe_get_ringparam, + .set_ringparam = ixgbe_set_ringparam, + .get_pause_stats = ixgbe_get_pause_stats, + .get_pauseparam = ixgbe_get_pauseparam, + .set_pauseparam = ixgbe_set_pauseparam, + .get_msglevel = ixgbe_get_msglevel, + .set_msglevel = ixgbe_set_msglevel, + .self_test = ixgbe_diag_test, + .get_strings = ixgbe_get_strings, + .set_phys_id = ixgbe_set_phys_id, + .get_sset_count = ixgbe_get_sset_count, + .get_ethtool_stats = ixgbe_get_ethtool_stats, + .get_coalesce = ixgbe_get_coalesce, + .set_coalesce = ixgbe_set_coalesce, + .get_rxnfc = ixgbe_get_rxnfc, + .set_rxnfc = ixgbe_set_rxnfc, + .get_rxfh_indir_size = ixgbe_rss_indir_size, + .get_rxfh_key_size = ixgbe_get_rxfh_key_size, + .get_rxfh = ixgbe_get_rxfh, + .set_rxfh = ixgbe_set_rxfh, + .get_eee = ixgbe_get_eee, + .set_eee = ixgbe_set_eee, + .get_channels = ixgbe_get_channels, + .set_channels = ixgbe_set_channels, + .get_priv_flags = ixgbe_get_priv_flags, + .set_priv_flags = ixgbe_set_priv_flags, + .get_ts_info = ixgbe_get_ts_info, + .get_module_info = ixgbe_get_module_info, + .get_module_eeprom = ixgbe_get_module_eeprom, + .get_link_ksettings = ixgbe_get_link_ksettings, + .set_link_ksettings = ixgbe_set_link_ksettings, +}; + void ixgbe_set_ethtool_ops(struct net_device *netdev) { - netdev->ethtool_ops = &ixgbe_ethtool_ops; + struct ixgbe_adapter *adapter = ixgbe_from_netdev(netdev); + + if (adapter->hw.mac.type == ixgbe_mac_e610) + netdev->ethtool_ops = &ixgbe_ethtool_ops_e610; + else + netdev->ethtool_ops = &ixgbe_ethtool_ops; } diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c index cdfafc477ee0..40daab34e4fe 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c @@ -11433,11 +11433,6 @@ static int ixgbe_probe(struct pci_dev *pdev, const struct pci_device_id *ent) goto err_ioremap; } - netdev->netdev_ops = &ixgbe_netdev_ops; - ixgbe_set_ethtool_ops(netdev); - netdev->watchdog_timeo = 5 * HZ; - strscpy(netdev->name, pci_name(pdev), sizeof(netdev->name)); - /* Setup hw api */ hw->mac.ops = *ii->mac_ops; hw->mac.type = ii->mac; @@ -11467,6 +11462,11 @@ static int ixgbe_probe(struct pci_dev *pdev, const struct pci_device_id *ent) hw->phy.mdio.mdio_read = ixgbe_mdio_read; hw->phy.mdio.mdio_write = ixgbe_mdio_write; + netdev->netdev_ops = &ixgbe_netdev_ops; + ixgbe_set_ethtool_ops(netdev); + netdev->watchdog_timeo = 5 * HZ; + strscpy(netdev->name, pci_name(pdev), sizeof(netdev->name)); + /* setup the private structure */ err = ixgbe_sw_init(adapter, ii); if (err) -- 2.50.1 From 451c6bc923e2217d1085e8c9ce88fe1b7a30a1db Mon Sep 17 00:00:00 2001 From: Jedrzej Jagielski Date: Mon, 3 Mar 2025 13:06:28 +0100 Subject: [PATCH 11/16] ixgbe: add support for ACPI WOL for E610 Currently only APM (Advanced Power Management) is supported by the ixgbe driver. It works for magic packets only, as for different sources of wake-up E610 adapter utilizes different feature. Add E610 specific implementation of ixgbe_set_wol() callback. When any of broadcast/multicast/unicast wake-up is set, disable APM and configure ACPI (Advanced Configuration and Power Interface). Reviewed-by: Michal Swiatkowski Reviewed-by: Aleksandr Loktionov Signed-off-by: Jedrzej Jagielski Reviewed-by: Simon Horman Tested-by: Bharath R Signed-off-by: Tony Nguyen --- .../net/ethernet/intel/ixgbe/ixgbe_ethtool.c | 46 ++++++++++++++++++- 1 file changed, 45 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_ethtool.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_ethtool.c index 83d9ee3941e5..abc8c279192a 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_ethtool.c +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_ethtool.c @@ -2365,6 +2365,50 @@ static int ixgbe_set_wol(struct net_device *netdev, struct ethtool_wolinfo *wol) return 0; } +static int ixgbe_set_wol_acpi(struct net_device *netdev, + struct ethtool_wolinfo *wol) +{ + struct ixgbe_adapter *adapter = ixgbe_from_netdev(netdev); + struct ixgbe_hw *hw = &adapter->hw; + u32 grc; + + if (ixgbe_wol_exclusion(adapter, wol)) + return wol->wolopts ? -EOPNOTSUPP : 0; + + /* disable APM wakeup */ + grc = IXGBE_READ_REG(hw, IXGBE_GRC_X550EM_a); + grc &= ~IXGBE_GRC_APME; + IXGBE_WRITE_REG(hw, IXGBE_GRC_X550EM_a, grc); + + /* erase existing filters */ + IXGBE_WRITE_REG(hw, IXGBE_WUFC, 0); + adapter->wol = 0; + + if (wol->wolopts & WAKE_UCAST) + adapter->wol |= IXGBE_WUFC_EX; + if (wol->wolopts & WAKE_MCAST) + adapter->wol |= IXGBE_WUFC_MC; + if (wol->wolopts & WAKE_BCAST) + adapter->wol |= IXGBE_WUFC_BC; + + IXGBE_WRITE_REG(hw, IXGBE_WUC, IXGBE_WUC_PME_EN); + IXGBE_WRITE_REG(hw, IXGBE_WUFC, adapter->wol); + + hw->wol_enabled = adapter->wol; + device_set_wakeup_enable(&adapter->pdev->dev, adapter->wol); + + return 0; +} + +static int ixgbe_set_wol_e610(struct net_device *netdev, + struct ethtool_wolinfo *wol) +{ + if (wol->wolopts & (WAKE_UCAST | WAKE_MCAST | WAKE_BCAST)) + return ixgbe_set_wol_acpi(netdev, wol); + else + return ixgbe_set_wol(netdev, wol); +} + static int ixgbe_nway_reset(struct net_device *netdev) { struct ixgbe_adapter *adapter = ixgbe_from_netdev(netdev); @@ -3656,7 +3700,7 @@ static const struct ethtool_ops ixgbe_ethtool_ops_e610 = { .get_regs_len = ixgbe_get_regs_len, .get_regs = ixgbe_get_regs, .get_wol = ixgbe_get_wol, - .set_wol = ixgbe_set_wol, + .set_wol = ixgbe_set_wol_e610, .nway_reset = ixgbe_nway_reset, .get_link = ethtool_op_get_link, .get_eeprom_len = ixgbe_get_eeprom_len, -- 2.50.1 From 7f58648dbc53826d9480e45468717466d5b37343 Mon Sep 17 00:00:00 2001 From: Jedrzej Jagielski Date: Mon, 3 Mar 2025 13:06:29 +0100 Subject: [PATCH 12/16] ixgbe: apply different rules for setting FC on E610 E610 device doesn't support disabling FC autonegotiation. Create dedicated E610 .set_pauseparam() implementation and assign it to ixgbe_ethtool_ops_e610. Reviewed-by: Aleksandr Loktionov Signed-off-by: Jedrzej Jagielski Reviewed-by: Simon Horman Tested-by: Bharath R Signed-off-by: Tony Nguyen --- .../net/ethernet/intel/ixgbe/ixgbe_ethtool.c | 57 ++++++++++++++++--- 1 file changed, 49 insertions(+), 8 deletions(-) diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_ethtool.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_ethtool.c index abc8c279192a..435f3fc3cec3 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_ethtool.c +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_ethtool.c @@ -564,6 +564,22 @@ static void ixgbe_get_pauseparam(struct net_device *netdev, } } +static void ixgbe_set_pauseparam_finalize(struct net_device *netdev, + struct ixgbe_fc_info *fc) +{ + struct ixgbe_adapter *adapter = ixgbe_from_netdev(netdev); + struct ixgbe_hw *hw = &adapter->hw; + + /* If the thing changed then we'll update and use new autoneg. */ + if (memcmp(fc, &hw->fc, sizeof(*fc))) { + hw->fc = *fc; + if (netif_running(netdev)) + ixgbe_reinit_locked(adapter); + else + ixgbe_reset(adapter); + } +} + static int ixgbe_set_pauseparam(struct net_device *netdev, struct ethtool_pauseparam *pause) { @@ -592,15 +608,40 @@ static int ixgbe_set_pauseparam(struct net_device *netdev, else fc.requested_mode = ixgbe_fc_none; - /* if the thing changed then we'll update and use new autoneg */ - if (memcmp(&fc, &hw->fc, sizeof(struct ixgbe_fc_info))) { - hw->fc = fc; - if (netif_running(netdev)) - ixgbe_reinit_locked(adapter); - else - ixgbe_reset(adapter); + ixgbe_set_pauseparam_finalize(netdev, &fc); + + return 0; +} + +static int ixgbe_set_pauseparam_e610(struct net_device *netdev, + struct ethtool_pauseparam *pause) +{ + struct ixgbe_adapter *adapter = ixgbe_from_netdev(netdev); + struct ixgbe_hw *hw = &adapter->hw; + struct ixgbe_fc_info fc = hw->fc; + + if (!ixgbe_device_supports_autoneg_fc(hw)) + return -EOPNOTSUPP; + + if (pause->autoneg == AUTONEG_DISABLE) { + netdev_info(netdev, + "Cannot disable autonegotiation on this device.\n"); + return -EOPNOTSUPP; } + fc.disable_fc_autoneg = false; + + if (pause->rx_pause && pause->tx_pause) + fc.requested_mode = ixgbe_fc_full; + else if (pause->rx_pause) + fc.requested_mode = ixgbe_fc_rx_pause; + else if (pause->tx_pause) + fc.requested_mode = ixgbe_fc_tx_pause; + else + fc.requested_mode = ixgbe_fc_none; + + ixgbe_set_pauseparam_finalize(netdev, &fc); + return 0; } @@ -3710,7 +3751,7 @@ static const struct ethtool_ops ixgbe_ethtool_ops_e610 = { .set_ringparam = ixgbe_set_ringparam, .get_pause_stats = ixgbe_get_pause_stats, .get_pauseparam = ixgbe_get_pauseparam, - .set_pauseparam = ixgbe_set_pauseparam, + .set_pauseparam = ixgbe_set_pauseparam_e610, .get_msglevel = ixgbe_get_msglevel, .set_msglevel = ixgbe_set_msglevel, .self_test = ixgbe_diag_test, -- 2.50.1 From 4bf2d11902efe9a3699cb0339f6a34cdef4e9aa6 Mon Sep 17 00:00:00 2001 From: Jedrzej Jagielski Date: Mon, 3 Mar 2025 13:06:30 +0100 Subject: [PATCH 13/16] ixgbe: add E610 .set_phys_id() callback implementation Legacy implementation of .set_phys_id() ethtool callback is not applicable for E610 device. Add new implementation which uses 0x06E9 command by calling ixgbe_aci_set_port_id_led(). Reviewed-by: Aleksandr Loktionov Reviewed-by: Michal Swiatkowski Signed-off-by: Jedrzej Jagielski Reviewed-by: Simon Horman Tested-by: Bharath R Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/ixgbe/ixgbe_e610.c | 29 +++++++++++++++++++ drivers/net/ethernet/intel/ixgbe/ixgbe_e610.h | 1 + .../net/ethernet/intel/ixgbe/ixgbe_ethtool.c | 22 +++++++++++++- .../ethernet/intel/ixgbe/ixgbe_type_e610.h | 14 +++++++++ 4 files changed, 65 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_e610.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_e610.c index 9ada35f7d8f7..71ea25de1bac 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_e610.c +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_e610.c @@ -1484,6 +1484,35 @@ static int ixgbe_start_hw_e610(struct ixgbe_hw *hw) return 0; } +/** + * ixgbe_aci_set_port_id_led - set LED value for the given port + * @hw: pointer to the HW struct + * @orig_mode: set LED original mode + * + * Set LED value for the given port (0x06E9) + * + * Return: the exit code of the operation. + */ +int ixgbe_aci_set_port_id_led(struct ixgbe_hw *hw, bool orig_mode) +{ + struct ixgbe_aci_cmd_set_port_id_led *cmd; + struct ixgbe_aci_desc desc; + + cmd = &desc.params.set_port_id_led; + + ixgbe_fill_dflt_direct_cmd_desc(&desc, ixgbe_aci_opc_set_port_id_led); + + cmd->lport_num = (u8)hw->bus.func; + cmd->lport_num_valid = IXGBE_ACI_PORT_ID_PORT_NUM_VALID; + + if (orig_mode) + cmd->ident_mode = IXGBE_ACI_PORT_IDENT_LED_ORIG; + else + cmd->ident_mode = IXGBE_ACI_PORT_IDENT_LED_BLINK; + + return ixgbe_aci_send_cmd(hw, &desc, NULL, 0); +} + /** * ixgbe_get_media_type_e610 - Gets media type * @hw: pointer to the HW struct diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_e610.h b/drivers/net/ethernet/intel/ixgbe/ixgbe_e610.h index 30bc1f1b2549..bb31d65bd1c8 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_e610.h +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_e610.h @@ -36,6 +36,7 @@ int ixgbe_aci_get_link_info(struct ixgbe_hw *hw, bool ena_lse, struct ixgbe_link_status *link); int ixgbe_aci_set_event_mask(struct ixgbe_hw *hw, u8 port_num, u16 mask); int ixgbe_configure_lse(struct ixgbe_hw *hw, bool activate, u16 mask); +int ixgbe_aci_set_port_id_led(struct ixgbe_hw *hw, bool orig_mode); enum ixgbe_media_type ixgbe_get_media_type_e610(struct ixgbe_hw *hw); int ixgbe_setup_link_e610(struct ixgbe_hw *hw, ixgbe_link_speed speed, bool autoneg_wait); diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_ethtool.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_ethtool.c index 435f3fc3cec3..d8a919ab7027 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_ethtool.c +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_ethtool.c @@ -2491,6 +2491,26 @@ static int ixgbe_set_phys_id(struct net_device *netdev, return 0; } +static int ixgbe_set_phys_id_e610(struct net_device *netdev, + enum ethtool_phys_id_state state) +{ + struct ixgbe_adapter *adapter = ixgbe_from_netdev(netdev); + bool led_active; + + switch (state) { + case ETHTOOL_ID_ACTIVE: + led_active = true; + break; + case ETHTOOL_ID_INACTIVE: + led_active = false; + break; + default: + return -EOPNOTSUPP; + } + + return ixgbe_aci_set_port_id_led(&adapter->hw, !led_active); +} + static int ixgbe_get_coalesce(struct net_device *netdev, struct ethtool_coalesce *ec, struct kernel_ethtool_coalesce *kernel_coal, @@ -3756,7 +3776,7 @@ static const struct ethtool_ops ixgbe_ethtool_ops_e610 = { .set_msglevel = ixgbe_set_msglevel, .self_test = ixgbe_diag_test, .get_strings = ixgbe_get_strings, - .set_phys_id = ixgbe_set_phys_id, + .set_phys_id = ixgbe_set_phys_id_e610, .get_sset_count = ixgbe_get_sset_count, .get_ethtool_stats = ixgbe_get_ethtool_stats, .get_coalesce = ixgbe_get_coalesce, diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_type_e610.h b/drivers/net/ethernet/intel/ixgbe/ixgbe_type_e610.h index bea94e5ccb73..09df67f03cf4 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_type_e610.h +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_type_e610.h @@ -223,6 +223,7 @@ enum ixgbe_aci_opc { ixgbe_aci_opc_write_mdio = 0x06E5, ixgbe_aci_opc_set_gpio_by_func = 0x06E6, ixgbe_aci_opc_get_gpio_by_func = 0x06E7, + ixgbe_aci_opc_set_port_id_led = 0x06E9, ixgbe_aci_opc_set_gpio = 0x06EC, ixgbe_aci_opc_get_gpio = 0x06ED, ixgbe_aci_opc_sff_eeprom = 0x06EE, @@ -808,6 +809,18 @@ struct ixgbe_aci_cmd_get_link_topo_pin { u8 rsvd[7]; }; +/* Set Port Identification LED (direct, 0x06E9) */ +struct ixgbe_aci_cmd_set_port_id_led { + u8 lport_num; + u8 lport_num_valid; + u8 ident_mode; + u8 rsvd[13]; +}; + +#define IXGBE_ACI_PORT_ID_PORT_NUM_VALID BIT(0) +#define IXGBE_ACI_PORT_IDENT_LED_ORIG 0 +#define IXGBE_ACI_PORT_IDENT_LED_BLINK BIT(0) + /* Read/Write SFF EEPROM command (indirect 0x06EE) */ struct ixgbe_aci_cmd_sff_eeprom { u8 lport_num; @@ -985,6 +998,7 @@ struct ixgbe_aci_desc { struct ixgbe_aci_cmd_restart_an restart_an; struct ixgbe_aci_cmd_get_link_status get_link_status; struct ixgbe_aci_cmd_set_event_mask set_event_mask; + struct ixgbe_aci_cmd_set_port_id_led set_port_id_led; struct ixgbe_aci_cmd_get_link_topo get_link_topo; struct ixgbe_aci_cmd_get_link_topo_pin get_link_topo_pin; struct ixgbe_aci_cmd_sff_eeprom read_write_sff_param; -- 2.50.1 From fe259a1bb26ec78842c975d992331705b0c2c2e8 Mon Sep 17 00:00:00 2001 From: Slawomir Mrozowicz Date: Fri, 11 Apr 2025 15:06:26 +0200 Subject: [PATCH 14/16] ixgbe: devlink: add devlink region support for E610 Provide support for the following devlink cmds: -DEVLINK_CMD_REGION_GET -DEVLINK_CMD_REGION_NEW -DEVLINK_CMD_REGION_DEL -DEVLINK_CMD_REGION_READ ixgbe devlink region implementation, similarly to the ice one, lets user to create snapshots of content of Non Volatile Memory, content of Shadow RAM, and capabilities of the device. For both NVM and SRAM regions provide .read() handler to let user read their contents without the need to create full snapshots. Reviewed-by: Przemek Kitszel Signed-off-by: Slawomir Mrozowicz Co-developed-by: Jedrzej Jagielski Signed-off-by: Jedrzej Jagielski Tested-by: Bharath R Signed-off-by: Tony Nguyen --- Documentation/networking/devlink/ixgbe.rst | 49 +++ drivers/net/ethernet/intel/ixgbe/Makefile | 3 +- .../ethernet/intel/ixgbe/devlink/devlink.h | 2 + .../net/ethernet/intel/ixgbe/devlink/region.c | 290 ++++++++++++++++++ drivers/net/ethernet/intel/ixgbe/ixgbe.h | 3 + drivers/net/ethernet/intel/ixgbe/ixgbe_main.c | 3 + 6 files changed, 349 insertions(+), 1 deletion(-) create mode 100644 drivers/net/ethernet/intel/ixgbe/devlink/region.c diff --git a/Documentation/networking/devlink/ixgbe.rst b/Documentation/networking/devlink/ixgbe.rst index 3fce291348fa..c27d1436c70e 100644 --- a/Documentation/networking/devlink/ixgbe.rst +++ b/Documentation/networking/devlink/ixgbe.rst @@ -120,3 +120,52 @@ EMP firmware image. The driver does not currently support reloading the driver via ``DEVLINK_RELOAD_ACTION_DRIVER_REINIT``. + +Regions +======= + +The ``ixgbe`` driver implements the following regions for accessing internal +device data. + +.. list-table:: regions implemented + :widths: 15 85 + + * - Name + - Description + * - ``nvm-flash`` + - The contents of the entire flash chip, sometimes referred to as + the device's Non Volatile Memory. + * - ``shadow-ram`` + - The contents of the Shadow RAM, which is loaded from the beginning + of the flash. Although the contents are primarily from the flash, + this area also contains data generated during device boot which is + not stored in flash. + * - ``device-caps`` + - The contents of the device firmware's capabilities buffer. Useful to + determine the current state and configuration of the device. + +Both the ``nvm-flash`` and ``shadow-ram`` regions can be accessed without a +snapshot. The ``device-caps`` region requires a snapshot as the contents are +sent by firmware and can't be split into separate reads. + +Users can request an immediate capture of a snapshot for all three regions +via the ``DEVLINK_CMD_REGION_NEW`` command. + +.. code:: shell + + $ devlink region show + pci/0000:01:00.0/nvm-flash: size 10485760 snapshot [] max 1 + pci/0000:01:00.0/device-caps: size 4096 snapshot [] max 10 + + $ devlink region new pci/0000:01:00.0/nvm-flash snapshot 1 + + $ devlink region dump pci/0000:01:00.0/nvm-flash snapshot 1 + 0000000000000000 0014 95dc 0014 9514 0035 1670 0034 db30 + 0000000000000010 0000 0000 ffff ff04 0029 8c00 0028 8cc8 + 0000000000000020 0016 0bb8 0016 1720 0000 0000 c00f 3ffc + 0000000000000030 bada cce5 bada cce5 bada cce5 bada cce5 + + $ devlink region read pci/0000:01:00.0/nvm-flash snapshot 1 address 0 length 16 + 0000000000000000 0014 95dc 0014 9514 0035 1670 0034 db30 + + $ devlink region delete pci/0000:01:00.0/device-caps snapshot 1 diff --git a/drivers/net/ethernet/intel/ixgbe/Makefile b/drivers/net/ethernet/intel/ixgbe/Makefile index ce447540d146..2e7738f41c58 100644 --- a/drivers/net/ethernet/intel/ixgbe/Makefile +++ b/drivers/net/ethernet/intel/ixgbe/Makefile @@ -10,7 +10,8 @@ obj-$(CONFIG_IXGBE) += ixgbe.o ixgbe-y := ixgbe_main.o ixgbe_common.o ixgbe_ethtool.o \ ixgbe_82599.o ixgbe_82598.o ixgbe_phy.o ixgbe_sriov.o \ ixgbe_mbx.o ixgbe_x540.o ixgbe_x550.o ixgbe_lib.o ixgbe_ptp.o \ - ixgbe_xsk.o ixgbe_e610.o devlink/devlink.o ixgbe_fw_update.o + ixgbe_xsk.o ixgbe_e610.o devlink/devlink.o ixgbe_fw_update.o \ + devlink/region.o ixgbe-$(CONFIG_IXGBE_DCB) += ixgbe_dcb.o ixgbe_dcb_82598.o \ ixgbe_dcb_82599.o ixgbe_dcb_nl.o diff --git a/drivers/net/ethernet/intel/ixgbe/devlink/devlink.h b/drivers/net/ethernet/intel/ixgbe/devlink/devlink.h index 0b27653a3407..381558058048 100644 --- a/drivers/net/ethernet/intel/ixgbe/devlink/devlink.h +++ b/drivers/net/ethernet/intel/ixgbe/devlink/devlink.h @@ -6,5 +6,7 @@ struct ixgbe_adapter *ixgbe_allocate_devlink(struct device *dev); int ixgbe_devlink_register_port(struct ixgbe_adapter *adapter); +void ixgbe_devlink_init_regions(struct ixgbe_adapter *adapter); +void ixgbe_devlink_destroy_regions(struct ixgbe_adapter *adapter); #endif /* _IXGBE_DEVLINK_H_ */ diff --git a/drivers/net/ethernet/intel/ixgbe/devlink/region.c b/drivers/net/ethernet/intel/ixgbe/devlink/region.c new file mode 100644 index 000000000000..76f6571c3c34 --- /dev/null +++ b/drivers/net/ethernet/intel/ixgbe/devlink/region.c @@ -0,0 +1,290 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2025, Intel Corporation. */ + +#include "ixgbe.h" +#include "devlink.h" + +#define IXGBE_DEVLINK_READ_BLK_SIZE (1024 * 1024) + +static const struct devlink_region_ops ixgbe_nvm_region_ops; +static const struct devlink_region_ops ixgbe_sram_region_ops; + +static int ixgbe_devlink_parse_region(struct ixgbe_hw *hw, + const struct devlink_region_ops *ops, + bool *read_shadow_ram, u32 *nvm_size) +{ + if (ops == &ixgbe_nvm_region_ops) { + *read_shadow_ram = false; + *nvm_size = hw->flash.flash_size; + } else if (ops == &ixgbe_sram_region_ops) { + *read_shadow_ram = true; + *nvm_size = hw->flash.sr_words * 2u; + } else { + return -EOPNOTSUPP; + } + + return 0; +} + +/** + * ixgbe_devlink_nvm_snapshot - Capture a snapshot of the NVM content + * @devlink: the devlink instance + * @ops: the devlink region being snapshotted + * @extack: extended ACK response structure + * @data: on exit points to snapshot data buffer + * + * This function is called in response to the DEVLINK_CMD_REGION_NEW cmd. + * + * Capture a snapshot of the whole requested NVM region. + * + * No need to worry with freeing @data, devlink core takes care if it. + * + * Return: 0 on success, -EOPNOTSUPP for unsupported regions, -EBUSY when + * cannot lock NVM, -ENOMEM when cannot alloc mem and -EIO when error + * occurs during reading. + */ +static int ixgbe_devlink_nvm_snapshot(struct devlink *devlink, + const struct devlink_region_ops *ops, + struct netlink_ext_ack *extack, u8 **data) +{ + struct ixgbe_adapter *adapter = devlink_priv(devlink); + struct ixgbe_hw *hw = &adapter->hw; + bool read_shadow_ram; + u8 *nvm_data, *buf; + u32 nvm_size, left; + u8 num_blks; + int err; + + err = ixgbe_devlink_parse_region(hw, ops, &read_shadow_ram, &nvm_size); + if (err) + return err; + + nvm_data = kvzalloc(nvm_size, GFP_KERNEL); + if (!nvm_data) + return -ENOMEM; + + num_blks = DIV_ROUND_UP(nvm_size, IXGBE_DEVLINK_READ_BLK_SIZE); + buf = nvm_data; + left = nvm_size; + + for (int i = 0; i < num_blks; i++) { + u32 read_sz = min_t(u32, IXGBE_DEVLINK_READ_BLK_SIZE, left); + + /* Need to acquire NVM lock during each loop run because the + * total period of reading whole NVM is longer than the maximum + * period the lock can be taken defined by the IXGBE_NVM_TIMEOUT. + */ + err = ixgbe_acquire_nvm(hw, IXGBE_RES_READ); + if (err) { + NL_SET_ERR_MSG_MOD(extack, + "Failed to acquire NVM semaphore"); + kvfree(nvm_data); + return -EBUSY; + } + + err = ixgbe_read_flat_nvm(hw, i * IXGBE_DEVLINK_READ_BLK_SIZE, + &read_sz, buf, read_shadow_ram); + if (err) { + NL_SET_ERR_MSG_MOD(extack, + "Failed to read RAM content"); + ixgbe_release_nvm(hw); + kvfree(nvm_data); + return -EIO; + } + + ixgbe_release_nvm(hw); + + buf += read_sz; + left -= read_sz; + } + + *data = nvm_data; + return 0; +} + +/** + * ixgbe_devlink_devcaps_snapshot - Capture a snapshot of device capabilities + * @devlink: the devlink instance + * @ops: the devlink region being snapshotted + * @extack: extended ACK response structure + * @data: on exit points to snapshot data buffer + * + * This function is called in response to the DEVLINK_CMD_REGION_NEW for + * the device-caps devlink region. + * + * Capture a snapshot of the device capabilities reported by firmware. + * + * No need to worry with freeing @data, devlink core takes care if it. + * + * Return: 0 on success, -ENOMEM when cannot alloc mem, or return code of + * the reading operation. + */ +static int ixgbe_devlink_devcaps_snapshot(struct devlink *devlink, + const struct devlink_region_ops *ops, + struct netlink_ext_ack *extack, + u8 **data) +{ + struct ixgbe_adapter *adapter = devlink_priv(devlink); + struct ixgbe_aci_cmd_list_caps_elem *caps; + struct ixgbe_hw *hw = &adapter->hw; + int err; + + caps = kvzalloc(IXGBE_ACI_MAX_BUFFER_SIZE, GFP_KERNEL); + if (!caps) + return -ENOMEM; + + err = ixgbe_aci_list_caps(hw, caps, IXGBE_ACI_MAX_BUFFER_SIZE, NULL, + ixgbe_aci_opc_list_dev_caps); + if (err) { + NL_SET_ERR_MSG_MOD(extack, + "Failed to read device capabilities"); + kvfree(caps); + return err; + } + + *data = (u8 *)caps; + return 0; +} + +/** + * ixgbe_devlink_nvm_read - Read a portion of NVM flash content + * @devlink: the devlink instance + * @ops: the devlink region to snapshot + * @extack: extended ACK response structure + * @offset: the offset to start at + * @size: the amount to read + * @data: the data buffer to read into + * + * This function is called in response to DEVLINK_CMD_REGION_READ to directly + * read a section of the NVM contents. + * + * Read from either the nvm-flash region either shadow-ram region. + * + * Return: 0 on success, -EOPNOTSUPP for unsupported regions, -EBUSY when + * cannot lock NVM, -ERANGE when buffer limit exceeded and -EIO when error + * occurs during reading. + */ +static int ixgbe_devlink_nvm_read(struct devlink *devlink, + const struct devlink_region_ops *ops, + struct netlink_ext_ack *extack, + u64 offset, u32 size, u8 *data) +{ + struct ixgbe_adapter *adapter = devlink_priv(devlink); + struct ixgbe_hw *hw = &adapter->hw; + bool read_shadow_ram; + u32 nvm_size; + int err; + + err = ixgbe_devlink_parse_region(hw, ops, &read_shadow_ram, &nvm_size); + if (err) + return err; + + if (offset + size > nvm_size) { + NL_SET_ERR_MSG_MOD(extack, "Cannot read beyond the region size"); + return -ERANGE; + } + + err = ixgbe_acquire_nvm(hw, IXGBE_RES_READ); + if (err) { + NL_SET_ERR_MSG_MOD(extack, "Failed to acquire NVM semaphore"); + return -EBUSY; + } + + err = ixgbe_read_flat_nvm(hw, (u32)offset, &size, data, read_shadow_ram); + if (err) { + NL_SET_ERR_MSG_MOD(extack, "Failed to read NVM contents"); + ixgbe_release_nvm(hw); + return -EIO; + } + + ixgbe_release_nvm(hw); + return 0; +} + +static const struct devlink_region_ops ixgbe_nvm_region_ops = { + .name = "nvm-flash", + .destructor = kvfree, + .snapshot = ixgbe_devlink_nvm_snapshot, + .read = ixgbe_devlink_nvm_read, +}; + +static const struct devlink_region_ops ixgbe_sram_region_ops = { + .name = "shadow-ram", + .destructor = kvfree, + .snapshot = ixgbe_devlink_nvm_snapshot, + .read = ixgbe_devlink_nvm_read, +}; + +static const struct devlink_region_ops ixgbe_devcaps_region_ops = { + .name = "device-caps", + .destructor = kvfree, + .snapshot = ixgbe_devlink_devcaps_snapshot, +}; + +/** + * ixgbe_devlink_init_regions - Initialize devlink regions + * @adapter: adapter instance + * + * Create devlink regions used to enable access to dump the contents of the + * flash memory of the device. + */ +void ixgbe_devlink_init_regions(struct ixgbe_adapter *adapter) +{ + struct devlink *devlink = adapter->devlink; + struct device *dev = &adapter->pdev->dev; + u64 nvm_size, sram_size; + + if (adapter->hw.mac.type != ixgbe_mac_e610) + return; + + nvm_size = adapter->hw.flash.flash_size; + adapter->nvm_region = devl_region_create(devlink, &ixgbe_nvm_region_ops, + 1, nvm_size); + if (IS_ERR(adapter->nvm_region)) { + dev_err(dev, + "Failed to create NVM devlink region, err %ld\n", + PTR_ERR(adapter->nvm_region)); + adapter->nvm_region = NULL; + } + + sram_size = adapter->hw.flash.sr_words * 2u; + adapter->sram_region = devl_region_create(devlink, &ixgbe_sram_region_ops, + 1, sram_size); + if (IS_ERR(adapter->sram_region)) { + dev_err(dev, + "Failed to create shadow-ram devlink region, err %ld\n", + PTR_ERR(adapter->sram_region)); + adapter->sram_region = NULL; + } + + adapter->devcaps_region = devl_region_create(devlink, + &ixgbe_devcaps_region_ops, + 10, IXGBE_ACI_MAX_BUFFER_SIZE); + if (IS_ERR(adapter->devcaps_region)) { + dev_err(dev, + "Failed to create device-caps devlink region, err %ld\n", + PTR_ERR(adapter->devcaps_region)); + adapter->devcaps_region = NULL; + } +} + +/** + * ixgbe_devlink_destroy_regions - Destroy devlink regions + * @adapter: adapter instance + * + * Remove previously created regions for this adapter instance. + */ +void ixgbe_devlink_destroy_regions(struct ixgbe_adapter *adapter) +{ + if (adapter->hw.mac.type != ixgbe_mac_e610) + return; + + if (adapter->nvm_region) + devl_region_destroy(adapter->nvm_region); + + if (adapter->sram_region) + devl_region_destroy(adapter->sram_region); + + if (adapter->devcaps_region) + devl_region_destroy(adapter->devcaps_region); +} diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe.h b/drivers/net/ethernet/intel/ixgbe/ixgbe.h index 23c2e2c2649c..47311b134a7a 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe.h +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe.h @@ -616,6 +616,9 @@ struct ixgbe_adapter { struct mii_bus *mii_bus; struct devlink *devlink; struct devlink_port devlink_port; + struct devlink_region *nvm_region; + struct devlink_region *sram_region; + struct devlink_region *devcaps_region; unsigned long state; diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c index 40daab34e4fe..03d31e5b131d 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c @@ -11317,6 +11317,7 @@ static int ixgbe_recovery_probe(struct ixgbe_adapter *adapter) ixgbe_devlink_register_port(adapter); SET_NETDEV_DEVLINK_PORT(adapter->netdev, &adapter->devlink_port); + ixgbe_devlink_init_regions(adapter); devl_register(adapter->devlink); devl_unlock(adapter->devlink); @@ -11824,6 +11825,7 @@ skip_sriov: if (err) goto err_netdev; + ixgbe_devlink_init_regions(adapter); devl_register(adapter->devlink); devl_unlock(adapter->devlink); return 0; @@ -11882,6 +11884,7 @@ static void ixgbe_remove(struct pci_dev *pdev) netdev = adapter->netdev; devl_lock(adapter->devlink); devl_unregister(adapter->devlink); + ixgbe_devlink_destroy_regions(adapter); ixgbe_dbg_adapter_exit(adapter); set_bit(__IXGBE_REMOVING, &adapter->state); -- 2.50.1 From 508d374b8dc018a72f2d77dbd5658e0d2e576679 Mon Sep 17 00:00:00 2001 From: Mateusz Polchlopek Date: Sat, 1 Mar 2025 20:02:44 +0100 Subject: [PATCH 15/16] idpf: assign extracted ptype to struct libeth_rqe_info field Assign the ptype extracted from qword to the ptype field of struct libeth_rqe_info. Remove the now excess ptype param of idpf_rx_singleq_extract_fields(), idpf_rx_singleq_extract_base_fields() and idpf_rx_singleq_extract_flex_fields(). Suggested-by: Alexander Lobakin Reviewed-by: Przemek Kitszel Reviewed-by: Alexander Lobakin Signed-off-by: Mateusz Polchlopek Tested-by: Samuel Salin Signed-off-by: Tony Nguyen --- .../ethernet/intel/idpf/idpf_singleq_txrx.c | 25 ++++++++----------- 1 file changed, 11 insertions(+), 14 deletions(-) diff --git a/drivers/net/ethernet/intel/idpf/idpf_singleq_txrx.c b/drivers/net/ethernet/intel/idpf/idpf_singleq_txrx.c index eae1b6f474e6..2e356dd10812 100644 --- a/drivers/net/ethernet/intel/idpf/idpf_singleq_txrx.c +++ b/drivers/net/ethernet/intel/idpf/idpf_singleq_txrx.c @@ -891,7 +891,6 @@ bool idpf_rx_singleq_buf_hw_alloc_all(struct idpf_rx_queue *rx_q, * idpf_rx_singleq_extract_base_fields - Extract fields from the Rx descriptor * @rx_desc: the descriptor to process * @fields: storage for extracted values - * @ptype: pointer that will store packet type * * Decode the Rx descriptor and extract relevant information including the * size and Rx packet type. @@ -901,21 +900,20 @@ bool idpf_rx_singleq_buf_hw_alloc_all(struct idpf_rx_queue *rx_q, */ static void idpf_rx_singleq_extract_base_fields(const union virtchnl2_rx_desc *rx_desc, - struct libeth_rqe_info *fields, u32 *ptype) + struct libeth_rqe_info *fields) { u64 qword; qword = le64_to_cpu(rx_desc->base_wb.qword1.status_error_ptype_len); fields->len = FIELD_GET(VIRTCHNL2_RX_BASE_DESC_QW1_LEN_PBUF_M, qword); - *ptype = FIELD_GET(VIRTCHNL2_RX_BASE_DESC_QW1_PTYPE_M, qword); + fields->ptype = FIELD_GET(VIRTCHNL2_RX_BASE_DESC_QW1_PTYPE_M, qword); } /** * idpf_rx_singleq_extract_flex_fields - Extract fields from the Rx descriptor * @rx_desc: the descriptor to process * @fields: storage for extracted values - * @ptype: pointer that will store packet type * * Decode the Rx descriptor and extract relevant information including the * size and Rx packet type. @@ -925,12 +923,12 @@ idpf_rx_singleq_extract_base_fields(const union virtchnl2_rx_desc *rx_desc, */ static void idpf_rx_singleq_extract_flex_fields(const union virtchnl2_rx_desc *rx_desc, - struct libeth_rqe_info *fields, u32 *ptype) + struct libeth_rqe_info *fields) { fields->len = FIELD_GET(VIRTCHNL2_RX_FLEX_DESC_PKT_LEN_M, le16_to_cpu(rx_desc->flex_nic_wb.pkt_len)); - *ptype = FIELD_GET(VIRTCHNL2_RX_FLEX_DESC_PTYPE_M, - le16_to_cpu(rx_desc->flex_nic_wb.ptype_flex_flags0)); + fields->ptype = FIELD_GET(VIRTCHNL2_RX_FLEX_DESC_PTYPE_M, + le16_to_cpu(rx_desc->flex_nic_wb.ptype_flex_flags0)); } /** @@ -938,18 +936,17 @@ idpf_rx_singleq_extract_flex_fields(const union virtchnl2_rx_desc *rx_desc, * @rx_q: Rx descriptor queue * @rx_desc: the descriptor to process * @fields: storage for extracted values - * @ptype: pointer that will store packet type * */ static void idpf_rx_singleq_extract_fields(const struct idpf_rx_queue *rx_q, const union virtchnl2_rx_desc *rx_desc, - struct libeth_rqe_info *fields, u32 *ptype) + struct libeth_rqe_info *fields) { if (rx_q->rxdids == VIRTCHNL2_RXDID_1_32B_BASE_M) - idpf_rx_singleq_extract_base_fields(rx_desc, fields, ptype); + idpf_rx_singleq_extract_base_fields(rx_desc, fields); else - idpf_rx_singleq_extract_flex_fields(rx_desc, fields, ptype); + idpf_rx_singleq_extract_flex_fields(rx_desc, fields); } /** @@ -972,7 +969,6 @@ static int idpf_rx_singleq_clean(struct idpf_rx_queue *rx_q, int budget) struct libeth_rqe_info fields = { }; union virtchnl2_rx_desc *rx_desc; struct idpf_rx_buf *rx_buf; - u32 ptype; /* get the Rx desc from Rx queue based on 'next_to_clean' */ rx_desc = &rx_q->rx[ntc]; @@ -993,7 +989,7 @@ static int idpf_rx_singleq_clean(struct idpf_rx_queue *rx_q, int budget) */ dma_rmb(); - idpf_rx_singleq_extract_fields(rx_q, rx_desc, &fields, &ptype); + idpf_rx_singleq_extract_fields(rx_q, rx_desc, &fields); rx_buf = &rx_q->rx_buf[ntc]; if (!libeth_rx_sync_for_cpu(rx_buf, fields.len)) @@ -1037,7 +1033,8 @@ skip_data: total_rx_bytes += skb->len; /* protocol */ - idpf_rx_singleq_process_skb_fields(rx_q, skb, rx_desc, ptype); + idpf_rx_singleq_process_skb_fields(rx_q, skb, rx_desc, + fields.ptype); /* send completed skb up the stack */ napi_gro_receive(rx_q->pp->p.napi, skb); -- 2.50.1 From c058c5f8b6e424261c4974b39f2596e9d521f13f Mon Sep 17 00:00:00 2001 From: Michal Swiatkowski Date: Wed, 9 Apr 2025 08:29:45 +0200 Subject: [PATCH 16/16] idpf: remove unreachable code from setting mailbox Remove code that isn't reached. There is no need to check for adapter->req_vec_chunks, because if it isn't set idpf_set_mb_vec_id() won't be called. Only one path when idpf_set_mb_vec_id() is called: idpf_intr_req() -> idpf_send_alloc_vectors_msg() -> adapter->req_vec_chunk is allocated here, otherwise an error is returned and idpf_intr_req() exits with an error. The idpf_set_mb_vec_id() becomes one-liner and it is called only once. Remove it and set mailbox vector index directly. Reviewed-by: Aleksandr Loktionov Reviewed-by: Michal Kubiak Reviewed-by: Pavan Kumar Linga Signed-off-by: Michal Swiatkowski Reviewed-by: Simon Horman Tested-by: Samuel Salin Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/idpf/idpf_lib.c | 18 +----------------- 1 file changed, 1 insertion(+), 17 deletions(-) diff --git a/drivers/net/ethernet/intel/idpf/idpf_lib.c b/drivers/net/ethernet/intel/idpf/idpf_lib.c index aa755dedb41d..0e428dc476ed 100644 --- a/drivers/net/ethernet/intel/idpf/idpf_lib.c +++ b/drivers/net/ethernet/intel/idpf/idpf_lib.c @@ -143,22 +143,6 @@ static int idpf_mb_intr_req_irq(struct idpf_adapter *adapter) return 0; } -/** - * idpf_set_mb_vec_id - Set vector index for mailbox - * @adapter: adapter structure to access the vector chunks - * - * The first vector id in the requested vector chunks from the CP is for - * the mailbox - */ -static void idpf_set_mb_vec_id(struct idpf_adapter *adapter) -{ - if (adapter->req_vec_chunks) - adapter->mb_vector.v_idx = - le16_to_cpu(adapter->caps.mailbox_vector_id); - else - adapter->mb_vector.v_idx = 0; -} - /** * idpf_mb_intr_init - Initialize the mailbox interrupt * @adapter: adapter structure to store the mailbox vector @@ -349,7 +333,7 @@ int idpf_intr_req(struct idpf_adapter *adapter) goto free_irq; } - idpf_set_mb_vec_id(adapter); + adapter->mb_vector.v_idx = le16_to_cpu(adapter->caps.mailbox_vector_id); vecids = kcalloc(total_vecs, sizeof(u16), GFP_KERNEL); if (!vecids) { -- 2.50.1