From 3595599fa8360bb3c7afa7ee50c810b4a64106ea Mon Sep 17 00:00:00 2001 From: =?utf8?q?Toke=20H=C3=B8iland-J=C3=B8rgensen?= Date: Mon, 27 Jan 2025 14:13:42 +0100 Subject: [PATCH 01/16] net: xdp: Disallow attaching device-bound programs in generic mode MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit Device-bound programs are used to support RX metadata kfuncs. These kfuncs are driver-specific and rely on the driver context to read the metadata. This means they can't work in generic XDP mode. However, there is no check to disallow such programs from being attached in generic mode, in which case the metadata kfuncs will be called in an invalid context, leading to crashes. Fix this by adding a check to disallow attaching device-bound programs in generic mode. Fixes: 2b3486bc2d23 ("bpf: Introduce device-bound XDP programs") Reported-by: Marcus Wichelmann Closes: https://lore.kernel.org/r/dae862ec-43b5-41a0-8edf-46c59071cdda@hetzner-cloud.de Tested-by: Marcus Wichelmann Acked-by: Stanislav Fomichev Signed-off-by: Toke Høiland-Jørgensen Acked-by: Daniel Borkmann Acked-by: Martin KaFai Lau Link: https://patch.msgid.link/20250127131344.238147-1-toke@redhat.com Signed-off-by: Jakub Kicinski --- net/core/dev.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/net/core/dev.c b/net/core/dev.c index 07b2bb1ce64f..02cd75b512f9 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -9924,6 +9924,10 @@ static int dev_xdp_attach(struct net_device *dev, struct netlink_ext_ack *extack NL_SET_ERR_MSG(extack, "Program bound to different device"); return -EINVAL; } + if (bpf_prog_is_dev_bound(new_prog->aux) && mode == XDP_MODE_SKB) { + NL_SET_ERR_MSG(extack, "Can't attach device-bound programs in generic mode"); + return -EINVAL; + } if (new_prog->expected_attach_type == BPF_XDP_DEVMAP) { NL_SET_ERR_MSG(extack, "BPF_XDP_DEVMAP programs can not be attached to a device"); return -EINVAL; -- 2.51.0 From f7bf624b1fedf232195804ac0f6584cb3e4b86bc Mon Sep 17 00:00:00 2001 From: =?utf8?q?Toke=20H=C3=B8iland-J=C3=B8rgensen?= Date: Mon, 27 Jan 2025 14:13:43 +0100 Subject: [PATCH 02/16] selftests/net: Add test for loading devbound XDP program in generic mode MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit Add a test to bpf_offload.py for loading a devbound XDP program in generic mode, checking that it fails correctly. Signed-off-by: Toke Høiland-Jørgensen Acked-by: Stanislav Fomichev Link: https://patch.msgid.link/20250127131344.238147-2-toke@redhat.com Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/bpf_offload.py | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/net/bpf_offload.py b/tools/testing/selftests/net/bpf_offload.py index d10f420e4ef6..fd0d959914e4 100755 --- a/tools/testing/selftests/net/bpf_offload.py +++ b/tools/testing/selftests/net/bpf_offload.py @@ -215,12 +215,14 @@ def bpftool_map_list_wait(expected=0, n_retry=20, ns=""): raise Exception("Time out waiting for map counts to stabilize want %d, have %d" % (expected, nmaps)) def bpftool_prog_load(sample, file_name, maps=[], prog_type="xdp", dev=None, - fail=True, include_stderr=False): + fail=True, include_stderr=False, dev_bind=None): args = "prog load %s %s" % (os.path.join(bpf_test_dir, sample), file_name) if prog_type is not None: args += " type " + prog_type if dev is not None: args += " dev " + dev + elif dev_bind is not None: + args += " xdpmeta_dev " + dev_bind if len(maps): args += " map " + " map ".join(maps) @@ -980,6 +982,16 @@ try: rm("/sys/fs/bpf/offload") sim.wait_for_flush() + bpftool_prog_load("sample_ret0.bpf.o", "/sys/fs/bpf/devbound", + dev_bind=sim['ifname']) + devbound = bpf_pinned("/sys/fs/bpf/devbound") + start_test("Test dev-bound program in generic mode...") + ret, _, err = sim.set_xdp(devbound, "generic", fail=False, include_stderr=True) + fail(ret == 0, "devbound program in generic mode allowed") + check_extack(err, "Can't attach device-bound programs in generic mode.", args) + rm("/sys/fs/bpf/devbound") + sim.wait_for_flush() + start_test("Test XDP load failure...") sim.dfs["dev/bpf_bind_verifier_accept"] = 0 ret, _, err = bpftool_prog_load("sample_ret0.bpf.o", "/sys/fs/bpf/offload", -- 2.51.0 From 2c2ebb2b49573e5f8726112ad06b1dffc3c9ea03 Mon Sep 17 00:00:00 2001 From: Kory Maincent Date: Wed, 29 Jan 2025 10:50:46 +0100 Subject: [PATCH 03/16] net: ravb: Fix missing rtnl lock in suspend/resume path MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit Fix the suspend/resume path by ensuring the rtnl lock is held where required. Calls to ravb_open, ravb_close and wol operations must be performed under the rtnl lock to prevent conflicts with ongoing ndo operations. Without this fix, the following warning is triggered: [ 39.032969] ============================= [ 39.032983] WARNING: suspicious RCU usage [ 39.033019] ----------------------------- [ 39.033033] drivers/net/phy/phy_device.c:2004 suspicious rcu_dereference_protected() usage! ... [ 39.033597] stack backtrace: [ 39.033613] CPU: 0 UID: 0 PID: 174 Comm: python3 Not tainted 6.13.0-rc7-next-20250116-arm64-renesas-00002-g35245dfdc62c #7 [ 39.033623] Hardware name: Renesas SMARC EVK version 2 based on r9a08g045s33 (DT) [ 39.033628] Call trace: [ 39.033633] show_stack+0x14/0x1c (C) [ 39.033652] dump_stack_lvl+0xb4/0xc4 [ 39.033664] dump_stack+0x14/0x1c [ 39.033671] lockdep_rcu_suspicious+0x16c/0x22c [ 39.033682] phy_detach+0x160/0x190 [ 39.033694] phy_disconnect+0x40/0x54 [ 39.033703] ravb_close+0x6c/0x1cc [ 39.033714] ravb_suspend+0x48/0x120 [ 39.033721] dpm_run_callback+0x4c/0x14c [ 39.033731] device_suspend+0x11c/0x4dc [ 39.033740] dpm_suspend+0xdc/0x214 [ 39.033748] dpm_suspend_start+0x48/0x60 [ 39.033758] suspend_devices_and_enter+0x124/0x574 [ 39.033769] pm_suspend+0x1ac/0x274 [ 39.033778] state_store+0x88/0x124 [ 39.033788] kobj_attr_store+0x14/0x24 [ 39.033798] sysfs_kf_write+0x48/0x6c [ 39.033808] kernfs_fop_write_iter+0x118/0x1a8 [ 39.033817] vfs_write+0x27c/0x378 [ 39.033825] ksys_write+0x64/0xf4 [ 39.033833] __arm64_sys_write+0x18/0x20 [ 39.033841] invoke_syscall+0x44/0x104 [ 39.033852] el0_svc_common.constprop.0+0xb4/0xd4 [ 39.033862] do_el0_svc+0x18/0x20 [ 39.033870] el0_svc+0x3c/0xf0 [ 39.033880] el0t_64_sync_handler+0xc0/0xc4 [ 39.033888] el0t_64_sync+0x154/0x158 [ 39.041274] ravb 11c30000.ethernet eth0: Link is Down Reported-by: Claudiu Beznea Closes: https://lore.kernel.org/netdev/4c6419d8-c06b-495c-b987-d66c2e1ff848@tuxon.dev/ Fixes: 0184165b2f42 ("ravb: add sleep PM suspend/resume support") Signed-off-by: Kory Maincent Tested-by: Niklas Söderlund Signed-off-by: Paolo Abeni --- drivers/net/ethernet/renesas/ravb_main.c | 22 ++++++++++++++-------- 1 file changed, 14 insertions(+), 8 deletions(-) diff --git a/drivers/net/ethernet/renesas/ravb_main.c b/drivers/net/ethernet/renesas/ravb_main.c index bc395294a32d..c9f4976a3527 100644 --- a/drivers/net/ethernet/renesas/ravb_main.c +++ b/drivers/net/ethernet/renesas/ravb_main.c @@ -3217,10 +3217,15 @@ static int ravb_suspend(struct device *dev) netif_device_detach(ndev); - if (priv->wol_enabled) - return ravb_wol_setup(ndev); + rtnl_lock(); + if (priv->wol_enabled) { + ret = ravb_wol_setup(ndev); + rtnl_unlock(); + return ret; + } ret = ravb_close(ndev); + rtnl_unlock(); if (ret) return ret; @@ -3245,19 +3250,20 @@ static int ravb_resume(struct device *dev) if (!netif_running(ndev)) return 0; + rtnl_lock(); /* If WoL is enabled restore the interface. */ - if (priv->wol_enabled) { + if (priv->wol_enabled) ret = ravb_wol_restore(ndev); - if (ret) - return ret; - } else { + else ret = pm_runtime_force_resume(dev); - if (ret) - return ret; + if (ret) { + rtnl_unlock(); + return ret; } /* Reopening the interface will restore the device to the working state. */ ret = ravb_open(ndev); + rtnl_unlock(); if (ret < 0) goto out_rpm_put; -- 2.51.0 From b95102215a8d0987789715ce11c0d4ec031cbfbe Mon Sep 17 00:00:00 2001 From: Kory Maincent Date: Wed, 29 Jan 2025 10:50:47 +0100 Subject: [PATCH 04/16] net: sh_eth: Fix missing rtnl lock in suspend/resume path MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit Fix the suspend/resume path by ensuring the rtnl lock is held where required. Calls to sh_eth_close, sh_eth_open and wol operations must be performed under the rtnl lock to prevent conflicts with ongoing ndo operations. Fixes: b71af04676e9 ("sh_eth: add more PM methods") Tested-by: Niklas Söderlund Reviewed-by: Sergey Shtylyov Signed-off-by: Kory Maincent Signed-off-by: Paolo Abeni --- drivers/net/ethernet/renesas/sh_eth.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/net/ethernet/renesas/sh_eth.c b/drivers/net/ethernet/renesas/sh_eth.c index 8887b8921009..5fc8027c92c7 100644 --- a/drivers/net/ethernet/renesas/sh_eth.c +++ b/drivers/net/ethernet/renesas/sh_eth.c @@ -3494,10 +3494,12 @@ static int sh_eth_suspend(struct device *dev) netif_device_detach(ndev); + rtnl_lock(); if (mdp->wol_enabled) ret = sh_eth_wol_setup(ndev); else ret = sh_eth_close(ndev); + rtnl_unlock(); return ret; } @@ -3511,10 +3513,12 @@ static int sh_eth_resume(struct device *dev) if (!netif_running(ndev)) return 0; + rtnl_lock(); if (mdp->wol_enabled) ret = sh_eth_wol_restore(ndev); else ret = sh_eth_open(ndev); + rtnl_unlock(); if (ret < 0) return ret; -- 2.51.0 From 1b9335a8000fb70742f7db10af314104b6ace220 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Tue, 28 Jan 2025 12:26:33 +0100 Subject: [PATCH 05/16] netfilter: nf_tables: reject mismatching sum of field_len with set key length The field length description provides the length of each separated key field in the concatenation, each field gets rounded up to 32-bits to calculate the pipapo rule width from pipapo_init(). The set key length provides the total size of the key aligned to 32-bits. Register-based arithmetics still allows for combining mismatching set key length and field length description, eg. set key length 10 and field description [ 5, 4 ] leading to pipapo width of 12. Cc: stable@vger.kernel.org Fixes: 3ce67e3793f4 ("netfilter: nf_tables: do not allow mismatch field size and set key length") Reported-by: Noam Rathaus Reviewed-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_tables_api.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index c4af283356e7..e5662dc087c8 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -5065,7 +5065,7 @@ static int nft_set_desc_concat_parse(const struct nlattr *attr, static int nft_set_desc_concat(struct nft_set_desc *desc, const struct nlattr *nla) { - u32 num_regs = 0, key_num_regs = 0; + u32 len = 0, num_regs; struct nlattr *attr; int rem, err, i; @@ -5079,12 +5079,12 @@ static int nft_set_desc_concat(struct nft_set_desc *desc, } for (i = 0; i < desc->field_count; i++) - num_regs += DIV_ROUND_UP(desc->field_len[i], sizeof(u32)); + len += round_up(desc->field_len[i], sizeof(u32)); - key_num_regs = DIV_ROUND_UP(desc->klen, sizeof(u32)); - if (key_num_regs != num_regs) + if (len != desc->klen) return -EINVAL; + num_regs = DIV_ROUND_UP(desc->klen, sizeof(u32)); if (num_regs > NFT_REG32_COUNT) return -E2BIG; -- 2.51.0 From e598d8981fd34470b78a1ae777dbf131b15d5bf2 Mon Sep 17 00:00:00 2001 From: "Matthieu Baerts (NGI0)" Date: Wed, 29 Jan 2025 13:24:32 +0100 Subject: [PATCH 06/16] mptcp: blackhole only if 1st SYN retrans w/o MPC is accepted The Fixes commit mentioned this: > An MPTCP firewall blackhole can be detected if the following SYN > retransmission after a fallback to "plain" TCP is accepted. But in fact, this blackhole was detected if any following SYN retransmissions after a fallback to TCP was accepted. That's because 'mptcp_subflow_early_fallback()' will set 'request_mptcp' to 0, and 'mpc_drop' will never be reset to 0 after. This is an issue, because some not so unusual situations might cause the kernel to detect a false-positive blackhole, e.g. a client trying to connect to a server while the network is not ready yet, causing a few SYN retransmissions, before reaching the end server. Fixes: 27069e7cb3d1 ("mptcp: disable active MPTCP in case of blackhole") Cc: stable@vger.kernel.org Reviewed-by: Mat Martineau Signed-off-by: Matthieu Baerts (NGI0) Signed-off-by: Paolo Abeni --- net/mptcp/ctrl.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/mptcp/ctrl.c b/net/mptcp/ctrl.c index 3999e0ba2c35..2dd81e6c26bd 100644 --- a/net/mptcp/ctrl.c +++ b/net/mptcp/ctrl.c @@ -418,9 +418,9 @@ void mptcp_active_detect_blackhole(struct sock *ssk, bool expired) MPTCP_INC_STATS(net, MPTCP_MIB_MPCAPABLEACTIVEDROP); subflow->mpc_drop = 1; mptcp_subflow_early_fallback(mptcp_sk(subflow->conn), subflow); - } else { - subflow->mpc_drop = 0; } + } else if (ssk->sk_state == TCP_SYN_SENT) { + subflow->mpc_drop = 0; } } -- 2.51.0 From 18da4b5d123285dea470b15ff51c7fbe61dc37fd Mon Sep 17 00:00:00 2001 From: "Matthieu Baerts (NGI0)" Date: Wed, 29 Jan 2025 13:24:33 +0100 Subject: [PATCH 07/16] doc: mptcp: sysctl: blackhole_timeout is per-netns All other sysctl entries mention it, and it is a per-namespace sysctl. So mention it as well. Fixes: 27069e7cb3d1 ("mptcp: disable active MPTCP in case of blackhole") Reviewed-by: Mat Martineau Signed-off-by: Matthieu Baerts (NGI0) Signed-off-by: Paolo Abeni --- Documentation/networking/mptcp-sysctl.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/networking/mptcp-sysctl.rst b/Documentation/networking/mptcp-sysctl.rst index dc45c0211353..03e1d3610333 100644 --- a/Documentation/networking/mptcp-sysctl.rst +++ b/Documentation/networking/mptcp-sysctl.rst @@ -41,7 +41,7 @@ blackhole_timeout - INTEGER (seconds) MPTCP is re-enabled and will reset to the initial value when the blackhole issue goes away. - 0 to disable the blackhole detection. + 0 to disable the blackhole detection. This is a per-namespace sysctl. Default: 3600 -- 2.51.0 From 0f5697f1a3f99bc2b674b8aa3c5da822c5673c11 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 29 Jan 2025 13:00:07 +0000 Subject: [PATCH 08/16] net: hsr: fix fill_frame_info() regression vs VLAN packets Stephan Wurm reported that my recent patch broke VLAN support. Apparently skb->mac_len is not correct for VLAN traffic as shown by debug traces [1]. Use instead pskb_may_pull() to make sure the expected header is present in skb->head. Many thanks to Stephan for his help. [1] kernel: skb len=170 headroom=2 headlen=170 tailroom=20 mac=(2,14) mac_len=14 net=(16,-1) trans=-1 shinfo(txflags=0 nr_frags=0 gso(size=0 type=0 segs=0)) csum(0x0 start=0 offset=0 ip_summed=0 complete_sw=0 valid=0 level=0) hash(0x0 sw=0 l4=0) proto=0x0000 pkttype=0 iif=0 priority=0x0 mark=0x0 alloc_cpu=0 vlan_all=0x0 encapsulation=0 inner(proto=0x0000, mac=0, net=0, trans=0) kernel: dev name=prp0 feat=0x0000000000007000 kernel: sk family=17 type=3 proto=0 kernel: skb headroom: 00000000: 74 00 kernel: skb linear: 00000000: 01 0c cd 01 00 01 00 d0 93 53 9c cb 81 00 80 00 kernel: skb linear: 00000010: 88 b8 00 01 00 98 00 00 00 00 61 81 8d 80 16 52 kernel: skb linear: 00000020: 45 47 44 4e 43 54 52 4c 2f 4c 4c 4e 30 24 47 4f kernel: skb linear: 00000030: 24 47 6f 43 62 81 01 14 82 16 52 45 47 44 4e 43 kernel: skb linear: 00000040: 54 52 4c 2f 4c 4c 4e 30 24 44 73 47 6f 6f 73 65 kernel: skb linear: 00000050: 83 07 47 6f 49 64 65 6e 74 84 08 67 8d f5 93 7e kernel: skb linear: 00000060: 76 c8 00 85 01 01 86 01 00 87 01 00 88 01 01 89 kernel: skb linear: 00000070: 01 00 8a 01 02 ab 33 a2 15 83 01 00 84 03 03 00 kernel: skb linear: 00000080: 00 91 08 67 8d f5 92 77 4b c6 1f 83 01 00 a2 1a kernel: skb linear: 00000090: a2 06 85 01 00 83 01 00 84 03 03 00 00 91 08 67 kernel: skb linear: 000000a0: 8d f5 92 77 4b c6 1f 83 01 00 kernel: skb tailroom: 00000000: 80 18 02 00 fe 4e 00 00 01 01 08 0a 4f fd 5e d1 kernel: skb tailroom: 00000010: 4f fd 5e cd Fixes: b9653d19e556 ("net: hsr: avoid potential out-of-bound access in fill_frame_info()") Reported-by: Stephan Wurm Tested-by: Stephan Wurm Closes: https://lore.kernel.org/netdev/Z4o_UC0HweBHJ_cw@PC-LX-SteWu/ Signed-off-by: Eric Dumazet Reviewed-by: Simon Horman Link: https://patch.msgid.link/20250129130007.644084-1-edumazet@google.com Signed-off-by: Jakub Kicinski --- net/hsr/hsr_forward.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/net/hsr/hsr_forward.c b/net/hsr/hsr_forward.c index 87bb3a91598e..a4bacf198555 100644 --- a/net/hsr/hsr_forward.c +++ b/net/hsr/hsr_forward.c @@ -700,9 +700,12 @@ static int fill_frame_info(struct hsr_frame_info *frame, frame->is_vlan = true; if (frame->is_vlan) { - if (skb->mac_len < offsetofend(struct hsr_vlan_ethhdr, vlanhdr)) + /* Note: skb->mac_len might be wrong here. */ + if (!pskb_may_pull(skb, + skb_mac_offset(skb) + + offsetofend(struct hsr_vlan_ethhdr, vlanhdr))) return -EINVAL; - vlan_hdr = (struct hsr_vlan_ethhdr *)ethhdr; + vlan_hdr = (struct hsr_vlan_ethhdr *)skb_mac_header(skb); proto = vlan_hdr->vlanhdr.h_vlan_encapsulated_proto; } -- 2.51.0 From e759e1e4a4bd2926d082afe56046a90224433a31 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 29 Jan 2025 14:27:26 +0000 Subject: [PATCH 09/16] net: revert RTNL changes in unregister_netdevice_many_notify() This patch reverts following changes: 83419b61d187 net: reduce RTNL hold duration in unregister_netdevice_many_notify() (part 2) ae646f1a0bb9 net: reduce RTNL hold duration in unregister_netdevice_many_notify() (part 1) cfa579f66656 net: no longer hold RTNL while calling flush_all_backlogs() This caused issues in layers holding a private mutex: cleanup_net() rtnl_lock(); mutex_lock(subsystem_mutex); unregister_netdevice(); rtnl_unlock(); // LOCKDEP violation rtnl_lock(); I will revisit this in next cycle, opt-in for the new behavior from safe contexts only. Fixes: cfa579f66656 ("net: no longer hold RTNL while calling flush_all_backlogs()") Fixes: ae646f1a0bb9 ("net: reduce RTNL hold duration in unregister_netdevice_many_notify() (part 1)") Fixes: 83419b61d187 ("net: reduce RTNL hold duration in unregister_netdevice_many_notify() (part 2)") Reported-by: syzbot+5b9196ecf74447172a9a@syzkaller.appspotmail.com Closes: https://lore.kernel.org/netdev/6789d55f.050a0220.20d369.004e.GAE@google.com/ Signed-off-by: Eric Dumazet Reviewed-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20250129142726.747726-1-edumazet@google.com Signed-off-by: Jakub Kicinski --- net/core/dev.c | 33 +++------------------------------ 1 file changed, 3 insertions(+), 30 deletions(-) diff --git a/net/core/dev.c b/net/core/dev.c index 02cd75b512f9..c0021cbd28fc 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -10264,37 +10264,14 @@ static bool from_cleanup_net(void) #endif } -static void rtnl_drop_if_cleanup_net(void) -{ - if (from_cleanup_net()) - __rtnl_unlock(); -} - -static void rtnl_acquire_if_cleanup_net(void) -{ - if (from_cleanup_net()) - rtnl_lock(); -} - /* Delayed registration/unregisteration */ LIST_HEAD(net_todo_list); -static LIST_HEAD(net_todo_list_for_cleanup_net); - -/* TODO: net_todo_list/net_todo_list_for_cleanup_net should probably - * be provided by callers, instead of being static, rtnl protected. - */ -static struct list_head *todo_list(void) -{ - return from_cleanup_net() ? &net_todo_list_for_cleanup_net : - &net_todo_list; -} - DECLARE_WAIT_QUEUE_HEAD(netdev_unregistering_wq); atomic_t dev_unreg_count = ATOMIC_INIT(0); static void net_set_todo(struct net_device *dev) { - list_add_tail(&dev->todo_list, todo_list()); + list_add_tail(&dev->todo_list, &net_todo_list); } static netdev_features_t netdev_sync_upper_features(struct net_device *lower, @@ -11144,7 +11121,7 @@ void netdev_run_todo(void) #endif /* Snapshot list, allow later requests */ - list_replace_init(todo_list(), &list); + list_replace_init(&net_todo_list, &list); __rtnl_unlock(); @@ -11789,11 +11766,9 @@ void unregister_netdevice_many_notify(struct list_head *head, WRITE_ONCE(dev->reg_state, NETREG_UNREGISTERING); netdev_unlock(dev); } - - rtnl_drop_if_cleanup_net(); flush_all_backlogs(); + synchronize_net(); - rtnl_acquire_if_cleanup_net(); list_for_each_entry(dev, head, unreg_list) { struct sk_buff *skb = NULL; @@ -11853,9 +11828,7 @@ void unregister_netdevice_many_notify(struct list_head *head, #endif } - rtnl_drop_if_cleanup_net(); synchronize_net(); - rtnl_acquire_if_cleanup_net(); list_for_each_entry(dev, head, unreg_list) { netdev_put(dev, &dev->dev_registered_tracker); -- 2.51.0 From d7dda216ca49f1ac214cb577cbeeee760a2b425b Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 29 Jan 2025 11:13:32 -0800 Subject: [PATCH 10/16] MAINTAINERS: add Neal to TCP maintainers Neal Cardwell has been indispensable in TCP reviews and investigations, especially protocol-related. Neal is also the author of packetdrill. Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20250129191332.2526140-1-kuba@kernel.org Signed-off-by: Jakub Kicinski --- MAINTAINERS | 1 + 1 file changed, 1 insertion(+) diff --git a/MAINTAINERS b/MAINTAINERS index d996f393fcf4..720c692b9dfb 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -16525,6 +16525,7 @@ F: tools/testing/selftests/net/mptcp/ NETWORKING [TCP] M: Eric Dumazet +M: Neal Cardwell L: netdev@vger.kernel.org S: Maintained F: Documentation/networking/net_cachelines/tcp_sock.rst -- 2.51.0 From c3a392bdd31adc474f1009ee85c13fdd01fe800d Mon Sep 17 00:00:00 2001 From: Michal Swiatkowski Date: Tue, 3 Dec 2024 07:58:09 +0100 Subject: [PATCH 11/16] ice: count combined queues using Rx/Tx count Previous implementation assumes that there is 1:1 matching between vectors and queues. It isn't always true. Get minimum value from Rx/Tx queues to determine combined queues number. Reviewed-by: Jacob Keller Tested-by: Pucha Himasekhar Reddy Signed-off-by: Michal Swiatkowski Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/ice/ice_ethtool.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/net/ethernet/intel/ice/ice_ethtool.c b/drivers/net/ethernet/intel/ice/ice_ethtool.c index f241493a6ac8..6bbb304ad9ab 100644 --- a/drivers/net/ethernet/intel/ice/ice_ethtool.c +++ b/drivers/net/ethernet/intel/ice/ice_ethtool.c @@ -3817,8 +3817,7 @@ static u32 ice_get_combined_cnt(struct ice_vsi *vsi) ice_for_each_q_vector(vsi, q_idx) { struct ice_q_vector *q_vector = vsi->q_vectors[q_idx]; - if (q_vector->rx.rx_ring && q_vector->tx.tx_ring) - combined++; + combined += min(q_vector->num_ring_tx, q_vector->num_ring_rx); } return combined; -- 2.51.0 From b2657259fce933ae0ba3e07cd0052fb6a8e90689 Mon Sep 17 00:00:00 2001 From: Michal Swiatkowski Date: Tue, 3 Dec 2024 07:58:10 +0100 Subject: [PATCH 12/16] ice: devlink PF MSI-X max and min parameter Use generic devlink PF MSI-X parameter to allow user to change MSI-X range. Add notes about this parameters into ice devlink documentation. Tested-by: Pucha Himasekhar Reddy Signed-off-by: Michal Swiatkowski Signed-off-by: Tony Nguyen --- Documentation/networking/devlink/ice.rst | 11 +++ .../net/ethernet/intel/ice/devlink/devlink.c | 81 +++++++++++++++++++ drivers/net/ethernet/intel/ice/ice.h | 7 ++ drivers/net/ethernet/intel/ice/ice_irq.c | 7 ++ 4 files changed, 106 insertions(+) diff --git a/Documentation/networking/devlink/ice.rst b/Documentation/networking/devlink/ice.rst index e3972d03cea0..792e9f8c846a 100644 --- a/Documentation/networking/devlink/ice.rst +++ b/Documentation/networking/devlink/ice.rst @@ -69,6 +69,17 @@ Parameters To verify that value has been set: $ devlink dev param show pci/0000:16:00.0 name tx_scheduling_layers + * - ``msix_vec_per_pf_max`` + - driverinit + - Set the max MSI-X that can be used by the PF, rest can be utilized for + SRIOV. The range is from min value set in msix_vec_per_pf_min to + 2k/number of ports. + * - ``msix_vec_per_pf_min`` + - driverinit + - Set the min MSI-X that will be used by the PF. This value inform how many + MSI-X will be allocated statically. The range is from 2 to value set + in msix_vec_per_pf_max. + .. list-table:: Driver specific parameters implemented :widths: 5 5 90 diff --git a/drivers/net/ethernet/intel/ice/devlink/devlink.c b/drivers/net/ethernet/intel/ice/devlink/devlink.c index d116e2b10bce..81de5fbc130e 100644 --- a/drivers/net/ethernet/intel/ice/devlink/devlink.c +++ b/drivers/net/ethernet/intel/ice/devlink/devlink.c @@ -1202,6 +1202,25 @@ static int ice_devlink_set_parent(struct devlink_rate *devlink_rate, return status; } +static void ice_set_min_max_msix(struct ice_pf *pf) +{ + struct devlink *devlink = priv_to_devlink(pf); + union devlink_param_value val; + int err; + + err = devl_param_driverinit_value_get(devlink, + DEVLINK_PARAM_GENERIC_ID_MSIX_VEC_PER_PF_MIN, + &val); + if (!err) + pf->msix.min = val.vu32; + + err = devl_param_driverinit_value_get(devlink, + DEVLINK_PARAM_GENERIC_ID_MSIX_VEC_PER_PF_MAX, + &val); + if (!err) + pf->msix.max = val.vu32; +} + /** * ice_devlink_reinit_up - do reinit of the given PF * @pf: pointer to the PF struct @@ -1217,6 +1236,9 @@ static int ice_devlink_reinit_up(struct ice_pf *pf) return err; } + /* load MSI-X values */ + ice_set_min_max_msix(pf); + err = ice_init_dev(pf); if (err) goto unroll_hw_init; @@ -1530,6 +1552,30 @@ static int ice_devlink_local_fwd_validate(struct devlink *devlink, u32 id, return 0; } +static int +ice_devlink_msix_max_pf_validate(struct devlink *devlink, u32 id, + union devlink_param_value val, + struct netlink_ext_ack *extack) +{ + struct ice_pf *pf = devlink_priv(devlink); + + if (val.vu32 > pf->hw.func_caps.common_cap.num_msix_vectors) + return -EINVAL; + + return 0; +} + +static int +ice_devlink_msix_min_pf_validate(struct devlink *devlink, u32 id, + union devlink_param_value val, + struct netlink_ext_ack *extack) +{ + if (val.vu32 < ICE_MIN_MSIX) + return -EINVAL; + + return 0; +} + enum ice_param_id { ICE_DEVLINK_PARAM_ID_BASE = DEVLINK_PARAM_GENERIC_ID_MAX, ICE_DEVLINK_PARAM_ID_TX_SCHED_LAYERS, @@ -1547,6 +1593,15 @@ static const struct devlink_param ice_dvl_rdma_params[] = { ice_devlink_enable_iw_validate), }; +static const struct devlink_param ice_dvl_msix_params[] = { + DEVLINK_PARAM_GENERIC(MSIX_VEC_PER_PF_MAX, + BIT(DEVLINK_PARAM_CMODE_DRIVERINIT), + NULL, NULL, ice_devlink_msix_max_pf_validate), + DEVLINK_PARAM_GENERIC(MSIX_VEC_PER_PF_MIN, + BIT(DEVLINK_PARAM_CMODE_DRIVERINIT), + NULL, NULL, ice_devlink_msix_min_pf_validate), +}; + static const struct devlink_param ice_dvl_sched_params[] = { DEVLINK_PARAM_DRIVER(ICE_DEVLINK_PARAM_ID_TX_SCHED_LAYERS, "tx_scheduling_layers", @@ -1648,6 +1703,7 @@ void ice_devlink_unregister(struct ice_pf *pf) int ice_devlink_register_params(struct ice_pf *pf) { struct devlink *devlink = priv_to_devlink(pf); + union devlink_param_value value; struct ice_hw *hw = &pf->hw; int status; @@ -1656,10 +1712,33 @@ int ice_devlink_register_params(struct ice_pf *pf) if (status) return status; + status = devl_params_register(devlink, ice_dvl_msix_params, + ARRAY_SIZE(ice_dvl_msix_params)); + if (status) + goto unregister_rdma_params; + if (hw->func_caps.common_cap.tx_sched_topo_comp_mode_en) status = devl_params_register(devlink, ice_dvl_sched_params, ARRAY_SIZE(ice_dvl_sched_params)); + if (status) + goto unregister_msix_params; + + value.vu32 = pf->msix.max; + devl_param_driverinit_value_set(devlink, + DEVLINK_PARAM_GENERIC_ID_MSIX_VEC_PER_PF_MAX, + value); + value.vu32 = pf->msix.min; + devl_param_driverinit_value_set(devlink, + DEVLINK_PARAM_GENERIC_ID_MSIX_VEC_PER_PF_MIN, + value); + return 0; +unregister_msix_params: + devl_params_unregister(devlink, ice_dvl_msix_params, + ARRAY_SIZE(ice_dvl_msix_params)); +unregister_rdma_params: + devl_params_unregister(devlink, ice_dvl_rdma_params, + ARRAY_SIZE(ice_dvl_rdma_params)); return status; } @@ -1670,6 +1749,8 @@ void ice_devlink_unregister_params(struct ice_pf *pf) devl_params_unregister(devlink, ice_dvl_rdma_params, ARRAY_SIZE(ice_dvl_rdma_params)); + devl_params_unregister(devlink, ice_dvl_msix_params, + ARRAY_SIZE(ice_dvl_msix_params)); if (hw->func_caps.common_cap.tx_sched_topo_comp_mode_en) devl_params_unregister(devlink, ice_dvl_sched_params, diff --git a/drivers/net/ethernet/intel/ice/ice.h b/drivers/net/ethernet/intel/ice/ice.h index 71e05d30f0fd..d041b04ff324 100644 --- a/drivers/net/ethernet/intel/ice/ice.h +++ b/drivers/net/ethernet/intel/ice/ice.h @@ -542,6 +542,12 @@ struct ice_agg_node { u8 valid; }; +struct ice_pf_msix { + u32 cur; + u32 min; + u32 max; +}; + struct ice_pf { struct pci_dev *pdev; struct ice_adapter *adapter; @@ -612,6 +618,7 @@ struct ice_pf { struct msi_map ll_ts_irq; /* LL_TS interrupt MSIX vector */ u16 max_pf_txqs; /* Total Tx queues PF wide */ u16 max_pf_rxqs; /* Total Rx queues PF wide */ + struct ice_pf_msix msix; u16 num_lan_msix; /* Total MSIX vectors for base driver */ u16 num_lan_tx; /* num LAN Tx queues setup */ u16 num_lan_rx; /* num LAN Rx queues setup */ diff --git a/drivers/net/ethernet/intel/ice/ice_irq.c b/drivers/net/ethernet/intel/ice/ice_irq.c index ad82ff7d1995..0659b96b9b8c 100644 --- a/drivers/net/ethernet/intel/ice/ice_irq.c +++ b/drivers/net/ethernet/intel/ice/ice_irq.c @@ -254,6 +254,13 @@ int ice_init_interrupt_scheme(struct ice_pf *pf) int total_vectors = pf->hw.func_caps.common_cap.num_msix_vectors; int vectors, max_vectors; + /* load default PF MSI-X range */ + if (!pf->msix.min) + pf->msix.min = ICE_MIN_MSIX; + + if (!pf->msix.max) + pf->msix.max = total_vectors / 2; + vectors = ice_ena_msix_range(pf); if (vectors < 0) -- 2.51.0 From 79d97b8cf9a8500e2a18bb37f96268b17b7e7dd5 Mon Sep 17 00:00:00 2001 From: Michal Swiatkowski Date: Tue, 3 Dec 2024 07:58:11 +0100 Subject: [PATCH 13/16] ice: remove splitting MSI-X between features With dynamic approach to alloc MSI-X there is no sense to statically split MSI-X between PF features. Splitting was also calculating needed MSI-X. Move this part to separate function and use as max value. Remove ICE_ESWITCH_MSIX, as there is no need for additional MSI-X for switchdev. Reviewed-by: Jacob Keller Reviewed-by: Wojciech Drewek Tested-by: Pucha Himasekhar Reddy Signed-off-by: Michal Swiatkowski Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/ice/ice.h | 2 - drivers/net/ethernet/intel/ice/ice_irq.c | 172 +++-------------------- 2 files changed, 16 insertions(+), 158 deletions(-) diff --git a/drivers/net/ethernet/intel/ice/ice.h b/drivers/net/ethernet/intel/ice/ice.h index d041b04ff324..c78bd45cf016 100644 --- a/drivers/net/ethernet/intel/ice/ice.h +++ b/drivers/net/ethernet/intel/ice/ice.h @@ -98,8 +98,6 @@ #define ICE_MIN_MSIX (ICE_MIN_LAN_TXRX_MSIX + ICE_MIN_LAN_OICR_MSIX) #define ICE_FDIR_MSIX 2 #define ICE_RDMA_NUM_AEQ_MSIX 4 -#define ICE_MIN_RDMA_MSIX 2 -#define ICE_ESWITCH_MSIX 1 #define ICE_NO_VSI 0xffff #define ICE_VSI_MAP_CONTIG 0 #define ICE_VSI_MAP_SCATTER 1 diff --git a/drivers/net/ethernet/intel/ice/ice_irq.c b/drivers/net/ethernet/intel/ice/ice_irq.c index 0659b96b9b8c..4a50a6dc817e 100644 --- a/drivers/net/ethernet/intel/ice/ice_irq.c +++ b/drivers/net/ethernet/intel/ice/ice_irq.c @@ -84,155 +84,11 @@ static struct ice_irq_entry *ice_get_irq_res(struct ice_pf *pf, bool dyn_only) return entry; } -/** - * ice_reduce_msix_usage - Reduce usage of MSI-X vectors - * @pf: board private structure - * @v_remain: number of remaining MSI-X vectors to be distributed - * - * Reduce the usage of MSI-X vectors when entire request cannot be fulfilled. - * pf->num_lan_msix and pf->num_rdma_msix values are set based on number of - * remaining vectors. - */ -static void ice_reduce_msix_usage(struct ice_pf *pf, int v_remain) +static int ice_get_default_msix_amount(struct ice_pf *pf) { - int v_rdma; - - if (!ice_is_rdma_ena(pf)) { - pf->num_lan_msix = v_remain; - return; - } - - /* RDMA needs at least 1 interrupt in addition to AEQ MSIX */ - v_rdma = ICE_RDMA_NUM_AEQ_MSIX + 1; - - if (v_remain < ICE_MIN_LAN_TXRX_MSIX + ICE_MIN_RDMA_MSIX) { - dev_warn(ice_pf_to_dev(pf), "Not enough MSI-X vectors to support RDMA.\n"); - clear_bit(ICE_FLAG_RDMA_ENA, pf->flags); - - pf->num_rdma_msix = 0; - pf->num_lan_msix = ICE_MIN_LAN_TXRX_MSIX; - } else if ((v_remain < ICE_MIN_LAN_TXRX_MSIX + v_rdma) || - (v_remain - v_rdma < v_rdma)) { - /* Support minimum RDMA and give remaining vectors to LAN MSIX - */ - pf->num_rdma_msix = ICE_MIN_RDMA_MSIX; - pf->num_lan_msix = v_remain - ICE_MIN_RDMA_MSIX; - } else { - /* Split remaining MSIX with RDMA after accounting for AEQ MSIX - */ - pf->num_rdma_msix = (v_remain - ICE_RDMA_NUM_AEQ_MSIX) / 2 + - ICE_RDMA_NUM_AEQ_MSIX; - pf->num_lan_msix = v_remain - pf->num_rdma_msix; - } -} - -/** - * ice_ena_msix_range - Request a range of MSIX vectors from the OS - * @pf: board private structure - * - * Compute the number of MSIX vectors wanted and request from the OS. Adjust - * device usage if there are not enough vectors. Return the number of vectors - * reserved or negative on failure. - */ -static int ice_ena_msix_range(struct ice_pf *pf) -{ - int num_cpus, hw_num_msix, v_other, v_wanted, v_actual; - struct device *dev = ice_pf_to_dev(pf); - int err; - - hw_num_msix = pf->hw.func_caps.common_cap.num_msix_vectors; - num_cpus = num_online_cpus(); - - /* LAN miscellaneous handler */ - v_other = ICE_MIN_LAN_OICR_MSIX; - - /* Flow Director */ - if (test_bit(ICE_FLAG_FD_ENA, pf->flags)) - v_other += ICE_FDIR_MSIX; - - /* switchdev */ - v_other += ICE_ESWITCH_MSIX; - - v_wanted = v_other; - - /* LAN traffic */ - pf->num_lan_msix = num_cpus; - v_wanted += pf->num_lan_msix; - - /* RDMA auxiliary driver */ - if (ice_is_rdma_ena(pf)) { - pf->num_rdma_msix = num_cpus + ICE_RDMA_NUM_AEQ_MSIX; - v_wanted += pf->num_rdma_msix; - } - - if (v_wanted > hw_num_msix) { - int v_remain; - - dev_warn(dev, "not enough device MSI-X vectors. wanted = %d, available = %d\n", - v_wanted, hw_num_msix); - - if (hw_num_msix < ICE_MIN_MSIX) { - err = -ERANGE; - goto exit_err; - } - - v_remain = hw_num_msix - v_other; - if (v_remain < ICE_MIN_LAN_TXRX_MSIX) { - v_other = ICE_MIN_MSIX - ICE_MIN_LAN_TXRX_MSIX; - v_remain = ICE_MIN_LAN_TXRX_MSIX; - } - - ice_reduce_msix_usage(pf, v_remain); - v_wanted = pf->num_lan_msix + pf->num_rdma_msix + v_other; - - dev_notice(dev, "Reducing request to %d MSI-X vectors for LAN traffic.\n", - pf->num_lan_msix); - if (ice_is_rdma_ena(pf)) - dev_notice(dev, "Reducing request to %d MSI-X vectors for RDMA.\n", - pf->num_rdma_msix); - } - - /* actually reserve the vectors */ - v_actual = pci_alloc_irq_vectors(pf->pdev, ICE_MIN_MSIX, v_wanted, - PCI_IRQ_MSIX); - if (v_actual < 0) { - dev_err(dev, "unable to reserve MSI-X vectors\n"); - err = v_actual; - goto exit_err; - } - - if (v_actual < v_wanted) { - dev_warn(dev, "not enough OS MSI-X vectors. requested = %d, obtained = %d\n", - v_wanted, v_actual); - - if (v_actual < ICE_MIN_MSIX) { - /* error if we can't get minimum vectors */ - pci_free_irq_vectors(pf->pdev); - err = -ERANGE; - goto exit_err; - } else { - int v_remain = v_actual - v_other; - - if (v_remain < ICE_MIN_LAN_TXRX_MSIX) - v_remain = ICE_MIN_LAN_TXRX_MSIX; - - ice_reduce_msix_usage(pf, v_remain); - - dev_notice(dev, "Enabled %d MSI-X vectors for LAN traffic.\n", - pf->num_lan_msix); - - if (ice_is_rdma_ena(pf)) - dev_notice(dev, "Enabled %d MSI-X vectors for RDMA.\n", - pf->num_rdma_msix); - } - } - - return v_actual; - -exit_err: - pf->num_rdma_msix = 0; - pf->num_lan_msix = 0; - return err; + return ICE_MIN_LAN_OICR_MSIX + num_online_cpus() + + (test_bit(ICE_FLAG_FD_ENA, pf->flags) ? ICE_FDIR_MSIX : 0) + + (ice_is_rdma_ena(pf) ? num_online_cpus() + ICE_RDMA_NUM_AEQ_MSIX : 0); } /** @@ -259,17 +115,21 @@ int ice_init_interrupt_scheme(struct ice_pf *pf) pf->msix.min = ICE_MIN_MSIX; if (!pf->msix.max) - pf->msix.max = total_vectors / 2; - - vectors = ice_ena_msix_range(pf); + pf->msix.max = min(total_vectors, + ice_get_default_msix_amount(pf)); - if (vectors < 0) - return -ENOMEM; - - if (pci_msix_can_alloc_dyn(pf->pdev)) + if (pci_msix_can_alloc_dyn(pf->pdev)) { + vectors = pf->msix.min; max_vectors = total_vectors; - else + } else { + vectors = pf->msix.max; max_vectors = vectors; + } + + vectors = pci_alloc_irq_vectors(pf->pdev, pf->msix.min, vectors, + PCI_IRQ_MSIX); + if (vectors < pf->msix.min) + return -ENOMEM; ice_init_irq_tracker(pf, max_vectors, vectors); -- 2.51.0 From ad61cd9c67ad592668fc0e7253c507b50f72acab Mon Sep 17 00:00:00 2001 From: Michal Swiatkowski Date: Tue, 3 Dec 2024 07:58:12 +0100 Subject: [PATCH 14/16] ice: get rid of num_lan_msix field Remove the field to allow having more queues than MSI-X on VSI. As default the number will be the same, but if there won't be more MSI-X available VSI can run with at least one MSI-X. Reviewed-by: Jacob Keller Reviewed-by: Wojciech Drewek Tested-by: Pucha Himasekhar Reddy Signed-off-by: Michal Swiatkowski Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/ice/ice.h | 1 - drivers/net/ethernet/intel/ice/ice_base.c | 10 ++++---- drivers/net/ethernet/intel/ice/ice_ethtool.c | 6 ++--- drivers/net/ethernet/intel/ice/ice_irq.c | 11 ++++----- drivers/net/ethernet/intel/ice/ice_lib.c | 25 +++++++++++--------- 5 files changed, 24 insertions(+), 29 deletions(-) diff --git a/drivers/net/ethernet/intel/ice/ice.h b/drivers/net/ethernet/intel/ice/ice.h index c78bd45cf016..ad61dd688871 100644 --- a/drivers/net/ethernet/intel/ice/ice.h +++ b/drivers/net/ethernet/intel/ice/ice.h @@ -617,7 +617,6 @@ struct ice_pf { u16 max_pf_txqs; /* Total Tx queues PF wide */ u16 max_pf_rxqs; /* Total Rx queues PF wide */ struct ice_pf_msix msix; - u16 num_lan_msix; /* Total MSIX vectors for base driver */ u16 num_lan_tx; /* num LAN Tx queues setup */ u16 num_lan_rx; /* num LAN Rx queues setup */ u16 next_vsi; /* Next free slot in pf->vsi[] - 0-based! */ diff --git a/drivers/net/ethernet/intel/ice/ice_base.c b/drivers/net/ethernet/intel/ice/ice_base.c index b2af8e3586f7..0e862f20427a 100644 --- a/drivers/net/ethernet/intel/ice/ice_base.c +++ b/drivers/net/ethernet/intel/ice/ice_base.c @@ -801,13 +801,11 @@ int ice_vsi_alloc_q_vectors(struct ice_vsi *vsi) return 0; err_out: - while (v_idx--) - ice_free_q_vector(vsi, v_idx); - dev_err(dev, "Failed to allocate %d q_vector for VSI %d, ret=%d\n", - vsi->num_q_vectors, vsi->vsi_num, err); - vsi->num_q_vectors = 0; - return err; + dev_info(dev, "Failed to allocate %d q_vectors for VSI %d, new value %d", + vsi->num_q_vectors, vsi->vsi_num, v_idx); + vsi->num_q_vectors = v_idx; + return v_idx ? 0 : err; } /** diff --git a/drivers/net/ethernet/intel/ice/ice_ethtool.c b/drivers/net/ethernet/intel/ice/ice_ethtool.c index 6bbb304ad9ab..b0805704834d 100644 --- a/drivers/net/ethernet/intel/ice/ice_ethtool.c +++ b/drivers/net/ethernet/intel/ice/ice_ethtool.c @@ -3788,8 +3788,7 @@ ice_get_ts_info(struct net_device *dev, struct kernel_ethtool_ts_info *info) */ static int ice_get_max_txq(struct ice_pf *pf) { - return min3(pf->num_lan_msix, (u16)num_online_cpus(), - (u16)pf->hw.func_caps.common_cap.num_txq); + return min(num_online_cpus(), pf->hw.func_caps.common_cap.num_txq); } /** @@ -3798,8 +3797,7 @@ static int ice_get_max_txq(struct ice_pf *pf) */ static int ice_get_max_rxq(struct ice_pf *pf) { - return min3(pf->num_lan_msix, (u16)num_online_cpus(), - (u16)pf->hw.func_caps.common_cap.num_rxq); + return min(num_online_cpus(), pf->hw.func_caps.common_cap.num_rxq); } /** diff --git a/drivers/net/ethernet/intel/ice/ice_irq.c b/drivers/net/ethernet/intel/ice/ice_irq.c index 4a50a6dc817e..1a7d446ab5f1 100644 --- a/drivers/net/ethernet/intel/ice/ice_irq.c +++ b/drivers/net/ethernet/intel/ice/ice_irq.c @@ -108,7 +108,7 @@ void ice_clear_interrupt_scheme(struct ice_pf *pf) int ice_init_interrupt_scheme(struct ice_pf *pf) { int total_vectors = pf->hw.func_caps.common_cap.num_msix_vectors; - int vectors, max_vectors; + int vectors; /* load default PF MSI-X range */ if (!pf->msix.min) @@ -118,20 +118,17 @@ int ice_init_interrupt_scheme(struct ice_pf *pf) pf->msix.max = min(total_vectors, ice_get_default_msix_amount(pf)); - if (pci_msix_can_alloc_dyn(pf->pdev)) { + if (pci_msix_can_alloc_dyn(pf->pdev)) vectors = pf->msix.min; - max_vectors = total_vectors; - } else { + else vectors = pf->msix.max; - max_vectors = vectors; - } vectors = pci_alloc_irq_vectors(pf->pdev, pf->msix.min, vectors, PCI_IRQ_MSIX); if (vectors < pf->msix.min) return -ENOMEM; - ice_init_irq_tracker(pf, max_vectors, vectors); + ice_init_irq_tracker(pf, pf->msix.max, vectors); return 0; } diff --git a/drivers/net/ethernet/intel/ice/ice_lib.c b/drivers/net/ethernet/intel/ice/ice_lib.c index 38a1c8372180..4b8d7aa7b1bb 100644 --- a/drivers/net/ethernet/intel/ice/ice_lib.c +++ b/drivers/net/ethernet/intel/ice/ice_lib.c @@ -157,6 +157,16 @@ static void ice_vsi_set_num_desc(struct ice_vsi *vsi) } } +static u16 ice_get_rxq_count(struct ice_pf *pf) +{ + return min(ice_get_avail_rxq_count(pf), num_online_cpus()); +} + +static u16 ice_get_txq_count(struct ice_pf *pf) +{ + return min(ice_get_avail_txq_count(pf), num_online_cpus()); +} + /** * ice_vsi_set_num_qs - Set number of queues, descriptors and vectors for a VSI * @vsi: the VSI being configured @@ -178,9 +188,7 @@ static void ice_vsi_set_num_qs(struct ice_vsi *vsi) vsi->alloc_txq = vsi->req_txq; vsi->num_txq = vsi->req_txq; } else { - vsi->alloc_txq = min3(pf->num_lan_msix, - ice_get_avail_txq_count(pf), - (u16)num_online_cpus()); + vsi->alloc_txq = ice_get_txq_count(pf); } pf->num_lan_tx = vsi->alloc_txq; @@ -193,17 +201,13 @@ static void ice_vsi_set_num_qs(struct ice_vsi *vsi) vsi->alloc_rxq = vsi->req_rxq; vsi->num_rxq = vsi->req_rxq; } else { - vsi->alloc_rxq = min3(pf->num_lan_msix, - ice_get_avail_rxq_count(pf), - (u16)num_online_cpus()); + vsi->alloc_rxq = ice_get_rxq_count(pf); } } pf->num_lan_rx = vsi->alloc_rxq; - vsi->num_q_vectors = min_t(int, pf->num_lan_msix, - max_t(int, vsi->alloc_rxq, - vsi->alloc_txq)); + vsi->num_q_vectors = max(vsi->alloc_rxq, vsi->alloc_txq); break; case ICE_VSI_SF: vsi->alloc_txq = 1; @@ -1173,12 +1177,11 @@ static void ice_set_rss_vsi_ctx(struct ice_vsi_ctx *ctxt, struct ice_vsi *vsi) static void ice_chnl_vsi_setup_q_map(struct ice_vsi *vsi, struct ice_vsi_ctx *ctxt) { - struct ice_pf *pf = vsi->back; u16 qcount, qmap; u8 offset = 0; int pow; - qcount = min_t(int, vsi->num_rxq, pf->num_lan_msix); + qcount = vsi->num_rxq; pow = order_base_2(qcount); qmap = FIELD_PREP(ICE_AQ_VSI_TC_Q_OFFSET_M, offset); -- 2.51.0 From 3e0d3cb3fbe06a7bc09d98324a21a446c80f9d3b Mon Sep 17 00:00:00 2001 From: Michal Swiatkowski Date: Tue, 3 Dec 2024 07:58:13 +0100 Subject: [PATCH 15/16] ice, irdma: move interrupts code to irdma Move responsibility of MSI-X requesting for RDMA feature from ice driver to irdma driver. It is done to allow simple fallback when there is not enough MSI-X available. Change amount of MSI-X used for control from 4 to 1, as it isn't needed to have more than one MSI-X for this purpose. Reviewed-by: Jacob Keller Signed-off-by: Michal Swiatkowski Signed-off-by: Tony Nguyen --- drivers/infiniband/hw/irdma/hw.c | 2 - drivers/infiniband/hw/irdma/main.c | 46 ++++++++++++++++- drivers/infiniband/hw/irdma/main.h | 3 ++ drivers/net/ethernet/intel/ice/ice.h | 1 - drivers/net/ethernet/intel/ice/ice_idc.c | 64 ++++++------------------ drivers/net/ethernet/intel/ice/ice_irq.c | 3 +- include/linux/net/intel/iidc.h | 2 + 7 files changed, 65 insertions(+), 56 deletions(-) diff --git a/drivers/infiniband/hw/irdma/hw.c b/drivers/infiniband/hw/irdma/hw.c index ad50b77282f8..69ce1862eabe 100644 --- a/drivers/infiniband/hw/irdma/hw.c +++ b/drivers/infiniband/hw/irdma/hw.c @@ -498,8 +498,6 @@ static int irdma_save_msix_info(struct irdma_pci_f *rf) iw_qvlist->num_vectors = rf->msix_count; if (rf->msix_count <= num_online_cpus()) rf->msix_shared = true; - else if (rf->msix_count > num_online_cpus() + 1) - rf->msix_count = num_online_cpus() + 1; pmsix = rf->msix_entries; for (i = 0, ceq_idx = 0; i < rf->msix_count; i++, iw_qvinfo++) { diff --git a/drivers/infiniband/hw/irdma/main.c b/drivers/infiniband/hw/irdma/main.c index 3f13200ff71b..1ee8969595d3 100644 --- a/drivers/infiniband/hw/irdma/main.c +++ b/drivers/infiniband/hw/irdma/main.c @@ -206,6 +206,43 @@ static void irdma_lan_unregister_qset(struct irdma_sc_vsi *vsi, ibdev_dbg(&iwdev->ibdev, "WS: LAN free_res for rdma qset failed.\n"); } +static int irdma_init_interrupts(struct irdma_pci_f *rf, struct ice_pf *pf) +{ + int i; + + rf->msix_count = num_online_cpus() + IRDMA_NUM_AEQ_MSIX; + rf->msix_entries = kcalloc(rf->msix_count, sizeof(*rf->msix_entries), + GFP_KERNEL); + if (!rf->msix_entries) + return -ENOMEM; + + for (i = 0; i < rf->msix_count; i++) + if (ice_alloc_rdma_qvector(pf, &rf->msix_entries[i])) + break; + + if (i < IRDMA_MIN_MSIX) { + for (; i > 0; i--) + ice_free_rdma_qvector(pf, &rf->msix_entries[i]); + + kfree(rf->msix_entries); + return -ENOMEM; + } + + rf->msix_count = i; + + return 0; +} + +static void irdma_deinit_interrupts(struct irdma_pci_f *rf, struct ice_pf *pf) +{ + int i; + + for (i = 0; i < rf->msix_count; i++) + ice_free_rdma_qvector(pf, &rf->msix_entries[i]); + + kfree(rf->msix_entries); +} + static void irdma_remove(struct auxiliary_device *aux_dev) { struct iidc_auxiliary_dev *iidc_adev = container_of(aux_dev, @@ -216,6 +253,7 @@ static void irdma_remove(struct auxiliary_device *aux_dev) irdma_ib_unregister_device(iwdev); ice_rdma_update_vsi_filter(pf, iwdev->vsi_num, false); + irdma_deinit_interrupts(iwdev->rf, pf); pr_debug("INIT: Gen2 PF[%d] device remove success\n", PCI_FUNC(pf->pdev->devfn)); } @@ -230,9 +268,7 @@ static void irdma_fill_device_info(struct irdma_device *iwdev, struct ice_pf *pf rf->gen_ops.unregister_qset = irdma_lan_unregister_qset; rf->hw.hw_addr = pf->hw.hw_addr; rf->pcidev = pf->pdev; - rf->msix_count = pf->num_rdma_msix; rf->pf_id = pf->hw.pf_id; - rf->msix_entries = &pf->msix_entries[pf->rdma_base_vector]; rf->default_vsi.vsi_idx = vsi->vsi_num; rf->protocol_used = pf->rdma_mode & IIDC_RDMA_PROTOCOL_ROCEV2 ? IRDMA_ROCE_PROTOCOL_ONLY : IRDMA_IWARP_PROTOCOL_ONLY; @@ -281,6 +317,10 @@ static int irdma_probe(struct auxiliary_device *aux_dev, const struct auxiliary_ irdma_fill_device_info(iwdev, pf, vsi); rf = iwdev->rf; + err = irdma_init_interrupts(rf, pf); + if (err) + goto err_init_interrupts; + err = irdma_ctrl_init_hw(rf); if (err) goto err_ctrl_init; @@ -311,6 +351,8 @@ err_ibreg: err_rt_init: irdma_ctrl_deinit_hw(rf); err_ctrl_init: + irdma_deinit_interrupts(rf, pf); +err_init_interrupts: kfree(iwdev->rf); ib_dealloc_device(&iwdev->ibdev); diff --git a/drivers/infiniband/hw/irdma/main.h b/drivers/infiniband/hw/irdma/main.h index 9f0ed6e84471..ef9a9b79d711 100644 --- a/drivers/infiniband/hw/irdma/main.h +++ b/drivers/infiniband/hw/irdma/main.h @@ -117,6 +117,9 @@ extern struct auxiliary_driver i40iw_auxiliary_drv; #define IRDMA_IRQ_NAME_STR_LEN (64) +#define IRDMA_NUM_AEQ_MSIX 1 +#define IRDMA_MIN_MSIX 2 + enum init_completion_state { INVALID_STATE = 0, INITIAL_STATE, diff --git a/drivers/net/ethernet/intel/ice/ice.h b/drivers/net/ethernet/intel/ice/ice.h index ad61dd688871..0dbc98ba69f4 100644 --- a/drivers/net/ethernet/intel/ice/ice.h +++ b/drivers/net/ethernet/intel/ice/ice.h @@ -97,7 +97,6 @@ #define ICE_MIN_LAN_OICR_MSIX 1 #define ICE_MIN_MSIX (ICE_MIN_LAN_TXRX_MSIX + ICE_MIN_LAN_OICR_MSIX) #define ICE_FDIR_MSIX 2 -#define ICE_RDMA_NUM_AEQ_MSIX 4 #define ICE_NO_VSI 0xffff #define ICE_VSI_MAP_CONTIG 0 #define ICE_VSI_MAP_SCATTER 1 diff --git a/drivers/net/ethernet/intel/ice/ice_idc.c b/drivers/net/ethernet/intel/ice/ice_idc.c index 145b27f2a4ce..bab3e81cad5d 100644 --- a/drivers/net/ethernet/intel/ice/ice_idc.c +++ b/drivers/net/ethernet/intel/ice/ice_idc.c @@ -228,61 +228,34 @@ void ice_get_qos_params(struct ice_pf *pf, struct iidc_qos_params *qos) } EXPORT_SYMBOL_GPL(ice_get_qos_params); -/** - * ice_alloc_rdma_qvectors - Allocate vector resources for RDMA driver - * @pf: board private structure to initialize - */ -static int ice_alloc_rdma_qvectors(struct ice_pf *pf) +int ice_alloc_rdma_qvector(struct ice_pf *pf, struct msix_entry *entry) { - if (ice_is_rdma_ena(pf)) { - int i; - - pf->msix_entries = kcalloc(pf->num_rdma_msix, - sizeof(*pf->msix_entries), - GFP_KERNEL); - if (!pf->msix_entries) - return -ENOMEM; + struct msi_map map = ice_alloc_irq(pf, true); - /* RDMA is the only user of pf->msix_entries array */ - pf->rdma_base_vector = 0; - - for (i = 0; i < pf->num_rdma_msix; i++) { - struct msix_entry *entry = &pf->msix_entries[i]; - struct msi_map map; + if (map.index < 0) + return -ENOMEM; - map = ice_alloc_irq(pf, false); - if (map.index < 0) - break; + entry->entry = map.index; + entry->vector = map.virq; - entry->entry = map.index; - entry->vector = map.virq; - } - } return 0; } +EXPORT_SYMBOL_GPL(ice_alloc_rdma_qvector); /** * ice_free_rdma_qvector - free vector resources reserved for RDMA driver * @pf: board private structure to initialize + * @entry: MSI-X entry to be removed */ -static void ice_free_rdma_qvector(struct ice_pf *pf) +void ice_free_rdma_qvector(struct ice_pf *pf, struct msix_entry *entry) { - int i; - - if (!pf->msix_entries) - return; - - for (i = 0; i < pf->num_rdma_msix; i++) { - struct msi_map map; + struct msi_map map; - map.index = pf->msix_entries[i].entry; - map.virq = pf->msix_entries[i].vector; - ice_free_irq(pf, map); - } - - kfree(pf->msix_entries); - pf->msix_entries = NULL; + map.index = entry->entry; + map.virq = entry->vector; + ice_free_irq(pf, map); } +EXPORT_SYMBOL_GPL(ice_free_rdma_qvector); /** * ice_adev_release - function to be mapped to AUX dev's release op @@ -382,12 +355,6 @@ int ice_init_rdma(struct ice_pf *pf) return -ENOMEM; } - /* Reserve vector resources */ - ret = ice_alloc_rdma_qvectors(pf); - if (ret < 0) { - dev_err(dev, "failed to reserve vectors for RDMA\n"); - goto err_reserve_rdma_qvector; - } pf->rdma_mode |= IIDC_RDMA_PROTOCOL_ROCEV2; ret = ice_plug_aux_dev(pf); if (ret) @@ -395,8 +362,6 @@ int ice_init_rdma(struct ice_pf *pf) return 0; err_plug_aux_dev: - ice_free_rdma_qvector(pf); -err_reserve_rdma_qvector: pf->adev = NULL; xa_erase(&ice_aux_id, pf->aux_idx); return ret; @@ -412,6 +377,5 @@ void ice_deinit_rdma(struct ice_pf *pf) return; ice_unplug_aux_dev(pf); - ice_free_rdma_qvector(pf); xa_erase(&ice_aux_id, pf->aux_idx); } diff --git a/drivers/net/ethernet/intel/ice/ice_irq.c b/drivers/net/ethernet/intel/ice/ice_irq.c index 1a7d446ab5f1..80c9ee2e64c1 100644 --- a/drivers/net/ethernet/intel/ice/ice_irq.c +++ b/drivers/net/ethernet/intel/ice/ice_irq.c @@ -84,11 +84,12 @@ static struct ice_irq_entry *ice_get_irq_res(struct ice_pf *pf, bool dyn_only) return entry; } +#define ICE_RDMA_AEQ_MSIX 1 static int ice_get_default_msix_amount(struct ice_pf *pf) { return ICE_MIN_LAN_OICR_MSIX + num_online_cpus() + (test_bit(ICE_FLAG_FD_ENA, pf->flags) ? ICE_FDIR_MSIX : 0) + - (ice_is_rdma_ena(pf) ? num_online_cpus() + ICE_RDMA_NUM_AEQ_MSIX : 0); + (ice_is_rdma_ena(pf) ? num_online_cpus() + ICE_RDMA_AEQ_MSIX : 0); } /** diff --git a/include/linux/net/intel/iidc.h b/include/linux/net/intel/iidc.h index 1c1332e4df26..13274c3def66 100644 --- a/include/linux/net/intel/iidc.h +++ b/include/linux/net/intel/iidc.h @@ -78,6 +78,8 @@ int ice_del_rdma_qset(struct ice_pf *pf, struct iidc_rdma_qset_params *qset); int ice_rdma_request_reset(struct ice_pf *pf, enum iidc_reset_type reset_type); int ice_rdma_update_vsi_filter(struct ice_pf *pf, u16 vsi_id, bool enable); void ice_get_qos_params(struct ice_pf *pf, struct iidc_qos_params *qos); +int ice_alloc_rdma_qvector(struct ice_pf *pf, struct msix_entry *entry); +void ice_free_rdma_qvector(struct ice_pf *pf, struct msix_entry *entry); /* Structure representing auxiliary driver tailored information about the core * PCI dev, each auxiliary driver using the IIDC interface will have an -- 2.51.0 From a8c2d3932c1106af2764cc6869b29bcf3cb5bc47 Mon Sep 17 00:00:00 2001 From: Michal Swiatkowski Date: Tue, 3 Dec 2024 07:58:14 +0100 Subject: [PATCH 16/16] ice: treat dyn_allowed only as suggestion It can be needed to have some MSI-X allocated as static and rest as dynamic. For example on PF VSI. We want to always have minimum one MSI-X on it, because of that it is allocated as a static one, rest can be dynamic if it is supported. Change the ice_get_irq_res() to allow using static entries if they are free even if caller wants dynamic one. Adjust limit values to the new approach. Min and max in limit means the values that are valid, so decrease max and num_static by one. Set vsi::irq_dyn_alloc if dynamic allocation is supported. Reviewed-by: Jacob Keller Reviewed-by: Wojciech Drewek Tested-by: Pucha Himasekhar Reddy Signed-off-by: Michal Swiatkowski Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/ice/ice_irq.c | 25 ++++++++++++------------ drivers/net/ethernet/intel/ice/ice_lib.c | 2 ++ 2 files changed, 15 insertions(+), 12 deletions(-) diff --git a/drivers/net/ethernet/intel/ice/ice_irq.c b/drivers/net/ethernet/intel/ice/ice_irq.c index 80c9ee2e64c1..d466d29b2ef1 100644 --- a/drivers/net/ethernet/intel/ice/ice_irq.c +++ b/drivers/net/ethernet/intel/ice/ice_irq.c @@ -45,7 +45,7 @@ static void ice_free_irq_res(struct ice_pf *pf, u16 index) /** * ice_get_irq_res - get an interrupt resource * @pf: board private structure - * @dyn_only: force entry to be dynamically allocated + * @dyn_allowed: allow entry to be dynamically allocated * * Allocate new irq entry in the free slot of the tracker. Since xarray * is used, always allocate new entry at the lowest possible index. Set @@ -53,11 +53,12 @@ static void ice_free_irq_res(struct ice_pf *pf, u16 index) * * Returns allocated irq entry or NULL on failure. */ -static struct ice_irq_entry *ice_get_irq_res(struct ice_pf *pf, bool dyn_only) +static struct ice_irq_entry *ice_get_irq_res(struct ice_pf *pf, + bool dyn_allowed) { - struct xa_limit limit = { .max = pf->irq_tracker.num_entries, + struct xa_limit limit = { .max = pf->irq_tracker.num_entries - 1, .min = 0 }; - unsigned int num_static = pf->irq_tracker.num_static; + unsigned int num_static = pf->irq_tracker.num_static - 1; struct ice_irq_entry *entry; unsigned int index; int ret; @@ -66,9 +67,9 @@ static struct ice_irq_entry *ice_get_irq_res(struct ice_pf *pf, bool dyn_only) if (!entry) return NULL; - /* skip preallocated entries if the caller says so */ - if (dyn_only) - limit.min = num_static; + /* only already allocated if the caller says so */ + if (!dyn_allowed) + limit.max = num_static; ret = xa_alloc(&pf->irq_tracker.entries, &index, entry, limit, GFP_KERNEL); @@ -78,7 +79,7 @@ static struct ice_irq_entry *ice_get_irq_res(struct ice_pf *pf, bool dyn_only) entry = NULL; } else { entry->index = index; - entry->dynamic = index >= num_static; + entry->dynamic = index > num_static; } return entry; @@ -137,7 +138,7 @@ int ice_init_interrupt_scheme(struct ice_pf *pf) /** * ice_alloc_irq - Allocate new interrupt vector * @pf: board private structure - * @dyn_only: force dynamic allocation of the interrupt + * @dyn_allowed: allow dynamic allocation of the interrupt * * Allocate new interrupt vector for a given owner id. * return struct msi_map with interrupt details and track @@ -150,20 +151,20 @@ int ice_init_interrupt_scheme(struct ice_pf *pf) * interrupt will be allocated with pci_msix_alloc_irq_at. * * Some callers may only support dynamically allocated interrupts. - * This is indicated with dyn_only flag. + * This is indicated with dyn_allowed flag. * * On failure, return map with negative .index. The caller * is expected to check returned map index. * */ -struct msi_map ice_alloc_irq(struct ice_pf *pf, bool dyn_only) +struct msi_map ice_alloc_irq(struct ice_pf *pf, bool dyn_allowed) { int sriov_base_vector = pf->sriov_base_vector; struct msi_map map = { .index = -ENOENT }; struct device *dev = ice_pf_to_dev(pf); struct ice_irq_entry *entry; - entry = ice_get_irq_res(pf, dyn_only); + entry = ice_get_irq_res(pf, dyn_allowed); if (!entry) return map; diff --git a/drivers/net/ethernet/intel/ice/ice_lib.c b/drivers/net/ethernet/intel/ice/ice_lib.c index 4b8d7aa7b1bb..1827f1f20ce8 100644 --- a/drivers/net/ethernet/intel/ice/ice_lib.c +++ b/drivers/net/ethernet/intel/ice/ice_lib.c @@ -571,6 +571,8 @@ ice_vsi_alloc_def(struct ice_vsi *vsi, struct ice_channel *ch) return -ENOMEM; } + vsi->irq_dyn_alloc = pci_msix_can_alloc_dyn(vsi->back->pdev); + switch (vsi->type) { case ICE_VSI_PF: case ICE_VSI_SF: -- 2.51.0