From be43a6b2382983c89b59166ba2c32ec0f1092cfe Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Thu, 7 Nov 2024 10:12:10 -0800 Subject: [PATCH 01/16] selftests: ncdevmem: Move ncdevmem under drivers/net/hw This is where all the tests that depend on the HW functionality live in and this is where the automated test is gonna be added in the next patch. Reviewed-by: Mina Almasry Signed-off-by: Stanislav Fomichev Link: https://patch.msgid.link/20241107181211.3934153-12-sdf@fomichev.me Signed-off-by: Jakub Kicinski --- tools/testing/selftests/drivers/net/hw/.gitignore | 1 + tools/testing/selftests/drivers/net/hw/Makefile | 8 ++++++++ .../testing/selftests/{net => drivers/net/hw}/ncdevmem.c | 0 tools/testing/selftests/net/.gitignore | 1 - tools/testing/selftests/net/Makefile | 8 -------- 5 files changed, 9 insertions(+), 9 deletions(-) create mode 100644 tools/testing/selftests/drivers/net/hw/.gitignore rename tools/testing/selftests/{net => drivers/net/hw}/ncdevmem.c (100%) diff --git a/tools/testing/selftests/drivers/net/hw/.gitignore b/tools/testing/selftests/drivers/net/hw/.gitignore new file mode 100644 index 000000000000..e9fe6ede681a --- /dev/null +++ b/tools/testing/selftests/drivers/net/hw/.gitignore @@ -0,0 +1 @@ +ncdevmem diff --git a/tools/testing/selftests/drivers/net/hw/Makefile b/tools/testing/selftests/drivers/net/hw/Makefile index c9f2f48fc30f..182348f4bd40 100644 --- a/tools/testing/selftests/drivers/net/hw/Makefile +++ b/tools/testing/selftests/drivers/net/hw/Makefile @@ -26,4 +26,12 @@ TEST_INCLUDES := \ ../../../net/forwarding/tc_common.sh \ # +# YNL files, must be before "include ..lib.mk" +YNL_GEN_FILES := ncdevmem +TEST_GEN_FILES += $(YNL_GEN_FILES) + include ../../../lib.mk + +# YNL build +YNL_GENS := ethtool netdev +include ../../../net/ynl.mk diff --git a/tools/testing/selftests/net/ncdevmem.c b/tools/testing/selftests/drivers/net/hw/ncdevmem.c similarity index 100% rename from tools/testing/selftests/net/ncdevmem.c rename to tools/testing/selftests/drivers/net/hw/ncdevmem.c diff --git a/tools/testing/selftests/net/.gitignore b/tools/testing/selftests/net/.gitignore index 217d8b7a7365..a78debbd1fe7 100644 --- a/tools/testing/selftests/net/.gitignore +++ b/tools/testing/selftests/net/.gitignore @@ -18,7 +18,6 @@ ipv6_flowlabel_mgr log.txt msg_oob msg_zerocopy -ncdevmem nettest psock_fanout psock_snd diff --git a/tools/testing/selftests/net/Makefile b/tools/testing/selftests/net/Makefile index 61cce028f105..9322b904ad00 100644 --- a/tools/testing/selftests/net/Makefile +++ b/tools/testing/selftests/net/Makefile @@ -98,10 +98,6 @@ TEST_PROGS += vlan_hw_filter.sh TEST_PROGS += bpf_offload.py TEST_PROGS += ipv6_route_update_soft_lockup.sh -# YNL files, must be before "include ..lib.mk" -YNL_GEN_FILES := ncdevmem -TEST_GEN_FILES += $(YNL_GEN_FILES) - TEST_FILES := settings TEST_FILES += in_netns.sh lib.sh net_helper.sh setup_loopback.sh setup_veth.sh @@ -111,10 +107,6 @@ TEST_INCLUDES := forwarding/lib.sh include ../lib.mk -# YNL build -YNL_GENS := ethtool netdev -include ynl.mk - $(OUTPUT)/epoll_busy_poll: LDLIBS += -lcap $(OUTPUT)/reuseport_bpf_numa: LDLIBS += -lnuma $(OUTPUT)/tcp_mmap: LDLIBS += -lpthread -lcrypto -- 2.51.0 From 80230864b7b0fd9b54b294ab08a28f01d4193aa2 Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Thu, 7 Nov 2024 10:12:11 -0800 Subject: [PATCH 02/16] selftests: ncdevmem: Add automated test Only RX side for now and small message to test the setup. In the future, we can extend it to TX side and to testing both sides with a couple of megs of data. make \ -C tools/testing/selftests \ TARGETS="drivers/hw/net" \ install INSTALL_PATH=~/tmp/ksft scp ~/tmp/ksft ${HOST}: scp ~/tmp/ksft ${PEER}: cfg+="NETIF=${DEV}\n" cfg+="LOCAL_V6=${HOST_IP}\n" cfg+="REMOTE_V6=${PEER_IP}\n" cfg+="REMOTE_TYPE=ssh\n" cfg+="REMOTE_ARGS=root@${PEER}\n" echo -e "$cfg" | ssh root@${HOST} "cat > ksft/drivers/net/net.config" ssh root@${HOST} "cd ksft && ./run_kselftest.sh -t drivers/net:devmem.py" Reviewed-by: Mina Almasry Signed-off-by: Stanislav Fomichev Reviewed-by: Joe Damato Link: https://patch.msgid.link/20241107181211.3934153-13-sdf@fomichev.me Signed-off-by: Jakub Kicinski --- .../testing/selftests/drivers/net/hw/Makefile | 1 + .../selftests/drivers/net/hw/devmem.py | 45 +++++++++++++++++++ 2 files changed, 46 insertions(+) create mode 100755 tools/testing/selftests/drivers/net/hw/devmem.py diff --git a/tools/testing/selftests/drivers/net/hw/Makefile b/tools/testing/selftests/drivers/net/hw/Makefile index 182348f4bd40..1c6a77480923 100644 --- a/tools/testing/selftests/drivers/net/hw/Makefile +++ b/tools/testing/selftests/drivers/net/hw/Makefile @@ -3,6 +3,7 @@ TEST_PROGS = \ csum.py \ devlink_port_split.py \ + devmem.py \ ethtool.sh \ ethtool_extended_state.sh \ ethtool_mm.sh \ diff --git a/tools/testing/selftests/drivers/net/hw/devmem.py b/tools/testing/selftests/drivers/net/hw/devmem.py new file mode 100755 index 000000000000..1223f0f5c10c --- /dev/null +++ b/tools/testing/selftests/drivers/net/hw/devmem.py @@ -0,0 +1,45 @@ +#!/usr/bin/env python3 +# SPDX-License-Identifier: GPL-2.0 + +from lib.py import ksft_run, ksft_exit +from lib.py import ksft_eq, KsftSkipEx +from lib.py import NetDrvEpEnv +from lib.py import bkg, cmd, rand_port, wait_port_listen +from lib.py import ksft_disruptive + + +def require_devmem(cfg): + if not hasattr(cfg, "_devmem_probed"): + port = rand_port() + probe_command = f"./ncdevmem -f {cfg.ifname}" + cfg._devmem_supported = cmd(probe_command, fail=False, shell=True).ret == 0 + cfg._devmem_probed = True + + if not cfg._devmem_supported: + raise KsftSkipEx("Test requires devmem support") + + +@ksft_disruptive +def check_rx(cfg) -> None: + cfg.require_v6() + require_devmem(cfg) + + port = rand_port() + listen_cmd = f"./ncdevmem -l -f {cfg.ifname} -s {cfg.v6} -p {port}" + + with bkg(listen_cmd) as socat: + wait_port_listen(port) + cmd(f"echo -e \"hello\\nworld\"| socat -u - TCP6:[{cfg.v6}]:{port}", host=cfg.remote, shell=True) + + ksft_eq(socat.stdout.strip(), "hello\nworld") + + +def main() -> None: + with NetDrvEpEnv(__file__) as cfg: + ksft_run([check_rx], + args=(cfg, )) + ksft_exit() + + +if __name__ == "__main__": + main() -- 2.51.0 From 7a3bcd39ae1f0e3ab896d9df62339ab4297a0bfd Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Sat, 9 Nov 2024 23:12:12 +0100 Subject: [PATCH 03/16] r8169: use helper r8169_mod_reg8_cond to simplify rtl_jumbo_config Use recently added helper r8169_mod_reg8_cond() to simplify jumbo mode configuration. Signed-off-by: Heiner Kallweit Reviewed-by: Simon Horman Link: https://patch.msgid.link/3df1d484-a02e-46e7-8f75-db5b428e422e@gmail.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/realtek/r8169_main.c | 77 ++++------------------- 1 file changed, 11 insertions(+), 66 deletions(-) diff --git a/drivers/net/ethernet/realtek/r8169_main.c b/drivers/net/ethernet/realtek/r8169_main.c index 6578a9947b82..907a482c012f 100644 --- a/drivers/net/ethernet/realtek/r8169_main.c +++ b/drivers/net/ethernet/realtek/r8169_main.c @@ -2543,86 +2543,31 @@ static void rtl8169_init_ring_indexes(struct rtl8169_private *tp) tp->dirty_tx = tp->cur_tx = tp->cur_rx = 0; } -static void r8168c_hw_jumbo_enable(struct rtl8169_private *tp) -{ - RTL_W8(tp, Config3, RTL_R8(tp, Config3) | Jumbo_En0); - RTL_W8(tp, Config4, RTL_R8(tp, Config4) | Jumbo_En1); -} - -static void r8168c_hw_jumbo_disable(struct rtl8169_private *tp) -{ - RTL_W8(tp, Config3, RTL_R8(tp, Config3) & ~Jumbo_En0); - RTL_W8(tp, Config4, RTL_R8(tp, Config4) & ~Jumbo_En1); -} - -static void r8168dp_hw_jumbo_enable(struct rtl8169_private *tp) -{ - RTL_W8(tp, Config3, RTL_R8(tp, Config3) | Jumbo_En0); -} - -static void r8168dp_hw_jumbo_disable(struct rtl8169_private *tp) -{ - RTL_W8(tp, Config3, RTL_R8(tp, Config3) & ~Jumbo_En0); -} - -static void r8168e_hw_jumbo_enable(struct rtl8169_private *tp) -{ - RTL_W8(tp, MaxTxPacketSize, 0x24); - RTL_W8(tp, Config3, RTL_R8(tp, Config3) | Jumbo_En0); - RTL_W8(tp, Config4, RTL_R8(tp, Config4) | 0x01); -} - -static void r8168e_hw_jumbo_disable(struct rtl8169_private *tp) -{ - RTL_W8(tp, MaxTxPacketSize, 0x3f); - RTL_W8(tp, Config3, RTL_R8(tp, Config3) & ~Jumbo_En0); - RTL_W8(tp, Config4, RTL_R8(tp, Config4) & ~0x01); -} - -static void r8168b_1_hw_jumbo_enable(struct rtl8169_private *tp) -{ - RTL_W8(tp, Config4, RTL_R8(tp, Config4) | (1 << 0)); -} - -static void r8168b_1_hw_jumbo_disable(struct rtl8169_private *tp) -{ - RTL_W8(tp, Config4, RTL_R8(tp, Config4) & ~(1 << 0)); -} - static void rtl_jumbo_config(struct rtl8169_private *tp) { bool jumbo = tp->dev->mtu > ETH_DATA_LEN; int readrq = 4096; + if (jumbo && tp->mac_version >= RTL_GIGA_MAC_VER_17 && + tp->mac_version <= RTL_GIGA_MAC_VER_26) + readrq = 512; + rtl_unlock_config_regs(tp); switch (tp->mac_version) { case RTL_GIGA_MAC_VER_17: - if (jumbo) { - readrq = 512; - r8168b_1_hw_jumbo_enable(tp); - } else { - r8168b_1_hw_jumbo_disable(tp); - } + r8169_mod_reg8_cond(tp, Config4, BIT(0), jumbo); break; case RTL_GIGA_MAC_VER_18 ... RTL_GIGA_MAC_VER_26: - if (jumbo) { - readrq = 512; - r8168c_hw_jumbo_enable(tp); - } else { - r8168c_hw_jumbo_disable(tp); - } + r8169_mod_reg8_cond(tp, Config3, Jumbo_En0, jumbo); + r8169_mod_reg8_cond(tp, Config4, Jumbo_En1, jumbo); break; case RTL_GIGA_MAC_VER_28: - if (jumbo) - r8168dp_hw_jumbo_enable(tp); - else - r8168dp_hw_jumbo_disable(tp); + r8169_mod_reg8_cond(tp, Config3, Jumbo_En0, jumbo); break; case RTL_GIGA_MAC_VER_31 ... RTL_GIGA_MAC_VER_33: - if (jumbo) - r8168e_hw_jumbo_enable(tp); - else - r8168e_hw_jumbo_disable(tp); + RTL_W8(tp, MaxTxPacketSize, jumbo ? 0x24 : 0x3f); + r8169_mod_reg8_cond(tp, Config3, Jumbo_En0, jumbo); + r8169_mod_reg8_cond(tp, Config4, BIT(0), jumbo); break; default: break; -- 2.51.0 From d5ec8d91f82ef78405b506737952dec8af95a95b Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Thu, 7 Nov 2024 16:48:14 -0800 Subject: [PATCH 04/16] rtnetlink: Remove __rtnl_link_unregister(). rtnl_link_unregister() holds RTNL and calls __rtnl_link_unregister(), where we call synchronize_srcu() to wait inflight RTM_NEWLINK requests for per-netns RTNL. We put synchronize_srcu() in __rtnl_link_unregister() due to ifb.ko and dummy.ko. However, rtnl_newlink() will acquire SRCU before RTNL later in this series. Then, lockdep will detect the deadlock: rtnl_link_unregister() rtnl_newlink() ---- ---- lock(rtnl_mutex); lock(&ops->srcu); lock(rtnl_mutex); sync(&ops->srcu); To avoid the problem, we must call synchronize_srcu() before RTNL in rtnl_link_unregister(). As a preparation, let's remove __rtnl_link_unregister(). Signed-off-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20241108004823.29419-2-kuniyu@amazon.com Signed-off-by: Jakub Kicinski --- drivers/net/dummy.c | 6 +++++- drivers/net/ifb.c | 6 +++++- include/net/rtnetlink.h | 1 - net/core/rtnetlink.c | 32 ++++++++++---------------------- 4 files changed, 20 insertions(+), 25 deletions(-) diff --git a/drivers/net/dummy.c b/drivers/net/dummy.c index e9c5e1e11fa0..72618b6af44e 100644 --- a/drivers/net/dummy.c +++ b/drivers/net/dummy.c @@ -166,6 +166,7 @@ err: static int __init dummy_init_module(void) { + bool need_unregister = false; int i, err = 0; down_write(&pernet_ops_rwsem); @@ -179,12 +180,15 @@ static int __init dummy_init_module(void) cond_resched(); } if (err < 0) - __rtnl_link_unregister(&dummy_link_ops); + need_unregister = true; out: rtnl_unlock(); up_write(&pernet_ops_rwsem); + if (need_unregister) + rtnl_link_unregister(&dummy_link_ops); + return err; } diff --git a/drivers/net/ifb.c b/drivers/net/ifb.c index 2c1b5def4a0b..a4b9ec4e8f30 100644 --- a/drivers/net/ifb.c +++ b/drivers/net/ifb.c @@ -424,6 +424,7 @@ err: static int __init ifb_init_module(void) { + bool need_unregister = false; int i, err; down_write(&pernet_ops_rwsem); @@ -437,12 +438,15 @@ static int __init ifb_init_module(void) cond_resched(); } if (err) - __rtnl_link_unregister(&ifb_link_ops); + need_unregister = true; out: rtnl_unlock(); up_write(&pernet_ops_rwsem); + if (need_unregister) + rtnl_link_unregister(&ifb_link_ops); + return err; } diff --git a/include/net/rtnetlink.h b/include/net/rtnetlink.h index b260c0cc9671..3ebfcc6e56fd 100644 --- a/include/net/rtnetlink.h +++ b/include/net/rtnetlink.h @@ -165,7 +165,6 @@ struct rtnl_link_ops { }; int __rtnl_link_register(struct rtnl_link_ops *ops); -void __rtnl_link_unregister(struct rtnl_link_ops *ops); int rtnl_link_register(struct rtnl_link_ops *ops); void rtnl_link_unregister(struct rtnl_link_ops *ops); diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index a5c386a45501..f0246ecec7fa 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -568,27 +568,6 @@ static void __rtnl_kill_links(struct net *net, struct rtnl_link_ops *ops) unregister_netdevice_many(&list_kill); } -/** - * __rtnl_link_unregister - Unregister rtnl_link_ops from rtnetlink. - * @ops: struct rtnl_link_ops * to unregister - * - * The caller must hold the rtnl_mutex and guarantee net_namespace_list - * integrity (hold pernet_ops_rwsem for writing to close the race - * with setup_net() and cleanup_net()). - */ -void __rtnl_link_unregister(struct rtnl_link_ops *ops) -{ - struct net *net; - - list_del_rcu(&ops->list); - synchronize_srcu(&ops->srcu); - cleanup_srcu_struct(&ops->srcu); - - for_each_net(net) - __rtnl_kill_links(net, ops); -} -EXPORT_SYMBOL_GPL(__rtnl_link_unregister); - /* Return with the rtnl_lock held when there are no network * devices unregistering in any network namespace. */ @@ -617,10 +596,19 @@ static void rtnl_lock_unregistering_all(void) */ void rtnl_link_unregister(struct rtnl_link_ops *ops) { + struct net *net; + /* Close the race with setup_net() and cleanup_net() */ down_write(&pernet_ops_rwsem); rtnl_lock_unregistering_all(); - __rtnl_link_unregister(ops); + + list_del_rcu(&ops->list); + synchronize_srcu(&ops->srcu); + cleanup_srcu_struct(&ops->srcu); + + for_each_net(net) + __rtnl_kill_links(net, ops); + rtnl_unlock(); up_write(&pernet_ops_rwsem); } -- 2.51.0 From 6b57ff21a3109b1dba2d286ff415463e6fb1fca3 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Thu, 7 Nov 2024 16:48:15 -0800 Subject: [PATCH 05/16] rtnetlink: Protect link_ops by mutex. rtnl_link_unregister() holds RTNL and calls synchronize_srcu(), but rtnl_newlink() will acquire SRCU frist and then RTNL. Then, we need to unlink ops and call synchronize_srcu() outside of RTNL to avoid the deadlock. rtnl_link_unregister() rtnl_newlink() ---- ---- lock(rtnl_mutex); lock(&ops->srcu); lock(rtnl_mutex); sync(&ops->srcu); Let's move as such and add a mutex to protect link_ops. Now, link_ops is protected by its dedicated mutex and rtnl_link_register() no longer needs to hold RTNL. While at it, we move the initialisation of ops->dellink and ops->srcu out of the mutex scope. Signed-off-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20241108004823.29419-3-kuniyu@amazon.com Signed-off-by: Jakub Kicinski --- include/net/rtnetlink.h | 2 +- net/core/rtnetlink.c | 33 ++++++++++++++++++++------------- 2 files changed, 21 insertions(+), 14 deletions(-) diff --git a/include/net/rtnetlink.h b/include/net/rtnetlink.h index 3ebfcc6e56fd..7559020f760c 100644 --- a/include/net/rtnetlink.h +++ b/include/net/rtnetlink.h @@ -71,7 +71,7 @@ static inline int rtnl_msg_family(const struct nlmsghdr *nlh) /** * struct rtnl_link_ops - rtnetlink link operations * - * @list: Used internally, protected by RTNL and SRCU + * @list: Used internally, protected by link_ops_mutex and SRCU * @srcu: Used internally * @kind: Identifier * @netns_refund: Physical device, move to init_net on netns exit diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index f0246ecec7fa..21154ef0048f 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -466,6 +466,7 @@ void __rtnl_unregister_many(const struct rtnl_msg_handler *handlers, int n) } EXPORT_SYMBOL_GPL(__rtnl_unregister_many); +static DEFINE_MUTEX(link_ops_mutex); static LIST_HEAD(link_ops); static struct rtnl_link_ops *rtnl_link_ops_get(const char *kind, int *srcu_index) @@ -508,14 +509,6 @@ int __rtnl_link_register(struct rtnl_link_ops *ops) struct rtnl_link_ops *tmp; int err; - /* When RTNL is removed, add lock for link_ops. */ - ASSERT_RTNL(); - - list_for_each_entry(tmp, &link_ops, list) { - if (!strcmp(ops->kind, tmp->kind)) - return -EEXIST; - } - /* The check for alloc/setup is here because if ops * does not have that filled up, it is not possible * to use the ops for creating device. So do not @@ -528,9 +521,20 @@ int __rtnl_link_register(struct rtnl_link_ops *ops) if (err) return err; + mutex_lock(&link_ops_mutex); + + list_for_each_entry(tmp, &link_ops, list) { + if (!strcmp(ops->kind, tmp->kind)) { + err = -EEXIST; + goto unlock; + } + } + list_add_tail_rcu(&ops->list, &link_ops); +unlock: + mutex_unlock(&link_ops_mutex); - return 0; + return err; } EXPORT_SYMBOL_GPL(__rtnl_link_register); @@ -598,14 +602,17 @@ void rtnl_link_unregister(struct rtnl_link_ops *ops) { struct net *net; - /* Close the race with setup_net() and cleanup_net() */ - down_write(&pernet_ops_rwsem); - rtnl_lock_unregistering_all(); - + mutex_lock(&link_ops_mutex); list_del_rcu(&ops->list); + mutex_unlock(&link_ops_mutex); + synchronize_srcu(&ops->srcu); cleanup_srcu_struct(&ops->srcu); + /* Close the race with setup_net() and cleanup_net() */ + down_write(&pernet_ops_rwsem); + rtnl_lock_unregistering_all(); + for_each_net(net) __rtnl_kill_links(net, ops); -- 2.51.0 From 68297dbb967f87c3c92af9d2f652270f57c547c7 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Thu, 7 Nov 2024 16:48:16 -0800 Subject: [PATCH 06/16] rtnetlink: Remove __rtnl_link_register() link_ops is protected by link_ops_mutex and no longer needs RTNL, so we have no reason to have __rtnl_link_register() separately. Let's remove it and call rtnl_link_register() from ifb.ko and dummy.ko. Note that both modules' init() work on init_net only, so we need not export pernet_ops_rwsem and can use rtnl_net_lock() there. Signed-off-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20241108004823.29419-4-kuniyu@amazon.com Signed-off-by: Jakub Kicinski --- drivers/net/dummy.c | 17 ++++++----------- drivers/net/ifb.c | 17 ++++++----------- include/net/rtnetlink.h | 2 -- net/core/net_namespace.c | 1 - net/core/rtnetlink.c | 35 +++++++---------------------------- 5 files changed, 19 insertions(+), 53 deletions(-) diff --git a/drivers/net/dummy.c b/drivers/net/dummy.c index 72618b6af44e..005d79975f3b 100644 --- a/drivers/net/dummy.c +++ b/drivers/net/dummy.c @@ -166,27 +166,22 @@ err: static int __init dummy_init_module(void) { - bool need_unregister = false; int i, err = 0; - down_write(&pernet_ops_rwsem); - rtnl_lock(); - err = __rtnl_link_register(&dummy_link_ops); + err = rtnl_link_register(&dummy_link_ops); if (err < 0) - goto out; + return err; + + rtnl_net_lock(&init_net); for (i = 0; i < numdummies && !err; i++) { err = dummy_init_one(); cond_resched(); } - if (err < 0) - need_unregister = true; -out: - rtnl_unlock(); - up_write(&pernet_ops_rwsem); + rtnl_net_unlock(&init_net); - if (need_unregister) + if (err < 0) rtnl_link_unregister(&dummy_link_ops); return err; diff --git a/drivers/net/ifb.c b/drivers/net/ifb.c index a4b9ec4e8f30..67424888ff0a 100644 --- a/drivers/net/ifb.c +++ b/drivers/net/ifb.c @@ -424,27 +424,22 @@ err: static int __init ifb_init_module(void) { - bool need_unregister = false; int i, err; - down_write(&pernet_ops_rwsem); - rtnl_lock(); - err = __rtnl_link_register(&ifb_link_ops); + err = rtnl_link_register(&ifb_link_ops); if (err < 0) - goto out; + return err; + + rtnl_net_lock(&init_net); for (i = 0; i < numifbs && !err; i++) { err = ifb_init_one(i); cond_resched(); } - if (err) - need_unregister = true; -out: - rtnl_unlock(); - up_write(&pernet_ops_rwsem); + rtnl_net_unlock(&init_net); - if (need_unregister) + if (err) rtnl_link_unregister(&ifb_link_ops); return err; diff --git a/include/net/rtnetlink.h b/include/net/rtnetlink.h index 7559020f760c..ef7c11f0d74c 100644 --- a/include/net/rtnetlink.h +++ b/include/net/rtnetlink.h @@ -164,8 +164,6 @@ struct rtnl_link_ops { int *prividx, int attr); }; -int __rtnl_link_register(struct rtnl_link_ops *ops); - int rtnl_link_register(struct rtnl_link_ops *ops); void rtnl_link_unregister(struct rtnl_link_ops *ops); diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c index 809b48c0a528..157021ced442 100644 --- a/net/core/net_namespace.c +++ b/net/core/net_namespace.c @@ -56,7 +56,6 @@ static bool init_net_initialized; * outside. */ DECLARE_RWSEM(pernet_ops_rwsem); -EXPORT_SYMBOL_GPL(pernet_ops_rwsem); #define MIN_PERNET_OPS_ID \ ((sizeof(struct net_generic) + sizeof(void *) - 1) / sizeof(void *)) diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 21154ef0048f..e8357a3b9c7e 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -495,20 +495,21 @@ static void rtnl_link_ops_put(struct rtnl_link_ops *ops, int srcu_index) } /** - * __rtnl_link_register - Register rtnl_link_ops with rtnetlink. + * rtnl_link_register - Register rtnl_link_ops with rtnetlink. * @ops: struct rtnl_link_ops * to register * - * The caller must hold the rtnl_mutex. This function should be used - * by drivers that create devices during module initialization. It - * must be called before registering the devices. - * * Returns 0 on success or a negative error code. */ -int __rtnl_link_register(struct rtnl_link_ops *ops) +int rtnl_link_register(struct rtnl_link_ops *ops) { struct rtnl_link_ops *tmp; int err; + /* Sanity-check max sizes to avoid stack buffer overflow. */ + if (WARN_ON(ops->maxtype > RTNL_MAX_TYPE || + ops->slave_maxtype > RTNL_SLAVE_MAX_TYPE)) + return -EINVAL; + /* The check for alloc/setup is here because if ops * does not have that filled up, it is not possible * to use the ops for creating device. So do not @@ -536,28 +537,6 @@ unlock: return err; } -EXPORT_SYMBOL_GPL(__rtnl_link_register); - -/** - * rtnl_link_register - Register rtnl_link_ops with rtnetlink. - * @ops: struct rtnl_link_ops * to register - * - * Returns 0 on success or a negative error code. - */ -int rtnl_link_register(struct rtnl_link_ops *ops) -{ - int err; - - /* Sanity-check max sizes to avoid stack buffer overflow. */ - if (WARN_ON(ops->maxtype > RTNL_MAX_TYPE || - ops->slave_maxtype > RTNL_SLAVE_MAX_TYPE)) - return -EINVAL; - - rtnl_lock(); - err = __rtnl_link_register(ops); - rtnl_unlock(); - return err; -} EXPORT_SYMBOL_GPL(rtnl_link_register); static void __rtnl_kill_links(struct net *net, struct rtnl_link_ops *ops) -- 2.51.0 From cbaaa6326bc58e75239df437a8fdcdb2335d3b24 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Thu, 7 Nov 2024 16:48:17 -0800 Subject: [PATCH 07/16] rtnetlink: Introduce struct rtnl_nets and helpers. rtnl_newlink() needs to hold 3 per-netns RTNL: 2 for a new device and 1 for its peer. We will add rtnl_nets_lock() later, which performs the nested locking based on struct rtnl_nets, which has an array of struct net pointers. rtnl_nets_add() adds a net pointer to the array and sorts it so that rtnl_nets_lock() can simply acquire per-netns RTNL from array[0] to [2]. Before calling rtnl_nets_add(), get_net() must be called for the net, and rtnl_nets_destroy() will call put_net() for each. Let's apply the helpers to rtnl_newlink(). When CONFIG_DEBUG_NET_SMALL_RTNL is disabled, we do not call rtnl_net_lock() thus do not care about the array order, so rtnl_net_cmp_locks() returns -1 so that the loop in rtnl_nets_add() can be optimised to NOP. Signed-off-by: Kuniyuki Iwashima Reviewed-by: Eric Dumazet Reviewed-by: Nikolay Aleksandrov Link: https://patch.msgid.link/20241108004823.29419-5-kuniyu@amazon.com Signed-off-by: Jakub Kicinski --- net/core/rtnetlink.c | 70 ++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 67 insertions(+), 3 deletions(-) diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index e8357a3b9c7e..960d9d2c6aec 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -258,8 +258,67 @@ bool lockdep_rtnl_net_is_held(struct net *net) return lockdep_rtnl_is_held() && lockdep_is_held(&net->rtnl_mutex); } EXPORT_SYMBOL(lockdep_rtnl_net_is_held); +#else +static int rtnl_net_cmp_locks(const struct net *net_a, const struct net *net_b) +{ + /* No need to swap */ + return -1; +} #endif +struct rtnl_nets { + /* ->newlink() needs to freeze 3 netns at most; + * 2 for the new device, 1 for its peer. + */ + struct net *net[3]; + unsigned char len; +}; + +static void rtnl_nets_init(struct rtnl_nets *rtnl_nets) +{ + memset(rtnl_nets, 0, sizeof(*rtnl_nets)); +} + +static void rtnl_nets_destroy(struct rtnl_nets *rtnl_nets) +{ + int i; + + for (i = 0; i < rtnl_nets->len; i++) { + put_net(rtnl_nets->net[i]); + rtnl_nets->net[i] = NULL; + } + + rtnl_nets->len = 0; +} + +/** + * rtnl_nets_add - Add netns to be locked before ->newlink(). + * + * @rtnl_nets: rtnl_nets pointer passed to ->get_peer_net(). + * @net: netns pointer with an extra refcnt held. + * + * The extra refcnt is released in rtnl_nets_destroy(). + */ +static void rtnl_nets_add(struct rtnl_nets *rtnl_nets, struct net *net) +{ + int i; + + DEBUG_NET_WARN_ON_ONCE(rtnl_nets->len == ARRAY_SIZE(rtnl_nets->net)); + + for (i = 0; i < rtnl_nets->len; i++) { + switch (rtnl_net_cmp_locks(rtnl_nets->net[i], net)) { + case 0: + put_net(net); + return; + case 1: + swap(rtnl_nets->net[i], net); + } + } + + rtnl_nets->net[i] = net; + rtnl_nets->len++; +} + static struct rtnl_link __rcu *__rcu *rtnl_msg_handlers[RTNL_FAMILY_MAX + 1]; static inline int rtm_msgindex(int msgtype) @@ -3767,6 +3826,7 @@ static int rtnl_newlink(struct sk_buff *skb, struct nlmsghdr *nlh, struct net *tgt_net, *link_net = NULL; struct rtnl_link_ops *ops = NULL; struct rtnl_newlink_tbs *tbs; + struct rtnl_nets rtnl_nets; int ops_srcu_index; int ret; @@ -3810,6 +3870,8 @@ static int rtnl_newlink(struct sk_buff *skb, struct nlmsghdr *nlh, #endif } + rtnl_nets_init(&rtnl_nets); + if (ops) { if (ops->maxtype > RTNL_MAX_TYPE) { ret = -EINVAL; @@ -3839,6 +3901,8 @@ static int rtnl_newlink(struct sk_buff *skb, struct nlmsghdr *nlh, goto put_ops; } + rtnl_nets_add(&rtnl_nets, tgt_net); + if (tb[IFLA_LINK_NETNSID]) { int id = nla_get_s32(tb[IFLA_LINK_NETNSID]); @@ -3849,6 +3913,8 @@ static int rtnl_newlink(struct sk_buff *skb, struct nlmsghdr *nlh, goto put_net; } + rtnl_nets_add(&rtnl_nets, link_net); + if (!netlink_ns_capable(skb, link_net->user_ns, CAP_NET_ADMIN)) { ret = -EPERM; goto put_net; @@ -3858,9 +3924,7 @@ static int rtnl_newlink(struct sk_buff *skb, struct nlmsghdr *nlh, ret = __rtnl_newlink(skb, nlh, ops, tgt_net, link_net, tbs, data, extack); put_net: - if (link_net) - put_net(link_net); - put_net(tgt_net); + rtnl_nets_destroy(&rtnl_nets); put_ops: if (ops) rtnl_link_ops_put(ops, ops_srcu_index); -- 2.51.0 From 28690e5361c05fd4ef0ca3a17d1c667cba790554 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Thu, 7 Nov 2024 16:48:18 -0800 Subject: [PATCH 08/16] rtnetlink: Add peer_type in struct rtnl_link_ops. In ops->newlink(), veth, vxcan, and netkit call rtnl_link_get_net() with a net pointer, which is the first argument of ->newlink(). rtnl_link_get_net() could return another netns based on IFLA_NET_NS_PID and IFLA_NET_NS_FD in the peer device's attributes. We want to get it and fill rtnl_nets->nets[] in advance in rtnl_newlink() for per-netns RTNL. All of the three get the peer netns in the same way: 1. Call rtnl_nla_parse_ifinfomsg() 2. Call ops->validate() (vxcan doesn't have) 3. Call rtnl_link_get_net_tb() Let's add a new field peer_type to struct rtnl_link_ops and prefetch netns in the peer ifla to add it to rtnl_nets in rtnl_newlink(). Signed-off-by: Kuniyuki Iwashima Reviewed-by: Eric Dumazet Reviewed-by: Nikolay Aleksandrov Link: https://patch.msgid.link/20241108004823.29419-6-kuniyu@amazon.com Signed-off-by: Jakub Kicinski --- include/net/rtnetlink.h | 2 ++ net/core/rtnetlink.c | 55 ++++++++++++++++++++++++++++++++++++++--- 2 files changed, 53 insertions(+), 4 deletions(-) diff --git a/include/net/rtnetlink.h b/include/net/rtnetlink.h index ef7c11f0d74c..bef76abcff8d 100644 --- a/include/net/rtnetlink.h +++ b/include/net/rtnetlink.h @@ -75,6 +75,7 @@ static inline int rtnl_msg_family(const struct nlmsghdr *nlh) * @srcu: Used internally * @kind: Identifier * @netns_refund: Physical device, move to init_net on netns exit + * @peer_type: Peer device specific netlink attribute number (e.g. VETH_INFO_PEER) * @maxtype: Highest device specific netlink attribute number * @policy: Netlink policy for device specific attribute validation * @validate: Optional validation function for netlink/changelink parameters @@ -116,6 +117,7 @@ struct rtnl_link_ops { void (*setup)(struct net_device *dev); bool netns_refund; + const u16 peer_type; unsigned int maxtype; const struct nla_policy *policy; int (*validate)(struct nlattr *tb[], diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 960d9d2c6aec..1af187a4a3f1 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -2492,9 +2492,10 @@ int rtnl_nla_parse_ifinfomsg(struct nlattr **tb, const struct nlattr *nla_peer, } EXPORT_SYMBOL(rtnl_nla_parse_ifinfomsg); -struct net *rtnl_link_get_net(struct net *src_net, struct nlattr *tb[]) +static struct net *rtnl_link_get_net_ifla(struct nlattr *tb[]) { - struct net *net; + struct net *net = NULL; + /* Examine the link attributes and figure out which * network namespace we are talking about. */ @@ -2502,8 +2503,17 @@ struct net *rtnl_link_get_net(struct net *src_net, struct nlattr *tb[]) net = get_net_ns_by_pid(nla_get_u32(tb[IFLA_NET_NS_PID])); else if (tb[IFLA_NET_NS_FD]) net = get_net_ns_by_fd(nla_get_u32(tb[IFLA_NET_NS_FD])); - else + + return net; +} + +struct net *rtnl_link_get_net(struct net *src_net, struct nlattr *tb[]) +{ + struct net *net = rtnl_link_get_net_ifla(tb); + + if (!net) net = get_net(src_net); + return net; } EXPORT_SYMBOL(rtnl_link_get_net); @@ -3765,6 +3775,37 @@ out_unregister: goto out; } +static int rtnl_add_peer_net(struct rtnl_nets *rtnl_nets, + const struct rtnl_link_ops *ops, + struct nlattr *data[], + struct netlink_ext_ack *extack) +{ + struct nlattr *tb[IFLA_MAX + 1]; + struct net *net; + int err; + + if (!data || !data[ops->peer_type]) + return 0; + + err = rtnl_nla_parse_ifinfomsg(tb, data[ops->peer_type], extack); + if (err < 0) + return err; + + if (ops->validate) { + err = ops->validate(tb, NULL, extack); + if (err < 0) + return err; + } + + net = rtnl_link_get_net_ifla(tb); + if (IS_ERR(net)) + return PTR_ERR(net); + if (net) + rtnl_nets_add(rtnl_nets, net); + + return 0; +} + static int __rtnl_newlink(struct sk_buff *skb, struct nlmsghdr *nlh, const struct rtnl_link_ops *ops, struct net *tgt_net, struct net *link_net, @@ -3893,12 +3934,18 @@ static int rtnl_newlink(struct sk_buff *skb, struct nlmsghdr *nlh, if (ret < 0) goto put_ops; } + + if (ops->peer_type) { + ret = rtnl_add_peer_net(&rtnl_nets, ops, data, extack); + if (ret < 0) + goto put_ops; + } } tgt_net = rtnl_link_get_net_capable(skb, sock_net(skb->sk), tb, CAP_NET_ADMIN); if (IS_ERR(tgt_net)) { ret = PTR_ERR(tgt_net); - goto put_ops; + goto put_net; } rtnl_nets_add(&rtnl_nets, tgt_net); -- 2.51.0 From 0eb87b02a7058f1dc64bcd6fa619d8556186de3d Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Thu, 7 Nov 2024 16:48:19 -0800 Subject: [PATCH 09/16] veth: Set VETH_INFO_PEER to veth_link_ops.peer_type. For per-netns RTNL, we need to prefetch the peer device's netns. Let's set rtnl_link_ops.peer_type and accordingly remove duplicated validation in ->newlink(). Signed-off-by: Kuniyuki Iwashima Reviewed-by: Eric Dumazet Reviewed-by: Nikolay Aleksandrov Link: https://patch.msgid.link/20241108004823.29419-7-kuniyu@amazon.com Signed-off-by: Jakub Kicinski --- drivers/net/veth.c | 18 ++++-------------- 1 file changed, 4 insertions(+), 14 deletions(-) diff --git a/drivers/net/veth.c b/drivers/net/veth.c index 18148e068aa0..0d6d0d749d44 100644 --- a/drivers/net/veth.c +++ b/drivers/net/veth.c @@ -1781,19 +1781,11 @@ static int veth_newlink(struct net *src_net, struct net_device *dev, /* * create and register peer first */ - if (data != NULL && data[VETH_INFO_PEER] != NULL) { - struct nlattr *nla_peer; + if (data && data[VETH_INFO_PEER]) { + struct nlattr *nla_peer = data[VETH_INFO_PEER]; - nla_peer = data[VETH_INFO_PEER]; ifmp = nla_data(nla_peer); - err = rtnl_nla_parse_ifinfomsg(peer_tb, nla_peer, extack); - if (err < 0) - return err; - - err = veth_validate(peer_tb, NULL, extack); - if (err < 0) - return err; - + rtnl_nla_parse_ifinfomsg(peer_tb, nla_peer, extack); tbp = peer_tb; } else { ifmp = NULL; @@ -1809,9 +1801,6 @@ static int veth_newlink(struct net *src_net, struct net_device *dev, } net = rtnl_link_get_net(src_net, tbp); - if (IS_ERR(net)) - return PTR_ERR(net); - peer = rtnl_create_link(net, ifname, name_assign_type, &veth_link_ops, tbp, extack); if (IS_ERR(peer)) { @@ -1952,6 +1941,7 @@ static struct rtnl_link_ops veth_link_ops = { .newlink = veth_newlink, .dellink = veth_dellink, .policy = veth_policy, + .peer_type = VETH_INFO_PEER, .maxtype = VETH_INFO_MAX, .get_link_net = veth_get_link_net, .get_num_tx_queues = veth_get_num_queues, -- 2.51.0 From 6b84e558e95d95f3bb1139dfbb693bc22c760dad Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Thu, 7 Nov 2024 16:48:20 -0800 Subject: [PATCH 10/16] vxcan: Set VXCAN_INFO_PEER to vxcan_link_ops.peer_type. For per-netns RTNL, we need to prefetch the peer device's netns. Let's set rtnl_link_ops.peer_type and accordingly remove duplicated validation in ->newlink(). Signed-off-by: Kuniyuki Iwashima Reviewed-by: Eric Dumazet Reviewed-by: Nikolay Aleksandrov Link: https://patch.msgid.link/20241108004823.29419-8-kuniyu@amazon.com Signed-off-by: Jakub Kicinski --- drivers/net/can/vxcan.c | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) diff --git a/drivers/net/can/vxcan.c b/drivers/net/can/vxcan.c index 9e1b7d41005f..da7c72105fb6 100644 --- a/drivers/net/can/vxcan.c +++ b/drivers/net/can/vxcan.c @@ -188,14 +188,10 @@ static int vxcan_newlink(struct net *net, struct net_device *dev, /* register peer device */ if (data && data[VXCAN_INFO_PEER]) { - struct nlattr *nla_peer; + struct nlattr *nla_peer = data[VXCAN_INFO_PEER]; - nla_peer = data[VXCAN_INFO_PEER]; ifmp = nla_data(nla_peer); - err = rtnl_nla_parse_ifinfomsg(peer_tb, nla_peer, extack); - if (err < 0) - return err; - + rtnl_nla_parse_ifinfomsg(peer_tb, nla_peer, extack); tbp = peer_tb; } @@ -208,9 +204,6 @@ static int vxcan_newlink(struct net *net, struct net_device *dev, } peer_net = rtnl_link_get_net(net, tbp); - if (IS_ERR(peer_net)) - return PTR_ERR(peer_net); - peer = rtnl_create_link(peer_net, ifname, name_assign_type, &vxcan_link_ops, tbp, extack); if (IS_ERR(peer)) { @@ -302,6 +295,7 @@ static struct rtnl_link_ops vxcan_link_ops = { .newlink = vxcan_newlink, .dellink = vxcan_dellink, .policy = vxcan_policy, + .peer_type = VXCAN_INFO_PEER, .maxtype = VXCAN_INFO_MAX, .get_link_net = vxcan_get_link_net, }; -- 2.51.0 From fefd5d08217284a8894502eb1148ff88bc8510c0 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Thu, 7 Nov 2024 16:48:21 -0800 Subject: [PATCH 11/16] netkit: Set IFLA_NETKIT_PEER_INFO to netkit_link_ops.peer_type. For per-netns RTNL, we need to prefetch the peer device's netns. Let's set rtnl_link_ops.peer_type and accordingly remove duplicated validation in ->newlink(). Signed-off-by: Kuniyuki Iwashima Reviewed-by: Eric Dumazet Acked-by: Nikolay Aleksandrov Link: https://patch.msgid.link/20241108004823.29419-9-kuniyu@amazon.com Signed-off-by: Jakub Kicinski --- drivers/net/netkit.c | 11 ++--------- 1 file changed, 2 insertions(+), 9 deletions(-) diff --git a/drivers/net/netkit.c b/drivers/net/netkit.c index cd8360b9bbde..bb07725d1c72 100644 --- a/drivers/net/netkit.c +++ b/drivers/net/netkit.c @@ -351,12 +351,7 @@ static int netkit_new_link(struct net *src_net, struct net_device *dev, if (data[IFLA_NETKIT_PEER_INFO]) { attr = data[IFLA_NETKIT_PEER_INFO]; ifmp = nla_data(attr); - err = rtnl_nla_parse_ifinfomsg(peer_tb, attr, extack); - if (err < 0) - return err; - err = netkit_validate(peer_tb, NULL, extack); - if (err < 0) - return err; + rtnl_nla_parse_ifinfomsg(peer_tb, attr, extack); tbp = peer_tb; } if (data[IFLA_NETKIT_SCRUB]) @@ -391,9 +386,6 @@ static int netkit_new_link(struct net *src_net, struct net_device *dev, return -EOPNOTSUPP; net = rtnl_link_get_net(src_net, tbp); - if (IS_ERR(net)) - return PTR_ERR(net); - peer = rtnl_create_link(net, ifname, ifname_assign_type, &netkit_link_ops, tbp, extack); if (IS_ERR(peer)) { @@ -978,6 +970,7 @@ static struct rtnl_link_ops netkit_link_ops = { .fill_info = netkit_fill_info, .policy = netkit_policy, .validate = netkit_validate, + .peer_type = IFLA_NETKIT_PEER_INFO, .maxtype = IFLA_NETKIT_MAX, }; -- 2.51.0 From d91191ffe23f927b14b8e861f22037cf153c48cb Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Thu, 7 Nov 2024 16:48:22 -0800 Subject: [PATCH 12/16] rtnetlink: Convert RTM_NEWLINK to per-netns RTNL. Now, we are ready to convert rtnl_newlink() to per-netns RTNL; rtnl_link_ops is protected by SRCU and netns is prefetched in rtnl_newlink(). Let's register rtnl_newlink() with RTNL_FLAG_DOIT_PERNET and push RTNL down as rtnl_nets_lock(). Signed-off-by: Kuniyuki Iwashima Reviewed-by: Eric Dumazet Reviewed-by: Nikolay Aleksandrov Link: https://patch.msgid.link/20241108004823.29419-10-kuniyu@amazon.com Signed-off-by: Jakub Kicinski --- net/core/rtnetlink.c | 27 ++++++++++++++++++++++++--- 1 file changed, 24 insertions(+), 3 deletions(-) diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 1af187a4a3f1..30191d17add3 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -319,6 +319,26 @@ static void rtnl_nets_add(struct rtnl_nets *rtnl_nets, struct net *net) rtnl_nets->len++; } +static void rtnl_nets_lock(struct rtnl_nets *rtnl_nets) +{ + int i; + + rtnl_lock(); + + for (i = 0; i < rtnl_nets->len; i++) + __rtnl_net_lock(rtnl_nets->net[i]); +} + +static void rtnl_nets_unlock(struct rtnl_nets *rtnl_nets) +{ + int i; + + for (i = 0; i < rtnl_nets->len; i++) + __rtnl_net_unlock(rtnl_nets->net[i]); + + rtnl_unlock(); +} + static struct rtnl_link __rcu *__rcu *rtnl_msg_handlers[RTNL_FAMILY_MAX + 1]; static inline int rtm_msgindex(int msgtype) @@ -3903,9 +3923,7 @@ static int rtnl_newlink(struct sk_buff *skb, struct nlmsghdr *nlh, ops = rtnl_link_ops_get(kind, &ops_srcu_index); #ifdef CONFIG_MODULES if (!ops) { - __rtnl_unlock(); request_module("rtnl-link-%s", kind); - rtnl_lock(); ops = rtnl_link_ops_get(kind, &ops_srcu_index); } #endif @@ -3968,7 +3986,9 @@ static int rtnl_newlink(struct sk_buff *skb, struct nlmsghdr *nlh, } } + rtnl_nets_lock(&rtnl_nets); ret = __rtnl_newlink(skb, nlh, ops, tgt_net, link_net, tbs, data, extack); + rtnl_nets_unlock(&rtnl_nets); put_net: rtnl_nets_destroy(&rtnl_nets); @@ -6972,7 +6992,8 @@ static struct pernet_operations rtnetlink_net_ops = { }; static const struct rtnl_msg_handler rtnetlink_rtnl_msg_handlers[] __initconst = { - {.msgtype = RTM_NEWLINK, .doit = rtnl_newlink}, + {.msgtype = RTM_NEWLINK, .doit = rtnl_newlink, + .flags = RTNL_FLAG_DOIT_PERNET}, {.msgtype = RTM_DELLINK, .doit = rtnl_dellink}, {.msgtype = RTM_GETLINK, .doit = rtnl_getlink, .dumpit = rtnl_dump_ifinfo, .flags = RTNL_FLAG_DUMP_SPLIT_NLM_DONE}, -- 2.51.0 From 636af13f213bf9b28a34254327934bc72a797754 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Thu, 7 Nov 2024 16:48:23 -0800 Subject: [PATCH 13/16] rtnetlink: Register rtnl_dellink() and rtnl_setlink() with RTNL_FLAG_DOIT_PERNET_WIP. Currently, rtnl_setlink() and rtnl_dellink() cannot be fully converted to per-netns RTNL due to a lack of handling peer/lower/upper devices in different netns. For example, when we change a device in rtnl_setlink() and need to propagate that to its upper devices, we want to avoid acquiring all netns locks, for which we do not know the upper limit. The same situation happens when we remove a device. rtnl_dellink() could be transformed to remove a single device in the requested netns and delegate other devices to per-netns work, and rtnl_setlink() might be ? Until we come up with a better idea, let's use a new flag RTNL_FLAG_DOIT_PERNET_WIP for rtnl_dellink() and rtnl_setlink(). This will unblock converting RTNL users where such devices are not related. Signed-off-by: Kuniyuki Iwashima Reviewed-by: Nikolay Aleksandrov Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20241108004823.29419-11-kuniyu@amazon.com Signed-off-by: Jakub Kicinski --- include/net/rtnetlink.h | 1 + net/core/rtnetlink.c | 19 ++++++++++++++++--- 2 files changed, 17 insertions(+), 3 deletions(-) diff --git a/include/net/rtnetlink.h b/include/net/rtnetlink.h index bef76abcff8d..bc0069a8b6ea 100644 --- a/include/net/rtnetlink.h +++ b/include/net/rtnetlink.h @@ -13,6 +13,7 @@ typedef int (*rtnl_dumpit_func)(struct sk_buff *, struct netlink_callback *); enum rtnl_link_flags { RTNL_FLAG_DOIT_UNLOCKED = BIT(0), #define RTNL_FLAG_DOIT_PERNET RTNL_FLAG_DOIT_UNLOCKED +#define RTNL_FLAG_DOIT_PERNET_WIP RTNL_FLAG_DOIT_UNLOCKED RTNL_FLAG_BULK_DEL_SUPPORTED = BIT(1), RTNL_FLAG_DUMP_UNLOCKED = BIT(2), RTNL_FLAG_DUMP_SPLIT_NLM_DONE = BIT(3), /* legacy behavior */ diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 30191d17add3..327fa4957929 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -3379,6 +3379,7 @@ static int rtnl_setlink(struct sk_buff *skb, struct nlmsghdr *nlh, struct net *net = sock_net(skb->sk); struct nlattr *tb[IFLA_MAX+1]; struct net_device *dev = NULL; + struct rtnl_nets rtnl_nets; struct net *tgt_net; int err; @@ -3397,6 +3398,12 @@ static int rtnl_setlink(struct sk_buff *skb, struct nlmsghdr *nlh, goto errout; } + rtnl_nets_init(&rtnl_nets); + rtnl_nets_add(&rtnl_nets, get_net(net)); + rtnl_nets_add(&rtnl_nets, tgt_net); + + rtnl_nets_lock(&rtnl_nets); + if (ifm->ifi_index > 0) dev = __dev_get_by_index(net, ifm->ifi_index); else if (tb[IFLA_IFNAME] || tb[IFLA_ALT_IFNAME]) @@ -3409,7 +3416,7 @@ static int rtnl_setlink(struct sk_buff *skb, struct nlmsghdr *nlh, else if (!err) err = -ENODEV; - put_net(tgt_net); + rtnl_nets_unlock(&rtnl_nets); errout: return err; } @@ -3494,6 +3501,8 @@ static int rtnl_dellink(struct sk_buff *skb, struct nlmsghdr *nlh, return PTR_ERR(tgt_net); } + rtnl_net_lock(tgt_net); + if (ifm->ifi_index > 0) dev = __dev_get_by_index(tgt_net, ifm->ifi_index); else if (tb[IFLA_IFNAME] || tb[IFLA_ALT_IFNAME]) @@ -3508,6 +3517,8 @@ static int rtnl_dellink(struct sk_buff *skb, struct nlmsghdr *nlh, else err = -EINVAL; + rtnl_net_unlock(tgt_net); + if (netnsid >= 0) put_net(tgt_net); @@ -6994,10 +7005,12 @@ static struct pernet_operations rtnetlink_net_ops = { static const struct rtnl_msg_handler rtnetlink_rtnl_msg_handlers[] __initconst = { {.msgtype = RTM_NEWLINK, .doit = rtnl_newlink, .flags = RTNL_FLAG_DOIT_PERNET}, - {.msgtype = RTM_DELLINK, .doit = rtnl_dellink}, + {.msgtype = RTM_DELLINK, .doit = rtnl_dellink, + .flags = RTNL_FLAG_DOIT_PERNET_WIP}, {.msgtype = RTM_GETLINK, .doit = rtnl_getlink, .dumpit = rtnl_dump_ifinfo, .flags = RTNL_FLAG_DUMP_SPLIT_NLM_DONE}, - {.msgtype = RTM_SETLINK, .doit = rtnl_setlink}, + {.msgtype = RTM_SETLINK, .doit = rtnl_setlink, + .flags = RTNL_FLAG_DOIT_PERNET_WIP}, {.msgtype = RTM_GETADDR, .dumpit = rtnl_dump_all}, {.msgtype = RTM_GETROUTE, .dumpit = rtnl_dump_all}, {.msgtype = RTM_GETNETCONF, .dumpit = rtnl_dump_all}, -- 2.51.0 From f0fe51a043868bd12e0eb9ee532723342ce3faf6 Mon Sep 17 00:00:00 2001 From: Vadim Fedorenko Date: Thu, 7 Nov 2024 13:49:17 -0800 Subject: [PATCH 14/16] bnxt_en: add unlocked version of bnxt_refclk_read Serialization of PHC read with FW reset mechanism uses ptp_lock which also protects timecounter updates. This means we cannot grab it when called from bnxt_cc_read(). Let's move locking into different function. Fixes: 6c0828d00f07 ("bnxt_en: replace PTP spinlock with seqlock") Signed-off-by: Vadim Fedorenko Reviewed-by: Pavan Chebbi Link: https://patch.msgid.link/20241107214917.2980976-1-vadfed@meta.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/broadcom/bnxt/bnxt_ptp.c | 35 +++++++++++++------ 1 file changed, 24 insertions(+), 11 deletions(-) diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_ptp.c b/drivers/net/ethernet/broadcom/bnxt/bnxt_ptp.c index f74afdab4f7d..91e7e08fabb1 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt_ptp.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_ptp.c @@ -73,19 +73,15 @@ static int bnxt_ptp_settime(struct ptp_clock_info *ptp_info, return 0; } -static int bnxt_refclk_read(struct bnxt *bp, struct ptp_system_timestamp *sts, - u64 *ns) +/* Caller holds ptp_lock */ +static int __bnxt_refclk_read(struct bnxt *bp, struct ptp_system_timestamp *sts, + u64 *ns) { struct bnxt_ptp_cfg *ptp = bp->ptp_cfg; u32 high_before, high_now, low; - unsigned long flags; - /* We have to serialize reg access and FW reset */ - read_seqlock_excl_irqsave(&ptp->ptp_lock, flags); - if (test_bit(BNXT_STATE_IN_FW_RESET, &bp->state)) { - read_sequnlock_excl_irqrestore(&ptp->ptp_lock, flags); + if (test_bit(BNXT_STATE_IN_FW_RESET, &bp->state)) return -EIO; - } high_before = readl(bp->bar0 + ptp->refclk_mapped_regs[1]); ptp_read_system_prets(sts); @@ -97,12 +93,25 @@ static int bnxt_refclk_read(struct bnxt *bp, struct ptp_system_timestamp *sts, low = readl(bp->bar0 + ptp->refclk_mapped_regs[0]); ptp_read_system_postts(sts); } - read_sequnlock_excl_irqrestore(&ptp->ptp_lock, flags); *ns = ((u64)high_now << 32) | low; return 0; } +static int bnxt_refclk_read(struct bnxt *bp, struct ptp_system_timestamp *sts, + u64 *ns) +{ + struct bnxt_ptp_cfg *ptp = bp->ptp_cfg; + unsigned long flags; + int rc; + + /* We have to serialize reg access and FW reset */ + read_seqlock_excl_irqsave(&ptp->ptp_lock, flags); + rc = __bnxt_refclk_read(bp, sts, ns); + read_sequnlock_excl_irqrestore(&ptp->ptp_lock, flags); + return rc; +} + static void bnxt_ptp_get_current_time(struct bnxt *bp) { struct bnxt_ptp_cfg *ptp = bp->ptp_cfg; @@ -674,7 +683,7 @@ static u64 bnxt_cc_read(const struct cyclecounter *cc) struct bnxt_ptp_cfg *ptp = container_of(cc, struct bnxt_ptp_cfg, cc); u64 ns = 0; - bnxt_refclk_read(ptp->bp, NULL, &ns); + __bnxt_refclk_read(ptp->bp, NULL, &ns); return ns; } @@ -936,6 +945,7 @@ static bool bnxt_pps_config_ok(struct bnxt *bp) static void bnxt_ptp_timecounter_init(struct bnxt *bp, bool init_tc) { struct bnxt_ptp_cfg *ptp = bp->ptp_cfg; + unsigned long flags; if (!ptp->ptp_clock) { memset(&ptp->cc, 0, sizeof(ptp->cc)); @@ -952,8 +962,11 @@ static void bnxt_ptp_timecounter_init(struct bnxt *bp, bool init_tc) } ptp->next_overflow_check = jiffies + BNXT_PHC_OVERFLOW_PERIOD; } - if (init_tc) + if (init_tc) { + write_seqlock_irqsave(&ptp->ptp_lock, flags); timecounter_init(&ptp->tc, &ptp->cc, ktime_to_ns(ktime_get_real())); + write_sequnlock_irqrestore(&ptp->ptp_lock, flags); + } } /* Caller holds ptp_lock */ -- 2.51.0 From 5dc51ec86df6e2214d8398079c1e31736593ab53 Mon Sep 17 00:00:00 2001 From: Martin Karsten Date: Sat, 9 Nov 2024 05:02:31 +0000 Subject: [PATCH 15/16] net: Add napi_struct parameter irq_suspend_timeout Add a per-NAPI IRQ suspension parameter, which can be get/set with netdev-genl. This patch doesn't change any behavior but prepares the code for other changes in the following commits which use irq_suspend_timeout as a timeout for IRQ suspension. Signed-off-by: Martin Karsten Co-developed-by: Joe Damato Signed-off-by: Joe Damato Tested-by: Joe Damato Tested-by: Martin Karsten Acked-by: Stanislav Fomichev Reviewed-by: Sridhar Samudrala Link: https://patch.msgid.link/20241109050245.191288-2-jdamato@fastly.com Signed-off-by: Jakub Kicinski --- Documentation/netlink/specs/netdev.yaml | 7 +++++++ include/linux/netdevice.h | 2 ++ include/uapi/linux/netdev.h | 1 + net/core/dev.c | 2 ++ net/core/dev.h | 25 +++++++++++++++++++++++++ net/core/netdev-genl-gen.c | 5 +++-- net/core/netdev-genl.c | 12 ++++++++++++ tools/include/uapi/linux/netdev.h | 1 + 8 files changed, 53 insertions(+), 2 deletions(-) diff --git a/Documentation/netlink/specs/netdev.yaml b/Documentation/netlink/specs/netdev.yaml index f9cb97d6106c..cbb544bd6c84 100644 --- a/Documentation/netlink/specs/netdev.yaml +++ b/Documentation/netlink/specs/netdev.yaml @@ -263,6 +263,11 @@ attribute-sets: the end of a NAPI cycle. This may add receive latency in exchange for reducing the number of frames processed by the network stack. type: uint + - + name: irq-suspend-timeout + doc: The timeout, in nanoseconds, of how long to suspend irq + processing, if event polling finds events + type: uint - name: queue attributes: @@ -653,6 +658,7 @@ operations: - pid - defer-hard-irqs - gro-flush-timeout + - irq-suspend-timeout dump: request: attributes: @@ -704,6 +710,7 @@ operations: - id - defer-hard-irqs - gro-flush-timeout + - irq-suspend-timeout kernel-family: headers: [ "linux/list.h"] diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index df4483598628..0aae346d919e 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -348,6 +348,7 @@ struct gro_list { */ struct napi_config { u64 gro_flush_timeout; + u64 irq_suspend_timeout; u32 defer_hard_irqs; unsigned int napi_id; }; @@ -384,6 +385,7 @@ struct napi_struct { struct hrtimer timer; struct task_struct *thread; unsigned long gro_flush_timeout; + unsigned long irq_suspend_timeout; u32 defer_hard_irqs; /* control-path-only fields follow */ struct list_head dev_list; diff --git a/include/uapi/linux/netdev.h b/include/uapi/linux/netdev.h index e3ebb49f60d2..e4be227d3ad6 100644 --- a/include/uapi/linux/netdev.h +++ b/include/uapi/linux/netdev.h @@ -124,6 +124,7 @@ enum { NETDEV_A_NAPI_PID, NETDEV_A_NAPI_DEFER_HARD_IRQS, NETDEV_A_NAPI_GRO_FLUSH_TIMEOUT, + NETDEV_A_NAPI_IRQ_SUSPEND_TIMEOUT, __NETDEV_A_NAPI_MAX, NETDEV_A_NAPI_MAX = (__NETDEV_A_NAPI_MAX - 1) diff --git a/net/core/dev.c b/net/core/dev.c index 6a31152e4606..4d910872963f 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -6666,6 +6666,7 @@ static void napi_restore_config(struct napi_struct *n) { n->defer_hard_irqs = n->config->defer_hard_irqs; n->gro_flush_timeout = n->config->gro_flush_timeout; + n->irq_suspend_timeout = n->config->irq_suspend_timeout; /* a NAPI ID might be stored in the config, if so use it. if not, use * napi_hash_add to generate one for us. It will be saved to the config * in napi_disable. @@ -6680,6 +6681,7 @@ static void napi_save_config(struct napi_struct *n) { n->config->defer_hard_irqs = n->defer_hard_irqs; n->config->gro_flush_timeout = n->gro_flush_timeout; + n->config->irq_suspend_timeout = n->irq_suspend_timeout; n->config->napi_id = n->napi_id; napi_hash_del(n); } diff --git a/net/core/dev.h b/net/core/dev.h index 7881bced70a9..d043dee25a68 100644 --- a/net/core/dev.h +++ b/net/core/dev.h @@ -236,6 +236,31 @@ static inline void netdev_set_gro_flush_timeout(struct net_device *netdev, netdev->napi_config[i].gro_flush_timeout = timeout; } +/** + * napi_get_irq_suspend_timeout - get the irq_suspend_timeout + * @n: napi struct to get the irq_suspend_timeout from + * + * Return: the per-NAPI value of the irq_suspend_timeout field. + */ +static inline unsigned long +napi_get_irq_suspend_timeout(const struct napi_struct *n) +{ + return READ_ONCE(n->irq_suspend_timeout); +} + +/** + * napi_set_irq_suspend_timeout - set the irq_suspend_timeout for a napi + * @n: napi struct to set the irq_suspend_timeout + * @timeout: timeout value to set + * + * napi_set_irq_suspend_timeout sets the per-NAPI irq_suspend_timeout + */ +static inline void napi_set_irq_suspend_timeout(struct napi_struct *n, + unsigned long timeout) +{ + WRITE_ONCE(n->irq_suspend_timeout, timeout); +} + int rps_cpumask_housekeeping(struct cpumask *mask); #if defined(CONFIG_DEBUG_NET) && defined(CONFIG_BPF_SYSCALL) diff --git a/net/core/netdev-genl-gen.c b/net/core/netdev-genl-gen.c index 21de7e10be16..a89cbd8d87c3 100644 --- a/net/core/netdev-genl-gen.c +++ b/net/core/netdev-genl-gen.c @@ -92,10 +92,11 @@ static const struct nla_policy netdev_bind_rx_nl_policy[NETDEV_A_DMABUF_FD + 1] }; /* NETDEV_CMD_NAPI_SET - do */ -static const struct nla_policy netdev_napi_set_nl_policy[NETDEV_A_NAPI_GRO_FLUSH_TIMEOUT + 1] = { +static const struct nla_policy netdev_napi_set_nl_policy[NETDEV_A_NAPI_IRQ_SUSPEND_TIMEOUT + 1] = { [NETDEV_A_NAPI_ID] = { .type = NLA_U32, }, [NETDEV_A_NAPI_DEFER_HARD_IRQS] = NLA_POLICY_FULL_RANGE(NLA_U32, &netdev_a_napi_defer_hard_irqs_range), [NETDEV_A_NAPI_GRO_FLUSH_TIMEOUT] = { .type = NLA_UINT, }, + [NETDEV_A_NAPI_IRQ_SUSPEND_TIMEOUT] = { .type = NLA_UINT, }, }; /* Ops table for netdev */ @@ -186,7 +187,7 @@ static const struct genl_split_ops netdev_nl_ops[] = { .cmd = NETDEV_CMD_NAPI_SET, .doit = netdev_nl_napi_set_doit, .policy = netdev_napi_set_nl_policy, - .maxattr = NETDEV_A_NAPI_GRO_FLUSH_TIMEOUT, + .maxattr = NETDEV_A_NAPI_IRQ_SUSPEND_TIMEOUT, .flags = GENL_ADMIN_PERM | GENL_CMD_CAP_DO, }, }; diff --git a/net/core/netdev-genl.c b/net/core/netdev-genl.c index b49c3b4e5fbe..765ce7c9d73b 100644 --- a/net/core/netdev-genl.c +++ b/net/core/netdev-genl.c @@ -161,6 +161,7 @@ static int netdev_nl_napi_fill_one(struct sk_buff *rsp, struct napi_struct *napi, const struct genl_info *info) { + unsigned long irq_suspend_timeout; unsigned long gro_flush_timeout; u32 napi_defer_hard_irqs; void *hdr; @@ -196,6 +197,11 @@ netdev_nl_napi_fill_one(struct sk_buff *rsp, struct napi_struct *napi, napi_defer_hard_irqs)) goto nla_put_failure; + irq_suspend_timeout = napi_get_irq_suspend_timeout(napi); + if (nla_put_uint(rsp, NETDEV_A_NAPI_IRQ_SUSPEND_TIMEOUT, + irq_suspend_timeout)) + goto nla_put_failure; + gro_flush_timeout = napi_get_gro_flush_timeout(napi); if (nla_put_uint(rsp, NETDEV_A_NAPI_GRO_FLUSH_TIMEOUT, gro_flush_timeout)) @@ -306,6 +312,7 @@ int netdev_nl_napi_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb) static int netdev_nl_napi_set_config(struct napi_struct *napi, struct genl_info *info) { + u64 irq_suspend_timeout = 0; u64 gro_flush_timeout = 0; u32 defer = 0; @@ -314,6 +321,11 @@ netdev_nl_napi_set_config(struct napi_struct *napi, struct genl_info *info) napi_set_defer_hard_irqs(napi, defer); } + if (info->attrs[NETDEV_A_NAPI_IRQ_SUSPEND_TIMEOUT]) { + irq_suspend_timeout = nla_get_uint(info->attrs[NETDEV_A_NAPI_IRQ_SUSPEND_TIMEOUT]); + napi_set_irq_suspend_timeout(napi, irq_suspend_timeout); + } + if (info->attrs[NETDEV_A_NAPI_GRO_FLUSH_TIMEOUT]) { gro_flush_timeout = nla_get_uint(info->attrs[NETDEV_A_NAPI_GRO_FLUSH_TIMEOUT]); napi_set_gro_flush_timeout(napi, gro_flush_timeout); diff --git a/tools/include/uapi/linux/netdev.h b/tools/include/uapi/linux/netdev.h index e3ebb49f60d2..e4be227d3ad6 100644 --- a/tools/include/uapi/linux/netdev.h +++ b/tools/include/uapi/linux/netdev.h @@ -124,6 +124,7 @@ enum { NETDEV_A_NAPI_PID, NETDEV_A_NAPI_DEFER_HARD_IRQS, NETDEV_A_NAPI_GRO_FLUSH_TIMEOUT, + NETDEV_A_NAPI_IRQ_SUSPEND_TIMEOUT, __NETDEV_A_NAPI_MAX, NETDEV_A_NAPI_MAX = (__NETDEV_A_NAPI_MAX - 1) -- 2.51.0 From 3fcbecbdeb048dfd1bea824f4276717fed02d10e Mon Sep 17 00:00:00 2001 From: Martin Karsten Date: Sat, 9 Nov 2024 05:02:32 +0000 Subject: [PATCH 16/16] net: Add control functions for irq suspension The napi_suspend_irqs routine bootstraps irq suspension by elongating the defer timeout to irq_suspend_timeout. The napi_resume_irqs routine effectively cancels irq suspension by forcing the napi to be scheduled immediately. Signed-off-by: Martin Karsten Co-developed-by: Joe Damato Signed-off-by: Joe Damato Tested-by: Joe Damato Tested-by: Martin Karsten Acked-by: Stanislav Fomichev Reviewed-by: Sridhar Samudrala Link: https://patch.msgid.link/20241109050245.191288-3-jdamato@fastly.com Signed-off-by: Jakub Kicinski --- include/net/busy_poll.h | 3 +++ net/core/dev.c | 37 +++++++++++++++++++++++++++++++++++++ 2 files changed, 40 insertions(+) diff --git a/include/net/busy_poll.h b/include/net/busy_poll.h index f03040baaefd..c858270141bc 100644 --- a/include/net/busy_poll.h +++ b/include/net/busy_poll.h @@ -52,6 +52,9 @@ void napi_busy_loop_rcu(unsigned int napi_id, bool (*loop_end)(void *, unsigned long), void *loop_end_arg, bool prefer_busy_poll, u16 budget); +void napi_suspend_irqs(unsigned int napi_id); +void napi_resume_irqs(unsigned int napi_id); + #else /* CONFIG_NET_RX_BUSY_POLL */ static inline unsigned long net_busy_loop_on(void) { diff --git a/net/core/dev.c b/net/core/dev.c index 4d910872963f..13d00fc10f55 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -6507,6 +6507,43 @@ void napi_busy_loop(unsigned int napi_id, } EXPORT_SYMBOL(napi_busy_loop); +void napi_suspend_irqs(unsigned int napi_id) +{ + struct napi_struct *napi; + + rcu_read_lock(); + napi = napi_by_id(napi_id); + if (napi) { + unsigned long timeout = napi_get_irq_suspend_timeout(napi); + + if (timeout) + hrtimer_start(&napi->timer, ns_to_ktime(timeout), + HRTIMER_MODE_REL_PINNED); + } + rcu_read_unlock(); +} + +void napi_resume_irqs(unsigned int napi_id) +{ + struct napi_struct *napi; + + rcu_read_lock(); + napi = napi_by_id(napi_id); + if (napi) { + /* If irq_suspend_timeout is set to 0 between the call to + * napi_suspend_irqs and now, the original value still + * determines the safety timeout as intended and napi_watchdog + * will resume irq processing. + */ + if (napi_get_irq_suspend_timeout(napi)) { + local_bh_disable(); + napi_schedule(napi); + local_bh_enable(); + } + } + rcu_read_unlock(); +} + #endif /* CONFIG_NET_RX_BUSY_POLL */ static void __napi_hash_add_with_id(struct napi_struct *napi, -- 2.51.0